From 1b0c62c07566fd190fca88a5b98792a6e6f752da Mon Sep 17 00:00:00 2001 From: manfredcalvo Date: Tue, 17 Dec 2024 16:48:53 -0600 Subject: [PATCH 1/2] Adding autogen integration --- .../01_data_pipeline.ipynb | 537 ++++++++++++++ .../02_agent_setup.ipynb | 113 +++ .../03_create_synthetic_eval.ipynb | 144 ++++ .../04_create_tools.ipynb | 688 ++++++++++++++++++ .../05_tool_calling_agent.ipynb | 445 +++++++++++ .../06_multi_agent_with_genie.ipynb | 493 +++++++++++++ autogen_agent_app_sample_code/README.md | 160 ++++ autogen_agent_app_sample_code/__init__.py | 0 .../autogen_started.ipynb | 195 +++++ .../configs/README.md | 1 + .../configs/agent_storage_config.yaml | 4 + .../configs/data_pipeline_config.yaml | 19 + .../function_calling_agent_config.yaml | 60 ++ .../cookbook/__init__.py | 0 .../cookbook/agents/__init__.py | 0 .../cookbook/agents/function_calling_agent.py | 214 ++++++ .../cookbook/agents/multi_agent_supervisor.py | 616 ++++++++++++++++ .../cookbook/agents/utils/__init__.py | 0 .../cookbook/agents/utils/chat.py | 145 ++++ .../utils/databricks_model_serving_client.py | 40 + .../cookbook/agents/utils/execute_function.py | 8 + .../cookbook/agents/utils/load_config.py | 138 ++++ .../agents/utils/playground_parser.py | 98 +++ .../cookbook/agents/utils/signatures.py | 49 ++ .../cookbook/config/__init__.py | 99 +++ .../cookbook/config/agents/__init__.py | 0 .../config/agents/function_calling_agent.py | 75 ++ .../cookbook/config/agents/genie_agent.py | 37 + .../config/agents/multi_agent_supervisor.py | 266 +++++++ .../cookbook/config/agents/rag_only.py | 25 + .../cookbook/config/data_pipeline/__init__.py | 49 ++ .../data_pipeline/data_pipeline_output.py | 314 ++++++++ .../data_pipeline/recursive_text_splitter.py | 89 +++ .../config/data_pipeline/uc_volume_source.py | 132 ++++ .../cookbook/config/shared/__init__.py | 0 .../config/shared/agent_storage_location.py | 118 +++ .../cookbook/config/shared/llm.py | 42 ++ .../cookbook/data_pipeline/__init__.py | 0 .../data_pipeline/build_retriever_index.py | 123 ++++ .../cookbook/data_pipeline/chunk_docs.py | 44 ++ .../cookbook/data_pipeline/default_parser.py | 162 +++++ .../cookbook/data_pipeline/parse_docs.py | 159 ++++ .../recursive_character_text_splitter.py | 255 +++++++ .../cookbook/data_pipeline/utils/__init__.py | 0 .../utils/typed_dicts_to_spark_schema.py | 103 +++ .../cookbook/databricks_utils/__init__.py | 225 ++++++ .../agent_evaluation/__init__.py | 0 .../agent_evaluation/evaluation_set.py | 236 ++++++ .../agent_framework/__init__.py | 0 .../agent_framework/get_inference_tables.py | 35 + .../install_cluster_library.py | 107 +++ .../cookbook/tools/__init__.py | 45 ++ .../cookbook/tools/local_function.py | 165 +++++ .../cookbook/tools/uc_tool.py | 172 +++++ .../cookbook/tools/uc_tool_utils.py | 132 ++++ .../cookbook/tools/vector_search.py | 455 ++++++++++++ .../environment.yaml | 4 + autogen_agent_app_sample_code/pyproject.toml | 36 + .../requirements.txt | 14 + .../requirements_datapipeline.txt | 9 + .../tests/conftest.py | 6 + .../tests/test_data_pipeline_utils.py | 113 +++ autogen_agent_app_sample_code/tools/README.md | 1 + .../tools/__init__.py | 0 .../tools/code_exec.py | 20 + .../tools/sample_tool.py | 46 ++ .../tools/test_code_exec.py | 89 +++ .../tools/test_code_exec_as_uc_tool.py | 102 +++ .../tools/test_sample_tool.py | 52 ++ .../tools/test_sample_tool_uc.py | 72 ++ 70 files changed, 8395 insertions(+) create mode 100644 autogen_agent_app_sample_code/01_data_pipeline.ipynb create mode 100644 
autogen_agent_app_sample_code/02_agent_setup.ipynb create mode 100644 autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb create mode 100644 autogen_agent_app_sample_code/04_create_tools.ipynb create mode 100644 autogen_agent_app_sample_code/05_tool_calling_agent.ipynb create mode 100644 autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb create mode 100644 autogen_agent_app_sample_code/README.md create mode 100644 autogen_agent_app_sample_code/__init__.py create mode 100644 autogen_agent_app_sample_code/autogen_started.ipynb create mode 100644 autogen_agent_app_sample_code/configs/README.md create mode 100644 autogen_agent_app_sample_code/configs/agent_storage_config.yaml create mode 100644 autogen_agent_app_sample_code/configs/data_pipeline_config.yaml create mode 100644 autogen_agent_app_sample_code/configs/function_calling_agent_config.yaml create mode 100644 autogen_agent_app_sample_code/cookbook/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/function_calling_agent.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/multi_agent_supervisor.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/utils/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/utils/chat.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/utils/execute_function.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/utils/load_config.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/utils/playground_parser.py create mode 100644 autogen_agent_app_sample_code/cookbook/agents/utils/signatures.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/agents/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/agents/function_calling_agent.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/agents/genie_agent.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/agents/multi_agent_supervisor.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/agents/rag_only.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/data_pipeline/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/data_pipeline/data_pipeline_output.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/data_pipeline/recursive_text_splitter.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/data_pipeline/uc_volume_source.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/shared/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/shared/agent_storage_location.py create mode 100644 autogen_agent_app_sample_code/cookbook/config/shared/llm.py create mode 100644 autogen_agent_app_sample_code/cookbook/data_pipeline/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/data_pipeline/build_retriever_index.py create mode 100644 autogen_agent_app_sample_code/cookbook/data_pipeline/chunk_docs.py create mode 100644 autogen_agent_app_sample_code/cookbook/data_pipeline/default_parser.py create mode 100644 autogen_agent_app_sample_code/cookbook/data_pipeline/parse_docs.py create mode 100644 
autogen_agent_app_sample_code/cookbook/data_pipeline/recursive_character_text_splitter.py create mode 100644 autogen_agent_app_sample_code/cookbook/data_pipeline/utils/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/data_pipeline/utils/typed_dicts_to_spark_schema.py create mode 100644 autogen_agent_app_sample_code/cookbook/databricks_utils/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/databricks_utils/agent_evaluation/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/databricks_utils/agent_evaluation/evaluation_set.py create mode 100644 autogen_agent_app_sample_code/cookbook/databricks_utils/agent_framework/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/databricks_utils/agent_framework/get_inference_tables.py create mode 100644 autogen_agent_app_sample_code/cookbook/databricks_utils/install_cluster_library.py create mode 100644 autogen_agent_app_sample_code/cookbook/tools/__init__.py create mode 100644 autogen_agent_app_sample_code/cookbook/tools/local_function.py create mode 100644 autogen_agent_app_sample_code/cookbook/tools/uc_tool.py create mode 100644 autogen_agent_app_sample_code/cookbook/tools/uc_tool_utils.py create mode 100644 autogen_agent_app_sample_code/cookbook/tools/vector_search.py create mode 100644 autogen_agent_app_sample_code/environment.yaml create mode 100644 autogen_agent_app_sample_code/pyproject.toml create mode 100644 autogen_agent_app_sample_code/requirements.txt create mode 100644 autogen_agent_app_sample_code/requirements_datapipeline.txt create mode 100644 autogen_agent_app_sample_code/tests/conftest.py create mode 100644 autogen_agent_app_sample_code/tests/test_data_pipeline_utils.py create mode 100644 autogen_agent_app_sample_code/tools/README.md create mode 100644 autogen_agent_app_sample_code/tools/__init__.py create mode 100644 autogen_agent_app_sample_code/tools/code_exec.py create mode 100644 autogen_agent_app_sample_code/tools/sample_tool.py create mode 100644 autogen_agent_app_sample_code/tools/test_code_exec.py create mode 100644 autogen_agent_app_sample_code/tools/test_code_exec_as_uc_tool.py create mode 100644 autogen_agent_app_sample_code/tools/test_sample_tool.py create mode 100644 autogen_agent_app_sample_code/tools/test_sample_tool_uc.py diff --git a/autogen_agent_app_sample_code/01_data_pipeline.ipynb b/autogen_agent_app_sample_code/01_data_pipeline.ipynb new file mode 100644 index 0000000..72b147d --- /dev/null +++ b/autogen_agent_app_sample_code/01_data_pipeline.ipynb @@ -0,0 +1,537 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC # Unstructured data pipeline for the Agent's Retriever +# MAGIC +# MAGIC By the end of this notebook, you will have transformed your unstructured documents into a vector index that can be queried by your Agent. +# MAGIC +# MAGIC This means: +# MAGIC - Documents loaded into a delta table. +# MAGIC - Documents are chunked. +# MAGIC - Chunks have been embedded with an embedding model and stored in a vector index. +# MAGIC +# MAGIC The important resulting artifact of this notebook is the chunked vector index. This will be used in the next notebook to power our Retriever. + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 👉 START HERE: How to Use This Notebook +# MAGIC +# MAGIC Follow these steps to build and refine your data pipeline's quality: +# MAGIC +# MAGIC 1. 
**Build a v0 index with default settings**
+# MAGIC     - Configure the data source and destination tables in the `1️⃣ 📂 Data source & destination configuration` cells
+# MAGIC     - Press `Run All` to create the vector index.
+# MAGIC
+# MAGIC     *Note: While you can adjust the other settings and modify the parsing/chunking code, we suggest doing so only after evaluating your Agent's quality so you can make improvements that specifically address root causes of quality issues.*
+# MAGIC
+# MAGIC 2. **Use later notebooks to integrate the retriever into the agent and evaluate the agent/retriever's quality.**
+# MAGIC
+# MAGIC 3. **If the evaluation results show retrieval issues as a root cause, use this notebook to iterate on your data pipeline's code & config.** Below are some potential fixes you can try; see the AI Cookbook's [debugging retrieval issues](https://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-1-retrieval.html) section for details.
+# MAGIC     - Add missing, but relevant, source documents into the index.
+# MAGIC     - Resolve any conflicting information in source documents.
+# MAGIC     - Adjust the data pipeline configuration:
+# MAGIC       - Modify chunk size or overlap.
+# MAGIC       - Experiment with different embedding models.
+# MAGIC     - Adjust the data pipeline code:
+# MAGIC       - Create a custom parser or use different parsing libraries.
+# MAGIC       - Develop a custom chunker or use different chunking techniques.
+# MAGIC       - Extract additional metadata for each document.
+# MAGIC     - Adjust the Agent's code/config in subsequent notebooks:
+# MAGIC       - Change the number of documents retrieved (K).
+# MAGIC       - Try a re-ranker.
+# MAGIC       - Use hybrid search.
+# MAGIC       - Apply extracted metadata as filters.
+# MAGIC
+# MAGIC
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC **Important note:** Throughout this notebook, we indicate which cells you:
+# MAGIC - ✅✏️ *should* customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality
+# MAGIC - 🚫✏️ *typically will not* customize - these cells contain boilerplate code required to execute the pipeline
+# MAGIC
+# MAGIC *Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.*
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Install Python libraries (Databricks Notebook only)
+# MAGIC
+# MAGIC 🚫✏️ Only modify this cell if your changes to the document parsing or chunking logic require additional packages.
+# MAGIC
+# MAGIC Versions of Databricks code are not locked since Databricks ensures changes are backwards compatible.
+# MAGIC Versions of open source packages are locked since package authors often make backwards incompatible changes.
+
+# COMMAND ----------
+
+# MAGIC %pip install -qqqq -U -r requirements.txt
+# MAGIC %pip install -qqqq -U -r requirements_datapipeline.txt
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### Connect to Databricks (Local IDE only)
+# MAGIC
+# MAGIC If running from an IDE with [`databricks-connect`](https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html), connect to a Spark session & install the necessary packages on that cluster.
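+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC *(Optional reference)* If your Databricks credentials are not picked up automatically (for example, there is no default profile in `~/.databrickscfg` and no environment variables are set), you can configure the connection explicitly before running the cell below. This is a minimal sketch; the host, token, and cluster ID shown are placeholders you must replace with your own values.
+
+# COMMAND ----------
+
+# Optional: explicit Databricks Connect configuration for local IDE use.
+# The values below are placeholders - replace them with your workspace URL,
+# a personal access token, and the ID of a running cluster.
+# from databricks.connect import DatabricksSession
+#
+# spark = DatabricksSession.builder.remote(
+#     host="https://<your-workspace>.cloud.databricks.com",
+#     token="<your-personal-access-token>",
+#     cluster_id="<your-cluster-id>",
+# ).getOrCreate()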
+ +# COMMAND ---------- + +from cookbook.databricks_utils import get_cluster_url +from cookbook.databricks_utils import get_active_cluster_id +from cookbook.databricks_utils.install_cluster_library import install_requirements + +# UNCOMMENT TO INSTALL PACKAGES ON THE ACTIVE CLUSTER; this is code that is not super battle tested. +# cluster_id = get_active_cluster_id() +# print(f"Installing packages on the active cluster: {get_cluster_url(cluster_id)}") + + +# install_requirements(cluster_id, "requirements.txt") +# install_requirements(cluster_id, "requirements_datapipeline.txt") + +# THIS MUST BE DONE MANUALLY! TODO: Automate it. +# - Go to openai_sdk_agent_app_sample_code/ +# - Run `poetry build` +# - Copy the wheel file to a UC Volume or Workspace folder +# - Go to the cluster's Libraries page and install the wheel file as a new library + +# Get Spark session if using Databricks Connect from an IDE +from mlflow.utils import databricks_utils as du + +if not du.is_in_databricks_notebook(): + from databricks.connect import DatabricksSession + + spark = DatabricksSession.builder.getOrCreate() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC ## 1️⃣ 📂 Data source & destination configuration + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### ✅✏️ Configure the data pipeline's source location. +# MAGIC +# MAGIC Choose a [Unity Catalog Volume](https://docs.databricks.com/en/volumes/index.html) containing PDF, HTML, etc documents to be parsed/chunked/embedded. +# MAGIC +# MAGIC - `uc_catalog_name`: Name of the Unity Catalog. +# MAGIC - `uc_schema_name`: Name of the Unity Catalog schema. +# MAGIC - `uc_volume_name`: Name of the Unity Catalog volume. +# MAGIC +# MAGIC Running this cell with validate that the UC Volume exists, trying to create it if not. +# MAGIC + +# COMMAND ---------- + +from cookbook.config.data_pipeline.uc_volume_source import UCVolumeSourceConfig + +# Configure the UC Volume that contains the source documents +source_config = UCVolumeSourceConfig( + # uc_catalog_name="REPLACE_ME", # REPLACE_ME + # uc_schema_name="REPLACE_ME", # REPLACE_ME + # uc_volume_name=f"REPLACE_ME", # REPLACE_ME + uc_catalog_name="casaman_ssa", # REPLACE_ME + uc_schema_name="demos", # REPLACE_ME + uc_volume_name="volume_databricks_documentation", # REPLACE_ME +) + +# Check if volume exists, create otherwise +is_valid, msg = source_config.create_or_validate_volume() +if not is_valid: + raise Exception(msg) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### ✅✏️ Configure the data pipeline's output location. +# MAGIC +# MAGIC Choose where the data pipeline outputs the parsed, chunked, and embedded documents. +# MAGIC +# MAGIC Required parameters: +# MAGIC * `uc_catalog_name`: Unity Catalog name where tables will be created +# MAGIC * `uc_schema_name`: Schema name within the catalog +# MAGIC * `base_table_name`: Core name used as prefix for all generated tables +# MAGIC * `vector_search_endpoint`: Vector Search endpoint to store the index +# MAGIC +# MAGIC Optional parameters: +# MAGIC * `docs_table_postfix`: Suffix for the parsed documents table (default: "docs") +# MAGIC * `chunked_table_postfix`: Suffix for the chunked documents table (default: "docs_chunked") +# MAGIC * `vector_index_postfix`: Suffix for the vector index (default: "docs_chunked_index") +# MAGIC * `version_suffix`: Version identifier (e.g. 
'v1', 'test') to maintain multiple versions +# MAGIC +# MAGIC The generated tables follow this naming convention: +# MAGIC * Parsed docs: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{docs_table_postfix}__{version_suffix} +# MAGIC * Chunked docs: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{chunked_table_postfix}__{version_suffix} +# MAGIC * Vector index: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{vector_index_postfix}__{version_suffix} +# MAGIC +# MAGIC *Note: If you are comparing different chunking/parsing/embedding strategies, set the `version_suffix` parameter to maintain multiple versions of the pipeline output with the same base_table_name.* +# MAGIC +# MAGIC *Databricks suggests sharing a Vector Search endpoint across multiple agents.* + +# COMMAND ---------- + +from cookbook.config.data_pipeline.data_pipeline_output import DataPipelineOuputConfig + +# Output configuration +output_config = DataPipelineOuputConfig( + # Required parameters + uc_catalog_name=source_config.uc_catalog_name, # usually same as source volume catalog, by default is the same as the source volume catalog + uc_schema_name=source_config.uc_schema_name, # usually same as source volume schema, by default is the same as the source volume schema + #base_table_name=source_config.uc_volume_name, # usually similar / same as the source volume name; by default, is the same as the volume_name + base_table_name="test_product_docs", # usually similar / same as the source volume name; by default, is the same as the volume_name + # vector_search_endpoint="REPLACE_ME", # Vector Search endpoint to store the index + vector_search_endpoint="one-env-shared-endpoint-3", # Vector Search endpoint to store the index + + # Optional parameters, showing defaults + docs_table_postfix="docs", # default value is `docs` + chunked_table_postfix="docs_chunked", # default value is `docs_chunked` + vector_index_postfix="docs_chunked_index", # default value is `docs_chunked_index` + version_suffix="v2" # default is None + + # Output tables / indexes follow this naming convention: + # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{docs_table_postfix}__{version_suffix} + # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{chunked_table_postfix}__{version_suffix} + # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{vector_index_postfix}__{version_suffix} +) + +# Alternatively, you can directly pass in the UC locations of the tables / indexes +# output_config = DataPipelineOuputConfig( +# chunked_docs_table="catalog.schema.docs_chunked", +# parsed_docs_table="catalog.schema.parsed_docs", +# vector_index="catalog.schema.docs_chunked_index", +# vector_search_endpoint="REPLACE_ME", +# ) + +# Check UC locations exist +is_valid, msg = output_config.validate_catalog_and_schema() +if not is_valid: + raise Exception(msg) + +# Check Vector Search endpoint exists +is_valid, msg = output_config.validate_vector_search_endpoint() +if not is_valid: + raise Exception(msg) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### ✅✏️ Configure chunk size and the embedding model. +# MAGIC +# MAGIC **Chunk size and overlap** control how a larger document is turned into smaller chunks that can be processed by an embedding model. See the AI Cookbook [chunking deep dive](https://ai-cookbook.io/nbs/3-deep-dive-data-pipeline.html#chunking) for more details. +# MAGIC +# MAGIC **The embedding model** is an AI model that is used to identify the most similar documents to a given user's query. 
See the AI Cookbook [embedding model deep dive](https://ai-cookbook.io/nbs/3-deep-dive-data-pipeline.html#embedding-model) for more details.
+# MAGIC
+# MAGIC This notebook supports the following [Foundation Models](https://docs.databricks.com/en/machine-learning/foundation-models/index.html) or [External Models](https://docs.databricks.com/en/generative-ai/external-models/index.html) of type `/llm/v1/embeddings`. If you want to try another model, you will need to modify `get_recursive_character_text_splitter()` in `cookbook/data_pipeline/recursive_character_text_splitter.py` to add support.
+# MAGIC - `databricks-gte-large-en` or `databricks-bge-large-en`
+# MAGIC - Azure OpenAI or OpenAI External Model of type `text-embedding-ada-002`, `text-embedding-3-small` or `text-embedding-3-large`
+
+# COMMAND ----------
+
+from cookbook.config.data_pipeline.recursive_text_splitter import RecursiveTextSplitterChunkingConfig
+
+chunking_config = RecursiveTextSplitterChunkingConfig(
+    embedding_model_endpoint="databricks-gte-large-en",  # A Model Serving endpoint supporting the /llm/v1/embeddings task
+    chunk_size_tokens=1024,
+    chunk_overlap_tokens=256,
+)
+
+# Validate the embedding endpoint & chunking config
+is_valid, msg = chunking_config.validate_embedding_endpoint()
+if not is_valid:
+    raise Exception(msg)
+
+is_valid, msg = chunking_config.validate_chunk_size_and_overlap()
+if not is_valid:
+    raise Exception(msg)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC #### 🚫✏️ Write the data pipeline configuration to a YAML
+# MAGIC
+# MAGIC This allows the configuration to be loaded and referenced by the Agent's notebooks.
+
+# COMMAND ----------
+
+from cookbook.config.data_pipeline import DataPipelineConfig
+from cookbook.config import serializable_config_to_yaml_file
+
+data_pipeline_config = DataPipelineConfig(
+    source=source_config,
+    output=output_config,
+    chunking_config=chunking_config,
+)
+
+serializable_config_to_yaml_file(data_pipeline_config, "./configs/data_pipeline_config.yaml")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC #### 🛑 If you are running your initial data pipeline, you do not need to configure anything else; you can just `Run All` the remaining notebook cells. You can modify these cells later to tune the quality of your data pipeline by changing the parsing logic.
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## 3️⃣ ⌨️ Data pipeline code
+# MAGIC
+# MAGIC The code below executes the data pipeline. You can modify the code as indicated to implement different parsing or chunking strategies or to extract additional metadata fields.
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC #### Pipeline step 1: Load & parse documents into a Delta Table
+# MAGIC
+# MAGIC In this step, we'll load files from the UC Volume defined in `source_config` into the Delta Table `output_config.parsed_docs_table`. The contents of each file will become a separate row in our Delta table.
+# MAGIC
+# MAGIC The path to the source document will be used as the `doc_uri`, which is displayed to your end users in the Agent Evaluation web application.
+# MAGIC
+# MAGIC After you test your POC with stakeholders, you can return here to change the parsing logic or extract additional metadata about the documents to help improve the quality of your retriever.
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ##### ✅✏️ Customize the parsing function
+# MAGIC
+# MAGIC This default implementation parses PDF, HTML, and DOCX files using open source libraries. Adjust `file_parser(...)` and `ParserReturnValue` in `cookbook/data_pipeline/default_parser.py` to change the parsing logic, add support for more file types, or extract additional metadata about each document. A minimal sketch of a customized parser is shown in the optional cell below for reference.
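+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC *(Optional reference)* The cell below is a minimal sketch of what a customized parsing function could look like - here, a hypothetical plain-text-only parser. It is not the cookbook's implementation and is not used by the pipeline; it only illustrates the calling convention used in the debugging cell further down and the `content` / `doc_uri` / `parser_status` fields that the rest of this notebook relies on. The real `ParserReturnValue` may define additional fields, so keep your return keys aligned with it if you customize `file_parser`.
+
+# COMMAND ----------
+
+from datetime import datetime
+
+
+def plain_text_file_parser(
+    raw_doc_contents_bytes: bytes,
+    doc_path: str,
+    modification_time: datetime,
+    doc_bytes_length: int,
+) -> dict:
+    """Hypothetical example parser that only handles plain-text files."""
+    try:
+        # Decode the raw bytes; replace undecodable characters rather than failing outright
+        text = raw_doc_contents_bytes.decode("utf-8", errors="replace")
+        return {
+            "content": text,
+            "doc_uri": doc_path,
+            "parser_status": "SUCCESS",
+        }
+    except Exception as e:
+        # Surface the failure via parser_status so the error-checking cells below can report it
+        return {
+            "content": "",
+            "doc_uri": doc_path,
+            "parser_status": f"ERROR: {e}",
+        }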
+
+# COMMAND ----------
+
+from cookbook.data_pipeline.default_parser import file_parser, ParserReturnValue
+
+# Print the code of the ParserReturnValue class and file_parser function for inspection
+import inspect
+print(inspect.getsource(ParserReturnValue))
+print(inspect.getsource(file_parser))
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC The below cell is debugging code to test your parsing function on a single record.
+
+# COMMAND ----------
+
+from cookbook.data_pipeline.parse_docs import load_files_to_df
+from pyspark.sql import functions as F
+
+raw_files_df = load_files_to_df(
+    spark=spark,
+    source_path=source_config.volume_path,
+)
+
+print(f"Loaded {raw_files_df.count()} files from {source_config.volume_path}. Files: {source_config.list_files()}")
+
+test_records_dict = raw_files_df.toPandas().to_dict(orient="records")
+
+for record in test_records_dict:
+    print()
+    print("Testing parsing for file: ", record["path"])
+    print()
+    test_result = file_parser(raw_doc_contents_bytes=record['content'], doc_path=record['path'], modification_time=record['modificationTime'], doc_bytes_length=record['length'])
+    print(test_result)
+    break  # Pause after 1 file. If you want to test more files, remove the break statement.
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC 🚫✏️ The below cell is boilerplate code to apply the parsing function using Spark.
+
+# COMMAND ----------
+
+from cookbook.data_pipeline.parse_docs import (
+    load_files_to_df,
+    apply_parsing_fn,
+    check_parsed_df_for_errors,
+    check_parsed_df_for_empty_parsed_files
+)
+from cookbook.data_pipeline.utils.typed_dicts_to_spark_schema import typed_dicts_to_spark_schema
+from cookbook.databricks_utils import get_table_url
+
+# Tune this parameter to optimize performance. More partitions will improve performance, but may cause out of memory errors if your cluster is too small.
+NUM_PARTITIONS = 50
+
+# Load the UC Volume files into a Spark DataFrame
+raw_files_df = load_files_to_df(
+    spark=spark,
+    source_path=source_config.volume_path,
+).repartition(NUM_PARTITIONS)
+
+# Apply the parsing UDF to the Spark DataFrame
+parsed_files_df = apply_parsing_fn(
+    raw_files_df=raw_files_df,
+    # Modify this function to change the parser, extract additional metadata, etc.
+    parse_file_fn=file_parser,
+    # The schema of the resulting Delta Table will follow the schema defined in ParserReturnValue
+    parsed_df_schema=typed_dicts_to_spark_schema(ParserReturnValue),
+)
+
+# Write to a Delta Table
+parsed_files_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable(
+    output_config.parsed_docs_table
+)
+
+# Get resulting table
+parsed_files_df = spark.table(output_config.parsed_docs_table)
+parsed_files_no_errors_df = parsed_files_df.filter(
+    parsed_files_df.parser_status == "SUCCESS"
+)
+
+# Show successfully parsed documents
+print(f"Parsed {parsed_files_no_errors_df.count()} / {parsed_files_df.count()} documents successfully. Inspect `parsed_files_no_errors_df` or visit {get_table_url(output_config.parsed_docs_table)} to see all parsed documents, including any errors.")
+display(parsed_files_no_errors_df.toPandas())
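+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC *(Optional)* Before the error checks below, it can be useful to see how many documents landed in each parser status. This is a small optional sanity check that only reads the table written above; it is not required by the pipeline.
+
+# COMMAND ----------
+
+# Optional sanity check: count documents per parser_status (e.g. "SUCCESS" vs. error statuses)
+display(parsed_files_df.groupBy("parser_status").count().toPandas())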
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC Show any parsing failures or successfully parsed files that resulted in an empty document.
+
+# COMMAND ----------
+
+# Any documents that failed to parse
+is_error, msg, failed_docs_df = check_parsed_df_for_errors(parsed_files_df)
+if is_error:
+    display(failed_docs_df.toPandas())
+    raise Exception(msg)
+
+# Any documents that returned empty parsing results
+is_error, msg, empty_docs_df = check_parsed_df_for_empty_parsed_files(parsed_files_df)
+if is_error:
+    display(empty_docs_df.toPandas())
+    raise Exception(msg)
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC #### Pipeline step 2: Compute chunks of documents
+# MAGIC
+# MAGIC In this step, we will split our documents into smaller chunks so they can be indexed in our vector database.
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC ##### ✅✏️ Chunking logic
+# MAGIC
+# MAGIC We provide a default implementation of a recursive text splitter. To create your own chunking logic, adapt the `get_recursive_character_text_splitter()` function inside `cookbook/data_pipeline/recursive_character_text_splitter.py`.
+
+# COMMAND ----------
+
+from cookbook.data_pipeline.recursive_character_text_splitter import (
+    get_recursive_character_text_splitter,
+)
+
+# Get the chunking function
+recursive_character_text_splitter_fn = get_recursive_character_text_splitter(
+    model_serving_endpoint=chunking_config.embedding_model_endpoint,
+    chunk_size_tokens=chunking_config.chunk_size_tokens,
+    chunk_overlap_tokens=chunking_config.chunk_overlap_tokens,
+)
+
+# Determine which columns to propagate from the docs table to the chunks table.
+
+# Get the columns from the parser except for the content
+# You can modify this to adjust which fields are propagated from the docs table to the chunks table.
+propagate_columns = [
+    field.name
+    for field in typed_dicts_to_spark_schema(ParserReturnValue).fields
+    if field.name != "content"
+]
+
+# If you want to implement retrieval strategies such as presenting the entire document vs. the chunk to the LLM, include `content`, which contains the doc's full parsed text. By default it is not included because `content` can be quite large and cause performance issues.
+# propagate_columns = [
+#     field.name
+#     for field in typed_dicts_to_spark_schema(ParserReturnValue).fields
+# ]
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC 🚫✏️ Run the chunking function within Spark.
+
+# COMMAND ----------
+
+from cookbook.data_pipeline.chunk_docs import apply_chunking_fn
+from cookbook.databricks_utils import get_table_url
+
+# Tune this parameter to optimize performance. More partitions will improve performance, but may cause out of memory errors if your cluster is too small.
+NUM_PARTITIONS = 50
+
+# Load parsed docs
+parsed_files_df = spark.table(output_config.parsed_docs_table).repartition(NUM_PARTITIONS)
+
+chunked_docs_df = apply_chunking_fn(
+    # The source documents table.
+    parsed_docs_df=parsed_files_df,
+    # The chunking function that takes a string (document) and returns a list of strings (chunks).
+    chunking_fn=recursive_character_text_splitter_fn,
+    # Choose which columns to propagate from the docs table to the chunks table. The `doc_uri` column is required so we can propagate the original document URL to the Agent's web app.
+    propagate_columns=propagate_columns,
+)
+
+# Write to Delta Table
+chunked_docs_df.write.mode("overwrite").option(
+    "overwriteSchema", "true"
+).saveAsTable(output_config.chunked_docs_table)
+
+# Get resulting table
+chunked_docs_df = spark.table(output_config.chunked_docs_table)
+
+# Show number of chunks created
+print(f"Created {chunked_docs_df.count()} chunks. 
Inspect `chunked_docs_df` or visit {get_table_url(output_config.chunked_docs_table)} to see the results.") + +# enable CDC feed for VS index sync +cdc_results = spark.sql(f"ALTER TABLE {output_config.chunked_docs_table} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)") + +# Show chunks +display(chunked_docs_df.toPandas()) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### 🚫✏️ Pipeline step 3: Create the vector index +# MAGIC +# MAGIC In this step, we'll embed the documents to compute the vector index over the chunks and create our retriever index that will be used to query relevant documents to the user question. The embedding pipeline is handled within Databricks Vector Search using [Delta Sync](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#create-a-vector-search-index) + +# COMMAND ---------- + +from cookbook.data_pipeline.build_retriever_index import build_retriever_index +from cookbook.databricks_utils import get_table_url + +is_error, msg = retriever_index_result = build_retriever_index( + # Spark requires `` to escape names with special chars, VS client does not. + chunked_docs_table_name=output_config.chunked_docs_table.replace("`", ""), + vector_search_endpoint=output_config.vector_search_endpoint, + vector_search_index_name=output_config.vector_index, + + # Must match the embedding endpoint you used to chunk your documents + embedding_endpoint_name=chunking_config.embedding_model_endpoint, + + # Set to true to re-create the vector search endpoint when re-running the data pipeline. If set to True, syncing will not work if re-run the pipeline and change the schema of chunked_docs_table_name. Keeping this as False will allow Vector Search to avoid recomputing embeddings for any row with that has a chunk_id that was previously computed. 
+ force_delete_index_before_create=False, +) +if is_error: + raise Exception(msg) +else: + print("NOTE: This cell will complete before the vector index has finished syncing/embedding your chunks & is ready for queries!") + print(f"View sync status here: {get_table_url(output_config.vector_index)}") + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### 🚫✏️ Print links to view the resulting tables/index + +# COMMAND ---------- + +from cookbook.databricks_utils import get_table_url + +print() +print(f"Parsed docs table: {get_table_url(output_config.parsed_docs_table)}\n") +print(f"Chunked docs table: {get_table_url(output_config.chunked_docs_table)}\n") +print(f"Vector search index: {get_table_url(output_config.vector_index)}\n") \ No newline at end of file diff --git a/autogen_agent_app_sample_code/02_agent_setup.ipynb b/autogen_agent_app_sample_code/02_agent_setup.ipynb new file mode 100644 index 0000000..ad0a5f2 --- /dev/null +++ b/autogen_agent_app_sample_code/02_agent_setup.ipynb @@ -0,0 +1,113 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC ## 👉 START HERE: How to use this notebook +# MAGIC +# MAGIC ### Step 1: Agent storage configuration +# MAGIC +# MAGIC This notebook initializes a `AgentStorageConfig` Pydantic class to define the locations where the Agent's code/config and its supporting data & metadata is stored in the Unity Catalog: +# MAGIC - **Unity Catalog Model:** Stores staging/production versions of the Agent's code/config +# MAGIC - **MLflow Experiment:** Stores every development version of the Agent's code/config, each version's associated quality/cost/latency evaluation results, and any MLflow Traces from your development & evaluation processes +# MAGIC - **Evaluation Set Delta Table:** Stores the Agent's evaluation set +# MAGIC +# MAGIC This notebook does the following: +# MAGIC 1. Validates the provided locations exist. +# MAGIC 2. Serializes this configuration to `config/agent_storage_config.yaml` so other notebooks can use it + +# COMMAND ---------- + +# MAGIC %md +# MAGIC **Important note:** Throughout this notebook, we indicate which cells you: +# MAGIC - ✅✏️ *should* customize - these cells contain config settings to change +# MAGIC - 🚫✏️ *typically will not* customize - these cells contain boilerplate code required to validate / save the configuration +# MAGIC +# MAGIC *Cells that don't require customization still need to be run!* + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 🚫✏️ Install Python libraries + +# COMMAND ---------- + +# MAGIC %pip install -qqqq -U -r requirements.txt +# MAGIC %restart_python + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 🚫✏️ Connect to Databricks +# MAGIC +# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set. 
+ +# COMMAND ---------- + +from mlflow.utils import databricks_utils as du +import os +if not du.is_in_databricks_notebook(): + from databricks.connect import DatabricksSession + + spark = DatabricksSession.builder.getOrCreate() + os.environ["MLFLOW_TRACKING_URI"] = "databricks" + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 🚫✏️ Get current user info to set default values + +# COMMAND ---------- + +from cookbook.databricks_utils import get_current_user_info + +user_email, user_name, default_catalog = get_current_user_info(spark) + +print(f"User email: {user_email}") +print(f"User name: {user_name}") +print(f"Default UC catalog: {default_catalog}") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### ✅✏️ Configure your Agent's storage locations +# MAGIC +# MAGIC Either review & accept the default values or enter your preferred location. + +# COMMAND ---------- + +from cookbook.config.shared.agent_storage_location import AgentStorageConfig +from cookbook.databricks_utils import get_mlflow_experiment_url +import mlflow + +# Default values below for `AgentStorageConfig` +agent_name = "my_agent_autogen" +uc_catalog_name = "casaman_ssa" +uc_schema_name = "demos" + +# Agent storage configuration +agent_storage_config = AgentStorageConfig( + uc_model_name=f"{uc_catalog_name}.{uc_schema_name}.{agent_name}", # UC model to store staging/production versions of the Agent's code/config + evaluation_set_uc_table=f"{uc_catalog_name}.{uc_schema_name}.{agent_name}_eval_set", # UC table to store the evaluation set + mlflow_experiment_name=f"/Users/{user_email}/{agent_name}_mlflow_experiment", # MLflow Experiment to store development versions of the Agent and their associated quality/cost/latency evaluation results + MLflow Traces +) + +# Validate the UC catalog and schema for the Agent'smodel & evaluation table +is_valid, msg = agent_storage_config.validate_catalog_and_schema() +if not is_valid: + raise Exception(msg) + +# Set the MLflow experiment, validating the path is valid +experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name) +# If running in a local IDE, set the MLflow experiment name as an environment variable +os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name + +print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 🚫✏️ Save the configuration for use by other notebooks + +# COMMAND ---------- + +from cookbook.config import serializable_config_to_yaml_file + +serializable_config_to_yaml_file(agent_storage_config, "./configs/agent_storage_config.yaml") \ No newline at end of file diff --git a/autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb b/autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb new file mode 100644 index 0000000..18c451e --- /dev/null +++ b/autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb @@ -0,0 +1,144 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC ## 👉 START HERE: How to use this notebook +# MAGIC +# MAGIC ### Step 1: Create synthetic evaluation data +# MAGIC +# MAGIC To measure your Agent's quality, you need a diverse, representative evaluation set. This notebook turns your unstructured documents into a high-quality synthetic evaluation set so that you can start to evaluate and improve your Agent's quality before subject matter experts are available to label data. +# MAGIC +# MAGIC This notebook does the following: +# MAGIC 1. 
Loads the source documents produced by the [data pipeline](01_data_pipeline.ipynb) (or any `doc_uri`/`content` documents you provide).
+# MAGIC 2. Generates a synthetic evaluation set from those documents using `generate_evals_df(...)`, optionally steered by guidelines you provide.
+# MAGIC 3. Appends the generated records to the evaluation set Delta Table configured in the [Agent setup](02_agent_setup.ipynb) notebook.
+# MAGIC
+# MAGIC **Note:** This notebook does not yet work from a local IDE; run it as a Databricks Notebook.
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC **Important note:** Throughout this notebook, we indicate which cells you:
+# MAGIC - ✅✏️ *should* customize - these cells contain config settings to change
+# MAGIC - 🚫✏️ *typically will not* customize - these cells contain code that is parameterized by your configuration.
+# MAGIC
+# MAGIC *Cells that don't require customization still need to be run!*
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Install Python libraries
+
+# COMMAND ----------
+
+# MAGIC %pip install -qqqq -U -r requirements.txt
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Connect to Databricks
+# MAGIC
+# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this is running in a Databricks Notebook, these values are already set.
+
+# COMMAND ----------
+
+from mlflow.utils import databricks_utils as du
+import os
+
+if not du.is_in_databricks_notebook():
+    from databricks.connect import DatabricksSession
+
+    spark = DatabricksSession.builder.getOrCreate()
+    os.environ["MLFLOW_TRACKING_URI"] = "databricks"
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Load the Agent's storage locations
+# MAGIC
+# MAGIC This notebook writes to the evaluation set table that you specified in the [Agent setup](02_agent_setup.ipynb) notebook.
+
+# COMMAND ----------
+
+from cookbook.config.shared.agent_storage_location import AgentStorageConfig
+from cookbook.databricks_utils import get_table_url
+from cookbook.config import load_serializable_config_from_yaml_file
+
+# Load the Agent's storage configuration
+agent_storage_config: AgentStorageConfig = load_serializable_config_from_yaml_file('./configs/agent_storage_config.yaml')
+
+# Check if the evaluation set already exists
+try:
+    eval_dataset = spark.table(agent_storage_config.evaluation_set_uc_table)
+    if eval_dataset.count() > 0:
+        print(f"Evaluation set {get_table_url(agent_storage_config.evaluation_set_uc_table)} already exists! By default, this notebook will append to the evaluation dataset. If you would like to overwrite the existing evaluation set, please delete the table before running this notebook.")
+    else:
+        print(f"Evaluation set {get_table_url(agent_storage_config.evaluation_set_uc_table)} exists, but is empty! By default, this notebook will NOT change the schema of this table - if you experience schema related errors, drop this table before running this notebook so it can be recreated with the correct schema.")
+except Exception:
+    print(f"Evaluation set `{agent_storage_config.evaluation_set_uc_table}` does not exist. This notebook will create a new Delta Table at this location.")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC #### ✅✏️ Load the source documents for synthetic evaluation data generation
+# MAGIC
+# MAGIC Most often, this will be the same as the document output table from the [data pipeline](01_data_pipeline.ipynb).
+# MAGIC
+# MAGIC Here, we provide code to load the documents table that was created in the [data pipeline](01_data_pipeline.ipynb).
+# MAGIC
+# MAGIC Alternatively, this can be a Spark DataFrame, Pandas DataFrame, or list of dictionaries with the following keys/columns (a minimal sketch of this alternative is shown in the optional cell below):
+# MAGIC - `doc_uri`: A URI pointing to the document.
+# MAGIC - `content`: The content of the document.
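+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC *(Optional reference)* The cell below is a minimal sketch of the "bring your own documents" alternative described above. The two example documents are made-up placeholders, and the cell is not required: the next cell loads `source_documents` from the data pipeline's parsed docs table, which is the default path.
+
+# COMMAND ----------
+
+# A minimal sketch of providing your own documents instead of the parsed docs table.
+# The doc_uri / content values below are placeholders - replace them with real documents.
+import pandas as pd
+
+example_source_documents = pd.DataFrame(
+    [
+        {
+            "doc_uri": "https://example.com/docs/spark-overview",
+            "content": "Apache Spark on Databricks lets you run distributed data processing workloads ...",
+        },
+        {
+            "doc_uri": "https://example.com/docs/delta-tables",
+            "content": "Delta tables add ACID transactions and schema enforcement on top of cloud object storage ...",
+        },
+    ]
+)
+
+# To use these documents instead of the pipeline output, pass them to the generation cell below,
+# e.g. generate_evals_df(docs=example_source_documents, ...), or assign them to `source_documents`.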
+ +# COMMAND ---------- + +from cookbook.config.data_pipeline import DataPipelineConfig +from cookbook.config import load_serializable_config_from_yaml_file + +datapipeline_config: DataPipelineConfig= load_serializable_config_from_yaml_file('./configs/data_pipeline_config.yaml') + +source_documents = spark.table(datapipeline_config.output.parsed_docs_table) + +display(source_documents.toPandas()) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### ✅✏️ Run the synthetic evaluation data generation +# MAGIC +# MAGIC Optionally, you can customize the guidelines to guide the synthetic data generation. By default, guidelines are not applied - to apply the guidelines, uncomment `guidelines=guidelines` in the `generate_evals_df(...)` call. See our [documentation](https://docs.databricks.com/en/generative-ai/agent-evaluation/synthesize-evaluation-set.html) for more details. + +# COMMAND ---------- + +from databricks.agents.evals import generate_evals_df + +# NOTE: The guidelines you provide are a free-form string. The markdown string below is the suggested formatting for the set of guidelines, however you are free +# to add your sections here. Note that this will be prompt-engineering an LLM that generates the synthetic data, so you may have to iterate on these guidelines before +# you get the results you desire. +guidelines = """ +# Task Description +The Agent is a RAG chatbot that answers questions about using Spark on Databricks. The Agent has access to a corpus of Databricks documents, and its task is to answer the user's questions by retrieving the relevant docs from the corpus and synthesizing a helpful, accurate response. The corpus covers a lot of info, but the Agent is specifically designed to interact with Databricks users who have questions about Spark. So questions outside of this scope are considered irrelevant. + +# User personas +- A developer who is new to the Databricks platform +- An experienced, highly technical Data Scientist or Data Engineer + +# Example questions +- what API lets me parallelize operations over rows of a delta table? +- Which cluster settings will give me the best performance when using Spark? + +# Additional Guidelines +- Questions should be succinct, and human-like +""" + +synthesized_evals_df = generate_evals_df( + docs=source_documents, + # The number of evaluations to generate for each doc. + num_evals=10, + # A optional set of guidelines that help guide the synthetic generation. This is a free-form string that will be used to prompt the generation. 
+ # guidelines=guidelines +) + +# Write the synthetic evaluation data to the evaluation set table +spark.createDataFrame(synthesized_evals_df).write.format("delta").mode("append").saveAsTable(agent_storage_config.evaluation_set_uc_table) + +# Display the synthetic evaluation data +eval_set_df = spark.table(agent_storage_config.evaluation_set_uc_table) +display(eval_set_df.toPandas()) \ No newline at end of file diff --git a/autogen_agent_app_sample_code/04_create_tools.ipynb b/autogen_agent_app_sample_code/04_create_tools.ipynb new file mode 100644 index 0000000..c7587b0 --- /dev/null +++ b/autogen_agent_app_sample_code/04_create_tools.ipynb @@ -0,0 +1,688 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC ## 👉 START HERE: How to use this notebook +# MAGIC +# MAGIC # Step 2: Create tools for your Agent +# MAGIC +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC **Important note:** Throughout this notebook, we indicate which cell's code you: +# MAGIC - ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality. +# MAGIC - 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent +# MAGIC +# MAGIC *Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.* + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 🚫✏️ Install Python libraries +# MAGIC +# MAGIC You do not need to modify this cell unless you need additional Python packages in your Agent. + +# COMMAND ---------- + +# MAGIC %pip install -qqqq -U -r requirements.txt +# MAGIC # Restart to load the packages into the Python environment +# MAGIC dbutils.library.restartPython() + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 🚫✏️ Connect to Databricks +# MAGIC +# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set. + +# COMMAND ---------- + +from mlflow.utils import databricks_utils as du +import os + +if not du.is_in_databricks_notebook(): + from databricks.connect import DatabricksSession + + spark = DatabricksSession.builder.getOrCreate() + os.environ["MLFLOW_TRACKING_URI"] = "databricks" + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment +# MAGIC +# MAGIC This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook. 
+ +# COMMAND ---------- + +from cookbook.config.shared.agent_storage_location import AgentStorageConfig +from cookbook.databricks_utils import get_mlflow_experiment_url +from cookbook.config import load_serializable_config_from_yaml_file +import mlflow + +# Load the Agent's storage locations +agent_storage_config: AgentStorageConfig= load_serializable_config_from_yaml_file("./configs/agent_storage_config.yaml") + +# Show the Agent's storage locations +agent_storage_config.pretty_print() + +# set the MLflow experiment +experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name) +# If running in a local IDE, set the MLflow experiment name as an environment variable +os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name + +print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # create tools +# MAGIC +# MAGIC - we will store all tools in the `user_tools` folder +# MAGIC - first, create a local function & test it with pytest +# MAGIC - then, deploy it as a UC tool & test it with pytest +# MAGIC - then, add the tool to the Agent + +# COMMAND ---------- + +# MAGIC %md +# MAGIC always reload the tool's code + +# COMMAND ---------- + +# MAGIC %load_ext autoreload +# MAGIC %autoreload 3 + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## lets do an example of a simple, but fake tool that translates old to new SKUs. + +# COMMAND ---------- + +# MAGIC %md +# MAGIC 1, create the python function that will become your UC function. you need to annotate the function with docstrings & type hints - these are used to create the tool's metadata in UC. + +# COMMAND ---------- + +# MAGIC %%writefile tools/sample_tool.py +# MAGIC +# MAGIC def sku_sample_translator(old_sku: str) -> str: +# MAGIC """ +# MAGIC Translates a pre-2024 SKU formatted as "OLD-XXX-YYYY" to the new SKU format "NEW-YYYY-XXX". +# MAGIC +# MAGIC Args: +# MAGIC old_sku (str): The old SKU in the format "OLD-XXX-YYYY". +# MAGIC +# MAGIC Returns: +# MAGIC str: The new SKU in the format "NEW-YYYY-XXX". +# MAGIC +# MAGIC Raises: +# MAGIC ValueError: If the SKU format is invalid, providing specific error details. 
+# MAGIC """ +# MAGIC import re +# MAGIC +# MAGIC if not isinstance(old_sku, str): +# MAGIC raise ValueError("SKU must be a string") +# MAGIC +# MAGIC # Normalize input by removing extra whitespace and converting to uppercase +# MAGIC old_sku = old_sku.strip().upper() +# MAGIC +# MAGIC # Define the regex pattern for the old SKU format +# MAGIC pattern = r"^OLD-([A-Z]{3})-(\d{4})$" +# MAGIC +# MAGIC # Match the old SKU against the pattern +# MAGIC match = re.match(pattern, old_sku) +# MAGIC if not match: +# MAGIC if not old_sku.startswith("OLD-"): +# MAGIC raise ValueError("SKU must start with 'OLD-'") +# MAGIC if not re.match(r"^OLD-[A-Z]{3}-\d{4}$", old_sku): +# MAGIC raise ValueError( +# MAGIC "SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit" +# MAGIC ) +# MAGIC raise ValueError("Invalid SKU format") +# MAGIC +# MAGIC # Extract the letter code and numeric part +# MAGIC letter_code, numeric_part = match.groups() +# MAGIC +# MAGIC # Additional validation for numeric part +# MAGIC if not (1 <= int(numeric_part) <= 9999): +# MAGIC raise ValueError("Numeric part must be between 0001 and 9999") +# MAGIC +# MAGIC # Construct the new SKU +# MAGIC new_sku = f"NEW-{numeric_part}-{letter_code}" +# MAGIC return new_sku +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now, let's import the tool and test it locally + +# COMMAND ---------- + +from tools.sample_tool import sku_sample_translator + +sku_sample_translator("OLD-XXX-1234") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC now, lets write some pyTest unit tests for the tool - these are just samples, you will need to write your own + +# COMMAND ---------- + +# MAGIC %%writefile tools/test_sample_tool.py +# MAGIC import pytest +# MAGIC from tools.sample_tool import sku_sample_translator +# MAGIC +# MAGIC +# MAGIC +# MAGIC def test_valid_sku_translation(): +# MAGIC """Test successful SKU translation with valid input.""" +# MAGIC assert sku_sample_translator("OLD-ABC-1234") == "NEW-1234-ABC" +# MAGIC assert sku_sample_translator("OLD-XYZ-0001") == "NEW-0001-XYZ" +# MAGIC assert sku_sample_translator("old-def-5678") == "NEW-5678-DEF" # Test case insensitivity +# MAGIC +# MAGIC +# MAGIC def test_whitespace_handling(): +# MAGIC """Test that the function handles extra whitespace correctly.""" +# MAGIC assert sku_sample_translator(" OLD-ABC-1234 ") == "NEW-1234-ABC" +# MAGIC assert sku_sample_translator("\tOLD-ABC-1234\n") == "NEW-1234-ABC" +# MAGIC +# MAGIC +# MAGIC def test_invalid_input_type(): +# MAGIC """Test that non-string inputs raise ValueError.""" +# MAGIC with pytest.raises(ValueError, match="SKU must be a string"): +# MAGIC sku_sample_translator(123) +# MAGIC with pytest.raises(ValueError, match="SKU must be a string"): +# MAGIC sku_sample_translator(None) +# MAGIC +# MAGIC +# MAGIC def test_invalid_prefix(): +# MAGIC """Test that SKUs not starting with 'OLD-' raise ValueError.""" +# MAGIC with pytest.raises(ValueError, match="SKU must start with 'OLD-'"): +# MAGIC sku_sample_translator("NEW-ABC-1234") +# MAGIC with pytest.raises(ValueError, match="SKU must start with 'OLD-'"): +# MAGIC sku_sample_translator("XXX-ABC-1234") +# MAGIC +# MAGIC +# MAGIC def test_invalid_format(): +# MAGIC """Test various invalid SKU formats.""" +# MAGIC invalid_skus = [ +# MAGIC "OLD-AB-1234", # Too few letters +# MAGIC "OLD-ABCD-1234", # Too many letters +# MAGIC "OLD-123-1234", # Numbers instead of letters +# MAGIC "OLD-ABC-123", # Too few digits +# MAGIC "OLD-ABC-12345", # Too many digits +# MAGIC "OLD-ABC-XXXX", # Letters 
instead of numbers +# MAGIC "OLD-A1C-1234", # Mixed letters and numbers in middle +# MAGIC ] +# MAGIC +# MAGIC for sku in invalid_skus: +# MAGIC with pytest.raises( +# MAGIC ValueError, +# MAGIC match="SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit", +# MAGIC ): +# MAGIC sku_sample_translator(sku) +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC now, lets run the tests + +# COMMAND ---------- + +import pytest + +# Run tests from test_sku_translator.py +pytest.main(["-v", "tools/test_sample_tool.py"]) + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now, lets deploy the tool to Unity catalog. + +# COMMAND ---------- + +from unitycatalog.ai.core.databricks import DatabricksFunctionClient +from tools.sample_tool import sku_sample_translator + +client = DatabricksFunctionClient() +CATALOG = "casaman_ssa" # Change me! +SCHEMA = "demos" # Change me if you want + +# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints +tool_uc_info = client.create_python_function(func=sku_sample_translator, catalog=CATALOG, schema=SCHEMA, replace=True) + +# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function +# Print the deployed Unity Catalog function name +print(f"Deployed Unity Catalog function name: {tool_uc_info.full_name}") + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now, wrap it into a UCTool that will be used by our Agent. UC tool is just a Pydnatic base model that is serializable to YAML that will load the tool's metadata from UC and wrap it in a callable object. + +# COMMAND ---------- + +from cookbook.tools.uc_tool import UCTool + +# wrap the tool into a UCTool which can be passed to our Agent +translate_sku_tool = UCTool(uc_function_name=tool_uc_info.full_name) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now, let's test the UC tool - the UCTool is a directly callable wrapper around the UC function, so it can be used just like a local function, but the output will be put into a dictionary with either the output in a 'value' key or an 'error' key if an error is raised. +# MAGIC +# MAGIC when an error happens, the UC tool will also return an instruction prompt to show the agent how to think about handling the error. this can be changed via the `error_prompt` parameter in the UCTool.. +# MAGIC + +# COMMAND ---------- + +# successful call +translate_sku_tool(old_sku="OLD-XXX-1234") + +# COMMAND ---------- + +# unsuccessful call +translate_sku_tool(old_sku="OxxLD-XXX-1234") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC now, let's convert our pytests to work with the UC tool. this requires a bit of transformation to the test code to account for the fact that the output is in a dictionary & exceptions are not raised directly. + +# COMMAND ---------- + +# MAGIC %%writefile tools/test_sample_tool_uc.py +# MAGIC import pytest +# MAGIC from cookbook.tools.uc_tool import UCTool +# MAGIC +# MAGIC # Load the function from the UCTool versus locally +# MAGIC @pytest.fixture +# MAGIC def uc_tool(): +# MAGIC """Fixture to translate a UC tool into a local function.""" +# MAGIC UC_FUNCTION_NAME = "ep.cookbook_local_test.sku_sample_translator" +# MAGIC loaded_tool = UCTool(uc_function_name=UC_FUNCTION_NAME) +# MAGIC return loaded_tool +# MAGIC +# MAGIC +# MAGIC # Note: The value will be post processed into the `value` key, so we must check the returned value there. 
+# MAGIC def test_valid_sku_translation(uc_tool): +# MAGIC """Test successful SKU translation with valid input.""" +# MAGIC assert uc_tool(old_sku="OLD-ABC-1234")["value"] == "NEW-1234-ABC" +# MAGIC assert uc_tool(old_sku="OLD-XYZ-0001")["value"] == "NEW-0001-XYZ" +# MAGIC assert ( +# MAGIC uc_tool(old_sku="old-def-5678")["value"] == "NEW-5678-DEF" +# MAGIC ) # Test case insensitivity +# MAGIC +# MAGIC +# MAGIC # Note: The value will be post processed into the `value` key, so we must check the returned value there. +# MAGIC def test_whitespace_handling(uc_tool): +# MAGIC """Test that the function handles extra whitespace correctly.""" +# MAGIC assert uc_tool(old_sku=" OLD-ABC-1234 ")["value"] == "NEW-1234-ABC" +# MAGIC assert uc_tool(old_sku="\tOLD-ABC-1234\n")["value"] == "NEW-1234-ABC" +# MAGIC +# MAGIC +# MAGIC # Note: the input validation happens BEFORE the function is called by Spark, so we will never get these exceptions from the function. +# MAGIC # Instead, we will get invalid parameters errors from Spark. +# MAGIC def test_invalid_input_type(uc_tool): +# MAGIC """Test that non-string inputs raise ValueError.""" +# MAGIC assert ( +# MAGIC uc_tool(old_sku=123)["error"]["error_message"] +# MAGIC == """Invalid parameters provided: {'old_sku': "Parameter old_sku should be of type STRING (corresponding python type ), but got "}.""" +# MAGIC ) +# MAGIC assert ( +# MAGIC uc_tool(old_sku=None)["error"]["error_message"] +# MAGIC == """Invalid parameters provided: {'old_sku': "Parameter old_sku should be of type STRING (corresponding python type ), but got "}.""" +# MAGIC ) +# MAGIC +# MAGIC +# MAGIC # Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there. +# MAGIC def test_invalid_prefix(uc_tool): +# MAGIC """Test that SKUs not starting with 'OLD-' raise ValueError.""" +# MAGIC assert ( +# MAGIC uc_tool(old_sku="NEW-ABC-1234")["error"]["error_message"] +# MAGIC == "ValueError: SKU must start with 'OLD-'" +# MAGIC ) +# MAGIC assert ( +# MAGIC uc_tool(old_sku="XXX-ABC-1234")["error"]["error_message"] +# MAGIC == "ValueError: SKU must start with 'OLD-'" +# MAGIC ) +# MAGIC +# MAGIC +# MAGIC # Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there. +# MAGIC def test_invalid_format(uc_tool): +# MAGIC """Test various invalid SKU formats.""" +# MAGIC invalid_skus = [ +# MAGIC "OLD-AB-1234", # Too few letters +# MAGIC "OLD-ABCD-1234", # Too many letters +# MAGIC "OLD-123-1234", # Numbers instead of letters +# MAGIC "OLD-ABC-123", # Too few digits +# MAGIC "OLD-ABC-12345", # Too many digits +# MAGIC "OLD-ABC-XXXX", # Letters instead of numbers +# MAGIC "OLD-A1C-1234", # Mixed letters and numbers in middle +# MAGIC ] +# MAGIC +# MAGIC expected_error = "ValueError: SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit" +# MAGIC for sku in invalid_skus: +# MAGIC assert uc_tool(old_sku=sku)["error"]["error_message"] == expected_error +# MAGIC + +# COMMAND ---------- + +import pytest + +# Run tests from test_sku_translator.py +pytest.main(["-v", "tools/test_sample_tool_uc.py"]) + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC # Now, here's another example of a tool that executes python code. + +# COMMAND ---------- + +# MAGIC %%writefile tools/code_exec.py +# MAGIC def python_exec(code: str) -> str: +# MAGIC """ +# MAGIC Executes Python code in the sandboxed environment and returns its stdout. 
The runtime is stateless and you can not read output of the previous tool executions. i.e. No such variables "rows", "observation" defined. Calling another tool inside a Python code is NOT allowed. +# MAGIC Use only standard python libraries and these python libraries: bleach, chardet, charset-normalizer, defusedxml, googleapis-common-protos, grpcio, grpcio-status, jmespath, joblib, numpy, packaging, pandas, patsy, protobuf, pyarrow, pyparsing, python-dateutil, pytz, scikit-learn, scipy, setuptools, six, threadpoolctl, webencodings, user-agents, cryptography. +# MAGIC +# MAGIC Args: +# MAGIC code (str): Python code to execute. Remember to print the final result to stdout. +# MAGIC +# MAGIC Returns: +# MAGIC str: The output of the executed code. +# MAGIC """ +# MAGIC import sys +# MAGIC from io import StringIO +# MAGIC +# MAGIC sys_stdout = sys.stdout +# MAGIC redirected_output = StringIO() +# MAGIC sys.stdout = redirected_output +# MAGIC exec(code) +# MAGIC sys.stdout = sys_stdout +# MAGIC return redirected_output.getvalue() +# MAGIC + +# COMMAND ---------- + +from tools.code_exec import python_exec + +python_exec("print('hello')") + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Test it locally + +# COMMAND ---------- + +# MAGIC %%writefile tools/test_code_exec.py +# MAGIC +# MAGIC import pytest +# MAGIC from .code_exec import python_exec +# MAGIC +# MAGIC +# MAGIC def test_basic_arithmetic(): +# MAGIC code = """result = 2 + 2\nprint(result)""" +# MAGIC assert python_exec(code).strip() == "4" +# MAGIC +# MAGIC +# MAGIC def test_multiple_lines(): +# MAGIC code = "x = 5\n" "y = 3\n" "result = x * y\n" "print(result)" +# MAGIC assert python_exec(code).strip() == "15" +# MAGIC +# MAGIC +# MAGIC def test_multiple_prints(): +# MAGIC code = """print('first')\nprint('second')\nprint('third')\n""" +# MAGIC expected = "first\nsecond\nthird\n" +# MAGIC assert python_exec(code) == expected +# MAGIC +# MAGIC +# MAGIC def test_using_pandas(): +# MAGIC code = ( +# MAGIC "import pandas as pd\n" +# MAGIC "data = {'col1': [1, 2], 'col2': [3, 4]}\n" +# MAGIC "df = pd.DataFrame(data)\n" +# MAGIC "print(df.shape)" +# MAGIC ) +# MAGIC assert python_exec(code).strip() == "(2, 2)" +# MAGIC +# MAGIC +# MAGIC def test_using_numpy(): +# MAGIC code = "import numpy as np\n" "arr = np.array([1, 2, 3])\n" "print(arr.mean())" +# MAGIC assert python_exec(code).strip() == "2.0" +# MAGIC +# MAGIC +# MAGIC def test_syntax_error(): +# MAGIC code = "if True\n" " print('invalid syntax')" +# MAGIC with pytest.raises(SyntaxError): +# MAGIC python_exec(code) +# MAGIC +# MAGIC +# MAGIC def test_runtime_error(): +# MAGIC code = "x = 1 / 0\n" "print(x)" +# MAGIC with pytest.raises(ZeroDivisionError): +# MAGIC python_exec(code) +# MAGIC +# MAGIC +# MAGIC def test_undefined_variable(): +# MAGIC code = "print(undefined_variable)" +# MAGIC with pytest.raises(NameError): +# MAGIC python_exec(code) +# MAGIC +# MAGIC +# MAGIC def test_multiline_string_manipulation(): +# MAGIC code = "text = '''\n" "Hello\n" "World\n" "'''\n" "print(text.strip())" +# MAGIC expected = "Hello\nWorld" +# MAGIC assert python_exec(code).strip() == expected +# MAGIC +# MAGIC # Will not fail locally, but will fail in UC. 
+# MAGIC # def test_unauthorized_flask(): +# MAGIC # code = "from flask import Flask\n" "app = Flask(__name__)\n" "print(app)" +# MAGIC # with pytest.raises(ImportError): +# MAGIC # python_exec(code) +# MAGIC +# MAGIC +# MAGIC def test_no_print_statement(): +# MAGIC code = "x = 42\n" "y = x * 2" +# MAGIC assert python_exec(code) == "" +# MAGIC +# MAGIC +# MAGIC def test_calculation_without_print(): +# MAGIC code = "result = sum([1, 2, 3, 4, 5])\n" "squared = [x**2 for x in range(5)]" +# MAGIC assert python_exec(code) == "" +# MAGIC +# MAGIC +# MAGIC def test_function_definition_without_call(): +# MAGIC code = "def add(a, b):\n" " return a + b\n" "result = add(3, 4)" +# MAGIC assert python_exec(code) == "" +# MAGIC +# MAGIC +# MAGIC def test_class_definition_without_instantiation(): +# MAGIC code = ( +# MAGIC "class Calculator:\n" +# MAGIC " def add(self, a, b):\n" +# MAGIC " return a + b\n" +# MAGIC "calc = Calculator()" +# MAGIC ) +# MAGIC assert python_exec(code) == "" +# MAGIC + +# COMMAND ---------- + +import pytest + +# Run tests from test_sku_translator.py +pytest.main(["-v", "tools/test_code_exec.py"]) + + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Deploy to UC + +# COMMAND ---------- + +from unitycatalog.ai.core.databricks import DatabricksFunctionClient +from tools.code_exec import python_exec +from cookbook.tools.uc_tool import UCTool + +client = DatabricksFunctionClient() +CATALOG = "casaman_ssa" # Change me! +SCHEMA = "demos" # Change me if you want + +# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints +python_exec_tool_uc_info = client.create_python_function(func=python_exec, catalog=CATALOG, schema=SCHEMA, replace=True) + +# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function +# Print the deployed Unity Catalog function name +print(f"Deployed Unity Catalog function name: {python_exec_tool_uc_info.full_name}") + + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Test as UC Tool for the Agent + +# COMMAND ---------- + +from cookbook.tools.uc_tool import UCTool + + +# wrap the tool into a UCTool which can be passed to our Agent +python_exec_tool = UCTool(uc_function_name=python_exec_tool_uc_info.full_name) + +python_exec_tool(code="print('hello')") + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC New tests + +# COMMAND ---------- + +# MAGIC %%writefile tools/test_code_exec_as_uc_tool.py +# MAGIC +# MAGIC import pytest +# MAGIC from cookbook.tools.uc_tool import UCTool +# MAGIC +# MAGIC CATALOG = "ep" +# MAGIC SCHEMA = "cookbook_local_test" +# MAGIC +# MAGIC +# MAGIC @pytest.fixture +# MAGIC def python_exec(): +# MAGIC """Fixture to provide the python_exec function from UCTool.""" +# MAGIC python_exec_tool = UCTool(uc_function_name=f"{CATALOG}.{SCHEMA}.python_exec") +# MAGIC return python_exec_tool +# MAGIC +# MAGIC +# MAGIC def test_basic_arithmetic(python_exec): +# MAGIC code = """result = 2 + 2\nprint(result)""" +# MAGIC assert python_exec(code=code)["value"].strip() == "4" +# MAGIC +# MAGIC +# MAGIC def test_multiple_lines(python_exec): +# MAGIC code = "x = 5\n" "y = 3\n" "result = x * y\n" "print(result)" +# MAGIC assert python_exec(code=code)["value"].strip() == "15" +# MAGIC +# MAGIC +# MAGIC def test_multiple_prints(python_exec): +# MAGIC code = """print('first')\nprint('second')\nprint('third')\n""" +# MAGIC expected = "first\nsecond\nthird\n" +# MAGIC assert python_exec(code=code)["value"] == expected +# MAGIC +# MAGIC +# MAGIC def 
test_using_pandas(python_exec): +# MAGIC code = ( +# MAGIC "import pandas as pd\n" +# MAGIC "data = {'col1': [1, 2], 'col2': [3, 4]}\n" +# MAGIC "df = pd.DataFrame(data)\n" +# MAGIC "print(df.shape)" +# MAGIC ) +# MAGIC assert python_exec(code=code)["value"].strip() == "(2, 2)" +# MAGIC +# MAGIC +# MAGIC def test_using_numpy(python_exec): +# MAGIC code = "import numpy as np\n" "arr = np.array([1, 2, 3])\n" "print(arr.mean())" +# MAGIC assert python_exec(code=code)["value"].strip() == "2.0" +# MAGIC +# MAGIC +# MAGIC def test_syntax_error(python_exec): +# MAGIC code = "if True\n" " print('invalid syntax')" +# MAGIC result = python_exec(code=code) +# MAGIC assert "Syntax error at or near 'invalid'." in result["error"]["error_message"] +# MAGIC +# MAGIC +# MAGIC def test_runtime_error(python_exec): +# MAGIC code = "x = 1 / 0\n" "print(x)" +# MAGIC result = python_exec(code=code) +# MAGIC assert "ZeroDivisionError" in result["error"]["error_message"] +# MAGIC +# MAGIC +# MAGIC def test_undefined_variable(python_exec): +# MAGIC code = "print(undefined_variable)" +# MAGIC result = python_exec(code=code) +# MAGIC assert "NameError" in result["error"]["error_message"] +# MAGIC +# MAGIC +# MAGIC def test_multiline_string_manipulation(python_exec): +# MAGIC code = "text = '''\n" "Hello\n" "World\n" "'''\n" "print(text.strip())" +# MAGIC expected = "Hello\nWorld" +# MAGIC assert python_exec(code=code)["value"].strip() == expected +# MAGIC +# MAGIC +# MAGIC def test_unauthorized_flask(python_exec): +# MAGIC code = "from flask import Flask\n" "app = Flask(__name__)\n" "print(app)" +# MAGIC result = python_exec(code=code) +# MAGIC assert ( +# MAGIC "ModuleNotFoundError: No module named 'flask'" +# MAGIC in result["error"]["error_message"] +# MAGIC ) +# MAGIC +# MAGIC +# MAGIC def test_no_print_statement(python_exec): +# MAGIC code = "x = 42\n" "y = x * 2" +# MAGIC assert python_exec(code=code)["value"] == "" +# MAGIC +# MAGIC +# MAGIC def test_calculation_without_print(python_exec): +# MAGIC code = "result = sum([1, 2, 3, 4, 5])\n" "squared = [x**2 for x in range(5)]" +# MAGIC assert python_exec(code=code)["value"] == "" +# MAGIC +# MAGIC +# MAGIC def test_function_definition_without_call(python_exec): +# MAGIC code = "def add(a, b):\n" " return a + b\n" "result = add(3, 4)" +# MAGIC assert python_exec(code=code)["value"] == "" +# MAGIC +# MAGIC +# MAGIC def test_class_definition_without_instantiation(python_exec): +# MAGIC code = ( +# MAGIC "class Calculator:\n" +# MAGIC " def add(self, a, b):\n" +# MAGIC " return a + b\n" +# MAGIC "calc = Calculator()" +# MAGIC ) +# MAGIC assert python_exec(code=code)["value"] == "" +# MAGIC + +# COMMAND ---------- + +import pytest + +# Run tests from test_sku_translator.py +pytest.main(["-v", "tools/test_code_exec_as_uc_tool.py"]) + diff --git a/autogen_agent_app_sample_code/05_tool_calling_agent.ipynb b/autogen_agent_app_sample_code/05_tool_calling_agent.ipynb new file mode 100644 index 0000000..aa72187 --- /dev/null +++ b/autogen_agent_app_sample_code/05_tool_calling_agent.ipynb @@ -0,0 +1,445 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC ## 👉 START HERE: How to use this notebook +# MAGIC +# MAGIC # Step 3: Build, evaluate, & deploy your Agent +# MAGIC +# MAGIC Use this notebook to iterate on the code and configuration of your Agent. +# MAGIC +# MAGIC By the end of this notebook, you will have 1+ registered versions of your Agent, each coupled with a detailed quality evaluation. 
+# MAGIC
+# MAGIC Optionally, you can deploy a version of your Agent that you can interact with in the [Mosaic AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) and let your business stakeholders who don't have Databricks accounts interact with it & provide feedback in the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui).
+# MAGIC
+# MAGIC
+# MAGIC For each version of your agent, you will have an MLflow run inside your MLflow experiment that contains:
+# MAGIC - Your Agent's code & config
+# MAGIC - Evaluation metrics for cost, quality, and latency
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC
+# MAGIC **Important note:** Throughout this notebook, we indicate which cells' code you:
+# MAGIC - ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality.
+# MAGIC - 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent
+# MAGIC
+# MAGIC *Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.*
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Install Python libraries
+# MAGIC
+# MAGIC You do not need to modify this cell unless you need additional Python packages in your Agent.
+
+# COMMAND ----------
+
+# MAGIC %pip install -qqqq -U -r requirements.txt
+# MAGIC # Restart to load the packages into the Python environment
+# MAGIC dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Connect to Databricks
+# MAGIC
+# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this is running in a Databricks Notebook, these values are already set.
+
+# COMMAND ----------
+
+from mlflow.utils import databricks_utils as du
+import os
+
+if not du.is_in_databricks_notebook():
+    from databricks.connect import DatabricksSession
+
+    spark = DatabricksSession.builder.getOrCreate()
+    os.environ["MLFLOW_TRACKING_URI"] = "databricks"
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment
+# MAGIC
+# MAGIC This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook.
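+# MAGIC
+# MAGIC As a rough sketch, `./configs/agent_storage_config.yaml` contains at least the three fields that the cells below read (`uc_model_name`, `evaluation_set_uc_table`, `mlflow_experiment_name`). The values shown here are placeholders, and the serializer may write additional metadata keys:
+# MAGIC
+# MAGIC ```yaml
+# MAGIC # Placeholder values - yours were set in 02_agent_setup.ipynb
+# MAGIC uc_model_name: my_catalog.my_schema.my_agent
+# MAGIC evaluation_set_uc_table: my_catalog.my_schema.my_agent_eval_set
+# MAGIC mlflow_experiment_name: /Users/first.last@company.com/my_agent_mlflow_experiment
+# MAGIC ```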
+
+# COMMAND ----------
+
+from cookbook.config.shared.agent_storage_location import AgentStorageConfig
+from cookbook.databricks_utils import get_mlflow_experiment_url
+from cookbook.config import load_serializable_config_from_yaml_file
+import mlflow
+
+# Load the Agent's storage locations
+agent_storage_config: AgentStorageConfig = load_serializable_config_from_yaml_file("./configs/agent_storage_config.yaml")
+
+# Show the Agent's storage locations
+agent_storage_config.pretty_print()
+
+# Set the MLflow experiment
+experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name)
+# If running in a local IDE, set the MLflow experiment name as an environment variable
+os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name
+
+print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Helper method to log the Agent's code & config to MLflow
+# MAGIC
+# MAGIC Before we start, let's define a helper method to log the Agent's code & config to MLflow. We will use this logged model for evaluation, for registering the Agent to Unity Catalog, for deploying to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) (a chat UI for your stakeholders to test this agent) and, later, for deploying the Agent to production.
+
+# COMMAND ----------
+
+
+import mlflow
+from mlflow.types.llm import CHAT_MODEL_INPUT_SCHEMA
+from mlflow.models.rag_signatures import StringResponse
+from mlflow.models import ModelConfig
+from cookbook.agents.utils.signatures import STRING_RESPONSE_WITH_MESSAGES
+from mlflow.models.signature import ModelSignature
+from cookbook.agents.function_calling_agent import FunctionCallingAgent
+from cookbook.config.agents.function_calling_agent import FunctionCallingAgentConfig
+
+# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI
+# If run from inside a mlflow.start_run() block, it will log to that run, otherwise it will log to a new run.
+# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy!
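+#
+# Example usage (this is exactly how the evaluation cell later in this notebook calls it):
+#   with mlflow.start_run():
+#       logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config)
+#   print(logged_agent_info.model_uri)  # URI later passed to mlflow.evaluate() and mlflow.register_model()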
+def log_function_calling_agent_to_mlflow(agent_config: FunctionCallingAgentConfig): + # Get the agent's code path from the imported Agent class + agent_code_path = f"{os.getcwd()}/{FunctionCallingAgent.__module__.replace('.', '/')}.py" + + # Get the pip requirements from the requirements.txt file + with open("requirements.txt", "r") as file: + pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark + + logged_agent_info = mlflow.pyfunc.log_model( + artifact_path="agent", + python_model=agent_code_path, + input_example=agent_config.input_example, + model_config=agent_config.model_dump(), + resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc + signature=ModelSignature( + inputs=CHAT_MODEL_INPUT_SCHEMA, + # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature + outputs=StringResponse() + ), + code_paths=[os.path.join(os.getcwd(), "cookbook")], + pip_requirements=pip_requirements, + ) + + return logged_agent_info + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC ## 1️⃣ Iterate on the Agent's code & config to improve quality +# MAGIC +# MAGIC The below cells are used to execute your inner dev loop to improve the Agent's quality. +# MAGIC +# MAGIC We suggest the following process: +# MAGIC 1. Vibe check the Agent for 5 - 10 queries to verify it works +# MAGIC 2. Make any necessary changes to the code/config +# MAGIC 3. Use Agent Evaluation to evaluate the Agent using your evaluation set, which will provide a quality assessment & identify the root causes of any quality issues +# MAGIC 4. Based on that evaluation, make & test changes to the code/config to improve quality +# MAGIC 5. 🔁 Repeat steps 3 and 4 until you are satisified with the Agent's quality +# MAGIC 6. Deploy the Agent to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) for pre-production testing +# MAGIC 7. Use the following notebooks to review that feedback (optionally adding new records to your evaluation set) & identify any further quality issues +# MAGIC 8. 🔁 Repeat steps 3 and 4 to fix any issues identified in step 7 +# MAGIC 9. Deploy the Agent to a production-ready REST API endpoint (using the same cells in this notebook as step 6) +# MAGIC + +# COMMAND ---------- + +# Import Cookbook Agent configurations, which are Pydantic models +from cookbook.config import serializable_config_to_yaml_file +from cookbook.config.agents.function_calling_agent import ( + FunctionCallingAgentConfig, +) +from cookbook.config.data_pipeline import ( + DataPipelineConfig, +) +from cookbook.config.shared.llm import LLMConfig, LLMParametersConfig +from cookbook.config import load_serializable_config_from_yaml_file +from cookbook.tools.vector_search import ( + VectorSearchRetrieverTool, + VectorSearchSchema, +) +from cookbook.tools.uc_tool import UCTool + +import json +import mlflow +import yaml + +######################## +# #### 🚫✏️ Load the Vector Index Unity Cataloglocation from the data pipeline configuration +# Usage: +# - If you used `01_data_pipeline` to create your Vector Index, run this cell. +# - If your Vector Index was created elsewhere, comment out this logic and set the UC location in the Retriever config. 
+######################## + +#data_pipeline_config: DataPipelineConfig = #load_serializable_config_from_yaml_file( +# "./configs/data_pipeline_config.yaml" +#) + +######################## +# #### ✅✏️ Retriever tool that connects to the Vector Search index +######################## + +retriever_tool = VectorSearchRetrieverTool( + name="search_product_docs", + description="Use this tool to search for product documentation.", + vector_search_index="casaman_ssa.demos.test_product_docs_docs_chunked_index__v2", + vector_search_schema=VectorSearchSchema( + # These columns are the default values used in the `01_data_pipeline` notebook + # If you used a different column names in that notebook OR you are using a pre-built vector index, update the column names here. + chunk_text="content_chunked", # Contains the text of each document chunk + document_uri="doc_uri", # The document URI of the chunk e.g., "/Volumes/catalog/schema/volume/file.pdf" - displayed as the document ID in the Review App + # additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM + ), + # Optional parameters, see VectorSearchRetrieverTool.__doc__ for details. The default values are shown below. + # doc_similarity_threshold=0.0, + # vector_search_parameters=VectorSearchParameters( + # num_results=5, + # query_type="ann" + # ), + # Adding columns here will allow the Agent's LLM to dynamically apply filters based on the user's query. + # filterable_columns=[] +) + +######################## +# #### ✅✏️ Add Unity Catalog tools to the Agent +######################## + +translate_sku_tool = UCTool(uc_function_name="casaman_ssa.demos.sku_sample_translator") + + +######################## +# #### ✅✏️ Add a local Python function as a tool in the Agent +######################## + +from cookbook.tools.local_function import LocalFunctionTool +from tools.sample_tool import sku_sample_translator + +# translate_sku_tool = LocalFunctionTool(func=translate_sku, description="Translates a pre-2024 SKU formatted as 'OLD-XXX-YYYY' to the new SKU format 'NEW-YYYY-XXX'.") + +tools = [retriever_tool, translate_sku_tool] + +######################## +#### ✅✏️ Agent's LLM configuration +######################## + +system_prompt = """ +## Role +You are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request. + +## Objective +Your goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses. + +## Instructions +1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. + +2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query. + +3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: "I'm sorry, I can't help you with that." 
+""".strip() + +fc_agent_config = FunctionCallingAgentConfig( + llm_config=LLMConfig( + llm_endpoint_name="casaman-gpt4", # Model serving endpoint w/ a Chat Completions API + llm_system_prompt_template=system_prompt, # System prompt template + llm_parameters=LLMParametersConfig( + temperature=0.01, max_tokens=1500 + ), # LLM parameters + ), + # Add one or more tools that comply with the CookbookTool interface + tools=tools, +) + +# Print the configuration as a JSON string to see it all together +# print(json.dumps(fc_agent_config.model_dump(), indent=4)) + +######################## +##### Dump the configuration to a YAML +# Optional step, this allows the Agent's code file to be run by itself (e.g., outside of this notebook) using the above configuration. +######################## +# Import the default YAML config file name from the Agent's code file +from cookbook.agents.function_calling_agent import FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME + +# Dump the configuration to a YAML file +serializable_config_to_yaml_file(fc_agent_config, "./configs/"+FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### ✅✏️ Optionally, adjust the Agent's code +# MAGIC +# MAGIC Here, we import the Agent's code so we can run the Agent locally within the notebook. To modify the code, open the Agent's code file in a separate window, enable reload, make your changes, and re-run this cell. +# MAGIC +# MAGIC **Typically, when building the first version of your agent, we suggest first trying to tune the configuration (prompts, etc) to improve quality. If you need more control to fix quality issues, you can then modify the Agent's code.** + +# COMMAND ---------- + +from cookbook.agents.function_calling_agent import FunctionCallingAgent +import inspect + +# Print the Agent code for inspection +print(inspect.getsource(FunctionCallingAgent)) + +# COMMAND ---------- + +# MAGIC %load_ext autoreload +# MAGIC %autoreload 3 + +# COMMAND ---------- + +# MAGIC %md +# MAGIC #### ✅✏️ 🅰 Vibe check the Agent for a single query +# MAGIC +# MAGIC Running this cell will produce an MLflow Trace that you can use to see the Agent's outputs and understand the steps it took to produce that output. +# MAGIC +# MAGIC If you are running in a local IDE, browse to the MLflow Experiment page to view the Trace (link to the Experiment UI is at the top of this notebook). If running in a Databricks Notebook, your trace will appear inline below. + +# COMMAND ---------- + +from cookbook.databricks_utils import get_mlflow_experiment_traces_url +from cookbook.agents.function_calling_agent import FunctionCallingAgent + +# Load the Agent's code with the above configuration +agent = FunctionCallingAgent(agent_config=fc_agent_config) + +# Vibe check the Agent for a single query +output = agent.predict(model_input={"messages": [{"role": "user", "content": "What is mlflow in databricks?"}]}) +# output = agent.predict(model_input={"messages": [{"role": "user", "content": "Translate the sku `OLD-abs-1234` to the new format"}]}) + +print(f"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}") +print(f"Agent's final response:\n----\n{output['content']}\n----") +print() +# print(f"Agent's full message history (useful for debugging):\n----\n{json.dumps(output['messages'], indent=2)}\n----") + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Now, let's test a multi-turn conversation with the Agent. 
+
+# COMMAND ----------
+
+output['content']
+
+# COMMAND ----------
+
+second_turn = {'messages': output['messages'] + [{"role": "user", "content": "How can I use it for versioning my model?"}]}
+
+# Run the Agent again with the new user turn appended to continue the conversation
+second_turn_output = agent.predict(model_input=second_turn)
+
+print(f"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}")
+print(f"Agent's final response:\n----\n{second_turn_output['content']}\n----")
+print()
+print(f"Agent's full message history (useful for debugging):\n----\n{json.dumps(second_turn_output['messages'], indent=2)}\n----")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC #### ✅✏️ 🅱 Evaluate the Agent using your evaluation set
+# MAGIC
+# MAGIC Note: If you do not have an evaluation set, you can create a synthetic evaluation set by using the [create synthetic eval](03_create_synthetic_eval.ipynb) notebook.
+
+# COMMAND ----------
+
+evaluation_set = spark.table(agent_storage_config.evaluation_set_uc_table)
+
+mlflow.langchain.autolog(disable=True, log_traces=False)
+mlflow.autogen.autolog(log_traces=False)
+
+with mlflow.start_run():
+    logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config)
+
+    # Run the agent for these queries, using Agent evaluation to parallelize the calls
+    eval_results = mlflow.evaluate(
+        model=logged_agent_info.model_uri,  # use the MLflow logged Agent
+        data=evaluation_set,  # Evaluate the Agent for every row of the evaluation set
+        model_type="databricks-agent",  # use Agent Evaluation
+    )
+
+    # Show all outputs.  Click on a row in this table to display the MLflow Trace.
+    display(eval_results.tables["eval_results"])
+
+    # Click 'View Evaluation Results' to see the Agent's inputs/outputs + quality evaluation displayed in a UI
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ## 2️⃣ Deploy a version of your Agent - either to the Review App or Production
+# MAGIC
+# MAGIC Once you have a version of your Agent that has sufficient quality, you will register the Agent's model from the MLflow Experiment into the Unity Catalog & use Agent Framework's `agents.deploy(...)` command to deploy it. Note that these steps are the same for deploying to pre-production (e.g., the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui)) or production.
+# MAGIC
+# MAGIC By the end of this step, you will have deployed a version of your Agent that you can interact with and share with your business stakeholders for feedback, even if they don't have access to your Databricks workspace:
+# MAGIC
+# MAGIC 1. A production-ready, scalable REST API deployed as a Model Serving endpoint that logs every request/response/MLflow Trace to a Delta Table.
+# MAGIC     - REST API for querying the Agent
+# MAGIC     - REST API for sending user feedback from your UI to the Agent
+# MAGIC 2. Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) connected to these endpoints.
+# MAGIC 3. [Mosaic AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) connected to these endpoints.
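+# MAGIC
+# MAGIC The next cell is an optional sketch of how you could query the Agent's REST API after `agents.deploy(...)` (run in one of the options below) has finished provisioning the serving endpoint. It uses the generic MLflow Deployments client rather than any cookbook helper, and the endpoint name is a placeholder you must replace with the one printed by `agents.deploy(...)`.
+
+# COMMAND ----------
+
+# Optional sketch: query the deployed Agent's Model Serving endpoint.
+from mlflow.deployments import get_deploy_client
+
+deploy_client = get_deploy_client("databricks")
+response = deploy_client.predict(
+    endpoint="<your-agent-serving-endpoint-name>",  # placeholder: use the endpoint name printed by agents.deploy(...)
+    inputs={"messages": [{"role": "user", "content": "What is MLflow?"}]},
+)
+print(response)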
+ +# COMMAND ---------- + +# MAGIC %md +# MAGIC Option 1: Deploy the last agent you logged above + +# COMMAND ---------- + +from databricks import agents + +# Use Unity Catalog as the model registry +mlflow.set_registry_uri("databricks-uc") + +# Register the Agent's model to the Unity Catalog +uc_registered_model_info = mlflow.register_model( + model_uri=logged_agent_info.model_uri, name=agent_storage_config.uc_model_name +) + +# Deploy the model to the review app and a model serving endpoint +agents.deploy(agent_storage_config.uc_model_name, uc_registered_model_info.version) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Option 2: Log the latest copy of the Agent's code/config and deploy it + +# COMMAND ---------- + +from databricks import agents + +# Use Unity Catalog as the model registry +mlflow.set_registry_uri("databricks-uc") + +with mlflow.start_run(): + logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config) + + # Register the Agent's model to the Unity Catalog + uc_registered_model_info = mlflow.register_model( + model_uri=logged_agent_info.model_uri, name=agent_storage_config.uc_model_name + ) + +# Deploy the model to the review app and a model serving endpoint +# agents.deploy(agent_storage_config.uc_model_name, uc_registered_model_info.version) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Load the logged model to test it locally + +# COMMAND ---------- + +mlflow.autogen.autolog(log_traces=False) + +# COMMAND ---------- + +import mlflow + +loaded_model = mlflow.pyfunc.load_model(logged_agent_info.model_uri) + +loaded_model.predict({"messages": [{"role": "user", "content": "A test question?"}]}) \ No newline at end of file diff --git a/autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb b/autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb new file mode 100644 index 0000000..1d13cec --- /dev/null +++ b/autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb @@ -0,0 +1,493 @@ +# Databricks notebook source +# MAGIC %md +# MAGIC ## 👉 START HERE: How to use this notebook +# MAGIC +# MAGIC # Step 3: Build, evaluate, & deploy your Agent +# MAGIC +# MAGIC Use this notebook to iterate on the code and configuration of your Agent. +# MAGIC +# MAGIC By the end of this notebook, you will have 1+ registered versions of your Agent, each coupled with a detailed quality evaluation. +# MAGIC +# MAGIC Optionally, you can deploy a version of your Agent that you can interact with in the [Mosiac AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) and let your business stakeholders who don't have Databricks accounts interact with it & provide feedback in the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui). +# MAGIC +# MAGIC +# MAGIC For each version of your agent, you will have an MLflow run inside your MLflow experiment that contains: +# MAGIC - Your Agent's code & config +# MAGIC - Evaluation metrics for cost, quality, and latency + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC **Important note:** Throughout this notebook, we indicate which cell's code you: +# MAGIC - ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality. +# MAGIC - 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent +# MAGIC +# MAGIC *Cells that don't require customization still need to be run! 
You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.*
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Install Python libraries
+# MAGIC
+# MAGIC You do not need to modify this cell unless you need additional Python packages in your Agent.
+
+# COMMAND ----------
+
+# %pip install -qqqq -U -r requirements.txt
+# # Restart to load the packages into the Python environment
+# dbutils.library.restartPython()
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Connect to Databricks
+# MAGIC
+# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this is running in a Databricks Notebook, these values are already set.
+
+# COMMAND ----------
+
+from mlflow.utils import databricks_utils as du
+import os
+
+if not du.is_in_databricks_notebook():
+    from databricks.connect import DatabricksSession
+
+    spark = DatabricksSession.builder.getOrCreate()
+    os.environ["MLFLOW_TRACKING_URI"] = "databricks"
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment
+# MAGIC
+# MAGIC This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook.
+
+# COMMAND ----------
+
+from cookbook.config.shared.agent_storage_location import AgentStorageConfig
+from cookbook.databricks_utils import get_mlflow_experiment_url
+from cookbook.config import load_serializable_config_from_yaml_file
+import mlflow
+
+# Load the Agent's storage locations
+agent_storage_config: AgentStorageConfig = load_serializable_config_from_yaml_file("./configs/agent_storage_config.yaml")
+
+# Show the Agent's storage locations
+agent_storage_config.pretty_print()
+
+# Set the MLflow experiment
+experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name)
+# If running in a local IDE, set the MLflow experiment name as an environment variable
+os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name
+
+print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}")
+
+# COMMAND ----------
+
+# MAGIC %md
+# MAGIC ### 🚫✏️ Helper method to log the Agent's code & config to MLflow
+# MAGIC
+# MAGIC Before we start, let's define a helper method to log the Agent's code & config to MLflow. We will use this logged model for evaluation, for registering the Agent to Unity Catalog, for deploying to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) (a chat UI for your stakeholders to test this agent) and, later, for deploying the Agent to production.
+ +# COMMAND ---------- + + +import mlflow +from mlflow.types.llm import CHAT_MODEL_INPUT_SCHEMA +from mlflow.models.rag_signatures import StringResponse +from cookbook.agents.utils.signatures import STRING_RESPONSE_WITH_MESSAGES +from mlflow.models.signature import ModelSignature +from cookbook.agents.multi_agent_supervisor import MultiAgentSupervisor, MultiAgentSupervisorConfig +from cookbook.agents.genie_agent import GenieAgent, GenieAgentConfig +from cookbook.agents.function_calling_agent import FunctionCallingAgent +from cookbook.agents.function_calling_agent import FunctionCallingAgentConfig + +# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI +# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run. +# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy! +def log_multi_agent_supervisor_to_mlflow(agent_config: MultiAgentSupervisorConfig): + # Get the agent's code path from the imported Agent class + agent_code_path = f"{os.getcwd()}/{MultiAgentSupervisor.__module__.replace('.', '/')}.py" + + # Get the pip requirements from the requirements.txt file + with open("requirements.txt", "r") as file: + pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark + + logged_agent_info = mlflow.pyfunc.log_model( + artifact_path="agent", + python_model=agent_code_path, + input_example=agent_config.input_example, + model_config=agent_config.model_dump(), + resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc + signature=ModelSignature( + inputs=CHAT_MODEL_INPUT_SCHEMA, + # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature + outputs=StringResponse() + ), + code_paths=[os.path.join(os.getcwd(), "cookbook")], + pip_requirements=pip_requirements, + ) + + return logged_agent_info + +def log_genie_agent_to_mlflow(agent_config: GenieAgentConfig): + # Get the agent's code path from the imported Agent class + agent_code_path = f"{os.getcwd()}/{GenieAgent.__module__.replace('.', '/')}.py" + + # Get the pip requirements from the requirements.txt file + with open("requirements.txt", "r") as file: + pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark + + logged_agent_info = mlflow.pyfunc.log_model( + artifact_path="agent", + python_model=agent_code_path, + input_example=agent_config.input_example, + model_config=agent_config.model_dump(), + resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc + signature=ModelSignature( + inputs=CHAT_MODEL_INPUT_SCHEMA, + # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature + outputs=StringResponse() + ), + code_paths=[os.path.join(os.getcwd(), "cookbook")], + pip_requirements=pip_requirements, + ) + + return logged_agent_info + +# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI +# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run. +# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy! 
+def log_function_calling_agent_to_mlflow(agent_config: FunctionCallingAgentConfig): + # Get the agent's code path from the imported Agent class + agent_code_path = f"{os.getcwd()}/{FunctionCallingAgent.__module__.replace('.', '/')}.py" + + # Get the pip requirements from the requirements.txt file + with open("requirements.txt", "r") as file: + pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark + + logged_agent_info = mlflow.pyfunc.log_model( + artifact_path="agent", + python_model=agent_code_path, + input_example=agent_config.input_example, + model_config=agent_config.model_dump(), + resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc + signature=ModelSignature( + inputs=CHAT_MODEL_INPUT_SCHEMA, + # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature + outputs=StringResponse() + ), + code_paths=[os.path.join(os.getcwd(), "cookbook")], + pip_requirements=pip_requirements, + ) + + return logged_agent_info + +# COMMAND ---------- + +# MAGIC %md +# MAGIC +# MAGIC ## 1️⃣ Iterate on the Agent's code & config to improve quality +# MAGIC +# MAGIC The below cells are used to execute your inner dev loop to improve the Agent's quality. +# MAGIC +# MAGIC We suggest the following process: +# MAGIC 1. Vibe check the Agent for 5 - 10 queries to verify it works +# MAGIC 2. Make any necessary changes to the code/config +# MAGIC 3. Use Agent Evaluation to evaluate the Agent using your evaluation set, which will provide a quality assessment & identify the root causes of any quality issues +# MAGIC 4. Based on that evaluation, make & test changes to the code/config to improve quality +# MAGIC 5. 🔁 Repeat steps 3 and 4 until you are satisified with the Agent's quality +# MAGIC 6. Deploy the Agent to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) for pre-production testing +# MAGIC 7. Use the following notebooks to review that feedback (optionally adding new records to your evaluation set) & identify any further quality issues +# MAGIC 8. 🔁 Repeat steps 3 and 4 to fix any issues identified in step 7 +# MAGIC 9. Deploy the Agent to a production-ready REST API endpoint (using the same cells in this notebook as step 6) +# MAGIC + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Create the agents to be overseen by the multi-agent supervisor + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 1. create the genie agent + +# COMMAND ---------- + + +from cookbook.config.agents.genie_agent import GenieAgentConfig +from cookbook.agents.genie_agent import GENIE_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME, GenieAgent +from cookbook.config import serializable_config_to_yaml_file + + +genie_agent_config = GenieAgentConfig(genie_space_id="01ef92e3b5631f0da85834290964831d") +serializable_config_to_yaml_file(genie_agent_config, "./configs/"+GENIE_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) + + +# COMMAND ---------- + +mlflow.set_registry_uri("databricks-uc") + +with mlflow.start_run(run_name="genie_agent_test_1"): + logged_genie_info = log_genie_agent_to_mlflow(genie_agent_config) + uc_registered_model_info = mlflow.register_model( + model_uri=logged_genie_info.model_uri, name=agent_storage_config.uc_model_name+"_genie_test_1" + ) + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ### 2. 
create the FC agent + +# COMMAND ---------- + +# Import Cookbook Agent configurations, which are Pydantic models +from cookbook.config import serializable_config_to_yaml_file +from cookbook.config.agents.function_calling_agent import ( + FunctionCallingAgentConfig, +) +from cookbook.config.data_pipeline import ( + DataPipelineConfig, +) +from cookbook.config.shared.llm import LLMConfig, LLMParametersConfig +from cookbook.config import load_serializable_config_from_yaml_file +from cookbook.tools.vector_search import ( + VectorSearchRetrieverTool, + VectorSearchSchema, +) +import json +from cookbook.tools.uc_tool import UCTool + + +######################## +# #### 🚫✏️ Load the Vector Index Unity Cataloglocation from the data pipeline configuration +# Usage: +# - If you used `01_data_pipeline` to create your Vector Index, run this cell. +# - If your Vector Index was created elsewhere, comment out this logic and set the UC location in the Retriever config. +######################## + +data_pipeline_config: DataPipelineConfig = load_serializable_config_from_yaml_file( + "./configs/data_pipeline_config.yaml" +) + +######################## +# #### ✅✏️ Retriever tool that connects to the Vector Search index +######################## + +retriever_tool = VectorSearchRetrieverTool( + name="search_product_docs", + description="Use this tool to search for product documentation.", + vector_search_index="ep.cookbook_local_test.product_docs_docs_chunked_index__v1", + vector_search_schema=VectorSearchSchema( + # These columns are the default values used in the `01_data_pipeline` notebook + # If you used a different column names in that notebook OR you are using a pre-built vector index, update the column names here. + chunk_text="content_chunked", # Contains the text of each document chunk + document_uri="doc_uri", # The document URI of the chunk e.g., "/Volumes/catalog/schema/volume/file.pdf" - displayed as the document ID in the Review App + additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM + ), + # Optional parameters, see VectorSearchRetrieverTool.__doc__ for details. The default values are shown below. + # doc_similarity_threshold=0.0, + # vector_search_parameters=VectorSearchParameters( + # num_results=5, + # query_type="ann" + # ), + # Adding columns here will allow the Agent's LLM to dynamically apply filters based on the user's query. + # filterable_columns=[] +) + +######################## +# #### ✅✏️ Add Unity Catalog tools to the Agent +######################## + +translate_sku_tool = UCTool(uc_function_name="ep.cookbook_local_test.translate_sku") + +from tools.sku_translator import translate_sku +# from cookbook.config import serializable_config_to_yaml_file + +# translate_sku("OLD-XXX-1234") + +from cookbook.tools.local_function import LocalFunctionTool +from tools.sku_translator import translate_sku + +# translate_sku_tool = LocalFunctionTool(func=translate_sku, description="Translates a pre-2024 SKU formatted as 'OLD-XXX-YYYY' to the new SKU format 'NEW-YYYY-XXX'.") + +######################## +#### ✅✏️ Agent's LLM configuration +######################## + +system_prompt = """ +## Role +You are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request. + +## Objective +Your goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses. + +## Instructions +1. 
**Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. + +2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query. + +3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: "I'm sorry, I can't help you with that." +""".strip() + +fc_agent_config = FunctionCallingAgentConfig( + llm_config=LLMConfig( + llm_endpoint_name="ep-gpt4o-new", # Model serving endpoint w/ a Chat Completions API + llm_system_prompt_template=system_prompt, # System prompt template + llm_parameters=LLMParametersConfig( + temperature=0.01, max_tokens=1500 + ), # LLM parameters + ), + # Add one or more tools that comply with the CookbookTool interface + tools=[retriever_tool, translate_sku_tool], + # tools=[retriever_tool], +) + +# Print the configuration as a JSON string to see it all together +# print(json.dumps(fc_agent_config.model_dump(), indent=4)) + +######################## +##### Dump the configuration to a YAML +# Optional step, this allows the Agent's code file to be run by itself (e.g., outside of this notebook) using the above configuration. +######################## +# Import the default YAML config file name from the Agent's code file +from cookbook.agents.function_calling_agent import FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME + +# Dump the configuration to a YAML file +serializable_config_to_yaml_file(fc_agent_config, "./configs/"+FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) + +# COMMAND ---------- + +# MAGIC %md +# MAGIC ## Create the multi-agent supervisor + +# COMMAND ---------- + +from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, SupervisedAgentConfig +from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig +from cookbook.agents.multi_agent_supervisor import MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME +from cookbook.config.shared.llm import LLMConfig +from cookbook.config import serializable_config_to_yaml_file +from cookbook.agents.function_calling_agent import FunctionCallingAgent +from cookbook.config.shared.llm import LLMParametersConfig + + +fc_supervised = SupervisedAgentConfig(name="fc_agent", + description="looks up product docs", + endpoint_name="", + agent_config=fc_agent_config, + agent_class=FunctionCallingAgent) + +genie_supervised = SupervisedAgentConfig(name="genie_agent", + description="queries for customer info", + endpoint_name="", + agent_config=genie_agent_config, + agent_class=GenieAgent) + + +multi_agent_config = MultiAgentSupervisorConfig( + llm_endpoint_name="ep-gpt4o-new", + llm_parameters=LLMParametersConfig( + max_tokens= 1500, + temperature= 0.01 + ), + + playground_debug_mode=True, + agent_loading_mode="local", + agents=[fc_supervised, genie_supervised] +) + +serializable_config_to_yaml_file(multi_agent_config, "./configs/"+MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) + + +# COMMAND ---------- + +from cookbook.databricks_utils import get_mlflow_experiment_traces_url +from cookbook.agents.multi_agent_supervisor import MultiAgentSupervisor + +# Load the Agent's code with the above 
configuration +agent = MultiAgentSupervisor(multi_agent_config) + +# Vibe check the Agent for a single query +output = agent.predict(model_input={"messages": [{"role": "user", "content": "How does the blender work?"}]}) +# output = agent.predict(model_input={"messages": [{"role": "user", "content": "Translate the sku `OLD-abs-1234` to the new format"}]}) + +print(f"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}") +print(f"Agent's final response:\n----\n{output['content']}\n----") +print() +print(f"Agent's full message history (useful for debugging):\n----\n{json.dumps(output['messages'], indent=2)}\n----") + + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Design for multi-agent +# MAGIC +# MAGIC requirements +# MAGIC * can test locally with just the agent's pyfunc classes +# MAGIC * when you change any config, it all just reloads +# MAGIC +# MAGIC when you deploy: +# MAGIC * you deploy each supervised agent separately to model serving +# MAGIC * then mutli agent picks these up +# MAGIC * then mutli agent deploys +# MAGIC +# MAGIC * each child agent has [name, description, config, code] +# MAGIC - when deployed, it reads it from the UC +# MAGIC - locally, from the config + +# COMMAND ---------- + +# MAGIC %md +# MAGIC Testing endpoint based + +# COMMAND ---------- + +from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, SupervisedAgentConfig +from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME +# from cookbook.agents.multi_agent_supervisor import MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME +from cookbook.config.shared.llm import LLMConfig +from cookbook.config import serializable_config_to_yaml_file +from cookbook.agents.function_calling_agent import FunctionCallingAgent +from cookbook.config.shared.llm import LLMParametersConfig + + +fc_supervised_ep = SupervisedAgentConfig(name="fc_agent", + description="looks up product docs", + endpoint_name="agents_ep-cookbook_local_test-my_agent_new_test_with_ONLY_retri", + # agent_config=fc_agent_config, + # agent_class=FunctionCallingAgent + ) + +# genie_supervised = SupervisedAgentConfig(name="genie_agent", +# description="queries for customer info", +# endpoint_name="", +# agent_config=genie_agent_config, +# agent_class=GenieAgent) + + +multi_agent_config_with_ep = MultiAgentSupervisorConfig( + llm_endpoint_name="ep-gpt4o-new", + llm_parameters=LLMParametersConfig( + max_tokens= 1500, + temperature= 0.01 + ), + + playground_debug_mode=True, + agent_loading_mode="model_serving", + agents=[fc_supervised_ep] +) + +serializable_config_to_yaml_file(multi_agent_config_with_ep, "./configs/"+MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) + + +# COMMAND ---------- + +from cookbook.config import load_serializable_config_from_yaml_file + +multi_agent_config_with_ep_loaded = load_serializable_config_from_yaml_file("./configs/multi_agent_supervisor_config.yaml") + +print(multi_agent_config_with_ep_loaded) \ No newline at end of file diff --git a/autogen_agent_app_sample_code/README.md b/autogen_agent_app_sample_code/README.md new file mode 100644 index 0000000..ab90eca --- /dev/null +++ b/autogen_agent_app_sample_code/README.md @@ -0,0 +1,160 @@ +# How to use local IDE + +- databricks auth profile DEFAULT is set up +``` +databricks auth profile login +``` +- add a cluster_id in ~/.databrickscfg (if you want to use Spark code) +- add `openai_sdk_agent_app_sample_code/.env` to point to mlflow exp + dbx tracking uri 
(if you want to run any agent code from the terminal and have it logged to mlflow). Make sure this mlflow experiment maps to the one in 02_agent_setup.ipynb. +``` +MLFLOW_TRACKING_URI=databricks +MLFLOW_EXPERIMENT_NAME=/Users/your.name@company.com/my_agent_mlflow_experiment +``` +- install poetry env & activate in your IDE +``` +poetry install +``` + +if you want to use the data pipeline code in spark, you need to build the cookbook wheel and install it in the cluster +- build cookbook wheel +``` +poetry build +``` +- install cookbook wheel in cluster + - Copy the wheel file to a UC Volume or Workspace folder + - Go to the cluster's Libraries page and install the wheel file as a new library + + +# NOTES/what doesn't work: +- Works locally & once deployed: + - Tool calling agent with vector search, UC tool +- Works locally, deployment not tested. + - Genie Agent + - Multi-Agent supervisor w/ "local" mode + - Multi-Agent supervisor w/ "endpoint" mode + +# TODO: +- Refactor the cookbook folder to + - make it easy to add as `code_path` without putting all agent code + data pipeline code into the agent mlflow model + - make the data pipeline competely seperate + - make the tools inherit from a version of serializableConfig that is "serializableTool" - same exact thing just not overloaded. + +- Multi-agent + - test with deployed endpoints + - make deployed endpoint optional if model = local, otherwise, make class/config optional. + +- Create a version of each of these Agents with LangGraph, LlamaIndex, and AutoGen. + +# Docs + +This cookbook contains example Agents built using Python code + the OpenAI SDK to call Databricks Model Serving/External Models. Each Agent is configurable via a Pydantic-based configuration classes and is wrapped in an MLflow PyFunc class for logging and deployment. + +Included are 3 types of Agents: +- Tool Calling Agent +- Genie Agent +- Multi-Agent Supervisor Agent + +## Genie Agent + +The Genie Agent is a simple wrapper around AI/BI Genie Spaces API. It does not use the OpenAI SDK. It is configured using the `GenieAgentConfig` class: +- Required + - `genie_space_id: str` - The ID of the Genie Space +- Optional Variables with Default Values + - `input_example: Any` - Defaults to: + ```python + { + "messages": [ + { + "role": "user", + "content": "What types of data can I query?", + }, + ] + } + ``` + - `encountered_error_user_message: str` - Defaults to: + > "I encountered an error trying to answer your question, please try again." + +## Tool-calling Agent + +The tool-calling agent uses the configured LLM to decide which tool(s) to call based on the user's query. + +The agent is configured using the `FunctionCallingAgentConfig` class: + +- Required: + - `llm_config: LLMConfig` - Configuration for the LLM endpoint + - `tools: List[BaseTool]` - List of tools available to the agent. + +- Optional Variables with Default Values: + - `input_example: Any` - Defaults to: + ```python + { + "messages": [ + { + "role": "user", + "content": "What can you help me with?", + }, + ] + } + ``` + +The `LLMConfig` requires: +- `llm_endpoint_name: str` - Name of the model serving endpoint +- `llm_system_prompt_template: str` - System prompt for the LLM +- `llm_parameters: Dict` - Parameters for the LLM (temperature, max_tokens, etc.) + +The `BaseTool` class is used to define a tool that the agent can call. The cookbook includes several pre-built tools. If you need to create your own tool, we suggest creating a UC Function and calling that function using the `UCTool`. 
+- UC Tool + - Wraps the UC function toolkit. Adds additional code to parse errors from Spark exceptions so that only the Python errors are shown. +- Vector Search Retriever Tool + - A wrapper around a Databricks Vector Search index that the agent can query to retrieve relevant document chunks. + + +## How Pydantic configuration classes work +All configuration classes inherit from `SerializableConfig`, defined in `config/__init__.py`. This class enables a Pydantic BaseModel to be serialized to a YAML file and loaded back from that YAML file. + + +## UC Function Tool + +## Local Function Tool + +## Vector Search Retriever Tool + +Issues: +- The Vector Search index does not store the source table's column name / description metadata, so the tool currently uses the source table's metadata to populate the filterable columns. However, this causes deployment to fail since the deployed model does not have access to the source table, so it is toggled off by `USE_SOURCE_TABLE_FOR_METADATA`. + + +Features: +* User can specify a list of filterable columns; these are presented to the tool-calling LLM as parameters of the tool. +* Validates that all provided columns exist. + + +What do you need to do? + +- make your data pipeline +- create your Genie spaces +- create your tools +- create your agents +- create your multi-agent supervisor + + +Create an unstructured data agent: +- create a data pipeline +- create synthetic data +- create an agent with a retriever tool +- evaluate and iterate +- maybe add some tools diff --git a/autogen_agent_app_sample_code/__init__.py b/autogen_agent_app_sample_code/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/autogen_started.ipynb b/autogen_agent_app_sample_code/autogen_started.ipynb new file mode 100644 index 0000000..f49be86 --- /dev/null +++ b/autogen_agent_app_sample_code/autogen_started.ipynb @@ -0,0 +1,195 @@ +# Databricks notebook source +# MAGIC %pip install -qqqq -U -r requirements.txt + +# COMMAND ---------- + +dbutils.library.restartPython() + +# COMMAND ---------- + +from cookbook.tools.vector_search import ( + VectorSearchRetrieverTool, + VectorSearchSchema, +) +from cookbook.tools.uc_tool import UCTool + +# COMMAND ---------- + +retriever_tool = VectorSearchRetrieverTool( + name="search_product_docs", + description="Use this tool to search for product documentation.", + vector_search_index="dbdemos.dbdemos_rag_chatbot.databricks_documentation_vs_index", + vector_search_schema=VectorSearchSchema( + # These columns are the default values used in the `01_data_pipeline` notebook + # If you used different column names in that notebook OR you are using a pre-built vector index, update the column names here.
+ chunk_text="content", # Contains the text of each document chunk + document_uri="url", # The document URI of the chunk e.g., "/Volumes/catalog/schema/volume/file.pdf" - displayed as the document ID in the Review App + # additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM + ) +) + + +translate_sku_tool = UCTool(uc_function_name="devanshu_pandey.cmhc_demo.vector_index_search_tool") + +tools = [retriever_tool] + +# COMMAND ---------- + +entry_point = dbutils.notebook.entry_point + +host_name = f'https://{entry_point.getDbutils().notebook().getContext().browserHostName().get()}' + +token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() + +# COMMAND ---------- + +base_url = f"{host_name}/serving-endpoints/" +base_url + +# COMMAND ---------- + +def is_termination_message(message): + content = message.get("content", "") + return (content and "TERMINATE" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message) + +# COMMAND ---------- + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.serving import ChatMessage, ChatMessageRole +from typing import List, Optional + + +class DatabricksModelServingClient: + def __init__(self, config, **kwargs): + self.workspace = WorkspaceClient() + self.openai_client = self.workspace.serving_endpoints.get_open_ai_client() + self.endpoint_name = config.get("endpoint_name") + self.llm_config = config.get("llm_config") + + def create(self, input_data): + messages = [] + for message in input_data['messages']: + message.pop("name", None) + messages.append(message) + + response = self.openai_client.chat.completions.create( + model=self.endpoint_name, + messages=messages, + tools=input_data['tools'], + tool_choice="auto", + **self.llm_config + ) + + return response + + def message_retrieval(self, response): + # Process and return messages from the response + return [choice.message for choice in response.choices] + + def cost(self, response): + # Implement cost calculation if applicable + return 0 + + def get_usage(self, response): + usage = response.usage + # Implement usage statistics if available + return {"prompt_tokens": usage.prompt_tokens, "total_tokens": usage.total_tokens, "completion_tokens": usage.completion_tokens} + +# COMMAND ---------- + +config_list = { + "model_client_cls": "DatabricksModelServingClient", + "model": "gpt4o", + "endpoint_name": "casaman-gpt4", # "databricks-meta-llama-3-3-70b-instruct", + "llm_config": {"temperature": 0.5, "max_tokens": 1500} + +} + +# COMMAND ---------- + +import os + +from autogen import ConversableAgent + +def create_agents(system_prompt, chat_history): + + def is_termination_message(message): + content = message.get("content", "") + return (content and "TERMINATE" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message) + + # The user proxy agent is used for interacting with the assistant agent + # and executes tool calls. + user_proxy = ConversableAgent( + name="User", + llm_config=False, + is_termination_msg=is_termination_message, + human_input_mode="NEVER", + ) + + assistant = ConversableAgent( + name="Assistant", + system_message="You are a helpful AI assistant. " + "You can help with simple calculations. 
" + "Return 'TERMINATE' when the task is done.", + llm_config={"config_list": [config_list]}, + chat_messages={user_proxy: chat_history} + ) + + return assistant, user_proxy + +assistant, user_proxy = create_agents('test', []) + +# COMMAND ---------- + +from autogen import register_function + +for tool in tools: + register_function( + tool, + caller=assistant, # The assistant agent can suggest calls to the calculator. + executor=user_proxy, # The user proxy agent can execute the calculator calls. + name=tool.name, + description=tool.description, # A description of the tool. + ) + +# COMMAND ---------- + +translate_sku_tool._toolkit.tools[0].register_function(callers = assistant, + executors = user_proxy ) + +# COMMAND ---------- + +assistant.register_model_client(model_client_cls=DatabricksModelServingClient) + +# COMMAND ---------- + +chat_result = user_proxy.initiate_chat(assistant, message="What is mlflow in databricks?") + +# COMMAND ---------- + +assistant.last_message(user_proxy) + +# COMMAND ---------- + +history = assistant.chat_messages[user_proxy] +history + +# COMMAND ---------- + +assistant, user_proxy = create_agents('test', history) + +# COMMAND ---------- + +assistant.chat_messages[user_proxy] + +# COMMAND ---------- + +chat_result = user_proxy.initiate_chat(assistant, message="This is the second turn of the conversation. Can you summary our actual conversation?", clear_history=False) + +# COMMAND ---------- + +assistant.llm_config["tools"] + +# COMMAND ---------- + +assistant.last_message(user_proxy) \ No newline at end of file diff --git a/autogen_agent_app_sample_code/configs/README.md b/autogen_agent_app_sample_code/configs/README.md new file mode 100644 index 0000000..afa743f --- /dev/null +++ b/autogen_agent_app_sample_code/configs/README.md @@ -0,0 +1 @@ +This folder stores the configurations generated by the cookbook notebooks. 
\ No newline at end of file diff --git a/autogen_agent_app_sample_code/configs/agent_storage_config.yaml b/autogen_agent_app_sample_code/configs/agent_storage_config.yaml new file mode 100644 index 0000000..976730b --- /dev/null +++ b/autogen_agent_app_sample_code/configs/agent_storage_config.yaml @@ -0,0 +1,4 @@ +class_path: cookbook.config.shared.agent_storage_location.AgentStorageConfig +evaluation_set_uc_table: casaman_ssa.demos.my_agent_autogen_eval_set +mlflow_experiment_name: /Users/manffred.calvosanchez@databricks.com/my_agent_autogen_mlflow_experiment +uc_model_name: casaman_ssa.demos.my_agent_autogen diff --git a/autogen_agent_app_sample_code/configs/data_pipeline_config.yaml b/autogen_agent_app_sample_code/configs/data_pipeline_config.yaml new file mode 100644 index 0000000..6ad4b46 --- /dev/null +++ b/autogen_agent_app_sample_code/configs/data_pipeline_config.yaml @@ -0,0 +1,19 @@ +chunking_config: + chunk_overlap_tokens: 256 + chunk_size_tokens: 1024 + class_path: cookbook.config.data_pipeline.recursive_text_splitter.RecursiveTextSplitterChunkingConfig + embedding_model_endpoint: databricks-gte-large-en +class_path: cookbook.config.data_pipeline.DataPipelineConfig +output: + chunked_docs_table: casaman_ssa.demos.test_product_docs_docs_chunked__v2 + class_path: cookbook.config.data_pipeline.data_pipeline_output.DataPipelineOuputConfig + parsed_docs_table: casaman_ssa.demos.test_product_docs_docs__v2 + vector_index: casaman_ssa.demos.test_product_docs_docs_chunked_index__v2 + vector_search_endpoint: dbdemos_vs_endpoint +source: + class_path: cookbook.config.data_pipeline.uc_volume_source.UCVolumeSourceConfig + uc_catalog_name: casaman_ssa + uc_schema_name: demos + uc_volume_name: volume_databricks_documentation + volume_path: /Volumes/casaman_ssa/demos/volume_databricks_documentation + volume_uc_fqn: casaman_ssa.demos.volume_databricks_documentation diff --git a/autogen_agent_app_sample_code/configs/function_calling_agent_config.yaml b/autogen_agent_app_sample_code/configs/function_calling_agent_config.yaml new file mode 100644 index 0000000..b80c0b5 --- /dev/null +++ b/autogen_agent_app_sample_code/configs/function_calling_agent_config.yaml @@ -0,0 +1,60 @@ +class_path: cookbook.config.agents.function_calling_agent.FunctionCallingAgentConfig +input_example: + messages: + - content: What can you help me with? + role: user +llm_config: + llm_endpoint_name: casaman-gpt4 + llm_parameters: + max_tokens: 1500 + temperature: 0.01 + llm_system_prompt_template: "## Role\nYou are a helpful assistant that answers questions\ + \ using a set of tools. If needed, you ask the user follow-up questions to clarify\ + \ their request.\n\n## Objective\nYour goal is to provide accurate, relevant,\ + \ and helpful response based solely on the outputs from these tools. You are concise\ + \ and direct in your responses.\n\n## Instructions\n1. **Understand the Query**:\ + \ Think step by step to analyze the user's question and determine the core need\ + \ or problem. \n\n2. **Assess available tools**: Think step by step to consider\ + \ each available tool and understand their capabilities in the context of the\ + \ user's query.\n\n3. **Select the appropriate tool(s) OR ask follow up questions**:\ + \ Based on your understanding of the query and the tool descriptions, decide which\ + \ tool(s) should be used to generate a response. If you do not have enough information\ + \ to use the available tools to answer the question, ask the user follow up questions\ + \ to refine their request. 
If you do not have a relevant tool for a question\ + \ or the outputs of the tools are not helpful, respond with: \"I'm sorry, I can't\ + \ help you with that.\"" +tools: +- class_path: cookbook.tools.vector_search.VectorSearchRetrieverTool + description: Use this tool to search for product documentation. + doc_similarity_threshold: 0.0 + filterable_columns: [] + name: search_product_docs + retriever_filter_parameter_prompt: optional filters to apply to the search. An array + of objects, each specifying a field name and the filters to apply to that field. + retriever_query_parameter_prompt: query to look up in retriever + vector_search_index: casaman_ssa.demos.test_product_docs_docs_chunked_index__v2 + vector_search_parameters: + num_results: 5 + query_type: ann + vector_search_schema: + additional_metadata_columns: [] + chunk_text: content_chunked + document_uri: doc_uri +- class_path: cookbook.tools.uc_tool.UCTool + error_prompt: 'The tool call generated an Exception, detailed in `error`. Think + step-by-step following these instructions to determine your next step. + + [1] Is the error due to a problem with the input parameters? + + [2] Could it succeed if retried with exactly the same inputs? + + [3] Could it succeed if retried with modified parameters using the input we already + have from the user? + + [4] Could it succeed if retried with modified parameters informed by collecting + additional input from the user? What specific input would we need from the user? + + Based on your thinking, if the error is due to a problem with the input parameters, + either call this tool again in a way that avoids this exception or collect additional + information from the user to modify the inputs to avoid this exception.' + uc_function_name: casaman_ssa.demos.sku_sample_translator diff --git a/autogen_agent_app_sample_code/cookbook/__init__.py b/autogen_agent_app_sample_code/cookbook/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/agents/__init__.py b/autogen_agent_app_sample_code/cookbook/agents/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/agents/function_calling_agent.py b/autogen_agent_app_sample_code/cookbook/agents/function_calling_agent.py new file mode 100644 index 0000000..7c2eb46 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/function_calling_agent.py @@ -0,0 +1,214 @@ +# In this file, we construct a function-calling Agent with a Retriever tool using MLflow + the OpenAI SDK connected to Databricks Model Serving. This Agent is encapsulated in a MLflow PyFunc class called `FunctionCallingAgent()`. 
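+# Illustrative request/response shapes for this PyFunc (example values only, based on the vibe-check query at the bottom of this file):
+#   predict(model_input={"messages": [{"role": "user", "content": "Translate the sku `OLD-abs-1234` to the new format"}]})
+#   returns {"content": "<assistant's final reply>", "messages": [<full message history, including tool calls>]}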
+ +# Add the parent directory to the path so we can import the `cookbook` modules +# import sys +# sys.path.append(os.path.dirname(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))) + + +import os +import json +from typing import Any, Dict, List, Optional, Union +import mlflow +import pandas as pd +from mlflow.models import set_model, ModelConfig +from mlflow.models.rag_signatures import StringResponse, ChatCompletionRequest +from databricks.sdk import WorkspaceClient +from autogen import ConversableAgent +from autogen import register_function + +from cookbook.agents.utils.execute_function import execute_function +from cookbook.agents.utils.chat import ( + get_messages_array, + extract_user_query_string, + extract_chat_history, +) +from cookbook.config.agents.function_calling_agent import ( + FunctionCallingAgentConfig, +) +from cookbook.tools.uc_tool import UCTool +from cookbook.agents.utils.execute_function import execute_function +from cookbook.agents.utils.load_config import load_config +from cookbook.agents.utils.databricks_model_serving_client import DatabricksModelServingClient +import logging + + +FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME = "function_calling_agent_config.yaml" + +class FunctionCallingAgent(mlflow.pyfunc.PythonModel): + """ + Class representing an Agent that does function-calling with tools using Autogen + """ + + def __init__( + self, + agent_config: Optional[Union[FunctionCallingAgentConfig, str]] = None + ): + super().__init__() + # Empty variables that will be initialized after loading the agent config. + self.agent_config = None + self.tools = None + + # load the Agent's configuration. See load_config() for details. + self.agent_config = load_config( + passed_agent_config=agent_config, + default_config_file_name=FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME, + ) + if not self.agent_config: + logging.error( + f"No agent config found. If you are in your local development environment, make sure you either [1] are calling init(agent_config=...) with either an instance of FunctionCallingAgentConfig or the full path to a YAML config file or [2] have a YAML config file saved at {{your_project_root_folder}}/configs/{FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME}." + ) + else: + logging.info("Successfully loaded agent config in __init__.") + self.tools = self.agent_config.tools + + def create_agents(self, chat_history): + + def is_termination_message(message): + content = message.get("content", "") + return (content and "TERMINATE" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message) + + # The user proxy agent is used for interacting with the assistant agent + # and executes tool calls. 
+ user_proxy = ConversableAgent( + name="User", + llm_config=False, + is_termination_msg=is_termination_message, + human_input_mode="NEVER", + ) + + llm_config = self.agent_config.llm_config + + system_prompt = llm_config.llm_system_prompt_template + + config_list = [{ + "model_client_cls": "DatabricksModelServingClient", + "model": llm_config.llm_endpoint_name, + "endpoint_name": llm_config.llm_endpoint_name, + "llm_config": llm_config.llm_parameters.dict()}] + + assistant = ConversableAgent( + name="Assistant", + system_message=system_prompt, + llm_config={"config_list": config_list, "cache_seed": None}, + chat_messages={user_proxy: chat_history} + ) + + for tool in self.tools: + if isinstance(tool, UCTool): + tool._toolkit.tools[0].register_function(callers = assistant, + executors = user_proxy ) + else: + register_function(tool, + caller=assistant, + executor=user_proxy, + name=tool.name, + description=tool.description) + + return assistant, user_proxy + + + @mlflow.trace(name="agent", span_type="AGENT") + def predict( + self, + context: Any = None, + model_input: Union[ChatCompletionRequest, Dict, pd.DataFrame] = None, + params: Any = None, + ) -> StringResponse: + # Check here to allow the Agent class to be initialized without a configuration file, which is required to import the class as a module in other files. + if not self.agent_config: + raise RuntimeError("Agent config not loaded. Cannot call predict()") + + ############################################################################## + # Extract `messages` key from the `model_input` + messages = get_messages_array(model_input) + + ############################################################################## + # Parse `messages` array into the user's query & the chat history + with mlflow.start_span(name="parse_input", span_type="PARSER") as span: + span.set_inputs({"messages": messages}) + # in a multi-agent setting, the last message can be from another assistant, not the user + last_message_content = extract_user_query_string(messages) + last_message_role = messages[-1]["role"] + last_message = {"role": last_message_role, "content": last_message_content} + # Save the history inside the Agent's internal state + chat_history = extract_chat_history(messages) + span.set_outputs( + { + "last_message": last_message, + "chat_history": chat_history + } + ) + + ############################################################################## + # Call the LLM to recursively calls tools and eventually deliver a generation to send back to the user + ( + model_response, + messages_log_with_tool_calls, + ) = self.recursively_call_and_run_tools(last_message=last_message, + chat_history=chat_history) + + return { + "content": model_response['content'], + # messages should be returned back to the Review App (or any other front end app) and stored there so it can be passed back to this stateless agent with the next turns of converastion. 
+ "messages": messages_log_with_tool_calls, + } + + @mlflow.trace(span_type="AGENT") + def recursively_call_and_run_tools(self, + last_message, + chat_history, + last_max_iter=10): + + assistant, user_proxy = self.create_agents(chat_history) + + assistant.register_model_client(model_client_cls=DatabricksModelServingClient) + + model_response = user_proxy.initiate_chat(assistant, + message=last_message['content'], + max_turns=last_max_iter, + clear_history=False) + + return assistant.last_message(user_proxy), assistant.chat_messages[user_proxy] + + +logging.basicConfig(level=logging.INFO) + +# tell MLflow logging where to find the agent's code +set_model(FunctionCallingAgent()) + + +# IMPORTANT: set this to False before logging the model to MLflow +debug = False + +if debug: + # logging.basicConfig(level=logging.INFO) + # print(find_config_folder_location()) + # print(os.path.abspath(os.getcwd())) + # mlflow.tracing.disable() + agent = FunctionCallingAgent() + + vibe_check_query = { + "messages": [ + # {"role": "user", "content": f"what is agent evaluation?"}, + # {"role": "user", "content": f"How does the blender work?"}, + # { + # "role": "user", + # "content": f"find all docs from the section header 'Databricks documentation archive' or 'Work with files on Databricks'", + # }, + { + "role": "user", + "content": "Translate the sku `OLD-abs-1234` to the new format", + } + # { + # "role": "user", + # "content": f"convert sku 'OLD-XXX-1234' to the new format", + # }, + # { + # "role": "user", + # "content": f"what are recent customer issues? what words appeared most frequently?", + # }, + ] + } + + output = agent.predict(model_input=vibe_check_query) + print(output["content"]) diff --git a/autogen_agent_app_sample_code/cookbook/agents/multi_agent_supervisor.py b/autogen_agent_app_sample_code/cookbook/agents/multi_agent_supervisor.py new file mode 100644 index 0000000..37927f2 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/multi_agent_supervisor.py @@ -0,0 +1,616 @@ +import json +import os +from typing import Any, Callable, Dict, List, Optional, Union +from cookbook.config.agents.multi_agent_supervisor import ( + MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME, +) +import mlflow +from dataclasses import asdict, dataclass, field +import pandas as pd +from mlflow.models import set_model, ModelConfig +from mlflow.models.rag_signatures import StringResponse, ChatCompletionRequest, Message +from databricks.sdk import WorkspaceClient +import os +from cookbook.agents.utils.chat import ( + remove_message_keys_with_null_values, + remove_tool_calls_from_messages, +) +from cookbook.agents.utils.load_config import load_config +from cookbook.config.agents.multi_agent_supervisor import ( + MultiAgentSupervisorConfig, + WORKER_PROMPT_TEMPLATE, + ROUTING_FUNCTION_NAME, + CONVERSATION_HISTORY_THINKING_PARAM, + WORKER_CAPABILITIES_THINKING_PARAM, + NEXT_WORKER_OR_FINISH_PARAM, + FINISH_ROUTE_NAME, + SUPERVISOR_ROUTE_NAME, +) +from cookbook.agents.utils.chat import get_messages_array +from cookbook.agents.utils.playground_parser import ( + convert_messages_to_playground_tool_display_strings, +) +import importlib +import logging + +# logging.basicConfig(level=logging.INFO) + +from mlflow.entities import Trace +import mlflow.deployments + + +AGENT_RAW_OUTPUT_KEY = "raw_agent_output" +AGENT_NEW_MESSAGES_KEY = "new_messages" + + +@dataclass +class SupervisorState: + """Tracks essential conversation state""" + + chat_history: List[Dict[str, str]] = field(default_factory=list) + last_agent_called: str = 
"" + number_of_supervisor_loops_completed: int = 0 + num_messages_at_start: int = 0 + # error: Optional[str] = None + + @mlflow.trace(span_type="FUNCTION", name="state.append_new_message_to_history") + def append_new_message_to_history(self, message: Dict[str, str]) -> None: + span = mlflow.get_current_active_span() + if span: # TODO: Hack, when mlflow tracing is disabled, span == None. + span.set_inputs({"message": message}) + with mlflow.start_span( + name="remove_message_keys_with_null_values" + ) as span_inner: + span_inner.set_inputs({"message": message}) + message_with_no_null_values_for_keys = remove_message_keys_with_null_values( + message + ) + span_inner.set_outputs( + { + "message_with_no_null_values_for_keys": message_with_no_null_values_for_keys + } + ) + self.chat_history.append(message_with_no_null_values_for_keys) + span.set_outputs(self.chat_history) + + @mlflow.trace(span_type="FUNCTION", name="state.overwrite_chat_history") + def overwrite_chat_history(self, new_chat_history: List[Dict[str, str]]) -> None: + span = mlflow.get_current_active_span() + if span: # TODO: Hack, when mlflow tracing is disabled, span == None. + span.set_inputs( + { + "new_chat_history": new_chat_history, + "current_chat_history": self.chat_history, + } + ) + messages_with_no_null_values_for_keys = [] + with mlflow.start_span( + name="remove_message_keys_with_null_values" + ) as span_inner: + span_inner.set_inputs({"new_chat_history": new_chat_history}) + for message in new_chat_history: + messages_with_no_null_values_for_keys.append( + remove_message_keys_with_null_values(message) + ) + span_inner.set_outputs( + { + "messages_with_no_null_values_for_keys": messages_with_no_null_values_for_keys + } + ) + self.chat_history = messages_with_no_null_values_for_keys.copy() + span.set_outputs(self.chat_history) + + +class MultiAgentSupervisor(mlflow.pyfunc.PythonModel): + """ + Class representing an Agent that does function-calling with tools using OpenAI SDK + """ + + def __init__( + self, agent_config: Optional[Union[MultiAgentSupervisorConfig, str]] = None + ): + logging.info("Initializing MultiAgentSupervisor") + + # load the Agent's configuration. See load_config() for details. + self.agent_config = load_config( + passed_agent_config=agent_config, + default_config_file_name=MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME, + ) + if not self.agent_config: + raise ValueError( + f"No agent config found. If you are in your local development environment, make sure you either [1] are calling init(agent_config=...) with either an instance of MultiAgentSupervisorConfig or the full path to a YAML config file or [2] have a YAML config file saved at {{your_project_root_folder}}/configs/{MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME}." 
+ ) + else: + logging.info("Successfully loaded agent config in __init__.") + + # Initialize clients + self._initialize_model_serving_clients() + + # Set up agents and routing + self._initialize_supervised_agents() + + # Set up prompts and tools + self._initialize_supervisor_prompts_and_tools() + + # Initialize state + self.state = None # Will be initialized per conversation + logging.info("Initialized MultiAgentSupervisor") + + def _initialize_model_serving_clients(self): + """Initialize API clients for model serving""" + w = WorkspaceClient() + self.model_serving_client = w.serving_endpoints.get_open_ai_client() + + # used for calling the child agent's deployments + self.mlflow_serving_client = mlflow.deployments.get_deploy_client("databricks") + logging.info("Initialized model serving clients") + + def _initialize_supervised_agents(self): + """Initialize the agent registry and capabilities""" + self.agents = {} + + # Add configured worker agents + if self.agent_config.agent_loading_mode == "model_serving": + # using the model serving endpoints of the agents + for agent in self.agent_config.agents: + self.agents[agent.name] = { + "agent_description": agent.description, + "endpoint_name": agent.endpoint_name, + } + elif self.agent_config.agent_loading_mode == "local": + # using the local agent classes + for agent in self.agent_config.agents: + # load the agent class + module_name, class_name = agent.agent_class_path.rsplit(".", 1) + + module = importlib.import_module(module_name) + # Load the Agent class, which will be a PyFunc + agent_class_obj = getattr(module, class_name) + self.agents[agent.name] = { + "agent_description": agent.description, + "agent_pyfunc_instance": agent_class_obj( + agent_config=agent.agent_config + ), # instantiate the PyFunc + } + logging.info(f"Loaded agent: {agent.name}") + else: + raise ValueError( + f"Invalid agent loading mode: {self.agent_config.agent_loading_mode}" + ) + + def _initialize_supervisor_prompts_and_tools(self): + """Initialize prompts and function calling tools""" + # Create agents string for system prompt + agents_info = [ + WORKER_PROMPT_TEMPLATE.format( + worker_name=key, worker_description=value["agent_description"] + ) + for key, value in self.agents.items() + ] + workers_names_and_descriptions = "".join(agents_info) + + # Update system prompt with template variables + self.supervisor_system_prompt = ( + self.agent_config.supervisor_system_prompt.format( + ROUTING_FUNCTION_NAME=ROUTING_FUNCTION_NAME, + CONVERSATION_HISTORY_THINKING_PARAM=CONVERSATION_HISTORY_THINKING_PARAM, + WORKER_CAPABILITIES_THINKING_PARAM=WORKER_CAPABILITIES_THINKING_PARAM, + NEXT_WORKER_OR_FINISH_PARAM=NEXT_WORKER_OR_FINISH_PARAM, + FINISH_ROUTE_NAME=FINISH_ROUTE_NAME, + workers_names_and_descriptions=workers_names_and_descriptions, + ) + ) + + self.supervisor_user_prompt = self.agent_config.supervisor_user_prompt.format( + worker_names_with_finish=list(self.agents.keys()) + [FINISH_ROUTE_NAME], + NEXT_WORKER_OR_FINISH_PARAM=NEXT_WORKER_OR_FINISH_PARAM, + ROUTING_FUNCTION_NAME=ROUTING_FUNCTION_NAME, + FINISH_ROUTE_NAME=FINISH_ROUTE_NAME, + ) + + # Initialize routing function schema + self.route_function = { + "type": "function", + "function": { + "name": ROUTING_FUNCTION_NAME, + "description": "Route the conversation by providing your thinking and next worker selection.", + "parameters": { + "properties": { + CONVERSATION_HISTORY_THINKING_PARAM: {"type": "string"}, + WORKER_CAPABILITIES_THINKING_PARAM: {"type": "string"}, + NEXT_WORKER_OR_FINISH_PARAM: { + "enum": 
list(self.agents.keys()), + "type": "string", + }, + }, + "required": [ + CONVERSATION_HISTORY_THINKING_PARAM, + WORKER_CAPABILITIES_THINKING_PARAM, + NEXT_WORKER_OR_FINISH_PARAM, + ], + "type": "object", + }, + }, + } + self.tool_json_schemas = [self.route_function] + + @mlflow.trace(span_type="AGENT") + def _get_supervisor_routing_decision(self, messages: List[Dict[str, str]]) -> str: + + supervisor_messages = ( + [{"role": "system", "content": self.supervisor_system_prompt}] + + messages + + [ + { + "role": "user", + "content": self.supervisor_user_prompt, + } + ] + ) + + response = self.chat_completion(messages=supervisor_messages, tools=True) + supervisor_llm_response = response.choices[0].message + supervisor_tool_calls = supervisor_llm_response.tool_calls + + if supervisor_tool_calls: + for tool_call in supervisor_tool_calls: + function = tool_call.function + args = json.loads(function.arguments) + if function.name == ROUTING_FUNCTION_NAME: + return args # includes all keys from the function call + else: + logging.error( + f"Supervisor LLM failed to call the {ROUTING_FUNCTION_NAME}(...) function to determine the next step, so we will default to finishing. It tried to call `{function.name}` with args `{function.arguments}`." + ) + return FINISH_ROUTE_NAME + else: + logging.error( + f"Supervisor LLM failed to choose a tool at all, so we will default to finishing. It said `{supervisor_llm_response}`." + ) + return FINISH_ROUTE_NAME + + @mlflow.trace() + def _call_supervised_agent( + self, agent_name: str, input_messages: List[Dict[str, str]] + ) -> Dict[str, Any]: + """ + Calls a supervised agent and returns ONLY the new [messages] produced by that agent. + """ + span = mlflow.get_current_active_span() + if span: # TODO: Hack, when mlflow tracing is disabled, span == None. 
+ span.set_attribute( + "self.agent_config.agent_loading_mode", + self.agent_config.agent_loading_mode, + ) + raw_agent_output = {} + if self.agent_config.agent_loading_mode == "model_serving": + endpoint_name = self.agents.get(agent_name).get("endpoint_name") + if endpoint_name: + # this request will grab the mlflow trace from the endpoint + request = { + "databricks_options": {"return_trace": True}, + "messages": input_messages.copy(), + } + completion = self.mlflow_serving_client.predict( + endpoint=endpoint_name, inputs=request + ) + + logging.info(f"Called agent: {agent_name}") + logging.info(f"Got response agent: {completion}") + + # Add the trace from model serving API call to the active trace + if trace := completion.pop("databricks_output", {}).get("trace"): + trace = Trace.from_dict(trace) + mlflow.add_trace(trace) + + raw_agent_output = completion + else: + raise ValueError(f"Invalid agent selected: {agent_name}") + elif self.agent_config.agent_loading_mode == "local": + agent_pyfunc_instance = self.agents.get(agent_name).get( + "agent_pyfunc_instance" + ) + if agent_pyfunc_instance: + request = { + # "databricks_options": {"return_trace": True}, + "messages": input_messages.copy(), + } + raw_agent_output = agent_pyfunc_instance.predict(model_input=request) + else: + raise ValueError(f"Invalid agent selected: {agent_name}") + else: + raise ValueError( + f"Invalid agent loading mode: {self.agent_config.agent_loading_mode}" + ) + + # return only the net new messages produced by the agent + agent_output_messages = raw_agent_output.get("messages", []) + num_messages_previously = len(input_messages) + num_messages_after_agent = len(agent_output_messages) + if ( + num_messages_after_agent == 0 + or num_messages_after_agent == num_messages_previously + ): + raise Exception( + f"Agent {agent_name} either returned no messages at all or returned the same number of messages it received, indicating it did not produce any new messages." + ) + + else: + # Add the Agent's name to its messages + new_messages = agent_output_messages[num_messages_previously:].copy() + for new_message in new_messages: + new_message["name"] = agent_name + return { + # agent's raw output + AGENT_RAW_OUTPUT_KEY: raw_agent_output, + # new messages produced by the agent + AGENT_NEW_MESSAGES_KEY: new_messages, + } + + @mlflow.trace(name="agent", span_type="AGENT") + def predict( + self, + context: Any = None, + model_input: Union[ChatCompletionRequest, Dict, pd.DataFrame] = None, + params: Any = None, + ) -> StringResponse: + # Check here to allow the Agent class to be initialized without a configuration file, which is required to import the class as a module in other files. + if not self.agent_config: + raise RuntimeError("Agent config not loaded. 
Cannot call predict()") + # try: + # Initialize conversation state + messages = get_messages_array(model_input) + self.state = SupervisorState() + self.state.overwrite_chat_history(messages) + self.state.num_messages_at_start = len(messages) + + # Run the supervisor loop up to self.agent_config.max_workers_called times + while ( + self.state.number_of_supervisor_loops_completed + < self.agent_config.max_supervisor_loops + ): + with mlflow.start_span(name="supervisor_loop_iteration") as span: + self.state.number_of_supervisor_loops_completed += 1 + + chat_history_without_tool_calls = remove_tool_calls_from_messages( + self.state.chat_history + ) + routing_function_output = self._get_supervisor_routing_decision( + chat_history_without_tool_calls + ) + + next_agent = routing_function_output.get(NEXT_WORKER_OR_FINISH_PARAM) + span.set_inputs( + { + f"supervisor.{NEXT_WORKER_OR_FINISH_PARAM}": next_agent, + f"supervisor.{CONVERSATION_HISTORY_THINKING_PARAM}": routing_function_output.get( + CONVERSATION_HISTORY_THINKING_PARAM + ), + f"supervisor.{WORKER_CAPABILITIES_THINKING_PARAM}": routing_function_output.get( + WORKER_CAPABILITIES_THINKING_PARAM + ), + "state.number_of_workers_called": self.state.number_of_supervisor_loops_completed, + "state.chat_history": self.state.chat_history, + "chat_history_without_tool_calls": chat_history_without_tool_calls, + } + ) + + if next_agent is None: + logging.error( + f"Supervisor returned no next agent, so we will default to finishing." + ) + span.set_outputs( + { + "post_processed_decision": FINISH_ROUTE_NAME, + "post_processing_reason": "Supervisor returned no next agent, so we will default to finishing.", + "updated_chat_history": self.state.chat_history, + } + ) + break + if next_agent == FINISH_ROUTE_NAME: + logging.info( + f"Supervisor called {FINISH_ROUTE_NAME} after {self.state.number_of_supervisor_loops_completed} workers being called." + ) + span.set_outputs( + { + "post_processed_decision": FINISH_ROUTE_NAME, + "post_processing_reason": "Supervisor selected it.", + "updated_chat_history": self.state.chat_history, + } + ) + break # finish by exiting the while loop + # prevent the supervisor from calling an agent multiple times in a row + elif next_agent != self.state.last_agent_called: + # Call worker agent and update history + try: + agent_output = self._call_supervised_agent( + next_agent, chat_history_without_tool_calls + ) + agent_new_messages = agent_output[AGENT_NEW_MESSAGES_KEY] + agent_raw_output = agent_output[AGENT_RAW_OUTPUT_KEY] + + self.state.overwrite_chat_history( + self.state.chat_history + agent_new_messages + ) + self.state.last_agent_called = next_agent + span.set_outputs( + { + "post_processed_decision": next_agent, + "post_processing_reason": "Supervisor selected it.", + "updated_chat_history": self.state.chat_history, + f"called_agent.{AGENT_NEW_MESSAGES_KEY}": agent_new_messages, + f"called_agent.{AGENT_RAW_OUTPUT_KEY}": agent_raw_output, + } + ) + + except ValueError as e: + logging.error( + f"Error calling agent {next_agent}: {e}. We will default to finishing." + ) + span.set_outputs( + { + "post_processed_decision": FINISH_ROUTE_NAME, + "post_processing_reason": "Supervisor selected an invalid agent, so defaulting to finishing.", + "updated_chat_history": self.state.chat_history, + } + ) + break # finish by exiting the while loop + else: + logging.warning( + f"Supervisor called the same agent {next_agent} twice in a row. We will default to finishing." 
+ ) + span.set_outputs( + { + "post_processed_decision": FINISH_ROUTE_NAME, + "post_processing_reason": f"Supervisor selected {next_agent} twice in a row, so business logic decided to finish instead.", + "updated_chat_history": self.state.chat_history, + } + ) + break # finish by exiting the while loop + + # if the last message is not from the assistant, we need to add a fake assistant message + # TODO: add the name of the supervisor agent here + if self.state.chat_history[-1]["role"] != "assistant": + logging.warning( + "No assistant ended up replying, so we'll add an error response" + ) + with mlflow.start_span(name="add_error_response_to_history") as span: + span.set_inputs( + { + "state.chat_history": self.state.chat_history, + } + ) + self.state.append_new_message_to_history( + { + "role": "assistant", + "content": self.agent_config.supervisor_error_response, + # "name": "supervisor", + } + ) + span.set_outputs( + { + "updated_chat_history": self.state.chat_history, + } + ) + + # Return the resulting conversation back to the user + with mlflow.start_span(name="return_conversation_to_user") as span: + span.set_inputs( + { + "state.chat_history": self.state.chat_history, + "agent_config.playground_debug_mode": self.agent_config.playground_debug_mode, + } + ) + if self.agent_config.playground_debug_mode is True: + return_value = { + "response": ( + self.state.chat_history[-1]["content"] + if self.state.chat_history + else "" + ), + "messages": self.state.chat_history, + # only parse the new messages we added into playground format + "content": convert_messages_to_playground_tool_display_strings( + self.state.chat_history[self.state.num_messages_at_start :] + ), + } + span.set_outputs(return_value) + return return_value + else: + return_value = { + "content": ( + self.state.chat_history[-1]["content"] + if self.state.chat_history + else "" + ), + "messages": self.state.chat_history, + } + span.set_outputs(return_value) + return return_value + + def chat_completion(self, messages: List[Dict[str, str]], tools: bool = False): + endpoint_name = self.agent_config.llm_endpoint_name + llm_options = self.agent_config.llm_parameters.model_dump() + + # # Trace the call to Model Serving - openai versio + traced_create = mlflow.trace( + self.model_serving_client.chat.completions.create, + name="chat_completions_api", + span_type="CHAT_MODEL", + ) + + # Openai - start + if tools: + return traced_create( + model=endpoint_name, + messages=messages, + tools=self.tool_json_schemas, + parallel_tool_calls=False, + **llm_options, + ) + else: + return traced_create(model=endpoint_name, messages=messages, **llm_options) + # Openai - end + + +# tell MLflow logging where to find the agent's code +set_model(MultiAgentSupervisor()) + + +# IMPORTANT: set this to False before logging the model to MLflow +debug = False + +if debug: + + # agent = MultiAgentSupervisor(agent_config=MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) + agent = MultiAgentSupervisor() + + vibe_check_query = { + "messages": [ + # {"role": "user", "content": f"how does the CoolTech Elite 5500 work?"}, + {"role": "user", "content": f"calculate the value of 2+2?"}, + # { + # "role": "user", + # "content": f"How does account age affect the likelihood of churn?", + # }, + ] + } + + output = agent.predict(model_input=vibe_check_query) + print(output["content"]) + # print(output) + + # input_2 = output["messages"].copy() + # input_2.append( + # { + # "role": "user", + # "content": f"who is the user most likely to do this?", + # # "content": f"how do i turn 
it on?", + # }, + # ) + + # output_2 = agent.predict(model_input={"messages": input_2}) + # print(output_2["content"]) + +# # COMMAND ---------- + +# if debug: +# agent = MultiAgentSupervisor(agent_config="supervisor_config.yml") +# vibe_check_query = { +# "messages": [ +# # {"role": "user", "content": f"What is agent evaluation?"}, +# # {"role": "user", "content": f"What users have churned?"}, +# { +# "role": "user", +# "content": f"What is the capacity of the BrewMaster Elite 3000 coffee maker?", +# }, +# # {"role": "user", "content": f"calculate the value of 2+2?"}, +# # { +# # "role": "user", +# # "content": f"did user 8e753fa6-2464-4354-887c-a25ace971a7e experience any issues?", +# # }, +# ] +# } + +# output = agent.predict(model_input=vibe_check_query) +# # print(output) diff --git a/autogen_agent_app_sample_code/cookbook/agents/utils/__init__.py b/autogen_agent_app_sample_code/cookbook/agents/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/agents/utils/chat.py b/autogen_agent_app_sample_code/cookbook/agents/utils/chat.py new file mode 100644 index 0000000..a817c02 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/utils/chat.py @@ -0,0 +1,145 @@ +import mlflow +from typing import Dict, List, Union +from dataclasses import asdict +import pandas as pd +from mlflow.models.rag_signatures import ChatCompletionRequest, Message + + +@mlflow.trace(span_type="PARSER") +def get_messages_array( + model_input: Union[ChatCompletionRequest, Dict, pd.DataFrame] +) -> List[Dict[str, str]]: + if type(model_input) == ChatCompletionRequest: + return model_input.messages + elif type(model_input) == dict: + return model_input.get("messages") + elif type(model_input) == pd.DataFrame: + return model_input.iloc[0].to_dict().get("messages") + + +@mlflow.trace(span_type="PARSER") +def extract_user_query_string(chat_messages_array: List[Dict[str, str]]) -> str: + """ + Extracts user query string from the chat messages array. + + Args: + chat_messages_array: Array of chat messages. + + Returns: + User query string. + """ + + if isinstance(chat_messages_array, pd.Series): + chat_messages_array = chat_messages_array.tolist() + + if isinstance(chat_messages_array[-1], dict): + return chat_messages_array[-1]["content"] + elif isinstance(chat_messages_array[-1], Message): + return chat_messages_array[-1].content + else: + return chat_messages_array[-1] + + +@mlflow.trace(span_type="PARSER") +def extract_chat_history( + chat_messages_array: List[Dict[str, str]] +) -> List[Dict[str, str]]: + """ + Extracts the chat history from the chat messages array. + + Args: + chat_messages_array: Array of chat messages. + + Returns: + The chat history. + """ + # Convert DataFrame to dict + if isinstance(chat_messages_array, pd.Series): + chat_messages_array = chat_messages_array.tolist() + + # Dictionary, return as is + if isinstance(chat_messages_array[0], dict): + return chat_messages_array[:-1] # return all messages except the last one + # MLflow Message, convert to Dictionary + elif isinstance(chat_messages_array[0], Message): + new_array = [] + for message in chat_messages_array[:-1]: + new_array.append(asdict(message)) + return new_array + else: + raise ValueError( + "chat_messages_array is not an Array of Dictionary, Pandas DataFrame, or array of MLflow Message." 
+ ) + + +@mlflow.trace(span_type="PARSER") +def convert_messages_to_open_ai_format( + chat_messages_array: List[Dict[str, str]] +) -> List[Dict[str, str]]: + """ + Extracts the chat history from the chat messages array. + + Args: + chat_messages_array: Array of chat messages. + + Returns: + The chat history. + """ + # Convert DataFrame to dict + if isinstance(chat_messages_array, pd.Series): + chat_messages_array = chat_messages_array.tolist() + + # Dictionary, return as is + if isinstance(chat_messages_array[0], dict): + return chat_messages_array # return all messages except the last one + # MLflow Message, convert to Dictionary + elif isinstance(chat_messages_array[0], Message): + new_array = [] + for message in chat_messages_array: + new_array.append(asdict(message)) + return new_array + else: + raise ValueError( + "chat_messages_array is not an Array of Dictionary, Pandas DataFrame, or array of MLflow Message." + ) + + +@mlflow.trace(span_type="PARSER") +def concat_messages_array_to_string(messages): + concatenated_message = "\n".join( + [ + ( + f"{message.get('role', message.get('name', 'unknown'))}: {message.get('content', '')}" + if message.get("role") in ("assistant", "user") + else "" + ) + for message in messages + ] + ) + return concatenated_message + + +def remove_message_keys_with_null_values(message: Dict[str, str]) -> Dict[str, str]: + """ + Remove any keys with None/null values from the message. + Having a null value for a key breaks DBX model serving input validation even if that key is marked as optional in the schema, so we remove them. + Example: refusal key is set as None by OpenAI + """ + return {k: v for k, v in message.items() if v is not None} + + +@mlflow.trace(span_type="PARSER") +def remove_tool_calls_from_messages( + messages: List[Dict[str, str]] +) -> List[Dict[str, str]]: + modified_messages = messages.copy() + return [ + msg + for msg in modified_messages + if not ( + msg.get("role") == "tool" # Remove tool messages + or ( + msg.get("role") == "assistant" and "tool_calls" in msg + ) # Remove assistant messages with tool_calls + ) + ] diff --git a/autogen_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py b/autogen_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py new file mode 100644 index 0000000..0214930 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py @@ -0,0 +1,40 @@ +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.serving import ChatMessage, ChatMessageRole +from typing import List, Optional + + +class DatabricksModelServingClient: + def __init__(self, config, **kwargs): + self.workspace = WorkspaceClient() + self.openai_client = self.workspace.serving_endpoints.get_open_ai_client() + self.endpoint_name = config.get("endpoint_name") + self.llm_config = config.get("llm_config") + + def create(self, input_data): + messages = [] + for message in input_data['messages']: + message.pop("name", None) + messages.append(message) + + response = self.openai_client.chat.completions.create( + model=self.endpoint_name, + messages=messages, + tools=input_data['tools'], + tool_choice="auto", + **self.llm_config + ) + + return response + + def message_retrieval(self, response): + # Process and return messages from the response + return [choice.message for choice in response.choices] + + def cost(self, response): + # Implement cost calculation if applicable + return 0 + + def get_usage(self, response): + usage = response.usage + # 
Implement usage statistics if available + return {"prompt_tokens": usage.prompt_tokens, "total_tokens": usage.total_tokens, "completion_tokens": usage.completion_tokens} \ No newline at end of file diff --git a/autogen_agent_app_sample_code/cookbook/agents/utils/execute_function.py b/autogen_agent_app_sample_code/cookbook/agents/utils/execute_function.py new file mode 100644 index 0000000..1d0a7df --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/utils/execute_function.py @@ -0,0 +1,8 @@ +import mlflow +import json + + +@mlflow.trace(span_type="FUNCTION") +def execute_function(tool, args): + result = tool(**args) + return json.dumps(result) diff --git a/autogen_agent_app_sample_code/cookbook/agents/utils/load_config.py b/autogen_agent_app_sample_code/cookbook/agents/utils/load_config.py new file mode 100644 index 0000000..9a797d0 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/utils/load_config.py @@ -0,0 +1,138 @@ +import logging +from typing import List +from cookbook.config import SerializableConfig +import yaml +import mlflow +from mlflow.models import ModelConfig +from cookbook.config import ( + load_serializable_config_from_yaml, +) +import os + + +def load_first_yaml_file(config_paths: List[str]) -> str: + for path in config_paths: + if os.path.exists(path): + logging.info(f"Found YAML config file at {path}") + with open(path, "r") as handle: + return handle.read() + raise ValueError( + f"No config file found at any of the following paths: {config_paths}. " + f"Please ensure a config file exists at one of those paths." + ) + + +def load_config_from_mlflow_model_config() -> SerializableConfig: + try: + model_config_as_yaml = yaml.dump(mlflow.models.ModelConfig()._read_config()) + loaded_config = load_serializable_config_from_yaml(model_config_as_yaml) + logging.info(f"Loaded config from mlflow.models.ModelConfig(): {loaded_config}") + return loaded_config + except Exception as e: + logging.info(f"Could not load config from mlflow.models.ModelConfig(): {e}") + return None + + +def try_to_load_config_file(agent_config_file_or_path: str) -> SerializableConfig: + """ + Try to load configuration from a local YAML file. + """ + + # otherwise, we try to look for the YAML file + # this logic accounts for the fact that the agent can be called from any working directory, so we have to search for the config folder to find the YAML. + config_paths = [] + config_paths.append( + agent_config_file_or_path + ) # will try from the passed location first. + + # Then try a from a few common locations - these are set based on the common working directory locations for a notebook/shell. + config_paths.extend( + [ + "./configs/" + agent_config_file_or_path, + "../configs/" + agent_config_file_or_path, + "../../configs/" + agent_config_file_or_path, + "../openai_sdk_agent_app_sample_code/configs/" + agent_config_file_or_path, + "./openai_sdk_agent_app_sample_code/configs/" + agent_config_file_or_path, + ] + ) + + logging.info( + f"Trying to load YAML file {agent_config_file_or_path} from paths: {config_paths}" + ) + try: + config_file = load_first_yaml_file(config_paths) + return load_serializable_config_from_yaml(config_file) + except Exception as e: + logging.info( + f"Exception loading YAML file {agent_config_file_or_path} at {config_paths}: {e}" + ) + raise ValueError( + f"Could not load the provided YAML file {agent_config_file_or_path}." 
+ ) + + +def load_config( + passed_agent_config: SerializableConfig | str | None = None, + default_config_file_name: str = None, +) -> SerializableConfig: + """ + Load configuration from various sources in order of precedence: + # load the Agent's configuration. Priority order: + 1. MLflow Model config + 2. passed_agent_config + 3. default_config_file_name + + Returns: + SerializableModel: Loaded configuration object + """ + + # 1. Try to use MLflow ModelConfig + try: + logging.info("Trying to load config from mlflow.models.ModelConfig()") + model_config_as_yaml = yaml.dump(mlflow.models.ModelConfig()._read_config()) + loaded_config = load_serializable_config_from_yaml(model_config_as_yaml) + logging.info(f"Loaded config from mlflow.models.ModelConfig(): {loaded_config}") + return loaded_config + except FileNotFoundError as e: + logging.info(f"Could not load config from mlflow.models.ModelConfig(): {e}") + + # 2a. passed_agent_config is an instantiated config class, use that + if isinstance(passed_agent_config, ModelConfig): + logging.info( + "passed_agent_config` is an instantiated config class, using that." + ) + return passed_agent_config + + # 2b. passed_agent_config is a YAML file name or file path, try to load from that YAML file + # try_to_load_config_file logic accounts for the fact that the agent can be called from any working directory, so we will search for the config folder to find the YAML. + if isinstance(passed_agent_config, str): + print("ENTRO AQUI") + logging.info( + f"`passed_agent_config` is a string, trying to load from YAML: {passed_agent_config}" + ) + try: + loaded_config = try_to_load_config_file(passed_agent_config) + logging.info( + f"Loaded config from YAML file {passed_agent_config}: {loaded_config}" + ) + return loaded_config + except ValueError as e: + logging.info(f"{passed_agent_config} was not found.") + + # 3. Try to load from default config file + if default_config_file_name: + logging.info(f"Trying to load from YAML: {default_config_file_name}") + try: + loaded_config = try_to_load_config_file(default_config_file_name) + logging.info( + f"Loaded config from YAML file {default_config_file_name}: {loaded_config}" + ) + return loaded_config + except ValueError as e: + logging.info(f"{default_config_file_name} was not found.") + + # If no config is found so far, return None + logging.error( + "load_config could not find a config file. Returning None. Refer to your Agent's error message for next steps." + ) + return None diff --git a/autogen_agent_app_sample_code/cookbook/agents/utils/playground_parser.py b/autogen_agent_app_sample_code/cookbook/agents/utils/playground_parser.py new file mode 100644 index 0000000..20800cc --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/utils/playground_parser.py @@ -0,0 +1,98 @@ +import mlflow +from typing import List, Dict +import json + +## +# Utility functions for formatting OpenAI tool calls and responses for display in Databricks +# playground and review applications. These functions convert the raw message format into +# a more readable, XML-tagged format suitable for UI rendering. +## + + +@mlflow.trace(span_type="PARSER") +def convert_messages_to_playground_tool_display_strings( + messages: List[Dict[str, str]] +) -> str: + """Format a list of OpenAI chat messages for display in Databricks playground/review UI. + + Processes a sequence of OpenAI chat messages, with special handling for tool calls + and their responses. 
Tool-related content is wrapped in XML-like tags for proper + UI rendering and readability. + + Args: + messages (List[Dict[str, str]]): List of OpenAI message dictionaries containing role + (user/assistant/tool), content, and optional tool_calls from the chat completion API. + + Returns: + str: UI-friendly string with tool calls wrapped in tags and + tool responses wrapped in tags. + """ + output = "" + for msg in messages: # ignore first user input + if msg["role"] == "assistant" and msg.get("tool_calls"): # tool call + for tool_call in msg["tool_calls"]: + output += stringify_tool_call(tool_call) + # output += f"{json.dumps(msg, indent=2)}" + elif msg["role"] == "tool": # tool response + output += stringify_tool_result(msg) + # output += f"{json.dumps(msg, indent=2)}" + else: + output += msg["content"] if msg["content"] != None else "" + return output + + +@mlflow.trace(span_type="PARSER") +def stringify_tool_call(tool_call) -> str: + """Format an OpenAI tool call for display in Databricks playground/review UI. + + Extracts relevant information from an OpenAI tool call and formats it into a + UI-friendly string wrapped in XML-like tags for proper rendering. + + Args: + tool_call (dict): OpenAI tool call dictionary containing function details + (name, arguments) and call ID from the chat completion API. + + Returns: + str: UI-friendly string wrapped in tags, containing the + tool's name, ID, and arguments in a structured format. + """ + try: + function = tool_call["function"] + args_dict = json.loads(function["arguments"]) + request = { + "id": tool_call["id"], + "name": function["name"], + "arguments": json.dumps(args_dict), + } + + return f"{json.dumps(request)}" + + except Exception as e: + print("Failed to stringify tool call: ", e) + return str(tool_call) + + +@mlflow.trace(span_type="PARSER") +def stringify_tool_result(tool_msg) -> str: + """Format an OpenAI tool response for display in Databricks playground/review UI. + + Processes a tool's response message and formats it into a UI-friendly string + wrapped in XML-like tags for proper rendering. + + Args: + tool_msg (dict): OpenAI tool response dictionary containing the tool_call_id + and response content from the chat completion API. + + Returns: + str: UI-friendly string wrapped in tags, containing the + tool's response ID and content. + """ + try: + + result = json.dumps( + {"id": tool_msg["tool_call_id"], "content": tool_msg["content"]} + ) + return f"{result}" + except Exception as e: + print("Failed to stringify tool result:", e) + return str(tool_msg) diff --git a/autogen_agent_app_sample_code/cookbook/agents/utils/signatures.py b/autogen_agent_app_sample_code/cookbook/agents/utils/signatures.py new file mode 100644 index 0000000..4c0f5e7 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/agents/utils/signatures.py @@ -0,0 +1,49 @@ +from mlflow.types.schema import Array, ColSpec, DataType, Map, Object, Property, Schema + +# This is a custom version of the StringResponse class from Databricks Agents +# that includes the `messages` field. 
+# StringResponse: from mlflow.models.rag_signatures import StringResponse + +STRING_RESPONSE_WITH_MESSAGES = Schema( + [ + ColSpec(name="content", type=DataType.string), + ColSpec( + name="messages", + type=Array( + Object( + [ + Property("role", DataType.string), + Property("content", DataType.string, False), + Property("name", DataType.string, False), + Property("refusal", DataType.string, False), + Property( + "tool_calls", + Array( + Object( + [ + Property("id", DataType.string), + Property( + "function", + Object( + [ + Property("name", DataType.string), + Property( + "arguments", DataType.string + ), + ] + ), + ), + Property("type", DataType.string), + ] + ) + ), + False, + ), + Property("tool_call_id", DataType.string, False), + ] + ), + ), + required=False, + ), + ] +) diff --git a/autogen_agent_app_sample_code/cookbook/config/__init__.py b/autogen_agent_app_sample_code/cookbook/config/__init__.py new file mode 100644 index 0000000..20222c0 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/__init__.py @@ -0,0 +1,99 @@ +from typing import Any, Dict, Tuple, Type +import yaml +from pydantic import BaseModel +import importlib +import json + + +def serializable_config_to_yaml(obj: BaseModel) -> str: + data = obj.model_dump() + return yaml.dump(data) + + +# The way serialization works: +# The goal of serialization is to save the class name (e.g., util.xx.xx.configClassName) with the dumped YAML. +# This allows ANY config to be dynamically loaded from a YAML without knowing about the configClassName before OR having it imported in your python env. +# This is necessary for MultiAgent.`agents` and FunctionCallingAgent.`tools` since they can have multiple types of agent or tool configs in them -- when the config is loaded in the serving or local env, we don't know what these configClassName will be ahead of time & we want to avoid importing them all in the python env. +# How it works: +# the ONLY way to dump a class is to call model_dump() on it, which will return a dict with the _CLASS_PATH_KEY key containing the class path e.g., util.xx.xx.configClassName +# all other dumping methods (yaml, etc) call model_dump() since it is a Pydantic method +# the ONLY way to load a serialized class is to call load_obj_from_yaml with the YAML string +# load_obj_from_yaml will parse the YAML string and get the class path key +# it will then use that class path key to dynamically load the class from the python path +# it will then call that class's _load_class_from_dict method with the remaining data to let it do anything custom e.g,. load the tools or the agents +# if you haven't overridden _load_class_from_dict, it will call the default implementation of this method from SerializableModel +# otherwise, it will call your overridden _load_class_from_dict method + +# How to use: +# Inherit your config class from SerializableModel +# If you don't have any SerializableModel fields, you can just call load_obj_from_yaml directly on your class's dumped YAML string, nothing else required +# If you have SerializableModel fields, you need to +# 1. Override the _load_class_from_dict method to handle the deserialization of those sub-configs +# 2. Override the model_dump method to call the model_dump of each of those sub-configs properly +# +# Examples +# 1. No sub-configs: GenieAgentConfig, UCTool +# 2. Has sub-configs: FunctionCallingAgentConfig (in `tools`), MultiAgentConfig (in `agents`) +# load_obj_from_yaml --> the only way a class is loaded, will get the class path key + +# TODO: add tests. 
this was tested manually in a notebook verifying that all classes worked. + + +_CLASS_PATH_KEY = "class_path" + + +class SerializableConfig(BaseModel): + def to_yaml(self) -> str: + return serializable_config_to_yaml(self) + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Override model_dump to exclude name and description fields. + + Returns: + Dict[str, Any]: Dictionary representation of the model excluding name and description. + """ + model_dumped = super().model_dump(**kwargs) + model_dumped[_CLASS_PATH_KEY] = f"{self.__module__}.{self.__class__.__name__}" + return model_dumped + + @classmethod + def _load_class_from_dict( + cls, class_object, data: Dict[str, Any] + ) -> "SerializableConfig": + return class_object(**data) + + def pretty_print(self): + print(json.dumps(self.model_dump(), indent=2)) + + +def serializable_config_to_yaml_file(obj: BaseModel, yaml_file_path: str) -> None: + with open(yaml_file_path, "w") as handle: + handle.write(serializable_config_to_yaml(obj)) + + +# Helper method used by SerializableModel's with fields containing SerializableModels +def _load_class_from_dict(data: Dict[str, Any]) -> Tuple[Type, Dict[str, Any]]: + """Dynamically load a class from data containing a class path. + + Args: + data: Dictionary containing _CLASS_PATH_KEY and other data + + Returns: + Tuple[Type, Dict[str, Any]]: The class object and the remaining data + """ + class_path = data.pop(_CLASS_PATH_KEY) + + module_name, class_name = class_path.rsplit(".", 1) + module = importlib.import_module(module_name) + return getattr(module, class_name), data + + +def load_serializable_config_from_yaml(yaml_str: str) -> SerializableConfig: + data = yaml.safe_load(yaml_str) + class_obj, remaining_data = _load_class_from_dict(data) + return class_obj._load_class_from_dict(class_obj, remaining_data) + + +def load_serializable_config_from_yaml_file(yaml_file_path: str) -> SerializableConfig: + with open(yaml_file_path, "r") as file: + return load_serializable_config_from_yaml(file.read()) diff --git a/autogen_agent_app_sample_code/cookbook/config/agents/__init__.py b/autogen_agent_app_sample_code/cookbook/config/agents/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/config/agents/function_calling_agent.py b/autogen_agent_app_sample_code/cookbook/config/agents/function_calling_agent.py new file mode 100644 index 0000000..f536ff6 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/agents/function_calling_agent.py @@ -0,0 +1,75 @@ +from typing import List, Any, Dict +from cookbook.config import serializable_config_to_yaml +import yaml +from pydantic import BaseModel +from cookbook.config import ( + load_serializable_config_from_yaml, +) +from cookbook.config.shared.llm import LLMConfig +from cookbook.config import ( + SerializableConfig, +) +from mlflow.models.resources import DatabricksResource, DatabricksServingEndpoint + + +class FunctionCallingAgentConfig(SerializableConfig): + """ + Configuration for the agent with MLflow input example. + + Attributes: + llm_config (LLMConfig): Configuration for the function-calling LLM. + input_example (Any): Used by MLflow to set the Agent's input schema. + tools (List[BaseTool]): List of tools used by the agent. 
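    Example (an illustrative sketch; `my_tool` stands in for any cookbook tool instance
    and the endpoint name is an assumption, not part of this patch):

        config = FunctionCallingAgentConfig(
            llm_config=LLMConfig(
                llm_endpoint_name="databricks-meta-llama-3-1-70b-instruct",
                llm_system_prompt_template="You are a helpful assistant. Use tools when needed.",
                llm_parameters=LLMParametersConfig(temperature=0.01, max_tokens=1500),
            ),
            tools=[my_tool],
        )
        yaml_str = config.to_yaml()  # serialized with class_path so it can be reloaded later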
+ """ + + tools: List[Any] + llm_config: LLMConfig + # Used by MLflow to set the Agent's input schema + input_example: Any = { + "messages": [ + { + "role": "user", + "content": "What can you help me with?", + }, + ] + } + + # name: str + # description: str + # endpoint_name: str + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Override model_dump to exclude name and description fields. + + Returns: + Dict[str, Any]: Dictionary representation of the model excluding name and description. + """ + model_dumped = super().model_dump(**kwargs) + model_dumped["tools"] = [ + yaml.safe_load(serializable_config_to_yaml(tool)) for tool in self.tools + ] + return model_dumped + + @classmethod + def _load_class_from_dict( + cls, class_object, data: Dict[str, Any] + ) -> "SerializableConfig": + # Deserialize tools, dynamically reconstructing each tool + tools = [] + for tool_dict in data["tools"]: + tool_yml = yaml.dump(tool_dict) + tools.append(load_serializable_config_from_yaml(tool_yml)) + + # Replace tools with deserialized instances + data["tools"] = tools + return class_object(**data) + + def get_resource_dependencies(self) -> List[DatabricksResource]: + dependencies = [ + DatabricksServingEndpoint(endpoint_name=self.llm_config.llm_endpoint_name), + ] + + # Add the Databricks resources for the retriever's vector indexes + for tool in self.tools: + dependencies.extend(tool.get_resource_dependencies()) + return dependencies diff --git a/autogen_agent_app_sample_code/cookbook/config/agents/genie_agent.py b/autogen_agent_app_sample_code/cookbook/config/agents/genie_agent.py new file mode 100644 index 0000000..42d9c37 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/agents/genie_agent.py @@ -0,0 +1,37 @@ +from typing import Any, List +from cookbook.config import SerializableConfig +from mlflow.models.resources import DatabricksResource, DatabricksGenieSpace + + +class GenieAgentConfig(SerializableConfig): + """ + Configuration for the agent with MLflow input example. + + Attributes: + llm_config (FunctionCallingLLMConfig): Configuration for the function-calling LLM. + input_example (Any): Used by MLflow to set the Agent's input schema. + """ + + # TODO: Add validation for the genie_space_id once the API is available. + genie_space_id: str + + # Used by MLflow to set the Agent's input schema + input_example: Any = { + "messages": [ + { + "role": "user", + "content": "What types of data can I query?", + }, + ] + } + + encountered_error_user_message: str = ( + "I encountered an error trying to answer your question, please try again." 
+ ) + + # name: str + # description: str + # endpoint_name: str + + def get_resource_dependencies(self) -> List[DatabricksResource]: + return [DatabricksGenieSpace(genie_space_id=self.genie_space_id)] diff --git a/autogen_agent_app_sample_code/cookbook/config/agents/multi_agent_supervisor.py b/autogen_agent_app_sample_code/cookbook/config/agents/multi_agent_supervisor.py new file mode 100644 index 0000000..4588b0b --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/agents/multi_agent_supervisor.py @@ -0,0 +1,266 @@ +from cookbook.config import _CLASS_PATH_KEY, serializable_config_to_yaml +from pydantic import BaseModel, field_validator +from typing import Any, List, Literal, Dict +from cookbook.config import ( + SerializableConfig, +) +from cookbook.config.shared.llm import LLMConfig, LLMParametersConfig +from cookbook.config import ( + load_serializable_config_from_yaml, +) +import yaml +from mlflow.pyfunc import PythonModel +from typing import Optional + + +# Design for multi-agent + +# requirements +# * can test locally with just the agent's pyfunc classes +# * when you change any config, it all just reloads + +# when you deploy: +# * you deploy each supervised agent separately to model serving +# * then mutli agent picks these up +# * then mutli agent deploys + +# * each child agent has [name, description, config, code] +# - when deployed, it reads it from the UC +# - locally, from the config + +# Internal implementation details for strings that the LLM sees and may need tuning +# These constants can be adjusted to improve the quality and reliability of the LLM's responses +FINISH_ROUTE_NAME = "FINISH" # reserved name for the finish agent which is hardcoded logic to return the last worker's response to the user +SUPERVISOR_ROUTE_NAME = "SUPERVISOR" # reserved name for the supervisor agent which is the main agent that controls the conversation +ROUTING_FUNCTION_NAME = "decide_next_worker_or_finish" # function name presented to the supervisor LLM via OpenAI function calling. Used by supervisor to return it's routing decision. +WORKER_PROMPT_TEMPLATE = "{worker_name}{worker_description}\n" +# Variable names in the ROUTING_FUNCTION_NAME for the supervisor agent's outputted thinking process and decision making +CONVERSATION_HISTORY_THINKING_PARAM = "conversation_history_thinking" +WORKER_CAPABILITIES_THINKING_PARAM = "worker_capabilities_thinking" +NEXT_WORKER_OR_FINISH_PARAM = "next_worker_or_finish" + +MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME = "multi_agent_supervisor_config.yaml" + + +class MultiAgentSupervisorConfig(SerializableConfig): + """ + Configuration for the multi-agent supervisor. + + Attributes: + llm_endpoint_name (str): Databricks Model Serving endpoint name for the supervisor's LLM. + llm_parameters (LLMParametersConfig): Parameters controlling LLM response behavior. + input_example (Any): Example input used by MLflow to set the model's input schema. + playground_debug_mode (bool): When True, outputs debug info to playground UI. Defaults to False. + agent_loading_mode (str): Mode for loading supervised agents - "local" or "model_serving". + max_workers_called (int): Maximum number of worker agent turns before finishing. + supervisor_system_prompt (str): System prompt template for the supervisor agent. + """ + + llm_endpoint_name: str + """ + Databricks Model Serving endpoint name. + This is the LLM used by the supervisor to make decisions. 
+ Databricks foundational model endpoints can be found here: https://docs.databricks.com/en/machine-learning/foundation-models/index.html + """ + + llm_parameters: LLMParametersConfig + """ + Parameters that control how the LLM responds, including temperature and max_tokens. + See LLMParametersConfig for details on available parameters. + """ + input_example: Any = { + "messages": [ + { + "role": "user", + "content": "What can you help me with?", + }, + ] + } + """ + Example input used by MLflow to set the Agent's input schema when calling mlflow.pyfunc.log_model(). + This should match the format of inputs that will be passed to the model's predict() method. + For chat agents, this is typically a dictionary containing a 'messages' key with an array of message objects. + Example: {'messages': [{'role': 'user', 'content': 'Hello'}]} + """ + + playground_debug_mode: bool = False + """ + Outputs details of all supervised agent's tool calling to the playground UI by adding it to the agent's response. + Turn off if you don't want end users to see this debugging information, but highly recommended to keep enabled + during development and pre-prod to visualize the agent's logic in playground/review app. + """ + + agent_loading_mode: Literal["local", "model_serving"] = "local" + """ + Mode for loading supervised agents: + - local: Supervised agent's code and config are loaded from your local environment. Use this mode during development for faster inner loop testing. + - model_serving: Supervised agent is deployed as a Databricks Model Serving endpoint that gets called. Use this mode when deploying the agent to pre-prod/prod environments. + """ + + @field_validator("max_supervisor_loops") + def validate_max_workers(cls, v: int) -> int: + if v <= 1: + raise ValueError("max_workers_called must be greater than 1") + return v + + max_supervisor_loops: int = 5 + """ + The maximum turns of conversations with the workers before the last worker's response is returned to the user by the supervisor's hard coded logic. + Must be greater than 1. + """ + + supervisor_system_prompt: str = """## Role +You are a supervisor responsible for managing a conversation between a user and the following workers. You select the next worker to respond or end the conversation to return the last worker's response to the user. Use the `{ROUTING_FUNCTION_NAME}` function to share your step-by-step reasoning and decision. + +## Workers +{workers_names_and_descriptions} + +## Objective +Your goal is to facilitate the conversation and ensure the user receives a helpful response. + +## Instructions +1. **Review the Conversation History**: Think step by step by to understand the user's request and the conversation history which includes previous worker's responses. Output to the `{CONVERSATION_HISTORY_THINKING_PARAM}` variable. +2. **Assess Worker Descriptions**: Think step by step to consider the description of each worker to understand their capabilities in the context of the conversation history. Output to the `{WORKER_CAPABILITIES_THINKING_PARAM}` variable. +3. **Select the next worker OR finish the conversation**: Based on the converastion history, the worker's descriptions and your thinking, decide which worker should respond next OR if the conversation should finish with the last worker's response going to the user. Output either the or "{FINISH_ROUTE_NAME}" to the `{NEXT_WORKER_OR_FINISH_PARAM}` variable. 
+ +## Additional Notes +- A conversation is considered "stuck" if there is no progress or if workers are unable to proceed with their tasks.""" + """ + System prompt sent to the supervisor agent before the conversation history to guide its decision-making process. + The variable names like {ROUTING_FUNCTION_NAME}, {workers_names_and_descriptions}, etc. will be used by format() in the agent's code to populate the prompt at runtime, so do not change them. + Improving quality: You will tune this prompt to improve the supervisor's ability to route the conversation - start with worker descriptions & names, then tune the rest of the prompt. + """ + + supervisor_user_prompt: str = ( + """Given the converastion history, the worker's descriptions and your thinking, which worker should act next OR should we FINISH? Respond with one of {worker_names_with_finish} to the `{NEXT_WORKER_OR_FINISH_PARAM}` variable in the `{ROUTING_FUNCTION_NAME}` function.""" + ) + """ + Prompt sent to supervisor after system prompt and conversation history to request next worker selection. + The variable names will be populated at runtime via format(). + """ + + supervisor_error_response: str = "I'm sorry, I don't know how to help with that." + + finish_agent_description: str = ( + "End the conversation, returning the last role='assistant'message to the user." + ) + + agents: List[Any] + """ + List of supervised agents that will be called by the supervisor agent. Each agent must be a agent that implements the cookbook's Agent configuration interface. + """ + + @classmethod + def _load_class_from_dict( + cls, class_object, data: Dict[str, Any] + ) -> "SerializableConfig": + # Deserialize tools, dynamically reconstructing each tool + agents = [] + for agent_dict in data["agents"]: + agent_yml = yaml.dump(agent_dict) + agents.append(load_serializable_config_from_yaml(agent_yml)) + + # Replace tools with deserialized instances + data["agents"] = agents + return class_object(**data) + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Override model_dump to exclude name and description fields. + + Returns: + Dict[str, Any]: Dictionary representation of the model excluding name and description. + """ + + model_dumped = super().model_dump(**kwargs) + model_dumped["agents"] = [ + yaml.safe_load(serializable_config_to_yaml(agent)) for agent in self.agents + ] + return model_dumped + + +class SupervisedAgentConfig(SerializableConfig): + name: str + description: str + endpoint_name: Optional[str] = None + agent_config: Optional[SerializableConfig] = None + agent_class_path: Optional[str] = None + + # TODO: check agent_class is a subclass of our Agent - need to refactor Agent to a common base class + def __init__( + self, + name: str, + description: str, + *, + endpoint_name: Optional[str] = None, + agent_config: Optional[SerializableConfig] = None, + agent_class: Optional[type] = None, + agent_class_path: Optional[str] = None, + ): + """Initialize a SupervisedAgentConfig instance. 
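        Example (an illustrative sketch; the agent class, config object, and endpoint
        name are placeholders, not part of this patch):

            # Local development: wrap an in-process agent class and its config
            supervised = SupervisedAgentConfig(
                name="docs_agent",
                description="Answers questions about the product documentation.",
                agent_class=FunctionCallingAgent,
                agent_config=docs_agent_config,
            )

            # Pre-prod/prod: point at an already-deployed Model Serving endpoint instead
            supervised_prod = SupervisedAgentConfig(
                name="docs_agent",
                description="Answers questions about the product documentation.",
                endpoint_name="agents_my_catalog-my_schema-docs_agent",
            )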
+ + Args: + name (str): Name of the supervised agent + description (str): Description of the agent's capabilities + endpoint_name (str): Databricks Model Serving endpoint name + config (Any): Agent's configuration + code (Any): Agent's implementation class + """ + if agent_class is not None and agent_class_path is not None: + raise ValueError( + "Only one of agent_class or agent_class_path can be provided" + ) + + if agent_class is not None: + if not isinstance(agent_class, type): + raise ValueError("agent_class must be an uninstantiated class") + if not issubclass(agent_class, PythonModel): + raise ValueError("agent_class must be a subclass of PythonModel") + + agent_class_path = f"{agent_class.__module__}.{agent_class.__name__}" + + if (endpoint_name is None) and ( + agent_config is None and agent_class_path is None + ): + raise ValueError( + "One of endpoint_name or agent_config/agent_class(_path) must be provided" + ) + + super().__init__( + name=name, + description=description, + endpoint_name=endpoint_name, + agent_config=agent_config, + agent_class_path=agent_class_path, + ) + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Override model_dump to exclude name and description fields. + + Returns: + Dict[str, Any]: Dictionary representation of the model excluding name and description. + """ + + # only modify the method if agent_config is present, otherwise, this is not needed + if self.agent_config is not None: + kwargs["exclude"] = {"agent_config"}.union(kwargs.get("exclude", set())) + model_dumped = super().model_dump(**kwargs) + model_dumped["agent_config"] = yaml.safe_load( + serializable_config_to_yaml(self.agent_config) + ) + return model_dumped + else: + return super().model_dump(**kwargs) + + @classmethod + def _load_class_from_dict( + cls, class_object, data: Dict[str, Any] + ) -> "SerializableConfig": + + # Deserialize agent config but only if it is present + if data["agent_config"] is not None: + agent_config = load_serializable_config_from_yaml( + yaml.dump(data["agent_config"]) + ) + data["agent_config"] = agent_config + + return class_object(**data) diff --git a/autogen_agent_app_sample_code/cookbook/config/agents/rag_only.py b/autogen_agent_app_sample_code/cookbook/config/agents/rag_only.py new file mode 100644 index 0000000..59fbe29 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/agents/rag_only.py @@ -0,0 +1,25 @@ +from cookbook.config.shared.llm import LLMConfig +from cookbook.config.tools.vector_search_tool import VectorSearchRetrieverTool + + +from pydantic import BaseModel + + +from typing import Any + + +class RAGConfig(BaseModel): + """ + Configuration for a RAG chain with MLflow input example. + + Attributes: + llm_config (LLMConfig): Configuration for the function-calling LLM. + vector_search_retriever_config (VectorSearchRetrieverConfig): Configuration for the Databricks vector search + index. + input_example (Any): Used by MLflow to set the RAG chain's input schema. 
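    Example (an illustrative sketch; `my_retriever_tool` and the endpoint name are
    assumptions, not part of this patch):

        rag_config = RAGConfig(
            vector_search_retriever_config=my_retriever_tool,
            llm_config=LLMConfig(
                llm_endpoint_name="databricks-meta-llama-3-1-70b-instruct",
                llm_system_prompt_template="Answer using only the retrieved context.",
                llm_parameters=LLMParametersConfig(temperature=0.01, max_tokens=1500),
            ),
            input_example={"messages": [{"role": "user", "content": "What is RAG?"}]},
        )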
+ """ + + vector_search_retriever_config: VectorSearchRetrieverTool + llm_config: LLMConfig + # Used by MLflow to set the Agent's input schema + input_example: Any diff --git a/autogen_agent_app_sample_code/cookbook/config/data_pipeline/__init__.py b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/__init__.py new file mode 100644 index 0000000..f89bde9 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/__init__.py @@ -0,0 +1,49 @@ +from cookbook.config import SerializableConfig, serializable_config_to_yaml +import yaml +from cookbook.config import ( + load_serializable_config_from_yaml, +) +from cookbook.config.data_pipeline.data_pipeline_output import DataPipelineOuputConfig +from cookbook.config.data_pipeline.recursive_text_splitter import ( + RecursiveTextSplitterChunkingConfig, +) +from cookbook.config.data_pipeline.uc_volume_source import UCVolumeSourceConfig + + +from typing import Any, Dict + + +class DataPipelineConfig(SerializableConfig): + source: UCVolumeSourceConfig + output: DataPipelineOuputConfig + chunking_config: RecursiveTextSplitterChunkingConfig + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Override model_dump to exclude name and description fields. + + Returns: + Dict[str, Any]: Dictionary representation of the model excluding name and description. + """ + model_dumped = super().model_dump(**kwargs) + model_dumped["source"] = yaml.safe_load( + serializable_config_to_yaml(self.source) + ) + model_dumped["output"] = yaml.safe_load( + serializable_config_to_yaml(self.output) + ) + model_dumped["chunking_config"] = yaml.safe_load( + serializable_config_to_yaml(self.chunking_config) + ) + return model_dumped + + @classmethod + def _load_class_from_dict( + cls, class_object, data: Dict[str, Any] + ) -> "SerializableConfig": + # Deserialize sub-configs + data["source"] = load_serializable_config_from_yaml(yaml.dump(data["source"])) + data["output"] = load_serializable_config_from_yaml(yaml.dump(data["output"])) + data["chunking_config"] = load_serializable_config_from_yaml( + yaml.dump(data["chunking_config"]) + ) + return class_object(**data) diff --git a/autogen_agent_app_sample_code/cookbook/config/data_pipeline/data_pipeline_output.py b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/data_pipeline_output.py new file mode 100644 index 0000000..2a2a19b --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/data_pipeline_output.py @@ -0,0 +1,314 @@ +from cookbook.config import SerializableConfig +from typing import Optional + +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound +from databricks.sdk.errors.platform import ResourceDoesNotExist +from databricks.sdk.service.vectorsearch import EndpointType + + +class DataPipelineOuputConfig(SerializableConfig): + """Configuration for managing output locations and naming conventions in the data pipeline. + + This class handles the configuration of table names and vector search endpoints for the data pipeline. + It follows a consistent naming pattern for all generated tables and provides version control capabilities. + + Naming Convention: + {catalog}.{schema}.{base_table_name}_{table_postfix}__{version_suffix} + + Generated Tables: + 1. Parsed docs table: Stores the raw parsed documents + 2. Chunked docs table: Stores the documents split into chunks + 3. 
Vector index: Stores the vector embeddings for search + + Args: + uc_catalog_name (str): Unity Catalog name where tables will be created + uc_schema_name (str): Schema name within the catalog + base_table_name (str): Core name used as prefix for all generated tables + docs_table_postfix (str, optional): Suffix for the parsed documents table. Defaults to "docs" + chunked_table_postfix (str, optional): Suffix for the chunked documents table. Defaults to "docs_chunked" + vector_index_postfix (str, optional): Suffix for the vector index. Defaults to "docs_chunked_index" + version_suffix (str, optional): Version identifier (e.g., 'v1', 'test') to maintain multiple pipeline versions + vector_search_endpoint (str): Name of the vector search endpoint to use + + Examples: + With version_suffix="v1": + >>> config = DataPipelineOuputConfig( + ... uc_catalog_name="my_catalog", + ... uc_schema_name="my_schema", + ... base_table_name="agent", + ... version_suffix="v1" + ... ) + # Generated tables: + # - my_catalog.my_schema.agent_docs__v1 + # - my_catalog.my_schema.agent_docs_chunked__v1 + # - my_catalog.my_schema.agent_docs_chunked_index__v1 + + Without version_suffix: + # - my_catalog.my_schema.agent_docs + # - my_catalog.my_schema.agent_docs_chunked + # - my_catalog.my_schema.agent_docs_chunked_index + """ + + vector_search_endpoint: str + parsed_docs_table: str + chunked_docs_table: str + vector_index: str + + def __init__( + self, + *, + vector_search_endpoint: str, + parsed_docs_table: Optional[str] = None, + chunked_docs_table: Optional[str] = None, + vector_index: Optional[str] = None, + uc_catalog_name: Optional[str] = None, + uc_schema_name: Optional[str] = None, + base_table_name: Optional[str] = None, + docs_table_postfix: str = "docs", + chunked_table_postfix: str = "docs_chunked", + vector_index_postfix: str = "docs_chunked_index", + version_suffix: Optional[str] = None, + ): + """Initialize a new DataPipelineOuputConfig instance. + + Supports two initialization styles: + 1. Direct table names: + - parsed_docs_table + - chunked_docs_table + - vector_index + + 2. Generated table names using: + - uc_catalog_name + - uc_schema_name + - base_table_name + - [optional] postfixes and version_suffix + + Args: + vector_search_endpoint (str): Name of the vector search endpoint to use + parsed_docs_table (str, optional): Direct table name for parsed docs + chunked_docs_table (str, optional): Direct table name for chunked docs + vector_index (str, optional): Direct name for vector index + uc_catalog_name (str, optional): Unity Catalog name where tables will be created + uc_schema_name (str, optional): Schema name within the catalog + base_table_name (str, optional): Core name used as prefix for all generated tables + docs_table_postfix (str, optional): Suffix for parsed documents table. Defaults to "docs" + chunked_table_postfix (str, optional): Suffix for chunked documents table. Defaults to "docs_chunked" + vector_index_postfix (str, optional): Suffix for vector index. 
Defaults to "docs_chunked_index" + version_suffix (str, optional): Version identifier for multiple pipeline versions + """ + _validate_not_default(vector_search_endpoint) + + if parsed_docs_table and chunked_docs_table and vector_index: + # Direct table names provided + if any([uc_catalog_name, uc_schema_name, base_table_name]): + raise ValueError( + "Cannot provide both direct table names and table name generation parameters" + ) + elif all([uc_catalog_name, uc_schema_name, base_table_name]): + # Generate table names + _validate_not_default(uc_catalog_name) + _validate_not_default(uc_schema_name) + _validate_not_default(base_table_name) + + parsed_docs_table = _build_table_name( + uc_catalog_name, + uc_schema_name, + base_table_name, + docs_table_postfix, + version_suffix, + ) + chunked_docs_table = _build_table_name( + uc_catalog_name, + uc_schema_name, + base_table_name, + chunked_table_postfix, + version_suffix, + ) + vector_index = _build_table_name( + uc_catalog_name, + uc_schema_name, + base_table_name, + vector_index_postfix, + version_suffix, + escape=False, + ) + else: + raise ValueError( + "Must provide either all direct table names or all table name generation parameters" + ) + + super().__init__( + parsed_docs_table=parsed_docs_table, + chunked_docs_table=chunked_docs_table, + vector_index=vector_index, + vector_search_endpoint=vector_search_endpoint, + ) + + def check_if_vector_search_endpoint_exists(self): + w = WorkspaceClient() + vector_search_endpoints = w.vector_search_endpoints.list_endpoints() + if ( + sum( + [ + self.vector_search_endpoint == ve.name + for ve in vector_search_endpoints + ] + ) + == 0 + ): + return False + else: + return True + + def create_vector_search_endpoint(self): + w = WorkspaceClient() + print( + f"Please wait, creating Vector Search endpoint `{self.vector_search_endpoint}`. This can take up to 20 minutes..." + ) + w.vector_search_endpoints.create_endpoint_and_wait( + self.vector_search_endpoint, endpoint_type=EndpointType.STANDARD + ) + # Make sure vector search endpoint is online and ready. + w.vector_search_endpoints.wait_get_endpoint_vector_search_endpoint_online( + self.vector_search_endpoint + ) + + def create_or_validate_vector_search_endpoint(self): + if not self.check_if_vector_search_endpoint_exists(): + self.create_vector_search_endpoint() + return self.validate_vector_search_endpoint() + + def validate_vector_search_endpoint(self) -> tuple[bool, str]: + """ + Validates that the specified Vector Search endpoint exists + Returns: + tuple[bool, str]: A tuple containing (success, error_message). + If validation passes, returns (True, success_message). If validation fails, returns (False, error_message). + """ + if not self.check_if_vector_search_endpoint_exists(): + msg = f"Vector Search endpoint '{self.vector_search_endpoint}' does not exist. Please either manually create it or call `output_config.create_or_validate_vector_search_endpoint()` to create it." + return (False, msg) + + msg = f"Vector Search endpoint '{self.vector_search_endpoint}' exists." + print(msg) + return (True, msg) + + def validate_catalog_and_schema(self) -> tuple[bool, str]: + """ + Validates that the specified catalog and schema exist + Returns: + tuple[bool, str]: A tuple containing (success, error_message). + If validation passes, returns (True, success_message). If validation fails, returns (False, error_message). 
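        Example (illustrative usage; `output_config` is a previously constructed
        DataPipelineOuputConfig instance):

            is_valid, msg = output_config.validate_catalog_and_schema()
            if not is_valid:
                raise Exception(msg)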
+ """ + + # Check catalog and schema for parsed_docs_table + parsed_docs_catalog = _get_uc_catalog_name(self.parsed_docs_table) + parsed_docs_schema = _get_uc_schema_name(self.parsed_docs_table) + if not _check_if_catalog_exists(parsed_docs_catalog): + msg = f"Catalog '{parsed_docs_catalog}' does not exist for parsed_docs_table. Please create it first." + return (False, msg) + if not _check_if_schema_exists(parsed_docs_catalog, parsed_docs_schema): + msg = f"Schema '{parsed_docs_schema}' does not exist in catalog '{parsed_docs_catalog}' for parsed_docs_table. Please create it first." + return (False, msg) + + # Check catalog and schema for chunked_docs_table + chunked_docs_catalog = _get_uc_catalog_name(self.chunked_docs_table) + chunked_docs_schema = _get_uc_schema_name(self.chunked_docs_table) + if not _check_if_catalog_exists(chunked_docs_catalog): + msg = f"Catalog '{chunked_docs_catalog}' does not exist for chunked_docs_table. Please create it first." + return (False, msg) + if not _check_if_schema_exists(chunked_docs_catalog, chunked_docs_schema): + msg = f"Schema '{chunked_docs_schema}' does not exist in catalog '{chunked_docs_catalog}' for chunked_docs_table. Please create it first." + return (False, msg) + + # Check catalog and schema for vector_index + vector_index_catalog = _get_uc_catalog_name(self.vector_index) + vector_index_schema = _get_uc_schema_name(self.vector_index) + if not _check_if_catalog_exists(vector_index_catalog): + msg = f"Catalog '{vector_index_catalog}' does not exist for vector_index. Please create it first." + return (False, msg) + if not _check_if_schema_exists(vector_index_catalog, vector_index_schema): + msg = f"Schema '{vector_index_schema}' does not exist in catalog '{vector_index_catalog}' for vector_index. Please create it first." + return (False, msg) + + msg = f"All catalogs and schemas exist for parsed_docs_table, chunked_docs_table, and vector_index." + print(msg) + return (True, msg) + + +def _escape_uc_fqn(uc_fqn: str) -> str: + """ + Escape the fully qualified name (FQN) for a Unity Catalog asset if it contains special characters. + + Args: + uc_fqn (str): The fully qualified name of the asset. + + Returns: + str: The escaped fully qualified name if it contains special characters, otherwise the original FQN. + """ + if "-" in uc_fqn: + parts = uc_fqn.split(".") + escaped_parts = [f"`{part}`" for part in parts] + return ".".join(escaped_parts) + else: + return uc_fqn + + +def _build_table_name( + uc_catalog_name: str, + uc_schema_name: str, + base_table_name: str, + postfix: str, + version_suffix: str = None, + escape: bool = True, +) -> str: + """Helper to build consistent table names + + Args: + postfix: The table name postfix to append + escape: Whether to escape special characters in the table name. Defaults to True. 
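        Example (illustrative):
            _build_table_name("my_catalog", "my_schema", "agent", "docs", "v1")
            # -> "my_catalog.my_schema.agent_docs__v1"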
+ + Returns: + The constructed table name, optionally escaped + """ + suffix = f"__{version_suffix}" if version_suffix else "" + raw_name = f"{uc_catalog_name}.{uc_schema_name}.{base_table_name}_{postfix}{suffix}" + return _escape_uc_fqn(raw_name) if escape else raw_name + + +def _validate_not_default(value: str) -> str: + if value == "REPLACE_ME": + raise ValueError( + "Please replace the default value 'REPLACE_ME' with your actual configuration" + ) + return value + + +def _get_uc_catalog_name(uc_fqn: str) -> str: + unescaped_uc_fqn = uc_fqn.replace("`", "") + return unescaped_uc_fqn.split(".")[0] + + +def _get_uc_schema_name(uc_fqn: str) -> str: + unescaped_uc_fqn = uc_fqn.replace("`", "") + return unescaped_uc_fqn.split(".")[1] + + +def _check_if_catalog_exists(uc_catalog_name) -> bool: + w = WorkspaceClient() + try: + w.catalogs.get(name=uc_catalog_name) + return True + except (ResourceDoesNotExist, NotFound): + return False + + +def _check_if_schema_exists(uc_catalog_name, uc_schema_name) -> bool: + w = WorkspaceClient() + try: + full_name = f"{uc_catalog_name}.{uc_schema_name}" + w.schemas.get(full_name=full_name) + return True + except (ResourceDoesNotExist, NotFound): + return False diff --git a/autogen_agent_app_sample_code/cookbook/config/data_pipeline/recursive_text_splitter.py b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/recursive_text_splitter.py new file mode 100644 index 0000000..17c15aa --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/recursive_text_splitter.py @@ -0,0 +1,89 @@ +from cookbook.config import SerializableConfig +from cookbook.databricks_utils import ( + get_workspace_hostname, +) +from cookbook.data_pipeline.recursive_character_text_splitter import ( + EMBEDDING_MODELS, + detect_fmapi_embedding_model_type, + validate_chunk_size, +) + + +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors.platform import ResourceDoesNotExist +from databricks.sdk.service.serving import EndpointStateReady + + +class RecursiveTextSplitterChunkingConfig(SerializableConfig): + """ + Configuration for the Unstructured Data Pipeline. + + Args: + embedding_model_endpoint (str): + Embedding model endpoint hosted on Model Serving. Default is `databricks-gte-large`. This can be an External Model, such as OpenAI or a Databricks hosted model on Foundational Model API. The list of Databricks hosted models can be found here: https://docs.databricks.com/en/machine-learning/foundation-models/index.html + chunk_size_tokens (int): + The size of each chunk of the document in tokens. Default is 1024. + chunk_overlap_tokens (int): + The overlap of tokens between chunks. Default is 256. + """ + + embedding_model_endpoint: str = "databricks-gte-large-en" + chunk_size_tokens: int = 1024 + chunk_overlap_tokens: int = 256 + + def validate_embedding_endpoint(self) -> tuple[bool, str]: + """ + Validates that the specified embedding endpoint exists and is of the correct type + Returns: + tuple[bool, str]: A tuple containing (success, error_message). + If validation passes, returns (True, success_message). If validation fails, returns (False, error_message). + """ + task_type = "llm/v1/embeddings" + w = WorkspaceClient() + browser_url = get_workspace_hostname() + try: + llm_endpoint = w.serving_endpoints.get(name=self.embedding_model_endpoint) + except ResourceDoesNotExist as e: + msg = f"Model serving endpoint {self.embedding_model_endpoint} not found." 
+ return (False, msg) + if llm_endpoint.state.ready != EndpointStateReady.READY: + msg = f"Model serving endpoint {self.embedding_model_endpoint} is not in a READY state. Please visit the status page to debug: {browser_url}/ml/endpoints/{self.embedding_model_endpoint}" + return (False, msg) + if llm_endpoint.task != task_type: + msg = f"Model serving endpoint {self.embedding_model_endpoint} is online & ready, but does not support task type {task_type}. Details at: {browser_url}/ml/endpoints/{self.embedding_model_endpoint}" + return (False, msg) + + msg = f"Validated serving endpoint {self.embedding_model_endpoint} as READY and of type {task_type}. View here: {browser_url}/ml/endpoints/{self.embedding_model_endpoint}" + print(msg) + return (True, msg) + + def validate_chunk_size_and_overlap(self) -> tuple[bool, str]: + """ + Validates that chunk_size and overlap values are valid + Returns: + tuple[bool, str]: A tuple containing (success, error_message). + If validation passes, returns (True, success_message). If validation fails, returns (False, error_message). + """ + # Detect the embedding model and its configuration + embedding_model_name, chunk_spec = detect_fmapi_embedding_model_type( + self.embedding_model_endpoint + ) + + # Update chunk specification based on provided parameters + chunk_spec["chunk_size_tokens"] = self.chunk_size_tokens + chunk_spec["chunk_overlap_tokens"] = self.chunk_overlap_tokens + + if chunk_spec is None or embedding_model_name is None: + # Fall back to using provided embedding_model_name + chunk_spec = EMBEDDING_MODELS.get(embedding_model_name) + if chunk_spec is None: + msg = f"Embedding model `{embedding_model_name}` not found, so can't validate chunking config. Chunking config must be validated for a specific embedding model. Available models: {EMBEDDING_MODELS.keys()}" + return (False, msg) + + # Validate chunk size and overlap + is_valid, msg = validate_chunk_size(chunk_spec) + if not is_valid: + return (False, msg) + else: + print(msg) + return (True, msg) diff --git a/autogen_agent_app_sample_code/cookbook/config/data_pipeline/uc_volume_source.py b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/uc_volume_source.py new file mode 100644 index 0000000..f471165 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/data_pipeline/uc_volume_source.py @@ -0,0 +1,132 @@ +from cookbook.config import SerializableConfig +from cookbook.databricks_utils import get_volume_url + + +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import NotFound +from databricks.sdk.errors.platform import ResourceAlreadyExists, ResourceDoesNotExist +from databricks.sdk.service.catalog import VolumeType +from pydantic import Field, computed_field, field_validator + + +class UCVolumeSourceConfig(SerializableConfig): + """ + Source data configuration for the Unstructured Data Pipeline. You can modify this class to add additional configuration settings. + + Args: + uc_catalog_name (str): + Required. Name of the Unity Catalog. + uc_schema_name (str): + Required. Name of the Unity Catalog schema. + uc_volume_name (str): + Required. Name of the Unity Catalog volume. 
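    Example (an illustrative sketch; the catalog, schema, and volume names are placeholders):

        source_config = UCVolumeSourceConfig(
            uc_catalog_name="my_catalog",
            uc_schema_name="my_schema",
            uc_volume_name="raw_docs",
        )
        is_valid, msg = source_config.create_or_validate_volume()
        if not is_valid:
            raise Exception(msg)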
+ """ + + @field_validator("uc_catalog_name", "uc_schema_name", "uc_volume_name") + def validate_not_default(cls, value: str) -> str: + if value == "REPLACE_ME": + raise ValueError( + "Please replace the default value 'REPLACE_ME' with your actual configuration" + ) + return value + + uc_catalog_name: str = Field(..., min_length=1) + uc_schema_name: str = Field(..., min_length=1) + uc_volume_name: str = Field(..., min_length=1) + + @computed_field() + def volume_path(self) -> str: + return f"/Volumes/{self.uc_catalog_name}/{self.uc_schema_name}/{self.uc_volume_name}" + + @computed_field() + def volume_uc_fqn(self) -> str: + return f"{self.uc_catalog_name}.{self.uc_schema_name}.{self.uc_volume_name}" + + def check_if_volume_exists(self) -> bool: + w = WorkspaceClient() + try: + # Use the computed field instead of reconstructing the FQN + w.volumes.read(name=self.volume_uc_fqn) + return True + except (ResourceDoesNotExist, NotFound): + return False + + def create_volume(self): + try: + w = WorkspaceClient() + w.volumes.create( + catalog_name=self.uc_catalog_name, + schema_name=self.uc_schema_name, + name=self.uc_volume_name, + volume_type=VolumeType.MANAGED, + ) + except ResourceAlreadyExists: + pass + + def check_if_catalog_exists(self) -> bool: + w = WorkspaceClient() + try: + w.catalogs.get(name=self.uc_catalog_name) + return True + except (ResourceDoesNotExist, NotFound): + return False + + def check_if_schema_exists(self) -> bool: + w = WorkspaceClient() + try: + full_name = f"{self.uc_catalog_name}.{self.uc_schema_name}" + w.schemas.get(full_name=full_name) + return True + except (ResourceDoesNotExist, NotFound): + return False + + def create_or_validate_volume(self) -> tuple[bool, str]: + """ + Validates that the volume exists and creates it if it doesn't + Returns: + tuple[bool, str]: A tuple containing (success, error_message). + If validation passes, returns (True, success_message). If validation fails, returns (False, error_message). + """ + if not self.check_if_catalog_exists(): + msg = f"Catalog '{self.uc_catalog_name}' does not exist. Please create it first." + return (False, msg) + + if not self.check_if_schema_exists(): + msg = f"Schema '{self.uc_schema_name}' does not exist in catalog '{self.uc_catalog_name}'. Please create it first." + return (False, msg) + + if not self.check_if_volume_exists(): + print(f"Volume {self.volume_path} does not exist. Creating...") + try: + self.create_volume() + except Exception as e: + msg = f"Failed to create volume: {str(e)}" + return (False, msg) + msg = f"Successfully created volume {self.volume_path}. View here: {get_volume_url(self.volume_uc_fqn)}" + print(msg) + return (True, msg) + + msg = f"Volume {self.volume_path} exists. View here: {get_volume_url(self.volume_uc_fqn)}" + print(msg) + return (True, msg) + + def list_files(self) -> list[str]: + """ + Lists all files in the Unity Catalog volume using dbutils.fs. 
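        Example (illustrative usage; `source_config` is a previously constructed instance):

            file_names = source_config.list_files()
            print(f"Found {len(file_names)} files in {source_config.volume_path}")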
+ + Returns: + list[str]: A list of file paths in the volume + + Raises: + Exception: If the volume doesn't exist or there's an error accessing it + """ + if not self.check_if_volume_exists(): + raise Exception(f"Volume {self.volume_path} does not exist") + + w = WorkspaceClient() + try: + # List contents using dbutils.fs + files = w.dbutils.fs.ls(self.volume_path) + return [file.name for file in files] + except Exception as e: + raise Exception(f"Failed to list files in volume: {str(e)}") diff --git a/autogen_agent_app_sample_code/cookbook/config/shared/__init__.py b/autogen_agent_app_sample_code/cookbook/config/shared/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/config/shared/agent_storage_location.py b/autogen_agent_app_sample_code/cookbook/config/shared/agent_storage_location.py new file mode 100644 index 0000000..db490b7 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/shared/agent_storage_location.py @@ -0,0 +1,118 @@ +from pydantic import ( + field_validator, + FieldValidationInfo, +) +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors.platform import ( + ResourceDoesNotExist, + NotFound, +) +from pydantic import Field +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors.platform import ResourceDoesNotExist +from databricks.sdk.errors import NotFound +from cookbook.config import SerializableConfig +from databricks.sdk import WorkspaceClient + + +class AgentStorageConfig(SerializableConfig): + """ + Source data configuration for the Unstructured Data Pipeline. You can modify this class to add additional configuration settings. + + Args: + uc_model_name (str): + Required. Fully qualified name of the model in format: catalog.schema.model_name + evaluation_set_uc_table (str): + Required. Fully qualified name of the evaluation table in format: catalog.schema.table_name + """ + + uc_model_name: str = Field(..., min_length=1) + evaluation_set_uc_table: str = Field(..., min_length=1) + mlflow_experiment_name: str = Field(None) + + @field_validator("uc_model_name", "evaluation_set_uc_table") + @classmethod + def validate_uc_fqn_format(cls, v: str, info: FieldValidationInfo) -> str: + if v.count(".") != 2: + raise ValueError( + f"{info.field_name} must be in format: catalog.schema.name" + ) + return v + + @classmethod + def escape_uc_fqn(cls, uc_fqn: str) -> str: + """ + Escape the fully qualified name (FQN) for a Unity Catalog asset if it contains special characters. + + Args: + uc_fqn (str): The fully qualified name of the asset. + + Returns: + str: The escaped fully qualified name if it contains special characters, otherwise the original FQN. 
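        Example (illustrative):

            AgentStorageConfig.escape_uc_fqn("dev.my-schema.agent_model")
            # -> "`dev`.`my-schema`.`agent_model`"
            AgentStorageConfig.escape_uc_fqn("dev.my_schema.agent_model")
            # -> "dev.my_schema.agent_model" (unchanged; no special characters)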
+ """ + if "-" in uc_fqn: + parts = uc_fqn.split(".") + escaped_parts = [f"`{part}`" for part in parts] + return ".".join(escaped_parts) + else: + return uc_fqn + + def check_if_catalog_exists(self, catalog_name: str) -> bool: + w = WorkspaceClient() + try: + w.catalogs.get(name=catalog_name) + return True + except (ResourceDoesNotExist, NotFound): + return False + + def check_if_schema_exists(self, catalog_name: str, schema_name: str) -> bool: + w = WorkspaceClient() + try: + full_name = f"{catalog_name}.{schema_name}" + w.schemas.get(full_name=full_name) + return True + except (ResourceDoesNotExist, NotFound): + return False + + def validate_catalog_and_schema(self) -> tuple[bool, str]: + """ + Validates that the specified catalogs and schemas exist for both uc_model_name and evaluation_set_uc_table + Returns: + tuple[bool, str]: A tuple containing (success, error_message). + If validation passes, returns (True, success_message). If validation fails, returns (False, error_message). + """ + # Extract catalog and schema from uc_model_name + model_catalog, model_schema, _ = self.uc_model_name.split(".") + + # Extract catalog and schema from evaluation_set_uc_table + eval_catalog, eval_schema, _ = self.evaluation_set_uc_table.split(".") + + # Check model catalog and schema + if not self.check_if_catalog_exists(model_catalog): + return ( + False, + f"Model catalog '{model_catalog}' does not exist. Please create it first.", + ) + + if not self.check_if_schema_exists(model_catalog, model_schema): + return ( + False, + f"Model schema '{model_schema}' does not exist in catalog '{model_catalog}'. Please create it first.", + ) + + # Check evaluation table catalog and schema + if not self.check_if_catalog_exists(eval_catalog): + return ( + False, + f"Evaluation catalog '{eval_catalog}' does not exist. Please create it first.", + ) + + if not self.check_if_schema_exists(eval_catalog, eval_schema): + return ( + False, + f"Evaluation schema '{eval_schema}' does not exist in catalog '{eval_catalog}'. Please create it first.", + ) + + msg = f"All catalogs and schemas exist for both model `{self.uc_model_name}` and evaluation table `{self.evaluation_set_uc_table}`." + print(msg) + return (True, msg) diff --git a/autogen_agent_app_sample_code/cookbook/config/shared/llm.py b/autogen_agent_app_sample_code/cookbook/config/shared/llm.py new file mode 100644 index 0000000..9341fd5 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/config/shared/llm.py @@ -0,0 +1,42 @@ +from pydantic import BaseModel + +class LLMParametersConfig(BaseModel): + """ + Configuration for LLM response parameters. + + Attributes: + temperature (float): Controls randomness in the response. + max_tokens (int): Maximum number of tokens in the response. + top_p (float): Controls diversity via nucleus sampling. + top_k (int): Limits the number of highest probability tokens considered. + """ + + # Parameters that control how the LLM responds. + temperature: float = None + max_tokens: int = None + + +class LLMConfig(BaseModel): + """ + Configuration for the function-calling LLM. + + Attributes: + llm_endpoint_name (str): Databricks Model Serving endpoint name. + This is the generator LLM where your LLM queries are sent. + Databricks foundational model endpoints can be found here: + https://docs.databricks.com/en/machine-learning/foundation-models/index.html + llm_system_prompt_template (str): Template for the LLM prompt. + This is how the RAG chain combines the user's question and the retrieved context. 
+ llm_parameters (LLMParametersConfig): Parameters that control how the LLM responds. + """ + + # Databricks Model Serving endpoint name + # This is the generator LLM where your LLM queries are sent. + # Databricks foundational model endpoints can be found here: https://docs.databricks.com/en/machine-learning/foundation-models/index.html + llm_endpoint_name: str + + # Define a template for the LLM prompt. This is how the RAG chain combines the user's question and the retrieved context. + llm_system_prompt_template: str + + # Parameters that control how the LLM responds. + llm_parameters: LLMParametersConfig diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/__init__.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/build_retriever_index.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/build_retriever_index.py new file mode 100644 index 0000000..e1e80c4 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/data_pipeline/build_retriever_index.py @@ -0,0 +1,123 @@ +from databricks.sdk.service.vectorsearch import ( + VectorSearchIndexesAPI, + DeltaSyncVectorIndexSpecRequest, + EmbeddingSourceColumn, + PipelineType, + VectorIndexType, +) +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors.platform import ResourceDoesNotExist, BadRequest +import time +from cookbook.databricks_utils import get_table_url + + +# %md +# ##### `build_retriever_index` + +# `build_retriever_index` will build the vector search index which is used by our RAG to retrieve relevant documents. + +# Arguments: +# - `chunked_docs_table`: The chunked documents table. There is expected to be a `chunked_text` column, a `chunk_id` column, and a `url` column. +# - `primary_key`: The column to use for the vector index primary key. +# - `embedding_source_column`: The column to compute embeddings for in the vector index. +# - `vector_search_endpoint`: An optional vector search endpoint name. It not defined, defaults to the `{table_id}_vector_search`. +# - `vector_search_index_name`: An optional index name. If not defined, defaults to `{chunked_docs_table}_index`. +# - `embedding_endpoint_name`: An embedding endpoint name. +# - `force_delete_vector_search_endpoint`: Setting this to true will rebuild the vector search endpoint. + + +def build_retriever_index( + vector_search_endpoint: str, + chunked_docs_table_name: str, + vector_search_index_name: str, + embedding_endpoint_name: str, + force_delete_index_before_create=False, + primary_key: str = "chunk_id", # hard coded in the apply_chunking_fn + embedding_source_column: str = "content_chunked", # hard coded in the apply_chunking_fn +) -> tuple[bool, str]: + # Initialize workspace client and vector search API + w = WorkspaceClient() + vsc = w.vector_search_indexes + + def find_index(index_name): + try: + return vsc.get_index(index_name=index_name) + except ResourceDoesNotExist: + return None + + def wait_for_index_to_be_ready(index): + while not index.status.ready: + print( + f"Index {vector_search_index_name} exists, but is not ready, waiting 30 seconds..." + ) + time.sleep(30) + index = find_index(index_name=vector_search_index_name) + + def wait_for_index_to_be_deleted(index): + while index: + print( + f"Waiting for index {vector_search_index_name} to be deleted, waiting 30 seconds..." 
+ ) + time.sleep(30) + index = find_index(index_name=vector_search_index_name) + + existing_index = find_index(index_name=vector_search_index_name) + if existing_index: + print(f"Found existing index {get_table_url(vector_search_index_name)}...") + if force_delete_index_before_create: + print(f"Deleting index {vector_search_index_name}...") + vsc.delete_index(index_name=vector_search_index_name) + wait_for_index_to_be_deleted(existing_index) + create_index = True + else: + wait_for_index_to_be_ready(existing_index) + create_index = False + print( + f"Starting the sync of index {vector_search_index_name}, this can take 15 minutes or much longer if you have a larger number of documents." + ) + # print(existing_index) + try: + vsc.sync_index(index_name=vector_search_index_name) + msg = f"Kicked off index sync for {vector_search_index_name}." + return (False, msg) + except BadRequest as e: + msg = f"Index sync already in progress, so failed to kick off index sync for {vector_search_index_name}. Please wait for the index to finish syncing and try again." + return (True, msg) + else: + print( + f'Creating new vector search index "{vector_search_index_name}" on endpoint "{vector_search_endpoint}"' + ) + create_index = True + + if create_index: + print( + "Computing document embeddings and Vector Search Index. This can take 15 minutes or much longer if you have a larger number of documents." + ) + try: + # Create delta sync index spec using the proper class + delta_sync_spec = DeltaSyncVectorIndexSpecRequest( + source_table=chunked_docs_table_name, + pipeline_type=PipelineType.TRIGGERED, + embedding_source_columns=[ + EmbeddingSourceColumn( + name=embedding_source_column, + embedding_model_endpoint_name=embedding_endpoint_name, + ) + ], + ) + + vsc.create_index( + name=vector_search_index_name, + endpoint_name=vector_search_endpoint, + primary_key=primary_key, + index_type=VectorIndexType.DELTA_SYNC, + delta_sync_index_spec=delta_sync_spec, + ) + msg = ( + f"Successfully created vector search index {vector_search_index_name}." + ) + print(msg) + return (False, msg) + except Exception as e: + msg = f"Vector search index creation failed. Wait 5 minutes and try running this cell again." + return (True, msg) diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/chunk_docs.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/chunk_docs.py new file mode 100644 index 0000000..793a721 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/data_pipeline/chunk_docs.py @@ -0,0 +1,44 @@ +from typing import Literal, Optional, Any, Callable +from databricks.vector_search.client import VectorSearchClient +from pyspark.sql.functions import explode +import pyspark.sql.functions as func +from typing import Callable +from pyspark.sql.types import StructType, StringType, StructField, MapType, ArrayType +from pyspark.sql import DataFrame, SparkSession + + +def apply_chunking_fn( + parsed_docs_df: DataFrame, + chunking_fn: Callable[[str], list[str]], + propagate_columns: list[str], + doc_column: str = "content", +) -> DataFrame: + # imports here to avoid requiring these libraries in all notebooks since the data pipeline config imports this package + from langchain_text_splitters import RecursiveCharacterTextSplitter + from transformers import AutoTokenizer + import tiktoken + + print( + f"Applying chunking UDF to {parsed_docs_df.count()} documents using Spark - this may take a long time if you have many documents..." 
+ ) + + parser_udf = func.udf( + chunking_fn, returnType=ArrayType(StringType()), useArrow=True + ) + chunked_array_docs = parsed_docs_df.withColumn( + "content_chunked", parser_udf(doc_column) + ) # .drop(doc_column) + chunked_docs = chunked_array_docs.select( + *propagate_columns, explode("content_chunked").alias("content_chunked") + ) + + # Add a primary key: "chunk_id". + chunks_with_ids = chunked_docs.withColumn( + "chunk_id", func.md5(func.col("content_chunked")) + ) + # Reorder for better display. + chunks_with_ids = chunks_with_ids.select( + "chunk_id", "content_chunked", *propagate_columns + ) + + return chunks_with_ids diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/default_parser.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/default_parser.py new file mode 100644 index 0000000..277fdc1 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/data_pipeline/default_parser.py @@ -0,0 +1,162 @@ +from typing import TypedDict +from datetime import datetime +import warnings +import traceback +import os +from urllib.parse import urlparse + +# PDF libraries +import fitz +import pymupdf4llm + +# HTML libraries +import markdownify +import re + +## DOCX libraries +import pypandoc +import tempfile + +## JSON libraries +import json + + +# Schema of the dict returned by `file_parser(...)` +# This is used to create the output Delta Table's schema. +# Adjust the class if you want to add additional columns from your parser, such as extracting custom metadata. +class ParserReturnValue(TypedDict): + # DO NOT CHANGE THESE NAMES + # Parsed content of the document + content: str # do not change this name + # The status of whether the parser succeeds or fails, used to exclude failed files downstream + parser_status: str # do not change this name + # Unique ID of the document + doc_uri: str # do not change this name + + # OK TO CHANGE THESE NAMES + # Optionally, you can add additional metadata fields here + # example_metadata: str + last_modified: datetime + + +# Parser function. Adjust this function to modify the parsing logic. +def file_parser( + raw_doc_contents_bytes: bytes, + doc_path: str, + modification_time: datetime, + doc_bytes_length: int, +) -> ParserReturnValue: + """ + Parses the content of a PDF document into a string. + + This function takes the raw bytes of a PDF document and its path, attempts to parse the document using PyPDF, + and returns the parsed content and the status of the parsing operation. + + Parameters: + - raw_doc_contents_bytes (bytes): The raw bytes of the document to be parsed (set by Spark when loading the file) + - doc_path (str): The DBFS path of the document, used to verify the file extension (set by Spark when loading the file) + - modification_time (timestamp): The last modification time of the document (set by Spark when loading the file) + - doc_bytes_length (long): The size of the document in bytes (set by Spark when loading the file) + + Returns: + - ParserReturnValue: A dictionary containing the parsed document content and the status of the parsing operation. + The 'contenty will contain the parsed text as a string, and the 'parser_status' key will indicate + whether the parsing was successful or if an error occurred. 
+ """ + try: + from markdownify import markdownify as md + + filename, file_extension = os.path.splitext(doc_path) + + if file_extension == ".pdf": + pdf_doc = fitz.Document(stream=raw_doc_contents_bytes, filetype="pdf") + md_text = pymupdf4llm.to_markdown(pdf_doc) + + parsed_document = { + "content": md_text.strip(), + "parser_status": "SUCCESS", + } + elif file_extension == ".html": + html_content = raw_doc_contents_bytes.decode("utf-8") + + markdown_contents = md( + str(html_content).strip(), heading_style=markdownify.ATX + ) + markdown_stripped = re.sub(r"\n{3,}", "\n\n", markdown_contents.strip()) + + parsed_document = { + "content": markdown_stripped, + "parser_status": "SUCCESS", + } + elif file_extension == ".docx": + with tempfile.NamedTemporaryFile(delete=True) as temp_file: + temp_file.write(raw_doc_contents_bytes) + temp_file_path = temp_file.name + md = pypandoc.convert_file(temp_file_path, "markdown", format="docx") + + parsed_document = { + "content": md.strip(), + "parser_status": "SUCCESS", + } + elif file_extension in [".txt", ".md"]: + parsed_document = { + "content": raw_doc_contents_bytes.decode("utf-8").strip(), + "parser_status": "SUCCESS", + } + elif file_extension in [".json", ".jsonl"]: + # NOTE: This is a placeholder for a JSON parser. It's not a "real" parser, it just returns the raw JSON formatted into XML-like strings that LLMs tend to like. + json_data = json.loads(raw_doc_contents_bytes.decode("utf-8")) + + def flatten_json_to_xml(obj, parent_key=""): + xml_parts = [] + if isinstance(obj, dict): + for key, value in obj.items(): + if isinstance(value, (dict, list)): + xml_parts.append(flatten_json_to_xml(value, key)) + else: + xml_parts.append(f"<{key}>{str(value)}") + elif isinstance(obj, list): + for i, item in enumerate(obj): + if isinstance(item, (dict, list)): + xml_parts.append( + flatten_json_to_xml(item, f"{parent_key}_{i}") + ) + else: + xml_parts.append( + f"<{parent_key}_{i}>{str(item)}" + ) + else: + xml_parts.append(f"<{parent_key}>{str(obj)}") + return "\n".join(xml_parts) + + flattened_content = flatten_json_to_xml(json_data) + parsed_document = { + "content": flattened_content.strip(), + "parser_status": "SUCCESS", + } + else: + raise Exception(f"No supported parser for {doc_path}") + + # Extract the required doc_uri + # convert from `dbfs:/Volumes/catalog/schema/pdf_docs/filename.pdf` to `/Volumes/catalog/schema/pdf_docs/filename.pdf` + modified_path = urlparse(doc_path).path + parsed_document["doc_uri"] = modified_path + + # Sample metadata extraction logic + # if "test" in parsed_document["content + # parsed_document["example_metadata"] = "test" + # else: + # parsed_document["example_metadata"] = "not test" + + # Add the modified time + parsed_document["last_modified"] = modification_time + + return parsed_document + + except Exception as e: + status = f"An error occurred: {e}\n{traceback.format_exc()}" + warnings.warn(status) + return { + "content": "", + "parser_status": f"ERROR: {status}", + } diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/parse_docs.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/parse_docs.py new file mode 100644 index 0000000..182de01 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/data_pipeline/parse_docs.py @@ -0,0 +1,159 @@ +import traceback +from datetime import datetime +from typing import Any, Callable, TypedDict, Dict +import os +from IPython.display import display_markdown +import warnings +import pyspark.sql.functions as func +from pyspark.sql.types import StructType 
+from pyspark.sql import DataFrame, SparkSession + + +def _parse_and_extract( + raw_doc_contents_bytes: bytes, + modification_time: datetime, + doc_bytes_length: int, + doc_path: str, + parse_file_udf: Callable[[[dict, Any]], str], +) -> Dict[str, Any]: + """Parses raw bytes & extract metadata.""" + try: + # Run the parser + parser_output_dict = parse_file_udf( + raw_doc_contents_bytes=raw_doc_contents_bytes, + doc_path=doc_path, + modification_time=modification_time, + doc_bytes_length=doc_bytes_length, + ) + + if parser_output_dict.get("parser_status") == "SUCCESS": + return parser_output_dict + else: + raise Exception(parser_output_dict.get("parser_status")) + + except Exception as e: + status = f"An error occurred: {e}\n{traceback.format_exc()}" + warnings.warn(status) + return { + "content": "", + "doc_uri": doc_path, + "parser_status": status, + } + + +def _get_parser_udf( + # extract_metadata_udf: Callable[[[dict, Any]], str], + parse_file_udf: Callable[[[dict, Any]], str], + spark_dataframe_schema: StructType, +): + """Gets the Spark UDF which will parse the files in parallel. + + Arguments: + - extract_metadata_udf: A function that takes parsed content and extracts the metadata + - parse_file_udf: A function that takes the raw file and returns the parsed text. + - spark_dataframe_schema: The resulting schema of the document delta table + """ + # This UDF will load each file, parse the doc, and extract metadata. + parser_udf = func.udf( + lambda raw_doc_contents_bytes, modification_time, doc_bytes_length, doc_path: _parse_and_extract( + raw_doc_contents_bytes, + modification_time, + doc_bytes_length, + doc_path, + parse_file_udf, + ), + returnType=spark_dataframe_schema, + useArrow=True, + ) + return parser_udf + + +def load_files_to_df(spark: SparkSession, source_path: str) -> DataFrame: + """ + Load files from a directory into a Spark DataFrame. + Each row in the DataFrame will contain the path, length, and content of the file; for more + details, see https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html + """ + + print(f"Loading the raw files from {source_path}...") + # Load the raw riles + raw_files_df = ( + spark.read.format("binaryFile") + .option("recursiveFileLookup", "true") + .load(source_path) + ) + + # Check that files were present and loaded + if raw_files_df.count() == 0: + raise Exception(f"`{source_path}` does not contain any files.") + + # display_markdown( + # f"### Found {raw_files_df.count()} files in {source_path}: ", raw=True + # ) + # raw_files_df.display() + return raw_files_df + + +def apply_parsing_fn( + raw_files_df: DataFrame, + parse_file_fn: Callable[[[dict, Any]], str], + parsed_df_schema: StructType, +) -> DataFrame: + """ + Apply a file-parsing UDF to a DataFrame whose rows correspond to file content/metadata loaded via + https://spark.apache.org/docs/latest/sql-data-sources-binaryFile.html + Returns a DataFrame with the parsed content and metadata. + """ + print( + f"Applying parsing & metadata extraction to {raw_files_df.count()} files using Spark - this may take a long time if you have many documents..." 
+    )
+
+    parser_udf = _get_parser_udf(parse_file_fn, parsed_df_schema)
+
+    # Run the parsing
+    parsed_files_staging_df = raw_files_df.withColumn(
+        "parsing", parser_udf("content", "modificationTime", "length", "path")
+    ).drop("content")
+
+    # Note: failed files are intentionally NOT filtered out here; they are surfaced
+    # by check_parsed_df_for_errors(...) below so you can review them.
+    parsed_files_df = parsed_files_staging_df  # .filter(
+    #     parsed_files_staging_df.parsing.parser_status == "SUCCESS"
+    # )
+
+    # Flatten the parsed struct into the columns of the resulting schema
+    resulting_fields = [field.name for field in parsed_df_schema.fields]
+
+    parsed_files_df = parsed_files_df.select(
+        *[func.col(f"parsing.{field}").alias(field) for field in resulting_fields]
+    )
+    return parsed_files_df
+
+
+def check_parsed_df_for_errors(parsed_files_df) -> tuple[bool, str, DataFrame]:
+    # Check for and report any parse errors
+    errors_df = parsed_files_df.filter(func.col("parser_status") != "SUCCESS")
+
+    num_errors = errors_df.count()
+    num_total = parsed_files_df.count()
+    if num_errors > 0:
+        msg = f"{num_errors} of {num_total} documents ({round(num_errors / num_total * 100, 2)}%) had parse errors. Please review."
+        return (True, msg, errors_df)
+    else:
+        msg = "All documents were parsed."
+        print(msg)
+        return (False, msg, None)
+
+
+def check_parsed_df_for_empty_parsed_files(parsed_files_df):
+    # Check for and report documents that parsed successfully but produced empty content
+    empty_docs_df = parsed_files_df.filter(
+        func.col("parser_status") == "SUCCESS"
+    ).filter(func.col("content") == "")
+
+    num_empty = empty_docs_df.count()
+    num_total = parsed_files_df.count()
+    if num_empty > 0:
+        msg = f"{num_empty} of {num_total} documents ({round(num_empty / num_total * 100, 2)}%) returned empty parsing results. Please review."
+        return (True, msg, empty_docs_df)
+    else:
+        msg = "All documents produced non-empty parsing results."
+        print(msg)
+        return (False, msg, None)
diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/recursive_character_text_splitter.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/recursive_character_text_splitter.py
new file mode 100644
index 0000000..d9f6ed8
--- /dev/null
+++ b/autogen_agent_app_sample_code/cookbook/data_pipeline/recursive_character_text_splitter.py
@@ -0,0 +1,255 @@
+from typing import Callable, Tuple, Optional
+from databricks.sdk import WorkspaceClient
+from pydantic import BaseModel
+
+# %md
+# ##### `get_recursive_character_text_splitter`
+
+# `get_recursive_character_text_splitter` returns a callable that chunks text documents, sized to the context window of the embedding model behind the given Model Serving endpoint. This utility lets you focus on the chunking strategy for your use case without dealing with the low-level details of text splitting. You can write your own chunker, or edit this code, if it does not fit your use case.
+
+# **Arguments:**
+
+# - `model_serving_endpoint`: The name of the Model Serving endpoint with the embedding model.
+# - `embedding_model_name`: The name of the embedding model, e.g., `gte-large-en-v1.5`. If set to `None` and `model_serving_endpoint` is an OpenAI External Model or FMAPI model, this is detected automatically.
+# - `chunk_size_tokens`: An optional size for each chunk in tokens. Defaults to `None`, which uses the model's entire context window.
+# - `chunk_overlap_tokens`: Tokens that should overlap between chunks. Defaults to `0`.
+
+# **Returns:** A callable that takes a document (`str`) and produces a list of chunks (`list[str]`).
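For orientation, here is a minimal sketch of how this factory is typically wired together with `apply_chunking_fn` from `chunk_docs.py`. It is an editor's illustration, not part of the patch: the endpoint name, chunk sizes, and the `parsed_docs_df` input are assumptions.

    from cookbook.data_pipeline.chunk_docs import apply_chunking_fn
    from cookbook.data_pipeline.recursive_character_text_splitter import (
        get_recursive_character_text_splitter,
    )

    # Hypothetical embedding endpoint and chunk sizing; adjust to your workspace.
    chunk_fn = get_recursive_character_text_splitter(
        model_serving_endpoint="databricks-gte-large-en",  # assumed endpoint name
        embedding_model_name="gte-large-en-v1.5",          # fallback if auto-detection fails
        chunk_size_tokens=1024,
        chunk_overlap_tokens=256,
    )

    # `parsed_docs_df` is assumed to be the output of apply_parsing_fn(...) above.
    chunked_docs_df = apply_chunking_fn(
        parsed_docs_df=parsed_docs_df,
        chunking_fn=chunk_fn,
        propagate_columns=["doc_uri"],
        doc_column="content",
    )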
+ +# Constants +HF_CACHE_DIR = "/tmp/hf_cache/" + +# Embedding Models Configuration +EMBEDDING_MODELS = { + "gte-large-en-v1.5": { + # "tokenizer": lambda: AutoTokenizer.from_pretrained( + # "Alibaba-NLP/gte-large-en-v1.5", cache_dir=HF_CACHE_DIR + # ), + "context_window": 8192, + "type": "SENTENCE_TRANSFORMER", + }, + "bge-large-en-v1.5": { + # "tokenizer": lambda: AutoTokenizer.from_pretrained( + # "BAAI/bge-large-en-v1.5", cache_dir=HF_CACHE_DIR + # ), + "context_window": 512, + "type": "SENTENCE_TRANSFORMER", + }, + "bge_large_en_v1_5": { + # "tokenizer": lambda: AutoTokenizer.from_pretrained( + # "BAAI/bge-large-en-v1.5", cache_dir=HF_CACHE_DIR + # ), + "context_window": 512, + "type": "SENTENCE_TRANSFORMER", + }, + "text-embedding-ada-002": { + "context_window": 8192, + # "tokenizer": lambda: tiktoken.encoding_for_model("text-embedding-ada-002"), + "type": "OPENAI", + }, + "text-embedding-3-small": { + "context_window": 8192, + # "tokenizer": lambda: tiktoken.encoding_for_model("text-embedding-3-small"), + "type": "OPENAI", + }, + "text-embedding-3-large": { + "context_window": 8192, + # "tokenizer": lambda: tiktoken.encoding_for_model("text-embedding-3-large"), + "type": "OPENAI", + }, +} + + +def get_workspace_client() -> WorkspaceClient: + """Returns a WorkspaceClient instance.""" + return WorkspaceClient() + + +# TODO: this is a cheap hack to avoid importing tokenizer libs at the top level - the datapipeline utils are imported by the agent notebook which won't have these libs loaded & we don't want to since autotokenizer is heavy weight. +def get_embedding_model_tokenizer(endpoint_type: str) -> Optional[dict]: + from transformers import AutoTokenizer + import tiktoken + + # copy here to prevent needing to install tokenizer libraries everywhere this is imported + EMBEDDING_MODELS_W_TOKENIZER = { + "gte-large-en-v1.5": { + "tokenizer": lambda: AutoTokenizer.from_pretrained( + "Alibaba-NLP/gte-large-en-v1.5", cache_dir=HF_CACHE_DIR + ), + "context_window": 8192, + "type": "SENTENCE_TRANSFORMER", + }, + "bge-large-en-v1.5": { + "tokenizer": lambda: AutoTokenizer.from_pretrained( + "BAAI/bge-large-en-v1.5", cache_dir=HF_CACHE_DIR + ), + "context_window": 512, + "type": "SENTENCE_TRANSFORMER", + }, + "bge_large_en_v1_5": { + "tokenizer": lambda: AutoTokenizer.from_pretrained( + "BAAI/bge-large-en-v1.5", cache_dir=HF_CACHE_DIR + ), + "context_window": 512, + "type": "SENTENCE_TRANSFORMER", + }, + "text-embedding-ada-002": { + "context_window": 8192, + "tokenizer": lambda: tiktoken.encoding_for_model("text-embedding-ada-002"), + "type": "OPENAI", + }, + "text-embedding-3-small": { + "context_window": 8192, + "tokenizer": lambda: tiktoken.encoding_for_model("text-embedding-3-small"), + "type": "OPENAI", + }, + "text-embedding-3-large": { + "context_window": 8192, + "tokenizer": lambda: tiktoken.encoding_for_model("text-embedding-3-large"), + "type": "OPENAI", + }, + } + return EMBEDDING_MODELS_W_TOKENIZER.get(endpoint_type).get("tokenizer") + + +def get_embedding_model_config(endpoint_type: str) -> Optional[dict]: + """ + Retrieve embedding model configuration by endpoint type. + """ + + return EMBEDDING_MODELS.get(endpoint_type) + + +def extract_endpoint_type(llm_endpoint) -> Optional[str]: + """ + Extract the endpoint type from the given llm_endpoint object. 
+ """ + try: + return llm_endpoint.config.served_entities[0].external_model.name + except AttributeError: + try: + return llm_endpoint.config.served_entities[0].foundation_model.name + except AttributeError: + return None + + +def detect_fmapi_embedding_model_type( + model_serving_endpoint: str, +) -> Tuple[Optional[str], Optional[dict]]: + """ + Detects the embedding model type and configuration for the given endpoint. + Returns a tuple of (endpoint_type, embedding_config) or (None, None) if not found. + """ + client = get_workspace_client() + + try: + llm_endpoint = client.serving_endpoints.get(name=model_serving_endpoint) + endpoint_type = extract_endpoint_type(llm_endpoint) + except Exception as e: + endpoint_type = None + + embedding_config = ( + get_embedding_model_config(endpoint_type) if endpoint_type else None + ) + + embedding_config["tokenizer"] = ( + get_embedding_model_tokenizer(endpoint_type) if endpoint_type else None + ) + + return (endpoint_type, embedding_config) + + +def validate_chunk_size(chunk_spec: dict): + """ + Validate the chunk size and overlap settings in chunk_spec. + Raises ValueError if any condition is violated. + """ + if ( + chunk_spec["chunk_overlap_tokens"] + chunk_spec["chunk_size_tokens"] + ) > chunk_spec["context_window"]: + msg = ( + f'Proposed chunk_size of {chunk_spec["chunk_size_tokens"]} + overlap of {chunk_spec["chunk_overlap_tokens"]} ' + f'is {chunk_spec["chunk_overlap_tokens"] + chunk_spec["chunk_size_tokens"]} which is greater than context ' + f'window of {chunk_spec["context_window"]} tokens.', + ) + return (False, msg) + elif chunk_spec["chunk_overlap_tokens"] > chunk_spec["chunk_size_tokens"]: + msg = ( + f'Proposed `chunk_overlap_tokens` of {chunk_spec["chunk_overlap_tokens"]} is greater than the ' + f'`chunk_size_tokens` of {chunk_spec["chunk_size_tokens"]}. Reduce the size of `chunk_size_tokens`.', + ) + return (False, msg) + else: + context_usage = ( + round( + (chunk_spec["chunk_size_tokens"] + chunk_spec["chunk_overlap_tokens"]) + / chunk_spec["context_window"], + 2, + ) + * 100 + ) + msg = f'Chunk size in tokens: {chunk_spec["chunk_size_tokens"]} and chunk overlap in tokens: {chunk_spec["chunk_overlap_tokens"]} are valid. Using {round(context_usage, 2)}% ({chunk_spec["chunk_size_tokens"] + chunk_spec["chunk_overlap_tokens"]} tokens) of the {chunk_spec["context_window"]} token context window.' 
+ return (True, msg) + + +def get_recursive_character_text_splitter( + model_serving_endpoint: str, + embedding_model_name: str = None, + chunk_size_tokens: int = None, + chunk_overlap_tokens: int = 0, +) -> Callable[[str], list[str]]: + # imports here to prevent needing to install everywhere + + from langchain_text_splitters import RecursiveCharacterTextSplitter + from transformers import AutoTokenizer + import tiktoken + + try: + # Detect the embedding model and its configuration + embedding_model_name, chunk_spec = detect_fmapi_embedding_model_type( + model_serving_endpoint + ) + + if chunk_spec is None or embedding_model_name is None: + # Fall back to using provided embedding_model_name + chunk_spec = EMBEDDING_MODELS.get(embedding_model_name) + if chunk_spec is None: + raise KeyError + + # Update chunk specification based on provided parameters + chunk_spec["chunk_size_tokens"] = ( + chunk_size_tokens or chunk_spec["context_window"] + ) + chunk_spec["chunk_overlap_tokens"] = chunk_overlap_tokens + + # Validate chunk size and overlap + is_valid, msg = validate_chunk_size(chunk_spec) + if not is_valid: + raise ValueError(msg) + else: + print(msg) + + except KeyError: + raise ValueError( + f"Embedding model `{embedding_model_name}` not found. Available models: {EMBEDDING_MODELS.keys()}" + ) + + def _recursive_character_text_splitter(text: str) -> list[str]: + tokenizer = chunk_spec["tokenizer"]() + if chunk_spec["type"] == "SENTENCE_TRANSFORMER": + splitter = RecursiveCharacterTextSplitter.from_huggingface_tokenizer( + tokenizer, + chunk_size=chunk_spec["chunk_size_tokens"], + chunk_overlap=chunk_spec["chunk_overlap_tokens"], + ) + elif chunk_spec["type"] == "OPENAI": + splitter = RecursiveCharacterTextSplitter.from_tiktoken_encoder( + tokenizer.name, + chunk_size=chunk_spec["chunk_size_tokens"], + chunk_overlap=chunk_spec["chunk_overlap_tokens"], + ) + else: + raise ValueError(f"Unsupported model type: {chunk_spec['type']}") + return splitter.split_text(text) + + return _recursive_character_text_splitter diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/utils/__init__.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/utils/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/data_pipeline/utils/typed_dicts_to_spark_schema.py b/autogen_agent_app_sample_code/cookbook/data_pipeline/utils/typed_dicts_to_spark_schema.py new file mode 100644 index 0000000..195c16e --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/data_pipeline/utils/typed_dicts_to_spark_schema.py @@ -0,0 +1,103 @@ +from pyspark.sql.types import ( + StructType, + StructField, + StringType, + IntegerType, + DoubleType, + BooleanType, + ArrayType, + TimestampType, + DateType, +) +from typing import TypedDict, get_type_hints, List +from datetime import datetime, date, time + + +def typed_dict_to_spark_fields(typed_dict: type[TypedDict]) -> StructType: + """ + Converts a TypedDict into a list of Spark StructField objects. + + This function maps Python types defined in a TypedDict to their corresponding + Spark SQL data types, facilitating the creation of a Spark DataFrame schema + from Python type annotations. + + Parameters: + - typed_dict (type[TypedDict]): The TypedDict class to be converted. + + Returns: + - StructType: A list of StructField objects representing the Spark schema. + + Raises: + - ValueError: If an unsupported type is encountered or if dictionary types are used. 
+ """ + + # Mapping of type names to Spark type objects + type_mapping = { + str: StringType(), + int: IntegerType(), + float: DoubleType(), + bool: BooleanType(), + list: ArrayType(StringType()), # Default to StringType for arrays + datetime: TimestampType(), + date: DateType(), + } + + def get_spark_type(value_type): + """ + Helper function to map a Python type to a Spark SQL data type. + + This function supports basic Python types, lists of a single type, and raises + an error for unsupported types or dictionaries. + + Parameters: + - value_type: The Python type to be converted. + + Returns: + - DataType: The corresponding Spark SQL data type. + + Raises: + - ValueError: If the type is unsupported or if dictionary types are used. + """ + if value_type in type_mapping: + return type_mapping[value_type] + elif hasattr(value_type, "__origin__") and value_type.__origin__ == list: + # Handle List[type] types + return ArrayType(get_spark_type(value_type.__args__[0])) + elif hasattr(value_type, "__origin__") and value_type.__origin__ == dict: + # Handle Dict[type, type] types (not fully supported) + raise ValueError("Dict types are not fully supported") + else: + raise ValueError(f"Unsupported type: {value_type}") + + # Get the type hints for the TypedDict + type_hints = get_type_hints(typed_dict) + + # Convert the type hints into a list of StructField objects + fields = [ + StructField(key, get_spark_type(value), True) + for key, value in type_hints.items() + ] + + # Create and return the StructType object + return fields + + +def typed_dicts_to_spark_schema(*typed_dicts: type[TypedDict]) -> StructType: + """ + Converts multiple TypedDicts into a Spark schema. + + This function allows for the combination of multiple TypedDicts into a single + Spark DataFrame schema, enabling the creation of complex data structures. + + Parameters: + - *typed_dicts: Variable number of TypedDict classes to be converted. + + Returns: + - StructType: A Spark schema represented as a StructType object, which is a collection + of StructField objects derived from the provided TypedDicts. + """ + fields = [] + for typed_dict in typed_dicts: + fields.extend(typed_dict_to_spark_fields(typed_dict)) + + return StructType(fields) diff --git a/autogen_agent_app_sample_code/cookbook/databricks_utils/__init__.py b/autogen_agent_app_sample_code/cookbook/databricks_utils/__init__.py new file mode 100644 index 0000000..94fd8fb --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/databricks_utils/__init__.py @@ -0,0 +1,225 @@ +# Helper functions for displaying Delta Table and Volume URLs + +from typing import Optional +import json +import subprocess + +from databricks.sdk import WorkspaceClient +from mlflow.utils import databricks_utils as du + + +def get_databricks_cli_config() -> dict: + """Retrieve the Databricks CLI configuration by running 'databricks auth describe' command. 
+ + Returns: + dict: The parsed JSON configuration from the Databricks CLI, or None if an error occurs + + Note: + Requires the Databricks CLI to be installed and configured + """ + try: + # Run databricks auth describe command and capture output + process = subprocess.run( + ["databricks", "auth", "describe", "-o", "json"], + capture_output=True, + text=True, + check=True, # Raises CalledProcessError if command fails + ) + + # Parse JSON output + return json.loads(process.stdout) + except subprocess.CalledProcessError as e: + print(f"Error running databricks CLI command: {e}") + return None + except json.JSONDecodeError as e: + print(f"Error parsing databricks CLI JSON output: {e}") + return None + except Exception as e: + print(f"Unexpected error getting databricks config from CLI: {e}") + return None + + +def get_workspace_hostname() -> str: + """Get the Databricks workspace hostname. + + Returns: + str: The full workspace hostname (e.g., 'https://my-workspace.cloud.databricks.com') + + Raises: + RuntimeError: If not in a Databricks notebook and unable to get workspace hostname from CLI config + """ + if du.is_in_databricks_notebook(): + return "https://" + du.get_browser_hostname() + else: + cli_config = get_databricks_cli_config() + if cli_config is None: + raise RuntimeError("Could not get Databricks CLI config") + try: + return cli_config["details"]["host"] + except KeyError: + raise RuntimeError( + "Could not find workspace hostname in Databricks CLI config" + ) + + +def get_table_url(table_fqdn: str) -> str: + """Generate the URL for a Unity Catalog table in the Databricks UI. + + Args: + table_fqdn: Fully qualified table name in format 'catalog.schema.table'. + Can optionally include backticks around identifiers. + + Returns: + str: The full URL to view the table in the Databricks UI. + + Example: + >>> get_table_url("main.default.my_table") + 'https://my-workspace.cloud.databricks.com/explore/data/main/default/my_table' + """ + table_fqdn = table_fqdn.replace("`", "") + catalog, schema, table = table_fqdn.split(".") + browser_url = get_workspace_hostname() + url = f"{browser_url}/explore/data/{catalog}/{schema}/{table}" + return url + + +def get_volume_url(volume_fqdn: str) -> str: + """Generate the URL for a Unity Catalog volume in the Databricks UI. + + Args: + volume_fqdn: Fully qualified volume name in format 'catalog.schema.volume'. + Can optionally include backticks around identifiers. + + Returns: + str: The full URL to view the volume in the Databricks UI. + + Example: + >>> get_volume_url("main.default.my_volume") + 'https://my-workspace.cloud.databricks.com/explore/data/volumes/main/default/my_volume' + """ + volume_fqdn = volume_fqdn.replace("`", "") + catalog, schema, volume = volume_fqdn.split(".") + browser_url = get_workspace_hostname() + url = f"{browser_url}/explore/data/volumes/{catalog}/{schema}/{volume}" + return url + + +def get_mlflow_experiment_url(experiment_id: str) -> str: + """Generate the URL for an MLflow experiment in the Databricks UI. + + Args: + experiment_id: The ID of the MLflow experiment + + Returns: + str: The full URL to view the MLflow experiment in the Databricks UI. 
+ + Example: + >>> get_mlflow_experiment_url("") + 'https://my-workspace.cloud.databricks.com/ml/experiments/' + """ + browser_url = get_workspace_hostname() + url = f"{browser_url}/ml/experiments/{experiment_id}" + return url + + +def get_mlflow_experiment_traces_url(experiment_id: str) -> str: + """Generate the URL for the MLflow experiment traces in the Databricks UI.""" + return get_mlflow_experiment_url(experiment_id) + "?compareRunsMode=TRACES" + + +def get_function_url(function_fqdn: str) -> str: + """Generate the URL for a Unity Catalog function in the Databricks UI. + + Args: + function_fqdn: Fully qualified function name in format 'catalog.schema.function'. + Can optionally include backticks around identifiers. + + Returns: + str: The full URL to view the function in the Databricks UI. + + Example: + >>> get_function_url("main.default.my_function") + 'https://my-workspace.cloud.databricks.com/explore/data/functions/main/default/my_function' + """ + function_fqdn = function_fqdn.replace("`", "") + catalog, schema, function = function_fqdn.split(".") + browser_url = get_workspace_hostname() + url = f"{browser_url}/explore/data/functions/{catalog}/{schema}/{function}" + return url + + +def get_cluster_url(cluster_id: str) -> str: + """Generate the URL for a Databricks cluster in the Databricks UI. + + Args: + cluster_id: The ID of the cluster + + Returns: + str: The full URL to view the cluster in the Databricks UI. + + Example: + >>> get_cluster_url("") + 'https://my-workspace.cloud.databricks.com/compute/clusters/' + """ + browser_url = get_workspace_hostname() + url = f"{browser_url}/compute/clusters/{cluster_id}" + return url + + +def get_active_cluster_id_from_databricks_auth() -> Optional[str]: + """Get the active cluster ID from the Databricks CLI authentication configuration. + + Returns: + Optional[str]: The active cluster ID if found, None if not found or if an error occurs + + Note: + This function relies on the Databricks CLI configuration having a cluster_id set + """ + if du.is_in_databricks_notebook(): + raise ValueError( + "Cannot get active cluster ID from the Databricks CLI in a Databricks notebook" + ) + try: + # Get config from the databricks cli + auth_output = get_databricks_cli_config() + + # Safely navigate nested dict + details = auth_output.get("details", {}) + config = details.get("configuration", {}) + cluster = config.get("cluster_id", {}) + cluster_id = cluster.get("value") + + if cluster_id is None: + raise ValueError("Could not find cluster_id in Databricks auth config") + + return cluster_id + + except Exception as e: + print(f"Unexpected error: {e}") + return None + + +def get_active_cluster_id() -> Optional[str]: + """Get the active cluster ID. 
+ + Returns: + Optional[str]: The active cluster ID if found, None if not found or if an error occurs + """ + if du.is_in_databricks_notebook(): + return du.get_active_cluster_id() + else: + return get_active_cluster_id_from_databricks_auth() + + +def get_current_user_info(spark) -> tuple[str, str, str]: + # Get current user's name & email + w = WorkspaceClient() + user_email = w.current_user.me().user_name + user_name = user_email.split("@")[0].replace(".", "_") + + # Get the workspace default UC catalog + default_catalog = spark.sql("select current_catalog() as cur_catalog").collect()[0][ + "cur_catalog" + ] + + return user_email, user_name, default_catalog diff --git a/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_evaluation/__init__.py b/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_evaluation/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_evaluation/evaluation_set.py b/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_evaluation/evaluation_set.py new file mode 100644 index 0000000..6cd2e84 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_evaluation/evaluation_set.py @@ -0,0 +1,236 @@ +from typing import List, Mapping, Optional + +import mlflow.entities as mlflow_entities + +from pyspark import sql +from pyspark.sql import functions as F, types as T +from pyspark.sql.window import Window + +from databricks.rag_eval.evaluation import traces + +# Deduplicate the assessment log + +# By default, the assessment log contains one row for every action/click the user does in the Review App. This code translates these logs into a single row for each request. + +_REQUEST_ID = "request_id" +_TIMESTAMP = "timestamp" +_ROW_NUMBER = "row_number" +_SOURCE = "source" +_SOURCE_ID = "source.id" +_STEP_ID = "step_id" +_TEXT_ASSESSMENT = "text_assessment" +_RETRIEVAL_ASSESSMENT = "retrieval_assessment" + + +def _dedup_by_assessment_window( + assessment_log_df: sql.DataFrame, window: Window +) -> sql.DataFrame: + """ + Dedup the assessment logs by taking the first row from each group, defined by the window + :param assessment_log_df: Pyspark DataFrame of the assessment logs + :param window: Pyspark window to group assessments by + :return: Pyspark DataFrame of the deduped assessment logs + """ + return ( + assessment_log_df.withColumn(_ROW_NUMBER, F.row_number().over(window)) + .filter(F.col(_ROW_NUMBER) == 1) + .drop(_ROW_NUMBER) + ) + + +def _dedup_assessment_log(assessment_log_df: sql.DataFrame) -> sql.DataFrame: + """ + Dedup the assessment logs to get the latest assessments. 
+ :param assessment_log_df: Pyspark DataFrame of the assessment logs + :return: Pyspark DataFrame of the deduped assessment logs + """ + # Dedup the text assessments + text_assessment_window = Window.partitionBy(_REQUEST_ID, _SOURCE_ID).orderBy( + F.col(_TIMESTAMP).desc() + ) + deduped_text_assessment_df = _dedup_by_assessment_window( + # Filter rows with null text assessments + assessment_log_df.filter(F.col(_TEXT_ASSESSMENT).isNotNull()), + text_assessment_window, + ) + + # Dedup the retrieval assessments + retrieval_assessment_window = Window.partitionBy( + _REQUEST_ID, + _SOURCE_ID, + f"{_RETRIEVAL_ASSESSMENT}.position", + f"{_RETRIEVAL_ASSESSMENT}.{_STEP_ID}", + ).orderBy(F.col(_TIMESTAMP).desc()) + deduped_retrieval_assessment_df = _dedup_by_assessment_window( + # Filter rows with null retrieval assessments + assessment_log_df.filter(F.col(_RETRIEVAL_ASSESSMENT).isNotNull()), + retrieval_assessment_window, + ) + + # Collect retrieval assessments from the same request/step/source into a single list + nested_retrieval_assessment_df = ( + deduped_retrieval_assessment_df.groupBy(_REQUEST_ID, _SOURCE_ID, _STEP_ID).agg( + F.any_value(_TIMESTAMP).alias(_TIMESTAMP), + F.any_value(_SOURCE).alias(_SOURCE), + F.collect_list(_RETRIEVAL_ASSESSMENT).alias("retrieval_assessments"), + ) + # Drop the old retrieval assessment, source id, and text assessment columns + .drop(_RETRIEVAL_ASSESSMENT, "id", _TEXT_ASSESSMENT) + ) + + # Join the deduped text assessments with the nested deduped retrieval assessments + deduped_assessment_log_df = deduped_text_assessment_df.alias("a").join( + nested_retrieval_assessment_df.alias("b"), + (F.col(f"a.{_REQUEST_ID}") == F.col(f"b.{_REQUEST_ID}")) + & (F.col(f"a.{_SOURCE_ID}") == F.col(f"b.{_SOURCE_ID}")), + "full_outer", + ) + + # Coalesce columns from both dataframes in case a request does not have either assessment + return deduped_assessment_log_df.select( + F.coalesce(F.col(f"a.{_REQUEST_ID}"), F.col(f"b.{_REQUEST_ID}")).alias( + _REQUEST_ID + ), + F.coalesce(F.col(f"a.{_STEP_ID}"), F.col(f"b.{_STEP_ID}")).alias(_STEP_ID), + F.coalesce(F.col(f"a.{_TIMESTAMP}"), F.col(f"b.{_TIMESTAMP}")).alias( + _TIMESTAMP + ), + F.coalesce(F.col(f"a.{_SOURCE}"), F.col(f"b.{_SOURCE}")).alias(_SOURCE), + F.col(f"a.{_TEXT_ASSESSMENT}").alias(_TEXT_ASSESSMENT), + F.col("b.retrieval_assessments").alias(_RETRIEVAL_ASSESSMENT), + # F.col("schema_version") + ) + + ## Attach ground truth + + +def attach_ground_truth(request_log_df, deduped_assessment_log_df): + suggested_output_col = F.col(f"{_TEXT_ASSESSMENT}.suggested_output") + is_correct_col = F.col(f"{_TEXT_ASSESSMENT}.ratings.answer_correct.value") + # Extract out the thumbs up/down rating and the suggested output + rating_log_df = ( + deduped_assessment_log_df.withColumn("is_correct", is_correct_col) + .withColumn( + "suggested_output", + F.when(suggested_output_col == "", None).otherwise(suggested_output_col), + ) + .withColumn("source_user", F.col("source.id")) + .select( + "request_id", + "is_correct", + "suggested_output", + "source_user", + _RETRIEVAL_ASSESSMENT, + ) + ) + # Join the request log with the ratings from above + raw_requests_with_feedback_df = request_log_df.join( + rating_log_df, + request_log_df.databricks_request_id == rating_log_df.request_id, + "left", + ) + + raw_requests_with_feedback_df = raw_requests_with_feedback_df.drop("request_id") + return raw_requests_with_feedback_df + +_EXPECTED_RETRIEVAL_CONTEXT_SCHEMA = T.ArrayType( + T.StructType( + [ + T.StructField("doc_uri", T.StringType()), + 
T.StructField("content", T.StringType()), + ] + ) +) + + +def extract_retrieved_chunks_from_trace(trace_str: str) -> List[Mapping[str, str]]: + """Helper to extract the retrieved chunks from a trace string""" + trace = mlflow_entities.Trace.from_json(trace_str) + chunks = traces.extract_retrieval_context_from_trace(trace) + return [{"doc_uri": chunk.doc_uri, "content": chunk.content} for chunk in chunks] + + +@F.udf(_EXPECTED_RETRIEVAL_CONTEXT_SCHEMA) +def construct_expected_retrieval_context( + trace_str: Optional[str], chunk_at_i_relevance: Optional[List[str]] +) -> Optional[List[Mapping[str, str]]]: + """Helper to construct the expected retrieval context. Any retrieved chunks that are not relevant are dropped.""" + if chunk_at_i_relevance is None or trace_str is None: + return None + retrieved_chunks = extract_retrieved_chunks_from_trace(trace_str) + expected_retrieval_context = [ + chunk + for chunk, rating in zip(retrieved_chunks, chunk_at_i_relevance) + if rating == "true" + ] + return expected_retrieval_context if len(expected_retrieval_context) else None + + +# ================================= + + +def identify_potential_eval_set_records(raw_requests_with_feedback_df): + # For thumbs up, use either the suggested output or the response, in that order + positive_feedback_df = ( + raw_requests_with_feedback_df.where(F.col("is_correct") == F.lit("positive")) + .withColumn( + "expected_response", + F.when( + F.col("suggested_output") != None, F.col("suggested_output") + ).otherwise(F.col("response")), + ) + .withColumn("source_tag", F.lit("thumbs_up")) + ) + + # For thumbs down, use the suggested output if there is one + negative_feedback_df = ( + raw_requests_with_feedback_df.where(F.col("is_correct") == F.lit("negative")) + .withColumn("expected_response", F.col("suggested_output")) + .withColumn("source_tag", F.lit("thumbs_down_edited")) + ) + + # For no feedback or IDK, there is no expected response. 
+ no_or_unknown_feedback_df = ( + raw_requests_with_feedback_df.where( + (F.col("is_correct").isNull()) + | ( + (F.col("is_correct") != F.lit("negative")) + & (F.col("is_correct") != F.lit("positive")) + ) + ) + .withColumn("expected_response", F.lit(None)) + .withColumn("source_tag", F.lit("no_feedback_provided")) + ) + # Join the above feedback tables and select the relevant columns for the eval harness + requests_with_feedback_df = positive_feedback_df.unionByName( + negative_feedback_df + ).unionByName(no_or_unknown_feedback_df) + # Get the thumbs up/down for each retrieved chunk + requests_with_feedback_df = requests_with_feedback_df.withColumn( + "chunk_at_i_relevance", + F.transform( + F.col(_RETRIEVAL_ASSESSMENT), lambda x: x.ratings.answer_correct.value + ), + ).drop(_RETRIEVAL_ASSESSMENT) + + requests_with_feedback_df = requests_with_feedback_df.withColumnRenamed( + "databricks_request_id", "request_id" + ) + + # Add the expected retrieved context column + requests_with_feedback_df = requests_with_feedback_df.withColumn( + "expected_retrieved_context", + construct_expected_retrieval_context( + F.col("trace"), F.col("chunk_at_i_relevance") + ), + ) + return requests_with_feedback_df + +def create_potential_evaluation_set(request_log_df, assessment_log_df): + raw_requests_with_feedback_df = attach_ground_truth( + request_log_df, assessment_log_df + ) + requests_with_feedback_df = identify_potential_eval_set_records( + raw_requests_with_feedback_df + ) + return requests_with_feedback_df \ No newline at end of file diff --git a/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_framework/__init__.py b/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_framework/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_framework/get_inference_tables.py b/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_framework/get_inference_tables.py new file mode 100644 index 0000000..1d1183c --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/databricks_utils/agent_framework/get_inference_tables.py @@ -0,0 +1,35 @@ +from databricks.sdk import WorkspaceClient +from databricks import agents + +def get_inference_tables(uc_model_fqn): + w = WorkspaceClient() + + deployment = agents.get_deployments(uc_model_fqn) + if len(deployment) == 0: + raise ValueError(f"No deployments found for model {uc_model_fqn}") + endpoint = w.serving_endpoints.get(deployment[0].endpoint_name) + + + try: + endpoint_config = endpoint.config.auto_capture_config + except AttributeError as e: + endpoint_config = endpoint.pending_config.auto_capture_config + + inference_table_name = endpoint_config.state.payload_table.name + inference_table_catalog = endpoint_config.catalog_name + inference_table_schema = endpoint_config.schema_name + + # Cleanly formatted tables + assessment_log_table_name = f"{inference_table_name}_assessment_logs" + request_log_table_name = f"{inference_table_name}_request_logs" + + return { + 'uc_catalog_name': inference_table_catalog, + 'uc_schema_name': inference_table_schema, + 'table_names': { + 'raw_payload_logs': inference_table_name, + 'assessment_logs': assessment_log_table_name, + 'request_logs': request_log_table_name, + } + + } diff --git a/autogen_agent_app_sample_code/cookbook/databricks_utils/install_cluster_library.py b/autogen_agent_app_sample_code/cookbook/databricks_utils/install_cluster_library.py new file mode 100644 index 0000000..e7a0074 --- /dev/null +++ 
b/autogen_agent_app_sample_code/cookbook/databricks_utils/install_cluster_library.py @@ -0,0 +1,107 @@ +from typing import List + +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.compute import ( + Library, + LibraryFullStatus, + LibraryInstallStatus, + PythonPyPiLibrary, +) +import time + + +def parse_requirements(requirements_path: str) -> List[str]: + """Parse requirements.txt file and return list of package specifications.""" + packages = [] + with open(requirements_path) as f: + for line in f: + line = line.strip() + if line and not line.startswith("#"): + packages.append(line) + return packages + + +def wait_for_library_installation( + w: WorkspaceClient, cluster_id: str, timeout_minutes: int = 20 +): + """Wait for all libraries to be installed or fail.""" + start_time = time.time() + timeout_seconds = timeout_minutes * 60 + final_states = { + LibraryInstallStatus.INSTALLED, + LibraryInstallStatus.FAILED, + LibraryInstallStatus.SKIPPED, + } + + while True: + if time.time() - start_time > timeout_seconds: + print( + f"Timeout after {timeout_minutes} minutes waiting for library installation" + ) + break + + status: List[LibraryFullStatus] = w.libraries.cluster_status(cluster_id) + all_finished = True + + for lib in status: + if lib.status not in final_states: + all_finished = False + break + + if all_finished: + break + + print("Installation in progress, waiting 15 seconds...") + time.sleep(15) # Check every 15 seconds + + # Print final status + status = w.libraries.cluster_status(cluster_id) + for lib in status: + if lib.library.pypi: + status_msg = ( + f"Package: {lib.library.pypi.package} - Status: {lib.status.value}" + ) + if lib.messages: + status_msg += f" - Messages: {', '.join(lib.messages)}" + print(status_msg) + + +def install_requirements(cluster_id: str, requirements_path: str): + """Install all packages from requirements.txt into specified cluster.""" + # Initialize workspace client + w = WorkspaceClient() + + # Parse requirements file + packages = parse_requirements(requirements_path) + + # Get current library status + current_status = w.libraries.cluster_status(cluster_id) + existing_packages = { + lib.library.pypi.package: lib.status.value + for lib in current_status + if lib.library.pypi + } + + # Filter out already installed packages + libraries = [] + for package in packages: + if ( + package not in existing_packages + or existing_packages[package] != LibraryInstallStatus.INSTALLED.value + ): + libraries.append(Library(pypi=PythonPyPiLibrary(package=package))) + else: + print(f"Package {package} is already installed, skipping...") + + if not libraries: + print("All packages are already installed.") + return + + # Install libraries + package_names = [lib.pypi.package for lib in libraries] + print(f"Installing {len(libraries)} packages: {', '.join(package_names)}") + w.libraries.install(cluster_id, libraries=libraries) + + # Wait for installation to complete + print("Waiting for installation to complete...") + wait_for_library_installation(w, cluster_id) diff --git a/autogen_agent_app_sample_code/cookbook/tools/__init__.py b/autogen_agent_app_sample_code/cookbook/tools/__init__.py new file mode 100644 index 0000000..6fc89bd --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/tools/__init__.py @@ -0,0 +1,45 @@ +from cookbook.config import SerializableConfig +from mlflow.models.resources import DatabricksResource + + +from typing import Any, List + + +class Tool(SerializableConfig): + """Base class for all tools""" + + def __call__(self, 
**kwargs) -> Any: + """Execute the tool with validated inputs""" + raise NotImplementedError( + "__call__ must be implemented by Tool subclasses. This method should execute " + "the tool's functionality with the provided validated inputs and return the result." + ) + + name: str + description: str + + def get_json_schema(self) -> dict: + """Returns an OpenAPI-compatible JSON schema for the tool.""" + return { + "type": "function", + "function": { + "name": self.name, + "description": self.description, + "parameters": self._get_parameters_schema(), + }, + } + + def _get_parameters_schema(self) -> dict: + """Returns the JSON schema for the tool's parameters.""" + raise NotImplementedError( + "_get_parameters_schema must be implemented by Tool subclasses. This method should " + "return an OpenAPI-compatible JSON schema dict describing the tool's input parameters. " + "The schema should include parameter names, types, descriptions, and any validation rules." + ) + + def get_resource_dependencies(self) -> List[DatabricksResource]: + """Returns a list of Databricks resources (mlflow.models.resources.* objects) that the tool uses. Used to securely provision credentials for these resources when the tool is deployed to Model Serving.""" + raise NotImplementedError( + "get_resource_dependencies must be implemented by Tool subclasses. This method should " + "return a list of mlflow.models.resources.* objects that the tool depends on." + ) diff --git a/autogen_agent_app_sample_code/cookbook/tools/local_function.py b/autogen_agent_app_sample_code/cookbook/tools/local_function.py new file mode 100644 index 0000000..afbc719 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/tools/local_function.py @@ -0,0 +1,165 @@ +from cookbook.tools import Tool + +from mlflow.models.resources import DatabricksResource +from pydantic import BaseModel, Field, create_model +from unitycatalog.ai.core.utils.docstring_utils import parse_docstring +from typing import Optional + +import inspect +from typing import Any, Callable, List, Type, get_type_hints +import importlib +import mlflow + + +class LocalFunctionTool(Tool): + """Tool implementation that wraps a function""" + + # func: Callable + func_path: str + name: str + description: str + _input_schema: Type[BaseModel] + + def _process_function( + self, func: Callable, name: Optional[str], description: Optional[str] + ) -> tuple[str, str, Type[BaseModel]]: + """Process a function to extract name, description and input schema. + + Args: + func: The function to process + name: Optional override for the function name + description: Optional override for the function description + + Returns: + Tuple of (processed_name, processed_description, processed_input_schema) + """ + processed_name = name or func.__name__ + + # Validate function has type annotations + if not all(get_type_hints(func).values()): + raise ValueError( + f"Tool '{processed_name}' must have complete type annotations for all parameters " + "and return value." + ) + + # Parse the docstring and get description + docstring = inspect.getdoc(func) + if not docstring: + raise ValueError( + f"Tool '{processed_name}' must have a docstring with Google-style formatting." + ) + + doc_info = parse_docstring(docstring) + processed_description = description or doc_info.description + + # Ensure we have parameter documentation + if not doc_info.params: + raise ValueError( + f"Tool '{processed_name}' must have documented parameters in Google-style format. 
" + "Example:\n Args:\n param_name: description" + ) + + # Validate all parameters are documented + sig_params = set(inspect.signature(func).parameters.keys()) + doc_params = set(doc_info.params.keys()) + if sig_params != doc_params: + missing = sig_params - doc_params + extra = doc_params - sig_params + raise ValueError( + f"Tool '{processed_name}' parameter documentation mismatch. " + f"Missing docs for: {missing if missing else 'none'}. " + f"Extra docs for: {extra if extra else 'none'}." + ) + + # Create the input schema + processed_input_schema = self._create_schema_from_function( + func, doc_info.params + ) + + return processed_name, processed_description, processed_input_schema + + def __init__( + self, + name: Optional[str] = None, + description: Optional[str] = None, + *, + func: Optional[Callable] = None, + func_path: Optional[str] = None, + ): + if func is not None and func_path is not None: + raise ValueError("Only one of func or func_path can be provided") + + if func is not None: + # Process the function to get name, description and input schema + processed_name, processed_description, processed_input_schema = ( + self._process_function(func, name, description) + ) + + # Serialize the function's location + func_path = f"{func.__module__}.{func.__name__}" + + # Now call parent class constructor with processed values + super().__init__( + func_path=func_path, + name=processed_name, + description=processed_description, + ) + + self._input_schema = processed_input_schema + + self._loaded_callable = None + self.load_func() + elif func_path is not None: + + super().__init__( + func_path=func_path, + name=name, + description=description, + # _input_schema=None, + ) + + self._loaded_callable = None + self.load_func() + + _, _, processed_input_schema = self._process_function( + self._loaded_callable, name, description + ) + + self._input_schema = processed_input_schema + + @staticmethod + def _create_schema_from_function( + func: Callable, param_descriptions: dict[str, str] + ) -> Type[BaseModel]: + """Creates a Pydantic model from function signature and parsed docstring""" + sig = inspect.signature(func) + type_hints = get_type_hints(func) + + fields = {} + for name, param in sig.parameters.items(): + fields[name] = ( + type_hints.get(name, Any), + Field(description=param_descriptions.get(name, f"Parameter: {name}")), + ) + + return create_model(f"{func.__name__.title()}Inputs", **fields) + + def load_func(self): + if self._loaded_callable is None: + module_name, func_name = self.func_path.rsplit(".", 1) + module = importlib.import_module(module_name) + self._loaded_callable = getattr(module, func_name) + + @mlflow.trace(span_type="TOOL", name="local_function") + def __call__(self, **kwargs) -> Any: + """Execute the tool's function with validated inputs""" + self.load_func() + validated_inputs = self._input_schema(**kwargs) + return self._loaded_callable(**validated_inputs.model_dump()) + + def _get_parameters_schema(self) -> dict: + """Returns the JSON schema for the tool's parameters.""" + return self._input_schema.model_json_schema() + + def get_resource_dependencies(self) -> List[DatabricksResource]: + return [] diff --git a/autogen_agent_app_sample_code/cookbook/tools/uc_tool.py b/autogen_agent_app_sample_code/cookbook/tools/uc_tool.py new file mode 100644 index 0000000..c70b290 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/tools/uc_tool.py @@ -0,0 +1,172 @@ +from cookbook.tools import Tool +from cookbook.databricks_utils import get_function_url + + +from 
cookbook.tools.uc_tool_utils import ( + _parse_SparkException_from_tool_execution, + _parse_ParseException_from_tool_execution, +) +import mlflow +from databricks.sdk import WorkspaceClient +from databricks.sdk.errors import ResourceDoesNotExist +from mlflow.models.resources import DatabricksFunction, DatabricksResource +from pydantic import Field, model_validator +from pyspark.errors import SparkRuntimeException +from pyspark.errors.exceptions.connect import ParseException +from unitycatalog.ai.core.databricks import DatabricksFunctionClient +from unitycatalog.ai.autogen.toolkit import UCFunctionToolkit +from dataclasses import asdict + +import json +from typing import Any, Dict, List, Union + +ERROR_INSTRUCTIONS_KEY = "error_instructions" +ERROR_STATUS_KEY = "error" + + +class UCTool(Tool): + """Configuration for a Unity Catalog function tool. + + This class defines the configuration for a Unity Catalog function that can be used + as a tool in an agent system. + + Args: + uc_function_name: Unity Catalog location of the function in format: catalog.schema.function_name. + Example: my_catalog.my_schema.my_function + + Returns: + UCTool: A configured Unity Catalog function tool object. + """ + + uc_function_name: str + """Unity Catalog location of the function in format: catalog.schema.function_name.""" + + error_prompt: str = ( + f"""The tool call generated an Exception, detailed in `{ERROR_STATUS_KEY}`. Think step-by-step following these instructions to determine your next step.\n""" + "[1] Is the error due to a problem with the input parameters?\n" + "[2] Could it succeed if retried with exactly the same inputs?\n" + "[3] Could it succeed if retried with modified parameters using the input we already have from the user?\n" + "[4] Could it succeed if retried with modified parameters informed by collecting additional input from the user? What specific input would we need from the user?\n" + """Based on your thinking, if the error is due to a problem with the input parameters, either call this tool again in a way that avoids this exception or collect additional information from the user to modify the inputs to avoid this exception.""" + ) + + # Optional b/c we set these automatically in model_post_init from the UC function itself. + # Suggest not overriding these, but rather updating the UC function's metadata directly. + name: str = Field(default=None) # Make it optional in the constructor + description: str = Field(default=None) # Make it optional in the constructor + + @model_validator(mode="after") + def validate_uc_function_name(self) -> "UCTool": + """Validates that the UC function exists and is accessible. + + Checks that the function name is properly formatted and exists in Unity Catalog + with proper permissions. + + Returns: + UCTool: The validated tool instance. + + Raises: + ValueError: If function name is invalid or function is not accessible. + """ + parts = self.uc_function_name.split(".") + if len(parts) != 3: + raise ValueError( + f"uc_function_name must be in format: catalog.schema.function_name; got `{self.uc_function_name}`" + ) + + # Validate that the function exists in Unity Catalog & user has EXECUTE permission on the function + # Docs: https://databricks-sdk-py.readthedocs.io/en/stable/workspace/catalog/functions.html#get + w = WorkspaceClient() + try: + w.functions.get(name=self.uc_function_name) + except ResourceDoesNotExist: + raise ValueError( + f"Function `{self.uc_function_name}` not found in Unity Catalog or you do not have permission to access it. 
Ensure the function exists, and you have EXECUTE permission on the function, USE CATALOG and USE SCHEMA permissions on the catalog and schema. If function exists, you can verify permissions here: {get_function_url(self.uc_function_name)}." + ) + + return self + + def model_post_init(self, __context: Any) -> None: + + # Initialize the UC clients + self._uc_client = DatabricksFunctionClient() + self._toolkit = UCFunctionToolkit( + function_names=[self.uc_function_name], client=self._uc_client + ) + + # OK to use [0] position b/c we know that there is only one function initialized in the toolkit. + self.name = self._toolkit.tools[0].tool["function"]["name"] + self.description = self._toolkit.tools[0].tool["function"]["description"] + + def _get_parameters_schema(self) -> dict: + """Gets the parameter schema for the UC function. + + Returns: + dict: JSON schema describing the function's parameters. + """ + # OK to use [0] position b/c we know that there is only one function initialized in the toolkit. + return self._toolkit.tools[0].tool["function"]["parameters"] + + @mlflow.trace(span_type="TOOL", name="uc_tool") + def __call__(self, **kwargs) -> Dict[str, str]: + # annotate the span with the tool name + span = mlflow.get_current_active_span() + if span: # TODO: Hack, when mlflow tracing is disabled, span == None. + span.set_attributes({"uc_tool_name": self.uc_function_name}) + + # trace the function call + traced_exec_function = mlflow.trace( + span_type="FUNCTION", name="_uc_client.execute_function" + )(self._uc_client.execute_function) + + # convert input args to json + args_json = json.loads(json.dumps(kwargs, default=str)) + + # TODO: Add in Ben's code parser + + # Try to execute the function & return its value as a dict + try: + result = traced_exec_function( + function_name=self.uc_function_name, parameters=args_json + ) + return asdict(result) + + # Parse the error into a format that's easier for the LLM to understand w/ out any of the Spark runtime error noise + except SparkRuntimeException as tool_exception: + return { + ERROR_STATUS_KEY: _parse_SparkException_from_tool_execution( + tool_exception + ), + ERROR_INSTRUCTIONS_KEY: self.error_prompt, + } + except ParseException as tool_exception: + return { + ERROR_STATUS_KEY: _parse_ParseException_from_tool_execution( + tool_exception + ), + ERROR_INSTRUCTIONS_KEY: self.error_prompt, + } + except Exception as tool_exception: + # some other type of error that is unknown, parse into the same format as the Spark exceptions + # will first try to parse using the SparkException parsing code, if that fails, will then try the generic one + return { + ERROR_STATUS_KEY: _parse_SparkException_from_tool_execution( + tool_exception + ), + ERROR_INSTRUCTIONS_KEY: self.error_prompt, + } + + def model_dump(self, **kwargs) -> Dict[str, Any]: + """Override model_dump to exclude name and description fields. + + Returns: + Dict[str, Any]: Dictionary representation of the model excluding name and description. 
+ """ + kwargs["exclude"] = {"name", "description"}.union(kwargs.get("exclude", set())) + return super().model_dump(**kwargs) + + def get_resource_dependencies(self) -> List[DatabricksResource]: + return [DatabricksFunction(function_name=self.uc_function_name)] + + def _remove_udfbody_from_stack_trace(self, stack_trace: str) -> str: + return stack_trace.replace('File "",', "").strip() diff --git a/autogen_agent_app_sample_code/cookbook/tools/uc_tool_utils.py b/autogen_agent_app_sample_code/cookbook/tools/uc_tool_utils.py new file mode 100644 index 0000000..c4f7825 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/tools/uc_tool_utils.py @@ -0,0 +1,132 @@ +import mlflow +from pyspark.errors import SparkRuntimeException +from pyspark.errors.exceptions.connect import ParseException +import re + +import logging +from typing import Dict, Union + +ERROR_KEY = "error_message" +STACK_TRACE_KEY = "stack_trace" + + +@mlflow.trace(span_type="PARSER") +def _remove_udfbody_from_pyspark_stack_trace(stack_trace: str) -> str: + return stack_trace.replace('File "",', "").strip() + + +@mlflow.trace(span_type="PARSER") +def _parse_PySpark_exception_dumped_as_string(error_msg: str) -> Dict[str, str]: + # Extract error section between == Error == and == Stacktrace == + error = error_msg.split("== Error ==")[1].split("== Stacktrace ==")[0].strip() + + # Extract stacktrace section after == Stacktrace == and before SQL + stack_trace = error_msg.split("== Stacktrace ==")[1].split("== SQL")[0].strip() + + # Remove SQLSTATE and anything after it from the stack trace + if "SQLSTATE" in stack_trace: + stack_trace = stack_trace.split("SQLSTATE")[0].strip() + + return { + STACK_TRACE_KEY: _remove_udfbody_from_pyspark_stack_trace(stack_trace), + ERROR_KEY: error, + } + + +@mlflow.trace(span_type="PARSER") +def _parse_PySpark_exception_from_known_structure( + tool_exception: SparkRuntimeException, +) -> Dict[str, str]: + raw_stack_trace = tool_exception.getMessageParameters()["stack"] + return { + STACK_TRACE_KEY: _remove_udfbody_from_pyspark_stack_trace(raw_stack_trace), + ERROR_KEY: tool_exception.getMessageParameters()["error"], + } + + +@mlflow.trace(span_type="PARSER") +def _parse_generic_tool_exception(tool_exception: Exception) -> Dict[str, str]: + return { + STACK_TRACE_KEY: None, + ERROR_KEY: str(tool_exception), + } + + +@mlflow.trace(span_type="PARSER") +def _parse_SparkException_from_tool_execution( + tool_exception: Union[SparkRuntimeException, Exception], +) -> Dict[str, str]: + error_info_to_return: Union[Dict, str] = None + + # First attempt: first try to parse from the known structure + try: + logging.info( + f"Trying to parse spark exception {tool_exception} using its provided structured data." + ) + # remove the from the stack trace which the LLM knows nothing about + # raw_stack_trace = tool_exception.getMessageParameters()["stack"] + return _parse_PySpark_exception_from_known_structure(tool_exception) + + except Exception as e: + # 2nd attempt: that failed, let's try to parse the SparkException's raw formatting + logging.info( + f"Error parsing spark exception using its provided structured data: {e}, will now try to parse its string output..." + ) + + logging.info( + f"Trying to parse spark exception {tool_exception} using its raw string output." 
+ ) + try: + raw_error_msg = str(tool_exception) + return _parse_PySpark_exception_dumped_as_string(raw_error_msg) + except Exception as e: + # Last attempt: if that fails, just use the raw error + logging.info( + f"Error parsing spark exception using its raw string formatting: {e}, will just return the raw error message." + ) + + logging.info(f"returning the raw error message: {str(tool_exception)}.") + return _parse_generic_tool_exception(tool_exception) + + +# TODO: this might be over fit to python code execution tool, need to test it more +@mlflow.trace(span_type="PARSER") +def _parse_ParseException_from_tool_execution( + tool_exception: ParseException, +) -> Dict[str, str]: + try: + error_msg = tool_exception.getMessage() + # Extract the main error message (remove SQLSTATE and position info) + error = error_msg.split("SQLSTATE:")[0].strip() + if "[PARSE_SYNTAX_ERROR]" in error: + error = error.split("[PARSE_SYNTAX_ERROR]")[1].strip() + + # Pattern to match "line X, pos Y" + pattern = r"line (\d+), pos (\d+)" + match = re.search(pattern, error_msg) + + if match: + line_num = match.group(1) + pos_num = match.group(2) + line_info = f"(line {line_num}, pos {pos_num})" + error = error + " " + line_info + + # Extract the SQL section with the error pointer + sql_section = ( + error_msg.split("== SQL ==")[1].split("JVM stacktrace:")[0].strip() + if "== SQL ==" in error_msg + else "" + ) + + # Remove the SELECT statement from the error message + select_pattern = r"SELECT\s+`[^`]+`\.`[^`]+`\.`[^`]+`\('" + # error_without_sql_parts = sql_section.replace(select_pattern, "").strip() + error_without_sql_parts = re.sub(select_pattern, "", sql_section).strip() + + return {STACK_TRACE_KEY: error_without_sql_parts, ERROR_KEY: error} + except Exception as e: + logging.info(f"Error parsing ParseException: {e}") + return { + STACK_TRACE_KEY: None, + ERROR_KEY: str(tool_exception), + } diff --git a/autogen_agent_app_sample_code/cookbook/tools/vector_search.py b/autogen_agent_app_sample_code/cookbook/tools/vector_search.py new file mode 100644 index 0000000..e857cd7 --- /dev/null +++ b/autogen_agent_app_sample_code/cookbook/tools/vector_search.py @@ -0,0 +1,455 @@ +import mlflow +from mlflow.entities import Document +from mlflow.models.resources import ( + DatabricksVectorSearchIndex, + DatabricksServingEndpoint, + DatabricksResource, +) + +import json +from typing import Literal, Any, Dict, List, Union +from pydantic import BaseModel, model_validator +from databricks.sdk import WorkspaceClient +from databricks.sdk.service.vectorsearch import VectorIndexType +from databricks.sdk.errors import ResourceDoesNotExist +from cookbook.tools import Tool +from dataclasses import asdict + +FilterDict = Dict[str, Union[str, int, float, List[Union[str, int, float]]]] + +# Change this to True to use the source table's metadata for the filterable columns. +# This causes deployment to fail since the deployed model doesn't have access to the source table. +USE_SOURCE_TABLE_FOR_FILTERS_METADATA = False + + +class VectorSearchSchema(BaseModel): + """Configuration for the schema used in the retriever's response. + + This class defines the schema configuration for how the vector search retriever + structures and returns results. + + Args: + primary_key: The column name in the retriever's response referred to the unique key. + If using Databricks vector search with delta sync, this should be the column + of the delta table that acts as the primary key. 
+ chunk_text: The column name in the retriever's response that contains the + returned chunk. + document_uri: The template of the chunk returned by the retriever - used to format + the chunk for presentation to the LLM & to display chunk's from the same + document_uri together in Agent Evaluation Review App. + additional_metadata_columns: Additional metadata columns to present to the LLM. + filterable_columns: List of columns that can be used as filters by the LLM. + + Returns: + VectorSearchSchema: A configured schema object for the vector search retriever. + """ + + _primary_key: str | None = None + """The column name in the retriever's response referred to the unique key. + If using Databricks vector search with delta sync, this should be the column + of the delta table that acts as the primary key, and will be set by reading the index's metadata.""" + + chunk_text: str + """The column name in the retriever's response that contains the returned chunk.""" + + document_uri: str + """The template of the chunk returned by the retriever - used to format + the chunk for presentation to the LLM & to display chunk's from the same + document_uri together in Agent Evaluation Review App.""" + + additional_metadata_columns: List[str] = [] + """Additional metadata columns to present to the LLM.""" + + @property + def all_columns(self) -> List[str]: + cols = [ + self.primary_key, + self.chunk_text, + self.document_uri, + ] + self.additional_metadata_columns + # de-duplicate + return list(set(cols)) + + @property + def primary_key(self) -> str: + """The primary key field, which must be set by VectorSearchRetrieverConfig""" + if self._primary_key is None: + raise ValueError("primary_key must be set by VectorSearchRetrieverConfig") + return self._primary_key + + +class VectorSearchParameters(BaseModel): + """Configuration for the input schema (parameters) used in the retriever. + + This class defines the configuration parameters for how the vector search retriever + performs searches and returns results. + + Args: + num_results: The number of chunks to return for each query. For example, + setting this to 5 will return the top 5 most relevant search results. + query_type: The type of search to use - either 'ann' for semantic similarity + using embeddings only, or 'hybrid' which combines keyword and semantic + similarity search. + + Returns: + VectorSearchParameters: A configured parameters object for the vector search retriever. + """ + + num_results: int = 5 + """The number of chunks to return for each query.""" + + query_type: Literal["ann", "hybrid"] = "ann" + """The type of search to use - either 'ann' for semantic similarity using embeddings only, + or 'hybrid' which combines keyword and semantic similarity search.""" + + +class VectorSearchRetrieverTool(Tool): + """Configuration for a Databricks Vector Search retriever. + + This class defines the configuration for a Vector Search retriever that can be used + either deterministically in a fixed RAG chain or as a tool. + + Args: + vector_search_index: Unity Catalog location of the Vector Search index. + Example: catalog.schema.vector_index. + vector_search_schema: Schema configuration for the retriever. + doc_similarity_threshold: Threshold (0-1) for the retrieved document's similarity score. Used + to exclude dissimilar results. Increase if retriever returns irrelevant content. + vector_search_parameters: Parameters passed to index.similarity_search(...). 
+ See https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#query-a-vector-search-endpoint for details. + retriever_query_parameter_prompt: Description of the query parameter for the retriever. + + Returns: + VectorSearchRetrieverConfig: A configured retriever config object. + """ + + vector_search_index: str + """Unity Catalog location of the Vector Search index. + Example: catalog.schema.vector_index.""" + + filterable_columns: List[str] = [] + """List of columns that can be used as filters by the LLM. Columns will be validated against the source table & metadata about each column loaded from the Unity Catalog to improve the LLM's ability to filter.""" + + vector_search_schema: VectorSearchSchema + """Schema configuration for the retriever.""" + + doc_similarity_threshold: float = 0.0 + """Threshold (0-1) for the retrieved document's similarity score. + Used to exclude dissimilar results. Increase if retriever returns irrelevant content.""" + + vector_search_parameters: VectorSearchParameters = VectorSearchParameters() + """Parameters passed to index.similarity_search(...). + See https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#query-a-vector-search-endpoint for details.""" + + retriever_query_parameter_prompt: str = "query to look up in retriever" + retriever_filter_parameter_prompt: str = ( + "optional filters to apply to the search. An array of objects, each specifying a field name and the filters to apply to that field." + ) + + name: str + description: str + + def __init__(self, **data): + """Initialize the WorkspaceClient and set the MLflow retriever schema.""" + super().__init__(**data) + mlflow.models.set_retriever_schema( + name=self.vector_search_index, + primary_key=self.vector_search_schema.primary_key, + text_column=self.vector_search_schema.chunk_text, + doc_uri=self.vector_search_schema.document_uri, + ) + + def _validate_columns_exist( + self, columns: List[str], source_table: str, table_columns: set, context: str + ) -> None: + """Helper method to validate that columns exist in the source table. + + Args: + columns: List of columns to validate + source_table: Name of the source table + table_columns: Set of available columns in the table + context: Context string for error message (e.g. "filterable columns", "chunk_text") + """ + for col in columns: + if col not in table_columns: + raise ValueError( + f"Column '{col}' specified in {context} not found in source table {source_table}. " + f"Available columns: {', '.join(sorted(table_columns))}" + ) + + def _get_index_info(self): + w = WorkspaceClient() + return w.vector_search_indexes.get_index(self.vector_search_index) + + def _check_if_index_exists(self): + w = WorkspaceClient() + try: + index_info = w.vector_search_indexes.get_index(self.vector_search_index) + return index_info is not None + except ResourceDoesNotExist as e: + return False + + @property + def filterable_columns_descriptions_for_llm(self) -> str: + """Returns a formatted description of all filterable columns for use in prompts.""" + if USE_SOURCE_TABLE_FOR_FILTERS_METADATA: + # Present the LLM with the source table's metadata for the filterable columns. + # TODO: be able to get this data directly from the index's metadata + # Get source table info + index_info = self._get_index_info() + if index_info.index_type != VectorIndexType.DELTA_SYNC: + raise ValueError( + f"Unsupported index type: {index_info.index_type}. Only DELTA_SYNC is supported." 
+ ) + + w = WorkspaceClient() + source_table = index_info.delta_sync_index_spec.source_table + table_info = w.tables.get(source_table) + + # Create mapping of column name to description and type + column_info = { + col.name: (col.type_text, col.comment if col.comment else None) + for col in table_info.columns + } + # print(column_info) + + # Build descriptions list + descriptions = [] + for col in self.filterable_columns: + type_text, desc = column_info.get(col, (None, None)) + formatted_desc = f"(`{col}`, {type_text}" + ( + f", '{desc}'" + ")" if desc else "" + ) + descriptions.append(formatted_desc) + return ", ".join(descriptions) + + else: + # just use the column names as metadata + return ", ".join(str(col) for col in self.filterable_columns) + + @model_validator(mode="after") + def validate_index_and_columns(self): + """Validates the index exists and all columns after the model is fully initialized""" + + # Check that index exists + if not self._check_if_index_exists(): + raise ValueError( + f"Vector search index {self.vector_search_index} does not exist." + ) + + index_info = self._get_index_info() + + # Set primary key from index if not already set + if not self.vector_search_schema._primary_key: + if index_info.primary_key: + self.vector_search_schema._primary_key = index_info.primary_key + else: + raise ValueError( + f"Could not find primary key in index {self.vector_search_index}" + ) + + # TODO: Validate all configured schema columns exist in the index. Currently, this data is not available in the index metadata. + + return self + + @model_validator(mode="after") + def validate_threshold(self): + if not 0 <= self.doc_similarity_threshold <= 1: + raise ValueError("doc_similarity_threshold must be between 0 and 1") + return self + + def _get_parameters_schema(self) -> dict: + schema = { + "type": "object", + "required": ["query"], + "additionalProperties": False, + "properties": { + "query": { + # "default": None, + "description": self.retriever_query_parameter_prompt, + "type": "string", + }, + }, + } + + if self.filterable_columns: + schema["properties"]["filters"] = { + # "default": None, + "description": self.retriever_filter_parameter_prompt, + "type": "array", + "items": { + "type": "object", + "properties": { + "field": { + "type": "string", + "enum": self.filterable_columns, + "description": "The fields to apply the filter to. 
Can use any of the following as filters, where each is (`field_name`, field_type, 'field_description'): " + + self.filterable_columns_descriptions_for_llm + + "For string fields, only use LIKE filter; for numeric fields, either provide a number to achieve == or use <, <=, >, >= filters; for array fields, either provide an array of 1+ values to achieve IN or use NOT to exclude.", + }, + "filter": { + "anyOf": [ + {"type": "string"}, + {"type": "number"}, + { + "type": "array", + "items": { + "anyOf": [ + {"type": "string"}, + {"type": "number"}, + ] + }, + }, + { + "type": "object", + "properties": { + "<": {"type": "number"}, + "<=": {"type": "number"}, + ">": {"type": "number"}, + ">=": {"type": "number"}, + "LIKE": {"type": "string"}, + "NOT": { + "anyOf": [ + {"type": "string"}, + {"type": "number"}, + ] + }, + }, + "additionalProperties": False, + "minProperties": 1, + "maxProperties": 1, + }, + ] + }, + }, + "required": ["field", "filter"], + "additionalProperties": False, + }, + } + + return schema + + @mlflow.trace(span_type="RETRIEVER", name="vector_search_retriever") + def __call__(self, query: str, filters: List[Dict[Any, Any]] = None) -> List[Document]: + """ + Performs vector search to retrieve relevant chunks. + + Args: + query: Search query. + filters: Optional list of filters to apply to the search. Should follow the LLM-generated filter pattern of a list of field/filter pairs that will be converted to Databricks Vector Search filter format. + + Returns: + List of retrieved Documents. + """ + span = mlflow.get_current_active_span() + if span: # TODO: Hack, when mlflow tracing is disabled, span == None. + span.set_attributes({"vector_search_index": self.vector_search_index}) + + w = WorkspaceClient() + + traced_search = mlflow.trace( + w.vector_search_indexes.query_index, + name="_workspace_client.vector_search_indexes.query_index", + span_type="FUNCTION", + ) + + # Parse filters written by the LLM into Vector Search compatible format + vs_filters = json.dumps(self.parse_filters(filters)) if filters else None + + results = traced_search( + index_name=self.vector_search_index, + query_text=query, + filters_json=vs_filters, + columns=self.vector_search_schema.all_columns, + **self.vector_search_parameters.model_dump(exclude_none=True), + ) + + # We turn the config into a dict and pass it here + return self.convert_vector_search_to_documents( + results.as_dict(), self.doc_similarity_threshold + ) + + @mlflow.trace(span_type="PARSER") + def convert_vector_search_to_documents( + self, vs_results, vector_search_threshold + ) -> List[Document]: + column_names = [] + for column in vs_results["manifest"]["columns"]: + column_names.append(column) + + docs = [] + if vs_results["result"]["row_count"] > 0: + for item in vs_results["result"]["data_array"]: + metadata = {} + score = item[-1] + if score >= vector_search_threshold: + metadata["similarity_score"] = score + for i, field in enumerate(item[0:-1]): + metadata[column_names[i]["name"]] = field + # put contents of the chunk into page_content + page_content = metadata[self.vector_search_schema.chunk_text] + del metadata[self.vector_search_schema.chunk_text] + + # put the primary key into id + id = metadata[self.vector_search_schema.primary_key] + del metadata[self.vector_search_schema.primary_key] + + doc = Document(page_content=page_content, metadata=metadata, id=id) + docs.append(asdict(doc)) + + return docs + + @mlflow.trace(span_type="PARSER") + def parse_filters(self, filters: List[Dict[str, Any]]) -> Dict[str, Any]: + """ + 
Parse input filters into Vector Search compatible format. + + Args: + filters: List of input filters in the new format. + + Returns: + Filters in Vector Search compatible format. + """ + vs_filters = {} + for filter_item in filters: + suggested_field = filter_item["field"] + suggested_filter = filter_item["filter"] + + if isinstance(suggested_filter, list): + # vs_filters[key] = {"OR": value} + vs_filters[suggested_field] = suggested_filter + elif isinstance(suggested_filter, dict): + operator, operand = next(iter(suggested_filter.items())) + vs_filters[suggested_field + " " + operator] = operand + # if operator in ["<", "<=", ">", ">="]: + # vs_filters[f"{key} {operator}"] = operand + # elif operator.upper() == "LIKE": + # vs_filters[f"{key} LIKE"] = operand + # elif operator.upper() == "NOT": + # vs_filters[f"{key} !="] = operand + else: + vs_filters[suggested_field] = suggested_filter + return vs_filters + + def get_resource_dependencies(self) -> List[DatabricksResource]: + dependencies = [ + DatabricksVectorSearchIndex(index_name=self.vector_search_index) + ] + + # Get the embedding model endpoint + index_info = self._get_index_info() + if index_info.index_type == VectorIndexType.DELTA_SYNC: + # Only DELTA_SYNC indexes have embedding model endpoints + for ( + embedding_source_col + ) in index_info.delta_sync_index_spec.embedding_source_columns: + endpoint_name = embedding_source_col.embedding_model_endpoint_name + if endpoint_name is not None: + dependencies.append( + DatabricksServingEndpoint(endpoint_name=endpoint_name), + ) + else: + print( + f"Could not identify the embedding model endpoint resource for {self.vector_search_index}. Please manually add the embedding model endpoint to `databricks_resources`." + ) + return dependencies diff --git a/autogen_agent_app_sample_code/environment.yaml b/autogen_agent_app_sample_code/environment.yaml new file mode 100644 index 0000000..76883b2 --- /dev/null +++ b/autogen_agent_app_sample_code/environment.yaml @@ -0,0 +1,4 @@ +client: "1" +dependencies: + - --index-url https://pypi.org/simple + - -r requirements.txt \ No newline at end of file diff --git a/autogen_agent_app_sample_code/pyproject.toml b/autogen_agent_app_sample_code/pyproject.toml new file mode 100644 index 0000000..fa57fb0 --- /dev/null +++ b/autogen_agent_app_sample_code/pyproject.toml @@ -0,0 +1,36 @@ +[tool.poetry] +name = "genai-cookbook" +version = "0.1.0" +description = "" +authors = ["Eric Peter "] +readme = "README.md" +packages = [{include = "cookbook"}] + +[tool.poetry.dependencies] +python = "^3.11" +databricks-connect = "15.1.0" +pydantic = "^2.9.2" +pyyaml = "^6.0.2" +databricks-vectorsearch = "^0.42" +databricks-sdk = {extras = ["openai"], version = "^0.36.0"} +mlflow = "^2.18.0" +databricks-agents = "^0.10.0" +pymupdf4llm = "0.0.5" +pymupdf = "1.24.13" +markdownify = "0.12.1" +transformers = "4.41.1" +torch = "2.3.0" +tiktoken = "0.7.0" +langchain-text-splitters = "0.2.0" +ipykernel = "^6.29.5" +hatchling = "^1.25.0" +pypandoc-binary = "1.13" +tabulate = "^0.9.0" +ipywidgets = "^8.1.5" +unitycatalog-ai = {git = "https://github.com/unitycatalog/unitycatalog.git", subdirectory = "ai/core"} +unitycatalog-openai = {git = "https://github.com/unitycatalog/unitycatalog.git", subdirectory = "ai/integrations/openai"} +pytest = "^8.3.3" + +[build-system] +requires = ["poetry-core"] +build-backend = "poetry.core.masonry.api" diff --git a/autogen_agent_app_sample_code/requirements.txt b/autogen_agent_app_sample_code/requirements.txt new file mode 100644 index 
0000000..3826356 --- /dev/null +++ b/autogen_agent_app_sample_code/requirements.txt @@ -0,0 +1,14 @@ +pydantic>=2.9.2 +databricks-agents +mlflow>=2.18.0 +databricks-sdk[openai] +databricks-vectorsearch +pyyaml +git+https://github.com/unitycatalog/unitycatalog.git#subdirectory=ai/core +git+https://github.com/unitycatalog/unitycatalog.git#subdirectory=ai/integrations/autogen +tabulate +pandas +pyspark +databricks-connect==15.1.0 +autogen-agentchat~=0.2 +pytest \ No newline at end of file diff --git a/autogen_agent_app_sample_code/requirements_datapipeline.txt b/autogen_agent_app_sample_code/requirements_datapipeline.txt new file mode 100644 index 0000000..c7d2e90 --- /dev/null +++ b/autogen_agent_app_sample_code/requirements_datapipeline.txt @@ -0,0 +1,9 @@ +pymupdf4llm==0.0.5 +pymupdf==1.24.13 +markdownify==0.12.1 +transformers==4.41.1 +torch==2.3.0 +tiktoken==0.7.0 +langchain-text-splitters==0.2.0 +pypandoc_binary==1.13 +pyyaml \ No newline at end of file diff --git a/autogen_agent_app_sample_code/tests/conftest.py b/autogen_agent_app_sample_code/tests/conftest.py new file mode 100644 index 0000000..6de2ed7 --- /dev/null +++ b/autogen_agent_app_sample_code/tests/conftest.py @@ -0,0 +1,6 @@ +import sys +import os + +# Add the parent directory to sys.path, so that we can treat directories like +# `cookbook` as modules +sys.path.insert(0, os.path.abspath(os.path.join(os.path.dirname(__file__), '..'))) diff --git a/autogen_agent_app_sample_code/tests/test_data_pipeline_utils.py b/autogen_agent_app_sample_code/tests/test_data_pipeline_utils.py new file mode 100644 index 0000000..9cb3cd2 --- /dev/null +++ b/autogen_agent_app_sample_code/tests/test_data_pipeline_utils.py @@ -0,0 +1,113 @@ +from datetime import datetime + +import pytest +import pyspark +import pandas as pd +from typing import TypedDict + +from cookbook.data_pipeline.parse_docs import load_files_to_df, apply_parsing_fn +from cookbook.data_pipeline.utils.typed_dicts_to_spark_schema import typed_dicts_to_spark_schema + + +@pytest.fixture(scope="module") +def spark(): + return ( + pyspark.sql.SparkSession.builder.master("local[1]") + .config("spark.driver.bindAddress", "127.0.0.1") + .config("spark.task.maxFailures", "1") # avoid retry failed spark tasks + .getOrCreate() + ) + + +@pytest.fixture() +def example_files_dir(tmpdir): + temp_dir = tmpdir.mkdir("files_subdir") + file_1 = temp_dir.join("file1.txt") + file_2 = temp_dir.join("file2.txt") + file_1.write("file1 content") + file_2.write("file2 content") + yield temp_dir, file_1, file_2 + + +def test_load_files_to_df(spark, example_files_dir): + temp_dir, file_1, file_2 = example_files_dir + raw_files_df = ( + load_files_to_df(spark, str(temp_dir)).drop("modificationTime").orderBy("path") + ) + assert raw_files_df.count() == 2 + raw_pandas_df = raw_files_df.toPandas() + # Decode the content from bytes to string + raw_pandas_df["content"] = raw_pandas_df["content"].apply( + lambda x: bytes(x).decode("utf-8") + ) + # Expected DataFrame + expected_df = pd.DataFrame( + [ + { + "path": f"file:{str(file_1)}", + "length": len("file1 content"), + "content": "file1 content", + }, + { + "path": f"file:{str(file_2)}", + "length": len("file2 content"), + "content": "file2 content", + }, + ] + ) + pd.testing.assert_frame_equal(raw_pandas_df, expected_df) + + +def test_load_files_to_df_throws_if_no_files(spark, tmpdir): + temp_dir = tmpdir.mkdir("files_subdir") + with pytest.raises(Exception, match="does not contain any files"): + load_files_to_df(spark, str(temp_dir)) + + +class 
ParserReturnValue(TypedDict): + # Parsed content of the document + content: str # do not change this name + # The status of whether the parser succeeds or fails, used to exclude failed files downstream + parser_status: str # do not change this name + # Unique ID of the document + doc_uri: str # do not change this name + + +def test_apply_parsing_fn(spark, example_files_dir): + def _mock_file_parser( + raw_doc_contents_bytes: bytes, + doc_path: str, + modification_time: datetime, + doc_bytes_length: int, + ): + return { + "content": raw_doc_contents_bytes.decode("utf-8"), + "parser_status": "SUCCESS", + "doc_uri": doc_path, + } + + temp_dir, file_1, file_2 = example_files_dir + raw_files_df = load_files_to_df(spark, str(temp_dir)).orderBy("path") + parsed_df = apply_parsing_fn( + raw_files_df, + _mock_file_parser, + parsed_df_schema=typed_dicts_to_spark_schema(ParserReturnValue), + ) + assert parsed_df.count() == 2 + parsed_pandas_df = parsed_df.toPandas() + # Expected DataFrame + expected_df = pd.DataFrame( + [ + { + "content": file_1.read_text(encoding="utf-8"), + "parser_status": "SUCCESS", + "doc_uri": f"file:{str(file_1)}", + }, + { + "content": file_2.read_text(encoding="utf-8"), + "parser_status": "SUCCESS", + "doc_uri": f"file:{str(file_2)}", + }, + ] + ) + pd.testing.assert_frame_equal(parsed_pandas_df, expected_df) diff --git a/autogen_agent_app_sample_code/tools/README.md b/autogen_agent_app_sample_code/tools/README.md new file mode 100644 index 0000000..e7acbf9 --- /dev/null +++ b/autogen_agent_app_sample_code/tools/README.md @@ -0,0 +1 @@ +Store user-created tools in this directory. \ No newline at end of file diff --git a/autogen_agent_app_sample_code/tools/__init__.py b/autogen_agent_app_sample_code/tools/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/autogen_agent_app_sample_code/tools/code_exec.py b/autogen_agent_app_sample_code/tools/code_exec.py new file mode 100644 index 0000000..633a34b --- /dev/null +++ b/autogen_agent_app_sample_code/tools/code_exec.py @@ -0,0 +1,20 @@ +def python_exec(code: str) -> str: + """ + Executes Python code in the sandboxed environment and returns its stdout. The runtime is stateless and you can not read output of the previous tool executions. i.e. No such variables "rows", "observation" defined. Calling another tool inside a Python code is NOT allowed. + Use only standard python libraries and these python libraries: bleach, chardet, charset-normalizer, defusedxml, googleapis-common-protos, grpcio, grpcio-status, jmespath, joblib, numpy, packaging, pandas, patsy, protobuf, pyarrow, pyparsing, python-dateutil, pytz, scikit-learn, scipy, setuptools, six, threadpoolctl, webencodings, user-agents, cryptography. + + Args: + code (str): Python code to execute. Remember to print the final result to stdout. + + Returns: + str: The output of the executed code. + """ + import sys + from io import StringIO + + sys_stdout = sys.stdout + redirected_output = StringIO() + sys.stdout = redirected_output + exec(code) + sys.stdout = sys_stdout + return redirected_output.getvalue() diff --git a/autogen_agent_app_sample_code/tools/sample_tool.py b/autogen_agent_app_sample_code/tools/sample_tool.py new file mode 100644 index 0000000..eef313c --- /dev/null +++ b/autogen_agent_app_sample_code/tools/sample_tool.py @@ -0,0 +1,46 @@ + +def sku_sample_translator(old_sku: str) -> str: + """ + Translates a pre-2024 SKU formatted as "OLD-XXX-YYYY" to the new SKU format "NEW-YYYY-XXX". 
+ + Args: + old_sku (str): The old SKU in the format "OLD-XXX-YYYY". + + Returns: + str: The new SKU in the format "NEW-YYYY-XXX". + + Raises: + ValueError: If the SKU format is invalid, providing specific error details. + """ + import re + + if not isinstance(old_sku, str): + raise ValueError("SKU must be a string") + + # Normalize input by removing extra whitespace and converting to uppercase + old_sku = old_sku.strip().upper() + + # Define the regex pattern for the old SKU format + pattern = r"^OLD-([A-Z]{3})-(\d{4})$" + + # Match the old SKU against the pattern + match = re.match(pattern, old_sku) + if not match: + if not old_sku.startswith("OLD-"): + raise ValueError("SKU must start with 'OLD-'") + if not re.match(r"^OLD-[A-Z]{3}-\d{4}$", old_sku): + raise ValueError( + "SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit" + ) + raise ValueError("Invalid SKU format") + + # Extract the letter code and numeric part + letter_code, numeric_part = match.groups() + + # Additional validation for numeric part + if not (1 <= int(numeric_part) <= 9999): + raise ValueError("Numeric part must be between 0001 and 9999") + + # Construct the new SKU + new_sku = f"NEW-{numeric_part}-{letter_code}" + return new_sku diff --git a/autogen_agent_app_sample_code/tools/test_code_exec.py b/autogen_agent_app_sample_code/tools/test_code_exec.py new file mode 100644 index 0000000..a4c5418 --- /dev/null +++ b/autogen_agent_app_sample_code/tools/test_code_exec.py @@ -0,0 +1,89 @@ + +import pytest +from .code_exec import python_exec + + +def test_basic_arithmetic(): + code = """result = 2 + 2\nprint(result)""" + assert python_exec(code).strip() == "4" + + +def test_multiple_lines(): + code = "x = 5\n" "y = 3\n" "result = x * y\n" "print(result)" + assert python_exec(code).strip() == "15" + + +def test_multiple_prints(): + code = """print('first')\nprint('second')\nprint('third')\n""" + expected = "first\nsecond\nthird\n" + assert python_exec(code) == expected + + +def test_using_pandas(): + code = ( + "import pandas as pd\n" + "data = {'col1': [1, 2], 'col2': [3, 4]}\n" + "df = pd.DataFrame(data)\n" + "print(df.shape)" + ) + assert python_exec(code).strip() == "(2, 2)" + + +def test_using_numpy(): + code = "import numpy as np\n" "arr = np.array([1, 2, 3])\n" "print(arr.mean())" + assert python_exec(code).strip() == "2.0" + + +def test_syntax_error(): + code = "if True\n" " print('invalid syntax')" + with pytest.raises(SyntaxError): + python_exec(code) + + +def test_runtime_error(): + code = "x = 1 / 0\n" "print(x)" + with pytest.raises(ZeroDivisionError): + python_exec(code) + + +def test_undefined_variable(): + code = "print(undefined_variable)" + with pytest.raises(NameError): + python_exec(code) + + +def test_multiline_string_manipulation(): + code = "text = '''\n" "Hello\n" "World\n" "'''\n" "print(text.strip())" + expected = "Hello\nWorld" + assert python_exec(code).strip() == expected + +# Will not fail locally, but will fail in UC. 
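A side note on the stdout-capture pattern these tests exercise: `python_exec` in tools/code_exec.py swaps `sys.stdout` by hand, so if `exec(code)` raises, stdout is left redirected. A minimal sketch of a more defensive variant (an illustration only, not the shipped tool; the UC-registered copy of the function would need the same change to match) could lean on `contextlib.redirect_stdout`, which always restores stdout:

import contextlib
from io import StringIO

def python_exec_defensive(code: str) -> str:
    # Sketch only: capture stdout while the user code runs and restore it even if the code raises.
    buffer = StringIO()
    with contextlib.redirect_stdout(buffer):
        exec(code)
    return buffer.getvalue()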
+# def test_unauthorized_flask(): +# code = "from flask import Flask\n" "app = Flask(__name__)\n" "print(app)" +# with pytest.raises(ImportError): +# python_exec(code) + + +def test_no_print_statement(): + code = "x = 42\n" "y = x * 2" + assert python_exec(code) == "" + + +def test_calculation_without_print(): + code = "result = sum([1, 2, 3, 4, 5])\n" "squared = [x**2 for x in range(5)]" + assert python_exec(code) == "" + + +def test_function_definition_without_call(): + code = "def add(a, b):\n" " return a + b\n" "result = add(3, 4)" + assert python_exec(code) == "" + + +def test_class_definition_without_instantiation(): + code = ( + "class Calculator:\n" + " def add(self, a, b):\n" + " return a + b\n" + "calc = Calculator()" + ) + assert python_exec(code) == "" diff --git a/autogen_agent_app_sample_code/tools/test_code_exec_as_uc_tool.py b/autogen_agent_app_sample_code/tools/test_code_exec_as_uc_tool.py new file mode 100644 index 0000000..0d7e360 --- /dev/null +++ b/autogen_agent_app_sample_code/tools/test_code_exec_as_uc_tool.py @@ -0,0 +1,102 @@ + +import pytest +from cookbook.tools.uc_tool import UCTool + +CATALOG = "ep" +SCHEMA = "cookbook_local_test" + + +@pytest.fixture +def python_exec(): + """Fixture to provide the python_exec function from UCTool.""" + python_exec_tool = UCTool(uc_function_name=f"{CATALOG}.{SCHEMA}.python_exec") + return python_exec_tool + + +def test_basic_arithmetic(python_exec): + code = """result = 2 + 2\nprint(result)""" + assert python_exec(code=code)["value"].strip() == "4" + + +def test_multiple_lines(python_exec): + code = "x = 5\n" "y = 3\n" "result = x * y\n" "print(result)" + assert python_exec(code=code)["value"].strip() == "15" + + +def test_multiple_prints(python_exec): + code = """print('first')\nprint('second')\nprint('third')\n""" + expected = "first\nsecond\nthird\n" + assert python_exec(code=code)["value"] == expected + + +def test_using_pandas(python_exec): + code = ( + "import pandas as pd\n" + "data = {'col1': [1, 2], 'col2': [3, 4]}\n" + "df = pd.DataFrame(data)\n" + "print(df.shape)" + ) + assert python_exec(code=code)["value"].strip() == "(2, 2)" + + +def test_using_numpy(python_exec): + code = "import numpy as np\n" "arr = np.array([1, 2, 3])\n" "print(arr.mean())" + assert python_exec(code=code)["value"].strip() == "2.0" + + +def test_syntax_error(python_exec): + code = "if True\n" " print('invalid syntax')" + result = python_exec(code=code) + assert "Syntax error at or near 'invalid'." 
in result["error"]["error_message"] + + +def test_runtime_error(python_exec): + code = "x = 1 / 0\n" "print(x)" + result = python_exec(code=code) + assert "ZeroDivisionError" in result["error"]["error_message"] + + +def test_undefined_variable(python_exec): + code = "print(undefined_variable)" + result = python_exec(code=code) + assert "NameError" in result["error"]["error_message"] + + +def test_multiline_string_manipulation(python_exec): + code = "text = '''\n" "Hello\n" "World\n" "'''\n" "print(text.strip())" + expected = "Hello\nWorld" + assert python_exec(code=code)["value"].strip() == expected + + +def test_unauthorized_flask(python_exec): + code = "from flask import Flask\n" "app = Flask(__name__)\n" "print(app)" + result = python_exec(code=code) + assert ( + "ModuleNotFoundError: No module named 'flask'" + in result["error"]["error_message"] + ) + + +def test_no_print_statement(python_exec): + code = "x = 42\n" "y = x * 2" + assert python_exec(code=code)["value"] == "" + + +def test_calculation_without_print(python_exec): + code = "result = sum([1, 2, 3, 4, 5])\n" "squared = [x**2 for x in range(5)]" + assert python_exec(code=code)["value"] == "" + + +def test_function_definition_without_call(python_exec): + code = "def add(a, b):\n" " return a + b\n" "result = add(3, 4)" + assert python_exec(code=code)["value"] == "" + + +def test_class_definition_without_instantiation(python_exec): + code = ( + "class Calculator:\n" + " def add(self, a, b):\n" + " return a + b\n" + "calc = Calculator()" + ) + assert python_exec(code=code)["value"] == "" diff --git a/autogen_agent_app_sample_code/tools/test_sample_tool.py b/autogen_agent_app_sample_code/tools/test_sample_tool.py new file mode 100644 index 0000000..f818d70 --- /dev/null +++ b/autogen_agent_app_sample_code/tools/test_sample_tool.py @@ -0,0 +1,52 @@ +import pytest +from tools.sample_tool import sku_sample_translator + + + +def test_valid_sku_translation(): + """Test successful SKU translation with valid input.""" + assert sku_sample_translator("OLD-ABC-1234") == "NEW-1234-ABC" + assert sku_sample_translator("OLD-XYZ-0001") == "NEW-0001-XYZ" + assert sku_sample_translator("old-def-5678") == "NEW-5678-DEF" # Test case insensitivity + + +def test_whitespace_handling(): + """Test that the function handles extra whitespace correctly.""" + assert sku_sample_translator(" OLD-ABC-1234 ") == "NEW-1234-ABC" + assert sku_sample_translator("\tOLD-ABC-1234\n") == "NEW-1234-ABC" + + +def test_invalid_input_type(): + """Test that non-string inputs raise ValueError.""" + with pytest.raises(ValueError, match="SKU must be a string"): + sku_sample_translator(123) + with pytest.raises(ValueError, match="SKU must be a string"): + sku_sample_translator(None) + + +def test_invalid_prefix(): + """Test that SKUs not starting with 'OLD-' raise ValueError.""" + with pytest.raises(ValueError, match="SKU must start with 'OLD-'"): + sku_sample_translator("NEW-ABC-1234") + with pytest.raises(ValueError, match="SKU must start with 'OLD-'"): + sku_sample_translator("XXX-ABC-1234") + + +def test_invalid_format(): + """Test various invalid SKU formats.""" + invalid_skus = [ + "OLD-AB-1234", # Too few letters + "OLD-ABCD-1234", # Too many letters + "OLD-123-1234", # Numbers instead of letters + "OLD-ABC-123", # Too few digits + "OLD-ABC-12345", # Too many digits + "OLD-ABC-XXXX", # Letters instead of numbers + "OLD-A1C-1234", # Mixed letters and numbers in middle + ] + + for sku in invalid_skus: + with pytest.raises( + ValueError, + match="SKU format must be 
'OLD-XXX-YYYY' where X is a letter and Y is a digit", + ): + sku_sample_translator(sku) diff --git a/autogen_agent_app_sample_code/tools/test_sample_tool_uc.py b/autogen_agent_app_sample_code/tools/test_sample_tool_uc.py new file mode 100644 index 0000000..1539f11 --- /dev/null +++ b/autogen_agent_app_sample_code/tools/test_sample_tool_uc.py @@ -0,0 +1,72 @@ +import pytest +from cookbook.tools.uc_tool import UCTool + +# Load the function from the UCTool versus locally +@pytest.fixture +def uc_tool(): + """Fixture to translate a UC tool into a local function.""" + UC_FUNCTION_NAME = "ep.cookbook_local_test.sku_sample_translator" + loaded_tool = UCTool(uc_function_name=UC_FUNCTION_NAME) + return loaded_tool + + +# Note: The value will be post processed into the `value` key, so we must check the returned value there. +def test_valid_sku_translation(uc_tool): + """Test successful SKU translation with valid input.""" + assert uc_tool(old_sku="OLD-ABC-1234")["value"] == "NEW-1234-ABC" + assert uc_tool(old_sku="OLD-XYZ-0001")["value"] == "NEW-0001-XYZ" + assert ( + uc_tool(old_sku="old-def-5678")["value"] == "NEW-5678-DEF" + ) # Test case insensitivity + + +# Note: The value will be post processed into the `value` key, so we must check the returned value there. +def test_whitespace_handling(uc_tool): + """Test that the function handles extra whitespace correctly.""" + assert uc_tool(old_sku=" OLD-ABC-1234 ")["value"] == "NEW-1234-ABC" + assert uc_tool(old_sku="\tOLD-ABC-1234\n")["value"] == "NEW-1234-ABC" + + +# Note: the input validation happens BEFORE the function is called by Spark, so we will never get these exceptions from the function. +# Instead, we will get invalid parameters errors from Spark. +def test_invalid_input_type(uc_tool): + """Test that non-string inputs raise ValueError.""" + assert ( + uc_tool(old_sku=123)["error"]["error_message"] + == """Invalid parameters provided: {'old_sku': "Parameter old_sku should be of type STRING (corresponding python type ), but got "}.""" + ) + assert ( + uc_tool(old_sku=None)["error"]["error_message"] + == """Invalid parameters provided: {'old_sku': "Parameter old_sku should be of type STRING (corresponding python type ), but got "}.""" + ) + + +# Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there. +def test_invalid_prefix(uc_tool): + """Test that SKUs not starting with 'OLD-' raise ValueError.""" + assert ( + uc_tool(old_sku="NEW-ABC-1234")["error"]["error_message"] + == "ValueError: SKU must start with 'OLD-'" + ) + assert ( + uc_tool(old_sku="XXX-ABC-1234")["error"]["error_message"] + == "ValueError: SKU must start with 'OLD-'" + ) + + +# Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there. 
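To make the notes above concrete: a successful UCTool call returns the execution result's dataclass fields (these tests read the output from the "value" key), while a failed call returns the parsed error under "error" plus retry guidance under "error_instructions". A rough sketch of how a caller might branch on that payload (only the keys defined in uc_tool.py and uc_tool_utils.py are assumed; any other fields are illustrative):

def handle_uc_tool_result(result: dict) -> str:
    # Error path: uc_tool.py returns ERROR_STATUS_KEY ("error") plus ERROR_INSTRUCTIONS_KEY ("error_instructions").
    if "error" in result:
        error_message = result["error"].get("error_message")  # ERROR_KEY in uc_tool_utils.py
        stack_trace = result["error"].get("stack_trace")  # STACK_TRACE_KEY, may be None
        instructions = result.get("error_instructions", "")
        return f"Tool call failed: {error_message}\n{stack_trace or ''}\n{instructions}"
    # Success path: the executed function's output is post-processed into "value".
    return result["value"]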
+def test_invalid_format(uc_tool): + """Test various invalid SKU formats.""" + invalid_skus = [ + "OLD-AB-1234", # Too few letters + "OLD-ABCD-1234", # Too many letters + "OLD-123-1234", # Numbers instead of letters + "OLD-ABC-123", # Too few digits + "OLD-ABC-12345", # Too many digits + "OLD-ABC-XXXX", # Letters instead of numbers + "OLD-A1C-1234", # Mixed letters and numbers in middle + ] + + expected_error = "ValueError: SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit" + for sku in invalid_skus: + assert uc_tool(old_sku=sku)["error"]["error_message"] == expected_error From 7a8d392a084585f692ab92c545180045b98f71e2 Mon Sep 17 00:00:00 2001 From: manfredcalvo Date: Fri, 20 Dec 2024 13:56:58 -0600 Subject: [PATCH 2/2] Updating all notebok files to be ipython files instead of py files. --- .../01_data_pipeline.ipynb | 104749 ++++++++++++++- .../02_agent_setup.ipynb | 481 +- .../03_create_synthetic_eval.ipynb | 49289 ++++++- .../04_create_tools.ipynb | 2701 +- .../05_tool_calling_agent.ipynb | 9405 +- .../06_multi_agent_with_genie.ipynb | 1306 +- .../autogen_started.ipynb | 1096 +- 7 files changed, 166412 insertions(+), 2615 deletions(-) diff --git a/autogen_agent_app_sample_code/01_data_pipeline.ipynb b/autogen_agent_app_sample_code/01_data_pipeline.ipynb index 72b147d..edc80c9 100644 --- a/autogen_agent_app_sample_code/01_data_pipeline.ipynb +++ b/autogen_agent_app_sample_code/01_data_pipeline.ipynb @@ -1,537 +1,104212 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC # Unstructured data pipeline for the Agent's Retriever -# MAGIC -# MAGIC By the end of this notebook, you will have transformed your unstructured documents into a vector index that can be queried by your Agent. -# MAGIC -# MAGIC This means: -# MAGIC - Documents loaded into a delta table. -# MAGIC - Documents are chunked. -# MAGIC - Chunks have been embedded with an embedding model and stored in a vector index. -# MAGIC -# MAGIC The important resulting artifact of this notebook is the chunked vector index. This will be used in the next notebook to power our Retriever. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 👉 START HERE: How to Use This Notebook -# MAGIC -# MAGIC Follow these steps to build and refine your data pipeline's quality: -# MAGIC -# MAGIC 1. **Build a v0 index with default settings** -# MAGIC - Configure the data source and destination tables in the `1️⃣ 📂 Data source & destination configuration` cells -# MAGIC - Press `Run All` to create the vector index. -# MAGIC -# MAGIC *Note: While you can adjust the other settings and modify the parsing/chunking code, we suggest doing so only after evaluating your Agent's quality so you can make improvements that specifically address root causes of quality issues.* -# MAGIC -# MAGIC 2. **Use later notebooks to integrate the retriever into an the agent and evaluate the agent/retriever's quality.** -# MAGIC -# MAGIC 3. **If the evaluation results show retrieval issues as a root cause, use this notebook to iterate on your data pipeline's code & config.** Below are some potential fixes you can try, see the AI Cookbook's [debugging retrieval issues](https://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-1-retrieval.html) section for details.** -# MAGIC - Add missing, but relevant source documents into in the index. -# MAGIC - Resolve any conflicting information in source documents. -# MAGIC - Adjust the data pipeline configuration: -# MAGIC - Modify chunk size or overlap. -# MAGIC - Experiment with different embedding models. 
-# MAGIC - Adjust the data pipeline code: -# MAGIC - Create a custom parser or use different parsing libraries. -# MAGIC - Develop a custom chunker or use different chunking techniques. -# MAGIC - Extract additional metadata for each document. -# MAGIC - Adjust the Agent's code/config in subsequent notebooks: -# MAGIC - Change the number of documents retrieved (K). -# MAGIC - Try a re-ranker. -# MAGIC - Use hybrid search. -# MAGIC - Apply extracted metadata as filters. -# MAGIC -# MAGIC - -# COMMAND ---------- - -# MAGIC %md -# MAGIC **Important note:** Throughout this notebook, we indicate which cells you: -# MAGIC - ✅✏️ *should* customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality -# MAGIC - 🚫✏️ *typically will not* customize - these cells contain boilerplate code required to execute the pipeline -# MAGIC -# MAGIC *Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.* - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Install Python libraries (Databricks Notebook only) -# MAGIC -# MAGIC 🚫✏️ Only modify if you need additional packages in your code changes to the document parsing or chunking logic. -# MAGIC -# MAGIC Versions of Databricks code are not locked since Databricks ensures changes are backwards compatible. -# MAGIC Versions of open source packages are locked since package authors often make backwards compatible changes - -# COMMAND ---------- - -# MAGIC %pip install -qqqq -U -r requirements.txt -# MAGIC %pip install -qqqq -U -r requirements_datapipeline.txt -# MAGIC dbutils.library.restartPython() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### Connect to Databricks (Local IDE only) -# MAGIC -# MAGIC If running from an IDE with [`databricks-connect`](https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html), connect to a Spark session & install the necessary packages on that cluster. - -# COMMAND ---------- - -from cookbook.databricks_utils import get_cluster_url -from cookbook.databricks_utils import get_active_cluster_id -from cookbook.databricks_utils.install_cluster_library import install_requirements - -# UNCOMMENT TO INSTALL PACKAGES ON THE ACTIVE CLUSTER; this is code that is not super battle tested. -# cluster_id = get_active_cluster_id() -# print(f"Installing packages on the active cluster: {get_cluster_url(cluster_id)}") - - -# install_requirements(cluster_id, "requirements.txt") -# install_requirements(cluster_id, "requirements_datapipeline.txt") - -# THIS MUST BE DONE MANUALLY! TODO: Automate it. -# - Go to openai_sdk_agent_app_sample_code/ -# - Run `poetry build` -# - Copy the wheel file to a UC Volume or Workspace folder -# - Go to the cluster's Libraries page and install the wheel file as a new library - -# Get Spark session if using Databricks Connect from an IDE -from mlflow.utils import databricks_utils as du - -if not du.is_in_databricks_notebook(): - from databricks.connect import DatabricksSession - - spark = DatabricksSession.builder.getOrCreate() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC ## 1️⃣ 📂 Data source & destination configuration - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ Configure the data pipeline's source location. -# MAGIC -# MAGIC Choose a [Unity Catalog Volume](https://docs.databricks.com/en/volumes/index.html) containing PDF, HTML, etc documents to be parsed/chunked/embedded. 
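If your source documents are still on a local machine, one possible way to copy them into the Volume before running the pipeline is the Files API in the Databricks SDK. This is an illustrative sketch, not part of the pipeline itself, and the catalog, schema, volume, and file names below are placeholders:

from databricks.sdk import WorkspaceClient

w = WorkspaceClient()
volume_path = "/Volumes/my_catalog/my_schema/my_volume"  # placeholder UC Volume path

# Upload a single local PDF into the Volume (placeholder local file name)
with open("local_docs/product_manual.pdf", "rb") as f:
    w.files.upload(f"{volume_path}/product_manual.pdf", f, overwrite=True)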
-# MAGIC -# MAGIC - `uc_catalog_name`: Name of the Unity Catalog. -# MAGIC - `uc_schema_name`: Name of the Unity Catalog schema. -# MAGIC - `uc_volume_name`: Name of the Unity Catalog volume. -# MAGIC -# MAGIC Running this cell with validate that the UC Volume exists, trying to create it if not. -# MAGIC - -# COMMAND ---------- - -from cookbook.config.data_pipeline.uc_volume_source import UCVolumeSourceConfig - -# Configure the UC Volume that contains the source documents -source_config = UCVolumeSourceConfig( - # uc_catalog_name="REPLACE_ME", # REPLACE_ME - # uc_schema_name="REPLACE_ME", # REPLACE_ME - # uc_volume_name=f"REPLACE_ME", # REPLACE_ME - uc_catalog_name="casaman_ssa", # REPLACE_ME - uc_schema_name="demos", # REPLACE_ME - uc_volume_name="volume_databricks_documentation", # REPLACE_ME -) - -# Check if volume exists, create otherwise -is_valid, msg = source_config.create_or_validate_volume() -if not is_valid: - raise Exception(msg) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ Configure the data pipeline's output location. -# MAGIC -# MAGIC Choose where the data pipeline outputs the parsed, chunked, and embedded documents. -# MAGIC -# MAGIC Required parameters: -# MAGIC * `uc_catalog_name`: Unity Catalog name where tables will be created -# MAGIC * `uc_schema_name`: Schema name within the catalog -# MAGIC * `base_table_name`: Core name used as prefix for all generated tables -# MAGIC * `vector_search_endpoint`: Vector Search endpoint to store the index -# MAGIC -# MAGIC Optional parameters: -# MAGIC * `docs_table_postfix`: Suffix for the parsed documents table (default: "docs") -# MAGIC * `chunked_table_postfix`: Suffix for the chunked documents table (default: "docs_chunked") -# MAGIC * `vector_index_postfix`: Suffix for the vector index (default: "docs_chunked_index") -# MAGIC * `version_suffix`: Version identifier (e.g. 
'v1', 'test') to maintain multiple versions -# MAGIC -# MAGIC The generated tables follow this naming convention: -# MAGIC * Parsed docs: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{docs_table_postfix}__{version_suffix} -# MAGIC * Chunked docs: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{chunked_table_postfix}__{version_suffix} -# MAGIC * Vector index: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{vector_index_postfix}__{version_suffix} -# MAGIC -# MAGIC *Note: If you are comparing different chunking/parsing/embedding strategies, set the `version_suffix` parameter to maintain multiple versions of the pipeline output with the same base_table_name.* -# MAGIC -# MAGIC *Databricks suggests sharing a Vector Search endpoint across multiple agents.* - -# COMMAND ---------- - -from cookbook.config.data_pipeline.data_pipeline_output import DataPipelineOuputConfig - -# Output configuration -output_config = DataPipelineOuputConfig( - # Required parameters - uc_catalog_name=source_config.uc_catalog_name, # usually same as source volume catalog, by default is the same as the source volume catalog - uc_schema_name=source_config.uc_schema_name, # usually same as source volume schema, by default is the same as the source volume schema - #base_table_name=source_config.uc_volume_name, # usually similar / same as the source volume name; by default, is the same as the volume_name - base_table_name="test_product_docs", # usually similar / same as the source volume name; by default, is the same as the volume_name - # vector_search_endpoint="REPLACE_ME", # Vector Search endpoint to store the index - vector_search_endpoint="one-env-shared-endpoint-3", # Vector Search endpoint to store the index - - # Optional parameters, showing defaults - docs_table_postfix="docs", # default value is `docs` - chunked_table_postfix="docs_chunked", # default value is `docs_chunked` - vector_index_postfix="docs_chunked_index", # default value is `docs_chunked_index` - version_suffix="v2" # default is None - - # Output tables / indexes follow this naming convention: - # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{docs_table_postfix}__{version_suffix} - # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{chunked_table_postfix}__{version_suffix} - # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{vector_index_postfix}__{version_suffix} -) - -# Alternatively, you can directly pass in the UC locations of the tables / indexes -# output_config = DataPipelineOuputConfig( -# chunked_docs_table="catalog.schema.docs_chunked", -# parsed_docs_table="catalog.schema.parsed_docs", -# vector_index="catalog.schema.docs_chunked_index", -# vector_search_endpoint="REPLACE_ME", -# ) - -# Check UC locations exist -is_valid, msg = output_config.validate_catalog_and_schema() -if not is_valid: - raise Exception(msg) - -# Check Vector Search endpoint exists -is_valid, msg = output_config.validate_vector_search_endpoint() -if not is_valid: - raise Exception(msg) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ Configure chunk size and the embedding model. -# MAGIC -# MAGIC **Chunk size and overlap** control how a larger document is turned into smaller chunks that can be processed by an embedding model. See the AI Cookbook [chunking deep dive](https://ai-cookbook.io/nbs/3-deep-dive-data-pipeline.html#chunking) for more details. -# MAGIC -# MAGIC **The embedding model** is an AI model that is used to identify the most similar documents to a given user's query. 
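Before moving on to the embedding model details, a quick way to build intuition for the chunk size and overlap settings described above: with the defaults used below (chunk_size_tokens=1024, chunk_overlap_tokens=256), consecutive chunks advance by roughly 1024 - 256 = 768 tokens, so a 4,096-token document yields about 1 + ceil((4096 - 1024) / 768) = 5 chunks. The splitter breaks on separators rather than exact token counts, so treat this as a back-of-the-envelope estimate only:

import math

def approx_chunk_count(doc_tokens: int, chunk_size: int = 1024, overlap: int = 256) -> int:
    # Rough estimate; the recursive splitter's separator logic will shift the real count.
    if doc_tokens <= chunk_size:
        return 1
    stride = chunk_size - overlap
    return 1 + math.ceil((doc_tokens - chunk_size) / stride)

print(approx_chunk_count(4096))  # -> 5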
See the AI Cookbook [embedding model deep dive](https://ai-cookbook.io/nbs/3-deep-dive-data-pipeline.html#embedding-model) for more details. -# MAGIC -# MAGIC This notebook supports the following [Foundational Models](https://docs.databricks.com/en/machine-learning/foundation-models/index.html) or [External Model](https://docs.databricks.com/en/generative-ai/external-models/index.html) of type `/llm/v1/embeddings`. If you want to try another model, you will need to modify the `utils/get_recursive_character_text_splitter` Notebook to add support. -# MAGIC - `databricks-gte-large-en` or `databricks-bge-large-en` -# MAGIC - Azure OpenAI or OpenAI External Model of type `text-embedding-ada-002`, `text-embedding-3-small` or `text-embedding-3-large` - -# COMMAND ---------- - -from cookbook.config.data_pipeline.recursive_text_splitter import RecursiveTextSplitterChunkingConfig - -chunking_config = RecursiveTextSplitterChunkingConfig( -    embedding_model_endpoint="databricks-gte-large-en", # A Model Serving endpoint supporting the /llm/v1/embeddings task -    chunk_size_tokens=1024, -    chunk_overlap_tokens=256, -) - -# Validate the embedding endpoint & chunking config -is_valid, msg = chunking_config.validate_embedding_endpoint() -if not is_valid: -    raise Exception(msg) - -is_valid, msg = chunking_config.validate_chunk_size_and_overlap() -if not is_valid: -    raise Exception(msg) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### 🚫✏️ Write the data pipeline configuration to a YAML -# MAGIC -# MAGIC This allows the configuration to be loaded and referenced by the Agent's notebook. - -# COMMAND ---------- - -from cookbook.config.data_pipeline import DataPipelineConfig -from cookbook.config import serializable_config_to_yaml_file - -data_pipeline_config = DataPipelineConfig( -    source=source_config, -    output=output_config, -    chunking_config=chunking_config, -) - -serializable_config_to_yaml_file(data_pipeline_config, "./configs/data_pipeline_config.yaml") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### 🛑 If you are running your initial data pipeline, you do not need to configure anything else; you can just `Run All` the remaining notebook cells. You can modify these cells later to tune the quality of your data pipeline by changing the parsing logic. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## 3️⃣ ⌨️ Data pipeline code -# MAGIC -# MAGIC The code below executes the data pipeline. You can modify the below code as indicated to implement different parsing or chunking strategies or to extract additional metadata fields. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Pipeline step 1: Load & parse documents into a Delta Table -# MAGIC -# MAGIC In this step, we'll load files from the UC Volume defined in `source_config` into the Delta Table `output_config.parsed_docs_table`. The contents of each file will become a separate row in our delta table. -# MAGIC -# MAGIC The path to the source document will be used as the `doc_uri` which is displayed to your end users in the Agent Evaluation web application. -# MAGIC -# MAGIC After you test your POC with stakeholders, you can return here to change the parsing logic or extract additional metadata about the documents to help improve the quality of your retriever. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ##### ✅✏️ Customize the parsing function -# MAGIC -# MAGIC This default implementation parses PDF, HTML, and DOCX files using open source libraries.
Adjust `file_parser(...)` and `ParserReturnValue` in `cookbook/data_pipeline/default_parser.py` to change the parsing logic, add support for more file types, or extract additional metadata about each document. - -# COMMAND ---------- - -from cookbook.data_pipeline.default_parser import file_parser, ParserReturnValue - -# Print the code of file_parser function for inspection -import inspect -print(inspect.getsource(ParserReturnValue)) -print(inspect.getsource(file_parser)) - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC The below cell is debugging code to test your parsing function on a single record. - -# COMMAND ---------- - -from cookbook.data_pipeline.parse_docs import load_files_to_df -from pyspark.sql import functions as F - - -raw_files_df = load_files_to_df( -    spark=spark, -    source_path=source_config.volume_path, -) - -print(f"Loaded {raw_files_df.count()} files from {source_config.volume_path}. Files: {source_config.list_files()}") - -test_records_dict = raw_files_df.toPandas().to_dict(orient="records") - -for record in test_records_dict: -    print() -    print("Testing parsing for file: ", record["path"]) -    print() -    test_result = file_parser(raw_doc_contents_bytes=record['content'], doc_path=record['path'], modification_time=record['modificationTime'], doc_bytes_length=record['length']) -    print(test_result) -    break # pause after 1 file. if you want to test more files, remove the break statement - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC 🚫✏️ The below cell is boilerplate code to apply the parsing function using Spark. - -# COMMAND ---------- - -from cookbook.data_pipeline.parse_docs import ( -    load_files_to_df, -    apply_parsing_fn, -    check_parsed_df_for_errors, -    check_parsed_df_for_empty_parsed_files -) -from cookbook.data_pipeline.utils.typed_dicts_to_spark_schema import typed_dicts_to_spark_schema -from cookbook.databricks_utils import get_table_url - -# Tune this parameter to optimize performance. More partitions will improve performance, but may cause out of memory errors if your cluster is too small. -NUM_PARTITIONS = 50 - -# Load the UC Volume files into a Spark DataFrame -raw_files_df = load_files_to_df( -    spark=spark, -    source_path=source_config.volume_path, -).repartition(NUM_PARTITIONS) - -# Apply the parsing UDF to the Spark DataFrame -parsed_files_df = apply_parsing_fn( -    raw_files_df=raw_files_df, -    # Modify this function to change the parser, extract additional metadata, etc -    parse_file_fn=file_parser, -    # The schema of the resulting Delta Table will follow the schema defined in ParserReturnValue -    parsed_df_schema=typed_dicts_to_spark_schema(ParserReturnValue), -) - -# Write to a Delta Table -parsed_files_df.write.mode("overwrite").option("overwriteSchema", "true").saveAsTable( -    output_config.parsed_docs_table -) - -# Get resulting table -parsed_files_df = spark.table(output_config.parsed_docs_table) -parsed_files_no_errors_df = parsed_files_df.filter( -    parsed_files_df.parser_status == "SUCCESS" -) - -# Show successfully parsed documents -print(f"Parsed {parsed_files_no_errors_df.count()} / {parsed_files_df.count()} documents successfully. Inspect `parsed_files_no_errors_df` or visit {get_table_url(output_config.parsed_docs_table)} to see all parsed documents, including any errors.") -display(parsed_files_no_errors_df.toPandas()) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Show any parsing failures or successfully parsed files that resulted in an empty document.
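Before relying on the boilerplate checks in the next cell, a small triage pass can help group failures by error type. This is an illustrative sketch, not part of the cookbook pipeline; it assumes the `parsed_files_df` DataFrame created above and the `parser_status` / `content` columns defined by `ParserReturnValue`.

```python
# Hypothetical triage helper: summarize parser failures by the start of their error message
# (assumes parsed_files_df from the cell above, with parser_status / content columns).
from pyspark.sql import functions as F

failures_df = (
    parsed_files_df
    .filter(F.col("parser_status") != "SUCCESS")
    .withColumn("error_summary", F.substring("parser_status", 1, 120))
)
failures_df.groupBy("error_summary").count().orderBy(F.desc("count")).show(truncate=False)

# "Successful" parses that produced empty content can also hide quality issues
empty_df = parsed_files_df.filter(
    (F.col("parser_status") == "SUCCESS") & (F.trim(F.col("content")) == "")
)
print(f"{empty_df.count()} documents parsed successfully but produced empty content")
```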
- -# COMMAND ---------- - - -# Any documents that failed to parse -is_error, msg, failed_docs_df = check_parsed_df_for_errors(parsed_files_df) -if is_error: -    display(failed_docs_df.toPandas()) -    raise Exception(msg) - -# Any documents that returned empty parsing results -is_error, msg, empty_docs_df = check_parsed_df_for_empty_parsed_files(parsed_files_df) -if is_error: -    display(empty_docs_df.toPandas()) -    raise Exception(msg) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### Pipeline step 2: Compute chunks of documents -# MAGIC -# MAGIC In this step, we will split our documents into smaller chunks so they can be indexed in our vector database. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC ##### ✅✏️ Chunking logic. -# MAGIC -# MAGIC We provide a default implementation of a recursive text splitter. To create your own chunking logic, adapt the `get_recursive_character_text_splitter()` function inside `cookbook.data_pipeline.recursive_character_text_splitter.py`. - -# COMMAND ---------- - -from cookbook.data_pipeline.recursive_character_text_splitter import ( -    get_recursive_character_text_splitter, -) - -# Get the chunking function -recursive_character_text_splitter_fn = get_recursive_character_text_splitter( -    model_serving_endpoint=chunking_config.embedding_model_endpoint, -    chunk_size_tokens=chunking_config.chunk_size_tokens, -    chunk_overlap_tokens=chunking_config.chunk_overlap_tokens, -) - -# Determine which columns to propagate from the docs table to the chunks table. - -# Get the columns from the parser except for the content -# You can modify this to adjust which fields are propagated from the docs table to the chunks table. -propagate_columns = [ -    field.name -    for field in typed_dicts_to_spark_schema(ParserReturnValue).fields -    if field.name != "content" -] - -# If you want to implement retrieval strategies such as presenting the entire document vs. the chunk to the LLM, include `content`, which contains the doc's full parsed text. By default this is not included because the size of `content` can be quite large and cause performance issues. -# propagate_columns = [ -#     field.name -#     for field in typed_dicts_to_spark_schema(ParserReturnValue).fields -# ] - -# COMMAND ---------- - -# MAGIC %md -# MAGIC 🚫✏️ Run the chunking function within Spark - -# COMMAND ---------- - -from cookbook.data_pipeline.chunk_docs import apply_chunking_fn -from cookbook.databricks_utils import get_table_url - -# Tune this parameter to optimize performance. More partitions will improve performance, but may cause out of memory errors if your cluster is too small. -NUM_PARTITIONS = 50 - -# Load parsed docs -parsed_files_df = spark.table(output_config.parsed_docs_table).repartition(NUM_PARTITIONS) - -chunked_docs_df = chunked_docs_table = apply_chunking_fn( -    # The source documents table. -    parsed_docs_df=parsed_files_df, -    # The chunking function that takes a string (document) and returns a list of strings (chunks). -    chunking_fn=recursive_character_text_splitter_fn, -    # Choose which columns to propagate from the docs table to chunks table. The `doc_uri` column is required so we can propagate the original document URL to the Agent's web app. -    propagate_columns=propagate_columns, -) - -# Write to Delta Table -chunked_docs_df.write.mode("overwrite").option( -    "overwriteSchema", "true" -).saveAsTable(output_config.chunked_docs_table) - -# Get resulting table -chunked_docs_df = spark.table(output_config.chunked_docs_table) - -# Show number of chunks created -print(f"Created {chunked_docs_df.count()} chunks. 
Inspect `chunked_docs_df` or visit {get_table_url(output_config.chunked_docs_table)} to see the results.") - -# enable CDC feed for VS index sync -cdc_results = spark.sql(f"ALTER TABLE {output_config.chunked_docs_table} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)") - -# Show chunks -display(chunked_docs_df.toPandas()) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### 🚫✏️ Pipeline step 3: Create the vector index -# MAGIC -# MAGIC In this step, we'll embed the documents to compute the vector index over the chunks and create our retriever index that will be used to query relevant documents to the user question. The embedding pipeline is handled within Databricks Vector Search using [Delta Sync](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#create-a-vector-search-index) - -# COMMAND ---------- - -from cookbook.data_pipeline.build_retriever_index import build_retriever_index -from cookbook.databricks_utils import get_table_url - -is_error, msg = retriever_index_result = build_retriever_index( - # Spark requires `` to escape names with special chars, VS client does not. - chunked_docs_table_name=output_config.chunked_docs_table.replace("`", ""), - vector_search_endpoint=output_config.vector_search_endpoint, - vector_search_index_name=output_config.vector_index, - - # Must match the embedding endpoint you used to chunk your documents - embedding_endpoint_name=chunking_config.embedding_model_endpoint, - - # Set to true to re-create the vector search endpoint when re-running the data pipeline. If set to True, syncing will not work if re-run the pipeline and change the schema of chunked_docs_table_name. Keeping this as False will allow Vector Search to avoid recomputing embeddings for any row with that has a chunk_id that was previously computed. - force_delete_index_before_create=False, -) -if is_error: - raise Exception(msg) -else: - print("NOTE: This cell will complete before the vector index has finished syncing/embedding your chunks & is ready for queries!") - print(f"View sync status here: {get_table_url(output_config.vector_index)}") - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### 🚫✏️ Print links to view the resulting tables/index - -# COMMAND ---------- - -from cookbook.databricks_utils import get_table_url - -print() -print(f"Parsed docs table: {get_table_url(output_config.parsed_docs_table)}\n") -print(f"Chunked docs table: {get_table_url(output_config.chunked_docs_table)}\n") -print(f"Vector search index: {get_table_url(output_config.vector_index)}\n") \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7c756f50-2063-4a07-b964-e5d6de29abb4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Unstructured data pipeline for the Agent's Retriever\n", + "\n", + "By the end of this notebook, you will have transformed your unstructured documents into a vector index that can be queried by your Agent.\n", + "\n", + "This means:\n", + "- Documents loaded into a delta table.\n", + "- Documents are chunked.\n", + "- Chunks have been embedded with an embedding model and stored in a vector index.\n", + "\n", + "The important resulting artifact of this notebook is the chunked vector index. This will be used in the next notebook to power our Retriever." 
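To make the end goal concrete, the sketch below shows how the finished index is typically queried by a retriever. This is an illustrative snippet rather than part of this notebook: it assumes the `databricks-vectorsearch` Python client, the chunk columns this pipeline produces (`chunk_id`, `content_chunked`, `doc_uri`), and endpoint/index names built from the output configuration later in this notebook.

```python
# Hypothetical downstream query against the index this pipeline creates.
# Endpoint / index names follow the naming convention configured below; adjust to your setup.
from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient()
index = vsc.get_index(
    endpoint_name="one-env-shared-endpoint-3",  # output_config.vector_search_endpoint
    index_name="casaman_ssa.demos.test_product_docs_docs_chunked_index__v2",  # output_config.vector_index
)

# Delta Sync indexes that use a Databricks-managed embedding endpoint accept raw query text
results = index.similarity_search(
    query_text="How does the lakehouse unify data warehousing and AI?",
    columns=["chunk_id", "content_chunked", "doc_uri"],
    num_results=5,
)
print(results["result"]["data_array"])
```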
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d3777205-4dfe-418c-9d21-c67961a18070", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 👉 START HERE: How to Use This Notebook\n", + "\n", + "Follow these steps to build and refine your data pipeline's quality:\n", + "\n", + "1. **Build a v0 index with default settings**\n", + "    - Configure the data source and destination tables in the `1️⃣ 📂 Data source & destination configuration` cells\n", + "    - Press `Run All` to create the vector index.\n", + "\n", + "    *Note: While you can adjust the other settings and modify the parsing/chunking code, we suggest doing so only after evaluating your Agent's quality so you can make improvements that specifically address root causes of quality issues.*\n", + "\n", + "2. **Use later notebooks to integrate the retriever into the agent and evaluate the agent/retriever's quality.**\n", + "\n", + "3. **If the evaluation results show retrieval issues as a root cause, use this notebook to iterate on your data pipeline's code & config.** Below are some potential fixes you can try; see the AI Cookbook's [debugging retrieval issues](https://ai-cookbook.io/nbs/5-hands-on-improve-quality-step-1-retrieval.html) section for details.\n", + "    - Add missing but relevant source documents into the index.\n", + "    - Resolve any conflicting information in source documents.\n", + "    - Adjust the data pipeline configuration:\n", + "      - Modify chunk size or overlap.\n", + "      - Experiment with different embedding models.\n", + "    - Adjust the data pipeline code:\n", + "      - Create a custom parser or use different parsing libraries.\n", + "      - Develop a custom chunker or use different chunking techniques.\n", + "      - Extract additional metadata for each document.\n", + "    - Adjust the Agent's code/config in subsequent notebooks:\n", + "      - Change the number of documents retrieved (K).\n", + "      - Try a re-ranker.\n", + "      - Use hybrid search.\n", + "      - Apply extracted metadata as filters.\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1a6053b9-3135-4097-9ed0-64bdb03a6b9f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "**Important note:** Throughout this notebook, we indicate which cells you:\n", + "- ✅✏️ *should* customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality\n", + "- 🚫✏️ *typically will not* customize - these cells contain boilerplate code required to execute the pipeline\n", + "\n", + "*Cells that don't require customization still need to be run! 
You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "16b35cfd-7c99-4419-8978-33939faf24a6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Install Python libraries (Databricks Notebook only)\n", + "\n", + "🚫✏️ Only modify if you need additional packages in your code changes to the document parsing or chunking logic.\n", + "\n", + "Versions of Databricks code are not locked since Databricks ensures changes are backwards compatible.\n", + "Versions of open source packages are locked since package authors often make backwards compatible changes" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6b4eebb3-448a-4236-99fb-19e44858e3c6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\nlangchain 0.1.20 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain 0.1.20 requires langsmith<0.2.0,>=0.1.17, but you have langsmith 0.2.2 which is incompatible.\nlangchain-community 0.0.38 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain-community 0.0.38 requires langsmith<0.2.0,>=0.1.0, but you have langsmith 0.2.2 which is incompatible.\nlangchain-text-splitters 0.0.2 requires langchain-core<0.3,>=0.1.28, but you have langchain-core 0.3.24 which is incompatible.\nydata-profiling 4.5.1 requires pandas!=1.4.0,<2.1,>1.1, but you have pandas 2.2.3 which is incompatible.\nydata-profiling 4.5.1 requires pydantic<2,>=1.8.1, but you have pydantic 2.10.3 which is incompatible.\u001B[0m\u001B[31m\n\u001B[0m\u001B[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.\u001B[0m\n\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\ndatabricks-agents 0.12.0 requires tiktoken>=0.8.0, but you have tiktoken 0.7.0 which is incompatible.\nlangchain-openai 0.2.12 requires langchain-core<0.4.0,>=0.3.21, but you have langchain-core 0.2.43 which is incompatible.\nlangchain 0.1.20 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.2.43 which is incompatible.\nlangchain 0.1.20 requires langchain-text-splitters<0.1,>=0.0.1, but you have langchain-text-splitters 0.2.0 which is incompatible.\nlangchain-community 0.0.38 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.2.43 which is incompatible.\ntorchvision 0.18.1+cpu requires torch==2.3.1, but you have torch 2.3.0 which is incompatible.\u001B[0m\u001B[31m\n\u001B[0m\u001B[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "%pip install -qqqq -U -r requirements.txt\n", + "%pip install -qqqq -U -r requirements_datapipeline.txt\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bfc3b4a6-cf99-4cd4-bd83-47f16f73e525", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### Connect to Databricks (Local IDE only)\n", + "\n", + "If running from an IDE with [`databricks-connect`](https://docs.databricks.com/en/dev-tools/databricks-connect/python/index.html), connect to a Spark session & install the necessary packages on that cluster." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6ea6d1eb-7d70-4b9e-b608-12258ace7b5d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.databricks_utils import get_cluster_url\n", + "from cookbook.databricks_utils import get_active_cluster_id\n", + "from cookbook.databricks_utils.install_cluster_library import install_requirements\n", + "\n", + "# UNCOMMENT TO INSTALL PACKAGES ON THE ACTIVE CLUSTER; this is code that is not super battle tested.\n", + "# cluster_id = get_active_cluster_id()\n", + "# print(f\"Installing packages on the active cluster: {get_cluster_url(cluster_id)}\")\n", + "\n", + "\n", + "# install_requirements(cluster_id, \"requirements.txt\")\n", + "# install_requirements(cluster_id, \"requirements_datapipeline.txt\")\n", + "\n", + "# THIS MUST BE DONE MANUALLY! 
TODO: Automate it.\n", + "# - Go to openai_sdk_agent_app_sample_code/\n", + "# - Run `poetry build`\n", + "# - Copy the wheel file to a UC Volume or Workspace folder\n", + "# - Go to the cluster's Libraries page and install the wheel file as a new library\n", + "\n", + "# Get Spark session if using Databricks Connect from an IDE\n", + "from mlflow.utils import databricks_utils as du\n", + "\n", + "if not du.is_in_databricks_notebook():\n", + "    from databricks.connect import DatabricksSession\n", + "\n", + "    spark = DatabricksSession.builder.getOrCreate()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7a27dc10-44ae-4489-bc75-0d61c89b4268", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "## 1️⃣ 📂 Data source & destination configuration" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cf8bd6ab-827e-4ba6-805f-091349906ef6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ Configure the data pipeline's source location.\n", + "\n", + "Choose a [Unity Catalog Volume](https://docs.databricks.com/en/volumes/index.html) containing PDF, HTML, etc. documents to be parsed/chunked/embedded.\n", + "\n", + "- `uc_catalog_name`: Name of the Unity Catalog.\n", + "- `uc_schema_name`: Name of the Unity Catalog schema.\n", + "- `uc_volume_name`: Name of the Unity Catalog volume.\n", + "\n", + "Running this cell will validate that the UC Volume exists and try to create it if it does not.\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "59b3efc5-0591-4a44-b88d-184003cabfb6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Volume /Volumes/casaman_ssa/demos/volume_databricks_documentation exists. 
View here: https://adb-984752964297111.11.azuredatabricks.net/explore/data/volumes/casaman_ssa/demos/volume_databricks_documentation\n" + ] + } + ], + "source": [ + "from cookbook.config.data_pipeline.uc_volume_source import UCVolumeSourceConfig\n", + "\n", + "# Configure the UC Volume that contains the source documents\n", + "source_config = UCVolumeSourceConfig(\n", + " # uc_catalog_name=\"REPLACE_ME\", # REPLACE_ME\n", + " # uc_schema_name=\"REPLACE_ME\", # REPLACE_ME\n", + " # uc_volume_name=f\"REPLACE_ME\", # REPLACE_ME\n", + " uc_catalog_name=\"casaman_ssa\", # REPLACE_ME\n", + " uc_schema_name=\"demos\", # REPLACE_ME\n", + " uc_volume_name=\"volume_databricks_documentation\", # REPLACE_ME\n", + ")\n", + "\n", + "# Check if volume exists, create otherwise\n", + "is_valid, msg = source_config.create_or_validate_volume()\n", + "if not is_valid:\n", + " raise Exception(msg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "083e203f-e468-4ce7-b645-31507a36c86b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ Configure the data pipeline's output location.\n", + " \n", + "Choose where the data pipeline outputs the parsed, chunked, and embedded documents.\n", + "\n", + "Required parameters:\n", + "* `uc_catalog_name`: Unity Catalog name where tables will be created\n", + "* `uc_schema_name`: Schema name within the catalog \n", + "* `base_table_name`: Core name used as prefix for all generated tables\n", + "* `vector_search_endpoint`: Vector Search endpoint to store the index\n", + "\n", + "Optional parameters:\n", + "* `docs_table_postfix`: Suffix for the parsed documents table (default: \"docs\")\n", + "* `chunked_table_postfix`: Suffix for the chunked documents table (default: \"docs_chunked\") \n", + "* `vector_index_postfix`: Suffix for the vector index (default: \"docs_chunked_index\")\n", + "* `version_suffix`: Version identifier (e.g. 
'v1', 'test') to maintain multiple versions\n", + "\n", + "The generated tables follow this naming convention:\n", + "* Parsed docs: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{docs_table_postfix}__{version_suffix}\n", + "* Chunked docs: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{chunked_table_postfix}__{version_suffix}\n", + "* Vector index: {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{vector_index_postfix}__{version_suffix}\n", + "\n", + "*Note: If you are comparing different chunking/parsing/embedding strategies, set the `version_suffix` parameter to maintain multiple versions of the pipeline output with the same base_table_name.*\n", + "\n", + "*Databricks suggests sharing a Vector Search endpoint across multiple agents.*" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7345d9f0-5fd4-4545-b23a-d8b737063849", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "All catalogs and schemas exist for parsed_docs_table, chunked_docs_table, and vector_index.\nVector Search endpoint 'one-env-shared-endpoint-3' exists.\n" + ] + } + ], + "source": [ + "from cookbook.config.data_pipeline.data_pipeline_output import DataPipelineOuputConfig\n", + "\n", + "# Output configuration\n", + "output_config = DataPipelineOuputConfig(\n", + " # Required parameters\n", + " uc_catalog_name=source_config.uc_catalog_name, # usually same as source volume catalog, by default is the same as the source volume catalog\n", + " uc_schema_name=source_config.uc_schema_name, # usually same as source volume schema, by default is the same as the source volume schema\n", + " #base_table_name=source_config.uc_volume_name, # usually similar / same as the source volume name; by default, is the same as the volume_name\n", + " base_table_name=\"test_product_docs\", # usually similar / same as the source volume name; by default, is the same as the volume_name\n", + " # vector_search_endpoint=\"REPLACE_ME\", # Vector Search endpoint to store the index\n", + " vector_search_endpoint=\"one-env-shared-endpoint-3\", # Vector Search endpoint to store the index\n", + "\n", + " # Optional parameters, showing defaults\n", + " docs_table_postfix=\"docs\", # default value is `docs`\n", + " chunked_table_postfix=\"docs_chunked\", # default value is `docs_chunked`\n", + " vector_index_postfix=\"docs_chunked_index\", # default value is `docs_chunked_index`\n", + " version_suffix=\"v2\" # default is None\n", + "\n", + " # Output tables / indexes follow this naming convention:\n", + " # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{docs_table_postfix}__{version_suffix}\n", + " # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{chunked_table_postfix}__{version_suffix}\n", + " # {uc_catalog_name}.{uc_schema_name}.{base_table_name}_{vector_index_postfix}__{version_suffix}\n", + ")\n", + "\n", + "# Alternatively, you can directly pass in the UC locations of the tables / indexes\n", + "# output_config = DataPipelineOuputConfig(\n", + "# chunked_docs_table=\"catalog.schema.docs_chunked\",\n", + "# parsed_docs_table=\"catalog.schema.parsed_docs\",\n", + "# vector_index=\"catalog.schema.docs_chunked_index\",\n", + "# vector_search_endpoint=\"REPLACE_ME\",\n", + "# )\n", + "\n", + "# Check UC locations exist\n", + 
"is_valid, msg = output_config.validate_catalog_and_schema()\n", + "if not is_valid:\n", + " raise Exception(msg)\n", + "\n", + "# Check Vector Search endpoint exists\n", + "is_valid, msg = output_config.validate_vector_search_endpoint()\n", + "if not is_valid:\n", + " raise Exception(msg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b5b380e5-1d9a-4c93-b8fe-ec23f00442a9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ Configure chunk size and the embedding model.\n", + "\n", + "**Chunk size and overlap** control how a larger document is turned into smaller chunks that can be processed by an embedding model. See the AI Cookbook [chunking deep dive](https://ai-cookbook.io/nbs/3-deep-dive-data-pipeline.html#chunking) for more details.\n", + "\n", + "**The embedding model** is an AI model that is used to identify the most similar documents to a given user's query. See the AI Cookbook [embedding model deep dive](https://ai-cookbook.io/nbs/3-deep-dive-data-pipeline.html#embedding-model) for more details.\n", + "\n", + "This notebook supports the following [Foundational Models](https://docs.databricks.com/en/machine-learning/foundation-models/index.html) or [External Model](https://docs.databricks.com/en/generative-ai/external-models/index.html) of type `/llm/v1/embeddings`/. If you want to try another model, you will need to modify the `utils/get_recursive_character_text_splitter` Notebook to add support.\n", + "- `databricks-gte-large-en` or `databricks-bge-large-en`\n", + "- Azure OpenAI or OpenAI External Model of type `text-embedding-ada-002`, `text-embedding-3-small` or `text-embedding-3-large`" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "06ee684b-c7bd-4c0e-8fd8-f54416948a5a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Validated serving endpoint databricks-gte-large-en as READY and of type llm/v1/embeddings. View here: https://adb-984752964297111.11.azuredatabricks.net/ml/endpoints/databricks-gte-large-en\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "2024-12-11 17:37:59.493478: I tensorflow/core/platform/cpu_feature_guard.cc:210] This TensorFlow binary is optimized to use available CPU instructions in performance-critical operations.\nTo enable the following instructions: AVX2 AVX512F FMA, in other operations, rebuild TensorFlow with the appropriate compiler flags.\n2024-12-11 17:38:04.299358: W tensorflow/compiler/tf2tensorrt/utils/py_utils.cc:38] TF-TRT Warning: Could not find TensorRT\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Chunk size in tokens: 1024 and chunk overlap in tokens: 256 are valid. 
Using 16.0% (1280 tokens) of the 8192 token context window.\n" + ] + } + ], + "source": [ + "from cookbook.config.data_pipeline.recursive_text_splitter import RecursiveTextSplitterChunkingConfig\n", + "\n", + "chunking_config = RecursiveTextSplitterChunkingConfig(\n", + "    embedding_model_endpoint=\"databricks-gte-large-en\", # A Model Serving endpoint supporting the /llm/v1/embeddings task\n", + "    chunk_size_tokens=1024,\n", + "    chunk_overlap_tokens=256,\n", + ")\n", + "\n", + "# Validate the embedding endpoint & chunking config\n", + "is_valid, msg = chunking_config.validate_embedding_endpoint()\n", + "if not is_valid:\n", + "    raise Exception(msg)\n", + "\n", + "is_valid, msg = chunking_config.validate_chunk_size_and_overlap()\n", + "if not is_valid:\n", + "    raise Exception(msg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6ac12e95-afbc-453d-9090-201ae4587f49", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### 🚫✏️ Write the data pipeline configuration to a YAML\n", + "\n", + "This allows the configuration to be loaded and referenced by the Agent's notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a5244ac6-a0cf-4879-91f2-4bd7c2a59966", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.config.data_pipeline import DataPipelineConfig\n", + "from cookbook.config import serializable_config_to_yaml_file\n", + "\n", + "data_pipeline_config = DataPipelineConfig(\n", + "    source=source_config,\n", + "    output=output_config,\n", + "    chunking_config=chunking_config,\n", + ")\n", + "\n", + "serializable_config_to_yaml_file(data_pipeline_config, \"./configs/data_pipeline_config.yaml\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a28cbf99-c4ca-4adc-905a-e7ebfe015730", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### 🛑 If you are running your initial data pipeline, you do not need to configure anything else; you can just `Run All` the remaining notebook cells. You can modify these cells later to tune the quality of your data pipeline by changing the parsing logic." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "95b6971b-b00b-4f42-bbe8-cc64eea2fff8", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## 3️⃣ ⌨️ Data pipeline code\n", + "\n", + "The code below executes the data pipeline. 
You can modify the below code as indicated to implement different parsing or chunking strategies or to extract additional metadata fields." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c85ddc92-10c5-405c-ae78-8ded5462333e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### Pipeline step 1: Load & parse documents into a Delta Table\n", + "\n", + "In this step, we'll load files from the UC Volume defined in `source_config` into the Delta Table `output_config.parsed_docs_table`. The contents of each file will become a separate row in our delta table.\n", + "\n", + "The path to the source document will be used as the `doc_uri` which is displayed to your end users in the Agent Evaluation web application.\n", + "\n", + "After you test your POC with stakeholders, you can return here to change the parsing logic or extract additional metadata about the documents to help improve the quality of your retriever." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "27466460-1ee7-4fe4-8faf-da9ddff11847", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "##### ✅✏️ Customize the parsing function\n", + "\n", + "This default implementation parses PDF, HTML, and DOCX files using open source libraries. Adjust `file_parser(...)` and `ParserReturnValue` in `cookbook/data_pipeline/default_parser.py` to change the parsing logic, add support for more file types, or extract additional metadata about each document." 
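If you only need to attach extra metadata rather than replace the parser wholesale, a thin wrapper can be enough. The sketch below is hypothetical: `file_parser_with_extension` and the `file_extension` field are illustrative names, and the field must also be added to `ParserReturnValue` so that `typed_dicts_to_spark_schema` includes it in the Delta Table schema; you would then pass the wrapper as `parse_file_fn` in the boilerplate cell further down.

```python
# Hypothetical wrapper around the cookbook's default parser that adds one metadata field.
# Assumes `file_extension: str` is also added to ParserReturnValue in
# cookbook/data_pipeline/default_parser.py so the resulting Delta Table schema picks it up.
import os
from datetime import datetime

from cookbook.data_pipeline.default_parser import file_parser


def file_parser_with_extension(
    raw_doc_contents_bytes: bytes,
    doc_path: str,
    modification_time: datetime,
    doc_bytes_length: int,
) -> dict:
    parsed = file_parser(
        raw_doc_contents_bytes=raw_doc_contents_bytes,
        doc_path=doc_path,
        modification_time=modification_time,
        doc_bytes_length=doc_bytes_length,
    )
    # Extra metadata that could later be propagated to the chunks table and used as a retrieval filter
    parsed["file_extension"] = os.path.splitext(doc_path)[1].lower()
    return parsed
```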
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d09fd38c-5b7b-47c5-aa6a-ff571ce2f83b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "class ParserReturnValue(TypedDict):\n # DO NOT CHANGE THESE NAMES\n # Parsed content of the document\n content: str # do not change this name\n # The status of whether the parser succeeds or fails, used to exclude failed files downstream\n parser_status: str # do not change this name\n # Unique ID of the document\n doc_uri: str # do not change this name\n\n # OK TO CHANGE THESE NAMES\n # Optionally, you can add additional metadata fields here\n # example_metadata: str\n last_modified: datetime\n\ndef file_parser(\n raw_doc_contents_bytes: bytes,\n doc_path: str,\n modification_time: datetime,\n doc_bytes_length: int,\n) -> ParserReturnValue:\n \"\"\"\n Parses the content of a PDF document into a string.\n\n This function takes the raw bytes of a PDF document and its path, attempts to parse the document using PyPDF,\n and returns the parsed content and the status of the parsing operation.\n\n Parameters:\n - raw_doc_contents_bytes (bytes): The raw bytes of the document to be parsed (set by Spark when loading the file)\n - doc_path (str): The DBFS path of the document, used to verify the file extension (set by Spark when loading the file)\n - modification_time (timestamp): The last modification time of the document (set by Spark when loading the file)\n - doc_bytes_length (long): The size of the document in bytes (set by Spark when loading the file)\n\n Returns:\n - ParserReturnValue: A dictionary containing the parsed document content and the status of the parsing operation.\n The 'contenty will contain the parsed text as a string, and the 'parser_status' key will indicate\n whether the parsing was successful or if an error occurred.\n \"\"\"\n try:\n from markdownify import markdownify as md\n\n filename, file_extension = os.path.splitext(doc_path)\n\n if file_extension == \".pdf\":\n pdf_doc = fitz.Document(stream=raw_doc_contents_bytes, filetype=\"pdf\")\n md_text = pymupdf4llm.to_markdown(pdf_doc)\n\n parsed_document = {\n \"content\": md_text.strip(),\n \"parser_status\": \"SUCCESS\",\n }\n elif file_extension == \".html\":\n html_content = raw_doc_contents_bytes.decode(\"utf-8\")\n\n markdown_contents = md(\n str(html_content).strip(), heading_style=markdownify.ATX\n )\n markdown_stripped = re.sub(r\"\\n{3,}\", \"\\n\\n\", markdown_contents.strip())\n\n parsed_document = {\n \"content\": markdown_stripped,\n \"parser_status\": \"SUCCESS\",\n }\n elif file_extension == \".docx\":\n with tempfile.NamedTemporaryFile(delete=True) as temp_file:\n temp_file.write(raw_doc_contents_bytes)\n temp_file_path = temp_file.name\n md = pypandoc.convert_file(temp_file_path, \"markdown\", format=\"docx\")\n\n parsed_document = {\n \"content\": md.strip(),\n \"parser_status\": \"SUCCESS\",\n }\n elif file_extension in [\".txt\", \".md\"]:\n parsed_document = {\n \"content\": raw_doc_contents_bytes.decode(\"utf-8\").strip(),\n \"parser_status\": \"SUCCESS\",\n }\n elif file_extension in [\".json\", \".jsonl\"]:\n # NOTE: This is a placeholder for a JSON parser. 
It's not a \"real\" parser, it just returns the raw JSON formatted into XML-like strings that LLMs tend to like.\n json_data = json.loads(raw_doc_contents_bytes.decode(\"utf-8\"))\n\n def flatten_json_to_xml(obj, parent_key=\"\"):\n xml_parts = []\n if isinstance(obj, dict):\n for key, value in obj.items():\n if isinstance(value, (dict, list)):\n xml_parts.append(flatten_json_to_xml(value, key))\n else:\n xml_parts.append(f\"<{key}>{str(value)}\")\n elif isinstance(obj, list):\n for i, item in enumerate(obj):\n if isinstance(item, (dict, list)):\n xml_parts.append(\n flatten_json_to_xml(item, f\"{parent_key}_{i}\")\n )\n else:\n xml_parts.append(\n f\"<{parent_key}_{i}>{str(item)}\"\n )\n else:\n xml_parts.append(f\"<{parent_key}>{str(obj)}\")\n return \"\\n\".join(xml_parts)\n\n flattened_content = flatten_json_to_xml(json_data)\n parsed_document = {\n \"content\": flattened_content.strip(),\n \"parser_status\": \"SUCCESS\",\n }\n else:\n raise Exception(f\"No supported parser for {doc_path}\")\n\n # Extract the required doc_uri\n # convert from `dbfs:/Volumes/catalog/schema/pdf_docs/filename.pdf` to `/Volumes/catalog/schema/pdf_docs/filename.pdf`\n modified_path = urlparse(doc_path).path\n parsed_document[\"doc_uri\"] = modified_path\n\n # Sample metadata extraction logic\n # if \"test\" in parsed_document[\"content\n # parsed_document[\"example_metadata\"] = \"test\"\n # else:\n # parsed_document[\"example_metadata\"] = \"not test\"\n\n # Add the modified time\n parsed_document[\"last_modified\"] = modification_time\n\n return parsed_document\n\n except Exception as e:\n status = f\"An error occurred: {e}\\n{traceback.format_exc()}\"\n warnings.warn(status)\n return {\n \"content\": \"\",\n \"parser_status\": f\"ERROR: {status}\",\n }\n\n" + ] + } + ], + "source": [ + "from cookbook.data_pipeline.default_parser import file_parser, ParserReturnValue\n", + "\n", + "# Print the code of file_parser function for inspection\n", + "import inspect\n", + "print(inspect.getsource(ParserReturnValue))\n", + "print(inspect.getsource(file_parser))\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "61034803-4bdd-4f0b-b173-a82448ee1790", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "The below cell is debugging code to test your parsing function on a single record. " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "48a3ab67-2e30-4e39-b05e-3a8ff304fd5b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading the raw files from /Volumes/casaman_ssa/demos/volume_databricks_documentation...\nLoaded 29 files from /Volumes/casaman_ssa/demos/volume_databricks_documentation. 
Files: ['databricks-pdf/']\n\nTesting parsing for file: dbfs:/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\n\n{'content': '**eBook**\\n\\n## The Data Team’s Guide to the Databricks Lakehouse Platform\\n\\n\\n-----\\n\\n#### Contents\\n\\n\\n**C H A P TE R 1**\\n\\n**C H A P TE R 2**\\n\\n**C H A P TE R 3**\\n\\n**C H A P TE R 4**\\n\\n**C H A P TE R 5**\\n\\n**C H A P TE R 6**\\n\\n**C H A P TE R 7**\\n\\n**C H A P TE R 8**\\n\\n**C H A P TE R 9**\\n\\n**C H A P TE R 10**\\n\\n**C H A P TE R 11**\\n\\n**C H A P TE R 12**\\n\\n\\n**The data lakehouse** ...................................................................................................................................................................................... **4**\\n\\n**The Databricks Lakehouse Platform** .......................................................................................................................... **11**\\n\\n**Data reliability and performance** ................................................................................................................................... **18**\\n\\n**Unified governance and sharing for data, analytics and AI** ....................................... **28**\\n\\n**Security** .............................................................................................................................................................................................................................. **41**\\n\\n**Instant compute and serverless** ................................................................................................................................... **48**\\n\\n**Data warehousing** ......................................................................................................................................................................................... **52**\\n\\n**Data engineering** ............................................................................................................................................................................................. **56**\\n\\n**Data streaming** .................................................................................................................................................................................................. **68.**\\n\\n**Data science and machine learning** ........................................................................................................................ **7** **3.**\\n\\n**Databricks Technology Partners and the modern data stack** ............................ **7** **9.**\\n\\n**Get started with the Databricks Lakehouse Platform** ....................................................... 
**8** **1**\\n\\n\\n-----\\n\\n**I N T R O D U C T I O N**\\n\\n#### The Data Team’s Guide to the Databricks Lakehouse Platform\\n\\n_The Data Team’s Guide to the Databricks Lakehouse Platform_ is\\ndesigned for data practitioners and leaders who are embarking\\non their journey into the data lakehouse architecture.\\n\\nIn this eBook, you will learn the full capabilities of the data lakehouse architecture\\nand how the Databricks Lakehouse Platform helps organizations of all sizes — from\\nenterprises to startups in every industry — with all their data, analytics, AI and\\nmachine learning use cases on one platform.\\n\\nYou will see how the platform combines the best elements of data warehouses\\nand data lakes to increase the reliability, performance and scalability of your\\ndata platform. Discover how the lakehouse simplifies complex workloads in data\\nengineering, data warehousing, data streaming, data science and machine learning\\n— and bolsters collaboration for your data teams, allowing them to maintain new\\nlevels of governance, flexibility and agility in an open and multicloud environment.\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n### The data lakehouse\\n# 01\\n\\n\\n-----\\n\\n#### The evolution of data architectures\\n\\n\\nData has moved front and center within every organization as data-driven insights\\nhave fueled innovation, competitive advantage and better customer experiences.\\n\\nHowever, as companies place mandates on becoming more data-driven,\\ntheir data teams are left in a sprint to deliver the right data for business\\ninsights and innovation. With the widespread adoption of cloud, data teams\\noften invest in large-scale complex data systems that have capabilities for\\nstreaming, business intelligence, analytics and machine learning to support\\nthe overall business objectives.\\n\\nTo support these objectives, data teams have deployed cloud data\\n\\nwarehouses and data lakes.\\n\\n\\nTraditional data systems: The data warehouse and data lake\\n\\nWith the advent of big data, companies began collecting large amounts of\\ndata from many different sources, such as weblogs, sensor data and images.\\nData warehouses — which have a long history as the foundation for decision\\nsupport and business intelligence applications — cannot handle large volumes\\nof data.\\n\\nWhile data warehouses are great for structured data and historical analysis,\\nthey weren’t designed for unstructured data, semi-structured data, and data\\nwith high variety, velocity and volume, making them unsuitable for many types\\nof data.\\n\\nThis led to the introduction of data lakes, providing a single repository of raw\\ndata in a variety of formats. While suitable for storing big data, data lakes do\\nnot support transactions, nor do they enforce data quality, and their lack of\\nconsistency/isolation makes it almost impossible to read, write or process data.\\n\\nFor these reasons, many of the promises of data lakes never materialized and,\\nin many cases, reduced the benefits of data warehouses.\\n\\nAs companies discovered new use cases for data exploration, predictive modeling\\nand prescriptive analytics, the need for a single, flexible, high-performance system\\nonly grew. 
Data teams require systems for diverse data applications including SQL\\nanalytics, real-time analytics, data science and machine learning.\\n\\n\\n-----\\n\\nTo solve for new use cases and new users, a common approach is to use multiple\\nsystems — a data lake, several data warehouses and other specialized systems\\nsuch as streaming, time-series, graph and image databases. But having multiple\\nsystems introduces complexity and delay, as data teams invariably need to\\nmove or copy data between different systems, effectively losing oversight and\\ngovernance over data usage.\\n\\n\\nYou have now duplicated data in two different systems and the changes you\\nmake in one system are unlikely to find their way to the other. So, you are going\\nto have data drift almost immediately, not to mention paying to store the same\\ndata multiple times.\\n\\nThen, because governance is happening at two distinct levels across these\\nplatforms, you are not able to control things consistently.\\n\\n\\n**Challenges with data, analytics and AI**\\n\\nIn a recent [Accenture](https://www.accenture.com/_acnmedia/pdf-108/accenture-closing-data-value-gap-fixed.pdf) study, only 32% of companies reported tangible and\\nmeasurable value from data. The challenge is that most companies continue to\\nimplement two different platforms: data warehouses for BI and data lakes for AI.\\nThese platforms are incompatible with each other, but data from both systems\\nis generally needed to deliver game-changing outcomes, which makes success\\nwith AI extremely difficult.\\n\\nToday, most of the data is landing in the data lake, and a lot of it is unstructured.\\nIn fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321) , about 80% of the data in any organization will be\\nunstructured by 2025. But, this data is where much of the value from AI resides.\\nSubsets of the data are then copied to the data warehouse into structured\\ntables, and back again in some cases.\\n\\nYou also must secure and govern the data in both warehouses and offer\\nfine-grained governance, while lakes tend to be coarser grained at the file level.\\nThen, you stand up different stacks of tools on these platforms to do either\\nBI or AI.\\n\\n\\n-----\\n\\nFinally, the tool stacks on top of these platforms\\nare fundamentally different, which makes it difficult\\nto get any kind of collaboration going between the\\nteams that support them.\\n\\nThis is why AI efforts fail. There is a tremendous\\namount of complexity and rework being introduced\\ninto the system. 
Time and resources are being\\nwasted trying to get the right data to the right\\npeople, and everything is happening too slowly\\nto get in front of the competition.\\n\\n\\n**Realizing this requires two disparate,**\\n**incompatible data platforms**\\n\\n\\n**Business** **SQL** **Incomplete** **Data science** **Data**\\n\\n**support for**\\n\\n**intelligence** **analytics** **and ML** **streaming**\\n\\n\\n**SQL**\\n**analytics**\\n\\n\\n**Incomplete**\\n**support for**\\n**use cases**\\n\\n\\n**Incompatible**\\n**security and**\\n**governance models**\\n\\n**Copy subsets of data**\\n\\n\\n\\n|Col1|Col2|Col3|Col4|\\n|---|---|---|---|\\n|Governa T|n a|c b|e and security le ACLs|\\n|||||\\n\\n|Col1|Col2|Col3|Col4|\\n|---|---|---|---|\\n|Governa File|n s|c a|e and security nd blobs|\\n|||||\\n\\n\\n**Disjointed**\\n**and duplicative**\\n\\n**Data warehouse** **data silos** **Data lake**\\nStructured tables Unstructured files:\\nlogs, text, images, video\\n\\n\\n-----\\n\\n**Moving forward with a lakehouse architecture**\\n\\nTo satisfy the need to support AI and BI directly on vast amounts of data stored\\nin data lakes (on low-cost cloud storage), a new data management architecture\\nemerged independently across many organizations and use cases: the\\ndata lakehouse.\\n\\nThe data lakehouse can store _all_ and _any_ type of data once in a data lake and\\nmake that data accessible directly for AI and BI. The lakehouse paradigm has\\nspecific capabilities to efficiently allow both AI and BI on all the enterprise’s data\\nat a massive scale. Namely, it has the SQL and performance capabilities such as\\nindexing, caching and MPP processing to make BI work fast on data lakes. It also\\nhas direct file access and direct native support for Python, data science and AI\\nframeworks without the need for a separate data warehouse.\\n\\nIn short, a lakehouse is a data architecture that combines the best elements\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\ndesign, which implements similar data structures and data management features\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\n\\n\\n-----\\n\\n##### Data lakehouse\\n\\nOne platform to unify all your data, analytics and AI workloads\\n\\n###### Lakehouse Platform\\n\\nAll machine learning, SQL,\\nBI, and streaming use cases\\n\\nOne security and governance\\napproach for all data assets\\non all clouds\\n\\n\\n-----\\n\\n**Key features for a lakehouse**\\n\\nRecent innovations with the data lakehouse architecture can help simplify\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\nthe kind of flexibility and openness that allows your organization to stay agile\\nas you scale. Here are key features to consider when evaluating data lakehouse\\narchitectures:\\n\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\nConsistency, Isolation and Durability) transactions ensures consistency as\\nmultiple parties concurrently read or write data.\\n\\nSchema enforcement and governance: The lakehouse should have\\na way to support schema enforcement and evolution, supporting data\\nwarehouse schema paradigms such as star/snowflake. 
The system should\\nbe able to reason about data integrity, and it should have robust governance\\nand auditing mechanisms.\\n\\nData governance: Capabilities including auditing, retention and lineage\\nhave become essential, particularly considering recent privacy regulations.\\n\\nTools that allow data discovery have become popular, such as data catalogs\\nand data usage metrics.\\n\\nBI support: Lakehouses allow the use of BI tools directly on the source\\ndata. This reduces staleness and latency, improves recency and lowers cost\\nby not having to operationalize two copies of the data in both a data lake\\nand a warehouse.\\n\\n\\nStorage decoupled from compute: In practice, this means storage and\\ncompute use separate clusters, thus these systems can scale to many more\\nconcurrent users and larger data sizes. Some modern data warehouses also\\nhave this property.\\n\\nOpenness: The storage formats, such as Apache Parquet, are open and\\nstandardized, so a variety of tools and engines, including machine learning\\nand Python/R libraries, can efficiently access the data directly.\\n\\nSupport for diverse data types (unstructured and structured):\\nThe lakehouse can be used to store, refine, analyze and access data types\\nneeded for many new data applications, including images, video, audio,\\nsemi-structured data and text.\\n\\nSupport for diverse workloads: Use the same data repository for a range\\nof workloads including data science, machine learning and SQL analytics.\\nMultiple tools might be needed to support all these workloads.\\n\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\nSupport for streaming eliminates the need for separate systems dedicated to\\nserving real-time data applications.\\n\\n**Learn more**\\n\\n**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\n\\n**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\n\\n**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n# 02\\n\\n\\n### The Databricks Lakehouse Platform\\n\\n\\n-----\\n\\n#### Lakehouse: A new generation of open platforms\\n\\n\\n###### This is the lakehouse paradigm\\n\\n\\nDatabricks is the inventor and pioneer of the\\ndata lakehouse architecture. The data lakehouse\\narchitecture was coined in the research paper,\\n[Lakehouse: A New Generation of Open Platforms that](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\n[Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) ,\\nintroduced by Databricks’ founders, UC Berkeley\\nand Stanford University at the 11th Conference on\\nInnovative Data Systems Research (CIDR) in 2021.\\n\\nAt Databricks, we are continuously innovating on\\nthe lakehouse architecture to help customers deliver\\non their data, analytics and AI aspirations. The ideal\\ndata, analytics and AI platform needs to operate\\ndifferently. 
Rather than copying and transforming\\ndata in multiple systems, you need one platform\\nthat accommodates all data types.\\n\\n\\n**Data science** **Data**\\n**and ML** **streaming**\\n\\n\\n**All ML, SQL, BI**\\n**and streaming use cases**\\n\\n**One security and governance**\\n**approach for all data assets**\\n**on all clouds**\\n\\n**A reliable data platform**\\n**to efficiently handle**\\n**all data types**\\n\\n\\n**Persona-based**\\n**use cases**\\n\\n**Unity Catalog**\\nFine-grained governance\\nfor data and AI\\n\\n**Delta Lake**\\nData reliability and performance\\n\\n\\n**Business**\\n**intelligence**\\n\\n\\n**SQL**\\n**analytics**\\n\\n\\nFiles and blobs and table ACLs\\n\\n\\nIdeally, the platform must be open, so that you\\nare not locked into any walled gardens. You would\\nalso have one security and governance model.\\nIt would not only manage all data types, but it\\nwould also be cloud-agnostic to govern data\\nwherever it is stored.\\n\\nLast, it would support all major data, analytics and AI\\nworkloads, so that your teams can easily collaborate\\nand get access to all the data they need to innovate.\\n\\n\\n-----\\n\\n#### What is the Databricks Lakehouse Platform?\\n\\nThe Databricks Lakehouse Platform unifies your\\ndata warehousing and AI uses cases on a single\\nplatform. It combines the best elements of data\\nlakes and data warehouses to deliver the reliability,\\nstrong governance and performance of data\\nwarehouses with the openness, flexibility and\\nmachine learning support of data lakes.\\n\\nThis unified approach simplifies your modern data\\nstack by eliminating the data silos that traditionally\\nseparate and complicate data engineering, analytics,\\nBI, data science and machine learning. It’s built\\non open source and open standards to maximize\\nflexibility. And, its common approach to data\\nmanagement, security and governance helps you\\n\\noperate more efficiently and innovate faster.\\n\\n\\n**Lakehouse Platform**\\n\\nData Data Data Data science\\nwarehousing engineering streaming and ML\\n\\n\\n-----\\n\\n#### Benefits of the Databricks Lakehouse Platform\\n\\n\\n**Simple**\\n\\nThe unified approach simplifies your data\\narchitecture by eliminating the data silos that\\ntraditionally separate analytics, BI, data science\\nand machine learning. With a lakehouse, you\\ncan eliminate the complexity and expense that\\nmake it hard to achieve the full potential of\\nyour analytics and AI initiatives.\\n\\n\\n**Open**\\n\\nDelta Lake forms the open foundation of\\nthe lakehouse by providing reliability and\\nperformance directly on data in the data\\nlake. You’re able to avoid proprietary walled\\ngardens, easily share data and build your\\nmodern data stack with unrestricted access\\nto the ecosystem of open source data projects\\nand the broad Databricks partner network.\\n\\n\\n**Multicloud**\\n\\nThe Databricks Lakehouse Platform offers\\nyou a consistent management, security and\\ngovernance experience across all clouds. You\\ndo not need to invest in reinventing processes\\nfor every cloud platform that you are using to\\nsupport your data and AI efforts. 
Instead, your\\ndata teams can simply focus on putting all\\nyour data to work to discover new insights.\\n\\n\\n-----\\n\\n#### The Databricks Lakehouse Platform architecture\\n\\n**Data reliability and performance for lakehouse**\\n\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format storage layer built for the lakehouse that integrates\\nwith all major analytics tools and works with the widest variety of formats to\\nstore and process data.\\n\\n\\n**Instant compute and serverless**\\n\\nServerless compute is a fully managed service where Databricks provisions and\\nmanages the compute layer on behalf of the customer in the Databricks cloud\\naccount instead of the customer account. As of the current release, serverless\\ncompute is supported for use with Databricks SQL.\\n\\nIn Chapter 6, we explore the details of instant compute and serverless for lakehouse.\\n\\n\\n[Photon](https://databricks.com/product/photon) is the next-generation query engine built for the lakehouse that leverages\\na state-of-the-art vectorized engine for fast querying and provides the best\\nperformance for all workloads in the lakehouse.\\n\\nIn Chapter 3, we explore the details of data reliability and performance\\n\\nfor the lakehouse.\\n\\n**Unified governance and security for lakehouse**\\n\\nThe Databricks Lakehouse Platform provides unified governance with enterprise\\nscale, security and compliance. The [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (UC) provides\\ngovernance for your data and AI assets in the lakehouse — files, tables,\\ndashboards, and machine learning models — giving you much better control,\\nmanagement and security across clouds.\\n\\n[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\\ndata across the organization in real time, independent of the platform\\non which the data resides.\\n\\nIn Chapter 4, we go into the details of unified governance for lakehouse\\n\\nand, in Chapter 5, we dive into the details of security for lakehouse.\\n\\n\\n-----\\n\\n#### The Databricks Lakehouse Platform workloads\\n\\nThe Databricks Lakehouse Platform architecture supports different workloads\\nsuch as data warehousing, data engineering, data streaming, data science and\\nmachine learning on one simple, open and multicloud data platform.\\n\\n**Data warehousing**\\n\\nData warehousing is one of the most business-critical workloads for data teams,\\nand the best data warehouse is a lakehouse. The Databricks Lakehouse Platform\\nlets you run all your SQL and BI applications at scale with up to 12x better price/\\nperformance, a unified governance model, open formats and APIs, and your tools\\nof choice — no lock-in. 
Reduce resource management overhead with serverless\\ncompute, and easily ingest, transform and query all your data in-place to deliver\\nreal-time business insights faster.\\n\\nBuilt on open standards and APIs, the Databricks Lakehouse Platform provides\\nthe reliability, quality and performance that data lakes natively lack, plus\\nintegrations with the ecosystem for maximum flexibility.\\n\\nIn Chapter 7, we go into the details of data warehousing on the lakehouse.\\n\\n**Data engineering**\\n\\nData engineering on the lakehouse allows data teams to unify batch and\\nstreaming operations on a simplified architecture, streamline data pipeline\\ndevelopment and testing, build reliable data, analytics and AI workflows on\\nany cloud platform, and meet regulatory requirements to maintain governance.\\n\\n\\nautomates the complexity of building and maintaining pipelines and running ETL\\nworkloads so data engineers and analysts can focus on quality and reliability to\\ndrive valuable insights.\\n\\nIn Chapter 8, we go into the details of data engineering on the lakehouse.\\n\\n**Data streaming**\\n\\n[Data streaming](https://www.databricks.com/product/data-streaming) is one of the fastest growing workloads within the Databricks\\nLakehouse Platform and is the future of all data processing. Real-time processing\\nprovides the freshest possible data to an organization’s analytics and machine\\nlearning models enabling them to make better, faster decisions, more accurate\\npredictions, offer improved customer experiences and more.\\n\\nThe Databricks Lakehouse Platform Dramatically simplifies data streaming to\\ndeliver real-time analytics, machine learning and applications on one platform.\\n\\nIn Chapter 9, we go into the details of data streaming on the lakehouse.\\n\\n**Data science and machine learning**\\n\\nData science and machine learning (DSML) on the lakehouse is a powerful\\nworkload that is unique to many other data offerings. DSML on the lakehouse\\nprovides a data-native and collaborative solution for the full ML lifecycle. It\\ncan maximize data and ML team productivity, streamline collaboration, empower\\nML teams to prepare, process and manage data in a self-service manner,\\nand standardize the ML lifecycle from experimentation to production.\\n\\nIn Chapter 10, we go into the details of DSML on the lakehouse.\\n\\n\\nThe lakehouse provides an end-to-end data engineering and ETL platform that\\n\\n\\n-----\\n\\n**Databricks Lakehouse Platform and your**\\n**modern data stack**\\n\\nThe Databricks Lakehouse Platform is open and provides the flexibility to\\ncontinue using existing infrastructure, to easily share data and build your modern\\ndata stack with unrestricted access to the ecosystem of open source data\\nprojects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\\n\\nIn Chapter 11, we go into the details of our technology partners and the\\n\\nmodern data stack.\\n\\n#### Global adoption of the Databricks Lakehouse Platform\\n\\n\\nToday, Databricks has over 7,000 [customers](https://databricks.com/customers) , from Fortune 500 to unicorns\\nacross industries doing transformational work. Organizations around the globe\\nare driving change and delivering a new generation of data, analytics and AI\\napplications. 
We believe that the unfulfilled promise of data and AI can finally\\nbe fulfilled with one platform for data analytics, data science and machine\\nlearning with the Databricks Lakehouse Platform.\\n\\n\\n**Learn more**\\n\\n[Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)\\n\\n[Databricks Lakehouse Platform Demo Hub](https://databricks.com/discover/demos)\\n\\n[Databricks Lakehouse Platform Customer Stories](https://databricks.com/customers)\\n\\n[Databricks Lakehouse Platform Documentation](https://databricks.com/documentation)\\n\\n[Databricks Lakehouse Platform Training and Certification](https://databricks.com/learn/training/home)\\n\\n[Databricks Lakehouse Platform Resources](https://databricks.com/resources)\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n# 03\\n\\n\\n### Data reliability and performance\\n\\nTo bring openness, reliability and lifecycle management to data lake\n\n*** WARNING: max output size exceeded, skipping output. ***\n\nple task orchestration from the underlying\\ndata processing platform reduce the overall reliability of their production\\nworkloads, limit observability, and increase complexity for end users.\\n\\n#### What is Databricks Workflows?\\n\\n[Databricks Workflows](https://databricks.com/product/workflows) is the first fully managed and integrated lakehouse\\n[orchestration](https://databricks.com/glossary/orchestration) service that allows data teams to build reliable workflows on\\nany cloud.\\n\\n\\nWorkflows lets you orchestrate data flow pipelines (written in DLT or dbt),\\nas well as machine learning pipelines, or any other tasks such as notebooks\\nor Python wheels. Since Databricks Workflows is fully managed, it eliminates\\noperational overhead for data engineers, enabling them to focus on your\\nworkflows not on managing your infrastructure. It provides an easy point-and-click\\nauthoring experience for all your data teams, not just those with specialized skills.\\nDeep integration with the underlying lakehouse platform ensures you will create\\nand run reliable production workloads on any cloud while providing deep and\\ncentralized monitoring with simplicity for end users.\\n\\nSharing job clusters over multiple tasks reduces the time a job takes, reduces\\ncosts by eliminating overhead and increases cluster utilization with parallel tasks.\\n\\n\\n-----\\n\\nDatabricks Workflows’ deep integration with the lakehouse can best be seen with its monitoring and observability features. The matrix view in the following graphic\\nshows a history of runs for a job. Failed tasks are marked in red. A failed job can be repaired and rerun with the click of a button. Rerunning a failed task detects and\\ntriggers the execution of all dependent tasks.\\n\\nYou can create workflows with the UI, but also through the Databricks Workflows API, or with external orchestrators such as Apache Airflow. 
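For illustration, here is a minimal sketch of creating a two-task workflow programmatically instead of through the UI, using the Jobs 2.1 `jobs/create` REST endpoint. The workspace URL, access token, cluster ID, job name and notebook paths are all placeholders, and the job itself is hypothetical.

```python
# Hypothetical sketch: create a two-task workflow (ingest -> transform) via the
# Jobs 2.1 REST API. Host, token, cluster ID and notebook paths are placeholders.
import os
import requests

host = os.environ["DATABRICKS_HOST"]    # e.g. "https://<workspace>.cloud.databricks.com"
token = os.environ["DATABRICKS_TOKEN"]  # a personal access token

job_spec = {
    "name": "daily-sentiment-pipeline",
    "tasks": [
        {
            "task_key": "ingest",
            "notebook_task": {"notebook_path": "/Repos/demo/ingest_tweets"},
            "existing_cluster_id": "<cluster-id>",
        },
        {
            "task_key": "transform",
            "depends_on": [{"task_key": "ingest"}],  # run after the ingest task succeeds
            "notebook_task": {"notebook_path": "/Repos/demo/transform"},
            "existing_cluster_id": "<cluster-id>",
        },
    ],
}

response = requests.post(
    f"{host}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {token}"},
    json=job_spec,
)
response.raise_for_status()
print("Created job:", response.json()["job_id"])
```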
Even if you are using an\\n\\nexternal orchestrator, Databricks Workflows’ monitoring acts as a single pane of glass that includes externally triggered workflows.\\n\\n\\n-----\\n\\n#### Orchestrate anything\\n\\nRemember that DLT is one of many task types for Databricks Workflows.\\nThis is where the managed data flow pipelines with DLT tie together with\\nthe easy point-and-click authoring experience of Databricks Workflows.\\n\\nIn the following example, you can see an end-to-end workflow built with\\ncustomers in a workshop: Data is streamed from Twitter according to search\\nterms, then ingested with Auto Loader using automatic schema detection and\\nenforcement. In the next step, the data is cleaned and transformed with Delta\\nLive table pipelines written in SQL, and finally run through a pre-trained BERT\\nlanguage model from Hugging Face for sentiment analysis of the tweets.\\nDifferent task types for ingest, cleanse/transform and ML are combined\\nin a single workflow.\\n\\nUsing Workflows, these tasks can be scheduled to provide a daily overview of\\nsocial media coverage and customer sentiment for a business. After streaming\\ntweets with filtering for keywords such as “data engineering,” “lakehouse” and\\n“Delta Lake,” we curated a list of those tweets that were classified as positive\\nwith the highest probability score.\\n\\n**Learn more**\\n\\n\\n[Data Engineering on the](https://databricks.com/solutions/data-pipelines)\\n[Lakehouse](https://databricks.com/solutions/data-pipelines)\\n\\n\\n[Delta Live Tables](https://databricks.com/product/delta-live-tables)\\n\\n[Databricks Workflows](https://www.databricks.com/product/workflows)\\n\\n\\n[Big Book of Data Engineering](https://databricks.com/p/ebook/the-big-book-of-data-engineering?itm_data=datapipelines-promo-bigbookofde)\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n### Data streaming\\n# 09\\n\\n\\n**CHAPTER**\\n\\n\\nThere are two types of data processing: batch processing\\nand streaming processing.\\n\\n\\nBatch processing refers to the discontinuous, periodic processing\\nof data that has been stored for a period of time. For example,\\nan organization may need to run weekly reports on a set of\\npredictable transaction data. There is no need for this data\\nto be streaming — it can be processed on a weekly basis.\\n\\nStreaming processing, on the other hand, refers to unbounded\\nprocessing of data as it arrives.\\n\\n\\n-----\\n\\n**Data Streaming Challenges**\\n\\nHowever, getting value from streaming data can be a tricky practice. While most\\ndata today can be considered streaming data, organizations are overwhelmed by\\nthe need to access, process and analyze the volume, speed and variety of this\\ndata moving through their platforms. To keep pace with innovation, they must\\nquickly make sense of data streams decisively, consistently and in real time.\\n\\nThree common technical challenges organizations experience\\nwith implementing real-time data streaming include:\\n\\n**•** **Specialized APIs and language skills:** Data practitioners encounter\\nbarriers to adopting streaming skillsets because there are new languages,\\nAPIs and tools to learn.\\n\\n**•** **Operational complexity:** To implement data streaming at scale, data\\nteams need to integrate and manage streaming-specific tools with\\ntheir other cloud services. 
They also have to manually build complex\\noperational tooling to help these systems recover from failure, restart\\nworkloads without reprocessing data, optimize performance, scale the\\nunderlying infrastructure, and so on.\\n\\n**•** **Incompatible governance models:** Different governance and security\\nmodels across real-time and historical data platforms makes it difficult\\nto provide the right access to the right users, see the end-to-end data\\nlineage, and/or meet compliance requirements.\\n\\n\\nIn a wide variety of cases, an organization might find it useful to\\nleverage streaming data. Here are some common examples:\\n\\n**•** **Retail:** Real-time inventory updates help support business activities, such\\nas inventory and pricing optimization and optimization of the supply chain,\\nlogistics and just-in-time delivery.\\n\\n**•** **Smart energy:** Smart meter monitoring in real time allows for smart\\nelectricity pricing models and connection with renewable energy sources\\nto optimize power generation and distribution.\\n\\n**•** **Preventative maintenance:** By reducing unplanned outages and\\nunnecessary site and maintenance visits, real-time streaming analytics can\\nlower operational and equipment costs.\\n\\n**•** **Industrial automation:** Manufacturers can use streaming and predictive\\nanalytics to improve production processes and product quality, including\\nsetting up automated alerts.\\n\\n**•** **Healthcare:** To optimize care recommendations, real-time data allows\\nfor the integration of various smart sensors to monitor patient condition,\\nmedication levels and even recovery speed.\\n\\n**•** **Financial institutions:** Firms can conduct real-time analysis of\\n\\ntransactions to detect fraudulent transactions and send alerts. They\\ncan use fraud analytics to identify patterns and feed data into machine\\nlearning algorithms.\\n\\n\\nRegardless of specific use cases, the central tenet of streaming data is that it\\ngives organizations the opportunity to leverage the freshest possible insights for\\nbetter decision-making and more optimized customer experiences.\\n\\n\\n-----\\n\\n**Data streaming architecture**\\n\\nBefore addressing these challenges head-on, it may help to take a step back and\\ndiscuss the ingredients of a streaming data pipeline. Then, we will explain how\\nthe Databricks Lakehouse Platform operates within this context to address the\\naforementioned challenges.\\n\\nEvery application of streaming data requires a pipeline that brings the data from\\nits origin point — whether sensors, IoT devices or database transactions — to its\\nfinal destination.\\n\\nIn building this pipeline, streaming architectures typically employ two layers.\\nFirst, streaming capture systems **capture** and temporarily store streaming data\\nfor processing. Sometimes these systems are also called messaging systems\\nor messaging buses. These systems are optimized for small payloads and high\\nfrequency inputs/outputs. Second, streaming **processing** systems continuously\\nprocess data from streaming capture systems and other storage systems.\\n\\n**Capturing** **Processing**\\n\\n\\nIt may help to think of a simplified streaming pipeline\\naccording to the following seven phases:\\n\\n1. Data is continuously generated at origin points\\n\\n2. The generated data is captured from those origin points by\\na capture system like Apache Kafka (with limited retention)\\n\\n**3. 
The captured data is extracted and incrementally ingested to**\\n**a processing platform like Databricks; data is ingested exactly**\\n**once and stored permanently, even if this step is rerun**\\n\\n**4. The ingested data is converted into a workable format**\\n\\n**5. The formatted data is cleansed, transformed and joined in**\\n**a number of pipeline steps**\\n\\n**6. The transformed data is processed downstream through**\\n**analysis or ML modeling**\\n\\n7. The resulting analysis or model is used for some sort of practical\\napplication, which may be anything from basic reporting to an\\nevent-driven software application\\n\\nYou will notice four of the steps in this list are in boldface. This is because the\\nlakehouse architecture is specifically designed to optimize this part of the\\npipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\\nanalyze and model on streaming data _alongside_ batch-processed data. It can\\naccommodate both structured _and_ unstructured data. It is here that the value\\nof unifying the best pieces of data lakes and data warehouses really shines for\\ncomplex enterprise use cases.\\n\\n\\n-----\\n\\n**Data Streaming on the Lakehouse**\\n\\nNow let’s zoom in a bit and see how the Databricks Lakehouse\\nPlatform addresses each part of the pipeline mentioned above.\\n\\n**Streaming data ingestion and transformation** begins with continuously\\nand incrementally collecting raw data from streaming sources through a\\nfeature called Auto Loader. Once the data is ingested, it can be transformed\\nfrom raw, messy data into clean, fresh, reliable data appropriate for downstream\\nanalytics, ML or applications. [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) makes it easy to build and\\nmanage these data pipelines while automatically taking care of infrastructure\\nmanagement and scaling, data quality, error testing and other administrative\\ntasks. DLT is a high-level abstraction built on Spark Structured Streaming,\\na scalable and fault-tolerant stream processing engine.\\n\\n**[Real-time analytics](https://www.databricks.com/product/databricks-sql)** refers to the downstream analytical application\\nof streaming data. With fresher data streaming into SQL analytics or BI\\nreporting, more actionable insights can be achieved, resulting in better\\nbusiness outcomes.\\n\\n**[Real-time ML](https://www.databricks.com/product/machine-learning)** involves deploying ML models in a streaming mode. This\\ndeployment is supported with structured streaming for continuous inference\\nfrom a live data stream. Like real-time analytics, real-time ML is a downstream\\nimpact of streaming data, but for different business use cases (i.e., AI instead\\nof BI). Real-time modeling has many benefits, including more accurate\\npredictions about the future.\\n\\n\\n**Real-time applications** process data directly from streaming pipelines and\\ntrigger programmatic actions, such as displaying a relevant ad, updating the\\nprice on a pricing page, stopping a fraudulent transaction, etc. 
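As a concrete illustration of the ingestion step described above, the following sketch uses Auto Loader with Spark Structured Streaming to incrementally load raw JSON files into a Delta table. It assumes a `spark` session in a Databricks notebook; the volume paths and table name are placeholders, and a production pipeline would more likely express this as a Delta Live Tables dataset.

```python
# Hypothetical sketch: incrementally ingest raw JSON files with Auto Loader and
# land them in a Delta table. Paths and the table name are placeholders.
raw_stream = (
    spark.readStream.format("cloudFiles")               # Auto Loader source
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/<catalog>/<schema>/checkpoints/schema")
    .load("/Volumes/<catalog>/<schema>/raw_events")
)

(
    raw_stream.writeStream
    .option("checkpointLocation", "/Volumes/<catalog>/<schema>/checkpoints/bronze")
    .trigger(availableNow=True)                          # process all new files, then stop
    .toTable("bronze_events")
)
```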
There typically\\nis no human-in-the-loop for such applications.\\n\\n\\nData in cloud storage and message stores\\n\\n\\n-----\\n\\n**Databricks Lakehouse Platform differentiators**\\n\\nUnderstanding what the lakehouse architecture provides is one\\n\\nthing, but it is useful to understand how Databricks uniquely\\n\\napproaches the common challenges mentioned earlier around\\n\\nworking with streaming data.\\n\\n**Databricks empowers unified data teams.** Data engineers, data scientists\\nand analysts can easily build streaming data workloads with the languages\\nand tools they already know and the APIs they already use.\\n\\n**Databricks simplifies development and operations.** Organizations can\\nfocus on getting value from data by reducing complexity and automating\\nmuch of the production aspects associated with building and maintaining\\nreal-time data workloads.\\n\\n\\nSee why customers love streaming on the Databricks\\nLakehouse Platform with these resources.\\n\\n**Learn more**\\n\\n[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\\n\\n[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\\n[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\\n\\n[Structured Streaming Documentation](https://docs.databricks.com/spark/latest/structured-streaming/index.html)\\n\\n[Streaming — Getting Started With Apache Spark on Databricks](https://databricks.com/spark/getting-started-with-apache-spark/streaming)\\n\\n\\n**Databricks is one platform for streaming and batch data.** Organizations\\ncan eliminate data silos, centralize security and governance models, and\\nprovide complete support for all their real-time use cases under one roof —\\nthe roof of the lakehouse.\\n\\nFinally — and perhaps most important — Delta Lake, the core of the [Databricks](https://www.databricks.com/product/data-lakehouse)\\n\\n[Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , was built for streaming from the ground up. Delta Lake is\\ndeeply integrated with Spark Structured Streaming and overcomes many of\\nthe limitations typically associated with streaming systems and files.\\n\\nIn summary, the Databricks Lakehouse Platform dramatically simplifies data\\nstreaming to deliver real-time analytics, machine learning and applications on\\none platform. And, that platform is built on a foundation with streaming at its\\ncore. This means organizations of all sizes can use their data in motion and\\nmake more informed decisions faster than ever.\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n### Data science and machine learning\\n# 10\\n\\n\\n**CHAPTER**\\n\\n\\nWhile most companies are aware of the potential benefits of applying\\nmachine learning and AI, realizing these potentials can often be quite\\nchallenging for those brave enough to take the leap. 
Some of the\\nlargest hurdles come from siloed/disparate data systems, complex\\nexperimentation environments, and getting models served in a\\nproduction setting.\\n\\n\\nFortunately, the Databricks Lakehouse Platform provides a helping\\nhand and lets you use data to derive innovative insights, build\\npowerful predictive models, and enable data scientists, ML engineers,\\nand developers of all kinds to create within the space of machine\\nlearning and AI.\\n\\n\\n-----\\n\\n#### Databricks Machine Learning\\n\\n\\n-----\\n\\n#### Exploratory data analysis\\n\\nWith all the data in one place, data is easily\\nexplored and visualized from within the\\nnotebook-style experience that provides support\\nfor various languages (R, SQL, Python and Scala)\\nas well as built-in visualizations and dashboards.\\nConfidently and securely share code with\\nco-authoring, commenting, automatic versioning,\\nGit integrations and role-based access controls.\\nThe platform provides laptop-like simplicity at\\nproduction-ready scale.\\n\\n\\n-----\\n\\n#### Model creation and management\\n\\nFrom data ingestion to model training and tuning, all the way through to\\nproduction model serving and versioning, the Lakehouse brings the tools\\nneeded to simplify those tasks.\\n\\nGet right into experimenting with the Databricks ML runtimes, optimized and\\npreconfigured to include most popular libraries like scikit-learn, XGBoost and\\nmore. Massively scale thanks to built-in support for distributed training and\\nhardware acceleration with GPUs.\\n\\nFrom within the runtimes, you can track model training sessions, package and\\nreuse models easily with [MLflow](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) , an open source machine learning platform\\ncreated by Databricks and included as a managed service within the Lakehouse.\\nIt provides a centralized location from which to manage models and package\\ncode in an easily reusable way.\\n\\nTraining these models often involves the use of features housed in a centralized\\nfeature store. Fortunately, Databricks has a built-in feature store that allows you\\nto create new features, explore and re-use existing features, select features for\\ntraining and scoring machine learning models, and publish features to low-latency\\nonline stores for real-time inference.\\n\\nIf you are looking to get a head start, [AutoML](https://databricks.com/blog/2022/04/18/supercharge-your-machine-learning-projects-with-databricks-automl-now-generally-available.html) allows for low to no-code\\nexperimentation by pointing to your data set and automatically training models\\nand tuning hyperparameters to save both novice and advanced users precious\\ntime in the machine learning process.\\n\\n\\nAutoML will also report back metrics related to the model training results as well\\nas the code needed to repeat the training already custom-tailored to your data\\nset. 
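As a small illustration of the MLflow tracking described above, the sketch below autologs a scikit-learn training run. The synthetic dataset and model choice are placeholders; in a Databricks notebook the run would appear in the workspace experiment UI with its parameters, metrics and model artifact captured automatically.

```python
# Hypothetical sketch: track a training run with MLflow autologging.
# The synthetic dataset and model are illustrative placeholders.
import mlflow
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

X, y = make_regression(n_samples=1_000, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

mlflow.autolog()  # logs params, training metrics and the fitted model automatically

with mlflow.start_run(run_name="rf-baseline"):
    model = RandomForestRegressor(n_estimators=100)
    model.fit(X_train, y_train)
    print("R^2 on held-out data:", model.score(X_test, y_test))
```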
This glass box approach ensures that you are never trapped or suffer from\\nvendor lock-in.\\n\\nIn that regard, the Lakehouse supports the industry’s widest range of data tools,\\ndevelopment environments, and a thriving ISV ecosystem so you can make your\\nworkspace your own and put out your best work.\\n\\n##### Compute platform\\n\\n**Any ML workload optimized and accelerated**\\n\\n**Databricks Machine Learning Runtime**\\n\\n- Optimized and preconfigured ML frameworks\\n\\n- Turnkey distribution ML\\n\\n- Built-in AutoML\\n\\n- GPU support out of the box\\n\\n\\nBuilt-in **ML frameworks**\\nand **model explainability**\\n\\nBuilt-in support for **AutoML**\\nand **hyperparameter tuning**\\n\\n\\nBuilt-in support for\\n**distributed training**\\n\\nBuilt-in support for\\n**hardware accelerators**\\n\\n\\n-----\\n\\n#### Deploy your models to production\\n\\nExploring and creating your machine learning models\\ntypically represents only part of the task. Once the\\nmodels exist and perform well, they must become\\npart of a pipeline that keeps models updated,\\nmonitored and available for use by others.\\n\\n**Webhooks** allow registering of\\n\\n\\nDatabricks can help here by providing a world-class\\nexperience for model versioning, monitoring and\\nserving within the same platform that you can use\\nto generate the models themselves. This means you\\ncan make all your ML pipelines in the same place,\\nmonitor them for drift, retrain them with new data,\\nand promote and serve them easily and at scale.\\n\\nThroughout the ML lifecycle, rest assured knowing\\nthat lineage and governance are being tracked the\\nentire way. This means regulatory compliance and\\nsecurity woes are significantly reduced, potentially\\nsaving costly issues down the road.\\n\\n\\ncallbacks on events like stage\\n\\ntransitions to integrate with CI/CD\\n\\nautomation.\\n\\n**Tags** allow storing deployment\\n\\n— specific metadata with model\\n\\nversions, e.g., whether the\\n\\ndeployment was successful.\\n\\n\\n**Model lifecycle management**\\n\\nStaging Production Archived\\n\\n\\nLogged\\nmodel\\n\\n**Comments** allow communication\\n\\nand collaboration between\\n\\nteammates when reviewing\\n\\nmodel versions.\\n\\n\\n-----\\n\\n**Learn more**\\n\\n[Databricks Machine Learning](https://databricks.com/product/machine-learning)\\n\\n[Databricks Data Science](https://databricks.com/product/data-science)\\n\\n[Databricks ML Runtime Documentation](https://docs.databricks.com/runtime/mlruntime.html)\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n# 11\\n\\n\\n### Databricks Technology Partners and the modern data stack\\n\\nDatabricks Technology Partners integrate their solutions with Databricks to\\nprovide complementary capabilities for ETL, data ingestion, business intelligence,\\nmachine learning and governance. These integrations allow customers to leverage\\nthe Databricks Lakehouse Platform’s reliability and scalability to innovate faster\\nwhile deriving valuable data insights. Use preferred analytical tools with optimized\\nconnectors for fast performance, low latency and high user concurrency to your\\ndata lake.\\n\\n\\n-----\\n\\nWith [Partner Connect](https://databricks.com/partnerconnect) , you can bring together all your data, analytics and AI tools on one open platform. 
Databricks provides a fast and easy way to connect your existing\\ntools to your lakehouse using validated integrations and helps you discover and try new solutions.\\n\\n**Databricks thrives within your modern data stack**\\n\\n**BI and dashboards** **Machine learning** **Data science**\\n\\n\\n**Data governance**\\n\\n**Data pipelines**\\n\\n**Data ingestion**\\n\\n\\nData Data Data\\nwarehousing engineering streaming\\n\\n**Unity Catalog**\\n\\n\\nData science\\nand ML\\n\\n\\n**Consulting**\\n**and SI partners**\\n\\n\\n**Delta Lake**\\n\\n**Cloud Data Lake**\\n\\n**Learn more**\\n\\n\\n[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\\n\\n[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\\n\\n\\n[Partner Connect](https://databricks.com/partnerconnect)\\n\\n[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n### Get started with the Databricks Lakehouse Platform\\n# 12\\n\\n\\n-----\\n\\n#### Databricks Trial\\n\\nGet a collaborative environment for data teams to build solutions together with interactive\\nnotebooks to use Apache Spark TM , SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras,\\nscikit-learn and more.\\n\\n**•** Available as a 14-day full trial in your own cloud or as a lightweight trial\\nhosted by Databricks\\n\\n**[Try Databricks for free](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\\n\\n\\n**[Databricks documentation](https://databricks.com/documentation)**\\n\\nGet detailed documentation to get started with\\nthe Databricks Lakehouse Platform on your cloud\\nof choice: Databricks on AWS, Azure Databricks\\nand [Databricks on Google Cloud](https://docs.gcp.databricks.com/?_gl=1*16ovt38*_gcl_aw*R0NMLjE2NTI1NDYxNjIuQ2owS0NRandwdjJUQmhEb0FSSXNBTEJuVm5saU9ydGpfX21uT1U5NU5iRThSbmI5a3o2OGdDNUY0UTRzYThtTGhVZHZVb0NhTkRBMmlWc2FBcEN6RUFMd193Y0I.&_ga=2.135042808.863708747.1652113196-1440404449.1635787641&_gac=1.225252968.1652546163.Cj0KCQjwpv2TBhDoARIsALBnVnliOrtj__mnOU95NbE8Rnb9kz68gC5F4Q4sa8mLhUdvUoCaNDA2iVsaApCzEALw_wcB) .\\n\\n**[Databricks Demo Hub](https://databricks.com/discover/demos)**\\n\\nGet a firsthand look at Databricks from the\\npractitioner’s perspective with these simple\\non-demand videos. Each demo is paired with\\nrelated materials — including notebooks, videos\\nand eBooks — so that you can try it out for\\nyourself on Databricks.\\n\\n\\n**[Databricks Academy](https://databricks.com/learn/training/home)**\\n\\nWhether you are new to the data lake or building on\\nan existing skill set, you can find a curriculum tailored\\nto your role or interest. With training and certification\\nthrough Databricks Academy, you will learn to master\\nthe Databricks Lakehouse Platform for all your big\\ndata analytics projects.\\n\\n**[Databricks Community](https://community.databricks.com/)**\\n\\n\\n**[Databricks Labs](https://databricks.com/learn/labs)**\\n\\nDatabricks Labs are projects created by the\\nfield to help customers get their use cases\\ninto production faster.\\n\\n**[Databricks customers](https://databricks.com/customers)**\\n\\nDiscover how innovative companies across\\nevery industry are leveraging the Databricks\\nLakehouse Platform.\\n\\n\\nGet answers, network with peers and solve\\nthe world’s toughest problems, together.\\n\\n\\n-----\\n\\n#### About Databricks\\n\\nDatabricks is the data and AI company. 
More than 7,000\\norganizations worldwide — including Comcast, Condé Nast,\\nH&M and over 40% of the Fortune 500 — rely on the Databricks\\nLakehouse Platform to unify their data, analytics and AI. Databricks\\nis headquartered in San Francisco, with offices around the globe.\\nFounded by the original creators of Apache Spark™, Delta Lake\\nand MLflow, Databricks is on a mission to help data teams solve the\\nworld’s toughest problems. To learn more, follow Databricks on\\n[Twitter](https://twitter.com/databricks) **,** [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\\n\\n© Databricks 2022. All rights reserved. Apache, Apache Spark, Spark and the Spark\\n\\n\\n-----', 'parser_status': 'SUCCESS', 'doc_uri': '/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf', 'last_modified': Timestamp('2024-09-19 16:57:20')}\n" + ] + } + ], + "source": [ + "from cookbook.data_pipeline.parse_docs import load_files_to_df\n", + "from pyspark.sql import functions as F\n", + "\n", + "\n", + "raw_files_df = load_files_to_df(\n", + " spark=spark,\n", + " source_path=source_config.volume_path,\n", + ")\n", + "\n", + "print(f\"Loaded {raw_files_df.count()} files from {source_config.volume_path}. Files: {source_config.list_files()}\")\n", + "\n", + "test_records_dict = raw_files_df.toPandas().to_dict(orient=\"records\")\n", + "\n", + "for record in test_records_dict:\n", + " print()\n", + " print(\"Testing parsing for file: \", record[\"path\"])\n", + " print()\n", + " test_result = file_parser(raw_doc_contents_bytes=record['content'], doc_path=record['path'], modification_time=record['modificationTime'], doc_bytes_length=record['length'])\n", + " print(test_result)\n", + " break # pause after 1 file. if you want to test more files, remove the break statement\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9fb6db6c-faa0-4dac-be84-a832bbbb49b9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "🚫✏️ The below cell is boilerplate code to apply the parsing function using Spark." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "165706b2-5824-42e7-a22b-3ca0edfd0a77", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Loading the raw files from /Volumes/casaman_ssa/demos/volume_databricks_documentation...\nApplying parsing & metadata extraction to 29 files using Spark - this may take a long time if you have many documents...\nParsed 29 / 29 documents successfully. Inspect `parsed_files_no_errors_df` or visit https://adb-984752964297111.11.azuredatabricks.net/explore/data/casaman_ssa/demos/test_product_docs_docs__v2 to see all parsed documents, including any errors.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
contentparser_statusdoc_urilast_modified
**EBOOK**\n", + "\n", + "## The Big Book of Data Engineering 2nd Edition\n", + "\n", + "A collection of technical\n", + "blogs, including code\n", + "samples and notebooks\n", + "\n", + "##### With all-new content\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Contents\n", + "\n", + "**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n", + "\n", + "**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n", + "\n", + "**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n", + "\n", + "**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n", + "\n", + "**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n", + "\n", + "**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n", + "\n", + "**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n", + "\n", + "**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n", + "\n", + "**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n", + "\n", + "**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n", + "\n", + "**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n", + "\n", + "**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n", + "\n", + "**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n", + "\n", + "**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n", + "\n", + "**4 . 1** Akamai .................................................................................................................................................................................................... 77\n", + "\n", + "**4 . 
2** Grammarly ........................................................................................................................................................................................... 80\n", + "\n", + "**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n", + "\n", + "**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n", + "\n", + "**4 . 5** Rivian .................................................................................................................................................................................................... 90\n", + "\n", + "**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 01\n", + "\n", + "\n", + "### Introduction to Data Engineering on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "Organizations realize the value data plays as a strategic asset for various\n", + "business-related initiatives, such as growing revenues, improving the customer\n", + "experience, operating efficiently or improving a product or service. However,\n", + "accessing and managing data for these initiatives has become increasingly\n", + "complex. Most of the complexity has arisen with the explosion of data volumes\n", + "and data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\n", + "to increase, 73% of the data goes unused for analytics or decision-making. In\n", + "order to try and decrease this percentage and make more data usable, data\n", + "engineering teams are responsible for building data pipelines to efficiently and\n", + "reliably deliver data. 
But the process of building these complex data pipelines\n", + "comes with a number of difficulties:\n", + "\n", + "**•** In order to get data into a data lake, data engineers are required\n", + "to spend immense time hand-coding repetitive data ingestion tasks\n", + "\n", + "**•** Since data platforms continuously change, data engineers\n", + "spend time building and maintaining, and then rebuilding, complex\n", + "scalable infrastructure\n", + "\n", + "**•** As data pipelines become more complex, data engineers are\n", + "required to find reliable tools to orchestrate these pipelines\n", + "\n", + "**•** With the increasing importance of real-time data, low latency data\n", + "pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "**•** Finally, with all pipelines written, data engineers need to constantly\n", + "focus on performance, tuning pipelines and architectures to meet SLAs\n", + "\n", + "\n", + "**How can Databricks help?**\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The Lakehouse Platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "Lakehouse Platform\n", + "\n", + "**One platform to support multiple personas**\n", + "\n", + "\n", + "**BI & Data**\n", + "**Warehousing**\n", + "\n", + "\n", + "**Data**\n", + "**Engineering**\n", + "\n", + "\n", + "**Data**\n", + "**Streaming**\n", + "\n", + "\n", + "**Data**\n", + "**Science & ML**\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "\n", + "**Unity Catalog**\n", + "**Fine-grained governance for data and AI**\n", + "\n", + "**Delta Lake**\n", + "**Data reliability and performance**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "All Raw Data (Logs, Texts, Audio, Video, Images)\n", + "\n", + "\n", + "Figure 1\n", + "The Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key differentiators for successful data engineering**\n", + "**with Databricks**\n", + "\n", + "By simplifying on a lakehouse architecture, data engineers need an\n", + "enterprise-grade and enterprise-ready approach to building data pipelines.\n", + "To be successful, a data engineering solution team must embrace these eight\n", + "key differentiating capabilities:\n", + "\n", + "**Data ingestion at scale**\n", + "With the ability to ingest petabytes of data with auto-evolving schemas,\n", + "data engineers can deliver fast, reliable, scalable and automatic data for\n", + "analytics, data science or machine learning. 
This includes:\n", + "\n", + "**•** Incrementally and efficiently processing data as it arrives\n", + "from files or streaming sources like Kafka, DBMS and NoSQL\n", + "\n", + "**•** Automatically inferring schema and detecting column\n", + "changes for structured and unstructured data formats\n", + "\n", + "**•** Automatically and efficiently tracking data as it arrives with\n", + "\n", + "no manual intervention\n", + "\n", + "**•** Preventing data loss by rescuing data columns\n", + "\n", + "\n", + "**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified orchestration of data workflows**\n", + "Simple, clear and reliable orchestration of data processing tasks for data,\n", + "analytics and machine learning pipelines with the ability to run multiple\n", + "non-interactive tasks as a directed acyclic graph (DAG) on a Databricks\n", + "compute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\n", + "in a DAG using Databricks Workflows, an orchestration tool included in the\n", + "lakehouse with no need to maintain or pay for an external orchestration service.\n", + "\n", + "**•** Easily create and manage multiple tasks with dependencies via UI,\n", + "API or from your IDE\n", + "\n", + "**•** Have full observability to all workflow runs and get alerted when\n", + "tasks fail for fast troubleshooting and efficient repair and rerun\n", + "\n", + "**•** Leverage high reliability of 99.95% uptime\n", + "\n", + "**•** Use performance optimization clusters that parallelize jobs and\n", + "minimize data movement with cluster reuse\n", + "\n", + "**Data quality validation and monitoring**\n", + "Improve data reliability throughout the data lakehouse so data teams can\n", + "confidently trust the information for downstream initiatives by:\n", + "\n", + "**•** Defining data quality and integrity controls within the pipeline\n", + "with defined data expectations\n", + "\n", + "**•** Addressing data quality errors with predefined policies\n", + "(fail, drop, alert, quarantine)\n", + "\n", + "**•** Leveraging the data quality metrics that are captured, tracked\n", + "and reported for the entire data pipeline\n", + "\n", + "\n", + "Data\n", + "Sources\n", + "\n", + "Data\n", + "Warehouses\n", + "\n", + "On-premises\n", + "Systems\n", + "\n", + "SaaS\n", + "Applications\n", + "\n", + "Machine &\n", + "Application Logs\n", + "\n", + "Application\n", + "Events\n", + "\n", + "Mobile & IoT\n", + "Data\n", + "\n", + "\n", + "Cloud\n", + "Storage\n", + "\n", + "Messag\n", + "e Buses\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "**Workflows** for end-to-end orchestration\n", + "\n", + "\n", + "Real-Time BI Apps\n", + "\n", + "Real-Time AI Apps\n", + "\n", + "\n", + "Real-Time Analytics with\n", + "**Databricks SQL**\n", + "\n", + "Real-Time Machine Learning\n", + "with\n", + "**Databricks ML**\n", + "\n", + "\n", + "Streaming ETL with\n", + "**Delta Live Tables**\n", + "\n", + "\n", + "Predictive\n", + "Maintenance\n", + "\n", + "\n", + "Personalized\n", + "Offers\n", + "\n", + "\n", + "Patient\n", + "Diagnostics\n", + "\n", + "\n", + "Real-Time Operational\n", + "Apps\n", + "\n", + "\n", + "Real-Time Applications with\n", + "**Spark Structured Streaming**\n", + "\n", + "**Photon** for lightning-fast data processing\n", + "\n", + "**Unity Catalog** for data governance and sharing\n", + "\n", + "**Delta Lake** for open and reliable data storage\n", + "\n", + "\n", + "Alerts Detection Fraud\n", + "\n", + "\n", + "Dynamic\n", + "Pricing\n", + "\n", + "\n", + "©2023 Databricks Inc. 
— All rights reserved\n", + "\n", + "Figure 2\n", + "A unified set of tools for real-time data processing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fault tolerant and automatic recovery**\n", + "Handle transient errors and recover from most common error conditions\n", + "occurring during the operation of a pipeline with fast, scalable automatic\n", + "recovery that includes:\n", + "\n", + "**•** Fault tolerant mechanisms to consistently recover the state of data\n", + "\n", + "**•** The ability to automatically track progress from the source with\n", + "checkpointing\n", + "\n", + "**•** The ability to automatically recover and restore the data pipeline state\n", + "\n", + "**Data pipeline observability**\n", + "Monitor overall data pipeline status from a dataflow graph dashboard and\n", + "visually track end-to-end pipeline health for performance, quality and latency.\n", + "Data pipeline observability capabilities include:\n", + "\n", + "**•** A high-quality, high-fidelity lineage diagram that provides visibility\n", + "into how data flows for impact analysis\n", + "\n", + "**•** Granular logging with performance and status of the data pipeline\n", + "at a row level\n", + "\n", + "**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n", + "\n", + "\n", + "**Automatic deployments and operations**\n", + "Ensure reliable and predictable delivery of data for analytics and machine\n", + "learning use cases by enabling easy and automatic data pipeline deployments\n", + "and rollbacks to minimize downtime. Benefits include:\n", + "\n", + "**•** Complete, parameterized and automated deployment for the\n", + "continuous delivery of data\n", + "\n", + "**•** End-to-end orchestration, testing and monitoring of data pipeline\n", + "deployment across all major cloud providers\n", + "\n", + "**Migrations**\n", + "Accelerating and de-risking the migration journey to the lakehouse, whether\n", + "from legacy on-prem systems or disparate cloud services.\n", + "\n", + "The migration process starts with a detailed discovery and assessment to\n", + "get insights on legacy platform workloads and estimate migration as well as\n", + "Databricks platform consumption costs. Get help with the target architecture\n", + "and how the current technology stack maps to Databricks, followed by a\n", + "phased implementation based on priorities and business needs. Throughout\n", + "this journey companies can leverage:\n", + "\n", + "**•** Automation tools from Databricks and its ISV partners\n", + "\n", + "**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n", + "\n", + "**•** Databricks Professional Services and training\n", + "\n", + "This is the recommended approach for a successful migration, whereby\n", + "customers have seen a 25-50% reduction in costs and 2-3x faster time to value\n", + "for their use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified governance**\n", + "With Unity Catalog, data engineering and governance teams benefit from an\n", + "enterprisewide data catalog with a single interface to manage permissions,\n", + "centralize auditing, automatically track data lineage down to the column level,\n", + "and share data across platforms, clouds and regions. 
Benefits:\n", + "\n", + "**•** Discover all your data in one place, no matter where it lives,\n", + "and centrally manage fine-grained access permissions using an\n", + "ANSI SQL-based interface\n", + "\n", + "**•** Leverage automated column-level data lineage to perform impact\n", + "analysis of any data changes across the pipeline and conduct\n", + "root cause analysis of any errors in the data pipelines\n", + "\n", + "**•** Centrally audit data entitlements and access\n", + "\n", + "**•** Share data across clouds, regions and data platforms,\n", + "while maintaining a single copy of your data in your cloud storage\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 3\n", + "The Databricks Lakehouse Platform integrates with a large collection of technologies\n", + "\n", + "\n", + "**A rich ecosystem of data solutions**\n", + "The Databricks Lakehouse Platform is built on open source technologies and\n", + "uses open standards so leading data solutions can be leveraged with anything\n", + "you build on the lakehouse. A large collection of technology partners make it\n", + "easy and simple to integrate the technologies you rely on when migrating to\n", + "Databricks and to know you are not locked into a closed data technology stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Conclusion**\n", + "\n", + "As organizations strive to become data-driven, data engineering is a focal\n", + "point for success. To deliver reliable, trustworthy data, data engineers shouldn’t\n", + "need to spend time manually developing and maintaining an end-to-end\n", + "ETL lifecycle. Data engineering teams need an efficient, scalable way to\n", + "simplify ETL development, improve data reliability and manage operations.\n", + "\n", + "As described, the eight key differentiating capabilities simplify the\n", + "management of the ETL lifecycle by automating and maintaining all data\n", + "dependencies, leveraging built-in quality controls with monitoring and by\n", + "providing deep visibility into pipeline operations with automatic recovery.\n", + "Data engineering teams can now focus on easily and rapidly building reliable\n", + "end-to-end production-ready data pipelines using only SQL or Python\n", + "for batch and streaming that deliver high-value data for analytics, data\n", + "science or machine learning.\n", + "\n", + "\n", + "**Follow proven best practices**\n", + "\n", + "In the next section, we describe best practices for data engineering\n", + "end-to end use cases drawn from real-world examples. 
From data ingestion\n", + "and real-time processing to analytics and machine learning, you’ll learn\n", + "how to translate raw data into actionable data.\n", + "\n", + "As you explore the rest of this guide, you can find data sets and code\n", + "samples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\n", + "get your hands dirty as you explore all aspects of the data lifecycle on the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### Guidance and Best Practices\n", + "\n", + "**2.1** Top 5 Databricks Performance Tips\n", + "\n", + "**2.2** How to Profile PySpark\n", + "\n", + "**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n", + "\n", + "**2.4** Streaming in Production: Collected Best Practices\n", + "\n", + "**2.5** Streaming in Production: Collected Best Practices, Part 2\n", + "\n", + "**2.6** Building Geospatial Data Products\n", + "\n", + "**2.7** Data Lineage With Unity Catalog\n", + "\n", + "**2.8** Easy Ingestion to Lakehouse With COPY INTO\n", + "\n", + "**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n", + "\n", + "**2.10** Best Practices for Cross-Government Data Sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.1\n", + "\n", + "**Top 5 Databricks Performance Tips**\n", + "\n", + "by **B R YA N S M I T H** and **R O B S A K E R**\n", + "\n", + "March 10, 2022\n", + "\n", + "\n", + "As solutions architects, we work closely with customers every day to help them\n", + "get the best performance out of their jobs on Databricks — and we often end\n", + "up giving the same advice. It’s not uncommon to have a conversation with a\n", + "customer and get double, triple, or even more performance with just a few\n", + "tweaks. So what’s the secret? How are we doing this? Here are the top 5 things\n", + "we see that can make a huge impact on the performance customers get\n", + "from Databricks.\n", + "\n", + "Here’s a TLDR:\n", + "\n", + "**•** **Use larger clusters.** It may sound obvious, but this is the number\n", + "one problem we see. It’s actually not any more expensive to use a large\n", + "cluster for a workload than it is to use a smaller one. It’s just faster.\n", + "If there’s anything you should take away from this article, it’s this.\n", + "\n", + "Read section 1. Really.\n", + "\n", + "**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\n", + "to learn more. You won’t regret it.\n", + "\n", + "\n", + "\n", + "**•** **Clean out your configurations** . Configurations carried from one\n", + "Apache Spark™ version to the next can cause massive problems. Clean up!\n", + "Read section 3 to learn more.\n", + "\n", + "**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\n", + "correctly, if at all. See Section 4 to learn more.\n", + "\n", + "**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\n", + "you’re writing Spark code, jump to section 5.\n", + "\n", + "**•** **Bonus tip! 
Table design is super important** . We’ll go into this in a future\n", + "blog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n", + "\n", + "**1. Give your clusters horsepower!**\n", + "\n", + "This is the number one mistake customers make. Many customers create tiny\n", + "clusters of two workers with four cores each, and it takes forever to do anything.\n", + "The concern is always the same: they don’t want to spend too much money on\n", + "larger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n", + "**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n", + "\n", + "\n", + "-----\n", + "\n", + "The key is that you’re renting the cluster for the length of the workload. So, if\n", + "you spin up that two worker cluster and it takes an hour, you’re paying for those\n", + "workers for the full hour. However, if you spin up a four worker cluster and it takes\n", + "only half an hour, the cost is actually the same! And that trend continues as long\n", + "as there’s enough work for the cluster to do.\n", + "\n", + "Here’s a hypothetical scenario illustrating the point:\n", + "\n", + "**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n", + "\n", + "1 $1 2 $2\n", + "\n", + "2 $2 1 $2\n", + "\n", + "4 $4 0.5 $2\n", + "\n", + "8 $8 0.25 $2\n", + "\n", + "Notice that the total cost of the workload stays the same while the real-world\n", + "time it takes for the job to run drops significantly. So, bump up your Databricks\n", + "cluster specs and speed up your workloads without spending any more money. It\n", + "\n", + "can’t really get any simpler than that.\n", + "\n", + "**2. Use Photon**\n", + "\n", + "Our colleagues in engineering have rewritten the Spark execution engine in C++\n", + "and dubbed it Photon. The results are impressive!\n", + "\n", + "\n", + "Beyond the obvious improvements due to running the engine in native code,\n", + "they’ve also made use of CPU-level performance features and better memory\n", + "\n", + "management. On top of this, they’ve rewritten the Parquet writer in C++. So this\n", + "makes writing to Parquet and Delta (based on Parquet) super fast as well!\n", + "\n", + "But let’s also be clear about what Photon is speeding up. It improves\n", + "computation speed for any built-in functions or operations, as well as writes to\n", + "Parquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n", + "(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\n", + "spending most of its time reading from an ancient on-prem database? Won’t\n", + "help there either, unfortunately.\n", + "\n", + "\n", + "-----\n", + "\n", + "The good news is that it helps where it can. So even if part of your job can’t be\n", + "sped up, it will speed up the other parts. Also, most jobs are written with the\n", + "native operations and spend a lot of time writing to Delta, and Photon helps a lot\n", + "there. So give it a try. You may be amazed by the results!\n", + "\n", + "**3. Clean out old configurations**\n", + "\n", + "You know those Spark configurations you’ve been carrying along from version to\n", + "version and no one knows what they do anymore? They may not be harmless.\n", + "We’ve seen jobs go from running for hours down to minutes simply by cleaning\n", + "out old configurations. 
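A quick way to see exactly what you are carrying along is to dump the configurations explicitly set on your session. Here is a minimal sketch, assuming the usual `spark` session object available in a Databricks notebook:

```python
# Minimal sketch: list the Spark configurations explicitly set on the
# current session so you can review which ones still earn their keep.
for key, value in sorted(spark.sparkContext.getConf().getAll()):
    if key.startswith("spark."):
        print(f"{key} = {value}")
```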
There may have been a quirk in a particular version of\n", + "Spark, a performance tweak that has not aged well, or something pulled off\n", + "some blog somewhere that never really made sense. At the very least, it’s worth\n", + "revisiting your Spark configurations if you’re in this situation. Often the default\n", + "configurations are the best, and they’re only getting better. Your configurations\n", + "may be holding you back.\n", + "\n", + "**4. The Delta Cache is your friend**\n", + "\n", + "This may seem obvious, but you’d be surprised how many people are not using\n", + "the [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\n", + "the workers’ SSDs for faster access.\n", + "\n", + "\n", + "If you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\n", + "by default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\n", + "your “hot” tables when you’re starting an endpoint. This will ensure blazing fast\n", + "speeds for any queries on those tables.\n", + "\n", + "If you’re using regular clusters, be sure to use the i3 series on Amazon Web\n", + "Services (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\n", + "all have fast SSDs and caching enabled by default.\n", + "\n", + "Of course, your mileage may vary. If you’re doing BI, which involves reading the\n", + "same tables over and over again, caching gives an amazing boost. However, if\n", + "you’re simply reading a table once and writing out the results as in some ETL\n", + "jobs, you may not get much benefit. You know your jobs better than anyone.\n", + "Go forth and conquer.\n", + "\n", + "\n", + "-----\n", + "\n", + "**5. Be aware of lazy evaluation**\n", + "\n", + "\n", + "However, there is a catch here. Every time you try to display or write out\n", + "results, it runs the execution plan again. Let’s look at the same block of code\n", + "but extend it and do a few more operations.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ".filter(...)\n", + ")\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "\n", + "_# Unfortunately this will run the plan again, including filtering, joining,_\n", + "_etc_\n", + "df2.display()\n", + "\n", + "_# So will this…_\n", + "df2.count()\n", + "—------\n", + "\n", + "\n", + "If you’re a data analyst or data scientist only using SQL or doing BI you can skip\n", + "this section. However, if you’re in data engineering and writing pipelines or doing\n", + "processing using Databricks/Spark, read on.\n", + "\n", + "When you’re writing Spark code like select, groupBy, filter, etc., you’re really\n", + "building an execution plan. You’ll notice the code returns almost immediately when\n", + "you run these functions. That’s because it’s not actually doing any computation. So\n", + "even if you have petabytes of data, it will return in less than a second.\n", + "\n", + "However, once you go to write your results out you’ll notice it takes longer. This\n", + "is due to lazy evaluation. 
It’s not until you try to display or write results that your\n", + "execution plan is actually run.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + "\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "—------\n", + "\n", + "\n", + "-----\n", + "\n", + "The developer of this code may very well be thinking that they’re just printing\n", + "out results three times, but what they’re really doing is kicking off the same\n", + "processing three times. Oops. That’s a lot of extra work. This is a very common\n", + "mistake we run into. So why is there lazy evaluation, and what do we do about it?\n", + "\n", + "In short, processing with lazy evaluation is way faster than without it.\n", + "Databricks/Spark looks at the full execution plan and finds opportunities\n", + "for optimization that can reduce processing time by orders of magnitude.\n", + "So that’s great, but how do we avoid the extra computation? The answer\n", + "is pretty straightforward: save computed results you will reuse.\n", + "\n", + "\n", + "This works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\n", + "benefit greatly from lazy evaluation, but it’s something a lot of customers trip\n", + "over. So be aware of its existence and save results you reuse in order to avoid\n", + "unnecessary computation.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "Let’s look at the same block of code again, but this time let’s avoid the\n", + "recomputation:\n", + "\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + ")\n", + "\n", + "_# save it_\n", + "df2.write.save(path)\n", + "\n", + "_# load it back in_\n", + "df3 = spark.read.load(path)\n", + "\n", + "_# now use it_\n", + "df3.display()\n", + "\n", + "_# this is not doing any extra computation anymore. No joins, filtering,_\n", + "_etc. It’s already done and saved._\n", + "df3.display()\n", + "\n", + "_# nor is this_\n", + "df3.count()\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.2 \u0007\n", + "\n", + "**How to Profile PySpark**\n", + "\n", + "by **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n", + "\n", + "October 6, 2022\n", + "\n", + "\n", + "In Apache Spark™, declarative Python APIs are supported for big data workloads.\n", + "They are powerful enough to handle most common use cases. Furthermore,\n", + "PySpark UDFs offer more flexibility since they enable users to run arbitrary\n", + "Python code on top of the Apache Spark™ engine. Users only have to state\n", + "“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\n", + "PySpark easier to use, but it can be difficult to identify performance bottlenecks\n", + "and apply custom optimizations.\n", + "\n", + "To address the difficulty mentioned above, PySpark supports various profiling\n", + "tools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n", + "[implementations](https://docs.python.org/3/library/profile.html) . 
PySpark Profilers provide information such as the number\n", + "of function calls, total time spent in the given function, and filename, as well\n", + "as line number to help navigation. That information is essential to exposing\n", + "tight loops in your PySpark programs, and allowing you to make performance\n", + "\n", + "improvement decisions.\n", + "\n", + "\n", + "**Driver profiling**\n", + "\n", + "PySpark applications run as independent sets of processes on a cluster,\n", + "coordinated by the SparkContext object in the driver program. On the driver\n", + "side, PySpark is a regular Python process; thus, we can profile it as a normal\n", + "Python program using cProfile as illustrated below:\n", + "\n", + "import cProfile\n", + "\n", + "with cProfile.Profile() as pr:\n", + "_# Your code_\n", + "\n", + "pr.print_stats()\n", + "\n", + "**Workers profiling**\n", + "\n", + "Executors are distributed on worker nodes in the cluster, which introduces\n", + "complexity because we need to aggregate profiles. Furthermore, a Python worker\n", + "process is spawned per executor for PySpark UDF execution, which makes the\n", + "profiling more intricate.\n", + "\n", + "\n", + "-----\n", + "\n", + "The UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\n", + "and becomes a major tool to profile workers for PySpark applications. We’ll\n", + "illustrate how to use the UDF profiler with a simple Pandas UDF example.\n", + "\n", + "Firstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n", + "```\n", + " sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n", + " 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n", + " by 'id' )\n", + " ).withColumn( 'v' , rand())\n", + "\n", + "```\n", + "Later, we will group by the id column, which results in 8 groups with 1,000 rows\n", + "per group.\n", + "\n", + "The Pandas UDF plus_one is then created and applied as shown below:\n", + "```\n", + " import pandas as pd\n", + " def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf.apply( lambda x: x + 1 , axis= 1 )\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "Executing the example above and running sc.show_profiles() prints the\n", + "following profile. The profile below can also be dumped to disk by sc.dump_\n", + "profiles(path).\n", + "\n", + "The UDF id in the profile (271, highlighted above) matches that in the Spark plan\n", + "for res. The Spark plan can be shown by calling res.explain() .\n", + "\n", + "\n", + "Note that plus_one takes a pandas DataFrame and returns another pandas\n", + "DataFrame. For each group, all columns are passed together as a pandas\n", + "DataFrame to the plus_one UDF, and the returned pandas DataFrames are\n", + "combined into a PySpark DataFrame.\n", + "\n", + "\n", + "-----\n", + "\n", + "The first line in the profile’s body indicates the total number of calls that were\n", + "monitored. 
The column heading includes\n", + "\n", + "**•** ncalls , for the number of calls.\n", + "\n", + "**•** tottime , for the total time spent in the given function (excluding time\n", + "spent in calls to sub-functions)\n", + "\n", + "**•** percall , the quotient of tottime divided by ncalls\n", + "\n", + "**•** cumtime , the cumulative time spent in this and all subfunctions (from\n", + "invocation till exit)\n", + "\n", + "**•** percall , the quotient of cumtime divided by primitive calls\n", + "\n", + "**•** filename:lineno(function) , which provides the respective information\n", + "for each function\n", + "\n", + "Digging into the column details: plus_one is triggered once per group, 8 times\n", + "in total; _arith_method of pandas Series is called once per row, 8,000 times\n", + "in total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\n", + "row, thus suffering from high invocation overhead.\n", + "\n", + "We can reduce such overhead by substituting the pandas.DataFrame.apply\n", + "with pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\n", + "follows:\n", + "```\n", + " import pandas as pd\n", + " def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf + 1\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n", + " schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "The updated profile is as shown below.\n", + "\n", + "We can summarize the optimizations as follows:\n", + "\n", + "**•** Arithmetic operation from 8,000 calls to 8 calls\n", + "\n", + "**•** Total function calls from 2,898,160 calls to 2,384 calls\n", + "\n", + "**•** Total execution time from 2.300 seconds to 0.004 seconds\n", + "\n", + "The short example above demonstrates how the UDF profiler helps us deeply\n", + "understand the execution, identify the performance bottleneck and enhance\n", + "the overall performance of the user-defined function.\n", + "\n", + "The UDF profiler was implemented based on the executor-side profiler,\n", + "which is designed for PySpark RDD API. The executor-side profiler is available\n", + "in all active Databricks Runtime versions.\n", + "\n", + "\n", + "-----\n", + "\n", + "Both the UDF profiler and the executor-side profiler run on Python workers.\n", + "They are controlled by the spark.python.profile Spark configuration, which\n", + "is false by default. We can enable that Spark configuration on a Databricks\n", + "Runtime cluster as shown below.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "PySpark profilers are implemented based on cProfile; thus, the profile reporting\n", + "relies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\n", + "collecting profile reports from Python workers.\n", + "\n", + "Powerful profilers are provided by PySpark in order to identify hot loops and\n", + "suggest potential improvements. They are easy to use and critical to enhance\n", + "the performance of PySpark programs. 
The UDF profiler, which is available\n", + "starting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\n", + "challenges and brings insights to user-defined functions.\n", + "\n", + "In addition, there is an ongoing effort in the Apache Spark™ open source\n", + "community to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\n", + "more information.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.3 \u0007\n", + "\n", + "**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n", + "**and Apache Kafka**\n", + "\n", + "by **F R A N K M U N Z**\n", + "\n", + "August 9, 2022\n", + "\n", + "\n", + "[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\n", + "approach for creating reliable data pipelines and fully manages the underlying\n", + "infrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\n", + "actionable insights derived from near real-time data. Delta Live Tables enables\n", + "low-latency streaming data pipelines to support such use cases with low\n", + "latencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n", + "[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n", + "\n", + "This article will walk through using DLT with Apache Kafka while providing the\n", + "required Python code to ingest streams. The recommended system architecture\n", + "will be explained, and related DLT settings worth considering will be explored\n", + "along the way.\n", + "\n", + "**Streaming platforms**\n", + "\n", + "Event buses or message buses decouple message producers from consumers.\n", + "A popular streaming use case is the collection of click-through data from\n", + "users navigating a website where every user interaction is stored as an event in\n", + "\n", + "\n", + "Apache Kafka. The event stream from Kafka is then used for real-time streaming\n", + "data analytics. Multiple message consumers can read the same data from Kafka\n", + "and use the data to learn about audience interests, conversion rates, and bounce\n", + "reasons. The real-time, streaming event data from the user interactions often\n", + "also needs to be correlated with actual purchases stored in a billing database.\n", + "\n", + "**Apache Kafka**\n", + "\n", + "[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\n", + "topic, an append-only distributed log of events where messages are buffered for\n", + "a certain amount of time. Although messages in Kafka are not deleted once they\n", + "are consumed, they are also not stored indefinitely. The message retention for\n", + "\n", + "Kafka can be configured per topic and defaults to 7 days. 
Expired messages will\n", + "be deleted eventually.\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to many other event busses or messaging systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming data pipelines**\n", + "\n", + "\n", + "In a data flow pipeline, Delta Live Tables and their dependencies can be declared\n", + "with a standard SQL Create Table As Select (CTAS) statement and the DLT\n", + "keyword “live.”\n", + "\n", + "When developing DLT with Python, the @dlt.table decorator is used to create a\n", + "Delta Live Table. To ensure the data quality in a pipeline, DLT uses [Expectations](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html)\n", + "which are simple SQL constraints clauses that define the pipeline’s behavior with\n", + "invalid records.\n", + "\n", + "Since streaming workloads often come with unpredictable data volumes,\n", + "Databricks employs [enhanced autoscaling](https://databricks.com/blog/2022/06/29/delta-live-tables-announces-new-capabilities-and-performance-optimizations.html) for data flow pipelines to minimize the\n", + "overall end-to-end latency while reducing cost by shutting down unnecessary\n", + "infrastructure.\n", + "\n", + "**Delta Live Tables** are fully recomputed, in the right order, exactly once for each\n", + "pipeline run.\n", + "\n", + "In contrast, **streaming Delta Live Tables** are stateful, incrementally computed\n", + "and only process data that has been added since the last pipeline run. If the\n", + "query which defines a streaming live tables changes, new data will be processed\n", + "based on the new query but existing data is not recomputed. Streaming live\n", + "tables always use a streaming source and only work over append-only streams,\n", + "such as Kafka, Kinesis, or Auto Loader. Streaming DLTs are based on top of Spark\n", + "Structured Streaming.\n", + "\n", + "\n", + "You can chain multiple streaming pipelines, for example, workloads with very\n", + "large data volume and low latency requirements.\n", + "\n", + "**Direct ingestion from streaming engines**\n", + "\n", + "Delta Live Tables written in Python can directly ingest data from an event bus like\n", + "Kafka using Spark Structured Streaming. You can set a short retention period for\n", + "the Kafka topic to avoid compliance issues, reduce costs and then benefit from\n", + "the cheap, elastic and governable storage that Delta provides.\n", + "\n", + "As a first step in the pipeline, we recommend ingesting the data as is to a Bronze\n", + "(raw) table and avoid complex transformations that could drop important data.\n", + "Like any Delta table the Bronze table will retain the history and allow it to perform\n", + "GDPR and other compliance tasks.\n", + "\n", + "Ingest streaming data from Apache Kafka\n", + "\n", + "\n", + "-----\n", + "\n", + "When writing DLT pipelines in Python, you use the @dlt.table annotation\n", + "to create a DLT table. There is no special attribute to mark streaming DLTs in\n", + "Python; simply use spark.readStream() to access the stream. 
Example code\n", + "for creating a DLT table with the name kafka_bronze that is consuming data\n", + "from a Kafka topic looks as follows:\n", + "\n", + "import dlt\n", + "from pyspark.sql.functions import - \n", + "from pyspark.sql.types import - \n", + "\n", + "TOPIC = \"tracker-events\"\n", + "KAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n", + "_# subscribe to TOPIC at KAFKA_BROKER_\n", + "raw_kafka_events = (spark.readStream\n", + ". format ( \"kafka\" )\n", + ".option( \"subscribe\" , TOPIC)\n", + ".option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n", + ".option( \"startingOffsets\" , \"earliest\" )\n", + ".load()\n", + ")\n", + "\n", + "**@dlt.table(table_properties={** **\"pipelines.reset.allowed\"** **:** **\"false\"** **})**\n", + "```\n", + " def kafka_bronze ():\n", + "\n", + "```\n", + "return raw_kafka_events\n", + "\n", + "pipelines.reset.allowed\n", + "\n", + "Note that event buses typically expire messages after a certain period of time,\n", + "whereas Delta is designed for infinite retention.\n", + "\n", + "This might lead to the effect that source data on Kafka has already been deleted\n", + "when running a full refresh for a DLT pipeline. In this case, not all historic data\n", + "could be backfilled from the messaging platform, and data would be missing in\n", + "DLT tables. To prevent dropping data, use the following DLT table property:\n", + "\n", + "\n", + "pipelines.reset.allowed=false\n", + "\n", + "Setting pipelines.reset.allowed to false prevents refreshes to the table but\n", + "does not prevent incremental writes to the tables or new data from flowing into\n", + "the table.\n", + "\n", + "**Checkpointing**\n", + "\n", + "If you are an experienced Spark Structured Streaming developer, you will notice\n", + "the absence of checkpointing in the above code. 
In Spark Structured Streaming\n", + "checkpointing is required to persist progress information about what data has\n", + "been successfully processed and upon failure, this metadata is used to restart a\n", + "failed query exactly where it left off.\n", + "\n", + "Whereas checkpoints are necessary for failure recovery with exactly-once\n", + "guarantees in Spark Structured Streaming, DLT handles state automatically\n", + "without any manual configuration or explicit checkpointing required.\n", + "\n", + "**Mixing SQL and Python for a DLT pipeline**\n", + "\n", + "A DLT pipeline can consist of multiple notebooks but one DLT notebook is\n", + "required to be written entirely in either SQL or Python (unlike other Databricks\n", + "notebooks where you can have cells of different languages in a single notebook).\n", + "\n", + "Now, if your preference is SQL, you can code the data ingestion from Apache\n", + "Kafka in one notebook in Python and then implement the transformation logic of\n", + "your data pipelines in another notebook in SQL.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Schema mapping**\n", + "\n", + "When reading data from messaging platform, the data stream is opaque and a\n", + "schema has to be provided.\n", + "\n", + "The Python example below shows the schema definition of events from a fitness\n", + "tracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n", + "\n", + "event_schema = StructType([ \\\n", + "StructField( \"time\" , TimestampType(), True ) , \\\n", + "StructField( \"version\" , StringType(), True ), \\\n", + "StructField( \"model\" , StringType(), True ) , \\\n", + "StructField( \"heart_bpm\" , IntegerType(), True ), \\\n", + "StructField( \"kcal\" , IntegerType(), True ) \\\n", + "])\n", + "\n", + "_# temporary table, visible in pipeline but not in data browser,_\n", + "_# cannot be queried interactively_\n", + "**@dlt.table(comment=** **\"real schema for Kakfa payload\"** **,**\n", + "**temporary=** **True** **)**\n", + "```\n", + " def kafka_silver ():\n", + "\n", + "```\n", + "return (\n", + "_# kafka streams are (timestamp,value)_\n", + "_# value contains the kafka payload_\n", + "\n", + "dlt.read_stream( \"kafka_bronze\" )\n", + ".select(col( \"timestamp\" ),from_json(col( \"value\" )\n", + ".cast( \"string\" ), event_schema).alias( \"event\" ))\n", + ".select( \"timestamp\" , \"event.*\" )\n", + "\n", + "\n", + "**Benefits**\n", + "\n", + "Reading streaming data in DLT directly from a message broker minimizes the\n", + "architectural complexity and provides lower end-to-end latency since data is\n", + "directly streamed from the messaging broker and no intermediary step is involved.\n", + "\n", + "**Streaming ingest with cloud object store intermediary**\n", + "\n", + "For some specific use cases, you may want to offload data from Apache Kafka,\n", + "e.g., using a Kafka connector, and store your streaming data in a cloud object\n", + "intermediary. In a Databricks workspace, the cloud vendor-specific objectstore can then be mapped via the Databricks Files System (DBFS) as a cloudindependent folder. Once the data is offloaded, [Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) can\n", + "ingest the files.\n", + "\n", + "Auto Loader can ingest data with a single line of SQL code. 
The syntax to ingest\n", + "JSON files into a DLT table is shown below (it is wrapped across two lines for\n", + "readability).\n", + "\n", + "_-- INGEST with Auto Loader_\n", + "create or replace streaming live table raw\n", + "as select `*` FROM cloud_files(\"dbfs:/data/twitter\", \"json\")\n", + "\n", + "\n", + "-----\n", + "\n", + "Note that Auto Loader itself is a streaming data source and all newly arrived files\n", + "will be processed exactly once, hence the streaming keyword for the raw table\n", + "that indicates data is ingested incrementally to that table.\n", + "\n", + "Since offloading streaming data to a cloud object store introduces an additional\n", + "step in your system architecture it will also increase the end-to-end latency\n", + "and create additional storage costs. Keep in mind that the Kafka connector\n", + "writing event data to the cloud object store needs to be managed, increasing\n", + "operational complexity.\n", + "\n", + "Therefore Databricks recommends as a best practice to directly access event\n", + "bus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n", + "\n", + "**Other event buses or messaging systems**\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to other event buses or messaging systems. DLT supports any data\n", + "source that Databricks Runtime directly supports.\n", + "\n", + "**Amazon Kinesis**\n", + "In Kinesis, you write messages to a fully managed serverless stream. Same as\n", + "Kafka, Kinesis does not permanently store messages. The default message\n", + "retention in Kinesis is one day.\n", + "\n", + "When using Amazon Kinesis, replace format(“kafka”) with format(“kinesis”) in the\n", + "Python code for streaming ingestion above and add Amazon Kinesis-specific\n", + "settings with option(). For more information, check the section about Kinesis\n", + "Integration in the Spark Structured Streaming documentation.\n", + "\n", + "\n", + "**Azure Event Hubs**\n", + "\n", + "For Azure Event Hubs settings, check the official [documentation at Microsoft](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-kafka-spark-tutorial) and\n", + "the article [Delta Live Tables recipes: Consuming from Azure Event Hubs](https://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html) .\n", + "\n", + "**Summary**\n", + "\n", + "DLT is much more than just the “T” in ETL. With DLT, you can easily ingest from\n", + "streaming and batch sources, cleanse and transform data on the Databricks\n", + "Lakehouse Platform on any cloud with guaranteed data quality.\n", + "\n", + "Data from Apache Kafka can be ingested by directly connecting to a Kafka broker\n", + "from a DLT notebook in Python. Data loss can be prevented for a full pipeline\n", + "refresh even when the source data in the Kafka streaming layer expired.\n", + "\n", + "**Get started**\n", + "\n", + "If you are a Databricks customer, simply follow the [guide to get started](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) . Read the\n", + "release notes to learn more about what’s included in this GA release. 
If you are\n", + "not an existing Databricks customer, [sign up for a free trial](https://www.databricks.com/try-databricks) , and you can view our\n", + "detailed [DLT pricing here](https://www.databricks.com/product/pricing) .\n", + "\n", + "Join the conversation in the [Databricks Community](https://community.databricks.com/s/topic/0TO8Y000000VJEhWAO/summit22) where data-obsessed peers\n", + "are chatting about Data + AI Summit 2022 announcements and updates. Learn.\n", + "Network.\n", + "\n", + "Last but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\n", + "summit. In that session, I walk you through the code of another streaming data\n", + "example with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\n", + "Hugging Face sentiment analysis.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.4 \u0007\n", + "\n", + "**Streaming in Production: Collected Best Practices**\n", + "\n", + "by **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "December 12, 2022\n", + "\n", + "\n", + "Releasing any data pipeline or application into a production state requires\n", + "planning, testing, monitoring, and maintenance. Streaming pipelines are no\n", + "different in this regard; in this blog we present some of the most important\n", + "considerations for deploying streaming pipelines and applications to a\n", + "production environment.\n", + "\n", + "At Databricks, we offer two different ways of building and running streaming\n", + "pipelines and applications — [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) and [Databricks Workflows](https://www.databricks.com/product/workflows) .\n", + "DLT is our flagship, fully managed ETL product that supports both batch and\n", + "streaming pipelines. It offers declarative development, automated operations,\n", + "data quality, advanced observability capabilities, and more. Workflows enable\n", + "customers to run Apache Spark™ workloads in Databricks’ optimized runtime\n", + "environment (i.e., Photon) with access to unified governance (Unity Catalog) and\n", + "storage (Delta Lake). Regarding streaming workloads, both DLT and Workflows\n", + "\n", + "share the same core streaming engine — Spark Structured Streaming. In the\n", + "case of DLT, customers program against the DLT API and DLT uses the Structured\n", + "Streaming engine under the hood. In the case of Jobs, customers program\n", + "against the Spark API directly.\n", + "\n", + "\n", + "The recommendations in this blog post are written from the Structured\n", + "Streaming engine perspective, most of which apply to both DLT and Workflows\n", + "(although DLT does take care of some of these automatically, like Triggers and\n", + "Checkpoints). We group the recommendations under the headings “Before\n", + "Deployment” and “After Deployment” to highlight when these concepts will\n", + "need to be applied and are releasing this blog series with this split between\n", + "the two. There will be additional deep-dive content for some of the sections\n", + "beyond as well. We recommend reading all sections before beginning work\n", + "to productionalize a streaming pipeline or application, and revisiting these\n", + "recommendations as you promote it from dev to QA and eventually production.\n", + "\n", + "**Before deployment**\n", + "\n", + "There are many things you need to consider when creating your streaming\n", + "application to improve the production experience. 
Some of these topics, like\n", + "unit testing, checkpoints, triggers, and state management, will determine how\n", + "your streaming application performs. Others, like naming conventions and how\n", + "many streams to run on which clusters, have more to do with managing multiple\n", + "streaming applications in the same environment.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unit testing**\n", + "\n", + "\n", + "The cost associated with finding and fixing a bug goes up exponentially\n", + "the farther along you get in the SDLC process, and a Structured Streaming\n", + "application is no different. When you’re turning that prototype into a hardened\n", + "production pipeline you need a CI/CD process with built-in tests. So how do you\n", + "create those tests?\n", + "\n", + "At first you might think that unit testing a streaming pipeline requires something\n", + "special, but that isn’t the case. The general guidance for streaming pipelines is\n", + "no different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . It starts by\n", + "organizing your code so that it can be unit tested effectively:\n", + "\n", + "**•** Divide your code into testable chunks\n", + "\n", + "**•** Organize your business logic into functions calling other functions.\n", + "If you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n", + "[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\n", + "multiple functions that can be individually tested.\n", + "\n", + "**•** Do not code in dependencies on the global state or external systems\n", + "\n", + "**•** Any function manipulating a DataFrame or data set should be organized\n", + "to take the DataFrame/data set/configuration as input and output the\n", + "DataFrame/data set\n", + "\n", + "Once your code is separated out in a logical manner you can implement unit\n", + "tests for each of your functions. Spark-agnostic functions can be tested like any\n", + "other function in that language. For testing UDFs and functions with DataFrames\n", + "and data sets, there are multiple Spark testing frameworks available. These\n", + "\n", + "\n", + "frameworks support all of the DataFrame/data set APIs so that you can easily\n", + "create input, and they have specialized assertions that allow you to compare\n", + "DataFrame content and schemas. Some examples are:\n", + "\n", + "**•** The built-in Spark test suite, designed to test all parts of Spark\n", + "\n", + "**•** spark-testing-base, which has support for both Scala and Python\n", + "\n", + "**•** spark-fast-tests, for testing Scala Spark 2 & 3\n", + "\n", + "**•** chispa, a Python version of spark-fast-tests\n", + "\n", + "Code examples for each of these libraries can be found [here](https://github.com/alexott/spark-playground/tree/master/testing) .\n", + "\n", + "But wait! I’m testing a streaming application here — don’t I need to make\n", + "streaming DataFrames for my unit tests? The answer is no; you do not! Even\n", + "though a streaming DataFrame represents a data set with no defined ending,\n", + "when functions are executed on it they are executed on a microbatch — a\n", + "discrete set of data. You can use the same unit tests that you would use for a\n", + "batch application, for both stateless and stateful streams. 
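To make that concrete, here is a minimal sketch of what such a test can look like, assuming pytest and a local SparkSession; the function and column names are illustrative, not taken from any particular pipeline:

```python
# A streaming transformation written as a pure batch-style function,
# so it can be unit tested without any streaming machinery.
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F


def add_revenue(df: DataFrame) -> DataFrame:
    # Takes a DataFrame in, returns a DataFrame out — no global state,
    # no external systems, so it is trivially testable.
    return df.withColumn("revenue", F.col("price") * F.col("quantity"))


def test_add_revenue():
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    input_df = spark.createDataFrame(
        [(10.0, 2), (3.5, 4)], ["price", "quantity"]
    )
    result = add_revenue(input_df).collect()
    assert [row["revenue"] for row in result] == [20.0, 14.0]
```

The same `add_revenue` function can then be applied to a streaming DataFrame in production, since each microbatch is just a discrete DataFrame.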
One of the advantages\n", + "of Structured Streaming over other frameworks is the ability to use the same\n", + "transformation code for both streaming and with other batch operations for\n", + "the same sink. This allows you to simplify some operations, like backfilling\n", + "data, for example, where rather than trying to sync the logic between two\n", + "different applications, you can just modify the input sources and write to the\n", + "same destination. If the sink is a Delta table, you can even do these operations\n", + "concurrently if both processes are append-only operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Triggers**\n", + "\n", + "\n", + "process a microbatch in order to maximize resource utilization, but setting the\n", + "interval longer would make sense if your stream is running on a shared cluster\n", + "and you don’t want it to constantly take the cluster resources.\n", + "\n", + "If you do not need your stream to run continuously, either because data doesn’t\n", + "come that often or your SLA is 10 minutes or greater, then you can use the\n", + "Trigger.Once option. This option will start up the stream, check for anything new\n", + "since the last time it ran, process it all in one big batch, and then shut down.\n", + "Just like with a continuously running stream when using Trigger.Once, the\n", + "checkpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.\n", + "\n", + "Spark has a new version of Trigger.Once called Trigger.AvailableNow. While\n", + "Trigger.Once will process everything in one big batch, which depending on your\n", + "data size may not be ideal, Trigger.AvailableNow will split up the data based on\n", + "maxFilesPerTrigger and maxBytesPerTrigger settings. This allows the data to be\n", + "processed in multiple batches. Those settings are ignored with Trigger.Once.\n", + "You can see examples for setting triggers [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) .\n", + "\n", + "**Pop quiz —** how do you turn your streaming process into a batch process\n", + "that automatically keeps track of where it left off with just one line of code?\n", + "\n", + "**Answer —** change your processing time trigger to Trigger.Once/Trigger.\n", + "AvailableNow! Exact same code, running on a schedule, that will neither miss nor\n", + "reprocess any records.\n", + "\n", + "\n", + "Now that you know your code works, you need to determine how often your\n", + "stream will look for new data. This is where [triggers](https://docs.databricks.com/structured-streaming/triggers.html) come in. Setting a trigger is\n", + "one of the options for the writeStream command, and it looks like this:\n", + "\n", + "_// Scala/Java_\n", + ".trigger(Trigger.ProcessingTime( \"30 seconds\" ))\n", + "\n", + "_# Python_\n", + ".trigger(processingTime= '30 seconds' )\n", + "\n", + "In the above example, if a microbatch completes in less than 30 seconds,\n", + "then the engine will wait for the rest of the time before kicking off the next\n", + "microbatch. If a microbatch takes longer than 30 seconds to complete, then the\n", + "engine will start the next microbatch immediately after the previous one finishes.\n", + "\n", + "The two factors you should consider when setting your trigger interval are how\n", + "long you expect your stream to process a microbatch and how often you want\n", + "the system to check for new data. 
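As a concrete complement to the Trigger.AvailableNow discussion above, a hedged sketch of a scheduled catch-up run might look like the following; the table names, checkpoint path and rate limit are placeholders:

```python
# Minimal sketch: run the stream as a scheduled "catch up" job.
# Trigger.AvailableNow processes everything available in rate-limited
# batches, then shuts the stream down.
(
    spark.readStream
    .option("maxFilesPerTrigger", 1000)        # honored by AvailableNow, ignored by Trigger.Once
    .table("source_table")
    .writeStream
    .format("delta")
    .queryName("catch_up_stream")
    .option("checkpointLocation", "/checkpoints/catch_up_stream")
    .trigger(availableNow=True)                # process what is available, then stop
    .toTable("target_table")
)
```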
You can lower the overall processing latency\n", + "by using a shorter trigger interval and increasing the resources available for\n", + "the streaming query by adding more workers or using compute or memory\n", + "optimized instances tailored to your application’s performance. These increased\n", + "resources come with increased costs, so if your goal is to minimize costs, then a\n", + "longer trigger interval with less compute can work. Normally you would not set a\n", + "trigger interval longer than what it would typically take for your stream to\n", + "\n", + "\n", + "-----\n", + "\n", + "**Name your stream**\n", + "\n", + "\n", + "You name your children, you name your pets, now it’s time to name your streams.\n", + "There’s a writeStream option called .queryName that allows you to provide a\n", + "friendly name for your stream. Why bother? Well, suppose you don’t name it. In\n", + "that case, all you’ll have to go on in the Structured Streaming tab in the Spark UI\n", + "is the string and the unintelligible guid that is automatically generated\n", + "as the stream’s unique identifier. If you have more than one stream running on a\n", + "cluster, and all of them have and unintelligible strings as identifiers,\n", + "how do you find the one you want? If you’re exporting metrics how do you tell\n", + "which is which?\n", + "\n", + "Make it easy on yourself, and name your streams. When you’re managing them in\n", + "production you’ll be glad you did, and while you’re at it, go and name your batch\n", + "queries in any foreachBatch() code you have.\n", + "\n", + "**Fault tolerance**\n", + "\n", + "How does your stream recover from being shut down? There are a few different\n", + "cases where this can come into play, like cluster node failures or intentional\n", + "halts, but the solution is to set up checkpointing. Checkpoints with write-ahead\n", + "logs provide a degree of protection from your streaming application being\n", + "interrupted, ensuring it will be able to pick up again where it last left off.\n", + "\n", + "Checkpoints store the current offsets and state values (e.g., aggregate values) for\n", + "your stream. Checkpoints are stream specific so each should be set to its own\n", + "location. Doing this will let you recover more gracefully from shutdowns, failures\n", + "from your application code or unexpected cloud provider failures or limitations.\n", + "\n", + "\n", + "To configure checkpoints, add the checkpointLocation option to your stream\n", + "definition:\n", + "\n", + "_// Scala/Java/Python_\n", + "streamingDataFrame.writeStream\n", + ".format( \"delta\" )\n", + ".option( \"path\" , \"\" )\n", + ".queryName( \"TestStream\" )\n", + ".option( \"checkpointLocation\" , \"\" )\n", + ".start()\n", + "\n", + "To keep it simple — every time you call .writeStream, you must specify the\n", + "checkpoint option with a unique checkpoint location. Even if you’re using\n", + "foreachBatch and the writeStream itself doesn’t specify a path or table option,\n", + "you must still specify that checkpoint. It’s how Spark Structured Streaming gives\n", + "you hassle-free fault tolerance.\n", + "\n", + "Efforts to manage the checkpointing in your stream should be of little concern\n", + "in general. 
As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454) , “The simplest way to perform streaming\n", + "analytics is not having to reason about streaming at all.” That said, one setting\n", + "\n", + "deserves mention as questions around the maintenance of checkpoint files\n", + "come up occasionally. Though it is an internal setting that doesn’t require direct\n", + "configuration, the setting spark.sql.streaming.minBatchesToRetain (default 100)\n", + "controls the number of checkpoint files that get created. Basically, the number\n", + "of files will be roughly this number times two, as there is a file created noting the\n", + "offsets at the beginning of the batch (offsets, a.k.a write ahead logs) and another\n", + "on completing the batch (commits). The number of files is checked periodically\n", + "for cleanup as part of the internal processes. This simplifies at least one aspect\n", + "of long-term streaming application maintenance for you.\n", + "\n", + "\n", + "-----\n", + "\n", + "It is also important to note that some changes to your application code can\n", + "invalidate the checkpoint. Checking for any of these changes during code\n", + "reviews before deployment is recommended. You can find examples of changes\n", + "where this can happen in [Recovery Semantics after Changes in a Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query)\n", + "[Query](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query) . Suppose you want to look at checkpointing in more detail or consider\n", + "whether asynchronous checkpointing might improve the latency in your\n", + "streaming application. In that case, these are covered in greater depth in\n", + "[Speed Up Streaming Queries With Asynchronous State Checkpointing](https://www.databricks.com/blog/2022/05/02/speed-up-streaming-queries-with-asynchronous-state-checkpointing.html) .\n", + "\n", + "**State management and RocksDB**\n", + "\n", + "Stateful streaming applications are those where current records may depend\n", + "on previous events, so Spark has to retain data in between microbatches.\n", + "The data it retains is called state, and Spark will store it in a state store and\n", + "read, update and delete it during each microbatch. Typical stateful operations\n", + "are streaming aggregations, streaming dropDuplicates, stream-stream joins,\n", + "mapGroupsWithState, or flatMapGroupsWithState. Some common types of\n", + "examples where you’ll need to think about your application state could be\n", + "sessionization or hourly aggregation using group by methods to calculate\n", + "\n", + "business metrics. Each record in the state store is identified by a key that is used\n", + "as part of the stateful computation, and the more unique keys that are required\n", + "the larger the amount of state data that will be stored.\n", + "\n", + "When the amount of state data needed to enable these stateful operations\n", + "grows large and complex, it can degrade your workloads’ performance, leading\n", + "to increased latency or even failures. A typical indicator of the state store being\n", + "\n", + "\n", + "the culprit of added latency is large amounts of time spent in garbage collection\n", + "(GC) pauses in the JVM. 
If you are monitoring the microbatch processing time,\n", + "this could look like a continual increase or wildly varying processing time across\n", + "microbatches.\n", + "\n", + "The default configuration for a state store, which is sufficient for most general\n", + "streaming workloads, is to store the state data in the executors’ JVM memory.\n", + "Large number of keys (typically millions, see the Monitoring & Instrumentation\n", + "section in part 2 of this blog) can add excessive memory pressure on the\n", + "machine memory and increase the frequency of hitting these GC pauses as it\n", + "tries to free up resources.\n", + "\n", + "On the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\n", + "use [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\n", + "memory pressure. RocksDB is an embeddable persistent key-value store for fast\n", + "storage. It features high performance through a log-structured database engine\n", + "written entirely in C++ and optimized for fast, low-latency storage.\n", + "\n", + "Leveraging RocksDB as the state store provider still uses machine memory\n", + "but no longer occupies space in the JVM and makes for a more efficient\n", + "state management system for large amounts of keys. This doesn’t come for\n", + "free, however, as it introduces an extra step in processing every microbatch.\n", + "Introducing RocksDB shouldn’t be expected to reduce latency except when it is\n", + "related to memory pressure from state data storage in the JVM. The RocksDBbacked state store still provides the same degree of fault tolerance as the\n", + "regular state storage as it is included in the stream checkpointing.\n", + "\n", + "\n", + "-----\n", + "\n", + "RocksDB configuration, like checkpoint configuration, is minimal by design and so\n", + "you only need to declare it in your overall Spark configuration:\n", + "\n", + "spark.conf. set (\n", + "\"spark.sql.streaming.stateStore.providerClass\" ,\n", + "\"com.databricks.sql.streaming.state.RocksDBStateStoreProvider\" )\n", + "\n", + "If you are monitoring your stream using the streamingQueryListener class, then\n", + "you will also notice that RocksDB metrics will be included in the stateOperators\n", + "field. For more detailed information on this see the [RocksDB State Store Metrics](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics)\n", + "[section](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics) of “Structured Streaming in Production.”\n", + "\n", + "It’s worth noting that large numbers of keys can have other adverse impacts in\n", + "addition to raising memory consumption, especially with unbounded or nonexpiring state keys. With or without RocksDB, the state from the application\n", + "also gets backed up in checkpoints for fault tolerance. So it makes sense that\n", + "if you have state files being created so that they will not expire, you will keep\n", + "accumulating files in the checkpoint, increasing the amount of storage required\n", + "and potentially the time to write it or recover from failures as well. 
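The usual remedy is to pair stateful operations with a watermark so the engine is allowed to age old state out. A minimal sketch, assuming a streaming DataFrame named events with event_time and user_id columns:

```python
# Minimal sketch: bound the state kept for deduplication and for a
# windowed aggregate by declaring a watermark on the event-time column.
from pyspark.sql import functions as F

deduped = (
    events
    .withWatermark("event_time", "30 minutes")    # state older than 30 minutes can be dropped
    .dropDuplicates(["user_id", "event_time"])    # dedup keys are bounded by the watermark
)

hourly_counts = (
    deduped
    .groupBy(F.window("event_time", "1 hour"), "user_id")
    .count()
)
```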
For the data\n", + "in memory (see the Monitoring & Instrumentation section in part 2 of this blog)\n", + "\n", + "this situation can lead to somewhat vague out-of-memory errors, and for the\n", + "checkpointed data written to cloud storage you might observe unexpected\n", + "and unreasonable growth. Unless you have a business need to retain streaming\n", + "state for all the data that has been processed (and that is rare), read the [Spark](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)\n", + "[Structured Streaming documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) and make sure to implement your stateful\n", + "operations so that the system can drop state records that are no longer needed\n", + "(pay close attention to dropDuplicates and stream-stream joins).\n", + "\n", + "\n", + "**Running multiple streams on a cluster**\n", + "\n", + "Once your streams are fully tested and configured, it’s time to figure out how to\n", + "organize them in production. It’s a common pattern to stack multiple streams on\n", + "the same Spark cluster to maximize resource utilization and save cost. This is fine\n", + "to a point, but there are limits to how much you can add to one cluster before\n", + "performance is affected. The driver has to manage all of the streams running on\n", + "the cluster, and all streams will compete for the same cores across the workers.\n", + "You need to understand what your streams are doing and plan your capacity\n", + "appropriately to stack effectively.\n", + "\n", + "Here is what you should take into account when you’re planning on stacking\n", + "multiple streams on the same cluster:\n", + "\n", + "**•** Make sure your driver is big enough to manage all of your streams. Is your\n", + "driver struggling with a high CPU utilization and garbage collection? That\n", + "means it’s struggling to manage all of your streams. Either reduce the\n", + "number of streams or increase the size of your driver.\n", + "\n", + "**•** Consider the amount of data each stream is processing. The more data\n", + "you are ingesting and writing to a sink, the more cores you will need in\n", + "order to maximize your throughput for each stream. You’ll need to reduce\n", + "the number of streams or increase the number of workers depending on\n", + "how much data is being processed. For sources like Kafka you will need to\n", + "configure how many cores are being used to ingest with the minPartitions\n", + "option if you don’t have enough cores for all of the partitions across all of\n", + "your streams.\n", + "\n", + "\n", + "-----\n", + "\n", + "**•** Consider the complexity and data volume of your streams. If all of the\n", + "streams are doing minimal manipulation and just appending to a sink, then\n", + "each stream will need fewer resources per microbatch and you’ll be able to\n", + "stack more. If the streams are doing stateful processing or computation/\n", + "memory-intensive operations, that will require more resources for good\n", + "performance and you’ll want to stack fewer streams.\n", + "\n", + "**•** Consider [scheduler pools](https://spark.apache.org/docs/latest/job-scheduling.html#fair-scheduler-pools) . When stacking streams they will all be\n", + "contending for the same workers and cores, and one stream that needs a\n", + "lot of cores will cause the other streams to wait. 
Scheduler pools enable\n", + "you to have different streams execute on different parts of the cluster.\n", + "This will enable streams to execute in parallel with a subset of the available\n", + "resources.\n", + "\n", + "**•** Consider your SLA. If you have mission-critical streams, isolate them as a\n", + "best practice so lower-criticality streams do not affect them.\n", + "\n", + "On Databricks we typically see customers stack between 10 and 30 streams on a\n", + "cluster, but this varies depending on the use case. Consider the factors above so\n", + "that you can have a good experience with performance, cost and maintainability.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "Some of the ideas we’ve addressed here certainly deserve their own time\n", + "and special treatment with a more in-depth discussion, which you can look\n", + "forward to in later deep dives. However, we hope these recommendations are\n", + "useful as you begin your journey or seek to enhance your production streaming\n", + "experience. Be sure to continue with the next post, “Streaming in Production:\n", + "Collected Best Practices, Part 2.”\n", + "\n", + "**[Review the Databricks Structured Streaming Getting Started Guide](https://www.databricks.com/spark/getting-started-with-apache-spark/streaming)**\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.5\n", + "\n", + "**Streaming in Production: Collected Best Practices, Part 2**\n", + "\n", + "by **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "January 10, 2023\n", + "\n", + "\n", + "This is the second article in our two-part blog series titled “Streaming in\n", + "Production: Collected Best Practices.” Here we discuss the “After Deployment”\n", + "considerations for a Structured Streaming pipeline. The majority of the\n", + "suggestions in this post are relevant to both Structured Streaming jobs and\n", + "Delta Live Tables (our flagship and fully managed ETL product that supports\n", + "both batch and streaming pipelines).\n", + "\n", + "**After deployment**\n", + "\n", + "After the deployment of your streaming application, there are typically three\n", + "main things you’ll want to know:\n", + "\n", + "**•** How is my application running?\n", + "\n", + "**•** Are resources being used efficiently?\n", + "\n", + "**•** How do I manage any problems that come up?\n", + "\n", + "We’ll start with an introduction to these topics, followed by a deeper dive later in\n", + "this blog series.\n", + "\n", + "\n", + "**Monitoring and instrumentation (How is my application running?)**\n", + "\n", + "Streaming workloads should be pretty much hands-off once deployed to\n", + "production.
However, one thing that may sometimes come to mind is: “how is my\n", + "application running?” Monitoring applications can take on different levels and\n", + "forms depending on:\n", + "\n", + "**•** the metrics collected for your application (batch duration/latency,\n", + "throughput, …)\n", + "\n", + "**•** where you want to monitor the application from\n", + "\n", + "At the simplest level, there is a streaming dashboard ( [A Look at the New](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html)\n", + "[Structured Streaming UI](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) ) and built-in logging directly in the Spark UI that can be\n", + "used in a variety of situations.\n", + "\n", + "This is in addition to setting up failure alerts on jobs running streaming\n", + "workloads.\n", + "\n", + "If you want more fine-grained metrics or to create custom actions based on\n", + "these metrics as part of your code base, then the StreamingQueryListener is\n", + "better aligned with what you’re looking for.\n", + "\n", + "\n", + "-----\n", + "\n", + "If you want the Spark metrics to be reported (including machine level traces for\n", + "drivers or workers) you should use the platform’s [metrics sink](https://spark.apache.org/docs/latest/monitoring.html#metrics) .\n", + "\n", + "The Apache Spark Structured Streaming UI\n", + "\n", + "\n", + "Another point to consider is where you want to surface these metrics for\n", + "observability. There is a Ganglia dashboard at the cluster level, integrated partner\n", + "applications like [Datadog](https://www.datadoghq.com/blog/databricks-monitoring-datadog/) for monitoring streaming workloads, or even more open\n", + "source options you can build using tools like Prometheus and Grafana. Each\n", + "has advantages and disadvantages to consider around cost, performance, and\n", + "maintenance requirements.\n", + "\n", + "Whether you have low volumes of streaming workloads where interactions in the\n", + "UI are sufficient or have decided to invest in a more robust monitoring platform,\n", + "you should know how to observe your production streaming workloads. Further\n", + "“Monitoring and Alerting” posts later in this series will contain a more thorough\n", + "discussion. In particular, we’ll see different measures on which to monitor\n", + "streaming applications and then later take a deeper look at some of the tools\n", + "you can leverage for observability.\n", + "\n", + "**Application optimization (Are resources being used effectively?**\n", + "\n", + "**Think “cost”)**\n", + "\n", + "The next concern we have after deploying to production is “is my application\n", + "\n", + "using resources effectively?” As developers, we understand (or quickly learn) the\n", + "distinction between working code and well-written code. Improving the way your\n", + "code runs is usually very satisfying, but what ultimately matters is the overall\n", + "cost of running it. Cost considerations for Structured Streaming applications will\n", + "be largely similar to those for other Spark applications. One notable difference\n", + "is that failing to optimize for production workloads can be extremely costly,\n", + "as these workloads are frequently “always-on” applications, and thus wasted\n", + "expenditure can quickly compound. 
Because assistance with cost optimization is\n", + "\n", + "\n", + "-----\n", + "\n", + "frequently requested, a separate post in this series will address it. The key points\n", + "that we’ll focus on will be efficiency of usage and sizing.\n", + "\n", + "Getting the cluster sizing right is one of the most significant differences between\n", + "efficiency and wastefulness in streaming applications. This can be particularly\n", + "tricky because in some cases it’s difficult to estimate the full load conditions of\n", + "the application in production before it’s actually there. In other cases, it may be\n", + "difficult due to natural variations in volume handled throughout the day, week, or\n", + "year. When first deploying, it can be beneficial to oversize slightly, incurring the\n", + "extra expense to avoid inducing performance bottlenecks. Utilize the monitoring\n", + "tools you chose to employ after the cluster has been running for a few weeks\n", + "to ensure proper cluster utilization. For example, are CPU and memory levels\n", + "being used at a high level during peak load or is the load generally small and the\n", + "cluster may be downsized? Maintain regular monitoring of this and keep an eye\n", + "out for changes in data volume over time; if either occurs, a cluster resize may be\n", + "required to maintain cost-effective operation.\n", + "\n", + "As a general guideline, you should avoid excessive shuffle operations, joins, or an\n", + "excessive or extreme watermark threshold (don’t exceed your needs), as each\n", + "can increase the number of resources you need to run your application. A large\n", + "watermark threshold will cause Structured Streaming to keep more data in the\n", + "state store between batches, leading to an increase in memory requirements\n", + "across the cluster. Also, pay attention to the type of VM configured — are you\n", + "using memory-optimized for your memory-intense stream? Compute-optimized\n", + "for your computationally-intensive stream? If not, look at the utilization levels\n", + "for each and consider trying a machine type that could be a better fit. Newer\n", + "families of servers from cloud providers with more optimal CPUs often lead to\n", + "faster execution, meaning you might need fewer of them to meet your SLA.\n", + "\n", + "\n", + "**Troubleshooting (How do I manage any problems that come up?)**\n", + "\n", + "The last question we ask ourselves after deployment is “how do I manage any\n", + "problems that come up?” As with cost optimization, troubleshooting streaming\n", + "applications in Spark often looks the same as other applications since most of\n", + "the mechanics remain the same under the hood. For streaming applications,\n", + "issues usually fall into two categories — failure scenarios and latency scenarios\n", + "\n", + "**Failure scenarios**\n", + "\n", + "Failure scenarios typically manifest with the stream stopping with an error,\n", + "executors failing or a driver failure causing the whole cluster to fail. Common\n", + "causes for this are:\n", + "\n", + "**•** Too many streams running on the same cluster, causing the driver to be\n", + "overwhelmed. 
On Databricks, this can be seen in Ganglia, where the driver\n", + "node will show up as overloaded before the cluster fails.\n", + "\n", + "**•** Too few workers in a cluster or a worker size with too small of a core-tomemory ratio, causing executors to fail with an Out Of Memory error.\n", + "This can also be seen on Databricks in Ganglia before an executor fails,\n", + "or in the Spark UI under the executors tab.\n", + "\n", + "**•** Using a collect to send too much data to the driver, causing it to fail\n", + "with an Out Of Memory error.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Latency scenarios**\n", + "\n", + "For latency scenarios, your stream will not execute as fast as you want or expect.\n", + "A latency issue can be intermittent or constant. Too many streams or too small\n", + "of a cluster can be the cause of this as well. Some other common causes are:\n", + "\n", + "**•** Data skew — when a few tasks end up with much more data than the rest\n", + "of the tasks. With skewed data, these tasks take longer to execute than the\n", + "others, often spilling to disk. Your stream can only run as fast as its slowest\n", + "task.\n", + "\n", + "**•** Executing a stateful query without defining a watermark or defining a very\n", + "long one will cause your state to grow very large, slowing down your stream\n", + "over time and potentially leading to failure.\n", + "\n", + "**•** Poorly optimized sink. For example, performing a merge into an overpartitioned Delta table as part of your stream.\n", + "\n", + "**•** Stable but high latency (batch execution time). Depending on the cause,\n", + "adding more workers to increase the number of cores concurrently available\n", + "for Spark tasks can help. Increasing the number of input partitions and/or\n", + "decreasing the load per core through batch size settings can also reduce\n", + "the latency.\n", + "\n", + "Just like troubleshooting a batch job, you’ll use Ganglia to check cluster\n", + "utilization and the Spark UI to find performance bottlenecks. There is a\n", + "specific [Structured Streaming tab](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) in the Spark UI created to help monitor and\n", + "troubleshoot streaming applications. On that tab each stream that is running will\n", + "be listed, and you’ll see either your stream name if you named your stream or\n", + "\n", + "\n", + " if you didn’t. You’ll also see a stream ID that will be visible on the Jobs\n", + "tab of the Spark UI so that you can tell which jobs are for a given stream.\n", + "\n", + "You’ll notice above we said which jobs are for a given stream. It’s a common\n", + "misconception that if you were to look at a streaming application in the Spark\n", + "UI you would just see one job in the Jobs tab running continuously. Instead,\n", + "depending on your code, you will see one or more jobs that start and complete\n", + "for each microbatch. Each job will have the stream ID from the Structured\n", + "Streaming tab and a microbatch number in the description, so you’ll be able to\n", + "tell which jobs go with which stream. 
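Naming your queries up front makes both the Structured Streaming tab and the Jobs tab much easier to read. Here is a minimal sketch, assuming a notebook-style environment where `spark` is already defined; the source table, stream name and checkpoint path are hypothetical:

```
# Naming the query makes it appear by name in the Structured Streaming tab
# instead of the auto-generated <id>.
query = (
    spark.readStream.table("raw_events")                        # hypothetical source table
    .writeStream
    .queryName("orders_silver")                                  # hypothetical stream name
    .option("checkpointLocation", "/tmp/checkpoints/orders_silver")
    .toTable("orders_silver")
)

# List every stream on the cluster with the IDs that also appear in the Spark UI.
for q in spark.streams.active:
    print(q.name, q.id, q.runId, q.status["message"])
```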
You can click into those jobs to find the\n", + "longest running stages and tasks, check for disk spills, and search by Job ID in\n", + "the SQL tab to find the slowest queries and check their explain plans.\n", + "\n", + "The Jobs tab in the Apache Spark UI\n", + "\n", + "\n", + "-----\n", + "\n", + "If you click on your stream in the Structured Streaming tab you’ll see how much\n", + "time the different streaming operations are taking for each microbatch, such as\n", + "adding a batch, query planning and committing (see earlier screenshot of the\n", + "Apache Spark Structured Streaming UI). You can also see how many rows are\n", + "being processed as well as the size of your state store for a stateful stream.\n", + "This can give insights into where potential latency issues are.\n", + "\n", + "We will go more in-depth with troubleshooting later in this blog series, where\n", + "we’ll look at some of the causes and remedies for both failure scenarios and\n", + "latency scenarios as we outlined above.\n", + "\n", + "**Conclusion**\n", + "\n", + "You may have noticed that many of the topics covered here are very similar to\n", + "how other production Spark applications should be deployed. Whether your\n", + "workloads are primarily streaming applications or batch processes, the majority\n", + "of the same principles will apply. We focused more on things that become\n", + "especially important when building out streaming applications, but as we’re\n", + "\n", + "\n", + "sure you’ve noticed by now, the topics we discussed should be included in\n", + "most production deployments.\n", + "\n", + "Across the majority of industries in the world today information is needed\n", + "faster than ever, but that won’t be a problem for you. With Spark Structured\n", + "Streaming you’re set to make it happen at scale in production. Be on the lookout\n", + "for more in-depth discussions on some of the topics we’ve covered in this blog,\n", + "and in the meantime keep streaming!\n", + "\n", + "**[Review Databricks Structured Streaming in](https://docs.databricks.com/structured-streaming/production.html)**\n", + "**[Production Documentation](https://docs.databricks.com/structured-streaming/production.html)**\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.6 \u0007\n", + "\n", + "**Building Geospatial Data Products**\n", + "\n", + "by **M I L O S C O L I C**\n", + "\n", + "January 6, 2023\n", + "\n", + "\n", + "Geospatial data has been driving innovation for centuries, through use of\n", + "maps, cartography and more recently through digital content. For example,\n", + "the oldest map has been found etched in a piece of mammoth tusk and dates\n", + "[approximately 25,000 BC](https://en.wikipedia.org/wiki/History_of_cartography) . This makes geospatial data one of the oldest data\n", + "sources used by society to make decisions. A more recent example, labeled\n", + "as the birth of spatial analysis, is that of Charles Picquet in 1832 who used\n", + "geospatial data to analyze [Cholera outbreaks in Paris](https://gallica.bnf.fr/ark:/12148/bpt6k842918.image) ; a couple of decades\n", + "later John Snow in 1854 followed the same approach for [Cholera outbreaks in](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak)\n", + "[London](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak) . 
These two individuals used geospatial data to solve one of the toughest\n", + "problems of their times and in effect save countless lives. Fast-forwarding to the\n", + "20th century, the concept of [Geographic Information Systems (GIS)](https://education.nationalgeographic.org/resource/geographic-information-system-gis) was [first](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf)\n", + "[introduced](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf) in 1967 in Ottawa, Canada, by the Department of Forestry and\n", + "Rural Development.\n", + "\n", + "Today we are in the midst of the cloud computing industry revolution —\n", + "supercomputing scale available to any organization, virtually infinitely scalable\n", + "for both storage and compute. Concepts like [data mesh](https://www.databricks.com/blog/2022/10/19/building-data-mesh-based-databricks-lakehouse-part-2.html) and [data marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html)\n", + "are emerging within the data community to address questions like platform\n", + "federation and interoperability. How can we adopt these concepts to geospatial\n", + "data, spatial analysis and GIS systems? By adopting the concept of data\n", + "products and approaching the design of geospatial data as a product.\n", + "\n", + "\n", + "In this blog we will provide a point of view on how to design scalable geospatial\n", + "data products that are modern and robust. We will discuss how Databricks\n", + "Lakehouse Platform can be used to unlock the full potential of geospatial\n", + "products that are one of the most valuable assets in solving the toughest\n", + "problems of today and the future.\n", + "\n", + "**What is a data product? And how to design one?**\n", + "\n", + "The most broad and the most concise definition of a “data product” was coined\n", + "by DJ Patil (the first U.S. Chief Data Scientist) in _Data Jujitsu: The Art of Turning_\n", + "_Data into Product:_ “a product that facilitates an end goal through the use of\n", + "data.” The complexity of this definition (as admitted by Patil himself) is needed to\n", + "encapsulate the breadth of possible products, to include dashboards, reports, Excel\n", + "\n", + "spreadsheets, and even CSV extracts shared via emails. You might notice that the\n", + "examples provided deteriorate rapidly in quality, robustness and governance.\n", + "\n", + "What are the concepts that differentiate a successful product versus an\n", + "unsuccessful one? Is it the packaging? Is it the content? Is it the quality of the\n", + "content? Or is it only the product adoption in the market? Forbes defines the\n", + "10 must-haves of a successful product. A good framework to summarize this is\n", + "through the value pyramid.\n", + "\n", + "\n", + "-----\n", + "\n", + "Figure 1: Product value pyramid (source)\n", + "\n", + "The value pyramid provides a priority on each aspect of the product. Not every\n", + "value question we ask about the product carries the same amount of weight. 
If\n", + "the output is not useful none of the other aspects matter — the output isn’t really\n", + "a product but becomes more of a data pollutant to the pool of useful results.\n", + "Likewise, scalability only matters after simplicity and explainability are addressed.\n", + "\n", + "How does the value pyramid relate to the data products? Each data output, in\n", + "order to be a data product:\n", + "\n", + "**•** **Should have clear usefulness.** The amount of the data society is\n", + "generating is rivaled only by the amount of data pollutants we are\n", + "generating. These are outputs lacking clear value and use, much less a\n", + "strategy for what to do with them.\n", + "\n", + "\n", + "\n", + "**•** **Should be explainable.** With the emergence of AI/ML, explainability has\n", + "become even more important for data driven decision-making. Data\n", + "is as good as the metadata describing it. Think of it in terms of food —\n", + "taste does matter, but a more important factor is the nutritional value\n", + "of ingredients.\n", + "\n", + "**•** **Should be simple.** An example of product misuse is using a fork to eat\n", + "cereal instead of using a spoon. Furthermore, simplicity is essential but\n", + "not sufficient — beyond simplicity the products should be intuitive.\n", + "Whenever possible both intended and unintended uses of the data\n", + "should be obvious.\n", + "\n", + "**•** **Should be scalable.** Data is one of the few resources that grows with\n", + "use. The more data you process the more data you have. If both inputs\n", + "and outputs of the system are unbounded and ever-growing, then the\n", + "system has to be scalable in compute power, storage capacity and\n", + "compute expressive power. Cloud data platforms like Databricks are in\n", + "a unique position to answer for all of the three aspects.\n", + "\n", + "**•** **Should generate habits.** In the data domain we are not concerned\n", + "with customer retention as is the case for the retail products. However,\n", + "the value of habit generation is obvious if applied to best practices.\n", + "The systems and data outputs should exhibit the best practices and\n", + "promote them — it should be easier to use the data and the system in\n", + "the intended way than the opposite.\n", + "\n", + "The geospatial data should adhere to all the aforementioned aspects — any data\n", + "products should. On top of this tall order, geospatial data has some specific needs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Geospatial data standards**\n", + "\n", + "\n", + "\n", + "**•** **“Advocate the understanding and use of geospatial data standards**\n", + "**within other sectors of government.”** — Value pyramid applies to\n", + "the standards as well — concepts like ease of adherence (usefulness/\n", + "simplicity), purpose of the standard (explainability/usefulness), adoption\n", + "(habit generation) are critical for the value generation of a standard.\n", + "\n", + "A critical tool for achieving the data standards mission is the [FAIR](https://www.go-fair.org/fair-principles/) data\n", + "principles:\n", + "\n", + "**•** **Findable** — The first step in (re)using data is to find them. 
Metadata\n", + "and data should be easy to find for both humans and computers.\n", + "Machine-readable metadata are essential for automatic discovery of\n", + "data sets and services.\n", + "\n", + "**•** **Accessible** — Once the user finds the required data, she/he/they\n", + "need to know how they can be accessed, possibly including\n", + "authentication and authorization.\n", + "\n", + "**•** **Interoperable** — The data usually needs to be integrated with\n", + "other data. In addition, the data needs to interoperate with\n", + "applications or workflows for analysis, storage, and processing.\n", + "\n", + "**•** **Reusable** — The ultimate goal of FAIR is to optimize the reuse of data.\n", + "To achieve this, metadata and data should be well-described so that\n", + "they can be replicated and/or combined in different settings.\n", + "\n", + "\n", + "Geospatial data standards are used to ensure that geographic data is collected,\n", + "organized, and shared in a consistent and reliable way. These standards can\n", + "include guidelines for things like data formatting, coordinate systems, map\n", + "projections, and metadata. Adhering to standards makes it easier to share data\n", + "between different organizations, allowing for greater collaboration and broader\n", + "access to geographic information.\n", + "\n", + "The Geospatial Commision (UK government) has defined the UK Geospatial\n", + "Data Standards Register as a central repository for data standards to be applied\n", + "in the case of geospatial data. Furthermore, the mission of this registry is to:\n", + "\n", + "**•** **“Ensure UK geospatial data is more consistent and coherent and usable**\n", + "**across a wider range of systems.”** — These concepts are a callout for the\n", + "importance of explainability, usefulness and habit generation (possibly\n", + "other aspects of the value pyramid).\n", + "\n", + "**•** **“Empower the UK geospatial community to become more engaged with**\n", + "**the relevant standards and standards bodies.”** — Habit generation within\n", + "the community is as important as the robust and critical design on the\n", + "standard. If not adopted standards are useless.\n", + "\n", + "\n", + "-----\n", + "\n", + "We share the belief that the FAIR principles are crucial for the design of scalable\n", + "data products we can trust. To be fair, FAIR is based on common sense, so why\n", + "is it key to our considerations? _“What I see in FAIR is not new in itself, but what it_\n", + "_does well is to articulate, in an accessible way, the need for a holistic approach_\n", + "_to data improvement. This ease in communication is why FAIR is being used_\n", + "_increasingly widely as an umbrella for data improvement — and not just in the_\n", + "_geospatial community.”_ — [A FAIR wind sets our course for data improvement](https://geospatialcommission.blog.gov.uk/2022/03/02/a-fair-wind-sets-our-course-for-data-improvement/) .\n", + "\n", + "To further support this approach, the [Federal Geographic Data Committee](https://www.fgdc.gov/standards) has\n", + "developed the [National Spatial Data Infrastructure (NSDI) Strategic Plan](https://www.fgdc.gov/nsdi-plan/nsdi-strategic-plan-2021-2024.pdf) that\n", + "covers the years 2021-2024 and was approved in November 2020. 
The goals\n", + "of NSDI are in essence FAIR principles and convey the same message of designing\n", + "systems that promote the circular economy of data — data products that flow\n", + "between organizations following common standards and in each step through the\n", + "data supply chain unlock new value and new opportunities. The fact that these\n", + "principles are permeating different jurisdictions and are adopted across different\n", + "regulators is a testament to the robustness and soundness of the approach.\n", + "\n", + "\n", + "The FAIR concepts weave really well together with the data product design.\n", + "In fact FAIR is traversing the whole product value pyramid and forms a value\n", + "cycle. By adopting both the value pyramid and FAIR principles we design data\n", + "products with both internal and external outlook. This promotes data reuse\n", + "as opposed to data accumulation.\n", + "\n", + "Why do FAIR principles matter for geospatial data and geospatial data\n", + "\n", + "products? FAIR is transcendent to geospatial data, it is actually transcendent\n", + "to data, it is a simple yet coherent system of guiding principles for good design\n", + "— and that good design can be applied to anything including geospatial data\n", + "and geospatial systems.\n", + "\n", + "\n", + "Figure 2:\n", + "NDSI Strategic Goals\n", + "\n", + "\n", + "-----\n", + "\n", + "**Grid index systems**\n", + "\n", + "In traditional GIS solutions’ performance of spatial operations are usually\n", + "achieved by building tree structures ( [KD trees](https://en.wikipedia.org/wiki/K-d_tree) , [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces) , [Quad trees](https://en.wikipedia.org/wiki/Quadtree) , etc).\n", + "The issue with tree approaches is that they eventually break the scalability\n", + "principle — when the data is too big to be processed in order to build the tree\n", + "and the computation required to build the tree is too long and defeats the\n", + "purpose. This also negatively affects the accessibility of data; if we cannot\n", + "construct the tree we cannot access the complete data and in effect we cannot\n", + "reproduce the results. In this case, grid index systems provide a solution.\n", + "\n", + "\n", + "Grid index systems are built from the start with the scalability aspects of the\n", + "geospatial data in mind. Rather than building the trees, they define a series of\n", + "grids that cover the area of interest. In the case of [H3](https://h3geo.org/) (pioneered by Uber),\n", + "the grid covers the area of the Earth; in the case of local grid index systems\n", + "(e.g., [British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) ) they may only cover the specific area of interest.\n", + "These grids are composed of cells that have unique identifiers. There is a\n", + "mathematical relationship between location and the cell in the grid. This makes\n", + "the grid index systems very scalable and parallel in nature.\n", + "\n", + "\n", + "Figure 4: Grid Index Systems (H3, British National Grid)\n", + "\n", + "\n", + "-----\n", + "\n", + "Another important aspect of grid index systems is that they are open source,\n", + "allowing index values to be universally leveraged by data producers and\n", + "consumers alike. Data can be enriched with the grid index information at any\n", + "step of its journey through the data supply chain. 
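As a concrete illustration of that enrichment step, here is a minimal PySpark sketch using the open-source h3 package (v3 API, which must be installed on the cluster); the table and column names are hypothetical, and on recent Databricks Runtimes the built-in H3 SQL expressions could be used instead:

```
import h3
from pyspark.sql import functions as F
from pyspark.sql.types import StringType

# Map a latitude/longitude pair to its H3 cell at resolution 9 (hexagons of roughly 0.1 km^2).
@F.udf(returnType=StringType())
def to_h3_cell(lat: float, lon: float) -> str:
    if lat is None or lon is None:
        return None
    return h3.geo_to_h3(lat, lon, 9)

# Hypothetical point data set with latitude/longitude columns.
points = spark.table("geo.pings")

indexed = points.withColumn("h3_cell", to_h3_cell("latitude", "longitude"))

# The cell ID can now drive joins, aggregations and downstream sharing.
indexed.groupBy("h3_cell").count().show(5, truncate=False)
```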
This makes the grid index\n", + "systems an example of community driven data standards. Community driven\n", + "data standards by nature do not require enforcement, which fully adheres\n", + "to the habit generation aspect of value pyramid and meaningfully addresses\n", + "interoperability and accessibility principles of FAIR.\n", + "\n", + "\n", + "Databricks has recently announced [native support for the H3 grid index system](https://www.databricks.com/blog/2022/09/14/announcing-built-h3-expressions-geospatial-processing-and-analytics.html)\n", + "following the same value proposition. Adopting common industry standards\n", + "driven by the community is the only way to properly drive habit generation and\n", + "interoperability. To strengthen this statement, organizations like [CARTO](https://carto.com/blog/hexagons-for-location-intelligence/) , [ESRI](https://www.esri.com/arcgis-blog/products/bus-analyst/analytics/using-uber-h3-hexagons-arcgis-business-analyst-pro/)\n", + "and [Google](https://opensource.googleblog.com/2017/12/announcing-s2-library-geometry-on-sphere.html) have been promoting the usage of grid index systems for scalable\n", + "GIS system design. In addition, Databricks Labs project [Mosaic](https://databrickslabs.github.io/mosaic/) supports the\n", + "[British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) as the standard grid index system that is widely used in\n", + "the UK government. Grid index systems are key for the scalability of geospatial\n", + "data processing and for properly designing solutions for complex problems\n", + "(e.g., figure 5 — flight holding patterns using H3).\n", + "\n", + "**Geospatial data diversity**\n", + "\n", + "Geospatial data standards spend a solid amount of effort regarding data\n", + "format standardization, and format for that matter is one of the most\n", + "important considerations when it comes to interoperability and reproducibility.\n", + "Furthermore, if the reading of your data is complex — how can we talk about\n", + "simplicity? Unfortunately geospatial data formats are typically complex, as\n", + "data can be produced in a number of formats including both open source\n", + "\n", + "and vendor-specific formats. Considering only vector data, we can expect\n", + "data to arrive in WKT, WKB, GeoJSON, web CSV, CSV, Shape File, GeoPackage,\n", + "and many others. On the other hand, if we are considering raster data we can\n", + "expect data to arrive in any number of formats such as GeoTiff, netCDF, GRIB, or\n", + "GeoDatabase; for a comprehensive list of formats please consult this [blog](https://gisgeography.com/gis-formats/) .\n", + "\n", + "\n", + "Figure 5: Example of using H3 to express flight holding patterns\n", + "\n", + "\n", + "-----\n", + "\n", + "Geospatial data domain is so diverse and has organically grown over the years\n", + "around the use cases it was addressing. Unification of such a diverse ecosystem\n", + "is a massive challenge. A recent effort by the Open Geospatial Consortium\n", + "(OGC) to standardize to [Apache Parquet](https://parquet.apache.org/) and its geospatial schema specification\n", + "[GeoParquet](https://geoparquet.org/) is a step in the right direction. Simplicity is one of the key aspects\n", + "of designing a good scalable and robust product — unification leads to simplicity\n", + "and addresses one of the main sources of friction in the ecosystem — the data\n", + "ingestion. 
Standardizing to GeoParquet brings a lot of value that addresses all of\n", + "the aspects of FAIR data and value pyramid.\n", + "\n", + "Figure 6: Geoparquet as a geospatial standard data format\n", + "\n", + "\n", + "Why introduce another format into an already complex ecosystem? GeoParquet\n", + "isn’t a new format — it is a schema specification for Apache Parquet format that\n", + "is already widely adopted and used by the industry and the community. Parquet\n", + "as the base format supports binary columns and allows for storage of arbitrary\n", + "data payload. At the same time the format supports structured data columns\n", + "that can store metadata together with the data payload. This makes it a choice\n", + "that promotes interoperability and reproducibility. Finally, [Delta Lake](https://delta.io/) format\n", + "has been built on top of parquet and brings [ACID](https://en.wikipedia.org/wiki/ACID) properties to the table. ACID\n", + "properties of a format are crucial for reproducibility and for trusted outputs. In\n", + "addition, Delta is the format used by scalable data sharing solution [Delta Sharing](https://www.databricks.com/product/delta-sharing) .\n", + "\n", + "Delta Sharing enables enterprise scale data sharing between any public cloud\n", + "using Databricks (DIY options for private cloud are available using open source\n", + "building blocks). Delta Sharing completely abstracts the need for custom built\n", + "Rest APIs for exposing data to other third parties. Any data asset stored in Delta\n", + "(using GeoParquet schema) automatically becomes a data product that can be\n", + "exposed to external parties in a controlled and governed manner. Delta Sharing\n", + "has been built from the ground up with [security best practices in mind](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html?utm_source=bambu&utm_medium=social&utm_campaign=advocacy&blaid=3352307) .\n", + "\n", + "\n", + "-----\n", + "\n", + "Figure 7: Delta Sharing simplifying data access in the ecosystem\n", + "\n", + "**Circular data economy**\n", + "\n", + "\n", + "Borrowing the concepts from the sustainability domain, we can define a circular\n", + "data economy as a system in which data is collected, shared, and used in a way\n", + "that maximizes its value while minimizing waste and negative impacts, such as\n", + "unnecessary compute time, untrustworthy insights, or biased actions based\n", + "data pollutants. Reusability is the key concept in this consideration — how can\n", + "we minimize the \"reinvention of the wheel.\" There are countless data assets out\n", + "in the wild that represent the same area, same concepts with just ever slight\n", + "alterations to better match a specific use case. Is this due to the actual\n", + "\n", + "\n", + "optimizations or due to the fact it was easier to create a new copy of the assets\n", + "than to reuse the existing ones? Or was it too hard to find the existing data\n", + "assets, or maybe it was too complex to define data access patterns.\n", + "\n", + "Data asset duplication has many negative aspects in both FAIR considerations\n", + "and data value pyramid considerations — having many disparate similar (but\n", + "different) data assets that represent the same area and same concepts can\n", + "deteriorate simplicity considerations of the data domain — it becomes hard\n", + "to identify the data asset we actually can trust. 
It can also have very negative\n", + "\n", + "\n", + "-----\n", + "\n", + "implications toward habit generation. Many niche communities will emerge\n", + "that will standardize to themselves ignoring the best practices of the wider\n", + "ecosystem, or worse yet they will not standardize at all.\n", + "\n", + "In a circular data economy, data is treated as a valuable resource that can be\n", + "used to create new products and services, as well as improving existing ones.\n", + "This approach encourages the reuse and recycling of data, rather than treating it\n", + "as a disposable commodity. Once again, we are using the sustainability analogy\n", + "in a literal sense — we argue that this is the correct way of approaching the\n", + "problem. Data pollutants are a real challenge for organizations both internally and\n", + "externally. An article by The Guardian states that less than 1% of collected data is\n", + "actually analyzed. There is too much data duplication, the majority of data is hard\n", + "to access and deriving actual value is too cumbersome. Circular data economy\n", + "promotes best practices and reusability of existing data assets allowing for a more\n", + "consistent interpretation and insights across the wider data ecosystem.\n", + "\n", + "\n", + "Figure 8: Databricks Marketplace\n", + "\n", + "\n", + "-----\n", + "\n", + "Interoperability is a key component of FAIR data principles, and from\n", + "interoperability a question of circularity comes to mind. How can we design an\n", + "ecosystem that maximizes data utilization and data reuse? Once again, FAIR\n", + "together with the value pyramid holds answers. Findability of the data is key to\n", + "the data reuse and to solving for data pollution. With data assets that can be\n", + "discovered easily we can avoid the recreation of same data assets in multiple\n", + "places with just slight alteration. Instead we gain a coherent data ecosystem\n", + "with data that can be easily combined and reused. Databricks has recently\n", + "announced the [Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html) . The idea behind the marketplace is in\n", + "line with the original definition of data product by DJ Patel. The marketplace\n", + "will support sharing of data sets, notebooks, dashboards, and machine learning\n", + "models. The critical building block for such a marketplace is the concept of\n", + "Delta Sharing — the scalable, flexible and robust channel for sharing any data —\n", + "geospatial data included.\n", + "\n", + "\n", + "Designing scalable data products that will live in the marketplace is crucial.\n", + "In order to maximize the value add of each data product one should strongly\n", + "consider FAIR principles and the product value pyramid. Without these guiding\n", + "principles we will only increase the issues that are already present in the\n", + "current systems. 
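To ground the Delta Sharing mechanics described earlier, here is a minimal sketch of how a consumer might read a shared geospatial data product with the open-source delta-sharing Python client; the profile file path and the share, schema and table names are all hypothetical:

```
import delta_sharing

# The provider distributes a profile file containing the sharing server endpoint
# and a bearer token; the consumer never needs a custom-built REST API.
profile = "/dbfs/FileStore/shares/open-datasets.share"        # hypothetical path

# URL format: <profile>#<share>.<schema>.<table>
table_url = profile + "#geo_products.gold.h3_indexed_pings"   # hypothetical names

# Small tables can be pulled straight into pandas...
pdf = delta_sharing.load_as_pandas(table_url)
print(pdf.head())

# ...or loaded as a Spark DataFrame when the connector is installed on a cluster.
sdf = delta_sharing.load_as_spark(table_url)
```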
Each data product should solve a unique problem and should\n", + "solve it in a simple, reproducible and robust way.\n", + "\n", + "**You can read more on how Databricks Lakehouse**\n", + "**Platform can help you accelerate time to value from**\n", + "**your data products in the eBook:** **[A New Approach](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)**\n", + "**[to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)** **.**\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.7 \u0007\n", + "\n", + "**Data Lineage With Unity Catalog**\n", + "\n", + "by **P A U L R O O M E , TA O F E N G A N D S A C H I N T H A K U R**\n", + "\n", + "June 8, 2022\n", + "\n", + "\n", + "This blog will discuss the importance of data lineage, some of the common\n", + "use cases, our vision for better data transparency and data understanding with\n", + "data lineage.\n", + "\n", + "**What is data lineage and why is it important?**\n", + "\n", + "Data lineage describes the transformations and refinements of data from source\n", + "to insight. Lineage includes capturing all the relevant metadata and events\n", + "associated with the data in its lifecycle, including the source of the data set,\n", + "what other data sets were used to create it, who created it and when, what\n", + "transformations were performed, what other data sets leverage it, and many other\n", + "events and attributes. With a data lineage solution, data teams get an end-to-end\n", + "view of how data is transformed and how it flows across their data estate.\n", + "\n", + "As more and more organizations embrace a data-driven culture and set up\n", + "processes and tools to democratize and scale data and AI, data lineage is\n", + "becoming an essential pillar of a pragmatic data management and governance\n", + "strategy.\n", + "\n", + "To understand the importance of data lineage, we have highlighted some of the\n", + "common use cases we have heard from our customers below.\n", + "\n", + "\n", + "**Impact analysis**\n", + "Data goes through multiple updates or revisions over its lifecycle, and\n", + "understanding the potential impact of any data changes on downstream\n", + "consumers becomes important from a risk management standpoint. With data\n", + "lineage, data teams can see all the downstream consumers — applications,\n", + "dashboards, machine learning models or data sets, etc. — impacted by data\n", + "changes, understand the severity of the impact, and notify the relevant\n", + "stakeholders. Lineage also helps IT teams proactively communicate data\n", + "migrations to the appropriate teams, ensuring business continuity.\n", + "\n", + "**Data understanding and transparency**\n", + "Organizations deal with an influx of data from multiple sources, and building\n", + "a better understanding of the context around data is paramount to ensure\n", + "the trustworthiness of the data. Data lineage is a powerful tool that enables\n", + "data leaders to drive better transparency and understanding of data in their\n", + "organizations. Data lineage also empowers data consumers such as data scientists,\n", + "data engineers and data analysts to be context-aware as they perform analyses,\n", + "resulting in better quality outcomes. 
Finally, data stewards can see which data sets\n", + "are no longer accessed or have become obsolete to retire unnecessary data and\n", + "ensure data quality for end business users .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Debugging and diagnostics**\n", + "You can have all the checks and balances in place, but something will eventually\n", + "break. Data lineage helps data teams perform a root cause analysis of any errors\n", + "in their data pipelines, applications, dashboards, machine learning models, etc.,\n", + "by tracing the error to its source. This significantly reduces the debugging time,\n", + "saving days, or in many cases, months of manual effort.\n", + "\n", + "**Compliance and audit readiness**\n", + "Many compliance regulations, such as the General Data Protection Regulation\n", + "(GDPR), California Consumer Privacy Act (CCPA), Health Insurance Portability and\n", + "Accountability Act (HIPPA), Basel Committee on Banking Supervision (BCBS) 239,\n", + "and Sarbanes-Oxley Act (SOX), require organizations to have clear understanding\n", + "and visibility of data flow. As a result, data traceability becomes a key requirement\n", + "in order for their data architecture to meet legal regulations. Data lineage helps\n", + "organizations be compliant and audit-ready, thereby alleviating the operational\n", + "overhead of manually creating the trails of data flows for audit reporting purposes.\n", + "\n", + "\n", + "**Effortless transparency and proactive control with**\n", + "**data lineage**\n", + "\n", + "The [lakehouse](https://www.databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) provides a pragmatic data management architecture that\n", + "substantially simplifies enterprise data infrastructure and accelerates innovation\n", + "by unifying your data warehousing and AI use cases on a single platform.\n", + "We believe data lineage is a key enabler of better data transparency and data\n", + "understanding in your lakehouse, surfacing the relationships between data,\n", + "jobs, and consumers, and helping organizations move toward proactive data\n", + "management practices. For example:\n", + "\n", + "**•** As the owner of a dashboard, do you want to be notified next time that a\n", + "table your dashboard depends upon wasn’t loaded correctly?\n", + "\n", + "**•** As a machine learning practitioner developing a model, do you want to be\n", + "alerted that a critical feature in your model will be deprecated soon?\n", + "\n", + "**•** As a governance admin, do you want to automatically control access to\n", + "data based on its provenance?\n", + "\n", + "All of these capabilities rely upon the automatic collection of data lineage across\n", + "all use cases and personas — which is why the lakehouse and data lineage are a\n", + "powerful combination.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data lineage for tables\n", + "\n", + "Data lineage for table columns\n", + "\n", + "\n", + "Data Lineage for notebooks, workflows, dashboards\n", + "\n", + "**Built-in security:** Lineage graphs in Unity Catalog are privilege-aware and share\n", + "the same permission model as Unity Catalog. 
If users do not have access to\n", + "a table, they will not be able to explore the lineage associated with the table,\n", + "adding an additional layer of security for privacy considerations.\n", + "\n", + "**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\n", + "in near real-time, and retrieved via REST API to support integrations with our\n", + "catalog partners.\n", + "\n", + "**Getting started with data lineage in Unity Catalog**\n", + "\n", + "Data lineage is available with Databricks Premium and Enterprise tiers for\n", + "no additional cost. If you already are a Databricks customer, follow the data\n", + "lineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. If you are not an existing Databricks\n", + "customer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.8\n", + "\n", + "**Easy Ingestion to Lakehouse With COPY INTO**\n", + "\n", + "by **A E M R O A M A R E , E M M A L I U , A M I T K A R A** and **J A S R A J D A N G E**\n", + "\n", + "January 17, 2023\n", + "\n", + "\n", + "A new data management architecture known as the data lakehouse emerged\n", + "independently across many organizations and use cases to support AI and BI\n", + "directly on vast amounts of data. One of the key success factors for using the\n", + "data lakehouse for analytics and machine learning is the ability to quickly and\n", + "easily ingest data of various types, including data from on-premises storage\n", + "platforms (data warehouses, mainframes), real-time streaming data, and bulk\n", + "data assets.\n", + "\n", + "As data ingestion into the lakehouse is an ongoing process that feeds the\n", + "proverbial ETL pipeline, you will need multiple options to ingest various formats,\n", + "types and latency of data. For data stored in cloud object stores such as AWS\n", + "S3, Google Cloud Storage and Azure Data Lake Storage, Databricks offers\n", + "Auto Loader, a natively integrated feature, that allows data engineers to ingest\n", + "millions of files from the cloud storage continuously. In other streaming cases\n", + "\n", + "(e.g., IoT sensor or clickstream data), Databricks provides native connectors\n", + "for Apache Spark Structured Streaming to quickly ingest data from popular\n", + "message queues, such as [Apache Kafka](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html?_ga=2.117268486.126296912.1643033657-734003504.1641217794) , Azure Event Hubs or AWS Kinesis at low\n", + "latencies. 
Furthermore, many customers can leverage popular ingestion tools\n", + "\n", + "\n", + "that integrate with Databricks, such as Fivetran — to easily ingest data from\n", + "enterprise applications, databases, mainframes and more into the lakehouse.\n", + "Finally, analysts can use the simple “COPY INTO” command to pull new data into\n", + "the lakehouse automatically, without the need to keep track of which files have\n", + "already been processed.\n", + "\n", + "This blog focuses on COPY INTO, a simple yet powerful SQL command that allows\n", + "you to perform batch file ingestion into Delta Lake from cloud object stores.\n", + "It’s idempotent, which guarantees to ingest files with exactly-once semantics\n", + "when executed multiple times, supporting incremental appends and simple\n", + "transformations. It can be run once, in an ad hoc manner, and can be scheduled\n", + "through Databricks Workflows. In recent Databricks [Runtime releases](https://docs.databricks.com/release-notes/runtime/releases.html) , COPY\n", + "INTO introduced new functionalities for data preview, validation, enhanced error\n", + "handling, and a new way to copy into a schemaless Delta Lake table so that users\n", + "\n", + "can get started quickly, completing the end-to-end user journey to ingest from\n", + "cloud object stores. Let’s take a look at the popular COPY INTO use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**1. Ingesting data for the first time**\n", + "\n", + "\n", + "The default for data validation is to parse all the data in the source directory to\n", + "ensure that there aren’t any issues, but the rows returned for preview are limited.\n", + "Optionally, you can provide the number of rows to preview after VALIDATE.\n", + "\n", + "The COPY_OPTION “mergeSchema” specifies that it is okay to evolve the schema\n", + "of your target Delta table. Schema evolution only allows the addition of new\n", + "columns, and does not support data type changes for existing columns. In other\n", + "use cases, you can omit this option if you intend to manage your table schema\n", + "more strictly as your data pipeline may have strict schema requirements and\n", + "may not want to evolve the schema at all times. However, our target Delta table\n", + "in the example above is an empty, columnless table at the moment; therefore,\n", + "we have to specify the COPY_OPTION “mergeSchema” here.\n", + "\n", + "Figure 1: COPY INTO VALIDATE mode output\n", + "\n", + "\n", + "COPY INTO requires a table to exist as it ingests the data into a target Delta\n", + "table. However, you have no idea what your data looks like. You first create an\n", + "empty Delta table.\n", + "```\n", + " CREATE TABLE my_example_data;\n", + "\n", + "```\n", + "Before you write out your data, you may want to preview it and ensure the\n", + "data looks correct. 
The COPY INTO Validate mode is a new feature in\n", + "Databricks Runtime [10.3](https://docs.databricks.com/release-notes/runtime/10.3.html) and above that allows you to preview and validate\n", + "source data before ingesting many files from the cloud object stores.\n", + "These validations include:\n", + "\n", + "**•** if the data can be parsed\n", + "\n", + "**•** the schema matches that of the target table or if the schema\n", + "needs to be evolved\n", + "\n", + "**•** all nullability and check constraints on the table are met\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "\n", + "-----\n", + "\n", + "**2. Configuring COPY INTO**\n", + "\n", + "\n", + "Figure 2 shows the validate output that the header is properly parsed.\n", + "\n", + "Figure 2: COPY INTO VALIDATE mode output with enabled header and inferSchema\n", + "\n", + "**3. Appending data to a Delta table**\n", + "\n", + "Now that the preview looks good, we can remove the VALIDATE keyword and\n", + "execute the COPY INTO command.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n", + "'true' )\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "\n", + "When looking over the results of VALIDATE (see Figure 1), you may notice that\n", + "your data doesn’t look like what you want. Aren’t you glad you previewed your\n", + "data set first? The first thing you notice is the column names are not what is\n", + "specified in the CSV header. What’s worse, the header is shown as a row in your\n", + "data. You can configure the CSV parser by specifying FORMAT_OPTIONS.\n", + "Let’s add those next.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n", + "'true' )\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "When using the FORMAT OPTION, you can tell COPY INTO to infer the data types\n", + "of the CSV file by specifying the inferSchema option; otherwise, all default\n", + "data types are STRINGs. On the other hand, binary file formats like AVRO and\n", + "PARQUET do not need this option since they define their own schema. Another\n", + "\n", + "option, “mergeSchema” states that the schema should be inferred over a\n", + "comprehensive sample of CSV files rather than just one. The comprehensive list\n", + "of format-specific options can be found in the [documentation](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/delta-copy-into#format-options) .\n", + "\n", + "\n", + "-----\n", + "\n", + "COPY INTO keeps track of the state of files that\n", + "have been ingested. 
Unlike commands like INSERT\n", + "INTO, users get idempotency with COPY INTO,\n", + "which means users won’t get duplicate data in\n", + "the target table when running COPY INTO multiple\n", + "times from the same source data.\n", + "\n", + "COPY INTO can be run once, in an ad hoc manner,\n", + "and can be scheduled with Databricks Workflows.\n", + "While COPY INTO does not support low latencies\n", + "for ingesting natively, you can trigger COPY INTO\n", + "through orchestrators like Apache Airflow.\n", + "\n", + "\n", + "Figure 3: Databricks workflow UI to schedule a task\n", + "\n", + "\n", + "-----\n", + "\n", + "**4. Secure data access with COPY INTO**\n", + "\n", + "COPY INTO supports secure access in several ways. In this section, we want to\n", + "highlight two new options you can use in both [Databricks SQL](https://dbricks.co/dbsql) and notebooks\n", + "from recent releases:\n", + "\n", + "**Unity Catalog**\n", + "With the general availability of Databrick Unity Catalog, you can use COPY INTO\n", + "to ingest data to Unity Catalog managed or external tables from any source and\n", + "file format supported by COPY INTO. Unity Catalog also adds new options for\n", + "configuring secure access to raw data, allowing you to use Unity Catalog external\n", + "locations or storage credentials to access data in cloud object storage. Learn\n", + "more about how to use [COPY INTO with Unity Catalog](https://docs.databricks.com/ingestion/copy-into/unity-catalog.html#use-copy-into-to-load-data-with-unity-catalog) .\n", + "\n", + "**Temporary Credentials**\n", + "What if you have not configured Unity Catalog or instance profile? How about\n", + "data from a trusted third party bucket? Here is a convenient COPY INTO feature\n", + "that allows you to [ingest data with inline temporary credentials](https://docs.databricks.com/ingestion/copy-into/temporary-credentials.html) to handle the ad\n", + "hoc bulk ingestion use case.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath' WITH (\n", + "CREDENTIAL (AWS_ACCESS_KEY `=` '...' , AWS_SECRET_KEY `=` '...' , AWS_SESSION_\n", + "TOKEN `=` '...' )\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "\n", + "\n", + "**5. Filtering files for ingestion**\n", + "\n", + "What about ingesting a subset of files where the filenames match a pattern? You\n", + "can apply glob patterns — a glob pattern that identifies the files to load from the\n", + "source directory. For example, let’s filter and ingest files which contain the word\n", + "`raw_data` in the filename below.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*raw_data*.csv'\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' )\n", + "\n", + "**6. Ingest files in a time period**\n", + "\n", + "In data engineering, it is frequently necessary to ingest files that have been\n", + "modified before or after a specific timestamp. Data between two timestamps\n", + "may also be of interest. The ‘modifiedAfter’ and ‘modifiedBefore’ format options\n", + "offered by COPY INTO allow users to ingest data from a chosen time window into\n", + "a Delta table.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*raw_data_*.csv'\n", + "FORMAT_OPTIONS( '2022-0912T10:53:11.000+0000' 'header' ) `=` 'true' , 'modifiedAfter' `=`\n", + "\n", + "\n", + "-----\n", + "\n", + "**7. 
-----

**7. Correcting data with the force option**

Because COPY INTO is by default idempotent, running the same query against the same source files more than once has no effect on the destination table after the initial execution. In real-world circumstances, however, source data files in cloud object storage may be altered for correction at a later time, and those changes must be propagated to the target table. In such a case, it is possible to first erase the data from the target table and then ingest the more recent data files from the source. For this operation you only need to set the copy option ‘force’ to ‘true’.

COPY INTO my_example_data
FROM 's3://my-bucket/exampleDataPath'
FILEFORMAT `=` CSV
PATTERN `=` '*raw_data_2022*.csv'
FORMAT_OPTIONS( 'header' `=` 'true' )
COPY_OPTIONS ( 'force' `=` 'true' )


**8. Applying simple transformations**

What if you want to rename columns? Or the source data has changed and a previous column has been renamed to something else? You don’t want to ingest that data as two separate columns, but as a single column. We can leverage the SELECT statement in COPY INTO to perform simple transformations.

COPY INTO demo.my_example_data
FROM ( SELECT concat(first_name, " ", last_name) as full_name,
`*` EXCEPT (first_name, last_name)
FROM 's3://my-bucket/exampleDataPath'
)
FILEFORMAT `=` CSV
PATTERN `=` '*.csv'
FORMAT_OPTIONS( 'header' `=` 'true' )
COPY_OPTIONS ( 'force' `=` 'true' )

**9. Error handling and observability with COPY INTO**

**Error handling:**
How about ingesting data with file corruption issues? Common examples of file corruption are:

**•** Files with an incorrect file format

**•** Failure to decompress

**•** Unreadable files (e.g., invalid Parquet)


-----

COPY INTO’s format option ignoreCorruptFiles helps skip those files while processing. The result of the COPY INTO command returns the number of files skipped in the num_skipped_corrupt_files column. In addition, these corrupt files aren’t tracked by the ingestion state in COPY INTO, therefore they can be reloaded in a subsequent execution once the corruption is fixed. This option is available in Databricks [Runtime 11.0+](https://docs.databricks.com/release-notes/runtime/11.0.html) .

You can see which files have been detected as corrupt by running COPY INTO in VALIDATE mode.

COPY INTO my_example_data
FROM 's3://my-bucket/exampleDataPath'
FILEFORMAT `=` CSV
VALIDATE ALL
FORMAT_OPTIONS( 'ignoreCorruptFiles' `=` 'true' )

**Observability:**
In Databricks Runtime 10.5, the [file metadata column](https://docs.databricks.com/ingestion/file-metadata-column.html) was introduced to provide input file metadata information, which allows users to monitor and get key properties of the ingested files, like path, name, size and modification time, by querying a hidden STRUCT column called _metadata. 
To include this information\n", + "in the destination, you must explicitly reference the _metadata column in your\n", + "query in COPY INTO.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM (\n", + "SELECT `*` , _metadata source_metadata FROM 's3://my-bucket/\n", + "exampleDataPath'\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "\n", + "\n", + "**How does it compare to Auto Loader?**\n", + "\n", + "COPY INTO is a simple and powerful command to use when your source\n", + "directory contains a small number of files (i.e., thousands of files or less), and if\n", + "you prefer SQL. In addition, COPY INTO can be used over JDBC to push data into\n", + "Delta Lake at your convenience, a common pattern by many ingestion partners.\n", + "To ingest a larger number of files both in streaming and batch we recommend\n", + "using [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) . In addition, for a modern data pipeline based on [medallion](https://www.databricks.com/glossary/medallion-architecture)\n", + "[architecture](https://www.databricks.com/glossary/medallion-architecture) , we recommend using Auto Loader in [Delta Live Tables pipelines](https://docs.databricks.com/ingestion/auto-loader/dlt.html) ,\n", + "leveraging advanced capabilities of automatic error handling, quality control,\n", + "data lineage and setting [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html) in a declarative approach.\n", + "\n", + "**How to get started?**\n", + "\n", + "To get started, you can go to **[Databricks SQL](https://dbricks.co/dbsql)** query editor, update and run the\n", + "example SQL commands to ingest from your cloud object stores. Check out\n", + "the options in No. 4 to establish secure access to your data for querying it in\n", + "Databricks SQL. To get familiar with COPY INTO in Databricks SQL, you can also\n", + "follow this [quickstart tutorial.](https://docs.databricks.com/ingestion/copy-into/tutorial-dbsql.html)\n", + "\n", + "As an alternative, you can use this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/db-385-demo_copy_into.html) in Data Science & Engineering and\n", + "Machine Learning workspaces to learn most of the COPY INTO features in this\n", + "blog, where source data and target Delta tables are generated in DBFS.\n", + "\n", + "More tutorials for COPY INTO can be found [here](https://docs.databricks.com/ingestion/copy-into/index.html) .\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.9 \u0007\n", + "\n", + "**Simplifying Change Data Capture With Databricks Delta Live Tables**\n", + "\n", + "by **M O J G A N M A Z O U C H I**\n", + "\n", + "April 25, 2022\n", + "\n", + "\n", + "This guide will demonstrate how you can leverage change data capture in Delta\n", + "Live Tables pipelines to identify new records and capture changes made to the\n", + "data set in your data lake. 
Delta Live Tables pipelines enable you to develop scalable, reliable and low-latency data pipelines, while performing change data capture in your data lake with minimal required compute resources and seamless out-of-order data handling.

**Note:** We recommend following [Getting Started with Delta Live Tables](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) , which explains creating scalable and reliable pipelines using Delta Live Tables (DLT) and its declarative ETL definitions.

**Background on change data capture**

Change data capture ( [CDC](https://en.wikipedia.org/wiki/Change_data_capture) ) is a process that identifies and captures incremental changes (data deletes, inserts and updates) in databases, like tracking customer, order or product status for near-real-time data applications. CDC provides real-time data evolution by processing data in a continuous incremental fashion as new events occur.


Since [over 80% of organizations plan on implementing multicloud strategies by 2025](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/) , choosing the right approach for your business that allows seamless real-time centralization of all data changes in your ETL pipeline across multiple environments is critical.

By capturing CDC events, Databricks users can re-materialize the source table as a Delta table in the lakehouse and run their analysis on top of it, while being able to combine data with external systems. The MERGE INTO command in Delta Lake on Databricks enables customers to efficiently upsert and delete records in their data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) . This is a common use case we observe: many Databricks customers leverage Delta Lake this way to keep their data lakes up to date with real-time business data.

While Delta Lake provides a complete solution for real-time CDC synchronization in a data lake, we are now excited to announce the change data capture feature in Delta Live Tables that makes your architecture even simpler, more efficient and scalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.

Earlier CDC solutions with Delta tables used the MERGE INTO operation, which requires manually ordering the data to avoid failure when multiple rows of the source data set match while attempting to update the same rows of the target Delta table.


-----

To handle the out-of-order data, there was an extra step required to preprocess the source table using a foreachBatch implementation to eliminate the possibility of multiple matches, retaining only the latest change for each key (see the [change data capture example](https://www.databricks.com/blog/2022/04/25/simplifying-change-data-capture-with-databricks-delta-live-tables.html#) ). A sketch of this earlier pattern is shown below. 
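For illustration only, the sketch below shows that earlier MERGE-based pattern, assuming hypothetical `customers` (target) and `cdc_updates` (CDC feed) tables with matching columns; the inner query keeps only the latest change per key before the merge:

```
-- Deduplicate the feed so each id contributes only its most recent change,
-- then merge that change into the target table.
MERGE INTO customers AS t
USING (
  SELECT * EXCEPT (rn)
  FROM (
    SELECT *,
           ROW_NUMBER() OVER (PARTITION BY id ORDER BY operation_date DESC) AS rn
    FROM cdc_updates
  )
  WHERE rn = 1
) AS s
ON t.id = s.id
WHEN MATCHED AND s.operation = 'DELETE' THEN DELETE
WHEN MATCHED THEN UPDATE SET *
WHEN NOT MATCHED AND s.operation != 'DELETE' THEN INSERT *
```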
The new APPLY CHANGES INTO\n", + "operation in DLT pipelines automatically and seamlessly handles out-of-order\n", + "data without any need for data engineering manual intervention.\n", + "\n", + "**CDC with Databricks Delta Live Tables**\n", + "\n", + "In this blog, we will demonstrate how to use the APPLY CHANGES INTO command\n", + "in Delta Live Tables pipelines for a common CDC use case where the CDC data\n", + "is coming from an external system. A variety of CDC tools are available such\n", + "as Debezium, Fivetran, Qlik Replicate, Talend, and StreamSets. While specific\n", + "implementations differ, these tools generally capture and record the history\n", + "of data changes in logs; downstream applications consume these CDC logs. In\n", + "our example, data is landed in cloud object storage from a CDC tool such as\n", + "Debezium, Fivetran, etc.\n", + "\n", + "We have data from various CDC tools landing in a cloud object storage or a\n", + "message queue like Apache Kafka. Typically we see CDC used in an ingestion\n", + "to what we refer as the medallion architecture. A medallion architecture is a\n", + "data design pattern used to logically organize data in a Lakehouse, with the\n", + "goal of incrementally and progressively improving the structure and quality of\n", + "data as it flows through each layer of the architecture. Delta Live Tables allows\n", + "you to seamlessly apply changes from CDC feeds to tables in your Lakehouse;\n", + "combining this functionality with the medallion architecture allows for\n", + "\n", + "\n", + "incremental changes to easily flow through analytical workloads at scale. Using\n", + "CDC together with the medallion architecture provides multiple benefits to users\n", + "since only changed or added data needs to be processed. Thus, it enables users\n", + "to cost-effectively keep Gold tables up-to-date with the latest business data.\n", + "\n", + "**NOTE:** The example here applies to both SQL and Python versions of CDC\n", + "and also on a specific way to use the operations; to evaluate variations,\n", + "please see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n", + "\n", + "**Prerequisites**\n", + "\n", + "To get the most out of this guide, you should have a basic familiarity with:\n", + "\n", + "**•** SQL or Python\n", + "\n", + "**•** Delta Live Tables\n", + "\n", + "**•** Developing ETL pipelines and/or working with Big Data systems\n", + "\n", + "**•** Databricks interactive notebooks and clusters\n", + "\n", + "**•** You must have access to a Databricks Workspace with permissions\n", + "to create new clusters, run jobs, and save data to a location on\n", + "external cloud object storage or [DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)\n", + "\n", + "**•** For the pipeline we are creating in this blog, “Advanced” product\n", + "edition which supports enforcement of data quality constraints,\n", + "needs to be selected\n", + "\n", + "\n", + "-----\n", + "\n", + "**The data set**\n", + "\n", + "Here we are consuming realistic looking CDC data from an external database. In\n", + "this pipeline, we will use the [Faker](https://github.com/joke2k/faker) library to generate the data set that a CDC\n", + "tool like Debezium can produce and bring into cloud storage for the initial ingest\n", + "in Databricks. 
Using [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) we incrementally load the messages from cloud object storage into the Bronze table, which stores the raw messages. Bronze tables are intended for data ingestion and enable quick access to a single source of truth. Next we perform APPLY CHANGES INTO from the cleaned Bronze layer table to propagate the updates downstream to the Silver table. As data flows to Silver tables, it generally becomes more refined and optimized (“just-enough”) to provide the enterprise a view of all its key business entities. See the diagram below.


This blog focuses on a simple example that requires a JSON message with four customer fields (name, email, address and id), along with two fields that describe the changed data: operation (which stores the operation code: DELETE, APPEND, UPDATE or CREATE) and operation_date (which stores the date and timestamp of the record for each operation action).

To generate a sample data set with the above fields, we are using a Python package that generates fake data, Faker. You can find the notebook related to this data generation section [here](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/1-cdc-data-generator.html) . In this notebook we provide the name and storage location to write the generated data there. We are using the DBFS functionality of Databricks; see the [DBFS documentation](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) to learn more about how it works. Then, we use a PySpark user-defined function to generate the synthetic data set for each field, and write the data back to the defined storage location, which we will refer to in other notebooks for accessing the synthetic data set.

**Ingesting the raw data set using Auto Loader**

According to the medallion architecture paradigm, the Bronze layer holds the rawest data. At this stage we can incrementally read new data using Auto Loader from a location in cloud storage. Here we are adding the path to our generated data set to the configuration section under pipeline settings, which allows us to load the source path as a variable. So now our configuration under pipeline settings looks like below:

"configuration" : {
"source" : "/tmp/demo/cdc_raw"
}


-----

Then we load this configuration property in our notebooks.

Let’s take a look at the Bronze table we will ingest: (a) in SQL, and (b) using Python.

**A . S Q L**

SET spark.source;
CREATE STREAMING LIVE TABLE customer_bronze
(
address string ,
email string ,
id string ,
firstname string ,
lastname string ,
operation string ,
operation_date string ,
_rescued_data string
)
TBLPROPERTIES ( "quality" = "bronze" )
COMMENT "New customer data incrementally ingested from cloud object storage landing zone"
AS
SELECT *
FROM cloud_files( "${source}/customers" , "json" , map( "cloudFiles.inferColumnTypes" , "true" ));
**B . P Y T H O N**

```
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

source = spark.conf.get("source")

@dlt.table(name="customer_bronze",
           comment="New customer data incrementally ingested from cloud object storage landing zone",
           table_properties={
             "quality": "bronze"
           })
def customer_bronze():
  return (
    spark.readStream.format("cloudFiles")
      .option("cloudFiles.format", "json")
      .option("cloudFiles.inferColumnTypes", "true")
      .load(f"{source}/customers")
  )
```

The above statements use Auto Loader to create a streaming live table called customer_bronze from JSON files. When using Auto Loader in Delta Live Tables, you do not need to provide any location for schema or checkpoint, as those locations will be managed automatically by your DLT pipeline.

Auto Loader provides a Structured Streaming source called cloud_files in SQL and cloudFiles in Python, which takes a cloud storage path and format as parameters.

To reduce compute costs, we recommend running the DLT pipeline in Triggered mode as a micro-batch, assuming you do not have very low latency requirements.


-----

**Expectations and high-quality data**

In the next step to create a high-quality, diverse, and accessible data set, we impose quality check expectation criteria using Constraints. Currently, a constraint can be either retain, drop, or fail. For more detail see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html) . All constraints are logged to enable streamlined quality monitoring.

**A . S Q L**

CREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(
CONSTRAINT valid_id EXPECT (id IS NOT NULL ) ON VIOLATION DROP ROW ,
CONSTRAINT valid_address EXPECT (address IS NOT NULL ),
CONSTRAINT valid_operation EXPECT (operation IS NOT NULL ) ON VIOLATION DROP ROW
)
TBLPROPERTIES ("quality" `=` "silver")
COMMENT "Cleansed bronze customer view (i.e. what will become Silver)"
AS SELECT `*`
FROM STREAM(LIVE.customer_bronze);

**B . P Y T H O N**

```
@dlt.view(name="customer_bronze_clean_v",
          comment="Cleansed bronze customer view (i.e. what will become Silver)")
@dlt.expect_or_drop("valid_id", "id IS NOT NULL")
@dlt.expect("valid_address", "address IS NOT NULL")
@dlt.expect_or_drop("valid_operation", "operation IS NOT NULL")
def customer_bronze_clean_v():
  return dlt.read_stream("customer_bronze") \
    .select("address", "email", "id", "firstname", "lastname",
            "operation", "operation_date", "_rescued_data")
```

**Using APPLY CHANGES INTO statement to propagate changes to**

**downstream target table**

Prior to executing the Apply Changes Into query, we must ensure that a target streaming table which we want to hold the most up-to-date data exists. If it does not exist we need to create one. The cells below are examples of creating a target streaming table. Note that at the time of publishing this blog, the target streaming table creation statement is required along with the Apply Changes Into query, and both need to be present in the pipeline; otherwise your table creation query will fail.

**A . S Q L**

CREATE STREAMING LIVE TABLE customer_silver
TBLPROPERTIES ("quality" `=` "silver")
COMMENT "Clean, merged customers";

**B . P Y T H O N**

```
dlt.create_target_table(name="customer_silver",
                        comment="Clean, merged customers",
                        table_properties={
                          "quality": "silver"
                        })
```

-----

Now that we have a target streaming table, we can propagate changes to the downstream target table using the Apply Changes Into query. While the CDC feed comes with INSERT, UPDATE and DELETE events, DLT’s default behavior is to apply INSERT and UPDATE events from any record in the source data set matching on primary keys, and sequenced by a field which identifies the order of events. More specifically, it updates any row in the existing target table that matches the primary key(s) or inserts a new row when a matching record does not exist in the target streaming table. We can use APPLY AS DELETE WHEN in SQL, or its equivalent apply_as_deletes argument in Python, to handle DELETE events.

In this example we used "id" as the primary key, which uniquely identifies the customers and allows CDC events to apply to those identified customer records in the target streaming table. Since "operation_date" keeps the logical order of CDC events in the source data set, we use "SEQUENCE BY operation_date" in SQL, or its equivalent "sequence_by = col("operation_date")" in Python, to handle change events that arrive out of order. Keep in mind that the field value we use with SEQUENCE BY (or sequence_by) should be unique among all updates to the same key. In most cases, the sequence by column will be a column with timestamp information.

Finally, we used "COLUMNS * EXCEPT (operation, operation_date, _rescued_data)" in SQL, or its equivalent "except_column_list = ["operation", "operation_date", "_rescued_data"]" in Python, to exclude the three columns "operation", "operation_date" and "_rescued_data" from the target streaming table. By default all the columns are included in the target streaming table when we do not specify the "COLUMNS" clause.


**A . S Q L**

APPLY CHANGES INTO LIVE.customer_silver
FROM stream(LIVE.customer_bronze_clean_v)
KEYS (id)
APPLY AS DELETE WHEN operation `=` "DELETE"
SEQUENCE BY operation_date
COLUMNS `*` EXCEPT (operation, operation_date, _rescued_data);

**B . 
P Y T H O N**\n", + "```\n", + " dlt.apply_changes(\n", + " target = \"customer_silver\",\n", + " source = \"customer_bronze_clean_v\",\n", + " keys = [\"id\"],\n", + " sequence_by = col(\"operation_date\"),\n", + " apply_as_deletes = expr(\"operation = 'DELETE'\"),\n", + " except_column_list = [\"operation\", \"operation_date\", \"_rescued_data\"])\n", + "\n", + "```\n", + "To check out the full list of available clauses see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#requirements) .\n", + "\n", + "Please note that, at the time of publishing this blog, a table that reads from the\n", + "target of an APPLY CHANGES INTO query or apply_changes function must be a\n", + "live table, and cannot be a streaming live table.\n", + "\n", + "A [SQL](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-retail-dlt-cdc-sql.html) and [Python](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-Retail_DLT_CDC_Python.html) notebook is available for reference for this section. Now that\n", + "we have all the cells ready, let’s create a pipeline to ingest data from cloud object\n", + "storage. Open Jobs in a new tab or window in your workspace, and select “Delta\n", + "Live Tables.”\n", + "\n", + "\n", + "-----\n", + "\n", + "The pipeline associated with this blog has the following DLT pipeline settings:\n", + "\n", + "{\n", + "\"clusters\" : [\n", + "{\n", + "\"label\" : \"default\" ,\n", + "\"num_workers\" : 1\n", + "}\n", + "],\n", + "\"development\" : true ,\n", + "\"continuous\" : false ,\n", + "\"edition\" : \"advanced\" ,\n", + "\"photon\" : false ,\n", + "\"libraries\" : [\n", + "{\n", + "\"notebook\" : {\n", + "\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\n", + "notebooks/1-CDC_DataGenerator\"\n", + "}\n", + "},\n", + "{\n", + "\"notebook\" : {\n", + "\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\n", + "notebooks/2-Retail_DLT_CDC_sql\"\n", + "}\n", + "}\n", + "],\n", + "\"name\" : \"CDC_blog\" ,\n", + "\"storage\" : \"dbfs:/home/mydir/myDB/dlt_storage\" ,\n", + "\"configuration\" : {\n", + "\"source\" : \"/tmp/demo/cdc_raw\" ,\n", + "\"pipelines.applyChangesPreviewEnabled\" : \"true\"\n", + "},\n", + "\"target\" : \"my_database\"\n", + "\n", + "\n", + "1. Select “Create Pipeline” to create a new pipeline\n", + "\n", + "2. Specify a name such as “Retail CDC Pipeline”\n", + "\n", + "3. Specify the Notebook Paths that you already created earlier, one for the\n", + "generated data set using Faker package, and another path for the ingestion\n", + "of the generated data in DLT. The second notebook path can refer to the\n", + "notebook written in SQL, or Python depending on your language of choice.\n", + "\n", + "4. To access the data generated in the first notebook, add the data set path in\n", + "configuration. Here we stored data in “/tmp/demo/cdc_raw/customers”, so\n", + "we set “source” to “/tmp/demo/cdc_raw/” to reference “source/customers” in\n", + "our second notebook.\n", + "\n", + "5. Specify the Target (which is optional and referring to the target database),\n", + "where you can query the resulting tables from your pipeline\n", + "\n", + "6. Specify the Storage Location in your object storage (which is optional), to\n", + "access your DLT produced data sets and metadata logs for your pipeline\n", + "\n", + "7. Set Pipeline Mode to Triggered. 
In Triggered mode, DLT pipeline will consume\n", + "new data in the source all at once, and once the processing is done it will\n", + "terminate the compute resource automatically. You can toggle between\n", + "Triggered and Continuous modes when editing your pipeline settings. Setting\n", + "“continuous”: false in the JSON is equivalent to setting the pipeline to\n", + "Triggered mode.\n", + "\n", + "8. For this workload you can disable the autoscaling under Autopilot Options,\n", + "and use only one worker cluster. For production workloads, we recommend\n", + "enabling autoscaling and setting the maximum numbers of workers needed\n", + "for cluster size.\n", + "\n", + "9. Select “Start”\n", + "\n", + "10. Your pipeline is created and running now!\n", + "\n", + "\n", + "-----\n", + "\n", + "You can check out our previous deep dive on the topic [here](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) . Try this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/3-retail-dlt-cdc-monitoring.html)\n", + "to see pipeline observability and data quality monitoring on the example DLT\n", + "pipeline associated with this blog.\n", + "\n", + "**Conclusion**\n", + "\n", + "In this blog, we showed how we made it seamless for users to efficiently\n", + "implement change data capture (CDC) into their lakehouse platform with Delta\n", + "Live Tables (DLT). DLT provides built-in quality controls with deep visibility into\n", + "pipeline operations, observing pipeline lineage, monitoring schema, and quality\n", + "checks at each step in the pipeline. DLT supports automatic error handling and\n", + "best in class auto-scaling capability for streaming workloads, which enables\n", + "users to have quality data with optimum resources required for their workload.\n", + "\n", + "Data engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n", + "[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. 
This new capability lets\n", + "your ETL pipelines easily identify changes and apply those changes across tens\n", + "of thousands of tables with low-latency support.\n", + "\n", + "**Ready to get started and try out CDC in Delta Live Tables for yourself?**\n", + "Please watch this [webinar](https://www.databricks.com/p/webinar/tackle-data-transformation) to learn how Delta Live Tables simplifies the\n", + "complexity of data transformation and ETL, and see our [Change data capture](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE)\n", + "[with Delta Live Tables](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE) document, official [github](https://github.com/databricks/delta-live-tables-notebooks) and follow the steps in this\n", + "[video](https://vimeo.com/700994477) to create your pipeline!\n", + "\n", + "\n", + "**DLT pipeline lineage observability and data quality**\n", + "**monitoring**\n", + "\n", + "All DLT pipeline logs are stored in the pipeline’s storage location. You can specify\n", + "your storage location only when you are creating your pipeline. Note that once\n", + "the pipeline is created you can no longer modify storage location.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.10 \u0007\n", + "\n", + "**Best Practices for Cross-Government Data Sharing**\n", + "\n", + "by **M I L O S C O L I C , P R I T E S H P AT E L , R O B E R T W H I F F I N , R I C H A R D J A M E S W I L S O N ,**\n", + "\n", + "**M A R C E L L F E R E N C Z** and **E D W A R D K E L LY**\n", + "\n", + "February 21, 2023\n", + "\n", + "\n", + "Government data exchange is the practice of sharing data between different\n", + "government agencies and often partners in commercial sectors. Government\n", + "can share data for various reasons, such as to improve government operations’\n", + "efficiency, provide better services to the public, or support research and policymaking. In addition, data exchange in the public sector can involve sharing with the\n", + "private sector or receiving data from the private sector. The considerations span\n", + "multiple jurisdictions and over almost all industries. In this blog, we will address the\n", + "needs disclosed as part of national data strategies and how modern technologies,\n", + "particularly Delta Sharing, Unity Catalog, and clean rooms, can help you design,\n", + "implement and manage a future-proof and sustainable data ecosystem.\n", + "\n", + "**Data sharing and public sector**\n", + "\n", + "“The miracle is this: the more we share the more we have.” — [Leonard Nimoy.](https://en.wikipedia.org/wiki/Leonard_Nimoy)\n", + "\n", + "Probably the quote about sharing that applies the most profoundly to the\n", + "topic of data sharing. 
To the extent that the purpose of sharing the data is to\n", + "create new information, new insights, and new data. The importance of data\n", + "sharing is even more amplified in the government context, where federation\n", + "\n", + "\n", + "between departments allows for increased focus. Still, the very same federation\n", + "introduces challenges around data completeness, data quality, data access,\n", + "security and control, [FAIR](https://en.wikipedia.org/wiki/FAIR_data) -ness of data, etc. These challenges are far from trivial\n", + "and require a strategic, multifaceted approach to be addressed appropriately.\n", + "Technology, people, process, legal frameworks, etc., require dedicated\n", + "consideration when designing a robust data sharing ecosystem.\n", + "\n", + "[The National Data Strategy](https://www.gov.uk/government/publications/uk-national-data-strategy/national-data-strategy) (NDS) by the UK government outlines five actionable\n", + "missions through which we can materialize the value of data for the citizen and\n", + "society-wide benefits.\n", + "\n", + "\n", + "-----\n", + "\n", + "It comes as no surprise that each and every one of the missions is strongly\n", + "related to the concept of data sharing, or more broadly, data access both within\n", + "and outside of government departments:\n", + "\n", + "**1. Unlocking the value of the data across the economy** — Mission 1 of the\n", + "NDS aims to assert government and the regulators as enablers of the value\n", + "extraction from data through the adoption of best practices. The UK data\n", + "economy was estimated to be near [£125 billion in 2021](https://www.gov.uk/government/publications/uks-digital-strategy/uk-digital-strategy) with an upwards trend.\n", + "In this context, it is essential to understand that the government-collected\n", + "and provided open data can be crucial for addressing many of the challenges\n", + "across all industries.\n", + "\n", + "For example, insurance providers can better assess the risk of insuring\n", + "properties by ingesting and integrating [Flood areas](https://environment.data.gov.uk/flood-monitoring/doc/reference#flood-areas) provided by [DEFRA](https://www.gov.uk/government/organisations/department-for-environment-food-rural-affairs) . On\n", + "the other hand, capital market investors could better understand the risk of\n", + "their investments by ingesting and integrating the [Inflation Rate Index](https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l55o/mm23) by [ONS](https://www.ons.gov.uk/) .\n", + "Reversely, it is crucial for regulators to have well-defined data access and\n", + "data sharing patterns for conducting their regulatory activities. This clarity\n", + "truly enables the economic actors that interact with government data.\n", + "\n", + "\n", + "**2. Securing a pro-growth and trusted data regime** — The key aspect of\n", + "Mission 2 is data trust, or more broadly, adherence to data quality norms.\n", + "Data quality considerations become further amplified for data sharing and\n", + "data exchange use cases where we are considering the whole ecosystem\n", + "at once, and quality implications transcend the boundaries of our own\n", + "platform. 
This is precisely why we have to adopt “data sustainability.” What\n", + "we mean by sustainable data products are data products that harness the\n", + "existing sources over reinvention of the same/similar assets, accumulation of\n", + "unnecessary data (data pollutants) and that anticipate future uses.\n", + "\n", + "Ungoverned and unbounded data sharing could negatively impact data\n", + "quality and hinder the growth and value of data. The quality of how the data\n", + "is shared should be a key consideration of data quality frameworks. For\n", + "this reason, we require a solid set of standards and best practices for data\n", + "sharing with governance and quality assurance built into the process and\n", + "technologies. Only this way can we ensure the sustainability of our data and\n", + "secure a pro-growth trusted data regime.\n", + "\n", + "\n", + "-----\n", + "\n", + "**3. Transforming government’s use of data to drive efficiency and improve**\n", + "**public services** — “By 2025 data assets are organized and supported as\n", + "products, regardless of whether they’re used by internal teams or external\n", + "customers… Data products continuously evolve in an agile manner to meet\n", + "the needs of consumers… these products provide data solutions that can\n", + "more easily and repeatedly be used to meet various business challenges and\n", + "reduce the time and cost of delivering new AI-driven capabilities.” —\n", + "[The data-driven enterprise of 2025](https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-data-driven-enterprise-of-2025) by McKinsey. AI and ML can be powerful\n", + "enablers of digital transformation for both the public and private sectors.\n", + "\n", + "AI, ML, reports, and dashboards are just a few examples of data products\n", + "and services that extract value from data. The quality of these solutions is\n", + "directly reflected in the quality of data used for building them and our ability\n", + "to access and leverage available data assets both internally and externally.\n", + "Whilst there is a vast amount of data available for us to build new intelligent\n", + "solutions for driving efficiency for better processes, better decision-making,\n", + "and better policies — there are numerous barriers that can trap the data,\n", + "such as legacy systems, data silos, fragmented standards, proprietary\n", + "formats, etc. Modeling data solutions as data products and standardizing\n", + "them to a unified format allows us to abstract such barriers and truly\n", + "leverage the data ecosystem.\n", + "\n", + "\n", + "**4. Ensuring the security and resilience of the infrastructure on which**\n", + "**data relies** — Reflecting on the vision of the year 2025 — this isn’t that far\n", + "from now and even in a not so distant future, we will be required to rethink\n", + "our approach to data, more specifically — what is our digital supply chain\n", + "infrastructure/data sharing infrastructure? Data and data assets are products\n", + "and should be managed as products. If data is a product, we need a coherent\n", + "and unified way of providing those products.\n", + "\n", + "If data is to be used across industries and across both private and public\n", + "sectors, we need an open protocol that drives adoption and habit generation.\n", + "To drive adoption, the technologies we use must be resilient, robust, trusted\n", + "and usable by/for all. 
Vendor lock-in, platform lock-in or cloud lock-in are all\n", + "boundaries to achieving this vision.\n", + "\n", + "**5. Championing the international flow of data** — Data exchange between\n", + "jurisdictions and across governments will likely be one of the most\n", + "transformative applications of data at scale. Some of the world’s toughest\n", + "challenges depend on the efficient exchange of data between governments\n", + "— prevention of criminal activities, counterterrorism activities, net-zero\n", + "emission goals, international trade, the list goes on and on. Some steps in\n", + "this direction are already materializing: the U.S. federal government and UK\n", + "government have agreed on data exchange for countering serious crime\n", + "activities. This is a true example of championing international flow data and\n", + "using data for good. It is imperative that for these use cases, we approach\n", + "data sharing from a security-first angle. Data sharing standards and protocols\n", + "need to adhere to security and privacy best practices.\n", + "\n", + "\n", + "-----\n", + "\n", + "While originally built with a focus on the UK government and how to better\n", + "integrate data as a key asset of a modern government, these concepts apply in\n", + "a much wider global public sector context. In the same spirit, the U.S. Federal\n", + "Government proposed the [Federal Data Strategy](https://strategy.data.gov/overview/) as a collection of principles,\n", + "practices, action steps and timeline through which government can leverage\n", + "the full value of Federal data for mission, service and the public good.\n", + "\n", + "The principles are grouped into three primary topics:\n", + "\n", + "**•** **Ethical governance** — Within the domain of ethics, the sharing of data\n", + "is a fundamental tool for promoting transparency, accountability and\n", + "explainability of decision-making. It is practically impossible to uphold\n", + "ethics without some form of audit conducted by an independent party.\n", + "Data (and metadata) exchange is a critical enabler for continuous robust\n", + "processes that ensure we are using the data for good and we are using data\n", + "we can trust.\n", + "\n", + "\n", + "\n", + "**•** **Conscious design** — These principles are strongly aligned with the idea of\n", + "data sustainability. The guidelines promote forward thinking around usability\n", + "and interoperability of the data and user-centric design principles of\n", + "sustainable data products.\n", + "\n", + "**•** **Learning culture** — Data sharing, or alternatively knowledge sharing, has\n", + "an important role in building a scalable learning ecosystem and learning\n", + "culture. Data is front and center of knowledge synthesis, and from a\n", + "scientific angle, data proves factual knowledge. 
Another critical component\n", + "of knowledge is the “Why?” and data is what we need to address the\n", + "“Why?” component of any decisions we make, which policy to enforce, who\n", + "to sanction, who to support with grants, how to improve the efficiency of\n", + "government services, how to better serve citizens and society.\n", + "\n", + "In contrast to afore discussed qualitative analysis of the value of data sharing\n", + "across governments, the European Commission forecasts the economic value\n", + "of the European data economy will [exceed €800 billion by 2027](https://commission.europa.eu/strategy-and-policy/priorities-2019-2024/europe-fit-digital-age/european-data-strategy_en) — roughly the\n", + "same size as the [Dutch economy in 2021](https://ec.europa.eu/eurostat/databrowser/view/NAMA_10_GDP/default/table?lang=en&category=na10.nama10.nama_10_ma) ! Furthermore, they predict more than 10\n", + "million data professionals in Europe alone. The technology and infrastructure to\n", + "support the data society have to be accessible to all, interoperable, extensible,\n", + "flexible and open. Imagine a world in which you’d need a different truck to\n", + "transport products between different warehouses because each road requires a\n", + "different set of tires — the whole supply chain would collapse. When it comes to\n", + "data, we often experience the “one set of tires for one road” paradox. Rest APIs\n", + "and data exchange protocols have been proposed in the past but have failed\n", + "to address the need for simplicity, ease of use and cost of scaling up with the\n", + "number of data products.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Sharing — the new data**\n", + "**highway**\n", + "\n", + "Delta Sharing provides an open protocol for\n", + "secure data sharing to any computing platform.\n", + "The protocol is based on Delta data format and is\n", + "agnostic concerning the cloud of choice.\n", + "\n", + "Delta is an open source data format that avoids\n", + "vendor, platform and cloud lock-in, thus fully\n", + "adhering to the principles of data sustainability,\n", + "conscious design of the U.S. Federal Data Strategy\n", + "and mission 4 of the UK National Data Strategy.\n", + "Delta provides a governance layer on top of the\n", + "Parquet data format. Furthermore, it provides many\n", + "performance optimizations not available in Parquet\n", + "out of the box. The openness of the data format\n", + "is a critical consideration. It is the main factor for\n", + "driving the habit generation and adoption of best\n", + "practices and standards.\n", + "\n", + "\n", + "-----\n", + "\n", + "Delta Sharing is a protocol based on a lean set of REST APIs to manage sharing,\n", + "permissions and access to any data asset stored in Delta or Parquet formats.\n", + "The protocol defines two main actors, the data provider (data supplier, data\n", + "owner) and the data recipient (data consumer). The recipient, by definition, is\n", + "agnostic to the data format at the source. 
Delta Sharing provides the necessary\n", + "abstractions for governed data access in many different languages and tools.\n", + "\n", + "Delta Sharing is uniquely positioned to answer many of the challenges of data\n", + "sharing in a scalable manner within the context of highly regulated domains like\n", + "the public sector:\n", + "\n", + "**• Privacy and security concerns** — Personally identifiable data or otherwise\n", + "sensitive or restricted data is a major part of the data exchange needs of a\n", + "data-driven and modernized government. Given the sensitive nature of such\n", + "data, it is paramount that the governance of data sharing is maintained in a\n", + "coherent and unified manner. Any unnecessary process and technological\n", + "complexities increase the risk of over-sharing data. With this in mind,\n", + "Delta Sharing has been designed with [security best practices](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html) from the\n", + "very inception. The protocol provides end-to-end encryption, short-lived\n", + "credentials, and accessible and intuitive audit and governance features. All\n", + "of these capabilities are available in a centralized way across all your Delta\n", + "tables across all clouds.\n", + "\n", + "**• Quality and accuracy** — Another challenge of data sharing is ensuring\n", + "that the data being shared is of high quality and accuracy. Given that\n", + "the underlying data is stored as Delta tables, we can guarantee that the\n", + "[transactional nature of data](https://docs.delta.io/latest/concurrency-control.html#concurrency-control) is respected; Delta ensures ACID properties\n", + "of data. Furthermore, Delta supports [data constraints](https://docs.delta.io/latest/delta-constraints.html#constraints) to guarantee data\n", + "\n", + "\n", + "quality requirements at storage. Unfortunately, other formats such as [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) ,\n", + "[CSVW](https://csvw.org/) , [ORC](https://www.google.com/search?q=orc+data+format&rlz=1C5GCEM_enGB931GB932&ei=CzHRY6KqI4S78gL7hoigCw&oq=ORC+da&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQARgAMgUIABCRAjIFCAAQkQIyBQgAEIAEMgUIABCABDIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjoKCAAQRxDWBBCwAzoHCAAQsAMQQzoNCAAQ5AIQ1gQQsAMYAToPCC4Q1AIQyAMQsAMQQxgCOgwILhDIAxCwAxBDGAI6FQguEMcBENEDENQCEMgDELADEEMYAjoECAAQQzoGCAAQChBDOgoIABCxAxCDARBDOgcIABCxAxBDSgQIQRgASgQIRhgBUCxY3AJg3QxoAXABeACAAW6IAbgCkgEDMC4zmAEAoAEByAETwAEB2gEGCAEQARgJ2gEGCAIQARgI&sclient=gws-wiz-serp) , [Avro](https://en.wikipedia.org/wiki/Apache_Avro) , [XML](https://en.wikipedia.org/wiki/XML) , etc., do not have such properties without significant\n", + "additional effort. The issue becomes even more emphasized by the fact\n", + "that data quality cannot be ensured in the same way on both the data\n", + "provider and data recipient side without the exact reimplementation of the\n", + "source systems. It is critical to embed quality and metadata together with\n", + "data to ensure quality travels together with data. Any decoupled approach\n", + "to managing data, metadata and quality separately increases the risk of\n", + "sharing and can lead to undesirable outcomes.\n", + "\n", + "**• Lack of standardization** — Another challenge of data sharing is the lack\n", + "of standardization in how data is collected, organized, and stored. This is\n", + "particularly pronounced in the context of governmental activities. 
While\n", + "governments have proposed standard formats (e.g., Office for National\n", + "Statistics [promotes usage of CSVW](https://www.ons.gov.uk/aboutus/transparencyandgovernance/datastrategy/datastandards#metadata-exchange) ), aligning all private and public\n", + "sector companies to standards proposed by such initiatives is a massive\n", + "challenge. Other industries may have different requirements for scalability,\n", + "interoperability, format complexity, lack of structure in data, etc. Most of\n", + "the currently advocated standards are lacking in multiple such aspects.\n", + "Delta is the most mature candidate for assuming the central role in the\n", + "standardization of data exchange format. It has been built as a transactional\n", + "and scalable data format, it supports structured, semi-structured and\n", + "unstructured data, it stores data schema and metadata together with data\n", + "and it provides a scalable enterprise-grade sharing protocol through Delta\n", + "Sharing. Finally, Delta is one of the most popular open source projects\n", + "in the ecosystem and, since May 2022, has surpassed [7 million monthly](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/)\n", + "[downloads](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**• Cultural and organizational barriers** — These challenges can be\n", + "summarized by one word: friction. Unfortunately, it’s a common problem\n", + "for civil servants to struggle to obtain access to both internal and external\n", + "data due to over-cumbersome processes, policies and outdated standards.\n", + "The principles we are using to build our data platforms and our data sharing\n", + "platforms have to be self-promoting, have to drive adoption and have to\n", + "generate habits that adhere to best practices.\n", + "\n", + "If there is friction with standard adoption, the only way to ensure standards\n", + "are respected is by enforcement and that itself is yet another barrier to\n", + "achieving data sustainability. Organizations have already adopted Delta\n", + "Sharing both in the private and public sectors. For example, [U.S. Citizenship](https://www.uscis.gov/)\n", + "[and Immigration Services](https://www.uscis.gov/) (USCIS) uses Delta Sharing to satisfy several\n", + "[interagency data-sharing](https://delta.io/blog/2022-12-08-data-sharing-across-government-delta-sharing/) requirements. Similarly, Nasdaq describes Delta\n", + "Sharing as the “ [future of financial data sharing,](https://www.nasdaq.com/articles/delta-sharing-protocol%3A-the-evolution-of-financial-data-sharing-2021-05-26) ” and that future is open\n", + "and governed.\n", + "\n", + "\n", + "\n", + "**• Technical challenges** — Federation at the government scale or even\n", + "further across multiple industries and geographies poses technical\n", + "challenges. Each organization within this federation owns its platform\n", + "and drives technological, architectural, platform and tooling choices.\n", + "\n", + "How can we promote interoperability and data exchange in this vast,\n", + "diverse technological ecosystem? The data is the only viable integration\n", + "vehicle. 
As long as the data formats we utilize are scalable, open and\n", + "governed, we can use them to abstract from individual platforms and\n", + "their intrinsic complexities.\n", + "\n", + "Delta format and Delta Sharing solve this wide array of requirements and\n", + "challenges in a scalable, robust and open way. This positions Delta Sharing\n", + "as the strongest choice for unification and simplification of the protocol and\n", + "mechanism through which we share data across both private and public sectors.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data Sharing through data clean rooms**\n", + "\n", + "\n", + "[Data clean rooms](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html) address this particular need. With data clean rooms you can\n", + "share data with third parties in a privacy-safe environment. With Unity Catalog ,\n", + "you can enable fine-grained access controls on the data and meet your privacy\n", + "requirements. In this architecture, the data participants never get access to\n", + "the raw data. The only outputs from the clean rooms are those data assets\n", + "generated in a pre-agreed, governed and fully controlled manner that ensures\n", + "compliance with the requirements of all parties involved.\n", + "\n", + "Finally, data clean rooms and Delta Sharing can address hybrid on-premise-offpremise deployments, where the data with the most restricted access remains\n", + "on the premise. In contrast, less restricted data is free to leverage the power\n", + "of the cloud offerings. In said scenario, there may be a need to combine the\n", + "power of the cloud with the restricted data to solve advanced use cases where\n", + "capabilities are unavailable on the on-premises data platforms. Data clean rooms\n", + "can ensure that no physical data copies of the raw restricted data are created,\n", + "results are produced within the clean room’s controlled environment and results\n", + "are shared back to the on-premises environment (if the results maintain the\n", + "restricted access within the defined policies) or are forwarded to any other\n", + "compliant and predetermined destination system.\n", + "\n", + "\n", + "Taking the complexities of data sharing within highly regulated space and the\n", + "public sector one step further — what if we require to share the knowledge\n", + "contained in the data without ever granting direct access to the source data to\n", + "external parties? These requirements may prove achievable and desirable where\n", + "the data sharing risk appetite is very low.\n", + "\n", + "In many public sector contexts, there are concerns that combining the data that\n", + "describes citizens could lead to a big brother scenario where simply too much\n", + "data about an individual is concentrated in a single data asset. If it were to fall\n", + "into the wrong hands, such a hypothetical data asset could lead to immeasurable\n", + "consequences for individuals and the trust in public sector services could\n", + "erode. On the other hand, the value of a 360 view of the citizen could accelerate\n", + "important decision-making. 
It could immensely improve the quality of policies\n", + "and services provided to the citizens.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Citizen value of data sharing**\n", + "\n", + "Every decision made by the government is a decision that affects its citizens.\n", + "Whether the decision is a change to a policy, granting a benefit or preventing\n", + "crime, it can significantly influence the quality of our society. Data is a key factor\n", + "in making the right decisions and justifying the decisions made. Simply put,\n", + "we can’t expect high-quality decisions without the high quality of data and a\n", + "complete view of the data (within the permitted context). Without data sharing,\n", + "we will remain in a highly fragmented position where our ability to make those\n", + "decisions is severely limited or even completely compromised. In this blog, we\n", + "have covered several technological solutions available within the lakehouse that\n", + "can derisk and accelerate how the government is leveraging the data ecosystem\n", + "in a sustainable and scalable way.\n", + "\n", + "For more details on the industry use cases that Delta Sharing is addressing\n", + "please consult [A New Approach to Data Sharing](https://www.databricks.com/product/unity-catalog) eBook.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 03\n", + "\n", + "\n", + "### Ready-to-Use Notebooks and Data Sets\n", + "\n", + "\n", + "-----\n", + "\n", + "**Digital Twins**\n", + "\n", + "Leverage digital twins — virtual\n", + "representations of devices and\n", + "objects — to optimize operations and\n", + "gain insights\n", + "\n", + "\n", + "This section includes several Solution Accelerators — free, ready-to-use\n", + "\n", + "examples of data solutions from different industries ranging from retail to\n", + "\n", + "manufacturing and healthcare. Each of the following scenarios includes\n", + "\n", + "notebooks with code and step-by-step instructions to help you get\n", + "\n", + "started. 
Get hands-on experience with the Databricks Lakehouse Platform\n", + "\n", + "\n", + "by trying the following for yourself: **[Explore the Solution](https://databricks.com/solutions/accelerators/digital-twins)**\n", + "\n", + "\n", + "**Overall Equipment**\n", + "**Effectiveness**\n", + "\n", + "Ingest equipment sensor data for\n", + "metric generation and data driven\n", + "decision-making\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)**\n", + "\n", + "**Real-time point of**\n", + "**sale analytics**\n", + "\n", + "Calculate current inventories for\n", + "various products across multiple store\n", + "locations with Delta Live Tables\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/real-time-point-of-sale-analytics)**\n", + "\n", + "\n", + "**Recommendation Engines**\n", + "**for Personalization**\n", + "\n", + "Improve customers’ user experience\n", + "and conversion with personalized\n", + "recommendations\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n", + "\n", + "**Understanding Price**\n", + "**Transparency Data**\n", + "\n", + "Efficiently ingest large healthcare data\n", + "sets to create price transparency for\n", + "better understanding of healthcare costs\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/price-transparency-data)**\n", + "\n", + "Additional Solution Accelerators with ready-to-use notebooks can be found here:\n", + "\n", + "**[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 04\n", + "\n", + "\n", + "### Case Studies\n", + "\n", + "**4.1** Akamai\n", + "\n", + "**4.2** Grammarly\n", + "\n", + "**4.3** Honeywell\n", + "\n", + "**4.4** Wood Mackenzie\n", + "\n", + "**4.5** Rivian\n", + "\n", + "**4.6** AT&T\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.1\n", + "**Akamai delivers real-time security**\n", + "**analytics using Delta Lake**\n", + "\n", + "\n", + "###### <1\n", + "\n", + "**Min ingestion time,**\n", + "**reduced from 15 min**\n", + "\n", + "\n", + "###### <85%\n", + "\n", + "**Of queries have a response**\n", + "**time of 7 seconds or less**\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n", + "\n", + "**S O L U T I O N**\n", + "[Threat Detection](https://databricks.com/solutions/accelerators/threat-detection)\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Delta Lake, Data Streaming, Photon,\n", + "[Databricks SQL](https://databricks.com/product/databricks-sql)\n", + "\n", + "**C LO U D**\n", + "[Azure](https://www.databricks.com/product/azure)\n", + "\n", + "\n", + "Akamai runs a pervasive, highly distributed content delivery network (CDN). Its CDN\n", + "\n", + "uses approximately 345,000 servers in more than 135 countries and over 1,300 networks\n", + "\n", + "worldwide to route internet traffic for some of the largest enterprises in media, commerce,\n", + "\n", + "finance, retail and many other industries. About 30% of the internet’s traffic flows through\n", + "\n", + "Akamai servers. 
Akamai also provides cloud security solutions.\n", + "\n", + "In 2018, the company launched a web security analytics tool that offers Akamai customers\n", + "\n", + "a single, unified interface for assessing a wide range of streaming security events and\n", + "\n", + "performing analysis of those events. The web analytics tool helps Akamai customers to\n", + "\n", + "take informed actions in relation to security events in real time. Akamai is able to stream\n", + "\n", + "massive amounts of data and meet the strict SLAs it provides to customers by leveraging\n", + "\n", + "Delta Lake and the Databricks Lakehouse Platform for the web analytics tool.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Ingesting and streaming enormous amounts of data**\n", + "\n", + "Akamai’s web security analytics tool ingests approximately 10GB of data related\n", + "to security events per second. Data volume can increase significantly when\n", + "retail customers conduct a large number of sales — or on big shopping days like\n", + "Black Friday or Cyber Monday. The web security analytics tool stores several\n", + "petabytes of data for analysis purposes. Those analyses are performed to\n", + "protect Akamai’s customers and provide them with the ability to explore and\n", + "query security events on their own.\n", + "\n", + "The web security analytics tool initially relied on an on-premises architecture\n", + "running Apache Spark™ on Hadoop. Akamai offers strict service level agreements\n", + "(SLAs) to its customers of 5 to 7 minutes from when an attack occurs until it is\n", + "displayed in the tool. The company sought to improve ingestion and query speed\n", + "to meet those SLAs. “Data needs to be as real-time as possible so customers\n", + "can see what is attacking them,” says Tomer Patel, Engineering Manager at\n", + "Akamai. “Providing queryable data to customers quickly is critical. We wanted to\n", + "move away from on-prem to improve performance and our SLAs so the latency\n", + "would be seconds rather than minutes.”\n", + "\n", + "**Delta Lake allows us to not only query the data better but to**\n", + "**also acquire an increase in the data volume. We’ve seen an**\n", + "**80% increase in traffic and data in the last year, so being able**\n", + "**to scale fast is critical.**\n", + "\n", + "\n", + "After conducting proofs of concept with several companies, Akamai chose to\n", + "base its streaming analytics architecture on Spark and the Databricks Lakehouse\n", + "Platform. “Because of our scale and the demands of our SLA, we determined that\n", + "Databricks was the right solution for us,” says Patel. “When we consider storage\n", + "optimization, and data caching, if we went with another solution, we couldn’t\n", + "achieve the same level of performance.”\n", + "\n", + "**Improving speed and reducing costs**\n", + "\n", + "Today, the web security analytics tool ingests and transforms data, stores it\n", + "in cloud storage, and sends the location of the file via Kafka. It then uses a\n", + "Databricks Job as the ingest application. Delta Lake, the open source storage\n", + "format at the base of the Databricks Lakehouse Platform, supports real-time\n", + "querying on the web security analytics data. Delta Lake also enables Akamai to\n", + "scale quickly. “Delta Lake allows us to not only query the data better but to also\n", + "acquire an increase in the data volume,” says Patel. 
“We’ve seen an 80% increase\n", + "in traffic and data in the last year, so being able to scale fast is critical.”\n", + "\n", + "Akamai also uses Databricks SQL (DBSQL) and Photon, which provide extremely\n", + "\n", + "fast query performance. Patel added that Photon provided a significant boost\n", + "to query performance. Overall, Databricks’ streaming architecture combined\n", + "with DBSQL and Photon enables Akamai to achieve real-time analytics, which\n", + "translates to real-time business benefits.\n", + "\n", + "\n", + "**Tomer Patel**\n", + "Engineering Manager, Akamai\n", + "\n", + "\n", + "-----\n", + "\n", + "Patel says he likes that Delta Lake is open source, as the company has benefitted\n", + "from a community of users working to improve the product. “The fact that Delta\n", + "Lake is open source and there’s a big community behind it means we don’t need\n", + "to implement everything ourselves,” says Patel. “We benefit from fixed bugs that\n", + "others have encountered and from optimizations that are contributed to the\n", + "project.” Akamai worked closely with Databricks to ensure Delta Lake can meet\n", + "the scale and performance requirements Akamai defined. These improvements\n", + "have been contributed back to the project (many of which were made available as\n", + "part of Delta Lake 2.0), and so any user running Delta Lake now benefits from the\n", + "technology being tested at such a large scale in a real-world production scenario.\n", + "\n", + "\n", + "**Meeting aggressive requirements for scale,**\n", + "**reliability and performance**\n", + "\n", + "Using Spark Structured Streaming on the Databricks Lakehouse Platform enables\n", + "the web security analytics tool to stream vast volumes of data and provide\n", + "low-latency, real-time analytics-as-a-service to Akamai’s customers. That way\n", + "Akamai is able to make available security event data to customers within the\n", + "SLA of 5 to 7 minutes from when an attack occurs. “Our focus is performance,\n", + "performance, performance,” says Patel. “The platform’s performance and\n", + "scalability are what drives us.”\n", + "\n", + "Using the Databricks Lakehouse Platform, it now takes under 1 minute to ingest\n", + "the security event data. “Reducing ingestion time from 15 minutes to under 1\n", + "minute is a huge improvement,” says Patel. “It benefits our customers because\n", + "they can see the security event data faster and they have a view of what exactly\n", + "is happening as well as the capability to filter all of it.”\n", + "\n", + "Akamai’s biggest priority is to provide customers with a good experience and\n", + "fast response times. To date, Akamai has moved about 70% of security event\n", + "data from its on-prem architecture to Databricks, and the SLA for customer\n", + "query and response time has improved significantly as a result. 
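To make the ingestion pattern described earlier more concrete, here is a minimal sketch, under stated assumptions: a streaming job reads cloud-storage file locations from a Kafka topic and appends the referenced security events to a Delta table that can then be queried from Databricks SQL. The broker address, topic name, file format and table name are illustrative placeholders, not Akamai's actual configuration.

```python
# Hypothetical sketch of the ingest pattern described above: file locations
# arrive on a Kafka topic, and a Databricks job loads those files into Delta.
# Broker, topic, paths and schema are placeholders, not Akamai's real setup.
from pyspark.sql import SparkSession
from pyspark.sql.functions import col

spark = SparkSession.builder.getOrCreate()

# Stream of Kafka messages whose value is the path of a newly landed file
file_pointers = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "broker:9092")      # placeholder broker
    .option("subscribe", "security-event-files")           # placeholder topic
    .load()
    .select(col("value").cast("string").alias("path"))
)

def ingest_batch(batch_df, batch_id):
    # For each micro-batch of file pointers, read the files and append to Delta
    paths = [row.path for row in batch_df.collect()]
    if paths:
        events = spark.read.json(paths)                     # assumes JSON events
        events.write.format("delta").mode("append").saveAsTable("security_events")

(
    file_pointers.writeStream
    .foreachBatch(ingest_batch)
    .option("checkpointLocation", "/tmp/checkpoints/security_events")
    .start()
)
```

Because the target table is stored in Delta Lake, the same data becomes queryable from Databricks SQL as soon as each micro-batch commits.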
“Now, with the\n", + "move to Databricks, our customers experience much better response time, with\n", + "over 85% of queries completing under 7 seconds.” Providing that kind of realtime data means Akamai can help its customers stay vigilant and maintain an\n", + "optimal security configuration.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.2\n", + "**Grammarly uses Databricks Lakehouse to improve**\n", + "**user experience**\n", + "\n", + "\n", + "###### 110%\n", + "\n", + "**Faster querying, at 10% of the cost**\n", + "**to ingest, than a data warehouse**\n", + "\n", + "\n", + "###### 5 billion\n", + "\n", + "**Daily events available for**\n", + "**analytics in under 15 minutes**\n", + "\n", + "\n", + "Grammarly’s mission is to improve lives by improving communication. The company’s\n", + "\n", + "trusted AI-powered communication assistance provides real-time suggestions to\n", + "\n", + "help individuals and teams write more confidently and achieve better results. Its\n", + "\n", + "comprehensive offerings — [Grammarly Premium](https://www.grammarly.com/premium) , [Grammarly Business](https://www.grammarly.com/business) , [Grammarly for](https://www.grammarly.com/edu)\n", + "\n", + "[Education](https://www.grammarly.com/edu) and [Grammarly for Developers](https://developer.grammarly.com/) — deliver leading communication support\n", + "\n", + "wherever writing happens. As the company grew over the years, its legacy, homegrown\n", + "\n", + "analytics system made it challenging to evaluate large data sets quickly and cost-\n", + "\n", + "effectively.\n", + "\n", + "By migrating to the Databricks Lakehouse Platform, Grammarly is now able to sustain a\n", + "\n", + "flexible, scalable and highly secure analytics platform that helps 30 million people and\n", + "\n", + "50,000 teams worldwide write more effectively every day.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n", + "\n", + "**S O L U T I O N**\n", + "Recommendation Engines, Advertising\n", + "Effectiveness, Customer Lifetime Value\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Lakehouse, Delta Lake, Unity Catalog,\n", + "[Machine Learning, ETL](https://www.databricks.com/product/machine-learning)\n", + "\n", + "**C LO U D**\n", + "[AWS](https://www.databricks.com/product/aws)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Harnessing data to improve communications for millions of**\n", + "**users and thousands of teams**\n", + "\n", + "When people use Grammarly’s AI communication assistance, they receive\n", + "suggestions to help them improve multiple dimensions of communication,\n", + "including spelling and grammar correctness, clarity and conciseness, word\n", + "choice, style, and tone. Grammarly receives feedback when users accept, reject\n", + "or ignore its suggestions through app-created events, which total about 5 billion\n", + "events per day.\n", + "\n", + "Historically, Grammarly relied on a homegrown legacy analytics platform and\n", + "leveraged an in-house SQL-like language that was time-intensive to learn and\n", + "made it challenging to onboard new hires. 
As the company grew, Grammarly\n", + "data analysts found that the platform did not sufficiently meet the needs of its\n", + "essential business functions, especially marketing, sales and customer success.\n", + "Analysts found themselves copying and pasting data from spreadsheets\n", + "because the existing system couldn’t effectively ingest the external data needed\n", + "to answer questions such as, “Which marketing channel delivers the highest\n", + "ROI?” Reporting proved challenging because the existing system didn’t support\n", + "Tableau dashboards, and company leaders and analysts needed to ensure they\n", + "could make decisions quickly and confidently.\n", + "\n", + "\n", + "**Databricks Lakehouse has given us the flexibility to unleash**\n", + "**our data without compromise. That flexibility has allowed us**\n", + "**to speed up analytics to a pace we’ve never achieved before.**\n", + "\n", + "**Chris Locklin**\n", + "Engineering Manager, Data Platforms, Grammarly\n", + "\n", + "Grammarly also sought to unify its data warehouses in order to scale and\n", + "improve data storage and query capabilities. As it stood, large Amazon EMR\n", + "clusters ran 24/7 and drove up costs. With the various data sources, the team\n", + "also needed to maintain access control. “Access control in a distributed file\n", + "system is difficult, and it only gets more complicated as you ingest more data\n", + "sources,” says Chris Locklin, Engineering Manager, Data Platforms at Grammarly.\n", + "Meanwhile, reliance on a single streaming workflow made collaboration among\n", + "teams challenging. Data silos emerged as different business areas implemented\n", + "analytics tools individually. “Every team decided to solve their analytics needs in\n", + "the best way they saw fit,” says Locklin. “That created challenges in consistency\n", + "and knowing which data set was correct.”\n", + "\n", + "\n", + "-----\n", + "\n", + "As its data strategy was evolving, Grammarly’s priority was to get the most out\n", + "of analytical data while keeping it secure. This was crucial because security is\n", + "Grammarly’s number-one priority and most important feature, both in how it\n", + "protects its users’ data and how it ensures its own company data remains secure.\n", + "To accomplish that, Grammarly’s data platform team sought to consolidate\n", + "data and unify the company on a single platform. That meant sustaining a highly\n", + "secure infrastructure that could scale alongside the company’s growth, improving\n", + "ingestion flexibility, reducing costs and fueling collaboration.\n", + "\n", + "**Improving analytics, visualization and decision-making**\n", + "**with the lakehouse**\n", + "\n", + "After conducting several proofs of concept to enhance its infrastructure,\n", + "Grammarly migrated to the Databricks Lakehouse Platform. Bringing all the\n", + "analytical data into the lakehouse created a central hub for all data producers\n", + "and consumers across Grammarly, with Delta Lake at the core.\n", + "\n", + "Using the lakehouse architecture, data analysts within Grammarly now have a\n", + "consolidated interface for analytics, which leads to a single source of truth and\n", + "\n", + "confidence in the accuracy and availability of all data managed by the data\n", + "platform team. Across the organization, teams are using Databricks SQL to\n", + "conduct queries within the platform on both internally generated product data\n", + "and external data from digital advertising platform partners. 
Now, they can easily\n", + "connect to Tableau and create dashboards and visualizations to present to\n", + "executives and key stakeholders.\n", + "\n", + "\n", + "“Security is of utmost importance at Grammarly, and our team’s numberone objective is to own and protect our analytical data,” says Locklin. “Other\n", + "companies ask for your data, hold it for you, and then let you perform analytics\n", + "on it. Just as Grammarly ensures our users’ data always remains theirs, we\n", + "wanted to ensure our company data remained ours. Grammarly’s data stays\n", + "inside of Grammarly.”\n", + "\n", + "With its data consolidated in the lakehouse, different areas of Grammarly’s\n", + "business can now analyze data more thoroughly and effectively. For example,\n", + "Grammarly’s marketing team uses advertising to attract new business. Using\n", + "Databricks, the team can consolidate data from various sources to extrapolate\n", + "a user’s lifetime value, compare it with customer acquisition costs and get rapid\n", + "feedback on campaigns. Elsewhere, data captured from user interactions flow\n", + "into a set of tables used by analysts for ad hoc analysis to inform and improve\n", + "the user experience.\n", + "\n", + "By consolidating data onto one unified platform, Grammarly has eliminated data\n", + "silos. “The ability to bring all these capabilities, data processing and analysis\n", + "under the same platform using Databricks is extremely valuable,” says Sergey\n", + "Blanket, Head of Business Intelligence at Grammarly. “Doing everything from ETL\n", + "and engineering to analytics and ML under the same umbrella removes barriers\n", + "and makes it easy for everyone to work with the data and each other.”\n", + "\n", + "\n", + "-----\n", + "\n", + "To manage access control, enable end-to-end observability and monitor data\n", + "quality, Grammarly relies on the data lineage capabilities within Unity Catalog.\n", + "“Data lineage allows us to effectively monitor usage of our data and ensure it\n", + "upholds the standards we set as a data platform team,” says Locklin. “Lineage is\n", + "the last crucial piece for access control. It allows analysts to leverage data to do\n", + "their jobs while adhering to all usage standards and access controls, even when\n", + "recreating tables and data sets in another environment.”\n", + "\n", + "**Faster time to insight drives more intelligent**\n", + "**business decisions**\n", + "\n", + "Using the Databricks Lakehouse Platform, Grammarly’s engineering teams now\n", + "have a tailored, centralized platform and a consistent data source across the\n", + "company, resulting in greater speed and efficiency and reduced costs. The\n", + "lakehouse architecture has led to 110% faster querying, at 10% of the cost to\n", + "ingest, than a data warehouse. Grammarly can now make its 5 billion daily events\n", + "available for analytics in under 15 minutes rather than 4 hours, enabling lowlatency data aggregation and query optimization. This allows the team to quickly\n", + "\n", + "receive feedback about new features being rolled out and understand if they are\n", + "being adopted as expected. Ultimately, it helps them understand how groups\n", + "of users engage with the UX, improving the experience and ensuring features\n", + "and product releases bring the most value to users. 
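As a rough illustration of the access-control and lineage capabilities mentioned above, the snippet below grants read access to a group and looks up recorded lineage, assuming the Unity Catalog lineage system table is enabled in the workspace. The catalog, schema, table and group names are invented for the example and do not reflect Grammarly's environment.

```python
# Illustrative only: fine-grained access control and a lineage lookup with
# Unity Catalog. Catalog/schema/table and group names are placeholders.
# `spark` is the session provided in a Databricks notebook.
spark.sql("GRANT SELECT ON TABLE analytics.product.events TO `marketing-analysts`")

# Lineage recorded by Unity Catalog can be queried from system tables
lineage = spark.sql("""
    SELECT source_table_full_name, target_table_full_name, entity_type, event_time
    FROM system.access.table_lineage
    WHERE target_table_full_name = 'analytics.product.events'
    ORDER BY event_time DESC
""")
lineage.show(truncate=False)
```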
“Everything my team does\n", + "is focused on creating a rich, personalized experience that empowers people to\n", + "communicate more effectively and achieve their potential,” says Locklin.\n", + "\n", + "\n", + "Moving to the lakehouse architecture also solved the challenge of access control\n", + "over distributed file systems, while Unity Catalog enabled fine-grained, rolebased access controls and real-time data lineage. “Unity Catalog gives us the\n", + "ability to manage file permissions with more flexibility than a database would\n", + "allow,” says Locklin. “It solved a problem my team couldn’t solve at scale. While\n", + "using Databricks allows us to keep analytical data in-house, Unity Catalog helps\n", + "us continue to uphold the highest standards of data protection by controlling\n", + "access paradigms inside our data. That opens a whole new world of things that\n", + "we can do.”\n", + "\n", + "Ultimately, migrating to the Databricks Lakehouse Platform has helped\n", + "Grammarly to foster a data-driven culture where employees get fast access\n", + "to analytics without having to write complex queries, all while maintaining\n", + "Grammarly’s enterprise-grade security practices. “Our team’s mission is to help\n", + "Grammarly make better, faster business decisions,” adds Blanket. “My team\n", + "would not be able to effectively execute on that mission if we did not have a\n", + "platform like Databricks available to us.” Perhaps most critically, migrating off its\n", + "rigid legacy infrastructure gives Grammarly the adaptability to do more while\n", + "knowing the platform will evolve as its needs evolve. “Databricks has given us the\n", + "flexibility to unleash our data without compromise,” says Locklin. “That flexibility\n", + "has allowed us to speed up analytics to a pace we’ve never achieved before.”\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.3\n", + "**Honeywell selects Delta Live Tables for streaming data**\n", + "\n", + "Companies are under growing pressure to reduce energy use, while at the same time\n", + "\n", + "they are looking to lower costs and improve efficiency. Honeywell delivers industry-\n", + "\n", + "specific solutions that include aerospace products and services, control technologies\n", + "\n", + "for buildings and industry, and performance materials globally. Honeywell’s Energy\n", + "\n", + "and Environmental Solutions division uses IoT sensors and other technologies to help\n", + "\n", + "businesses worldwide manage energy demand, reduce energy consumption and carbon\n", + "\n", + "emissions, optimize indoor air quality, and improve occupant well-being.\n", + "\n", + "Accomplishing this requires Honeywell to collect vast amounts of data. 
Using Delta Live\n", + "\n", + "Tables on the Databricks Lakehouse Platform, Honeywell’s data team can now ingest\n", + "\n", + "billions of rows of sensor data into Delta Lake and automatically build SQL endpoints for\n", + "\n", + "real-time queries and multilayer insights into data at scale — helping Honeywell improve\n", + "\n", + "how it manages data and extract more value from it, both for itself and for its customers.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Lakehouse, Delta Lake, Delta Live Tables\n", + "\n", + "\n", + "**C LO U D**\n", + "[Azure](https://databricks.com/product/azure) **Databricks helps us pull together many different data sources, do**\n", + "**aggregations, and bring the significant amount of data we collect**\n", + "**from our buildings under control so we can provide customers value.**\n", + "\n", + "**Dr. Chris Inkpen**\n", + "Global Solutions Architect, Honeywell Energy and Environmental Solutions\n", + "\n", + "\n", + "-----\n", + "\n", + "**Processing billions of IoT data points per day**\n", + "\n", + "Honeywell’s solutions and services are used in millions of buildings around the\n", + "world. Helping its customers create buildings that are safe, more sustainable\n", + "and productive can require thousands of sensors per building. Those sensors\n", + "monitor key factors such as temperature, pressure, humidity and air quality.\n", + "In addition to the data collected by sensors inside a building, data is also\n", + "collected from outside, such as weather and pollution data. Another data set\n", + "consists of information about the buildings themselves — such as building\n", + "type, ownership, floor plan, square footage of each floor and square footage\n", + "of each room. That data set is combined with the two disparate data streams,\n", + "adding up to a lot of data across multiple structured and unstructured formats,\n", + "including images and video streams, telemetry data, event data, etc. At peaks,\n", + "Honeywell ingests anywhere between 200 to 1,000 events per second for any\n", + "building, which equates to billions of data points per day. Honeywell’s existing\n", + "data infrastructure was challenged to meet such demand. It also made it difficult\n", + "for Honeywell’s data team to query and visualize its disparate data so it could\n", + "provide customers with fast, high-quality information and analysis.\n", + "\n", + "**ETL simplified: high-quality, reusable data pipelines**\n", + "\n", + "With Delta Live Tables (DLT) on the Databricks Lakehouse Platform, Honeywell’s\n", + "data team can now ingest billions of rows of sensor data into Delta Lake and\n", + "automatically build SQL endpoints for real-time queries and multilayer insights\n", + "into data at scale. “We didn’t have to do anything to get DLT to scale,” says Dr.\n", + "\n", + "\n", + "Chris Inkpen, Global Solutions Architect at Honeywell Energy and Environmental\n", + "Solutions. “We give the system more data, and it copes. Out of the box, it’s given\n", + "us the confidence that it will handle whatever we throw at it.”\n", + "\n", + "Honeywell credits the Databricks Lakehouse Platform for helping it to unify its\n", + "vast and varied data — batch, streaming, structured and unstructured — into\n", + "one platform. “We have many different data types. 
The Databricks Lakehouse\n", + "Platform allows us to use things like Apache Kafka and Auto Loader to load and\n", + "process multiple types of data and treat everything as a stream of data, which is\n", + "awesome. Once we’ve got structured data from unstructured data, we can write\n", + "standardized pipelines.”\n", + "\n", + "Honeywell data engineers can now build and leverage their own ETL pipelines\n", + "with Delta Live Tables and gain insights and analytics quickly. ETL pipelines can\n", + "be reused regardless of environment, and data can run in batches or streams. It’s\n", + "also helped Honeywell’s data team transition from a small team to a larger team.\n", + "“When we wrote our first few pipelines before DLT existed, only one person could\n", + "work in one part of the functionality. Now that we’ve got DLT and the ability to\n", + "have folders with common functionality, we’ve got a really good platform where\n", + "we can easily spin off different pipelines.”\n", + "\n", + "DLT also helped Honeywell establish standard log files to monitor and costjustify its product pipelines. “Utilizing DLT, we can analyze which parts of our\n", + "pipeline need optimization,” says Inkpen. “With standard pipelines, that was\n", + "much more chaotic.”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Enabling ease, simplicity and scalability across the**\n", + "**infrastructure**\n", + "\n", + "Delta Live Tables has helped Honeywell’s data team consistently query\n", + "complex data while offering simplicity of scale. It also enables end-to-end data\n", + "visualization of Honeywell’s data streams as they flow into its infrastructure, are\n", + "transformed, and then flow out. “Ninety percent of our ETL is now captured in\n", + "diagrams, so that’s helped considerably and improves data governance. DLT\n", + "encourages — and almost enforces — good design,” says Inkpen.\n", + "\n", + "Using the lakehouse as a shared workspace has helped promote teamwork and\n", + "collaboration at Honeywell. “The team collaborates beautifully now, working\n", + "together every day to divvy up the pipeline into their own stories and workloads,”\n", + "says Inkpen.\n", + "\n", + "Meanwhile, the ability to manage streaming data with low latency and better\n", + "throughput has improved accuracy and reduced costs. “Once we’ve designed\n", + "something using DLT, we’re pretty safe from scalability issues — certainly a\n", + "hundred times better than if we hadn’t written it in DLT,” says Inkpen. “We can\n", + "then go back and look at how we can take a traditional job and make it more\n", + "performant and less costly. We’re in a much better position to try and do that\n", + "from DLT.”\n", + "\n", + "\n", + "Using Databricks and DLT also helps the Honeywell team perform with greater\n", + "agility, which allows them to innovate faster while empowering developers to\n", + "respond to user requirements almost immediately. “Our previous architecture\n", + "made it impossible to know what bottlenecks we had and what we needed to\n", + "scale. Now we can do data science in near real-time.”\n", + "\n", + "Ultimately, Honeywell can now more quickly provide its customers with the\n", + "data and analysis they need to make their buildings more efficient, healthier\n", + "and safer for occupants. “I’m continuously looking for ways to improve our\n", + "lifecycles, time to market, and data quality,” says Inkpen. 
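A minimal Delta Live Tables sketch of the pattern described in this section: Auto Loader ingests raw sensor files as a stream, and a downstream table applies a data-quality expectation before exposing cleaned readings for SQL analytics. The paths, table names and columns are illustrative assumptions rather than Honeywell's actual pipeline, and the code is meant to run as a DLT pipeline notebook.

```python
# Illustrative DLT pipeline: Auto Loader ingests raw sensor files as a stream,
# and a downstream table applies a data-quality expectation. Paths, names and
# columns are placeholders; `spark` is provided by the DLT runtime.
import dlt
from pyspark.sql.functions import col

@dlt.table(comment="Raw IoT sensor readings ingested incrementally with Auto Loader")
def raw_sensor_readings():
    return (
        spark.readStream.format("cloudFiles")           # Auto Loader source
        .option("cloudFiles.format", "json")
        .load("/Volumes/buildings/landing/sensors/")    # placeholder location
    )

@dlt.table(comment="Cleaned readings ready for SQL analytics")
@dlt.expect_or_drop("valid_reading", "temperature IS NOT NULL AND event_time IS NOT NULL")
def clean_sensor_readings():
    return (
        dlt.read_stream("raw_sensor_readings")
        .select(
            col("device_id"),
            col("building_id"),
            col("temperature").cast("double"),
            col("event_time").cast("timestamp"),
        )
    )
```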
“Databricks helps\n", + "us pull together many different data sources, do aggregations, and bring the\n", + "significant amount of data we collect from our buildings under control so we\n", + "can provide customers value.”\n", + "\n", + "**Ready to get started? Learn more about** **[Delta Live Tables here](https://www.databricks.com/product/delta-live-tables)** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.4\n", + "**Wood Mackenzie helps customers transition to a more**\n", + "**sustainable future**\n", + "\n", + "\n", + "###### 12 Billion\n", + "\n", + "**Data points processed**\n", + "**each week**\n", + "\n", + "\n", + "###### 80-90%\n", + "\n", + "**Reduction in**\n", + "**processing time**\n", + "\n", + "\n", + "###### Cost Savings\n", + "\n", + "**In operations through**\n", + "**workflow automation**\n", + "\n", + "\n", + "Wood Mackenzie offers customized consulting and analysis for a wide range of clients\n", + "\n", + "in the energy and natural resources sectors. Founded in Edinburgh, the company first\n", + "\n", + "cultivated deep expertise in upstream oil and gas, then broadened its focus to deliver\n", + "\n", + "detailed insight for every interconnected sector of the energy, chemicals, metals and\n", + "\n", + "mining industries.\n", + "\n", + "Today it sees itself playing an important role in the transition to a more sustainable\n", + "\n", + "future. Using Databricks Workflows to automate ETL pipelines helps Wood Mackenzie\n", + "\n", + "ingest and process massive amounts of data. Using a common workflow provided\n", + "\n", + "higher visibility to engineering team members, encouraging better collaboration. With\n", + "\n", + "an automated, transparent workflow in place, the team saw improved productivity and\n", + "\n", + "data quality and an easier path to fix pipeline issues when they arise.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Energy and Utilities](https://www.databricks.com/solutions/industries/oil-and-gas)\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Lakehouse, Workflows\n", + "\n", + "**C LO U D**\n", + "[AWS](https://www.databricks.com/product/aws)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering insights to the energy industry**\n", + "\n", + "Fulfilling Wood Mackenzie’s mission, the Lens product is a data analytics platform\n", + "built to deliver insights at key decision points for customers in the energy sector.\n", + "Feeding into Lens are vast amounts of data collected from various data sources\n", + "and sensors used to monitor energy creation, oil and gas production, and more.\n", + "Those data sources update about 12 billion data points every week that must\n", + "be ingested, cleaned and processed as part of the input for the Lens platform.\n", + "Yanyan Wu, Vice President of Data at Wood Mackenzie, manages a team of big\n", + "data professionals that build and maintain the ETL pipeline that provides input\n", + "data for Lens. The team is leveraging the Databricks Lakehouse Platform and\n", + "uses Apache Spark™ for parallel processing, which provides greater performance\n", + "and scalability benefits compared to an earlier single-node system working\n", + "sequentially. 
“We saw a reduction of 80-90% in data processing time, which\n", + "results in us providing our clients with more up-to-date, more complete and\n", + "more accurate data,” says Wu.\n", + "\n", + "**Our mission is to transform the way we power the planet.**\n", + "**Our clients in the energy sector need data, consulting services**\n", + "**and research to achieve that transformation. Databricks**\n", + "**Workflows gives us the speed and flexibility to deliver the**\n", + "**insights our clients need.**\n", + "\n", + "\n", + "**Improved collaboration and transparency with a common**\n", + "**workflow**\n", + "\n", + "The data pipeline managed by the team includes several stages for standardizing\n", + "and cleaning raw data, which can be structured or unstructured and may be in\n", + "the form of PDFs or even handwritten notes.\n", + "\n", + "Different members of the data team are responsible for different parts of\n", + "the pipeline, and there is a dependency between the processing stages each\n", + "team member owns. Using [Databricks Workflows](https://www.databricks.com/product/workflows) , the team defined a common\n", + "workstream that the entire team uses. Each stage of the pipeline is implemented\n", + "in a Python notebook, which is run as a job in the main workflow.\n", + "\n", + "Each team member can now see exactly what code is running on each stage,\n", + "making it easy to find the cause of the issue. Knowing who owns the part of the\n", + "pipeline that originated the problem makes fixing issues much faster. “Without\n", + "a common workflow, different members of the team would run their notebooks\n", + "independently, not knowing that failure in their run affected stages downstream,”\n", + "says Meng Zhang, Principal Data Analyst at Wood Mackenzie. “When trying to\n", + "rerun notebooks, it was hard to tell which notebook version was initially run and\n", + "the latest version to use.”\n", + "\n", + "\n", + "**Yanyan Wu**\n", + "Vice President of Data, Wood Mackenzie\n", + "\n", + "\n", + "-----\n", + "\n", + "Using Workflows’ alerting capabilities to notify the team when a workflow task\n", + "fails ensures everyone knows a failure occurred and allows the team to work\n", + "together to resolve the issue quickly. The definition of a common workflow\n", + "created consistency and transparency that made collaboration easier. “Using\n", + "Databricks Workflows allowed us to encourage collaboration and break up the\n", + "walls between different stages of the process,” explains Wu. “It allowed us all to\n", + "speak the same language.”\n", + "\n", + "Creating transparency and consistency is not the only advantage the team saw.\n", + "Using Workflows to automate notebook runs also led to cost savings compared\n", + "to running interactive notebooks manually.\n", + "\n", + "**Improved code development productivity**\n", + "\n", + "The team’s ETL pipeline development process involves iteration on PySpark\n", + "notebooks. 
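A hedged sketch of the kind of multi-task workflow described above, using the Databricks SDK for Python: each pipeline stage is a notebook task, stages declare dependencies on each other, and a failure notification goes to the whole team. The job name, notebook paths, cluster ID and email address are placeholders, not Wood Mackenzie's actual configuration.

```python
# Illustrative multi-task Databricks Workflow: notebook tasks with an explicit
# dependency and a team-wide failure alert. All identifiers are placeholders.
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import jobs

w = WorkspaceClient()

job = w.jobs.create(
    name="lens-ingestion-pipeline",  # hypothetical job name
    tasks=[
        jobs.Task(
            task_key="standardize_raw_data",
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/lens/01_standardize"),
            existing_cluster_id="<cluster-id>",
        ),
        jobs.Task(
            task_key="clean_and_validate",
            depends_on=[jobs.TaskDependency(task_key="standardize_raw_data")],
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/lens/02_clean"),
            existing_cluster_id="<cluster-id>",
        ),
    ],
    email_notifications=jobs.JobEmailNotifications(
        on_failure=["data-team@example.com"]  # alert everyone when a task fails
    ),
)
print(f"Created job {job.job_id}")
```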
Leveraging [interactive notebooks](https://www.databricks.com/product/collaborative-notebooks) in the Databricks UI makes it easy\n", + "for data professionals on the team to manually develop and test a notebook.\n", + "Because Databricks Workflows supports running notebooks as task type\n", + "(along with Python files, JAR files and other types), when the code is ready for\n", + "production, it’s easy and cost effective to automate it by adding it to a workflow.\n", + "The workflow can then be easily revised by adding or removing any steps to\n", + "or from the defined flow. This way of working keeps the benefit of manually\n", + "developing notebooks with the interactive notebook UI while leveraging the\n", + "power of automation, which reduces potential issues that may happen when\n", + "running notebooks manually.\n", + "\n", + "The team has gone even further in increasing productivity by developing a\n", + "CI/CD process. “By connecting our source control code repository, we know\n", + "the workflow always runs the latest code version we committed to the repo,”\n", + "explains Zhang. “It’s also easy to switch to a development branch to develop a\n", + "new feature, fix a bug and run a development workflow. When the code passes\n", + "all tests, it is merged back to the main branch and the production workflow is\n", + "automatically updated with the latest code.”\n", + "\n", + "Going forward, Wood Mackenzie plans to optimize its use of Databricks\n", + "Workflows to automate machine learning processes such as model training,\n", + "model monitoring and handling model drift. The firm uses ML to improve its data\n", + "quality and extract insights to provide more value to its clients. “Our mission is to\n", + "transform how we power the planet,” Wu says. “Our clients in the energy sector\n", + "need data, consulting services and research to achieve that transformation.\n", + "Databricks Workflows gives us the speed and flexibility to deliver the insights our\n", + "clients need.”\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.5\n", + "**Rivian redefines driving experience with**\n", + "**the Databricks Lakehouse**\n", + "\n", + "###### 250 platform users\n", + "\n", + "**A 50x increase from a year ago**\n", + "\n", + "Rivian is preserving the natural world for future generations with revolutionary Electric\n", + "\n", + "Adventure Vehicles (EAVs). With over 25,000 EAVs on the road generating multiple\n", + "\n", + "terabytes of IoT data per day, the company is using data insights and machine\n", + "\n", + "learning to improve vehicle health and performance. However, with legacy cloud\n", + "\n", + "tooling, it struggled to scale pipelines cost-effectively and spent significant resources\n", + "\n", + "on maintenance — slowing its ability to be truly data driven.\n", + "\n", + "Since moving to the Databricks Lakehouse Platform, Rivian can now understand how\n", + "\n", + "a vehicle is performing and how this impacts the driver using it. 
Equipped with these\n", + "\n", + "insights, Rivian is innovating faster, reducing costs, and ultimately, delivering a better\n", + "\n", + "driving experience to customers.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Manufacturing](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)\n", + "\n", + "**S O L U T I O N**\n", + "Predictive Maintenance, Scaling ML Models\n", + "for IoT, Data-Driven ESG\n", + "\n", + "**P L AT F O R M**\n", + "[Lakehouse](https://www.databricks.com/product/data-lakehouse) , [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , [Unity Catalog](https://www.databricks.com/product/unity-catalog)\n", + "\n", + "**C LO U D**\n", + "[AWS](https://www.databricks.com/product/aws)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Struggling to democratize data on a legacy platform**\n", + "\n", + "Building a world that will continue to be enjoyed by future generations requires\n", + "a shift in the way we operate. At the forefront of this movement is Rivian —\n", + "an electric vehicle manufacturer focused on shifting our planet’s energy and\n", + "transportation systems entirely away from fossil fuel. Today, Rivian’s fleet\n", + "includes personal vehicles and involves a partnership with Amazon to deliver\n", + "100,000 commercial vans. Each vehicle uses IoT sensors and cameras to\n", + "capture petabytes of data ranging from how the vehicle drives to how various\n", + "parts function. With all this data at its fingertips, Rivian is using machine learning\n", + "to improve the overall customer experience with predictive maintenance so that\n", + "potential issues are addressed before they impact the driver.\n", + "\n", + "Before Rivian even shipped its first EAV, it was already up against data visibility\n", + "and tooling limitations that decreased output, prevented collaboration and\n", + "increased operational costs. It had 30 to 50 large and operationally complicated\n", + "compute clusters at any given time, which was costly. Not only was the system\n", + "difficult to manage, but the company experienced frequent cluster outages\n", + "as well, forcing teams to dedicate more time to troubleshooting than to data\n", + "analysis. Additionally, data silos created by disjointed systems slowed the\n", + "sharing of data, which further contributed to productivity issues. Required data\n", + "languages and specific expertise of toolsets created a barrier to entry that\n", + "limited developers from making full use of the data available. Jason Shiverick,\n", + "Principal Data Scientist at Rivian, said the biggest issue was the data access. “I\n", + "wanted to open our data to a broader audience of less technical users so they\n", + "could also leverage data more easily.”\n", + "\n", + "Rivian knew that once its EAVs hit the market, the amount of data ingested would\n", + "explode. In order to deliver the reliability and performance it promised, Rivian\n", + "needed an architecture that would not only democratize data access, but also\n", + "provide a common platform to build innovative solutions that can help ensure a\n", + "reliable and enjoyable driving experience.\n", + "\n", + "**Databricks Lakehouse empowers us to lower the barrier of**\n", + "**entry for data access across our organization so we can build**\n", + "**the most innovative and reliable electric vehicles in the world.**\n", + "\n", + "**Wassym Bensaid**\n", + "Vice President of Software Development, Rivian\n", + "\n", + "\n", + "-----\n", + "\n", + "**Predicting maintenance issues with Databricks Lakehouse**\n", + "\n", + "Rivian chose to modernize its data infrastructure on the Databricks Lakehouse\n", + "Platform, giving it the ability to unify all of its data into a common view for\n", + "downstream analytics and machine learning. Now, unique data teams have\n", + "a range of accessible tools to deliver actionable insights for different use\n", + "cases, from predictive maintenance to smarter product development. Venkat\n", + "Sivasubramanian, Senior Director of Big Data at Rivian, says, “We were able\n", + "to build a culture around an open data platform that provided a system for\n", + "really democratizing data and analysis in an efficient way.” Databricks’ flexible\n", + "support of all programming languages and seamless integration with a variety of\n", + "toolsets eliminated access roadblocks and unlocked new opportunities. Wassym\n", + "Bensaid, Vice President of Software Development at Rivian, explains, “Today we\n", + "have various teams, both technical and business, using Databricks Lakehouse\n", + "to explore our data, build performant data pipelines, and extract actionable\n", + "business and product insights via visual dashboards.”\n", + "\n", + "Rivian’s ADAS (advanced driver-assistance systems) Team can now easily\n", + "prepare telemetric accelerometer data to understand all EAV motions. This core\n", + "recording data includes information about pitch, roll, speed, suspension and\n", + "airbag activity, to help Rivian understand vehicle performance, driving patterns\n", + "and connected car system predictability. Based on these key performance\n", + "metrics, Rivian can improve the accuracy of smart features and the control\n", + "that drivers have over them. Designed to take the stress out of long drives and\n", + "driving in heavy traffic, features like adaptive cruise control, lane change assist,\n", + "automatic emergency driving, and forward collision warning can be honed over\n", + "time to continuously optimize the driving experience for customers.\n", + "\n", + "Secure data sharing and collaboration was also facilitated with the Databricks\n", + "Unity Catalog. Shiverick describes how unified governance for the lakehouse\n", + "benefits Rivian productivity. “Unity Catalog gives us a truly centralized data\n", + "catalog across all of our different teams,” he said. “Now we have proper access\n", + "management and controls.” Venkat adds, “With Unity Catalog, we are centralizing\n", + "data catalog and access management across various teams and workspaces,\n", + "which has simplified governance.” End-to-end version controlled governance\n", + "and auditability of sensitive data sources, like the ones used for autonomous\n", + "driving systems, produces a simple but secure solution for feature engineering.\n", + "This gives Rivian a competitive advantage in the race to capture the autonomous\n", + "driving grid.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Accelerating into an electrified and sustainable world**\n", + "\n", + "\n", + "By scaling its capacity to deliver valuable data insights with speed, efficiency\n", + "and cost-effectiveness, Rivian is primed to leverage more data to improve\n", + "operations and the performance of its vehicles to enhance the customer\n", + "experience. 
Venkat says, “The flexibility that lakehouse offers saves us a lot of\n", + "money from a cloud perspective, and that’s a huge win for us.” With Databricks\n", + "Lakehouse providing a unified and open source approach to data and analytics,\n", + "the Vehicle Reliability Team is able to better understand how people are using\n", + "their vehicles, and that helps to inform the design of future generations of\n", + "vehicles. By leveraging the Databricks Lakehouse Platform, they have seen a\n", + "30%–50% increase in runtime performance, which has led to faster insights and\n", + "model performance.\n", + "\n", + "Shiverick explains, “From a reliability standpoint, we can make sure that\n", + "components will withstand appropriate lifecycles. It can be as simple as\n", + "making sure door handles are beefy enough to endure constant usage, or as\n", + "complicated as predictive and preventative maintenance to eliminate the\n", + "chance of failure in the field. Generally speaking, we’re improving software quality\n", + "based on key vehicle metrics for a better customer experience.”\n", + "\n", + "\n", + "From a design optimization perspective, Rivian’s unobstructed data view is also\n", + "producing new diagnostic insights that can improve fleet health, safety, stability\n", + "and security. Venkat says, “We can perform remote diagnostics to triage a\n", + "problem quickly, or have a mobile service come in, or potentially send an OTA\n", + "to fix the problem with the software. All of this needs so much visibility into\n", + "the data, and that’s been possible with our partnership and integration on the\n", + "platform itself.” With developers actively building vehicle software to improve\n", + "issues along the way.\n", + "\n", + "Moving forward, Rivian is seeing rapid adoption of Databricks Lakehouse across\n", + "different teams — increasing the number of platform users from 5 to 250 in only\n", + "one year. This has unlocked new use cases including using machine learning to\n", + "optimize battery efficiency in colder temperatures, increasing the accuracy of\n", + "autonomous driving systems, and serving commercial depots with vehicle health\n", + "dashboards for early and ongoing maintenance. As more EAVs ship, and its fleet\n", + "of commercial vans expands, Rivian will continue to leverage the troves of data\n", + "generated by its EAVs to deliver new innovations and driving experiences that\n", + "revolutionize sustainable transportation.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.6\n", + "**Migrating to the cloud to better serve**\n", + "**millions of customers**\n", + "\n", + "\n", + "###### 300%\n", + "\n", + "**ROI from OpEx savings**\n", + "**and cost avoidance**\n", + "\n", + "\n", + "###### 3X\n", + "\n", + "**Faster delivery of ML/data**\n", + "**science use cases**\n", + "\n", + "\n", + "Consistency in innovation is what keeps customers with a telecommunications company\n", + "\n", + "and is why AT&T is ranked among the best. However, AT&T’s massive on-premises legacy\n", + "\n", + "Hadoop system proved complex and costly to manage, impeding operational agility\n", + "\n", + "and efficiency and engineering resources. The need to pivot to cloud to better support\n", + "\n", + "hundreds of millions of subscribers was apparent.\n", + "\n", + "Migrating from Hadoop to Databricks on the Azure cloud, AT&T experienced significant\n", + "\n", + "savings in operating costs. 
Additionally, the new cloud-based environment has unlocked\n", + "\n", + "access to petabytes of data for correlative analytics and an AI-as-a-Service offering for\n", + "\n", + "2,500+ users across 60+ business units. AT&T can now leverage all its data — without\n", + "\n", + "overburdening its engineering team or exploding operational costs — to deliver new\n", + "\n", + "features and innovations to its millions of end users.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Communication Service Providers](https://www.databricks.com/solutions/industries/telco-industry-solutions)\n", + "\n", + "**S O L U T I O N**\n", + "Customer Retention, Subscriber Churn\n", + "Prediction, Threat Detection\n", + "\n", + "**P L AT F O R M**\n", + "Lakehouse, Data Science, Machine Learning,\n", + "[Data Streaming](https://www.databricks.com/product/data-streaming)\n", + "\n", + "**C LO U D**\n", + "[Azure](https://www.databricks.com/product/azure)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Hadoop technology adds operational complexity and**\n", + "**unnecessary costs**\n", + "\n", + "AT&T is a technology giant with hundreds of millions of subscribers and ingests\n", + "10+ petabytes[ [a](https://www.databricks.com/blog/2022/04/11/data-att-modernization-lakehouse.html) ] of data across the entire data platform each day. To harness\n", + "this data, it has a team of 2,500+ data users across 60+ business units to ensure\n", + "the business is data powered — from building analytics to ensure decisions are\n", + "based on the best data-driven situation awareness to building ML models that\n", + "bring new innovations to its customers. To support these requirements, AT&T\n", + "needed to democratize and establish a data single version of truth (SVOT) while\n", + "simplifying infrastructure management to increase agility and lower overall costs.\n", + "\n", + "However, physical infrastructure was too resource intensive. The combination\n", + "of a highly complex hardware setup (12,500 data sources and 1,500+ servers)\n", + "coupled with an on-premises Hadoop architecture proved complex to\n", + "maintain and expensive to manage. Not only were the operational costs to\n", + "support workloads high, but there were also additional capital costs around\n", + "data centers, licensing and more. Up to 70% of the on-prem platform had to\n", + "be prioritized to ensure 50K data pipeline jobs succeeded and met SLAs and\n", + "data quality objectives. Engineers’ time was focused on managing updates,\n", + "fixing performance issues or simply provisioning resources rather than focusing\n", + "on higher-valued tasks. The resource constraints of physical infrastructure\n", + "also drove serialization of data science activities, slowing innovation. Another\n", + "hurdle faced in operationalizing petabytes of data was the challenge of building\n", + "streaming data pipelines for real-time analytics, an area that was key to\n", + "supporting innovative use cases required to better serve its customers.\n", + "\n", + "With these deeply rooted technology issues, AT&T was not in the best position\n", + "to achieve its goals of increasing its use of insights for improving its customer\n", + "experience and operating more efficiently. “To truly democratize data across\n", + "the business, we needed to pivot to a cloud-native technology environment,”\n", + "said Mark Holcomb, Distinguished Solution Architect at AT&T. “This has freed\n", + "up resources that had been focused on managing our infrastructure and move\n", + "them up the value chain, as well as freeing up capital for investing in growth-oriented initiatives.”\n", + "\n", + "**A seamless migration journey to Databricks**\n", + "\n", + "As part of its due diligence, AT&T ran a comprehensive cost analysis and\n", + "concluded that Databricks was both the fastest and achieved the best price/\n", + "performance for data pipelines and machine learning workloads. AT&T knew the\n", + "migration would be a massive undertaking. As such, the team did a lot of upfront\n", + "planning — they prioritized migrating their largest workloads first to immediately\n", + "reduce their infrastructure footprint. They also decided to migrate their data\n", + "before migrating users to ensure a smooth transition and experience for their\n", + "thousands of data practitioners.\n", + "\n", + "\n", + "**The migration from Hadoop to Databricks enables us to bring**\n", + "**more value to our customers and do it more cost-efficiently**\n", + "**and much faster than before.**\n", + "\n", + "**Mark Holcomb**\n", + "Distinguished Solution Architect, AT&T\n", + "\n", + "\n", + "-----\n", + "\n", + "They spent a year deduplicating and synchronizing data to the cloud before\n", + "migrating any users. This was a critical step in ensuring the successful migration\n", + "of such a large, complex multi-tenant environment of 2,500+ users from 60+\n", + "business units and their workloads. The user migration process occurred over\n", + "nine months and enabled AT&T to retire on-premises hardware in parallel with\n", + "migration to accelerate savings as early as possible. Plus, due to the horizontal,\n", + "scalable nature of Databricks, AT&T didn’t need to have everything in one\n", + "contiguous environment. Separating data and compute, and across multiple\n", + "accounts and workspaces, ensured analytics worked seamlessly without any API\n", + "call limits or bandwidth issues and consumption clearly attributed to the 60+\n", + "business units.\n", + "\n", + "All in all, AT&T migrated over 1,500 servers, more than 50,000 production CPUs,\n", + "12,500 data sources and 300 schemas. The entire process took about two and a\n", + "half years. And it was able to manage the entire migration with the equivalent of\n", + "15 full-time internal resources. “Databricks was a valuable collaborator throughout\n", + "the process,” said Holcomb. “The team worked closely with us to resolve product\n", + "features and security concerns to support our migration timeline.”\n", + "\n", + "**Databricks reduces TCO and opens new paths to**\n", + "**innovation**\n", + "\n", + "One of the immediate benefits of moving to Databricks was huge cost savings.\n", + "AT&T was able to rationalize about 30% of its data by identifying and not\n", + "migrating underutilized and duplicate data. And prioritizing the migration of\n", + "the largest workloads allowed half the on-prem equipment to be rationalized\n", + "during the course of the migration. “By prioritizing the migration of our most\n", + "compute-intensive workloads to Databricks, we were able to significantly drive\n", + "down costs while putting us in position to scale more efficiently moving forward,”\n", + "explained Holcomb. 
The result is an anticipated 300% five-year migration ROI\n", + "from OpEx savings and cost avoidance (e.g., not needing to refresh data center\n", + "hardware).\n", + "\n", + "With data readily available and the means to analyze data at any scale, teams\n", + "of citizen data scientists and analysts can now spend more time innovating,\n", + "instead of serializing analytics efforts or waiting on engineering to provide the\n", + "necessary resources — or having data scientists spend their valuable time\n", + "on less complex or less insightful analyses. Data scientists are now able to\n", + "collaborate more effectively and speed up machine learning workflows so that\n", + "teams can deliver value more quickly, with a 3x faster time to delivery for new\n", + "data science use cases.\n", + "\n", + "“Historically you would have had operations in one system and analytics in a\n", + "separate one,” said Holcomb. “Now we can do more use cases like operational\n", + "analytics in a platform that fosters cross-team collaboration, reduces cost and\n", + "improves the consistency of answers.” Since migrating to Databricks, AT&T now\n", + "has a single version of truth to create new data-driven opportunities, including\n", + "a self-serve AI-as-a-Service analytics platform that will enable new revenue\n", + "streams and help it continue delivering exceptional innovations to its millions\n", + "of customers.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000\n", + "\n", + "organizations worldwide — including Comcast, Condé Nast and\n", + "\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo\n", + "**databricks.com/contact**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
##### EBOOK\n", + "\n", + "# 8 Steps to Becoming an AI-Forward Retailer\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "\n", + "Introduction .............................................................................................................................................................................................. **3**\n", + "\n", + "The State of the Retail Industry:\n", + "\n", + "The Diverging Performance of Data Leaders vs. Data Laggards ...................................................................................... **4**\n", + "\n", + "Begin With a Shared Vision of Success ....................................................................................................................................... **6**\n", + "\n", + "Why Companies Struggle With Setting Clear Business Outcomes for AI ................................................................... **7**\n", + "\n", + "Before Diving In: Assess Your Readiness ..................................................................................................................................... **9**\n", + "\n", + "Getting Started: Putting Some Wins on the Board .................................................................................................................. **11**\n", + "\n", + "Going Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n", + "\n", + "Normalizing the Process: Engraining a Data-Driven Mindset\n", + "\n", + "Into the Fabric of the Business ...................................................................................................................................................... **14**\n", + "\n", + "From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise .......................................... **16**\n", + "\n", + "The 8 Steps to Building a Data-Forward Retailer ................................................................................................................... **17**\n", + "\n", + "Transform Retail Data Into Actionable Insights ....................................................................................................................... **21**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "\n", + "In a world where data is king, retailers have historically been trailblazers, pioneering data technology\n", + "adoption to supercharge their operations, enhance customer understanding and sharpen\n", + "personalization. The journey began with the simple cash register about 150 years ago, progressed to\n", + "standardized product reporting with the introduction of the UPC and EAN, and has evolved to include\n", + "cutting-edge technologies such as RFID and machine learning.\n", + "\n", + "Today, we stand on the brink of “Generation AI,” defined by sophisticated language models and\n", + "images. Retailers, with their history of embracing data technologies, find themselves in a strong\n", + "position to reap the benefits of this new era. Automation of customer service, supply chain modeling\n", + "with digital twins and delivering hyper-personalized experiences in real time are all in the cards,\n", + "promising to bolster revenue, improve margins and slash costs for early adopters.\n", + "\n", + "According to an internal analysis by Databricks, data pioneers are already outstripping their\n", + "competition. 
The “Databricks 30” — an index tracking the publicly traded data and AI leaders across six major industry sectors, including retail — shows these front-runners outperforming the rest of the market by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies are setting themselves up for significant gains and a robust competitive advantage.\n", + "\n", + "However, for retailers mired in the landscape of outdated data platforms, the transformation into an AI-driven organization can seem a Herculean task. Embracing this wave of innovative technologies may feel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly evolving retail landscape.\n", + "\n", + "To help you navigate the rapidly evolving world of retail and consumer goods, this eBook provides a road map for organizations embarking on digital transformation journeys — a shift that is as much about culture as it is about technology, if not more so. The core advice? Start with a crystal-clear vision for transformation, outlining a compelling case for why such change is vital for the company’s long-term survival. Then, initiate the process by introducing AI to make gradual enhancements in critical business procedures.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The State of the Retail Industry: The Diverging Performance of Data Leaders vs. Data Laggards\n", + "\n", + "\n", + "The pandemic’s fallout has led to a widening chasm between the retail industry’s leaders and laggards. McKinsey & Company encapsulated this trend succinctly: “Companies with tech-forward business models, who were already pulling ahead pre-crisis, left their competitors in the dust.”\n", + "\n", + "But what exactly is a “tech-forward business model”? It isn’t a simple narrative of digital natives dethroning traditional retailers. Heavyweights like Walmart, Target and Costco held their own against Amazon. Nor was it purely a matter of scale — smaller brands like Warby Parker or Everlane managed to carve out substantial consumer bases, competing against larger, established players.\n", + "\n", + "**The common denominator among all victors**\n", + "**was their ability to harness data, analytics and AI**\n", + "**to rapidly react to shifts in consumer behavior.**\n", + "\n", + "These businesses deftly used consumer demand insights to understand the effects of supply chain disruptions and labor shortages and reallocate resources to mitigate the most harmful impacts. They adeptly introduced new delivery methods, optimizing operations to alleviate the pressure these modes exerted on margins. They successfully established tighter partnerships with suppliers and logistic entities, collaborating toward shared triumphs.\n", + "\n", + "In all these instances, it was their timely access to information, foresight driven by this data, and the exploration of probable outcomes that set these organizations apart. Infusing data-driven decision-making into core processes within the organization, as well as those crossing partner boundaries, unlocked this approach’s full potential.\n", + "\n", + "To illustrate the significance of prioritizing data and AI, we developed the Databricks 30 Index. Drawing inspiration from Morgan Stanley’s “Data Era” stocks research, this index tracks marquee customers across our top five verticals and partners. The Databricks 30 is an equal-weight price index, composed of five marquee customers each across Retail/Consumer Products, Financial Services, Healthcare, Media/Entertainment and Manufacturing/Logistics, plus five strategic partners.\n", + "\n", + "\n", + "-----\n", + "\n", + "Our analysis reveals that companies in the Databricks 30 Index outpaced the S&P 500 by an impressive +21 percentage points (pp) over the past three years. In other words, if the stock market rose by 50% during this period, the Databricks 30 Index would have soared by 71% (outperforming by 21pp). Even more remarkable, excluding tech entirely from the Databricks 30, the Databricks 30 ex-Tech index outperforms the S&P 500 by an even larger margin over the same time frame: +23pp.\n", + "\n", + "[Figure: Databricks 30 (DB30) vs. DOW30, indexed price performance by date, 01-01-2019 through 01-01-2023]\n", + "\n", + "Similar to Morgan Stanley’s analysis, we find that non-tech U.S. companies that are investing in cloud, data and innovation do, in fact, win.\n", + "\n", + "So now that we see the impact, let’s dive into the steps retail organizations can take to put themselves on a trajectory of continued growth and success amid an ever-changing landscape.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Begin With a Shared Vision of Success\n", + "\n", + "\n", + "The most overlooked activity in becoming an AI-forward retailer is the most crucial. In the rush to secure a position on the AI frontier, many companies are leaping before they look, embarking on AI initiatives without a clear understanding of what they want to achieve. Simply adopting the newest, shiniest tech tools isn’t a silver bullet. Many companies set themselves up for failure by neglecting to clearly define the expected business outcomes at the onset of the initiative, a discipline that can effectively reduce project risk and costs and lead to the ultimate success of the program. In fact, in an attempt to accelerate results, this cavalier approach can instead spiral into expensive mistakes, wasted resources and a loss of stakeholder trust from unmet expectations. It’s like setting sail on an open ocean without a destination in mind; the journey might provide some interesting detours, but it lacks direction and purpose.\n", + "\n", + "However, when organizations take the time to articulate their expected business outcomes before deploying AI and data-driven programs, they position themselves to reduce project risk and costs. By aligning AI initiatives with specific business objectives and creating a shared vision with stakeholders, the focus becomes less about the technology itself and more about how it can be used to reach these defined goals.\n", + "\n", + "Technology decisions, too, are improved by having a known target. Without clear business outcomes in mind, companies tend to design, develop and implement technologies that _might_ be needed to solve the problem. 
Aligning\n", + "the technical road map and activities with business outcomes mitigates the\n", + "risk of misallocated resources and the potential fallout from the unfulfilled\n", + "promise of AI.\n", + "\n", + "Furthermore, a clear understanding of expected business outcomes allows\n", + "for efficient project management and cost control. Companies can set key\n", + "performance indicators (KPIs) tied directly to these outcomes. This not only\n", + "provides a means to measure progress, but also helps control costs by\n", + "ensuring that resources are targeted toward initiatives that deliver value.\n", + "\n", + "It’s not just about numbers either; having explicit objectives aids in cultivating\n", + "\n", + "stakeholder buy-in. Clear communication about the purpose and potential\n", + "benefits of an AI initiative can foster support from executives, employees,\n", + "investors and customers alike. This collective backing can further mitigate risk\n", + "and cut costs by ensuring that everyone is pulling in the same direction.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Why Companies Struggle With Setting Clear Business Outcomes for AI\n", + "\n", + "\n", + "Getting started with AI at your organization might be daunting, and that’s\n", + "because it is a big undertaking! Struggling to define clear outcomes for AI\n", + "projects is a common issue among many businesses for a variety of reasons.\n", + "Here are some key factors that contribute to this challenge:\n", + "\n", + "**They believe the data strategy is a technology problem.**\n", + "\n", + "Companies often hire a chief data officer, or make the data strategy\n", + "the responsibility of the technology organization.\n", + "\n", + "**They lack an understanding of their business processes**\n", + "An alarming number of businesses jump onto the AI bandwagon without\n", + "understanding how their business operates. Decisions are made at\n", + "the leadership level, but how they translate to operational decisions is\n", + "muddled. Data and AI are fundamentally business process technologies,\n", + "\n", + "and without fully understanding how the business works, any initiative\n", + "in data and AI is bound to have limited success.\n", + "\n", + "\n", + "**They lack a data culture**\n", + "\n", + "Somewhat related to the previous point, many companies have teams\n", + "that make decisions based on experience and intuition. These should\n", + "not be discounted, but the reason for intuition is often a result of a\n", + "poor definition of processes, which prevents the ability to measure\n", + "and improve processes.\n", + "\n", + "**They struggle to get high-quality data**\n", + "\n", + "AI projects require good-quality, relevant data. Many businesses\n", + "struggle with issues related to data access, quality, privacy and\n", + "security, which can complicate the process of defining clear outcomes.\n", + "\n", + "**They lack the organizational structures required**\n", + "\n", + "Implementing AI often requires significant changes in business\n", + "\n", + "processes, organizational structures and even corporate culture.\n", + "Many companies find it hard to manage these changes, leading to\n", + "difficulties in setting and achieving clear outcomes.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data and AI programs are a business process problem first, and a\n", + "technology problem last. 
Familiarity with technology is important, but\n", + "irrelevant if companies do not understand it.\n", + "\n", + "Addressing these challenges often requires companies to invest in\n", + "education about AI capabilities, to formulate clear strategies, to manage\n", + "change effectively, and to bring on board the necessary skills either\n", + "by hiring new talent or upskilling existing employees. It’s a journey that\n", + "requires commitment, but the potential benefits of successful AI initiatives\n", + "make it a worthwhile venture.\n", + "\n", + "\n", + "**They don’t have the right people in place**\n", + "\n", + "There’s often a gap between the skills available within a company and\n", + "the skills needed to define and achieve AI outcomes. Without team\n", + "members who understand AI, data analysis and project management,\n", + "businesses can struggle to set clear objectives for AI initiatives.\n", + "\n", + "**They struggle to quantify the value of AI projects**\n", + "\n", + "AI’s benefits can sometimes be intangible or long-term, making them\n", + "difficult to quantify. Companies may struggle to define outcomes in\n", + "measurable terms, complicating the process of setting objectives\n", + "and monitoring progress.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Before Diving In: Assess Your Readiness\n", + "\n", + "\n", + "There is a growing sense of urgency for organizations relatively new to data\n", + "and AI-driven enablement to “get in the game.” Profiles of top performers and\n", + "headline-making achievements create a clearer sense of what is possible\n", + "and what can be gained, leaving those entering into the space eager to achieve\n", + "similar results.\n", + "\n", + "But what’s missing in those articles are the sustained investments in\n", + "process, people and technology and the numerous challenges, missteps and\n", + "outright failures that had to occur before success was achieved. Data-driven\n", + "transformation is a journey, and before any successful journey is pursued,\n", + "it’s wise to reflect on the organization’s readiness so that you can anticipate\n", + "challenges and identify areas for remediation and improvement that will\n", + "deliver you to your intended destination.\n", + "\n", + "With this in mind, we encourage organizations new to this space to\n", + "assess their maturity in terms of the use and management of their existing\n", + "information assets:\n", + "\n", + "1. How easily discoverable and accessible are data in\n", + "your environment?\n", + "\n", + "\n", + "3. Is the quality of these data formally verified?\n", + "\n", + "4. Are key entities such as products and customers actively\n", + "managed, and can data related to these items be easily linked\n", + "across various data sources?\n", + "\n", + "5. How quickly are data made available for analysis following their\n", + "creation or modification? Is this latency aligned with how you\n", + "might use this data?\n", + "\n", + "6. Are processes established for determining appropriate uses of\n", + "data, governing access and providing oversight on consumption?\n", + "\n", + "7. 
Is there one individual responsible for effective data management\n", + "across the enterprise, and has this person established a\n", + "\n", + "process for receiving and responding to feedback and shifting\n", + "organizational priorities?\n", + "\n", + "This list of questions is by no means exhaustive, but it should help to identify\n", + "blockers that are likely to become impediments down the road.\n", + "\n", + "\n", + "2. How well understood are these information assets?\n", + "\n", + "\n", + "-----\n", + "\n", + "Similarly, we would encourage organizations to assess their maturity in terms of\n", + "analytics capabilities:\n", + "\n", + "1. Is business performance at all levels assessed in terms of\n", + "key metrics?\n", + "\n", + "2. How frequently are data-driven analyses used in making key\n", + "business decisions?\n", + "\n", + "3. To what degree are advanced analytics techniques\n", + "— i.e., data science — used in decision-making processes?\n", + "\n", + "4. Are predictive models regularly leveraged as part of operational\n", + "business processes?\n", + "\n", + "5. How is experimentation used to assess the performance of\n", + "various initiatives?\n", + "\n", + "\n", + "Lastly, and probably most importantly, we’d encourage the organization to\n", + "perform a frank assessment of its readiness to embrace change. Becoming a\n", + "data-driven enterprise is fundamentally about operating differently than before.\n", + "Decision-making authority becomes more diffuse and often more automated.\n", + "Project outcomes become less certain as the organization focuses on innovation\n", + "where learning is emphasized over predictable results. Process silos often\n", + "become more intertwined as new modes of engagement evolve.\n", + "\n", + "When done right, this transition creates a healthy tension between what’s\n", + "needed to be successful today and what’s needed to be successful tomorrow.\n", + "But this can also manifest itself as employee resistance and political infighting\n", + "as processes and organizational structures evolve. What’s often needed to\n", + "overcome this is strong leadership, a clear vision and mandate for change as\n", + "well as a reassessment of incentive structures and active organizational change\n", + "management as the organization transitions into this new way of working.\n", + "\n", + "\n", + "6. Are predictive models used to automate key business decisions?\n", + "\n", + "\n", + "7. Has the organization embraced a model of continuous deployment\n", + "for the regular update of model-driven processes?\n", + "\n", + "\n", + "**TRADITIONAL APPROACH**\n", + "\n", + "**Upfront reqs** **Technical implementation** **Production**\n", + "\n", + "\n", + "**ITERATIVE APPROACH**\n", + "\n", + "\n", + "Continuous feedback\n", + "\n", + "\n", + "**Business questions** **Testing** **Production** **Optimization**\n", + "\n", + "Continuous learning and optimization\n", + "\n", + "An iterative approach involves the use of data to continually optimize the performance of data products.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started: Putting Some Wins on the Board\n", + "\n", + "\n", + "With the organization ready to proceed, the next phase is about learning to\n", + "deliver new solutions within your organization. There will be new technologies\n", + "to deploy and new skills to develop, and there will be new patterns for\n", + "integration into business workflows and procedures for incremental updates\n", + "and improvements. 
But most importantly, there will need to be a new level of\n", + "partnership and trust between the business and the technology sides of the\n", + "organization that needs to be carefully nurtured.\n", + "\n", + "The best way we have found to do this is to start with projects that improve\n", + "on existing operational workflows, i.e., do what you do, but do it smarter.\n", + "The business is often familiar with existing pain points and can more clearly\n", + "envision how a new capability can be folded into its processes. They are also\n", + "familiar with how to assess the impact a new approach may have on their\n", + "business and can help design tests to validate whether the intended results\n", + "\n", + "\n", + "As capabilities demonstrating value over the status quo are developed, they\n", + "are folded into business processes. This is not a one-and-done effort but part\n", + "of an ongoing cycle of deployment to continue so long as the team has a line\n", + "of sight to meaningful gains. The team does not wait for the ideal solution but\n", + "instead focuses on incremental improvements that deliver measurable value\n", + "along the way.\n", + "\n", + "Oversight for this process is provided by another body, one tasked with the\n", + "success of the overall transformative efforts within the business. As success\n", + "is delivered, there will be growing demand for the time and talents of these\n", + "teams, and the organization will need to prioritize resources across an increasing\n", + "number of opportunities. This steering committee will need to be responsible for\n", + "allocating limited resources and advocating for additional ones as well to strike\n", + "the right balance of investments for the organization.\n", + "\n", + "\n", + "are or are not being delivered.\n", + "\n", + "\n", + "**DEMAND FORECASTING**\n", + "\n", + "Demand forecasting is a massive challenge for retail and consumer goods\n", + "\n", + "organizations. And one where even an incremental change can have a massive impact,\n", + "\n", + "so it’s often one of the first projects organizations identify to put a win on the board.\n", + "\n", + "According to [McKinsey](https://www.mckinsey.com/featured-insights/artificial-intelligence/notes-from-the-ai-frontier-applications-and-value-of-deep-learning) , a 10% to 20% improvement in supply chain forecasting\n", + "\n", + "accuracy is likely to produce a 5% reduction in inventory costs and a 2% to 3%\n", + "\n", + "increase in revenues. To hit the ground running, check out the [Databricks Solution](https://www.databricks.com/solutions/accelerators/demand-forecasting)\n", + "\n", + "[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n", + "\n", + "key use cases.\n", + "\n", + "\n", + "Work on these projects is a collaborative effort between the business and IT.\n", + "Together, the project team explores a potential solution with a notion of how it\n", + "may be integrated in mind from the outset. 
As the project unfolds, all members\n", + "are part of the iterative cycles and help to steer the solution in new directions\n", + "until an item of value is derived.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Going Big: Learning to Embrace Transformational Change\n", + "\n", + "\n", + "With some experience under your belt, it’s time to build on the organizational\n", + "muscle developed during initial efforts and flex for more transformative impact.\n", + "Again, the focus is on established functions within the business, but instead of\n", + "pointed, incremental improvements, the team begins to create a vision for the\n", + "part of the organization that would operate if it were to fully embrace data and\n", + "AI enablement.\n", + "\n", + "It’s at this phase that many of the concerns about organizational resistance\n", + "mentioned earlier are most likely to manifest themselves. Ideally, initial\n", + "implementation efforts have built champions within the business, but it’s still\n", + "important to be mindful of pushback that can emerge as the organization more\n", + "fully begins to change. Having and maintaining strong business sponsorship\n", + "in this phase is critical, and having that sponsor articulate and regularly\n", + "reinforce a clear vision for the change that’s now underway can help everyone\n", + "\n", + "understand the need to support these efforts.\n", + "\n", + "\n", + "So far in this exploration of the journey to data and AI transformation, we’ve\n", + "minimized the importance of technology in order to focus on the business and\n", + "organizational aspects that often get neglected in this conversation. But it’s\n", + "at this stage that the organization needs to have established its preference\n", + "for data and analytics platforms. Because of the breadth of needs that will\n", + "have to be addressed and the ongoing innovation taking place in the data\n", + "science community, we strongly suggest standardizing on a platform that is\n", + "open and flexible while also providing cost-effective use of both infrastructure\n", + "and people resources and strong data governance and protection. For many\n", + "organizations, the Databricks Lakehouse Platform has proven itself to be the\n", + "ideal platform to meet these needs.\n", + "\n", + "**WHY STANDARDIZE ON DATABRICKS?**\n", + "\n", + "The Databricks Lakehouse is the only enterprise data and AI\n", + "\n", + "platform that allows retailers to leverage all of their data, from any\n", + "\n", + "source, on any workload to always offer more engaging customer\n", + "\n", + "experiences driven by real-time data, at the lowest cost and with\n", + "\n", + "the greatest investment protection.\n", + "\n", + "\n", + "-----\n", + "\n", + "But simply standardizing on a platform is not enough. The organization\n", + "needs to work through the roles and responsibilities around the use of this\n", + "platform and processes for moving things from experimentation and formal\n", + "development to testing and operationalization.\n", + "\n", + "The importance of having an MLOps strategy really comes to life at this\n", + "phase. This doesn’t mean your strategy around MLOps can’t change, but this\n", + "phase is when you want to think about and define your answers to some key\n", + "questions such as the following:\n", + "\n", + "1. How do we evaluate new and existing (retrained) models as\n", + "part of their movement from development to production?\n", + "\n", + "2. 
How do we determine when a model should be retrained?\n", + "\n", + "3. What are the preferred mechanisms for production deployment?\n", + "\n", + "4. How do we fall back should we have a deployment problem?\n", + "\n", + "5. What are the service level expectations for the\n", + "deployment processes?\n", + "\n", + "\n", + "###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n", + "\n", + "**Numan Ali**\n", + "\n", + "Solutions Architect, Data and Analytics Center of Excellence at Pandora\n", + "\n", + "\n", + "-----\n", + "\n", + "## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n", + "\n", + "\n", + "Too often, leadership views innovation as a destination and not a process\n", + "(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\n", + "data-driven organization overnight and then it’s done. Yes, there will be an\n", + "upfront investment, but there will also be ongoing investment in order to\n", + "support sustained innovation.\n", + "\n", + "Ironically, one of the major obstacles to this change is viewing the goal as\n", + "simply delivering a project or projects. Think about it — just 12 months ago,\n", + "only a few specialists in academia and industry were talking about generative\n", + "AI and large language models (LLMs). Today, [retailers have to integrate this](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html)\n", + "[new technology](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html) or fall behind others who will find a way to create more\n", + "personalized consumer experiences with it.\n", + "\n", + "\n", + "Technology, especially when it comes to data and AI, moves far too quickly.\n", + "What retailer tech teams need to deliver at the end of the day is applications,\n", + "of course, but also the ability to react quickly to change. What sort of ongoing\n", + "investments in terms of people, process and technology do retailers need to\n", + "foster in order to ingrain an innovation mindset?\n", + "\n", + "This is an ongoing balancing act where organizations need to innovate and look\n", + "for new opportunities but also sustain that innovation in a way that is realistic\n", + "for the business. For this, let’s consider the 70-20-10 rule: the idea that\n", + "companies should allocate 70% of innovation investment to core initiatives,\n", + "20% to adjacent ones and 10% to transformational ones, or “moonshots.” While\n", + "not a hard-and-fast rule, this concept was touted by Google co-founder Larry\n", + "Page in a [Fortune magazine article](https://www.google.com/url?q=https://money.cnn.com/2008/04/29/magazines/fortune/larry_page_change_the_world.fortune/&sa=D&source=editors&ust=1690998645852122&usg=AOvVaw2AHj-fx8XkEeMKP2Ts5gDu) , and was validated by a [study conducted](https://hbr.org/2012/05/managing-your-innovation-portfolio)\n", + "[by Harvard Business Review](https://hbr.org/2012/05/managing-your-innovation-portfolio) , which found that companies following the rule\n", + "\n", + "outperformed their peers, typically realizing a P/E premium of 10% to 20%.\n", + "\n", + "\n", + "-----\n", + "\n", + "The goal of the 70-20-10 rule is to help guide the organization toward\n", + "sustained innovation and spend the bulk of time on the core business. 
This is\n", + "part of why we recommend starting first with fast (just 2- to 3-month total)\n", + "pilot projects to use AI on existing business use cases like demand forecasting\n", + "and call center optimization. By working in these areas with a focus on learning\n", + "and iterating, retailers will soon find where data silos and rigidity exist in the\n", + "system. As these foundational barriers are knocked down, it then makes it\n", + "possible to tackle more transformational use cases and start to build the\n", + "characteristics of a data-forward enterprise. In other words, start to utilize\n", + "data and data-driven insights as a primary driver for decision-making and\n", + "operations, while also prioritizing continuous data analysis and improvement.\n", + "\n", + "\n", + "**TRANSFORMATIVE**\n", + "\n", + "\n", + "**ADJACENT**\n", + "\n", + "\n", + "**CORE**\n", + "\n", + "\n", + "###### Companies that allocated about 70% of their innovation activity to core initiatives, \n", + "### 20% to adjacent ones and 10% to\n", + "###### transformational ones outperformed their peers.\n", + "\n", + "**Bansi Nagji & Geoff Tuff**\n", + "_Managing Your Innovation Portfolio_\n", + "Harvard Business Review, May 2012\n", + "\n", + "\n", + "-----\n", + "\n", + "## From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise\n", + "\n", + "\n", + "So what does it take to successfully embark on this\n", + "journey to becoming a data-forward enterprise?\n", + "First and foremost, you need to not only establish\n", + "a baseline understanding of what has occurred by\n", + "examining historical data but leverage advancements\n", + "in technologies (e.g., streaming, computer vision,\n", + "voice recognition) to make predictions of the future.\n", + "\n", + "Through the use of both historical data and\n", + "predictive techniques such as forecasting,\n", + "recommendations, prescriptive care and nextbest-action, organizations can begin to improve\n", + "decisions and, in some cases, automate certain\n", + "decision-making processes. But rather than moving\n", + "\n", + "from historical views to predictive actions in a\n", + "linear fashion, this journey involves addressing both\n", + "approaches simultaneously. Once you are able to\n", + "unify historical and predictive analysis, you can then\n", + "take significant steps toward becoming a dataforward enterprise.\n", + "\n", + "\n", + "##### The Data-Forward Enterprise\n", + "\n", + "Data, analytics and AI working in concert\n", + "\n", + "\n", + "**Data Purgatory**\n", + "Things are better, but data isn’t\n", + "driving the business\n", + "\n", + "\n", + "**Data Maturity**\n", + "Every aspect of the\n", + "business is supported\n", + "by insights and AI\n", + "\n", + "\n", + "**Data Siloed**\n", + "Data and teams are segregated\n", + "into different systems\n", + "\n", + "DATA MATURITY\n", + "\n", + "Being data-forward means silos cease to exist, and data, analytics and AI are informing every aspect of the business.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The 8 Steps to Building a Data-Forward Retailer\n", + "\n", + "\n", + "Before you start your data-forward journey, a few critical steps must be\n", + "considered to establish a solid foundation to build upon. 
Based on our\n", + "work with the largest and most successful retailers in the world, spanning\n", + "startups to global giants, we at Databricks have seen that the most successful\n", + "followed these steps to effectively gain wallet share, whereas those who\n", + "couldn’t would often leave major gaps that competitors could take advantage\n", + "of. These steps are the basics to prepare businesses for where they need\n", + "to be both now and in the near future.\n", + "\n", + "\n", + "**2** **Get grounded: Understand the technology**\n", + "\n", + "To start, business leaders need to ground themselves in technology, especially\n", + "when it comes to AI. AI can do amazing things, but it is not magical and vendors\n", + "are prone to overpromising and underdelivering. Less than getting deep into\n", + "code, the purpose is to understand the limitations and ideal use cases.\n", + "\n", + "Databricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\n", + "starting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\n", + "perspective of how different brands are using data, analytics and AI to drive\n", + "revenue or cut operational costs.\n", + "\n", + "\n", + "**1** **Set the foundation: Define goals and objectives**\n", + "\n", + "\n", + "The best way to avoid shiny object syndrome (where you start out with a\n", + "\n", + "technology and then try to figure out what to do with it) is to first identify the\n", + "problems you want to solve. From there, you can set goals around innovation\n", + "to align incentives, and, most importantly, ensure you are driving specific\n", + "business outcomes such as improving customer engagement, optimizing\n", + "inventory management or increasing sales.\n", + "\n", + "\n", + "**3** **Understand the skills and processes in your business**\n", + "\n", + "As we will get into in step 4, starting with smaller pilot projects enables you\n", + "to not just deliver a quick win and validate the use of AI in the enterprise, but\n", + "also understand the in-house capabilities in terms of people, process and\n", + "technology to deliver technical projects. And if required, be willing and ready\n", + "to hire people with the right skill sets that can help you make the most of your\n", + "data. For example, building a core team of data analysts can help extract deep\n", + "insights that lead to better decision-making and identify opportunities for\n", + "growth. It is critical at this step to define the roles you need, determine how\n", + "you will source for those roles (via external hiring or internal transfer), and\n", + "ensure those roles have opportunities for career progression.\n", + "\n", + "\n", + "-----\n", + "\n", + "For inspiration and a head start, check out our [Solution Accelerators for Retail](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods)\n", + "[& Consumer Goods](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods) . These free resources were created to help our customers\n", + "save hours of discovery, design, development and testing. Our purpose-built\n", + "guides — fully functional notebooks and best practices — speed up results\n", + "across your most common and high-impact use cases and enable you to go\n", + "from idea to proof of concept (PoC) in as little as two weeks. 
We have over\n", + "20 accelerators built specifically for critical retail and consumer goods use\n", + "cases, from Demand Forecasting and On-Shelf Availability to Recommendation\n", + "Engines and Customer Lifetime Value. We also have a set of Solution\n", + "Accelerators specifically for [LLMs in Retail & Consumer Goods.](https://www.databricks.com/solutions/accelerators/large-language-models-retail)\n", + "\n", + "**5** **Implement data management and governance early**\n", + "\n", + "The first step to successfully implementing AI/ML in your business broadly\n", + "is to ensure you have accurate, reliable and current data to train your\n", + "models against. This data can (and should) come from a variety of sources,\n", + "so it’s key to unify all data types and sources (sales transactions, customer\n", + "feedback, social media) in a centralized location that is easily accessible,\n", + "while not losing sight of data security to maintain customer trust. Setting\n", + "up data governance parameters to control who has which kinds of access\n", + "to what data, and being able to audit the history of this access, will actually\n", + "accelerate innovation while ensuring data security and compliance.\n", + "\n", + "\n", + "**Delivering exactly what customers want,**\n", + "**every time, and on time**\n", + "\n", + "Data is at the heart of Gousto’s mission to change the\n", + "way people eat through the delivery of boxes of fresh\n", + "ingredients and easy-to-follow recipes. However, even\n", + "as their business exploded at the start of the pandemic,\n", + "their systems couldn’t ingest data fast enough, couldn’t\n", + "talk to each other and wouldn’t scale — forcing them to\n", + "temporarily stop accepting new customers. Now Gousto is\n", + "set up to achieve exciting ambitions for menu expansion,\n", + "sophisticated personalization and next-day delivery. Learn\n", + "how they did it.\n", + "\n", + "**[READ THE FULL GOUSTO STORY](https://www.databricks.com/customers/gousto)**\n", + "\n", + "**4** **Start small: Pilot a project**\n", + "\n", + "There is no substitute for rolling your sleeves up and running a pilot project to\n", + "evaluate the feasibility and potential impact of a project before implementing\n", + "it on a larger scale. When selecting a pilot project, we recommend starting with\n", + "a project that will deliver clear business value, such as incremental revenue\n", + "or clear cost savings, yet only takes 2-3 months to complete. The more time\n", + "there is between project inception and seeing results, the more likely it will lose\n", + "momentum internally.\n", + "\n", + "\n", + "-----\n", + "\n", + "**6** **Incorporate AI across the business (starting with daily tasks)**\n", + "\n", + "Given the large upfront investment in data scientists and engineers to build\n", + "an AI program, the ROI will come from using it at scale. Constantly look to\n", + "uncover patterns and repeatable processes that can be optimized or fully\n", + "automated with AI.\n", + "\n", + "**Building a global fashion icon with a**\n", + "**customer-first approach**\n", + "\n", + "British luxury brand Burberry was seeking an efficient way to\n", + "annotate its thousands of highly specific marketing assets\n", + "for better targeting. Working with Labelbox within Databricks\n", + "Lakehouse, they are now able to complete image annotation\n", + "projects in hours instead of months. 
And marketing team\n", + "members now have access to powerful content insights\n", + "without needing to ask data scientists for help.\n", + "\n", + "**[READ THE FULL BURBERRY STORY](https://www.databricks.com/customers/burberry)**\n", + "\n", + "**Customizing interactions that convert clicks**\n", + "**to revenue with Databricks Lakehouse**\n", + "\n", + "Global jewelry manufacturer and retailer Pandora needed a\n", + "unified view of all their data where they could easily segment,\n", + "categorize and analyze to deliver custom messaging to\n", + "consumers. With Databricks Lakehouse, they now have the\n", + "insights they need to deliver highly targeted messaging —\n", + "increasing consumer engagement from the initial opening of\n", + "a marketing email to maximizing shopping bag conversions to\n", + "driving revenue on the website.\n", + "\n", + "**[READ THE FULL PANDORA STORY](https://www.databricks.com/customers/pandora)**\n", + "\n", + "\n", + "**Building an operationally efficient**\n", + "**omnichannel business**\n", + "\n", + "The Hershey Company analyzes the data they need to\n", + "stay in front of changing human behavior and delight their\n", + "customers. With Databricks Lakehouse, they can analyze\n", + "data feeds from their largest retail customer — uncovering\n", + "insights that will help extend their industry leadership.\n", + "\n", + "**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n", + "\n", + "\n", + "**Ushering in a new era**\n", + "**of data-driven retailing**\n", + "\n", + "Outdoor apparel brand Columbia Sportswear has enabled\n", + "data and analytics self-service throughout the organization in\n", + "a way that ensures everyone is working from a single source\n", + "of truth. Whichever data team needs access to the data,\n", + "Databricks Lakehouse gives them the confidence that the\n", + "data is reliable and consistent.\n", + "\n", + "**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**7** **Foster a culture of data-driven decision-making**\n", + "\n", + "What does it mean to have a culture of data-driven decision-making? In\n", + "practice, it means empowering all employees to use data to inform their\n", + "decisions. Only some strategic decisions will be based on complete and\n", + "accurate information. It’s unwise to assume otherwise. The right approach\n", + "is to leverage as much data as possible, from past tests or current efforts,\n", + "to mitigate risk. Leaders need to not only ask for data but also ensure\n", + "that their employees will be able to find the data they need.\n", + "\n", + "**Unlocking critical trends and insights**\n", + "**needed to serve our 180 million customers**\n", + "\n", + "Reckitt, the maker of Lysol as well as hundreds of other\n", + "household brands, was looking to deliver best-in-class\n", + "customer experiences to their over 180 million customers\n", + "spanning the globe. 
With Databricks Lakehouse, Reckitt\n", + "has established a data-first culture by surfacing real-time,\n", + "highly accurate, deep customer data insights that have\n", + "led to a better understanding of international market\n", + "trends and demand across the multiple product lines\n", + "they support.\n", + "\n", + "**[READ THE FULL RECKITT STORY](https://www.databricks.com/customers/reckitt)**\n", + "\n", + "\n", + "**Customer 360 to enable faster speed**\n", + "**to market, better results**\n", + "\n", + "The Middle East’s Al-Futtaim serves as a local distributor\n", + "for global brands such as Toyota, IKEA and Ace Hardware.\n", + "With Databricks Lakehouse serving as a unified platform to\n", + "aggregate and analyze various data sources on all customers,\n", + "they have created a “golden customer record” that improves\n", + "all decision-making, from forecasting demand to powering\n", + "their global loyalty program.\n", + "\n", + "**[READ THE FULL AL-FUTTAIM STORY](https://www.google.com/url?q=https://www.databricks.com/customers/al-futtaim&sa=D&source=editors&ust=1690998645853527&usg=AOvVaw3cs-6mM2ANTKDCzTdTvEYH)**\n", + "\n", + "**8** **Continuously evaluate and improve**\n", + "\n", + "Recognize that establishing a data-driven culture is an ongoing journey and\n", + "never a set destination. Constantly evaluate your data collection, analysis and\n", + "decision-making process to identify areas for improvement. Even small and\n", + "constant incremental improvements will deliver large gains in absolute terms\n", + "when applied at scale. You can always personalize more, forecast better, or\n", + "better manage your supply chain as you bring in better data sources and refine\n", + "your models.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Transform Retail Data Into Actionable Insights\n", + "\n", + "\n", + "Becoming data forward is not a crazy idea. Too often, leaders or organizations\n", + "allow themselves to be intimidated by focusing on large-scale transformations.\n", + "But it’s the small operational changes that can make your business more efficient\n", + "as well as shift the larger culture forward. Once you’ve set this foundation, it then\n", + "allows you to move toward bigger things. These steps may fail, but it’s actually\n", + "positive to have these setbacks to learn from to try again. The bigger risk is to\n", + "not try and thus fall behind competitors who are embracing the internal changes\n", + "needed to take advantage of AI and machine learning.\n", + "\n", + "Core to delivering on these steps to become a data-forward retailer is a solid\n", + "data foundation that can unify your data and AI workloads with sharing and\n", + "governance built in, so internal and external teams can get access to the\n", + "data they need when they need it. 
With the [Databricks Lakehouse for Retail](https://www.databricks.com/solutions/industries/retail-industry-solutions) ,\n", + "companies gain valuable insights into customer behavior, optimize supply chain\n", + "\n", + "operations and make informed business decisions in real time.\n", + "\n", + "\n", + "EXPLORE DATABRICKS LAKEHOUSE FOR RETAIL\n", + "\n", + "Access key resources to understanding how a lakehouse\n", + "for retail can set you on the path toward becoming a\n", + "data-forward organization.\n", + "\n", + "**[LEARN MORE](https://www.databricks.com/explore/retail-resources)**\n", + "\n", + "\n", + "#### Visit our website to learn more about Databricks Lakehouse for Retail.\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000\n", + "\n", + "organizations worldwide — including Comcast, Condé Nast, and\n", + "\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks#account)**\n", + "\n", + "Contact us for a personalized demo\n", + "**databricks.com/contact**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
### eBook\n", + "\n", + "# The Big Book of MLOps\n", + "\n", + "#### A data-centric approach to build and scale AI, including LLMOps\n", + "\n", + "[Cover graphic: ModelOps + DataOps + DevOps]\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "**AUTHORS:**\n", + "\n", + "**Joseph Bradley**, Lead Product Specialist\n", + "\n", + "**Rafi Kurlansik**, Lead Product Specialist\n", + "\n", + "**Matt Thomson**, Director, EMEA Product Specialists\n", + "\n", + "**Niall Turbitt**, Lead Data Scientist\n", + "\n", + "\n", + "**CHAPTER 1:** **Introduction** 3\n", + "\n", + "###### People and process 4\n", + "\n", + " People 5\n", + "\n", + " Process 6\n", + "\n", + " Why should I care about MLOps? 8\n", + "\n", + " Guiding principles 9\n", + "\n", + "**CHAPTER 2:** **Fundamentals of MLOps** 11\n", + "\n", + "###### Semantics of dev, staging and prod 11\n", + "\n", + " ML deployment patterns 15\n", + "\n", + "**CHAPTER 3:** **MLOps Architecture and Process** 19\n", + "\n", + "###### Architecture components 19\n", + "\n", + " Data Lakehouse 19\n", + "\n", + " MLflow 19\n", + "\n", + " Databricks and MLflow Autologging 20\n", + "\n", + " Feature Store 20\n", + "\n", + " MLflow Model Serving 20\n", + "\n", + " Databricks SQL 20\n", + "\n", + " Databricks Workflows and Jobs 20\n", + "\n", + " Reference architecture 21\n", + "\n", + " Overview 22\n", + "\n", + " Dev 23\n", + "\n", + " Staging 27\n", + "\n", + " Prod 30\n", + "\n", + "**CHAPTER 4:** **LLMOps – Large Language Model Operations** 36\n", + "\n", + "###### Discussion of key topics for LLMOps 39\n", + "\n", + " Reference architecture 46\n", + "\n", + " Looking ahead 48\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "## Introduction\n", + "\n", + "**Note:** Our prescription for MLOps is general to any set of tools and applications, though we give concrete examples using Databricks features and functionality. We also note that no single architecture or prescription will work for all organizations or use cases. Therefore, while we provide guidelines for building MLOps, we call out important options and variations. This whitepaper is written primarily for ML engineers and data scientists wanting to learn more about MLOps, with high-level guidance and pointers to more resources.\n", + "\n", + "\n", + "The past decade has seen rapid growth in the adoption of machine learning (ML). While the early adopters were a small number of large technology companies that could afford the necessary resources, in recent times ML-driven business cases have become ubiquitous in all industries. 
Indeed, according to\n", + "\n", + "MIT Sloan Management Review, 83% of CEOs report that [artificial intelligence (AI) is a strategic priority](https://sloanreview.mit.edu/projects/artificial-intelligence-in-business-gets-real/) .\n", + "\n", + "This democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n", + "\n", + "[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n", + "\n", + "However, building and deploying ML models is complex. There are many options available for achieving\n", + "\n", + "this but little in the way of well-defined and accessible standards. As a result, over the past few years we\n", + "\n", + "have seen the emergence of the machine learning operations (MLOps) field. **MLOps is a set of processes**\n", + "\n", + "**and automation for managing models, data and code to improve performance stability and long-term**\n", + "\n", + "**efficiency in ML systems.** Put simply, MLOps = [ModelOps](https://en.wikipedia.org/wiki/ModelOps) + [DataOps](https://en.wikipedia.org/wiki/DataOps) + [DevOps](https://en.wikipedia.org/wiki/DevOps) .\n", + "\n", + "The concept of developer operations (DevOps) is nothing new. It has been used for decades to deploy\n", + "\n", + "software applications, and the deployment of ML applications has much to gain from it. However, strong\n", + "\n", + "DevOps practices and tooling alone are insufficient because ML applications rely on a constellation of\n", + "\n", + "artifacts (e.g., models, data, code) that require special treatment. Any MLOps solution must take into\n", + "\n", + "account the various people and processes that interact with these artifacts.\n", + "\n", + "Here at Databricks we have seen firsthand how customers develop their MLOps approaches, some of\n", + "\n", + "which work better than others. We launched the open source [MLflow](https://www.mlflow.org/) project to help make our customers\n", + "\n", + "successful with MLOps, and with over 10 million downloads/month from PyPI as of May 2022, MLflow’s\n", + "\n", + "adoption is a testament to the appetite for operationalizing ML models.\n", + "\n", + "This whitepaper aims to explain how your organization can build robust MLOps practices incrementally.\n", + "\n", + "First, we describe the people and process involved in deploying ML applications and the need for\n", + "\n", + "operational rigor. We also provide general principles to help guide your planning and decision-making. Next,\n", + "\n", + "we go through the fundamentals of MLOps, defining terms and broad strategies for deployment. 
Finally, we introduce a general MLOps reference architecture, the details of its processes, and best practices.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### People and process\n", + "\n", + "**ML WORKFLOW AND PERSONAS**\n", + "\n", + "[Figure 1: The ML workflow — data preparation, exploratory data analysis, feature engineering, model training, model validation, deployment and monitoring — and the personas involved at each stage: Data Governance Officer, Data Engineer, Data Scientist, ML Engineer and Business Stakeholder]\n", + "\n", + "**Figure 1**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### People\n", + "\n", + "Building ML applications is a team sport, and while in the real world people “wear many hats,” it is still useful to think in terms of archetypes. They help us understand roles and responsibilities and where handoffs are required, and they highlight areas of complexity within the system. We distinguish between the following personas:\n", + "\n", + "**ML PERSONAS**\n", + "\n", + "\n", + "**Data Governance Officer**\n", + "\n", + "Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.\n", + "\n", + "\n", + "**Data Engineer**\n", + "\n", + "Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.\n", + "\n", + "\n", + "**Data Scientist**\n", + "\n", + "Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.\n", + "\n", + "\n", + "**ML Engineer**\n", + "\n", + "Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment ( [CI/CD](https://en.wikipedia.org/wiki/CI/CD) ).\n", + "\n", + "\n", + "**Business Stakeholder**\n", + "\n", + "Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Process\n", + "\n", + "Together, these people develop and maintain ML applications. While the development process follows a distinct pattern, it is not entirely monolithic. 
The way you deploy a model has an impact on the steps\n", + "\n", + "you take, and using techniques like reinforcement learning or online learning will change some details.\n", + "\n", + "Nevertheless, these steps and personas involved are variations on a core theme, as illustrated in Figure 1\n", + "\n", + "above.\n", + "\n", + "Let’s walk through the process step by step. Keep in mind that this is an iterative process, the frequency of\n", + "\n", + "which will be determined by the particular business case and data.\n", + "\n", + "**M L P R O C E S S**\n", + "\n", + "\n", + "Data\n", + "Preparation\n", + "\n", + "\n", + "Exploratory\n", + "Data Analysis\n", + "\n", + "\n", + "Feature\n", + "Engineering\n", + "\n", + "\n", + "Model\n", + "Training\n", + "\n", + "\n", + "Model\n", + "Validation\n", + "\n", + "\n", + "Deployment Monitoring\n", + "\n", + "\n", + "###### Data preparation\n", + "\n", + "Prior to any data science or ML work lies the data engineering needed to prepare production data and make\n", + "\n", + "it available for consumption. This data may be referred to as “raw data,” and in later steps, data scientists\n", + "\n", + "will extract features and labels from the raw data.\n", + "\n", + "###### Exploratory data analysis (EDA)\n", + "\n", + "Analysis is conducted by data scientists to assess statistical properties of the data available, and determine\n", + "\n", + "if they address the business question. This requires frequent communication and iteration with business\n", + "\n", + "stakeholders.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Feature engineering\n", + "\n", + "Data scientists clean data and apply business logic and specialized transformations to engineer features for\n", + "\n", + "model training. These data, or features, are split into training, testing and validation sets.\n", + "\n", + "###### Model training\n", + "\n", + "Data scientists explore multiple algorithms and hyperparameter configurations using the prepared data, and\n", + "\n", + "a best-performing model is determined according to predefined evaluation metric(s).\n", + "\n", + "###### Model validation\n", + "\n", + "Prior to deployment a selected model is subjected to a validation step to ensure that it exceeds\n", + "\n", + "some baseline level of performance, in addition to meeting any other technical, business or regulatory\n", + "\n", + "requirements. This necessitates collaboration between data scientists, business stakeholders and ML\n", + "\n", + "engineers.\n", + "\n", + "###### Deployment\n", + "\n", + "ML engineers will deploy a validated model via batch, streaming or online serving, depending on the\n", + "\n", + "requirements of the use case.\n", + "\n", + "###### Monitoring\n", + "\n", + "ML engineers will monitor deployed models for signs of performance degradation or errors. Data scientists\n", + "\n", + "will often be involved in early monitoring phases to ensure that new models perform as expected after\n", + "\n", + "deployment. This will inform if and when the deployed model should be updated by returning to earlier\n", + "\n", + "stages in the workflow.\n", + "\n", + "The data governance officer is ultimately responsible for making sure this entire process is compliant with\n", + "\n", + "company and regulatory policies.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Why should I care about MLOps?\n", + "\n", + "Consider that the typical ML application depends on the aforementioned people and process, as well\n", + "\n", + "as regulatory and ethical requirements. 
These dependencies change over time — and your models, data and code must change as well. The data that were a reliable signal yesterday become noise; open source libraries become outdated; regulatory environments evolve; and teams change. ML systems must be resilient to these changes. Yet this broad scope can be a lot for organizations to manage — there are many moving parts! Addressing these challenges with a defined MLOps strategy can dramatically reduce the iteration cycle of delivering models to production, thereby accelerating time to business value.

There are two main types of risk in ML systems: **technical risk** inherent to the system itself and **risk of noncompliance** with external systems. Both of these risks derive from the dependencies described above. For example, if data pipeline infrastructure, KPIs, model monitoring and documentation are lacking, then you risk your system becoming destabilized or ineffective. On the other hand, even a well-designed system that fails to comply with corporate, regulatory and ethical requirements runs the risk of losing funding, receiving fines or incurring reputational damage. Recently, one private company’s data collection practices were found to have violated the Children’s Online Privacy Protection Rule (COPPA). The [FTC fined](https://www.protocol.com/policy/ftc-algorithm-destroy-data-privacy) the company $1.5 million and [ordered](https://www.ftc.gov/system/files/ftc_gov/pdf/wwkurbostipulatedorder.pdf) it to destroy or delete the illegally harvested data, and all models or algorithms developed with that data.

With respect to efficiency, the absence of MLOps is typically marked by an overabundance of manual processes. These steps are slower and more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck, capping the ability for a data team to take on new projects.

Seen through these lenses, the aim of MLOps becomes clear: improve the long-term performance stability and success rate of ML systems while maximizing the efficiency of teams who build them. In the introduction, we defined MLOps to address this aim: MLOps is a **set of processes and automation** to manage **models, data and code** to meet the two goals of **stable performance and long-term efficiency in ML systems**. _MLOps = ModelOps + DataOps + DevOps_.

With clear goals, we are ready to discuss the principles that guide design decisions and planning for MLOps.

-----

Given the complexity of ML processes and the different personas involved, it is helpful to start from simpler, high-level guidance. We propose several broadly applicable principles to guide MLOps decisions. They inform our design choices in later sections, and we hope they can be adapted to support whatever your business use case may be.

#### Guiding principles

###### Always keep your business goals in mind

Just as the core purpose of ML in a business is to enable data-driven decisions and products, the core purpose of MLOps is to ensure that those data-driven applications remain stable, are kept up to date and continue to have positive impacts on the business. When prioritizing technical work on MLOps, consider the business impact: Does it enable new business use cases? Does it improve data teams’ productivity? Does it reduce operational costs or risks?

###### Take a data-centric approach to machine learning

Feature engineering, training, inference and monitoring pipelines are data pipelines. As such, they need to be as robust as other production data engineering processes. Data quality is crucial in any ML application, so ML data pipelines should employ systematic approaches to monitoring and mitigating data quality issues. Avoid tools that make it difficult to join data from ML predictions, model monitoring, etc., with the rest of your data. The simplest way to achieve this is to develop ML applications on the same platform used to manage production data. For example, instead of downloading training data to a laptop, where it is hard to govern and reproduce results, secure the data in cloud storage and make that storage available to your training process.

-----

###### Implement MLOps in a modular fashion

As with any software application, code quality is paramount for an ML application. Modularized code enables testing of individual components and mitigates difficulties with future code refactoring. Define clear steps (e.g., training, evaluation or deployment), supersteps (e.g., training-to-deployment pipeline) and responsibilities to clarify the modular structure of your ML application.

###### Process should guide automation

We automate processes to improve productivity and lower risk of human error, but not every step of a process can or should be automated. People still determine the business question, and some models will always need human oversight before deployment. Therefore, the development process is primary, and each module in the process should be automated as needed. This allows incremental build-out of automation and customization. Furthermore, when it comes to particular automation tools, choose those that align to your people and process.
For example, instead of building a model logging framework around a generic\n", + "\n", + "database, you can choose a specialized tool like MLflow, which has been designed with the ML model\n", + "\n", + "lifecycle in mind.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 2:**\n", + "## Fundamentals of MLOps\n", + "\n", + "**Note:** In our experience with customers, there\n", + "\n", + "can be variations in these three stages, such as\n", + "\n", + "splitting staging into separate “test” and “QA”\n", + "\n", + "substages. However, the principles remain the\n", + "\n", + "same and we stick to a dev, staging and prod\n", + "\n", + "setup within this paper.\n", + "\n", + "\n", + "#### Semantics of dev, staging and prod\n", + "\n", + "ML workflows include the following key assets: code, models and data. These assets need to be developed\n", + "\n", + "(dev), tested (staging) and deployed (prod). For each stage, we also need to operate within an execution\n", + "\n", + "environment. Thus, all the above — execution environments, code, models and data — are divided into dev,\n", + "\n", + "staging and prod.\n", + "\n", + "These divisions can best be understood in terms of quality guarantees and access control. On one end,\n", + "\n", + "assets in prod are generally business critical, with the highest guarantee of quality and tightest control on\n", + "\n", + "who can modify them. Conversely, dev assets are more widely accessible to people but offer no guarantee\n", + "\n", + "of quality.\n", + "\n", + "For example, many data scientists will work together in a dev environment, freely producing dev model\n", + "\n", + "prototypes. Any flaws in these models are relatively low risk for the business, as they are separate from\n", + "\n", + "the live product. In contrast, the staging environment replicates the execution environment of production.\n", + "\n", + "Here, code changes made in the dev environment are tested prior to code being deployed to production.\n", + "\n", + "The staging environment acts as a gateway for code to reach production, and accordingly, fewer people\n", + "\n", + "are given access to staging. Code promoted to production is considered a live product. In the production\n", + "\n", + "environment, human error can pose the greatest risk to business continuity, and so the least number of\n", + "\n", + "people have permission to modify production models.\n", + "\n", + "One might be tempted to say that code, models and data each share a one-to-one correspondence with\n", + "\n", + "the execution environment — e.g., all dev code, models and data are in the dev environment. That is often\n", + "\n", + "close to true but is rarely correct. Therefore, we will next discuss the precise semantics of dev, staging\n", + "\n", + "and prod for execution environments, code, models and data. We also discuss mechanisms for restricting\n", + "\n", + "access to each.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Execution environments\n", + "\n", + "An execution environment is the place where models and data are created or consumed by code. Each\n", + "\n", + "execution environment consists of compute instances, their runtimes and libraries, and automated jobs.\n", + "\n", + "With Databricks, an “environment” can be defined via dev/staging/prod separation at a few levels. An\n", + "\n", + "organization could create distinct environments across multiple cloud accounts, multiple Databricks\n", + "\n", + "workspaces in the same cloud account, or within a single Databricks workspace. 
These separation patterns are illustrated in Figure 2 below.

**E N V I R O N M E N T S E P A R A T I O N P A T T E R N S**

**Figure 2** Environment separation patterns: dev, staging and prod can be separated across multiple cloud accounts, across multiple Databricks workspaces in one cloud account, or within a single Databricks workspace using workspace access controls.

-----

Databricks released Delta Lake to the open source community in 2019. Delta Lake provides all the data lifecycle management functions that are needed to make cloud-based object stores reliable and performant. This design allows clients to update multiple objects at once and to replace a subset of the objects with another, etc., in a serializable manner that still achieves high parallel read/write performance from the objects — while offering advanced capabilities like time travel (e.g., query point-in-time snapshots or rollback of erroneous updates), automatic data layout optimization, upserts, caching and audit logs.

###### Code

ML project code is often stored in a version control repository (such as Git), with most organizations using branches corresponding to the lifecycle phases of development, staging or production. There are a few common patterns. Some use only development branches (dev) and one main branch (staging/prod). Others use main and development branches (dev), branches cut for testing potential releases (staging), and branches cut for final releases (prod). Regardless of which convention you choose, separation is enforced through Git repository branches.

As a best practice, code should only be run in an execution environment that corresponds to it or in one that’s higher. For example, the dev environment can run any code, but the prod environment can only run prod code.

###### Models

While models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to note that model and code lifecycle phases often operate asynchronously**. That is, you may want to push a new model version before you push a code change, and vice versa. Consider the following scenarios:

- To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. Deploying the code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle of being generated, tested and marked as “production” to predict on the most recent transactions. In this case the code lifecycle is slower than the model lifecycle.

- To classify documents using large deep neural networks, training and deploying the model is often a one-time process due to cost. Updates to the serving and monitoring code in the project may be deployed more frequently than a new version of the model. In this case the model lifecycle is slower than the code lifecycle.

Since model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model management to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts directly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update production models without code changes, streamlining the deployment process in many cases. Model artifacts are secured using MLflow access controls or cloud storage permissions.

-----

###### Data

Some organizations label data as either dev, staging or prod, depending on which environment it originated in. For example, all prod data is produced in the prod environment, but dev and staging environments may have read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data may be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around reliability and freshness. Access to data in each environment is controlled with table access controls ([AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) | [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html)) or cloud storage permissions.

In summary, when it comes to MLOps, you will always have operational separation between dev, staging and prod. Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod will be the highest quality and tightly controlled.

|ASSET|SEMANTICS|SEPARATED BY|
|---|---|---|
|Execution environments|Labeled according to where development, testing and connections with production systems happen|Cloud provider and Databricks Workspace access controls|
|Models|Labeled according to model lifecycle phase|MLflow access controls or cloud storage permissions|
|Data|Labeled according to its origin in dev, staging or prod execution environments|Table access controls or cloud storage permissions|
|Code|Labeled according to software development lifecycle phase|Git repository branches|

**Table 1**

-----

#### ML deployment patterns

The fact that models and code can be managed separately results in multiple possible patterns for getting ML artifacts through staging and into production.
We explain two major patterns below.\n", + "\n", + "**D E P L O Y M O D E L S**\n", + "\n", + "dev staging prod\n", + "\n", + "**D E P L O Y C O D E**\n", + "\n", + "dev staging prod\n", + "\n", + "These two patterns differ in terms of whether the model artifact or the training code that produces the\n", + "\n", + "model artifact is promoted toward production.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Deploy models\n", + "\n", + "In the first pattern, the model artifact is generated by training code in the development environment.\n", + "\n", + "This artifact is then tested in staging for compliance and performance before finally being deployed into\n", + "\n", + "production. This is a simpler handoff for data scientists, and in cases where model training is prohibitively\n", + "\n", + "expensive, training the model once and managing that artifact may be preferable. However, this simpler\n", + "\n", + "architecture comes with limitations. If production data is not accessible from the development environment\n", + "\n", + "(e.g., for security reasons), this architecture may not be viable. This architecture does not naturally support\n", + "\n", + "automated model retraining. While you could automate retraining in the development environment, you\n", + "\n", + "would then be treating “dev” training code as production ready, which many deployment teams would not\n", + "\n", + "accept. This option hides the fact that ancillary code for featurization, inference and monitoring needs to be\n", + "\n", + "deployed to production, requiring a separate code deployment path.\n", + "\n", + "###### Deploy code\n", + "\n", + "In the second pattern, the code to train models is developed in the dev environment, and this code is\n", + "\n", + "moved to staging and then production. Models will be trained in each environment: initially in the dev\n", + "\n", + "environment as part of model development, in staging (on a limited subset of data) as part of integration\n", + "\n", + "tests, and finally in the production environment (on the full production data) to produce the final model.\n", + "\n", + "If an organization restricts data scientists’ access to production data from dev or staging environments,\n", + "\n", + "deploying code allows training on production data while respecting access controls. Since training code\n", + "\n", + "goes through code review and testing, it is safer to set up automated retraining. Ancillary code follows the\n", + "\n", + "same pattern as model training code, and both can go through integration tests in staging. However, the\n", + "\n", + "learning curve for handing code off to collaborators can be steep for many data scientists, so opinionated\n", + "\n", + "project templates and workflows are helpful. 
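One common way to keep training code promotable from dev to staging to prod is to parameterize it by environment rather than branching the logic. The following is a minimal, hypothetical sketch of such an entry point; the table names, experiment paths and `--env` flag are illustrative assumptions, not part of the reference architecture:

```
# Hypothetical "deploy code" training entry point: the same script is promoted
# across environments, and only its configuration changes.
import argparse

import mlflow
from pyspark.sql import SparkSession

# Per-environment settings; table names and experiment paths are illustrative.
ENV_CONFIG = {
    "dev": {"feature_table": "dev.features", "experiment": "/dev/churn", "data_fraction": 0.1},
    "staging": {"feature_table": "staging.features", "experiment": "/staging/churn", "data_fraction": 0.1},
    "prod": {"feature_table": "prod.features", "experiment": "/prod/churn", "data_fraction": 1.0},
}

def main(env: str) -> None:
    cfg = ENV_CONFIG[env]
    spark = SparkSession.builder.getOrCreate()
    # Staging runs on a data subset; prod runs on the full data.
    df = spark.table(cfg["feature_table"]).sample(fraction=cfg["data_fraction"], seed=42)

    mlflow.set_experiment(cfg["experiment"])
    with mlflow.start_run():
        mlflow.log_param("env", env)
        mlflow.log_param("n_rows", df.count())
        # ... train, evaluate and log the model here ...

if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--env", choices=["dev", "staging", "prod"], default="dev")
    main(parser.parse_args().env)
```

With this structure, CI can run the identical script in staging on a small subset, and the production job can run it on full production data without code changes.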
Finally, data scientists need visibility into training results from\n", + "\n", + "the production environment, for only they have the knowledge to identify and fix ML-specific issues.\n", + "\n", + "\n", + "-----\n", + "\n", + "The diagram below contrasts the code lifecycle for the above deployment patterns across the different\n", + "\n", + "execution environments.\n", + "\n", + "\n", + "Code\n", + "development\n", + "\n", + "Development\n", + "environment\n", + "\n", + "\n", + "Unit\n", + "tests\n", + "\n", + "\n", + "Integration\n", + "tests\n", + "\n", + "Development\n", + "environment\n", + "\n", + "Staging\n", + "environment\n", + "\n", + "\n", + "Model\n", + "training\n", + "\n", + "\n", + "Continuous\n", + "deployment\n", + "\n", + "Staging\n", + "environment\n", + "\n", + "Production\n", + "environment\n", + "\n", + "\n", + "Deploy\n", + "pipelines\n", + "\n", + "Production\n", + "environment\n", + "\n", + "\n", + "#### Deploy models\n", + "\n", + " Deploy code\n", + "\n", + "\n", + "**In general we recommend following the “deploy code” approach, and the reference architecture in**\n", + "\n", + "**this document is aligned to it.** Nevertheless, there is no perfect process that covers every scenario, and\n", + "\n", + "the options outlined above are not mutually exclusive. Within a single organization, you may find some use\n", + "\n", + "cases deploying training code and others deploying model artifacts. Your choice of process will depend on\n", + "\n", + "the business use case, resources available and what is most likely to succeed.\n", + "\n", + "\n", + "-----\n", + "\n", + "|Col1|Col2|DEPLOY MODELS|DEPLOY CODE|\n", + "|---|---|---|---|\n", + "|Process|Dev|Develop training code. Develop ancillary code.1 Train model on prod data.  Promote model and ancillary code.|Develop training code. Develop ancillary code.  Promote code.|\n", + "||Staging|Test model and ancillary code.  Promote model and ancillary code.|Train model on data subset. Test ancillary code.  Promote code.|\n", + "||Prod|Deploy model. Deploy ancillary pipelines.|Train model on prod data. Test model. Deploy model. Deploy ancillary pipelines.|\n", + "|Trade-offs|Automation| Does not support automated retraining in locked-down env.| Supports automated retraining in locked-down env.|\n", + "||Data access control| Dev env needs read access to prod training data.| Only prod env needs read access to prod training data.|\n", + "||Reproducible models| Less eng control over training env, so harder to ensure reproducibility.| Eng control over training env, which helps to simplify reproducibility.|\n", + "||Data science familiarity| DS team builds and can directly test models in their dev env.| DS team must learn to write and hand off modular code to eng.|\n", + "||Support for large projects| T\u0007his pattern does not force the DS team to use modular code for model training, and it has less iterative testing.| \u0007This pattern forces the DS team to use modular code and iterative testing, which helps with coordination and development in larger projects.|\n", + "||Eng setup and maintenance| Has the simplest setup, with less CI/CD infra required.| \u0007Requires CI/CD infra for unit and integration tests, even for one-off models.|\n", + "|When to use||Use this pattern when your model is a one-off or when model training is very expensive. Use when dev, staging and prod are not strictly separated envs.|Use this pattern by default. 
Use when dev, staging and prod are strictly separated envs.|

**Table 2** **1** “Ancillary code” refers to code for ML pipelines other than the model training pipeline. Ancillary code could be featurization, inference, monitoring or other pipelines.

-----

**CHAPTER 3:**
## MLOps Architecture and Process

###### Lakehouse Platform

The Databricks Lakehouse Platform supports data warehousing, data engineering, data streaming, and data science and ML on top of Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and the cloud data lake (all structured and unstructured data).

#### Architecture components

Before unpacking the reference architecture, take a moment to familiarize yourself with the Databricks features used to facilitate MLOps in the workflow prescribed.

###### Data Lakehouse

A [Data Lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) unifies the best elements of data lakes and data warehouses — delivering data management and performance typically found in data warehouses with the low-cost, flexible object stores offered by data lakes. Data in the lakehouse are typically organized using a “medallion” architecture of Bronze, Silver and Gold tables of increasing refinement and quality.

###### MLflow

[MLflow](https://www.mlflow.org/) is an open source project for managing the end-to-end machine learning lifecycle. It has the following primary components:

- **Tracking:** Allows you to track experiments to record and compare parameters, metrics and model artifacts. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/tracking.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/tracking.html).

- **Models (“MLflow flavors”):** Allows you to store and deploy models from any ML library to a variety of model serving and inference platforms. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/models.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/models) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/models.html).

- **Model Registry:** Provides a centralized model store for managing models’ full lifecycle stage transitions: from staging to production, with capabilities for versioning and annotating. The registry also provides webhooks for automation and continuous deployment.
See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-registry.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-registry.html) .\n", + "| |\n", + "\n", + "Databricks also provides a fully managed and hosted version of MLflow with enterprise security features,\n", + "\n", + "high availability, and other Databricks workspace features such as experiment and run management and\n", + "\n", + "notebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\n", + "\n", + "machine learning model training runs and running machine learning projects.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Databricks and MLflow Autologging\n", + "\n", + "Databricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\n", + "\n", + "experiment tracking for machine learning training sessions on Databricks. Databricks Autologging\n", + "\n", + "\n", + "automatically captures model parameters, metrics, files and lineage information when you train models with\n", + "\n", + "training runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html) .\n", + "| |\n", + "\n", + "###### Feature Store\n", + "\n", + "The Databricks Feature Store is a centralized repository of features. It enables feature sharing and discovery\n", + "\n", + "\n", + "across an organization and also ensures that the same feature computation code is used for model training\n", + "\n", + "and inference. See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html) .\n", + "| |\n", + "\n", + "###### MLflow Model Serving\n", + "\n", + "MLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints\n", + "\n", + "\n", + "that are updated automatically based on the availability of model versions and their stages. See\n", + "\n", + "documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html) .\n", + "| |\n", + "\n", + "###### Databricks SQL\n", + "\n", + "Databricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their\n", + "\n", + "\n", + "data lake, create multiple visualization types to explore query results from different perspectives, and build\n", + "\n", + "and share dashboards. 
See documentation for [AWS](https://docs.databricks.com/sql/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/sql/) | [GCP](https://docs.gcp.databricks.com/sql/index.html).

###### Databricks Workflows and Jobs

Databricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive ways. For ML, Jobs can be used to define pipelines for computing features, training models, or other ML steps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) | [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html).

-----

#### Reference architecture

We are now ready to review a general reference architecture for implementing MLOps on the Databricks Lakehouse platform using the recommended “deploy code” pattern from earlier. This is intended to cover the majority of use cases and ML techniques, but it is by no means comprehensive. When appropriate, we will highlight alternative approaches to implementing different parts of the process.

We begin with an overview of the system end-to-end, followed by more detailed views of the process in development, staging and production environments. These diagrams show the system as it operates in a steady state, with the finer details of iterative development cycles omitted. This structure is summarized below.

**O V E R V I E W**

`dev`
- Data
- Exploratory data analysis (EDA)
- Project code
- Feature table refresh
- Model training
- Commit code

`staging`
- Merge request
- Unit tests (CI)
- Integration tests (CI)
- Merge
- Cut release branch

`prod`
- Feature table refresh
- Model training
- Continuous deployment (CD)
- Online serving (REST APIs)
- Inference: batch or streaming
- Monitoring
- Retraining

-----

###### Overview

**Figure 3** Overview of the deploy-code reference architecture: pipelines are prototyped on a dev branch in the development environment; a merge request to the staging (main) branch triggers unit and integration tests (CI) in the staging environment; a release branch is cut and pulled into the production environment, where feature table refresh, model training, continuous deployment (CD), inference and serving, and monitoring pipelines run against Lakehouse data, feature and metrics tables, and models move through the Model Registry stages None, Staging and Production.

Here we see the overall process for deploying code and model artifacts, the inputs and outputs for pipelines, and model lifecycle stages in production. Code source control is the primary conduit for deploying ML pipelines from development to production. Pipelines and models are prototyped on a dev branch in the development environment, and changes to the codebase are committed back to source control. Upon merge request to the staging branch (usually the “main” branch), a continuous integration (CI) process tests the code in the staging environment. If the tests pass, new code can be deployed to production by cutting a code release. In production, a model is trained on the full production data and pushed to the MLflow Model Registry. A continuous deployment (CD) process tests the model and promotes it toward the production stage in the registry. The Model Registry’s production model can be served via batch, streaming or REST API.

-----

###### Dev

In the development environment, data scientists and ML engineers can collaborate on all pipelines in an ML project, committing their changes to source control. While engineers may help to configure this environment, data scientists typically have significant control over the libraries, compute resources and code that they use.

**Figure 4** Development environment: exploratory data analysis; the project code repository (featurization, training, deployment, inference and monitoring pipelines, plus unit and integration tests); the feature table refresh pipeline (data preparation and featurization); the model training pipeline (training and tuning, evaluation) logging metrics, parameters and models to the MLflow tracking server; inference (streaming or batch); and commits from a dev branch back to source control, all reading prod data and dev feature and temp tables in the Lakehouse.

-----

###### Data

Data scientists working in the dev environment possess read-only access to production data. They also require read-write access to a separate dev storage environment to develop and experiment with new features and other data tables.

###### Exploratory data analysis (EDA)

The data scientist explores and analyzes data in an interactive, iterative process. This process is used to assess whether the available data has the potential to address the business problem. EDA is also where the data scientist will begin discerning what data preparation and featurization are required for model training. This ad hoc process is generally not part of a pipeline that will be deployed in other execution environments.

###### Project code

This is a code repository containing all of the pipelines or modules involved in the ML system. Dev branches are used to develop changes to existing pipelines or to create new ones. Even during EDA and initial phases of a project, it is recommended to develop within a repository to help with tracking changes and sharing code.

-----

###### Feature table refresh

This pipeline reads from raw data tables and feature tables and writes to tables in the Feature Store. The pipeline consists of two steps:

- **Data preparation:** This step checks for and corrects any data quality issues prior to featurization.

- **Featurization:** In the dev environment, new features and updated featurization logic can be tested by writing to feature tables in dev storage, and these dev feature tables can be used for model prototyping. Once this featurization code is promoted to production, these changes will affect the production feature tables. Features already available in production feature tables can be read directly for development.

In some organizations, feature engineering pipelines are managed separately from ML projects. In such cases, the featurization pipeline can be omitted from this architecture.
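To make the featurization step above concrete, here is a minimal sketch of a dev featurization pipeline writing to a Feature Store table. The table and column names are illustrative, and the Feature Store client API may differ slightly across Databricks Runtime versions:

```
# Minimal sketch: compute features in dev and write them to a Feature Store table.
from databricks.feature_store import FeatureStoreClient
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()
fs = FeatureStoreClient()

def compute_customer_features(raw_df):
    # Example featurization: aggregate transaction amounts per customer.
    return raw_df.groupBy("customer_id").agg(
        F.count("*").alias("txn_count"),
        F.avg("amount").alias("avg_amount"),
    )

raw_df = spark.table("dev.raw_transactions")      # dev storage, per the text above
features_df = compute_customer_features(raw_df)

fs.create_table(
    name="dev.customer_features",
    primary_keys=["customer_id"],
    df=features_df,
    description="Prototype customer features (dev)",
)
# Subsequent refreshes can upsert into the same table:
fs.write_table(name="dev.customer_features", df=features_df, mode="merge")
```

The same featurization function can later be promoted unchanged and pointed at production tables.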
-----

###### Model training

Data scientists develop the model training pipeline in the dev environment with dev or prod feature tables.

- **Training and tuning:** The training process reads features from the feature store and/or Silver- or Gold-level Lakehouse tables, and it logs model parameters, metrics and artifacts to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32). After training and hyperparameter tuning, the final model artifact is logged to the tracking server to record a robust link between the model, its input data, and the code used to generate it (a sketch follows after this list).

- **Evaluation:** Model quality is evaluated by testing on held-out data. The results of these tests are logged to the MLflow tracking server. If governance requires additional metrics or supplemental documentation about the model, this is the time to add them using MLflow tracking. Model interpretations (e.g., plots produced by [SHAP](https://shap.readthedocs.io/en/latest/index.html) or [LIME](https://arxiv.org/abs/1602.04938)) and plain text descriptions are common, but defining the specifics for such governance requires input from business stakeholders or a data governance officer.

- **Model output:** The output of this pipeline is an ML model artifact stored in the MLflow tracking server. When this training pipeline is run in staging or production, ML engineers (or their CI/CD code) can load the model via the model URI (or path) and then push the model to the Model Registry for management and testing.
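As a concrete illustration of the training and evaluation steps above, the following minimal sketch logs parameters, metrics and the model artifact to MLflow. The feature schema, algorithm and metric are assumptions chosen for illustration, not a prescription:

```
# Minimal sketch of a dev training pipeline logging to MLflow.
import mlflow
import mlflow.sklearn
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

def train_and_log(features_df):
    # Assumes a pandas DataFrame with a binary "label" column.
    X = features_df.drop(columns=["label"])
    y = features_df["label"]
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

    with mlflow.start_run():
        params = {"n_estimators": 200, "max_depth": 5}
        model = RandomForestClassifier(**params).fit(X_train, y_train)

        # Training and tuning: record parameters and the model artifact.
        mlflow.log_params(params)
        mlflow.sklearn.log_model(model, artifact_path="model")

        # Evaluation: record held-out metrics alongside the run.
        auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
        mlflow.log_metric("test_auc", auc)
    return model
```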
###### Commit code

After developing code for featurization, training, inference and other pipelines, the data scientist or ML engineer commits the dev branch changes into source control. This section does not discuss the continuous deployment, inference or monitoring pipelines in detail; see the “Prod” section below for more information on those.

-----

###### Staging

The transition of code from development to production occurs in the staging environment. This code includes model training code and ancillary code for featurization, inference, etc. Both data scientists and ML engineers are responsible for writing tests for code and models, but ML engineers manage the continuous integration pipelines and orchestration.

**Figure 5** Staging environment: a merge request from the dev branch to the staging (main) branch triggers CI, which runs unit tests and then integration tests (Feature Store, model training, model deployment, inference and model monitoring tests) against staging data in the Lakehouse, recording runs to the tracking server and Model Registry; once tests pass, the code is merged and a release branch is cut.

-----

###### Data

The staging environment may have its own storage area for testing feature tables and ML pipelines. This data is generally temporary and only retained long enough to run tests and to investigate test failures. This data can be made readable from the development environment for debugging.

###### Merge code

- **Merge request:** The deployment process begins when a merge (or pull) request is submitted against the staging branch of the project in source control. It is common to use the “main” branch as the staging branch.

- **Unit tests (CI):** This merge request automatically builds source code and triggers unit tests. If tests fail, the merge request is rejected.

-----

###### Integration tests (CI)

The merge request then goes through integration tests, which run all pipelines to confirm that they function correctly together. The staging environment should mimic the production environment as much as is reasonable, running and testing pipelines for featurization, model training, inference and monitoring.

Integration tests can trade off fidelity of testing for speed and cost. For example, when models are expensive to train, it is common to test model training on small data sets or for fewer iterations to reduce cost. When models are deployed behind REST APIs, some high-SLA models may merit full-scale load testing within these integration tests, whereas other models may be tested with small batch jobs or a few queries to temporary REST endpoints.

Once integration tests pass on the staging branch, the code may be promoted toward production.

- **Merge:** If all tests pass, the new code is merged into the staging branch of the project. If tests fail, the CI/CD system should notify users and post results on the merge (pull) request. A sketch of such tests follows.
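For illustration, the unit and integration tests described above might include checks along the lines of the following hypothetical pytest sketch; the featurization helper, fixture data and assertions are assumptions:

```
# Hypothetical CI tests (pytest). In practice these run in the staging
# environment, with integration tests using small data sets for speed.
import pandas as pd
from sklearn.dummy import DummyClassifier
from sklearn.metrics import accuracy_score

def add_ratio_feature(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["ratio"] = out["amount"] / out["balance"].clip(lower=1.0)
    return out

def test_add_ratio_feature_handles_zero_balance():
    # Unit test: featurization logic should not divide by zero.
    df = pd.DataFrame({"amount": [10.0], "balance": [0.0]})
    assert add_ratio_feature(df)["ratio"].iloc[0] == 10.0

def test_training_runs_end_to_end_on_sample():
    # Integration-style check: a trivial pipeline runs end to end on a small
    # fixture and produces a sane accuracy value.
    df = pd.DataFrame({"x": range(20), "label": [0, 1] * 10})
    model = DummyClassifier(strategy="most_frequent").fit(df[["x"]], df["label"])
    acc = accuracy_score(df["label"], model.predict(df[["x"]]))
    assert 0.0 <= acc <= 1.0
```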
Note: It can be useful to schedule periodic integration tests on the staging branch, especially if the branch is updated frequently with concurrent merge requests.

###### Cut release branch

Once CI tests have passed on a commit in the staging branch, ML engineers can cut a release branch from that commit.

-----

###### Prod

The production environment is typically managed by a select set of ML engineers and is where ML pipelines directly serve the business or application. These pipelines compute fresh feature values, train and test new model versions, publish predictions to downstream tables or applications, and monitor the entire process to avoid performance degradation and instability. While we illustrate batch and streaming inference alongside online serving below, most ML applications will use only one of these methods, depending on the business requirements.

**Figure 6** Production environment: the feature table refresh pipeline (data preparation and featurization); the model training pipeline (training and tuning, evaluation), which registers the model and requests a stage transition; continuous deployment (compliance checks, comparison of Staging vs. Production, and a request to transition the model to Production); the Model Registry stages None, Staging and Production; online serving, which loads the model and logs requests and predictions; batch or streaming inference, which loads the model, ingests data and publishes predictions; and monitoring, which ingests data, checks model performance and data drift, publishes metrics and can trigger model training, all reading and writing Lakehouse data, feature and monitoring tables.

-----

Though data scientists may not have write or compute access in the production environment, it is important to provide them with visibility to test results, logs, model artifacts and the status of ML pipelines in production. This visibility allows them to identify and diagnose problems in production.
###### Feature table refresh

This pipeline transforms the latest production Lakehouse data into production feature tables. It can use batch or streaming computation, depending on the freshness requirements for downstream training and inference. The pipeline can be defined as a [Databricks Job](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.rxs6npet1ull) which is scheduled, triggered or continuously running.

###### Model training

The model training pipeline runs either when code changes affect upstream featurization or training logic, or when automated retraining is scheduled or triggered. This pipeline runs on the full production data.

- **Training and tuning:** During the training process, logs are recorded to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32). These include model metrics, parameters, tags and the model itself. During development, data scientists may test many algorithms and hyperparameters, but it is common to restrict those choices to the top-performing options in the production training code. Restricting tuning can reduce the variance from tuning in automated retraining, and it can make training and tuning faster.

- **Evaluation:** Model quality is evaluated by testing on held-out production data. The results of these tests are logged to the MLflow tracking server. During development, data scientists will have selected meaningful evaluation metrics for the use case, and those metrics or their custom logic will be used in this step.

- **Register and request transition:** Following model training, the model artifact is registered to the [MLflow Model Registry](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) of the production environment, set initially to ‘stage=None’. The final step of this pipeline is to request a transition of the model to ‘stage=Staging’.

-----

###### Continuous deployment (CD)

The CD pipeline is executed when the training pipeline finishes and requests to transition the model to ‘stage=Staging’. There are three key tasks in this pipeline:

- **Compliance checks:** These tests load the model from the Model Registry, perform compliance checks (for tags, documentation, etc.), and approve or reject the request based on test results. If compliance checks require human expertise, this automated step can compute statistics or visualizations for people to review in a manual approval step at the end of the CD pipeline. Regardless of the outcome, results for that model version are recorded to the Model Registry through metadata in tags and comments in descriptions.

The MLflow UI can be used to manage stage transition requests manually, but requests and transitions can be automated via MLflow APIs and [webhooks](https://docs.databricks.com/applications/mlflow/model-registry-webhooks.html), for example along the lines of the sketch below.
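The following is a minimal sketch of automating the stage transition with the MLflow client API; the model name and the placeholder compliance check are assumptions, and a real CD job would run its full compliance suite before approving or rejecting:

```
# Minimal sketch: approve or reject a candidate model version via the MLflow client.
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "fraud_detection"  # illustrative registered model name

# Take the most recently registered version (stage=None) as the candidate.
candidate = client.get_latest_versions(model_name, stages=["None"])[0]

checks_passed = True  # placeholder for real compliance checks (tags, documentation, ...)
client.set_model_version_tag(model_name, candidate.version, "compliance_checks", str(checks_passed))

if checks_passed:
    client.transition_model_version_stage(model_name, candidate.version, stage="Staging")
else:
    client.transition_model_version_stage(model_name, candidate.version, stage="Archived")
```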
If the model passes the compliance checks, then\n", + "\n", + "the transition request is approved and the model is promoted to ‘stage=Staging’. If the model fails, the\n", + "\n", + "transition request is rejected and the model is moved to ‘stage=Archived’ in the Model Registry.\n", + "\n", + "**\u0007Compare staging vs. production**\n", + "\n", + "To prevent performance degradation, models promoted to ‘stage=Staging’ must be compared to the\n", + "\n", + "‘stage=Production’ models they are meant to replace. The metric(s) for comparison should be defined\n", + "\n", + "according to the use case, and the method for comparison can vary from canary deployments to A/B\n", + "\n", + "tests. All comparison results are saved to metrics tables in the lakehouse.\n", + "\n", + "If this is the first deployment and there is no ‘stage=Production’ model yet, the ‘stage=Staging’ model\n", + "\n", + "should be compared to a business heuristic or other threshold as a baseline. For a new version\n", + "\n", + "of an existing ‘stage=Production’ model, the ‘stage=Staging’ model is compared with the current\n", + "\n", + "‘stage=Production’ model.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Request model transition to production**\n", + "\n", + "If the candidate model passes the comparison tests, a request is made to transition it to\n", + "\n", + "‘stage=Production’ in the Model Registry. As with other stage transition requests, notifications,\n", + "\n", + "approvals and rejections can be managed manually via the MLflow UI or automatically through APIs and\n", + "\n", + "webhooks. This is also a good time to consider human oversight, as it is the last step before a model is\n", + "\n", + "fully available to downstream applications. A person can manually review the compliance checks and\n", + "\n", + "performance comparisons to perform checks which are difficult to automate.\n", + "\n", + "###### Online serving (REST APIs)\n", + "\n", + "For lower throughput and lower latency use cases, online serving is generally necessary. With MLflow, it is\n", + "\n", + "simple to deploy models to [Databricks Model Serving](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.72shqep1kelf) , cloud provider serving endpoints, or on-prem or\n", + "\n", + "custom serving layers.\n", + "\n", + "In all cases, the serving system loads the production model from the Model Registry upon initialization. On\n", + "\n", + "each request, it fetches features from an online Feature Store, scores the data and returns predictions. The\n", + "\n", + "serving system, data transport layer or the model itself could log requests and predictions.\n", + "\n", + "###### Inference: batch or streaming\n", + "\n", + "This pipeline is responsible for reading the latest data from the Feature Store, loading the model from\n", + "\n", + "‘stage=Production’ in the Model Registry, performing inference and publishing predictions. 
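A minimal sketch of such a batch inference job is shown below, assuming a registered model named `fraud_detection` and illustrative table names; the result type and feature selection would depend on the actual model:

```
# Minimal sketch: batch scoring with the 'stage=Production' model.
import mlflow.pyfunc
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Load the production model from the Model Registry as a Spark UDF.
predict_udf = mlflow.pyfunc.spark_udf(spark, model_uri="models:/fraud_detection/Production")

features = spark.table("prod.customer_features")
feature_cols = [c for c in features.columns if c != "customer_id"]

# Score the latest features and publish predictions to a Lakehouse table.
predictions = features.withColumn("prediction", predict_udf(*feature_cols))
predictions.write.mode("overwrite").saveAsTable("prod.fraud_predictions")
```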
For higher\n", + "\n", + "throughput, higher latency use cases, batch or streaming inference is generally the most cost-effective\n", + "\n", + "option.\n", + "\n", + "A batch job would likely publish predictions to Lakehouse tables, over a JDBC connection, or to flat files.\n", + "\n", + "A streaming job would likely publish predictions either to Lakehouse tables or to message queues like\n", + "\n", + "Apache Kafka.®\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Monitoring\n", + "\n", + "Input data and model predictions are monitored, both for statistical properties (data drift, model\n", + "\n", + "performance, etc.) and for computational performance (errors, throughput, etc.). These metrics are\n", + "\n", + "published for dashboards and alerts.\n", + "\n", + "\u0007 **Data ingestion**\n", + "\n", + "This pipeline reads in logs from batch, streaming or online inference.\n", + "\n", + "**\u0007Check accuracy and data drift**\n", + "\n", + "The pipeline then computes metrics about the input data, the model’s predictions and the infrastructure\n", + "\n", + "performance. Metrics that measure statistical properties are generally chosen by data scientists during\n", + "\n", + "development, whereas metrics for infrastructure are generally chosen by ML engineers.\n", + "\n", + "\u0007 **Publish metrics**\n", + "\n", + "The pipeline writes to Lakehouse tables for analysis and reporting. Tools such as [Databricks SQL](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.nsthucrt9k77) are used\n", + "\n", + "to produce monitoring dashboards, allowing for health checks and diagnostics. The monitoring job or the\n", + "\n", + "dashboarding tool issues notifications when health metrics surpass defined thresholds.\n", + "\n", + "**\u0007Trigger model training**\n", + "\n", + "When the model monitoring metrics indicate performance issues, or when a model inevitably becomes\n", + "\n", + "out of date, the data scientist may need to return to the development environment and develop a new\n", + "\n", + "model version.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Note:** While automated retraining is supported\n", + "\n", + "in this architecture, it isn’t required, and caution\n", + "\n", + "\n", + "###### Retraining\n", + "\n", + "This architecture supports automatic retraining using the same model training pipeline above. While we\n", + "\n", + "recommend beginning with manually triggered retraining, organizations can add scheduled and/or triggered\n", + "\n", + "retraining when needed.\n", + "\n", + "\u0007 **Scheduled**\n", + "\n", + "If fresh data are regularly made available, rerunning model training on a defined schedule can help models\n", + "\n", + "to keep up with changing trends and behavior.\n", + "\n", + "**\u0007Triggered**\n", + "\n", + "If the monitoring pipeline can identify model performance issues and send alerts, it can additionally\n", + "\n", + "trigger retraining. 
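One possible shape for that trigger, sketched here with the Databricks SDK, is shown below; the metrics table, drift column, threshold and job ID are hypothetical and would come from your monitoring pipeline and Jobs configuration.

```python
from databricks.sdk import WorkspaceClient

DRIFT_THRESHOLD = 0.2        # hypothetical alerting threshold
RETRAIN_JOB_ID = 123456789   # hypothetical ID of the existing model training job

# Read the most recent drift metric published by the monitoring pipeline.
latest = (
    spark.read.table("prod.monitoring_metrics")   # hypothetical metrics table
         .orderBy("window_end", ascending=False)
         .first()
)

# If drift exceeds the threshold, kick off the existing training pipeline.
if latest is not None and latest["feature_drift_score"] > DRIFT_THRESHOLD:
    WorkspaceClient().jobs.run_now(job_id=RETRAIN_JOB_ID)
```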
For example, if the distribution of incoming data changes significantly or if the model\n", + "\n", + "performance degrades, automatic retraining and redeployment can boost model performance with\n", + "\n", + "minimal human intervention.\n", + "\n", + "\n", + "must be taken in cases where it is implemented.\n", + "\n", + "\n", + "It is inherently difficult to automate selecting the\n", + "\n", + "correct action to take from model monitoring\n", + "\n", + "\n", + "When the featurization or retraining pipelines themselves begin to exhibit performance issues, the data\n", + "\n", + "scientist may need to return to the dev environment and resume experimentation to address such issues.\n", + "\n", + "\n", + "alerts. For example, if data drift is observed, does\n", + "\n", + "it indicate that we should automatically retrain, or\n", + "\n", + "does it indicate that we should engineer additional\n", + "\n", + "features to encode some new signal in the data?\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 4:**\n", + "## LLMOps – Large Language Model Operations\n", + "\n", + "\n", + "#### Large language models\n", + "\n", + "LLMs have splashed into the mainstream of business and news, and there is no doubt that they will disrupt\n", + "\n", + "countless industries. In addition to bringing great potential, they present a new set of questions for MLOps:\n", + "\n", + "\u0007Is prompt engineering part of operations, and if so, what is needed?\n", + "\n", + "\u0007Since the “large” in “LLM” is an understatement, how do cost/performance trade-offs change?\n", + "\n", + "\u0007Is it better to use paid APIs or to fine-tune one’s own model?\n", + "\n", + "…and many more!\n", + "\n", + "The good news is that “LLMOps” (MLOps for LLMs) is not that different from traditional MLOps. However,\n", + "\n", + "some parts of your MLOps platform and process may require changes, and your team will need to learn a\n", + "\n", + "mental model of how LLMs coexist alongside traditional ML in your operations.\n", + "\n", + "In this section, we will explain what may change for MLOps when introducing LLMs. We will discuss several\n", + "\n", + "key topics in detail, from prompt engineering to packaging, to cost/performance trade-offs. We also provide\n", + "\n", + "a reference architecture diagram to illustrate what may change in your production environment.\n", + "\n", + "###### What changes with LLMs?\n", + "\n", + "For those not familiar with large language models (LLMs), see [this summary](https://www.databricks.com/product/machine-learning/large-language-models) for a quick introduction. The\n", + "\n", + "one-sentence summary is: LLMs are a new class of natural language processing (NLP) models that have\n", + "\n", + "significantly surpassed their predecessors in performance across a variety of tasks, such as open-ended\n", + "\n", + "question answering, summarization and execution of near-arbitrary instructions.\n", + "\n", + "From the perspective of MLOps, LLMs bring new requirements, with implications for MLOps practices and\n", + "\n", + "platforms. 
We briefly summarize key properties of LLMs and the implications for MLOps here, and we delve\n", + "\n", + "into more detail in the next section.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Table 3**\n", + "\n", + "\n", + "\n", + "|KEY PROPERTIES OF LLMS|IMPLICATIONS FOR MLOPS|\n", + "|---|---|\n", + "|LLMs are available in many forms: \u0007Very general proprietary models behind paid APIs \u0007Open source models that vary from general to specific applications \u0007Custom models fine-tuned for specific applications|Development process: Projects often develop incrementally, starting from existing, third-party or open source models and ending with custom fine-tuned models.|\n", + "|Many LLMs take general natural language queries and instructions as input. Those queries can contain carefully engineered “prompts” to elicit the desired responses.|Development process: Designing text templates for querying LLMs is often an important part of developing new LLM pipelines. Packaging ML artifacts: Many LLM pipelines will use existing LLMs or LLM serving endpoints; the ML logic developed for those pipelines may focus on prompt templates, agents or “chains” instead of the model itself. The ML artifacts packaged and promoted to production may frequently be these pipelines, rather than models.|\n", + "|Many LLMs can be given prompts with examples and context, or additional information to help answer the query.|Serving infrastructure: When augmenting LLM queries with context, it is valuable to use previously uncommon tooling such as vector databases to search for relevant context.|\n", + "|LLMs are very large deep learning models, often ranging from gigabytes to hundreds of gigabytes.|Serving infrastructure: Many LLMs may require GPUs for real-time model serving. Cost/performance trade-offs: Since larger models require more computation and are thus more expensive to serve, techniques for reducing model size and computation may be required.|\n", + "|LLMs are hard to evaluate via traditional ML metrics since there is often no single “right” answer.|Human feedback: Since human feedback is essential for evaluating and testing LLMs, it must be incorporated more directly into the MLOps process, both for testing and monitoring and for future fine-tuning.|\n", + "\n", + "\n", + "-----\n", + "\n", + "The list above may look long, but as we will see in the next section, many existing tools and processes\n", + "\n", + "only require small adjustments in order to adapt to these new requirements. Moreover, many aspects\n", + "\n", + "do not change:\n", + "\n", + "\u0007The separation of development, staging and production remains the same\n", + "\n", + "\u0007Git version control and model registries remain the primary conduits for promoting pipelines and\n", + "\n", + "models toward production\n", + "\n", + "\u0007The lakehouse architecture for managing data remains valid and essential for efficiency\n", + "\n", + "\u0007Existing CI/CD infrastructure should not require changes\n", + "\n", + "\u0007The modular structure of MLOps remains the same, with pipelines for data refresh, model tuning,\n", + "\n", + "model inference, etc.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Discussion of key topics for LLMOps\n", + "\n", + "So far, we have listed top potential changes to MLOps as you introduce LLMs. 
In this section, we will dive into\n", + "\n", + "more details about selected topics.\n", + "\n", + "###### Prompt engineering\n", + "\n", + "Prompt engineering is the practice of adjusting the text prompt given to an LLM in order to elicit better\n", + "\n", + "responses — using engineering techniques. It is a very new practice, but some best practices are emerging.\n", + "\n", + "We will cover a few tips and best practices and link to useful resources.\n", + "\n", + "**1** \u0007Prompts and prompt engineering are model-specific. A prompt given to two different models will\n", + "\n", + "generally _not_ produce the same results. Similarly, prompt engineering tips do not apply to all models.\n", + "\n", + "In the extreme case, many LLMs have been fine-tuned for specific NLP tasks and do not even require\n", + "\n", + "prompts. On the other hand, very general LLMs benefit greatly from carefully crafted prompts.\n", + "\n", + "**2** \u0007When approaching prompt engineering, go from simple to complex: track, templatize and automate.\n", + "\n", + "\u0007Start by tracking queries and responses so that you can compare them and iterate to improve\n", + "\n", + "prompts. Existing tools such as MLflow provide tracking capabilities; see [MLflow LLM Tracking](https://mlflow.org/docs/latest/llm-tracking.html) for\n", + "\n", + "more details. Checking structured LLM pipeline code into version control also helps with prompt\n", + "\n", + "development, for git diffs allow you to review changes to prompts over time. Also see the section\n", + "\n", + "below on packaging model and pipelines for more information about tracking prompt versions.\n", + "\n", + "\u0007Then, consider using tools for building prompt templates, especially if your prompts become complex.\n", + "\n", + "Newer LLM-specific tools such as [LangChain](https://python.langchain.com/en/latest/index.html) and [LlamaIndex](https://gpt-index.readthedocs.io/en/latest/) provide such templates and more.\n", + "\n", + "\u0007Finally, consider automating prompt engineering by replacing manual engineering with automated\n", + "\n", + "tuning. Prompt tuning turns prompt development into a data-driven process akin to hyperparameter\n", + "\n", + "tuning for traditional ML. The [Demonstrate-Search-Predict (DSP) Framework](https://github.com/stanfordnlp/dsp) is a good example of a\n", + "\n", + "tool for prompt tuning.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources\n", + "\n", + "There are lots of good resources about\n", + "prompt engineering, especially for popular\n", + "\n", + "models and services:\n", + "\n", + "\u0007DeepLearning.AI course on [ChatGPT](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n", + "\n", + "[Prompt Engineering](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n", + "\n", + "\u0007DAIR.AI [Prompt Engineering Guide](https://www.promptingguide.ai/)\n", + "\n", + "\u0007 [Best practices for prompt engineering](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n", + "\n", + "[with the OpenAI API](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n", + "\n", + "\n", + "**3** \u0007Most prompt engineering tips currently published online are for ChatGPT, due to its immense\n", + "\n", + "popularity. Some of these generalize to other models as well. 
We will provide a few tips here:\n", + "\n", + "\u0007Use clear, specific prompts, which may include an instruction, context (if needed), a user query or\n", + "\n", + "input, and a description of the desired output type or format\n", + "\n", + "\u0007Provide examples in your prompt (“few-shot learning”) to help the LLM to understand what you want\n", + "\n", + "\u0007Tell the model how to behave, such as telling it to admit if it cannot answer a question\n", + "\n", + "\u0007Tell the model to think step-by-step or explain its reasoning\n", + "\n", + "\u0007If your prompt includes user input, use techniques to prevent prompt hacking, such as making it very\n", + "\n", + "clear which parts of the prompt correspond to your instruction vs. user input\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Packaging models or pipelines for deployment\n", + "\n", + "In traditional ML, there are generally two types of ML logic to package for deployment: models and\n", + "\n", + "pipelines. These artifacts are generally managed toward production via a Model Registry and Git version\n", + "\n", + "control, respectively.\n", + "\n", + "With LLMs, it is common to package ML logic in new forms. These may include:\n", + "\n", + "\u0007A lightweight call to an LLM API service (third party or internal)\n", + "\n", + "\u0007A “chain” from LangChain or an analogous pipeline from another tool. The chain may call an LLM API or a\n", + "\n", + "local LLM model.\n", + "\n", + "\u0007An LLM or an LLM+tokenizer pipeline, such as a [Hugging Face](https://huggingface.co/) pipeline. This pipeline may use a\n", + "\n", + "pretrained model or a custom fine-tuned model.\n", + "\n", + "\u0007An engineered prompt, possibly stored as a template in a tool such as LangChain\n", + "\n", + "Though LLMs add new terminology and tools for composing ML logic, all of the above still constitute models\n", + "\n", + "and pipelines. Thus, the same tooling such as [MLflow](https://mlflow.org/) can be used to package LLMs and LLM pipelines for\n", + "\n", + "deployment. [Built-in model flavors](https://mlflow.org/docs/latest/models.html) include:\n", + "\n", + "\u0007PyTorch and TensorFlow\n", + "\n", + "\u0007Hugging Face Transformers (relatedly, see Hugging Face Transformers’s [MLflowCallback](https://huggingface.co/docs/transformers/en/main_classes/callback#transformers.integrations.MLflowCallback) )\n", + "\n", + "\u0007LangChain\n", + "\n", + "\u0007OpenAI API\n", + "\n", + "\u0007(See the [documentation](https://mlflow.org/docs/latest/models.html) for a complete list)\n", + "\n", + "For other LLM pipelines, MLflow can package the pipelines via the [MLflow pyfunc flavor](https://mlflow.org/docs/latest/models.html#python-function-python-function) , which can store\n", + "\n", + "arbitrary Python code.\n", + "\n", + "\n", + "**Note about prompt versioning:** Just as it is helpful\n", + "\n", + "to track model versions, it is helpful to track prompt\n", + "\n", + "versions (and LLM pipeline versions, more generally).\n", + "\n", + "Packaging prompts and pipelines as MLflow Models\n", + "\n", + "simplifies versioning. Just as a newly retrained\n", + "\n", + "model can be tracked as a new model version in the\n", + "\n", + "MLflow Model Registry, a newly updated prompt can\n", + "\n", + "be tracked as a new model version.\n", + "\n", + "**Note about deploying models vs. code:** Your\n", + "\n", + "decisions around packaging ML logic as version\n", + "\n", + "controlled code vs. 
registered models will help\n", + "\n", + "to inform your decision about choosing between\n", + "\n", + "the deploy models, deploy code and hybrid\n", + "\n", + "architectures. Review the subsection below about\n", + "\n", + "human feedback, and make sure that you have a\n", + "\n", + "well-defined testing process for whatever artifacts\n", + "\n", + "you choose to deploy.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Managing cost/performance trade-offs\n", + "\n", + "One of the big Ops topics for LLMs is managing cost/performance trade-offs, especially for inference\n", + "\n", + "and serving. With “small” LLMs having hundreds of millions of parameters and large LLMs having hundreds\n", + "\n", + "of billions of parameters, computation can become a major expense. Thankfully, there are many ways to\n", + "\n", + "manage and reduce costs when needed. We will review some key tips for balancing productivity and costs.\n", + "\n", + "**1** \u0007Start simple, but plan for scaling. When developing a new LLM-powered application, speed of\n", + "\n", + "development is key, so it is acceptable to use more expensive options, such as paid APIs for existing\n", + "\n", + "models. As you go, make sure to collect data such as queries and responses. In the future, you can use\n", + "\n", + "that data to fine-tune a smaller, cheaper model which you can own.\n", + "\n", + "**2** \u0007Scope out your costs. How many queries per second do you expect? Will requests come in bursts?\n", + "\n", + "How much does each query cost? These estimates will inform you about project feasibility and will help\n", + "\n", + "you to decide when to consider bringing the model in-house with open source models and fine-tuning.\n", + "\n", + "**3** \u0007Reduce costs by tweaking LLMs and queries. There are many LLM-specific techniques for reducing\n", + "\n", + "computation and costs. These include shortening queries, tweaking inference configurations and using\n", + "\n", + "smaller versions of models.\n", + "\n", + "**4** \u0007Get human feedback. It is easy to reduce costs but hard to say how changes impact your results,\n", + "\n", + "unless you get human feedback from end users.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources\n", + "\n", + "**Fine-tuning**\n", + "\n", + "\u0007 [Fine-Tuning Large Language Models with](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n", + "\n", + "[Hugging Face and DeepSpeed](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n", + "\n", + "\u0007Webinar: [Build Your Own Large Language](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "\n", + "[Model Like Dolly: How to fine-tune and](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "[deploy your custom LLM](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "\n", + "**Model distillation,**\n", + "**quantization and pruning**\n", + "\n", + "\n", + "###### Methods for reducing costs of inference\n", + "\n", + "**Use a smaller model**\n", + "\n", + "\u0007Pick a different existing model. Try smaller versions of models (such as “t5-small” instead of “t5-base”)\n", + "\n", + "or alternate architectures.\n", + "\n", + "\u0007Fine-tune a custom model. 
With the right training data, a fine-tuned model can often be smaller and/or\n", + "\n", + "perform better than a generic model.\n", + "\n", + "\u0007Use model distillation (or knowledge distillation). This technique “distills” the knowledge of the original\n", + "\n", + "model into a smaller model.\n", + "\n", + "\u0007Reduce floating point precision (quantization). Models can sometimes use lower precision arithmetic\n", + "\n", + "without losing much in quality.\n", + "\n", + "\n", + "\u0007 [Gentle Introduction to 8-bit Matrix](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "\n", + "\n", + "**\u0007Reduce computation for a given model**\n", + "\n", + "\u0007Shorten queries and responses. Computation scales with input and output sizes, so using more concise\n", + "\n", + "queries and responses reduces costs.\n", + "\n", + "\u0007Tweak inference configurations. Some types of inference, such as beam search, require more computation.\n", + "\n", + "**Other**\n", + "\n", + "\u0007Split traffic. If your return on investment (ROI) for an LLM query is low, then consider splitting traffic so that\n", + "\n", + "low ROI queries are handled by simpler, faster models or methods. Save LLM queries for high ROI traffic.\n", + "\n", + "\u0007Use pruning techniques. If you are training your own LLMs, there are pruning techniques that allow\n", + "\n", + "models to use sparse computation during inference. This reduces computation for most or all queries.\n", + "\n", + "\n", + "[Multiplication for transformers at scale](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "[using Hugging Face Transformers,](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "[Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "\n", + "\u0007 [Large Transformer Model Inference](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n", + "\n", + "[Optimization](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n", + "\n", + "\u0007 [Making LLMs even more accessible with](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n", + "\n", + "[bitsandbytes, 4-bit quantization and](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n", + "[QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Human feedback, testing, and monitoring\n", + "\n", + "While human feedback is important in many traditional ML applications, it becomes much more important\n", + "\n", + "for LLMs. Since most LLMs output natural language, it is very difficult to evaluate the outputs via traditional\n", + "\n", + "metrics. For example, suppose an LLM were used to summarize a news article. Two equally good summaries\n", + "\n", + "might have almost completely different words and word orders, so even defining a “ground-truth” label\n", + "\n", + "becomes difficult or impossible.\n", + "\n", + "Humans — ideally your end users — become essential for validating LLM output. While you can pay human\n", + "\n", + "labelers to compare or rate model outputs, the best practice for user-facing applications is to build human\n", + "\n", + "feedback into the applications from the outset. 
For example, a tech support chatbot may have a “click here\n", + "\n", + "to chat with a human” option, which provides implicit feedback indicating whether the chatbot’s responses\n", + "\n", + "were helpful.\n", + "\n", + "In terms of operations, not much changes from traditional MLOps:\n", + "\n", + "\u0007 **Data:** Human feedback is simply data, and it should be treated like any other data. Store it in your\n", + "\n", + "lakehouse, and process it using the same data pipeline tooling as other data.\n", + "\n", + "\u0007 **Testing and monitoring:** A/B testing and incremental rollouts of new models and pipelines may become\n", + "\n", + "more important, superceding offline quality tests. If you can collect user feedback, then these rollout\n", + "\n", + "methods can validate models before they are fully deployed.\n", + "\n", + "\u0007 **Fine-tuning:** Human feedback becomes especially important for LLMs when it can be incorporated into\n", + "\n", + "fine-tuning models via techniques like Reinforcement Learning from Human Feedback (RLHF). Even if you\n", + "\n", + "start with an existing or generic model, you can eventually customize it for your purposes via fine-tuning.\n", + "\n", + "\n", + "###### Resources\n", + "\n", + "**Reinforcement Learning from**\n", + "**Human Feedback (RLHF)**\n", + "\n", + "\u0007Chip Huyen blog post on\n", + "\n", + "[“RLHF: Reinforcement Learning from](https://huyenchip.com/2023/05/02/rlhf.html)\n", + "\n", + "[Human Feedback”](https://huyenchip.com/2023/05/02/rlhf.html)\n", + "\n", + "\u0007Hugging Face blog post on\n", + "\n", + "[“Illustrating Reinforcement Learning from](https://huggingface.co/blog/rlhf)\n", + "\n", + "[Human Feedback (RLHF)”](https://huggingface.co/blog/rlhf)\n", + "\n", + "\u0007 [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Other topics\n", + "\n", + "\u0007 **Scaling out:** Practices around scaling out training, fine-tuning and inference are similar to traditional ML,\n", + "\n", + "but some of your tools may change. Tools like [Apache Spark](https://spark.apache.org/) [™](https://spark.apache.org/) and [Delta Lake](https://delta.io/) remain general enough for\n", + "\n", + "your LLM data pipelines and for batch and streaming inference, and they may be helpful for distributing\n", + "\n", + "fine-tuning. To handle LLM fine-tuning and training, you may need to adopt some new tools such as\n", + "\n", + "[distributed PyTorch](https://pytorch.org/tutorials/beginner/dist_overview.html) , [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training) , and [DeepSpeed](https://www.deepspeed.ai/) .\n", + "\n", + "\u0007 **[Model serving:](https://www.databricks.com/product/model-serving)** If you manage the serving system for your LLMs, then you may need to make\n", + "\n", + "adjustments to handle larger models. While serving with CPUs can work for smaller deep learning\n", + "\n", + "models, most LLMs will benefit from or require GPUs for serving and inference.\n", + "\n", + "\u0007 **Vector databases:** Some but not all LLM applications require vector databases for efficient similarity-\n", + "\n", + "based lookups of documents or other data. Vector databases may be an important addition to your\n", + "\n", + "serving infrastructure. 
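As a minimal sketch of such a lookup, assuming the databricks-vectorsearch client and an already-populated index (the endpoint, index and column names are hypothetical):

```python
from databricks.vector_search.client import VectorSearchClient

vsc = VectorSearchClient()
index = vsc.get_index(
    endpoint_name="docs_endpoint",              # hypothetical Vector Search endpoint
    index_name="main.docs.chunked_docs_index",  # hypothetical, already-populated index
)

# Retrieve the chunks most similar to the user's question to use as prompt context.
results = index.similarity_search(
    query_text="How do I configure model serving?",
    columns=["chunk_id", "chunk_text"],
    num_results=3,
)
context = "\n\n".join(row[1] for row in results["result"]["data_array"])
```

The retrieved chunks can then be inserted into the prompt that is sent to the LLM.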
Operationally, the vector database is analogous to a feature store: it is a specialized tool for storing preprocessed data which can be queried by inference jobs or model serving systems.

-----

#### Reference architecture

To illustrate potential adjustments to your reference architecture from traditional MLOps, we provide a modified version of the previous production architecture.

**Figure 7:** Production environment reference architecture — Model Registry (Stage: None → Staging → Production), Model Serving (LLM API request), Fine-Tune LLM, Vector Database Update, Continuous Deployment (CD), and Monitoring & Evaluation jobs, backed by an internal/external model hub, data tables, a vector database, metrics tables and human feedback.

-----

###### Additional resources

With LLMs being such a novel field, we link to several LLM resources below, which are not necessarily “LLMOps” but may prove useful to you.

\u0007 [edX: Professional Certificate in Large Language Models](https://www.edx.org/professional-certificate/databricks-large-language-models)

\u0007Chip Huyen blog post on [“Building LLM applications for production”](https://huyenchip.com/2023/04/11/llm-engineering.html)

LLM lists and leaderboards

\u0007 [LMSYS Leaderboard](https://chat.lmsys.org/?leaderboard)

\u0007 [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)

\u0007 [Stanford Center for Research on Foundation Models](https://crfm.stanford.edu/)

\u0007 [Ecosystem graphs](https://crfm.stanford.edu/ecosystem-graphs/index.html)

\u0007 [HELM](https://crfm.stanford.edu/helm/latest/?)

\u0007Blog post on [“Open Source ChatGPT](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)

The primary changes to this production architecture are:

\u0007 **Internal/External Model Hub:** Since LLM applications often make use of existing, pretrained models, an internal or external model hub becomes a valuable part of the infrastructure. It appears here in production to illustrate using an existing base model that is then fine-tuned in production. Without fine-tuning, this hub would mainly be used in development.

\u0007 **Fine-Tune LLM:** Instead of de novo Model Training, LLM applications will generally fine-tune an existing model (or use an existing model without any tuning). 
Fine-tuning is a lighter-weight process than training,\n", + "\n", + "but it is similar operationally.\n", + "\n", + "\u0007 **Vector Database:** Some (but not all) LLM applications use vector databases for fast similarity searches,\n", + "\n", + "most often to provide context or domain knowledge in LLM queries. We replaced the Feature Store (and\n", + "\n", + "its Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n", + "\n", + "that these data stores and jobs are analogous in terms of operations.\n", + "\n", + "\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n", + "\n", + "API calls, such as to internal or third-party LLM APIs. Operationally, this adds complexity in terms of\n", + "\n", + "potential latency or flakiness from third-party APIs, as well as another layer of credential management.\n", + "\n", + "\u0007 **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML\n", + "\n", + "but become essential in most LLM applications. Human feedback should be managed like other data,\n", + "\n", + "ideally incorporated into monitoring based on near real-time streaming.\n", + "\n", + "\n", + "[Alternatives”](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Looking ahead\n", + "\n", + "LLMs only became mainstream in late 2022, and countless libraries and technologies are being built to\n", + "\n", + "support and leverage LLM use cases. You should expect rapid changes. More powerful LLMs will be open-\n", + "\n", + "sourced; tools and techniques for customizing LLMs and LLM pipelines will become more plentiful and\n", + "\n", + "flexible; and an explosion of techniques and ideas will gradually coalesce into more standardized practices.\n", + "\n", + "While this technological leap provides us all with great opportunities, the use of cutting-edge technologies\n", + "\n", + "requires extra care in LLMOps to build and maintain stable, reliable LLM-powered applications. The good\n", + "\n", + "news is that much of your existing MLOps tooling, practices and knowledge will transfer smoothly over to\n", + "\n", + "LLMs. With the additional tips and practices mentioned in this section, you should be well set up to harness\n", + "\n", + "the power of large language models.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "9,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast and over 50% of the Fortune 500 — rely\n", + "\n", + "on the Databricks Lakehouse Platform to unify their\n", + "\n", + "data, analytics and AI. Databricks is headquartered\n", + "\n", + "in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark ™ ,\n", + "\n", + "Delta Lake and MLflow, Databricks is on a mission\n", + "\n", + "to help data teams solve the world’s toughest\n", + "\n", + "problems. 
To learn more, follow Databricks on [Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/).

**[Sign up for a free trial](https://databricks.com/try-databricks)**

-----
-----\n", + "\n", + "# TABLE OF CONTENTS\n", + "\n", + "\n", + "##### Welcome to Data, Analytics and AI ....... 02\n", + "\n", + "**Do you know what you’re getting into?** ............................................ **02**\n", + "\n", + "**How to use this book** \b��������������������������������������������������������������������������������������� **02**\n", + "\n", + "##### Business Value .......................................................................... 03\n", + "\n", + "**Talking to the business (feels like combat)** \b����������������������������� **03**\n", + "\n", + "**Creating Value Alignment** \b������������������������������������������������������������������ **03**\n", + "\n", + "**Goals and Outcomes** \b���������������������������������������������������������������������������� **04**\n", + "\n", + "##### Ultimate Class Build Guide .................................. 04\n", + "\n", + "**Creating a character** \b����������������������������������������������������������������������������� **04**\n", + "\n", + "- Data Engineers \b������������������������������������������������������������������������������������� **04**\n", + "\n", + "- Data Scientists \b������������������������������������������������������������������������������������� **05**\n", + "\n", + "- Data Analysts \b���������������������������������������������������������������������������������������� **05**\n", + "\n", + "##### Diving In ............................................................................................... 05\n", + "\n", + "**Producing game data** \b���������������������������������������������������������������������������� **05**\n", + "\n", + "**And receiving it in cloud** \b��������������������������������������������������������������������� **08**\n", + "\n", + "**Getting data from your game to the cloud** \b������������������������������ **08**\n", + "\n", + "##### The Value of Data Throughout the Game Developer Lifecycle ................................... 09\n", + "\n", + "**Lifecycle overview** \b���������������������������������������������������������������������������������� **09**\n", + "\n", + "**Use data to develop a next-generation**\n", + "\n", + "**customer experience** \b��������������������������������������������������������������������������� **09**\n", + "\n", + "##### Getting Started with Gaming Use Cases .............................................................. 10\n", + "\n", + "**Where do I start? 
Start with Game Analytics** \b������������������������� **10**\n", + "\n", + "**Understand your audience** \b���������������������������������������������������������������������������� **11**\n", + "\n", + "- Player Segmentation \b���������������������������������������������������������������������������� **11**\n", + "\n", + "- Player Lifetime Value \b��������������������������������������������������������������������������� **12**\n", + "\n", + "- Social Media Monitoring \b�������������������������������������������������������������������� **12**\n", + "\n", + "- Player Feedback Analysis \b����������������������������������������������������������������� **13**\n", + "\n", + "- Toxicity Detection \b��������������������������������������������������������������������������������� **13**\n", + "\n", + "**Find your audience** \b���������������������������������������������������������������������������������� **14**\n", + "\n", + "\n", + "**Activating Your Playerbase** \b������������������������������������������������������������������������� **15**\n", + "\n", + "- Player Recommendations \b����������������������������������������������������������������� **15**\n", + "\n", + "- Next Best Offer/Action \b����������������������������������������������������������������������� **15**\n", + "\n", + "- Churn Prediction & Prevention \b������������������������������������������������������� **16**\n", + "\n", + "- Real-time Ad Targeting \b����������������������������������������������������������������������� **16**\n", + "\n", + "**Operational Use Cases** \b�������������������������������������������������������������������������� **17**\n", + "\n", + "- Anomaly Detection \b������������������������������������������������������������������������������ **17**\n", + "\n", + "- Build Pipeline \b������������������������������������������������������������������������������������������� **17**\n", + "\n", + "- Crash Analytics \b�������������������������������������������������������������������������������������� **18**\n", + "\n", + "##### Things to Look Forward To ..................................... 19\n", + "\n", + " Appendix .............................................................................................. 21\n", + "\n", + "**Ultimate Class Build Guide** \b������������������������������������������������������������������ **21**\n", + "\n", + "- Creating a Character \b��������������������������������������������������������������������������� **21**\n", + "\n", + "- Data Engineers \b���������������������������������������������������������������������������� **21**\n", + "\n", + "- Data Scientists \b���������������������������������������������������������������������������� **21**\n", + "\n", + "- Data Analysts \b������������������������������������������������������������������������������ **22**\n", + "\n", + "**Data Access and the Major Cloud Providers** ................................ 
**23**\n", + "\n", + "- Cloud Rosetta Stone \b�������������������������������������������������������������������������� **23**\n", + "\n", + "- Jargon Glossary \b������������������������������������������������������������������������������������ **23**\n", + "\n", + "- Getting started with the major cloud providers \b������������������� **23**\n", + "\n", + "**Getting Started with Detailed Use Cases** \b���������������������������������� **25**\n", + "\n", + "- Game analytics \b������������������������������������������������������������������������������������� **25**\n", + "\n", + "- Player Segmentation \b�������������������������������������������������������������������������� **25**\n", + "\n", + "- Player Lifetime Value \b�������������������������������������������������������������������������� **26**\n", + "\n", + "- Social Media Monitoring \b������������������������������������������������������������������� **28**\n", + "\n", + "- Player Feedback Analysis \b���������������������������������������������������������������� **29**\n", + "\n", + "- Toxicity Detection \b������������������������������������������������������������������������������� **30**\n", + "\n", + "- Multi-Touch Attribution and Media Mix Modeling \b���������������� **31**\n", + "\n", + "- Player Recommendations \b���������������������������������������������������������������� **32**\n", + "\n", + "- Next Best Offer/Action \b���������������������������������������������������������������������� **33**\n", + "\n", + "- Churn Prediction & Prevention \b����������������������������������������������������� **34**\n", + "\n", + "- Real-time Ad Targeting \b���������������������������������������������������������������������� **35**\n", + "\n", + "**Getting Started with Operational Use Cases** \b�������������������������� **36**\n", + "\n", + "- Anomaly Detection \b����������������������������������������������������������������������������� **36**\n", + "\n", + "- Build Pipeline \b����������������������������������������������������������������������������������������������������� **37**\n", + "\n", + "- Crash Analytics \b������������������������������������������������������������������������������������� **39**\n", + "\n", + "\n", + "Multi-Touch Attribution \b��������������������������������������������������������������������� **14**\n", + "\n", + "\n", + "-----\n", + "\n", + "# Welcome to Data, Analytics, and AI\n", + "\n", + "\n", + "### Do you know what you’re getting into?\n", + "\n", + "You may have heard the stories of game studios spending\n", + "\n", + "countless hours trying to more effectively acquire, engage,\n", + "\n", + "and retain players. Well, did you know that data, analytics,\n", + "\n", + "and AI plays a central role in the development and operation\n", + "\n", + "of today’s top-grossing video games? Studios globally\n", + "\n", + "struggle with fragmented views of their audience, with data\n", + "\n", + "often outpacing legacy technologies. Today, the need for real-\n", + "\n", + "time capabilities and the leap from descriptive to predictive\n", + "\n", + "analytics has made it so that data, analytics, and AI are no\n", + "\n", + "longer a “nice-to-have”, but table stakes for success.\n", + "\n", + "The objective of this handbook is to guide you on the\n", + "\n", + "role data, analytics, and AI plays in the development\n", + "\n", + "and operations of video games. We’ll cover who the key\n", + "\n", + "stakeholders are and how to align people across business\n", + "\n", + "units. 
Then we’ll talk through strategies to help you\n", + "\n", + "successfully advocate for data, analytics, and AI projects\n", + "\n", + "internally. Finally, we dive deep through the most common\n", + "\n", + "use cases. We want to give you enough information to feel\n", + "\n", + "\n", + "well as helpful tips when operating as or working with one of\n", + "\n", + "these classes.\n", + "\n", + "We follow this with the fundamentals for building a Proof\n", + "\n", + "of Concept (POC) or Minimum Viable Product (MVP). That\n", + "\n", + "is, connecting to the cloud; accessing your data; and\n", + "\n", + "most importantly, being able to represent the value you’re\n", + "\n", + "seeking to unlock as you sell your project into your team and\n", + "\n", + "broader organization.\n", + "\n", + "Finally, we’ll dive into the most common use cases for data,\n", + "\n", + "analytics, and AI within game development. Similar to a tech-\n", + "\n", + "tree in a video game, we begin with the most basic use cases\n", + "\n", + "- setting up your game analytics. Then we progress through\n", + "\n", + "more advanced data use cases such as player segmentation,\n", + "\n", + "assessing lifetime value, detecting and mitigating toxicity,\n", + "\n", + "multi-touch attribution, recommendation engines, player\n", + "\n", + "churn prediction and prevention, and more.\n", + "\n", + "Don’t forget to review the Appendix. You’ll find a handy\n", + "\n", + "“ Jargon Glossary ”, “ Cloud Rosetta Stone ”, and “ get started\n", + "\n", + "guide for the three major cloud providers ”. All incredibly\n", + "\n", + "helpful assets to keep as hotkeys.\n", + "\n", + "\n", + "empowered to make a demonstrable impact. Just by reading\n", + "\n", + "this you are adding incredible insight and value to yourself as\n", + "\n", + "\n", + "an industry professional. Quest on!\n", + "\n", + "### How to use this book\n", + "\n", + "This book is primarily intended for technical professionals\n", + "\n", + "who are engaging with data within game studios. 
No\n", + "\n", + "matter your role in the gaming industry, you will be able to\n", + "\n", + "glean key takeaways that will make you more effective in\n", + "\n", + "your individual role and within the larger team — be that\n", + "\n", + "production, art, engineering, marketing, or otherwise.\n", + "\n", + "Begin your journey by reviewing the “ **Data, Analytics, and AI**\n", + "\n", + "**Ground Rules** ” section to the right, which presents some This\n", + "\n", + "section presents some rules and guidelines for interpreting\n", + "\n", + "the role that data plays in the game development lifecycle.\n", + "\n", + "Next, it’s time to learn about the key professions (aka\n", + "\n", + "character classes) that interact and engage with data,\n", + "\n", + "analytics, and AI on a consistent basis within a game studio.\n", + "\n", + "This section breaks down each of the classes, providing an\n", + "\n", + "\n", + "**Data, Analytics, and AI Ground Rules**\n", + "\n", + "This guide assumes you understand the following:\n", + "\n", + "- You understand the basics of data, analytics, and AI:\n", + "\n", + "How and why data is stored in a system, why data\n", + "\n", + "is transformed, the different types of output that\n", + "\n", + "data can feed into — such as a report, an analysis\n", + "\n", + "answering a question, or a machine learning model.\n", + "\n", + "If this is the first time you’re creating a character,\n", + "\n", + "we highly recommend reviewing our data, analytics,\n", + "\n", + "and AI tutorial — aka getting started training and\n", + "\n", + "documentation, available at [dbricks.co/training](https://www.databricks.com/learn/training/home)\n", + "\n", + "- You have a basic understanding of cloud\n", + "\n", + "infrastructure. Specifically what it is, who are the\n", + "\n", + "key players, and associated terms (e.g., virtual\n", + "\n", + "machines, APIs, applications)\n", + "\n", + "- You are generally aware of the game development\n", + "\n", + "lifecycle; pre-production, production, testing/QA,\n", + "\n", + "launch, operation\n", + "\n", + "\n", + "overview of each character’s strengths and weaknesses as\n", + "\n", + "\n", + "-----\n", + "\n", + "# Business Value\n", + "\n", + "\n", + "Demonstrating business value is important when working\n", + "\n", + "on data, analytics, and AI projects because it helps ensure\n", + "\n", + "that the efforts of the project are aligned with the goals\n", + "\n", + "and objectives of the business. By showing how the project\n", + "\n", + "can positively impact a game’s key performance indicators\n", + "\n", + "(KPIs) and bottom-line metrics, such as game revenue, player\n", + "\n", + "satisfaction, and operational efficiency, studio stakeholders\n", + "\n", + "are more likely to support and invest in the project.\n", + "\n", + "Additionally, demonstrating business value can help justify\n", + "\n", + "the resources, time, and money that are required to execute\n", + "\n", + "the project, and can also help prioritize which projects should\n", + "\n", + "be pursued. 
By focusing on business value, data, analytics,\n", + "\n", + "and AI projects can become strategic initiatives that\n", + "\n", + "contribute to the long-term success of your game studio.\n", + "\n", + "### Talking to the business (feels like combat)\n", + "\n", + "While we highly encourage everyone to read this section,\n", + "\n", + "you may already feel confident understanding the needs and\n", + "\n", + "concerns of your internal stakeholders, and how to sell-in a\n", + "\n", + "project successfully. If so, feel free to skip this section.\n", + "\n", + "We would love to dive into the data to explore and discover\n", + "\n", + "as much as possible, unfortunately in most environments,\n", + "\n", + "we are limited by resources and time. Understanding both\n", + "\n", + "the businesses pain points and strategic goals is crucial to\n", + "\n", + "choosing projects that will benefit the business, create value\n", + "\n", + "and make your message much easier to sell.\n", + "\n", + "Whenever we embark on a proof-of-concept (PoC) or\n", + "\n", + "minimum viable product (MVP) — to prove out a new\n", + "\n", + "**Questions to ask:**\n", + "\n", + "- What other strategic goals and pain points can\n", + "\n", + "you list out and how would you prioritize them as\n", + "\n", + "a business leader?\n", + "\n", + "- Does your prioritization match how your team,\n", + "\n", + "manager and/or leadership would prioritize?\n", + "\n", + "Typically the closer the match, the easier initial\n", + "\n", + "projects will be to “sell”.\n", + "\n", + "\n", + "methodology or technology — we will need to pitch it back\n", + "\n", + "for adoption. The technology could be revolutionary and\n", + "\n", + "absolutely amazing, but without the value proposition and tie\n", + "\n", + "back to goals, it is likely to land flat or fail to be adopted.\n", + "\n", + "It is key to talk to your stakeholders to understand their\n", + "\n", + "perception of pain points and positions on potential projects\n", + "\n", + "to add value. Much like stopping at the Tavern when the\n", + "\n", + "adventuring party gets to town, these can be informal\n", + "\n", + "conversations where you socialize potential solutions while\n", + "\n", + "gathering information about what matters.\n", + "\n", + "### Creating value alignment\n", + "\n", + "So what are your strategic goals and pain points and how\n", + "\n", + "might they be addressed through a use case from a PoC or\n", + "\n", + "MVP leveraging your data?\n", + "\n", + "A few examples of strategic goals that are top of mind for our\n", + "\n", + "customers at the beginning of any fiscal or calendar year:\n", + "\n", + "- Reduce costs\n", + "\n", + "- Simplify your infrastructure\n", + "\n", + "- Acquire more players\n", + "\n", + "- Monetize your playerbase\n", + "\n", + "- Retain your players (aka prevent churn)\n", + "\n", + "Here are four ways the Databricks Lakehouse can provide\n", + "\n", + "value that aligns with your strategic goals and pain points:\n", + "\n", + "`1.` **\u0007Improved collaboration:** Databricks platform allows\n", + "\n", + "everyone to share and collaborate on data, notebooks and\n", + "\n", + "models between data scientists, engineers and business\n", + "\n", + "users. 
This enables for a more efficient and streamlined\n", + "\n", + "process for data analysis and decision making.\n", + "\n", + "`2.` **Find and explore your data:** The data in the Lakehouse is\n", + "\n", + "cataloged and accessible, which enables business users\n", + "\n", + "to explore and query the data easily and discover insights\n", + "\n", + "by themselves.\n", + "\n", + "`3.` **\u0007Uncover actionable business insights:** By putting\n", + "\n", + "your game’s data into a Lakehouse architecture, it\n", + "\n", + "can be better analyzed using various tools provided\n", + "\n", + "by Databricks such as SQL, dashboards, notebooks,\n", + "\n", + "visualization and machine learning to better understand\n", + "\n", + "your playerbase, providing valuable insights into player\n", + "\n", + "behavior and performance. These insights can help the\n", + "\n", + "\n", + "-----\n", + "\n", + "and retention, and use that information to improve the\n", + "\n", + "game and grow monetization.\n", + "\n", + "`4.` **\u0007Lead with data-driven decisions:** A Lakehouse\n", + "\n", + "architecture provides a single source of truth for your\n", + "\n", + "organization’s data. Data engineers write once, data\n", + "\n", + "analysts interpret the data, and data scientists can run\n", + "\n", + "machine machine learning models on the same data.\n", + "\n", + "_This cannot be understated in the value this provides an_\n", + "\n", + "_organization from a total cost of ownership perspective._\n", + "\n", + "With the ability to access and analyze all the data in one\n", + "\n", + "place, the business can make unified data-driven decisions,\n", + "\n", + "rather than relying on intuition or fragmented data.\n", + "\n", + "### Goals and outcomes\n", + "\n", + "Like many projects, starting with a strong foundation of ‘what\n", + "\n", + "success looks like’ will significantly improve your likelihood\n", + "\n", + "of achieving your objectives. Here are a few best-practices\n", + "\n", + "we recommend:\n", + "\n", + "`1.` **Set goals:** Define your hypothesis, then use your data\n", + "\n", + "and process to prove or disprove your hypothesis. You\n", + "\n", + "have a goal in mind, make it part of the experiment. If\n", + "\n", + "the outcome differs from the expectation, that is part of\n", + "\n", + "experiments and we can learn from it to improve the next\n", + "\n", + "experiment. This is all about shortening the feedback loop\n", + "\n", + "\n", + "project appropriately. For example, are you doing this as\n", + "\n", + "a side project? Do you have 2 sprints to show progress?\n", + "\n", + "It’s important to scope your project based on the time,\n", + "\n", + "resources, and quality needed for the said project to be a\n", + "\n", + "success.\n", + "\n", + "`3.` **Scope down:** Ruthlessly control scope for any PoC or\n", + "\n", + "MVP. Prioritization is your best friend. Stakeholders and\n", + "\n", + "your own internal team will naturally want to increase\n", + "\n", + "scope because there’s no shortage of good ideas. But by\n", + "\n", + "controlling scope, you improve your chances of shipping\n", + "\n", + "on time and on budget. Don’t let perfection be the enemy\n", + "\n", + "of good. There are always exceptions to this, but that is\n", + "\n", + "what the next sprint is for.\n", + "\n", + "`4.` **Deliver on time:** Recovering lost goodwill is incredibly\n", + "\n", + "difficult - strive to always deliver on time. 
Make sure your\n", + "\n", + "goals, constraints and scope creep will not explode your\n", + "\n", + "timeline as creating tight feedback loops and iteration\n", + "\n", + "cycles is what will make you more agile than the competition.\n", + "\n", + "`5.` **Socialize early, and often:** Show quantifiable value as\n", + "\n", + "quickly as possible, both to your immediate team and\n", + "\n", + "business stakeholders. Measure the value as frequently\n", + "\n", + "as makes sense, and socialize early and often to promote\n", + "\n", + "visibility of the project and ensure tight alignment across\n", + "\n", + "teams. This will empower you to create tighter feedback\n", + "\n", + "loops that will help improve any future iterations of your\n", + "\n", + "product, platform, or technology.\n", + "\n", + "\n", + "between insight and action.\n", + "\n", + "# Ultimate Class Build Guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "Have you rolled your character already? Data engineers, data\n", + "\n", + "scientists, and data analysts form the heart of mature game\n", + "\n", + "data teams. Though, depending on studio size and resources,\n", + "\n", + "\n", + "making sense of large amounts of data. Depending on the size\n", + "\n", + "of the organization, individuals may be required to multiclass\n", + "\n", + "in order to address needs of the team. In smaller studios, it’s\n", + "\n", + "often developers who wear multiple hats, including those in data\n", + "\n", + "engineering, analytics and data science. Key characters include:\n", + "\n", + "\n", + "game developers may also be pulled in from time to time to\n", + "\n", + "\n", + "perform data engineering and or data science tasks. Though for\n", + "\n", + "the sake of this guide, we’ll keep focus on roles of data engineers,\n", + "\n", + "data scientists, and data analysts. There are many aspects to\n", + "\n", + "these roles, but they can be summarized in that Data Engineers\n", + "\n", + "create and maintain critical data workflows, Data Analysts\n", + "\n", + "interpret data and create reports that keep the business teams\n", + "\n", + "running seamlessly, and Data Scientists are responsible for\n", + "\n", + "\n", + "**Data Engineers**\n", + "\n", + "Data engineers build systems that collect, manage, and\n", + "\n", + "convert source data into usable information for data\n", + "\n", + "scientists and business analysts to interpret. Their ultimate\n", + "\n", + "goal is to make data accessible so that teams can use it to\n", + "\n", + "evaluate and optimize a goal or objective.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data scientists determine the questions their team should\n", + "\n", + "be asking and figure out how to answer those questions\n", + "\n", + "using data. 
They often develop predictive models for\n", + "\n", + "theorizing and forecasting.\n", + "\n", + "**Data Analysts**\n", + "\n", + "\n", + "to report on the health of a title or building a recommendation\n", + "\n", + "engine for your players, this guide will help you better\n", + "\n", + "understand the unique classes required to develop and\n", + "\n", + "maintain an effective data, analytics, and AI platform.\n", + "\n", + "**Learn more about these character classes**\n", + "\n", + "\n", + "A data analyst reviews data to identify key insights into a\n", + "\n", + "game studio’s customers and ways the data can be used to\n", + "\n", + "solve problems.\n", + "\n", + "# Diving In\n", + "\n", + "\n", + "Before we get to the primary use cases of game data,\n", + "\n", + "analytics, and AI, we need to cover some basics. That is, the\n", + "\n", + "different types of game data and how they are produced.\n", + "\n", + "And the subsequent receiving of that data in the cloud to\n", + "\n", + "\n", + "### Producing game data…\n", + "\n", + "Speaking in generalities, there are four buckets of data as it\n", + "\n", + "relates to your video game.\n", + "\n", + "\n", + "collect, clean, and prepare for analysis.\n", + "\n", + "**1. Game Telemetry**\n", + "\n", + "Game telemetry refers to the data collected about player\n", + "\n", + "behavior and interactions within a video game. The primary\n", + "\n", + "data source is the game engine. And the goal of game\n", + "\n", + "telemetry is to gather information that can help game\n", + "\n", + "developers understand player behavior and improve the\n", + "\n", + "overall game experience.\n", + "\n", + "Some of the primary metrics that are typically tracked in\n", + "\n", + "game telemetry include:\n", + "\n", + "- **Player engagement:** Track the amount of time players\n", + "\n", + "spend playing the game, and their level of engagement\n", + "\n", + "with different parts of the game.\n", + "\n", + "- **Game progress:** Monitor player progress through\n", + "\n", + "different levels and milestones in the game.\n", + "\n", + "- **In-game purchases:** Track the number and value of\n", + "\n", + "in-game purchases made by players.\n", + "\n", + "- **Player demographics:** Collect demographic information\n", + "\n", + "about players, such as age, gender, location, and device type.\n", + "\n", + "- **Session length:** Monitor the length of each player session,\n", + "\n", + "and how often players return to the game.\n", + "\n", + "- **Retention:** Track the percentage of players who return to\n", + "\n", + "the game after their first session.\n", + "\n", + "\n", + "-----\n", + "\n", + "such as the types of actions taken, the number of deaths,\n", + "\n", + "and the use of power-ups.\n", + "\n", + "- **User Acquisition:** Track the number of new players\n", + "\n", + "acquired through different marketing channels.\n", + "\n", + "**2. Business KPIs**\n", + "\n", + "The second bucket of data is business key performance\n", + "\n", + "indicators (or KPIs). Business KPIs are metrics that measure\n", + "\n", + "the performance and success of a video game from a\n", + "\n", + "business perspective. 
The primary data source for business\n", + "\n", + "KPIs include game telemetry, stores, and marketplaces.\n", + "\n", + "These KPIs help game studios understand the financial and\n", + "\n", + "operational performance of their games and make informed\n", + "\n", + "decisions about future development and growth.\n", + "\n", + "Some of the primary business metrics that are typically\n", + "\n", + "tracked include:\n", + "\n", + "- **Revenue:** Track the total revenue generated by the game,\n", + "\n", + "including sales of the game itself, in-game purchases,\n", + "\n", + "and advertising.\n", + "\n", + "- **Player Acquisition Cost (CAC):** Calculate the cost\n", + "\n", + "of acquiring a new player, including marketing and\n", + "\n", + "advertising expenses.\n", + "\n", + "- **Lifetime Value (LTV):** Estimate the amount of revenue a\n", + "\n", + "player will generate over the course of their time playing\n", + "\n", + "the game.\n", + "\n", + "- **Player Retention:** Track the percentage of players who\n", + "\n", + "continue to play the game over time, and how long they\n", + "\n", + "play for.\n", + "\n", + "- **Engagement:** Measure the level of engagement of players\n", + "\n", + "with the game, such as the number of sessions played,\n", + "\n", + "time spent playing, and in-game actions taken.\n", + "\n", + "- **User Acquisition:** Track the number of new players\n", + "\n", + "acquired through different marketing channels and the\n", + "\n", + "cost of acquiring each player.\n", + "\n", + "- **Conversion Rate:** Measure the percentage of players who\n", + "\n", + "make an in-game purchase or complete a specific action.\n", + "\n", + "- **Gross Margin:** Calculate the profit generated by the game\n", + "\n", + "after subtracting the cost of goods sold, such as the cost\n", + "\n", + "of game development and server hosting.\n", + "\n", + "**3. Game Services**\n", + "\n", + "Similar to game telemetry, game services provide critical\n", + "\n", + "infrastructure that requires careful monitoring and management.\n", + "\n", + "These services include things like game server hosting,\n", + "\n", + "\n", + "and more. 
Here the source of data is the game services used.\n", + "\n", + "Some of the common metrics game teams typically track for\n", + "\n", + "these services include:\n", + "\n", + "- **Concurrent Players:** Track the number of players who are\n", + "\n", + "simultaneously connected to the game servers to ensure\n", + "\n", + "that the servers have enough capacity to handle the\n", + "\n", + "player demand.\n", + "\n", + "- **Server Availability:** Monitor the uptime and downtime of\n", + "\n", + "the game servers to ensure that players have access to\n", + "\n", + "the game when they want to play, particularly important\n", + "\n", + "for global live service games where demand fluctuates\n", + "\n", + "throught the day.\n", + "\n", + "- **Latency:** Measure the time it takes for data to travel\n", + "\n", + "from the player’s device to the game server and back,\n", + "\n", + "to ensure that players have a smooth and responsive\n", + "\n", + "gaming experience.\n", + "\n", + "- **Network Bandwidth:** Monitor the amount of data being\n", + "\n", + "transmitted between the player’s device and the game\n", + "\n", + "server to ensure that players have a high-quality gaming\n", + "\n", + "experience, even on slow internet connections.\n", + "\n", + "- **Live Operations:** Monitor the success of in-game events,\n", + "\n", + "promotions, and other live operations to understand what\n", + "\n", + "resonates with players and what doesn’t.\n", + "\n", + "- **Player Feedback:** Monitor player feedback and reviews,\n", + "\n", + "including ratings and comments on social media, forums,\n", + "\n", + "and app stores, to understand what players like and dislike\n", + "\n", + "about the game.\n", + "\n", + "- **Chat Activity:** Track the number of messages and\n", + "\n", + "interactions between players in the game’s chat channels\n", + "\n", + "to understand the level of social engagement and\n", + "\n", + "community building in the game.\n", + "\n", + "**4. Data beyond the game**\n", + "\n", + "The last bucket comes from data sources beyond the video\n", + "\n", + "game. 
These typically include the following:\n", + "\n", + "- **Social Media Data:** Social media platforms, such as\n", + "\n", + "Facebook, Twitter, TikTok and Instagram, can provide\n", + "\n", + "valuable insights into player behavior, feedback and\n", + "\n", + "preferences, as well as help game teams understand\n", + "\n", + "how players are talking about their games online with\n", + "\n", + "different communities.\n", + "\n", + "- **Forum Data:** Online forums and discussion boards, such\n", + "\n", + "as Reddit and Discord, can be rich sources of player\n", + "\n", + "feedback and opinions about the game.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The secret to success is bringing all of the disparate data sources\n", + " together, so you have as complete a 360-degree view as possible of\n", + " what’s happening in and around your game.\n", + "\n", + "\n", + "\n", + "- **Player Reviews:** Ratings and reviews on app stores, such\n", + "\n", + "as Steam, Epic, Google Play and the Apple App Store, can\n", + "\n", + "provide valuable feedback on player experiences and help\n", + "\n", + "game teams identify areas for improvement.\n", + "\n", + "- **Third-Party Data:** Third-party data sources, such as\n", + "\n", + "market research firms and industry data providers, can\n", + "\n", + "provide valuable insights into broader gaming trends and\n", + "\n", + "help game teams make informed decisions about their\n", + "\n", + "games and marketing strategies.\n", + "\n", + "This is a lot of data. And it’s no wonder that studios globally\n", + "\n", + "struggle with fragmented views of their audience, with data\n", + "\n", + "often outpacing legacy technologies. Today, the need for real-\n", + "\n", + "time capabilities and the leap from descriptive to predictive\n", + "\n", + "analytics has made it so that data, analytics, and AI are now\n", + "\n", + "table stakes for a game to be successful. Tapping into these\n", + "\n", + "four buckets of data sources, you’ll find actionable insights that\n", + "\n", + "drive better understanding of your playerbase, more efficient\n", + "\n", + "acquisition, stronger and longer lasting engagement, and\n", + "\n", + "monetization that deepens the relationship with your players.\n", + "\n", + "That’s what we’re going to dig into throughout the rest of\n", + "\n", + "this book.\n", + "\n", + "**Let’s begin with how to get data out of your game!**\n", + "\n", + "There are a variety of ways to get data out of the game and\n", + "\n", + "into cloud resources. In this section, we will provide resources\n", + "\n", + "for producing data streams in Unity and Unreal. In addition,\n", + "\n", + "we will also provide a generic approach that will work for any\n", + "\n", + "game engine, as long as you are able to send HTTP requests.\n", + "\n", + "**Unity**\n", + "\n", + "Since Unity supports C#, you would use a .NET SDK from the\n", + "\n", + "cloud provider of your choice. 
All three major cloud providers have .NET SDKs to use, and I have linked the documentation for each below. No matter the cloud provider, if you want to use an SDK you install it through the NuGet package manager into your Unity project. [A walkthrough of how to implement the .NET SDK using AWS](https://www.youtube.com/watch?v=yv4ynyCytdU) is provided here.

- **AWS:** [AWS .NET SDK - Unity considerations](https://docs.aws.amazon.com/sdk-for-net/v3/developer-guide/unity-special.html)

- **GCP:** [GCP .NET SDK Documentation](https://cloud.google.com/dotnet/docs/reference)

- **Azure:** [Azure .NET SDK Overview](https://learn.microsoft.com/en-us/dotnet/azure/sdk/azure-sdk-for-dotnet)

- **Kafka (Open-source alternative):** [Kafka .NET connector](https://github.com/confluentinc/confluent-kafka-dotnet)

From here, the SDK is used to send data to a messaging service. These messaging services will be covered in more detail in the next section.

**Unreal Engine**

Unreal supports development with C++, so you could use C++ SDKs or Blueprint interfaces to those SDKs. The resources for each SDK are provided here:

- **AWS:** [How to integrate AWS C++ SDK with Unreal Engine](https://aws.amazon.com/blogs/gametech/how-to-integrate-the-aws-c-sdk-with-unreal-engine/)

- **Azure:** [Azure C++ SDK with PlayFab](https://learn.microsoft.com/en-us/gaming/playfab/sdks/unreal/)

- **Kafka (Open-source alternative):** [Getting started with Kafka and C++](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)

Just like with the Unity example above, from here the data is sent to a messaging/streaming service.

Other engines may not support C++ or C#, but there is still a way to get your data into the cloud, no matter the language! By hitting an API Gateway with an HTTP POST request, you are able to send data to cloud services from many more types of applications. A sample high-level architecture of this solution in AWS and Azure can be seen below:

**AWS:** _(reference architecture diagram)_

**Azure:** _(reference architecture diagram)_

Once the data has been sent from the game into an event-streaming service, how do we get that data to a more permanent home? Here we will start by outlining what these messaging services do and how we can use them to point our data to a desired location.

Messaging services ingest real-time event data, being streamed to them from a number of different sources, and then send them to their appropriate target locations. These target locations can be databases, compute clusters or cloud object stores.
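To make the engine-agnostic approach concrete, here is a minimal Python sketch of the kind of HTTP POST a game client or server might send toward an API Gateway endpoint. The endpoint URL and payload fields are illustrative assumptions, not part of this guide or any particular cloud SDK.

```python
import json
import time
import uuid

import requests  # any HTTP client works; requests keeps the example short

# Hypothetical API Gateway endpoint -- replace with your own deployment's URL.
TELEMETRY_ENDPOINT = "https://example.execute-api.us-east-1.amazonaws.com/prod/telemetry"


def send_telemetry_event(player_id: str, event_name: str, properties: dict) -> bool:
    """Send a single telemetry event as an HTTP POST. Returns True on HTTP 200."""
    event = {
        "event_id": str(uuid.uuid4()),   # unique id, useful for de-duplication downstream
        "event_name": event_name,        # e.g. "session_start", "level_complete"
        "player_id": player_id,
        "client_ts": time.time(),        # client-side timestamp (seconds since epoch)
        "properties": properties,        # free-form event attributes
    }
    resp = requests.post(
        TELEMETRY_ENDPOINT,
        data=json.dumps(event),
        headers={"Content-Type": "application/json"},
        timeout=5,
    )
    return resp.status_code == 200


if __name__ == "__main__":
    # Example: record that a player finished a level.
    send_telemetry_event("player-123", "level_complete", {"level": 7, "duration_s": 312})
```

From the gateway, events can then be forwarded into one of the messaging services discussed next, which is what routes them toward their eventual storage and processing targets.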
A key property of the messaging\n", + "\n", + "services is to preserve the time in which the events arrive, so\n", + "\n", + "that it is always known the order that events occurred.\n", + "\n", + "\n", + "\n", + "- Data is stored in object storage such as S3, Azure Storage\n", + "\n", + "or GCP Buckets using Delta Lake.\n", + "\n", + "- Delta Lake is an open-source storage framework that makes\n", + "\n", + "it easy to maintain data consistency and track changes.\n", + "\n", + "**Data Governance & Cataloging:**\n", + "\n", + "- Unity Catalog in Databricks provides tools for data\n", + "\n", + "governance that helps with compliance and controlling\n", + "\n", + "access to data in the lake.\n", + "\n", + "- Unity Catalog also allows to track data lineage, auditing and\n", + "\n", + "data discovery with the use of data catalogs and governance.\n", + "\n", + "- Metadata about the data including the structure, format,\n", + "\n", + "and location of the data can be stored in a data catalog.\n", + "\n", + "\n", + "Examples of cloud messaging services include AWS Kinesis\n", + "\n", + "\n", + "Firehose, Google PubSub, and Azure Event Hubs Messaging.\n", + "\n", + "If you prefer to use open-source products, Apache Kafka is a\n", + "\n", + "very popular open-source alternative.\n", + "\n", + "### Getting data from your game to the cloud\n", + "\n", + "Moving to the cloud platform part of the journey involves\n", + "\n", + "building a gaming Lakehouse. The gaming Lakehouse allows\n", + "\n", + "gaming companies to store, manage, and analyze large volumes\n", + "\n", + "of gaming data, such as player behavior, performance metrics,\n", + "\n", + "and financial transactions, to gain valuable insights and make\n", + "\n", + "data-driven decisions to improve their business outcomes.\n", + "\n", + "**Next here are the basics of the Databricks**\n", + "\n", + "**platform simplified.**\n", + "\n", + "**Data Ingestion:**\n", + "\n", + "- Data can be ingested into the Gaming Lakehouse using\n", + "\n", + "various built-in data ingestion capabilities provided by\n", + "\n", + "Databricks such as Structured Streaming and Delta Live\n", + "\n", + "Tables for a single simple API that handles streaming or\n", + "\n", + "batch pipelines.\n", + "\n", + "- Data can be ingested in real-time or batch mode from\n", + "\n", + "\n", + "**Data Quality:**\n", + "\n", + "- Databricks platform enables you to validate, clean\n", + "\n", + "and enrich data using built-in libraries and rule-based\n", + "\n", + "validation using Delta Live Tables.\n", + "\n", + "- It also allows tracking data quality issues and missing\n", + "\n", + "values by using Databricks Delta Live Tables tables.\n", + "\n", + "**Data Security:**\n", + "\n", + "- Databricks provides a comprehensive security model to\n", + "\n", + "secure data stored in the lake.\n", + "\n", + "- Access to data can be controlled through robust access\n", + "\n", + "controls on objects such as catalogs, schemas, tables,\n", + "\n", + "rows, columns, models, experiments, and clusters.\n", + "\n", + "**Analytics:**\n", + "\n", + "- The processed data can be analyzed using various\n", + "\n", + "tools provided by Databricks such as SQL Dashboards,\n", + "\n", + "Notebooks, visualizations and ML.\n", + "\n", + "- Game studios can gain insights into player performance and\n", + "\n", + "behaviorto better engageplayers and improve their games.\n", + "\n", + "**Get started with your preferred cloud**\n", + "\n", + "\n", + "various sources such as game clients, servers or APIs.\n", + 
"\n", + "Data can be cleaned, transformed and enriched with\n", + "\n", + "additional data sources, making it ready for analysis.\n", + "\n", + "\n", + "-----\n", + "\n", + "# The Value of Data Throughout the Game Development Lifecycle\n", + "\n", + "\n", + "### Lifecycle overview\n", + "\n", + "Over the last decade, the way games have been developed\n", + "\n", + "and monetized has changed dramatically. Most if not all\n", + "\n", + "top grossing games are now built using a games-as-service\n", + "\n", + "strategy, meaning titles shipped in cycles of constant\n", + "\n", + "iteration to increase engagement and monetization of\n", + "\n", + "players over time. Games-as-a-Service models have the\n", + "\n", + "ability to create sticky, high-margin games, but they also\n", + "\n", + "heavily depend on cloud-based services such as game\n", + "\n", + "play analytics, multiplayer servers and matchmaking, player\n", + "\n", + "relationship management, performance marketing and more.\n", + "\n", + "Data plays an integral role in the development and operation\n", + "\n", + "of video games. Teams need tools and services to optimize\n", + "\n", + "player lifetime value (LTV) with databases that can process\n", + "\n", + "terabytes-petabytes of evolving data, analytics solutions\n", + "\n", + "that can access that data with near real-time latency, and\n", + "\n", + "machine learning (ML) models that can translate insights into\n", + "\n", + "actionable and innovative gameplay features.\n", + "\n", + "A game’s development lifecycle is unique to each studio. With\n", + "\n", + "different skillsets, resources, and genres of games, there is no\n", + "\n", + "\n", + "one model. Below is a simplified view of a game development\n", + "\n", + "lifecycle for a studio running a games-as-a-service model.\n", + "\n", + "What’s important to remember is that throughout your title’s\n", + "\n", + "development lifecycle, there is data that can help you better\n", + "\n", + "understand your audience, more effectively find and acquire\n", + "\n", + "players, and more easily activate and engage them. Whether\n", + "\n", + "using game play data to optimize creative decision making\n", + "\n", + "during pre-production, tapping machine learning models to\n", + "\n", + "predict and prevent churn, or identifying the next best offer\n", + "\n", + "or action for your players in real-time, **data is your friend** .\n", + "\n", + "### Use data to develop a next-generation customer experience\n", + "\n", + "In the game industry, customer experience (CX) is an\n", + "\n", + "important factor that can impact a player’s enjoyment of a\n", + "\n", + "game and the length they choose to play that game over time.\n", + "\n", + "In today’s highly competitive and fast-paced games industry,\n", + "\n", + "a game studio’s ability to deliver exceptional and seamless\n", + "\n", + "customer experiences can be a strategic differentiator when\n", + "\n", + "it comes to cutting through the noise and winning a gamer’s\n", + "\n", + "\n", + "## Game Development Lifecycle\n", + "\n", + "**Game Development Lifecycle**\n", + "\n", + "#### Games-as-a-Service (GaaS) / Games-as-a-Community (GaaC) Game-as-a-service (GaaS) / Game-as-a-Community (GaaC)\n", + "\n", + "\n", + "**Game Development Lifecycle**\n", + "\n", + "\n", + "_Game-as-a-service (GaaS) / Game-as-a-Community (GaaC)_\n", + "\n", + "\n", + "**1. 
Pre-Production**

Brainstorm how to give life to the many ideas laid out in the planning phase

**2. Production**

Most of the time, effort, and resources spent on developing video games are spent in the production stage

**3. Testing**

Every feature and mechanic in the game needs to be tested for game loop and quality control

**4. Launch**

Whether developing alongside the community with alpha and beta releases, or launching into general availability, a game launch is a critical milestone

**5. Operation**

As studios increasingly adopt games-as-a-service models, the ongoing operation of a video game is as critical as the launch itself

_(The original lifecycle graphic also maps supporting steps across these stages, including discovery and compatibility, integration, onboarding, build and test, flighting and experimentation, release, publish, awareness, and the operate, measure, engage, and monetize loop.)_

Here are a few ways data can help drive value through customer experience:

`1.` **Personalization:** Game studios can use data analytics and machine learning to personalize the game experience for each player based on their preferences and behavior. This can include personalized recommendations for content, in-game events, and other features that are tailored to the player’s interests.

`2.` **Omnichannel support:** Players often use multiple channels, such as social media, forums, and in-game support, to communicate with game studios. A next-generation customer experience involves providing a seamless and integrated support experience across all these channels in near real time.

`3.` **Continuous improvement:** Game studios can use data and feedback from players to continuously improve the player experience, gathering feedback on new features and using it to refine and optimize the game over time.

In summary, defining what a next-generation customer experience looks like for your game is important because it can help you create a more personalized, seamless, and enjoyable experience for your players, which can lead to increased engagement, monetization, and loyalty. There are many ways teams can use data throughout a game’s development lifecycle, but far and away the most valuable focus area will be in building and refining the customer experience.

Throughout the rest of this guide, we will dig into the most common use cases for data, analytics, and AI in game development, starting with where we recommend everyone begins: game analytics.

# Getting Started with Gaming Use Cases

### Where do I start? Start with game analytics

**Overview**

Big question: Where’s the best place to start when it comes to game data, analytics, and AI?
For most game studios,\n", + "\n", + "the best place to start is with game analytics. Setting up a\n", + "\n", + "dashboard for your game analytics that helps you correlate\n", + "\n", + "data across disparate sources is infinitely valuable in a world\n", + "\n", + "\n", + "where there is no one gaming data source to rule them all.\n", + "\n", + "An effective dashboard should include your game telemetry\n", + "\n", + "data, data from any game services you’re running, and data\n", + "\n", + "sources outside of your game such as stores, marketplaces,\n", + "\n", + "and social media. See below.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Getting a strong foundation in game analytics unlocks more\n", + "\n", + "advanced data, analytics, and AI use cases. For example,\n", + "\n", + "concurrent player count plus store and marketplace data\n", + "\n", + "\n", + "**GAME TELEMETRY**\n", + "\n", + "\n", + "**Data Sources**\n", + "\n", + "**GAME SERVICES** **OTHER SOURCES**\n", + "\n", + "\n", + "-----\n", + "\n", + "and lifetime value. Usage telemetry combined with crash\n", + "\n", + "reporting and social media listening helps you more quickly\n", + "\n", + "uncover where players might be getting frustrated. And\n", + "\n", + "correlating chat logs, voice transcriptions, and or discord\n", + "\n", + "\n", + "that are relevant and engaging to your players, giving you\n", + "\n", + "tools to effectively market and monetize with your audience.\n", + "\n", + "**Let’s start with Player Segmentation.**\n", + "\n", + "\n", + "and reddit forums can help you identify disruptive behavior\n", + "\n", + "\n", + "before it gets out of hand, giving you the tools to take\n", + "\n", + "actionable steps to mitigate toxicity within your community.\n", + "\n", + "**Get started and set up your Analytics Dashboard**\n", + "\n", + "### Understand your audience\n", + "\n", + "With your analytics pipelines set up, the first area of focus is to\n", + "\n", + "better understand your audience. This can help you inform a\n", + "\n", + "variety of key business decisions, from the highest macro order\n", + "\n", + "of “what game(s) to develop”, to how to market and monetize\n", + "\n", + "those games, and how to optimize the player experience.\n", + "\n", + "By understanding the demographics, preferences, and\n", + "\n", + "behaviors of their audience, a game studio can create games\n", + "\n", + "that are more likely to appeal to their target market and be\n", + "\n", + "successful. You can also use this understanding to tailor your\n", + "\n", + "marketing and monetization strategies to the needs and\n", + "\n", + "preferences of your players.\n", + "\n", + "Additionally, understanding your audience can help you\n", + "\n", + "\n", + "##### Player Segmentation\n", + "\n", + "**Overview**\n", + "\n", + "Player segmentation is the practice of dividing players\n", + "\n", + "into groups based on shared characteristics or behaviors.\n", + "\n", + "Segmentation has a number of benefits. You can better\n", + "\n", + "understand your players, create more personalized content,\n", + "\n", + "improve player retention, and optimize monetization, all of\n", + "\n", + "which contributes to an improved player experience.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "The primary objective of segmentation is to ensure you’re\n", + "\n", + "not treating your entire playerbase the exact same. 
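As a concrete illustration of what “dividing players into groups based on shared characteristics or behaviors” can look like in practice, here is a minimal clustering sketch. It assumes scikit-learn and a small, made-up table of per-player engagement features; a real pipeline would derive these features from your game telemetry.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Hypothetical per-player behavior features:
# [sessions_per_week, avg_session_minutes, lifetime_spend_usd, days_since_last_session]
players = np.array([
    [14, 42, 120.0, 1],
    [2, 15, 0.0, 21],
    [7, 30, 15.0, 3],
    [1, 10, 0.0, 45],
    [10, 55, 300.0, 2],
    [3, 20, 5.0, 9],
])

# Scale the features so large values (spend) don't dominate the distance metric.
scaled = StandardScaler().fit_transform(players)

# Group players into three behavioral segments; the number of segments is a modeling choice.
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(scaled)
print(kmeans.labels_)  # segment id per player, e.g. engaged spenders vs. casual vs. lapsed
```

In practice, each segment would be profiled and validated against business metrics before being used for targeted content, offers, or retention campaigns.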
Humans\n", + "\n", + "are different, and your players have different motivations,\n", + "\n", + "preferences and behaviors. Recognizing this and engaging\n", + "\n", + "with them in a way that meets them where they’re at\n", + "\n", + "is one of the most impactful ways you can cultivate\n", + "\n", + "engagement with your game. As we mentioned above,\n", + "\n", + "the benefits of segmentation are broad reaching. Through\n", + "\n", + "better understanding of your playerbase, you can better\n", + "\n", + "personalize experiences, tailoring content and customer\n", + "\n", + "experience to specific groups of players that increases\n", + "\n", + "engagement and satisfaction. Better understanding of\n", + "\n", + "your players also helps in improving player retention. By\n", + "\n", + "identifying common characteristics of players who are at\n", + "\n", + "risk of churning (i.e., stopping play), you can develop targeted\n", + "\n", + "strategies that only reach specific audiences.\n", + "\n", + "Create advanced customer segments to build out more\n", + "\n", + "effective user stories, and identify potential purchasing\n", + "\n", + "predictions based on behaviors. Leverage existing sales\n", + "\n", + "data, campaigns and promotions systems to create robust\n", + "\n", + "segments with actionable behavior insights to inform your\n", + "\n", + "product roadmap. You can then use this information to build\n", + "\n", + "useful customer clusters that are targetable with different\n", + "\n", + "promos and offers to drive more efficient acquisition and\n", + "\n", + "deeper engagement with existing players.\n", + "\n", + "\n", + "identify potential pain points or areas for improvement\n", + "\n", + "\n", + "within your games, allowing you to proactively make changes\n", + "\n", + "\n", + "**Get started with Player Segmentation**\n", + "\n", + "\n", + "to address these issues and improve the player experience\n", + "\n", + "before a player potentially churns.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Overview**\n", + "\n", + "Player lifetime value (LTV) is a measure of the value that a\n", + "\n", + "player brings to a game over the lifetime they play that game.\n", + "\n", + "It is typically calculated by multiplying the average revenue\n", + "\n", + "per user (ARPU) by the average player lifespan. For example,\n", + "\n", + "if the average player spends $50 per year and plays the\n", + "\n", + "game for 2 years, their LTV would be $50 * 2 = $100.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Game studios care about LTV because it helps them\n", + "\n", + "understand the long-term value of their players and make\n", + "\n", + "informed decisions about how to invest in player acquisition\n", + "\n", + "and retention. For example, if the LTV of a player is higher\n", + "\n", + "than the cost of acquiring them (e.g., through advertising),\n", + "\n", + "it may be worth investing more in player acquisition. On the\n", + "\n", + "other hand, if the LTV of a player is lower than the cost of\n", + "\n", + "acquiring them, it may be more cost-effective to focus on\n", + "\n", + "retaining existing players rather than acquiring new ones.\n", + "\n", + "LTV is one of the more important metrics that game studios,\n", + "\n", + "particularly those building live service games, can use to\n", + "\n", + "understand the value of their players. 
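The ARPU-based definition above is simple enough to capture in a few lines of Python. The numbers below mirror the $50-per-year example from the text, and the acquisition-cost figure is invented purely to show the comparison studios typically make.

```python
def player_ltv(arpu_per_year: float, avg_lifespan_years: float) -> float:
    """Lifetime value = average revenue per user per year * average player lifespan."""
    return arpu_per_year * avg_lifespan_years


# The example from the text: $50 per year over a 2-year lifespan.
ltv = player_ltv(50.0, 2.0)
print(ltv)  # 100.0

# Comparing LTV against a (hypothetical) cost of acquiring a player helps decide
# whether to lean on acquisition or on retention.
cac = 35.0
print(f"LTV/CAC ratio: {ltv / cac:.2f}")  # above 1 means a player earns back their acquisition cost
```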
It is important to\n", + "\n", + "consider other metrics as well, such as player retention,\n", + "\n", + "monetization, and engagement.\n", + "\n", + "**Get started with Player Lifetime Value**\n", + "\n", + "##### Social Media Monitoring\n", + "\n", + "**Overview**\n", + "\n", + "As the great Warren Buffet once said, “It takes 20 years to\n", + "\n", + "build a reputation and five minutes to ruin it. If you think\n", + "\n", + "about that, you’ll do things differently.” Now more than ever,\n", + "\n", + "people are able to use social media and instantly amplify\n", + "\n", + "their voices to thousands of people who share similar\n", + "\n", + "interests and hobbies. Take Reddit as an example. r/gaming,\n", + "\n", + "the largest video game community (also called a subreddit)\n", + "\n", + "has over 35 million members with nearly 500 new posts\n", + "\n", + "and 10,000 new comments per day, while over 120 game-\n", + "\n", + "specific subreddits have more than 10,000 members each,\n", + "\n", + "the largest being League of Legends with over 700,000\n", + "\n", + "members. The discourse that takes place on online social\n", + "\n", + "platforms generates massive amounts of raw and organic\n", + "\n", + "\n", + "be used to understand how customers think and discover\n", + "\n", + "exactly what they want.\n", + "\n", + "The act and process of monitoring content online across the\n", + "\n", + "internet and social media for keyword mentions and trends\n", + "\n", + "for downstream processing and analytics is called media\n", + "\n", + "monitoring. By applying media monitoring to social media\n", + "\n", + "platforms, game developers are able to gain new advantages\n", + "\n", + "that previously might not have been possible, including:\n", + "\n", + "- Programmatically aggregate product ideas for new\n", + "\n", + "feature prioritization\n", + "\n", + "- Promote a better user experience by automatically\n", + "\n", + "responding to positive or negative comments\n", + "\n", + "- Understand the top influencers in the industry who can\n", + "\n", + "sway public opinion\n", + "\n", + "- Monitor broader industry trends and emerging segments\n", + "\n", + "such as free-to-play games\n", + "\n", + "- Detect and react to controversies or crises as they begin\n", + "\n", + "- Get organic and unfiltered feedback of games and features\n", + "\n", + "- Understand customer sentiment at scale\n", + "\n", + "- Make changes faster to keep customer satisfaction high\n", + "\n", + "and prevent churn\n", + "\n", + "By failing to monitor, understand, and act on what customers\n", + "\n", + "are saying about the games and content you release as\n", + "\n", + "well as broader industry trends, you risk those customers\n", + "\n", + "leaving for a better experience that meets the demands and\n", + "\n", + "requirements of what customers want.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "By monitoring and listening to what existing and potential\n", + "\n", + "customers are saying on social media, game developers\n", + "\n", + "are able to get a natural and organic understanding of how\n", + "\n", + "customers actually feel about the games and products they\n", + "\n", + "release, or gauge consumer interest before investing time\n", + "\n", + "and money in a new idea. 
The main process for social media\n", + "\n", + "monitoring is to gather data from different social media\n", + "\n", + "platforms, such as Twitter or YouTube, process those comments\n", + "\n", + "or tweets, then take action on the processed data. While\n", + "\n", + "customer feedback can be manually discovered and processed\n", + "\n", + "in search of certain keyword mentions or feedback, it is a much\n", + "\n", + "better idea to automate it and do it programmatically.\n", + "\n", + "**Get started with Social Media Monitoring**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Overview**\n", + "\n", + "Player feedback analysis is the process of collecting,\n", + "\n", + "analyzing, and acting on player feedback to inform game\n", + "\n", + "development. It involves collecting player feedback from\n", + "\n", + "multiple sources, such as in-game surveys, customer\n", + "\n", + "support tickets, social media, marketplace reviews, and\n", + "\n", + "forums, and using data analytics tools to identify patterns,\n", + "\n", + "trends, and insights. The goal of player feedback analysis is\n", + "\n", + "to better understand player needs, preferences, and pain\n", + "\n", + "points, and use this information to inform game development\n", + "\n", + "decisions and improve the overall player experience.\n", + "\n", + "Player feedback analysis is an important part of game\n", + "\n", + "development as it helps ensure that the game continues to\n", + "\n", + "meet player needs and expectations. By regularly collecting and\n", + "\n", + "analyzing player feedback, game studios can make data-driven\n", + "\n", + "decisions to improve the game, increase player engagement\n", + "\n", + "and retention, and ultimately drive success and growth.\n", + "\n", + "For this use case, we’re going to focus on taking online\n", + "\n", + "reviews for your video game and categorizing the different\n", + "\n", + "topics players are talking about (bucketing topics) in order\n", + "\n", + "to better understand the themes (via positive or negative\n", + "\n", + "sentiment) affecting your community.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "This is incredibly helpful, providing data-driven customer\n", + "\n", + "insight into your development process. Whether used in\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Across massively multiplayer online video games (MMOs),\n", + "\n", + "multiplayer online battle arena games (MOBAs) and other\n", + "\n", + "forms of online gaming, players continuously interact in real\n", + "\n", + "time to either coordinate or compete as they move toward a\n", + "\n", + "common goal — winning. 
This interactivity is integral to game play dynamics, but at the same time, it’s a prime opening for toxic behavior, an issue pervasive throughout the online video gaming sphere.

Toxic behavior manifests in many forms, such as the varying degrees of griefing, cyberbullying and sexual harassment that are illustrated in the matrix below from [Behaviour Interactive](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant), which lists the types of interactions seen within the multiplayer game, _Dead by Daylight_.

_(Matrix from Behaviour Interactive: toxic interaction types in Dead by Daylight, ordered from less toxic to most toxic, for Survivors (e.g., gen rushing, hiding, activating emotes, looping, rush unhooking, blinding, sandbagging, teabagging, reporting, text chatting) and Killers (e.g., hatch farming, disconnecting, camping, being away from keyboard (AFK), dribbling, tunneling, lobby dodging, body blocking, slugging, face camping).)_

pre-production, such as looking at games that are similar with reviews to learn where those games have strengths and weaknesses; or using player feedback analysis with a live service title to identify themes that can apply to your product roadmap, player feedback analysis helps teams better support and cultivate engagement with the player community.

Ultimately, player feedback analysis does two things: 1) it can help you stack rank themes according to positive and negative sentiment, and 2) you can weight those themes according to impact on player engagement, toxicity, monetization, churn, and more. We’ve all read reviews that are overly positive, or overly negative.
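To make the idea of bucketing topics and scoring sentiment concrete, here is a deliberately simple, keyword-based Python sketch. The themes, keywords, and reviews are hypothetical, and a production system would normally use an NLP model rather than keyword matching.

```python
from collections import Counter

# Hypothetical themes and the keywords that map a review to them.
THEMES = {
    "matchmaking": ["matchmaking", "queue", "lobby"],
    "monetization": ["price", "microtransaction", "pay to win"],
    "performance": ["lag", "crash", "fps"],
}
POSITIVE = {"love", "great", "fun", "smooth"}
NEGATIVE = {"hate", "broken", "unfair", "laggy"}


def analyze_review(text: str) -> dict:
    """Tag a review with themes and a rough positive/negative score."""
    lowered = text.lower()
    words = lowered.split()
    themes = [t for t, kws in THEMES.items() if any(kw in lowered for kw in kws)]
    score = sum(w in POSITIVE for w in words) - sum(w in NEGATIVE for w in words)
    return {"themes": themes, "sentiment": score}


reviews = [
    "Love the new map but matchmaking queue times are broken",
    "Great gunplay, smooth fps even on my old laptop",
    "Feels pay to win since the last patch",
]

theme_counts = Counter(t for r in reviews for t in analyze_review(r)["themes"])
print(theme_counts)                              # which themes the community raises most
print([analyze_review(r) for r in reviews])      # per-review themes plus rough sentiment
```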
The process of player feedback analysis helps to normalize feedback across the community (keeping in mind, only for those who have written a review), so you’re not over-indexing on one review, or a single theme that may seem in the moment very pressing.

**Get started with Player Feedback Analysis**

In addition to the [personal toll](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) that toxic behavior can have on gamers and the community -- an issue that cannot be overstated -- toxicity also has a very real business impact on game studios. For example, a study from [Michigan State University](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) revealed that 80% of players recently experienced toxicity, and of those, 20% reported leaving the game due to these interactions. Similarly, a study from [Tilburg University](https://arno.uvt.nl/show.cgi?fid=145375) showed that having a disruptive or toxic encounter in the first session of the game led to players being over three times more likely to leave the game without returning. Given that player retention is a top priority for many studios, particularly as game delivery transitions from physical media releases to long-lived services, it’s clear that toxicity must be curbed.

Compounding this issue related to churn, some companies face challenges related to toxicity early in development, even before launch. For example, [Amazon’s Crucible](https://www.wired.com/story/amazon-crucible-release-first-big-videogame/) was released into testing without text or voice chat due in part to not having a system in place to monitor or manage toxic gamers and interactions. This illustrates that the scale of the gaming space has far surpassed most teams’ ability to manage such behavior through reports or by intervening in disruptive interactions. Given this, it’s essential for studios to integrate analytics into games early in the development lifecycle and then design for the ongoing management of toxic interactions.

**What we’re trying to solve/achieve**

Toxicity in gaming is clearly a multifaceted issue that has become a part of video game culture and cannot be addressed universally in a single way. That said, addressing toxicity within in-game chat can have a huge impact given the frequency of toxic behavior and the ability to automate the detection of it using natural language processing (NLP). In summary, leveraging machine learning to better identify disruptive behavior allows better-informed decisions to be made about how to handle it.

**Get started with Toxicity Detection**

In this section, we’re going to talk about how to use your data to more effectively find your target audience across the web. Whether you’re engaging in paid advertising, influencer or referral marketing, PR, cross promotion, community building, etc., use data to separate activity from impact. You want to focus on the channels and strategies that leverage your resources most effectively, be that time or money.

Say you have a cohort of highly engaged players who are spending money on your title, and you want to find more gamers just like that. Doing an analysis on the demographic and behavioral data of this cohort will give you the information needed to use an ad platform (such as Meta, Google, or Unity) to do lookalike modeling and target those potential gamers for acquisition.

##### Multi-Touch Attribution

**Overview**

Multi-touch attribution is a method of attributing credit to different marketing channels or touchpoints that contribute to a sale or conversion. In other words, it is a way of understanding how different marketing efforts influence a customer’s decision to make a purchase or take a desired action.

There are a variety of different attribution models that can be used to assign credit to different touchpoints, each with its own strengths and limitations. For example, the last-click model attributes all credit to the last touchpoint that the customer interacted with before making a purchase, while the first-click model attributes all credit to the first touchpoint. Other models, such as the linear model or the time decay model, distribute credit across multiple touchpoints based on different algorithms.

**What we’re trying to solve/achieve**

Multi-touch attribution can be useful for game studios because it can help them understand which marketing channels or efforts are most effective at driving conversions and inform their marketing strategy. However, it is important to choose the right attribution model for your title based on your business model (one-time purchase, subscription, free-to-play, freemium, in-game advertising, etc.)
and regularly review and optimize your\n", + "\n", + "attribution efforts to ensure they are accurate and effective.\n", + "\n", + "**Get started with Multi-Touch Attribution**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Activating Your Playerbase\n", + "\n", + "So far, we’ve discussed how to better understand your\n", + "\n", + "players, and how to acquire more of your target audience.\n", + "\n", + "Next, we’re going to dig into how to better activate your\n", + "\n", + "players to create a more engaged and loyal playerbase that\n", + "\n", + "stays with your game for the long-term. Here, we’re going to\n", + "\n", + "focus on strategies that differentiate your gamer experience.\n", + "\n", + "##### Player Recommendations\n", + "\n", + "\n", + "and make in-game purchases. Additionally, personalized\n", + "\n", + "recommendations can help improve the overall player\n", + "\n", + "experience and increase satisfaction.\n", + "\n", + "Game studios can use a variety of techniques to create player\n", + "\n", + "recommendations, such as machine learning algorithms,\n", + "\n", + "collaborative filtering, and manual curation. It is important\n", + "\n", + "to regularly review and optimize these recommendations to\n", + "\n", + "ensure that they are effective and relevant to players.\n", + "\n", + "**Get started with Player Recommendations**\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Player recommendations are suggestions for content or actions\n", + "\n", + "\n", + "that a game studio makes to individual players based on their\n", + "\n", + "interests and behaviors. These recommendations can be used\n", + "\n", + "to promote specific in-game items, encourage players to try\n", + "\n", + "new features, or simply provide a personalized experience.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Player recommendations matter to game studios because\n", + "\n", + "they can help improve player retention, engagement, and\n", + "\n", + "monetization. By providing players with recommendations\n", + "\n", + "that are relevant and engaging, studios can increase the\n", + "\n", + "likelihood that players will continue to play their games\n", + "\n", + "\n", + "##### Next Best Offer/Action\n", + "\n", + "**Overview**\n", + "\n", + "Next best offer (NBO) and next best action (NBA) are\n", + "\n", + "techniques that businesses use to make personalized\n", + "\n", + "recommendations to their customers. NBO refers to the\n", + "\n", + "practice of recommending the most relevant product or\n", + "\n", + "service to a customer based on their past purchases and\n", + "\n", + "behaviors. NBA refers to the practice of recommending the\n", + "\n", + "most relevant action or interaction to a customer based on\n", + "\n", + "the same information.\n", + "\n", + "\n", + "-----\n", + "\n", + "in-game purchase to a player based on their past spending\n", + "\n", + "habits and the items they have shown an interest in. They\n", + "\n", + "might use NBA to recommend a specific level or event to a\n", + "\n", + "player based on their progress and interests.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "It’s important to remember that next best offer is a specific\n", + "\n", + "use case within personalization that involves making\n", + "\n", + "recommendations to players on the most valuable in-game\n", + "\n", + "item or action they should take next. 
For example, a next\n", + "\n", + "best offer recommendation in a mobile game might suggest\n", + "\n", + "that a player purchase a specific in-game currency or unlock\n", + "\n", + "a new character.\n", + "\n", + "Both NBO and NBA can be used to improve customer\n", + "\n", + "retention, engagement, and monetization by providing\n", + "\n", + "personalized recommendations that are more likely to be\n", + "\n", + "relevant and appealing to individual customers. They can be\n", + "\n", + "implemented using a variety of techniques, such as machine\n", + "\n", + "learning algorithms or manual curation.\n", + "\n", + "**Get started with Next Best Offer/Action**\n", + "\n", + "##### Churn Prediction & Prevention\n", + "\n", + "**Overview**\n", + "\n", + "Video games live and die by their player base. For Games-\n", + "\n", + "\n", + "may overwhelm the ability of these players to consume,\n", + "\n", + "reinforcing the overall problem of player churn.\n", + "\n", + "At some point, it becomes critical for teams to take a cold,\n", + "\n", + "hard look at the cost of acquisition relative to the subscriber\n", + "\n", + "lifetime value (LTV) earned. These figures need to be brought\n", + "\n", + "into a healthy balance, and retention needs to be actively\n", + "\n", + "managed, not as a point-in-time problem to be solved, but\n", + "\n", + "as a “chronic condition” which needs to be managed for the\n", + "\n", + "ongoing health of the title.\n", + "\n", + "Headroom for continued acquisition-driven growth can\n", + "\n", + "be created by carefully examining why some players leave\n", + "\n", + "and some players stay. When centered on factors known\n", + "\n", + "at the time of acquisition, gaming studios may have the\n", + "\n", + "opportunity to rethink key aspects of their acquisition\n", + "\n", + "strategy that promote higher average retention rates, which\n", + "\n", + "can lead to higher average revenue per user.\n", + "\n", + "**Prerequisites for use case**\n", + "\n", + "This use case assumes a certain level of existing data\n", + "\n", + "collection infrastructure in the studio. Notably, a studio ready\n", + "\n", + "to implement a churn prediction and prevention model\n", + "\n", + "should have\n", + "\n", + "- A cloud environment where player data is stored\n", + "\n", + "- This source data should contain player behavior and\n", + "\n", + "session telemetry events from within the game. This is\n", + "\n", + "the foundation that insights can be built on top of.\n", + "\n", + "\n", + "as-a-Service (GaaS) titles, engagement is the most\n", + "\n", + "\n", + "important metric a team can measure. Naturally, proactively\n", + "\n", + "preventing churn is critical to sustained engagement and\n", + "\n", + "\n", + "**Get started with Churn Prediction & Prevention**\n", + "\n", + "\n", + "growth. Through churn prediction and prevention, you will\n", + "\n", + "\n", + "be able to analyze behavioral data to identify subscribers\n", + "\n", + "with an increased risk of churn. Next, you will use machine\n", + "\n", + "learning to quantify the likelihood of a subscriber to churn, as\n", + "\n", + "well as indicate which factors create that risk.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Balancing customer acquisition and retention is critical.\n", + "\n", + "This is the central challenge to the long-term success of\n", + "\n", + "any live service game. 
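As a rough illustration of how machine learning can quantify the likelihood of churn and point at the factors behind it, here is a minimal scikit-learn sketch. The features, labels, and player records are invented; a real model would be trained on the behavioral and session telemetry described in the prerequisites above.

```python
import numpy as np
from sklearn.linear_model import LogisticRegression

# Hypothetical per-player features derived from session telemetry:
# [sessions_last_14d, avg_session_minutes, days_since_last_session, lifetime_spend_usd]
X = np.array([
    [12, 45, 1, 80.0],
    [1, 12, 20, 0.0],
    [6, 30, 3, 10.0],
    [0, 8, 35, 0.0],
    [9, 50, 2, 150.0],
    [2, 15, 14, 5.0],
])
y = np.array([0, 1, 0, 1, 0, 1])  # 1 = churned in the following month (toy labels)

model = LogisticRegression().fit(X, y)

# Likelihood of churn for a player who hasn't logged in for a week.
print(model.predict_proba([[3, 20, 7, 0.0]])[0][1])

# Coefficients hint at which factors push risk up or down (positive pushes toward churn).
for name, coef in zip(
    ["sessions_14d", "avg_minutes", "days_inactive", "spend"], model.coef_[0]
):
    print(f"{name}: {coef:+.2f}")
```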
This is particularly challenging in that\n", + "\n", + "successful customer acquisition strategies needed to get\n", + "\n", + "games to scale tend to be followed by service disruptions or\n", + "\n", + "declines in quality and customer experience, accelerating\n", + "\n", + "player abandonment. To replenish lost subscribers, the\n", + "\n", + "acquisition engine continues to grind and expenses mount.\n", + "\n", + "As games reach for customers beyond the core playerbase\n", + "\n", + "they may have initially targeted, the title may not resonate\n", + "\n", + "\n", + "##### Real-time Ad Targeting\n", + "\n", + "**Overview**\n", + "\n", + "Real-time ad targeting in the context of game development\n", + "\n", + "focuses on using data to deliver personalized and relevant\n", + "\n", + "advertisements to players in near real-time, while they are\n", + "\n", + "playing a game. Real-time targeting is performanced based,\n", + "\n", + "using highly personalized messagings which are achieved\n", + "\n", + "by using data to precisely determine the most opportune\n", + "\n", + "moments to display ads, based on factors such as player\n", + "\n", + "behavior, game state, and other contextual information.\n", + "\n", + "Knowing when to send those ads is based on data. This\n", + "\n", + "use case is specific to titles using in-game advertising as a\n", + "\n", + "business model. It’s important to note that in-game real-\n", + "\n", + "time ad targeting requires a sophisticated tech stack, with\n", + "\n", + "\n", + "-----\n", + "\n", + "with bigger ad ecosystem, ad networks and partners. The\n", + "\n", + "Databricks Lakehouse platform is an optimal foundation as it\n", + "\n", + "already contains many of the connectors required to enable\n", + "\n", + "this use case.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "The goal of in-game real-time ad targeting is to provide a\n", + "\n", + "more immersive and relevant advertising experience for\n", + "\n", + "players, while also increasing the effectiveness of the ads\n", + "\n", + "for advertisers. By delivering targeted ads that are relevant\n", + "\n", + "to each player’s interests, game developers can create a\n", + "\n", + "more enjoyable and personalized gaming experience, which\n", + "\n", + "can help to reduce churn and increase the lifetime value of\n", + "\n", + "each player. Additionally, real-time ad targeting can also help\n", + "\n", + "game developers monetize their games more effectively, as\n", + "\n", + "advertisers are willing to pay a premium for hyper-targeted\n", + "\n", + "and engaged audiences.\n", + "\n", + "**Get started with Real-time Ad Targeting**\n", + "\n", + "### Operational use cases\n", + "\n", + "In the game development industry, operational analytics\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Anomaly detection plays an important role in the operation\n", + "\n", + "of a live service video game by helping to identify and\n", + "\n", + "diagnose unexpected behaviors in real-time. By identifying\n", + "\n", + "patterns and anomalies in player behavior, system\n", + "\n", + "performance, and network traffic, this information can then\n", + "\n", + "be used to detect and diagnose server crashes, performance\n", + "\n", + "bottlenecks, and hacking attempts. The ability to understand\n", + "\n", + "if there will be an issue before it becomes widespread is\n", + "\n", + "immensely valuable. 
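As a simple illustration of the idea, here is a minimal Python sketch that flags an unusual drop in concurrent players using a rolling baseline. The numbers and the three-sigma threshold are illustrative assumptions, not a production-grade detector.

```python
import pandas as pd

# Hypothetical minute-by-minute concurrent player counts for one server region.
ccu = pd.Series(
    [5200, 5180, 5230, 5210, 5250, 5190, 3100, 5220, 5240],  # the 3100 dip is the anomaly
    index=pd.date_range("2024-01-01 12:00", periods=9, freq="min"),
)

# Build the baseline from the *previous* few minutes (shifted so a point can't mask itself),
# then flag points that sit far outside that baseline.
window = 5
baseline_mean = ccu.shift(1).rolling(window).mean()
baseline_std = ccu.shift(1).rolling(window).std()
z_score = (ccu - baseline_mean) / baseline_std

anomalies = ccu[z_score.abs() > 3]  # e.g. a sudden CCU drop that may signal a server issue
print(anomalies)
```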
Without anomaly detection, which is\n", + "\n", + "a form of advanced analytics, you’re always in a reactive\n", + "\n", + "(rather than proactive) state. Anomaly detection is a type of\n", + "\n", + "quality of service solution.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "The goal of anomaly detection is to ensure that players\n", + "\n", + "have a stable and enjoyable gaming experience. This has\n", + "\n", + "an impact across your game, from reducing downtime,\n", + "\n", + "to minimizing player churn, and improving your game’s\n", + "\n", + "reputation and revenue. Additionally, the insights gained from\n", + "\n", + "anomaly detection can also be used to mitigate cheating and\n", + "\n", + "disruptive behavior.\n", + "\n", + "**Get started with Anomaly Detection**\n", + "\n", + "\n", + "are essential for ensuring a smooth and efficient production\n", + "\n", + "\n", + "process. One common use case is anomaly detection, where\n", + "\n", + "data analytics is utilized to identify any unusual patterns\n", + "\n", + "or behaviors in the game, such as crashes or performance\n", + "\n", + "issues. This helps developers quickly identify and fix\n", + "\n", + "problems, improving the overall quality of the game. Another\n", + "\n", + "example is build pipelines, where data analytics can be used\n", + "\n", + "to monitor and optimize the process of creating new builds\n", + "\n", + "of the game. By tracking key metrics such as build time,\n", + "\n", + "error rates, and resource utilization, developers can make\n", + "\n", + "informed decisions about how to optimize the build process\n", + "\n", + "for maximum efficiency. Other operational use cases in game\n", + "\n", + "development include tracking player behavior, measuring\n", + "\n", + "server performance, and analyzing sales and marketing data.\n", + "\n", + "Lets explore a few of these below.\n", + "\n", + "\n", + "##### Build Pipeline\n", + "\n", + "**Overview**\n", + "\n", + "A build pipeline is a set of automated processes that\n", + "\n", + "are used to compile and assemble the code, assets, and\n", + "\n", + "resources that make up a game project. The build pipeline\n", + "\n", + "typically includes several stages, such as code compilation,\n", + "\n", + "optimization, testing, and release. The purpose of a build\n", + "\n", + "pipeline is to streamline the game development process\n", + "\n", + "and ensure that each stage of development is completed\n", + "\n", + "efficiently and effectively. A build pipeline can be configured\n", + "\n", + "to run automatically, so that new builds are generated\n", + "\n", + "whenever changes are made to the code or assets. This\n", + "\n", + "helps to ensure that the game is always up-to-date and\n", + "\n", + "ready for testing and release. The logs are collected are in\n", + "\n", + "near-real time from build servers. A simplified example:Dev\n", + "\n", + "X is committing code on title Y, submitted on day Z,\n", + "\n", + "along with the log files from the pipeline and build server.\n", + "\n", + "Builds typically take multiple hours to complete, requiring\n", + "\n", + "significant amounts of compute via build farms. 
Being able to\n", + "\n", + "\n", + "-----\n", + "\n", + "identify which builds are wasting compute, and being able to predict which builds\n", + "\n", + "will fail as they go through the pipeline are ways to curb\n", + "\n", + "operational expenses.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "With this use case, we’re seeking to reduce wasted compute\n", + "\n", + "and build a foundational view of what was developed, by\n", + "\n", + "whom, when, and how testing performed. In an ideal state, our\n", + "\n", + "automated build pipeline could send a notification to the\n", + "\n", + "developer with a confidence metric on the build making it\n", + "\n", + "through, allowing them to decide whether to continue or\n", + "\n", + "move another build through the pipeline. Often, developers\n", + "\n", + "do not have clear visibility until the build has completed\n", + "\n", + "or failed. By providing more insight to devs into the build\n", + "\n", + "pipeline process, we can increase the rate at which builds\n", + "\n", + "are completed efficiently and effectively.\n", + "\n", + "**Get started with Build Pipeline**\n", + "\n", + "##### Crash Analytics\n", + "\n", + "\n", + "resources were being used. How long crash testing takes\n", + "\n", + "can vary, depending on the game’s business model, amount\n", + "\n", + "of content, and scope. For a title with a one-time release,\n", + "\n", + "where there is a large amount of content and a complex\n", + "\n", + "storyline, the chances of hidden crashes causing errors while\n", + "\n", + "in development are high, requiring more time to\n", + "\n", + "perform testing before the game can be published. For titles\n", + "\n", + "built in a game-as-a-service model, i.e. a game shipped in\n", + "\n", + "cycles of constant iteration, crash detection should be done\n", + "\n", + "continuously, since errors in newly released content might\n", + "\n", + "affect the base game and lead to crashes.\n", + "\n", + "Increasingly, titles are being released in alpha (where\n", + "\n", + "developers do the testing), closed beta (which includes a\n", + "\n", + "limited group of testers/sample-users who do the gameplay\n", + "\n", + "testing) and open betas (where anyone interested can register\n", + "\n", + "to try the game), all of which happens before the game is\n", + "\n", + "“officially” released. Regardless of alpha, beta, or GA, players\n", + "\n", + "may stumble over game crashes, which trigger crash reports\n", + "\n", + "that are sent to the developers for fixing. But sometimes, it\n", + "\n", + "can be challenging to understand the issue that caused the\n", + "\n", + "crash from crash reports provided by your game’s platform.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Ultimately, the purpose of crash analytics is to identify the\n", + "\n", + "root cause of a crash, and help you take steps to prevent\n", + "\n", + "similar crashes from happening in the future. This feedback\n", + "\n", + "loop can be tightened through automation in the data\n", + "\n", + "pipeline. For example, by tracking crashes caused on builds\n", + "\n", + "from committers, the data can provide build suggestions\n", + "\n", + "to improve crash rate. 
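As a hedged illustration of what that tracking can look like in practice (the table names and columns below are hypothetical and not taken from any accelerator), a few lines of PySpark are enough to rank builds by crash rate:\n", + "\n", + "```python\n", + "# Rough sketch: join crash reports to build metadata and rank builds by crash rate.\n", + "# The ops.crash_reports and ops.build_metadata tables are hypothetical.\n", + "from pyspark.sql import SparkSession, functions as F\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "crashes = spark.table('ops.crash_reports')   # one row per crash, keyed by build_id\n", + "builds = spark.table('ops.build_metadata')   # build_id, committer, sessions_served\n", + "\n", + "crash_rate_by_build = (\n", + "    crashes.groupBy('build_id')\n", + "    .agg(F.count(F.lit(1)).alias('crash_count'))\n", + "    .join(builds, 'build_id')\n", + "    .withColumn('crash_rate', F.col('crash_count') / F.col('sessions_served'))\n", + "    .orderBy(F.desc('crash_rate'))\n", + ")\n", + "\n", + "crash_rate_by_build.show()\n", + "```\n", + "\n", + "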
Furthermore, teams can automate\n", + "\n", + "deduplication when multiple players experience the same\n", + "\n", + "errors, helping to reduce noise in the alerts received.\n", + "\n", + "**Get started with Crash Analytics**\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Games crash, it is a fact of game development. The\n", + "\n", + "combination of drivers, hardware, software, and\n", + "\n", + "configurations create unique challenges in tracking, resolving\n", + "\n", + "and managing the user experience.\n", + "\n", + "Crash analytics and reporting is the process of collecting\n", + "\n", + "information about crashes or unexpected failures in a\n", + "\n", + "software application, in this case, a video game. A crash\n", + "\n", + "report typically includes information about the state of the\n", + "\n", + "game at the time of the crash, such as what the player was\n", + "\n", + "\n", + "-----\n", + "\n", + "# Things to look forward to\n", + "\n", + "\n", + "This eBook was created to help game developers better\n", + "\n", + "wrap their heads around the general concepts in which data,\n", + "\n", + "analytics, and AI can be used to support the development\n", + "\n", + "and growth of video games. **If you only have 5 minutes,**\n", + "\n", + "**these takeaways are critical to your success** .\n", + "\n", + "For more information on advanced data, analytics, and AI use\n", + "\n", + "cases, as well as education resources, we highly recommend\n", + "\n", + "Databricks training portal [dbricks.co/training](http://dbricks.co/training) .\n", + "\n", + "**Top takeaways:**\n", + "\n", + "If you take nothing else from this guide, here are the most\n", + "\n", + "important takeaways we want to leave with you on your journey.\n", + "\n", + "`1.` **Data is fundamental. Data, analytics, and AI play a role**\n", + "\n", + "throughout the entire game development lifecycle - from\n", + "\n", + "discovery to pre-production, development to operating\n", + "\n", + "a game as a live service. Build better games, cultivate\n", + "\n", + "deeper player engagements, and operate more effectively\n", + "\n", + "\n", + "by utilizing the full potential of your data.\n", + "\n", + "`2.` **Define your goals.** Start by establishing the goals of what\n", + "\n", + "you’re hoping to learn and or understand around your\n", + "\n", + "game. Clear goals make it easier to identify key metrics\n", + "\n", + "to track, example goals include; developing high-quality\n", + "\n", + "games that provide engaging and satisfying player\n", + "\n", + "experiences, increasing player engagement and retention\n", + "\n", + "by analyzing and improving gameplay and mechanics, and\n", + "\n", + "building a strong and positive brand reputation through\n", + "\n", + "effective marketing and community outreach.\n", + "\n", + "`3.` **Identify and understand your data sources.** Spend time\n", + "\n", + "to identify and understand the breadth of data sources\n", + "\n", + "you are already collecting, be that game telemetry,\n", + "\n", + "marketplace, game services, or sources beyond the game\n", + "\n", + "like social media. 
It is critical to collect the right data, and\n", + "\n", + "track the right metrics based on the goals and objectives\n", + "\n", + "you have set for your game.\n", + "\n", + "`4.` **Start small, and iterate quickly.** Recognize that goals and\n", + "\n", + "objectives evolve as you learn more about the interaction\n", + "\n", + "\n", + "-----\n", + "\n", + "are most effective when scoped small with tight feedback\n", + "\n", + "loops, allowing you to quickly adapt with your community\n", + "\n", + "and alongside shifting market conditions.\n", + "\n", + "`5.` **Game analytics forms the foundation.** Start by getting a\n", + "\n", + "game analytics dashboard up and running. The process of\n", + "\n", + "building out a dashboard will naturally require connecting\n", + "\n", + "and transforming your data in a way to unlock more\n", + "\n", + "advanced use cases down the road.\n", + "\n", + "`6.` **Plan and revisit your data strategy frequently.** Once\n", + "\n", + "dashboarding is set up, you’ll have a better picture of what\n", + "\n", + "downstream data use cases make the most sense for\n", + "\n", + "your game and business objectives. As you move to use\n", + "\n", + "cases such as player segmentation, churn analysis, and\n", + "\n", + "player lifetime value, revisit your data strategy frequently\n", + "\n", + "to ensure you’re spending time on use cases that drive\n", + "\n", + "actionable insights for you and your team.\n", + "\n", + "`7.` **Show value broad and wide.** Whether your data strategy\n", + "\n", + "is new or well established on the team, build the habit\n", + "\n", + "of communicating broadly to stakeholders across the\n", + "\n", + "company. Early in the process, it is important to gather\n", + "\n", + "critical feedback on what data is helpful and where there\n", + "\n", + "are opportunities for improvement. The worst thing that\n", + "\n", + "can happen is you create something that no one uses.\n", + "\n", + "That is a waste of everyone’s time and money.\n", + "\n", + "`8.` **Ask for help.** Engage with your technical partners. There\n", + "\n", + "are humans who can help ensure you’re developing your\n", + "\n", + "data and analytics platform in a way that is efficient and\n", + "\n", + "effective. There are numerous partners with domain\n", + "\n", + "expertise in data science and data engineering that can\n", + "\n", + "accelerate your data journey - here is our recommended\n", + "\n", + "partner list for [data, analytics, and AI workloads](https://www.databricks.com/company/partners/consulting-and-si) .\n", + "\n", + "`9.` **Participate in the community.** The community for game\n", + "\n", + "analytics is large and growing. It is important to research and\n", + "\n", + "\n", + "your needs and interests. Here are a few of our favorites:\n", + "\n", + "`a.` [IGDA Game Analytics](https://igda.org/sigs/analytics/) : The IGDA has a number of\n", + "\n", + "Special Interest Groups that bring together user\n", + "\n", + "researchers, designers, data engineers and data\n", + "\n", + "scientists focused on understanding player behavior\n", + "\n", + "and experiences. 
They offer resources and events\n", + "\n", + "for those working in games user research, including a\n", + "\n", + "yearly Games User Research Summit.\n", + "\n", + "`b.` [Data Science Society](https://www.datasciencesociety.net/) : The Data Science Society is a\n", + "\n", + "global community of data scientists and engineers.\n", + "\n", + "While not specifically focused on game development,\n", + "\n", + "they offer a wealth of resources and opportunities for\n", + "\n", + "learning, networking, and collaboration in the field of\n", + "\n", + "data science.\n", + "\n", + "`c.` [Hugging Face](https://huggingface.co/) : is hub of open source models for Natural\n", + "\n", + "Language Processing, computer vision, and other fields\n", + "\n", + "where AI plays its role. They also provide an online\n", + "\n", + "platform where users can access pre-trained models\n", + "\n", + "and tools, share their own models and datasets, and\n", + "\n", + "collaborate with other developers in the community.\n", + "\n", + "`d.` [Data Engineering subreddit](https://www.reddit.com/r/dataengineering/) : The Data Engineering\n", + "\n", + "subreddit is a forum for data engineers to discuss\n", + "\n", + "topics related to building and managing data pipelines,\n", + "\n", + "data warehousing, and related technologies. While\n", + "\n", + "not specifically focused on game development, it\n", + "\n", + "can be a valuable resource for those working on data\n", + "\n", + "engineering in the gaming industry.\n", + "\n", + "`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n", + "\n", + "first step in your data journey. Imagine how the output of\n", + "\n", + "your data can be presented in a way to help stakeholders\n", + "\n", + "across your company achieve more. For example, dropping\n", + "\n", + "data into an application that can help game designers\n", + "\n", + "make balancing decisions based on player events.\n", + "\n", + "\n", + "-----\n", + "\n", + "# APPENDIX Ultimate class build guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "The heart and soul of mature data teams are formed by this\n", + "\n", + "trio of classes. There are many aspects to these roles, but\n", + "\n", + "they can be summarized in that Data Engineers create and\n", + "\n", + "maintain critical data workflows, Data Analysts interpret data\n", + "\n", + "and create reports that keep the business teams running\n", + "\n", + "seamlessly, and Data Scientists are responsible for making\n", + "\n", + "sense of large amounts of data. Depending on the size of\n", + "\n", + "the organization, individuals may be required to multiclass\n", + "\n", + "in order to address needs of the team. 
In smaller studios, it’s\n", + "\n", + "often developers who wear multiple hats, including those in\n", + "\n", + "data engineering, analytics and data science.\n", + "\n", + "Whether you’re looking to stand-up an analytics dashboard\n", + "\n", + "to report on the health of a title or building a recommendation\n", + "\n", + "engine for your players, this guide will help you better\n", + "\n", + "understand the unique classes required to develop and\n", + "\n", + "maintain an effective data, analytics, and AI platform.\n", + "\n", + "##### Data Engineers\n", + "\n", + "\n", + "**Goals and Priorities of Data Engineers**\n", + "\n", + "- Enable access to usable data for real-time insights — data\n", + "\n", + "that both enables timely decision-making and is accurate\n", + "\n", + "and reproducible\n", + "\n", + "- Increase user confidence and trust in data. This involves\n", + "\n", + "ensuring high consistency and reliability in ETL processes\n", + "\n", + "- Limit the issues and failures experienced by other\n", + "\n", + "engineers and data scientists, allowing those roles to\n", + "\n", + "focus less on troubleshooting and more on drawing\n", + "\n", + "meaningful conclusions from data and building new\n", + "\n", + "products / features\n", + "\n", + "**What Data Engineers care about:**\n", + "\n", + "- Enabling access to data for real-time insights — data that\n", + "\n", + "both enables timely decision-making and is accurate and\n", + "\n", + "reproducible\n", + "\n", + "- Building high-performance, reliable and scalable pipelines\n", + "\n", + "for data processing\n", + "\n", + "- Delivering data for consumption from a variety of sources\n", + "\n", + "by Data Analysts and Data Scientists against tight SLAs\n", + "\n", + "- A Data Engineer’s biggest challenge? Collaboration\n", + "\n", + "across teams\n", + "\n", + "\n", + "Data engineers build systems that collect, manage, and\n", + "\n", + "\n", + "convert source data into usable information for data\n", + "\n", + "scientists and business analysts to interpret. Their ultimate\n", + "\n", + "goal is to make data accessible so that teams can use it to\n", + "\n", + "evaluate and optimize a goal or objective.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Data Engineers are responsible for data migration,\n", + "\n", + "manipulation, and integration of data (joining dissimilar\n", + "\n", + "data systems)\n", + "\n", + "- Setup and maintenance of ETL pipelines to convert\n", + "\n", + "source data into actionable data for insights. It is the\n", + "\n", + "responsibility of the data engineer to make sure these\n", + "\n", + "pipelines run efficiently and are well orchestrated.\n", + "\n", + "- The Data Engineer sets up the workflow process\n", + "\n", + "to orchestrate pipelines for the studio’s data and\n", + "\n", + "continuously validates it\n", + "\n", + "- Managing workflows to enable data scientists and data\n", + "\n", + "analysts, and ensuring workflows are well-integrated with\n", + "\n", + "different parts of the studio (e.g., marketing, test/QA, etc)\n", + "\n", + "\n", + "##### Data Scientists\n", + "\n", + "Data scientists determine the questions their team should\n", + "\n", + "be asking and figure out how to answer those questions\n", + "\n", + "using data. 
They often develop predictive models for\n", + "\n", + "theorizing and forecasting.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Responsible for making sense of the large amounts of data\n", + "\n", + "collected for a given game title, such as game telemetry,\n", + "\n", + "business KPIs, game health and quality, and sources\n", + "\n", + "beyond the game such as social media listening\n", + "\n", + "- The analytics portion of a Data Scientist’s job means\n", + "\n", + "looking at new and existing data to try and discover new\n", + "\n", + "things within it\n", + "\n", + "- The engineering component may include writing out\n", + "\n", + "pipeline code and deploying it to a repository\n", + "\n", + "- Data Scientists are responsible for building, maintaining, and\n", + "\n", + "monitoring models used for analytics and/or data products\n", + "\n", + "\n", + "-----\n", + "\n", + "**Goals and Priorities:**\n", + "\n", + "- Developing new business capabilities (such as behavioral\n", + "\n", + "segmentation, churn prediction, recommendations) and\n", + "\n", + "optimizing processes around those capabilities\n", + "\n", + "- Increase ROI by building algorithms and tools that are\n", + "\n", + "maintainable and reusable\n", + "\n", + "- Exploring (or further expanding) the use of machine\n", + "\n", + "learning models for specific use cases\n", + "\n", + "- Bridges the gap between engineering and analytics,\n", + "\n", + "between the technology teams and business teams\n", + "\n", + "- Provides the business side of the studio with data that is crucial\n", + "\n", + "in decision-making, for example a churn model that helps\n", + "\n", + "predict the impact of a new feature set\n", + "\n", + "**What Data Scientists care about:**\n", + "\n", + "- Creating exploratory analysis or models to accurately\n", + "\n", + "predict business metrics, e.g., customer spend, churn,\n", + "\n", + "etc., and provide data-driven recommendations\n", + "\n", + "- Enable the team with actionable insights that are easy to\n", + "\n", + "understand and well curated\n", + "\n", + "- Create and move models from experimentation to\n", + "\n", + "production\n", + "\n", + "- A Data Scientist’s biggest challenge? Keeping up with\n", + "\n", + "advancements and innovation in data science, and\n", + "\n", + "knowing which tools and libraries to use\n", + "\n", + "##### Data Analysts\n", + "\n", + "A data analyst reviews data to identify key insights into a\n", + "\n", + "game studio’s customers and ways the data can be used to\n", + "\n", + "solve problems.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Often serves as the go-to point of contact for non-\n", + "\n", + "\n", + "\n", + "- Analysts often interpret data and create reports or other\n", + "\n", + "documentation for studio leadership\n", + "\n", + "- Analysts typically are responsible for mining and\n", + "\n", + "compiling data\n", + "\n", + "- Streamline and/or simplify processes when possible\n", + "\n", + "**Goals and Priorities:**\n", + "\n", + "- Empower stakeholder and business teams with\n", + "\n", + "actionable data\n", + "\n", + "- “Catch things before they break”. 
Proactively mitigate\n", + "\n", + "potential data issues before they occur (for internal and\n", + "\n", + "external customers)\n", + "\n", + "- Analysts are often recruited to assist other teams (i.e., BI\n", + "\n", + "teams) with their domain knowledge\n", + "\n", + "- Driving business impact through documentation and\n", + "\n", + "reliable data\n", + "\n", + "**What Data Analysts care about:**\n", + "\n", + "- Easy access to high quality data.\n", + "\n", + "- Quickly find insights from data with SQL queries and\n", + "\n", + "interactive visualizations.\n", + "\n", + "- The ability to easily share insights and while creating\n", + "\n", + "impactful assets for others to consume (dashboards, reports).\n", + "\n", + "- A Data Analyst’s biggest challenge? Working with complex\n", + "\n", + "processes and complicated technologies that are filled\n", + "\n", + "with messy data. While fighting these challenges, Analysts\n", + "\n", + "are often left alone or forced through paths that prevent\n", + "\n", + "collaboration with others across team/organization.\n", + "\n", + "- Untrustworthy data: often Analysts get asked to provide\n", + "\n", + "answers to leadership that will leverage the data to\n", + "\n", + "determine the direction of the company. When the data is\n", + "\n", + "untrustworthy or incorrect due to previously mentioned\n", + "\n", + "challenges this can eventually lead to lack of trust in the\n", + "\n", + "data teams from leadership or the business.\n", + "\n", + "\n", + "technical business / operations colleagues for data\n", + "\n", + "access / analysis questions\n", + "\n", + "\n", + "-----\n", + "\n", + "# Data access and the major cloud providers\n", + "\n", + "\n", + "### Cloud Rosetta Stone\n", + "\n", + "[AWS / Azure / GCP Service Comparison - Click Here](https://cloud.google.com/free/docs/aws-azure-gcp-service-comparison)\n", + "\n", + "If you are newer to the cloud computing space, it is easy to\n", + "\n", + "get lost between the hundreds of different services between\n", + "\n", + "the three major cloud providers. The table below is meant to\n", + "\n", + "highlight the important data, analytics, and AI services used\n", + "\n", + "by the various hyperscale service providers Amazon,\n", + "\n", + "Microsoft, and Google. In addition, it aims to pair up services\n", + "\n", + "from different cloud providers that serve the same purpose.\n", + "\n", + "### Getting started with the major cloud providers\n", + "\n", + "Here are some quick ways to get started with the three major\n", + "\n", + "cloud providers: AWS, Azure, and GCP:\n", + "\n", + "**AWS:**\n", + "\n", + "`1.` **[Create an AWS account](https://portal.aws.amazon.com/billing/signup)** **:** The first step is to create an\n", + "\n", + "account on the AWS website. This will give you access to\n", + "\n", + "the AWS Management Console, which is the web-based\n", + "\n", + "interface for managing your AWS resources.\n", + "\n", + "\n", + "`2.` **Use the AWS free tier:** AWS offers a free tier of service\n", + "\n", + "that provides a limited amount of free resources each\n", + "\n", + "month. 
This is a great way to get started and try out\n", + "\n", + "various AWS services without incurring any charges.\n", + "\n", + "`3.` **Explore the AWS Management Console:** Once you have\n", + "\n", + "an account and are logged in, take some time to explore\n", + "\n", + "the AWS Management Console and familiarize yourself\n", + "\n", + "with the various services that are available.\n", + "\n", + "`4.` **Next you can search for Databricks:** In the AWS\n", + "\n", + "Management Console, use the search bar in the top-left\n", + "\n", + "corner of the page and search for “Databricks”.\n", + "\n", + "`5.` **Navigate to the Databricks page:** Once you have found\n", + "\n", + "the Databricks page, you can access it to get started with\n", + "\n", + "the Databricks service.\n", + "\n", + "`6.` **Launch Databricks Workspace:** To launch the Databricks\n", + "\n", + "Workspace on AWS, you can use the CloudFormation\n", + "\n", + "template provided by Databricks. Databricks\n", + "\n", + "CloudFormation template creates an IAM role, security\n", + "\n", + "group, and Databricks Workspace in your AWS account.\n", + "\n", + "**Azure:**\n", + "\n", + "`1.` **[Create an Azure account](https://azure.microsoft.com/en-us/free/gaming/)** **:** The first step is to create\n", + "\n", + "an account on Azure portal. This will give you access to\n", + "\n", + "the Azure portal, which is the web-based interface for\n", + "\n", + "managing your Azure resources.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Service Type|Service Description|AWS Service|Azure Service|GCP Service|\n", + "|---|---|---|---|---|\n", + "|Storage|Object storage for various file types and artifacts (CSV, JSON, Delta, JAR). Objects can be retrieved by other services|Amazon Simple Storage Service (S3)|Azure Blob Storage|Google Cloud Storage|\n", + "|Compute|High-performance VMs to run applications. Platform where data transformations are run in Big Data apps.|Amazon Elastic Compute (EC2)|Azure Virtual Machines|Google Compute Engine|\n", + "|Messaging|Real-time event streaming services to write data to object stores or data warehouses. One OSS version is Kafka|Amazon Kinesis|Azure Service Bus Messaging|Google Pub/Sub|\n", + "|Data Warehouse|Traditional data storage layer for structured data, to then be used by data analysts. Often used to read from a Data Lake, which acts as a single source of truth|Redshift or Databricks|Synapse or Databricks|BigQuery or Databricks|\n", + "\n", + "\n", + "-----\n", + "\n", + "**Jargon Glossary**\n", + "\n", + "|CDP|Customer Data Platform (CDP). A CDP is a piece of software that combines data from multiple tools to create a single centralized customer database containing data on all touch points and interactions with your product or service.|\n", + "|---|---|\n", + "|ETL|Extract, Transform, Load. In computing, extract, transform, load is a three-phase process where data is extracted, transformed and loaded into an output data container. The data can be collated from one or more sources and it can also be outputted to one or more destinations|\n", + "|KPI|Key Performance Indicator, a quantifiable measure of performance over time for a specifci objective. KPIs provide targets for teams to shoot for, milestones to gauge progress, and insights that help people across the organization make better decisions.|\n", + "|POC|Proof of Concept (PoC). 
A proof of concept is a prototype or initial implementation of a solution that is developed to demonstrate the feasibility of a concept or idea. It is often used to test the effectiveness of a new tool or approach to data analysis or machine learning before investing in a full-scale implementation.|\n", + "|MVP|Minimum Viable Product (MVP). An MVP refers to the smallest possible solution that can be delivered to meet a specific business need. The goal of an MVP is to quickly validate assumptions and prove the potential value of a larger project. By delivering a smaller solution first, stakeholders can gain confidence in the project and see a return on investment sooner, while also providing feedback to improve the larger project.|\n", + "|ROI|Return on investment (ROI), which is calculated by dividing the profit earned on an investment by the cost of that investment.|\n", + "|Serverless computing|Using compute platforms that are completely managed by service providers. When using serverless computing, you simply execute queries or deploy applications and the service provider (AWS, Databricks, etc.) handles necessary server maintenance.|\n", + "|VPC|Virtual Private Cloud. A VPC is a virtual cloud networking environment, which helps organize and give you control of your resources. You also define how resources within your VPC can communicate with other regions, VPCs, and the public internet with traffic rules and security groups.|\n", + "\n", + "\n", + "`2.` **Take Azure tutorials:** Azure provides tutorials,\n", + "\n", + "documentation, and sample templates to help you get\n", + "\n", + "started. These resources can help you understand the\n", + "\n", + "basics of Azure and how to use its services.\n", + "\n", + "`3.` **You can search for Databricks:** In the Azure portal, use the\n", + "\n", + "search bar at the top of the page and search for “Databricks”.\n", + "\n", + "`4.` **Navigate to the Databricks page:** Once you have found\n", + "\n", + "the Databricks page, you can access it to get started with\n", + "\n", + "the Databricks service.\n", + "\n", + "`5.` **Create a new Databricks workspace:** To create a new\n", + "\n", + "Databricks workspace, you can use the Azure portal, Azure\n", + "\n", + "CLI or Azure Powershell. Once created, you’ll be able to\n", + "\n", + "access your Databricks Workspace through the Azure portal.\n", + "\n", + "`6.` **Other Azure Services:** Once you have a Databricks\n", + "\n", + "workspace setup, you can easily connect it to other Azure\n", + "\n", + "Services such as Azure Storage, Event Hubs, Azure Data\n", + "\n", + "Lake Storage, Azure SQL and Cosmos DB for example.\n", + "\n", + "\n", + "**GCP:**\n", + "\n", + "`1.` **[Create a GCP account](https://console.cloud.google.com/freetrial)** **:** the first step is to create an\n", + "\n", + "account on GCP portal. 
This will give you access to the\n", + "\n", + "GCP Console, which is the web-based interface for\n", + "\n", + "managing your GCP resources.\n", + "\n", + "`2.` **Explore the GCP Console:** Once you have an account\n", + "\n", + "and are logged in, take some time to explore the GCP\n", + "\n", + "Console and familiarize yourself with the various services\n", + "\n", + "that are available.\n", + "\n", + "`3.` **Search for Databricks:** In the GCP Console, use the search bar\n", + "\n", + "in the top-left corner of the page and search for “Databricks”.\n", + "\n", + "`4.` **Navigate to the Databricks page:** Once you have found\n", + "\n", + "the Databricks page, you can access it to get started with\n", + "\n", + "the Databricks service.\n", + "\n", + "`5.` **Create a new Databricks workspace:** To create a new\n", + "\n", + "Databricks workspace, you can use the GCP Console or\n", + "\n", + "the gcloud command-line tool. Once created, you’ll be\n", + "\n", + "able to access your Databricks Workspace through the\n", + "\n", + "GCP Console.\n", + "\n", + "\n", + "-----\n", + "\n", + "# Detailed Use Cases\n", + "\n", + "\n", + "### Getting started with game analytics\n", + "\n", + "Fortunately, standing up an effective analytics dashboard\n", + "\n", + "is getting easier. It all starts with getting your data into an\n", + "\n", + "architecture that sets your team up for success. Selecting\n", + "\n", + "any of the major cloud providers — [AWS](https://portal.aws.amazon.com/billing/signup) [,](https://portal.aws.amazon.com/billing/signup) [Azure](https://azure.microsoft.com/en-us/free/gaming/) [,](https://azure.microsoft.com/en-us/free/gaming/) [GCP](https://console.cloud.google.com/freetrial) —\n", + "\n", + "you can land all your data into a cloud data lake, then use\n", + "\n", + "Databricks Lakehouse architecture to run real-time and\n", + "\n", + "reliable processing. Databricks can then help you visualize\n", + "\n", + "that data in a dashboard, or send to a visual analytics\n", + "\n", + "platform, such as Tableau.\n", + "\n", + "`1.` **Sign up for a Databricks account:** You’ll need to create\n", + "\n", + "an account on the Databricks website in order to use the\n", + "\n", + "platform.\n", + "\n", + "`2.` **Access the Databricks portal:** Interact with the\n", + "\n", + "Databricks platform and run tasks such as creating\n", + "\n", + "clusters, running jobs, and accessing data.\n", + "\n", + "`3.` **Set up a development environment:** You’ll need a\n", + "\n", + "development environment where you can write and\n", + "\n", + "test your code, whether you’re using a local IDE or the\n", + "\n", + "Databricks Workspace.\n", + "\n", + "`4.` **Collect data:** Once you have your development environment\n", + "\n", + "set up, you can start collecting data from your game. This\n", + "\n", + "can involve integrating or building a SDK into your game\n", + "\n", + "code, or using another tool to send data to cloud storage.\n", + "\n", + "`5.` **Process and analyze the data:** Once you have collected\n", + "\n", + "your data, you can use Databricks to process and analyze\n", + "\n", + "it. 
This can involve cleaning and transforming the data,\n", + "\n", + "running queries or machine learning algorithms, or\n", + "\n", + "creating visualizations.\n", + "\n", + "`6.` **Monitor and optimize:** Regularly monitor your analytics\n", + "\n", + "to ensure that they are accurate and relevant, and use the\n", + "\n", + "insights you gain to optimize your game.\n", + "\n", + "Keep in mind that these are just general steps to get started\n", + "\n", + "with Databricks for game analytics. The specific steps you’ll\n", + "\n", + "need to take will depend on your specific use case and needs.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n", + "\n", + "[out](https://databricks.com/company/contact) to us.\n", + "\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Define your goals:** What do you want to learn from your\n", + "\n", + "analytics data? Having clear goals will help you focus on\n", + "\n", + "collecting the right data and making meaningful use of it.\n", + "\n", + "- **Plan your data collection:** Determine what data you need\n", + "\n", + "to collect, how you will collect it, and how you will store it.\n", + "\n", + "- **Consider privacy:** Make sure you are transparent with your\n", + "\n", + "players about what data you are collecting and how you\n", + "\n", + "will use it, and give them the option to opt out if they wish.\n", + "\n", + "- **Use analytics to inform design:** Leverage your analytics data\n", + "\n", + "to inform decisions around game design, such as any balance\n", + "\n", + "changes or new content targeting a specific audience.\n", + "\n", + "- **Monitor and test your analytics implementation:** Regularly\n", + "\n", + "check your analytics to ensure that data is being collected\n", + "\n", + "correctly, and conduct tests to validate the accuracy of\n", + "\n", + "your data.\n", + "\n", + "- **Visualize your data:** Dashboarding your data is one of the\n", + "\n", + "most effective ways to quickly and effectively make sense\n", + "\n", + "of what’s happening at a given moment in time.\n", + "\n", + "- **Use data to improve player retention:** Analyze player\n", + "\n", + "behavior and use the insights you gain to improve player\n", + "\n", + "retention, such as by identifying and addressing pain\n", + "\n", + "points or by providing personalized content.\n", + "\n", + "- **Collaborate with your team:** Share your analytics\n", + "\n", + "findings with your team and encourage them to use the\n", + "\n", + "data to inform their work.\n", + "\n", + "- **Keep it simple:** Don’t try to collect too much data or\n", + "\n", + "create overly complex analytics systems. Keep it simple\n", + "\n", + "and focused on your goals.\n", + "\n", + "- **Start where you are:** If you’ve yet to gather all of your\n", + "\n", + "data, don’t go build some fancy model. 
Start with the data\n", + "\n", + "you have available to you and build from there.\n", + "\n", + "### Getting started with Player Segmentation\n", + "\n", + "Player segmentation is crucial to studios as it allows them\n", + "\n", + "to better understand their audience and tailor their game\n", + "\n", + "experience to meet their specific needs and preferences.\n", + "\n", + "By dividing players into different segments based on factors\n", + "\n", + "such as demographics, playing styles, and in-game behavior,\n", + "\n", + "\n", + "-----\n", + "\n", + "studios can gain valuable insights into what motivates and\n", + "\n", + "engages their players. This information can then be used\n", + "\n", + "to design games that not only provide a more enjoyable\n", + "\n", + "experience for players, but also drive player retention\n", + "\n", + "and increase revenue for the studio. In a competitive\n", + "\n", + "industry where player satisfaction is key to success, player\n", + "\n", + "segmentation is an essential tool for studios to stay ahead of\n", + "\n", + "the game.\n", + "\n", + "Start by evaluating the segmentation goals such as:\n", + "\n", + "- **Personalize the experience:** Changing or creating\n", + "\n", + "experience specific designs to the player.\n", + "\n", + "- **Create relevant content:** Surface the best content to\n", + "\n", + "players based on features and behaviors that will matter\n", + "\n", + "the most depending on the player’s place in the games\n", + "\n", + "life cycle.\n", + "\n", + "- **Monetization:** Create tailored monetization strategies\n", + "\n", + "that effectively reach and convert each player group. For\n", + "\n", + "example, you may have a group of highly engaged players\n", + "\n", + "who are more likely to make in-app purchases, while\n", + "\n", + "another group is less likely to spend money but may be\n", + "\n", + "more receptive to advertisements.\n", + "\n", + "The next steps would be to identify, collect and analyze\n", + "\n", + "player data. By gathering information on player behavior,\n", + "\n", + "preferences, and demographics, you can gain insights\n", + "\n", + "into their motivations, pain points, and what drives their\n", + "\n", + "engagement with your game.\n", + "\n", + "There are multiple types of player data to collect, including:\n", + "\n", + "- **Player Behavior:** Track player behavior and actions\n", + "\n", + "within your game to gain insights into their play style,\n", + "\n", + "preferences, and patterns.\n", + "\n", + "- **Surveys:** Ask players directly about their preferences,\n", + "\n", + "motivations, and feedback through in-game surveys, email\n", + "\n", + "questionnaires, or other forms of direct communication.\n", + "\n", + "- **Focus groups:** Gather a small group of players to discuss\n", + "\n", + "and provide feedback on specific aspects of your game\n", + "\n", + "and player experience.\n", + "\n", + "- **Social media listening:** Monitor social media platforms\n", + "\n", + "to gather insights into how players are engaging with and\n", + "\n", + "talking about your game.\n", + "\n", + "**[Customer Segmentation solution accelerator](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n", + "\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "Define your segmentation goals: Determine what you want\n", + "\n", + "to learn about your players and why. 
This will help you focus\n", + "\n", + "your analysis and ensure that your segments are meaningful\n", + "\n", + "and actionable.\n", + "\n", + "- **Use meaningful criteria:** Choose criteria that are relevant\n", + "\n", + "to your goals and that differentiate players in meaningful\n", + "\n", + "ways. This could include demographic information, in-game\n", + "\n", + "behavior, spending habits, or a combination of factors.\n", + "\n", + "- **Analyze player data:** Use data from your players to inform\n", + "\n", + "your segmentation strategy. This could include data\n", + "\n", + "on in-game behavior, spending habits, or demographic\n", + "\n", + "information.\n", + "\n", + "- **Use multiple methods:** We recommend using a\n", + "\n", + "combination of methods, such as clustering to create\n", + "\n", + "segments that are statistically meaningful and actionable\n", + "\n", + "to your game.\n", + "\n", + "- **Validate your segments:** Test your segments to ensure\n", + "\n", + "that they accurately reflect the differences you observed\n", + "\n", + "in your player data. This could involve comparing the\n", + "\n", + "segments to each other, or validating the segments\n", + "\n", + "against external data sources.\n", + "\n", + "- **Consider ethical and privacy concerns:** Ensure that\n", + "\n", + "your segmentation strategy is ethical and complies\n", + "\n", + "with privacy laws and regulations. This could involve\n", + "\n", + "anonymizing your player data, obtaining consent from\n", + "\n", + "players, or other measures to protect player privacy.\n", + "\n", + "- **Monitor and refine your segments:** Regularly review\n", + "\n", + "your segments to ensure that they remain relevant and\n", + "\n", + "meaningful. Refine your segments as necessary to reflect\n", + "\n", + "changes in your player data or your goals.\n", + "\n", + "### Getting Started with Player Lifetime Value\n", + "\n", + "Assuming you’ve followed the steps of collecting, storing, and\n", + "\n", + "preparing your player data for analysis, the quick and dirty\n", + "\n", + "way of assessing overall player lifetime value (LTV) is to\n", + "\n", + "divide the total revenue by the total\n", + "\n", + "number of registered players. Note, LTV is a critical calculation\n", + "\n", + "for return on investment, which is player lifetime spend versus\n", + "\n", + "the amount spent on player acquisition. Ideally, you want\n", + "\n", + "lifetime spend to be equal to or more than cost of acquisition.\n", + "\n", + "\n", + "-----\n", + "\n", + "As long as your game and its community are currently active,\n", + "\n", + "any player lifetime value calculations should be considered\n", + "\n", + "models, not exact numbers. This is because many of the players\n", + "\n", + "you’re considering are likely actively registered and actively\n", + "\n", + "playing, so the exact player LTV number is a moving target.\n", + "\n", + "[Figure: LTV estimation approaches along an accuracy axis: historical averages and benchmarks, simple predictive models, advanced predictive models]\n", + "\n", + "\n", + "But these models are not entirely accurate since they don’t\n", + "\n", + "take into account the players who are registered but have\n", + "\n", + "yet to generate any revenue. 
Instead, a data-driven approach\n", + "\n", + "pivoted around player segmentation or cohorts will generally\n", + "\n", + "yield more actionable insight, far more than calculating a\n", + "\n", + "single LTV for the entire player base.\n", + "\n", + "You can define your game’s cohorts in multiple ways. Perhaps\n", + "\n", + "the most obvious in terms of calculating LTV is going by daily\n", + "\n", + "active cohorts, or users who joined your game on the same\n", + "\n", + "day. You could also organize cohorts by users who joined\n", + "\n", + "your game through a certain ad campaign or promotional\n", + "\n", + "effort, by country or geographic location, or by the type of\n", + "\n", + "device used.\n", + "\n", + "**[Lifetime Value solution accelerator](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "\n", + "- **Use multiple data sources:** To get a complete picture of\n", + "\n", + "a player’s value, be sure to consider data from a variety\n", + "\n", + "of sources, including in-game purchases, ad revenue, and\n", + "\n", + "other monetization strategies.\n", + "\n", + "- **Consider player retention:** Player retention is a key factor\n", + "\n", + "in LTV, so be sure to consider how long players are likely to\n", + "\n", + "play your game when calculating LTV.\n", + "\n", + "- **Use accurate data:** Make sure you are using accurate\n", + "\n", + "data when calculating LTV. This might involve cleaning and\n", + "\n", + "processing your data, or using trusted sources such as in-\n", + "\n", + "game analytics tools.\n", + "\n", + "- **Regularly review and update your LTV estimates:** Player\n", + "\n", + "LTV can change over time, so be sure to regularly review\n", + "\n", + "and update your estimates to ensure they are accurate.\n", + "\n", + "- **Test and optimize:** Use experimentation methods such\n", + "\n", + "as A/B testing to see how different variables, such as\n", + "\n", + "in-game events or pricing strategies, affect LTV. Use the\n", + "\n", + "insights you gain to optimize your LTV calculations.\n", + "\n", + "- **Be aware of outside factors:** Your calculations should\n", + "\n", + "consider the many outside factors that can affect your\n", + "\n", + "LTV, such as the virality of your game, any spikes or surge\n", + "\n", + "in visitors due to unexpected promotions (influencers,\n", + "\n", + "reviewers talking about your game), any significant changes\n", + "\n", + "to your game that users respond well to, and other organic\n", + "\n", + "lifts that are difficult to predict with existing data.\n", + "\n", + "\n", + "The first calculation is relatively simple. We suggest using\n", + "\n", + "average revenue per user (ARPU), which is a game’s daily\n", + "\n", + "revenue divided by the number of active users, to help you\n", + "\n", + "calculate lifetime value. First, you’ll need to define what is\n", + "\n", + "an active player using retention values, which can be set to\n", + "\n", + "a week, multi-day, or multi-week period of time depending\n", + "\n", + "on how your game has performed to date. You can then look\n", + "\n", + "at the number of users who churn on a given day, averaging\n", + "\n", + "with the number of days from the player’s first visit to the\n", + "\n", + "current date (or the specific date you’ve considered the end\n", + "\n", + "for said exercise). This is your playerbase lifetime value (note\n", + "\n", + "not Player Lifetime Value). 
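Pulling those pieces together, here is a minimal, hedged sketch of the arithmetic (it multiplies the ARPU described in the next paragraph by the average lifetime in days); the table and column names are hypothetical placeholders.\n", + "\n", + "```python\n", + "# Hedged sketch of the back-of-the-envelope LTV math described here.\n", + "# analytics.daily_metrics and analytics.player_lifetimes are hypothetical tables.\n", + "from pyspark.sql import SparkSession, functions as F\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "daily = spark.table('analytics.daily_metrics')         # date, revenue, daily_active_users\n", + "lifetimes = spark.table('analytics.player_lifetimes')  # player_id, days_active_before_churn\n", + "\n", + "# ARPU: revenue divided by active users, averaged over the chosen period\n", + "arpu = daily.agg((F.sum('revenue') / F.sum('daily_active_users')).alias('arpu')).first()['arpu']\n", + "\n", + "# Average playerbase lifetime in days (first visit to churn)\n", + "avg_lifetime_days = lifetimes.agg(F.avg('days_active_before_churn')).first()[0]\n", + "\n", + "estimated_player_ltv = arpu * avg_lifetime_days\n", + "print(estimated_player_ltv)\n", + "```\n", + "\n", + "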
To get Lifetime Value, divide daily\n", + "\n", + "revenue by the number of daily active users, and multiply\n", + "\n", + "that by the Lifetime Value to get your player LTV.\n", + "\n", + "It’s important to note that while calculating player lifetime\n", + "\n", + "value, the term is not entirely accurate since most player\n", + "\n", + "lifetimes are not over (particularly true for live service\n", + "\n", + "games). But for the purpose of modeling, we recommend\n", + "\n", + "keeping the amount of time that you consider a lifetime\n", + "\n", + "relatively short, allowing you to extrapolate. Keeping the time\n", + "\n", + "period shorter helps mitigate inaccuracies, specifically, the\n", + "\n", + "longer you stretch out what you consider a lifetime the more\n", + "\n", + "likely you are to collect inactive users in your count.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Social Media Monitoring\n", + "\n", + "Social media monitoring has three primary components:\n", + "\n", + "collecting the data, processing the results, and taking action\n", + "\n", + "on the findings. When it comes to collecting the data, whether\n", + "\n", + "you’re looking for tweets, YouTube comments, or Reddit\n", + "\n", + "posts, it can be very easy to get started since many social\n", + "\n", + "media platforms such as Twitter, YouTube, and Reddit all\n", + "\n", + "provide their own detailed and comprehensive APIs making it\n", + "\n", + "easy to start gathering data from those platforms with proper\n", + "\n", + "documentation and code examples to help along the way.\n", + "\n", + "Once the data has been collected, the next step is to process\n", + "\n", + "it and prepare it to be used in the next step. Processing your\n", + "\n", + "data can range in complexity from a simple keywords filter\n", + "\n", + "or more complicated approach such as filtering by location,\n", + "\n", + "removing emojis, and censoring and substituting words. With\n", + "\n", + "the data collected and processed, it can move to the final\n", + "\n", + "stage and be analyzed for downstream use and actionable\n", + "\n", + "insights by applying sentiment analysis or text mining.\n", + "\n", + "If a game studio is looking to save time and have the above\n", + "\n", + "steps performed for them, it may be appealing to buy a\n", + "\n", + "pre-built tool. The primary benefits of buying an off the shelf\n", + "\n", + "solution is that it is often faster and easier to get started\n", + "\n", + "with, and the development of the tool is handled by a third\n", + "\n", + "party who will have experience in building media monitoring\n", + "\n", + "\n", + "solutions. On the other hand, building your own custom\n", + "\n", + "solution will provide more flexibility and control. Many pre-\n", + "\n", + "built media monitoring tools might not have the capabilities\n", + "\n", + "required to effectively process video, audio, and image\n", + "\n", + "data, and may not be able to control the frequency in which\n", + "\n", + "data is processed, whether it be near real-time or batch.\n", + "\n", + "Additionally, pre-built solutions tend to take a generalist\n", + "\n", + "approach for NLP, whether it be keyword extraction, topic\n", + "\n", + "filtering, or sentiment analysis, which often leads to poor\n", + "\n", + "results and feedback, especially for an industry as unique as\n", + "\n", + "the gaming industry where certain industry-specific slang\n", + "\n", + "or terminology is frequently used. 
Overall, building your\n", + "\n", + "own media monitoring tool will provide greater control and\n", + "\n", + "flexibility leading to a better tailored return on investment,\n", + "\n", + "and luckily Databricks makes it even easier to get started.\n", + "\n", + "With the Databricks Lakehouse platform, all data engineering,\n", + "\n", + "data science, machine learning, and data analytics can\n", + "\n", + "be done in a single place without having to stitch multiple\n", + "\n", + "systems and tools together.\n", + "\n", + "Data engineers can use Workflows and Jobs to call social\n", + "\n", + "media platform APIs on a scheduled basis and use Delta Live\n", + "\n", + "Tables to create declarative data pipelines for cleaning and\n", + "\n", + "processing the data that comes in. Data scientists can use\n", + "\n", + "tools such as ML-specific Databricks runtimes (DBRs) that\n", + "\n", + "come with many of the most popular and common libraries\n", + "\n", + "already installed, MLflow which makes model development,\n", + "\n", + "\n", + "-----\n", + "\n", + "tracking, and serving easy and efficient, and various other\n", + "\n", + "tools such as AutoML and Bamboolib. Data analysts are able\n", + "\n", + "to create real-time alerts, dashboards, and visualizations\n", + "\n", + "using Databricks SQL. Each of the three personas will be able\n", + "\n", + "to effectively collaborate with each other and integrate each\n", + "\n", + "piece of their work into the broader data architecture.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n", + "\n", + "[out](https://databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "While social media monitoring can be easy to get started\n", + "\n", + "with, there are a few key points to keep in mind.\n", + "\n", + "- Remember the Pareto principle (roughly 80% of impact\n", + "\n", + "comes from 20% of activity) and diminishing returns. While\n", + "\n", + "it’s important to monitor large platforms such as Reddit,\n", + "\n", + "Twitter, and YouTube, it might not be worthwhile to monitor\n", + "\n", + "smaller platforms (in terms of engagement) as the bulk of\n", + "\n", + "customer feedback will be on those major platforms.\n", + "\n", + "- Monitor other sources of information. It is also useful to\n", + "\n", + "monitor mentions of key company personnel such as\n", + "\n", + "executives or public facing employees.\n", + "\n", + "- While follower count does matter on platforms such as\n", + "\n", + "Twitter, don’t ignore users with low-follower counts. It only\n", + "\n", + "takes one or two re-tweets from other users to become a\n", + "\n", + "large issue.\n", + "\n", + "- On social media, customers can see through generic\n", + "\n", + "corporate responses to complaints, so it is important\n", + "\n", + "to get a clear understanding of the issue and provide a\n", + "\n", + "clear response.\n", + "\n", + "### Getting Started with Player Feedback Analysis\n", + "\n", + "The easiest place to start is gathering your data. 
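For example, the gathering step can start very small; the sketch below assumes a generic reviews endpoint and response shape, and every URL, ID, and table name in it is a placeholder, so substitute the API of whichever platform hosts your game.\n", + "\n", + "```python\n", + "# Hedged sketch of pulling raw reviews from a storefront API into a Delta table.\n", + "# The endpoint, parameters, and response shape are placeholders.\n", + "import requests\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "REVIEWS_API_URL = 'https://example.com/reviews'  # placeholder endpoint\n", + "GAME_ID = '12345'                                # placeholder game ID\n", + "\n", + "response = requests.get(REVIEWS_API_URL, params={'game_id': GAME_ID}, timeout=30)\n", + "response.raise_for_status()\n", + "reviews = response.json()['reviews']  # assumed to be a list of dicts with id, text, score\n", + "\n", + "(\n", + "    spark.createDataFrame(reviews)\n", + "    .write.format('delta')\n", + "    .mode('append')\n", + "    .saveAsTable('analytics.raw_player_reviews')  # hypothetical target table\n", + ")\n", + "```\n", + "\n", + "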
With\n", + "\n", + "accounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n", + "\n", + "Nintendo (or whatever platform you’re using), identify the ID\n", + "\n", + "for your game(s), and pull the reviews corresponding to that\n", + "\n", + "game into Databricks through an API call.\n", + "\n", + "\n", + "From here, you clean the data using some of the pre-\n", + "\n", + "processing available in Python that removes any emojis and\n", + "\n", + "other non-ASCII characters. Once complete, run it through a Spark NLP\n", + "\n", + "pipeline, which performs the basic natural language processing\n", + "\n", + "steps such as normalization, stemming, and lemmatization. We\n", + "\n", + "recommend running it through pre-trained models, such as Word\n", + "\n", + "Embeddings and Named Entity Recognition models from John\n", + "\n", + "Snow Labs. This should complete the pipeline and generate\n", + "\n", + "the aspects for the reviews provided by the community.\n", + "\n", + "This data is then loaded into a Delta table for further analysis,\n", + "\n", + "such as using a visual dashboard (built on SQL queries inside\n", + "\n", + "Databricks) to analyze and understand the aspects the\n", + "\n", + "community is talking about, which can then be shared back\n", + "\n", + "with the development team for analysis and action. This is a\n", + "\n", + "great exercise to run once per month.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Check for word groupings:** Make sure your word groupings\n", + "\n", + "are accurate to improve the analysis. For example, if your\n", + "\n", + "game is called Football Manager, and the shorthand is FM,\n", + "\n", + "make sure both of those are grouped appropriately.\n", + "\n", + "- **Leverage domain knowledge:** Clean the reviews based\n", + "\n", + "on your domain knowledge. There are generic steps one\n", + "\n", + "could take, but that will not be as effective as someone\n", + "\n", + "with domain knowledge and specific game knowledge of your title.\n", + "\n", + "- **Experiment with models:** Feel free to try multiple pre-\n", + "\n", + "trained models, and/or tweak the pre-trained models\n", + "\n", + "based on your understanding of the domain to improve\n", + "\n", + "the accuracy of your results.\n", + "\n", + "- **Work one title at a time:** This process works best when\n", + "\n", + "pulling reviews for a single title, specifically one version of\n", + "\n", + "one title at a time.\n", + "\n", + "- **Let the model do the heavy lifting, but use humans to double-**\n", + "\n", + "**check:** The sentiment corresponding to the aspects in the\n", + "\n", + "model will be labeled as Positive or Negative. In the case\n", + "\n", + "of a neutral review, the model will do its best to determine\n", + "\n", + "whether that is more positive or negative. 
A best practice\n", + "\n", + "is to spend time going back through the aspects early to\n", + "\n", + "determine model accuracy and make updates accordingly.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Toxicity Detection\n", + "\n", + "Our recommendation on tackling the toxicity issue is\n", + "\n", + "to leverage cloud-agnostic and flexible tooling that can\n", + "\n", + "consume chat data from a variety of sources, such as chat\n", + "\n", + "logs, voice transcriptions, or sources like Discord and Reddit\n", + "\n", + "forums. No matter if the data is in log form from game\n", + "\n", + "servers or events from a message system, Databricks can\n", + "\n", + "provide quick and easy ways to ingest the data.\n", + "\n", + "As the simplified architecture in the diagram\n", + "\n", + "above shows, no matter the source, getting chat data for\n", + "\n", + "inferencing and model development can be simple. While\n", + "\n", + "we leveraged a pre-built model from John Snow Labs to\n", + "\n", + "accelerate development, you can bring the ML framework of\n", + "\n", + "your choice to the platform.\n", + "\n", + "**[Gaming Toxicity solution accelerator](https://notebooks.databricks.com/notebooks/CME/Toxicity_Detection_in_Gaming/index.html)**\n", + "\n", + "\n", + "**Tips / Best Practices - things to consider**\n", + "\n", + "- **Define what toxic and disruptive behavior looks**\n", + "\n", + "**like within your community:** Clearly define what you\n", + "\n", + "consider to be toxic behavior, as this will determine how\n", + "\n", + "you measure and detect it. This might include things like\n", + "\n", + "hateful language, harassment, or cheating.\n", + "\n", + "- **Collect relevant data:** Make sure you are collecting the\n", + "\n", + "right data to help you detect toxicity. This might include\n", + "\n", + "data on in-game chat, player reports, and other sources.\n", + "\n", + "- **Use machine learning:** Use machine learning algorithms\n", + "\n", + "to analyze your data and identify patterns of toxic\n", + "\n", + "behavior. This will allow you to more accurately detect\n", + "\n", + "toxicity and prioritize cases for review.\n", + "\n", + "- **Test and optimize:** Regularly review and test your toxicity\n", + "\n", + "detection systems to ensure they are accurate and\n", + "\n", + "effective. Use experimentation methods such as A/B\n", + "\n", + "testing to see how different strategies impact toxicity rates.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with your\n", + "\n", + "players about how you are detecting toxicity, and give\n", + "\n", + "them the option to opt out if they wish.\n", + "\n", + "- **Take action:** When toxic behavior is detected, take\n", + "\n", + "appropriate action to address it. The health and wellness\n", + "\n", + "of your community depends on it. This might involve\n", + "\n", + "banning players, issuing warnings, or taking other\n", + "\n", + "disciplinary measures.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Multi-Touch Attribution and Media Mix Modeling\n", + "\n", + "To get started with multi-touch attribution, you need to first\n", + "\n", + "select an attribution model. 
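Before walking through the options below, it may help to see how little code one of them takes once touchpoint data is in a table; the sketch that follows applies the linear model (equal credit to every touchpoint), and every table and column name in it is a hypothetical placeholder.\n", + "\n", + "```python\n", + "# Hedged sketch of linear multi-touch attribution: each touchpoint on the path\n", + "# to an install receives an equal share of credit. Table names are hypothetical.\n", + "from pyspark.sql import SparkSession, functions as F, Window\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "touchpoints = spark.table('marketing.touchpoints')  # player_id, channel, event_time\n", + "installs = spark.table('marketing.installs')        # player_id, install_time\n", + "\n", + "per_player = Window.partitionBy('player_id')\n", + "\n", + "credit_by_channel = (\n", + "    touchpoints.join(installs, 'player_id')\n", + "    .filter(F.col('event_time') <= F.col('install_time'))\n", + "    .withColumn('credit', F.lit(1.0) / F.count(F.lit(1)).over(per_player))\n", + "    .groupBy('channel')\n", + "    .agg(F.sum('credit').alias('attributed_installs'))\n", + ")\n", + "\n", + "credit_by_channel.show()\n", + "```\n", + "\n", + "Swapping in a different model is mostly a matter of changing how the credit column is computed.\n", + "\n", + "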
-----

### Getting Started with Multi-Touch Attribution and Media Mix Modeling

To get started with multi-touch attribution, you need to first select an attribution model. There are a variety of different attribution models to choose from, each with its own strengths and limitations.

`1.` **Last-click model:** This model attributes all credit to the last touchpoint that the customer interacted with before making a purchase or taking a desired action.

`2.` **First-click model:** This model attributes all credit to the first touchpoint that the customer interacted with.

`3.` **Linear model:** This model attributes equal credit to each touchpoint that the customer interacted with.

`4.` **Time decay model:** This model attributes more credit to touchpoints that are closer in time to the purchase or desired action.

`5.` **Position-based model:** This model attributes a portion of the credit to the first and last touchpoints, and the remainder is distributed evenly among the other touchpoints.

`6.` **Custom model:** Some businesses create their own attribution model based on specific business needs or goals.

Each attribution model has its own strengths and limitations, and the right model for a particular video game will depend on a variety of factors, including the goals of your title, the customer journey, and the types of marketing channels being used. It is important to carefully consider the pros and cons of each model and choose the one that best aligns with the needs of your game.

Next, you’re going to want to set up tracking. In order to attribute credit to different touchpoints, you’ll need to set up tracking to capture data on customer interactions. This might involve integrating tracking code into the game, or using a third-party tracking tool.

With tracking set up, you’ll start collecting data on player interactions and be able to use that information to calculate attribution credit according to your chosen model (above). We highly recommend you regularly review and test your attribution efforts to ensure they are accurate and effective. Use experimentation methods such as A/B testing to see how different strategies impact conversion rates.

**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**

**Tips / Best Practices - things to consider**

- **Define clear goals:** Sounds simple, but by clearly defining the goals of your acquisition campaign and what success looks like, you will be able to guide your decision-making and ensure that you are measuring the right metrics - such as cost per install, return on ad spend, conversion rate, lifetime value, retention rate, and more.

- **Use a data-driven approach:** Use data to inform your decision-making.
Collect data on all touchpoints in the\n", + "\n", + "player journey, including ad impressions, clicks, installs,\n", + "\n", + "and in-game actions.\n", + "\n", + "- **Choose the right attribution model:** Select the right\n", + "\n", + "attribution model that accurately reflects the player\n", + "\n", + "journey for your specific genre of game. This can be a\n", + "\n", + "complex process. A couple of things to keep in mind\n", + "\n", + "- Consider the touchpoints that are most important for\n", + "\n", + "your player journey, such as first ad impression, first\n", + "\n", + "click, or first in-game action\n", + "\n", + "- Consider the business goals you’re trying to achieve.\n", + "\n", + "For example, if you are focused on maximizing return\n", + "\n", + "on investment, a last-click attribution model may be\n", + "\n", + "most appropriate. On the other hand, if you are looking\n", + "\n", + "to understand the impact of each touchpoint, a multi-\n", + "\n", + "touch attribution model may be more appropriate.\n", + "\n", + "- Consider the data you have available, including ad\n", + "\n", + "impressions, clicks, installs, and in-game actions.\n", + "\n", + "- **Continuously monitor and optimize:** Continuously\n", + "\n", + "monitor and optimize your acquisition campaigns based on\n", + "\n", + "the data. Test different approaches, make adjustments as\n", + "\n", + "needed, and use A/B testing to determine what works best.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Player Recommendations\n", + "\n", + "Recommendations is an advanced use case. We don’t\n", + "\n", + "recommend (hehe) that you start here, instead, we’re\n", + "\n", + "assuming that you’ve done the work to set up your game\n", + "\n", + "analytics (collecting, cleaning, and preparing data for analysis)\n", + "\n", + "and that you’ve done basic segmentation to place your\n", + "\n", + "players in cohorts based on their interests and behaviors.\n", + "\n", + "Recommendations can come in many forms for video games.\n", + "\n", + "For this context, we’re going to focus on the wide-and-deep\n", + "\n", + "learning for recommender systems, which has the ability\n", + "\n", + "to both memorize and generalize recommendations based\n", + "\n", + "on player behavior and interactions. First [introduced by](https://arxiv.org/abs/1606.07792)\n", + "\n", + "[Google](https://arxiv.org/abs/1606.07792) for use in its Google Play app store, the wide-and-\n", + "\n", + "deep machine learning (ML) model has become popular in a\n", + "\n", + "variety of online scenarios for its ability to personalize user\n", + "\n", + "engagements, even in ‘cold start problem’ scenarios with\n", + "\n", + "sparse data inputs.\n", + "\n", + "The goal with wide-and-deep recommenders is to provide\n", + "\n", + "\n", + "**Understanding the model design**\n", + "\n", + "To understand the concept of wide-and-deep recommend­\n", + "\n", + "ations, it’s best to think of it as two separate, but collaborating,\n", + "\n", + "engines. The wide model, often referred to in the literature as\n", + "\n", + "the linear model, memorizes users and their past choices. 
Its\n", + "\n", + "inputs may consist simply of a user identifier and a product\n", + "\n", + "identifier, though other attributes relevant to the pattern (such\n", + "\n", + "as time of day) may also be incorporated.\n", + "\n", + "The deep portion of the model, so named as it is a deep\n", + "\n", + "neural network, examines the generalizable attributes of a\n", + "\n", + "user and their choices. From these, the model learns the\n", + "\n", + "broader characteristics that tend to favor user selections.\n", + "\n", + "Together, the wide-and-deep submodels are trained\n", + "\n", + "on historical product selections by individual users to\n", + "\n", + "predict future selections. The end result is a single model\n", + "\n", + "capable of calculating the probability with which a user will\n", + "\n", + "purchase a given item, given both memorized past choices\n", + "\n", + "and generalizations about a user’s preferences. These\n", + "\n", + "probabilities form the basis for user-specific rankings, which\n", + "\n", + "can be used for making recommendations.\n", + "\n", + "\n", + "an intimate level of player understanding. This model uses\n", + "\n", + "\n", + "explicit and implicit feedback to expand the considerations\n", + "\n", + "set for players. Wide-and-deep recommenders go beyond\n", + "\n", + "simple weighted averaging of player feedback found in some\n", + "\n", + "collaborative filters to balance what is understood about\n", + "\n", + "the individual with what is known about similar gamers. If\n", + "\n", + "done properly, the recommendations make the gamer feel\n", + "\n", + "understood (by your title) and this should translate into\n", + "\n", + "greater value for both the player and you as the business.\n", + "\n", + "\n", + "**Building the model**\n", + "\n", + "The intuitive logic of the wide-and-deep recommender\n", + "\n", + "belies the complexity of its actual construction. Inputs\n", + "\n", + "must be defined separately for each of the wide-and-\n", + "\n", + "deep portions of the model and each must be trained in a\n", + "\n", + "coordinated manner to arrive at a single output, but tuned\n", + "\n", + "using optimizers specific to the nature of each submodel.\n", + "\n", + "Thankfully, the [Tensorflow DNNLinearCombinedClassifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier)\n", + "\n", + "[estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier) provides a pre-packaged architecture, greatly\n", + "\n", + "simplifying the assembly of an overall model.\n", + "\n", + "\n", + "**User A**\n", + "\n", + "- user identity\n", + "\n", + "- user attributes\n", + "\n", + "**Product B**\n", + "\n", + "\n", + "**Wide**\n", + "**Sub-Model**\n", + "\n", + "\n", + "**Probability of**\n", + "\n", + "**User A + Product B**\n", + "\n", + "**Wide & Deep**\n", + "**Model**\n", + "\n", + "\n", + "**Deep**\n", + "**Sub-Model**\n", + "\n", + "\n", + "\n", + "- product identity\n", + "\n", + "- product attributes\n", + "\n", + "\n", + "-----\n", + "\n", + "**Training**\n", + "\n", + "The challenge for most teams is then training the\n", + "\n", + "recommender on the large number of user-product\n", + "\n", + "combinations found within their data. 
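As a minimal sketch of how the two submodels described above come together in the `DNNLinearCombinedClassifier` estimator, the snippet below wires a wide (memorization) side and a deep (generalization) side into one model; the feature names, bucket sizes, hidden-unit sizes and checkpoint path are illustrative assumptions rather than values from the accelerator notebooks.

```python
import tensorflow as tf

# Wide (memorization) side: sparse user/item IDs plus their cross, so the model can
# memorize specific user-item combinations it has already seen.
user_id = tf.feature_column.categorical_column_with_hash_bucket("user_id", hash_bucket_size=10_000)
item_id = tf.feature_column.categorical_column_with_hash_bucket("item_id", hash_bucket_size=10_000)
wide_columns = [
    user_id,
    item_id,
    tf.feature_column.crossed_column([user_id, item_id], hash_bucket_size=100_000),
]

# Deep (generalization) side: dense embeddings and numeric attributes, so the model can
# generalize to user-item pairs it has never seen.
deep_columns = [
    tf.feature_column.embedding_column(user_id, dimension=32),
    tf.feature_column.embedding_column(item_id, dimension=32),
    tf.feature_column.numeric_column("hour_of_day"),
]

# One estimator, two submodels, each trained with its own optimizer.
estimator = tf.estimator.DNNLinearCombinedClassifier(
    model_dir="/dbfs/tmp/wide_and_deep",   # hypothetical checkpoint location
    linear_feature_columns=wide_columns,
    linear_optimizer="Ftrl",
    dnn_feature_columns=deep_columns,
    dnn_optimizer="Adagrad",
    dnn_hidden_units=[128, 64, 32],
)
# estimator.train(input_fn=...) is then fed batches of historical selections,
# for example streamed from Delta via Petastorm as described next.
```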
Using [Petastorm](https://petastorm.readthedocs.io/en/latest/) , an\n", + "\n", + "open-source library for serving large datasets assembled in\n", + "\n", + "Apache Spark™ to Tensorflow (and other ML libraries), one can\n", + "\n", + "cache the data on high-speed, temporary storage and then\n", + "\n", + "read that data in manageable increments to the model during\n", + "\n", + "training. In doing so, we limit the memory overhead associated\n", + "\n", + "with the training exercise while preserving performance.\n", + "\n", + "**Tuning**\n", + "\n", + "Tuning the model becomes the next challenge. Various model\n", + "\n", + "parameters control its ability to arrive at an optimal solution.\n", + "\n", + "The most efficient way to work through the potential parameter\n", + "\n", + "combinations is simply to iterate through some number of\n", + "\n", + "training cycles, comparing the models’ evaluation metrics with\n", + "\n", + "each run to identify the ideal parameter combinations. By\n", + "\n", + "trials, we can parallelize this work across many compute nodes,\n", + "\n", + "allowing the optimizations to be performed in a timely manner.\n", + "\n", + "**Deploying**\n", + "\n", + "Finally, we need to deploy the model for integration with\n", + "\n", + "various retail applications. Leveraging [MLflow](https://www.mlflow.org/) allows us\n", + "\n", + "to both persist our model and package it for deployment\n", + "\n", + "across a wide variety of microservices layers, including\n", + "\n", + "Azure Machine Learning, AWS Sagemaker, Kubernetes and\n", + "\n", + "Databricks Model Serving.\n", + "\n", + "While this seems like a large number of technologies to bring\n", + "\n", + "together just to build a single model, Databricks integrates all\n", + "\n", + "of these technologies within a single platform, providing data\n", + "\n", + "scientists, data engineers & [MLOps](https://www.databricks.com/glossary/mlops) Engineers a unified exper­\n", + "\n", + "ience. The pre-integration of these technologies means various\n", + "\n", + "per­sonas can work faster and leverage additional capabilities,\n", + "\n", + "such as the [automated tracking](https://docs.databricks.com/machine-learning/automl-hyperparam-tuning/index.html#automated-mlflow-tracking) of models, to enhance the\n", + "\n", + "transparency of the organization’s model building efforts.\n", + "\n", + "To see an end-to-end example of how a wide and deep\n", + "\n", + "recommender model may be built on Databricks, please\n", + "\n", + "check out the following notebooks: [Get the notebook](https://d1r5llqwmkrl74.cloudfront.net/notebooks/RCG/Wide_and_Deep/index.html#Wide_and_Deep_1.html)\n", + "\n", + "**[Recommendation Engines solution accelerator](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n", + "\n", + "\n", + "**Tips / Best Practices - things to consider**\n", + "\n", + "- **Use data to inform recommendations:** Use data from\n", + "\n", + "your analytics, player feedback, and other sources to\n", + "\n", + "understand what players like and dislike. This will help\n", + "\n", + "you create recommendations that are more likely to be\n", + "\n", + "relevant and engaging for individual players.\n", + "\n", + "- **Segment your players:** Consider segmenting your players\n", + "\n", + "based on characteristics such as playstyle, spending\n", + "\n", + "habits, and demographic information. 
This will allow you\n", + "\n", + "to create more targeted recommendations for different\n", + "\n", + "groups of players.\n", + "\n", + "- **Consider the player’s current context:** When creating\n", + "\n", + "recommendations, consider the player’s current context,\n", + "\n", + "such as what they are doing in the game and what\n", + "\n", + "content they have already consumed. This will help\n", + "\n", + "you create recommendations that are more likely to be\n", + "\n", + "relevant and timely.\n", + "\n", + "- **Test and optimize your recommendations:** Use\n", + "\n", + "experimentation methods such as A/B testing to see\n", + "\n", + "how different recommendations perform with different\n", + "\n", + "player segments. Use the insights you gain to optimize\n", + "\n", + "your recommendations.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with\n", + "\n", + "players about how you are creating recommendations and\n", + "\n", + "give them the option to opt out if they wish.\n", + "\n", + "- **Use recommendations to improve the player experience:**\n", + "\n", + "Use personalized recommendations to improve the player\n", + "\n", + "experience and increase engagement and satisfaction.\n", + "\n", + "### Getting Started with Next Best Offer/Action\n", + "\n", + "Since NBO/NBA is a specific use case of personalization, how a\n", + "\n", + "team might get started implementing this will look very similar\n", + "\n", + "to how they would with broader personalization activities.\n", + "\n", + "Begin with ensuring you are appropriately collecting player\n", + "\n", + "data (behavior, preferences, in-game purchases, etc), storing\n", + "\n", + "it in your cloud data lake using a service such as Delta Lake\n", + "\n", + "from Databricks. From here, you’ll prepare the data using\n", + "\n", + "Databricks to clean, transform, and prepare for analysis.\n", + "\n", + "This may include aggregating data from multiple sources,\n", + "\n", + "removing duplicates and outliers, and transforming the data\n", + "\n", + "into a format suitable for analysis. As you analyze the player\n", + "\n", + "data, seek to identify patterns and trends in player behavior\n", + "\n", + "\n", + "-----\n", + "\n", + "and preferences that will give you signal on which actions are\n", + "\n", + "more likely to be successful.\n", + "\n", + "From here, you can build a recommendation model based\n", + "\n", + "on the player data analysis, and incorporate information\n", + "\n", + "on in-game items and player preferences to make\n", + "\n", + "personalized recommendations.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Define your goals:** Like every use case, starting with\n", + "\n", + "clearly defined goals helps to ensure your implementation\n", + "\n", + "of NBO and NBA will be as effective and efficient as\n", + "\n", + "possible. Your goals will also help you determine what data\n", + "\n", + "to collect and how it will be used.\n", + "\n", + "- **Collect relevant data:** Based on your goals, make sure\n", + "\n", + "you are collecting the right data to inform your NBO and\n", + "\n", + "NBA recommendations. 
This might include data on player\n", + "\n", + "behavior, engagement, and spending habits.\n", + "\n", + "- **Leverage machine learning to scale your**\n", + "\n", + "**recommendations:** Use machine learning algorithms to\n", + "\n", + "analyze your data and make personalized recommendations\n", + "\n", + "to your players. This will allow you to identify trends and\n", + "\n", + "patterns that might not be immediately apparent.\n", + "\n", + "- **Test and optimize:** THIS IS CRITICAL. Use experimentation\n", + "\n", + "methods such as A/B testing to see how different\n", + "\n", + "recommendations perform with different player segments.\n", + "\n", + "Past performance is not a perfect indicator of future\n", + "\n", + "success. Consistent testing allows you to tune your NBO and\n", + "\n", + "NBA recommendations so they evolve with your playerbase.\n", + "\n", + "- **Consider the player’s context:** When making recommend­\n", + "\n", + "ations, consider the player’s current context, such as what\n", + "\n", + "they are doing in the game and what content they have\n", + "\n", + "already consumed. This will help you create recommend­\n", + "\n", + "ations that are more likely to be relevant and timely.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with\n", + "\n", + "your players about how you are using their data to make\n", + "\n", + "recommendations, and give them the option to opt out if\n", + "\n", + "they wish.\n", + "\n", + "- **Collaborate with your team:** Share your NBO and NBA\n", + "\n", + "\n", + "### Getting Started with Churn Prediction & Prevention\n", + "\n", + "The exciting part of this analysis is that not only does it\n", + "\n", + "help to quantify the risk of customer churn but it paints a\n", + "\n", + "quantitative picture of exactly which factors explain that risk.\n", + "\n", + "It’s important that we not draw too rash of a conclusion with\n", + "\n", + "regards to the causal linkage between a particular attribute\n", + "\n", + "and its associated hazard, but it’s an excellent starting point\n", + "\n", + "for identifying where an organization needs to focus its\n", + "\n", + "attention for further investigation.\n", + "\n", + "The hard part in this analysis is not the analytic techniques.\n", + "\n", + "The Kaplan-Meier curves and Cox Proportional Hazard\n", + "\n", + "models used to perform the analysis above are well\n", + "\n", + "established and widely supported across analytics platforms.\n", + "\n", + "The principal challenge is organizing the input data.\n", + "\n", + "The vast majority of subscription services are fairly new as\n", + "\n", + "businesses. As such, the data required to examine customer\n", + "\n", + "attrition may be scattered across multiple systems,\n", + "\n", + "making an integrated analysis more difficult. Data Lakes\n", + "\n", + "are a starting point for solving this problem, but complex\n", + "\n", + "transformations required to cleanse and restructure data\n", + "\n", + "that has evolved as the business itself has (often rapidly)\n", + "\n", + "evolved requires considerable processing power. 
This is\n", + "\n", + "certainly the case with the KKBox information assets and is a\n", + "\n", + "point noted by the data provider in their public challenge.\n", + "\n", + "The key to successfully completing this work is the\n", + "\n", + "establishment of transparent, maintainable data processing\n", + "\n", + "pipelines executed on an elastically scalable (and therefore\n", + "\n", + "cost-efficient) infrastructure, a key driver behind the [Delta](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n", + "\n", + "[Lake pattern](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html) . While most organizations may not be overly\n", + "\n", + "cost-conscious in their initial approach, it’s important to\n", + "\n", + "remember the point made above that churn is a chronic\n", + "\n", + "condition to be managed. As such, this is an analysis that\n", + "\n", + "should be periodically revisited to ensure acquisition and\n", + "\n", + "retention practices are aligned.\n", + "\n", + "To support this, we are making the code behind our\n", + "\n", + "analysis available for download and review. If you have any\n", + "\n", + "questions about how this solution can be deployed in your\n", + "\n", + "environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "\n", + "efforts with your team and encourage them to use the\n", + "\n", + "\n", + "data to inform their work.\n", + "\n", + "\n", + "**[Churn Prediction solution accelerator](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Define churn:** Clearly define what you consider to be\n", + "\n", + "player churn, as this will determine how you measure\n", + "\n", + "and predict it. For example, you might consider churn to\n", + "\n", + "be when a player stops playing your game for a certain\n", + "\n", + "number of days, or when they uninstall it.\n", + "\n", + "- **Collect relevant data:** Make sure you are collecting the\n", + "\n", + "right data to help you predict and prevent churn. This\n", + "\n", + "might include data on player behavior, engagement, and\n", + "\n", + "spending habits.\n", + "\n", + "- **Use machine learning:** Use machine learning algorithms\n", + "\n", + "to analyze your data and predict which players are at\n", + "\n", + "risk of churning. This will allow you to identify trends and\n", + "\n", + "patterns that might not be immediately apparent.\n", + "\n", + "- **Test and optimize:** Use experimentation methods such as\n", + "\n", + "A/B testing to see how different strategies impact churn\n", + "\n", + "rates. Use the insights you gain to optimize your churn\n", + "\n", + "prevention efforts.\n", + "\n", + "- **Focus on retention:** Implement retention strategies that are\n", + "\n", + "tailored to the needs and preferences of your players. 
This might involve providing personalized content, addressing pain points, or offering incentives to continue playing.

- **Be transparent:** Make sure you are transparent with your players about how you are using their data to predict and prevent churn, and give them the option to opt out if they wish.

- **Collaborate with your team:** Share your churn prediction and prevention efforts with your team and encourage them to use the data to inform their work.

### Getting Started with Real-time Ad Targeting

Typically, implementing a real-time ad targeting strategy begins outside of your game (in services such as Google Ads, Unity Advertising), where your game becomes the delivery point for the advertisement. Here, you will need to integrate with ad networks that provide real-time ad targeting capabilities. That will allow you to access a range of available ad assets to dynamically select and display the most relevant ads to players. Both Google AdMob and Unity Ads are great for banner ads, native ads, and rewarded video ads. Your role is to ensure that the data you’re collecting is fed back into the advertising platform to better serve targeted ads to your playerbase.

To use a service like Databricks to manage the data needed to provide real-time ad targeting in your application, you can follow the steps below:

`1.` **Collect and store player data:** Collect data on player behavior, preferences, and demographics, and store it in a data lake using Databricks. Popular analytics tools such as Google Analytics or Mixpanel can be integrated into the game to collect data on player behavior. These tools, just like tracking website traffic, can track in-game events, provide insights on player behavior and demographics, and give you access to detailed reports and dashboards. Another option is to build in-house tracking systems to collect data on player behavior - logging events (e.g., in-game purchases, player actions, or activities such as “at which level does a player quit playing”) and storing this in a database for analysis. The downside of building in-house tracking systems is that you will need to host and maintain your own logging servers.

`2.` **Prepare the data:** Use Databricks to clean, transform, and prepare the player data for analysis.
This may\n", + "\n", + "include aggregating data from multiple sources, removing\n", + "\n", + "duplicates and outliers, and transforming the data into a\n", + "\n", + "format suitable for analysis.\n", + "\n", + "`3.` **Analyze the data:** Use Databricks’ built-in machine\n", + "\n", + "learning and data analytics capabilities to analyze the\n", + "\n", + "player data and identify patterns and trends.\n", + "\n", + "`4.` **Create audience segments:** Based on the analysis,\n", + "\n", + "use Databricks to create audience segments based on\n", + "\n", + "common characteristics such as interests, behaviors,\n", + "\n", + "and preferences.\n", + "\n", + "`5.` **Integrate with the ad server:** When an ad opportunity\n", + "\n", + "presents itself within the game, a call is made to the ad\n", + "\n", + "server. This call includes information about the player,\n", + "\n", + "such as the audience segment that they belong to. The\n", + "\n", + "ad server then uses this information to decide what ad to\n", + "\n", + "deliver to the player.\n", + "\n", + "`6.` **Monitor and optimize:** Use Databricks to monitor the\n", + "\n", + "performance of the ad targeting and make optimizations\n", + "\n", + "as needed, such as adjusting the audience segments or\n", + "\n", + "adjusting the targeting algorithms.\n", + "\n", + "By using a service like Databricks to manage the data needed\n", + "\n", + "for real-time ad targeting, game developers can effectively\n", + "\n", + "leverage their player data to create more personalized and\n", + "\n", + "engaging experiences, increase revenue, and reduce churn.\n", + "\n", + "\n", + "-----\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Focus on player data:** Make player data the center of your\n", + "\n", + "targeting strategy by collecting and storing comprehensive\n", + "\n", + "information on player behavior, preferences, and\n", + "\n", + "demographics. 
Here, it’s critical to ensure the game code data trackers are properly implemented in order to collect this data (see Game Analytics section for detail).

- **Segment your audience:** Create audience segments based on common characteristics such as interests, behaviors, and preferences, and use these segments to deliver targeted ads.

- **Test and iterate:** Continuously test and iterate your targeting strategy to refine your audience segments and improve targeting accuracy.

- **Balance relevance and privacy:** Balance the need for relevant, personalized ads with players’ privacy by only collecting and using data that is necessary for targeting and obtaining player consent.

- **Monitor performance:** Regularly monitor the performance of your targeting strategy to ensure that it is delivering the desired results and make optimizations as needed.

- **Partner with the right ad platform:** Choose an ad platform that is well-suited to your needs and aligns with your goals, and work closely with them to ensure that your targeting strategy is delivering the best results.

# Operational use cases

### Anomaly Detection

The first thing to do is begin collecting the data (game server and client logs) from your project, then consume it into Databricks Delta so that a continuous anomaly detection model can run against it. Focus this on the key pieces of information you want to monitor. For live service games, this is going to be infrastructure and network-related metrics such as Ping and Server Health (# of clients connected, server uptime, server usage, CPU/RAM, # of sessions, time of sessions).

Once the model is ingesting data and is tuned for the metrics above, build out alerts or notifications based on those metrics hitting a threshold that you define as needing attention. From here, you can build out automated systems to mitigate those effects, such as migrating players to a different server, canceling matches, scaling infrastructure, or creating tickets for admins to review.

If you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.

**Tips / Best Practices**

- **Define the problem and objectives clearly:** Before implementing an anomaly detection solution, it is important to define the problem you are trying to solve and your specific objectives.
This will help ensure that\n", + "\n", + "you have the right data sources and use the appropriate\n", + "\n", + "algorithms to achieve your goals.\n", + "\n", + "- **Choose the right data sources:** To effectively detect\n", + "\n", + "anomalies, you need to have the right data sources.\n", + "\n", + "Consider data from player behavior, system performance,\n", + "\n", + "and network traffic, as well as any other data sources that\n", + "\n", + "are relevant to your problem and objectives.\n", + "\n", + "- **Clean and preprocess the data:** To ensure that the\n", + "\n", + "data you use for anomaly detection is accurate and\n", + "\n", + "meaningful, it is important to clean and preprocess the\n", + "\n", + "data. This includes removing any irrelevant or invalid data,\n", + "\n", + "handling missing values, and normalizing the data\n", + "\n", + "if necessary.\n", + "\n", + "- **Choose the right algorithms:** There are many algorithms\n", + "\n", + "that can be used for anomaly detection, including\n", + "\n", + "statistical methods, machine learning algorithms, and\n", + "\n", + "rule-based systems. Choose the algorithms that are best\n", + "\n", + "\n", + "-----\n", + "\n", + "suited to your data and problem, and that provide the\n", + "\n", + "right level of accuracy, speed, and scalability.\n", + "\n", + "- **Validate the results:** Before deploying the anomaly\n", + "\n", + "detection solution in production, it is important to validate\n", + "\n", + "the results by testing the solution on a small subset of\n", + "\n", + "data and comparing the results to expected outcomes.\n", + "\n", + "- **Monitor and update the solution:** Once the anomaly\n", + "\n", + "detection solution is deployed, it is important to monitor\n", + "\n", + "its performance and accuracy, and update the solution as\n", + "\n", + "needed. This may include retraining the algorithms, adding\n", + "\n", + "or removing data sources, and updating the parameters\n", + "\n", + "and thresholds used by the algorithms.\n", + "\n", + "Additionally, there are some key gotchas to look out for when\n", + "\n", + "implementing an anomaly detection solution.\n", + "\n", + "- **Avoid overfitting:** Overfitting occurs when the anomaly\n", + "\n", + "detection solution is too complex and learns the noise\n", + "\n", + "in the data rather than the underlying patterns. To avoid\n", + "\n", + "overfitting, it is important to choose algorithms that are\n", + "\n", + "appropriate for the size and complexity of the data, and to\n", + "\n", + "validate the results using a separate test dataset.\n", + "\n", + "- **False positive and false negative results:** False positive\n", + "\n", + "and false negative results can occur when the anomaly\n", + "\n", + "detection solution is not properly calibrated, or when\n", + "\n", + "the solution is applied to data that is significantly\n", + "\n", + "different from the training data. To minimize the risk of\n", + "\n", + "false positive and false negative results, it is important\n", + "\n", + "to validate the results using a separate test dataset, and\n", + "\n", + "to fine-tune the parameters and thresholds used by the\n", + "\n", + "algorithms as needed.\n", + "\n", + "- **Scalability:** Scalability can be a concern when\n", + "\n", + "implementing an anomaly detection solution, especially\n", + "\n", + "when dealing with large amounts of data. 
To ensure that\n", + "\n", + "the solution can scale to meet the demands of a growing\n", + "\n", + "player base, it is important to choose algorithms that\n", + "\n", + "are fast and scalable, and to deploy the solution using a\n", + "\n", + "scalable infrastructure.\n", + "\n", + "### Getting Started with Build Pipeline\n", + "\n", + "An operational goal game projects have is to make sure\n", + "\n", + "game project builds are generated, delivered quickly and\n", + "\n", + "efficiently to internal testing & external users.\n", + "\n", + "\n", + "A few of the key metrics and capabilities with analyzing your\n", + "\n", + "build pipelines are the below:\n", + "\n", + "- **Build time and speed:** This includes metrics such as\n", + "\n", + "the time it takes to create a build, number of builds, and\n", + "\n", + "compute spent.\n", + "\n", + "- **Build size and storage:** size of the builds, amount of\n", + "\n", + "storage, and network costs.\n", + "\n", + "- **Bug tracking and resolution:** This includes metrics such\n", + "\n", + "as the number of bugs reported, the time it takes to\n", + "\n", + "resolve them, and the number of bugs that are resolved in\n", + "\n", + "each build.\n", + "\n", + "- **Code quality and efficiency:** This includes metrics such\n", + "\n", + "as code complexity, code duplication, and the number of\n", + "\n", + "code lines written.\n", + "\n", + "- **Collaboration and communication:** Such as the number\n", + "\n", + "of code reviews, the number of team meetings, and the\n", + "\n", + "number of code commits.\n", + "\n", + "- **Advanced capabilities:** Such as Predicting real time build\n", + "\n", + "failure to reduce spend and combining build data with\n", + "\n", + "Crash Analytics (see below) to have “commit to build”\n", + "\n", + "visibility for accelerated bug fixing.\n", + "\n", + "Before you start implementing your build pipeline, it’s\n", + "\n", + "important to define your requirements. What are the key\n", + "\n", + "goals of your build pipeline? Choosing the right CI/CD tools is\n", + "\n", + "critical to the success of your build pipeline. There are many\n", + "\n", + "different tools available, including Jenkins, Azure Devops,\n", + "\n", + "Perforce, gitlab and more. When choosing a CI/CD tool,\n", + "\n", + "consider factors such as ease of use, scalability, and cost. In\n", + "\n", + "addition, consider the specific needs of your game project,\n", + "\n", + "and choose a tool that can meet those needs.\n", + "\n", + "The general recommendation is to look at automating your\n", + "\n", + "build process early. Once you’ve chosen your CI/CD tools, you\n", + "\n", + "can automate your build process by setting up a build server,\n", + "\n", + "configuring your CI/CD tool, and creating a script to build your\n", + "\n", + "game project. The build process should be automated as much\n", + "\n", + "as possible, and it should include steps to compile your code,\n", + "\n", + "run automated tests, and generate a build of your project.\n", + "\n", + "Once you have automated your build process, often the\n", + "\n", + "next step is to implement CD (Continuous Delivery). This\n", + "\n", + "involves automating the deployment of your game builds\n", + "\n", + "delivery to stakeholders, such as QA testers, beta testers, or\n", + "\n", + "end-users via publishing platforms. 
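Because build telemetry ends up being just another dataset, the monitoring described later in this section can be done with a few lines of PySpark; the sketch below assumes a hypothetical `devops.build_events` Delta table populated by your CI/CD tool's webhooks, with illustrative column names.

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# One row per build, written by the CI/CD tool's webhook or API export (hypothetical schema).
builds = spark.table("devops.build_events")

weekly_kpis = (
    builds
    .withColumn("week", F.date_trunc("week", F.col("started_at")))
    .groupBy("week", "branch")
    .agg(
        F.count("*").alias("build_count"),
        F.avg(F.col("duration_sec") / 60).alias("avg_build_minutes"),
        F.avg(F.col("succeeded").cast("double")).alias("success_rate"),
        F.sum("artifact_size_mb").alias("total_artifact_mb"),
    )
)

# Feed a Databricks SQL dashboard, or alert when success_rate dips below a threshold.
weekly_kpis.write.format("delta").mode("overwrite").saveAsTable("devops.build_kpis_weekly")
```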
CD can help ensure that\n", + "\n", + "stakeholders have access to the latest version of your game\n", + "\n", + "\n", + "-----\n", + "\n", + "as soon as possible, allowing them to provide feedback and\n", + "\n", + "help drive the development process forward.\n", + "\n", + "Finally, it’s important to monitor and measure your build\n", + "\n", + "pipeline to ensure that it’s working as expected. This can\n", + "\n", + "involve using tools such as Databricks Dashboards to\n", + "\n", + "visualize the status of your pipeline, or using metrics such\n", + "\n", + "as build times, test results, and deployment success rates\n", + "\n", + "to evaluate the performance of your pipeline. By monitoring\n", + "\n", + "and measuring your build pipeline, you can identify areas for\n", + "\n", + "improvement and make changes as needed to ensure that\n", + "\n", + "your pipeline continues to meet your needs.\n", + "\n", + "If you have any questions about how databricks can\n", + "\n", + "integrate into your devops solution, please don’t hesitate to\n", + "\n", + "[reach out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Seek to automate early and often:** Automate as much\n", + "\n", + "of the build process as possible, from checking code into\n", + "\n", + "version control to generating builds and distributing them\n", + "\n", + "to stakeholders. This can help reduce errors and save time,\n", + "\n", + "allowing game teams to focus on more high value tasks.\n", + "\n", + "\n", + "**Version control, version control, version control:** Use a\n", + "\n", + "version control system to manage the source code and\n", + "\n", + "other assets. This ensures that changes to the codebase\n", + "\n", + "are tracked and can be easily undone if needed.\n", + "\n", + "**Implement continuous integration and delivery:**\n", + "\n", + "Continuous integration (CI) involves automatically building\n", + "\n", + "and testing after code changes are checked into version\n", + "\n", + "control. With CI, new changes to the codebase do not\n", + "\n", + "break existing functionality. By automating the build\n", + "\n", + "process, CI helps to reduce errors and save time. CD, on\n", + "\n", + "the other hand, involves automatically delivering builds to\n", + "\n", + "stakeholders, such as QA testers, beta testers, or end-\n", + "\n", + "users, after they have passed the automated tests. 
By combining CI and CD, a video game project can ensure that builds are generated and delivered quickly and efficiently, without the need for manual intervention.

- **Build for scalability:** As your game project grows, you will need a build pipeline solution that is scalable and can handle the needs of your game team.

- **Integration with other tools:** Integrate the build pipeline solution with other tools and systems, such as issue tracking, testing, and deployment tools, to ensure a smooth and efficient workflow.

**Reference Architecture**

(Diagram: game infrastructure data flowing into Databricks SQL, with dashboards in Power BI and AWS QuickSight.)

-----

### Getting Started with Crash Analytics

Building a pipeline that provides a holistic view for crash analytics means dealing with data coming from multiple different sources, at different velocities, and joining the data together. The number of data sources depends on your game project’s publishing platforms; some may come from console providers such as Sony, Xbox, and Nintendo, or PC platforms like Steam, Epic Games Marketplace, GOG and many others.

**High level steps**

- Determine what platforms your game is running on and how to interface with them to collect data.

- **Collect crash data:** Implement crash reporting tools in your game to collect data on crashes. The source data may be delivered in varying formats such as JSON or CSV.

- **Load crash data into Databricks:** Use Databricks’ data ingestion tools to load the crash data into your workspace. This could involve using Databricks’ built-in data source connectors, or programmatically ingesting files to load the data.

- **Transform and clean the crash data:** Use Databricks’ data processing and transformation tools to clean and prepare the crash data for analysis. This could involve using Databricks’ capabilities like DLT, or using SQL to perform custom transformations.

- **Visualize crash data:** Use Databricks’ dashboarding tools to create visualizations that help you understand the patterns and trends in your crash data. This could involve using Databricks’ built-in visualization tools, or integrating with external visualization tools like Tableau or Power BI.

- **Analyze crash data:** Use Databricks’ machine learning and statistical analysis tools to identify the root causes of crashes. This could involve using Spark MLlib or many of the popular tools to build machine learning models, or using SQL to perform custom analyses.

- **Monitor and refine your pipeline:** Regularly review your pipeline to ensure that it remains relevant and useful. Refine your pipeline as necessary to reflect changes in your crash data or your goals.

If you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.
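To ground the load step above, here is a minimal Auto Loader sketch that incrementally ingests JSON crash reports from a cloud storage landing zone into a Delta table; the paths and table names are illustrative, not part of any accelerator.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Incrementally discover and read JSON crash reports dropped by the platforms' reporting tools.
crash_stream = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/ops/crash/_schema")  # hypothetical path
    .load("/Volumes/ops/crash/raw/")                                    # hypothetical landing zone
)

# Land the raw reports in a bronze Delta table; availableNow runs this as an incremental batch.
(
    crash_stream.writeStream
    .option("checkpointLocation", "/Volumes/ops/crash/_checkpoints/bronze")
    .trigger(availableNow=True)
    .toTable("ops.crash_reports_bronze")
)
```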
-----

**Tips / Best Practices**

- **Automated collection and aggregation of crash reports:** Collecting crash reports should be an automated process that is integrated into the output of the build pipeline for the game. The crash reports should be automatically aggregated and made available for analysis in near real-time.

- **Clear reporting and prioritization of issues:** The solution should provide clear reporting on the most common issues and allow game developers to prioritize fixing the most impactful problems first.

- **Integration with other analytics tools:** The crash analytics solution should integrate with other analytics tools, such as player behavior tracking, to provide a more complete picture of how crashes are impacting the player experience.

- **Flexibility and scalability:** As the game grows, the solution should be able to scale to accommodate an increasing number of players and crashes.

- **Data privacy and security:** It’s important to consider data privacy and security when implementing a crash analytics solution. This may involve implementing measures to anonymize crash reports, or taking steps to ensure that sensitive information is not included in the reports.

Additionally, there are some key gotchas to look out for when implementing a crash analytics solution.

- **Data privacy and security:** Ensure that crash reports do not contain sensitive information that could be used to identify individual players.

- **Scalability:** As the number of players and crashes increases, it may become difficult to manage and analyze the growing volume of data.

- **Integration with other tools:** Be aware when integrating crash analytics with other tools and systems, especially if the tools use different data formats or data structures.

- **Prioritization of issues:** Determine which crashes are the most impactful and prioritize fixes accordingly. This can be a complex process, especially if there are a large number of different crash types and causes.
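For the prioritization point above, a small aggregation over the ingested crash data is often enough to rank crash signatures by player impact; the sketch below assumes the hypothetical bronze table and column names from the earlier ingestion example.

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

crashes = spark.table("ops.crash_reports_bronze")  # hypothetical table from the ingestion sketch

# Rank crash signatures by how many distinct players they affect, then by raw crash volume.
top_signatures = (
    crashes
    .groupBy("build_version", "exception_signature")  # illustrative column names
    .agg(
        F.count("*").alias("crash_count"),
        F.countDistinct("player_id").alias("players_affected"),
    )
    .orderBy(F.desc("players_affected"), F.desc("crash_count"))
)

top_signatures.show(20, truncate=False)
```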
**Reference Architecture**

(Diagram: Databricks SQL serving dashboards in Power BI and AWS QuickSight.)

-----
### Executive Guide\n", + "\n", + "# Transform and Scale Your Organization With Data and AI\n", + "\n", + "#### A guide for CIOs, CDOs, and\n", + " data and AI executives\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "**A U T H O R :**\n", + "\n", + "**Chris D’Agostino**\n", + "\n", + "Global Field CTO\n", + "\n", + "Databricks\n", + "\n", + "**E D I T O R S :**\n", + "\n", + "Manveer Sahota\n", + "\n", + "\n", + "**C H A P T E R 1 :** \u0007 **Executive Summary** 3\n", + "\n", + "**C H A P T E R 2 :** \u0007 **Define the Strategy** 6\n", + "\n", + "**1.** Establish the goals and business value 8\n", + "\n", + "**2.** Identify and prioritize use cases 19\n", + "\n", + "**3.** Build successful data teams 22\n", + "\n", + "**4.** Deploy a modern data stack 28\n", + "\n", + "**5.** Improve data governance and compliance 36\n", + "\n", + "**6.** Democratize access to quality data 41\n", + "\n", + "**7.** Dramatically increase productivity of your workforce 47\n", + "\n", + "**8.** Make informed build vs. buy decisions 52\n", + "\n", + "**9.** Allocate, monitor and optimize costs 55\n", + "\n", + "**10.** Move to production and scale adoption 58\n", + "\n", + "\n", + "Jessica Barbieri\n", + "\n", + "\n", + "Toby Balfre\n", + "\n", + "\n", + "**C H A P T E R 3 :** **Conclusion** \u0007 63\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "## Executive Summary\n", + "\n", + "Data and AI leaders are faced with the challenge\n", + "\n", + "of future-proofing their architecture and platform\n", + "\n", + "investments. The Lakehouse implementation from\n", + "\n", + "Databricks combines the best features of EDWs\n", + "\n", + "and data lakes by enabling all their workloads using\n", + "\n", + "open source and open standards — avoiding the\n", + "\n", + "vendor lock-in, black box design and proprietary\n", + "\n", + "data formats of other cloud vendors.\n", + "\n", + "\n", + "It’s not surprising that many industry experts say data is the most valuable resource in the modern\n", + "\n", + "economy — some even go so far as to describe it as the “new oil.” But at Databricks, we think of data as\n", + "\n", + "water. Its core compound never changes, and it can be transformed to whatever use case is desired,\n", + "\n", + "with the ability to get it back to its original form. Furthermore, just as water is essential to life, data is now\n", + "\n", + "essential to survival, competitive differentiation and innovation for every business. Clearly, the impact and\n", + "\n", + "importance of data are growing exponentially in both our professional and personal lives, while artificial\n", + "\n", + "intelligence (AI) is being infused in more of our daily digital interactions. 
The explosion in data availability\n", + "\n", + "over the last decade and the forecast for growth at a compounded [annual growth rate (CAGR) of 23%](https://www.google.com/url?q=https://www.idc.com/getdoc.jsp?containerId%3DprUS47560321&sa=D&source=docs&ust=1651117260200496&usg=AOvVaw3jdZ_6YHlXGQlUMJK8ULux) over\n", + "\n", + "2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n", + "\n", + "(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n", + "\n", + "governance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n", + "\n", + "Every organization is working to improve business outcomes while effectively managing a variety of risks —\n", + "\n", + "including economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n", + "\n", + "Your organization’s data and the systems that process it play a critical role in not only enabling your financial\n", + "\n", + "goals but also in minimizing these seven key business risks.\n", + "\n", + "Businesses have realized that their legacy information technology (IT) platforms are not able to scale and\n", + "\n", + "meet the increasing demands for better data analytics. As a result, they are looking to transform how their\n", + "\n", + "organizations use and process data. Successful data transformation initiatives for data, analytics and AI\n", + "\n", + "involve not only the design of hardware and software systems but also the alignment of people, processes\n", + "\n", + "and platforms. These initiatives always require a major financial investment and, therefore, need to yield a\n", + "\n", + "significant return on investment (ROI) — one that starts in months, not years.\n", + "\n", + "To guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\n", + "\n", + "Despite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\n", + "\n", + "to deliver on their data strategy — including how to deploy a modern data architecture, leverage data\n", + "\n", + "efficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\n", + "\n", + "identify and execute on AI opportunities.\n", + "\n", + "\n", + "-----\n", + "\n", + "To successfully lead data and AI transformation initiatives, organizations need to develop and execute\n", + "\n", + "a comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\n", + "\n", + "full potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\n", + "\n", + "organizations have the option of moving away from closed, proprietary systems offered by a variety\n", + "\n", + "of cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\n", + "\n", + "industry standards.\n", + "\n", + "At Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\n", + "\n", + "we’ve hired industry experts and thought leaders to help organizations better understand the steps involved\n", + "\n", + "in successful digital transformation initiatives. 
We are the first vendor to propose the data lakehouse architecture, which decouples data storage from compute while providing the best price/performance metrics for all your data workloads — including data warehousing. We have captured the lessons learned and summarized them in this series of Executive Guides — which are designed to serve as blueprints for CIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation initiatives for data, analytics and AI using a _modern data stack_. Databricks is the first company to deliver a unified data platform that realizes the data lakehouse architecture and enables the data personas in your organization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as shown in Figure 1.

**Figure 1:** The Databricks Lakehouse Platform. (Diagram: Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML workloads on top of Unity Catalog for fine-grained governance of data and AI, Delta Lake for data reliability and performance, and the cloud data lake holding all structured and unstructured data.)

-----

**The lakehouse architecture benefits organizations in several ways:**

**1.** It leverages low-cost cloud object stores to store ALL enterprise data.

**2.** It provides the ability to run different data workloads efficiently and in a cost-effective manner.

**3.** It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.

Our intention is to present key considerations and equip you with the knowledge to ask informed questions, make the most critical decisions early in the process, and develop the comprehensive strategy that most organizations lack.

In addition, we have created an easy-to-follow Data and AI Maturity Model and provided a comprehensive professional services offering that organizations can leverage to measure their readiness, reskill their staff and track progress as they embark on their data transformation initiative.

-----

**CHAPTER 2:**
## Define the Strategy

The most critical step to enable data, analytics and AI at scale is to develop a comprehensive and executable strategy for how your organization will leverage people, processes and platforms to drive measurable business results against your corporate priorities. The strategy serves as a set of principles that every member of your organization can refer to when making decisions.
The strategy should cover the roles and\n", + "\n", + "responsibilities of teams within your organization for how you capture, store, curate and process data to run\n", + "\n", + "your business — including the internal and external resources (labor and budget) needed to be successful.\n", + "\n", + "\n", + "Establish the\n", + "goals and\n", + "business value\n", + "\n", + "\n", + "Build\n", + "successful\n", + "data teams\n", + "\n", + "\n", + "Ease data\n", + "governance and\n", + "compliance\n", + "\n", + "\n", + "Simplify\n", + "the user\n", + "experience\n", + "\n", + "\n", + "Allocate,\n", + "monitor and\n", + "optimize costs\n", + "\n", + "\n", + "Identify and\n", + "prioritize\n", + "use cases\n", + "\n", + "\n", + "Deploy a modern\n", + "data architecture\n", + "\n", + "\n", + "Democratize\n", + "access to\n", + "quality data\n", + "\n", + "\n", + "Make informed\n", + "build vs. buy\n", + "decisions\n", + "\n", + "\n", + "Move to\n", + "production and\n", + "drive adoption\n", + "\n", + "\n", + "**Figure 2:**\n", + "The 10 steps to a winning data and AI strategy\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Here are 10 key considerations\n", + "\n", + "**1.** \u0007Secure buy-in and alignment on the overall business goals, timeline and appetite for the initiative.\n", + "\n", + "**2.** \u0007Identify, evaluate and prioritize use cases that actually provide a significant ROI.\n", + "\n", + "**3.** \u0007Create high-performing teams and empower your business analyst, data scientist, machine learning\n", + "\n", + "and data engineering talent.\n", + "\n", + "**4.** \u0007Future-proof your technology investment with a modern data architecture.\n", + "\n", + "**5.** \u0007Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California\n", + "\n", + "Consumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n", + "\n", + "**6.** \u0007Implement needed policies, procedures and technology to guarantee data quality and enable secure\n", + "\n", + "data access and the sharing of all your data across the organization.\n", + "\n", + "**7.** \u0007Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n", + "\n", + "**8.** \u0007Make informed build vs. buy decisions and ensure you are focusing your limited resources on the most\n", + "\n", + "important problems.\n", + "\n", + "**9.** \u0007Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n", + "\n", + "**10.** \u0007Codify best practices for moving into production and how to measure progress, rate of adoption and\n", + "\n", + "user satisfaction.\n", + "\n", + "The strategy should clearly answer these 10 topics and more, and should be captured in a living document,\n", + "\n", + "owned and governed by the CDO and made available for everyone in the organization to review and provide\n", + "\n", + "feedback on. 
The strategy will evolve based on the changing market/political conditions, evolving business, the technology landscape or a combination of any of these — but it should serve as the North Star for how you will navigate the many decisions and trade-offs that you will need to make over the course of the transformation.\n", + "\n", + "This guide takes a stepwise approach to addressing each of these 10 topics.\n", + "\n", + "-----\n", + "\n", + "Studies have shown that data scientists spend 80% of their time collecting and compiling data sets and only 20% of their time developing insights and algorithms. Organizations that are able to invert these numbers benefit in two ways — happier employees and improved time to market for use cases. These employers create more favorable working environments and lower the risk of burnout and the resulting regrettable attrition.\n", + "\n", + "#### 1. Establish the goals and business value\n", + "\n", + "Most organizations on a data, analytics and AI journey establish a set of goals for the resulting investment. The goals generally fall into one of three categories:\n", + "\n", + "**1.** **Business outcomes**\n", + "\n", + "**2.** **People**\n", + "\n", + "**3.** **Technology**\n", + "\n", + "In terms of business outcomes, organizations need to adapt more quickly to market opportunities and emerging risks, and their legacy-based information systems make that difficult to achieve. As a result, business leaders see the digital transformation as an opportunity to build a new technology foundation from which to run their business and increase business value. One that is more agile, scalable, secure and easier to use — making the organization better positioned to adapt, innovate and thrive in the modern and dynamic economy.\n", + "\n", + "For organizations today, people are one of their most valuable assets — you cannot succeed in data, analytics and AI without them. The battle for top talent is as fierce as ever, and the way that people work impacts your ability to hire and retain the skills you need to succeed. It is important to make sure that employees work in a frictionless data environment, to the extent possible, so they feel productive each day and can do their best work.\n", + "\n", + "Finally, from a technology perspective, organizations have grown tired of the high costs associated with complex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The industry trend is to move away from large capital expenditures (capex) to pay for network and server capacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex) approach. Your data analytics environment should support this trend as well — using open standards, low-cost storage and on-demand compute that efficiently spins up to perform data workloads and spins down once they are complete.\n", + "\n", + "-----\n", + "\n", + "**Executive buy-in and support**\n", + "\n", + "Large organizations are difficult to change — but it’s not impossible.
In order to be successful, you need to have unwavering buy-in and support from the highest levels of management — including the CEO and board of directors. With this support, you have the leverage you need to develop the strategy, decide on an architecture and implement a solution that can truly change the way your business is run. Without it, you have a very expensive science project that has little hope of succeeding. Why? Because the majority of people in your organization are busy doing their day jobs. The added work to support the initiative must be offset by a clear articulation of the resulting benefits — not only for the business but for the personnel within it. The transformation should result in a positive change to how people do their jobs on a daily basis.\n", + "\n", + "Transformation for data, analytics and AI needs to be a company-wide initiative that has the support from all the leaders. Even if the approach is to enable data and AI one business unit (BU) at a time, the plan needs to be something that is fully embraced in order to succeed. Ideally, the senior-most executives serve as vocal proponents.\n", + "\n", + "-----\n", + "\n", + "**Evolve to an AI-first company — not just a data-first company**\n", + "\n", + "Data and AI transformations should truly transform the way organizations use data, not just evolve it. For decades, businesses have operated using traditional business processes and leveraged Structured Query Language (SQL) and business intelligence (BI) tools to query, manipulate and report on a subset of their data. There are five major challenges with this approach:\n", + "\n", + "**1.** A true self-assessment of where your organization is on the AI maturity curve. Most organizations will use pockets of success with analytics and AI to move higher up the maturity curve, but in reality the ability to replicate and scale the results is nearly impossible.\n", + "\n", + "#### Tech leaders are to the right of the Data Maturity Curve\n", + "\n", + "The maturity curve runs from Clean Data, Reports and Ad Hoc Queries (“What happened?”) through Data Exploration and Predictive Modeling (“What will happen?”) to Prescriptive Analytics (“How should we respond?”) and Automated Decision-Making (“Automatically make the best decision”), moving from hindsight to foresight as Data and AI Maturity increases.\n", + "\n", + "**Figure 3:**\n", + "The Data Maturity Curve\n", + "\n", + "-----\n", + "\n", + "**2.** Data volumes and types have outgrown even the most modern approaches to SQL-based data processing.\n", + "\n", + "**3.** These large data volumes also make it nearly impossible for your workforce to continue to programmatically state, in a priority manner, how data insights can be achieved or how the business should react to changing data.\n", + "\n", + "**4.** Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the number of people needed to respond to every piece of data flowing into your environment.
Machines\n", + "\n", + "scale, people do not.\n", + "\n", + "**5.** \u0007Advances in machine learning and AI have simplified the steps and reduced the expertise needed to\n", + "\n", + "gain game-changing insights. For these reasons, plus many others, the organizations that thrive in the\n", + "\n", + "21st century will do so based on their ability to leverage all the data at their disposal. Traditional ways\n", + "\n", + "of processing and managing data will not work. Using ML and AI will empower your workforce to\n", + "\n", + "leverage data to make better decisions for managing risk, helping your organization succeed in the\n", + "\n", + "modern economy.\n", + "\n", + "**Go “all in” on the cloud**\n", + "\n", + "The COVID-19 pandemic has caused rapid adoption of cloud-based solutions for collaboration and\n", + "\n", + "videoconferencing — and organizations are now using this time to reevaluate their use of on-premises and\n", + "\n", + "cloud-based services. The cloud vendors provide many benefits to organizations, including Infrastructure\n", + "\n", + "as a Service (IaaS), Platform as a Service (PaaS) and Software as a Service (SaaS) solutions. These benefits,\n", + "\n", + "especially when combined with the use of open source software (OSS), increase the speed at which\n", + "\n", + "organizations can use the latest technologies while also reducing their capex in these budget-conscious times.\n", + "\n", + "For AWS, Microsoft, Google and other cloud providers, the game is about data acquisition. The more\n", + "\n", + "corporate data that resides in a specific cloud, the more sticky the customer is to the vendor. At the same\n", + "\n", + "time, multicloud support is both a selling point and an aspirational goal for many organizations. Companies\n", + "\n", + "are well aware of vendor lock-in and want to abstract their applications so they can be moved across\n", + "\n", + "clouds if there is a compelling business reason.\n", + "\n", + "\n", + "-----\n", + "\n", + "Approaching your technology choices with a multicloud point of view gives the organization more sovereignty\n", + "\n", + "over the data — flexibility to run workloads anywhere, ease of integration when acquiring businesses that\n", + "\n", + "run on different cloud providers and simplified compliance with emerging regulations that may require\n", + "\n", + "companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n", + "\n", + "As a result, data portability and the ability to run workloads on different cloud providers are becoming\n", + "\n", + "increasingly important.\n", + "\n", + "**Modernize business applications**\n", + "\n", + "As organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n", + "\n", + "approach. The majority of on-premises applications are not built with the cloud in mind. They usually\n", + "\n", + "differ in the way that they handle security, resiliency, scalability and failover. Their application designs\n", + "\n", + "often store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n", + "\n", + "CCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n", + "\n", + "therefore, tightly coupled. 
In contrast, modern cloud applications are modular in design and use RESTful web\n", + "\n", + "services and APIs to easily provide access to an application’s functionality.\n", + "\n", + "Cloud-based architectures, commodity databases and software application development frameworks make\n", + "\n", + "it easier for developers to build scalable, secure end-to-end applications to run all your internal business\n", + "\n", + "processes. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n", + "\n", + "a backing database) has become straightforward with the latest tooling available to your application\n", + "\n", + "development teams.\n", + "\n", + "As a first step, organizations should inventory their business-critical applications, prioritize them based\n", + "\n", + "on business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n", + "\n", + "applications that generate and store a significant amount of the data consumed within an organization. Using\n", + "\n", + "a consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n", + "\n", + "\n", + "“We are on an amazing journey. Being among\n", + "\n", + "the fastest-growing enterprise software cloud\n", + "\n", + "companies on record was unimaginable when\n", + "\n", + "we started Databricks. To get here, we’ve stayed\n", + "\n", + "focused on the three big bets we made when\n", + "\n", + "founding the company — cloud, open source\n", + "\n", + "and machine learning. Fast-forward seven years,\n", + "\n", + "thousands of data teams around the globe are\n", + "\n", + "working better together on Databricks.”\n", + "\n", + "**Ali Ghodsi**\n", + "\n", + "Co-founder and CEO\n", + "\n", + "Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "The next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n", + "\n", + "A good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n", + "\n", + "other applications within your environment to store copies of the data — unless absolutely necessary for\n", + "\n", + "performance reasons. In this case, it is best to “cache” the data for use in the non-SOR application and sync\n", + "\n", + "the data from the actual SOR.\n", + "\n", + "Data from these SORs should be made available in three ways:\n", + "\n", + "**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n", + "\n", + "**2.** \u0007Ensure that copies of the data land in the data lake.\n", + "\n", + "**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n", + "\n", + "consumption by downstream applications.\n", + "\n", + "**Move toward real-time decisioning**\n", + "\n", + "The value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n", + "\n", + "and the second is to view data as an individual event. This so-called “time value of data” is an important\n", + "\n", + "concept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n", + "\n", + "the same data platform.\n", + "\n", + "On the one hand, data in aggregate becomes more valuable over time — as you collect more of it. 
The aggregate data provides the ability to look back in time and see the complete history of an aspect of your business and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a newly created or arriving data event gives you the opportunity to make decisions — in the moment — that can positively affect your ability to reduce risk, better service your customers or lower your operating costs.\n", + "\n", + "The goal is to act immediately — with reliability and accuracy — upon the arrival of a new streaming event. This “time value of data” is shown in Figure 4.\n", + "\n", + "-----\n", + "\n", + "For example, real-time processing of clickstream data from your customer-facing mobile application can indicate when the customer is having trouble and may need to call into your call center. This insight gives you the opportunity to interject with a digital assistant or to pass on “just-in-time” information to your call center agents — improving the customer experience and lowering customer churn.\n", + "\n", + "Data, analytics and AI rely on the “time value of data” — a powerful concept that allows you to train your machine learning models using historical data and provides you with the ability to make real-time decisions as new events take place. For example, credit card fraud models can use deep historical data about a given customer’s buying patterns (location, day of week, time of day, retailer, average purchase amount, etc.) to build rich models that are then executed for each new credit card transaction. This real-time execution, combined with historical data, enables the best possible customer experience.\n", + "\n", + "#### Time Value of Data\n", + "\n", + "The value of an individual data record is very high once created but decreases over time, while the value of data records in aggregate increases over time; the horizontal axis runs from real-time decisioning and real-time analysis to trend analysis and model training.\n", + "\n", + "**Figure 4:**\n", + "Time Value of Data\n", + "\n", + "The Databricks Lakehouse Platform allows you to combine real-time streaming and batch processing using one architecture and a consistent set of programming APIs.\n", + "\n", + "-----\n", + "\n", + "**Land** **_all_** **data in a data lake**\n", + "\n", + "In order to effectively drive data, analytics and AI adoption, relevant data needs to be made available to the user as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to access. Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data warehouse, with predefined schemas that are designed to allow you to ask very specific questions about that data only. What do you do when you want to ask a different question?
To further complicate matters,\n", + "\n", + "how do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores?\n", + "\n", + "How do you find new insights as quickly as possible?\n", + "\n", + "The overall goal is to gain insights from the data as quickly as possible — which can happen at any step\n", + "\n", + "along the data pipeline — including raw, refined and curated data states.\n", + "\n", + "This phenomenon has led to the concept known as the four Vs of data — specifically, _volume_ , _velocity_ ,\n", + "\n", + "_variety_ and _veracity_ . Data-, analytics- and AI-driven organizations need to be able to store and process\n", + "\n", + "all their data, regardless of size, shape or speed. In addition, data lineage and provenance are critical to\n", + "\n", + "knowing whether or not you can trust the data.\n", + "\n", + "**Change the way people work**\n", + "\n", + "When done correctly, organizations get value from data, analytics and AI in three ways — infrastructure\n", + "\n", + "savings, productivity gains and business-impacting use cases. Productivity gains require a true focus on\n", + "\n", + "minimizing the number of steps needed to produce results with data. This can be accomplished by:\n", + "\n", + "**1.** \u0007 Making data more accessible and ensuring it can be trusted\n", + "\n", + "**2.** Minimizing the number of tools/systems needed to perform work\n", + "\n", + "**3.** Creating a flywheel effect by leveraging the work of others\n", + "\n", + "\n", + "“We believe that the data lakehouse architecture\n", + "\n", + "presents an opportunity comparable to the one\n", + "\n", + "we saw during early years of the data warehouse\n", + "\n", + "market. The unique ability of the lakehouse to\n", + "\n", + "manage data in an open environment, blend all\n", + "\n", + "varieties of data from all parts of the enterprise and\n", + "\n", + "combine the data science focus of the data lake\n", + "\n", + "with the end-user analytics of the data warehouse\n", + "\n", + "will unlock incredible value for organizations.”\n", + "\n", + "**Bill Inmon**\n", + "\n", + "The father of the data warehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "In large organizations, it’s understandable why application and data silos are prevalent. Each business unit\n", + "\n", + "is laser-focused on achieving their goals, and the use of information technology is viewed as an enabler.\n", + "\n", + "Systems and applications get built over time to satisfy specific needs within a line of business. As a result,\n", + "\n", + "it’s not surprising to learn that employees must jump through a large number of hoops to get access to the\n", + "\n", + "data they need to do their jobs. It should be as simple as getting your identity and PC.\n", + "\n", + "With Databricks, users can collaborate and perform\n", + "\n", + "\n", + "A primary goal of your data and AI transformation should be to focus on improving the user experience —\n", + "\n", + "in other words, improving how your entire organization interacts with data. Data must be easily discoverable\n", + "\n", + "with default access to users based on their role(s) — with a simple process to compliantly request access to\n", + "\n", + "data sets that are currently restricted. 
The tooling you make available should satisfy the principal needs of\n", + "\n", + "the various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n", + "\n", + "Finally, the results of the work performed by a user or system upstream should be made available to users\n", + "\n", + "and systems downstream as “data assets” that can drive business value.\n", + "\n", + "Organizations that maximize the productivity of their workforce and enable employees to do their best work\n", + "\n", + "under optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n", + "\n", + "**Minimize time in the “seam”**\n", + "\n", + "As you begin your data transformation, it is important to know that the longer it takes, the more risk and\n", + "\n", + "cost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n", + "\n", + "to a modern data stack will require you to operate in two environments simultaneously, the old and the new,\n", + "\n", + "for some period of time. This will have a series of momentary adverse effects on your business:\n", + "\n", + "\u0007It will increase your operational costs substantially, as you will run two sets of infrastructure\n", + "\n", + "\u0007It will increase your data governance risk, since you will have multiple copies of your data sitting in two\n", + "\n", + "very different ecosystems\n", + "\n", + "\n", + "their work more efficiently, regardless of their\n", + "\n", + "persona or role. The user experience is designed\n", + "\n", + "to support the workloads of data analysts, SQL\n", + "\n", + "developers, data engineers, data scientists and\n", + "\n", + "machine learning professionals.\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007It increases the cyberattack footprint and vectors, as the platforms will likely have very different security\n", + "\n", + "models and cyber defenses\n", + "\n", + "\u0007It will cause strain on your IT workforce due to the challenges of running multiple environments\n", + "\n", + "\u0007It will require precise communications to ensure that your business partners know which environment to\n", + "\n", + "use and for what data workloads\n", + "\n", + "To mitigate some of the strain on the IT workforce, some organizations hire staff augmentation firms to\n", + "\n", + "“keep the lights on” for the legacy systems while the new systems are being implemented and rolled out.\n", + "\n", + "It’s important to remember this is a critical but short-lived experience for business continuity.\n", + "\n", + "**Shut down legacy platforms**\n", + "\n", + "In keeping with the goal of minimizing time in the seam, the project plan and timeline must include the\n", + "\n", + "steps and sequencing for shutting down legacy platforms. For example, many companies migrate their on-\n", + "\n", + "premises Apache Hadoop data lake to a cloud-based object store. 
The approach for shutting down the on-\n", + "\n", + "premises Hadoop system is generally as follows:\n", + "\n", + "**1.** \u0007Identify the stakeholders (business and IT) who own the jobs that run in the Hadoop environment.\n", + "\n", + "**2.** \u0007Declare that no changes can be made to the Hadoop environment — with the exception of emergency\n", + "\n", + "fixes or absolutely critical new business use cases.\n", + "\n", + "**3.** \u0007Inventory the data flow paths that feed data into the Hadoop environment.\n", + "\n", + "**4.** \u0007Identify the source systems that feed the data.\n", + "\n", + "**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n", + "\n", + "**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n", + "\n", + "**7.** \u0007Determine the downstream consumers of the output from the jobs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n", + "\n", + "**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n", + "\n", + "architecture.\n", + "\n", + "**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n", + "\n", + "working smoothly.\n", + "\n", + "**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. Look for smoke.\n", + "\n", + "**12.** \u0007Rinse and repeat — until all jobs are migrated.\n", + "\n", + "**13.** \u0007Shut down the Hadoop cluster.\n", + "\n", + "A similar model can also be applied to legacy on-premises enterprise data warehouses.\n", + "\n", + "You can follow the same process for other legacy systems in your environment. Some of these systems\n", + "\n", + "may be more complex and require the participation of more stakeholders to identify the fastest way to\n", + "\n", + "rationalize the data and processes. It is important, however, to make sure that the organization has the\n", + "\n", + "fortitude to hold the line when there is pressure to make changes to the legacy environments or extend\n", + "\n", + "their lifespan. Setting firm dates for when these legacy systems will be retired will serve as a forcing function\n", + "\n", + "for teams when they onboard to the new modern data architecture. Having the executive buy-in from page\n", + "\n", + "9 plays a crucial role in seeing the shutdown of legacy platforms through.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 2. Identify and prioritize use cases\n", + "\n", + "An important next step in enabling data, analytics and AI to transform your business is to identify use cases\n", + "\n", + "that drive business value — while prioritizing the ones that are achievable under the current conditions\n", + "\n", + "(people, processes, data and infrastructure). 
There are typically hundreds of use cases within an organization\n", + "\n", + "that could benefit from better data and AI — but not all use cases are of equal importance or feasibility.\n", + "\n", + "Leaders require a systematic approach for identifying, evaluating, prioritizing and implementing use cases.\n", + "\n", + "**Establish the list of potential use cases**\n", + "\n", + "The first step is to ideate by bringing together various stakeholders from across the organization and\n", + "\n", + "understand the overall business drivers — especially those that are monitored by the CEO and board of\n", + "\n", + "directors. The second step is to identify use case opportunities in collaboration with business stakeholders,\n", + "\n", + "and understand the business processes and the data required to implement the use case. After steps one and\n", + "\n", + "two, the next step is to prioritize these cases by calculating the expected ROI. To avoid this becoming a pet\n", + "\n", + "project within the data/IT teams, it’s important to have a line of business champion at the executive level.\n", + "\n", + "There needs to be a balance between use cases that are complex and ones that are considered low-\n", + "\n", + "hanging fruit. For example, determining if a web visitor is an existing or net new customer requires a fairly\n", + "\n", + "straightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n", + "\n", + "given individual or household. However, developing a sophisticated credit card fraud model that takes into\n", + "\n", + "account geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n", + "\n", + "to perform the analytics.\n", + "\n", + "In terms of performance, thought should be given to the speed at which the use case must execute. In\n", + "\n", + "general, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n", + "\n", + "cases into three categories:\n", + "\n", + "**1.** Sub-second response\n", + "\n", + "**2.** Multi-second response\n", + "\n", + "**3.** Multi-minute response\n", + "\n", + "\n", + "-----\n", + "\n", + "Being pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n", + "\n", + "engineering the design and infrastructure.\n", + "\n", + "**Thinking in terms of “data assets”**\n", + "\n", + "Machine learning algorithms require data — data that is readily available, of high quality and relevant — to\n", + "\n", + "perform the experiments, train the models, and then execute the model when it is deployed to production.\n", + "\n", + "The quality and veracity of the data used to perform these machine learning steps are key to deploying\n", + "\n", + "models into production that produce a tangible ROI.\n", + "\n", + "It is critical to understand what steps are needed in order to make the data available for a given use case.\n", + "\n", + "One point to consider is to prioritize use cases that make use of similar or adjacent data. If your engineering\n", + "\n", + "teams need to perform work to make data available for one use case, then look for opportunities to have the\n", + "\n", + "engineers do incremental work in order to surface data for adjacent use cases.\n", + "\n", + "Mature data and AI companies embrace the concept of “data assets” or “data products” to indicate\n", + "\n", + "the importance of adopting a design strategy and data asset roadmap for the organization. 
Taking this\n", + "\n", + "approach helps stakeholders avoid fit-for-purpose data sets that drive only a single use case — and raise\n", + "\n", + "the level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n", + "\n", + "roadmap helps data source owners understand the priority and complexity of the data assets that need to\n", + "\n", + "be created. Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n", + "\n", + "influences the design of business applications and other systems within the organization.\n", + "\n", + "**Determine the highest impact/priority**\n", + "\n", + "As shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n", + "\n", + "account three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n", + "\n", + "whether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n", + "\n", + "reduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n", + "\n", + "data science talent readily available, to implement the use case. The ROI score indicates whether or not the\n", + "\n", + "organization can easily measure the impact to the P/L.\n", + "\n", + "\n", + "-----\n", + "\n", + "|= Scored by business stakeholders = Scored by technology stakeholders|Col2|SCORING GUIDELINES (RELATIVE SCORING)|Col4|Col5|\n", + "|---|---|---|---|---|\n", + "|||1 = LOW SCORE, DO LATER|5 = AVERAGE, NICE TO HAVE|10 = HIGH, MUST HAVE|\n", + "|Strategic Importance Score How important is it to business success?|Business Alignment|Not required for any corporate goals|Not required for immediate corporate goals|Required for immediate corporate goals|\n", + "||Business Driver|Does not drive growth/profitability (P&L) or competitiveness|Could drive some growth/profitability (P&L)|Significantly drives growth/profitability (P&L) and competitiveness|\n", + "||IT Foundation|No BI/IT dependencies|BI/IT best practice|BI/IT foundational element|\n", + "|Feasibility Score What is the current data and AI readiness?|Data Access and Trust Adjusting Based on Availability|Low awareness of available data (internal and external) or the problems it can solve|Some ingestion and exploration of large-scale data is possible|Large-scale data is available for exploration in the cloud|\n", + "||Delivery (Data Engineers, Data Scientists, Data Analysts)|Limited in-house resources|Hiring plan for data science and engineering resources, few available in-house|Scaled data science, engineering, cloud and deployment organization|\n", + "||Architecture|Current thinking on architecture resembles on-prem traditional data warehousing solution with batch processes rather than a data lakehouse approach|Architecture has been built and tested, some use cases are underway with multiple data sources now available in the cloud|The platform is utilized at scale across the business and is able to evolve to meet the demands of new business lines and services driven by data|\n", + "|ROI Score How tangible and large is the ROI?|ROI Potential|Mostly productivity gains, “soft intangible benefits”|Some P&L impact, not easily tangible|Significant P&L impact, “hard measured benefits”|\n", + "\n", + "\n", + "**Figure 5:**\n", + "Methodology for scoring use cases\n", + "**Ensure business and technology leadership alignment**\n", + "\n", + "Prioritizing use cases requires striking a 
balance between offensive- and defensive-oriented use cases.\n", + "\n", + "It is important for executives to evaluate use cases in terms of opportunity growth (offensive) and risk\n", + "\n", + "reduction (defensive). For example, data governance and compliance use cases should take priority\n", + "\n", + "over offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n", + "\n", + "acquisition of a new customer.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Professional Services team can\n", + "\n", + "help customers identify revenue-generating and\n", + "\n", + "cost-saving opportunities for data and AI use cases\n", + "\n", + "that provide a significant ROI when adopting the\n", + "\n", + "\n", + "#### 3. Build successful data teams\n", + "\n", + "In order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n", + "\n", + "performing teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n", + "\n", + "training and leadership. Digital transformations require executive-level support and are likely to fail without\n", + "\n", + "it — especially in large organizations.\n", + "\n", + "However, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n", + "\n", + "an enterprise level. In other words, they must also evolve their company culture into one that embraces data,\n", + "\n", + "data literacy, collaboration, experimentation and agile principles. We define these companies as “data native.”\n", + "\n", + "\n", + "lakehouse architecture.\n", + "\n", + "**Chief information officers and chief data officers — two sides of the data coin**\n", + "\n", + "Data native companies generally have a single, accountable executive who is responsible for areas such\n", + "\n", + "as data science, business analytics, data strategy, data governance and data management. The data\n", + "\n", + "management aspects include registering data sets in a data catalog, tracing data lineage as data sets flow\n", + "\n", + "through the environment, performing data quality checks and scanning for sensitive data in the clear.\n", + "\n", + "Many organizations are rapidly adding the chief data officer (CDO) role to their executive ranks in order\n", + "\n", + "to oversee and manage these responsibilities. The CDO works closely with CIOs and other business\n", + "\n", + "stakeholders to establish the overall project plan, design and implementation — and to align project\n", + "\n", + "management, product management, business analysis, data engineering, data scientist and machine\n", + "\n", + "learning talent.\n", + "\n", + "The CDO and CIO will need to build a broad coalition of support from stakeholders who are incentivized to\n", + "\n", + "make the transformation a success and help drive organization-wide adoption. To do this, the stakeholders\n", + "\n", + "must understand the benefits of — and their role and responsibilities in — supporting the initiative.\n", + "\n", + "\n", + "-----\n", + "\n", + "There are two organizational constructs that are found in most successful data native companies. The first is\n", + "\n", + "the creation of an _AI/ML center of excellence_ (COE) that is designed to establish in-house expertise around\n", + "\n", + "ML and AI, and which is then used to educate the rest of the organization on best practices. 
The second is\n", + "\n", + "the formation of a _data and AI transformation steering committee_ that will oversee and guide decisions and\n", + "\n", + "priorities for the transformative data, analytics and AI initiatives, plus help remove obstacles.\n", + "\n", + "Furthermore, CDOs need to bring their CIOs along early in the journey.\n", + "\n", + "**Creating an AI/ML COE**\n", + "\n", + "Data science is a fast-evolving discipline with an ever-growing set of frameworks and algorithms to enable\n", + "\n", + "everything from statistical analysis to supervised learning to deep learning using neural networks. While it is\n", + "\n", + "difficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n", + "\n", + "document, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n", + "\n", + "However, the general distinction is that data science is used to produce insights, machine learning is used to\n", + "\n", + "produce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n", + "\n", + "is expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n", + "\n", + "various data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n", + "\n", + "set of questions.\n", + "\n", + "Organizations wanting to build a data science competency should consider hiring talent into a centralized\n", + "\n", + "organization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n", + "\n", + "data science. The COE works with the rest of the organization to educate and promote the appropriate use\n", + "\n", + "of data science for various use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "A common approach is to have the COE report into the CDO, but still have data scientists dotted line into\n", + "\n", + "the business units or department. Using this approach, you achieve two goals:\n", + "\n", + "\u0007The data scientists are closer to the business stakeholders, have a better understanding of the data\n", + "\n", + "within a business unit and can help identify use cases that drive value\n", + "\n", + "\u0007Having the data scientists reporting into the CDO provides a structure that encourages collaboration\n", + "\n", + "and consistency in how work is performed among the cohort and brings that to the entire organization\n", + "\n", + "**Data and AI transformation steering committee**\n", + "\n", + "The purpose of the steering committee is to provide governance and guidance to the data transformation\n", + "\n", + "initiative. The CDO and CIO should co-chair the committee along with one business executive who can be\n", + "\n", + "a vocal advocate and help drive adoption. The level of executive engagement is critical to success of the\n", + "\n", + "initiative.\n", + "\n", + "The steering committee should meet regularly with leaders from across the organization to hear status\n", + "\n", + "reports and resolve any conflicts and remove obstacles, if possible. 
The leaders should represent a broad\n", + "\n", + "group of stakeholders, including:\n", + "\n", + "\u0007\n", + "**Program/project management:** To report the status of progress for deploying the new data\n", + "\n", + "ecosystem and driving adoption through use cases\n", + "\n", + "\u0007\n", + "**Business partners:** To provide insight and feedback on how easy or difficult it is to drive adoption\n", + "\n", + "of the platform\n", + "\n", + "\u0007\n", + "**Engineering:** To report the status of the implementation and what technology trade-offs need\n", + "\n", + "to be made\n", + "\n", + "\u0007\n", + "**Data science:** To report on the progress made by the COE on educating the organization about\n", + "\n", + "use cases for ML and to report the status of various implementations\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007\n", + "**InfoSec:** To review the overall security, including network, storage, application and data\n", + "\n", + "encryption and tokenization\n", + "\n", + "\u0007\n", + "**Architecture:** To oversee that the implementation adheres to architectural standards\n", + "\n", + "and guardrails\n", + "\n", + "\u0007\n", + "**Risk, compliance and legal:** To oversee the approach to data governance\n", + "\n", + "and ethics in ML\n", + "\n", + "\u0007\n", + "**User experience:** To serve as the voice of the end users who will perform their jobs using\n", + "\n", + "the new data ecosystem\n", + "\n", + "\u0007\n", + "**Communication:** To provide up-to-date communications to the organization about next\n", + "\n", + "steps and how to drive adoption\n", + "\n", + "**Partnering with architecture and InfoSec**\n", + "\n", + "Early on, the CDO and CIO should engage the engineering and architecture community within the\n", + "\n", + "organization to ensure that everyone understands the technical implications of the overall strategy. This\n", + "\n", + "minimizes the chances that the engineering teams will build separate and competing data platforms. In\n", + "\n", + "regulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n", + "\n", + "The EA is responsible for validating that the overall technology design and data management features\n", + "\n", + "support the performance and regulatory compliance requirements — specifically, whether the proposed\n", + "\n", + "design can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n", + "\n", + "variety and veracity (four Vs) of the data environment.\n", + "\n", + "\n", + "It is important to fully understand which\n", + "\n", + "environments and accounts your data is stored\n", + "\n", + "in. The goal is to minimize the number of copies of\n", + "\n", + "your data and to keep the data within your cloud\n", + "\n", + "account — and not the vendor’s.\n", + "\n", + "Make sure the architecture and security model for\n", + "\n", + "protecting data is well understood.\n", + "\n", + "\n", + "-----\n", + "\n", + "From an InfoSec perspective, the CDO must work to ensure that the proper controls and security are\n", + "\n", + "applied to the new data ecosystem and that the authentication, authorization and access control methods\n", + "\n", + "meet all the data governance requirements. An industry best practice is to enable self-service registration\n", + "\n", + "of data sets, by the data owner, and support the assignment of security groups or roles to help automate\n", + "\n", + "the access control process. 
This allows data sets to be accessible only to the personnel that belong to a\n", + "\n", + "given group. The group membership could be based primarily on job function or role within the organization.\n", + "\n", + "This approach provides fast onboarding of new employees, but caution should be taken not to proliferate\n", + "\n", + "too many access control groups — in other words, do not get too fine grained with group permissions, as\n", + "\n", + "they will become increasingly difficult to manage. A better strategy is to be more coarse-grained and use\n", + "\n", + "row- and column-level security sparingly.\n", + "\n", + "**Centralized vs. federated labor strategy**\n", + "\n", + "In most organizations today, managers work in silos, making decisions with the best intentions but focused\n", + "\n", + "on their own functional areas. The primary risk to the status quo is that there will be multiple competing and\n", + "\n", + "conflicting approaches to creating enterprise data and AI platforms. This duplication of effort will waste time\n", + "\n", + "and money and potentially erode the confidence and motivation of the various teams. While it certainly is\n", + "\n", + "beneficial to compare and contrast different approaches to implementing an architecture, the approaches\n", + "\n", + "should be strictly managed, with everyone designing for the same goals and requirements — as described in\n", + "\n", + "this strategy document and adhering to the architectural principles and best practices.\n", + "\n", + "Even still, the roles of the CDO and CIO together should deliver a data analytics and AI platform with the\n", + "\n", + "least amount of complexity as possible, and one that can easily scale across the organization. It is very\n", + "\n", + "challenging to merge disparate data platform efforts into a single, cohesive design. It is best to get out\n", + "\n", + "in front of this wave of innovation and take input from the various teams to create a single, centralized\n", + "\n", + "platform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n", + "\n", + "modern data stack — while ensuring that there is no duplication of effort when implementing the platform\n", + "\n", + "components. Figure 6 shows one possible structure.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 6:**\n", + "Centralized teams with matrixed responsibilities\n", + "\n", + "\n", + "**Data Scientist**\n", + "Model and predict with data\n", + "\n", + "**Data Analyst**\n", + "Visualize and describe data\n", + "\n", + "\n", + "**Team A ($1.1M)** **Team B ($1.3M)** **Team C ($1.5M)**\n", + "\n", + "**Data Engineer**\n", + "Store, process, maintain data\n", + "\n", + "**Business Partners**\n", + "**and Domain Experts**\n", + "\n", + "\n", + "Centralize data scientists under CDO — embed in lines of business for day-to-day tasking\n", + "\n", + "Centralize data engineers under CIO/CTO — initially as an enterprise function\n", + "\n", + "**Hiring, training and upskilling your talent**\n", + "\n", + "While this guide does not cover recruiting strategies, it is important to note that data engineering and data\n", + "\n", + "science talent is very difficult to find in this competitive market. As a result, every organization should\n", + "\n", + "consider what training and upskilling opportunities exist for their current staff. A large number of online\n", + "\n", + "courses, at relatively low cost, teach the fundamentals of data science and AI. 
It will still be important to\n", + "\n", + "augment your existing staff with experienced data scientists and machine learning experts. You will then\n", + "\n", + "need to establish clear training paths, resources and timelines to upskill your talent.\n", + "\n", + "Using the COE construct, it is easier to upskill a mix of data science talent by having the experts mentor the\n", + "\n", + "less experienced staff. The majority of Ph.D.-level talent comes from academia and has a vested interest\n", + "\n", + "in educating others. It’s important to set up the structure and allow time in the schedule for knowledge\n", + "\n", + "transfer, experimentation and a safe environment in which to fail. A key aspect in accelerating the\n", + "\n", + "experience of your talent is to enable data science using production-like data and creating a collaborative\n", + "\n", + "environment for code sharing.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks training, [documentation](https://docs.databricks.com) and\n", + "\n", + "[certification](https://databricks.com/learn/certification) available to customers is industry-\n", + "\n", + "leading, and our [Solution Accelerators](https://databricks.com/solutions/accelerators) provide\n", + "\n", + "\n", + "#### 4. Deploy a modern data stack\n", + "\n", + "The modern data architecture can most easily be described as the evolution of the enterprise data\n", + "\n", + "warehouse (EDW) from the 1980s and the Hadoop-style data lakes from the mid-2000s. The capabilities,\n", + "\n", + "limitations and lessons learned from working with these two legacy data architectures inspired the next\n", + "\n", + "generation of data architecture — what the industry now refers to as the lakehouse.\n", + "\n", + "Figure 7 shows how the architectures have evolved as networking, storage, memory and CPU performance\n", + "\n", + "have improved over time.\n", + "\n", + "\n", + "exemplar code for organizations to hit the ground\n", + "\n", + "running with data and AI.\n", + "\n", + "**Figure 7:**\n", + "A brief history of data architectures\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving beyond the enterprise data warehouse and data lake**\n", + "\n", + "The EDW provided organizations with the ability to easily load structured and semi-structured data into\n", + "\n", + "well-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n", + "\n", + "(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n", + "\n", + "the business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n", + "\n", + "catalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n", + "\n", + "users — while still being performant. The EDW served its purpose for decades. However, most of the recent\n", + "\n", + "advances in AI have been in better models to process unstructured data (text, images, video, audio), but\n", + "\n", + "these are precisely the types of data that an EDW is not optimized for.\n", + "\n", + "Therefore, in the mid-2000s, organizations wanted to take advantage of new data sets — _ones that_\n", + "\n", + "_contained unstructured data_ — and apply new analytics — _ones that leveraged emerging data science_\n", + "\n", + "_algorithms_ . 
In order to accomplish this, massive investments in on-premises data lakes occurred — most\n", + "\n", + "often leveraging Apache Hadoop and its distributed file system, known as HDFS, running on low-cost,\n", + "\n", + "commodity hardware. The Hadoop-style data lake provided the separation of compute from storage that\n", + "\n", + "organizations were seeking — thus eliminating the risk of vendor lock-in and opening the doors to a wide\n", + "\n", + "range of new analytics. Despite all these benefits, the architecture proved to be difficult to use, with a\n", + "\n", + "complex programming model known as MapReduce, and the performance fell short of the majority of real-\n", + "\n", + "time use cases.\n", + "\n", + "Over time, Hadoop workloads were often migrated to Apache Spark™ workloads, which run 100x faster by\n", + "\n", + "processing data in-memory across a cluster — with the ability to massively scale. The Spark programming\n", + "\n", + "model was also simpler to use and provided a consistent set of application programming interfaces (APIs)\n", + "\n", + "for languages such as Python, SQL, R, Java and Scala. Spark was the first major step in separating compute\n", + "\n", + "from storage and providing the scale needed for distributed workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "A data lakehouse combines the best of data\n", + "\n", + "\n", + "**Cloud-based data lakes**\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud object stores like\n", + "\n", + "Amazon S3 and Azure Data Lake Storage (ADLS) have become some of the largest, most cost-effective\n", + "\n", + "storage systems in the world — which make them an attractive platform to serve as the next generation\n", + "\n", + "of data lakes. Object stores excel at massively parallel reads — an essential requirement for modern data\n", + "\n", + "warehouses.\n", + "\n", + "\n", + "lakes and data warehouses, enabling BI and ML\n", + "\n", + "\n", + "However, data lakes lack some critical features: They do not support transactions, they do not enforce\n", + "\n", + "data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "and batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n", + "\n", + "example, efficiently listing the millions of files (objects) that make up most large data lakes.\n", + "\n", + "**Lakehouse — the modern data architecture**\n", + "\n", + "What if it were possible to combine the best of both worlds? The performance, concurrency and data\n", + "\n", + "management of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n", + "\n", + "the target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n", + "\n", + "the complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks of this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse architecture possible.\n", + "\n", + "on all data on a simple, open and multicloud modern data stack.\n", + "\n", + "-----\n", + "\n", + "Exploratory data science, production machine learning and BI/ad hoc SQL analytics all draw on a curated data lake in which raw data is ingested as “Bronze”, then filtered, cleaned and augmented as “Silver”, then rolled up into business-level aggregates as “Gold”, with data quality improving at each step. The lake is fed by batch and real-time data sources: unstructured (image, video, audio, free text, blob), semi-structured (logs, clickstream, CSV, JSON, XML) and structured (systems of record, operational DBs).\n", + "\n", + "**Figure 8:**\n", + "The building blocks for a modern data architecture\n", + "\n", + "The lakehouse architecture provides a flexible, high-performance design for diverse data applications, including real-time streaming, batch processing, data warehousing, data science and machine learning. This target-state architecture supports loading all the data types that might be interesting to an organization — structured, semi-structured and unstructured — and provides a single processing layer, using consistent APIs across programming languages, to curate data while applying rigorous data management techniques.\n", + "\n", + "The move toward a single, consistent approach to data pipelining and refinement saves organizations time, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of curation and refinement steps resulting in highly consumable and trusted data for downstream use cases. The architecture makes possible the efficient creation of “data assets” for the organization by taking a stepwise approach to improving data.
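Figure 8’s stepwise refinement is easiest to see in code. What follows is a minimal, hypothetical PySpark sketch of the Bronze-to-Silver-to-Gold flow; the path, table names and columns are illustrative assumptions rather than anything prescribed by this guide, and it presumes an environment (such as Databricks) where Delta Lake is configured:\n", + "\n", + "```python\n", + "from pyspark.sql import SparkSession, functions as F\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "# Bronze: land the raw events exactly as they arrive in the landing zone.\n", + "bronze = spark.read.format('json').load('/landing/clickstream/')  # hypothetical path\n", + "bronze.write.format('delta').mode('append').saveAsTable('bronze_clickstream')\n", + "\n", + "# Silver: filter, clean and augment the raw records.\n", + "silver = (spark.table('bronze_clickstream')\n", + "          .where(F.col('event_type').isNotNull())\n", + "          .withColumn('event_date', F.to_date('event_ts')))\n", + "silver.write.format('delta').mode('overwrite').saveAsTable('silver_clickstream')\n", + "\n", + "# Gold: business-level aggregates ready for BI and ML consumers.\n", + "gold = (spark.table('silver_clickstream')\n", + "        .groupBy('event_date', 'page')\n", + "        .agg(F.count('*').alias('views')))\n", + "gold.write.format('delta').mode('overwrite').saveAsTable('gold_page_views')\n", + "```\n", + "\n", + "Each layer is just another governed table, so the “data assets” described above are produced incrementally rather than in one monolithic job.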
The system should\n", + "\n", + "be able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n", + "\n", + "lakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n", + "\n", + "to unify data and AI assets by centrally sharing, auditing, securing and managing structured and\n", + "\n", + "unstructured data like tables, files, models and dashboards in concert with existing data, storage and\n", + "\n", + "catalogs.\n", + "\n", + "\u0007 **Storage is decoupled from compute:** In practice this means storage and compute use separate\n", + "\n", + "clusters, thus these systems are able to scale to many more concurrent users and larger data sizes.\n", + "\n", + "Some modern data warehouses also have this property.\n", + "\n", + "\u0007 **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide\n", + "\n", + "an API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently\n", + "\n", + "access the data directly.\n", + "\n", + "\n", + "Databricks released Delta Lake to the open source\n", + "\n", + "community in 2019. Delta Lake provides all the data\n", + "\n", + "lifecycle management functions that are needed\n", + "\n", + "to make cloud-based object stores reliable and\n", + "\n", + "performant. This design allows clients to update\n", + "\n", + "multiple objects at once, replace a subset of\n", + "\n", + "the objects with another, etc., in a serializable\n", + "\n", + "manner that still achieves high parallel read/write\n", + "\n", + "performance from the objects — while offering\n", + "\n", + "advanced capabilities like time travel (e.g., query\n", + "\n", + "point-in-time snapshots or rollback of erroneous\n", + "\n", + "updates), automatic data layout optimization,\n", + "\n", + "upserts, caching and audit logs.\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007 **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be\n", + "\n", + "used to store, refine, analyze and access data types needed for many new data applications, including\n", + "\n", + "images, video, audio, semi-structured data and text.\n", + "\n", + "\u0007 **Support for diverse workloads:** This includes data science, machine learning, SQL and analytics. Multiple\n", + "\n", + "tools might be needed to support all these workloads, but they all rely on the same data repository.\n", + "\n", + "\u0007 **End-to-end streaming:** Real-time reports are the norm in many enterprises. Support for streaming\n", + "\n", + "eliminates the need for separate systems dedicated to serving real-time data applications.\n", + "\n", + "\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. This reduces staleness,\n", + "\n", + "improves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n", + "\n", + "data in both a data lake and a warehouse.\n", + "\n", + "\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n", + "\n", + "governance experience across all clouds. You don’t need to invest in reinventing processes for every\n", + "\n", + "cloud platform that you’re using to support your data and AI efforts. 
Instead, your data teams can simply focus on putting all your data to work to discover new insights and create business value.\n", + "\n", + "\n", + "**Figure 9:**\n", + "The Databricks Lakehouse Platform: data warehousing, data engineering, data streaming, and data science and ML workloads run on Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and the cloud data lake (all structured and unstructured data). Delta Lake is the open data storage layer that delivers reliability, security and performance on your data lake — for both streaming and batch operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools for security and access control are basic requirements. Data governance capabilities, including auditing, retention and lineage, have become essential, particularly in light of recent privacy regulations. Tools that enable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse, such enterprise features only need to be implemented, tested and administered for a single system.\n", + "\n", + "Databricks is the only cloud-native vendor\n", + "\n", + "\n", + "**Databricks — innovation driving performance**\n", + "\n", + "Advanced analytics and machine learning on unstructured and large-scale data are two of the most strategic priorities for enterprises today — and the growth of unstructured data is going to increase exponentially — so it makes sense for CIOs and CDOs to think about positioning their data lake as the center of their data infrastructure. The main challenge is whether or not it can perform reliably and fast enough to meet the SLAs of the various workloads — especially SQL-based analytics.\n", + "\n", + "Databricks has focused its engineering efforts on incorporating a wide range of industry-leading software and hardware improvements in order to implement the first lakehouse solution. Our approach capitalizes on the computing advances of the Apache Spark framework and the latest networking, storage and CPU technologies to provide the performance customers need to simplify their architecture. These innovations combine to provide a single architecture that can store and process all the data sets within an organization — supporting the range of analytics outlined above.\n", + "\n", + "**BI and SQL workloads**\n", + "\n", + "Perhaps the most significant challenge for the lakehouse architecture is the ability to support SQL queries for star/snowflake schemas in support of BI workloads. Part of the reason EDWs have remained a major part of the data ecosystem is because they provide low-latency, high-concurrency query support. In order to compete with the EDW, optimizations must be found within the lakehouse architecture that provide satisfactory query performance for the majority of BI workloads. 
Fortunately, advances in query plan, query\n", + "\n", + "execution, statistical analysis of files in the object store, and hardware and software improvements make it\n", + "\n", + "possible to deliver on this promise.\n", + "\n", + "\n", + "to be recognized as a Leader in both\n", + "\n", + "[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n", + "\n", + "**Cloud Database Management Systems** and\n", + "\n", + "**Data Science and Machine Learning Platforms**\n", + "\n", + "\n", + "-----\n", + "\n", + "**A word about the data mesh architecture**\n", + "\n", + "In 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n", + "\n", + "what some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lake\n", + "\n", + "using a series of extract, transform, load (ETL) processes — which unnecessarily adds complexity. The data\n", + "\n", + "mesh approach avoids centralizing data in one location and encourages the source systems to create\n", + "\n", + "“data products” or “data assets” that are served up directly to consumers for data and AI workloads. The\n", + "\n", + "designers advocate for a federated approach to data and AI — while using enterprise policies to govern how\n", + "\n", + "source systems make data assets available.\n", + "\n", + "There are several challenges with this approach. First, the data mesh assumes that each source system\n", + "\n", + "can dynamically scale to meet the demands of the consumers — particularly challenging when data assets\n", + "\n", + "become “hot spots” within the ecosystem. Second, centralized policies oftentimes leave the implementation\n", + "\n", + "details to the individual teams. This has the potential of inconsistent implementations, which may lead to\n", + "\n", + "performance degradations and differing cost profiles. Finally, the data mesh approach assumes that each\n", + "\n", + "source system team has the necessary skills, or can acquire them, to build robust data products.\n", + "\n", + "The lakehouse architecture is not at odds with the data mesh philosophy — as ingesting higher-quality data\n", + "\n", + "from the source systems reduces the curation steps needed inside the data lake itself.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 5. Improve data governance and compliance\n", + "\n", + "Data governance is perhaps the most challenging aspect of data transformation initiatives. Every\n", + "\n", + "stakeholder recognizes the importance of making data readily available, of high quality and relevant to help\n", + "\n", + "drive business value. Likewise, organizations understand the risks of failing to get it right — the potential for\n", + "\n", + "undetected data breaches, negative impact on the brand and the potential for significant fines in regulated\n", + "\n", + "environments. However, organizations shouldn’t perceive data governance or a defensive data strategy as\n", + "\n", + "a blocker or deterrent to business value. 
In fact, many organizations have leveraged their strong stance on data governance as a competitive differentiator to earn and maintain customer trust, ensure sound data and privacy practices, and protect their data assets.\n", + "\n", + "**Why data governance fails**\n", + "\n", + "While most people agree that data governance is a set of principles, practices and tooling that helps manage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic approach — one that balances realistic policies with automation and scalability.\n", + "\n", + "Too often the policies developed around data governance define very strict data management principles — for example, the development of an enterprise-wide ontological model that all data must adhere to. Organizations can spend months, if not years, trying to define the perfect set of policies. The engineering effort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the complexity of the requirements. Meanwhile, data continues to flow through the organization without a consistent approach to governance, and too much of the effort is done manually and is fraught with human error.\n", + "\n", + "\n", + "What are the basic building blocks of a sound data\n", + "\n", + "governance approach?\n", + "\n", + "\n", + "-----\n", + "\n", + "**A pragmatic approach to data governance**\n", + "\n", + "At a high level, organizations should enable the following data management capabilities:\n", + "\n", + "**\u0007Identify all sources of data**\n", + "\n", + "\u0007Identify all data-producing and data-storing applications\n", + "\n", + "\u0007Identify the systems of record (SOR) for each data set\n", + "\n", + "\u0007Label data sets as internal or external (third party)\n", + "\n", + "\u0007Identify where sensitive data is stored — GDPR/CCPA scope\n", + "\n", + "\u0007Limit which operational data stores (ODSs) can re-store SOR data\n", + "\n", + "**\u0007Catalog data sets**\n", + "\n", + "\u0007Register all data sets in a centralized data catalog\n", + "\n", + "\u0007Create a lightweight, self-service data registration process\n", + "\n", + "\u0007Limit manual entry as much as possible\n", + "\n", + "\u0007Record the schema, if any, for the data set\n", + "\n", + "\u0007Use an inference engine or tool to extract the data set schema\n", + "\n", + "\u0007Add business and technical metadata to make it meaningful\n", + "\n", + "\u0007Use machine learning to classify data sets\n", + "\n", + "\u0007Use crowdsourcing to validate the machine-based results\n", + "\n", + "**\u0007Track data lineage**\n", + "\n", + "\u0007Track data set flow and what systems act on data\n", + "\n", + "\u0007Create an enumerated list of action values for specific operations\n", + "\n", + "\u0007Emit lineage events via the streaming layer and aggregate them in the data lake using a lineage event schema\n", + "\n", + "\u0007Optional: Add a source code repository URL for action traceability\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Perform data quality checks**\n", + "\n", + "\u0007Create a rules library that is centrally managed and versioned\n", + "\n", + "\u0007Update the rules library periodically with new rules\n", + "\n", + "\u0007Use a combination of checks — null/not null, regex, valid values\n", + "\n", + "\u0007Perform schema enforcement checks against data set registration (a minimal sketch follows this list)\n", + 
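"\n", + "To make these checks concrete, the sketch below shows one way a centrally managed rules library could apply null/not-null, regex and valid-values checks before a data set is registered. It is a minimal illustration in plain Python; the rule definitions, field names and sample records are assumptions for this example, not part of any Databricks API.\n", + "\n", + "```python\n", + "import re\n", + "\n", + "# Illustrative, centrally versioned rules library (field names are examples)\n", + "RULES = [\n", + "    {'field': 'customer_id', 'check': 'not_null'},\n", + "    {'field': 'email', 'check': 'regex', 'pattern': '^[^@ ]+@[^@ ]+[.][^@ ]+$'},\n", + "    {'field': 'card_type', 'check': 'valid_values', 'allowed': {'credit', 'debit'}},\n", + "]\n", + "\n", + "def check_record(record):\n", + "    # Return the list of rule violations for one record\n", + "    violations = []\n", + "    for rule in RULES:\n", + "        field = rule['field']\n", + "        value = record.get(field)\n", + "        if rule['check'] == 'not_null' and value is None:\n", + "            violations.append(field + ' is null')\n", + "        elif rule['check'] == 'regex' and not re.match(rule['pattern'], str(value or '')):\n", + "            violations.append(field + ' fails pattern check')\n", + "        elif rule['check'] == 'valid_values' and value not in rule['allowed']:\n", + "            violations.append(field + ' has an unexpected value')\n", + "    return violations\n", + "\n", + "# The second record violates the null and regex rules and would be quarantined\n", + "for rec in [\n", + "    {'customer_id': 'C-001', 'email': 'ada@example.com', 'card_type': 'credit'},\n", + "    {'customer_id': None, 'email': 'not-an-email', 'card_type': 'credit'},\n", + "]:\n", + "    print(check_record(rec))\n", + "```\n", + "\n", + "In a lakehouse implementation the same checks would typically be expressed as pipeline expectations so that violating rows can be dropped, quarantined or allowed to flow according to the SLA of the downstream consumer.\n", + 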
"\n", + "By minimizing the number of copies of your data\n", + "\n", + "\n", + "**\u0007Scan for sensitive data**\n", + "\n", + "\u0007Establish a tokenization strategy for sensitive data — GDPR/CCPA\n", + "\n", + "\u0007Tokenize all sensitive data stored in the data lake — avoid cleartext\n", + "\n", + "\u0007Use fixed-length tokens to preserve analytic value\n", + "\n", + "\u0007Determine the approach for token lookup/resolution when needed\n", + "\n", + "\u0007Ensure that any central token stores are secure with rotating keys\n", + "\n", + "\u0007Identify which data elements from GDPR/CCPA to include in scans\n", + "\n", + "\u0007Efficiently scan for sensitive data in cleartext using the rules library\n", + "\n", + "**\u0007Establish approved data flow patterns**\n", + "\n", + "\u0007Determine pathways for data flow (source —> target)\n", + "\n", + "\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n", + "\n", + "\u0007Determine read/write patterns for the data lake\n", + "\n", + "\u0007Strictly enforce data flow pathways to/from data lake\n", + "\n", + "\u0007Detect violations and anomalies using lineage event analysis\n", + "\n", + "\u0007Identify offending systems and shut down or grant exception\n", + "\n", + "\u0007Record data flow exceptions and set a remediation deadline\n", + "\n", + "**\u0007Centralize data access controls**\n", + "\n", + "\u0007Establish a common governance model for all data and AI assets\n", + "\n", + "\u0007Centrally define access policies for all data and AI assets\n", + "\n", + "\u0007Enable fine-grained access controls at row and column levels\n", + "\n", + "\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n", + "\n", + "\n", + "and moving to a single data processing layer where\n", + "\n", + "all your data governance controls can run together,\n", + "\n", + "you improve your chances of staying in compliance\n", + "\n", + "and detecting a data breach.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Make data discovery easy**\n", + "\n", + "\u0007Establish a data discovery model\n", + "\n", + "\u0007Use manual or automatic data classification\n", + "\n", + "\u0007Provide a visual interface for data discovery across your data estate\n", + "\n", + "\u0007Simplify data discovery with rich keyword- or business glossary-based search\n", + "\n", + "**\u0007Centralize data access auditing**\n", + "\n", + "\u0007Establish a framework or best practices for access auditing\n", + "\n", + "\u0007Capture audit logs for all CRUD operations performed on data\n", + "\n", + "\u0007Make auditing reports easily accessible to data stewards/admins for ensuring compliance\n", + "\n", + "This is not intended to be an exhaustive list of features and requirements but rather a framework to\n", + "\n", + "evaluate your data governance approach. There will be violations at runtime, so it will be important to have\n", + "\n", + "procedures in place for how to handle these violations. In some cases, you may want to be very strict and\n", + "\n", + "shut down the data flow of the offending system. In other cases, you may want to quarantine the data until\n", + "\n", + "the offending system is fixed. Finally, some SLAs may require the data to flow regardless of a violation. 
In\n", + "\n", + "these cases, the receiving systems must have their own methodology for dealing with bad data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Hidden cost of data governance**\n", + "\n", + "There are numerous examples of high-profile data breaches and failure to comply with consumer data\n", + "\n", + "protection legislation. You don’t have to look very far to see reports of substantial fines levied against\n", + "\n", + "organizations that were not able to fully protect the data within their data ecosystem. As organizations\n", + "\n", + "produce and collect more and more data, it’s important to remember that while storage is cheap, failing\n", + "\n", + "to enforce proper data governance is very, very expensive.\n", + "\n", + "In order to catalog, lineage trace, quality check, and scan your data effectively, you will need a lot of\n", + "\n", + "compute power when you consider the massive amounts of data that exist in your organization. Each\n", + "\n", + "time you copy a piece of data to load it into another tool or platform, you need to determine what data\n", + "\n", + "governance techniques exist there and how you ensure that you truly know where all your data resides.\n", + "\n", + "Imagine the scenario where data flows through your environment and is loaded into multiple platforms\n", + "\n", + "using various ETL processes. How do you handle the situation when you discover that sensitive data is\n", + "\n", + "in cleartext? Without a consistent set of data governance tools, you may not be able to remediate the\n", + "\n", + "problem before it’s flagged for violation.\n", + "\n", + "Having a smaller attack surface and fewer ingress/egress routes helps guard your data and protect your\n", + "\n", + "organization’s brand and balance sheet.\n", + "\n", + "The bottom line is that the more complex your data ecosystem architecture is, the more difficult and costly\n", + "\n", + "it is to get data governance right.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 6. Democratize access to quality data\n", + "\n", + "Effective data and AI solutions rely more on the amount of quality data available than on the sophistication\n", + "\n", + "or complexity of the model or algorithm. Google published a paper titled “The Unreasonable Effectiveness of\n", + "\n", + "Data” demonstrating this point. The takeaway is that organizations should focus their efforts on making sure\n", + "\n", + "data scientists have access to the widest selection of relevant and high-quality data to perform their jobs —\n", + "\n", + "which is to create new opportunities for revenue growth, cost reduction and risk reduction.\n", + "\n", + "**The 80/20 data science dilemma**\n", + "\n", + "Most existing data environments have their data stored primarily in different operational data stores within a\n", + "\n", + "given business unit (BU) — creating several challenges:\n", + "\n", + "\u0007Most business units deploy use cases that are based only on their own data — without taking advantage\n", + "\n", + "of cross-BU opportunities\n", + "\n", + "\u0007The schemas are generally not well understood outside of BU or department — with only the database\n", + "\n", + "designers and power users being able to make efficient use of the data. 
This is referred to as the “tribal\n", + "\n", + "knowledge” phenomenon.\n", + "\n", + "\u0007The approval process and different system-level security models make it difficult and time-consuming\n", + "\n", + "for data scientists to gain the proper access to the data they need\n", + "\n", + "In order to perform analysis, users are forced to log in to multiple systems to collect their data. This is most\n", + "\n", + "often done using single-node data science and generates unnecessary copies of data stored on local disk\n", + "\n", + "drives, various network shares or user-controlled cloud storage. In some cases, the data is copied to “user\n", + "\n", + "spaces” within production platform environments. This has the strong potential of degrading the overall\n", + "\n", + "performance for true production workloads.\n", + "\n", + "To make matters worse, these copies of data are generally much smaller than the full-size data sets that would\n", + "\n", + "be needed in order to get the best model performance for your ML and AI workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "Small data sets reduce the effectiveness of exploration, experimentation, model development and model\n", + "\n", + "training — resulting in inaccurate models when deployed into production and used with full-size data sets.\n", + "\n", + "As a result, data science teams are spending 80% of their time wrangling data sets and only 20% of their\n", + "\n", + "time performing analytic work — work that may need to be redone once they have access to the full-size\n", + "\n", + "data sets. This is a serious problem for organizations that want to remain competitive and generate game-\n", + "\n", + "changing results.\n", + "\n", + "Another factor contributing to reduced productivity is the way in which end users are typically granted\n", + "\n", + "access to data. Security policies usually require both coarse-grained and fine-grained data protections.\n", + "\n", + "In other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n", + "\n", + "grained) within the data set.\n", + "\n", + "**Rationalize data access roles**\n", + "\n", + "The most common approach to providing coarse-grained and fine-grained access is to use what’s known\n", + "\n", + "as role-based access control (RBAC). Individual users log on to system-level accounts or via a single sign-on\n", + "\n", + "(SSO) authentication and access control solution.\n", + "\n", + "Users can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n", + "\n", + "There are different strategies for identifying and creating these groups — but typically, they are done on a\n", + "\n", + "system-by-system basis, with a 1:1 mapping for each coarse- and fine-grained access control combination.\n", + "\n", + "This approach to data access usually produces a proliferation of user groups. It is not unusual to see several\n", + "\n", + "thousand discrete security groups for large organizations — despite having a much smaller number of\n", + "\n", + "defined job functions.\n", + "\n", + "This approach creates one of the biggest security challenges in large organizations. When personnel leave\n", + "\n", + "the company, it is fairly straightforward to remove them from the various security groups. However, when\n", + "\n", + "personnel move around within the organization, their old security group assignments often remain intact\n", + "\n", + "and new ones are assigned based on their new job function. 
This leads to personnel continuing to have\n", + "\n", + "access to data that they no longer have a “need to know.”\n", + "\n", + "\n", + "The Databricks Lakehouse Platform brings together\n", + "\n", + "all the data and AI personas into one environment\n", + "\n", + "and makes it easy to collaborate, share code and\n", + "\n", + "insights, and operate against the same view of data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data classification**\n", + "\n", + "Having all your data sets stored in a single, well-managed data lake gives you the ability to use partition\n", + "\n", + "strategies to segment your data based on “need to know.” Some organizations create a partition based\n", + "\n", + "on which business unit owns the data and which one owns the data classification. For example, in a\n", + "\n", + "financial services company, credit card customers’ data could be stored separately from that of debit card\n", + "\n", + "customers, and access to GDPR/CCPA-related fields could be handled using classification labels.\n", + "\n", + "The simplest approach to data classification is to use three labels:\n", + "\n", + "\u0007 **Public data:** Data that can be freely disclosed to the public. This would include your annual report, press\n", + "\n", + "releases, etc.\n", + "\n", + "\u0007 **Internal data:** Data that has low security requirements but should not be shared with the public or\n", + "\n", + "competitors. This would include strategy briefings and market or customer segmentation research.\n", + "\n", + "\u0007 **Restricted data:** Highly sensitive data regarding customers or internal business operations. Disclosure\n", + "\n", + "could negatively affect operations and put the organization at financial or legal risk. Restricted data\n", + "\n", + "requires the highest level of security protection.\n", + "\n", + "Some organizations introduce additional labels, but care should be taken to make sure that everyone clearly\n", + "\n", + "understands how to apply them.\n", + "\n", + "The data classification requirements should be clearly documented and mapped to any legal or regulatory\n", + "\n", + "requirements. For example, CCPA is so sweeping that it includes 11 categories of personal information —\n", + "\n", + "and defines “personal information” as “information that identifies, relates to, describes, is capable of\n", + "\n", + "being associated with, or could reasonably be linked, directly or indirectly, with a particular consumer or\n", + "\n", + "household.”\n", + "\n", + "\n", + "-----\n", + "\n", + "Just examining one CCPA category, _Customer Records Information_ , we see that the following information is\n", + "\n", + "to be protected: name, signature, social security number, physical characteristics or description, address,\n", + "\n", + "telephone number, passport number, driver’s license or state identification card number, insurance policy\n", + "\n", + "number, education, employment, employment history, bank account number, credit or debit card number,\n", + "\n", + "other financial information, medical information, and health insurance information.\n", + "\n", + "There are generally three different approaches in industry to performing data classification:\n", + "\n", + "**1. \u0007Content-based:** Scans or inspects and interprets files to find sensitive information. This is generally\n", + "\n", + "done using regular expressions and lookup tables to map values to actual entities stored inside the\n", + "\n", + "organization (e.g., customer SSN).\n", + "\n", + "**2. 
\u0007Context-based:** Evaluates the source of the data (e.g., application, location or creator) to determine the sensitivity of the data.\n", + "\n", + "**3. \u0007User-based:** Relies on a manual, end-user selection of each data set or element and requires expert domain knowledge to ensure accuracy.\n", + "\n", + "Taking all this into account, an organization could implement a streamlined set of roles for RBAC that uses a naming convention combining domain, entity, data set (or data asset) and classification — where “domain” might be the business unit within an organization, “entity” is the noun that the role is valid for, “data set” or “data asset” is the ID, and “classification” is one of the three values (public, internal, restricted).\n", + "\n", + "There is a “deny all default” policy that does not allow access to any data unless there is a corresponding role assignment. Wildcards can be used to grant access and eliminate the need to enumerate every combination.\n", + "\n", + "\n", + "-----\n", + "\n", + "For example, a role whose classification is “restricted” gives a user or a system access to all the data fields that describe a credit card transaction for a customer, including the 16-digit credit card number, whereas a role whose classification is “internal” would allow the user or system access only to nonsensitive data regarding the transaction, as sketched below.\n", + 
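"\n", + "The following is a minimal illustration of such a convention in plain Python, with a deny-all default and wildcard matching; the separator, role names and helper function are assumptions for this example rather than an existing Databricks or LDAP construct.\n", + "\n", + "```python\n", + "from fnmatch import fnmatch\n", + "\n", + "# Illustrative convention: <domain>.<entity>.<data asset>.<classification>\n", + "user_roles = [\n", + "    'cards.transactions.*.internal',   # nonsensitive transaction fields only\n", + "    'cards.customers.profile.public',\n", + "]\n", + "\n", + "def is_authorized(roles, requested_asset):\n", + "    # Deny-all default: access requires at least one matching role\n", + "    return any(fnmatch(requested_asset, role) for role in roles)\n", + "\n", + "print(is_authorized(user_roles, 'cards.transactions.pos_feed.internal'))    # True\n", + "print(is_authorized(user_roles, 'cards.transactions.pos_feed.restricted'))  # False\n", + "```\n", + "\n", + "Mapping each such role to a single directory group keeps the number of groups proportional to the naming convention rather than to every system-by-system combination of coarse- and fine-grained access.\n", + 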
"This gives organizations the chance to rationalize their security groups by using a domain naming convention to provide coarse-grained and fine-grained access without the need for creating tons of LDAP groups. It also dramatically eases the administration of granting access to data for a given user.\n", + "\n", + "**Everyone working from the same view of data**\n", + "\n", + "The modern data stack, when combined with a simplified security group approach and a robust data governance methodology, gives organizations an opportunity to rethink how data is accessed — and greatly improves time to market for their analytic use cases. All analytic workloads can now operate from a single, shared view of your data.\n", + "\n", + "Combining this with a sensitive data tokenization strategy can make it straightforward to empower data scientists to do their job and shift the 80/20 ratio in their favor. It’s now easier to work with full-size data sets that both obfuscate NPI/PII information and preserve analytic value.\n", + "\n", + "Now, data discovery is easier because data sets have been registered in the catalog with full descriptions and business metadata — with some organizations going as far as showing realistic sample data for a particular data set. If a user does not have access to the underlying data files, having data in one physical location eases the burden of granting access, and then it’s easier to deploy access-control policies and collect/analyze audit logs to monitor data usage and to look for bad actors.\n", + "\n", + "\n", + "Adopting the Databricks Lakehouse Platform allows\n", + "\n", + "you to add data sets into a well-managed data lake\n", + "\n", + "using low-cost object stores, and makes it easy to\n", + "\n", + "partition data based on domain, entity, data set and\n", + "\n", + "classification levels to provide fine-grained (row-\n", + "\n", + "level and column-level) security.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data security, validation and curation — in one place**\n", + "\n", + "The modern data architecture using Databricks Lakehouse makes it easy to take a consistent approach to protecting, validating and improving your organization’s data. Data governance policies can be enforced using the built-in features of schema validation, expectations and pipelines — the three main steps to data curation. Databricks enables moving data through well-defined states: Raw —> Refined —> Curated or, as we refer to it at Databricks, Bronze —> Silver —> Gold.\n", + "\n", + "The raw data is known as “Bronze-level” data and serves as the landing zone for all your important analytic data. Bronze data functions as the starting point for a series of curation steps that filter, clean and augment the data for use by downstream systems. The first major refinement results in data being stored in “Silver-level” tables within the data lake. These tables carry all the benefits of the Delta Lake product — for example, ACID transactions and time travel. The final step in the process is to produce business-level aggregates, or “Gold-level” tables, that combine data sets from across the organization. These are the data sets used to improve customer service across the full line of products, perform GDPR/CCPA reporting or look for opportunities to cross-sell to increase customer retention. For the first time, organizations can truly optimize data curation and ETL — eliminating unnecessary copies of data and the duplication of effort that often happens in ETL jobs with legacy data ecosystems. This “solve once, access many times” approach speeds time to market, improves the user experience and helps retain talent.\n", + "\n", + "**Extend the impact of your data with secure data sharing**\n", + "\n", + "Data sharing is crucial to drive business value in today’s digital economy. More and more organizations are now looking to securely share trusted data with their partners/suppliers, internal lines of business or customers to drive collaboration, improve internal efficiency and generate new revenue streams with data monetization. Additionally, organizations are interested in leveraging external data to drive new product innovations and services.\n", + "\n", + "Business executives must establish and promote a data sharing culture in their organizations to build competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 7. 
Dramatically increase productivity of your workforce\n", + "\n", + "Now that you have deployed a modern data stack and have landed all your analytical data in a well-\n", + "\n", + "managed data lake with a rationalized approach to access control, the next question is, “What tools should I\n", + "\n", + "provide to the user community so they can be most effective at using the new data ecosystem?”\n", + "\n", + "**Design thinking: working backward from the user experience**\n", + "\n", + "Design thinking is a human-centered approach to innovation — focused on understanding customer needs,\n", + "\n", + "rapid prototyping and generating creative ideas — that will transform the way you develop products, services,\n", + "\n", + "processes and organizations. Design thinking was introduced as a technique to not only improve but also\n", + "\n", + "bring joy to the way people work. The essence of design thinking is to determine what motivates people to\n", + "\n", + "do their job, where their current pain points are and what could be improved to make their jobs enjoyable.\n", + "\n", + "**Moving beyond best of breed**\n", + "\n", + "If you look across a large enterprise, you will find no shortage of database design, ETL, data cleansing, model\n", + "\n", + "training and model deployment tools. Many organizations take a “best of breed” approach in providing\n", + "\n", + "tooling for their end users. This typically occurs because leaders genuinely want to empower business\n", + "\n", + "units, departments and teams to select the tool that best suits their specific needs — so-called federated\n", + "\n", + "tool selection. Data science tooling, in particular, tends not to be procured at the “enterprise” level at first —\n", + "\n", + "given the high cost of rolling it out to the entire user population.\n", + "\n", + "\n", + "-----\n", + "\n", + "When tool selection becomes localized, there are a few things to consider:\n", + "\n", + "\u0007Tools are generally thought of as discrete components within an ecosystem and, therefore,\n", + "\n", + "interchangeable with criteria that are established within a specific tool category. 
The tool with the best\n", + "\n", + "overall score gets selected.\n", + "\n", + "\u0007The selection criteria for a tool usually contains a subjective list of “must-have” features based on\n", + "\n", + "personal preference or adoption within a department, or because a given tool is better suited to support\n", + "\n", + "a current business process\n", + "\n", + "\u0007Discrete tools tend to leapfrog one another and add features based on market demand rather quickly\n", + "\n", + "\u0007Evaluations that are performed over many months likely become outdated by the time the tool has\n", + "\n", + "moved into production\n", + "\n", + "\u0007The “enterprise” requirements are often limited to ensuring that the tool fits into the overall architecture\n", + "\n", + "and security environment but nothing more\n", + "\n", + "\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n", + "\n", + "of tools in play or streamlining the user experience\n", + "\n", + "\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n", + "\n", + "partnership model, the ability to influence the roadmap and professional services support\n", + "\n", + "For these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n", + "\n", + "on selecting a data platform that enables seamless integration with point solutions rather than a suite of\n", + "\n", + "discrete tools that require integration work and may no longer be category leaders over the long haul.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks is a leading data and AI company —\n", + "\n", + "\n", + "Keep in mind that data platforms work well because the vendor took an opinionated point of view of how\n", + "\n", + "data processing, validation and curation should work. It’s the integration between the discrete functions\n", + "\n", + "of the platform that saves time, conserves effort and improves the user experience. Many companies try\n", + "\n", + "to take on the integration of different technology stacks, which increases risk, cost and complexity. The\n", + "\n", + "consequences of not doing the integration properly can be serious — in terms of security, compliance,\n", + "\n", + "efficiency, cost, etc.\n", + "\n", + "\n", + "partly due to the innovations in the [open source](https://databricks.com/product/open-source)\n", + "\n", + "\n", + "So, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\n", + "\n", + "and incorporate your requirements into their platform product roadmap. This will require some give-and-\n", + "\n", + "take from both parties — sometimes calling for an organization to adjust their processes to better fit how\n", + "\n", + "the platform works. There are many instances where a given business process could be simplified or recast\n", + "\n", + "to work with the platform, as is. Sometimes it will require the vendor to add features that support your\n", + "\n", + "processes. The vendor will always be market driven and will want to build features in such a way that they\n", + "\n", + "apply to the broadest set of customers.\n", + "\n", + "The final point to consider is that it takes a substantial amount of time to become an expert user of a given\n", + "\n", + "tool. Users must make a significant investment to learn how the tool works and the most efficient way of\n", + "\n", + "performing their job. 
The more discrete tools in an environment, the more challenging this becomes.\n", + "\n", + "Minimizing the number of tools and their different interfaces, styles of interaction and approach to security\n", + "\n", + "and collaboration helps improve the user experience and decreases time to market.\n", + "\n", + "\n", + "[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\n", + "\n", + "listening to the needs of thousands of customers\n", + "\n", + "and having our engineers work side by side with\n", + "\n", + "customer teams to deliver real business value using\n", + "\n", + "data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified platform, unified personas**\n", + "\n", + "Deploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\n", + "\n", + "data stack — will provide an integrated suite of tools for the full range of personas in your organization,\n", + "\n", + "including business analysts, SQL developers, data engineers and data scientists. You will immediately\n", + "\n", + "increase productivity and reduce risk because you’ll be better able to share the key aspects of data\n", + "\n", + "pipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n", + "\n", + "and deployment. All the work streams function off a single view of the data, and the handoffs between\n", + "\n", + "subsystems are well managed.\n", + "\n", + "Data processing happens in one auditable environment, and the number of copies of data is kept to an\n", + "\n", + "absolute minimum — with each user benefiting from the data assets created by others. Redundant work\n", + "\n", + "is eliminated.\n", + "\n", + "The 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n", + "\n", + "working with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n", + "\n", + "the 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n", + "\n", + "Another challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\n", + "\n", + "differently — for example, changing a string to an integer. This has a cascading effect, and the downstream\n", + "\n", + "consumers must be able to adjust by monitoring the execution and detecting the changes. The data\n", + "\n", + "scientist, in turn, must update and test new models on the new data. Your data platform should make the\n", + "\n", + "detection and remediation easier, not harder.\n", + "\n", + "For the data engineers, their primary focus is extracting data from source systems and moving it into the\n", + "\n", + "new data ecosystem. The data pipeline function can be simplified with a unified data platform because\n", + "\n", + "the programming model and APIs are consistent across programming languages (e.g., Scala, Python). This\n", + "\n", + "results in improved operations and maintenance (O&M). 
The runtime environment is easier to troubleshoot\n", + "\n", + "and debug since the compute layer is consistent, and the logging and auditing associated with the data\n", + "\n", + "processing and data management is centralized and of more value.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Maximize the productivity of your workforce**\n", + "\n", + "Once you have a data platform that brings together your full range of personas, you should focus on the\n", + "\n", + "next step for increasing productivity — namely, self-service environments.\n", + "\n", + "In large organizations, there needs to be a strategy for how solutions are promoted up through the runtime\n", + "\n", + "environments for development, testing and production. These environments need to be nearly identical to\n", + "\n", + "one another — using the same version of software while limiting the number, size and horsepower of the\n", + "\n", + "compute nodes. To the extent possible, development and test should be performed with realistic test/\n", + "\n", + "synthetic data. One strategy to support this is to tap into the flow of production data and siphon off a small\n", + "\n", + "percentage that is then changed in randomized fashion — obfuscating the real data but keeping the same\n", + "\n", + "general shape and range of values.\n", + "\n", + "The **DEV** environment should be accessible to everyone without any organizational red tape. The DEV\n", + "\n", + "environments should be small and controlled with policies that spin them up and tear them down efficiently.\n", + "\n", + "Every aspect of the DEV infrastructure should be treated as ephemeral. Nothing should exist in the\n", + "\n", + "environment that cannot be destroyed and easily rebuilt.\n", + "\n", + "The **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n", + "\n", + "tools — within obvious cost/budget constraints. The use of the TEST environment can be requested by\n", + "\n", + "the developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n", + "\n", + "management.\n", + "\n", + "Moving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n", + "\n", + "developers cannot randomly promote software to run in production. Again, this process should be\n", + "\n", + "strictly governed using a workflow/sign-off approval approach — signed off by management as well.\n", + "\n", + "Many organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n", + "\n", + "deployments.\n", + "\n", + "\n", + "**DEV** **TEST**\n", + "\n", + "**PROD**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 8. Make informed build vs. buy decisions\n", + "\n", + "A key piece of the strategy will involve the decision around which components of the data ecosystem are\n", + "\n", + "built by the in-house engineering team and which components are purchased through a vendor relationship.\n", + "\n", + "There is increased emphasis within engineering teams on taking a “builder” approach. In other words, the\n", + "\n", + "engineering teams prefer to develop their own solutions in-house rather than rely on vendor products.\n", + "\n", + "**Competitive advantage**\n", + "\n", + "This “roll your own’’ approach has some advantages — including being able to establish the overall product\n", + "\n", + "vision, prioritize features and directly allocate the resources to build the software. 
However, it is important to\n", + "\n", + "keep in mind which aspects of your development effort give you the most competitive advantage.\n", + "\n", + "Spend some time working with the data transformation steering committee and other stakeholders to\n", + "\n", + "debate the pros and cons of building out various pieces of the data ecosystem. The primary factor should\n", + "\n", + "come down to whether or not a given solution offers true competitive advantage for the organization. Does\n", + "\n", + "building this piece of software make it harder for your competitors to compete with you? If the answer is no,\n", + "\n", + "then it is better to focus your engineering and data science resources on deriving insights from your data.\n", + "\n", + "**Beware: becoming your own software vendor**\n", + "\n", + "As many engineering leaders know, building your own software is an exciting challenge. However, it does\n", + "\n", + "come with added responsibility — namely, managing the overall project timeline and costs, and being\n", + "\n", + "responsible for the design, implementation, testing, documentation, training, and ongoing maintenance and\n", + "\n", + "updates. You basically are becoming your own software vendor for every component of the ecosystem\n", + "\n", + "that you build yourself. When you consider the cost of a standard-sized team, it is not uncommon to spend\n", + "\n", + "several million dollars per year building out individual component parts of the new data system. This doesn’t\n", + "\n", + "include the cost to operate and maintain the software once it is in production.\n", + "\n", + "\n", + "-----\n", + "\n", + "To offset the anticipated development costs, engineering teams will oftentimes make the argument that\n", + "\n", + "they are starting with open source software and extending it to meet the “unique requirements” of your\n", + "\n", + "organization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n", + "\n", + "unique and b) the development offers the competitive advantage that you need.\n", + "\n", + "Even software built on top of open source still requires significant investment in integration and testing.\n", + "\n", + "The integration work is particularly challenging because of the large number of open source libraries that\n", + "\n", + "are required in the data science space. The question becomes, “Is this really the area that you want your\n", + "\n", + "engineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n", + "\n", + "**How long will it take? Can the organization afford to wait?**\n", + "\n", + "Even if you decide the software component provides a competitive advantage and is something worth\n", + "\n", + "building in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n", + "\n", + "time-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n", + "\n", + "business due to the anticipated delivery schedule. Keep in mind that software development projects usually\n", + "\n", + "take longer and cost more money than initially planned.\n", + "\n", + "The organization should understand the impact to the overall performance and capabilities of the daily\n", + "\n", + "ecosystem for any features tied to the in-house development effort. 
Your business partners likely do\n", + "\n", + "not care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n", + "\n", + "is reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\n", + "\n", + "features and schedule.\n", + "\n", + "\n", + "Databricks is built on top of popular open source\n", + "\n", + "software that it created. Engineering teams can\n", + "\n", + "improve the underpinnings of the Databricks\n", + "\n", + "platform by submitting code via pull request and\n", + "\n", + "becoming committers to the projects. The benefit\n", + "\n", + "to organizations is that their engineers contribute\n", + "\n", + "to the feature set of the data platform while\n", + "\n", + "Databricks remains responsible for all integration\n", + "\n", + "and performance testing plus all the runtime\n", + "\n", + "support, including failover and disaster recovery.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Don’t forget about the data**\n", + "\n", + "Perhaps the single most important feature of a modern data stack is its ability to help make data sets and\n", + "\n", + "“data assets” consumable to the end users or systems. Data insights, model training and model execution\n", + "\n", + "cannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n", + "\n", + "In large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n", + "\n", + "sets from multiple lines of business or departments. Focusing your data engineering and data science\n", + "\n", + "efforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n", + "\n", + "creating true competitive advantage.\n", + "\n", + "The amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n", + "\n", + "serve up data for analysis should not be underestimated. The value of this work is equally important to\n", + "\n", + "the business. The ability to curate data to enable game-changing insights should be the focus of the work\n", + "\n", + "led by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n", + "\n", + "engineers innovate on components that don’t bring true competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 9. Allocate, monitor and optimize costs\n", + "\n", + "Beginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n", + "\n", + "class of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n", + "\n", + "only one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n", + "\n", + "more manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n", + "\n", + "case anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n", + "\n", + "and increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n", + "\n", + "related personas to collaborate and operate from the same point of view. Lessons learned on the platform\n", + "\n", + "could be easily shared and reused by other members of the team. 
The more the team used the unified\n", + "\n", + "platform, the more they collaborated and their level of expertise increased.\n", + "\n", + "**Reduce complexity, reduce costs**\n", + "\n", + "The architectures of enterprise data warehouses (EDWs) and data lakes were either more limited or\n", + "\n", + "more complex — resulting in increased time to market and increased costs. This was mainly due to the\n", + "\n", + "requirement to perform ETL to explore data in the EDW or the need to split data using multiple pipelines\n", + "\n", + "for the data lake. The data lakehouse architecture simplifies the cost allocation because all the processing,\n", + "\n", + "serving and analytics are performed in a single compute layer.\n", + "\n", + "Organizations can rightsize the data environments and control costs using policies. The centralized\n", + "\n", + "and consistent approach to security, auditing and monitoring makes it easier to spot inefficiencies and\n", + "\n", + "bottlenecks in the data ecosystem. Performance improvements can be gained quickly as more platform\n", + "\n", + "expertise is developed within the workforce.\n", + "\n", + "\n", + "The Databricks platform optimizes costs for your\n", + "\n", + "data and AI workloads by intelligently provisioning\n", + "\n", + "infrastructure only as you need it. Customers can\n", + "\n", + "establish policies that govern the size of clusters\n", + "\n", + "based on DEV, TEST, PROD environments or\n", + "\n", + "anticipated workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks monitors and records usage and allows\n", + "\n", + "organizations to easily track costs on a data and\n", + "\n", + "\n", + "**Centralized funding model**\n", + "\n", + "As previously mentioned, data transformation initiatives require substantial funding. Centralizing the budget\n", + "\n", + "under the CDO provides consistency and visibility into how funds are allocated and spent — increasing\n", + "\n", + "the likelihood of a positive ROI. Funding at the beginning of the initiative will be significantly higher than\n", + "\n", + "the funding in the out-years. It’s not uncommon to see 3- to 5-year project plans for larger organizations.\n", + "\n", + "Funding for years 1 and 2 is often reduced in years 3 and 4 and further reduced in year 5 — until it reaches a\n", + "\n", + "steady state that is more sustainable.\n", + "\n", + "\n", + "AI workload basis. This provides the ability to\n", + "\n", + "\n", + "The budget takes into account the cost of the data engineering function, commercial software licenses and\n", + "\n", + "building out the center of excellence to accelerate the data science capabilities of the organization. Again,\n", + "\n", + "the CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are\n", + "\n", + "focused on the overall implementation plan and to make sound build vs. buy decisions.\n", + "\n", + "It’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources\n", + "\n", + "in the CIO’s organization to perform the data engineering tasks. The data science community reports into\n", + "\n", + "the CDO and is matrixed into the lines of business in order to better understand the business drivers and\n", + "\n", + "the data sets. Finally, investing in data governance cannot wait until the company has suffered from a major\n", + "\n", + "regulatory challenge, a data breach or some other serious defense-related problem. 
CDOs should spend\n", + "\n", + "the necessary time to educate leaders throughout the organization on the value of data governance.\n", + "\n", + "\n", + "implement an enterprise-wide chargeback mode\n", + "\n", + "and put in place appropriate spending limits.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Chargeback models**\n", + "\n", + "To establish the centralized budget to fund the data transformation initiative, some organizations impose\n", + "\n", + "a “tax” on each part of the organization — based on size as well as profit and loss. This base-level funding\n", + "\n", + "should be used to build the data engineering and data science teams needed to deploy the building blocks\n", + "\n", + "of the new data ecosystem. However, as different teams, departments and business units begin using the\n", + "\n", + "new data ecosystem, the infrastructure costs, both compute and storage, will begin to grow. The costs will\n", + "\n", + "not be evenly distributed, due to different levels of usage from the various parts of the organization. The\n", + "\n", + "groups with the heavier usage should obviously cover their pro rata share of the costs. This requires the\n", + "\n", + "ability to monitor and track usage — not only based on compute but also on the amount of data generated\n", + "\n", + "and consumed. This so-called chargeback model is an effective and fair way to cover the cost deltas over\n", + "\n", + "and above the base-level funding.\n", + "\n", + "Plus, not all the departments or lines of business will require the same level of compute power or fault\n", + "\n", + "tolerance. The architecture should support the ability to separate out the runtime portions of the data\n", + "\n", + "ecosystem and isolate the workloads based on the specific SLAs for the use cases in each environment.\n", + "\n", + "Some workloads cannot fail and their SLAs will require full redundancy, thus increasing the number of\n", + "\n", + "nodes in the cluster or even requiring multiple clusters operating in different cloud regions. In contrast, less\n", + "\n", + "critical workloads that can fail and be restarted can run on less costly infrastructure. This makes it easier to\n", + "\n", + "better manage the ecosystem by avoiding a one-size-fits-all approach and allocating costs to where the\n", + "\n", + "performance is needed most.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 10. Move to production and scale adoption\n", + "\n", + "Now that you’ve completed the hard work outlined in the first nine steps, it is time to put the new data\n", + "\n", + "ecosystem to use. In order to get truly game-changing results, organizations must be really disciplined at\n", + "\n", + "managing and using data to enable use cases that drive business value. They must also establish a clear\n", + "\n", + "set of metrics to measure adoption and track the net promoter score (NPS) so that the user experience\n", + "\n", + "continues to improve over time.\n", + "\n", + "**If you build it, they will come**\n", + "\n", + "Keep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data\n", + "\n", + "set registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless.\n", + "\n", + "A high level of automation for the registration process is important because it’s not uncommon to see\n", + "\n", + "thousands of data sets in large organizations. 
The business and technical metadata plus the data quality\n", + "\n", + "rules will help guarantee that the data lake is filled with consumable data. The lineage solution should\n", + "\n", + "provide a visualization that shows the data movement and verifies that the approved data flow paths are\n", + "\n", + "being followed.\n", + "\n", + "Some key metrics to keep an eye on are:\n", + "\n", + "\u0007Percentage of source systems contributing data to the ecosystem\n", + "\n", + "\u0007Percentage of real-time streaming relative to API and batch transfers\n", + "\n", + "\u0007Percentage of registered data sets with full business and technical metadata\n", + "\n", + "\u0007Volume of data written to the data lake\n", + "\n", + "\u0007Percentage of raw data that enters a data curation pipeline\n", + "\n", + "\u0007Volume of data consumed from the data lake\n", + "\n", + "\u0007Number of tables defined and populated with curated data\n", + "\n", + "\u0007Number of models trained with data from the data lake\n", + "\n", + "\u0007Lineage reports and anomaly detection incidents\n", + "\n", + "\u0007Number of users running Python, SQL, Scala and R workloads\n", + "\n", + "\n", + "In 2018, Databricks released MLflow — an open\n", + "\n", + "source platform to manage the ML lifecycle,\n", + "\n", + "including experimentation, reproducibility,\n", + "\n", + "deployment and a central model registry. MLflow\n", + "\n", + "is included in the Databricks Lakehouse Platform\n", + "\n", + "and accelerates the adoption of machine learning\n", + "\n", + "and AI in organizations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Communication plan**\n", + "\n", + "Communication is critical throughout the data transformation initiative — however, it is particularly\n", + "\n", + "important once you move into production. Time is precious and you want to avoid rework, if at all possible.\n", + "\n", + "Organizations often overlook the emotional and cultural toll that a long transformation process takes on\n", + "\n", + "the workforce. The seam between the legacy environment and the new data ecosystem is an expensive\n", + "\n", + "and exhausting place to be — because your business partners are busy supporting two data worlds. Most\n", + "\n", + "users just want to know when the new environment will be ready. They don’t want to work with partially\n", + "\n", + "completed features, especially while performing double duty.\n", + "\n", + "Establish a solid communication plan and set expectations for when features will come online. Make sure\n", + "\n", + "there is detailed documentation, training and a support/help desk to field users’ questions.\n", + "\n", + "**DevOps — software development + IT operations**\n", + "\n", + "Mature organizations develop a series of processes and standards for how software and data are developed,\n", + "\n", + "managed and delivered. The term “DevOps” comes from the software engineering world and refers to\n", + "\n", + "developing and operating large-scale software systems. DevOps defines how an organization, its developers,\n", + "\n", + "operations staff and other stakeholders establish the goal of delivering quality software reliably and\n", + "\n", + "repeatedly. In short, DevOps is a culture that consists of two practices: continuous integration (CI) and\n", + "\n", + "continuous delivery (CD).\n", + "\n", + "The CI portion of the process is the practice of frequently integrating newly written or changed code\n", + "\n", + "with the existing code repository. 
As software is written, it is continuously saved back to the source code\n", + "\n", + "repository, merged with other changes, built, integrated and tested — and this should occur frequently\n", + "\n", + "enough that the window between commit and build is narrow enough that no errors can occur without\n", + "\n", + "developers noticing them and correcting them immediately.\n", + "\n", + "This is particularly important for large, distributed teams to ensure that the software is always in a working\n", + "\n", + "state — despite the frequent changes from various developers. Only software that passes the CI steps is\n", + "\n", + "deployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n", + "\n", + "dependable releases.\n", + "\n", + "\n", + "Software development IT operations\n", + "\n", + "\n", + "-----\n", + "\n", + "**DataOps — data processing + IT operations**\n", + "\n", + "DataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n", + "\n", + "use the well-established processes from DevOps to consistently and reliably improve the quality of data\n", + "\n", + "used to power data and AI use cases. DataOps automates and streamlines the lifecycle management tasks\n", + "\n", + "needed for large volumes of data — basically, ensuring that the volume, velocity, variety and veracity of the\n", + "\n", + "data are taken into account as data flows through the environment. DataOps aims to reduce the end-to-\n", + "\n", + "end cycle time of data analytics — from idea, to exploration, to visualizations and to the creation of new\n", + "\n", + "data sets, data assets and models that create value.\n", + "\n", + "For DataOps to be effective, it must encourage collaboration, innovation and reuse among the stakeholders,\n", + "\n", + "and the data tooling should be designed to support the workflow and make all aspects of data curation and\n", + "\n", + "ETL more efficient.\n", + "\n", + "**MLOps — machine learning + IT operations**\n", + "\n", + "Not surprisingly, the term “MLOps” takes the DevOps approach and applies it to the machine learning and\n", + "\n", + "deep learning space — automating or streamlining the core workflow for data scientists. MLOps is a bit\n", + "\n", + "unique when compared with DevOps and DataOps because the approach to deploying effective machine\n", + "\n", + "learning models is far more iterative and requires much more experimentation — data scientists try different\n", + "\n", + "features, parameters and models in a tight iteration cycle. In all these iterations, they must manage the code\n", + "\n", + "base, understand the data used to perform the training and create reproducible results. The logging aspect\n", + "\n", + "of the ML development lifecycle is critical.\n", + "\n", + "MLOps aims to manage deployment of machine learning and deep learning models in large-scale\n", + "\n", + "production environments while also focusing on business and regulatory requirements. 
The ideal MLOps\n", + "\n", + "environment would include data science tools where models are constructed and analytical engines where computations are performed.\n", + "\n", + "\n", + "-----\n", + "\n", + "The overall workflow for deploying production ML models is shown in Figure 10.\n", + "\n", + "Unlike most software applications that execute a series of discrete operations, ML platforms are not deterministic and are highly dependent on the statistical profile of the data they use. ML platforms can suffer performance degradation due to changing data profiles. Therefore, the model has to be refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform should natively support this style of iterative data science.\n", + "\n", + "**Ethics in AI**\n", + "\n", + "As more organizations deploy data and AI solutions, there is growing concern around a number of issues related to ethics — in particular, how do you ensure the data and algorithms used to make decisions are fair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations must ensure that the “black box” algorithms that produce results have the transparency, interpretability and explainability to satisfy legal and regulatory safeguards.\n", + "\n", + "The vast majority of AI work still involves software development by human beings and the use of curated data sets. There is the obvious potential for bias and the application of AI in domains that are ethically questionable. CDOs are faced with the added challenge of needing to be able to defend the use of AI, explain how it works and describe the impact of its existence on the target audience — whether internal workers or customers.\n", + "\n", + "**Figure 10:**\n", + "Workflow for deploying production ML models: data extraction, data analysis, data preparation, model training, model evaluation, model serving and execution, and model monitoring\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data and AI Maturity Model**\n", + "\n", + "When data and AI become part of the fabric of the company and the stakeholders in the organization adopt a data asset and AI mindset, the company moves further along a well-defined maturity curve, as shown in Figure 11.\n", + "\n", + "**Top-Line Categories and Ranking Criteria**\n", + "\n", + "**LOW MATURITY / VALUE** **HIGH MATURITY / VALUE**\n", + "\n", + "1. Explore 2. Experiment 3. Formalize 4. Optimize 5. 
Transform\n", + "\n", + "\n", + "Organization is beginning\n", + "to explore big data and\n", + "AI, and understand the\n", + "possibilities and potential\n", + "of a few starter projects\n", + "and experiment\n", + "\n", + "**Figure 11:**\n", + "The Data and AI Maturity Model\n", + "\n", + "\n", + "Organization builds\n", + "the basic capabilities\n", + "and foundations to\n", + "begin exploring a more\n", + "expansive data and AI\n", + "strategy, but it lacks vision,\n", + "long-term objectives or\n", + "leadership buy-in\n", + "\n", + "\n", + "Data and AI are budding\n", + "into drivers of value for\n", + "BUs aligned to specific\n", + "projects and initiatives as\n", + "the core tenets of data\n", + "and AI are integrated into\n", + "corporate strategy\n", + "\n", + "\n", + "Data and AI are core\n", + "drivers of value across the\n", + "organization, structured\n", + "and central to corporate\n", + "strategy, with a scalable\n", + "architecture that meets\n", + "business needs and buy-in\n", + "from across the organization\n", + "\n", + "\n", + "Data and AI are at the\n", + "heart of the corporate\n", + "strategy and are\n", + "invaluable differentiators\n", + "and drivers of competitive\n", + "advantage\n", + "\n", + "\n", + "Databricks partners with its customers to enable them to do an internal self-assessment. The output of the\n", + "\n", + "self-assessment allows organizations to:\n", + "\n", + "\u0007Understand the current state of their journey to data and AI maturity\n", + "\n", + "\u0007Identify key gaps in realizing (more) value from data and AI\n", + "\n", + "\u0007Plot a path to increase maturity with specific actions\n", + "\n", + "\u0007Identify Databricks resources who can help support their journey\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 3:**\n", + "## Conclusion\n", + "\n", + "\n", + "After a decade in which most enterprises took a hybrid approach to their data architecture — and struggled\n", + "\n", + "with the complexity, cost and compromise that come with supporting both data warehouses and data lakes\n", + "\n", + "— the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical\n", + "\n", + "to future-proofing your investment and enabling data and AI at scale. The simple, open and multicloud\n", + "\n", + "architecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to\n", + "\n", + "unleash the power of your data teams to collaborate like never before — in real time, with all their data, for\n", + "\n", + "every use case.\n", + "\n", + "For more information, please visit [Databricks](https://databricks.com/solutions/roles/data-leaders) or [contact us](https://databricks.com/company/contact) .\n", + "\n", + "**A B O U T T H E A U T H O R**\n", + "\n", + "Chris D’Agostino is the Global Field CTO at Databricks, having joined the company in January 2020. His role\n", + "\n", + "is to provide thought leadership and serve as a trusted advisor to our top customers, globally.\n", + "\n", + "Prior to Databricks, Chris ran a 1,000-person data engineering function for a top 10 U.S. bank. In that role,\n", + "\n", + "he led a team that was responsible for building out a modern data architecture that emphasized the key\n", + "\n", + "attributes of the lakehouse architecture.\n", + "\n", + "Chris has also held leadership roles at a number of technology companies.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. 
More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
### eBook\n", + "\n", + "# A New Approach to Data Sharing\n", + "\n", + "#### Open data sharing and collaboration for data, analytics, and AI\n", + "\n", + "### Second Edition\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents Introduction — Data Sharing in Today’s Digital Economy 4\n", + "\n", + "**Chapter 1: What Is Data Sharing and Why Is It Important?** **5**\n", + "\n", + "Common data sharing use cases 6\n", + "\n", + "Data monetization 6\n", + "\n", + "Data sharing with partners or suppliers (B2B) 6\n", + "\n", + "Internal lines of business (LOBs) sharing 6\n", + "\n", + "Key benefits of data sharing 7\n", + "\n", + "**Chapter 2: Conventional Methods of Data Sharing and Their Challenges** **8**\n", + "\n", + "Legacy and homegrown solutions 9\n", + "\n", + "Proprietary vendor solutions 11\n", + "\n", + "Cloud object storage 13\n", + "\n", + "**Chapter 3: Delta Sharing — An Open Standard for Secure Sharing of Data Assets** **14**\n", + "\n", + "What is Delta Sharing? 14\n", + "\n", + "Key benefits of Delta Sharing 16\n", + "\n", + "Maximizing value of data with Delta Sharing 18\n", + "\n", + "Data monetization with Delta Sharing 19\n", + "\n", + "B2B sharing with Delta Sharing 21\n", + "\n", + "Internal data sharing with Delta Sharing 23\n", + "\n", + "**Chapter 4: How Delta Sharing Works** **26**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Chapter 5: Introducing Databricks Marketplace** **28**\n", + "## Contents\n", + "\n", + "What is Databricks Marketplace? 30\n", + "\n", + "Key benefits of Databricks Marketplace 30\n", + "\n", + "Enable collaboration and accelerate innovation 32\n", + "\n", + "Powered by a fast, growing ecosystem 32\n", + "\n", + "Use cases for an open marketplace 32\n", + "\n", + "New upcoming feature: AI model sharing 33\n", + "\n", + "**Chapter 6: Share securely with Databricks Clean Rooms** **34**\n", + "\n", + "What is a data clean room? 34\n", + "\n", + "Common data clean room use cases 36\n", + "\n", + "Shortcomings of existing data clean rooms 38\n", + "\n", + "Key benefits of Databricks Clean Rooms 39\n", + "\n", + "**Resources: Getting started with Data Sharing and Collaboration** **40**\n", + "\n", + "**About the Authors** **42**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + " Data Sharing in Today’s Digital Economy\n", + "\n", + "\n", + "Today’s economy revolves around data. Everyday, more and more\n", + "\n", + "organizations must exchange data with their customers, suppliers\n", + "\n", + "and partners. Security is critical. And yet, efficiency and immediate\n", + "\n", + "accessibility are equally important.\n", + "\n", + "Where data sharing may have been considered optional, it’s now\n", + "\n", + "required. More organizations are investing in streamlining internal\n", + "\n", + "and external data sharing across the value chain. But they still face\n", + "\n", + "major roadblocks — from human inhibition to legacy solutions to\n", + "\n", + "vendor lock-in.\n", + "\n", + "To be truly data-driven, organizations need a better way to share\n", + "\n", + "data. 
[Gartner predicts that by 2024](https://www.gartner.com/en/documents/3999501) , organizations that promote\n", + "\n", + "data sharing will outperform their peers on most business value\n", + "\n", + "\n", + "who have successfully executed data sharing initiatives are 1.7x\n", + "\n", + "more effective in showing business value and return on investment\n", + "\n", + "from their data analytics strategy.\n", + "\n", + "To compete in the digital economy, organizations need an open —\n", + "\n", + "and secure — approach to data sharing.\n", + "\n", + "This eBook takes a deep dive into the modern era of data sharing\n", + "\n", + "and collaboration, from common use cases and key benefits to\n", + "\n", + "conventional approaches and the challenges of those methods.\n", + "\n", + "You’ll get an overview of our open approach to data sharing and find\n", + "\n", + "out how Databricks allows you to share your data across platforms,\n", + "\n", + "to share all your data and AI, and to share all your data securely with\n", + "\n", + "unified governance in a privacy-safe way.\n", + "\n", + "\n", + "metrics. In addition, Gartner recently found that Chief Data Officers\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 1\n", + " What Is Data Sharing and Why Is It Important?\n", + "\n", + "Data sharing is the ability to make the same data available to one or many stakeholders — both external\n", + "\n", + "and internal. Nowadays, the ever-growing amount of data has become a strategic asset for any company.\n", + "\n", + "Data sharing — within your organization or externally — is an enabling technology for data commercialization\n", + "\n", + "and enhanced analysis. Sharing data as well as consuming data from external sources allows companies\n", + "\n", + "to collaborate with partners, establish new partnerships and generate new revenue streams with data\n", + "\n", + "monetization. Data sharing can deliver benefits to business groups across the enterprise. For those business\n", + "\n", + "groups, data sharing can enable access to data needed to make critical decisions. This includes but is not\n", + "\n", + "limited to roles such as the data analyst, data scientist and data engineer.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Common data sharing use cases\n", + "\n", + "\n", + "#### Data\n", + " monetization\n", + "\n", + "Companies across industries are commercializing\n", + "\n", + "data. Large multinational organizations have\n", + "\n", + "formed exclusively to monetize data, while other\n", + "\n", + "organizations are looking for ways to monetize\n", + "\n", + "their data and generate additional revenue\n", + "\n", + "streams. Examples of these companies can\n", + "\n", + "range from an agency with an identity graph to a\n", + "\n", + "telecommunication company with proprietary 5G\n", + "\n", + "data or to retailers that have a unique ability to\n", + "\n", + "combine online and offline data. Data vendors are\n", + "\n", + "growing in importance as companies realize they\n", + "\n", + "need external data for better decision-making.\n", + "\n", + "\n", + "#### Data sharing with partners\n", + " or suppliers (B2B)\n", + "\n", + "Many companies now strive to share data with\n", + "\n", + "partners and suppliers as similarly as they share\n", + "\n", + "it across their own organizations. 
For example,\n", + "\n", + "retailers and their suppliers continue to work more\n", + "\n", + "closely together as they seek to keep their products\n", + "\n", + "moving in an era of ever-changing consumer tastes.\n", + "\n", + "Retailers can keep suppliers posted by sharing sales\n", + "\n", + "data by SKU in real time, while suppliers can share\n", + "\n", + "real-time inventory data with retailers so they know\n", + "\n", + "what to expect. Scientific research organizations\n", + "\n", + "can make their data available to pharmaceutical\n", + "\n", + "companies engaged in drug discovery. Public safety\n", + "\n", + "agencies can provide real-time public data feeds\n", + "\n", + "of environmental data, such as climate change\n", + "\n", + "statistics or updates on potential volcanic eruptions.\n", + "\n", + "\n", + "#### Internal lines of business\n", + " (LOBs) sharing\n", + "\n", + "Within any company, different departments, lines\n", + "\n", + "of business and subsidiaries seek to share data so\n", + "\n", + "everyone can make decisions based on a complete\n", + "\n", + "view of the current business reality. For example,\n", + "\n", + "finance and HR departments need to share data\n", + "\n", + "as they analyze the true costs of each employee.\n", + "\n", + "Marketing and sales teams need a common view\n", + "\n", + "of data to determine the effectiveness of recent\n", + "\n", + "marketing campaigns. And different subsidiaries\n", + "\n", + "of the same company need a unified view of the\n", + "\n", + "health of the business. Removing data silos — which\n", + "\n", + "are often established for the important purpose of\n", + "\n", + "preventing unauthorized access to data — is critical\n", + "\n", + "for digital transformation initiatives and maximizing\n", + "\n", + "the business value of data.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key benefits of data sharing\n", + "\n", + "As you can see from the use cases described above, there are many benefits of data sharing, including:\n", + "\n", + "\n", + "**Greater collaboration with existing partners.** In today’s hyper-\n", + "\n", + "connected digital economy, no single organization can advance its\n", + "\n", + "business objectives without partnerships. Data sharing helps solidify\n", + "\n", + "existing partnerships and can help organizations establish new ones.\n", + "\n", + "\u0007 **Ability to generate new revenue streams.** With data sharing,\n", + "\n", + "organizations can generate new revenue streams by offering data\n", + "\n", + "products or data services to their end consumers.\n", + "\n", + "\n", + "**Ease of producing new products, services or business models.**\n", + "\n", + "Product teams can leverage both first-party data and third-party\n", + "\n", + "data to refine their products and services and expand their product/\n", + "\n", + "service catalog.\n", + "\n", + "**Greater efficiency of internal operations.** Teams across the\n", + "\n", + "organization can meet their business goals far more quickly when\n", + "\n", + "they don’t have to spend time figuring out how to free data from\n", + "\n", + "silos. When teams have access to live data, there’s no lag time\n", + "\n", + "between the need for data and the connection with the appropriate\n", + "\n", + "data source.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 2\n", + " Conventional Methods of Data Sharing and Their Challenges\n", + "\n", + "Sharing data across different platforms, companies and clouds is no easy task. 
In the past,\n", + "\n", + "organizations have hesitated to share data more freely because of the perceived lack of secure technology, competitive concerns and the cost of implementing data sharing solutions.\n", + "\n", + "Even for companies that have the budget to implement data sharing technology, many of the current approaches can’t keep up with today’s requirements for open-format, multicloud, high-performance solutions. Most data sharing solutions are tied to a single vendor, which creates friction for data providers and data consumers who use non-compatible platforms.\n", + "\n", + "Over the past 30 years, data sharing solutions have come in three forms: legacy and homegrown solutions, cloud object storage and closed source commercial solutions. Each of these approaches comes with its pros and cons.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Legacy and homegrown solutions\n", + "\n", + "Many companies have built homegrown data sharing solutions based on legacy technologies such as email, (S)FTP or APIs.\n", + "\n", + "**Figure 1:**\n", + "Legacy data sharing solutions — the provider extracts batch data (Table 1, Table 2) via ETL onto an FTP/SSH/API server; the consumer then runs its own ETL into a database where an analyst runs the analysis\n", + "\n", + "\n", + "**Pros**\n", + "\n", + "\u0007 **Vendor agnostic.** FTP, email and APIs are all well-documented protocols. Data consumers can leverage a suite of clients to access data provided to them.\n", + "\n", + "\u0007 **Flexibility.** Many homegrown solutions are built on open source technologies and will work both on-prem and on clouds.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Cons**\n", + "\n", + "\u0007 **Data movement.** It takes significant effort to extract data from cloud storage, transform it and host it on an FTP server for different recipients. Additionally, this approach results in creating copies of data sets. Data copying causes duplication and prevents organizations from instantly accessing live data.\n", + "\n", + "\u0007 **Complexity of sharing data.** Homegrown solutions are typically built on complex architectures due to replication and provisioning. This can add considerable time to data sharing activities and result in out-of-date data for end consumers.\n", + "\n", + "\u0007 **Operational overhead for data recipients.** Data recipients have to extract, transform and load (ETL) the shared data for their end use cases, which further delays the time to insights. 
For any new data updates from the providers, the consumers have to rerun ETL pipelines again and again.\n", + "\n", + "\u0007 **Security and governance.** As modern data requirements become more stringent, homegrown and legacy technologies have become more difficult to secure and govern.\n", + "\n", + "\u0007 **Scalability.** Such solutions are costly to manage and maintain and don’t scale to accommodate large data sets.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Proprietary vendor solutions\n", + "\n", + "Commercial data sharing solutions are a popular option among companies that don’t want to devote the time and resources to building an in-house solution yet also want more control than what cloud object storage can offer.\n", + "\n", + "**Figure 2:**\n", + "Proprietary vendor solutions — each vendor platform stores shared data sets in its own proprietary data format, so sharing is limited to data providers and data consumers on the same platform, with no cross-platform sharing\n", + "\n", + "\n", + "**Pros**\n", + "\n", + "\u0007 **Simplicity.** Commercial solutions allow users to share data easily with anyone else who uses the same platform.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Cons**\n", + "\n", + "\u0007 **Vendor lock-in.** Commercial solutions don’t interoperate well with other platforms. While data sharing is easy among fellow customers, it’s usually impossible with those who use competing solutions. This reduces the reach of data, resulting in vendor lock-in. Furthermore, platform differences between data providers and recipients introduce data sharing complexities.\n", + "\n", + "\u0007 **Data movement.** Data must be loaded onto the platform, requiring additional ETL and data copies.\n", + "\n", + "\u0007 **Scalability.** Commercial data sharing comes with scaling limits from the vendors.\n", + "\n", + "\u0007 **Cost.** All the above challenges create additional cost for sharing data with potential consumers, as data providers have to replicate data for different recipients on different cloud platforms.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Cloud object storage\n", + "\n", + "\n", + "**Cons**\n", + "\n", + "\u0007 **Limited to a single cloud provider.** Recipients have to be on the same cloud to access the objects.\n", + "\n", + "\u0007 **Cumbersome security and governance.** Assigning permissions and managing access is complex. 
Custom application logic is\n", + "\n", + "needed to generate signed URLs.\n", + "\n", + "\u0007 **Complexity.** Personas managing data sharing (DBAs, analysts)\n", + "\n", + "find it difficult to understand Identity Access Management\n", + "\n", + "(IAM) policies and how data is mapped to underlying files. For\n", + "\n", + "companies with large volumes of data, sharing via cloud storage\n", + "\n", + "is time-consuming, cumbersome and nearly impossible to scale.\n", + "\n", + "\u0007 **Operational overhead for data recipients.** The data recipients\n", + "\n", + "have to run extract, transform and load (ETL) pipelines on the\n", + "\n", + "raw files before consuming them for their end use cases.\n", + "\n", + "The lack of a comprehensive solution makes it challenging for data\n", + "\n", + "providers and consumers to easily share data. Cumbersome and\n", + "\n", + "incomplete data sharing processes also constrain the development\n", + "\n", + "of business opportunities from shared data.\n", + "\n", + "\n", + "Object storage is considered a good fit for the cloud because it is\n", + "\n", + "elastic and can more easily scale into multiple petabytes to support\n", + "\n", + "unlimited data growth. The big three cloud providers all offer object\n", + "\n", + "storage services (AWS S3, Azure Blob, Google Cloud Storage) that\n", + "\n", + "are cheap, scalable and extremely reliable.\n", + "\n", + "An interesting feature of cloud object storage is the ability to\n", + "\n", + "generate signed URLs, which grant time-limited permission to\n", + "\n", + "download objects. Anyone who receives the presigned URL can\n", + "\n", + "then access the specified objects, making this a convenient\n", + "\n", + "way to share data.\n", + "\n", + "**Pros**\n", + "\n", + "\u0007 **Sharing data in place.** Object storage can be shared in place,\n", + "\n", + "allowing consumers to access the latest available data.\n", + "\n", + "\u0007 **Scalability.** Cloud object storage profits from availability and\n", + "\n", + "durability guarantees that typically cannot be achieved\n", + "\n", + "on-premises. Data consumers retrieve data directly from the\n", + "\n", + "cloud providers, saving bandwidth for the providers.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 3\n", + " Delta Sharing — An Open Standard for Secure Sharing of Data Assets\n", + "\n", + "\n", + "We believe the future of data sharing should be characterized by\n", + "\n", + "open technology. Data sharing shouldn’t be tied to a proprietary\n", + "\n", + "technology that introduces unnecessary limitations and financial\n", + "\n", + "burdens to the process. It should be readily available to anyone who\n", + "\n", + "wants to share data at scale. This philosophy inspired us to develop\n", + "\n", + "and release a new protocol for sharing data: Delta Sharing.\n", + "\n", + "#### What is Delta Sharing?\n", + "\n", + "Delta Sharing provides an open solution to securely share live data\n", + "\n", + "from your lakehouse to any computing platform. Recipients don’t\n", + "\n", + "\n", + "Data providers can centrally manage, govern, audit and track\n", + "\n", + "usage of the shared data on one platform. 
Delta Sharing is natively\n", + "\n", + "integrated with [Unity Catalog](https://databricks.com/product/unity-catalog) , enabling organizations to centrally manage and audit shared data across organizations and confidently share data assets while meeting security and compliance needs.\n", + "\n", + "With Delta Sharing, organizations can easily share existing large-scale data sets based on the open source formats Apache Parquet and Delta Lake without moving data. Teams gain the flexibility to query, visualize, transform, ingest or enrich shared data with their tools of choice.\n", + "\n", + "\n", + "have to be on the Databricks platform or on the same cloud or a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 3:**\n", + "Delta Sharing — a data provider exposes Delta Lake tables through a Delta Sharing server with access permissions; over the Delta Sharing protocol, data recipients can use any tool for any use case (analytics, BI, data science and many more) on any cloud or on-premises, with no replication, easy management and security\n", + "\n", + "\n", + "Databricks designed Delta Sharing with five goals in mind:\n", + "\n", + "\u0007Provide an open cross-platform sharing solution\n", + "\n", + "\u0007Share live data without copying it to another system\n", + "\n", + "\u0007Support a wide range of clients such as Power BI, Tableau, Apache Spark™, pandas and Java, and provide flexibility to consume data using the tools of choice for BI, machine learning and AI use cases\n", + "\n", + "\u0007Provide strong security, auditing and governance\n", + "\n", + "\u0007Scale to massive structured data sets and also allow sharing of unstructured data and future data derivatives such as ML models, dashboards and notebooks, in addition to tabular data\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key benefits of Delta Sharing\n", + "\n", + "By eliminating the obstacles and shortcomings associated with typical data sharing approaches, Delta Sharing delivers several key benefits, including:\n", + "\n", + "\n", + "**Open cross-platform sharing.** Delta Sharing establishes a new open standard for secure data sharing and supports open source Delta and Apache Parquet formats. Data recipients don’t have to be on the Databricks platform or on the same cloud, as Delta Sharing works across clouds and even from cloud to on-premises setups. 
To\n", + "\n", + "give customers even greater flexibility, Databricks has also released\n", + "\n", + "open source connectors for pandas, Apache Spark, Elixir and\n", + "\n", + "Python, and is working with partners on many more.\n", + "\n", + "\u0007 **Securely share live data without replication.** Most enterprise\n", + "\n", + "\n", + "**Centralized governance.** With Databricks Delta Sharing, data\n", + "\n", + "providers can grant, track, audit and even revoke access to shared\n", + "\n", + "data sets from a single point of enforcement to meet compliance and\n", + "\n", + "other regulatory requirements. Databricks Delta Sharing users get:\n", + "\n", + "\u0007Implementation of Delta Sharing as part of Unity Catalog, the\n", + "\n", + "governance offering for Databricks Lakehouse\n", + "\n", + "\u0007Simple, more secure setup and management of shares\n", + "\n", + "\u0007The ability to create and manage recipients and data shares\n", + "\n", + "\u0007Audit logging captured automatically as part of Unity Catalog\n", + "\n", + "\u0007Direct integration with the rest of the Databricks ecosystem\n", + "\n", + "\u0007No separate compute for providing and managing shares\n", + "\n", + "\n", + "data today is stored in cloud data lakes. Any of these existing data\n", + "\n", + "sets on the provider’s data lake can easily be shared without any\n", + "\n", + "data replication or physical movement of data. Data providers can\n", + "\n", + "update their data sets reliably in real time and provide a fresh and\n", + "\n", + "consistent view of their data to recipients.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Share data products, including AI models, dashboards and**\n", + "\n", + "**notebooks, with greater flexibility.** Data providers can choose\n", + "\n", + "between sharing anentire table or sharing only a version or\n", + "\n", + "specific partitions of a table. However, sharing just tabular data\n", + "\n", + "is not enough to meet today’s consumer demands. Delta Sharing\n", + "\n", + "also supports sharing of non-tabular data and data derivatives\n", + "\n", + "such as data streams, AI models, SQL views and arbitrary files,\n", + "\n", + "enablingincreased collaboration and innovation. Data providers can\n", + "\n", + "build, package and distribute data products including data sets,\n", + "\n", + "AI and notebooks, allowingdata recipients to get insights faster.\n", + "\n", + "Furthermore, this approach promotes and empowers the exchange\n", + "\n", + "of knowledge — not just data — between different organizations.\n", + "\n", + "\n", + "**Share data at a lower cost.** Delta Sharing lowers the cost of\n", + "\n", + "managing and consuming shares for both data providers and\n", + "\n", + "recipients. Providers can share data from their cloud object store\n", + "\n", + "without replicating, thereby reducing the cost of storage. Incontrast,\n", + "\n", + "existing data sharing platforms require data providers to first move\n", + "\n", + "their data into their platform or store data in proprietary formats in\n", + "\n", + "their managed storage, which often costs more and results in data\n", + "\n", + "duplication. With Delta Sharing, data providers don’t need to set\n", + "\n", + "up separate computing environments to share data. 
Consumers\n", + "\n", + "can access shared data directly using their tools of choice without\n", + "\n", + "setting up specific consumption ecosystems, thereby reducing\n", + "\n", + "costs.\n", + "\n", + "\n", + "With Delta Sharing we are able to achieve a truly open marketplace\n", + "\n", + "and truly open ecosystem. In contrast, commercial products are\n", + "\n", + "mostly limited to sharing raw tabular data and cannot be used to\n", + "\n", + "\n", + "share these higher-valued data derivatives.\n", + "\n", + "\n", + "\u0007 **Reduced time-to-value.** Delta Sharing eliminates the need to\n", + "\n", + "set up a new ingestion process to consume data. Data recipients\n", + "\n", + "can directly access the fresh data and query it using tools of their\n", + "\n", + "choice. Recipients can also enrich data with data sets from popular\n", + "\n", + "data providers. The Delta Sharing ecosystem of open source and\n", + "\n", + "commercial partners is growing every day.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Maximizing value of data with Delta Sharing\n", + "\n", + "Delta Sharing is already transforming data sharing activities for companies in a wide range of industries. Given the sheer\n", + "\n", + "variety of data available and the technologies that are emerging, it is hard to anticipate all the possible use cases Delta\n", + "\n", + "Sharing can address. The Delta Sharing approach is to share any data anytime with anyone easily and securely.\n", + "\n", + "In this section we will explore the building blocks of such an approach and the use cases emerging from these.\n", + "\n", + "\n", + "“Delta Sharing helped us streamline our data delivery process\n", + "\n", + "for large data sets. This enables our clients to bring their own\n", + "\n", + "compute environment to read fresh curated data with little-to-\n", + "\n", + "no integration work, and enables us to continue expanding our\n", + "\n", + "catalog of unique, high-quality data products.”\n", + "\n", + "— **William Dague** , Head of Alternative Data, Nasdaq\n", + "\n", + "\n", + "“We recognize that openness of data will play a key role in\n", + "\n", + "achieving Shell’s Carbon Net Zero ambitions. Delta Sharing\n", + "\n", + "provides Shell with a standard, controlled and secure protocol\n", + "\n", + "for sharing vast amounts of data easily with our partners to work\n", + "\n", + "toward these goals without requiring our partners be on the same\n", + "\n", + "data sharing platform.”\n", + "\n", + "— **Bryce Bartmann** , Chief Digital Technology Advisor, Shell\n", + "\n", + "\n", + "“Leveraging the powerful capabilities of Delta Sharing from\n", + "\n", + "\n", + "Databricks enables Pumpjack Dataworks to have a faster\n", + "\n", + "onboarding experience, removing the need for exporting,\n", + "\n", + "importing and remodeling of data, which brings immediate\n", + "\n", + "value to our clients. Faster results yield greater commercial\n", + "\n", + "opportunity for our clients and their partners.”\n", + "\n", + "\n", + "“Data accessibility is a massive consideration for us. 
We believe\n", + "\n", + "that Delta Sharing will simplify data pipelines by enabling us to query fresh data from the place where it lives, and we are not locked into any platform or data format.”\n", + "\n", + "— **Rayne Gaisford** , Global Head of Data Strategy, Jefferies\n", + "\n", + "\n", + "— **Corey Zwart** , Head of Engineering, Pumpjack Dataworks\n", + "\n", + "“As a data company, giving our customers access to our data sets is critical. The Databricks Lakehouse Platform with Delta Sharing really streamlines that process, allowing us to securely reach a much broader user base regardless of cloud or platform.”\n", + "\n", + "— **Felix Cheung** , VP of Engineering, SafeGraph\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data monetization with Delta Sharing\n", + "\n", + "Delta Sharing enables companies to monetize their data products simply and with the necessary governance.\n", + "\n", + "**Figure 4:**\n", + "Data monetization with Delta Sharing — a data vendor entitles various data products through Unity Catalog, with fulfillment and a billing audit log, and shares them read-only via Delta Sharing from cloud storage to data consumers, including non-Databricks customers on any cloud or on-premises\n", + "\n", + "\n", + "-----\n", + "\n", + "With Delta Sharing, a data provider can seamlessly share large data sets and overcome the scalability issues associated with SFTP servers. Data providers can easily expand their data product lines since Delta Sharing doesn’t require you to build a dedicated service for each of your data products like API services would. The company simply grants and manages access to the data recipients instead of replicating the data — thereby reducing complexity and latency. Any data that exits your ELT/ETL pipelines becomes a candidate for a data product. Any data that exists on your platform can be securely shared with your consumers. This grants a wider addressable market — your products have appeal to a broader range of consumers, from those who say “we need access to your raw data only” to those who say “we want only small subsets of your Gold layer data.”\n", + "\n", + "To mitigate cost concerns, Delta Sharing maintains an audit log that tracks any permitted access to the data. 
Data providers can use this information to determine the costs associated with any of the data products and evaluate if such products are commercially viable and sensible.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### B2B sharing with Delta Sharing\n", + "\n", + "**Figure 5:**\n", + "B2B sharing with Delta Sharing — partners share data with one another read-only via Delta Sharing between their own Unity Catalog and cloud storage environments, including partners that are non-Databricks customers on any cloud or on-premises\n", + "\n", + "\n", + "-----\n", + "\n", + "Delta Sharing applies in the case of bidirectional exchange of data. Companies use Delta Sharing to incorporate partners and suppliers seamlessly into their workflows. Traditionally, this is not an easy task. An organization typically has no control over how their partners are implementing their own data platforms. The complexity increases when we consider that the partners and suppliers can reside in a public cloud, private cloud or an on-premises deployed data platform. The choices of platform and architecture are not imposed on your partners and suppliers. Due to its open protocol, Delta Sharing addresses this requirement foundationally. Through a wide array of existing connectors (and many more being implemented), your data can land anywhere your partners and suppliers need to consume it.\n", + "\n", + "\n", + "In addition to the location of data consumer residency, the complexity of data arises as a consideration. The traditional approach to sharing data using APIs is inflexible and imposes additional development cycles on both ends of the exchange in order to implement both the provider pipelines and consumer pipelines. With Delta Sharing, this problem can be abstracted. Data can be shared as soon as it lands in the Delta table and when the shares and grants are defined. There are no implementation costs on the provider side. On the consumer side, data simply needs to be ingested and transformed into an expected schema for the downstream processes.\n", + "\n", + "This means that you can form much more agile data exchange patterns with your partners and suppliers and attain value from your combined data much quicker than ever before.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Internal data sharing with Delta Sharing\n", + "\n", + "Internal data sharing is becoming an increasingly important consideration for any modern organization, particularly where data describing the same concepts have been produced in different ways and in different data silos across the organization. 
In this situation it is important\n", + "\n", + "to design systems and platforms that allow governed and intentional federation of data and processes, and at the same time allow easy and seamless integration of said data and processes.\n", + "\n", + "Architectural design patterns such as Data Mesh have emerged to address these specific challenges and considerations. Data Mesh architecture assumes a federated design and dissemination of ownership and responsibility to business units or divisions. This, in fact, has several advantages, chief among them that data is owned by the parts of the organization closest to the source of the data. Data residence is naturally enforced since data sits within the geo-locality where it has been generated. Finally, data volumes and data variety are kept in control due to the localization within a data domain (or data node). On the other hand, the architecture promotes exchange of data between different data domains when that data is needed to deliver outcomes and better insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 6:**\n", + "Building a Data Mesh with Delta Sharing — business units in different regions each manage their own Unity Catalog and cloud storage and exchange data with one another read-only via Delta Sharing, including business units that are non-Databricks customers on any cloud or on-premises\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog enables consolidated data access control across different data domains within an organization using the Lakehouse on Databricks. In addition, Unity Catalog adds a set of simple and easy-to-use declarative APIs to govern and control data exchange patterns between the data domains in the Data Mesh.\n", + "\n", + "To make matters even more complicated, organizations can grow through mergers and acquisitions. In such cases we cannot assume that organizations being acquired have followed the same set of rules and standards to define their platforms and produce their data. Furthermore, we cannot even assume that they have used the same cloud providers, nor can we assume the complexity of their data models. Delta Sharing can simplify and accelerate the unification and assimilation of newly acquired organizations and their data and processes. Individual organizations can be treated as new data domains in the overarching mesh. Only selected data sources can be exchanged between the different platforms. 
This\n", + "\n", + "enables teams to move freely between the organizations that are merging without losing their data — if anything, they are empowered to drive insights of higher quality by combining the data of both.\n", + "\n", + "With Unity Catalog and Delta Sharing, the Lakehouse architecture seamlessly combines with the Data Mesh architecture to deliver more power than ever before, pushing the boundaries of what’s possible and simplifying activities that were deemed daunting not so long ago.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 4\n", + " How Delta Sharing Works\n", + "\n", + "\n", + "Delta Sharing is designed to be simple, scalable, nonproprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing is natively integrated with Unity Catalog, which enables customers to add fine-grained governance and security controls, making it easy and safe to share data internally or externally.\n", + "\n", + "Delta Sharing is a simple REST protocol that securely grants temporary access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3, Azure ADLS or Google’s GCS — to reliably grant read-only access to large data sets. Here’s how it works for data providers and data recipients.\n", + "\n", + "**Figure 7:**\n", + "How Delta Sharing works connecting data providers and data recipients — the recipient’s Delta Sharing client requests a table from the provider’s Delta Sharing server, which checks access permissions and returns pre-signed short-lived URLs granting temporary direct access to the Parquet files of the Delta Lake table in the object store (AWS S3, GCP, ADLS)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data providers\n", + "\n", + "The data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider decides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages recipient access. To manage shares and recipients, you can use SQL commands, the Unity Catalog CLI or the intuitive user interface.\n", + "\n", + "#### Data recipients\n", + "\n", + "The data recipient only needs one of the many Delta Sharing clients that support the protocol. Databricks has released open source connectors for pandas, Apache Spark, Java and Python, and is working with partners on many more.\n", + "\n", + "\n", + "#### The data exchange\n", + "\n", + "The Delta Sharing data exchange follows three efficient steps:\n", + "\n", + "**1.** \u0007The recipient’s client authenticates to the sharing server and asks to query a specific table. 
The client can also provide filters\n", + "\n", + "on the data (for example, “country=US”) as a hint to read just a\n", + "\n", + "subset of the data.\n", + "\n", + "**2.** \u0007The server verifies whether the client is allowed to access the\n", + "\n", + "data, logs the request, and then determines which data to send\n", + "\n", + "back. This will be a subset of the data objects in cloud storage\n", + "\n", + "systems that make up the table.\n", + "\n", + "**3.** \u0007To allow temporary access to the data, the server generates\n", + "\n", + "short-lived presigned URLs that allow the client to read Parquet\n", + "\n", + "files directly from the cloud provider so that the read-only\n", + "\n", + "access can happen in parallel at massive bandwidth, without\n", + "\n", + "streaming through the sharing server.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 5\n", + " Introducing Databricks Marketplace\n", + "\n", + "\n", + "Enterprises need open collaboration for data and AI. Data sharing\n", + "\n", + "— within an organization or externally — allows companies to\n", + "\n", + "collaborate with partners, establish new partnerships and generate\n", + "\n", + "new revenue streams with data monetization.\n", + "\n", + "The demand for generative AI is driving disruption across industries,\n", + "\n", + "increasing the urgency for technical teams to build generative AI\n", + "\n", + "models and Large Language Models (LLMs) on top of their own data\n", + "\n", + "to differentiate their offerings.\n", + "\n", + "\n", + "Traditional data marketplaces are restricted and offer only data or\n", + "\n", + "simple applications, therefore limiting their value to data consumers.\n", + "\n", + "They also don’t offer tools to evaluate the data assets beyond basic\n", + "\n", + "descriptions or examples. Finally, data delivery is limited, often\n", + "\n", + "requiring ETL or a proprietary delivery mechanism.\n", + "\n", + "Enterprises need a better way to share data and AI that is flexible,\n", + "\n", + "secure and unlocks business value. 
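To make the recipient side of the three-step exchange described in Chapter 4 concrete, here is a minimal sketch using the open source `delta-sharing` Python connector; the profile file name and the share, schema and table names are placeholders, not part of the original eBook.

```python
# Minimal sketch of the recipient-side flow from Chapter 4, using the
# open source `delta-sharing` Python connector (pip install delta-sharing).
# The profile file and share/schema/table names below are placeholders.
import delta_sharing

# Profile file issued by the data provider (sharing endpoint + bearer token).
profile_file = "config.share"

# Discover the tables the provider has shared with this recipient.
client = delta_sharing.SharingClient(profile_file)
print(client.list_all_tables())

# Load one shared table: the connector asks the sharing server for the table,
# receives short-lived pre-signed URLs and reads the Parquet files directly
# from the provider's object store, without streaming through the server.
table_url = f"{profile_file}#my_share.my_schema.my_table"
df = delta_sharing.load_as_pandas(table_url)
print(df.head())
```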
An ecosystem makes data sharing and collaboration powerful.

**Today, data marketplaces present many challenges and collaboration can be complex for both data consumers and data providers.**

**Data Consumers**

- Focus on data only or simple applications
- Lengthy discovery and evaluation
- Delayed time-to-insights with vendor lock-in

**Data Providers**

- Limited opportunities to monetize new types of assets
- Difficulty reaching more users
- Lack of secure technology and unified governance

-----

#### Challenges in today's data marketplaces

**Data Consumers**

**Focus on data only or simple applications:** Accessing only data sets means organizations looking to take advantage of AI/ML need to look elsewhere or start from scratch, causing delays in driving business insights.

**Lengthy discovery and evaluation:** The tools most marketplaces provide for data consumers to evaluate data are simply descriptions and example SQL statements. Minimal evaluation tools mean it takes more time to figure out if a data product is right for you, which might include more time in back-and-forth messages with a provider or searching for a new provider altogether.

**Delayed time-to-insights with vendor lock-in:** Delivery through proprietary sharing technologies or FTP means either vendor lock-in or lengthy ETL processes to get the data where you need to work with it.

**Data Providers**

**Limited opportunities to monetize new types of assets:** A data-only approach means organizations are limited to monetizing anything beyond a data set and will face more friction to create new revenue opportunities with non-compatible platforms.

**Difficulty reaching more users:** Data providers must choose between forgoing potential business or incurring the expense of replicating data.

**Lack of secure technology and unified governance:** Without open standards for sharing data securely across platforms and clouds, data providers must use multiple tools to secure access to scattered data, leading to inconsistent governance.

-----

#### What is Databricks Marketplace?

Databricks Marketplace is an open marketplace for all your data, analytics and AI, powered by Delta Sharing.

Since Marketplace is powered by Delta Sharing, you can benefit from open source flexibility and no vendor lock-in, enabling you to collaborate across all platforms, clouds and regions. This open approach allows you to put your data to work more quickly in every cloud with your tools of choice.

Marketplace brings together a vast ecosystem of data consumers and data providers to collaborate across a wide array of data sets without platform dependencies, complicated ETL, expensive replication and vendor lock-in.

#### Key Benefits of Databricks Marketplace

Databricks Marketplace provides key benefits for both data consumers and data providers.

**Consumers**

- Discover more than just data
- Evaluate data products faster
- Avoid vendor lock-in

**Providers**

- Reach users on any platform
- Monetize more than just data
- Share data securely

-----

#### Databricks Marketplace drives innovation and expands revenue opportunities

##### Data Consumers

For data consumers, the Databricks Marketplace dramatically expands the opportunity to deliver innovation and advance analytics and AI initiatives.

**Discover more than just data:** Access more than just data sets, including AI models, notebooks, applications and solutions.

**Evaluate data products faster:** Pre-built notebooks and sample data help you quickly evaluate and have much greater confidence that a data product is right for your AI or analytics initiatives. Obtain the fastest and simplest time to insight.

**Avoid vendor lock-in:** Substantially reduce the time to deliver insights and avoid lock-in with open and seamless sharing and collaboration across clouds, regions or platforms. Directly integrate with your tools of choice and right where you work.

##### Data Providers

For data providers, the Databricks Marketplace enables them to reach new users and unlock new revenue opportunities.

**Reach users on any platform:** Expand your reach across platforms and access a massive ecosystem beyond walled gardens. Streamline delivery of simple data sharing to any cloud or region, without replication.

**Monetize more than just data:** Monetize the broadest set of data assets, including data sets, notebooks and AI models, to reach more data consumers.

**Share data securely:** Share all your data sets, notebooks, AI models, dashboards and more securely across clouds, regions and data platforms.

-----

#### Enable collaboration and accelerate innovation

#### Powered by a fast, growing ecosystem

Enterprises need open collaboration for data and AI.
In the past few months, we've continued to increase partners across industries, including Retail, Communications and Media & Entertainment, and Financial Services, with 520+ listings you can explore in our open Marketplace from 80+ providers and counting.

#### Use cases for an open marketplace

Organizations across all industries have many use cases for consuming and sharing third-party data, from the simple (dataset joins) to the more advanced (AI notebooks, applications and dashboards).

**Advertising and Retail**

Incorporate shopper behavior analysis | Ads uplift/performance | Demand forecasting | "Next best SKU" prediction | Inventory analysis | Live weather data

**Finance**

Incorporate data from stock exchange to predict economic impact | Market research | Public census and housing data to predict insurance sales

**Healthcare and Life Sciences**

Genomic target identification | Patient risk scoring | Accelerating drug discovery | Commercial effectiveness | Clinical research

For more on Databricks Marketplace, go to [marketplace.databricks.com](http://marketplace.databricks.com), or refer to the Resources section of this eBook.

-----

#### New upcoming feature: AI model sharing

Nowadays, it may seem like every organization wants to become an AI organization. However, most organizations are new to AI. Databricks has heard from customers that they want to discover out-of-the-box AI models on Marketplace to help them kickstart their AI innovation journey.

To meet this demand, Databricks will be adding AI model sharing capabilities on Marketplace to provide users access to both OSS and proprietary (both first- and third-party) AI models. This will enable data consumers and providers to discover and monetize AI models and integrate AI into their data solutions.

Using this feature, data consumers can evaluate AI models with rich previews, including visualizations and pre-built notebooks with sample data. With Databricks Marketplace, there are no difficult data delivery mechanisms — you can get the AI models instantly with the click of a button. All of this works out-of-the-box with the AI capabilities of the Databricks Lakehouse Platform for both real-time and batch inference. For real-time inference, you can use model serving endpoints. For batch inference, you can invoke the models as functions directly from DBSQL or notebooks.

With AI model sharing, Databricks customers will have access to best-in-class models from leading providers, as well as OSS models published by Databricks which can be quickly and securely applied on top of their data.
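As one hedged illustration of the notebook route for batch inference: a model obtained from Marketplace and registered in Unity Catalog can be scored with MLflow. The catalog, schema and model names here are placeholders, and the exact input format depends on the model you install:

```python
import mlflow
import pandas as pd

# Use the Unity Catalog model registry (placeholder model coordinates below).
mlflow.set_registry_uri("databricks-uc")
model = mlflow.pyfunc.load_model("models:/main.marketplace_models.summarizer/1")

# Batch inference over a small DataFrame of documents.
docs = pd.DataFrame({"text": ["First document to summarize.",
                              "Second document to summarize."]})
print(model.predict(docs))
```

For real-time inference, the same registered model can instead be deployed behind a Databricks Model Serving endpoint and invoked over REST.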
Databricks will curate and publish its own open source models across common use cases, such as instruction-following and text summarization, and optimize tuning or deployment of these models.

Using AI models from Databricks Marketplace can help your organization summarize complex information quickly and easily to help accelerate the pace of innovation.

-----

## Chapter 6
 Share securely with Databricks Clean Rooms

While the demand for external data to make data-driven innovations is greater than ever, there is growing concern among organizations around data privacy. The need for organizations to share data and collaborate with their partners and customers in a secure, governed and privacy-centric way is driving the concept of "data clean rooms."

#### What is a data clean room?

A data clean room provides a secure, governed and privacy-safe environment where participants can bring their sensitive data, which might include personally identifiable information (PII), and perform joint analysis on that private data. Participants have full control of the data and can decide which participants can perform what analysis without exposing any sensitive data.

**Figure 8:** Data clean room diagram example for audience overlap analysis in advertising. Collaborator A (e.g., agencies, publishers, MVPDs, retailers) and Collaborator B (e.g., advertisers) each bring their own sensitive data into a secure and privacy-preserving environment to answer questions such as "What is our audience overlap?", "How did my campaign do in terms of reach and frequency?" and "What is the lift in purchases among those in-segment versus those out-of-segment?"

-----

A data clean room is not a new concept. Google introduced the idea in 2017 when it announced Ads Data Hub, which allows advertisers to gain impression-level insights about cross-device media campaigns in a more secure, privacy-safe environment. In the last few years, the demand for clean rooms has accelerated. IDC predicts that by 2024, 65% of G2000 enterprises will form data sharing partnerships with external stakeholders via data clean rooms to increase interdependence while safeguarding data privacy.

There are various compelling needs driving this demand:

**Privacy-first world.** Stringent data privacy regulations such as GDPR and CCPA, along with sweeping changes in third-party measurement, have transformed how organizations collect, use and share data. For example, Apple's [App Tracking Transparency Framework](https://developer.apple.com/app-store/user-privacy-and-data-use/) (ATT) provides users of Apple devices the freedom and flexibility to easily opt out of app tracking. Google also plans to [phase out support for third-party cookies in Chrome](https://blog.google/products/chrome/updated-timeline-privacy-sandbox-milestones/) by late 2024. As these privacy laws and practices evolve, the demand for data clean rooms is likely to rise as the industry moves to new identifiers that are PII based, such as UID 2.0, and organizations try to find new ways to share and join data with customers and partners in a privacy-centric way.

**Collaboration in a fragmented ecosystem.** Today, consumers have more options than ever before when it comes to where, when and how they engage with content. As a result, the digital footprint of consumers is fragmented across different platforms, necessitating that companies collaborate with their partners to create a unified view of their customers' needs and requirements. To facilitate collaboration across organizations, clean rooms provide a secure and private way to combine their data with other data to unlock new insights or capabilities.

**New ways to monetize data.** Most organizations are looking to monetize their data in one form or another. With today's privacy laws, companies will try to find any possible advantages to monetize their data without the risk of breaking privacy rules. This creates an opportunity for data vendors or publishers to join data for big data analytics without having direct access to the data.

-----

#### Common data clean room use cases

#### Category management for retail and consumer goods

Clean rooms enable real-time collaboration between retailers and suppliers, ensuring secure information exchange for demand forecasting, inventory planning and supply chain optimization. This improves product availability, reduces costs and streamlines operations for both parties.

#### Real-world evidence (RWE) for healthcare

Clean rooms provide secure access to sensitive healthcare data sets, allowing collaborators to connect and query multiple sources of data without compromising data privacy. This supports RWE use cases such as regulatory decisions, safety, clinical trial design and observational research.

#### Audience overlap exploration for media and entertainment

By creating a clean room environment, media companies can securely share their audience data with advertisers or other media partners. This allows them to perform in-depth analysis and identify shared audience segments without directly accessing or exposing individual user information.

#### Know Your Customer (KYC) in banking

KYC standards are designed to combat financial fraud, money laundering and terrorism financing.
Clean rooms can be used within a given jurisdiction to allow financial services companies to collaborate and run shared analytics to build a holistic view of a transaction for investigations.

-----

#### Personalization with expanded interests for retailers

Retailers want to target consumers based on past purchases, as well as other purchases with different retailers. Clean rooms enable retailers to augment their knowledge of consumers to suggest new products and services that are relevant to the individual but have not yet been purchased.

#### 5G data monetization for telecom

5G data monetization enables telecoms to capitalize on data from 5G networks. Clean rooms provide a secure environment for collaboration with trusted partners, ensuring privacy while maximizing data value for optimized services, personalized experiences and targeted advertising.

-----

#### Shortcomings of existing data clean rooms

Organizations exploring clean room options are finding some glaring shortcomings in the existing solutions that limit the full potential of the "clean rooms" concept.

First, many existing data clean room vendors require data to be on the same cloud, same region, and/or same data platform. Participants then have to move data into proprietary platforms, which results in lock-in and additional data storage costs.

Second, most existing solutions are not scalable to expand collaboration beyond a few collaborators at a time. For example, an advertiser might want to get a detailed view of their ad performance across different platforms, which requires analysis of the aggregated data from multiple data publishers. With collaboration limited to just a few participants, organizations get partial insights on one clean room platform and end up moving their data to another clean room vendor to aggregate the data, incurring the operational overhead of collating partial insights.

Finally, existing clean room solutions do not provide the flexibility to run arbitrary analysis and are mainly restricted to SQL, a subset of Python, and pre-defined templates. While SQL is absolutely needed for clean rooms, there are times when you require complex computations such as machine learning or integration with APIs where SQL doesn't satisfy the full depth of the technical requirements.

-----

#### Key benefits of Databricks Clean Rooms

Databricks Clean Rooms allow businesses to easily collaborate with their customers and partners in a secure environment on any cloud in a privacy-safe way.
Key benefits of Databricks Clean Rooms include:\n", + "\n", + "\n", + "**Flexible - your language and workload of**\n", + "\n", + "**choice.** Databricks Clean Rooms empower\n", + "\n", + "collaborators to share and join their existing\n", + "\n", + "data and run complex workloads in any\n", + "\n", + "language —Python, R, SQL, Java and Scala —\n", + "\n", + "on the data while maintaining data privacy.\n", + "\n", + "Beyond traditional SQL, users can run arbitrary\n", + "\n", + "workloads and languages, allowing them to train\n", + "\n", + "machine learning models, perform inference\n", + "\n", + "and utilize open-source or third-party privacy-\n", + "\n", + "enhancing technologies. This flexibility enables\n", + "\n", + "data scientists and analysts to achieve more\n", + "\n", + "comprehensive and advanced data analysis\n", + "\n", + "within the secure Clean Room environment.\n", + "\n", + "\n", + "**Scalable, multi-party collaboration.**\n", + "\n", + "With Databricks Clean Rooms, you can\n", + "\n", + "launch a clean room and work with multiple\n", + "\n", + "collaborators at a time. This capability\n", + "\n", + "enables real-time collaboration, fostering\n", + "\n", + "efficient and rapid results. Moreover,\n", + "\n", + "Databricks Clean Rooms seamlessly\n", + "\n", + "integrate with identity service providers,\n", + "\n", + "allowing users to leverage offerings from\n", + "\n", + "these providers during collaboration. The\n", + "\n", + "ability to collaborate with multiple parties\n", + "\n", + "and leverage identity services enhances the\n", + "\n", + "overall data collaboration experience within\n", + "\n", + "Databricks Clean Rooms.\n", + "\n", + "\n", + "**Interoperable - any data source**\n", + "\n", + "**with no replication.** Databricks Clean\n", + "\n", + "Rooms excel in interoperability, ensuring\n", + "\n", + "smooth collaboration across diverse\n", + "\n", + "environments. With Delta Sharing,\n", + "\n", + "collaborators can seamlessly work\n", + "\n", + "together across different cloud providers,\n", + "\n", + "regions and even data platforms without\n", + "\n", + "the need for extensive data movement.\n", + "\n", + "This eliminates data silos and enables\n", + "\n", + "organizations to leverage existing\n", + "\n", + "infrastructure and data ecosystems while\n", + "\n", + "maintaining the utmost security and\n", + "\n", + "compliance.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Resources\n", + " Getting started with Data Sharing and Collaboration\n", + "\n", + "\n", + "Data sharing plays a key role in business processes across the\n", + "\n", + "enterprise, from product development and internal operations to\n", + "\n", + "customer experience and compliance. However, most businesses\n", + "\n", + "have been slow to move forward because of incompatibility\n", + "\n", + "between systems, complexity and security concerns.\n", + "\n", + "Data-driven organizations need an open — and secure — approach\n", + "\n", + "to data sharing.\n", + "\n", + "\n", + "Databricks offers an open approach to data sharing and\n", + "\n", + "collaboration with a variety of tools to:\n", + "\n", + "\u0007 **Share across platforms:** You can share live data sets, as well\n", + "\n", + "as AI models, dashboards and notebooks across platforms,\n", + "\n", + "clouds and regions. 
This open approach is powered by\n", + "\n", + "Delta Sharing, the world’s first open protocol for secure data\n", + "\n", + "sharing, which allows organizations to share data for any use\n", + "\n", + "case, any tool and on any cloud.\n", + "\n", + "\u0007 **Share all your data and AI: Databricks Marketplace** is an\n", + "\n", + "open marketplace for all your data, analytics and AI, enabling\n", + "\n", + "both data consumers and data providers with the ability to\n", + "\n", + "deliver innovation and advance analytics and AI initiatives.\n", + "\n", + "\u0007 **Share securely: Databricks Clean Rooms** allows businesses\n", + "\n", + "to easily collaborate with customers and partners on any\n", + "\n", + "cloud in a privacy-safe way. With Delta Sharing, clean room\n", + "\n", + "participants can securely share data from their data lakes\n", + "\n", + "without any data replication across clouds or regions. Your\n", + "\n", + "data stays with you without vendor lock-in, and you can\n", + "\n", + "centrally audit and monitor the usage of your data.\n", + "\n", + "\n", + "-----\n", + "\n", + "Get started with these products by exploring the resources below.\n", + "\n", + "\n", + "**Delta Sharing**\n", + "\n", + "\u0007 [Data Sharing on Databricks](https://www.databricks.com/product/delta-sharing)\n", + "\n", + "[\u0007Learn about Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog)\n", + "\n", + "[\u0007Blog post: What’s new with Data Sharing and Collaboration on the](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n", + "\n", + "[Lakehouse](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n", + "\n", + "[\u0007Learn about open source Delta Sharing](https://delta.io/sharing/)\n", + "\n", + "[Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "\n", + "**Databricks Marketplace**\n", + "\n", + "[\u0007Learn about Databricks Marketplace](https://www.databricks.com/product/marketplace)\n", + "\n", + "[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n", + "\n", + "[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n", + "\n", + "[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n", + "\n", + "[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n", + "\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n", + "\n", + "\n", + "**Databricks Clean Rooms**\n", + "\n", + "\u0007 [Learn about Databricks Clean Rooms](https://www.databricks.com/product/clean-room)\n", + "\n", + "[\u0007Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[\u0007eBook: The Definitive Guide to Data Clean Rooms](https://www.databricks.com/resources/ebook/market-smarter-data-clean-rooms)\n", + "\n", + "[\u0007Webinar: Unlock the Power of Secure Data 
Collaboration](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n", + "\n", + "[with Clean Rooms](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n", + "\n", + "\n", + "[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n", + "\n", + "\n", + "-----\n", + "\n", + "## About the Authors\n", + "\n", + "\n", + "**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n", + "\n", + "making analytics and AI simple for customers by leveraging the\n", + "\n", + "power of the Databricks Lakehouse Platform. You can reach Vuong\n", + "\n", + "on [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n", + "\n", + "\n", + "**Sachin Thakur** is a Principal Product Marketing Manager on the\n", + "\n", + "Databricks Data Engineering and Analytics team. His area of focus\n", + "\n", + "is data governance with Unity Catalog, and he is passionate about\n", + "\n", + "helping organizations democratize data and AI with the Databricks\n", + "\n", + "Lakehouse Platform. You can reach Sachin on [LinkedIn](https://www.linkedin.com/in/sachin10thakur/) .\n", + "\n", + "\n", + "**Milos Colic** is a Senior Solution Architect at Databricks. His\n", + "\n", + "\n", + "passion is to help customers with their data exchange and data\n", + "\n", + "monetization needs. Furthermore, he is passionate about geospatial\n", + "\n", + "data processing and ESG. You can reach Milos on [LinkedIn](https://www.linkedin.com/in/milos-colic/) .\n", + "\n", + "\n", + "**Jay Bhankharia** is a Senior Director on the Databricks Data\n", + "\n", + "Partnerships team. His passion is to help customers gain insights\n", + "\n", + "from data to use the power of the Databricks Lakehouse Platform\n", + "\n", + "for their analytics needs. You can reach Jay on [LinkedIn](https://www.linkedin.com/in/jay-bhankharia-cfa-b9835612/) .\n", + "\n", + "\n", + "**Itai Weiss** is a Lead Delta Sharing Specialist at Databricks and has\n", + "\n", + "\n", + "over 20 years of experience in helping organizations of any size\n", + "\n", + "build data solutions. He focuses on data monetization and loves to\n", + "\n", + "help customers and businesses get more value from the data they\n", + "\n", + "have. You can reach Itai on [LinkedIn](https://www.linkedin.com/in/itai-weiss/) .\n", + "\n", + "**Somasekar Natarajan** (Som) is a Solution Architect at\n", + "\n", + "Databricks specializing in enterprise data management. Som has\n", + "\n", + "worked with Fortune organizations spanning three continents for\n", + "\n", + "close to two decades with one objective — helping customers to\n", + "\n", + "\n", + "**Giselle Goicochea** is a Senior Product Marketing Manager\n", + "\n", + "on the Databricks Data Engineering and Analytics team. Her area\n", + "\n", + "of focus is data sharing and collaboration with Delta Sharing and\n", + "\n", + "Databricks Marketplace. 
You can reach Giselle on [LinkedIn](https://www.linkedin.com/in/giselle-goicochea/) .\n", + "\n", + "**Kelly Albano** is a Product Marketing Manager on the Databricks\n", + "\n", + "Data Engineering and Analytics team. Her area of focus is security,\n", + "\n", + "compliance and Databricks Clean Rooms. You can reach\n", + "\n", + "Kelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n", + "\n", + "\n", + "harness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "© Databricks 2023 All rights reserved\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
##### The Delta Lake Series Complete Collection

-----

### What is Delta Lake?

[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully compatible with Apache Spark™ APIs.

At Databricks, we've seen how Delta Lake can bring reliability, performance and lifecycle management to data lakes. With Delta Lake, there will be no more malformed data ingestion, difficulties deleting data for compliance, or issues modifying data for data capture.

With Delta Lake, you can accelerate the velocity that high-quality data can get into your data lake and the rate that teams can leverage that data with a secure and scalable cloud service.

In this eBook, the Databricks team has compiled all of their insights into a comprehensive format so that you can gain a full understanding of Delta Lake and its capabilities.

-----

### Contents

**Here's what you'll find inside**

**Chapter 01: Fundamentals and Performance**

- The Fundamentals of Delta Lake: Why Reliability and Performance Matter (challenges with data lakes; Delta Lake's key functionalities)
- Unpacking the Transaction Log (implementing atomicity to ensure operations complete fully; dealing with multiple concurrent reads and writes; time travel, data lineage and debugging)
- How to Use Schema Enforcement and Evolution (understanding table schemas; what schema enforcement and schema evolution are, how they work and how they are useful)
- Delta Lake DML Internals (UPDATE, DELETE and MERGE under the hood; DELETE + VACUUM for cleaning up old data files; UPDATE + time travel for easy debugging; performance tuning tips)
- How Delta Lake Quickly Processes Petabytes With Data Skipping and Z-Ordering (using data skipping and Z-Order clustering; exploring the details)

**Chapter 02: Features**

- Why Use MERGE With Delta Lake? (when upserts are necessary; why upserts into data lakes have traditionally been challenging; introducing MERGE in Delta Lake; simplifying use cases such as deleting data due to GDPR, applying change data from databases, and updating session information from streaming pipelines; how to start using Delta Lake)
- Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs (loading and saving Delta Lake data; in-place conversion to Delta Lake; delete, update and merge the flight data; view table history; travel back in time with table history; clean up old table versions with vacuum)
- Time Travel for Large-Scale Data Lakes (common challenges with changing data; working with Time Travel using a timestamp or a version number, with Scala, Python and SQL syntax; audit data changes; reproduce experiments and reports; rollbacks; pinned view of a continuously updating Delta Lake table across multiple downstream jobs; queries for time series analytics made simple)
- Easily Clone Your Delta Lake for Testing, Sharing and ML Reproducibility (what clones are; shallow and deep clones; where clones help: testing and experimentation with a production table, staging major changes to a production table, machine learning result reproducibility, data migration, data sharing, data archiving; gotchas and how to use clones)
- Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0 (support for SQL DDL commands to define tables in the Hive metastore; create or replace tables; explicitly alter the table schema; support for SQL Insert, Delete, Update and Merge; automatic and incremental Presto/Athena manifest generation; configuring your table through table properties; support for adding user-defined metadata in Delta Lake table commits; other highlights)

**Chapter 03: Lakehouse**

- What Is a Lakehouse? (a lakehouse combines the best elements of data lakes and data warehouses; some early examples; from BI to AI)
- Diving Deep Into the Inner Workings of the Lakehouse and Delta Lake (data lakes; custom storage engines; lakehouse; what the authors explain in the research paper)
- Understanding Delta Engine (scaling execution performance; announcing Delta Engine for high-performance query execution; getting started with Delta Engine)

**Chapter 04: Streaming**

- How Delta Lake Solves Common Pain Points in Streaming (data lake and data warehouse pain points; how Delta Lake on Databricks solves these issues)
- USE CASE #1: Simplifying Streaming Stock Data Analysis Using Delta Lake (implement your streaming stock analysis solution with Delta Lake; analyze streaming stock data in real time)
- USE CASE #2: How Tilting Point Does Streaming Ingestion Into Delta Lake (how data flows and associated challenges; leveraging Structured Streaming with blob store as source and Delta Lake tables as sink)
- USE CASE #3: Building a Quality of Service Analytics Solution for Streaming Video Services (Databricks Quality of Service solution overview; video QoS solution architecture; making your data ready for analytics, including video application events and CDN logs; creating the dashboard / virtual network operation centers; creating (near) real-time alerts; next steps with machine learning such as point-of-failure prediction and remediation and customer churn; getting started with the Databricks streaming video QoS solution)

**Chapter 05: Customer Use Cases**

- Healthdirect Australia Provides Personalized and Secure Online Patient Care With Databricks (data quality and governance issues, silos, and the inability to scale; modernizing analytics with Databricks and Delta Lake; faster data pipelines result in better patient-driven healthcare)
- Comcast Uses Delta Lake and MLflow to Transform the Viewer Experience (infrastructure unable to support data and ML needs; automated infrastructure, faster data pipelines with Delta Lake; delivering personalized experiences with ML)
- Banco Hipotecario Personalizes the Banking Experience With Data and ML (legacy analytics tools are slow, rigid and impossible to scale; a unified platform powers the data lake and easy collaboration; an efficient team maximizes customer acquisition and retention)
- Viacom18 Migrates From Hadoop to Databricks to Deliver More Engaging Experiences (growth in subscribers and terabytes of viewing data push Hadoop to its limits; rapid data processing for analytics and ML with Databricks; leveraging viewer data to power personalized viewing experiences)

-----

**Fundamentals and Performance**
Boost data reliability for machine learning and business intelligence with Delta Lake

## CHAPTER 01

-----

**The Fundamentals of Delta Lake: Why Reliability and Performance Matter**

When it comes to data reliability, performance — the speed at which your programs run — is of utmost importance.
Because of the ACID transactional protections that\n", + "Delta Lake provides, you’re able to get the reliability and performance you need.\n", + "\n", + "With Delta Lake, you can stream and batch concurrently, perform CRUD operations,\n", + "and save money because you’re now using fewer VMs. It’s easier to maintain your data\n", + "engineering pipelines by taking advantage of streaming, even for batch jobs.\n", + "\n", + "Delta Lake is a storage layer that brings reliability to your data lakes built on HDFS and\n", + "cloud object storage by providing ACID transactions through optimistic concurrency\n", + "control between writes and snapshot isolation for consistent reads during writes.\n", + "Delta Lake also provides built-in data versioning for easy rollbacks and reproducing\n", + "reports.\n", + "\n", + "In this chapter, we’ll share some of the common challenges with data lakes as well as\n", + "the Delta Lake features that address them.\n", + "\n", + "**Challenges with data lakes**\n", + "Data lakes are a common element within modern data architectures. They serve as a\n", + "central ingestion point for the plethora of data that organizations seek to gather and\n", + "mine. While a good step forward in getting to grips with the range of data, they run\n", + "into the following common problems:\n", + "\n", + "\n", + "-----\n", + "\n", + "**1. \u0007Reading and writing into data lakes is not reliable.** Data engineers often run into\n", + "the problem of unsafe writes into data lakes that cause readers to see garbage\n", + "data during writes. They have to build workarounds to ensure readers always see\n", + "consistent data during writes.\n", + "\n", + "**2. \u0007The data quality in data lakes is low.** Dumping unstructured data into a data\n", + "lake is easy, but this comes at the cost of data quality. Without any mechanisms\n", + "for validating schema and the data, data lakes suffer from poor data quality. As a\n", + "consequence, analytics projects that strive to mine this data also fail.\n", + "\n", + "**3. Poor performance with increasing amounts of data.** As the amount of data\n", + "that gets dumped into a data lake increases, the number of files and directories\n", + "also increases. Big data jobs and query engines that process the data spend a\n", + "significant amount of time handling the metadata operations. This problem is more\n", + "pronounced in the case of streaming jobs or handling many concurrent batch jobs.\n", + "\n", + "**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\n", + "build complicated pipelines to read entire partitions or tables, modify the data and\n", + "write them back. Such pipelines are inefficient and hard to maintain.\n", + "\n", + "Because of these challenges, many big data projects fail to deliver on their vision or\n", + "sometimes just fail altogether. We need a solution that enables data practitioners to\n", + "make use of their existing data lakes, while ensuring data quality.\n", + "\n", + "**Delta Lake’s key functionalities**\n", + "Delta Lake addresses the above problems to simplify how you build your data lakes.\n", + "Delta Lake offers the following key functionalities:\n", + "\n", + "**• ACID transactions:** Delta Lake provides ACID transactions between multiple\n", + "writes. Every write is a transaction, and there is a serial order for writes recorded in\n", + "a transaction log. 
The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n", + "\n", + "\n", + "-----\n", + "\n", + "[concurrency control](https://en.wikipedia.org/wiki/Optimistic_concurrency_control) , which is ideally suited for data lakes since multiple writes\n", + "trying to modify the same files don’t happen that often. In scenarios where\n", + "there is a conflict, Delta Lake throws a concurrent modification exception for\n", + "users to handle them and retry their jobs. Delta Lake also offers the highest level\n", + "of isolation possible ( [serializable isolation](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable) ) that allows engineers to continuously\n", + "keep writing to a directory or table and consumers to keep reading from the same\n", + "directory or table. Readers will see the latest snapshot that existed at the time the\n", + "reading started.\n", + "\n", + "**• \u0007Schema management:** Delta Lake automatically validates that the schema of the\n", + "DataFrame being written is compatible with the schema of the table. Columns that\n", + "are present in the table but not in the DataFrame are set to null. If there are extra\n", + "columns in the DataFrame that are not present in the table, this operation throws\n", + "an exception. Delta Lake has DDL to add new columns explicitly and the ability to\n", + "update the schema automatically.\n", + "\n", + "**• \u0007Scalable metadata handling:** Delta Lake stores the metadata information of\n", + "a table or directory in the transaction log instead of the metastore. This allows\n", + "Delta Lake to list files in large directories in constant time and be efficient while\n", + "reading data.\n", + "\n", + "**• Data versioning and time travel:** Delta Lake allows users to read a previous\n", + "snapshot of the table or directory. When files are modified during writes, Delta\n", + "Lake creates newer versions of the files and preserves the older versions. When\n", + "\n", + "\n", + "users want to read the older versions of the table or directory, they can provide\n", + "a timestamp or a version number to Apache Spark’s read APIs, and Delta Lake\n", + "constructs the full snapshot as of that timestamp or version based on the\n", + "information in the transaction log. This allows users to reproduce experiments and\n", + "reports and also revert a table to its older versions, if needed.\n", + "\n", + "**• Unified batch and streaming sink:** Apart from batch writes, Delta Lake can also\n", + "be used as an efficient streaming sink with [Apache Spark’s structured streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) .\n", + "Combined with ACID transactions and scalable metadata handling, the efficient\n", + "streaming sink enables lots of near real-time analytics use cases without having to\n", + "maintain a complicated streaming and batch pipeline.\n", + "\n", + "**• Record update and deletion:** Delta Lake will support merge, update and delete\n", + "DML commands. This allows engineers to easily upsert and delete records in data\n", + "lakes and simplify their change data capture and GDPR use cases. Since Delta Lake\n", + "tracks and modifies data at file-level granularity, it is much more efficient than\n", + "reading and overwriting entire partitions or tables.\n", + "\n", + "**• Data expectations (coming soon):** Delta Lake will also support a new API to set\n", + "data expectations on tables or directories. 
Engineers will be able to specify a\n", + "boolean condition and tune the severity to handle data expectations. When Apache\n", + "Spark jobs write to the table or directory, Delta Lake will automatically validate\n", + "the records and when there is a violation, it will handle the records based on the\n", + "severity provided.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unpacking the**\n", + "**Transaction Log**\n", + "\n", + "The transaction log is key to understanding Delta Lake because it is the common thread\n", + "that runs through many of its most important features, including ACID transactions,\n", + "scalable metadata handling, time travel and more. The Delta Lake transaction log is\n", + "an ordered record of every transaction that has ever been performed on a Delta Lake\n", + "table since its inception.\n", + "\n", + "Delta Lake is built on top of [Apache Spark](https://databricks.com/spark/about) to allow multiple readers and writers of a\n", + "given table to work on the table at the same time. To show users correct views of the\n", + "data at all times, the transaction log serves as a single source of truth: the central\n", + "repository that tracks all changes that users make to the table.\n", + "\n", + "When a user reads a Delta Lake table for the first time or runs a new query on an\n", + "open table that has been modified since the last time it was read, Spark checks the\n", + "transaction log to see what new transactions are posted to the table. Then, Spark\n", + "updates the end user’s table with those new changes. This ensures that a user’s\n", + "version of a table is always synchronized with the master record as of the most recent\n", + "query and that users cannot make divergent, conflicting changes to a table.\n", + "\n", + "In this chapter, we’ll explore how the Delta Lake transaction log offers an elegant\n", + "solution to the problem of multiple concurrent reads and writes.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Implementing atomicity to ensure**\n", + "**operations complete fully**\n", + "Atomicity is one of the four properties of ACID transactions that guarantees that\n", + "operations (like an INSERT or UPDATE) performed on your [data lake](https://databricks.com/glossary/data-lake) either complete\n", + "fully or don’t complete at all. Without this property, it’s far too easy for a hardware\n", + "failure or a software bug to cause data to be only partially written to a table, resulting\n", + "in messy or corrupted data.\n", + "\n", + "The transaction log is the mechanism through which Delta Lake is able to offer\n", + "the guarantee of atomicity. For all intents and purposes, if it’s not recorded in the\n", + "transaction log, it never happened. By only recording transactions that execute fully\n", + "and completely, and using that record as the single source of truth, the Delta Lake\n", + "transaction log allows users to reason about their data and have peace of mind about\n", + "its fundamental trustworthiness, at petabyte scale.\n", + "\n", + "**Dealing with multiple concurrent reads and writes**\n", + "But how does Delta Lake deal with multiple concurrent reads and writes? Since Delta\n", + "Lake is powered by Apache Spark, it’s not only possible for multiple users to modify a\n", + "\n", + "\n", + "table at once — it’s expected. 
To handle these situations, Delta Lake employs **optimistic**\n", + "**concurrency control** .\n", + "\n", + "Optimistic concurrency control is a method of dealing with concurrent transactions\n", + "that assumes the changes made to a table by different users can complete without\n", + "conflicting with one another. It is incredibly fast because when dealing with petabytes\n", + "of data, there’s a high likelihood that users will be working on different parts of the data\n", + "altogether, allowing them to complete non-conflicting transactions simultaneously.\n", + "\n", + "Of course, even with optimistic concurrency control, sometimes users do try to\n", + "modify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\n", + "for that. Delta Lake handles these cases by implementing a rule of mutual exclusion,\n", + "then it attempts to solve any conflict optimistically.\n", + "\n", + "This protocol allows Delta Lake to deliver on the ACID principle of isolation, which\n", + "ensures that the resulting state of the table after multiple, concurrent writes is the\n", + "same as if those writes had occurred serially, in isolation from one another.\n", + "\n", + "\n", + "-----\n", + "\n", + "As all the transactions made on Delta Lake tables are stored directly to disk, this\n", + "process satisfies the ACID property of durability, meaning it will persist even in the\n", + "event of system failure.\n", + "\n", + "**Time travel, data lineage and debugging**\n", + "Every table is the result of the sum total of all the commits recorded in the Delta Lake\n", + "transaction log — no more and no less. The transaction log provides a step-by-step\n", + "instruction guide, detailing exactly how to get from the table’s original state to its\n", + "current state.\n", + "\n", + "Therefore, we can recreate the state of a table at any point in time by starting with\n", + "an original table, and processing only commits made after that point. This powerful\n", + "ability is known as “time travel,” or data versioning, and can be a lifesaver in any number\n", + "\n", + "\n", + "of situations. For more information, please refer to [Introducing Delta Time Travel for](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n", + "[Large-Scale Data Lakes](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) and [Getting Data Ready for Data Science With Delta Lake and](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n", + "[MLflow.](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n", + "\n", + "As the definitive record of every change ever made to a table, the Delta Lake\n", + "transaction log offers users a verifiable data lineage that is useful for governance,\n", + "audit and compliance purposes. It can also be used to trace the origin of an\n", + "inadvertent change or a bug in a pipeline back to the exact action that caused it. 
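That history is directly queryable from Spark. A minimal sketch with PySpark, assuming a Delta table already exists at the placeholder path /tmp/delta/events:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
table_path = "/tmp/delta/events"  # placeholder path for an existing Delta table

# Time travel: read the table as it was at an earlier version ...
v0 = spark.read.format("delta").option("versionAsOf", 0).load(table_path)

# ... or as of a timestamp.
snapshot = (spark.read.format("delta")
            .option("timestampAsOf", "2023-01-01")
            .load(table_path))

# Inspect the commit history recorded in the transaction log.
spark.sql(f"DESCRIBE HISTORY delta.`{table_path}`").show(truncate=False)
```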
Users\n", + "can run the [DESCRIBE HISTORY](https://docs.delta.io/latest/delta-utility.html#describe-history) command to see metadata around the changes\n", + "that were made.\n", + "\n", + "**Want to learn more about Delta Lake’s transaction log?**\n", + "\n", + "Read our blog post > Watch our tech talk >\n", + "\n", + "\n", + "-----\n", + "\n", + "**How to Use Schema**\n", + "**Enforcement and**\n", + "**Evolution**\n", + "\n", + "As business problems and requirements evolve over time, so does the structure of\n", + "your data. With Delta Lake, incorporating new columns or objects is easy; users have\n", + "access to simple semantics to control the schema of their tables. At the same time,\n", + "it is important to call out the importance of schema enforcement to prevent users\n", + "from accidentally polluting their tables with mistakes or garbage data in addition to\n", + "schema evolution, which enables them to automatically add new columns of rich data\n", + "when those columns belong.\n", + "\n", + "**Schema enforcement rejects any new columns or other schema changes that**\n", + "**aren’t compatible with your table.** By setting and upholding these high standards,\n", + "analysts and engineers can trust that their data has the highest levels of integrity and\n", + "can reason about it with clarity, allowing them to make better business decisions.\n", + "\n", + "On the flip side of the coin, schema evolution complements enforcement by making it\n", + "easy for intended schema changes to take place automatically. After all, it shouldn’t\n", + "be hard to add a column.\n", + "\n", + "Schema enforcement is the yin to schema evolution’s yang. When used together,\n", + "these features make it easier than ever to block out the noise and tune in to the signal.\n", + "\n", + "**Understanding table schemas**\n", + "Every DataFrame in Apache Spark contains a schema, a blueprint that defines the\n", + "shape of the data, such as data types and columns, and metadata. With Delta Lake,\n", + "the table’s schema is saved in JSON format inside the transaction log.\n", + "\n", + "\n", + "-----\n", + "\n", + "**What is schema enforcement?**\n", + "Schema enforcement, or schema validation, is a safeguard in Delta Lake that ensures\n", + "data quality by rejecting writes to a table that don’t match the table’s schema.\n", + "\n", + "Like the front-desk manager at a busy restaurant who only accepts reservations, it\n", + "checks to see whether each column of data inserted into the table is on its list of\n", + "expected columns (in other words, whether each one has a “reservation”), and rejects\n", + "any writes with columns that aren’t on the list.\n", + "\n", + "**How does schema enforcement work?**\n", + "Delta Lake uses **schema validation on write,** which means that all new writes to a\n", + "table are checked for compatibility with the target table’s schema at write time. If the\n", + "schema is not compatible, Delta Lake cancels the transaction altogether (no data is\n", + "written), and raises an exception to let the user know about the mismatch.\n", + "\n", + "To determine whether a write to a table is compatible, Delta Lake uses the following\n", + "rules. 
The DataFrame to be written cannot contain:\n", + "\n", + "**• Any additional columns that are not present in the target table’s schema.**\n", + "Conversely, it’s OK if the incoming data doesn’t contain every column in the table —\n", + "those columns will simply be assigned null values.\n", + "\n", + "**• \u0007Column data types that differ from the column data types in the target table.**\n", + "If a target table’s column contains StringType data, but the corresponding column\n", + "in the DataFrame contains IntegerType data, schema enforcement will raise an\n", + "exception and prevent the write operation from taking place.\n", + "\n", + "**• Column names that differ only by case.** This means that you cannot have columns\n", + "such as “Foo” and “foo” defined in the same table. While Spark can be used in case\n", + "sensitive or insensitive (default) mode, Delta Lake is case-preserving but insensitive\n", + "when storing the schema. [Parquet](https://databricks.com/glossary/what-is-parquet) is case sensitive when storing and returning\n", + "column information. To avoid potential mistakes, data corruption or loss issues (which\n", + "we’ve personally experienced at Databricks), we decided to add this restriction.\n", + "\n", + "\n", + "-----\n", + "\n", + "Rather than automatically adding the new columns, Delta Lake enforces the schema,\n", + "and stops the write from occurring. To help identify which column(s) caused the\n", + "mismatch, Spark prints out both schemas in the stack trace for comparison.\n", + "\n", + "**How is schema enforcement useful?**\n", + "Because it’s such a stringent check, schema enforcement is an excellent tool to use\n", + "as a gatekeeper for a clean, fully transformed data set that is ready for production or\n", + "consumption. It’s typically enforced on tables that directly feed:\n", + "\n", + "- Machine learning algorithms\n", + "\n", + "- BI dashboards\n", + "\n", + "- Data analytics and visualization tools\n", + "\n", + "- Any production system requiring highly structured,\n", + "strongly typed, semantic schemas\n", + "\n", + "In order to prepare their data for this final hurdle, many users employ a simple multihop architecture that progressively adds structure to their tables. To learn more, take\n", + "a look at [Productionizing Machine Learning With Delta Lake.](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n", + "\n", + "**What is schema evolution?**\n", + "Schema evolution is a feature that allows users to easily change a table’s current\n", + "schema to accommodate data that is changing over time. Most commonly, it’s used\n", + "when performing an append or overwrite operation, to automatically adapt the\n", + "schema to include one or more new columns.\n", + "\n", + "**How does schema evolution work?**\n", + "Following up on the example from the previous section, developers can\n", + "easily use schema evolution to add the new columns that were previously\n", + "rejected due to a schema mismatch. 
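For reference, the kind of write that schema enforcement rejects might look like the following minimal sketch, where loans and DELTALAKE_SILVER_PATH are the same DataFrame and path used in the mergeSchema example below, and credit_score stands in for any column that is not yet part of the target table:\n", + "\n", + "from pyspark.sql.functions import lit\n", + "\n", + "# Add a column that the target table does not have yet\n", + "\n", + "loans_with_new_column = loans.withColumn('credit_score', lit(None).cast('integer'))\n", + "\n", + "# Without the mergeSchema option, Delta Lake cancels this write and raises an exception describing the mismatch\n", + "\n", + "loans_with_new_column.write.format('delta').mode('append').save(DELTALAKE_SILVER_PATH)\n", + "\n", + "Spark prints both schemas in the stack trace, which is how you would spot credit_score as the offending column.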
Schema evolution is activated by adding .option('mergeSchema', 'true') to your .write or .writeStream Spark command, as shown in the following example.\n", + "\n", + "\n", + "# Add the mergeSchema option\n", + "\n", + "loans.write.format('delta') \\\n", + "\n", + ".option('mergeSchema', 'true') \\\n", + "\n", + ".mode('append') \\\n", + "\n", + ".save(DELTALAKE_SILVER_PATH)\n", + "\n", + "By including the mergeSchema option in your query, any columns that are present in the DataFrame but not in the target table are automatically added to the end of the schema as part of a write transaction. Nested fields can also be added, and these fields will get added to the end of their respective struct columns as well.\n", + "\n", + "Data engineers and scientists can use this option to add new columns (perhaps a newly tracked metric, or a column of this month’s sales figures) to their existing ML production tables without breaking existing models that rely on the old columns.\n", + "\n", + "The following types of schema changes are eligible for schema evolution during table appends or overwrites:\n", + "\n", + "- Adding new columns (this is the most common scenario)\n", + "\n", + "- Changing of data types from NullType → any other type, or upcasts from ByteType → ShortType → IntegerType\n", + "\n", + "Other changes, not eligible for schema evolution, require that the schema and data are overwritten by adding .option('overwriteSchema', 'true'). Those changes include:\n", + "\n", + "- Dropping a column\n", + "\n", + "- Changing an existing column’s data type (in place)\n", + "\n", + "- Renaming column names that differ only by case (e.g., “Foo” and “foo”)\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE) is fully supported, allowing users to perform the following actions on table schemas:\n", + "\n", + "- Adding columns\n", + "\n", + "- Changing column comments\n", + "\n", + "- Setting table properties that define the behavior of the table, such as setting the retention duration of the transaction log\n", + "\n", + "**How is schema evolution useful?**\n", + "Schema evolution can be used anytime you _intend_ to change the schema of your table (as opposed to when you accidentally added columns to your DataFrame that shouldn’t be there). It’s the easiest way to migrate your schema because it automatically adds the correct column names and data types, without having to declare them explicitly.\n", + "\n", + "**Summary**\n", + "Schema enforcement rejects any new columns or other schema changes that aren’t compatible with your table. By setting and upholding these high standards, analysts and engineers can trust that their data has the highest levels of integrity and can reason about it with clarity, allowing them to make better business decisions. On the flip side of the coin, schema evolution complements enforcement by making it easy for intended schema changes to take place automatically. After all, it shouldn’t be hard to add a column.\n", + "\n", + "Schema enforcement is the yin to schema evolution’s yang.
When used together, these features make it easier than ever to block out the noise and tune in to the signal.\n", + "\n", + "**Want to learn more about schema enforcement and evolution?**\n", + "\n", + "Read our blog post > Watch our tech talk >\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake**\n", + "**DML Internals**\n", + "\n", + "Delta Lake supports data manipulation language (DML) commands including UPDATE, DELETE and MERGE. These commands simplify change data capture (CDC), audit and governance, and GDPR/CCPA workflows, among others.\n", + "\n", + "In this chapter, we will demonstrate how to use each of these DML commands, describe what Delta Lake is doing behind the scenes, and offer some performance tuning tips for each one.\n", + "\n", + "**Delta Lake DML: UPDATE**\n", + "You can use the UPDATE operation to selectively update any rows that match a filtering condition, also known as a predicate. The code below demonstrates how to use a predicate as part of an UPDATE statement. Note that Delta Lake offers APIs for Python, Scala and SQL, but for the purposes of this eBook, we’ll include only the SQL code.\n", + "\n", + "-- Update events\n", + "\n", + "UPDATE events SET eventType = 'click' WHERE buttonPress = 1\n", + "\n", + "\n", + "-----\n", + "\n", + "**UPDATE: Under the hood**\n", + "Delta Lake performs an UPDATE on a table in two steps:\n", + "\n", + "1. Find and select the files containing data that match the predicate and, therefore, need to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up this process.\n", + "\n", + "2. Read each matching file into memory, update the relevant rows, and write out the result into a new data file.\n", + "\n", + "Once Delta Lake has executed the UPDATE successfully, it adds a commit in the transaction log indicating that the new data file will be used in place of the old one from now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned” — recorded as a data file that applied to an older version of the table, but not the current version. Delta Lake is able to use it to provide data versioning and time travel.\n", + "\n", + "**UPDATE + Delta Lake time travel = Easy debugging**\n", + "Keeping the old data files turns out to be very useful for debugging because you can use Delta Lake “time travel” to go back and query previous versions of a table at any time. In the event that you update your table incorrectly and want to figure out what happened, you can easily compare two versions of a table to one another to see what has changed.\n", + "\n", + "SELECT * FROM events VERSION AS OF 11\n", + "\n", + "EXCEPT ALL SELECT * FROM events VERSION AS OF 12\n", + "\n", + "**UPDATE: Performance tuning tips**\n", + "The main way to improve the performance of the UPDATE command on Delta Lake is to add more predicates to narrow down the search space.
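For example, reusing the events table from the statement above and the date column that appears in the DELETE example later in this chapter, a narrower version of the same update might look like this sketch (run here through spark.sql, or directly as SQL):\n", + "\n", + "# Narrow the UPDATE with an additional predicate so fewer files need to be scanned and rewritten\n", + "\n", + "spark.sql("UPDATE events SET eventType = 'click' WHERE buttonPress = 1 AND date >= '2017-01-01'")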
The more specific the search, the fewer files Delta Lake needs to scan and/or modify.\n", + "\n", + "**Delta Lake DML: DELETE**\n", + "You can use the DELETE command to selectively delete rows based upon a predicate (filtering condition).\n", + "\n", + "DELETE FROM events WHERE date < '2017-01-01'\n", + "\n", + "\n", + "-----\n", + "\n", + "In the event that you want to revert an accidental DELETE operation, you can use time travel to roll back your table to the way it was.\n", + "\n", + "**DELETE: Under the hood**\n", + "DELETE works just like UPDATE under the hood. Delta Lake makes two scans of the data: The first scan is to identify any data files that contain rows matching the predicate condition. The second scan reads the matching data files into memory, at which point Delta Lake deletes the rows in question before writing out the newly clean data to disk.\n", + "\n", + "After Delta Lake completes a DELETE operation successfully, the old data files are not deleted entirely — they’re still retained on disk, but recorded as “tombstoned” (no longer part of the active table) in the Delta Lake transaction log. Remember, those old files aren’t deleted immediately because you might still need them to time travel back to an earlier version of the table. If you want to delete files older than a certain time period, you can use the VACUUM command.\n", + "\n", + "**DELETE + VACUUM: Cleaning up old data files**\n", + "Running the VACUUM command permanently deletes all data files that are:\n", + "\n", + "1. No longer part of the active table and\n", + "2. Older than the retention threshold, which is seven days by default\n", + "\n", + "Delta Lake does not automatically VACUUM old files — you must run the command yourself, as shown below. If you want to specify a retention period that is different from the default of seven days, you can provide it as a parameter.\n", + "\n", + "from delta.tables import *\n", + "\n", + "# vacuum files older than 30 days (720 hours)\n", + "\n", + "deltaTable.vacuum(720)\n", + "\n", + "\n", + "-----\n", + "\n", + "**DELETE: Performance tuning tips**\n", + "Just like with the UPDATE command, the main way to improve the performance of a DELETE operation on Delta Lake is to add more predicates to narrow down the search space. The Databricks managed version of Delta Lake also features other performance enhancements like improved [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping), the use of bloom filters, and [Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering) (multi-dimensional clustering). [Read more about Z-Order Optimize on Databricks.](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n", + "\n", + "**Delta Lake DML: MERGE**\n", + "The Delta Lake MERGE command allows you to perform upserts, which are a mix of an UPDATE and an INSERT.
To understand upserts, imagine that you have an existing\n", + "table (aka a target table), and a source table that contains a mix of new records and\n", + "updates to existing records.\n", + "\n", + "\n", + "**Here’s how an upsert works:**\n", + "\n", + "- When a record from the source table matches a preexisting record in the target\n", + "table, Delta Lake updates the record.\n", + "\n", + "- When there is no such match, Delta Lake inserts the new record.\n", + "\n", + "The Delta Lake MERGE command greatly simplifies workflows that can be complex\n", + "and cumbersome with other traditional data formats like Parquet. Common scenarios\n", + "where merges/upserts come in handy include change data capture, GDPR/CCPA\n", + "compliance, sessionization, and deduplication of records.\n", + "\n", + "**For more information about upserts, read:**\n", + "\n", + "[Efficient Upserts Into Data Lakes With Databricks Delta](https://databricks.com/blog/2019/03/19/efficient-upserts-into-data-lakes-databricks-delta.html)\n", + "\n", + "[Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs](https://databricks.com/blog/2019/10/03/simple-reliable-upserts-and-deletes-on-delta-lake-tables-using-python-apis.html)\n", + "\n", + "[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**MERGE: Under the hood**\n", + "Delta Lake completes a MERGE in two steps:\n", + "\n", + "1. Perform an inner join between the target table and source table to select all files\n", + "that have matches.\n", + "2. Perform an outer join between the selected files in the target and source tables\n", + "and write out the updated/deleted/inserted data.\n", + "\n", + "The main way that this differs from an UPDATE or a DELETE under the hood is that\n", + "Delta Lake uses joins to complete a MERGE. 
This fact allows us to utilize some unique\n", + "strategies when seeking to improve performance.\n", + "\n", + "**MERGE: Performance tuning tips**\n", + "To improve performance of the MERGE command, you need to determine which of the\n", + "two joins that make up the merge is limiting your speed.\n", + "\n", + "If the inner join is the bottleneck (i.e., finding the files that Delta Lake needs to rewrite\n", + "takes too long), try the following strategies:\n", + "\n", + "- Add more predicates to narrow down the search space.\n", + "\n", + "- Adjust shuffle partitions.\n", + "\n", + "- Adjust broadcast join thresholds.\n", + "\n", + "- Compact the small files in the table if there are lots of them, but don’t compact them\n", + "into files that are too large, since Delta Lake has to copy the entire file to rewrite it.\n", + "\n", + "\n", + "**On Databricks’ managed Delta Lake, use Z-Order optimize to exploit the**\n", + "**locality of updates.**\n", + "\n", + "On the other hand, if the outer join is the bottleneck (i.e., rewriting the actual files\n", + "themselves takes too long), try the strategies below.\n", + "\n", + "- **Adjust shuffle partitions:** Reduce files by enabling automatic repartitioning\n", + "before writes (with Optimized Writes in Databricks Delta Lake).\n", + "\n", + "- **\u0007Adjust broadcast thresholds:** If you’re doing a full outer join, Spark cannot do a\n", + "broadcast join, but if you’re doing a right outer join, Spark can do one, and you can\n", + "adjust the broadcast thresholds as needed.\n", + "\n", + "- **Cache the source table / DataFrame:** Caching the source table can speed up the\n", + "second scan, but be sure not to cache the target table, as this can lead to cache\n", + "coherency issues.\n", + "\n", + "Delta Lake supports DML commands including UPDATE, DELETE and MERGE INTO, which\n", + "greatly simplify the workflow for many common big data operations. In this chapter, we\n", + "demonstrated how to use these commands in Delta Lake, shared information about\n", + "how each one works under the hood, and offered some performance tuning tips.\n", + "\n", + "**Want a deeper dive into DML internals, including snippets of code?**\n", + "\n", + "[Read our blog post >](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake Quickly**\n", + "**Processes Petabytes With**\n", + "**Data Skipping and Z-Ordering**\n", + "\n", + "Delta Lake is capable of sifting through petabytes of data within seconds. Much of this\n", + "speed is owed to two features: (1) data skipping and (2) Z-Ordering.\n", + "\n", + "Combining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the\n", + "amount of data that needs to be scanned to answer selective queries against large\n", + "Delta tables, which typically translates into substantial runtime improvements and\n", + "cost savings.\n", + "\n", + "Using Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud\n", + "data lakes can be queried in a matter of seconds by skipping files not relevant to\n", + "the query. For example, 93.2% of the records in a 504 TB data set were skipped for a\n", + "typical query in a real-world cybersecurity analysis use case, reducing query times by\n", + "up to two orders of magnitude. 
In other words, Delta Lake can speed up your queries by as much as 100x.\n", + "\n", + "**Want to see data skipping and Z-Ordering in action?**\n", + "\n", + "Apple’s Dominique Brezinski and Databricks’ Michael Armbrust demonstrated how to use Delta Lake as a unified solution for data engineering and data science in the context of cybersecurity monitoring and threat response. Watch their keynote speech, [Threat Detection and Response at Scale.](https://databricks.com/session/keynote-from-apple)\n", + "\n", + "\n", + "-----\n", + "\n", + "AND / OR / NOT are also supported as well as “literal op column” predicates.\n", + "\n", + "Even though data skipping kicks in when the above conditions are met, it may not always be effective. But, if there are a few columns that you frequently filter by and want to make sure that’s fast, then you can explicitly optimize your data layout with respect to skipping effectiveness by running the following command:\n", + "\n", + "OPTIMIZE table_name [WHERE partition_filter]\n", + "ZORDER BY (col_name [, …])\n", + "\n", + "**Exploring the details**\n", + "Apart from partition pruning, another common technique that’s used in the data warehousing world, but which Spark currently lacks, is I/O pruning based on [small materialized aggregates](https://dl.acm.org/doi/10.5555/645924.671173). In short, the idea is to keep track of simple statistics such as minimum and maximum values at a certain granularity that are correlated with I/O granularity. And we want to leverage those statistics at query planning time in order to avoid unnecessary I/O.\n", + "\n", + "This is exactly what Delta Lake’s [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) feature is about. As new data is inserted into a Delta Lake table, file-level min/max statistics are collected for all columns (including nested ones) of supported types. Then, when there’s a lookup query against the table, Delta Lake first consults these statistics in order to determine which files can safely be skipped.\n", + "\n", + "**Want to learn more about data skipping and Z-Ordering, including**\n", + "**how to apply it within a cybersecurity analysis?**\n", + "\n", + "[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)\n", + "\n", + "\n", + "**Using data skipping and Z-Order clustering**\n", + "Data skipping and Z-Ordering are used to improve the performance of needle-in-the-haystack queries against huge data sets.
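To make the OPTIMIZE command shown earlier concrete, here is a minimal sketch that reuses the events table and its eventType column from the DML chapter; the column choice is purely illustrative, and OPTIMIZE with ZORDER BY is part of the Databricks managed version of Delta Lake:\n", + "\n", + "# Rewrite the table layout so rows with similar eventType values are co-located,\n", + "\n", + "# which makes the file-level min/max statistics used by data skipping far more selective\n", + "\n", + "spark.sql('OPTIMIZE events ZORDER BY (eventType)')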
Data skipping is an automatic feature of\n", + "Delta Lake, kicking in whenever your SQL queries or data set operations include filters\n", + "of the form “column op literal,” where:\n", + "\n", + "- column is an attribute of some Delta Lake table, be it top-level or nested, whose\n", + "data type is string / numeric / date/ timestamp\n", + "\n", + "- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n", + "\n", + "\n", + "- literal is an explicit (list of) value(s) of the same data type as a column\n", + "\n", + "\n", + "-----\n", + "\n", + "**Features**\n", + "Use Delta Lake’s robust features\n", + "to reliably manage your data\n", + "\n", + "## CHAPTER 02\n", + "\n", + "\n", + "-----\n", + "\n", + "**Why Use MERGE**\n", + "**With Delta Lake?**\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , the next-generation engine built on top of Apache Spark, supports the\n", + "MERGE command, which allows you to efficiently upsert and delete records in your\n", + "data lakes.\n", + "\n", + "MERGE dramatically simplifies how a number of common data pipelines can be built\n", + "-- all the complicated multi-hop processes that inefficiently rewrote entire partitions\n", + "can now be replaced by simple MERGE queries.\n", + "\n", + "This finer-grained update capability simplifies how you build your big data\n", + "pipelines for various use cases ranging from change data capture to GDPR. You\n", + "no longer need to write complicated logic to overwrite tables and overcome a lack\n", + "of snapshot isolation.\n", + "\n", + "With changing data, another critical capability required is the ability to roll back, in\n", + "case of bad writes. Delta Lake also offers [rollback capabilities with the Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n", + "[feature](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) , so that if you do a bad merge, you can easily roll back to an earlier version.\n", + "\n", + "In this chapter, we’ll discuss common use cases where existing data might need to be\n", + "updated or deleted. We’ll also explore the challenges inherent to upserts and explain\n", + "how MERGE can address them.\n", + "\n", + "\n", + "-----\n", + "\n", + "**When are upserts necessary?**\n", + "There are a number of common use cases where existing data in a data lake needs to\n", + "be updated or deleted:\n", + "\n", + "- \u0007 **General Data Protection Regulation (GDPR) compliance:** With the introduction of\n", + "the right to be forgotten (also known as data erasure) in GDPR, organizations must\n", + "remove a user’s information upon request. This data erasure includes deleting user\n", + "information in the data lake as well.\n", + "\n", + "- **Change data capture from traditional databases:** In a service-oriented\n", + "architecture, typically web and mobile applications are served by microservices\n", + "built on traditional SQL/NoSQL databases that are optimized for low latency. One\n", + "of the biggest challenges organizations face is joining data across these various\n", + "siloed data systems, and hence data engineers build pipelines to consolidate\n", + "all data sources into a central data lake to facilitate analytics. These pipelines\n", + "often have to periodically read changes made on a traditional SQL/NoSQL table\n", + "and apply them to corresponding tables in the data lake. 
Such changes can take various forms: Tables with slowly changing dimensions, change data capture of all inserted/updated/deleted rows, etc.\n", + "\n", + "- **Sessionization:** Grouping multiple events into a single session is a common use case in many areas ranging from product analytics to targeted advertising to predictive maintenance. Building continuous applications to track sessions and recording the results that write into data lakes is difficult because data lakes have always been optimized for appending data.\n", + "\n", + "- **De-duplication:** A common data pipeline use case is to collect system logs into a Delta Lake table by appending data to the table. However, often the sources can generate duplicate records and downstream de-duplication steps are needed to take care of them.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Why upserts into data lakes have**\n", + "**traditionally been challenging**\n", + "Since data lakes are fundamentally based on files, they have always been optimized for appending data rather than for changing existing data. Hence, building the above use cases has always been challenging.\n", + "\n", + "Users typically read the entire table (or a subset of partitions) and then overwrite them. Therefore, every organization tries to reinvent the wheel for their requirement by handwriting complicated queries in SQL, Spark, etc. This approach is:\n", + "\n", + "- **Inefficient:** Reading and rewriting entire partitions (or entire tables) to update a few records causes pipelines to be slow and costly. Hand-tuning the table layout and query optimization is tedious and requires deep domain knowledge.\n", + "\n", + "- **Possibly incorrect:** Handwritten code modifying data is very prone to logical and human errors. For example, multiple pipelines concurrently modifying the same table without any transactional support can lead to unpredictable data inconsistencies and in the worst case, data losses. Often, even a single handwritten pipeline can easily cause data corruptions due to errors in encoding the business logic.\n", + "\n", + "- **Hard to maintain:** Fundamentally such handwritten code is hard to understand, keep track of and maintain. In the long term, this alone can significantly increase the organizational and infrastructural costs.\n", + "\n", + "**Introducing MERGE in Delta Lake**\n", + "With Delta Lake, you can easily address the use cases above without any of the aforementioned problems using the following MERGE command:\n", + "\n", + "MERGE INTO target_table\n", + "\n", + "USING source_table_or_view\n", + "\n", + "ON merge_condition\n", + "\n", + "[ WHEN MATCHED [ AND condition ] THEN matched_action ]\n", + "\n", + "[ WHEN NOT MATCHED [ AND condition ] THEN not_matched_action ]\n", + "\n", + "where\n", + "\n", + "matched_action =\n", + "\n", + "DELETE |\n", + "\n", + "UPDATE SET * |\n", + "\n", + "UPDATE SET column1 = value1 [, column2 = value2 ...]\n", + "\n", + "not_matched_action =\n", + "\n", + "INSERT * |\n", + "\n", + "INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])\n", + "\n", + "Let’s understand how to use MERGE with a simple example. Suppose you have a [slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses. Furthermore, you have a table of new addresses for both existing and new users.
To merge all the new addresses to the main user table, you can run the following:\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING updates\n", + "\n", + "ON users.userId = updates.userId\n", + "\n", + "WHEN MATCHED THEN\n", + "\n", + "UPDATE SET address = updates.address\n", + "\n", + "WHEN NOT MATCHED THEN\n", + "INSERT (userId, address) VALUES (updates.userId, updates.address)\n", + "\n", + "This will perform exactly what the syntax says -- for existing users (i.e., MATCHED clause), it will update the address column, and for new users (i.e., NOT MATCHED clause) it will insert all the columns. For large tables with TBs of data, this Delta Lake MERGE operation can be orders of magnitude faster than overwriting entire partitions or tables since Delta Lake reads only relevant files and updates them. Specifically, Delta Lake's MERGE has the following advantages:\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simplifying use cases with MERGE**\n", + "**Deleting data due to GDPR**\n", + "Complying with the “right to be forgotten” clause of GDPR for data in data lakes cannot get any easier. You can set up a simple scheduled job with example code, like the snippet below, to delete all the users who have opted out of your service.\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING opted_out_users\n", + "\n", + "ON opted_out_users.userId = users.userId\n", + "\n", + "WHEN MATCHED THEN DELETE\n", + "\n", + "**Applying change data from databases**\n", + "You can easily apply all data changes — updates, deletes, inserts — generated from an external database into a Delta Lake table with the MERGE syntax as follows:\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING (\n", + "\n", + "SELECT userId, latest.address AS address, latest.deleted AS deleted FROM\n", + "\n", + "(\n", + "\n", + "SELECT userId, MAX(struct(TIME, address, deleted)) AS latest\n", + "\n", + "FROM changes GROUP BY userId\n", + "\n", + ")\n", + "\n", + ") latestChange\n", + "\n", + "ON latestChange.userId = users.userId\n", + "\n", + "WHEN MATCHED AND latestChange.deleted = TRUE THEN\n", + "\n", + "DELETE\n", + "\n", + "WHEN MATCHED THEN\n", + "\n", + "UPDATE SET address = latestChange.address\n", + "\n", + "WHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n", + "\n", + "INSERT (userId, address) VALUES (userId, address)\n", + "\n", + "\n", + "- **Fine-grained:** The operation rewrites data at the granularity of files and not partitions. This eliminates all the complications of rewriting partitions, updating the Hive metastore with MSCK and so on.\n", + "\n", + "- **Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to rewrite, thus eliminating the need to hand-optimize your pipeline.
Furthermore,\n", + "Delta Lake with all its I/O and processing optimizations makes all the reading and\n", + "writing data by MERGE significantly faster than similar operations in Apache Spark.\n", + "\n", + "- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\n", + "concurrent writers update the data correctly with ACID transactions, and concurrent\n", + "readers always see a consistent snapshot of the data.\n", + "\n", + "Here is a visual explanation of how MERGE compares with handwritten pipelines.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Updating session information from streaming**\n", + "**pipelines**\n", + "If you have streaming event data flowing in and if you want to sessionize the streaming\n", + "event data and incrementally update and store sessions in a Delta Lake table, you\n", + "can accomplish this using the foreachBatch in Structured Streaming and MERGE.\n", + "For example, suppose you have a Structured Streaming DataFrame that computes\n", + "updated session information for each user. You can start a streaming query that\n", + "applies all the sessions update to a Delta Lake table as follows (Scala).\n", + "\n", + "streamingSessionUpdatesDF.writeStream\n", + "\n", + ".foreachBatch { (microBatchOutputDF: DataFrame , batchId: Long ) =>\n", + "\n", + "microBatchOutputDF.createOrReplaceTempView(“updates”)\n", + "\n", + "microBatchOutputDF.sparkSession.sql(s”””\n", + "\n", + "MERGE INTO sessions\n", + "\n", + "USING updates\n", + "\n", + "ON sessions.sessionId = updates.sessionId\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *\n", + "\n", + "WHEN NOT MATCHED THEN INSERT * “”” )\n", + "\n", + "}.start()\n", + "\n", + "For a complete working example of each Batch and MERGE, see this notebook\n", + "( [Azure](https://docs.azuredatabricks.net/_static/notebooks/merge-in-streaming.html) | [AWS](https://docs.databricks.com/_static/notebooks/merge-in-streaming.html) ).\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Addressing GDPR and CCPA Scenarios With Delta Lake and Apache Spark](https://www.youtube.com/watch?v=tCPslvUjG1w)\n", + "\n", + "[Tech Talk | Using Delta as a Change Data Capture Source](https://www.youtube.com/watch?v=7y0AAQ6qX5w)\n", + "\n", + "[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n", + "\n", + "[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n", + "\n", + "[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simple, Reliable Upserts and**\n", + "**Deletes on Delta Lake Tables**\n", + "**Using Python APIs**\n", + "\n", + "In this chapter, we will demonstrate how to use Python and the new Python APIs in Delta\n", + "Lake within the context of an on-time flight performance scenario. We will show how\n", + "to upsert and delete data, query old versions of data with time travel, and vacuum\n", + "older versions for cleanup.\n", + "\n", + "**How to start using Delta Lake**\n", + "The Delta Lake package is installable through PySpark by using the --packages\n", + "option. In our example, we will also demonstrate the ability to VACUUM files and execute\n", + "Delta Lake SQL commands within Apache Spark. 
As this is a short demonstration, we\n", + "will also enable the following configurations:\n", + "\n", + "\u0007spark.databricks.delta.retentionDurationCheck.enabled=false\n", + "\n", + "to allow us to vacuum files shorter than the default retention duration of seven days.\n", + "Note, this is only required for the SQL command VACUUM\n", + "\n", + "\u0007spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension\n", + "\n", + "to enable Delta Lake SQL commands within Apache Spark; this is not required for\n", + "Python or Scala API calls.\n", + "\n", + "# Using Spark Packages\n", + "\n", + "./bin/pyspark --packages io.delta:delta-core_2.11:0.4.0 --conf “spark.\n", + "\n", + "databricks.delta.retentionDurationCheck.enabled=false” --conf “spark.\n", + "\n", + "sql.extensions=io.delta.sql.DeltaSparkSessionExtension”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Loading and saving our Delta Lake data**\n", + "This scenario will be using the On-Time Flight Performance or Departure Delays data\n", + "set generated from the RITA BTS Flight Departure Statistics; some examples of this data\n", + "in action include the and OnTime Flight Performance with GraphFrames for Apache Spark™. Within PySpark, start [2014 Flight Departure Performance via d3.js Crossfilter](https://dennyglee.com/2014/06/06/2014-flight-departure-performance-via-d3-js-crossfilter/)\n", + "by reading the data set.\n", + "\n", + "\u0007# Location variables\n", + "\n", + "\n", + "/departureDelays.delta$ ls l\n", + "\n", + ".\n", + "\n", + "..\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -df6f69ea-e6aa- 424b -bc0e-f3674c4f1906-c000.snappy.parquet\n", + "\n", + "part- 00001 -711bcce3-fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n", + "\n", + "part- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n", + "\n", + "Part- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n", + "\n", + "\n", + "tripdelaysFilePath = “/root/data/departuredelays.csv”\n", + "\n", + "pathToEventsTable = “/root/deltalake/departureDelays.delta”\n", + "\n", + "Now, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n", + "\n", + "# Read flight delay data\n", + "\n", + "\n", + "departureDelays = spark.read \\\n", + "\n", + ".option( “header” , “true” ) \\\n", + "\n", + ".option( “inferSchema” , “true” ) \\\n", + "\n", + ".csv(tripdelaysFilePath)\n", + "\n", + "Next, let’s save our departureDelays data set to a Delta Lake table. 
By saving this table\n", + "to Delta Lake storage, we will be able to take advantage of its features including ACID\n", + "transactions, unified batch and streaming and time travel.\n", + "\n", + "# Save flight delay data into Delta Lake format\n", + "\n", + "departureDelays \\\n", + "\n", + ".write \\\n", + "\n", + "\n", + "# Load flight delay data in Delta Lake format\n", + "\n", + "delays_delta = spark \\\n", + "\n", + ".read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load( “departureDelays.delta” )\n", + "\n", + "# Create temporary view\n", + "\n", + "delays_delta.createOrReplaceTempView(“delays_delta”)\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "spark.sql(“select count(1) from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’”).show()\n", + "\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".mode( “overwrite” ) \\\n", + "\n", + ".save( “departureDelays.delta” )\n", + "\n", + "Note, this approach is similar to how you would normally save Parquet data; instead\n", + "of specifying format(“parquet”) , you will now specify format(“delta”) . If\n", + "you were to take a look at the underlying file system, you will notice four files created\n", + "for the departureDelays Delta Lake table.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, lets determine the number of flights originating from Seattle to San Francisco; in\n", + "this data set, there are 1698 flights.\n", + "\n", + "**In-place conversion to Delta Lake**\n", + "If you have existing Parquet tables, you have the ability to convert them to Delta Lake\n", + "format in place, thus not needing to rewrite your table. To convert the table, you can\n", + "run the following commands.\n", + "\n", + "\n", + "deltaTable DeltaTable .forPath(spark, pathToEventsTable\n", + "\n", + ")\n", + "\n", + "# Delete all on-time and early flights\n", + "\n", + "deltaTable. delete ( “delay < 0” )\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "spark.sql( “select count(1) from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’” ).show()\n", + "\n", + "\n", + "from delta.tables import - \n", + "\n", + "# Convert non partitioned parquet table at path ‘/path/to/table’\n", + "\n", + "deltaTable = DeltaTable .convertToDelta(spark, “parquet.`/path/to/\n", + "\n", + "table`” )\n", + "\n", + "# Convert partitioned parquet table at path ‘/path/to/table’ and\n", + "\n", + "partitioned by integer column named ‘part’\n", + "\n", + "\n", + "After we delete (more on this below) all of the on-time and early flights, as you can\n", + "see from the preceding query there are 837 late flights originating from Seattle to\n", + "San Francisco. 
If you review the file system, you will notice there are more files even\n", + "though you deleted data.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -a2a19ba4- 17e 9- 4931 - 9bbf - 3c9d4997780 b-c000.snappy.parquet\n", + "\n", + "part-00000-df6f69ea-e6aa-424b-bc0e-f3674c4f1906-c000.snappy.parquet\n", + "\n", + "part- 00001 - 711bcce3 -fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n", + "\n", + "part- 00001 -a0423a18- 62eb - 46b3 -a82f-ca9aac1f1e93-c000.snappy.parquet\n", + "\n", + "part- 00002 - 778ba97d - 89b8 - 4942 -a495-5f6238830b68-c000.snappy.parquet\n", + "\n", + "part- 00002 -bfaa0a2a- 0a31 - 4abf -aa63- 162402f802cc -c000.snappy.parquet\n", + "\n", + "part- 00003 - 1a791c4a - 6f11 - 49a8 -8837- 8093a3220581 -c000.snappy.parquet\n", + "\n", + "part- 00003 -b0247e1d-f5ce- 4b45 - 91cd - 16413c784a66 -c000.snappy.parquet\n", + "\n", + "\n", + "partitionedDeltaTable = DeltaTable .convertToDelta(spark,\n", + "\n", + "“parquet.`/path/to/table`”, “part int” )\n", + "\n", + "**Delete our flight data**\n", + "To delete data from a traditional data lake table, you will need to:\n", + "\n", + "1. Select all of the data from your table not including the rows you want to delete\n", + "2. Create a new table based on the previous query\n", + "3. Delete the original table\n", + "4. Rename the new table to the original table name for downstream dependencies\n", + "\n", + "Instead of performing all of these steps, with Delta Lake, we can simplify this process\n", + "by running a DELETE statement. To show this, let’s delete all of the flights that had\n", + "arrived early or on-time (i.e., delay < 0).\n", + "\n", + "\n", + "from delta.tables import - \n", + "\n", + "from pyspark.sql.functions import - \n", + "\n", + "# Access the Delta Lake table\n", + "\n", + "\n", + "-----\n", + "\n", + "In traditional data lakes, deletes are performed by rewriting the entire table\n", + "excluding the values to be deleted. With Delta Lake, deletes are instead performed\n", + "by selectively writing new versions of the files containing the data to be deleted and\n", + "only marks the previous files as deleted. This is because Delta Lake uses multiversion\n", + "concurrency control (MVCC) to do atomic operations on the table: For example, while\n", + "one user is deleting data, another user may be querying the previous version of the\n", + "table. This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\n", + "and query previous versions as we will see later.\n", + "\n", + "**Update our flight data**\n", + "To update data from your traditional Data Lake table, you will need to:\n", + "\n", + "1. Select all of the data from your table not including the rows you want to modify\n", + "2. Modify the rows that need to be updated/changed\n", + "3. Merge these two tables to create a new table\n", + "4. Delete the original table\n", + "5. Rename the new table to the original table name for downstream dependencies\n", + "\n", + "Instead of performing all of these steps, with Delta Lake, we can simplify this\n", + "process by running an UPDATE statement. To show this, let’s update all of the flights\n", + "originating from Detroit to Seattle.\n", + "\n", + "\n", + "With the Detroit flights now tagged as Seattle flights, we now have 986 flights\n", + "originating from Seattle to San Francisco. 
If you were to list the file system for\n", + "your departureDelays folder (i.e., $../departureDelays/ls -l ), you will\n", + "notice there are now 11 files (instead of the 8 right after deleting the files and the four\n", + "files after creating the table).\n", + "\n", + "**Merge our flight data**\n", + "A common scenario when working with a data lake is to continuously append data to\n", + "your table. This often results in duplicate data (rows you do not want to be inserted\n", + "into your table again), new rows that need to be inserted, and some rows that need to\n", + "be updated. With Delta Lake, all of this can be achieved by using the merge operation\n", + "(similar to the SQL MERGE statement).\n", + "\n", + "Let’s start with a sample data set that you will want to be updated, inserted or\n", + "de-duplicated with the following query.\n", + "\n", + "\n", + "# Update all flights originating from Detroit to now be\n", + "\n", + "\n", + "originating from Seattle\n", + "\n", + "deltaTable.update(“origin = ‘DTW’”, { “origin”: “’SEA’” } )\n", + "\n", + "\n", + "# What flights between SEA and SFO for these date periods\n", + "\n", + "spark.sql( “select * from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n", + "\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "\n", + "The output of this query looks like the following table. Note, the color-coding has been\n", + "added to clearly identify which rows are de-duplicated (blue), updated (yellow) and\n", + "inserted (green).\n", + "\n", + "\n", + "spark.sql( “select count(1) from delays_delta where origin = ‘SEA’\n", + "\n", + "and destination = ‘SFO’” ).show()\n", + "\n", + "\n", + "-----\n", + "\n", + "Next, let’s generate our own merge_table that contains data we will insert, update\n", + "or de-duplicate with the following code snippet.\n", + "\n", + "items = [( 1010710 , 31 , 590 , ‘SEA’, ‘SFO’), ( 1010521 , 10 , 590 ,\n", + "\n", + "‘SEA’ , ‘SFO’ ),\n", + "\n", + "(1010822, 31, 590, ‘SEA’, ‘SFO’)]\n", + "\n", + "\n", + "With Delta Lake, this can be easily achieved via a merge statement as noted in the\n", + "following code snippet.\n", + "\n", + "# Merge merge_table with flights\n", + "\n", + "deltaTable. alias( “flights” ) \\\n", + "\n", + ".merge(merge_table. alias ( “updates”),”flights.date =\n", + "\n", + "updates.date” ) \\\n", + "\n", + ".whenMatchedUpdate(set = { “delay” : “updates.delay” } ) \\\n", + "\n", + ".whenNotMatchedInsertAll() \\\n", + "\n", + ".execute()\n", + "\n", + "# What flights between SEA and SFO for these date periods\n", + "\n", + "spark.sql( “select * from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n", + "\n", + "\n", + "cols = [ ‘date’ , ‘delay’ , ‘distance’ , ‘origin’ , ‘destination’ ]\n", + "\n", + "\n", + "merge_table = spark.createDataFrame(items, cols)\n", + "\n", + "merge_table.toPandas()\n", + "\n", + "In the preceding table ( merge_table ), there are three rows with a unique date value:\n", + "\n", + "1. 1010521: This row needs to _update_ the _flights_ table with a new delay value (yellow)\n", + "2. 1010710: This row is a _duplicate_ (blue)\n", + "3. 
1010832: This is a new row to be _inserted_ (green)\n", + "\n", + "\n", + "All three actions of de-duplication, update and insert were efficiently completed with\n", + "one statement.\n", + "\n", + "**View table history**\n", + "As previously noted, after each of our transactions (delete, update), there were more\n", + "files created within the file system. This is because for each transaction, there are\n", + "different versions of the Delta Lake table.\n", + "\n", + "\n", + "-----\n", + "\n", + "This can be seen by using the DeltaTable.history() method as noted below\n", + "\n", + "Note: You can also perform the same task with SQL:\n", + "\n", + "spark.sql(“DESCRIBE HISTORY ‘” + pathToEventsTable + “’”).show()\n", + "\n", + "As you can see, there are three rows representing the different versions of the table\n", + "(below is an abridged version to help make it easier to read) for each of the operations\n", + "(create table, delete and update):\n", + "\n", + "**Travel back in time with table history**\n", + "With Time Travel, you can review the Delta Lake table as of the version or timestamp.\n", + "To view historical data, specify the version or timestamp option; in the following code\n", + "snippet, we will specify the version option.\n", + "\n", + "\n", + "# Load DataFrames for each version\n", + "\n", + "dfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "0 ).load( “departureDelays.delta” )\n", + "\n", + "dfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n", + "\n", + "1 ).load( “departureDelays.delta” )\n", + "\n", + "dfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "2 ).load( “departureDelays.delta” )\n", + "\n", + "# Calculate the SEA to SFO flight counts for each version of history\n", + "\n", + "cnt0 = dfv0. where( “origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "cnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "cnt2 = dfv2. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "# Print out the value\n", + "\n", + "print ( “SEA -> SFO Counts: Create Table: %s, Delete: %s, Update: %s” %\n", + "\n", + "(cnt0, cnt1, cnt2))\n", + "\n", + "## Output\n", + "\n", + "SEA -> SFO Counts : Create Table: 1698 , Delete: 837, Update: 986\n", + "\n", + "Whether for governance, risk management and compliance (GRC) or rolling back\n", + "errors, the Delta Lake table contains both the metadata (e.g., recording the fact that a\n", + "delete had occurred with these operators) and data (e.g., the actual rows deleted). But\n", + "how do we remove the data files either for compliance or size reasons?\n", + "\n", + "**Clean up old table versions with vacuum**\n", + "The [Delta Lake vacuum](https://docs.delta.io/0.7.0/delta-utility.html#vacuum) method will delete all of the rows (and files) by default that are\n", + "older than seven days’ reference. 
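If you would rather preview what a vacuum is going to remove before deleting anything, the SQL form of the command accepts a DRY RUN option that lists the candidate files without touching them. A sketch reusing pathToEventsTable (and, because it retains 0 hours, relying on the retentionDurationCheck configuration disabled earlier):\n", + "\n", + "# Preview the files a zero-retention vacuum would delete, without removing them\n", + "\n", + "spark.sql("VACUUM '" + pathToEventsTable + "' RETAIN 0 HOURS DRY RUN").show(truncate=False)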
If you were to view the file system, you’ll notice the\n", + "11 files for your table.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 - 5e52736b -0e63- 48f3 - 8d56 - 50f7cfa0494d -c000.snappy.parquet\n", + "\n", + "part- 00000 - 69eb53d5 - 34b4 - 408f -a7e4- 86e000428c37 -c000.snappy.parquet\n", + "\n", + "\n", + "-----\n", + "\n", + "part- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n", + "\n", + "part- 00001 - 20893eed - 9d4f - 4c1f -b619- 3e6ea1fdd05f -c000.snappy.parquet\n", + "\n", + "part- 00001 - 9b68b9f6 - bad3 - 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n", + "\n", + "part- 00001 - d4823d2e - 8f9d - 42e3 - 918d - 4060969e5844 -c000.snappy.parquet\n", + "\n", + "part- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n", + "\n", + "part- 00002 - 3027786c - 20a9 - 4b19 - 868d -dc7586c275d4-c000.snappy.parquet\n", + "\n", + "part- 00002 -f2609f27- 3478 - 4bf9 -aeb7- 2c78a05e6ec1 -c000.snappy.parquet\n", + "\n", + "part- 00003 - 850436a6 -c4dd- 4535 -a1c0- 5dc0f01d3d55 -c000.snappy.parquet\n", + "\n", + "Part- 00003 -b9292122- 99a7 -4223-aaa9- 8646c281f199 -c000.snappy.parquet\n", + "\n", + "To delete all of the files so that you only keep the current snapshot of data, you will specify a\n", + "small value for the vacuum method (instead of the default retention of 7 days).\n", + "\n", + "# Remove all files older than 0 hours old.\n", + "\n", + "deltaTable.vacuum( 0 )\n", + "\n", + "Note , you perform the same task via SQL syntax:¸\n", + "\n", + "# Remove all files older than 0 hours old\n", + "\n", + "spark.sql(“VACUUM ‘” + pathToEventsTable + “‘ RETAIN 0 HOURS”)\n", + "\n", + "Once the vacuum has completed, when you review the file system you will notice fewer\n", + "files as the historical data has been removed.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n", + "\n", + "part- 00001 - 9b68b9f6 -bad3- 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n", + "\n", + "part- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n", + "\n", + "part- 00003 -b9292122- 99a7 - 4223 -aaa9- 8646c281f199 -c000.snappy.parquet\n", + "\n", + "Note, the ability to time travel back to a version older than the retention period is lost\n", + "after running vacuum.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Time Travel for**\n", + "**Large-Scale Data Lakes**\n", + "\n", + "Time travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . [Delta Lake](https://delta.io/) is an [open-source storage](https://github.com/delta-io/delta)\n", + "[layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable\n", + "metadata handling, and unifies streaming and batch data processing. Delta Lake runs on\n", + "top of your existing data lake and is fully compatible with Apache Spark APIs.\n", + "\n", + "With this feature, Delta Lake automatically versions the big data that you store in your\n", + "data lake, and you can access any historical version of that data. 
This temporal data\n", + "management simplifies your data pipeline by making it easy to audit, roll back data\n", + "in case of accidental bad writes or deletes, and reproduce experiments and reports.\n", + "\n", + "Your organization can finally standardize on a clean, centralized, versioned big data\n", + "repository in your own cloud storage for your analytics.\n", + "\n", + "**Common challenges with changing data**\n", + "\n", + "- **Audit data changes:** Auditing data changes is critical both in terms of data\n", + "compliance as well as simple debugging to understand how data has changed over\n", + "time. Organizations moving from traditional data systems to big data technologies\n", + "and the cloud struggle in such scenarios.\n", + "\n", + "- **Reproduce experiments and reports:** During model training, data scientists\n", + "run various experiments with different parameters on a given set of data. When\n", + "scientists revisit their experiments after a period of time to reproduce the models,\n", + "typically the source data has been modified by upstream pipelines. A lot of times,\n", + "they are caught unaware by such upstream data changes and hence struggle to\n", + "reproduce their experiments. Some scientists and organizations engineer best\n", + "\n", + "\n", + "-----\n", + "\n", + "practices by creating multiple copies of the data, leading to increased storage\n", + "costs. The same is true for analysts generating reports.\n", + "\n", + "- **Rollbacks:** Data pipelines can sometimes write bad data for downstream consumers.\n", + "\n", + "This can happen because of issues ranging from infrastructure instabilities to messy\n", + "data to bugs in the pipeline. For pipelines that do simple appends to directories or a\n", + "table, rollbacks can easily be addressed by date-based partitioning. With updates\n", + "and deletes, this can become very complicated, and data engineers typically have\n", + "to engineer a complex pipeline to deal with such scenarios.\n", + "\n", + "**Working with Time Travel**\n", + "Delta Lake’s time travel capabilities simplify building data pipelines for the above use\n", + "cases. Time Travel in Delta Lake improves developer productivity tremendously. It helps:\n", + "\n", + "- Data scientists manage their experiments better\n", + "\n", + "- Data engineers simplify their pipelines and roll back bad writes\n", + "\n", + "- Data analysts do easy reporting\n", + "\n", + "Organizations can finally standardize on a clean, centralized, versioned big data\n", + "repository in their own cloud storage for analytics. We are thrilled to see what you will\n", + "be able to accomplish with this feature.\n", + "\n", + "As you write into a Delta Lake table or directory, every operation is automatically\n", + "versioned. You can access the different versions of the data two different ways:\n", + "\n", + "**1. 
Using a timestamp**\n", + "**Scala syntax**\n", + "You can provide the timestamp or date string as an option to DataFrame reader:\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” )\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "\n", + "-----\n", + "\n", + "**Python syntax**\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” ) \\\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01”\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01 01:30:00.000”\n", + "\n", + "If the reader code is in a library that you don’t have access to, and if you are passing\n", + "input parameters to the library to read data, you can still travel back in time for a table\n", + "by passing the timestamp in yyyyMMddHHmmssSSS format to the path:\n", + "\n", + "val inputPath = “/path/to/my/table@20190101000000000”\n", + "\n", + "val df = loadData(inputPath)\n", + "\n", + "// Function in a library that you don’t have access to\n", + "\n", + "def loadData(inputPath : String ) : DataFrame = {\n", + "\n", + "spark.read\n", + "\n", + ".format(“delta”)\n", + "\n", + ".load(inputPath)\n", + "\n", + "}\n", + "\n", + "inputPath = “/path/to/my/table@20190101000000000”\n", + "\n", + "df = loadData(inputPath)\n", + "\n", + "# Function in a library that you don’t have access to\n", + "\n", + "def loadData(inputPath):\n", + "\n", + "return spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load(inputPath)\n", + "\n", + "\n", + "-----\n", + "\n", + "**2. Using a version number**\n", + "In Delta Lake, every write has a version number, and you can use the version number\n", + "to travel back in time as well.\n", + "\n", + "**Scala syntax**\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".option( “versionAsOf” , “5238” )\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".load( “/path/to/my/table@v5238” )\n", + "\n", + "**Python syntax**\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".option( “versionAsOf” , “5238” ) \\\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load( “/path/to/my/table@v5238” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table VERSION AS OF 5238\n", + "\n", + "\n", + "-----\n", + "\n", + "**Audit data changes**\n", + "You can look at the history of table changes using the DESCRIBE HISTORY command\n", + "or through the UI.\n", + "\n", + "**Reproduce experiments and reports**\n", + "Time travel also plays an important role in machine learning and data science.\n", + "Reproducibility of models and experiments is a key consideration for data scientists\n", + "because they often create hundreds of models before they put one into production,\n", + "and in that time-consuming process would like to go back to earlier models. 
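\n", + "\n", + "(For reference, the version- and timestamp-based reads above can be combined in a single PySpark snippet. The sketch below is illustrative and not from the original text; it assumes an existing SparkSession named spark and a Delta table at the hypothetical path /path/to/my/table, looks up the latest version from the table history, and then reads the table both by version and by timestamp.)\n", + "\n", + "# Find the most recent version recorded in the table history\n", + "history = spark.sql(\"DESCRIBE HISTORY delta.`/path/to/my/table`\")\n", + "latest_version = history.selectExpr(\"max(version)\").collect()[0][0]\n", + "\n", + "# Read a specific version of the table\n", + "df_by_version = (spark.read.format(\"delta\")\n", + "    .option(\"versionAsOf\", latest_version)\n", + "    .load(\"/path/to/my/table\"))\n", + "\n", + "# Read the table as it existed at a point in time\n", + "df_by_timestamp = (spark.read.format(\"delta\")\n", + "    .option(\"timestampAsOf\", \"2019-01-01\")\n", + "    .load(\"/path/to/my/table\"))\n", + "\n", + "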
However,\n", + "because data management is often separate from data science tools, this is really\n", + "hard to accomplish.\n", + "\n", + "Databricks solves this reproducibility problem by integrating Delta Lake’s Time\n", + "Travel capabilities with [MLflow](https://mlflow.org/) , an open-source platform for the machine learning\n", + "lifecycle. For reproducible machine learning training, you can simply log a\n", + "timestamped URL to the path as an MLflow parameter to track which version of the\n", + "data was used for each training job.\n", + "\n", + "This enables you to go back to earlier settings and data sets to reproduce earlier\n", + "models. You neither need to coordinate with upstream teams on the data nor worry\n", + "about cloning data for different experiments. This is the power of unified analytics,\n", + "whereby data science is closely married with data engineering.\n", + "\n", + "**Rollbacks**\n", + "Time travel also makes it easy to do rollbacks in case of bad writes. For example, if\n", + "your GDPR pipeline job had a bug that accidentally deleted user information, you can\n", + "easily fix the pipeline:\n", + "\n", + "INSERT INTO my_table\n", + "\n", + "SELECT * FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1)\n", + "\n", + "WHERE userId = 111\n", + "\n", + "\n", + "-----\n", + "\n", + "You can also fix incorrect updates as follows:\n", + "\n", + "-- Will use the latest version of the table for all operations below\n", + "\n", + "MERGE INTO my_table target\n", + "\n", + "USING my_table TIMESTAMP AS OF date_sub(current_date(), 1) source\n", + "\n", + "ON source.userId = target.userId\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *\n", + "\n", + "If you simply want to roll back to a previous version of your table, you can do so with\n", + "either of the following commands:\n", + "\n", + "RESTORE TABLE my_table VERSION AS OF [version_number]\n", + "\n", + "RESTORE TABLE my_table TIMESTAMP AS OF [timestamp]\n", + "\n", + "**Pinned view of a continuously updating**\n", + "**Delta Lake table across multiple downstream jobs**\n", + "With AS OF queries, you can now pin the snapshot of a continuously updating Delta\n", + "Lake table for multiple downstream jobs. Consider a situation where a Delta Lake table\n", + "is being continuously updated, say every 15 seconds, and there is a downstream job\n", + "that periodically reads from this Delta Lake table and updates different destinations.\n", + "In such scenarios, typically you want a consistent view of the source Delta Lake table\n", + "so that all destination tables reflect the same state.\n", + "\n", + "You can now easily handle such scenarios as follows:\n", + "\n", + "version = spark.sql(\"SELECT max(version) FROM (DESCRIBE HISTORY my_table)\").collect()\n", + "\n", + "data = spark.table(\"my_table@v%s\" % version[0][0])\n", + "\n", + "data.where(\"event_type = e1\").write.jdbc(\"table1\")\n", + "\n", + "data.where(\"event_type = e2\").write.jdbc(\"table2\")\n", + "\n", + "...\n", + "\n", + "data.where(\"event_type = e10\").write.jdbc(\"table10\")\n", + "\n", + "**Queries for time series analytics made simple**\n", + "Time travel also simplifies time series analytics. 
For example, if you want to find out\n", + "how many new customers you added over the last week, your query could be a very\n", + "simple one like this:\n", + "\n", + "SELECT count( distinct userId) - (\n", + "\n", + "SELECT count( distinct userId)\n", + "\n", + "FROM my_table TIMESTAMP AS OF date_sub( current_date (), 7))\n", + "\n", + "FROM my_table\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Diving Into Delta Lake: Unpacking the Transaction Log](https://databricks.com/discover/diving-into-delta-lake-talks/unpacking-transaction-log)\n", + "\n", + "[Tech Talk | Getting Data Ready for Data Science With Delta Lake and MLflow](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks/getting-data-ready-data-science-delta-lake-mlflow)\n", + "\n", + "[Data + AI Summit Europe 2020 | Data Time Travel by Delta Time Machine](https://databricks.com/session_eu20/data-time-travel-by-delta-time-machine-2)\n", + "\n", + "[Spark + AI Summit NA 2020 | Machine Learning Data Lineage With](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "[MLflow and Delta Lake](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "[Productionizing Machine Learning With Delta Lake](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Easily Clone Your Delta Lake**\n", + "**for Testing, Sharing and ML**\n", + "**Reproducibility**\n", + "\n", + "Delta Lake has a feature called **Table Cloning** , which makes it easy to test, share and\n", + "recreate tables for ML reproducibility. Creating copies of tables in a data lake or data\n", + "warehouse has several practical uses. However, given the volume of data in tables\n", + "in a data lake and the rate of its growth, making physical copies of tables is an\n", + "expensive operation.\n", + "\n", + "Delta Lake now makes the process simpler and cost-effective with the help of\n", + "table clones.\n", + "\n", + "**What are clones?**\n", + "Clones are replicas of a source table at a given point in time. They have the same\n", + "metadata as the source table: same schema, constraints, column descriptions, statistics\n", + "and partitioning. However, they behave as a separate table with a separate lineage\n", + "or history. Any changes made to clones only affect the clone and not the source. Any\n", + "changes that happen to the source during or after the cloning process also do not get\n", + "reflected in the clone due to Snapshot Isolation. In Delta Lake we have two types of\n", + "clones: shallow or deep.\n", + "\n", + "**Shallow clones**\n", + "A _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the\n", + "table being cloned; the data files of the table itself are not copied. This type of cloning\n", + "does not create another physical copy of the data resulting in minimal storage costs.\n", + "Shallow clones are inexpensive and can be extremely fast to create.\n", + "\n", + "\n", + "-----\n", + "\n", + "These clones are not self-contained and depend on the source from which they were\n", + "cloned as the source of data. If the files in the source that the clone depends on are removed,\n", + "for example with VACUUM, a shallow clone may become unusable. 
Therefore, shallow\n", + "clones are typically used for short-lived use cases such as testing and experimentation.\n", + "\n", + "**Deep clones**\n", + "Shallow clones are great for short-lived use cases, but some scenarios require a\n", + "separate and independent copy of the table’s data. A deep clone makes a full copy of\n", + "the metadata and the data files of the table being cloned. In that sense, it is similar in\n", + "functionality to copying with a CTAS command ( CREATE TABLE.. AS… SELECT… ).\n", + "But it is simpler to specify since it makes a faithful copy of the original table at the\n", + "specified version, and you don’t need to re-specify partitioning, constraints and other\n", + "information as you have to do with CTAS. In addition, it is much faster, robust and can\n", + "work in an incremental manner against failures.\n", + "\n", + "With deep clones, we copy additional metadata, such as your streaming application\n", + "transactions and COPY INTO transactions, so you can continue your ETL applications\n", + "exactly where it left off on a deep clone.\n", + "\n", + "**Where do clones help?**\n", + "Sometimes I wish I had a clone to help with my chores or magic tricks. However, we’re\n", + "not talking about human clones here. There are many scenarios where you need a\n", + "copy of your data sets — for exploring, sharing or testing ML models or analytical\n", + "queries. Below are some examples of customer use cases.\n", + "\n", + "**Testing and experimentation with a production table**\n", + "When users need to test a new version of their data pipeline they often have to rely\n", + "on sample test data sets that are not representative of all the data in their production\n", + "environment. Data teams may also want to experiment with various indexing techniques\n", + "to improve the performance of queries against massive tables. These experiments and\n", + "\n", + "\n", + "tests cannot be carried out in a production environment without risking production\n", + "data processes and affecting users.\n", + "\n", + "It can take many hours or even days, to spin up copies of your production tables for a test\n", + "or a development environment. Add to that, the extra storage costs for your development\n", + "environment to hold all the duplicated data — there is a large overhead in setting a test\n", + "environment reflective of the production data. With a shallow clone, this is trivial:\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE delta.`/some/test/location` SHALLOW CLONE prod.events\n", + "\n", + "# Python\n", + "\n", + "DeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n", + "\n", + "isShallow=True)\n", + "\n", + "// Scala\n", + "\n", + "DeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n", + "\n", + "isShallow=true)\n", + "\n", + "After creating a shallow clone of your table in a matter of seconds, you can start\n", + "running a copy of your pipeline to test out your new code, or try optimizing your table\n", + "in different dimensions to see how you can improve your query performance, and much\n", + "much more. These changes will only affect your shallow clone, not your original table.\n", + "\n", + "**Staging major changes to a production table**\n", + "Sometimes, you may need to perform some major changes to your production table.\n", + "These changes may consist of many steps, and you don’t want other users to see the\n", + "changes that you’re making until you’re done with all of your work. 
A shallow clone can\n", + "help you out here:\n", + "\n", + "\n", + "-----\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n", + "\n", + "DELETE FROM temp.staged_changes WHERE event_id is null;\n", + "\n", + "UPDATE temp.staged_changes SET change_date = current_date()\n", + "\n", + "WHERE change_date is null;\n", + "\n", + "...\n", + "\n", + "-- Perform your verifications\n", + "\n", + "Once you’re happy with the results, you have two options. If no other change has\n", + "been made to your source table, you can replace your source table with the clone.\n", + "If changes have been made to your source table, you can merge the changes into\n", + "your source table.\n", + "\n", + "-- If no changes have been made to the source\n", + "\n", + "REPLACE TABLE prod.events CLONE temp.staged_changes;\n", + "\n", + "-- If the source table has changed\n", + "\n", + "MERGE INTO prod.events USING temp.staged_changes\n", + "\n", + "ON events.event_id <=> staged_changes.event_id\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *;\n", + "\n", + "-- Drop the staged table\n", + "\n", + "DROP TABLE temp.staged_changes;\n", + "\n", + "**Machine learning result reproducibility**\n", + "Coming up with an effective ML model is an iterative process. Throughout this process\n", + "of tweaking the different parts of the model, data scientists need to assess the\n", + "accuracy of the model against a fixed data set.\n", + "\n", + "This is hard to do in a system where the data is constantly being loaded or updated. A\n", + "snapshot of the data used to train and test the model is required. This snapshot allows\n", + "the results of the ML model to be reproducible for testing or model governance purposes.\n", + "\n", + "\n", + "-----\n", + "\n", + "We recommend leveraging [Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) to run multiple experiments across a snapshot; an\n", + "example of this in action can be seen in [Machine Learning Data Lineage With MLflow](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "[and Delta Lake.](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "Once you’re happy with the results and would like to archive the data for later retrieval,\n", + "for example, next Black Friday, you can use deep clones to simplify the archiving process.\n", + "MLflow integrates really well with Delta Lake, and the autologging feature (mlflow.spark.\n", + "autolog() ) will tell you which version of the table was used to run a set of experiments.\n", + "\n", + "# Run your ML workloads using Python and then\n", + "\n", + "DeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n", + "\n", + "store_bf2020”)\n", + "\n", + "**Data migration**\n", + "A massive table may need to be moved to a new, dedicated bucket or storage system\n", + "for performance or governance reasons. 
The original table will not receive new\n", + "updates going forward and will be deactivated and removed at a future point in time.\n", + "Deep clones make the copying of massive tables more robust and scalable.\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n", + "\n", + "ALTER TABLE prod.events SET LOCATION ‘zz://my-new-bucket/events’;\n", + "\n", + "With deep clones, since we copy your streaming application transactions and\n", + "COPY INTO transactions, you can continue your ETL applications from exactly where\n", + "it left off after this migration!\n", + "\n", + "**Data sharing**\n", + "In an organization, it is often the case that users from different departments are\n", + "looking for data sets that they can use to enrich their analysis or models. You may\n", + "want to share your data with other users across the organization. But rather than\n", + "setting up elaborate pipelines to move the data to yet another store, it is often easier\n", + "and economical to create a copy of the relevant data set for users to explore and\n", + "\n", + "\n", + "-----\n", + "\n", + "**Looks awesome! Any gotchas?**\n", + "Just to reiterate some of the gotchas mentioned above as a single list, here’s what you\n", + "should be wary of:\n", + "\n", + "- \u0007 \u0007Clones are executed on a snapshot of your data. Any changes that are made to\n", + "the source table after the cloning process starts will not be reflected in the\n", + "clone.\n", + "\n", + "- \u0007 \u0007Shallow clones are not self-contained tables like deep clones. If the data is\n", + "deleted in the source table (for example through VACUUM), your shallow clone\n", + "may not be usable.\n", + "\n", + "- \u0007 \u0007Clones have a separate, independent history from the source table. Time travel\n", + "queries on your source table and clone may not return the same result.\n", + "\n", + "- \u0007 \u0007Shallow clones do not copy stream transactions or COPY INTO metadata. Use\n", + "deep clones to migrate your tables and continue your ETL processes from\n", + "where it left off.\n", + "\n", + "**How can I use it?**\n", + "Shallow and deep clones support new advances in how data teams test and manage\n", + "their modern cloud data lakes and warehouses. Table clones can help your team\n", + "implement production-level testing of their pipelines, fine-tune their indexing for optimal\n", + "query performance, create table copies for sharing — all with minimal overhead and\n", + "expense. If this is a need in your organization, we hope you will take table cloning for\n", + "a spin and give us your feedback — we look forward to hearing about new use cases and\n", + "extensions you would like to see in the future.\n", + "\n", + "**Additional resource**\n", + "\n", + "[Simplifying Disaster Recovery With Delta Lake](https://databricks.com/session_na20/simplifying-disaster-recovery-with-delta-lake)\n", + "\n", + "\n", + "test the data to see if it is a fit for their needs without affecting your own production\n", + "systems. Here deep clones again come to the rescue.\n", + "\n", + "-- The following code can be scheduled to run at your convenience\n", + "\n", + "CREATE OR REPLACE TABLE data_science.events CLONE prod.events;\n", + "\n", + "**Data archiving**\n", + "For regulatory or archiving purposes, all data in a table needs to be preserved for a\n", + "certain number of years, while the active table retains data for a few months. 
If you\n", + "want your data to be updated as soon as possible, but you have a requirement to keep\n", + "data for several years, storing this data in a single table and performing time travel\n", + "may become prohibitively expensive.\n", + "\n", + "In this case, archiving your data in a daily, weekly or monthly manner is a better\n", + "solution. The incremental cloning capability of deep clones will really help you here.\n", + "\n", + "-- The following code can be scheduled to run at your convenience\n", + "\n", + "CREATE OR REPLACE TABLE archive.events CLONE prod.events;\n", + "\n", + "Note that this table will have an independent history compared to the source table;\n", + "therefore, time travel queries on the source table and the clone may return different\n", + "results based on your frequency of archiving.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Enabling Spark SQL DDL**\n", + "**and DML in Delta Lake on**\n", + "**Apache Spark 3.0**\n", + "\n", + "The release of [Delta Lake 0.7.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) coincided with the release of [Apache Spark 3.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) , thus\n", + "enabling a new set of Delta Lake features that are simpler to use from SQL. Here\n", + "are some of the key features.\n", + "\n", + "**Support for SQL DDL commands**\n", + "**to define tables in the** **[Hive metastore](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)**\n", + "You can now define Delta tables in the [Hive](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore) metastore and use the table name in all\n", + "SQL operations when creating (or replacing) tables.\n", + "\n", + "**Create or replace tables**\n", + "\n", + "-- Create table in the metastore\n", + "\n", + "CREATE TABLE events (\n", + "\n", + "date DATE,\n", + "\n", + "eventId STRING,\n", + "\n", + "eventType STRING,\n", + "\n", + "data STRING)\n", + "\n", + "USING DELTA\n", + "\n", + "PARTITIONED BY (date)\n", + "\n", + "LOCATION '/delta/events'\n", + "\n", + "-- If a table with the same name already exists, the table is replaced\n", + "-- with the new configuration, else it is created\n", + "\n", + "CREATE OR REPLACE TABLE events (\n", + "\n", + "date DATE,\n", + "\n", + "eventId STRING,\n", + "\n", + "eventType STRING,\n", + "\n", + "data STRING)\n", + "\n", + "USING DELTA\n", + "\n", + "PARTITIONED BY (date)\n", + "\n", + "LOCATION '/delta/events'\n", + "\n", + "\n", + "-----\n", + "\n", + "**Explicitly alter the table schema**\n", + "\n", + "-- Alter table and schema\n", + "\n", + "ALTER TABLE table_name ADD COLUMNS (\n", + "\n", + "col_name data_type\n", + "\n", + "[COMMENT col_comment]\n", + "\n", + "[FIRST|AFTER colA_name],\n", + "\n", + "...)\n", + "\n", + "You can also use the Scala/Java/Python APIs:\n", + "\n", + "- DataFrame.saveAsTable(tableName) and DataFrameWriterV2\n", + "APIs ( [#307](https://github.com/delta-io/delta/issues/307) ).\n", + "\n", + "- DeltaTable.forName(tableName) API to create instances of\n", + "io.delta.tables.DeltaTable which is useful for executing\n", + "Update/Delete/Merge operations in Scala/Java/Python.\n", + "\n", + "**Support for SQL Insert, Delete, Update and Merge**\n", + "One of the most frequent questions through our [Delta Lake Tech Talks](https://databricks.com/discover/diving-into-delta-lake-talks) was when\n", + "would DML operations such as delete, update and merge be available in Spark SQL?\n", + "Wait no more, these operations are now available in SQL! Below are examples of how\n", + "you can write delete, update and merge (insert, update, delete and de-duplication)\n", + "operations using Spark SQL.\n", + "\n", + "-- Using append mode, you can atomically add new data to an existing\n", + "-- Delta table\n", + "\n", + "INSERT INTO events SELECT * FROM newEvents\n", + "\n", + "-- To atomically replace all of the data in a table, you can use\n", + "-- overwrite mode\n", + "\n", + "INSERT OVERWRITE events SELECT * FROM newEvents\n", + "\n", + "-- Delete events\n", + "\n", + "DELETE FROM events WHERE date < '2017-01-01'\n", + "\n", + "-- Update events\n", + "\n", + "UPDATE events SET eventType = 'click' WHERE eventType = 'click'\n", + "\n", + "-- Upsert data to a target Delta\n", + "-- table using merge\n", + "\n", + "MERGE INTO events\n", + "\n", + "USING updates\n", + "\n", + "ON events.eventId = updates.eventId\n", + "\n", + "WHEN MATCHED THEN UPDATE\n", + "\n", + "SET events.data = updates.data\n", + "\n", + "WHEN NOT MATCHED THEN INSERT (date, eventId, data)\n", + "\n", + "VALUES (date, eventId, data)\n", + "\n", + "It is worth noting that the merge operation in Delta Lake supports more advanced\n", + "syntax than standard ANSI SQL syntax. For example, merge supports:\n", + "\n", + "- Delete actions -- Delete a target when matched with a source row. For example,\n", + "“... WHEN MATCHED THEN DELETE ...”\n", + "\n", + "- Multiple matched actions with clause conditions -- Greater flexibility when target\n", + "and source rows match. For example:\n", + "\n", + "...\n", + "\n", + "WHEN MATCHED AND events.shouldDelete THEN DELETE\n", + "\n", + "WHEN MATCHED THEN UPDATE SET events.data = updates.data\n", + "\n", + "- Star syntax -- Shorthand for setting target column value with the similarly named\n", + "source column. For example:\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *\n", + "\n", + "WHEN NOT MATCHED THEN INSERT *\n", + "\n", + "-- equivalent to updating/inserting with events.date = updates.date,\n", + "\n", + "events.eventId = updates.eventId, events.data = updates.data\n", + "\n", + "\n", + "-----\n", + "\n", + "**Automatic and incremental Presto/Athena manifest**\n", + "**generation**\n", + "As noted in [Query Delta Lake Tables From Presto and Athena, Improved Operations](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "[Concurrency, and Merge Performance,](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) Delta Lake supports other processing engines\n", + "to read Delta Lake by using manifest files; the manifest files contain the list of the\n", + "most current version of files as of manifest generation. As described in the preceding\n", + "chapter, you will need to:\n", + "\n", + "- Generate a Delta Lake manifest file\n", + "\n", + "- Configure Presto or Athena to read the generated manifests\n", + "\n", + "- Manually re-generate (update) the manifest file\n", + "\n", + "New for Delta Lake 0.7.0 is the capability to update the manifest file automatically\n", + "with the following command:\n", + "\n", + "ALTER TABLE delta.`pathToDeltaTable`\n", + "\n", + "SET TBLPROPERTIES(\n", + "\n", + "delta.compatibility.symlinkFormatManifest.enabled=true\n", + "\n", + ")\n", + "\n", + "**Configuring your table through table properties**\n", + "With the ability to set table properties on your table by using ALTER TABLE SET\n", + "TBLPROPERTIES, you can enable, disable or configure many features of Delta Lake,\n", + "such as automated manifest generation. For example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY) , you can\n", + "block deletes and updates in a Delta table using delta.appendOnly=true .\n", + "\n", + "You can also easily control the history of your Delta Lake table retention by the\n", + "following [properties](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html) :\n", + "\n", + "- delta.logRetentionDuration: Controls how long the history for a table\n", + "(i.e., transaction log history) is kept. By default, 30 days of history is kept, but you may\n", + "want to alter this value based on your requirements (e.g., GDPR historical context)\n", + "\n", + "- delta.deletedFileRetentionDuration: Controls how long ago a file\n", + "must have been deleted before being a candidate for VACUUM. By default, data\n", + "files older than seven days are deleted.\n", + "\n", + "As of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to\n", + "configure these properties.\n", + "\n", + "ALTER TABLE delta.`pathToDeltaTable`\n", + "\n", + "SET TBLPROPERTIES(\n", + "\n", + "delta.logRetentionDuration = \"interval <interval>\"\n", + "\n", + "delta.deletedFileRetentionDuration = \"interval <interval>\"\n", + "\n", + ")\n", + "\n", + "**Support for adding user-defined metadata**\n", + "**in Delta Lake table commits**\n", + "You can specify user-defined strings as metadata in commits made by Delta\n", + "Lake table operations, either using the DataFrameWriter option userMetadata or\n", + "the SparkSession configuration spark.databricks.delta.commitInfo.userMetadata .\n", + "\n", + "In the following example, we are deleting a user (1xsdf1) from our data lake per user\n", + "request. To ensure we associate the user’s request with the deletion, we have also\n", + "added the DELETE request ID into the userMetadata.\n", + "\n", + "\n", + "-----\n", + "\n", + "SET spark.databricks.delta.commitInfo.userMetadata={\n", + "\"GDPR\":\"DELETE Request 1x891jb23\"\n", + "};\n", + "\n", + "There were a lot of great questions during the AMA concerning structured streaming\n", + "and using trigger.once .\n", + "\n", + "For more information, some good resources explaining this concept include:\n", + "\n", + "- [Running Streaming Jobs Once a Day for 10x Cost Savings](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n", + "\n", + "- [Beyond Lambda: Introducing Delta Architecture](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0) : Specifically the cost vs. 
latency\n", + "trade-off discussed here .\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Delta Lake 0.7.0 + Spark 3.0 AMA](https://www.youtube.com/watch?v=xzKqjCB8SWU)\n", + "\n", + "[Tech Talks | Apache Spark 3.0 + Delta Lake](https://www.youtube.com/watch?v=x6RqJYqLoPI&list=PLTPXxbhUt-YWnAgh3RE8DOb46qZF57byx)\n", + "\n", + "[Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0](https://databricks.com/blog/2020/08/27/enabling-spark-sql-ddl-and-dml-in-delta-lake-on-apache-spark-3-0.html)\n", + "\n", + "\n", + "DELETE FROM user_table WHERE user_id = ‘1xsdf1’\n", + "\n", + "When reviewing the [history](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine) operations of the user table (user_table), you can easily\n", + "identify the associated deletion request within the transaction log.\n", + "\n", + "**Other highlights**\n", + "Other highlights for the Delta Lake 0.7.0 release include:\n", + "\n", + "- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop\n", + "3.2 libraries which enables support for Azure Data Lake Storage Gen2.\n", + "\n", + "- Improved support for streaming one-time triggers — With Spark 3.0, we now\n", + "ensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) ( Trigger.Once ) processes all outstanding data\n", + "in a Delta Lake table in a single micro-batch even if rate limits are set with the\n", + "DataStreamReader option maxFilesPerTrigger.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Lakehouse**\n", + "Combining the best elements of data\n", + "lakes and data warehouses\n", + "\n", + "## CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a**\n", + "**Lakehouse?**\n", + "\n", + "Over the past few years at Databricks, we’ve seen a new data management architecture\n", + "that emerged independently across many customers and use cases: the **lakehouse.**\n", + "In this chapter, we’ll describe this new architecture and its advantages over previous\n", + "approaches.\n", + "\n", + "Data warehouses have a long history of decision support and business intelligence\n", + "applications. Since its inception in the late 1980s, data warehouse technology\n", + "continued to evolve and MPP architectures led to systems that were able to handle\n", + "larger data sizes.\n", + "\n", + "But while warehouses were great for structured data, a lot of modern enterprises\n", + "have to deal with unstructured data, semi-structured data, and data with high variety,\n", + "velocity and volume. Data warehouses are not suited for many of these use cases, and\n", + "they are certainly not the most cost-efficient.\n", + "\n", + "As companies began to collect large amounts of data from many different sources,\n", + "architects began envisioning a single system to house data for many different\n", + "analytic products and workloads.\n", + "\n", + "About a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n", + "in a variety of formats. 
While suitable for storing data, data lakes lack some critical\n", + "features: They do not support transactions, they do not enforce data quality, and their\n", + "lack of consistency / isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "\n", + "-----\n", + "\n", + "**A lakehouse combines the best elements**\n", + "**of data lakes and data warehouses**\n", + "A lakehouse is a new data architecture that combines the best elements of data lakes\n", + "and data warehouses.\n", + "\n", + "Lakehouses are enabled by a new system design: implementing similar data structures and data management features to those in a data warehouse, directly on the\n", + "kind of low-cost storage used for data lakes. They are what you would get if you had\n", + "to redesign data warehouses in the modern world, now that cheap and highly reliable\n", + "storage (in the form of object stores) are available.\n", + "\n", + "A lakehouse has the following key features:\n", + "\n", + "- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n", + "be reading and writing data concurrently. Support for ACID transactions ensures\n", + "consistency as multiple parties concurrently read or write data, typically using SQL.\n", + "\n", + "\n", + "and batch and streaming jobs. For these reasons, many of the promises of data lakes\n", + "have not materialized and, in many cases, lead to a loss of many of the benefits of data\n", + "warehouses.\n", + "\n", + "The need for a flexible, high-performance system hasn’t abated. Companies\n", + "require systems for diverse data applications including SQL analytics, real-time\n", + "monitoring, data science and machine learning. Most of the recent advances in\n", + "AI have been in better models to process unstructured data (text, images, video,\n", + "audio), but these are precisely the types of data that a data warehouse is not\n", + "optimized for.\n", + "\n", + "A common approach is to use multiple systems — a data lake, several data\n", + "warehouses, and other specialized systems such as streaming, time-series, graph\n", + "and image databases. Having a multitude of systems introduces complexity and,\n", + "more importantly, introduces delay as data professionals invariably need to move\n", + "or copy data between different systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Schema enforcement and governance:** The lakehouse should have a way to\n", + "support schema enforcement and evolution, supporting DW schema paradigms\n", + "such as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\n", + "[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\n", + "\n", + "- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n", + "reduces staleness and improves recency, reduces latency and lowers the cost of\n", + "having to operationalize two copies of the data in both a data lake and a warehouse.\n", + "\n", + "- **\u0007Storage is decoupled from compute:** In practice, this means storage and compute\n", + "use separate clusters, thus these systems are able to scale to many more\n", + "concurrent users and larger data sizes. 
Some modern data warehouses also have\n", + "this property.\n", + "\n", + "- **\u0007Openness:** The storage formats they use are open and standardized, such as\n", + "Parquet, and they provide an API so a variety of tools and engines, including\n", + "machine learning and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n", + "The lakehouse can be used to store, refine, analyze and access data types needed\n", + "for many new data applications, including images, video, audio, semi-structured\n", + "data, and text.\n", + "\n", + "- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n", + "analytics. Multiple tools might be needed to support all these workloads, but they all\n", + "rely on the same data repository.\n", + "\n", + "- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional\n", + "features. Tools for security and access control are basic requirements. Data governance\n", + "capabilities including auditing, retention and lineage have become essential particularly\n", + "in light of recent privacy regulations. Tools that enable data discovery such as data\n", + "catalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n", + "features only need to be implemented, tested and administered for a single system.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Read the research**\n", + "**Delta Lake: High-Performance ACID**\n", + "**Table Storage Over Cloud Object Stores**\n", + "\n", + "**Abstract**\n", + "Cloud object stores such as Amazon S3 are some of the largest and most costeffective storage systems on the planet, making the main attractive target to\n", + "store large data warehouses and data lakes. Unfortunately, their implementation\n", + "as key-value stores makes it difficult to achieve ACID transactions and high\n", + "performance: Metadata operations, such as listing objects, are expensive, and\n", + "consistency guarantees are limited. In this paper, we present Delta Lake, an\n", + "open source ACID table storage layer over cloud object stores initially developed\n", + "at Databricks. Delta Lake uses a transaction log that is compacted into Apache\n", + "Parquet format to provide ACID properties, time travel, and significantly faster\n", + "metadata operations for large tabular data sets (e.g., the ability to quickly search\n", + "billions of table partitions for those relevant to a query). It also leverages this\n", + "design to provide high-level features such as automatic data layout optimization,\n", + "upserts, caching, and audit logs. Delta Lake tables can be accessed from Apache\n", + "Spark, Hive, Presto, Redshift, and other systems. 
Delta Lake is deployed at\n", + "thousands of Databricks customers that process exabytes of data per day, with\n", + "the largest instances managing exabyte-scale data sets and billions of objects.\n", + "\n", + "Authors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu,\n", + "Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja Łuszczak,\n", + "Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi,\n", + "Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n", + "\n", + "Read the full research paper on the [inner workings of the lakehouse](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores) [.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Some early examples**\n", + "The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n", + "Microsoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n", + "enables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\n", + "[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\n", + "examples that focus primarily on BI and other SQL applications.\n", + "\n", + "Companies that want to build and implement their own systems have access to open\n", + "source file formats (Delta Lake, [Apache Iceberg](https://iceberg.apache.org) , [Apache Hudi](https://hudi.apache.org) ) that are suitable for\n", + "building a lakehouse.\n", + "\n", + "Merging data lakes and data warehouses into a single system means that data teams\n", + "can move faster as they are able to use data without needing to access multiple systems.\n", + "The level of SQL support and integration with BI tools among these early lakehouses\n", + "is generally sufficient for most enterprise data warehouses. Materialized views and\n", + "stored procedures are available, but users may need to employ other mechanisms that\n", + "aren’t equivalent to those found in traditional data warehouses. The latter is particularly\n", + "important for “ [lift and shift scenarios](https://whatis.techtarget.com/definition/lift-and-shift) ,” which require systems that achieve semantics\n", + "that are almost identical to those of older, commercial data warehouses.\n", + "\n", + "What about support for other types of data applications? Users of a lakehouse have\n", + "access to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n", + "libraries) for non-BI workloads like data science and machine learning. Data\n", + "exploration and refinement are standard for many analytic and data science\n", + "applications. Delta Lake is designed to let users incrementally improve the quality of\n", + "data in their lakehouse until it is ready for consumption.\n", + "\n", + "\n", + "A note about technical building blocks. 
While distributed file systems can be\n", + "used for the storage layer, object stores are more commonly used in lakehouses.\n", + "Object stores provide low-cost, highly available storage that excels at massively\n", + "parallel reads — an essential requirement for modern data warehouses.\n", + "\n", + "**From BI to AI**\n", + "The lakehouse is a new data management architecture that radically simplifies\n", + "enterprise data infrastructure and accelerates innovation in an age when\n", + "machine learning is poised to disrupt every industry. In the past, most of the\n", + "data that went into a company’s products or decision-making was structured\n", + "data from operational systems, whereas today, many products incorporate\n", + "AI in the form of computer vision and speech models, text mining and others.\n", + "Why use a lakehouse instead of a data lake for AI? A lakehouse gives you data\n", + "versioning, governance, security and ACID properties that are needed even for\n", + "unstructured data.\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specialized\n", + "systems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "and other issues will be addressed as the technology continues to mature and\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "diverse data applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "paper that describes some of the core technological challenges and solutions that\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. You\n", + "can read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "storage, this pattern has been playing out for years. 
Vendors continue to try to reinvent\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "object stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,\n", + "performance is hampered by expensive metadata operations (e.g., listing objects)\n", + "and limited consistency guarantees.\n", + "\n", + "Based on the characteristics of cloud object stores, three approaches have emerged.\n", + "\n", + "**1. Data lakes**\n", + "The first is directories of files (i.e., data lakes) that store the table as a collection\n", + "of objects, typically in columnar format such as Apache Parquet. It’s an attractive\n", + "approach because the table is just a group of objects that can be accessed from\n", + "a wide variety of tools without a lot of additional data stores or systems. However,\n", + "both performance and consistency problems are common. Hidden data corruption\n", + "is common due to failed transactions, eventual consistency leads to inconsistent\n", + "queries, latency is high, and basic management capabilities like table versioning and\n", + "audit logs are unavailable.\n", + "\n", + "**2. Custom storage engines**\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. Finally, and most egregiously, the proprietary metadata service locks\n", + "customers into a specific service provider, leaving customers to contend with\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "adopt a new approach later.\n", + "\n", + "**3. Lakehouse**\n", + "With Delta Lake, an open source ACID table storage layer atop cloud object stores,\n", + "we sought to build a car instead of a faster horse with not just a better data store,\n", + "but a fundamental change in how data is stored and used via the lakehouse. A\n", + "lakehouse is a new architecture that combines the best elements of data lakes and\n", + "data warehouses. Lakehouses are enabled by a new system design: implementing\n", + "similar data structures and data management features to those in a data warehouse,\n", + "directly on the kind of low-cost storage used for data lakes. 
They are what you would\n", + "get if you had to redesign storage engines in the modern world, now that cheap and\n", + "highly reliable storage (in the form of object stores) are available.\n", + "\n", + "Delta Lake maintains information about which objects are part of a Delta table in an\n", + "ACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n", + "the cloud object store. This design allows clients to update multiple objects at once,\n", + "replace a subset of the objects with another, etc., in a serializable manner that still\n", + "achieves high parallel read/write performance from the objects. The log also provides\n", + "significantly faster metadata operations for large tabular data sets. Additionally, Delta\n", + "Lake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n", + "snapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n", + "caching, and audit logs. Together, these features improve both the manageability and\n", + "performance of working with data in cloud object stores, ultimately opening the door\n", + "to the lakehouse architecture that combines the key features of data warehouses and\n", + "data lakes to create a better, simpler data architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "Today, Delta Lake is used across thousands of Databricks customers, processing\n", + "exabytes of structured and unstructured data each day, as well as many organizations\n", + "in the open source community. These use cases span a variety of data sources and\n", + "applications. The data types stored include Change Data Capture (CDC) logs from\n", + "enterprise OLTP systems, application logs, time-series data, graphs, aggregate\n", + "tables for reporting, and image or feature data for machine learning. The applications\n", + "include SQL workloads (most commonly), business intelligence, streaming, data\n", + "science, machine learning and graph analytics. Overall, Delta Lake has proven itself to\n", + "be a good fit for most data lake applications that would have used structured storage\n", + "formats like Parquet or ORC, and many traditional data warehousing workloads.\n", + "\n", + "Across these use cases, we found that customers often use Delta Lake to significantly\n", + "simplify their data architecture by running more workloads directly against cloud\n", + "object stores, and increasingly, by creating a lakehouse with both data lake and\n", + "transactional features to replace some or all of the functionality provided by message\n", + "queues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n", + "Amazon Redshift).\n", + "\n", + "**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n", + "\n", + "- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performance\n", + "\n", + "Through the paper, you’ll gain a better understanding of Delta Lake and how it\n", + "enables a wide range of DBMS-like performance and management features for data\n", + "held in low-cost cloud storage. 
As well as how the Delta Lake storage format and\n", + "access protocols make it simple to operate, highly available, and able to deliver highbandwidth access to the object store.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding**\n", + "**Delta Engine**\n", + "\n", + "The Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n", + "engine to take advantage of modern CPU architecture with optimizations to Spark\n", + "3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n", + "Runtime 7.0. Together, these features significantly accelerate query performance on\n", + "data lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n", + "adopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n", + "\n", + "**Scaling execution performance**\n", + "One of the big hardware trends over the last several years is that CPU clock speeds\n", + "have plateaued. The reasons are outside the scope of this chapter, but the takeaway\n", + "is that we have to find new ways to process data faster beyond raw compute power.\n", + "One of the most impactful methods has been to improve the amount of data that can\n", + "be processed in parallel. However, data processing engines need to be specifically\n", + "architected to take advantage of this parallelism.\n", + "\n", + "In addition, data teams are being given less and less time to properly model data as\n", + "the pace of business increases. Poorer modeling in the interest of better business\n", + "agility drives poorer query performance. Naturally, this is not a desired state, and\n", + "organizations want to find ways to maximize both agility and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Announcing Delta Engine for**\n", + "**high-performance query execution**\n", + "Delta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n", + "workloads through three components: an improved query optimizer, a caching\n", + "layer that sits between the execution layer and the cloud object storage, and a native\n", + "vectorized execution engine that’s written in C++.\n", + "\n", + "The improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n", + "optimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n", + "statistics to deliver up to 18x increased performance in star schema workloads.\n", + "\n", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "modern cloud hardware. 
It brings performance improvements to all workload types\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "By linking these three components together, we think it will be easier for customers\n", + "to understand how improvements in multiple places within the Databricks code\n", + "aggregate into significantly faster performance for analytics workloads on data lakes.\n", + "\n", + "We’re excited about the value that Delta Engine delivers to our customers. While the\n", + "time and cost savings are already valuable, its role in the lakehouse pattern supports\n", + "new advances in how data teams design their data architectures for increased\n", + "unification and simplicity.\n", + "\n", + "For more information on the Delta Engine, watch this keynote address from\n", + "[Spark + AI Summit 2020: Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming**\n", + "Using Delta Lake to express\n", + "computation on streaming data\n", + "\n", + "## CHAPTER 04\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake Solves Common**\n", + "**Pain Points in Streaming**\n", + "\n", + "The pain points of a traditional streaming and data warehousing solution can be\n", + "broken into two groups: data lake and data warehouse pains.\n", + "\n", + "**Data lake pain points**\n", + "While data lakes allow you to flexibly store an immense amount of data in a file system,\n", + "there are many pain points including (but not limited to):\n", + "\n", + "- Consolidation of streaming data from many disparate systems is difficult.\n", + "\n", + "- Updating data in a data lake is nearly impossible, and much of the streaming\n", + "data needs to be updated as changes are made. This is especially important in\n", + "scenarios involving financial reconciliation and subsequent adjustments.\n", + "\n", + "- Query speeds for a data lake are typically very slow.\n", + "\n", + "- Optimizing storage and file sizes is very difficult and often requires complicated logic.\n", + "\n", + "**Data warehouse pain points**\n", + "The power of a data warehouse is that you have a persistent performant store of your\n", + "data. But the pain points for building modern continuous applications include (but are\n", + "not limited to):\n", + "\n", + "- Constrained to SQL queries (i.e., no machine learning or advanced analytics).\n", + "\n", + "- Accessing streaming data and stored data together is very difficult, if at all possible.\n", + "\n", + "- Data warehouses do not scale very well.\n", + "\n", + "- Tying compute and storage together makes using a warehouse very expensive.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake on Databricks solves these issues**\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) is a unified data management system that brings data reliability and\n", + "performance optimizations to cloud data lakes. 
More succinctly, Delta Lake combines\n", + "the advantages of data lakes and data warehouses with Apache Spark™ to allow you\n", + "to do incredible things.\n", + "\n", + "- Delta Lake, along with Structured Streaming, makes it possible to analyze\n", + "streaming and historical data together at high speeds.\n", + "\n", + "- When Delta Lake tables are used as sources and destinations of streaming big\n", + "data, it is easy to consolidate disparate data sources.\n", + "\n", + "- Upserts are supported on Delta Lake tables.\n", + "\n", + "- Delta Lake is ACID compliant, making it easy to create a compliant data solution.\n", + "\n", + "- Easily include machine learning scoring and advanced analytics into ETL\n", + "and queries.\n", + "\n", + "- Decouples compute and storage for a completely scalable solution.\n", + "\n", + "In the following use cases, we’ll share what this looks like in practice.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simplifying Streaming Stock**\n", + "**Data Analysis Using Delta Lake**\n", + "\n", + "Real-time analysis of stock data is a complicated endeavor. After all, there are many\n", + "challenges in maintaining a streaming system and ensuring transactional consistency\n", + "of legacy and streaming data concurrently.\n", + "\n", + "Thankfully, [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) helps solve many of the pain points of building a streaming\n", + "system to analyze stock data in real time. In this section, we’ll share how to simplify\n", + "the streaming of stock data analysis using Delta Lake.\n", + "\n", + "In the following diagram, you can see a high-level architecture that simplifies this\n", + "problem. We start by ingesting two different sets of data into two Delta Lake tables.\n", + "The two data sets are stock prices and fundamentals.\n", + "\n", + "After ingesting the data into their respective tables, we then join the data in an ETL\n", + "process and write the data out into a third Delta Lake table for downstream analysis.\n", + "\n", + "Delta Lake helps solve these problems by combining the scalability, streaming and\n", + "access to the advanced analytics of Apache Spark with the performance and ACID\n", + "compliance of a data warehouse.\n", + "\n", + "\n", + "-----\n", + "\n", + "# Create Fundamental Data (Databricks Delta table)\n", + "\n", + "dfBaseFund = spark \\\\\n", + "\n", + ".read \\\\\n", + "\n", + ".format( ‘delta’ ) \\\\\n", + "\n", + ".load( ‘/delta/stocksFundamentals’ )\n", + "\n", + "# Create Price Data (Databricks Delta table)\n", + "\n", + "dfBasePrice = spark \\\\\n", + "\n", + ".read \\\\\n", + "\n", + ".format( ‘delta’ ) \\\\\n", + "\n", + ".load( ‘/delta/stocksDailyPrices’ )\n", + "\n", + "\n", + "**Implement your streaming**\n", + "**stock analysis solution with Delta Lake**\n", + "Delta Lake and Apache Spark do most of the work for our solution; you can try out the\n", + "full [notebook](https://pages.databricks.com/rs/094-YMS-629/images/streaming-stock-data-analysis-setup.html) and follow along with the code samples below.\n", + "\n", + "As noted in the preceding diagram, we have two data sets to process — one for\n", + "fundamentals and one for price data. 
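\n",
    + "\n",
    + "As a lightly cleaned-up version of the two table reads shown above (straight quotes and\n",
    + "explicit chaining; the paths are the same ones used in the original snippet), the batch\n",
    + "reads look like this:\n",
    + "\n",
    + "# Create Fundamental Data (Databricks Delta table)\n",
    + "dfBaseFund = (spark.read\n",
    + "    .format('delta')\n",
    + "    .load('/delta/stocksFundamentals'))\n",
    + "\n",
    + "# Create Price Data (Databricks Delta table)\n",
    + "dfBasePrice = (spark.read\n",
    + "    .format('delta')\n",
    + "    .load('/delta/stocksDailyPrices'))\n",
    + "\n",
    + "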
To create our two Delta Lake tables, we specify the .format('delta') against our\n",
    + "Databricks File System ( [DBFS](https://docs.databricks.com/data/databricks-file-system.html) ) locations.\n",
    + "\n",
    + "\n",
    + "-----\n",
    + "\n",
    + "While we’re updating the stocksFundamentals and stocksDailyPrices tables,\n",
    + "we will consolidate this data through a series of ETL jobs into a consolidated view\n",
    + "(stocksDailyPricesWFund).\n",
    + "\n",
    + "With the following code snippet, we can determine the start and end date of available\n",
    + "data and then combine the price and fundamentals data for that date range into DBFS.\n",
    + "\n",
    + "# Determine start and end date of available data\n",
    + "row = dfBasePrice.agg(\n",
    + "    func.max(dfBasePrice.price_date).alias('maxDate'),\n",
    + "    func.min(dfBasePrice.price_date).alias('minDate')\n",
    + ").collect()[0]\n",
    + "\n",
    + "startDate = row['minDate']\n",
    + "endDate = row['maxDate']\n",
    + "\n",
    + "# Define our date range function\n",
    + "def daterange(start_date, end_date):\n",
    + "    for n in range(int((end_date - start_date).days)):\n",
    + "        yield start_date + datetime.timedelta(n)\n",
    + "\n",
    + "# Define combinePriceAndFund information by date\n",
    + "def combinePriceAndFund(theDate):\n",
    + "    dfFund = dfBaseFund.where(dfBaseFund.price_date == theDate)\n",
    + "    dfPrice = dfBasePrice.where(\n",
    + "        dfBasePrice.price_date == theDate\n",
    + "    ).drop('price_date')\n",
    + "    # Drop the updated column\n",
    + "    dfPriceWFund = dfPrice.join(dfFund, ['ticker']).drop('updated')\n",
    + "    # Save data to DBFS\n",
    + "    (dfPriceWFund\n",
    + "        .write\n",
    + "        .format('delta')\n",
    + "        .mode('append')\n",
    + "        .save('/delta/stocksDailyPricesWFund'))\n",
    + "\n",
    + "# Loop through dates to complete fundamentals + price ETL process\n",
    + "for single_date in daterange(\n",
    + "    startDate, (endDate + datetime.timedelta(days=1))\n",
    + "):\n",
    + "    print('Starting ' + single_date.strftime('%Y-%m-%d'))\n",
    + "    start = datetime.datetime.now()\n",
    + "    combinePriceAndFund(single_date)\n",
    + "    end = datetime.datetime.now()\n",
    + "    print(end - start)\n",
    + "\n",
    + "Now we have a stream of consolidated fundamentals and price data that is being\n",
    + "pushed into [DBFS](https://docs.databricks.com/data/databricks-file-system.html) in the /delta/stocksDailyPricesWFund location. We can build a\n",
    + "Delta Lake table by specifying .format('delta') against that DBFS location.\n",
    + "\n",
    + "dfPriceWithFundamentals = (spark\n",
    + "    .readStream\n",
    + "    .format('delta')\n",
    + "    .load('/delta/stocksDailyPricesWFund'))\n",
    + "\n",
    + "# Create temporary view of the data\n",
    + "dfPriceWithFundamentals.createOrReplaceTempView('priceWithFundamentals')\n",
    + "\n",
    + "\n",
    + "-----\n",
    + "\n",
    + "Now that we have created our initial Delta Lake table, let’s create a view that will\n",
    + "allow us to calculate the price/earnings ratio in real time (because of the underlying\n",
    + "streaming data updating our Delta Lake table).\n",
    + "\n",
    + "%sql\n",
    + "CREATE OR REPLACE TEMPORARY VIEW viewPE AS\n",
    + "select ticker,\n",
    + "    price_date,\n",
    + "    first(close) as price,\n",
    + "    (close/eps_basic_net) as pe\n",
    + "from priceWithFundamentals\n",
    + "where eps_basic_net > 0\n",
    + "group by ticker, price_date, pe\n",
    + "\n",
    + "**Analyze streaming stock data in real time**\n",
    + "With our view in place, we can quickly analyze our data using Spark SQL.\n",
    + "\n",
    + "%sql\n",
    + "select *\n",
    + "from viewPE\n",
    + "where ticker == 'AAPL'\n",
    + "order by price_date\n",
    + "\n",
    + "\n",
    + "-----\n",
    + "\n",
    + "As the underlying source of this consolidated data set is a Delta Lake table, this view\n",
    + "isn’t just showing the batch data but also any new streams of data that are coming in,\n",
    + "as per the following streaming dashboard.\n",
    + "\n",
    + "Underneath the covers, Structured Streaming isn’t just writing the data to Delta Lake\n",
    + "tables but also keeping the state of the distinct number of keys (in this case ticker\n",
    + "symbols) that need to be tracked.\n",
    + "\n",
    + "Because you are using Spark SQL, you can execute aggregate queries at scale\n",
    + "and in real time.\n",
    + "\n",
    + "%sql\n",
    + "SELECT ticker, AVG(close) as Average_Close\n",
    + "FROM priceWithFundamentals\n",
    + "GROUP BY ticker\n",
    + "ORDER BY Average_Close\n",
    + "\n",
    + "In closing, we demonstrated how to simplify streaming stock data analysis using\n",
    + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . By combining Spark Structured Streaming and Delta Lake, we can use the\n",
    + "Databricks integrated workspace to create a performant, scalable solution that has\n",
    + "the advantages of both data lakes and data warehouses.\n",
    + "\n",
    + "The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) removes the data engineering complexities\n",
    + "commonly associated with streaming and transactional consistency, enabling\n",
    + "data engineering and data science teams to focus on understanding the trends in\n",
    + "their stock data.\n",
    + "\n",
    + "\n",
    + "-----\n",
    + "\n",
    + "**How Tilting Point Does Streaming**\n",
    + "**Ingestion Into Delta Lake**\n",
    + "\n",
    + "Tilting Point is a new-generation games partner that provides top development\n",
    + "studios with expert resources, services and operational support to optimize\n",
    + "high-quality live games for success. 
Through its user acquisition fund and its\n", + "world-class technology platform, Tilting Point funds and runs performance\n", + "marketing management and live games operations to help developers achieve\n", + "profitable scale.\n", + "\n", + "By leveraging Delta Lake, Tilting Point is able to leverage quality data and make\n", + "it readily available for analytics to improve the business. Diego Link, VP of\n", + "Engineering at Tilting Point, provided insights for this use case.\n", + "\n", + "The team at Tilting Point was running daily and hourly batch jobs for reporting on\n", + "game analytics. They wanted to make their reporting near real-time, getting insights\n", + "within 5–10 minutes.\n", + "\n", + "They also wanted to make their in-game LiveOps decisions based on real-time player\n", + "behavior for giving real-time data to a bundles-and-offer system, provide up-to-theminute alerting on LiveOPs changes that actually might have unforeseen detrimental\n", + "effects and even alert on service interruptions in game operations. The goal was to\n", + "ensure that the game experience was as robust as possible for their players.\n", + "\n", + "Additionally, they had to store encrypted Personally Identifiable Information (PII) data\n", + "separately in order to maintain GDPR compliance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How data flows and associated challenges**\n", + "Tilting Point has a proprietary software development kit that developers integrate\n", + "with to send data from game servers to an ingest server hosted in AWS. This service\n", + "removes all PII data and then sends the raw data to an Amazon Firehose endpoint.\n", + "Firehose then dumps the data in JSON format continuously to S3.\n", + "\n", + "To clean up the raw data and make it available quickly for analytics, the team\n", + "considered pushing the continuous data from Firehose to a message bus (e.g.,\n", + "Kafka, Kinesis) and then using [Apache Spark’s Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) to continuously\n", + "process data and write to Delta Lake tables.\n", + "\n", + "While that architecture sounds ideal for low latency requirements of processing\n", + "data in seconds, Tilting Point didn’t have such low latency needs for their ingestion\n", + "pipeline. They wanted to make the data available for analytics in a few minutes, not\n", + "seconds. Hence they decided to simplify our architecture by eliminating a message\n", + "bus and instead use S3 as a continuous source for their structured streaming job.\n", + "\n", + "But the key challenge in using S3 as a continuous source is identifying files that\n", + "changed recently.\n", + "\n", + "Listing all files every few minutes has two major issues:\n", + "\n", + "- **Higher latency:** Listing all files in a directory with a large number of files has high\n", + "overhead and increases processing time.\n", + "\n", + "- **Higher cost:** Listing lots of files every few minutes can quickly add to the S3 cost.\n", + "\n", + "**Leveraging Structured Streaming with blob store as**\n", + "**source and Delta Lake tables as sink**\n", + "To continuously stream data from cloud blob storage like S3, Tilting Point uses\n", + "[Databricks’ S3-SQS source](https://docs.databricks.com/spark/latest/structured-streaming/sqs.html#optimized-s3-file-source-with-sqs) . 
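\n",
    + "\n",
    + "A cleaned-up sketch of the S3-SQS read that appears in the pipeline walkthrough below;\n",
    + "the queue URL and schema are placeholders rather than values from Tilting Point’s setup:\n",
    + "\n",
    + "from pyspark.sql.types import StructType, StructField, StringType\n",
    + "\n",
    + "event_schema = StructType([StructField('event_type', StringType(), True)])  # placeholder\n",
    + "queue_url = '<your-sqs-queue-url>'  # placeholder\n",
    + "\n",
    + "raw_events = (spark.readStream\n",
    + "    .format('s3-sqs')\n",
    + "    .option('fileFormat', 'json')\n",
    + "    .option('queueUrl', queue_url)\n",
    + "    .schema(event_schema)\n",
    + "    .load())\n",
    + "\n",
    + "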
The S3-SQS source provides an easy way to incrementally\n", + "stream data from S3 without the need to write any state management code on what\n", + "files were recently processed.\n", + "\n", + "\n", + "-----\n", + "\n", + "This is how Tilting Point’s ingestion pipeline looks:\n", + "\n", + "- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information\n", + "to SQS via SNS.\n", + "\n", + "- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3SQS source reads the new file names that arrived in S3 from SQS and uses that\n", + "information to read the actual file contents in S3. An example code below:\n", + "\n", + "spark.readStream \\\n", + "\n", + ".format( “s3-sqs” ) \\\n", + "\n", + ". option ( “fileFormat” , “json” ) \\\n", + "\n", + ". option ( “queueUrl” , ...) \\\n", + "\n", + ". schema (...) \\\n", + "\n", + ". load ()\n", + "\n", + "- Tilting Point’s structured streaming job then cleans up and transforms the data.\n", + "Based on the game data, the streaming job uses the foreachBatch API of Spark\n", + "streaming and writes to 30 different Delta Lake tables.\n", + "\n", + "- The streaming job produces lots of small files. This affects performance of\n", + "downstream consumers. So, an optimize job runs daily to compact small files in\n", + "the table and store them as right file sizes so that consumers of the data have\n", + "good performance while reading the data from Delta Lake tables. Tilting Point\n", + "also runs a weekly optimize job for a second round of compaction. Architecture showing continuous data ingest into Delta Lake tables\n", + "\n", + "\n", + "-----\n", + "\n", + "The above Delta Lake ingestion architecture helps in the following ways:\n", + "\n", + "- **Incremental loading:** The S3-SQS source incrementally loads the new files in S3.\n", + "This helps quickly process the new files without too much overhead in listing files.\n", + "\n", + "- **No explicit file state management:** There is no explicit file state management\n", + "needed to look for recent files.\n", + "\n", + "- **Lower operational burden:** Since we use S3 as a checkpoint between Firehose\n", + "and Structured Streaming jobs, the operational burden to stop streams and reprocess data is relatively low.\n", + "\n", + "- **Reliable ingestion:** Delta Lake uses [optimistic concurrency control](https://docs.databricks.com/delta/optimizations/isolation-level.html) to offer ACID\n", + "transactional guarantees. This helps with reliable data ingestion.\n", + "\n", + "- **File compaction:** One of the major problems with streaming ingestion is tables\n", + "ending up with a large number of small files that can affect read performance.\n", + "Before Delta Lake, we had to set up a different table to write the compacted\n", + "data. 
With Delta Lake, thanks to ACID transactions, we can compact the files and rewrite the\n",
    + "data back to the same table safely.\n",
    + "\n",
    + "- **Snapshot isolation:** Delta Lake’s snapshot isolation allows us to expose the\n",
    + "ingestion tables to downstream consumers while data is being appended by a\n",
    + "streaming job and modified during compaction.\n",
    + "\n",
    + "- **Rollbacks:** In case of bad writes, [Delta Lake’s Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) helps us roll back to a\n",
    + "previous version of the table.\n",
    + "\n",
    + "In this section, we walked through Tilting Point’s use cases and how they do\n",
    + "streaming ingestion using Databricks’ S3-SQS source into Delta Lake tables\n",
    + "efficiently, without too much operational overhead, to make good-quality data\n",
    + "readily available for analytics.\n",
    + "\n",
    + "\n",
    + "-----\n",
    + "\n",
    + "**Building a Quality of Service**\n",
    + "**Analytics Solution for Streaming**\n",
    + "**Video Services**\n",
    + "\n",
    + "As traditional pay TV [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/) , content owners have embraced\n",
    + "direct-to-consumer (D2C) subscription and ad-supported streaming for monetizing their\n",
    + "libraries of content. For companies whose entire business model revolved around\n",
    + "producing great content, which they then licensed to distributors, the shift to now\n",
    + "owning the entire glass-to-glass experience has required new capabilities, such as\n",
    + "building media supply chains for content delivery to consumers, supporting apps for\n",
    + "a myriad of devices and operating systems, and performing customer relationship\n",
    + "functions like billing and customer service.\n",
    + "\n",
    + "With most services renewing on a monthly basis, subscription service operators need\n",
    + "to prove value to their subscribers at all times. General quality of streaming video\n",
    + "issues (encompassing buffering, latency, pixelation, jitter, packet loss and the blank\n",
    + "screen) have significant business impacts, whether it’s increased [subscriber churn](https://www.streamingmedia.com/Articles/ReadArticle.aspx?ArticleID=112209) or\n",
    + "[decreased video engagement](https://www.tvtechnology.com/opinions/why-buffering-remains-every-video-providers-worst-nightmare) .\n",
    + "\n",
    + "When you start streaming, you realize there are so many places where breaks can\n",
    + "happen and the viewer experience can suffer. There may be an issue at the source in\n",
    + "the servers on-premises or in the cloud; in transit at either the CDN level or ISP level\n",
    + "or the viewer’s home network; or at the playout level with player/client issues. What\n",
    + "breaks at n x 10^4 concurrent streamers is different from what breaks at n x 10^5 or\n",
    + "n x 10^6. There is no pre-release testing that can quite replicate real-world users and\n",
    + "their ability to push even the most redundant systems to their breaking point as they\n",
    + "\n",
    + "\n",
    + "-----\n",
    + "\n",
    + "channel surf, click in and out of the app, sign on from different devices simultaneously\n",
    + "and so on. And because of the nature of TV, things will go wrong during the most\n",
    + "important, high-profile events drawing the largest audiences. If you start [receiving](https://downdetector.com/)\n",
    + "[complaints on social media](https://downdetector.com/) , how can you tell if they are unique to that one user or\n",
    + "rather regional or a national issue? 
If national, is it across all devices or only certain\n", + "types (e.g., possibly the OEM updated the OS on an older device type, which ended up\n", + "causing compatibility issues with the client)?\n", + "\n", + "Identifying, remediating and preventing viewer quality of experience issues becomes\n", + "a big data problem when you consider the number of users, the number of actions\n", + "they are taking and the number of handoffs in the experience (servers to CDN to ISP to\n", + "home network to client). Quality of Service (QoS) helps make sense of these streams\n", + "of data so you can understand what is going wrong, where and why. Eventually you\n", + "can get into predictive analytics around what could go wrong and how to remediate\n", + "it before anything breaks.\n", + "\n", + "**Databricks Quality of Service solution overview**\n", + "The aim of this solution is to provide the core for any streaming video platform that\n", + "wants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n", + "[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\n", + "a Unified Data Analytics Platform for both the real-time insights and the advanced\n", + "analytics capabilities.\n", + "\n", + "[By using Databricks](https://databricks.com/customers) , streaming platforms can get faster insights by always\n", + "leveraging the most complete and recent data sets powered by robust and reliable\n", + "data pipelines. This decreases time to market for new features by accelerating\n", + "data science using a collaborative environment. It provides support for managing\n", + "the end-to-end machine learning lifecycle and reduces operational costs across\n", + "all cycles of software development by having a unified platform for both data\n", + "engineering and data science.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Video QoS solution architecture**\n", + "With complexities like low-latency monitoring alerts and highly scalable infrastructure\n", + "required for peak video traffic hours, the straightforward architectural choice was\n", + "the Delta Architecture — both standard big data architectures like Lambda and Kappa\n", + "Architectures have disadvantages around the operational effort required to maintain\n", + "multiple types of pipelines (streaming and batch) and lack support for a unified data\n", + "engineering and data science approach.\n", + "\n", + "The Delta Architecture is the next-generation paradigm that enables all the data\n", + "personas in your organization to be more productive:\n", + "\n", + "- Data engineers can develop data pipelines in a cost-efficient manner\n", + "continuously without having to choose between batch and streaming\n", + "\n", + "- Data analysts can get near real-time insights and faster answers to their BI queries\n", + "\n", + "- Data scientists can develop better machine learning models using more reliable data\n", + "sets with support for time travel that facilitates reproducible experiments and reports Delta Architecture using the “multi-hop” approach for data pipelines\n", + "\n", + "\n", + "-----\n", + "\n", + "Writing data pipelines using the Delta Architecture follows the best practices of\n", + "having a multi-layer “multi-hop” approach where we progressively add structure to\n", + "data: “Bronze” tables or Ingestion tables are usually raw data sets in the native format\n", + "(JSON, 
CSV or txt), “Silver” tables represent cleaned/transformed data sets ready for\n", + "reporting or data science, and “Gold” tables are the final presentation layer.\n", + "\n", + "For the pure streaming use cases, the option of materializing the DataFrames in\n", + "intermediate Delta Lake tables is basically just a trade-off between latency/SLAs and\n", + "cost (an example being real-time monitoring alerts vs. updates of the recommender\n", + "system based on new content).\n", + "\n", + "A streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n", + "\n", + "The number of “hops” in this approach is directly impacted by the number of consumers\n", + "downstream, complexity of the aggregations (e.g., Structured Streaming enforces\n", + "certain limitations around chaining multiple aggregations) and the maximization of\n", + "operational efficiency.\n", + "\n", + "The QoS solution architecture is focused around best practices for data processing\n", + "and is not a full video-on-demand (VoD) solution — with some standard components\n", + "like the “front door” service Amazon API Gateway being avoided from the high-level\n", + "architecture in order to keep the focus on data and analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "High-level architecture for the QoS platform\n", + "\n", + "\n", + "**Making your data ready for analytics**\n", + "Both sources of data included in the QoS solution (application events and CDN logs)\n", + "are using the JSON format, great for data exchange — allowing you to represent\n", + "complex nested structures, but not scalable and difficult to maintain as a storage\n", + "format for your data lake / analytics system.\n", + "\n", + "\n", + "In order to make the data directly queryable across the entire organization, the\n", + "Bronze to Silver pipeline (the “make your data available to everyone” pipeline) should\n", + "transform any raw formats into Delta Lake and include all the quality checks or data\n", + "masking required by any regulatory agencies.\n", + "\n", + "\n", + "-----\n", + "\n", + "Raw format of the app events\n", + "\n", + "**Video applications events**\n", + "Based on the architecture, the video application events are pushed directly to\n", + "Kinesis Streams and then just ingested to a Delta Lake append-only table without\n", + "any changes to the schema.\n", + "\n", + "Using this pattern allows a high number of consumers downstream to process the\n", + "data in a streaming paradigm without having to scale the throughput of the Kinesis\n", + "stream. As a side effect of using a Delta Lake table as a sink (which supports [optimize](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-optimize.html) !),\n", + "we don’t have to worry about the way the size of the processing window will impact the\n", + "number of files in your target table — known as the “small files” issue in the big data world.\n", + "\n", + "Both the timestamp and the type of message are being extracted from the JSON\n", + "event in order to be able to partition the data and allow consumers to choose the\n", + "type of events they want to process. 
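\n",
    + "\n",
    + "A minimal sketch of this append-only Bronze ingestion, reusing the getKinesisStream\n",
    + "helper that appears in the alerting example later in this chapter; the column names and\n",
    + "paths are illustrative assumptions, not the solution’s exact schema:\n",
    + "\n",
    + "from pyspark.sql import functions as F\n",
    + "\n",
    + "raw_events = getKinesisStream('player_events')  # streaming DataFrame of raw JSON events\n",
    + "\n",
    + "bronze_events = (raw_events\n",
    + "    .withColumn('eventDate', F.to_date('eventTimestamp'))\n",
    + "    .select('eventDate', 'eventType', 'rawPayload'))\n",
    + "\n",
    + "(bronze_events.writeStream\n",
    + "    .format('delta')\n",
    + "    .outputMode('append')\n",
    + "    .option('checkpointLocation', '/checkpoints/bronze_events')\n",
    + "    .partitionBy('eventDate', 'eventType')\n",
    + "    .start('/delta/bronze/events'))\n",
    + "\n",
    + "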
Again combining a single Kinesis stream for\n", + "the events with a Delta Lake “Events” table reduces the operational complexity while\n", + "making things easier for scaling during peak hours.\n", + "\n", + "\n", + "All the details are extracted from JSON for the Silver table\n", + "\n", + "\n", + "-----\n", + "\n", + "**CDN logs**\n", + "The CDN logs are delivered to S3, so the easiest way to process them is the Databricks\n", + "Auto Loader, which incrementally and efficiently processes new data files as they\n", + "arrive in S3 without any additional setup.\n", + "\n", + "auto_loader_df = spark.readStream.format( “cloudFiles” ) \\\n", + "\n", + ".option( “cloudFiles.format” , “json” ) \\\n", + "\n", + ".option( “cloudFiles.region” , region) \\\n", + "\n", + ".load(input_location)\n", + "\n", + "anonymized_df = auto_loader_df. select ( ‘*’ , ip_\n", + "\n", + "anonymizer( ‘requestip’ ). alias ( ‘ip’ ))\\\n", + "\n", + ".drop( ‘requestip’ )\\\n", + "\n", + ".withColumn( “origin” , map_ip_to_location(col( ‘ip’ )))\n", + "\n", + "anonymized_df.writeStream \\\n", + "\n", + ".option( ‘checkpointLocation’ , checkpoint_location)\\\n", + "\n", + ".format( ‘delta’ ) \\\n", + "\n", + ".table(silver_database + ‘.cdn_logs’ )\n", + "\n", + "As the logs contain IPs — considered personal data under the GDPR regulations — the\n", + "“make your data available to everyone” pipeline has to include an anonymization step.\n", + "Different techniques can be used, but we decided to just strip the last octet from IPv4\n", + "and the last 80 bits from IPv6. On top, the data set is also enriched with information\n", + "around the origin country and the ISP provider, which will be used later in the Network\n", + "Operation Centers for localization.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Creating the Dashboard /**\n", + "**Virtual Network Operation Centers**\n", + "Streaming companies need to monitor network performance and the user experience\n", + "as near real-time as possible, tracking down to the individual level with the ability to\n", + "abstract at the segment level, easily defining new segments such as those defined by\n", + "geos, devices, networks and/or current and historical viewing behavior.\n", + "\n", + "For streaming companies that has meant adopting the concept of Network Operation\n", + "Centers (NOC) from telco networks for monitoring the health of the streaming\n", + "experience for their users at a macro level, flagging and responding to any issues\n", + "early on. At their most basic, NOCs should have dashboards that compare the current\n", + "experience for users against a performance baseline so that the product teams can\n", + "quickly and easily identify and attend to any service anomalies.\n", + "\n", + "In the QoS solution we have incorporated a [Databricks dashboard](https://docs.databricks.com/notebooks/dashboards.html) . BI tools can also\n", + "be effortlessly connected in order to build more complex visualizations, but based\n", + "on customer feedback, built-in dashboards are, most of the time, the fastest way to\n", + "present the insights to business users.\n", + "\n", + "The aggregated tables for the NOC will basically be the Gold layer of our Delta\n", + "Architecture — a combination of CDN logs and the application events. 
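\n",
    + "\n",
    + "As a rough illustration of what one of these Gold aggregates could look like (the table\n",
    + "and column names below are hypothetical, not taken from the solution notebooks):\n",
    + "\n",
    + "from pyspark.sql import functions as F\n",
    + "\n",
    + "# Requests and server errors per CDN edge location and minute, from the Silver CDN logs\n",
    + "gold_cdn_health = (spark.table('silver.cdn_logs')\n",
    + "    .groupBy('edge_location', F.window('request_time', '1 minute'))\n",
    + "    .agg(F.count(F.lit(1)).alias('requests'),\n",
    + "         F.sum(F.when(F.col('status_code') >= 500, 1).otherwise(0)).alias('server_errors')))\n",
    + "\n",
    + "(gold_cdn_health.write\n",
    + "    .format('delta')\n",
    + "    .mode('overwrite')\n",
    + "    .saveAsTable('gold.cdn_health_per_minute'))\n",
    + "\n",
    + "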
Example of Network Operations Center dashboard\n", + "\n", + "\n", + "-----\n", + "\n", + "The dashboard is just a way to visually package the results of SQL queries or Python\n", + "/ R transformation — each notebook supports multiple dashboards so in case of\n", + "multiple end users with different requirements we don’t have to duplicate the code —\n", + "as a bonus the refresh can also be scheduled as a Databricks job.\n", + "\n", + "Visualization of the results of a SQL query\n", + "\n", + "Loading time for videos (time to first frame) allows better understanding of the\n", + "performance for individual locations of your CDN — in this case the AWS CloudFront\n", + "Edge nodes — which has a direct impact in your strategy for improving this KPI —\n", + "either by spreading the user traffic over multi-CDNs or maybe just implementing a\n", + "dynamic origin selection in case of AWS CloudFront using Lambda@Edge.\n", + "\n", + "\n", + "-----\n", + "\n", + "Failure to understand the reasons for high levels of buffering — and the poor video\n", + "quality experience that it brings — has a significant impact on subscriber churn rate.\n", + "On top of that, advertisers are not willing to spend money on ads responsible for\n", + "reducing the viewer engagement — as they add extra buffering on top, so the profits\n", + "on the advertising business usually are impacted too. In this context, collecting as\n", + "much information as possible from the application side is crucial to allow the analysis\n", + "to be done not only at video level but also browser or even type / version of application.\n", + "\n", + "On the content side, events for the application can provide useful information about\n", + "user behavior and overall quality of experience. How many people that paused a video\n", + "have actually finished watching that episode / video? What caused the stoppage: The\n", + "quality of the content or delivery issues? Of course, further analyses can be done by\n", + "linking all the sources together (user behavior, performance of CDNs /ISPs) to not only\n", + "create a user profile but also to forecast churn.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Creating (near) real-time alerts**\n", + "When dealing with the velocity, volume and variety of data generated in video\n", + "streaming from millions of concurrent users, dashboard complexity can make it\n", + "harder for human operators in the NOC to focus on the most important data at the\n", + "moment and zero-in on root cause issues. With this solution, you can easily set up\n", + "automated alerts when performance crosses certain thresholds that can help the\n", + "human operators of the network as well as set off automatic remediation protocols\n", + "via a Lambda function. For example:\n", + "\n", + "- If a CDN is having latency much higher than baseline (e.g., if it’s more than 10%\n", + "latency vs. 
baseline average), initiate automatic CDN traffic shifts.\n", + "\n", + "- If more than [some threshold, e.g., 5%] of clients report playback errors, alert the\n", + "product team that there is likely a client issue for a specific device.\n", + "\n", + "- If viewers on a certain ISP are having higher-than-average buffering and\n", + "pixelation issues, alert frontline customer representatives on responses and ways\n", + "to decrease issues (e.g., set stream quality lower).\n", + "\n", + "From a technical perspective, generating real-time alerts requires a streaming\n", + "engine capable of processing data real time and publish-subscribe service to push\n", + "notifications.\n", + "\n", + "\n", + "updates of web applications) or Amazon SQS for other consumers. The [custom for](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html)\n", + "[each writer](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html) option makes the writing of a pipeline to send email notifications based\n", + "on a rule-based engine (e.g., validating the percentage of errors for each individual\n", + "type of app over a period of time) really straightforward.\n", + "\n", + "def send_error_notification(row):\n", + "\n", + "sns_client = boto3.client( ‘sns’ , region)\n", + "\n", + "error_message = ‘Number of errors for the App has exceeded the\n", + "\n", + "threshold {}’ .format(row[ ‘percentage’ ])\n", + "\n", + "response = sns_client.publish(\n", + "\n", + "TopicArn =,\n", + "\n", + "Message = error_message,\n", + "\n", + "Subject =,\n", + "\n", + "MessageStructure = ‘string’ )\n", + "\n", + "# Structured Streaming Job\n", + "\n", + "getKinesisStream( “player_events” )\\\n", + "\n", + ".selectExpr( “type” , “app_type” )\\\n", + "\n", + ".groupBy( “app_type” )\\\n", + "\n", + ".apply(calculate_error_percentage)\\\n", + "\n", + ". where ( “percentage > {}” .format(threshold)) \\\n", + "\n", + ".writeStream\\\n", + "\n", + ". foreach (send_error_notification)\\\n", + "\n", + ".start()\n", + "\n", + "\n", + "Integrating microservices using Amazon SNS and Amazon SQS\n", + "\n", + "Sending email notifications using AWS SNS\n", + "\n", + "The QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\n", + "by using Amazon SNS and its integrations with Amazon Lambda (see below for the\n", + "\n", + "\n", + "-----\n", + "\n", + "On top of the basic email use case, the Demo Player includes three widgets updated\n", + "in real time using AWS AppSync: the number of active users, the most popular videos\n", + "and the number of users concurrently watching a video.\n", + "\n", + "Updating the application with the results of real-time aggregations\n", + "\n", + "The QoS solution is applying a similar approach — Structured Streaming and Amazon\n", + "SNS — to update all the values allowing for extra consumers to be plugged in using AWS\n", + "SQS. This is a common pattern when huge volumes of events have to be enhanced and\n", + "analyzed; pre-aggregate data once and allow each service (consumer) to make their\n", + "own decision downstream.\n", + "\n", + "**Next steps: machine learning**\n", + "Manually making sense of the historical data is important but is also very slow. 
If we want to be able to make automated decisions in the future, we have to integrate\n",
    + "machine learning algorithms.\n",
    + "\n",
    + "As a Unified Data Platform, Databricks empowers data scientists to build better data\n",
    + "science products using features like the Runtime for Machine Learning with built-in\n",
    + "support for [Hyperopt](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/index.html#hyperopt-overview) / [Horovod](https://docs.databricks.com/applications/machine-learning/train-model/distributed-training/horovod-runner.html) / [AutoML](https://databricks.com/product/automl-on-databricks) , or the integration with MLflow, the\n",
    + "end-to-end machine learning lifecycle management tool.\n",
    + "\n",
    + "\n",
    + "-----\n",
    + "\n",
    + "We have already explored a few important use cases across our customer base while\n",
    + "focusing on the possible extensions to the QoS solution.\n",
    + "\n",
    + "**Point-of-failure prediction and remediation**\n",
    + "As D2C streamers reach more users, the costs of even momentary loss of service\n",
    + "increase. ML can help operators move from reporting to prevention by forecasting\n",
    + "where issues could come up and remediating before anything goes wrong (e.g.,\n",
    + "a spike in concurrent viewers leads to switching CDNs to one with more capacity\n",
    + "automatically).\n",
    + "\n",
    + "**Customer churn**\n",
    + "Critical to growing subscription services is keeping the subscribers you have. By\n",
    + "understanding the quality of service at the individual level, you can add QoS as a\n",
    + "variable in churn and customer lifetime value models. Additionally, you can create\n",
    + "customer cohorts for those who have had video quality issues in order to test\n",
    + "proactive messaging and save offers.\n",
    + "\n",
    + "\n",
    + "**Getting started with the Databricks streaming video**\n",
    + "**QoS solution**\n",
    + "Providing consistent quality in the streaming video experience is table stakes at this\n",
    + "point to keep fickle audiences with ample entertainment options on your platform.\n",
    + "With this solution we have sought to create a quick start for most streaming video\n",
    + "platform environments to embed this QoS real-time streaming analytics solution in\n",
    + "a way that:\n",
    + "1. Scales to any audience size\n",
    + "2. Quickly flags quality performance issues at key parts of the distribution workflow\n",
    + "3. Is flexible and modular enough to easily customize for your audience and your\n",
    + "needs, such as creating new automated alerts or enabling data scientists to test\n",
    + "and roll out predictive analytics and machine learning\n",
    + "\n",
    + "To get started, download the notebooks for the [Databricks streaming video QoS solution](https://databricks.com/notebooks/QoS/index.html#00.config.html) . 
For more guidance on how to unify batch and streaming data into a single\n", + "system, view the [Delta Architecture webinar](https://pages.databricks.com/201908-WB-Delta-Architecture-A-Step-Beyond-Lambda-Architecture_Reg.html) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Customer Use Cases**\n", + "See how customers are using\n", + "Delta Lake to rapidly innovate\n", + "\n", + "## CHAPTER 05\n", + "\n", + "\n", + "-----\n", + "\n", + "**Healthdirect Australia**\n", + "Provides Personalized and Secure Online\n", + "Patient Care With Databricks\n", + "\n", + "As the shepherds of the National Health Services Directory (NHSD), Healthdirect\n", + "is focused on leveraging terabytes of data covering time-driven, activity-based\n", + "healthcare transactions to improve health care services and support. With\n", + "governance requirements, siloed teams and a legacy system that was difficult\n", + "to scale, they moved to Databricks. This boosted data processing for downstream\n", + "machine learning while improving data security to meet HIPAA requirements.\n", + "\n", + "**Spotlight on Healthdirect**\n", + "**Industry:** Healthcare and life sciences\n", + "6x\n", + "Improvement in data processing\n", + "20M\n", + "Records ingested in minutes\n", + "\n", + "**Data quality and governance issues, silos, and the**\n", + "**inability to scale**\n", + "Due to regulatory pressures, Healthdirect Australia set forth to improve overall data\n", + "quality and ensure a level of governance on top of that, but they ran into challenges\n", + "when it came to data storage and access. On top of that, data silos were blocking the\n", + "team from efficiently preparing data for downstream analytics. These disjointed data\n", + "\n", + "\n", + "-----\n", + "\n", + "sources impacted the consistency of data reads, as data was oftentimes out-of-sync\n", + "between the various systems in their stack. The low-quality data also led to higher\n", + "error rates and processing inefficiencies. This fragmented architecture created\n", + "significant operational overhead and limited their ability to have a comprehensive\n", + "view of the patient.\n", + "\n", + "Further, they needed to ingest over 1 billion data points due to a changing landscape\n", + "of customer demand such as bookings, appointments, pricing, eHealth transaction\n", + "activity, etc. — estimated at over 1TB of data.\n", + "\n", + "“We had a lot of data challenges. We just couldn’t process efficiently enough. We\n", + "were starting to get batch overruns. We were starting to see that a 24-hour window\n", + "isn’t the most optimum time in which we want to be able to deliver healthcare data\n", + "and services,” explained Peter James, Chief Architect at Healthdirect Australia.\n", + "\n", + "Ultimately, Healthdirect realized they needed to modernize their end-to-end process\n", + "and tech stack to properly support the business.\n", + "\n", + "**Modernizing analytics with Databricks and Delta Lake**\n", + "Databricks provides Healthdirect Australia with a Unified Data Platform that simplifies\n", + "data engineering and accelerates data science innovation. The notebook environment\n", + "enables them to make content changes in a controlled fashion rather than having to\n", + "run bespoke jobs each time.\n", + "\n", + "“Databricks has provided a big uplift for our teams and our data operations,” said\n", + "James. “The analysts were working directly with the data operations teams. 
They are\n", + "able to achieve the same pieces of work together within the same time frames that\n", + "used to take twice as long. They’re working together, and we’re seeing just a massive\n", + "acceleration in the speed at which we can deliver service.”\n", + "\n", + "\n", + "-----\n", + "\n", + "With Delta Lake, they’ve created logical data zones: Landing, Raw, Staging and Gold.\n", + "Within these zones, they store their data “as is,” in their structured or unstructured\n", + "state, in Delta Lake tables. From there, they use a metadata-driven schema and hold\n", + "the data within a nested structure within that table. What this allows them to do is\n", + "handle data consistently from every source and simplifies the mapping of data to the\n", + "various applications pulling the data.\n", + "\n", + "Meanwhile, through Structured Streaming, they were able to convert all of their\n", + "ETL batch jobs into streaming ETL jobs that could serve multiple applications\n", + "consistently. Overall, the advent of Spark Structured Streaming, Delta Lake and the\n", + "Databricks Unified Data Platform provides significant architectural improvements\n", + "that have boosted performance, reduced operational overheads and increased\n", + "process efficiencies.\n", + "\n", + "\n", + "**Faster data pipelines result in better patient-driven**\n", + "**healthcare**\n", + "As a result of the performance gains delivered by Databricks and the improved data\n", + "reliability through Delta Lake, Healthdirect Australia realized improved accuracy of\n", + "their fuzzy name match algorithm from less than 80% with manual verification to 95%\n", + "and no manual intervention.\n", + "\n", + "The processing improvements with Delta Lake and Structured Streaming allowed\n", + "them to process more than 30,000 automated updates per month. Prior to Databricks,\n", + "they had to use unreliable batch jobs that were highly manual to process the same\n", + "number of updates over a span of 6 months — a 6x improvement in data processing.\n", + "\n", + "“Databricks delivered the time to market as well as the analytics and operational\n", + "uplift that we needed in order to be able to meet the new demands of the\n", + "healthcare sector.”\n", + "\n", + "– Peter James, Chief Architect, Healthdirect Australia\n", + "\n", + "\n", + "-----\n", + "\n", + "They were also able to increase their data load rate to 1 million records per minute,\n", + "loading their entire 20 million record data set in 20 minutes. Before the adoption\n", + "of Databricks, this used to take more than 24 hours to process the same 1 million\n", + "transactions, blocking analysts from making swift decisions to drive results.\n", + "\n", + "Last, data security, which was critical to meet compliance requirements, was greatly\n", + "improved. 
Databricks provides standard security accreditations like HIPAA, and\n", + "Healthdirect was able to use Databricks to meet Australia’s security requirements.\n", + "This yielded significant cost reductions and gave them continuous data assurance\n", + "by monitoring changes to access privileges like changes in roles, metadata-level\n", + "security changes, data leakage, etc.\n", + "\n", + "“Databricks delivered the time to market as well as the analytics and operational\n", + "uplift that we needed in order to be able to meet the new demands of the healthcare\n", + "sector,” said James.\n", + "\n", + "With the help of Databricks, they have proven the value of data and analytics and how\n", + "it can impact their business vision. With transparent access to data that boasts\n", + "well-documented lineage and quality, participation across various business and\n", + "analyst groups has increased — empowering teams to collaborate and more\n", + "easily and quickly extract value from their data with the goal of improving\n", + "healthcare for everyone.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Comcast**\n", + "Uses Delta Lake and MLflow to\n", + "Transform the Viewer Experience\n", + "\n", + "**Spotlight on Comcast**\n", + "**Industry:** Media and entertainment\n", + "10x\n", + "Reduction in overall compute costs to process data\n", + "90%\n", + "Reduction in required DevOps resources to manage infrastructure\n", + "Reduced\n", + "Deployment times from weeks to minutes\n", + "\n", + "As a global technology and media company connecting millions of customers to\n", + "personalized experiences, Comcast struggled with massive data, fragile data pipelines\n", + "\n", + "and poor data science collaboration. With Databricks — leveraging Delta Lake and MLflow\n", + "— they can build performant data pipelines for petabytes of data and easily manage the\n", + "lifecycle of hundreds of models to create a highly innovative, unique and award-winning\n", + "viewer experience using voice recognition and machine learning.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Infrastructure unable to support data and ML needs**\n", + "Instantly answering a customer’s voice request for a particular program while turning\n", + "billions of individual interactions into actionable insights, strained Comcast’s IT\n", + "infrastructure and data analytics and data science teams. To make matters more\n", + "complicated, Comcast needed to deploy models to a disjointed and disparate range\n", + "of environments: cloud, on-premises and even directly to devices in some instances.\n", + "\n", + "- **Massive data:** Billions of events generated by the entertainment system and 20+\n", + "million voice remotes, resulting in petabytes of data that need to be sessionized\n", + "for analysis.\n", + "\n", + "- **Fragile pipelines:** Complicated data pipelines that frequently failed and were\n", + "hard to recover. 
Small files were difficult to manage, slowing data ingestion for\n", + "downstream machine learning.\n", + "\n", + "- **Poor collaboration:** Globally dispersed data scientists working in different\n", + "scripting languages struggled to share and reuse code.\n", + "\n", + "- **Manage management of ML models:** Developing, training and deploying hundreds\n", + "of models was highly manual, slow and hard to replicate, making it difficult to scale.\n", + "\n", + "- **Friction between dev and deployment:** Dev teams wanted to use the latest tools\n", + "and models while ops wanted to deploy on proven infrastructure.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Automated infrastructure, faster data**\n", + "**pipelines with Delta Lake**\n", + "Comcast realized they needed to modernize their entire approach to analytics from\n", + "data ingest to the deployment of machine learning models to delivering new features\n", + "that delight their customers. Today, the Databricks Unified Data Platform enables\n", + "Comcast to build rich data sets and optimize machine learning at scale, streamline\n", + "workflows across teams, foster collaboration, reduce infrastructure complexity, and\n", + "deliver superior customer experiences.\n", + "\n", + "- **Simplified infrastructure management:** Reduced operational costs through\n", + "automated cluster management and cost management features such as\n", + "autoscaling and spot instances.\n", + "\n", + "\n", + "\n", + "- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\n", + "initial processing of the raw telemetry from video and voice applications and devices.\n", + "\n", + "- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\n", + "and reliable ingestion at scale.\n", + "\n", + "- **Collaborative workspaces:** Interactive notebooks improve cross-team\n", + "collaboration and data science creativity, allowing Comcast to greatly accelerate\n", + "model prototyping for faster iteration.\n", + "\n", + "- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\n", + "and model serving via the Kubeflow environment, allowing them to track and\n", + "manage hundreds of models with ease.\n", + "\n", + "- **Reliable ETL at scale:** Delta Lake provides efficient analytics pipelines at scale\n", + "that can reliably join historic and streaming data for richer insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering personalized experiences with ML**\n", + "In the intensely competitive entertainment industry, there is no time to press the\n", + "Pause button. Armed with a unified approach to analytics, Comcast can now fastforward into the future of AI-powered entertainment — keeping viewers engaged and\n", + "delighted with competition-beating customer experiences.\n", + "\n", + "- **Emmy-winning viewer experience:** Databricks helps enable Comcast to create\n", + "a highly innovative and award-winning viewer experience with intelligent voice\n", + "commands that boosts engagement.\n", + "\n", + "- **Reduced compute costs by 10x:** Delta Lake has enabled Comcast to optimize data\n", + "ingestion, replacing 640 machines with 64 while improving performance. 
Teams\n", + "can spend more time on analytics and less time on infrastructure management.\n", + "\n", + "- **Less DevOps:** Reduced the number of DevOps full-time employees required for\n", + "onboarding 200 users from 5 to 0.5.\n", + "\n", + "- **Higher data science productivity:** Fostered collaboration between global data\n", + "scientists by enabling different programming languages through a single\n", + "interactive workspace. Also, Delta Lake has enabled the data team to use data at\n", + "any point within the data pipeline, allowing them to act more quickly in building\n", + "and training new models.\n", + "\n", + "- **Faster model deployment:** Reduced deployment times from weeks to minutes as\n", + "operations teams deployed models on disparate platforms.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Banco Hipotecario**\n", + "Personalizes the Banking\n", + "Experience With Data and ML\n", + "\n", + "Banco Hipotecario — a leading Argentinian commercial bank — is on a mission\n", + "to leverage machine learning to deliver new insights and services that will delight\n", + "customers and create upsell opportunities. With a legacy analytics and data\n", + "warehousing system that was rigid and complex to scale, they turned to Databricks\n", + "to unify data science, engineering and analytics.\n", + "\n", + "As a result of this partnership, they were able to significantly increase customer\n", + "acquisition and cross-sells while lowering the cost for acquisition, greatly impacting\n", + "overall customer retention and profitability.\n", + "\n", + "**Spotlight on Banco Hipotecario**\n", + "**Industry:** Financial services\n", + "35%\n", + "\n", + "Reduction in cost of acquisition\n", + "**Technical use cases:** Ingest and ETL, machine learning and SQL Analytics\n", + "\n", + "\n", + "-----\n", + "\n", + "**Legacy analytics tools are slow, rigid and**\n", + "**impossible to scale**\n", + "Banco Hipotecario set forth to increase customer acquisition by reducing risk and\n", + "improving the customer experience. With data analytics and machine learning\n", + "anchoring their strategy, they hoped to influence a range of use cases from fraud\n", + "detection and risk analysis to serving product recommendations to drive upsell and\n", + "cross-sell opportunities and forecast sales.\n", + "\n", + "Banco Hipotecario faced a number of the challenges that often come along with\n", + "outdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n", + "— the list goes on.\n", + "\n", + "“In order to execute on our data analytics strategy, new technologies were needed\n", + "in order to improve data engineering and boost data science productivity,” said\n", + "Daniel Sanchez, Enterprise Data Architect at Banco Hipotecario. “The first steps we\n", + "took were to move to a cloud-based data lake, which led us to Azure Databricks\n", + "and Delta Lake.”\n", + "\n", + "\n", + "-----\n", + "\n", + "**A unified platform powers the data lake**\n", + "**and easy collaboration**\n", + "Banco Hipotecario turned to Databricks to modernize their data warehouse\n", + "environment, improve cross-team collaboration, and drive data science innovation.\n", + "Fully managed in Microsoft Azure, they were able to easily and reliably ingest massive\n", + "volumes of data, spinning up their whole infrastructure in 90 days. 
With Databricks’ automated cluster management capabilities, they are able to scale clusters on-demand to support large workloads.\n", +    "\n", +    "Delta Lake has been especially useful in bringing reliability and performance to Banco Hipotecario’s data lake environment. With Delta Lake, they are now able to build reliable and performant ETL pipelines like never before.\n", +    "\n", +    "Meanwhile, performing SQL Analytics on Databricks has helped them carry out data exploration and cleansing and generate data sets in order to create models, enabling the team to deploy their first model within the first three months; the second model was rolled out in just two weeks.\n", +    "\n", +    "At the same time, data scientists were finally able to collaborate, thanks to interactive notebooks; this meant faster builds, training and deployment. And MLflow streamlined the ML lifecycle and removed the overreliance on data engineering.\n", +    "\n", +    "“Databricks gives our data scientists the means to easily create our own experiments and deploy them to production in weeks, rather than months,” said Miguel Villalba, Head of Data Engineering and Data Science.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**An efficient team maximizes customer acquisition and retention**\n", +    "Since moving to Databricks, the data team at Banco Hipotecario could not be happier, as Databricks has unified them across functions in an integrated fashion.\n", +    "\n", +    "The results of data unification and markedly improved collaboration and autonomy cannot be overstated. Since deploying Databricks, Banco Hipotecario has increased their cross-sell into new products by a whopping 90%, while machine learning has reduced the cost of customer acquisition by 35%.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**Viacom18** Migrates From Hadoop to Databricks to Deliver More Engaging Experiences\n", +    "\n", +    "Viacom18 Media Pvt. Ltd. is one of India’s fastest-growing entertainment networks with 40x growth over the past decade. They offer multi-platform, multigenerational and multicultural brand experiences to 600+ million monthly viewers.\n", +    "\n", +    "In order to deliver more engaging experiences for their millions of viewers, Viacom18 migrated from their Hadoop environment due to its inability to process data at scale efficiently. With Databricks, they have streamlined their infrastructure management, increased data pipeline speeds and increased productivity among their data teams.\n", +    "\n", +    "Today, Viacom18 is able to deliver more relevant viewing experiences to their subscribers, while identifying opportunities to optimize the business and drive greater ROI.\n", +    "\n", +    "**Spotlight on Viacom18**\n", +    "**Industry:** Media and entertainment\n", +    "\n", +    "26% increase in operational efficiency lowers overall costs\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**Growth in subscribers and terabytes of viewing data push Hadoop to its limits**\n", +    "Viacom18, a joint venture between Network18 and ViacomCBS, is focused on providing its audiences with highly personalized viewing experiences. The core of this strategy requires implementing an enterprise data architecture that enables the building of powerful customer analytics on daily viewer data. 
But with millions of\n", + "consumers across India, the sheer amount of data was tough to wrangle: They were\n", + "tasked with ingesting and processing over 45,000 hours of daily content on VOOT\n", + "(Viacom18’s on-demand video subscription platform), which easily generated 700GB\n", + "to 1TB of data per day.\n", + "\n", + "“Content is at the heart of what we do,” explained Parijat Dey, Viacom18’s Assistant\n", + "Vice President of Digital Transformation and Technology. “We deliver personalized\n", + "content recommendations across our audiences around the world based on\n", + "individual viewing history and preferences in order to increase viewership and\n", + "customer loyalty.”\n", + "\n", + "Viacom18’s data lake, which was leveraging on-premises Hadoop for operations,\n", + "wasn’t able to optimally process 90 days of rolling data within their management’s\n", + "defined SLAs, limiting their ability to deliver on their analytics needs, which impacted\n", + "not only the customer experience but also overall costs.\n", + "\n", + "To meet this challenge head-on, Viacom18 needed a modern data warehouse with the\n", + "ability to analyze data trends for a longer period of time instead of daily snapshots. They\n", + "also needed a platform that simplified infrastructure by allowing their team to easily\n", + "provision clusters with features like auto-scaling to help reduce compute costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Rapid data processing for analytics**\n", + "**and ML with Databricks**\n", + "To enable the processing power and data science capabilities they required, Viacom18\n", + "partnered with Celebal Technologies, a premier Salesforce, data analytics and big data\n", + "consulting organization based in India. The team at Celebal leveraged Azure Databricks\n", + "to provide Viacom18 with a unified data platform that modernizes its data warehousing\n", + "capabilities and accelerates data processing at scale.\n", + "\n", + "The ability to cache data within Delta Lake resulted in the much-needed acceleration\n", + "of queries, while cluster management with auto-scaling and the decoupling of\n", + "\n", + "\n", + "storage and compute simplified Viacom18’s infrastructure management and\n", + "optimized operational costs. “Delta Lake has created a streamlined approach to\n", + "the management of data pipelines,” explained Dey. “This has led to a decrease in\n", + "operational costs while speeding up time-to-insight for downstream analytics and\n", + "data science.”\n", + "\n", + "The notebooks feature was an unexpected bonus for Viacom18, as a common workspace\n", + "gave data teams a way to collaborate and increase productivity on everything from\n", + "model training to ad hoc analysis, dashboarding and reporting via PowerBI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Leveraging viewer data to power personalized**\n", + "**viewing experiences**\n", + "Celebal Technologies and Databricks have enabled Viacom18 to deliver innovative\n", + "customer solutions and insights with increased cross-team collaboration and\n", + "productivity. 
With Databricks, Viacom18’s data team is now able to seamlessly navigate their data while better serving their customers.\n", +    "\n", +    "“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data and deliver customer behavioral and engagement insights to the analysts and data scientists,” said Dey.\n", +    "\n", +    "In addition to performance gains, the faster query times have also lowered the overall cost of ownership, even with daily increases in data volumes. “Azure Databricks has greatly streamlined processes and improved productivity by an estimated 26%,” concluded Dey.\n", +    "\n", +    "Overall, Dey credits the migration from Hadoop to Databricks with delivering significant business value — reducing the cost of failure, accelerating processing speeds at scale, and simplifying ad hoc analysis for easier data exploration and innovations that deliver highly engaging customer experiences.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "# What’s next?\n", +    "\n", +    "Now that you understand Delta Lake, it may be time to take a look at some additional resources.\n", +    "\n", +    "**Do a deep dive into Delta Lake >**\n", +    "\n", +    "- [Getting Started With Delta Lake Tech Talk Series](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks)\n", +    "\n", +    "- [Diving Into Delta Lake Tech Talk Series](https://databricks.com/discover/diving-into-delta-lake-talks)\n", +    "\n", +    "- [Visit the site](https://databricks.com/product/delta-lake-on-databricks) for additional resources\n", +    "\n", +    "**[Try Databricks for free >](https://databricks.com/try-databricks)**\n", +    "**[Learn more >](https://pages.databricks.com/delta-lake-open-source-reliability-for-data-lakes-reg.html)**\n", +    "\n", +    "\n", +    "-----
SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
**eBook**\n", +    "\n", +    "## The Data Team’s Guide to the Databricks Lakehouse Platform\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "#### Contents\n", +    "\n", +    "- **Chapter 1:** The data lakehouse\n", +    "- **Chapter 2:** The Databricks Lakehouse Platform\n", +    "- **Chapter 3:** Data reliability and performance\n", +    "- **Chapter 4:** Unified governance and sharing for data, analytics and AI\n", +    "- **Chapter 5:** Security\n", +    "- **Chapter 6:** Instant compute and serverless\n", +    "- **Chapter 7:** Data warehousing\n", +    "- **Chapter 8:** Data engineering\n", +    "- **Chapter 9:** Data streaming\n", +    "- **Chapter 10:** Data science and machine learning\n", +    "- **Chapter 11:** Databricks Technology Partners and the modern data stack\n", +    "- **Chapter 12:** Get started with the Databricks Lakehouse Platform\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**INTRODUCTION**\n", +    "\n", +    "#### The Data Team’s Guide to the Databricks Lakehouse Platform\n", +    "\n", +    "_The Data Team’s Guide to the Databricks Lakehouse Platform_ is designed for data practitioners and leaders who are embarking on their journey into the data lakehouse architecture.\n", +    "\n", +    "In this eBook, you will learn the full capabilities of the data lakehouse architecture and how the Databricks Lakehouse Platform helps organizations of all sizes — from enterprises to startups in every industry — with all their data, analytics, AI and machine learning use cases on one platform.\n", +    "\n", +    "You will see how the platform combines the best elements of data warehouses and data lakes to increase the reliability, performance and scalability of your data platform. Discover how the lakehouse simplifies complex workloads in data engineering, data warehousing, data streaming, data science and machine learning — and bolsters collaboration for your data teams, allowing them to maintain new levels of governance, flexibility and agility in an open and multicloud environment.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**CHAPTER**\n", +    "\n", +    "# 01\n", +    "### The data lakehouse\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "#### The evolution of data architectures\n", +    "\n", +    "Data has moved front and center within every organization as data-driven insights have fueled innovation, competitive advantage and better customer experiences.\n", +    "\n", +    "However, as companies place mandates on becoming more data-driven, their data teams are left in a sprint to deliver the right data for business insights and innovation. With the widespread adoption of cloud, data teams often invest in large-scale complex data systems that have capabilities for streaming, business intelligence, analytics and machine learning to support the overall business objectives.\n", +    "\n", +    "To support these objectives, data teams have deployed cloud data warehouses and data lakes.\n", +    "\n", +    "**Traditional data systems: The data warehouse and data lake**\n", +    "\n", +    "With the advent of big data, companies began collecting large amounts of data from many different sources, such as weblogs, sensor data and images. Data warehouses — which have a long history as the foundation for decision support and business intelligence applications — cannot handle large volumes of data.\n", +    "\n", +    "While data warehouses are great for structured data and historical analysis, they weren’t designed for unstructured data, semi-structured data, and data with high variety, velocity and volume, making them unsuitable for many types of data.\n", +    "\n", +    "This led to the introduction of data lakes, providing a single repository of raw data in a variety of formats. 
While suitable for storing big data, data lakes do not support transactions, nor do they enforce data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads, or batch and streaming jobs, on the same data.\n", +    "\n", +    "For these reasons, many of the promises of data lakes never materialized and, in many cases, data lakes ended up reducing the benefits of data warehouses.\n", +    "\n", +    "As companies discovered new use cases for data exploration, predictive modeling and prescriptive analytics, the need for a single, flexible, high-performance system only grew. Data teams require systems for diverse data applications including SQL analytics, real-time analytics, data science and machine learning.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "To solve for new use cases and new users, a common approach is to use multiple systems — a data lake, several data warehouses and other specialized systems such as streaming, time-series, graph and image databases. But having multiple systems introduces complexity and delay, as data teams invariably need to move or copy data between different systems, effectively losing oversight and governance over data usage.\n", +    "\n", +    "You have now duplicated data in two different systems and the changes you make in one system are unlikely to find their way to the other. So, you are going to have data drift almost immediately, not to mention paying to store the same data multiple times.\n", +    "\n", +    "Then, because governance is happening at two distinct levels across these platforms, you are not able to control things consistently.\n", +    "\n", +    "**Challenges with data, analytics and AI**\n", +    "\n", +    "In a recent [Accenture](https://www.accenture.com/_acnmedia/pdf-108/accenture-closing-data-value-gap-fixed.pdf) study, only 32% of companies reported tangible and measurable value from data. The challenge is that most companies continue to implement two different platforms: data warehouses for BI and data lakes for AI. These platforms are incompatible with each other, but data from both systems is generally needed to deliver game-changing outcomes, which makes success with AI extremely difficult.\n", +    "\n", +    "Today, most of the data is landing in the data lake, and a lot of it is unstructured. In fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321), about 80% of the data in any organization will be unstructured by 2025. But this data is where much of the value for AI resides. Subsets of the data are then copied to the data warehouse into structured tables, and back again in some cases.\n", +    "\n", +    "You also must secure and govern the data in both systems: warehouses offer fine-grained governance, while lakes tend to be coarser grained, at the file level. Then, you stand up different stacks of tools on these platforms to do either BI or AI.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "Finally, the tool stacks on top of these platforms are fundamentally different, which makes it difficult to get any kind of collaboration going between the teams that support them.\n", +    "\n", +    "This is why AI efforts fail. There is a tremendous amount of complexity and rework being introduced into the system. 
Time and resources are being wasted trying to get the right data to the right people, and everything is happening too slowly to get in front of the competition.\n", +    "\n", +    "_Figure: Realizing this requires two disparate, incompatible data platforms. A data warehouse (structured tables, with governance and security via table ACLs) serves business intelligence and SQL analytics, while a data lake (unstructured files such as logs, text, images and video, with governance and security on files and blobs) serves data science, ML and data streaming. The result is incomplete support for use cases, incompatible security and governance models, copies of subsets of data, and disjointed, duplicative data silos._\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**Moving forward with a lakehouse architecture**\n", +    "\n", +    "To satisfy the need to support AI and BI directly on vast amounts of data stored in data lakes (on low-cost cloud storage), a new data management architecture emerged independently across many organizations and use cases: the data lakehouse.\n", +    "\n", +    "The data lakehouse can store _all_ and _any_ type of data once in a data lake and make that data accessible directly for AI and BI. The lakehouse paradigm has specific capabilities to efficiently allow both AI and BI on all the enterprise’s data at a massive scale. Namely, it has the SQL and performance capabilities such as indexing, caching and MPP processing to make BI work fast on data lakes. It also has direct file access and direct native support for Python, data science and AI frameworks without the need for a separate data warehouse.\n", +    "\n", +    "In short, a lakehouse is a data architecture that combines the best elements of data warehouses and data lakes. Lakehouses are enabled by a new system design, which implements similar data structures and data management features found in a data warehouse directly on the low-cost storage used for data lakes.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "##### Data lakehouse\n", +    "\n", +    "One platform to unify all your data, analytics and AI workloads\n", +    "\n", +    "_Figure: The Lakehouse Platform supports all machine learning, SQL, BI and streaming use cases with one security and governance approach for all data assets on all clouds._\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**Key features for a lakehouse**\n", +    "\n", +    "Recent innovations with the data lakehouse architecture can help simplify your data and AI workloads, ease collaboration for data teams, and maintain the kind of flexibility and openness that allows your organization to stay agile as you scale. Here are key features to consider when evaluating data lakehouse architectures:\n", +    "\n", +    "Transaction support: In an enterprise lakehouse, many data pipelines will often be reading and writing data concurrently. 
Support for ACID (Atomicity,\n", + "Consistency, Isolation and Durability) transactions ensures consistency as\n", + "multiple parties concurrently read or write data.\n", + "\n", + "Schema enforcement and governance: The lakehouse should have\n", + "a way to support schema enforcement and evolution, supporting data\n", + "warehouse schema paradigms such as star/snowflake. The system should\n", + "be able to reason about data integrity, and it should have robust governance\n", + "and auditing mechanisms.\n", + "\n", + "Data governance: Capabilities including auditing, retention and lineage\n", + "have become essential, particularly considering recent privacy regulations.\n", + "\n", + "Tools that allow data discovery have become popular, such as data catalogs\n", + "and data usage metrics.\n", + "\n", + "BI support: Lakehouses allow the use of BI tools directly on the source\n", + "data. This reduces staleness and latency, improves recency and lowers cost\n", + "by not having to operationalize two copies of the data in both a data lake\n", + "and a warehouse.\n", + "\n", + "\n", + "Storage decoupled from compute: In practice, this means storage and\n", + "compute use separate clusters, thus these systems can scale to many more\n", + "concurrent users and larger data sizes. Some modern data warehouses also\n", + "have this property.\n", + "\n", + "Openness: The storage formats, such as Apache Parquet, are open and\n", + "standardized, so a variety of tools and engines, including machine learning\n", + "and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "Support for diverse data types (unstructured and structured):\n", + "The lakehouse can be used to store, refine, analyze and access data types\n", + "needed for many new data applications, including images, video, audio,\n", + "semi-structured data and text.\n", + "\n", + "Support for diverse workloads: Use the same data repository for a range\n", + "of workloads including data science, machine learning and SQL analytics.\n", + "Multiple tools might be needed to support all these workloads.\n", + "\n", + "End-to-end streaming: Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "**Learn more**\n", + "\n", + "**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "\n", + "**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "\n", + "**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### The Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Lakehouse: A new generation of open platforms\n", + "\n", + "\n", + "###### This is the lakehouse paradigm\n", + "\n", + "\n", + "Databricks is the inventor and pioneer of the\n", + "data lakehouse architecture. 
The term “data lakehouse” was coined in the research paper, [Lakehouse: A New Generation of Open Platforms That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf), introduced by Databricks’ founders together with researchers from UC Berkeley and Stanford University at the 11th Conference on Innovative Data Systems Research (CIDR) in 2021.\n", +    "\n", +    "At Databricks, we are continuously innovating on the lakehouse architecture to help customers deliver on their data, analytics and AI aspirations. The ideal data, analytics and AI platform needs to operate differently. Rather than copying and transforming data in multiple systems, you need one platform that accommodates all data types.\n", +    "\n", +    "_Figure: The lakehouse paradigm: persona-based use cases (business intelligence, SQL analytics, data science and ML, data streaming) run on one platform that supports all ML, SQL, BI and streaming use cases, with one security and governance approach for all data assets on all clouds and a reliable data platform to efficiently handle all data types. Unity Catalog provides fine-grained governance for data and AI; Delta Lake provides data reliability and performance over files, blobs and table ACLs._\n", +    "\n", +    "Ideally, the platform must be open, so that you are not locked into any walled gardens. You would also have one security and governance model. It would not only manage all data types, but it would also be cloud-agnostic to govern data wherever it is stored.\n", +    "\n", +    "Last, it would support all major data, analytics and AI workloads, so that your teams can easily collaborate and get access to all the data they need to innovate.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "#### What is the Databricks Lakehouse Platform?\n", +    "\n", +    "The Databricks Lakehouse Platform unifies your data warehousing and AI use cases on a single platform. It combines the best elements of data lakes and data warehouses to deliver the reliability, strong governance and performance of data warehouses with the openness, flexibility and machine learning support of data lakes.\n", +    "\n", +    "This unified approach simplifies your modern data stack by eliminating the data silos that traditionally separate and complicate data engineering, analytics, BI, data science and machine learning. It’s built on open source and open standards to maximize flexibility. And, its common approach to data management, security and governance helps you operate more efficiently and innovate faster.\n", +    "\n", +    "_Figure: The Lakehouse Platform spans data warehousing, data engineering, data streaming, and data science and ML._\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "#### Benefits of the Databricks Lakehouse Platform\n", +    "\n", +    "**Simple**\n", +    "\n", +    "The unified approach simplifies your data architecture by eliminating the data silos that traditionally separate analytics, BI, data science and machine learning. 
With a lakehouse, you\n", + "can eliminate the complexity and expense that\n", + "make it hard to achieve the full potential of\n", + "your analytics and AI initiatives.\n", + "\n", + "\n", + "**Open**\n", + "\n", + "Delta Lake forms the open foundation of\n", + "the lakehouse by providing reliability and\n", + "performance directly on data in the data\n", + "lake. You’re able to avoid proprietary walled\n", + "gardens, easily share data and build your\n", + "modern data stack with unrestricted access\n", + "to the ecosystem of open source data projects\n", + "and the broad Databricks partner network.\n", + "\n", + "\n", + "**Multicloud**\n", + "\n", + "The Databricks Lakehouse Platform offers\n", + "you a consistent management, security and\n", + "governance experience across all clouds. You\n", + "do not need to invest in reinventing processes\n", + "for every cloud platform that you are using to\n", + "support your data and AI efforts. Instead, your\n", + "data teams can simply focus on putting all\n", + "your data to work to discover new insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The Databricks Lakehouse Platform architecture\n", + "\n", + "**Data reliability and performance for lakehouse**\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format storage layer built for the lakehouse that integrates\n", + "with all major analytics tools and works with the widest variety of formats to\n", + "store and process data.\n", + "\n", + "\n", + "**Instant compute and serverless**\n", + "\n", + "Serverless compute is a fully managed service where Databricks provisions and\n", + "manages the compute layer on behalf of the customer in the Databricks cloud\n", + "account instead of the customer account. As of the current release, serverless\n", + "compute is supported for use with Databricks SQL.\n", + "\n", + "In Chapter 6, we explore the details of instant compute and serverless for lakehouse.\n", + "\n", + "\n", + "[Photon](https://databricks.com/product/photon) is the next-generation query engine built for the lakehouse that leverages\n", + "a state-of-the-art vectorized engine for fast querying and provides the best\n", + "performance for all workloads in the lakehouse.\n", + "\n", + "In Chapter 3, we explore the details of data reliability and performance\n", + "\n", + "for the lakehouse.\n", + "\n", + "**Unified governance and security for lakehouse**\n", + "\n", + "The Databricks Lakehouse Platform provides unified governance with enterprise\n", + "scale, security and compliance. 
The [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (UC) provides governance for your data and AI assets in the lakehouse — files, tables, dashboards, and machine learning models — giving you much better control, management and security across clouds.\n", +    "\n", +    "[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share data across the organization in real time, independent of the platform on which the data resides.\n", +    "\n", +    "In Chapter 4, we go into the details of unified governance for lakehouse and, in Chapter 5, we dive into the details of security for lakehouse.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "#### The Databricks Lakehouse Platform workloads\n", +    "\n", +    "The Databricks Lakehouse Platform architecture supports different workloads such as data warehousing, data engineering, data streaming, data science and machine learning on one simple, open and multicloud data platform.\n", +    "\n", +    "**Data warehousing**\n", +    "\n", +    "Data warehousing is one of the most business-critical workloads for data teams, and the best data warehouse is a lakehouse. The Databricks Lakehouse Platform lets you run all your SQL and BI applications at scale with up to 12x better price/performance, a unified governance model, open formats and APIs, and your tools of choice — no lock-in. Reduce resource management overhead with serverless compute, and easily ingest, transform and query all your data in-place to deliver real-time business insights faster.\n", +    "\n", +    "Built on open standards and APIs, the Databricks Lakehouse Platform provides the reliability, quality and performance that data lakes natively lack, plus integrations with the ecosystem for maximum flexibility.\n", +    "\n", +    "In Chapter 7, we go into the details of data warehousing on the lakehouse.\n", +    "\n", +    "**Data engineering**\n", +    "\n", +    "Data engineering on the lakehouse allows data teams to unify batch and streaming operations on a simplified architecture, streamline data pipeline development and testing, build reliable data, analytics and AI workflows on any cloud platform, and meet regulatory requirements to maintain governance. The lakehouse provides an end-to-end data engineering and ETL platform that automates the complexity of building and maintaining pipelines and running ETL workloads so data engineers and analysts can focus on quality and reliability to drive valuable insights.\n", +    "\n", +    "In Chapter 8, we go into the details of data engineering on the lakehouse.\n", +    "\n", +    "**Data streaming**\n", +    "\n", +    "[Data streaming](https://www.databricks.com/product/data-streaming) is one of the fastest growing workloads within the Databricks Lakehouse Platform and is the future of all data processing. Real-time processing provides the freshest possible data to an organization’s analytics and machine learning models, enabling better, faster decisions, more accurate predictions, improved customer experiences and more.\n", +    "\n", +    "The Databricks Lakehouse Platform dramatically simplifies data streaming to deliver real-time analytics, machine learning and applications on one platform.\n", +    "\n", +    "In Chapter 9, we go into the details of data streaming on the lakehouse.\n", +    "\n", +    "**Data science and machine learning**\n", +    "\n", +    "Data science and machine learning (DSML) on the lakehouse is a powerful workload that sets the lakehouse apart from many other data offerings. DSML on the lakehouse provides a data-native and collaborative solution for the full ML lifecycle. It can maximize data and ML team productivity, streamline collaboration, empower ML teams to prepare, process and manage data in a self-service manner, and standardize the ML lifecycle from experimentation to production.\n", +    "\n", +    "In Chapter 10, we go into the details of DSML on the lakehouse.\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**Databricks Lakehouse Platform and your modern data stack**\n", +    "\n", +    "The Databricks Lakehouse Platform is open and provides the flexibility to continue using existing infrastructure, to easily share data and build your modern data stack with unrestricted access to the ecosystem of open source data projects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect).\n", +    "\n", +    "In Chapter 11, we go into the details of our technology partners and the modern data stack.\n", +    "\n", +    "#### Global adoption of the Databricks Lakehouse Platform\n", +    "\n", +    "Today, Databricks has over 7,000 [customers](https://databricks.com/customers), from Fortune 500 to unicorns across industries doing transformational work. Organizations around the globe are driving change and delivering a new generation of data, analytics and AI applications. We believe that the unfulfilled promise of data and AI can finally be fulfilled with one platform for data analytics, data science and machine learning with the Databricks Lakehouse Platform.\n", +    "\n", +    "**Learn more**\n", +    "\n", +    "[Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)\n", +    "\n", +    "[Databricks Lakehouse Platform Demo Hub](https://databricks.com/discover/demos)\n", +    "\n", +    "[Databricks Lakehouse Platform Customer Stories](https://databricks.com/customers)\n", +    "\n", +    "[Databricks Lakehouse Platform Documentation](https://databricks.com/documentation)\n", +    "\n", +    "[Databricks Lakehouse Platform Training and Certification](https://databricks.com/learn/training/home)\n", +    "\n", +    "[Databricks Lakehouse Platform Resources](https://databricks.com/resources)\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "**CHAPTER**\n", +    "\n", +    "# 03\n", +    "### Data reliability and performance\n", +    "\n", +    "To bring openness, reliability and lifecycle management to data lakes, the Databricks Lakehouse Platform is built on the foundation of Delta Lake. 
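\n", +    "\n", +    "What follows is a minimal, illustrative PySpark sketch of the Delta Lake developer experience described in this chapter (an ACID write, an in-place update and a time travel query). It is not taken from the eBook: the `main.demo.customers` table name is a hypothetical placeholder, and the `spark` session is assumed to be the one predefined in Databricks notebooks.\n", +    "\n", +    "```python\n", +    "# Illustrative sketch only: assumes a Databricks notebook where `spark` is predefined\n", +    "# and a Unity Catalog schema named main.demo already exists.\n", +    "from delta.tables import DeltaTable\n", +    "from pyspark.sql import functions as F\n", +    "\n", +    "# Create a small Delta table (Delta is the default table format on Databricks).\n", +    "df = spark.createDataFrame(\n", +    "    [(1, 'active'), (2, 'churned')],\n", +    "    'customer_id INT, status STRING',\n", +    ")\n", +    "df.write.format('delta').mode('overwrite').saveAsTable('main.demo.customers')\n", +    "\n", +    "# ACID update: readers see either the old or the new state, never a partial write.\n", +    "DeltaTable.forName(spark, 'main.demo.customers').update(\n", +    "    condition=F.col('customer_id') == 2,\n", +    "    set={'status': F.lit('active')},\n", +    ")\n", +    "\n", +    "# Time travel: inspect the transaction log and query an earlier version.\n", +    "spark.sql('DESCRIBE HISTORY main.demo.customers').show(truncate=False)\n", +    "spark.sql('SELECT * FROM main.demo.customers VERSION AS OF 0').show()\n", +    "```\n", +    "\n", +    "Converting an existing Parquet table in place is similarly small (a single `CONVERT TO DELTA` statement), which is why the switch described below is quick.\n", +    "\n", +    "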
Delta Lake solves challenges around unstructured/structured data\n", + "ingestion, the application of data quality, difficulties with deleting data for\n", + "compliance or issues with modifying data for data capture.\n", + "\n", + "Although data lakes are great solutions for holding large quantities of raw\n", + "data, they lack important attributes for data reliability and quality and\n", + "often don’t offer good performance when compared to data warehouses.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Problems with today’s data lakes\n", + "\n", + "When it comes to data reliability and quality, examples of these\n", + "missing attributes include:\n", + "\n", + "**•** **Lack of ACID transactions:** Makes it impossible to mix updates,\n", + "appends and reads\n", + "\n", + "**•** **Lack of schema enforcement:** Creates inconsistent and low-quality data.\n", + "For example, rejecting writes that don’t match a table’s schema.\n", + "\n", + "**•** **Lack of integration with data catalog:** Results in dark data and no single\n", + "source of truth\n", + "\n", + "Even just the absence of these three attributes can cause a lot of extra work\n", + "for data engineers as they strive to ensure consistent high-quality data in the\n", + "pipelines they create.\n", + "\n", + "\n", + "These challenges are solved with two key technologies that are at the foundation\n", + "of the lakehouse: Delta Lake and Photon.\n", + "\n", + "**What is Delta Lake?**\n", + "\n", + "Delta Lake is a file-based, open source storage format that provides ACID\n", + "transactions and scalable metadata handling, and unifies streaming and batch\n", + "data processing. It runs on top of existing data lakes and is compatible with\n", + "Apache Spark™ and other processing engines.\n", + "\n", + "Delta Lake uses Delta Tables which are based on Apache Parquet, a commonly\n", + "used format for structured data already utilized by many organizations. Therefore,\n", + "switching existing Parquet tables to Delta Tables is easy and quick. Delta\n", + "Tables can also be used with semi-structured and unstructured data, providing\n", + "versioning, reliability, metadata management, and time travel capabilities that\n", + "make these types of data easily managed as well.\n", + "\n", + "\n", + "As for performance, data lakes use object storage, so data is mostly kept in\n", + "immutable files leading to the following problems:\n", + "\n", + "**•** **Ineffective partitioning:** In many cases, data engineers resort to “poor man’s”\n", + "indexing practices in the form of partitioning that leads to hundreds of dev hours\n", + "spent tuning file sizes to improve read/write performance. Often, partitioning\n", + "proves to be ineffective over time if the wrong field was selected for partitioning\n", + "or due to high cardinality columns.\n", + "\n", + "**•** **Too many small files:** With no support for transactions, appending new data\n", + "takes the form of adding more and more files, leading to “small file problems,”\n", + "a known root cause of query performance degradation.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake features**\n", + "\n", + "\n", + "**ACID guarantees**\n", + "\n", + "Delta Lake ensures that all data changes\n", + "written to storage are committed for durability\n", + "and made visible to readers atomically. 
In other\n", + "words, no more partial or corrupted files.\n", + "\n", + "**Scalable data and metadata handling**\n", + "\n", + "Since Delta Lake is built on data lakes, all reads\n", + "and writes using Spark or other distributed\n", + "processing engines are inherently scalable to\n", + "petabyte-scale. However, unlike most other\n", + "storage formats and query engines, Delta Lake\n", + "leverages Spark to scale out all the metadata\n", + "processing, thus efficiently handling metadata\n", + "of billions of files for petabyte-scale tables.\n", + "\n", + "\n", + "**Audit history and time travel**\n", + "\n", + "The Delta Lake transaction log records details\n", + "about every change made to data, providing a full\n", + "audit trail of the changes. These data snapshots\n", + "allow developers to access and revert to earlier\n", + "versions of data for audits, rollbacks or to\n", + "reproduce experiments.\n", + "\n", + "**Schema enforcement and schema evolution**\n", + "\n", + "Delta Lake automatically prevents the insertion of\n", + "data with an incorrect schema, i.e., not matching\n", + "the table schema. And when needed, it allows the\n", + "table schema to be explicitly and safely evolved to\n", + "accommodate ever-changing data.\n", + "\n", + "\n", + "**Support for deletes, updates and merges**\n", + "\n", + "Most distributed processing frameworks do not\n", + "support atomic data modification operations on\n", + "data lakes. Delta Lake supports merge, update\n", + "and delete operations to enable complex use\n", + "cases including but not limited to change data\n", + "capture (CDC), slowly changing dimension (SCD)\n", + "operations and streaming upserts.\n", + "\n", + "**Streaming and batch unification**\n", + "\n", + "A Delta Lake table can work both in batch\n", + "and as a streaming source and sink. The\n", + "ability to work across a wide variety of latencies,\n", + "ranging from streaming data ingestion to batch\n", + "historic backfill, to interactive queries all work\n", + "out of the box.\n", + "\n", + "\n", + "-----\n", + "\n", + "**The Delta Lake transaction log**\n", + "\n", + "A key to understanding how Delta Lake provides all these capabilities is the\n", + "transaction log. The Delta Lake transaction log is the common thread that runs\n", + "through many of Delta Lake’s most notable features, including ACID transactions,\n", + "scalable metadata handling, time travel and more. The Delta Lake transaction log\n", + "is an ordered record of every transaction that has ever been performed on\n", + "a Delta Lake table since its inception.\n", + "\n", + "Delta Lake is built on top of Spark to allow multiple readers and writers of a\n", + "given table to work on a table at the same time. To always show users correct\n", + "views of the data, the transaction log serves as a single source of truth: the\n", + "central repository that tracks all changes that users make to the table.\n", + "\n", + "When a user reads a Delta Lake table for the first time or runs a new query on\n", + "an open table that has been modified since the last time it was read, Spark\n", + "checks the transaction log to see what new transactions are posted to the table.\n", + "Then, Spark updates the table with those recent changes. 
This ensures that a\n", + "user’s version of a table is always synchronized with the master record as of the\n", + "most recent query, and that users cannot make divergent, conflicting changes\n", + "to a table.\n", + "\n", + "\n", + "**Flexibility and broad industry support**\n", + "\n", + "Delta Lake is an open source project, with an engaged community of\n", + "contributors building and growing the Delta Lake ecosystem atop a set of open\n", + "APIs and is part of the Linux Foundation. With the growing adoption of Delta Lake\n", + "as an open storage standard in different environments and use cases, comes a\n", + "broad set of integration with industry-leading tools, technologies and formats.\n", + "\n", + "Organizations leveraging Delta Lake on the Databricks Lakehouse Platform gain\n", + "flexibility in how they ingest, store and query data. They are not limited in storing\n", + "data in a single cloud provider and can implement a true multicloud approach to\n", + "data storage.\n", + "\n", + "Connectors to tools, such as Fivetran, allow you to leverage Databricks’\n", + "ecosystem of partner solutions, so organizations have full control of building the\n", + "right ingestion pipelines for their use cases. Finally, consuming data via queries\n", + "for exploration or business intelligence (BI) is also flexible and open.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake integrates with all major analytics tools**\n", + "\n", + "Eliminates unnecessary data movement and duplication\n", + "\n", + "\n", + "-----\n", + "\n", + "In addition to a wide ecosystem of tools and technologies, Delta Lake supports\n", + "a broad set of data formats for structured, semi-structured and unstructured\n", + "data. These formats include image binary data that can be stored in Delta\n", + "Tables, graph data format, geospatial data types and key-value stores.\n", + "\n", + "**Learn more**\n", + "\n", + "[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "[Documentation](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n", + "\n", + "\n", + "**What is Photon?**\n", + "\n", + "As many organizations standardize on the lakehouse paradigm, this new\n", + "architecture poses challenges with the underlying query execution engine\n", + "for accessing and processing structured and unstructured data. The execution\n", + "engine needs to provide the performance of a data warehouse and the scalability\n", + "of data lakes.\n", + "\n", + "Photon is the next-generation query engine on the Databricks Lakehouse\n", + "Platform that provides dramatic infrastructure cost savings and speedups for\n", + "all use cases — from data ingestion, ETL, streaming, data science and interactive\n", + "queries — directly on your data lake. Photon is compatible with Spark APIs and\n", + "implements a more general execution framework that allows efficient processing\n", + "of data with support of the Spark API. This means getting started is as easy as\n", + "turning it on — no code change and no lock-in. 
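\n", +    "\n", +    "As a concrete illustration of “turning it on,” the sketch below creates a Photon-enabled cluster through the Databricks Clusters REST API from Python. This is not from the eBook: the host, token and node type are placeholders, and the endpoint and field names (for example `runtime_engine`) should be checked against the current API documentation rather than taken as authoritative.\n", +    "\n", +    "```python\n", +    "# Illustrative sketch only: HOST, TOKEN and node_type_id are placeholders for your workspace.\n", +    "import os\n", +    "import requests\n", +    "\n", +    "HOST = os.environ['DATABRICKS_HOST']    # e.g. https://<workspace>.cloud.databricks.com\n", +    "TOKEN = os.environ['DATABRICKS_TOKEN']\n", +    "\n", +    "payload = {\n", +    "    'cluster_name': 'photon-demo',\n", +    "    'spark_version': '14.3.x-scala2.12',  # a Databricks Runtime version string\n", +    "    'node_type_id': 'i3.xlarge',          # any node type available in your workspace\n", +    "    'num_workers': 2,\n", +    "    'runtime_engine': 'PHOTON',           # 'STANDARD' would run the same workload without Photon\n", +    "}\n", +    "\n", +    "resp = requests.post(\n", +    "    f'{HOST}/api/2.0/clusters/create',\n", +    "    headers={'Authorization': f'Bearer {TOKEN}'},\n", +    "    json=payload,\n", +    ")\n", +    "resp.raise_for_status()\n", +    "print(resp.json())  # returns the new cluster_id\n", +    "```\n", +    "\n", +    "Existing notebooks and jobs can then run unchanged on that cluster, and Databricks SQL warehouses generally run Photon out of the box.\n", +    "\n", +    "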
With Photon, typical customers are seeing up to 80% TCO savings over traditional Databricks Runtime (Spark) and up to 85% reduction in VM compute hours.\n", +    "\n", +    "_Figure: The Photon engine accepts both Spark instructions and Photon instructions, reads Delta/Parquet data and writes back to Delta/Parquet through the Photon writer._\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "Why process queries with Photon?\n", +    "\n", +    "Query performance on Databricks has steadily increased over the years, powered by Spark and thousands of optimizations packaged as part of the Databricks Runtime (DBR). Photon provides an additional 2x speedup per the TPC-DS 1TB benchmark compared to the latest DBR versions.\n", +    "\n", +    "_Chart: Relative speedup to DBR 2.1 by DBR version and release date (TPC-DS 1TB, 10 x i3.xl); higher is better._\n", +    "\n", +    "**Customers have observed significant speedups using Photon on workloads such as:**\n", +    "\n", +    "- **SQL-based jobs:** Accelerate large-scale production jobs on SQL and Spark DataFrames\n", +    "\n", +    "- **IoT use cases:** Faster time-series analysis using Photon compared to Spark and traditional Databricks Runtime\n", +    "\n", +    "- **Data privacy and compliance:** Query petabyte-scale data sets to identify and delete records without duplicating data with Delta Lake, production jobs and Photon\n", +    "\n", +    "- **Loading data into Delta and Parquet:** Vectorized I/O speeds up data loads for Delta and Parquet tables, lowering overall runtime and costs of data engineering jobs\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "Best price/performance for analytics in the cloud\n", +    "\n", +    "Written from the ground up in C++, Photon takes advantage of modern hardware for faster queries, providing up to 12x better price/performance compared to other cloud data warehouses — all natively on your data lake.\n", +    "\n", +    "_Chart: 100TB TPC-DS price/performance by system (lower is better), comparing Databricks SQL (spot and on-demand) with three other cloud data warehouses._\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "Works with your existing code and avoids vendor lock-in\n", +    "\n", +    "Photon is designed to be compatible with the Apache Spark DataFrame and SQL APIs to ensure workloads run seamlessly without code changes. All you do is turn it on. Photon will seamlessly coordinate work and resources and transparently accelerate portions of your SQL and Spark queries. No tuning or user intervention required.\n", +    "\n", +    "_Figure: Photon in the Databricks Lakehouse Platform: lifecycle of a Photon query. A client submits SQL; the Spark driver (JVM) handles parsing, Catalyst analysis/planning/optimization and scheduling; tasks then execute on Spark executors running mixed JVM/native code._\n", +    "\n", +    "\n", +    "-----\n", +    "\n", +    "Optimizing for all data use cases and workloads\n", +    "\n", +    "Photon is the first purpose-built lakehouse engine designed to accelerate all data and analytics workloads: data ingestion, ETL, streaming, data science, and interactive queries. 
While we started\n", + "Photon primarily focused on SQL to provide\n", + "customers with world-class data warehousing\n", + "performance on their data lakes, we’ve significantly\n", + "increased the scope of ingestion sources, formats,\n", + "APIs and methods supported by Photon since\n", + "then. As a result, customers have seen dramatic\n", + "infrastructure cost savings and speedups on\n", + "Photon across all their modern Spark (e.g., Spark\n", + "SQL and DataFrame) workloads.\n", + "\n", + "\n", + "Query optimizer\n", + "\n", + "Native execution engine\n", + "\n", + "Caching\n", + "\n", + "\n", + "_Accelerating all workloads on the lakehouse_\n", + "\n", + "**Learn more**\n", + "\n", + "[Announcing Photon Public Preview: The Next-Generation](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n", + "[Query Engine on the Databricks Lakehouse Platform](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n", + "\n", + "[Databricks Sets Official Data Warehousing Performance Record](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 04\n", + "\n", + "\n", + "### Unified governance and sharing for data, analytics and AI\n", + "\n", + "Today, more and more organizations recognize the importance of making\n", + "high-quality data readily available to data teams to drive actionable insights\n", + "and business value. At the same time, organizations also understand the risks\n", + "of data breaches which negatively impact brand value and inevitably lead to\n", + "erosion of customer trust. Governance is one of the most critical components\n", + "of a lakehouse data platform architecture; it helps ensure that data assets\n", + "are securely managed throughout the enterprise. However, many companies\n", + "are using different incompatible governance models leading to complex and\n", + "expensive solutions.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key challenges with data and AI governance\n", + "\n", + "**Diversity of data and AI assets**\n", + "\n", + "The increased use of data and the added complexity of the data landscape\n", + "have left organizations with a difficult time managing and governing all types\n", + "of their data-related assets. No longer is data stored in files or tables. Data\n", + "assets today take many forms, including dashboards, machine learning models\n", + "and unstructured data like video and images that legacy data governance\n", + "solutions simply are not built to govern and manage.\n", + "\n", + "\n", + "**Rising multicloud adoption**\n", + "\n", + "More and more organizations now leverage a multicloud strategy to optimize\n", + "costs, avoid vendor lock-in, and meet compliance and privacy regulations. With\n", + "nonstandard, cloud-specific governance models, data governance across clouds\n", + "is complex and requires familiarity with cloud-specific security and governance\n", + "concepts, such as identity and access management (IAM).\n", + "\n", + "**Disjointed tools for data governance on the lakehouse**\n", + "\n", + "Today, data teams must deal with a myriad of fragmented tools and services for\n", + "their data governance requirements, such as data discovery, cataloging, auditing,\n", + "sharing, access controls, etc. 
This inevitably leads to operational inefficiencies\n", + "and poor performance due to multiple integration points and network latency\n", + "between the services.\n", + "\n", + "\n", + "**Two disparate and incompatible data platforms**\n", + "\n", + "Organizations today use two different platforms for their data analytics and\n", + "AI efforts — data warehouses for BI and data lakes for AI. This results in data\n", + "replication across two platforms, presenting a major governance challenge.\n", + "With no unified view of the data landscape, it is difficult to see where data is\n", + "stored, who has access to what data, and consistently define and enforce data\n", + "access policies across the two platforms with different governance models.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### One security and governance approach\n", + "\n", + "Lakehouse systems provide a uniform way to manage access control, data\n", + "quality and compliance across all of an organization’s data using standard\n", + "interfaces similar to those in data warehouses by adding a management\n", + "interface on top of data lake storage.\n", + "\n", + "Modern lakehouse systems support fine-grained (row, column and view level)\n", + "access control via SQL, query auditing, attribute-based access control, data\n", + "versioning and data quality constraints and monitoring. These features are\n", + "generally provided using standard interfaces familiar to database administrators\n", + "(for example, SQL GRANT commands) to allow existing personnel to manage\n", + "all the data in an organization in a uniform way. Centralizing all the data in\n", + "a lakehouse system with a single management interface also reduces the\n", + "administrative burden and potential for error that comes with managing\n", + "multiple separate systems.\n", + "\n", + "\n", + "#### What is Unity Catalog?\n", + "\n", + "Unity Catalog is a unified governance solution for all data, analytics and AI\n", + "assets including files, tables, dashboards and machine learning models in your\n", + "lakehouse on any cloud. Unity Catalog simplifies governance by empowering\n", + "data teams with a common governance model based on ANSI-SQL to define\n", + "and enforce fine-grained access controls. With attribute-based access controls,\n", + "data administrators can enable fine-grained access controls on rows and\n", + "columns using tags (attributes). Built-in data search and discovery allows\n", + "data teams to quickly find and reference relevant data for any use case. 
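\n", +    "\n", +    "To make the ANSI SQL-based governance model described above concrete, here is a hedged sketch of granting access and adding a column-level control with a dynamic view, run from Python. The catalog, schema, table and group names are hypothetical placeholders, not part of the eBook, and the sketch assumes a Unity Catalog-enabled workspace where `spark` is predefined.\n", +    "\n", +    "```python\n", +    "# Illustrative sketch only: catalog/schema/table and group names are hypothetical.\n", +    "\n", +    "# Standard ANSI GRANTs, applied to account-level groups.\n", +    "spark.sql('GRANT USE CATALOG ON CATALOG main TO `data-consumers`')\n", +    "spark.sql('GRANT USE SCHEMA ON SCHEMA main.sales TO `data-consumers`')\n", +    "spark.sql('GRANT SELECT ON TABLE main.sales.orders TO `data-consumers`')\n", +    "\n", +    "# Column-level control via a dynamic view: only members of `pii-readers` see raw emails.\n", +    "spark.sql('''\n", +    "    CREATE OR REPLACE VIEW main.sales.orders_redacted AS\n", +    "    SELECT\n", +    "      order_id,\n", +    "      CASE WHEN is_account_group_member('pii-readers') THEN email\n", +    "           ELSE '***' END AS email\n", +    "    FROM main.sales.orders\n", +    "''')\n", +    "```\n", +    "\n", +    "Because the grants reference account-level groups, the same rules apply consistently in every workspace attached to the metastore.\n", +    "\n", +    "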
Unity\n", + "Catalog offers automated data lineage for all workloads in SQL, R, Scala and\n", + "Python, to build a better understanding of the data and its flow in the lakehouse.\n", + "Unity Catalog also allows data sharing across or within organizations and\n", + "seamless integrations with your existing data governance tools.\n", + "\n", + "With Unity Catalog, data teams can simplify governance for all data and AI\n", + "assets with one consistent model to discover, access and share data, giving\n", + "you much better native performance, management and security across clouds.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key benefits**\n", + "\n", + "\n", + "The common metadata layer for cross-workspace metadata is at the account\n", + "level and eases collaboration by allowing different workspaces to access Unity\n", + "Catalog metadata through a common interface and break down data silos.\n", + "Further, the data permissions in Unity Catalog are applied to account-level\n", + "identities, rather than identities that are local to a workspace, allowing\n", + "a consistent view of users and groups across all workspaces.\n", + "\n", + "\n", + "Catalog, secure and audit access to all data assets on any cloud\n", + "\n", + "Unity Catalog provides centralized metadata, enabling data teams to create\n", + "a single source of truth for all data assets ranging from files, tables, dashboards\n", + "to machine learning models in one place.\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog offers a unified data access layer that provides a simple and\n", + "streamlined way to define and connect to your data through managed tables,\n", + "external tables, or files, while managing their access controls. Unity Catalog\n", + "centralizes access controls for files, tables and views.\n", + "\n", + "It allows fine-grained access controls for restricting access to certain rows\n", + "and columns to the users and groups who are authorized to query them. With\n", + "Attribute-Based Access Controls (ABAC), you can control access to multiple\n", + "data items at once based on user and data attributes, further simplifying\n", + "governance at scale. For example, you will be able to tag multiple columns\n", + "as personally identifiable information (PII) and manage access to all columns\n", + "tagged as PII in a single rule.\n", + "\n", + "Today, organizations are dealing with an increased burden of regulatory\n", + "compliance, and data access auditing is a critical component to ensure your\n", + "organization is set up for success while meeting compliance requirements.\n", + "Unity Catalog also provides centralized fine-grained auditing by capturing an\n", + "audit log of operations such as create, read, update and delete (CRUD) that have\n", + "been performed against the data. 
This allows a fine-grained audit trail showing\n", + "who accessed a given data set and helps you meet your compliance and\n", + "business requirements.\n", + "\n", + "\n", + "-----\n", + "\n", + "Built-in data search and discovery\n", + "\n", + "Data discovery is a critical component to break\n", + "down data silos and democratize data across\n", + "your organization to make data-driven decisions.\n", + "Unity Catalog provides a rich user interface for\n", + "data search and discovery, enabling data teams to\n", + "quickly search relevant data assets across the data\n", + "landscape and reference them for all use cases —\n", + "BI, analytics and machine learning — accelerating\n", + "time-to-value and boosting productivity.\n", + "\n", + "\n", + "-----\n", + "\n", + "Automated data lineage for all workloads\n", + "\n", + "Data lineage describes the transformations and\n", + "refinements of data from source to insight. Lineage\n", + "includes capturing all the relevant metadata and\n", + "events associated with the data in its lifecycle,\n", + "including the source of the data set, what other\n", + "data sets were used to create it, who created it and\n", + "when, what transformations were performed, which\n", + "other data sets leverage it, and many other events\n", + "and attributes. Unity Catalog offers automated data\n", + "lineage down to table and column level, enabling\n", + "data teams to get an end-to-end view of where\n", + "data is coming from, what transformations were\n", + "performed on the data and how data is consumed\n", + "by end applications such as notebooks, workflows,\n", + "dashboards, machine learning models, etc.\n", + "\n", + "With automated data lineage for all workloads —\n", + "SQL, R, Python and Scala, data teams can quickly\n", + "identify and perform root cause analysis of any\n", + "errors in the data pipelines or end applications.\n", + "Second, data teams can perform impact analysis\n", + "to see dependencies of any data changes\n", + "on downstream consumers and notify them\n", + "about the potential impact. Finally, data lineage\n", + "also empowers data teams with increased\n", + "understanding of their data and reduces tribal\n", + "knowledge. Unity Catalog can also capture lineage\n", + "associated with non-data entities, such as notebooks,\n", + "workflows and dashboards. 
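+    "\n",
+    "Captured lineage can also be queried programmatically. A rough sketch, assuming the\n",
+    "`system.access.table_lineage` system table is enabled and using an illustrative target table:\n",
+    "\n",
+    "```python\n",
+    "from pyspark.sql.functions import col\n",
+    "\n",
+    "# Upstream entities that fed a given target table.\n",
+    "lineage = (\n",
+    "    spark.table('system.access.table_lineage')\n",
+    "         .filter(col('target_table_full_name') == 'main.sales.daily_revenue')\n",
+    "         .select('source_table_full_name', 'entity_type', 'event_time')\n",
+    ")\n",
+    "display(lineage)\n",
+    "```\n",
+    "\n",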
Lineage can be\n", + "\n", + "\n", + "_Data lineage with Unity Catalog_\n", + "\n", + "retrieved via REST APIs to support integrations\n", + "with other catalogs.\n", + "\n", + "Integrated with your existing tools\n", + "\n", + "\n", + "**Resources**\n", + "\n", + "[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n", + "\n", + "[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)\n", + "\n", + "\n", + "Unity Catalog helps you to future-proof your data\n", + "and AI governance with the flexibility to leverage\n", + "your existing data catalogs and governance\n", + "solutions — Collibra, Alation, Immuta, Privacera,\n", + "Microsoft Purview and AWS Lakeformation.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Open data sharing and collaboration\n", + "\n", + "Data sharing has become important in the digital\n", + "economy as enterprises wish to exchange data\n", + "easily and securely with their customers, partners,\n", + "suppliers and internal lines of business to better\n", + "collaborate and unlock value from that data. But\n", + "to date, a lack of standards-based data sharing\n", + "protocol has resulted in data sharing solutions\n", + "tied to a single vendor or commercial product,\n", + "introducing vendor lock-in risks. What the industry\n", + "deserves is an open approach to data sharing.\n", + "\n", + "**Why data sharing is hard**\n", + "\n", + "Data sharing has evolved from an optional feature\n", + "of a few data platforms to a business necessity\n", + "and success factor for organizations. Our solution\n", + "architects encounter daily the classic scenarios\n", + "of a retailer looking to publish sales data to their\n", + "suppliers in real time or a supplier that wants to\n", + "share real-time inventory.\n", + "\n", + "As a reminder, data sharing recently triggered\n", + "the most impressive scientific development that\n", + "humankind has ever seen. On January 5, 2021, the\n", + "first sample of the genome of the coronavirus was\n", + "\n", + "\n", + "uploaded to the internet. It wasn’t a lung biopsy\n", + "from a patient in Wuhan, but a shared digital\n", + "genomic data set that triggered the development\n", + "of the first batch of COVID vaccines worldwide.\n", + "\n", + "\n", + "treatments, tests and tracking mutations as they\n", + "are passed down through a lineage, a branch of\n", + "the coronavirus family tree. 
The above graphic\n", + "shows such a [publicly shared mutation data set](https://www.ncbi.nlm.nih.gov/genbank/) .\n", + "\n", + "\n", + "Since then, coronavirus experts have daily\n", + "exchanged public data sets, looking for better\n", + "\n", + "\n", + "-----\n", + "\n", + "Sharing data, as well as consuming data from\n", + "external sources, allows you to collaborate with\n", + "partners, establish new partnerships, enable\n", + "research and can generate new revenue streams\n", + "with data monetization.\n", + "\n", + "Despite those promising examples, existing data\n", + "sharing technologies come with several limitations:\n", + "\n", + "**•** Traditional data sharing technologies, such as\n", + "Secure File Transfer Protocol (SFTP), do not\n", + "scale well and only serve files offloaded to a\n", + "server\n", + "\n", + "**•** Cloud object stores operate on an object level\n", + "and are cloud-specific\n", + "\n", + "**•** Commercial data sharing offerings baked into\n", + "vendor products often share tables instead of\n", + "files, but scaling them is expensive and they\n", + "are not open and, therefore, do not permit data\n", + "sharing with a different platform\n", + "\n", + "The following table compares proprietary vendor\n", + "solutions with SFTP, cloud object stores and Delta\n", + "Sharing.\n", + "\n", + "\n", + "\n", + "|Col1|Proprietary vendor solutions|SFTP|Cloud object store|Delta Sharing|\n", + "|---|---|---|---|---|\n", + "|Secure|||||\n", + "|Cheap|||||\n", + "|Vendor agnostic|||||\n", + "|Multicloud|||||\n", + "|Open source|||||\n", + "|Table/DataFrame abstraction|||||\n", + "|Live data|||||\n", + "|Predicate pushdown|||||\n", + "|Object store bandwidth|||||\n", + "|Zero compute cost|||||\n", + "|Scalability|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "**Open source data sharing and Databricks**\n", + "\n", + "To address the limitations of existing data sharing solutions, Databricks developed\n", + "[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\n", + "to the Linux Foundation.\n", + "\n", + "An open source–based solution, such as Delta Sharing, eliminates the lock-in\n", + "of commercial solutions and brings a number of additional benefits such as\n", + "community-developed integrations with popular, open source data processing\n", + "frameworks. In addition, open protocols allow the easy integration of commercial\n", + "clients, such as BI tools.\n", + "\n", + "**What is Databricks Delta Sharing?**\n", + "\n", + "Databricks Delta Sharing provides an open solution to securely share live data\n", + "from your lakehouse to any computing platform. Recipients don’t have to be\n", + "on the Databricks platform or on the same cloud or a cloud at all. Data providers\n", + "can share live data, without replicating or moving it to another system. Recipients\n", + "benefit from always having access to the latest version of data and can quickly\n", + "query shared data using tools of their choice for BI, analytics and machine\n", + "learning, reducing time-to-value. 
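+    "\n",
+    "For instance, a recipient that is not on Databricks at all can read a shared table with the\n",
+    "open source `delta-sharing` Python client. A sketch, assuming a provider-issued profile file\n",
+    "and placeholder share, schema and table names:\n",
+    "\n",
+    "```python\n",
+    "import delta_sharing\n",
+    "\n",
+    "# Profile file sent by the data provider (placeholder path).\n",
+    "profile = '/path/to/config.share'\n",
+    "\n",
+    "# <share>.<schema>.<table> coordinates within that profile (placeholders).\n",
+    "table_url = profile + '#covid_share.public.mutations'\n",
+    "\n",
+    "df = delta_sharing.load_as_pandas(table_url)\n",
+    "print(df.head())\n",
+    "```\n",
+    "\n",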
Data providers can centrally manage, govern,\n", + "audit and track usage of the shared data on one platform.\n", + "\n", + "Unity Catalog natively supports [Delta Sharing](https://databricks.com/product/delta-sharing) , the world’s first open protocol\n", + "for data sharing, enabling organizations to share live, large-scale data without\n", + "replication and make data easily and quickly accessible from tools of your\n", + "choice, with enterprise-grade security.\n", + "\n", + "\n", + "**Key benefits**\n", + "\n", + "Open cross-platform sharing\n", + "\n", + "Easily share existing data in Delta Lake and Apache Parquet formats between\n", + "different vendors. Consumers don’t have to be on the Databricks platform, same\n", + "cloud or a cloud at all. Native integration with Power BI, Tableau, Spark, pandas\n", + "and Java allow recipients to consume shared data directly from the tools of their\n", + "choice. Delta Sharing eliminates the need to set up a new ingestion process to\n", + "consume data. Data recipients can directly access the fresh data and query it\n", + "using tools of their choice. Recipients can also enrich data with data sets from\n", + "popular data providers.\n", + "\n", + "Sharing live data without copying it\n", + "\n", + "Share live ready-to-query data, without replicating or moving it to another system.\n", + "Most enterprise data today is stored in cloud data lakes. Any of the existing data\n", + "sets on the provider’s data lake can easily be shared across clouds, regions or\n", + "data platforms without any data replication or physical movement of data. Data\n", + "providers can update their data sets reliably in real time and provide a fresh and\n", + "consistent view of their data to recipients.\n", + "\n", + "Centralized administration and governance\n", + "\n", + "You can centrally govern, track and audit access to the shared data from a single\n", + "point of enforcement to meet compliance requirements. Detailed user-access\n", + "audit logs are kept to know who is accessing the data and monitor usage of the\n", + "shared data down to table, partition and version level.\n", + "\n", + "\n", + "-----\n", + "\n", + "An open Marketplace for data solutions\n", + "\n", + "The demand for third-party data to make data-driven innovations is greater than ever,\n", + "\n", + "and data marketplaces act as a bridge between data providers and data consumers to\n", + "\n", + "help facilitate the discovery and distribution of data sets.\n", + "\n", + "Databricks Marketplace provides an open marketplace for exchanging data products\n", + "\n", + "such as data sets, notebooks, dashboards and machine learning models. To accelerate\n", + "\n", + "insights, data consumers can discover, evaluate and access more data products from\n", + "\n", + "third-party vendors than ever before. Providers can now commercialize new offerings\n", + "\n", + "and shorten sales cycles by providing value-added services on top of their data.\n", + "\n", + "Databricks Marketplace is powered by Delta Sharing, allowing consumers to access\n", + "\n", + "data products without having to be on the Databricks platform. 
This open approach\n", + "\n", + "allows data providers to broaden their addressable market without forcing consumers\n", + "\n", + "into vendor lock-in.\n", + "\n", + "_Databricks Marketplace_\n", + "\n", + "\n", + "Privacy-safe data cleanrooms\n", + "\n", + "Powered by open source Delta Sharing, the Databricks Lakehouse Platform provides\n", + "\n", + "a flexible data cleanroom solution allowing businesses to easily collaborate with their\n", + "\n", + "customers and partners on any cloud in a privacy-safe way. Participants in the data\n", + "\n", + "cleanrooms can share and join their existing data, and run complex workloads in any\n", + "\n", + "language — Python, R, SQL, Java and Scala — on the data while maintaining data\n", + "\n", + "privacy. Additionally, data cleanroom participants don’t have to do cost-intensive\n", + "\n", + "data replication across clouds or regions with other participants, which simplifies data\n", + "\n", + "operations and reduces cost.\n", + "\n", + "_Data cleanrooms with Databricks Lakehouse Platform_\n", + "\n", + "\n", + "-----\n", + "\n", + "**How it works**\n", + "\n", + "Delta Sharing is designed to be simple, scalable, non-proprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing\n", + "is natively integrated with Unity Catalog, which allows customers to add fine-grained governance and security controls, making it easy and safe to share data internally\n", + "or externally.\n", + "\n", + "Delta Sharing is a simple REST protocol that securely shares access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3,\n", + "Azure ADLS or Google’s GCS — to reliably transfer large data sets. Here’s how it works for data providers and data recipients.\n", + "\n", + "**Data provider** **Data recipient**\n", + "\n", + "Data science And many more On-premises\n", + "\n", + "The data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider\n", + "decides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages access for recipients. To manage\n", + "shares and recipients, you can use SQL commands or the Unity Catalog CLI or the intuitive user interface.\n", + "\n", + "The data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\n", + "Spark, Java and Python, and is working with partners on many more.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Delta Sharing data exchange follows three efficient steps:\n", + "\n", + "1. The recipient’s client authenticates to the sharing server and asks to query\n", + "a specific table. The client can also provide filters on the data (for example,\n", + "“country=US”) as a hint to read just a subset of the data.\n", + "\n", + "2. The server verifies whether the client is allowed to access the data, logs the\n", + "request, and then determines which data to send back. This will be a subset\n", + "of the data objects in cloud storage systems that make up the table.\n", + "\n", + "3. 
To transfer the data, the server generates short-lived presigned URLs that\n", + "allow the client to read these Parquet files directly from the cloud provider,\n", + "so that the transfer can happen in parallel at massive bandwidth, without\n", + "streaming through the sharing server.\n", + "\n", + "**Learn more**\n", + "\n", + "[Try Delta Sharing](https://databricks.com/product/delta-sharing)\n", + "\n", + "[Delta Sharing Demo](https://youtu.be/wRT1Vpbyy88)\n", + "\n", + "[Introducing Delta Sharing: An Open Protocol for Secure Data Sharing](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Introducing Data Cleanrooms for the Lakehouse](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Introducing Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Delta Sharing ODSC Webinar](https://www.youtube.com/watch?v=YrNHtaWlkM8)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 05\n", + "\n", + "\n", + "### Security\n", + "\n", + "Organizations that operate in multicloud environments need a unified, reliable\n", + "and consistent approach to secure data. We’ve learned from our customers that\n", + "a simple and unified approach to data security for the lakehouse is one of the\n", + "most critical requirements for modern data solutions. Databricks is trusted by\n", + "the world’s largest organizations to provide a powerful lakehouse platform with\n", + "high security and scalability. In fact, thousands of customers trust Databricks\n", + "with their most sensitive data to analyze and build data products using machine\n", + "learning (ML). With significant investment in building a highly secure and scalable\n", + "platform, Databricks delivers end-to-end platform security for data and users.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Platform architecture reduces risk\n", + "\n", + "The Databricks Lakehouse architecture is split into\n", + "two separate planes to simplify your permissions,\n", + "avoid data duplication and reduce risk. The control\n", + "plane is the management plane where Databricks\n", + "runs the workspace application and manages\n", + "notebooks, configuration and clusters. Unless you\n", + "choose to use [serverless compute](https://docs.databricks.com/serverless-compute/index.html) , the data plane\n", + "runs inside your cloud service provider account,\n", + "processing your data without taking it out of your\n", + "account. 
You can embed Databricks in your data exfiltration protection architecture using features\n",
+    "like customer-managed VPCs/VNets and admin console options that disable export.\n",
+    "\n",
+    "While certain data, such as your notebooks, configurations, logs, and user information, is\n",
+    "present within the control plane, that information is encrypted at rest, and communication\n",
+    "to and from the control plane is encrypted in transit.\n",
+    "\n",
+    "_Figure: Users and interactive clients connect to the control plane (web application,\n",
+    "configurations, notebooks/repos/DBSQL, cluster manager), while clusters, data and the\n",
+    "DBFS root run in the data plane in your cloud account._\n",
+    "\n",
+    "You also have choices for where certain data lives: You can host your own store of metadata\n",
+    "about your data tables (Hive metastore), or store query results in your cloud service\n",
+    "provider account and decide whether to use the [Databricks Secrets API.](https://docs.databricks.com/dev-tools/api/latest/secrets.html)\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "#### Step-by-step example\n",
+    "\n",
+    "_Figure: The six steps below, traced across the control plane (web application,\n",
+    "configurations, notebooks/repos/DBSQL, cluster manager) and the data plane (clusters and\n",
+    "DBFS root) in your cloud account._\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "Suppose you have a data engineer that signs in to Databricks and writes a notebook that\n",
+    "transforms raw data in Kafka to a normalized data set sent to storage such as Amazon S3\n",
+    "or Azure Data Lake Storage. Six steps make that happen:\n",
+    "\n",
+    "1. The data engineer seamlessly authenticates, via your single sign-on if desired, to the\n",
+    "Databricks web UI in the control plane, hosted in the Databricks account.\n",
+    "\n",
+    "2. As the data engineer writes code, their web browser sends it to the control plane.\n",
+    "JDBC/ODBC requests also follow the same path, authenticating with a token.\n",
+    "\n",
+    "3. When ready, the control plane uses Cloud Service Provider APIs to create a Databricks\n",
+    "cluster, made of new instances in the data plane, in your CSP account. Administrators can\n",
+    "apply cluster policies to enforce security profiles.\n",
+    "\n",
+    "4. Once the instances launch, the cluster manager sends the data engineer’s code to the cluster.\n",
+    "\n",
+    "5. The cluster pulls from Kafka in your account, transforms the data in your account and\n",
+    "writes it to a storage in your account.\n",
+    "\n",
+    "6. 
The cluster reports status and any outputs back to the cluster manager.\n", + "\n", + "The data engineer does not need to worry about many of the details —\n", + "simply write the code and Databricks runs it.\n", + "\n", + "\n", + "#### Network and server security\n", + "\n", + "Here is how Databricks interacts with your cloud service provider\n", + "account to manage network and server security\n", + "\n", + "**Networking**\n", + "\n", + "Regardless of where you choose to host the data plane, Databricks networking\n", + "is straightforward. If you host it yourself, Databricks by default will still configure\n", + "networking for you, but you can also control data plane networking with your\n", + "own managed VPC or VNet.\n", + "\n", + "The serverless data plane network infrastructure is managed by Databricks in\n", + "a Databricks cloud service provider account and shared among customers,\n", + "with additional network boundaries between workspaces and between clusters.\n", + "\n", + "Databricks does not rewrite or change your data structure in your storage, nor\n", + "does it change or modify any of your security and governance policies. Local\n", + "firewalls complement security groups and subnet firewall policies to block\n", + "unexpected inbound connections.\n", + "\n", + "Customers at the enterprise tier can also use the IP access list feature on\n", + "the control plane to limit which IP addresses can connect to the web UI or\n", + "REST API — for example, to allow only VPN or office IPs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Servers**\n", + "\n", + "In the data plane, Databricks clusters automatically run the latest hardened\n", + "system image. Users cannot choose older (less secure) images or code. For AWS\n", + "and Azure deployments, images are typically updated every two-to-four weeks.\n", + "GCP is responsible for its system image.\n", + "\n", + "Databricks runs scans for every release, including:\n", + "\n", + "**•** System image scanning for vulnerabilities\n", + "\n", + "**•** Container OS and library scanning\n", + "\n", + "\n", + "**Severity** **Remediation time**\n", + "\n", + "**Critical** **< 14 days**\n", + "\n", + "**High** **< 30 days**\n", + "\n", + "**Medium** **< 60 days**\n", + "\n", + "**Low** **When appropriate**\n", + "\n", + "\n", + "\n", + "**•** Static and dynamic code scanning\n", + "\n", + "**Databricks access**\n", + "\n", + "\n", + "Databricks code is peer reviewed by developers who have security training.\n", + "Significant design documents go through comprehensive security reviews.\n", + "Scans run fully authenticated, with all checks enabled, and issues are\n", + "tracked against the timeline shown in this table.\n", + "\n", + "Note that Databricks clusters are typically short-lived (often terminated\n", + "after a job completes) and do not persist data after they terminate. Clusters\n", + "typically share the same permission level (excluding high concurrency or\n", + "Databricks SQL clusters, where more robust security controls are in place).\n", + "Your code is launched in an unprivileged container to maintain system\n", + "stability. This security design provides protection against persistent attackers\n", + "and privilege escalation.\n", + "\n", + "\n", + "Databricks access to your environment is limited to cloud service provider APIs\n", + "for our automation and support access. Automated access allows the Databricks\n", + "control plane to configure resources in your environment using the cloud service\n", + "provider APIs. 
The specific APIs vary based on the cloud. For instance, an AWS cross-account IAM role,\n",
+    "Azure-owned automation or GKE automation does not grant access to your data sets\n",
+    "(see the next section).\n",
+    "\n",
+    "Databricks has a custom-built system that allows staff to fix issues or handle\n",
+    "support requests — for example, when you open a support request and check the\n",
+    "box authorizing access to your workspace. Access requires either a support ticket\n",
+    "or engineering ticket tied expressly to your workspace and is limited to a subset of\n",
+    "employees and for limited time periods. Additionally, if you have configured audit\n",
+    "log delivery, the audit logs show the initial access event and the staff’s actions.\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "**Identity and access**\n",
+    "\n",
+    "Databricks supports robust ACLs and SCIM. AWS customers can configure\n",
+    "SAML 2.0 and block non-SSO logins. Azure Databricks and Databricks on\n",
+    "GCP automatically integrate with Azure Active Directory or GCP identity.\n",
+    "\n",
+    "Databricks supports a variety of ways to enable users to access their data.\n",
+    "\n",
+    "**Examples include:**\n",
+    "\n",
+    "**•** The Table ACLs feature uses traditional SQL-based statements to\n",
+    "manage access to data and enable fine-grained view-based access\n",
+    "\n",
+    "**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\n",
+    "users of that cluster automatically access allowed resources without\n",
+    "explicit credentials\n",
+    "\n",
+    "**•** External storage can be mounted or accessed using a securely\n",
+    "stored access key\n",
+    "\n",
+    "**•** The Secrets API separates credentials from code when accessing\n",
+    "external resources\n",
+    "\n",
+    "\n",
+    "**Data security**\n",
+    "\n",
+    "Databricks provides encryption, isolation and auditing.\n",
+    "\n",
+    "**Databricks encryption capabilities are in place both at rest and in motion**\n",
+    "\n",
+    "**For data-at-rest encryption:**\n",
+    "\n",
+    "**•** Control plane is encrypted\n",
+    "\n",
+    "**•** Data plane supports local encryption\n",
+    "\n",
+    "**•** Customers can use encrypted storage buckets\n",
+    "\n",
+    "**•** Customers at some tiers can configure customer-managed keys for managed services\n",
+    "\n",
+    "**For data-in-motion encryption:**\n",
+    "\n",
+    "**•** Control plane <-> data plane is encrypted\n",
+    "\n",
+    "**•** Offers optional intra-cluster encryption\n",
+    "\n",
+    "**•** Customer code can be written to avoid unencrypted services (e.g., FTP)\n",
+    "\n",
+    "**Customers can isolate users at multiple levels:**\n",
+    "\n",
+    "**•** **Workspace level:** Each team or department can use a separate workspace\n",
+    "\n",
+    "**•** **Cluster level:** Cluster ACLs can restrict the users who can attach notebooks\n",
+    "to a given cluster\n",
+    "\n",
+    "**•** **High concurrency clusters:** Process isolation, JVM whitelisting and limited\n",
+    "languages (SQL, Python) allow for the safe coexistence of users of different\n",
+    "privilege levels, and is used with Table ACLs\n",
+    "\n",
+    "**•** **Single-user cluster:** Users can create a private dedicated cluster\n",
+    "\n",
+    "Activities of Databricks users are logged and can be delivered automatically to\n",
+    "a cloud storage bucket. 
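+    "\n",
+    "These audit events can be explored directly from a notebook. A rough sketch, assuming the\n",
+    "`system.access.audit` system table is enabled in the workspace:\n",
+    "\n",
+    "```python\n",
+    "from pyspark.sql.functions import col\n",
+    "\n",
+    "# Most recent Unity Catalog actions and who performed them.\n",
+    "audit = (\n",
+    "    spark.table('system.access.audit')\n",
+    "         .filter(col('service_name') == 'unityCatalog')\n",
+    "         .select('event_time', 'user_identity.email', 'action_name', 'request_params')\n",
+    "         .orderBy(col('event_time').desc())\n",
+    ")\n",
+    "display(audit.limit(20))\n",
+    "```\n",
+    "\n",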
Customers can also monitor provisioning activities by\n", + "monitoring cloud audit logs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Compliance**\n", + "\n", + "**Databricks supports the following compliance standards on**\n", + "\n", + "**our multi-tenant platform:**\n", + "\n", + "**•** **SOC 2 Type II**\n", + "\n", + "**•** **ISO 27001**\n", + "\n", + "**•** **ISO 27017**\n", + "\n", + "**•** **ISO 27018**\n", + "\n", + "Certain clouds support Databricks deployment options for FedRAMP\n", + "High, HITRUST, HIPAA and PCI. Databricks Inc. and the Databricks platform\n", + "are also GDPR and CCPA ready.\n", + "\n", + "**Learn more**\n", + "\n", + "To learn more about Databricks security,\n", + "visit the [Security and Trust Center](https://databricks.com/trust)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 06\n", + "\n", + "\n", + "### Instant compute and serverless\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Benefits of Databricks Serverless SQL\n", + "\n", + "Serverless SQL is much easier to administer with Databricks taking on the\n", + "responsibility of deploying, configuring and managing your cluster VMs. Databricks\n", + "can transfer compute capacity to user queries typically in about 15 seconds — so\n", + "you no longer need to wait for clusters to start up or scale out to run your queries.\n", + "\n", + "Serverless SQL also has built-in connectors to your favorite tools such as Tableau,\n", + "Power BI, Qlik, etc. These connectors use optimized JDBC/ODBC drivers for easy\n", + "authentication support and high performance. And finally, you save on cost\n", + "because you do not need to overprovision or pay for the idle capacity.\n", + "\n", + "\n", + "#### What is serverless compute?\n", + "\n", + "Serverless compute is a fully managed service where Databricks provisions\n", + "and manages the compute layer on behalf of the customer in the Databricks\n", + "cloud account instead of the customer account. As of the current release,\n", + "serverless compute is supported for use with Databricks SQL. This new\n", + "capability for Databricks SQL provides instant compute to users for their\n", + "BI and SQL workloads, with minimal management required and capacity\n", + "optimizations that can lower overall cost by 20%-40% on average. This\n", + "makes it even easier for organizations to expand adoption of the lakehouse\n", + "for business analysts who are looking to access the rich, real-time data sets\n", + "of the lakehouse with a simple and performant solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Inside Serverless SQL**\n", + "\n", + "\n", + "**Databricks Serverless SQL**\n", + "\n", + "**Managed servers**\n", + "\n", + "**Serverless SQL**\n", + "**compute**\n", + "\n", + "**Secure**\n", + "**Instant compute**\n", + "\n", + "\n", + "At the core of Serverless SQL is a compute\n", + "platform that operates a pool of servers located\n", + "in a Databricks’ account, running Kubernetes\n", + "containers that can be assigned to a user\n", + "within seconds.\n", + "\n", + "When many users are running reports or queries\n", + "at the same time, the compute platform adds more\n", + "servers to the cluster (again, within seconds) to\n", + "handle the concurrent load. 
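+    "\n",
+    "From a user's perspective, provisioning such an endpoint is a single call. A rough sketch\n",
+    "with the Databricks Python SDK; the name, size and options are illustrative and the exact\n",
+    "parameters may differ by SDK version:\n",
+    "\n",
+    "```python\n",
+    "from databricks.sdk import WorkspaceClient\n",
+    "\n",
+    "w = WorkspaceClient()\n",
+    "\n",
+    "# Request a serverless SQL warehouse and wait until it is running.\n",
+    "warehouse = w.warehouses.create(\n",
+    "    name='bi-serverless',\n",
+    "    cluster_size='Small',\n",
+    "    max_num_clusters=4,\n",
+    "    auto_stop_mins=10,\n",
+    "    enable_serverless_compute=True,\n",
+    ").result()\n",
+    "\n",
+    "print(warehouse.id, warehouse.state)\n",
+    "```\n",
+    "\n",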
Databricks manages\n", + "the entire configuration of the server and\n", + "automatically performs the patching and upgrades\n", + "as needed.\n", + "\n", + "Each server is running a secure configuration and\n", + "all processing is secured by three layers of isolation:\n", + "The Kubernetes container hosting the runtime; the\n", + "virtual machine (VM) hosting the container; and\n", + "the virtual network for the workspace. Each layer\n", + "is isolated to one workspace with no sharing or\n", + "cross-network traffic allowed. The containers use\n", + "hardened configurations, VMs are shut down and\n", + "not reused, and network traffic is restricted\n", + "to nodes in the same cluster.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Performance of Serverless SQL\n", + "\n", + "We ran a set of internal tests to compare\n", + "Databricks Serverless SQL to the current\n", + "Databricks SQL and several traditional cloud\n", + "data warehouses. We found Serverless SQL\n", + "to be the most cost-efficient and performant\n", + "environment to run SQL workloads when\n", + "considering cluster startup time, query\n", + "execution time and overall cost.\n", + "\n", + "\n", + "**Databricks Serverless SQL is the highest**\n", + "**performing and most cost-effective solution**\n", + "\n", + "**Cloud SQL solutions compared**\n", + "\n", + "\n", + "**Faster**\n", + "\n", + "**Query**\n", + "**execution**\n", + "**time**\n", + "\n", + "**Slower**\n", + "\n", + "\n", + "**Serverless**\n", + "**SQL**\n", + "\n", + "**CDW1**\n", + "\n", + "**CDW3**\n", + "\n", + "\n", + "**Cost Estimate**\n", + "\n", + "**High**\n", + "\n", + "**Medium**\n", + "\n", + "**Low**\n", + "\n", + "\n", + "**CDW2**\n", + "\n", + "\n", + "**CDW4**\n", + "\n", + "\n", + "**Slower** **Faster**\n", + "**(~5min)** **Startup time** **(~2-3sec)**\n", + "\n", + "**Learn more**\n", + "\n", + "The feature is currently in Public Preview. Sign up to\n", + "[request access to Serverless SQL](https://databricks.com/p/ebook/serverless-sql-preview-sign-up) . To learn more about\n", + "Serverless SQL, visit our [documentation page.](https://docs.databricks.com/serverless-compute/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 07\n", + "\n", + "\n", + "### Data warehousing\n", + "\n", + "Data warehouses are not keeping up with today’s world. The explosion of\n", + "languages other than SQL and unstructured data, machine learning, IoT and\n", + "streaming analytics are forcing organizations to adopt a bifurcated architecture\n", + "of disjointed systems: Data warehouses for BI and data lakes for ML. While SQL\n", + "is ubiquitous and known by millions of professionals, it has never been treated\n", + "as a first-class citizen on data lakes, until the lakehouse.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is data warehousing\n", + "\n", + "The Databricks Lakehouse Platform provides a simplified multicloud and\n", + "serverless architecture for your data warehousing workloads. Data warehousing on\n", + "the lakehouse allows SQL analytics and BI at scale with a common governance\n", + "model. Now you can ingest, transform and query all your data in-place — using\n", + "your SQL and BI tools of choice — to deliver real-time business insights at the\n", + "best price/performance. 
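+    "\n",
+    "To give a flavor of this in-place approach, a gold-layer aggregate can be built directly\n",
+    "over Delta tables with plain SQL. A sketch with hypothetical catalog, schema and table names:\n",
+    "\n",
+    "```python\n",
+    "# Build a business-level aggregate in place, without moving data to a separate warehouse.\n",
+    "spark.sql('''\n",
+    "  CREATE OR REPLACE TABLE main.gold.daily_revenue AS\n",
+    "  SELECT order_date, SUM(amount) AS revenue\n",
+    "  FROM main.silver.orders\n",
+    "  GROUP BY order_date\n",
+    "''')\n",
+    "\n",
+    "display(spark.table('main.gold.daily_revenue').orderBy('order_date'))\n",
+    "```\n",
+    "\n",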
Built on open standards and APIs, the lakehouse\n", + "provides the reliability, quality and performance that data lakes natively lack,\n", + "and integrations with the ecosystem for maximum flexibility — no lock-in.\n", + "\n", + "With data warehousing on the lakehouse, organizations can unify all analytics\n", + "and simplify their architecture to enable their business with real-time business\n", + "insights at the best price/performance.\n", + "\n", + "\n", + "#### Key benefits\n", + "\n", + "**Best price/performance**\n", + "\n", + "Lower costs, get the best price/performance and eliminate\n", + "resource management overhead\n", + "\n", + "On-premises data warehouses have reached their limits — they physically\n", + "cannot scale to handle the growing volumes of data, and don’t provide the\n", + "elasticity customers need to respond to ever-changing business needs.\n", + "Cloud data warehouses are a great alternative to on-premises data\n", + "warehouses, providing greater scale and elasticity, but cloud costs for\n", + "proprietary cloud data warehouses typically yield to an exponential cost\n", + "increase following the growth of data volume.\n", + "\n", + "The Databricks Lakehouse Platform provides instant, elastic SQL serverless\n", + "compute — decoupled from storage on cheap cloud object stores — and\n", + "thousands of performance optimizations that can lower overall infrastructure\n", + "costs by [an average of 40%](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) . Databricks automatically determines instance\n", + "types and configuration for the best price/performance — [up to 12x better](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[than traditional cloud data warehouses](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) — and scale for high concurrency\n", + "use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Built-in governance**\n", + "\n", + "One source of truth and one unified\n", + "governance layer across all data teams\n", + "\n", + "Underpinned by Delta Lake, the Databricks\n", + "Lakehouse Platform simplifies your architecture by\n", + "allowing you to establish one single copy of all your\n", + "data for in-place analytics and ETL/ELT on your\n", + "existing data lakes — no more data movements\n", + "and copies in disjointed systems. Then, seamless\n", + "integration with Databricks Unity Catalog lets you\n", + "easily discover, secure and manage all your data\n", + "with fine-grained governance, data lineage, and\n", + "standard SQL.\n", + "\n", + "**Rich ecosystem**\n", + "\n", + "Ingest, transform and query all your\n", + "data in-place with your favorite tools\n", + "\n", + "Very few tools exist to conduct BI on data lakes.\n", + "Generally, doing so has required data analysts to\n", + "\n", + "submit Spark jobs or use a developer interface.\n", + "While these tools are common for data scientists,\n", + "they require knowledge of languages and\n", + "interfaces that are not traditionally part of a data\n", + "analyst’s tool set. 
As a result, the learning curve for\n", + "an analyst to make use of a data lake is too high\n", + "when well-established tools and methods already\n", + "exist for data warehouses.\n", + "\n", + "\n", + "The Databricks Lakehouse Platform works with\n", + "your preferred tools like dbt, Fivetran, Power BI or\n", + "Tableau, allowing analysts and analytical engineers\n", + "to easily ingest, transform and query the most\n", + "recent and complete data, without having to move\n", + "it into a separate data warehouse. Additionally, it\n", + "empowers every analyst across your organization\n", + "to quickly and collaboratively find and share new\n", + "insights with a built-in SQL editor, visualizations\n", + "and dashboards.\n", + "\n", + "**Break down silos**\n", + "\n", + "Accelerate time from raw to actionable\n", + "data and go effortlessly from BI to ML\n", + "\n", + "\n", + "applications, organizations will need to manage\n", + "an entirely different system than their SQL-only\n", + "data warehouse, slowing down collaboration and\n", + "innovation.\n", + "\n", + "The Databricks Lakehouse Platform provides the\n", + "most complete end-to-end data warehousing\n", + "solution for all your modern analytics needs,\n", + "and more. Now you can empower data teams\n", + "and business users to access the latest data\n", + "faster for downstream real-time analytics and go\n", + "effortlessly from BI to ML. Speed up the time from\n", + "raw to actionable data at any scale — in batch and\n", + "streaming. And go from descriptive to advanced\n", + "analytics effortlessly to uncover new insights.\n", + "\n", + "\n", + "It is challenging for data engineering teams to\n", + "enable analysts at the speed that the business\n", + "requires. Data warehouses need data to be\n", + "ingested and processed ahead of time before\n", + "analysts can access and query it using BI tools.\n", + "Because traditional data warehouses lack\n", + "real-time processing and do not scale well for\n", + "large ETL jobs, they create new data movements\n", + "and bottlenecks for the data engineering team,\n", + "and make it slow for analysts to access the\n", + "latest data. 
And for advanced analytics (ML)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data warehousing on Databricks**\n", + "\n", + "**Truly decoupled, serverless, compute layer**\n", + "\n", + "\n", + "**Data consumers**\n", + "\n", + "\n", + "**Data processing**\n", + "\n", + "**Unity Catalog**\n", + "\n", + "\n", + "**ETL** **ETL**\n", + "\n", + "**Bronze raw** **Silver staging** **Gold DW/marts**\n", + "\n", + "\n", + "**Open storage layer**\n", + "\n", + "**Data ingest**\n", + "\n", + "**Data sources**\n", + "\n", + "\n", + "**Databricks**\n", + "**Partner Connect**\n", + "\n", + "\n", + "**Continuous**\n", + "**ingest**\n", + "\n", + "\n", + "**Batch**\n", + "**ingest**\n", + "\n", + "\n", + "**On-premises**\n", + "\n", + "**OLTP** **OLAP** **Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n", + "\n", + "**DWH**\n", + "\n", + "\n", + "**On-premises**\n", + "\n", + "**Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n", + "\n", + "**DWH**\n", + "\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Try Databricks SQL for free](https://dbricks.co/dbsql)\n", + "\n", + "[Databricks SQL Demo](https://databricks.com/discover/demos/databricks-sql)\n", + "\n", + "[Databricks SQL Data](https://youtu.be/jlEdoVpWwNc)\n", + "[Warehousing Admin Demo](https://youtu.be/jlEdoVpWwNc)\n", + "\n", + "\n", + "[On-demand Webinar: Learn](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n", + "[Databricks SQL From the Experts](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n", + "\n", + "[eBook: Inner Workings of the](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n", + "[Lakehouse for Analytics and BI](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 08\n", + "\n", + "\n", + "### Data engineering\n", + "\n", + "Organizations realize the value data plays as a strategic asset for growing\n", + "revenues, improving the customer experience, operating efficiently or improving\n", + "a product or service. Data is really the driver of all these initiatives. Nowadays,\n", + "data is often streamed and ingested from hundreds of different data sources,\n", + "sometimes acquired from a data exchange, cleaned in various ways with\n", + "different orchestrated steps, versioned and shared for analytics and AI.\n", + "And increasingly, data is being monetized.\n", + "\n", + "Data teams rely on getting the right data at the right time for analytics, data\n", + "science and machine learning, but often are faced with challenges meeting\n", + "the needs of their initiatives for data engineering.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Why data engineering is hard\n", + "\n", + "One of the biggest challenges is accessing and managing the increasingly\n", + "complex data that lives across the organization. Most of the complexity\n", + "arises with the explosion of data volumes and data types, with organizations\n", + "amassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "\n", + "With this volume, managing data pipelines to transform and process data\n", + "is slow and difficult, and increasingly expensive. 
And to top off the complexity,\n", + "most businesses are putting an increased emphasis on multicloud\n", + "environments which can be even more difficult to maintain.\n", + "\n", + "[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\n", + "that data itself has become a product, and the challenging goal of the data\n", + "engineer is to build and run the machinery that creates this high-fidelity\n", + "data product all the way from ingestion to monetization.\n", + "\n", + "\n", + "Despite current technological advances data engineering remains\n", + "difficult for several reasons:\n", + "\n", + "**Complex data ingestion methods**\n", + "\n", + "Data ingestion means retrieving batch and streaming data from various\n", + "sources and in various formats. Ingesting data is hard and complex since you\n", + "either need to use an always-running streaming platform like Apache Kafka\n", + "or you need to be able to keep track of which files haven’t been ingested yet.\n", + "Data engineers are required to spend a lot of time hand-coding repetitive\n", + "and error-prone data ingestion tasks.\n", + "\n", + "**Data engineering principles**\n", + "\n", + "These days, large operations teams are often just a memory of the past.\n", + "Modern data engineering principles are based on agile software development\n", + "methodologies. They apply the well-known “you build it, you run it” paradigm,\n", + "use isolated development and production environments, CI/CD, and version\n", + "control transformations that are pushed to production after validation. Tooling\n", + "needs to support these principles.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Third-party tools**\n", + "\n", + "Data engineers are often required to run additional third-party tools for\n", + "orchestration to automate tasks such as ELT/ETL or customer code in\n", + "notebooks. Running third-party tools increases the operational overhead\n", + "and decreases the reliability of the system.\n", + "\n", + "**Performance tuning**\n", + "\n", + "Finally, with all pipelines and workflows written, data engineers need to\n", + "constantly focus on performance, tuning pipelines and architectures to meet\n", + "SLAs. Tuning such architectures requires in-depth knowledge of the underlying\n", + "architecture and constantly observing throughput parameters.\n", + "\n", + "Most organizations are dealing with a complex landscape of data warehouses\n", + "and data lakes these days. Each of those platforms has its own limitations,\n", + "workloads, development languages and governance model.\n", + "\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. 
The lakehouse platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability\n", + "to drive valuable insights.\n", + "\n", + "Data engineering in the lakehouse allows data teams to unify batch and\n", + "streaming operations on a simplified architecture, streamline data pipeline\n", + "development and testing, build reliable data, analytics and AI workflows\n", + "on any cloud platform, and meet regulatory requirements to maintain\n", + "world-class governance.\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform\n", + "that automates the complexity of building and maintaining pipelines and\n", + "running ETL workloads so data engineers and analysts can focus on quality\n", + "and reliability to drive valuable insights.\n", + "\n", + "\n", + "#### Databricks makes modern data engineering simple\n", + "\n", + "There is no industry-wide definition of modern data engineering.\n", + "This should come close:\n", + "\n", + "_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\n", + "_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\n", + "**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\n", + "_kinds of workflows._\n", + "\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "#### Benefits of data engineering on the lakehouse\n", + "\n", + "By simplifying and modernizing with the lakehouse architecture, data engineers\n", + "gain an enterprise-grade and enterprise-ready approach to building data\n", + "pipelines. The following are eight key differentiating capabilities that a data\n", + "engineering solution team can enable with the Databricks Lakehouse Platform:\n", + "\n", + "**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\n", + "engineers can enable fast, reliable, scalable and automatic data ingestion\n", + "for analytics, data science or machine learning.\n", + "\n", + "\n", + "\n", + "**•** **Data pipeline observability:** Monitor overall data pipeline estate status\n", + "from a dataflow graph dashboard and visually track end-to-end pipeline\n", + "health for performance, quality, status and latency.\n", + "\n", + "**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\n", + "analytics and machine learning use cases by enabling easy and automatic\n", + "data pipeline deployments into production or roll back pipelines and\n", + "minimize downtime.\n", + "\n", + "**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\n", + "of data processing tasks for data and machine learning pipelines with the\n", + "ability to run multiple non-interactive tasks as a directed acyclic graph\n", + "(DAG) on a Databricks compute cluster.\n", + "\n", + "\n", + "\n", + "**•** **Automated ETL pipelines:** Data engineers can reduce development\n", + "time and effort and focus on implementing business logic and data\n", + "quality checks within the data pipeline using SQL or Python.\n", + "\n", + "**•** **Data quality checks:** Improve data reliability throughout the data\n", + "lakehouse so data teams can confidently trust the information for\n", + "downstream initiatives with the ability to define data quality and\n", + "automatically address errors.\n", + "\n", + "**•** **Batch and streaming:** Allow data engineers to set tunable data latency\n", + "with 
cost controls without having to know complex stream processing\n",
+    "and implement recovery logic.\n",
+    "\n",
+    "**•** **Automatic recovery:** Handle transient errors and use automatic recovery\n",
+    "for most common error conditions that can occur during the operation of\n",
+    "a pipeline with fast, scalable fault-tolerance.\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "**Data engineering is all about data quality**\n",
+    "\n",
+    "The goal of modern data engineering is to distill data with a quality that is fit for\n",
+    "downstream analytics and AI. Within the Lakehouse, data quality is achieved on\n",
+    "three different levels.\n",
+    "\n",
+    "1. On a **technical level** , data quality is guaranteed by enforcing and evolving\n",
+    "schemas for data storage and ingestion.\n",
+    "\n",
+    "2. On an **architectural level** , data quality is often achieved by implementing the\n",
+    "medallion architecture. A medallion architecture is a data design pattern used to\n",
+    "logically organize data in a [lakehouse](https://databricks.com/glossary/data-lakehouse) with the goal of incrementally and\n",
+    "progressively improving the structure and quality of data as it flows through each\n",
+    "layer of the architecture, e.g., from Bronze to Silver to Gold layer tables.\n",
+    "\n",
+    "3. The **Databricks Unity Catalog** comes with robust data quality management with\n",
+    "built-in quality controls, testing, monitoring and enforcement to ensure accurate and\n",
+    "useful data is available for downstream BI, analytics and machine learning workloads.\n",
+    "\n",
+    "_Figure: Medallion architecture. Sources such as Kinesis, files (CSV, JSON, TXT, ...)\n",
+    "and a data lake land as raw ingestion and history in the Bronze layer, are filtered,\n",
+    "cleaned and augmented in the Silver layer, and become business-level aggregates in the\n",
+    "Gold layer, with quality increasing at each step; the Gold layer feeds streaming\n",
+    "analytics, BI and reporting, and data science and ML._\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "#### Data ingestion\n",
+    "\n",
+    "With the Databricks Lakehouse Platform, data engineers can build robust\n",
+    "hyper-scale ingestion pipelines in streaming and batch mode. They can\n",
+    "incrementally process new files as they land on cloud storage — with no\n",
+    "need to manage state information — in scheduled or continuous jobs.\n",
+    "\n",
+    "Data engineers can efficiently track new files (with the ability to scale\n",
+    "to billions of files) without having to list them in a directory. Databricks\n",
+    "automatically infers the schema from the source data and evolves it as\n",
+    "the data loads into the Delta Lake lakehouse. Efforts continue with\n",
+    "enhancing and supporting Auto Loader, our powerful data ingestion\n",
+    "tool for the Lakehouse.\n",
+    "\n",
+    "**What is Auto Loader?**\n",
+    "\n",
+    "Have you ever imagined that ingesting data could become as easy\n",
+    "as dropping a file into a folder? Welcome to Databricks Auto Loader.\n",
+    "\n",
+    "[Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) is an optimized data ingestion tool that incrementally and\n",
+    "efficiently processes new data files as they arrive in the cloud storage built\n",
+    "into the Databricks Lakehouse. 
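+    "\n",
+    "A minimal Auto Loader stream might look like the following sketch; the landing folder,\n",
+    "schema/checkpoint locations and target table are illustrative:\n",
+    "\n",
+    "```python\n",
+    "# Incrementally pick up new JSON files from a landing folder.\n",
+    "bronze = (\n",
+    "    spark.readStream.format('cloudFiles')\n",
+    "         .option('cloudFiles.format', 'json')\n",
+    "         .option('cloudFiles.schemaLocation', '/Volumes/main/raw/_schemas/events')\n",
+    "         .load('/Volumes/main/raw/events/')\n",
+    ")\n",
+    "\n",
+    "# Append into a Bronze Delta table and stop when caught up (like the trigger once option).\n",
+    "(bronze.writeStream\n",
+    "       .option('checkpointLocation', '/Volumes/main/raw/_checkpoints/events')\n",
+    "       .trigger(availableNow=True)\n",
+    "       .toTable('main.bronze.events'))\n",
+    "```\n",
+    "\n",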
Auto Loader can detect and enforce the\n", + "schema of your data and, therefore, guarantee data quality. New files or\n", + "files that have been changed since the last time new data was processed\n", + "are identified automatically and ingested. Noncompliant data sets are\n", + "quarantined into rescue data columns. You can use the [trigger once]\n", + "option with Auto Loader to turn it into a job that turns itself off.\n", + "\n", + "\n", + "**Ingestion for data analysts: COPY INTO**\n", + "\n", + "Ingestion also got much easier for data analysts and analytics engineers working\n", + "with Databricks SQL. [COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) is a simple SQL command that follows the\n", + "lake-first approach and loads data from a folder location into a Delta Lake table.\n", + "COPY INTO can be scheduled and called by a job repeatedly. When run, only new\n", + "files from the source location will be processed.\n", + "\n", + "#### Data transformation\n", + "\n", + "Turning SQL queries into production ETL pipelines typically involves a lot\n", + "of tedious, complicated operational work. Even at a small scale, the majority\n", + "of a data practitioner’s time is spent on tooling and managing infrastructure.\n", + "\n", + "Although the medallion architecture is an established and reliable pattern\n", + "for improving data quality, the implementation of this pattern is challenging\n", + "for many data engineering teams.\n", + "\n", + "While hand-coding the medallion architecture was hard for data engineers,\n", + "creating data pipelines was outright impossible for data analysts not being\n", + "able to code with Spark Structured Streaming in Scala or Python.\n", + "\n", + "Even at a small scale, most data engineering time is spent on tooling and\n", + "managing infrastructure rather than transformation. Auto-scaling, observability\n", + "and governance are difficult to implement and, as a result, often left out of the\n", + "solution entirely.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is Delta Live Tables?\n", + "\n", + "Delta Live Tables (DLT) is the first ETL framework that uses a simple **declarative approach** to building reliable data pipelines. DLT automatically auto-scales your\n", + "infrastructure so data analysts and engineers can spend less time on tooling and focus on getting value from data. Engineers are able to **treat their data as code**\n", + "and apply modern software engineering best practices like testing, error-handling, monitoring and documentation to deploy reliable pipelines at scale. DLT fully supports\n", + "both Python and SQL and is tailored to work with both streaming and batch workloads.\n", + "\n", + "With DLT you write a Delta Live Table in a SQL notebook, create a pipeline under Workflows and simply click [Start].\n", + "\n", + "\n", + "**Write** **create live table**\n", + "\n", + "\n", + "**Create** **a pipeline** **Click** **Start**\n", + "\n", + "Start\n", + "\n", + "\n", + "-----\n", + "\n", + "DLT reduces the implementation time by accelerating development and\n", + "automating complex operational tasks. Since DLT can use plain SQL, it also\n", + "enables data analysts to create production pipelines and turns them into\n", + "the often discussed “analytics engineer.” At runtime, DLT speeds up pipeline\n", + "execution applied with Photon.\n", + "\n", + "Software engineering principles are applied for data engineering to foster the\n", + "idea of treating your data as code. 
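+    "\n",
+    "To make the declarative style concrete, a minimal Python DLT table with one data quality\n",
+    "expectation might look like the following sketch (the dataset names and the expectation\n",
+    "rule are illustrative):\n",
+    "\n",
+    "```python\n",
+    "import dlt\n",
+    "from pyspark.sql.functions import col\n",
+    "\n",
+    "@dlt.table(comment='Cleaned orders, declared as code.')\n",
+    "@dlt.expect_or_drop('valid_order_id', 'order_id IS NOT NULL')\n",
+    "def orders_silver():\n",
+    "    # Stream from an upstream bronze dataset defined elsewhere in the pipeline.\n",
+    "    return (\n",
+    "        dlt.read_stream('orders_bronze')\n",
+    "           .select('order_id', 'customer_id', col('amount').cast('double').alias('amount'))\n",
+    "    )\n",
+    "```\n",
+    "\n",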
Your data is the sole source of truth for what\n", + "is going on inside your business.\n", + "\n", + "Beyond just the transformations, there are many things that should be included\n", + "\n", + "Dependency\n", + "Full refresh\n", + "management\n", + "\n", + "*Coming soon\n", + "\n", + "\n", + "in the code that define your data. Declaratively express entire data flows in SQL\n", + "or Python. Natively enable modern software engineering best practices like\n", + "separate development and production environments, the ability to easily test\n", + "before deploying, deploy and manage environments using parameterization, unit\n", + "testing and documentation.\n", + "\n", + "DLT also automatically scales compute, providing the option to set the minimum\n", + "and maximum number of instances and let DLT size up the cluster according\n", + "to cluster utilization. In addition, tasks like orchestration, error handling and\n", + "recovery, and performance optimization are all handled automatically.\n", + "\n", + "\n", + "Incremental\n", + "computation*\n", + "\n", + "\n", + "Checkpointing\n", + "and retries\n", + "\n", + "\n", + "-----\n", + "\n", + "Expectations in the code help prevent bad data from flowing into tables, track\n", + "data quality over time, and provide tools to troubleshoot bad data with granular\n", + "pipeline observability. This enables a high-fidelity lineage diagram of your\n", + "pipeline to track dependencies and aggregate data quality metrics across all\n", + "your pipelines.\n", + "\n", + "Unlike other products that force you to deal with streaming and batch workloads\n", + "separately, DLT supports any type of data workload with a single API so data\n", + "engineers and analysts alike can build cloud-scale data pipelines faster without\n", + "the need for advanced data engineering skills.\n", + "\n", + "#### Data orchestration\n", + "\n", + "The lakehouse makes it much easier for businesses to undertake ambitious data\n", + "and machine learning (ML) initiatives. However, orchestrating and managing\n", + "end-to-end production workflows remains a bottleneck for most organizations,\n", + "relying on external tools or cloud-specific solutions that are not part of their\n", + "lakehouse platform. Tools that decouple task orchestration from the underlying\n", + "data processing platform reduce the overall reliability of their production\n", + "workloads, limit observability, and increase complexity for end users.\n", + "\n", + "#### What is Databricks Workflows?\n", + "\n", + "[Databricks Workflows](https://databricks.com/product/workflows) is the first fully managed and integrated lakehouse\n", + "[orchestration](https://databricks.com/glossary/orchestration) service that allows data teams to build reliable workflows on\n", + "any cloud.\n", + "\n", + "\n", + "Workflows lets you orchestrate data flow pipelines (written in DLT or dbt),\n", + "as well as machine learning pipelines, or any other tasks such as notebooks\n", + "or Python wheels. Since Databricks Workflows is fully managed, it eliminates\n", + "operational overhead for data engineers, enabling them to focus on your\n", + "workflows not on managing your infrastructure. 
It provides an easy point-and-click\n", + "authoring experience for all your data teams, not just those with specialized skills.\n", + "Deep integration with the underlying lakehouse platform ensures you will create\n", + "and run reliable production workloads on any cloud while providing deep and\n", + "centralized monitoring with simplicity for end users.\n", + "\n", + "Sharing job clusters over multiple tasks reduces the time a job takes, reduces\n", + "costs by eliminating overhead and increases cluster utilization with parallel tasks.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks Workflows’ deep integration with the lakehouse can best be seen with its monitoring and observability features. The matrix view in the following graphic\n", + "shows a history of runs for a job. Failed tasks are marked in red. A failed job can be repaired and rerun with the click of a button. Rerunning a failed task detects and\n", + "triggers the execution of all dependent tasks.\n", + "\n", + "You can create workflows with the UI, but also through the Databricks Workflows API, or with external orchestrators such as Apache Airflow. Even if you are using an\n", + "\n", + "external orchestrator, Databricks Workflows’ monitoring acts as a single pane of glass that includes externally triggered workflows.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Orchestrate anything\n", + "\n", + "Remember that DLT is one of many task types for Databricks Workflows.\n", + "This is where the managed data flow pipelines with DLT tie together with\n", + "the easy point-and-click authoring experience of Databricks Workflows.\n", + "\n", + "In the following example, you can see an end-to-end workflow built with\n", + "customers in a workshop: Data is streamed from Twitter according to search\n", + "terms, then ingested with Auto Loader using automatic schema detection and\n", + "enforcement. In the next step, the data is cleaned and transformed with Delta\n", + "Live table pipelines written in SQL, and finally run through a pre-trained BERT\n", + "language model from Hugging Face for sentiment analysis of the tweets.\n", + "Different task types for ingest, cleanse/transform and ML are combined\n", + "in a single workflow.\n", + "\n", + "Using Workflows, these tasks can be scheduled to provide a daily overview of\n", + "social media coverage and customer sentiment for a business. 
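A workflow like this can also be defined programmatically. The sketch below uses the Jobs 2.1 REST API with placeholder host, token, notebook paths and cluster settings, and shares one job cluster across two dependent tasks:

```python
# Sketch: create a two-task Databricks Workflows job via the Jobs 2.1 REST API.
# The host, token, notebook paths and cluster spec are placeholders.
import requests

job_spec = {
    "name": "daily-social-sentiment",
    "job_clusters": [{
        "job_cluster_key": "shared_cluster",        # shared across tasks to cut overhead
        "new_cluster": {"spark_version": "13.3.x-scala2.12",
                        "node_type_id": "i3.xlarge",
                        "num_workers": 2},
    }],
    "tasks": [
        {"task_key": "ingest",
         "job_cluster_key": "shared_cluster",
         "notebook_task": {"notebook_path": "/Repos/demo/ingest_tweets"}},
        {"task_key": "score_sentiment",
         "depends_on": [{"task_key": "ingest"}],    # runs only after ingest succeeds
         "job_cluster_key": "shared_cluster",
         "notebook_task": {"notebook_path": "/Repos/demo/score_sentiment"}},
    ],
}

resp = requests.post(
    "https://<workspace-host>/api/2.1/jobs/create",
    headers={"Authorization": "Bearer <personal-access-token>"},
    json=job_spec,
)
print(resp.json())  # returns the new job_id on success
```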
After streaming\n", + "tweets with filtering for keywords such as “data engineering,” “lakehouse” and\n", + "“Delta Lake,” we curated a list of those tweets that were classified as positive\n", + "with the highest probability score.\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Data Engineering on the](https://databricks.com/solutions/data-pipelines)\n", + "[Lakehouse](https://databricks.com/solutions/data-pipelines)\n", + "\n", + "\n", + "[Delta Live Tables](https://databricks.com/product/delta-live-tables)\n", + "\n", + "[Databricks Workflows](https://www.databricks.com/product/workflows)\n", + "\n", + "\n", + "[Big Book of Data Engineering](https://databricks.com/p/ebook/the-big-book-of-data-engineering?itm_data=datapipelines-promo-bigbookofde)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Data streaming\n", + "# 09\n", + "\n", + "\n", + "**CHAPTER**\n", + "\n", + "\n", + "There are two types of data processing: batch processing\n", + "and streaming processing.\n", + "\n", + "\n", + "Batch processing refers to the discontinuous, periodic processing\n", + "of data that has been stored for a period of time. For example,\n", + "an organization may need to run weekly reports on a set of\n", + "predictable transaction data. There is no need for this data\n", + "to be streaming — it can be processed on a weekly basis.\n", + "\n", + "Streaming processing, on the other hand, refers to unbounded\n", + "processing of data as it arrives.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data Streaming Challenges**\n", + "\n", + "However, getting value from streaming data can be a tricky practice. While most\n", + "data today can be considered streaming data, organizations are overwhelmed by\n", + "the need to access, process and analyze the volume, speed and variety of this\n", + "data moving through their platforms. To keep pace with innovation, they must\n", + "quickly make sense of data streams decisively, consistently and in real time.\n", + "\n", + "Three common technical challenges organizations experience\n", + "with implementing real-time data streaming include:\n", + "\n", + "**•** **Specialized APIs and language skills:** Data practitioners encounter\n", + "barriers to adopting streaming skillsets because there are new languages,\n", + "APIs and tools to learn.\n", + "\n", + "**•** **Operational complexity:** To implement data streaming at scale, data\n", + "teams need to integrate and manage streaming-specific tools with\n", + "their other cloud services. They also have to manually build complex\n", + "operational tooling to help these systems recover from failure, restart\n", + "workloads without reprocessing data, optimize performance, scale the\n", + "underlying infrastructure, and so on.\n", + "\n", + "**•** **Incompatible governance models:** Different governance and security\n", + "models across real-time and historical data platforms makes it difficult\n", + "to provide the right access to the right users, see the end-to-end data\n", + "lineage, and/or meet compliance requirements.\n", + "\n", + "\n", + "In a wide variety of cases, an organization might find it useful to\n", + "leverage streaming data. 
Here are some common examples:\n", + "\n", + "**•** **Retail:** Real-time inventory updates help support business activities, such\n", + "as inventory and pricing optimization and optimization of the supply chain,\n", + "logistics and just-in-time delivery.\n", + "\n", + "**•** **Smart energy:** Smart meter monitoring in real time allows for smart\n", + "electricity pricing models and connection with renewable energy sources\n", + "to optimize power generation and distribution.\n", + "\n", + "**•** **Preventative maintenance:** By reducing unplanned outages and\n", + "unnecessary site and maintenance visits, real-time streaming analytics can\n", + "lower operational and equipment costs.\n", + "\n", + "**•** **Industrial automation:** Manufacturers can use streaming and predictive\n", + "analytics to improve production processes and product quality, including\n", + "setting up automated alerts.\n", + "\n", + "**•** **Healthcare:** To optimize care recommendations, real-time data allows\n", + "for the integration of various smart sensors to monitor patient condition,\n", + "medication levels and even recovery speed.\n", + "\n", + "**•** **Financial institutions:** Firms can conduct real-time analysis of\n", + "\n", + "transactions to detect fraudulent transactions and send alerts. They\n", + "can use fraud analytics to identify patterns and feed data into machine\n", + "learning algorithms.\n", + "\n", + "\n", + "Regardless of specific use cases, the central tenet of streaming data is that it\n", + "gives organizations the opportunity to leverage the freshest possible insights for\n", + "better decision-making and more optimized customer experiences.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data streaming architecture**\n", + "\n", + "Before addressing these challenges head-on, it may help to take a step back and\n", + "discuss the ingredients of a streaming data pipeline. Then, we will explain how\n", + "the Databricks Lakehouse Platform operates within this context to address the\n", + "aforementioned challenges.\n", + "\n", + "Every application of streaming data requires a pipeline that brings the data from\n", + "its origin point — whether sensors, IoT devices or database transactions — to its\n", + "final destination.\n", + "\n", + "In building this pipeline, streaming architectures typically employ two layers.\n", + "First, streaming capture systems **capture** and temporarily store streaming data\n", + "for processing. Sometimes these systems are also called messaging systems\n", + "or messaging buses. These systems are optimized for small payloads and high\n", + "frequency inputs/outputs. Second, streaming **processing** systems continuously\n", + "process data from streaming capture systems and other storage systems.\n", + "\n", + "**Capturing** **Processing**\n", + "\n", + "\n", + "It may help to think of a simplified streaming pipeline\n", + "according to the following seven phases:\n", + "\n", + "1. Data is continuously generated at origin points\n", + "\n", + "2. The generated data is captured from those origin points by\n", + "a capture system like Apache Kafka (with limited retention)\n", + "\n", + "**3. The captured data is extracted and incrementally ingested to**\n", + "**a processing platform like Databricks; data is ingested exactly**\n", + "**once and stored permanently, even if this step is rerun**\n", + "\n", + "**4. The ingested data is converted into a workable format**\n", + "\n", + "**5. 
The formatted data is cleansed, transformed and joined in**\n", + "**a number of pipeline steps**\n", + "\n", + "**6. The transformed data is processed downstream through**\n", + "**analysis or ML modeling**\n", + "\n", + "7. The resulting analysis or model is used for some sort of practical\n", + "application, which may be anything from basic reporting to an\n", + "event-driven software application\n", + "\n", + "You will notice four of the steps in this list are in boldface. This is because the\n", + "lakehouse architecture is specifically designed to optimize this part of the\n", + "pipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\n", + "analyze and model on streaming data _alongside_ batch-processed data. It can\n", + "accommodate both structured _and_ unstructured data. It is here that the value\n", + "of unifying the best pieces of data lakes and data warehouses really shines for\n", + "complex enterprise use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data Streaming on the Lakehouse**\n", + "\n", + "Now let’s zoom in a bit and see how the Databricks Lakehouse\n", + "Platform addresses each part of the pipeline mentioned above.\n", + "\n", + "**Streaming data ingestion and transformation** begins with continuously\n", + "and incrementally collecting raw data from streaming sources through a\n", + "feature called Auto Loader. Once the data is ingested, it can be transformed\n", + "from raw, messy data into clean, fresh, reliable data appropriate for downstream\n", + "analytics, ML or applications. [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) makes it easy to build and\n", + "manage these data pipelines while automatically taking care of infrastructure\n", + "management and scaling, data quality, error testing and other administrative\n", + "tasks. DLT is a high-level abstraction built on Spark Structured Streaming,\n", + "a scalable and fault-tolerant stream processing engine.\n", + "\n", + "**[Real-time analytics](https://www.databricks.com/product/databricks-sql)** refers to the downstream analytical application\n", + "of streaming data. With fresher data streaming into SQL analytics or BI\n", + "reporting, more actionable insights can be achieved, resulting in better\n", + "business outcomes.\n", + "\n", + "**[Real-time ML](https://www.databricks.com/product/machine-learning)** involves deploying ML models in a streaming mode. This\n", + "deployment is supported with structured streaming for continuous inference\n", + "from a live data stream. Like real-time analytics, real-time ML is a downstream\n", + "impact of streaming data, but for different business use cases (i.e., AI instead\n", + "of BI). Real-time modeling has many benefits, including more accurate\n", + "predictions about the future.\n", + "\n", + "\n", + "**Real-time applications** process data directly from streaming pipelines and\n", + "trigger programmatic actions, such as displaying a relevant ad, updating the\n", + "price on a pricing page, stopping a fraudulent transaction, etc. 
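For the capture-to-processing hand-off described above, a minimal Spark Structured Streaming sketch (placeholder broker, topic, checkpoint location and table name) looks like this:

```python
# Illustrative only: read events from a capture layer (Kafka) and process them
# incrementally into a Delta table.
from pyspark.sql import functions as F

events = (
    spark.readStream.format("kafka")                  # `spark` = the notebook SparkSession
    .option("kafka.bootstrap.servers", "broker:9092")
    .option("subscribe", "tweets")
    .load()
    .select(F.col("value").cast("string").alias("payload"), "timestamp")
)

(
    events.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/tweets")  # restart without reprocessing
    .outputMode("append")
    .toTable("bronze_tweets")
)
```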
There typically\n", + "is no human-in-the-loop for such applications.\n", + "\n", + "\n", + "Data in cloud storage and message stores\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks Lakehouse Platform differentiators**\n", + "\n", + "Understanding what the lakehouse architecture provides is one\n", + "\n", + "thing, but it is useful to understand how Databricks uniquely\n", + "\n", + "approaches the common challenges mentioned earlier around\n", + "\n", + "working with streaming data.\n", + "\n", + "**Databricks empowers unified data teams.** Data engineers, data scientists\n", + "and analysts can easily build streaming data workloads with the languages\n", + "and tools they already know and the APIs they already use.\n", + "\n", + "**Databricks simplifies development and operations.** Organizations can\n", + "focus on getting value from data by reducing complexity and automating\n", + "much of the production aspects associated with building and maintaining\n", + "real-time data workloads.\n", + "\n", + "\n", + "See why customers love streaming on the Databricks\n", + "Lakehouse Platform with these resources.\n", + "\n", + "**Learn more**\n", + "\n", + "[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n", + "\n", + "[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n", + "[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n", + "\n", + "[Structured Streaming Documentation](https://docs.databricks.com/spark/latest/structured-streaming/index.html)\n", + "\n", + "[Streaming — Getting Started With Apache Spark on Databricks](https://databricks.com/spark/getting-started-with-apache-spark/streaming)\n", + "\n", + "\n", + "**Databricks is one platform for streaming and batch data.** Organizations\n", + "can eliminate data silos, centralize security and governance models, and\n", + "provide complete support for all their real-time use cases under one roof —\n", + "the roof of the lakehouse.\n", + "\n", + "Finally — and perhaps most important — Delta Lake, the core of the [Databricks](https://www.databricks.com/product/data-lakehouse)\n", + "\n", + "[Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , was built for streaming from the ground up. Delta Lake is\n", + "deeply integrated with Spark Structured Streaming and overcomes many of\n", + "the limitations typically associated with streaming systems and files.\n", + "\n", + "In summary, the Databricks Lakehouse Platform dramatically simplifies data\n", + "streaming to deliver real-time analytics, machine learning and applications on\n", + "one platform. And, that platform is built on a foundation with streaming at its\n", + "core. This means organizations of all sizes can use their data in motion and\n", + "make more informed decisions faster than ever.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Data science and machine learning\n", + "# 10\n", + "\n", + "\n", + "**CHAPTER**\n", + "\n", + "\n", + "While most companies are aware of the potential benefits of applying\n", + "machine learning and AI, realizing these potentials can often be quite\n", + "challenging for those brave enough to take the leap. 
Some of the\n", + "largest hurdles come from siloed/disparate data systems, complex\n", + "experimentation environments, and getting models served in a\n", + "production setting.\n", + "\n", + "\n", + "Fortunately, the Databricks Lakehouse Platform provides a helping\n", + "hand and lets you use data to derive innovative insights, build\n", + "powerful predictive models, and enable data scientists, ML engineers,\n", + "and developers of all kinds to create within the space of machine\n", + "learning and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Databricks Machine Learning\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Exploratory data analysis\n", + "\n", + "With all the data in one place, data is easily\n", + "explored and visualized from within the\n", + "notebook-style experience that provides support\n", + "for various languages (R, SQL, Python and Scala)\n", + "as well as built-in visualizations and dashboards.\n", + "Confidently and securely share code with\n", + "co-authoring, commenting, automatic versioning,\n", + "Git integrations and role-based access controls.\n", + "The platform provides laptop-like simplicity at\n", + "production-ready scale.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Model creation and management\n", + "\n", + "From data ingestion to model training and tuning, all the way through to\n", + "production model serving and versioning, the Lakehouse brings the tools\n", + "needed to simplify those tasks.\n", + "\n", + "Get right into experimenting with the Databricks ML runtimes, optimized and\n", + "preconfigured to include most popular libraries like scikit-learn, XGBoost and\n", + "more. Massively scale thanks to built-in support for distributed training and\n", + "hardware acceleration with GPUs.\n", + "\n", + "From within the runtimes, you can track model training sessions, package and\n", + "reuse models easily with [MLflow](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) , an open source machine learning platform\n", + "created by Databricks and included as a managed service within the Lakehouse.\n", + "It provides a centralized location from which to manage models and package\n", + "code in an easily reusable way.\n", + "\n", + "Training these models often involves the use of features housed in a centralized\n", + "feature store. Fortunately, Databricks has a built-in feature store that allows you\n", + "to create new features, explore and re-use existing features, select features for\n", + "training and scoring machine learning models, and publish features to low-latency\n", + "online stores for real-time inference.\n", + "\n", + "If you are looking to get a head start, [AutoML](https://databricks.com/blog/2022/04/18/supercharge-your-machine-learning-projects-with-databricks-automl-now-generally-available.html) allows for low to no-code\n", + "experimentation by pointing to your data set and automatically training models\n", + "and tuning hyperparameters to save both novice and advanced users precious\n", + "time in the machine learning process.\n", + "\n", + "\n", + "AutoML will also report back metrics related to the model training results as well\n", + "as the code needed to repeat the training already custom-tailored to your data\n", + "set. 
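As a rough illustration of that workflow, the snippet below starts an AutoML classification experiment with the `databricks.automl` Python API; the table and column names are placeholders:

```python
# Illustrative only: launch an AutoML classification experiment.
from databricks import automl

df = spark.table("ml.customer_features")     # hypothetical feature table
summary = automl.classify(dataset=df, target_col="churned", timeout_minutes=30)

# Every trial is tracked in MLflow, and each model comes with a generated,
# editable notebook that reproduces its training code.
print(summary.best_trial.mlflow_run_id)
```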
This glass box approach ensures that you are never trapped or suffer from\n", + "vendor lock-in.\n", + "\n", + "In that regard, the Lakehouse supports the industry’s widest range of data tools,\n", + "development environments, and a thriving ISV ecosystem so you can make your\n", + "workspace your own and put out your best work.\n", + "\n", + "##### Compute platform\n", + "\n", + "**Any ML workload optimized and accelerated**\n", + "\n", + "**Databricks Machine Learning Runtime**\n", + "\n", + "- Optimized and preconfigured ML frameworks\n", + "\n", + "- Turnkey distribution ML\n", + "\n", + "- Built-in AutoML\n", + "\n", + "- GPU support out of the box\n", + "\n", + "\n", + "Built-in **ML frameworks**\n", + "and **model explainability**\n", + "\n", + "Built-in support for **AutoML**\n", + "and **hyperparameter tuning**\n", + "\n", + "\n", + "Built-in support for\n", + "**distributed training**\n", + "\n", + "Built-in support for\n", + "**hardware accelerators**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Deploy your models to production\n", + "\n", + "Exploring and creating your machine learning models\n", + "typically represents only part of the task. Once the\n", + "models exist and perform well, they must become\n", + "part of a pipeline that keeps models updated,\n", + "monitored and available for use by others.\n", + "\n", + "**Webhooks** allow registering of\n", + "\n", + "\n", + "Databricks can help here by providing a world-class\n", + "experience for model versioning, monitoring and\n", + "serving within the same platform that you can use\n", + "to generate the models themselves. This means you\n", + "can make all your ML pipelines in the same place,\n", + "monitor them for drift, retrain them with new data,\n", + "and promote and serve them easily and at scale.\n", + "\n", + "Throughout the ML lifecycle, rest assured knowing\n", + "that lineage and governance are being tracked the\n", + "entire way. This means regulatory compliance and\n", + "security woes are significantly reduced, potentially\n", + "saving costly issues down the road.\n", + "\n", + "\n", + "callbacks on events like stage\n", + "\n", + "transitions to integrate with CI/CD\n", + "\n", + "automation.\n", + "\n", + "**Tags** allow storing deployment\n", + "\n", + "— specific metadata with model\n", + "\n", + "versions, e.g., whether the\n", + "\n", + "deployment was successful.\n", + "\n", + "\n", + "**Model lifecycle management**\n", + "\n", + "Staging Production Archived\n", + "\n", + "\n", + "Logged\n", + "model\n", + "\n", + "**Comments** allow communication\n", + "\n", + "and collaboration between\n", + "\n", + "teammates when reviewing\n", + "\n", + "model versions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Learn more**\n", + "\n", + "[Databricks Machine Learning](https://databricks.com/product/machine-learning)\n", + "\n", + "[Databricks Data Science](https://databricks.com/product/data-science)\n", + "\n", + "[Databricks ML Runtime Documentation](https://docs.databricks.com/runtime/mlruntime.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 11\n", + "\n", + "\n", + "### Databricks Technology Partners and the modern data stack\n", + "\n", + "Databricks Technology Partners integrate their solutions with Databricks to\n", + "provide complementary capabilities for ETL, data ingestion, business intelligence,\n", + "machine learning and governance. 
These integrations allow customers to leverage\n", + "the Databricks Lakehouse Platform’s reliability and scalability to innovate faster\n", + "while deriving valuable data insights. Use preferred analytical tools with optimized\n", + "connectors for fast performance, low latency and high user concurrency to your\n", + "data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "With [Partner Connect](https://databricks.com/partnerconnect) , you can bring together all your data, analytics and AI tools on one open platform. Databricks provides a fast and easy way to connect your existing\n", + "tools to your lakehouse using validated integrations and helps you discover and try new solutions.\n", + "\n", + "**Databricks thrives within your modern data stack**\n", + "\n", + "**BI and dashboards** **Machine learning** **Data science**\n", + "\n", + "\n", + "**Data governance**\n", + "\n", + "**Data pipelines**\n", + "\n", + "**Data ingestion**\n", + "\n", + "\n", + "Data Data Data\n", + "warehousing engineering streaming\n", + "\n", + "**Unity Catalog**\n", + "\n", + "\n", + "Data science\n", + "and ML\n", + "\n", + "\n", + "**Consulting**\n", + "**and SI partners**\n", + "\n", + "\n", + "**Delta Lake**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n", + "\n", + "[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n", + "\n", + "\n", + "[Partner Connect](https://databricks.com/partnerconnect)\n", + "\n", + "[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Get started with the Databricks Lakehouse Platform\n", + "# 12\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Databricks Trial\n", + "\n", + "Get a collaborative environment for data teams to build solutions together with interactive\n", + "notebooks to use Apache Spark TM , SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras,\n", + "scikit-learn and more.\n", + "\n", + "**•** Available as a 14-day full trial in your own cloud or as a lightweight trial\n", + "hosted by Databricks\n", + "\n", + "**[Try Databricks for free](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n", + "\n", + "\n", + "**[Databricks documentation](https://databricks.com/documentation)**\n", + "\n", + "Get detailed documentation to get started with\n", + "the Databricks Lakehouse Platform on your cloud\n", + "of choice: Databricks on AWS, Azure Databricks\n", + "and [Databricks on Google Cloud](https://docs.gcp.databricks.com/?_gl=1*16ovt38*_gcl_aw*R0NMLjE2NTI1NDYxNjIuQ2owS0NRandwdjJUQmhEb0FSSXNBTEJuVm5saU9ydGpfX21uT1U5NU5iRThSbmI5a3o2OGdDNUY0UTRzYThtTGhVZHZVb0NhTkRBMmlWc2FBcEN6RUFMd193Y0I.&_ga=2.135042808.863708747.1652113196-1440404449.1635787641&_gac=1.225252968.1652546163.Cj0KCQjwpv2TBhDoARIsALBnVnliOrtj__mnOU95NbE8Rnb9kz68gC5F4Q4sa8mLhUdvUoCaNDA2iVsaApCzEALw_wcB) .\n", + "\n", + "**[Databricks Demo Hub](https://databricks.com/discover/demos)**\n", + "\n", + "Get a firsthand look at Databricks from the\n", + "practitioner’s perspective with these simple\n", + "on-demand videos. 
Each demo is paired with related materials — including notebooks, videos and eBooks — so that you can try it out for yourself on Databricks.

**[Databricks Academy](https://databricks.com/learn/training/home)**

Whether you are new to the data lake or building on an existing skill set, you can find a curriculum tailored to your role or interest. With training and certification through Databricks Academy, you will learn to master the Databricks Lakehouse Platform for all your big data analytics projects.

**[Databricks Community](https://community.databricks.com/)**

Get answers, network with peers and solve the world’s toughest problems, together.

**[Databricks Labs](https://databricks.com/learn/labs)**

Databricks Labs are projects created by the field to help customers get their use cases into production faster.

**[Databricks customers](https://databricks.com/customers)**

Discover how innovative companies across every industry are leveraging the Databricks Lakehouse Platform.

-----

#### About Databricks

Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc).

© Databricks 2022. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation.

-----
##### Guide\n", + "\n", + "## 6 Strategies for Building Personalized Customer Experiences\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "**Introduction** ................................................................................................................................................................................................................. **3**\n", + "\n", + "**1.** **Building a Foundation for Personalization**\n", + "Leveraging ML-Based Customer Entity Resolution ............................................................................................................................... **4**\n", + "\n", + "**2.** **Estimating Customer Lifetime Value**\n", + "Building Brand Loyalty With Data ................................................................................................................................................................. **6**\n", + "\n", + "**3.** **Mitigating Customer Churn**\n", + "Balancing Acquisition and Retention .......................................................................................................................................................... **10**\n", + "\n", + "**4.** **Streamlining Customer Analysis and Targeting**\n", + "Creating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n", + "\n", + "**5.** **Assessing Consumer Interest Data**\n", + "Fine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n", + "\n", + "**6.** **Delivering Personalized Customer Journeys**\n", + "Crafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n", + "\n", + "**Conclusion**\n", + "Building a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "In today’s experience-driven world, the most beloved brands are the ones that\n", + "know their customers. Customers are loyal to brands that recognize their needs\n", + "and preferences — and tailor user journeys and engagements accordingly.\n", + "\n", + "A study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\n", + "buying from a brand that personalizes the shopping and user experience to the\n", + "wants and needs of the customer. And as organizations pursue omnichannel\n", + "excellence, these same high expectations of online experiences also extend to\n", + "brick-and-mortar locations — revealing for many merchants that personalized\n", + "engagement is fundamental to attracting customers and expanding share of wallet.\n", + "\n", + "But achieving a 360-degree view of your customers to serve personalized\n", + "experiences requires integrating various types of data — including demographics,\n", + "behavioral and transactional — to develop robust profiles. 
This guide focuses on six actionable strategic pillars for businesses to leverage automation, real-time data, AI-driven analysis and well-tuned ML models to architect and deliver customized customer experiences at every touch point.

# 76%

of consumers are more likely to purchase due to personalization

-----

### Building a Foundation for Personalization

Get a 360-degree view of the customer by leveraging ML-based entity resolution

To create truly personalized interactions, you need actionable insights about your customers. Start by establishing a common customer profile and accurately linking together customer records across disparate data sets.

Get a 360-degree view of your target customer by bringing together:

- Sales and traffic-driven first-party data

- Product ratings and surveys

- Customer surveys and support center calls

- Third-party data purchased from data aggregators and online trackers

- Zero-party data provided by customers themselves

A Customer 360 profile draws on sources such as demographics, orders, billing, devices, network/usage, social, apps/clickstream, service call records and location data.

**C A S E S T U DY**

**Personalizing experiences with data and ML**

Grab is the largest online-to-offline platform in Southeast Asia and has generated over 6 billion transactions for transport, food and grocery delivery, and digital payments. Grab uses Databricks to create sophisticated customer segmentation and recommendation engines that can now ingest and optimize thousands of user-generated signals and data sources simultaneously, enhancing data integrity and security, and reducing weeks of work to only hours.

[Get the full story](https://www.databricks.com/customers/grab)

“The C360 platform empowered teams to create consumer features at scale, which in turn allows for these features to be extended to other markets and used by other teams. This helps to reduce the engineering overhead and costs exponentially.”

**Nikhil Dwarakanath**
Head of Analytics, Grab

-----

Given the different data sources and data types, automated matching can still be incredibly challenging due to inconsistent formats, misinterpretation of data, and entry errors across various systems.
And even if inconsistent, all that data\n", + "may be perfectly valid — but to accurately connect the millions of customer\n", + "identities most retailers manage, businesses must lean on automation.\n", + "\n", + "In a machine learning (ML) approach to entity resolution, text attributes like\n", + "name, address and phone number are translated into numerical representations\n", + "that can be used to quantify the degree of similarity between any two attribute\n", + "values. But your ability to train such a model depends on your access to\n", + "accurately labeled training data. It’s a time-consuming exercise, but if done right,\n", + "the model learns to reflect the judgments of the human reviewers.\n", + "\n", + "Many organizations rely on libraries encapsulating this knowledge to build their\n", + "applications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\n", + "bringing together ML-based approaches to intelligent candidate pair generation\n", + "and pair-scoring. Oriented toward the construction of custom workflows, Zingg\n", + "presents these capabilities within the context of commonly employed steps\n", + "such as training data label assignment, model training, data set deduplication,\n", + "and (cross-data set) record matching.\n", + "\n", + "Built as a native Apache Spark TM application, Zingg scales well to apply these\n", + "techniques to enterprise-sized data sets. Organizations can then use Zingg in\n", + "combination with platforms such as Databricks Lakehouse to provide the back\n", + "end to human-in-the-middle workflow applications that automate the bulk of\n", + "the entity resolution work and present data experts with a more manageable\n", + "set of edge case pairs to interpret.\n", + "\n", + "\n", + "As an active-learning solution, models can be retrained to take advantage of\n", + "this additional human input to improve future predictions and further reduce\n", + "the number of cases requiring expert review. Finally, these technologies can be\n", + "assembled to enable their own enterprise-scaled customer entity resolution\n", + "workflow applications.\n", + "\n", + "**Need help building your foundation for a**\n", + "**360-degree view of your customers?**\n", + "\n", + "Get pre-built code sample data and step-by-step instructions\n", + "in a Databricks notebook in the **Customer Entity Resolution**\n", + "**Solution Accelerator.**\n", + "\n", + "**•** Translating text attributes (like name, address, phone number)\n", + "into quantifiable numerical representations\n", + "\n", + "**•** Training ML models to determine if these numerical labels\n", + "form a match\n", + "\n", + "**•** Scoring the confidence of each match\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-entity-resolution)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Estimating Customer Lifetime Value\n", + "\n", + "Building brand loyalty to drive share of wallet with data\n", + "\n", + "\n", + "Once you’ve set up a 360-degree view of the customer, the next challenge\n", + "is how to spend money to profitably grow the brand. The goal is to spend\n", + "marketing dollars on activities that attract loyal customers and avoid spending on\n", + "unprofitable customers or activities that damage the brand. Keep in mind, that\n", + "making decisions solely based on ROI isn’t the answer. 
This one-track approach\n", + "could ultimately weaken your brand equity and make you more dependent on\n", + "lowering your price through promotions as a way to generate sales.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "\n", + "**Identifying and engaging brand loyalists**\n", + "\n", + "Today’s customer has overwhelmingly abundant options in products and\n", + "services to choose from. That’s why personalizing customer experiences is so\n", + "important, as it increases revenue, marketing efficiency and customer retention.\n", + "\n", + "Not every customer carries the same potential for profitability. Different\n", + "customers derive different value from your products and services, which directly\n", + "translates into differences in the overall amount of value a business can expect\n", + "in return. Mutually beneficial relationships carefully align customer acquisition\n", + "cost (CAC) and retention rates with the total revenue or customer lifetime value\n", + "(CLV).\n", + "\n", + "\n", + "**Predicting and increasing customer lifetime value with ML**\n", + "\n", + "\n", + "Kolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\n", + "attracts over 10 million monthly active users. With Databricks, they\n", + "achieved a 30% increase in player LTV, improved data team productivity\n", + "by 3x, and reduced ML model-to-production time by 40x.\n", + "\n", + "[Get the full story](https://databricks.com/customers/kolibri-games)\n", + "\n", + "Within your existing customer base are people ranging from brand loyalists to\n", + "brand transients. Brand loyalists are highly engaged with your brand, are willing\n", + "to share their experience with others, and are the most likely to purchase\n", + "again. Brand transients have no loyalty to your brand and shop based on price.\n", + "Your focus should be on growing the group of brand loyalists while minimizing\n", + "interactions with brand transients.\n", + "\n", + "\n", + "**Calculating customers’ lifetime intent**\n", + "\n", + "To assess the remaining lifetime in a customer relationship, businesses must\n", + "\n", + "carefully examine the transactional signals and other indicators from previous\n", + "customer engagements and transactions.\n", + "\n", + "For example, if a frequent customer slows down their buying habits — or simply\n", + "doesn’t make a purchase for an extended period of time — it may signal the\n", + "upcoming end of the relationship. However, in the case of another customer\n", + "who engages infrequently, the same extended absence may not signal anything\n", + "notable. The infrequent buyer may continue to purchase even after a long pause\n", + "in activity.\n", + "\n", + "\n", + "-----\n", + "\n", + "Customer A\n", + "\n", + "Customer B\n", + "\n", + "Customer C\n", + "\n", + "\n", + "Past Future\n", + "\n", + "Different customers with the same number of transactions, but signaling different lifetime intent. The probability of re-engagement (P_alive) relative to a customer’s history of purchases.\n", + "\n", + "\n", + "Every customer relationship with a business has a lifespan. Understanding what\n", + "point in the lifespan at a given time provides critical insight to inform marketing\n", + "and sales tactics. By proactively discovering shifts in the relationship, you can\n", + "adapt how to respond to each customer at the optimal time. 
For example, a\n", + "certain signal might prompt a change in how to deliver products and services,\n", + "which could help maximize revenue.\n", + "\n", + "Transactional signals can be used to estimate the probability that a customer\n", + "is active and likely to return in the future. Popularized as the Buy ’til You Die\n", + "(BTYD) model, analysts can compare a customer’s frequency and recency of\n", + "\n", + "engagement to similar patterns across their user population to accurately\n", + "predict individual CLV.\n", + "\n", + "\n", + "The mathematics behind these predictive CLV models is complex, but the logic\n", + "behind these critical models is accessible through a popular Python library\n", + "named Lifetimes, which allows the input of simple summary metrics in order to\n", + "derive customer-specific lifetime estimates.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**How personalized experiences keep customers coming**\n", + "**back for more**\n", + "\n", + "Publicis Groupe empowers brands to transform retail experiences with\n", + "digital technologies, but data challenges and team silos stood in the\n", + "way of delivering the personalization that their customers required.\n", + "See how they use Databricks to create a single customer view that\n", + "allows them to drive customer loyalty and retention. As a result, they’ve\n", + "seen a 45%–50% increase in customer campaign revenue.\n", + "\n", + "[Get the full story](https://databricks.com/customers/publicis-groupe)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering customer lifetime estimates to the business**\n", + "\n", + "\n", + "Spark natively distributes this work across a multi-server environment, enabling\n", + "consistent, accurate and efficient analysis. Spark’s flexibility allows models to\n", + "adapt in real time as new information is ingested, eliminating the bottlenecks\n", + "that come with manual data mapping and profile building.\n", + "\n", + "With per customer metrics calculated, the Lifetimes library can be used to train\n", + "multiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\n", + "predict engagements over time using proprietary data can take several months\n", + "and thousands of training runs. [Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\n", + "businesses tap into the infrastructure behind their Spark environments and\n", + "distribute the training outputs across models.\n", + "\n", + "\n", + "Using the Lifetimes library to calculate customer-specific probabilities at speed\n", + "and scale can be challenging — from processing large volumes of transaction\n", + "data to deriving data curves and value distribution patterns and, eventually,\n", + "to integration with business initiatives. 
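As an illustration of the library’s core API, the sketch below fits a BG/NBD model on a small, made-up transaction set; the frequency, recency and age (T) inputs are the per customer metrics defined in the next section:

```python
# Illustrative only: fit a BG/NBD ("Buy 'til You Die") model with the lifetimes package.
import pandas as pd
from lifetimes import BetaGeoFitter
from lifetimes.utils import summary_data_from_transaction_data

transactions = pd.DataFrame({               # made-up transaction history
    "customer_id": [1, 1, 1, 2, 2, 3],
    "order_date": ["2022-01-05", "2022-02-11", "2022-04-02",
                   "2022-01-20", "2022-05-30", "2022-03-15"],
})

rfm = summary_data_from_transaction_data(
    transactions, customer_id_col="customer_id", datetime_col="order_date"
)

bgf = BetaGeoFitter(penalizer_coef=0.01)
bgf.fit(rfm["frequency"], rfm["recency"], rfm["T"])

# Probability each customer is still "alive" and expected purchases in the next 30 days.
rfm["p_alive"] = bgf.conditional_probability_alive(rfm["frequency"], rfm["recency"], rfm["T"])
rfm["expected_30d"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    30, rfm["frequency"], rfm["recency"], rfm["T"]
)
print(rfm)
```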
But with the proper approach, you can\n", + "resolve all of them.\n", + "\n", + "These models depend on three key per customer metrics:\n", + "\n", + "**FREQUENCY**\n", + "The number of times within a given time period in which a repeat\n", + "transaction is observed\n", + "\n", + "**AGE**\n", + "The length of time between the occurrence of an initial transaction\n", + "to the end of a given time period\n", + "\n", + "**RECENCY**\n", + "\n", + "The “age” of a customer (how long they’ve engaged with a brand)\n", + "at the time of their latest repeat transaction\n", + "\n", + "\n", + "-----\n", + "\n", + "**Solution deployment**\n", + "\n", + "\n", + "Once properly trained, these models can determine the probability that a\n", + "customer will re-engage, as well as the number of engagements a business\n", + "can expect from that customer over time. But the real challenge is putting\n", + "these predictive capabilities into the hands of those that determine\n", + "customer engagement.\n", + "\n", + "Matrices illustrating the probability a customer is alive (left) and the number of future\n", + "purchases in a 30-day window given a customer’s frequency and recency metrics (right).\n", + "\n", + "\n", + "Businesses need a way to develop and deploy solutions in a highly scalable\n", + "environment with a limited upfront cost. Databricks Solution Accelerators\n", + "leverage real-world sample data sets and pre-built code to show how raw data\n", + "can be transformed into real solutions — including step-by-step instructions\n", + "ready to go in a Databricks notebook.\n", + "\n", + "**Need help determining your customers’**\n", + "**lifetime value?**\n", + "\n", + "Use the **Customer Lifetime Value Accelerator** to\n", + "\n", + "**•** Ingest sample retail data\n", + "\n", + "**•** Use pre-built code to develop visualizations and explore\n", + "past purchase behavior\n", + "\n", + "**•** Apply machine learning to predict the likelihood and\n", + "nature of future purchases\n", + "\n", + "**[GET THE ACCELERATOR](https://databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Mitigating Customer Churn\n", + "\n", + "Balancing acquisition and retention with personalized experiences\n", + "\n", + "\n", + "There are no guarantees of success. With a bevy of options at their disposal,\n", + "customer churn is a reality that companies face and are focused on overcoming\n", + "every day. One [recent analysis](https://info.recurly.com/annual-subscription-billling-metrics-report?submissionGuid=3c21cde7-5f58-4d86-9218-332d697e7b3e) of consumer-oriented subscription services\n", + "estimated a segment average 7.2% monthly rate of churn. When narrowed to\n", + "brands focused on consumer goods, that rate jumped to 10.0%. This figure\n", + "translates to a lifetime of 10 months for the average subscription box service,\n", + "leaving businesses of this kind with little time to recover acquisition costs and\n", + "bring subscribers to net profitability.\n", + "\n", + "**C A S E S T U DY**\n", + "##### Riot Games\n", + "\n", + "**Creating an optimal in-game experience for League of Legends**\n", + "\n", + "Riot Games is one of the top PC game developers in the world, with over\n", + "100 million monthly active users, 500 billion data points, and over 26\n", + "petabytes of data and counting. 
They turned to Databricks to build a more\n", + "\n", + "efficient and scalable way to leverage data and improve the overall gaming\n", + "experience — ensuring customer engagement and reducing churn.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/riot-games)\n", + "\n", + "Organizations must take an honest look at the cost of acquisition relative to a\n", + "customer’s lifetime value (LTV) earned. These figures need to be brought into a\n", + "\n", + "healthy balance and treated as a “chronic condition” [to be managed.](https://retailtouchpoints.com/features/trend-watch/can-subscription-retail-solve-its-customer-retention-problem)\n", + "\n", + "\n", + "**Understanding attrition predictability through subscriptions:**\n", + "**Examining retention-based acquisition variables**\n", + "\n", + "Public data for subscription services is extremely hard to come by. KKBox, a\n", + "Taiwan-based music streaming service, recently released over two years of\n", + "anonymized [subscription data](https://www.kaggle.com/c/kkbox-churn-prediction-challenge) to examine customer churn. Through analyzing\n", + "the data, we uncover customer dynamics familiar to any subscription provider.\n", + "\n", + "Most subscribers join the KKBox service through a 30-day trial offer. Customers\n", + "then appear to enlist in one-year subscriptions, which provide the service with\n", + "a steady flow of revenue. Subscribers typically churn at the end of the 30-day\n", + "trial and at regular one-year intervals.\n", + "\n", + "The Survival Rate reflects the proportion of the initial (Day 1) subscriber population that is\n", + "retained over time, first at the roll-to-pay milestone, and then at the renewal milestone.\n", + "\n", + "\n", + "-----\n", + "\n", + "By Initial Payment Method\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers registering via different payment methods.\n", + "\n", + "By Initial Payment Plan Days\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers selecting different initial payment methods and terms/days.\n", + "\n", + "\n", + "This pattern of high initial drop-off, followed by a period of slower but continuing\n", + "drop-off cycles makes intuitive sense. Where it gets interesting is when the\n", + "data changes. The patterns of customer churn become vastly different as time\n", + "passes and new or changing elements are introduced (e.g., payment methods\n", + "and options, membership tiers, etc.).\n", + "\n", + "By Registration Channel\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers registering via different channels.\n", + "\n", + "\n", + "-----\n", + "\n", + "These patterns seem to indicate that KKBox _could_ potentially differentiate\n", + "between customers based on their lifetime potential, using only the information\n", + "available at subscriber acquisition. In the same way, non-subscription businesses\n", + "could use similar data techniques to get an accurate illustration of the total\n", + "lifetime value of a particular customer, even before collecting historical data.\n", + "\n", + "This information can help businesses target certain shoppers with effective\n", + "discounts or promotions as early as trial registration. 
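Retention curves like the ones described here can be estimated with standard survival-analysis tooling; the sketch below uses the open source lifelines package on a tiny, made-up subscription sample:

```python
# Illustrative only: estimate a retention (survival) curve from subscription data.
import pandas as pd
from lifelines import KaplanMeierFitter

subs = pd.DataFrame({
    "days_active": [30, 30, 365, 400, 45, 730],
    "churned":     [1,  1,  1,   0,   1,  0],   # 0 = still subscribed (censored)
})

kmf = KaplanMeierFitter()
kmf.fit(durations=subs["days_active"], event_observed=subs["churned"])

# Proportion of the day-1 population retained over time (the survival rate).
print(kmf.survival_function_.head())
```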
Nevertheless, it’s always\n", + "important to consider more than individual data points.\n", + "\n", + "The baseline risk of customer attrition over a subscription lifespan.\n", + "\n", + "\n", + "The channel and payment method multipliers combine to explain a customer’s risk of attrition\n", + "at various points in time. The higher the value, the higher the proportional risk of churn in the\n", + "associated period.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Applying churn analytics to your data**\n", + "\n", + "This analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n", + "**2)** to paint a quantitative picture of the specific factors that explain that risk,\n", + "giving analysts a clearer understanding of what to focus on, what to ignore and\n", + "what to investigate further. The main challenge is organizing the input data.\n", + "\n", + "The data required to examine customer attrition may be scattered across\n", + "multiple systems, making an integrated analysis difficult. [Data lakes](https://databricks.com/discover/data-lakes/introduction) support\n", + "the creation of transparent, sustainable data processing pipelines that are\n", + "flexible, scalable and highly cost-efficient. Remember that **churn is a chronic**\n", + "**condition to be managed** , and attrition data should be periodically revisited to\n", + "maintain alignment between acquisition and retention efforts.\n", + "\n", + "**Need help predicting customer churn?**\n", + "\n", + "Use the **Subscriber Churn Prediction Accelerator** to analyze\n", + "behavioral data, identify subscribers with an increased risk of\n", + "cancellation, and predict attrition. Machine learning lets you\n", + "quantify a user’s likelihood to churn, identifying factors that\n", + "explain the risk.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Streamlining Customer Analysis and Targeting\n", + "\n", + "Creating efficient and highly targeted customer experiences with behavioral data\n", + "\n", + "\n", + "Effective targeting comes down to one fundamental element: the cost of\n", + "delivering a good or service relative to what a consumer is willing to pay.\n", + "\n", + "In the earliest applications of segmentation, manufacturers recognized that\n", + "specialized product lines targeting specific consumer groups could help\n", + "brands stand out against competitors.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Finding that special something every time**\n", + "\n", + "Pandora is a jewelry company with global reach. They built their master\n", + "consumer view (MCV) dashboard on the Databricks Lakehouse Platform,\n", + "giving them the insights necessary to deliver highly targeted messaging\n", + "and personalization — resulting in 80% growth in email marketing\n", + "success, a 50% increase in click-to-open rate across 65 million emails,\n", + "and 255M DKK (Danish Krone) in quarterly revenue.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/pandora)\n", + "\n", + "This mode of thinking extends beyond product development and into every\n", + "customer-oriented business function, requiring specific means of ideation,\n", + "production and delivery. The work put into segmentation doesn’t need to be\n", + "a gamble. Scrutinizing customers and testing responsiveness is an ongoing\n", + "process. 
Organizations must analyze and adapt to shifting markets, changing\n", + "consumer demand and evolving business objectives.\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Powering insight-driven dashboards to increase customer**\n", + "**acquisition**\n", + "\n", + "Bagelcode is a global game company with more than 50 million global\n", + "users. By using the Databricks Lakehouse Platform, they are now able to\n", + "support more diversified indicators, such as a user’s level of frequency\n", + "and the amount of time they use a specific function for each game,\n", + "enabling more well-informed responses. In addition, the company is\n", + "mitigating customer churn by better predicting gamer behavior and\n", + "providing personalized experiences at scale.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/bagelcode)\n", + "\n", + "“Thanks to Databricks Lakehouse, we can support\n", + "real-time business decision-making based on data\n", + "analysis results that are automatically updated on\n", + "an hourly and daily basis, even as data volumes have\n", + "increased by nearly 1,000 times.”\n", + "\n", + "**J O O H Y U N K I M**\n", + "Vice President, Data and AI, Bagelcode\n", + "\n", + "\n", + "-----\n", + "\n", + "A brand’s goal with segmentation should be to define a shared customer\n", + "perspective on customers, allowing the organization to engage users consistently\n", + "and cohesively. But any adjustments to customer engagement require careful\n", + "consideration of [organizational change concerns](https://www.researchgate.net/publication/45348436_Bridging_the_segmentation_theorypractice_divide) .\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Responding to global demand shifts with ease**\n", + "\n", + "Reckitt produces some of the world’s most recognizable and trusted\n", + "consumer brands in hygiene, health and nutrition. With Databricks\n", + "Lakehouse on Azure, they’re able to meet the needs of billions of\n", + "consumers worldwide by surfacing real-time, highly accurate, deep\n", + "customer insights, leading to a better understanding of trends and\n", + "demand, allowing them to provide best-in-class experiences in\n", + "every market.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/reckitt)\n", + "\n", + "\n", + "**A segmentation walk-through: Grocery chain promotions**\n", + "\n", + "A promotions management team for a large grocery chain is responsible for\n", + "running a number of promotional campaigns, each of which is intended to drive\n", + "greater overall sales. Today, these marketing campaigns include leaflets and\n", + "coupons mailed to individual households, manufacturer coupon matching,\n", + "in-store discounts and the stocking of various private-label alternatives to\n", + "popular national brands.\n", + "\n", + "Recognizing uneven response rates between households, the team is eager to\n", + "determine if customers might be segmented based on their responsiveness\n", + "to these promotions. They anticipate that such segmentation may allow the\n", + "promotions management team to better target individual households, driving\n", + "overall higher response rates for each promotional dollar spent.\n", + "\n", + "Using historical data from point-of-sale systems along with campaign\n", + "information from their promotions management systems, the team derives\n", + "a number of features that capture the behavior of various households with\n", + "regard to promotions. 
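A minimal sketch of the clustering step described next, using hypothetical promotion-response features and scikit-learn:

```python
# Illustrative only: cluster households on promotion-response features.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

households = pd.DataFrame({                       # hypothetical derived features
    "coupon_redemption_rate": [0.40, 0.05, 0.22, 0.01, 0.35],
    "leaflet_response_rate":  [0.30, 0.02, 0.18, 0.00, 0.25],
    "private_label_share":    [0.10, 0.55, 0.30, 0.60, 0.12],
})

features = StandardScaler().fit_transform(households)
kmeans = KMeans(n_clusters=2, n_init=10, random_state=42).fit(features)

households["segment"] = kmeans.labels_
print(households.groupby("segment").mean())       # profile each cluster's behavior
```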
Applying standard data preparation techniques, the data is organized for analysis. Using a variety of clustering algorithms, such as k-means and hierarchical clustering, the team settles on two potentially useful cluster designs.

-----

Overlapping segment designs separating households based on their responsiveness to various promotional offerings. Profiling of clusters to identify differences in behavior across clusters.

**Assessing results**

Comparing households by demographic factors not used in developing the clusters themselves, the team identifies some interesting patterns separating cluster members by age and other factors. While this information may be useful both in predicting cluster membership and in designing more effective campaigns targeted to specific groups of households, the team recognizes the need to collect additional demographic data before putting too much emphasis on these results.

With profiling, marketers can discern that the customer households in the highlighted example fall into two groups: those who are responsive to coupons and mailed leaflets, and those who are not. Further divisions show differing degrees of responsiveness to other promotional offers.

-----

**Need help segmenting your customers for more targeted marketing?**

Use the **Customer Segmentation Accelerator** and drive better purchasing predictions based on behaviors. Through sales data, campaigns and promotions systems, you can build useful customer clusters to effectively target various households with different promos and offers.

Age-based differences in cluster composition of behavior-based customer segments.

**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-segmentation)**

The results of the analysis now drive a dialog between the data scientists and the promotions management team. Based on initial findings, a revised analysis will be performed, focused on what appear to be the most critical features differentiating households, as a means to simplify the cluster design and evaluate overall cluster stability. Subsequent analyses will also examine the revenue generated by various households to understand how changes in promotional engagement may impact customer spending.

Using this information, the team believes they will have the ability to make a case for change to upper management. Should a change in promotions targeting be approved, the team makes plans to monitor household spending, promotions spend and campaign responsiveness rates using much of the same data used in this analysis. This will allow the team to assess the impact of these efforts and identify when the segmentation design needs to be revisited.

-----

#### Assessing Consumer Interest Data to Inform Engagement Strategies

Fine-tuning ML recommendations to boost conversions

Personalization is a [journey](https://www.bcg.com/publications/2021/the-fast-track-to-digital-marketing-maturity).
To operationalize personalized experiences, it’s\n", + "important to identify high-value audiences who have the highest likelihood of\n", + "specific actions. Here’s where **propensity scoring** comes in.\n", + "\n", + "Specifically, this process allows companies to estimate customers’ potential\n", + "receptiveness to an offer or to content related to a subset of products, and\n", + "determine which messaging to apply. Calculating propensity scores requires\n", + "assessment of past interactions and data points (e.g., frequency of purchases,\n", + "percentage of spend associated with a particular product category, days since\n", + "last purchase and other historical data).\n", + "\n", + "Databricks provides critical capabilities for propensity scoring (like the Feature\n", + "Store, AutoML and MLflow) to help businesses answer three key considerations\n", + "and develop a robust process:\n", + "\n", + "**1.** How to maintain the significant number of features used\n", + "to train propensity models\n", + "\n", + "**2.** How to rapidly train models aligned with new campaigns\n", + "\n", + "**3.** How to rapidly re-deploy models, retrained as customer\n", + "patterns drift, into the scoring pipeline\n", + "\n", + "**Boosting model training efficiency**\n", + "\n", + "With the [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) , data scientists can easily reuse features\n", + "created by others.\n", + "\n", + "\n", + "The feature store is a centralized repository that enables the persistence,\n", + "discovery and sharing of features across various model training exercises.\n", + "As features are captured, lineage and other metadata are captured. Standard\n", + "security models ensure that only permitted users and processes may\n", + "employ these features, enforcing the organization’s data access policies on\n", + "data science processes.\n", + "\n", + "**Extracting the complexities of ML**\n", + "\n", + "[Databricks AutoML](https://docs.databricks.com/applications/machine-learning/automl.html) allows you to quickly generate models by leveraging industry\n", + "best practices. As a glass box solution, AutoML first generates a collection of\n", + "notebooks representing various aligned model variations. In addition to iteratively\n", + "training models, AutoML allows you to access the notebooks associated with each\n", + "model, creating an editable starting point for further exploration.\n", + "\n", + "**Streamlining the overall ML lifecycle**\n", + "\n", + "[MLflow](https://docs.databricks.com/applications/mlflow/index.html) is an open source machine learning model repository, managed within the\n", + "Databricks Lakehouse. This repository enables tracking and analysis of the various\n", + "model iterations generated by both AutoML and custom training cycles alike.\n", + "\n", + "When used in combination with the Databricks Feature Store, models persisted\n", + "with MLflow can retain knowledge of the features used during training. 
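For illustration, persisting such a model with the open source MLflow API so that its expected feature schema travels with it might look like the sketch below. The dataset, feature names and metric are hypothetical, and on Databricks the Feature Store client provides a managed way to capture this feature lineage:

```python
# Minimal sketch (not the accelerator's code): log a propensity model together
# with a signature describing the features it expects at scoring time.
import mlflow
import pandas as pd
from mlflow.models import infer_signature
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Hypothetical feature table: one row per customer, label = responded to offer.
df = pd.read_parquet("propensity_features.parquet")
features = ["days_since_last_purchase", "purchase_frequency_90d", "category_spend_share"]
X_train, X_test, y_train, y_test = train_test_split(
    df[features], df["responded"], test_size=0.2, random_state=0
)

with mlflow.start_run(run_name="promo_propensity"):
    model = GradientBoostingClassifier().fit(X_train, y_train)
    signature = infer_signature(X_train, model.predict_proba(X_test)[:, 1])
    mlflow.log_metric("test_accuracy", model.score(X_test, y_test))
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        signature=signature,
        input_example=X_train.head(5),
    )
```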
As models are retrieved, this same information allows the model to retrieve relevant features from the Feature Store, greatly simplifying the scoring workflow and enabling rapid deployment.

-----

**How to build a propensity scoring workflow with Databricks**

Using these features in combination, many organizations implement propensity scoring as part of a three-part workflow:

**1.** Data engineers work with data scientists to define features relevant to the propensity scoring exercise and persist these to the Feature Store. Daily or even real-time feature engineering processes are then defined to calculate up-to-date feature values as new data inputs arrive.

**2.** As part of the inference workflow, customer identifiers are presented to previously trained models in order to generate propensity scores based on the latest features available. Feature Store information captured with the model allows data engineers to retrieve these features and easily generate the desired scores, which can then be used for analysis within Databricks Lakehouse or published to downstream marketing systems.

**3.** In the model-training workflow, data scientists periodically retrain the propensity score models to capture shifts in customer behaviors. As these models are persisted to MLflow, change management processes are used to evaluate and elevate those models that meet organizational criteria to production status. In the next iteration of the inference workflow, the latest production version of each model is retrieved to generate customer scores.

[Figure: a three-part propensity scoring workflow in which Sales, Promotions and Customer data feed a Feature Engineering ETL that maintains Feature Store profiles, which in turn support Model Training and Deployment and a Score Generation and Publication ETL.]

**Need help assessing interest from your target audience?**

Use the **Propensity Scoring Accelerator** to estimate customers' potential receptiveness to an offer or to content related to a subset of products.
Using these scores,\n", + "marketers can determine which of the many messages at\n", + "their disposal should be presented to a specific customer.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n", + "\n", + "\n", + "Downstream\n", + "Applications\n", + "\n", + "\n", + "A three-part propensity scoring workflow.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Delivering Personalized Customer Journeys\n", + "\n", + "Strategies for crafting a real-time recommendation engine\n", + "\n", + "\n", + "As the economy continues to weather unpredictable disruptions, shortages and\n", + "demand, delivering personalized customer experiences at speed and scale will\n", + "require adaptability on the ground and within a company’s operational tech stack.\n", + "\n", + "\n", + "With the Databricks Lakehouse, Al-Futtaim has transformed their data\n", + "strategy and operations, allowing them to create a “golden customer\n", + "record” that improves all decision-making from forecasting demand to\n", + "powering their global loyalty program.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/al-futtaim)\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "\n", + "“Databricks Lakehouse allows every division in our\n", + "organization — from automotive to retail — to gain\n", + "a unified view of our customer across businesses.\n", + "With these insights, we can optimize everything from\n", + "forecasting and supply chain, to powering our loyalty\n", + "program through personalized marketing campaigns,\n", + "cross-sell strategies and offers.”\n", + "\n", + "**D M I T R I Y D O V G A N**\n", + "Head of Data Science, Al-Futtaim Group\n", + "\n", + "As COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\n", + "safety and community, brands most attuned to changing needs and sentiments\n", + "saw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\n", + "business and many lost, organizations that had already begun the journey toward\n", + "improved customer experience saw better outcomes, closely mirroring patterns\n", + "[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\n", + "\n", + "\n", + "**Creating a unified view across 200+ brands**\n", + "\n", + "As a driving force for economic growth in the Middle East, Al-Futtaim\n", + "impacts the lives of millions of people across the region through the\n", + "distribution and operations of global brands like Toyota, IKEA, Ace\n", + "Hardware and Marks & Spencer.\n", + "\n", + "Al-Futtaim’s focus is to harness their data to improve all areas of the\n", + "business, from streamlining the supply chain to optimizing marketing\n", + "strategies. 
But with the brands capturing such a wide variety of data,\n", + "Al-Futtaim’s legacy systems struggled to provide a single view into\n", + "the customer due to data silos and the inability to scale efficiently to\n", + "meet analytical needs.\n", + "\n", + "\n", + "-----\n", + "\n", + "The personalization of customer experiences will remain a key focus for B2C\n", + "and [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\n", + "experience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\n", + "long-established players.\n", + "\n", + "**Focus on the customer journey**\n", + "\n", + "Personalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\n", + "The [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\n", + "how they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\n", + "[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Personalizing the beauty product shopping experience**\n", + "\n", + "Flaconi wanted to leverage data and AI to become the No. 1 online\n", + "beauty product destination in Europe. However, they struggled with\n", + "massive volumes of streaming data and with infrastructure complexity\n", + "that was resource-intensive and costly to scale. See how they used\n", + "Databricks to increase time-to-market by 200x, reduce staff costs by\n", + "40% and increase net order income.\n", + "\n", + "Get the full story\n", + "\n", + "\n", + "¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\n", + "Experience Performance Index in 2007-09.\n", + "\n", + "Source: Forrester Customer Experience Performance Index (2007-09); press search\n", + "\n", + "CX leaders outperform laggards, even in a down market, in this visualization of the Forrester\n", + "Customer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\n", + "\n", + "\n", + "-----\n", + "\n", + "Careful consideration of how customers interact with various assets — and how\n", + "these interactions may be interpreted as expressions of preference — can unlock\n", + "a wide range of data that enables personalization.\n", + "\n", + "\n", + "The complexity of these engines requires that they be deployed thoughtfully, using\n", + "limited pilots and customer response assessments. 
And in those assessments,\n", + "it’s important to keep in mind that there is no expectation of perfection — only\n", + "incremental improvement over the prior solution.\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Need help generating personalized**\n", + "**recommendations?**\n", + "\n", + "\n", + "**Connecting shoppers to savings with data-driven**\n", + "**personalization‌**\n", + "\n", + "\n", + "Use the **Recommendation Engines Accelerator** to estimate\n", + "customers’ potential receptiveness to an offer or to\n", + "content related to a subset of products. Using these scores,\n", + "marketers can determine which of the many messages at\n", + "their disposal should be presented to a specific customer.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n", + "\n", + "\n", + "Flipp is an online marketplace that aggregates weekly shopping circulars,\n", + "so consumers get deals and discounts without clipping coupons. Siloed\n", + "customer data sources once made getting insights difficult. Now with\n", + "Databricks, Flipp’s data teams can access and democratize data, helping\n", + "them do their jobs more effectively while bringing better deals to users,\n", + "more meaningful insights to partners, and a 10% jump in foot traffic to\n", + "brick-and-mortar retailers.\n", + "\n", + "Get the full story\n", + "\n", + "The engines we use to serve content based on customer preferences are known\n", + "as recommenders. With some recommenders, a heavy focus on the shared\n", + "preferences of similar customers helps define what recommendations will actually\n", + "make an impact. With others, it can be more useful to focus on the properties of\n", + "the content itself (e.g., product descriptions).\n", + "\n", + "\n", + "-----\n", + "\n", + "### Building a Direct Path to Winning the Minds and Wallets of Your Customers\n", + "\n", + "\n", + "Providing deep, effective personalized experiences to customers depends\n", + "on a brand’s ability to intelligently leverage consumer and market data from a\n", + "wide variety of sources to fuel faster, smarter decisions — without sacrificing\n", + "accuracy for speed. The Databricks Lakehouse Platform is purpose-built for\n", + "exactly that, offering a scalable data architecture that unifies all your data,\n", + "analytics and AI to deliver unforgettable customer experiences.\n", + "\n", + "Created on open source and open standards, Databricks offers a robust\n", + "and cost-effective platform for brands to collaborate with partners, clients,\n", + "manufacturers and distributors to unleash more innovation and efficiencies\n", + "at every touch point. 
Businesses can rapidly ingest available data in real time,\n", + "\n", + "\n", + "at scale, and create accessible, data-driven insights that enable actionable\n", + "strategies across the value chain.\n", + "\n", + "Databricks is a multicloud platform, designed for quick enterprise development.\n", + "Teams using the Lakehouse can more effectively reveal the 360-degree view into\n", + "their company’s operational health and the evolving needs of their customers\n", + "— all while empowering teams to easily unify data efforts, perform fine-grained\n", + "analyses and streamline cross-functional data operations using a single,\n", + "sophisticated solution.\n", + "\n", + "\n", + "###### Learn more about Databricks Lakehouse for industries\n", + " like Retail & Consumer Goods, Media & Entertainment\n", + " and more at databricks.com/solutions\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide —\n", + "\n", + "including Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n", + "\n", + "the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n", + "\n", + "is headquartered in San Francisco, with offices around the globe. Founded by\n", + "\n", + "the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n", + "\n", + "a mission to help data teams solve the world’s toughest problems. To learn more,\n", + "\n", + "follow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=563736421186&utm_term=databricks%20free%20trial&gclid=Cj0KCQjwpeaYBhDXARIsAEzItbHzQGCu2K58-lnVCepMI5MYP6jTXkgfvqmzwAMqrlVwVOniebOE43UaAk3OEALw_wcB)**\n", + "\n", + "##### Contact us for a personalized demo databricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
#### eBook\n", + "\n", + "# Big Book of Retail\n", + " & Consumer Goods Use Cases\n", + "\n", + "##### Driving real-time decisions\n", + " with the Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents (1/2) C H A P T E R 1 : \u0007 Introduction 4\n", + "\n", + "**C H A P T E R 2 :** \u0007 **Modern Data Platform for Real-Time Retail** 6\n", + "\n", + "Common challenges 6\n", + "\n", + "The Lakehouse for Retail 8\n", + "\n", + "**C H A P T E R 3 :** **Use Case: Real-Time Supply Chain Data** \u0007 12\n", + "\n", + "Case Study: Gousto 14\n", + "\n", + "Case Study: ButcherBox 14\n", + "\n", + "**C H A P T E R 4 :** \u0007 **Use Case: Truck Monitoring** 15\n", + "\n", + "Case Study: Embark 16\n", + "\n", + "**C H A P T E R 5 :** **Use Case: Inventory Allocation** \u0007 17\n", + "\n", + "Case Study: H&M 19\n", + "\n", + "Case Study: Edmunds 19\n", + "\n", + "**C H A P T E R 6 :** **Use Case: Point of Sale and Clickstream** \u0007 20\n", + "\n", + "**C H A P T E R 7 :** **Use Case: On-Shelf Availability** \u0007 22\n", + "\n", + "Case Study: Reckitt 25\n", + "\n", + "**C H A P T E R 8 :** **�Use Case: Customer and Vehicle Identification** 26\n", + "\n", + "**C H A P T E R 9 :** \u0007 **Use Case: Recommendation Engines** 28\n", + "\n", + "Case Study: Wehkamp 31\n", + "\n", + "Case Study: Columbia 31\n", + "\n", + "Case Study: Pandora 31\n", + "\n", + "**C H A P T E R 1 0 :** \u0007 **Use Case: Perpetual Inventory** 32\n", + "\n", + "**C H A P T E R 1 1 :** \u0007 **Use Case: Automated Replenishments** 34\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents (2/2) C H A P T E R 1 2 : \u0007 Use Case: Fresh Food Forecasting 36\n", + "\n", + "Case Study: ButcherBox 37\n", + "\n", + "Case Study: Sam’s Club 37\n", + "\n", + "**C H A P T E R 1 3 :** \u0007 **Use Case: Propensity-to-Buy** 38\n", + "\n", + "**C H A P T E R 1 4 :** \u0007 **Use Case: Next Best Action** 41\n", + "\n", + "**C H A P T E R 1 5 :** **Customers That Innovate With Databricks Lakehouse for Retail** \u0007 43\n", + "\n", + "**C H A P T E R 1 6 :** \u0007 **Conclusion** 43\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "### Introduction\n", + "\n", + "\n", + "Retailers are increasingly being challenged to make time-sensitive decisions in their operations. Consolidating\n", + "\n", + "e-commerce orders. Optimizing distribution to ensure item availability. Routing delivery vehicles. These\n", + "\n", + "decisions happen thousands of times daily and have a significant financial impact. Retailers need real-time data\n", + "\n", + "to support these decisions, but legacy systems are limited to data that’s hours or days old.\n", + "\n", + "**When seconds matter, only the Lakehouse delivers better decisions**\n", + "\n", + "Retail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n", + "\n", + "The integration of physical and e-commerce customer experiences into an omnichannel journey has been\n", + "\n", + "happening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n", + "\n", + "purchasing patterns.\n", + "\n", + "In reaction to these industry changes, retailers have responded with significant, rapid investments — including\n", + "\n", + "stronger personalization, order fulfillment, and delivery and loyalty systems. 
While these new targeted\n", + "\n", + "capabilities have addressed the immediate need — and created expectations of making decisions in real\n", + "\n", + "time — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n", + "\n", + "Unfortunately, most legacy systems are only able to process information in hours or days.\n", + "\n", + "The delays caused by waiting for data are leading to significant risks and costs for the industry.\n", + "\n", + "**Grocers** need to consolidate order picking to achieve profitability in e-commerce, but this requires up-to-\n", + "\n", + "the-minute order data. Not having this information causes them to spend more resources on having people\n", + "\n", + "pick orders separately, at a higher operating cost.\n", + "\n", + "**Apparel retailers** must be able to present the correct available inventory on their website. This requires\n", + "\n", + "that in-store sales be immediately reflected in their online systems. Inaccurate information can lead to lost\n", + "\n", + "sales, or worse, the customer becoming unsatisfied and moving to different retailers.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Convenience fuel retailers** must collaborate with distribution centers, direct-to-store delivery distributors\n", + "\n", + "and other partners. Having delayed data can lead to out-of-stocks, costing stores thousands of dollars per\n", + "\n", + "week.\n", + "\n", + "The margin of error in retail has always been razor thin, but with a pandemic and inflationary pressures, it’s at\n", + "\n", + "zero. Reducing the error rate requires better predictions and real-time data.\n", + "\n", + "**Use Case Guide**\n", + "\n", + "In this use case guide, we show how the Databricks Lakehouse for Retail is helping leading organizations take\n", + "\n", + "**all of their data in a single lakehouse architecture, streamline their data engineering and management,**\n", + "\n", + "**make it ready for SQL and ML/AI** , and **do so very fast within their own cloud infrastructure environment**\n", + "\n", + "**based on open source and open standards** . These capabilities are all delivered at world-record-setting\n", + "\n", + "performance, while achieving a market-leading total cost of ownership.\n", + "\n", + "Databricks Lakehouse for Retail has become the industry standard for enabling retailers to drive decisions\n", + "\n", + "in real time. This use case guide also highlights common use cases across the industry, and offers additional\n", + "\n", + "resources in the form of Solution Accelerators and reference architectures to help as you embark on your own\n", + "\n", + "journey to drive better customer experiences with data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 2:**\n", + "### Modern Data Platform\n", + " for Real-Time Retail\n", + "\n", + "\n", + "Retailers continue to adapt to rapidly shifting dynamics across the omnichannel. 
In navigating these\n", + "\n", + "changes, retailers are increasingly focused on improving the real-time availability of data and insights, and\n", + "\n", + "performing advanced analytics delivered within tight business service windows.\n", + "\n", + "**Common challenges**\n", + "\n", + "In response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n", + "\n", + "in modernizing distribution centers, partnering with delivery companies, and investing in customer\n", + "\n", + "engagement systems.\n", + "\n", + "Warehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n", + "\n", + "distribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n", + "\n", + "that became accustomed to having fast, same-day, and sometimes even overnight delivery options\n", + "\n", + "during the pandemic now expect them as the norm. Retailers understand that the shipping and delivery\n", + "\n", + "experience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n", + "\n", + "## $41B Market | Retail Warehouse Automation\n", + "\n", + "Yet while retailers modernize different areas of their operations, they’re constrained by a single point of\n", + "\n", + "weakness, as they are reliant on legacy data platforms to bring together all of this data.\n", + "\n", + "Powering real-time decisions in modern retail requires real-time ingestion of data, transformation,\n", + "\n", + "governance of information, and powering business intelligence and predictive analytics all within the time\n", + "\n", + "required by retail operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Ingesting large volumes of transactional data in real time.** The biggest blocker to crucial insights\n", + "\n", + "is the ability to ingest data from transaction systems in real time. Transaction logs from point-of-sale\n", + "\n", + "systems, clickstreams, mobile applications, advertising and promotions, as well as inventory, logistics\n", + "\n", + "and other systems, are constantly streaming data. Big data sets need to be ingested, cleansed and\n", + "\n", + "aggregated and integrated with each other before they can be used. The problem? Retailers have used\n", + "\n", + "legacy data warehouses that are built around batch processing. And worse, increasing the frequency\n", + "\n", + "of how often data is processed leads to a “hockey stick” in costs. As a result of these limitations,\n", + "\n", + "merchants resort to ingesting data nightly to deal with the large volumes of data and integration with\n", + "\n", + "other data sets. The result? Accurate data to drive decisions can be delayed by days.\n", + "\n", + "**Performing fine-grained analysis at scale within tight time windows.** Retailers have accepted a\n", + "\n", + "trade-off when performing analysis. Predictions can be detailed and accurate, or they can be fast.\n", + "\n", + "Running forecasts or price models at a day, store and SKU level can improve accuracy by 10% or more,\n", + "\n", + "but doing so requires tens of millions of model calculations that need to be performed in narrow service\n", + "\n", + "windows. This is well beyond the capability of legacy data platforms. 
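To make "fine-grained" concrete: on the Lakehouse approach described below, those per-day, per-store, per-SKU models are typically trained in parallel by grouping the data and fitting one small model per group. Here is a minimal PySpark sketch of the idea, in which the table name, columns and the trivial moving-average stand-in model are illustrative assumptions rather than the Solution Accelerator's code:

```python
# Sketch only: fit one tiny "model" per (store, item) group in parallel.
# A real forecast would use a proper library (Prophet, statsmodels, etc.).
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
sales = spark.table("retail.daily_store_item_sales")  # hypothetical Delta table

result_schema = "store_id STRING, item_id STRING, forecast_next_day DOUBLE"

def forecast_group(pdf: pd.DataFrame) -> pd.DataFrame:
    pdf = pdf.sort_values("date")
    # Placeholder model: 28-day moving average as the next-day forecast.
    forecast = float(pdf["units_sold"].tail(28).mean())
    return pd.DataFrame({
        "store_id": [pdf["store_id"].iloc[0]],
        "item_id": [pdf["item_id"].iloc[0]],
        "forecast_next_day": [forecast],
    })

forecasts = (
    sales.groupBy("store_id", "item_id")
         .applyInPandas(forecast_group, schema=result_schema)
)
forecasts.write.mode("overwrite").saveAsTable("retail.store_item_forecasts")
```

Because each group is independent, the tens of millions of model calculations mentioned above parallelize naturally across a cluster.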
As a result of these legacy limitations, companies have been forced to accept the trade-off and live with less accurate predictions.

**Powering real-time decisions on the front line.** Data is only useful if it drives decisions, but serving real-time data to thousands of employees is a daunting task. While data warehouses are capable of serving reports to large groups of users, they're still limited to stale data. Most retailers limit the frequency of reports to daily or weekly updates and depend on the staff to use their best judgment for decisions that are more frequent.

**Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is focused on delivering personalized experiences throughout the omnichannel. Retailers have access to a trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation struggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to deliver personalized experiences at scale to win in retail.

-----

###### The Lakehouse for Retail

Databricks Lakehouse for Retail solves these core challenges. The Lakehouse unlocks the ability to unify all types of data — from images to structured data — in real time, provide enterprise-class management and governance, and then immediately turn that data into actionable insights with real-time reporting and predictive analytics. It does this with record-setting speed and industry-leading total cost of ownership (TCO) in a platform-as-a-service (PaaS) that allows customers to solve these pressing problems.

[Figure: all of your sources (competitive activity, e-commerce, mobile applications, video & images, point of sale, distribution & logistics, customer & loyalty, delivery & partners) arrive as structured, semi-structured and unstructured data, in batch and in real time, into the Data Lakehouse for data management and governance (process, manage and query all of your data), which serves ad hoc data science, production machine learning, BI reporting & dashboarding and real-time applications for internal teams, customers and partners, on any cloud. Panel titles: any structure or frequency; reliable, real-time processing; capabilities for any persona; data sharing & collaboration.]

-----

**Reference Architecture**

At the core of the Databricks Lakehouse for Retail is technology that enables retailers to avoid the trade-offs between speed and accuracy. Technology such as Delta Lake enables the Lakehouse — a new paradigm that combines the best elements of data warehouses and data lakes — to directly address these factors by enabling you to unify all of your data — structured and unstructured, batch and real-time — in one centrally managed and governed location. Once in the Lakehouse, e-commerce systems, reporting users, analysts, data scientists and data engineers can all leverage this information to serve models for applications and power real-time reporting, advanced analytics, large-scale forecasting models and more.

[Figure: reference architecture showing edge, hybrid and cloud sources replicated into raw (Bronze), clean (Silver) and refined (Gold) tables, with machine learning operations (tracking, registry, REST model serving) and business applications such as Power BI consuming the refined data in both batch and real time.]

-----

###### How it works

The Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends simplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is differentiated capabilities that help retailers win.

| | Data in real time | Use all of your data | Robust data management | Real-time reporting | Time-sensitive machine learning |
|---|---|---|---|---|---|
| **Legacy data warehouse** | **No.** Data warehouses are batch oriented, restricting data updates to hours or days. | **No.** Data warehouses have very limited support for unstructured data. | **Limited.** EDWs support the management of structured data. | **No.** EDWs offer quick access to reports on old data. | **No.** EDWs must extract data and send it to a third party for machine learning. |
| **Data lakes (Hadoop)** | **No.** Data lakes are batch oriented. | **Yes.** Data lakes offer support for all types of data. | **No.** Data lakes lack enterprise-class data management tools. | **No.** Data lakes were not designed for reporting, let alone real-time reporting. | **No.** Data lakes are able to support large analytics, but lack the ability to meet business SLAs. |
| **Lakehouse** | **Yes.** Support for real-time streaming data. | **Yes.** Supports all types of data in a centrally managed platform. | **Yes.** Delta and Unity Catalog offer native data management and governance of all data types. | **Yes.** Data views can be materialized, enabling front-line employees with real-time data. | **Yes.** The Lakehouse can scale to process the most demanding predictions within business SLAs. |

-----

**Data in real time.** Retail operates in real time and so should your data. The Lakehouse offers support for streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce and point-of-sale data. And Delta Lake enables this world-record-leading performance while maintaining support for ACID transactions.

**Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images and a growing variety of other data sources. This data is extremely powerful in helping to improve our understanding of consumer behavior and operations. The Lakehouse for Retail enables companies to take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse architecture.

**Robust data management and governance.** Companies need this to protect sensitive data, but it was lacking from earlier big data systems. The Lakehouse offers transactional integrity with ACID compliance, detailed data security, schema enforcement, time travel, data lineage and more. Moving to a modern data architecture does not require sacrificing enterprise maturity.

**High-performance predictive analytics.** Machine learning models, such as demand forecasting or recommendation engines, can be run in hours without compromising accuracy. The Lakehouse can scale to support tens of millions of predictions in tight windows, unlocking critical and time-sensitive analytics such as allocating inventory, optimizing load tenders and logistics, calculating item availability and out-of-stocks, and delivering highly personalized predictions.

**Value with Databricks**

By using Databricks to build and support your lakehouse, you can empower your business with even more speed, agility and cost savings. The flexibility of the Databricks Lakehouse Platform means that you can start with the use case that will have the most impact on your business. As you implement the pattern, you will find that you're able to tackle use cases quicker and more easily than before. To get you started, this guidebook contains the use cases we most commonly see across the Retail and Consumer Goods industry.

-----

**CHAPTER 3**
### Use Case: Real-Time Supply Chain Data

**Overview**

As companies see a surge in demand from e-commerce and delivery services, and seek increasing efficiencies with plant or distribution centers, real-time data is becoming a key part of the technical roadmap.
Real-time supply chain data allows customers to deal with problems as they happen and before\n", + "\n", + "items are sent downstream or shipped to consumers, which is the first step in enabling a supply chain\n", + "\n", + "control tower.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Manufacturers Distributors Logistics Restaurants\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is\n", + "\n", + "happening and when a customer can act on it\n", + "\n", + "**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies\n", + "\n", + "have the added pressure to take immediate action on it\n", + "\n", + "**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with the Databricks Lakehouse**\n", + "\n", + "Databricks has enabled real-time streaming of supply chain data across a variety of customers for specific\n", + "\n", + "plant operations or as part of a supply chain control tower.\n", + "\n", + "**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000%\n", + "\n", + "improvement in speed to data, with greater reliability\n", + "\n", + "**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers report\n", + "\n", + "that they were able to move from batch to real-time at neutral costs\n", + "\n", + "**�Simplified architecture and maintenance** — leveraging Delta for ingestion streamlines the pattern for\n", + "\n", + "real-time data ingestions. Customers frequently report that the amount of code required to support\n", + "\n", + "streaming ingestion is 50% less than previous solutions.\n", + "\n", + "**\u0007Immediate enablement of additional use cases** — customers can now prevent problems as they’re\n", + "\n", + "happening, predict and prevent issues, and even gain days on major changes such as production\n", + "\n", + "schedules between shifts\n", + "\n", + "**Solution overview**\n", + "\n", + "Databricks allows for both streaming and batch data sets to be ingested and made available to enable\n", + "\n", + "real-time supply chain use cases. Delta Lake simplifies the change data capture process while providing\n", + "\n", + "ACID transactions and scalable metadata handling, and unifying streaming and batch data processing. And\n", + "\n", + "Delta Lake supports versioning and enables rollbacks, full historical audit trails, and reproducible machine\n", + "\n", + "learning experiments.\n", + "\n", + "**Typical use case data sources include:**\n", + "\n", + "Supply planning, procurement, manufacturing execution, warehousing, order fulfillment, shop floor/historian\n", + "\n", + "data, IoT sensor, transportation management\n", + "\n", + "\n", + "-----\n", + "\n", + "**CASE STUDY**\n", + "\n", + "With Databricks, Gousto was able to implement real-time visibility in their supply chain. Gousto moved from\n", + "\n", + "daily batch updates to near real-time streaming data, utilizing Auto Loader and Delta Lake. 
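A minimal sketch of that ingestion pattern, with Auto Loader reading newly arriving files into a Bronze Delta table, might look like the following. Paths, formats and table names are illustrative assumptions, not Gousto's actual pipeline:

```python
# Sketch only: continuously ingest new supply chain event files into Delta.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

bronze_stream = (
    spark.readStream.format("cloudFiles")                                    # Auto Loader
         .option("cloudFiles.format", "json")
         .option("cloudFiles.schemaLocation", "/tmp/supply_chain/_schema")   # hypothetical path
         .load("/Volumes/ops/supply_chain/raw_events")                       # hypothetical path
)

(
    bronze_stream.writeStream
         .format("delta")
         .option("checkpointLocation", "/tmp/supply_chain/_checkpoint")      # hypothetical path
         .outputMode("append")
         .trigger(availableNow=True)  # or processingTime="1 minute" for an always-on stream
         .toTable("ops.supply_chain_bronze")                                 # hypothetical Delta table
)
```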
The platform\n", + "\n", + "provided by Databricks has allowed Gousto to respond to increased demand during the coronavirus\n", + "\n", + "outbreak by providing real-time insight into performance on the factory picking lines.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "As a young e-commerce company, ButcherBox needed to act nimbly to make the most of the data from its\n", + "\n", + "hundreds of thousands of subscribers. With Databricks Lakehouse, the company could pull 18 billion rows of\n", + "\n", + "data in under three minutes.\n", + "\n", + "Now, ButcherBox has a near real-time understanding of its customers, and can also act proactively to\n", + "\n", + "address any logistical and delivery issues.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 4**\n", + "### Use Case: Truck Monitoring\n", + "\n", + "\n", + "With many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n", + "\n", + "of trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n", + "\n", + "manner. Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n", + "\n", + "delays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n", + "\n", + "\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issues\n", + "\n", + "\u0007Not having effective automation and AI increases the risk of human error, which can result in vehicular\n", + "\n", + "accidents or shipment delays\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with the Databricks Lakehouse**\n", + "\n", + "Databricks empowers companies to get real-time insights into their fleet performance, from manufacturing\n", + "\n", + "to delivery.\n", + "\n", + "**Near real-time insights** — the greater speed to data means a quicker response to issues and the\n", + "\n", + "ability to monitor driver safety more immediately\n", + "\n", + "**Ability to scale** — although consumer demands are constantly evolving, Databricks can handle fleet\n", + "\n", + "expansion without sacrificing data quality and speed\n", + "\n", + "**Optimizing with AI/ML** — implementing AI and ML models can lead to more effective route monitoring,\n", + "\n", + "proactive maintenance and reduced risk of accidents\n", + "\n", + "**Solution overview**\n", + "\n", + "Databricks enables better truck monitoring, quickly ingesting data on everything from vehicle manufacturing\n", + "\n", + "to route optimization. 
This results in a more complete and real-time view of a company’s fleet, and these\n", + "\n", + "analytics provide companies with the tools they need to scale and improve their operations.\n", + "\n", + "**Typical use case data sources include:**\n", + "\n", + "Supply planning, transportation management, manufacturing, predictive maintenance\n", + "\n", + "**CASE STUDY**\n", + "\n", + "With 94% of vehicular accidents attributed to human error, Embark used the Databricks Lakehouse Platform\n", + "\n", + "to unlock thousands of hours of recorded data from its trucks and then collaboratively analyze that data\n", + "\n", + "via dashboards. This has resulted in more efficient ML model training as Embark speeds toward fully\n", + "\n", + "autonomous trucks.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 5**\n", + "### Use Case: Inventory Allocation\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Replenishment planning is the process of determining what needs to go where. It is used by replenishment\n", + "\n", + "planning, distributors and consumer goods companies performing vendor-managed replenishment (VMR) or\n", + "\n", + "vendor-managed inventory (VMI) to make daily decisions on which product needs to be sent to which store\n", + "\n", + "and on what day.\n", + "\n", + "Replenishment is challenging for companies because it deals with rapidly changing data and the need to\n", + "\n", + "make complex decisions on that data in narrow service windows. Retailers need to stream in real-time sales\n", + "\n", + "data to signal how much of a product has been sold in order. Inaccurate sales data leads to an insufficient\n", + "\n", + "number of products being sent to stores. This results in lost sales and low customer satisfaction.\n", + "\n", + "Inventory allocation is a process that might be performed multiple times a day during peak seasons, or\n", + "\n", + "daily during slower seasons. 
Companies need the ability to scale to perform tens of millions of predictions\n", + "\n", + "multiple times a day — on demand and dynamically — during peak season without paying a premium for\n", + "\n", + "this capability throughout the year.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Restaurants\n", + "\n", + "\n", + "-----\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Customers must complete tens of millions of inventory allocation predictions within tight time windows.\n", + "\n", + "This information is used to determine which products get put on trucks and go to specific stores.\n", + "\n", + "\u0007Traditional inventory allocation rules cause trade-offs in accuracy in order to calculate all possibilities in\n", + "\n", + "the service windows\n", + "\n", + "\u0007Legacy tools have rudimentary capabilities and have limited ability to consider flavors, sizes and other\n", + "\n", + "attributes that may be more or less popular by store\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Customers are able to complete inventory allocation models within SLAs with no trade-off for accuracy.\n", + "\n", + "\u0007 **Speed —** on average, customers moving to Databricks for demand forecasting report a double-digit\n", + "\n", + "improvement in forecast accuracy\n", + "\n", + "\u0007 **Ability to scale** and perform fine-grained (day, store, item) level allocations\n", + "\n", + "\u0007 **Provide more robust allocations** by incorporating causal factors that may increase demand, or include\n", + "\n", + "information on flavors or apparel sizes for specific stores\n", + "\n", + "**Solution overview**\n", + "\n", + "The objective of inventory allocation is to quickly determine when to distribute items and where — from\n", + "\n", + "warehouses and distribution centers to stores. Inventory allocation begins by looking at the consumption\n", + "\n", + "rate of products, the available inventory and the shipping schedules, and then using this information to\n", + "\n", + "create an optimized manifest of what items should be carried on which trucks, at what point, and at what\n", + "\n", + "time. This becomes the plan for route accounting systems that arrange deliveries.\n", + "\n", + "Inventory allocation also deals with trade-offs related to scarcity of items. If an item has not been available\n", + "\n", + "in a store for a long time, that store may receive heightened priority for the item in the allocation.\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "\n", + "**Typical use case data sources include:** point of sale, digital sales, replenishment data, modeled safety\n", + "\n", + "stock, promotions data, weather\n", + "\n", + "**View our webinar covering demand forecasting with Starbucks and then read our blog about**\n", + "\n", + "**demand forecasting.**\n", + "\n", + "**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n", + "\n", + "Our most popular notebook at Databricks. 
This blog walks you through the business and technical\n", + "\n", + "challenges of performing demand forecasting and explains how we approached solving it.\n", + "\n", + "**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n", + "\n", + "Video and Q&A from our webinar with Starbucks.\n", + "\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "H&M turned to the Databricks Lakehouse Platform to simplify its infrastructure management, enable\n", + "\n", + "performant data pipelines at scale, and simplify the machine learning lifecycle. The result was a more data-\n", + "\n", + "driven organization that could better forecast operations to streamline costs and boost revenue.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Edmunds is on a mission to make car shopping an easy experience for all. With the Databricks Lakehouse\n", + "\n", + "Platform, they are able to simplify access to their disparate data sources and build ML models that make\n", + "\n", + "predictions off data streams. With real-time insights, they can ensure that the inventory of vehicle listings\n", + "\n", + "on their website is accurate and up to date, improving overall customer satisfaction.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 6**\n", + "### Use Case: Point of Sale\n", + " and Clickstream\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Disruptions in the supply chain — from reduced product supply and diminished warehouse capacity —\n", + "\n", + "coupled with rapidly shifting consumer expectations for seamless omnichannel experiences are driving\n", + "\n", + "retailers to rethink how they use data to manage their operations. Historically, point-of-sale (POS) systems\n", + "\n", + "recorded all in-store transactions, but were traditionally kept in a system that was physically in the store.\n", + "\n", + "This would result in a delay in actionable insights. And now with consumers increasingly shopping online, it’s\n", + "\n", + "crucial to not only collect and analyze that clickstream data quickly, but also unify it with POS data to get a\n", + "\n", + "complete and real-time snapshot of each customer’s shopping behavior.\n", + "\n", + "Near real-time availability of information means that retailers can continuously update their estimates of\n", + "\n", + "item availability. 
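One way to read "unify it with POS data" in practice is to normalize both feeds to a common event schema and land them in a single Delta table. The sketch below is only an illustration; every table and column name is an assumption:

```python
# Sketch only: merge streaming clickstream events and POS transactions into a
# single customer-activity table with a shared schema.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

clicks = (
    spark.readStream.table("raw.clickstream")            # hypothetical streaming source
         .select(
             F.col("customer_id"),
             F.col("event_ts").alias("activity_ts"),
             F.lit("online").alias("channel"),
             F.col("product_id"),
             F.lit(None).cast("double").alias("sale_amount"),
         )
)

pos = (
    spark.readStream.table("raw.pos_transactions")       # hypothetical streaming source
         .select(
             F.col("loyalty_id").alias("customer_id"),
             F.col("txn_ts").alias("activity_ts"),
             F.lit("in_store").alias("channel"),
             F.col("sku").alias("product_id"),
             F.col("amount").alias("sale_amount"),
         )
)

(
    clicks.unionByName(pos)
          .writeStream
          .option("checkpointLocation", "/tmp/customer_activity/_checkpoint")  # hypothetical
          .toTable("gold.customer_activity")                                   # hypothetical
)
```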
No longer is the business managing operations based on their knowledge of inventory\n", + "\n", + "states as they were a day prior, but instead is taking actions based on their knowledge of inventory states as\n", + "\n", + "they are now.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "Retail E-commerce\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Retailers with legacy POS systems in their brick-and-mortar stores are working with siloed and\n", + "\n", + "incomplete sales data\n", + "\n", + "\u0007Both POS and clickstream data need to be unified and ingested in real time\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks brings POS and clickstream data together for a unified data source that leads to real-time\n", + "\n", + "insights and a clearer understanding of customer behavior.\n", + "\n", + "\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n", + "\n", + "clickstream data\n", + "\n", + "\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n", + "\n", + "customer purchasing behaviors and trends\n", + "\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 7**\n", + "### Use Case: On-Shelf Availability\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Ensuring the availability of a product on shelf is the single largest problem in retail. Retailers globally are\n", + "\n", + "missing out on nearly $1 trillion in sales because they don’t have on hand what customers want to buy in\n", + "\n", + "their stores. Shoppers encounter out-of-stock scenarios as often as one in three shopping trips. All told,\n", + "\n", + "worldwide, shoppers experience $984 billion worth of out-of-stocks, $144.9 billion in North America alone,\n", + "\n", + "according to industry research firm IHL.\n", + "\n", + "In the past, if a customer faced an out-of-stock, they would most likely select a substitute item. The cost\n", + "\n", + "of going to another store prevented switching. Today, e-commerce loyalty members, such as those who\n", + "\n", + "belong to Walmart+ and Amazon Prime, are 52% more likely than other consumers to purchase out-of-stock\n", + "\n", + "items online. It is believed that a quarter of Amazon’s retail revenue comes from customers who first tried to\n", + "\n", + "buy a product in-store. In all, an estimated $36 billion is lost to brick-and-mortar competition, and another\n", + "\n", + "$34.8 billion is lost to Amazon or another e-retailer, according to IHL.\n", + "\n", + "On-shelf availability takes on a different meaning in pure e-commerce applications. An item can be\n", + "\n", + "considered in stock when it is actually in a current customer’s basket. If another customer places the same\n", + "\n", + "item in their basket, there is the possibility that the first customer will purchase the last available item\n", + "\n", + "before the second customer. This problem is exacerbated by retailers who use stores to keep inventory. In\n", + "\n", + "these situations, customers may order an item that is picked for delivery at a much later time. 
The window\n", + "\n", + "between ordering and picking creates the probability of out-of-stocks.\n", + "\n", + "On-shelf availability predicts the depletion of inventory by item, factors in safety stock levels and\n", + "\n", + "replenishment points, and generates a signal that suggests an item may be out of stock. This information is\n", + "\n", + "used to generate alerts to retail staff, distributors, brokers and consumer goods companies. Every day, tens\n", + "\n", + "of thousands of people around the world do work that is generated by these algorithms.\n", + "\n", + "The sheer volume of data used to calculate on-shelf availability prevents most companies from analyzing\n", + "\n", + "all of their products. Companies have between midnight and 4 AM to collect all of the needed information\n", + "\n", + "and run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n", + "\n", + "the priority categories or products to analyze, which means a significant percentage of their unavailable\n", + "\n", + "products will not be proactively addressed.\n", + "\n", + "\n", + "-----\n", + "\n", + "One of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n", + "\n", + "While some retailers are investing in computer vision and robots, and others employ the use of people to\n", + "\n", + "manually survey item availability, most retailers default to a signal of determining when an item has not been\n", + "\n", + "scanned in an acceptable time.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "The biggest challenge to generating on-shelf availability alerts is time. Companies may receive their final sales\n", + "\n", + "data from the preceding day shortly after midnight. They have less than 4 hours from that point to ingest large\n", + "\n", + "volumes of t-log data and calculate probabilities of item availability. Most firms are encumbered by a data\n", + "\n", + "warehouse process that only releases data after it has been ingested and aggregates have been calculated, a\n", + "\n", + "process that can require multiple hours per night.\n", + "\n", + "For this reason, most firms make sacrifices in their analysis. They may alternate categories they analyze by\n", + "\n", + "different days, prioritize only high-impact SKUs, or run analysis at higher-level and less-accurate aggregate\n", + "\n", + "levels. Among the challenges:\n", + "\n", + "\u0007Processing large volumes of highly detailed data and running millions of models in a narrow time window\n", + "\n", + "\u0007Companies are spending hundreds of thousands of dollars annually to generate these daily alerts for a\n", + "\n", + "few categories\n", + "\n", + "\u0007Dealing with false positives and negatives in predictions\n", + "\n", + "Distributing information quickly and efficiently to internal systems and external partners\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables customers to generate on-shelf availability (OSA) predictions at scale with no\n", + "\n", + "compromises.\n", + "\n", + "**\u0007** Delta removes the data processing bottleneck. 
Delta enables retailers to stream in real time or to batch process large volumes of highly detailed and frequently changing point-of-sale transaction data.

**\u0007** Easily scale to process all OSA predictions within tight service windows using Apache Spark™

**\u0007** Manage features and localize models with additional causal data to improve accuracy with MLflow

**\u0007** Easily deploy information via streams, through API for mobile applications or partners, or to Delta for reporting

**\u0007** Enable retailers to monetize their data by directly licensing OSA alerts

**Solution overview**

Databricks enables companies to perform on-shelf availability analysis without making compromises to the breadth or quality of predictions.

It begins with Delta Lake — a nearly perfect platform for ingesting and managing t-log data. One of the biggest challenges in t-log data is the frequent changes to a transaction that can occur within a day. Delta Lake simplifies this with transaction awareness using a transaction log, and creates additional metadata for easier retrieval (a minimal sketch of this ingestion pattern appears at the end of this chapter). Data is made available in a fraction of the time needed in data warehouse-based systems. This is why the largest retailers in the world are using Delta Lake for processing t-log data.

Once data is available, users need to generate predictions about item availability on the shelf. With its extremely performant engine and the ability to distribute computation across countless nodes, Spark provides the perfect platform for calculating out-of-stocks. Customers no longer need to run in aggregate or against a subset of data.

-----

**HOW TO GET STARTED**

[Solution Accelerator:](https://www.databricks.com/solutions/accelerators/on-shelf-availability) [On-Shelf Availability](https://www.databricks.com/solutions/accelerators/on-shelf-availability)

In this solution, we show how the Databricks Lakehouse Platform enables real-time insights to rapidly respond to demand, drive more sales by ensuring stock is available on shelf, and scale out your forecasting models to accommodate any size operation.

And lastly, data is only useful if it drives better outcomes. Databricks can write the resulting data into Delta Lake for further reporting, or to any downstream application via APIs, feeds or other integrations. Users can feed their predictive alerts to downstream retail operations systems or even to external partners within the tightest service windows, and in enough time to drive actions on that day.

**Typical use case data sources include:** point-of-sale data, replenishment data, safety stock calculations, manual inventory data (optional), robotic or computer vision inventory data (optional)

**CASE STUDY**

Reckitt distributes its products to millions of consumers in over 60 countries, which was causing the organization to struggle with the complexity of forecast demand, especially with large volumes of different types of data across many disjointed pipelines. Thanks to the Databricks Lakehouse Platform, Reckitt now uses predictive analytics, product placement and business forecasting to better support neighborhood grocery stores.
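Below is a minimal, illustrative sketch of the ingestion pattern described above: continuously upserting frequently changing t-log records into a Delta table so that downstream on-shelf availability models always see the latest state of every transaction line. This is not the Solution Accelerator's code; the table names, key columns and checkpoint path are hypothetical, and it assumes an environment where Delta Lake and Structured Streaming are available (for example, a Databricks cluster).

```python
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

def upsert_tlog(microbatch_df, batch_id):
    # Merge each micro-batch on the transaction line key; later versions of a
    # transaction line overwrite earlier ones, and new lines are inserted.
    target = DeltaTable.forName(spark, 'retail.pos_transactions')  # hypothetical Silver table
    (target.alias('t')
           .merge(microbatch_df.alias('s'),
                  't.transaction_id = s.transaction_id AND t.line_id = s.line_id')
           .whenMatchedUpdateAll()
           .whenNotMatchedInsertAll()
           .execute())

# Read the raw t-log feed as a stream and apply the merge to every micro-batch.
(spark.readStream.table('retail.pos_raw')  # hypothetical Bronze feed
      .writeStream
      .foreachBatch(upsert_tlog)
      .option('checkpointLocation', '/tmp/checkpoints/pos_tlog')
      .start())
```

The same logic can run in batch instead by swapping the streaming read and write for a plain read and a one-off merge.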
-----

**CHAPTER 8**
### Use Case: Customer and Vehicle Identification

**Overview**

COVID-19 led to increased consumer demand for curbside pickup, drive-through and touchless payment options. Retailers that were able to implement these new services have been able to differentiate overall customer experiences and mitigate catastrophic hits on revenue levels.

For retailers to create a seamless contactless experience for customers, they need real-time data to know when a customer has arrived and where they’re located, as well as provide updates throughout the pickup journey. And through the use of computer vision, they can capture that data by employing optical recognition on images to read vehicle license plates.

Retailers can also use information captured from license plates to make recommendations on buying patterns. Looking ahead, facial recognition also has the potential to provide retailers with valuable information to better serve their customers in real time.

**R E L E V A N T F O R**

Retail Consumer Goods

Drive-Through Food Retailers

**Challenges**

\u0007Ineffective data processing can lead to suboptimal order preparation timing

\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status

-----

**Value with Databricks**

Databricks makes it possible to not only identify customers and vehicles in real time but also provide real-time communications throughout the entire shopping and curbside or drive-through experience.

\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order preparation timing

\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensure each subsequent visit is equally as or more seamless than the last

\u0007 **Optimizing with AI/ML** — implementing AI and ML models can lead to more effective geofencing, vehicle identification and order prediction

**CASE STUDY**

**CASE STUDY**

-----

**CHAPTER 9**
### Use Case: Recommendation Engines

**Overview**

Customers that feel understood by a retailer are more likely to spend more per purchase, purchase more frequently with that retailer, and deliver higher profitability per customer. 
The way that retailers achieve this\n", + "\n", + "is by recommending products and services that align with customer needs.\n", + "\n", + "Providing an experience that makes customers feel understood helps retailers stand out from the crowd\n", + "\n", + "of mass merchants and build loyalty. This was true before COVID, but shifting consumer preferences make\n", + "\n", + "this more critical than ever for retail organizations. With research showing the cost of customer acquisition\n", + "\n", + "is as much as five times as retaining existing ones, organizations looking to succeed in the new normal must\n", + "\n", + "continue to build deeper connections with existing customers in order to retain a solid consumer base.\n", + "\n", + "There is no shortage of options and incentives for today’s consumers to rethink long-established patterns\n", + "\n", + "of spending.\n", + "\n", + "Recommendation engines are used to create personalized experiences for users across retail channels.\n", + "\n", + "These recommendations are generated based on the data collected from purchases, items interacted\n", + "\n", + "with, users’ behavior across physical and digital channels, and other data such as from customer service\n", + "\n", + "interactions and reviews. Leveraging a Customer 360 architecture that collects all user clickstream and\n", + "\n", + "behavioral data, marketers are able to create recommendations that are integrated with other business\n", + "\n", + "objectives such as highlighting items that are on promotion or product availability.\n", + "\n", + "Creating recommendations is not a monolithic activity. Recommendation engines are used to personalize\n", + "\n", + "the customer experience in every possible area of consumer engagement, from proactive notifications and\n", + "\n", + "offers, to landing page optimization, suggested products, automated shipment recommendations, cross-sell\n", + "\n", + "and upsell, and even suggestions for complementary items after the purchase.\n", + "\n", + "\n", + "-----\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "Media Telecom Financial Services\n", + "(any B2B or B2C\n", + "company)\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Recommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n", + "\n", + "but traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n", + "\n", + "recommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n", + "\n", + "irrelevant.\n", + "\n", + "**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n", + "\n", + "is improved by having recent data, and yet most systems struggle to handle the large volumes of\n", + "\n", + "information involved.\n", + "\n", + "**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n", + "\n", + "touchpoints in one place are critical to enabling this use case. 
More data, including transaction and\n", + "\n", + "clickstream data, is critical for driving accuracy and precision in messaging.\n", + "\n", + "**Processing speed.** Retailers need to be able to frequently refresh models based on constantly\n", + "\n", + "changing dynamics, and deliver real-time recommendations via APIs.\n", + "\n", + "**Automation.** This is an “always-on” use case where automation is essential for scalability and\n", + "\n", + "responsiveness based on frequent model updates.\n", + "\n", + "\n", + "-----\n", + "\n", + "Many firms choose to use recommender systems from Amazon or Google. Using these systems trains\n", + "\n", + "the general recommendation engine in a way that helps competitors improve the accuracy of their own\n", + "\n", + "recommendations.\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Recommendations are one of the most critical capabilities that a retailer maintains. This is a capability that\n", + "\n", + "retailers must own, and Databricks provides a solid platform for enabling this.\n", + "\n", + "Using Databricks as the foundation for their Customer 360 architecture to deliver omnichannel\n", + "\n", + "personalization, sample value metrics from a media agency include:\n", + "\n", + "**200% ROI for 70% of retailers** engaging in advanced personalization\n", + "\n", + "**10% improvement** in conversions\n", + "\n", + "**35% improvement** in purchase frequency\n", + "\n", + "**37% improvement** in customer lifetime value\n", + "\n", + "**Solution overview**\n", + "\n", + "Recommendations are only as good as the data that powers them. Delta Lake provides the best platform for\n", + "\n", + "capturing and managing huge volumes of highly atomic and frequently changing data. It allows organizations\n", + "\n", + "to combine various sources of data in a timely and efficient manner, from transactions, demographics and\n", + "\n", + "preference information across products, to clickstream, digital journey and marketing analytics data to bring\n", + "\n", + "a 360 view of customer interactions to enable omnichannel personalization.\n", + "\n", + "By identifying changes in user behavior or engagement, retailers are able to detect early signals that\n", + "\n", + "indicate a propensity to buy or a change in preferences, and recommend products and services that will\n", + "\n", + "keep consumers engaged.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Typical use case data sources include:** Customer 360 data, CRM, loyalty data, transaction data,\n", + "\n", + "clickstream data, mobile data:\n", + "\n", + "**Engagement data** — transaction log data, clickstream data, promotion interaction\n", + "\n", + "**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n", + "\n", + "children, location\n", + "\n", + "**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n", + "\n", + "to churn\n", + "\n", + "**CASE STUDY**\n", + "\n", + "For Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n", + "\n", + "for help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n", + "\n", + "personalized to each of their customers.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Columbia’s legacy ETL was unable to support batch and real-time use cases at scale. 
After migrating to\n", + "\n", + "Databricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n", + "\n", + "business decisions.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Pandora wanted to drive stronger online engagement with their customers, so they used the Databricks\n", + "\n", + "Lakehouse Platform to create more personalized experiences and boost both click-to-open rates and\n", + "\n", + "quarterly revenue.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Databricks has created [four](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n", + "\n", + "[Recommendation Engine accelerators,](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n", + "\n", + "with content-based and collaborative\n", + "\n", + "filter methods, and both item-\n", + "\n", + "and user-based analysis. These\n", + "\n", + "accelerators have been further refined\n", + "\n", + "to be highly performant to enable\n", + "\n", + "frequent retraining of models.\n", + "\n", + "To begin working on recommendation\n", + "\n", + "engines, contact your Databricks\n", + "\n", + "account team.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 10**\n", + "### Use Case: Perpetual Inventory\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "With the rapid adoption of digital channels for retail, staying on top of your inventory is crucial to meeting\n", + "\n", + "customer demand. As a result, the periodic inventory system is now outdated — instead, using a perpetual\n", + "\n", + "inventory model allows businesses to perform immediate and real-time tracking of sales and inventory\n", + "\n", + "levels. This has the added benefit of reducing labor costs and human error, ensuring that you always have an\n", + "\n", + "accurate overview of your inventory and can better forecast demand to avoid costly stockouts.\n", + "\n", + "The key to building a perpetual inventory system is real-time data. 
By capturing real-time transaction\n", + "\n", + "records related to sold inventory, retailers can make smarter inventory decisions that streamline operations\n", + "\n", + "and lower overall costs.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Supply Chain\n", + "\n", + "\n", + "Inventory\n", + "Management\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Companies need to scale to handle ever-increasing inventory and the data associated with the products\n", + "\n", + "**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n", + "\n", + "view of inventory\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables real-time inventory updates, giving businesses the insights they need to properly\n", + "\n", + "manage inventory and to forecast more accurately.\n", + "\n", + "**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n", + "\n", + "the latest sales data\n", + "\n", + "**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n", + "\n", + "companies know they’re getting the most accurate information at any point\n", + "\n", + "**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n", + "\n", + "management costs\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 11**\n", + "### Use Case: Automated\n", + " Replenishments\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Customers favor convenience more than ever when it comes to their goods, and automated replenishments\n", + "\n", + "help meet that need. 
Whether it’s through a connected device or smartphone app, real-time data plays a\n", + "\n", + "key role in ensuring consumers get a refill automatically delivered at the right time.\n", + "\n", + "On the manufacturing side, this real-time data can also help with vendor-managed replenishment (VMR),\n", + "\n", + "reducing the time needed to forecast, order and receive thousands of items.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Direct to\n", + "Customer\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Being able to ingest large amounts of data quickly is crucial to actually fulfilling the\n", + "\n", + "replenishment orders\n", + "\n", + "With VMR, there may be a disconnect between the vendor and customer, resulting in a forecast\n", + "\n", + "for replenishment even when the customer can’t fulfill that order\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables real-time inventory updates, giving businesses the insights they need to properly\n", + "\n", + "manage inventory and to forecast more accurately.\n", + "\n", + "**\u0007Near real-time insights** — the greater speed to data means businesses can stay on top of\n", + "\n", + "customer needs\n", + "\n", + "**\u0007Scalability** — companies can scale with Databricks to handle thousands of SKUs, each with its own\n", + "\n", + "unique properties and expiry dates\n", + "\n", + "**\u0007Optimizing with AI/ML** — using AI and ML can lead to better forecasting and predictions\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 12**\n", + "### Use Case: Fresh Food Forecasting\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Fresh food typically accounts for up to 40% of revenue for grocers, and plays an important role in driving\n", + "\n", + "store traffic. But fresh food is also incredibly complex to manage — prices can be volatile, there is a wide\n", + "\n", + "range of suppliers to work with and the products expire, which creates significant amounts of waste.\n", + "\n", + "In order to avoid losing significant revenue, businesses need to properly forecast when food is nearing its\n", + "\n", + "sell-by date, the current levels of customer demand (also taking into account seasonality), and the proper\n", + "\n", + "timing for replenishing food stock. Being able to tap into real-time data is key to staying on top of the ever-\n", + "\n", + "changing needs around fresh food.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "Retail E-commerce Distributors Logistics Restaurants\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Because of the perishable nature of fresh food, customers need to be able to ingest data quickly\n", + "\n", + "enough to conduct daily forecasting and daily replenishment\n", + "\n", + "**\u0007** Customers are running aggregate-level forecasts, which are less accurate than fine-grained forecasting\n", + "\n", + "**\u0007** Customers are forced to compromise on what they can analyze\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team to get\n", + "\n", + "started with inventory allocation. 
Databricks does not have a Solution Accelerator.

View our webinar covering demand forecasting with Starbucks and then read our blog about demand forecasting.

[Fine-grained time series forecasting at scale.](https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html)

This blog details the importance of time series forecasting, walks through building a simple model to show the use of Facebook Prophet, and then shows off the combination of Facebook Prophet and Apache Spark to scale to hundreds of models.

[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)

Video and Q&A from our webinar with Starbucks on demand forecasting.

**Value with Databricks**

Customers average double-digit improvement in forecast accuracy, leading to a reduction in lost sales and in spoiled products, as well as lower inventory and handling costs.

**\u0007Improved accuracy** — on average, customers moving to Databricks for demand forecasting report a double-digit improvement in forecast accuracy

**\u0007Ability to scale and perform fine-grained (day, store, item) level forecasts** — rapidly scale to tens of millions of model iterations in narrow service windows. Companies need accurate demand forecasts in a few hours.

**\u0007Eliminate compromises on what to analyze** — customers do not need to select winners or losers among the products they forecast. They can predict demand for all products as frequently as required.

**Solution overview:**

Databricks is well suited to handling forecasting for fresh food at scale. Forecasting begins with the Databricks Solution Accelerator. It enables companies to rapidly build fine-grained forecasting of items — forecasting that can be efficiently scaled to tens of millions of predictions in tight service windows (a minimal sketch of this pattern appears at the end of this chapter).

**Typical use case data sources include:** historic point-of-sale data, shipment data, promotions, pricing, expiration dates and weather.

**CASE STUDY**

ButcherBox faced the complex challenges of securing inventory with enough lead time, meeting highly variable customer order preferences and unpredictable customer sign-ups, and managing delivery logistics. With Databricks, the company was able to create a predictive solution to adapt quickly and integrate tightly with the rest of its data estate.

**CASE STUDY**

Sam’s Club needed to build out an enterprise-scale data platform to handle the billions of transactions and trillions of events going through the company. Find out how Databricks became a key component in the shift from on-premises Hadoop clusters to a cloud-based platform.
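The sketch below illustrates the fine-grained forecasting pattern referenced in this chapter: one model is fit per store/item combination, with Spark distributing the work across the cluster. This is not the Solution Accelerator's code; the table name, column names and forecast horizon are hypothetical, and it assumes the open source `prophet` package is installed.

```python
import pandas as pd
from prophet import Prophet
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical Delta table of daily sales: store, item, ds (date), y (units sold).
history = spark.read.table('sales_history')

result_schema = ('store STRING, item STRING, ds TIMESTAMP, '
                 'yhat DOUBLE, yhat_lower DOUBLE, yhat_upper DOUBLE')

def forecast_store_item(pdf: pd.DataFrame) -> pd.DataFrame:
    # Fit one Prophet model on a single store/item history and forecast 14 days ahead.
    model = Prophet(interval_width=0.95)
    model.fit(pdf[['ds', 'y']])
    future = model.make_future_dataframe(periods=14)
    forecast = model.predict(future)[['ds', 'yhat', 'yhat_lower', 'yhat_upper']]
    forecast['store'] = pdf['store'].iloc[0]
    forecast['item'] = pdf['item'].iloc[0]
    return forecast[['store', 'item', 'ds', 'yhat', 'yhat_lower', 'yhat_upper']]

# Each store/item group is forecast independently, so the job scales to millions of
# model fits within a nightly service window simply by adding workers.
forecasts = history.groupBy('store', 'item').applyInPandas(forecast_store_item,
                                                           schema=result_schema)
forecasts.write.mode('overwrite').saveAsTable('item_forecasts')
```

Changing the grouping keys (for example, store and department) changes the forecast granularity without changing the rest of the job.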
-----

**CHAPTER 13**
### Use Case: Propensity-to-Buy

**Overview**

Customers often have repeatable purchase patterns that may not be noticed upon initial observation. While we know that commuting office workers are likely to purchase coffee at a coffee shop on weekday mornings, do we understand why they visit on Thursday afternoons? And more importantly, how do we predict these buying moments when customers are not in our stores?

The purpose of a propensity-to-buy model is to predict when a customer is predisposed to make a purchase and subsequently act on that information by engaging customers. Traditional propensity-to-buy models leveraged internal sales and loyalty data to identify patterns of consumption. These models are useful, but are limited in understanding the full behavior of customers. More advanced propensity-to-buy models are now incorporating alternative data sets to identify trips to competing retailers, competitive scan data from receipts, and causal data that helps to explain when and why customers make purchases.

Propensity-to-buy models create a signal that is sent to downstream systems such as those for promotion management, email and mobile alerts, recommendations and others.

**R E L E V A N T F O R**

Retail E-commerce Direct to Consumer

-----

**Challenges**

**\u0007** Customers do not want to be inundated with messages from retailers. Companies need to limit their outreach to customers to avoid angering them.

**\u0007** Companies need to traverse and process vast sums of customer data and generate probabilities of purchase frequently

**\u0007** Companies need to look at external data that helps build a propensity-to-buy model that captures the full share of the customer wallet. They need to quickly test and incorporate additional data that improves the accuracy of their models.

**Value with Databricks**

**\u0007** Databricks allows companies to efficiently traverse huge volumes of customer data over time, and efficiently synthesize this into data for analysis

**\u0007** Companies need to traverse and process vast sums of customer data and generate probabilities of purchase frequency

**\u0007** Companies need to look at external data that helps build a propensity-to-buy model that captures the full share of the customer wallet. They need to quickly test and incorporate additional data that improves the accuracy of their models.

**Solution overview:**

Propensity-to-buy analytics determine the signals that indicate the probability a customer is in a buying moment. Historic propensity models relied on sales data to identify buying patterns, but newer approaches are incorporating behavioral data. Proximity to a coffee shop might push a consumer over the threshold of a buying moment. Traditional, batch-oriented operations are insufficient to solve this problem. 
If you wait until that night,\n", + "\n", + "or even later in the day you have lost the opportunity to act\n", + "\n", + "\n", + "-----\n", + "\n", + "**HOW TO GET STARTED**\n", + "\n", + "To begin working on propensity-to-\n", + "\n", + "buy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "\n", + "With the propensity to buy, speed becomes a critical force in determining key inflection points. Databricks\n", + "\n", + "enables marketers to ingest data in real time and update probabilities. Lightweight queries can be automated\n", + "\n", + "to refresh models, and the resulting data can be fed automatically to downstream promotions, web or mobile\n", + "\n", + "systems, where the consumer can be engaged.\n", + "\n", + "As this data is streamed into Delta Lake, data teams can quickly capture the data for broader analysis.\n", + "\n", + "Calculating a propensity to buy requires traversing interactions that are episodic in nature, and span broad\n", + "\n", + "periods of time. Delta Lake helps simplify this with scalable metadata handling, ACID transactions and data\n", + "\n", + "skipping. Delta Lake even manages schema evolution to provide users with flexibility as their needs evolve.\n", + "\n", + "**Typical use case data sources include:** point-of-sale data with tokens, loyalty data, e-commerce sales data,\n", + "\n", + "mobile application data, competitive scan or receipt data (optional), place of interest data (optional)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 14**\n", + "### Use Case: Next Best Action\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "The e-commerce boom over the last couple of years has given consumers ample choice for digital\n", + "\n", + "shopping options. If your business isn’t engaging customers at every point in their purchasing journey, you\n", + "\n", + "risk losing them to a competitor. 
By applying AI/ML to automatically determine — in real time — the next\n", + "\n", + "best action for customers, you can greatly increase your conversion rates.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Direct to\n", + "Consumer\n", + "\n", + "\n", + "E-commerce\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Siloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n", + "\n", + "resulting in suboptimal recommendations for the next best action\n", + "\n", + "Companies need to ingest large amounts of data in real time and then take action on it immediately\n", + "\n", + "Many businesses still struggle with training their ML models to properly determine the next best action\n", + "\n", + "(and self-optimize based on the results)\n", + "\n", + "\n", + "-----\n", + "\n", + "**HOW TO GET STARTED**\n", + "\n", + "To begin working on propensity-to-\n", + "\n", + "buy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "\n", + "**Value with Databricks:**\n", + "\n", + "Databricks provides all the tools needed to **process large volumes of data and find the next best**\n", + "\n", + "**action** at any given point in the customer journey\n", + "\n", + "**Near real-time insights** — the greater speed to data means businesses can react immediately to\n", + "\n", + "customer actions\n", + "\n", + "**Single source of truth** — break down data silos by unifying all of a company’s customer data (including\n", + "\n", + "basic information, transactional data, online behavior/purchase history, and more) to get a complete\n", + "\n", + "customer profile\n", + "\n", + "**Optimizing with AI/ML** — use AI to create self-optimizing ML models that are trained to find the best next\n", + "\n", + "step for customers\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 15**\n", + "### Customers That Innovate With Databricks Lakehouse for Retail\n", + "\n", + "\n", + "Some of the top retail and consumer packaged goods companies in the world turn to Databricks Lakehouse\n", + "\n", + "for Retail to deliver real-time experiences to their customers.\n", + "\n", + "Today, data is at the core of every innovation in the retail and consumer packaged goods industry.\n", + "\n", + "Databricks Lakehouse for Retail enables companies across every sector of retail and consumer goods to\n", + "\n", + "harness the power of real-time data and analytics to solve strategic challenges and deliver more engaging\n", + "\n", + "experiences to customers.\n", + "\n", + "Get started with a free trial of Lakehouse for Retail and start building better data applications today.\n", + "\n", + "**[Start your free trial](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo at: [databricks.com/contact](http://databricks.com/contact\r)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. 
Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
**eBook**\n", + "\n", + "# Accelerate Digital Transformation in Insurance With Data, Analytics and AI\n", + "\n", + "### Real-world use cases with Databricks Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction ................................................................................................................................................................................................................ **03**\n", + "\n", + "Three Trends Driving Transformation in Insurance .............................................................................................................................. **05**\n", + "\n", + "The Need for Modern Data Infrastructure ................................................................................................................................................. **06**\n", + "\n", + "Common Challenges Insurers Face Using Legacy Technology ...................................................................................................... **08**\n", + "\n", + "Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\n", + "\n", + "Key Use Cases for Insurance:\n", + "\n", + "**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\n", + "\n", + "**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\n", + "\n", + "**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\n", + "\n", + "**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\n", + "\n", + "Global Regulatory Impact in Insurance ......................................................................................................................................................... **18**\n", + "\n", + "**I N D U S T R Y S O L U T I O N S :** Get Started With Accelerators, Brickbuilders and Enablers ............................................................ **19**\n", + "\n", + "Get Started With Industry Solutions ............................................................................................................................................................. **20**\n", + "\n", + "Conclusion ................................................................................................................................................................................................................... **26**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "With the rapid advancement of technology, rising consumer expectations, and strong competition between insuretechs and incumbents resulting\n", + "from the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. 
Today, new\n", + "insights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\n", + "data from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\n", + "clickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.\n", + "\n", + "Consumers want stronger reassurance for what they value most: financial security and greater peace of mind.\n", + "Insurers have always prided themselves on delivering such protection and security. However, customer needs\n", + "have changed, and insurers that move most swiftly to satisfy them will be in the best position to navigate\n", + "challenging times. The bottom line is that insurers must adapt to these changes and meet the evolving needs of\n", + "their customers to remain competitive.\n", + "\n", + "Data-driven insurers will seek opportunities to improve the customer experience, develop more sophisticated\n", + "pricing models, and increase their operational resilience. More than ever, the total cost of ownership (TCO) of\n", + "digital investments and enterprise data strategy has become a top priority for boards and senior executives\n", + "in the insurance industry. So, what does this mean from a data and analytics perspective? It all comes down\n", + "to having one reliable source of truth for data, which is derived from batch and streaming data, structured and\n", + "unstructured data, from multiple clouds and jurisdictions.\n", + "\n", + "\n", + "In a regulated and risk-averse industry where data sharing was once seen as optional, it has now become\n", + "fundamental. To compete in the digital economy, insurers need an open and secure approach to data sharing.\n", + "Databricks Lakehouse for Insurance plays a critical role in helping insurance providers accelerate innovation and\n", + "transform their businesses, resulting in significant operational efficiencies and improved customer experiences\n", + "at a fraction of the cost of data warehouses. This eBook provides an in-depth exploration of key challenges\n", + "and common use cases in the insurance industry. Most importantly, you will gain insight into how Databricks\n", + "Lakehouse can unlock the true value of your data through practical Solution Accelerators and a wide range of\n", + "partners available to assist you on your journey.\n", + "\n", + "\n", + "**The future of insurance will**\n", + "\n", + "**become increasingly data-driven,**\n", + "\n", + "**and analytics enabled.”**\n", + "\n", + "**[EY’s](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)** [“Five principles for the future of protection”](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)\n", + "\n", + "\n", + "-----\n", + "\n", + "The Lakehouse reference architecture below illustrates a sample framework upon\n", + "which insurers can build. Moving from left to right in the diagram, the first layer\n", + "represents various data sources such as on-premises systems, web and mobile\n", + "applications, IoT sensors, enterprise data warehouses, and third-party APIs. Data\n", + "is then ingested through automated data pipelines, and processed within the\n", + "Lakehouse platform across three layers (Bronze, Silver and Gold). 
These layers are responsible for data preparation, including ML model registry, centralized governance, workflow orchestration, and job scheduling. They ensure a compliant and secure infrastructure that sits atop the cloud layer (or multiple clouds), eliminating the need for data duplication. Finally, the transformed data is delivered as actionable insights and supports use cases such as automated reporting, business analytics, customer 360, and claims analytics. These use cases not only mitigate risk but also drive revenue.

*Lakehouse for Financial Services reference architecture (diagram):* data flows from **Data Sources** (on-premises servers, web and mobile applications, collaborative data sources, Internet-of-Things (IoT) devices, enterprise data warehouses, third-party APIs and services) through **Ingestion** via automated data pipelines (batch or streaming) into the **Bronze, Silver and Gold layers** (raw entity data, curated feature sets, aggregated business views), supported by the ML model registry, centralized data governance, workflow orchestration and job scheduling, and out to **Serving** (automated reporting, business analytics and interactive dashboards, productionized referenced data and models).

-----

## Three Trends Driving Transformation in Insurance

Over the next decade, technology-enabled insurance companies will bear little resemblance to today’s organizations. The following three trends are driving this transformation in the insurance industry:

**The rapid emergence of large language models and generative AI**

In recent years, there has been a significant breakthrough in the field of artificial intelligence with the emergence of large language models (LLMs) and generative AI. These models, such as GPT-4 and its predecessors, Databricks Dolly and others are built using deep learning techniques and massive amounts of training data, enabling them to generate human-like text and perform a wide range of natural language processing tasks. LLMs and generative AI can help insurance companies automate repetitive tasks such as underwriting, claims processing, and customer service, improving efficiency and reducing costs. 
They can also help insurers to better understand customer needs and preferences, leading to more personalized products and services. However, as with any disruptive technology, the adoption of LLMs and generative AI will require careful consideration of ethical and regulatory issues, such as data privacy and algorithmic bias.

**Transformed ecosystems and open insurance**

[According to EY](https://assets.ey.com/content/dam/ey-sites/ey-com/en_gl/topics/insurance/ey-2022-global-insurance-outlook-report.pdf), leading companies leverage insurtechs in their ecosystems to achieve high margins in commoditized products. Open insurance, which involves sharing and managing insurance-related data through APIs, is more than an item in the regulatory agenda. It can give consumers access to better products and accurate pricing, as well as enable them to execute transactions more easily. In its [annual Chief Data Officer Survey](https://www.gartner.com/smarterwithgartner/data-sharing-is-a-business-necessity-to-accelerate-digital-business), Gartner found that organizations that promote external data sharing have three times the measurable economic benefit across a variety of performance metrics compared to their peers.

**Revised target operating model with a focus on talent**

Demographic shifts and perennial cost pressures make it critical for insurers to attract and retain talent. Consequently, it’s important for insurers to equip their workforces with the right tools and technologies to help them identify business processes that can be optimized to differentiate themselves from their competitors, with an emphasis on moments that matter in the customer journey, according to EY. Recent research from Deloitte highlights the advantages of upskilling and building a future-ready workforce. One of the benefits of AI adoption in the workforce is that it enables organizations to automate a wide range of business processes, boosting speed and efficiency. But what’s even more important is that it enables employees to focus on higher-value work, according to Deloitte.

-----

## The Need for Modern Data Infrastructure

**Insurers turning to cloud and data analytics**

The insurance industry has undergone significant changes over the years, and one of the areas that has evolved the most is data management. With the growing need for advanced analytics and digital transformation, many insurance companies are turning to cloud technology and modern data infrastructures to enhance their data management strategies. The benefits of adopting cloud technology are numerous, particularly the ability to efficiently store and quickly access vast amounts of data, which is crucial in a heavily regulated and data-driven industry like insurance. 
Additionally, the flexibility of the cloud enables insurers to scale costs, adapt to changing work environments, and meet evolving customer and business requirements.

Furthermore, insurance providers can leverage the cloud to analyze customer data at scale, gaining insights into behaviors that drive hyper-personalization, dynamic pricing and underwriting, and form the foundation for claims automation. By implementing advanced analytics, insurers can innovate more easily, scale their businesses, and bring new products to market more quickly.

To remain competitive, insurance companies must increase their investment in cloud technology and data analytics, as this will accelerate insightful decision-making across various functions such as claims management, underwriting, policy administration, and customer satisfaction. Overall, the adoption of cloud technology and data analytics is imperative for insurance providers to enhance operational efficiency, improve business processes, and stay relevant in today’s fast-paced business landscape.

-----

**Let’s take a closer look at a few examples:**

**Auto insurers** need to integrate new data sources, such as weather and traffic, to build solutions capable of real-time processing. This enables them to alert emergency services promptly and gain a better understanding of drivers’ driving patterns. It also enables the development of sophisticated machine learning-based risk assessment, underwriting and claims models.

**Commercial insurance**, including property, general liability, cyber insurance and business income insurance, utilizes ML-based automation of actuarial models. This automation facilitates underwriting, claims forecasting and dynamic pricing for their customers. Another notable trend in recent years is the use of IoT-based alerting for sensitive or valuable commodities. For example, in the case of vaccines, IoT sensors can monitor the temperature in real time and send alerts to the appropriate team or person if the temperature exceeds acceptable thresholds. This is crucial as vaccines must be stored within specific temperature ranges.

In **life insurance**, complex ML models can be employed to create a profile of the customer’s lifestyle and, importantly, detect any changes to it. 
This deeper understanding and 360-degree view of the customer enable more customized underwriting and pricing based on the policyholder’s current health, lifestyle and eating habits.

High-priority business use cases are shown in the four right-hand columns:

|Type of Data Source|Typical Vendors|Claims Automation and Transformation|Dynamic Pricing and Underwriting|Anomaly Detection and Fraudulent Claims|Customer 360 and Hyper-Personalization|
|---|---|---|---|---|---|
|Policy data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork|||||
|Claims data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork, TransUnion|||||
|Real-time ingestions|Cambridge Mobile Telematics, Zendrive, Custom|||||
|Alternative / Supplemental data|Experian, Equifax, Verisk, IBM Weather|||||
|Marketing data|Salesforce, HubSpot, Google Analytics|||||

**Figure 1.** Innovating with data and analytics — use cases made possible and key data sources from popular insurance vendors

-----

## Common Challenges Insurers Face Using Legacy Technology

Modernization is not an easy process for insurers, and while transforming IT ecosystems is necessary to improve business outcomes, ensuring business continuity is absolutely critical. However, the volume of data they collect, along with changes in user behavior and legacy systems that can’t handle this amount of data, are forcing insurance providers to accelerate their modernization journeys.

Insurance providers face several challenges when using legacy technology, including:

**Legacy on-premises systems:** Legacy on-premises systems are not only expensive to maintain, but they also store large amounts of big data in silos across the business. This makes it difficult to access the data, hindering data analytics efforts and limiting executives’ ability to make informed business decisions.

**Ingesting large volumes of transactional data in real time:** The inability to ingest data from transaction systems in real time is a major obstacle to obtaining critical insights. Transaction logs from operations such as policy administration, enrollment and claims constantly stream data. However, many insurance companies still rely on legacy data warehouses built around batch processing, which is not suitable for ingesting and integrating large data sets. As a result, insurers often opt to ingest data nightly, leading to delays in receiving accurate data for decision-making.

**Performing fine-grained analysis at scale within tight time frames:** Legacy technology forces insurers to make a trade-off when analyzing data for user intent. They can choose between detailed and accurate predictions or fast predictions. Running detailed forecasts can improve accuracy, but it requires performing millions of model calculations within narrow service windows, which exceeds the capability of legacy data platforms. Consequently, insurance companies have to accept less accurate predictions.

**Powering real-time decisions on the front line:** Serving real-time data to thousands of workers is a complex task. While data warehouses can serve reports to large groups of users, they are limited to providing stale data. 
As a result, most insurers only provide daily or weekly updates to reports and rely on employees’ judgment for more frequent decisions.

**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim to deliver personalized experiences across every channel, both digital and offline. While insurance providers have access to vast amounts of customer data, off-the-shelf tools for personalization and customer segmentation struggle to handle such high volumes, leading to inaccurate analytics. To succeed in the insurance industry, companies must deliver personalized experiences at scale.

-----

Databricks Lakehouse for Insurance addresses the key challenges faced across the insurance value chain. The lakehouse enables the integration of various data types, including images and structured data, in real time. It offers robust management and governance capabilities, and rapidly transforms data into actionable insights through real-time reporting and predictive analytics. This platform-as-a-service solution delivers exceptional speed and industry-leading total cost of ownership, providing insurers with faster insights to enhance the customer experience and gain a competitive edge.

**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs). The value chain spans product development & feature selection; application review & submission; sales & lead management (hyper-personalization/life events); underwriting and pricing (UW rules, guidelines & technical pricing); rating, offer & endorsements (evaluation of rate options, pricing and endorsements); policy issue, service & administration (coverage, features/riders, review of policy documents); and claims (omnichannel; fraud, frequency, severity and reserves).

**We continuously develop solution accelerators and enablers to accelerate the time to market.** Examples across these stages include:

**•** Dynamic segmentation, personas and hyper-personalization

**•** Intelligent automation

**•** Product architecture and manufacturing, configurable products and competitor rates

**•** Reflexive questionnaires, LLM assistance for document summarization and NLP for unstructured data

**•** Evaluation of risk within appetite and validation of UW requirements

**•** Straight-through processing optimization and risk assessment via actuarial pricing

**•** Triaging of risk to underwriter SMEs for policy/exposure changes

**•** Prediction of loss cost (frequency and severity) and computer vision on images to identify loss

**•** Auto-adjudication and triaging of claims to claim adjusters

**•** Communication tailored by segment (e.g., email, text, mail or omnichannel)

**•** Identification of fraud, waste and abuse, with routing to ICU

-----

## Why Lakehouse for Insurance

Databricks Lakehouse for Insurance combines simplicity, flexibility and reusability, enabling insurers to meet the demands of the market with speed and agility. It offers best-in-industry performance and serves as a modern data architecture that provides differentiated capabilities for insurers to thrive in a competitive industry.

**•** Insurance companies can store any type of data using Databricks Lakehouse for Insurance, leveraging the low-cost object storage supported by cloud providers. This helps break down data silos that hinder efforts to aggregate data for advanced analytics, such as claim triaging and fraud identification, regulatory reporting, or compute-intensive risk workloads. Another critical feature is the time-travel capability of the lakehouse architecture, allowing insurers to access any historical version of their data.

**•** Supporting streaming use cases, such as monitoring transaction data, is easier with the lakehouse. It uses Apache Spark™ as the data processing engine and Delta Lake as the storage layer. Spark enables seamless switching between batch and streaming workloads with just a single line of code, and Delta Lake’s native support for ACID transactions ensures reliable, high-performing streaming workloads.

**•** For both machine learning and non-machine learning insurance models, a comprehensive governance framework is provided. Data, code, libraries and models are linked and independently version controlled using technologies like Delta Lake and MLflow. Delta Lake ensures stability by allowing insurance companies to declare their expectations for data quality upfront. MLflow enables training models in any language and deploying them anywhere, minimizing the need for complex handoffs between data science practices, independent validation units and operational teams.

-----

**Level-up value with Databricks Lakehouse for Insurance**

Building your data lakehouse with the Databricks Lakehouse Platform empowers your organization with the speed, agility and flexibility needed to address critical insurance use cases that have a significant impact on your customers and your business. It also helps lower the total cost of ownership (TCO).

With a modern and unified cloud data architecture, the Databricks platform enables you to implement your data, analytics and AI strategy at scale. The key benefits include:

**1. 
Cost and complexity reduction**\n", + "\n", + "The Databricks Lakehouse provides an open, simple\n", + "and unified cloud data management architecture\n", + "that streamlines operational inefficiencies, reduces\n", + "IT infrastructure costs, and enhances productivity\n", + "across teams.\n", + "\n", + "\n", + "**2. Enhanced risk management and control**\n", + "\n", + "By unlocking the value of enterprise data, the\n", + "platform helps reduce corporate governance and\n", + "security risks. It facilitates data-driven decisionmaking through governed discovery, access and\n", + "data sharing.\n", + "\n", + "\n", + "**3. Accelerated innovation**\n", + "\n", + "The platform enables the acceleration of digital\n", + "transformation, modernization and cloud migration\n", + "initiatives, fostering new growth opportunities\n", + "and driving innovation for improved customer and\n", + "workforce experiences.\n", + "\n", + "\n", + "To help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Reference Architecture for Smart Claims**\n", + "\n", + "\n", + "**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n", + "\n", + "or incrementally through change data capture (CDC). These\n", + "\n", + "include structured and unstructured data sets like images, text,\n", + "\n", + "and video, such as IoT sensor data, operational data like claims\n", + "\n", + "and policies, and on-prem or third-party data such as from\n", + "\n", + "credit bureaus, weather, and driving records. Partner Connect\n", + "\n", + "offers a range of ingest tools from different vendors that you can\n", + "\n", + "directly use from the Databricks portal.\n", + "\n", + "\n", + "**2.** \u0007Delta Live Tables (DLT) is the preferred ETL\n", + "\n", + "path to transform the data based on business\n", + "\n", + "requirements. All the data resides in cloud storage,\n", + "\n", + "where Delta refines it into Bronze, Silver and Gold\n", + "\n", + "zones of a medallion pipeline blueprint. Databricks\n", + "\n", + "Workflows provide orchestration of the various\n", + "\n", + "dependent tasks, with advanced capabilities like\n", + "\n", + "\n", + "**3.** \u0007Databricks SQL, with Photon\n", + "\n", + "and serverless options, caters\n", + "\n", + "to BI consumption use cases to\n", + "\n", + "refresh a dashboard monitoring\n", + "\n", + "key metrics and KPIs, with\n", + "\n", + "query history and alerts on\n", + "\n", + "critical events.\n", + "\n", + "\n", + "**4.** \u0007Databricks ML Runtime,\n", + "\n", + "MLFlow, along with\n", + "\n", + "Feature Store, Auto ML,\n", + "\n", + "and real-time Model\n", + "\n", + "Serving enable ML\n", + "\n", + "use cases to provide\n", + "\n", + "\n", + "**5.** \u0007Delta Sharing provides\n", + "\n", + "a secure and governed\n", + "\n", + "way of sharing data\n", + "\n", + "internally and externally\n", + "\n", + "without copying it,\n", + "\n", + "using Unity Catalog.\n", + "\n", + "\n", + "predictive insights.\n", + "\n", + "\n", + "retry, repair and job status notifications.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Secure data sharing with Delta Lake**\n", + "\n", + "At the heart of Databricks Lakehouse for Insurance is a technology that allows insurers to overcome the trade-offs between speed and accuracy. Technologies like Delta\n", + "Lake enable the lakehouse, which combines the strengths of data warehouses and data lakes, to directly address these challenges. 
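To make step 2 of the reference architecture above concrete, here is a minimal Delta Live Tables sketch that refines raw claims records from a Bronze to a Silver table. The landing path, table names and columns are hypothetical placeholders, not part of the reference solution.

```python
# Minimal Delta Live Tables sketch: Bronze -> Silver refinement of claims data.
# The landing path, table names and columns are hypothetical placeholders.
# `spark` is provided by the Delta Live Tables runtime.
import dlt
from pyspark.sql import functions as F


@dlt.table(comment="Raw claims files ingested incrementally from cloud storage (Bronze)")
def claims_bronze():
    return (
        spark.readStream.format("cloudFiles")          # Auto Loader for incremental ingestion
        .option("cloudFiles.format", "json")
        .load("/Volumes/main/insurance/raw_claims/")   # hypothetical landing path
    )


@dlt.table(comment="Cleaned claims ready for BI and ML (Silver)")
@dlt.expect_or_drop("valid_claim_id", "claim_id IS NOT NULL")  # declarative data quality expectation
def claims_silver():
    return (
        dlt.read_stream("claims_bronze")
        .withColumn("ingested_at", F.current_timestamp())
        .dropDuplicates(["claim_id"])
    )
```

A Databricks Workflows job (or the DLT pipeline itself) would then supply the orchestration, retry and notification behavior described above, while Databricks SQL and MLflow consume the resulting Silver and Gold tables.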
With Delta Lake, insurance providers can unify all their data — structured and unstructured, batch and real-time — in one centrally managed and governed location.

Once the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information. They can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases.

The Lakehouse Platform maintains one copy of data under centralized governance while serving business intelligence, streaming, data science/ML, data warehousing and orchestration workloads from that single copy.

-----

**K E Y U S E C A S E**

## Claims automation and transformation

**Overview**

Insurers are entering a new era of claims transformation, supported by evolving technological advancements and increasing data availability. Leveraging the Databricks Lakehouse, organizations can deal with the massive amount of structured and unstructured data coming in from different sources, in different formats and time frames. Every touchpoint in the claims journey — beginning even before an incident occurs — can be supported by a combination of technology and human intervention that seamlessly expedites the process.

**Business problem**

Missing data, or data that is “not in good order” and needs to be corrected before processing, leads to claims leakage and inefficient processes in triaging claims to the right resource.

**Solution/value with Databricks**

Enable triaging of claims and resources by leveraging big data processing and integrated ML and AI capabilities, including MLflow model lifecycle management.

**Business outcomes and benefits**

**•** Decrease in annual claims payout

**•** Increase in claim fraud detection/prevention

**•** Improve efficiencies by 15%

**“Applying AI as broadly, as aggressively and as enthusiastically as possible. No part of our business should be untouched by it.”**

— Masashi Namatame, Group Chief Digital Officer, Managing Executive Officer, Tokio Marine

**C U S T O M E R C A S E S T U D Y**

**Tokio Marine: Striving to become AI-driven**

Insurers of all types now routinely use AI models to drive underwriting, streamline claims processing and accelerate claims adjudication, protect against insurance fraud, and improve risk forecasting, for example. 
Tokio Marine —\n", + "Japan’s oldest insurance company, which has\n", + "done business since 1879 — has been applying\n", + "advanced uses of AI, particularly in its auto\n", + "insurance business, says Masashi Namatame,\n", + "Group Chief Digital Officer and Managing\n", + "Executive Officer at Tokio Marine: “To assess\n", + "collision damages, the company uses an AIbased computer vision solution to analyze\n", + "photos from accident scenes.” Comparing these\n", + "with what he describes as “thousands or even\n", + "millions” of photos of past analogous incidents,\n", + "the model produces liability assessments of the\n", + "parties involved and projects anticipated repair\n", + "costs. AI has also provided the company with\n", + "tangible benefits in online sales — especially in\n", + "personalized product recommendations and\n", + "contract writing, according to Namatame. Read\n", + "the case study in the [MIT CIO vision 2025 report](https://www.databricks.com/resources/whitepaper/mit-cio-vision-2025) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "## Dynamic pricing and underwriting\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "In modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\n", + "carriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\n", + "This involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\n", + "geolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\n", + "offers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n", + "\n", + "**Business problem**\n", + "\n", + "Actuaries are spending valuable time on low-value activities, which hampers agility and advanced analytical\n", + "capabilities in pricing and underwriting, hindering improvements in risk and pricing modeling.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "**•** Unified cloud-native platform\n", + "\n", + "**•** Scalability for ingesting IoT data from millions of trips, expanding the customer base\n", + "\n", + "**•** Reduced total cost of ownership compared to legacy Hadoop systems\n", + "\n", + "**•** Usage-based pricing, leading to lower premiums for customers and reduced risk for insurance carriers, thereby\n", + "lowering loss ratios\n", + "\n", + "**•** Enables the creation of a digitally enabled, end-to-end underwriting experience\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**American financial services**\n", + "**mutual organization**\n", + "\n", + "This organization aimed to leverage the vast\n", + "amounts of structured and unstructured data\n", + "it collected to enhance its underwriting and\n", + "decision-making processes, enabling greater\n", + "efficiency and effectiveness. However, the\n", + "company’s legacy infrastructure struggled\n", + "to scale with the increasing data volume and\n", + "processing demands, limiting its ability to\n", + "analyze the data and derive actionable insights.\n", + "\n", + "With Databricks, the insurer centralized\n", + "everything on one unified Lakehouse platform,\n", + "\n", + "supporting all operational and analytical\n", + "use cases. 
This allowed them to analyze\n", + "broader sets of data for superior underwriting\n", + "performance and create a digitally empowered,\n", + "end-to-end underwriting experience.\n", + "\n", + "\n", + "\n", + "**•** Improve competitive position\n", + "\n", + "**•** Decrease combined ratio\n", + "\n", + "**•** 15% improvement in efficiencies\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "## Anomaly detection and fraudulent claims\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**One of the largest U.S.**\n", + "**insurance companies and a**\n", + "**leading small business insurer**\n", + "\n", + "The increasing availability of data and market\n", + "competition challenge insurance providers to\n", + "offer better pricing to their customers. This\n", + "U.S.-based insurer, with hundreds of millions of\n", + "insurance records to analyze for downstream\n", + "ML, realized that their legacy batch analysis\n", + "process was slow and inaccurate, providing\n", + "limited insight for predicting the frequency\n", + "and severity of claims. With Databricks, they\n", + "were able to scale up the use of deep learning\n", + "models, resulting in more accurate pricing\n", + "predictions and increased revenue from\n", + "claims. By leveraging Databricks Lakehouse,\n", + "they harmonized data, analytics and AI at\n", + "scale, enabling accurate pricing predictions\n", + "and supporting various use cases from vehicle\n", + "telematics to actuarial modeling.\n", + "\n", + "\n", + "Fraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\n", + "American consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\n", + "in 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\n", + "change to support new channels and services, offering transactional features and facilitating payments through\n", + "digital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\n", + "consumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\n", + "models. It often involves a complex decision science process that combines a rules engine with a robust and\n", + "scalable machine learning platform.\n", + "\n", + "**Business problem**\n", + "\n", + "Insurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Modernized approaches in insurance require full digital transformation, including the adoption of usagebased pricing to reduce premiums. 
Insurance providers now consume data from the largest mobile telematics\n", + "providers (e.g., CMT) to obtain granular sensor and trip summaries for users of online insurance applications.\n", + "This data is crucial not only for pricing but also for underwriting scenarios to mitigate risks for carriers.\n", + "\n", + "**$1 of fraud costs companies 3.36x in chargeback,**\n", + "**replacement and operational costs**\n", + "\n", + "\n", + "[Lexis Nexis](https://risk.lexisnexis.com/insights-resources/research/2020-true-cost-of-fraud-retail)\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "\n", + "## Customer 360 and hyper-personalization\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Winning the hearts and minds of your customers\n", + "starts with personalizing the user experience. The\n", + "ability to offer complementary products to meet\n", + "the needs of your customers lets you build deeper\n", + "relationships with them and engender their loyalty.\n", + "In addition, a better understanding of the finer\n", + "details within accounts allows you to offer them\n", + "more personalized products. To do this, you need\n", + "360-degree customer views, which requires you to\n", + "locate and consolidate all your customers’ contact\n", + "data from every digital tool that you use and house\n", + "it in one central location. With Databricks Lakehouse,\n", + "insurers can “hyper-personalize,” increase\n", + "cross-sell/upsell opportunities, enhance customer\n", + "360 and bring new products to market faster.\n", + "\n", + "**Business problem**\n", + "\n", + "The inability to reconcile customer records across\n", + "different lines of business limits real-time customer\n", + "insights necessary for upselling and cross-selling.\n", + "Siloed data makes it challenging to create accurate\n", + "and comprehensive customer profiles, resulting in\n", + "suboptimal recommendations for the next best action.\n", + "\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Databricks provides the tools needed to process\n", + "large volumes of data and determine the next best\n", + "action at any point in the customer journey.\n", + "\n", + "**•** Eliminates data silos by unifying all customer data,\n", + "including basic information, transactional data,\n", + "online behavior/purchase history, etc., to create\n", + "complete customer profiles\n", + "\n", + "**•** Integrated data security ensures that security\n", + "measures are incorporated at every layer of the\n", + "Databricks Lakehouse Platform\n", + "\n", + "**•** Delta improves data quality, providing a single\n", + "source of truth for real-time streams and ensuring\n", + "reliable and high-quality data for data teams\n", + "\n", + "**•** Integrated ML and AI capabilities utilize AI to\n", + "create self-optimizing ML models that determine\n", + "the next best step for each customer\n", + "\n", + "**•** MLflow model lifecycle management helps manage\n", + "the entire machine learning lifecycle reliably,\n", + "securely and at scale\n", + "\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "**•** Use AI, ML, automation and real-time data to\n", + "gain deeper customer insights and understand\n", + "their needs\n", + "\n", + "**•** Improve competitive positioning\n", + "\n", + "**•** Enhance the customer experience\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**160-year-old U.S.**\n", + "**insurance company**\n", + "\n", + "This insurance provider underwent a significant\n", 
+ "digital transformation to provide a more\n", + "personalized financial services experience to\n", + "its 10,000 advisors and millions of customers\n", + "across various touchpoints. Recognizing the\n", + "importance of becoming data-driven, the\n", + "company leveraged Databricks in its client\n", + "360 platform to aggregate transactional and\n", + "behavioral data, along with core attributes,\n", + "providing business users with next-best-action\n", + "recommendations for seamless customer\n", + "engagement.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Global Regulatory Impact in Insurance\n", + "\n", + "\n", + "**Navigating global regulations**\n", + "**with technical implementation**\n", + "\n", + "Digital innovation continues to reshape the insurance sector. The pace and scale\n", + "of technological change are likely to increase due to factors such as artificial\n", + "intelligence (AI), cloud computing, and the entry of new players like insurtechs,\n", + "e-tailers, and manufacturers from outside the insurance industry.\n", + "\n", + "To succeed and thrive in today’s economic environment, insurers should prioritize\n", + "upgrading their infrastructure and technology, rather than solely focusing on\n", + "transforming operations. For example, migrating from on-premises systems to the\n", + "cloud can bring significant benefits, according to global consultancy [Deloitte](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf) [.](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf)\n", + "\n", + "As insurers upgrade their compliance processes to meet new global regulations,\n", + "such as IFRS 17 and LDTI, the impact of regulatory updates becomes more\n", + "complex for organizations operating across multiple jurisdictions. Instead of\n", + "merely responding to regulatory and industry requirements, insurance companies\n", + "should make data-focused investments that help them anticipate and meet the\n", + "expectations of distributors and policyholders.\n", + "\n", + "\n", + "**IFRS-17**\n", + "\n", + "IFRS 17 is an International Finance Reporting Standard (IFRS) for\n", + "insurance contracts. IFRS 17 aims to standardize insurance accounting\n", + "by providing consistent principles for all facets of accounting for\n", + "insurance contracts. IFRS 17 removes existing inconsistencies so\n", + "analysts, investors and others can more easily compare companies,\n", + "contracts and industries.\n", + "\n", + "**LDTI for long-duration contracts**\n", + "\n", + "The Financial Accounting Standards Board long-duration targeted\n", + "improvements (LDTI) introduced changes to the U.S. GAAP accounting\n", + "model to simplify and improve the financial reporting of long-duration\n", + "contracts, including providing financial statement users with more\n", + "timely and relevant information about those contracts.\n", + "\n", + "\n", + "It is crucial for insurers to redirect their focus toward developing advanced data\n", + "management and utilization capabilities that offer better insights and improved\n", + "performance. 
These investments serve as not only a foundation for regulatory\n", + "compliance but also a starting point for more comprehensive and proactive\n", + "transformation initiatives.\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N D U S T R Y S O L U T I O N S**\n", + "\n", + "## Get Started With Accelerators, Brickbuilders and Enablers\n", + "\n", + "Insurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n", + "\n", + "**Adoption challenges**\n", + "\n", + "\n", + "Numerous challenges hinder organizations from developing and implementing the\n", + "necessary technical solutions to enhance their operational effectiveness, increase\n", + "revenue, and stay competitive. These challenges include:\n", + "\n", + "**•** Lack of technical skills (data scientists/data engineers): Companies often\n", + "struggle to find employees proficient in Python or Scala, or individuals who\n", + "possess extensive experience in data science.\n", + "\n", + "\n", + "\n", + "**•** Business problems require in-depth data science and industry knowledge:\n", + "Businesses seek solutions tailored to address specific problems, rather than\n", + "generic technical features.\n", + "\n", + "**•** Companies seek actionable insights: Organizations prefer readily applicable\n", + "patterns that can be quickly implemented, rather than custom data science\n", + "solutions that come with potential costs and risks of implementation failure.\n", + "\n", + "\n", + "**What are accelerators/enablers?**\n", + "\n", + "\n", + "**Solution Accelerators**\n", + "\n", + "Save hours on discovery, design, development and\n", + "testing with Databricks Solution Accelerators. Our\n", + "purpose-built guides, including fully functional\n", + "notebooks and best practices, expedite results for\n", + "your most common and high-impact use cases. With\n", + "these accelerators, you can go from idea to proof of\n", + "concept (PoC) in as little as two weeks.\n", + "\n", + "\n", + "**Brickbuilders**\n", + "\n", + "Brickbuilder Solutions are data and AI solutions\n", + "designed by leading consulting companies to\n", + "address industry-specific business requirements.\n", + "Built on the Databricks Lakehouse Platform and\n", + "backed by the industry experience of these\n", + "consultancies, businesses can have confidence\n", + "in solutions tailored to their specific use cases.\n", + "Brickbuilder Solutions can be implemented at any\n", + "stage of the customer journey.\n", + "\n", + "\n", + "**Solution Enablers**\n", + "\n", + "Solution enablers consist of targeted collections\n", + "of notebooks and materials, such as webinars and\n", + "blog posts, designed to support larger solutions.\n", + "They aim to solve pain points or address specific\n", + "layers of business capabilities, such as resolving data\n", + "ingestion challenges.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Get Started With Industry Solutions\n", + "\n", + "\n", + "**Claims transformation:**\n", + "**automation and fraud prevention**\n", + "\n", + "Insurers are entering a new era of claims transformation, supported by evolving\n", + "technological advancements and growing data availability. The end-to-end claims\n", + "process, from extracting relevant information from documentation submitted\n", + "when filing a claim to triaging and routing claims and the underwriting process,\n", + "is ripe for digital transformation. 
By leveraging the Databricks Lakehouse, organizations can handle millions of data points coming in different formats and time frames, from various sources, at an unprecedented volume. Every touchpoint in the claims journey, starting even before an incident occurs, will be supported by a combination of technology and human intervention that seamlessly expedites the process. Personalizing the claims experience by anticipating needs, providing real-time status alerts and reducing friction in the process increases customer loyalty and retention.

**Customer/Partner Successes**

**Accelerate underwriting through collaboration and efficient ML**

A leading P&C insurer took full advantage of the MongoDB and Databricks integration, leveraging both platforms to foster collaboration between their data and developer teams. The integration provides a more natural development experience for Spark users and exposes all of Spark’s libraries. This allows MongoDB data to be materialized as DataFrames and data sets for analysis using machine learning, graph, streaming and SQL APIs. The insurer also benefits from automatic schema inference. With this integration, the insurer was able to train and observe their ML models (MongoDB Atlas Charts) more efficiently and incorporate them into business applications.

As a result, crucial underwriting processes that previously took days are now executed in seconds. In addition to the time and cost savings, the company can provide a more immediate response to customers within its digital experience platform.

**“Claims processing is the process whereby an insurer receives, verifies and processes a claim report submitted by a policyholder. It accounts for** **[70% of a property insurer’s expenses](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)** **and is a critical component of customer satisfaction with their carrier.”**

**Deloitte,** [”Preserving the human touch in insurance claims transformations”](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)

**Learn more:**

**•** **[F R A U D D E T E C T I O N](https://notebooks.databricks.com/notebooks/FSI/geospatial_analysis/index.html#geospatial_analysis_1-0.html)**

**•** **[C L A I M S A U T O M A T I O N E N A B L E R](https://www.databricks.com/blog/2023/02/01/design-patterns-batch-processing-financial-services.html)**

**•** **[C A R C L A I M S I M A G E C L A S S I F I C A T I O N](https://github.com/databricks-industry-solutions/car-classification)**

**•** **[S M A R T C L A I M S : C L A I M S A U T O M A T I O N](https://www.databricks.com/blog/2023/04/03/claims-automation-databricks-lakehouse.html)**

**Watch video:** [Laying the Foundation for Claims Automation](https://www.youtube.com/watch?v=LkckhRjezxs)

-----

**Risk management:**
**dynamic pricing and underwriting**

Modernized approaches at insurance carriers require a full 
digital transformation,\n", + "and one aspect of this transformation involves dynamic pricing and underwriting\n", + "to reduce premiums. Insurance providers are now consuming data from the largest\n", + "mobile telematics providers to obtain the most granular sensor and trip summaries\n", + "for users of online insurance applications. Not only is this data critical for pricing,\n", + "but it is also critical for underwriting scenarios to de-risk carriers. Dynamic pricing\n", + "and underwriting automate routine tasks and provide teams with alternative\n", + "data sources to empower actuarial and underwriting professionals to become\n", + "“exponential.” This allows teams to focus on key aspects of risk selection and\n", + "analysis that drive competitive advantage and market differentiation. By leveraging\n", + "personalized data points, insurers can deliver near real-time underwriting\n", + "decisions for life insurance applicants, reducing policy abandonment and costs.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**Automated extraction of medical risk factors for life insurance underwriting**\n", + "**(John Snow Labs)**\n", + "\n", + "Life insurance underwriting considers an applicant’s medical risk factors in\n", + "addition to mortality risk characteristics. These risk factors are often found\n", + "in free-text documents. New insurance-specific natural language processing\n", + "(NLP) models can automatically extract relevant medical history and risk factors\n", + "from such documents. Forward-thinking companies are embracing accelerated\n", + "underwriting, which utilizes new data along with algorithmic tools and modeling\n", + "techniques to quickly assess and group applicants without requiring bodily fluids,\n", + "physician’s notes, and so on. This joint Solution Accelerator from Databricks and\n", + "John Snow Labs simplifies the implementation of this approach, creating a faster,\n", + "more consistent, and scalable underwriting experience.\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[R I S K M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/market-risk)**\n", + "\n", + "**Risk is highly influenced by behavior, and 80% of morbidity in**\n", + "\n", + "\n", + "**healthcare risk is driven by factors such as smoking, drinking**\n", + "\n", + "**alcohol, physical activity and diet. In the case of driving,**\n", + "\n", + "**60% of fatal accidents are a result of behavior alone. 
If insurers**\n", + "\n", + "**can change customer behaviors and help them make better**\n", + "\n", + "**choices, then the risk curve shifts.”**\n", + "\n", + "\n", + "**[A C T U A R I A L W O R K B E N C H](https://github.com/koernigo/databricksActuarialWorkbench)**\n", + "\n", + "**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n", + "\n", + "**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "\n", + "\n", + "[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "\n", + "\n", + "**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Product distribution:**\n", + "**segmentation and personalization**\n", + "\n", + "The most forward-thinking and data-driven insurers are\n", + "focused on achieving personalization at scale. They are\n", + "exploring new partnerships and business models to create\n", + "integrated, value-added experiences that prioritize the\n", + "overall health and financial wellness of their customers,\n", + "rather than just their insurance needs. These insurers\n", + "are investing in new data sources, analytics platforms,\n", + "and artificial intelligence (AI)-powered decision engines\n", + "that enable them to connect producers with like-minded\n", + "customers or engage customers with enticing offers\n", + "and actionable steps based on their previous choices.\n", + "The outcome is more efficient and effective service\n", + "from producers, trusted and convenient interactions for\n", + "consumers, and increased customer engagement and\n", + "growth for insurers in an increasingly digital-oriented world.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n", + "\n", + "[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\n", + "insurance companies. It enables them to complete, unify and comprehensively capture customer profiles\n", + "using a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n", + "360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\n", + "as call center recordings. 
By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n", + "360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n", + "\n", + "With Persona 360, you can:\n", + "\n", + "**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n", + "1,695+ attributes and segments\n", + "\n", + "**•** Seamlessly connect the workflows of data scientists (via Databricks) and marketing specialists (via\n", + "Persona 360), making it easy for data experts to incorporate their findings and enabling nontechnical\n", + "users to comprehend and activate the data\n", + "\n", + "**•** Leverage tools that can increase engagement by 37% and conversion rates by 45% through\n", + "personalized campaigns\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[N E X T B E S T O F F E R](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n", + "\n", + "**Demand for hyper-personalized and real-time risk protection**\n", + "\n", + "\n", + "**requires broad adoption of artificial** **intelligence (AI), machine**\n", + "\n", + "**learning and digital platforms.**\n", + "\n", + "**EY,** [”Nine customer types defining the next wave of insurance”](https://www.ey.com/en_us/insurance/nine-customer-types-defining-the-next-wave-of-insurance)\n", + "\n", + "\n", + "**[C U S T O M E R L I F E T I M E VA L U E (C LT V )](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "**[C U S T O M E R S E G M E N TAT I O N](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n", + "\n", + "\n", + "[The Impact of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[Analytics and AI](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[on the Future of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[Insurance](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "\n", + "\n", + "**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n", + "\n", + "**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n", + "**by insurance provider type**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Product distribution Personalization Given the volume of data required, the complexity of operating AI from experiments (POCs) to enterprise scale data pipelines, combined with strict data and privacy regulations on the use of customer data on cloud infrastructure, the Lakehouse has quickly emerged as the strategic platform to accelerate digital transformation.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Next best offer Customers have different needs at each stage of the buyer journey. Choose the right recommender model for your scenario to find the next best action at any given point in the customer journey.|||||\n", + "|Customer Analyzing customer lifetime value is critical to improving marketing decision-making, campaign ROI and lifetime value customer retention. 
Learn how to identify your most valuable customers with Databricks’ Customer Lifetime Value Solution Accelerator.|||||\n", + "|Churn prediction Earning loyalty and getting the largest number of customers to stick around is something that is in your best interest as well as your customers’ best interest. Develop an understanding of how a customer lifetime should progress and examine where in that lifetime journey customers are likely to churn so you can effectively manage retention and reduce your churn rate.|||||\n", + "|Customer Personalization is touted as the gold standard of customer engagement. Using sales data, campaigns segmentation and promotions systems, this solution helps you create advanced customer segments to drive better purchasing predictions based on behaviors.|||||\n", + "|Reputation Harness the Databricks Lakehouse Platform to build a risk engine that can analyze customer feedback management securely and in realtime to power an early assessment of reputation risks.|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "|Anomaly detection and fraudulent claims Anomaly Anomaly detection is the technique of identifying rare events or observations which can raise suspicions detection by being statistically different from the rest of the observations.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Fraudulent A large-scale fraud prevention system is usually a complex ecosystem made of various controls (all with claims critical SLAs), a mix of traditional rules and AI and a patchwork of technologies between proprietary on- premises systems and open source cloud technologies.|||||\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Underwriting Machine learning provides a decision support system for underwriting processes to help you improve your automation underwriting outcomes.|||||\n", + "|Actuarial You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine workbench Learning (ML) for underwriting, claims forecasting, etc.|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "|Claims transformation Anomaly detection Preempt fraud with rule-based patterns and select ML algorithms for reliable fraud detection. Use and claims fraud anomaly detection and fraud prediction to respond to bad actors rapidly.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Car claims image By applying transfer learning on pre-trained neural networks, Databricks helps insurance companies classification kickstart their AI/computer vision journeys toward claim assessment and damage estimation.|||||\n", + "|Claims automation Insurers are entering a new era of claims transformation, supported by evolving technological advancement and growing data availability. 
You can simplify and scale your claims lifecycle with data and AI.|||||\n", + "|Medical claims Using advanced natural language processing, you can extract text from medical records and enable automation.|||||\n", + "|Guidewire claims Data ingestion enabler for distributed ledger technology that has predefined schemas and mapping to/ center data from Guidewire data format. integration|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "## Conclusion\n", + "\n", + "Today, data and AI are at the center of every innovation in the insurance industry. Databricks Lakehouse for\n", + "Insurance empowers insurance providers to leverage the potential of data and analytics to address strategic\n", + "challenges, make informed decisions, mitigate risks, enhance customer experiences, and accelerate innovation.\n", + "\n", + "**Customers that innovate with Databricks Lakehouse for Insurance**\n", + "\n", + "Some of the top property and casualty, life and health insurance companies and reinsurers in the world turn\n", + "to Databricks Lakehouse to harness the power of data and analytics to solve strategic challenges and make\n", + "smarter decisions that minimize risk, deliver superior customer experiences and fast-track innovation.\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000 organizations worldwide — including\n", + "\n", + "Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco,\n", + "\n", + "with offices around the globe. Founded by the original creators of Apache Spark ™ , Delta\n", + "\n", + "Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest\n", + "\n", + "problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , LinkedIn and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "#### Begin your journey with a free trial of Databricks Lakehouse for Insurance and start developing advanced data and AI applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n", + "\n", + "###### Contact us for a personalized demo at:\n", + " dbricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
```
TECHNICAL GUIDE

```

# Solving Common Data Challenges

#### Startups and Digital Native Businesses

-----

### Table of Contents

**01 CHALLENGE:** Creating a unified data architecture for data quality, governance and efficiency

**02 CHALLENGE:** Building a data architecture to support scale and performance

**03 CHALLENGE:** Building effective machine learning operations

**04 SUMMARY:** The Databricks Lakehouse Platform addresses these challenges

-----

**I N T R O D U C T I O N**

This guide shares how the lakehouse architecture can increase productivity and cost-efficiently support all your data, analytics and AI workloads, and flexibly scale with the pace of growth for your company. Read the entire guide or dive straight into a specific challenge.

With the advent of cloud infrastructure, a new generation of startups has rapidly built and scaled their businesses. The use of cloud infrastructure, once seen as innovative, has now become table stakes. The differentiator for the fastest-moving startups and digital natives now comes from the effective use of data at scale, primarily analytics and AI. Digital natives — defined as fast-moving, lean and technically savvy, born-in-the-cloud organizations — are beginning to focus on new data-driven use cases such as real-time machine learning and personalized customer experiences.

To pursue these new data-intensive use cases and initiatives, organizations must look beyond the technologies that delivered them to this point in time. Over time, these technologies, such as transactional databases, streaming/batch pipelines and first-generation analytics engines, have led to brittle systems that are not cost-efficient and require time-consuming administration and engineering toil. In addition to growing maintenance needs, data is often stored in disparate locations and formats, with little or no governance, making real-time use cases, analytics and AI difficult or impossible.

This guide examines some of the biggest data challenges and solutions for startups and for scaling digital native businesses that have reached the point where an end-to-end modern data platform is a smart investment. Some key considerations include:

**Consolidating on a unified data platform**
As mentioned above, siloed data storage and management add administrative and financial cost. You can benefit significantly when you unify your data in one location with a flexible architecture that scales with your needs and delivers performance for future success. For this, you will want an open platform that supports all your data, including batch and streaming workloads, data analytics and machine learning. With data unification, you create a more efficient, integrated approach to ingesting, cleaning and organizing your data. 
You also need automation to make data analysis\n", + "easier for the nontechnical users in the company. But broader data access also\n", + "means more focus on security, privacy, compliance and access control, which can\n", + "create overhead for a growing.\n", + "\n", + "**Scaling up capacity and increasing performance**\n", + "**and usability of the data solutions**\n", + "Data teams at growing digital native organizations find it time intensive and costly to\n", + "handle the growing volume and velocity of their data being ingested from multiple\n", + "sources, across multiple clouds. You now need a unified and simplified platform that\n", + "can instantly scale up capacity and deliver more computing power on demand to\n", + "free up your data teams to produce outputs more quickly. This lowers the total cost\n", + "for the overall infrastructure by eliminating redundant licensing, infrastructure and\n", + "administration costs.\n", + "\n", + "**Building effective machine learning operations**\n", + "For data teams beginning their machine learning journeys, the challenge of training\n", + "data models can increase in management complexity. Many teams with disparate\n", + "coding needs for the entire model lifecycle suffer inefficiencies from transferring\n", + "data and code across many separate services. To build and manage effective\n", + "ML operations, consider an end-to-end MLOps environment that brings all data\n", + "together in one place and incorporates managed services for experiment tracking,\n", + "model training, feature development and feature and model serving.\n", + "\n", + "\n", + "-----\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 01\n", + "\n", + "### Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "As cloud-born companies grow, data volumes rapidly increase, leading to new\n", + "challenges and use cases. Among the challenges:\n", + "\n", + "\n", + "Application stacks optimized for transaction\n", + "use cases aren’t able to handle the volume,\n", + "velocity and variety of data that modern data\n", + "teams require. For example, this leads to query\n", + "performance issues as data volume grows.\n", + "\n", + "Data silos develop as each team within an\n", + "organization chooses different ETL/ELT and\n", + "storage solutions for their needs. As the\n", + "organization grows and changes, these pipelines\n", + "and storage solutions become brittle, hard to\n", + "maintain and nearly impossible to integrate.\n", + "\n", + "\n", + "These data silos lead to discoverability,\n", + "integration and access issues, which prevent\n", + "teams from leveraging the full value of the\n", + "organization’s available data.\n", + "\n", + "Data governance is hard. Disparate ETL/ELT\n", + "and storage solutions lead to governance,\n", + "compliance, auditability and access control\n", + "challenges, which expose organizations to\n", + "tremendous risk.\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides\n", + "a unified set of tools for building, deploying,\n", + "sharing and maintaining data solutions at scale.\n", + "It integrates with cloud storage and the security\n", + "in your cloud account, manages and deploys\n", + "cloud infrastructure on your behalf. 
Your data\n", + "practitioners no longer need separate storage\n", + "systems for their data. And you don’t have to rely\n", + "on your cloud provider for security. The lakehouse\n", + "has its own robust security built into the platform.\n", + "\n", + "\n", + "For all the reasons above, the most\n", + "consistent advice from successful data\n", + "practitioners is to create a “single source\n", + "of truth” by unifying all data on a single\n", + "platform. With the Databricks Lakehouse\n", + "Platform, you can unify all your data on one\n", + "platform, reducing data infrastructure costs\n", + "and compute. You don’t need excess data\n", + "copies and you can retire expensive\n", + "legacy infrastructure.\n", + "```\n", + " 01\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: GRAMMARLY\n", + "\n", + "### Helping 30 million people and 50,000 teams communicate more effectively\n", + "\n", + "```\n", + "\n", + "While its business is based on analytics, [Grammarly](http://www.grammarly.com)\n", + "\n", + "for many years relied on a homegrown analytics\n", + "\n", + "platform to drive its AI writing assistant to\n", + "\n", + "help users improve multiple aspects of written\n", + "\n", + "communications. As teams developed their own\n", + "\n", + "requirements, data silos inevitably emerged as\n", + "\n", + "different business areas implemented analytics\n", + "\n", + "tools individually.\n", + "\n", + "“Every team decided to solve their analytics\n", + "\n", + "needs in the best way they saw fit,” said Chris\n", + "\n", + "Locklin, Engineering Manager, Data Platforms,\n", + "\n", + "at Grammarly. “That created challenges in\n", + "\n", + "consistency and knowing which data set\n", + "\n", + "was correct.”\n", + "\n", + "To better scale and improve data storage and\n", + "\n", + "query capabilities, Grammarly brought all its\n", + "\n", + "analytical data into the Databricks Lakehouse\n", + "\n", + "Platform and created a central hub for all data\n", + "\n", + "producers and consumers across the company.\n", + "\n", + "Grammarly had several goals with the lakehouse,\n", + "\n", + "including better access control, security, ingestion\n", + "\n", + "\n", + "flexibility, reducing costs and fueling collaboration. “Access control in a\n", + "\n", + "distributed file system is difficult, and it only gets more complicated as\n", + "\n", + "you ingest more data sources,” said Locklin. To manage access control,\n", + "\n", + "enable end-to-end observability and monitor data quality, Grammarly\n", + "\n", + "relies on the data lineage capabilities within Unity Catalog. “Data lineage\n", + "\n", + "allows us to effectively monitor usage of our data and ensure it upholds\n", + "\n", + "the standards we set as a data platform team,” said Locklin. “Lineage is\n", + "\n", + "the last crucial piece for access control.”\n", + "\n", + "Data analysts within Grammarly now have a consolidated interface for\n", + "\n", + "analytics, which leads to a single source of truth and confidence in the\n", + "\n", + "accuracy and availability of all data managed by the data platform team.\n", + "\n", + "Having a consistent data source across the company also resulted in\n", + "\n", + "greater speed and efficiency and reduced costs. Data practitioners\n", + "\n", + "experienced 110% faster querying at 10% of the cost to ingest compared\n", + "\n", + "to a data warehouse. 
Grammarly can now make its 5 billion daily events available for analytics in under 15 minutes rather than 4 hours. Migrating off its rigid legacy infrastructure gave Grammarly the flexibility to do more and the confidence that the platform will evolve with its needs. Grammarly is now able to sustain a flexible, scalable and highly secure analytics platform that helps 30 million people and 50,000 teams worldwide write more effectively every day.

[Read the full story here.](https://www.databricks.com/customers/grammarly)

-----

###### How to unify the data infrastructure with Databricks

The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture is composed of two primary parts:

- the infrastructure to deploy, configure and manage the platform and services

- the customer-owned infrastructure managed in collaboration by Databricks and the customer

You can build a Databricks workspace by configuring secure integrations between the Databricks platform and your cloud account, and then Databricks deploys temporary Apache Spark™/Photon clusters using cloud resources in your account to process and store data in object storage and other integrated services you control. Here are three steps to get started with the Databricks Lakehouse Platform:

**Understand the architecture**
The lakehouse provides a unified architecture, meaning that all data is stored in the same accessible place. The diagram shows how data comes in from sources like a customer relationship management (CRM) system, an enterprise resource planning (ERP) system, websites or unstructured customer emails.

**Optimize the storage layer**
All data is stored in cloud storage while Databricks provides tooling to assist with ingestion, such as Auto Loader, and we recommend [open-source](https://delta.io/) [Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice. Delta Lake is the optimized storage layer that provides the foundation for storing data and tables in the Databricks Lakehouse Platform. Having all your data in the same optimized, open storage keeps all your use cases in the same place, thus enabling collaboration and removing software tool overhead.

The lakehouse handles all varieties of data (structured, semi-structured, unstructured), as well as all velocities of data (streaming, batch or somewhere in the middle).

[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)

-----

The Databricks Lakehouse organizes data stored with Delta Lake in cloud object storage with familiar concepts like database, tables and views. Delta Lake extends Parquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and scalable metadata handling. 
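As a minimal sketch of this storage pattern (the landing path, schema location and table name below are hypothetical), Auto Loader can land raw files in a Delta table, and that single table then serves both batch and streaming consumers:

```python
# Hypothetical sketch: ingest raw policy files with Auto Loader into a Delta table,
# then read the same table as a batch DataFrame and as a stream. Assumes a
# Databricks runtime where the `cloudFiles` (Auto Loader) source is available.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

(spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", "/tmp/schemas/policies_bronze")
    .load("/Volumes/main/insurance/raw_policies/")         # hypothetical landing path
    .writeStream
    .option("checkpointLocation", "/tmp/checkpoints/policies_bronze")
    .trigger(availableNow=True)                            # process available files, then stop
    .toTable("main.insurance.policies_bronze"))            # managed Delta table

batch_df = spark.read.table("main.insurance.policies_bronze")         # warehouse-style batch access
stream_df = spark.readStream.table("main.insurance.policies_bronze")  # streaming access, same copy
```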
Delta Lake is fully compatible with Apache Spark APIs,\n", + "and was developed for tight integration with Structured Streaming, allowing you to\n", + "easily use a single copy of data for both batch and streaming operations to provide\n", + "incremental processing at scale.This model combines many of the benefits of a data\n", + "warehouse with the scalability and flexibility of a data lake.\n", + "\n", + "To learn more about the optimized storage layer that provides the foundation for\n", + "storing data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n", + "[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n", + "\n", + "The first step in unifying your data architecture is setting up how data is to be\n", + "accessed and used across the organization. We’ll discuss this as a series of steps:\n", + "\n", + "**1** Set up governance with Unity Catalog\n", + "\n", + "**2** Grant secure access to the data\n", + "\n", + "\n", + "###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n", + " – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n", + "\n", + "[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "**3** Capture audit logs\n", + "\n", + "**4** View data lineage\n", + "\n", + "**5** Set up data sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Configure unified governance**\n", + "Databricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\n", + "means that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\n", + "is secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\n", + "personas and automatically captures user-level audit logs that record access to your data.\n", + "\n", + "Data stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\n", + "languages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n", + "\n", + "To set up Unity Catalog for your organization,\n", + "you do the following:\n", + "\n", + "\n", + "**1** Configure an S3 bucket and IAM role that\n", + "Unity Catalog can use to store and access\n", + "data in your AWS account.\n", + "\n", + "**2** Create a metastore for each region in\n", + "\n", + "which your organization operates, and\n", + "attach workspaces to the metastore. 
Each\n", + "workspace will have the same view of the\n", + "data you manage in Unity Catalog.\n", + "\n", + "\n", + "**3** If you have a new account, add users,\n", + "groups and service principals to your\n", + "Databricks account.\n", + "\n", + "**4** Next, create and grant access to\n", + "\n", + "catalogs, schemas and tables.\n", + "\n", + "\n", + "For complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How Unity Catalog works\n", + "\n", + "\n", + "You will notice that the hierarchy of primary data\n", + "objects in Unity Catalog flows from metastore to table:\n", + "\n", + "**Metastore** is the top-level container for metadata.\n", + "Each metastore exposes a three-level namespace\n", + "(catalog.schema.table) that organizes your data.\n", + "\n", + "\n", + "**Metastore** **Catalog** **Schemas**\n", + "\n", + "\n", + "**Views**\n", + "\n", + "**Managed**\n", + "**Tables**\n", + "\n", + "\n", + "**Catalog** is the first layer of the object hierarchy, used\n", + "to organize your data assets.\n", + "\n", + "\n", + "**Schemas** , also known as databases, are the second\n", + "layer of the object hierarchy and contain tables and\n", + "views.\n", + "\n", + "**Table** is the lowest level in the object hierarchy, and\n", + "tables can be external (stored in external locations in\n", + "your cloud storage of choice) or managed (stored in a\n", + "storage container in your cloud storage that you create\n", + "\n", + "expressly for Databricks). You can also create readonly **Views** from tables.\n", + "\n", + "\n", + "**External**\n", + "**tables**\n", + "\n", + "The diagram below represents the file system\n", + "hierarchy of a single storage bucket:\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog uses the identities in the Databricks\n", + "account to resolve users, service principals, and groups\n", + "and to enforce permissions. To configure identities in\n", + "the account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n", + "[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\n", + "service principals, and groups when you create\n", + "[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n", + "\n", + "Unity Catalog users, service principals, and groups\n", + "must also be added to workspaces to access Unity\n", + "Catalog data in a notebook, a Databricks SQL query,\n", + "Data Explorer or a REST API command. The assignment\n", + "of users, service principals, and groups to workspaces\n", + "is called identity federation. All workspaces attached\n", + "to a Unity Catalog metastore are enabled for identity\n", + "federation.\n", + "\n", + "Securable objects in Unity Catalog are hierarchical,\n", + "meaning that granting a privilege on a catalog or schema\n", + "automatically grants the privilege to all current and\n", + "future objects within the catalog or schema. 
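As a minimal sketch of that inheritance in practice — run from a notebook, with catalog, schema, table and group names all hypothetical — the schema-per-team pattern described next comes down to a couple of grants:

```
# Grants on the schema are inherited by all current and future tables inside it.
spark.sql("GRANT USE CATALOG ON CATALOG main TO `data-team-a`")
spark.sql("GRANT USE SCHEMA, CREATE TABLE ON SCHEMA main.team_a TO `data-team-a`")

# Tables created by the team stay private until explicitly shared with another group.
spark.sql("GRANT SELECT ON TABLE main.team_a.forecasts TO `analysts`")
```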
For more\n", + "on granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\n", + "A common scenario is to set up a schema per team\n", + "where only that team has USE SCHEMA and CREATE on\n", + "the schema. This means that any tables produced by\n", + "team members can only be shared within the team.\n", + "Data Explorer uses the privileges configured by Unity\n", + "Catalog administrators to ensure that users are only\n", + "able to see catalogs, databases, tables and views that\n", + "they have permission to query.\n", + "\n", + "\n", + "[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\n", + "many Unity Catalog features. Use Data Explorer to view\n", + "schema details, preview sample data, and see table\n", + "details and properties. Administrators can view and\n", + "change owners. Admins and data object owners can grant\n", + "and revoke permissions through this interface.\n", + "\n", + "**Set up secure access**\n", + "In Unity Catalog, data is secure by default. Initially, users\n", + "have no access to data in a metastore. Access can\n", + "be granted by either a metastore admin, the owner of\n", + "an object, or the owner of the catalog or schema that\n", + "contains the object. Securable objects in Unity Catalog\n", + "are hierarchical and privileges are inherited downward.\n", + "\n", + "Unity Catalog’s security model is based on standard ANSI\n", + "SQL and allows administrators to grant permissions in\n", + "their existing data lake using familiar syntax, at the level of\n", + "catalogs, databases (schema), tables and views. Privileges\n", + "and metastores are shared across workspaces, allowing\n", + "administrators to set secure permissions once against\n", + "\n", + "groups synced from identity providers and know that\n", + "end users only have access to the proper data in any\n", + "Databricks workspace they enter.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: BUTCHERBOX\n", + "\n", + "### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n", + "\n", + "```\n", + "\n", + "As a young e-commerce company,\n", + "\n", + "[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n", + "\n", + "customers’ needs change, which means it is\n", + "\n", + "constantly considering behavioral patterns,\n", + "\n", + "distribution center efficiency, a growing list of\n", + "\n", + "marketing and communication channels, and\n", + "\n", + "order processing systems.\n", + "\n", + "The meat and seafood subscription company\n", + "\n", + "collects data on hundreds of thousands\n", + "\n", + "of subscribers. 
It deployed the Databricks\n", + "\n", + "Lakehouse Platform to gain visibility across\n", + "\n", + "its diverse range of data systems and enable\n", + "\n", + "its analytics team to securely view and\n", + "\n", + "export data in the formats needed.\n", + "\n", + "With so much data feeding in from different\n", + "\n", + "sources — from email systems to its website\n", + "\n", + "— the data team at ButcherBox quickly\n", + "\n", + "discovered that data silos were a significant\n", + "\n", + "\n", + "“We knew we needed to migrate from our legacy data warehouse\n", + "\n", + "environment to a data analytics platform that would unify our\n", + "\n", + "data and make it easily accessible for quick analysis to improve\n", + "\n", + "supply chain operations, forecast demand and, most importantly,\n", + "\n", + "keep up with our growing customer base,” explained Jake Stone,\n", + "\n", + "Senior Manager, Business Analytics, at ButcherBox.\n", + "\n", + "The platform allows analysts to share builds and iterate on a\n", + "\n", + "project without getting into the code. Querying a table of 18\n", + "\n", + "billion rows would have been problematic with a traditional\n", + "\n", + "platform. With Databricks, ButcherBox can do it in three minutes.\n", + "\n", + "“Delta Lake provides us with a single source of truth for all of\n", + "\n", + "our data,” said Stone. “Now our data engineers are able to build\n", + "\n", + "reliable data pipelines that thread the needle on key topics such\n", + "\n", + "as inventory management, allowing us to identify in near real-\n", + "\n", + "time what our trends are so we can figure out how to effectively\n", + "\n", + "move inventory.”\n", + "\n", + "[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "problem because they blocked complete\n", + "\n", + "visibility into critical insights needed to make\n", + "\n", + "strategic and marketing decisions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Set up secure data sharing**\n", + "Databricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\n", + "to share data with other entities regardless of their\n", + "computing platforms. Delta Sharing is integrated with\n", + "Unity Catalog. Your data must be registered with Unity\n", + "Catalog to manage, govern, audit and track usage of the\n", + "shared data on the Lakehouse Platform. The primary\n", + "concepts of Delta Sharing are shares (read-only\n", + "collections of tables and table partitions to be shared)\n", + "and recipients (objects that associate an organization\n", + "with a credential or secure sharing identifier).\n", + "\n", + "As a data provider, you generate a token and share\n", + "it securely with the recipient. They use the token to\n", + "authenticate and get read access to the tables you’ve\n", + "included in the shares you’ve given them access\n", + "to. Recipients access the shared data in read-only\n", + "format. Whenever the data provider updates data\n", + "tables in their own Databricks account, the updates\n", + "appear in near real-time in the recipient’s system.\n", + "\n", + "\n", + "**Capture audit logs**\n", + "Unity Catalog captures an audit log of actions\n", + "performed against the metastore. To access audit\n", + "logs for Unity Catalog events, you must enable and\n", + "configure audit logs for your account. 
Audit logs for\n", + "each workspace and account-level activities are\n", + "delivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n", + "[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n", + "\n", + "**View data lineage**\n", + "You can use Unity Catalog to capture runtime data\n", + "lineage across queries in any language executed on\n", + "a Databricks cluster or SQL warehouse. Lineage can\n", + "be visualized in Data Explorer in near real-time and\n", + "retrieved with the Databricks REST API. Lineage is\n", + "aggregated across all workspaces attached to Unity\n", + "Catalog and captured down to the column level, and\n", + "includes notebooks, workflows and dashboards related\n", + "to the query. To understand the requirements and how\n", + "to capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n", + "[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n", + "\n", + "\n", + "Unity Catalog Metastore\n", + "\n", + "\n", + "Catalog\n", + "\n", + "\n", + "Data providers can use Databricks audit logging to\n", + "monitor the creation and modification of shares,\n", + "and recipients can monitor recipient activity on\n", + "shares. Data recipients who use shared data in a\n", + "Databricks account can use Databricks audit logging\n", + "to understand who is accessing which data.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n", + "\n", + "- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n", + "\n", + "- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n", + "\n", + "- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n", + "\n", + "- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "\n", + "\n", + "###### Key Takeaways\n", + "\n", + "- With the Databricks Lakehouse Platform, you can\n", + "unify and simplify all your data on one platform\n", + "to better scale and improve data storage and\n", + "query capabilities\n", + "\n", + "- The lakehouse helps reduce data infrastructure\n", + "and compute costs. 
You don’t need excess\n", + "data copies and can retire expensive legacy\n", + "infrastructure.\n", + "\n", + "\n", + "Leverage Delta Lake as the open format\n", + "storage layer to deliver reliability, security and\n", + "performance on your data lake — for both\n", + "streaming and batch operations — replacing\n", + "data silos with a single home for structured,\n", + "semi-structured and unstructured data\n", + "\n", + "With Unity Catalog you can centralize\n", + "governance for all data and AI assets including\n", + "files, tables, machine learning models and\n", + "dashboards in your lakehouse on any cloud\n", + "\n", + "The Databricks Lakehouse Platform is open\n", + "source with multicloud flexibility so that you can\n", + "use your data however and wherever you want —\n", + "no vendor lock-in\n", + "\n", + "\n", + "-----\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 02\n", + "\n", + "### Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "As modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\n", + "the increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\n", + "suddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\n", + "cost-effective. The relational databases and traditional data warehouses that met the needs of the businesses once\n", + "upon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n", + "\n", + "Here are some common challenges around managing data and performance at scale:\n", + "\n", + "\n", + "**Volume and velocity** — Exponentially\n", + "increasing data sources, and the speed at\n", + "which they capture and create data.\n", + "\n", + "**Latency requirements** — The demands of\n", + "downstream applications and users have\n", + "evolved (people want data and the results\n", + "from the data faster).\n", + "\n", + "\n", + "**Governance** — Cataloging, auditing, securing and\n", + "reporting on data is burdensome at scale when\n", + "using old systems not built with data access\n", + "controls and compliance in mind.\n", + "\n", + "**Multicloud** is really hard.\n", + "\n", + "\n", + "**Data storage** — Storing data in the wrong\n", + "format is slow to access, query and is\n", + "expensive at scale.\n", + "\n", + "\n", + "**Data format** — Supporting structured, semistructured and unstructured data formats\n", + "is now a requirement. Most data storage\n", + "solutions are designed to handle only one type\n", + "of data, requiring multiple products\n", + "to be stitched together.\n", + "\n", + "```\n", + "02\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Lakehouse solves scale and performance challenges\n", + "\n", + "\n", + "The solution for growing digital companies is a unified\n", + "and simplified platform that can instantly scale up\n", + "capacity to deliver more computing power on demand,\n", + "freeing up teams to go after the much-needed data\n", + "and produce outputs more quickly. With a lakehouse,\n", + "they can replace their data silos with a single home for\n", + "their structured, semi-structured and unstructured\n", + "data. 
Users and applications throughout the enterprise\n", + "environment can connect to the same single copy of\n", + "the data to drive diverse workloads.\n", + "\n", + "The lakehouse architecture is cost-efficient for\n", + "scaling, lowering the total cost of ownership for the\n", + "overall infrastructure by consolidating all data estate\n", + "and use cases onto a single platform and eliminating\n", + "redundant licensing, infrastructure and administration\n", + "costs. Unlike other warehouse options that can only\n", + "scale horizontally, the Databricks Lakehouse can scale\n", + "horizontally and vertically based on workload demands.\n", + "\n", + "With the Databricks Lakehouse, you can optimize the\n", + "compute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\n", + "research by the Barcelona Supercomputing Center.\n", + "And your data teams are more productive by focusing\n", + "on more strategic initiatives versus managing multiple\n", + "data solutions.\n", + "\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "### Driving into the future of electric transportation\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "```\n", + "\n", + "With more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n", + "\n", + "day, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n", + "\n", + "legacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n", + "\n", + "Before Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n", + "\n", + "decreased output, prevented collaboration and increased operational costs. Rivian chose to modernize its data\n", + "\n", + "infrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n", + "\n", + "downstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n", + "\n", + "actionable insights for different use cases, from predictive maintenance to smarter product development.\n", + "\n", + "“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n", + "\n", + "performant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n", + "\n", + "Wassym Bensaid, Vice President of Software Development at Rivian.\n", + "\n", + "For instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n", + "\n", + "accelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n", + "\n", + "roll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n", + "\n", + "connected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n", + "\n", + "smart features and the control that drivers have over them. 
By leveraging the Databricks Lakehouse Platform, Rivian\n", + "\n", + "has seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/rivian)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to ensure scalability and performance with Databricks\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\n", + "scalability and performance for your data architecture\n", + "based on the following features and capabilities:\n", + "\n", + "- A simplified and cost-efficient architecture that\n", + "increases productivity\n", + "\n", + "- A platform that ensures reliable, high performing\n", + "ETL workloads — for streaming and batch data\n", + "— while Databricks automatically manages your\n", + "infrastructure\n", + "\n", + "- The ability to ingest, transform and query all your\n", + "data in one place, and scale on demand with\n", + "serverless compute\n", + "\n", + "- Enables real-time data access for all data,\n", + "analytics and AI use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "The following section will provide a short series of\n", + "steps for understanding the key components of the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "\n", + "**Step 2**\n", + "**Understand the common Delta Lake operations**\n", + "The Databricks Lakehouse Platform simplifies the\n", + "entire data lifecycle, from data ingestion to monitoring\n", + "and governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\n", + "open-source storage system based on the Delta\n", + "format providing reliability through ACID transactions\n", + "and scalable metadata handling. Large quantities of\n", + "raw files in blob storage can be converted to Delta to\n", + "organize and store the data cheaply. This allows for\n", + "flexibility of data movement while being performant\n", + "and less expensive.\n", + "\n", + "\n", + "**Step 1**\n", + "**Get a trial Databricks account**\n", + "Start your 14-day free trial with Databricks on\n", + "AWS in a few easy steps.\n", + "[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . During the 14day free trial, all Databricks usage is free, but Databricks\n", + "uses compute and S3 storage resources in your cloud\n", + "provider account.\n", + "\n", + "\n", + "and writing data can occur simultaneously without risk\n", + "of many queries resulting in performance degradation\n", + "or deadlock for business-critical workloads.\n", + "\n", + "This means that users and applications throughout\n", + "the enterprise environment can connect to the same\n", + "single copy of the data to drive diverse workloads, with\n", + "all viewers guaranteed to receive the most current\n", + "version of the data at the time their query executes.\n", + "With performance features like indexing, Delta Lake\n", + "customers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n", + "[up to 48x faster.](https://www.databricks.com/customers/columbia)\n", + "\n", + "\n", + "[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\n", + "and learn how to create, manage and query tables.\n", + "With support for ACID transactions and schema\n", + "enforcement, Delta Lake provides the reliability that\n", + "traditional data lakes lack. 
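To make those common operations concrete, here is a small PySpark sketch — converting raw Parquet files into a Delta table, updating it transactionally and querying an earlier version with time travel. The paths and table names are hypothetical.

```
# Convert raw Parquet files in cloud storage into a managed Delta table (path is hypothetical).
raw = spark.read.parquet("s3://my-bucket/raw/events/")
raw.write.format("delta").mode("overwrite").saveAsTable("demo_db.events")

# ACID transactions: correct records in place with a single statement.
spark.sql("UPDATE demo_db.events SET country = 'US' WHERE country = 'USA'")

# Time travel: query the table as it looked before the update.
spark.sql("SELECT * FROM demo_db.events VERSION AS OF 0").show()
```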
This enables you to scale\n", + "reliable data insights throughout the organization and\n", + "run analytics and other data projects directly on your\n", + "data lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n", + "\n", + "Delta Lake transactions use log files stored alongside\n", + "data files to provide ACID guarantees at a table level.\n", + "Because the data and log files backing Delta Lake\n", + "tables live together in cloud object storage, reading\n", + "\n", + "\n", + "-----\n", + "\n", + "All data in Delta Lake is stored in open Apache Parquet\n", + "format, allowing data to be read by any compatible\n", + "reader. APIs are open and compatible with Apache\n", + "Spark, so you have access to a vast open-source\n", + "ecosystem to avoid data lock-in from proprietary\n", + "formats and conversions, which have embedded and\n", + "added costs.\n", + "\n", + "###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n", + "\n", + " — Steve Pulec, Chief Technology Officer, YipitData\n", + "\n", + "[Learn more](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 3**\n", + "**Ingest data efficiently at scale**\n", + "With a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\n", + "from hundreds of data sources for analytics, AI and\n", + "streaming applications into one place.\n", + "\n", + "Databricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\n", + "data ingestion. To ingest any file that can land in a data\n", + "lake, Auto Loader incrementally and automatically\n", + "processes new data files as they arrive in cloud storage\n", + "in scheduled or continuous jobs. Auto Loader scales to\n", + "support near real-time ingestion of millions of files\n", + "per hour.\n", + "\n", + "For pushing data in Delta Lake, the SQL command\n", + "[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\n", + "into Delta Lake. COPY INTO is best used when the input\n", + "directory contains thousands of files or fewer, and the\n", + "user prefers SQL. COPY INTO can be used over JDBC\n", + "to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "**Step 4**\n", + "**Leverage production-ready tools**\n", + "**to automate ETL pipelines**\n", + "Once the raw data is ingested, Databricks provides\n", + "a suite of production-ready tools that allow data\n", + "professionals to quickly develop and deploy extract,\n", + "\n", + "transform and load (ETL) pipelines. 
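Before those tools come into play, the two ingestion paths from Step 3 look roughly like the sketch below — Auto Loader for incremental file ingestion and COPY INTO for batch loads. Paths, table names and the checkpoint location are hypothetical, and both statements target the same bronze table purely for illustration.

```
# Incremental ingestion with Auto Loader: new files are picked up automatically.
(spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/orders")
    .load("s3://my-bucket/landing/orders/")
    .writeStream
    .option("checkpointLocation", "s3://my-bucket/_checkpoints/orders")
    .trigger(availableNow=True)   # run as a scheduled, batch-style job
    .toTable("demo_db.orders_bronze"))

# Batch ingestion with COPY INTO, best when the input directory holds thousands of files or fewer.
spark.sql("""
    COPY INTO demo_db.orders_bronze
    FROM 's3://my-bucket/landing/orders/'
    FILEFORMAT = JSON
""")
```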
Databricks SQL\n", + "allows analysts to run SQL queries against the same\n", + "tables used in production ETL workloads, allowing for\n", + "real-time business intelligence at scale.\n", + "\n", + "With your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "[your first extract, transform and load (ETL) pipelines](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "for data orchestration and learn how easy it is to create\n", + "a cluster, create a Databricks notebook, configure\n", + "[Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for ingestion into [Delta Lake](https://docs.databricks.com/delta/index.html) , process and\n", + "interact with the data, and schedule a job.\n", + "\n", + "\n", + "Databricks supports workloads in SQL, Python, Scala\n", + "and R, allowing users with diverse skill sets and\n", + "technical backgrounds to leverage their knowledge\n", + "to derive analytic insights. You can use all languages\n", + "supported by Databricks to define production jobs, and\n", + "notebooks can leverage a combination of languages.\n", + "\n", + "This means that you can promote queries written by\n", + "SQL analysts for last-mile ETL into production data\n", + "engineering code with almost no effort. Queries and\n", + "workloads defined by personas across the organization\n", + "leverage the same data sets, so there’s no need to\n", + "reconcile field names or make sure dashboards are up\n", + "to date before sharing code and results with\n", + "other teams.\n", + "\n", + "\n", + "-----\n", + "\n", + "With [Delta Live Tables](https://www.databricks.com/product/delta-live-tables) (DLT), data professionals have\n", + "a framework that uses a simple declarative approach\n", + "to build ETL and ML pipelines on batch or streaming\n", + "data while automating operational complexities such\n", + "as infrastructure management, task orchestration,\n", + "error handling and recovery, retries, and performance\n", + "optimization.\n", + "\n", + "Delta Live Tables extends functionality in Apache Spark\n", + "Structured Streaming and allows you to write just a\n", + "few lines of declarative Python or SQL to deploy a\n", + "production-quality data pipeline with:\n", + "\n", + "- [Autoscaling compute infrastructure](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#auto-scaling) for cost savings\n", + "\n", + "- Data quality checks with [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html)\n", + "\n", + "- Automatic [schema evolution](https://docs.databricks.com/ingestion/auto-loader/schema.html) handling\n", + "\n", + "- Monitoring via metrics in the [event log](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-event-log.html)\n", + "\n", + "With DLT, engineers can also treat their data as code\n", + "and apply software engineering best practices like\n", + "testing, monitoring and documentation to deploy\n", + "reliable pipelines at scale. 
You can easily define end-toend data pipelines in SQL or Python and automatically\n", + "maintain all data dependencies across the pipeline and\n", + "reuse ETL pipelines with environment-independent\n", + "data management.\n", + "\n", + "```\n", + "CUSTOMER STORY: ABNORMAL SECURITY\n", + "\n", + "### Stopping sophisticated ransomware in its tracks\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: ABNORMAL SECURITY\n", + "\n", + "```\n", + "\n", + "The increase in email phishing and ransomware attacks requires the type of protection that can scale and evolve\n", + "\n", + "to meet the challenges of modern cyberattacks. [Abnormal Security](https://abnormalsecurity.com/) , a cloud-native email security provider, knew\n", + "\n", + "that scalability would become a major focus to stay ahead of attack strategies with frequent product updates.\n", + "\n", + "Abnormal also required a data analytics infrastructure robust enough to meet the scale requirements for its data\n", + "\n", + "pipelines and constantly refined ML models.\n", + "\n", + "“We were spending too much time managing our Spark infrastructure,” said Carlos Gasperi, Software Engineer at\n", + "\n", + "Abnormal Security. “What we needed to be doing with that time was building the pipelines that would make the\n", + "\n", + "product better.”\n", + "\n", + "The company implemented the Databricks Lakehouse Platform, which simplified its data architecture and\n", + "\n", + "maximized the performance of data pipelines and analytics. Data practitioners are now able to ingest data\n", + "\n", + "directly from S3 and query it in near real-time with the help of Delta Lake, an open-format storage layer that\n", + "\n", + "delivers reliability, security and performance on the data lake for both streaming and batch operations. With\n", + "\n", + "Databricks SQL, data scientists are then able to create visualizations using rich dashboards to drive product\n", + "\n", + "decisions and improve detection efficacy.\n", + "\n", + "Databricks also provided the collaborative environment that Abnormal’s data teams needed to increase their\n", + "\n", + "productivity and work in the same space without constantly competing for compute resources.\n", + "\n", + "With Databricks, Abnormal has seen a 20% reduction in successful email attacks, a 40% reduction in\n", + "\n", + "infrastructure costs and a 30% increase in productivity. [Read the full story here.](https://www.databricks.com/customers/abnormal)\n", + "\n", + "\n", + "-----\n", + "\n", + "Delta Live Tables Enhanced Autoscaling is designed to handle streaming workloads\n", + "that trigger intermittently and are unpredictable. It optimizes cluster utilization\n", + "by only scaling up to the necessary number of nodes while maintaining endto-end SLAs, and gracefully shuts down nodes when utilization is low to avoid\n", + "unnecessary idle node capacity.\n", + "\n", + "\n", + "Delta Live Tables helps prevent bad data from flowing into tables through validation,\n", + "integrity checks and predefined error policies. 
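A minimal sketch of what that looks like in declarative Python — the table names, source path and expectation rule are all hypothetical:

```
import dlt
from pyspark.sql import functions as F

# Bronze table: Auto Loader ingestion managed by the Delta Live Tables runtime.
@dlt.table(comment="Raw orders ingested from cloud storage (path is hypothetical).")
def orders_bronze():
    return (spark.readStream.format("cloudFiles")
            .option("cloudFiles.format", "json")
            .load("s3://my-bucket/landing/orders/"))

# Silver table: rows violating the expectation are dropped and counted in the event log.
@dlt.table(comment="Cleaned orders with a basic quality check applied.")
@dlt.expect_or_drop("valid_amount", "amount > 0")
def orders_silver():
    return dlt.read_stream("orders_bronze").withColumn("ingested_at", F.current_timestamp())
```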
In addition, you can monitor data\n", + "\n", + "quality trends over time to get insight into how your data is evolving and where\n", + "changes may be necessary.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 5**\n", + "**Use Databricks SQL for serverless compute**\n", + "[Databricks SQL (DB SQL)](https://www.databricks.com/product/databricks-sql) is a serverless data\n", + "warehouse on the Lakehouse Platform for running your\n", + "SQL and BI applications at scale with up to 12x better\n", + "price/performance. It’s imperative for younger, growing\n", + "companies to reduce resource contention, and one way\n", + "to accomplish that is with serverless compute. Running\n", + "serverless removes the need to manage, configure or\n", + "scale cloud infrastructure on the lakehouse, freeing up\n", + "your data team for what they do best.\n", + "\n", + "\n", + "See for yourself in this tutorial on [how to run and visualize](https://docs.databricks.com/sql/get-started/user-quickstart.html)\n", + "[a query in Databrick SQL](https://docs.databricks.com/sql/get-started/user-quickstart.html) and create dashboards on data\n", + "stored in your data lake.\n", + "\n", + "The Databricks SQL REST API supports services to\n", + "manage queries and dashboards, query history and SQL\n", + "warehouses.\n", + "\n", + "\n", + "Databricks SQL warehouses provide instant, elastic\n", + "SQL compute — decoupled from storage — and will\n", + "automatically scale to provide unlimited concurrency\n", + "without disruption, for high concurrency use cases. DB\n", + "SQL has data governance and security built in. Handle\n", + "high concurrency with fully managed load balancing\n", + "and scaling of compute resources.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Faster queries with Photon**\n", + "[Photon](https://www.databricks.com/product/photon) is a new vectorized query engine designed\n", + "to deliver dramatic infrastructure cost savings and\n", + "accelerate all data and analytics workloads: data\n", + "ingestion, ETL, streaming, interactive queries, data\n", + "science and machine learning.\n", + "\n", + "Photon is used by default in Databricks SQL. To\n", + "enable Photon acceleration, select the **Use Photon**\n", + "**Acceleration** checkbox when you create the cluster.\n", + "If you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\n", + "set runtime_engine to PHOTON.\n", + "\n", + "Photon supports a number of instance types on\n", + "the driver and worker nodes. Photon instance types\n", + "consume DBUs at a different rate than the same\n", + "instance type running the non-Photon runtime. For\n", + "more information about Photon instances and DBU\n", + "consumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)\n", + "\n", + "Photon will seamlessly coordinate work and resources\n", + "and transparently accelerate portions of your SQL and\n", + "Spark queries. No tuning or user intervention required.\n", + "Photon is compatible with Apache Spark APIs, so\n", + "getting started is as easy as turning it on — no code\n", + "change and no lock- in. 
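For clusters created programmatically, the earlier note about the Clusters API comes down to one field in the create request. A rough sketch using plain REST calls — the workspace URL, token, runtime version and node type are placeholders:

```
import requests

host = "https://<your-workspace>.cloud.databricks.com"   # placeholder
token = "<personal-access-token>"                        # placeholder

resp = requests.post(
    f"{host}/api/2.0/clusters/create",
    headers={"Authorization": f"Bearer {token}"},
    json={
        "cluster_name": "photon-demo",
        "spark_version": "13.3.x-scala2.12",   # any Photon-capable runtime
        "node_type_id": "i3.xlarge",
        "num_workers": 2,
        "runtime_engine": "PHOTON",            # enables Photon acceleration
    },
)
resp.raise_for_status()
print(resp.json()["cluster_id"])
```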
Written entirely in C++, Photon\n", + "provides an additional [2x speedup over Apache Spark](https://www.databricks.com/product/photon)\n", + "per the TPC-DS 1TB benchmark, and customers have\n", + "observed 3x–8x speedups on average.\n", + "\n", + "\n", + "With Photon, typical customers are seeing up to [80% TCO savings](https://www.databricks.com/blog/2022/08/03/announcing-photon-engine-general-availability-on-the-databricks-lakehouse-platform.html#:~:text=Up%20to%2080%25%20TCO%20cost%20savings%20%2830%25%20on,Photon%203-8x%20faster%20queries%20on%20interactive%20SQL%20workloads) over traditional\n", + "Databricks Runtime (Apache Spark) and up to 85% reduction in VM compute hours.\n", + "\n", + "Learn how to connect BI tools to Databricks SQL\n", + "compute resources with the following user guides:\n", + "\n", + "\n", + "[Queries](https://docs.databricks.com/sql/user/queries/index.html)\n", + "\n", + "[Visualizations](https://docs.databricks.com/sql/user/visualizations/index.html)\n", + "\n", + "\n", + "[Favorites and tags](https://docs.databricks.com/sql/user/favorites-tags.html)\n", + "\n", + "[Workspace browser](https://docs.databricks.com/sql/user/workspace-browser/index.html)\n", + "\n", + "\n", + "[Dashboards](https://docs.databricks.com/sql/user/dashboards/index.html)\n", + "\n", + "[Alerts](https://docs.databricks.com/sql/user/alerts/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 6**\n", + "**Orchestrate workflows**\n", + "Databricks provides a comprehensive suite of tools and integrations to support your\n", + "data processing workflows.\n", + "\n", + "Databricks [Workflows](https://www.databricks.com/product/workflows) removes operational overhead by offering fully managed\n", + "orchestration service for all your teams, so you can focus on your workflows, not on\n", + "managing your infrastructure. Orchestrate diverse workloads for the full lifecycle\n", + "including Delta Live Tables, [Jobs](https://docs.databricks.com/workflows/index.html) for SQL, [Spark](https://www.databricks.com/product/spark) , notebooks, dbt, ML models and more.\n", + "\n", + "Here’s a tutorial on how to [create your first workflow with a Databricks job](https://docs.databricks.com/workflows/jobs/jobs-quickstart.html) . You will\n", + "learn how to create notebooks, create and run a job, view the run details, and run jobs\n", + "with different parameters.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 7**\n", + "**Run an end-to-end analytics pipeline**\n", + "This where you can see how everything works together to run efficiently at scale. 
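The job that the Step 6 tutorial builds in the UI can also be defined with a short call to the Jobs API. A rough sketch — the workspace URL, token, notebook path, cluster ID and schedule are all placeholders:

```
import requests

host = "https://<your-workspace>.cloud.databricks.com"   # placeholder
token = "<personal-access-token>"                        # placeholder

job = {
    "name": "nightly-etl",
    "tasks": [{
        "task_key": "ingest_and_transform",
        "notebook_task": {"notebook_path": "/Repos/etl/ingest_orders"},  # hypothetical notebook
        "existing_cluster_id": "<cluster-id>",
    }],
    "schedule": {"quartz_cron_expression": "0 0 2 * * ?", "timezone_id": "UTC"},  # daily at 2 AM
}

resp = requests.post(f"{host}/api/2.1/jobs/create",
                     headers={"Authorization": f"Bearer {token}"}, json=job)
resp.raise_for_status()
print("Created job", resp.json()["job_id"])
```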
First\n", + "take the quickstart: [Running end-to-end lakehouse analytics pipelines](https://docs.databricks.com/getting-started/lakehouse-e2e.html) , where you\n", + "will write to and read data from an external location managed by Unity Catalog and\n", + "configure Auto Loader to ingest data to Unity Catalog.\n", + "\n", + "###### Resources:\n", + "\n", + "- [Databricks Lakehouse free trial](https://www.databricks.com/try-databricks?itm_data=DataLakehouse-HeroCTA-Trial#account)\n", + "\n", + "- [The Lakehouse for companies born in the cloud](https://www.databricks.com/solutions/audience/digital-native)\n", + "\n", + "- [How DuPont achieved 11x latency reduction and 4x cost reduction with Photon](https://www.databricks.com/blog/2022/10/04/how-dupont-achieved-11x-latency-reduction-and-4x-cost-reduction-photon.html)\n", + "\n", + "- [Apache Spark on Databricks](https://docs.databricks.com/spark/index.html)\n", + "\n", + "- [Discover Lakehouse solutions](https://www.databricks.com/solutions)\n", + "\n", + "- [Databricks documentation](https://docs.databricks.com/)\n", + "\n", + "\n", + "###### “Databricks Workflows allows our analysts to easily create, run, monitor and repair data pipelines without managing any infrastructure. This enables them to have full autonomy in designing and improving ETL processes that produce must-have insights for our clients. We are excited to move our Airflow pipelines over to Databricks Workflows.”\n", + " —Anup Segu, Senior Software Engineer, YipitData\n", + "\n", + "[Learn more.](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----\n", + "\n", + "# 03\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Building effective machine-learning operations\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 03\n", + "\n", + "### Building effective machine-learning operations\n", + "\n", + "```\n", + "Growing startups and digital native companies face several challenges when they\n", + "start building, maintaining and scaling machine learning operations (MLOps) for their\n", + "data science teams.\n", + "\n", + "\n", + "MLOps is different from DevOps. DevOps practices\n", + "and tooling alone are insufficient because ML\n", + "applications rely on an assortment of artifacts (e.g.,\n", + "models, data, code) that can each require different\n", + "methods of experiment tracking, model training,\n", + "feature development, governance, feature and\n", + "model serving.\n", + "\n", + "For data teams beginning their machine learning\n", + "journeys, the challenge of training data models can\n", + "be labor-intensive and not cost-effective because\n", + "the data has to be converted into features and\n", + "\n", + "trained on a separate machine learning platform\n", + "\n", + "\n", + "Data teams often perform development in\n", + "disjointed, siloed stacks spanning DataOps,\n", + "ModelOps and DevOps\n", + "\n", + "Development and training environment\n", + "disconnect. Moving code and data between\n", + "personal development environments and\n", + "machine learning platforms for model training\n", + "at scale is error prone and cumbersome. The\n", + "“it worked on my machine” problem.\n", + "\n", + "Gathering high-quality data. Data that is siloed\n", + "across the organization is hard to discover,\n", + "collect, clean and use. 
This leads to stale data\n", + "and delays in development of models.\n", + "\n", + "\n", + "See **Create a unified data architecture.**\n", + "```\n", + " 03\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Siloed stacks spanning DataOps, ModelOps and DevOps\n", + "\n", + "When data engineers help ingest, refine and prep\n", + "data, they do so on their own stack. This data has\n", + "to be converted into features and then trained on\n", + "a separate machine learning platform. This cross-\n", + "platform handoff often results in data staleness,\n", + "difficulty in maintaining versions, and eventually,\n", + "poorly performing models. Even after you have\n", + "trained your model, you have to deal with yet another\n", + "tech stack for model deployment. It’s challenging\n", + "to serve features in real time and difficult to trace\n", + "problems in production back to the data.\n", + "\n", + "The downstream business impact is massive —\n", + "longer and more expensive projects, and lower\n", + "model accuracy in production leading to declining\n", + "business metrics.\n", + "\n", + "If you are looking at launching or scaling your\n", + "MLOps, you should probably focus on an incremental\n", + "strategy. At Databricks, we see firsthand how\n", + "customers develop their MLOps approaches across\n", + "a huge variety of teams and businesses. [Check out](https://www.youtube.com/watch?v=JApPzAnbfPI)\n", + "[this Data +AI Summit session](https://www.youtube.com/watch?v=JApPzAnbfPI) to learn more about\n", + "building robust MLOps practices.\n", + "\n", + "\n", + "###### Databricks solution:\n", + "\n", + "Databricks Machine Learning is an integrated\n", + "end-to-end machine learning environment\n", + "incorporating managed services for experiment\n", + "tracking, model training, feature development and\n", + "management, and model serving. The capabilities\n", + "of Databricks map directly to the steps of model\n", + "development and deployment. With Databricks\n", + "Machine Learning, you can:\n", + "\n", + "\n", + "Train models either manually or with AutoML\n", + "\n", + "Track training parameters and models using\n", + "experiments with MLflow tracking\n", + "\n", + "Create feature tables and access them for model\n", + "training and inference\n", + "\n", + "Share, manage and serve models using MLflow\n", + "Model Registry\n", + "\n", + "Deploy models for Serverless Real-time Inference\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Use MLOps on the Databricks Lakehouse Platform\n", + "\n", + "To gain efficiencies and reduce costs, many smaller\n", + "digital companies are employing machine learning\n", + "operations. MLOps is a set of processes and\n", + "automation for managing models, data and code, and\n", + "unique library dependencies to improve performance\n", + "stability and long-term efficiency in ML systems.\n", + "\n", + "To describe it simply, MLOps = ModelOps + DataOps +\n", + "DevOps. The aim of MLOps is to improve the long-term\n", + "performance, stability and success rate of ML systems\n", + "while maximizing the efficiency of the teams who\n", + "build them.\n", + "\n", + "\n", + "Not only does MLOps improve organizational efficiency,\n", + "it also allows the models to iterate faster and react\n", + "to real-life changes in the data. 
This ability separates\n", + "companies that can grow to meet their customer’s\n", + "challenges in a reactive manner versus those that will\n", + "spend significant time on data updates/processes and\n", + "miss the opportunity to do something with\n", + "their models.\n", + "\n", + "The absence of MLOps is typically marked by an\n", + "overabundance of manual processes which are slower\n", + "\n", + "\n", + "and more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck,\n", + "capping the ability for a data team to take on new projects. The process is complex. In larger organizations,\n", + "several specialists and stakeholders can be involved in one ML project. But data practitioners at smaller digital\n", + "natives and high-growth startups may be forced to wear several hats.\n", + "\n", + "\n", + "-----\n", + "\n", + "And once an ML project goes into production, the\n", + "MLOps continues, since the models, data and code\n", + "change over time due to regulatory and business\n", + "requirements. But the ML system must be resilient and\n", + "flexible. Addressing these challenges with a defined\n", + "MLOps strategy can dramatically reduce the iteration\n", + "cycle of delivering models to production.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Steps in machine learning model development and deployment:\n", + "\n", + "\n", + "**Step 1**\n", + "**Data preparation**\n", + "Manually preparing and labeling data is a thankless,\n", + "time-consuming job. With Databricks, teams can\n", + "label data with human effort, machine learning\n", + "models in Databricks, or a combination of both.\n", + "Teams can also employ a [model-assisted labeling](https://labelbox.com/product/automation )\n", + "workflow that allows humans to easily inspect and\n", + "correct a model’s predicted labels. This process can\n", + "drastically reduce the amount of unstructured data\n", + "you need to achieve strong model performance.\n", + "\n", + "The [Databricks Runtime for Machine Learning](https://docs.databricks.com/runtime/mlruntime.html) is a\n", + "ready-to-go environment with many external\n", + "libraries, including TensorFlow, PyTorch, Horovod,\n", + "scikit-learn and XGBoost. It provides\n", + "extensions to improve performance, including GPU\n", + "acceleration in XGBoost, distributed deep\n", + "learning using HorovodRunner, and model\n", + "checkpointing.\n", + "\n", + "To use Databricks Runtime ML, select the ML version\n", + "of the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\n", + "access data in Unity Catalog for machine learning\n", + "workflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\n", + "isolation clusters are not compatible with Databricks\n", + "Runtime for Machine Learning.\n", + "\n", + "\n", + "Machine learning applications often\n", + "need to use shared storage for data\n", + "loading and model checkpointing. You\n", + "can load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\n", + "files. 
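A minimal sketch of both loading paths, with catalog, table, column and path names all hypothetical:

```
# From a governed table in Unity Catalog...
features = spark.table("main.ml.churn_features")

# ...or directly from files in cloud object storage.
raw_files = spark.read.format("delta").load("s3://my-bucket/ml/churn_features/")

# Convert to pandas for single-node training libraries such as scikit-learn.
training_pdf = features.select("tenure", "monthly_charges", "churned").toPandas()
```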
A table is a collection of\n", + "structured data stored as a directory\n", + "on cloud object storage.\n", + "\n", + "For [data preprocessing](https://docs.databricks.com/machine-learning/preprocess-data/index.html) , you can\n", + "use [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) to create\n", + "new features, explore and reuse\n", + "existing features, track lineage and\n", + "feature creation code, and publish\n", + "features to low-latency online stores\n", + "for real-time inference. The Feature\n", + "Store is a centralized repository\n", + "that enables data scientists to find\n", + "and share features. It ensures that\n", + "the same code used to compute\n", + "the feature values is used for model\n", + "training and inference. The Feature\n", + "Store library is available only on\n", + "Databricks Runtime for Machine\n", + "Learning and is accessible through\n", + "Databricks notebooks and workflows.\n", + "\n", + "\n", + "###### Resources:\n", + "\n", + "- [The Comprehensive Guide to Feature Stores](https://www.databricks.com/resources/ebook/the-comprehensive-guide-to-feature-stores)\n", + "\n", + "- [Load data for machine learning and deep learning](https://docs.databricks.com/machine-learning/load-data/index.html)\n", + "\n", + "- [Preprocess data for machine learning and](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n", + "[deep learning](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "C `USTOMER STORY: ZIPLINE`\n", + "\n", + "### Data-driven drones deliver lifesaving medical aid around the world\n", + "\n", + "\n", + "Automated logistics and delivery system\n", + "\n", + "provider [Zipline](https://www.flyzipline.com/ ) is redefining logistics by using\n", + "\n", + "cutting-edge drone technology and a global\n", + "\n", + "autonomous logistics network to save lives\n", + "\n", + "\n", + "information they need to accurately measure success, find\n", + "\n", + "the metrics that relate to customer experiences or logistics,\n", + "\n", + "and improve on them exponentially as more data is ingested\n", + "\n", + "and machine learning models are refined.\n", + "\n", + "\n", + "by giving remote communities access to\n", + "\n", + "\n", + "emergency and preparatory medical aid and\n", + "\n", + "resources, regardless of where they are in the\n", + "\n", + "world.\n", + "\n", + "Doing so requires the ability to ingest and\n", + "\n", + "analyze huge chunks of time series data in real\n", + "\n", + "time. This data is produced every time a drone\n", + "\n", + "takes flight and includes performance data,\n", + "\n", + "in-flight battery management, regional weather\n", + "\n", + "patterns, geographic obstacles, landing errors\n", + "\n", + "and a litany of other information that must be\n", + "\n", + "processed.\n", + "\n", + "\n", + "“About 30% of the deliveries we do are lifesaving emergency\n", + "\n", + "deliveries, where the product being delivered does not exist\n", + "\n", + "at the hospital. 
We have to be fast, and we have to be able\n", + "\n", + "to rely on all the different kinds of data to predict failures\n", + "\n", + "before they occur so that we can guarantee a really, really\n", + "\n", + "high service level to the people who are literally depending\n", + "\n", + "on us with their lives,” said Zipline CEO Keller Rinaudo.\n", + "\n", + "“Databricks gives us confidence in our operations, and\n", + "\n", + "enables us to continuously improve our technology, expand\n", + "\n", + "our impact, and provide lifesaving aid where and when it’s\n", + "\n", + "needed, every single day.”\n", + "\n", + "[Read full story here.](https://www.databricks.com/customers/zipline)\n", + "\n", + "\n", + "Every Zipline flight generates a gigabyte of data\n", + "\n", + "with potential life-or-death consequences,\n", + "\n", + "but accessing and federating the data for both\n", + "\n", + "internal and external decision-making was\n", + "\n", + "challenging. With Databricks as the common\n", + "\n", + "platform, Zipline’s data team can access all the\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 2**\n", + "**Model training**\n", + "For training machine learning and deep learning\n", + "models, you can use [AutoML](https://docs.databricks.com/machine-learning/automl/index.html) , which automatically\n", + "prepares a data set for model training, performs a set\n", + "of trials using open-source libraries such as scikit-learn\n", + "and XGBoost, and creates a Python notebook with\n", + "the source code for each trial run so you can review,\n", + "reproduce and modify the code.\n", + "\n", + "In Databricks, [notebooks](https://docs.databricks.com/notebooks/index.html) are the primary tool for\n", + "creating data science and machine learning workflows\n", + "and collaborating with colleagues. Databricks\n", + "notebooks provide real-time coauthoring in multiple\n", + "languages, automatic versioning and built-in data\n", + "visualizations.\n", + "\n", + "\n", + "###### Resources:\n", + "\n", + "- [Model training examples](https://docs.databricks.com/machine-learning/train-model/index.html)\n", + "\n", + "- [Training models with Feature Store](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n", + "\n", + "- [Best practices for deep learning on Databricks](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n", + "\n", + "- [Machine learning quickstart notebook](https://docs.databricks.com/machine-learning/train-model/ml-quickstart.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n", + "\n", + "- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n", + "\n", + "- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n", + "\n", + "- [Track ML Model training data with Delta Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n", + "\n", + "- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)\n", + "\n", + "\n", + "**Step 3**\n", + "**Track model development**\n", + "The model development process is iterative, and can\n", + "be challenging. 
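A minimal sketch of the tracking workflow described next — autologging a hypothetical scikit-learn run into an MLflow experiment; the feature table, columns and model choice are made up:

```
import mlflow
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

# Hypothetical feature table from the earlier sketches.
pdf = spark.table("main.ml.churn_features").toPandas()
X, y = pdf[["tenure", "monthly_charges"]], pdf["churned"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

mlflow.autolog()  # a single line logs parameters, metrics and the resulting model

with mlflow.start_run(run_name="churn-baseline"):
    model = RandomForestClassifier(n_estimators=100, max_depth=5)
    model.fit(X_train, y_train)
    print("test accuracy:", model.score(X_test, y_test))
```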
You can use [MLflow tracking](https://mlflow.org/docs/latest/tracking.html) to help you keep track of the model development process, including parameter settings or combinations you have tried and how they affected the model’s performance.

MLflow tracking uses experiments and runs to log and track your model development. A run is a single execution of model code. An experiment is a collection of related runs. Within an experiment, you can compare and filter runs to understand how your model performs and how its performance depends on the parameter settings, input data, etc.

MLflow can automatically log training runs for code written in many ML frameworks. This is the easiest way to get started using MLflow tracking. With MLflow’s autologging capabilities, a single line of code automatically logs the resulting model.

A hosted version of MLflow Model Registry can help [manage the full lifecycle](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) of MLflow models. You can apply webhooks to automatically trigger actions based on registry events. For example, you can trigger CI builds when a new model version is created, or notify your team members through Slack each time a model transition to production is requested. This promotes a traceable version control work process. You can also leverage this capability for web traffic A/B testing, where traffic is funneled to different versions of deployed models for more precise population studies.

**Step 4**
**Deploy machine learning models**
You can use MLflow to deploy models for batch or streaming inference or to set up a REST endpoint to serve the model. Simplify your model deployment by registering models to [the MLflow Model Registry](https://docs.databricks.com/mlflow/model-registry.html). After you have registered your model, you can [automatically generate a notebook](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb) for batch inference or configure the model for online serving with Serverless Real-Time Inference or [Classic MLflow Model Serving on Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html). For model inference for deep learning applications, Databricks recommends the following workflow. (A minimal code sketch of this tracking-to-deployment flow appears at the end of this guide.)

To debug and tune model inference on Databricks, GPUs (graphics processing units) can efficiently speed up model inference. As GPUs and other accelerators become faster, it is important that the data input pipeline keep up with demand. 
The data input pipeline reads the data into\n", + "Spark DataFrames, transforms it and loads it as the\n", + "input for model inference.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: ITERABLE\n", + "\n", + "### Optimizing touch points across the entire customer journey\n", + "\n", + "```\n", + "“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meet\n", + "\n", + "rising consumer demands for more personalized experiences that drive revenue and results.” —Sinéad Cheung,\n", + "\n", + "Principal Product Manager, [Iterable](https://iterable.com/)\n", + "\n", + "Captivating an audience and understanding customer journeys are essential to creating deeper brand- customer\n", + "\n", + "connections that drive growth, loyalty and revenue. From helping medical practitioners build trust with new\n", + "\n", + "patients to ensuring that food delivery users feel connected to their culinary community, Iterable helps more\n", + "\n", + "than 1,000 brands optimize and humanize their marketing in today’s competitive landscape.\n", + "\n", + "This need to build personalized and automated customer experiences for its clients drove the company to find a\n", + "\n", + "fully managed platform that would simplify infrastructure management, make collaboration possible, and give it\n", + "\n", + "the ability to scale for analytics and AI.\n", + "\n", + "With Databricks Lakehouse, Iterable can harness diverse, complex data sets — including conversion events,\n", + "\n", + "unique user labels, engagement patterns and business insights — and facilitate rapid prototyping of machine\n", + "\n", + "learning models that deliver top-notch and personalized user experiences for higher-converting marketing\n", + "\n", + "campaigns. [Read the full story here.](https://www.databricks.com/customers/iterable)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### ML Stages\n", + "\n", + "ML workflows include the following key assets: code,\n", + "models and data. These assets need to be developed\n", + "(dev), tested (staging) and deployed (production).\n", + "Each stage needs to operate within an execution\n", + "environment. So the execution environments, code,\n", + "models and data are divided into dev, staging and\n", + "production.\n", + "\n", + "ML project code is often stored in a version control\n", + "repository (such as Git), with most organizations using\n", + "branches corresponding to the lifecycle phases of\n", + "development, staging or production.\n", + "\n", + "Since model lifecycles do not correspond one-toone with code lifecycles, it makes sense for model\n", + "management to have its own service. MLflow and its\n", + "Model Registry support managing model artifacts\n", + "directly via UI and APIs. The loose coupling of model\n", + "artifacts and code provides flexibility to update\n", + "production models without code changes, streamlining\n", + "the deployment process in many cases.\n", + "\n", + "Databricks recommends creating separate\n", + "environments for the different stages of ML code and\n", + "model development with clearly defined transitions\n", + "between stages. The recommended MLOps workflow is\n", + "broken into these three stages:\n", + "\n", + "\n", + "[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\n", + "is experimentation. 
Data scientists develop features\n", + "and models and run experiments to optimize model\n", + "performance. The output of the development process is\n", + "ML pipeline code that can include feature computation,\n", + "model training inference and monitoring\n", + "\n", + "\n", + "-----\n", + "\n", + "[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\n", + "This stage focuses on testing the ML pipeline code\n", + "for production readiness, including code for model\n", + "training as well as feature engineering pipelines and\n", + "inference code. The output of the staging process is a\n", + "release branch that triggers the CI/CD system to start\n", + "the production stage.\n", + "\n", + "\n", + "-----\n", + "\n", + "[Production](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#production-stage)\n", + "ML engineers own the production environment\n", + "where ML pipelines are deployed. These pipelines\n", + "compute fresh feature values, train and test new model\n", + "versions, publish predictions to downstream tables\n", + "or applications, and monitor the entire process to\n", + "avoid performance degradation and instability. Data\n", + "scientists have visibility to test results, logs, model\n", + "artifacts and production pipeline status to allow them\n", + "to identify and diagnose problems in production.\n", + "\n", + "The Databricks Machine Learning home page provides\n", + "quick access to all the machine learning resources. To\n", + "access this page, move your mouse or pointer over\n", + "the left sidebar in the Databricks workspace. From\n", + "the persona switcher at the top of the sidebar, select\n", + "\n", + "Machine Learning.\n", + "\n", + "From the shortcuts menu, you can create\n", + "a [notebook](https://docs.databricks.com/notebooks/index.html) , [start AutoML](https://docs.databricks.com/machine-learning/automl/index.html) or open a [tutorial notebook](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) .\n", + "The center of the screen includes any recently viewed\n", + "items, and the sidebar provides quick access to\n", + "the [Experiments page](https://docs.databricks.com/mlflow/tracking.html#mlflow-experiments) , [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) and\n", + "[Model Registry.](https://docs.databricks.com/mlflow/model-registry.html)\n", + "New users can get started with a series of [tutorials](https://docs.databricks.com/machine-learning/tutorial/index.html)\n", + "that illustrate how to use Databricks throughout the\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [MLOps Virtual Event: Standardizing MLOps at Scale](https://www.databricks.com/p/webinar/mlops-virtual-event)\n", + "\n", + "- [Virtual Event — Automating the ML Lifecycle With](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n", + "[Databricks Machine Learning](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n", + "\n", + "- [MLOps Virtual Event “Operationalizing Machine](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n", + "[Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n", + "\n", + "- [The Big Book of 
MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)

- [Machine learning on Databricks](https://www.databricks.com/product/machine-learning)

- [Watch the demos](https://www.databricks.com/discover/demos)

ML lifecycle or access the [in-product quickstart](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) for a model-training tutorial notebook that steps through loading data, training and tuning a model, comparing and analyzing model performance and using the model for inference.

Also be sure to download the [Big Book of MLOps](https://www.databricks.com/p/thank-you/the-big-book-of-mlops) to learn how your organization can build a robust MLOps practice incrementally.

-----

# 04
```
SUMMARY:

## The Databricks Lakehouse Platform addresses these challenges

```

-----

### Summary

We’ve organized the common data challenges for startups and growing digital native businesses into three main buckets: building a **unified data architecture** ; supporting **scalability and performance** ; and building effective **machine learning operations** — all with an eye on cost efficiency and increased productivity.

The Lakehouse Platform provides an efficient and scalable architecture that solves these challenges and will support your data, analytics and AI workloads now and as you scale.

With [Databricks](https://www.databricks.com/) you can unify all your data with cost-efficient architecture for highly performant digital native applications and analytic workloads — designed to scale as you grow. Use your data however and wherever you want with open-source flexibility, leverage open formats, APIs and your tools of choice. Ensure reliable, high-performing data workloads while Databricks automatically manages your infrastructure as you scale. Leverage serverless Databricks SQL to increase productivity and scale on demand with up to 12x better price/performance.

Easily access data for ML models and accelerate the full ML lifecycle from experimentation to production.

Discover more about the lakehouse for companies born in the cloud.

-----

### Get started with Databricks Trial

Get a collaborative environment for data teams to build solutions together with interactive notebooks to use Apache Spark™, SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras, scikit-learn and more.

- Available as a 14-day full trial in your own cloud or as a lightweight trial hosted by Databricks.

**[TRY DATABRICKS FOR FREE](https://www.databricks.com/try-databricks?itm_data=H#account)**

### About Databricks

Databricks is the lakehouse company. More than 7,000 organizations worldwide — including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc).

© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation.

-----

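To make the tracking, registry and deployment steps described above concrete, here is a minimal, self-contained sketch using the open source MLflow APIs. The synthetic scikit-learn data and the `demo_forecaster` registry name are placeholders, not part of the original guide; on Databricks these calls would typically run inside a notebook, where MLflow tracking is preconfigured.

```python
import mlflow
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

# Step 3: a single autolog() call records parameters, metrics and the fitted model.
mlflow.autolog()

X, y = make_regression(n_samples=500, n_features=8, noise=0.2, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run() as run:
    model = RandomForestRegressor(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    print("test R^2:", model.score(X_test, y_test))

# Step 4: register the autologged model ("model" is the default artifact path),
# then load the registered version back for batch inference.
registered = mlflow.register_model(
    model_uri=f"runs:/{run.info.run_id}/model",
    name="demo_forecaster",  # placeholder Model Registry name
)
batch_scorer = mlflow.pyfunc.load_model(f"models:/demo_forecaster/{registered.version}")
predictions = batch_scorer.predict(X_test)
print(predictions[:5])
```

From this point, the registered model could be wired to batch-inference notebooks or online serving as outlined in Step 4 above.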
**EBOOK**\n", + "\n", + "# Four Forces Driving Intelligent Manufacturing\n", + "\n", + "### A data-driven business built on Lakehouse for Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction .................................................................................................................................................................................................................................................. **03**\n", + "\n", + "The four driving forces of change ..................................................................................................................................................................................................... **04**\n", + "\n", + "Digital transformation is not a destination, it’s a journey .......................................................................................................................................................... **05**\n", + "\n", + "Manufacturing – use case maturity matrix ...................................................................................................................................................................................... **06**\n", + "\n", + "The foundations for data-driven manufacturing ............................................................................................................................................................................ **07**\n", + "\n", + "DRIVING FORCE NO. 1\n", + "The shift from manufacturing to Intelligent Manufacturing ...................................................................................................................................................... **08**\n", + "\n", + "DRIVING FORCE NO. 2\n", + "Transparency, visibility, data: optimizing the supply chain ........................................................................................................................................................ **10**\n", + "\n", + "DRIVING FORCE NO. 3\n", + "Future opportunities for manufacturing business models ......................................................................................................................................................... **13**\n", + "\n", + "DRIVING FORCE NO. 4\n", + "The focus on sustainability ....................................................................................................................................................................................................................... **15**\n", + "\n", + "Leveraging the Databricks Lakehouse for Manufacturing ........................................................................................................................................................... **17**\n", + "\n", + "The building blocks of Lakehouse for Manufacturing .................................................................................................................................................................... **18**\n", + "\n", + "Manufacturers’ end goals .......................................................................................................................................................................................................................... 
**19**\n", + "\n", + "2 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "##### Manufacturing has always been an evolutionary business, grounded upon infrastructure, business processes, and manufacturing operations built over decades in a continuum of successes, insights and learnings. The methods and processes used to approach the development, release and optimization of products and capital spend are the foundation of the industry’s evolution.\n", + "\n", + "\n", + "But today it’s data- and AI-driven businesses that\n", + "are being rewarded because they’re using process\n", + "and product optimization not previously possible,\n", + "able to forecast and sense supply chain demand,\n", + "and, crucially, introduce new forms of revenue\n", + "based upon service rather than product.\n", + "\n", + "The drivers for this evolution will be the emergence\n", + "of what we refer to as “Intelligent Manufacturing”\n", + "that has been enabled by the rise of computational\n", + "power at the Edge and in the Cloud. As well as\n", + "new levels of connectivity speed enabled by 5G\n", + "and fiber optic, combined with increased use of\n", + "advanced analytics and machine learning (ML).\n", + "\n", + "\n", + "Yet, even with all the technological advances\n", + "enabling these new data-driven businesses,\n", + "challenges exist.\n", + "\n", + "McKinsey’s recent research with the World\n", + "Economic Forum estimates the value creation\n", + "potential of manufacturers and suppliers that\n", + "implement Industry 4.0 in their operations\n", + "at USD$37 trillion by 2025. Truly a huge number.\n", + "But the challenge that most companies still\n", + "struggle with is the move from piloting point\n", + "solutions to delivering sustainable impact at scale.\n", + "[Only 30% of companies are capturing value from](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n", + "[Industry 4.0 solutions in manufacturing today.](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n", + "\n", + "\n", + "##### 80% of manufacturers\n", + "[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n", + "[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n", + "\n", + "##### 57% of manufacturing leaders feel their organization\n", + "[lacks skilled workers to support](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n", + "[their smart manufacturing plans](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n", + "\n", + "[A lack of supply 
chain](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n", + "[integration could stall smart](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n", + "[factory initiatives for](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf) **[3 in 5](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)**\n", + "##### manufacturers by 2025\n", + "\n", + "\n", + "3 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## The four driving forces of change\n", + "\n", + "###### Over the last two years, demand imbalances and supply chain swings have added a sense of urgency for manufacturers to digitally transform. But in truth, the main challenges facing the industry have existed, and will continue to exist, outside these recent exceptional circumstances. Manufacturers will always strive for greater levels of visibility across their supply chain, always seek to optimize and streamline operations to improve margins. In the continuing quest for improved efficiency, productivity, adaptability and resilience, manufacturers are commonly tackling these major challenges:\n", + "\n", + "\n", + "###### Skills and production gaps\n", + "\n", + "The rise of the digital economy is demanding a new set of skills.\n", + "For today’s Intelligent Manufacturing organizations, there’s a fundamental\n", + "need for computer and programming skills for automation, along\n", + "with critical-thinking abilities. Also important is the ability to use\n", + "collaboration systems and new advanced assistance tools, such as\n", + "automation, virtual reality (VR) and augmented reality (AR). The deficit\n", + "of workers with these skills is of critical concern to manufacturers.\n", + "\n", + "In addition, the industry dynamics are pushing companies to increase\n", + "and refine both partner/supplier relationships, optimize internal\n", + "operations and build robust supply chains that do not rely upon\n", + "safety stock to weather supply chain swings. Historical focus on\n", + "operational use cases is now extending to building agile supply chains.\n", + "\n", + "###### Supply chain volatility\n", + "\n", + "If the events of the last few years proved anything, it’s that supply\n", + "chains need to be robust and resilient. Historically, supply chain volatility\n", + "was smoothed by holding “safety stock,” which added costs without\n", + "financial value. Then the pendulum swung to “just in time delivery,”\n", + "where efficient use of working capital disregarded demand risks.\n", + "\n", + "Recent experiences have highlighted that demand sensing is needed\n", + "in addition to safety stock for high-risk parts or raw materials. The ability\n", + "to monitor, predict and respond to external factors – including natural\n", + "disasters, shipping and warehouse constraints, and geopolitical disruption\n", + "– is vital to reduce risk and promote agility. 
Many of these external\n", + "data sources leverage unstructured data (news, social posts, videos\n", + "and images), and being able to manage both structured and unstructured\n", + "data available to measure and analyze this volatility is key.\n", + "\n", + "\n", + "###### Need for new and additional sources of revenue\n", + "\n", + "Manufacturers’ growth historically has been limited\n", + "to new product introduction rate or expansion into\n", + "new geographies. The emergence of “equipment\n", + "as-a-service” is changing that dynamic. It’s pivoting\n", + "the business from product-centric growth to one\n", + "leveraging added services, which are not slaves to the\n", + "product development introduction cycle and can be highly\n", + "differentiated depending on the market segment and types\n", + "of products. Real-time data plays an outsize role, as now\n", + "businesses are in unison with use cases such as predictive\n", + "maintenance, stock replenishment and worker safety.\n", + "\n", + "###### An increased focus on sustainability\n", + "\n", + "Manufacturers have always focused on efficiency,\n", + "but they’re increasingly seeing the value chain as circular.\n", + "It’s no longer enough to consider an organization’s own\n", + "carbon footprint – it needs to also include indirect\n", + "emissions and other environmental impacts from the\n", + "activities it doesn’t own or control. This requires a\n", + "360-degree view of sustainability, which includes both\n", + "internal and external factors in measuring compliance\n", + "with ESG programs.\n", + "\n", + "**This eBook will look closer at these four key challenges**\n", + "**and their associated use cases, as well as some**\n", + "**of the most effective technologies and solutions**\n", + "**that can be implemented to respond to them.**\n", + "\n", + "\n", + "4 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Digital transformation is not a destination, it’s a journey\n", + "\n", + "##### Digitalization is reshaping many areas of manufacturing and logistics, product design, production and quality of goods as well as sustainability and energy output.\n", + "\n", + "This transition from manual operations to automated\n", + "solutions is enhancing and optimizing operational\n", + "efficiency and decision-making, while also making\n", + "supply chains more frictionless and reliable, as well\n", + "as enabling organizations to become more responsive\n", + "and adaptable to market and customer needs.\n", + "\n", + "This disruption has been driven by a rush of new\n", + "technologies including artificial intelligence, machine\n", + "learning, advanced analytics, digital twins, Internet\n", + "of Things (IoT), and automation. These, in turn, have\n", + "been enabled by the greater network capabilities of 5G.\n", + "Industry 4.0 is well underway. 
Intelligent Manufacturing isn’t the future, it’s what competitive organizations have established today.

## The data and AI maturity curve
### From descriptive to prescriptive

[Figure: The data and AI maturity curve. Analytics maturity progresses from raw data, cleaned data, reports and ad hoc queries (“What happened?”), through data exploration (“Why did it happen?”) and predictive modeling (“What will happen?”), to prescriptive analytics (“How can we make it happen?”).]

-----

## Manufacturing – use case maturity matrix

[Figure: Manufacturing use case maturity matrix. Twenty-three use cases are plotted by maturity stage (Awareness, Exploration, Optimization, Transformation) across Design, Purchasing, Manufacturing, Supply Chain, Marketing & Sales and Service, on a peer competitive scale ranging from standard, to common, to strategic among the peer group, to new innovations: 1. EDW offload, 2. Product 360, 3. Voice of customer insights, 4. Testing & simulation optimization, 5. Supplier 360, 6. Spend analytics, 7. Sourcing event optimization, 8. Process & quality monitoring, 9. Process 360, 10. Equipment predictive maintenance, 11. Quality & yield optimization, 12. Supply chain 360, 13. Demand analytics, 14. Inventory visibility & tracking, 15. Inventory optimization, 16. Logistics route optimization, 17. Customer 360, 18. Marketing & sales personalization, 19. Recommendation engine, 20. Asset/Vehicle 360, 21. Connected asset & value-added services, 22. Quality event detection & traceability, 23. Asset predictive maintenance.]

That is not to say that the digital transformation journey is simple. Replacing legacy systems, breaking down data and organizational silos, bridging the gap between operational technology (OT) and information technology (IT), reskilling workforces, and much more requires a clear and determined digitalization strategy and new levels of IT and data maturity.

Much of the aforementioned transformation requires a foundation of effective data management and architecture to be in place. Without this ability to control the vast amounts of structured data (highly organized and easily decipherable) and unstructured data (qualitative, no predefined data model), manufacturers cannot generate actionable insights from their data, derive value from machine learning, monitor and analyze supply chains, or coordinate decisions across the business.

-----

## The foundations for data-driven manufacturing

###### Cloud-native platforms

Improve data management, enhance data analytics and expand the use of enterprise data, including streaming structured and unstructured data

###### Technology-enabled collaboration

Democratize analytics and ML capabilities – ensure the right users have access to the right data driving business value

###### The ability to scale machine learning use cases

A central place to store and discover ML models, enabling greater collaboration between ML, data and business users

###### Open standards and open data architectures

Leverage open source standards and open data formats to accelerate innovation and enable the integration of best-of-breed, third-party tools and services

##### 95% agree that
[digital transformation in manufacturing is essential to their company’s future success](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)

[Global spending on digital transformation is forecast to reach](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)
##### USD$2.8 trillion by 2025

##### 85% have accelerated
[their digital transformation strategies since 2020](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)

-----

### Driving Force No. 
1\n", + "\n", + "## The shift from manufacturing to Intelligent Manufacturing\n", + "\n", + "##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion . The immediate response would be to point the finger at the pandemic. Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n", + "\n", + "\n", + "Yet the reasons for the lack of manufacturing\n", + "talent today are manifold, and COVID-19 has only\n", + "contributed to an existing problem. For instance,\n", + "many highly experienced baby boomers are\n", + "retiring from the workforce, leaving fewer people\n", + "with the in-depth knowledge of custom equipment\n", + "and machines. Meanwhile, younger generations\n", + "have a poor perception of what manufacturing jobs\n", + "are like and are reluctant to step into the industry.\n", + "Meaning not only a problem with retaining skills,\n", + "but also attracting them.\n", + "\n", + "And, of course, there is a growing gap between\n", + "the current capabilities of industrial workers and\n", + "the skill sets needed for today’s data-driven,\n", + "sensor-filled, 5G-enabled Intelligent Manufacturing.\n", + "\n", + "\n", + "With the drive to optimize operations, stabilize\n", + "supply chains and reinvent business models\n", + "through equipment-as-a-service, the skill sets\n", + "have radically changed from even a decade ago.\n", + "\n", + "Intelligent Manufacturing’s use cases are placing\n", + "a high demand on robotics programmers and\n", + "technicians, cybersecurity experts, digital twin\n", + "architects, supply network analysts, and people\n", + "who can leverage AI and machine learning\n", + "algorithms because deployment of these common\n", + "use cases is producing multiples of returns for\n", + "those embracing Intelligent Manufacturing.\n", + "\n", + "\n", + "8 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Those manufacturers with a strategy for upskilling existing talent, while also changing the perception of the incoming workforce, need to take advantage of the following use cases:\n", + "\n", + "\n", + "##### 44% report difficulty\n", + "[hiring manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[talent with the required](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[digital expertise](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "\n", + "##### 83% of manufacturing workers are interested\n", + "[in learning new digital skills](https://www.mendix.com/press/welcome-news-to-jumpstart-the-post-pandemic-economy-mendix-survey-shows-78-of-u-s-manufacturing-workers-want-to-help-with-digital-transformation/)\n", + "\n", + "##### 56% of Gen Z say\n", + "[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[changed their perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[of manufacturing. 
77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "\n", + "### Proof through customer success\n", + "\n", + "##### Watch our case study\n", + "\n", + "\n", + "###### Digital twins\n", + "\n", + "Ingesting information from sensors and other data sources,\n", + "these virtual replicas of physical assets create models\n", + "to which a layer of visualization can be applied. This enables\n", + "users to predict failures, assess performance and reveal\n", + "opportunities for optimization. Digital twins unlock the ability\n", + "for manufacturers to monitor and manage production remotely,\n", + "as well as explore “what-if” scenarios.\n", + "\n", + "###### Process and quality optimization\n", + "\n", + "Process and quality optimization generally covers the\n", + "optimization of equipment, operating procedures, and control\n", + "loops. It requires access to accurate, up-to-date data about\n", + "conditions, collected through IoT devices to monitor every\n", + "aspect. The introduction of deep learning architectures is\n", + "enabling manufacturing machinery to identify visual clues\n", + "that are indicative of quality issues in manufactured goods,\n", + "while digital twins can be used to spot inefficiencies without\n", + "the need to pause production.\n", + "\n", + "###### Throughput optimization\n", + "\n", + "Increasing throughput is critical for meeting delivery schedules,\n", + "and manufacturers are always looking for ways to identify\n", + "and eliminate bottlenecks, reduce inventory and increase\n", + "the utilization of assets. Throughput optimization makes\n", + "use of data-driven algorithms to identify, rank and resolve\n", + "labor, equipment or inventory bottlenecks.\n", + "\n", + "\n", + "###### Equipment predictive maintenance\n", + "\n", + "Rather than wait for a piece of equipment to fail or\n", + "stick to a fixed schedule, predictive maintenance adopts\n", + "a predictive approach to equipment maintenance.\n", + "By monitoring real-time data collected from hundreds\n", + "of IoT sensors, machine learning techniques can detect\n", + "anomalies in operations and possible defects in equipment\n", + "and processes. Predictive maintenance correlates data across\n", + "many more dimensions than traditional inspection techniques,\n", + "to anticipate failures and prevent costly breakdowns.\n", + "\n", + "###### Quality and yield optimization (with computer vision)\n", + "\n", + "Quality assurance focuses on the use of data analytics,\n", + "AI and machine learning to identify and prevent defects\n", + "during the manufacturing process. 
[This type of edge AI](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n", + "[is an approach that can increase productivity by 50%](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n", + "[and detection rates by up to 90%.](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process) Making use of image\n", + "recognition and machine learning, computer vision\n", + "can automate visual inspections, detecting faults\n", + "and imperfections faster and more cost effectively\n", + "than manual approaches.\n", + "\n", + "\n", + "9 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 2\n", + "\n", + "## Transparency, visibility, data: optimizing the supply chain\n", + "\n", + "##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n", + "\n", + "\n", + "Such resiliency requires a combination\n", + "of technologies and solutions. For example,\n", + "decision support tools with predictive capabilities\n", + "– to monitor the supply chain and analyze\n", + "what-if scenarios. Demand sensing and forecasting\n", + "in combination with enterprise critical systems\n", + "(ERP) needs to combine data from a wide variety\n", + "of sources.\n", + "\n", + "10 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "Working together, combining millions of data points\n", + "from across organizations’ operations along with\n", + "other external sources, these technologies can\n", + "be used to optimize supply chains, reduce costs\n", + "and improve customer service and loyalty.\n", + "However, achieving this – embracing the latest\n", + "in AI, machine learning and predictive analytics –\n", + "means being able to manage and maintain\n", + "a flow of accurate, relevant data and to be able\n", + "to translate this data into actionable insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Successful supply chain optimization depends on up-to-the-minute, end-to-end visibility that can be applied across all stages of the supply chain, from design to planning to execution. This will incorporate a range of solutions that can include:\n", + "\n", + "\n", + "###### Demand, inventory, logistics\n", + "\n", + "\n", + "###### Purchasing\n", + "\n", + "**Spend analytics:** Most obviously, transparency and insight into where\n", + "cash is spent is vital for identifying opportunities to reduce external\n", + "spending across supply markets, suppliers and locations. However, spend\n", + "analytics are also hugely important to supply chain agility and resilience.\n", + "This requires a single source of data truth for finance and procurement\n", + "departments. 
For example, integrating purchase order, invoice,\n", + "accounts payable, and general-ledger account data to create a level of\n", + "transparency, visibility and consistency to inform supplier discussions\n", + "and deploy strategies to manage cash better during times\n", + "of disruption.\n", + "\n", + "###### Cross supply chain collaboration\n", + "\n", + "**Supply chain 360:** With real-time insights and aggregated supply\n", + "chain data in a single business intelligence dashboard, manufacturers\n", + "are empowered with greater levels of visibility, transparency\n", + "and insights for more informed decision-making. This dashboard\n", + "can be used to identify risks and take corrective steps,\n", + "assess suppliers, control costs and more.\n", + "\n", + "\n", + "**Demand analytics:** By collecting and analyzing millions –\n", + "if not billions – of data points about market and customer\n", + "behavior and product performance, manufacturers can\n", + "use this understanding to improve operations and support\n", + "strategic decisions that affect the demand of products\n", + "and services. [Around 80% say that using this form of data](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[analysis has improved decision-making, while 26% say](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[having this level of know-how to predict, shape and meet](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[demands has increased their profits.](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "\n", + "**Inventory visibility and tracking:**\n", + "Inventory visibility is the ability to view and track\n", + "inventory in real time, with insights into SKU stock levels\n", + "and which warehouse or fulfillment center it is stored at.\n", + "With complete oversight of inventory across multiple\n", + "channels, this helps improve supply chain efficiency,\n", + "demand forecasting and order accuracy, while ultimately\n", + "enhancing the customer experience.\n", + "\n", + "\n", + "**Inventory optimization:** The practice of having the right\n", + "amount of available inventory to meet demand, both in the\n", + "present and the future, enables manufacturers to address\n", + "demand expectations, and reduce the costs of common\n", + "inventory issues. Inventory optimization incorporates\n", + "data for demand forecasting, inventory strategy and\n", + "stock replenishment. With the addition of AI reinforced\n", + "learning models, this can help improve demand prediction,\n", + "recommend stock levels, and automatically order\n", + "raw materials to fulfill orders, while also detecting\n", + "and responding to shifts in demand.\n", + "\n", + "**Logistics route optimization:** Using AI, route optimization\n", + "can help manufacturers go beyond normal route planning\n", + "and include parameters to further drive logistics efficiency.\n", + "What-if scenarios present route options that help cut\n", + "transportation costs, boost productivity and execute\n", + "on-time deliveries.\n", + "\n", + "\n", + "**Supply chain network design:** By building and modeling the supply\n", + "chain, it enables manufacturers to understand the costs and time\n", + "to bring goods and services to market. 
Supply chain network design\n", + "helps to evaluate delivery at the lowest possible cost, optimal sources\n", + "and inventory deployment, as well as define distribution strategies.\n", + "\n", + "11 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "[Successfully implementing AI-enabled supply](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n", + "[chain management has enabled early adopters to](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n", + "##### improve logistics costs by 15%, inventory levels by 35%, and service levels by 65%\n", + "\n", + " Only 6% of companies believe\n", + "[they’ve achieved full supply chain visibility](https://www.supplychaindive.com/news/supply-chain-visibility-failure-survey-geodis/517751/\r)\n", + "\n", + "##### 57% believe that supply chain management \n", + "[gives them a competitive edge that enables them](https://financesonline.com/supply-chain-statistics/\r)\n", + "[to further develop their business](https://financesonline.com/supply-chain-statistics/\r)\n", + "\n", + "### Supply chain optimization case study\n", + "\n", + "##### Watch our case study\n", + "\n", + "12 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 3\n", + "\n", + "## Future opportunities for manufacturing business models\n", + "\n", + "##### When looking at the rapid evolution and growth of e-commerce, manufacturers have some catching up to do. Particularly when it comes to embracing new and customer-centric business models. For example, when shifting from a product to a service mindset, the product lifecycle becomes more holistic and the client relationship is maintained beyond the point of purchase.\n", + "\n", + "\n", + "These new opportunities are forming part\n", + "of a longer-term industry shift from the sale\n", + "of goods (CapEx) to recurring revenue streams,\n", + "such as through Equipment-as-a-Service (EaaS)\n", + "models. While this approach is not new to many\n", + "(Rolls-Royce’s “Power-by-the-Hour” engine\n", + "subscription model has been around since 1962),\n", + "customer demand, advances in industrial IoT\n", + "technology, and a continuing decline in\n", + "sales and margins have seen EaaS emerge\n", + "as an imperative for manufacturers.\n", + "\n", + "\n", + "Opening up some of these new revenue streams,\n", + "of course, demands operational flexibility, but more\n", + "importantly, digital maturity. This means cloud\n", + "technologies that allow employees new levels\n", + "of access to data, the ability to work anywhere,\n", + "and adapt rapidly to new needs. 
The introduction\n", + "of a microservices architecture, to allow the agile\n", + "development and deployment of new IT services.\n", + "And the democratization of data, so the entire\n", + "organization and its ecosystem of partners\n", + "and suppliers have access to information\n", + "about market demand, operations, production,\n", + "logistics and transportation.\n", + "\n", + "\n", + "13 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "##### By 2023, 20% of industrial equipment manufacturers will\n", + "[support EaaS with remote](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n", + "[Industrial IoT capabilities](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n", + "\n", + "##### In 2025, the global EaaS market is estimated\n", + "[to grow to $131B compared](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n", + "[to $22B in 2019](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n", + "\n", + "##### In the U.S., 34% said\n", + "[pay-per-use models represent](https://relayr.io/pr-forsa-survey/)\n", + "[a big or a very big competitive](https://relayr.io/pr-forsa-survey/)\n", + "[advantage, while 29% consider](https://relayr.io/pr-forsa-survey/)\n", + "[it a slight advantage](https://relayr.io/pr-forsa-survey/)\n", + "\n", + "### Equipment as a service case study\n", + "\n", + "##### Read our case study\n", + "\n", + "\n", + "### This level of visibility and collaboration is not only beneficial to lower maintenance costs, capital expenditure and human capital management, but also in empowering all stakeholders to make smarter and more informed decisions.\n", + "\n", + "\n", + "###### Connected assets\n", + "\n", + "The digital connectivity of high-value\n", + "physical assets is helping to drive a\n", + "more efficient use of assets and cost\n", + "savings. Connected assets can provide\n", + "continuous, real-time data on their\n", + "operating conditions, even if they are on\n", + "the other side of the world. Connected\n", + "assets can also be used as the foundation\n", + "of as-a-service business models to\n", + "track the usage of rented machines, and\n", + "for automakers to use with connected\n", + "vehicles and electrification strategies.\n", + "\n", + "\n", + "###### Quality event detection and traceability\n", + "\n", + "Manufacturers are increasingly seeking\n", + "end-to-end supply chain traceability —\n", + "to be able to identify and trace\n", + "the history, distribution, location\n", + "and application of products, parts\n", + "and materials. With event-based\n", + "traceability, typically using blockchain\n", + "ledgers, manufacturers can record\n", + "events along the supply chain.\n", + "This can help aid legal compliance,\n", + "support quality assurance and brand\n", + "trust, and provide full supply chain\n", + "visibility for better risk management.\n", + "\n", + "\n", + "###### Demand-driven manufacturing\n", + "\n", + "**Equipment-as-a-Service:**\n", + "Startup organizations without\n", + "the in-house infrastructure can\n", + "use a third-party to realize their\n", + "concepts, while manufacturers\n", + "with the production capabilities\n", + "can ensure minimal downtime\n", + "for their assets. 
This involves\n", + "greater risk for the manufacturer,\n", + "but also the potential for higher\n", + "and annuitized revenues.\n", + "\n", + "\n", + "14 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 4\n", + "\n", + "## The focus on sustainability\n", + "\n", + "##### It’s an inescapable truth that Earth’s resources are finite, and we need to change our present, linear business models for something that minimizes our use of resources and eliminates waste. Manufacturers need to take a more sustainable approach, where they can limit their negative environmental impacts, while also conserving energy and natural resources.\n", + "\n", + "\n", + "When looking at the entire manufacturing\n", + "value chain, there are many areas where\n", + "more sustainable practices can deliver\n", + "measurable change. Products can be\n", + "designed in a way that reduces waste\n", + "and increases their longevity; materials\n", + "can be selected and sourced in a more\n", + "ethical way; operational efficiency and\n", + "green energy can improve production;\n", + "and the introduction of sustainable\n", + "practices for transportation and\n", + "shipping can help reduce carbon\n", + "footprints. [These are part of the move](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[toward more circular business models](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[and establishing what PwC has called the](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[four Rs of the circular economy: Reduce,](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[Refurbish/Reuse, Recycle and Recover.](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "\n", + "\n", + "There are a number of business\n", + "operating models that employ the four\n", + "Rs and support the circular economy.\n", + "Sharing platforms and aaS models help\n", + "optimize manufacturing capacity and\n", + "enable businesses to rent rather than\n", + "buy the machinery and equipment\n", + "they need. Product use extension helps\n", + "extend the lifecycle of products through\n", + "repair and refurbishment, while resource\n", + "recovery means recovering raw materials\n", + "from end-of-life products.\n", + "\n", + "Achieving this means establishing\n", + "a redesigned supply chain that\n", + "leverages many use cases, technologies\n", + "and solutions we covered earlier.\n", + "\n", + "\n", + "It will require greater levels of\n", + "collaboration between suppliers\n", + "and vendors. 
It will require optimizing production lines and transportation. It will require greater levels of customer engagement to extend product lifecycles and close the loop of the supply chain.

But most of all, it will require data, to provide visibility and intelligence across the network, and to be able to make the decisions to improve efficiency in the present, as well as longer-term decisions based on a broad view of sustainability impacts.

-----

### Sustainability Solution Accelerator

##### Read now

##### The manufacturing industry alone is responsible for
[54% of the world’s energy consumption and 20% of carbon emissions](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)

##### 80% of the world’s leading companies
[are now incorporating sustainability into their operations and goals](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)

##### 78% of industrial, manufacturing and metals organizations now report on sustainability — up from 68% in 2017

-----

## Leveraging the Databricks Lakehouse for Manufacturing

Our open, simple and collaborative Lakehouse for Manufacturing enables automotive, electronics, industrial, and transportation & logistics organizations to unlock more value and transform how they use data and AI.

Lakehouse enables a real-time data-driven business with the ability to ingest structured, semi-structured and unstructured data from ERP, SCM, IoT, social or other sources in your value chain so that predictive AI and ML insights can be realized. This enables them to operate their business in real time, deliver more accurate analytics that leverage all their data, and drive collaboration and innovation across their value chain. Most important for capital-intensive manufacturing businesses, it enables them to move quickly from proof-of-concept (PoC) ideation to ROI. (A minimal ingestion sketch illustrating this pattern appears at the end of this eBook.)

[Figure: Lakehouse for Manufacturing architecture. All your sources, any structure or frequency (Competitor News & Social, Consumer Devices, Video & Images, IoT & Shop Floor, Enterprise Resource Planning, Sales Transaction & Syndicated, Inventory & Logistics) are ingested as structured, semi-structured or unstructured data, in batch or real time, into the Data Lakehouse with reliable, real-time processing: process, manage and query all your data, on any cloud. Analytics capabilities for any use case or persona: Ad Hoc Data Science (low-cost, rapid experimentation with new data and models), Production Machine Learning (high volume, fine-grained analysis at scale served in the tightest of service windows), BI Reporting and Dashboarding (power real-time dashboarding directly, or feed data to a data warehouse for high-concurrency reporting), and Real-Time Applications (provide real-time data to downstream applications or power applications via APIs).]

-----

## The building blocks of Lakehouse for Manufacturing

###### Real Time

Make data-informed decisions

###### Solution Accelerators

Accelerate the possibilities of capabilities

###### Partner Solutions

Accelerate the creation of insights

###### Speed

Delivering fast ROI

**Real-time data to make informed decisions:** The Lakehouse Platform streamlines data ingestion and management in a way that makes it easy to automate and secure data with fast, real-time performance. This means you can consolidate and enhance data from across the organization and turn it into accessible, actionable insights.

**Solution Accelerators for new capabilities:** Through our Solution Accelerators, manufacturers can easily access and deploy common and high-impact use cases. For manufacturers restricted by time and resources, these accelerators provide the tools and pre-built code to deliver PoCs in less than two weeks.

**Pre-built applications to deliver solutions faster:** We make it easy for you to discover data, analytics and AI tools, using pre-built integrations to connect with partner solutions, integrating them (and existing solutions) into the Lakehouse Platform to rapidly expand capabilities in a few clicks.

**The speed to deliver fast ROI:** With faster data ingestion and access to insights combined with easier, quicker deployments, this means accelerated digital transformation and higher ROI.

-----

## Manufacturers’ end goals

##### Intelligent Manufacturing leaders leverage a combination of familiar manufacturing techniques and recent, value-producing and differentiating data-led use cases.

This means making use of IIoT, cloud computing, data analytics, machine learning and more to create an end-to-end digital ecosystem across the entire value chain and build scalable architectures that take data from edge to AI. 
It means embracing automation and robotics, optimizing how organizations use assets and augmenting the capabilities of workforces, and introducing new levels of connectivity to accelerate performance. Not to mention opening the door to new platform and as-a-service business models with the potential to generate new revenue streams.

Also key to the data-driven transformation of manufacturing is visibility: a 360-degree, end-to-end view of the supply chain. Not only is this critical for the efficiency, optimization and profitability of operations, it is needed to be able to take new strides in sustainability.

Of course, better data management is not only about unlocking insight, empowering AI, and enabling decision-making. It’s also about governance: acknowledging format issues, adhering to compliance, protecting IP, ensuring data security. All this needs to be taken into consideration when bringing on board an ISV to establish a modern, unified architecture for data and AI.

-----

## About Databricks

Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc).

Get started with a free trial of Databricks and start building data applications today

##### Start your free trial

To learn more, visit us at:

**[Databricks for Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**

-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
## Driving Innovation and Transformation in the Federal Government With Data + AI\n", + "\n", + "Empowering the federal government\n", + "to efficiently deliver on mission objectives\n", + "and better serve citizens\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "State of the union: Data and AI in the federal government **03**\n", + "\n", + "Recognizing the opportunity for data and AI **04**\n", + "\n", + "Challenges to innovation **07**\n", + "\n", + "The Databricks Lakehouse Platform: Modernizing the federal government to achieve mission objectives **09**\n", + "\n", + "Customer story: U.S. Citizenship and Immigration Services **13**\n", + "\n", + "Conclusion **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "### State of the union: Data and AI in the federal government\n", + "\n", + "For the private sector, the growth, maturation and application of data analytics and\n", + "\n", + "artificial intelligence (AI) have driven innovation. This has resulted in solutions that have\n", + "\n", + "helped to improve efficiencies in everything from optimizing supply chains to accelerating\n", + "\n", + "drug development to creating personalized customer experiences and much more.\n", + "\n", + "Unfortunately, the federal government and many of its agencies are just beginning to take\n", + "\n", + "advantage of the benefits that data, analytics and AI can deliver. This inability to innovate\n", + "\n", + "is largely due to aging technology investments, resulting in a sprawl of legacy systems\n", + "\n", + "siloed by agencies and departments.\n", + "\n", + "Additionally, the government is one of the largest employers in the world, which introduces\n", + "\n", + "significant complexity, operational inefficiencies and a lack of transparency that limit the\n", + "\n", + "ability of its agencies to leverage the data at their disposal for even basic analytics – let\n", + "\n", + "alone advanced data analytic techniques, such as machine learning.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Recognizing the opportunity for data and AI\n", + "\n", + "The opportunity for the federal government to leverage data analytics and AI cannot be\n", + "\n", + "overstated. With access to some of the largest current and historical data sets available to the\n", + "\n", + "\n", + "United States — and with vast personnel resources and some of the best private sector use\n", + "\n", + "cases and applications of AI available in the world — the federal government has the ability to\n", + "\n", + "transform the efficiency and effectiveness of many of its agencies.\n", + "\n", + "In fact, the federal government plans to spend $4.3 billion in artificial intelligence research and\n", + "\n", + "development across agencies in fiscal year 2023, according to a recent report from Bloomberg\n", + "\n", + "Government. 
These priorities are validated by a recent Gartner study of government CIOs across all levels (including state and local), confirming that the top game-changing technologies are AI, data analytics and the cloud.

And as an indication of the potential impact, a recent study by Deloitte shows the government can save upward of $3 billion annually on the low end to more than $41 billion annually on the high end from data-driven automation and AI.

Sources:

[• Gartner Survey Finds Government CIOs to Focus Technology Investments on Data Analytics and Cybersecurity in 2019](https://www.gartner.com/en/newsroom/press-releases/2019-01-23-gartner-survey-finds-government-cios-to-focus-technol)

[• Administration Projects Agencies Will Spend $1 Billion on Artificial Intelligence Next Year](https://www.nextgov.com/emerging-tech/2019/09/administration-projects-agencies-will-spend-1-billion-artificial-intelligence-next-year/159781/)

Investment in AI to automate repetitive tasks can improve efficiencies across government agencies, which could save **96.7 million** federal hours annually, with a potential savings of **$3.3 billion.**

**WILLIAM EGGERS, PETER VIECHNICKI AND DAVID SCHATSKY**

[Deloitte Insights](https://www2.deloitte.com/us/en/insights/focus/cognitive-technologies/artificial-intelligence-government.html)

-----

**An increased focus on cloud, analytics and AI = operational efficiency**

**$1B** — Data and AI Research and Development Initiative (U.S. Government)

**TOP PRIORITIES** — Government CIOs’ top game-changing technologies: 1. AI/ML, 2. Data Analytics, 3. Cloud

**$41B+** — Estimated government savings from data-driven automation

Fortunately, the President’s Management Agenda (PMA) has recognized the need to modernize their existing infrastructure, federate data for easier access and build more advanced data analytics capabilities by establishing mandates for modernization, data openness and the progression of AI innovations.

**IT Modernization Act**

Allows agencies to invest in modern technology solutions to improve service to the public, secure sensitive systems and data, and save taxpayer dollars.

**Federal Data Strategy**

A 10-year vision for how the federal government will accelerate the use of data to achieve its mission, serve the public and steward resources, while protecting security, privacy and confidentiality.

**AI Executive Order**

Makes AI a top research and development priority for federal agencies, provides a shared ethics framework for developing and using AI, and expands job rotation programs to increase the number of AI experts at agencies.

This will put agencies in a better position to leverage the scale of the cloud and democratize secure access to data in order to enable downstream business intelligence and AI use cases. The end result will be transformative innovation that can not only improve the operational efficiencies of each agency, but also support the delivery of actionable insights in real time 
This benefits citizens in the form of better services,\n", + "\n", + "stronger national security and smarter resource management.\n", + "\n", + "\n", + "-----\n", + "\n", + "Top data and AI use cases in the government\n", + "\n", + "\n", + "**H E A LT H C A R E**\n", + "\n", + "Improve the delivery and quality of healthcare services for citizens with powerful analytics and a 360°\n", + "\n", + "view of patients.\n", + "\n", + "- Patient 360 - Insurance management\n", + "\n", + "- Population health - Genomics\n", + "\n", + "- Supply chain optimization - Drug discovery and delivery\n", + "\n", + "\n", + "Across the federal government, data and AI is providing the insights and predictive\n", + "\n", + "capabilities to thwart cyberattacks and national threats, provide better social services more\n", + "\n", + "efficiently, and improve the delivery and quality of healthcare services.\n", + "\n", + "**H O M E L A N D S E C U R I T Y**\n", + "\n", + "\n", + "Detect and prevent criminal activities and national threats with real-time analytics and data-driven\n", + "\n", + "decision-making.\n", + "\n", + "\n", + "\n", + "- Customs and border protection - Counter-terrorism\n", + "\n", + "- Immigration and citizenship - Federal emergency aid management\n", + "\n", + "**D E F E N S E**\n", + "\n", + "\n", + "**E N E R G Y**\n", + "\n", + "Improve energy management with data insights that ensure energy resiliency and sustainability.\n", + "\n", + "- Security of energy infrastructure - Energy exploration\n", + "\n", + "- Smarter energy management - Electrical grid reliability\n", + "\n", + "\n", + "Apply the power of predictive analytics to geospatial, IoT and surveillance data to improve operations\n", + "\n", + "\n", + "**C O M M E R C E**\n", + "\n", + "Proactively detect anomalies with machine learning to mitigate risk and prevent fraudulent activity.\n", + "\n", + "- Tax fraud and collection - Grants management\n", + "\n", + "- Process and operations management - Customer 360\n", + "\n", + "**I N T E L L I G E N C E C O M M U N I T Y**\n", + "\n", + "Leverage real-time insights to make informed decisions that can impact the safety of our citizens and\n", + "\n", + "the world.\n", + "\n", + "- Threat detection - Intelligence surveillance and reconnaissance\n", + "\n", + "- Neutralize cyberattacks - Social media analytics\n", + "\n", + "\n", + "and protect the nation.\n", + "\n", + "- Logistics - Surveillance and reconnaissance\n", + "\n", + "- Predictive maintenance - Law enforcement and readiness\n", + "\n", + "\n", + "-----\n", + "\n", + "### Challenges to innovation\n", + "\n", + "The opportunity to drive innovation throughout the federal government is massive and\n", + "\n", + "has implications for every U.S. citizen. But there are several critical barriers preventing\n", + "\n", + "\n", + "Ten of the existing legacy systems\n", + "most in need of modernization\n", + "cost about **$337 million a year**\n", + "to operate and maintain.\n", + "\n", + "\n", + "agencies from making the progress needed to realize the value of their data and delivering\n", + "\n", + "those innovations.\n", + "\n", + "**THE GOVERNMENT ACCOUNTABILITY OFFICE,**\n", + "\n", + "**INFORMATION TECHNOLOGY REPORT TO CONGRESS, JUNE 2019**\n", + "\n", + "The complexities and impact of legacy data warehouses and marts\n", + "\n", + "Multiple federal agencies are burdened with a legacy IT infrastructure that is being left\n", + "\n", + "\n", + "behind by the technological advancements seen in the private sector. 
This infrastructure is traditionally built with on-premises data warehouses and data marts that are highly complex to maintain, costly to scale as compute is coupled with storage, limited from a data science perspective, and they lack support for the growing volumes of unstructured data. This inhibits data-driven innovation and blocks the use of AI, leaving agencies to search for data science tools to fill the gaps.

Infrastructure also becomes harder and more expensive to maintain as it ages. Over time, these environments become more complex due to their need for specialized patches and updates that keep these systems available while doing nothing to solve the issues of poor interoperability, ever-decreasing processing speeds, and an inability to scale – all of which are critically necessary to support today’s more data-intensive use cases. For example, systems at the departments of Education, Health and Human Services, Treasury, and Social Security are over 40 years old.¹ This is causing pain in a variety of areas.

Maintaining these systems requires a massive investment of both time and money compared to modern cloud-based systems. For the technical teams that are tasked with trying to integrate any of these legacy systems with third-party tooling or services, this often requires significant customization and, even then, there is still a chance that the final integration won’t be successful. These systems also keep personnel from spending their energy and resources on emerging technologies such as AI.

And data reliability is a big concern. Replication of data occurs across data marts as various teams try to access and explore it, creating data management and governance challenges. Without a single source of truth, teams struggle with data inconsistencies, which can result in inaccurate analysis and model performance that is only compounded over time.

Thankfully, there are initiatives in place, such as the Data Center and Cloud Optimization Initiative Program Management Office (DCCOI PMO), which are investing in modernizing IT infrastructure for federal agencies.²

[¹ Agencies Need to Develop Modernization Plans for Critical Legacy Systems](https://www.gao.gov/assets/gao-19-471.pdf)

[² IT Modernization](https://www.gsa.gov/technology/government-it-initiatives/data-center-optimization-initiative-dcoi)

-----

Data is critical … and complicated

Data is both the greatest asset and one of the greatest challenges that federal agencies must learn to manage. While the volume and usefulness of data collected by federal agencies are not in question, much of it is locked in legacy source systems, comes in diverse structured and unstructured formats, and is subject to a variety of governance models.

Data silos hamper any data-driven advancements

In any data-driven organization, the need to have trusted, timely and efficient access to data is critical. For the data teams responsible for driving the digital transformation of federal agencies, the challenges they face are myriad.

We have already seen how existing, legacy infrastructure, as well as the integration of fragmented data sources, will strain data engineering teams trying to deliver high-quality data at scale. Their challenge includes developing the right data pipelines that will take the massive volumes of raw data coming from fragmented sources into one centralized location with clean, secure and compliant data for agency decision-makers.

Data scientists and analysts alike must have the right toolset to collaboratively investigate, extract and report meaningful insights from this data. Unfortunately, data silos extend to organizational silos, which make collaboration inside an agency as well as between agencies very difficult. With different groups of data teams leveraging their own coding and analytical tools, communicating insights and working across teams — let alone across agencies — is almost impossible. This lack of collaboration can drastically limit the capabilities of any data analytics or AI initiatives — from the deployment of shared business intelligence (BI) reports and dashboards for data investigation and decision-making to the training of machine learning models to automate processes and make predictions. Compounding these challenges is an overall lack of data science expertise and skills within federal agencies. As a result, even with access to their data, without intuitive tooling it’s very difficult to deliver advanced analytic use cases with ML and AI.

Organizational silos also impact the effectiveness of data analysts, who are responsible for analyzing and reporting insights from the data to better inform subject-matter experts or policy — and decision-makers. Without a data platform that eliminates these silos and enables visualization of and reporting on shared data, data analysts will be limited in how they are able to drive the organizational and policy agendas of their respective agencies.

Not only is this data siloed and very difficult to integrate, but the data volumes collected by federal agencies are massive. At Health and Human Services, for example, or the Department of Veterans Affairs, healthcare data sets will be sized by population and include electronic health records, clinical data, imaging and more. For the Department of Defense and the Department of Homeland Security, data includes everything from mapping, satellite imagery and intelligence data to payroll and human resources data. 
The Social Security Administration and Internal Revenue Service manage personal data for every single citizen in the United States.

Combining these various forms of data from disparate legacy systems that are not integrated — and doing it across different government agencies and departments — can be slow and error prone, hindering downstream analytics and actionable insights. The teams that are responsible for this are faced with not only integrating these data sources, but also managing the entire ETL workflow in order to enable the application of basic analytics, let alone machine learning and AI.

-----

**THE DATABRICKS LAKEHOUSE PLATFORM:**
### Modernizing the federal government to achieve mission objectives

Databricks provides federal agencies with a Lakehouse Platform that combines the best of data warehouses and data lakes — to store and manage all your data for all your analytics workloads. Databricks federates all data and democratizes access for downstream use cases, empowering federal agencies to unlock the full potential of their data to deliver on their mission objectives and better serve citizens.

Lakehouse offers a single solution for all major data workloads, whether structured or unstructured, and supports use cases from streaming analytics to BI, data science and AI.

Federal agencies that are powering impactful innovations with Databricks Lakehouse:

- Using predictive analytics for better passenger safety and experience
- Enabling operational efficiencies through process automation to streamline the path to citizenship
- Leveraging advanced analytics to improve outcomes for patients through Medicare and Medicaid services

[Architecture diagram: all your government data (health, surveillance, Social Security, demographics, crime, audio/visual, geospatial — structured and unstructured, batch and streaming) flows into the Data Lakehouse — process, manage and query all your data — with reliable, real-time processing and analytics capabilities for every use case: ad hoc data science, production machine learning, and BI reporting and scorecarding.]

The Databricks Lakehouse Platform has three unique characteristics that address head-on the biggest challenges that federal agencies are facing:

It offers simplicity with regard to data management, in that the Databricks Lakehouse is architected to support all of an agency’s data workloads on one common platform.

It is built on open standards so that any existing investments in tooling or resources can remain effective.

And it’s collaborative, enabling agency data engineers, analysts and data scientists to work together much more easily.

-----

Managing federal data with a unified approach

Databricks enables aggregation and processing of massive collections of diverse and sensitive agency data that currently exists in silos, both structured and unstructured. As we’ve seen, for many agencies this would be incredibly difficult with the infrastructure challenges they are experiencing. The Databricks Lakehouse leverages Delta Lake to unify the very large and diverse amounts of data that government agencies are working with. Delta Lake is an open format, centralized data storage layer that delivers reliability, security and performance — for both streaming and batch operations.

By providing a unified data foundation for business intelligence, data science and machine learning, federal agencies can add reliability, performance and quality to existing data lakes while simplifying data engineering and infrastructure management with automation to simplify the development and management of data pipelines.

The Lakehouse Platform combines the best elements of data lakes and data warehouses — delivering the data management and performance typically found in data warehouses with the low-cost, flexible object stores offered by data lakes

-----

Break down the institutional silos limiting collaboration

Foster collaboration at every step with the latest machine learning tools that allow everyone to work and build value together — from data scientists to researchers to business decision-makers. Close the glaring skills gap within these government organizations by providing tooling that simplifies the ML lifecycle and empowers the data teams that do not have the data science expertise to still be productive with their data through integrating BI tools and SQL analytics capabilities.

Empower data scientists with an intuitive and interactive workspace where they can easily collaborate on data, share models and code, and manage the entire machine learning lifecycle in one place. Databricks notebooks natively support Python, R, SQL and Scala so practitioners can work together with the languages and libraries of their choice.

Deliver on mission objectives with powerful analytics across agencies

The Databricks Lakehouse Platform includes a business intelligence capability — Databricks SQL. Databricks SQL allows data analysts and users to query and run reports against all of an agency’s unified data. Databricks SQL integrates with BI tools, like Tableau and Microsoft Power BI, and complements any existing BI tools with a SQL-native interface, allowing data analysts and data scientists to query data directly within Databricks.

Additionally, with Databricks SQL, the data team can turn insights from real-world data into 
Visualizations can then be turned\n", + "\n", + "into interactive dashboards to share insights with peers across agencies, policymakers,\n", + "\n", + "\n", + "Easily create visualizations and share dashboards via integrations with BI tools, like Tableau and Microsoft Power BI\n", + "\n", + "\n", + "regulators and decision-makers.\n", + "\n", + "\n", + "-----\n", + "\n", + "Ensure data security and compliance at scale\n", + "\n", + "Databricks is fully aware of the sensitivity of the data that many of our federal agencies are\n", + "\n", + "responsible for. From national security and defense data to individual health and financial\n", + "\n", + "information to national infrastructure and energy data — all of it is critical. Data is protected\n", + "\n", + "at every level of the platform through deep integration with fine-grained, cloud-provider\n", + "\n", + "access control mechanisms. The Databricks Lakehouse is a massively secure and scalable\n", + "\n", + "multicloud platform running millions of machines every day. It is independently audited\n", + "\n", + "and compliant with FedRAMP security assessment protocols on the Azure cloud and can\n", + "\n", + "provide a HIPAA-compliant deployment on both AWS and Azure clouds.\n", + "\n", + "The platform’s administration capabilities include tools to manage user access, control\n", + "\n", + "spend, audit usage, and analyze activity across every workspace, all while seamlessly\n", + "\n", + "enforcing user and data governance, at any scale.\n", + "\n", + "With complete AWS accreditation, Databricks runs across all major networks including\n", + "\n", + "GovCloud, SC2S, C2S and commercial; all networks, including public, NIPR, SIPR and JWICS;\n", + "\n", + "and ATOs, including FISMA, IL5, IL6, ICD 503 INT-A and INT-B.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CUSTOMER STORY: U.S. CITIZENSHIP AND IMMIGRATION SERVICES**\n", + "### Streamlining the path to citizenship with data\n", + "\n", + "##### 24x faster\n", + "\n", + "query\n", + "performance\n", + "\n", + "\n", + "##### 10 minutes\n", + "\n", + "to process tables\n", + "with 120 million rows\n", + "\n", + "\n", + "##### 40 million\n", + "\n", + "applications\n", + "processed\n", + "\n", + "\n", + "The U.S. Citizenship and Immigration Services (USCIS) gains actionable insights from\n", + "\n", + "dashboards via Tableau to better understand how to streamline operations and more quickly\n", + "\n", + "process immigration and employment applications as well as petitions. Today, their data\n", + "\n", + "analyst team has over 6,000 Tableau dashboards running — all powered by Databricks.\n", + "\n", + "The U.S. Citizenship and Immigration Services is the government agency that oversees\n", + "\n", + "\n", + "lawful immigration to the United States. Over the last decade, the volume of immigration-\n", + "\n", + "and citizenship-related applications has skyrocketed across naturalizations, green cards,\n", + "\n", + "employment authorizations and other categories. With millions of applications and petitions\n", + "\n", + "flooding the USCIS, processing delays were reaching crisis levels — with overall case\n", + "\n", + "processing times increasing 91% since FY2014.\n", + "\n", + "\n", + "-----\n", + "\n", + "Processing delays fueled by on-premises, legacy architecture\n", + "\n", + "Core to these issues was an on-premises, legacy architecture that was complex, slow and\n", + "\n", + "costly to scale. 
By migrating to AWS and Databricks, USCIS adopted a unified approach to data analytics with more big data processing power and the federation of data across dozens of disparate sources. This has unlocked operational efficiencies and new opportunities for their entire data organization to drive business intelligence and fuel ML innovations designed to streamline application and petition processes.

Removing complexities with a fully managed cloud platform

Databricks provided USCIS with significant impact where it mattered most — faster processing speeds that enabled data analysts to deliver timely reports to decision-makers — and that freed up data scientists to build ML models to help improve operations. Leveraging the efficiencies of the cloud and Delta Lake, they were able to easily provision a 26-node cluster within minutes and ingest tables with 120 million rows into S3 in under 10 minutes. Prior to Databricks, performing the same processes would have taken somewhere between two and three hours.

A new era of data-driven innovation improves operations

USCIS now has the ability to understand their data more quickly, which has unlocked new opportunities for innovation. With Databricks, they are able to run queries in 19 minutes, something that used to take an entire day — a 24x performance gain. This means they are spending far less time troubleshooting and more time creating value.

Since migrating to the cloud and integrating Databricks into their data analytics workflows, USCIS has been able to make smarter decisions that help streamline processes and leverage ML to reduce application processing times. These newfound efficiencies and capabilities have allowed them to scale their data footprint from about 30 data sources to 75 without issue.

We discovered Databricks, and the light bulb really clicked for us on what we needed to do moving forward to stay relevant.

**SHAWN BENJAMIN, CHIEF OF DATA AND BUSINESS INTELLIGENCE, USCIS**

-----

### Conclusion

Enabling federal agencies to take advantage of data analytics and AI will help them execute their missions both effectively and efficiently. The Databricks Lakehouse Platform will unify data, analytics and AI workloads, making agencies data-driven and giving policymakers access to deeper, more meaningful insights for decision-making. It will also eliminate data silos and increase communication and collaboration across agencies to ensure the best results for all citizens.

-----

### About Databricks

Databricks is the data and AI company. More than 5,000 organizations worldwide — including Comcast, Condé Nast, H&M, and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. 
Founded by the original\n", + "\n", + "creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n", + "\n", + "data teams solve the world’s toughest problems.\n", + "\n", + "Get started with a free trial of Databricks and\n", + "start building data applications today\n", + "\n", + "**START YOUR FREE TRIAL**\n", + "\n", + "To learn more, visit us at: **dbricks.co/federal**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Data-AI-in-Fed-Gov-Ebook.pdf2024-09-19T16:57:19Z
**eBook**\n", + "\n", + "# Cybersecurity in Financial Services\n", + "\n", + "### Protecting financial institutions with advanced analytics and AI\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "The State of the Industry .................................................................................................................................................................................... **03**\n", + "\n", + "A New Commitment to Cybersecurity ....................................................................................................................................................... **04**\n", + "\n", + "The Biggest Challenge With Security Analytics ..................................................................................................................................... **05**\n", + "\n", + "Journey of SecOps: Destination Lakehouse ............................................................................................................................................ **06**\n", + "\n", + "Rethinking Cybersecurity in Financial Services With Databricks Lakehouse ......................................................................... **07**\n", + "\n", + "Lakehouse in Financial Services ..................................................................................................................................................................... **08**\n", + "\n", + "Lakehouse and SIEM: The Pattern for Cloud-Scale Security Operations .................................................................................. **12**\n", + "\n", + "Common Use Cases ................................................................................................................................................................................................ **14**\n", + "\n", + "Getting Started With Databricks for Cybersecurity ............................................................................................................................. **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "## The State of the Industry\n", + "\n", + "\n", + "Cloud, cost and complexity of customer data and cybersecurity are\n", + "top of mind for every financial services security leader today. As\n", + "financial services institutions (FSIs) continue to accelerate their digital\n", + "transformation, cybercriminals, fraudsters and state-sponsored actors\n", + "continue with more sophisticated threats. The impact of these attacks\n", + "ranges from the exposure of highly sensitive data to the disruption\n", + "of services and the exploitation of backdoors for future attacks — all\n", + "resulting in both financial and non-financial costs. 
Responding quickly\n", + "to potential threats requires security tools capable of analyzing billions\n", + "of threat signals in real-time.\n", + "\n", + "Recently, it seems like every week reveals a new data breach or ransomware assault,\n", + "and the cost is skyrocketing: more than $4 million per incident, up 10 percent from\n", + "2020, and about $401 million for a substantial [breach at a large corporation](https://www.ibm.com/security/data-breach) .\n", + "\n", + "\n", + "**Cybersecurity is no longer just a back-office cost and now**\n", + "**poses critical business risks, such as:**\n", + "\n", + "**•** Operational disruption\n", + "\n", + "**•** Material customer loss\n", + "\n", + "**•** Increase in insurance premiums\n", + "\n", + "**•** Lawsuits or fines\n", + "\n", + "**•** Systemic destabilization\n", + "\n", + "**•** Credit downgrade\n", + "\n", + "**•** Reputational damage\n", + "\n", + "Source: Navigating Cyber 2022, FS-ISAC, Annual Cyber Threat Review and Predictions\n", + "\n", + "\n", + "-----\n", + "\n", + "## A New Commitment to Cybersecurity\n", + "\n", + "\n", + "It comes as no surprise that in recent years FSIs have seen an amplified\n", + "commitment to cybersecurity. As business leaders look to new solutions, large\n", + "portions of IT budgets are now devoted to leveraging data and AI to thwart\n", + "cyberattacks.\n", + "\n", + "Furthermore, regulators are taking notice of the increased risk of cybersecurity\n", + "threats. Growing geopolitical tensions have also prompted federal agencies such\n", + "as the Cybersecurity and Infrastructure Security Agency and the Federal Bureau\n", + "of Investigation [to warn](https://www.wsj.com/livecoverage/russia-ukraine-latest-news-2022-04-05/card/banks-haven-t-seen-rise-in-cyberattacks-from-russia-yet-p3F5ebzAhTauVjsNx46E) that “tough sanctions imposed on Russia could prompt a\n", + "spate of cyberattacks against critical infrastructure such as banks.” Additionally,\n", + "the Securities and Exchange Commission released its [2022 Exam Priorities](https://www.sec.gov/news/press-release/2022-57) , which\n", + "include information security, and specifically “how firms are safeguarding their\n", + "customers’ records and assets from cyber threats, including oversight of thirdparty providers, identification of red flags related to identity theft, response to\n", + "incidents, including to ransomware attacks and management of operational risk in\n", + "light of ‘a dispersed workforce.’”\n", + "\n", + "However, as is often the case, implementing new cybersecurity strategies and\n", + "processes is easier said than done.\n", + "\n", + "\n", + "**Cybersecurity needs a transformation**\n", + "**... breaches, cost and complexity are growing**\n", + "\n", + "\n", + "## 100%\n", + "of organizations surveyed have had\n", + "breaches.\n", + "**The average breach costs $4M**\n", + "\n", + "## 85%\n", + "**will increase their cyber budget**\n", + "next FY. Cybersecurity industry will\n", + "grow to $366B by ‘28\n", + "\n", + "\n", + "## 67%\n", + "of organizations were **breached at**\n", + "**least three times** . 
A mega breach\n", + "costs $401M.\n", + "\n", + "**Cost, Complexity, Cloud**\n", + "\n", + "- \u0007Hundreds of tools with expanding\n", + "footprints\n", + "\n", + "- \u0007Data locked in vendor proprietary\n", + "tools\n", + "\n", + "- \u0007Humans compensating for\n", + "analytical and integration\n", + "deficiencies\n", + "\n", + "\n", + "In this eBook, we’ll take a closer look at the challenges associated with replacing\n", + "the infrastructure of a legacy data analytics system, and how financial institutions\n", + "are solving them with Databricks.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Biggest Challenge With Security Analytics\n", + "\n", + "\n", + "For many FSIs, on-premises security incident and event management (SIEM)\n", + "technologies have been the go-to solution for threat detection, analysis and\n", + "investigations. However, these legacy technologies were built for a world where big\n", + "data was measured in gigabytes, not today’s terabytes or petabytes. This means\n", + "that not only are legacy SIEMs unable to scale to today’s data volumes, but they\n", + "are also unable to serve the modern, distributed enterprise.\n", + "\n", + "By now, the advantages of moving to the cloud are no secret to anyone. For FSIs,\n", + "scalability, simplicity, efficiency and cost are absolutely essential components of\n", + "success. Many within FinServ are looking to cloud computing to make this possible,\n", + "adding detection and response in the cloud to the security team’s responsibility.\n", + "\n", + "Because legacy SIEMs predate the emergence of cloud, artificial intelligence and\n", + "machine learning (AI/ML) in the mainstream, they’re unable to address the complex\n", + "data and AI-driven analytics needed for threat detection, threat hunting, in-stream\n", + "threat intelligence enrichment, analytical automation and analyst collaboration.\n", + "\n", + "In other words, legacy SIEMs are no longer suitable for the modern enterprise or\n", + "the current threat landscape.\n", + "\n", + "\n", + "**Counting the Financial Cost of Legacy SIEMs**\n", + "\n", + "The financial cost of the continued use of legacy SIEMs continues to rise because\n", + "most SIEM providers charge their customers based on the volume of data\n", + "ingested. While some legacy technologies are available in the cloud, they’re either\n", + "not designed to be cloud-native applications or confined to a single cloud service\n", + "provider. As a result, security teams have to employ multiple tools for detection,\n", + "investigation and response — or pay exorbitant egress charges for data transiting\n", + "from one cloud provider to another. This causes operational slowdowns, errors\n", + "driven by complexity, and inconsistent implementation of security policies.\n", + "\n", + "A lack of support for multiple clouds also means an increase in maintenance\n", + "overhead. Security staff members are often stressed because analysts have to\n", + "learn different tools for different cloud platforms. For some, it also creates an\n", + "implicit cloud vendor lock-in, meaning that security teams are unable to support\n", + "missions because their tools are not portable across multiple cloud providers.\n", + "\n", + "Collectively, these drawbacks to legacy SIEMs result in a much weaker security\n", + "posture for FSIs.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Journey of SecOps: Destination Lakehouse\n", + "\n", + "How did security analytics get to this point? 
In the early days, there was a need to aggregate alerts from antiviruses and intrusion detection systems. SIEMs were born, built on data warehouses, relational databases or NoSQL database management systems. But as incident investigation needs evolved, those data warehouses weren’t able to handle the volume and variety of data, which led to the development of data lakes. Data lakes were cost-effective and scalable but didn’t have strong data governance and data hygiene, earning them the moniker of “data swamps.” Simply integrating the two tech stacks is really complicated because of varying governance models, data silos and inconsistent use case support. Fast-forward to today, security teams now need AI/ML at scale in a multicloud world.

Why choose one or the other? The lakehouse architecture has emerged in recent years to help address these concerns with a single unified architecture for all your threat data, analytics and AI in the cloud. The governance and transactional capabilities of the data warehouse, the scale and flexibility of a data lake, AI/ML from the ground up and multicloud native deployments in one platform – this is a modern architecture called the lakehouse (data lake and data warehouse).

[Figure — Current Challenges: cloud storage (no support for analytics or investigations), SIEMs (no attack chaining; poor for high-cardinality search), UBA tools (no historical search, blackbox, proprietary storage), and no SIEM/log solution is multicloud native. Introducing the Data Lakehouse: curated alerts, cloud-scale search, ML/AI and multicloud.]

-----

## Rethinking Cybersecurity in Financial Services With Databricks Lakehouse

Databricks introduced the first data lakehouse platform to the industry, and today over 7,000 customers use it worldwide. With Databricks Lakehouse, FSIs that are ready to modernize their data infrastructure and analytics capabilities for better protection against cyber threats now have one cost-effective solution that addresses the needs of all their teams.

The Databricks Lakehouse Platform combines the best elements of data lakes and data warehouses, delivering the low-cost, flexible object stores offered by data lakes and the data management and performance typically found in data warehouses. This unified platform simplifies existing architecture by eliminating the data silos that traditionally separate analytics, data science and ML. It’s built on open source, open data and open standards to maximize flexibility, and its inherent collaborative capabilities accelerate the ability to work across teams and innovate faster. Moreover, because it’s multicloud, it works the same way no matter which cloud provider is used.

[Figure — ETL and enrichment of security sources (Proof Point, firewall, antivirus) into the Lakehouse.]

-----

## Lakehouse in Financial Services

By unifying data with analytics and AI, Lakehouse allows FSIs to easily access all their data for downstream advanced analytics capabilities to support complex security use cases. 
Lakehouse facilitates collaboration between threat intelligence teams and cyber operations, enables security operations teams to detect advanced threats, and reduces human resource burnout through analytical automation and collaboration. Importantly, Lakehouse also accelerates investigations from days to minutes.

Along with a more modern architecture, the Lakehouse Platform includes Delta Lake, which unifies all security data in a transactional data lake to feed advanced analytics. The analytics and collaboration are done in notebooks, and security teams can use multiple languages — SQL, Python, R and Scala — in the same notebook. This makes it easy for security practitioners to explore data and develop advanced analytics and reporting using their favorite methods. Additionally, a separation of compute from storage means performance at scale without impacting overall storage costs.

-----

**C A S E S T U D Y**

**When It Comes to Security, Data Is the Best Defense***

**Protecting HSBC’s 40 million customers begins with collecting and processing data from billions of signals to make previously impossible threat detection possible**

The old way of thinking about security — stronger locks, higher walls — is outdated and ineffective. “When defending an organization, too often we just focus heavily on tools, technology, and reactive scenarios,” said T.J. Campana, managing director of global defense and chief technology officer at HSBC, the multinational bank. “But the security business is a data business. And the data always has a story to tell us.”

The quality of security, he added, is proportional to the information that can be distilled from petabytes of data that endlessly flows through company networks. That means “empowering people to get the right insights, in the right way to quickly prevent, detect, and respond to threats, wherever and whenever they occur,” said George Webster, executive director of global cybersecurity science and analytics at HSBC.

If a big organization is made up of tens of millions of parts that must click together seamlessly, security keeps those seals tight. Data gathering, analytical tools, and human intellect work together as one. This involves fusing the data science and security operation departments, creating an enhanced relationship that results in better defenses, insight into the security posture of the organization, and the ability to respond at the pace of the adversary.

But working across years of data at petabyte scale is not an easy task, especially when a long time is measured in minutes and the adversary is constantly working against you. To put this in perspective, the security teams at HSBC intake 10 times the amount of data contained in all of the books in the U.S. Library of Congress every day, and must process months, if not years, of data at a time. That is where innovative design, smart people, and leveraging the right technology come into play. “We have to break the paradigm of the tool being the end goal of defense and instead view the tools as an enabler of our people,” said Webster. 
“It is always\n", + "about the people,” added Campana.\n", + "\n", + "HSBC turned away from the common security paradigm by leveraging the big data\n", + "processing techniques from Azure Databricks. In many ways, their open source\n", + "Delta Lake is the key enabler, with Spark being the engine. Delta Lake allows these\n", + "teams to structure, optimize, and unlock data at scale, while Spark allows multiple\n", + "complex programs to seamlessly crunch through the data. This enables HSBC’s\n", + "security teams to constantly evolve their defenses, create new capabilities at\n", + "pace, and perform investigations that were previously impossible. When a new\n", + "threat emerges, the bank doesn’t have the luxury to wait for the security market to\n", + "identify, respond, and mitigate. Instead, the bank turns to its people and creates\n", + "what is needed at breathtaking speed.\n", + "\n", + "\n", + "-----\n", + "\n", + "**C A S E S T U D Y : C O N T I N U E D**\n", + "\n", + "\n", + "It’s an essential function for HSBC, which needs to continually think about how to\n", + "keep more than 40 million customers in 64 countries and territories safe. Taken\n", + "together, it’s an all-brains-on-deck moment with data and people guiding the\n", + "ship. It’s also a tall task for a company as massive and multifaceted as HSBC.\n", + "Headquartered in the UK, it is one of the largest global banks (total assets: a\n", + "whopping $2.968 trillion), with operations across Africa, Europe, Asia, and the\n", + "Americas. It’s also the largest bank in Hong Kong and even prints some of the local\n", + "currency, which bears the HSBC name.\n", + "\n", + "The bank’s cybersecurity approach involves fusing the data science and security\n", + "operation departments, creating an enhanced relationship that results in more\n", + "efficient threat discovery, rapid development of operational use cases and AI\n", + "models. This enables the continuous creation of capabilities that stop adversaries\n", + "before they even start. “We have to get out of the mindset that security is a walled\n", + "garden,” said Webster. “We must create truly collaborative environments for our\n", + "people to enable the business to operate,” said Campana.\n", + "\n", + "Staffing this symbiotic power center will be someone Campana optimistically calls\n", + "“the analyst of the future,” a description that’s both mindset and skillset: threat\n", + "hunter and data scientist.\n", + "\n", + "In addition, when another organization is hit by cybercrime, HSBC analyzes it\n", + "to understand how it may have responded and then improves its defenses\n", + "accordingly. That’s in contrast to the industry norm; a Ponemon survey revealed\n", + "\n", + "\n", + "that 47 percent of organizations have not assessed the readiness of their incident\n", + "response teams. That means the first time they test their plans will be at the worst\n", + "possible time — in the middle of a cyber attack.\n", + "\n", + "The proactive approach is a far cry from the old reactive conveyor belt model of\n", + "security when alert tickets were received from tooling and processed in a slow\n", + "and linear way. Today, cross-disciplinary security teams don’t just react; they\n", + "continually search for the signals in the noise — tiny aberrations that indicate\n", + "something’s not right – and send up red flags in real-time. “We’re scanning\n", + "hundreds of billions of signals per day. I cannot wait. 
We need situational\n", + "awareness right now,” said Campana.\n", + "\n", + "That increased speed is critical for threat assessment. Information theft may be\n", + "the most expensive and fastest-rising consequence of cybercrime, but data is not\n", + "the only target. Core systems are being hacked in a dangerous trend to disrupt\n", + "and destroy. Regulators are also increasingly asking banks for controls in place to\n", + "detect and preempt financial crimes. That’s where big data tooling like Delta Lake\n", + "and Spark shine, and where it will continually be called on to address the security\n", + "needs of new initiatives.\n", + "\n", + "“Digital security is about organically adjusting to risks,” said Webster. “It’s a journey\n", + "of continual discovery with one central goal: to protect customers. They want\n", + "things easy and they want them quick. It’s our job to make sure that it’s secure.”\n", + "\n", + "*This story previously appeared in [WIRED Brand Lab for Databricks](https://www.wired.com/sponsored/story/when-it-comes-to-security-data-is-the-best-defense/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Advantages of a Lakehouse**\n", + "\n", + "\n", + "**A cost-efficient upgrade**\n", + "\n", + "Databricks customers only pay for the data they\n", + "analyze, not for what they collect. This means that\n", + "security teams can collect any amount of data\n", + "without worrying about ingest-based pricing, and\n", + "only pay for the data that’s actually used for analysis\n", + "— for example, an incident investigation or a data\n", + "call for an audit. This pricing model enables security\n", + "teams to collect data that was previously out of\n", + "reach, such as netflow data, endpoint detection and\n", + "response data, and application and services data.\n", + "\n", + "Further, Databricks is a fully managed service,\n", + "meaning that security teams don’t have to\n", + "pre-commit to hardware capital expenditures.\n", + "With no hardware to manage and no big data\n", + "implementations to maintain, security teams\n", + "can significantly reduce their management and\n", + "maintenance costs.\n", + "\n", + "\n", + "**Multicloud**\n", + "\n", + "Databricks is cloud-native on AWS, Microsoft Azure\n", + "and Google Cloud. This creates freedom for the\n", + "security teams to use whatever cloud provider they\n", + "like. Additionally, teams can acquire and maintain\n", + "operational consistency across all providers when\n", + "they have multiple cloud footprints. This enables\n", + "consistent policy implementation, reduced\n", + "complexity for staff and increased efficiency.\n", + "\n", + "Additionally, Databricks enables faster detection,\n", + "investigation and response across the enterprise\n", + "because analytics can be reused across the\n", + "major cloud providers through a unified platform\n", + "that centralizes data for easy sharing and fosters\n", + "collaboration across teams.\n", + "\n", + "\n", + "**Enterprise security and**\n", + "**360° risk management**\n", + "\n", + "The Lakehouse Platform is easy to set up, manage,\n", + "scale and, most importantly, secure. This is because\n", + "Lakehouse easily integrates with existing security\n", + "and management tools, enabling users to extend\n", + "their policies for peace of mind and greater control.\n", + "\n", + "With multicloud management, security admins and\n", + "data teams get a consistent experience across all\n", + "major cloud providers. 
This saves valuable time\n", + "and the resources required to upskill talent on\n", + "proprietary services for data, analytics and AI.\n", + "\n", + "Security, risk and compliance leaders are also\n", + "able to give team members a range of security\n", + "permissions that come with thorough audit trails.\n", + "This allows teams to quickly spin up and wind down\n", + "collaborative workspaces for any project and to\n", + "manage use cases from end to end — from enabling\n", + "user access and controlling spend to auditing usage\n", + "and analyzing activity across every workspace to\n", + "enforce user and data governance.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Lakehouse and SIEM: The Pattern for Cloud-Scale Security Operations\n", + "\n", + "\n", + "According to George Webster, head of cybersecurity sciences and analytics at\n", + "HSBC, Lakehouse and SIEM is the pattern for security operations. What does\n", + "it look like? It leverages the strengths of the two components: Lakehouse for\n", + "multicloud native storage and analytics, SIEM for security operations workflows.\n", + "For Databricks customers like HSBC, there are two general patterns for this\n", + "integration that are both underpinned by what Webster calls the cybersecurity\n", + "data lake with Lakehouse.\n", + "\n", + "In the first pattern, Lakehouse stores all the data for the maximum retention\n", + "period. A subset of the data is then sent to the SIEM and stored for a fraction of\n", + "the time. This pattern has the advantage of allowing analysts to query near-term\n", + "\n", + "\n", + "data using the SIEM while having the ability to do historical analysis and more\n", + "sophisticated analytics in Databricks. It also lets them manage any licensing or\n", + "storage costs for the SIEM deployment.\n", + "\n", + "The second pattern is to send the highest-volume data sources to Databricks —\n", + "for example, cloud-native logs, endpoint threat detection and response logs, DNS\n", + "data and network events. Low-volume data sources such as alerts, e-mail logs\n", + "and vulnerability scan data go to the SIEM. This pattern enables Tier 1 analysts to\n", + "quickly handle high-priority alerts in the SIEM. Threat-hunt teams and investigators\n", + "can leverage the advanced analytical capabilities of Databricks. This pattern has a\n", + "cost-benefit of offloading processing, ingestion and storage from the SIEM.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks and Splunk:**\n", + "**A Case Study in Cost-Savings**\n", + "\n", + "Databricks integrates with your preferred SIEM, like\n", + "Splunk, and the Splunk-certified Databricks add-on\n", + "can be used to meet SOC needs without changing\n", + "the user interface. This example features a global\n", + "financial institution’s security operation, where\n", + "the organization grew throughput from 25TB per\n", + "day with only 180 days lookback, to 100TB per day\n", + "with 395 days lookback using the Databricks SIEM\n", + "augmentation. 
The total cost of ownership savings, including infrastructure and license costs, amounted to tens of millions of dollars (more than $80 million per year) in cloud costs.

##### FinServ Security Operations

Chart: Databricks + Splunk drastically lowered costs. Comparing the current state (Splunk only) with the future option (Splunk + Databricks), throughput grows from 25TB to 100TB per day and the lookback period from 180 to 395 days. TCO savings with Splunk and Databricks vs. a Splunk-only solution: $81M.

-----

## Common Use Cases

As FSIs focus on modernizing their data analytics and warehousing capabilities, the Databricks Lakehouse Platform brings a new level of empowerment, allowing them to unlock the full potential of their data to deliver on their objectives and better serve their customers.

**Common use cases include:**

**•** **Threat hunting:** Empower security teams to proactively detect and discover advanced threats using months or years of data

**•** **Incident investigation:** Gain complete visibility across network, endpoint, cloud and application data to respond to incidents

**•** **Phishing threat detection:** Uncover social engineering attacks that are often used to steal user data, including log-in credentials and credit card numbers

**•** **Supply chain monitoring:** Leverage ML to identify suspicious behavior within your software supply chain

**•** **Ransomware detection:** Scope the impact and spread of ransomware attacks to inform complete mitigation and remediation

**•** **Credential-abuse detection:** Identify and investigate anomalous credential usage across your infrastructure

**•** **Insider-threat detection:** Find and respond to malicious threats from people within an organization who have inside information about security practices, data and computer systems

**•** **Network traffic analysis:** Examine real-time network availability and activity to identify anomalies, vulnerabilities and malware

**•** **Analytics automation:** Automatically contextualize and enrich multiple streaming and batch analytics to accelerate analyst workflows and decision-making

**•** **Augmenting anti-money laundering (AML) practices:** Use structured and unstructured data to maintain a list of politically exposed persons (PEP) to augment a bank’s AML processes. This includes pulling data from external sources (keeping the PEP list up to date, including out-of-country officials and diplomats) as well as internal ones (including critical personnel, network admins, etc.) 
who\n", + "need extra scrutiny.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started With Databricks for Cybersecurity\n", + "\n", + "Getting up and running on Databricks to address your cybersecurity needs is easy with our Solution\n", + "Accelerators. Databricks Solution Accelerators are highly optimized, fully functional analytics solutions that\n", + "provide customers with a fast start to solving their data problems.\n", + "\n", + "**•** [Cybersecurity analytics and AI at scale with Splunk and Databricks](https://databricks.com/solutions/accelerators/cybersecurity-analytics-and-ai) : Rapidly detect threats,\n", + "investigate the impact and reduce risks with the Databricks add-on for Splunk\n", + "\n", + "**•** [Threat detection at scale with DNS analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html) : Recognize cybercriminals using DNS,\n", + "threat intelligence feeds and ML\n", + "\n", + "Databricks Solution Accelerators are free. Join the hundreds of Databricks customers using Solution\n", + "Accelerators to drive better outcomes in their businesses.\n", + "\n", + "If you’d like to learn more about how we are helping financial services institutions securely leverage data and AI,\n", + "please visit us at [dbricks.co/fiserv](https://databricks.com/solutions/industries/financial-services) or reach out to us at [cybersecurity@databricks.com](mailto:cybersecurity%40databricks.com?subject=) .\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including\n", + "\n", + "Comcast, Condé Nast, Acosta and over 40% of the Fortune 500 — rely on the Databricks\n", + "\n", + "Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San\n", + "\n", + "Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™\n", + "\n", + "Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "#### Get started with a free trial of Databricks and start building data applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n", + "\n", + "###### To learn more, visit us at:\n", + " dbricks.com/fiserv\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-eBook-finServ-cyber.pdf2024-09-19T16:57:20Z
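As an illustration of the first Lakehouse-and-SIEM pattern described above (full retention in Delta, with only a recent high-priority subset forwarded to the SIEM), here is a minimal PySpark sketch. The table name, column names, severity values and export path are hypothetical placeholders, not part of the eBook, and the actual hand-off mechanism would depend on your SIEM (for example, the Databricks add-on for Splunk).

```python
# Minimal sketch of the "Lakehouse + SIEM" integration pattern (pattern 1).
# Assumptions (not from the eBook): a Delta table `security_logs.raw_events`
# with `event_time` and `severity` columns, and a cloud path the SIEM ingests
# from. Adjust names and the hand-off mechanism to your own environment.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()  # provided automatically in Databricks notebooks

# Full history stays in the lakehouse for long-retention analytics and threat hunting.
all_events = spark.table("security_logs.raw_events")

# Only a near-term, high-priority slice is forwarded, keeping SIEM license and storage costs down.
recent_high_priority = (
    all_events
    .where(F.col("event_time") >= F.date_sub(F.current_date(), 30))
    .where(F.col("severity").isin("high", "critical"))
)

# One simple hand-off option: write the subset as JSON files for the SIEM to pick up.
(
    recent_high_priority
    .write.mode("overwrite")
    .json("s3://example-bucket/siem-export/high-priority/")
)
```

Under the second pattern, the routing would be inverted: high-volume sources stay in Delta for threat hunting and investigation, while only low-volume sources such as alerts, e-mail logs and vulnerability scan data are sent to the SIEM.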
**EBOOK**\n", + "\n", + "## Why the Data Lakehouse Is Your Next Data Warehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "Preface .......................................................................................................................................................................................................................................... **3**\n", + "\n", + "Introduction ............................................................................................................................................................................................................................. **4**\n", + "\n", + "Our Approach: The Databricks Lakehouse Platform ................................................................................................................................... **5**\n", + "\n", + "Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse ...................................................................................... **6**\n", + "\n", + "Why Databricks SQL? ............................................................................................................................................................................................... 6\n", + "\n", + "Common use cases .................................................................................................................................................................................................... 7\n", + "\n", + "The Inner Workings of the Lakehouse ................................................................................................................................................................... **8**\n", + "\n", + "**PA R T 1 :** Storage layer .............................................................................................................................................................................................. 8\n", + "\n", + "**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13\n", + "\n", + "**PA R T 3 :** Consumption layer ................................................................................................................................................................................ 19\n", + "\n", + "Conclusion ............................................................................................................................................................................................................................. **24**\n", + "\n", + "Customer Stories ............................................................................................................................................................................................................... **25**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Preface\n", + "\n", + "Historically, data teams have had to resort to a bifurcated architecture to run traditional\n", + "BI and analytics workloads, copying subsets of the data already stored in their data lake\n", + "to a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\n", + "governance inherent in proprietary architectures.\n", + "\n", + "Our customers have asked us to simplify their data architecture. 
We decided to accelerate\n", + "our investments to do just that.\n", + "\n", + "\n", + "We introduced [Databricks SQL](https://databricks.com/product/databricks-sql) to simplify and provide data warehousing capabilities and\n", + "first-class support for SQL on the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) , for all your existing tools.\n", + "We use the term “lakehouse” to reflect our customers’ desire to combine the best of data\n", + "warehouses and data lakes. With the lakehouse, you can now establish one source of truth\n", + "for all data and enable all workloads from AI to BI on one platform. And we want to provide\n", + "you with ease-of-use and state-of-the-art performance at the lowest cost.\n", + "\n", + "\n", + "**Reynold Xin**\n", + "\n", + "Original Creator of Apache Spark, TM\n", + "Co-founder and Chief Architect,\n", + "Databricks\n", + "\n", + "\n", + "This eBook covers how we went back to the drawing board to build Databricks SQL — the\n", + "last mile of enabling data warehousing capabilities for your existing data lakes — as part of\n", + "the Databricks Lakehouse Platform.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "Most organizations operate their business with a complex data architecture that\n", + "combines data warehouses and data lakes. For one thing, data lakes are great\n", + "for machine learning (ML). They support open formats and a large ecosystem.\n", + "But data lakes have poor support for business intelligence (BI) and suffer\n", + "complex data quality problems. Data warehouses, on the other hand, are great\n", + "for BI applications. But they have limited support for ML workloads, can’t handle\n", + "natural language data, large-scale structured data, or raw, video, audio or image\n", + "files, and are proprietary systems with only a SQL interface.\n", + "\n", + "As a result, data is moved around the organization through data pipelines and\n", + "systems that create a multitude of data silos. A large amount of time is spent\n", + "maintaining these pipelines and systems rather than creating new value from\n", + "data, and downstream consumers struggle to get a single source of truth of the\n", + "data due to the inherent siloing of data that takes place. The situation becomes\n", + "very expensive, and decision-making speed and quality are negatively affected.\n", + "\n", + "Unifying these systems can be transformational in how we think about data.\n", + "\n", + "\n", + "##### The need for simplification\n", + "\n", + "It is time for a new data architecture that can meet both today’s and tomorrow’s\n", + "needs. Without any compromise. Advanced analytics and ML are one of the\n", + "most strategic priorities for data-driven organizations today, and the amount\n", + "of unstructured data is growing exponentially. So it makes sense to position\n", + "the data lake as the center of the data infrastructure. 
However, for this to be\n", + "achievable, the data lake needs to adopt the strengths of data warehouses.\n", + "\n", + "The answer is the [lakehouse](https://databricks.com/blog/2021/05/19/evolution-to-the-data-lakehouse.html) , an open data architecture enabled by a new open\n", + "and standardized system design: one that implements data structure and data\n", + "management features similar to those in a data warehouse, directly on the lowcost storage used for data lakes.\n", + "\n", + "**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**\n", + "\n", + "##### Building the Data Lakehouse\n", + "[Bill Immon, Father of the Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)\n", + "\n", + "\n", + "-----\n", + "\n", + "### Our Approach: The Databricks Lakehouse Platform\n", + "\n", + "Our customers have asked us for simplification. This is why we’ve embarked on\n", + "this journey to deliver one simple, open and collaborative platform for all your\n", + "data, AI and BI workloads on your existing data lakes.\n", + "\n", + "The [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) greatly simplifies data architectures by\n", + "combining the data management and performance typically found in data\n", + "warehouses with the low-cost, flexible object stores offered by data lakes.\n", + "\n", + "It’s built on open source and open standards to maximize flexibility, and lets you\n", + "store all your data — structured, semi-structured and unstructured — in your\n", + "existing data lake while still getting the data quality, performance, security and\n", + "governance you’d expect from a data warehouse. Data only needs to exist once\n", + "to support all of your data, AI and BI workloads on one common platform\n", + "— establishing one source of truth.\n", + "\n", + "Finally, the Lakehouse Platform provides tailored and collaborative\n", + "experiences so data engineers, data scientists and analysts can work together\n", + "on one common platform across the entire data lifecycle — from ingestion to\n", + "consumption and the serving of data products — and innovate faster.\n", + "\n", + "Let’s look at how, with the right data structures and data management\n", + "capabilities in place, we can now deliver data warehouse and analytics\n", + "capabilities on your lakehouse. That’s where Databricks SQL (DB SQL) comes in.\n", + "\n", + "**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n", + "\n", + "\n", + "Databricks SQL is a serverless data warehouse on the Databricks Lakehouse\n", + "Platform that lets you run all your SQL and BI applications at scale with up to 12x\n", + "better price/performance, a unified governance model, open formats and APIs,\n", + "and your tools of choice — no vendor lock-in. 
Reduce resource management\n", + "overhead with serverless compute, and easily ingest, transform and query\n", + "all your data in place to deliver real-time business insights faster. In fact, DB\n", + "SQL now holds the new world record in 100TB TPC-DS, the gold standard\n", + "performance benchmark for data warehousing.\n", + "\n", + "Built on open standards and APIs, the lakehouse provides an open, simplified and\n", + "multicloud architecture that brings the best of data warehousing and data lakes\n", + "together, and integrations with a rich ecosystem for maximum flexibility.\n", + "\n", + "\n", + "##### Why Databricks SQL?\n", + "\n", + "Best Price/Performance\n", + "Lower costs, get world-class performance, and eliminate the need to manage,\n", + "configure or scale cloud infrastructure with serverless.\n", + "\n", + "Built-In Governance\n", + "Establish one single copy for all your data using open standards, and one unified\n", + "governance layer across all data teams using standard SQL.\n", + "\n", + "Rich Ecosystem\n", + "Use SQL and any tool like Fivetran, dbt, Power BI or Tableau along with Databricks\n", + "to ingest, transform and query all your data in place.\n", + "\n", + "Break Down Silos\n", + "Empower every analyst to access the latest data faster for downstream real-time\n", + "analytics, and go effortlessly from BI to ML.\n", + "\n", + "**[WATCH A DEMO](https://databricks.com/discover/demos/databricks-sql)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Common use cases\n", + "\n", + "Thousands of customers like [Atlassian](https://www.google.com/search?q=atlassian+databricks+keynote&oq=atlassian+databricks+keynote&aqs=chrome..69i57j69i60j69i65l3j69i60j69i64l2.6409j0j1&sourceid=chrome&ie=UTF-8#:~:text=12%3A26,May%2026%2C%202021) , [SEGA](https://youtu.be/SzeXHcwPDSE) and [Punchh](https://databricks.com/customers-4/punchh) are using Databricks SQL to enable self-served analytics\n", + "for hundreds of analysts across their organizations, and to build custom data applications to better serve their\n", + "customers. Below are some examples of use cases for Databricks SQL.\n", + "\n", + "**At Atlassian, we have proven**\n", + "\n", + "\n", + "**Query data lake data with** **Collaboratively explore** **Build rich and custom**\n", + "**your BI tools of choice** **the freshest data** **data applications**\n", + "\n", + "\n", + "**that there is no longer a need**\n", + "\n", + "**for two separate data things.**\n", + "\n", + "**Technology has advanced**\n", + "\n", + "**far enough for us to consider**\n", + "\n", + "**one single unified lakehouse**\n", + "\n", + "**architecture.**\n", + "\n", + "**Rohan Dhupelia**\n", + "Data Platform Senior Manager,\n", + "Atlassian\n", + "\n", + "\n", + "Enable business analysts to\n", + "directly query data lake data\n", + "using their favorite BI tool and\n", + "avoid data silos. Reengineered\n", + "and optimized connectors\n", + "ensure fast performance,\n", + "low latency and high user\n", + "concurrency to your data lake.\n", + "Now analysts can use the best\n", + "tool for the job on one single\n", + "source of truth for your data.\n", + "\n", + "\n", + "Empower every analyst and SQL\n", + "professional in your organization\n", + "to quickly find and share new\n", + "insights by providing them with\n", + "a collaborative and self-served\n", + "analytics experience. 
Confidently\n", + "manage data permissions with\n", + "fine-grained governance, share and\n", + "reuse queries, and quickly analyze\n", + "and share results using interactive\n", + "visualizations and dashboards.\n", + "\n", + "\n", + "Build more effective and\n", + "tailored data applications\n", + "for your own organization or\n", + "your customers. Benefit from\n", + "the ease of connectivity,\n", + "management and better price/\n", + "performance of DB SQL to\n", + "simplify development of dataenhanced applications at scale,\n", + "all served from your data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "### The Inner Workings of the Lakehouse\n", + "\n", + "\n", + "In the next chapter, we’ll unpack the three foundational layers of the Databricks\n", + "Lakehouse Platform and how we went back to the drawing board to build this\n", + "experience. Specifically, we’ll dive into how we built Databricks SQL to deliver\n", + "analytics and data warehousing workloads on your lakehouse.\n", + "\n", + "\n", + "Those layers are:\n", + "\n", + "**1 .** The storage layer, or how we store and govern data\n", + "\n", + "**2 .** The compute layer, or how we process queries\n", + "\n", + "**3 .** The consumption layer, or the tools you can use to interface with the system\n", + "\n", + "\n", + "###### PART 1: STORAGE LAYER\n", + "\n", + "In order to bring the best of data lakes and data\n", + "warehouses, we needed to support the openness\n", + "and flexibility of data lakes, as well as the quality,\n", + "performance and governance you’d expect from a\n", + "data warehouse.\n", + "\n", + "\n", + "**Storage layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake Open format|Data Warehouse Closed, proprietary format|Data Lakehouse Open format|\n", + "|---|---|---|\n", + "|Low quality, “data swamp”|High-quality, reliable data|High-quality, reliable data|\n", + "|File-level access control|Fine-grained governance (tables row/columnar level)|Fine-grained governance (tables row/columnar level)|\n", + "|All data types|Structured only|All data types|\n", + "|Requires manually specifying how to lay out data|Automatically lays out data to query efficiently|Automatically lays out data to query efficiently|\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Transactional guarantees for your data lake\n", + "\n", + "\n", + "The open source format [Delta Lake](https://delta.io/) — based on Parquet — solves historical data\n", + "lake challenges around data quality and reliability. It is the foundation for the\n", + "lakehouse, and Databricks SQL stores and processes data using Delta Lake.\n", + "\n", + "For example, it provides ACID transactions to ensure that every operation either\n", + "fully succeeds or fully aborts for later retries — without requiring new data\n", + "pipelines to be created. It unifies batch and streaming pipelines so you can\n", + "easily merge existing and new data at the speed required for your business. With\n", + "Time Travel, Delta Lake automatically records all past transactions, so it’s easy\n", + "to access and use previous versions of your data for compliance needs or for\n", + "ML applications. Advanced indexing, caching and auto-tuning allow optimization\n", + "of Delta tables for the best query performance. 
Delta Lake also acts as the\n", + "foundation for fine-grained, role-based access controls on the lakehouse.\n", + "\n", + "As a result, Delta Lake allows you to treat tables in Databricks SQL just like you\n", + "treat tables in a database: updates, inserts and merges can take place with high\n", + "performance at the row level. This is particularly useful if you are inserting new\n", + "\n", + "\n", + "data rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n", + "(e.g., for compliance laws such as GDPR). Furthermore, Delta Lake provides you\n", + "with one open and standard format — not only for SQL but also for Python, Scala\n", + "and other languages — so you can run all analytical and ML use cases on the\n", + "same data.\n", + "\n", + "**Delta Lake provides the key**\n", + "\n", + "An open format storage layer built for lake-first architecture\n", + "\n", + "ACID transactions, Time Travel, highly available\n", + "\n", + "Advanced indexing, caching, auto-tuning\n", + "\n", + "Fine-grained, role-based access controls\n", + "\n", + "Streaming & batch, analytics & ML\n", + "\n", + "Python, SQL, R, Scala\n", + "\n", + "Delta Lake brings data quality, performance and governance to the lakehouse\n", + "\n", + "**[DOWNLOAD NOW](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)**\n", + "##### Delta Lake: The Definitive Guide\n", + "[by O’Reilly](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)\n", + "\n", + "\n", + "-----\n", + "\n", + "##### A framework for building a curated data lake\n", + "\n", + "\n", + "With the ability to ingest petabytes of data with auto-evolving schemas, Delta\n", + "Lake helps turn raw data into actionable data by incrementally and efficiently\n", + "processing data as it arrives from files or streaming sources like Kafka, Kinesis,\n", + "Event Hubs, DBMS and NoSQL. It can also automatically and efficiently track data\n", + "as it arrives with no manual intervention, as well as infer schema, detect column\n", + "changes for structured and unstructured data formats, and prevent data loss by\n", + "rescuing data columns that don’t meet data quality specifications. And now with\n", + "[Partner Connect](https://www.databricks.com/partnerconnect) , it’s never been easier to bring in critical business data from\n", + "various sources.\n", + "\n", + "As you refine the data, you can add more structure to it. Databricks recommends\n", + "the Bronze, Silver and Gold pattern. It lets you easily merge and transform new\n", + "and existing data — in batch or streaming — while benefiting from the low-cost,\n", + "flexible object storage offered by data lakes. Bronze is the initial landing zone\n", + "for the pipeline. We recommend copying data that’s as close to its raw form as\n", + "possible to easily replay the whole pipeline from the beginning, if needed. Silver\n", + "is where the raw data gets cleansed (think data quality checks), transformed\n", + "and potentially enriched with external data sets. Gold is the production-grade\n", + "data that your entire company can rely on for business intelligence, descriptive\n", + "statistics, and data science/machine learning.\n", + "\n", + "\n", + "By the time you get to Gold, the tables are high-value business-level metrics\n", + "that have all the schema enforcement and constraints applied. 
This way, you can retain the flexibility of the data lake at the Bronze and Silver levels, and then use the Gold level for high-quality business data.

Diagram: Auto Loader, COPY INTO, partner integrations, Structured Streaming and batch sources feed the Bronze layer (raw ingestion and history), which is refined into the Silver layer (filtered, cleaned and augmented) and finally the Gold layer (business-level aggregates).

**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**

-----

##### An aside on batch and streaming data pipelines

The best way to set up and run data pipelines in the Bronze/Silver/Gold pattern recommended above is with Delta Live Tables (DLT). DLT makes it easy to build and manage reliable batch and streaming data pipelines that deliver high-quality data. It helps data engineering teams simplify ETL development and management with declarative pipeline development, automatic data testing, and deep visibility for monitoring and recovery.

The fact that you can run all your batch and streaming pipelines together in one simple, declarative framework makes data engineering easy on the Databricks Lakehouse Platform. We regularly talk to customers who have been able to reduce pipeline development time from weeks — or months — to mere minutes with Delta Live Tables. And even data analysts can easily interrogate DLT pipelines for the queries they need to run, without knowing any specialized programming language or niche skills.

One of the top benefits of DLT, and Delta Lake in general, is that it is built with streaming pipelines in mind. Today, the world operates in real time, and businesses are increasingly expected to analyze and respond to their data in real time. With streaming data pipelines built on DLT, analysts can easily access, query and analyze data with greater accuracy and actionability than with conventional batch processing. Delta Live Tables makes real-time analytics a reality for our customers.

-----

##### Fine-grained governance on the lakehouse

Delta Lake is the foundation for open and secure [data sharing](https://databricks.com/blog/2021/05/26/introducing-delta-sharing-an-open-protocol-for-secure-data-sharing.html) and governance on the lakehouse. It underpins the [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (in preview), which provides fine-grained governance across clouds, data and ML assets. 
Among the\n", + "benefits of the Unity Catalog, it allows you to:\n", + "\n", + "**• Discover, audit and govern data assets in one place:** A user-friendly\n", + "interface, automated data lineage across tables, columns, notebooks,\n", + "workflows and dashboards, role-based security policies, table or\n", + "column-level tags, and central auditing capabilities make it easy for\n", + "data stewards to discover, manage and secure data access to meet\n", + "compliance and privacy needs directly on the lakehouse.\n", + "\n", + "\n", + "\n", + "**• Grant and manage permissions using SQL:** Unity Catalog brings finegrained centralized governance to data assets across clouds through the\n", + "open standard SQL DCL. This means database administrators can easily\n", + "grant permission to arbitrary, user-specific views, or set permissions on\n", + "all columns tagged together, using familiar SQL.\n", + "\n", + "**• Centrally manage and audit shared data across organizations:** Every\n", + "organization needs to share data with customers, partners and suppliers\n", + "to better collaborate and to unlock value from their data. Unity Catalog\n", + "builds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\n", + "shared assets within and across organizations.\n", + "\n", + "\n", + "The Unity Catalog makes it easy for data stewards to discover, manage and secure data access\n", + "to meet compliance and privacy needs on the lakehouse.\n", + "\n", + "**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "###### PART 2: COMPUTE LAYER\n", + "\n", + "\n", + "The next layer to look at is the compute layer, or how we process queries.\n", + "\n", + "Apache Spark TM has been the de facto standard for data lake compute. It’s great\n", + "for processing terabytes and petabytes of data cheaply, but historically Spark\n", + "SQL uses a nonstandard syntax and can be difficult to configure.\n", + "\n", + "\n", + "Data warehouses, on the other hand, tend to support short running queries\n", + "really well, especially when you have a lot of users issuing queries concurrently.\n", + "They tend to be easier to set up, but don’t necessarily scale or they become\n", + "too costly.\n", + "\n", + "\n", + "**Compute layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake High performance for large jobs (TBs to PBs)|Data Warehouse High concurrency|Data Lakehouse High performance for large jobs (TBs to PBs)|\n", + "|---|---|---|\n", + "|Economical|Scaling is exponentially more expensive|Economical|\n", + "|High operational complexity|Ease of use|Ease of use|\n", + "||||\n", + "\n", + "\n", + "A popular belief is that large workloads require a drastically different system\n", + "than low latency, high concurrency workloads. For example, there’s the classic\n", + "trade-off in computer systems between latency and throughput.\n", + "\n", + "But after spending a lot of time analyzing these systems, we found that it was\n", + "possible to simultaneously improve large query performance and concurrency\n", + "\n", + "\n", + "and latency. Although the classic trade-offs definitely existed, they were only\n", + "explicit when we optimized the system to the very theoretical optimal. 
It turned\n", + "out the vast majority of software — and this includes all data warehouse systems\n", + "and Databricks — were far away from optimal.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Simplified administration and instant, elastic SQL compute — decoupled from storage\n", + "\n", + "\n", + "To achieve world-class performance for analytics on the lakehouse, we chose to\n", + "completely rebuild the compute layer. But performance isn’t everything. We also\n", + "want it to be simple to administer and cheaper to use. Databricks SQL leverages\n", + "serverless SQL warehouses that let you get started in seconds, and it’s powered\n", + "by a new native MPP vectorized engine: Photon.\n", + "\n", + "Databricks SQL warehouses are optimized and elastic SQL compute resources.\n", + "Just pick the cluster size and Databricks automatically determines the best\n", + "instance types and VMs configuration for the best price/performance. This\n", + "means you don’t have to worry about estimating peak demand or paying too\n", + "much by overprovisioning. You just need to click a few buttons to operate.\n", + "To further streamline the experience, simply use [Databrick SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) .\n", + "With the serverless capability, queries start rapidly with zero infrastructure\n", + "management or configuration overhead. This lowers your total cost, as you pay\n", + "only for what you consume without idle time or overprovisioned resources.\n", + "\n", + "\n", + "Since CPU clock speeds have plateaued, we also wanted to find new ways to\n", + "process data faster, beyond raw compute power. One of the most impactful\n", + "methods has been to improve the amount of data that can be processed in\n", + "parallel. However, data processing engines need to be specifically architected to\n", + "take advantage of this parallelism. So, from the ground up, we built [Photon](https://databricks.com/product/photon) , a new\n", + "C++ based vectorized query processing engine that dramatically improves query\n", + "performance while remaining fully compatible with open Spark APIs. Databricks\n", + "SQL warehouses are powered by Photon, which seamlessly coordinates work and\n", + "resources and transparently accelerates portions of your SQL queries directly on\n", + "your data lake. No need to move the data to a data warehouse.\n", + "\n", + "**[READ NOW](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)**\n", + "##### Photon: A Fast Query Engine for Lakehouse Systems\n", + "\n", + "[SIGMOD 2022 Best Industry Paper Award](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Did you know?**\n", + "\n", + "Databricks SQL warehouses scale automatically throughout the day to\n", + "better suit your business needs. Administration is simplified by identifying\n", + "how many clusters can scale out with min and max, and Databricks SQL will\n", + "auto-scale as needed. This ensures that you have ample compute to serve\n", + "your needs, without overprovisioning. Administrators appreciate the ability\n", + "to have better control over consumption costs, while users appreciate that\n", + "their queries process as fast and efficiently as possible. 
For most BI and analytics use cases, using medium-size warehouses with scaling is a great balance of price/performance that fits most business needs.

In the next section, we will discuss examples of Databricks SQL performance results on large-scale analytic workloads as well as highly concurrent workloads.

-----

##### Large query performance: the fastest data warehouse

The industry standard benchmark used by data warehouses is TPC-DS. It includes 100 queries that range from very simple to very sophisticated to simulate decision support workloads. This benchmark was created by a committee formed by data warehousing vendors. The chart below shows price/performance results running the 100TB version of TPC-DS, since for large workloads the numbers that ultimately matter pertain to the performance cost. As you can see, Databricks SQL outperforms all cloud data warehouses we have measured.

Chart: 100TB TPC-DS price/performance benchmark (lower is better), comparing Databricks SQL Spot and On-Demand with three cloud data warehouses; measured values range from $146 to $1,791, with Databricks SQL the lowest.

**[LEARN MORE](https://dbricks.co/benchmark)**

**Did you know?**

Databricks SQL has set a [new world record in 100TB TPC-DS](http://tpc.org/5013), the gold standard performance benchmark for data warehousing. Databricks SQL outperformed the previous record by 2.2x. And this result has been formally audited and reviewed by the TPC council.

-----

##### Highly concurrent analytics workloads

Beyond large queries, it is also common for highly concurrent analytics workloads to execute over small data sets. To optimize concurrency, we used the same TPC-DS benchmark, but on a much smaller scale (10GB) and with 32 concurrent streams. We analyzed the results to identify and remove bottlenecks, and built hundreds of optimizations to improve concurrency. Databricks SQL now outperforms some of the best cloud data warehouses for both large queries and small queries with lots of users.

Real-world workloads, however, are not just about either large or small queries. Databricks SQL also provides intelligent workload management with a dual queuing system and highly parallel reads.

Chart: 10GB TPC-DS queries/hr at 32 concurrent streams (higher is better), comparing a cloud data warehouse with a Databricks SQL warehouse (L size) across July 2020, Jan 2021 and Oct 2022; reported values include 4,672, 11,690, 12,248 and 16,523 queries/hr, roughly a 3x improvement over the period.

-----

##### Intelligent workload management with smart queuing system

Real-world workloads typically include a mix of small and large queries. 
Therefore\n", + "the smart queuing and load balancing capabilities of Databricks SQL need to\n", + "account for that too. Databrick SQL uses a smart dual queuing system (in preview)\n", + "that prioritizes small queries over large, as analysts typically care more about the\n", + "latency of short queries than large ones.\n", + "\n", + "\n", + "##### Highly parallel reads with improved I/O performance\n", + "\n", + "It is common for some tables in a lakehouse to be composed of many files — for\n", + "example, in streaming scenarios such as IoT ingest when data arrives continuously.\n", + "In legacy systems, the execution engine can spend far more time listing these\n", + "files than actually executing the query. Our customers told us they do not want to\n", + "sacrifice performance for data freshness. With async and highly parallel I/O, when\n", + "executing a query, Databricks SQL now automatically reads the next blocks of data\n", + "from cloud storage while the current block is being processed. This considerably\n", + "increases overall query performance on small files (by 12x for 1MB files) and “cold\n", + "data” (data that is not cached) use cases as well.\n", + "\n", + "**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "###### PART 3: CONSUMPTION LAYER\n", + "\n", + "\n", + "The third layer of the Databricks Lakehouse Platform would similarly have to bridge\n", + "the best of both data lakes and data warehouses. In the lakehouse, you would\n", + "have to be able to work seamlessly with your tools of choice — whether you are a\n", + "business analyst, data scientist, or ML or data engineer.\n", + "\n", + "\n", + "The lakehouse must treat Python, Scala, R and SQL programming languages\n", + "and ecosystems as first-class citizens to truly unify data engineering, ML and BI\n", + "workloads in one place.\n", + "\n", + "\n", + "**Consumption layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake Notebooks (great for data scientists)|Data Warehouse Lack of support for data science/ML|Data Lakehouse Notebooks (great for data scientists)|\n", + "|---|---|---|\n", + "|Openness with rich ecosystem (Python, R, Scala)|Limited to SQL only|Openness with rich ecosystem (Python, R, Scala)|\n", + "|BI/SQL not 1st-class citizen|BI/SQL 1st-class citizen|BI/SQL 1st-class citizen|\n", + "||||\n", + "\n", + "\n", + "-----\n", + "\n", + "##### A platform for your tools of choice\n", + "\n", + "\n", + "At Databricks we believe strongly in open platforms and meeting our customers where they are. We work very\n", + "closely with a large number of software vendors to make sure you can easily use your tools of choice\n", + "on Databricks, like [Tableau](https://databricks.com/blog/2021/05/07/improved-tableau-databricks-connector-with-azure-ad-authentication-support.html) , [Power BI](https://databricks.com/blog/2021/02/26/announcing-general-availability-ga-of-the-power-bi-connector-for-databricks.html) or [dbt](https://databricks.com/blog/2021/12/06/deploying-dbt-on-databricks-just-got-even-simpler.html) . 
With [Partner Connect](https://www.databricks.com/partnerconnect), it’s easier than ever to connect with your favorite tools, easier to get data in, easier to authenticate using single sign-on, and of course, with all the concurrency and performance improvements, we make sure that the direct and live query experience is great. Any other Apache Spark-compatible client can connect as well.

**Now more than ever, organizations need a data strategy that enables speed and agility to be adaptable. As organizations are rapidly moving their data to the cloud, we’re seeing growing interest in doing analytics on the data lake. The introduction of Databricks SQL delivers an entirely new experience for customers to tap into insights from massive volumes of data with the performance, reliability and scale they need. We’re proud to partner with Databricks to bring that opportunity to life.**

**Francois Ajenstat**
Chief Product Officer, Tableau

-----

##### Faster BI results retrieval with Cloud Fetch

Once query results are computed, cloud data warehouses often collect and stream back results to BI clients on a single thread. This can create a bottleneck and greatly slow down the experience if you are fetching anything more than a few megabytes of results. To provide analysts with the best experience from their favorite BI tools, we also needed to speed up how the system delivers results to BI tools like Power BI or Tableau once they are computed.

That’s why we’ve reimagined this approach with a new architecture called [Cloud Fetch](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html). For large results, Databricks SQL now writes results in parallel across all of the compute nodes to cloud storage, and then sends the list of files back to the client using pre-signed URLs. The client can then download all the data from cloud storage in parallel. This approach provides up to 10x performance improvement in real-world scenarios.

Diagram: the cluster behind the SQL endpoint writes results to cloud storage, and the BI client performs parallel data transfers from cloud storage (customer benchmark: Tableau extract). Cloud Fetch enables faster, higher-bandwidth connectivity to and from your BI tools.

**[LEARN MORE](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html)**

-----

##### A first-class SQL development experience

In addition to supporting your favorite tools, we are also focused on providing a native first-class SQL development experience. 
We’ve talked to\n", + "hundreds of analysts using various SQL editors\n", + "like SQL Workbench every day, and worked with\n", + "them to provide the dream set of capabilities\n", + "for SQL development.\n", + "\n", + "For example, Databricks SQL now supports\n", + "[standard ANSI SQL](https://databricks.com/blog/2021/11/16/evolution-of-the-sql-language-at-databricks-ansi-standard-by-default-and-easier-migrations-from-data-warehouses.html) , so you don’t need to learn a\n", + "special SQL dialect. Query tabs allow you to work\n", + "on multiple queries at once, autosave gives you\n", + "peace of mind so you never have to worry about\n", + "losing your drafts, integrated history lets you\n", + "easily look at what you have run in the past, and\n", + "intelligent auto-complete understands subqueries\n", + "and aliases for a delightful experience.\n", + "\n", + "\n", + "The built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, with Databricks SQL, analysts can easily\n", + "make sense of query results through a wide variety\n", + "of rich visualizations and quickly build dashboards\n", + "with an intuitive drag-and-drop interface. To keep\n", + "everyone current, dashboards can be shared and\n", + "configured to automatically refresh, as well as to\n", + "alert the team to meaningful changes in the data.\n", + "\n", + "\n", + "Easily combine visualizations to build rich dashboards that can be shared with stakeholders.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Conclusion\n", + "\n", + "Databricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\n", + "into actionable data, combining the flexibility and openness of data lakes\n", + "with the reliability and performance of data warehouses. The Unity Catalog\n", + "provides fine-grained governance on the lakehouse across all clouds using\n", + "one friendly interface and standard SQL.\n", + "\n", + "Databricks SQL also holds the [new world record in 100TB TPC-DS](https://dbricks.co/benchmark) , the gold\n", + "standard performance benchmark for data warehousing. It is powered by\n", + "Photon, the new vectorized query engine for the lakehouse, and by SQL\n", + "warehouses for instant, elastic compute decoupled from storage.\n", + "\n", + "Finally, Databricks SQL offers a native first-class SQL development\n", + "experience, with a built-in SQL editor, rich visualizations and dashboards,\n", + "and integrates seamlessly with your favorite BI- and SQL-based tools for\n", + "maximum productivity.\n", + "\n", + "\n", + "Databricks SQL under the hood.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Atlassian\n", + "\n", + "\n", + "Atlassian is a leading provider of collaboration, development and issue-tracking\n", + "\n", + "software for teams. With over 150,000 global customers (including 85 of the Fortune\n", + "\n", + "100), Atlassian is advancing the power of collaboration with products including Jira,\n", + "\n", + "Confluence, Bitbucket, Trello and more.\n", + "\n", + "USE CASE\n", + "\n", + "Atlassian uses the Databricks Lakehouse Platform to democratize data across the enterprise and drive\n", + "down operational costs. 
Atlassian currently has a number of use cases focused on putting the\n", + "customer experience at the forefront.\n", + "\n", + "**Customer support and service experience**\n", + "With the majority of their customers being server-based (using products like Jira and Confluence),\n", + "Atlassian set out to move those customers into the cloud to leverage deeper insights that enrich the\n", + "customer support experience.\n", + "\n", + "**Marketing personalization**\n", + "The same insights could also be used to deliver personalized marketing emails to drive\n", + "engagement with new features and products.\n", + "\n", + "**Anti-abuse and fraud detection**\n", + "They can predict license abuse and fraudulent behavior through anomaly detection and\n", + "predictive analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "SOLUTION AND BENEFITS\n", + "\n", + "Atlassian is using the Databricks Lakehouse Platform to enable data democratization at scale, both internally\n", + "and externally. They have moved from a data warehousing paradigm to standardization on Databricks,\n", + "enabling the company to become more data driven across the organization. Over 3,000 internal users in\n", + "areas ranging from HR and marketing to finance and R&D — more than half the organization — are accessing\n", + "insights from the platform on a monthly basis via open technologies like Databricks SQL. Atlassian is also\n", + "using the platform to drive more personalized support and service experiences to their customers.\n", + "\n", + "**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\n", + "finance, sales, support and R&D\n", + "\n", + "**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n", + "\n", + "**•** MLflow streamlines MLOps for faster delivery\n", + "\n", + "**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n", + "\n", + "With cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\n", + "access all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\n", + "Already the company has:\n", + "\n", + "**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\n", + "jobs from EMR to Databricks with minimal effort and low-code change\n", + "\n", + "**•** Decreased delivery time by 30% with shorter dev cycles\n", + "\n", + "**•** Reduced data team dependencies by 70% with more self-service enabled throughout the organization\n", + "\n", + "**[LEARN MORE](https://www.youtube.com/watch?v=Xo1U617T-mU)**\n", + "\n", + "\n", + "**At Atlassian, we need to ensure**\n", + "**teams can collaborate well**\n", + "**across functions to achieve**\n", + "**constantly evolving goals. A**\n", + "**simplified lakehouse architecture**\n", + "**would empower us to ingest high**\n", + "**volumes of user data and run the**\n", + "**analytics necessary to better**\n", + "**predict customer needs and**\n", + "**improve the experience of our**\n", + "**customers. 
A single, easy-to-use**\n", + "**cloud analytics platform allows**\n", + "**us to rapidly improve and build**\n", + "**new collaboration tools based on**\n", + "**actionable insights.**\n", + "\n", + "**Rohan Dhupelia**\n", + "Data Platform Senior Manager, Atlassian\n", + "\n", + "\n", + "-----\n", + "\n", + "### ABN AMRO\n", + "\n", + "\n", + "As an established bank, ABN AMRO wanted to modernize their business but were hamstrung\n", + "\n", + "by legacy infrastructure and data warehouses that complicated access to data across various\n", + "\n", + "sources and created inefficient data processes and workflows. Today, Azure Databricks\n", + "\n", + "empowers ABN AMRO to democratize data and AI for a team of 500+ empowered engineers,\n", + "\n", + "scientists and analysts who work collaboratively on improving business operations and\n", + "\n", + "introducing new go-to-market capabilities across the company.\n", + "\n", + "USE CASE\n", + "\n", + "ABN AMRO uses the Databricks Lakehouse Platform to deliver financial services transformation on a global scale,\n", + "providing automation and insight across operations.\n", + "\n", + "**Personalized finance**\n", + "ABN AMRO leverages real-time data and customer insights to provide products and services tailored to\n", + "customers’ needs. For example, they use machine learning to power targeted messaging within their automated\n", + "marketing campaigns to help drive engagement and conversion.\n", + "\n", + "**Risk management**\n", + "Using data-driven decision-making, they are focused on mitigating risk for both the company and their\n", + "customers. For example, they generate reports and dashboards that internal decision makers and leaders use to\n", + "better understand risk and keep it from impacting ABN AMRO’s business.\n", + "\n", + "**Fraud detection**\n", + "With the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\n", + "impacts their customers. Among the activities they’re trying to address are money laundering and fake credit\n", + "card applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "SOLUTION AND BENEFITS\n", + "\n", + "Today, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\n", + "scientists and analysts who work collaboratively on improving business operations and introducing new\n", + "go-to-market capabilities across the company.\n", + "\n", + "**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\n", + "downstream analytics\n", + "\n", + "**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\n", + "through reports and dashboards\n", + "\n", + "**•** MLflow speeds deployment of new models that improve the customer experience — with new use\n", + "cases delivered in under two months\n", + "\n", + "\n", + "**Databricks has changed the way**\n", + "**we do business. 
It has put us in**\n", + "**a better position to succeed in**\n", + "**our data and AI transformation**\n", + "**as a company by enabling data**\n", + "**professionals with advanced data**\n", + "**capabilities in a controlled and**\n", + "**scalable way.**\n", + "\n", + "**Stefan Groot**\n", + "Head of Analytics Engineering,\n", + "ABN AMRO\n", + "\n", + "\n", + "#### 10x faster\n", + "\n", + "time to market — use cases\n", + "deployed in two months\n", + "\n", + "\n", + "#### 100+ \n", + "\n", + "use cases to be delivered\n", + "over the coming year\n", + "\n", + "\n", + "#### 500+\n", + "\n", + "empowered business\n", + "and IT users\n", + "\n", + "\n", + "**[LEARN MORE](https://databricks.com/customers/abn-amro)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### SEGA Europe\n", + "\n", + "**Improving the player experience**\n", + "\n", + "# “ is at the heart of everything\n", + "\n", + "**we do, and we very much**\n", + "**see Databricks as a key**\n", + "**partner, supporting us to drive**\n", + "**forward the next generation of**\n", + "**community gaming.**\n", + "\n", + "**Felix Baker**\n", + "Data Services Manager, SEGA Europe\n", + "\n", + "\n", + "SEGA® Europe, the worldwide leader in interactive entertainment, is using the Databricks\n", + "\n", + "Lakehouse Platform to personalize the player experience and build its own machine\n", + "\n", + "learning algorithm to help target and tailor games for over 30 million of its customers.\n", + "\n", + "As housebound gamers looked to pass the time during the first lockdowns of 2020, some SEGA Europe\n", + "titles, including Football Manager,™ saw over double the number of sales during the first lockdown\n", + "compared to the year before. Furthermore, a number of SEGA titles experienced a more than 50% increase\n", + "in players over the course of the COVID-19 pandemic. With more anonymized data being collected through\n", + "an analytics pipeline than ever before, the team needed a dedicated computing resource to handle the\n", + "sheer volume of data, extract meaningful insights from it and enable the data science team to improve\n", + "general workflow.\n", + "\n", + "**[LEARN MORE](https://www.youtube.com/watch?v=SzeXHcwPDSE)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the lakehouse company. More than 7,000 organizations\n", + "\n", + "worldwide — including Comcast, Condé Nast and over 50% of the\n", + "\n", + "Fortune 500 — rely on the Databricks Lakehouse Platform to unify their\n", + "\n", + "data, analytics and AI. Databricks is headquartered in San Francisco,\n", + "\n", + "with offices around the globe. Founded by the original creators of\n", + "\n", + "Apache Spark, TM Delta Lake and MLflow, Databricks is on a mission to help\n", + "\n", + "data teams solve the world’s toughest problems. To learn more, follow\n", + "\n", + "Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo\n", + "**databricks.com/contact**\n", + "\n", + "**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
# Big Book of Data and AI Use Cases for the Public Sector\n", + "\n", + "### Best practices, customer stories and solution templates for government agencies interested in building on the Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "The State of Data and AI in the Government .......................................................................................... 3\n", + "\n", + "The Need for a Modern Data Architecture ............................................................................................. 5\n", + "\n", + "Introducing the Lakehouse for Public Sector ......................................................................................... 6\n", + "\n", + "**U S E C A S E :** Cybersecurity ........................................................................................................................... 9\n", + "\n", + "**U S E C A S E :** Predictive Maintenance .......................................................................................................... 12\n", + "\n", + "**U S E C A S E :** Fraud Detection ....................................................................................................................... 15\n", + "\n", + "**U S E C A S E :** Money Laundering ................................................................................................................. 17\n", + "\n", + "**U S E C A S E :** Entity Analytics ...................................................................................................................... 19\n", + "\n", + "**U S E C A S E :** Geospatial Analytics .............................................................................................................. 21\n", + "\n", + "**U S E C A S E :** Public Health Management .................................................................................................. 24\n", + "\n", + "Conclusion ................................................................................................................................................. 26\n", + "\n", + "\n", + "-----\n", + "\n", + "## The State of Data and AI in the Government\n", + "\n", + "###### Over the last decade, data and AI have redefined every industry on the planet. Retailers have improved the shopping experience with personalized recommendations, financial institutions have strengthened risk management through the use of advanced analytics, and the healthcare industry is tapping into the power of machine learning to predict and prevent chronic disease. The public sector is no exception.\n", + "\n", + "\n", + "In 2018, the U.S. Federal Government embarked on one of its most ambitious\n", + "efforts since putting a man on the moon — embedding data into all aspects of\n", + "decision-making. By enacting the Evidence-Based Policymaking Act of 2018,\n", + "Congress set in motion requirements for agencies to modernize their data and\n", + "analytics capabilities, including the appointment of agency-level chief data\n", + "officers. A year later came the Federal Data Strategy, which provided further\n", + "guidance for how agencies should manage and use data by 2030.\n", + "\n", + "\n", + "With all of this guidance, agencies are starting to make meaningful improvements\n", + "to their data strategy, but when it comes to innovating with data, agencies still\n", + "lag behind the private sector. This begs the question: what’s standing in the way?\n", + "The hurdles aren’t due to a lack of effort on the part of agency leaders. 
In fact,\n", + "they can largely be attributed to a patchwork of legacy technologies that have\n", + "been amassed over the last 30 to 40 years. While these hurdles stand in the\n", + "way, a number of innovative agencies are making significant progress as they\n", + "embrace new data and AI capabilities.\n", + "\n", + "\n", + "-----\n", + "\n", + "Federal spending on artificial intelligence rose to [nearly $1 billion](https://www.federaltimes.com/thought-leadership/2021/09/28/why-the-government-market-for-artificial-intelligence-technology-is-expanding/) in 2020, up\n", + "50% from 2018. There’s a good reason for this level of spend: Deloitte recently\n", + "published a report, “AI-augmented Government,” that estimates the federal\n", + "government could free up as many as 1.2 billion hours of work and save up to\n", + "$41.1 billion annually through the use of AI-driven automation. Early adopters\n", + "of advanced analytics are starting to see the fruits of their labor. For example,\n", + "[USCIS modernized their analytics stack](https://databricks.com/customers/uscis) on Databricks to accelerate insights\n", + "on applicants by 24x, automate the processing of millions of applications,\n", + "and reduce appointment no-show rates with predictive analytics. The [Orange](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/)\n", + "[County Courts](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/) also recently shared how they are automating legacy paperbased workflows with machine learning.\n", + "\n", + "In this eBook, we explore the hurdles of legacy technologies and how a modern\n", + "data lakehouse can help agencies unlock innovative data and analytics use cases\n", + "at all levels of government. Over the following seven example use cases, covering\n", + "everything from cyber threat detection to improving public health,\n", + "\n", + "\n", + "**An increased focus on cloud, analytics and AI = operational efficiency**\n", + "\n", + "1. AI/ML\n", + "2. Data Analytics\n", + "3. Cloud\n", + "\n", + "**$1B** **TOP PRIORITIES** **$41B+**\n", + "\n", + "Data and AI Research and Government CIOs’ top Estimated government\n", + "Development Initiative game-changing technologies savings from data-driven\n", + "automation\n", + "\n", + "**U.S. Government**\n", + "\n", + "we demonstrate how the Databricks Lakehouse for Public Sector is critical to\n", + "improving citizen services and delivering on mission objectives. This guide also\n", + "includes resources in the form of Solution Accelerators, reference architectures\n", + "and real-world customer stories to help as you embark on your own journey to\n", + "drive a safer and more prosperous nation through the use of data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Need for a Modern Data Architecture\n", + "\n", + "###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decisionmaking with predictive insights that can better keep pace with the dynamic needs of citizens and global communities. 
That being said, there are a number of barriers standing in their way.\n", + "\n", + "##### Common challenges\n", + "\n", + "\n", + "Many government agencies are burdened with a legacy IT infrastructure that is\n", + "built with on-premises data warehouses that are complex to maintain, are costly\n", + "to scale as compute is coupled with storage, and lack support for unstructured\n", + "data and advanced analytics. This severely inhibits data-driven innovation.\n", + "Maintaining these systems requires a massive investment of both time and\n", + "money compared to modern cloud-based systems and creates a number of\n", + "avoidable challenges:\n", + "\n", + "\n", + "government is often done in weekly or daily batches, but decision-making\n", + "needs to happen in real time. Critical events like cyber attacks and health\n", + "pandemics can’t wait a week.\n", + "\n", + "**Lack of citizen insights**\n", + "\n", + "When data is siloed, teams get an incomplete view of the citizen,\n", + "resulting in missed opportunities to improve the delivery of services that\n", + "impact the quality of life for their constituents.\n", + "\n", + "\n", + "**Lack of reliability**\n", + "\n", + "\n", + "Siloed systems result in data replication as teams spin up new data marts\n", + "to support their one-off use cases. Without a single source of truth, teams\n", + "struggle with data inconsistencies, which can result in inaccurate analysis\n", + "and model performance that is only compounded over time.\n", + "\n", + "**Lack of agility**\n", + "\n", + "Disjointed analytics tools and legacy infrastructure hinder the ability of\n", + "teams to conduct real-time analytics. Most data processing in the\n", + "\n", + "\n", + "**Lack of productivity**\n", + "\n", + "Data scientists and data analysts alike must have the right tool set to\n", + "collaboratively investigate, extract and report meaningful insights from\n", + "their data. Unfortunately, data silos lead to organizational silos, which make\n", + "collaboration inside an agency as well as between agencies very difficult.\n", + "With different groups of data teams leveraging their own coding and\n", + "analytical tools, communicating insights and working across teams —\n", + "let alone across agencies — is almost impossible. This lack of collaboration\n", + "can drastically limit the capabilities of any data analytics or AI initiative.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introducing the Lakehouse for Public Sector\n", + "\n", + "\n", + "The reason that the Databricks Lakehouse is\n", + "able to deliver the simplicity, flexibility and\n", + "speed that a government agency requires is\n", + "that it fundamentally reimagines the modern\n", + "data architecture. Databricks provides federal,\n", + "state and local agencies with a cloud-native\n", + "Lakehouse Platform that combines the best\n", + "of data warehouses and data lakes — to store\n", + "and manage all your data for all your analytics\n", + "workloads. 
With this modern architecture,\n", + "agencies can federate all their data and\n", + "democratize access for downstream use\n", + "cases, empowering their teams to deliver on\n", + "their mission objectives by unlocking the full\n", + "potential of their data.\n", + "\n", + "\n", + "**Delivering real-time data insight in support of the mission**\n", + "\n", + "- Fraud, Waste & Abuse\n", + "\n", + "- Cybersecurity\n", + "\n", + "- Medicaid Dashboards &\n", + "Reporting\n", + "\n", + "- Process Improvement\n", + "\n", + "- Predictive Maintenance\n", + "\n", + "- SCM & Demand Forecasting\n", + "\n", + "- Smart Military/Censor Data\n", + "\n", + "- Military Heatlh\n", + "\n", + "- COVID Response/Decision\n", + "Support\n", + "\n", + "- Smart Cities/Connected\n", + "Vehicles\n", + "\n", + "- Citizen Engagement\n", + "\n", + "- Data-Driven Decision-Making\n", + "\n", + "\n", + "-----\n", + "\n", + "**Federate all of your agency’s data**\n", + "\n", + "Any type of data can be stored because, like a data lake, the Databricks\n", + "Lakehouse is built using the low-cost object storage supported by cloud\n", + "providers. Leveraging this capability helps break down the data silos that\n", + "hinder efforts to aggregate data for advanced analytics (e.g., predictive\n", + "maintenance) or compute-intensive workloads like detecting cyber\n", + "threats across billions of signals. Probably even more important is the\n", + "ability of the lakehouse architecture to travel back in time, ensuring full\n", + "audit compliance and high governance standards for analytics and AI.\n", + "\n", + "**Power real-time decision-making**\n", + "\n", + "Streaming use cases such as IoT analytics or disease spread tracking is\n", + "simpler to support because the lakehouse uses Apache Spark TM as the\n", + "data processing engine and Delta Lake as a storage layer. With Spark,\n", + "you can toggle between batch and streaming workloads with just a line\n", + "of code. With Delta Lake, native support for ACID transactions means\n", + "that you can deploy streaming workloads without the overhead of\n", + "common reliability and performance issues. These capabilities make\n", + "real-time analytics possible.\n", + "\n", + "\n", + "**Unlock collaborative analytics for all personas**\n", + "\n", + "The Databricks Lakehouse for Public Sector is your one-stop shop for\n", + "all your analytics and AI. The platform includes a business intelligence\n", + "capability — Databricks SQL — that empowers data analysts to query and run\n", + "reports against all of an agency’s unified data. Databricks SQL integrates with\n", + "BI tools like Tableau and Microsoft Power BI and complements any existing BI\n", + "tools with a SQL-native interface, allowing data analysts and data scientists\n", + "to query data directly within Databricks and build powerful dashboards.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Deliver on your mission with predictive insights**\n", + "In the same environment, data scientists can build, share and collaborate\n", + "on machine learning models for advanced use cases like fraud detection\n", + "or geospatial analytics. Additionally, MLflow, an open source toolkit for\n", + "managing the ML lifecycle, is built into the Lakehouse so data scientists\n", + "can manage everything in one place. Databricks natively supports Python,\n", + "R, SQL and Scala so practitioners can work together with the languages and\n", + "libraries of their choice, reducing the need for separate tools. 
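The batch-versus-streaming toggle and the time travel capability described above can be made concrete with a short PySpark sketch. This is a minimal illustration rather than code from any accelerator: the table name `agency.iot.sensor_readings` and the version number are hypothetical placeholders.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Batch read of a Delta table (table name is a placeholder).
batch_df = spark.read.table("agency.iot.sensor_readings")

# The same table consumed as a stream -- only the reader call changes.
stream_df = spark.readStream.table("agency.iot.sensor_readings")

# Delta time travel for audit and reproducibility: query an earlier version.
audit_df = spark.sql(
    "SELECT * FROM agency.iot.sensor_readings VERSION AS OF 10"
)
```

Because both readers return ordinary DataFrames, downstream transformations can be written once and reused across batch and streaming jobs.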
With these\n", + "capabilities, data teams can turn insights from real-world data into powerful\n", + "visualizations designed for machine learning. Visualizations can then be\n", + "turned into interactive dashboards to share insights with peers across\n", + "agencies, policymakers, regulators and decision-makers.\n", + "\n", + "\n", + "##### Customers That Innovate With Databricks Lakehouse for Public Sector\n", + "\n", + "Some of the top government agencies in the world turn to the\n", + "Databricks Lakehouse for Public Sector to bring analytics and AI-driven\n", + "automation and innovation to the communities they serve.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Cybersecurity\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Limited window of data**\n", + "Given the high cost of storage, most agencies retain only a few weeks of threat\n", + "data. This can be a real problem in scenarios where a perpetrator gains access\n", + "to a network but waits months before doing anything malicious. Without a long\n", + "historical record, security teams can’t analyze cyberattacks over long-term\n", + "horizons or conduct deep forensic reviews.\n", + "\n", + "##### Solution overview\n", + "\n", + "For government agencies that are ready to modernize their security data\n", + "infrastructure and analyze data at petabyte-scale more cost-effectively,\n", + "Databricks provides an open lakehouse platform that augments existing SIEMs\n", + "to help democratize access to data for downstream analytics and AI. Built\n", + "on Apache Spark and Delta Lake, Databricks is optimized to process large\n", + "volumes of streaming and historic data for real-time threat analysis and incident\n", + "response. Security teams can query threat data going years into the past in just\n", + "minutes and build ML models to detect new threat patterns and reduce false\n", + "positives. Additionally, Databricks created a Splunk-certified add-on to augment\n", + "Splunk for Enterprise Security (ES) for cost-efficient log and retention expansion.\n", + "\n", + "\n", + "Cyberattacks from bad actors and nation states are a huge and growing threat\n", + "to government agencies. Recent large-scale attacks like the ones on SolarWinds,\n", + "log4j, Colonial Pipeline and HAFNIUM highlight the sophistication and increasing\n", + "frequency of broad-reaching cyberattacks. Data breaches cost the federal\n", + "government more than $4 million per incident in 2021 and threaten national\n", + "security. 
Staying ahead of the next threat requires continuous monitoring of\n", + "security data from an agency’s entire attack surface before, during and after\n", + "an incident.\n", + "\n", + "##### Challenges\n", + "\n", + "**Scaling existing SIEM solutions**\n", + "Agencies looking to expand existing SIEM tools for today’s petabytes of data can\n", + "expect increased licensing, storage, compute and integration resources resulting\n", + "in tens of millions of dollars in additional costs per year.\n", + "\n", + "**Rules-based systems**\n", + "Many legacy SIEM tools lack the critical analytics capabilities — such as\n", + "advanced analytics, graph processing and machine learning — needed to detect\n", + "unknown threat patterns or deliver on a broader set of security use cases like\n", + "behavioral analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Detect Criminal](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n", + "[Threats Using DNS Analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n", + "\n", + "Detecting criminals and nation states through DNS analytics. In order to address\n", + "common cybersecurity challenges such as deployment complexity, tech\n", + "limitation and cost, security teams need a real-time data analytics platform that\n", + "can handle cloud scale, analyze data wherever it is, natively support streaming\n", + "and batch analytics, and have collaborative content development capabilities.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=5BRGqxq4iQw)**\n", + "\n", + "**Fighting Cyber Threats in Real Time**\n", + "Since partnering with Databricks, HSBC has reduced costs, accelerated threat\n", + "detection and response, and improved their security posture. Not only can\n", + "they process all of their required data, but they’ve also increased online query\n", + "retention from just days to months at petabyte scale. HSBC is now able to\n", + "execute 2-3x more threat hunts per analyst.\n", + "\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "\n", + "Designed for cloud-scale security operations, the add-on provides Splunk\n", + "analysts with access to all data stored in the Lakehouse. Bidirectional pipelines\n", + "between Splunk and Databricks allow agency analysts to integrate directly into\n", + "Splunk visualizations and security workflows.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Predictive Maintenance\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Integrating unstructured data**\n", + "Equipment data doesn’t just come in the form of IoT data. Agencies can gather\n", + "rich unstructured signals like audio, visual (e.g., video inspections) and text\n", + "(e.g., maintenance logs). Most legacy data architectures are unable to integrate\n", + "structured and unstructured data sources.\n", + "\n", + "**Operationalizing machine learning**\n", + "Most agencies lack the advanced analytics tools needed to build models that\n", + "can predict potential equipment failures. 
Those that do typically have their\n", + "data scientists working in a siloed set of tools, resulting in unnecessary data\n", + "replication and inefficient workflows.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse is tailor-made for building IoT applications at scale.\n", + "With Databricks, agencies can easily manage large streaming volumes of small\n", + "files, with ACID transaction guarantees and reduced job fails compared to\n", + "traditional data warehouse architectures. Additionally, the Lakehouse is cloud\n", + "native and built on Apache Spark, so scaling for petabytes of data is not an issue.\n", + "With the Lakehouse, agencies can bring together all of their structured and\n", + "unstructured data with a unified set of tooling for data engineering, model building\n", + "and production rollout. With these capabilities, operations teams can quickly\n", + "detect and act on pending equipment failures before they affect performance.\n", + "\n", + "\n", + "Predictive maintenance is oftentimes associated with the manufacturing sector,\n", + "but in reality it extends far beyond the factory floor. Consider this for a moment:\n", + "the U.S. Government operates a fleet of over [640,000 vehicles](https://www.government-fleet.com/301786/federal-vs-state-local-fleets) including public\n", + "buses, postal delivery trucks, drones, helicopters and jet fighters. Many of these\n", + "vehicles — like multimillion-dollar aircraft — contain sensors that generate\n", + "massive amounts of data on the use and conditions of various components. And\n", + "it’s not just vehicles. Modern public utilities stream data through connected IoT\n", + "devices. All of this data can be analyzed to identify the root cause of a failure\n", + "and predict future maintenance, helping to avoid costly repairs and critical\n", + "assets from being out of service.\n", + "\n", + "##### Challenges\n", + "\n", + "**Managing IoT data at scale**\n", + "With billions of sensors generating information, most data systems are unable to\n", + "handle the sheer volume of data. 
Before agencies can even start analyzing their\n", + "data, legacy data warehouse–based tools require preprocessing of data, making\n", + "real-time analysis impossible.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "\n", + "**Solution Accelerator: Predictive Maintenance**\n", + "Learn how to ingest real-time IoT data from field devices, perform complex\n", + "time series processing in Delta Lake and leverage machine learning to build\n", + "predictive maintenance models.\n", + "\n", + "[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)\n", + "\n", + "[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)\n", + "\n", + "[Part 3: Using ML to predict maintenance.](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)\n", + "\n", + "\n", + "[Watch the Demo:](https://vimeo.com/580864758/5a5bc42bb9)\n", + "[Predictive Maintenance on Azure Databricks](https://vimeo.com/580864758/5a5bc42bb9)\n", + "\n", + "##### Customer story\n", + "\n", + "**[LEARN MORE](https://www.tallan.com/blog/client-stories/dc-water/)**\n", + "\n", + "**Protecting the Water Supply for 700,000 Residents**\n", + "Utilizing machine learning for predictive analytics to help stop water main\n", + "breaks before they occur, potentially saving hundreds of thousands of dollars\n", + "in repairs while reducing service interruption.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "Weather Sensor\n", + "Readings\n", + "(semi-structured)\n", + "\n", + "Real-time\n", + "streaming\n", + "\n", + "Wind Turbine\n", + "Telematics\n", + "(semi-structured)\n", + "\n", + "Maintenance Logs\n", + "(unstructured)\n", + "\n", + "\n", + "#### Databricks Lakehouse Platform\n", + "\n", + "Bronze Layer Silver Layer Gold Layer\n", + "\n", + "\n", + "Append Raw\n", + "Merge Data\n", + "Data\n", + "\n", + "\n", + "Join Streams and\n", + "Analyze Data\n", + "\n", + "Enriched\n", + "Readings\n", + "\n", + "\n", + "Output\n", + "\n", + "\n", + "Build Predictive\n", + "Maintenance Model\n", + "\n", + "\n", + "Granular\n", + "Readings\n", + "\n", + "\n", + "Aggregated\n", + "Hourly\n", + "Readings\n", + "\n", + "\n", + "Real-time Dashboards for Real-Time Dashboards for\n", + "Optimizing Performance Optimizing Performance\n", + "\n", + "|Col1|Col2|Col3|\n", + "|---|---|---|\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Fraud Detection\n", + "\n", + "\n", + "##### Overview\n", + "\n", + "According to [McKinsey & Company](https://www.mckinsey.com/~/media/McKinsey/Industries/Public%20Sector/Our%20Insights/Cracking%20down%20on%20government%20fraud%20with%20data%20analytics/Cracking-down-on-government-fraud-with-data-analytics-vF.pdf) , more than half of the federal government’s\n", + "monetary losses to fraud, waste and abuse go undetected and total tens of\n", + "billions of dollars. Financial fraud comes in many forms, from individuals taking\n", + "advantage of relief programs to complex networks of criminal organizations\n", + "working together to falsify medical claims and rebate forms. 
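Returning to the predictive maintenance reference architecture shown above (raw sensor streams appended to a Bronze table, then aggregated into hourly Silver readings), here is a minimal, hypothetical sketch of those first two hops. Paths, table names and the sensor schema are illustrative, and the sketch assumes Databricks Auto Loader (`cloudFiles`) is available; the accelerator linked above contains the full implementation.

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Illustrative locations -- not taken from the accelerator.
RAW_PATH = "/Volumes/fleet/raw/sensor_readings"

# Bronze: append raw JSON sensor readings as they land in cloud storage.
bronze_stream = (
    spark.readStream.format("cloudFiles")               # Databricks Auto Loader
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/fleet/chk/schema")
    .load(RAW_PATH)
)
(bronze_stream.writeStream
    .option("checkpointLocation", "/Volumes/fleet/chk/bronze")
    .toTable("fleet.iot.bronze_sensor_readings"))

# Silver: fix timestamps and aggregate hourly readings per asset
# (vibration/temperature are assumed to arrive as numeric values).
silver_stream = (
    spark.readStream.table("fleet.iot.bronze_sensor_readings")
    .withColumn("event_time", F.col("event_time").cast("timestamp"))
    .withWatermark("event_time", "2 hours")
    .groupBy("asset_id", F.window("event_time", "1 hour"))
    .agg(F.avg("vibration").alias("avg_vibration"),
         F.max("temperature").alias("max_temperature"))
)
(silver_stream.writeStream
    .option("checkpointLocation", "/Volumes/fleet/chk/silver")
    .toTable("fleet.iot.silver_hourly_readings"))
```

A Gold table and the maintenance model itself would then be built on top of these hourly aggregates, as the diagram suggests.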
Investigative teams\n", + "hoping to stay ahead of fraudsters need advanced analytics techniques so they\n", + "can detect anomalous behavior buried in a sea of data.\n", + "\n", + "##### Challenges\n", + "\n", + "**Lack of machine learning**\n", + "A rules-based approach is not enough. Bad actors are getting more and more\n", + "sophisticated in how they take advantage of government programs, necessitating\n", + "an AI-driven approach.\n", + "\n", + "**Unreliable data**\n", + "Getting high-quality, clean data and maintaining a rich feature store is critical\n", + "for identifying ever-evolving fraud patterns while maintaining a strict record of\n", + "previous data points.\n", + "\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables teams to develop complex ML models with\n", + "high governance standards and bridge the gap between data science and\n", + "technology to address the challenge of analyzing large volumes of data at scale\n", + "— 40 billion financial transactions a year are made in the United States alone.\n", + "Additionally, Databricks makes it possible to combine modern AI techniques\n", + "with the legacy rules-based methods that underpin current approaches to fraud\n", + "detection all within a common and efficient Spark-based orchestration engine.\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Fraud Detection](https://databricks.com/blog/2021/01/19/combining-rules-based-and-ai-models-to-combat-financial-fraud.html)\n", + "\n", + "Due to an ever-changing landscape, building a financial fraud detection\n", + "framework often goes beyond just creating a highly accurate machine learning\n", + "model. Oftentimes it involves a complex-decision science setup that combines\n", + "a rules engine with a need for a robust and scalable machine learning platform.\n", + "In this example, we show how to build a holistic fraud detection solution on\n", + "Databricks using data from a financial institution.\n", + "\n", + "\n", + "**Analytics at scale**\n", + "Training complex ML models with hundreds of features on gigabytes of\n", + "structured, semi-structured and unstructured data can be impossible without a\n", + "highly scalable and distributed infrastructure.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=Ca1MMNpBSHM)**\n", + "\n", + "**Identifying Financial Fraud at Scale**\n", + "Processes hundreds of billions of market events\n", + "per day on the Databricks Lakehouse and uses\n", + "the power of machine learning to identify illicit\n", + "activity in near real-time.\n", + "\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Money Laundering\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "Approximately [$300 billion](https://home.treasury.gov/system/files/136/2018NMLRA_12-18.pdf) is laundered through the United States each year,\n", + "and with criminal organizations — both at home and abroad — implementing\n", + "increasingly sophisticated methods for laundering funds, it’s getting harder to\n", + "stop. 
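The fraud detection solution described above combines legacy rules with ML scoring inside one Spark pipeline. The sketch below illustrates that pattern only; the table, columns, threshold and the registered model name (`fraud_classifier`) are placeholders, and the accelerator linked above is considerably more complete.

```python
import mlflow.pyfunc
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Placeholder table and columns.
txns = spark.read.table("finance.claims.transactions")

# Legacy-style rule: large transactions from very new accounts.
rule_flag = (F.col("amount") > 10_000) & (F.col("account_age_days") < 30)

# ML score from a model previously trained and registered with MLflow.
score_udf = mlflow.pyfunc.spark_udf(
    spark,
    model_uri="models:/fraud_classifier/Production",  # placeholder registry name
    result_type="double",
)

scored = (
    txns.withColumn("rule_flag", rule_flag)
        .withColumn("ml_score",
                    score_udf(F.struct("amount", "account_age_days", "merchant_risk")))
        .withColumn("needs_review", F.col("rule_flag") | (F.col("ml_score") > 0.9))
)
```

Keeping both signals in the same DataFrame lets investigators triage on either the deterministic rule or the model score.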
While the federal government continues to apply pressure on the financial\n", + "sector through heightened regulation, more is needed to combat laundering.\n", + "Modern AI techniques such as graph analytics and computer vision can be\n", + "used to process different types of structured (e.g., financial transactions) and\n", + "unstructured (e.g., real estate images) data and identify illicit behavior. This\n", + "allows investigative teams to automate labor-intensive activities like confirming\n", + "a residential address or reviewing transaction histories, and instead dig into\n", + "priority threats.\n", + "\n", + "##### Challenges\n", + "\n", + "**Complex data science**\n", + "Modern anti-money laundering (AML) practices require multiple ML capabilities\n", + "such as entity resolution, computer vision and graph analytics on entity\n", + "metadata, which is typically not supported by any one data platform.\n", + "\n", + "\n", + "**Time-consuming false positives**\n", + "Any reported suspicious activity must be investigated manually to ensure\n", + "accuracy. Many legacy solutions generate a high number of false positives or fail\n", + "to identify unknown patterns, resulting in wasted effort by investigators.\n", + "\n", + "##### Solution overview\n", + "\n", + "AML solutions face the operational burden of processing billions of transactions\n", + "a day. The Databricks Lakehouse Platform combines the low storage cost\n", + "benefits of cloud data lakes with the robust transaction capabilities of data\n", + "warehouses, making it the ideal foundation for building AML analytics at massive\n", + "scale. At the core of Databricks is Delta Lake, which can store and combine\n", + "both unstructured and structured data to build entity relationships; moreover,\n", + "Databricks Delta Engine provides efficient access using the new Photon compute\n", + "to speed up BI queries on tables spanning billions of transactions. On top of\n", + "these capabilities, ML is a first-class citizen in the Lakehouse, which means\n", + "analysts and data scientists do not waste time subsampling or moving data to\n", + "share dashboards and stay one step ahead of bad actors.\n", + "\n", + "\n", + "**Model transparency**\n", + "Although AI can be used to address many money laundering use cases, the lack\n", + "of transparency in the development of ML models offers little explainability,\n", + "inhibiting broader adoption.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Modern](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n", + "[Anti-Money Laundering Techniques](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n", + "\n", + "\n", + "Lakehouse Platform leveraging a series of next-gen machine learning techniques\n", + "including NLP, computer vision, entity resolution and graph analytics. This\n", + "approach helps teams better adapt to the reality of modern laundering practices.\n", + "\n", + "\n", + "Current anti-money laundering practices bear little resemblance to those of the\n", + "last decade. In today’s digital world, financial institutions are processing billions\n", + "of transactions daily, increasing the surface area of money laundering. 
With this\n", + "accelerator, we demonstrate how to build a scalable AML solution on the\n", + "\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Entity Analytics\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**No machine learning capabilities**\n", + "Entity resolution typically relies on basic rules-based logic to compare records\n", + "(e.g., matching on name and address), but with messy, large volumes of data,\n", + "advanced analytics is needed to improve accuracy and accelerate efforts.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse is an ideal platform for building entity analytics at\n", + "scale. With support for a wide range of data formats and a rich and extensible\n", + "set of data transformation and ML capabilities, Databricks enables agencies to\n", + "bring together all of their data in a central location and move beyond simple\n", + "rules-based methods for entity resolution. Data teams can easily explore\n", + "different machine learning techniques like natural language processing,\n", + "classification and graph analytics to automate entity matching. And one-click\n", + "provisioning and deprovisioning of cloud resources makes it easy for teams to\n", + "cost-effectively allocate the necessary compute resources for any size job so\n", + "they can uncover findings faster.\n", + "\n", + "\n", + "Entity analytics aims to connect disparate data sources to build a full view of\n", + "a person or an organization. This has many applications in the public sector,\n", + "such as fraud detection, national security and population health. For example,\n", + "Medicare fraud teams need to understand which prescriptions are filled, claims\n", + "filed and facilities visited across geographies to uncover suspicious behavior.\n", + "Before teams can even look for suspicious behavior, they must first determine\n", + "which records are associated. In the United States, nearly 50,000 people share\n", + "the name John Smith (and there are thousands of others with similar names).\n", + "Imagine trying to identify the right John Smith for this type of analysis. That’s no\n", + "easy task.\n", + "\n", + "##### Challenges\n", + "\n", + "**Disjointed data**\n", + "Managing complex and brittle ETL pipelines in order to cleanse and join data\n", + "across siloed systems and data stores.\n", + "\n", + "\n", + "**Compute intensive**\n", + "Identifying related entities across population-level data sets requires massive\n", + "compute power that far outstrips legacy on-prem data architectures.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Virtual Workshop: Entity Analytics](https://drive.google.com/file/d/1wGGT9Fn5EZF5Rgrabuttt1xdua5csrBa/view?usp=sharing)\n", + "\n", + "Learn from Databricks experts on how entity analytics is being deployed\n", + "in the public sector and watch a demo that shows how to use ML to link\n", + "payments and treatments across millions of records in a public CMS data set.\n", + "\n", + "[Solution Accelerator:](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n", + "[Machine Learning-Based Item Matching](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n", + "\n", + "While focused on retail, this accelerator has applications for any organization\n", + "working on entity matching, especially as it relates to items that might be stored\n", + "across locations. 
In this notebook, we demonstrate how to use machine learning\n", + "and the Databricks Lakehouse Platform to resolve differences between product\n", + "definitions and descriptions, and determine which items are likely pairs and\n", + "which are distinct across disparate data sets.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na21/entity-resolution-using-patient-records-at-cmmi)**\n", + "\n", + "In this talk, NewWave shares the specifics on CMS’s entity resolution use case,\n", + "the ML necessary for this data and the unique uses of Databricks in providing\n", + "this capability.\n", + "\n", + "##### Sample workflow\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Geospatial Analytics\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Broad range of analytics capabilities**\n", + "Enterprises require a diverse set of data applications — including SQL-based\n", + "analytics, real-time monitoring, data science and machine learning — to support\n", + "geospatial workloads given the diverse nature of the data and use cases.\n", + "\n", + "##### Solution overview\n", + "\n", + "With Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\n", + "workloads, as it provides a single source of truth for all types of structured,\n", + "unstructured, streaming and batch data, enabling seamless spatio-temporal\n", + "unification and cross-querying with tabular and raster-based data. Built on\n", + "Apache Spark, the Lakehouse easily scales for data sets consisting of billions\n", + "of rows of data with distributed processing in the cloud. To expand on the core\n", + "capabilities of the Lakehouse, Databricks has introduced the Mosaic library,\n", + "an extension to the Apache Spark framework, built for fast and easy processing\n", + "of large geospatial data sets. Popular frameworks such as Apache Sedona or\n", + "GeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\n", + "Lakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\n", + "to support all types of geospatial use cases.\n", + "\n", + "\n", + "Every day billions of handheld and IoT devices, along with thousands of\n", + "airborne and satellite remote sensing platforms, generate hundreds of exabytes\n", + "of location-aware data. This boom of geospatial big data combined with\n", + "advancements in machine learning is enabling government agencies to develop\n", + "new capabilities. The potential use cases for geospatial analytics and AI touch\n", + "every part of the government, including disaster recovery (e.g., flood/earthquake\n", + "mapping), defense and intel (e.g., detecting threats using drone footage),\n", + "infrastructure (e.g., public transportation planning), civilian safety (e.g., crime\n", + "prediction), public health (e.g., disease spread tracking), and much more. 
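As a rough illustration of the item-matching idea described above (deciding which records are likely pairs despite differing descriptions), the sketch below uses Spark ML tokenization plus MinHash LSH to surface candidate matches. It is a simplified stand-in, not the accelerator's actual method, and the sample rows are invented.

```python
from pyspark.sql import SparkSession
from pyspark.ml.feature import RegexTokenizer, HashingTF, MinHashLSH

spark = SparkSession.builder.getOrCreate()

# Invented sample records standing in for two item catalogs.
items_a = spark.createDataFrame(
    [(1, "acme stainless water bottle 750ml")], ["id", "description"])
items_b = spark.createDataFrame(
    [(101, "ACME Water Bottle, Stainless Steel, 750 ml")], ["id", "description"])

tokenizer = RegexTokenizer(inputCol="description", outputCol="tokens", pattern="\\W+")
hasher = HashingTF(inputCol="tokens", outputCol="features",
                   binary=True, numFeatures=1 << 16)
lsh = MinHashLSH(inputCol="features", outputCol="hashes", numHashTables=5)

feats_a = hasher.transform(tokenizer.transform(items_a))
feats_b = hasher.transform(tokenizer.transform(items_b))

model = lsh.fit(feats_a)
# Candidate pairs whose Jaccard distance falls below 0.6 are treated as likely matches.
pairs = model.approxSimilarityJoin(feats_a, feats_b, 0.6, distCol="jaccard_distance")
pairs.select("datasetA.id", "datasetB.id", "jaccard_distance").show()
```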
Every\n", + "agency at the state and federal level needs to consider how they can tap into\n", + "geospatial data.\n", + "\n", + "##### Challenges\n", + "\n", + "**Massive volumes of geospatial data**\n", + "With the proliferation of low-cost sensor arrays, GPS technologies and highresolution imaging organizations are collecting tens of TBs of geospatial data\n", + "daily, outpacing their ability to store and process this data at scale.\n", + "\n", + "\n", + "**Compute-intensive spatial workloads**\n", + "Geospatial data is complex in structure, with various formats not well suited for\n", + "legacy data warehouses, as well as being compute intensive, with geospatialspecific transformations and queries requiring hours and hours of compute.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "[Mosaic for Geospatial Analytics](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "\n", + "Build a Lakehouse to support all of your geospatial analytics and AI use cases\n", + "with the Mosaic library. Mosaic provides a number of capabilities including easy\n", + "conversion between common spatial data encodings, constructors to easily\n", + "generate new geometries from Spark native data types, many of the OGC SQL\n", + "standard ST_ functions implemented as Spark Expressions for transforming,\n", + "aggregating and joining spatial data sets, and optimizations for performing pointin-polygon joins using an approach we codeveloped with Ordnance Survey —\n", + "all provided with the flexibility of a Scala, SQL or Python API.\n", + "\n", + "[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "\n", + "Learn how to build powerful geospatial insights and visualizations with a\n", + "Lakehouse for all your geospatial data processing, analytics and AI.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**\n", + "\n", + "**Analyzing Flight Data to Improve Aviation**\n", + "To help airlines better serve their millions of passengers, USDOT built a\n", + "modern analytics architecture on Databricks that incorporates data such as\n", + "weather, flight, aeronautical and surveillance information. 
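For readers who want a feel for the Mosaic API mentioned above, here is a rough point-in-polygon sketch. It assumes a Databricks notebook with the `databricks-mosaic` package installed; the table names, columns and exact function names should be checked against the Mosaic documentation rather than taken as authoritative, and in practice Mosaic's grid indexing (rather than a plain cross join) is what makes these joins fast at scale.

```python
import mosaic as mos
from pyspark.sql import functions as F

# In a Databricks notebook, `spark` and `dbutils` are already defined.
mos.enable_mosaic(spark, dbutils)

events = spark.table("geo.events")   # placeholder: event_id, lon, lat
zones = spark.table("geo.zones")     # placeholder: zone_id, wkt_polygon (WKT string)

matched = (
    events.withColumn("point", mos.st_point(F.col("lon"), F.col("lat")))
          .crossJoin(zones)
          .where(mos.st_contains(F.col("wkt_polygon"), F.col("point")))
          .select("event_id", "zone_id")
)
```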
With this new\n", + "platform, they reduced compute costs by 90% and can now power use cases\n", + "such as predicting air cargo traffic patterns, flight delays and the financial\n", + "impact of flight cancellations.\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=LP198QMdDbY&t=1070s)**\n", + "\n", + "**Customer Story: Flood Prediction With Machine Learning**\n", + "In an effort to improve the safety of civil projects, Stantec built a machine\n", + "learning model on Databricks leveraging large volumes of weather and geological\n", + "data — oftentimes consisting of trillions of data points — to predict the impact\n", + "of flash floods on various regions and adjust civil planning accordingly.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "Mosaic Kepler Magics\n", + "Geometry Display Functions\n", + "for Map Display\n", + "\n", + "ESRI Java API for\n", + "Geometry Operations\n", + "\n", + "\n", + "Built-In Indexing\n", + "System Support\n", + "\n", + "\n", + "JTS Java API for\n", + "Geometry Operations\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Public Health Management\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "In their lifetime, every human is expected to generate a million gigabytes of\n", + "health data spanning electronic health records, medical images, claims, wearable\n", + "data, genomics and more. This data is critical to understanding the health of\n", + "the individual, but when aggregated and analyzed across large populations,\n", + "government agencies can glean important insights like disease trends, the\n", + "impact of various treatment guidelines and the effectiveness of resources. By\n", + "adding in [Social Determinants of Health (SDOH)](https://databricks.com/blog/2022/04/18/increasing-healthcare-equity-with-data.html) data — such as geographical\n", + "location, income level, education, housing — agencies can better identify\n", + "underserved communities and the critical factors that contribute to positive\n", + "health outcomes.\n", + "\n", + "##### Challenges\n", + "\n", + "**Rapidly growing health data**\n", + "Healthcare data is growing exponentially. Unfortunately, legacy on-premises data\n", + "architectures are complex to manage and too costly to scale for populationscale analytics.\n", + "\n", + "\n", + "**Complexities of ML in healthcare**\n", + "The legacy analytics platforms that underpin healthcare lack the robust data\n", + "science capabilities needed for predictive health use cases like disease risk\n", + "scoring. There’s also the challenge of managing reproducibility, which is critical\n", + "when building ML models that can impact patient outcomes.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables public health agencies to bring together all\n", + "their research and patient data in a HIPAA-certified environment and marry it\n", + "with powerful analytics and AI capabilities to deliver real-time and predictive\n", + "insights at population scale. The Lakehouse eliminates the need for legacy\n", + "data architectures, which have historically inhibited innovation in patient care\n", + "by creating data silos and making advanced analytics difficult. 
Databricks led\n", + "open source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\n", + "that make it easy to ingest and prepare healthcare-specific data modalities for\n", + "downstream analytics.\n", + "\n", + "\n", + "**Fragmented patient data**\n", + "It is widely accepted that over 80% of medical data is unstructured, yet most\n", + "organizations still focus their attention on data warehouses designed to only\n", + "support structured data and SQL-based analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "[NLP for Healthcare](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "\n", + "Our joint solutions with John Snow Labs bring together the power of Spark NLP\n", + "for Healthcare with the collaborative analytics and AI capabilities of Databricks.\n", + "Informatics teams can ingest raw unstructured medical text files into Databricks,\n", + "extract meaningful insights using natural language processing techniques,\n", + "and make the data available for downstream analytics. We have specific NLP\n", + "solutions for from lab reports, automating the deidentification of PHI and [extracting oncology insights](https://databricks.com/solutions/accelerators/nlp-oncology) [identifying adverse drug events](https://databricks.com/blog/2022/01/17/improving-drug-safety-with-adverse-event-detection-using-nlp.html) .\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n", + "[Disease Risk Prediction](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n", + "\n", + "One of the most powerful tools for identifying patients at risk for a chronic\n", + "condition is the analysis of real world data (RWD). This Solution Accelerator\n", + "notebook provides a template for building a machine learning model that\n", + "assesses the risk of a patient for a given condition within a given window of time\n", + "based on a patient’s encounter history and demographics information.\n", + "\n", + "\n", + "[Demo: Real-Time](https://www.youtube.com/watch?v=_ltDF2obiSc)\n", + "[COVID-19 Contact Tracing](https://www.youtube.com/watch?v=_ltDF2obiSc)\n", + "\n", + "Databricks COVID-19 surveillance solution takes a data-driven approach to\n", + "adaptive response, applying predictive analytics to COVID-19 data sets to\n", + "help drive more effective shelter-in-place policies.\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n", + "\n", + "**From Vaccine Management to ICU Planning**\n", + "During the pandemic, the Chesapeake Regional Information System for our\n", + "Patients implemented a modern data architecture on Databricks to address\n", + "critical reporting needs. 
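The disease risk prediction accelerator referenced above builds a model from encounter history plus demographics. A heavily simplified, hypothetical sketch of that shape follows; the table names, columns (including a precomputed `label` column and numeric `is_smoker` flag) and the 24-month lookback window are illustrative only.

```python
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.feature import VectorAssembler
from pyspark.ml.classification import LogisticRegression

spark = SparkSession.builder.getOrCreate()

# Placeholder tables; the RWD schema used in the accelerator will differ.
encounters = spark.table("health.rwd.encounters")      # patient_id, encounter_date, dx_code
demographics = spark.table("health.rwd.demographics")  # patient_id, age, is_smoker, label

# Summarize each patient's encounter history over a 24-month window.
features = (
    encounters
    .where(F.col("encounter_date") >= F.add_months(F.current_date(), -24))
    .groupBy("patient_id")
    .agg(F.count("*").alias("encounter_count"),
         F.countDistinct("dx_code").alias("distinct_dx"))
    .join(demographics, "patient_id")
)

assembler = VectorAssembler(
    inputCols=["encounter_count", "distinct_dx", "age", "is_smoker"],
    outputCol="features")
train_df = assembler.transform(features)

model = LogisticRegression(featuresCol="features", labelCol="label").fit(train_df)
```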
This allowed them to analyze 400 billion data points\n", + "\n", + "for innovative use cases like real-time disease spread tracking, vaccine\n", + "distribution and prioritizing vulnerable populations.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Conclusion\n", + "\n", + "Today, data is at the core of how government agencies operate and AI is at the\n", + "\n", + "forefront of driving innovation into the future. The Databricks Lakehouse for\n", + "\n", + "Public Sector enables government agencies at the federal, state and local level\n", + "\n", + "to harness the full power of data and analytics to solve strategic challenges and\n", + "\n", + "make smarter decisions that improve the safety and quality of life of all citizens.\n", + "\n", + "Get started with a free trial of Databricks Lakehouse and start building better\n", + "\n", + "data applications today.\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "###### Contact us for a personalized demo databricks.com/contact\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n", + "unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\n", + "mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
###### EBOOK\n", + "\n", + "# Lakehouse for Manufacturing\n", + "\n", + "###### Build a connected customer experience, optimize operations and unify your data ecosystem\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction .......................................................................................................................... **3**\n", + "\n", + "Manufacturing Transformation Trends .............................................................................. **5**\n", + "\n", + "Manufacturing Data Challenges ......................................................................................... **9**\n", + "\n", + "Databricks Lakehouse for Manufacturing ....................................................................... **10**\n", + "\n", + "Building Innovative Solutions on the Lakehouse ............................................................. **12**\n", + "\n", + "**SOLUTION:** Part-Level Demand Forecasting ....................................................................... 12\n", + "\n", + "**SOLUTION:** Overall Equipment Effectiveness & KPI Monitoring ............................................. 14\n", + "\n", + "**SOLUTION:** Digital Twins ................................................................................................... 15\n", + "\n", + "**SOLUTION:** Computer Vision ............................................................................................ 16\n", + "\n", + "An Ecosystem on the Lakehouse for Manufacturing ...................................................... **17**\n", + "\n", + "**SOLUTION:** Avanade Intelligent Manufacturing .................................................................. **18**\n", + "\n", + "**SOLUTION:** DataSentics Quality Inspector ........................................................................ **18**\n", + "\n", + "SOLUTION: Tredence Predictive Supply Risk Management ................................................. **19**\n", + "\n", + "Leading Manufacturing Companies That Choose Us ................................................... **20**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "Market conditions in manufacturing are more challenging than ever. Operating margins\n", + "and growth are impacted by the rising cost of labor, materials, energy and transportation, all\n", + "peaking at the same time. Disruptive events in the supply chain are increasing in frequency\n", + "and intensity, leading to significant revenue losses and damaged brand reputation.\n", + "\n", + "Effective acquisition and retention of next-generation talent is a considerable issue for\n", + "manufacturers. There are more jobs in the industry than there are people to do them, further\n", + "compounding the problem of slower than expected industrial productivity growth over the\n", + "last 15 years. The industry is also one of the largest consumers of energy, and faces a direct\n", + "challenge of transforming operations to be more sustainable as governments are prioritizing\n", + "net-zero policies that require a step change in energy efficiency and transition to low-carbon\n", + "energy sources.\n", + "\n", + "The manufacturing industry generates massive amounts of new data every day — estimated\n", + "to be two to four times more in size than in industries such as communications, media,\n", + "retail and financial services. 
This explosion of data has opened the door for the global manufacturing ecosystem to boost productivity, quality, sustainability and growth beyond what was previously thought possible.

Unfortunately, legacy data warehouse-based architectures weren’t built for the massive volumes and type of data coming in through today’s factories, products, processes and workers, let alone to support the advanced AI/ML use cases required to meet the customer expectations of shorter lead times, reliable delivery and smarter products.

-----

For that, companies need to adopt a modern data architecture that provides the speed, scale and collaboration needed by broad teams of data engineers, data scientists, and analysts. Manufacturers need a comprehensive data platform that can not only handle massive volumes of data, but effectively and seamlessly operationalize the value from data, analytics and AI.

This is achieved by:

Removing data silos by placing all data, regardless of type or frequency, in a single, open architecture — including unstructured data from sensors, telemetry, natural language logs, videos and images — helping you to gain end-to-end visibility into your business

Ensuring your data is “always on” so that the freshest and highest quality data is available for the full spectrum of enterprise analytics and AI/ML use cases, allowing you to drive IT-OT convergence

Having a comprehensive open architecture so IT and data teams can move with agility to bring AI and ML to where it’s needed, when it’s needed, including in connectivity-constrained environments

Maintaining fine-grained governance and access control on your data assets, protecting sensitive intellectual property and customer data

The Databricks Lakehouse for Manufacturing does just this. It’s a comprehensive approach that empowers teams in the industry to collaborate and innovate around data, analytics and AI. It eliminates the technical limitations of legacy technologies and gives data teams the ability to drive deeper, end-to-end insight into supply chains, automate processes to reduce costs and grow productivity, and achieve sustainable transformation for a more prosperous future. Welcome to the Lakehouse for Manufacturing.

-----

## Manufacturing Transformation Trends

The future of manufacturing is smart, sustainable and service oriented. Today’s forward-thinking leaders are preparing the foundation they need to support that future by leveraging fast and connected data from all corners of the enterprise. There are four key trends driving transformation in manufacturing:

**Boosting industrial productivity through automation**

A spike in labor costs, as well as the cost of energy and materials, puts significant pressure on operating margins. At the same time, industrial productivity has plateaued — it is at the same level today as it was in the late 2000s.
In the face\n", + "of these macro challenges and economic uncertainty, there has never been a\n", + "more burning need to reduce costs and improve productivity through greater\n", + "visibility and automation.\n", + "\n", + "The industry has made strides in collecting data from machines and performing\n", + "predictive analytics on sensor readings, with 47% of manufacturers citing the\n", + "use of predictive maintenance to reduce operational costs with considerable\n", + "upside ahead.\n", + "\n", + "However, there is an entirely different class of unstructured data in the form of\n", + "images, videos and LiDAR that is opening the door to game-changing automation\n", + "in quality inspection, flow optimization and production scheduling. Historically,\n", + "these critical processes have depended on manual and visual inspection of\n", + "products and operations, which is resource intensive and less accurate than\n", + "ML-driven computer vision techniques. This untapped data and capability\n", + "is allowing manufacturers to deliver higher product quality and deliver on\n", + "production demands using fewer resources. Andrew Ng, a machine learning\n", + "\n", + "\n", + "pioneer, rightly describes the massive opportunity for these technologies in\n", + "his quote: “It is incumbent on every CEO in any manufacturing or industrial\n", + "automation company to figure out how to make deep learning technology work\n", + "for your business.”\n", + "\n", + "**CUSTOMER STORY SPOTLIGHT:**\n", + "##### Corning\n", + "\n", + "#### $2 million in cost avoidance through \n", + "\n", + "manufacturing upset event reduction\n", + "\n", + "**Driving Better Efficiency in Manufacturing Process With ML**\n", + "\n", + "Corning has been one of the world’s leading innovators in materials science for\n", + "\n", + "nearly 200 years. Delivering high-quality products is a key objective across the\n", + "\n", + "company’s manufacturing facilities around the world, and it’s always on a mission\n", + "\n", + "to explore how ML can help deliver on that goal. Databricks has been central\n", + "\n", + "to the company’s digital transformation, as it provides a simplified and unified\n", + "\n", + "platform where teams can centralize all data and ML work. Now, they can train\n", + "\n", + "models, register them in MLflow, generate all additional artifacts — like exported\n", + "\n", + "formats — and track them in the same place as the base model.\n", + "\n", + "[LEARN MORE](https://www.databricks.com/blog/2023/01/05/how-corning-built-end-end-ml-databricks-lakehouse-platform.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Gaining end-to-end operations and**\n", + "**supply chain visibility**\n", + "\n", + "Modern customer expectations are forcing manufacturers to focus on more\n", + "customer-centric KPIs: quality, on-time commitments and speed of delivery.\n", + "That’s not to say that asset and labor efficiency are less important — however,\n", + "with customer expectations of shorter lead times and more reliable delivery,\n", + "the success measures in manufacturing are shifting to a mantra of “measure\n", + "what your customer values.”\n", + "\n", + "High-performing manufacturers that embed this deep into their operational\n", + "playbook also perform best on productivity and ROIC growth results, as\n", + "evidenced in a recent study by the World Economic Forum and the International\n", + "Centre of Industrial Transformation. The problem? 
In a post-pandemic world,\n", + "operations and supply chains are persistently constrained, with increasing\n", + "disruptions, spiraling costs and unpredictable performance. The business\n", + "impact is considerable — studies have shown that a 30-day disruption can\n", + "reduce EBITDA by 5% and impact annual revenue by as much as 20%.\n", + "\n", + "Manufacturing companies need to be able to deliver on customer expectations,\n", + "commitments and service levels, all while lowering costs and increasing\n", + "productivity. Manufacturers need an enterprise data platform that can provide\n", + "real-time visibility into order flows, production processes, supplier performance,\n", + "inventory and logistics execution, breaking down departmental silos to maximize\n", + "customer responsiveness, improve manufacturing agility and boost performance.\n", + "\n", + "\n", + "**Transforming your business model through**\n", + "**tech-fueled services**\n", + "\n", + "Servitization, defined as the process of building revenue streams from services,\n", + "has been trending for some time. The adaptation of the business model has\n", + "been considerably profitable: on average, services account for ~30% of industrial\n", + "manufacturing companies but contribute 60%+ of profit.\n", + "\n", + "In aftersale services, a clear customer preference for business outcome-based\n", + "offerings has emerged in almost every corner of the manufacturing industry.\n", + "The use of data, analytics and AI is foundational to delivering more personalized\n", + "customer outcomes, proactive field service delivery and differentiated missioncritical applications to their customers.\n", + "\n", + "With greater autonomy, connectivity and sensorization, manufacturers operate\n", + "in a paradigm where their products generate more and more data every second,\n", + "opening up numerous new addressable opportunities for value creation. The\n", + "business of manufacturing is no longer linear, and manufacturers will need to\n", + "reimagine their businesses to go beyond merely providing the primary unit of\n", + "production — the next SKU, machine, vehicle or airplane — and leverage this data\n", + "to operate a platform business with higher growth, stickier revenue streams and\n", + "greater resilience to demand shocks.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CUSTOMER STORY SPOTLIGHT:**\n", + "##### Rolls-Royce\n", + "\n", + "**Aerospace Goes Green With Data and AI**\n", + "\n", + "While most people think of luxury cars when they hear “Rolls-Royce,” the\n", + "\n", + "Civil Aerospace branch is its own company, having separated from the car\n", + "\n", + "manufacturing arm in 1971. The now wildly successful manufacturer of commercial\n", + "\n", + "airplane engines is a leader in its industry for innovation. Today, Rolls-Royce\n", + "\n", + "\n", + "_“We employed Databricks to optimize inventory planning using data and analytics,_\n", + "_positioning parts where they need to be, based on the insight we gain from our_\n", + "_connected engines in real time and usage patterns we see in our service network. 
This_\n", + "_has helped us minimize risks to engine availability, reduce lead times for spare parts_\n", + "_and drive more efficiency in stock turns — all of this enables us to deliver TotalCare,_\n", + "_the aviation industry’s leading Power-by-the-Hour (PBH) maintenance program.”_\n", + "\n", + "**S T U A R T H U G H E S**\n", + "\n", + "Chief Information and Digital Officer\n", + "Rolls-Royce Civil Aerospace\n", + "\n", + "\n", + "obtains information directly from the airlines’ engines and funnels it into the\n", + "\n", + "Databricks platform. This gives the company insights into how the engines are\n", + "\n", + "performing and ways to improve maintenance schedules, translating to less\n", + "\n", + "downtime, delays, and rerouting — all of which reduce carbon footprint.\n", + "\n", + "[LEARN MORE](https://www.wired.com/sponsored/story/how-tech-is-helping-to-save-the-world/)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Driving a more sustainable approach**\n", + "**to manufacturing**\n", + "\n", + "Global efforts on reducing greenhouse gas (GHG)\n", + "emissions are accelerating, with over 70 countries\n", + "representing more than 75% of global emissions\n", + "having signed agreements to reach net-zero\n", + "emissions by 2050. Manufacturing-centric sectors\n", + "are critical to achieving net-zero sustainability\n", + "commitments around the world, as they represent\n", + "over 50% of global energy consumption and\n", + "contribute to ~25% of global emissions.\n", + "\n", + "Those at the forefront of data, analytics and\n", + "AI are setting science-based targets and are\n", + "driving favorable sustainability outcomes today\n", + "by deriving better insights from their operations,\n", + "supply chains and the outcomes that their\n", + "products generate for their end customers.\n", + "\n", + "\n", + "**CUSTOMER STORY SPOTLIGHT:**\n", + "##### Shell\n", + "\n", + "**Delivering Innovative Energy Solutions for a Cleaner World**\n", + "\n", + "\n", + "Shell has been at the forefront of creating a cleaner tomorrow by investing in digital\n", + "\n", + "technologies to tackle climate change and become a net-zero emissions energy\n", + "\n", + "business. Across the business, they are turning to data and AI to improve operational\n", + "\n", + "efficiencies, drive customer engagement, and tap into new innovations like renewable\n", + "\n", + "energy. Hampered by large volumes of data, Shell chose Databricks to be one of\n", + "\n", + "the foundational components of its Shell.ai platform. Today, Databricks empowers\n", + "\n", + "hundreds of Shell’s engineers, scientists and analysts to innovate together as part of\n", + "\n", + "their ambition to deliver cleaner energy solutions more rapidly and efficiently.\n", + "\n", + "[LEARN MORE](https://www.google.com/url?q=https://www.databricks.com/customers/shell&sa=D&source=editors&ust=1679097620349908&usg=AOvVaw00lb46oTfGRpOREXOI1Ue3)\n", + "\n", + "_“Shell has been undergoing a digital transformation as part of our ambition to deliver more_\n", + "_and cleaner energy solutions. As part of this, we have been investing heavily in our data lake_\n", + "_architecture. Our ambition has been to enable our data teams to rapidly query our massive_\n", + "_data sets in the simplest possible way. The ability to execute rapid queries on petabyte_\n", + "_scale data sets using standard BI tools is a game changer for us. 
Our co-innovation_\n", + "_approach with Databricks has allowed us to influence the product road map, and we are_\n", + "_excited to see this come to market.”_\n", + "\n", + "\n", + "### Millions\n", + "of dollars saved in\n", + "potential engine\n", + "repair costs\n", + "\n", + "data team\n", + "### 250\n", + "members supporting\n", + "160+ high-value use\n", + "cases\n", + "\n", + "faster –\n", + "### 9x\n", + "5 minutes to validate\n", + "a label, reduced from\n", + "45 minutes\n", + "\n", + "\n", + "**D A N I E L J E AV O N S**\n", + "General Manager – Advanced Analytics CoE\n", + "\n", + "Shell\n", + "\n", + "\n", + "-----\n", + "\n", + "## Manufacturing Data Challenges\n", + "\n", + "\n", + "**Massive unstructured/OT data volumes**\n", + "\n", + "The industry is seeing immense growth in data volumes: much of this massive\n", + "growth is due to semi-structured and unstructured data from connected workers,\n", + "buildings, vehicles and factories. This growth in multi-modal data from IoT sensors,\n", + "process historians, product telemetry, images, cameras and perception systems\n", + "has outpaced legacy data warehouse-centric technologies. On-prem and cloud\n", + "data warehouse tech-based architectures are too complex and too costly for the\n", + "large and heterogeneous data sets prevalent in the industry.\n", + "\n", + "**Driving IT-OT convergence**\n", + "\n", + "The success and pace of data modernization efforts in manufacturing is so often\n", + "muted by critical data being stuck in multiple closed systems and proprietary\n", + "formats, making it difficult and cost-prohibitive to extract the full potential of IT\n", + "and OT data sets. In addition, data quality issues such as outdated or inaccurate\n", + "data can often lead to a disjointed and incomplete view of customers, operations\n", + "and assets. For years, companies have lacked a common foundation for complex\n", + "and heterogeneous manufacturing data — from IoT-generated data streams to\n", + "financial metrics stored in ERP applications — and it has impacted their ability to\n", + "provide the freshest, highest-quality and most complete data for analytics.\n", + "\n", + "\n", + "**Bringing AI/ML to where it’s needed**\n", + "\n", + "To realize the promise of AI/ML in manufacturing, machine learning models need\n", + "to be brought as close to the decision as possible, often at the edge in facilities\n", + "and locations with limited or intermittent connectivity to the internet or cloud.\n", + "This requires deployment flexibility to on-premises or edge devices, with an\n", + "experience comparable to that in the cloud.\n", + "\n", + "**Inability to innovate at scale**\n", + "\n", + "CDOs want to be able to quickly and efficiently reproduce successes at global\n", + "scale. Technical and business users want to simply and quickly know what data\n", + "sets are available to solve the business issue at hand. Analysts want flexibility to\n", + "use the tools they are most familiar with in order to stay responsive to business\n", + "needs. 
Fragmented approaches to architecture and tooling make scaling\n", + "business impact very difficult, which results in talent churn, slower development\n", + "and duplicative efforts — all leading to higher costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Databricks Lakehouse for Manufacturing\n", + "\n", + "**Deliver personalized outcomes and frictionless experiences**\n", + "\n", + "**Millions of assets streaming IoT data**\n", + "\n", + "**5%–10% reduction in unplanned downtime and cost**\n", + "\n", + "**Accurate prices across 1,000s of locations and millions of dealers**\n", + "\n", + "**200%+ increase in offer conversion rates**\n", + "\n", + "With Databricks Lakehouse for Manufacturing, manufacturers can gain a\n", + "single view of their customers that combines data from each stage of the\n", + "customer journey. With a 360-degree view in place, manufacturers can drive\n", + "more differentiated sales strategies and precise service outcomes in the\n", + "field, delivering higher revenue growth, profitability and CSAT scores.\n", + "\n", + "With the Databricks Lakehouse, you can analyze product telemetry data,\n", + "customer insights and service networks to deliver highest uptime, quality of\n", + "service and economic value through the product lifecycle.\n", + "\n", + "**Optimize the supply chain, production processes and fulfillment logistics**\n", + "\n", + "**with real-time analytics and AI.**\n", + "\n", + "The Databricks Lakehouse for Manufacturing is the only enterprise data platform\n", + "that helps manufacturing organizations optimize their supply chains, boost\n", + "product innovation, increase operational efficiencies, predict fulfillment needs\n", + "and reduce overall costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Gain real-time insight for agile manufacturing and logistics**\n", + "\n", + "**30%–50% improvement in forecast accuracy**\n", + "\n", + "**90% lower cost for new manufacturing line**\n", + "\n", + "**4%–8% reduction in logistics costs**\n", + "\n", + "**10% improvement in carbon footprint**\n", + "\n", + "The Databricks Lakehouse lets you build a resilient and predictive supply\n", + "chain by eliminating the trade-off between accuracy or depth of analysis\n", + "and time. With scalable, fine-grained forecasts to predict or sense demand,\n", + "or perform supply chain planning and optimization, Databricks improves\n", + "accuracy of decisions, leading to higher revenue growth and lower costs.\n", + "\n", + "The lakehouse provides an “always on” architecture that makes IT-OT\n", + "convergence a reality, by continuously putting all data to work regardless of the\n", + "frequency at which it arrives (periodic, event-driven or real-time streaming)\n", + "and creates valuable data products that can empower decision makers. This\n", + "creates real-time insight into performance with data from connected factory\n", + "equipment, order flows and production processes to drive the most effective\n", + "resource scheduling.\n", + "\n", + "\n", + "**Empower the manufacturing workforce of the future**\n", + "\n", + "**25% improvement in data team productivity**\n", + "\n", + "**50x faster time to insight**\n", + "\n", + "**50% reduction in workplace injuries**\n", + "\n", + "With Databricks, manufacturers can increase the impact and decrease the\n", + "time-to-value of their data assets, ultimately making data and AI central to every\n", + "part of their operation. 
And by empowering data teams across engineering,\n", + "analytics and AI to work together, Databricks frees up employees to self-serve\n", + "and focus on realizing maximum business value — improving product quality,\n", + "reducing downtime and exceeding customer expectations.\n", + "\n", + "**Execute product innovation at the speed of data**\n", + "\n", + "**90% decrease in time to market of new innovations**\n", + "\n", + "**20x faster data processing of vehicle and road data**\n", + "\n", + "It is critical that manufacturers are offering the most desirable value\n", + "propositions so end consumers don’t look elsewhere. By tapping into product\n", + "performance and attribute data along with market trends and operations\n", + "information, manufacturers can make strategic decisions.\n", + "\n", + "With Databricks, manufacturers can decrease time to market with new products\n", + "to increase sales by analyzing customer behavior and insights (structured,\n", + "unstructured and semi-structured), product telemetry (streaming, RFID, computer\n", + "vision) and digital twins, and leveraging that data to drive product decisions.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Building Innovative Solutions on the Lakehouse\n", + "\n", + "\n", + "The flexibility of the Databricks Lakehouse Platform means that you can start\n", + "with the use case that will have the most impact on your business. Through\n", + "our experience working with some of the largest and most cutting-edge\n", + "manufacturers in the world, we’ve developed Solution Accelerators based\n", + "on the most common needs of manufacturers to help you get started. These\n", + "purpose-built guides — fully functional notebooks and best practices — speed\n", + "up results across your most common and high-impact use cases. Go from idea\n", + "to proof of concept (PoC) in as little as two weeks. Check out the full list of\n", + "Solution Accelerators [here](https://www.databricks.com/solutions/accelerators) .\n", + "\n", + "**S O L U T I O N**\n", + "**Part-Level Demand**\n", + "**Forecasting**\n", + "\n", + "\n", + "Demand forecasting is a critical business process for manufacturing and\n", + "supply chains. McKinsey estimates that over the next 10 years, supply\n", + "chain disruptions can cost close to half (~45%) of a year’s worth of profits\n", + "for companies. Having accurate and up-to-date forecasts is vital to plan\n", + "the scaling of manufacturing operations, ensure sufficient inventory and\n", + "guarantee customer fulfillment.\n", + "\n", + "In recent years, manufacturers have been investing heavily in quantitativebased forecasting that is driven by historical data and powered using either\n", + "statistical or machine learning techniques. 
Benefits include:\n", + "\n", + "**•** Better sales planning and revenue forecasting\n", + "\n", + "**•** Optimized safety stock to maximize turn-rates and\n", + "service-delivery performance\n", + "\n", + "**•** Improved production planning by tracing back\n", + "production outputs to raw material levels\n", + "\n", + "**A disruption lasting just 30 days or less could**\n", + "\n", + "**equal losses of** **3%-5% of EBITDA.**\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks Lakehouse can enable large-scale forecasting solutions to help\n", + "manufacturers navigate the most common data challenges when trying to\n", + "forecast demand.\n", + "\n", + "**C O M M O N U S E C A S E S :**\n", + "\n", + "Scalable, accurate forecasts across large numbers of store-item\n", + "combinations experiencing intermittent demand\n", + "\n", + "Automated model selection to ensure the best model is selected\n", + "for each store-item combination\n", + "\n", + "Metrics to identify the optimal frequency with which to generate\n", + "new predictions\n", + "\n", + "Manage material shortages and predict overplanning\n", + "\n", + "**Try our** **[Parts-Level Solution Accelerator](https://www.databricks.com/solutions/accelerators/demand-forecasting)** **to facilitate**\n", + "\n", + "**fine-grained demand forecasts and planning.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**S O L U T I O N**\n", + "**Overall Equipment Effectiveness**\n", + "**& KPI Monitoring**\n", + "\n", + "\n", + "​The need to monitor and measure manufacturing equipment performance is\n", + "critical for operational teams within manufacturing. Today, Overall Equipment\n", + "Effectiveness (OEE) is considered the standard for measuring manufacturing\n", + "equipment productivity. According to Engineering USA, an OEE value of 85% or\n", + "above is considered world-leading. However, many manufacturers typcially achieve\n", + "a range of between 40% and 60%. Reasons for underachievement often include:\n", + "\n", + "**•** Delayed inputs due to manual processes that are prone to human error\n", + "\n", + "**•** Bottlenecks created by data silos, impeding the flow of fresh data to\n", + "stakeholders\n", + "\n", + "**•** A lack of collaboration capabilities, keeping stakeholders from working on the\n", + "same information at the same time\n", + "\n", + "**Poor OEE value** **can be a result of poor parts quality, slow**\n", + "**production performance and production availability issues.**\n", + "\n", + "Databricks Lakehouse can help manufacturers maneuver through the\n", + "challenges of ingesting and converging operational technology (OT) data with\n", + "traditional data from IT systems to build forecasting solutions.\n", + "\n", + "**C O M M O N U S E C A S E S**\n", + "\n", + "Incrementally ingest and process sensor data from IoT devices\n", + "in a variety of formats\n", + "\n", + "Compute and surface KPIs and metrics to drive valuable insights\n", + "\n", + "Optimize plant operations with data-driven decisions\n", + "\n", + "**Try our** **[Solution Accelerator for OEE and KPI Monitoring](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)** **for**\n", + "**performant and scalable end-to-end monitoring.**\n", + "\n", + "\n", + "-----\n", + "\n", + "Market dynamics and volatility are requiring manufacturers to bring products to\n", + "market more quickly, optimize production processes and build agile supply chains\n", + "at scale at a lower price. 
To do so, many manufacturers have turned to building\n", + "digital twins, which are virtual representations of objects, products, pieces of\n", + "equipment, people, processes or even complete manufacturing ecosystems.\n", + "\n", + "Digital twins provide insights — derived from sensors (often IoT or IIoT) that\n", + "are embedded in the original equipment — that have the potential to transform\n", + "the manufacturing industry by driving greater efficiency, reducing costs and\n", + "improving quality.\n", + "\n", + "\n", + "**S O L U T I O N**\n", + "**Digital Twins**\n", + "\n", + "\n", + "**Digital twin technologies can improve product**\n", + "\n", + "**quality by** **up to 25%.**\n", + "\n", + "Databricks Lakehouse can bring digital twins to life through fault-tolerant\n", + "processing of streaming workloads generated by IoT sensor data and complex\n", + "event processing (important for modeling physical processes).\n", + "\n", + "**C O M M O N U S E C A S E S**\n", + "\n", + "Process real-world data in real time\n", + "\n", + "Compute insights at scale and deliver to multiple downstream applications\n", + "\n", + "Optimize plant operations with data-driven decisions\n", + "\n", + "**Try our** **[Solution Accelerator for Digital Twins](https://www.databricks.com/solutions/accelerators/digital-twins)** **to accelerate**\n", + "**time to market of new innovations.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**S O L U T I O N**\n", + "**Computer Vision**\n", + "\n", + "The rise in computer vision has been fueled by the rapid developments in\n", + "neural network technologies, which use AI to better understand and interpret\n", + "images with near-perfect precision. In manufacturing, computer vision can\n", + "transform operations by, for example, identifying product defects to improve\n", + "quality control, detecting safety hazards on the production floor, and tracking\n", + "and managing inventory levels.\n", + "\n", + "**As per the American Society for Quality, cost of poor quality for**\n", + "\n", + "**companies can be as high as** **20% of revenue.**\n", + "\n", + "\n", + "Databricks Lakehouse can easily ingest complex, unstructured image and video\n", + "data at massive scale. Through the most popular computer vision libraries, data\n", + "teams can scale AI models that leverage computer vision to recognize patterns,\n", + "detect objects and make predictions with 99% accuracy.\n", + "\n", + "**C O M M O N U S E C A S E S**\n", + "\n", + "Quickly identify defects and ensure that products and processes meet\n", + "quality standards\n", + "\n", + "Automate positioning and guidance to ensure that parts and products are\n", + "properly aligned and assembled\n", + "\n", + "Predict maintenance issues to reduce downtime and maintenance costs,\n", + "improve parts reliability, and increase safety for workers\n", + "\n", + "**Try our** **[Solution Accelerator for Computer Vision](https://www.databricks.com/blog/2021/12/17/enabling-computer-vision-applications-with-the-data-lakehouse.html)** **to improve**\n", + "**efficiency, reduce costs and enhance overall safety.**\n", + "\n", + "\n", + "-----\n", + "\n", + "## An Ecosystem on the Lakehouse for Manufacturing\n", + "\n", + "We’ve partnered with leading consulting firms and\n", + "independent software vendors to deliver innovative,\n", + "manufacturing-specific solutions. Databricks\n", + "Brickbuilder Solutions help you cut costs and\n", + "increase value from your data. 
Backed by decades\n", + "of industry expertise — and built for the Databricks\n", + "Lakehouse Platform — Brickbuilder Solutions are\n", + "tailored to your exact needs.\n", + "\n", + "We also work with technology partners like Alteryx,\n", + "AtScale, Fivetran, Microsoft Power BI, Qlik, Sigma,\n", + "Simplement, Tableau and ThoughtSpot to accelerate\n", + "the availability and value of data. This allows\n", + "businesses to unify data from complex source\n", + "systems and operationalize it for analytics, AI and\n", + "ML on the Databricks Lakehouse Platform.\n", + "\n", + "\n", + "-----\n", + "\n", + "**S O L U T I O N**\n", + "**Avanade Intelligent Manufacturing**\n", + "\n", + "Every year, businesses lose millions of dollars due to equipment failure,\n", + "unscheduled downtime and lack of control in maintenance scheduling. Along\n", + "with lost dollars, businesses will experience lower employee morale when\n", + "stations are in and out of service. Avanade’s Intelligent Manufacturing solution\n", + "supports connected production facilities and assets, workers, products and\n", + "consumers to create value through enhanced insights and improved outcomes.\n", + "Manufacturers can harness data to drive interoperability and enhanced insights\n", + "at scale using analytics and AI. Outcomes include improvements across\n", + "production (e.g., uptime, quantity and yield), better experiences for workers,\n", + "and greater insight into what customers want.\n", + "\n", + "**Try our joint solution,** **[Intelligent Manufacturing](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/avanade-intelligent-manufacturing)** **, to drive value and**\n", + "**operationalize team coordination and productivity.**\n", + "\n", + "\n", + "**S O L U T I O N**\n", + "**DataSentics Quality Inspector**\n", + "\n", + "Quality control is a crucial aspect of any production process, but traditional\n", + "methods can be time-consuming and prone to human error. Quality\n", + "Inspector by DataSentics, an Atos company, offers a solution that is\n", + "both efficient and reliable. With out-of-the-box models for visual quality\n", + "inspection, which are tailored to meet specific business requirements,\n", + "organizations will experience stable, scalable quality control that’s easy to\n", + "improve over time. Quality Inspector is an end-to-end solution that can be\n", + "seamlessly integrated into an existing setup, delivering high performance\n", + "and reliability.\n", + "\n", + "**Try our joint solution,** **[Quality Inspector](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to automate production quality**\n", + "**control with an increase in accuracy and quicker time to value.**\n", + "\n", + "\n", + "-----\n", + "\n", + "TREDENCE PSRM_1”: PREDICT SUPPLY RISK\n", + "\n", + "TREDENCE PSRM_2”: REAL-TIME SHIPMENT VISIBILITY\n", + "\n", + "TREDENCE PSRM_3”: DELAY ALERTS\n", + "\n", + "\n", + "**S O L U T I O N**\n", + "**Tredence Predictive Supply Risk Management**\n", + "\n", + "Customers today are faced with multiple supply risks including lack of\n", + "in-transit visibility, disruptions caused by weather, local events, among\n", + "others. 
Tredence’s Predictive Supply Risk Management solution, built on\n", + "the Databricks Lakehouse Platform, helps businesses meet supply risk\n", + "challenges by providing a scalable, cloud-based solution that can be\n", + "tailored to the specific needs of each organization. The platform’s flexibility\n", + "and scalability allow businesses to keep pace with changing regulations\n", + "and customer demands, while their comprehensive suite of tools helps\n", + "identify and mitigate risks across the enterprise.\n", + "\n", + "**Try our joint solution,** **[Predictive Supply Risk Management](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to**\n", + "**predict order delays, identify root causes and quantify supply**\n", + "**chain impact.**\n", + "\n", + "Visit our [site](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview) to learn more about our Databricks Partner Solutions.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Leading Manufacturing Companies That Choose Us\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks is the lakehouse company. More than 9,000 organizations worldwide\n", + "\n", + "— including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the\n", + "\n", + "Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe. Founded by the\n", + "\n", + "original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission\n", + "\n", + "to help data teams solve the world’s toughest problems. To learn more, follow\n", + "\n", + "Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "###### Get started with a free trial of Databricks and start building data applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=634147899783&utm_term=try%20databricks&gclid=CjwKCAiAr4GgBhBFEiwAgwORrTnkJaDf9SpIDy2RxOV28a2G2HtUDvJnLXiVWBsqcAWa_XmSvabkVRoCiwgQAvD_BwE#account)**\n", + "\n", + "To learn more, visit us at:\n", + "**[Manufacturing Industry Solutions](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Lakehouse-for-Manufacturing.pdf2024-09-19T16:57:19Z
**2 0 2 0 E D I T I O N** | U P D A T E D

# Standardizing the Machine Learning Lifecycle

### From experimentation to production with MLflow

[mlflow.org](https://mlflow.org)

-----

**M A C H I N E L E A R N I N G L I F E C Y C L E**

#### Contents

Chapter 1: Machine Learning Lifecycle Challenges 3

Chapter 2: Applying Good Engineering Principles to Machine Learning 7

Chapter 3: Introducing MLflow 9

Chapter 4: A Closer Look at MLflow Model Registry 16

Chapter 5: Making Organizations Successful with ML 19

Chapter 6: Introducing the Unified Data Analytics Platform 20

Chapter 7: Standardizing the Machine Learning Lifecycle on Databricks 25

Chapter 8: Getting Started 26

Chapter 9: Comparison Matrix 27

#### Preface

##### Technology changes quickly. Data science and machine learning (ML) are moving even faster. In the short time since we first published this eBook, businesses across industries have rapidly matured their machine learning operations (MLOps) — implementing ML applications and moving their first models into production. This has turned ML models into corporate assets that need to be managed across the lifecycle.

That’s why MLflow, an open-source platform developed by Databricks, has emerged as a leader in automating the end-to-end ML lifecycle. With 1.8 million downloads a month — and growing support in the developer community — this open-source platform is simplifying the complex process of standardizing and productionizing MLOps. This updated eBook explores the advantages of MLflow and introduces you to the newest component: MLflow Model Registry. You’ll also discover how MLflow fits into the Databricks Unified Data Analytics Platform for data engineering, science and analytics.

-----

**M A C H I N E L E A R N I N G L I F E C Y C L E**

CHAPTER 1: **Machine Learning**

#### Lifecycle Challenges

Building machine learning models is hard. Putting them into production is harder. Enabling others — data scientists, engineers or even yourself — to reproduce your pipeline and results is equally challenging. How many times have you or your peers had to discard previous work because it was either not documented properly or too difficult to replicate?

Getting models up to speed in the first place is significant enough that it can be easy to overlook long-term management. What does this involve in practice? In essence, we have to compare the results of different versions of ML models along with corresponding artifacts — code, dependencies, visualizations, intermediate data and more — to track what’s running where, and to redeploy and roll back updated models as needed.
Each of these requires its own specific tools, and it’s these changes that make the ML lifecycle\n", + "\n", + "so challenging compared with traditional software development lifecycle (SDLC) management.\n", + "\n", + "This represents a serious shift and creates challenges compared with a more traditional software\n", + "\n", + "development lifecycle for the following reasons:\n", + "\n", + "\n", + "The diversity and number of ML\n", + "\n", + "tools involved, coupled with a\n", + "\n", + "lack of standardization across\n", + "\n", + "ML libraries and frameworks\n", + "\n", + "\n", + "The continuous nature of ML\n", + "\n", + "development, accompanied by a\n", + "\n", + "lack of tracking and management\n", + "\n", + "tools for machine learning models\n", + "\n", + "and experiments\n", + "\n", + "\n", + "The complexity of productionizing\n", + "\n", + "ML models due to the lack of\n", + "\n", + "integration among data pipelines,\n", + "\n", + "ML environments and production\n", + "\n", + "services\n", + "\n", + "\n", + "Let’s look at each of these areas in turn.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The diversity and number of ML tools involved\n", + "\n", + "\n", + "While the traditional software development process leads to the\n", + "\n", + "rationalization and governance of tools and platforms used for developing and\n", + "\n", + "managing applications, the ML lifecycle relies on data scientists’ ability to use\n", + "\n", + "multiple tools, whether for preparing data and training models, or deploying\n", + "\n", + "them for production use. Data scientists will seek the latest algorithms from\n", + "\n", + "\n", + "However, due to the variety of available tools and the lack of detailed tracking,\n", + "\n", + "teams often have trouble getting the same code to work again in the same way.\n", + "\n", + "Reproducing the ML workflow is a critical challenge, whether a data scientist\n", + "\n", + "needs to pass training code to an engineer for use in production or go back to\n", + "\n", + "past work to debug a problem.\n", + "\n", + "\n", + "the most up-to-date ML libraries and frameworks available to compare results\n", + "\n", + "and improve performance.\n", + "\n", + "**PREP DATA** **BUILD MODEL** **DEPLOY MODEL**\n", + "\n", + "Azure ML\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The continuous nature of ML development\n", + "\n", + "Technology never stands still. New data, algorithms,\n", + "\n", + "libraries and frameworks impact model performance\n", + "\n", + "continuously and, thus, need to be tested. Therefore,\n", + "\n", + "machine learning development requires a continuous\n", + "\n", + "\n", + "approach, along with tracking capabilities to\n", + "\n", + "compare and reproduce results. 
The performance\n", + "\n", + "of ML models depends not only on the algorithms\n", + "\n", + "used, but also on the quality of the data sets and the\n", + "\n", + "parameter values for the models.\n", + "\n", + "\n", + "**P R E P**\n", + "**D ATA**\n", + "\n", + "**B U I L D**\n", + "**M O D E L**\n", + "\n", + "\n", + "Whether practitioners work alone or on teams, it’s\n", + "\n", + "still very difficult to track which parameters, code\n", + "\n", + "and data went into each experiment to produce a\n", + "\n", + "model, due to the intricate nature of the ML\n", + "\n", + "lifecycle itself.\n", + "\n", + "**D E P L O Y**\n", + "**M O D E L**\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The complexity of productionizing ML models\n", + "\n", + "\n", + "In software development, the architecture is set early on, based on the target\n", + "\n", + "application. Once the infrastructure and architecture have been chosen, they\n", + "\n", + "won’t be updated or changed due to the sheer amount of work involved in\n", + "\n", + "rebuilding applications from scratch. Modern developments, such as the move\n", + "\n", + "to microservices, are making this easier, but for the most part, SDLC focuses on\n", + "\n", + "maintaining and improving what already exists.\n", + "\n", + "\n", + "One of today’s key challenges is to effectively transition models from\n", + "\n", + "experimentation to staging and production — without needing to rewrite the code\n", + "\n", + "for production use. This is time-consuming and risky as it can introduce new\n", + "\n", + "bugs. There are many solutions available to productionize a model quickly, but\n", + "\n", + "practitioners need the ability to choose and deploy models across any platform,\n", + "\n", + "and scale resources as needed to manage model inference effectively on big data,\n", + "\n", + "in batch or real time.\n", + "\n", + "\n", + "With machine learning the first goal is to build a model. And keep in mind: a\n", + "\n", + "model’s performance in terms of accuracy and sensitivity is agnostic from the\n", + "\n", + "deployment mode. However, models can be heavily dependent on latency, and\n", + "\n", + "the chosen architecture requires significant scalability based on the business\n", + "\n", + "application. End-to-end ML pipeline designs can be great for batch analytics and\n", + "\n", + "looking at streaming data, but they can involve different approaches for real-time\n", + "\n", + "scoring when an application is based on a microservice architecture working via\n", + "\n", + "REST APIs, etc.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 2: **\u0007** **Applying Good Engineering**\n", + "\n", + "#### Principles to Machine Learning\n", + "\n", + "\n", + "Many data science and machine learning projects fail due to preventable issues that have been resolved\n", + "\n", + "in software engineering for more than a decade. However, those solutions need to be adapted due to key\n", + "\n", + "differences between developing code and training ML models.\n", + "\n", + "- \u0007 **Expertise, code and data** — With the addition of data, data science and ML, code not only needs to deal\n", + "\n", + "with data dependencies but also handle the inherent nondeterministic characteristics of statistical\n", + "\n", + "modeling. 
ML models are not guaranteed to behave the same way when trained twice, unlike traditional\n", + "\n", + "code, which can be easily unit tested.\n", + "\n", + "- \u0007 **Model artifacts** — In addition to application code, ML products and features also depend on models\n", + "\n", + "that are the result of a training process. Those model artifacts can often be large — on the order of\n", + "\n", + "gigabytes — and often need to be served differently from code itself.\n", + "\n", + "- \u0007 **Collaboration** — In large organizations, models that are deployed in an application are usually not trained\n", + "\n", + "by the same people responsible for the deployment. Handoffs between experimentation, testing and\n", + "\n", + "production deployments are similar but not identical to approval processes in software engineering.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The need for standardization\n", + "\n", + "Some of the world’s largest tech companies have already begun solving these problems internally with\n", + "\n", + "their own machine learning platforms and lifecycle management tools. 2 These internal platforms have\n", + "\n", + "been extremely successful and are designed to accelerate the ML lifecycle by standardizing the process of\n", + "\n", + "data preparation, model training, and deployment via APIs built for data scientists. The platforms not only\n", + "\n", + "help standardize the ML lifecycle but also play a major role in retaining knowledge and best practices, and\n", + "\n", + "maximizing data science team productivity and collaboration, thereby leading to greater ROI.\n", + "\n", + "Internally driven strategies still have limitations. First, they are limited to a few algorithms or frameworks.\n", + "\n", + "Adoption of new tools or libraries can lead to significant bottlenecks. Of course, data scientists always\n", + "\n", + "want to try the latest and the best algorithms, libraries and frameworks — the most recent versions of\n", + "\n", + "PyTorch, TensorFlow and so on. Unfortunately, production teams cannot easily incorporate these into\n", + "\n", + "the custom ML platform without significant rework. The second limitation is that each platform is tied\n", + "\n", + "to a specific company’s infrastructure. This can limit sharing of efforts among data scientists. As each\n", + "\n", + "framework is so specific, options for deployment can be limited.\n", + "\n", + "The question then is: Can similar benefits to these systems be provided in an open manner? This evaluation\n", + "\n", + "must be based on the widest possible mix of tools, languages, libraries and infrastructures. Without this\n", + "\n", + "approach, it will be very difficult for data scientists to evolve their ML models and keep pace with industry\n", + "\n", + "developments. Moreover, by making it available as open source, the wider industry will be able to join in and\n", + "\n", + "contribute to ML’s wider adoption. This also makes it easier to move between various tools and libraries\n", + "\n", + "over time.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 3: **\u0007** **Introducing MLflow**\n", + "\n", + "**M AT E I Z A H A R I A**\n", + "\n", + "Co-founder and Chief Technologist at Databricks\n", + "\n", + "\n", + "At Databricks, we believe that there should be a better way to manage the ML lifecycle. 
So in June 2018,\n", + "\n", + "we unveiled [MLflow](https://mlflow.org/) , an open-source machine learning platform for managing the complete ML lifecycle.\n", + "\n", + "###### “MLflow is designed to be a cross-cloud, modular, API-first framework, to work well with\n", + " all popular ML frameworks and libraries. It is open and extensible by design, and platform\n", + " agnostic for maximum flexibility.”\n", + "\n", + "With MLflow, data scientists can now package code as reproducible runs, execute and\n", + "\n", + "compare hundreds of parallel experiments, and leverage any hardware or software platform\n", + "\n", + "for training, hyperparameter tuning and more. Also, organizations can deploy and manage\n", + "\n", + "models in production on a variety of clouds and serving platforms.\n", + "\n", + "###### “ With MLflow, data science teams can systematically package and reuse models\n", + " across frameworks, track and share experiments locally or in the cloud, and deploy\n", + " models virtually anywhere,” says Zaharia. “The flurry of interest and contributions we’ve\n", + " seen from the data science community validates the need for an open-source framework to\n", + " streamline the machine learning lifecycle.”\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Key benefits\n", + "\n", + "**EXPERIMENT TRACKING** As mentioned previously, getting ML models to perform takes significant trial and error, and continuous configuration, building, tuning, testing,\n", + "\n", + "etc. Therefore, it is imperative to allow data science teams to track all that goes into a specific run, along with the results. With MLflow, data scientists can quickly record\n", + "\n", + "runs and keep track of model parameters, results, code and data from each experiment, all in one place.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Key benefits\n", + "\n", + "\n", + "**FLEXIBLE DEPLOYMENT** There is virtually no limit to what machine learning can\n", + "\n", + "do for your business. However, there are different ways to architect ML applications\n", + "\n", + "for production, and various tools can be used for deploying models, which often\n", + "\n", + "lead to code rewrites prior to deploying ML models into production. With MLflow,\n", + "\n", + "your data scientists can quickly download or deploy any saved models to various\n", + "\n", + "platforms — locally or in the cloud — from experimentation to production.\n", + "\n", + "\n", + "**REPRODUCIBLE PROJECTS** The ability to reproduce a project — entirely or just\n", + "\n", + "parts of it — is key to data science productivity, knowledge sharing and, hence,\n", + "\n", + "accelerating innovation. With MLflow, data scientists can build and package\n", + "\n", + "composable projects, capture dependencies and code history for reproducible\n", + "\n", + "results, and quickly share projects with their peers.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Key benefits\n", + "\n", + "**MODEL MANAGEMENT** Use one central place to share ML models, collaborate on moving them from experimentation to online testing and production, integrate with\n", + "\n", + "approval and governance workflows, and monitor ML deployments and their performance. 
This is powered by the latest MLflow component, MLflow Model Registry.

[Diagram: model deployment and monitoring. A single MLflow Model format with simple model flavors (Flavor 1, Flavor 2) usable by many tools, taking models from in-line code and ML libraries to containers, batch and stream scoring, and cloud inference services.]

-----

**M A C H I N E L E A R N I N G L I F E C Y C L E**

###### Use case examples

Let’s examine three use cases to explore how users can leverage some of the MLflow components.

**EXPERIMENT TRACKING** A European energy company is using MLflow to track and update hundreds of energy-grid models. This company’s goal is to build a time-series model for every major energy producer (e.g., power plant) and consumer (e.g., factory), monitor these models using standard metrics, and combine the predictions to drive business processes, such as pricing. Because a single team is responsible for hundreds of models, possibly using different ML libraries, it’s important to have a standard development and tracking process. The team has standardized on Jupyter notebooks for development, MLflow Tracking for metrics, and Databricks Jobs for inference.

**REPRODUCIBLE PROJECTS** An online marketplace is using MLflow to package deep learning jobs using Keras and run them in the cloud. Each data scientist develops models locally on a laptop using a small data set, checks them into a Git repository with an MLproject file, and submits remote runs of the project to GPU instances in the cloud for large-scale training or hyperparameter search. Using MLflow Projects makes it easy to create the same software environment in the cloud and share project code among data scientists.

**MODEL PACKAGING** An e-commerce site’s data science team is using MLflow Model Registry to package recommendation models for use by application engineers. This presents a technical challenge because the recommendation application includes both a standard, off-the-shelf recommendation model and custom business logic for pre- and post-processing. For example, the application might include custom code to ensure the recommended items are diverse. This business logic needs to change in sync with the model, and the data science team wants to control both the business logic and the model, without having to submit a patch to the web application each time the logic has to change. Moreover, the team wants to A/B test distinct models with distinct versions of the processing logic.
The\n", + "\n", + "solution was to package both the recommendation\n", + "\n", + "model and the custom logic using the python_\n", + "\n", + "function flavor in an MLflow Model, which can then\n", + "\n", + "be deployed and tested as a single unit.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Open and extensible by design\n", + "\n", + "Since we [unveiled](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) and open sourced MLflow in June 2018 at the Spark + AI Summit in San Francisco, community engagement and contributions have led to an impressive\n", + "\n", + "array of new features and integrations:\n", + "\n", + "\n", + "**SUPPORT FOR MULTIPLE**\n", + "\n", + "**PROGRAMMING LANGUAGES**\n", + "\n", + "To give developers a choice, MLflow supports R,\n", + "\n", + "Python, Java and Scala, along with a REST server\n", + "\n", + "interface that can be used from any language.\n", + "\n", + "\n", + "**INTEGRATION WITH POPULAR ML**\n", + "\n", + "**LIBRARIES AND FRAMEWORKS**\n", + "\n", + "MLflow has built-in integrations with the most popular\n", + "\n", + "machine learning libraries — such as scikit-learn,\n", + "\n", + "TensorFlow, Keras, PyTorch, H2O, and Apache Spark™\n", + "\n", + "MLlib — to help teams build, test and deploy machine\n", + "\n", + "learning applications.\n", + "\n", + "\n", + "**CROSS-CLOUD SUPPORT**\n", + "\n", + "Organizations can use MLflow to quickly deploy\n", + "\n", + "machine learning models to multiple cloud services,\n", + "\n", + "including Databricks, Azure Machine Learning and\n", + "\n", + "Amazon SageMaker, depending on their needs.\n", + "\n", + "MLflow leverages AWS S3, Google Cloud Storage and\n", + "\n", + "Azure Data Lake Storage, allowing teams to easily\n", + "\n", + "track and share artifacts from their code.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Rapid community adoption\n", + "\n", + "## 2.5M\n", + "#### monthly downloads\n", + "\n", + "## 200+\n", + "#### code contributors\n", + "\n", + "\n", + "## 100+\n", + "#### contributing organizations\n", + "\n", + "\n", + "Organizations using and contributing to MLflow\n", + "\n", + "Source: [mlflow.org](https://mlflow.org)\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 4: **\u0007** **A Closer Look at**\n", + "\n", + "#### MLflow Model Registry\n", + "\n", + "\n", + "MLflow originally introduced the ability to [track metrics, parameters and artifacts](https://www.mlflow.org/docs/latest/tracking.html#) as part of experiments,\n", + "\n", + "[package models and reproducible ML projects](https://www.mlflow.org/docs/latest/projects.html) , and [deploy models to batch or to real-time serving platforms](https://www.mlflow.org/docs/latest/models.html) .\n", + "\n", + "The latest MLflow component — MLflow Model Registry — builds on MLflow’s original capabilities to\n", + "\n", + "provide organizations with one central place to share ML models, collaborate on moving them from\n", + "\n", + "experimentation to testing and production, and implement approval and governance workflows.\n", + "\n", + "��\n", + "\n", + "\n", + "**Model Registry**\n", + "\n", + "\n", + "**D O W N S T R E A M**\n", + "\n", + "\n", + "��\n", + "\n", + "**Tracking Server**\n", + "\n", + "\n", + "Data Scientists\n", + "\n", + "**Staging**\n", + "\n", + "\n", + 
"Data Engineers\n", + "\n", + "**Production** **Archived**\n", + "\n", + "**A U T O M AT E D J O B S**\n", + "\n", + "\n", + "**Parameters**\n", + "\n", + "\n", + "**Metrics** **Artifacts**\n", + "\n", + "\n", + "The Model Registry gives MLflow users new\n", + "\n", + "\n", + "tools for sharing, reviewing and managing\n", + "\n", + "ML models throughout their lifecycle\n", + "\n", + "\n", + "**Metadata** **Models**\n", + "\n", + "**R E S T S E R V I N G**\n", + "\n", + "**R E V I E W E R S + C I / C D T O O L S**\n", + "\n", + "The MLflow Model Registry complements the MLflow offering and is designed to help organizations\n", + "\n", + "implement good engineering principles with machine learning initiatives, such as collaboration,\n", + "\n", + "governance, reproducibility and knowledge management. The next few pages highlight some of the key\n", + "\n", + "features of this new component.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "###### One hub for managing ML models collaboratively\n", + "\n", + "Building and deploying ML models is a team sport. Not only are the responsibilities\n", + "\n", + "along the machine learning model lifecycle often split across multiple people\n", + "\n", + "(e.g., data scientists train models whereas production engineers deploy them),\n", + "\n", + "but also at each lifecycle stage, teams can benefit from collaboration and sharing\n", + "\n", + "\n", + "###### Flexible CI/CD pipelines to manage stage transitions\n", + "\n", + "MLflow lets you manage your models’ lifecycles either manually or through\n", + "\n", + "automated tools. Analogous to the approval process in software engineering,\n", + "\n", + "users can manually request to move a model to a new lifecycle stage (e.g., from\n", + "\n", + "staging to production), and review or comment on other users’ transition requests.\n", + "\n", + "\n", + "(e.g., a fraud model built in one part of the organization could be reused in others).\n", + "\n", + "Alternatively, you can use the Model Registry’s API to plug in continuous integration\n", + "\n", + "\n", + "MLflow facilitates sharing of expertise and knowledge across teams by making ML\n", + "\n", + "models more discoverable and providing collaborative features to jointly improve\n", + "\n", + "on common ML tasks. Simply register an MLflow model from your experiments to\n", + "\n", + "\n", + "and deployment (CI/CD) tools, such as Jenkins, to automatically test and transition\n", + "\n", + "your models. Each model also links to the experiment run that built it — in MLflow\n", + "\n", + "Tracking — to let you easily review models.\n", + "\n", + "\n", + "get started. 
The MLflow Model Registry will then let you track multiple versions\n", + "\n", + "of the model and mark each one with a lifecycle stage: development, staging,\n", + "\n", + "production or archived.\n", + "\n", + "\n", + "Sample machine learning\n", + "models displayed via the\n", + "MLflow Model Registry\n", + "dashboard\n", + "\n", + "\n", + "The machine learning model\n", + "page view in MLflow, showing\n", + "how users can request and\n", + "review changes to a model’s\n", + "stage\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Visibility and governance for the full ML lifecycle\n", + "\n", + "In large enterprises, the number of ML models that are in development, staging\n", + "\n", + "and production at any given point in time may be in the hundreds or thousands.\n", + "\n", + "Having full visibility into which models exist, what stages they are in and who\n", + "\n", + "has collaborated on and changed the deployment stages of a model allows\n", + "\n", + "organizations to better manage their ML efforts.\n", + "\n", + "MLflow provides full visibility and enables governance by keeping track of each\n", + "\n", + "model’s history and managing who can approve changes to the model’s stages.\n", + "\n", + "Identify versions, stages and\n", + "authors of each model\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 5: **\u0007** **Making Organizations**\n", + "\n", + "#### Successful with ML\n", + "\n", + "\n", + "Standardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\n", + "\n", + "track experiments, compare results, reproduce runs and productionize faster.\n", + "\n", + "In addition to increasing data science team productivity and collaboration and applying good engineering\n", + "\n", + "practices to machine learning, organizations also need to do the following:\n", + "\n", + "\n", + "**Reliably ingest, ETL and**\n", + "\n", + "**catalog big data**\n", + "\n", + "\n", + "**Work with state-of-the-art**\n", + "\n", + "**ML frameworks and tools**\n", + "\n", + "\n", + "**Easily scale compute from**\n", + "\n", + "**single to multi-node**\n", + "\n", + "\n", + "Databricks excels at all the above. Learn more at [databricks.com](https://databricks.com)\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 6: **\u0007** **Introducing the Unified**\n", + "\n", + "#### Data Analytics Platform\n", + "\n", + "\n", + "Databricks accelerates innovation by unifying data science, engineering and business. 
Through a fully\n", + "\n", + "managed, cloud-based service built by the original creators of Apache Spark, Delta Lake and MLflow, the\n", + "\n", + "Databricks Unified Data Analytics Platform lowers the barrier for enterprises to innovate with AI and\n", + "\n", + "accelerates their innovation.\n", + "\n", + "**DATA ENGINEERS** **DATA SCIENTISTS** **ML ENGINEERS** **DATA ANALYSTS**\n", + "\n", + "\n", + "**BI INTEGRATIONS**\n", + "\n", + "**Access all your data**\n", + "\n", + "\n", + "**DATA SCIENCE WORKSPACE**\n", + "\n", + "**Collaboration across the lifecycle**\n", + "\n", + "**UNIFIED DATA SERVICE**\n", + "\n", + "**High-quality data with great performance**\n", + "\n", + "\n", + "\n", + "**ENTERPRISE CLOUD SERVICE**\n", + "\n", + "**A simple, scalable and secure managed service**\n", + "\n", + "##### RAW DATA LAKE\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "###### Data engineering\n", + "\n", + "Speed up the preparation of high-quality\n", + "\n", + "data, essential for best-in-class ML\n", + "\n", + "applications, at scale\n", + "\n", + "\n", + "###### Data science\n", + "\n", + "Collaboratively explore large data sets,\n", + "\n", + "build models iteratively and deploy across\n", + "\n", + "multiple platforms\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Providing managed MLflow on Databricks\n", + "\n", + "MLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\n", + "\n", + "packaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\n", + "\n", + "By using MLflow as part of Databricks, data scientists can:\n", + "\n", + "\n", + "**WORKSPACES**\n", + "\n", + "Benefit from a streamlined\n", + "\n", + "experiment tracking experience\n", + "\n", + "with Databricks Workspace and\n", + "\n", + "collaborative Notebooks\n", + "\n", + "\n", + "**BIG DATA SNAPSHOTS**\n", + "\n", + "Track large-scale data that fed\n", + "\n", + "the models, along with all the\n", + "\n", + "other model parameters, then\n", + "\n", + "\n", + "**JOBS**\n", + "\n", + "Easily initiate jobs remotely, from\n", + "\n", + "an on-premises environment or\n", + "\n", + "from Databricks notebooks\n", + "\n", + "\n", + "**SECURITY**\n", + "\n", + "Take advantage of one common\n", + "\n", + "security model for the entire\n", + "\n", + "machine learning lifecycle\n", + "\n", + "\n", + "reproduce training runs reliably\n", + "\n", + "\n", + "Read our [blog](https://databricks.com/blog/2019/03/06/managed-mlflow-on-databricks-now-in-public-preview.html) to learn more about these integrations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Getting data ready for ML with Delta Lake\n", + "\n", + "Delta Lake is a storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions and scalable metadata handling, and it unifies streaming and batch\n", + "\n", + "data processing. 
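\n", + "\n", + "For instance, a minimal PySpark sketch (the path and DataFrame name are hypothetical) of writing a batch of training data to a Delta table and then reading the same table back as a stream:\n", + "\n", + "```python\n", + "# Assumes an active Spark session on Databricks and an existing DataFrame.\n", + "delta_path = '/mnt/lake/training_features'\n", + "\n", + "# Batch write to a Delta table.\n", + "(features_df.write\n", + "    .format('delta')\n", + "    .mode('append')\n", + "    .save(delta_path))\n", + "\n", + "# The same table can also be consumed incrementally as a stream.\n", + "stream_df = (spark.readStream\n", + "    .format('delta')\n", + "    .load(delta_path))\n", + "```\n", + "\n", + "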
Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs.\n", + "\n", + "By using Delta Lake, data engineers and data scientists can keep track of data used for model training.\n", + "\n", + "Files ML Runtime\n", + "\n", + "- \u0007Schema enforced high\n", + "\n", + "quality data\n", + "\n", + "\n", + "\n", + "- Optimized performance\n", + "\n", + "��\n", + "\n", + "- \u0007Full data lineage /\n", + "\n", + "governance\n", + "\n", + "- \u0007reproductibility through\n", + "\n", + "time travel\n", + "\n", + "\n", + "Streaming\n", + "\n", + "Batch\n", + "\n", + "\n", + "Ingestion\n", + "\n", + "Tables\n", + "\n", + "\n", + "Ingestion\n", + "\n", + "\n", + "Data\n", + "\n", + "Catalog\n", + "\n", + "\n", + "Data\n", + "\n", + "\n", + "Feature\n", + "\n", + "Store\n", + "\n", + "\n", + "Feature\n", + "\n", + "\n", + "**Y O U R E X I S T I N G D E LTA L A K E**\n", + "\n", + "\n", + "3rd Party Data\n", + "\n", + "Marketplace\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Ready-to-use ML environments\n", + "\n", + "Databricks Runtime for Machine Learning provides data scientists and ML practitioners with on-demand access to ready-to-use machine learning clusters that are\n", + "\n", + "preconfigured with the latest and most popular machine learning frameworks, including TensorFlow, Keras, PyTorch, scikit-learn, XGBoost and Horovod.\n", + "\n", + "By using the Databricks Runtime for ML, data scientists can get to results faster with one-click access to ML clusters, optimized performance on popular ML algorithms,\n", + "\n", + "and simplified distributed deep learning on Horovod and GPUs. It also supports Conda for further customization.\n", + "\n", + "\n", + "**P A C K A G E S A N D O P T I M I Z E S M O S T**\n", + "\n", + "**C O M M O N M L F R A M E W O R K S**\n", + "\n", + "\n", + "**C U S T O M I Z E D E N V I R O N M E N T S**\n", + "\n", + "**U S I N G C O N D A**\n", + "\n", + "\n", + "**C U S T O M I Z E D E N V I R O N M E N T S**\n", + "\n", + "\n", + "requirements.txt\n", + "conda.yaml\n", + "\n", + "\n", + "**...**\n", + "\n", + "\n", + "**B U I LT- I N O P T I M I Z AT I O N F O R**\n", + "\n", + "**D I S T R I B U T E D D E E P L E A R N I N G**\n", + "\n", + "Distribute and Scale any Single-Machine\n", + "ML Code to thousands of machines\n", + "\n", + "\n", + "**B U I LT- I N A U T O M L A N D**\n", + "\n", + "**E X P E R I M E N T T R A C K I N G**\n", + "\n", + "\n", + "Machine\n", + "\n", + "Learning\n", + "\n", + "\n", + "Machine\n", + "\n", + "\n", + "\n", + "Auto ML and Tracking /\n", + "Visualizations with MLflow\n", + "\n", + "\n", + "Conda-\n", + "\n", + "Based\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "CHAPTER 7: **\u0007** **Standardizing the**\n", + "\n", + "#### Machine Learning\n", + " Lifecycle on Databricks\n", + "\n", + "**B U I L D M O D E L**\n", + "**P R E P D ATA**\n", + "\n", + "��\n", + "\n", + "Azure ML\n", + "\n", + "**D E P L O Y M O D E L**\n", + "\n", + "��\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "CHAPTER 8: **\u0007** **Getting Started**\n", + "Take the next step toward standardizing your ML lifecycle — test drive MLflow and the\n", + "\n", + "Databricks Unified Data Analytics Platform.\n", + "\n", + "**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z 
E D D E M O](https://databricks.com/contact)**\n", + "\n", + "**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "CHAPTER 8: **\u0007** **Comparison Matrix**\n", + "\n", + "|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W   Self-hosted                |M A N A G E D M L F L O W O N D ATA B R I C K S   Fully managed    With remote execution             |\n", + "|---|---|---|\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf2024-09-19T16:57:20Z
-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_improper_payments_eBook_v4_image.pdf2024-09-19T16:57:20Z
### Technical Migration Guide\n", + "\n", + "# Strategies to Evolve Your Data Warehouse to the Databricks Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents Lakehouse Architecture 3\n", + "\n", + "The Databricks Lakehouse Platform 4\n", + "\n", + "Business Value 5\n", + "\n", + "Single source of truth 5\n", + "\n", + "Data team 6\n", + "\n", + "Future-proof 6\n", + "\n", + "Migration to Lakehouse 7\n", + "\n", + "Overview 7\n", + "\n", + "Migration strategy 8\n", + "\n", + "Migration planning 9\n", + "\n", + "ELT approach 12\n", + "\n", + "Agile modernization 15\n", + "\n", + "Security and data governance 17\n", + "\n", + "Team involvement 19\n", + "\n", + "Conclusion 19\n", + "\n", + "\n", + "-----\n", + "\n", + "## Lakehouse Architecture\n", + "\n", + "\n", + "Data warehouses were designed to provide a central data repository\n", + "\n", + "with analytic compute capabilities to help business leaders\n", + "\n", + "get analytical insights, support decision-making and business\n", + "\n", + "intelligence (BI). Legacy on-premises data warehouse architectures\n", + "\n", + "are difficult to scale and make it difficult for data teams to keep up\n", + "\n", + "with the exponential growth of data. Oftentimes data teams publish\n", + "\n", + "and use a subset of well-defined data for development and testing.\n", + "\n", + "This slows down both innovation and time to insight.\n", + "\n", + "Cloud data warehouses (CDW) were an attempt to tackle the\n", + "\n", + "on-premises data warehouse challenges. CDWs removed the\n", + "\n", + "administrative burden of tasks such as setup, upgrades and\n", + "\n", + "backups. CDWs also improved scalability and introduced cloud’s\n", + "\n", + "pay-as-you-go model to reduce cost. CDWs leverage a proprietary\n", + "\n", + "data format to achieve cloud-scale and performance; however, this\n", + "\n", + "also leads to customers locked into these formats with difficult\n", + "\n", + "\n", + "But enterprise data teams don’t need a better data warehouse.\n", + "\n", + "They need an innovative, simple solution that provides reliable\n", + "\n", + "performance, elastic scale and allows self-service to unblock\n", + "\n", + "analytics to access all data at a reasonable cost. The answer is\n", + "\n", + "the lakehouse.\n", + "\n", + "The lakehouse pattern represents a paradigm shift from traditional\n", + "\n", + "on-premises data warehouse systems that are expensive and\n", + "\n", + "complex to manage. It uses an open data management architecture\n", + "\n", + "that combines the flexibility, cost-efficiency and scale of data\n", + "\n", + "lakes with the data management and ACID semantics of data\n", + "\n", + "warehouses. A lakehouse pattern enables data transformation,\n", + "\n", + "cleansing and validation to support both business intelligence and\n", + "\n", + "machine learning (ML) users on all data. Lakehouse is cloud-centric\n", + "\n", + "and unifies a complete up-to-date data set for teams, allowing\n", + "\n", + "collaboration across an organization.\n", + "\n", + "\n", + "paths to support use cases outside the data warehouse itself\n", + "\n", + "(i.e., machine learning). 
Customers often find themselves with a bifurcated architecture, which ultimately leads to a more costly and complex data platform over time.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Databricks Lakehouse Platform\n", + "\n", + "The Databricks Lakehouse Platform is **simple** ; it unifies your data, governance, analytics and AI on one platform. It’s **open** — the open source format Delta Lake unifies your data ecosystem with open standards and data formats. Databricks is **multicloud** — delivering one **consistent experience across all clouds** so you don’t need to reinvent the wheel for every cloud platform that you’re using to support your data and AI efforts.\n", + "\n", + "Databricks SQL stores and processes data using Delta Lake to simplify and enhance data warehousing capabilities. Analysts can use their favorite language, SQL, popular transformation tools such as dbt, and preferred BI tools like Power BI and Tableau to analyze data. The built-in query editor reduces contextual switching and improves productivity. Administrators enjoy simplified workload management via serverless compute and auto-scaling to meet high-concurrency workload needs. All this at a fraction of the cost of traditional data warehouses.\n", + "\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "Data Warehousing | Data Engineering | Data Streaming | Data Science and ML\n", + "\n", + "Unity Catalog: Fine-grained governance for data and AI\n", + "\n", + "Delta Lake: Data reliability and performance\n", + "\n", + "Cloud Data Lake: All structured and unstructured data\n", + "\n", + "Simple | Open | Multicloud\n", + "\n", + "\n", + "-----\n", + "\n", + "## Business Value\n", + "\n", + "#### Single source of truth\n", + "\n", + "Databricks Delta Lake leverages cloud-based blob storage to provide an infinitely scalable storage layer where you can store all your data, including raw and historical data, alongside structured data tables in the data warehouse. The lakehouse pattern avoids data silos and shares the same elastic scale and governance across all use cases: BI, data engineering, streaming and AI/ML. This means that data engineering teams don’t have to move data to a proprietary data warehouse for business analysts or create a separate data store to support data science.\n", + "\n", + "Instead, data teams can access the open format Delta tables directly and combine data sets in the lakehouse, as needed. Data scientists can also work collaboratively on common data with access to versioned history to facilitate repeatable experiments.
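\n", + "\n", + "As a small, hypothetical illustration of that versioned history (the table name and version number below are placeholders, not from this guide), the same snapshot of a Delta table can be re-read later to repeat an experiment:\n", + "\n", + "```python\n", + "# Inspect the version history of a Delta table.\n", + "spark.sql('DESCRIBE HISTORY lakehouse.features').show()\n", + "\n", + "# Re-read the exact snapshot that an earlier experiment trained on.\n", + "training_snapshot = spark.sql(\n", + "    'SELECT * FROM lakehouse.features VERSION AS OF 12'\n", + ")\n", + "```\n", + "\n", + "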
A single source\n", + "\n", + "of truth facilitates moving from descriptive to predictive analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data team\n", + "\n", + "\n", + "With central data governance and fine-grained access control\n", + "\n", + "capabilities to secure the lakehouse, you can enable self-service\n", + "\n", + "SQL analytics for everyone on the Databricks Lakehouse Platform.\n", + "\n", + "This allows each team to be more agile and innovate faster.\n", + "\n", + "**Data Analysts** — Using the Databricks SQL editor\n", + "\n", + "or their tools of choice (DBT, Power BI, Tableau), SQL\n", + "\n", + "analysts can leverage familiar toolsets.\n", + "\n", + "**Data Engineers** — Utilizing Delta Lake as a unified\n", + "\n", + "storage layer, data engineering teams can eliminate\n", + "\n", + "duplicate data and ETL jobs that move data across\n", + "\n", + "various systems. Databricks supports both batch and\n", + "\n", + "streaming workloads to reduce bottlenecks and serve\n", + "\n", + "the most up-to-date data to downstream users and\n", + "\n", + "applications.\n", + "\n", + "**Administrators** — The pay-as-you-go, decentralized\n", + "\n", + "compute resource allows each team to run their\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides a reliable ETL and data\n", + "\n", + "management framework to simplify ETL pipelines. Data teams can\n", + "\n", + "build end-to-end data transformations in a single pipeline instead of\n", + "\n", + "many small ETL tasks. Databricks supports data quality enforcement\n", + "\n", + "to ensure reliability with auto-scalable infrastructure. Your teams\n", + "\n", + "can onboard new data sources quickly to power new use cases with\n", + "\n", + "fresh data. This not only allows your team to efficiently and reliably\n", + "\n", + "deliver high-quality data in a timely manner, it also reduces ETL\n", + "\n", + "workload cost significantly.\n", + "\n", + "#### Future-proof\n", + "\n", + "Unlike CDWs that lock customers in, Databricks offers an open\n", + "\n", + "platform with open standards, open protocols and open data\n", + "\n", + "formats. It supports a full range of popular languages (SQL, Python,\n", + "\n", + "R, Scala) and popular BI tools. You can leverage the performant\n", + "\n", + "and low-cost distributed compute layer for data processing — or\n", + "\n", + "use a variety of tools and engines to efficiently access the data via\n", + "\n", + "Databricks APIs. Databricks also allows data consumption with a rich\n", + "\n", + "partner ecosystem. Teams can handle all existing BI and AI use cases\n", + "\n", + "with the flexibility to support future use cases as they emerge.\n", + "\n", + "\n", + "workload in isolated environments without worrying\n", + "\n", + "about contention. Serverless SQL endpoint frees your\n", + "\n", + "team from infrastructure management challenges.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Migration to Lakehouse\n", + "\n", + "#### Overview\n", + "\n", + "A lakehouse is the ideal data architecture for data-driven organizations. It combines the\n", + "\n", + "best qualities of data warehouses and data lakes to provide a single solution for all major\n", + "\n", + "data workloads and supports use cases from streaming analytics to BI, data science and\n", + "\n", + "AI. The Databricks Lakehouse Platform leverages low-cost, durable cloud storage and\n", + "\n", + "only consumes (charges for) compute resources when workloads are running. 
This pay-\n", + "\n", + "\n", + "**C U S T O M E R S T O R Y**\n", + "##### Building the Lakehouse\n", + " at Atlassian\n", + "\n", + "[Watch now](https://www.youtube.com/watch?v=Xo1U617T-mU)\n", + "\n", + "\n", + "as-you-go model means compute resources are automatically shut down if no processing\n", + "\n", + "is needed. Data teams can use small clusters that can power individual workloads\n", + "\n", + "they plan to migrate. They can make the choice to leverage serverless SQL endpoints\n", + "\n", + "and completely free data teams from infrastructure capacity planning and cluster\n", + "\n", + "maintenance. The auto-scaling, elastic nature of Databricks clusters leads to significant\n", + "\n", + "savings on infrastructure cost and maintenance. Organizations typically achieve 50% TCO\n", + "\n", + "savings compared to other cloud data warehouses.\n", + "\n", + "Data warehouse migration is never an easy task. Databricks aims to mitigate the things\n", + "\n", + "that can go wrong in these demanding migration projects. The Databricks Lakehouse\n", + "\n", + "Platform provides many out-of-the-box features to mitigate migration risks.\n", + "\n", + "**C U S T O M E R S T O R Y**\n", + "##### Driving Freight Transportation Into the Future\n", + "\n", + "[Read more](https://databricks.com/customers/jbhunt)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Migration strategy\n", + "\n", + "\n", + "Migration is a huge effort and very expensive. Yet, almost every\n", + "\n", + "enterprise has to migrate to new platforms every 3–5 years because\n", + "\n", + "the old platform cannot support new use cases, catch up with\n", + "\n", + "data growth or meet scaling needs. To get better ROI on migration,\n", + "\n", + "implement a migration strategy that can reduce future re-platform\n", + "\n", + "needs and extend to your future data and AI strategy.\n", + "\n", + "Use the opportunity of a data migration to standardize your data\n", + "\n", + "in open Delta format to allow existing and future tools to access\n", + "\n", + "it directly without moving or converting it. Merge your siloed\n", + "\n", + "data warehouses into the unified storage layer in the Databricks\n", + "\n", + "Lakehouse Platform — without worrying about storage capacity.\n", + "\n", + "The unified storage layer allows your team to deploy a unified data\n", + "\n", + "governance on top to secure all data access consistently. Simplify\n", + "\n", + "your data governance story with Databricks Unity Catalog.\n", + "\n", + "\n", + "Move toward a single, consistent approach to data pipelining\n", + "\n", + "and refinement. Merge batch and streaming into a single end-\n", + "\n", + "to-end pipeline to get fresher data and provide more real-time\n", + "\n", + "decisions. Take a metadata-driven approach to align the dataflow\n", + "\n", + "with business processes and have data validation and quality\n", + "\n", + "check built-in. Through a series of curation and refinement steps,\n", + "\n", + "the output results in highly consumable and trusted data for\n", + "\n", + "downstream use cases.\n", + "\n", + "The lakehouse architecture makes it possible for the organization\n", + "\n", + "to create “data assets” by taking a stepwise approach to improving\n", + "\n", + "data and serving all essential use cases. Encourage your BI/analyst\n", + "\n", + "team to leverage Databricks serverless endpoints for self-serve\n", + "\n", + "and agility. 
Each team can evaluate their top priority workloads and\n", + "\n", + "migrate them in parallel to speed up migration.\n", + "\n", + "Take advantage of Databricks’ rich partner ecosystem. Your favorite\n", + "\n", + "partners are likely already integrated via Partner Connect and\n", + "\n", + "can be set up with a few clicks. There are also many ISV and SI\n", + "\n", + "consulting partners who can help your migration journey.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Migration planning\n", + "\n", + "Migrating a data warehouse to the cloud can be time consuming and challenging for your\n", + "\n", + "data teams. It’s important to agree on the data architecture, migration strategy and process/\n", + "\n", + "frameworks to be used before undertaking a data migration. Databricks provides Migration\n", + "\n", + "Assessment and Architecture Review sessions to develop a joint migration roadmap. This\n", + "\n", + "process is designed to help organizations to successfully migrate to a lakehouse architecture.\n", + "\n", + "Based on information collected and business objectives, the Databricks team will work with\n", + "\n", + "customers to propose a target architecture and provide a tailored migration roadmap.\n", + "\n", + "These assessments help get a full picture of current data systems and the future vision. They\n", + "\n", + "clarify what you are migrating and do proper use case discovery. This includes identifying\n", + "\n", + "workloads and data source dependency, for example:\n", + "\n", + "Sample migration assessment checklist:\n", + "\n", + "Identify upstream data sources and workload dependencies\n", + "\n", + "Identify active/inactive data sets and database objects\n", + "\n", + "Identify downstream application dependencies and data freshness requirements\n", + "\n", + "Define a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n", + "\n", + "Define security requirements and data governance\n", + "\n", + "Clarify access management need, document needed permissions per user/group\n", + "\n", + "Outline current tooling (ingestion, ETL and BI) and what’s needed\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s important to identify key stakeholders and keep them engaged during the migration to\n", + "\n", + "make sure they are aligned with the overall objectives. The workload assessment result will\n", + "\n", + "be reviewed with key stakeholders. Through the review process, data teams can get a better\n", + "\n", + "understanding of which workloads can most benefit from modernization.\n", + "\n", + "Databricks often works with partners to provide a workload assessment and help customers\n", + "\n", + "understand their migration complexity and properly plan a budget. Databricks also partners\n", + "\n", + "with third-party vendors that provide migration tools to securely automate major migration\n", + "\n", + "tasks. 
Databricks Partner Connect makes it easy to connect with this ecosystem of tools to help with the migration, including:\n", + "\n", + "\u0007Code conversion tooling that can automatically translate 70%–95% of the SQL code in your current system to Databricks optimized code with Delta and other best practices\n", + "\n", + "\u0007Converters that automate multiple GUI-based ETL/ELT platform conversions to reduce migration time and cost\n", + "\n", + "\u0007Data migration tools that can migrate data from on-premises storage to cloud storage 2x–3x faster than what was previously possible\n", + "\n", + "\n", + "-----\n", + "\n", + "#### We can use automated conversion for most workload types\n", + "\n", + "|  | E D W s | D ATA B R I C K S L A K E H O U S E |\n", + "|---|---|---|\n", + "| Data Migration | DB locked formats on disks | Open cloud storage (ADLS, S3, GCP Storage) |\n", + "| Metastore Migration | Databases, tables, views | Databricks tables, views |\n", + "| SQL Migration | Ad-hoc SQL queries | Spark SQL, Databricks Notebooks |\n", + "|  | T-SQL, PL/SQL, BTEQ | Spark SQL + a little bit of Python or Scala |\n", + "|  | Reports from PBI, Tableau etc. | Runs on Databricks JDBC/ODBC |\n", + "| Security | GRANTs, roles | Databricks permissions - Table ACLs |\n", + "|  | External tables - file permissions | Credential pass-throughs to files |\n", + "| ETL Tools | Data Stage, PowerCenter, Ab Initio etc. | Big data ETL tools, Databricks Notebooks |\n", + "| Orchestration | ETL schedulers | Airflow DAGs, ADF, Databricks Jobs and any other enterprise schedulers |\n", + "\n", + "\n", + "-----\n", + "\n", + "#### ELT approach\n", + "\n", + "The separation of storage and compute makes ELT on lakehouse a better choice than traditional ETL. You can ingest all raw data to Delta Lake, leverage low-cost storage and create a Medallion data implementation from raw/Bronze to curated/Gold depending on what’s needed to support use cases. During ingestion, basic data validation can occur, but establishing a Bronze data layer is the foundation of a single-pane-of-glass for the business. Teams can leverage compute resources as needed without a fixed compute infrastructure. Establishing a Silver layer further enriches data by exploring and applying transformations. ELT allows data teams to break pipelines into smaller “migrations,” starting with a simple workload, then improving the pipeline design iteratively.\n", + "\n", + "**I M P R O V E D A T A Q U A L I T Y**\n", + "\n", + "Data Lake (CSV, TXT, JSON, streaming and batch sources) → Bronze Tables (raw integration) → Silver Tables (filtered, cleaned, augmented) → Gold Tables (business-level aggregates) → Streaming Analytics and Reporting\n", + "\n", + "\n", + "-----\n", + "\n", + "We highly recommend leveraging [Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) , a new cloud-native managed service in the Databricks Lakehouse Platform that provides a reliable ETL framework to modernize your data pipeline at scale.
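\n", + "\n", + "As a minimal, hypothetical sketch of what a DLT pipeline looks like (the source path, column names and expectation below are illustrative), a Bronze-to-Silver flow can combine Auto Loader ingestion with a DLT data quality expectation:\n", + "\n", + "```python\n", + "import dlt\n", + "from pyspark.sql import functions as F\n", + "\n", + "@dlt.table(comment='Raw orders ingested incrementally with Auto Loader')\n", + "def orders_bronze():\n", + "    # 'cloudFiles' is the Auto Loader source; the landing path is hypothetical.\n", + "    return (spark.readStream\n", + "        .format('cloudFiles')\n", + "        .option('cloudFiles.format', 'json')\n", + "        .load('/mnt/raw/orders'))\n", + "\n", + "@dlt.table(comment='Cleaned orders for downstream BI and ML')\n", + "@dlt.expect_or_drop('valid_amount', 'amount > 0')\n", + "def orders_silver():\n", + "    return (dlt.read_stream('orders_bronze')\n", + "        .withColumn('order_date', F.to_date('order_ts')))\n", + "```\n", + "\n", + "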
Instead of migrating multiple ETL tasks one by one in\n", + "\n", + "a traditional data warehouse, you can focus on source and expected output, and create your\n", + "\n", + "entire dataflow graph declaratively. Delta Live Tables offers:\n", + "\n", + "\u0007A metadata-driven approach — You just specify what data should be in each table or view\n", + "\n", + "rather than the details of how processing should be done\n", + "\n", + "\u0007An end-to-end data pipeline with data quality and freshness checks, end-to-end\n", + "\n", + "monitoring/visibility, error recovery, and lineage, which reduces the strain on data\n", + "\n", + "engineering teams and improves time-to-value in building data pipelines\n", + "\n", + "\u0007Automatic management of all the dependencies within the pipeline. This ensures all tables\n", + "\n", + "are populated correctly, whether continuously or on a regular schedule. For example,\n", + "\n", + "updating one table will automatically trigger all downstream table updates to keep data\n", + "\n", + "up-to-date.\n", + "\n", + "\u0007All pipelines are built code-first, which makes editing, debugging and testing of data\n", + "\n", + "pipelines simpler and easier. DLT can also automatically recover from common error\n", + "\n", + "conditions, reducing operational overhead.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Agile modernization\n", + "\n", + "\n", + "Agile development allows teams to move quickly knowing migrated\n", + "\n", + "pipelines can be revisited at a later cycle and evolving data models\n", + "\n", + "are supported within the architecture. Allowing business impact to\n", + "\n", + "drive priorities via an agile approach helps mitigate migration risks.\n", + "\n", + "Prioritizing and selecting use cases where modernization brings\n", + "\n", + "business benefits quickly is a good starting point. Focus on the 20%\n", + "\n", + "of workloads that consume 80% of budget. By breaking workflows\n", + "\n", + "down into components and managing data stories, teams can adjust\n", + "\n", + "priorities over time. Changes can be made in collaboration with the\n", + "\n", + "user community to fit the business definition of value.\n", + "\n", + "Migrating to a lakehouse architecture leverages separation of storage\n", + "\n", + "and compute to remove resource contention between ETL and BI\n", + "\n", + "workloads. As a result, the migration process can be more agile,\n", + "\n", + "allowing you to evolve your design iteratively without big-bang effort:\n", + "\n", + "\u0007Reduce time during the initial phase on full capacity plan and\n", + "\n", + "\n", + "All of this allows you to take a more iterative and business-focused\n", + "\n", + "approach for migration instead of a full planning, execution, test/\n", + "\n", + "validation approach. Here are more approaches that help facilitate\n", + "\n", + "this phased implementation:\n", + "\n", + "\u0007Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . Auto Loader helps to ingest\n", + "\n", + "new data into pipelines quicker to get data in near real-time.\n", + "\n", + "\u0007Delta Live Tables (DLT) improves data quality during data\n", + "\n", + "transformation and automatically scales to address data volume\n", + "\n", + "change. 
DLT can also support schema evolution and quarantine bad data or data that needs to be reprocessed at a later stage.\n", + "\n", + "\u0007Use dedicated clusters to isolate workloads, lower the total cost of ownership and improve overall performance. By using multiple clusters, we can shut down resources when not in use and move away from managing fixed resources in a single large cluster.\n", + "\n", + "\n", + "scoping\n", + "\n", + "\u0007Flexible cloud infrastructure and unlimited, autoscaling storage\n", + "\n", + "\u0007Workload management is much simpler: you can isolate each workload with a dedicated compute resource, without worrying about managing workload contention\n", + "\n", + "\u0007Auto-scale and tear down the compute resources after the job is done to achieve cost efficiency\n", + "\n", + "\n", + "-----\n", + "\n", + "Leverage Databricks’ deep bench of expertise to build reusable assets along the migration:\n", + "\n", + "\u0007Create a migration factory for an iterative migration process\n", + "\n", + "\u0007Determine and implement a security and governance framework\n", + "\n", + "\u0007Establish a to-be environment and move use cases/workloads in logical units\n", + "\n", + "\u0007Prove business value and scale over time\n", + "\n", + "\u0007Add new functionality continuously so important business requirements are not left on hold during migration\n", + "\n", + "Take this iterative and templated approach and migration speed will accelerate. Customers can finish migration 15%–20% faster and reduce the amount of tech debt created during the migration.\n", + "\n", + "\n", + "Build foundations with full-lifecycle “lighthouse” workloads (“make it work”), then parallelize the iterations (“make it work right,” “make it work fast”), repeating the Migration → Functionality → Optimization and Delta cycle for each workload.\n", + "\n", + "Leverage Databricks’ deep bench of expertise to build out some **templates for the most effective Databricks implementation.**\n", + "\n", + "Take an **iterative, bite-sized approach** to migration, reduce tech debt and rework, and bring forward the value of the solution earlier.\n", + "\n", + "\n", + "-----\n", + "\n", + "To maximize the value of your lakehouse, you should consider retiring some legacy architecture design patterns. Leverage the migration process to simplify data warehousing tasks. Regardless of how you complete your migration, you could utilize lakehouse strengths to improve architectural patterns:\n", + "\n", + "\u0007Merge your siloed data warehouses on your unified lakehouse platform and unify data access and data governance via Unity Catalog. The lakehouse architecture provides a unified storage layer for all your data where there is no physical boundary between data.
There is no need to keep data copies for each\n", + "\n", + "system using the data set. Clean up and remove jobs that are\n", + "\n", + "created to keep data in sync across various data systems.\n", + "\n", + "Keep a single copy of raw data in your lakehouse as a single\n", + "\n", + "source of truth.\n", + "\n", + "\u0007The Databricks Lakehouse Platform allows you to merge batch\n", + "\n", + "and streaming into a single system to build a simple continuous\n", + "\n", + "\n", + "\u0007Simplify your workload isolation and management by running jobs\n", + "\n", + "in dedicated clusters. Separating storage and compute allows you\n", + "\n", + "to easily isolate each task with isolated compute resources. There\n", + "\n", + "is no need to squeeze them into a single large data appliance\n", + "\n", + "and spend lots of time managing and coordinating resources.\n", + "\n", + "Leverage the elasticity of the Databricks compute layer to\n", + "\n", + "automatically handle workload concurrency changes at peak time\n", + "\n", + "instead of paying for over-provisioned resources for most of the\n", + "\n", + "time. This greatly simplifies the workload management effort the\n", + "\n", + "traditional data warehouses require.\n", + "\n", + "\u0007Simplify disaster recovery. Storage and compute separation\n", + "\n", + "allows easy disaster recovery. The cloud storage provides very\n", + "\n", + "good data redundancy and supports automated replication\n", + "\n", + "to another region. Customers can spin up compute resources\n", + "\n", + "quickly in another region and maintain service availability in case\n", + "\n", + "of an outage.\n", + "\n", + "\n", + "data flow model to process data as it arrives. Process data in\n", + "\n", + "near real-time and enable data-driven decisions with the most\n", + "\n", + "recent updates.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Security and data governance\n", + "\n", + "\n", + "Security is paramount in any data-driven organization. Data security\n", + "\n", + "should enforce the business needs for both internal and external\n", + "\n", + "data, so the lakehouse should be set up to meet your organization’s\n", + "\n", + "security requirements. Databricks provides built-in security to\n", + "\n", + "protect your data during and after migration.\n", + "\n", + "\u0007Encrypt data at rest and in-transit, using a cloud-managed key\n", + "\n", + "or your own\n", + "\n", + "\u0007Set up a custom network policy, use IP range to control access\n", + "\n", + "\u0007Leverage Private Link to limit network traffic to not traverse the\n", + "\n", + "public internet\n", + "\n", + "\n", + "The challenge with the traditional data warehouse and data lake\n", + "\n", + "architecture is that data is stored in multiple stores and your data\n", + "\n", + "team also needs to manage data access and data governance\n", + "\n", + "twice. The lakehouse pattern uses unified storage which simplifies\n", + "\n", + "governance. The Databricks Lakehouse Platform provides a unified\n", + "\n", + "governance layer across all your data teams. 
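\n", + "\n", + "As one small, hypothetical example of defining that access once on the lakehouse (the catalog, schema and group names are illustrative), fine-grained permissions can be granted and reviewed with Unity Catalog SQL:\n", + "\n", + "```python\n", + "# Grant a BI group read access to a curated schema and review existing grants.\n", + "spark.sql('GRANT USE CATALOG ON CATALOG main TO `bi_analysts`')\n", + "spark.sql('GRANT SELECT ON SCHEMA main.sales TO `bi_analysts`')\n", + "spark.sql('SHOW GRANTS ON TABLE main.sales.orders').show()\n", + "```\n", + "\n", + "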
Migrating to Databricks Unity Catalog provides data discovery, data lineage, role-based security policies, table or row/column-level access control, and central auditing capabilities that make the data platform easy for data stewards to confidently manage and secure data access to meet compliance and privacy needs, directly on the lakehouse.\n", + "\n", + "\n", + "\u0007Enable SSO, integrate with Active Directory and other IdPs\n", + "\n", + "\u0007Control data access to database objects using RBAC\n", + "\n", + "\u0007Enable audit logs to monitor user activities\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Centralized Governance\n", + "\n", + "Account-level user management | Credentials | Access control (ACL store) | Audit log | Metastore | Lineage explorer | Data explorer\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Team involvement\n", + "\n", + "Plan to educate and train your team iteratively throughout the migration process. As new workloads are migrated, new teams will gain exposure to the lakehouse pattern. Plan to ramp up new team members as the migration process progresses, developing a data Center of Excellence within the organization. Databricks provides a cost-effective platform for ad hoc work to be performed. A sandbox environment can be leveraged for teams to get exposure to Databricks technology and get hands-on experience. Databricks also provides [learning path](https://databricks.com/learn/training/home) training for customers. Encourage teams to get hands-on experience relevant to their immediate tasks, gain exposure to new things and try new ideas.\n", + "\n", + "#### Conclusion\n", + "\n", + "Data warehouse migration touches many business areas and impacts many teams, but the Databricks Lakehouse Platform simplifies this transition, reduces risks and accelerates your ROI. The Databricks Business Value Consulting team can work with you to quantify the impact of your use cases to both data and business teams. And the Databricks team of solution architects, professional services, and partners are ready to help.\n", + "\n", + "Reach out to your Databricks account team or send a message to [sales@databricks.com](mailto:sales%40databricks.com?subject=) to get started.\n", + "\n", + "#### Additional resources\n", + "\n", + "[Migrate to Databricks](https://databricks.com/solutions/migration)\n", + "\n", + "[Modernize Your Data Warehouse](https://databricks.com/p/webinar/apj-modernize-your-data-warehouse)\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems.
To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
**The**\n", + "**Delta Lake**\n", + "**Series**\n", + "**Lakehouse**\n", + "\n", + "Combining the best elements of\n", + "data lakes and data warehouses\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Here’s what\n", + "#### What’s \n", + "###### you’ll find inside\n", + "#### inside?\n", + "\n", + "\n", + "The Delta Lake Series of eBooks is published\n", + "\n", + "\n", + "by Databricks to help leaders and practitioners\n", + "\n", + "understand the full capabilities of Delta Lake as\n", + "\n", + "\n", + "**Introduction**\n", + "**What is Delta Lake?**\n", + "\n", + "\n", + "well as the landscape it resides in. This eBook,\n", + "\n", + "\n", + "**The Delta Lake Series — Lakehouse** , focuses\n", + "\n", + "on lakehouse.\n", + "\n", + "\n", + "**Chapter** **01**\n", + "\n", + "##### 02 Chapter\n", + " 03 Chapter\n", + "\n", + "\n", + "What Is\n", + "a Lakehouse?\n", + "\n", + "Diving Deep Into the Inner Workings\n", + "of the Lakehouse and Delta Lake\n", + "\n", + "Understanding\n", + "Delta Engine\n", + "\n", + "\n", + "#### What’s next?\n", + "\n", + "After reading this eBook, you’ll not only\n", + "\n", + "\n", + "understand what Delta Lake offers, but you’ll\n", + "\n", + "also understand how its features result in\n", + "\n", + "substantial performance improvements.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is Delta Lake?\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\n", + "\n", + "analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\n", + "\n", + "compatible with Apache Spark™ APIs.\n", + "\n", + "At Databricks, we’ve seen how Delta Lake can bring reliability, performance and\n", + "\n", + "lifecycle management to data lakes. Our customers have found that Delta Lake\n", + "\n", + "solves for challenges around malformed data ingestion, difficulties deleting data for\n", + "\n", + "compliance, or issues modifying data for data capture.\n", + "\n", + "With Delta Lake, you can accelerate the velocity that high-quality data can get into\n", + "\n", + "your data lake and the rate that teams can leverage that data with a secure and\n", + "\n", + "scalable cloud service.\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a Lakehouse?**\n", + "### CHAPTER 01\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a**\n", + "**Lakehouse?**\n", + "# 01\n", + "\n", + "Over the past few years at Databricks, we’ve seen a new data management architecture\n", + "\n", + "that emerged independently across many customers and use cases: the **lakehouse.**\n", + "\n", + "In this chapter, we’ll describe this new architecture and its advantages over previous\n", + "\n", + "approaches.\n", + "\n", + "Data warehouses have a long history of decision support and business intelligence\n", + "\n", + "applications. Since its inception in the late 1980s, data warehouse technology\n", + "\n", + "continued to evolve and MPP architectures led to systems that were able to handle\n", + "\n", + "larger data sizes.\n", + "\n", + "But while warehouses were great for structured data, a lot of modern enterprises\n", + "\n", + "have to deal with unstructured data, semi-structured data, and data with high variety,\n", + "\n", + "velocity and volume. 
Data warehouses are not suited for many of these use cases, and\n", + "\n", + "they are certainly not the most cost-efficient.\n", + "\n", + "As companies began to collect large amounts of data from many different sources,\n", + "\n", + "architects began envisioning a single system to house data for many different\n", + "\n", + "analytic products and workloads.\n", + "\n", + "About a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n", + "\n", + "in a variety of formats. While suitable for storing data, data lakes lack some critical\n", + "\n", + "features: They do not support transactions, they do not enforce data quality, and their\n", + "\n", + "lack of consistency / isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "\n", + "-----\n", + "\n", + "**A lakehouse combines the best elements**\n", + "**of data lakes and data warehouses**\n", + "\n", + "A lakehouse is a new data architecture that combines the best elements of data lakes\n", + "\n", + "and data warehouses.\n", + "\n", + "\n", + "and batch and streaming jobs. For these reasons, many of the promises of data lakes\n", + "\n", + "have not materialized and, in many cases, lead to a loss of many of the benefits of data\n", + "\n", + "warehouses.\n", + "\n", + "\n", + "The need for a flexible, high-performance system hasn’t abated. Companies\n", + "\n", + "\n", + "require systems for diverse data applications including SQL analytics, real-time\n", + "\n", + "monitoring, data science and machine learning. Most of the recent advances in\n", + "\n", + "AI have been in better models to process unstructured data (text, images, video,\n", + "\n", + "audio), but these are precisely the types of data that a data warehouse is not\n", + "\n", + "optimized for.\n", + "\n", + "A common approach is to use multiple systems — a data lake, several data\n", + "\n", + "warehouses, and other specialized systems such as streaming, time-series, graph\n", + "\n", + "and image databases. Having a multitude of systems introduces complexity and,\n", + "\n", + "more importantly, introduces delay as data professionals invariably need to move\n", + "\n", + "or copy data between different systems.\n", + "\n", + "\n", + "Lakehouses are enabled by a new system design: implementing similar data struc-\n", + "\n", + "tures and data management features to those in a data warehouse, directly on the\n", + "\n", + "kind of low-cost storage used for data lakes. They are what you would get if you had\n", + "\n", + "to redesign data warehouses in the modern world, now that cheap and highly reliable\n", + "\n", + "storage (in the form of object stores) are available.\n", + "\n", + "A lakehouse has the following key features:\n", + "\n", + "- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n", + "\n", + "be reading and writing data concurrently. Support for ACID transactions ensures\n", + "\n", + "consistency as multiple parties concurrently read or write data, typically using\n", + "\n", + "SQL.\n", + "\n", + "\n", + "-----\n", + "\n", + "- **\u0007Schema enforcement and governance:** The lakehouse should have a way to\n", + "\n", + "support schema enforcement and evolution, supporting DW schema paradigms\n", + "\n", + "such as star/snowflake-schemas. 
The system should be able to reason about data\n", + "\n", + "integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n", + "\n", + "reduces staleness and improves recency, reduces latency and lowers the cost of\n", + "\n", + "having to operationalize two copies of the data in both a data lake and a warehouse.\n", + "\n", + "- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n", + "\n", + "compute use separate clusters, thus these systems are able to scale to many more\n", + "\n", + "concurrent users and larger data sizes. Some modern data warehouses also have\n", + "\n", + "this property.\n", + "\n", + "- **\u0007Openness:** The storage formats they use are open and standardized, such as\n", + "\n", + "Parquet, and they provide an API so a variety of tools and engines, including\n", + "\n", + "machine learning and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n", + "\n", + "The lakehouse can be used to store, refine, analyze and access data types needed\n", + "\n", + "for many new data applications, including images, video, audio, semi-structured\n", + "\n", + "data, and text.\n", + "\n", + "- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n", + "\n", + "analytics. Multiple tools might be needed to support all these workloads, but they all\n", + "\n", + "rely on the same data repository.\n", + "\n", + "- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n", + "\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "\n", + "serving real-time data applications.\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional\n", + "\n", + "features. Tools for security and access control are basic requirements. Data governance\n", + "\n", + "capabilities including auditing, retention and lineage have become essential particularly\n", + "\n", + "in light of recent privacy regulations. Tools that enable data discovery such as data\n", + "\n", + "catalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n", + "\n", + "features only need to be implemented, tested and administered for a single system.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Read the research**\n", + "**Delta Lake: High-Performance ACID**\n", + "**Table Storage Over Cloud Object Stores**\n", + "\n", + "**Abstract**\n", + "\n", + "Cloud object stores such as Amazon S3 are some of the largest and most\n", + "\n", + "cost-effective storage systems on the planet, making the main attractive\n", + "\n", + "target to store large data warehouses and data lakes. Unfortunately, their\n", + "\n", + "implementation as key-value stores makes it difficult to achieve ACID\n", + "\n", + "transactions and high performance: Metadata operations, such as listing\n", + "\n", + "objects, are expensive, and consistency guarantees are limited. In this paper,\n", + "\n", + "we present Delta Lake, an open source ACID table storage layer over cloud\n", + "\n", + "object stores initially developed at Databricks. 
Delta Lake uses a transaction log\n", + "\n", + "that is compacted into Apache Parquet format to provide ACID properties, time\n", + "\n", + "travel, and significantly faster metadata operations for large tabular data sets\n", + "\n", + "(e.g., the ability to quickly search billions of table partitions for those relevant\n", + "\n", + "to a query). It also leverages this design to provide high-level features such\n", + "\n", + "as automatic data layout optimization, upserts, caching, and audit logs. Delta\n", + "\n", + "Lake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and\n", + "\n", + "other systems. Delta Lake is deployed at thousands of Databricks customers\n", + "\n", + "that process exabytes of data per day, with the largest instances managing\n", + "\n", + "exabyte-scale data sets and billions of objects.\n", + "\n", + "Authors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n", + "\n", + "Zhu, Mukul Murthy, Joseph Torres, Herman van Hövell, Adrian Ionescu, Alicja\n", + "\n", + "Łuszczak, Michał Szafrański, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n", + "\n", + "Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n", + "\n", + "Read the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Some early examples**\n", + "\n", + "The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n", + "\n", + "Microsoft’s Azure Synapse Analytics service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n", + "\n", + "enables a similar lakehouse pattern. Other managed services such as BigQuery and\n", + "\n", + "Redshift Spectrum have some of the lakehouse features listed above, but they are\n", + "\n", + "examples that focus primarily on BI and other SQL applications.\n", + "\n", + "Companies that want to build and implement their own systems have access to open\n", + "\n", + "source file formats (Delta Lake, Apache Iceberg, Apache Hudi) that are suitable for\n", + "\n", + "building a lakehouse.\n", + "\n", + "Merging data lakes and data warehouses into a single system means that data teams\n", + "\n", + "can move faster as they are able to use data without needing to access multiple systems.\n", + "\n", + "The level of SQL support and integration with BI tools among these early lakehouses\n", + "\n", + "is generally sufficient for most enterprise data warehouses. Materialized views and\n", + "\n", + "\n", + "A note about technical building blocks. While distributed file systems can be\n", + "\n", + "used for the storage layer, object stores are more commonly used in lakehouses.\n", + "\n", + "Object stores provide low-cost, highly available storage that excels at massively\n", + "\n", + "parallel reads — an essential requirement for modern data warehouses.\n", + "\n", + "**From BI to AI**\n", + "\n", + "The lakehouse is a new data management architecture that radically simplifies\n", + "\n", + "enterprise data infrastructure and accelerates innovation in an age when\n", + "\n", + "machine learning is poised to disrupt every industry. 
In the past, most of the\n", + "\n", + "data that went into a company’s products or decision-making was structured\n", + "\n", + "data from operational systems, whereas today, many products incorporate\n", + "\n", + "AI in the form of computer vision and speech models, text mining and others.\n", + "\n", + "Why use a lakehouse instead of a data lake for AI? A lakehouse gives you data\n", + "\n", + "versioning, governance, security and ACID properties that are needed even for\n", + "\n", + "unstructured data.\n", + "\n", + "\n", + "stored procedures are available, but users may need to employ other mechanisms that\n", + "\n", + "\n", + "aren’t equivalent to those found in traditional data warehouses. The latter is particularly\n", + "\n", + "important for “lift and shift scenarios,” which require systems that achieve semantics\n", + "\n", + "that are almost identical to those of older, commercial data warehouses.\n", + "\n", + "What about support for other types of data applications? Users of a lakehouse have\n", + "\n", + "access to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n", + "\n", + "libraries) for non-BI workloads like data science and machine learning. Data\n", + "\n", + "exploration and refinement are standard for many analytic and data science\n", + "\n", + "applications. Delta Lake is designed to let users incrementally improve the quality of\n", + "\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specialized\n", + "\n", + "systems (such as data warehouses) that have years of investments and real-\n", + "\n", + "world deployments behind them. Users may favor certain tools (BI tools, IDEs,\n", + "\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "\n", + "and other issues will be addressed as the technology continues to mature and\n", + "\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "\n", + "diverse data applications.\n", + "\n", + "\n", + "data in their lakehouse until it is ready for consumption.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the Inner Workings**\n", + "**of the Lakehouse and Delta Lake**\n", + "\n", + "### CHAPTER 02\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "# 02\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "\n", + "paper that describes some of the core technological challenges and solutions that\n", + "\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. 
You\n", + "\n", + "can read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "\n", + "[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "\n", + "storage, this pattern has been playing out for years. Vendors continue to try to reinvent\n", + "\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "\n", + "object stores like Amazon S3 have become some of the largest and most cost-\n", + "\n", + "effective storage systems in the world, which makes them an attractive platform to\n", + "\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,\n", + "\n", + "performance is hampered by expensive metadata operations (e.g., listing objects)\n", + "\n", + "and limited consistency guarantees.\n", + "\n", + "Based on the characteristics of cloud object stores, three approaches have emerged.\n", + "\n", + "**1. Data lakes**\n", + "\n", + "The first is directories of files (i.e., data lakes) that store the table as a collection\n", + "\n", + "of objects, typically in columnar format such as Apache Parquet. It’s an attractive\n", + "\n", + "approach because the table is just a group of objects that can be accessed from\n", + "\n", + "a wide variety of tools without a lot of additional data stores or systems. However,\n", + "\n", + "both performance and consistency problems are common. Hidden data corruption\n", + "\n", + "is common due to failed transactions, eventual consistency leads to inconsistent\n", + "\n", + "queries, latency is high, and basic management capabilities like table versioning and\n", + "\n", + "audit logs are unavailable.\n", + "\n", + "**2. Custom storage engines**\n", + "\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. 
Finally, and most egregiously, the proprietary metadata service locks\n", + "\n", + "customers into a specific service provider, leaving customers to contend with\n", + "\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "\n", + "adopt a new approach later.\n", + "\n", + "**3. Lakehouse**\n", + "\n", + "With Delta Lake, an open source ACID table storage layer atop cloud object stores,\n", + "\n", + "we sought to build a car instead of a faster horse with not just a better data store,\n", + "\n", + "but a fundamental change in how data is stored and used via the lakehouse. A\n", + "\n", + "lakehouse is a new architecture that combines the best elements of data lakes and\n", + "\n", + "data warehouses. Lakehouses are enabled by a new system design: implementing\n", + "\n", + "similar data structures and data management features to those in a data warehouse,\n", + "\n", + "directly on the kind of low-cost storage used for data lakes. They are what you would\n", + "\n", + "get if you had to redesign storage engines in the modern world, now that cheap and\n", + "\n", + "highly reliable storage (in the form of object stores) are available.\n", + "\n", + "Delta Lake maintains information about which objects are part of a Delta table in an\n", + "\n", + "ACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n", + "\n", + "the cloud object store. This design allows clients to update multiple objects at once,\n", + "\n", + "replace a subset of the objects with another, etc., in a serializable manner that still\n", + "\n", + "achieves high parallel read/write performance from the objects. The log also provides\n", + "\n", + "significantly faster metadata operations for large tabular data sets. Additionally, Delta\n", + "\n", + "Lake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n", + "\n", + "snapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n", + "\n", + "caching, and audit logs. Together, these features improve both the manageability and\n", + "\n", + "performance of working with data in cloud object stores, ultimately opening the door\n", + "\n", + "to the lakehouse architecture that combines the key features of data warehouses and\n", + "\n", + "data lakes to create a better, simpler data architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "Today, Delta Lake is used across thousands of Databricks customers, processing\n", + "\n", + "exabytes of structured and unstructured data each day, as well as many organizations\n", + "\n", + "in the open source community. These use cases span a variety of data sources and\n", + "\n", + "applications. The data types stored include Change Data Capture (CDC) logs from\n", + "\n", + "enterprise OLTP systems, application logs, time-series data, graphs, aggregate\n", + "\n", + "tables for reporting, and image or feature data for machine learning. The applications\n", + "\n", + "include SQL workloads (most commonly), business intelligence, streaming, data\n", + "\n", + "science, machine learning and graph analytics. 
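\n", + "\n", + "To make the transaction log and time travel capabilities described above concrete, here is a minimal PySpark sketch; the table path, version number and timestamp are illustrative assumptions rather than values taken from this eBook:\n", + "\n", + "```\n", + "# Read the current state of a Delta table stored on low-cost cloud object storage\n", + "df = spark.read.format(\"delta\").load(\"/path/to/delta_table\")\n", + "\n", + "# Time travel: query an earlier snapshot of the same table, by version or by timestamp\n", + "df_v5 = spark.read.format(\"delta\").option(\"versionAsOf\", 5).load(\"/path/to/delta_table\")\n", + "df_old = spark.read.format(\"delta\").option(\"timestampAsOf\", \"2021-01-01\").load(\"/path/to/delta_table\")\n", + "\n", + "# The audit log behind these features can be inspected directly\n", + "spark.sql(\"DESCRIBE HISTORY delta.`/path/to/delta_table`\").show(truncate=False)\n", + "```\n", + "\n", + "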
Overall, Delta Lake has proven itself to\n", + "\n", + "be a good fit for most data lake applications that would have used structured storage\n", + "\n", + "formats like Parquet or ORC, and many traditional data warehousing workloads.\n", + "\n", + "Across these use cases, we found that customers often use Delta Lake to significantly\n", + "\n", + "simplify their data architecture by running more workloads directly against cloud\n", + "\n", + "object stores, and increasingly, by creating a lakehouse with both data lake and\n", + "\n", + "transactional features to replace some or all of the functionality provided by message\n", + "\n", + "queues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n", + "\n", + "Amazon Redshift).\n", + "\n", + "**[In the research paper,](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **the authors explain:**\n", + "\n", + "- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performance\n", + "\n", + "Through the paper, you’ll gain a better understanding of Delta Lake and how it\n", + "\n", + "enables a wide range of DBMS-like performance and management features for data\n", + "\n", + "held in low-cost cloud storage. As well as how the Delta Lake storage format and\n", + "\n", + "access protocols make it simple to operate, highly available, and able to deliver high-\n", + "\n", + "bandwidth access to the object store.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding Delta Engine**\n", + "\n", + "### CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding**\n", + "**Delta Engine**\n", + "# 03\n", + "\n", + "The Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n", + "\n", + "engine to take advantage of modern CPU architecture with optimizations to Spark\n", + "\n", + "3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n", + "\n", + "Runtime 7.0. Together, these features significantly accelerate query performance on\n", + "\n", + "data lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n", + "\n", + "adopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n", + "\n", + "**Scaling execution performance**\n", + "\n", + "One of the big hardware trends over the last several years is that CPU clock speeds\n", + "\n", + "have plateaued. The reasons are outside the scope of this chapter, but the takeaway\n", + "\n", + "is that we have to find new ways to process data faster beyond raw compute power.\n", + "\n", + "One of the most impactful methods has been to improve the amount of data that can\n", + "\n", + "be processed in parallel. However, data processing engines need to be specifically\n", + "\n", + "architected to take advantage of this parallelism.\n", + "\n", + "In addition, data teams are being given less and less time to properly model data as\n", + "\n", + "the pace of business increases. Poorer modeling in the interest of better business\n", + "\n", + "agility drives poorer query performance. 
Naturally, this is not a desired state, and\n", + "\n", + "organizations want to find ways to maximize both agility and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Announcing Delta Engine for**\n", + "**high-performance query execution**\n", + "\n", + "Delta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n", + "\n", + "workloads through three components: an improved query optimizer, a caching\n", + "\n", + "layer that sits between the execution layer and the cloud object storage, and a native\n", + "\n", + "vectorized execution engine that’s written in C++.\n", + "\n", + "The improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n", + "\n", + "optimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n", + "\n", + "statistics to deliver up to 18x increased performance in star schema workloads.\n", + "\n", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\n", + "\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "\n", + "modern cloud hardware. It brings performance improvements to all workload types\n", + "\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "\n", + "By linking these three components together, we think it will be easier for customers\n", + "\n", + "to understand how improvements in multiple places within the Databricks code\n", + "\n", + "aggregate into significantly faster performance for analytics workloads on data lakes.\n", + "\n", + "We’re excited about the value that Delta Engine delivers to our customers. 
While the\n", + "\n", + "time and cost savings are already valuable, its role in the lakehouse pattern supports\n", + "\n", + "new advances in how data teams design their data architectures for increased\n", + "\n", + "unification and simplicity.\n", + "\n", + "For more information on the Delta Engine, watch this keynote address from\n", + "\n", + "[Spark + AI Summit 2020:](https://www.youtube.com/watch?v=o54YMz8zvCY) [Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n", + "\n", + "\n", + "-----\n", + "\n", + "## What’s next?\n", + "\n", + "\n", + "Now that you understand Delta Lake and how its features can improve\n", + "\n", + "performance, it may be time to take a look at some additional resources.\n", + "\n", + "**Data + AI Summit Europe 2020 >**\n", + "\n", + "- [Photon Technical Deep Dive: How to Think Vectorized](https://databricks.com/session_eu20/photon-technical-deep-dive-how-to-think-vectorized)\n", + "\n", + "\n", + "**Explore subsequent eBooks in the collection >**\n", + "\n", + "- The Delta Lake Series — Fundamentals and Performance\n", + "\n", + "- The Delta Lake Series — Features\n", + "\n", + "- The Delta Lake Series — Streaming\n", + "\n", + "- The Delta Lake Series — Customer Use Cases\n", + "\n", + "\n", + "\n", + "- [MLflow, Delta Lake and Lakehouse Use Cases Meetup and AMA](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup)\n", + "\n", + "- [Common Strategies for Improving Performance on Your Delta Lakehouse](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n", + "\n", + "\n", + "\n", + "- [Achieving Lakehouse Models With Spark 3.0](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0)\n", + "\n", + "- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n", + "\n", + "\n", + "**Do a deep dive into Delta Lake >**\n", + "\n", + "- [Analytics on the Data Lake With Tableau and the Lakehouse Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n", + "\n", + "- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "\n", + "**Vodcasts and podcasts >**\n", + "\n", + "\n", + "\n", + "- [Welcome to Lakehouse. Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)\n", + "\n", + "- [Data Brew by Databricks | Season 1: Lakehouses](https://databricks.com/discover/data-brew)\n", + "\n", + "\n", + "**[Try Databricks for free >](https://databricks.com/product/delta-lake-on-databricks)**\n", + "**[Learn more >](https://databricks.com/product/delta-lake-on-databricks)**\n", + "\n", + "\n", + "\n", + "- [Data Alone Is Not Enough: The Evolution of Data Architectures](https://a16z.com/2020/10/22/data-alone-is-not-enough-the-evolution-of-data-architectures/)\n", + "\n", + "\n", + "-----
**EBOOK**\n", + "\n", + "# All Roads Lead to the Lakehouse\n", + "\n", + "#### A deep dive into data ingestion with the lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction\n", + "\n", + "Life of a Data Engineer\n", + "\n", + "Ingesting From Cloud Object Stores\n", + "\n", + "COPY INTO\n", + "\n", + "Auto Loader\n", + "\n", + "Ingesting Data From External Applications\n", + "\n", + "Partner Connect\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "Organizations today are inundated with data siloed across various on-premises\n", + "application systems, databases, data warehouses and SaaS applications. This\n", + "fragmentation makes it difficult to support new use cases for analytics or machine\n", + "learning, so many IT teams are now centralizing all of their data with a lakehouse\n", + "architecture built on top of Delta Lake, an open format storage layer.\n", + "\n", + "The first thing data engineers need to do to support the lakehouse architecture is to\n", + "efficiently move data from various systems into their lakehouse. Ingesting data is a\n", + "critical first step in the data engineering and management lifecycle.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Life of a Data Engineer\n", + "\n", + "The primary focus of data engineers is to provide timely and reliable data to downstream\n", + "\n", + "data teams at an organization. Requests for data can come from a variety of teams, and for\n", + "\n", + "\n", + "a variety of data types. 
For example:\n", + "\n", + "**•** Marketing team requests for Facebook and Google ad data in order to analyze spend and\n", + "\n", + "better allocate their budget for ads\n", + "\n", + "**•** Security team looking to get access to a table with low latency security data from Kafka,\n", + "\n", + "in order to run rules to detect intrusions into the network\n", + "\n", + "**•** Sales operations requesting customer data from Salesforce to enrich existing tables\n", + "\n", + "**•** Finance team hoping to find a way to automatically ingest critical data from Google\n", + "\n", + "Sheets or transaction data from AWS Kinesis\n", + "\n", + "In each of these common scenarios, data engineers must create usable and easily\n", + "\n", + "queryable tables from semi-structured and unstructured data. Beyond writing queries to\n", + "\n", + "retrieve and transform all this data, the data engineering team must also be concerned\n", + "\n", + "with performance, because running these queries on an ongoing basis can be a big load on\n", + "\n", + "the system.\n", + "\n", + "Data engineers face the challenge of constant requests and ongoing business\n", + "\n", + "\n", + "###### W H AT I S \n", + " D E LTA L A K E ?\n", + "\n", + "Before thinking about ingestion into Delta Lake, it’s important to\n", + "\n", + "understand why ingesting into Delta Lake is the right solution in\n", + "\n", + "the first place. [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format data management\n", + "\n", + "layer that brings data warehouse capabilities to your open data\n", + "\n", + "lake. Across industries, enterprises have enabled true collaboration\n", + "\n", + "among their data teams with a reliable single source of truth\n", + "\n", + "enabled by Delta Lake. By delivering quality, reliability, security and\n", + "\n", + "performance on your data lake — for both streaming and batch\n", + "\n", + "operations — Delta Lake eliminates data silos and makes analytics\n", + "\n", + "accessible across the enterprise. With Delta Lake, customers can\n", + "\n", + "build a cost-efficient, highly scalable lakehouse that eliminates\n", + "\n", + "data silos and provides self-serving analytics to end users.\n", + "\n", + "\n", + "requirements, as well as an ever-changing ecosystem. As business requirements change,\n", + "\n", + "so do the requirements around schemas, necessitating custom code to handle the\n", + "\n", + "changes. With all of these challenges, the work of a data engineer is extremely critical, and\n", + "\n", + "increasingly complex, with many steps involved before getting data to a state where it can\n", + "\n", + "actually be queried by the business stakeholders. So how do data engineers get the data\n", + "\n", + "that each of these teams need at the frequency, with the freshness, and in the format\n", + "\n", + "required?\n", + "\n", + "\n", + "-----\n", + "\n", + "### Ingesting From Cloud Object Stores\n", + "\n", + "There are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n", + "\n", + "cloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. 
Typically, customers are looking to migrate\n", + "\n", + "existing tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,\n", + "\n", + "[COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) , and [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . We will focus on Auto Loader and COPY INTO here.\n", + "\n", + "\n", + "**Auto Loader**\n", + "\n", + "Auto Loader is an optimized data ingestion tool that incrementally and efficiently\n", + "\n", + "processes new data files as they arrive in cloud storage with minimal DevOps effort. You\n", + "\n", + "just need to provide a source directory path and start a streaming job. The new structured\n", + "\n", + "streaming source, called “cloudFiles”, will automatically set up file notification services that\n", + "\n", + "subscribe to file events from the input directory and process new files as they arrive, with the\n", + "\n", + "option of also processing existing files in that directory. Auto Loader has interfaces through\n", + "\n", + "Python and Scala, and can be used with SQL through Delta Live Tables.\n", + "\n", + "\n", + "**COPY INTO**\n", + "\n", + "COPY INTO is a SQL command that allows you to perform batch file ingestion into Delta\n", + "\n", + "Lake. COPY INTO is a command that ingests files with exactly-once semantics, best used\n", + "\n", + "when the input directory contains thousands of files or fewer, and the user prefers SQL.\n", + "\n", + "COPY INTO can be used over JDBC to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### COPY INTO\n", + "\n", + "\n", + "COPY INTO is a powerful yet simple SQL command that allows you to perform batch file\n", + "\n", + "ingestion into Delta Lake and perform many of the use cases outlined in this section. COPY\n", + "\n", + "INTO can be run once, in an ad hoc manner, and can be scheduled through Databricks jobs.\n", + "\n", + "While COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n", + "\n", + "events by using cloud functions such as AWS Lambda or through orchestrators like Apache\n", + "\n", + "Airflow. COPY INTO supports incremental appends and simple transformations.\n", + "\n", + "COPY INTO is a great command to use when your source directory contains a small number\n", + "\n", + "of files (i.e., thousands of files or less). To ingest a larger number of files, we recommend\n", + "\n", + "Auto Loader, which we will cover later in this eBook.\n", + "\n", + "**Common Use Cases for COPY INTO**\n", + "\n", + "**Ingesting data to a new Delta table**\n", + "\n", + "A common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n", + "\n", + "table. To copy data into a new Delta table, users can use the CREATE TABLE command first,\n", + "\n", + "followed by COPY INTO.\n", + "\n", + "Step 1: `CREATE TABLE my_table (id INT, name STRING, age INT);`\n", + "Step 2 1 : `COPY INTO my_table`\n", + "```\n", + " FROM ‘s3://my_bucket/my_path’ WITH (\n", + " CREDENTIAL (\n", + " AWS_ACCESS_KEY = ‘*****’,\n", + " AWS_SECRET_KEY = ‘*****’,\n", + " AWS_SESSION_TOKEN = ‘*****’\n", + " )\n", + " ENCRYPTION (\n", + " TYPE = ‘AWS_SSE_C’,\n", + " MASTER_KEY = ‘*****’\n", + " )\n", + " )\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS (‘header’ = ‘true’)\n", + "\n", + "```\n", + "\n", + "The code block above covers the AWS temporary in-line credential format. When you use\n", + "\n", + "in-line credentials in Azure and AWS, the following parameters are required for each type of\n", + "\n", + "credential and encryption:\n", + "\n", + "\n", + "|Credential Name|Required Parameters|\n", + "|---|---|\n", + "|AWS temporary credentials|AWS_ACCESS_KEY AWS_SECRET_KEY AWS_SESSION_TOKEN|\n", + "|Azure SAS token|AZURE_SAS_TOKEN|\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Encryption Name|Required Parameters|\n", + "|---|---|\n", + "|AWS server-side encryption with customer-provided encryption key|TYPE = ‘AWS_SSE_C’ MASTER_KEY|\n", + "|Azure client-provided encryption key|TYPE = ‘AZURE_CSE’ MASTER_KEY|\n", + "\n", + "\n", + "**Appending data to your Delta table**\n", + "\n", + "To append data to a Delta table, users can leverage the COPY INTO command. COPY INTO\n", + "\n", + "is a powerful SQL command that is idempotent and incremental. When using COPY INTO,\n", + "\n", + "users point to a location of files, and once those files are ingested, Delta Lake will keep\n", + "\n", + "1 If you only have temporary access to a cloud object store, you can use temporary in-line credentials to ingest data from\n", + "the cloud object store. When you are an admin or have ANY FILE access, and the instance profile has been set for the\n", + "cloud object store, you do not need to specify credentials in-line for COPY INTO.\n", + "\n", + "\n", + "-----\n", + "\n", + "track of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n", + "\n", + "get idempotency with COPY INTO, which means users are prevented from ingesting the\n", + "\n", + "same data twice to the same table.\n", + "```\n", + " COPY INTO table_identifier\n", + " FROM [ file_location | ( SELECT expression_list FROM file_location)]\n", + " FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n", + " [ FILES = [file_name [,...] | PATTERN = ‘regex_pattern’ ]\n", + " [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n", + " [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n", + "\n", + "```\n", + "One of the main benefits of COPY INTO is that users don’t have to worry about providing a\n", + "\n", + "schema, because the schema is automatically inferred from your data files. Here is a very\n", + "\n", + "simple example of how you would ingest data from CSV files that have headers, where you\n", + "\n", + "leave the tool to infer the schema and the proper data types. 
It’s as simple as that.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n", + "\n", + "```\n", + "**Using COPY INTO without an existing table** 2\n", + "\n", + "```\n", + " CREATE TABLE my_delta_table (dummy string);\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS (\n", + " ‘header’ = ‘true’ ,\n", + " ‘inferSchema’ = ‘true’ ,\n", + " ‘mergeSchema’ = ‘true’\n", + " )\n", + " COPY_OPTIONS ( ‘overwrite’ = ‘true’ , ‘overwriteSchema’ = ‘true’ )\n", + "\n", + "```\n", + "**Ingesting a CSV file without headers**\n", + "\n", + "If you are looking to ingest a CSV file that doesn’t have headers, columns will be named as\n", + "\n", + "_c0 or _c1, with the index of the column. You can use the double colon syntax to cast the\n", + "\n", + "data type that you want and then alias these columns to whatever you want to call them.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM ( SELECT\n", + " _c0::int as key,\n", + " _c1::double value,\n", + " _c2::timestamp event_time\n", + " FROM ‘s3://my-bucket/path/to/csv_files’ )\n", + " FILEFORMAT = CSV\n", + "\n", + "```\n", + "\n", + "In the most common case, in order to use COPY INTO, a table definition is required.\n", + "\n", + "However, if you would like to get started quickly and don’t have an existing table or require\n", + "\n", + "a specific schema, you can create your table with a dummy schema. Then, once you run\n", + "\n", + "COPY INTO, you can overwrite the table and overwrite the schema. COPY INTO will actually\n", + "\n", + "infer the data types, and then change your Delta table to have the required schema.\n", + "\n", + "2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving schema over time for CSV files** 3\n", + "\n", + "When ingesting CSV files that have a different number of columns than your existing table,\n", + "\n", + "you can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n", + "\n", + "as FORMAT_OPTIONS and COPY_OPTIONS. FORMAT_OPTIONS applies to the source data.\n", + "\n", + "Once “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n", + "\n", + "files and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n", + "\n", + "when you’re running the COPY INTO command. When “mergeSchema” is provided as a\n", + "\n", + "copy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n", + "\n", + "evolution only allows the addition of new columns. 
Data type changes for existing columns\n", + "\n", + "are not supported.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM (SELECT\n", + " _C0::int as key,\n", + " _C1::double value,\n", + " _C2::timestamp event_time,\n", + " ...\n", + " FROM ‘s3://my-bucket/path/to/csv_files’ )\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n", + " COPY_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n", + "\n", + "```\n", + "\n", + "**Fixing bad data**\n", + "\n", + "If you find that there is a mistake in the source data file and some of the data you ingested\n", + "\n", + "is bad, you can use RESTORE on your Delta table and set it to the timestamp or version of\n", + "\n", + "the Delta table that you want to roll back to (e.g., to restore to yesterday’s data). Then you\n", + "\n", + "can rerun your COPY INTO command.\n", + "\n", + "Alternatively, if running a RESTORE is not possible, COPY INTO supports reloading files by\n", + "\n", + "the use of the “force” copy option. You can manually remove the old data from your Delta\n", + "\n", + "Lake table by running a DELETE operation and then using COPY INTO with “force” = “true”.\n", + "\n", + "You can use the PATTERN keyword to provide a file name pattern, or you can specify the file\n", + "\n", + "names with the FILES keyword to reload a subset of files in conjunction with “force”.\n", + "```\n", + " RESTORE my_delta_table TO TIMESTAMP AS OF date_sub(current_date(),\n", + " 1);\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " PATTERN = ‘2021-09-08*.csv’\n", + " FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n", + " COPY_OPTIONS ( ‘force’ = ‘true’ )\n", + "\n", + "```\n", + "3 Limitation: schema evolution with “mergeSchema” in COPY_OPTIONS does not work in Databricks SQL workspace or\n", + "clusters enabled with table ACLs.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Auto Loader\n", + "\n", + "\n", + "While COPY INTO can solve a lot of the key use cases our customers face, due to its\n", + "\n", + "limitations (scalability), there are many scenarios where we recommend Auto Loader\n", + "\n", + "for data ingestion. Auto Loader is a data source on Databricks that incrementally and\n", + "\n", + "efficiently processes new data files as they arrive in cloud storage with minimal DevOps\n", + "\n", + "effort. Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n", + "\n", + "Auto Loader is an incremental streaming source that provides exactly-once ingestion\n", + "\n", + "guarantees. It keeps track of which files have been ingested using a durable key-value store.\n", + "\n", + "It can discover new files very efficiently and is extremely scalable. Auto Loader has been\n", + "\n", + "battle tested. We have seen customers running Auto Loader on millions of files an hour, and\n", + "\n", + "petabytes of data per day.\n", + "\n", + "To use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating\n", + "\n", + "that you will use Auto Loader to load files from the cloud object stores. 
Next, you specify\n", + "\n", + "the format of the file — for example, JSON — as an option to Auto Loader, and you specify\n", + "\n", + "where to load it from.\n", + "```\n", + " df = spark.readStream.format( “cloudFiles” )\n", + " .option( “cloudfiles.format” , “json” )\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "Under the hood, when data lands in your cloud storage, Auto Loader discovers files either\n", + "\n", + "through directory listing or file notifications. Given permissions to the underlying storage\n", + "\n", + "bucket or container, Auto Loader can list the directory that you want to load data from\n", + "\n", + "in an efficient and scalable manner and load data immediately. Alternatively, Auto Loader\n", + "\n", + "can also automatically set up file notifications on your storage account, which allows it\n", + "\n", + "\n", + "from queues, deduplicate these notifications using its key-value store and then process\n", + "\n", + "the underlying files. If there are any failures, Auto Loader will replay what hasn’t been\n", + "\n", + "processed, giving you exactly-once semantics.\n", + "\n", + "Directory listing mode is very easy to get started with. If your files are uploaded to your\n", + "\n", + "cloud storage system in a lexicographical order, Auto Loader will optimize the discovery of\n", + "\n", + "files by starting directory listing from the latest uploaded files, saving you both time and\n", + "\n", + "money. If files cannot be uploaded in a lexicographical order and you need Auto Loader\n", + "\n", + "to scale to high volumes, Databricks recommends using the file notification mode. Cloud\n", + "\n", + "services such as AWS Kinesis Firehose, AWS DMS and Azure Data Factory can be configured\n", + "\n", + "to upload files in a lexical order, typically by providing the upload time of records in the file\n", + "\n", + "path, such as /base/path/yyyy/MM/dd/HH/file.format.\n", + "\n", + "**Common Use Cases for Auto Loader**\n", + "\n", + "**New to Auto Loader**\n", + "\n", + "As a new user to the Databricks Lakehouse, you’ll want to ingest data from cloud object\n", + "\n", + "stores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n", + "\n", + "example using Python to demonstrate the ease and flexibility of Auto Loader with a few\n", + "\n", + "defined options. You can run the code in a notebook.\n", + "```\n", + " stream = spark.readStream \\\n", + " .format( “cloudFiles” ) \\\n", + " .option( “cloudFiles.format” , “csv” ) \\\n", + " .option( “cloudFiles.schemaLocation” , schema_location) \\\n", + " .load(raw_data_location)\n", + "\n", + "```\n", + "\n", + "to efficiently discover newly arriving files. When a file lands in file notification mode, the\n", + "\n", + "cloud storage system sends a notification to a queuing system. For example, in AWS, S3\n", + "\n", + "will send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n", + "\n", + "On Google, it’ll be sent to Pub/Sub. 
Auto Loader can then fetch these event notifications\n", + "\n", + "\n", + "-----\n", + "\n", + "In order to write to a Delta table from the stream, follow the example below:\n", + "```\n", + " stream.writeStream \\\n", + " .option( “mergeSchema” , “true” ) \\\n", + " .option( “checkpointLocation” , checkpoint_location) \\\n", + " .start(target_delta_table_location)\n", + "\n", + "```\n", + "**Migrating to Auto Loader**\n", + "\n", + "As a Spark user, you may be using an existing Spark structured streaming to process data.\n", + "\n", + "To migrate to Auto Loader, all a user needs to do is take existing streaming code and turn\n", + "\n", + "two lines of it into ‘cloudFiles’, specifying the file format within an option.\n", + "\n", + "\n", + "**Migrating a livestreaming pipeline**\n", + "\n", + "Migrating a livestreaming pipeline can be challenging, but with Auto Loader, as with COPY\n", + "\n", + "INTO, you can specify a timestamp when the source files are updated or created and Auto\n", + "\n", + "Loader will ingest all modified data after that point.\n", + "```\n", + " df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.format” , “json” )\n", + " .option( “modifiedAfter” , “2021-09-09 00:00:00” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "**Schema inference and evolution**\n", + "\n", + "Auto Loader provides schema inference and management capabilities. With a schema\n", + "\n", + "location specified, Auto Loader can store the changes to the inferred schema over time. For\n", + "\n", + "file formats like JSON and CSV, where the schemas can get fuzzy, schema inference on Auto\n", + "\n", + "Loader can automatically infer data types or treat everything as a string.\n", + "\n", + "When data does not match your schema (e.g., an unknown column or format), Auto Loader\n", + "\n", + "has a data rescue capability that will “rescue” all data in a separate column, stored as a\n", + "\n", + "JSON string, to investigate later. See [rescued data column](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader-schema.html#rescued-data-column) for more details.\n", + "\n", + "Auto Loader supports three schema evolution modes: add new columns as they are\n", + "\n", + "discovered, fail if an unexpected column is seen, or rescue new columns.\n", + "\n", + "```\n", + "df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.\n", + "format” , “json” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "```\n", + "df = spark.readStream\n", + " .format( “json” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "\n", + "Once it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n", + "\n", + "Loader can scale to trillions of files, unlike the open-source file streaming source. One of\n", + "\n", + "the ways that Auto Loader does this is with asynchronous backfills. 
Instead of needing\n", + "\n", + "to discover files first, then plan, Auto Loader discovers and processes files concurrently,\n", + "\n", + "making it much more efficient and leading to cost reductions in compute resources.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fixing a file that was processed with Auto Loader**\n", + "\n", + "To fix a file that was already processed, Auto Loader supports an option called\n", + "\n", + "‘allowOverwrites’. With this option, Auto Loader can re-ingest and reprocess a file with a\n", + "\n", + "new timestamp. If you want to enable this option in an existing Auto Loader stream, you\n", + "\n", + "need to stop and restart the Auto Loader stream with the enabled option.\n", + "```\n", + " df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.format” , “json” )\n", + " .schema(schema)\n", + " .option( “cloudFiles.allowOverwrites” , “true” )\n", + " .options(format_options)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "**Discover missing data**\n", + "\n", + "While event notification is a very scalable method to collect all data, it relies on cloud\n", + "\n", + "services, which are distributed systems and are not always reliable. With Auto Loader, you\n", + "\n", + "can additionally specify a backfill interval, where Auto Loader will perform asynchronous\n", + "\n", + "backfills at whatever interval you set up. This can be enabled with a once trigger,\n", + "\n", + "processing time trigger and available now trigger. The following example shows how to use\n", + "\n", + "backfill interval and trigger availableNow together:\n", + "\n", + "```\n", + " df = spark.readStream\n", + " .format(“cloudFiles”)\n", + " .option(“cloudFiles.format”, “json”)\n", + " .schema(schema)\n", + " .option( “cloudFiles.backfillInterval” , “1 week” )\n", + " .options(format_options)\n", + " .load(“/path/to/table”)\n", + " .writeStream\n", + " .trigger(Trigger.AvailableNow())\n", + " .option(“checkpointLocation”, checkpointDir)\n", + " .start()\n", + "\n", + "```\n", + "The trigger tells Auto Loader how frequently to process incoming data. A processing time\n", + "\n", + "trigger will have Auto Loader run continuously and schedule micro-batches at the trigger\n", + "\n", + "interval which you have set. The “Once” and “AvailableNow” triggers instruct Auto Loader to\n", + "\n", + "process all new data that has been added until the start of your application. Once the data\n", + "\n", + "is processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n", + "\n", + "process all the new data in a single micro-batch, which requires it to first discover all the\n", + "\n", + "new files. With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n", + "\n", + "and perform rate limiting, which makes it a preferable alternative to Trigger Once.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Using Auto Loader in SQL with Delta Live Tables**\n", + "\n", + "Delta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n", + "\n", + "framework to develop, test, monitor, manage and operationalize data pipelines at scale to\n", + "\n", + "drive insights for data science, machine learning and analytics. 
Auto Loader is available in\n", + "\n", + "Delta Live Tables.\n", + "\n", + "```\n", + "CREATE INCREMENTAL LIVE TABLE\n", + " autoloader_test\n", + "AS\n", + "SELECT\n", + " *,\n", + " id + id2 AS new_id\n", + "FROM\n", + " CLOUD_FILES (\n", + " “some/cloud/path” , – the path to the data\n", + " “json” – the file format\n", + " );\n", + "\n", + "```\n", + "\n", + "**Live Tables understands**\n", + "\n", + "**and coordinates data flow**\n", + "\n", + "**between your queries**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Ingesting Data From External Applications\n", + "\n", + "While Auto Loader and COPY INTO are powerful tools, not all data is available as files\n", + "\n", + "in cloud object stores. In order to enable a lakehouse, it is critical to incorporate all of\n", + "\n", + "your data and break down the silos between sources and downstream teams. To do this,\n", + "\n", + "customers need to discover and connect a broad set of data, BI and AI tools, and systems\n", + "\n", + "to the data within their lakehouse.\n", + "\n", + "##### Partner Connect\n", + "\n", + "Historically, stitching multiple enterprise tools and data sources together has been a burden\n", + "\n", + "on the end user, making it very complicated and expensive to execute at any scale. Partner\n", + "\n", + "Connect solves this challenge by making it easy for you to integrate data, analytics and AI\n", + "\n", + "tools directly within their Databricks Lakehouse. It also allows you to discover new, pre-\n", + "\n", + "validated solutions from Databricks partners that support your expanding analytics needs.\n", + "\n", + "To ingest into the lakehouse, select the partner tile in Partner Connect via the left\n", + "\n", + "navigation bar in Databricks. Partner Connect will automatically configure resources such\n", + "\n", + "as clusters, tokens and connection files for you to connect with your data ingestion tools\n", + "\n", + "of choice. You can finish signing up for a trial account on the partner’s website or directly\n", + "\n", + "log in if you already used Partner Connect to create a trial account. Once you log in, you will\n", + "\n", + "see that Databricks is already configured as a destination in the partner portal and ready\n", + "\n", + "to be used.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Common Use Case for Partner Connect**\n", + "\n", + "**Ingest Salesforce data via Fivetran into Delta Lake**\n", + "\n", + "Clicking on the Fivetran tile in Partner Connect starts an automated workflow between\n", + "\n", + "the two products. Databricks automatically provisions a SQL endpoint and associated\n", + "\n", + "credentials for Fivetran to interact with, and passes the user’s identity and the SQL\n", + "\n", + "\n", + "endpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n", + "\n", + "Databricks destination is automatically created. This destination is configured to ingest into\n", + "\n", + "Delta via the SQL endpoint that was auto-configured by Partner Connect.\n", + "\n", + "The customer now selects their choice of data source in Fivetran from hundreds of pre-\n", + "\n", + "built connectors — for example, Salesforce. The user authenticates to the Salesforce\n", + "\n", + "source, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "(in this case the Account & Contact objects) and starts the initial sync. 
This automation\n", + "\n", + "has saved users dozens of manual steps and copying/pasting of configuration if they\n", + "\n", + "manually set up the connection. It also protects the user from making any unintentional\n", + "\n", + "configuration errors and spending time debugging those errors. The Salesforce tables\n", + "\n", + "are now available to query, join and analyze in Databricks SQL. Watch the [demo](https://databricks.com/partnerconnect#partner-demos) for more\n", + "\n", + "details or check out the [Partner Connect guide](https://docs.databricks.com/integrations/partner-connect/index.html?_gl=1*1mz2ts6*_gcl_aw*R0NMLjE2MzY2NzU1NDcuQ2p3S0NBaUFtN09NQmhBUUVpd0FydkdpM0ZHS3ptZTR5Z2YzR3E4ajVrYTNaUExOUEFnaTZIMnNRU05EMC1RYzl0dGxXQjl6ajRuNU14b0N0OGdRQXZEX0J3RQ..&_ga=2.83627156.328510291.1641248936-1825366797.1612985070) to learn more.\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 5,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n", + "\n", + "unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the\n", + "\n", + "globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a\n", + "\n", + "mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , LinkedIn and Facebook .\n", + "\n", + "\n", + "-----
# 2023 State\n", + " of Data + AI\n", + "```\n", + "Powered by the Databricks Lakehouse\n", + "\n", + "```\n", + "2023 STATE OF DATA + AI\n", + "\n", + "\n", + "-----\n", + "\n", + "We’re in the golden age of data and AI\n", + "\n", + "\n", + "-----\n", + "\n", + "INTRO\n", + "\n", + "In the 6 months since ChatGPT launched, the world has woken up to the vast potential\n", + "of AI. The unparalleled pace of AI discoveries, model improvements and new products\n", + "on the market puts data and AI strategy at the top of conversations across every\n", + "organization around the world. We believe that AI will usher in the next generation of\n", + "product and software innovation, and we’re already seeing this play out in the market.\n", + "The next generation of winning companies and executives will be those who understand\n", + "and leverage AI.\n", + "\n", + "In this report, we examine patterns and trends in data and AI adoption across more\n", + "than 9,000 global Databricks customers. By unifying business intelligence (BI) and AI\n", + "applications across companies’ entire data estates, the Databricks Lakehouse provides\n", + "a unique vantage point into the state of data and AI, including which products and\n", + "technologies are the fastest growing, the types of data science and machine learning\n", + "(DS/ML) applications being developed and more.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Here are the major stories we uncovered:\n", + "\n", + "```\n", + "\n", + "Companies are adopting\n", + "machine learning and large\n", + "language models (LLMs)\n", + "at a rapid pace. Natural\n", + "language processing (NLP)\n", + "is dominating use cases,\n", + "with an accelerated focus\n", + "on LLMs.\n", + "\n", + "\n", + "Organizations are investing in\n", + "data integration products as\n", + "they prioritize more DS/ML\n", + "initiatives. 
50% of our fastest-growing products represent the data integration category.

Organizations are increasingly using the Lakehouse for data warehousing, as evidenced by the high growth of data integration tools dbt and Fivetran, and the accelerated adoption of Databricks SQL.

We hope that by sharing these trends, data leaders will be able to benchmark their organizations and gain insights that help inform their strategies for an era defined by data and AI.

-----

```
Summary of Key Findings

 1  DATA SCIENCE AND MACHINE LEARNING:
    NLP AND LLMS ARE IN HIGH DEMAND
```
**•** The number of companies using SaaS LLM APIs (used to access services like ChatGPT) has grown 1310% between the end of November 2022 and the beginning of May 2023

**•** NLP accounts for 49% of daily Python data science library usage, making it the most popular application

**•** Organizations are putting substantially more models into production (411% YoY growth) while also increasing their ML experimentation (54% YoY growth)

**•** Organizations are getting more efficient with ML; for every three experimental models, roughly one is put into production, compared to five experimental models a year prior

-----

```
FASTEST-GROWING DATA AND AI PRODUCTS
```
BI is the top data and AI market, but growth trends in other markets show that companies are increasingly looking at more advanced data use cases

The fastest-growing data and AI product is dbt, which grew 206% YoY by number of customers

Data integration is the fastest-growing data and AI market on the Databricks Lakehouse with 117% YoY growth

```
ADOPTION AND MIGRATION TRENDS
```
61% of customers migrating to the Lakehouse are coming from on-prem and cloud data warehouses

The volume of data in Delta Lake has grown 304% YoY

The Lakehouse is increasingly being used for data warehousing, including serverless data warehousing with Databricks SQL, which grew 144% YoY

-----

```
Methodology: How did Databricks create this report?
```
The _2023 State of Data + AI_ is built from fully aggregated, anonymized data collected from our customers based on how they are using the Databricks Lakehouse and its broad ecosystem of integrated tools. This report focuses on machine learning adoption, data architecture (integrations and migrations) and use cases. 
The customers in this report represent every major industry and range in size from startups to many of the world’s largest enterprises.

Unless otherwise noted, this report presents and analyzes data from February 1, 2022, to January 31, 2023, and usage is measured by number of customers. When possible, we provide YoY comparisons to showcase growth trends over time.

-----

```
Data Science and Machine Learning
NATURAL LANGUAGE PROCESSING AND LARGE LANGUAGE MODELS ARE IN HIGH DEMAND
```
Across all industries, companies leverage data science and machine learning (DS/ML) to accelerate growth, improve predictability and enhance customer experiences. Recent advancements in large language models (LLMs) are propelling companies to rethink AI within their own data strategies. Given the rapidly evolving DS/ML landscape, we wanted to understand several aspects of the market:

- Which types of DS/ML applications are companies investing in? In particular, given the recent buzz, what does the data around LLMs look like?

- Are companies making headway on operationalizing their machine learning models (MLOps)?

-----

```
SPECIALIZED PYTHON DS/ML LIBRARIES FROM FEBRUARY 2022 TO JANUARY 2023
```
[Chart categories: Natural Language Processing, Simulations & Optimizations, Recommender Systems, Time Series, Speech Recognition, Industry Data Modeling, Graph, Geospatial, Computer Vision, Anomaly Detection & Segmentation]

Note: This chart reflects the unique number of notebooks using ML libraries per day in each of the categories. It includes libraries used for the particular problem-solving use cases mentioned. It does not include libraries used in tooling for data preparations and modeling.

-----

```
Natural language processing dominates machine learning use cases
```

To understand how organizations are applying AI and ML within the Lakehouse, we aggregated the usage of specialized Python libraries, which include NLTK, Transformers and FuzzyWuzzy, into popular data science use cases.¹ We look at data from these libraries because Python is on the cutting edge of new developments in ML, advanced analytics and AI, and has consistently ranked as one of the [most popular programming languages](https://www.tiobe.com/tiobe-index/) in recent years.

Our most popular use case is natural language processing (NLP), a rapidly growing field that enables businesses to gain value from unstructured textual data. 
This opens the door for users to accomplish tasks that were previously too abstract for code, such as summarizing content or extracting sentiment from customer reviews. In our data set, 49% of libraries used are associated with NLP. LLMs also fall within this bucket. Given the innovations launched in recent months, we expect to see NLP take off even more in coming years as it is applied to use cases like chatbots, research assistance, fraud detection, content generation and more.

```
 In our data set, 49% of specialized Python libraries used are associated with NLP
```
Our second most popular DS/ML application is simulations and optimization, which accounts for 30% of all use cases. This signals organizations are using data to model prototypes and solve problems cost-effectively.

Many of the DS/ML use cases are predominantly leveraged by specific industries. While they take up a smaller share of the total, they are mission-critical for many organizations. For example, time series includes forecasting, a use case that is especially popular in industries such as Retail and CPG, which rely heavily on the ability to forecast the need for every item in every store.

1. This data does not include general-purpose ML libraries, including scikit-learn or TensorFlow.

-----

```
USE OF LARGE LANGUAGE MODELS (LLMS)
```
We have rolled these libraries up into groupings based on the type of functionality they provide.

[Chart: adoption of transformer-related libraries, SaaS LLM APIs and LLM tools, February 2022 to May 2023, annotated with the ChatGPT launch (November 2022) and the Dolly launch (March 24, 2023). Note: There are several popular types of Python libraries that are commonly used for LLMs. These libraries provide pretrained models and tools for building, training and deploying LLMs.]

-----

```
Large language models are the “it” tool
```
LLMs are currently one of the hottest and most-watched areas in the field of NLP. 
LLMs have been instrumental in enabling machines to understand, interpret and generate human language in a way that was previously impossible, powering everything from machine translation to content creation to virtual assistants and chatbots.

Transformer-related libraries have been growing in popularity even before ChatGPT thrust LLMs into the public consciousness. Within the last 6 months, our data shows two accelerating trends: organizations are building their own LLMs, which models like [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) show can be quite accessible and inexpensive, and they are using proprietary models like ChatGPT. Transformer-related libraries, such as Hugging Face, which are used to train LLMs, have the highest adoption within the Lakehouse.

The second most popular type is SaaS LLMs, which are used to access models from providers like OpenAI. This category has grown exponentially in parallel with the [launch of ChatGPT](https://openai.com/blog/chatgpt): the number of Lakehouse customers using SaaS LLMs has grown an impressive 1310% between the end of November 2022 and the beginning of May 2023. (In contrast, transformer-related libraries grew 82% in this same period.)

Organizations can leverage LLMs either by using SaaS LLM APIs to call services like ChatGPT from OpenAI or by operating their own LLMs in-house.

Thinking of building your own modern LLM application? This approach could entail the use of specialized transformer-related Python libraries to train the model, as well as LLM tools like LangChain to develop prompt interfaces or integrations to other systems.
```
LLM DEFINITIONS
```
**◊** **Transformer-related libraries:** Python libraries used to train LLMs (example: Hugging Face)

**◊** **SaaS LLM APIs:** Libraries used to access LLMs as a service (example: OpenAI)

**◊** **LLM tools:** Toolchains for working with and building proprietary LLMs (example: LangChain)

-----

```
Machine learning experimentation and production take off across industries
```

The increasing demand for ML solutions and the growing availability of technologies have led to a significant increase in experimentation and production, two distinct parts of the ML model lifecycle. We look at the _logging_ and _registering_ of models in MLflow, an open source platform developed by Databricks, to understand how ML is trending and being adopted within organizations.
```
LOGGED MODELS AND ML EXPERIMENTATION
```
During the experimentation phase of ML, data scientists develop models designed to solve given tasks. After training the models, they test them to evaluate their accuracy, precision, recall (the percentage of correctly predicted positive instances out of all actual positive instances) and more. 
These metrics are logged (recorded) in order to analyze the various models’ performance and identify which approach works best for the given task.

We have chosen logged models as a proxy to measure ML experimentation because the MLflow Tracking Server is designed to facilitate experiment tracking and reproducibility.

MLflow Model Registry launched in May 2021. Overall, the number of logged models has grown 54% since February 2022, while the number of registered models has grown 411% over the same period. This growth in volume suggests organizations are understanding the value of investing in and allocating more people power to ML.
```
REGISTERED MODELS AND ML PRODUCTION
```
Production models have undergone the experimentation phase and are then deployed in real-world applications. They are typically used to make predictions or decisions based on new data. Registering a model is the process of recording and storing metadata about a trained model in a centralized location that allows users to easily access and reuse existing models. Registering models prior to production enables organizations to ensure consistency and reliability in model deployment and scale.

We have chosen registered models to represent ML production because the MLflow Model Registry is designed to manage models that have left the experimentation phase through the rest of their lifecycle.

-----

Data scientists typically experiment with many models before committing an ML model to production. We wanted to understand, “How many models do data scientists experiment with before moving to production?”

Our data shows the ratio of logged to registered models is 2.9 : 1 as of January 2023. This means that for roughly every three experimental models, one model will get registered as a candidate for production. This ratio has improved significantly from just a year prior, when roughly five models were logged for every one that was registered. Recent advances in ML, such as improved open source libraries like MLflow and Hugging Face, have radically simplified building and putting models into production. The result is that 34% of logged models are now candidates for production today, an improvement from over 20% just a year ago.

[Chart: RATIO OF LOGGED VS. REGISTERED MODELS — number of models per month, February 2022 to January 2023. Ratio of logged to registered models in Jan 2023: 2.9 : 1]
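To make the distinction between logging and registering concrete, here is a minimal sketch of that MLflow workflow, assuming a scikit-learn model; the experiment path and registered-model name are illustrative, not taken from the report.

```python
# Minimal sketch of logging (experimentation) vs. registering (production candidate).
# Experiment path, metric and registered-model name are illustrative assumptions.
import mlflow
import mlflow.sklearn
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=500, n_features=10, random_state=42)

mlflow.set_experiment("/Shared/churn-experiments")  # assumed experiment path

# Experimentation: each run *logs* a candidate model plus its metrics
# to the MLflow Tracking Server.
with mlflow.start_run() as run:
    model = LogisticRegression(max_iter=200).fit(X, y)
    mlflow.log_metric("train_accuracy", model.score(X, y))
    mlflow.sklearn.log_model(model, artifact_path="model")

# Production path: the chosen candidate is *registered* in the Model Registry.
mlflow.register_model(
    model_uri=f"runs:/{run.info.run_id}/model",
    name="churn_classifier",  # assumed registered-model name
)
```

Logged runs are what the experimentation numbers above count; the `register_model` step is what moves a candidate into the Model Registry and is what the 2.9 : 1 ratio measures.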
-----

```
The Modern Data and AI Stack
```
Over the last several years, the trend toward building open, unified data architectures has played out in our own data. We see that data leaders are opting to preserve choice, leverage the best products and deliver innovation across their organizations by democratizing access to data for more people.

-----

```
FASTEST-GROWING DATA AND AI PRODUCTS
Year-Over-Year Growth by Number of Customers

dbt                    206%
Fivetran               181%
Informatica            174%
Qlik Data Integration  152%
Esri                   145%
Looker                 141%
Hugging Face           110%
Lytics                 101%
Great Expectations     100%
Kepler.gl               95%
```

-----

```
DBT IS THE FASTEST-GROWING DATA AND AI PRODUCT OF 2023
```
As companies move quickly to develop more advanced use cases with their data, they are investing in newer products that produce trusted data sets for reporting, ML modeling and operational workflows. Hence, we see the rapid rise of data integration products. dbt, a data transformation tool, and Fivetran, which automates data pipelines, are our two fastest-growing data and AI products. This suggests a new era of the data integration market with challenger tools making headway as companies shift to prioritize DS/ML initiatives. With Great Expectations from Superconductive in the ninth spot, a full 50% of our fastest-growing products represent the data integration category.

-----

[Chart: GROWTH OF DATA AND AI MARKETS — number of customers per month, February 2022 to January 2023, by category: Business Intelligence; Data Governance & Security; Data Science & Machine Learning; Data Integration]

Note: In this chart, we count the number of customers deploying one or more data and AI products in each category. 
These four categories do not encompass all products; Databricks products such as Unity Catalog are not included in this data.

-----

```
Business intelligence is standard; organizations invest in their machine learning foundation
```

To understand how organizations are prioritizing their data initiatives, we aggregated all data and AI products on the Databricks Lakehouse and categorized them into four core markets: BI, data governance and security, DS/ML, and data integration. Our data set confirms that BI tools are more widely adopted across organizations relative to more nascent categories — and they continue to grow, with a 66% YoY increase in adoption. This aligns with the broader trend of more organizations performing data warehousing on a Lakehouse, covered in the next section, Views from the Lakehouse.

While BI is often where organizations start their data journey, companies are increasingly looking at more advanced data and AI use cases.
```
DEMAND FOR DATA INTEGRATION PRODUCTS IS GROWING FAST
```
We see the fastest growth in the data integration market. These tools enable a company to integrate vast amounts of upstream and downstream data in one consolidated view. Data integration products ensure that all BI and DS/ML initiatives are built on a solid foundation.

While it’s easier for smaller markets to experience faster growth, at 117% YoY increased adoption, the data integration market is growing substantially faster than BI. This trend dovetails with the rapid growth of ML adoption we see across the Lakehouse, covered in the DS/ML section of the report.

```
Data integration is the fastest-growing market, with 117% YoY growth
```

-----

```
Views from the Lakehouse
MIGRATION AND DATA FORMAT TRENDS
```
Data migration is a major undertaking: it can be risky, expensive and delay companies’ timelines. It’s not a task to jump into lightly. As organizations run into the limitations, scalability challenges and the cost burden of legacy data platforms, they are increasingly likely to migrate to a new type of architecture.

-----

```
Migration trends: the best data warehouse is a Lakehouse
```
The Lakehouse Platform is an attractive alternative to traditional data warehouses because it supports advanced use cases and DS/ML, allowing organizations to boost their overall data strategy. As evidenced by the most popular data and AI products, with BI and data integration tools at the top, organizations are increasingly using the data lakehouse for data warehousing. To better understand which legacy platforms organizations are moving away from, we look at the migrations of new customers to Databricks.

An interesting takeaway is that roughly half of the companies moving to the Lakehouse are coming from data warehouses. 
This includes the 22% that are moving from cloud data warehouses. It also demonstrates a growing focus on running data warehousing workloads on a Lakehouse and unifying data platforms to reduce cost.

```
SOURCE OF NEW CUSTOMER MIGRATIONS TO DATABRICKS
```
[Chart: segments of 39%, 27%, 22% and 12%]

-----

```
Rising tides: the volume of data in Delta Lake has grown 304% YoY
```
As the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf), an increasingly large proportion is in the form of semi-structured and unstructured data. Previously, organizations had to manage multiple different platforms for their structured, unstructured and semi-structured data, which caused unnecessary complexity and high costs. The Lakehouse solves this problem by providing a unified platform for all data types and formats.

Delta Lake is the foundation of the Databricks Lakehouse. The Delta Lake format encompasses structured, unstructured and semi-structured data. Use has surged over the past 2 years. When compared to the steady, flat or declining growth in other storage formats (e.g., text, JSON and CSV), our data shows that a growing number of organizations are turning to Delta Lake to manage their data. In June 2022, Delta Lake surpassed Parquet as the most popular data lake source, reaching 304% YoY growth.

[Chart: VOLUME OF DATA MANAGED, BY STORAGE FORMAT — volume of data, January 2019 to January 2023, for Delta, Parquet, Text, ORC, CSV, JSON and Avro]
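For readers unfamiliar with the format difference behind these numbers, here is a minimal PySpark sketch of writing the same records as plain Parquet and as a Delta table; the paths, catalog and table name are illustrative and it assumes a Spark session with Delta Lake support, such as a Databricks cluster.

```python
# Minimal sketch: the same DataFrame written as Parquet files vs. as a Delta table.
# Paths and the table name are illustrative assumptions.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

df = spark.createDataFrame(
    [("2023-01-31", "dbt", 206), ("2023-01-31", "Fivetran", 181)],
    ["as_of_date", "product", "yoy_growth_pct"],
)

# Plain Parquet: columnar data files only, no transaction log.
df.write.mode("overwrite").parquet("/tmp/growth_parquet")

# Delta: the same Parquet data files plus a transaction log, which is what
# enables ACID updates, time travel and schema enforcement on the table.
df.write.format("delta").mode("overwrite").saveAsTable("main.default.product_growth")

# Downstream SQL/BI tools (e.g., Databricks SQL) can query the table directly.
spark.sql("SELECT product, yoy_growth_pct FROM main.default.product_growth").show()
```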
-----

```
Data warehousing is growing, with emphasis on serverless
```

Over the past 2 years, companies have vastly increased their usage of data warehousing on the Lakehouse Platform. This is especially demonstrated by use of Databricks SQL — the serverless data warehouse on the Lakehouse — which shows 144% YoY growth. This suggests that organizations are increasingly ditching traditional data warehouses and are able to perform all their BI and analytics on a Lakehouse.

[Diagram: Data Warehouse vs. Lakehouse Platform]

[Chart: DATA WAREHOUSING ON LAKEHOUSE WITH DATABRICKS SQL — number of customers, January 2021 to January 2023. Note: There is a spike in October 2021 as a result of the ungated preview launch of Databricks SQL, followed by General Availability in December 2021. Data consistently dips in the last week of December due to seasonality.]

-----

CONCLUSION
```
Generation AI
```
We’re excited that companies are progressing into more advanced ML and AI use cases, and the modern data and AI stack is evolving to keep up. Along with the rapid growth of data integration tools (including our fastest growing, dbt), we’re seeing the rapid rise of NLP and LLM usage in our own data set, and there’s no doubt that the next few years will see an explosion in these technologies. It’s never been more clear: the companies that harness the power of DS/ML will lead the next generation of data.

-----

```
About Databricks
```
Databricks is the data and AI company. More than 9,000 organizations worldwide — including Comcast, Condé Nast, and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on Twitter, LinkedIn and Instagram.

[DISCOVER LAKEHOUSE](https://www.databricks.com/product/data-lakehouse)

© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation | Terms of Use

-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf2024-09-19T16:57:20Z
**eBook**\n", + "\n", + "# Making Your Digital Twin Come to Life\n", + "\n", + "##### With the Lakehouse for Manufacturing and Tredence\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "Introduction ................................................................................................................................................................................................................ **03**\n", + "\n", + "Digital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\n", + "\n", + "What Are Digital Twins? ........................................................................................................................................................................................ **07**\n", + "\n", + "Digital Twin Architectures .................................................................................................................................................................................. **08**\n", + "\n", + "How to Build a Digital Twin ................................................................................................................................................................................ **09**\n", + "\n", + "Why Is Manufacturing Struggling With Data and AI? ............................................................................................................................ **12**\n", + "\n", + "Why Databricks for Digital Twins? ................................................................................................................................................................... **13**\n", + "\n", + "Why Tredence for Digital Twins? ...................................................................................................................................................................... **14**\n", + "\n", + "Using Digital Twins to Drive Insights .............................................................................................................................................................. **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "The concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\n", + "over 25 years ago, during the early phases of foundation and cofferdam construction for the\n", + "London Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\n", + "the years since this first application, edge computing, AI, data connectivity, 5G connectivity\n", + "and the improvements of the Internet of Things (IoT) have enabled digital twins to become\n", + "cost-effective and are now an imperative in today’s data-driven businesses.\n", + "\n", + "Today’s manufacturing industries are expected to streamline and optimize all the processes in their value\n", + "chain from product development and design, through operations and supply chain optimization to obtaining\n", + "feedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad and is addressing a multitude of challenges within manufacturing, logistics and transportation.

[In a case study published in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf) “profit margins increased and manufacturing time was reduced when digital-twin technology was implemented. Automobile manufacturing profit margins increased by 41% to 54% per model. The estimated average automobile manufacturing time was reduced to approximately 10 hours.”

**[Digital twins accelerate potential revenue increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**

# 10%

**[Time to market accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**

# 50%

**[Product quality improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**

# 25%

-----

**Introduction (continued)**

**Digital twin market growth rate accelerates**

Digital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html) is forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020 at a CAGR of 58%, riding on the wave of Industry 4.0.

**But challenges remain**

The most common challenges faced by the manufacturing industry that digital twins are addressing include:

**•** Product designs are more complex, resulting in higher cost and increasingly longer development times

**•** The supply chain is opaque

**•** Production lines are not optimized – performance variations, unknown defects and the projection of operating cost are obscure

**•** Poor quality management – overreliance on theory, managed by individual departments

**•** Reactive maintenance costs are too high, resulting in excessive downtime or process disruptions

**•** Incongruous collaborations between departments

**•** Invisibility of customer demand for gathering real-time feedback

The growth rate for digital twins is staggering, with common adoption reported to be in the 25-40% CAGR range.

-----

### Digital Twins Bring Broad Benefits to Manufacturing

Industry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that would have come at significant costs without digital twin technology.

**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**

**•** Product design and development is performed with less cost and is completed in less time as iterative simulations, using multiple constraints, deliver the best or most optimized design. All commercial aircraft are designed using digital twins.

**•** Digital twins provide the awareness of how long inventory will last, when to replenish and how to minimize supply chain disruptions. The oil and gas industry, for example, uses supply chain–oriented digital twins to reduce supply chain bottlenecks in storage and midstream delivery, schedule tanker off-loads and model demand with externalities.

**•** Continuous quality checks on produced items with ML/AI-generated feedback pre-emptively assuring improved product quality. 
Final paint inspection in the automotive industry, for example, is performed with computer vision built on top of digital twin technology.

**•** Striking the sweet spot between when to replace a part before the process degrades or breaks down and utilizing the components to their fullest, digital twins provide manufacturers with real-time feedback. Digital twins are the backbone of building an asset performance management suite.

**•** Digital twins create the opportunity to have multiple departments in sync by providing necessary instructions modularly to attain a required throughput. Digital twins are the backbone of kaizen events that optimize manufacturing process flow.

**•** Customer feedback loops can be modeled through inputs, from point-of-sale customer behavior, buying preferences, or product performance, and then integrated into the product development process, forming a closed loop that provides an improved product design.

-----

**Digital Twins Bring Broad Benefits to Manufacturing (continued)**

The top four use cases are heavily focused on operational processes and are typically the first to be deployed in manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in deployment, but typically offer higher and longer-lasting value.

**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**

Improve product quality **34%**
Reduce manufacturing costs **30%**
Reduce unplanned downtime **28%**
Increase throughput **25%**
Ensure safe manufacturing **24%**
Test new design ideas **16%**
Develop product enhancements **14%**
Digital transformation of enterprise **13%**
Speed new product introduction **13%**
Reduce planned downtime **11%**
Meet new regulatory challenges **10%**
Training for new manufacturing processes **8%**
Design changes to production line **8%**
Provide service to end users/customers **5%**
Update products in the field **1%**

Can you imagine the cost to change an oil refinery’s crude distillation unit process conditions to improve the output of diesel one week and gasoline the next to address changes in demand and ensure maximum economic value? Can you imagine how to replicate even a simple supply chain to model risk?

-----

### What Are Digital Twins?

Knowing the business challenges and benefits digital twins deliver, let’s turn to the basics and explore what digital twins are and how a modern data stack is necessary to build effective and timely digital twins. 
The classic definition of a digital twin is: “[A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin).”

For a discrete or continuous manufacturing process, a digital twin gathers system and process state data with the help of various IoT sensors [operational technology (OT) data] and enterprise data [information technology (IT) data] to form a virtual model, which is then used to run simulations, study performance issues and generate possible insights.

**Types of Digital Twins**

-----

### Digital Twin Architectures

Classic digital twins have been physics-based models of specific systems. More recently, **data-driven digital twins, which work on real-time system data, are gaining prominence**.

These twins provide the opportunity to not just monitor and simulate system performance under specific conditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into the industrial environment.

Digital twins undergo a series of changes during their lifecycle to become completely autonomous.

**Data-Driven Operational Digital Twins: Maturity Journey**

[Diagram: IoT, Edge/Cloud and ERP data feed digital twins, which mature through three stages:]

Monitor & Alert: real-time operations monitoring and alerting

Predict & Diagnose: predictive maintenance, process improvements and root causing

Simulate & Optimize (AI): identify the next best action and integrate with actuation systems

**[Digital twins have reduced automotive product design lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**

# 6-8 years to 18-24 months

**[Digital warehouse design lets companies test and learn using a digital twin, which can improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**

# 20% to 25%
-----

### How to Build a Digital Twin

A data architecture capability is needed to capture and collect the ever-expanding volume and variety of data streaming in real time from industrial protocols such as ABB Total Flow, Allen Bradley, Emerson, Fanuc, GE, Hitachi and Mitsubishi.

Data collection, data analytics, application enablement and data integration orchestrate the time-series data stream and transfer it to the cloud. Azure IoT Hub is used to securely ingest data from edge to cloud.

Cloud infrastructure and analytics capabilities are offered within the flexibility of the cloud. Azure Digital Twins is used to model and visualize process workflows. Databricks MLflow and Delta Lake scale to deliver real-time predictive analytics.

**Digital Twins: Technical Architecture**
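To make that edge-to-cloud flow concrete, here is a minimal Structured Streaming sketch that lands IoT telemetry in a Delta table ready for MLflow-based models; the endpoint, topic, schema and table names are illustrative assumptions, and in practice the Azure IoT Hub / Event Hubs connector configuration (including authentication) replaces the bare Kafka options shown.

```python
# Minimal sketch (illustrative names): stream plant telemetry into a Delta table
# that downstream MLflow models and dashboards can read in near real time.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark.sql.types import StructType, StructField, StringType, DoubleType, TimestampType

spark = SparkSession.builder.getOrCreate()

schema = StructType([
    StructField("asset_id", StringType()),
    StructField("temperature_c", DoubleType()),
    StructField("vibration_mm_s", DoubleType()),
    StructField("event_time", TimestampType()),
])

# Azure Event Hubs exposes a Kafka-compatible endpoint; the bootstrap server
# and topic below are placeholders for that configuration.
raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "my-iot-namespace.servicebus.windows.net:9093")
    .option("subscribe", "plant-telemetry")
    .load()
)

# Parse the JSON payload into typed columns.
telemetry = raw.select(
    F.from_json(F.col("value").cast("string"), schema).alias("m")
).select("m.*")

# Continuously append to a Delta table (placeholder name) for analytics and ML.
(
    telemetry.writeStream.format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/plant_telemetry")
    .trigger(processingTime="1 minute")
    .toTable("main.digital_twin.plant_telemetry")
)
```

From there, a registered MLflow model can score the Delta table on a schedule or as another stream to power the monitor, predict and simulate stages described earlier.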
-----

**How to Build a Digital Twin (continued)**

**Building a digital twin doesn’t have to be a daunting task. Below are some simplistic steps:**

**System and use case discovery and blueprinting**

**•** Identify priority plant processes and systems to model, with focused use cases (e.g., asset maintenance, energy management, process monitoring/optimization, etc.)

**•** Develop a validated process outline, blueprint and key performance indicators

**•** Develop a set of process variables, control variables and manipulated variables

**•** Design control loop

**•** Validate and document process and asset FMEA for all assets and sub-systems

**Technology infrastructure requirements**

**•** Technical edge infrastructure onsite — to sense, collect and transmit real-time information

**•** Clean, reliable data availability in the cloud

**•** Data processing and analytics platform — to design, develop and implement solutions

**•** Stream processing and deployment of models for predictions and soft sensing

**•** Edge platform to orchestrate the data, insights and actions between the cloud and site IT systems

**•** Cloud to edge integration — to enable seamless monitoring, alerting and integration with plant OT/IT systems

**Visualization delivered**

**•** Information communication — visual representation of digital twin along with remote controlling functions (e.g., Power BI dashboards, time series insights, web app-based digital twin portals)

**•** Closed-loop feedback — to send the insights and actions back to form a closed loop; Azure Event Grid and Event Hubs, with connections from IoT Hub to Azure IoT Edge devices and control systems, are used

-----

### Why Is Manufacturing Struggling With Data and AI?

| Challenge | Root Cause | Goal |
|---|---|---|
| Siloed data across the value chain | Siloed data from systems designed for on-premises 30 years ago | Aggregate high volumes and velocities of structured and unstructured data to power predictive analytics (e.g., images, IoT, ERP/SCM) |
| Unable to scale enterprise data sets | Legacy architectures such as data historians that can’t handle semi-structured or unstructured data | Data architectures that scale for TBs/PBs of enterprise IT and OT data |
| Lack real-time insights | Batch-oriented data transfer | Address manufacturing issues or track granular supply chain issues in the real world |
| Can’t meet intellectual property requirements | Systems that do not establish data lineage | Data lineage established across organizational silos and disjointed workflows |

### 
Data architecture is the root cause of this struggle.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Why Databricks for Digital Twins?\n", + "\n", + "\n", + "Lakehouse for Manufacturing’s simple, open and collaborative data platform consolidates and enhances data\n", + "from across the organization and turns it into accessible, actionable insights. Scalable machine learning powers\n", + "digital twins with predictive insights across the value chain from product development to optimizing operations\n", + "to building agile supply chains to robust customer insights.\n", + "\n", + "\n", + "Databricks open Lakehouse\n", + "\n", + "Platform has shown time and\n", + "\n", + "again that it is the foundational\n", + "\n", + "enabling technology to power\n", + "\n", + "digital twins for manufacturing. But\n", + "\n", + "the real power is the Databricks\n", + "\n", + "partnership with Tredence that\n", + "\n", + "speeds implementation for\n", + "\n", + "tailored use cases that deliver\n", + "\n", + "superior ROI in less time.”\n", + "\n", + "**Dr. Bala Amavasai** ,\n", + "\n", + "Manufacturing CTO, Databricks\n", + "\n", + "\n", + "**Supports Real-Time**\n", + "**Decisions**\n", + "\n", + "Lakehouse for Manufacturing\n", + "leverages any enterprise data\n", + "source — from business critical\n", + "ERP data to edge sensor data in\n", + "one integrated platform, making it\n", + "easy to automate and secure data\n", + "with fast, real-time performance.\n", + "\n", + "\n", + "**Faster and More**\n", + "**Accurate Analysis**\n", + "\n", + "The true benefits of digital twins\n", + "are not the business intelligence\n", + "dashboards, but machine\n", + "learning insights generated\n", + "from incorporating real-time\n", + "data. Scalable and shareable\n", + "notebook-based machine learning\n", + "accelerates ROI.\n", + "\n", + "\n", + "**Open Data Sharing**\n", + "**and Collaboration**\n", + "\n", + "Drive stronger customer insights\n", + "and greater service with partners\n", + "leveraging open and secure\n", + "data collaboration between\n", + "departments or your supply chain\n", + "delivering faster ROI.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Why Tredence for Digital Twins?\n", + "\n", + "\n", + "Over the last few years, Tredence’s unique Manufacturing and Supply Chain practice has coupled functional\n", + "expertise with cutting-edge AI-driven solutions to create measurable business impact for their customers.\n", + "Now, Tredence’s partnership with Databricks is all set to unlock the power of real-time analytics and actions, to\n", + "further strengthen their ‘’last mile impact’’ vision.\n", + "\n", + "\n", + "Tredence is excited to\n", + "\n", + "co-innovate with Databricks to\n", + "\n", + "deliver the solutions required for\n", + "\n", + "enterprises to create digital twins\n", + "\n", + "from the ground up and implement\n", + "\n", + "them swiftly to maximize their ROI.\n", + "\n", + "Our partnership enables clients to\n", + "\n", + "get the most out of Tredence’s data\n", + "\n", + "science capabilities to build decision\n", + "\n", + "intelligence around manufacturing\n", + "\n", + "processes and Databricks’\n", + "\n", + "Lakehouse Platform to realize the full\n", + "\n", + "promise of digital twins.”\n", + "\n", + "**Naresh Agarwal** ,\n", + "\n", + "Head of Industrials, Tredence\n", + "\n", + "\n", + "**Global Reach**\n", + "\n", + "Tredence offers a global team with\n", + "the subject matter expertise that\n", + "delivers practitioner and useroriented solutions to identify\n", + 
"and solve for challenges in\n", + "digital transformation design\n", + "and implementation.\n", + "\n", + "\n", + "**Purpose-Built Solutions**\n", + "\n", + "Adopt contextual edge to cloud,\n", + "purpose-built AIoT solutions\n", + "that unify your ecosystems with\n", + "connected insights and enhance\n", + "productivity, while enabling\n", + "efficient cost structures.\n", + "\n", + "\n", + "**Focused Dedication**\n", + "\n", + "A dedicated centre of excellence\n", + "(CoE) for AIoT and smart\n", + "manufacturing solutions —\n", + "serving the entire manufacturing\n", + "value chain from product\n", + "development to manufacturing and\n", + "downstream operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Using Digital Twins to Drive Insights\n", + "\n", + "\n", + "**Use Case**\n", + "\n", + "**Predictive Maintenance**\n", + "\n", + "- \u0007Rolls-Royce sought to use real-time\n", + "engine data to reduce unplanned\n", + "maintenance and downtime\n", + "\n", + "- \u0007Legacy systems were unable to\n", + "scale data ingestion of engine\n", + "sensor data in real time for ML\n", + "\n", + "**Impact**\n", + "\n", + "\n", + "**Why Databricks?**\n", + "\n", + "- \u0007The Lakehouse Platform on Azure unifies in-flight data\n", + "streams with external environmental conditions data to\n", + "predict engine performance issues\n", + "\n", + "- \u0007Delta Lake underpins ETL pipelines that feed ML workloads\n", + "across use cases\n", + "\n", + "- \u0007MLflow speeds deployment of new models and reduces\n", + "incidents of grounded planes\n", + "\n", + "\n", + "Rolls-Royce uses Databricks\n", + "to drive insights around predictive\n", + "maintenance, improving\n", + "airframe reliability and reducing\n", + "carbon emissions.\n", + "\n", + "\n", + "#### 22 million tons\n", + "of carbon emissions saved\n", + "\n", + "\n", + "#### 5% reduction\n", + "in unplanned airplane groundings\n", + "\n", + "\n", + "#### Millions of pounds\n", + "in inventory cost savings from a 50%\n", + "improvement in maintenance efficiency\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé\n", + "\n", + "Nast, Acosta and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data,\n", + "\n", + "analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the\n", + "\n", + "original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "###### Get started with a free trial of Databricks and start building data applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n", + "\n", + "To learn more, visit us at:\n", + "\n", + "**[databricks.com/manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
### EBOOK\n", + "\n", + "# A Compact Guide to Large Language Models\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 1\n", + "## Introduction\n", + "\n", + "##### Definition of large language models (LLMs)\n", + "\n", + "Large language models are AI systems that are designed to process and analyze\n", + "vast amounts of natural language data and then use that information to generate\n", + "responses to user prompts. These systems are trained on massive data sets\n", + "using advanced machine learning algorithms to learn the patterns and structures\n", + "of human language, and are capable of generating natural language responses to\n", + "a wide range of written inputs. Large language models are becoming increasingly\n", + "important in a variety of applications such as natural language processing,\n", + "machine translation, code and text generation, and more.\n", + "\n", + "While this guide will focus on language models, it’s important to understand that\n", + "they are only one aspect under a larger generative AI umbrella. Other noteworthy\n", + "generative AI implementations include projects such as art generation from text,\n", + "audio and video generation, and certainly more to come in the near future.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Extremely brief historical background and development of LLMs\n", + "\n", + "\n", + "###### 1950s–1990s\n", + "Initial attempts are made to map hard rules around languages and\n", + "follow logical steps to accomplish tasks like translating a sentence\n", + "from one language to another.\n", + "\n", + "While this works sometimes, strictly defined rules only work for\n", + "concrete, well-defined tasks that the system has knowledge about.\n", + "\n", + "###### 1990s \n", + "Language models begin evolving into statistical models and\n", + "language patterns start being analyzed, but larger-scale projects\n", + "are limited by computing power.\n", + "\n", + "###### 2000s \n", + "Advancements in machine learning increase the complexity of\n", + "language models, and the wide adoption of the internet sees an\n", + "\n", + "enormous increase in available training data.\n", + "\n", + "###### 2012 \n", + "Advancements in deep learning architectures and larger data sets\n", + "lead to the development of GPT (Generative Pre-trained Transformer).\n", + "\n", + "\n", + "###### 2018\n", + "Google introduces BERT (Bidirectional Encoder Representations\n", + "from Transformers), which is a big leap in architecture and paves\n", + "the way for future large language models.\n", + "\n", + "###### 2020\n", + "OpenAI releases GPT-3, which becomes the largest model at\n", + "175B parameters and sets a new performance benchmark for\n", + "language-related tasks.\n", + "\n", + "###### 2022\n", + "ChatGPT is launched, which turns GPT-3 and similar models into\n", + "a service that is widely accessible to users through a web interface\n", + "and kicks off a huge increase in public awareness of LLMs and\n", + "generative AI.\n", + "\n", + "###### 2023\n", + "Open source LLMs begin showing increasingly impressive results\n", + "with releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna.\n", + "GPT-4 is also released, setting a new benchmark for both parameter\n", + "size and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2\n", + "## Understanding Large Language Models\n", + "\n", + "\n", + "##### What are language models and how do they work?\n", + "\n", + "Large language models are advanced artificial intelligence systems that take\n", + 
"some input and generate humanlike text as a response. They work by first\n", + "analyzing vast amounts of data and creating an internal structure that models\n", + "the natural language data sets that they’re trained on. Once this internal\n", + "structure has been developed, the models can then take input in the form of\n", + "natural language and approximate a good response.\n", + "\n", + "##### If they’ve been around for so many years, why are they just now making headlines?\n", + "\n", + "A few recent advancements have really brought the spotlight to generative AI\n", + "and large language models:\n", + "\n", + "**A D VA N C E M E N T S I N T E C H N I Q U E S**\n", + "Over the past few years, there have been significant advancements in the\n", + "techniques used to train these models, resulting in big leaps in performance.\n", + "Notably, one of the largest jumps in performance has come from integrating\n", + "human feedback directly into the training process.\n", + "\n", + "\n", + "**I N C R E A S E D A C C E S S I B I L I T Y**\n", + "The release of ChatGPT opened the door for anyone with internet access\n", + "to interact with one of the most advanced LLMs through a simple web\n", + "interface. This brought the impressive advancements of LLMs into the\n", + "spotlight, since previously these more powerful LLMs were only available\n", + "to researchers with large amounts of resources and those with very deep\n", + "technical knowledge.\n", + "\n", + "**G R O W I N G C O M P U TAT I O N A L P O W E R**\n", + "The availability of more powerful computing resources, such as graphics\n", + "processing units (GPUs), and better data processing techniques allowed\n", + "researchers to train much larger models, improving the performance of\n", + "these language models.\n", + "\n", + "**I M P R O V E D T R A I N I N G D ATA**\n", + "As we get better at collecting and analyzing large amounts of data, the\n", + "\n", + "model performance has improved dramatically. In fact, Databricks showed\n", + "that you can get amazing results training a relatively small model with a\n", + "high-quality data set with [Dolly 2.0](https://huggingface.co/databricks/dolly-v2-12b) (and we released the data set as well\n", + "with the databricks-dolly-15k [data set](http://databricks/databricks-dolly-15k) ).\n", + "\n", + "\n", + "-----\n", + "\n", + "##### So what are organizations using large language models for?\n", + "\n", + "Here are just a few examples of common use cases for large language models:\n", + "\n", + "**C H AT B O T S A N D V I R T U A L A S S I S TA N T S**\n", + "One of the most common implementations, LLMs can be used by\n", + "organizations to provide help with things like customer support,\n", + "troubleshooting, or even having open-ended conversations with userprovided prompts.\n", + "\n", + "**C O D E G E N E R AT I O N A N D D E B U G G I N G**\n", + "LLMs can be trained on large amounts of code examples and give\n", + "useful code snippets as a response to a request written in natural language.\n", + "With the proper techniques, LLMs can also be built in a way to reference\n", + "other relevant data that it may not have been trained with, such as a\n", + "company’s documentation, to help provide more accurate responses.\n", + "\n", + "**S E N T I M E N T A N A LY S I S**\n", + "Often a hard task to quantify, LLMs can help take a piece of text and gauge\n", + "emotion and opinions. 
This can help organizations gather the data and\n", + "\n", + "feedback needed to improve customer satisfaction.\n", + "\n", + "\n", + "**L A N G U A G E T R A N S L AT I O N**\n", + "Globalize all your content without hours of painstaking work by simply\n", + "feeding your web pages through the proper LLMs and translating them to\n", + "different languages. As more LLMs are trained in other languages, quality\n", + "and availability will continue to improve.\n", + "\n", + "**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\n", + "Entire customer calls or meetings could be efficiently summarized so that\n", + "others can more easily digest the content. LLMs can take large amounts of\n", + "text and boil it down to just the most important bytes.\n", + "\n", + "**C O N T E N T G E N E R AT I O N**\n", + "Start with a detailed prompt and have an LLM develop an outline for you.\n", + "Then continue on with those prompts and LLMs can generate a good first\n", + "draft for you to build off. Use them to brainstorm ideas, and ask the LLM\n", + "questions to help you draw inspiration from.\n", + "\n", + "**_Note:_** Most LLMs are _not_ trained to be fact machines. They know how to use\n", + "language, but they might not know who won the big sporting event last year.\n", + "It’s always important to fact check and understand the responses before\n", + "\n", + "using them as a reference.\n", + "\n", + "\n", + "**T E X T C L A S S I F I C AT I O N A N D C L U S T E R I N G**\n", + "The ability to categorize and sort large volumes of data enables the\n", + "identification of common themes and trends, supporting informed\n", + "decision-making and more targeted strategies.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 3\n", + "## Applying Large Language Models\n", + "\n", + "\n", + "There are a few paths that one can take when looking to apply large language\n", + "models for their given use case. Generally speaking, you can break them down\n", + "into two categories, but there’s some crossover between each. We’ll briefly cover\n", + "the pros and cons of each and what scenarios fit best for each.\n", + "\n", + "##### Proprietary services\n", + "\n", + "As the first widely available LLM powered service, OpenAI’s ChatGPT was the\n", + "explosive charge that brought LLMs into the mainstream. ChatGPT provides\n", + "a nice user interface (or API) where users can feed prompts to one of many\n", + "models (GPT-3.5, GPT-4, and more) and typically get a fast response. These are\n", + "among the highest-performing models, trained on enormous data sets, and are\n", + "capable of extremely complex tasks both from a technical standpoint, such as\n", + "code generation, as well as from a creative perspective like writing poetry in a\n", + "specific style.\n", + "\n", + "The downside of these services is the absolutely enormous amount of compute\n", + "required not only to train them (OpenAI has said GPT-4 cost them over $100\n", + "million to develop) but also to serve the responses. For this reason, these\n", + "extremely large models will likely always be under the control of organizations,\n", + "\n", + "\n", + "and require you to send your data to their servers in order to interact with their\n", + "language models. This raises privacy and security concerns, and also subjects\n", + "users to “black box” models, whose training and guardrails they have no control\n", + "over. 
Also, due to the compute required, these services are not free beyond a\n", + "very limited use, so cost becomes a factor in applying these at scale.\n", + "\n", + "In summary: Proprietary services are great to use if you have very complex tasks,\n", + "are okay with sharing your data with a third party, and are prepared to incur\n", + "costs if operating at any significant scale.\n", + "\n", + "##### Open source models\n", + "\n", + "The other avenue for language models is to go to the open source community,\n", + "where there has been similarly explosive growth over the past few years.\n", + "Communities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n", + "\n", + "from contributors that can help solve tons of specific use cases such as text\n", + "generation, summarization and classification. The open source community has\n", + "been quickly catching up to the performance of the proprietary models, but\n", + "ultimately still hasn’t matched the performance of something like GPT-4.\n", + "\n", + "\n", + "-----\n", + "\n", + "It does currently take a little bit more work to grab an open source model and\n", + "start using it, but progress is moving very quickly to make them more accessible\n", + "to users. On Databricks, for example, we’ve made [improvements to open source](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html)\n", + "[frameworks](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html) like MLflow to make it very easy for someone with a bit of Python\n", + "experience to pull any Hugging Face transformer model and use it as a Python\n", + "object. Oftentimes, you can find an open source model that solves your specific\n", + "problem that is **orders of magnitude** smaller than ChatGPT, allowing you to bring\n", + "the model into your environment and host it yourself. This means that you can\n", + "keep the data in your control for privacy and governance concerns as well as\n", + "manage your costs.\n", + "\n", + "\n", + "##### Conclusion and general guidelines\n", + "\n", + "Ultimately, every organization is going to have unique challenges to overcome,\n", + "and there isn’t a one-size-fits-all approach when it comes to LLMs. As the world\n", + "becomes more data driven, everything, including LLMs, will be reliant on having\n", + "a strong foundation of data. LLMs are incredible tools, but they have to be used\n", + "and implemented on top of this strong data foundation. Databricks brings both\n", + "that strong data foundation as well as the integrated tools to let you use and\n", + "fine-tune LLMs in your domain.\n", + "\n", + "\n", + "Another huge upside to using open source models is the ability to fine-tune\n", + "them to your own data. Since you’re not dealing with a black box of a proprietary\n", + "service, there are techniques that let you take open source models and train\n", + "them to your specific data, greatly improving their performance on your\n", + "specific domain. We believe the future of language models is going to move\n", + "in this direction, as more and more organizations will want full control and\n", + "understanding of their LLMs.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4\n", + "## So What Do I Do Next If I Want to Start Using LLMs?\n", + "\n", + "\n", + "That depends where you are on your journey! 
Fortunately, we have a few paths\n", + "for you.\n", + "\n", + "If you want to go a little deeper into LLMs but aren’t quite ready to do it yourself,\n", + "you can watch one of Databricks’ most talented developers and speakers go\n", + "over these concepts in more detail during the on-demand talk “ [How to Build](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "[Your Own Large Language Model Like Dolly.](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly) ”\n", + "\n", + "If you’re ready to dive a little deeper and expand your education and\n", + "understanding of LLM foundations, we’d recommend checking out our\n", + "[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\n", + "and dive into the theory behind foundation models.\n", + "\n", + "If your hands are already shaking with excitement and you already have some\n", + "working knowledge of Python and Databricks, we’ll provide some great examples\n", + "with sample code that can get you up and running with LLMs right away!\n", + "\n", + "\n", + "###### Getting started with NLP using Hugging Face transformers pipelines\n", + "\n", + " Fine-Tuning Large Language Models with Hugging Face and DeepSpeed\n", + "\n", + " Introducing AI Functions: Integrating Large Language Models with Databricks SQL\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000\n", + "\n", + "organizations worldwide — including Comcast, Condé Nast and\n", + "\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "#### Contact us for a personalized demo: databricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf2024-09-19T16:57:20Z
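The guide above points to pulling any Hugging Face transformer model as a plain Python object and logging it through MLflow's transformers support. The following is a minimal, hedged sketch of that workflow, not the guide's own code: the model name, artifact path and sample text are illustrative assumptions.

```
# Minimal sketch: load an open source Hugging Face model as a Python object
# and log it with MLflow's transformers flavor (MLflow >= 2.3).
# The model name, sample text and artifact path are illustrative assumptions.
import mlflow
from transformers import pipeline

# Pull a small summarization model from the Hugging Face Hub.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

print(summarizer("Delta Lake brings ACID transactions to data lakes.")[0]["summary_text"])

# Log the pipeline so it can later be registered and served like any other MLflow model.
with mlflow.start_run():
    mlflow.transformers.log_model(
        transformers_model=summarizer,
        artifact_path="summarizer",
    )
```

Because the model is orders of magnitude smaller than a proprietary service, it can be hosted inside your own environment, keeping data governance and cost under your control as described above.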
# Building Reliable Data Lakes at Scale With Delta Lake\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "#### Data Engineering Drivers 2\n", + "\n", + " Data Pipeline Key Goals 4\n", + "\n", + " Apache Spark™: The First Unified Analytics Engine 5\n", + "\n", + " Data Reliability Challenges With Data Lakes 6\n", + "\n", + " Delta Lake: A New Storage Layer 7\n", + "\n", + " Delta Lake: Key Features 8\n", + "\n", + " Getting Started With Delta Lake 10\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "#### Data Engineering Drivers\n", + "\n", + "Data engineering professionals are needing to respond to several different drivers.\n", + "\n", + "Chief among the drivers they face are:\n", + "\n", + "**Rise of Advanced Analytics** — Advanced analytics, including methods\n", + "\n", + "based on machine learning techniques, have evolved to such a degree that\n", + "\n", + "organizations seek to derive far more value from their corporate assets.\n", + "\n", + "**Widespread Adoption** — Once the province of leading edge, high-tech\n", + "\n", + "companies, these advanced approaches are being adopted across a\n", + "\n", + "multitude of industries from retail to hospitality to healthcare and across\n", + "\n", + "private as well as public sector organizations. This is further driving the need\n", + "\n", + "for strong data engineering practices.\n", + "\n", + "**Regulation** — With the growth of data generation and data collection,\n", + "\n", + "there is increased interest in how the data is protected and managed.\n", + "\n", + "Regulatory regimes such as GDPR (General Data Protection Regulation)\n", + "\n", + "from the EU and other jurisdictions mandate very specific ways in which\n", + "\n", + "data must be managed.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "**Technology Innovation** — The move to cloud-based analytics architectures\n", + "\n", + "that is now well underway is being propelled further by innovations such as\n", + "\n", + "analytics-focused chipsets, pipeline automation and the unification of data\n", + "\n", + "and machine learning. All these offer data professionals new approaches for\n", + "\n", + "their data initiatives.\n", + "\n", + "**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n", + "\n", + "also subject to increasing scrutiny. There is also a greater understanding of\n", + "\n", + "data as a valuable asset. Deriving value from data must be done in a manner\n", + "\n", + "that is financially responsible and actually value adding to the enterprise and\n", + "\n", + "meeting ROI hurdles.\n", + "\n", + "**Role Evolution** — Reflecting the importance of managing the data and\n", + "\n", + "maximizing value extraction, the Chief Data Officer (CDO) role is becoming\n", + "\n", + "more prominent and newer roles such as Data Curator are emerging.\n", + "\n", + "They must balance the needs of governance, security and democratization.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Key Goals\n", + "\n", + "#### Data Pipeline Key Goals\n", + "\n", + "Making quality data available in a reliable manner is a major determinant of success for data\n", + "\n", + "analytics initiatives be they regular dashboards or reports, or advanced analytics projects\n", + "\n", + "drawing on state-of-the-art machine learning techniques. 
Data engineers tasked with this\n", + "\n", + "responsibility need to take account of a broad set of dependencies and requirements as they\n", + "\n", + "design and build their data pipelines.\n", + "\n", + "Three primary goals that data engineers typically seek to address as they work to enable the\n", + "\n", + "analytics professionals in their organizations are:\n", + "\n", + "**Deliver quality data in less time** — When it comes to data, quality and timeliness\n", + "\n", + "are key. Data with gaps or errors (which can arise for many reasons) is\n", + "\n", + "“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n", + "\n", + "users. Equally well, many applications require up-to-date information (who\n", + "\n", + "wants to use last night’s closing stock price or weather forecast) and are of\n", + "\n", + "limited value without it.\n", + "\n", + "**Enable faster queries** — Wanting fast responses to queries is natural enough\n", + "\n", + "in today’s “New York minute,” online world. Achieving this is particularly\n", + "\n", + "demanding when the queries are based on very large data sets.\n", + "\n", + "**Simplify data engineering at scale** — It is one thing to have high reliability and\n", + "\n", + "performance in a limited, development or test environment. What matters\n", + "\n", + "more is the ability to have robust, production data pipelines at scale without\n", + "\n", + "requiring high operational overhead.\n", + "\n", + "\n", + "-----\n", + "\n", + "### ™\n", + "## Apache Spark\n", + "\n", + "#### Apache Spark ™ : The First Unified Analytics Engine\n", + "\n", + "Originally developed at UC Berkeley in 2009, Apache Spark can be\n", + "\n", + "considered the first unified analytics engine. Uniquely bringing data\n", + "\n", + "\n", + "and AI technologies together, Spark comes packaged with higher-level\n", + "\n", + "libraries, including support for SQL queries, streaming data, machine\n", + "\n", + "learning and graph processing. These standard libraries increase\n", + "\n", + "developer productivity and can be seamlessly combined to create\n", + "\n", + "\n", + "Customer\n", + "Data\n", + "\n", + "Emails/\n", + "Web Pages\n", + "\n", + "\n", + "Click\n", + "Streams\n", + "\n", + "Video/\n", + "Speech\n", + "\n", + "...\n", + "\n", + "Sensor\n", + "Data (IoT)\n", + "\n", + "\n", + "complex workflows.\n", + "\n", + "\n", + "#### Big Data Processing\n", + "\n", + "\n", + "#### Machine Learning\n", + "\n", + "\n", + "Since its release, Apache Spark, has seen rapid adoption by\n", + "\n", + "enterprises across a wide range of industries. 
Internet powerhouses\n", + "\n", + "\n", + "ETL + SQL + Streaming MLlib + SparkR\n", + "\n", + "\n", + "such as Netflix, Yahoo and eBay have deployed Spark at massive scale,\n", + "\n", + "\n", + "collectively processing multiple petabytes of data on clusters of over\n", + "\n", + "8,000 nodes making it the de facto choice for new analytics initiatives.\n", + "\n", + "It has quickly become the largest open source community in big data,\n", + "\n", + "with over 1000 contributors from 250+ organizations.\n", + "\n", + "\n", + "##### While Spark has had a significant impact in taking data analytics to the next level, practitioners continue to face data reliability and performance challenges with their data lakes.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Data Reliability Challenges With Data Lakes\n", + "\n", + "\n", + "**Failed Writes** — If a production job that is writing data experiences failures which\n", + "\n", + "are inevitable in large distributed environments, it can result in data corruption\n", + "\n", + "through partial or multiple writes. What is needed is a mechanism that is able to\n", + "\n", + "ensure that either a write takes place completely or not at all (and not multiple times,\n", + "\n", + "adding spurious data). Failed jobs can impose a considerable burden to recover\n", + "\n", + "to a clean state.\n", + "\n", + "\n", + "**Schema Mismatch** — When ingesting content from multiple sources, typical of\n", + "\n", + "large, modern big data environments, it can be difficult to ensure that the same\n", + "\n", + "data is encoded in the same way i.e., the schema matches. A similar challenge\n", + "\n", + "arises when the formats for data elements are changed without informing the\n", + "\n", + "data engineering team. Both can result in low quality, inconsistent data that\n", + "\n", + "requires cleaning up to improve its usability. The ability to observe and enforce\n", + "\n", + "schema would serve to mitigate this.\n", + "\n", + "\n", + "**Lack of Consistency** — In a complex big data environment, one may be interested\n", + "\n", + "in considering a mix of both batch and streaming data. Trying to read data while\n", + "\n", + "it is being appended to provides a challenge since on the one hand there is a\n", + "\n", + "desire to keep ingesting new data while on the other hand anyone reading the\n", + "\n", + "data prefers a consistent view. This is especially an issue when there are multiple\n", + "\n", + "readers and writers at work. It is undesirable and impractical, of course, to\n", + "\n", + "stop read access while writes complete or stop write access while reads are\n", + "\n", + "in progress.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Delta Lake: A New Storage Layer\n", + "\n", + "[Delta Lake](https://delta.io/) is an open source storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable metadata handling, and unifies\n", + "\n", + "streaming and batch data processing. Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs. Raw data is ingested\n", + "\n", + "from various batch and streaming input sources. Simple, reliable data pipelines help create a curated data lake containing tables of differing degrees of\n", + "\n", + "refinement based on business needs. 
The data in these tables is then made available via the standard Spark APIs or special connectors for various use cases\n", + "\n", + "such as machine learning, SQL analytics or feeding to a data warehouse.\n", + "\n", + "Streaming\n", + "\n", + "###### Analytics and Machine Learning\n", + "\n", + "\n", + "Batch\n", + "\n", + "\n", + "Ingestion Tables Refined Tables\n", + "(Bronze) (Silver)\n", + "\n", + "\n", + "Feature/Agg Data Store\n", + "(Gold)\n", + "\n", + "\n", + "###### Your Existing Data Lake\n", + "\n", + "\n", + "-----\n", + "\n", + "## Delta Lake: Key Features\n", + "\n", + "\n", + "**ACID Transactions —** Data lakes typically have multiple data pipelines reading\n", + "\n", + "and writing data concurrently, and data engineers have to go through a tedious\n", + "\n", + "process to ensure data integrity, due to the lack of transactions. Delta Lake\n", + "\n", + "brings ACID transactions to your data lakes. It provides serializability, the\n", + "\n", + "\n", + "**Scalable Metadata Handling —** In big data, even the metadata itself can be “big\n", + "\n", + "data.” Delta Lake treats metadata just like data, leveraging Spark’s distributed\n", + "\n", + "processing power to handle all its metadata. As a result, Delta Lake can handle\n", + "\n", + "petabyte-scale tables with billions of partitions and files at ease.\n", + "\n", + "\n", + "strongest level of isolation level.\n", + "\n", + "\n", + "**Time Travel (data versioning) —** Delta Lake provides snapshots of data enabling\n", + "\n", + "developers to access and revert to earlier versions of data for audits, rollbacks or\n", + "\n", + "to reproduce experiments. For further details, please see this [documentation](https://www.google.com/url?q=https://docs.delta.io/latest/delta-batch.html%23-deltatimetravel&sa=D&source=editors&ust=1666305658154469&usg=AOvVaw0Zh1svr9wsqkIDKGQTgtLh) .\n", + "\n", + "\n", + "**Schema Enforcement —** Delta Lake provides the ability to specify your schema\n", + "\n", + "and enforce it. This helps ensure that the data types are correct and required\n", + "\n", + "columns are present, preventing bad data from causing data corruption.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Delta Lake: Key Features\n", + "\n", + "Parquet\n", + "\n", + "\n", + "**Open Format —** All data in Delta Lake is stored in Apache Parquet format,\n", + "\n", + "enabling Delta Lake to leverage the efficient compression and encoding schemes\n", + "\n", + "that are native to Parquet.\n", + "\n", + "**Unified Batch and Streaming Source and Sink** — A table in Delta Lake is both a\n", + "\n", + "batch table, as well as a streaming source and sink. Streaming data ingest, batch\n", + "\n", + "historic backfill, and interactive queries all just work out of the box.\n", + "\n", + "\n", + "**Schema Evolution —** Big data is continuously changing. Delta Lake\n", + "\n", + "enables you to make changes to a table schema that can be applied\n", + "\n", + "automatically, without the need for cumbersome DDL.\n", + "\n", + "**100% Compatible With Apache Spark API —** Developers can use Delta\n", + "\n", + "Lake with their existing data pipelines with minimal change as it is fully\n", + "\n", + "compatible with Spark, the commonly used big data processing engine.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started With Delta Lake\n", + "\n", + "**Getting started with Delta Lake is easy. 
Specifically, to create a Delta table simply specify Delta instead of using Parquet.**\n", + "\n", + "\n", + "#### Instead of parquet ...\n", + "```\n", + "dataframe\n", + ".write\n", + ".format(\"parquet\")\n", + ".save(\"/data\")\n", + "\n", + "```\n", + "\n", + "#### … simply say delta\n", + "```\n", + "dataframe\n", + ".write\n", + ".format(\"delta\")\n", + ".save(\"/data\")\n", + "\n", + "```\n", + "\n", + "##### Learn more about Delta Lake:\n", + "\n", + "[Delta Lake Blogs](https://delta.io/blog)\n", + "\n", + "Delta Lake Tutorials\n", + "\n", + "[Delta Lake Integrations](https://delta.io/integrations/)\n", + "\n", + "**For more information, please refer to the** **[documentation](https://docs.delta.io/latest/index.html)** **.**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf2024-09-19T16:57:20Z
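To make the snippet above concrete, here is a minimal PySpark sketch of the same write in Delta format plus the time travel feature described earlier. It assumes a Delta-enabled Spark session (as on Databricks); the path and sample rows are illustrative assumptions, not taken from the guide.

```
# Minimal sketch of the "simply say delta" pattern plus time travel.
# Assumes a Delta-enabled Spark session; path and sample data are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Version 0: initial write in Delta format instead of Parquet.
df = spark.createDataFrame([(1, "bronze"), (2, "silver")], ["id", "layer"])
df.write.format("delta").mode("overwrite").save("/tmp/delta/demo")

# Version 1: append another row, which creates a new table version.
extra = spark.createDataFrame([(3, "gold")], ["id", "layer"])
extra.write.format("delta").mode("append").save("/tmp/delta/demo")

# Time travel: read the table exactly as it was at version 0.
v0 = spark.read.format("delta").option("versionAsOf", 0).load("/tmp/delta/demo")
v0.show()  # shows only the two original rows
```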
#### eBook\n", + "\n", + "# The CDP Build vs Buy Guide:\n", + "\n", + "### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Need for a Customer Data Platform\n", + "\n", + "\n", + "Organizations need to deliver personalized experiences to their customers to stay ahead\n", + "of the curve — that means they need a customer data platform (CDP). Through a CDP, data\n", + "from every touch point, along with third-party information, is brought together to provide\n", + "a unified view of the customer. This enables your marketing team to analyze, identify and\n", + "activate customers with targeted content.\n", + "\n", + "The key question for all IT teams at these organizations is whether to build or to buy.\n", + "\n", + "A CDP that sounds like music to the ears of business leaders may be perceived as noise\n", + "by enterprise IT leaders. The business side of the house needs immediate enablement, and\n", + "an out-of-the-box system dedicated to the specialized needs of marketers seems like the\n", + "fastest path to a solution.\n", + "\n", + "But for IT, the CDP is yet another system, bringing stack baggage and redundancies to\n", + "existing marketing and analytics systems.. The cost of adding another system to the\n", + "landscape and the redundancy of sensitive customer data creates a governance challenge\n", + "that has immediate consequences.\n", + "\n", + "**Critical IT Needs** **Critical Business Needs**\n", + "\n", + "\n", + "Keep control of data access and\n", + "governance; ability to architecture a\n", + "customer data stack with decisions on\n", + "where data is stored and where queries\n", + "are executed\n", + "\n", + "\n", + "Get customer data access via a no-code\n", + "interface to generate insights; build customer\n", + "experiences and activate data within\n", + "business applications\n", + "\n", + "\n", + "-----\n", + "\n", + "The question of whether to build or buy seems to leave legitimate needs and concerns by one\n", + "side or the other unaddressed — which is why so many organizations who have built a CDP\n", + "have expressed dissatisfaction regardless of which side of the fence they came down upon.\n", + "\n", + "**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\n", + "**both sides of the debate and provide organizations a third choice of both building and**\n", + "**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\n", + "the business with no-code and ease of use interface along with the flexibility and centralized\n", + "governance IT desires. By shifting the conversation from building or buying to building _and_\n", + "buying, we’ve opened the door to finding the right balance of approaches for our customer\n", + "organizations, helping organizations find greater success in their personalization journey.\n", + "\n", + "**“We made an attempt to internally build a CDP platform and while we**\n", + "**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\n", + "**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\n", + "**or offer a campaign interface to our product marketers that could empower**\n", + "**them to create and manage those journeys. 
It was going to take at least two**\n", + "**years for us to build all of that functionality in house.”**\n", + "\n", + "– Sravan Gupta, Senior Manager of GTM Systems, Atlassian\n", + "\n", + "\n", + "-----\n", + "\n", + "## Combining the Build and Buy Approaches\n", + "\n", + "\n", + "Bringing together the best of build and buy involves the deployment of the CDP alongside or\n", + "within the lakehouse platform. There are three approaches to this:\n", + "\n", + "**Bundled** **Composable**\n", + "\n", + "**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\n", + "\n", + "\n", + "Compute\n", + "\n", + "Storage\n", + "\n", + "\n", + "Compute\n", + "\n", + "Storage\n", + "(Local & Views)\n", + "\n", + "\n", + "Query\n", + "Virtualization\n", + "\n", + "Metadata\n", + "\n", + "\n", + "Data Copy\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "Storage\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "\n", + "Compute Compute\n", + "\n", + "Storage Storage\n", + "\n", + "\n", + "-----\n", + "\n", + "Deployment Type\n", + "\n", + "**Bundled**\n", + "\n", + "**Composable –**\n", + "**Hybrid**\n", + "\n", + "**Composable –**\n", + "**Lakehouse-Only**\n", + "\n", + "\n", + "Description\n", + "\n", + "The CDP and the lakehouse are managed as two separate systems. Connectors in either system (as well as\n", + "third-party tools) allow data to be exchanged, typically as part of an ad hoc or batch process. This approach\n", + "allows the organization to leverage the functionality of both systems but data is duplicated making governance\n", + "an on-going concern.\n", + "\n", + "The CDP and the lakehouse are managed as two separate systems, but deeper integrations between the two\n", + "allow the organization to decide within which system a specific dataset should reside. Real-time integrations\n", + "between the systems allow CDP users to select information assets in the lakehouse and generate queries\n", + "spanning data on either side of the platform divide. This approach minimizes the need for data duplication\n", + "which simplifies data governance, even though it must be implemented within two separate systems.\n", + "\n", + "All CDP information assets reside within the lakehouse. User interfaces built on other technologies, directly\n", + "interact with the lakehouse for access to data. This approach minimizes redundancy and allows organizations\n", + "to implement a centralized data governance strategy for all consumers of customer-relevant data.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Deployment Architectures \n", + "\n", + "\n", + "The choice of which of these deployment architectures is best depends on the functional\n", + "requirements of a specific organization. Each has its benefits, and in the case of parallel\n", + "and federated deployments, organizations can easily transition between deployment\n", + "architectures over time. 
The following table captures many of the typical benefits\n", + "associated with the different deployment architectures.\n", + "\n", + "\n", + "Bundled CDP\n", + "Deployment Composable CDPHybrid Composable CDPLakehouse-Only\n", + "\n", + "\n", + "Typical\n", + "User\n", + "\n", + "**IT**\n", + "\n", + "\n", + "Component\n", + "\n", + "Digital Touchpoints\n", + "\n", + "Data Modeling\n", + "\n", + "Identity Resolution\n", + "\n", + "Data Governance\n", + "\n", + "\n", + "Description\n", + "\n", + "Collect and integrate\n", + "data from digital\n", + "channels (website,\n", + "app, etc.)\n", + "\n", + "Unify and model data\n", + "to make it usable by\n", + "other applications\n", + "\n", + "Deduplicate records to\n", + "build a private ID graph\n", + "with a single view of\n", + "the customer\n", + "\n", + "Control data access\n", + "and permitted actions\n", + "on the data\n", + "\n", + "\n", + "Included with CDP\n", + "via a tag\n", + "\n", + "Sometimes included\n", + "with CDP\n", + "\n", + "Primarily with CDP\n", + "or other tools (MDM,\n", + "Lakehouse)\n", + "\n", + "Included with CDP\n", + "\n", + "\n", + "Works with any digital\n", + "touchpoint collection\n", + "system\n", + "\n", + "Either within the CDP\n", + "or in Lakehouse via\n", + "real-time integration\n", + "\n", + "CDP, MDM, or\n", + "Lakehouse\n", + "\n", + "Both CDP and\n", + "Lakehouse\n", + "\n", + "\n", + "Works with any digital\n", + "touchpoint collection\n", + "system\n", + "\n", + "Unified environment with\n", + "minimal data replication\n", + "in and centralized\n", + "governance in Lakehouse\n", + "\n", + "Built with Lakehouse and\n", + "additional tools\n", + "\n", + "Managed centrally from\n", + "Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "Bundled CDP\n", + "Deployment Composable CDPHybrid Composable CDPLakehouse-Only\n", + "\n", + "\n", + "Typical\n", + "User\n", + "\n", + "**Business**\n", + "\n", + "\n", + "Component\n", + "\n", + "Predictive Scoring\n", + "\n", + "Marketing Audience\n", + "Segments\n", + "\n", + "Customer Journey\n", + "Orchestration\n", + "\n", + "Data Activations\n", + "\n", + "Analytics\n", + "\n", + "\n", + "Description\n", + "\n", + "Create and execute\n", + "models predicting\n", + "user behaviors such as\n", + "purchase or churn\n", + "\n", + "Use a self-service UI\n", + "to build rule-based\n", + "or model-based\n", + "audiences\n", + "\n", + "Define and optimize\n", + "the customer journey\n", + "and interactions with\n", + "the brand across every\n", + "channel and every\n", + "phase of the customer\n", + "lifecycle\n", + "\n", + "Integrate seamlessly\n", + "with delivery systems\n", + "for both inbound and\n", + "outbound customer\n", + "experiences\n", + "\n", + "Understand audience\n", + "and customer journey\n", + "performance\n", + "\n", + "\n", + "Included with CDP\n", + "with supplement\n", + "scoring from\n", + "Lakehouse\n", + "\n", + "Included with CDP\n", + "\n", + "Sometimes included\n", + "with CDP\n", + "\n", + "Included with CDP\n", + "\n", + "Sometimes included\n", + "with CDP\n", + "\n", + "\n", + "CDP, or automatically\n", + "present with Lakehouse\n", + "\n", + "Included with CDP\n", + "\n", + "CDP, marketing\n", + "automation, or\n", + "additional tools\n", + "\n", + "Included with CDP\n", + "\n", + "Sometimes included\n", + "with CDP or built\n", + "with Lakehouse and\n", + "additional tools\n", + "\n", + "\n", + "Automatically present\n", + "with Lakehouse\n", + "\n", + "Included with CDP\n", + "\n", + "CDP, marketing\n", + "automation, or\n", + 
"additional tools\n", + "\n", + "CDP, or additional tools\n", + "\n", + "Built with Lakehouse\n", + "and additional tools\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000 organizations worldwide —\n", + "including Comcast, Condé Nast, H&M, and over 50% of the Fortune 500 — rely on\n", + "the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n", + "is headquartered in San Francisco, with offices around the globe. Founded by the\n", + "original creators of Apache SparkTM, Delta Lake and MLflow, Databricks is on a\n", + "mission to help data teams solve the world’s toughest problems.\n", + "\n", + "## About ActionIQ\n", + "\n", + "AIQ brings order to CX chaos. Our Customer Experience Hub empowers\n", + "everyone to be a CX champion by giving business teams the freedom to explore\n", + "and action on customer data while helping technical teams regain control of\n", + "where data lives and how it’s used.\n", + "\n", + "**[Get in touch](https://www.actioniq.com/get-started/)** with our experts to learn more.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf2024-09-19T16:57:20Z
-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf2024-09-19T16:57:19Z
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "**EBOOK**\n\n## The Big Book of Data Engineering 2nd Edition\n\nA collection of technical\nblogs, including code\nsamples and notebooks\n\n##### With all-new content\n\n\n-----\n\n#### Contents\n\n**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n\n**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n\n**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n\n**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n\n**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n\n**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n\n**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n\n**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n\n**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n\n**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n\n**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n\n**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n\n**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n\n**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n\n**4 . 1** Akamai .................................................................................................................................................................................................... 77\n\n**4 . 
2** Grammarly ........................................................................................................................................................................................... 80\n\n**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n\n**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n\n**4 . 5** Rivian .................................................................................................................................................................................................... 90\n\n**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n\n\n-----\n\n**SECTION**\n\n# 01\n\n\n### Introduction to Data Engineering on Databricks\n\n\n-----\n\nOrganizations realize the value data plays as a strategic asset for various\nbusiness-related initiatives, such as growing revenues, improving the customer\nexperience, operating efficiently or improving a product or service. However,\naccessing and managing data for these initiatives has become increasingly\ncomplex. Most of the complexity has arisen with the explosion of data volumes\nand data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\nto increase, 73% of the data goes unused for analytics or decision-making. In\norder to try and decrease this percentage and make more data usable, data\nengineering teams are responsible for building data pipelines to efficiently and\nreliably deliver data. But the process of building these complex data pipelines\ncomes with a number of difficulties:\n\n**•** In order to get data into a data lake, data engineers are required\nto spend immense time hand-coding repetitive data ingestion tasks\n\n**•** Since data platforms continuously change, data engineers\nspend time building and maintaining, and then rebuilding, complex\nscalable infrastructure\n\n**•** As data pipelines become more complex, data engineers are\nrequired to find reliable tools to orchestrate these pipelines\n\n**•** With the increasing importance of real-time data, low latency data\npipelines are required, which are even more difficult to build and maintain\n\n**•** Finally, with all pipelines written, data engineers need to constantly\nfocus on performance, tuning pipelines and architectures to meet SLAs\n\n\n**How can Databricks help?**\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. 
The Lakehouse Platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability to\ndrive valuable insights.\n\nLakehouse Platform\n\n**One platform to support multiple personas**\n\n\n**BI & Data**\n**Warehousing**\n\n\n**Data**\n**Engineering**\n\n\n**Data**\n**Streaming**\n\n\n**Data**\n**Science & ML**\n\n\n©2023 Databricks Inc. — All rights reserved\n\n\n**Unity Catalog**\n**Fine-grained governance for data and AI**\n\n**Delta Lake**\n**Data reliability and performance**\n\n**Cloud Data Lake**\n\nAll Raw Data (Logs, Texts, Audio, Video, Images)\n\n\nFigure 1\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n\n\n-----\n\n**Key differentiators for successful data engineering**\n**with Databricks**\n\nBy simplifying on a lakehouse architecture, data engineers need an\nenterprise-grade and enterprise-ready approach to building data pipelines.\nTo be successful, a data engineering solution team must embrace these eight\nkey differentiating capabilities:\n\n**Data ingestion at scale**\nWith the ability to ingest petabytes of data with auto-evolving schemas,\ndata engineers can deliver fast, reliable, scalable and automatic data for\nanalytics, data science or machine learning. This includes:\n\n**•** Incrementally and efficiently processing data as it arrives\nfrom files or streaming sources like Kafka, DBMS and NoSQL\n\n**•** Automatically inferring schema and detecting column\nchanges for structured and unstructured data formats\n\n**•** Automatically and efficiently tracking data as it arrives with\n\nno manual intervention\n\n**•** Preventing data loss by rescuing data columns\n\n\n**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----\n\n**Unified orchestration of data workflows**\nSimple, clear and reliable orchestration of data processing tasks for data,\nanalytics and machine learning pipelines with the ability to run multiple\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\nin a DAG using Databricks Workflows, an orchestration tool included in the\nlakehouse with no need to maintain or pay for an external orchestration service.\n\n**•** Easily create and manage multiple tasks with dependencies via UI,\nAPI or from your IDE\n\n**•** Have full observability to all workflow runs and get alerted when\ntasks fail for fast troubleshooting and efficient repair and rerun\n\n**•** Leverage high reliability of 99.95% uptime\n\n**•** Use performance optimization clusters that parallelize jobs and\nminimize data movement with cluster reuse\n\n**Data quality validation and monitoring**\nImprove data reliability throughout the data lakehouse so data teams can\nconfidently trust the information for downstream initiatives by:\n\n**•** Defining data quality and integrity controls within the pipeline\nwith defined data expectations\n\n**•** Addressing data quality errors with predefined policies\n(fail, drop, alert, quarantine)\n\n**•** Leveraging the data quality metrics that are captured, tracked\nand reported for the entire data pipeline\n\n\nData\nSources\n\nData\nWarehouses\n\nOn-premises\nSystems\n\nSaaS\nApplications\n\nMachine &\nApplication Logs\n\nApplication\nEvents\n\nMobile & IoT\nData\n\n\nCloud\nStorage\n\nMessag\ne Buses\n\n\n**Lakehouse Platform**\n\n**Workflows** for end-to-end orchestration\n\n\nReal-Time BI Apps\n\nReal-Time AI Apps\n\n\nReal-Time Analytics with\n**Databricks SQL**\n\nReal-Time Machine Learning\nwith\n**Databricks ML**\n\n\nStreaming ETL with\n**Delta Live Tables**\n\n\nPredictive\nMaintenance\n\n\nPersonalized\nOffers\n\n\nPatient\nDiagnostics\n\n\nReal-Time Operational\nApps\n\n\nReal-Time Applications with\n**Spark Structured Streaming**\n\n**Photon** for lightning-fast data processing\n\n**Unity Catalog** for data governance and sharing\n\n**Delta Lake** for open and reliable data storage\n\n\nAlerts Detection Fraud\n\n\nDynamic\nPricing\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 2\nA unified set of tools for real-time data processing\n\n\n-----\n\n**Fault tolerant and automatic recovery**\nHandle transient errors and recover from most common error conditions\noccurring during the operation of a pipeline with fast, scalable automatic\nrecovery that includes:\n\n**•** Fault tolerant mechanisms to consistently recover the state of data\n\n**•** The ability to automatically track progress from the source with\ncheckpointing\n\n**•** The ability to automatically recover and restore the data pipeline state\n\n**Data pipeline observability**\nMonitor overall data pipeline status from a dataflow graph dashboard and\nvisually track end-to-end pipeline health for performance, quality and latency.\nData pipeline observability capabilities include:\n\n**•** A high-quality, high-fidelity lineage diagram that provides visibility\ninto how data flows for impact analysis\n\n**•** Granular logging with performance and status of the data pipeline\nat a row level\n\n**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n\n\n**Automatic deployments and operations**\nEnsure reliable and predictable delivery of data for analytics and machine\nlearning use cases by enabling easy and automatic data pipeline deployments\nand rollbacks to minimize downtime. 
Benefits include:\n\n**•** Complete, parameterized and automated deployment for the\ncontinuous delivery of data\n\n**•** End-to-end orchestration, testing and monitoring of data pipeline\ndeployment across all major cloud providers\n\n**Migrations**\nAccelerating and de-risking the migration journey to the lakehouse, whether\nfrom legacy on-prem systems or disparate cloud services.\n\nThe migration process starts with a detailed discovery and assessment to\nget insights on legacy platform workloads and estimate migration as well as\nDatabricks platform consumption costs. Get help with the target architecture\nand how the current technology stack maps to Databricks, followed by a\nphased implementation based on priorities and business needs. Throughout\nthis journey companies can leverage:\n\n**•** Automation tools from Databricks and its ISV partners\n\n**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n\n**•** Databricks Professional Services and training\n\nThis is the recommended approach for a successful migration, whereby\ncustomers have seen a 25-50% reduction in costs and 2-3x faster time to value\nfor their use cases.\n\n\n-----\n\n**Unified governance**\nWith Unity Catalog, data engineering and governance teams benefit from an\nenterprisewide data catalog with a single interface to manage permissions,\ncentralize auditing, automatically track data lineage down to the column level,\nand share data across platforms, clouds and regions. Benefits:\n\n**•** Discover all your data in one place, no matter where it lives,\nand centrally manage fine-grained access permissions using an\nANSI SQL-based interface\n\n**•** Leverage automated column-level data lineage to perform impact\nanalysis of any data changes across the pipeline and conduct\nroot cause analysis of any errors in the data pipelines\n\n**•** Centrally audit data entitlements and access\n\n**•** Share data across clouds, regions and data platforms,\nwhile maintaining a single copy of your data in your cloud storage\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 3\nThe Databricks Lakehouse Platform integrates with a large collection of technologies\n\n\n**A rich ecosystem of data solutions**\nThe Databricks Lakehouse Platform is built on open source technologies and\nuses open standards so leading data solutions can be leveraged with anything\nyou build on the lakehouse. A large collection of technology partners make it\neasy and simple to integrate the technologies you rely on when migrating to\nDatabricks and to know you are not locked into a closed data technology stack.\n\n\n-----\n\n**Conclusion**\n\nAs organizations strive to become data-driven, data engineering is a focal\npoint for success. To deliver reliable, trustworthy data, data engineers shouldn’t\nneed to spend time manually developing and maintaining an end-to-end\nETL lifecycle. 
Data engineering teams need an efficient, scalable way to\nsimplify ETL development, improve data reliability and manage operations.\n\nAs described, the eight key differentiating capabilities simplify the\nmanagement of the ETL lifecycle by automating and maintaining all data\ndependencies, leveraging built-in quality controls with monitoring and by\nproviding deep visibility into pipeline operations with automatic recovery.\nData engineering teams can now focus on easily and rapidly building reliable\nend-to-end production-ready data pipelines using only SQL or Python\nfor batch and streaming that deliver high-value data for analytics, data\nscience or machine learning.\n\n\n**Follow proven best practices**\n\nIn the next section, we describe best practices for data engineering\nend-to end use cases drawn from real-world examples. From data ingestion\nand real-time processing to analytics and machine learning, you’ll learn\nhow to translate raw data into actionable data.\n\nAs you explore the rest of this guide, you can find data sets and code\nsamples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\nget your hands dirty as you explore all aspects of the data lifecycle on the\nDatabricks Lakehouse Platform.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 02\n\n\n### Guidance and Best Practices\n\n**2.1** Top 5 Databricks Performance Tips\n\n**2.2** How to Profile PySpark\n\n**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n\n**2.4** Streaming in Production: Collected Best Practices\n\n**2.5** Streaming in Production: Collected Best Practices, Part 2\n\n**2.6** Building Geospatial Data Products\n\n**2.7** Data Lineage With Unity Catalog\n\n**2.8** Easy Ingestion to Lakehouse With COPY INTO\n\n**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n\n**2.10** Best Practices for Cross-Government Data Sharing\n\n\n-----\n\nSECTION 2.1\n\n**Top 5 Databricks Performance Tips**\n\nby **B R YA N S M I T H** and **R O B S A K E R**\n\nMarch 10, 2022\n\n\nAs solutions architects, we work closely with customers every day to help them\nget the best performance out of their jobs on Databricks — and we often end\nup giving the same advice. It’s not uncommon to have a conversation with a\ncustomer and get double, triple, or even more performance with just a few\ntweaks. So what’s the secret? How are we doing this? Here are the top 5 things\nwe see that can make a huge impact on the performance customers get\nfrom Databricks.\n\nHere’s a TLDR:\n\n**•** **Use larger clusters.** It may sound obvious, but this is the number\none problem we see. It’s actually not any more expensive to use a large\ncluster for a workload than it is to use a smaller one. It’s just faster.\nIf there’s anything you should take away from this article, it’s this.\n\nRead section 1. Really.\n\n**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\nto learn more. You won’t regret it.\n\n\n\n**•** **Clean out your configurations** . Configurations carried from one\nApache Spark™ version to the next can cause massive problems. 
Clean up!\nRead section 3 to learn more.\n\n**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\ncorrectly, if at all. See Section 4 to learn more.\n\n**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\nyou’re writing Spark code, jump to section 5.\n\n**•** **Bonus tip! Table design is super important** . We’ll go into this in a future\nblog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n\n**1. Give your clusters horsepower!**\n\nThis is the number one mistake customers make. Many customers create tiny\nclusters of two workers with four cores each, and it takes forever to do anything.\nThe concern is always the same: they don’t want to spend too much money on\nlarger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n\n\n-----\n\nThe key is that you’re renting the cluster for the length of the workload. So, if\nyou spin up that two worker cluster and it takes an hour, you’re paying for those\nworkers for the full hour. However, if you spin up a four worker cluster and it takes\nonly half an hour, the cost is actually the same! And that trend continues as long\nas there’s enough work for the cluster to do.\n\nHere’s a hypothetical scenario illustrating the point:\n\n**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n\n1 $1 2 $2\n\n2 $2 1 $2\n\n4 $4 0.5 $2\n\n8 $8 0.25 $2\n\nNotice that the total cost of the workload stays the same while the real-world\ntime it takes for the job to run drops significantly. So, bump up your Databricks\ncluster specs and speed up your workloads without spending any more money. It\n\ncan’t really get any simpler than that.\n\n**2. Use Photon**\n\nOur colleagues in engineering have rewritten the Spark execution engine in C++\nand dubbed it Photon. The results are impressive!\n\n\nBeyond the obvious improvements due to running the engine in native code,\nthey’ve also made use of CPU-level performance features and better memory\n\nmanagement. On top of this, they’ve rewritten the Parquet writer in C++. So this\nmakes writing to Parquet and Delta (based on Parquet) super fast as well!\n\nBut let’s also be clear about what Photon is speeding up. It improves\ncomputation speed for any built-in functions or operations, as well as writes to\nParquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\nspending most of its time reading from an ancient on-prem database? Won’t\nhelp there either, unfortunately.\n\n\n-----\n\nThe good news is that it helps where it can. So even if part of your job can’t be\nsped up, it will speed up the other parts. Also, most jobs are written with the\nnative operations and spend a lot of time writing to Delta, and Photon helps a lot\nthere. So give it a try. You may be amazed by the results!\n\n**3. Clean out old configurations**\n\nYou know those Spark configurations you’ve been carrying along from version to\nversion and no one knows what they do anymore? They may not be harmless.\nWe’ve seen jobs go from running for hours down to minutes simply by cleaning\nout old configurations. 
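Before you start deleting, it helps to see what you are actually carrying along. A minimal sketch, assuming a notebook with an active spark session (the keys and values you see will differ from cluster to cluster):\n\n# List every Spark configuration that was explicitly set on this cluster/session\n# so the team can review which ones are still understood and needed.\nfor key, value in sorted(spark.sparkContext.getConf().getAll()):\n    print(key, "=", value)\n\nAnything in that list that nobody can explain anymore is a good candidate for removal followed by a re-test.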
There may have been a quirk in a particular version of\nSpark, a performance tweak that has not aged well, or something pulled off\nsome blog somewhere that never really made sense. At the very least, it’s worth\nrevisiting your Spark configurations if you’re in this situation. Often the default\nconfigurations are the best, and they’re only getting better. Your configurations\nmay be holding you back.\n\n**4. The Delta Cache is your friend**\n\nThis may seem obvious, but you’d be surprised how many people are not using\nthe [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\nthe workers’ SSDs for faster access.\n\n\nIf you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\nby default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\nyour “hot” tables when you’re starting an endpoint. This will ensure blazing fast\nspeeds for any queries on those tables.\n\nIf you’re using regular clusters, be sure to use the i3 series on Amazon Web\nServices (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\nall have fast SSDs and caching enabled by default.\n\nOf course, your mileage may vary. If you’re doing BI, which involves reading the\nsame tables over and over again, caching gives an amazing boost. However, if\nyou’re simply reading a table once and writing out the results as in some ETL\njobs, you may not get much benefit. You know your jobs better than anyone.\nGo forth and conquer.\n\n\n-----\n\n**5. Be aware of lazy evaluation**\n\nIf you’re a data analyst or data scientist only using SQL or doing BI you can skip\nthis section. However, if you’re in data engineering and writing pipelines or doing\nprocessing using Databricks/Spark, read on.\n\nWhen you’re writing Spark code like select, groupBy, filter, etc., you’re really\nbuilding an execution plan. You’ll notice the code returns almost immediately when\nyou run these functions. That’s because it’s not actually doing any computation. So\neven if you have petabytes of data, it will return in less than a second.\n\nHowever, once you go to write your results out you’ll notice it takes longer. This\nis due to lazy evaluation. It’s not until you try to display or write results that your\nexecution plan is actually run.\n\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n.filter(...)\n)\n\n_# Now run the execution plan to get results_\ndf2.display()\n\nHowever, there is a catch here. Every time you try to display or write out\nresults, it runs the execution plan again. Let’s look at the same block of code\nbut extend it and do a few more operations.\n\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n.filter(...)\n)\n\n_# Now run the execution plan to get results_\ndf2.display()\n\n_# Unfortunately this will run the plan again, including filtering, joining, etc._\ndf2.display()\n\n_# So will this…_\ndf2.count()\n\n\n-----\n\nThe developer of this code may very well be thinking that they’re just printing\nout results three times, but what they’re really doing is kicking off the same\nprocessing three times. Oops. That’s a lot of extra work. This is a very common\nmistake we run into. 
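Before we get to the fix, note that you can always check whether a line of code actually triggers work. A minimal sketch, reusing the df2 DataFrame from the snippets above: explain() prints the physical plan without running it, so it is a cheap way to see what has been built up so far.\n\n# Prints the query plan built so far; no data is read or computed.\ndf2.explain()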
So why is there lazy evaluation, and what do we do about it?\n\nIn short, processing with lazy evaluation is way faster than without it.\nDatabricks/Spark looks at the full execution plan and finds opportunities\nfor optimization that can reduce processing time by orders of magnitude.\nSo that’s great, but how do we avoid the extra computation? The answer\nis pretty straightforward: save computed results you will reuse.\n\n\nThis works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\nbenefit greatly from lazy evaluation, but it’s something a lot of customers trip\nover. So be aware of its existence and save results you reuse in order to avoid\nunnecessary computation.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nLet’s look at the same block of code again, but this time let’s avoid the\nrecomputation:\n\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n)\n\n_# save it_\ndf2.write.save(path)\n\n_# load it back in_\ndf3 = spark.read.load(path)\n\n_# now use it_\ndf3.display()\n\n_# this is not doing any extra computation anymore. No joins, filtering,_\n_etc. It’s already done and saved._\ndf3.display()\n\n_# nor is this_\ndf3.count()\n\n\n-----\n\nSECTION 2.2 \u0007\n\n**How to Profile PySpark**\n\nby **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n\nOctober 6, 2022\n\n\nIn Apache Spark™, declarative Python APIs are supported for big data workloads.\nThey are powerful enough to handle most common use cases. Furthermore,\nPySpark UDFs offer more flexibility since they enable users to run arbitrary\nPython code on top of the Apache Spark™ engine. Users only have to state\n“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\nPySpark easier to use, but it can be difficult to identify performance bottlenecks\nand apply custom optimizations.\n\nTo address the difficulty mentioned above, PySpark supports various profiling\ntools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n[implementations](https://docs.python.org/3/library/profile.html) . PySpark Profilers provide information such as the number\nof function calls, total time spent in the given function, and filename, as well\nas line number to help navigation. That information is essential to exposing\ntight loops in your PySpark programs, and allowing you to make performance\n\nimprovement decisions.\n\n\n**Driver profiling**\n\nPySpark applications run as independent sets of processes on a cluster,\ncoordinated by the SparkContext object in the driver program. On the driver\nside, PySpark is a regular Python process; thus, we can profile it as a normal\nPython program using cProfile as illustrated below:\n\nimport cProfile\n\nwith cProfile.Profile() as pr:\n_# Your code_\n\npr.print_stats()\n\n**Workers profiling**\n\nExecutors are distributed on worker nodes in the cluster, which introduces\ncomplexity because we need to aggregate profiles. 
Furthermore, a Python worker\nprocess is spawned per executor for PySpark UDF execution, which makes the\nprofiling more intricate.\n\n\n-----\n\nThe UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\nand becomes a major tool to profile workers for PySpark applications. We’ll\nillustrate how to use the UDF profiler with a simple Pandas UDF example.\n\nFirstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n```\n sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n by 'id' )\n ).withColumn( 'v' , rand())\n\n```\nLater, we will group by the id column, which results in 8 groups with 1,000 rows\nper group.\n\nThe Pandas UDF plus_one is then created and applied as shown below:\n```\n import pandas as pd\n def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf.apply( lambda x: x + 1 , axis= 1 )\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n res.collect()\n\n```\n\nExecuting the example above and running sc.show_profiles() prints the\nfollowing profile. The profile below can also be dumped to disk by sc.dump_\nprofiles(path).\n\nThe UDF id in the profile (271, highlighted above) matches that in the Spark plan\nfor res. The Spark plan can be shown by calling res.explain() .\n\n\nNote that plus_one takes a pandas DataFrame and returns another pandas\nDataFrame. For each group, all columns are passed together as a pandas\nDataFrame to the plus_one UDF, and the returned pandas DataFrames are\ncombined into a PySpark DataFrame.\n\n\n-----\n\nThe first line in the profile’s body indicates the total number of calls that were\nmonitored. The column heading includes\n\n**•** ncalls , for the number of calls.\n\n**•** tottime , for the total time spent in the given function (excluding time\nspent in calls to sub-functions)\n\n**•** percall , the quotient of tottime divided by ncalls\n\n**•** cumtime , the cumulative time spent in this and all subfunctions (from\ninvocation till exit)\n\n**•** percall , the quotient of cumtime divided by primitive calls\n\n**•** filename:lineno(function) , which provides the respective information\nfor each function\n\nDigging into the column details: plus_one is triggered once per group, 8 times\nin total; _arith_method of pandas Series is called once per row, 8,000 times\nin total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\nrow, thus suffering from high invocation overhead.\n\nWe can reduce such overhead by substituting the pandas.DataFrame.apply\nwith pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\nfollows:\n```\n import pandas as pd\n def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf + 1\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n schema)\n res.collect()\n\n```\n\nThe updated profile is as shown below.\n\nWe can summarize the optimizations as follows:\n\n**•** Arithmetic operation from 8,000 calls to 8 calls\n\n**•** Total function calls from 2,898,160 calls to 2,384 calls\n\n**•** Total execution time from 2.300 seconds to 0.004 seconds\n\nThe short example above demonstrates how the UDF profiler helps us deeply\nunderstand the execution, identify the performance bottleneck and enhance\nthe overall performance of the user-defined function.\n\nThe UDF profiler was implemented based on the executor-side profiler,\nwhich is designed for PySpark RDD API. 
The executor-side profiler is available\nin all active Databricks Runtime versions.\n\n\n-----\n\nBoth the UDF profiler and the executor-side profiler run on Python workers.\nThey are controlled by the spark.python.profile Spark configuration, which\nis false by default. On a Databricks Runtime cluster, we can enable it by adding\nspark.python.profile true to the cluster’s Spark config before the cluster starts.\n\n\n**Conclusion**\n\nPySpark profilers are implemented based on cProfile; thus, the profile reporting\nrelies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\ncollecting profile reports from Python workers.\n\nPySpark provides powerful profilers to identify hot loops and suggest potential\nimprovements. They are easy to use and critical to enhancing the performance\nof PySpark programs. The UDF profiler, which is available\nstarting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\nchallenges and brings insights to user-defined functions.\n\nIn addition, there is an ongoing effort in the Apache Spark™ open source\ncommunity to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\nmore information.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.3\n\n**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n**and Apache Kafka**\n\nby **F R A N K M U N Z**\n\nAugust 9, 2022\n\n\n[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\napproach for creating reliable data pipelines and fully manages the underlying\ninfrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\nactionable insights derived from near real-time data. Delta Live Tables enables\nlow-latency streaming data pipelines to support such use cases by directly\ningesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n\nThis article will walk through using DLT with Apache Kafka while providing the\nrequired Python code to ingest streams. The recommended system architecture\nwill be explained, and related DLT settings worth considering will be explored\nalong the way.\n\n**Streaming platforms**\n\nEvent buses or message buses decouple message producers from consumers.\nA popular streaming use case is the collection of click-through data from\nusers navigating a website where every user interaction is stored as an event in\nApache Kafka. The event stream from Kafka is then used for real-time streaming\ndata analytics. Multiple message consumers can read the same data from Kafka\nand use the data to learn about audience interests, conversion rates, and bounce\nreasons. The real-time, streaming event data from the user interactions often\nalso needs to be correlated with actual purchases stored in a billing database.\n\n**Apache Kafka**\n\n[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. 
Kafka uses the concept of a\ntopic, an append-only distributed log of events where messages are buffered for\na certain amount of time. Although messages in Kafka are not deleted once they\nare consumed, they are also not stored indefinitely. The message retention for\nKafka can be configured per topic and defaults to 7 days. Expired messages will\nbe deleted eventually.\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to many other event buses or messaging systems.\n\n\n-----\n\n**Streaming data pipelines**\n\n\nIn a data flow pipeline, Delta Live Tables and their dependencies can be declared\nwith a standard SQL Create Table As Select (CTAS) statement and the DLT\nkeyword “live.”\n\nWhen developing DLT with Python, the @dlt.table decorator is used to create a\nDelta Live Table. To ensure the data quality in a pipeline, DLT uses [Expectations](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html)\nwhich are simple SQL constraints clauses that define the pipeline’s behavior with\ninvalid records.\n\nSince streaming workloads often come with unpredictable data volumes,\nDatabricks employs [enhanced autoscaling](https://databricks.com/blog/2022/06/29/delta-live-tables-announces-new-capabilities-and-performance-optimizations.html) for data flow pipelines to minimize the\noverall end-to-end latency while reducing cost by shutting down unnecessary\ninfrastructure.\n\n**Delta Live Tables** are fully recomputed, in the right order, exactly once for each\npipeline run.\n\nIn contrast, **streaming Delta Live Tables** are stateful, incrementally computed\nand only process data that has been added since the last pipeline run. If the\nquery which defines a streaming live table changes, new data will be processed\nbased on the new query but existing data is not recomputed. Streaming live\ntables always use a streaming source and only work over append-only streams,\nsuch as Kafka, Kinesis, or Auto Loader. Streaming DLTs are built on top of Spark\nStructured Streaming.\n\n\nYou can chain multiple streaming pipelines, for example, for workloads with very\nlarge data volumes and low latency requirements.\n\n**Direct ingestion from streaming engines**\n\nDelta Live Tables written in Python can directly ingest data from an event bus like\nKafka using Spark Structured Streaming. You can set a short retention period for\nthe Kafka topic to avoid compliance issues, reduce costs and then benefit from\nthe cheap, elastic and governable storage that Delta provides.\n\nAs a first step in the pipeline, we recommend ingesting the data as is to a Bronze\n(raw) table and avoid complex transformations that could drop important data.\nLike any Delta table, the Bronze table will retain the history and allow you to perform\nGDPR and other compliance tasks.\n\nIngest streaming data from Apache Kafka\n\n\n-----\n\nWhen writing DLT pipelines in Python, you use the @dlt.table annotation\nto create a DLT table. There is no special attribute to mark streaming DLTs in\nPython; simply use spark.readStream() to access the stream. Example code\nfor creating a DLT table with the name kafka_bronze that is consuming data\nfrom a Kafka topic looks as follows:\n\nimport dlt\nfrom pyspark.sql.functions import *\nfrom pyspark.sql.types import *\n\nTOPIC = \"tracker-events\"\nKAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n\n# subscribe to TOPIC at KAFKA_BROKER\nraw_kafka_events = (spark.readStream\n.format( \"kafka\" )\n.option( \"subscribe\" , TOPIC)\n.option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n.option( \"startingOffsets\" , \"earliest\" )\n.load()\n)\n\n@dlt.table(table_properties={ \"pipelines.reset.allowed\" : \"false\" })\ndef kafka_bronze():\n    return raw_kafka_events\n\n**pipelines.reset.allowed**\n\nNote that event buses typically expire messages after a certain period of time,\nwhereas Delta is designed for infinite retention.\n\nThis might lead to the effect that source data on Kafka has already been deleted\nwhen running a full refresh for a DLT pipeline. In this case, not all historic data\ncould be backfilled from the messaging platform, and data would be missing in\nDLT tables. To prevent dropping data, use the following DLT table property:\n\npipelines.reset.allowed=false\n\nSetting pipelines.reset.allowed to false prevents refreshes to the table but\ndoes not prevent incremental writes to the tables or new data from flowing into\nthe table.\n\n**Checkpointing**\n\nIf you are an experienced Spark Structured Streaming developer, you will notice\nthe absence of checkpointing in the above code. In Spark Structured Streaming,\ncheckpointing is required to persist progress information about what data has\nbeen successfully processed and, upon failure, this metadata is used to restart a\nfailed query exactly where it left off.\n\nWhereas checkpoints are necessary for failure recovery with exactly-once\nguarantees in Spark Structured Streaming, DLT handles state automatically\nwithout any manual configuration or explicit checkpointing required.\n\n**Mixing SQL and Python for a DLT pipeline**\n\nA DLT pipeline can consist of multiple notebooks but one DLT notebook is\nrequired to be written entirely in either SQL or Python (unlike other Databricks\nnotebooks where you can have cells of different languages in a single notebook).\n\nNow, if your preference is SQL, you can code the data ingestion from Apache\nKafka in one notebook in Python and then implement the transformation logic of\nyour data pipelines in another notebook in SQL.\n\n\n-----\n\n**Schema mapping**\n\nWhen reading data from a messaging platform, the data stream is opaque and a\nschema has to be provided.\n\nThe Python example below shows the schema definition of events from a fitness\ntracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n\nevent_schema = StructType([\nStructField( \"time\" , TimestampType(), True ),\nStructField( \"version\" , StringType(), True ),\nStructField( \"model\" , StringType(), True ),\nStructField( \"heart_bpm\" , IntegerType(), True ),\nStructField( \"kcal\" , IntegerType(), True )\n])\n\n# temporary table, visible in pipeline but not in data browser,\n# cannot be queried interactively\n@dlt.table(comment= \"real schema for Kafka payload\" ,\ntemporary= True )\ndef kafka_silver():\n    return (\n        # kafka streams are (timestamp,value)\n        # value contains the kafka payload\n        dlt.read_stream( \"kafka_bronze\" )\n        .select(col( \"timestamp\" ), from_json(col( \"value\" )\n        .cast( \"string\" ), event_schema).alias( \"event\" ))\n        .select( \"timestamp\" , \"event.*\" )\n    )\n\n\n**Benefits**\n\nReading streaming data in DLT directly from a message broker minimizes the\narchitectural complexity and provides lower end-to-end latency since data is\ndirectly streamed from the messaging broker and no 
intermediary step is involved.\n\n**Streaming ingest with cloud object store intermediary**\n\nFor some specific use cases, you may want to offload data from Apache Kafka,\ne.g., using a Kafka connector, and store your streaming data in a cloud object\nintermediary. In a Databricks workspace, the cloud vendor-specific objectstore can then be mapped via the Databricks Files System (DBFS) as a cloudindependent folder. Once the data is offloaded, [Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) can\ningest the files.\n\nAuto Loader can ingest data with a single line of SQL code. The syntax to ingest\nJSON files into a DLT table is shown below (it is wrapped across two lines for\nreadability).\n\n_-- INGEST with Auto Loader_\ncreate or replace streaming live table raw\nas select `*` FROM cloud_files(\"dbfs:/data/twitter\", \"json\")\n\n\n-----\n\nNote that Auto Loader itself is a streaming data source and all newly arrived files\nwill be processed exactly once, hence the streaming keyword for the raw table\nthat indicates data is ingested incrementally to that table.\n\nSince offloading streaming data to a cloud object store introduces an additional\nstep in your system architecture it will also increase the end-to-end latency\nand create additional storage costs. Keep in mind that the Kafka connector\nwriting event data to the cloud object store needs to be managed, increasing\noperational complexity.\n\nTherefore Databricks recommends as a best practice to directly access event\nbus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n\n**Other event buses or messaging systems**\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to other event buses or messaging systems. DLT supports any data\nsource that Databricks Runtime directly supports.\n\n**Amazon Kinesis**\nIn Kinesis, you write messages to a fully managed serverless stream. Same as\nKafka, Kinesis does not permanently store messages. The default message\nretention in Kinesis is one day.\n\nWhen using Amazon Kinesis, replace format(“kafka”) with format(“kinesis”) in the\nPython code for streaming ingestion above and add Amazon Kinesis-specific\nsettings with option(). For more information, check the section about Kinesis\nIntegration in the Spark Structured Streaming documentation.\n\n\n**Azure Event Hubs**\n\nFor Azure Event Hubs settings, check the official [documentation at Microsoft](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-kafka-spark-tutorial) and\nthe article [Delta Live Tables recipes: Consuming from Azure Event Hubs](https://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html) .\n\n**Summary**\n\nDLT is much more than just the “T” in ETL. With DLT, you can easily ingest from\nstreaming and batch sources, cleanse and transform data on the Databricks\nLakehouse Platform on any cloud with guaranteed data quality.\n\nData from Apache Kafka can be ingested by directly connecting to a Kafka broker\nfrom a DLT notebook in Python. Data loss can be prevented for a full pipeline\nrefresh even when the source data in the Kafka streaming layer expired.\n\n**Get started**\n\nIf you are a Databricks customer, simply follow the [guide to get started](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) . 
Read the\nrelease notes to learn more about what’s included in this GA release. If you are\nnot an existing Databricks customer, [sign up for a free trial](https://www.databricks.com/try-databricks) , and you can view our\ndetailed [DLT pricing here](https://www.databricks.com/product/pricing) .\n\nJoin the conversation in the [Databricks Community](https://community.databricks.com/s/topic/0TO8Y000000VJEhWAO/summit22) where data-obsessed peers\nare chatting about Data + AI Summit 2022 announcements and updates. Learn.\nNetwork.\n\nLast but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\nsummit. In that session, I walk you through the code of another streaming data\nexample with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\nHugging Face sentiment analysis.\n\n\n-----\n\nSECTION 2.4 \u0007\n\n**Streaming in Production: Collected Best Practices**\n\nby **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nDecember 12, 2022\n\n\nReleasing any data pipeline or application into a production state requires\nplanning, testing, monitoring, and maintenance. Streaming pipelines are no\ndifferent in this regard; in this blog we present some of the most important\nconsiderations for deploying streaming pipelines and applications to a\nproduction environment.\n\nAt Databricks, we offer two different ways of building and running streaming\npipelines and applications — [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) and [Databricks Workflows](https://www.databricks.com/product/workflows) .\nDLT is our flagship, fully managed ETL product that supports both batch and\nstreaming pipelines. It offers declarative development, automated operations,\ndata quality, advanced observability capabilities, and more. Workflows enable\ncustomers to run Apache Spark™ workloads in Databricks’ optimized runtime\nenvironment (i.e., Photon) with access to unified governance (Unity Catalog) and\nstorage (Delta Lake). Regarding streaming workloads, both DLT and Workflows\n\nshare the same core streaming engine — Spark Structured Streaming. In the\ncase of DLT, customers program against the DLT API and DLT uses the Structured\nStreaming engine under the hood. In the case of Jobs, customers program\nagainst the Spark API directly.\n\n\nThe recommendations in this blog post are written from the Structured\nStreaming engine perspective, most of which apply to both DLT and Workflows\n(although DLT does take care of some of these automatically, like Triggers and\nCheckpoints). We group the recommendations under the headings “Before\nDeployment” and “After Deployment” to highlight when these concepts will\nneed to be applied and are releasing this blog series with this split between\nthe two. There will be additional deep-dive content for some of the sections\nbeyond as well. We recommend reading all sections before beginning work\nto productionalize a streaming pipeline or application, and revisiting these\nrecommendations as you promote it from dev to QA and eventually production.\n\n**Before deployment**\n\nThere are many things you need to consider when creating your streaming\napplication to improve the production experience. Some of these topics, like\nunit testing, checkpoints, triggers, and state management, will determine how\nyour streaming application performs. 
Others, like naming conventions and how\nmany streams to run on which clusters, have more to do with managing multiple\nstreaming applications in the same environment.\n\n\n-----\n\n**Unit testing**\n\n\nThe cost associated with finding and fixing a bug goes up exponentially\nthe farther along you get in the SDLC process, and a Structured Streaming\napplication is no different. When you’re turning that prototype into a hardened\nproduction pipeline you need a CI/CD process with built-in tests. So how do you\ncreate those tests?\n\nAt first you might think that unit testing a streaming pipeline requires something\nspecial, but that isn’t the case. The general guidance for streaming pipelines is\nno different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . It starts by\norganizing your code so that it can be unit tested effectively:\n\n**•** Divide your code into testable chunks\n\n**•** Organize your business logic into functions calling other functions.\nIf you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\nmultiple functions that can be individually tested.\n\n**•** Do not code in dependencies on the global state or external systems\n\n**•** Any function manipulating a DataFrame or data set should be organized\nto take the DataFrame/data set/configuration as input and output the\nDataFrame/data set\n\nOnce your code is separated out in a logical manner you can implement unit\ntests for each of your functions. Spark-agnostic functions can be tested like any\nother function in that language. For testing UDFs and functions with DataFrames\nand data sets, there are multiple Spark testing frameworks available. These\n\n\nframeworks support all of the DataFrame/data set APIs so that you can easily\ncreate input, and they have specialized assertions that allow you to compare\nDataFrame content and schemas. Some examples are:\n\n**•** The built-in Spark test suite, designed to test all parts of Spark\n\n**•** spark-testing-base, which has support for both Scala and Python\n\n**•** spark-fast-tests, for testing Scala Spark 2 & 3\n\n**•** chispa, a Python version of spark-fast-tests\n\nCode examples for each of these libraries can be found [here](https://github.com/alexott/spark-playground/tree/master/testing) .\n\nBut wait! I’m testing a streaming application here — don’t I need to make\nstreaming DataFrames for my unit tests? The answer is no; you do not! Even\nthough a streaming DataFrame represents a data set with no defined ending,\nwhen functions are executed on it they are executed on a microbatch — a\ndiscrete set of data. You can use the same unit tests that you would use for a\nbatch application, for both stateless and stateful streams. One of the advantages\nof Structured Streaming over other frameworks is the ability to use the same\ntransformation code for both streaming and with other batch operations for\nthe same sink. This allows you to simplify some operations, like backfilling\ndata, for example, where rather than trying to sync the logic between two\ndifferent applications, you can just modify the input sources and write to the\nsame destination. 
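To make that concrete, here is a minimal sketch of unit testing a DataFrame transformation with a plain batch DataFrame, which is exactly the kind of function you would also call from a streaming query. The function name, column names and values below are illustrative, not taken from any particular pipeline:\n\nimport pyspark.sql.functions as F\nfrom pyspark.sql import DataFrame, SparkSession\n\n# Business logic factored into a function that takes and returns a DataFrame\ndef add_event_date(df: DataFrame) -> DataFrame:\n    return df.withColumn( \"event_date\" , F.to_date( \"event_ts\" ))\n\n# A regular pytest-style test using a small batch DataFrame as input\ndef test_add_event_date():\n    spark = SparkSession.builder.getOrCreate()\n    input_df = spark.createDataFrame([( \"2023-01-15 10:00:00\" ,)], [ \"event_ts\" ])\n    result = add_event_date(input_df).select( \"event_date\" ).first()[0]\n    assert str(result) == \"2023-01-15\"\n\nBecause the same add_event_date function is what you would call inside writeStream or foreachBatch, the logic you unit test is the logic that runs in production.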
If the sink is a Delta table, you can even do these operations\nconcurrently if both processes are append-only operations.\n\n\n-----\n\n**Triggers**\n\n\nprocess a microbatch in order to maximize resource utilization, but setting the\ninterval longer would make sense if your stream is running on a shared cluster\nand you don’t want it to constantly take the cluster resources.\n\nIf you do not need your stream to run continuously, either because data doesn’t\ncome that often or your SLA is 10 minutes or greater, then you can use the\nTrigger.Once option. This option will start up the stream, check for anything new\nsince the last time it ran, process it all in one big batch, and then shut down.\nJust like with a continuously running stream when using Trigger.Once, the\ncheckpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.\n\nSpark has a new version of Trigger.Once called Trigger.AvailableNow. While\nTrigger.Once will process everything in one big batch, which depending on your\ndata size may not be ideal, Trigger.AvailableNow will split up the data based on\nmaxFilesPerTrigger and maxBytesPerTrigger settings. This allows the data to be\nprocessed in multiple batches. Those settings are ignored with Trigger.Once.\nYou can see examples for setting triggers [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) .\n\n**Pop quiz —** how do you turn your streaming process into a batch process\nthat automatically keeps track of where it left off with just one line of code?\n\n**Answer —** change your processing time trigger to Trigger.Once/Trigger.\nAvailableNow! Exact same code, running on a schedule, that will neither miss nor\nreprocess any records.\n\n\nNow that you know your code works, you need to determine how often your\nstream will look for new data. This is where [triggers](https://docs.databricks.com/structured-streaming/triggers.html) come in. Setting a trigger is\none of the options for the writeStream command, and it looks like this:\n\n_// Scala/Java_\n.trigger(Trigger.ProcessingTime( \"30 seconds\" ))\n\n_# Python_\n.trigger(processingTime= '30 seconds' )\n\nIn the above example, if a microbatch completes in less than 30 seconds,\nthen the engine will wait for the rest of the time before kicking off the next\nmicrobatch. If a microbatch takes longer than 30 seconds to complete, then the\nengine will start the next microbatch immediately after the previous one finishes.\n\nThe two factors you should consider when setting your trigger interval are how\nlong you expect your stream to process a microbatch and how often you want\nthe system to check for new data. You can lower the overall processing latency\nby using a shorter trigger interval and increasing the resources available for\nthe streaming query by adding more workers or using compute or memory\noptimized instances tailored to your application’s performance. These increased\nresources come with increased costs, so if your goal is to minimize costs, then a\nlonger trigger interval with less compute can work. Normally you would not set a\ntrigger interval longer than what it would typically take for your stream to\n\n\n-----\n\n**Name your stream**\n\n\nYou name your children, you name your pets, now it’s time to name your streams.\nThere’s a writeStream option called .queryName that allows you to provide a\nfriendly name for your stream. Why bother? Well, suppose you don’t name it. 
In\nthat case, all you’ll have to go on in the Structured Streaming tab in the Spark UI\nis the string and the unintelligible guid that is automatically generated\nas the stream’s unique identifier. If you have more than one stream running on a\ncluster, and all of them have and unintelligible strings as identifiers,\nhow do you find the one you want? If you’re exporting metrics how do you tell\nwhich is which?\n\nMake it easy on yourself, and name your streams. When you’re managing them in\nproduction you’ll be glad you did, and while you’re at it, go and name your batch\nqueries in any foreachBatch() code you have.\n\n**Fault tolerance**\n\nHow does your stream recover from being shut down? There are a few different\ncases where this can come into play, like cluster node failures or intentional\nhalts, but the solution is to set up checkpointing. Checkpoints with write-ahead\nlogs provide a degree of protection from your streaming application being\ninterrupted, ensuring it will be able to pick up again where it last left off.\n\nCheckpoints store the current offsets and state values (e.g., aggregate values) for\nyour stream. Checkpoints are stream specific so each should be set to its own\nlocation. Doing this will let you recover more gracefully from shutdowns, failures\nfrom your application code or unexpected cloud provider failures or limitations.\n\n\nTo configure checkpoints, add the checkpointLocation option to your stream\ndefinition:\n\n_// Scala/Java/Python_\nstreamingDataFrame.writeStream\n.format( \"delta\" )\n.option( \"path\" , \"\" )\n.queryName( \"TestStream\" )\n.option( \"checkpointLocation\" , \"\" )\n.start()\n\nTo keep it simple — every time you call .writeStream, you must specify the\ncheckpoint option with a unique checkpoint location. Even if you’re using\nforeachBatch and the writeStream itself doesn’t specify a path or table option,\nyou must still specify that checkpoint. It’s how Spark Structured Streaming gives\nyou hassle-free fault tolerance.\n\nEfforts to manage the checkpointing in your stream should be of little concern\nin general. As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454) , “The simplest way to perform streaming\nanalytics is not having to reason about streaming at all.” That said, one setting\n\ndeserves mention as questions around the maintenance of checkpoint files\ncome up occasionally. Though it is an internal setting that doesn’t require direct\nconfiguration, the setting spark.sql.streaming.minBatchesToRetain (default 100)\ncontrols the number of checkpoint files that get created. Basically, the number\nof files will be roughly this number times two, as there is a file created noting the\noffsets at the beginning of the batch (offsets, a.k.a write ahead logs) and another\non completing the batch (commits). The number of files is checked periodically\nfor cleanup as part of the internal processes. This simplifies at least one aspect\nof long-term streaming application maintenance for you.\n\n\n-----\n\nIt is also important to note that some changes to your application code can\ninvalidate the checkpoint. Checking for any of these changes during code\nreviews before deployment is recommended. 
You can find examples of changes\nwhere this can happen in [Recovery Semantics after Changes in a Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query)\n[Query](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query) . Suppose you want to look at checkpointing in more detail or consider\nwhether asynchronous checkpointing might improve the latency in your\nstreaming application. In that case, these are covered in greater depth in\n[Speed Up Streaming Queries With Asynchronous State Checkpointing](https://www.databricks.com/blog/2022/05/02/speed-up-streaming-queries-with-asynchronous-state-checkpointing.html) .\n\n**State management and RocksDB**\n\nStateful streaming applications are those where current records may depend\non previous events, so Spark has to retain data in between microbatches.\nThe data it retains is called state, and Spark will store it in a state store and\nread, update and delete it during each microbatch. Typical stateful operations\nare streaming aggregations, streaming dropDuplicates, stream-stream joins,\nmapGroupsWithState, or flatMapGroupsWithState. Some common types of\nexamples where you’ll need to think about your application state could be\nsessionization or hourly aggregation using group by methods to calculate\n\nbusiness metrics. Each record in the state store is identified by a key that is used\nas part of the stateful computation, and the more unique keys that are required\nthe larger the amount of state data that will be stored.\n\nWhen the amount of state data needed to enable these stateful operations\ngrows large and complex, it can degrade your workloads’ performance, leading\nto increased latency or even failures. A typical indicator of the state store being\n\n\nthe culprit of added latency is large amounts of time spent in garbage collection\n(GC) pauses in the JVM. If you are monitoring the microbatch processing time,\nthis could look like a continual increase or wildly varying processing time across\nmicrobatches.\n\nThe default configuration for a state store, which is sufficient for most general\nstreaming workloads, is to store the state data in the executors’ JVM memory.\nLarge number of keys (typically millions, see the Monitoring & Instrumentation\nsection in part 2 of this blog) can add excessive memory pressure on the\nmachine memory and increase the frequency of hitting these GC pauses as it\ntries to free up resources.\n\nOn the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\nuse [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\nmemory pressure. RocksDB is an embeddable persistent key-value store for fast\nstorage. It features high performance through a log-structured database engine\nwritten entirely in C++ and optimized for fast, low-latency storage.\n\nLeveraging RocksDB as the state store provider still uses machine memory\nbut no longer occupies space in the JVM and makes for a more efficient\nstate management system for large amounts of keys. 
This doesn’t come for\nfree, however, as it introduces an extra step in processing every microbatch.\nIntroducing RocksDB shouldn’t be expected to reduce latency except when it is\nrelated to memory pressure from state data storage in the JVM. The RocksDBbacked state store still provides the same degree of fault tolerance as the\nregular state storage as it is included in the stream checkpointing.\n\n\n-----\n\nRocksDB configuration, like checkpoint configuration, is minimal by design and so\nyou only need to declare it in your overall Spark configuration:\n\nspark.conf. set (\n\"spark.sql.streaming.stateStore.providerClass\" ,\n\"com.databricks.sql.streaming.state.RocksDBStateStoreProvider\" )\n\nIf you are monitoring your stream using the streamingQueryListener class, then\nyou will also notice that RocksDB metrics will be included in the stateOperators\nfield. For more detailed information on this see the [RocksDB State Store Metrics](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics)\n[section](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics) of “Structured Streaming in Production.”\n\nIt’s worth noting that large numbers of keys can have other adverse impacts in\naddition to raising memory consumption, especially with unbounded or nonexpiring state keys. With or without RocksDB, the state from the application\nalso gets backed up in checkpoints for fault tolerance. So it makes sense that\nif you have state files being created so that they will not expire, you will keep\naccumulating files in the checkpoint, increasing the amount of storage required\nand potentially the time to write it or recover from failures as well. For the data\nin memory (see the Monitoring & Instrumentation section in part 2 of this blog)\n\nthis situation can lead to somewhat vague out-of-memory errors, and for the\ncheckpointed data written to cloud storage you might observe unexpected\nand unreasonable growth. Unless you have a business need to retain streaming\nstate for all the data that has been processed (and that is rare), read the [Spark](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)\n[Structured Streaming documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) and make sure to implement your stateful\noperations so that the system can drop state records that are no longer needed\n(pay close attention to dropDuplicates and stream-stream joins).\n\n\n**Running multiple streams on a cluster**\n\nOnce your streams are fully tested and configured, it’s time to figure out how to\norganize them in production. It’s a common pattern to stack multiple streams on\nthe same Spark cluster to maximize resource utilization and save cost. This is fine\nto a point, but there are limits to how much you can add to one cluster before\nperformance is affected. The driver has to manage all of the streams running on\nthe cluster, and all streams will compete for the same cores across the workers.\nYou need to understand what your streams are doing and plan your capacity\nappropriately to stack effectively.\n\nHere is what you should take into account when you’re planning on stacking\nmultiple streams on the same cluster:\n\n**•** Make sure your driver is big enough to manage all of your streams. Is your\ndriver struggling with a high CPU utilization and garbage collection? That\nmeans it’s struggling to manage all of your streams. 
Either reduce the\nnumber of streams or increase the size of your driver.\n\n**•** Consider the amount of data each stream is processing. The more data\nyou are ingesting and writing to a sink, the more cores you will need in\norder to maximize your throughput for each stream. You’ll need to reduce\nthe number of streams or increase the number of workers depending on\nhow much data is being processed. For sources like Kafka you will need to\nconfigure how many cores are being used to ingest with the minPartitions\noption if you don’t have enough cores for all of the partitions across all of\nyour streams.\n\n\n-----\n\n**•** Consider the complexity and data volume of your streams. If all of the\nstreams are doing minimal manipulation and just appending to a sink, then\neach stream will need fewer resources per microbatch and you’ll be able to\nstack more. If the streams are doing stateful processing or computation/\nmemory-intensive operations, that will require more resources for good\nperformance and you’ll want to stack fewer streams.\n\n**•** Consider [scheduler pools](https://spark.apache.org/docs/latest/job-scheduling.html#fair-scheduler-pools) . When stacking streams they will all be\ncontending for the same workers and cores, and one stream that needs a\nlot of cores will cause the other streams to wait. Scheduler pools enable\nyou to have different streams execute on different parts of the cluster.\nThis will enable streams to execute in parallel with a subset of the available\nresources.\n\n\n**Conclusion**\n\nSome of the ideas we’ve addressed here certainly deserve their own time\nand special treatment with a more in-depth discussion, which you can look\nforward to in later deep dives. However, we hope these recommendations are\nuseful as you begin your journey or seek to enhance your production streaming\nexperience. Be sure to continue with the next post, “Streaming in Production:\nCollected Best Practices, Part 2.”\n\n**[Review Databrick’s Structured Streaming Getting Started Guide](https://www.databricks.com/spark/getting-started-with-apache-spark/streaming)**\n\n\n\n**•** Consider your SLA. If you have mission critical streams, isolate them as a\nbest practice so lower-criticality streams do not affect them.\n\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nOn Databricks we typically see customers stack between 10-30 streams on a\ncluster, but this varies depending on the use case. Consider the factors above so\nthat you can have a good experience with performance, cost and maintainability.\n\n\n-----\n\nSECTION 2.5 \u0007\n\n**Streaming in Production: Collected Best Practices, Part 2**\n\nby **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nJanuary 10, 2023\n\n\nIn our two-part blog series titled “Streaming in Production: Collected Best\nPractices,” this is the second article. Here we discuss the “After Deployment”\nconsiderations for a Structured Streaming Pipeline. 
The majority of the\nsuggestions in this post are relevant to both Structured Streaming Jobs and\nDelta Live Tables (our flagship and fully managed ETL product that supports\nboth batch and streaming pipelines).\n\n**After deployment**\n\nAfter the deployment of your streaming application, there are typically three\nmain things you’ll want to know:\n\n**•** How is my application running?\n\n**•** Are resources being used efficiently?\n\n**•** How do I manage any problems that come up?\n\nWe’ll start with an introduction to these topics, followed by a deeper dive later in\nthis blog series.\n\n\n**Monitoring and instrumentation (How is my application running?)**\n\nStreaming workloads should be pretty much hands-off once deployed to\nproduction. However, one thing that may sometimes come to mind is: “how is my\napplication running?” Monitoring applications can take on different levels and\nforms depending on:\n\n**•** the metrics collected for your application (batch duration/latency,\nthroughput, …)\n\n**•** where you want to monitor the application from\n\nAt the simplest level, there is a streaming dashboard ( [A Look at the New](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html)\n[Structured Streaming UI](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) ) and built-in logging directly in the Spark UI that can be\nused in a variety of situations.\n\nThis is in addition to setting up failure alerts on jobs running streaming\nworkloads.\n\nIf you want more fine-grained metrics or to create custom actions based on\nthese metrics as part of your code base, then the StreamingQueryListener is\nbetter aligned with what you’re looking for.\n\n\n-----\n\nIf you want the Spark metrics to be reported (including machine level traces for\ndrivers or workers) you should use the platform’s [metrics sink](https://spark.apache.org/docs/latest/monitoring.html#metrics) .\n\nThe Apache Spark Structured Streaming UI\n\n\nAnother point to consider is where you want to surface these metrics for\nobservability. There is a Ganglia dashboard at the cluster level, integrated partner\napplications like [Datadog](https://www.datadoghq.com/blog/databricks-monitoring-datadog/) for monitoring streaming workloads, or even more open\nsource options you can build using tools like Prometheus and Grafana. Each\nhas advantages and disadvantages to consider around cost, performance, and\nmaintenance requirements.\n\nWhether you have low volumes of streaming workloads where interactions in the\nUI are sufficient or have decided to invest in a more robust monitoring platform,\nyou should know how to observe your production streaming workloads. Further\n“Monitoring and Alerting” posts later in this series will contain a more thorough\ndiscussion. In particular, we’ll see different measures on which to monitor\nstreaming applications and then later take a deeper look at some of the tools\nyou can leverage for observability.\n\n**Application optimization (Are resources being used effectively?**\n\n**Think “cost”)**\n\nThe next concern we have after deploying to production is “is my application\n\nusing resources effectively?” As developers, we understand (or quickly learn) the\ndistinction between working code and well-written code. Improving the way your\ncode runs is usually very satisfying, but what ultimately matters is the overall\ncost of running it. 
Cost considerations for Structured Streaming applications will\nbe largely similar to those for other Spark applications. One notable difference\nis that failing to optimize for production workloads can be extremely costly,\nas these workloads are frequently “always-on” applications, and thus wasted\nexpenditure can quickly compound. Because assistance with cost optimization is\n\n\n-----\n\nfrequently requested, a separate post in this series will address it. The key points\nthat we’ll focus on will be efficiency of usage and sizing.\n\nGetting the cluster sizing right is one of the most significant differences between\nefficiency and wastefulness in streaming applications. This can be particularly\ntricky because in some cases it’s difficult to estimate the full load conditions of\nthe application in production before it’s actually there. In other cases, it may be\ndifficult due to natural variations in volume handled throughout the day, week, or\nyear. When first deploying, it can be beneficial to oversize slightly, incurring the\nextra expense to avoid inducing performance bottlenecks. Utilize the monitoring\ntools you chose to employ after the cluster has been running for a few weeks\nto ensure proper cluster utilization. For example, are CPU and memory levels\nbeing used at a high level during peak load or is the load generally small and the\ncluster may be downsized? Maintain regular monitoring of this and keep an eye\nout for changes in data volume over time; if either occurs, a cluster resize may be\nrequired to maintain cost-effective operation.\n\nAs a general guideline, you should avoid excessive shuffle operations, joins, or an\nexcessive or extreme watermark threshold (don’t exceed your needs), as each\ncan increase the number of resources you need to run your application. A large\nwatermark threshold will cause Structured Streaming to keep more data in the\nstate store between batches, leading to an increase in memory requirements\nacross the cluster. Also, pay attention to the type of VM configured — are you\nusing memory-optimized for your memory-intense stream? Compute-optimized\nfor your computationally-intensive stream? If not, look at the utilization levels\nfor each and consider trying a machine type that could be a better fit. Newer\nfamilies of servers from cloud providers with more optimal CPUs often lead to\nfaster execution, meaning you might need fewer of them to meet your SLA.\n\n\n**Troubleshooting (How do I manage any problems that come up?)**\n\nThe last question we ask ourselves after deployment is “how do I manage any\nproblems that come up?” As with cost optimization, troubleshooting streaming\napplications in Spark often looks the same as other applications since most of\nthe mechanics remain the same under the hood. For streaming applications,\nissues usually fall into two categories — failure scenarios and latency scenarios\n\n**Failure scenarios**\n\nFailure scenarios typically manifest with the stream stopping with an error,\nexecutors failing or a driver failure causing the whole cluster to fail. Common\ncauses for this are:\n\n**•** Too many streams running on the same cluster, causing the driver to be\noverwhelmed. 
On Databricks, this can be seen in Ganglia, where the driver\nnode will show up as overloaded before the cluster fails.\n\n**•** Too few workers in a cluster or a worker size with too small of a core-tomemory ratio, causing executors to fail with an Out Of Memory error.\nThis can also be seen on Databricks in Ganglia before an executor fails,\nor in the Spark UI under the executors tab.\n\n**•** Using a collect to send too much data to the driver, causing it to fail\nwith an Out Of Memory error.\n\n\n-----\n\n**Latency scenarios**\n\nFor latency scenarios, your stream will not execute as fast as you want or expect.\nA latency issue can be intermittent or constant. Too many streams or too small\nof a cluster can be the cause of this as well. Some other common causes are:\n\n**•** Data skew — when a few tasks end up with much more data than the rest\nof the tasks. With skewed data, these tasks take longer to execute than the\nothers, often spilling to disk. Your stream can only run as fast as its slowest\ntask.\n\n**•** Executing a stateful query without defining a watermark or defining a very\nlong one will cause your state to grow very large, slowing down your stream\nover time and potentially leading to failure.\n\n**•** Poorly optimized sink. For example, performing a merge into an overpartitioned Delta table as part of your stream.\n\n**•** Stable but high latency (batch execution time). Depending on the cause,\nadding more workers to increase the number of cores concurrently available\nfor Spark tasks can help. Increasing the number of input partitions and/or\ndecreasing the load per core through batch size settings can also reduce\nthe latency.\n\nJust like troubleshooting a batch job, you’ll use Ganglia to check cluster\nutilization and the Spark UI to find performance bottlenecks. There is a\nspecific [Structured Streaming tab](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) in the Spark UI created to help monitor and\ntroubleshoot streaming applications. On that tab each stream that is running will\nbe listed, and you’ll see either your stream name if you named your stream or\n\n\n if you didn’t. You’ll also see a stream ID that will be visible on the Jobs\ntab of the Spark UI so that you can tell which jobs are for a given stream.\n\nYou’ll notice above we said which jobs are for a given stream. It’s a common\nmisconception that if you were to look at a streaming application in the Spark\nUI you would just see one job in the Jobs tab running continuously. Instead,\ndepending on your code, you will see one or more jobs that start and complete\nfor each microbatch. Each job will have the stream ID from the Structured\nStreaming tab and a microbatch number in the description, so you’ll be able to\ntell which jobs go with which stream. You can click into those jobs to find the\nlongest running stages and tasks, check for disk spills, and search by Job ID in\nthe SQL tab to find the slowest queries and check their explain plans.\n\nThe Jobs tab in the Apache Spark UI\n\n\n-----\n\nIf you click on your stream in the Structured Streaming tab you’ll see how much\ntime the different streaming operations are taking for each microbatch, such as\nadding a batch, query planning and committing (see earlier screenshot of the\nApache Spark Structured Streaming UI). 
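\n\nAs noted above, that tab is much easier to navigate when streams are explicitly named. A minimal sketch of naming a query from PySpark follows; the source table, target table and checkpoint path are hypothetical placeholders:\n\n```\n# Naming the stream makes it appear by name in the Structured Streaming tab and\n# in the Jobs tab descriptions for its microbatches.\n(spark.readStream\n    .table('events_bronze')\n    .groupBy('event_type').count()\n    .writeStream\n    .queryName('events_by_type')\n    .outputMode('complete')\n    .option('checkpointLocation', '/tmp/checkpoints/events_by_type')\n    .toTable('events_by_type_gold'))\n```\n\n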
You can also see how many rows are\nbeing processed as well as the size of your state store for a stateful stream.\nThis can give insights into where potential latency issues are.\n\nWe will go more in-depth with troubleshooting later in this blog series, where\nwe’ll look at some of the causes and remedies for both failure scenarios and\nlatency scenarios as we outlined above.\n\n**Conclusion**\n\nYou may have noticed that many of the topics covered here are very similar to\nhow other production Spark applications should be deployed. Whether your\nworkloads are primarily streaming applications or batch processes, the majority\nof the same principles will apply. We focused more on things that become\nespecially important when building out streaming applications, but as we’re\n\n\nsure you’ve noticed by now, the topics we discussed should be included in\nmost production deployments.\n\nAcross the majority of industries in the world today information is needed\nfaster than ever, but that won’t be a problem for you. With Spark Structured\nStreaming you’re set to make it happen at scale in production. Be on the lookout\nfor more in-depth discussions on some of the topics we’ve covered in this blog,\nand in the meantime keep streaming!\n\n**[Review Databricks Structured Streaming in](https://docs.databricks.com/structured-streaming/production.html)**\n**[Production Documentation](https://docs.databricks.com/structured-streaming/production.html)**\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.6 \u0007\n\n**Building Geospatial Data Products**\n\nby **M I L O S C O L I C**\n\nJanuary 6, 2023\n\n\nGeospatial data has been driving innovation for centuries, through use of\nmaps, cartography and more recently through digital content. For example,\nthe oldest map has been found etched in a piece of mammoth tusk and dates\n[approximately 25,000 BC](https://en.wikipedia.org/wiki/History_of_cartography) . This makes geospatial data one of the oldest data\nsources used by society to make decisions. A more recent example, labeled\nas the birth of spatial analysis, is that of Charles Picquet in 1832 who used\ngeospatial data to analyze [Cholera outbreaks in Paris](https://gallica.bnf.fr/ark:/12148/bpt6k842918.image) ; a couple of decades\nlater John Snow in 1854 followed the same approach for [Cholera outbreaks in](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak)\n[London](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak) . These two individuals used geospatial data to solve one of the toughest\nproblems of their times and in effect save countless lives. Fast-forwarding to the\n20th century, the concept of [Geographic Information Systems (GIS)](https://education.nationalgeographic.org/resource/geographic-information-system-gis) was [first](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf)\n[introduced](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf) in 1967 in Ottawa, Canada, by the Department of Forestry and\nRural Development.\n\nToday we are in the midst of the cloud computing industry revolution —\nsupercomputing scale available to any organization, virtually infinitely scalable\nfor both storage and compute. 
Concepts like [data mesh](https://www.databricks.com/blog/2022/10/19/building-data-mesh-based-databricks-lakehouse-part-2.html) and [data marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html)\nare emerging within the data community to address questions like platform\nfederation and interoperability. How can we adopt these concepts to geospatial\ndata, spatial analysis and GIS systems? By adopting the concept of data\nproducts and approaching the design of geospatial data as a product.\n\n\nIn this blog we will provide a point of view on how to design scalable geospatial\ndata products that are modern and robust. We will discuss how Databricks\nLakehouse Platform can be used to unlock the full potential of geospatial\nproducts that are one of the most valuable assets in solving the toughest\nproblems of today and the future.\n\n**What is a data product? And how to design one?**\n\nThe most broad and the most concise definition of a “data product” was coined\nby DJ Patil (the first U.S. Chief Data Scientist) in _Data Jujitsu: The Art of Turning_\n_Data into Product:_ “a product that facilitates an end goal through the use of\ndata.” The complexity of this definition (as admitted by Patil himself) is needed to\nencapsulate the breadth of possible products, to include dashboards, reports, Excel\n\nspreadsheets, and even CSV extracts shared via emails. You might notice that the\nexamples provided deteriorate rapidly in quality, robustness and governance.\n\nWhat are the concepts that differentiate a successful product versus an\nunsuccessful one? Is it the packaging? Is it the content? Is it the quality of the\ncontent? Or is it only the product adoption in the market? Forbes defines the\n10 must-haves of a successful product. A good framework to summarize this is\nthrough the value pyramid.\n\n\n-----\n\nFigure 1: Product value pyramid (source)\n\nThe value pyramid provides a priority on each aspect of the product. Not every\nvalue question we ask about the product carries the same amount of weight. If\nthe output is not useful none of the other aspects matter — the output isn’t really\na product but becomes more of a data pollutant to the pool of useful results.\nLikewise, scalability only matters after simplicity and explainability are addressed.\n\nHow does the value pyramid relate to the data products? Each data output, in\norder to be a data product:\n\n**•** **Should have clear usefulness.** The amount of the data society is\ngenerating is rivaled only by the amount of data pollutants we are\ngenerating. These are outputs lacking clear value and use, much less a\nstrategy for what to do with them.\n\n\n\n**•** **Should be explainable.** With the emergence of AI/ML, explainability has\nbecome even more important for data driven decision-making. Data\nis as good as the metadata describing it. Think of it in terms of food —\ntaste does matter, but a more important factor is the nutritional value\nof ingredients.\n\n**•** **Should be simple.** An example of product misuse is using a fork to eat\ncereal instead of using a spoon. Furthermore, simplicity is essential but\nnot sufficient — beyond simplicity the products should be intuitive.\nWhenever possible both intended and unintended uses of the data\nshould be obvious.\n\n**•** **Should be scalable.** Data is one of the few resources that grows with\nuse. The more data you process the more data you have. 
If both inputs\nand outputs of the system are unbounded and ever-growing, then the\nsystem has to be scalable in compute power, storage capacity and\ncompute expressive power. Cloud data platforms like Databricks are in\na unique position to answer for all of the three aspects.\n\n**•** **Should generate habits.** In the data domain we are not concerned\nwith customer retention as is the case for the retail products. However,\nthe value of habit generation is obvious if applied to best practices.\nThe systems and data outputs should exhibit the best practices and\npromote them — it should be easier to use the data and the system in\nthe intended way than the opposite.\n\nThe geospatial data should adhere to all the aforementioned aspects — any data\nproducts should. On top of this tall order, geospatial data has some specific needs.\n\n\n-----\n\n**Geospatial data standards**\n\n\n\n**•** **“Advocate the understanding and use of geospatial data standards**\n**within other sectors of government.”** — Value pyramid applies to\nthe standards as well — concepts like ease of adherence (usefulness/\nsimplicity), purpose of the standard (explainability/usefulness), adoption\n(habit generation) are critical for the value generation of a standard.\n\nA critical tool for achieving the data standards mission is the [FAIR](https://www.go-fair.org/fair-principles/) data\nprinciples:\n\n**•** **Findable** — The first step in (re)using data is to find them. Metadata\nand data should be easy to find for both humans and computers.\nMachine-readable metadata are essential for automatic discovery of\ndata sets and services.\n\n**•** **Accessible** — Once the user finds the required data, she/he/they\nneed to know how they can be accessed, possibly including\nauthentication and authorization.\n\n**•** **Interoperable** — The data usually needs to be integrated with\nother data. In addition, the data needs to interoperate with\napplications or workflows for analysis, storage, and processing.\n\n**•** **Reusable** — The ultimate goal of FAIR is to optimize the reuse of data.\nTo achieve this, metadata and data should be well-described so that\nthey can be replicated and/or combined in different settings.\n\n\nGeospatial data standards are used to ensure that geographic data is collected,\norganized, and shared in a consistent and reliable way. These standards can\ninclude guidelines for things like data formatting, coordinate systems, map\nprojections, and metadata. Adhering to standards makes it easier to share data\nbetween different organizations, allowing for greater collaboration and broader\naccess to geographic information.\n\nThe Geospatial Commision (UK government) has defined the UK Geospatial\nData Standards Register as a central repository for data standards to be applied\nin the case of geospatial data. Furthermore, the mission of this registry is to:\n\n**•** **“Ensure UK geospatial data is more consistent and coherent and usable**\n**across a wider range of systems.”** — These concepts are a callout for the\nimportance of explainability, usefulness and habit generation (possibly\nother aspects of the value pyramid).\n\n**•** **“Empower the UK geospatial community to become more engaged with**\n**the relevant standards and standards bodies.”** — Habit generation within\nthe community is as important as the robust and critical design on the\nstandard. 
If not adopted standards are useless.\n\n\n-----\n\nWe share the belief that the FAIR principles are crucial for the design of scalable\ndata products we can trust. To be fair, FAIR is based on common sense, so why\nis it key to our considerations? _“What I see in FAIR is not new in itself, but what it_\n_does well is to articulate, in an accessible way, the need for a holistic approach_\n_to data improvement. This ease in communication is why FAIR is being used_\n_increasingly widely as an umbrella for data improvement — and not just in the_\n_geospatial community.”_ — [A FAIR wind sets our course for data improvement](https://geospatialcommission.blog.gov.uk/2022/03/02/a-fair-wind-sets-our-course-for-data-improvement/) .\n\nTo further support this approach, the [Federal Geographic Data Committee](https://www.fgdc.gov/standards) has\ndeveloped the [National Spatial Data Infrastructure (NSDI) Strategic Plan](https://www.fgdc.gov/nsdi-plan/nsdi-strategic-plan-2021-2024.pdf) that\ncovers the years 2021-2024 and was approved in November 2020. The goals\nof NSDI are in essence FAIR principles and convey the same message of designing\nsystems that promote the circular economy of data — data products that flow\nbetween organizations following common standards and in each step through the\ndata supply chain unlock new value and new opportunities. The fact that these\nprinciples are permeating different jurisdictions and are adopted across different\nregulators is a testament to the robustness and soundness of the approach.\n\n\nThe FAIR concepts weave really well together with the data product design.\nIn fact FAIR is traversing the whole product value pyramid and forms a value\ncycle. By adopting both the value pyramid and FAIR principles we design data\nproducts with both internal and external outlook. This promotes data reuse\nas opposed to data accumulation.\n\nWhy do FAIR principles matter for geospatial data and geospatial data\n\nproducts? FAIR is transcendent to geospatial data, it is actually transcendent\nto data, it is a simple yet coherent system of guiding principles for good design\n— and that good design can be applied to anything including geospatial data\nand geospatial systems.\n\n\nFigure 2:\nNDSI Strategic Goals\n\n\n-----\n\n**Grid index systems**\n\nIn traditional GIS solutions’ performance of spatial operations are usually\nachieved by building tree structures ( [KD trees](https://en.wikipedia.org/wiki/K-d_tree) , [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces) , [Quad trees](https://en.wikipedia.org/wiki/Quadtree) , etc).\nThe issue with tree approaches is that they eventually break the scalability\nprinciple — when the data is too big to be processed in order to build the tree\nand the computation required to build the tree is too long and defeats the\npurpose. This also negatively affects the accessibility of data; if we cannot\nconstruct the tree we cannot access the complete data and in effect we cannot\nreproduce the results. In this case, grid index systems provide a solution.\n\n\nGrid index systems are built from the start with the scalability aspects of the\ngeospatial data in mind. Rather than building the trees, they define a series of\ngrids that cover the area of interest. 
In the case of [H3](https://h3geo.org/) (pioneered by Uber),\nthe grid covers the area of the Earth; in the case of local grid index systems\n(e.g., [British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) ) they may only cover the specific area of interest.\nThese grids are composed of cells that have unique identifiers. There is a\nmathematical relationship between location and the cell in the grid. This makes\nthe grid index systems very scalable and parallel in nature.\n\n\nFigure 4: Grid Index Systems (H3, British National Grid)\n\n\n-----\n\nAnother important aspect of grid index systems is that they are open source,\nallowing index values to be universally leveraged by data producers and\nconsumers alike. Data can be enriched with the grid index information at any\nstep of its journey through the data supply chain. This makes the grid index\nsystems an example of community driven data standards. Community driven\ndata standards by nature do not require enforcement, which fully adheres\nto the habit generation aspect of value pyramid and meaningfully addresses\ninteroperability and accessibility principles of FAIR.\n\n\nDatabricks has recently announced [native support for the H3 grid index system](https://www.databricks.com/blog/2022/09/14/announcing-built-h3-expressions-geospatial-processing-and-analytics.html)\nfollowing the same value proposition. Adopting common industry standards\ndriven by the community is the only way to properly drive habit generation and\ninteroperability. To strengthen this statement, organizations like [CARTO](https://carto.com/blog/hexagons-for-location-intelligence/) , [ESRI](https://www.esri.com/arcgis-blog/products/bus-analyst/analytics/using-uber-h3-hexagons-arcgis-business-analyst-pro/)\nand [Google](https://opensource.googleblog.com/2017/12/announcing-s2-library-geometry-on-sphere.html) have been promoting the usage of grid index systems for scalable\nGIS system design. In addition, Databricks Labs project [Mosaic](https://databrickslabs.github.io/mosaic/) supports the\n[British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) as the standard grid index system that is widely used in\nthe UK government. Grid index systems are key for the scalability of geospatial\ndata processing and for properly designing solutions for complex problems\n(e.g., figure 5 — flight holding patterns using H3).\n\n**Geospatial data diversity**\n\nGeospatial data standards spend a solid amount of effort regarding data\nformat standardization, and format for that matter is one of the most\nimportant considerations when it comes to interoperability and reproducibility.\nFurthermore, if the reading of your data is complex — how can we talk about\nsimplicity? Unfortunately geospatial data formats are typically complex, as\ndata can be produced in a number of formats including both open source\n\nand vendor-specific formats. Considering only vector data, we can expect\ndata to arrive in WKT, WKB, GeoJSON, web CSV, CSV, Shape File, GeoPackage,\nand many others. On the other hand, if we are considering raster data we can\nexpect data to arrive in any number of formats such as GeoTiff, netCDF, GRIB, or\nGeoDatabase; for a comprehensive list of formats please consult this [blog](https://gisgeography.com/gis-formats/) .\n\n\nFigure 5: Example of using H3 to express flight holding patterns\n\n\n-----\n\nGeospatial data domain is so diverse and has organically grown over the years\naround the use cases it was addressing. 
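\n\nEven a small sketch of normalizing just two of these vector formats into a common representation (attribute columns plus a WKT geometry column in a Delta table) hints at the glue code involved; this assumes geopandas is installed on the cluster, and the paths and table name are hypothetical:\n\n```\nimport geopandas as gpd\nimport pandas as pd\n\n\ndef vector_file_to_wkt(path: str) -> pd.DataFrame:\n    # Read any format geopandas understands (Shapefile, GeoJSON, GeoPackage, ...)\n    # and return a plain pandas DataFrame with the geometry serialized as WKT.\n    gdf = gpd.read_file(path)\n    attrs = pd.DataFrame(gdf.drop(columns=gdf.geometry.name))\n    attrs['geometry_wkt'] = gdf.geometry.to_wkt()\n    return attrs\n\n\n# Two hypothetical inputs in different formats landed into one Delta table\n# (assuming their attribute schemas are compatible).\nfor path in ['/dbfs/tmp/parcels.shp', '/dbfs/tmp/parcels_update.geojson']:\n    spark.createDataFrame(vector_file_to_wkt(path)) \\\n        .write.format('delta').mode('append').saveAsTable('geo.parcels_bronze')\n```\n\n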
Unification of such a diverse ecosystem\nis a massive challenge. A recent effort by the Open Geospatial Consortium\n(OGC) to standardize to [Apache Parquet](https://parquet.apache.org/) and its geospatial schema specification\n[GeoParquet](https://geoparquet.org/) is a step in the right direction. Simplicity is one of the key aspects\nof designing a good scalable and robust product — unification leads to simplicity\nand addresses one of the main sources of friction in the ecosystem — the data\ningestion. Standardizing to GeoParquet brings a lot of value that addresses all of\nthe aspects of FAIR data and value pyramid.\n\nFigure 6: Geoparquet as a geospatial standard data format\n\n\nWhy introduce another format into an already complex ecosystem? GeoParquet\nisn’t a new format — it is a schema specification for Apache Parquet format that\nis already widely adopted and used by the industry and the community. Parquet\nas the base format supports binary columns and allows for storage of arbitrary\ndata payload. At the same time the format supports structured data columns\nthat can store metadata together with the data payload. This makes it a choice\nthat promotes interoperability and reproducibility. Finally, [Delta Lake](https://delta.io/) format\nhas been built on top of parquet and brings [ACID](https://en.wikipedia.org/wiki/ACID) properties to the table. ACID\nproperties of a format are crucial for reproducibility and for trusted outputs. In\naddition, Delta is the format used by scalable data sharing solution [Delta Sharing](https://www.databricks.com/product/delta-sharing) .\n\nDelta Sharing enables enterprise scale data sharing between any public cloud\nusing Databricks (DIY options for private cloud are available using open source\nbuilding blocks). Delta Sharing completely abstracts the need for custom built\nRest APIs for exposing data to other third parties. Any data asset stored in Delta\n(using GeoParquet schema) automatically becomes a data product that can be\nexposed to external parties in a controlled and governed manner. Delta Sharing\nhas been built from the ground up with [security best practices in mind](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html?utm_source=bambu&utm_medium=social&utm_campaign=advocacy&blaid=3352307) .\n\n\n-----\n\nFigure 7: Delta Sharing simplifying data access in the ecosystem\n\n**Circular data economy**\n\n\nBorrowing the concepts from the sustainability domain, we can define a circular\ndata economy as a system in which data is collected, shared, and used in a way\nthat maximizes its value while minimizing waste and negative impacts, such as\nunnecessary compute time, untrustworthy insights, or biased actions based\ndata pollutants. Reusability is the key concept in this consideration — how can\nwe minimize the \"reinvention of the wheel.\" There are countless data assets out\nin the wild that represent the same area, same concepts with just ever slight\nalterations to better match a specific use case. Is this due to the actual\n\n\noptimizations or due to the fact it was easier to create a new copy of the assets\nthan to reuse the existing ones? 
Or was it too hard to find the existing data\nassets, or maybe it was too complex to define data access patterns.\n\nData asset duplication has many negative aspects in both FAIR considerations\nand data value pyramid considerations — having many disparate similar (but\ndifferent) data assets that represent the same area and same concepts can\ndeteriorate simplicity considerations of the data domain — it becomes hard\nto identify the data asset we actually can trust. It can also have very negative\n\n\n-----\n\nimplications toward habit generation. Many niche communities will emerge\nthat will standardize to themselves ignoring the best practices of the wider\necosystem, or worse yet they will not standardize at all.\n\nIn a circular data economy, data is treated as a valuable resource that can be\nused to create new products and services, as well as improving existing ones.\nThis approach encourages the reuse and recycling of data, rather than treating it\nas a disposable commodity. Once again, we are using the sustainability analogy\nin a literal sense — we argue that this is the correct way of approaching the\nproblem. Data pollutants are a real challenge for organizations both internally and\nexternally. An article by The Guardian states that less than 1% of collected data is\nactually analyzed. There is too much data duplication, the majority of data is hard\nto access and deriving actual value is too cumbersome. Circular data economy\npromotes best practices and reusability of existing data assets allowing for a more\nconsistent interpretation and insights across the wider data ecosystem.\n\n\nFigure 8: Databricks Marketplace\n\n\n-----\n\nInteroperability is a key component of FAIR data principles, and from\ninteroperability a question of circularity comes to mind. How can we design an\necosystem that maximizes data utilization and data reuse? Once again, FAIR\ntogether with the value pyramid holds answers. Findability of the data is key to\nthe data reuse and to solving for data pollution. With data assets that can be\ndiscovered easily we can avoid the recreation of same data assets in multiple\nplaces with just slight alteration. Instead we gain a coherent data ecosystem\nwith data that can be easily combined and reused. Databricks has recently\nannounced the [Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html) . The idea behind the marketplace is in\nline with the original definition of data product by DJ Patel. The marketplace\nwill support sharing of data sets, notebooks, dashboards, and machine learning\nmodels. The critical building block for such a marketplace is the concept of\nDelta Sharing — the scalable, flexible and robust channel for sharing any data —\ngeospatial data included.\n\n\nDesigning scalable data products that will live in the marketplace is crucial.\nIn order to maximize the value add of each data product one should strongly\nconsider FAIR principles and the product value pyramid. Without these guiding\nprinciples we will only increase the issues that are already present in the\ncurrent systems. 
Each data product should solve a unique problem and should\nsolve it in a simple, reproducible and robust way.\n\n**You can read more on how Databricks Lakehouse**\n**Platform can help you accelerate time to value from**\n**your data products in the eBook:** **[A New Approach](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)**\n**[to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)** **.**\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.7 \u0007\n\n**Data Lineage With Unity Catalog**\n\nby **P A U L R O O M E , TA O F E N G A N D S A C H I N T H A K U R**\n\nJune 8, 2022\n\n\nThis blog will discuss the importance of data lineage, some of the common\nuse cases, our vision for better data transparency and data understanding with\ndata lineage.\n\n**What is data lineage and why is it important?**\n\nData lineage describes the transformations and refinements of data from source\nto insight. Lineage includes capturing all the relevant metadata and events\nassociated with the data in its lifecycle, including the source of the data set,\nwhat other data sets were used to create it, who created it and when, what\ntransformations were performed, what other data sets leverage it, and many other\nevents and attributes. With a data lineage solution, data teams get an end-to-end\nview of how data is transformed and how it flows across their data estate.\n\nAs more and more organizations embrace a data-driven culture and set up\nprocesses and tools to democratize and scale data and AI, data lineage is\nbecoming an essential pillar of a pragmatic data management and governance\nstrategy.\n\nTo understand the importance of data lineage, we have highlighted some of the\ncommon use cases we have heard from our customers below.\n\n\n**Impact analysis**\nData goes through multiple updates or revisions over its lifecycle, and\nunderstanding the potential impact of any data changes on downstream\nconsumers becomes important from a risk management standpoint. With data\nlineage, data teams can see all the downstream consumers — applications,\ndashboards, machine learning models or data sets, etc. — impacted by data\nchanges, understand the severity of the impact, and notify the relevant\nstakeholders. Lineage also helps IT teams proactively communicate data\nmigrations to the appropriate teams, ensuring business continuity.\n\n**Data understanding and transparency**\nOrganizations deal with an influx of data from multiple sources, and building\na better understanding of the context around data is paramount to ensure\nthe trustworthiness of the data. Data lineage is a powerful tool that enables\ndata leaders to drive better transparency and understanding of data in their\norganizations. Data lineage also empowers data consumers such as data scientists,\ndata engineers and data analysts to be context-aware as they perform analyses,\nresulting in better quality outcomes. Finally, data stewards can see which data sets\nare no longer accessed or have become obsolete to retire unnecessary data and\nensure data quality for end business users .\n\n\n-----\n\n**Debugging and diagnostics**\nYou can have all the checks and balances in place, but something will eventually\nbreak. Data lineage helps data teams perform a root cause analysis of any errors\nin their data pipelines, applications, dashboards, machine learning models, etc.,\nby tracing the error to its source. 
This significantly reduces the debugging time,\nsaving days, or in many cases, months of manual effort.\n\n**Compliance and audit readiness**\nMany compliance regulations, such as the General Data Protection Regulation\n(GDPR), California Consumer Privacy Act (CCPA), Health Insurance Portability and\nAccountability Act (HIPPA), Basel Committee on Banking Supervision (BCBS) 239,\nand Sarbanes-Oxley Act (SOX), require organizations to have clear understanding\nand visibility of data flow. As a result, data traceability becomes a key requirement\nin order for their data architecture to meet legal regulations. Data lineage helps\norganizations be compliant and audit-ready, thereby alleviating the operational\noverhead of manually creating the trails of data flows for audit reporting purposes.\n\n\n**Effortless transparency and proactive control with**\n**data lineage**\n\nThe [lakehouse](https://www.databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) provides a pragmatic data management architecture that\nsubstantially simplifies enterprise data infrastructure and accelerates innovation\nby unifying your data warehousing and AI use cases on a single platform.\nWe believe data lineage is a key enabler of better data transparency and data\nunderstanding in your lakehouse, surfacing the relationships between data,\njobs, and consumers, and helping organizations move toward proactive data\nmanagement practices. For example:\n\n**•** As the owner of a dashboard, do you want to be notified next time that a\ntable your dashboard depends upon wasn’t loaded correctly?\n\n**•** As a machine learning practitioner developing a model, do you want to be\nalerted that a critical feature in your model will be deprecated soon?\n\n**•** As a governance admin, do you want to automatically control access to\ndata based on its provenance?\n\nAll of these capabilities rely upon the automatic collection of data lineage across\nall use cases and personas — which is why the lakehouse and data lineage are a\npowerful combination.\n\n\n-----\n\nData lineage for tables\n\nData lineage for table columns\n\n\nData Lineage for notebooks, workflows, dashboards\n\n**Built-in security:** Lineage graphs in Unity Catalog are privilege-aware and share\nthe same permission model as Unity Catalog. If users do not have access to\na table, they will not be able to explore the lineage associated with the table,\nadding an additional layer of security for privacy considerations.\n\n**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\nin near real-time, and retrieved via REST API to support integrations with our\ncatalog partners.\n\n**Getting started with data lineage in Unity Catalog**\n\nData lineage is available with Databricks Premium and Enterprise tiers for\nno additional cost. If you already are a Databricks customer, follow the data\nlineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. 
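\n\nFor the REST export mentioned above, a hedged sketch of pulling table lineage programmatically is shown below; the endpoint path and payload reflect the lineage API documentation at the time of writing and should be treated as assumptions, and the workspace URL, token and table name are placeholders:\n\n```\nimport requests\n\nhost = 'https://<workspace-url>'    # placeholder\ntoken = '<personal-access-token>'   # placeholder; prefer a secret scope in practice\n\nresp = requests.get(\n    f'{host}/api/2.0/lineage-tracking/table-lineage',\n    headers={'Authorization': f'Bearer {token}'},\n    json={'table_name': 'main.sales.orders', 'include_entity_lineage': True},\n)\nresp.raise_for_status()\n# Response lists upstream/downstream tables plus related notebooks, jobs and dashboards.\nprint(resp.json())\n```\n\n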
If you are not an existing Databricks\ncustomer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n\n\n-----\n\nSECTION 2.8\n\n**Easy Ingestion to Lakehouse With COPY INTO**\n\nby **A E M R O A M A R E , E M M A L I U , A M I T K A R A** and **J A S R A J D A N G E**\n\nJanuary 17, 2023\n\n\nA new data management architecture known as the data lakehouse emerged\nindependently across many organizations and use cases to support AI and BI\ndirectly on vast amounts of data. One of the key success factors for using the\ndata lakehouse for analytics and machine learning is the ability to quickly and\neasily ingest data of various types, including data from on-premises storage\nplatforms (data warehouses, mainframes), real-time streaming data, and bulk\ndata assets.\n\nAs data ingestion into the lakehouse is an ongoing process that feeds the\nproverbial ETL pipeline, you will need multiple options to ingest various formats,\ntypes and latency of data. For data stored in cloud object stores such as AWS\nS3, Google Cloud Storage and Azure Data Lake Storage, Databricks offers\nAuto Loader, a natively integrated feature, that allows data engineers to ingest\nmillions of files from the cloud storage continuously. In other streaming cases\n\n(e.g., IoT sensor or clickstream data), Databricks provides native connectors\nfor Apache Spark Structured Streaming to quickly ingest data from popular\nmessage queues, such as [Apache Kafka](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html?_ga=2.117268486.126296912.1643033657-734003504.1641217794) , Azure Event Hubs or AWS Kinesis at low\nlatencies. Furthermore, many customers can leverage popular ingestion tools\n\n\nthat integrate with Databricks, such as Fivetran — to easily ingest data from\nenterprise applications, databases, mainframes and more into the lakehouse.\nFinally, analysts can use the simple “COPY INTO” command to pull new data into\nthe lakehouse automatically, without the need to keep track of which files have\nalready been processed.\n\nThis blog focuses on COPY INTO, a simple yet powerful SQL command that allows\nyou to perform batch file ingestion into Delta Lake from cloud object stores.\nIt’s idempotent, which guarantees to ingest files with exactly-once semantics\nwhen executed multiple times, supporting incremental appends and simple\ntransformations. It can be run once, in an ad hoc manner, and can be scheduled\nthrough Databricks Workflows. In recent Databricks [Runtime releases](https://docs.databricks.com/release-notes/runtime/releases.html) , COPY\nINTO introduced new functionalities for data preview, validation, enhanced error\nhandling, and a new way to copy into a schemaless Delta Lake table so that users\n\ncan get started quickly, completing the end-to-end user journey to ingest from\ncloud object stores. Let’s take a look at the popular COPY INTO use cases.\n\n\n-----\n\n**1. Ingesting data for the first time**\n\n\nThe default for data validation is to parse all the data in the source directory to\nensure that there aren’t any issues, but the rows returned for preview are limited.\nOptionally, you can provide the number of rows to preview after VALIDATE.\n\nThe COPY_OPTION “mergeSchema” specifies that it is okay to evolve the schema\nof your target Delta table. Schema evolution only allows the addition of new\ncolumns, and does not support data type changes for existing columns. 
In other\nuse cases, you can omit this option if you intend to manage your table schema\nmore strictly as your data pipeline may have strict schema requirements and\nmay not want to evolve the schema at all times. However, our target Delta table\nin the example above is an empty, columnless table at the moment; therefore,\nwe have to specify the COPY_OPTION “mergeSchema” here.\n\nFigure 1: COPY INTO VALIDATE mode output\n\n\nCOPY INTO requires a table to exist as it ingests the data into a target Delta\ntable. However, you have no idea what your data looks like. You first create an\nempty Delta table.\n```\n CREATE TABLE my_example_data;\n\n```\nBefore you write out your data, you may want to preview it and ensure the\ndata looks correct. The COPY INTO Validate mode is a new feature in\nDatabricks Runtime [10.3](https://docs.databricks.com/release-notes/runtime/10.3.html) and above that allows you to preview and validate\nsource data before ingesting many files from the cloud object stores.\nThese validations include:\n\n**•** if the data can be parsed\n\n**•** the schema matches that of the target table or if the schema\nneeds to be evolved\n\n**•** all nullability and check constraints on the table are met\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nVALIDATE\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\n\n-----\n\n**2. Configuring COPY INTO**\n\n\nFigure 2 shows the validate output that the header is properly parsed.\n\nFigure 2: COPY INTO VALIDATE mode output with enabled header and inferSchema\n\n**3. Appending data to a Delta table**\n\nNow that the preview looks good, we can remove the VALIDATE keyword and\nexecute the COPY INTO command.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n'true' )\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\n\nWhen looking over the results of VALIDATE (see Figure 1), you may notice that\nyour data doesn’t look like what you want. Aren’t you glad you previewed your\ndata set first? The first thing you notice is the column names are not what is\nspecified in the CSV header. What’s worse, the header is shown as a row in your\ndata. You can configure the CSV parser by specifying FORMAT_OPTIONS.\nLet’s add those next.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nVALIDATE\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n'true' )\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\nWhen using the FORMAT OPTION, you can tell COPY INTO to infer the data types\nof the CSV file by specifying the inferSchema option; otherwise, all default\ndata types are STRINGs. On the other hand, binary file formats like AVRO and\nPARQUET do not need this option since they define their own schema. Another\n\noption, “mergeSchema” states that the schema should be inferred over a\ncomprehensive sample of CSV files rather than just one. The comprehensive list\nof format-specific options can be found in the [documentation](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/delta-copy-into#format-options) .\n\n\n-----\n\nCOPY INTO keeps track of the state of files that\nhave been ingested. 
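\n\nOne way to see this bookkeeping in action is to drive the same statement from a notebook; a minimal sketch, reusing the example table and bucket from above:\n\n```\ncopy_cmd = '''\n  COPY INTO my_example_data\n  FROM 's3://my-bucket/exampleData'\n  FILEFORMAT = CSV\n  FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true', 'mergeSchema' = 'true')\n  COPY_OPTIONS ('mergeSchema' = 'true')\n'''\n\nspark.sql(copy_cmd)   # first run loads the files\nspark.sql(copy_cmd)   # second run skips files that were already ingested\nspark.sql('SELECT count(*) AS row_count FROM my_example_data').show()\n```\n\n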
Unlike commands like INSERT\nINTO, users get idempotency with COPY INTO,\nwhich means users won’t get duplicate data in\nthe target table when running COPY INTO multiple\ntimes from the same source data.\n\nCOPY INTO can be run once, in an ad hoc manner,\nand can be scheduled with Databricks Workflows.\nWhile COPY INTO does not natively support\nlow-latency ingestion, you can trigger COPY INTO\nthrough orchestrators like Apache Airflow.\n\n\nFigure 3: Databricks workflow UI to schedule a task\n\n\n-----\n\n**4. Secure data access with COPY INTO**\n\nCOPY INTO supports secure access in several ways. In this section, we want to\nhighlight two new options you can use in both [Databricks SQL](https://dbricks.co/dbsql) and notebooks\nfrom recent releases:\n\n**Unity Catalog**\nWith the general availability of Databricks Unity Catalog, you can use COPY INTO\nto ingest data to Unity Catalog managed or external tables from any source and\nfile format supported by COPY INTO. Unity Catalog also adds new options for\nconfiguring secure access to raw data, allowing you to use Unity Catalog external\nlocations or storage credentials to access data in cloud object storage. Learn\nmore about how to use [COPY INTO with Unity Catalog](https://docs.databricks.com/ingestion/copy-into/unity-catalog.html#use-copy-into-to-load-data-with-unity-catalog) .\n\n**Temporary Credentials**\nWhat if you have not configured Unity Catalog or an instance profile? How about\ndata from a trusted third party bucket? Here is a convenient COPY INTO feature\nthat allows you to [ingest data with inline temporary credentials](https://docs.databricks.com/ingestion/copy-into/temporary-credentials.html) to handle the ad\nhoc bulk ingestion use case.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath' WITH (\nCREDENTIAL (AWS_ACCESS_KEY `=` '...' , AWS_SECRET_KEY `=` '...' , AWS_SESSION_TOKEN `=` '...' )\n)\nFILEFORMAT `=` CSV\n\n\n**5. Filtering files for ingestion**\n\nWhat about ingesting a subset of files where the filenames match a pattern? You\ncan apply a glob pattern that identifies the files to load from the\nsource directory. For example, let’s filter and ingest files which contain the word\n`raw_data` in the filename below.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data*.csv'\nFORMAT_OPTIONS ( 'header' `=` 'true' )\n\n**6. Ingest files in a time period**\n\nIn data engineering, it is frequently necessary to ingest files that have been\nmodified before or after a specific timestamp. Data between two timestamps\nmay also be of interest. The ‘modifiedAfter’ and ‘modifiedBefore’ format options\noffered by COPY INTO allow users to ingest data from a chosen time window into\na Delta table.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data_*.csv'\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'modifiedAfter' `=` '2022-09-12T10:53:11.000+0000' )\n\n\n-----\n\n**7. Correcting data with the force option**\n\nBecause COPY INTO is by default idempotent, running the same query against\nthe same source files more than once has no effect on the destination table\nafter the initial execution. However, in real-world circumstances, source data\nfiles in cloud object storage may be altered for correction at a later time, and\nthose changes still need to be propagated to the target table. 
In such a case, it is possible to first\nerase the data from the target table before ingesting the more recent data files\nfrom the source. For this operation you only need to set the copy option ‘force’\nto ‘true’.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data_2022*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' )\nCOPY_OPTIONS ( 'force' `=` 'true' )\n\n\n**8. Applying simple transformations**\n\nWhat if you want to rename columns? Or the source data has changed and a\nprevious column has been renamed to something else? You don’t want to ingest\nthat data as two separate columns, but as a single column. We can leverage the\nSELECT statement in COPY INTO perform simple transformations.\n\nCOPY INTO demo.my_example_data\nFROM ( SELECT concat(first_name, \" \", last_name) as full_name,\n`*` EXCEPT (first_name, last_name)\nFROM 's3://my-bucket/exampleDataPath'\n)\nFILEFORMAT `=` CSV\nPATTERN `=` '*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' )\nCOPY_OPTIONS ( 'force' `=` 'true' )\n\n**9. Error handling and observability with COPY INTO**\n\n**Error handling:**\nHow about ingesting data with file corruption issues? Common examples of file\ncorruption are:\n\n**•** Files with an incorrect file format\n\n**•** Failure to decompress\n\n**•** Unreadable files (e.g., invalid Parquet)\n\n\n-----\n\nCOPY INTO’s format option ignoreCorruptFiles helps skip those files while\nprocessing. The result of the COPY INTO command returns the number of files\nskipped in the num_skipped_corrupt_files column. In addition, these corrupt\nfiles aren’t tracked by the ingestion state in COPY INTO, therefore they can be\nreloaded in a subsequent execution once the corruption is fixed. This option is\navailable in Databricks [Runtime 11.0+](https://docs.databricks.com/release-notes/runtime/11.0.html) .\n\nYou can see which files have been detected as corrupt by running COPY INTO in\nVALIDATE mode.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nVALIDATE ALL\nFORMAT_OPTIONS( 'ignoreCorruptFiles' `=` 'true' )\n\n**Observability:**\nIn Databricks Runtime 10.5, [file metadata column](https://docs.databricks.com/ingestion/file-metadata-column.html) was introduced to provide\ninput file metadata information, which allows users to monitor and get key\nproperties of the ingested files like path, name, size and modification time, by\nquerying a hidden STRUCT column called _metadata. To include this information\nin the destination, you must explicitly reference the _metadata column in your\nquery in COPY INTO.\n\nCOPY INTO my_example_data\nFROM (\nSELECT `*` , _metadata source_metadata FROM 's3://my-bucket/\nexampleDataPath'\n)\nFILEFORMAT `=` CSV\n\n\n**How does it compare to Auto Loader?**\n\nCOPY INTO is a simple and powerful command to use when your source\ndirectory contains a small number of files (i.e., thousands of files or less), and if\nyou prefer SQL. In addition, COPY INTO can be used over JDBC to push data into\nDelta Lake at your convenience, a common pattern by many ingestion partners.\nTo ingest a larger number of files both in streaming and batch we recommend\nusing [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) . 
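\n\nFor comparison with the COPY INTO statements above, a minimal Auto Loader sketch over the same hypothetical bucket looks like this (the schema and checkpoint locations are placeholders you would choose yourself):\n\n```\n(spark.readStream\n    .format('cloudFiles')\n    .option('cloudFiles.format', 'csv')\n    .option('cloudFiles.schemaLocation', 's3://my-bucket/_schemas/exampleData')\n    .option('header', 'true')\n    .load('s3://my-bucket/exampleDataPath')\n    .writeStream\n    .option('checkpointLocation', 's3://my-bucket/_checkpoints/exampleData')\n    .trigger(availableNow=True)   # incremental batch-style run; drop for continuous streaming\n    .toTable('my_example_data'))\n```\n\n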
In addition, for a modern data pipeline based on [medallion](https://www.databricks.com/glossary/medallion-architecture)\n[architecture](https://www.databricks.com/glossary/medallion-architecture) , we recommend using Auto Loader in [Delta Live Tables pipelines](https://docs.databricks.com/ingestion/auto-loader/dlt.html) ,\nleveraging advanced capabilities of automatic error handling, quality control,\ndata lineage and setting [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html) in a declarative approach.\n\n**How to get started?**\n\nTo get started, you can go to **[Databricks SQL](https://dbricks.co/dbsql)** query editor, update and run the\nexample SQL commands to ingest from your cloud object stores. Check out\nthe options in No. 4 to establish secure access to your data for querying it in\nDatabricks SQL. To get familiar with COPY INTO in Databricks SQL, you can also\nfollow this [quickstart tutorial.](https://docs.databricks.com/ingestion/copy-into/tutorial-dbsql.html)\n\nAs an alternative, you can use this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/db-385-demo_copy_into.html) in Data Science & Engineering and\nMachine Learning workspaces to learn most of the COPY INTO features in this\nblog, where source data and target Delta tables are generated in DBFS.\n\nMore tutorials for COPY INTO can be found [here](https://docs.databricks.com/ingestion/copy-into/index.html) .\n\n\n-----\n\nSECTION 2.9 \u0007\n\n**Simplifying Change Data Capture With Databricks Delta Live Tables**\n\nby **M O J G A N M A Z O U C H I**\n\nApril 25, 2022\n\n\nThis guide will demonstrate how you can leverage change data capture in Delta\nLive Tables pipelines to identify new records and capture changes made to the\ndata set in your data lake. Delta Live Tables pipelines enable you to develop\nscalable, reliable and low latency data pipelines, while performing change data\ncapturee in your data lake with minimum required computation resources and\nseamless out-of-order data handling.\n\n**Note:** We recommend following [Getting Started with Delta Live Tables](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables)\nwhich explains creating scalable and reliable pipelines using Delta Live Tables\n(DLT) and its declarative ETL definitions.\n\n**Background on change data capture**\n\nChange data capture ( [CDC](https://en.wikipedia.org/wiki/Change_data_capture) ) is a process that identifies and captures incremental\nchanges (data deletes, inserts and updates) in databases, like tracking customer,\norder or product status for near-real-time data applications. CDC provides realtime data evolution by processing data in a continuous incremental fashion as\nnew events occur.\n\n\nSince [over 80% of organizations plan on implementing multicloud strategies](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/)\n[by 2025](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/) , choosing the right approach for your business that allows seamless\nreal-time centralization of all data changes in your ETL pipeline across multiple\nenvironments is critical.\n\nBy capturing CDC events, Databricks users can re-materialize the source table\nas Delta Table in Lakehouse and run their analysis on top of it, while being able\nto combine data with external systems. 
The MERGE INTO command in Delta Lake\non Databricks enables customers to efficiently upsert and delete records in\ntheir data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) .\nThis is a common use case that we observe many of Databricks customers are\nleveraging Delta Lakes to perform, and keeping their data lakes up to date with\nreal-time business data.\n\nWhile Delta Lake provides a complete solution for real-time CDC synchronization\nin a data lake, we are now excited to announce the change data capture feature\nin Delta Live Tables that makes your architecture even simpler, more efficient and\nscalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.\n\nEarlier CDC solutions with Delta tables were using MERGE INTO operation, which\nrequires manually ordering the data to avoid failure when multiple rows of the\nsource data set match while attempting to update the same rows of the target\n\n\n-----\n\nDelta table. To handle the out-of-order data, there was an extra step required to\npreprocess the source table using a foreachBatch implementation to eliminate\nthe possibility of multiple matches, retaining only the latest change for each\nkey (see the [change data capture example](https://www.databricks.com/blog/2022/04/25/simplifying-change-data-capture-with-databricks-delta-live-tables.html#) ). The new APPLY CHANGES INTO\noperation in DLT pipelines automatically and seamlessly handles out-of-order\ndata without any need for data engineering manual intervention.\n\n**CDC with Databricks Delta Live Tables**\n\nIn this blog, we will demonstrate how to use the APPLY CHANGES INTO command\nin Delta Live Tables pipelines for a common CDC use case where the CDC data\nis coming from an external system. A variety of CDC tools are available such\nas Debezium, Fivetran, Qlik Replicate, Talend, and StreamSets. While specific\nimplementations differ, these tools generally capture and record the history\nof data changes in logs; downstream applications consume these CDC logs. In\nour example, data is landed in cloud object storage from a CDC tool such as\nDebezium, Fivetran, etc.\n\nWe have data from various CDC tools landing in a cloud object storage or a\nmessage queue like Apache Kafka. Typically we see CDC used in an ingestion\nto what we refer as the medallion architecture. A medallion architecture is a\ndata design pattern used to logically organize data in a Lakehouse, with the\ngoal of incrementally and progressively improving the structure and quality of\ndata as it flows through each layer of the architecture. Delta Live Tables allows\nyou to seamlessly apply changes from CDC feeds to tables in your Lakehouse;\ncombining this functionality with the medallion architecture allows for\n\n\nincremental changes to easily flow through analytical workloads at scale. Using\nCDC together with the medallion architecture provides multiple benefits to users\nsince only changed or added data needs to be processed. 
**CDC with Databricks Delta Live Tables**\n\nIn this blog, we will demonstrate how to use the APPLY CHANGES INTO command\nin Delta Live Tables pipelines for a common CDC use case where the CDC data\nis coming from an external system. A variety of CDC tools are available such\nas Debezium, Fivetran, Qlik Replicate, Talend, and StreamSets. While specific\nimplementations differ, these tools generally capture and record the history\nof data changes in logs; downstream applications consume these CDC logs. In\nour example, data is landed in cloud object storage from a CDC tool such as\nDebezium, Fivetran, etc.\n\nWe have data from various CDC tools landing in a cloud object storage or a\nmessage queue like Apache Kafka. Typically we see CDC used for ingestion\ninto what we refer to as the medallion architecture. A medallion architecture is a\ndata design pattern used to logically organize data in a Lakehouse, with the\ngoal of incrementally and progressively improving the structure and quality of\ndata as it flows through each layer of the architecture. Delta Live Tables allows\nyou to seamlessly apply changes from CDC feeds to tables in your Lakehouse;\ncombining this functionality with the medallion architecture allows for\n\n\nincremental changes to easily flow through analytical workloads at scale. Using\nCDC together with the medallion architecture provides multiple benefits to users\nsince only changed or added data needs to be processed. Thus, it enables users\nto cost-effectively keep Gold tables up-to-date with the latest business data.\n\n**NOTE:** The example here applies to both the SQL and Python versions of CDC\nand demonstrates one specific way to use the operations; to evaluate variations,\nplease see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n\n**Prerequisites**\n\nTo get the most out of this guide, you should have a basic familiarity with:\n\n**•** SQL or Python\n\n**•** Delta Live Tables\n\n**•** Developing ETL pipelines and/or working with Big Data systems\n\n**•** Databricks interactive notebooks and clusters\n\n**•** You must have access to a Databricks Workspace with permissions\nto create new clusters, run jobs, and save data to a location on\nexternal cloud object storage or [DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)\n\n**•** For the pipeline we are creating in this blog, the “Advanced” product\nedition, which supports enforcement of data quality constraints,\nneeds to be selected\n\n\n-----\n\n**The data set**\n\nHere we are consuming realistic-looking CDC data from an external database. In\nthis pipeline, we will use the [Faker](https://github.com/joke2k/faker) library to generate the data set that a CDC\ntool like Debezium can produce and bring into cloud storage for the initial ingest\nin Databricks. Using [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) we incrementally load the messages from cloud\nobject storage, and store them in the Bronze table, which holds the raw messages.\nThe Bronze tables are intended for data ingestion, which enables quick access to a\nsingle source of truth. Next we perform APPLY CHANGES INTO from the cleaned\nBronze layer table to propagate the updates downstream to the Silver table. As\ndata flows to Silver tables, generally it becomes more refined and optimized\n(“just-enough”) to provide an enterprise a view of all its key business entities.\nSee the diagram below.\n\n\nThis blog focuses on a simple example that requires a JSON message with\nfour fields (the customer’s name, email, address and id) along with two additional\nfields: operation (which stores the operation code: DELETE, APPEND, UPDATE or\nCREATE) and operation_date (which stores the date and timestamp at which the\nrecord arrived for each operation) to describe the changed data.\n\nTo generate a sample data set with the above fields, we are using a Python\npackage that generates fake data, Faker. You can find the notebook related to this\ndata generation section [here](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/1-cdc-data-generator.html) . In this notebook we provide the name of the storage\nlocation where the generated data will be written. We are using the DBFS functionality of\nDatabricks; see the [DBFS documentation](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) to learn more about how it works. Then,\nwe use a PySpark user-defined function to generate the synthetic data set for\neach field, and write the data back to the defined storage location, which we will\nrefer to in other notebooks for accessing the synthetic data set.\n\n
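As a rough sketch of the kind of generator that notebook implements (illustrative only: the actual notebook uses a PySpark user-defined function, and the field names and landing path below simply mirror the example fields above and the pipeline configuration shown in the next section):\n\n```\nimport random\nimport uuid\nfrom datetime import datetime, timezone\nfrom faker import Faker\n\nfake = Faker()\n\ndef fake_cdc_record():\n    # One synthetic CDC message with the fields described above\n    return {\n        \"id\": str(uuid.uuid4()),\n        \"firstname\": fake.first_name(),\n        \"lastname\": fake.last_name(),\n        \"email\": fake.email(),\n        \"address\": fake.street_address(),\n        \"operation\": random.choice([\"APPEND\", \"UPDATE\", \"DELETE\"]),\n        \"operation_date\": datetime.now(timezone.utc).isoformat(),\n    }\n\n# Land a small batch of raw JSON messages where Auto Loader will pick them up\ndf = spark.createDataFrame([fake_cdc_record() for _ in range(1000)])\ndf.write.mode(\"append\").json(\"/tmp/demo/cdc_raw/customers\")\n```\n\n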
**Ingesting the raw data set using Auto Loader**\n\nAccording to the medallion architecture paradigm, the Bronze layer holds the\nrawest data. At this stage we can incrementally read new data using\nAuto Loader from a location in cloud storage. Here we are adding the path to our\ngenerated data set to the configuration section under pipeline settings, which\nallows us to load the source path as a variable. So now our configuration under\npipeline settings looks like below:\n\n\"configuration\" : {\n\"source\" : \"/tmp/demo/cdc_raw\"\n}\n\n\n-----\n\nThen we load this configuration property in our notebooks.\n\nLet’s take a look at the Bronze table we will ingest: a. in SQL, and b. using Python.\n\n**A . S Q L**\n\nSET spark.source;\nCREATE STREAMING LIVE TABLE customer_bronze\n(\naddress string ,\nemail string ,\nid string ,\nfirstname string ,\nlastname string ,\noperation string ,\noperation_date string ,\n_rescued_data string\n)\nTBLPROPERTIES ( \"quality\" = \"bronze\" )\nCOMMENT \"New customer data incrementally ingested from cloud object storage landing zone\"\nAS\nSELECT *\nFROM cloud_files( \"${source}/customers\" , \"json\" , map( \"cloudFiles.inferColumnTypes\" , \"true\" ));\n\n\n**B . P Y T H O N**\n```\nimport dlt\nfrom pyspark.sql.functions import *\nfrom pyspark.sql.types import *\n\nsource = spark.conf.get(\"source\")\n\n@dlt.table(name=\"customer_bronze\",\n           comment=\"New customer data incrementally ingested from cloud object storage landing zone\",\n           table_properties={\n             \"quality\": \"bronze\"\n           })\ndef customer_bronze():\n    return (\n        spark.readStream.format(\"cloudFiles\")\n            .option(\"cloudFiles.format\", \"json\")\n            .option(\"cloudFiles.inferColumnTypes\", \"true\")\n            .load(f\"{source}/customers\")\n    )\n```\n\nThe above statements use Auto Loader to create a streaming live table\ncalled customer_bronze from JSON files. When using Auto Loader in Delta Live\nTables, you do not need to provide any location for schema or checkpoint, as\nthose locations will be managed automatically by your DLT pipeline.\n\nAuto Loader provides a Structured Streaming source called cloud_files in\nSQL and cloudFiles in Python, which takes a cloud storage path and format as\nparameters.\n\nTo reduce compute costs, we recommend running the DLT pipeline in\nTriggered mode as a micro-batch, assuming you do not have very low latency\nrequirements.\n\n
\n-----\n\n**Expectations and high-quality data**\n\nIn the next step, to create a high-quality, diverse and accessible data set,\nwe impose quality-check expectation criteria using constraints. Currently,\na constraint can either retain, drop or fail on violating rows. For more detail see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html) . All\nconstraints are logged to enable streamlined quality monitoring.\n\n**A . S Q L**\n\nCREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(\nCONSTRAINT valid_id EXPECT (id IS NOT NULL ) ON VIOLATION DROP ROW ,\nCONSTRAINT valid_address EXPECT (address IS NOT NULL ),\nCONSTRAINT valid_operation EXPECT (operation IS NOT NULL ) ON VIOLATION DROP ROW\n)\nTBLPROPERTIES (\"quality\" = \"silver\")\nCOMMENT \"Cleansed bronze customer view (i.e. what will become Silver)\"\nAS SELECT *\nFROM STREAM(LIVE.customer_bronze);\n\n**B . P Y T H O N**\n```\n@dlt.view(name=\"customer_bronze_clean_v\",\n          comment=\"Cleansed bronze customer view (i.e. what will become Silver)\")\n@dlt.expect_or_drop(\"valid_id\", \"id IS NOT NULL\")\n@dlt.expect(\"valid_address\", \"address IS NOT NULL\")\n@dlt.expect_or_drop(\"valid_operation\", \"operation IS NOT NULL\")\ndef customer_bronze_clean_v():\n    return (\n        dlt.read_stream(\"customer_bronze\")\n            .select(\"address\", \"email\", \"id\", \"firstname\", \"lastname\",\n                    \"operation\", \"operation_date\", \"_rescued_data\")\n    )\n```\n\n\n-----\n\n**Using the APPLY CHANGES INTO statement to propagate changes to the downstream target table**\n\nPrior to executing the Apply Changes Into query, we must ensure that the target\nstreaming table that will hold the most up-to-date data exists. If it\ndoes not exist, we need to create one. The cells below are examples of creating a\ntarget streaming table. Note that at the time of publishing this blog, the target\nstreaming table creation statement is required along with the Apply Changes\nInto query, and both need to be present in the pipeline; otherwise your table\ncreation query will fail.\n\n**A . S Q L**\n\nCREATE STREAMING LIVE TABLE customer_silver\nTBLPROPERTIES (\"quality\" = \"silver\")\nCOMMENT \"Clean, merged customers\";\n\n**B . P Y T H O N**\n```\ndlt.create_target_table(name=\"customer_silver\",\n                        comment=\"Clean, merged customers\",\n                        table_properties={\n                          \"quality\": \"silver\"\n                        })\n```\n\n\n-----\n\nNow that we have a target streaming table, we can propagate changes to the\ndownstream target table using the Apply Changes Into query. While a CDC feed\ncomes with INSERT, UPDATE and DELETE events, the DLT default behavior is to apply\nINSERT and UPDATE events from any record in the source data set matching\non primary keys, and sequenced by a field which identifies the order of events.\nMore specifically, it updates any row in the existing target table that matches\nthe primary key(s) or inserts a new row when a matching record does not exist\nin the target streaming table. We can use APPLY AS DELETE WHEN in SQL, or its\nequivalent apply_as_deletes argument in Python, to handle DELETE events.\n\nIn this example we used \"id\" as the primary key, which uniquely identifies the\ncustomers and allows CDC events to apply to those identified customer records\nin the target streaming table. Since \"operation_date\" keeps the logical order of\nCDC events in the source data set, we use \"SEQUENCE BY operation_date\" in\nSQL, or its equivalent \"sequence_by = col(\"operation_date\")\" in Python, to handle\nchange events that arrive out of order. Keep in mind that the field value we use\nwith SEQUENCE BY (or sequence_by) should be unique among all updates to\nthe same key. In most cases, the sequence by column will be a column with\ntimestamp information.\n\nFinally we used \"COLUMNS * EXCEPT (operation, operation_date, _rescued_data)\"\nin SQL, or its equivalent except_column_list = [\"operation\", \"operation_date\",\n\"_rescued_data\"] in Python, to exclude the three columns \"operation\",\n\"operation_date\" and \"_rescued_data\" from the target streaming table. By default, all\nthe columns are included in the target streaming table when we do not specify\nthe \"COLUMNS\" clause.\n\n\n**A . S Q L**\n\nAPPLY CHANGES INTO LIVE.customer_silver\nFROM stream(LIVE.customer_bronze_clean_v)\nKEYS (id)\nAPPLY AS DELETE WHEN operation = \"DELETE\"\nSEQUENCE BY operation_date\nCOLUMNS * EXCEPT (operation, operation_date, _rescued_data);\n\n
**B . P Y T H O N**\n```\ndlt.apply_changes(\n  target = \"customer_silver\",\n  source = \"customer_bronze_clean_v\",\n  keys = [\"id\"],\n  sequence_by = col(\"operation_date\"),\n  apply_as_deletes = expr(\"operation = 'DELETE'\"),\n  except_column_list = [\"operation\", \"operation_date\", \"_rescued_data\"])\n```\n\nTo check out the full list of available clauses see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#requirements) .\n\nPlease note that, at the time of publishing this blog, a table that reads from the\ntarget of an APPLY CHANGES INTO query or apply_changes function must be a\nlive table, and cannot be a streaming live table.\n\n[SQL](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-retail-dlt-cdc-sql.html) and [Python](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-Retail_DLT_CDC_Python.html) notebooks are available for reference for this section. Now that\nwe have all the cells ready, let’s create a pipeline to ingest data from cloud object\nstorage. Open Jobs in a new tab or window in your workspace, and select “Delta\nLive Tables.”\n\n\n-----\n\nThe pipeline associated with this blog has the following DLT pipeline settings:\n\n{\n\"clusters\" : [\n{\n\"label\" : \"default\" ,\n\"num_workers\" : 1\n}\n],\n\"development\" : true ,\n\"continuous\" : false ,\n\"edition\" : \"advanced\" ,\n\"photon\" : false ,\n\"libraries\" : [\n{\n\"notebook\" : {\n\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/notebooks/1-CDC_DataGenerator\"\n}\n},\n{\n\"notebook\" : {\n\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/notebooks/2-Retail_DLT_CDC_sql\"\n}\n}\n],\n\"name\" : \"CDC_blog\" ,\n\"storage\" : \"dbfs:/home/mydir/myDB/dlt_storage\" ,\n\"configuration\" : {\n\"source\" : \"/tmp/demo/cdc_raw\" ,\n\"pipelines.applyChangesPreviewEnabled\" : \"true\"\n},\n\"target\" : \"my_database\"\n}\n\n\n1. Select “Create Pipeline” to create a new pipeline\n\n2. Specify a name such as “Retail CDC Pipeline”\n\n3. Specify the Notebook Paths that you already created earlier, one for the\ngenerated data set using the Faker package, and another path for the ingestion\nof the generated data in DLT. The second notebook path can refer to the\nnotebook written in SQL or Python, depending on your language of choice.\n\n4. To access the data generated in the first notebook, add the data set path in\nthe configuration. Here we stored data in “/tmp/demo/cdc_raw/customers”, so\nwe set “source” to “/tmp/demo/cdc_raw/” to reference “source/customers” in\nour second notebook.\n\n5. Specify the Target (which is optional and refers to the target database),\nwhere you can query the resulting tables from your pipeline\n\n6. Specify the Storage Location in your object storage (which is optional), to\naccess your DLT-produced data sets and metadata logs for your pipeline\n\n7. Set Pipeline Mode to Triggered. In Triggered mode, the DLT pipeline will consume\nnew data in the source all at once, and once the processing is done it will\nterminate the compute resource automatically. You can toggle between\nTriggered and Continuous modes when editing your pipeline settings. Setting\n“continuous”: false in the JSON is equivalent to setting the pipeline to\nTriggered mode.\n\n8. For this workload you can disable the autoscaling under Autopilot Options,\nand use only one worker cluster. For production workloads, we recommend\nenabling autoscaling and setting the maximum number of workers needed\nfor cluster size.\n\n9. 
Select “Start”\n\n10. Your pipeline is created and running now!\n\n\n-----\n\nYou can check out our previous deep dive on the topic [here](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) . Try this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/3-retail-dlt-cdc-monitoring.html)\nto see pipeline observability and data quality monitoring on the example DLT\npipeline associated with this blog.\n\n**Conclusion**\n\nIn this blog, we showed how we made it seamless for users to efficiently\nimplement change data capture (CDC) into their lakehouse platform with Delta\nLive Tables (DLT). DLT provides built-in quality controls with deep visibility into\npipeline operations, observing pipeline lineage, monitoring schema, and quality\nchecks at each step in the pipeline. DLT supports automatic error handling and\nbest in class auto-scaling capability for streaming workloads, which enables\nusers to have quality data with optimum resources required for their workload.\n\nData engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. This new capability lets\nyour ETL pipelines easily identify changes and apply those changes across tens\nof thousands of tables with low-latency support.\n\n**Ready to get started and try out CDC in Delta Live Tables for yourself?**\nPlease watch this [webinar](https://www.databricks.com/p/webinar/tackle-data-transformation) to learn how Delta Live Tables simplifies the\ncomplexity of data transformation and ETL, and see our [Change data capture](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE)\n[with Delta Live Tables](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE) document, official [github](https://github.com/databricks/delta-live-tables-notebooks) and follow the steps in this\n[video](https://vimeo.com/700994477) to create your pipeline!\n\n\n**DLT pipeline lineage observability and data quality**\n**monitoring**\n\nAll DLT pipeline logs are stored in the pipeline’s storage location. You can specify\nyour storage location only when you are creating your pipeline. 
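\n\nBecause the event log itself is persisted as a Delta table under that storage location, you can query it directly. A minimal sketch, using the storage path from the pipeline settings shown earlier (the system/events layout and column names are assumptions that may differ between releases):\n\n```\n# Read the DLT event log from the pipeline's storage location\nevents = spark.read.format(\"delta\").load(\n    \"dbfs:/home/mydir/myDB/dlt_storage/system/events\"\n)\ndisplay(events.select(\"timestamp\", \"event_type\", \"details\")\n              .orderBy(\"timestamp\", ascending=False))\n```\n\n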
Note that once\nthe pipeline is created you can no longer modify storage location.\n\n\n-----\n\nSECTION 2.10 \u0007\n\n**Best Practices for Cross-Government Data Sharing**\n\nby **M I L O S C O L I C , P R I T E S H P AT E L , R O B E R T W H I F F I N , R I C H A R D J A M E S W I L S O N ,**\n\n**M A R C E L L F E R E N C Z** and **E D W A R D K E L LY**\n\nFebruary 21, 2023\n\n\nGovernment data exchange is the practice of sharing data between different\ngovernment agencies and often partners in commercial sectors. Government\ncan share data for various reasons, such as to improve government operations’\nefficiency, provide better services to the public, or support research and policymaking. In addition, data exchange in the public sector can involve sharing with the\nprivate sector or receiving data from the private sector. The considerations span\nmultiple jurisdictions and over almost all industries. In this blog, we will address the\nneeds disclosed as part of national data strategies and how modern technologies,\nparticularly Delta Sharing, Unity Catalog, and clean rooms, can help you design,\nimplement and manage a future-proof and sustainable data ecosystem.\n\n**Data sharing and public sector**\n\n“The miracle is this: the more we share the more we have.” — [Leonard Nimoy.](https://en.wikipedia.org/wiki/Leonard_Nimoy)\n\nProbably the quote about sharing that applies the most profoundly to the\ntopic of data sharing. To the extent that the purpose of sharing the data is to\ncreate new information, new insights, and new data. The importance of data\nsharing is even more amplified in the government context, where federation\n\n\nbetween departments allows for increased focus. Still, the very same federation\nintroduces challenges around data completeness, data quality, data access,\nsecurity and control, [FAIR](https://en.wikipedia.org/wiki/FAIR_data) -ness of data, etc. These challenges are far from trivial\nand require a strategic, multifaceted approach to be addressed appropriately.\nTechnology, people, process, legal frameworks, etc., require dedicated\nconsideration when designing a robust data sharing ecosystem.\n\n[The National Data Strategy](https://www.gov.uk/government/publications/uk-national-data-strategy/national-data-strategy) (NDS) by the UK government outlines five actionable\nmissions through which we can materialize the value of data for the citizen and\nsociety-wide benefits.\n\n\n-----\n\nIt comes as no surprise that each and every one of the missions is strongly\nrelated to the concept of data sharing, or more broadly, data access both within\nand outside of government departments:\n\n**1. Unlocking the value of the data across the economy** — Mission 1 of the\nNDS aims to assert government and the regulators as enablers of the value\nextraction from data through the adoption of best practices. The UK data\neconomy was estimated to be near [£125 billion in 2021](https://www.gov.uk/government/publications/uks-digital-strategy/uk-digital-strategy) with an upwards trend.\nIn this context, it is essential to understand that the government-collected\nand provided open data can be crucial for addressing many of the challenges\nacross all industries.\n\nFor example, insurance providers can better assess the risk of insuring\nproperties by ingesting and integrating [Flood areas](https://environment.data.gov.uk/flood-monitoring/doc/reference#flood-areas) provided by [DEFRA](https://www.gov.uk/government/organisations/department-for-environment-food-rural-affairs) . 
On\nthe other hand, capital market investors could better understand the risk of\ntheir investments by ingesting and integrating the [Inflation Rate Index](https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l55o/mm23) by [ONS](https://www.ons.gov.uk/) .\nReversely, it is crucial for regulators to have well-defined data access and\ndata sharing patterns for conducting their regulatory activities. This clarity\ntruly enables the economic actors that interact with government data.\n\n\n**2. Securing a pro-growth and trusted data regime** — The key aspect of\nMission 2 is data trust, or more broadly, adherence to data quality norms.\nData quality considerations become further amplified for data sharing and\ndata exchange use cases where we are considering the whole ecosystem\nat once, and quality implications transcend the boundaries of our own\nplatform. This is precisely why we have to adopt “data sustainability.” What\nwe mean by sustainable data products are data products that harness the\nexisting sources over reinvention of the same/similar assets, accumulation of\nunnecessary data (data pollutants) and that anticipate future uses.\n\nUngoverned and unbounded data sharing could negatively impact data\nquality and hinder the growth and value of data. The quality of how the data\nis shared should be a key consideration of data quality frameworks. For\nthis reason, we require a solid set of standards and best practices for data\nsharing with governance and quality assurance built into the process and\ntechnologies. Only this way can we ensure the sustainability of our data and\nsecure a pro-growth trusted data regime.\n\n\n-----\n\n**3. Transforming government’s use of data to drive efficiency and improve**\n**public services** — “By 2025 data assets are organized and supported as\nproducts, regardless of whether they’re used by internal teams or external\ncustomers… Data products continuously evolve in an agile manner to meet\nthe needs of consumers… these products provide data solutions that can\nmore easily and repeatedly be used to meet various business challenges and\nreduce the time and cost of delivering new AI-driven capabilities.” —\n[The data-driven enterprise of 2025](https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-data-driven-enterprise-of-2025) by McKinsey. AI and ML can be powerful\nenablers of digital transformation for both the public and private sectors.\n\nAI, ML, reports, and dashboards are just a few examples of data products\nand services that extract value from data. The quality of these solutions is\ndirectly reflected in the quality of data used for building them and our ability\nto access and leverage available data assets both internally and externally.\nWhilst there is a vast amount of data available for us to build new intelligent\nsolutions for driving efficiency for better processes, better decision-making,\nand better policies — there are numerous barriers that can trap the data,\nsuch as legacy systems, data silos, fragmented standards, proprietary\nformats, etc. Modeling data solutions as data products and standardizing\nthem to a unified format allows us to abstract such barriers and truly\nleverage the data ecosystem.\n\n\n**4. 
Ensuring the security and resilience of the infrastructure on which**\n**data relies** — Reflecting on the vision of the year 2025 — this isn’t that far\nfrom now and even in a not so distant future, we will be required to rethink\nour approach to data, more specifically — what is our digital supply chain\ninfrastructure/data sharing infrastructure? Data and data assets are products\nand should be managed as products. If data is a product, we need a coherent\nand unified way of providing those products.\n\nIf data is to be used across industries and across both private and public\nsectors, we need an open protocol that drives adoption and habit generation.\nTo drive adoption, the technologies we use must be resilient, robust, trusted\nand usable by/for all. Vendor lock-in, platform lock-in or cloud lock-in are all\nboundaries to achieving this vision.\n\n**5. Championing the international flow of data** — Data exchange between\njurisdictions and across governments will likely be one of the most\ntransformative applications of data at scale. Some of the world’s toughest\nchallenges depend on the efficient exchange of data between governments\n— prevention of criminal activities, counterterrorism activities, net-zero\nemission goals, international trade, the list goes on and on. Some steps in\nthis direction are already materializing: the U.S. federal government and UK\ngovernment have agreed on data exchange for countering serious crime\nactivities. This is a true example of championing international flow data and\nusing data for good. It is imperative that for these use cases, we approach\ndata sharing from a security-first angle. Data sharing standards and protocols\nneed to adhere to security and privacy best practices.\n\n\n-----\n\nWhile originally built with a focus on the UK government and how to better\nintegrate data as a key asset of a modern government, these concepts apply in\na much wider global public sector context. In the same spirit, the U.S. Federal\nGovernment proposed the [Federal Data Strategy](https://strategy.data.gov/overview/) as a collection of principles,\npractices, action steps and timeline through which government can leverage\nthe full value of Federal data for mission, service and the public good.\n\nThe principles are grouped into three primary topics:\n\n**•** **Ethical governance** — Within the domain of ethics, the sharing of data\nis a fundamental tool for promoting transparency, accountability and\nexplainability of decision-making. It is practically impossible to uphold\nethics without some form of audit conducted by an independent party.\nData (and metadata) exchange is a critical enabler for continuous robust\nprocesses that ensure we are using the data for good and we are using data\nwe can trust.\n\n\n\n**•** **Conscious design** — These principles are strongly aligned with the idea of\ndata sustainability. The guidelines promote forward thinking around usability\nand interoperability of the data and user-centric design principles of\nsustainable data products.\n\n**•** **Learning culture** — Data sharing, or alternatively knowledge sharing, has\nan important role in building a scalable learning ecosystem and learning\nculture. Data is front and center of knowledge synthesis, and from a\nscientific angle, data proves factual knowledge. 
Another critical component\nof knowledge is the “Why?” and data is what we need to address the\n“Why?” component of any decisions we make, which policy to enforce, who\nto sanction, who to support with grants, how to improve the efficiency of\ngovernment services, how to better serve citizens and society.\n\nIn contrast to afore discussed qualitative analysis of the value of data sharing\nacross governments, the European Commission forecasts the economic value\nof the European data economy will [exceed €800 billion by 2027](https://commission.europa.eu/strategy-and-policy/priorities-2019-2024/europe-fit-digital-age/european-data-strategy_en) — roughly the\nsame size as the [Dutch economy in 2021](https://ec.europa.eu/eurostat/databrowser/view/NAMA_10_GDP/default/table?lang=en&category=na10.nama10.nama_10_ma) ! Furthermore, they predict more than 10\nmillion data professionals in Europe alone. The technology and infrastructure to\nsupport the data society have to be accessible to all, interoperable, extensible,\nflexible and open. Imagine a world in which you’d need a different truck to\ntransport products between different warehouses because each road requires a\ndifferent set of tires — the whole supply chain would collapse. When it comes to\ndata, we often experience the “one set of tires for one road” paradox. Rest APIs\nand data exchange protocols have been proposed in the past but have failed\nto address the need for simplicity, ease of use and cost of scaling up with the\nnumber of data products.\n\n\n-----\n\n**Delta Sharing — the new data**\n**highway**\n\nDelta Sharing provides an open protocol for\nsecure data sharing to any computing platform.\nThe protocol is based on Delta data format and is\nagnostic concerning the cloud of choice.\n\nDelta is an open source data format that avoids\nvendor, platform and cloud lock-in, thus fully\nadhering to the principles of data sustainability,\nconscious design of the U.S. Federal Data Strategy\nand mission 4 of the UK National Data Strategy.\nDelta provides a governance layer on top of the\nParquet data format. Furthermore, it provides many\nperformance optimizations not available in Parquet\nout of the box. The openness of the data format\nis a critical consideration. It is the main factor for\ndriving the habit generation and adoption of best\npractices and standards.\n\n\n-----\n\nDelta Sharing is a protocol based on a lean set of REST APIs to manage sharing,\npermissions and access to any data asset stored in Delta or Parquet formats.\nThe protocol defines two main actors, the data provider (data supplier, data\nowner) and the data recipient (data consumer). The recipient, by definition, is\nagnostic to the data format at the source. Delta Sharing provides the necessary\nabstractions for governed data access in many different languages and tools.\n\nDelta Sharing is uniquely positioned to answer many of the challenges of data\nsharing in a scalable manner within the context of highly regulated domains like\nthe public sector:\n\n**• Privacy and security concerns** — Personally identifiable data or otherwise\nsensitive or restricted data is a major part of the data exchange needs of a\ndata-driven and modernized government. Given the sensitive nature of such\ndata, it is paramount that the governance of data sharing is maintained in a\ncoherent and unified manner. Any unnecessary process and technological\ncomplexities increase the risk of over-sharing data. 
With this in mind,\nDelta Sharing has been designed with [security best practices](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html) from the\nvery inception. The protocol provides end-to-end encryption, short-lived\ncredentials, and accessible and intuitive audit and governance features. All\nof these capabilities are available in a centralized way across all your Delta\ntables across all clouds.\n\n**• Quality and accuracy** — Another challenge of data sharing is ensuring\nthat the data being shared is of high quality and accuracy. Given that\nthe underlying data is stored as Delta tables, we can guarantee that the\n[transactional nature of data](https://docs.delta.io/latest/concurrency-control.html#concurrency-control) is respected; Delta ensures ACID properties\nof data. Furthermore, Delta supports [data constraints](https://docs.delta.io/latest/delta-constraints.html#constraints) to guarantee data\n\n\nquality requirements at storage. Unfortunately, other formats such as [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) ,\n[CSVW](https://csvw.org/) , [ORC](https://www.google.com/search?q=orc+data+format&rlz=1C5GCEM_enGB931GB932&ei=CzHRY6KqI4S78gL7hoigCw&oq=ORC+da&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQARgAMgUIABCRAjIFCAAQkQIyBQgAEIAEMgUIABCABDIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjoKCAAQRxDWBBCwAzoHCAAQsAMQQzoNCAAQ5AIQ1gQQsAMYAToPCC4Q1AIQyAMQsAMQQxgCOgwILhDIAxCwAxBDGAI6FQguEMcBENEDENQCEMgDELADEEMYAjoECAAQQzoGCAAQChBDOgoIABCxAxCDARBDOgcIABCxAxBDSgQIQRgASgQIRhgBUCxY3AJg3QxoAXABeACAAW6IAbgCkgEDMC4zmAEAoAEByAETwAEB2gEGCAEQARgJ2gEGCAIQARgI&sclient=gws-wiz-serp) , [Avro](https://en.wikipedia.org/wiki/Apache_Avro) , [XML](https://en.wikipedia.org/wiki/XML) , etc., do not have such properties without significant\nadditional effort. The issue becomes even more emphasized by the fact\nthat data quality cannot be ensured in the same way on both the data\nprovider and data recipient side without the exact reimplementation of the\nsource systems. It is critical to embed quality and metadata together with\ndata to ensure quality travels together with data. Any decoupled approach\nto managing data, metadata and quality separately increases the risk of\nsharing and can lead to undesirable outcomes.\n\n**• Lack of standardization** — Another challenge of data sharing is the lack\nof standardization in how data is collected, organized, and stored. This is\nparticularly pronounced in the context of governmental activities. While\ngovernments have proposed standard formats (e.g., Office for National\nStatistics [promotes usage of CSVW](https://www.ons.gov.uk/aboutus/transparencyandgovernance/datastrategy/datastandards#metadata-exchange) ), aligning all private and public\nsector companies to standards proposed by such initiatives is a massive\nchallenge. Other industries may have different requirements for scalability,\ninteroperability, format complexity, lack of structure in data, etc. Most of\nthe currently advocated standards are lacking in multiple such aspects.\nDelta is the most mature candidate for assuming the central role in the\nstandardization of data exchange format. It has been built as a transactional\nand scalable data format, it supports structured, semi-structured and\nunstructured data, it stores data schema and metadata together with data\nand it provides a scalable enterprise-grade sharing protocol through Delta\nSharing. 
Finally, Delta is one of the most popular open source projects\nin the ecosystem and, since May 2022, has surpassed [7 million monthly](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/)\n[downloads](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/) .\n\n\n-----\n\n**• Cultural and organizational barriers** — These challenges can be\nsummarized by one word: friction. Unfortunately, it’s a common problem\nfor civil servants to struggle to obtain access to both internal and external\ndata due to over-cumbersome processes, policies and outdated standards.\nThe principles we are using to build our data platforms and our data sharing\nplatforms have to be self-promoting, have to drive adoption and have to\ngenerate habits that adhere to best practices.\n\nIf there is friction with standard adoption, the only way to ensure standards\nare respected is by enforcement and that itself is yet another barrier to\nachieving data sustainability. Organizations have already adopted Delta\nSharing both in the private and public sectors. For example, [U.S. Citizenship](https://www.uscis.gov/)\n[and Immigration Services](https://www.uscis.gov/) (USCIS) uses Delta Sharing to satisfy several\n[interagency data-sharing](https://delta.io/blog/2022-12-08-data-sharing-across-government-delta-sharing/) requirements. Similarly, Nasdaq describes Delta\nSharing as the “ [future of financial data sharing,](https://www.nasdaq.com/articles/delta-sharing-protocol%3A-the-evolution-of-financial-data-sharing-2021-05-26) ” and that future is open\nand governed.\n\n\n\n**• Technical challenges** — Federation at the government scale or even\nfurther across multiple industries and geographies poses technical\nchallenges. Each organization within this federation owns its platform\nand drives technological, architectural, platform and tooling choices.\n\nHow can we promote interoperability and data exchange in this vast,\ndiverse technological ecosystem? The data is the only viable integration\nvehicle. As long as the data formats we utilize are scalable, open and\ngoverned, we can use them to abstract from individual platforms and\ntheir intrinsic complexities.\n\nDelta format and Delta Sharing solve this wide array of requirements and\nchallenges in a scalable, robust and open way. This positions Delta Sharing\nas the strongest choice for unification and simplification of the protocol and\nmechanism through which we share data across both private and public sectors.\n\n\n-----\n\n**Data Sharing through data clean rooms**\n\n\n[Data clean rooms](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html) address this particular need. With data clean rooms you can\nshare data with third parties in a privacy-safe environment. With Unity Catalog ,\nyou can enable fine-grained access controls on the data and meet your privacy\nrequirements. In this architecture, the data participants never get access to\nthe raw data. The only outputs from the clean rooms are those data assets\ngenerated in a pre-agreed, governed and fully controlled manner that ensures\ncompliance with the requirements of all parties involved.\n\nFinally, data clean rooms and Delta Sharing can address hybrid on-premise-offpremise deployments, where the data with the most restricted access remains\non the premise. In contrast, less restricted data is free to leverage the power\nof the cloud offerings. 
In said scenario, there may be a need to combine the\npower of the cloud with the restricted data to solve advanced use cases where\ncapabilities are unavailable on the on-premises data platforms. Data clean rooms\ncan ensure that no physical data copies of the raw restricted data are created,\nresults are produced within the clean room’s controlled environment and results\nare shared back to the on-premises environment (if the results maintain the\nrestricted access within the defined policies) or are forwarded to any other\ncompliant and predetermined destination system.\n\n\nTaking the complexities of data sharing within highly regulated space and the\npublic sector one step further — what if we require to share the knowledge\ncontained in the data without ever granting direct access to the source data to\nexternal parties? These requirements may prove achievable and desirable where\nthe data sharing risk appetite is very low.\n\nIn many public sector contexts, there are concerns that combining the data that\ndescribes citizens could lead to a big brother scenario where simply too much\ndata about an individual is concentrated in a single data asset. If it were to fall\ninto the wrong hands, such a hypothetical data asset could lead to immeasurable\nconsequences for individuals and the trust in public sector services could\nerode. On the other hand, the value of a 360 view of the citizen could accelerate\nimportant decision-making. It could immensely improve the quality of policies\nand services provided to the citizens.\n\n\n-----\n\n**Citizen value of data sharing**\n\nEvery decision made by the government is a decision that affects its citizens.\nWhether the decision is a change to a policy, granting a benefit or preventing\ncrime, it can significantly influence the quality of our society. Data is a key factor\nin making the right decisions and justifying the decisions made. Simply put,\nwe can’t expect high-quality decisions without the high quality of data and a\ncomplete view of the data (within the permitted context). Without data sharing,\nwe will remain in a highly fragmented position where our ability to make those\ndecisions is severely limited or even completely compromised. In this blog, we\nhave covered several technological solutions available within the lakehouse that\ncan derisk and accelerate how the government is leveraging the data ecosystem\nin a sustainable and scalable way.\n\nFor more details on the industry use cases that Delta Sharing is addressing\nplease consult [A New Approach to Data Sharing](https://www.databricks.com/product/unity-catalog) eBook.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 03\n\n\n### Ready-to-Use Notebooks and Data Sets\n\n\n-----\n\n**Digital Twins**\n\nLeverage digital twins — virtual\nrepresentations of devices and\nobjects — to optimize operations and\ngain insights\n\n\nThis section includes several Solution Accelerators — free, ready-to-use\n\nexamples of data solutions from different industries ranging from retail to\n\nmanufacturing and healthcare. Each of the following scenarios includes\n\nnotebooks with code and step-by-step instructions to help you get\n\nstarted. 
Get hands-on experience with the Databricks Lakehouse Platform\n\n\nby trying the following for yourself: **[Explore the Solution](https://databricks.com/solutions/accelerators/digital-twins)**\n\n\n**Overall Equipment**\n**Effectiveness**\n\nIngest equipment sensor data for\nmetric generation and data driven\ndecision-making\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)**\n\n**Real-time point of**\n**sale analytics**\n\nCalculate current inventories for\nvarious products across multiple store\nlocations with Delta Live Tables\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/real-time-point-of-sale-analytics)**\n\n\n**Recommendation Engines**\n**for Personalization**\n\nImprove customers’ user experience\nand conversion with personalized\nrecommendations\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n**Understanding Price**\n**Transparency Data**\n\nEfficiently ingest large healthcare data\nsets to create price transparency for\nbetter understanding of healthcare costs\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/price-transparency-data)**\n\nAdditional Solution Accelerators with ready-to-use notebooks can be found here:\n\n**[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)**\n\n\n-----\n\n**SECTION**\n\n# 04\n\n\n### Case Studies\n\n**4.1** Akamai\n\n**4.2** Grammarly\n\n**4.3** Honeywell\n\n**4.4** Wood Mackenzie\n\n**4.5** Rivian\n\n**4.6** AT&T\n\n\n-----\n\nSECTION 4.1\n**Akamai delivers real-time security**\n**analytics using Delta Lake**\n\n\n###### <1\n\n**Min ingestion time,**\n**reduced from 15 min**\n\n\n###### <85%\n\n**Of queries have a response**\n**time of 7 seconds or less**\n\n\n**I N D U S T R Y**\n[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n\n**S O L U T I O N**\n[Threat Detection](https://databricks.com/solutions/accelerators/threat-detection)\n\n**P L AT F O R M U S E C A S E**\nDelta Lake, Data Streaming, Photon,\n[Databricks SQL](https://databricks.com/product/databricks-sql)\n\n**C LO U D**\n[Azure](https://www.databricks.com/product/azure)\n\n\nAkamai runs a pervasive, highly distributed content delivery network (CDN). Its CDN\n\nuses approximately 345,000 servers in more than 135 countries and over 1,300 networks\n\nworldwide to route internet traffic for some of the largest enterprises in media, commerce,\n\nfinance, retail and many other industries. About 30% of the internet’s traffic flows through\n\nAkamai servers. Akamai also provides cloud security solutions.\n\nIn 2018, the company launched a web security analytics tool that offers Akamai customers\n\na single, unified interface for assessing a wide range of streaming security events and\n\nperforming analysis of those events. The web analytics tool helps Akamai customers to\n\ntake informed actions in relation to security events in real time. Akamai is able to stream\n\nmassive amounts of data and meet the strict SLAs it provides to customers by leveraging\n\nDelta Lake and the Databricks Lakehouse Platform for the web analytics tool.\n\n\n-----\n\n**Ingesting and streaming enormous amounts of data**\n\nAkamai’s web security analytics tool ingests approximately 10GB of data related\nto security events per second. 
Data volume can increase significantly when\nretail customers conduct a large number of sales — or on big shopping days like\nBlack Friday or Cyber Monday. The web security analytics tool stores several\npetabytes of data for analysis purposes. Those analyses are performed to\nprotect Akamai’s customers and provide them with the ability to explore and\nquery security events on their own.\n\nThe web security analytics tool initially relied on an on-premises architecture\nrunning Apache Spark™ on Hadoop. Akamai offers strict service level agreements\n(SLAs) to its customers of 5 to 7 minutes from when an attack occurs until it is\ndisplayed in the tool. The company sought to improve ingestion and query speed\nto meet those SLAs. “Data needs to be as real-time as possible so customers\ncan see what is attacking them,” says Tomer Patel, Engineering Manager at\nAkamai. “Providing queryable data to customers quickly is critical. We wanted to\nmove away from on-prem to improve performance and our SLAs so the latency\nwould be seconds rather than minutes.”\n\n**Delta Lake allows us to not only query the data better but to**\n**also acquire an increase in the data volume. We’ve seen an**\n**80% increase in traffic and data in the last year, so being able**\n**to scale fast is critical.**\n\n\nAfter conducting proofs of concept with several companies, Akamai chose to\nbase its streaming analytics architecture on Spark and the Databricks Lakehouse\nPlatform. “Because of our scale and the demands of our SLA, we determined that\nDatabricks was the right solution for us,” says Patel. “When we consider storage\noptimization, and data caching, if we went with another solution, we couldn’t\nachieve the same level of performance.”\n\n**Improving speed and reducing costs**\n\nToday, the web security analytics tool ingests and transforms data, stores it\nin cloud storage, and sends the location of the file via Kafka. It then uses a\nDatabricks Job as the ingest application. Delta Lake, the open source storage\nformat at the base of the Databricks Lakehouse Platform, supports real-time\nquerying on the web security analytics data. Delta Lake also enables Akamai to\nscale quickly. “Delta Lake allows us to not only query the data better but to also\nacquire an increase in the data volume,” says Patel. “We’ve seen an 80% increase\nin traffic and data in the last year, so being able to scale fast is critical.”\n\nAkamai also uses Databricks SQL (DBSQL) and Photon, which provide extremely\n\nfast query performance. Patel added that Photon provided a significant boost\nto query performance. Overall, Databricks’ streaming architecture combined\nwith DBSQL and Photon enables Akamai to achieve real-time analytics, which\ntranslates to real-time business benefits.\n\n\n**Tomer Patel**\nEngineering Manager, Akamai\n\n\n-----\n\nPatel says he likes that Delta Lake is open source, as the company has benefitted\nfrom a community of users working to improve the product. “The fact that Delta\nLake is open source and there’s a big community behind it means we don’t need\nto implement everything ourselves,” says Patel. “We benefit from fixed bugs that\nothers have encountered and from optimizations that are contributed to the\nproject.” Akamai worked closely with Databricks to ensure Delta Lake can meet\nthe scale and performance requirements Akamai defined. 
These improvements\nhave been contributed back to the project (many of which were made available as\npart of Delta Lake 2.0), and so any user running Delta Lake now benefits from the\ntechnology being tested at such a large scale in a real-world production scenario.\n\n\n**Meeting aggressive requirements for scale,**\n**reliability and performance**\n\nUsing Spark Structured Streaming on the Databricks Lakehouse Platform enables\nthe web security analytics tool to stream vast volumes of data and provide\nlow-latency, real-time analytics-as-a-service to Akamai’s customers. That way\nAkamai is able to make available security event data to customers within the\nSLA of 5 to 7 minutes from when an attack occurs. “Our focus is performance,\nperformance, performance,” says Patel. “The platform’s performance and\nscalability are what drives us.”\n\nUsing the Databricks Lakehouse Platform, it now takes under 1 minute to ingest\nthe security event data. “Reducing ingestion time from 15 minutes to under 1\nminute is a huge improvement,” says Patel. “It benefits our customers because\nthey can see the security event data faster and they have a view of what exactly\nis happening as well as the capability to filter all of it.”\n\nAkamai’s biggest priority is to provide customers with a good experience and\nfast response times. To date, Akamai has moved about 70% of security event\ndata from its on-prem architecture to Databricks, and the SLA for customer\nquery and response time has improved significantly as a result. “Now, with the\nmove to Databricks, our customers experience much better response time, with\nover 85% of queries completing under 7 seconds.” Providing that kind of realtime data means Akamai can help its customers stay vigilant and maintain an\noptimal security configuration.\n\n\n-----\n\nSECTION 4.2\n**Grammarly uses Databricks Lakehouse to improve**\n**user experience**\n\n\n###### 110%\n\n**Faster querying, at 10% of the cost**\n**to ingest, than a data warehouse**\n\n\n###### 5 billion\n\n**Daily events available for**\n**analytics in under 15 minutes**\n\n\nGrammarly’s mission is to improve lives by improving communication. The company’s\n\ntrusted AI-powered communication assistance provides real-time suggestions to\n\nhelp individuals and teams write more confidently and achieve better results. Its\n\ncomprehensive offerings — [Grammarly Premium](https://www.grammarly.com/premium) , [Grammarly Business](https://www.grammarly.com/business) , [Grammarly for](https://www.grammarly.com/edu)\n\n[Education](https://www.grammarly.com/edu) and [Grammarly for Developers](https://developer.grammarly.com/) — deliver leading communication support\n\nwherever writing happens. 
As the company grew over the years, its legacy, homegrown\n\nanalytics system made it challenging to evaluate large data sets quickly and cost-\n\neffectively.\n\nBy migrating to the Databricks Lakehouse Platform, Grammarly is now able to sustain a\n\nflexible, scalable and highly secure analytics platform that helps 30 million people and\n\n50,000 teams worldwide write more effectively every day.\n\n\n**I N D U S T R Y**\n[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n\n**S O L U T I O N**\nRecommendation Engines, Advertising\nEffectiveness, Customer Lifetime Value\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Delta Lake, Unity Catalog,\n[Machine Learning, ETL](https://www.databricks.com/product/machine-learning)\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Harnessing data to improve communications for millions of**\n**users and thousands of teams**\n\nWhen people use Grammarly’s AI communication assistance, they receive\nsuggestions to help them improve multiple dimensions of communication,\nincluding spelling and grammar correctness, clarity and conciseness, word\nchoice, style, and tone. Grammarly receives feedback when users accept, reject\nor ignore its suggestions through app-created events, which total about 5 billion\nevents per day.\n\nHistorically, Grammarly relied on a homegrown legacy analytics platform and\nleveraged an in-house SQL-like language that was time-intensive to learn and\nmade it challenging to onboard new hires. As the company grew, Grammarly\ndata analysts found that the platform did not sufficiently meet the needs of its\nessential business functions, especially marketing, sales and customer success.\nAnalysts found themselves copying and pasting data from spreadsheets\nbecause the existing system couldn’t effectively ingest the external data needed\nto answer questions such as, “Which marketing channel delivers the highest\nROI?” Reporting proved challenging because the existing system didn’t support\nTableau dashboards, and company leaders and analysts needed to ensure they\ncould make decisions quickly and confidently.\n\n\n**Databricks Lakehouse has given us the flexibility to unleash**\n**our data without compromise. That flexibility has allowed us**\n**to speed up analytics to a pace we’ve never achieved before.**\n\n**Chris Locklin**\nEngineering Manager, Data Platforms, Grammarly\n\nGrammarly also sought to unify its data warehouses in order to scale and\nimprove data storage and query capabilities. As it stood, large Amazon EMR\nclusters ran 24/7 and drove up costs. With the various data sources, the team\nalso needed to maintain access control. “Access control in a distributed file\nsystem is difficult, and it only gets more complicated as you ingest more data\nsources,” says Chris Locklin, Engineering Manager, Data Platforms at Grammarly.\nMeanwhile, reliance on a single streaming workflow made collaboration among\nteams challenging. Data silos emerged as different business areas implemented\nanalytics tools individually. “Every team decided to solve their analytics needs in\nthe best way they saw fit,” says Locklin. “That created challenges in consistency\nand knowing which data set was correct.”\n\n\n-----\n\nAs its data strategy was evolving, Grammarly’s priority was to get the most out\nof analytical data while keeping it secure. 
This was crucial because security is\nGrammarly’s number-one priority and most important feature, both in how it\nprotects its users’ data and how it ensures its own company data remains secure.\nTo accomplish that, Grammarly’s data platform team sought to consolidate\ndata and unify the company on a single platform. That meant sustaining a highly\nsecure infrastructure that could scale alongside the company’s growth, improving\ningestion flexibility, reducing costs and fueling collaboration.\n\n**Improving analytics, visualization and decision-making**\n**with the lakehouse**\n\nAfter conducting several proofs of concept to enhance its infrastructure,\nGrammarly migrated to the Databricks Lakehouse Platform. Bringing all the\nanalytical data into the lakehouse created a central hub for all data producers\nand consumers across Grammarly, with Delta Lake at the core.\n\nUsing the lakehouse architecture, data analysts within Grammarly now have a\nconsolidated interface for analytics, which leads to a single source of truth and\n\nconfidence in the accuracy and availability of all data managed by the data\nplatform team. Across the organization, teams are using Databricks SQL to\nconduct queries within the platform on both internally generated product data\nand external data from digital advertising platform partners. Now, they can easily\nconnect to Tableau and create dashboards and visualizations to present to\nexecutives and key stakeholders.\n\n\n“Security is of utmost importance at Grammarly, and our team’s numberone objective is to own and protect our analytical data,” says Locklin. “Other\ncompanies ask for your data, hold it for you, and then let you perform analytics\non it. Just as Grammarly ensures our users’ data always remains theirs, we\nwanted to ensure our company data remained ours. Grammarly’s data stays\ninside of Grammarly.”\n\nWith its data consolidated in the lakehouse, different areas of Grammarly’s\nbusiness can now analyze data more thoroughly and effectively. For example,\nGrammarly’s marketing team uses advertising to attract new business. Using\nDatabricks, the team can consolidate data from various sources to extrapolate\na user’s lifetime value, compare it with customer acquisition costs and get rapid\nfeedback on campaigns. Elsewhere, data captured from user interactions flow\ninto a set of tables used by analysts for ad hoc analysis to inform and improve\nthe user experience.\n\nBy consolidating data onto one unified platform, Grammarly has eliminated data\nsilos. “The ability to bring all these capabilities, data processing and analysis\nunder the same platform using Databricks is extremely valuable,” says Sergey\nBlanket, Head of Business Intelligence at Grammarly. “Doing everything from ETL\nand engineering to analytics and ML under the same umbrella removes barriers\nand makes it easy for everyone to work with the data and each other.”\n\n\n-----\n\nTo manage access control, enable end-to-end observability and monitor data\nquality, Grammarly relies on the data lineage capabilities within Unity Catalog.\n“Data lineage allows us to effectively monitor usage of our data and ensure it\nupholds the standards we set as a data platform team,” says Locklin. “Lineage is\nthe last crucial piece for access control. 
It allows analysts to leverage data to do\ntheir jobs while adhering to all usage standards and access controls, even when\nrecreating tables and data sets in another environment.”\n\n**Faster time to insight drives more intelligent**\n**business decisions**\n\nUsing the Databricks Lakehouse Platform, Grammarly’s engineering teams now\nhave a tailored, centralized platform and a consistent data source across the\ncompany, resulting in greater speed and efficiency and reduced costs. The\nlakehouse architecture has led to 110% faster querying, at 10% of the cost to\ningest, than a data warehouse. Grammarly can now make its 5 billion daily events\navailable for analytics in under 15 minutes rather than 4 hours, enabling low-latency data aggregation and query optimization. This allows the team to quickly\nreceive feedback about new features being rolled out and understand if they are\nbeing adopted as expected. Ultimately, it helps them understand how groups\nof users engage with the UX, improving the experience and ensuring features\nand product releases bring the most value to users. “Everything my team does\nis focused on creating a rich, personalized experience that empowers people to\ncommunicate more effectively and achieve their potential,” says Locklin.\n\n\nMoving to the lakehouse architecture also solved the challenge of access control\nover distributed file systems, while Unity Catalog enabled fine-grained, role-based access controls and real-time data lineage. “Unity Catalog gives us the\nability to manage file permissions with more flexibility than a database would\nallow,” says Locklin. “It solved a problem my team couldn’t solve at scale. While\nusing Databricks allows us to keep analytical data in-house, Unity Catalog helps\nus continue to uphold the highest standards of data protection by controlling\naccess paradigms inside our data. That opens a whole new world of things that\nwe can do.”\n\nUltimately, migrating to the Databricks Lakehouse Platform has helped\nGrammarly to foster a data-driven culture where employees get fast access\nto analytics without having to write complex queries, all while maintaining\nGrammarly’s enterprise-grade security practices. “Our team’s mission is to help\nGrammarly make better, faster business decisions,” adds Blanket. “My team\nwould not be able to effectively execute on that mission if we did not have a\nplatform like Databricks available to us.” Perhaps most critically, migrating off its\nrigid legacy infrastructure gives Grammarly the adaptability to do more while\nknowing the platform will evolve as its needs evolve. “Databricks has given us the\nflexibility to unleash our data without compromise,” says Locklin. “That flexibility\nhas allowed us to speed up analytics to a pace we’ve never achieved before.”\n\n\n-----\n\nSECTION 4.3\n**Honeywell selects Delta Live Tables for streaming data**\n\nCompanies are under growing pressure to reduce energy use, while at the same time\n\nthey are looking to lower costs and improve efficiency. Honeywell delivers industry-specific\n\nsolutions that include aerospace products and services, control technologies\n\nfor buildings and industry, and performance materials globally. 
Honeywell’s Energy\n\nand Environmental Solutions division uses IoT sensors and other technologies to help\n\nbusinesses worldwide manage energy demand, reduce energy consumption and carbon\n\nemissions, optimize indoor air quality, and improve occupant well-being.\n\nAccomplishing this requires Honeywell to collect vast amounts of data. Using Delta Live\n\nTables on the Databricks Lakehouse Platform, Honeywell’s data team can now ingest\n\nbillions of rows of sensor data into Delta Lake and automatically build SQL endpoints for\n\nreal-time queries and multilayer insights into data at scale — helping Honeywell improve\n\nhow it manages data and extract more value from it, both for itself and for its customers.\n\n\n**I N D U S T R Y**\n[Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Delta Lake, Delta Live Tables\n\n\n**C LO U D**\n[Azure](https://databricks.com/product/azure) **Databricks helps us pull together many different data sources, do**\n**aggregations, and bring the significant amount of data we collect**\n**from our buildings under control so we can provide customers value.**\n\n**Dr. Chris Inkpen**\nGlobal Solutions Architect, Honeywell Energy and Environmental Solutions\n\n\n-----\n\n**Processing billions of IoT data points per day**\n\nHoneywell’s solutions and services are used in millions of buildings around the\nworld. Helping its customers create buildings that are safe, more sustainable\nand productive can require thousands of sensors per building. Those sensors\nmonitor key factors such as temperature, pressure, humidity and air quality.\nIn addition to the data collected by sensors inside a building, data is also\ncollected from outside, such as weather and pollution data. Another data set\nconsists of information about the buildings themselves — such as building\ntype, ownership, floor plan, square footage of each floor and square footage\nof each room. That data set is combined with the two disparate data streams,\nadding up to a lot of data across multiple structured and unstructured formats,\nincluding images and video streams, telemetry data, event data, etc. At peaks,\nHoneywell ingests anywhere between 200 to 1,000 events per second for any\nbuilding, which equates to billions of data points per day. Honeywell’s existing\ndata infrastructure was challenged to meet such demand. It also made it difficult\nfor Honeywell’s data team to query and visualize its disparate data so it could\nprovide customers with fast, high-quality information and analysis.\n\n**ETL simplified: high-quality, reusable data pipelines**\n\nWith Delta Live Tables (DLT) on the Databricks Lakehouse Platform, Honeywell’s\ndata team can now ingest billions of rows of sensor data into Delta Lake and\nautomatically build SQL endpoints for real-time queries and multilayer insights\ninto data at scale. “We didn’t have to do anything to get DLT to scale,” says Dr.\n\n\nChris Inkpen, Global Solutions Architect at Honeywell Energy and Environmental\nSolutions. “We give the system more data, and it copes. Out of the box, it’s given\nus the confidence that it will handle whatever we throw at it.”\n\nHoneywell credits the Databricks Lakehouse Platform for helping it to unify its\nvast and varied data — batch, streaming, structured and unstructured — into\none platform. “We have many different data types. 
The Databricks Lakehouse\nPlatform allows us to use things like Apache Kafka and Auto Loader to load and\nprocess multiple types of data and treat everything as a stream of data, which is\nawesome. Once we’ve got structured data from unstructured data, we can write\nstandardized pipelines.”\n\nHoneywell data engineers can now build and leverage their own ETL pipelines\nwith Delta Live Tables and gain insights and analytics quickly. ETL pipelines can\nbe reused regardless of environment, and data can run in batches or streams. It’s\nalso helped Honeywell’s data team transition from a small team to a larger team.\n“When we wrote our first few pipelines before DLT existed, only one person could\nwork in one part of the functionality. Now that we’ve got DLT and the ability to\nhave folders with common functionality, we’ve got a really good platform where\nwe can easily spin off different pipelines.”\n\nDLT also helped Honeywell establish standard log files to monitor and cost-justify its product pipelines. “Utilizing DLT, we can analyze which parts of our\npipeline need optimization,” says Inkpen. “With standard pipelines, that was\nmuch more chaotic.”\n\n\n-----\n\n**Enabling ease, simplicity and scalability across the**\n**infrastructure**\n\nDelta Live Tables has helped Honeywell’s data team consistently query\ncomplex data while offering simplicity of scale. It also enables end-to-end data\nvisualization of Honeywell’s data streams as they flow into its infrastructure, are\ntransformed, and then flow out. “Ninety percent of our ETL is now captured in\ndiagrams, so that’s helped considerably and improves data governance. DLT\nencourages — and almost enforces — good design,” says Inkpen.\n\nUsing the lakehouse as a shared workspace has helped promote teamwork and\ncollaboration at Honeywell. “The team collaborates beautifully now, working\ntogether every day to divvy up the pipeline into their own stories and workloads,”\nsays Inkpen.\n\nMeanwhile, the ability to manage streaming data with low latency and better\nthroughput has improved accuracy and reduced costs. “Once we’ve designed\nsomething using DLT, we’re pretty safe from scalability issues — certainly a\nhundred times better than if we hadn’t written it in DLT,” says Inkpen. “We can\nthen go back and look at how we can take a traditional job and make it more\nperformant and less costly. We’re in a much better position to try and do that\nfrom DLT.”\n\n\nUsing Databricks and DLT also helps the Honeywell team perform with greater\nagility, which allows them to innovate faster while empowering developers to\nrespond to user requirements almost immediately. “Our previous architecture\nmade it impossible to know what bottlenecks we had and what we needed to\nscale. Now we can do data science in near real-time.”\n\nUltimately, Honeywell can now more quickly provide its customers with the\ndata and analysis they need to make their buildings more efficient, healthier\nand safer for occupants. “I’m continuously looking for ways to improve our\nlifecycles, time to market, and data quality,” says Inkpen. “Databricks helps\nus pull together many different data sources, do aggregations, and bring the\nsignificant amount of data we collect from our buildings under control so we\ncan provide customers value.”\n\n**Ready to get started? 
Learn more about** **[Delta Live Tables here](https://www.databricks.com/product/delta-live-tables)** **.**\n\n\n-----\n\nSECTION 4.4\n**Wood Mackenzie helps customers transition to a more**\n**sustainable future**\n\n\n###### 12 Billion\n\n**Data points processed**\n**each week**\n\n\n###### 80-90%\n\n**Reduction in**\n**processing time**\n\n\n###### Cost Savings\n\n**In operations through**\n**workflow automation**\n\n\nWood Mackenzie offers customized consulting and analysis for a wide range of clients\n\nin the energy and natural resources sectors. Founded in Edinburgh, the company first\n\ncultivated deep expertise in upstream oil and gas, then broadened its focus to deliver\n\ndetailed insight for every interconnected sector of the energy, chemicals, metals and\n\nmining industries.\n\nToday it sees itself playing an important role in the transition to a more sustainable\n\nfuture. Using Databricks Workflows to automate ETL pipelines helps Wood Mackenzie\n\ningest and process massive amounts of data. Using a common workflow provided\n\nhigher visibility to engineering team members, encouraging better collaboration. With\n\nan automated, transparent workflow in place, the team saw improved productivity and\n\ndata quality and an easier path to fix pipeline issues when they arise.\n\n\n**I N D U S T R Y**\n[Energy and Utilities](https://www.databricks.com/solutions/industries/oil-and-gas)\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Workflows\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Delivering insights to the energy industry**\n\nFulfilling Wood Mackenzie’s mission, the Lens product is a data analytics platform\nbuilt to deliver insights at key decision points for customers in the energy sector.\nFeeding into Lens are vast amounts of data collected from various data sources\nand sensors used to monitor energy creation, oil and gas production, and more.\nThose data sources update about 12 billion data points every week that must\nbe ingested, cleaned and processed as part of the input for the Lens platform.\nYanyan Wu, Vice President of Data at Wood Mackenzie, manages a team of big\ndata professionals that build and maintain the ETL pipeline that provides input\ndata for Lens. The team is leveraging the Databricks Lakehouse Platform and\nuses Apache Spark™ for parallel processing, which provides greater performance\nand scalability benefits compared to an earlier single-node system working\nsequentially. “We saw a reduction of 80-90% in data processing time, which\nresults in us providing our clients with more up-to-date, more complete and\nmore accurate data,” says Wu.\n\n**Our mission is to transform the way we power the planet.**\n**Our clients in the energy sector need data, consulting services**\n**and research to achieve that transformation. Databricks**\n**Workflows gives us the speed and flexibility to deliver the**\n**insights our clients need.**\n\n\n**Improved collaboration and transparency with a common**\n**workflow**\n\nThe data pipeline managed by the team includes several stages for standardizing\nand cleaning raw data, which can be structured or unstructured and may be in\nthe form of PDFs or even handwritten notes.\n\nDifferent members of the data team are responsible for different parts of\nthe pipeline, and there is a dependency between the processing stages each\nteam member owns. Using [Databricks Workflows](https://www.databricks.com/product/workflows) , the team defined a common\nworkstream that the entire team uses. 
Each stage of the pipeline is implemented\nin a Python notebook, which is run as a job in the main workflow.\n\nEach team member can now see exactly what code is running on each stage,\nmaking it easy to find the cause of the issue. Knowing who owns the part of the\npipeline that originated the problem makes fixing issues much faster. “Without\na common workflow, different members of the team would run their notebooks\nindependently, not knowing that failure in their run affected stages downstream,”\nsays Meng Zhang, Principal Data Analyst at Wood Mackenzie. “When trying to\nrerun notebooks, it was hard to tell which notebook version was initially run and\nthe latest version to use.”\n\n\n**Yanyan Wu**\nVice President of Data, Wood Mackenzie\n\n\n-----\n\nUsing Workflows’ alerting capabilities to notify the team when a workflow task\nfails ensures everyone knows a failure occurred and allows the team to work\ntogether to resolve the issue quickly. The definition of a common workflow\ncreated consistency and transparency that made collaboration easier. “Using\nDatabricks Workflows allowed us to encourage collaboration and break up the\nwalls between different stages of the process,” explains Wu. “It allowed us all to\nspeak the same language.”\n\nCreating transparency and consistency is not the only advantage the team saw.\nUsing Workflows to automate notebook runs also led to cost savings compared\nto running interactive notebooks manually.\n\n**Improved code development productivity**\n\nThe team’s ETL pipeline development process involves iteration on PySpark\nnotebooks. Leveraging [interactive notebooks](https://www.databricks.com/product/collaborative-notebooks) in the Databricks UI makes it easy\nfor data professionals on the team to manually develop and test a notebook.\nBecause Databricks Workflows supports running notebooks as task type\n(along with Python files, JAR files and other types), when the code is ready for\n\n\ndeveloping notebooks with the interactive notebook UI while leveraging the\npower of automation, which reduces potential issues that may happen when\nrunning notebooks manually.\n\nThe team has gone even further in increasing productivity by developing a\nCI/CD process. “By connecting our source control code repository, we know\nthe workflow always runs the latest code version we committed to the repo,”\nexplains Zhang. “It’s also easy to switch to a development branch to develop a\nnew feature, fix a bug and run a development workflow. When the code passes\nall tests, it is merged back to the main branch and the production workflow is\nautomatically updated with the latest code.”\n\nGoing forward, Wood Mackenzie plans to optimize its use of Databricks\nWorkflows to automate machine learning processes such as model training,\nmodel monitoring and handling model drift. The firm uses ML to improve its data\nquality and extract insights to provide more value to its clients. “Our mission is to\ntransform how we power the planet,” Wu says. “Our clients in the energy sector\nneed data, consulting services and research to achieve that transformation.\nDatabricks Workflows gives us the speed and flexibility to deliver the insights our\nclients need.”\n\n\nproduction, it’s easy and cost effective to automate it by adding it to a workflow.\nThe workflow can then be easily revised by adding or removing any steps to\nor from the defined flow. 
This way of working keeps the benefit of manually\n\n\n-----\n\nSECTION 4.5\n**Rivian redefines driving experience with**\n**the Databricks Lakehouse**\n\n###### 250 platform users\n\n**A 50x increase from a year ago**\n\nRivian is preserving the natural world for future generations with revolutionary Electric\n\nAdventure Vehicles (EAVs). With over 25,000 EAVs on the road generating multiple\n\nterabytes of IoT data per day, the company is using data insights and machine\n\nlearning to improve vehicle health and performance. However, with legacy cloud\n\ntooling, it struggled to scale pipelines cost-effectively and spent significant resources\n\non maintenance — slowing its ability to be truly data driven.\n\nSince moving to the Databricks Lakehouse Platform, Rivian can now understand how\n\na vehicle is performing and how this impacts the driver using it. Equipped with these\n\ninsights, Rivian is innovating faster, reducing costs, and ultimately, delivering a better\n\ndriving experience to customers.\n\n\n**I N D U S T R Y**\n[Manufacturing](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)\n\n**S O L U T I O N**\nPredictive Maintenance, Scaling ML Models\nfor IoT, Data-Driven ESG\n\n**P L AT F O R M**\n[Lakehouse](https://www.databricks.com/product/data-lakehouse) , [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , [Unity Catalog](https://www.databricks.com/product/unity-catalog)\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Struggling to democratize data on a legacy platform**\n\n\nsharing of data, which further contributed to productivity issues. Required data\nlanguages and specific expertise of toolsets created a barrier to entry that\nlimited developers from making full use of the data available. Jason Shiverick,\nPrincipal Data Scientist at Rivian, said the biggest issue was the data access. “I\nwanted to open our data to a broader audience of less technical users so they\ncould also leverage data more easily.”\n\nRivian knew that once its EAVs hit the market, the amount of data ingested would\nexplode. In order to deliver the reliability and performance it promised, Rivian\nneeded an architecture that would not only democratize data access, but also\nprovide a common platform to build innovative solutions that can help ensure a\nreliable and enjoyable driving experience.\n\n**Databricks Lakehouse empowers us to lower the barrier of**\n**entry for data access across our organization so we can build**\n**the most innovative and reliable electric vehicles in the world.**\n\n**Wassym Bensaid**\nVice President of Software Development, Rivian\n\n\nBuilding a world that will continue to be enjoyed by future generations requires\na shift in the way we operate. At the forefront of this movement is Rivian —\nan electric vehicle manufacturer focused on shifting our planet’s energy and\ntransportation systems entirely away from fossil fuel. Today, Rivian’s fleet\nincludes personal vehicles and involves a partnership with Amazon to deliver\n100,000 commercial vans. Each vehicle uses IoT sensors and cameras to\ncapture petabytes of data ranging from how the vehicle drives to how various\nparts function. 
With all this data at its fingertips, Rivian is using machine learning\nto improve the overall customer experience with predictive maintenance so that\npotential issues are addressed before they impact the driver.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility\nand tooling limitations that decreased output, prevented collaboration and\nincreased operational costs. It had 30 to 50 large and operationally complicated\ncompute clusters at any given time, which was costly. Not only was the system\ndifficult to manage, but the company experienced frequent cluster outages\nas well, forcing teams to dedicate more time to troubleshooting than to data\nanalysis. Additionally, data silos created by disjointed systems slowed the\n\n\n-----\n\n**Predicting maintenance issues with Databricks Lakehouse**\n\nRivian chose to modernize its data infrastructure on the Databricks Lakehouse\nPlatform, giving it the ability to unify all of its data into a common view for\ndownstream analytics and machine learning. Now, unique data teams have\na range of accessible tools to deliver actionable insights for different use\ncases, from predictive maintenance to smarter product development. Venkat\nSivasubramanian, Senior Director of Big Data at Rivian, says, “We were able\nto build a culture around an open data platform that provided a system for\nreally democratizing data and analysis in an efficient way.” Databricks’ flexible\nsupport of all programming languages and seamless integration with a variety of\ntoolsets eliminated access roadblocks and unlocked new opportunities. Wassym\nBensaid, Vice President of Software Development at Rivian, explains, “Today we\nhave various teams, both technical and business, using Databricks Lakehouse\nto explore our data, build performant data pipelines, and extract actionable\nbusiness and product insights via visual dashboards.”\n\n\nmetrics, Rivian can improve the accuracy of smart features and the control\nthat drivers have over them. Designed to take the stress out of long drives and\ndriving in heavy traffic, features like adaptive cruise control, lane change assist,\nautomatic emergency driving, and forward collision warning can be honed over\ntime to continuously optimize the driving experience for customers.\n\nSecure data sharing and collaboration was also facilitated with the Databricks\nUnity Catalog. Shiverick describes how unified governance for the lakehouse\nbenefits Rivian productivity. “Unity Catalog gives us a truly centralized data\ncatalog across all of our different teams,” he said. “Now we have proper access\nmanagement and controls.” Venkat adds, “With Unity Catalog, we are centralizing\ndata catalog and access management across various teams and workspaces,\nwhich has simplified governance.” End-to-end version controlled governance\nand auditability of sensitive data sources, like the ones used for autonomous\ndriving systems, produces a simple but secure solution for feature engineering.\nThis gives Rivian a competitive advantage in the race to capture the autonomous\ndriving grid.\n\n\nRivian’s ADAS (advanced driver-assistance systems) Team can now easily\nprepare telemetric accelerometer data to understand all EAV motions. This core\nrecording data includes information about pitch, roll, speed, suspension and\nairbag activity, to help Rivian understand vehicle performance, driving patterns\nand connected car system predictability. 
Based on these key performance\n\n\n-----\n\n**Accelerating into an electrified and sustainable world**\n\n\nBy scaling its capacity to deliver valuable data insights with speed, efficiency\nand cost-effectiveness, Rivian is primed to leverage more data to improve\noperations and the performance of its vehicles to enhance the customer\nexperience. Venkat says, “The flexibility that lakehouse offers saves us a lot of\nmoney from a cloud perspective, and that’s a huge win for us.” With Databricks\nLakehouse providing a unified and open source approach to data and analytics,\nthe Vehicle Reliability Team is able to better understand how people are using\ntheir vehicles, and that helps to inform the design of future generations of\nvehicles. By leveraging the Databricks Lakehouse Platform, they have seen a\n30%–50% increase in runtime performance, which has led to faster insights and\nmodel performance.\n\nShiverick explains, “From a reliability standpoint, we can make sure that\ncomponents will withstand appropriate lifecycles. It can be as simple as\nmaking sure door handles are beefy enough to endure constant usage, or as\ncomplicated as predictive and preventative maintenance to eliminate the\nchance of failure in the field. Generally speaking, we’re improving software quality\nbased on key vehicle metrics for a better customer experience.”\n\n\nFrom a design optimization perspective, Rivian’s unobstructed data view is also\nproducing new diagnostic insights that can improve fleet health, safety, stability\nand security. Venkat says, “We can perform remote diagnostics to triage a\nproblem quickly, or have a mobile service come in, or potentially send an OTA\nto fix the problem with the software. All of this needs so much visibility into\nthe data, and that’s been possible with our partnership and integration on the\nplatform itself.” With developers actively building vehicle software to improve\nissues along the way.\n\nMoving forward, Rivian is seeing rapid adoption of Databricks Lakehouse across\ndifferent teams — increasing the number of platform users from 5 to 250 in only\none year. This has unlocked new use cases including using machine learning to\noptimize battery efficiency in colder temperatures, increasing the accuracy of\nautonomous driving systems, and serving commercial depots with vehicle health\ndashboards for early and ongoing maintenance. As more EAVs ship, and its fleet\nof commercial vans expands, Rivian will continue to leverage the troves of data\ngenerated by its EAVs to deliver new innovations and driving experiences that\nrevolutionize sustainable transportation.\n\n\n-----\n\nSECTION 4.6\n**Migrating to the cloud to better serve**\n**millions of customers**\n\n\n###### 300%\n\n**ROI from OpEx savings**\n**and cost avoidance**\n\n\n###### 3X\n\n**Faster delivery of ML/data**\n**science use cases**\n\n\nConsistency in innovation is what keeps customers with a telecommunications company\n\nand is why AT&T is ranked among the best. However, AT&T’s massive on-premises legacy\n\nHadoop system proved complex and costly to manage, impeding operational agility\n\nand efficiency and engineering resources. The need to pivot to cloud to better support\n\nhundreds of millions of subscribers was apparent.\n\nMigrating from Hadoop to Databricks on the Azure cloud, AT&T experienced significant\n\nsavings in operating costs. 
Additionally, the new cloud-based environment has unlocked\n\naccess to petabytes of data for correlative analytics and an AI-as-a-Service offering for\n\n2,500+ users across 60+ business units. AT&T can now leverage all its data — without\n\noverburdening its engineering team or exploding operational costs — to deliver new\n\nfeatures and innovations to its millions of end users.\n\n\n**I N D U S T R Y**\n[Communication Service Providers](https://www.databricks.com/solutions/industries/telco-industry-solutions)\n\n**S O L U T I O N**\nCustomer Retention, Subscriber Churn\nPrediction, Threat Detection\n\n**P L AT F O R M**\nLakehouse, Data Science, Machine Learning,\n[Data Streaming](https://www.databricks.com/product/data-streaming)\n\n**C LO U D**\n[Azure](https://www.databricks.com/product/azure)\n\n\n-----\n\n**Hadoop technology adds operational complexity and**\n**unnecessary costs**\n\nAT&T is a technology giant with hundreds of millions of subscribers and ingests\n10+ petabytes[ [a](https://www.databricks.com/blog/2022/04/11/data-att-modernization-lakehouse.html) ] of data across the entire data platform each day. To harness\nthis data, it has a team of 2,500+ data users across 60+ business units to ensure\nthe business is data powered — from building analytics to ensure decisions are\nbased on the best data-driven situation awareness to building ML models that\nbring new innovations to its customers. To support these requirements, AT&T\nneeded to democratize and establish a data single version of truth (SVOT) while\nsimplifying infrastructure management to increase agility and lower overall costs.\n\nHowever, physical infrastructure was too resource intensive. The combination\nof a highly complex hardware setup (12,500 data sources and 1,500+ servers)\ncoupled with an on-premises Hadoop architecture proved complex to\nmaintain and expensive to manage. Not only were the operational costs to\nsupport workloads high, but there were also additional capital costs around\ndata centers, licensing and more. Up to 70% of the on-prem platform had to\n\nbe prioritized to ensure 50K data pipeline jobs succeeded and met SLAs and\n\ndata quality objectives. Engineers’ time was focused on managing updates,\n\n\nWith these deeply rooted technology issues, AT&T was not in the best position\nto achieve its goals of increasing its use of insights for improving its customer\nexperience and operating more efficiently. “To truly democratize data across\nthe business, we needed to pivot to a cloud-native technology environment,”\nsaid Mark Holcomb, Distinguished Solution Architect at AT&T. “This has freed\nup resources that had been focused on managing our infrastructure and move\nthem up the value chain, as well as freeing up capital for investing in growthoriented initiatives.”\n\n**A seamless migration journey to Databricks**\n\nAs part of its due diligence, AT&T ran a comprehensive cost analysis and\nconcluded that Databricks was both the fastest and achieved the best price/\nperformance for data pipelines and machine learning workloads. AT&T knew the\nmigration would be a massive undertaking. As such, the team did a lot of upfront\nplanning — they prioritized migrating their largest workloads first to immediately\nreduce their infrastructure footprint. 
They also decided to migrate their data\nbefore migrating users to ensure a smooth transition and experience for their\nthousands of data practitioners.\n\n\nfixing performance issues or simply provisioning resources rather than focusing\n\n\non higher-valued tasks. The resource constraints of physical infrastructure\n\nalso drove serialization of data science activities, slowing innovation. Another\n\nhurdle faced in operationalizing petabytes of data was the challenge of building\n\nstreaming data pipelines for real-time analytics, an area that was key to\n\nsupporting innovative use cases required to better serve its customers.\n\n\n**The migration from Hadoop to Databricks enables us to bring**\n**more value to our customers and do it more cost-efficiently**\n**and much faster than before.**\n\n**Mark Holcomb**\nDistinguished Solution Architect, AT&T\n\n\n-----\n\nThey spent a year deduplicating and synchronizing data to the cloud before\nmigrating any users. This was a critical step in ensuring the successful migration\nof such a large, complex multi-tenant environment of 2,500+ users from 60+\nbusiness units and their workloads. The user migration process occurred over\nnine months and enabled AT&T to retire on-premises hardware in parallel with\nmigration to accelerate savings as early as possible. Plus, due to the horizontal,\nscalable nature of Databricks, AT&T didn’t need to have everything in one\ncontiguous environment. Separating data and compute, and across multiple\naccounts and workspaces, ensured analytics worked seamlessly without any API\ncall limits or bandwidth issues and consumption clearly attributed to the 60+\nbusiness units.\n\nAll in all, AT&T migrated over 1,500 servers, more than 50,000 production CPUs,\n12,500 data sources and 300 schemas. The entire process took about two and a\nhalf years. And it was able to manage the entire migration with the equivalent of\n15 full-time internal resources. “Databricks was a valuable collaborator throughout\nthe process,” said Holcomb. “The team worked closely with us to resolve product\nfeatures and security concerns to support our migration timeline.”\n\n**Databricks reduces TCO and opens new paths to**\n**innovation**\n\nOne of the immediate benefits of moving to Databricks was huge cost savings.\nAT&T was able to rationalize about 30% of its data by identifying and not\nmigrating underutilized and duplicate data. And prioritizing the migration of\nthe largest workloads allowed half the on-prem equipment to be rationalized\n\n\nduring the course of the migration. “By prioritizing the migration of our most\ncompute-intensive workloads to Databricks, we were able to significantly drive\ndown costs while putting us in position to scale more efficiently moving forward,”\nexplained Holcomb. The result is an anticipated 300% five-year migration ROI\nfrom OpEx savings and cost avoidance (e.g., not needing to refresh data center\nhardware).\n\nWith data readily available and the means to analyze data at any scale, teams\nof citizen data scientists and analysts can now spend more time innovating,\ninstead of serializing analytics efforts or waiting on engineering to provide the\nnecessary resources — or having data scientists spend their valuable time\non less complex or less insightful analyses. 
Data scientists are now able to\ncollaborate more effectively and speed up machine learning workflows so that\nteams can deliver value more quickly, with a 3x faster time to delivery for new\ndata science use cases.\n\n“Historically you would have had operations in one system and analytics in a\nseparate one,” said Holcomb. “Now we can do more use cases like operational\nanalytics in a platform that fosters cross-team collaboration, reduces cost and\nimproves the consistency of answers.” Since migrating to Databricks, AT&T now\nhas a single version of truth to create new data-driven opportunities, including\na self-serve AI-as-a-Service analytics platform that will enable new revenue\nstreams and help it continue delivering exceptional innovations to its millions\nof customers.\n\n\n-----\n\n#### About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "##### EBOOK\n\n# 8 Steps to Becoming an AI-Forward Retailer\n\n\n-----\n\n## Contents\n\n\nIntroduction .............................................................................................................................................................................................. **3**\n\nThe State of the Retail Industry:\n\nThe Diverging Performance of Data Leaders vs. Data Laggards ...................................................................................... **4**\n\nBegin With a Shared Vision of Success ....................................................................................................................................... **6**\n\nWhy Companies Struggle With Setting Clear Business Outcomes for AI ................................................................... **7**\n\nBefore Diving In: Assess Your Readiness ..................................................................................................................................... **9**\n\nGetting Started: Putting Some Wins on the Board .................................................................................................................. **11**\n\nGoing Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n\nNormalizing the Process: Engraining a Data-Driven Mindset\n\nInto the Fabric of the Business ...................................................................................................................................................... **14**\n\nFrom Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise .......................................... 
**16**\n\nThe 8 Steps to Building a Data-Forward Retailer ................................................................................................................... **17**\n\nTransform Retail Data Into Actionable Insights ....................................................................................................................... **21**\n\n\n-----\n\n## Introduction\n\n\nIn a world where data is king, retailers have historically been trailblazers, pioneering data technology\nadoption to supercharge their operations, enhance customer understanding and sharpen\npersonalization. The journey began with the simple cash register about 150 years ago, progressed to\nstandardized product reporting with the introduction of the UPC and EAN, and has evolved to include\ncutting-edge technologies such as RFID and machine learning.\n\nToday, we stand on the brink of “Generation AI,” defined by sophisticated language models and\nimages. Retailers, with their history of embracing data technologies, find themselves in a strong\nposition to reap the benefits of this new era. Automation of customer service, supply chain modeling\nwith digital twins and delivering hyper-personalized experiences in real time are all in the cards,\npromising to bolster revenue, improve margins and slash costs for early adopters.\n\nAccording to an internal analysis by Databricks, data pioneers are already outstripping their\ncompetition. The “Databricks 30” — an index tracking the publicly traded data and AI leaders across\nsix major industry sectors, including retail — shows these front-runners outperforming the rest of the\nmarket by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies\nare setting themselves up for significant gains and a robust competitive advantage.\n\nHowever, for retailers mired in the landscape of outdated data platforms, the transformation into an\nAI-driven organization can seem a Herculean task. Embracing this wave of innovative technologies may\nfeel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly\nevolving retail landscape.\n\nTo help you navigate the rapidly evolving world of retail and consumer goods, this eBook provides a\nroad map for organizations embarking on digital transformation journeys — a shift that is as much\nabout culture as it is about technology, if not more so. The core advice? Start with a crystal-clear\nvision for transformation, outlining a compelling case for why such change is vital for the company’s\nlong-term survival. Then, initiate the process by introducing AI to make gradual enhancements in\ncritical business procedures.\n\n\n-----\n\n## The State of the Retail Industry: The Diverging Performance of Data Leaders vs. Data Laggards\n\n\nThe pandemic’s fallout has led to a widening chasm between the retail industry’s\nleaders and laggards. McKinsey & Company encapsulated this trend succinctly:\n“Companies with tech-forward business models, who were already pulling ahead\npre-crisis, left their competitors in the dust.”\n\nBut what exactly is a “tech-forward business model”? It isn’t a simple narrative of\ndigital natives dethroning traditional retailers. Heavyweights like Walmart, Target\nand Costco held their own against Amazon. 
Nor was it purely a matter of scale —\nsmaller brands like Warby Parker or Everlane managed to carve out substantial\nconsumer bases, competing against larger, established players.\n\n**The common denominator among all victors**\n**was their ability to harness data, analytics and AI**\n**to rapidly react to shifts in consumer behavior.**\n\nThese businesses deftly used consumer demand insights to understand the\neffects of supply chain disruptions and labor shortages and reallocate resources\nto mitigate the most harmful impacts. They adeptly introduced new delivery\nmethods, optimizing operations to alleviate the pressure these modes exerted\non margins. They successfully established tighter partnerships with suppliers\nand logistic entities, collaborating toward shared triumphs.\n\nIn all these instances, it was their timely access to information, foresight\ndriven by this data, and the exploration of probable outcomes that set these\norganizations apart. Infusing data-driven decision-making into core processes\nwithin the organization, as well as those crossing partner boundaries, unlocked\nthis approach’s full potential.\n\nTo illustrate the significance of prioritizing data and AI, we developed the\nDatabricks 30 Index. Drawing inspiration from Morgan Stanley’s “Data Era”\nstocks research, this index tracks marquee customers across our top five\nverticals and partners. The Databricks 30 is an equal-weight price index,\ncomposed of five marquee customers each across Retail/Consumer Products,\nFinancial Services, Healthcare, Media/Entertainment, Manufacturing/Logistics,\nplus five strategic partners.\n\n\n-----\n\nOur analysis reveals that companies in the Databricks 30 Index outpaced the\nS&P 500 by an impressive +21 percentage points (pp) over the past three years.\nIn other words, if the stock market rose by 50% during this period, the Databricks\n30 Index would have soared by 71% (outperforming by 21pp). Even more\nremarkable, excluding tech entirely from the Databricks 30, the Databricks 30\nex-Tech index outperforms the S&P 500 by an even larger margin over the same\ntime frame: +23pp.\n\n[Chart: DB30 vs. DOW30 index performance by date, 01-01-2019 through 01-01-2023]\n\nSimilar to Morgan Stanley’s analysis, we find that non-tech U.S. companies that\nare investing in cloud, data and innovation do, in fact, win.\n\nSo now that we see the impact, let’s dive into the steps retail organizations can\ntake to put themselves on a trajectory of continued growth and success amid an\never-changing landscape.\n\n\n-----\n\n## Begin With a Shared Vision of Success\n\n\nThe most overlooked activity in becoming an AI-forward retailer is the most\ncrucial. In the rush to secure a position on the AI frontier, many companies\nare leaping before they look, embarking on AI initiatives without a clear\nunderstanding of what they want to achieve. Simply adopting the newest,\nshiniest tech tools isn’t a silver bullet. Many companies set themselves up for\nfailure by neglecting to clearly define the expected business outcomes at the\nonset of the initiative, even though defining them up front is a strategic move\nthat can effectively reduce project risk and costs and lead to the ultimate\nsuccess of the program. In fact, in an attempt\nto accelerate results, this cavalier approach can instead spiral into expensive\nmistakes, wasted resources and a decrease in trust for stakeholders from\nunmet expectations. 
It’s like setting sail on an open ocean without a destination\nin mind; the journey might provide some interesting detours, but it lacks\ndirection and purpose.\n\nHowever, when organizations take the time to articulate their expected\nbusiness outcomes before deploying AI and data-driven programs, they position\nthemselves to reduce project risk and costs. By aligning AI initiatives with\nspecific business objectives and creating a shared vision with stakeholders,\nthe focus becomes less about the technology itself and more about how it\ncan be used to reach these defined goals.\n\n\nTechnology decisions, too, are improved by having a known target. Without\nclear business outcomes in mind, companies tend to design, develop and\nimplement technologies that _might_ be needed to solve the problem. Aligning\nthe technical road map and activities with business outcomes mitigates the\nrisk of misallocated resources and the potential fallout from the unfulfilled\npromise of AI.\n\nFurthermore, a clear understanding of expected business outcomes allows\nfor efficient project management and cost control. Companies can set key\nperformance indicators (KPIs) tied directly to these outcomes. This not only\nprovides a means to measure progress, but also helps control costs by\nensuring that resources are targeted toward initiatives that deliver value.\n\nIt’s not just about numbers either; having explicit objectives aids in cultivating\n\nstakeholder buy-in. Clear communication about the purpose and potential\nbenefits of an AI initiative can foster support from executives, employees,\ninvestors and customers alike. This collective backing can further mitigate risk\nand cut costs by ensuring that everyone is pulling in the same direction.\n\n\n-----\n\n## Why Companies Struggle With Setting Clear Business Outcomes for AI\n\n\nGetting started with AI at your organization might be daunting, and that’s\nbecause it is a big undertaking! Struggling to define clear outcomes for AI\nprojects is a common issue among many businesses for a variety of reasons.\nHere are some key factors that contribute to this challenge:\n\n**They believe the data strategy is a technology problem.**\n\nCompanies often hire a chief data officer, or make the data strategy\nthe responsibility of the technology organization.\n\n**They lack an understanding of their business processes**\nAn alarming number of businesses jump onto the AI bandwagon without\nunderstanding how their business operates. Decisions are made at\nthe leadership level, but how they translate to operational decisions is\nmuddled. Data and AI are fundamentally business process technologies,\n\nand without fully understanding how the business works, any initiative\nin data and AI is bound to have limited success.\n\n\n**They lack a data culture**\n\nSomewhat related to the previous point, many companies have teams\nthat make decisions based on experience and intuition. These should\nnot be discounted, but the reason for intuition is often a result of a\npoor definition of processes, which prevents the ability to measure\nand improve processes.\n\n**They struggle to get high-quality data**\n\nAI projects require good-quality, relevant data. 
Many businesses\nstruggle with issues related to data access, quality, privacy and\nsecurity, which can complicate the process of defining clear outcomes.\n\n**They lack the organizational structures required**\n\nImplementing AI often requires significant changes in business\n\nprocesses, organizational structures and even corporate culture.\nMany companies find it hard to manage these changes, leading to\ndifficulties in setting and achieving clear outcomes.\n\n\n-----\n\nData and AI programs are a business process problem first, and a\ntechnology problem last. Familiarity with technology is important, but\nirrelevant if companies do not understand it.\n\nAddressing these challenges often requires companies to invest in\neducation about AI capabilities, to formulate clear strategies, to manage\nchange effectively, and to bring on board the necessary skills either\nby hiring new talent or upskilling existing employees. It’s a journey that\nrequires commitment, but the potential benefits of successful AI initiatives\nmake it a worthwhile venture.\n\n\n**They don’t have the right people in place**\n\nThere’s often a gap between the skills available within a company and\nthe skills needed to define and achieve AI outcomes. Without team\nmembers who understand AI, data analysis and project management,\nbusinesses can struggle to set clear objectives for AI initiatives.\n\n**They struggle to quantify the value of AI projects**\n\nAI’s benefits can sometimes be intangible or long-term, making them\ndifficult to quantify. Companies may struggle to define outcomes in\nmeasurable terms, complicating the process of setting objectives\nand monitoring progress.\n\n\n-----\n\n## Before Diving In: Assess Your Readiness\n\n\nThere is a growing sense of urgency for organizations relatively new to data\nand AI-driven enablement to “get in the game.” Profiles of top performers and\nheadline-making achievements create a clearer sense of what is possible\nand what can be gained, leaving those entering into the space eager to achieve\nsimilar results.\n\nBut what’s missing in those articles are the sustained investments in\nprocess, people and technology and the numerous challenges, missteps and\noutright failures that had to occur before success was achieved. Data-driven\ntransformation is a journey, and before any successful journey is pursued,\nit’s wise to reflect on the organization’s readiness so that you can anticipate\nchallenges and identify areas for remediation and improvement that will\ndeliver you to your intended destination.\n\nWith this in mind, we encourage organizations new to this space to\nassess their maturity in terms of the use and management of their existing\ninformation assets:\n\n1. How easily discoverable and accessible are data in\nyour environment?\n\n\n3. Is the quality of these data formally verified?\n\n4. Are key entities such as products and customers actively\nmanaged, and can data related to these items be easily linked\nacross various data sources?\n\n5. How quickly are data made available for analysis following their\ncreation or modification? Is this latency aligned with how you\nmight use this data?\n\n6. Are processes established for determining appropriate uses of\ndata, governing access and providing oversight on consumption?\n\n7. 
Is there one individual responsible for effective data management\nacross the enterprise, and has this person established a\n\nprocess for receiving and responding to feedback and shifting\norganizational priorities?\n\nThis list of questions is by no means exhaustive, but it should help to identify\nblockers that are likely to become impediments down the road.\n\n\n2. How well understood are these information assets?\n\n\n-----\n\nSimilarly, we would encourage organizations to assess their maturity in terms of\nanalytics capabilities:\n\n1. Is business performance at all levels assessed in terms of\nkey metrics?\n\n2. How frequently are data-driven analyses used in making key\nbusiness decisions?\n\n3. To what degree are advanced analytics techniques\n— i.e., data science — used in decision-making processes?\n\n4. Are predictive models regularly leveraged as part of operational\nbusiness processes?\n\n5. How is experimentation used to assess the performance of\nvarious initiatives?\n\n\nLastly, and probably most importantly, we’d encourage the organization to\nperform a frank assessment of its readiness to embrace change. Becoming a\ndata-driven enterprise is fundamentally about operating differently than before.\nDecision-making authority becomes more diffuse and often more automated.\nProject outcomes become less certain as the organization focuses on innovation\nwhere learning is emphasized over predictable results. Process silos often\nbecome more intertwined as new modes of engagement evolve.\n\nWhen done right, this transition creates a healthy tension between what’s\nneeded to be successful today and what’s needed to be successful tomorrow.\nBut this can also manifest itself as employee resistance and political infighting\nas processes and organizational structures evolve. What’s often needed to\novercome this is strong leadership, a clear vision and mandate for change as\nwell as a reassessment of incentive structures and active organizational change\nmanagement as the organization transitions into this new way of working.\n\n\n6. Are predictive models used to automate key business decisions?\n\n\n7. Has the organization embraced a model of continuous deployment\nfor the regular update of model-driven processes?\n\n\n**TRADITIONAL APPROACH**\n\n**Upfront reqs** **Technical implementation** **Production**\n\n\n**ITERATIVE APPROACH**\n\n\nContinuous feedback\n\n\n**Business questions** **Testing** **Production** **Optimization**\n\nContinuous learning and optimization\n\nAn iterative approach involves the use of data to continually optimize the performance of data products.\n\n\n-----\n\n## Getting Started: Putting Some Wins on the Board\n\n\nWith the organization ready to proceed, the next phase is about learning to\ndeliver new solutions within your organization. There will be new technologies\nto deploy and new skills to develop, and there will be new patterns for\nintegration into business workflows and procedures for incremental updates\nand improvements. But most importantly, there will need to be a new level of\npartnership and trust between the business and the technology sides of the\norganization that needs to be carefully nurtured.\n\nThe best way we have found to do this is to start with projects that improve\non existing operational workflows, i.e., do what you do, but do it smarter.\nThe business is often familiar with existing pain points and can more clearly\nenvision how a new capability can be folded into its processes. 
They are also\nfamiliar with how to assess the impact a new approach may have on their\nbusiness and can help design tests to validate whether the intended results\n\n\nAs capabilities demonstrating value over the status quo are developed, they\nare folded into business processes. This is not a one-and-done effort but part\nof an ongoing cycle of deployment to continue so long as the team has a line\nof sight to meaningful gains. The team does not wait for the ideal solution but\ninstead focuses on incremental improvements that deliver measurable value\nalong the way.\n\nOversight for this process is provided by another body, one tasked with the\nsuccess of the overall transformative efforts within the business. As success\nis delivered, there will be growing demand for the time and talents of these\nteams, and the organization will need to prioritize resources across an increasing\nnumber of opportunities. This steering committee will need to be responsible for\nallocating limited resources and advocating for additional ones as well to strike\nthe right balance of investments for the organization.\n\n\nare or are not being delivered.\n\n\n**DEMAND FORECASTING**\n\nDemand forecasting is a massive challenge for retail and consumer goods\n\norganizations. And one where even an incremental change can have a massive impact,\n\nso it’s often one of the first projects organizations identify to put a win on the board.\n\nAccording to [McKinsey](https://www.mckinsey.com/featured-insights/artificial-intelligence/notes-from-the-ai-frontier-applications-and-value-of-deep-learning) , a 10% to 20% improvement in supply chain forecasting\n\naccuracy is likely to produce a 5% reduction in inventory costs and a 2% to 3%\n\nincrease in revenues. To hit the ground running, check out the [Databricks Solution](https://www.databricks.com/solutions/accelerators/demand-forecasting)\n\n[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n\nkey use cases.\n\n\nWork on these projects is a collaborative effort between the business and IT.\nTogether, the project team explores a potential solution with a notion of how it\nmay be integrated in mind from the outset. As the project unfolds, all members\nare part of the iterative cycles and help to steer the solution in new directions\nuntil an item of value is derived.\n\n\n-----\n\n## Going Big: Learning to Embrace Transformational Change\n\n\nWith some experience under your belt, it’s time to build on the organizational\nmuscle developed during initial efforts and flex for more transformative impact.\nAgain, the focus is on established functions within the business, but instead of\npointed, incremental improvements, the team begins to create a vision for the\npart of the organization that would operate if it were to fully embrace data and\nAI enablement.\n\nIt’s at this phase that many of the concerns about organizational resistance\nmentioned earlier are most likely to manifest themselves. Ideally, initial\nimplementation efforts have built champions within the business, but it’s still\nimportant to be mindful of pushback that can emerge as the organization more\nfully begins to change. 
Having and maintaining strong business sponsorship\nin this phase is critical, and having that sponsor articulate and regularly\nreinforce a clear vision for the change that’s now underway can help everyone\n\nunderstand the need to support these efforts.\n\n\nSo far in this exploration of the journey to data and AI transformation, we’ve\nminimized the importance of technology in order to focus on the business and\norganizational aspects that often get neglected in this conversation. But it’s\nat this stage that the organization needs to have established its preference\nfor data and analytics platforms. Because of the breadth of needs that will\nhave to be addressed and the ongoing innovation taking place in the data\nscience community, we strongly suggest standardizing on a platform that is\nopen and flexible while also providing cost-effective use of both infrastructure\nand people resources and strong data governance and protection. For many\norganizations, the Databricks Lakehouse Platform has proven itself to be the\nideal platform to meet these needs.\n\n**WHY STANDARDIZE ON DATABRICKS?**\n\nThe Databricks Lakehouse is the only enterprise data and AI\n\nplatform that allows retailers to leverage all of their data, from any\n\nsource, on any workload to always offer more engaging customer\n\nexperiences driven by real-time data, at the lowest cost and with\n\nthe greatest investment protection.\n\n\n-----\n\nBut simply standardizing on a platform is not enough. The organization\nneeds to work through the roles and responsibilities around the use of this\nplatform and processes for moving things from experimentation and formal\ndevelopment to testing and operationalization.\n\nThe importance of having an MLOps strategy really comes to life at this\nphase. This doesn’t mean your strategy around MLOps can’t change, but this\nphase is when you want to think about and define your answers to some key\nquestions such as the following:\n\n1. How do we evaluate new and existing (retrained) models as\npart of their movement from development to production?\n\n2. How do we determine when a model should be retrained?\n\n3. What are the preferred mechanisms for production deployment?\n\n4. How do we fall back should we have a deployment problem?\n\n5. What are the service level expectations for the\ndeployment processes?\n\n\n###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n\n**Numan Ali**\n\nSolutions Architect, Data and Analytics Center of Excellence at Pandora\n\n\n-----\n\n## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n\n\nToo often, leadership views innovation as a destination and not a process\n(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\ndata-driven organization overnight and then it’s done. Yes, there will be an\nupfront investment, but there will also be ongoing investment in order to\nsupport sustained innovation.\n\nIronically, one of the major obstacles to this change is viewing the goal as\nsimply delivering a project or projects. Think about it — just 12 months ago,\nonly a few specialists in academia and industry were talking about generative\nAI and large language models (LLMs). 
Today, [retailers have to integrate this](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html)\n[new technology](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html) or fall behind others who will find a way to create more\npersonalized consumer experiences with it.\n\n\nTechnology, especially when it comes to data and AI, moves far too quickly.\nWhat retailer tech teams need to deliver at the end of the day is applications,\nof course, but also the ability to react quickly to change. What sort of ongoing\ninvestments in terms of people, process and technology do retailers need to\nfoster in order to ingrain an innovation mindset?\n\nThis is an ongoing balancing act where organizations need to innovate and look\nfor new opportunities but also sustain that innovation in a way that is realistic\nfor the business. For this, let’s consider the 70-20-10 rule: the idea that\ncompanies should allocate 70% of innovation investment to core initiatives,\n20% to adjacent ones and 10% to transformational ones, or “moonshots.” While\nnot a hard-and-fast rule, this concept was touted by Google co-founder Larry\nPage in a [Fortune magazine article](https://www.google.com/url?q=https://money.cnn.com/2008/04/29/magazines/fortune/larry_page_change_the_world.fortune/&sa=D&source=editors&ust=1690998645852122&usg=AOvVaw2AHj-fx8XkEeMKP2Ts5gDu) , and was validated by a [study conducted](https://hbr.org/2012/05/managing-your-innovation-portfolio)\n[by Harvard Business Review](https://hbr.org/2012/05/managing-your-innovation-portfolio) , which found that companies following the rule\n\noutperformed their peers, typically realizing a P/E premium of 10% to 20%.\n\n\n-----\n\nThe goal of the 70-20-10 rule is to help guide the organization toward\nsustained innovation and spend the bulk of time on the core business. This is\npart of why we recommend starting first with fast (just 2- to 3-month total)\npilot projects to use AI on existing business use cases like demand forecasting\nand call center optimization. By working in these areas with a focus on learning\nand iterating, retailers will soon find where data silos and rigidity exist in the\nsystem. As these foundational barriers are knocked down, it then makes it\npossible to tackle more transformational use cases and start to build the\ncharacteristics of a data-forward enterprise. 
In other words, start to utilize\ndata and data-driven insights as a primary driver for decision-making and\noperations, while also prioritizing continuous data analysis and improvement.\n\n\n**TRANSFORMATIVE**\n\n\n**ADJACENT**\n\n\n**CORE**\n\n\n###### Companies that allocated about 70% of their innovation activity to core initiatives, \n### 20% to adjacent ones and 10% to\n###### transformational ones outperformed their peers.\n\n**Bansi Nagji & Geoff Tuff**\n_Managing Your Innovation Portfolio_\nHarvard Business Review, May 2012\n\n\n-----\n\n## From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise\n\n\nSo what does it take to successfully embark on this\njourney to becoming a data-forward enterprise?\nFirst and foremost, you need to not only establish\na baseline understanding of what has occurred by\nexamining historical data but leverage advancements\nin technologies (e.g., streaming, computer vision,\nvoice recognition) to make predictions of the future.\n\nThrough the use of both historical data and\npredictive techniques such as forecasting,\nrecommendations, prescriptive care and nextbest-action, organizations can begin to improve\ndecisions and, in some cases, automate certain\ndecision-making processes. But rather than moving\n\nfrom historical views to predictive actions in a\nlinear fashion, this journey involves addressing both\napproaches simultaneously. Once you are able to\nunify historical and predictive analysis, you can then\ntake significant steps toward becoming a dataforward enterprise.\n\n\n##### The Data-Forward Enterprise\n\nData, analytics and AI working in concert\n\n\n**Data Purgatory**\nThings are better, but data isn’t\ndriving the business\n\n\n**Data Maturity**\nEvery aspect of the\nbusiness is supported\nby insights and AI\n\n\n**Data Siloed**\nData and teams are segregated\ninto different systems\n\nDATA MATURITY\n\nBeing data-forward means silos cease to exist, and data, analytics and AI are informing every aspect of the business.\n\n\n-----\n\n## The 8 Steps to Building a Data-Forward Retailer\n\n\nBefore you start your data-forward journey, a few critical steps must be\nconsidered to establish a solid foundation to build upon. Based on our\nwork with the largest and most successful retailers in the world, spanning\nstartups to global giants, we at Databricks have seen that the most successful\nfollowed these steps to effectively gain wallet share, whereas those who\ncouldn’t would often leave major gaps that competitors could take advantage\nof. These steps are the basics to prepare businesses for where they need\nto be both now and in the near future.\n\n\n**2** **Get grounded: Understand the technology**\n\nTo start, business leaders need to ground themselves in technology, especially\nwhen it comes to AI. AI can do amazing things, but it is not magical and vendors\nare prone to overpromising and underdelivering. 
Less than getting deep into\ncode, the purpose is to understand the limitations and ideal use cases.\n\nDatabricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\nstarting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\nperspective of how different brands are using data, analytics and AI to drive\nrevenue or cut operational costs.\n\n\n**1** **Set the foundation: Define goals and objectives**\n\n\nThe best way to avoid shiny object syndrome (where you start out with a\n\ntechnology and then try to figure out what to do with it) is to first identify the\nproblems you want to solve. From there, you can set goals around innovation\nto align incentives, and, most importantly, ensure you are driving specific\nbusiness outcomes such as improving customer engagement, optimizing\ninventory management or increasing sales.\n\n\n**3** **Understand the skills and processes in your business**\n\nAs we will get into in step 4, starting with smaller pilot projects enables you\nto not just deliver a quick win and validate the use of AI in the enterprise, but\nalso understand the in-house capabilities in terms of people, process and\ntechnology to deliver technical projects. And if required, be willing and ready\nto hire people with the right skill sets that can help you make the most of your\ndata. For example, building a core team of data analysts can help extract deep\ninsights that lead to better decision-making and identify opportunities for\ngrowth. It is critical at this step to define the roles you need, determine how\nyou will source for those roles (via external hiring or internal transfer), and\nensure those roles have opportunities for career progression.\n\n\n-----\n\nFor inspiration and a head start, check out our [Solution Accelerators for Retail](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods)\n[& Consumer Goods](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods) . These free resources were created to help our customers\nsave hours of discovery, design, development and testing. Our purpose-built\nguides — fully functional notebooks and best practices — speed up results\nacross your most common and high-impact use cases and enable you to go\nfrom idea to proof of concept (PoC) in as little as two weeks. We have over\n20 accelerators built specifically for critical retail and consumer goods use\ncases, from Demand Forecasting and On-Shelf Availability to Recommendation\nEngines and Customer Lifetime Value. We also have a set of Solution\nAccelerators specifically for [LLMs in Retail & Consumer Goods.](https://www.databricks.com/solutions/accelerators/large-language-models-retail)\n\n**5** **Implement data management and governance early**\n\nThe first step to successfully implementing AI/ML in your business broadly\nis to ensure you have accurate, reliable and current data to train your\nmodels against. This data can (and should) come from a variety of sources,\nso it’s key to unify all data types and sources (sales transactions, customer\nfeedback, social media) in a centralized location that is easily accessible,\nwhile not losing sight of data security to maintain customer trust. 
Setting\nup data governance parameters to control who has which kinds of access\nto what data, and being able to audit the history of this access, will actually\naccelerate innovation while ensuring data security and compliance.\n\n\n**Delivering exactly what customers want,**\n**every time, and on time**\n\nData is at the heart of Gousto’s mission to change the\nway people eat through the delivery of boxes of fresh\ningredients and easy-to-follow recipes. However, even\nas their business exploded at the start of the pandemic,\ntheir systems couldn’t ingest data fast enough, couldn’t\ntalk to each other and wouldn’t scale — forcing them to\ntemporarily stop accepting new customers. Now Gousto is\nset up to achieve exciting ambitions for menu expansion,\nsophisticated personalization and next-day delivery. Learn\nhow they did it.\n\n**[READ THE FULL GOUSTO STORY](https://www.databricks.com/customers/gousto)**\n\n**4** **Start small: Pilot a project**\n\nThere is no substitute for rolling your sleeves up and running a pilot project to\nevaluate the feasibility and potential impact of a project before implementing\nit on a larger scale. When selecting a pilot project, we recommend starting with\na project that will deliver clear business value, such as incremental revenue\nor clear cost savings, yet only takes 2-3 months to complete. The more time\nthere is between project inception and seeing results, the more likely it will lose\nmomentum internally.\n\n\n-----\n\n**6** **Incorporate AI across the business (starting with daily tasks)**\n\nGiven the large upfront investment in data scientists and engineers to build\nan AI program, the ROI will come from using it at scale. Constantly look to\nuncover patterns and repeatable processes that can be optimized or fully\nautomated with AI.\n\n**Building a global fashion icon with a**\n**customer-first approach**\n\nBritish luxury brand Burberry was seeking an efficient way to\nannotate its thousands of highly specific marketing assets\nfor better targeting. Working with Labelbox within Databricks\nLakehouse, they are now able to complete image annotation\nprojects in hours instead of months. And marketing team\nmembers now have access to powerful content insights\nwithout needing to ask data scientists for help.\n\n**[READ THE FULL BURBERRY STORY](https://www.databricks.com/customers/burberry)**\n\n**Customizing interactions that convert clicks**\n**to revenue with Databricks Lakehouse**\n\nGlobal jewelry manufacturer and retailer Pandora needed a\nunified view of all their data where they could easily segment,\ncategorize and analyze to deliver custom messaging to\nconsumers. With Databricks Lakehouse, they now have the\ninsights they need to deliver highly targeted messaging —\nincreasing consumer engagement from the initial opening of\na marketing email to maximizing shopping bag conversions to\ndriving revenue on the website.\n\n**[READ THE FULL PANDORA STORY](https://www.databricks.com/customers/pandora)**\n\n\n**Building an operationally efficient**\n**omnichannel business**\n\nThe Hershey Company analyzes the data they need to\nstay in front of changing human behavior and delight their\ncustomers. 
With Databricks Lakehouse, they can analyze\ndata feeds from their largest retail customer — uncovering\ninsights that will help extend their industry leadership.\n\n**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n\n\n**Ushering in a new era**\n**of data-driven retailing**\n\nOutdoor apparel brand Columbia Sportswear has enabled\ndata and analytics self-service throughout the organization in\na way that ensures everyone is working from a single source\nof truth. Whichever data team needs access to the data,\nDatabricks Lakehouse gives them the confidence that the\ndata is reliable and consistent.\n\n**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n\n\n-----\n\n**7** **Foster a culture of data-driven decision-making**\n\nWhat does it mean to have a culture of data-driven decision-making? In\npractice, it means empowering all employees to use data to inform their\ndecisions. Only some strategic decisions will be based on complete and\naccurate information. It’s unwise to assume otherwise. The right approach\nis to leverage as much data as possible, from past tests or current efforts,\nto mitigate risk. Leaders need to not only ask for data but also ensure\nthat their employees will be able to find the data they need.\n\n**Unlocking critical trends and insights**\n**needed to serve our 180 million customers**\n\nReckitt, the maker of Lysol as well as hundreds of other\nhousehold brands, was looking to deliver best-in-class\ncustomer experiences to their over 180 million customers\nspanning the globe. With Databricks Lakehouse, Reckitt\nhas established a data-first culture by surfacing real-time,\nhighly accurate, deep customer data insights that have\nled to a better understanding of international market\ntrends and demand across the multiple product lines\nthey support.\n\n**[READ THE FULL RECKITT STORY](https://www.databricks.com/customers/reckitt)**\n\n\n**Customer 360 to enable faster speed**\n**to market, better results**\n\nThe Middle East’s Al-Futtaim serves as a local distributor\nfor global brands such as Toyota, IKEA and Ace Hardware.\nWith Databricks Lakehouse serving as a unified platform to\naggregate and analyze various data sources on all customers,\nthey have created a “golden customer record” that improves\nall decision-making, from forecasting demand to powering\ntheir global loyalty program.\n\n**[READ THE FULL AL-FUTTAIM STORY](https://www.google.com/url?q=https://www.databricks.com/customers/al-futtaim&sa=D&source=editors&ust=1690998645853527&usg=AOvVaw3cs-6mM2ANTKDCzTdTvEYH)**\n\n**8** **Continuously evaluate and improve**\n\nRecognize that establishing a data-driven culture is an ongoing journey and\nnever a set destination. Constantly evaluate your data collection, analysis and\ndecision-making process to identify areas for improvement. Even small and\nconstant incremental improvements will deliver large gains in absolute terms\nwhen applied at scale. You can always personalize more, forecast better, or\nbetter manage your supply chain as you bring in better data sources and refine\nyour models.\n\n\n-----\n\n## Transform Retail Data Into Actionable Insights\n\n\nBecoming data forward is not a crazy idea. 
Too often, leaders or organizations\nallow themselves to be intimidated by focusing on large-scale transformations.\nBut it’s the small operational changes that can make your business more efficient\nas well as shift the larger culture forward. Once you’ve set this foundation, it then\nallows you to move toward bigger things. These steps may fail, but it’s actually\npositive to have these setbacks to learn from to try again. The bigger risk is to\nnot try and thus fall behind competitors who are embracing the internal changes\nneeded to take advantage of AI and machine learning.\n\nCore to delivering on these steps to become a data-forward retailer is a solid\ndata foundation that can unify your data and AI workloads with sharing and\ngovernance built in, so internal and external teams can get access to the\ndata they need when they need it. With the [Databricks Lakehouse for Retail](https://www.databricks.com/solutions/industries/retail-industry-solutions) ,\ncompanies gain valuable insights into customer behavior, optimize supply chain\n\noperations and make informed business decisions in real time.\n\n\nEXPLORE DATABRICKS LAKEHOUSE FOR RETAIL\n\nAccess key resources to understanding how a lakehouse\nfor retail can set you on the path toward becoming a\ndata-forward organization.\n\n**[LEARN MORE](https://www.databricks.com/explore/retail-resources)**\n\n\n#### Visit our website to learn more about Databricks Lakehouse for Retail.\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast, and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks#account)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "### eBook\n\n# The Big Book\n of MLOps\n\n#### A data-centric approach\n to build and scale AI,\n including LLMOps\n\nM o d e l O p s D a t a O p s D e �O p s\n\n\n-----\n\n## Contents\n\n**A U T H O R S :**\n\n**Joseph Bradley**\n\nLead Product Specialist\n\n**Rafi Kurlansik**\n\nLead Product Specialist\n\n**Matt Thomson**\n\nDirector, EMEA Product Specialists\n\n**Niall Turbitt**\n\nLead Data Scientist\n\n\n**C H A P T E R 1 :** \u0007 **Introduction** 3\n\n###### People and process 4\n\n People 5\n\n Process 6\n\n Why should I care about MLOps? 
8\n\n Guiding principles 9\n\n**C H A P T E R 2 :** \u0007 **Fundamentals of MLOps** 11\n\n###### Semantics of dev, staging and prod 11\n\n ML deployment patterns 15\n\n**C H A P T E R 3 :** **MLOps Architecture and Process** \u0007 19\n\n###### Architecture components 19\n\n Data Lakehouse 19\n\n MLflow 19\n\n Databricks and MLflow Autologging 20\n\n Feature Store 20\n\n MLflow Model Serving 20\n\n Databricks SQL 20\n\n Databricks Workflows and Jobs 20\n\n Reference architecture 21\n\n Overview 22\n\n Dev 23\n\n Staging 27\n\n Prod 30\n\n**C H A P T E R 4 :** \u0007 **LLMOps – Large Language Model Operations** 36\n\n###### Discussion of key topics for LLMOps 39\n\n Reference architecture 46\n\n Looking ahead 48\n\n\n-----\n\n**CHAPTER 1:**\n## Introduction\n\n**Note:** Our prescription for MLOps is general to\n\nany set of tools and applications, though we give\n\nconcrete examples using Databricks features\n\nand functionality. We also note that no single\n\narchitecture or prescription will work for all\n\norganizations or use cases. Therefore, while we\n\nprovide guidelines for building MLOps, we call out\n\nimportant options and variations. This whitepaper\n\nis written primarily for ML engineers and data\n\nscientists wanting to learn more about MLOps,\n\nwith high-level guidance and pointers to more\n\nresources.\n\n\nThe past decade has seen rapid growth in the adoption of machine learning (ML). While the early\n\nadopters were a small number of large technology companies that could afford the necessary resources,\n\nin recent times ML-driven business cases have become ubiquitous in all industries. Indeed, according to\n\nMIT Sloan Management Review, 83% of CEOs report that [artificial intelligence (AI) is a strategic priority](https://sloanreview.mit.edu/projects/artificial-intelligence-in-business-gets-real/) .\n\nThis democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n\n[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n\nHowever, building and deploying ML models is complex. There are many options available for achieving\n\nthis but little in the way of well-defined and accessible standards. As a result, over the past few years we\n\nhave seen the emergence of the machine learning operations (MLOps) field. **MLOps is a set of processes**\n\n**and automation for managing models, data and code to improve performance stability and long-term**\n\n**efficiency in ML systems.** Put simply, MLOps = [ModelOps](https://en.wikipedia.org/wiki/ModelOps) + [DataOps](https://en.wikipedia.org/wiki/DataOps) + [DevOps](https://en.wikipedia.org/wiki/DevOps) .\n\nThe concept of developer operations (DevOps) is nothing new. It has been used for decades to deploy\n\nsoftware applications, and the deployment of ML applications has much to gain from it. However, strong\n\nDevOps practices and tooling alone are insufficient because ML applications rely on a constellation of\n\nartifacts (e.g., models, data, code) that require special treatment. 
Any MLOps solution must take into\n\naccount the various people and processes that interact with these artifacts.\n\nHere at Databricks we have seen firsthand how customers develop their MLOps approaches, some of\n\nwhich work better than others. We launched the open source [MLflow](https://www.mlflow.org/) project to help make our customers\n\nsuccessful with MLOps, and with over 10 million downloads/month from PyPI as of May 2022, MLflow’s\n\nadoption is a testament to the appetite for operationalizing ML models.\n\nThis whitepaper aims to explain how your organization can build robust MLOps practices incrementally.\n\nFirst, we describe the people and process involved in deploying ML applications and the need for\n\noperational rigor. We also provide general principles to help guide your planning and decision-making. Next,\n\nwe go through the fundamentals of MLOps, defining terms and broad strategies for deployment. Finally, we\n\nintroduce a general MLOps reference architecture, the details of its processes, and best practices.\n\n\n-----\n\n#### People and process\n\n**Figure 1** ML workflow and personas: the data governance officer, data engineer, data scientist, ML engineer and business stakeholder collaborate across data preparation, exploratory data analysis, feature engineering, model training, model validation, deployment and monitoring.\n\n\n-----\n\n#### People\n\nBuilding ML applications is a team sport, and while in the real world people “wear many hats,” it is still\n\nuseful to think in terms of archetypes. They help us understand roles and responsibilities and where\n\nhandoffs are required, and they highlight areas of complexity within the system. We distinguish between\n\nthe following personas:\n\n**M L P E R S O N A S**\n\n**Data Governance Officer:** Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.\n\n**Data Engineer:** Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.\n\n**Data Scientist:** Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.\n\n**ML Engineer:** Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment ( [CI/CD](https://en.wikipedia.org/wiki/CI/CD) ).\n\n**Business Stakeholder:** Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate.\n\n\n-----\n\n#### Process\n\nTogether, these people develop and maintain ML applications. While the development process follows\n\na distinct pattern, it is not entirely monolithic. The way you deploy a model has an impact on the steps\n\nyou take, and using techniques like reinforcement learning or online learning will change some details.\n\nNevertheless, these steps and personas involved are variations on a core theme, as illustrated in Figure 1\n\nabove.\n\nLet’s walk through the process step by step. 
Keep in mind that this is an iterative process, the frequency of\n\nwhich will be determined by the particular business case and data.\n\n**M L P R O C E S S**\n\n\nData\nPreparation\n\n\nExploratory\nData Analysis\n\n\nFeature\nEngineering\n\n\nModel\nTraining\n\n\nModel\nValidation\n\n\nDeployment Monitoring\n\n\n###### Data preparation\n\nPrior to any data science or ML work lies the data engineering needed to prepare production data and make\n\nit available for consumption. This data may be referred to as “raw data,” and in later steps, data scientists\n\nwill extract features and labels from the raw data.\n\n###### Exploratory data analysis (EDA)\n\nAnalysis is conducted by data scientists to assess statistical properties of the data available, and determine\n\nif they address the business question. This requires frequent communication and iteration with business\n\nstakeholders.\n\n\n-----\n\n###### Feature engineering\n\nData scientists clean data and apply business logic and specialized transformations to engineer features for\n\nmodel training. These data, or features, are split into training, testing and validation sets.\n\n###### Model training\n\nData scientists explore multiple algorithms and hyperparameter configurations using the prepared data, and\n\na best-performing model is determined according to predefined evaluation metric(s).\n\n###### Model validation\n\nPrior to deployment a selected model is subjected to a validation step to ensure that it exceeds\n\nsome baseline level of performance, in addition to meeting any other technical, business or regulatory\n\nrequirements. This necessitates collaboration between data scientists, business stakeholders and ML\n\nengineers.\n\n###### Deployment\n\nML engineers will deploy a validated model via batch, streaming or online serving, depending on the\n\nrequirements of the use case.\n\n###### Monitoring\n\nML engineers will monitor deployed models for signs of performance degradation or errors. Data scientists\n\nwill often be involved in early monitoring phases to ensure that new models perform as expected after\n\ndeployment. This will inform if and when the deployed model should be updated by returning to earlier\n\nstages in the workflow.\n\nThe data governance officer is ultimately responsible for making sure this entire process is compliant with\n\ncompany and regulatory policies.\n\n\n-----\n\n#### Why should I care about MLOps?\n\nConsider that the typical ML application depends on the aforementioned people and process, as well\n\nas regulatory and ethical requirements. These dependencies change over time — and your models, data\n\nand code must change as well. The data that were a reliable signal yesterday become noise; open source\n\nlibraries become outdated; regulatory environments evolve; and teams change. ML systems must be\n\nresilient to these changes. Yet this broad scope can be a lot for organizations to manage — there are many\n\nmoving parts! Addressing these challenges with a defined MLOps strategy can dramatically reduce the\n\niteration cycle of delivering models to production, thereby accelerating time to business value.\n\nThere are two main types of risk in ML systems: **technical risk** inherent to the system itself and **risk of**\n\n**noncompliance** with external systems. 
Both of these risks derive from the dependencies described above.\n\nFor example, if data pipeline infrastructure, KPIs, model monitoring and documentation are lacking, then you\n\nrisk your system becoming destabilized or ineffective. On the other hand, even a well-designed system that\n\nfails to comply with corporate, regulatory and ethical requirements runs the risk of losing funding, receiving\n\nfines or incurring reputational damage. Recently, one private company’s data collection practices were\n\nfound to have violated the Children’s Online Privacy Protection Rule (COPPA). The [FTC fined](https://www.protocol.com/policy/ftc-algorithm-destroy-data-privacy) the company\n\n$1.5 million and [ordered](https://www.ftc.gov/system/files/ftc_gov/pdf/wwkurbostipulatedorder.pdf) it to destroy or delete the illegally harvested data, and all models or algorithms\n\ndeveloped with that data.\n\nWith respect to efficiency, the absence of MLOps is typically marked by an overabundance of manual\n\nprocesses. These steps are slower and more prone to error, affecting the quality of models, data and code.\n\nEventually they form a bottleneck, capping the ability for a data team to take on new projects.\n\nSeen through these lenses, the aim of MLOps becomes clear: improve the long-term performance\n\nstability and success rate of ML systems while maximizing the efficiency of teams who build them. In the\n\nintroduction, we defined MLOps to address this aim: MLOps is a **set of processes and automation** to\n\nmanage **models, data and code** to meet the two goals of **stable performance and long-term efficiency in**\n\n**ML systems** . _MLOps = ModelOps + DataOps + DevOps_ .\n\nWith clear goals we are ready to discuss principles that guide design decisions and planning for MLOps\n\n\nM o d e l O p s D a t a O p s D e �O p s\n\n\n-----\n\nGiven the complexity of ML\n\nprocesses and the different personas\n\ninvolved, it is helpful to start from\n\nsimpler, high-level guidance. We\n\npropose several broadly applicable\n\nprinciples to guide MLOps decisions.\n\nThey inform our design choices in\n\nlater sections, and we hope they can\n\nbe adapted to support whatever your\n\n\n#### Guiding principles\n\n###### Always keep your business goals in mind\n\nJust as the core purpose of ML in a business is to enable data-driven decisions and products, the core\n\npurpose of MLOps is to ensure that those data-driven applications remain stable, are kept up to date and\n\ncontinue to have positive impacts on the business. When prioritizing technical work on MLOps, consider the\n\nbusiness impact: Does it enable new business use cases? Does it improve data teams’ productivity? Does it\n\nreduce operational costs or risks?\n\n###### Take a data-centric approach to machine learning\n\nFeature engineering, training, inference and monitoring pipelines are data pipelines. As such, they need to be\n\nas robust as other production data engineering processes. Data quality is crucial in any ML application, so\n\nML data pipelines should employ systematic approaches to monitoring and mitigating data quality issues.\n\nAvoid tools that make it difficult to join data from ML predictions, model monitoring, etc., with the rest of\n\nyour data. The simplest way to achieve this is to develop ML applications on the same platform used to\n\nmanage production data. 
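\n\nTo make this concrete, here is one hypothetical sketch of the idea; the Spark session and the catalog/table names (`prod_catalog.ml.customer_features`, `dev_catalog.ml.predictions_dev`) are illustrative placeholders, not part of this paper’s reference architecture.\n\n```python\n# A minimal sketch of the 'same platform' principle: read governed, versioned\n# tables directly rather than exporting extracts to a laptop. All names below\n# are placeholders for illustration only.\nfrom pyspark.sql import SparkSession\n\nspark = SparkSession.builder.getOrCreate()\n\n# Training features come straight from a governed table...\nfeatures_df = spark.table('prod_catalog.ml.customer_features')\n\n# ...and results are written back to the same platform, so they can later be\n# joined with monitoring tables and the rest of the business data.\nscored_df = features_df.limit(10)  # stand-in for applying a trained model\nscored_df.write.mode('append').saveAsTable('dev_catalog.ml.predictions_dev')\n```\n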
For example, instead of downloading training data to a laptop, where it is hard\n\nto govern and reproduce results, secure the data in cloud storage and make that storage available to your\n\ntraining process.\n\n\nbusiness use case may be.\n\n\n-----\n\n###### \u0007Implement MLOps in a modular fashion\n\nAs with any software application, code quality is paramount for an ML application. Modularized code\n\nenables testing of individual components and mitigates difficulties with future code refactoring. Define\n\nclear steps (e.g., training, evaluation or deployment), supersteps (e.g., training-to-deployment pipeline) and\n\nresponsibilities to clarify the modular structure of your ML application.\n\n###### Process should guide automation\n\nWe automate processes to improve productivity and lower risk of human error, but not every step of a\n\nprocess can or should be automated. People still determine the business question, and some models will\n\nalways need human oversight before deployment. Therefore, the development process is primary and each\n\nmodule in the process should be automated as needed. This allows incremental build-out of automation\n\nand customization. Furthermore, when it comes to particular automation tools, choose those that align to\n\nyour people and process. For example, instead of building a model logging framework around a generic\n\ndatabase, you can choose a specialized tool like MLflow, which has been designed with the ML model\n\nlifecycle in mind.\n\n\n-----\n\n**CHAPTER 2:**\n## Fundamentals of MLOps\n\n**Note:** In our experience with customers, there\n\ncan be variations in these three stages, such as\n\nsplitting staging into separate “test” and “QA”\n\nsubstages. However, the principles remain the\n\nsame and we stick to a dev, staging and prod\n\nsetup within this paper.\n\n\n#### Semantics of dev, staging and prod\n\nML workflows include the following key assets: code, models and data. These assets need to be developed\n\n(dev), tested (staging) and deployed (prod). For each stage, we also need to operate within an execution\n\nenvironment. Thus, all the above — execution environments, code, models and data — are divided into dev,\n\nstaging and prod.\n\nThese divisions can best be understood in terms of quality guarantees and access control. On one end,\n\nassets in prod are generally business critical, with the highest guarantee of quality and tightest control on\n\nwho can modify them. Conversely, dev assets are more widely accessible to people but offer no guarantee\n\nof quality.\n\nFor example, many data scientists will work together in a dev environment, freely producing dev model\n\nprototypes. Any flaws in these models are relatively low risk for the business, as they are separate from\n\nthe live product. In contrast, the staging environment replicates the execution environment of production.\n\nHere, code changes made in the dev environment are tested prior to code being deployed to production.\n\nThe staging environment acts as a gateway for code to reach production, and accordingly, fewer people\n\nare given access to staging. Code promoted to production is considered a live product. 
In the production\n\nenvironment, human error can pose the greatest risk to business continuity, and so the least number of\n\npeople have permission to modify production models.\n\nOne might be tempted to say that code, models and data each share a one-to-one correspondence with\n\nthe execution environment — e.g., all dev code, models and data are in the dev environment. That is often\n\nclose to true but is rarely correct. Therefore, we will next discuss the precise semantics of dev, staging\n\nand prod for execution environments, code, models and data. We also discuss mechanisms for restricting\n\naccess to each.\n\n\n-----\n\n###### Execution environments\n\nAn execution environment is the place where models and data are created or consumed by code. Each\n\nexecution environment consists of compute instances, their runtimes and libraries, and automated jobs.\n\nWith Databricks, an “environment” can be defined via dev/staging/prod separation at a few levels. An\n\norganization could create distinct environments across multiple cloud accounts, multiple Databricks\n\nworkspaces in the same cloud account, or within a single Databricks workspace. These separation patterns\n\nare illustrated in Figure 2 below.\n\n**E N V I R O N M E N T S E P A R AT I O N P AT T E R N S**\n\n\nMultiple clou$\naccounts\n\nstaging\n\nprod\n\n\nMultiple Databricks\nworkspaces\n\nstaging\n\nprod\n\n\nDatabricks workspace\naccess controls\n\n\ndev\n\nstaging\n\nprod\n\n\ndev\n\n\ndev\n\n\n**Figure 2**\n\n\n-----\n\nDatabricks released Delta Lake to the open source\n\ncommunity in 2019. Delta Lake provides all the data\n\n\n###### Code\n\nML project code is often stored in a version control repository (such as Git), with most organizations\n\nusing branches corresponding to the lifecycle phases of development, staging or production. There are a\n\nfew common patterns. Some use only development branches (dev) and one main branch (staging/prod).\n\nOthers use main and development branches (dev), branches cut for testing potential releases (staging), and\n\nbranches cut for final releases (prod). Regardless of which convention you choose, separation is enforced\n\nthrough Git repository branches.\n\n\nlifecycle management functions that are needed\n\n\nto make cloud-based object stores reliable and\n\nperformant. This design allows clients to update\n\nmultiple objects at once and to replace a subset\n\nof the objects with another, etc., in a serializable\n\nmanner that still achieves high parallel read/write\n\nperformance from the objects — while offering\n\nadvanced capabilities like time travel (e.g., query\n\n\nAs a best practice, code should only be run in an execution environment that corresponds to it or in one\n\nthat’s higher. For example, the dev environment can run any code, but the prod environment can only run\n\nprod code.\n\n###### Models\n\nWhile models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to**\n\n**note that model and code lifecycle phases often operate asynchronously** . That is, you may want to push\n\na new model version before you push a code change, and vice versa. Consider the following scenarios:\n\n\npoint-in-time snapshots or rollback of erroneous\n\n\n\u0007To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. 
Deploying\n\nthe code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle\n\nof being generated, tested and marked as “production” to predict on the most recent transactions. In\n\nthis case the code lifecycle is slower than the model lifecycle.\n\n\u0007To classify documents using large deep neural networks, training and deploying the model is often a one-\n\ntime process due to cost. Updates to the serving and monitoring code in the project may be deployed\n\nmore frequently than a new version of the model. In this case the model lifecycle is slower than the code.\n\nSince model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model\n\nmanagement to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts\n\ndirectly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update\n\nproduction models without code changes, streamlining the deployment process in many cases. Model\n\nartifacts are secured using MLflow access controls or cloud storage permissions\n\n\nupdates), automatic data layout optimization,\n\nupserts, caching and audit logs.\n\n\n-----\n\n###### Data\n\nSome organizations label data as either dev, staging or prod, depending on which environment it originated\n\nin. For example, all prod data is produced in the prod environment, but dev and staging environments may\n\nhave read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data\n\nmay be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around\n\n\nreliability and freshness. Access to data in each environment is controlled with table access controls\n\n( [AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html) ) or cloud storage permissions.\n| |\n\nIn summary, when it comes to MLOps, you will always have operational separation between dev, staging and\n\nprod. Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod\n\nwill be the highest quality and tightly controlled.\n\n\n\n\n\n\n\n\n|ASSET|SEMANTICS|SEPARATED BY|\n|---|---|---|\n|Execution environments|Labeled according to where development, testing and connections with production systems happen|Cloud provider and Databricks Workspace access controls|\n|Models|Labeled according to model lifecycle phase|MLflow access controls or cloud storage permissions|\n|Data|Labeled according to its origin in dev, staging or prod execution environments|Table access controls or cloud storage permissions|\n|Code|Labeled according to software development lifecycle phase|Git repository branches|\n\n\n**Table 1**\n\n\n-----\n\n#### ML deployment patterns\n\nThe fact that models and code can be managed separately results in multiple possible patterns for getting\n\nML artifacts through staging and into production. 
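\n\nBecause the registry manages model versions independently of code releases, promoting a model can be a small API operation. The following is a hypothetical sketch using the MLflow client; the registered model name `fraud_detection` and the `runs:/<run_id>/model` URI are placeholders.\n\n```python\n# Hypothetical sketch: register a logged model and promote it through\n# Model Registry stages independently of any code release.\nimport mlflow\nfrom mlflow.tracking import MlflowClient\n\nclient = MlflowClient()\n\n# Register a model artifact that a training run has already logged.\nmodel_version = mlflow.register_model(\n    model_uri='runs:/<run_id>/model',  # placeholder run ID\n    name='fraud_detection',            # placeholder registered model name\n)\n\n# Later, a CD job (or a reviewer) moves the version toward production.\nclient.transition_model_version_stage(\n    name='fraud_detection',\n    version=model_version.version,\n    stage='Staging',  # then 'Production' once tests pass\n)\n```\n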
We explain two major patterns below.\n\n**D E P L O Y M O D E L S**\n\ndev staging prod\n\n**D E P L O Y C O D E**\n\ndev staging prod\n\nThese two patterns differ in terms of whether the model artifact or the training code that produces the\n\nmodel artifact is promoted toward production.\n\n\n-----\n\n###### Deploy models\n\nIn the first pattern, the model artifact is generated by training code in the development environment.\n\nThis artifact is then tested in staging for compliance and performance before finally being deployed into\n\nproduction. This is a simpler handoff for data scientists, and in cases where model training is prohibitively\n\nexpensive, training the model once and managing that artifact may be preferable. However, this simpler\n\narchitecture comes with limitations. If production data is not accessible from the development environment\n\n(e.g., for security reasons), this architecture may not be viable. This architecture does not naturally support\n\nautomated model retraining. While you could automate retraining in the development environment, you\n\nwould then be treating “dev” training code as production ready, which many deployment teams would not\n\naccept. This option hides the fact that ancillary code for featurization, inference and monitoring needs to be\n\ndeployed to production, requiring a separate code deployment path.\n\n###### Deploy code\n\nIn the second pattern, the code to train models is developed in the dev environment, and this code is\n\nmoved to staging and then production. Models will be trained in each environment: initially in the dev\n\nenvironment as part of model development, in staging (on a limited subset of data) as part of integration\n\ntests, and finally in the production environment (on the full production data) to produce the final model.\n\nIf an organization restricts data scientists’ access to production data from dev or staging environments,\n\ndeploying code allows training on production data while respecting access controls. Since training code\n\ngoes through code review and testing, it is safer to set up automated retraining. Ancillary code follows the\n\nsame pattern as model training code, and both can go through integration tests in staging. However, the\n\nlearning curve for handing code off to collaborators can be steep for many data scientists, so opinionated\n\nproject templates and workflows are helpful. Finally, data scientists need visibility into training results from\n\nthe production environment, for only they have the knowledge to identify and fix ML-specific issues.\n\n\n-----\n\nThe diagram below contrasts the code lifecycle for the above deployment patterns across the different\n\nexecution environments.\n\n\nCode\ndevelopment\n\nDevelopment\nenvironment\n\n\nUnit\ntests\n\n\nIntegration\ntests\n\nDevelopment\nenvironment\n\nStaging\nenvironment\n\n\nModel\ntraining\n\n\nContinuous\ndeployment\n\nStaging\nenvironment\n\nProduction\nenvironment\n\n\nDeploy\npipelines\n\nProduction\nenvironment\n\n\n#### Deploy models\n\n Deploy code\n\n\n**In general we recommend following the “deploy code” approach, and the reference architecture in**\n\n**this document is aligned to it.** Nevertheless, there is no perfect process that covers every scenario, and\n\nthe options outlined above are not mutually exclusive. Within a single organization, you may find some use\n\ncases deploying training code and others deploying model artifacts. 
Your choice of process will depend on\n\nthe business use case, resources available and what is most likely to succeed.\n\n\n-----\n\n|Col1|Col2|DEPLOY MODELS|DEPLOY CODE|\n|---|---|---|---|\n|Process|Dev|Develop training code. Develop ancillary code.1 Train model on prod data.  Promote model and ancillary code.|Develop training code. Develop ancillary code.  Promote code.|\n||Staging|Test model and ancillary code.  Promote model and ancillary code.|Train model on data subset. Test ancillary code.  Promote code.|\n||Prod|Deploy model. Deploy ancillary pipelines.|Train model on prod data. Test model. Deploy model. Deploy ancillary pipelines.|\n|Trade-offs|Automation| Does not support automated retraining in locked-down env.| Supports automated retraining in locked-down env.|\n||Data access control| Dev env needs read access to prod training data.| Only prod env needs read access to prod training data.|\n||Reproducible models| Less eng control over training env, so harder to ensure reproducibility.| Eng control over training env, which helps to simplify reproducibility.|\n||Data science familiarity| DS team builds and can directly test models in their dev env.| DS team must learn to write and hand off modular code to eng.|\n||Support for large projects| T\u0007his pattern does not force the DS team to use modular code for model training, and it has less iterative testing.| \u0007This pattern forces the DS team to use modular code and iterative testing, which helps with coordination and development in larger projects.|\n||Eng setup and maintenance| Has the simplest setup, with less CI/CD infra required.| \u0007Requires CI/CD infra for unit and integration tests, even for one-off models.|\n|When to use||Use this pattern when your model is a one-off or when model training is very expensive. Use when dev, staging and prod are not strictly separated envs.|Use this pattern by default. Use when dev, staging and prod are strictly separated envs.|\n\n\n**Table 2** **1** “\u0007Ancillary code” refers to code for ML pipelines other than the model training pipeline. Ancillary code could be featurization, inference, monitoring or other pipelines.\n\n\n-----\n\n**CHAPTER 3:**\n## MLOps Architecture\n and Process\n\n###### Lakehouse Platform\n\n\n#### Architecture components\n\nBefore unpacking the reference architecture, take a moment to familiarize yourself with the Databricks\n\nfeatures used to facilitate MLOps in the workflow prescribed.\n\n###### Data Lakehouse\n\nA [Data Lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) unifies the best elements of data lakes and data warehouses — delivering\n\ndata management and performance typically found in data warehouses with the low-cost, flexible object\n\nstores offered by data lakes. Data in the lakehouse are typically organized using a “medallion” architecture\n\nof Bronze, Silver and Gold tables of increasing refinement and quality.\n\n###### MLflow\n\n[MLflow](https://www.mlflow.org/) is an open source project for managing the end-to-end machine learning lifecycle. It has the\n\nfollowing primary components:\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData S�ien��\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData relia)ility and .erfor2ance\n\nCloud Data Lake\nAll structured and unstructured data\n\n\n\u0007 **Tracking:** Allows you to track experiments to record and compare parameters, metrics and model\n\nartifacts. 
See documentation for [AWS](https://docs.databricks.com/applications/mlflow/tracking.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking) [GCP](https://docs.gcp.databricks.com/applications/mlflow/tracking.html) .\n| |\n\n\n\u0007 **Models (“MLflow flavors”):** Allows you to store and deploy models from any ML library to a variety of\n\nmodel serving and inference platforms. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/models.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/models) [GCP](https://docs.gcp.databricks.com/applications/mlflow/models.html) .\n| |\n\n\u0007 **Model Registry:** Provides a centralized model store for managing models’ full lifecycle stage transitions:\n\n\nfrom staging to production, with capabilities for versioning and annotating. The registry also provides\n\nwebhooks for automation and continuous deployment. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-registry.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-registry.html) .\n| |\n\nDatabricks also provides a fully managed and hosted version of MLflow with enterprise security features,\n\nhigh availability, and other Databricks workspace features such as experiment and run management and\n\nnotebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\n\nmachine learning model training runs and running machine learning projects.\n\n\n-----\n\n###### Databricks and MLflow Autologging\n\nDatabricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\n\nexperiment tracking for machine learning training sessions on Databricks. Databricks Autologging\n\n\nautomatically captures model parameters, metrics, files and lineage information when you train models with\n\ntraining runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html) .\n| |\n\n###### Feature Store\n\nThe Databricks Feature Store is a centralized repository of features. It enables feature sharing and discovery\n\n\nacross an organization and also ensures that the same feature computation code is used for model training\n\nand inference. See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html) .\n| |\n\n###### MLflow Model Serving\n\nMLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints\n\n\nthat are updated automatically based on the availability of model versions and their stages. 
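\n\nFor illustration, a client application might score records against such an endpoint as sketched below; the workspace URL, token, model name and payload shape are placeholders and can differ by workspace and serving version, so treat this as a rough outline rather than the exact request format.\n\n```python\n# Hypothetical sketch: score records against a served model over REST.\n# URL, token, model name and payload shape are placeholders; consult your\n# workspace serving documentation for the exact request format.\nimport requests\n\nWORKSPACE_URL = 'https://<databricks-workspace>'  # placeholder\nTOKEN = '<personal-access-token>'                 # placeholder\n\nresponse = requests.post(\n    f'{WORKSPACE_URL}/model/fraud_detection/Production/invocations',\n    headers={'Authorization': f'Bearer {TOKEN}'},\n    json={'dataframe_records': [{'amount': 42.0, 'country': 'US'}]},\n)\nresponse.raise_for_status()\nprint(response.json())\n```\n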
See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html) .\n\n###### Databricks SQL\n\nDatabricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their\n\ndata lake, create multiple visualization types to explore query results from different perspectives, and build\n\nand share dashboards. See documentation for [AWS](https://docs.databricks.com/sql/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/sql/) [GCP](https://docs.gcp.databricks.com/sql/index.html) .\n\n###### Databricks Workflows and Jobs\n\nDatabricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive\n\nways. For ML, Jobs can be used to define pipelines for computing features, training models, or other ML\n\nsteps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html) .\n\n\n-----\n\n#### Reference architecture\n\nWe are now ready to review a general reference architecture for implementing MLOps on the Databricks\n\nLakehouse platform using the recommended “deploy code” pattern from earlier. This is intended to cover\n\nthe majority of use cases and ML techniques, but it is by no means comprehensive. When appropriate,\n\nwe will highlight alternative approaches to implementing different parts of the process.\n\nWe begin with an overview of the system end-to-end, followed by more detailed views of the process\n\nin development, staging and production environments. These diagrams show the system as it operates\n\nin a steady state, with the finer details of iterative development cycles omitted. This structure is\n\nsummarized below.\n\n**O V E R V I E W**\n\n**dev:** Data; Exploratory data analysis (EDA); Project code; Feature table refresh; Model training; Commit code\n\n**staging:** Merge request; Unit tests (CI); Integration tests (CI); Merge; Cut release branch\n\n**prod:** Feature table refresh; Model training; Continuous deployment (CD); Online serving (REST APIs); Inference: batch or streaming; Monitoring; Retraining\n\n\n-----\n\n###### Overview\n\n**Figure 3** End-to-end “deploy code” workflow. Source control branches (dev, staging/main, release) connect the development, staging and production environments: code is prototyped on a dev branch, a merge request to staging triggers unit and integration tests (CI), a release branch is cut, and production pulls from the release branch to run the feature table refresh, model training, continuous deployment (CD), inference & serving and monitoring pipelines, with models promoted through the Model Registry stages None, Staging and Production.\n\nHere we see the overall process for deploying code and model artifacts, the inputs and outputs for pipelines,\n\nand model lifecycle stages in production. Code source control is the primary conduit for deploying ML\n\npipelines from development to production. Pipelines and models are prototyped on a dev branch in the\n\ndevelopment environment, and changes to the codebase are committed back to source control. Upon merge\n\nrequest to the staging branch (usually the “main” branch), a continuous integration (CI) process tests the\n\ncode in the staging environment. If the tests pass, new code can be deployed to production by cutting a\n\ncode release. In production, a model is trained on the full production data and pushed to the MLflow Model\n\nRegistry. A continuous deployment (CD) process tests the model and promotes it toward the production\n\nstage in the registry. The Model Registry’s production model can be served via batch, streaming or REST API.\n\n\n-----\n\n###### Dev\n\nIn the development environment, data scientists and ML engineers can collaborate on all pipelines in\n\nan ML project, committing their changes to source control. While engineers may help to configure this\n\nenvironment, data scientists typically have significant control over the libraries, compute resources and\n\ncode that they use.\n\n**Figure 4** Development environment. An exploratory data analysis notebook sits alongside the project repository (featurization.py, train.py, inference.py, deploy.py, monitoring.py and unit/integration tests); the feature table refresh, model training and inference pipelines read prod data and dev feature/temp tables in the Lakehouse, log metrics, parameters and models to the MLflow tracking server, and code is committed from a dev branch back to source control.\n\n\n-----\n\n###### Data\n\nData scientists working in the dev environment possess read-only access to production data. They also\n\nrequire read-write access to a separate dev storage environment to develop and experiment with new\n\nfeatures and other data tables.\n\n###### Exploratory data analysis (EDA)\n\nThe data scientist explores and analyzes data in an interactive, iterative process. This process is used to\n\nassess whether the available data has the potential to address the business problem. EDA is also where the\n\ndata scientist will begin discerning what data preparation and featurization are required for model training.\n\nThis ad hoc process is generally not part of a pipeline that will be deployed in other execution environments.\n\n###### Project code\n\nThis is a code repository containing all of the pipelines or modules involved in the ML system. Dev branches\n\nare used to develop changes to existing pipelines or to create new ones. 
Even during EDA and initial phases of\n\na project, it is recommended to develop within a repository to help with tracking changes and sharing code.\n\n\n-----\n\n###### Feature table refresh\n\nThis pipeline reads from raw data tables and feature tables and writes to tables in the Feature Store. The\n\npipeline consists of two steps:\n\n\u0007 **Data preparation**\n\nThis step checks for and corrects any data quality issues prior to featurization.\n\n**\u0007Featurization**\n\nIn the dev environment, new features and updated featurization logic can be tested by writing to feature\n\ntables in dev storage, and these dev feature tables can be used for model prototyping. Once this\n\nfeaturization code is promoted to production, these changes will affect the production feature tables.\n\nFeatures already available in production feature tables can be read directly for development.\n\nIn some organizations, feature engineering pipelines are managed separately from ML projects. In such\n\ncases, the featurization pipeline can be omitted from this architecture.\n\n\n-----\n\n###### Model training\n\nData scientists develop the model training pipeline in the dev environment with dev or prod feature tables.\n\n\u0007 **Training and tuning**\n\nThe training process reads features from the feature store and/or Silver- or Gold-level Lakehouse tables,\n\nand it logs model parameters, metrics and artifacts to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . After training and\n\nhyperparameter tuning, the final model artifact is logged to the tracking server to record a robust link\n\nbetween the model, its input data, and the code used to generate it.\n\n**\u0007Evaluation**\n\nModel quality is evaluated by testing on held-out data. The results of these tests are logged to the\n\nMLflow tracking server.\n\nIf governance requires additional metrics or supplemental documentation about the model, this is the\n\ntime to add them using MLflow tracking. Model interpretations (e.g., plots produced by [SHAP](https://shap.readthedocs.io/en/latest/index.html) or [LIME](https://arxiv.org/abs/1602.04938) )\n\nand plain text descriptions are common, but defining the specifics for such governance requires input\n\nfrom business stakeholders or a data governance officer.\n\n**\u0007Model output**\n\nThe output of this pipeline is an ML model artifact stored in the MLflow tracking server. When this\n\ntraining pipeline is run in staging or production, ML engineers (or their CI/CD code) can load the model\n\nvia the model URI (or path) and then push the model to the Model Registry for management and testing.\n\n###### Commit code\n\nAfter developing code for featurization, training, inference and other pipelines, the data scientist or\n\nML engineer commits the dev branch changes into source control. This section does not discuss the\n\ncontinuous deployment, inference or monitoring pipelines in detail; see the “Prod” section below for more\n\ninformation on those.\n\n\n-----\n\n###### Staging\n\nThe transition of code from development to production occurs in the staging environment. This code\n\nincludes model training code and ancillary code for featurization, inference, etc. 
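Because these pipelines are plain code under version control, they lend themselves to small, fast unit tests; a minimal pytest-style sketch for a hypothetical featurization helper (the function and column names are purely illustrative):

```python
import pandas as pd

# Hypothetical helper from the project's featurization module.
def add_session_length(df: pd.DataFrame) -> pd.DataFrame:
    out = df.copy()
    out["session_length_s"] = (out["session_end"] - out["session_start"]).dt.total_seconds()
    return out

def test_add_session_length():
    df = pd.DataFrame({
        "session_start": pd.to_datetime(["2023-01-01 00:00:00"]),
        "session_end": pd.to_datetime(["2023-01-01 00:05:00"]),
    })
    result = add_session_length(df)
    assert result["session_length_s"].iloc[0] == 300.0
```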
Both data scientists and ML\n\nengineers are responsible for writing tests for code and models, but ML engineers manage the continuous\n\nintegration pipelines and orchestration.\n\nSource control\n\n0] 0_\n\ndev staging >main< release\n\nMerge reHuest to staging Cut release branch\n\nStaging environment\n\nCI trigger Merge\n\n0�\n\n\n**Figure 5**\n\n\nUnit tests\n(CI)\n\n\nTracking Server\n\n0�\n\nModel Registry\n\ndev\n\n\n03\n\nIntegration tests (CI)\n\n\nFeature\nStore tests\n\n\nModel\ntraining tests\n\n\nModel\ndeployment\ntests\n\n\nInference\ntests\n\n\nModel\nmonitoring\ntests\n\n\nLakehouse\n\n\ndev\n\nFeature tables Temp tables\n\nstaging data\n\n\n-----\n\n###### Data\n\nThe staging environment may have its own storage area for testing feature tables and ML pipelines. This\n\ndata is generally temporary and only retained long enough to run tests and to investigate test failures. This\n\ndata can be made readable from the development environment for debugging.\n\n###### Merge code\n\n\u0007 **Merge request**\n\nThe deployment process begins when a merge (or pull) request is submitted against the staging branch\n\nof the project in source control. It is common to use the “main” branch as the staging branch.\n\n**\u0007Unit tests (CI)**\n\nThis merge request automatically builds source code and triggers unit tests. If tests fail, the merge\n\nrequest is rejected.\n\n\n-----\n\n###### Integration tests (CI)\n\nThe merge request then goes through integration tests, which run all pipelines to confirm that they function\n\ncorrectly together. The staging environment should mimic the production environment as much as is\n\nreasonable, running and testing pipelines for featurization, model training, inference and monitoring.\n\nIntegration tests can trade off fidelity of testing for speed and cost. For example, when models are\n\nexpensive to train, it is common to test model training on small data sets or for fewer iterations to reduce\n\ncost. When models are deployed behind REST APIs, some high-SLA models may merit full-scale load\n\ntesting within these integration tests, whereas other models may be tested with small batch jobs or a few\n\nqueries to temporary REST endpoints.\n\nOnce integration tests pass on the staging branch, the code may be promoted toward production.\n\n\u0007 **Merge**\n\nIf all tests pass, the new code is merged into the staging branch of the project. If tests fail, the CI/CD\n\nsystem should notify users and post results on the merge (pull) request.\n\nNote: It can be useful to schedule periodic integration tests on the staging branch, especially if the branch is\n\nupdated frequently with concurrent merge requests.\n\n###### Cut release branch\n\nOnce CI tests have passed on a commit in the staging branch, ML engineers can cut a release branch from\n\nthat commit.\n\n\n-----\n\n**Figure 6**\n\n\n###### Prod\n\nThe production environment is typically managed by a select set of ML engineers and is where ML pipelines\n\ndirectly serve the business or application. These pipelines compute fresh feature values, train and test new\n\nmodel versions, publish predictions to downstream tables or applications, and monitor the entire process to\n\navoid performance degradation and instability. 
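Each of these production pipelines is typically packaged and scheduled as a Databricks Job. A hedged sketch using the Jobs 2.1 REST API, in which the workspace URL, token, cluster ID, notebook path and schedule are all placeholders:

```python
import requests

workspace_url = "https://<your-workspace>.cloud.databricks.com"  # placeholder
headers = {"Authorization": "Bearer <personal-access-token>"}     # placeholder

job_spec = {
    "name": "feature-table-refresh",
    "tasks": [{
        "task_key": "refresh",
        "notebook_task": {"notebook_path": "/Repos/ml/project/featurization"},
        "existing_cluster_id": "<cluster-id>",
    }],
    # Quartz cron: run at the top of every hour (adjust to the required freshness).
    "schedule": {"quartz_cron_expression": "0 0 * * * ?", "timezone_id": "UTC"},
}

resp = requests.post(f"{workspace_url}/api/2.1/jobs/create", headers=headers, json=job_spec)
resp.raise_for_status()
print("Created job", resp.json()["job_id"])
```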
While we illustrate batch and streaming inference alongside\n\nonline serving below, most ML applications will use only one of these methods, depending on the business\n\nrequirements.\n\nProduction environment\n\n\n0b\n\n0�\n\n0�\n\n\nModel Registry\n\n\nOnline serving\n\n\nStage: None Stage: Staging Stage: Production\n\n\nLog\nrequests and\npredictions\n\nrelease\n\n\nLoad model for\nonline serving\n\n\nEna�le online\nserving\n\n\nFeature table refresh\n\nData\nFeaturization\npreparation\n\nrelease\n\n0B\n\n\n0~\n\n\nLoad model for testing\n\n\nLoad model for testing Load model for inference\n\n\nInference: Batch or streaming\n\n\nRegister and request transition\n\nModel training\n\nTraining\nEvaluation\nand tuning\n\nrelease\n\n\nPromote to staging Promote to production\n\n\nModel\nData ingest\ninference\n\n\nPu�lish\npredictions\n\n\n03\n\n\nContinuous Deployment (CD)\n\n\nrelease\n\nMonitoring\n\n\nData ingest\n\n\nCheck model\nperformance\nand data drift\n\n\nPu�lish\nmetrics\n\n\nCompare\nStaging vs\nProduction\n\n\nRequest model\ntransition to\nProduction\n\nrelease\n\n\nCompliance\nchecks\n\n\n0�\n\n\nTrigger model training\n\n\nrelease\n\n\nData ta�les Feature ta�les Feature ta�les Monitoring ta�les\nLakehouse\n\n\n-----\n\nThough data scientists may not have write or compute access in the production environment, it is\n\nimportant to provide them with visibility to test results, logs, model artifacts and the status of ML pipelines\n\nin production. This visibility allows them to identify and diagnose problems in production.\n\n###### Feature table refresh\n\nThis pipeline transforms the latest production Lakehouse data into production feature tables. It can use batch\n\nor streaming computation, depending on the freshness requirements for downstream training and inference.\n\nThe pipeline can be defined as a [Databricks Job](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.rxs6npet1ull) which is scheduled, triggered or continuously running.\n\n###### Model training\n\nThe model training pipeline runs either when code changes affect upstream featurization or training logic, or\n\nwhen automated retraining is scheduled or triggered. This pipeline runs on the full production data.\n\n\u0007 **Training and tuning**\n\nDuring the training process, logs are recorded to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . These include model\n\nmetrics, parameters, tags and the model itself.\n\nDuring development, data scientists may test many algorithms and hyperparameters, but it is common\n\nto restrict those choices to the top-performing options in the production training code. Restricting tuning\n\ncan reduce the variance from tuning in automated retraining, and it can make training and tuning faster.\n\n**\u0007Evaluation**\n\nModel quality is evaluated by testing on held-out production data. The results of these tests are\n\nlogged to the MLflow tracking server. During development, data scientists will have selected meaningful\n\nevaluation metrics for the use case, and those metrics or their custom logic will be used in this step.\n\n**\u0007Register and request transition**\n\nFollowing model training, the model artifact is registered to the [MLflow Model Registry](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) of the production\n\nenvironment, set initially to ’stage=None’. 
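A minimal sketch of that registration step with the MLflow client follows; the run ID and registry name are placeholders, and on Databricks the subsequent transition would normally be raised as a *request* (via the registry UI or REST API) rather than applied directly as shown here:

```python
import mlflow
from mlflow.tracking import MlflowClient

# Register the artifact logged by the training run under a registry name.
model_version = mlflow.register_model(
    model_uri="runs:/<training-run-id>/model",  # placeholder run ID
    name="churn_classifier",                    # placeholder registry name
)

# For illustration only: move the new version to 'Staging' directly. In the
# architecture described here, this step would instead open a transition
# request for the CD pipeline or a reviewer to approve.
MlflowClient().transition_model_version_stage(
    name="churn_classifier",
    version=model_version.version,
    stage="Staging",
)
```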
The final step of this pipeline is to request a transition of the\n\n\n-----\n\n###### Continuous deployment (CD)\n\nThe CD pipeline is executed when the training pipeline finishes and requests to transition the model to\n\n‘stage=Staging’. There are three key tasks in this pipeline:\n\n\u0007 **Compliance checks**\n\nThese tests load the model from the Model Registry, perform compliance checks (for tags, documentation,\n\netc.), and approve or reject the request based on test results. If compliance checks require human\n\nexpertise, this automated step can compute statistics or visualizations for people to review in a manual\n\napproval step at the end of the CD pipeline. Regardless of the outcome, results for that model version\n\nare recorded to the Model Registry through metadata in tags and comments in descriptions.\n\nThe MLflow UI can be used to manage stage transition requests manually, but requests and transitions\n\ncan be automated via MLflow APIs and [webhooks](https://docs.databricks.com/applications/mlflow/model-registry-webhooks.html) . If the model passes the compliance checks, then\n\nthe transition request is approved and the model is promoted to ‘stage=Staging’. If the model fails, the\n\ntransition request is rejected and the model is moved to ‘stage=Archived’ in the Model Registry.\n\n**\u0007Compare staging vs. production**\n\nTo prevent performance degradation, models promoted to ‘stage=Staging’ must be compared to the\n\n‘stage=Production’ models they are meant to replace. The metric(s) for comparison should be defined\n\naccording to the use case, and the method for comparison can vary from canary deployments to A/B\n\ntests. All comparison results are saved to metrics tables in the lakehouse.\n\nIf this is the first deployment and there is no ‘stage=Production’ model yet, the ‘stage=Staging’ model\n\nshould be compared to a business heuristic or other threshold as a baseline. For a new version\n\nof an existing ‘stage=Production’ model, the ‘stage=Staging’ model is compared with the current\n\n‘stage=Production’ model.\n\n\n-----\n\n**\u0007Request model transition to production**\n\nIf the candidate model passes the comparison tests, a request is made to transition it to\n\n‘stage=Production’ in the Model Registry. As with other stage transition requests, notifications,\n\napprovals and rejections can be managed manually via the MLflow UI or automatically through APIs and\n\nwebhooks. This is also a good time to consider human oversight, as it is the last step before a model is\n\nfully available to downstream applications. A person can manually review the compliance checks and\n\nperformance comparisons to perform checks which are difficult to automate.\n\n###### Online serving (REST APIs)\n\nFor lower throughput and lower latency use cases, online serving is generally necessary. With MLflow, it is\n\nsimple to deploy models to [Databricks Model Serving](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.72shqep1kelf) , cloud provider serving endpoints, or on-prem or\n\ncustom serving layers.\n\nIn all cases, the serving system loads the production model from the Model Registry upon initialization. On\n\neach request, it fetches features from an online Feature Store, scores the data and returns predictions. 
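A simplified sketch of that per-request path is shown below; the feature lookup is a stand-in for whatever online store backs the endpoint, and the model is assumed to be loaded once when the serving process starts:

```python
import mlflow.pyfunc
import pandas as pd

# Loaded once at process start-up (placeholder model name).
MODEL = mlflow.pyfunc.load_model("models:/churn_classifier/Production")

def lookup_online_features(customer_id: str) -> dict:
    """Stand-in for an online Feature Store lookup keyed by entity ID."""
    return {"sessions_7d": 12, "avg_basket_value": 43.5}  # illustrative values

def score_request(customer_id: str):
    features = lookup_online_features(customer_id)
    # Return type depends on the model flavor (array, Series or DataFrame).
    return MODEL.predict(pd.DataFrame([features]))
```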
The\n\nserving system, data transport layer or the model itself could log requests and predictions.\n\n###### Inference: batch or streaming\n\nThis pipeline is responsible for reading the latest data from the Feature Store, loading the model from\n\n‘stage=Production’ in the Model Registry, performing inference and publishing predictions. For higher\n\nthroughput, higher latency use cases, batch or streaming inference is generally the most cost-effective\n\noption.\n\nA batch job would likely publish predictions to Lakehouse tables, over a JDBC connection, or to flat files.\n\nA streaming job would likely publish predictions either to Lakehouse tables or to message queues like\n\nApache Kafka.®\n\n\n-----\n\n###### Monitoring\n\nInput data and model predictions are monitored, both for statistical properties (data drift, model\n\nperformance, etc.) and for computational performance (errors, throughput, etc.). These metrics are\n\npublished for dashboards and alerts.\n\n\u0007 **Data ingestion**\n\nThis pipeline reads in logs from batch, streaming or online inference.\n\n**\u0007Check accuracy and data drift**\n\nThe pipeline then computes metrics about the input data, the model’s predictions and the infrastructure\n\nperformance. Metrics that measure statistical properties are generally chosen by data scientists during\n\ndevelopment, whereas metrics for infrastructure are generally chosen by ML engineers.\n\n\u0007 **Publish metrics**\n\nThe pipeline writes to Lakehouse tables for analysis and reporting. Tools such as [Databricks SQL](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.nsthucrt9k77) are used\n\nto produce monitoring dashboards, allowing for health checks and diagnostics. The monitoring job or the\n\ndashboarding tool issues notifications when health metrics surpass defined thresholds.\n\n**\u0007Trigger model training**\n\nWhen the model monitoring metrics indicate performance issues, or when a model inevitably becomes\n\nout of date, the data scientist may need to return to the development environment and develop a new\n\nmodel version.\n\n\n-----\n\n**Note:** While automated retraining is supported\n\nin this architecture, it isn’t required, and caution\n\n\n###### Retraining\n\nThis architecture supports automatic retraining using the same model training pipeline above. While we\n\nrecommend beginning with manually triggered retraining, organizations can add scheduled and/or triggered\n\nretraining when needed.\n\n\u0007 **Scheduled**\n\nIf fresh data are regularly made available, rerunning model training on a defined schedule can help models\n\nto keep up with changing trends and behavior.\n\n**\u0007Triggered**\n\nIf the monitoring pipeline can identify model performance issues and send alerts, it can additionally\n\ntrigger retraining. For example, if the distribution of incoming data changes significantly or if the model\n\nperformance degrades, automatic retraining and redeployment can boost model performance with\n\nminimal human intervention.\n\n\nmust be taken in cases where it is implemented.\n\n\nIt is inherently difficult to automate selecting the\n\ncorrect action to take from model monitoring\n\n\nWhen the featurization or retraining pipelines themselves begin to exhibit performance issues, the data\n\nscientist may need to return to the dev environment and resume experimentation to address such issues.\n\n\nalerts. 
For example, if data drift is observed, does\n\nit indicate that we should automatically retrain, or\n\ndoes it indicate that we should engineer additional\n\nfeatures to encode some new signal in the data?\n\n\n-----\n\n**CHAPTER 4:**\n## LLMOps – Large Language Model Operations\n\n\n#### Large language models\n\nLLMs have splashed into the mainstream of business and news, and there is no doubt that they will disrupt\n\ncountless industries. In addition to bringing great potential, they present a new set of questions for MLOps:\n\n\u0007Is prompt engineering part of operations, and if so, what is needed?\n\n\u0007Since the “large” in “LLM” is an understatement, how do cost/performance trade-offs change?\n\n\u0007Is it better to use paid APIs or to fine-tune one’s own model?\n\n…and many more!\n\nThe good news is that “LLMOps” (MLOps for LLMs) is not that different from traditional MLOps. However,\n\nsome parts of your MLOps platform and process may require changes, and your team will need to learn a\n\nmental model of how LLMs coexist alongside traditional ML in your operations.\n\nIn this section, we will explain what may change for MLOps when introducing LLMs. We will discuss several\n\nkey topics in detail, from prompt engineering to packaging, to cost/performance trade-offs. We also provide\n\na reference architecture diagram to illustrate what may change in your production environment.\n\n###### What changes with LLMs?\n\nFor those not familiar with large language models (LLMs), see [this summary](https://www.databricks.com/product/machine-learning/large-language-models) for a quick introduction. The\n\none-sentence summary is: LLMs are a new class of natural language processing (NLP) models that have\n\nsignificantly surpassed their predecessors in performance across a variety of tasks, such as open-ended\n\nquestion answering, summarization and execution of near-arbitrary instructions.\n\nFrom the perspective of MLOps, LLMs bring new requirements, with implications for MLOps practices and\n\nplatforms. We briefly summarize key properties of LLMs and the implications for MLOps here, and we delve\n\ninto more detail in the next section.\n\n\n-----\n\n**Table 3**\n\n\n\n|KEY PROPERTIES OF LLMS|IMPLICATIONS FOR MLOPS|\n|---|---|\n|LLMs are available in many forms: \u0007Very general proprietary models behind paid APIs \u0007Open source models that vary from general to specific applications \u0007Custom models fine-tuned for specific applications|Development process: Projects often develop incrementally, starting from existing, third-party or open source models and ending with custom fine-tuned models.|\n|Many LLMs take general natural language queries and instructions as input. Those queries can contain carefully engineered “prompts” to elicit the desired responses.|Development process: Designing text templates for querying LLMs is often an important part of developing new LLM pipelines. Packaging ML artifacts: Many LLM pipelines will use existing LLMs or LLM serving endpoints; the ML logic developed for those pipelines may focus on prompt templates, agents or “chains” instead of the model itself. 
The ML artifacts packaged and promoted to production may frequently be these pipelines, rather than models.|\n|Many LLMs can be given prompts with examples and context, or additional information to help answer the query.|Serving infrastructure: When augmenting LLM queries with context, it is valuable to use previously uncommon tooling such as vector databases to search for relevant context.|\n|LLMs are very large deep learning models, often ranging from gigabytes to hundreds of gigabytes.|Serving infrastructure: Many LLMs may require GPUs for real-time model serving. Cost/performance trade-offs: Since larger models require more computation and are thus more expensive to serve, techniques for reducing model size and computation may be required.|\n|LLMs are hard to evaluate via traditional ML metrics since there is often no single “right” answer.|Human feedback: Since human feedback is essential for evaluating and testing LLMs, it must be incorporated more directly into the MLOps process, both for testing and monitoring and for future fine-tuning.|\n\n\n-----\n\nThe list above may look long, but as we will see in the next section, many existing tools and processes\n\nonly require small adjustments in order to adapt to these new requirements. Moreover, many aspects\n\ndo not change:\n\n\u0007The separation of development, staging and production remains the same\n\n\u0007Git version control and model registries remain the primary conduits for promoting pipelines and\n\nmodels toward production\n\n\u0007The lakehouse architecture for managing data remains valid and essential for efficiency\n\n\u0007Existing CI/CD infrastructure should not require changes\n\n\u0007The modular structure of MLOps remains the same, with pipelines for data refresh, model tuning,\n\nmodel inference, etc.\n\n\n-----\n\n#### Discussion of key topics for LLMOps\n\nSo far, we have listed top potential changes to MLOps as you introduce LLMs. In this section, we will dive into\n\nmore details about selected topics.\n\n###### Prompt engineering\n\nPrompt engineering is the practice of adjusting the text prompt given to an LLM in order to elicit better\n\nresponses — using engineering techniques. It is a very new practice, but some best practices are emerging.\n\nWe will cover a few tips and best practices and link to useful resources.\n\n**1** \u0007Prompts and prompt engineering are model-specific. A prompt given to two different models will\n\ngenerally _not_ produce the same results. Similarly, prompt engineering tips do not apply to all models.\n\nIn the extreme case, many LLMs have been fine-tuned for specific NLP tasks and do not even require\n\nprompts. On the other hand, very general LLMs benefit greatly from carefully crafted prompts.\n\n**2** \u0007When approaching prompt engineering, go from simple to complex: track, templatize and automate.\n\n\u0007Start by tracking queries and responses so that you can compare them and iterate to improve\n\nprompts. Existing tools such as MLflow provide tracking capabilities; see [MLflow LLM Tracking](https://mlflow.org/docs/latest/llm-tracking.html) for\n\nmore details. Checking structured LLM pipeline code into version control also helps with prompt\n\ndevelopment, for git diffs allow you to review changes to prompts over time. 
Also see the section\n\nbelow on packaging model and pipelines for more information about tracking prompt versions.\n\n\u0007Then, consider using tools for building prompt templates, especially if your prompts become complex.\n\nNewer LLM-specific tools such as [LangChain](https://python.langchain.com/en/latest/index.html) and [LlamaIndex](https://gpt-index.readthedocs.io/en/latest/) provide such templates and more.\n\n\u0007Finally, consider automating prompt engineering by replacing manual engineering with automated\n\ntuning. Prompt tuning turns prompt development into a data-driven process akin to hyperparameter\n\ntuning for traditional ML. The [Demonstrate-Search-Predict (DSP) Framework](https://github.com/stanfordnlp/dsp) is a good example of a\n\ntool for prompt tuning.\n\n\n-----\n\n###### Resources\n\nThere are lots of good resources about\nprompt engineering, especially for popular\n\nmodels and services:\n\n\u0007DeepLearning.AI course on [ChatGPT](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n\n[Prompt Engineering](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n\n\u0007DAIR.AI [Prompt Engineering Guide](https://www.promptingguide.ai/)\n\n\u0007 [Best practices for prompt engineering](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n[with the OpenAI API](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n\n**3** \u0007Most prompt engineering tips currently published online are for ChatGPT, due to its immense\n\npopularity. Some of these generalize to other models as well. We will provide a few tips here:\n\n\u0007Use clear, specific prompts, which may include an instruction, context (if needed), a user query or\n\ninput, and a description of the desired output type or format\n\n\u0007Provide examples in your prompt (“few-shot learning”) to help the LLM to understand what you want\n\n\u0007Tell the model how to behave, such as telling it to admit if it cannot answer a question\n\n\u0007Tell the model to think step-by-step or explain its reasoning\n\n\u0007If your prompt includes user input, use techniques to prevent prompt hacking, such as making it very\n\nclear which parts of the prompt correspond to your instruction vs. user input\n\n\n-----\n\n###### Packaging models or pipelines for deployment\n\nIn traditional ML, there are generally two types of ML logic to package for deployment: models and\n\npipelines. These artifacts are generally managed toward production via a Model Registry and Git version\n\ncontrol, respectively.\n\nWith LLMs, it is common to package ML logic in new forms. These may include:\n\n\u0007A lightweight call to an LLM API service (third party or internal)\n\n\u0007A “chain” from LangChain or an analogous pipeline from another tool. The chain may call an LLM API or a\n\nlocal LLM model.\n\n\u0007An LLM or an LLM+tokenizer pipeline, such as a [Hugging Face](https://huggingface.co/) pipeline. This pipeline may use a\n\npretrained model or a custom fine-tuned model.\n\n\u0007An engineered prompt, possibly stored as a template in a tool such as LangChain\n\nThough LLMs add new terminology and tools for composing ML logic, all of the above still constitute models\n\nand pipelines. Thus, the same tooling such as [MLflow](https://mlflow.org/) can be used to package LLMs and LLM pipelines for\n\ndeployment. 
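For instance, an engineered prompt plus a call to an LLM endpoint can be wrapped and logged as a custom MLflow pyfunc model; a hedged sketch, in which `call_llm` stands in for whichever API client or local model the pipeline actually uses:

```python
import mlflow
import pandas as pd

PROMPT_TEMPLATE = "Summarize the following support ticket in one sentence:\n\n{ticket}"

def call_llm(prompt: str) -> str:
    """Placeholder for a real LLM API or local model call."""
    return "<summary>"

class TicketSummarizer(mlflow.pyfunc.PythonModel):
    def predict(self, context, model_input: pd.DataFrame):
        prompts = [PROMPT_TEMPLATE.format(ticket=t) for t in model_input["ticket"]]
        return [call_llm(p) for p in prompts]

with mlflow.start_run():
    mlflow.pyfunc.log_model(artifact_path="ticket_summarizer", python_model=TicketSummarizer())
```

Because the prompt template travels inside the logged model, registering a new version of this pyfunc model is also how a prompt change gets versioned, as the note on prompt versioning below points out.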
[Built-in model flavors](https://mlflow.org/docs/latest/models.html) include:\n\n\u0007PyTorch and TensorFlow\n\n\u0007Hugging Face Transformers (relatedly, see Hugging Face Transformers’s [MLflowCallback](https://huggingface.co/docs/transformers/en/main_classes/callback#transformers.integrations.MLflowCallback) )\n\n\u0007LangChain\n\n\u0007OpenAI API\n\n\u0007(See the [documentation](https://mlflow.org/docs/latest/models.html) for a complete list)\n\nFor other LLM pipelines, MLflow can package the pipelines via the [MLflow pyfunc flavor](https://mlflow.org/docs/latest/models.html#python-function-python-function) , which can store\n\narbitrary Python code.\n\n\n**Note about prompt versioning:** Just as it is helpful\n\nto track model versions, it is helpful to track prompt\n\nversions (and LLM pipeline versions, more generally).\n\nPackaging prompts and pipelines as MLflow Models\n\nsimplifies versioning. Just as a newly retrained\n\nmodel can be tracked as a new model version in the\n\nMLflow Model Registry, a newly updated prompt can\n\nbe tracked as a new model version.\n\n**Note about deploying models vs. code:** Your\n\ndecisions around packaging ML logic as version\n\ncontrolled code vs. registered models will help\n\nto inform your decision about choosing between\n\nthe deploy models, deploy code and hybrid\n\narchitectures. Review the subsection below about\n\nhuman feedback, and make sure that you have a\n\nwell-defined testing process for whatever artifacts\n\nyou choose to deploy.\n\n\n-----\n\n###### Managing cost/performance trade-offs\n\nOne of the big Ops topics for LLMs is managing cost/performance trade-offs, especially for inference\n\nand serving. With “small” LLMs having hundreds of millions of parameters and large LLMs having hundreds\n\nof billions of parameters, computation can become a major expense. Thankfully, there are many ways to\n\nmanage and reduce costs when needed. We will review some key tips for balancing productivity and costs.\n\n**1** \u0007Start simple, but plan for scaling. When developing a new LLM-powered application, speed of\n\ndevelopment is key, so it is acceptable to use more expensive options, such as paid APIs for existing\n\nmodels. As you go, make sure to collect data such as queries and responses. In the future, you can use\n\nthat data to fine-tune a smaller, cheaper model which you can own.\n\n**2** \u0007Scope out your costs. How many queries per second do you expect? Will requests come in bursts?\n\nHow much does each query cost? These estimates will inform you about project feasibility and will help\n\nyou to decide when to consider bringing the model in-house with open source models and fine-tuning.\n\n**3** \u0007Reduce costs by tweaking LLMs and queries. There are many LLM-specific techniques for reducing\n\ncomputation and costs. These include shortening queries, tweaking inference configurations and using\n\nsmaller versions of models.\n\n**4** \u0007Get human feedback. 
It is easy to reduce costs but hard to say how changes impact your results,\n\nunless you get human feedback from end users.\n\n\n-----\n\n###### Resources\n\n**Fine-tuning**\n\n\u0007 [Fine-Tuning Large Language Models with](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n[Hugging Face and DeepSpeed](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n\u0007Webinar: [Build Your Own Large Language](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n\n[Model Like Dolly: How to fine-tune and](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n[deploy your custom LLM](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n\n**Model distillation,**\n**quantization and pruning**\n\n\n###### Methods for reducing costs of inference\n\n**Use a smaller model**\n\n\u0007Pick a different existing model. Try smaller versions of models (such as “t5-small” instead of “t5-base”)\n\nor alternate architectures.\n\n\u0007Fine-tune a custom model. With the right training data, a fine-tuned model can often be smaller and/or\n\nperform better than a generic model.\n\n\u0007Use model distillation (or knowledge distillation). This technique “distills” the knowledge of the original\n\nmodel into a smaller model.\n\n\u0007Reduce floating point precision (quantization). Models can sometimes use lower precision arithmetic\n\nwithout losing much in quality.\n\n\n\u0007 [Gentle Introduction to 8-bit Matrix](https://huggingface.co/blog/hf-bitsandbytes-integration)\n\n\n**\u0007Reduce computation for a given model**\n\n\u0007Shorten queries and responses. Computation scales with input and output sizes, so using more concise\n\nqueries and responses reduces costs.\n\n\u0007Tweak inference configurations. Some types of inference, such as beam search, require more computation.\n\n**Other**\n\n\u0007Split traffic. If your return on investment (ROI) for an LLM query is low, then consider splitting traffic so that\n\nlow ROI queries are handled by simpler, faster models or methods. Save LLM queries for high ROI traffic.\n\n\u0007Use pruning techniques. If you are training your own LLMs, there are pruning techniques that allow\n\nmodels to use sparse computation during inference. This reduces computation for most or all queries.\n\n\n[Multiplication for transformers at scale](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[using Hugging Face Transformers,](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)\n\n\u0007 [Large Transformer Model Inference](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n[Optimization](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n\u0007 [Making LLMs even more accessible with](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n\n[bitsandbytes, 4-bit quantization and](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n[QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n\n\n-----\n\n###### Human feedback, testing, and monitoring\n\nWhile human feedback is important in many traditional ML applications, it becomes much more important\n\nfor LLMs. Since most LLMs output natural language, it is very difficult to evaluate the outputs via traditional\n\nmetrics. 
For example, suppose an LLM were used to summarize a news article. Two equally good summaries\n\nmight have almost completely different words and word orders, so even defining a “ground-truth” label\n\nbecomes difficult or impossible.\n\nHumans — ideally your end users — become essential for validating LLM output. While you can pay human\n\nlabelers to compare or rate model outputs, the best practice for user-facing applications is to build human\n\nfeedback into the applications from the outset. For example, a tech support chatbot may have a “click here\n\nto chat with a human” option, which provides implicit feedback indicating whether the chatbot’s responses\n\nwere helpful.\n\nIn terms of operations, not much changes from traditional MLOps:\n\n\u0007 **Data:** Human feedback is simply data, and it should be treated like any other data. Store it in your\n\nlakehouse, and process it using the same data pipeline tooling as other data.\n\n\u0007 **Testing and monitoring:** A/B testing and incremental rollouts of new models and pipelines may become\n\nmore important, superceding offline quality tests. If you can collect user feedback, then these rollout\n\nmethods can validate models before they are fully deployed.\n\n\u0007 **Fine-tuning:** Human feedback becomes especially important for LLMs when it can be incorporated into\n\nfine-tuning models via techniques like Reinforcement Learning from Human Feedback (RLHF). Even if you\n\nstart with an existing or generic model, you can eventually customize it for your purposes via fine-tuning.\n\n\n###### Resources\n\n**Reinforcement Learning from**\n**Human Feedback (RLHF)**\n\n\u0007Chip Huyen blog post on\n\n[“RLHF: Reinforcement Learning from](https://huyenchip.com/2023/05/02/rlhf.html)\n\n[Human Feedback”](https://huyenchip.com/2023/05/02/rlhf.html)\n\n\u0007Hugging Face blog post on\n\n[“Illustrating Reinforcement Learning from](https://huggingface.co/blog/rlhf)\n\n[Human Feedback (RLHF)”](https://huggingface.co/blog/rlhf)\n\n\u0007 [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback)\n\n\n-----\n\n###### Other topics\n\n\u0007 **Scaling out:** Practices around scaling out training, fine-tuning and inference are similar to traditional ML,\n\nbut some of your tools may change. Tools like [Apache Spark](https://spark.apache.org/) [™](https://spark.apache.org/) and [Delta Lake](https://delta.io/) remain general enough for\n\nyour LLM data pipelines and for batch and streaming inference, and they may be helpful for distributing\n\nfine-tuning. To handle LLM fine-tuning and training, you may need to adopt some new tools such as\n\n[distributed PyTorch](https://pytorch.org/tutorials/beginner/dist_overview.html) , [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training) , and [DeepSpeed](https://www.deepspeed.ai/) .\n\n\u0007 **[Model serving:](https://www.databricks.com/product/model-serving)** If you manage the serving system for your LLMs, then you may need to make\n\nadjustments to handle larger models. While serving with CPUs can work for smaller deep learning\n\nmodels, most LLMs will benefit from or require GPUs for serving and inference.\n\n\u0007 **Vector databases:** Some but not all LLM applications require vector databases for efficient similarity-\n\nbased lookups of documents or other data. Vector databases may be an important addition to your\n\nserving infrastructure. 
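To illustrate the pattern without committing to any particular product, the sketch below performs a brute-force cosine-similarity lookup over precomputed embeddings with NumPy; a real vector database answers the same query against an index at much larger scale:

```python
import numpy as np

# Precomputed document embeddings (rows) and their source texts; illustrative only.
doc_embeddings = np.random.rand(1_000, 384).astype("float32")
documents = [f"doc-{i}" for i in range(1_000)]

def top_k_similar(query_embedding: np.ndarray, k: int = 3):
    # Cosine similarity = dot product of L2-normalized vectors.
    docs_norm = doc_embeddings / np.linalg.norm(doc_embeddings, axis=1, keepdims=True)
    query_norm = query_embedding / np.linalg.norm(query_embedding)
    scores = docs_norm @ query_norm
    best = np.argsort(scores)[::-1][:k]
    return [(documents[i], float(scores[i])) for i in best]

print(top_k_similar(np.random.rand(384).astype("float32")))
```

In production the brute-force scan is replaced by an approximate-nearest-neighbor index, but the query contract stays the same: embed the input, search, and return the top matches with scores.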
Operationally, it is analogous to a feature store: it is a specialized tool for storing\n\npreprocessed data which can be queried by inference jobs or model serving systems.\n\n\n-----\n\n#### Reference architecture\n\nTo illustrate potential adjustments to your reference architecture from traditional MLOps, we provide a\n\nmodified version of the previous production architecture.\n\nProduction environment\n\nModel Registry\n\nStage: �one Stage: Staging Stage: Production\n\nLoad model for testing Load model for inference\n\n\nPush model to registry Promote to production\n\n\nModel serving\n\n\nLLM API request\n\nrelease\n\n\nFine-Tine LLM\n\nrelease\n\n\nVector Database\nUpdate\n\nrelease\n\n\nContinuous\nDeployment (CD)\n\nrelease\n\n\nMonitoring &\nEvaluation\n\nrelease\n\n\nInternal/External Data tables Vector database Metrics tables Human feedback\nmodel hub\n\n**Figure 7**\n\n\n-----\n\n###### Additional resources\n\nWith LLMs being such a novel field, we link to\nseveral LLM resources below, which are not\n\nnecessarily “LLMOps” but may prove useful\nto you.\n\n\u0007 [edX: Professional Certificate in Large](https://www.edx.org/professional-certificate/databricks-large-language-models)\n\n[Language Models](https://www.edx.org/professional-certificate/databricks-large-language-models)\n\n\u0007Chip Huyen blog post on [“Building LLM](https://huyenchip.com/2023/04/11/llm-engineering.html)\n\n[applications for production”](https://huyenchip.com/2023/04/11/llm-engineering.html)\n\nLLM lists and leaderboards\n\n\u0007 [LMSYS Leaderboard](https://chat.lmsys.org/?leaderboard)\n\n\u0007 [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)\n\n\u0007 [Stanford Center for Research on](https://crfm.stanford.edu/)\n\n[Foundation Models](https://crfm.stanford.edu/)\n\n\u0007 [Ecosystem graphs](https://crfm.stanford.edu/ecosystem-graphs/index.html)\n\u0007 [\u0007HELM](https://crfm.stanford.edu/helm/latest/?)\n\n\u0007Blog post on [“Open Source ChatGPT](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n\n\nThe primary changes to this production architecture are:\n\n\u0007 **Internal/External Model Hub:** Since LLM applications often make use of existing, pretrained models,\n\nan internal or external model hub becomes a valuable part of the infrastructure. It appears here in\n\nproduction to illustrate using an existing base model that is then fine-tuned in production. Without fine-\n\ntuning, this hub would mainly be used in development.\n\n\u0007 **Fine-Tune LLM:** Instead of de novo Model Training, LLM applications will generally fine-tune an existing\n\nmodel (or use an existing model without any tuning). Fine-tuning is a lighter-weight process than training,\n\nbut it is similar operationally.\n\n\u0007 **Vector Database:** Some (but not all) LLM applications use vector databases for fast similarity searches,\n\nmost often to provide context or domain knowledge in LLM queries. We replaced the Feature Store (and\n\nits Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n\nthat these data stores and jobs are analogous in terms of operations.\n\n\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n\nAPI calls, such as to internal or third-party LLM APIs. 
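A hedged sketch of what such a call often looks like in practice, with a timeout and a simple retry to soften the latency and flakiness concerns discussed next; the endpoint, payload shape and token are placeholders rather than any specific provider's API:

```python
import os
import time
import requests

LLM_ENDPOINT = "https://llm.example.com/v1/completions"  # placeholder endpoint
API_TOKEN = os.environ.get("LLM_API_TOKEN", "<token>")    # placeholder credential

def call_llm(prompt: str, max_retries: int = 3, timeout_s: float = 30.0) -> str:
    payload = {"prompt": prompt, "max_tokens": 256}  # illustrative payload shape
    for attempt in range(max_retries):
        try:
            resp = requests.post(
                LLM_ENDPOINT,
                json=payload,
                headers={"Authorization": f"Bearer {API_TOKEN}"},
                timeout=timeout_s,
            )
            resp.raise_for_status()
            return resp.json().get("text", "")
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff
```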
Operationally, this adds complexity in terms of\n\npotential latency or flakiness from third-party APIs, as well as another layer of credential management.\n\n\u0007 **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML\n\nbut become essential in most LLM applications. Human feedback should be managed like other data,\n\nideally incorporated into monitoring based on near real-time streaming.\n\n\n[Alternatives”](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n\n\n-----\n\n#### Looking ahead\n\nLLMs only became mainstream in late 2022, and countless libraries and technologies are being built to\n\nsupport and leverage LLM use cases. You should expect rapid changes. More powerful LLMs will be open-\n\nsourced; tools and techniques for customizing LLMs and LLM pipelines will become more plentiful and\n\nflexible; and an explosion of techniques and ideas will gradually coalesce into more standardized practices.\n\nWhile this technological leap provides us all with great opportunities, the use of cutting-edge technologies\n\nrequires extra care in LLMOps to build and maintain stable, reliable LLM-powered applications. The good\n\nnews is that much of your existing MLOps tooling, practices and knowledge will transfer smoothly over to\n\nLLMs. With the additional tips and practices mentioned in this section, you should be well set up to harness\n\nthe power of large language models.\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n9,000 organizations worldwide — including Comcast,\n\nCondé Nast and over 50% of the Fortune 500 — rely\n\non the Databricks Lakehouse Platform to unify their\n\ndata, analytics and AI. Databricks is headquartered\n\nin San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark ™ ,\n\nDelta Lake and MLflow, Databricks is on a mission\n\nto help data teams solve the world’s toughest\n\nproblems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "-----\n\n# TABLE OF CONTENTS\n\n\n##### Welcome to Data, Analytics and AI ....... 02\n\n**Do you know what you’re getting into?** ............................................ **02**\n\n**How to use this book** \b��������������������������������������������������������������������������������������� **02**\n\n##### Business Value .......................................................................... 03\n\n**Talking to the business (feels like combat)** \b����������������������������� **03**\n\n**Creating Value Alignment** \b������������������������������������������������������������������ **03**\n\n**Goals and Outcomes** \b���������������������������������������������������������������������������� **04**\n\n##### Ultimate Class Build Guide .................................. 
04\n\n**Creating a character** \b����������������������������������������������������������������������������� **04**\n\n- Data Engineers \b������������������������������������������������������������������������������������� **04**\n\n- Data Scientists \b������������������������������������������������������������������������������������� **05**\n\n- Data Analysts \b���������������������������������������������������������������������������������������� **05**\n\n##### Diving In ............................................................................................... 05\n\n**Producing game data** \b���������������������������������������������������������������������������� **05**\n\n**And receiving it in cloud** \b��������������������������������������������������������������������� **08**\n\n**Getting data from your game to the cloud** \b������������������������������ **08**\n\n##### The Value of Data Throughout the Game Developer Lifecycle ................................... 09\n\n**Lifecycle overview** \b���������������������������������������������������������������������������������� **09**\n\n**Use data to develop a next-generation**\n\n**customer experience** \b��������������������������������������������������������������������������� **09**\n\n##### Getting Started with Gaming Use Cases .............................................................. 10\n\n**Where do I start? Start with Game Analytics** \b������������������������� **10**\n\n**Understand your audience** \b���������������������������������������������������������������������������� **11**\n\n- Player Segmentation \b���������������������������������������������������������������������������� **11**\n\n- Player Lifetime Value \b��������������������������������������������������������������������������� **12**\n\n- Social Media Monitoring \b�������������������������������������������������������������������� **12**\n\n- Player Feedback Analysis \b����������������������������������������������������������������� **13**\n\n- Toxicity Detection \b��������������������������������������������������������������������������������� **13**\n\n**Find your audience** \b���������������������������������������������������������������������������������� **14**\n\n\n**Activating Your Playerbase** \b������������������������������������������������������������������������� **15**\n\n- Player Recommendations \b����������������������������������������������������������������� **15**\n\n- Next Best Offer/Action \b����������������������������������������������������������������������� **15**\n\n- Churn Prediction & Prevention \b������������������������������������������������������� **16**\n\n- Real-time Ad Targeting \b����������������������������������������������������������������������� **16**\n\n**Operational Use Cases** \b�������������������������������������������������������������������������� **17**\n\n- Anomaly Detection \b������������������������������������������������������������������������������ **17**\n\n- Build Pipeline \b������������������������������������������������������������������������������������������� **17**\n\n- Crash Analytics \b�������������������������������������������������������������������������������������� **18**\n\n##### Things to Look Forward To ..................................... 19\n\n Appendix .............................................................................................. 
21\n\n**Ultimate Class Build Guide** \b������������������������������������������������������������������ **21**\n\n- Creating a Character \b��������������������������������������������������������������������������� **21**\n\n- Data Engineers \b���������������������������������������������������������������������������� **21**\n\n- Data Scientists \b���������������������������������������������������������������������������� **21**\n\n- Data Analysts \b������������������������������������������������������������������������������ **22**\n\n**Data Access and the Major Cloud Providers** ................................ **23**\n\n- Cloud Rosetta Stone \b�������������������������������������������������������������������������� **23**\n\n- Jargon Glossary \b������������������������������������������������������������������������������������ **23**\n\n- Getting started with the major cloud providers \b������������������� **23**\n\n**Getting Started with Detailed Use Cases** \b���������������������������������� **25**\n\n- Game analytics \b������������������������������������������������������������������������������������� **25**\n\n- Player Segmentation \b�������������������������������������������������������������������������� **25**\n\n- Player Lifetime Value \b�������������������������������������������������������������������������� **26**\n\n- Social Media Monitoring \b������������������������������������������������������������������� **28**\n\n- Player Feedback Analysis \b���������������������������������������������������������������� **29**\n\n- Toxicity Detection \b������������������������������������������������������������������������������� **30**\n\n- Multi-Touch Attribution and Media Mix Modeling \b���������������� **31**\n\n- Player Recommendations \b���������������������������������������������������������������� **32**\n\n- Next Best Offer/Action \b���������������������������������������������������������������������� **33**\n\n- Churn Prediction & Prevention \b����������������������������������������������������� **34**\n\n- Real-time Ad Targeting \b���������������������������������������������������������������������� **35**\n\n**Getting Started with Operational Use Cases** \b�������������������������� **36**\n\n- Anomaly Detection \b����������������������������������������������������������������������������� **36**\n\n- Build Pipeline \b����������������������������������������������������������������������������������������������������� **37**\n\n- Crash Analytics \b������������������������������������������������������������������������������������� **39**\n\n\nMulti-Touch Attribution \b��������������������������������������������������������������������� **14**\n\n\n-----\n\n# Welcome to Data, Analytics, and AI\n\n\n### Do you know what you’re getting into?\n\nYou may have heard the stories of game studios spending\n\ncountless hours trying to more effectively acquire, engage,\n\nand retain players. Well, did you know that data, analytics,\n\nand AI plays a central role in the development and operation\n\nof today’s top-grossing video games? Studios globally\n\nstruggle with fragmented views of their audience, with data\n\noften outpacing legacy technologies. 
Today, the need for real-\n\ntime capabilities and the leap from descriptive to predictive\n\nanalytics has made it so that data, analytics, and AI are no\n\nlonger a “nice-to-have”, but table stakes for success.\n\nThe objective of this handbook is to guide you on the\n\nrole data, analytics, and AI plays in the development\n\nand operations of video games. We’ll cover who the key\n\nstakeholders are and how to align people across business\n\nunits. Then we’ll talk through strategies to help you\n\nsuccessfully advocate for data, analytics, and AI projects\n\ninternally. Finally, we dive deep through the most common\n\nuse cases. We want to give you enough information to feel\n\n\nwell as helpful tips when operating as or working with one of\n\nthese classes.\n\nWe follow this with the fundamentals for building a Proof\n\nof Concept (POC) or Minimum Viable Product (MVP). That\n\nis, connecting to the cloud; accessing your data; and\n\nmost importantly, being able to represent the value you’re\n\nseeking to unlock as you sell your project into your team and\n\nbroader organization.\n\nFinally, we’ll dive into the most common use cases for data,\n\nanalytics, and AI within game development. Similar to a tech-\n\ntree in a video game, we begin with the most basic use cases\n\n- setting up your game analytics. Then we progress through\n\nmore advanced data use cases such as player segmentation,\n\nassessing lifetime value, detecting and mitigating toxicity,\n\nmulti-touch attribution, recommendation engines, player\n\nchurn prediction and prevention, and more.\n\nDon’t forget to review the Appendix. You’ll find a handy\n\n“ Jargon Glossary ”, “ Cloud Rosetta Stone ”, and “ get started\n\nguide for the three major cloud providers ”. All incredibly\n\nhelpful assets to keep as hotkeys.\n\n\nempowered to make a demonstrable impact. Just by reading\n\nthis you are adding incredible insight and value to yourself as\n\n\nan industry professional. Quest on!\n\n### How to use this book\n\nThis book is primarily intended for technical professionals\n\nwho are engaging with data within game studios. 
No\n\nmatter your role in the gaming industry, you will be able to\n\nglean key takeaways that will make you more effective in\n\nyour individual role and within the larger team — be that\n\nproduction, art, engineering, marketing, or otherwise.\n\nBegin your journey by reviewing the “ **Data, Analytics, and AI**\n\n**Ground Rules** ” section to the right, which presents some This\n\nsection presents some rules and guidelines for interpreting\n\nthe role that data plays in the game development lifecycle.\n\nNext, it’s time to learn about the key professions (aka\n\ncharacter classes) that interact and engage with data,\n\nanalytics, and AI on a consistent basis within a game studio.\n\nThis section breaks down each of the classes, providing an\n\n\n**Data, Analytics, and AI Ground Rules**\n\nThis guide assumes you understand the following:\n\n- You understand the basics of data, analytics, and AI:\n\nHow and why data is stored in a system, why data\n\nis transformed, the different types of output that\n\ndata can feed into — such as a report, an analysis\n\nanswering a question, or a machine learning model.\n\nIf this is the first time you’re creating a character,\n\nwe highly recommend reviewing our data, analytics,\n\nand AI tutorial — aka getting started training and\n\ndocumentation, available at [dbricks.co/training](https://www.databricks.com/learn/training/home)\n\n- You have a basic understanding of cloud\n\ninfrastructure. Specifically what it is, who are the\n\nkey players, and associated terms (e.g., virtual\n\nmachines, APIs, applications)\n\n- You are generally aware of the game development\n\nlifecycle; pre-production, production, testing/QA,\n\nlaunch, operation\n\n\noverview of each character’s strengths and weaknesses as\n\n\n-----\n\n# Business Value\n\n\nDemonstrating business value is important when working\n\non data, analytics, and AI projects because it helps ensure\n\nthat the efforts of the project are aligned with the goals\n\nand objectives of the business. By showing how the project\n\ncan positively impact a game’s key performance indicators\n\n(KPIs) and bottom-line metrics, such as game revenue, player\n\nsatisfaction, and operational efficiency, studio stakeholders\n\nare more likely to support and invest in the project.\n\nAdditionally, demonstrating business value can help justify\n\nthe resources, time, and money that are required to execute\n\nthe project, and can also help prioritize which projects should\n\nbe pursued. By focusing on business value, data, analytics,\n\nand AI projects can become strategic initiatives that\n\ncontribute to the long-term success of your game studio.\n\n### Talking to the business (feels like combat)\n\nWhile we highly encourage everyone to read this section,\n\nyou may already feel confident understanding the needs and\n\nconcerns of your internal stakeholders, and how to sell-in a\n\nproject successfully. If so, feel free to skip this section.\n\nWe would love to dive into the data to explore and discover\n\nas much as possible, unfortunately in most environments,\n\nwe are limited by resources and time. 
Understanding both the business's pain points and strategic goals is crucial to choosing projects that will benefit the business, create value and make your message much easier to sell.

Whenever we embark on a proof-of-concept (PoC) or minimum viable product (MVP) — to prove out a new methodology or technology — we will need to pitch it back for adoption. The technology could be revolutionary and absolutely amazing, but without the value proposition and tie back to goals, it is likely to land flat or fail to be adopted.

It is key to talk to your stakeholders to understand their perception of pain points and positions on potential projects to add value. Much like stopping at the Tavern when the adventuring party gets to town, these can be informal conversations where you socialize potential solutions while gathering information about what matters.

**Questions to ask:**

- What other strategic goals and pain points can you list out, and how would you prioritize them as a business leader?

- Does your prioritization match how your team, manager and/or leadership would prioritize? Typically, the closer the match, the easier initial projects will be to "sell".

### Creating value alignment

So what are your strategic goals and pain points, and how might they be addressed through a use case from a PoC or MVP leveraging your data?

A few examples of strategic goals that are top of mind for our customers at the beginning of any fiscal or calendar year:

- Reduce costs

- Simplify your infrastructure

- Acquire more players

- Monetize your playerbase

- Retain your players (aka prevent churn)

Here are four ways the Databricks Lakehouse can provide value that aligns with your strategic goals and pain points:

`1.` **Improved collaboration:** The Databricks platform allows everyone to share and collaborate on data, notebooks and models between data scientists, engineers and business users. This enables a more efficient and streamlined process for data analysis and decision making.

`2.` **Find and explore your data:** The data in the Lakehouse is cataloged and accessible, which enables business users to explore and query the data easily and discover insights by themselves.

`3.` **Uncover actionable business insights:** By putting your game's data into a Lakehouse architecture, it can be better analyzed using various tools provided by Databricks such as SQL, dashboards, notebooks, visualization and machine learning to better understand your playerbase, providing valuable insights into player behavior and performance. These insights can help the team better understand player engagement and retention, and use that information to improve the game and grow monetization.

`4.` **Lead with data-driven decisions:** A Lakehouse architecture provides a single source of truth for your organization's data.
Data engineers write once, data analysts interpret the data, and data scientists can run machine learning models on the same data.

_The value this provides an organization from a total cost of ownership perspective cannot be overstated._

With the ability to access and analyze all the data in one place, the business can make unified data-driven decisions, rather than relying on intuition or fragmented data.

### Goals and outcomes

Like many projects, starting with a strong foundation of 'what success looks like' will significantly improve your likelihood of achieving your objectives. Here are a few best practices we recommend:

`1.` **Set goals:** Define your hypothesis, then use your data and process to prove or disprove your hypothesis. You have a goal in mind, so make it part of the experiment. If the outcome differs from the expectation, that is part of experimentation, and we can learn from it to improve the next experiment. This is all about shortening the feedback loop between insight and action.

`2.` **Scope your project appropriately:** For example, are you doing this as a side project? Do you have 2 sprints to show progress? It's important to scope your project based on the time, resources, and quality needed for the project to be a success.

`3.` **Scope down:** Ruthlessly control scope for any PoC or MVP. Prioritization is your best friend. Stakeholders and your own internal team will naturally want to increase scope because there's no shortage of good ideas. But by controlling scope, you improve your chances of shipping on time and on budget. Don't let perfection be the enemy of good. There are always exceptions to this, but that is what the next sprint is for.

`4.` **Deliver on time:** Recovering lost goodwill is incredibly difficult - strive to always deliver on time. Make sure your goals, constraints and scope creep will not explode your timeline, as creating tight feedback loops and iteration cycles is what will make you more agile than the competition.

`5.` **Socialize early, and often:** Show quantifiable value as quickly as possible, both to your immediate team and business stakeholders. Measure the value as frequently as makes sense, and socialize early and often to promote visibility of the project and ensure tight alignment across teams. This will empower you to create tighter feedback loops that will help improve any future iterations of your product, platform, or technology.

# Ultimate Class Build Guide

### Creating a character

Have you rolled your character already? Data engineers, data scientists, and data analysts form the heart of mature game data teams. Though, depending on studio size and resources, game developers may also be pulled in from time to time to perform data engineering and/or data science tasks. For the sake of this guide, we'll keep our focus on the roles of data engineers, data scientists, and data analysts. There are many aspects to these roles, but they can be summarized in that Data Engineers create and maintain critical data workflows, Data Analysts interpret data and create reports that keep the business teams running seamlessly, and Data Scientists are responsible for making sense of large amounts of data. Depending on the size of the organization, individuals may be required to multiclass in order to address the needs of the team. In smaller studios, it's often developers who wear multiple hats, including those in data engineering, analytics and data science.

Whether you're looking to stand up an analytics dashboard to report on the health of a title or building a recommendation engine for your players, this guide will help you better understand the unique classes required to develop and maintain an effective data, analytics, and AI platform. Key characters include:

**Data Engineers**

Data engineers build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret. Their ultimate goal is to make data accessible so that teams can use it to evaluate and optimize a goal or objective.

-----

**Data Scientists**

Data scientists determine the questions their team should be asking and figure out how to answer those questions using data. They often develop predictive models for theorizing and forecasting.

**Data Analysts**

A data analyst reviews data to identify key insights into a game studio's customers and ways the data can be used to solve problems.

**Learn more about these character classes**

# Diving In

Before we get to the primary use cases of game data, analytics, and AI, we need to cover some basics. That is, the different types of game data and how they are produced, and the subsequent receiving of that data in the cloud to collect, clean, and prepare for analysis.

### Producing game data…

Speaking in generalities, there are four buckets of data as it relates to your video game.

**1. Game Telemetry**

Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine. And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.

Some of the primary metrics that are typically tracked in game telemetry include:

- **Player engagement:** Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.

- **Game progress:** Monitor player progress through different levels and milestones in the game.

- **In-game purchases:** Track the number and value of in-game purchases made by players.

- **Player demographics:** Collect demographic information about players, such as age, gender, location, and device type.

- **Session length:** Monitor the length of each player session, and how often players return to the game.

- **Retention:** Track the percentage of players who return to the game after their first session.

-----

- **Player behavior:** Track in-game player actions, such as the types of actions taken, the number of deaths, and the use of power-ups.

- **User Acquisition:** Track the number of new players acquired through different marketing channels.

**2. Business KPIs**

The second bucket of data is business key performance indicators (or KPIs). Business KPIs are metrics that measure the performance and success of a video game from a business perspective.
The primary data source for business\n\nKPIs include game telemetry, stores, and marketplaces.\n\nThese KPIs help game studios understand the financial and\n\noperational performance of their games and make informed\n\ndecisions about future development and growth.\n\nSome of the primary business metrics that are typically\n\ntracked include:\n\n- **Revenue:** Track the total revenue generated by the game,\n\nincluding sales of the game itself, in-game purchases,\n\nand advertising.\n\n- **Player Acquisition Cost (CAC):** Calculate the cost\n\nof acquiring a new player, including marketing and\n\nadvertising expenses.\n\n- **Lifetime Value (LTV):** Estimate the amount of revenue a\n\nplayer will generate over the course of their time playing\n\nthe game.\n\n- **Player Retention:** Track the percentage of players who\n\ncontinue to play the game over time, and how long they\n\nplay for.\n\n- **Engagement:** Measure the level of engagement of players\n\nwith the game, such as the number of sessions played,\n\ntime spent playing, and in-game actions taken.\n\n- **User Acquisition:** Track the number of new players\n\nacquired through different marketing channels and the\n\ncost of acquiring each player.\n\n- **Conversion Rate:** Measure the percentage of players who\n\nmake an in-game purchase or complete a specific action.\n\n- **Gross Margin:** Calculate the profit generated by the game\n\nafter subtracting the cost of goods sold, such as the cost\n\nof game development and server hosting.\n\n**3. Game Services**\n\nSimilar to game telemetry, game services provide critical\n\ninfrastructure that requires careful monitoring and management.\n\nThese services include things like game server hosting,\n\n\nand more. Here the source of data is the game services used.\n\nSome of the common metrics game teams typically track for\n\nthese services include:\n\n- **Concurrent Players:** Track the number of players who are\n\nsimultaneously connected to the game servers to ensure\n\nthat the servers have enough capacity to handle the\n\nplayer demand.\n\n- **Server Availability:** Monitor the uptime and downtime of\n\nthe game servers to ensure that players have access to\n\nthe game when they want to play, particularly important\n\nfor global live service games where demand fluctuates\n\nthrought the day.\n\n- **Latency:** Measure the time it takes for data to travel\n\nfrom the player’s device to the game server and back,\n\nto ensure that players have a smooth and responsive\n\ngaming experience.\n\n- **Network Bandwidth:** Monitor the amount of data being\n\ntransmitted between the player’s device and the game\n\nserver to ensure that players have a high-quality gaming\n\nexperience, even on slow internet connections.\n\n- **Live Operations:** Monitor the success of in-game events,\n\npromotions, and other live operations to understand what\n\nresonates with players and what doesn’t.\n\n- **Player Feedback:** Monitor player feedback and reviews,\n\nincluding ratings and comments on social media, forums,\n\nand app stores, to understand what players like and dislike\n\nabout the game.\n\n- **Chat Activity:** Track the number of messages and\n\ninteractions between players in the game’s chat channels\n\nto understand the level of social engagement and\n\ncommunity building in the game.\n\n**4. Data beyond the game**\n\nThe last bucket comes from data sources beyond the video\n\ngame. 
These typically include the following:

- **Social Media Data:** Social media platforms, such as Facebook, Twitter, TikTok and Instagram, can provide valuable insights into player behavior, feedback and preferences, as well as help game teams understand how players are talking about their games online with different communities.

- **Forum Data:** Online forums and discussion boards, such as Reddit and Discord, can be rich sources of player feedback and opinions about the game.

-----

#### The secret to success is bringing all of the disparate data sources together, so you have as complete a 360-degree view as possible of what's happening in and around your game.

- **Player Reviews:** Ratings and reviews on app stores, such as Steam, Epic, Google Play and the Apple App Store, can provide valuable feedback on player experiences and help game teams identify areas for improvement.

- **Third-Party Data:** Third-party data sources, such as market research firms and industry data providers, can provide valuable insights into broader gaming trends and help game teams make informed decisions about their games and marketing strategies.

This is a lot of data. And it's no wonder that studios globally struggle with fragmented views of their audience, with data often outpacing legacy technologies. Today, the need for real-time capabilities and the leap from descriptive to predictive analytics has made it so that data, analytics, and AI are now table stakes for a game to be successful. Tapping into these four buckets of data sources, you'll find actionable insights that drive better understanding of your playerbase, more efficient acquisition, stronger and longer-lasting engagement, and monetization that deepens the relationship with your players. That's what we're going to dig into throughout the rest of this book.

**Let's begin with how to get data out of your game!**

There are a variety of ways to get data out of the game and into cloud resources. In this section, we will provide resources for producing data streams in Unity and Unreal. In addition, we will also provide a generic approach that will work for any game engine, as long as you are able to send HTTP requests.

**Unity**

Since Unity supports C#, you would use a .NET SDK from the cloud provider of your choice. All three major cloud providers have .NET SDKs, and we have linked the documentation for each below. No matter the cloud provider, if you want to use an SDK, you install it through the NuGet package manager into your Unity project. [A walkthrough of how to implement the .NET SDK using AWS](https://www.youtube.com/watch?v=yv4ynyCytdU) is provided here.

- **AWS:** [AWS .NET SDK - Unity considerations](https://docs.aws.amazon.com/sdk-for-net/v3/developer-guide/unity-special.html)

- **GCP:** [GCP .NET SDK Documentation](https://cloud.google.com/dotnet/docs/reference)

- **Azure:** [Azure .NET SDK Overview](https://learn.microsoft.com/en-us/dotnet/azure/sdk/azure-sdk-for-dotnet)

- **Kafka (Open-source alternative):** [Kafka .NET connector](https://github.com/confluentinc/confluent-kafka-dotnet)

From here, the SDK is used to send data to a messaging service.
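To make the hand-off concrete, here is a minimal sketch of what a single telemetry event might look like on its way to an ingestion endpoint. It is written in Python for brevity rather than C#, and the endpoint URL and event fields are illustrative assumptions rather than a specific cloud API; the same JSON payload could just as easily be produced from Unity or Unreal.

```python
import json
import time
import uuid

import requests  # pip install requests

# Illustrative endpoint; in practice this would be an API Gateway URL,
# a Kinesis/Event Hubs/Pub/Sub front door, or a Kafka REST proxy.
INGEST_URL = "https://example.com/telemetry/ingest"


def send_event(player_id: str, event_type: str, properties: dict) -> None:
    """Send one telemetry event as JSON over HTTPS (hypothetical endpoint)."""
    event = {
        "event_id": str(uuid.uuid4()),
        "event_type": event_type,              # e.g., "level_complete", "iap_purchase"
        "player_id": player_id,
        "client_ts": int(time.time() * 1000),  # client-side timestamp in ms
        "properties": properties,
    }
    response = requests.post(
        INGEST_URL,
        data=json.dumps(event),
        headers={"Content-Type": "application/json"},
        timeout=5,
    )
    response.raise_for_status()


send_event("player-123", "level_complete", {"level": 7, "duration_s": 312})
```

In a real client, events are usually batched and sent asynchronously so telemetry never blocks the game loop.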
These messaging services will be covered in more detail in the next section.

**Unreal Engine**

Unreal supports development with C++, so you could use C++ SDKs or Blueprint interfaces to those SDKs. The resources for each SDK are provided here:

- **AWS:** [How to integrate AWS C++ SDK with Unreal Engine](https://aws.amazon.com/blogs/gametech/how-to-integrate-the-aws-c-sdk-with-unreal-engine/)

- **Azure:** [Azure C++ SDK with PlayFab](https://learn.microsoft.com/en-us/gaming/playfab/sdks/unreal/)

- **Kafka (Open-source alternative):** [Getting started with Kafka and C++](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)

Just like with the Unity example above, from here the data is sent to a messaging/streaming service.

Other engines may not support C++ or C#, but there is still a way to get your data into the cloud, no matter the language! By hitting an API Gateway with an HTTP POST request, you are able to send data to cloud services from many more types of applications. A sample high-level architecture of this solution in AWS and Azure can be seen below:

_[Sample architecture diagrams: AWS and Azure]_

-----

Once the data has been sent from the game into an event-streaming service, how do we get that data to a more permanent home? Here we will start by outlining what these messaging services do and how we can use them to point our data to a desired location.

Messaging services ingest real-time event data being streamed to them from a number of different sources, and then send them to their appropriate target locations. These target locations can be databases, compute clusters or cloud object stores. A key property of messaging services is that they preserve the time at which events arrive, so the order in which events occurred is always known.

Examples of cloud messaging services include AWS Kinesis Data Firehose, Google Pub/Sub, and Azure Event Hubs. If you prefer to use open-source products, Apache Kafka is a very popular open-source alternative.

### Getting data from your game to the cloud

Moving to the cloud platform part of the journey involves building a gaming Lakehouse. The gaming Lakehouse allows gaming companies to store, manage, and analyze large volumes of gaming data, such as player behavior, performance metrics, and financial transactions, to gain valuable insights and make data-driven decisions to improve their business outcomes.

Next, here are the basics of the Databricks platform, simplified.

**Data Ingestion:**

- Data can be ingested into the gaming Lakehouse using various built-in data ingestion capabilities provided by Databricks, such as Structured Streaming and Delta Live Tables, for a single simple API that handles streaming or batch pipelines.

- Data can be ingested in real-time or batch mode from various sources such as game clients, servers or APIs. Data can be cleaned, transformed and enriched with additional data sources, making it ready for analysis.

**Data Storage:**

- Data is stored in object storage such as S3, Azure Storage or GCP Buckets using Delta Lake.

- Delta Lake is an open-source storage framework that makes it easy to maintain data consistency and track changes.

**Data Governance & Cataloging:**

- Unity Catalog in Databricks provides tools for data governance that help with compliance and controlling access to data in the lake.

- Unity Catalog also allows you to track data lineage, auditing and data discovery with the use of data catalogs and governance.

- Metadata about the data, including the structure, format, and location of the data, can be stored in a data catalog.

**Data Quality:**

- The Databricks platform enables you to validate, clean and enrich data using built-in libraries and rule-based validation with Delta Live Tables.

- It also allows tracking data quality issues and missing values by using Delta Live Tables.

**Data Security:**

- Databricks provides a comprehensive security model to secure data stored in the lake.

- Access to data can be controlled through robust access controls on objects such as catalogs, schemas, tables, rows, columns, models, experiments, and clusters.

**Analytics:**

- The processed data can be analyzed using various tools provided by Databricks such as SQL dashboards, notebooks, visualizations and ML.

- Game studios can gain insights into player performance and behavior to better engage players and improve their games.

**Get started with your preferred cloud**
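As a concrete, deliberately simplified illustration of the ingestion step described above, the sketch below streams raw JSON telemetry that has landed in cloud storage into a Delta table using Auto Loader and Structured Streaming from a Databricks notebook. The storage paths, catalog, schema, and table name are illustrative assumptions.

```python
# Minimal ingestion sketch on Databricks (paths and names are hypothetical).
# `spark` is the SparkSession automatically provided in a Databricks notebook.
raw_events = (
    spark.readStream
    .format("cloudFiles")                                     # Auto Loader
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/game/telemetry/_schema")
    .load("/Volumes/game/telemetry/landing/")                 # files delivered by the messaging layer
)

(
    raw_events.writeStream
    .option("checkpointLocation", "/Volumes/game/telemetry/_checkpoints/bronze_events")
    .trigger(availableNow=True)                               # process what has arrived, then stop
    .toTable("game_analytics.telemetry.bronze_events")        # managed Delta table
)
```

The same flow could also be expressed declaratively with Delta Live Tables; the point here is simply that streaming and batch ingestion share one API.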
-----

# The Value of Data Throughout the Game Development Lifecycle

### Lifecycle overview

Over the last decade, the way games have been developed and monetized has changed dramatically. Most if not all top-grossing games are now built using a games-as-a-service strategy, meaning titles are shipped in cycles of constant iteration to increase engagement and monetization of players over time. Games-as-a-Service models have the ability to create sticky, high-margin games, but they also heavily depend on cloud-based services such as game play analytics, multiplayer servers and matchmaking, player relationship management, performance marketing and more.

Data plays an integral role in the development and operation of video games. Teams need tools and services to optimize player lifetime value (LTV) with databases that can process terabytes to petabytes of evolving data, analytics solutions that can access that data with near real-time latency, and machine learning (ML) models that can translate insights into actionable and innovative gameplay features.

A game's development lifecycle is unique to each studio. With different skillsets, resources, and genres of games, there is no one model. Below is a simplified view of a game development lifecycle for a studio running a games-as-a-service model.

What's important to remember is that throughout your title's development lifecycle, there is data that can help you better understand your audience, more effectively find and acquire players, and more easily activate and engage them.
Whether using game play data to optimize creative decision making during pre-production, tapping machine learning models to predict and prevent churn, or identifying the next best offer or action for your players in real time, **data is your friend**.

## Game Development Lifecycle

_Games-as-a-Service (GaaS) / Games-as-a-Community (GaaC)_

**1. Pre-Production**

Brainstorm how to give life to the many ideas laid out in the planning phase

**2. Production**

Most of the time, effort, and resources spent on developing video games are spent in the production stage

**3. Testing**

Every feature and mechanic in the game needs to be tested for game loop and quality control

**4. Launch**

Whether developing alongside the community with alpha and beta releases, or launching into general availability, a game launch is a critical milestone

**5. Operation**

As studios increasingly adopt games-as-a-service models, the ongoing operation of a video game is as critical as the launch itself

_Diagram phases include: discovery and compatibility, integration, release, publish, awareness, onboarding, build and test, flighting and experimentation, operate, measure, engage, and monetize._

### Use data to develop a next-generation customer experience

In the game industry, customer experience (CX) is an important factor that can impact a player's enjoyment of a game and the length they choose to play that game over time. In today's highly competitive and fast-paced games industry, a game studio's ability to deliver exceptional and seamless customer experiences can be a strategic differentiator when it comes to cutting through the noise and winning a gamer's attention. Here are a few ways data, analytics, and AI can help drive value through customer experience:

`1.` **Personalization:** Game studios can use data analytics and machine learning to personalize the game experience for each player based on their preferences and behavior. This can include personalized recommendations for content, in-game events, and other features that are tailored to the player's interests.

`2.` **Omnichannel support:** Players often use multiple channels, such as social media, forums, and in-game support, to communicate with game studios. Next-generation customer experience involves providing a seamless and integrated support experience across all these channels in near-real time.

`3.` **Continuous improvement:** Game studios can use data and feedback from players to continuously improve the game, gathering feedback on new features and using it to refine and optimize the game over time.

In summary, defining what a next-generation customer experience looks like for your game is important because it can help you create a more personalized, seamless, and enjoyable experience for your players, which can lead to increased engagement, monetization, and loyalty.
There are many\n\nways teams can use data throughout a game’s development\n\nlifecycle, but far and away the most valuable focus area will be\n\nin building and refining the customer experience.\n\nThroughout the rest of this guide, we will dig into the most\n\ncommon use cases for data, analytics, and AI in game\n\ndevelopment, starting with where we recommend everyone\n\nbegins: game analytics.\n\n\n# Getting Started with Gaming Use Cases\n\n\n### Where do I start? Start with game analytics\n\n**Overview**\n\nBig question: Where’s the best place to start when it comes\n\nto game data, analytics, and AI? For most game studios,\n\nthe best place to start is with game analytics. Setting up a\n\ndashboard for your game analytics that helps you correlate\n\ndata across disparate sources is infinitely valuable in a world\n\n\nwhere there is no one gaming data source to rule them all.\n\nAn effective dashboard should include your game telemetry\n\ndata, data from any game services you’re running, and data\n\nsources outside of your game such as stores, marketplaces,\n\nand social media. See below.\n\n**What we’re trying to solve/achieve**\n\nGetting a strong foundation in game analytics unlocks more\n\nadvanced data, analytics, and AI use cases. For example,\n\nconcurrent player count plus store and marketplace data\n\n\n**GAME TELEMETRY**\n\n\n**Data Sources**\n\n**GAME SERVICES** **OTHER SOURCES**\n\n\n-----\n\nand lifetime value. Usage telemetry combined with crash\n\nreporting and social media listening helps you more quickly\n\nuncover where players might be getting frustrated. And\n\ncorrelating chat logs, voice transcriptions, and or discord\n\n\nthat are relevant and engaging to your players, giving you\n\ntools to effectively market and monetize with your audience.\n\n**Let’s start with Player Segmentation.**\n\n\nand reddit forums can help you identify disruptive behavior\n\n\nbefore it gets out of hand, giving you the tools to take\n\nactionable steps to mitigate toxicity within your community.\n\n**Get started and set up your Analytics Dashboard**\n\n### Understand your audience\n\nWith your analytics pipelines set up, the first area of focus is to\n\nbetter understand your audience. This can help you inform a\n\nvariety of key business decisions, from the highest macro order\n\nof “what game(s) to develop”, to how to market and monetize\n\nthose games, and how to optimize the player experience.\n\nBy understanding the demographics, preferences, and\n\nbehaviors of their audience, a game studio can create games\n\nthat are more likely to appeal to their target market and be\n\nsuccessful. You can also use this understanding to tailor your\n\nmarketing and monetization strategies to the needs and\n\npreferences of your players.\n\nAdditionally, understanding your audience can help you\n\n\n##### Player Segmentation\n\n**Overview**\n\nPlayer segmentation is the practice of dividing players\n\ninto groups based on shared characteristics or behaviors.\n\nSegmentation has a number of benefits. You can better\n\nunderstand your players, create more personalized content,\n\nimprove player retention, and optimize monetization, all of\n\nwhich contributes to an improved player experience.\n\n**What we’re trying to solve/achieve**\n\nThe primary objective of segmentation is to ensure you’re\n\nnot treating your entire playerbase the exact same. Humans\n\nare different, and your players have different motivations,\n\npreferences and behaviors. 
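One common way to surface those differences is unsupervised clustering on behavioral features. The sketch below groups players with k-means; the feature names, sample values, and cluster count are illustrative assumptions, not a prescribed segmentation model.

```python
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Hypothetical per-player behavioral features aggregated from telemetry.
players = pd.DataFrame({
    "player_id": ["p1", "p2", "p3", "p4", "p5", "p6"],
    "sessions_per_week": [12, 2, 7, 1, 15, 3],
    "avg_session_minutes": [45, 10, 30, 8, 60, 12],
    "total_spend_usd": [120.0, 0.0, 25.0, 0.0, 300.0, 5.0],
    "days_since_last_login": [1, 20, 3, 45, 0, 14],
})

features = players.drop(columns=["player_id"])
scaled = StandardScaler().fit_transform(features)

# Four segments is an arbitrary starting point; tune it with an elbow or silhouette analysis.
players["segment"] = KMeans(n_clusters=4, n_init=10, random_state=42).fit_predict(scaled)
print(players[["player_id", "segment"]])
```

In practice you would profile each cluster (spend, engagement, churn risk) and give it a human-readable label before acting on it.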
Recognizing this and engaging\n\nwith them in a way that meets them where they’re at\n\nis one of the most impactful ways you can cultivate\n\nengagement with your game. As we mentioned above,\n\nthe benefits of segmentation are broad reaching. Through\n\nbetter understanding of your playerbase, you can better\n\npersonalize experiences, tailoring content and customer\n\nexperience to specific groups of players that increases\n\nengagement and satisfaction. Better understanding of\n\nyour players also helps in improving player retention. By\n\nidentifying common characteristics of players who are at\n\nrisk of churning (i.e., stopping play), you can develop targeted\n\nstrategies that only reach specific audiences.\n\nCreate advanced customer segments to build out more\n\neffective user stories, and identify potential purchasing\n\npredictions based on behaviors. Leverage existing sales\n\ndata, campaigns and promotions systems to create robust\n\nsegments with actionable behavior insights to inform your\n\nproduct roadmap. You can then use this information to build\n\nuseful customer clusters that are targetable with different\n\npromos and offers to drive more efficient acquisition and\n\ndeeper engagement with existing players.\n\n\nidentify potential pain points or areas for improvement\n\n\nwithin your games, allowing you to proactively make changes\n\n\n**Get started with Player Segmentation**\n\n\nto address these issues and improve the player experience\n\nbefore a player potentially churns.\n\n\n-----\n\n**Overview**\n\nPlayer lifetime value (LTV) is a measure of the value that a\n\nplayer brings to a game over the lifetime they play that game.\n\nIt is typically calculated by multiplying the average revenue\n\nper user (ARPU) by the average player lifespan. For example,\n\nif the average player spends $50 per year and plays the\n\ngame for 2 years, their LTV would be $50 * 2 = $100.\n\n**What we’re trying to solve/achieve**\n\nGame studios care about LTV because it helps them\n\nunderstand the long-term value of their players and make\n\ninformed decisions about how to invest in player acquisition\n\nand retention. For example, if the LTV of a player is higher\n\nthan the cost of acquiring them (e.g., through advertising),\n\nit may be worth investing more in player acquisition. On the\n\nother hand, if the LTV of a player is lower than the cost of\n\nacquiring them, it may be more cost-effective to focus on\n\nretaining existing players rather than acquiring new ones.\n\nLTV is one of the more important metrics that game studios,\n\nparticularly those building live service games, can use to\n\nunderstand the value of their players. It is important to\n\nconsider other metrics as well, such as player retention,\n\nmonetization, and engagement.\n\n**Get started with Player Lifetime Value**\n\n##### Social Media Monitoring\n\n**Overview**\n\nAs the great Warren Buffet once said, “It takes 20 years to\n\nbuild a reputation and five minutes to ruin it. If you think\n\nabout that, you’ll do things differently.” Now more than ever,\n\npeople are able to use social media and instantly amplify\n\ntheir voices to thousands of people who share similar\n\ninterests and hobbies. Take Reddit as an example. 
r/gaming,\n\nthe largest video game community (also called a subreddit)\n\nhas over 35 million members with nearly 500 new posts\n\nand 10,000 new comments per day, while over 120 game-\n\nspecific subreddits have more than 10,000 members each,\n\nthe largest being League of Legends with over 700,000\n\nmembers. The discourse that takes place on online social\n\nplatforms generates massive amounts of raw and organic\n\n\nbe used to understand how customers think and discover\n\nexactly what they want.\n\nThe act and process of monitoring content online across the\n\ninternet and social media for keyword mentions and trends\n\nfor downstream processing and analytics is called media\n\nmonitoring. By applying media monitoring to social media\n\nplatforms, game developers are able to gain new advantages\n\nthat previously might not have been possible, including:\n\n- Programmatically aggregate product ideas for new\n\nfeature prioritization\n\n- Promote a better user experience by automatically\n\nresponding to positive or negative comments\n\n- Understand the top influencers in the industry who can\n\nsway public opinion\n\n- Monitor broader industry trends and emerging segments\n\nsuch as free-to-play games\n\n- Detect and react to controversies or crises as they begin\n\n- Get organic and unfiltered feedback of games and features\n\n- Understand customer sentiment at scale\n\n- Make changes faster to keep customer satisfaction high\n\nand prevent churn\n\nBy failing to monitor, understand, and act on what customers\n\nare saying about the games and content you release as\n\nwell as broader industry trends, you risk those customers\n\nleaving for a better experience that meets the demands and\n\nrequirements of what customers want.\n\n**What we’re trying to solve/achieve**\n\nBy monitoring and listening to what existing and potential\n\ncustomers are saying on social media, game developers\n\nare able to get a natural and organic understanding of how\n\ncustomers actually feel about the games and products they\n\nrelease, or gauge consumer interest before investing time\n\nand money in a new idea. The main process for social media\n\nmonitoring is to gather data from different social media\n\nplatforms, such as Twitter or YouTube, process those comments\n\nor tweets, then take action on the processed data. While\n\ncustomer feedback can be manually discovered and processed\n\nin search of certain keyword mentions or feedback, it is a much\n\nbetter idea to automate it and do it programmatically.\n\n**Get started with Social Media Monitoring**\n\n\n-----\n\n**Overview**\n\nPlayer feedback analysis is the process of collecting,\n\nanalyzing, and acting on player feedback to inform game\n\ndevelopment. It involves collecting player feedback from\n\nmultiple sources, such as in-game surveys, customer\n\nsupport tickets, social media, marketplace reviews, and\n\nforums, and using data analytics tools to identify patterns,\n\ntrends, and insights. The goal of player feedback analysis is\n\nto better understand player needs, preferences, and pain\n\npoints, and use this information to inform game development\n\ndecisions and improve the overall player experience.\n\nPlayer feedback analysis is an important part of game\n\ndevelopment as it helps ensure that the game continues to\n\nmeet player needs and expectations. 
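If you want a feel for what the analysis step can look like, the short sketch below scores a handful of made-up review snippets with an off-the-shelf sentiment model from the Hugging Face `transformers` library; the default model it downloads is a convenient starting point rather than a recommendation, and topic bucketing would be layered on top of this.

```python
# pip install transformers torch
from transformers import pipeline

# Downloads a small default sentiment model on first run.
sentiment = pipeline("sentiment-analysis")

reviews = [
    "Matchmaking takes forever since the last patch.",
    "The new co-op mode is fantastic, my friends and I love it.",
    "Crashes every time I open the inventory on mobile.",
]

for review, score in zip(reviews, sentiment(reviews)):
    print(f"{score['label']:>8}  {score['score']:.2f}  {review}")
```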
By regularly collecting and analyzing player feedback, game studios can make data-driven decisions to improve the game, increase player engagement and retention, and ultimately drive success and growth.

For this use case, we're going to focus on taking online reviews for your video game and categorizing the different topics players are talking about (bucketing topics) in order to better understand the themes (via positive or negative sentiment) affecting your community.

**What we're trying to solve/achieve**

This is incredibly helpful, providing data-driven customer insight into your development process. Whether used in pre-production, such as looking at games that are similar with reviews to learn where those games have strengths and weaknesses; or using player feedback analysis with a live service title to identify themes that can apply to your product roadmap, player feedback analysis helps teams better support and cultivate engagement with the player community.

Ultimately, player feedback analysis does two things. 1) It can help you stack rank themes according to positive and negative sentiment, and 2) you can weight those themes according to impact on player engagement, toxicity, monetization, churn, and more. We've all read reviews that are overly positive, or overly negative. The process of player feedback analysis helps to normalize feedback across the community (keeping in mind, only for those who have written a review), so you're not over-indexing on one review, or a single theme that may seem in the moment very pressing.

**Get started with Player Feedback Analysis**

##### Toxicity Detection

**Overview**

Across massively multiplayer online video games (MMOs), multiplayer online battle arena games (MOBAs) and other forms of online gaming, players continuously interact in real time to either coordinate or compete as they move toward a common goal — winning. This interactivity is integral to game play dynamics, but at the same time, it's a prime opening for toxic behavior — an issue pervasive throughout the online video gaming sphere.

Toxic behavior manifests in many forms, such as the varying degrees of griefing, cyberbullying and sexual harassment that are illustrated in the matrix below from [Behaviour Interactive](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant), which lists the types of interactions seen within the multiplayer game, _Dead by Daylight_.

_[Matrix: toxic interactions observed in Dead by Daylight, arranged from less toxic to most toxic for both Survivors and Killers. Behaviors include gen rushing, hiding, looping, activating emotes, rush unhooking, blinding, sandbagging, teabagging, text chatting, reporting, hatch camping, farming, disconnecting, camping, dribbling, tunneling, lobby dodging, body blocking, face camping, slugging, and being away from keyboard (AFK).]_

In addition to the [personal toll](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) that toxic behavior can have on gamers and the community -- an issue that cannot be
Given this, it’s essential for studios\n\nto integrate analytics into games early in the development\n\nlifecycle and then design for the ongoing management of\n\ntoxic interactions.\n\n**What we’re trying to solve/achieve**\n\nToxicity in gaming is clearly a multifaceted issue that\n\nhas become a part of video game culture and cannot be\n\naddressed universally in a single way. That said, addressing\n\ntoxicity within in-game chat can have a huge impact given\n\nthe frequency of toxic behavior and the ability to automate\n\nthe detection of it using natural language processing (NLP). In\n\nsummary, by leveraging machine learning to better identify\n\ndisruptive behavior so that better-informed decisions\n\naround handling actions can be made.\n\n**Get started with Toxicity Detection**\n\n\n##### Multi-Touch Attribution\n\n**Overview**\n\nMulti-touch attribution is a method of attributing credit to\n\ndifferent marketing channels or touchpoints that contribute to\n\na sale or conversion. In other words, it is a way of understanding\n\nhow different marketing efforts influence a customer’s decision\n\nto make a purchase or take a desired action.\n\nThere are a variety of different attribution models that can\n\nbe used to assign credit to different touchpoints, each with\n\nits own strengths and limitations. For example, the last-\n\nclick model attributes all credit to the last touchpoint that\n\nthe customer interacted with before making a purchase,\n\nwhile the first-click model attributes all credit to the first\n\ntouchpoint. Other models, such as the linear model or\n\nthe time decay model, distribute credit across multiple\n\ntouchpoints based on different algorithms.\n\n**What we’re trying to solve/achieve**\n\nMulti-touch attribution can be useful for game studios because\n\nit can help them understand which marketing channels or\n\nefforts are most effective at driving conversions and inform their\n\nmarketing strategy. However, it is important to choose the right\n\nattribution model for your title based on your business model\n\n(one-time purchase, subscription, free-to-play, freemium,\n\nin-game advertising, etc.) and regularly review and optimize your\n\nattribution efforts to ensure they are accurate and effective.\n\n**Get started with Multi-Touch Attribution**\n\n\n-----\n\n### Activating Your Playerbase\n\nSo far, we’ve discussed how to better understand your\n\nplayers, and how to acquire more of your target audience.\n\nNext, we’re going to dig into how to better activate your\n\nplayers to create a more engaged and loyal playerbase that\n\nstays with your game for the long-term. Here, we’re going to\n\nfocus on strategies that differentiate your gamer experience.\n\n##### Player Recommendations\n\n\nand make in-game purchases. Additionally, personalized\n\nrecommendations can help improve the overall player\n\nexperience and increase satisfaction.\n\nGame studios can use a variety of techniques to create player\n\nrecommendations, such as machine learning algorithms,\n\ncollaborative filtering, and manual curation. It is important\n\nto regularly review and optimize these recommendations to\n\nensure that they are effective and relevant to players.\n\n**Get started with Player Recommendations**\n\n\n**Overview**\n\nPlayer recommendations are suggestions for content or actions\n\n\nthat a game studio makes to individual players based on their\n\ninterests and behaviors. 
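As a toy illustration of one technique mentioned later in this section (collaborative filtering in its simplest form), the sketch below recommends items based on co-occurrence in purchase history; the players, items, and counts are entirely made up.

```python
from collections import Counter, defaultdict

# Hypothetical purchase history: player -> items they own.
purchases = {
    "p1": {"starter_pack", "dragon_mount", "xp_boost"},
    "p2": {"starter_pack", "xp_boost"},
    "p3": {"dragon_mount", "skin_neon"},
    "p4": {"starter_pack", "skin_neon", "xp_boost"},
}

# Count how often each pair of items is owned together.
co_occurrence = defaultdict(Counter)
for items in purchases.values():
    for item in items:
        for other in items - {item}:
            co_occurrence[item][other] += 1


def recommend(player_id: str, top_n: int = 3) -> list[str]:
    """Rank items the player does not own by co-occurrence with items they do own."""
    owned = purchases[player_id]
    scores = Counter()
    for item in owned:
        for other, count in co_occurrence[item].items():
            if other not in owned:
                scores[other] += count
    return [item for item, _ in scores.most_common(top_n)]


print(recommend("p2"))  # e.g., ['skin_neon', 'dragon_mount']
```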
These recommendations can be used\n\nto promote specific in-game items, encourage players to try\n\nnew features, or simply provide a personalized experience.\n\n**What we’re trying to solve/achieve**\n\nPlayer recommendations matter to game studios because\n\nthey can help improve player retention, engagement, and\n\nmonetization. By providing players with recommendations\n\nthat are relevant and engaging, studios can increase the\n\nlikelihood that players will continue to play their games\n\n\n##### Next Best Offer/Action\n\n**Overview**\n\nNext best offer (NBO) and next best action (NBA) are\n\ntechniques that businesses use to make personalized\n\nrecommendations to their customers. NBO refers to the\n\npractice of recommending the most relevant product or\n\nservice to a customer based on their past purchases and\n\nbehaviors. NBA refers to the practice of recommending the\n\nmost relevant action or interaction to a customer based on\n\nthe same information.\n\n\n-----\n\nin-game purchase to a player based on their past spending\n\nhabits and the items they have shown an interest in. They\n\nmight use NBA to recommend a specific level or event to a\n\nplayer based on their progress and interests.\n\n**What we’re trying to solve/achieve**\n\nIt’s important to remember that next best offer is a specific\n\nuse case within personalization that involves making\n\nrecommendations to players on the most valuable in-game\n\nitem or action they should take next. For example, a next\n\nbest offer recommendation in a mobile game might suggest\n\nthat a player purchase a specific in-game currency or unlock\n\na new character.\n\nBoth NBO and NBA can be used to improve customer\n\nretention, engagement, and monetization by providing\n\npersonalized recommendations that are more likely to be\n\nrelevant and appealing to individual customers. They can be\n\nimplemented using a variety of techniques, such as machine\n\nlearning algorithms or manual curation.\n\n**Get started with Next Best Offer/Action**\n\n##### Churn Prediction & Prevention\n\n**Overview**\n\nVideo games live and die by their player base. For Games-\n\n\nmay overwhelm the ability of these players to consume,\n\nreinforcing the overall problem of player churn.\n\nAt some point, it becomes critical for teams to take a cold,\n\nhard look at the cost of acquisition relative to the subscriber\n\nlifetime value (LTV) earned. These figures need to be brought\n\ninto a healthy balance, and retention needs to be actively\n\nmanaged, not as a point-in-time problem to be solved, but\n\nas a “chronic condition” which needs to be managed for the\n\nongoing health of the title.\n\nHeadroom for continued acquisition-driven growth can\n\nbe created by carefully examining why some players leave\n\nand some players stay. When centered on factors known\n\nat the time of acquisition, gaming studios may have the\n\nopportunity to rethink key aspects of their acquisition\n\nstrategy that promote higher average retention rates, which\n\ncan lead to higher average revenue per user.\n\n**Prerequisites for use case**\n\nThis use case assumes a certain level of existing data\n\ncollection infrastructure in the studio. Notably, a studio ready\n\nto implement a churn prediction and prevention model\n\nshould have\n\n- A cloud environment where player data is stored\n\n- This source data should contain player behavior and\n\nsession telemetry events from within the game. 
This is\n\nthe foundation that insights can be built on top of.\n\n\nas-a-Service (GaaS) titles, engagement is the most\n\n\nimportant metric a team can measure. Naturally, proactively\n\npreventing churn is critical to sustained engagement and\n\n\n**Get started with Churn Prediction & Prevention**\n\n\ngrowth. Through churn prediction and prevention, you will\n\n\nbe able to analyze behavioral data to identify subscribers\n\nwith an increased risk of churn. Next, you will use machine\n\nlearning to quantify the likelihood of a subscriber to churn, as\n\nwell as indicate which factors create that risk.\n\n**What we’re trying to solve/achieve**\n\nBalancing customer acquisition and retention is critical.\n\nThis is the central challenge to the long-term success of\n\nany live service game. This is particularly challenging in that\n\nsuccessful customer acquisition strategies needed to get\n\ngames to scale tend to be followed by service disruptions or\n\ndeclines in quality and customer experience, accelerating\n\nplayer abandonment. To replenish lost subscribers, the\n\nacquisition engine continues to grind and expenses mount.\n\nAs games reach for customers beyond the core playerbase\n\nthey may have initially targeted, the title may not resonate\n\n\n##### Real-time Ad Targeting\n\n**Overview**\n\nReal-time ad targeting in the context of game development\n\nfocuses on using data to deliver personalized and relevant\n\nadvertisements to players in near real-time, while they are\n\nplaying a game. Real-time targeting is performanced based,\n\nusing highly personalized messagings which are achieved\n\nby using data to precisely determine the most opportune\n\nmoments to display ads, based on factors such as player\n\nbehavior, game state, and other contextual information.\n\nKnowing when to send those ads is based on data. This\n\nuse case is specific to titles using in-game advertising as a\n\nbusiness model. It’s important to note that in-game real-\n\ntime ad targeting requires a sophisticated tech stack, with\n\n\n-----\n\nwith bigger ad ecosystem, ad networks and partners. The\n\nDatabricks Lakehouse platform is an optimal foundation as it\n\nalready contains many of the connectors required to enable\n\nthis use case.\n\n**What we’re trying to solve/achieve**\n\nThe goal of in-game real-time ad targeting is to provide a\n\nmore immersive and relevant advertising experience for\n\nplayers, while also increasing the effectiveness of the ads\n\nfor advertisers. By delivering targeted ads that are relevant\n\nto each player’s interests, game developers can create a\n\nmore enjoyable and personalized gaming experience, which\n\ncan help to reduce churn and increase the lifetime value of\n\neach player. Additionally, real-time ad targeting can also help\n\ngame developers monetize their games more effectively, as\n\nadvertisers are willing to pay a premium for hyper-targeted\n\nand engaged audiences.\n\n**Get started with Real-time Ad Targeting**\n\n### Operational use cases\n\nIn the game development industry, operational analytics\n\n\n**Overview**\n\nAnomaly detection plays an important role in the operation\n\nof a live service video game by helping to identify and\n\ndiagnose unexpected behaviors in real-time. By identifying\n\npatterns and anomalies in player behavior, system\n\nperformance, and network traffic, this information can then\n\nbe used to detect and diagnose server crashes, performance\n\nbottlenecks, and hacking attempts. 
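A very small example of the idea: flag points where a server metric drifts several standard deviations away from its recent rolling baseline. The metric, window, and threshold below are illustrative assumptions, and production systems typically use more robust approaches (seasonality-aware models, isolation forests, and so on).

```python
import numpy as np
import pandas as pd

# Hypothetical per-minute server latency readings (ms).
rng = np.random.default_rng(7)
latency = pd.Series(rng.normal(80, 5, 240))
latency.iloc[200:206] += 60              # inject an incident

rolling = latency.rolling(window=30, min_periods=30)
z_score = (latency - rolling.mean()) / rolling.std()

anomalies = latency[z_score.abs() > 3]   # threshold chosen for illustration
print(anomalies)
```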
The ability to understand\n\nif there will be an issue before it becomes widespread is\n\nimmensely valuable. Without anomaly detection, which is\n\na form of advanced analytics, you’re always in a reactive\n\n(rather than proactive) state. Anomaly detection is a type of\n\nquality of service solution.\n\n**What we’re trying to solve/achieve**\n\nThe goal of anomaly detection is to ensure that players\n\nhave a stable and enjoyable gaming experience. This has\n\nan impact across your game, from reducing downtime,\n\nto minimizing player churn, and improving your game’s\n\nreputation and revenue. Additionally, the insights gained from\n\nanomaly detection can also be used to mitigate cheating and\n\ndisruptive behavior.\n\n**Get started with Anomaly Detection**\n\n\nare essential for ensuring a smooth and efficient production\n\n\nprocess. One common use case is anomaly detection, where\n\ndata analytics is utilized to identify any unusual patterns\n\nor behaviors in the game, such as crashes or performance\n\nissues. This helps developers quickly identify and fix\n\nproblems, improving the overall quality of the game. Another\n\nexample is build pipelines, where data analytics can be used\n\nto monitor and optimize the process of creating new builds\n\nof the game. By tracking key metrics such as build time,\n\nerror rates, and resource utilization, developers can make\n\ninformed decisions about how to optimize the build process\n\nfor maximum efficiency. Other operational use cases in game\n\ndevelopment include tracking player behavior, measuring\n\nserver performance, and analyzing sales and marketing data.\n\nLets explore a few of these below.\n\n\n##### Build Pipeline\n\n**Overview**\n\nA build pipeline is a set of automated processes that\n\nare used to compile and assemble the code, assets, and\n\nresources that make up a game project. The build pipeline\n\ntypically includes several stages, such as code compilation,\n\noptimization, testing, and release. The purpose of a build\n\npipeline is to streamline the game development process\n\nand ensure that each stage of development is completed\n\nefficiently and effectively. A build pipeline can be configured\n\nto run automatically, so that new builds are generated\n\nwhenever changes are made to the code or assets. This\n\nhelps to ensure that the game is always up-to-date and\n\nready for testing and release. The logs are collected are in\n\nnear-real time from build servers. A simplified example:Dev\n\nX is committing code on title Y, submitted on day Z,\n\nalong with the log files from the pipeline and build server.\n\nBuilds typically take multiple hours to complete, requiring\n\nsignificant amounts of compute via build farms. Being able to\n\n\n-----\n\nare wasting compute, and being able to predict which builds\n\nwill fail as they goes through the pipeline are ways to curb\n\noperational expenses.\n\n**What we’re trying to solve/achieve**\n\nWith this use case, we’re seeking to reduce wasted compute\n\nand build a foundational view of what was developed, by\n\nwho, when and how testing performed. In an ideal state, our\n\nautomated build pipeline could send a notification to the\n\ndeveloper with a confidence metric on the build making it\n\nthrough, allowing them to decide whether to continue or\n\nmove another build through the pipeline. Often, developers\n\ndo not have clear visibility until the build has completed\n\nor failed. 
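One way to produce the confidence metric described above, sketched here under heavy assumptions, is to train a classifier on historical build metadata and surface a predicted failure probability when a new build enters the pipeline; the features, sample data, and model choice are illustrative only.

```python
import pandas as pd
from sklearn.ensemble import RandomForestClassifier

# Hypothetical history of past builds and their outcomes (1 = failed).
history = pd.DataFrame({
    "files_changed":      [3, 42, 7, 120, 15, 5, 64, 2],
    "lines_changed":      [40, 2100, 300, 9800, 650, 90, 4100, 12],
    "touched_build_conf": [0, 1, 0, 1, 0, 0, 0, 0],
    "failed":             [0, 1, 0, 1, 0, 0, 1, 0],
})

model = RandomForestClassifier(n_estimators=100, random_state=0)
model.fit(history.drop(columns=["failed"]), history["failed"])

# Score an incoming build before it spends hours on the build farm.
new_build = pd.DataFrame([{"files_changed": 38, "lines_changed": 1900, "touched_build_conf": 1}])
print(f"Predicted failure probability: {model.predict_proba(new_build)[0, 1]:.0%}")
```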
By providing more insight to devs into the build\n\npipeline process, we can increase the rate at which builds\n\nare completed efficiently and effectively.\n\n**Get started with Build Pipeline**\n\n##### Crash Analytics\n\n\nresources were being used. How long crash testing takes\n\ncan vary, depending on the game’s business model, amount\n\nof content, and scope. For a title with a one-time release,\n\nwhere there is a large amount of content and a complex\n\nstoryline, the chances of hidden crashes causing errors while\n\nin development are high, making it require more time to\n\nperform testing before the game can be published. For titles\n\nbuilt in a game-as-a-service model, i.e. a game shipped in\n\ncycles of constant iteration, crash detection should be done\n\ncontinuously, since errors in newly released content might\n\naffect the base game and lead to crashes.\n\nIncreasingly, titles are being released in alpha (where\n\ndevelopers do the testing), closed beta (which includes a\n\nlimited group of testers/sample-users who do the gameplay\n\ntesting) and open betas (where anyone interested can register\n\nto try the game). All of which happens before the game is\n\n“officially” released. Regardless of alpha, beta, or GA, players\n\nmay stumble over game crashes, which triggers crash reports\n\nthat are sent to the developers for fixing. But sometimes, it\n\ncan be challenging to understand the issue that caused the\n\ncrash from crash reports provided by your game’s platform.\n\n**What we’re trying to solve/achieve**\n\nUltimately, the purpose of crash analytics is to identify the\n\nroot cause of a crash, and help you take steps to prevent\n\nsimilar crashes from happening in the future. This feedback\n\nloop can be tightened through automation in the data\n\npipeline. For example, by tracking crashes caused on builds\n\nfrom committers, the data can provide build suggestions\n\nto improve crash rate. Furthermore, teams can automate\n\ndeduplication when multiple players experience the same\n\nerrors, helping to reduce noise in the alerts received.\n\n**Get started with Crash Analytics**\n\n\n**Overview**\n\nGames crash, it is a fact of game development. The\n\ncombination of drivers, hardware, software, and\n\nconfigurations create unique challenges in tracking, resolving\n\nand managing the user experience.\n\nCrash analytics and reporting is the process of collecting\n\ninformation about crashes or unexpected failures in a\n\nsoftware application, in this case, a video game. A crash\n\nreport typically includes information about the state of the\n\ngame at the time of the crash, such as what the player was\n\n\n-----\n\n# Things to look forward to\n\n\nThis eBook was created to help game developers better\n\nwrap their heads around the general concepts in which data,\n\nanalytics, and AI can be used to support the development\n\nand growth of video games. **If you only have 5 minutes,**\n\n**these takeaways are critical to your success** .\n\nFor more information on advanced data, analytics, and AI use\n\ncases, as well as education resources, we highly recommend\n\nDatabricks training portal [dbricks.co/training](http://dbricks.co/training) .\n\n**Top takeaways:**\n\nIf you take nothing else from this guide, here are the most\n\nimportant takeaways we want to leave with you on your journey.\n\n`1.` **Data is fundamental. 
Data, analytics, and AI play a role**\n\nthroughout the entire game development lifecycle - from\n\ndiscovery to pre-production, development to operating\n\na game as a live service. Build better games, cultivate\n\ndeeper player engagements, and operate more effectively\n\n\nby utilizing the full potential of your data.\n\n`2.` **Define your goals.** Start by establishing the goals of what\n\nyou’re hoping to learn and or understand around your\n\ngame. Clear goals make it easier to identify key metrics\n\nto track, example goals include; developing high-quality\n\ngames that provide engaging and satisfying player\n\nexperiences, increasing player engagement and retention\n\nby analyzing and improving gameplay and mechanics, and\n\nbuilding a strong and positive brand reputation through\n\neffective marketing and community outreach.\n\n`3.` **Identify and understand your data sources.** Spend time\n\nto identify and understand the breadth of data sources\n\nyou are already collecting, be that game telemetry,\n\nmarketplace, game services, or sources beyond the game\n\nlike social media. It is critical to collect the right data, and\n\ntrack the right metrics based on the goals and objectives\n\nyou have set for your game.\n\n`4.` **Start small, and iterate quickly.** Recognize that goals and\n\nobjectives evolve as you learn more about the interaction\n\n\n-----\n\nare most effective when scoped small with tight feedback\n\nloops, allowing you to quickly adapt with your community\n\nand alongside shifting market conditions.\n\n`5.` **Game analytics forms the foundation.** Start by getting a\n\ngame analytics dashboard up and running. The process of\n\nbuilding out a dashboard will naturally require connecting\n\nand transforming your data in a way to unlock more\n\nadvanced use cases down the road.\n\n`6.` **Plan and revisit your data strategy frequently.** Once\n\ndashboarding is set up, you’ll have a better picture of what\n\ndownstream data use cases make the most sense for\n\nyour game and business objectives. As you move to use\n\ncases such as player segmentation, churn analysis, and\n\nplayer lifetime value, revisit your data strategy frequently\n\nto ensure you’re spending time on use cases that drive\n\nactionable insights for you and your team.\n\n`7.` **Show value broad and wide.** Whether your data strategy\n\nis new or well established on the team, build the habit\n\nof communicating broadly to stakeholders across the\n\ncompany. Early in the process, it is important to gather\n\ncritical feedback on what data is helpful and where there\n\nare opportunities for improvement. The worst thing that\n\ncan happen is you create something that no one uses.\n\nThat is a waste of everyone’s time and money.\n\n`8.` **Ask for help.** Engage with your technical partners. There\n\nare humans who can help ensure you’re developing your\n\ndata and analytics platform in a way that is efficient and\n\neffective. There are numerous partners with domain\n\nexpertise in data science and data engineering that can\n\naccelerate your data journey - here is our recommended\n\npartner list for [data, analytics, and AI workloads](https://www.databricks.com/company/partners/consulting-and-si) .\n\n`9.` **Participate in the community.** The community for game\n\nanalytics is large and growing. It is important to research and\n\n\nyour needs and interests. 
Here are a few of our favorites:\n\n`a.` [IGDA Game Analytics](https://igda.org/sigs/analytics/) : The IGDA has a number of\n\nSpecial Interest Groups that bring together user\n\nresearchers, designers, data engineers and data\n\nscientists focused on understanding player behavior\n\nand experiences. They offer resources and events\n\nfor those working in games user research, including a\n\nyearly Games User Research Summit.\n\n`b.` [Data Science Society](https://www.datasciencesociety.net/) : The Data Science Society is a\n\nglobal community of data scientists and engineers.\n\nWhile not specifically focused on game development,\n\nthey offer a wealth of resources and opportunities for\n\nlearning, networking, and collaboration in the field of\n\ndata science.\n\n`c.` [Hugging Face](https://huggingface.co/) : is hub of open source models for Natural\n\nLanguage Processing, computer vision, and other fields\n\nwhere AI plays its role. They also provide an online\n\nplatform where users can access pre-trained models\n\nand tools, share their own models and datasets, and\n\ncollaborate with other developers in the community.\n\n`d.` [Data Engineering subreddit](https://www.reddit.com/r/dataengineering/) : The Data Engineering\n\nsubreddit is a forum for data engineers to discuss\n\ntopics related to building and managing data pipelines,\n\ndata warehousing, and related technologies. While\n\nnot specifically focused on game development, it\n\ncan be a valuable resource for those working on data\n\nengineering in the gaming industry.\n\n`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n\nfirst step in your data journey. Imagine how the output of\n\nyour data can be presented in a way to help stakeholders\n\nacross your company achieve more. For example, dropping\n\ndata into an application that can help game designers\n\nmake balancing decisions based on player events.\n\n\n-----\n\n# APPENDIX Ultimate class build guide\n\n\n### Creating a character\n\nThe heart and soul of mature data teams are formed by this\n\ntrio of classes. There are many aspects to these roles, but\n\nthey can be summarized in that Data Engineers create and\n\nmaintain critical data workflows, Data Analysts interpret data\n\nand create reports that keep the business teams running\n\nseamlessly, and Data Scientists are responsible for making\n\nsense of large amounts of data. Depending on the size of\n\nthe organization, individuals may be required to multiclass\n\nin order to address needs of the team. In smaller studios, it’s\n\noften developers who wear multiple hats, including those in\n\ndata engineering, analytics and data science.\n\nWhether you’re looking to stand-up an analytics dashboard\n\nto report on the health of a title or building a recommendation\n\nengine for your players, this guide will help you better\n\nunderstand the unique classes required to develop and\n\nmaintain an effective data, analytics, and AI platform.\n\n##### Data Engineers\n\n\n**Goals and Priorities of Data Engineers**\n\n- Enable access to usable data for real-time insights — data\n\nthat both enables timely decision-making and is accurate\n\nand reproducible\n\n- Increase user confidence and trust in data. 
This involves\n\nensuring high consistency and reliability in ETL processes\n\n- Limit the issues and failures experienced by other\n\nengineers and data scientists, allowing those roles to\n\nfocus less on troubleshooting and more on drawing\n\nmeaningful conclusions from data and building new\n\nproducts / features\n\n**What Data Engineers care about:**\n\n- Enabling access to data for real-time insights — data that\n\nboth enables timely decision-making and is accurate and\n\nreproducible\n\n- Building high-performance, reliable and scalable pipelines\n\nfor data processing\n\n- Delivering data for consumption from a variety of sources\n\nby Data Analysts and Data Scientists against tight SLAs\n\n- A Data Engineer’s biggest challenge? Collaboration\n\nacross teams\n\n\nData engineers build systems that collect, manage, and\n\n\nconvert source data into usable information for data\n\nscientists and business analysts to interpret. Their ultimate\n\ngoal is to make data accessible so that teams can use it to\n\nevaluate and optimize a goal or objective.\n\n**Responsibilities:**\n\n- Data Engineers are responsible for data migration,\n\nmanipulation, and integration of data (joining dissimilar\n\ndata systems)\n\n- Setup and maintenance of ETL pipelines to convert\n\nsource data into actionable data for insights. It is the\n\nresponsibility of the data engineer to make sure these\n\npipelines run efficiently and are well orchestrated.\n\n- The Data Engineer sets up the workflow process\n\nto orchestrate pipelines for the studio’s data and\n\ncontinuously validates it\n\n- Managing workflows to enable data scientists and data\n\nanalysts, and ensuring workflows are well-integrated with\n\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\n\n\n##### Data Scientists\n\nData scientists determine the questions their team should\n\nbe asking and figure out how to answer those questions\n\nusing data. 
They often develop predictive models for\n\ntheorizing and forecasting.\n\n**Responsibilities:**\n\n- Responsible for making sense of the large amounts of data\n\ncollected for a given game title, such as game telemetry,\n\nbusiness KPIs, game health and quality, and sources\n\nbeyond the game such as social media listening\n\n- The analytics portion of a Data Scientist’s job means\n\nlooking at new and existing data to try and discover new\n\nthings within it\n\n- The engineering component may include writing out\n\npipeline code and deploying it to a repository\n\n- Data Scientists are responding for building, maintaining, and\n\nmonitoring models used for analytics and/or data products\n\n\n-----\n\n**Goals and Priorities:**\n\n- Developing new business capabilities (such as behavioral\n\nsegmentation, churn prediction, recommendations) and\n\noptimizing processes around those capabilities\n\n- Increase ROI by building algorithms and tools that are\n\nmaintainable and reusable\n\n- Exploring (or further expanding) the use of machine\n\nlearning models for specific use cases\n\n- Bridges the gap between engineering and analytics,\n\nbetween the technology teams and business teams\n\n- Provides business side of studio with data that is crucial\n\nin decision-making, for example a churn model that helps\n\npredict the impact of a new feature set\n\n**What Data Scientists care about:**\n\n- Creating exploratory analysis or models to accurately\n\npredict business metrics, e.g., customer spend, churn,\n\netc., and provide data-driven recommendations\n\n- Enable team with actionable insights that are easy to\n\nunderstand and well curated\n\n- Create and move models from experimentation to\n\nproduction\n\n- A Data Scientist’s biggest challenge? Keeping up with\n\nadvancements and innovation in data science, and\n\nknowing which tools and libraries to use\n\n##### Data Analysts\n\nA data analyst reviews data to identify key insights into a\n\ngame studio’s customers and ways the data can be used to\n\nsolve problems.\n\n**Responsibilities:**\n\n- Often serves as the go-to point of contact for non-\n\n\n\n- Analysts often interpret data and create reports or other\n\ndocumentation for studio leadership\n\n- Analysts typically are responsible for mining and\n\ncompiling data\n\n- Streamline and or simplify processes when possible\n\n**Goals and Priorities:**\n\n- Empower stakeholder and business teams with\n\nactionable data\n\n- “Catch things before they break”. Proactively mitigate\n\npotential data issues before they occur (for internal and\n\nexternal customers)\n\n- Analysts are often recruited to assist other teams (i.e., BI\n\nteams) with their domain knowledge\n\n- Driving business impact through documentation and\n\nreliable data\n\n**What Data Analysts care about:**\n\n- Easy access to high quality data.\n\n- Quickly find insights from data with SQL queries and\n\ninteractive visualizations.\n\n- The ability to easily share insights and while creating\n\nimpactful assets for others to consume (dashboards, reports).\n\n- A Data Analyst’s biggest challenge? Working with complex\n\nprocesses and complicated technologies that are filled\n\nwith messy data. While fighting these challenges, Analysts\n\nare often left alone or forced through paths that prevent\n\ncollaboration with others across team/organization.\n\n- Untrustworthy data: often Analysts get asked to provide\n\nanswers to leadership that will leverage the data to\n\ndetermine the direction of the company. 
When the data is\n\nuntrustworthy or incorrect due to previously mentioned\n\nchallenges this can eventually lead to lack of trust in the\n\ndata teams from leadership or the business.\n\n\ntechnical business / operations colleagues for data\n\naccess / analysis questions\n\n\n-----\n\n# Data access and the major cloud providers\n\n\n### Cloud Rosetta Stone\n\n[AWS / Azure / GCP Service Comparison - Click Here](https://cloud.google.com/free/docs/aws-azure-gcp-service-comparison)\n\nIf you are newer to the cloud computing space, it is easy to\n\nget lost between the hundreds of different services between\n\nthe three major cloud providers. The table below is meant to\n\nhighlight the important data, analytics, and AI services used\n\nby the various hyperscale service providers Amazon,\n\nMicrosoft, and Google. In addition, it aims to pair up services\n\nfrom different cloud providers that serve the same purpose.\n\n### Getting started with the major cloud providers\n\nHere are some quick ways to get started with the three major\n\ncloud providers: AWS, Azure, and GCP:\n\n**AWS:**\n\n`1.` **[Create an AWS account](https://portal.aws.amazon.com/billing/signup)** **:** The first step is to create an\n\naccount on the AWS website. This will give you access to\n\nthe AWS Management Console, which is the web-based\n\ninterface for managing your AWS resources.\n\n\n`2.` **Use the AWS free tier:** AWS offers a free tier of service\n\nthat provides a limited amount of free resources each\n\nmonth. This is a great way to get started and try out\n\nvarious AWS services without incurring any charges.\n\n`3.` **Explore the AWS Management Console:** Once you have\n\nan account and are logged in, take some time to explore\n\nthe AWS Management Console and familiarize yourself\n\nwith the various services that are available.\n\n`4.` **Next you can search for Databricks:** In the AWS\n\nManagement Console, use the search bar in the top-left\n\ncorner of the page and search for “Databricks”.\n\n`5.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`6.` **Launch Databricks Workspace:** To launch the Databricks\n\nWorkspace on AWS, you can use the CloudFormation\n\ntemplate provided by Databricks. Databricks\n\nCloudFormation template creates an IAM role, security\n\ngroup, and Databricks Workspace in your AWS account.\n\n**Azure:**\n\n`1.` **[Create an Azure account](https://azure.microsoft.com/en-us/free/gaming/)** **:** The first step is to create\n\nan account on Azure portal. This will give you access to\n\nthe Azure portal, which is the web-based interface for\n\nmanaging your Azure resources.\n\n\n\n\n\n\n\n\n\n\n\n|Service Type|Service Description|AWS Service|Azure Service|GCP Service|\n|---|---|---|---|---|\n|Storage|Object storage for various file types and artifacts (CSV, JSON, Delta, JAR). Objects can be retrieved by other services|Amazon Simple Storage Service (S3)|Azure Blob Storage|Google Cloud Storage|\n|Compute|High-performance VMs to run applications. Platform where data transformations are run in Big Data apps.|Amazon Elastic Compute (EC2)|Azure Virtual Machines|Google Compute Engine|\n|Messaging|Real-time event streaming services to write data to object stores or data warehouses. One OSS version is Kafka|Amazon Kinesis|Azure Service Bus Messaging|Google Pub/Sub|\n|Data Warehouse|Traditional data storage layer for structured data, to then be used by data analysts. 
Often used to read from a Data Lake, which acts as a single source of truth|Redshift or Databricks|Synapse or Databricks|BigQuery or Databricks|\n\n\n-----\n\n**Jargon Glossary**\n\n|CDP|Customer Data Platform (CDP). A CDP is a piece of software that combines data from multiple tools to create a single centralized customer database containing data on all touch points and interactions with your product or service.|\n|---|---|\n|ETL|Extract, Transform, Load. In computing, extract, transform, load is a three-phase process where data is extracted, transformed and loaded into an output data container. The data can be collated from one or more sources and it can also be outputted to one or more destinations|\n|KPI|Key Performance Indicator, a quantifiable measure of performance over time for a specifci objective. KPIs provide targets for teams to shoot for, milestones to gauge progress, and insights that help people across the organization make better decisions.|\n|POC|Proof of Concept (PoC). A proof of concept is a prototype or initial implementation of a solution that is developed to demonstrate the feasibility of a concept or idea. It is often used to test the effectiveness of a new tool or approach to data analysis or machine learning before investing in a full-scale implementation.|\n|MVP|Minimum Viable Product (MVP). An MVP refers to the smallest possible solution that can be delivered to meet a specific business need. The goal of an MVP is to quickly validate assumptions and prove the potential value of a larger project. By delivering a smaller solution first, stakeholders can gain confidence in the project and see a return on investment sooner, while also providing feedback to improve the larger project.|\n|ROI|Return on investment (ROI), which is calculated by dividing the profit earned on an investment by the cost of that investment.|\n|Serverless computing|Using compute platforms that are completely managed by service providers. When using serverless computing, you simply execute queries or deploy applications and the service provider (AWS, Databricks, etc.) handles necessary server maintenance.|\n|VPC|Virtual Private Cloud. A VPC is a virtual cloud networking environment, which helps organize and give you control of your resources. You also define how resources within your VPC can communicate with other regions, VPCs, and the public internet with traffic rules and security groups.|\n\n\n`2.` **Take Azure tutorials:** Azure provides tutorials,\n\ndocumentation, and sample templates to help you get\n\nstarted. These resources can help you understand the\n\nbasics of Azure and how to use its services.\n\n`3.` **You can search for Databricks:** In the Azure portal, use the\n\nsearch bar at the top of the page and search for “Databricks”.\n\n`4.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`5.` **Create a new Databricks workspace:** To create a new\n\nDatabricks workspace, you can use the Azure portal, Azure\n\nCLI or Azure Powershell. 
Once created, you’ll be able to\n\naccess your Databricks Workspace through the Azure portal.\n\n`6.` **Other Azure Services:** Once you have a Databricks\n\nworkspace setup, you can easily connect it to other Azure\n\nServices such as Azure Storage, Event Hubs, Azure Data\n\nLake Storage, Azure SQL and Cosmos DB for example.\n\n\n**GCP:**\n\n`1.` **[Create a GCP account](https://console.cloud.google.com/freetrial)** **:** the first step is to create an\n\naccount on GCP portal. This will give you access to the\n\nGCP Console, which is the web-based interface for\n\nmanaging your GCP resources.\n\n`2.` **Explore the GCP Console:** Once you have an account\n\nand are logged in, take some time to explore the GCP\n\nConsole and familiarize yourself with the various services\n\nthat are available.\n\n`3.` **Search for Databricks:** In the GCP Console, use the search bar\n\nin the top-left corner of the page and search for “Databricks”.\n\n`4.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`5.` **Create a new Databricks workspace:** To create a new\n\nDatabricks workspace, you can use the GCP Console or\n\nthe gcloud command-line tool. Once created, you’ll be\n\nable to access your Databricks Workspace through the\n\nGCP Console.\n\n\n-----\n\n# Detailed Use Cases\n\n\n### Getting started with game analytics\n\nFortunately, standing up an effective analytics dashboard\n\nis getting easier. It all starts with getting your data into an\n\narchitecture that sets your team up for success. Selecting\n\nany of the major cloud providers — [AWS](https://portal.aws.amazon.com/billing/signup) [,](https://portal.aws.amazon.com/billing/signup) [Azure](https://azure.microsoft.com/en-us/free/gaming/) [,](https://azure.microsoft.com/en-us/free/gaming/) [GCP](https://console.cloud.google.com/freetrial) —\n\nyou can land all your data into a cloud data lake, then use\n\nDatabricks Lakehouse architecture to run real-time and\n\nreliable processing. Databricks can then help you visualize\n\nthat data in a dashboard, or send to a visual analytics\n\nplatform, such as Tableau.\n\n`1.` **Sign up for a Databricks account:** You’ll need to create\n\nan account on the Databricks website in order to use the\n\nplatform.\n\n`2.` **Access the Databricks portal:** Interact with the\n\nDatabricks platform and run tasks such as creating\n\nclusters, running jobs, and accessing data.\n\n`3.` **Set up a development environment:** You’ll need a\n\ndevelopment environment where you can write and\n\ntest your code, whether you’re using a local IDE or the\n\nDatabricks Workspace.\n\n`4.` **Collect data:** Once you have your development environment\n\nset up, you can start collecting data from your game. This\n\ncan involve integrating or building a SDK into your game\n\ncode, or using another tool to send data to cloud storage.\n\n`5.` **Process and analyze the data:** Once you have collected\n\nyour data, you can use Databricks to process and analyze\n\nit. This can involve cleaning and transforming the data,\n\nrunning queries or machine learning algorithms, or\n\ncreating visualizations.\n\n`6.` **Monitor and optimize:** Regularly monitor your analytics\n\nto ensure that they are accurate and relevant, and use the\n\ninsights you gain to optimize your game.\n\nKeep in mind that these are just general steps to get started\n\nwith Databricks for game analytics. 
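To make steps 4 and 5 above concrete, here is a minimal sketch of landing raw game telemetry in a Delta table and computing one first dashboard metric (daily active users). The storage path, table name, and column names are illustrative assumptions, not a prescribed schema.

```python
from pyspark.sql import functions as F

# Assumption: the game (or its SDK) drops raw JSON event files into cloud storage.
raw_events = spark.read.json("s3://my-game-telemetry/raw/events/")

# Persist the events as a Delta table so queries, dashboards, and ML jobs share one copy.
raw_events.write.format("delta").mode("append").saveAsTable("game_events")

# A first dashboard-style metric: daily active users
# (assumes player_id and event_timestamp columns exist).
dau = (
    spark.read.table("game_events")
    .withColumn("event_date", F.to_date("event_timestamp"))
    .groupBy("event_date")
    .agg(F.countDistinct("player_id").alias("daily_active_users"))
    .orderBy("event_date")
)
dau.show()
```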
The specific steps you’ll\n\nneed to take will depend on your specific use case and needs.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n\n[out](https://databricks.com/company/contact) to us.\n\n\n**Tips / Best Practices**\n\n- **Define your goals:** What do you want to learn from your\n\nanalytics data? Having clear goals will help you focus on\n\ncollecting the right data and making meaningful use of it.\n\n- **Plan your data collection:** Determine what data you need\n\nto collect, how you will collect it, and how you will store it.\n\n- **Consider privacy:** Make sure you are transparent with your\n\nplayers about what data you are collecting and how you\n\nwill use it, and give them the option to opt out if they wish.\n\n- **Use analytics to inform design:** Leverage your analytics data\n\nto inform decisions around game design, such as any balance\n\nchanges or new content targeting a specific audience.\n\n- **Monitor and test your analytics implementation:** Regularly\n\ncheck your analytics to ensure that data is being collected\n\ncorrectly, and conduct tests to validate the accuracy of\n\nyour data.\n\n- **Visualize your data:** Dashboarding your data is one of the\n\nmost effective ways to quickly and effectively make sense\n\nof what’s happening at a given moment in time.\n\n- **Use data to improve player retention:** Analyze player\n\nbehavior and use the insights you gain to improve player\n\nretention, such as by identifying and addressing pain\n\npoints or by providing personalized content.\n\n- **Collaborate with your team:** Share your analytics\n\nfindings with your team and encourage them to use the\n\ndata to inform their work.\n\n- **Keep it simple:** Don’t try to collect too much data or\n\ncreate overly complex analytics systems. Keep it simple\n\nand focused on your goals.\n\n- **Start where you are:** If you’ve yet to gather all of your\n\ndata, don’t go build some fancy model. Start with the data\n\nyou have available to you and build from there.\n\n### Getting started with Player Segmentation\n\nPlayer segmentation is crucial to studios as it allows them\n\nto better understand their audience and tailor their game\n\nexperience to meet their specific needs and preferences.\n\nBy dividing players into different segments based on factors\n\nsuch as demographics, playing styles, and in-game behavior,\n\n\n-----\n\nstudios can gain valuable insights into what motivates and\n\nengages their players. This information can then be used\n\nto design games that not only provide a more enjoyable\n\nexperience for players, but also drive player retention\n\nand increase revenue for the studio. In a competitive\n\nindustry where player satisfaction is key to success, player\n\nsegmentation is an essential tool for studios to stay ahead of\n\nthe game.\n\nStart by evaluating the segmentation goals such as:\n\n- **Personalize the experience:** Changing or creating\n\nexperience specific designs to the player.\n\n- **Create relevant content:** Surface the best content to\n\nplayers based on features and behaviors that will matter\n\nthe most depending on the player’s place in the games\n\nlife cycle.\n\n- **Monetization:** Create tailored monetization strategies\n\nthat effectively reach and convert each player group. 
For\n\nexample, you may have a group of highly engaged players\n\nwho are more likely to make in-app purchases, while\n\nanother group is less likely to spend money but may be\n\nmore receptive to advertisements.\n\nThe next steps would be to identify, collect and analyze\n\nplayer data. By gathering information on player behavior,\n\npreferences, and demographics, you can gain insights\n\ninto their motivations, pain points, and what drives their\n\nengagement with your game.\n\nThere are multiple types of player data to collect, including:\n\n- **Player Behavior:** Track player behavior and actions\n\nwithin your game to gain insights into their play style,\n\npreferences, and patterns.\n\n- **Surveys:** Ask players directly about their preferences,\n\nmotivations, and feedback through in-game surveys, email\n\nquestionnaires, or other forms of direct communication.\n\n- **Focus groups:** Gather a small group of players to discuss\n\nand provide feedback on specific aspects of your game\n\nand player experience.\n\n- **Social media listening:** Monitor social media platforms\n\nto gather insights into how players are engaging with and\n\ntalking about your game.\n\n**[Customer Segmentation solution accelerator](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\n\n**Tips / Best Practices**\n\nDefine your segmentation goals: Determine what you want\n\nto learn about your players and why. This will help you focus\n\nyour analysis and ensure that your segments are meaningful\n\nand actionable.\n\n- **Use meaningful criteria:** Choose criteria that are relevant\n\nto your goals and that differentiate players in meaningful\n\nways. This could include demographic information, in-game\n\nbehavior, spending habits, or a combination of factors.\n\n- **Analyze player data:** Use data from your players to inform\n\nyour segmentation strategy. This could include data\n\non in-game behavior, spending habits, or demographic\n\ninformation.\n\n- **Use multiple methods:** We recommend using a\n\ncombination of methods, such as clustering to create\n\nsegments that are statistically meaningful and actionable\n\nto your game.\n\n- **Validate your segments:** Test your segments to ensure\n\nthat they accurately reflect the differences you observed\n\nin your player data. This could involve comparing the\n\nsegments to each other, or validating the segments\n\nagainst external data sources.\n\n- **Consider ethical and privacy concerns:** Ensure that\n\nyour segmentation strategy is ethical and complies\n\nwith privacy laws and regulations. This could involve\n\nanonymizing your player data, obtaining consent from\n\nplayers, or other measures to protect player privacy.\n\n- **Monitor and refine your segments:** Regularly review\n\nyour segments to ensure that they remain relevant and\n\nmeaningful. Refine your segments as necessary to reflect\n\nchanges in your player data or your goals.\n\n### Getting Started with Player Lifetime Value\n\nAssuming you’ve followed the steps to collecting, storing, and\n\npreparing your player data for analysis; To calculate player\n\nlifetime value (LTV), the quick and dirty way of assessing\n\noverall player LTV is to divide the total revenue by the total\n\nnumber of registered players. Note, LTV is a critical calculation\n\nfor return on investment, which is player lifetime spend versus\n\nthe amount spent on player acquisition. 
Ideally, you want lifetime spend to be equal to or more than cost of acquisition.

-----

As long as your game and its community are currently active, any player lifetime value calculations should be considered models, not exact numbers. This is because many of the players you're considering are likely actively registered and actively playing, so the exact player LTV number is a moving target.

(Figure: LTV estimation approaches ordered by increasing accuracy: historical averages and benchmarks, simple predictive models, advanced predictive models.)

But these models are not entirely accurate, since they don't take into account the players who are registered but have yet to generate any revenue. Instead, a data-driven approach pivoted around player segmentation or cohorts will generally yield more actionable insight, far more than calculating a single LTV for the entire player base.

You can define your game's cohorts in multiple ways. Perhaps the most obvious in terms of calculating LTV is going by daily active cohorts, or users who joined your game on the same day. You could also organize cohorts by users who joined your game through a certain ad campaign or promotional effort, by country or geographic location, or by the type of device used.

**[Lifetime Value solution accelerator](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**

**Tips / Best Practices**

**Use multiple data sources:** To get a complete picture of a player's value, be sure to consider data from a variety of sources, including in-game purchases, ad revenue, and other monetization strategies.

**Consider player retention:** Player retention is a key factor in LTV, so be sure to consider how long players are likely to play your game when calculating LTV.

**Use accurate data:** Make sure you are using accurate data when calculating LTV. This might involve cleaning and processing your data, or using trusted sources such as in-game analytics tools.

**Regularly review and update your LTV estimates:** Player LTV can change over time, so be sure to regularly review and update your estimates to ensure they are accurate.

**Test and optimize:** Use experimentation methods such as A/B testing to see how different variables, such as in-game events or pricing strategies, affect LTV. Use the insights you gain to optimize your LTV calculations.

**Be aware of outside factors:** Your calculations should consider the many outside factors that can affect your LTV, such as the virality of your game, any spikes or surges in visitors due to unexpected promotions (influencers, reviewers talking about your game), any significant changes to your game that users respond well to, and other organic lifts that are difficult to predict with existing data.

The first calculation is relatively simple. We suggest using average revenue per user (ARPU), which is a game's daily revenue divided by the number of active users, to help you calculate lifetime value. First, you'll need to define what an active player is using retention values, which can be set to a week, multi-day, or multi-week period of time depending on how your game has performed to date. You can then look at the number of users who churn on a given day, averaging with the number of days from the player's first visit to the current date (or the specific date you've considered the end for said exercise).
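As a small worked illustration of the arithmetic just described (and formalized in the next paragraph), with every figure hypothetical:

```python
# Hypothetical daily figures for one title, used only to illustrate the arithmetic.
daily_revenue = 5_000.00       # total revenue for the day
daily_active_users = 10_000    # players counted as "active" under your retention definition
arpu = daily_revenue / daily_active_users   # average revenue per user = 0.50

# Average number of days between a churned player's first visit and your cutoff date;
# this average is the "playerbase lifetime value" named in the next paragraph.
playerbase_lifetime_days = 45

player_ltv = arpu * playerbase_lifetime_days   # 0.50 * 45 = 22.50 per player
print(f"ARPU = {arpu:.2f}, player LTV = {player_ltv:.2f}")
```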
This is your playerbase lifetime value (note: not player lifetime value). To get player lifetime value, divide daily revenue by the number of daily active users to get ARPU, then multiply that ARPU by the playerbase lifetime value to get your player LTV.

It's important to note that while calculating player lifetime value, the term is not entirely accurate, since most player lifetimes are not over (particularly true for live service games). But for the purpose of modeling, we recommend keeping the amount of time that you consider a lifetime relatively short, allowing you to extrapolate. Keeping the time period shorter helps mitigate inaccuracies; specifically, the longer you stretch out what you consider a lifetime, the more likely you are to collect inactive users in your count.

-----

### Getting Started with Social Media Monitoring

Social media monitoring has three primary components: collecting the data, processing the results, and taking action on the findings. When it comes to collecting the data, whether you're looking for tweets, YouTube comments, or Reddit posts, it can be very easy to get started, since many social media platforms such as Twitter, YouTube, and Reddit provide their own detailed and comprehensive APIs, with documentation and code examples to help along the way.

Once the data has been collected, the next step is to process it and prepare it to be used in the next step. Processing your data can range in complexity from a simple keyword filter to a more complicated approach such as filtering by location, removing emojis, and censoring and substituting words. With the data collected and processed, it can move to the final stage and be analyzed for downstream use and actionable insights by applying sentiment analysis or text mining.

If a game studio is looking to save time and have the above steps performed for them, it may be appealing to buy a pre-built tool. The primary benefits of buying an off the shelf solution are that it is often faster and easier to get started with, and the development of the tool is handled by a third party who will have experience in building media monitoring solutions. On the other hand, building your own custom solution will provide more flexibility and control. Many pre-built media monitoring tools might not have the capabilities required to effectively process video, audio, and image data, and may not be able to control the frequency at which data is processed, whether it be near real-time or batch. Additionally, pre-built solutions tend to take a generalist approach for NLP, whether it be keyword extraction, topic filtering, or sentiment analysis, which often leads to poor results and feedback, especially for an industry as unique as the gaming industry where certain industry-specific slang or terminology is frequently used.
Overall, building your\n\nown media monitoring tool will provide greater control and\n\nflexibility leading to a better tailored return on investment,\n\nand luckily Databricks makes it even easier to get started.\n\nWith the Databricks Lakehouse platform, all data engineering,\n\ndata science, machine learning, and data analytics can\n\nbe done in a single place without having to stitch multiple\n\nsystems and tools together.\n\nData engineers can use Workflows and Jobs to call social\n\nmedia platform APIs on a scheduled basis and use Delta Live\n\nTables to create declarative data pipelines for cleaning and\n\nprocessing the data that comes in. Data scientists can use\n\ntools such as ML-specific Databricks runtimes (DBRs) that\n\ncome with many of the most popular and common libraries\n\nalready installed, MLflow which makes model development,\n\n\n-----\n\ntracking, and serving easy and efficient, and various other\n\ntools such as AutoML and Bamboolib. Data analysts are able\n\nto create real-time alerts, dashboards, and visualizations\n\nusing Databricks SQL. Each of the three personas will be able\n\nto effectively collaborate with each other and integrate each\n\npiece of their work into the broader data architecture.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n\n[out](https://databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\nWhile social media monitoring can be easy to get started\n\nwith, there are a few key points to keep in mind.\n\n- Remember the Pareto principle (roughly 80% of impact\n\ncomes from 20% of activity) and diminishing returns. While\n\nit’s important to monitor large platforms such as Reddit,\n\nTwitter, and YouTube, it might not be worthwhile to monitor\n\nsmaller platforms (in terms of engagement) as the bulk of\n\ncustomer feedback will be on those major platforms.\n\n- Monitor other sources of information. It is also useful to\n\nmonitor mentions of key company personnel such as\n\nexecutives or public facing employees.\n\n- While follower count does matter on platforms such as\n\nTwitter, don’t ignore users with low-follower counts. It only\n\ntakes one or two re-tweets from other users to become a\n\nlarge issue.\n\n- On social media, customers can see through generic\n\ncorporate responses to complaints, so it is important\n\nto get a clear understanding of the issue and provide a\n\nclear response.\n\n### Getting Started with Player Feedback Analysis\n\nThe easiest place to start is gathering your data. With\n\naccounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n\nNintendo (or whatever platform you’re using), identify the ID\n\nfor your game(s), and pull the reviews corresponding to that\n\ngame into Databricks through an API call.\n\n\nFrom here, you clean the data using some of the pre-\n\nprocessing available in Python that removes any emojis and\n\nASCII characters. Once complete, run through Spark NLP\n\npipeline which does the basic natural language processing\n\nsteps such as normalization, stemming, lemmatization. We\n\nrecommend running through pre-trained models, such as Word\n\nEmbeddings and Named Entity Recognition models from John\n\nSnow Labs. 
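A minimal sketch of that review-processing flow is shown below; it assumes the open source Spark NLP library is installed on the cluster and that the raw store reviews already sit in a hypothetical `feedback.raw_reviews` Delta table with a `review_text` column (both names are illustrative).

```python
from sparknlp.pretrained import PretrainedPipeline

# Hypothetical table of raw reviews pulled from the platform APIs; pretrained
# pipelines expect the input column to be named `text`.
reviews = (
    spark.read.table("feedback.raw_reviews")
         .withColumnRenamed("review_text", "text")
)

# General-purpose pretrained pipeline: tokenization, normalization, lemmas, POS tags, NER.
pipeline = PretrainedPipeline("explain_document_dl", lang="en")
annotated = pipeline.transform(reviews)

# Persist the annotations for downstream aspect extraction, sentiment scoring, and dashboards.
annotated.write.format("delta").mode("overwrite").saveAsTable("feedback.annotated_reviews")
```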
This completes the pipeline and generates the aspects for the reviews provided by the community.

This data is then loaded into a Delta table for further analysis, such as using a visual dashboard (built on SQL queries inside Databricks) to analyze and understand the aspects the community is talking about, which can then be shared back with the development team for analysis and action. This is a great exercise to run once per month.

If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://www.databricks.com/company/contact) to us.

**Tips / Best Practices**

- **Check for word groupings:** Make sure your word groupings are accurate to improve the analysis. For example, if your game is called Football Manager, and the shorthand is FM, make sure both of those are grouped appropriately.

- **Leverage domain knowledge:** Clean the reviews based on your domain knowledge. There are generic steps one could take, but those will not be as effective as someone with domain, and specific game, knowledge of your title.

- **Experiment with models:** Feel free to try multiple pre-trained models, and/or tweak the pre-trained models based on your understanding of the domain to improve the accuracy of your results.

- **Work one title at a time:** This process works best when pulling reviews for a single title, specifically one version of one title at a time.

- **Let the model do the heavy lifting, but use humans to double-check:** The sentiment corresponding to the aspects in the model will be labeled as Positive or Negative. In the case of a neutral review, the model will do its best to determine whether that is more positive or negative. A best practice is to spend time going back through the aspects early to determine model accuracy and make updates accordingly.

-----

### Getting Started with Toxicity Detection

Our recommendation on tackling the toxicity issue is to leverage cloud-agnostic and flexible tooling that can consume chat data from a variety of sources, such as chat logs, voice transcriptions, or sources like Discord and Reddit forums. No matter if the data is in log form from game servers or events from a message system, Databricks can provide quick and easy ways to ingest the data.

As a simplified architecture like the one in the diagram above shows, no matter the source, getting chat data in for inference and model development can be simple. While we leveraged a pre-built model from John Snow Labs to accelerate development, you can bring the ML framework of your choice to the platform.

**[Gaming Toxicity solution accelerator](https://notebooks.databricks.com/notebooks/CME/Toxicity_Detection_in_Gaming/index.html)**

**Tips / Best Practices - things to consider**

- **Define what toxic and disruptive behavior looks like within your community:** Clearly define what you consider to be toxic behavior, as this will determine how you measure and detect it. This might include things like hateful language, harassment, or cheating.

- **Collect relevant data:** Make sure you are collecting the right data to help you detect toxicity.
This might include\n\ndata on in-game chat, player reports, and other sources.\n\n- **Use machine learning:** Use machine learning algorithms\n\nto analyze your data and identify patterns of toxic\n\nbehavior. This will allow you to more accurately detect\n\ntoxicity and prioritize cases for review.\n\n- **Test and optimize:** Regularly review and test your toxicity\n\ndetection systems to ensure they are accurate and\n\neffective. Use experimentation methods such as A/B\n\ntesting to see how different strategies impact toxicity rates.\n\n- **Be transparent:** Make sure you are transparent with your\n\nplayers about how you are detecting toxicity, and give\n\nthem the option to opt out if they wish.\n\n- **Take action:** When toxic behavior is detected, take\n\nappropriate action to address it. The health and wellness\n\nof your community depends on it. This might involve\n\nbanning players, issuing warnings, or taking other\n\ndisciplinary measures.\n\n\n-----\n\n### Getting Started with Multi-Touch Attribution and Media Mix Modeling\n\nTo get started with multi-touch attribution, you need to first\n\nselect an attribution model. There are a variety of different\n\nattribution models to choose from, each with its own\n\n\nattribution credit according to your chosen model (above).\n\nWe highly recommend you regularly review and test your\n\nattribution efforts to ensure they are accurate and effective.\n\nUse experimentation methods such as A/B testing to see\n\nhow different strategies impact conversion rates.\n\n**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**\n\n\nstrengths and limitations.\n\n\n`1.` **Last-click model:** This model attributes all credit to the\n\nlast touchpoint that the customer interacted with before\n\nmaking a purchase or taking a desired action.\n\n`2.` **First-click model:** This model attributes all credit to the\n\nfirst touchpoint that the customer interacted with.\n\n`3.` **Linear model:** This model attributes equal credit to each\n\ntouchpoint that the customer interacted with.\n\n`4.` **Time decay model:** This model attributes more credit to\n\ntouchpoints that are closer in time to the purchase\n\nor desired action.\n\n`5.` **Position-based model:** This model attributes a portion of\n\nthe credit to the first and last touchpoints, and the remainder\n\nis distributed evenly among the other touchpoints.\n\n`6.` **Custom model:** Some businesses create their own\n\nattribution model based on specific business needs or goals.\n\nEach attribution model has its own strengths and limitations,\n\nand the right model for a particular video game will depend\n\non a variety of factors, including the goals of your title, the\n\ncustomer journey, and the types of marketing channels being\n\nused. It is important to carefully consider the pros and cons\n\nof each model and choose the one that best aligns with the\n\nneeds of your game.\n\nNext, you’re going to want to set up tracking. In order to\n\nattribute credit to different touchpoints, you’ll need to set up\n\ntracking to capture data on customer interactions. 
This might involve integrating tracking code into the game, or using a third-party tracking tool.

With tracking set up, you'll start collecting data on player interactions and be able to use that information to calculate attribution credit according to your chosen model.

**Tips / Best Practices - things to consider**

- **Define clear goals:** Sounds simple, but by clearly defining the goals of your acquisition campaign and what success looks like, you will be able to guide your decision-making and ensure that you are measuring the right metrics - such as cost per install, return on ad spend, conversion rate, lifetime value, retention rate, and more.

- **Use a data-driven approach:** Use data to inform your decision-making. Collect data on all touchpoints in the player journey, including ad impressions, clicks, installs, and in-game actions.

- **Choose the right attribution model:** Select the right attribution model that accurately reflects the player journey for your specific genre of game. This can be a complex process. A couple of things to keep in mind:

- Consider the touchpoints that are most important for your player journey, such as first ad impression, first click, or first in-game action

- Consider the business goals you're trying to achieve. For example, if you are focused on maximizing return on investment, a last-click attribution model may be most appropriate. On the other hand, if you are looking to understand the impact of each touchpoint, a multi-touch attribution model may be more appropriate.

- Consider the data you have available, including ad impressions, clicks, installs, and in-game actions.

- **Continuously monitor and optimize:** Continuously monitor and optimize your acquisition campaigns based on the data. Test different approaches, make adjustments as needed, and use A/B testing to determine what works best.

-----

### Getting Started with Player Recommendations

Recommendations are an advanced use case. We don't recommend (hehe) that you start here; instead, we're assuming that you've done the work to set up your game analytics (collecting, cleaning, and preparing data for analysis) and that you've done basic segmentation to place your players in cohorts based on their interests and behaviors.

Recommendations can come in many forms for video games. For this context, we're going to focus on wide-and-deep learning for recommender systems, which has the ability to both memorize and generalize recommendations based on player behavior and interactions. First [introduced by Google](https://arxiv.org/abs/1606.07792) for use in its Google Play app store, the wide-and-deep machine learning (ML) model has become popular in a variety of online scenarios for its ability to personalize user engagements, even in 'cold start problem' scenarios with sparse data inputs.

The goal with wide-and-deep recommenders is to provide an intimate level of player understanding. This model uses explicit and implicit feedback to expand the consideration set for players. Wide-and-deep recommenders go beyond simple weighted averaging of player feedback found in some collaborative filters to balance what is understood about the individual with what is known about similar gamers. If done properly, the recommendations make the gamer feel understood (by your title) and this should translate into greater value for both the player and you as the business.

**Understanding the model design**

To understand the concept of wide-and-deep recommendations, it's best to think of it as two separate, but collaborating, engines. The wide model, often referred to in the literature as the linear model, memorizes users and their past choices. Its inputs may consist simply of a user identifier and a product identifier, though other attributes relevant to the pattern (such as time of day) may also be incorporated.

The deep portion of the model, so named as it is a deep neural network, examines the generalizable attributes of a user and their choices. From these, the model learns the broader characteristics that tend to favor user selections.

Together, the wide-and-deep submodels are trained on historical product selections by individual users to predict future selections. The end result is a single model capable of calculating the probability with which a user will purchase a given item, given both memorized past choices and generalizations about a user's preferences. These probabilities form the basis for user-specific rankings, which can be used for making recommendations.

**Building the model**

The intuitive logic of the wide-and-deep recommender belies the complexity of its actual construction. Inputs must be defined separately for each of the wide and deep portions of the model, and each must be trained in a coordinated manner to arrive at a single output, but tuned using optimizers specific to the nature of each submodel. Thankfully, the [Tensorflow DNNLinearCombinedClassifier estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier) provides a pre-packaged architecture, greatly simplifying the assembly of an overall model.

(Diagram: user and product identities feed the wide sub-model, user and product attributes feed the deep sub-model, and the combined wide & deep model outputs the probability of User A selecting Product B.)

-----

**Training**

The challenge for most teams is then training the recommender on the large number of user-product combinations found within their data. Using [Petastorm](https://petastorm.readthedocs.io/en/latest/), an open-source library for serving large datasets assembled in Apache Spark™ to Tensorflow (and other ML libraries), one can cache the data on high-speed, temporary storage and then read that data in manageable increments to the model during training. In doing so, we limit the memory overhead associated with the training exercise while preserving performance.

**Tuning**

Tuning the model becomes the next challenge. Various model parameters control its ability to arrive at an optimal solution. The most efficient way to work through the potential parameter combinations is simply to iterate through some number of training cycles, comparing the models' evaluation metrics with each run to identify the ideal parameter combinations.
By\n\ntrials, we can parallelize this work across many compute nodes,\n\nallowing the optimizations to be performed in a timely manner.\n\n**Deploying**\n\nFinally, we need to deploy the model for integration with\n\nvarious retail applications. Leveraging [MLflow](https://www.mlflow.org/) allows us\n\nto both persist our model and package it for deployment\n\nacross a wide variety of microservices layers, including\n\nAzure Machine Learning, AWS Sagemaker, Kubernetes and\n\nDatabricks Model Serving.\n\nWhile this seems like a large number of technologies to bring\n\ntogether just to build a single model, Databricks integrates all\n\nof these technologies within a single platform, providing data\n\nscientists, data engineers & [MLOps](https://www.databricks.com/glossary/mlops) Engineers a unified exper­\n\nience. The pre-integration of these technologies means various\n\nper­sonas can work faster and leverage additional capabilities,\n\nsuch as the [automated tracking](https://docs.databricks.com/machine-learning/automl-hyperparam-tuning/index.html#automated-mlflow-tracking) of models, to enhance the\n\ntransparency of the organization’s model building efforts.\n\nTo see an end-to-end example of how a wide and deep\n\nrecommender model may be built on Databricks, please\n\ncheck out the following notebooks: [Get the notebook](https://d1r5llqwmkrl74.cloudfront.net/notebooks/RCG/Wide_and_Deep/index.html#Wide_and_Deep_1.html)\n\n**[Recommendation Engines solution accelerator](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n\n**Tips / Best Practices - things to consider**\n\n- **Use data to inform recommendations:** Use data from\n\nyour analytics, player feedback, and other sources to\n\nunderstand what players like and dislike. This will help\n\nyou create recommendations that are more likely to be\n\nrelevant and engaging for individual players.\n\n- **Segment your players:** Consider segmenting your players\n\nbased on characteristics such as playstyle, spending\n\nhabits, and demographic information. This will allow you\n\nto create more targeted recommendations for different\n\ngroups of players.\n\n- **Consider the player’s current context:** When creating\n\nrecommendations, consider the player’s current context,\n\nsuch as what they are doing in the game and what\n\ncontent they have already consumed. This will help\n\nyou create recommendations that are more likely to be\n\nrelevant and timely.\n\n- **Test and optimize your recommendations:** Use\n\nexperimentation methods such as A/B testing to see\n\nhow different recommendations perform with different\n\nplayer segments. Use the insights you gain to optimize\n\nyour recommendations.\n\n- **Be transparent:** Make sure you are transparent with\n\nplayers about how you are creating recommendations and\n\ngive them the option to opt out if they wish.\n\n- **Use recommendations to improve the player experience:**\n\nUse personalized recommendations to improve the player\n\nexperience and increase engagement and satisfaction.\n\n### Getting Started with Next Best Offer/Action\n\nSince NBO/NBA is a specific use case of personalization, how a\n\nteam might get started implementing this will look very similar\n\nto how they would with broader personalization activities.\n\nBegin with ensuring you are appropriately collecting player\n\ndata (behavior, preferences, in-game purchases, etc), storing\n\nit in your cloud data lake using a service such as Delta Lake\n\nfrom Databricks. 
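As a minimal sketch of that landing step (the storage path, table name, and `event_id` key are illustrative assumptions), incremental player events can be upserted into a Delta table so repeated deliveries don't create duplicates:

```python
from delta.tables import DeltaTable

# Hypothetical batch of incremental player events (behavior, preferences, purchases) as JSON.
updates = spark.read.json("s3://my-game/raw/player_events/")

# Existing Delta table of player events (assumed to have been created previously).
target = DeltaTable.forName(spark, "player_events")

(
    target.alias("t")
    .merge(updates.alias("s"), "t.event_id = s.event_id")
    .whenMatchedUpdateAll()      # refresh rows we have already seen
    .whenNotMatchedInsertAll()   # append genuinely new events
    .execute()
)
```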
From here, you’ll prepare the data using\n\nDatabricks to clean, transform, and prepare for analysis.\n\nThis may include aggregating data from multiple sources,\n\nremoving duplicates and outliers, and transforming the data\n\ninto a format suitable for analysis. As you analyze the player\n\ndata, seek to identify patterns and trends in player behavior\n\n\n-----\n\nand preferences that will give you signal on which actions are\n\nmore likely to be successful.\n\nFrom here, you can build a recommendation model based\n\non the player data analysis, and incorporate information\n\non in-game items and player preferences to make\n\npersonalized recommendations.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Define your goals:** Like every use case, starting with\n\nclearly defined goals helps to ensure your implementation\n\nof NBO and NBA will be as effective and efficient as\n\npossible. Your goals will also help you determine what data\n\nto collect and how it will be used.\n\n- **Collect relevant data:** Based on your goals, make sure\n\nyou are collecting the right data to inform your NBO and\n\nNBA recommendations. This might include data on player\n\nbehavior, engagement, and spending habits.\n\n- **Leverage machine learning to scale your**\n\n**recommendations:** Use machine learning algorithms to\n\nanalyze your data and make personalized recommendations\n\nto your players. This will allow you to identify trends and\n\npatterns that might not be immediately apparent.\n\n- **Test and optimize:** THIS IS CRITICAL. Use experimentation\n\nmethods such as A/B testing to see how different\n\nrecommendations perform with different player segments.\n\nPast performance is not a perfect indicator of future\n\nsuccess. Consistent testing allows you to tune your NBO and\n\nNBA recommendations so they evolve with your playerbase.\n\n- **Consider the player’s context:** When making recommend­\n\nations, consider the player’s current context, such as what\n\nthey are doing in the game and what content they have\n\nalready consumed. This will help you create recommend­\n\nations that are more likely to be relevant and timely.\n\n- **Be transparent:** Make sure you are transparent with\n\nyour players about how you are using their data to make\n\nrecommendations, and give them the option to opt out if\n\nthey wish.\n\n- **Collaborate with your team:** Share your NBO and NBA\n\n\n### Getting Started with Churn Prediction & Prevention\n\nThe exciting part of this analysis is that not only does it\n\nhelp to quantify the risk of customer churn but it paints a\n\nquantitative picture of exactly which factors explain that risk.\n\nIt’s important that we not draw too rash of a conclusion with\n\nregards to the causal linkage between a particular attribute\n\nand its associated hazard, but it’s an excellent starting point\n\nfor identifying where an organization needs to focus its\n\nattention for further investigation.\n\nThe hard part in this analysis is not the analytic techniques.\n\nThe Kaplan-Meier curves and Cox Proportional Hazard\n\nmodels used to perform the analysis above are well\n\nestablished and widely supported across analytics platforms.\n\nThe principal challenge is organizing the input data.\n\nThe vast majority of subscription services are fairly new as\n\nbusinesses. 
As such, the data required to examine customer\n\nattrition may be scattered across multiple systems,\n\nmaking an integrated analysis more difficult. Data Lakes\n\nare a starting point for solving this problem, but complex\n\ntransformations required to cleanse and restructure data\n\nthat has evolved as the business itself has (often rapidly)\n\nevolved requires considerable processing power. This is\n\ncertainly the case with the KKBox information assets and is a\n\npoint noted by the data provider in their public challenge.\n\nThe key to successfully completing this work is the\n\nestablishment of transparent, maintainable data processing\n\npipelines executed on an elastically scalable (and therefore\n\ncost-efficient) infrastructure, a key driver behind the [Delta](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n\n[Lake pattern](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html) . While most organizations may not be overly\n\ncost-conscious in their initial approach, it’s important to\n\nremember the point made above that churn is a chronic\n\ncondition to be managed. As such, this is an analysis that\n\nshould be periodically revisited to ensure acquisition and\n\nretention practices are aligned.\n\nTo support this, we are making the code behind our\n\nanalysis available for download and review. If you have any\n\nquestions about how this solution can be deployed in your\n\nenvironment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n\nefforts with your team and encourage them to use the\n\n\ndata to inform their work.\n\n\n**[Churn Prediction solution accelerator](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n\n\n-----\n\n**Tips / Best Practices**\n\n- **Define churn:** Clearly define what you consider to be\n\nplayer churn, as this will determine how you measure\n\nand predict it. For example, you might consider churn to\n\nbe when a player stops playing your game for a certain\n\nnumber of days, or when they uninstall it.\n\n- **Collect relevant data:** Make sure you are collecting the\n\nright data to help you predict and prevent churn. This\n\nmight include data on player behavior, engagement, and\n\nspending habits.\n\n- **Use machine learning:** Use machine learning algorithms\n\nto analyze your data and predict which players are at\n\nrisk of churning. This will allow you to identify trends and\n\npatterns that might not be immediately apparent.\n\n- **Test and optimize:** Use experimentation methods such as\n\nA/B testing to see how different strategies impact churn\n\nrates. Use the insights you gain to optimize your churn\n\nprevention efforts.\n\n- **Focus on retention:** Implement retention strategies that are\n\ntailored to the needs and preferences of your players. 
This\n\nmight involve providing personalized content, addressing\n\npain points, or offering incentives to continue playing.\n\n- **Be transparent:** Make sure you are transparent with your\n\nplayers about how you are using their data to predict and\n\nprevent churn, and give them the option to opt out if\n\nthey wish.\n\n- **Collaborate with your team:** Share your churn prediction\n\nand prevention efforts with your team and encourage\n\nthem to use the data to inform their work.\n\n### Getting Started with Real-time Ad Targeting\n\nTypically, implementing a real-time ad targeting strategy begins\n\noutside of your game (in services such as Google Ads, Unity\n\nAdvertising), where your game becomes the delivery point\n\nfor the advertisement. Here, you will need to integrate with\n\nad networks that provide real-time ad targeting capabilities.\n\nThat will allow you to access a range of available ad assets\n\nto dynamically select and display the most relevant ads to\n\nplayers. Both Google AdMob and Unity Ads are great for banner\n\nads, native ads, and rewarded video ads. Your role is to ensure\n\nthat the data you’re collecting is fed back into the advertising\n\nplatform to better serve targeted ads to your playerbase.\n\n\nTo use a service like Databricks to manage the data needed\n\nto provide real-time ad targeting in your application, you can\n\nfollow the steps below:\n\n`1.` **Collect and store player data:** Collect data on player\n\nbehavior, preferences, and demographics, and store it in\n\na data lake using Databricks. Popular analytics tools such\n\nas Google Analytics or Mixpanel can be integrated into\n\nthe game to collect data on player behavior. These tools,\n\njust like tracking website traffic, can track in-game events,\n\nprovide insights on player behavior and demographics,\n\nand they give you access to detailed reports and\n\ndashboards. Another option is to build in-house tracking\n\nsystems to collect data on player behavior - logging\n\nevents, e.g., in-game purchases or player actions, and activities\n\nsuch as “at which level does a player quit playing” - and\n\nstoring this in a database for analysis. The downside of\n\nbuilding in-house tracking systems is that you will need to host\n\nand maintain your own logging servers.\n\n`2.` **Prepare the data:** Use Databricks to clean, transform,\n\nand prepare the player data for analysis. This may\n\ninclude aggregating data from multiple sources, removing\n\nduplicates and outliers, and transforming the data into a\n\nformat suitable for analysis.\n\n`3.` **Analyze the data:** Use Databricks’ built-in machine\n\nlearning and data analytics capabilities to analyze the\n\nplayer data and identify patterns and trends.\n\n`4.` **Create audience segments:** Based on the analysis,\n\nuse Databricks to create audience segments based on\n\ncommon characteristics such as interests, behaviors,\n\nand preferences.\n\n`5.` **Integrate with the ad server:** When an ad opportunity\n\npresents itself within the game, a call is made to the ad\n\nserver. This call includes information about the player,\n\nsuch as the audience segment that they belong to. 
The\n\nad server then uses this information to decide what ad to\n\ndeliver to the player.\n\n`6.` **Monitor and optimize:** Use Databricks to monitor the\n\nperformance of the ad targeting and make optimizations\n\nas needed, such as adjusting the audience segments or\n\nadjusting the targeting algorithms.\n\nBy using a service like Databricks to manage the data needed\n\nfor real-time ad targeting, game developers can effectively\n\nleverage their player data to create more personalized and\n\nengaging experiences, increase revenue, and reduce churn.\n\n\n-----\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Focus on player data:** Make player data the center of your\n\ntargeting strategy by collecting and storing comprehensive\n\ninformation on player behavior, preferences, and\n\ndemographics. Here, it’s critical to ensure the game code\n\ndata trackers are properly implemented in order to collect\n\nthis data (see Game Analytics section for detail).\n\n- **Segment your audience:** Create audience segments\n\nbased on common characteristics such as interests,\n\nbehaviors, and preferences, and use these segments to\n\n\n**Test and iterate:** Continuously test and iterate your\n\ntargeting strategy to refine your audience segments and\n\nimprove targeting accuracy.\n\n**Balance relevance and privacy:** Balance the need for\n\nrelevant, personalized ads with players’ privacy by only\n\ncollecting and using data that is necessary for targeting\n\nand obtaining player consent.\n\n**Monitor performance:** Regularly monitor the performance\n\nof your targeting strategy to ensure that it is delivering the\n\ndesired results and make optimizations as needed.\n\n**Partner with the right ad platform:** Choose an ad\n\nplatform that is well-suited to your needs and aligns with\n\nyour goals, and work closely with them to ensure that your\n\ntargeting strategy is delivering the best results.\n\n\ndeliver targeted ads.\n\n# Operational use cases\n\n\n### Anomaly Detection\n\nThe first step is to begin collecting the data, such as game server and client\n\nlogs, out of your project. Then consume this into Databricks\n\nDelta so that you have a continuous anomaly detection model\n\nrunning. Focus this on key pieces of information you want to\n\nmonitor, for example - for live service games, this is going to\n\nbe infrastructure and network-related metrics such as Ping\n\nand Server Health (# of clients connected, server uptime,\n\nserver usage, CPU/RAM, # of sessions, time of sessions).\n\nOnce the model is ingesting data and is tuned specifically for the\n\nmetrics described above, you can\n\nbuild out alerts or notifications based on these specific\n\nmetrics hitting a threshold that you define as needing\n\nattention. 
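As a rough illustration of that alerting step, the following minimal sketch flags server-health readings that cross static thresholds; it is not taken from this guide, and the table names, column names and threshold values are hypothetical placeholders.

```python
# Minimal sketch (not from this guide): flag server-health readings that cross
# static thresholds. Table names, column names and thresholds are hypothetical.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

metrics = spark.read.table("main.game_telemetry.server_health")  # hypothetical Delta table

alerts = metrics.where(
    (F.col("ping_ms") > 150)              # latency above an acceptable ceiling
    | (F.col("cpu_pct") > 90)             # CPU close to saturation
    | (F.col("connected_clients") == 0)   # a live server that has dropped all players
).select("server_id", "event_time", "ping_ms", "cpu_pct", "connected_clients")

# Persist flagged rows so a downstream job or a Databricks SQL alert can notify admins.
alerts.write.format("delta").mode("append").saveAsTable("main.game_telemetry.server_health_alerts")
```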
From here, you can build out automated systems\n\nto mitigate those effects - such as migrating players to a\n\ndifferent server, canceling matches, scaling infrastructure,\n\ncreating tickets for admins to review.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n\n**Tips / Best Practices**\n\n- **Define the problem and objectives clearly:** Before\n\nimplementing an anomaly detection solution, it is\n\nimportant to define the problem you are trying to solve\n\nand your specific objectives. This will help ensure that\n\nyou have the right data sources and use the appropriate\n\nalgorithms to achieve your goals.\n\n- **Choose the right data sources:** To effectively detect\n\nanomalies, you need to have the right data sources.\n\nConsider data from player behavior, system performance,\n\nand network traffic, as well as any other data sources that\n\nare relevant to your problem and objectives.\n\n- **Clean and preprocess the data:** To ensure that the\n\ndata you use for anomaly detection is accurate and\n\nmeaningful, it is important to clean and preprocess the\n\ndata. This includes removing any irrelevant or invalid data,\n\nhandling missing values, and normalizing the data\n\nif necessary.\n\n- **Choose the right algorithms:** There are many algorithms\n\nthat can be used for anomaly detection, including\n\nstatistical methods, machine learning algorithms, and\n\nrule-based systems. Choose the algorithms that are best\n\n\n-----\n\nsuited to your data and problem, and that provide the\n\nright level of accuracy, speed, and scalability.\n\n- **Validate the results:** Before deploying the anomaly\n\ndetection solution in production, it is important to validate\n\nthe results by testing the solution on a small subset of\n\ndata and comparing the results to expected outcomes.\n\n- **Monitor and update the solution:** Once the anomaly\n\ndetection solution is deployed, it is important to monitor\n\nits performance and accuracy, and update the solution as\n\nneeded. This may include retraining the algorithms, adding\n\nor removing data sources, and updating the parameters\n\nand thresholds used by the algorithms.\n\nAdditionally, there are some key gotchas to look out for when\n\nimplementing an anomaly detection solution.\n\n- **Avoid overfitting:** Overfitting occurs when the anomaly\n\ndetection solution is too complex and learns the noise\n\nin the data rather than the underlying patterns. To avoid\n\noverfitting, it is important to choose algorithms that are\n\nappropriate for the size and complexity of the data, and to\n\nvalidate the results using a separate test dataset.\n\n- **False positive and false negative results:** False positive\n\nand false negative results can occur when the anomaly\n\ndetection solution is not properly calibrated, or when\n\nthe solution is applied to data that is significantly\n\ndifferent from the training data. To minimize the risk of\n\nfalse positive and false negative results, it is important\n\nto validate the results using a separate test dataset, and\n\nto fine-tune the parameters and thresholds used by the\n\nalgorithms as needed.\n\n- **Scalability:** Scalability can be a concern when\n\nimplementing an anomaly detection solution, especially\n\nwhen dealing with large amounts of data. 
To ensure that\n\nthe solution can scale to meet the demands of a growing\n\nplayer base, it is important to choose algorithms that\n\nare fast and scalable, and to deploy the solution using a\n\nscalable infrastructure.\n\n### Getting Started with Build Pipeline\n\nAn operational goal for game projects is to make sure\n\nbuilds are generated and delivered quickly and\n\nefficiently to internal testers and external users.\n\n\nA few of the key metrics and capabilities to consider when analyzing your\n\nbuild pipelines are listed below:\n\n- **Build time and speed:** This includes metrics such as\n\nthe time it takes to create a build, number of builds, and\n\ncompute spent.\n\n- **Build size and storage:** Size of the builds, amount of\n\nstorage, and network costs.\n\n- **Bug tracking and resolution:** This includes metrics such\n\nas the number of bugs reported, the time it takes to\n\nresolve them, and the number of bugs that are resolved in\n\neach build.\n\n- **Code quality and efficiency:** This includes metrics such\n\nas code complexity, code duplication, and the number of\n\ncode lines written.\n\n- **Collaboration and communication:** Such as the number\n\nof code reviews, the number of team meetings, and the\n\nnumber of code commits.\n\n- **Advanced capabilities:** Such as predicting real-time build\n\nfailure to reduce spend and combining build data with\n\nCrash Analytics (see below) to have “commit to build”\n\nvisibility for accelerated bug fixing.\n\nBefore you start implementing your build pipeline, it’s\n\nimportant to define your requirements. What are the key\n\ngoals of your build pipeline? Choosing the right CI/CD tools is\n\ncritical to the success of your build pipeline. There are many\n\ndifferent tools available, including Jenkins, Azure DevOps,\n\nPerforce, GitLab and more. When choosing a CI/CD tool,\n\nconsider factors such as ease of use, scalability, and cost. In\n\naddition, consider the specific needs of your game project,\n\nand choose a tool that can meet those needs.\n\nThe general recommendation is to look at automating your\n\nbuild process early. Once you’ve chosen your CI/CD tools, you\n\ncan automate your build process by setting up a build server,\n\nconfiguring your CI/CD tool, and creating a script to build your\n\ngame project. The build process should be automated as much\n\nas possible, and it should include steps to compile your code,\n\nrun automated tests, and generate a build of your project.\n\nOnce you have automated your build process, often the\n\nnext step is to implement CD (Continuous Delivery). This\n\ninvolves automating the delivery of your game builds\n\nto stakeholders, such as QA testers, beta testers, or\n\nend-users via publishing platforms. CD can help ensure that\n\nstakeholders have access to the latest version of your game\n\n\n-----\n\nas soon as possible, allowing them to provide feedback and\n\nhelp drive the development process forward.\n\nFinally, it’s important to monitor and measure your build\n\npipeline to ensure that it’s working as expected. This can\n\ninvolve using tools such as Databricks Dashboards to\n\nvisualize the status of your pipeline, or using metrics such\n\nas build times, test results, and deployment success rates\n\nto evaluate the performance of your pipeline. 
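As a rough illustration of that measurement step, the following minimal sketch summarizes weekly build counts, durations and success rates from CI events landed in a Delta table; it is not taken from this guide, and the table and column names are hypothetical placeholders.

```python
# Minimal sketch (not from this guide): summarize build pipeline health from CI events
# landed in a Delta table. All table and column names are hypothetical placeholders.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

builds = spark.read.table("main.devops.build_events")  # hypothetical table of CI build results

weekly_summary = (
    builds
    .groupBy(F.date_trunc("week", F.col("started_at")).alias("week"))
    .agg(
        F.count("*").alias("total_builds"),
        F.avg("duration_minutes").alias("avg_build_minutes"),
        F.avg((F.col("status") == "SUCCESS").cast("double")).alias("success_rate"),
    )
    .orderBy("week")
)

weekly_summary.show()  # or feed a Databricks dashboard / alert instead of printing
```

Keeping these aggregates in a table or dashboard makes it straightforward to watch trends over time rather than reacting to individual builds.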
By monitoring\n\nand measuring your build pipeline, you can identify areas for\n\nimprovement and make changes as needed to ensure that\n\nyour pipeline continues to meet your needs.\n\nIf you have any questions about how Databricks can\n\nintegrate into your DevOps solution, please don’t hesitate to\n\n[reach out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Seek to automate early and often:** Automate as much\n\nof the build process as possible, from checking code into\n\nversion control to generating builds and distributing them\n\nto stakeholders. This can help reduce errors and save time,\n\nallowing game teams to focus on more high-value tasks.\n\n\n**Version control, version control, version control:** Use a\n\nversion control system to manage the source code and\n\nother assets. This ensures that changes to the codebase\n\nare tracked and can be easily undone if needed.\n\n**Implement continuous integration and delivery:**\n\nContinuous integration (CI) involves automatically building\n\nand testing after code changes are checked into version\n\ncontrol. With CI, you can verify that new changes to the codebase do not\n\nbreak existing functionality. By automating the build\n\nprocess, CI helps to reduce errors and save time. CD, on\n\nthe other hand, involves automatically delivering builds to\n\nstakeholders, such as QA testers, beta testers, or end-\n\nusers, after they have passed the automated tests. By\n\ncombining CI and CD, a video game project can ensure\n\nthat builds are generated and delivered quickly and\n\nefficiently, without the need for manual intervention.\n\n**Build for scalability:** As your game project grows, you\n\nwill need a build pipeline solution that is scalable and can\n\nhandle the needs of your game team.\n\n**Integration with other tools:** Integrate the build pipeline\n\nsolution with other tools and systems, such as issue\n\ntracking, testing, and deployment tools, to ensure a\n\nsmooth and efficient workflow.\n\n\n**Reference Architecture**\n\n[Reference architecture diagram: game infrastructure, Databricks SQL, Power BI, AWS QuickSight]\n\n\n-----\n\n### Getting Started with Crash Analytics\n\nBuilding a pipeline that provides a holistic view to support crash\n\nanalytics means ingesting data that comes from multiple\n\nsources at different velocities and joining that data together.\n\nThe number of data sources depends on your game project’s\n\npublishing platforms; some may come from console-based\n\nproviders such as Sony, Xbox, and Nintendo, or PC platforms\n\nlike Steam, Epic Games Marketplace, GOG and many others.\n\n**High-level steps**\n\n- Determine what platforms your game is running on and\n\nhow to interface with them to collect data.\n\n- **Collect crash data:** Implement crash reporting tools in\n\nyour game to collect data on crashes. The source data\n\nmay be delivered in varying formats such as JSON or CSV.\n\n- **Load crash data into Databricks:** Use Databricks’ data\n\ningestion tools to load the crash data into your workspace.\n\nThis could involve using Databricks’ built-in data source\n\nconnectors, or programmatically ingesting files to load the data.\n\n\n\n- **Transform and clean the crash data:** Use Databricks’\n\ndata processing and transformation tools to clean and\n\nprepare the crash data for analysis. 
This could involve\n\nusing Databricks’ capabilities like DLT, or using SQL to\n\nperform custom transformations.\n\n- **Visualize crash data:** Use Databricks’ dashboarding tools\n\nto create visualizations that help you understand the\n\npatterns and trends in your crash data. This could involve\n\nusing Databricks’ built-in visualization tools, or integrating\n\nwith external visualization tools like Tableau or Power BI.\n\n- **Analyze crash data:** Use Databricks’ machine learning\n\nand statistical analysis tools to identify the root causes\n\nof crashes. This could involve using Spark MLlib or many\n\nof the other popular tools to build machine learning models, or\n\nusing SQL to perform custom analyses.\n\n- **Monitor and refine your pipeline:** Regularly review your\n\npipeline to ensure that it remains relevant and useful.\n\nRefine your pipeline as necessary to reflect changes in\n\nyour crash data or your goals.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n\n-----\n\n**Tips / Best Practices**\n\n- **Automated collection and aggregation of crash reports:**\n\nCollecting crash reports should be an automated process\n\nthat is integrated into the output of the build pipeline\n\nfor the game. The crash reports should be automatically\n\naggregated and made available for analysis in near real-time.\n\n- **Clear reporting and prioritization of issues:** The solution\n\nshould provide clear reporting on the most common\n\nissues and allow game developers to prioritize fixing the\n\nmost impactful problems first.\n\n- **Integration with other analytics tools:** The crash analytics\n\nsolution should integrate with other analytics tools, such\n\nas player behavior tracking, to provide a more complete\n\npicture of how crashes are impacting the player experience.\n\n- **Flexibility and scalability:** As the game grows, the\n\n\nAdditionally, there are some key gotchas to look out for when\n\nimplementing a crash analytics solution.\n\n- **Data privacy and security:** Ensure that crash reports do\n\nnot contain sensitive information that could be used to\n\nidentify individual players.\n\n- **Scalability:** As the number of players and crashes\n\nincreases, it may become difficult to manage and analyze\n\nthe growing volume of data.\n\n- **Integration with other tools:** Be aware when integrating\n\ncrash analytics with other tools and systems, especially if\n\nthe tools use different data formats or data structures.\n\n- **Prioritization of issues:** Determine which crashes are\n\nthe most impactful and prioritize fixes accordingly. This\n\ncan be a complex process, especially if there are a large\n\nnumber of different crash types and causes.\n\n\nsolution should be able to scale to accommodate an\n\nincreasing number of players and crashes.\n\n**Data privacy and security:** It’s important to consider data\n\nprivacy and security when implementing a crash analytics\n\nsolution. 
This may involve implementing measures to\n\nanonymize crash reports, or taking steps to ensure that\n\nsensitive information is not included in the reports.\n\n**Reference Architecture**\n\n**Databricks**\n**SQL**\n\n**Power BI**\n\n**AWS**\n\n**Quicksight**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "### Executive Guide\n\n# Transform and Scale Your Organization With Data and AI\n\n#### A guide for CIOs, CDOs, and\n data and AI executives\n\n\n-----\n\n## Contents\n\n**A U T H O R :**\n\n**Chris D’Agostino**\n\nGlobal Field CTO\n\nDatabricks\n\n**E D I T O R S :**\n\nManveer Sahota\n\n\n**C H A P T E R 1 :** \u0007 **Executive Summary** 3\n\n**C H A P T E R 2 :** \u0007 **Define the Strategy** 6\n\n**1.** Establish the goals and business value 8\n\n**2.** Identify and prioritize use cases 19\n\n**3.** Build successful data teams 22\n\n**4.** Deploy a modern data stack 28\n\n**5.** Improve data governance and compliance 36\n\n**6.** Democratize access to quality data 41\n\n**7.** Dramatically increase productivity of your workforce 47\n\n**8.** Make informed build vs. buy decisions 52\n\n**9.** Allocate, monitor and optimize costs 55\n\n**10.** Move to production and scale adoption 58\n\n\nJessica Barbieri\n\n\nToby Balfre\n\n\n**C H A P T E R 3 :** **Conclusion** \u0007 63\n\n\n-----\n\n**CHAPTER 1:**\n## Executive Summary\n\nData and AI leaders are faced with the challenge\n\nof future-proofing their architecture and platform\n\ninvestments. The Lakehouse implementation from\n\nDatabricks combines the best features of EDWs\n\nand data lakes by enabling all their workloads using\n\nopen source and open standards — avoiding the\n\nvendor lock-in, black box design and proprietary\n\ndata formats of other cloud vendors.\n\n\nIt’s not surprising that many industry experts say data is the most valuable resource in the modern\n\neconomy — some even go so far as to describe it as the “new oil.” But at Databricks, we think of data as\n\nwater. Its core compound never changes, and it can be transformed to whatever use case is desired,\n\nwith the ability to get it back to its original form. Furthermore, just as water is essential to life, data is now\n\nessential to survival, competitive differentiation and innovation for every business. Clearly, the impact and\n\nimportance of data are growing exponentially in both our professional and personal lives, while artificial\n\nintelligence (AI) is being infused in more of our daily digital interactions. 
The explosion in data availability\n\nover the last decade and the forecast for growth at a compounded [annual growth rate (CAGR) of 23%](https://www.google.com/url?q=https://www.idc.com/getdoc.jsp?containerId%3DprUS47560321&sa=D&source=docs&ust=1651117260200496&usg=AOvVaw3jdZ_6YHlXGQlUMJK8ULux) over\n\n2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n\n(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n\nEvery organization is working to improve business outcomes while effectively managing a variety of risks —\n\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n\nYour organization’s data and the systems that process it play a critical role in not only enabling your financial\n\ngoals but also in minimizing these seven key business risks.\n\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\n\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\n\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\n\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\n\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\n\nsignificant return on investment (ROI) — one that starts in months, not years.\n\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\n\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\n\nto deliver on their data strategy — including how to deploy a modern data architecture, leverage data\n\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\n\nidentify and execute on AI opportunities.\n\n\n-----\n\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\n\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\n\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\n\norganizations have the option of moving away from closed, proprietary systems offered by a variety\n\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\n\nindustry standards.\n\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\n\nwe’ve hired industry experts and thought leaders to help organizations better understand the steps involved\n\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\n\narchitecture, which decouples data storage from compute while providing the best price/performance\n\nmetrics for all your data workloads — including data warehousing. 
We have captured the lessons learned\n\nand summarized them in this series of Executive Guides — which are designed to serve as blueprints for\n\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\n\ninitiatives for data, analytics and AI using a _modern data stack_. Databricks is the first company to deliver a\n\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\n\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\n\nshown in Figure 1.\n\n\n###### Lakehouse Platform\n\n**Figure 1:**\nThe Databricks Lakehouse Platform: Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML workloads running on Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and a cloud data lake for all structured and unstructured data\n\n\n-----\n\n**The lakehouse architecture benefits organizations in several ways:**\n\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n\n**2.** It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n\n**3.** It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.\n\nOur intention is to present key considerations and equip you with the knowledge to ask informed questions,\n\nmake the most critical decisions early in the process, and develop the comprehensive strategy that most\n\norganizations lack.\n\nIn addition, we have created an easy-to-follow Data and AI Maturity Model and provided a comprehensive\n\nprofessional services offering that organizations can leverage to measure their readiness, reskill their staff\n\nand track progress as they embark on their data transformation initiative.\n\n\n-----\n\n**CHAPTER 2:**\n## Define the Strategy\n\n\nThe most critical step to enable data, analytics and AI at scale is to develop a comprehensive and executable\n\nstrategy for how your organization will leverage people, processes and platforms to drive measurable\n\nbusiness results against your corporate priorities. The strategy serves as a set of principles that every\n\nmember of your organization can refer to when making decisions. The strategy should cover the roles and\n\nresponsibilities of teams within your organization for how you capture, store, curate and process data to run\n\nyour business — including the internal and external resources (labor and budget) needed to be successful.\n\n\nEstablish the\ngoals and\nbusiness value\n\n\nBuild\nsuccessful\ndata teams\n\n\nEase data\ngovernance and\ncompliance\n\n\nSimplify\nthe user\nexperience\n\n\nAllocate,\nmonitor and\noptimize costs\n\n\nIdentify and\nprioritize\nuse cases\n\n\nDeploy a modern\ndata architecture\n\n\nDemocratize\naccess to\nquality data\n\n\nMake informed\n\nbuild vs. 
buy\ndecisions\n\n\nMove to\nproduction and\ndrive adoption\n\n\n**Figure 2:**\nThe 10 steps to a winning data and AI strategy\n\n\n-----\n\n#### Here are 10 key considerations\n\n**1.** \u0007Secure buy-in and alignment on the overall business goals, timeline and appetite for the initiative.\n\n**2.** \u0007Identify, evaluate and prioritize use cases that actually provide a significant ROI.\n\n**3.** \u0007Create high-performing teams and empower your business analyst, data scientist, machine learning\n\nand data engineering talent.\n\n**4.** \u0007Future-proof your technology investment with a modern data architecture.\n\n**5.** \u0007Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California\n\nConsumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n\n**6.** \u0007Implement needed policies, procedures and technology to guarantee data quality and enable secure\n\ndata access and the sharing of all your data across the organization.\n\n**7.** \u0007Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n\n**8.** \u0007Make informed build vs. buy decisions and ensure you are focusing your limited resources on the most\n\nimportant problems.\n\n**9.** \u0007Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n\n**10.** \u0007Codify best practices for moving into production and how to measure progress, rate of adoption and\n\nuser satisfaction.\n\nThe strategy should clearly answer these 10 topics and more, and should be captured in a living document,\n\nowned and governed by the CDO and made available for everyone in the organization to review and provide\n\nfeedback on. The strategy will evolve based on the changing market/political conditions, evolving business,\n\nthe technology landscape or a combination of any of these — but it should serve as the North Star for\n\nhow you will navigate the many decisions and trade-offs that you will need to make over the course of the\n\ntransformation.\n\n\nThis guide takes a stepwise approach to\n\naddressing each of these 10 topics.\n\n\n-----\n\nStudies have shown that data scientists spend 80%\n\nof their time collecting and compiling data sets\n\n\n#### 1. Establish the goals and business value\n\nMost organizations on a data, analytics and AI journey establish a set of goals for the resulting investment.\n\nThe goals generally fall into one of three categories:\n\n**1.** **Business outcomes**\n\n**2.** **People**\n\n**3.** **Technology**\n\n\nand only 20% of their time developing insights and\n\n\nIn terms of business outcomes, organizations need to adapt more quickly to market opportunities and\n\nemerging risks, and their legacy-based information systems make that difficult to achieve. As a result,\n\nbusiness leaders see the digital transformation as an opportunity to build a new technology foundation\n\nfrom which to run their business and increase business value. One that is more agile, scalable, secure and\n\neasier to use — making the organization better positioned to adapt, innovate and thrive in the modern and\n\ndynamic economy.\n\nFor organizations today, people are one of their most valuable assets — you cannot succeed in data,\n\nanalytics and AI without them. The battle for top talent is as fierce as ever, and the way that people work\n\nimpacts your ability to hire and retain the skills you need to succeed. 
It is important to make sure that\n\nemployees work in a frictionless data environment, to the extent possible, so they feel productive each day\n\nand can do their best work.\n\nFinally, from a technology perspective, organizations have grown tired of the high costs associated with\n\ncomplex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The\n\nindustry trend is to move away from large capital expenditures (capex) to pay for network and server\n\ncapacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex)\n\napproach. Your data analytics environment should support this trend as well — using open standards, low-\n\ncost storage and on-demand compute that efficiently spins up to perform data workloads and spins down\n\nonce they are complete.\n\n\nalgorithms. Organizations that are able to invert\n\nthese numbers benefit in two ways — happier\n\nemployees and improved time to market for use\n\ncases. These employers create more favorable\n\nworking environments and lower the risk of burnout\n\nand the resulting regrettable attrition.\n\n\n-----\n\n**Executive buy-in and support**\n\nLarge organizations are difficult to change — but it’s not impossible. In order to be successful, you need\n\nto have unwavering buy-in and support from the highest levels of management — including the CEO and\n\nboard of directors. With this support, you have the leverage you need to develop the strategy, decide on\n\nan architecture and implement a solution that can truly change the way your business is run. Without it,\n\nyou have a very expensive science project that has little hope of succeeding. Why? Because the majority\n\nof people in your organization are busy doing their day jobs. The added work to support the initiative must\n\nbe offset by a clear articulation of the resulting benefits — not only for the business but for the personnel\n\nwithin it. The transformation should result in a positive change to how people do their jobs on a daily basis.\n\nTransformation for data, analytics and AI needs to be a company-wide initiative that has the support from\n\nall the leaders. Even if the approach is to enable data and AI one business unit (BU) at a time, the plan needs\n\nto be something that is fully embraced in order to succeed. Ideally, the senior-most executives serve as\n\nvocal proponents.\n\n\n-----\n\n**Evolve to an AI-first company — not just a data-first company**\n\nData and AI transformations should truly transform the way organizations use data, not just evolve it. For\n\ndecades, businesses have operated using traditional business processes and leveraged Structured Query\n\nLanguage (SQL) and business intelligence (BI) tools to query, manipulate and report on a subset of their\n\ndata. There are five major challenges with this approach:\n\n**1.** A true self-assessment of where your organization is on the AI maturity curve. Most organizations will\n\nuse pockets of success with analytics and AI to move higher up the maturity curve, but in reality the\n\nability to replicate and scale the results is nearly impossible.\n\n#### Tech leaders are to the right of the Data Maturity Curve\n\n**Figure 3:**\nThe Data Maturity Curve: from hindsight to foresight, maturity progresses from clean data, reports and ad hoc queries (“What happened?”) through data exploration and predictive modeling (“What will happen?”) to prescriptive analytics (“How should we respond?”) and automated decision-making (automatically make the best decision), plotted against data and AI maturity.
-----\n\n**2.** Data volumes and types have outgrown even the most modern approaches to SQL-based data\n\nprocessing.\n\n**3.** These large data volumes also make it nearly impossible for your workforce to continue to\n\nprogrammatically state, in a priority manner, how data insights can be achieved or how the business\n\nshould react to changing data.\n\n**4.** Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the\n\nnumber of people needed to respond to every piece of data flowing into your environment. Machines\n\nscale, people do not.\n\n**5.** Advances in machine learning and AI have simplified the steps and reduced the expertise needed to\n\ngain game-changing insights. For these reasons, plus many others, the organizations that thrive in the\n\n21st century will do so based on their ability to leverage all the data at their disposal. Traditional ways\n\nof processing and managing data will not work. Using ML and AI will empower your workforce to\n\nleverage data to make better decisions for managing risk, helping your organization succeed in the\n\nmodern economy.\n\n**Go “all in” on the cloud**\n\nThe COVID-19 pandemic has caused rapid adoption of cloud-based solutions for collaboration and\n\nvideoconferencing — and organizations are now using this time to reevaluate their use of on-premises and\n\ncloud-based services. The cloud vendors provide many benefits to organizations, including Infrastructure\n\nas a Service (IaaS), Platform as a Service (PaaS) and Software as a Service (SaaS) solutions. These benefits,\n\nespecially when combined with the use of open source software (OSS), increase the speed at which\n\norganizations can use the latest technologies while also reducing their capex in these budget-conscious times.\n\nFor AWS, Microsoft, Google and other cloud providers, the game is about data acquisition. The more\n\ncorporate data that resides in a specific cloud, the more sticky the customer is to the vendor. At the same\n\ntime, multicloud support is both a selling point and an aspirational goal for many organizations. Companies\n\nare well aware of vendor lock-in and want to abstract their applications so they can be moved across\n\nclouds if there is a compelling business reason.\n\n\n-----\n\nApproaching your technology choices with a multicloud point of view gives the organization more sovereignty\n\nover the data — flexibility to run workloads anywhere, ease of integration when acquiring businesses that\n\nrun on different cloud providers and simplified compliance with emerging regulations that may require\n\ncompanies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\n\nincreasingly important.\n\n**Modernize business applications**\n\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\n\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\n\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n\nCCPA standards. 
Finally, the features and capabilities of the application may be monolithic in nature and,\n\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\n\nservices and APIs to easily provide access to an application’s functionality.\n\nCloud-based architectures, commodity databases and software application development frameworks make\n\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\n\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n\na backing database) has become straightforward with the latest tooling available to your application\n\ndevelopment teams.\n\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\n\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n\napplications that generate and store a significant amount of the data consumed within an organization. Using\n\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n\n\n“We are on an amazing journey. Being among\n\nthe fastest-growing enterprise software cloud\n\ncompanies on record was unimaginable when\n\nwe started Databricks. To get here, we’ve stayed\n\nfocused on the three big bets we made when\n\nfounding the company — cloud, open source\n\nand machine learning. Fast-forward seven years,\n\nthousands of data teams around the globe are\n\nworking better together on Databricks.”\n\n**Ali Ghodsi**\n\nCo-founder and CEO\n\nDatabricks\n\n\n-----\n\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n\nother applications within your environment to store copies of the data — unless absolutely necessary for\n\nperformance reasons. In this case, it is best to “cache” the data for use in the non-SOR application and sync\n\nthe data from the actual SOR.\n\nData from these SORs should be made available in three ways:\n\n**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n\n**2.** \u0007Ensure that copies of the data land in the data lake.\n\n**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n\nconsumption by downstream applications.\n\n**Move toward real-time decisioning**\n\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n\nand the second is to view data as an individual event. This so-called “time value of data” is an important\n\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n\nthe same data platform.\n\nOn the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\n\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\n\nnewly created or arriving data event gives you the opportunity to make decisions — in the moment — that\n\ncan positively affect your ability to reduce risk, better service your customers or lower your operating costs.\n\nThe goal is to act immediately — with reliability and accuracy — upon the arrival of a new streaming event.\n\nThis “time value of data” is shown in Figure 4 on the next page.\n\n\n-----\n\nFor example, real-time processing of clickstream data from your customer-facing mobile application can\n\nindicate when the customer is having trouble and may need to call into your call center. This insight gives\n\nyou the opportunity to interject with a digital assistant or to pass on “just-in-time” information to your call\n\ncenter agents — improving the customer experience and lowering customer churn.\n\nData, analytics and AI rely on the ”time value of data” — a powerful concept that allows you to train your\n\nmachine learning models using historical data and provides you with the ability to make real-time decisions\n\nas new events take place. For example, credit card fraud models can use deep historical data about a given\n\ncustomer’s buying patterns (location, day of week, time of day, retailer, average purchase amount, etc.) to\n\nbuild rich models that are then executed for each new credit card transaction. This real-time execution,\n\ncombined with historical data, enables the best possible customer experience.\n\n#### Time Value of Data\n\n\nThe Databricks Lakehouse Platform allows you to\n\ncombine real-time streaming and batch processing\n\nusing one architecture and a consistent set of\n\nprogramming APIs.\n\n**Figure 4:**\nTime Value of Data\n\n\nValue of an individual data\n\nrecord is very high once created\nbut decreases over time\n\n\nValue of data records\n\nin aggregate increases\nover time\n\n\nReal-Time Decisioning Real-Time Analysis Trend Analysis Model Training\n\n\n-----\n\n**Land** **_all_** **data in a data lake**\n\nIn order to effectively drive data, analytics and AI adoption, relevant data needs to be made available to the\n\nuser as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to\n\naccess. Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data\n\nwarehouse, with predefined schemas that are designed to allow you to ask very specific questions about\n\nthat data only. What do you do when you want to ask a different question? To further complicate matters,\n\nhow do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores?\n\nHow do you find new insights as quickly as possible?\n\nThe overall goal is to gain insights from the data as quickly as possible — which can happen at any step\n\nalong the data pipeline — including raw, refined and curated data states.\n\nThis phenomenon has led to the concept known as the four Vs of data — specifically, _volume_ , _velocity_ ,\n\n_variety_ and _veracity_ . Data-, analytics- and AI-driven organizations need to be able to store and process\n\nall their data, regardless of size, shape or speed. In addition, data lineage and provenance are critical to\n\nknowing whether or not you can trust the data.\n\n**Change the way people work**\n\nWhen done correctly, organizations get value from data, analytics and AI in three ways — infrastructure\n\nsavings, productivity gains and business-impacting use cases. 
Productivity gains require a true focus on\n\nminimizing the number of steps needed to produce results with data. This can be accomplished by:\n\n**1.** \u0007 Making data more accessible and ensuring it can be trusted\n\n**2.** Minimizing the number of tools/systems needed to perform work\n\n**3.** Creating a flywheel effect by leveraging the work of others\n\n\n“We believe that the data lakehouse architecture\n\npresents an opportunity comparable to the one\n\nwe saw during early years of the data warehouse\n\nmarket. The unique ability of the lakehouse to\n\nmanage data in an open environment, blend all\n\nvarieties of data from all parts of the enterprise and\n\ncombine the data science focus of the data lake\n\nwith the end-user analytics of the data warehouse\n\nwill unlock incredible value for organizations.”\n\n**Bill Inmon**\n\nThe father of the data warehouse\n\n\n-----\n\nIn large organizations, it’s understandable why application and data silos are prevalent. Each business unit\n\nis laser-focused on achieving their goals, and the use of information technology is viewed as an enabler.\n\nSystems and applications get built over time to satisfy specific needs within a line of business. As a result,\n\nit’s not surprising to learn that employees must jump through a large number of hoops to get access to the\n\ndata they need to do their jobs. It should be as simple as getting your identity and PC.\n\nWith Databricks, users can collaborate and perform\n\n\nA primary goal of your data and AI transformation should be to focus on improving the user experience —\n\nin other words, improving how your entire organization interacts with data. Data must be easily discoverable\n\nwith default access to users based on their role(s) — with a simple process to compliantly request access to\n\ndata sets that are currently restricted. The tooling you make available should satisfy the principal needs of\n\nthe various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n\nFinally, the results of the work performed by a user or system upstream should be made available to users\n\nand systems downstream as “data assets” that can drive business value.\n\nOrganizations that maximize the productivity of their workforce and enable employees to do their best work\n\nunder optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n\n**Minimize time in the “seam”**\n\nAs you begin your data transformation, it is important to know that the longer it takes, the more risk and\n\ncost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n\nto a modern data stack will require you to operate in two environments simultaneously, the old and the new,\n\nfor some period of time. This will have a series of momentary adverse effects on your business:\n\n\u0007It will increase your operational costs substantially, as you will run two sets of infrastructure\n\n\u0007It will increase your data governance risk, since you will have multiple copies of your data sitting in two\n\nvery different ecosystems\n\n\ntheir work more efficiently, regardless of their\n\npersona or role. 
The user experience is designed\n\nto support the workloads of data analysts, SQL\n\ndevelopers, data engineers, data scientists and\n\nmachine learning professionals.\n\n\n-----\n\n\u0007It increases the cyberattack footprint and vectors, as the platforms will likely have very different security\n\nmodels and cyber defenses\n\n\u0007It will cause strain on your IT workforce due to the challenges of running multiple environments\n\n\u0007It will require precise communications to ensure that your business partners know which environment to\n\nuse and for what data workloads\n\nTo mitigate some of the strain on the IT workforce, some organizations hire staff augmentation firms to\n\n“keep the lights on” for the legacy systems while the new systems are being implemented and rolled out.\n\nIt’s important to remember this is a critical but short-lived experience for business continuity.\n\n**Shut down legacy platforms**\n\nIn keeping with the goal of minimizing time in the seam, the project plan and timeline must include the\n\nsteps and sequencing for shutting down legacy platforms. For example, many companies migrate their on-\n\npremises Apache Hadoop data lake to a cloud-based object store. The approach for shutting down the on-\n\npremises Hadoop system is generally as follows:\n\n**1.** \u0007Identify the stakeholders (business and IT) who own the jobs that run in the Hadoop environment.\n\n**2.** \u0007Declare that no changes can be made to the Hadoop environment — with the exception of emergency\n\nfixes or absolutely critical new business use cases.\n\n**3.** \u0007Inventory the data flow paths that feed data into the Hadoop environment.\n\n**4.** \u0007Identify the source systems that feed the data.\n\n**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n\n**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n\n**7.** \u0007Determine the downstream consumers of the output from the jobs.\n\n\n-----\n\n**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n\n**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n\narchitecture.\n\n**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n\nworking smoothly.\n\n**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. Look for smoke.\n\n**12.** \u0007Rinse and repeat — until all jobs are migrated.\n\n**13.** \u0007Shut down the Hadoop cluster.\n\nA similar model can also be applied to legacy on-premises enterprise data warehouses.\n\nYou can follow the same process for other legacy systems in your environment. Some of these systems\n\nmay be more complex and require the participation of more stakeholders to identify the fastest way to\n\nrationalize the data and processes. It is important, however, to make sure that the organization has the\n\nfortitude to hold the line when there is pressure to make changes to the legacy environments or extend\n\ntheir lifespan. Setting firm dates for when these legacy systems will be retired will serve as a forcing function\n\nfor teams when they onboard to the new modern data architecture. Having the executive buy-in from page\n\n9 plays a crucial role in seeing the shutdown of legacy platforms through.\n\n\n-----\n\n#### 2. 
Identify and prioritize use cases\n\nAn important next step in enabling data, analytics and AI to transform your business is to identify use cases\n\nthat drive business value — while prioritizing the ones that are achievable under the current conditions\n\n(people, processes, data and infrastructure). There are typically hundreds of use cases within an organization\n\nthat could benefit from better data and AI — but not all use cases are of equal importance or feasibility.\n\nLeaders require a systematic approach for identifying, evaluating, prioritizing and implementing use cases.\n\n**Establish the list of potential use cases**\n\nThe first step is to ideate by bringing together various stakeholders from across the organization and\n\nunderstand the overall business drivers — especially those that are monitored by the CEO and board of\n\ndirectors. The second step is to identify use case opportunities in collaboration with business stakeholders,\n\nand understand the business processes and the data required to implement the use case. After steps one and\n\ntwo, the next step is to prioritize these cases by calculating the expected ROI. To avoid this becoming a pet\n\nproject within the data/IT teams, it’s important to have a line of business champion at the executive level.\n\nThere needs to be a balance between use cases that are complex and ones that are considered low-\n\nhanging fruit. For example, determining if a web visitor is an existing or net new customer requires a fairly\n\nstraightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n\ngiven individual or household. However, developing a sophisticated credit card fraud model that takes into\n\naccount geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n\nto perform the analytics.\n\nIn terms of performance, thought should be given to the speed at which the use case must execute. In\n\ngeneral, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n\ncases into three categories:\n\n**1.** Sub-second response\n\n**2.** Multi-second response\n\n**3.** Multi-minute response\n\n\n-----\n\nBeing pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n\nengineering the design and infrastructure.\n\n**Thinking in terms of “data assets”**\n\nMachine learning algorithms require data — data that is readily available, of high quality and relevant — to\n\nperform the experiments, train the models, and then execute the model when it is deployed to production.\n\nThe quality and veracity of the data used to perform these machine learning steps are key to deploying\n\nmodels into production that produce a tangible ROI.\n\nIt is critical to understand what steps are needed in order to make the data available for a given use case.\n\nOne point to consider is to prioritize use cases that make use of similar or adjacent data. If your engineering\n\nteams need to perform work to make data available for one use case, then look for opportunities to have the\n\nengineers do incremental work in order to surface data for adjacent use cases.\n\nMature data and AI companies embrace the concept of “data assets” or “data products” to indicate\n\nthe importance of adopting a design strategy and data asset roadmap for the organization. 
Taking this\n\napproach helps stakeholders avoid fit-for-purpose data sets that drive only a single use case — and raise\n\nthe level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n\nroadmap helps data source owners understand the priority and complexity of the data assets that need to\n\nbe created. Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n\ninfluences the design of business applications and other systems within the organization.\n\n**Determine the highest impact/priority**\n\nAs shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n\naccount three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n\nwhether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n\nreduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n\ndata science talent readily available, to implement the use case. The ROI score indicates whether or not the\n\norganization can easily measure the impact to the P/L.\n\n\n-----\n\nScoring guidelines (relative scoring); criteria are scored by business and technology stakeholders:\n\n|Score|Criterion|1 = LOW SCORE, DO LATER|5 = AVERAGE, NICE TO HAVE|10 = HIGH, MUST HAVE|\n|---|---|---|---|---|\n|Strategic Importance Score: How important is it to business success?|Business Alignment|Not required for any corporate goals|Not required for immediate corporate goals|Required for immediate corporate goals|\n||Business Driver|Does not drive growth/profitability (P&L) or competitiveness|Could drive some growth/profitability (P&L)|Significantly drives growth/profitability (P&L) and competitiveness|\n||IT Foundation|No BI/IT dependencies|BI/IT best practice|BI/IT foundational element|\n|Feasibility Score: What is the current data and AI readiness?|Data Access and Trust (adjusting based on availability)|Low awareness of available data (internal and external) or the problems it can solve|Some ingestion and exploration of large-scale data is possible|Large-scale data is available for exploration in the cloud|\n||Delivery (Data Engineers, Data Scientists, Data Analysts)|Limited in-house resources|Hiring plan for data science and engineering resources, few available in-house|Scaled data science, engineering, cloud and deployment organization|\n||Architecture|Current thinking on architecture resembles on-prem traditional data warehousing solution with batch processes rather than a data lakehouse approach|Architecture has been built and tested, some use cases are underway with multiple data sources now available in the cloud|The platform is utilized at scale across the business and is able to evolve to meet the demands of new business lines and services driven by data|\n|ROI Score: How tangible and large is the ROI?|ROI Potential|Mostly productivity gains, “soft intangible benefits”|Some P&L impact, not easily tangible|Significant P&L impact, “hard measured benefits”|\n\n\n**Figure 5:**\nMethodology for scoring use cases\n\n**Ensure business and technology leadership alignment**\n\nPrioritizing use cases requires striking a balance between offensive- and defensive-oriented use cases.\n\nIt is important for executives to evaluate use cases in terms of opportunity growth (offensive) and risk\n\nreduction (defensive). 
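To make the scorecard in Figure 5 concrete, the sketch below shows one way the three factor scores could be rolled up into a single priority number. The weights, criterion keys and example use cases are illustrative assumptions rather than part of the methodology itself.

```python
# Illustrative sketch of the Figure 5 scorecard: each criterion is scored
# 1 (low), 5 (average) or 10 (high) and rolled up into a single priority.
# Weights and example use cases are hypothetical assumptions.

CRITERIA = {
    "strategic_importance": ["business_alignment", "business_driver", "it_foundation"],
    "feasibility": ["data_access_and_trust", "delivery", "architecture"],
    "roi": ["roi_potential"],
}

WEIGHTS = {"strategic_importance": 0.4, "feasibility": 0.3, "roi": 0.3}  # assumed


def score_use_case(scores: dict) -> float:
    """Average the 1/5/10 criterion scores per factor, then apply the factor weights."""
    total = 0.0
    for factor, criteria in CRITERIA.items():
        factor_avg = sum(scores[c] for c in criteria) / len(criteria)
        total += WEIGHTS[factor] * factor_avg
    return round(total, 2)


use_cases = {
    "credit_card_fraud_model": {
        "business_alignment": 10, "business_driver": 10, "it_foundation": 5,
        "data_access_and_trust": 5, "delivery": 5, "architecture": 5,
        "roi_potential": 10,
    },
    "web_visitor_identification": {
        "business_alignment": 5, "business_driver": 5, "it_foundation": 5,
        "data_access_and_trust": 10, "delivery": 10, "architecture": 5,
        "roi_potential": 5,
    },
}

# Rank use cases from highest to lowest priority score.
for name, scores in sorted(use_cases.items(), key=lambda kv: -score_use_case(kv[1])):
    print(f"{name}: {score_use_case(scores)}")
```

In practice, the relative weights themselves should be agreed on by the same business and technology stakeholders who score the criteria.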
For example, data governance and compliance use cases should take priority\n\nover offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n\nacquisition of a new customer.\n\n\n-----\n\nThe Databricks Professional Services team can\n\nhelp customers identify revenue-generating and\n\ncost-saving opportunities for data and AI use cases\n\nthat provide a significant ROI when adopting the\n\n\n#### 3. Build successful data teams\n\nIn order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n\nperforming teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n\ntraining and leadership. Digital transformations require executive-level support and are likely to fail without\n\nit — especially in large organizations.\n\nHowever, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n\nan enterprise level. In other words, they must also evolve their company culture into one that embraces data,\n\ndata literacy, collaboration, experimentation and agile principles. We define these companies as “data native.”\n\n\nlakehouse architecture.\n\n**Chief information officers and chief data officers — two sides of the data coin**\n\nData native companies generally have a single, accountable executive who is responsible for areas such\n\nas data science, business analytics, data strategy, data governance and data management. The data\n\nmanagement aspects include registering data sets in a data catalog, tracing data lineage as data sets flow\n\nthrough the environment, performing data quality checks and scanning for sensitive data in the clear.\n\nMany organizations are rapidly adding the chief data officer (CDO) role to their executive ranks in order\n\nto oversee and manage these responsibilities. The CDO works closely with CIOs and other business\n\nstakeholders to establish the overall project plan, design and implementation — and to align project\n\nmanagement, product management, business analysis, data engineering, data scientist and machine\n\nlearning talent.\n\nThe CDO and CIO will need to build a broad coalition of support from stakeholders who are incentivized to\n\nmake the transformation a success and help drive organization-wide adoption. To do this, the stakeholders\n\nmust understand the benefits of — and their role and responsibilities in — supporting the initiative.\n\n\n-----\n\nThere are two organizational constructs that are found in most successful data native companies. The first is\n\nthe creation of an _AI/ML center of excellence_ (COE) that is designed to establish in-house expertise around\n\nML and AI, and which is then used to educate the rest of the organization on best practices. The second is\n\nthe formation of a _data and AI transformation steering committee_ that will oversee and guide decisions and\n\npriorities for the transformative data, analytics and AI initiatives, plus help remove obstacles.\n\nFurthermore, CDOs need to bring their CIOs along early in the journey.\n\n**Creating an AI/ML COE**\n\nData science is a fast-evolving discipline with an ever-growing set of frameworks and algorithms to enable\n\neverything from statistical analysis to supervised learning to deep learning using neural networks. 
While it is\n\ndifficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n\ndocument, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n\nHowever, the general distinction is that data science is used to produce insights, machine learning is used to\n\nproduce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n\nis expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n\nvarious data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n\nset of questions.\n\nOrganizations wanting to build a data science competency should consider hiring talent into a centralized\n\norganization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n\ndata science. The COE works with the rest of the organization to educate and promote the appropriate use\n\nof data science for various use cases.\n\n\n-----\n\nA common approach is to have the COE report into the CDO, but still have data scientists dotted line into\n\nthe business units or department. Using this approach, you achieve two goals:\n\n\u0007The data scientists are closer to the business stakeholders, have a better understanding of the data\n\nwithin a business unit and can help identify use cases that drive value\n\n\u0007Having the data scientists reporting into the CDO provides a structure that encourages collaboration\n\nand consistency in how work is performed among the cohort and brings that to the entire organization\n\n**Data and AI transformation steering committee**\n\nThe purpose of the steering committee is to provide governance and guidance to the data transformation\n\ninitiative. The CDO and CIO should co-chair the committee along with one business executive who can be\n\na vocal advocate and help drive adoption. The level of executive engagement is critical to success of the\n\ninitiative.\n\nThe steering committee should meet regularly with leaders from across the organization to hear status\n\nreports and resolve any conflicts and remove obstacles, if possible. 
The leaders should represent a broad\n\ngroup of stakeholders, including:\n\n\u0007\n**Program/project management:** To report the status of progress for deploying the new data\n\necosystem and driving adoption through use cases\n\n\u0007\n**Business partners:** To provide insight and feedback on how easy or difficult it is to drive adoption\n\nof the platform\n\n\u0007\n**Engineering:** To report the status of the implementation and what technology trade-offs need\n\nto be made\n\n\u0007\n**Data science:** To report on the progress made by the COE on educating the organization about\n\nuse cases for ML and to report the status of various implementations\n\n\n-----\n\n\u0007\n**InfoSec:** To review the overall security, including network, storage, application and data\n\nencryption and tokenization\n\n\u0007\n**Architecture:** To oversee that the implementation adheres to architectural standards\n\nand guardrails\n\n\u0007\n**Risk, compliance and legal:** To oversee the approach to data governance\n\nand ethics in ML\n\n\u0007\n**User experience:** To serve as the voice of the end users who will perform their jobs using\n\nthe new data ecosystem\n\n\u0007\n**Communication:** To provide up-to-date communications to the organization about next\n\nsteps and how to drive adoption\n\n**Partnering with architecture and InfoSec**\n\nEarly on, the CDO and CIO should engage the engineering and architecture community within the\n\norganization to ensure that everyone understands the technical implications of the overall strategy. This\n\nminimizes the chances that the engineering teams will build separate and competing data platforms. In\n\nregulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n\nThe EA is responsible for validating that the overall technology design and data management features\n\nsupport the performance and regulatory compliance requirements — specifically, whether the proposed\n\ndesign can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n\nvariety and veracity (four Vs) of the data environment.\n\n\nIt is important to fully understand which\n\nenvironments and accounts your data is stored\n\nin. The goal is to minimize the number of copies of\n\nyour data and to keep the data within your cloud\n\naccount — and not the vendor’s.\n\nMake sure the architecture and security model for\n\nprotecting data is well understood.\n\n\n-----\n\nFrom an InfoSec perspective, the CDO must work to ensure that the proper controls and security are\n\napplied to the new data ecosystem and that the authentication, authorization and access control methods\n\nmeet all the data governance requirements. An industry best practice is to enable self-service registration\n\nof data sets, by the data owner, and support the assignment of security groups or roles to help automate\n\nthe access control process. This allows data sets to be accessible only to the personnel that belong to a\n\ngiven group. The group membership could be based primarily on job function or role within the organization.\n\nThis approach provides fast onboarding of new employees, but caution should be taken not to proliferate\n\ntoo many access control groups — in other words, do not get too fine grained with group permissions, as\n\nthey will become increasingly difficult to manage. A better strategy is to be more coarse-grained and use\n\nrow- and column-level security sparingly.\n\n**Centralized vs. 
federated labor strategy**\n\nIn most organizations today, managers work in silos, making decisions with the best intentions but focused\n\non their own functional areas. The primary risk to the status quo is that there will be multiple competing and\n\nconflicting approaches to creating enterprise data and AI platforms. This duplication of effort will waste time\n\nand money and potentially erode the confidence and motivation of the various teams. While it certainly is\n\nbeneficial to compare and contrast different approaches to implementing an architecture, the approaches\n\nshould be strictly managed, with everyone designing for the same goals and requirements — as described in\n\nthis strategy document and adhering to the architectural principles and best practices.\n\nEven still, the CDO and CIO together should deliver a data analytics and AI platform with the\n\nleast amount of complexity possible, and one that can easily scale across the organization. It is very\n\nchallenging to merge disparate data platform efforts into a single, cohesive design. It is best to get out\n\nin front of this wave of innovation and take input from the various teams to create a single, centralized\n\nplatform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n\nmodern data stack — while ensuring that there is no duplication of effort when implementing the platform\n\ncomponents. Figure 6 shows one possible structure.\n\n\n-----\n\n**Figure 6:**\nCentralized teams with matrixed responsibilities. Data scientists (model and predict with data), data analysts (visualize and describe data) and data engineers (store, process, maintain data) are organized into matrixed teams (Team A, $1.1M; Team B, $1.3M; Team C, $1.5M) alongside business partners and domain experts. Centralize data scientists under the CDO — embed in lines of business for day-to-day tasking. Centralize data engineers under the CIO/CTO — initially as an enterprise function.\n\n**Hiring, training and upskilling your talent**\n\nWhile this guide does not cover recruiting strategies, it is important to note that data engineering and data\n\nscience talent is very difficult to find in this competitive market. As a result, every organization should\n\nconsider what training and upskilling opportunities exist for their current staff. A large number of online\n\ncourses, at relatively low cost, teach the fundamentals of data science and AI. It will still be important to\n\naugment your existing staff with experienced data scientists and machine learning experts. You will then\n\nneed to establish clear training paths, resources and timelines to upskill your talent.\n\nUsing the COE construct, it is easier to upskill a mix of data science talent by having the experts mentor the\n\nless experienced staff. The majority of Ph.D.-level talent comes from academia and has a vested interest\n\nin educating others. It’s important to set up the structure and allow time in the schedule for knowledge\n\ntransfer, experimentation and a safe environment in which to fail. 
A key aspect in accelerating the\n\nexperience of your talent is to enable data science using production-like data and creating a collaborative\n\nenvironment for code sharing.\n\n\n-----\n\nThe Databricks training, [documentation](https://docs.databricks.com) and\n\n[certification](https://databricks.com/learn/certification) available to customers is industry-\n\nleading, and our [Solution Accelerators](https://databricks.com/solutions/accelerators) provide\n\n\n#### 4. Deploy a modern data stack\n\nThe modern data architecture can most easily be described as the evolution of the enterprise data\n\nwarehouse (EDW) from the 1980s and the Hadoop-style data lakes from the mid-2000s. The capabilities,\n\nlimitations and lessons learned from working with these two legacy data architectures inspired the next\n\ngeneration of data architecture — what the industry now refers to as the lakehouse.\n\nFigure 7 shows how the architectures have evolved as networking, storage, memory and CPU performance\n\nhave improved over time.\n\n\nexemplar code for organizations to hit the ground\n\nrunning with data and AI.\n\n**Figure 7:**\nA brief history of data architectures\n\n\n-----\n\n**Evolving beyond the enterprise data warehouse and data lake**\n\nThe EDW provided organizations with the ability to easily load structured and semi-structured data into\n\nwell-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n\n(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n\nthe business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n\ncatalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n\nusers — while still being performant. The EDW served its purpose for decades. However, most of the recent\n\nadvances in AI have been in better models to process unstructured data (text, images, video, audio), but\n\nthese are precisely the types of data that an EDW is not optimized for.\n\nTherefore, in the mid-2000s, organizations wanted to take advantage of new data sets — _ones that_\n\n_contained unstructured data_ — and apply new analytics — _ones that leveraged emerging data science_\n\n_algorithms_ . In order to accomplish this, massive investments in on-premises data lakes occurred — most\n\noften leveraging Apache Hadoop and its distributed file system, known as HDFS, running on low-cost,\n\ncommodity hardware. The Hadoop-style data lake provided the separation of compute from storage that\n\norganizations were seeking — thus eliminating the risk of vendor lock-in and opening the doors to a wide\n\nrange of new analytics. Despite all these benefits, the architecture proved to be difficult to use, with a\n\ncomplex programming model known as MapReduce, and the performance fell short of the majority of real-\n\ntime use cases.\n\nOver time, Hadoop workloads were often migrated to Apache Spark™ workloads, which run 100x faster by\n\nprocessing data in-memory across a cluster — with the ability to massively scale. The Spark programming\n\nmodel was also simpler to use and provided a consistent set of application programming interfaces (APIs)\n\nfor languages such as Python, SQL, R, Java and Scala. 
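As a small illustration of that API consistency, the same aggregation can be written with the PySpark DataFrame API or as SQL against the same engine. The `sales` table and its columns below are hypothetical examples, not data sets referenced in this guide.

```python
# Minimal PySpark sketch: the same logic reads naturally in Python, Scala, R or SQL.
# "sales" is a hypothetical table name used only for illustration.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("consistent-api-example").getOrCreate()

# DataFrame API version: filter, group and aggregate completed sales by day.
daily_revenue = (
    spark.table("sales")
    .where(F.col("status") == "complete")
    .groupBy("order_date")
    .agg(F.sum("amount").alias("revenue"))
)

# The identical query expressed in SQL against the same engine.
daily_revenue_sql = spark.sql(
    "SELECT order_date, SUM(amount) AS revenue "
    "FROM sales WHERE status = 'complete' GROUP BY order_date"
)
```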
Spark was the first major step in separating compute\n\nfrom storage and providing the scale needed for distributed workloads.\n\n\n-----\n\nA data lakehouse combines the best of data lakes and data warehouses, enabling BI and ML on all data on a simple, open and multicloud modern data stack.\n\n**Cloud-based data lakes**\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud object stores like\n\nAmazon S3 and Azure Data Lake Storage (ADLS) have become some of the largest, most cost-effective\n\nstorage systems in the world — which make them an attractive platform to serve as the next generation\n\nof data lakes. Object stores excel at massively parallel reads — an essential requirement for modern data\n\nwarehouses.\n\nHowever, data lakes lack some critical features: They do not support transactions, they do not enforce\n\ndata quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\n\n**Lakehouse — the modern data architecture**\n\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\n\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\n\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n\narchitecture possible.\n\n\n-----\n\n**Figure 8:**\nThe building blocks for a modern data architecture. Data sources (batch and real-time), spanning unstructured (image, video, audio; free text, blob), semi-structured (logs, clickstream; CSV, JSON, XML) and structured (systems of record; operational DBs) data, feed a curated data lake with data quality enforced at each step: raw data ingest (“Bronze”), filtered/cleaned/augmented (“Silver”) and business-level aggregates (“Gold”), which in turn serve exploratory data science, production machine learning and BI/ad hoc SQL analytics.\n\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\n\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\n\ntarget-state architecture supports loading all the data types that might be interesting to an organization —\n\nstructured, semi-structured and unstructured — and provides a single processing layer, using consistent\n\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\n\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\n\ntime, money and duplication of effort. 
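The sketch below illustrates the stepwise refinement shown in Figure 8, using PySpark with Delta Lake (discussed in the next section) to move data from a raw "Bronze" landing table through a cleaned "Silver" table to "Gold" business-level aggregates. The paths, table names and cleansing rules are hypothetical assumptions, not a prescribed implementation.

```python
# Hypothetical Bronze -> Silver -> Gold refinement with Delta Lake (PySpark).
# Paths, table names and cleansing rules are illustrative assumptions.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("medallion-sketch").getOrCreate()

# Bronze: land raw batch data as-is, with ingestion metadata.
raw = (
    spark.read.json("/landing/orders/")
    .withColumn("_ingested_at", F.current_timestamp())
)
raw.write.format("delta").mode("append").saveAsTable("bronze_orders")

# Silver: filter, clean and augment; ACID transactions keep concurrent
# readers and writers consistent.
silver = (
    spark.table("bronze_orders")
    .dropDuplicates(["order_id"])
    .where(F.col("amount") > 0)
    .withColumn("order_date", F.to_date("order_ts"))
)
silver.write.format("delta").mode("overwrite").saveAsTable("silver_orders")

# Gold: business-level aggregates consumed by BI and ML workloads.
gold = (
    spark.table("silver_orders")
    .groupBy("order_date", "region")
    .agg(F.sum("amount").alias("daily_revenue"))
)
gold.write.format("delta").mode("overwrite").saveAsTable("gold_daily_revenue")
```

Because every step writes to the same governed storage layer, downstream teams can reuse the Silver and Gold tables instead of rebuilding their own copies.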
Data arrives in a landing zone and is then moved through a series of\n\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\n\nThe architecture makes possible the efficient creation of “data assets” for the organization by taking a\n\nstepwise approach to improving data.\n\n\n-----\n\n**Lakehouse key features**\n\nTo effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\n\navailable for stakeholders to run business-critical production workloads:\n\n\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\n\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\n\nmonitoring and recovery.\n\n\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n\nread or write data, typically using SQL.\n\n\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\n\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n\n\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n\nlakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\n\nunstructured data like tables, files, models and dashboards in concert with existing data, storage and\n\ncatalogs.\n\n\u0007 **Storage is decoupled from compute:** In practice this means storage and compute use separate\n\nclusters, thus these systems are able to scale to many more concurrent users and larger data sizes.\n\nSome modern data warehouses also have this property.\n\n\u0007 **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide\n\nan API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently\n\naccess the data directly.\n\n\nDatabricks released Delta Lake to the open source\n\ncommunity in 2019. Delta Lake provides all the data\n\nlifecycle management functions that are needed\n\nto make cloud-based object stores reliable and\n\nperformant. This design allows clients to update\n\nmultiple objects at once, replace a subset of\n\nthe objects with another, etc., in a serializable\n\nmanner that still achieves high parallel read/write\n\nperformance from the objects — while offering\n\nadvanced capabilities like time travel (e.g., query\n\npoint-in-time snapshots or rollback of erroneous\n\nupdates), automatic data layout optimization,\n\nupserts, caching and audit logs.\n\n\n-----\n\n\u0007 **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be\n\nused to store, refine, analyze and access data types needed for many new data applications, including\n\nimages, video, audio, semi-structured data and text.\n\n\u0007 **Support for diverse workloads:** This includes data science, machine learning, SQL and analytics. 
Multiple\n\ntools might be needed to support all these workloads, but they all rely on the same data repository.\n\n\u0007 **End-to-end streaming:** Real-time reports are the norm in many enterprises. Support for streaming\n\neliminates the need for separate systems dedicated to serving real-time data applications.\n\n\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. This reduces staleness,\n\nimproves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n\ndata in both a data lake and a warehouse.\n\n\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n\ngovernance experience across all clouds. You don’t need to invest in reinventing processes for every\n\ncloud platform that you’re using to support your data and AI efforts. Instead, your data teams can simply\n\nfocus on putting all your data to work to discover new insights and create business value.\n\n\n**Figure 9:**\nThe Databricks Lakehouse Platform: data warehousing, data engineering, data streaming, and data science and ML, built on Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and the cloud data lake (all structured and unstructured data). Delta Lake is the open data storage layer that delivers reliability, security and performance on your data lake — for both streaming and batch operations.\n\n\n-----\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools\n\nfor security and access control are basic requirements. Data governance capabilities, including auditing,\n\nretention and lineage, have become essential, particularly in light of recent privacy regulations. Tools that\n\nenable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse,\n\nsuch enterprise features only need to be implemented, tested and administered for a single system.\n\nDatabricks is the only cloud-native vendor\n\n\n**Databricks — innovation driving performance**\n\nAdvanced analytics and machine learning on unstructured and large-scale data are two of the most\n\nstrategic priorities for enterprises today — and the growth of unstructured data is going to increase\n\nexponentially — so it makes sense for CIOs and CDOs to think about positioning their data lake as the\n\ncenter of their data infrastructure. The main challenge is whether or not it can perform reliably and fast\n\nenough to meet the SLAs of the various workloads — especially SQL-based analytics.\n\nDatabricks has focused its engineering efforts on incorporating a wide range of industry-leading software\n\nand hardware improvements in order to implement the first lakehouse solution. Our approach capitalizes\n\non the computing advances of the Apache Spark framework and the latest networking, storage and CPU\n\ntechnologies to provide the performance customers need to simplify their architecture. These innovations\n\ncombine to provide a single architecture that can store and process all the data sets within an organization —\n\nsupporting the range of analytics outlined above.\n\n**BI and SQL workloads**\n\nPerhaps the most significant challenge for the lakehouse architecture is the ability to support SQL queries\n\nfor star/snowflake schemas in support of BI workloads. Part of the reason EDWs have remained a major\n\npart of the data ecosystem is because they provide low-latency, high-concurrency query support. 
In order\n\nto compete with the EDW, optimizations must be found within the lakehouse architecture that provide\n\nsatisfactory query performance for the majority of BI workloads. Fortunately, advances in query plan, query\n\nexecution, statistical analysis of files in the object store, and hardware and software improvements make it\n\npossible to deliver on this promise.\n\n\nto be recognized as a Leader in both\n\n[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n\n**Cloud Database Management Systems** and\n\n**Data Science and Machine Learning Platforms**\n\n\n-----\n\n**A word about the data mesh architecture**\n\nIn 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n\nwhat some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lake\n\nusing a series of extract, transform, load (ETL) processes — which unnecessarily adds complexity. The data\n\nmesh approach avoids centralizing data in one location and encourages the source systems to create\n\n“data products” or “data assets” that are served up directly to consumers for data and AI workloads. The\n\ndesigners advocate for a federated approach to data and AI — while using enterprise policies to govern how\n\nsource systems make data assets available.\n\nThere are several challenges with this approach. First, the data mesh assumes that each source system\n\ncan dynamically scale to meet the demands of the consumers — particularly challenging when data assets\n\nbecome “hot spots” within the ecosystem. Second, centralized policies oftentimes leave the implementation\n\ndetails to the individual teams. This has the potential of inconsistent implementations, which may lead to\n\nperformance degradations and differing cost profiles. Finally, the data mesh approach assumes that each\n\nsource system team has the necessary skills, or can acquire them, to build robust data products.\n\nThe lakehouse architecture is not at odds with the data mesh philosophy — as ingesting higher-quality data\n\nfrom the source systems reduces the curation steps needed inside the data lake itself.\n\n\n-----\n\n#### 5. Improve data governance and compliance\n\nData governance is perhaps the most challenging aspect of data transformation initiatives. Every\n\nstakeholder recognizes the importance of making data readily available, of high quality and relevant to help\n\ndrive business value. Likewise, organizations understand the risks of failing to get it right — the potential for\n\nundetected data breaches, negative impact on the brand and the potential for significant fines in regulated\n\nenvironments. However, organizations shouldn’t perceive data governance or a defensive data strategy as\n\na blocker or deterrent to business value. 
In fact, many organizations have leveraged their strong stance on\n\ndata governance as a competitive differentiator to earn and maintain customer trust, ensure sound data\n\nand privacy practices, and protect their data assets\n\n**Why data governance fails**\n\nWhile most people agree that data governance is a set of principles, practices and tooling that helps\n\nmanage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic\n\napproach — one that balances realistic policies with automation and scalability.\n\nToo often the policies developed around data governance define very strict data management principles —\n\nfor example, the development of an enterprise-wide ontological model that all data must adhere to.\n\nOrganizations can spend months, if not years, trying to define the perfect set of policies. The engineering\n\neffort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the\n\ncomplexity of the requirements. Meanwhile, data continues to flow through the organization without a\n\nconsistent approach to governance, and too much of the effort is done manually and fraught with human error.\n\n\nWhat are the basic building blocks of a sound data\n\ngovernance approach?\n\n\n-----\n\n**A pragmatic approach to data governance**\n\nAt a high level, organizations should enable the following data management capabilities:\n\n**\u0007Identify all sources of data**\n\n\u0007Identify all data-producing and data-storing applications\n\n\u0007Identify the systems of record (SOR) for each data set\n\n\u0007Label data sets as internal or external (third party)\n\n\u0007Identify where sensitive data is stored — GDPR/CCPA scope\n\n\u0007Limit which operational data stores (ODSs) can re-store SOR data\n\n**\u0007Catalog data sets**\n\n\u0007Register all data sets in a centralized data catalog\n\n\u0007Create a lightweight, self-service data registration process\n\n\u0007Limit manual entry as much as possible\n\n\u0007Record the schema, if any, for the data set\n\n\u0007Use an inference engine or tool to extract the data set schema\n\n\u0007Add business and technical metadata to make it meaningful\n\n\u0007Use machine learning to classify data sets\n\n\u0007Use crowdsourcing to validate the machine-based results\n\n**Track data lineage**\n\n\u0007Track data set flow and what systems act on data\n\n\u0007Create an enumerated list of action values for specific operations\n\n\u0007Emit lineage events via streaming layer and aggregate in data lake lineage event schema:\n\n\n\n\u0007Optional: Add a source code repository URL for action traceability\n\n\n-----\n\n**\u0007Perform data quality checks**\n\n\u0007Create a rules library that is centrally managed and versioned\n\n\u0007Update the rules library periodically with new rules\n\n\u0007Use a combination of checks — null/not null, regex, valid values\n\n\u0007Perform schema enforcement checks against data set registration\n\nBy minimizing the number of copies of your data\n\n\n**\u0007Scan for sensitive data**\n\n\u0007Establish a tokenization strategy for sensitive data — GDPR/CCPA\n\n\u0007Tokenize all sensitive data stored in the data lake — avoid cleartext\n\n\u0007Use fixed-length tokens to preserve analytic value\n\n\u0007Determine the approach for token lookup/resolution when needed\n\n\u0007Ensure that any central token stores are secure with rotating keys\n\n\u0007Identify which data elements from GDPR/CCPA to include in scans\n\n\u0007Efficiently scan for 
sensitive data in cleartext using the rules library\n\n**\u0007Establish approved data flow patterns**\n\n\u0007Determine pathways for data flow (source —> target)\n\n\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n\n\u0007Determine read/write patterns for the data lake\n\n\u0007Strictly enforce data flow pathways to/from data lake\n\n\u0007Detect violations and anomalies using lineage event analysis\n\n\u0007Identify offending systems and shut down or grant exception\n\n\u0007Record data flow exceptions and set a remediation deadline\n\n**\u0007Centralize data access controls**\n\n\u0007Establish a common governance model for all data and AI assets\n\n\u0007Centrally define access policies for all data and AI assets\n\n\u0007Enable fine-grained access controls at row and column levels\n\n\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n\n\nand moving to a single data processing layer where\n\nall your data governance controls can run together,\n\nyou improve your chances of staying in compliance\n\nand detecting a data breach.\n\n\n-----\n\n**\u0007Make data discovery easy**\n\n\u0007Establish a data discovery model\n\n\u0007Use manual or automatic data classification\n\n\u0007Provide a visual interface for data discovery across your data estate\n\n\u0007Simplify data discovery with rich keyword- or business glossary-based search\n\n**\u0007Centralize data access auditing**\n\n\u0007Establish a framework or best practices for access auditing\n\n\u0007Capture audit logs for all CRUD operations performed on data\n\n\u0007Make auditing reports easily accessible to data stewards/admins for ensuring compliance\n\nThis is not intended to be an exhaustive list of features and requirements but rather a framework to\n\nevaluate your data governance approach. There will be violations at runtime, so it will be important to have\n\nprocedures in place for how to handle these violations. In some cases, you may want to be very strict and\n\nshut down the data flow of the offending system. In other cases, you may want to quarantine the data until\n\nthe offending system is fixed. Finally, some SLAs may require the data to flow regardless of a violation. In\n\nthese cases, the receiving systems must have their own methodology for dealing with bad data.\n\n\n-----\n\n**Hidden cost of data governance**\n\nThere are numerous examples of high-profile data breaches and failure to comply with consumer data\n\nprotection legislation. You don’t have to look very far to see reports of substantial fines levied against\n\norganizations that were not able to fully protect the data within their data ecosystem. As organizations\n\nproduce and collect more and more data, it’s important to remember that while storage is cheap, failing\n\nto enforce proper data governance is very, very expensive.\n\nIn order to catalog, lineage trace, quality check, and scan your data effectively, you will need a lot of\n\ncompute power when you consider the massive amounts of data that exist in your organization. Each\n\ntime you copy a piece of data to load it into another tool or platform, you need to determine what data\n\ngovernance techniques exist there and how you ensure that you truly know where all your data resides.\n\nImagine the scenario where data flows through your environment and is loaded into multiple platforms\n\nusing various ETL processes. How do you handle the situation when you discover that sensitive data is\n\nin cleartext? 
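The rules library and cleartext scanning called out in the checklist above can start small and grow over time. The following is a minimal sketch of that idea; the rule names, regular expressions and fixed-length tokenization scheme are hypothetical assumptions.

```python
# Minimal sketch of a centrally managed rules library: null checks, regex
# checks and a fixed-length tokenization step for sensitive values.
# Rule names, patterns and the token scheme are hypothetical.
import hashlib
import re

RULES = {
    "ssn_not_cleartext": re.compile(r"\b\d{3}-\d{2}-\d{4}\b"),
    "credit_card_not_cleartext": re.compile(r"\b\d{16}\b"),
}

REQUIRED_FIELDS = ["customer_id", "order_date"]


def tokenize(value: str, length: int = 16) -> str:
    """Replace a sensitive value with a fixed-length token to preserve analytic value."""
    return hashlib.sha256(value.encode()).hexdigest()[:length]


def check_record(record: dict) -> list[str]:
    """Return a list of rule violations for one record."""
    violations = [f"null:{f}" for f in REQUIRED_FIELDS if not record.get(f)]
    for name, pattern in RULES.items():
        for field, value in record.items():
            if isinstance(value, str) and pattern.search(value):
                violations.append(f"{name}:{field}")
    return violations


record = {"customer_id": "c-123", "order_date": "2022-01-01", "note": "card 4111111111111111"}
print(check_record(record))          # ['credit_card_not_cleartext:note']
print(tokenize("4111111111111111"))  # fixed-length token instead of the raw number
```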
Without a consistent set of data governance tools, you may not be able to remediate the\n\nproblem before it’s flagged for violation.\n\nHaving a smaller attack surface and fewer ingress/egress routes helps guard your data and protect your\n\norganization’s brand and balance sheet.\n\nThe bottom line is that the more complex your data ecosystem architecture is, the more difficult and costly\n\nit is to get data governance right.\n\n\n-----\n\n#### 6. Democratize access to quality data\n\nEffective data and AI solutions rely more on the amount of quality data available than on the sophistication\n\nor complexity of the model or algorithm. Google published a paper titled “The Unreasonable Effectiveness of\n\nData” demonstrating this point. The takeaway is that organizations should focus their efforts on making sure\n\ndata scientists have access to the widest selection of relevant and high-quality data to perform their jobs —\n\nwhich is to create new opportunities for revenue growth, cost reduction and risk reduction.\n\n**The 80/20 data science dilemma**\n\nMost existing data environments have their data stored primarily in different operational data stores within a\n\ngiven business unit (BU) — creating several challenges:\n\n\u0007Most business units deploy use cases that are based only on their own data — without taking advantage\n\nof cross-BU opportunities\n\n\u0007The schemas are generally not well understood outside of BU or department — with only the database\n\ndesigners and power users being able to make efficient use of the data. This is referred to as the “tribal\n\nknowledge” phenomenon.\n\n\u0007The approval process and different system-level security models make it difficult and time-consuming\n\nfor data scientists to gain the proper access to the data they need\n\nIn order to perform analysis, users are forced to log in to multiple systems to collect their data. This is most\n\noften done using single-node data science and generates unnecessary copies of data stored on local disk\n\ndrives, various network shares or user-controlled cloud storage. In some cases, the data is copied to “user\n\nspaces” within production platform environments. This has the strong potential of degrading the overall\n\nperformance for true production workloads.\n\nTo make matters worse, these copies of data are generally much smaller than the full-size data sets that would\n\nbe needed in order to get the best model performance for your ML and AI workloads.\n\n\n-----\n\nSmall data sets reduce the effectiveness of exploration, experimentation, model development and model\n\ntraining — resulting in inaccurate models when deployed into production and used with full-size data sets.\n\nAs a result, data science teams are spending 80% of their time wrangling data sets and only 20% of their\n\ntime performing analytic work — work that may need to be redone once they have access to the full-size\n\ndata sets. This is a serious problem for organizations that want to remain competitive and generate game-\n\nchanging results.\n\nAnother factor contributing to reduced productivity is the way in which end users are typically granted\n\naccess to data. 
Security policies usually require both coarse-grained and fine-grained data protections.\n\nIn other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n\ngrained) within the data set.\n\n**Rationalize data access roles**\n\nThe most common approach to providing coarse-grained and fine-grained access is to use what’s known\n\nas role-based access control (RBAC). Individual users log on to system-level accounts or via a single sign-on\n\n(SSO) authentication and access control solution.\n\nUsers can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n\nThere are different strategies for identifying and creating these groups — but typically, they are done on a\n\nsystem-by-system basis, with a 1:1 mapping for each coarse- and fine-grained access control combination.\n\nThis approach to data access usually produces a proliferation of user groups. It is not unusual to see several\n\nthousand discrete security groups for large organizations — despite having a much smaller number of\n\ndefined job functions.\n\nThis approach creates one of the biggest security challenges in large organizations. When personnel leave\n\nthe company, it is fairly straightforward to remove them from the various security groups. However, when\n\npersonnel move around within the organization, their old security group assignments often remain intact\n\nand new ones are assigned based on their new job function. This leads to personnel continuing to have\n\naccess to data that they no longer have a “need to know.”\n\n\nThe Databricks Lakehouse Platform brings together\n\nall the data and AI personas into one environment\n\nand makes it easy to collaborate, share code and\n\ninsights, and operate against the same view of data.\n\n\n-----\n\n**Data classification**\n\nHaving all your data sets stored in a single, well-managed data lake gives you the ability to use partition\n\nstrategies to segment your data based on “need to know.” Some organizations create a partition based\n\non which business unit owns the data and which one owns the data classification. For example, in a\n\nfinancial services company, credit card customers’ data could be stored separately from that of debit card\n\ncustomers, and access to GDPR/CCPA-related fields could be handled using classification labels.\n\nThe simplest approach to data classification is to use three labels:\n\n\u0007 **Public data:** Data that can be freely disclosed to the public. This would include your annual report, press\n\nreleases, etc.\n\n\u0007 **Internal data:** Data that has low security requirements but should not be shared with the public or\n\ncompetitors. This would include strategy briefings and market or customer segmentation research.\n\n\u0007 **Restricted data:** Highly sensitive data regarding customers or internal business operations. Disclosure\n\ncould negatively affect operations and put the organization at financial or legal risk. Restricted data\n\nrequires the highest level of security protection.\n\nSome organizations introduce additional labels, but care should be taken to make sure that everyone clearly\n\nunderstands how to apply them.\n\nThe data classification requirements should be clearly documented and mapped to any legal or regulatory\n\nrequirements. 
For example, CCPA is so sweeping that it includes 11 categories of personal information —\n\nand defines “personal information” as “information that identifies, relates to, describes, is capable of\n\nbeing associated with, or could reasonably be linked, directly or indirectly, with a particular consumer or\n\nhousehold.”\n\n\n-----\n\nJust examining one CCPA category, _Customer Records Information_ , we see that the following information is\n\nto be protected: name, signature, social security number, physical characteristics or description, address,\n\ntelephone number, passport number, driver’s license or state identification card number, insurance policy\n\nnumber, education, employment, employment history, bank account number, credit or debit card number,\n\nother financial information, medical information, and health insurance information.\n\nThere are generally three different approaches in industry to performing data classification:\n\n**1. \u0007Content-based:** Scans or inspects and interprets files to find sensitive information. This is generally\n\ndone using regular expressions and lookup tables to map values to actual entities stored inside the\n\norganization (e.g., customer SSN).\n\n**2. \u0007Context-based:** Evaluates the source of the data (e.g., application, location or creator) to determine\n\nthe sensitivity of the data.\n\n**3. \u0007User-based:** Relies on a manual, end-user selection of each data set or element and requires expert\n\ndomain knowledge to ensure accuracy.\n\nTaking all this into account, an organization could implement a streamlined set of roles for RBAC that\n\nuses a naming convention combining a domain, an entity, a data set ID and a classification, where “domain” might be the\n\nbusiness unit within an organization, “entity” is the noun that the role is valid for, “data set” or “data asset” is\n\nthe ID, and “classification” is one of the three values (public, internal, restricted).\n\nThere is a “deny all default” policy that does not allow access to any data unless there is a corresponding\n\nrole assignment. Wild cards can be used to grant access to eliminate the need to enumerate every\n\ncombination.\n\n\n-----\n\nFor example, a role carrying the restricted classification for a credit card transaction data set gives a user or a system access to all the\n\ndata fields that describe a credit card transaction for a customer, including the 16-digit credit card number,\n\nwhereas the corresponding role with the internal classification would allow the user or system\n\naccess only to nonsensitive data regarding the transaction.\n\nThis gives organizations the chance to rationalize their security groups by using a domain naming\n\nconvention to provide coarse-grained and fine-grained access without the need for creating tons of LDAP\n\ngroups. It also dramatically eases the administration of granting access to data for a given user.\n\n**Everyone working from the same view of data**\n\nThe modern data stack, when combined with a simplified security group approach and a robust data\n\ngovernance methodology, gives organizations an opportunity to rethink how data is accessed — and greatly\n\nimproves time to market for their analytic use cases. All analytic workloads can now operate from a single,\n\nshared view of your data.\n\nCombining this with a sensitive data tokenization strategy can make it straightforward to empower data\n\nscientists to do their job and shift the 80/20 ratio in their favor. 
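As an illustration of the role convention and “deny all default” policy described earlier in this section, the sketch below builds role names from the four components and checks access with wildcard support. The separator and example role strings are hypothetical, since this guide does not prescribe an exact syntax.

```python
# Hypothetical sketch of domain/entity/data-set/classification roles with a
# deny-by-default check and wildcard support. The exact naming syntax is an
# assumption; only the four components come from the convention above.
from fnmatch import fnmatch

CLASSIFICATIONS = ("public", "internal", "restricted")


def role(domain: str, entity: str, dataset: str, classification: str) -> str:
    """Build a role name from the four components of the convention."""
    assert classification in CLASSIFICATIONS
    return f"{domain}.{entity}.{dataset}.{classification}"


def can_access(user_roles: list[str], required_role: str) -> bool:
    """Deny by default: access is granted only if a granted role (or wildcard) matches."""
    return any(fnmatch(required_role, granted) for granted in user_roles)


# A fraud analyst holds the restricted role for a credit card transaction data set;
# a marketing analyst holds only the internal-classification role.
fraud_analyst = [role("cards", "transaction", "cc_tx_001", "restricted")]
marketing_analyst = [role("cards", "transaction", "cc_tx_001", "internal")]
auditor = ["cards.*.*.restricted"]  # wildcard grant across all card data sets

needed = role("cards", "transaction", "cc_tx_001", "restricted")
print(can_access(fraud_analyst, needed))      # True
print(can_access(marketing_analyst, needed))  # False: deny by default
print(can_access(auditor, needed))            # True via wildcard
```

A single wildcard grant, such as the auditor role above, replaces what would otherwise be many discrete LDAP groups.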
It’s now easier to work with full-size data\n\nsets that both obfuscate NPI/PII information and preserve analytic value.\n\nNow, data discovery is easier because data sets have been registered in the catalog with full descriptions\n\nand business metadata — with some organizations going as far as showing realistic sample data for a\n\nparticular data set. If a user does not have access to the underlying data files, having data in one physical\n\nlocation eases the burden of granting access, and then it’s easier to deploy access-control policies and\n\ncollect/analyze audit logs to monitor data usage and to look for bad actors.\n\n\nAdopting the Databricks Lakehouse Platform allows\n\nyou to add data sets into a well-managed data lake\n\nusing low-cost object stores, and makes it easy to\n\npartition data based on domain, entity, data set and\n\nclassification levels to provide fine-grained (row-\n\nlevel and column-level) security.\n\n\n-----\n\n**Data security, validation and curation — in one place**\n\nThe modern data architecture using Databricks Lakehouse makes it easy to take a consistent approach to\n\nprotecting, validating and improving your organization’s data. Data governance policies can be enforced\n\nusing the built-in features of schema validation, expectations and pipelines — the three main steps to data\n\ncuration. Databricks enables moving data through well-defined states: Raw —> Refined —> Curated or, as we\n\nrefer to it at Databricks, Bronze —> Silver —> Gold.\n\nThe raw data is known as “Bronze-level” data and serves as the landing zone for all your important analytic\n\ndata. Bronze data functions as the starting point for a series of curation steps that filter, clean and augment\n\nthe data for use by downstream systems. The first major refinement results in data being stored in “Silver-\n\nlevel” tables within the data lake. These tables carry all the benefits of the Delta Lake product — for example,\n\nACID transactions and time travel. The final step in the process is to produce business-level aggregates, or\n\n“Gold-level” tables, that combine data sets from across the organization. It’s a set of data used to improve\n\ncustomer service across the full line of products, perform GDPR/CCPA reporting or look for opportunities to\n\ncross-sell to increase customer retention. For the first time, organizations can truly optimize data curation\n\nand ETL — eliminating unnecessary copies of data and the duplication of effort that often happens in ETL\n\njobs with legacy data ecosystems. This “solve once, access many times” approach speeds time to market,\n\nimproves the user experience and helps retain talent.\n\n**Extend the impact of your data with secure data sharing**\n\nData sharing is crucial to drive business value in today’s digital economy. More and more organizations\n\nare now looking to securely share trusted data with their partners/suppliers, internal lines of business or\n\ncustomers to drive collaboration, improve internal efficiency and generate new revenue streams with data\n\nmonetization. Additionally, organizations are interested in leveraging external data to drive new product\n\ninnovations and services.\n\nBusiness executives must establish and promote a data sharing culture in their organizations to build\n\ncompetitive advantage.\n\n\n-----\n\n#### 7. 
Dramatically increase productivity of your workforce\n\nNow that you have deployed a modern data stack and have landed all your analytical data in a well-\n\nmanaged data lake with a rationalized approach to access control, the next question is, “What tools should I\n\nprovide to the user community so they can be most effective at using the new data ecosystem?”\n\n**Design thinking: working backward from the user experience**\n\nDesign thinking is a human-centered approach to innovation — focused on understanding customer needs,\n\nrapid prototyping and generating creative ideas — that will transform the way you develop products, services,\n\nprocesses and organizations. Design thinking was introduced as a technique to not only improve but also\n\nbring joy to the way people work. The essence of design thinking is to determine what motivates people to\n\ndo their job, where their current pain points are and what could be improved to make their jobs enjoyable.\n\n**Moving beyond best of breed**\n\nIf you look across a large enterprise, you will find no shortage of database design, ETL, data cleansing, model\n\ntraining and model deployment tools. Many organizations take a “best of breed” approach in providing\n\ntooling for their end users. This typically occurs because leaders genuinely want to empower business\n\nunits, departments and teams to select the tool that best suits their specific needs — so-called federated\n\ntool selection. Data science tooling, in particular, tends not to be procured at the “enterprise” level at first —\n\ngiven the high cost of rolling it out to the entire user population.\n\n\n-----\n\nWhen tool selection becomes localized, there are a few things to consider:\n\n\u0007Tools are generally thought of as discrete components within an ecosystem and, therefore,\n\ninterchangeable with criteria that are established within a specific tool category. The tool with the best\n\noverall score gets selected.\n\n\u0007The selection criteria for a tool usually contains a subjective list of “must-have” features based on\n\npersonal preference or adoption within a department, or because a given tool is better suited to support\n\na current business process\n\n\u0007Discrete tools tend to leapfrog one another and add features based on market demand rather quickly\n\n\u0007Evaluations that are performed over many months likely become outdated by the time the tool has\n\nmoved into production\n\n\u0007The “enterprise” requirements are often limited to ensuring that the tool fits into the overall architecture\n\nand security environment but nothing more\n\n\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n\nof tools in play or streamlining the user experience\n\n\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n\npartnership model, the ability to influence the roadmap and professional services support\n\nFor these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\n\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\n\n\n-----\n\nDatabricks is a leading data and AI company —\n\n\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\n\ndata processing, validation and curation should work. 
It’s the integration between the discrete functions\n\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\n\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\n\nconsequences of not doing the integration properly can be serious — in terms of security, compliance,\n\nefficiency, cost, etc.\n\n\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\n\n\nSo, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\n\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\n\ntake from both parties — sometimes calling for an organization to adjust their processes to better fit how\n\nthe platform works. There are many instances where a given business process could be simplified or recast\n\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\n\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\n\napply to the broadest set of customers.\n\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\n\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\n\nperforming their job. The more discrete tools in an environment, the more challenging this becomes.\n\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\n\nand collaboration helps improve the user experience and decreases time to market.\n\n\n[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\n\nlistening to the needs of thousands of customers\n\nand having our engineers work side by side with\n\ncustomer teams to deliver real business value using\n\ndata and AI.\n\n\n-----\n\n**Unified platform, unified personas**\n\nDeploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\n\ndata stack — will provide an integrated suite of tools for the full range of personas in your organization,\n\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\n\nincrease productivity and reduce risk because you’ll be better able to share the key aspects of data\n\npipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n\nand deployment. All the work streams function off a single view of the data, and the handoffs between\n\nsubsystems are well managed.\n\nData processing happens in one auditable environment, and the number of copies of data is kept to an\n\nabsolute minimum — with each user benefiting from the data assets created by others. Redundant work\n\nis eliminated.\n\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n\nworking with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\n\ndifferently — for example, changing a string to an integer. 
This has a cascading effect, and the downstream\n\nconsumers must be able to adjust by monitoring the execution and detecting the changes. The data\n\nscientist, in turn, must update and test new models on the new data. Your data platform should make the\n\ndetection and remediation easier, not harder.\n\nFor the data engineers, their primary focus is extracting data from source systems and moving it into the\n\nnew data ecosystem. The data pipeline function can be simplified with a unified data platform because\n\nthe programming model and APIs are consistent across programming languages (e.g., Scala, Python). This\n\nresults in improved operations and maintenance (O&M). The runtime environment is easier to troubleshoot\n\nand debug since the compute layer is consistent, and the logging and auditing associated with the data\n\nprocessing and data management is centralized and of more value.\n\n\n-----\n\n**Maximize the productivity of your workforce**\n\nOnce you have a data platform that brings together your full range of personas, you should focus on the\n\nnext step for increasing productivity — namely, self-service environments.\n\nIn large organizations, there needs to be a strategy for how solutions are promoted up through the runtime\n\nenvironments for development, testing and production. These environments need to be nearly identical to\n\none another — using the same version of software while limiting the number, size and horsepower of the\n\ncompute nodes. To the extent possible, development and test should be performed with realistic test/\n\nsynthetic data. One strategy to support this is to tap into the flow of production data and siphon off a small\n\npercentage that is then changed in randomized fashion — obfuscating the real data but keeping the same\n\ngeneral shape and range of values.\n\nThe **DEV** environment should be accessible to everyone without any organizational red tape. The DEV\n\nenvironments should be small and controlled with policies that spin them up and tear them down efficiently.\n\nEvery aspect of the DEV infrastructure should be treated as ephemeral. Nothing should exist in the\n\nenvironment that cannot be destroyed and easily rebuilt.\n\nThe **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n\ntools — within obvious cost/budget constraints. The use of the TEST environment can be requested by\n\nthe developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n\nmanagement.\n\nMoving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n\ndevelopers cannot randomly promote software to run in production. Again, this process should be\n\nstrictly governed using a workflow/sign-off approval approach — signed off by management as well.\n\nMany organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n\ndeployments.\n\n\n**DEV** **TEST**\n\n**PROD**\n\n\n-----\n\n#### 8. Make informed build vs. buy decisions\n\nA key piece of the strategy will involve the decision around which components of the data ecosystem are\n\nbuilt by the in-house engineering team and which components are purchased through a vendor relationship.\n\nThere is increased emphasis within engineering teams on taking a “builder” approach. 
In other words, the\n\nengineering teams prefer to develop their own solutions in-house rather than rely on vendor products.\n\n**Competitive advantage**\n\nThis “roll your own’’ approach has some advantages — including being able to establish the overall product\n\nvision, prioritize features and directly allocate the resources to build the software. However, it is important to\n\nkeep in mind which aspects of your development effort give you the most competitive advantage.\n\nSpend some time working with the data transformation steering committee and other stakeholders to\n\ndebate the pros and cons of building out various pieces of the data ecosystem. The primary factor should\n\ncome down to whether or not a given solution offers true competitive advantage for the organization. Does\n\nbuilding this piece of software make it harder for your competitors to compete with you? If the answer is no,\n\nthen it is better to focus your engineering and data science resources on deriving insights from your data.\n\n**Beware: becoming your own software vendor**\n\nAs many engineering leaders know, building your own software is an exciting challenge. However, it does\n\ncome with added responsibility — namely, managing the overall project timeline and costs, and being\n\nresponsible for the design, implementation, testing, documentation, training, and ongoing maintenance and\n\nupdates. You basically are becoming your own software vendor for every component of the ecosystem\n\nthat you build yourself. When you consider the cost of a standard-sized team, it is not uncommon to spend\n\nseveral million dollars per year building out individual component parts of the new data system. This doesn’t\n\ninclude the cost to operate and maintain the software once it is in production.\n\n\n-----\n\nTo offset the anticipated development costs, engineering teams will oftentimes make the argument that\n\nthey are starting with open source software and extending it to meet the “unique requirements” of your\n\norganization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n\nunique and b) the development offers the competitive advantage that you need.\n\nEven software built on top of open source still requires significant investment in integration and testing.\n\nThe integration work is particularly challenging because of the large number of open source libraries that\n\nare required in the data science space. The question becomes, “Is this really the area that you want your\n\nengineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n\n**How long will it take? Can the organization afford to wait?**\n\nEven if you decide the software component provides a competitive advantage and is something worth\n\nbuilding in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\n\ntake longer and cost more money than initially planned.\n\nThe organization should understand the impact to the overall performance and capabilities of the daily\n\necosystem for any features tied to the in-house development effort. Your business partners likely do\n\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n\nis reliable and is delivered on time. 
Carefully weigh the trade-offs among competitive advantage, cost,\n\nfeatures and schedule.\n\n\nDatabricks is built on top of popular open source\n\nsoftware that it created. Engineering teams can\n\nimprove the underpinnings of the Databricks\n\nplatform by submitting code via pull request and\n\nbecoming committers to the projects. The benefit\n\nto organizations is that their engineers contribute\n\nto the feature set of the data platform while\n\nDatabricks remains responsible for all integration\n\nand performance testing plus all the runtime\n\nsupport, including failover and disaster recovery.\n\n\n-----\n\n**Don’t forget about the data**\n\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\n\n“data assets” consumable to the end users or systems. Data insights, model training and model execution\n\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n\nsets from multiple lines of business or departments. Focusing your data engineering and data science\n\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n\ncreating true competitive advantage.\n\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n\nserve up data for analysis should not be underestimated. The value of this work is equally important to\n\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\n\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n\nengineers innovate on components that don’t bring true competitive advantage.\n\n\n-----\n\n#### 9. Allocate, monitor and optimize costs\n\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n\nand increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\n\ncould be easily shared and reused by other members of the team. The more the team used the unified\n\nplatform, the more they collaborated and their level of expertise increased.\n\n**Reduce complexity, reduce costs**\n\nThe architectures of enterprise data warehouses (EDWs) and data lakes were either more limited or\n\nmore complex — resulting in increased time to market and increased costs. This was mainly due to the\n\nrequirement to perform ETL to explore data in the EDW or the need to split data using multiple pipelines\n\nfor the data lake. The data lakehouse architecture simplifies the cost allocation because all the processing,\n\nserving and analytics are performed in a single compute layer.\n\nOrganizations can rightsize the data environments and control costs using policies. 
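One concrete way to apply such policies on Databricks is through cluster policies that cap the size, lifetime and instance types of the compute each environment can create. The sketch below is a minimal illustration of that idea; the attribute names follow the general shape of Databricks cluster policy definitions, but the node types, numeric limits and tags shown here are assumptions chosen for the example, not a recommended configuration.

```python
import json

# Minimal sketch of a cost-control policy for a DEV environment.
# The node types, numeric limits and tag values are illustrative assumptions.
dev_cluster_policy = {
    "autotermination_minutes": {"type": "range", "maxValue": 30},        # shut down idle clusters quickly
    "autoscale.max_workers":   {"type": "range", "maxValue": 4},         # keep DEV clusters small
    "node_type_id":            {"type": "allowlist",
                                "values": ["i3.xlarge", "i3.2xlarge"]},  # restrict to modest instance types
    "custom_tags.environment": {"type": "fixed", "value": "DEV"},        # tag workloads for cost allocation
}

print(json.dumps(dev_cluster_policy, indent=2))
```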
The centralized\n\nand consistent approach to security, auditing and monitoring makes it easier to spot inefficiencies and\n\nbottlenecks in the data ecosystem. Performance improvements can be gained quickly as more platform\n\nexpertise is developed within the workforce.\n\n\nThe Databricks platform optimizes costs for your\n\ndata and AI workloads by intelligently provisioning\n\ninfrastructure only as you need it. Customers can\n\nestablish policies that govern the size of clusters\n\nbased on DEV, TEST, PROD environments or\n\nanticipated workloads.\n\n\n-----\n\nDatabricks monitors and records usage and allows\n\norganizations to easily track costs on a data and\n\n\n**Centralized funding model**\n\nAs previously mentioned, data transformation initiatives require substantial funding. Centralizing the budget\n\nunder the CDO provides consistency and visibility into how funds are allocated and spent — increasing\n\nthe likelihood of a positive ROI. Funding at the beginning of the initiative will be significantly higher than\n\nthe funding in the out-years. It’s not uncommon to see 3- to 5-year project plans for larger organizations.\n\nFunding for years 1 and 2 is often reduced in years 3 and 4 and further reduced in year 5 — until it reaches a\n\nsteady state that is more sustainable.\n\n\nAI workload basis. This provides the ability to\n\n\nThe budget takes into account the cost of the data engineering function, commercial software licenses and\n\nbuilding out the center of excellence to accelerate the data science capabilities of the organization. Again,\n\nthe CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are\n\nfocused on the overall implementation plan and to make sound build vs. buy decisions.\n\nIt’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources\n\nin the CIO’s organization to perform the data engineering tasks. The data science community reports into\n\nthe CDO and is matrixed into the lines of business in order to better understand the business drivers and\n\nthe data sets. Finally, investing in data governance cannot wait until the company has suffered from a major\n\nregulatory challenge, a data breach or some other serious defense-related problem. CDOs should spend\n\nthe necessary time to educate leaders throughout the organization on the value of data governance.\n\n\nimplement an enterprise-wide chargeback mode\n\nand put in place appropriate spending limits.\n\n\n-----\n\n**Chargeback models**\n\nTo establish the centralized budget to fund the data transformation initiative, some organizations impose\n\na “tax” on each part of the organization — based on size as well as profit and loss. This base-level funding\n\nshould be used to build the data engineering and data science teams needed to deploy the building blocks\n\nof the new data ecosystem. However, as different teams, departments and business units begin using the\n\nnew data ecosystem, the infrastructure costs, both compute and storage, will begin to grow. The costs will\n\nnot be evenly distributed, due to different levels of usage from the various parts of the organization. The\n\ngroups with the heavier usage should obviously cover their pro rata share of the costs. This requires the\n\nability to monitor and track usage — not only based on compute but also on the amount of data generated\n\nand consumed. 
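Once usage is captured per team, the allocation arithmetic is straightforward. The toy sketch below splits a shared platform bill pro rata across business units using a blend of compute hours and data volume; every team name, figure and weighting here is invented purely for illustration.

```python
# Toy sketch: allocate a shared platform bill pro rata by usage.
# All team names, usage figures and the 70/30 weighting are illustrative assumptions.
monthly_platform_bill = 100_000.00  # total cost to recover above base-level funding

usage = {
    # team: (compute_hours, terabytes_processed)
    "marketing_analytics": (1_200, 35.0),
    "risk_modeling":       (3_400, 120.0),
    "supply_chain":        (600, 10.0),
}

def usage_score(compute_hours: float, terabytes: float) -> float:
    # Blend compute and data volume into a single score.
    return 0.7 * compute_hours + 0.3 * (terabytes * 100)

scores = {team: usage_score(hours, tb) for team, (hours, tb) in usage.items()}
total_score = sum(scores.values())

for team, score in scores.items():
    charge = monthly_platform_bill * score / total_score
    print(f"{team}: ${charge:,.2f}")
```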
This so-called chargeback model is an effective and fair way to cover the cost deltas over\n\nand above the base-level funding.\n\nPlus, not all the departments or lines of business will require the same level of compute power or fault\n\ntolerance. The architecture should support the ability to separate out the runtime portions of the data\n\necosystem and isolate the workloads based on the specific SLAs for the use cases in each environment.\n\nSome workloads cannot fail and their SLAs will require full redundancy, thus increasing the number of\n\nnodes in the cluster or even requiring multiple clusters operating in different cloud regions. In contrast, less\n\ncritical workloads that can fail and be restarted can run on less costly infrastructure. This makes it easier to\n\nbetter manage the ecosystem by avoiding a one-size-fits-all approach and allocating costs to where the\n\nperformance is needed most.\n\n\n-----\n\n#### 10. Move to production and scale adoption\n\nNow that you’ve completed the hard work outlined in the first nine steps, it is time to put the new data\n\necosystem to use. In order to get truly game-changing results, organizations must be really disciplined at\n\nmanaging and using data to enable use cases that drive business value. They must also establish a clear\n\nset of metrics to measure adoption and track the net promoter score (NPS) so that the user experience\n\ncontinues to improve over time.\n\n**If you build it, they will come**\n\nKeep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data\n\nset registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless.\n\nA high level of automation for the registration process is important because it’s not uncommon to see\n\nthousands of data sets in large organizations. The business and technical metadata plus the data quality\n\nrules will help guarantee that the data lake is filled with consumable data. The lineage solution should\n\nprovide a visualization that shows the data movement and verifies that the approved data flow paths are\n\nbeing followed.\n\nSome key metrics to keep an eye on are:\n\n\u0007Percentage of source systems contributing data to the ecosystem\n\n\u0007Percentage of real-time streaming relative to API and batch transfers\n\n\u0007Percentage of registered data sets with full business and technical metadata\n\n\u0007Volume of data written to the data lake\n\n\u0007Percentage of raw data that enters a data curation pipeline\n\n\u0007Volume of data consumed from the data lake\n\n\u0007Number of tables defined and populated with curated data\n\n\u0007Number of models trained with data from the data lake\n\n\u0007Lineage reports and anomaly detection incidents\n\n\u0007Number of users running Python, SQL, Scala and R workloads\n\n\nIn 2018, Databricks released MLflow — an open\n\nsource platform to manage the ML lifecycle,\n\nincluding experimentation, reproducibility,\n\ndeployment and a central model registry. MLflow\n\nis included in the Databricks Lakehouse Platform\n\nand accelerates the adoption of machine learning\n\nand AI in organizations.\n\n\n-----\n\n**Communication plan**\n\nCommunication is critical throughout the data transformation initiative — however, it is particularly\n\nimportant once you move into production. 
Time is precious and you want to avoid rework, if at all possible.\n\nOrganizations often overlook the emotional and cultural toll that a long transformation process takes on\n\nthe workforce. The seam between the legacy environment and the new data ecosystem is an expensive\n\nand exhausting place to be — because your business partners are busy supporting two data worlds. Most\n\nusers just want to know when the new environment will be ready. They don’t want to work with partially\n\ncompleted features, especially while performing double duty.\n\nEstablish a solid communication plan and set expectations for when features will come online. Make sure\n\nthere is detailed documentation, training and a support/help desk to field users’ questions.\n\n**DevOps — software development + IT operations**\n\nMature organizations develop a series of processes and standards for how software and data are developed,\n\nmanaged and delivered. The term “DevOps” comes from the software engineering world and refers to\n\ndeveloping and operating large-scale software systems. DevOps defines how an organization, its developers,\n\noperations staff and other stakeholders establish the goal of delivering quality software reliably and\n\nrepeatedly. In short, DevOps is a culture that consists of two practices: continuous integration (CI) and\n\ncontinuous delivery (CD).\n\nThe CI portion of the process is the practice of frequently integrating newly written or changed code\n\nwith the existing code repository. As software is written, it is continuously saved back to the source code\n\nrepository, merged with other changes, built, integrated and tested — and this should occur frequently\n\nenough that the window between commit and build is narrow enough that no errors can occur without\n\ndevelopers noticing them and correcting them immediately.\n\nThis is particularly important for large, distributed teams to ensure that the software is always in a working\n\nstate — despite the frequent changes from various developers. Only software that passes the CI steps is\n\ndeployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n\ndependable releases.\n\n\nSoftware development IT operations\n\n\n-----\n\n**DataOps — data processing + IT operations**\n\nDataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n\nuse the well-established processes from DevOps to consistently and reliably improve the quality of data\n\nused to power data and AI use cases. DataOps automates and streamlines the lifecycle management tasks\n\nneeded for large volumes of data — basically, ensuring that the volume, velocity, variety and veracity of the\n\ndata are taken into account as data flows through the environment. DataOps aims to reduce the end-to-\n\nend cycle time of data analytics — from idea, to exploration, to visualizations and to the creation of new\n\ndata sets, data assets and models that create value.\n\nFor DataOps to be effective, it must encourage collaboration, innovation and reuse among the stakeholders,\n\nand the data tooling should be designed to support the workflow and make all aspects of data curation and\n\nETL more efficient.\n\n**MLOps — machine learning + IT operations**\n\nNot surprisingly, the term “MLOps” takes the DevOps approach and applies it to the machine learning and\n\ndeep learning space — automating or streamlining the core workflow for data scientists. 
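As a small, concrete illustration of that streamlining, the sketch below logs the parameters, metric and model artifact of one training iteration with MLflow (mentioned earlier in this chapter), so every experiment run stays reproducible and comparable. The data set, model choice and metric are assumptions made for the example.

```python
# Minimal sketch of experiment tracking with MLflow.
# The data set, model and metric below are illustrative assumptions.
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run(run_name="rf_baseline"):
    params = {"n_estimators": 200, "max_depth": 6}
    model = RandomForestRegressor(**params, random_state=42).fit(X_train, y_train)

    mlflow.log_params(params)                      # what was tried
    mae = mean_absolute_error(y_test, model.predict(X_test))
    mlflow.log_metric("mae", mae)                  # how well it did
    mlflow.sklearn.log_model(model, "model")       # the artifact itself
```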
MLOps is a bit unique when compared with DevOps and DataOps because the approach to deploying effective machine learning models is far more iterative and requires much more experimentation — data scientists try different features, parameters and models in a tight iteration cycle. In all these iterations, they must manage the code base, understand the data used to perform the training and create reproducible results. The logging aspect of the ML development lifecycle is critical.\n\nMLOps aims to manage deployment of machine learning and deep learning models in large-scale production environments while also focusing on business and regulatory requirements. The ideal MLOps environment would include data science tools where models are constructed and analytical engines where computations are performed.\n\n-----\n\nThe overall workflow for deploying production ML models is shown in Figure 10.\n\nUnlike most software applications that execute a series of discrete operations, ML platforms are not deterministic and are highly dependent on the statistical profile of the data they use. ML platforms can suffer performance degradation of the system due to changing data profiles. Therefore, the model has to be refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform should natively support this style of iterative data science.\n\n**Ethics in AI**\n\nAs more organizations deploy data and AI solutions, there is growing concern around a number of issues related to ethics — in particular, how do you ensure the data and algorithms used to make decisions are fair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations must ensure that the “black box” algorithms that produce results have the transparency, interpretability and explainability to satisfy legal and regulatory safeguards.\n\nThe vast majority of AI work still involves software development by human beings and the use of curated data sets. There is the obvious potential for bias and the application of AI in domains that are ethically questionable. CDOs are faced with the added challenge of needing to be able to defend the use of AI, explain how it works and describe the impact of its existence on the target audience — whether internal workers or customers.\n\n**Figure 10:**\nWorkflow for deploying production ML models: data extraction, data analysis, data preparation, model training, model evaluation, model serving and execution, and model monitoring\n\n-----\n\n**Data and AI Maturity Model**\n\nWhen data and AI become part of the fabric of the company and the stakeholders in the organization adopt a data asset and AI mindset, the company moves further along a well-defined maturity curve, as shown in Figure 11.\n\n**Top-Line Categories and Ranking Criteria**\n\n**LOW MATURITY/VALUE** **HIGH MATURITY/VALUE**\n\n1. Explore 2. Experiment 3. Formalize 4. Optimize 5.
Transform\n\n\nOrganization is beginning\nto explore big data and\nAI, and understand the\npossibilities and potential\nof a few starter projects\nand experiment\n\n**Figure 11:**\nThe Data and AI Maturity Model\n\n\nOrganization builds\nthe basic capabilities\nand foundations to\nbegin exploring a more\nexpansive data and AI\nstrategy, but it lacks vision,\nlong-term objectives or\nleadership buy-in\n\n\nData and AI are budding\ninto drivers of value for\nBUs aligned to specific\nprojects and initiatives as\nthe core tenets of data\nand AI are integrated into\ncorporate strategy\n\n\nData and AI are core\ndrivers of value across the\norganization, structured\nand central to corporate\nstrategy, with a scalable\narchitecture that meets\nbusiness needs and buy-in\nfrom across the organization\n\n\nData and AI are at the\nheart of the corporate\nstrategy and are\ninvaluable differentiators\nand drivers of competitive\nadvantage\n\n\nDatabricks partners with its customers to enable them to do an internal self-assessment. The output of the\n\nself-assessment allows organizations to:\n\n\u0007Understand the current state of their journey to data and AI maturity\n\n\u0007Identify key gaps in realizing (more) value from data and AI\n\n\u0007Plot a path to increase maturity with specific actions\n\n\u0007Identify Databricks resources who can help support their journey\n\n\n-----\n\n**CHAPTER 3:**\n## Conclusion\n\n\nAfter a decade in which most enterprises took a hybrid approach to their data architecture — and struggled\n\nwith the complexity, cost and compromise that come with supporting both data warehouses and data lakes\n\n— the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical\n\nto future-proofing your investment and enabling data and AI at scale. The simple, open and multicloud\n\narchitecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to\n\nunleash the power of your data teams to collaborate like never before — in real time, with all their data, for\n\nevery use case.\n\nFor more information, please visit [Databricks](https://databricks.com/solutions/roles/data-leaders) or [contact us](https://databricks.com/company/contact) .\n\n**A B O U T T H E A U T H O R**\n\nChris D’Agostino is the Global Field CTO at Databricks, having joined the company in January 2020. His role\n\nis to provide thought leadership and serve as a trusted advisor to our top customers, globally.\n\nPrior to Databricks, Chris ran a 1,000-person data engineering function for a top 10 U.S. bank. In that role,\n\nhe led a team that was responsible for building out a modern data architecture that emphasized the key\n\nattributes of the lakehouse architecture.\n\nChris has also held leadership roles at a number of technology companies.\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. 
To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "### eBook\n\n# A New Approach to Data Sharing\n\n#### Open data sharing and collaboration for data, analytics, and AI\n\n### Second Edition\n\n\n-----\n\n## Contents Introduction — Data Sharing in Today’s Digital Economy 4\n\n**Chapter 1: What Is Data Sharing and Why Is It Important?** **5**\n\nCommon data sharing use cases 6\n\nData monetization 6\n\nData sharing with partners or suppliers (B2B) 6\n\nInternal lines of business (LOBs) sharing 6\n\nKey benefits of data sharing 7\n\n**Chapter 2: Conventional Methods of Data Sharing and Their Challenges** **8**\n\nLegacy and homegrown solutions 9\n\nProprietary vendor solutions 11\n\nCloud object storage 13\n\n**Chapter 3: Delta Sharing — An Open Standard for Secure Sharing of Data Assets** **14**\n\nWhat is Delta Sharing? 14\n\nKey benefits of Delta Sharing 16\n\nMaximizing value of data with Delta Sharing 18\n\nData monetization with Delta Sharing 19\n\nB2B sharing with Delta Sharing 21\n\nInternal data sharing with Delta Sharing 23\n\n**Chapter 4: How Delta Sharing Works** **26**\n\n\n-----\n\n**Chapter 5: Introducing Databricks Marketplace** **28**\n## Contents\n\nWhat is Databricks Marketplace? 30\n\nKey benefits of Databricks Marketplace 30\n\nEnable collaboration and accelerate innovation 32\n\nPowered by a fast, growing ecosystem 32\n\nUse cases for an open marketplace 32\n\nNew upcoming feature: AI model sharing 33\n\n**Chapter 6: Share securely with Databricks Clean Rooms** **34**\n\nWhat is a data clean room? 34\n\nCommon data clean room use cases 36\n\nShortcomings of existing data clean rooms 38\n\nKey benefits of Databricks Clean Rooms 39\n\n**Resources: Getting started with Data Sharing and Collaboration** **40**\n\n**About the Authors** **42**\n\n\n-----\n\n## Introduction\n Data Sharing in Today’s Digital Economy\n\n\nToday’s economy revolves around data. Everyday, more and more\n\norganizations must exchange data with their customers, suppliers\n\nand partners. Security is critical. And yet, efficiency and immediate\n\naccessibility are equally important.\n\nWhere data sharing may have been considered optional, it’s now\n\nrequired. More organizations are investing in streamlining internal\n\nand external data sharing across the value chain. But they still face\n\nmajor roadblocks — from human inhibition to legacy solutions to\n\nvendor lock-in.\n\nTo be truly data-driven, organizations need a better way to share\n\ndata. 
[Gartner predicts that by 2024](https://www.gartner.com/en/documents/3999501) , organizations that promote\n\ndata sharing will outperform their peers on most business value\n\n\nwho have successfully executed data sharing initiatives are 1.7x\n\nmore effective in showing business value and return on investment\n\nfrom their data analytics strategy.\n\nTo compete in the digital economy, organizations need an open —\n\nand secure — approach to data sharing.\n\nThis eBook takes a deep dive into the modern era of data sharing\n\nand collaboration, from common use cases and key benefits to\n\nconventional approaches and the challenges of those methods.\n\nYou’ll get an overview of our open approach to data sharing and find\n\nout how Databricks allows you to share your data across platforms,\n\nto share all your data and AI, and to share all your data securely with\n\nunified governance in a privacy-safe way.\n\n\nmetrics. In addition, Gartner recently found that Chief Data Officers\n\n\n-----\n\n## Chapter 1\n What Is Data Sharing and Why Is It Important?\n\nData sharing is the ability to make the same data available to one or many stakeholders — both external\n\nand internal. Nowadays, the ever-growing amount of data has become a strategic asset for any company.\n\nData sharing — within your organization or externally — is an enabling technology for data commercialization\n\nand enhanced analysis. Sharing data as well as consuming data from external sources allows companies\n\nto collaborate with partners, establish new partnerships and generate new revenue streams with data\n\nmonetization. Data sharing can deliver benefits to business groups across the enterprise. For those business\n\ngroups, data sharing can enable access to data needed to make critical decisions. This includes but is not\n\nlimited to roles such as the data analyst, data scientist and data engineer.\n\n\n-----\n\n#### Common data sharing use cases\n\n\n#### Data\n monetization\n\nCompanies across industries are commercializing\n\ndata. Large multinational organizations have\n\nformed exclusively to monetize data, while other\n\norganizations are looking for ways to monetize\n\ntheir data and generate additional revenue\n\nstreams. Examples of these companies can\n\nrange from an agency with an identity graph to a\n\ntelecommunication company with proprietary 5G\n\ndata or to retailers that have a unique ability to\n\ncombine online and offline data. Data vendors are\n\ngrowing in importance as companies realize they\n\nneed external data for better decision-making.\n\n\n#### Data sharing with partners\n or suppliers (B2B)\n\nMany companies now strive to share data with\n\npartners and suppliers as similarly as they share\n\nit across their own organizations. For example,\n\nretailers and their suppliers continue to work more\n\nclosely together as they seek to keep their products\n\nmoving in an era of ever-changing consumer tastes.\n\nRetailers can keep suppliers posted by sharing sales\n\ndata by SKU in real time, while suppliers can share\n\nreal-time inventory data with retailers so they know\n\nwhat to expect. Scientific research organizations\n\ncan make their data available to pharmaceutical\n\ncompanies engaged in drug discovery. 
Public safety\n\nagencies can provide real-time public data feeds\n\nof environmental data, such as climate change\n\nstatistics or updates on potential volcanic eruptions.\n\n\n#### Internal lines of business\n (LOBs) sharing\n\nWithin any company, different departments, lines\n\nof business and subsidiaries seek to share data so\n\neveryone can make decisions based on a complete\n\nview of the current business reality. For example,\n\nfinance and HR departments need to share data\n\nas they analyze the true costs of each employee.\n\nMarketing and sales teams need a common view\n\nof data to determine the effectiveness of recent\n\nmarketing campaigns. And different subsidiaries\n\nof the same company need a unified view of the\n\nhealth of the business. Removing data silos — which\n\nare often established for the important purpose of\n\npreventing unauthorized access to data — is critical\n\nfor digital transformation initiatives and maximizing\n\nthe business value of data.\n\n\n-----\n\n#### Key benefits of data sharing\n\nAs you can see from the use cases described above, there are many benefits of data sharing, including:\n\n\n**Greater collaboration with existing partners.** In today’s hyper-\n\nconnected digital economy, no single organization can advance its\n\nbusiness objectives without partnerships. Data sharing helps solidify\n\nexisting partnerships and can help organizations establish new ones.\n\n\u0007 **Ability to generate new revenue streams.** With data sharing,\n\norganizations can generate new revenue streams by offering data\n\nproducts or data services to their end consumers.\n\n\n**Ease of producing new products, services or business models.**\n\nProduct teams can leverage both first-party data and third-party\n\ndata to refine their products and services and expand their product/\n\nservice catalog.\n\n**Greater efficiency of internal operations.** Teams across the\n\norganization can meet their business goals far more quickly when\n\nthey don’t have to spend time figuring out how to free data from\n\nsilos. When teams have access to live data, there’s no lag time\n\nbetween the need for data and the connection with the appropriate\n\ndata source.\n\n\n-----\n\n## Chapter 2\n Conventional Methods of Data Sharing and Their Challenges\n\nSharing data across different platforms, companies and clouds is no easy task. In the past,\n\norganizations have hesitated to share data more freely because of the perceived lack\n\nof secure technology, competitive concerns and the cost of implementing data sharing\n\nsolutions.\n\nEven for companies that have the budget to implement data sharing technology, many of\n\nthe current approaches can’t keep up with today’s requirements for open-format, multi-\n\ncloud, high-performance solutions. 
Most data sharing solutions are tied to a single vendor,\n\nwhich creates friction for data providers and data consumers who use non-compatible\n\nplatforms.\n\nOver the past 30 years, data sharing solutions have come in three forms: legacy and\n\nhomegrown solutions, cloud object storage and closed source commercial solutions.\n\nEach of these approaches comes with its pros and cons.\n\n\n-----\n\n#### Legacy and homegrown solutions\n\nMany companies have built homegrown data sharing solutions based on legacy\n\ntechnologies such as email, (S)FTP or APIs.\n\n\nProvider\n\nETL\n\n\nConsumer\n\n\nBatch data\nfrom provider\n\n\nTable �\n\nTable 2\n\n\nFTP/SSH/API\nServer\n\n\nFTP/SSH/API ETL Database Analyst Run Analysis\nServer\n\n\n**Figure 1:**\nLegacy data\nsharing solutions\n\n\n**Pros**\n\n\u0007 **Vendor agnostic.** FTP, email and APIs are all well-documented protocols. Data\n\nconsumers can leverage a suite of clients to access data provided to them.\n\n\u0007 **Flexibility.** Many homegrown solutions are built on open source technologies\n\nand will work both on-prem and on clouds.\n\n\n-----\n\n**Cons**\n\n\u0007 **Data movement.** It takes significant effort to extract data from cloud storage, transform\n\nit and host it on an FTP server for different recipients. Additionally, this approach\n\nresults in creating copies of data sets. Data copying causes duplication and prevents\n\norganizations from instantly accessing live data.\n\n\u0007 **Complexity of sharing data.** Homegrown solutions are typically built on complex\n\narchitectures due to replication and provisioning. This can add considerable time to\n\ndata sharing activities and result in out-of-date data for end consumers.\n\n\u0007 **Operational overhead for data recipients.** Data recipients have to extract, transform\n\nand load (ETL) the shared data for their end use cases, which further delays the time to\n\ninsights. For any new data updates from the providers, the consumers have to rerun ETL\n\npipelines again and again.\n\n\u0007 **Security and governance.** As modern data requirements become more stringent,\n\nhomegrown and legacy technologies have become more difficult to secure and govern.\n\n\u0007 **Scalability.** Such solutions are costly to manage and maintain and don’t scale to\n\naccommodate large data sets.\n\n\n-----\n\n#### Proprietary vendor solutions\n\nCommercial data sharing solutions are a popular option among companies that don’t want\n\nto devote the time and resources to building an in-house solution yet also want more\n\ncontrol than what cloud object storage can offer.\n\n\nVendor 1 Platform\n\nProprietary\ndata format\n\n\nVendor V Platform\n\nProprietary\ndata format\n\n\nData Provider 1\n\nData;\nProvider\n\n\nData Provider 1\n\n\nData;\nConsumer\n\nShared data set\n\n\nData;\nProvider\n\nShared dataset\n\n\nData;\nConsumer\n\n\nNo cross-platform\nsharing\n\n\n**Figure 2:**\nProprietary\nvendor solutions\n\n\nShared dataset\n\nShared data set\n\n\nShared data set\n\n\nShared data set\n\n\nSharing limited to recipients\non the same platform\n\nData;\nConsumer\n\n\nData;\nConsumere\n\n\n**Pros**\n\n\u0007 **Simplicity.** Commercial solutions allow users to share data easily with anyone else who uses\n\nthe same platform.\n\n\n-----\n\n**Cons**\n\n\u0007 **Vendor lock-in.** Commercial solutions don’t interop with other platforms well. While\n\ndata sharing is easy among fellow customers, it’s usually impossible with those who\n\nuse competing solutions. 
This reduces the reach of data, resulting in vendor lock-in.\n\nFurthermore, platform differences between data providers and recipients introduce\n\ndata sharing complexities.\n\n\u0007 **Data movement.** Data must be loaded onto the platform, requiring additional ETL and\n\ndata copies.\n\n\u0007 **Scalability.** Commercial data sharing comes with scaling limits from the vendors.\n\n\u0007 **Cost.** All the above challenges create additional cost for sharing data with potential\n\nconsumers, as data providers have to replicate data for different recipients on different\n\ncloud platforms.\n\n\n-----\n\n#### Cloud object storage\n\n\n**Cons**\n\n\u0007 **Limited to a single cloud provider.** Recipients have to be on the\n\nsame cloud to access the objects.\n\n\u0007 **Cumbersome security and governance.** Assigning permissions\n\nand managing access is complex. Custom application logic is\n\nneeded to generate signed URLs.\n\n\u0007 **Complexity.** Personas managing data sharing (DBAs, analysts)\n\nfind it difficult to understand Identity Access Management\n\n(IAM) policies and how data is mapped to underlying files. For\n\ncompanies with large volumes of data, sharing via cloud storage\n\nis time-consuming, cumbersome and nearly impossible to scale.\n\n\u0007 **Operational overhead for data recipients.** The data recipients\n\nhave to run extract, transform and load (ETL) pipelines on the\n\nraw files before consuming them for their end use cases.\n\nThe lack of a comprehensive solution makes it challenging for data\n\nproviders and consumers to easily share data. Cumbersome and\n\nincomplete data sharing processes also constrain the development\n\nof business opportunities from shared data.\n\n\nObject storage is considered a good fit for the cloud because it is\n\nelastic and can more easily scale into multiple petabytes to support\n\nunlimited data growth. The big three cloud providers all offer object\n\nstorage services (AWS S3, Azure Blob, Google Cloud Storage) that\n\nare cheap, scalable and extremely reliable.\n\nAn interesting feature of cloud object storage is the ability to\n\ngenerate signed URLs, which grant time-limited permission to\n\ndownload objects. Anyone who receives the presigned URL can\n\nthen access the specified objects, making this a convenient\n\nway to share data.\n\n**Pros**\n\n\u0007 **Sharing data in place.** Object storage can be shared in place,\n\nallowing consumers to access the latest available data.\n\n\u0007 **Scalability.** Cloud object storage profits from availability and\n\ndurability guarantees that typically cannot be achieved\n\non-premises. Data consumers retrieve data directly from the\n\ncloud providers, saving bandwidth for the providers.\n\n\n-----\n\n## Chapter 3\n Delta Sharing — An Open Standard for Secure Sharing of Data Assets\n\n\nWe believe the future of data sharing should be characterized by\n\nopen technology. Data sharing shouldn’t be tied to a proprietary\n\ntechnology that introduces unnecessary limitations and financial\n\nburdens to the process. It should be readily available to anyone who\n\nwants to share data at scale. This philosophy inspired us to develop\n\nand release a new protocol for sharing data: Delta Sharing.\n\n#### What is Delta Sharing?\n\nDelta Sharing provides an open solution to securely share live data\n\nfrom your lakehouse to any computing platform. Recipients don’t\n\n\nData providers can centrally manage, govern, audit and track\n\nusage of the shared data on one platform. 
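To make the model concrete before diving into governance, here is a minimal end-to-end sketch under stated assumptions: the provider side runs Unity Catalog SQL from a Databricks notebook (where `spark` is the notebook-provided SparkSession), and the recipient side uses the open source `delta-sharing` Python connector. Every catalog, share, recipient and table name, as well as the profile file path, is illustrative.

```python
# --- Provider side: assumed to run in a Databricks notebook with Unity Catalog enabled. ---
# Share an existing Delta table and grant a recipient read access.
spark.sql("CREATE SHARE IF NOT EXISTS retail_share")
spark.sql("ALTER SHARE retail_share ADD TABLE sales_catalog.gold.daily_orders")
spark.sql("CREATE RECIPIENT IF NOT EXISTS acme_partner")
spark.sql("GRANT SELECT ON SHARE retail_share TO RECIPIENT acme_partner")

# --- Recipient side: any Python environment; no Databricks account required. ---
# The recipient saves the profile file obtained from the provider's activation link.
import delta_sharing

profile = "/path/to/config.share"                # provider-issued credentials file (illustrative path)
client = delta_sharing.SharingClient(profile)
print(client.list_all_tables())                  # discover what has been shared

# Load a shared table into pandas; under the hood the client follows short-lived,
# pre-signed URLs that point directly at the provider's cloud storage.
df = delta_sharing.load_as_pandas(f"{profile}#retail_share.gold.daily_orders")
print(df.head())
```

The same shared table could be loaded into a Spark DataFrame with `delta_sharing.load_as_spark` instead of pandas, depending on the recipient's tooling.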
Delta Sharing is natively integrated with [Unity Catalog](https://databricks.com/product/unity-catalog), enabling organizations to centrally manage and audit shared data across organizations and confidently share data assets while meeting security and compliance needs.\n\nWith Delta Sharing, organizations can easily share existing large-scale data sets based on the open source formats Apache Parquet and Delta Lake without moving data. Teams gain the flexibility to query, visualize, transform, ingest or enrich shared data with their tools of choice.\n\nRecipients do not have to be on the Databricks platform, on the same cloud, or on a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value.\n\n-----\n\n**Figure 3:**\nDelta Sharing: a data provider shares Delta Lake tables through a Delta Sharing server that enforces access permissions; data recipients on any cloud or on-premises consume them with any tool for any use case (analytics, BI, data science). No replication, easy to manage, secure.\n\nDatabricks designed Delta Sharing with five goals in mind:\n\n\u0007Provide an open cross-platform sharing solution\n\n\u0007Share live data without copying it to another system\n\n\u0007Support a wide range of clients such as Power BI, Tableau, Apache Spark™, pandas and Java, and provide flexibility to consume data using the tools of choice for BI, machine learning and AI use cases\n\n\u0007Provide strong security, auditing and governance\n\n\u0007Scale to massive structured data sets and also allow sharing of unstructured data and future data derivatives such as ML models, dashboards and notebooks, in addition to tabular data\n\n-----\n\n#### Key benefits of Delta Sharing\n\nBy eliminating the obstacles and shortcomings associated with typical data sharing approaches, Delta Sharing delivers several key benefits, including:\n\n**Open cross-platform sharing.** Delta Sharing establishes a new open standard for secure data sharing and supports open source Delta and Apache Parquet formats. Data recipients don't have to be on the Databricks platform or on the same cloud, as Delta Sharing works across clouds and even from cloud to on-premises setups. To give customers even greater flexibility, Databricks has also released open source connectors for pandas, Apache Spark, Elixir and Python, and is working with partners on many more.\n\n\u0007 **Securely share live data without replication.** Most enterprise data today is stored in cloud data lakes.
Any of these existing data sets on the provider's data lake can easily be shared without any data replication or physical movement of data. Data providers can update their data sets reliably in real time and provide a fresh and consistent view of their data to recipients.\n\n-----\n\n**Share data products, including AI models, dashboards and notebooks, with greater flexibility.** Data providers can choose between sharing an entire table or sharing only a version or specific partitions of a table. However, sharing just tabular data is not enough to meet today's consumer demands. Delta Sharing also supports sharing of non-tabular data and data derivatives such as data streams, AI models, SQL views and arbitrary files, enabling increased collaboration and innovation. Data providers can build, package and distribute data products including data sets, AI and notebooks, allowing data recipients to get insights faster. Furthermore, this approach promotes and empowers the exchange of knowledge — not just data — between different organizations.\n\n**Share data at a lower cost.** Delta Sharing lowers the cost of managing and consuming shares for both data providers and recipients. Providers can share data from their cloud object store without replicating, thereby reducing the cost of storage. In contrast, existing data sharing platforms require data providers to first move their data into their platform or store data in proprietary formats in their managed storage, which often costs more and results in data duplication. With Delta Sharing, data providers don't need to set up separate computing environments to share data. Consumers can access shared data directly using their tools of choice without setting up specific consumption ecosystems, thereby reducing costs.\n\nWith Delta Sharing we are able to achieve a truly open marketplace and truly open ecosystem. In contrast, commercial products are mostly limited to sharing raw tabular data and cannot be used to share these higher-valued data derivatives.\n\n\u0007 **Reduced time-to-value.** Delta Sharing eliminates the need to set up a new ingestion process to consume data. Data recipients can directly access the fresh data and query it using tools of their choice. Recipients can also enrich data with data sets from popular data providers. The Delta Sharing ecosystem of open source and commercial partners is growing every day.\n\n-----\n\n#### Maximizing value of data with Delta Sharing\n\nDelta Sharing is already transforming data sharing activities for companies in a wide range of industries. Given the sheer variety of data available and the technologies that are emerging, it is hard to anticipate all the possible use cases Delta Sharing can address. The Delta Sharing approach is to share any data anytime with anyone easily and securely. In this section we will explore the building blocks of such an approach and the use cases emerging from these.\n\n“Delta Sharing helped us streamline our data delivery process for large data sets. This enables our clients to bring their own compute environment to read fresh curated data with little-to-no integration work, and enables us to continue expanding our catalog of unique, high-quality data products.”\n\n— **William Dague**, Head of Alternative Data, Nasdaq\n\n“We recognize that openness of data will play a key role in achieving Shell's Carbon Net Zero ambitions.
Delta Sharing provides Shell with a standard, controlled and secure protocol for sharing vast amounts of data easily with our partners to work toward these goals without requiring our partners be on the same data sharing platform.”\n\n— **Bryce Bartmann**, Chief Digital Technology Advisor, Shell\n\n“Leveraging the powerful capabilities of Delta Sharing from Databricks enables Pumpjack Dataworks to have a faster onboarding experience, removing the need for exporting, importing and remodeling of data, which brings immediate value to our clients. Faster results yield greater commercial opportunity for our clients and their partners.”\n\n— **Corey Zwart**, Head of Engineering, Pumpjack Dataworks\n\n“Data accessibility is a massive consideration for us. We believe that Delta Sharing will simplify data pipelines by enabling us to query fresh data from the place where it lives, and we are not locked into any platform or data format.”\n\n— **Rayne Gaisford**, Global Head of Data Strategy, Jefferies\n\n“As a data company, giving our customers access to our data sets is critical. The Databricks Lakehouse Platform with Delta Sharing really streamlines that process, allowing us to securely reach a much broader user base regardless of cloud or platform.”\n\n— **Felix Cheung**, VP of Engineering, SafeGraph\n\n-----\n\n#### Data monetization with Delta Sharing\n\nDelta Sharing enables companies to monetize their data product simply and with necessary governance.\n\n**Figure 4:**\nData monetization with Delta Sharing: a data vendor entitles various data products in Unity Catalog and shares them read-only from cloud storage via Delta Sharing to multiple data consumers, including non-Databricks customers on any cloud or on-premises, with fulfillment, billing and audit logging handled on the vendor side.\n\n-----\n\nWith Delta Sharing, a data provider can seamlessly share large data sets and overcome the scalability issues associated with SFTP servers. Data providers can easily expand their data product lines since Delta Sharing doesn't require you to build a dedicated service for each of your data products like API services would. The company simply grants and manages access to the data recipients instead of replicating the data — thereby reducing complexity and latency. Any data that exits your ELT/ETL pipelines becomes a candidate for a data product. Any data that exists on your platform can be securely shared with your consumers. This grants a wider addressable market — your products have appeal to a broader range of consumers, from those who say “we need access to your raw data only” to those who say “we want only small subsets of your Gold layer data.”\n\nTo mitigate cost concerns, Delta Sharing maintains an audit log that tracks any permitted access to the data.
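As a toy illustration of how that audit trail can feed a commercial decision, the query below aggregates access events per share and recipient over the last 30 days. The table and column names are hypothetical stand-ins for wherever the provider lands its audit events, and `spark` is assumed to be a notebook-provided SparkSession.

```python
# Toy sketch: summarize data product usage from audit events.
# The table name and columns are hypothetical; substitute the provider's real audit log.
usage_by_product = spark.sql("""
    SELECT share_name,
           recipient_name,
           COUNT(*)               AS access_count,
           SUM(bytes_transferred) AS total_bytes
    FROM   provider_ops.delta_sharing_audit_events
    WHERE  event_date >= date_sub(current_date(), 30)
    GROUP  BY share_name, recipient_name
    ORDER  BY total_bytes DESC
""")
usage_by_product.show()
```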
Data providers can use this information to determine the costs\n\nassociated with any of the data products and evaluate if such products are commercially\n\nviable and sensible.\n\n\n-----\n\n#### B2B sharing with Delta Sharing\n\nCloud Storage\n\nPartner A\n\nUnity\nCatalog\n\n\nPartner U\n\n\nUnity\nCatalog\n\nCloud Storage\n\nPartner B\n\nN o n - D ata b r i c k s C u s t o m e r\nO n a n y c lo u d o r o n - p r e m i s e s\n\nStorage\n\n\nR/O R/O\n\nR/O\n\n\n**Figure 5:**\nB2B sharing with\nDelta Sharing\n\n\nDelta\nSharing\n\n\n-----\n\nDelta Sharing applies in the case of bidirectional exchange of data.\n\nCompanies use Delta Sharing to incorporate partners and suppliers\n\nseamlessly into their workflows. Traditionally, this is not an easy task.\n\nAn organization typically has no control over how their partners are\n\nimplementing their own data platforms. The complexity increases\n\nwhen we consider that the partners and suppliers can reside in\n\na public cloud, private cloud or an on-premises deployed data\n\nplatform. The choices of platform and architecture are not imposed\n\non your partners and suppliers. Due to its open protocol, Delta\n\nSharing addresses this requirement foundationally. Through a wide\n\narray of existing connectors (and many more being implemented),\n\nyour data can land anywhere your partners and suppliers need to\n\nconsume it.\n\n\nIn addition to the location of data consumer residency, the\n\ncomplexity of data arises as a consideration. The traditional\n\napproach to sharing data using APIs is inflexible and imposes\n\nadditional development cycles on both ends of the exchange in\n\norder to implement both the provider pipelines and consumer\n\npipelines. With Delta Sharing, this problem can be abstracted. Data\n\ncan be shared as soon as it lands in the Delta table and when the\n\nshares and grants are defined. There are no implementation costs\n\non the provider side. On the consumer side, data simply needs\n\nto be ingested and transformed into an expected schema for the\n\ndownstream processes.\n\nThis means that you can form much more agile data exchange\n\npatterns with your partners and suppliers and attain value from your\n\ncombined data much quicker than ever before.\n\n\n-----\n\n#### Internal data sharing with Delta Sharing\n\nInternal data sharing is becoming an increasingly important consideration for any modern\n\norganization, particularly where data describing the same concepts have been produced in\n\ndifferent ways and in different data silos across the organization. In this situation it is important\n\nto design systems and platforms that allow governed and intentional federation of data and\n\nprocesses, and at the same time allow easy and seamless integration of said data and processes.\n\nArchitectural design patterns such as Data Mesh have emerged to address these specific\n\nchallenges and considerations. Data Mesh architecture assumes a federated design and\n\ndissemination of ownership and responsibility to business units or divisions. This, in fact, has\n\nseveral advantages, chief among them that data is owned by the parts of the organization closest\n\nto the source of the data. Data residence is naturally enforced since data sits within the geo-\n\nlocality where it has been generated. Finally, data volumes and data variety are kept in control\n\ndue to the localization within a data domain (or data node). 
On the other hand, the architecture promotes exchange of data between different data domains when that data is needed to deliver outcomes and better insights.\n\n-----\n\n**Figure 6:**\nBuilding a Data Mesh with Delta Sharing: business units in different regions, each with their own cloud storage and Unity Catalog, exchange read-only data with one another via Delta Sharing, including business units that are not Databricks customers, on any cloud or on-premises.\n\n-----\n\nUnity Catalog enables consolidated data access control across different data domains within an organization using the Lakehouse on Databricks. In addition, Unity Catalog adds a set of simple and easy-to-use declarative APIs to govern and control data exchange patterns between the data domains in the Data Mesh.\n\nTo make matters even more complicated, organizations can grow through mergers and acquisitions. In such cases we cannot assume that organizations being acquired have followed the same set of rules and standards to define their platforms and produce their data. Furthermore, we cannot even assume that they have used the same cloud providers, nor can we assume the complexity of their data models. Delta Sharing can simplify and accelerate the unification and assimilation of newly acquired organizations and their data and processes. Individual organizations can be treated as new data domains in the overarching mesh. Only selected data sources can be exchanged between the different platforms. This enables teams to move freely between the organizations that are merging without losing their data — if anything, they are empowered to drive insights of higher quality by combining the data of both.\n\nWith Unity Catalog and Delta Sharing, the Lakehouse architecture seamlessly combines with the Data Mesh architecture to deliver more power than ever before, pushing the boundaries of what's possible and simplifying activities that were deemed daunting not so long ago.\n\n-----\n\n## Chapter 4\n How Delta Sharing Works\n\nDelta Sharing is designed to be simple, scalable, nonproprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing is natively integrated with Unity Catalog, which enables customers to add fine-grained governance and security controls, making it easy and safe to share data internally or externally.\n\nDelta Sharing is a simple REST protocol that securely grants temporary access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3, Azure ADLS or Google's GCS — to reliably grant read-only access to large data sets. Here's how it works for data providers and data recipients.\n\n**Figure 7:**\nHow Delta Sharing works connecting data providers and data recipients: the recipient's Delta Sharing client requests a table from the provider's Delta Sharing server, which checks access permissions and returns pre-signed, short-lived URLs that give the client temporary direct access to the Parquet files of the Delta Lake table in the object store (AWS S3, GCP, ADLS).\n\n-----\n\n#### Data providers\n\nThe data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format.
#### Data recipients\n\nThe data recipient only needs one of the many Delta Sharing clients that support the protocol. Databricks has released open source connectors for pandas, Apache Spark, Java and Python, and is working with partners on many more.\n\n\n#### The data exchange\n\nThe Delta Sharing data exchange follows three efficient steps:\n\n**1.** The recipient's client authenticates to the sharing server and asks to query a specific table. The client can also provide filters on the data (for example, "country=US") as a hint to read just a subset of the data.\n\n**2.** The server verifies whether the client is allowed to access the data, logs the request, and then determines which data to send back. This will be a subset of the data objects in cloud storage systems that make up the table.\n\n**3.** To allow temporary access to the data, the server generates short-lived presigned URLs that allow the client to read Parquet files directly from the cloud provider, so that the read-only access can happen in parallel at massive bandwidth, without streaming through the sharing server.\n\n
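To ground these three steps, here is a minimal recipient-side sketch using the open source delta-sharing Python connector; the profile file path and the share, schema and table names are illustrative assumptions, and the profile file is the credential file obtained from the provider.\n\nimport delta_sharing\n\n# The profile file contains the sharing server endpoint and a bearer token\nprofile = "/dbfs/tmp/partner_share.share"  # illustrative path\n\n# List everything that has been shared with this recipient\nclient = delta_sharing.SharingClient(profile)\nprint(client.list_all_tables())\n\n# Load a shared table into pandas; the URL format is <profile>#<share>.<schema>.<table>\ndf = delta_sharing.load_as_pandas(profile + "#partner_share.finance.transactions")\n\n# Or load it as a Spark DataFrame for larger tables (requires the Spark connector)\nspark_df = delta_sharing.load_as_spark(profile + "#partner_share.finance.transactions")\n\nBehind either call, the connector performs exactly the handshake described above: it authenticates with the token, asks the server for the table, and then reads the returned presigned Parquet file URLs directly from cloud storage.\n\n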
-----\n\n## Chapter 5\n Introducing Databricks Marketplace\n\n\nEnterprises need open collaboration for data and AI. Data sharing — within an organization or externally — allows companies to collaborate with partners, establish new partnerships and generate new revenue streams with data monetization.\n\nThe demand for generative AI is driving disruption across industries, increasing the urgency for technical teams to build generative AI models and Large Language Models (LLMs) on top of their own data to differentiate their offerings.\n\nTraditional data marketplaces are restricted and offer only data or simple applications, therefore limiting their value to data consumers. They also don't offer tools to evaluate the data assets beyond basic descriptions or examples. Finally, data delivery is limited, often requiring ETL or a proprietary delivery mechanism.\n\nEnterprises need a better way to share data and AI that is flexible, secure and unlocks business value. An ecosystem makes data sharing and collaboration powerful.\n\n**Today, data marketplaces present many challenges and collaboration can be complex for both data consumers and data providers.**\n\n**Data Consumers:** focus on data only or simple applications; lengthy discovery and evaluation; delayed time-to-insights with vendor lock-in.\n\n**Data Providers:** limited opportunities to monetize new types of assets; difficulty reaching more users; lack of secure technology and unified governance.\n\n\n-----\n\n#### Challenges in today's data marketplaces\n\n**Data Consumers**\n\n**Focus on data only or simple applications:** Accessing only data sets means organizations looking to take advantage of AI/ML need to look elsewhere or start from scratch, causing delays in driving business insights.\n\n**Lengthy discovery and evaluation:** The tools most marketplaces provide for data consumers to evaluate data are simply descriptions and example SQL statements. Minimal evaluation tools mean it takes more time to figure out if a data product is right for you, which might include more time in back-and-forth messages with a provider or searching for a new provider altogether.\n\n**Delayed time-to-insights with vendor lock-in:** Delivery through proprietary sharing technologies or FTP means either vendor lock-in or lengthy ETL processes to get the data where you need to work with it.\n\n**Data Providers**\n\n**Limited opportunities to monetize new types of assets:** A data-only approach means organizations cannot monetize anything beyond a data set and will face more friction when creating new revenue opportunities with non-compatible platforms.\n\n**Difficulty reaching more users:** Data providers must choose between forgoing potential business or incurring the expense of replicating data.\n\n**Lack of secure technology and unified governance:** Without open standards for sharing data securely across platforms and clouds, data providers must use multiple tools to secure access to scattered data, leading to inconsistent governance.\n\n\n-----\n\n#### What is Databricks Marketplace?\n\nDatabricks Marketplace is an open marketplace for all your data, analytics and AI, powered by Delta Sharing.\n\nSince Marketplace is powered by Delta Sharing, you can benefit from open source flexibility and no vendor lock-in, enabling you to collaborate across all platforms, clouds and regions. This open approach allows you to put your data to work more quickly in every cloud with your tools of choice.\n\nMarketplace brings together a vast ecosystem of data consumers and data providers to collaborate across a wide array of data sets without platform dependencies, complicated ETL, expensive replication and vendor lock-in.\n\n
-----\n\n#### Key Benefits of Databricks Marketplace\n\nDatabricks Marketplace provides key benefits for both data consumers and data providers.\n\n**Consumers:** discover more than just data; evaluate data products faster; avoid vendor lock-in.\n\n**Providers:** reach users on any platform; monetize more than just data; share data securely.\n\n\n-----\n\n#### Databricks Marketplace drives innovation and expands revenue opportunities\n\n##### Data Consumers\n\nFor data consumers, the Databricks Marketplace dramatically expands the opportunity to deliver innovation and advance analytics and AI initiatives.\n\n**Discover more than just data:** Access more than just data sets, including AI models, notebooks, applications and solutions.\n\n**Evaluate data products faster:** Pre-built notebooks and sample data help you quickly evaluate and have much greater confidence that a data product is right for your AI or analytics initiatives. Obtain the fastest and simplest time to insight.\n\n**Avoid vendor lock-in:** Substantially reduce the time to deliver insights and avoid lock-in with open and seamless sharing and collaboration across clouds, regions or platforms. Directly integrate with your tools of choice and right where you work.\n\n##### Data Providers\n\nFor data providers, the Databricks Marketplace provides the ability to reach new users and unlock new revenue opportunities.\n\n**Reach users on any platform:** Expand your reach across platforms and access a massive ecosystem beyond walled gardens. Streamline delivery of simple data sharing to any cloud or region, without replication.\n\n**Monetize more than just data:** Monetize the broadest set of data assets, including data sets, notebooks and AI models, to reach more data consumers.\n\n**Share data securely:** Share all your data sets, notebooks, AI models, dashboards and more securely across clouds, regions and data platforms.\n\n\n-----\n\n#### Enable collaboration and accelerate innovation\n\n#### Powered by a fast, growing ecosystem\n\nEnterprises need open collaboration for data and AI. 
In the past few months, we've continued to increase partners across industries, including Retail, Communications, Media & Entertainment and Financial Services, with 520+ listings from 80+ providers (and counting) that you can explore in our open Marketplace.\n\n#### Use cases for an open marketplace\n\nOrganizations across all industries have many use cases for consuming and sharing third-party data, from the simple (dataset joins) to the more advanced (AI notebooks, applications and dashboards).\n\n**Advertising and Retail:** Incorporate shopper behavior analysis | Ads uplift/performance | Demand forecasting | "Next best SKU" prediction | Inventory analysis | Live weather data\n\n**Finance:** Incorporate data from stock exchanges to predict economic impact | Market research | Public census and housing data to predict insurance sales\n\n**Healthcare and Life Sciences:** Genomic target identification | Patient risk scoring | Accelerating drug discovery | Commercial effectiveness | Clinical research\n\nFor more on Databricks Marketplace, go to [marketplace.databricks.com](http://marketplace.databricks.com) , or refer to the Resources section of this eBook.\n\n\n-----\n\n#### New upcoming feature: AI model sharing\n\nNowadays, it may seem like every organization wants to become an AI organization. However, most organizations are new to AI. Databricks has heard from customers that they want to discover out-of-the-box AI models on Marketplace to help them kickstart their AI innovation journey.\n\nTo meet this demand, Databricks will be adding AI model sharing capabilities on Marketplace to provide users access to both OSS and proprietary (both first- and third-party) AI models. This will enable data consumers and providers to discover and monetize AI models and integrate AI into their data solutions.\n\nUsing this feature, data consumers can evaluate AI models with rich previews, including visualizations and pre-built notebooks with sample data. With Databricks Marketplace, there are no difficult data delivery mechanisms — you can get the AI models instantly with the click of a button. All of this works out-of-the-box with the AI capabilities of the Databricks Lakehouse Platform for both real-time and batch inference. For real-time inference, you can use model serving endpoints. For batch inference, you can invoke the models as functions directly from DBSQL or notebooks.\n\nWith AI model sharing, Databricks customers will have access to best-in-class models from leading providers, as well as OSS models published by Databricks, which can be quickly and securely applied on top of their data. Databricks will curate and publish its own open source models across common use cases, such as instruction-following and text summarization, and optimize the tuning or deployment of these models.\n\nUsing AI models from Databricks Marketplace can help your organization summarize complex information quickly and easily to help accelerate the pace of innovation.\n\n\n-----\n\n## Chapter 6\n Share securely with Databricks Clean Rooms\n\n\nWhile the demand for external data to make data-driven innovations is greater than ever, there is growing concern among organizations around data privacy. 
The need for organizations to share data and collaborate with their partners and customers in a secure, governed and privacy-centric way is driving the concept of "data clean rooms."\n\n#### What is a data clean room?\n\nA data clean room provides a secure, governed and privacy-safe environment where participants can bring their sensitive data, which might include personally identifiable information (PII), and perform joint analysis on that private data. Participants have full control of the data and can decide which participants can perform what analysis without exposing any sensitive data.\n\n**Figure 8:** Data clean room diagram example for audience overlap analysis in advertising. Collaborator A (e.g., agencies, publishers, MVPDs, retailers) and Collaborator B (e.g., advertisers) each bring their own sensitive data into a secure and privacy-preserving environment to answer questions such as: What is our audience overlap? How did my campaign do in terms of reach and frequency? What is the lift in purchases among those in-segment versus those out-of-segment?\n\n\n-----\n\nA data clean room is not a new concept. Google introduced the idea in 2017 when it announced Ads Data Hub, which allows advertisers to gain impression-level insights about cross-device media campaigns in a more secure, privacy-safe environment. In the last few years, the demand for clean rooms has accelerated. IDC predicts that by 2024, 65% of G2000 enterprises will form data sharing partnerships with external stakeholders via data clean rooms to increase interdependence while safeguarding data privacy.\n\nThere are various compelling needs driving this demand:\n\n**Privacy-first world.** Stringent data privacy regulations such as GDPR and CCPA, along with sweeping changes in third-party measurement, have transformed how organizations collect, use and share data. For example, Apple's [App Tracking Transparency Framework](https://developer.apple.com/app-store/user-privacy-and-data-use/) (ATT) provides users of Apple devices the freedom and flexibility to easily opt out of app tracking. Google also plans to [phase out support for third-party cookies in Chrome](https://blog.google/products/chrome/updated-timeline-privacy-sandbox-milestones/) by late 2024. As these privacy laws and practices evolve, the demand for data clean rooms is likely to rise as the industry moves to new identifiers that are PII based, such as UID 2.0, and organizations try to find new ways to share and join data with customers and partners in a privacy-centric way.\n\n**Collaboration in a fragmented ecosystem.** Today, consumers have more options than ever before when it comes to where, when and how they engage with content. As a result, the digital footprint of consumers is fragmented across different platforms, necessitating that companies collaborate with their partners to create a unified view of their customers' needs and requirements. To facilitate collaboration across organizations, clean rooms provide a secure and private way to combine their data with other data to unlock new insights or capabilities.\n\n**New ways to monetize data.** Most organizations are looking to monetize their data in one form or another. 
With today's privacy laws, companies will try to find any possible advantage to monetize their data without the risk of breaking privacy rules. This creates an opportunity for data vendors or publishers to join data for big data analytics without having direct access to the data.\n\n\n-----\n\n#### Common data clean room use cases\n\n#### Category management for retail and consumer goods\n\nClean rooms enable real-time collaboration between retailers and suppliers, ensuring secure information exchange for demand forecasting, inventory planning and supply chain optimization. This improves product availability, reduces costs and streamlines operations for both parties.\n\n#### Real-world evidence (RWE) for healthcare\n\nClean rooms provide secure access to sensitive healthcare data sets, allowing collaborators to connect and query multiple sources of data without compromising data privacy. This supports RWE use cases such as regulatory decisions, safety, clinical trial design and observational research.\n\n#### Audience overlap exploration for media and entertainment\n\nBy creating a clean room environment, media companies can securely share their audience data with advertisers or other media partners. This allows them to perform in-depth analysis and identify shared audience segments without directly accessing or exposing individual user information.\n\n#### Know Your Customer (KYC) in banking\n\nKYC standards are designed to combat financial fraud, money laundering and terrorism financing. Clean rooms can be used within a given jurisdiction to allow financial services companies to collaborate and run shared analytics to build a holistic view of a transaction for investigations.\n\n\n-----\n\n#### Personalization with expanded interests for retailers\n\nRetailers want to target consumers based on past purchases, as well as other purchases with different retailers. Clean rooms enable retailers to augment their knowledge of consumers to suggest new products and services that are relevant to the individual but have not yet been purchased.\n\n#### 5G data monetization for telecom\n\n5G data monetization enables telecoms to capitalize on data from 5G networks. Clean rooms provide a secure environment for collaboration with trusted partners, ensuring privacy while maximizing data value for optimized services, personalized experiences and targeted advertising.\n\n\n-----\n\n#### Shortcomings of existing data clean rooms\n\nOrganizations exploring clean room options are finding some glaring shortcomings in the existing solutions that limit the full potential of the "clean rooms" concept.\n\nFirst, many existing data clean room vendors require data to be on the same cloud, same region, and/or same data platform. Participants then have to move data into proprietary platforms, which results in lock-in and additional data storage costs.\n\nSecond, most existing solutions are not scalable enough to expand collaboration beyond a few collaborators at a time. For example, an advertiser might want to get a detailed view of their ad performance across different platforms, which requires analysis of the aggregated data from multiple data publishers. 
With\n\ncollaboration limited to just a few participants, organizations get\n\npartial insights on one clean room platform and end up moving\n\ntheir data to another clean room vendor to aggregate the data,\n\nincurring the operational overhead of collating partial insights.\n\nFinally, existing clean room solutions do not provide the flexibility\n\nto run arbitrary analysis and are mainly restricted to SQL, a\n\nsubset of Python, and pre-defined templates. While SQL is\n\nabsolutely needed for clean rooms, there are times when you\n\nrequire complex computations such as machine learning or\n\nintegration with APIs where SQL doesn’t satisfy the full depth of\n\nthe technical requirements.\n\n\n-----\n\n#### Key benefits of Databricks Clean Rooms\n\nDatabricks Clean Rooms allow businesses to easily collaborate with their customers and partners in a secure environment on\n\nany cloud in a privacy-safe way. Key benefits of Databricks Clean Rooms include:\n\n\n**Flexible - your language and workload of**\n\n**choice.** Databricks Clean Rooms empower\n\ncollaborators to share and join their existing\n\ndata and run complex workloads in any\n\nlanguage —Python, R, SQL, Java and Scala —\n\non the data while maintaining data privacy.\n\nBeyond traditional SQL, users can run arbitrary\n\nworkloads and languages, allowing them to train\n\nmachine learning models, perform inference\n\nand utilize open-source or third-party privacy-\n\nenhancing technologies. This flexibility enables\n\ndata scientists and analysts to achieve more\n\ncomprehensive and advanced data analysis\n\nwithin the secure Clean Room environment.\n\n\n**Scalable, multi-party collaboration.**\n\nWith Databricks Clean Rooms, you can\n\nlaunch a clean room and work with multiple\n\ncollaborators at a time. This capability\n\nenables real-time collaboration, fostering\n\nefficient and rapid results. Moreover,\n\nDatabricks Clean Rooms seamlessly\n\nintegrate with identity service providers,\n\nallowing users to leverage offerings from\n\nthese providers during collaboration. The\n\nability to collaborate with multiple parties\n\nand leverage identity services enhances the\n\noverall data collaboration experience within\n\nDatabricks Clean Rooms.\n\n\n**Interoperable - any data source**\n\n**with no replication.** Databricks Clean\n\nRooms excel in interoperability, ensuring\n\nsmooth collaboration across diverse\n\nenvironments. With Delta Sharing,\n\ncollaborators can seamlessly work\n\ntogether across different cloud providers,\n\nregions and even data platforms without\n\nthe need for extensive data movement.\n\nThis eliminates data silos and enables\n\norganizations to leverage existing\n\ninfrastructure and data ecosystems while\n\nmaintaining the utmost security and\n\ncompliance.\n\n\n-----\n\n## Resources\n Getting started with Data Sharing and Collaboration\n\n\nData sharing plays a key role in business processes across the\n\nenterprise, from product development and internal operations to\n\ncustomer experience and compliance. However, most businesses\n\nhave been slow to move forward because of incompatibility\n\nbetween systems, complexity and security concerns.\n\nData-driven organizations need an open — and secure — approach\n\nto data sharing.\n\n\nDatabricks offers an open approach to data sharing and\n\ncollaboration with a variety of tools to:\n\n\u0007 **Share across platforms:** You can share live data sets, as well\n\nas AI models, dashboards and notebooks across platforms,\n\nclouds and regions. 
This open approach is powered by\n\nDelta Sharing, the world’s first open protocol for secure data\n\nsharing, which allows organizations to share data for any use\n\ncase, any tool and on any cloud.\n\n\u0007 **Share all your data and AI: Databricks Marketplace** is an\n\nopen marketplace for all your data, analytics and AI, enabling\n\nboth data consumers and data providers with the ability to\n\ndeliver innovation and advance analytics and AI initiatives.\n\n\u0007 **Share securely: Databricks Clean Rooms** allows businesses\n\nto easily collaborate with customers and partners on any\n\ncloud in a privacy-safe way. With Delta Sharing, clean room\n\nparticipants can securely share data from their data lakes\n\nwithout any data replication across clouds or regions. Your\n\ndata stays with you without vendor lock-in, and you can\n\ncentrally audit and monitor the usage of your data.\n\n\n-----\n\nGet started with these products by exploring the resources below.\n\n\n**Delta Sharing**\n\n\u0007 [Data Sharing on Databricks](https://www.databricks.com/product/delta-sharing)\n\n[\u0007Learn about Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog)\n\n[\u0007Blog post: What’s new with Data Sharing and Collaboration on the](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n\n[Lakehouse](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n\n[\u0007Learn about open source Delta Sharing](https://delta.io/sharing/)\n\n[Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n\n**Databricks Marketplace**\n\n[\u0007Learn about Databricks Marketplace](https://www.databricks.com/product/marketplace)\n\n[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n\n[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n\n[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n\n[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n\n\n[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n\n\n**Databricks Clean Rooms**\n\n\u0007 [Learn about Databricks Clean Rooms](https://www.databricks.com/product/clean-room)\n\n[\u0007Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[\u0007eBook: The Definitive Guide to Data Clean Rooms](https://www.databricks.com/resources/ebook/market-smarter-data-clean-rooms)\n\n[\u0007Webinar: Unlock the Power of Secure Data Collaboration](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n\n[with Clean 
Rooms](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n\n\n[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n\n\n-----\n\n## About the Authors\n\n\n**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n\nmaking analytics and AI simple for customers by leveraging the\n\npower of the Databricks Lakehouse Platform. You can reach Vuong\n\non [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n\n\n**Sachin Thakur** is a Principal Product Marketing Manager on the\n\nDatabricks Data Engineering and Analytics team. His area of focus\n\nis data governance with Unity Catalog, and he is passionate about\n\nhelping organizations democratize data and AI with the Databricks\n\nLakehouse Platform. You can reach Sachin on [LinkedIn](https://www.linkedin.com/in/sachin10thakur/) .\n\n\n**Milos Colic** is a Senior Solution Architect at Databricks. His\n\n\npassion is to help customers with their data exchange and data\n\nmonetization needs. Furthermore, he is passionate about geospatial\n\ndata processing and ESG. You can reach Milos on [LinkedIn](https://www.linkedin.com/in/milos-colic/) .\n\n\n**Jay Bhankharia** is a Senior Director on the Databricks Data\n\nPartnerships team. His passion is to help customers gain insights\n\nfrom data to use the power of the Databricks Lakehouse Platform\n\nfor their analytics needs. You can reach Jay on [LinkedIn](https://www.linkedin.com/in/jay-bhankharia-cfa-b9835612/) .\n\n\n**Itai Weiss** is a Lead Delta Sharing Specialist at Databricks and has\n\n\nover 20 years of experience in helping organizations of any size\n\nbuild data solutions. He focuses on data monetization and loves to\n\nhelp customers and businesses get more value from the data they\n\nhave. You can reach Itai on [LinkedIn](https://www.linkedin.com/in/itai-weiss/) .\n\n**Somasekar Natarajan** (Som) is a Solution Architect at\n\nDatabricks specializing in enterprise data management. Som has\n\nworked with Fortune organizations spanning three continents for\n\nclose to two decades with one objective — helping customers to\n\n\n**Giselle Goicochea** is a Senior Product Marketing Manager\n\non the Databricks Data Engineering and Analytics team. Her area\n\nof focus is data sharing and collaboration with Delta Sharing and\n\nDatabricks Marketplace. You can reach Giselle on [LinkedIn](https://www.linkedin.com/in/giselle-goicochea/) .\n\n**Kelly Albano** is a Product Marketing Manager on the Databricks\n\nData Engineering and Analytics team. Her area of focus is security,\n\ncompliance and Databricks Clean Rooms. You can reach\n\nKelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n\n\nharness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. 
Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world's toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n© Databricks 2023 All rights reserved\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" ], [ "##### The Delta Lake Series Complete Collection\n\n\n-----\n\n### What is Delta Lake?\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully compatible with Apache Spark™ APIs.\n\nAt Databricks, we've seen how Delta Lake can bring reliability, performance and lifecycle management to data lakes. With Delta Lake, there will be no more malformed data ingestion, difficulties deleting data for compliance, or issues modifying data for data capture.\n\nWith Delta Lake, you can accelerate the velocity at which high-quality data gets into your data lake and the rate at which teams can leverage that data with a secure and scalable cloud service.\n\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive format so that you can gain a full understanding of Delta Lake and its capabilities.\n\n\n-----\n\n**Here's what you'll find inside**\n\n**Contents**\n\n**Chapter 01: Fundamentals and Performance**\n\n- The Fundamentals of Delta Lake: Why Reliability and Performance Matter\n\n- Unpacking the Transaction Log\n\n- How to Use Schema Enforcement and Evolution\n\n- Delta Lake DML Internals\n\n- How Delta Lake Quickly Processes Petabytes With Data Skipping and Z-Ordering\n\n**Chapter 02: Features**\n\n- Why Use MERGE With Delta Lake?\n\n- Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs\n\n- Time Travel for Large-Scale Data Lakes\n\n- Easily Clone Your Delta Lake for Testing, Sharing and ML Reproducibility\n\n- Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0\n\n**Chapter 03: Lakehouse**\n\n- What Is a Lakehouse?\n\n- Diving Deep Into the Inner Workings of the Lakehouse and Delta Lake\n\n- Understanding Delta Engine\n\n**Chapter 04: Streaming**\n\n- How Delta Lake Solves Common Pain Points in Streaming\n\n- USE CASE #1: Simplifying Streaming Stock Data Analysis Using Delta Lake\n\n- USE CASE #2: How Tilting Point Does Streaming Ingestion Into Delta Lake\n\n- USE CASE #3: Building a Quality of Service Analytics Solution for Streaming Video Services\n\n**Chapter 05: Customer Use Cases**\n\n- Healthdirect Australia Provides Personalized and Secure Online Patient Care With Databricks\n\n- Comcast Uses Delta Lake and MLflow to Transform the Viewer Experience\n\n- Banco Hipotecario Personalizes the Banking Experience With Data and ML\n\n- Viacom18 Migrates From Hadoop to Databricks to Deliver More Engaging Experiences\n\n\n-----\n\n**Fundamentals and Performance**\nBoost data reliability for machine learning and business intelligence with Delta Lake\n\n## CHAPTER 01\n\n\n-----\n\n**The Fundamentals of Delta Lake: Why Reliability and Performance Matter**\n\nWhen it comes to data reliability, performance — the speed at which your programs run — is of utmost importance. 
Because of the ACID transactional protections that\nDelta Lake provides, you’re able to get the reliability and performance you need.\n\nWith Delta Lake, you can stream and batch concurrently, perform CRUD operations,\nand save money because you’re now using fewer VMs. It’s easier to maintain your data\nengineering pipelines by taking advantage of streaming, even for batch jobs.\n\nDelta Lake is a storage layer that brings reliability to your data lakes built on HDFS and\ncloud object storage by providing ACID transactions through optimistic concurrency\ncontrol between writes and snapshot isolation for consistent reads during writes.\nDelta Lake also provides built-in data versioning for easy rollbacks and reproducing\nreports.\n\nIn this chapter, we’ll share some of the common challenges with data lakes as well as\nthe Delta Lake features that address them.\n\n**Challenges with data lakes**\nData lakes are a common element within modern data architectures. They serve as a\ncentral ingestion point for the plethora of data that organizations seek to gather and\nmine. While a good step forward in getting to grips with the range of data, they run\ninto the following common problems:\n\n\n-----\n\n**1. \u0007Reading and writing into data lakes is not reliable.** Data engineers often run into\nthe problem of unsafe writes into data lakes that cause readers to see garbage\ndata during writes. They have to build workarounds to ensure readers always see\nconsistent data during writes.\n\n**2. \u0007The data quality in data lakes is low.** Dumping unstructured data into a data\nlake is easy, but this comes at the cost of data quality. Without any mechanisms\nfor validating schema and the data, data lakes suffer from poor data quality. As a\nconsequence, analytics projects that strive to mine this data also fail.\n\n**3. Poor performance with increasing amounts of data.** As the amount of data\nthat gets dumped into a data lake increases, the number of files and directories\nalso increases. Big data jobs and query engines that process the data spend a\nsignificant amount of time handling the metadata operations. This problem is more\npronounced in the case of streaming jobs or handling many concurrent batch jobs.\n\n**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\nbuild complicated pipelines to read entire partitions or tables, modify the data and\nwrite them back. Such pipelines are inefficient and hard to maintain.\n\nBecause of these challenges, many big data projects fail to deliver on their vision or\nsometimes just fail altogether. We need a solution that enables data practitioners to\nmake use of their existing data lakes, while ensuring data quality.\n\n**Delta Lake’s key functionalities**\nDelta Lake addresses the above problems to simplify how you build your data lakes.\nDelta Lake offers the following key functionalities:\n\n**• ACID transactions:** Delta Lake provides ACID transactions between multiple\nwrites. Every write is a transaction, and there is a serial order for writes recorded in\na transaction log. The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n\n\n-----\n\n[concurrency control](https://en.wikipedia.org/wiki/Optimistic_concurrency_control) , which is ideally suited for data lakes since multiple writes\ntrying to modify the same files don’t happen that often. 
In scenarios where\nthere is a conflict, Delta Lake throws a concurrent modification exception for\nusers to handle them and retry their jobs. Delta Lake also offers the highest level\nof isolation possible ( [serializable isolation](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable) ) that allows engineers to continuously\nkeep writing to a directory or table and consumers to keep reading from the same\ndirectory or table. Readers will see the latest snapshot that existed at the time the\nreading started.\n\n**• \u0007Schema management:** Delta Lake automatically validates that the schema of the\nDataFrame being written is compatible with the schema of the table. Columns that\nare present in the table but not in the DataFrame are set to null. If there are extra\ncolumns in the DataFrame that are not present in the table, this operation throws\nan exception. Delta Lake has DDL to add new columns explicitly and the ability to\nupdate the schema automatically.\n\n**• \u0007Scalable metadata handling:** Delta Lake stores the metadata information of\na table or directory in the transaction log instead of the metastore. This allows\nDelta Lake to list files in large directories in constant time and be efficient while\nreading data.\n\n**• Data versioning and time travel:** Delta Lake allows users to read a previous\nsnapshot of the table or directory. When files are modified during writes, Delta\nLake creates newer versions of the files and preserves the older versions. When\n\n\nusers want to read the older versions of the table or directory, they can provide\na timestamp or a version number to Apache Spark’s read APIs, and Delta Lake\nconstructs the full snapshot as of that timestamp or version based on the\ninformation in the transaction log. This allows users to reproduce experiments and\nreports and also revert a table to its older versions, if needed.\n\n**• Unified batch and streaming sink:** Apart from batch writes, Delta Lake can also\nbe used as an efficient streaming sink with [Apache Spark’s structured streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) .\nCombined with ACID transactions and scalable metadata handling, the efficient\nstreaming sink enables lots of near real-time analytics use cases without having to\nmaintain a complicated streaming and batch pipeline.\n\n**• Record update and deletion:** Delta Lake will support merge, update and delete\nDML commands. This allows engineers to easily upsert and delete records in data\nlakes and simplify their change data capture and GDPR use cases. Since Delta Lake\ntracks and modifies data at file-level granularity, it is much more efficient than\nreading and overwriting entire partitions or tables.\n\n**• Data expectations (coming soon):** Delta Lake will also support a new API to set\ndata expectations on tables or directories. Engineers will be able to specify a\nboolean condition and tune the severity to handle data expectations. When Apache\nSpark jobs write to the table or directory, Delta Lake will automatically validate\nthe records and when there is a violation, it will handle the records based on the\nseverity provided.\n\n\n-----\n\n**Unpacking the**\n**Transaction Log**\n\nThe transaction log is key to understanding Delta Lake because it is the common thread\nthat runs through many of its most important features, including ACID transactions,\nscalable metadata handling, time travel and more. 
The Delta Lake transaction log is\nan ordered record of every transaction that has ever been performed on a Delta Lake\ntable since its inception.\n\nDelta Lake is built on top of [Apache Spark](https://databricks.com/spark/about) to allow multiple readers and writers of a\ngiven table to work on the table at the same time. To show users correct views of the\ndata at all times, the transaction log serves as a single source of truth: the central\nrepository that tracks all changes that users make to the table.\n\nWhen a user reads a Delta Lake table for the first time or runs a new query on an\nopen table that has been modified since the last time it was read, Spark checks the\ntransaction log to see what new transactions are posted to the table. Then, Spark\nupdates the end user’s table with those new changes. This ensures that a user’s\nversion of a table is always synchronized with the master record as of the most recent\nquery and that users cannot make divergent, conflicting changes to a table.\n\nIn this chapter, we’ll explore how the Delta Lake transaction log offers an elegant\nsolution to the problem of multiple concurrent reads and writes.\n\n\n-----\n\n**Implementing atomicity to ensure**\n**operations complete fully**\nAtomicity is one of the four properties of ACID transactions that guarantees that\noperations (like an INSERT or UPDATE) performed on your [data lake](https://databricks.com/glossary/data-lake) either complete\nfully or don’t complete at all. Without this property, it’s far too easy for a hardware\nfailure or a software bug to cause data to be only partially written to a table, resulting\nin messy or corrupted data.\n\nThe transaction log is the mechanism through which Delta Lake is able to offer\nthe guarantee of atomicity. For all intents and purposes, if it’s not recorded in the\ntransaction log, it never happened. By only recording transactions that execute fully\nand completely, and using that record as the single source of truth, the Delta Lake\ntransaction log allows users to reason about their data and have peace of mind about\nits fundamental trustworthiness, at petabyte scale.\n\n**Dealing with multiple concurrent reads and writes**\nBut how does Delta Lake deal with multiple concurrent reads and writes? Since Delta\nLake is powered by Apache Spark, it’s not only possible for multiple users to modify a\n\n\ntable at once — it’s expected. To handle these situations, Delta Lake employs **optimistic**\n**concurrency control** .\n\nOptimistic concurrency control is a method of dealing with concurrent transactions\nthat assumes the changes made to a table by different users can complete without\nconflicting with one another. It is incredibly fast because when dealing with petabytes\nof data, there’s a high likelihood that users will be working on different parts of the data\naltogether, allowing them to complete non-conflicting transactions simultaneously.\n\nOf course, even with optimistic concurrency control, sometimes users do try to\nmodify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\nfor that. 
Delta Lake handles these cases by implementing a rule of mutual exclusion,\nthen it attempts to solve any conflict optimistically.\n\nThis protocol allows Delta Lake to deliver on the ACID principle of isolation, which\nensures that the resulting state of the table after multiple, concurrent writes is the\nsame as if those writes had occurred serially, in isolation from one another.\n\n\n-----\n\nAs all the transactions made on Delta Lake tables are stored directly to disk, this\nprocess satisfies the ACID property of durability, meaning it will persist even in the\nevent of system failure.\n\n**Time travel, data lineage and debugging**\nEvery table is the result of the sum total of all the commits recorded in the Delta Lake\ntransaction log — no more and no less. The transaction log provides a step-by-step\ninstruction guide, detailing exactly how to get from the table’s original state to its\ncurrent state.\n\nTherefore, we can recreate the state of a table at any point in time by starting with\nan original table, and processing only commits made after that point. This powerful\nability is known as “time travel,” or data versioning, and can be a lifesaver in any number\n\n\nof situations. For more information, please refer to [Introducing Delta Time Travel for](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n[Large-Scale Data Lakes](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) and [Getting Data Ready for Data Science With Delta Lake and](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n[MLflow.](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n\nAs the definitive record of every change ever made to a table, the Delta Lake\ntransaction log offers users a verifiable data lineage that is useful for governance,\naudit and compliance purposes. It can also be used to trace the origin of an\ninadvertent change or a bug in a pipeline back to the exact action that caused it. Users\ncan run the [DESCRIBE HISTORY](https://docs.delta.io/latest/delta-utility.html#describe-history) command to see metadata around the changes\nthat were made.\n\n**Want to learn more about Delta Lake’s transaction log?**\n\nRead our blog post > Watch our tech talk >\n\n\n-----\n\n**How to Use Schema**\n**Enforcement and**\n**Evolution**\n\nAs business problems and requirements evolve over time, so does the structure of\nyour data. With Delta Lake, incorporating new columns or objects is easy; users have\naccess to simple semantics to control the schema of their tables. At the same time,\nit is important to call out the importance of schema enforcement to prevent users\nfrom accidentally polluting their tables with mistakes or garbage data in addition to\nschema evolution, which enables them to automatically add new columns of rich data\nwhen those columns belong.\n\n**Schema enforcement rejects any new columns or other schema changes that**\n**aren’t compatible with your table.** By setting and upholding these high standards,\nanalysts and engineers can trust that their data has the highest levels of integrity and\ncan reason about it with clarity, allowing them to make better business decisions.\n\nOn the flip side of the coin, schema evolution complements enforcement by making it\neasy for intended schema changes to take place automatically. 
After all, it shouldn’t\nbe hard to add a column.\n\nSchema enforcement is the yin to schema evolution’s yang. When used together,\nthese features make it easier than ever to block out the noise and tune in to the signal.\n\n**Understanding table schemas**\nEvery DataFrame in Apache Spark contains a schema, a blueprint that defines the\nshape of the data, such as data types and columns, and metadata. With Delta Lake,\nthe table’s schema is saved in JSON format inside the transaction log.\n\n\n-----\n\n**What is schema enforcement?**\nSchema enforcement, or schema validation, is a safeguard in Delta Lake that ensures\ndata quality by rejecting writes to a table that don’t match the table’s schema.\n\nLike the front-desk manager at a busy restaurant who only accepts reservations, it\nchecks to see whether each column of data inserted into the table is on its list of\nexpected columns (in other words, whether each one has a “reservation”), and rejects\nany writes with columns that aren’t on the list.\n\n**How does schema enforcement work?**\nDelta Lake uses **schema validation on write,** which means that all new writes to a\ntable are checked for compatibility with the target table’s schema at write time. If the\nschema is not compatible, Delta Lake cancels the transaction altogether (no data is\nwritten), and raises an exception to let the user know about the mismatch.\n\nTo determine whether a write to a table is compatible, Delta Lake uses the following\nrules. The DataFrame to be written cannot contain:\n\n**• Any additional columns that are not present in the target table’s schema.**\nConversely, it’s OK if the incoming data doesn’t contain every column in the table —\nthose columns will simply be assigned null values.\n\n**• \u0007Column data types that differ from the column data types in the target table.**\nIf a target table’s column contains StringType data, but the corresponding column\nin the DataFrame contains IntegerType data, schema enforcement will raise an\nexception and prevent the write operation from taking place.\n\n**• Column names that differ only by case.** This means that you cannot have columns\nsuch as “Foo” and “foo” defined in the same table. While Spark can be used in case\nsensitive or insensitive (default) mode, Delta Lake is case-preserving but insensitive\nwhen storing the schema. [Parquet](https://databricks.com/glossary/what-is-parquet) is case sensitive when storing and returning\ncolumn information. To avoid potential mistakes, data corruption or loss issues (which\nwe’ve personally experienced at Databricks), we decided to add this restriction.\n\n\n-----\n\nRather than automatically adding the new columns, Delta Lake enforces the schema,\nand stops the write from occurring. To help identify which column(s) caused the\nmismatch, Spark prints out both schemas in the stack trace for comparison.\n\n**How is schema enforcement useful?**\nBecause it’s such a stringent check, schema enforcement is an excellent tool to use\nas a gatekeeper for a clean, fully transformed data set that is ready for production or\nconsumption. It’s typically enforced on tables that directly feed:\n\n- Machine learning algorithms\n\n- BI dashboards\n\n- Data analytics and visualization tools\n\n- Any production system requiring highly structured,\nstrongly typed, semantic schemas\n\nIn order to prepare their data for this final hurdle, many users employ a simple multihop architecture that progressively adds structure to their tables. 
To learn more, take\na look at [Productionizing Machine Learning With Delta Lake.](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n\n**What is schema evolution?**\nSchema evolution is a feature that allows users to easily change a table’s current\nschema to accommodate data that is changing over time. Most commonly, it’s used\nwhen performing an append or overwrite operation, to automatically adapt the\nschema to include one or more new columns.\n\n**How does schema evolution work?**\nFollowing up on the example from the previous section, developers can\neasily use schema evolution to add the new columns that were previously\nrejected due to a schema mismatch. Schema evolution is activated by adding\n.option(‘mergeSchema’, ‘true’) to your .write or .writeStream\nSpark command, as shown in the following example.\n\n\n#Add the mergeSchema option\n\nloans.write.format( “delta” ) \\\n\n.option( “mergeSchema” , “true” ) \\\n\n.mode( “append” ) \\\n\n.save(DELTALAKE_SILVER_PATH)\n\nBy including the mergeSchema option in your query, any columns that are present\n\nin the DataFrame but not in the target table are automatically added to the end of the\n\nschema as part of a write transaction. Nested fields can also be added, and these\n\nfields will get added to the end of their respective struct columns as well.\n\nData engineers and scientists can use this option to add new columns (perhaps a\n\nnewly tracked metric, or a column of this month’s sales figures) to their existing ML\n\nproduction tables without breaking existing models that rely on the old columns.\n\nThe following types of schema changes are eligible for schema evolution during table\n\nappends or overwrites:\n\n- Adding new columns (this is the most common scenario)\n\n- \u0007Changing of data types from NullType → any other type, or upcasts from ByteType\n\n→ ShortType → IntegerType\n\nOther changes, not eligible for schema evolution, require that the schema and data\n\nare overwritten by adding .option(“overwriteSchema”,“true”) . Those\n\nchanges include:\n\n- Dropping a column\n\n- Changing an existing column’s data typeC (in place)\n\n- \u0007Renaming column names that differ onlyC by case (e.g., “Foo” and “foo”)\n\n\n-----\n\nFinally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE ) is fully\nsupported, allowing users to perform the following actions on table schemas:\n\n- Adding columns\n\n- Changing column comments\n\n- Setting table properties that define the behavior of the table, such as setting the\nretention duration of the transaction log\n\n**How is schema evolution useful?**\nSchema evolution can be used anytime you _intend_ to change the schema of your table\n(as opposed to where you accidentally added columns to your DataFrame that shouldn’t\nbe there). It’s the easiest way to migrate your schema because it automatically adds the\ncorrect column names and data types, without having to declare them explicitly.\n\n**Summary**\nSchema enforcement rejects any new columns or other schema changes that\naren’t compatible with your table. By setting and upholding these high standards,\nanalysts and engineers can trust that their data has the highest levels of integrity and\ncan reason about it with clarity, allowing them to make better business decisions.\nOn the flip side of the coin, schema evolution complements enforcement by making it\neasy for intended schema changes to take place automatically. 
After all, it shouldn’t be hard to add a column.

Schema enforcement is the yin to schema evolution’s yang. When used together, these features make it easier than ever to block out the noise and tune in to the signal.

**Want to learn more about schema enforcement and evolution?**

Read our blog post > Watch our tech talk >


-----

**Delta Lake**
**DML Internals**

Delta Lake supports data manipulation language (DML) commands including UPDATE, DELETE and MERGE. These commands simplify change data capture (CDC), audit and governance, and GDPR/CCPA workflows, among others.

In this chapter, we will demonstrate how to use each of these DML commands, describe what Delta Lake is doing behind the scenes, and offer some performance tuning tips for each one.

**Delta Lake DML: UPDATE**
You can use the UPDATE operation to selectively update any rows that match a filtering condition, also known as a predicate. The code below demonstrates how to use a predicate as part of an UPDATE statement. Note that Delta Lake offers APIs for Python, Scala and SQL, but for the purposes of this eBook, we’ll include only the SQL code.

-- Update events
UPDATE events SET eventType = 'click' WHERE buttonPress = 1


-----

**UPDATE: Under the hood**
Delta Lake performs an UPDATE on a table in two steps:

1. Find and select the files containing data that match the predicate and, therefore, need to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up this process.

2. Read each matching file into memory, update the relevant rows, and write out the result into a new data file.

Once Delta Lake has executed the UPDATE successfully, it adds a commit in the transaction log indicating that the new data file will be used in place of the old one from now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned” — recorded as a data file that applied to an older version of the table, but not the current version. Delta Lake is able to use it to provide data versioning and time travel.

**UPDATE + Delta Lake time travel = Easy debugging**
Keeping the old data files turns out to be very useful for debugging because you can use Delta Lake “time travel” to go back and query previous versions of a table at any time. In the event that you update your table incorrectly and want to figure out what happened, you can easily compare two versions of a table to one another to see what has changed.

SELECT * FROM events VERSION AS OF 11
EXCEPT ALL
SELECT * FROM events VERSION AS OF 12

**UPDATE: Performance tuning tips**
The main way to improve the performance of the UPDATE command on Delta Lake is to add more predicates to narrow down the search space. The more specific the search, the fewer files Delta Lake needs to scan and/or modify.

**Delta Lake DML: DELETE**
You can use the DELETE command to selectively delete rows based upon a predicate (filtering condition).

DELETE FROM events WHERE date < '2017-01-01'


-----

In the event that you want to revert an accidental DELETE operation, you can use time travel to roll back your table to the way it was.

**DELETE: Under the hood**
DELETE works just like UPDATE under the hood. Delta Lake makes two scans of the data: The first scan is to identify any data files that contain rows matching the predicate condition.
The second scan reads the matching data files into memory,\nat which point Delta Lake deletes the rows in question before writing out the newly\nclean data to disk.\n\nAfter Delta Lake completes a DELETE operation successfully, the old data files are\nnot deleted entirely — they’re still retained on disk, but recorded as “tombstoned” (no\nlonger part of the active table) in the Delta Lake transaction log. Remember, those old\nfiles aren’t deleted immediately because you might still need them to time travel back\nto an earlier version of the table. If you want to delete files older than a certain time\nperiod, you can use the VACUUM command.\n\n**DELETE + VACUUM: Cleaning up old data files**\nRunning the VACUUM command permanently deletes all data files that are:\n\n1. No longer part of the active table and\n2. \u0007Older than the retention threshold, which is seven days by default\n\nDelta Lake does not automatically VACUUM old files — you must run the command\nyourself, as shown below. If you want to specify a retention period that is different\nfrom the default of seven days, you can provide it as a parameter.\n\nfrom delta.tables import - deltaTable.\n\n# vacuum files older than 30 days(720 hours)\n\ndeltaTable.vacuum( 720 )\n\n\n-----\n\n**DELETE: Performance tuning tips**\nJust like with the UPDATE command, the main way to improve the performance of\na DELETE operation on Delta Lake is to add more predicates to narrow down the\nsearch space. The Databricks managed version of Delta Lake also features other\nperformance enhancements like improved [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) , the use of bloom filters, and\n[Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering) (multi-dimensional clustering). [Read more about Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n[on Databricks.](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n\n**Delta Lake DML: MERGE**\nThe Delta Lake MERGE command allows you to perform upserts, which are a mix of\nan UPDATE and an INSERT. To understand upserts, imagine that you have an existing\ntable (aka a target table), and a source table that contains a mix of new records and\nupdates to existing records.\n\n\n**Here’s how an upsert works:**\n\n- When a record from the source table matches a preexisting record in the target\ntable, Delta Lake updates the record.\n\n- When there is no such match, Delta Lake inserts the new record.\n\nThe Delta Lake MERGE command greatly simplifies workflows that can be complex\nand cumbersome with other traditional data formats like Parquet. 
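Expressed with the Delta Lake Python API, a basic upsert might look like the following sketch. The table name, join key and updatesDF source DataFrame are hypothetical placeholders, not objects defined elsewhere in this eBook.

from delta.tables import DeltaTable

target = DeltaTable.forName(spark, "target_table")  # hypothetical target table

(target.alias("t")
    .merge(updatesDF.alias("s"), "t.userId = s.userId")  # match on a key column
    .whenMatchedUpdateAll()       # update records that already exist
    .whenNotMatchedInsertAll()    # insert records that do not
    .execute())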
Common scenarios\nwhere merges/upserts come in handy include change data capture, GDPR/CCPA\ncompliance, sessionization, and deduplication of records.\n\n**For more information about upserts, read:**\n\n[Efficient Upserts Into Data Lakes With Databricks Delta](https://databricks.com/blog/2019/03/19/efficient-upserts-into-data-lakes-databricks-delta.html)\n\n[Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs](https://databricks.com/blog/2019/10/03/simple-reliable-upserts-and-deletes-on-delta-lake-tables-using-python-apis.html)\n\n[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n\n\n-----\n\n**MERGE: Under the hood**\nDelta Lake completes a MERGE in two steps:\n\n1. Perform an inner join between the target table and source table to select all files\nthat have matches.\n2. Perform an outer join between the selected files in the target and source tables\nand write out the updated/deleted/inserted data.\n\nThe main way that this differs from an UPDATE or a DELETE under the hood is that\nDelta Lake uses joins to complete a MERGE. This fact allows us to utilize some unique\nstrategies when seeking to improve performance.\n\n**MERGE: Performance tuning tips**\nTo improve performance of the MERGE command, you need to determine which of the\ntwo joins that make up the merge is limiting your speed.\n\nIf the inner join is the bottleneck (i.e., finding the files that Delta Lake needs to rewrite\ntakes too long), try the following strategies:\n\n- Add more predicates to narrow down the search space.\n\n- Adjust shuffle partitions.\n\n- Adjust broadcast join thresholds.\n\n- Compact the small files in the table if there are lots of them, but don’t compact them\ninto files that are too large, since Delta Lake has to copy the entire file to rewrite it.\n\n\n**On Databricks’ managed Delta Lake, use Z-Order optimize to exploit the**\n**locality of updates.**\n\nOn the other hand, if the outer join is the bottleneck (i.e., rewriting the actual files\nthemselves takes too long), try the strategies below.\n\n- **Adjust shuffle partitions:** Reduce files by enabling automatic repartitioning\nbefore writes (with Optimized Writes in Databricks Delta Lake).\n\n- **\u0007Adjust broadcast thresholds:** If you’re doing a full outer join, Spark cannot do a\nbroadcast join, but if you’re doing a right outer join, Spark can do one, and you can\nadjust the broadcast thresholds as needed.\n\n- **Cache the source table / DataFrame:** Caching the source table can speed up the\nsecond scan, but be sure not to cache the target table, as this can lead to cache\ncoherency issues.\n\nDelta Lake supports DML commands including UPDATE, DELETE and MERGE INTO, which\ngreatly simplify the workflow for many common big data operations. In this chapter, we\ndemonstrated how to use these commands in Delta Lake, shared information about\nhow each one works under the hood, and offered some performance tuning tips.\n\n**Want a deeper dive into DML internals, including snippets of code?**\n\n[Read our blog post >](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html)\n\n\n-----\n\n**How Delta Lake Quickly**\n**Processes Petabytes With**\n**Data Skipping and Z-Ordering**\n\nDelta Lake is capable of sifting through petabytes of data within seconds. 
Much of this speed is owed to two features: (1) data skipping and (2) Z-Ordering.

Combining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the amount of data that needs to be scanned to answer selective queries against large Delta tables, which typically translates into substantial runtime improvements and cost savings.

Using Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud data lakes can be queried in a matter of seconds by skipping files not relevant to the query. For example, 93.2% of the records in a 504 TB data set were skipped for a typical query in a real-world cybersecurity analysis use case, reducing query times by up to two orders of magnitude. In other words, Delta Lake can speed up your queries by as much as 100x.

**Want to see data skipping and Z-Ordering in action?**

Apple’s Dominique Brezinski and Databricks’ Michael Armbrust demonstrated how to use Delta Lake as a unified solution for data engineering and data science in the context of cybersecurity monitoring and threat response. Watch their keynote speech, [Threat Detection and Response at Scale.](https://databricks.com/session/keynote-from-apple)


-----

AND / OR / NOT are also supported, as well as “literal op column” predicates.

Even though data skipping kicks in when the above conditions are met, it may not always be effective. But, if there are a few columns that you frequently filter by and want to make sure that’s fast, then you can explicitly optimize your data layout with respect to skipping effectiveness by running the following command:

OPTIMIZE <table_name> [WHERE <partition_filter>]
ZORDER BY (<column> [, <column> ...])

**Exploring the details**
Apart from partition pruning, another common technique that’s used in the data warehousing world, but which Spark currently lacks, is I/O pruning based on [small materialized aggregates](https://dl.acm.org/doi/10.5555/645924.671173). In short, the idea is to keep track of simple statistics such as minimum and maximum values at a certain granularity that are correlated with I/O granularity. And we want to leverage those statistics at query planning time in order to avoid unnecessary I/O.

This is exactly what Delta Lake’s [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) feature is about. As new data is inserted into a Delta Lake table, file-level min/max statistics are collected for all columns (including nested ones) of supported types. Then, when there’s a lookup query against the table, Delta Lake first consults these statistics in order to determine which files can safely be skipped.

**Want to learn more about data skipping and Z-Ordering, including**
**how to apply it within a cybersecurity analysis?**

[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)


**Using data skipping and Z-Order clustering**
Data skipping and Z-Ordering are used to improve the performance of needle-in-the-haystack queries against huge data sets.
Data skipping is an automatic feature of\nDelta Lake, kicking in whenever your SQL queries or data set operations include filters\nof the form “column op literal,” where:\n\n- column is an attribute of some Delta Lake table, be it top-level or nested, whose\ndata type is string / numeric / date/ timestamp\n\n- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n\n\n- literal is an explicit (list of) value(s) of the same data type as a column\n\n\n-----\n\n**Features**\nUse Delta Lake’s robust features\nto reliably manage your data\n\n## CHAPTER 02\n\n\n-----\n\n**Why Use MERGE**\n**With Delta Lake?**\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , the next-generation engine built on top of Apache Spark, supports the\nMERGE command, which allows you to efficiently upsert and delete records in your\ndata lakes.\n\nMERGE dramatically simplifies how a number of common data pipelines can be built\n-- all the complicated multi-hop processes that inefficiently rewrote entire partitions\ncan now be replaced by simple MERGE queries.\n\nThis finer-grained update capability simplifies how you build your big data\npipelines for various use cases ranging from change data capture to GDPR. You\nno longer need to write complicated logic to overwrite tables and overcome a lack\nof snapshot isolation.\n\nWith changing data, another critical capability required is the ability to roll back, in\ncase of bad writes. Delta Lake also offers [rollback capabilities with the Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n[feature](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) , so that if you do a bad merge, you can easily roll back to an earlier version.\n\nIn this chapter, we’ll discuss common use cases where existing data might need to be\nupdated or deleted. We’ll also explore the challenges inherent to upserts and explain\nhow MERGE can address them.\n\n\n-----\n\n**When are upserts necessary?**\nThere are a number of common use cases where existing data in a data lake needs to\nbe updated or deleted:\n\n- \u0007 **General Data Protection Regulation (GDPR) compliance:** With the introduction of\nthe right to be forgotten (also known as data erasure) in GDPR, organizations must\nremove a user’s information upon request. This data erasure includes deleting user\ninformation in the data lake as well.\n\n- **Change data capture from traditional databases:** In a service-oriented\narchitecture, typically web and mobile applications are served by microservices\nbuilt on traditional SQL/NoSQL databases that are optimized for low latency. One\nof the biggest challenges organizations face is joining data across these various\nsiloed data systems, and hence data engineers build pipelines to consolidate\nall data sources into a central data lake to facilitate analytics. These pipelines\noften have to periodically read changes made on a traditional SQL/NoSQL table\nand apply them to corresponding tables in the data lake. Such changes can take\nvarious forms: Tables with slowly changing dimensions, change data capture of all\ninserted/updated/deleted rows, etc.\n\n- \u0007 **Sessionization:** Grouping multiple events into a single session is a common use\ncase in many areas ranging from product analytics to targeted advertising to\npredictive maintenance. 
Building continuous applications to track sessions and recording the results that write into data lakes is difficult because data lakes have always been optimized for appending data.

- **De-duplication:** A common data pipeline use case is to collect system logs into a Delta Lake table by appending data to the table. However, often the sources can generate duplicate records and downstream de-duplication steps are needed to take care of them.


-----

**Why upserts into data lakes have**
**traditionally been challenging**
Since data lakes are fundamentally based on files, they have always been optimized for appending data rather than for changing existing data. Hence, building the above use case has always been challenging.

Users typically read the entire table (or a subset of partitions) and then overwrite them. Therefore, every organization tries to reinvent the wheel for their requirement by handwriting complicated queries in SQL, Spark, etc. This approach is:

- **Inefficient:** Reading and rewriting entire partitions (or entire tables) to update a few records causes pipelines to be slow and costly. Hand-tuning the table layout and query optimization is tedious and requires deep domain knowledge.

- **Possibly incorrect:** Handwritten code modifying data is very prone to logical and human errors. For example, multiple pipelines concurrently modifying the same table without any transactional support can lead to unpredictable data inconsistencies and, in the worst case, data losses. Often, even a single handwritten pipeline can easily cause data corruptions due to errors in encoding the business logic.

- **Hard to maintain:** Fundamentally such handwritten code is hard to understand, keep track of and maintain. In the long term, this alone can significantly increase the organizational and infrastructural costs.

**Introducing MERGE in Delta Lake**
With Delta Lake, you can easily address the use cases above without any of the aforementioned problems using the following MERGE command:

MERGE INTO <target_table>
USING <source_table_or_view>
ON <merge_condition>
[ WHEN MATCHED [ AND <condition> ] THEN <matched_action> ]
[ WHEN NOT MATCHED [ AND <condition> ] THEN <not_matched_action> ]

where

<matched_action> =
DELETE |
UPDATE SET * |
UPDATE SET column1 = value1 [, column2 = value2 ...]

<not_matched_action> =
INSERT * |
INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])

Let’s understand how to use MERGE with a simple example. Suppose you have a [slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses. Furthermore, you have a table of new addresses for both existing and new users. To merge all the new addresses to the main user table, you can run the following:

MERGE INTO users
USING updates
ON users.userId = updates.userId
WHEN MATCHED THEN
UPDATE SET address = updates.address
WHEN NOT MATCHED THEN
INSERT (userId, address) VALUES (updates.userId, updates.address)

This will perform exactly what the syntax says -- for existing users (i.e., MATCHED clause), it will update the address column, and for new users (i.e., NOT MATCHED clause) it will insert all the columns. For large tables with TBs of data, this Delta Lake MERGE operation can be orders of magnitude faster than overwriting entire partitions or tables since Delta Lake reads only relevant files and updates them.
Specifically,\nDelta Lake's MERGE has the following advantages:\n\n\n\n[ WHEN MATCHED [ AND ] THEN ]\n\n\n-----\n\n**Simplifying use cases with MERGE**\n**Deleting data due to GDPR**\nComplying with the “right to be forgotten” clause of GDPR for data in data lakes cannot\nget any easier. You can set up a simple scheduled job with an example code, like\nbelow, to delete all the users who have opted out of your service.\n\nMERGE INTO users\n\nUSING opted_out_users\n\nON opted_out_users.userId = users.userId\n\nWHEN MATCHED THEN DELETE\n\n**Applying change data from databases**\nYou can easily apply all data changes — updates, deletes, inserts — generated from an\nexternal database into a Delta Lake table with the MERGE syntax as follows:\n\nMERGE INTO users\n\nUSING (\n\nSELECT userId, latest.address AS address, latest.deleted AS deleted FROM\n\n(\n\nSELECT userId, MAX(struct(TIME, address, deleted)) AS latest\n\nFROM changes GROUP BY userId\n\n)\n\n) latestChange\n\nON latestChange.userId = users.userId\n\nWHEN MATCHED AND latestChange.deleted = TRUE THEN\n\nDELETE\n\nWHEN MATCHED THEN\n\nUPDATE SET address = latestChange.address\n\nWHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n\nINSERT (userId, address) VALUES (userId, address)\n\n\n\n- **\u0007Fine-grained:** The operation rewrites data at the granularity of files and not\npartitions. This eliminates all the complications of rewriting partitions, updating\nthe Hive metastore with MSCK and so on.\n\n- **\u0007Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to\nrewrite, thus eliminating the need to hand-optimize your pipeline. Furthermore,\nDelta Lake with all its I/O and processing optimizations makes all the reading and\nwriting data by MERGE significantly faster than similar operations in Apache Spark.\n\n- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\nconcurrent writers update the data correctly with ACID transactions, and concurrent\nreaders always see a consistent snapshot of the data.\n\nHere is a visual explanation of how MERGE compares with handwritten pipelines.\n\n\n-----\n\n**Updating session information from streaming**\n**pipelines**\nIf you have streaming event data flowing in and if you want to sessionize the streaming\nevent data and incrementally update and store sessions in a Delta Lake table, you\ncan accomplish this using the foreachBatch in Structured Streaming and MERGE.\nFor example, suppose you have a Structured Streaming DataFrame that computes\nupdated session information for each user. 
You can start a streaming query that\napplies all the sessions update to a Delta Lake table as follows (Scala).\n\nstreamingSessionUpdatesDF.writeStream\n\n.foreachBatch { (microBatchOutputDF: DataFrame , batchId: Long ) =>\n\nmicroBatchOutputDF.createOrReplaceTempView(“updates”)\n\nmicroBatchOutputDF.sparkSession.sql(s”””\n\nMERGE INTO sessions\n\nUSING updates\n\nON sessions.sessionId = updates.sessionId\n\nWHEN MATCHED THEN UPDATE SET *\n\nWHEN NOT MATCHED THEN INSERT * “”” )\n\n}.start()\n\nFor a complete working example of each Batch and MERGE, see this notebook\n( [Azure](https://docs.azuredatabricks.net/_static/notebooks/merge-in-streaming.html) | [AWS](https://docs.databricks.com/_static/notebooks/merge-in-streaming.html) ).\n\n**Additional resources**\n\n[Tech Talk | Addressing GDPR and CCPA Scenarios With Delta Lake and Apache Spark](https://www.youtube.com/watch?v=tCPslvUjG1w)\n\n[Tech Talk | Using Delta as a Change Data Capture Source](https://www.youtube.com/watch?v=7y0AAQ6qX5w)\n\n[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n\n[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n\n[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n\n\n-----\n\n**Simple, Reliable Upserts and**\n**Deletes on Delta Lake Tables**\n**Using Python APIs**\n\nIn this chapter, we will demonstrate how to use Python and the new Python APIs in Delta\nLake within the context of an on-time flight performance scenario. We will show how\nto upsert and delete data, query old versions of data with time travel, and vacuum\nolder versions for cleanup.\n\n**How to start using Delta Lake**\nThe Delta Lake package is installable through PySpark by using the --packages\noption. In our example, we will also demonstrate the ability to VACUUM files and execute\nDelta Lake SQL commands within Apache Spark. As this is a short demonstration, we\nwill also enable the following configurations:\n\n\u0007spark.databricks.delta.retentionDurationCheck.enabled=false\n\nto allow us to vacuum files shorter than the default retention duration of seven days.\nNote, this is only required for the SQL command VACUUM\n\n\u0007spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension\n\nto enable Delta Lake SQL commands within Apache Spark; this is not required for\nPython or Scala API calls.\n\n# Using Spark Packages\n\n./bin/pyspark --packages io.delta:delta-core_2.11:0.4.0 --conf “spark.\n\ndatabricks.delta.retentionDurationCheck.enabled=false” --conf “spark.\n\nsql.extensions=io.delta.sql.DeltaSparkSessionExtension”\n\n\n-----\n\n**Loading and saving our Delta Lake data**\nThis scenario will be using the On-Time Flight Performance or Departure Delays data\nset generated from the RITA BTS Flight Departure Statistics; some examples of this data\nin action include the and OnTime Flight Performance with GraphFrames for Apache Spark™. 
Within PySpark, start [2014 Flight Departure Performance via d3.js Crossfilter](https://dennyglee.com/2014/06/06/2014-flight-departure-performance-via-d3-js-crossfilter/)\nby reading the data set.\n\n\u0007# Location variables\n\n\n/departureDelays.delta$ ls l\n\n.\n\n..\n\n_delta_log\n\npart- 00000 -df6f69ea-e6aa- 424b -bc0e-f3674c4f1906-c000.snappy.parquet\n\npart- 00001 -711bcce3-fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n\npart- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n\nPart- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n\n\ntripdelaysFilePath = “/root/data/departuredelays.csv”\n\npathToEventsTable = “/root/deltalake/departureDelays.delta”\n\nNow, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n\n# Read flight delay data\n\n\ndepartureDelays = spark.read \\\n\n.option( “header” , “true” ) \\\n\n.option( “inferSchema” , “true” ) \\\n\n.csv(tripdelaysFilePath)\n\nNext, let’s save our departureDelays data set to a Delta Lake table. By saving this table\nto Delta Lake storage, we will be able to take advantage of its features including ACID\ntransactions, unified batch and streaming and time travel.\n\n# Save flight delay data into Delta Lake format\n\ndepartureDelays \\\n\n.write \\\n\n\n# Load flight delay data in Delta Lake format\n\ndelays_delta = spark \\\n\n.read \\\n\n.format( “delta” ) \\\n\n.load( “departureDelays.delta” )\n\n# Create temporary view\n\ndelays_delta.createOrReplaceTempView(“delays_delta”)\n\n# How many flights are between Seattle and San Francisco\n\nspark.sql(“select count(1) from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’”).show()\n\n\n.format( “delta” ) \\\n\n.mode( “overwrite” ) \\\n\n.save( “departureDelays.delta” )\n\nNote, this approach is similar to how you would normally save Parquet data; instead\nof specifying format(“parquet”) , you will now specify format(“delta”) . If\nyou were to take a look at the underlying file system, you will notice four files created\nfor the departureDelays Delta Lake table.\n\n\n-----\n\nFinally, lets determine the number of flights originating from Seattle to San Francisco; in\nthis data set, there are 1698 flights.\n\n**In-place conversion to Delta Lake**\nIf you have existing Parquet tables, you have the ability to convert them to Delta Lake\nformat in place, thus not needing to rewrite your table. To convert the table, you can\nrun the following commands.\n\n\ndeltaTable DeltaTable .forPath(spark, pathToEventsTable\n\n)\n\n# Delete all on-time and early flights\n\ndeltaTable. delete ( “delay < 0” )\n\n# How many flights are between Seattle and San Francisco\n\nspark.sql( “select count(1) from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’” ).show()\n\n\nfrom delta.tables import - \n\n# Convert non partitioned parquet table at path ‘/path/to/table’\n\ndeltaTable = DeltaTable .convertToDelta(spark, “parquet.`/path/to/\n\ntable`” )\n\n# Convert partitioned parquet table at path ‘/path/to/table’ and\n\npartitioned by integer column named ‘part’\n\n\nAfter we delete (more on this below) all of the on-time and early flights, as you can\nsee from the preceding query there are 837 late flights originating from Seattle to\nSan Francisco. 
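For reference, on newer Delta Lake releases (0.7.0 and above on Apache Spark 3.x) the same delete can also be issued as a SQL statement against the path-based table. This is a minimal sketch that reuses the pathToEventsTable variable defined earlier:

# Equivalent SQL DELETE on the path-based Delta Lake table
spark.sql("DELETE FROM delta.`{}` WHERE delay < 0".format(pathToEventsTable))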
If you review the file system, you will notice there are more files even\nthough you deleted data.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 -a2a19ba4- 17e 9- 4931 - 9bbf - 3c9d4997780 b-c000.snappy.parquet\n\npart-00000-df6f69ea-e6aa-424b-bc0e-f3674c4f1906-c000.snappy.parquet\n\npart- 00001 - 711bcce3 -fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n\npart- 00001 -a0423a18- 62eb - 46b3 -a82f-ca9aac1f1e93-c000.snappy.parquet\n\npart- 00002 - 778ba97d - 89b8 - 4942 -a495-5f6238830b68-c000.snappy.parquet\n\npart- 00002 -bfaa0a2a- 0a31 - 4abf -aa63- 162402f802cc -c000.snappy.parquet\n\npart- 00003 - 1a791c4a - 6f11 - 49a8 -8837- 8093a3220581 -c000.snappy.parquet\n\npart- 00003 -b0247e1d-f5ce- 4b45 - 91cd - 16413c784a66 -c000.snappy.parquet\n\n\npartitionedDeltaTable = DeltaTable .convertToDelta(spark,\n\n“parquet.`/path/to/table`”, “part int” )\n\n**Delete our flight data**\nTo delete data from a traditional data lake table, you will need to:\n\n1. Select all of the data from your table not including the rows you want to delete\n2. Create a new table based on the previous query\n3. Delete the original table\n4. Rename the new table to the original table name for downstream dependencies\n\nInstead of performing all of these steps, with Delta Lake, we can simplify this process\nby running a DELETE statement. To show this, let’s delete all of the flights that had\narrived early or on-time (i.e., delay < 0).\n\n\nfrom delta.tables import - \n\nfrom pyspark.sql.functions import - \n\n# Access the Delta Lake table\n\n\n-----\n\nIn traditional data lakes, deletes are performed by rewriting the entire table\nexcluding the values to be deleted. With Delta Lake, deletes are instead performed\nby selectively writing new versions of the files containing the data to be deleted and\nonly marks the previous files as deleted. This is because Delta Lake uses multiversion\nconcurrency control (MVCC) to do atomic operations on the table: For example, while\none user is deleting data, another user may be querying the previous version of the\ntable. This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\nand query previous versions as we will see later.\n\n**Update our flight data**\nTo update data from your traditional Data Lake table, you will need to:\n\n1. Select all of the data from your table not including the rows you want to modify\n2. Modify the rows that need to be updated/changed\n3. Merge these two tables to create a new table\n4. Delete the original table\n5. Rename the new table to the original table name for downstream dependencies\n\nInstead of performing all of these steps, with Delta Lake, we can simplify this\nprocess by running an UPDATE statement. To show this, let’s update all of the flights\noriginating from Detroit to Seattle.\n\n\nWith the Detroit flights now tagged as Seattle flights, we now have 986 flights\noriginating from Seattle to San Francisco. If you were to list the file system for\nyour departureDelays folder (i.e., $../departureDelays/ls -l ), you will\nnotice there are now 11 files (instead of the 8 right after deleting the files and the four\nfiles after creating the table).\n\n**Merge our flight data**\nA common scenario when working with a data lake is to continuously append data to\nyour table. 
This often results in duplicate data (rows you do not want to be inserted\ninto your table again), new rows that need to be inserted, and some rows that need to\nbe updated. With Delta Lake, all of this can be achieved by using the merge operation\n(similar to the SQL MERGE statement).\n\nLet’s start with a sample data set that you will want to be updated, inserted or\nde-duplicated with the following query.\n\n\n# Update all flights originating from Detroit to now be\n\n\noriginating from Seattle\n\ndeltaTable.update(“origin = ‘DTW’”, { “origin”: “’SEA’” } )\n\n\n# What flights between SEA and SFO for these date periods\n\nspark.sql( “select * from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n\n\n# How many flights are between Seattle and San Francisco\n\n\nThe output of this query looks like the following table. Note, the color-coding has been\nadded to clearly identify which rows are de-duplicated (blue), updated (yellow) and\ninserted (green).\n\n\nspark.sql( “select count(1) from delays_delta where origin = ‘SEA’\n\nand destination = ‘SFO’” ).show()\n\n\n-----\n\nNext, let’s generate our own merge_table that contains data we will insert, update\nor de-duplicate with the following code snippet.\n\nitems = [( 1010710 , 31 , 590 , ‘SEA’, ‘SFO’), ( 1010521 , 10 , 590 ,\n\n‘SEA’ , ‘SFO’ ),\n\n(1010822, 31, 590, ‘SEA’, ‘SFO’)]\n\n\nWith Delta Lake, this can be easily achieved via a merge statement as noted in the\nfollowing code snippet.\n\n# Merge merge_table with flights\n\ndeltaTable. alias( “flights” ) \\\n\n.merge(merge_table. alias ( “updates”),”flights.date =\n\nupdates.date” ) \\\n\n.whenMatchedUpdate(set = { “delay” : “updates.delay” } ) \\\n\n.whenNotMatchedInsertAll() \\\n\n.execute()\n\n# What flights between SEA and SFO for these date periods\n\nspark.sql( “select * from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n\n\ncols = [ ‘date’ , ‘delay’ , ‘distance’ , ‘origin’ , ‘destination’ ]\n\n\nmerge_table = spark.createDataFrame(items, cols)\n\nmerge_table.toPandas()\n\nIn the preceding table ( merge_table ), there are three rows with a unique date value:\n\n1. 1010521: This row needs to _update_ the _flights_ table with a new delay value (yellow)\n2. 1010710: This row is a _duplicate_ (blue)\n3. 1010832: This is a new row to be _inserted_ (green)\n\n\nAll three actions of de-duplication, update and insert were efficiently completed with\none statement.\n\n**View table history**\nAs previously noted, after each of our transactions (delete, update), there were more\nfiles created within the file system. 
This is because for each transaction, there are\ndifferent versions of the Delta Lake table.\n\n\n-----\n\nThis can be seen by using the DeltaTable.history() method as noted below\n\nNote: You can also perform the same task with SQL:\n\nspark.sql(“DESCRIBE HISTORY ‘” + pathToEventsTable + “’”).show()\n\nAs you can see, there are three rows representing the different versions of the table\n(below is an abridged version to help make it easier to read) for each of the operations\n(create table, delete and update):\n\n**Travel back in time with table history**\nWith Time Travel, you can review the Delta Lake table as of the version or timestamp.\nTo view historical data, specify the version or timestamp option; in the following code\nsnippet, we will specify the version option.\n\n\n# Load DataFrames for each version\n\ndfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n0 ).load( “departureDelays.delta” )\n\ndfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n\n1 ).load( “departureDelays.delta” )\n\ndfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n2 ).load( “departureDelays.delta” )\n\n# Calculate the SEA to SFO flight counts for each version of history\n\ncnt0 = dfv0. where( “origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\ncnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\ncnt2 = dfv2. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\n# Print out the value\n\nprint ( “SEA -> SFO Counts: Create Table: %s, Delete: %s, Update: %s” %\n\n(cnt0, cnt1, cnt2))\n\n## Output\n\nSEA -> SFO Counts : Create Table: 1698 , Delete: 837, Update: 986\n\nWhether for governance, risk management and compliance (GRC) or rolling back\nerrors, the Delta Lake table contains both the metadata (e.g., recording the fact that a\ndelete had occurred with these operators) and data (e.g., the actual rows deleted). But\nhow do we remove the data files either for compliance or size reasons?\n\n**Clean up old table versions with vacuum**\nThe [Delta Lake vacuum](https://docs.delta.io/0.7.0/delta-utility.html#vacuum) method will delete all of the rows (and files) by default that are\nolder than seven days’ reference. 
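Note that vacuuming with a retention window shorter than the default seven days (as we will do below) requires the retention duration check to be disabled, which is why it was turned off when PySpark was started at the beginning of this chapter. The setting can typically also be applied to an existing session, as in this sketch:

# Allow VACUUM with a retention shorter than the default seven days
spark.conf.set("spark.databricks.delta.retentionDurationCheck.enabled", "false")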
If you were to view the file system, you’ll notice the\n11 files for your table.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 - 5e52736b -0e63- 48f3 - 8d56 - 50f7cfa0494d -c000.snappy.parquet\n\npart- 00000 - 69eb53d5 - 34b4 - 408f -a7e4- 86e000428c37 -c000.snappy.parquet\n\n\n-----\n\npart- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n\npart- 00001 - 20893eed - 9d4f - 4c1f -b619- 3e6ea1fdd05f -c000.snappy.parquet\n\npart- 00001 - 9b68b9f6 - bad3 - 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n\npart- 00001 - d4823d2e - 8f9d - 42e3 - 918d - 4060969e5844 -c000.snappy.parquet\n\npart- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n\npart- 00002 - 3027786c - 20a9 - 4b19 - 868d -dc7586c275d4-c000.snappy.parquet\n\npart- 00002 -f2609f27- 3478 - 4bf9 -aeb7- 2c78a05e6ec1 -c000.snappy.parquet\n\npart- 00003 - 850436a6 -c4dd- 4535 -a1c0- 5dc0f01d3d55 -c000.snappy.parquet\n\nPart- 00003 -b9292122- 99a7 -4223-aaa9- 8646c281f199 -c000.snappy.parquet\n\nTo delete all of the files so that you only keep the current snapshot of data, you will specify a\nsmall value for the vacuum method (instead of the default retention of 7 days).\n\n# Remove all files older than 0 hours old.\n\ndeltaTable.vacuum( 0 )\n\nNote , you perform the same task via SQL syntax:¸\n\n# Remove all files older than 0 hours old\n\nspark.sql(“VACUUM ‘” + pathToEventsTable + “‘ RETAIN 0 HOURS”)\n\nOnce the vacuum has completed, when you review the file system you will notice fewer\nfiles as the historical data has been removed.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n\npart- 00001 - 9b68b9f6 -bad3- 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n\npart- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n\npart- 00003 -b9292122- 99a7 - 4223 -aaa9- 8646c281f199 -c000.snappy.parquet\n\nNote, the ability to time travel back to a version older than the retention period is lost\nafter running vacuum.\n\n\n-----\n\n**Time Travel for**\n**Large-Scale Data Lakes**\n\nTime travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . [Delta Lake](https://delta.io/) is an [open-source storage](https://github.com/delta-io/delta)\n[layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable\nmetadata handling, and unifies streaming and batch data processing. Delta Lake runs on\ntop of your existing data lake and is fully compatible with Apache Spark APIs.\n\nWith this feature, Delta Lake automatically versions the big data that you store in your\ndata lake, and you can access any historical version of that data. This temporal data\nmanagement simplifies your data pipeline by making it easy to audit, roll back data\nin case of accidental bad writes or deletes, and reproduce experiments and reports.\n\nYour organization can finally standardize on a clean, centralized, versioned big data\nrepository in your own cloud storage for your analytics.\n\n**Common challenges with changing data**\n\n- **Audit data changes:** Auditing data changes is critical both in terms of data\ncompliance as well as simple debugging to understand how data has changed over\ntime. 
Organizations moving from traditional data systems to big data technologies\nand the cloud struggle in such scenarios.\n\n- **Reproduce experiments and reports:** During model training, data scientists\nrun various experiments with different parameters on a given set of data. When\nscientists revisit their experiments after a period of time to reproduce the models,\ntypically the source data has been modified by upstream pipelines. A lot of times,\nthey are caught unaware by such upstream data changes and hence struggle to\nreproduce their experiments. Some scientists and organizations engineer best\n\n\n-----\n\npractices by creating multiple copies of the data, leading to increased storage\ncosts. The same is true for analysts generating reports.\n\n- **Rollbacks:** Data pipelines can sometimes write bad data for downstream consumers.\n\nThis can happen because of issues ranging from infrastructure instabilities to messy\ndata to bugs in the pipeline. For pipelines that do simple appends to directories or a\ntable, rollbacks can easily be addressed by date-based partitioning. With updates\nand deletes, this can become very complicated, and data engineers typically have\nto engineer a complex pipeline to deal with such scenarios.\n\n**Working with Time Travel**\nDelta Lake’s time travel capabilities simplify building data pipelines for the above use\ncases. Time Travel in Delta Lake improves developer productivity tremendously. It helps:\n\n- Data scientists manage their experiments better\n\n- Data engineers simplify their pipelines and roll back bad writes\n\n- Data analysts do easy reporting\n\nOrganizations can finally standardize on a clean, centralized, versioned big data\nrepository in their own cloud storage for analytics. We are thrilled to see what you will\nbe able to accomplish with this feature.\n\nAs you write into a Delta Lake table or directory, every operation is automatically\nversioned. You can access the different versions of the data two different ways:\n\n**1. Using a timestamp**\n**Scala syntax**\nYou can provide the timestamp or date string as an option to DataFrame reader:\n\nval df = spark.read\n\n.format( “delta” )\n\n.option( “timestampAsOf” , “2019-01-01” )\n\n.load( “/path/to/my/table” )\n\n\n-----\n\n**Python syntax**\n\ndf = spark.read \\\n\n.format( “delta” ) \\\n\n.option( “timestampAsOf” , “2019-01-01” ) \\\n\n.load( “/path/to/my/table” )\n\n**SQL syntax**\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01”\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01 01:30:00.000”\n\nIf the reader code is in a library that you don’t have access to, and if you are passing\ninput parameters to the library to read data, you can still travel back in time for a table\nby passing the timestamp in yyyyMMddHHmmssSSS format to the path:\n\nval inputPath = “/path/to/my/table@20190101000000000”\n\nval df = loadData(inputPath)\n\n// Function in a library that you don’t have access to\n\ndef loadData(inputPath : String ) : DataFrame = {\n\nspark.read\n\n.format(“delta”)\n\n.load(inputPath)\n\n}\n\ninputPath = “/path/to/my/table@20190101000000000”\n\ndf = loadData(inputPath)\n\n# Function in a library that you don’t have access to\n\ndef loadData(inputPath):\n\nreturn spark.read \\\n\n.format( “delta” ) \\\n\n.load(inputPath)\n\n\n-----\n\n**2. 
Using a version number**
In Delta Lake, every write has a version number, and you can use the version number to travel back in time as well.

**Scala syntax**

val df = spark.read
.format("delta")
.option("versionAsOf", "5238")
.load("/path/to/my/table")

val df = spark.read
.format("delta")
.load("/path/to/my/table@v5238")

**Python syntax**

df = spark.read \
.format("delta") \
.option("versionAsOf", "5238") \
.load("/path/to/my/table")

df = spark.read \
.format("delta") \
.load("/path/to/my/table@v5238")

**SQL syntax**

SELECT count(*) FROM my_table VERSION AS OF 5238


-----

**Audit data changes**
You can look at the history of table changes using the DESCRIBE HISTORY command or through the UI.

**Reproduce experiments and reports**
Time travel also plays an important role in machine learning and data science. Reproducibility of models and experiments is a key consideration for data scientists because they often create hundreds of models before they put one into production, and in that time-consuming process would like to go back to earlier models. However, because data management is often separate from data science tools, this is really hard to accomplish.

Databricks solves this reproducibility problem by integrating Delta Lake’s Time Travel capabilities with [MLflow](https://mlflow.org/), an open-source platform for the machine learning lifecycle. For reproducible machine learning training, you can simply log a timestamped URL to the path as an MLflow parameter to track which version of the data was used for each training job.

This enables you to go back to earlier settings and data sets to reproduce earlier models. You neither need to coordinate with upstream teams on the data nor worry about cloning data for different experiments. This is the power of unified analytics, whereby data science is closely married with data engineering.

**Rollbacks**
Time travel also makes it easy to do rollbacks in case of bad writes. For example, if your GDPR pipeline job had a bug that accidentally deleted user information, you can easily fix the pipeline:

INSERT INTO my_table
SELECT * FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1)
WHERE userId = 111


-----

You can also fix incorrect updates as follows:

-- Will use the latest version of the table for all operations below
MERGE INTO my_table target
USING my_table TIMESTAMP AS OF date_sub(current_date(), 1) source
ON source.userId = target.userId
WHEN MATCHED THEN UPDATE SET *

If you simply want to roll back to a previous version of your table, you can do so with either of the following commands:

RESTORE TABLE my_table VERSION AS OF [version_number]

RESTORE TABLE my_table TIMESTAMP AS OF [timestamp]

**Pinned view of a continuously updating**
**Delta Lake table across multiple downstream jobs**
With AS OF queries, you can now pin the snapshot of a continuously updating Delta Lake table for multiple downstream jobs.
Consider a situation where a Delta Lake table is being continuously updated, say every 15 seconds, and there is a downstream job that periodically reads from this Delta Lake table and updates different destinations. In such scenarios, typically you want a consistent view of the source Delta Lake table so that all destination tables reflect the same state.

You can now easily handle such scenarios as follows:

version = spark.sql("SELECT max(version) FROM (DESCRIBE HISTORY my_table)").collect()

data = spark.table("my_table@v%s" % version[0][0])
data.where("event_type = e1").write.jdbc("table1")
data.where("event_type = e2").write.jdbc("table2")
...
data.where("event_type = e10").write.jdbc("table10")

**Queries for time series analytics made simple**
Time travel also simplifies time series analytics. For example, if you want to find out how many new customers you added over the last week, your query could be a very simple one like this:

SELECT count(distinct userId) - (
SELECT count(distinct userId)
FROM my_table TIMESTAMP AS OF date_sub(current_date(), 7))
FROM my_table

**Additional resources**

[Tech Talk | Diving Into Delta Lake: Unpacking the Transaction Log](https://databricks.com/discover/diving-into-delta-lake-talks/unpacking-transaction-log)

[Tech Talk | Getting Data Ready for Data Science With Delta Lake and MLflow](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks/getting-data-ready-data-science-delta-lake-mlflow)

[Data + AI Summit Europe 2020 | Data Time Travel by Delta Time Machine](https://databricks.com/session_eu20/data-time-travel-by-delta-time-machine-2)

[Spark + AI Summit NA 2020 | Machine Learning Data Lineage With MLflow and Delta Lake](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)

[Productionizing Machine Learning With Delta Lake](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)


-----

**Easily Clone Your Delta Lake**
**for Testing, Sharing and ML**
**Reproducibility**

Delta Lake has a feature called **Table Cloning**, which makes it easy to test, share and recreate tables for ML reproducibility. Creating copies of tables in a data lake or data warehouse has several practical uses. However, given the volume of data in tables in a data lake and the rate of its growth, making physical copies of tables is an expensive operation.

Delta Lake now makes the process simpler and cost-effective with the help of table clones.

**What are clones?**
Clones are replicas of a source table at a given point in time. They have the same metadata as the source table: same schema, constraints, column descriptions, statistics and partitioning. However, they behave as a separate table with a separate lineage or history. Any changes made to clones only affect the clone and not the source. Any changes that happen to the source during or after the cloning process also do not get reflected in the clone due to Snapshot Isolation. In Delta Lake we have two types of clones: shallow or deep.

**Shallow clones**
A _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the table being cloned; the data files of the table itself are not copied.
This type of cloning\ndoes not create another physical copy of the data resulting in minimal storage costs.\nShallow clones are inexpensive and can be extremely fast to create.\n\n\n-----\n\nThese clones are not self-contained and depend on the source from which they were\ncloned as the source of data. If the files in the source that the clone depends on are removed,\nfor example with VACUUM, a shallow clone may become unusable. Therefore, shallow\nclones are typically used for short-lived use cases such as testing and experimentation.\n\n**Deep clones**\nShallow clones are great for short-lived use cases, but some scenarios require a\nseparate and independent copy of the table’s data. A deep clone makes a full copy of\nthe metadata and the data files of the table being cloned. In that sense, it is similar in\nfunctionality to copying with a CTAS command ( CREATE TABLE.. AS… SELECT… ).\nBut it is simpler to specify since it makes a faithful copy of the original table at the\nspecified version, and you don’t need to re-specify partitioning, constraints and other\ninformation as you have to do with CTAS. In addition, it is much faster, robust and can\nwork in an incremental manner against failures.\n\nWith deep clones, we copy additional metadata, such as your streaming application\ntransactions and COPY INTO transactions, so you can continue your ETL applications\nexactly where it left off on a deep clone.\n\n**Where do clones help?**\nSometimes I wish I had a clone to help with my chores or magic tricks. However, we’re\nnot talking about human clones here. There are many scenarios where you need a\ncopy of your data sets — for exploring, sharing or testing ML models or analytical\nqueries. Below are some examples of customer use cases.\n\n**Testing and experimentation with a production table**\nWhen users need to test a new version of their data pipeline they often have to rely\non sample test data sets that are not representative of all the data in their production\nenvironment. Data teams may also want to experiment with various indexing techniques\nto improve the performance of queries against massive tables. These experiments and\n\n\ntests cannot be carried out in a production environment without risking production\ndata processes and affecting users.\n\nIt can take many hours or even days, to spin up copies of your production tables for a test\nor a development environment. Add to that, the extra storage costs for your development\nenvironment to hold all the duplicated data — there is a large overhead in setting a test\nenvironment reflective of the production data. With a shallow clone, this is trivial:\n\n-- SQL\n\nCREATE TABLE delta.`/some/test/location` SHALLOW CLONE prod.events\n\n# Python\n\nDeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n\nisShallow=True)\n\n// Scala\n\nDeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n\nisShallow=true)\n\nAfter creating a shallow clone of your table in a matter of seconds, you can start\nrunning a copy of your pipeline to test out your new code, or try optimizing your table\nin different dimensions to see how you can improve your query performance, and much\nmuch more. 
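For instance, on Databricks you might experiment with Z-Ordering the clone to see how clustering affects query performance; the event_type column here is a hypothetical example.

# Try an indexing experiment against the clone rather than the production table
spark.sql("OPTIMIZE delta.`/some/test/location` ZORDER BY (event_type)")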
These changes will only affect your shallow clone, not your original table.\n\n**Staging major changes to a production table**\nSometimes, you may need to perform some major changes to your production table.\nThese changes may consist of many steps, and you don’t want other users to see the\nchanges that you’re making until you’re done with all of your work. A shallow clone can\nhelp you out here:\n\n\n-----\n\n-- SQL\n\nCREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n\nDELETE FROM temp.staged_changes WHERE event_id is null;\n\nUPDATE temp.staged_changes SET change_date = current_date()\n\nWHERE change_date is null;\n\n...\n\n-- Perform your verifications\n\nOnce you’re happy with the results, you have two options. If no other change has\nbeen made to your source table, you can replace your source table with the clone.\nIf changes have been made to your source table, you can merge the changes into\nyour source table.\n\n-- If no changes have been made to the source\n\nREPLACE TABLE prod.events CLONE temp.staged_changes;\n\n-- If the source table has changed\n\nMERGE INTO prod.events USING temp.staged_changes\n\nON events.event_id <=> staged_changes.event_id\n\nWHEN MATCHED THEN UPDATE SET *;\n\n-- Drop the staged table\n\nDROP TABLE temp.staged_changes;\n\n**Machine learning result reproducibility**\nComing up with an effective ML model is an iterative process. Throughout this process\nof tweaking the different parts of the model, data scientists need to assess the\naccuracy of the model against a fixed data set.\n\nThis is hard to do in a system where the data is constantly being loaded or updated. A\nsnapshot of the data used to train and test the model is required. This snapshot allows\nthe results of the ML model to be reproducible for testing or model governance purposes.\n\n\n-----\n\nWe recommend leveraging [Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) to run multiple experiments across a snapshot; an\nexample of this in action can be seen in [Machine Learning Data Lineage With MLflow](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n[and Delta Lake.](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n\nOnce you’re happy with the results and would like to archive the data for later retrieval,\nfor example, next Black Friday, you can use deep clones to simplify the archiving process.\nMLflow integrates really well with Delta Lake, and the autologging feature (mlflow.spark.\nautolog() ) will tell you which version of the table was used to run a set of experiments.\n\n# Run your ML workloads using Python and then\n\nDeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n\nstore_bf2020”)\n\n**Data migration**\nA massive table may need to be moved to a new, dedicated bucket or storage system\nfor performance or governance reasons. 
The original table will not receive new\nupdates going forward and will be deactivated and removed at a future point in time.\nDeep clones make the copying of massive tables more robust and scalable.\n\n-- SQL\n\nCREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n\nALTER TABLE prod.events SET LOCATION 'zz://my-new-bucket/events';\n\nWith deep clones, since we copy your streaming application transactions and\nCOPY INTO transactions, you can continue your ETL applications from exactly where\nthey left off after this migration!\n\n**Data sharing**\nIn an organization, it is often the case that users from different departments are\nlooking for data sets that they can use to enrich their analysis or models. You may\nwant to share your data with other users across the organization. But rather than\nsetting up elaborate pipelines to move the data to yet another store, it is often easier\nand economical to create a copy of the relevant data set for users to explore and\ntest the data to see if it is a fit for their needs without affecting your own production\nsystems. Here deep clones again come to the rescue.\n\n-- The following code can be scheduled to run at your convenience\n\nCREATE OR REPLACE TABLE data_science.events CLONE prod.events;\n\n**Data archiving**\nFor regulatory or archiving purposes, all data in a table needs to be preserved for a\ncertain number of years, while the active table retains data for a few months. If you\nwant your data to be updated as soon as possible, but you have a requirement to keep\ndata for several years, storing this data in a single table and performing time travel\nmay become prohibitively expensive.\n\nIn this case, archiving your data in a daily, weekly or monthly manner is a better\nsolution. The incremental cloning capability of deep clones will really help you here.\n\n-- The following code can be scheduled to run at your convenience\n\nCREATE OR REPLACE TABLE archive.events CLONE prod.events;\n\nNote that this table will have an independent history compared to the source table;\ntherefore, time travel queries on the source table and the clone may return different\nresults based on your frequency of archiving.\n\n\n-----\n\n**Looks awesome! Any gotchas?**\nJust to reiterate some of the gotchas mentioned above as a single list, here’s what you\nshould be wary of:\n\n- Clones are executed on a snapshot of your data. Any changes that are made to\nthe source table after the cloning process starts will not be reflected in the\nclone.\n\n- Shallow clones are not self-contained tables like deep clones. If the data is\ndeleted in the source table (for example through VACUUM), your shallow clone\nmay not be usable.\n\n- Clones have a separate, independent history from the source table. Time travel\nqueries on your source table and clone may not return the same result.\n\n- Shallow clones do not copy stream transactions or COPY INTO metadata. Use\ndeep clones to migrate your tables and continue your ETL processes from\nwhere they left off.\n\n**How can I use it?**\nShallow and deep clones support new advances in how data teams test and manage\ntheir modern cloud data lakes and warehouses. Table clones can help your team\nimplement production-level testing of their pipelines, fine-tune their indexing for optimal\nquery performance, create table copies for sharing — all with minimal overhead and\nexpense. If this is a need in your organization, we hope you will take table cloning for\na spin and give us your feedback — we look forward to hearing about new use cases and\nextensions you would like to see in the future.\n\n**Additional resource**\n\n[Simplifying Disaster Recovery With Delta Lake](https://databricks.com/session_na20/simplifying-disaster-recovery-with-delta-lake)\n\n\n-----\n\n**Enabling Spark SQL DDL**\n**and DML in Delta Lake on**\n**Apache Spark 3.0**\n\nThe release of [Delta Lake 0.7.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) coincided with the release of [Apache Spark 3.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) , thus\nenabling a new set of features that simplify using Delta Lake from SQL. Here\nare some of the key features.\n\n**Support for SQL DDL commands**\n**to define tables in the** **[Hive metastore](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)**\nYou can now define Delta tables in the [Hive](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore) metastore and use the table name in all\nSQL operations when creating (or replacing) tables.\n\n**Create or replace tables**\n\n-- Create table in the metastore\n\nCREATE TABLE events (\n\ndate DATE,\n\neventId STRING,\n\neventType STRING,\n\ndata STRING)\n\nUSING DELTA\n\nPARTITIONED BY (date)\n\nLOCATION '/delta/events'\n\n-- If a table with the same name already exists, the table is replaced\n\n-- with the new configuration, else it is created\n\nCREATE OR REPLACE TABLE events (\n\ndate DATE,\n\neventId STRING,\n\neventType STRING,\n\ndata STRING)\n\nUSING DELTA\n\nPARTITIONED BY (date)\n\nLOCATION '/delta/events'\n\n**Explicitly alter the table schema**\n\n-- Alter table and schema\n\nALTER TABLE table_name ADD COLUMNS (\n\ncol_name data_type\n\n[COMMENT col_comment]\n\n[FIRST|AFTER colA_name],\n\n...)\n\nYou can also use the Scala/Java/Python APIs:\n\n- DataFrame.saveAsTable(tableName) and DataFrameWriterV2\nAPIs ( [#307](https://github.com/delta-io/delta/issues/307) ).\n\n- DeltaTable.forName(tableName) API to create instances of\nio.delta.tables.DeltaTable , which is useful for executing\nUpdate/Delete/Merge operations in Scala/Java/Python.\n\n**Support for SQL Insert, Delete, Update and Merge**\nOne of the most frequent questions through our [Delta Lake Tech Talks](https://databricks.com/discover/diving-into-delta-lake-talks) was when\nwould DML operations such as delete, update and merge be available in Spark SQL?\nWait no more, these operations are now available in SQL! Below are examples of how\nyou can write delete, update and merge (insert, update, delete and de-duplication)\noperations using Spark SQL.\n\n-- Using append mode, you can atomically add new data to an existing\n\n-- Delta table\n\nINSERT INTO events SELECT * FROM newEvents\n\n-- To atomically replace all of the data in a table, you can use\n\n-- overwrite mode\n\nINSERT OVERWRITE events SELECT * FROM newEvents\n\n-- Delete events\n\nDELETE FROM events WHERE date < '2017-01-01'\n\n-- Update events\n\nUPDATE events SET eventType = 'click' WHERE eventType = 'clk'\n\n-- Upsert data to a target Delta\n\n-- table using merge\n\nMERGE INTO events\n\nUSING updates\n\nON events.eventId = updates.eventId\n\nWHEN MATCHED THEN UPDATE\n\nSET events.data = updates.data\n\nWHEN NOT MATCHED THEN INSERT (date, eventId, data)\n\nVALUES (date, eventId, data)\n\nIt is worth noting that the merge operation in Delta Lake supports more advanced\nsyntax than standard ANSI SQL syntax. For example, merge supports:\n\n- Delete actions -- Delete a target when matched with a source row. For example,\n"... WHEN MATCHED THEN DELETE ..."\n\n- Multiple matched actions with clause conditions -- Greater flexibility when target\nand source rows match. For example:\n\n...\n\nWHEN MATCHED AND events.shouldDelete THEN DELETE\n\nWHEN MATCHED THEN UPDATE SET events.data = updates.data\n\n- Star syntax -- Shorthand for setting target column value with the similarly named\nsource column. For example:\n\n...\n\nWHEN MATCHED THEN UPDATE SET *\n\nWHEN NOT MATCHED THEN INSERT *\n\n-- equivalent to updating/inserting with events.date = updates.date,\n\n-- events.eventId = updates.eventId, events.data = updates.data\n\n**Automatic and incremental Presto/Athena manifest**\n**generation**\nAs noted in [Query Delta Lake Tables From Presto and Athena, Improved Operations](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n[Concurrency, and Merge Performance,](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) Delta Lake allows other processing engines\nto read Delta Lake tables by using manifest files; the manifest files contain the list of the\nmost current version of files as of manifest generation. As described in the preceding\nchapter, you will need to:\n\n- Generate a Delta Lake manifest file\n\n- Configure Presto or Athena to read the generated manifests\n\n- Manually re-generate (update) the manifest file\n\nNew for Delta Lake 0.7.0 is the capability to update the manifest file automatically\nwith the following command:\n\nALTER TABLE delta.`pathToDeltaTable`\n\nSET TBLPROPERTIES(\n\ndelta.compatibility.symlinkFormatManifest.enabled=true\n\n)\n\n**Configuring your table through table properties**\nWith the ability to set table properties on your table by using ALTER TABLE SET\nTBLPROPERTIES, you can enable, disable or configure many features of Delta Lake,\nsuch as automated manifest generation. For example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY) , you can\nblock deletes and updates in a Delta table using delta.appendOnly=true .\n\nYou can also easily control the history of your Delta Lake table retention by the\nfollowing [properties](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html) :\n\n- delta.logRetentionDuration: Controls how long the history for a table\n(i.e., transaction log history) is kept. 
By default, 30 days of history is kept, but you may\nwant to alter this value based on your requirements (e.g., GDPR historical context).\n\n- delta.deletedFileRetentionDuration: Controls how long ago a file\nmust have been deleted before being a candidate for VACUUM. By default, data\nfiles older than seven days are deleted.\n\nAs of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to\nconfigure these properties.\n\nALTER TABLE delta.`pathToDeltaTable`\n\nSET TBLPROPERTIES(\n\ndelta.logRetentionDuration = "interval <interval>"\n\ndelta.deletedFileRetentionDuration = "interval <interval>"\n\n)\n\n**Support for adding user-defined metadata**\n**in Delta Lake table commits**\nYou can specify user-defined strings as metadata in commits made by Delta\nLake table operations, either using the DataFrameWriter option userMetadata or\nthe SparkSession configuration spark.databricks.delta.commitInfo.\nuserMetadata .\n\nIn the following example, we are deleting a user (1xsdf1) from our data lake per user\nrequest. To ensure we associate the user’s request with the deletion, we have also\nadded the DELETE request ID into the userMetadata.\n\n\n-----\n\nSET spark.databricks.delta.commitInfo.userMetadata={\n\n"GDPR":"DELETE Request 1x891jb23"\n\n};\n\nDELETE FROM user_table WHERE user_id = '1xsdf1'\n\nWhen reviewing the [history](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine) operations of the user table (user_table), you can easily\nidentify the associated deletion request within the transaction log.\n\n**Other highlights**\nOther highlights for the Delta Lake 0.7.0 release include:\n\n- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop\n3.2 libraries, which enables support for Azure Data Lake Storage Gen2.\n\n- Improved support for streaming one-time triggers — With Spark 3.0, we now\nensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) ( Trigger.Once ) processes all outstanding data\nin a Delta Lake table in a single micro-batch even if rate limits are set with the\nDataStreamReader option maxFilesPerTrigger.\n\nThere were a lot of great questions during the AMA concerning structured streaming\nand using trigger.once .\n\nFor more information, some good resources explaining this concept include:\n\n- [Running Streaming Jobs Once a Day for 10x Cost Savings](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n\n- [Beyond Lambda: Introducing Delta Architecture](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0) : Specifically the cost vs. latency\ntrade-off discussed here.\n\n**Additional resources**\n\n[Tech Talk | Delta Lake 0.7.0 + Spark 3.0 AMA](https://www.youtube.com/watch?v=xzKqjCB8SWU)\n\n[Tech Talks | Apache Spark 3.0 + Delta Lake](https://www.youtube.com/watch?v=x6RqJYqLoPI&list=PLTPXxbhUt-YWnAgh3RE8DOb46qZF57byx)\n\n[Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0](https://databricks.com/blog/2020/08/27/enabling-spark-sql-ddl-and-dml-in-delta-lake-on-apache-spark-3-0.html)\n\n\n-----\n\n**Lakehouse**\nCombining the best elements of data\nlakes and data warehouses\n\n## CHAPTER 03\n\n\n-----\n\n**What Is a**\n**Lakehouse?**\n\nOver the past few years at Databricks, we’ve seen a new data management architecture\nthat emerged independently across many customers and use cases: the **lakehouse.**\nIn this chapter, we’ll describe this new architecture and its 
advantages over previous\napproaches.\n\nData warehouses have a long history of decision support and business intelligence\napplications. Since its inception in the late 1980s, data warehouse technology\ncontinued to evolve and MPP architectures led to systems that were able to handle\nlarger data sizes.\n\nBut while warehouses were great for structured data, a lot of modern enterprises\nhave to deal with unstructured data, semi-structured data, and data with high variety,\nvelocity and volume. Data warehouses are not suited for many of these use cases, and\nthey are certainly not the most cost-efficient.\n\nAs companies began to collect large amounts of data from many different sources,\narchitects began envisioning a single system to house data for many different\nanalytic products and workloads.\n\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\nin a variety of formats. While suitable for storing data, data lakes lack some critical\nfeatures: They do not support transactions, they do not enforce data quality, and their\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\n\n\n-----\n\n**A lakehouse combines the best elements**\n**of data lakes and data warehouses**\nA lakehouse is a new data architecture that combines the best elements of data lakes\nand data warehouses.\n\nLakehouses are enabled by a new system design: implementing similar data structures and data management features to those in a data warehouse, directly on the\nkind of low-cost storage used for data lakes. They are what you would get if you had\nto redesign data warehouses in the modern world, now that cheap and highly reliable\nstorage (in the form of object stores) are available.\n\nA lakehouse has the following key features:\n\n- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\nbe reading and writing data concurrently. Support for ACID transactions ensures\nconsistency as multiple parties concurrently read or write data, typically using SQL.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\nwarehouses.\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\nrequire systems for diverse data applications including SQL analytics, real-time\nmonitoring, data science and machine learning. Most of the recent advances in\nAI have been in better models to process unstructured data (text, images, video,\naudio), but these are precisely the types of data that a data warehouse is not\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\nwarehouses, and other specialized systems such as streaming, time-series, graph\nand image databases. Having a multitude of systems introduces complexity and,\nmore importantly, introduces delay as data professionals invariably need to move\nor copy data between different systems.\n\n\n-----\n\n**\u0007Schema enforcement and governance:** The lakehouse should have a way to\nsupport schema enforcement and evolution, supporting DW schema paradigms\nsuch as star/snowflake-schemas. 
The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\n\n- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\nreduces staleness and improves recency, reduces latency and lowers the cost of\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **\u0007Storage is decoupled from compute:** In practice, this means storage and compute\nuse separate clusters, thus these systems are able to scale to many more\nconcurrent users and larger data sizes. Some modern data warehouses also have\nthis property.\n\n- **\u0007Openness:** The storage formats they use are open and standardized, such as\nParquet, and they provide an API so a variety of tools and engines, including\nmachine learning and Python/R libraries, can efficiently access the data directly.\n\n- **\u0007Support for diverse data types ranging from unstructured to structured data:**\nThe lakehouse can be used to store, refine, analyze and access data types needed\nfor many new data applications, including images, video, audio, semi-structured\ndata, and text.\n\n- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\nanalytics. Multiple tools might be needed to support all these workloads, but they all\nrely on the same data repository.\n\n- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\nfeatures. Tools for security and access control are basic requirements. Data governance\ncapabilities including auditing, retention and lineage have become essential particularly\nin light of recent privacy regulations. Tools that enable data discovery such as data\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\nfeatures only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**\n\n**Abstract**\nCloud object stores such as Amazon S3 are some of the largest and most costeffective storage systems on the planet, making the main attractive target to\nstore large data warehouses and data lakes. Unfortunately, their implementation\nas key-value stores makes it difficult to achieve ACID transactions and high\nperformance: Metadata operations, such as listing objects, are expensive, and\nconsistency guarantees are limited. In this paper, we present Delta Lake, an\nopen source ACID table storage layer over cloud object stores initially developed\nat Databricks. Delta Lake uses a transaction log that is compacted into Apache\nParquet format to provide ACID properties, time travel, and significantly faster\nmetadata operations for large tabular data sets (e.g., the ability to quickly search\nbillions of table partitions for those relevant to a query). It also leverages this\ndesign to provide high-level features such as automatic data layout optimization,\nupserts, caching, and audit logs. Delta Lake tables can be accessed from Apache\nSpark, Hive, Presto, Redshift, and other systems. 
Delta Lake is deployed at\nthousands of Databricks customers that process exabytes of data per day, with\nthe largest instances managing exabyte-scale data sets and billions of objects.\n\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu,\nMukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja Łuszczak,\nMichał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi,\nSameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n\nRead the full research paper on the [inner workings of the lakehouse](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores) [.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n\n\n-----\n\n**Some early examples**\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\nMicrosoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\nenables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\n[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\nexamples that focus primarily on BI and other SQL applications.\n\nCompanies that want to build and implement their own systems have access to open\nsource file formats (Delta Lake, [Apache Iceberg](https://iceberg.apache.org) , [Apache Hudi](https://hudi.apache.org) ) that are suitable for\nbuilding a lakehouse.\n\nMerging data lakes and data warehouses into a single system means that data teams\ncan move faster as they are able to use data without needing to access multiple systems.\nThe level of SQL support and integration with BI tools among these early lakehouses\nis generally sufficient for most enterprise data warehouses. Materialized views and\nstored procedures are available, but users may need to employ other mechanisms that\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\nimportant for “ [lift and shift scenarios](https://whatis.techtarget.com/definition/lift-and-shift) ,” which require systems that achieve semantics\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\nlibraries) for non-BI workloads like data science and machine learning. Data\nexploration and refinement are standard for many analytic and data science\napplications. Delta Lake is designed to let users incrementally improve the quality of\ndata in their lakehouse until it is ready for consumption.\n\n\nA note about technical building blocks. 
While distributed file systems can be\nused for the storage layer, object stores are more commonly used in lakehouses.\nObject stores provide low-cost, highly available storage that excels at massively\nparallel reads — an essential requirement for modern data warehouses.\n\n**From BI to AI**\nThe lakehouse is a new data management architecture that radically simplifies\nenterprise data infrastructure and accelerates innovation in an age when\nmachine learning is poised to disrupt every industry. In the past, most of the\ndata that went into a company’s products or decision-making was structured\ndata from operational systems, whereas today, many products incorporate\nAI in the form of computer vision and speech models, text mining and others.\nWhy use a lakehouse instead of a data lake for AI? A lakehouse gives you data\nversioning, governance, security and ACID properties that are needed even for\nunstructured data.\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\nnotebooks) over others so lakehouses will also need to improve their UX and their\nconnectors to popular tools so they can appeal to a variety of personas. These\nand other issues will be addressed as the technology continues to mature and\ndevelop. Over time, lakehouses will close these gaps while retaining the core\nproperties of being simpler, more cost-efficient and more capable of serving\ndiverse data applications.\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\nadopting the lakehouse pattern. The blog created a massive amount of interest\nfrom technology enthusiasts. While lots of people praised it as the next-generation\ndata architecture, some people thought the lakehouse is the same thing as\nthe data lake. Recently, several of our engineers and founders wrote a research\npaper that describes some of the core technological challenges and solutions that\nset the lakehouse architecture apart from the data lake, and it was accepted and\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\ncan read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\nthey would have said faster horses.” The crux of this statement is that people often\nenvision a better solution to a problem as an evolution of what they already know\nrather than rethinking the approach to the problem altogether. In the world of data\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\nstore data warehouses and data lakes. 
However, their nature as key-value stores\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\nperformance is hampered by expensive metadata operations (e.g., listing objects)\nand limited consistency guarantees.\n\nBased on the characteristics of cloud object stores, three approaches have emerged.\n\n**1. Data lakes**\nThe first is directories of files (i.e., data lakes) that store the table as a collection\nof objects, typically in columnar format such as Apache Parquet. It’s an attractive\napproach because the table is just a group of objects that can be accessed from\na wide variety of tools without a lot of additional data stores or systems. However,\nboth performance and consistency problems are common. Hidden data corruption\nis common due to failed transactions, eventual consistency leads to inconsistent\nqueries, latency is high, and basic management capabilities like table versioning and\naudit logs are unavailable.\n\n**2. Custom storage engines**\nThe second approach is custom storage engines, such as proprietary systems built for\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\nservice that’s able to provide a single source of truth. However, all I/O operations need\nto connect to this metadata service, which can increase cloud resource costs and\nreduce performance and availability. Additionally, it takes a lot of engineering work to\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\nand PyTorch, which can be challenging for data teams that use a variety of computing\nengines on their data. Engineering challenges can be exacerbated by unstructured\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. Finally, and most egregiously, the proprietary metadata service locks\ncustomers into a specific service provider, leaving customers to contend with\nconsistently high prices and expensive, time-consuming migrations if they decide to\nadopt a new approach later.\n\n**3. Lakehouse**\nWith Delta Lake, an open source ACID table storage layer atop cloud object stores,\nwe sought to build a car instead of a faster horse with not just a better data store,\nbut a fundamental change in how data is stored and used via the lakehouse. A\nlakehouse is a new architecture that combines the best elements of data lakes and\ndata warehouses. Lakehouses are enabled by a new system design: implementing\nsimilar data structures and data management features to those in a data warehouse,\ndirectly on the kind of low-cost storage used for data lakes. They are what you would\nget if you had to redesign storage engines in the modern world, now that cheap and\nhighly reliable storage (in the form of object stores) are available.\n\nDelta Lake maintains information about which objects are part of a Delta table in an\nACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\nthe cloud object store. This design allows clients to update multiple objects at once,\nreplace a subset of the objects with another, etc., in a serializable manner that still\nachieves high parallel read/write performance from the objects. The log also provides\nsignificantly faster metadata operations for large tabular data sets. 
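As a concrete, purely illustrative view of this design, the log can be inspected directly, since it is just a folder of files stored next to the data; the sketch below assumes a Databricks notebook and a hypothetical Delta table stored at /delta/events:\n\n# Python\n\n# The transaction log lives under the table directory as JSON commits\n\n# plus periodic Parquet checkpoints\n\ndisplay(dbutils.fs.ls("/delta/events/_delta_log"))\n\n# DESCRIBE HISTORY surfaces the same log as a table: one row per commit,\n\n# with the operation that produced it\n\nspark.sql("DESCRIBE HISTORY delta.`/delta/events`").select("version", "timestamp", "operation").show()\n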
Additionally, Delta\nLake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\nsnapshots or roll back erroneous updates), automatic data layout optimization, upserts,\ncaching, and audit logs. Together, these features improve both the manageability and\nperformance of working with data in cloud object stores, ultimately opening the door\nto the lakehouse architecture that combines the key features of data warehouses and\ndata lakes to create a better, simpler data architecture.\n\n\n-----\n\nToday, Delta Lake is used across thousands of Databricks customers, processing\nexabytes of structured and unstructured data each day, as well as many organizations\nin the open source community. These use cases span a variety of data sources and\napplications. The data types stored include Change Data Capture (CDC) logs from\nenterprise OLTP systems, application logs, time-series data, graphs, aggregate\ntables for reporting, and image or feature data for machine learning. The applications\ninclude SQL workloads (most commonly), business intelligence, streaming, data\nscience, machine learning and graph analytics. Overall, Delta Lake has proven itself to\nbe a good fit for most data lake applications that would have used structured storage\nformats like Parquet or ORC, and many traditional data warehousing workloads.\n\nAcross these use cases, we found that customers often use Delta Lake to significantly\nsimplify their data architecture by running more workloads directly against cloud\nobject stores, and increasingly, by creating a lakehouse with both data lake and\ntransactional features to replace some or all of the functionality provided by message\nqueues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\nAmazon Redshift).\n\n**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n\n- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance\n\nThrough the paper, you’ll gain a better understanding of Delta Lake and how it\nenables a wide range of DBMS-like performance and management features for data\nheld in low-cost cloud storage. As well as how the Delta Lake storage format and\naccess protocols make it simple to operate, highly available, and able to deliver highbandwidth access to the object store.\n\n\n-----\n\n**Understanding**\n**Delta Engine**\n\nThe Delta Engine ties together a 100% Apache Spark-compatible vectorized query\nengine to take advantage of modern CPU architecture with optimizations to Spark\n3.0’s query optimizer and caching capabilities that were launched as part of Databricks\nRuntime 7.0. Together, these features significantly accelerate query performance on\ndata lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\nadopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n\n**Scaling execution performance**\nOne of the big hardware trends over the last several years is that CPU clock speeds\nhave plateaued. 
The reasons are outside the scope of this chapter, but the takeaway\nis that we have to find new ways to process data faster beyond raw compute power.\nOne of the most impactful methods has been to improve the amount of data that can\nbe processed in parallel. However, data processing engines need to be specifically\narchitected to take advantage of this parallelism.\n\nIn addition, data teams are being given less and less time to properly model data as\nthe pace of business increases. Poorer modeling in the interest of better business\nagility drives poorer query performance. Naturally, this is not a desired state, and\norganizations want to find ways to maximize both agility and performance.\n\n\n-----\n\n**Announcing Delta Engine for**\n**high-performance query execution**\nDelta Engine accelerates the performance of Delta Lake for SQL and DataFrame\nworkloads through three components: an improved query optimizer, a caching\nlayer that sits between the execution layer and the cloud object storage, and a native\nvectorized execution engine that’s written in C++.\n\nThe improved query optimizer extends the functionality already in Spark 3.0 (cost-based\noptimizer, adaptive query execution, and dynamic runtime filters) with more advanced\nstatistics to deliver up to 18x increased performance in star schema workloads.\n\nDelta Engine’s caching layer automatically chooses which input data to cache for the\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\ndata teams today is the native execution engine, which we call Photon. (We know.\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\n\n-----\n\nDatabricks has been built to maximize the performance from the new changes in\nmodern cloud hardware. It brings performance improvements to all workload types\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\nBy linking these three components together, we think it will be easier for customers\nto understand how improvements in multiple places within the Databricks code\naggregate into significantly faster performance for analytics workloads on data lakes.\n\nWe’re excited about the value that Delta Engine delivers to our customers. 
While the\ntime and cost savings are already valuable, its role in the lakehouse pattern supports\nnew advances in how data teams design their data architectures for increased\nunification and simplicity.\n\nFor more information on the Delta Engine, watch this keynote address from\n[Spark + AI Summit 2020: Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n\n\n-----\n\n**Streaming**\nUsing Delta Lake to express\ncomputation on streaming data\n\n## CHAPTER 04\n\n\n-----\n\n**How Delta Lake Solves Common**\n**Pain Points in Streaming**\n\nThe pain points of a traditional streaming and data warehousing solution can be\nbroken into two groups: data lake and data warehouse pains.\n\n**Data lake pain points**\nWhile data lakes allow you to flexibly store an immense amount of data in a file system,\nthere are many pain points including (but not limited to):\n\n- Consolidation of streaming data from many disparate systems is difficult.\n\n- Updating data in a data lake is nearly impossible, and much of the streaming\ndata needs to be updated as changes are made. This is especially important in\nscenarios involving financial reconciliation and subsequent adjustments.\n\n- Query speeds for a data lake are typically very slow.\n\n- Optimizing storage and file sizes is very difficult and often requires complicated logic.\n\n**Data warehouse pain points**\nThe power of a data warehouse is that you have a persistent performant store of your\ndata. But the pain points for building modern continuous applications include (but are\nnot limited to):\n\n- Constrained to SQL queries (i.e., no machine learning or advanced analytics).\n\n- Accessing streaming data and stored data together is very difficult, if at all possible.\n\n- Data warehouses do not scale very well.\n\n- Tying compute and storage together makes using a warehouse very expensive.\n\n\n-----\n\n**How Delta Lake on Databricks solves these issues**\n[Delta Lake](https://docs.databricks.com/delta/index.html) is a unified data management system that brings data reliability and\nperformance optimizations to cloud data lakes. More succinctly, Delta Lake combines\nthe advantages of data lakes and data warehouses with Apache Spark™ to allow you\nto do incredible things.\n\n- Delta Lake, along with Structured Streaming, makes it possible to analyze\nstreaming and historical data together at high speeds.\n\n- When Delta Lake tables are used as sources and destinations of streaming big\ndata, it is easy to consolidate disparate data sources.\n\n- Upserts are supported on Delta Lake tables.\n\n- Delta Lake is ACID compliant, making it easy to create a compliant data solution.\n\n- Easily include machine learning scoring and advanced analytics into ETL\nand queries.\n\n- Decouples compute and storage for a completely scalable solution.\n\nIn the following use cases, we’ll share what this looks like in practice.\n\n\n-----\n\n**Simplifying Streaming Stock**\n**Data Analysis Using Delta Lake**\n\nReal-time analysis of stock data is a complicated endeavor. After all, there are many\nchallenges in maintaining a streaming system and ensuring transactional consistency\nof legacy and streaming data concurrently.\n\nThankfully, [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) helps solve many of the pain points of building a streaming\nsystem to analyze stock data in real time. 
In this section, we’ll share how to simplify\nthe streaming of stock data analysis using Delta Lake.\n\nIn the following diagram, you can see a high-level architecture that simplifies this\nproblem. We start by ingesting two different sets of data into two Delta Lake tables.\nThe two data sets are stock prices and fundamentals.\n\nAfter ingesting the data into their respective tables, we then join the data in an ETL\nprocess and write the data out into a third Delta Lake table for downstream analysis.\n\nDelta Lake helps solve these problems by combining the scalability, streaming and\naccess to the advanced analytics of Apache Spark with the performance and ACID\ncompliance of a data warehouse.\n\n\n-----\n\n**Implement your streaming**\n**stock analysis solution with Delta Lake**\nDelta Lake and Apache Spark do most of the work for our solution; you can try out the\nfull [notebook](https://pages.databricks.com/rs/094-YMS-629/images/streaming-stock-data-analysis-setup.html) and follow along with the code samples below.\n\nAs noted in the preceding diagram, we have two data sets to process — one for\nfundamentals and one for price data. To create our two Delta Lake tables, we specify\nthe .format('delta') against our Databricks File System ( [DBFS](https://docs.databricks.com/data/databricks-file-system.html) ) locations.\n\n# Create Fundamental Data (Databricks Delta table)\n\ndfBaseFund = spark.read.format('delta').load('/delta/stocksFundamentals')\n\n# Create Price Data (Databricks Delta table)\n\ndfBasePrice = spark.read.format('delta').load('/delta/stocksDailyPrices')\n\n\n-----\n\nWhile we’re updating the stockFundamentals and stocksDailyPrices,\nwe will consolidate this data through a series of ETL jobs into a consolidated view\n( stocksDailyPricesWFund ).\n\nWith the following code snippet, we can determine the start and end date of available\ndata and then combine the price and fundamentals data for that date range into DBFS.\n\nimport datetime\n\nfrom pyspark.sql import functions as func\n\n# Determine start and end date of available data\n\nrow = dfBasePrice.agg(\n\n    func.max(dfBasePrice.price_date).alias('maxDate'),\n\n    func.min(dfBasePrice.price_date).alias('minDate')\n\n).collect()[0]\n\nstartDate = row['minDate']\n\nendDate = row['maxDate']\n\n# Define our date range function\n\ndef daterange(start_date, end_date):\n\n    for n in range(int((end_date - start_date).days)):\n\n        yield start_date + datetime.timedelta(n)\n\n# Define combinePriceAndFund information by date\n\ndef combinePriceAndFund(theDate):\n\n    dfFund = dfBaseFund.where(dfBaseFund.price_date == theDate)\n\n    dfPrice = dfBasePrice.where(\n\n        dfBasePrice.price_date == theDate\n\n    ).drop('price_date')\n\n    # Drop the updated column\n\n    dfPriceWFund = dfPrice.join(dfFund, ['ticker']).drop('updated')\n\n    # Save data to DBFS\n\n    dfPriceWFund.write.format('delta').mode('append').save('/delta/stocksDailyPricesWFund')\n\n# Loop through dates to complete fundamentals + price ETL process\n\nfor single_date in daterange(\n\n    startDate, (endDate + datetime.timedelta(days=1))\n\n):\n\n    print('Starting ' + single_date.strftime('%Y-%m-%d'))\n\n    start = datetime.datetime.now()\n\n    combinePriceAndFund(single_date)\n\n    end = datetime.datetime.now()\n\n    print(end - start)\n\n\n-----\n\nNow we have a stream of consolidated fundamentals and price data that is being\npushed into [DBFS](https://docs.databricks.com/data/databricks-file-system.html) in the /delta/stocksDailyPricesWFund location. We can build a\nDelta Lake table by specifying .format("delta") against that DBFS location.\n\ndfPriceWithFundamentals = spark.readStream.format("delta").load("/delta/stocksDailyPricesWFund")\n\n# Create temporary view of the data\n\ndfPriceWithFundamentals.createOrReplaceTempView("priceWithFundamentals")\n\n\n-----\n\nNow that we have created our initial Delta Lake table, let’s create a view that will\nallow us to calculate the price/earnings ratio in real time (because of the underlying\nstreaming data updating our Delta Lake table).\n\n%sql\n\nCREATE OR REPLACE TEMPORARY VIEW viewPE AS\n\nselect ticker,\n\nprice_date,\n\nfirst(close) as price,\n\n(close/eps_basic_net) as pe\n\nfrom priceWithFundamentals\n\nwhere eps_basic_net > 0\n\ngroup by ticker, price_date, pe\n\n**Analyze streaming stock data in real time**\nWith our view in place, we can quickly analyze our data using Spark SQL.\n\n%sql\n\nselect *\n\nfrom viewPE\n\nwhere ticker == "AAPL"\n\norder by price_date\n\n\n-----\n\nAs the underlying source of this consolidated data set is a Delta Lake table, this view\nisn’t just showing the batch data but also any new streams of data that are coming in\nas per the following streaming dashboard.\n\nUnderneath the covers, Structured Streaming isn’t just writing the data to Delta Lake\ntables but also keeping the state of the distinct number of keys (in this case ticker\nsymbols) that need to be tracked.\n\nBecause you are using Spark SQL, you can execute aggregate queries at scale\nand in real time.\n\n%sql\n\nSELECT ticker, AVG(close) as Average_Close\n\nFROM priceWithFundamentals\n\nGROUP BY ticker\n\nORDER BY Average_Close\n\nIn closing, we demonstrated how to simplify streaming stock data analysis using\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . By combining Spark Structured Streaming and Delta Lake, we can use the\nDatabricks integrated workspace to create a performant, scalable solution that has\nthe advantages of both data lakes and data warehouses.\n\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) removes the data engineering complexities\ncommonly associated with streaming and transactional consistency, enabling\ndata engineering and data science teams to focus on understanding the trends in\ntheir stock data.\n\n\n-----\n\n**How Tilting Point Does Streaming**\n**Ingestion Into Delta Lake**\n\nTilting Point is a new-generation games partner that provides top development\nstudios with expert resources, services and operational support to optimize\nhigh-quality live games for success. Through its user acquisition fund and its\nworld-class technology platform, Tilting Point funds and runs performance\nmarketing management and live games operations to help developers achieve\nprofitable scale.\n\nBy leveraging Delta Lake, Tilting Point is able to make quality data readily\navailable for analytics to improve the business. Diego Link, VP of\nEngineering at Tilting Point, provided insights for this use case.\n\nThe team at Tilting Point was running daily and hourly batch jobs for reporting on\ngame analytics. 
They wanted to make their reporting near real-time, getting insights\nwithin 5–10 minutes.\n\nThey also wanted to make their in-game LiveOps decisions based on real-time player\nbehavior for giving real-time data to a bundles-and-offer system, provide up-to-theminute alerting on LiveOPs changes that actually might have unforeseen detrimental\neffects and even alert on service interruptions in game operations. The goal was to\nensure that the game experience was as robust as possible for their players.\n\nAdditionally, they had to store encrypted Personally Identifiable Information (PII) data\nseparately in order to maintain GDPR compliance.\n\n\n-----\n\n**How data flows and associated challenges**\nTilting Point has a proprietary software development kit that developers integrate\nwith to send data from game servers to an ingest server hosted in AWS. This service\nremoves all PII data and then sends the raw data to an Amazon Firehose endpoint.\nFirehose then dumps the data in JSON format continuously to S3.\n\nTo clean up the raw data and make it available quickly for analytics, the team\nconsidered pushing the continuous data from Firehose to a message bus (e.g.,\nKafka, Kinesis) and then using [Apache Spark’s Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) to continuously\nprocess data and write to Delta Lake tables.\n\nWhile that architecture sounds ideal for low latency requirements of processing\ndata in seconds, Tilting Point didn’t have such low latency needs for their ingestion\npipeline. They wanted to make the data available for analytics in a few minutes, not\nseconds. Hence they decided to simplify our architecture by eliminating a message\nbus and instead use S3 as a continuous source for their structured streaming job.\n\nBut the key challenge in using S3 as a continuous source is identifying files that\nchanged recently.\n\nListing all files every few minutes has two major issues:\n\n- **Higher latency:** Listing all files in a directory with a large number of files has high\noverhead and increases processing time.\n\n- **Higher cost:** Listing lots of files every few minutes can quickly add to the S3 cost.\n\n**Leveraging Structured Streaming with blob store as**\n**source and Delta Lake tables as sink**\nTo continuously stream data from cloud blob storage like S3, Tilting Point uses\n[Databricks’ S3-SQS source](https://docs.databricks.com/spark/latest/structured-streaming/sqs.html#optimized-s3-file-source-with-sqs) . The S3-SQS source provides an easy way to incrementally\nstream data from S3 without the need to write any state management code on what\nfiles were recently processed.\n\n\n-----\n\nThis is how Tilting Point’s ingestion pipeline looks:\n\n- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information\nto SQS via SNS.\n\n- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3SQS source reads the new file names that arrived in S3 from SQS and uses that\ninformation to read the actual file contents in S3. An example code below:\n\nspark.readStream \\\n\n.format( “s3-sqs” ) \\\n\n. option ( “fileFormat” , “json” ) \\\n\n. option ( “queueUrl” , ...) \\\n\n. schema (...) \\\n\n. 
load ()\n\n- Tilting Point’s structured streaming job then cleans up and transforms the data.\nBased on the game data, the streaming job uses the foreachBatch API of Spark\nstreaming and writes to 30 different Delta Lake tables.\n\n- The streaming job produces lots of small files. This affects performance of\ndownstream consumers. So, an optimize job runs daily to compact small files in\nthe table and store them as right file sizes so that consumers of the data have\ngood performance while reading the data from Delta Lake tables. Tilting Point\nalso runs a weekly optimize job for a second round of compaction. Architecture showing continuous data ingest into Delta Lake tables\n\n\n-----\n\nThe above Delta Lake ingestion architecture helps in the following ways:\n\n- **Incremental loading:** The S3-SQS source incrementally loads the new files in S3.\nThis helps quickly process the new files without too much overhead in listing files.\n\n- **No explicit file state management:** There is no explicit file state management\nneeded to look for recent files.\n\n- **Lower operational burden:** Since we use S3 as a checkpoint between Firehose\nand Structured Streaming jobs, the operational burden to stop streams and reprocess data is relatively low.\n\n- **Reliable ingestion:** Delta Lake uses [optimistic concurrency control](https://docs.databricks.com/delta/optimizations/isolation-level.html) to offer ACID\ntransactional guarantees. This helps with reliable data ingestion.\n\n- **File compaction:** One of the major problems with streaming ingestion is tables\nending up with a large number of small files that can affect read performance.\nBefore Delta Lake, we had to set up a different table to write the compacted\ndata. With Delta Lake, thanks to ACID transactions, we can compact the files and\nrewrite the data back to the same table safely.\n\n- **Snapshot isolation:** Delta Lake’s snapshot isolation allows us to expose the\ningestion tables to downstream consumers while data is being appended by a\nstreaming job and modified during compaction.\n\n- **Rollbacks:** In case of bad writes, [Delta Lake’s Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) helps us roll back to a\nprevious version of the table.\n\nIn this section, we walked through Tilting Point’s use cases and how they do\nstreaming ingestion using Databricks’ S3-SQS source into Delta Lake tables\nefficiently without too much operational overhead to make good quality data\nreadily available for analytics.\n\n\n-----\n\n**Building a Quality of Service**\n**Analytics Solution for Streaming**\n**Video Services**\n\nAs traditional pay TV , content owners have embraced directto-consumer (D2C) subscription and ad-supported streaming for monetizing their [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/)\nlibraries of content. For companies whose entire business model revolved around\nproducing great content, which they then licensed to distributors, the shift to now\nowning the entire glass-to-glass experience has required new capabilities, such as\nbuilding media supply chains for content delivery to consumers, supporting apps for\na myriad of devices and operating systems, and performing customer relationship\nfunctions like billing and customer service.\n\nWith most services renewing on a monthly basis, subscription service operators need\nto prove value to their subscribers at all times. 
General quality of streaming video\nissues (encompassing buffering, latency, pixelation, jitter, packet loss and the blank\nscreen) have significant business impacts, whether it’s increased [subscriber churn](https://www.streamingmedia.com/Articles/ReadArticle.aspx?ArticleID=112209) or\n[decreased video engagement](https://www.tvtechnology.com/opinions/why-buffering-remains-every-video-providers-worst-nightmare) .\n\nWhen you start streaming, you realize there are so many places where breaks can\nhappen and the viewer experience can suffer. There may be an issue at the source in\nthe servers on-premises or in the cloud; in transit at either the CDN level or ISP level\nor the viewer’s home network; or at the playout level with player/client issues. What\nbreaks at n x 104 concurrent streamers is different from what breaks at n x 105 or n\nx 106. There is no pre-release testing that can quite replicate real-world users and\ntheir ability to push even the most redundant systems to their breaking point as they\n\n\n-----\n\nchannel surf, click in and out of the app, sign on from different devices simultaneously\nand so on. And because of the nature of TV, things will go wrong during the most\nimportant, high-profile events drawing the largest audiences. If you start [receiving](https://downdetector.com/)\n[complaints on social media](https://downdetector.com/) , how can you tell if they are unique to that one user or\nrather regional or a national issue? If national, is it across all devices or only certain\ntypes (e.g., possibly the OEM updated the OS on an older device type, which ended up\ncausing compatibility issues with the client)?\n\nIdentifying, remediating and preventing viewer quality of experience issues becomes\na big data problem when you consider the number of users, the number of actions\nthey are taking and the number of handoffs in the experience (servers to CDN to ISP to\nhome network to client). Quality of Service (QoS) helps make sense of these streams\nof data so you can understand what is going wrong, where and why. Eventually you\ncan get into predictive analytics around what could go wrong and how to remediate\nit before anything breaks.\n\n**Databricks Quality of Service solution overview**\nThe aim of this solution is to provide the core for any streaming video platform that\nwants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\na Unified Data Analytics Platform for both the real-time insights and the advanced\nanalytics capabilities.\n\n[By using Databricks](https://databricks.com/customers) , streaming platforms can get faster insights by always\nleveraging the most complete and recent data sets powered by robust and reliable\ndata pipelines. This decreases time to market for new features by accelerating\ndata science using a collaborative environment. 
It provides support for managing\nthe end-to-end machine learning lifecycle and reduces operational costs across\nall cycles of software development by having a unified platform for both data\nengineering and data science.\n\n\n-----\n\n**Video QoS solution architecture**\nWith complexities like low-latency monitoring alerts and highly scalable infrastructure\nrequired for peak video traffic hours, the straightforward architectural choice was\nthe Delta Architecture — both standard big data architectures like Lambda and Kappa\nArchitectures have disadvantages around the operational effort required to maintain\nmultiple types of pipelines (streaming and batch) and lack support for a unified data\nengineering and data science approach.\n\nThe Delta Architecture is the next-generation paradigm that enables all the data\npersonas in your organization to be more productive:\n\n- Data engineers can develop data pipelines in a cost-efficient manner\ncontinuously without having to choose between batch and streaming\n\n- Data analysts can get near real-time insights and faster answers to their BI queries\n\n- Data scientists can develop better machine learning models using more reliable data\nsets with support for time travel that facilitates reproducible experiments and reports Delta Architecture using the “multi-hop” approach for data pipelines\n\n\n-----\n\nWriting data pipelines using the Delta Architecture follows the best practices of\nhaving a multi-layer “multi-hop” approach where we progressively add structure to\ndata: “Bronze” tables or Ingestion tables are usually raw data sets in the native format\n(JSON, CSV or txt), “Silver” tables represent cleaned/transformed data sets ready for\nreporting or data science, and “Gold” tables are the final presentation layer.\n\nFor the pure streaming use cases, the option of materializing the DataFrames in\nintermediate Delta Lake tables is basically just a trade-off between latency/SLAs and\ncost (an example being real-time monitoring alerts vs. 
updates of the recommender\nsystem based on new content).\n\nA streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n\nThe number of “hops” in this approach is directly impacted by the number of consumers\ndownstream, complexity of the aggregations (e.g., Structured Streaming enforces\ncertain limitations around chaining multiple aggregations) and the maximization of\noperational efficiency.\n\nThe QoS solution architecture is focused around best practices for data processing\nand is not a full video-on-demand (VoD) solution — with some standard components\nlike the “front door” service Amazon API Gateway being avoided from the high-level\narchitecture in order to keep the focus on data and analytics.\n\n\n-----\n\nHigh-level architecture for the QoS platform\n\n\n**Making your data ready for analytics**\nBoth sources of data included in the QoS solution (application events and CDN logs)\nare using the JSON format, great for data exchange — allowing you to represent\ncomplex nested structures, but not scalable and difficult to maintain as a storage\nformat for your data lake / analytics system.\n\n\nIn order to make the data directly queryable across the entire organization, the\nBronze to Silver pipeline (the “make your data available to everyone” pipeline) should\ntransform any raw formats into Delta Lake and include all the quality checks or data\nmasking required by any regulatory agencies.\n\n\n-----\n\nRaw format of the app events\n\n**Video applications events**\nBased on the architecture, the video application events are pushed directly to\nKinesis Streams and then just ingested to a Delta Lake append-only table without\nany changes to the schema.\n\nUsing this pattern allows a high number of consumers downstream to process the\ndata in a streaming paradigm without having to scale the throughput of the Kinesis\nstream. As a side effect of using a Delta Lake table as a sink (which supports [optimize](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-optimize.html) !),\nwe don’t have to worry about the way the size of the processing window will impact the\nnumber of files in your target table — known as the “small files” issue in the big data world.\n\nBoth the timestamp and the type of message are being extracted from the JSON\nevent in order to be able to partition the data and allow consumers to choose the\ntype of events they want to process. Again combining a single Kinesis stream for\nthe events with a Delta Lake “Events” table reduces the operational complexity while\nmaking things easier for scaling during peak hours.\n\n\nAll the details are extracted from JSON for the Silver table\n\n\n-----\n\n**CDN logs**\nThe CDN logs are delivered to S3, so the easiest way to process them is the Databricks\nAuto Loader, which incrementally and efficiently processes new data files as they\narrive in S3 without any additional setup.\n\nauto_loader_df = spark.readStream.format( “cloudFiles” ) \\\n\n.option( “cloudFiles.format” , “json” ) \\\n\n.option( “cloudFiles.region” , region) \\\n\n.load(input_location)\n\nanonymized_df = auto_loader_df. select ( ‘*’ , ip_\n\nanonymizer( ‘requestip’ ). 
alias ( ‘ip’ ))\\\n\n.drop( ‘requestip’ )\\\n\n.withColumn( “origin” , map_ip_to_location(col( ‘ip’ )))\n\nanonymized_df.writeStream \\\n\n.option( ‘checkpointLocation’ , checkpoint_location)\\\n\n.format( ‘delta’ ) \\\n\n.table(silver_database + ‘.cdn_logs’ )\n\nAs the logs contain IPs — considered personal data under the GDPR regulations — the\n“make your data available to everyone” pipeline has to include an anonymization step.\nDifferent techniques can be used, but we decided to just strip the last octet from IPv4\nand the last 80 bits from IPv6. On top, the data set is also enriched with information\naround the origin country and the ISP provider, which will be used later in the Network\nOperation Centers for localization.\n\n\n-----\n\n**Creating the Dashboard /**\n**Virtual Network Operation Centers**\nStreaming companies need to monitor network performance and the user experience\nas near real-time as possible, tracking down to the individual level with the ability to\nabstract at the segment level, easily defining new segments such as those defined by\ngeos, devices, networks and/or current and historical viewing behavior.\n\nFor streaming companies that has meant adopting the concept of Network Operation\nCenters (NOC) from telco networks for monitoring the health of the streaming\nexperience for their users at a macro level, flagging and responding to any issues\nearly on. At their most basic, NOCs should have dashboards that compare the current\nexperience for users against a performance baseline so that the product teams can\nquickly and easily identify and attend to any service anomalies.\n\nIn the QoS solution we have incorporated a [Databricks dashboard](https://docs.databricks.com/notebooks/dashboards.html) . BI tools can also\nbe effortlessly connected in order to build more complex visualizations, but based\non customer feedback, built-in dashboards are, most of the time, the fastest way to\npresent the insights to business users.\n\nThe aggregated tables for the NOC will basically be the Gold layer of our Delta\nArchitecture — a combination of CDN logs and the application events. Example of Network Operations Center dashboard\n\n\n-----\n\nThe dashboard is just a way to visually package the results of SQL queries or Python\n/ R transformation — each notebook supports multiple dashboards so in case of\nmultiple end users with different requirements we don’t have to duplicate the code —\nas a bonus the refresh can also be scheduled as a Databricks job.\n\nVisualization of the results of a SQL query\n\nLoading time for videos (time to first frame) allows better understanding of the\nperformance for individual locations of your CDN — in this case the AWS CloudFront\nEdge nodes — which has a direct impact in your strategy for improving this KPI —\neither by spreading the user traffic over multi-CDNs or maybe just implementing a\ndynamic origin selection in case of AWS CloudFront using Lambda@Edge.\n\n\n-----\n\nFailure to understand the reasons for high levels of buffering — and the poor video\nquality experience that it brings — has a significant impact on subscriber churn rate.\nOn top of that, advertisers are not willing to spend money on ads responsible for\nreducing the viewer engagement — as they add extra buffering on top, so the profits\non the advertising business usually are impacted too. 
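For readability, here is the CDN-log Auto Loader snippet shown a few paragraphs above restated as runnable PySpark (the extracted version is broken up by smart quotes and line wraps). The bucket, region and checkpoint values are illustrative, and the simplified `ip_anonymizer` / `map_ip_to_location` UDFs are stand-ins for the solution's own helpers, not their actual implementations.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, udf
from pyspark.sql.types import StringType

spark = SparkSession.builder.getOrCreate()

region = "us-east-1"                      # assumption: match your deployment
input_location = "s3://my-bucket/cdn-logs/"   # assumption
checkpoint_location = "/checkpoints/cdn_logs"  # assumption
silver_database = "qos_silver"                 # assumption

@udf(StringType())
def ip_anonymizer(ip):
    # Simplified stand-in: strip the last IPv4 octet (the text also strips
    # the last 80 bits of IPv6 addresses).
    return ".".join(ip.split(".")[:3] + ["0"]) if ip and "." in ip else None

@udf(StringType())
def map_ip_to_location(ip):
    # Placeholder for the geo / ISP enrichment lookup described in the text.
    return "unknown"

auto_loader_df = (spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.region", region)
    .load(input_location))

anonymized_df = (auto_loader_df
    .select("*", ip_anonymizer("requestip").alias("ip"))
    .drop("requestip")
    .withColumn("origin", map_ip_to_location(col("ip"))))

(anonymized_df.writeStream
    .option("checkpointLocation", checkpoint_location)
    .format("delta")
    .toTable(silver_database + ".cdn_logs"))  # the excerpt above uses .table(); .toTable() is the Spark 3.1+ form
```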
In this context, collecting as\nmuch information as possible from the application side is crucial to allow the analysis\nto be done not only at video level but also browser or even type / version of application.\n\nOn the content side, events for the application can provide useful information about\nuser behavior and overall quality of experience. How many people that paused a video\nhave actually finished watching that episode / video? What caused the stoppage: The\nquality of the content or delivery issues? Of course, further analyses can be done by\nlinking all the sources together (user behavior, performance of CDNs /ISPs) to not only\ncreate a user profile but also to forecast churn.\n\n\n-----\n\n**Creating (near) real-time alerts**\nWhen dealing with the velocity, volume and variety of data generated in video\nstreaming from millions of concurrent users, dashboard complexity can make it\nharder for human operators in the NOC to focus on the most important data at the\nmoment and zero-in on root cause issues. With this solution, you can easily set up\nautomated alerts when performance crosses certain thresholds that can help the\nhuman operators of the network as well as set off automatic remediation protocols\nvia a Lambda function. For example:\n\n- If a CDN is having latency much higher than baseline (e.g., if it’s more than 10%\nlatency vs. baseline average), initiate automatic CDN traffic shifts.\n\n- If more than [some threshold, e.g., 5%] of clients report playback errors, alert the\nproduct team that there is likely a client issue for a specific device.\n\n- If viewers on a certain ISP are having higher-than-average buffering and\npixelation issues, alert frontline customer representatives on responses and ways\nto decrease issues (e.g., set stream quality lower).\n\nFrom a technical perspective, generating real-time alerts requires a streaming\nengine capable of processing data real time and publish-subscribe service to push\nnotifications.\n\n\nupdates of web applications) or Amazon SQS for other consumers. The [custom for](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html)\n[each writer](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html) option makes the writing of a pipeline to send email notifications based\non a rule-based engine (e.g., validating the percentage of errors for each individual\ntype of app over a period of time) really straightforward.\n\ndef send_error_notification(row):\n\nsns_client = boto3.client( ‘sns’ , region)\n\nerror_message = ‘Number of errors for the App has exceeded the\n\nthreshold {}’ .format(row[ ‘percentage’ ])\n\nresponse = sns_client.publish(\n\nTopicArn =,\n\nMessage = error_message,\n\nSubject =,\n\nMessageStructure = ‘string’ )\n\n# Structured Streaming Job\n\ngetKinesisStream( “player_events” )\\\n\n.selectExpr( “type” , “app_type” )\\\n\n.groupBy( “app_type” )\\\n\n.apply(calculate_error_percentage)\\\n\n. where ( “percentage > {}” .format(threshold)) \\\n\n.writeStream\\\n\n. 
foreach (send_error_notification)\\\n\n.start()\n\n\nIntegrating microservices using Amazon SNS and Amazon SQS\n\nSending email notifications using AWS SNS\n\nThe QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\nby using Amazon SNS and its integrations with Amazon Lambda (see below for the\n\n\n-----\n\nOn top of the basic email use case, the Demo Player includes three widgets updated\nin real time using AWS AppSync: the number of active users, the most popular videos\nand the number of users concurrently watching a video.\n\nUpdating the application with the results of real-time aggregations\n\nThe QoS solution is applying a similar approach — Structured Streaming and Amazon\nSNS — to update all the values allowing for extra consumers to be plugged in using AWS\nSQS. This is a common pattern when huge volumes of events have to be enhanced and\nanalyzed; pre-aggregate data once and allow each service (consumer) to make their\nown decision downstream.\n\n**Next steps: machine learning**\nManually making sense of the historical data is important but is also very slow. If\nwe want to be able to make automated decisions in the future, we have to integrate\nmachine learning algorithms.\n\nAs a Unified Data Platform, Databricks empowers data scientists to build better data\nscience products using features like Runtime for Machine Learning with built-in\nor the integration with MLflow, the end-toend machine learning lifecycle management tool. support for [Hyperopt](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/index.html#hyperopt-overview) / [Horvod](https://docs.databricks.com/applications/machine-learning/train-model/distributed-training/horovod-runner.html) / [AutoML](https://databricks.com/product/automl-on-databricks)\n\n\n-----\n\nWe have already explored a few important use cases across our customer base while\nfocusing on the possible extensions to the QoS solution.\n\n**Point-of-failure prediction and remediation**\nAs D2C streamers reach more users, the costs of even momentary loss of service\nincreases. ML can help operators move from reporting to prevention by forecasting\nwhere issues could come up and remediating before anything goes wrong (e.g.,\na spike in concurrent viewers leads to switching CDNs to one with more capacity\nautomatically).\n\n**Customer churn**\nCritical to growing subscription services is keeping the subscribers you have. By\nunderstanding the quality of service at the individual level, you can add QoS as a\nvariable in churn and customer lifetime value models. Additionally, you can create\ncustomer cohorts for those who have had video quality issues in order to test\nproactive messaging and save offers.\n\n\n**Getting started with the Databricks streaming video**\n**QoS solution**\nProviding consistent quality in the streaming video experience is table stakes at this\npoint to keep fickle audiences with ample entertainment options on your platform.\nWith this solution we have sought to create a quick start for most streaming video\nplatform environments to embed this QoS real-time streaming analytics solution in\na way that:\n1. Scales to any audience size\n2. Quickly flags quality performance issues at key parts of the distribution workflow\n3. 
Is flexible and modular enough to easily customize for your audience and your\nneeds, such as creating new automated alerts or enabling data scientists to test\nand roll out predictive analytics and machine learning\n\nTo get started, download the notebooks for the [Databricks streaming video QoS](https://databricks.com/notebooks/QoS/index.html#00.config.html)\n[solution](https://databricks.com/notebooks/QoS/index.html#00.config.html) . For more guidance on how to unify batch and streaming data into a single\nsystem, view the [Delta Architecture webinar](https://pages.databricks.com/201908-WB-Delta-Architecture-A-Step-Beyond-Lambda-Architecture_Reg.html) .\n\n\n-----\n\n**Customer Use Cases**\nSee how customers are using\nDelta Lake to rapidly innovate\n\n## CHAPTER 05\n\n\n-----\n\n**Healthdirect Australia**\nProvides Personalized and Secure Online\nPatient Care With Databricks\n\nAs the shepherds of the National Health Services Directory (NHSD), Healthdirect\nis focused on leveraging terabytes of data covering time-driven, activity-based\nhealthcare transactions to improve health care services and support. With\ngovernance requirements, siloed teams and a legacy system that was difficult\nto scale, they moved to Databricks. This boosted data processing for downstream\nmachine learning while improving data security to meet HIPAA requirements.\n\n**Spotlight on Healthdirect**\n**Industry:** Healthcare and life sciences\n6x\nImprovement in data processing\n20M\nRecords ingested in minutes\n\n**Data quality and governance issues, silos, and the**\n**inability to scale**\nDue to regulatory pressures, Healthdirect Australia set forth to improve overall data\nquality and ensure a level of governance on top of that, but they ran into challenges\nwhen it came to data storage and access. On top of that, data silos were blocking the\nteam from efficiently preparing data for downstream analytics. These disjointed data\n\n\n-----\n\nsources impacted the consistency of data reads, as data was oftentimes out-of-sync\nbetween the various systems in their stack. The low-quality data also led to higher\nerror rates and processing inefficiencies. This fragmented architecture created\nsignificant operational overhead and limited their ability to have a comprehensive\nview of the patient.\n\nFurther, they needed to ingest over 1 billion data points due to a changing landscape\nof customer demand such as bookings, appointments, pricing, eHealth transaction\nactivity, etc. — estimated at over 1TB of data.\n\n“We had a lot of data challenges. We just couldn’t process efficiently enough. We\nwere starting to get batch overruns. We were starting to see that a 24-hour window\nisn’t the most optimum time in which we want to be able to deliver healthcare data\nand services,” explained Peter James, Chief Architect at Healthdirect Australia.\n\nUltimately, Healthdirect realized they needed to modernize their end-to-end process\nand tech stack to properly support the business.\n\n**Modernizing analytics with Databricks and Delta Lake**\nDatabricks provides Healthdirect Australia with a Unified Data Platform that simplifies\ndata engineering and accelerates data science innovation. The notebook environment\nenables them to make content changes in a controlled fashion rather than having to\nrun bespoke jobs each time.\n\n“Databricks has provided a big uplift for our teams and our data operations,” said\nJames. “The analysts were working directly with the data operations teams. 
They are\nable to achieve the same pieces of work together within the same time frames that\nused to take twice as long. They’re working together, and we’re seeing just a massive\nacceleration in the speed at which we can deliver service.”\n\n\n-----\n\nWith Delta Lake, they’ve created logical data zones: Landing, Raw, Staging and Gold.\nWithin these zones, they store their data “as is,” in their structured or unstructured\nstate, in Delta Lake tables. From there, they use a metadata-driven schema and hold\nthe data within a nested structure within that table. What this allows them to do is\nhandle data consistently from every source and simplifies the mapping of data to the\nvarious applications pulling the data.\n\nMeanwhile, through Structured Streaming, they were able to convert all of their\nETL batch jobs into streaming ETL jobs that could serve multiple applications\nconsistently. Overall, the advent of Spark Structured Streaming, Delta Lake and the\nDatabricks Unified Data Platform provides significant architectural improvements\nthat have boosted performance, reduced operational overheads and increased\nprocess efficiencies.\n\n\n**Faster data pipelines result in better patient-driven**\n**healthcare**\nAs a result of the performance gains delivered by Databricks and the improved data\nreliability through Delta Lake, Healthdirect Australia realized improved accuracy of\ntheir fuzzy name match algorithm from less than 80% with manual verification to 95%\nand no manual intervention.\n\nThe processing improvements with Delta Lake and Structured Streaming allowed\nthem to process more than 30,000 automated updates per month. Prior to Databricks,\nthey had to use unreliable batch jobs that were highly manual to process the same\nnumber of updates over a span of 6 months — a 6x improvement in data processing.\n\n“Databricks delivered the time to market as well as the analytics and operational\nuplift that we needed in order to be able to meet the new demands of the\nhealthcare sector.”\n\n– Peter James, Chief Architect, Healthdirect Australia\n\n\n-----\n\nThey were also able to increase their data load rate to 1 million records per minute,\nloading their entire 20 million record data set in 20 minutes. Before the adoption\nof Databricks, this used to take more than 24 hours to process the same 1 million\ntransactions, blocking analysts from making swift decisions to drive results.\n\nLast, data security, which was critical to meet compliance requirements, was greatly\nimproved. Databricks provides standard security accreditations like HIPAA, and\nHealthdirect was able to use Databricks to meet Australia’s security requirements.\nThis yielded significant cost reductions and gave them continuous data assurance\nby monitoring changes to access privileges like changes in roles, metadata-level\nsecurity changes, data leakage, etc.\n\n“Databricks delivered the time to market as well as the analytics and operational\nuplift that we needed in order to be able to meet the new demands of the healthcare\nsector,” said James.\n\nWith the help of Databricks, they have proven the value of data and analytics and how\nit can impact their business vision. 
With transparent access to data that boasts\nwell-documented lineage and quality, participation across various business and\nanalyst groups has increased — empowering teams to collaborate and more\neasily and quickly extract value from their data with the goal of improving\nhealthcare for everyone.\n\n\n-----\n\n**Comcast**\nUses Delta Lake and MLflow to\nTransform the Viewer Experience\n\n**Spotlight on Comcast**\n**Industry:** Media and entertainment\n10x\nReduction in overall compute costs to process data\n90%\nReduction in required DevOps resources to manage infrastructure\nReduced\nDeployment times from weeks to minutes\n\nAs a global technology and media company connecting millions of customers to\npersonalized experiences, Comcast struggled with massive data, fragile data pipelines\n\nand poor data science collaboration. With Databricks — leveraging Delta Lake and MLflow\n— they can build performant data pipelines for petabytes of data and easily manage the\nlifecycle of hundreds of models to create a highly innovative, unique and award-winning\nviewer experience using voice recognition and machine learning.\n\n\n-----\n\n**Infrastructure unable to support data and ML needs**\nInstantly answering a customer’s voice request for a particular program while turning\nbillions of individual interactions into actionable insights, strained Comcast’s IT\ninfrastructure and data analytics and data science teams. To make matters more\ncomplicated, Comcast needed to deploy models to a disjointed and disparate range\nof environments: cloud, on-premises and even directly to devices in some instances.\n\n- **Massive data:** Billions of events generated by the entertainment system and 20+\nmillion voice remotes, resulting in petabytes of data that need to be sessionized\nfor analysis.\n\n- **Fragile pipelines:** Complicated data pipelines that frequently failed and were\nhard to recover. Small files were difficult to manage, slowing data ingestion for\ndownstream machine learning.\n\n- **Poor collaboration:** Globally dispersed data scientists working in different\nscripting languages struggled to share and reuse code.\n\n- **Manage management of ML models:** Developing, training and deploying hundreds\nof models was highly manual, slow and hard to replicate, making it difficult to scale.\n\n- **Friction between dev and deployment:** Dev teams wanted to use the latest tools\nand models while ops wanted to deploy on proven infrastructure.\n\n\n-----\n\n**Automated infrastructure, faster data**\n**pipelines with Delta Lake**\nComcast realized they needed to modernize their entire approach to analytics from\ndata ingest to the deployment of machine learning models to delivering new features\nthat delight their customers. 
Today, the Databricks Unified Data Platform enables\nComcast to build rich data sets and optimize machine learning at scale, streamline\nworkflows across teams, foster collaboration, reduce infrastructure complexity, and\ndeliver superior customer experiences.\n\n- **Simplified infrastructure management:** Reduced operational costs through\nautomated cluster management and cost management features such as\nautoscaling and spot instances.\n\n\n\n- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\ninitial processing of the raw telemetry from video and voice applications and devices.\n\n- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\nand reliable ingestion at scale.\n\n- **Collaborative workspaces:** Interactive notebooks improve cross-team\ncollaboration and data science creativity, allowing Comcast to greatly accelerate\nmodel prototyping for faster iteration.\n\n- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\nand model serving via the Kubeflow environment, allowing them to track and\nmanage hundreds of models with ease.\n\n- **Reliable ETL at scale:** Delta Lake provides efficient analytics pipelines at scale\nthat can reliably join historic and streaming data for richer insights.\n\n\n-----\n\n**Delivering personalized experiences with ML**\nIn the intensely competitive entertainment industry, there is no time to press the\nPause button. Armed with a unified approach to analytics, Comcast can now fastforward into the future of AI-powered entertainment — keeping viewers engaged and\ndelighted with competition-beating customer experiences.\n\n- **Emmy-winning viewer experience:** Databricks helps enable Comcast to create\na highly innovative and award-winning viewer experience with intelligent voice\ncommands that boosts engagement.\n\n- **Reduced compute costs by 10x:** Delta Lake has enabled Comcast to optimize data\ningestion, replacing 640 machines with 64 while improving performance. Teams\ncan spend more time on analytics and less time on infrastructure management.\n\n- **Less DevOps:** Reduced the number of DevOps full-time employees required for\nonboarding 200 users from 5 to 0.5.\n\n- **Higher data science productivity:** Fostered collaboration between global data\nscientists by enabling different programming languages through a single\ninteractive workspace. Also, Delta Lake has enabled the data team to use data at\nany point within the data pipeline, allowing them to act more quickly in building\nand training new models.\n\n- **Faster model deployment:** Reduced deployment times from weeks to minutes as\noperations teams deployed models on disparate platforms.\n\n\n-----\n\n**Banco Hipotecario**\nPersonalizes the Banking\nExperience With Data and ML\n\nBanco Hipotecario — a leading Argentinian commercial bank — is on a mission\nto leverage machine learning to deliver new insights and services that will delight\ncustomers and create upsell opportunities. 
With a legacy analytics and data\nwarehousing system that was rigid and complex to scale, they turned to Databricks\nto unify data science, engineering and analytics.\n\nAs a result of this partnership, they were able to significantly increase customer\nacquisition and cross-sells while lowering the cost for acquisition, greatly impacting\noverall customer retention and profitability.\n\n**Spotlight on Banco Hipotecario**\n**Industry:** Financial services\n35%\n\nReduction in cost of acquisition\n**Technical use cases:** Ingest and ETL, machine learning and SQL Analytics\n\n\n-----\n\n**Legacy analytics tools are slow, rigid and**\n**impossible to scale**\nBanco Hipotecario set forth to increase customer acquisition by reducing risk and\nimproving the customer experience. With data analytics and machine learning\nanchoring their strategy, they hoped to influence a range of use cases from fraud\ndetection and risk analysis to serving product recommendations to drive upsell and\ncross-sell opportunities and forecast sales.\n\nBanco Hipotecario faced a number of the challenges that often come along with\noutdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n— the list goes on.\n\n“In order to execute on our data analytics strategy, new technologies were needed\nin order to improve data engineering and boost data science productivity,” said\nDaniel Sanchez, Enterprise Data Architect at Banco Hipotecario. “The first steps we\ntook were to move to a cloud-based data lake, which led us to Azure Databricks\nand Delta Lake.”\n\n\n-----\n\n**A unified platform powers the data lake**\n**and easy collaboration**\nBanco Hipotecario turned to Databricks to modernize their data warehouse\nenvironment, improve cross-team collaboration, and drive data science innovation.\nFully managed in Microsoft Azure, they were able to easily and reliably ingest massive\nvolumes of data, spinning up their whole infrastructure in 90 days. With Databricks’\nautomated cluster management capabilities, they are able to scale clusters ondemand to support large workloads.\n\nDelta Lake has been especially useful in bringing reliability and performance to Banco\nHipotecario’s data lake environment. With Delta Lake, they are now able to build\nreliable and performant ETL pipelines like never before.\n\n\nMeanwhile, performing SQL Analytics on Databricks has helped them do data\nexploration, cleansing and generate data sets in order to create models, enabling the\nteam to deploy their first model within the first three months, and the second model\ngenerated was rolled out in just two weeks.\n\nAt the same time, data scientists were finally able to collaborate, thanks to interactive\nnotebooks; this meant faster builds, training and deployment. And MLflow streamlined\nthe ML lifecycle and removed the overreliance on data engineering.\n\n“Databricks gives our data scientists the means to easily create our own experiments\nand deploy them to production in weeks, rather than months,” said Miguel Villalba,\nHead of Data Engineering and Data Science.\n\n\n-----\n\n**An efficient team maximizes customer**\n**acquisition and retention**\nSince moving to Databricks, the data team at Banco Hipotecario could not be happier,\nas Databricks has unified them across functions in an integrated fashion.\n\nThe results of data unification and markedly improved collaboration and autonomy\ncannot be overstated. 
Since deploying Databricks, Banco Hipotecario has increased\ntheir cross-sell into new products by a whopping 90%, while machine learning has\nreduced the cost of customer acquisition by 35%.\n\n\n-----\n\n**Viacom18**\nMigrates From Hadoop to Databricks to\nDeliver More Engaging Experiences\n\nViacom18 Media Pvt. Ltd. is one of India’s fastest-growing entertainment networks\nwith 40x growth over the past decade. They offer multi-platform, multigenerational\nand multicultural brand experiences to 600+ million monthly viewers.\n\nIn order to deliver more engaging experiences for their millions of viewers, Viacom18\nmigrated from their Hadoop environment due to its inability to process data at scale\nefficiently. With Databricks, they have streamlined their infrastructure management,\nincreased data pipeline speeds and increased productivity among their data teams.\n\nToday, Viacom18 is able to deliver more relevant viewing experiences to their\nsubscribers, while identifying opportunities to optimize the business and drive\ngreater ROI.\n\n**Spotlight on Viacom18**\n**Industry:** Media and entertainment\n26%\nIncrease in operational efficiency lowers overall costs\n\n\n-----\n\n**Growth in subscribers and terabytes of viewing data**\n**push Hadoop to its limits**\nViacom18, a joint venture between Network18 and ViacomCBS, is focused on\nproviding its audiences with highly personalized viewing experiences. The core\nof this strategy requires implementing an enterprise data architecture that enables\nthe building of powerful customer analytics on daily viewer data. But with millions of\nconsumers across India, the sheer amount of data was tough to wrangle: They were\ntasked with ingesting and processing over 45,000 hours of daily content on VOOT\n(Viacom18’s on-demand video subscription platform), which easily generated 700GB\nto 1TB of data per day.\n\n“Content is at the heart of what we do,” explained Parijat Dey, Viacom18’s Assistant\nVice President of Digital Transformation and Technology. “We deliver personalized\ncontent recommendations across our audiences around the world based on\nindividual viewing history and preferences in order to increase viewership and\ncustomer loyalty.”\n\nViacom18’s data lake, which was leveraging on-premises Hadoop for operations,\nwasn’t able to optimally process 90 days of rolling data within their management’s\ndefined SLAs, limiting their ability to deliver on their analytics needs, which impacted\nnot only the customer experience but also overall costs.\n\nTo meet this challenge head-on, Viacom18 needed a modern data warehouse with the\nability to analyze data trends for a longer period of time instead of daily snapshots. They\nalso needed a platform that simplified infrastructure by allowing their team to easily\nprovision clusters with features like auto-scaling to help reduce compute costs.\n\n\n-----\n\n**Rapid data processing for analytics**\n**and ML with Databricks**\nTo enable the processing power and data science capabilities they required, Viacom18\npartnered with Celebal Technologies, a premier Salesforce, data analytics and big data\nconsulting organization based in India. 
The team at Celebal leveraged Azure Databricks\nto provide Viacom18 with a unified data platform that modernizes its data warehousing\ncapabilities and accelerates data processing at scale.\n\nThe ability to cache data within Delta Lake resulted in the much-needed acceleration\nof queries, while cluster management with auto-scaling and the decoupling of\n\n\nstorage and compute simplified Viacom18’s infrastructure management and\noptimized operational costs. “Delta Lake has created a streamlined approach to\nthe management of data pipelines,” explained Dey. “This has led to a decrease in\noperational costs while speeding up time-to-insight for downstream analytics and\ndata science.”\n\nThe notebooks feature was an unexpected bonus for Viacom18, as a common workspace\ngave data teams a way to collaborate and increase productivity on everything from\nmodel training to ad hoc analysis, dashboarding and reporting via PowerBI.\n\n\n-----\n\n**Leveraging viewer data to power personalized**\n**viewing experiences**\nCelebal Technologies and Databricks have enabled Viacom18 to deliver innovative\ncustomer solutions and insights with increased cross-team collaboration and\nproductivity. With Databricks, Viacom18’s data team is now able to seamlessly\nnavigate their data while better serving their customers.\n\n“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data\nand deliver customer behavioral and engagement insights to the analysts and data\nscientists,” said Dey.\n\nIn addition to performance gains, the faster query times have also lowered the overall\ncost of ownership, even with daily increases in data volumes. “Azure Databricks has\ngreatly streamlined processes and improved productivity by an estimated 26%,”\nconcluded Dey.\n\nOverall, Dey cites the migration from Hadoop to Databricks has delivered significant\nbusiness value — reducing the cost of failure, accelerating processing speeds at\nscale, and simplifying ad hoc analysis for easier data exploration and innovations that\ndeliver highly engaging customer experiences.\n\n\n-----\n\n# What’s next?\n\nNow that you understand Delta Lake, it may be time to take a look\nat some additional resources.\n\n**Do a deep dive into Delta Lake >**\n\n- [Getting Started With Delta Lake Tech Talk Series](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks)\n\n- [Diving Into Delta Lake Tech Talk Series](https://databricks.com/discover/diving-into-delta-lake-talks)\n\n- [Visit the site](https://databricks.com/product/delta-lake-on-databricks) for additional resources\n\n**[Try Databricks for free >](https://databricks.com/try-databricks)**\n**[Learn more >](https://pages.databricks.com/delta-lake-open-source-reliability-for-data-lakes-reg.html)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**eBook**\n\n## The Data Team’s Guide to the Databricks Lakehouse Platform\n\n\n-----\n\n#### Contents\n\n\n**C H A P TE R 1**\n\n**C H A P TE R 2**\n\n**C H A P TE R 3**\n\n**C H A P TE R 4**\n\n**C H A P TE R 5**\n\n**C H A P TE R 6**\n\n**C H A P TE R 7**\n\n**C H A P TE R 8**\n\n**C H A P TE R 9**\n\n**C H A P TE R 10**\n\n**C H A P TE R 11**\n\n**C H A P TE R 12**\n\n\n**The data lakehouse** 
...................................................................................................................................................................................... **4**\n\n**The Databricks Lakehouse Platform** .......................................................................................................................... **11**\n\n**Data reliability and performance** ................................................................................................................................... **18**\n\n**Unified governance and sharing for data, analytics and AI** ....................................... **28**\n\n**Security** .............................................................................................................................................................................................................................. **41**\n\n**Instant compute and serverless** ................................................................................................................................... **48**\n\n**Data warehousing** ......................................................................................................................................................................................... **52**\n\n**Data engineering** ............................................................................................................................................................................................. **56**\n\n**Data streaming** .................................................................................................................................................................................................. **68.**\n\n**Data science and machine learning** ........................................................................................................................ **7** **3.**\n\n**Databricks Technology Partners and the modern data stack** ............................ **7** **9.**\n\n**Get started with the Databricks Lakehouse Platform** ....................................................... **8** **1**\n\n\n-----\n\n**I N T R O D U C T I O N**\n\n#### The Data Team’s Guide to the Databricks Lakehouse Platform\n\n_The Data Team’s Guide to the Databricks Lakehouse Platform_ is\ndesigned for data practitioners and leaders who are embarking\non their journey into the data lakehouse architecture.\n\nIn this eBook, you will learn the full capabilities of the data lakehouse architecture\nand how the Databricks Lakehouse Platform helps organizations of all sizes — from\nenterprises to startups in every industry — with all their data, analytics, AI and\nmachine learning use cases on one platform.\n\nYou will see how the platform combines the best elements of data warehouses\nand data lakes to increase the reliability, performance and scalability of your\ndata platform. 
Discover how the lakehouse simplifies complex workloads in data\nengineering, data warehousing, data streaming, data science and machine learning\n— and bolsters collaboration for your data teams, allowing them to maintain new\nlevels of governance, flexibility and agility in an open and multicloud environment.\n\n\n-----\n\n**CHAPTER**\n\n### The data lakehouse\n# 01\n\n\n-----\n\n#### The evolution of data architectures\n\n\nData has moved front and center within every organization as data-driven insights\nhave fueled innovation, competitive advantage and better customer experiences.\n\nHowever, as companies place mandates on becoming more data-driven,\ntheir data teams are left in a sprint to deliver the right data for business\ninsights and innovation. With the widespread adoption of cloud, data teams\noften invest in large-scale complex data systems that have capabilities for\nstreaming, business intelligence, analytics and machine learning to support\nthe overall business objectives.\n\nTo support these objectives, data teams have deployed cloud data\n\nwarehouses and data lakes.\n\n\nTraditional data systems: The data warehouse and data lake\n\nWith the advent of big data, companies began collecting large amounts of\ndata from many different sources, such as weblogs, sensor data and images.\nData warehouses — which have a long history as the foundation for decision\nsupport and business intelligence applications — cannot handle large volumes\nof data.\n\nWhile data warehouses are great for structured data and historical analysis,\nthey weren’t designed for unstructured data, semi-structured data, and data\nwith high variety, velocity and volume, making them unsuitable for many types\nof data.\n\nThis led to the introduction of data lakes, providing a single repository of raw\ndata in a variety of formats. While suitable for storing big data, data lakes do\nnot support transactions, nor do they enforce data quality, and their lack of\nconsistency/isolation makes it almost impossible to read, write or process data.\n\nFor these reasons, many of the promises of data lakes never materialized and,\nin many cases, reduced the benefits of data warehouses.\n\nAs companies discovered new use cases for data exploration, predictive modeling\nand prescriptive analytics, the need for a single, flexible, high-performance system\nonly grew. Data teams require systems for diverse data applications including SQL\nanalytics, real-time analytics, data science and machine learning.\n\n\n-----\n\nTo solve for new use cases and new users, a common approach is to use multiple\nsystems — a data lake, several data warehouses and other specialized systems\nsuch as streaming, time-series, graph and image databases. But having multiple\nsystems introduces complexity and delay, as data teams invariably need to\nmove or copy data between different systems, effectively losing oversight and\ngovernance over data usage.\n\n\nYou have now duplicated data in two different systems and the changes you\nmake in one system are unlikely to find their way to the other. 
So, you are going\nto have data drift almost immediately, not to mention paying to store the same\ndata multiple times.\n\nThen, because governance is happening at two distinct levels across these\nplatforms, you are not able to control things consistently.\n\n\n**Challenges with data, analytics and AI**\n\nIn a recent [Accenture](https://www.accenture.com/_acnmedia/pdf-108/accenture-closing-data-value-gap-fixed.pdf) study, only 32% of companies reported tangible and\nmeasurable value from data. The challenge is that most companies continue to\nimplement two different platforms: data warehouses for BI and data lakes for AI.\nThese platforms are incompatible with each other, but data from both systems\nis generally needed to deliver game-changing outcomes, which makes success\nwith AI extremely difficult.\n\nToday, most of the data is landing in the data lake, and a lot of it is unstructured.\nIn fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321) , about 80% of the data in any organization will be\nunstructured by 2025. But, this data is where much of the value from AI resides.\nSubsets of the data are then copied to the data warehouse into structured\ntables, and back again in some cases.\n\nYou also must secure and govern the data in both warehouses and offer\nfine-grained governance, while lakes tend to be coarser grained at the file level.\nThen, you stand up different stacks of tools on these platforms to do either\nBI or AI.\n\n\n-----\n\nFinally, the tool stacks on top of these platforms\nare fundamentally different, which makes it difficult\nto get any kind of collaboration going between the\nteams that support them.\n\nThis is why AI efforts fail. There is a tremendous\namount of complexity and rework being introduced\ninto the system. Time and resources are being\nwasted trying to get the right data to the right\npeople, and everything is happening too slowly\nto get in front of the competition.\n\n\n**Realizing this requires two disparate,**\n**incompatible data platforms**\n\n\n**Business** **SQL** **Incomplete** **Data science** **Data**\n\n**support for**\n\n**intelligence** **analytics** **and ML** **streaming**\n\n\n**SQL**\n**analytics**\n\n\n**Incomplete**\n**support for**\n**use cases**\n\n\n**Incompatible**\n**security and**\n**governance models**\n\n**Copy subsets of data**\n\n\n\n|Col1|Col2|Col3|Col4|\n|---|---|---|---|\n|Governa T|n a|c b|e and security le ACLs|\n|||||\n\n|Col1|Col2|Col3|Col4|\n|---|---|---|---|\n|Governa File|n s|c a|e and security nd blobs|\n|||||\n\n\n**Disjointed**\n**and duplicative**\n\n**Data warehouse** **data silos** **Data lake**\nStructured tables Unstructured files:\nlogs, text, images, video\n\n\n-----\n\n**Moving forward with a lakehouse architecture**\n\nTo satisfy the need to support AI and BI directly on vast amounts of data stored\nin data lakes (on low-cost cloud storage), a new data management architecture\nemerged independently across many organizations and use cases: the\ndata lakehouse.\n\nThe data lakehouse can store _all_ and _any_ type of data once in a data lake and\nmake that data accessible directly for AI and BI. The lakehouse paradigm has\nspecific capabilities to efficiently allow both AI and BI on all the enterprise’s data\nat a massive scale. Namely, it has the SQL and performance capabilities such as\nindexing, caching and MPP processing to make BI work fast on data lakes. 
It also\nhas direct file access and direct native support for Python, data science and AI\nframeworks without the need for a separate data warehouse.\n\nIn short, a lakehouse is a data architecture that combines the best elements\nof data warehouses and data lakes. Lakehouses are enabled by a new system\ndesign, which implements similar data structures and data management features\nfound in a data warehouse directly on the low-cost storage used for data lakes.\n\n\n-----\n\n##### Data lakehouse\n\nOne platform to unify all your data, analytics and AI workloads\n\n###### Lakehouse Platform\n\nAll machine learning, SQL,\nBI, and streaming use cases\n\nOne security and governance\napproach for all data assets\non all clouds\n\n\n-----\n\n**Key features for a lakehouse**\n\nRecent innovations with the data lakehouse architecture can help simplify\nyour data and AI workloads, ease collaboration for data teams, and maintain\nthe kind of flexibility and openness that allows your organization to stay agile\nas you scale. Here are key features to consider when evaluating data lakehouse\narchitectures:\n\nTransaction support: In an enterprise lakehouse, many data pipelines will\noften be reading and writing data concurrently. Support for ACID (Atomicity,\nConsistency, Isolation and Durability) transactions ensures consistency as\nmultiple parties concurrently read or write data.\n\nSchema enforcement and governance: The lakehouse should have\na way to support schema enforcement and evolution, supporting data\nwarehouse schema paradigms such as star/snowflake. The system should\nbe able to reason about data integrity, and it should have robust governance\nand auditing mechanisms.\n\nData governance: Capabilities including auditing, retention and lineage\nhave become essential, particularly considering recent privacy regulations.\n\nTools that allow data discovery have become popular, such as data catalogs\nand data usage metrics.\n\nBI support: Lakehouses allow the use of BI tools directly on the source\ndata. This reduces staleness and latency, improves recency and lowers cost\nby not having to operationalize two copies of the data in both a data lake\nand a warehouse.\n\n\nStorage decoupled from compute: In practice, this means storage and\ncompute use separate clusters, thus these systems can scale to many more\nconcurrent users and larger data sizes. 
Some modern data warehouses also\nhave this property.\n\nOpenness: The storage formats, such as Apache Parquet, are open and\nstandardized, so a variety of tools and engines, including machine learning\nand Python/R libraries, can efficiently access the data directly.\n\nSupport for diverse data types (unstructured and structured):\nThe lakehouse can be used to store, refine, analyze and access data types\nneeded for many new data applications, including images, video, audio,\nsemi-structured data and text.\n\nSupport for diverse workloads: Use the same data repository for a range\nof workloads including data science, machine learning and SQL analytics.\nMultiple tools might be needed to support all these workloads.\n\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\n**Learn more**\n\n**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n\n**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n\n**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n\n\n-----\n\n**CHAPTER**\n\n# 02\n\n\n### The Databricks Lakehouse Platform\n\n\n-----\n\n#### Lakehouse: A new generation of open platforms\n\n\n###### This is the lakehouse paradigm\n\n\nDatabricks is the inventor and pioneer of the\ndata lakehouse architecture. The data lakehouse\narchitecture was coined in the research paper,\n[Lakehouse: A New Generation of Open Platforms that](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n[Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) ,\nintroduced by Databricks’ founders, UC Berkeley\nand Stanford University at the 11th Conference on\nInnovative Data Systems Research (CIDR) in 2021.\n\nAt Databricks, we are continuously innovating on\nthe lakehouse architecture to help customers deliver\non their data, analytics and AI aspirations. The ideal\ndata, analytics and AI platform needs to operate\ndifferently. Rather than copying and transforming\ndata in multiple systems, you need one platform\nthat accommodates all data types.\n\n\n**Data science** **Data**\n**and ML** **streaming**\n\n\n**All ML, SQL, BI**\n**and streaming use cases**\n\n**One security and governance**\n**approach for all data assets**\n**on all clouds**\n\n**A reliable data platform**\n**to efficiently handle**\n**all data types**\n\n\n**Persona-based**\n**use cases**\n\n**Unity Catalog**\nFine-grained governance\nfor data and AI\n\n**Delta Lake**\nData reliability and performance\n\n\n**Business**\n**intelligence**\n\n\n**SQL**\n**analytics**\n\n\nFiles and blobs and table ACLs\n\n\nIdeally, the platform must be open, so that you\nare not locked into any walled gardens. 
You would\nalso have one security and governance model.\nIt would not only manage all data types, but it\nwould also be cloud-agnostic to govern data\nwherever it is stored.\n\nLast, it would support all major data, analytics and AI\nworkloads, so that your teams can easily collaborate\nand get access to all the data they need to innovate.\n\n\n-----\n\n#### What is the Databricks Lakehouse Platform?\n\nThe Databricks Lakehouse Platform unifies your\ndata warehousing and AI uses cases on a single\nplatform. It combines the best elements of data\nlakes and data warehouses to deliver the reliability,\nstrong governance and performance of data\nwarehouses with the openness, flexibility and\nmachine learning support of data lakes.\n\nThis unified approach simplifies your modern data\nstack by eliminating the data silos that traditionally\nseparate and complicate data engineering, analytics,\nBI, data science and machine learning. It’s built\non open source and open standards to maximize\nflexibility. And, its common approach to data\nmanagement, security and governance helps you\n\noperate more efficiently and innovate faster.\n\n\n**Lakehouse Platform**\n\nData Data Data Data science\nwarehousing engineering streaming and ML\n\n\n-----\n\n#### Benefits of the Databricks Lakehouse Platform\n\n\n**Simple**\n\nThe unified approach simplifies your data\narchitecture by eliminating the data silos that\ntraditionally separate analytics, BI, data science\nand machine learning. With a lakehouse, you\ncan eliminate the complexity and expense that\nmake it hard to achieve the full potential of\nyour analytics and AI initiatives.\n\n\n**Open**\n\nDelta Lake forms the open foundation of\nthe lakehouse by providing reliability and\nperformance directly on data in the data\nlake. You’re able to avoid proprietary walled\ngardens, easily share data and build your\nmodern data stack with unrestricted access\nto the ecosystem of open source data projects\nand the broad Databricks partner network.\n\n\n**Multicloud**\n\nThe Databricks Lakehouse Platform offers\nyou a consistent management, security and\ngovernance experience across all clouds. You\ndo not need to invest in reinventing processes\nfor every cloud platform that you are using to\nsupport your data and AI efforts. Instead, your\ndata teams can simply focus on putting all\nyour data to work to discover new insights.\n\n\n-----\n\n#### The Databricks Lakehouse Platform architecture\n\n**Data reliability and performance for lakehouse**\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format storage layer built for the lakehouse that integrates\nwith all major analytics tools and works with the widest variety of formats to\nstore and process data.\n\n\n**Instant compute and serverless**\n\nServerless compute is a fully managed service where Databricks provisions and\nmanages the compute layer on behalf of the customer in the Databricks cloud\naccount instead of the customer account. 
As of the current release, serverless\ncompute is supported for use with Databricks SQL.\n\nIn Chapter 6, we explore the details of instant compute and serverless for lakehouse.\n\n\n[Photon](https://databricks.com/product/photon) is the next-generation query engine built for the lakehouse that leverages\na state-of-the-art vectorized engine for fast querying and provides the best\nperformance for all workloads in the lakehouse.\n\nIn Chapter 3, we explore the details of data reliability and performance\n\nfor the lakehouse.\n\n**Unified governance and security for lakehouse**\n\nThe Databricks Lakehouse Platform provides unified governance with enterprise\nscale, security and compliance. The [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (UC) provides\ngovernance for your data and AI assets in the lakehouse — files, tables,\ndashboards, and machine learning models — giving you much better control,\nmanagement and security across clouds.\n\n[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\ndata across the organization in real time, independent of the platform\non which the data resides.\n\nIn Chapter 4, we go into the details of unified governance for lakehouse\n\nand, in Chapter 5, we dive into the details of security for lakehouse.\n\n\n-----\n\n#### The Databricks Lakehouse Platform workloads\n\nThe Databricks Lakehouse Platform architecture supports different workloads\nsuch as data warehousing, data engineering, data streaming, data science and\nmachine learning on one simple, open and multicloud data platform.\n\n**Data warehousing**\n\nData warehousing is one of the most business-critical workloads for data teams,\nand the best data warehouse is a lakehouse. The Databricks Lakehouse Platform\nlets you run all your SQL and BI applications at scale with up to 12x better price/\nperformance, a unified governance model, open formats and APIs, and your tools\nof choice — no lock-in. Reduce resource management overhead with serverless\ncompute, and easily ingest, transform and query all your data in-place to deliver\nreal-time business insights faster.\n\nBuilt on open standards and APIs, the Databricks Lakehouse Platform provides\nthe reliability, quality and performance that data lakes natively lack, plus\nintegrations with the ecosystem for maximum flexibility.\n\nIn Chapter 7, we go into the details of data warehousing on the lakehouse.\n\n**Data engineering**\n\nData engineering on the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows on\nany cloud platform, and meet regulatory requirements to maintain governance.\n\n\nautomates the complexity of building and maintaining pipelines and running ETL\nworkloads so data engineers and analysts can focus on quality and reliability to\ndrive valuable insights.\n\nIn Chapter 8, we go into the details of data engineering on the lakehouse.\n\n**Data streaming**\n\n[Data streaming](https://www.databricks.com/product/data-streaming) is one of the fastest growing workloads within the Databricks\nLakehouse Platform and is the future of all data processing. 
Real-time processing\nprovides the freshest possible data to an organization’s analytics and machine\nlearning models, enabling them to make better, faster decisions, more accurate\npredictions, offer improved customer experiences and more.\n\nThe Databricks Lakehouse Platform dramatically simplifies data streaming to\ndeliver real-time analytics, machine learning and applications on one platform.\n\nIn Chapter 9, we go into the details of data streaming on the lakehouse.\n\n**Data science and machine learning**\n\nData science and machine learning (DSML) on the lakehouse is a powerful\nworkload that is unique compared to other data offerings. DSML on the lakehouse\nprovides a data-native and collaborative solution for the full ML lifecycle. It\ncan maximize data and ML team productivity, streamline collaboration, empower\nML teams to prepare, process and manage data in a self-service manner,\nand standardize the ML lifecycle from experimentation to production.\n\nIn Chapter 10, we go into the details of DSML on the lakehouse.\n\n\n-----\n\n**Databricks Lakehouse Platform and your**\n**modern data stack**\n\nThe Databricks Lakehouse Platform is open and provides the flexibility to\ncontinue using existing infrastructure, to easily share data and build your modern\ndata stack with unrestricted access to the ecosystem of open source data\nprojects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\n\nIn Chapter 11, we go into the details of our technology partners and the\nmodern data stack.\n\n#### Global adoption of the Databricks Lakehouse Platform\n\n\nToday, Databricks has over 7,000 [customers](https://databricks.com/customers) , from Fortune 500 to unicorns\nacross industries doing transformational work. Organizations around the globe\nare driving change and delivering a new generation of data, analytics and AI\napplications. We believe that the unfulfilled promise of data and AI can finally\nbe fulfilled with one platform for data analytics, data science and machine\nlearning with the Databricks Lakehouse Platform.\n\n\n**Learn more**\n\n[Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)\n\n[Databricks Lakehouse Platform Demo Hub](https://databricks.com/discover/demos)\n\n[Databricks Lakehouse Platform Customer Stories](https://databricks.com/customers)\n\n[Databricks Lakehouse Platform Documentation](https://databricks.com/documentation)\n\n[Databricks Lakehouse Platform Training and Certification](https://databricks.com/learn/training/home)\n\n[Databricks Lakehouse Platform Resources](https://databricks.com/resources)\n\n\n-----\n\n**CHAPTER**\n\n# 03\n\n\n### Data reliability and performance\n\nTo bring openness, reliability and lifecycle management to data lakes,\nthe Databricks Lakehouse Platform is built on the foundation of Delta\nLake. 
Delta Lake solves challenges around unstructured/structured data\ningestion, the application of data quality, difficulties with deleting data for\ncompliance or issues with modifying data for data capture.\n\nAlthough data lakes are great solutions for holding large quantities of raw\ndata, they lack important attributes for data reliability and quality and\noften don’t offer good performance when compared to data warehouses.\n\n\n-----\n\n#### Problems with today’s data lakes\n\nWhen it comes to data reliability and quality, examples of these\nmissing attributes include:\n\n**•** **Lack of ACID transactions:** Makes it impossible to mix updates,\nappends and reads\n\n**•** **Lack of schema enforcement:** Creates inconsistent and low-quality data.\nFor example, rejecting writes that don’t match a table’s schema.\n\n**•** **Lack of integration with data catalog:** Results in dark data and no single\nsource of truth\n\nEven just the absence of these three attributes can cause a lot of extra work\nfor data engineers as they strive to ensure consistent high-quality data in the\npipelines they create.\n\n\nThese challenges are solved with two key technologies that are at the foundation\nof the lakehouse: Delta Lake and Photon.\n\n**What is Delta Lake?**\n\nDelta Lake is a file-based, open source storage format that provides ACID\ntransactions and scalable metadata handling, and unifies streaming and batch\ndata processing. It runs on top of existing data lakes and is compatible with\nApache Spark™ and other processing engines.\n\nDelta Lake uses Delta Tables which are based on Apache Parquet, a commonly\nused format for structured data already utilized by many organizations. Therefore,\nswitching existing Parquet tables to Delta Tables is easy and quick. Delta\nTables can also be used with semi-structured and unstructured data, providing\nversioning, reliability, metadata management, and time travel capabilities that\nmake these types of data easily managed as well.\n\n\nAs for performance, data lakes use object storage, so data is mostly kept in\nimmutable files leading to the following problems:\n\n**•** **Ineffective partitioning:** In many cases, data engineers resort to “poor man’s”\nindexing practices in the form of partitioning that leads to hundreds of dev hours\nspent tuning file sizes to improve read/write performance. Often, partitioning\nproves to be ineffective over time if the wrong field was selected for partitioning\nor due to high cardinality columns.\n\n**•** **Too many small files:** With no support for transactions, appending new data\ntakes the form of adding more and more files, leading to “small file problems,”\na known root cause of query performance degradation.\n\n\n-----\n\n**Delta Lake features**\n\n\n**ACID guarantees**\n\nDelta Lake ensures that all data changes\nwritten to storage are committed for durability\nand made visible to readers atomically. In other\nwords, no more partial or corrupted files.\n\n**Scalable data and metadata handling**\n\nSince Delta Lake is built on data lakes, all reads\nand writes using Spark or other distributed\nprocessing engines are inherently scalable to\npetabyte-scale. 
However, unlike most other\nstorage formats and query engines, Delta Lake\nleverages Spark to scale out all the metadata\nprocessing, thus efficiently handling metadata\nof billions of files for petabyte-scale tables.\n\n\n**Audit history and time travel**\n\nThe Delta Lake transaction log records details\nabout every change made to data, providing a full\naudit trail of the changes. These data snapshots\nallow developers to access and revert to earlier\nversions of data for audits, rollbacks or to\nreproduce experiments.\n\n**Schema enforcement and schema evolution**\n\nDelta Lake automatically prevents the insertion of\ndata with an incorrect schema, i.e., not matching\nthe table schema. And when needed, it allows the\ntable schema to be explicitly and safely evolved to\naccommodate ever-changing data.\n\n\n**Support for deletes, updates and merges**\n\nMost distributed processing frameworks do not\nsupport atomic data modification operations on\ndata lakes. Delta Lake supports merge, update\nand delete operations to enable complex use\ncases including but not limited to change data\ncapture (CDC), slowly changing dimension (SCD)\noperations and streaming upserts.\n\n**Streaming and batch unification**\n\nA Delta Lake table can work both in batch\nand as a streaming source and sink. The\nability to work across a wide variety of latencies,\nranging from streaming data ingestion to batch\nhistoric backfill, to interactive queries all work\nout of the box.\n\n\n-----\n\n**The Delta Lake transaction log**\n\nA key to understanding how Delta Lake provides all these capabilities is the\ntransaction log. The Delta Lake transaction log is the common thread that runs\nthrough many of Delta Lake’s most notable features, including ACID transactions,\nscalable metadata handling, time travel and more. The Delta Lake transaction log\nis an ordered record of every transaction that has ever been performed on\na Delta Lake table since its inception.\n\nDelta Lake is built on top of Spark to allow multiple readers and writers of a\ngiven table to work on a table at the same time. To always show users correct\nviews of the data, the transaction log serves as a single source of truth: the\ncentral repository that tracks all changes that users make to the table.\n\nWhen a user reads a Delta Lake table for the first time or runs a new query on\nan open table that has been modified since the last time it was read, Spark\nchecks the transaction log to see what new transactions are posted to the table.\nThen, Spark updates the table with those recent changes. This ensures that a\nuser’s version of a table is always synchronized with the master record as of the\nmost recent query, and that users cannot make divergent, conflicting changes\nto a table.\n\n\n**Flexibility and broad industry support**\n\nDelta Lake is an open source project, with an engaged community of\ncontributors building and growing the Delta Lake ecosystem atop a set of open\nAPIs and is part of the Linux Foundation. With the growing adoption of Delta Lake\nas an open storage standard in different environments and use cases, comes a\nbroad set of integration with industry-leading tools, technologies and formats.\n\nOrganizations leveraging Delta Lake on the Databricks Lakehouse Platform gain\nflexibility in how they ingest, store and query data. 
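\n\nTo make the features above concrete, here is a minimal, hedged sketch of working with a Delta table from a Databricks notebook; the schema, table and column names are hypothetical:\n\n```python\nfrom delta.tables import DeltaTable\n\n# Create a small Delta table, then upsert into it with MERGE, one of the atomic\n# operations described above. Names are hypothetical.\nspark.sql('CREATE TABLE IF NOT EXISTS demo.customers (id INT, email STRING) USING DELTA')\n\nupdates = spark.createDataFrame([(1, 'new@example.com')], ['id', 'email'])\ntarget = DeltaTable.forName(spark, 'demo.customers')\n(target.alias('t')\n    .merge(updates.alias('u'), 't.id = u.id')\n    .whenMatchedUpdateAll()\n    .whenNotMatchedInsertAll()\n    .execute())\n\n# The transaction log records every change, which enables audit history and time travel.\nspark.sql('DESCRIBE HISTORY demo.customers').show(truncate=False)\nearlier = spark.sql('SELECT * FROM demo.customers VERSION AS OF 0')\n```\n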
They are not limited in storing\ndata in a single cloud provider and can implement a true multicloud approach to\ndata storage.\n\nConnectors to tools, such as Fivetran, allow you to leverage Databricks’\necosystem of partner solutions, so organizations have full control of building the\nright ingestion pipelines for their use cases. Finally, consuming data via queries\nfor exploration or business intelligence (BI) is also flexible and open.\n\n\n-----\n\n**Delta Lake integrates with all major analytics tools**\n\nEliminates unnecessary data movement and duplication\n\n\n-----\n\nIn addition to a wide ecosystem of tools and technologies, Delta Lake supports\na broad set of data formats for structured, semi-structured and unstructured\ndata. These formats include image binary data that can be stored in Delta\nTables, graph data format, geospatial data types and key-value stores.\n\n**Learn more**\n\n[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n\n[Documentation](https://docs.databricks.com/delta/index.html)\n\n[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n\n[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n\n\n**What is Photon?**\n\nAs many organizations standardize on the lakehouse paradigm, this new\narchitecture poses challenges with the underlying query execution engine\nfor accessing and processing structured and unstructured data. The execution\nengine needs to provide the performance of a data warehouse and the scalability\nof data lakes.\n\nPhoton is the next-generation query engine on the Databricks Lakehouse\nPlatform that provides dramatic infrastructure cost savings and speedups for\nall use cases — from data ingestion, ETL, streaming, data science and interactive\nqueries — directly on your data lake. Photon is compatible with Spark APIs and\nimplements a more general execution framework that allows efficient processing\nof data with support of the Spark API. This means getting started is as easy as\nturning it on — no code change and no lock-in. With Photon, typical customers are\nseeing up to 80% TCO savings over traditional Databricks Runtime (Spark) and up\nto 85% reduction in VM compute hours.\n\nSpark instructions Photon instructions\n\n\nPhoton engine\n\n\nDelta/Parquet\n\nPhoton writer\nto Delta/Parquet\n\n\n-----\n\nWhy process queries with Photon?\n\n\nQuery performance on Databricks has steadily increased over the years,\npowered by Spark and thousands of optimizations packaged as part of the\nDatabricks Runtime (DBR). 
Photon provides an additional 2x speedup on the\nTPC-DS 1TB benchmark compared to the latest DBR versions.\n\n_Chart: relative speedup to DBR 2.1 by DBR version and release date (TPC-DS 1TB, 10 x i3xl); higher is better._\n\n\n**Customers have observed significant speedups using**\n**Photon on workloads such as:**\n\n**•** **SQL-based jobs:** Accelerate large-scale production jobs on\nSQL and Spark DataFrames\n\n**•** **IoT use cases:** Faster time-series analysis using Photon\ncompared to Spark and traditional Databricks Runtime\n\n**•** **Data privacy and compliance:** Query petabyte-scale data\nsets to identify and delete records without duplicating data\nwith Delta Lake, production jobs and Photon\n\n**•** **Loading data into Delta and Parquet:** Vectorized I/O\nspeeds up data loads for Delta and Parquet tables, lowering\noverall runtime and costs of data engineering jobs\n\n\n-----\n\n_Chart: 100TB TPC-DS price/performance by system, comparing Databricks SQL (spot and on-demand) with three cloud data warehouses; lower is better._\n\nBest price/performance for analytics\nin the cloud\n\nWritten from the ground up in C++, Photon takes\nadvantage of modern hardware for faster queries,\nproviding up to 12x better price/performance\ncompared to other cloud data warehouses —\nall natively on your data lake.\n\n\n-----\n\nWorks with your existing code\nand avoids vendor lock-in\n\nPhoton is designed to be compatible with the\nApache Spark DataFrame and SQL APIs to ensure\nworkloads run seamlessly without code changes.\nAll you do is turn it on. Photon will seamlessly\ncoordinate work and resources and transparently\naccelerate portions of your SQL and Spark queries.\nNo tuning or user intervention required.\n\n\n_Lifecycle of a Photon query: the client submits SQL; the Spark driver (JVM) handles parsing and Catalyst performs analysis, planning, optimization and scheduling; Spark executors (mixed JVM/native) execute the tasks._\n\n\n-----\n\nOptimizing for all data use cases\nand workloads\n\nPhoton is the first purpose-built lakehouse engine\ndesigned to accelerate all data and analytics\nworkloads: data ingestion, ETL, streaming, data\nscience, and interactive queries. While we started\nPhoton primarily focused on SQL to provide\ncustomers with world-class data warehousing\nperformance on their data lakes, we’ve significantly\nincreased the scope of ingestion sources, formats,\nAPIs and methods supported by Photon since\nthen. 
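\n\nSince enabling Photon is a cluster-level switch rather than a code change, a hedged sketch of what that can look like through the Clusters REST API follows; the workspace host, token, runtime version, node type and cluster name are assumptions for illustration only:\n\n```python\nimport os\nimport requests\n\n# Hedged sketch: create a Photon-enabled cluster via the Clusters REST API.\n# Workspace URL, token, runtime version and node type are placeholders.\nhost = os.environ['DATABRICKS_HOST']\ntoken = os.environ['DATABRICKS_TOKEN']\n\ncluster_spec = {\n    'cluster_name': 'photon-demo',        # hypothetical name\n    'spark_version': '14.3.x-scala2.12',  # a recent Databricks Runtime\n    'node_type_id': 'i3.xlarge',          # cloud-specific instance type\n    'num_workers': 2,\n    'runtime_engine': 'PHOTON',           # opt the cluster into Photon; no code changes needed\n}\n\nresp = requests.post(f'{host}/api/2.0/clusters/create',\n                     headers={'Authorization': f'Bearer {token}'},\n                     json=cluster_spec)\nresp.raise_for_status()\nprint(resp.json()['cluster_id'])\n```\n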
As a result, customers have seen dramatic\ninfrastructure cost savings and speedups on\nPhoton across all their modern Spark (e.g., Spark\nSQL and DataFrame) workloads.\n\n\nQuery optimizer\n\nNative execution engine\n\nCaching\n\n\n_Accelerating all workloads on the lakehouse_\n\n**Learn more**\n\n[Announcing Photon Public Preview: The Next-Generation](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n[Query Engine on the Databricks Lakehouse Platform](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n\n[Databricks Sets Official Data Warehousing Performance Record](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n\n\n-----\n\n**CHAPTER**\n\n# 04\n\n\n### Unified governance and sharing for data, analytics and AI\n\nToday, more and more organizations recognize the importance of making\nhigh-quality data readily available to data teams to drive actionable insights\nand business value. At the same time, organizations also understand the risks\nof data breaches which negatively impact brand value and inevitably lead to\nerosion of customer trust. Governance is one of the most critical components\nof a lakehouse data platform architecture; it helps ensure that data assets\nare securely managed throughout the enterprise. However, many companies\nare using different incompatible governance models leading to complex and\nexpensive solutions.\n\n\n-----\n\n#### Key challenges with data and AI governance\n\n**Diversity of data and AI assets**\n\nThe increased use of data and the added complexity of the data landscape\nhave left organizations with a difficult time managing and governing all types\nof their data-related assets. No longer is data stored in files or tables. Data\nassets today take many forms, including dashboards, machine learning models\nand unstructured data like video and images that legacy data governance\nsolutions simply are not built to govern and manage.\n\n\n**Rising multicloud adoption**\n\nMore and more organizations now leverage a multicloud strategy to optimize\ncosts, avoid vendor lock-in, and meet compliance and privacy regulations. With\nnonstandard, cloud-specific governance models, data governance across clouds\nis complex and requires familiarity with cloud-specific security and governance\nconcepts, such as identity and access management (IAM).\n\n**Disjointed tools for data governance on the lakehouse**\n\nToday, data teams must deal with a myriad of fragmented tools and services for\ntheir data governance requirements, such as data discovery, cataloging, auditing,\nsharing, access controls, etc. This inevitably leads to operational inefficiencies\nand poor performance due to multiple integration points and network latency\nbetween the services.\n\n\n**Two disparate and incompatible data platforms**\n\nOrganizations today use two different platforms for their data analytics and\nAI efforts — data warehouses for BI and data lakes for AI. 
This results in data\nreplication across two platforms, presenting a major governance challenge.\nWith no unified view of the data landscape, it is difficult to see where data is\nstored, who has access to what data, and consistently define and enforce data\naccess policies across the two platforms with different governance models.\n\n\n-----\n\n#### One security and governance approach\n\nLakehouse systems provide a uniform way to manage access control, data\nquality and compliance across all of an organization’s data using standard\ninterfaces similar to those in data warehouses by adding a management\ninterface on top of data lake storage.\n\nModern lakehouse systems support fine-grained (row, column and view level)\naccess control via SQL, query auditing, attribute-based access control, data\nversioning and data quality constraints and monitoring. These features are\ngenerally provided using standard interfaces familiar to database administrators\n(for example, SQL GRANT commands) to allow existing personnel to manage\nall the data in an organization in a uniform way. Centralizing all the data in\na lakehouse system with a single management interface also reduces the\nadministrative burden and potential for error that comes with managing\nmultiple separate systems.\n\n\n#### What is Unity Catalog?\n\nUnity Catalog is a unified governance solution for all data, analytics and AI\nassets including files, tables, dashboards and machine learning models in your\nlakehouse on any cloud. Unity Catalog simplifies governance by empowering\ndata teams with a common governance model based on ANSI-SQL to define\nand enforce fine-grained access controls. With attribute-based access controls,\ndata administrators can enable fine-grained access controls on rows and\ncolumns using tags (attributes). Built-in data search and discovery allows\ndata teams to quickly find and reference relevant data for any use case. Unity\nCatalog offers automated data lineage for all workloads in SQL, R, Scala and\nPython, to build a better understanding of the data and its flow in the lakehouse.\nUnity Catalog also allows data sharing across or within organizations and\nseamless integrations with your existing data governance tools.\n\nWith Unity Catalog, data teams can simplify governance for all data and AI\nassets with one consistent model to discover, access and share data, giving\nyou much better native performance, management and security across clouds.\n\n\n-----\n\n**Key benefits**\n\n\nThe common metadata layer for cross-workspace metadata is at the account\nlevel and eases collaboration by allowing different workspaces to access Unity\nCatalog metadata through a common interface and break down data silos.\nFurther, the data permissions in Unity Catalog are applied to account-level\nidentities, rather than identities that are local to a workspace, allowing\na consistent view of users and groups across all workspaces.\n\n\nCatalog, secure and audit access to all data assets on any cloud\n\nUnity Catalog provides centralized metadata, enabling data teams to create\na single source of truth for all data assets ranging from files, tables, dashboards\nto machine learning models in one place.\n\n\n-----\n\nUnity Catalog offers a unified data access layer that provides a simple and\nstreamlined way to define and connect to your data through managed tables,\nexternal tables, or files, while managing their access controls. 
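\n\nAs a small illustration of that common, ANSI SQL-based permission model, a hedged sketch from a notebook follows; the catalog, schema, table and group names are hypothetical:\n\n```python\n# Hedged sketch: Unity Catalog privileges are granted with standard SQL statements.\n# Catalog, schema, table and group names are hypothetical.\nspark.sql('GRANT USE CATALOG ON CATALOG main TO `data-analysts`')\nspark.sql('GRANT USE SCHEMA ON SCHEMA main.sales TO `data-analysts`')\nspark.sql('GRANT SELECT ON TABLE main.sales.orders TO `data-analysts`')\n\n# Review who has access to the table.\nspark.sql('SHOW GRANTS ON TABLE main.sales.orders').show(truncate=False)\n```\n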
Unity Catalog\ncentralizes access controls for files, tables and views.\n\nIt allows fine-grained access controls for restricting access to certain rows\nand columns to the users and groups who are authorized to query them. With\nAttribute-Based Access Controls (ABAC), you can control access to multiple\ndata items at once based on user and data attributes, further simplifying\ngovernance at scale. For example, you will be able to tag multiple columns\nas personally identifiable information (PII) and manage access to all columns\ntagged as PII in a single rule.\n\nToday, organizations are dealing with an increased burden of regulatory\ncompliance, and data access auditing is a critical component to ensure your\norganization is set up for success while meeting compliance requirements.\nUnity Catalog also provides centralized fine-grained auditing by capturing an\naudit log of operations such as create, read, update and delete (CRUD) that have\nbeen performed against the data. This allows a fine-grained audit trail showing\nwho accessed a given data set and helps you meet your compliance and\nbusiness requirements.\n\n\n-----\n\nBuilt-in data search and discovery\n\nData discovery is a critical component to break\ndown data silos and democratize data across\nyour organization to make data-driven decisions.\nUnity Catalog provides a rich user interface for\ndata search and discovery, enabling data teams to\nquickly search relevant data assets across the data\nlandscape and reference them for all use cases —\nBI, analytics and machine learning — accelerating\ntime-to-value and boosting productivity.\n\n\n-----\n\nAutomated data lineage for all workloads\n\nData lineage describes the transformations and\nrefinements of data from source to insight. Lineage\nincludes capturing all the relevant metadata and\nevents associated with the data in its lifecycle,\nincluding the source of the data set, what other\ndata sets were used to create it, who created it and\nwhen, what transformations were performed, which\nother data sets leverage it, and many other events\nand attributes. Unity Catalog offers automated data\nlineage down to table and column level, enabling\ndata teams to get an end-to-end view of where\ndata is coming from, what transformations were\nperformed on the data and how data is consumed\nby end applications such as notebooks, workflows,\ndashboards, machine learning models, etc.\n\nWith automated data lineage for all workloads —\nSQL, R, Python and Scala, data teams can quickly\nidentify and perform root cause analysis of any\nerrors in the data pipelines or end applications.\nSecond, data teams can perform impact analysis\nto see dependencies of any data changes\non downstream consumers and notify them\nabout the potential impact. Finally, data lineage\nalso empowers data teams with increased\nunderstanding of their data and reduces tribal\nknowledge. Unity Catalog can also capture lineage\nassociated with non-data entities, such as notebooks,\nworkflows and dashboards. 
\n\n_Data lineage with Unity Catalog_\n\nLineage can be retrieved via REST APIs to support integrations\nwith other catalogs.\n\nIntegrated with your existing tools\n\nUnity Catalog helps you to future-proof your data\nand AI governance with the flexibility to leverage\nyour existing data catalogs and governance\nsolutions — Collibra, Alation, Immuta, Privacera,\nMicrosoft Purview and AWS Lake Formation.\n\n\n**Resources**\n\n[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n\n[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n\n[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)\n\n\n-----\n\n#### Open data sharing and collaboration\n\nData sharing has become important in the digital\neconomy as enterprises wish to exchange data\neasily and securely with their customers, partners,\nsuppliers and internal lines of business to better\ncollaborate and unlock value from that data. But\nto date, the lack of a standards-based data sharing\nprotocol has resulted in data sharing solutions\ntied to a single vendor or commercial product,\nintroducing vendor lock-in risks. What the industry\ndeserves is an open approach to data sharing.\n\n**Why data sharing is hard**\n\nData sharing has evolved from an optional feature\nof a few data platforms to a business necessity\nand success factor for organizations. Our solution\narchitects encounter daily the classic scenarios\nof a retailer looking to publish sales data to their\nsuppliers in real time or a supplier that wants to\nshare real-time inventory.\n\nAs a reminder, data sharing recently triggered\nthe most impressive scientific development that\nhumankind has ever seen. On January 5, 2021, the\nfirst sample of the genome of the coronavirus was\nuploaded to the internet. It wasn’t a lung biopsy\nfrom a patient in Wuhan, but a shared digital\ngenomic data set that triggered the development\nof the first batch of COVID vaccines worldwide.\n\nSince then, coronavirus experts have daily\nexchanged public data sets, looking for better\ntreatments, tests and tracking mutations as they\nare passed down through a lineage, a branch of\nthe coronavirus family tree. 
The above graphic\nshows such a [publicly shared mutation data set](https://www.ncbi.nlm.nih.gov/genbank/) .\n\n\n-----\n\nSharing data, as well as consuming data from\nexternal sources, allows you to collaborate with\npartners, establish new partnerships, enable\nresearch and generate new revenue streams\nwith data monetization.\n\nDespite those promising examples, existing data\nsharing technologies come with several limitations:\n\n**•** Traditional data sharing technologies, such as\nSecure File Transfer Protocol (SFTP), do not\nscale well and only serve files offloaded to a\nserver\n\n**•** Cloud object stores operate on an object level\nand are cloud-specific\n\n**•** Commercial data sharing offerings baked into\nvendor products often share tables instead of\nfiles, but scaling them is expensive and they\nare not open and, therefore, do not permit data\nsharing with a different platform\n\nThe following comparison looks at proprietary vendor\nsolutions, SFTP, cloud object stores and Delta Sharing\nacross criteria such as security, cost, vendor agnosticism,\nmulticloud support, open source licensing, a table/DataFrame\nabstraction, live data, predicate pushdown, object store\nbandwidth, zero compute cost and scalability.\n\n\n-----\n\n**Open source data sharing and Databricks**\n\nTo address the limitations of existing data sharing solutions, Databricks developed\n[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\nto the Linux Foundation.\n\nAn open source–based solution, such as Delta Sharing, eliminates the lock-in\nof commercial solutions and brings a number of additional benefits such as\ncommunity-developed integrations with popular, open source data processing\nframeworks. In addition, open protocols allow the easy integration of commercial\nclients, such as BI tools.\n\n**What is Databricks Delta Sharing?**\n\nDatabricks Delta Sharing provides an open solution to securely share live data\nfrom your lakehouse to any computing platform. Recipients don’t have to be\non the Databricks platform or on the same cloud or a cloud at all. Data providers\ncan share live data, without replicating or moving it to another system. Recipients\nbenefit from always having access to the latest version of data and can quickly\nquery shared data using tools of their choice for BI, analytics and machine\nlearning, reducing time-to-value. Data providers can centrally manage, govern,\naudit and track usage of the shared data on one platform.\n\nUnity Catalog natively supports [Delta Sharing](https://databricks.com/product/delta-sharing) , the world’s first open protocol\nfor data sharing, enabling organizations to share live, large-scale data without\nreplication and make data easily and quickly accessible from tools of your\nchoice, with enterprise-grade security.\n\n\n**Key benefits**\n\nOpen cross-platform sharing\n\nEasily share existing data in Delta Lake and Apache Parquet formats between\ndifferent vendors. Consumers don’t have to be on the Databricks platform, same\ncloud or a cloud at all. Native integration with Power BI, Tableau, Spark, pandas\nand Java allows recipients to consume shared data directly from the tools of their\nchoice. 
Delta Sharing eliminates the need to set up a new ingestion process to\nconsume data. Data recipients can directly access the fresh data and query it\nusing tools of their choice. Recipients can also enrich data with data sets from\npopular data providers.\n\nSharing live data without copying it\n\nShare live ready-to-query data, without replicating or moving it to another system.\nMost enterprise data today is stored in cloud data lakes. Any of the existing data\nsets on the provider’s data lake can easily be shared across clouds, regions or\ndata platforms without any data replication or physical movement of data. Data\nproviders can update their data sets reliably in real time and provide a fresh and\nconsistent view of their data to recipients.\n\nCentralized administration and governance\n\nYou can centrally govern, track and audit access to the shared data from a single\npoint of enforcement to meet compliance requirements. Detailed user-access\naudit logs are kept to know who is accessing the data and monitor usage of the\nshared data down to table, partition and version level.\n\n\n-----\n\nAn open Marketplace for data solutions\n\nThe demand for third-party data to make data-driven innovations is greater than ever,\n\nand data marketplaces act as a bridge between data providers and data consumers to\n\nhelp facilitate the discovery and distribution of data sets.\n\nDatabricks Marketplace provides an open marketplace for exchanging data products\n\nsuch as data sets, notebooks, dashboards and machine learning models. To accelerate\n\ninsights, data consumers can discover, evaluate and access more data products from\n\nthird-party vendors than ever before. Providers can now commercialize new offerings\n\nand shorten sales cycles by providing value-added services on top of their data.\n\nDatabricks Marketplace is powered by Delta Sharing, allowing consumers to access\n\ndata products without having to be on the Databricks platform. This open approach\n\nallows data providers to broaden their addressable market without forcing consumers\n\ninto vendor lock-in.\n\n_Databricks Marketplace_\n\n\nPrivacy-safe data cleanrooms\n\nPowered by open source Delta Sharing, the Databricks Lakehouse Platform provides\n\na flexible data cleanroom solution allowing businesses to easily collaborate with their\n\ncustomers and partners on any cloud in a privacy-safe way. Participants in the data\n\ncleanrooms can share and join their existing data, and run complex workloads in any\n\nlanguage — Python, R, SQL, Java and Scala — on the data while maintaining data\n\nprivacy. Additionally, data cleanroom participants don’t have to do cost-intensive\n\ndata replication across clouds or regions with other participants, which simplifies data\n\noperations and reduces cost.\n\n_Data cleanrooms with Databricks Lakehouse Platform_\n\n\n-----\n\n**How it works**\n\nDelta Sharing is designed to be simple, scalable, non-proprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing\nis natively integrated with Unity Catalog, which allows customers to add fine-grained governance and security controls, making it easy and safe to share data internally\nor externally.\n\nDelta Sharing is a simple REST protocol that securely shares access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3,\nAzure ADLS or Google’s GCS — to reliably transfer large data sets. 
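\n\nOn the recipient side, the open source Python connector can load a share directly. A minimal sketch, assuming a credential file downloaded from the provider and hypothetical share, schema and table names:\n\n```python\nimport delta_sharing\n\n# Hedged sketch of a data recipient reading a shared table with the open source\n# `delta-sharing` connector. The profile file and the share#schema.table reference\n# are placeholders supplied by the data provider.\nprofile = '/dbfs/tmp/config.share'\ntable_url = f'{profile}#retail_share.sales.orders'\n\npdf = delta_sharing.load_as_pandas(table_url)   # small tables straight into pandas\nsdf = delta_sharing.load_as_spark(table_url)    # or through Spark for larger data sets\nprint(pdf.head())\n```\n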
Here’s how it works for data providers and data recipients.\n\n**Data provider** **Data recipient**\n\nData science And many more On-premises\n\nThe data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider\ndecides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages access for recipients. To manage\nshares and recipients, you can use SQL commands or the Unity Catalog CLI or the intuitive user interface.\n\nThe data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\nSpark, Java and Python, and is working with partners on many more.\n\n\n-----\n\nThe Delta Sharing data exchange follows three efficient steps:\n\n1. The recipient’s client authenticates to the sharing server and asks to query\na specific table. The client can also provide filters on the data (for example,\n“country=US”) as a hint to read just a subset of the data.\n\n2. The server verifies whether the client is allowed to access the data, logs the\nrequest, and then determines which data to send back. This will be a subset\nof the data objects in cloud storage systems that make up the table.\n\n3. To transfer the data, the server generates short-lived presigned URLs that\nallow the client to read these Parquet files directly from the cloud provider,\nso that the transfer can happen in parallel at massive bandwidth, without\nstreaming through the sharing server.\n\n**Learn more**\n\n[Try Delta Sharing](https://databricks.com/product/delta-sharing)\n\n[Delta Sharing Demo](https://youtu.be/wRT1Vpbyy88)\n\n[Introducing Delta Sharing: An Open Protocol for Secure Data Sharing](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Introducing Data Cleanrooms for the Lakehouse](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Introducing Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Delta Sharing ODSC Webinar](https://www.youtube.com/watch?v=YrNHtaWlkM8)\n\n\n-----\n\n**CHAPTER**\n\n# 05\n\n\n### Security\n\nOrganizations that operate in multicloud environments need a unified, reliable\nand consistent approach to secure data. We’ve learned from our customers that\na simple and unified approach to data security for the lakehouse is one of the\nmost critical requirements for modern data solutions. Databricks is trusted by\nthe world’s largest organizations to provide a powerful lakehouse platform with\nhigh security and scalability. In fact, thousands of customers trust Databricks\nwith their most sensitive data to analyze and build data products using machine\nlearning (ML). With significant investment in building a highly secure and scalable\nplatform, Databricks delivers end-to-end platform security for data and users.\n\n\n-----\n\n#### Platform architecture reduces risk\n\nThe Databricks Lakehouse architecture is split into\ntwo separate planes to simplify your permissions,\navoid data duplication and reduce risk. The control\nplane is the management plane where Databricks\nruns the workspace application and manages\nnotebooks, configuration and clusters. 
Unless you\nchoose to use [serverless compute](https://docs.databricks.com/serverless-compute/index.html) , the data plane\nruns inside your cloud service provider account,\nprocessing your data without taking it out of your\naccount. You can embed Databricks in your data\nexfiltration protection architecture using features\nlike customer-managed VPCs/VNets and admin\nconsole options that disable export.\n\nWhile certain data, such as your notebooks,\nconfigurations, logs, and user information, is\npresent within the control plane, that information\nis encrypted at rest, and communication to and\nfrom the control plane is encrypted in transit.\n\n_Architecture diagram: interactive users and your data connect to the control plane (web application, configurations, notebooks, repos, DBSQL and the cluster manager), while clusters and the DBFS root run in your cloud service provider account._\n\nYou also have choices for where certain data lives:\nYou can host your own store of metadata about\nyour data tables (Hive metastore), or store query\nresults in your cloud service provider account and\ndecide whether to use the [Databricks Secrets API.](https://docs.databricks.com/dev-tools/api/latest/secrets.html)\n\n\n-----\n\n#### Step-by-step example\n\n_Diagram: the six steps below, traced across users, the control plane, the cluster manager and clusters in your cloud service provider account._\n\n\n-----\n\nSuppose you have a data engineer who signs in to Databricks and\nwrites a notebook that transforms raw data in Kafka to a normalized\ndata set sent to storage such as Amazon S3 or Azure Data Lake\nStorage. Six steps make that happen:\n\n1. The data engineer seamlessly authenticates, via your single sign-on\nif desired, to the Databricks web UI in the control plane, hosted in\nthe Databricks account.\n\n2. As the data engineer writes code, their web browser sends it to\nthe control plane. JDBC/ODBC requests also follow the same path,\nauthenticating with a token.\n\n3. When ready, the control plane uses Cloud Service Provider APIs to\ncreate a Databricks cluster, made of new instances in the data plane,\nin your CSP account. Administrators can apply cluster policies to\nenforce security profiles.\n\n4. Once the instances launch, the cluster manager sends the data\nengineer’s code to the cluster.\n\n5. The cluster pulls from Kafka in your account, transforms the data\nin your account and writes it to storage in your account.\n\n6. The cluster reports status and any outputs back to the cluster manager.\n\nThe data engineer does not need to worry about many of the details —\nsimply write the code and Databricks runs it.\n\n\n#### Network and server security\n\nHere is how Databricks interacts with your cloud service provider\naccount to manage network and server security.\n\n**Networking**\n\nRegardless of where you choose to host the data plane, Databricks networking\nis straightforward. 
If you host it yourself, Databricks by default will still configure\nnetworking for you, but you can also control data plane networking with your\nown managed VPC or VNet.\n\nThe serverless data plane network infrastructure is managed by Databricks in\na Databricks cloud service provider account and shared among customers,\nwith additional network boundaries between workspaces and between clusters.\n\nDatabricks does not rewrite or change your data structure in your storage, nor\ndoes it change or modify any of your security and governance policies. Local\nfirewalls complement security groups and subnet firewall policies to block\nunexpected inbound connections.\n\nCustomers at the enterprise tier can also use the IP access list feature on\nthe control plane to limit which IP addresses can connect to the web UI or\nREST API — for example, to allow only VPN or office IPs.\n\n\n-----\n\n**Servers**\n\nIn the data plane, Databricks clusters automatically run the latest hardened\nsystem image. Users cannot choose older (less secure) images or code. For AWS\nand Azure deployments, images are typically updated every two-to-four weeks.\nGCP is responsible for its system image.\n\nDatabricks runs scans for every release, including:\n\n**•** System image scanning for vulnerabilities\n\n**•** Container OS and library scanning\n\n\n**Severity** **Remediation time**\n\n**Critical** **< 14 days**\n\n**High** **< 30 days**\n\n**Medium** **< 60 days**\n\n**Low** **When appropriate**\n\n\n\n**•** Static and dynamic code scanning\n\n**Databricks access**\n\n\nDatabricks code is peer reviewed by developers who have security training.\nSignificant design documents go through comprehensive security reviews.\nScans run fully authenticated, with all checks enabled, and issues are\ntracked against the timeline shown in this table.\n\nNote that Databricks clusters are typically short-lived (often terminated\nafter a job completes) and do not persist data after they terminate. Clusters\ntypically share the same permission level (excluding high concurrency or\nDatabricks SQL clusters, where more robust security controls are in place).\nYour code is launched in an unprivileged container to maintain system\nstability. This security design provides protection against persistent attackers\nand privilege escalation.\n\n\nDatabricks access to your environment is limited to cloud service provider APIs\nfor our automation and support access. Automated access allows the Databricks\ncontrol plane to configure resources in your environment using the cloud service\nprovider APIs. The specific APIs vary based on the cloud. For instance, an AWS\ncross-account IAM role, or Azure-owned automation or GKE automation do not\ngrant access to your data sets (see the next section).\n\nDatabricks has a custom-built system that allows staff to fix issues or handle\nsupport requests — for example, when you open a support request and check the\nbox authorizing access to your workspace. Access requires either a support ticket\nor engineering ticket tied expressly to your workspace and is limited to a subset of\nemployees and for limited time periods. Additionally, if you have configured audit\nlog delivery, the audit logs show the initial access event and the staff’s actions.\n\n\n-----\n\n**Identity and access**\n\nDatabricks supports robust ACLs and SCIM. AWS customers can configure\nSAML 2.0 and block non-SSO logins. 
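\n\nSeveral of the data-access mechanisms discussed in this chapter, such as the Secrets API, are used directly from notebook code. A minimal, hedged sketch with hypothetical scope, key and connection details:\n\n```python\n# Hedged sketch: read a credential from a Databricks secret scope rather than\n# hard-coding it. The scope, key and JDBC coordinates are hypothetical.\npassword = dbutils.secrets.get(scope='prod-credentials', key='warehouse-password')\n\norders = (spark.read.format('jdbc')\n          .option('url', 'jdbc:postgresql://example-host:5432/sales')\n          .option('dbtable', 'public.orders')\n          .option('user', 'reporting_user')\n          .option('password', password)   # secret values are redacted in notebook output\n          .load())\n```\n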
Azure Databricks and Databricks on\nGCP automatically integrate with Azure Active Directory or GCP identity.\n\nDatabricks supports a variety of ways to enable users to access their data.\n\n**Examples include:**\n\n**•** The Table ACLs feature uses traditional SQL-based statements to\nmanage access to data and enable fine-grained view-based access\n\n**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\nusers of that cluster automatically access allowed resources without\nexplicit credentials\n\n**•** External storage can be mounted or accessed using a securely\nstored access key\n\n**•** The Secrets API separates credentials from code when accessing\nexternal resources\n\n\n**Data security**\n\nDatabricks provides encryption, isolation and auditing.\n\n**Databricks encryption capabilities are**\n**in place both at rest and in motion**\n\nFor data-at-rest encryption:\n\n**•** Control plane is encrypted\n\n**•** Data plane supports local encryption\n\n**•** Customers can use encrypted storage buckets\n\n**•** Customers at some tiers can configure customer-managed keys for managed services\n\nFor data-in-motion encryption:\n\n**•** Control plane <-> data plane is encrypted\n\n**•** Offers optional intra-cluster encryption\n\n**•** Customer code can be written to avoid unencrypted services (e.g., FTP)\n\n\n**Customers can isolate users at multiple levels:**\n\n**•** **Workspace level:** Each team or department can use a separate workspace\n\n**•** **Cluster level:** Cluster ACLs can restrict the users who can attach notebooks\nto a given cluster\n\n**•** **High concurrency clusters:** Process isolation, JVM whitelisting and limited\nlanguages (SQL, Python) allow for the safe coexistence of users of different\nprivilege levels, and are used with Table ACLs\n\n**•** **Single-user cluster:** Users can create a private dedicated cluster\n\nActivities of Databricks users are logged and can be delivered automatically to\na cloud storage bucket. Customers can also monitor provisioning activities by\nmonitoring cloud audit logs.\n\n\n-----\n\n**Compliance**\n\n**Databricks supports the following compliance standards on**\n**our multi-tenant platform:**\n\n**•** **SOC 2 Type II**\n\n**•** **ISO 27001**\n\n**•** **ISO 27017**\n\n**•** **ISO 27018**\n\nCertain clouds support Databricks deployment options for FedRAMP\nHigh, HITRUST, HIPAA and PCI. Databricks Inc. and the Databricks platform\nare also GDPR and CCPA ready.\n\n**Learn more**\n\nTo learn more about Databricks security,\nvisit the [Security and Trust Center](https://databricks.com/trust)\n\n\n-----\n\n**CHAPTER**\n\n# 06\n\n\n### Instant compute and serverless\n\n\n-----\n\n#### Benefits of Databricks Serverless SQL\n\nServerless SQL is much easier to administer with Databricks taking on the\nresponsibility of deploying, configuring and managing your cluster VMs. Databricks\ncan transfer compute capacity to user queries typically in about 15 seconds — so\nyou no longer need to wait for clusters to start up or scale out to run your queries.\n\nServerless SQL also has built-in connectors to your favorite tools such as Tableau,\nPower BI, Qlik, etc. These connectors use optimized JDBC/ODBC drivers for easy\nauthentication support and high performance. 
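\n\nProgrammatic clients can reach the same SQL endpoints those tools use. A minimal, hedged sketch with the Databricks SQL Connector for Python; the hostname, HTTP path and token are placeholders:\n\n```python\nfrom databricks import sql\n\n# Hedged sketch: query a SQL warehouse with the `databricks-sql-connector` package.\n# Connection details below are placeholders copied from the warehouse's connection tab.\nwith sql.connect(server_hostname='adb-1234567890123456.7.azuredatabricks.net',\n                 http_path='/sql/1.0/warehouses/abcdef1234567890',\n                 access_token='dapiXXXXXXXXXXXX') as connection:\n    with connection.cursor() as cursor:\n        cursor.execute('SELECT current_date() AS today')\n        print(cursor.fetchall())\n```\n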
And finally, you save on cost\nbecause you do not need to overprovision or pay for the idle capacity.\n\n\n#### What is serverless compute?\n\nServerless compute is a fully managed service where Databricks provisions\nand manages the compute layer on behalf of the customer in the Databricks\ncloud account instead of the customer account. As of the current release,\nserverless compute is supported for use with Databricks SQL. This new\ncapability for Databricks SQL provides instant compute to users for their\nBI and SQL workloads, with minimal management required and capacity\noptimizations that can lower overall cost by 20%-40% on average. This\nmakes it even easier for organizations to expand adoption of the lakehouse\nfor business analysts who are looking to access the rich, real-time data sets\nof the lakehouse with a simple and performant solution.\n\n\n-----\n\n**Inside Serverless SQL**\n\n_Diagram: Databricks Serverless SQL, a pool of managed servers providing secure, instant Serverless SQL compute._\n\nAt the core of Serverless SQL is a compute\nplatform that operates a pool of servers located\nin a Databricks account, running Kubernetes\ncontainers that can be assigned to a user\nwithin seconds.\n\nWhen many users are running reports or queries\nat the same time, the compute platform adds more\nservers to the cluster (again, within seconds) to\nhandle the concurrent load. Databricks manages\nthe entire configuration of the server and\nautomatically performs the patching and upgrades\nas needed.\n\nEach server is running a secure configuration and\nall processing is secured by three layers of isolation:\nThe Kubernetes container hosting the runtime; the\nvirtual machine (VM) hosting the container; and\nthe virtual network for the workspace. Each layer\nis isolated to one workspace with no sharing or\ncross-network traffic allowed. The containers use\nhardened configurations, VMs are shut down and\nnot reused, and network traffic is restricted\nto nodes in the same cluster.\n\n\n-----\n\n#### Performance of Serverless SQL\n\nWe ran a set of internal tests to compare\nDatabricks Serverless SQL to the current\nDatabricks SQL and several traditional cloud\ndata warehouses. We found Serverless SQL\nto be the most cost-efficient and performant\nenvironment to run SQL workloads when\nconsidering cluster startup time, query\nexecution time and overall cost.\n\n\n**Databricks Serverless SQL is the highest**\n**performing and most cost-effective solution**\n\n_Chart: cloud SQL solutions compared by query execution time, startup time and cost estimate. Serverless SQL starts in roughly 2-3 seconds versus around 5 minutes for the cloud data warehouses tested (CDW1-CDW4), with faster query execution and a lower cost estimate._\n\n**Learn more**\n\nThe feature is currently in Public Preview. Sign up to\n[request access to Serverless SQL](https://databricks.com/p/ebook/serverless-sql-preview-sign-up) . To learn more about\nServerless SQL, visit our [documentation page.](https://docs.databricks.com/serverless-compute/index.html)\n\n\n-----\n\n**CHAPTER**\n\n# 07\n\n\n### Data warehousing\n\nData warehouses are not keeping up with today’s world. The explosion of\nlanguages other than SQL and unstructured data, machine learning, IoT and\nstreaming analytics is forcing organizations to adopt a bifurcated architecture\nof disjointed systems: Data warehouses for BI and data lakes for ML. 
While SQL\nis ubiquitous and known by millions of professionals, it has never been treated\nas a first-class citizen on data lakes, until the lakehouse.\n\n\n-----\n\n#### What is data warehousing\n\nThe Databricks Lakehouse Platform provides a simplified multicloud and\nserverless architecture for your data warehousing workloads. Data warehousing on\nthe lakehouse allows SQL analytics and BI at scale with a common governance\nmodel. Now you can ingest, transform and query all your data in-place — using\nyour SQL and BI tools of choice — to deliver real-time business insights at the\nbest price/performance. Built on open standards and APIs, the lakehouse\nprovides the reliability, quality and performance that data lakes natively lack,\nand integrations with the ecosystem for maximum flexibility — no lock-in.\n\nWith data warehousing on the lakehouse, organizations can unify all analytics\nand simplify their architecture to enable their business with real-time business\ninsights at the best price/performance.\n\n\n#### Key benefits\n\n**Best price/performance**\n\nLower costs, get the best price/performance and eliminate\nresource management overhead\n\nOn-premises data warehouses have reached their limits — they physically\ncannot scale to handle the growing volumes of data, and don’t provide the\nelasticity customers need to respond to ever-changing business needs.\nCloud data warehouses are a great alternative to on-premises data\nwarehouses, providing greater scale and elasticity, but cloud costs for\nproprietary cloud data warehouses typically yield to an exponential cost\nincrease following the growth of data volume.\n\nThe Databricks Lakehouse Platform provides instant, elastic SQL serverless\ncompute — decoupled from storage on cheap cloud object stores — and\nthousands of performance optimizations that can lower overall infrastructure\ncosts by [an average of 40%](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) . Databricks automatically determines instance\ntypes and configuration for the best price/performance — [up to 12x better](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[than traditional cloud data warehouses](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) — and scale for high concurrency\nuse cases.\n\n\n-----\n\n**Built-in governance**\n\nOne source of truth and one unified\ngovernance layer across all data teams\n\nUnderpinned by Delta Lake, the Databricks\nLakehouse Platform simplifies your architecture by\nallowing you to establish one single copy of all your\ndata for in-place analytics and ETL/ELT on your\nexisting data lakes — no more data movements\nand copies in disjointed systems. Then, seamless\nintegration with Databricks Unity Catalog lets you\neasily discover, secure and manage all your data\nwith fine-grained governance, data lineage, and\nstandard SQL.\n\n**Rich ecosystem**\n\nIngest, transform and query all your\ndata in-place with your favorite tools\n\nVery few tools exist to conduct BI on data lakes.\nGenerally, doing so has required data analysts to\n\nsubmit Spark jobs or use a developer interface.\nWhile these tools are common for data scientists,\nthey require knowledge of languages and\ninterfaces that are not traditionally part of a data\nanalyst’s tool set. 
As a result, the learning curve for\nan analyst to make use of a data lake is too high\nwhen well-established tools and methods already\nexist for data warehouses.\n\n\nThe Databricks Lakehouse Platform works with\nyour preferred tools like dbt, Fivetran, Power BI or\nTableau, allowing analysts and analytical engineers\nto easily ingest, transform and query the most\nrecent and complete data, without having to move\nit into a separate data warehouse. Additionally, it\nempowers every analyst across your organization\nto quickly and collaboratively find and share new\ninsights with a built-in SQL editor, visualizations\nand dashboards.\n\n**Break down silos**\n\nAccelerate time from raw to actionable\ndata and go effortlessly from BI to ML\n\nIt is challenging for data engineering teams to\nenable analysts at the speed that the business\nrequires. Data warehouses need data to be\ningested and processed ahead of time before\nanalysts can access and query it using BI tools.\nBecause traditional data warehouses lack\nreal-time processing and do not scale well for\nlarge ETL jobs, they create new data movements\nand bottlenecks for the data engineering team,\nand make it slow for analysts to access the\nlatest data. And for advanced analytics (ML)\napplications, organizations will need to manage\nan entirely different system than their SQL-only\ndata warehouse, slowing down collaboration and\ninnovation.\n\nThe Databricks Lakehouse Platform provides the\nmost complete end-to-end data warehousing\nsolution for all your modern analytics needs,\nand more. Now you can empower data teams\nand business users to access the latest data\nfaster for downstream real-time analytics and go\neffortlessly from BI to ML. Speed up the time from\nraw to actionable data at any scale — in batch and\nstreaming. And go from descriptive to advanced\nanalytics effortlessly to uncover new insights.\n\n\n-----\n\n**Data warehousing on Databricks**\n\n_Diagram: data warehousing on Databricks. A truly decoupled, serverless compute layer serves data consumers; Unity Catalog governs data processing through ETL across Bronze (raw), Silver (staging) and Gold (DW/marts) tables on an open storage layer; continuous and batch ingest (including Databricks Partner Connect) pull from sources such as on-premises DWH, OLTP, OLAP, Hadoop, third-party data, IoT devices, SaaS applications and social data._\n\n\n**Learn more**\n\n\n[Try Databricks SQL for free](https://dbricks.co/dbsql)\n\n[Databricks SQL Demo](https://databricks.com/discover/demos/databricks-sql)\n\n[Databricks SQL Data](https://youtu.be/jlEdoVpWwNc)\n[Warehousing Admin Demo](https://youtu.be/jlEdoVpWwNc)\n\n\n[On-demand Webinar: Learn](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n[Databricks SQL From the Experts](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n\n[eBook: Inner Workings of the](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n[Lakehouse for Analytics and BI](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n\n\n-----\n\n**CHAPTER**\n\n# 08\n\n\n### Data engineering\n\nOrganizations realize the value data plays as a strategic asset for growing\nrevenues, improving the customer experience, operating efficiently or improving\na product or service. 
Data is really the driver of all these initiatives. Nowadays,\ndata is often streamed and ingested from hundreds of different data sources,\nsometimes acquired from a data exchange, cleaned in various ways with\ndifferent orchestrated steps, versioned and shared for analytics and AI.\nAnd increasingly, data is being monetized.\n\nData teams rely on getting the right data at the right time for analytics, data\nscience and machine learning, but often are faced with challenges meeting\nthe needs of their initiatives for data engineering.\n\n\n-----\n\n#### Why data engineering is hard\n\nOne of the biggest challenges is accessing and managing the increasingly\ncomplex data that lives across the organization. Most of the complexity\narises with the explosion of data volumes and data types, with organizations\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n\nWith this volume, managing data pipelines to transform and process data\nis slow and difficult, and increasingly expensive. And to top off the complexity,\nmost businesses are putting an increased emphasis on multicloud\nenvironments which can be even more difficult to maintain.\n\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\nthat data itself has become a product, and the challenging goal of the data\nengineer is to build and run the machinery that creates this high-fidelity\ndata product all the way from ingestion to monetization.\n\n\nDespite current technological advances data engineering remains\ndifficult for several reasons:\n\n**Complex data ingestion methods**\n\nData ingestion means retrieving batch and streaming data from various\nsources and in various formats. Ingesting data is hard and complex since you\neither need to use an always-running streaming platform like Apache Kafka\nor you need to be able to keep track of which files haven’t been ingested yet.\nData engineers are required to spend a lot of time hand-coding repetitive\nand error-prone data ingestion tasks.\n\n**Data engineering principles**\n\nThese days, large operations teams are often just a memory of the past.\nModern data engineering principles are based on agile software development\nmethodologies. They apply the well-known “you build it, you run it” paradigm,\nuse isolated development and production environments, CI/CD, and version\ncontrol transformations that are pushed to production after validation. Tooling\nneeds to support these principles.\n\n\n-----\n\n**Third-party tools**\n\nData engineers are often required to run additional third-party tools for\norchestration to automate tasks such as ELT/ETL or customer code in\nnotebooks. Running third-party tools increases the operational overhead\nand decreases the reliability of the system.\n\n**Performance tuning**\n\nFinally, with all pipelines and workflows written, data engineers need to\nconstantly focus on performance, tuning pipelines and architectures to meet\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\narchitecture and constantly observing throughput parameters.\n\nMost organizations are dealing with a complex landscape of data warehouses\nand data lakes these days. 
Each of those platforms has its own limitations,\nworkloads, development languages and governance model.\n\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The lakehouse platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability\nto drive valuable insights.\n\nData engineering in the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows\non any cloud platform, and meet regulatory requirements to maintain\nworld-class governance.\n\nThe lakehouse provides an end-to-end data engineering and ETL platform\nthat automates the complexity of building and maintaining pipelines and\nrunning ETL workloads so data engineers and analysts can focus on quality\nand reliability to drive valuable insights.\n\n\n#### Databricks makes modern data engineering simple\n\nThere is no industry-wide definition of modern data engineering.\nThis should come close:\n\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\n_kinds of workflows._\n\n\n-----\n\n-----\n\n#### Benefits of data engineering on the lakehouse\n\nBy simplifying and modernizing with the lakehouse architecture, data engineers\ngain an enterprise-grade and enterprise-ready approach to building data\npipelines. 
The following are eight key differentiating capabilities that a data\nengineering solution team can enable with the Databricks Lakehouse Platform:\n\n**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\nengineers can enable fast, reliable, scalable and automatic data ingestion\nfor analytics, data science or machine learning.\n\n\n\n**•** **Data pipeline observability:** Monitor overall data pipeline estate status\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\nhealth for performance, quality, status and latency.\n\n**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\nanalytics and machine learning use cases by enabling easy and automatic\ndata pipeline deployments into production or roll back pipelines and\nminimize downtime.\n\n**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\nof data processing tasks for data and machine learning pipelines with the\nability to run multiple non-interactive tasks as a directed acyclic graph\n(DAG) on a Databricks compute cluster.\n\n\n\n**•** **Automated ETL pipelines:** Data engineers can reduce development\ntime and effort and focus on implementing business logic and data\nquality checks within the data pipeline using SQL or Python.\n\n**•** **Data quality checks:** Improve data reliability throughout the data\nlakehouse so data teams can confidently trust the information for\ndownstream initiatives with the ability to define data quality and\nautomatically address errors.\n\n**•** **Batch and streaming:** Allow data engineers to set tunable data latency\nwith cost controls without having to know complex stream processing\nand implement recovery logic.\n\n**•** **Automatic recovery:** Handle transient errors and use automatic recovery\nfor most common error conditions that can occur during the operation of\na pipeline with fast, scalable fault-tolerance.\n\n\n-----\n\n**Data engineering is all about data quality**\n\nThe goal of modern data engineering is to distill data with a quality that is fit for\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\nthree different levels.\n\n\n1. On a **technical level** , data quality is\nguaranteed by enforcing and evolving\nschemas for data storage and ingestion.\n\n**Kenesis**\n\n**CSV,**\n**JSON, TXT...**\n\n**Data Lake**\n\n\n2. On an **architectural level** , data quality is\noften achieved by implementing the medallion\narchitecture. A medallion architecture is a data\ndesign pattern used to logically organize data in\na [lakehouse](https://databricks.com/glossary/data-lakehouse) with the goal of incrementally and\nprogressively improving the structure and quality\nof data as it flows through each layer of the\narchitecture, e.g., from Bronze to Silver to Gold\nlayer tables.\n\n\n3. The **Databricks Unity Catalog** comes\nwith robust data quality management with\nbuilt-in quality controls, testing, monitoring\nand enforcement to ensure accurate and\nuseful data is available for downstream BI,\nanalytics and machine learning workloads.\n\n**Streaming**\n**analytics**\n\n\n**Bronze**\n\n\n**Silver**\n\n\n**Gold**\n\n\n**BI and**\n\n**reporting**\n\n\nRaw ingestion Filtered, cleaned, Business-level\nand history augmented aggregates\n\n**Quality**\n\n\n**Data science**\n\n**and ML**\n\n\n-----\n\n#### Data ingestion\n\nWith the Databricks Lakehouse Platform, data engineers can build robust\nhyper-scale ingestion pipelines in streaming and batch mode. 
They can\nincrementally process new files as they land on cloud storage — with no\nneed to manage state information — in scheduled or continuous jobs.\n\nData engineers can efficiently track new files (with the ability to scale\nto billions of files) without having to list them in a directory. Databricks\nautomatically infers the schema from the source data and evolves it as\nthe data loads into the Delta Lake lakehouse. Efforts continue with\nenhancing and supporting Auto Loader, our powerful data ingestion\ntool for the Lakehouse.\n\n**What is Auto Loader?**\n\nHave you ever imagined that ingesting data could become as easy\nas dropping a file into a folder? Welcome to Databricks Auto Loader.\n\n[Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) is an optimized data ingestion tool that incrementally and\nefficiently processes new data files as they arrive in the cloud storage built\ninto the Databricks Lakehouse. Auto Loader can detect and enforce the\nschema of your data and, therefore, guarantee data quality. New files or\nfiles that have been changed since the last time new data was processed\nare identified automatically and ingested. Noncompliant data sets are\nquarantined into rescue data columns. You can use the [trigger once]\noption with Auto Loader to turn it into a job that turns itself off.\n\n\n**Ingestion for data analysts: COPY INTO**\n\nIngestion also got much easier for data analysts and analytics engineers working\nwith Databricks SQL. [COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) is a simple SQL command that follows the\nlake-first approach and loads data from a folder location into a Delta Lake table.\nCOPY INTO can be scheduled and called by a job repeatedly. When run, only new\nfiles from the source location will be processed.\n\n#### Data transformation\n\nTurning SQL queries into production ETL pipelines typically involves a lot\nof tedious, complicated operational work. Even at a small scale, the majority\nof a data practitioner’s time is spent on tooling and managing infrastructure.\n\nAlthough the medallion architecture is an established and reliable pattern\nfor improving data quality, the implementation of this pattern is challenging\nfor many data engineering teams.\n\nWhile hand-coding the medallion architecture was hard for data engineers,\ncreating data pipelines was outright impossible for data analysts not being\nable to code with Spark Structured Streaming in Scala or Python.\n\nEven at a small scale, most data engineering time is spent on tooling and\nmanaging infrastructure rather than transformation. Auto-scaling, observability\nand governance are difficult to implement and, as a result, often left out of the\nsolution entirely.\n\n\n-----\n\n#### What is Delta Live Tables?\n\nDelta Live Tables (DLT) is the first ETL framework that uses a simple **declarative approach** to building reliable data pipelines. DLT automatically auto-scales your\ninfrastructure so data analysts and engineers can spend less time on tooling and focus on getting value from data. Engineers are able to **treat their data as code**\nand apply modern software engineering best practices like testing, error-handling, monitoring and documentation to deploy reliable pipelines at scale. 
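As a rough illustration of this declarative style, a bronze-to-silver flow might be sketched in Python as follows. This is a minimal sketch, not a complete pipeline: the landing path, table names and expectation rule are hypothetical, and the code only runs in the context of a DLT pipeline (where `spark` is provided).

```python
import dlt
from pyspark.sql import functions as F

# Bronze: incrementally ingest raw JSON files with Auto Loader (cloudFiles)
@dlt.table(comment="Raw orders ingested incrementally from cloud storage")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .option("cloudFiles.inferColumnTypes", "true")
        .load("/Volumes/main/sales/raw_orders/")  # hypothetical landing path
    )

# Silver: cleanse and enforce a basic data quality expectation
@dlt.table(comment="Cleansed orders ready for downstream analytics")
@dlt.expect_or_drop("valid_amount", "amount > 0")  # drop rows that violate the rule
def orders_silver():
    return (
        dlt.read_stream("orders_bronze")
        .withColumn("order_ts", F.to_timestamp("order_ts"))
        .dropDuplicates(["order_id"])
    )
```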
DLT fully supports\nboth Python and SQL and is tailored to work with both streaming and batch workloads.\n\nWith DLT you write a Delta Live Table in a SQL notebook, create a pipeline under Workflows and simply click [Start].\n\n\n**Write** **create live table**\n\n\n**Create** **a pipeline** **Click** **Start**\n\nStart\n\n\n-----\n\nDLT reduces the implementation time by accelerating development and\nautomating complex operational tasks. Since DLT can use plain SQL, it also\nenables data analysts to create production pipelines and turns them into\nthe often discussed “analytics engineer.” At runtime, DLT speeds up pipeline\nexecution applied with Photon.\n\nSoftware engineering principles are applied for data engineering to foster the\nidea of treating your data as code. Your data is the sole source of truth for what\nis going on inside your business.\n\nBeyond just the transformations, there are many things that should be included\n\nDependency\nFull refresh\nmanagement\n\n*Coming soon\n\n\nin the code that define your data. Declaratively express entire data flows in SQL\nor Python. Natively enable modern software engineering best practices like\nseparate development and production environments, the ability to easily test\nbefore deploying, deploy and manage environments using parameterization, unit\ntesting and documentation.\n\nDLT also automatically scales compute, providing the option to set the minimum\nand maximum number of instances and let DLT size up the cluster according\nto cluster utilization. In addition, tasks like orchestration, error handling and\nrecovery, and performance optimization are all handled automatically.\n\n\nIncremental\ncomputation*\n\n\nCheckpointing\nand retries\n\n\n-----\n\nExpectations in the code help prevent bad data from flowing into tables, track\ndata quality over time, and provide tools to troubleshoot bad data with granular\npipeline observability. This enables a high-fidelity lineage diagram of your\npipeline to track dependencies and aggregate data quality metrics across all\nyour pipelines.\n\nUnlike other products that force you to deal with streaming and batch workloads\nseparately, DLT supports any type of data workload with a single API so data\nengineers and analysts alike can build cloud-scale data pipelines faster without\nthe need for advanced data engineering skills.\n\n#### Data orchestration\n\nThe lakehouse makes it much easier for businesses to undertake ambitious data\nand machine learning (ML) initiatives. However, orchestrating and managing\nend-to-end production workflows remains a bottleneck for most organizations,\nrelying on external tools or cloud-specific solutions that are not part of their\nlakehouse platform. Tools that decouple task orchestration from the underlying\ndata processing platform reduce the overall reliability of their production\nworkloads, limit observability, and increase complexity for end users.\n\n#### What is Databricks Workflows?\n\n[Databricks Workflows](https://databricks.com/product/workflows) is the first fully managed and integrated lakehouse\n[orchestration](https://databricks.com/glossary/orchestration) service that allows data teams to build reliable workflows on\nany cloud.\n\n\nWorkflows lets you orchestrate data flow pipelines (written in DLT or dbt),\nas well as machine learning pipelines, or any other tasks such as notebooks\nor Python wheels. 
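A job like that can be defined through the UI or programmatically. The sketch below creates a two-task DAG via the Jobs API; it is illustrative only, and the workspace URL, token, notebook path, pipeline ID and cluster settings are placeholders (required cluster fields vary by cloud).

```python
import requests

HOST = "https://<workspace-url>"      # placeholder workspace URL
TOKEN = "<personal-access-token>"     # placeholder credential

# A minimal two-task DAG: run an ingestion notebook, then trigger a DLT pipeline update
job_spec = {
    "name": "daily-orders-workflow",
    "tasks": [
        {
            "task_key": "ingest",
            "notebook_task": {"notebook_path": "/Repos/etl/ingest_orders"},
            "job_cluster_key": "shared_cluster",
        },
        {
            "task_key": "transform",
            "depends_on": [{"task_key": "ingest"}],
            "pipeline_task": {"pipeline_id": "<dlt-pipeline-id>"},
        },
    ],
    "job_clusters": [
        {
            "job_cluster_key": "shared_cluster",
            "new_cluster": {
                "spark_version": "<runtime-version>",
                "node_type_id": "<node-type>",
                "num_workers": 2,
            },
        }
    ],
}

resp = requests.post(
    f"{HOST}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=job_spec,
)
resp.raise_for_status()
print("Created job:", resp.json()["job_id"])
```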
Since Databricks Workflows is fully managed, it eliminates\noperational overhead for data engineers, enabling them to focus on your\nworkflows not on managing your infrastructure. It provides an easy point-and-click\nauthoring experience for all your data teams, not just those with specialized skills.\nDeep integration with the underlying lakehouse platform ensures you will create\nand run reliable production workloads on any cloud while providing deep and\ncentralized monitoring with simplicity for end users.\n\nSharing job clusters over multiple tasks reduces the time a job takes, reduces\ncosts by eliminating overhead and increases cluster utilization with parallel tasks.\n\n\n-----\n\nDatabricks Workflows’ deep integration with the lakehouse can best be seen with its monitoring and observability features. The matrix view in the following graphic\nshows a history of runs for a job. Failed tasks are marked in red. A failed job can be repaired and rerun with the click of a button. Rerunning a failed task detects and\ntriggers the execution of all dependent tasks.\n\nYou can create workflows with the UI, but also through the Databricks Workflows API, or with external orchestrators such as Apache Airflow. Even if you are using an\n\nexternal orchestrator, Databricks Workflows’ monitoring acts as a single pane of glass that includes externally triggered workflows.\n\n\n-----\n\n#### Orchestrate anything\n\nRemember that DLT is one of many task types for Databricks Workflows.\nThis is where the managed data flow pipelines with DLT tie together with\nthe easy point-and-click authoring experience of Databricks Workflows.\n\nIn the following example, you can see an end-to-end workflow built with\ncustomers in a workshop: Data is streamed from Twitter according to search\nterms, then ingested with Auto Loader using automatic schema detection and\nenforcement. In the next step, the data is cleaned and transformed with Delta\nLive table pipelines written in SQL, and finally run through a pre-trained BERT\nlanguage model from Hugging Face for sentiment analysis of the tweets.\nDifferent task types for ingest, cleanse/transform and ML are combined\nin a single workflow.\n\nUsing Workflows, these tasks can be scheduled to provide a daily overview of\nsocial media coverage and customer sentiment for a business. After streaming\ntweets with filtering for keywords such as “data engineering,” “lakehouse” and\n“Delta Lake,” we curated a list of those tweets that were classified as positive\nwith the highest probability score.\n\n**Learn more**\n\n\n[Data Engineering on the](https://databricks.com/solutions/data-pipelines)\n[Lakehouse](https://databricks.com/solutions/data-pipelines)\n\n\n[Delta Live Tables](https://databricks.com/product/delta-live-tables)\n\n[Databricks Workflows](https://www.databricks.com/product/workflows)\n\n\n[Big Book of Data Engineering](https://databricks.com/p/ebook/the-big-book-of-data-engineering?itm_data=datapipelines-promo-bigbookofde)\n\n\n-----\n\n**CHAPTER**\n\n### Data streaming\n# 09\n\n\n**CHAPTER**\n\n\nThere are two types of data processing: batch processing\nand streaming processing.\n\n\nBatch processing refers to the discontinuous, periodic processing\nof data that has been stored for a period of time. For example,\nan organization may need to run weekly reports on a set of\npredictable transaction data. 
There is no need for this data\nto be streaming — it can be processed on a weekly basis.\n\nStreaming processing, on the other hand, refers to unbounded\nprocessing of data as it arrives.\n\n\n-----\n\n**Data Streaming Challenges**\n\nHowever, getting value from streaming data can be a tricky practice. While most\ndata today can be considered streaming data, organizations are overwhelmed by\nthe need to access, process and analyze the volume, speed and variety of this\ndata moving through their platforms. To keep pace with innovation, they must\nquickly make sense of data streams decisively, consistently and in real time.\n\nThree common technical challenges organizations experience\nwith implementing real-time data streaming include:\n\n**•** **Specialized APIs and language skills:** Data practitioners encounter\nbarriers to adopting streaming skillsets because there are new languages,\nAPIs and tools to learn.\n\n**•** **Operational complexity:** To implement data streaming at scale, data\nteams need to integrate and manage streaming-specific tools with\ntheir other cloud services. They also have to manually build complex\noperational tooling to help these systems recover from failure, restart\nworkloads without reprocessing data, optimize performance, scale the\nunderlying infrastructure, and so on.\n\n**•** **Incompatible governance models:** Different governance and security\nmodels across real-time and historical data platforms makes it difficult\nto provide the right access to the right users, see the end-to-end data\nlineage, and/or meet compliance requirements.\n\n\nIn a wide variety of cases, an organization might find it useful to\nleverage streaming data. Here are some common examples:\n\n**•** **Retail:** Real-time inventory updates help support business activities, such\nas inventory and pricing optimization and optimization of the supply chain,\nlogistics and just-in-time delivery.\n\n**•** **Smart energy:** Smart meter monitoring in real time allows for smart\nelectricity pricing models and connection with renewable energy sources\nto optimize power generation and distribution.\n\n**•** **Preventative maintenance:** By reducing unplanned outages and\nunnecessary site and maintenance visits, real-time streaming analytics can\nlower operational and equipment costs.\n\n**•** **Industrial automation:** Manufacturers can use streaming and predictive\nanalytics to improve production processes and product quality, including\nsetting up automated alerts.\n\n**•** **Healthcare:** To optimize care recommendations, real-time data allows\nfor the integration of various smart sensors to monitor patient condition,\nmedication levels and even recovery speed.\n\n**•** **Financial institutions:** Firms can conduct real-time analysis of\n\ntransactions to detect fraudulent transactions and send alerts. They\ncan use fraud analytics to identify patterns and feed data into machine\nlearning algorithms.\n\n\nRegardless of specific use cases, the central tenet of streaming data is that it\ngives organizations the opportunity to leverage the freshest possible insights for\nbetter decision-making and more optimized customer experiences.\n\n\n-----\n\n**Data streaming architecture**\n\nBefore addressing these challenges head-on, it may help to take a step back and\ndiscuss the ingredients of a streaming data pipeline. 
Then, we will explain how\nthe Databricks Lakehouse Platform operates within this context to address the\naforementioned challenges.\n\nEvery application of streaming data requires a pipeline that brings the data from\nits origin point — whether sensors, IoT devices or database transactions — to its\nfinal destination.\n\nIn building this pipeline, streaming architectures typically employ two layers.\nFirst, streaming capture systems **capture** and temporarily store streaming data\nfor processing. Sometimes these systems are also called messaging systems\nor messaging buses. These systems are optimized for small payloads and high\nfrequency inputs/outputs. Second, streaming **processing** systems continuously\nprocess data from streaming capture systems and other storage systems.\n\n**Capturing** **Processing**\n\n\nIt may help to think of a simplified streaming pipeline\naccording to the following seven phases:\n\n1. Data is continuously generated at origin points\n\n2. The generated data is captured from those origin points by\na capture system like Apache Kafka (with limited retention)\n\n**3. The captured data is extracted and incrementally ingested to**\n**a processing platform like Databricks; data is ingested exactly**\n**once and stored permanently, even if this step is rerun**\n\n**4. The ingested data is converted into a workable format**\n\n**5. The formatted data is cleansed, transformed and joined in**\n**a number of pipeline steps**\n\n**6. The transformed data is processed downstream through**\n**analysis or ML modeling**\n\n7. The resulting analysis or model is used for some sort of practical\napplication, which may be anything from basic reporting to an\nevent-driven software application\n\nYou will notice four of the steps in this list are in boldface. This is because the\nlakehouse architecture is specifically designed to optimize this part of the\npipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\nanalyze and model on streaming data _alongside_ batch-processed data. It can\naccommodate both structured _and_ unstructured data. It is here that the value\nof unifying the best pieces of data lakes and data warehouses really shines for\ncomplex enterprise use cases.\n\n\n-----\n\n**Data Streaming on the Lakehouse**\n\nNow let’s zoom in a bit and see how the Databricks Lakehouse\nPlatform addresses each part of the pipeline mentioned above.\n\n**Streaming data ingestion and transformation** begins with continuously\nand incrementally collecting raw data from streaming sources through a\nfeature called Auto Loader. Once the data is ingested, it can be transformed\nfrom raw, messy data into clean, fresh, reliable data appropriate for downstream\nanalytics, ML or applications. [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) makes it easy to build and\nmanage these data pipelines while automatically taking care of infrastructure\nmanagement and scaling, data quality, error testing and other administrative\ntasks. DLT is a high-level abstraction built on Spark Structured Streaming,\na scalable and fault-tolerant stream processing engine.\n\n**[Real-time analytics](https://www.databricks.com/product/databricks-sql)** refers to the downstream analytical application\nof streaming data. 
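As a rough sketch of what that downstream consumption can look like, the example below reads a curated Delta table as an unbounded stream with Spark Structured Streaming and maintains a rolling aggregate for reporting. The table, column and checkpoint names are hypothetical.

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.appName("realtime-kpis").getOrCreate()

# Subscribe to the curated (gold) Delta table as a continuous stream
orders = spark.readStream.table("main.sales.orders_gold")

# Aggregate revenue into one-minute event-time windows, tolerating late data
revenue = (
    orders
    .withWatermark("order_ts", "10 minutes")
    .groupBy(F.window("order_ts", "1 minute"))
    .agg(F.sum("amount").alias("revenue"))
)

# Continuously append finalized windows to a table that BI tools query;
# the returned StreamingQuery keeps running until stopped
query = (
    revenue.writeStream
    .outputMode("append")
    .option("checkpointLocation", "/Volumes/main/sales/_checkpoints/revenue")
    .toTable("main.sales.revenue_per_minute")
)
```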
With fresher data streaming into SQL analytics or BI\nreporting, more actionable insights can be achieved, resulting in better\nbusiness outcomes.\n\n**[Real-time ML](https://www.databricks.com/product/machine-learning)** involves deploying ML models in a streaming mode. This\ndeployment is supported with structured streaming for continuous inference\nfrom a live data stream. Like real-time analytics, real-time ML is a downstream\nimpact of streaming data, but for different business use cases (i.e., AI instead\nof BI). Real-time modeling has many benefits, including more accurate\npredictions about the future.\n\n\n**Real-time applications** process data directly from streaming pipelines and\ntrigger programmatic actions, such as displaying a relevant ad, updating the\nprice on a pricing page, stopping a fraudulent transaction, etc. There typically\nis no human-in-the-loop for such applications.\n\n\nData in cloud storage and message stores\n\n\n-----\n\n**Databricks Lakehouse Platform differentiators**\n\nUnderstanding what the lakehouse architecture provides is one\n\nthing, but it is useful to understand how Databricks uniquely\n\napproaches the common challenges mentioned earlier around\n\nworking with streaming data.\n\n**Databricks empowers unified data teams.** Data engineers, data scientists\nand analysts can easily build streaming data workloads with the languages\nand tools they already know and the APIs they already use.\n\n**Databricks simplifies development and operations.** Organizations can\nfocus on getting value from data by reducing complexity and automating\nmuch of the production aspects associated with building and maintaining\nreal-time data workloads.\n\n\nSee why customers love streaming on the Databricks\nLakehouse Platform with these resources.\n\n**Learn more**\n\n[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n\n[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n\n[Structured Streaming Documentation](https://docs.databricks.com/spark/latest/structured-streaming/index.html)\n\n[Streaming — Getting Started With Apache Spark on Databricks](https://databricks.com/spark/getting-started-with-apache-spark/streaming)\n\n\n**Databricks is one platform for streaming and batch data.** Organizations\ncan eliminate data silos, centralize security and governance models, and\nprovide complete support for all their real-time use cases under one roof —\nthe roof of the lakehouse.\n\nFinally — and perhaps most important — Delta Lake, the core of the [Databricks](https://www.databricks.com/product/data-lakehouse)\n\n[Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , was built for streaming from the ground up. Delta Lake is\ndeeply integrated with Spark Structured Streaming and overcomes many of\nthe limitations typically associated with streaming systems and files.\n\nIn summary, the Databricks Lakehouse Platform dramatically simplifies data\nstreaming to deliver real-time analytics, machine learning and applications on\none platform. And, that platform is built on a foundation with streaming at its\ncore. 
This means organizations of all sizes can use their data in motion and\nmake more informed decisions faster than ever.\n\n\n-----\n\n**CHAPTER**\n\n### Data science and machine learning\n# 10\n\n\n**CHAPTER**\n\n\nWhile most companies are aware of the potential benefits of applying\nmachine learning and AI, realizing these potentials can often be quite\nchallenging for those brave enough to take the leap. Some of the\nlargest hurdles come from siloed/disparate data systems, complex\nexperimentation environments, and getting models served in a\nproduction setting.\n\n\nFortunately, the Databricks Lakehouse Platform provides a helping\nhand and lets you use data to derive innovative insights, build\npowerful predictive models, and enable data scientists, ML engineers,\nand developers of all kinds to create within the space of machine\nlearning and AI.\n\n\n-----\n\n#### Databricks Machine Learning\n\n\n-----\n\n#### Exploratory data analysis\n\nWith all the data in one place, data is easily\nexplored and visualized from within the\nnotebook-style experience that provides support\nfor various languages (R, SQL, Python and Scala)\nas well as built-in visualizations and dashboards.\nConfidently and securely share code with\nco-authoring, commenting, automatic versioning,\nGit integrations and role-based access controls.\nThe platform provides laptop-like simplicity at\nproduction-ready scale.\n\n\n-----\n\n#### Model creation and management\n\nFrom data ingestion to model training and tuning, all the way through to\nproduction model serving and versioning, the Lakehouse brings the tools\nneeded to simplify those tasks.\n\nGet right into experimenting with the Databricks ML runtimes, optimized and\npreconfigured to include most popular libraries like scikit-learn, XGBoost and\nmore. Massively scale thanks to built-in support for distributed training and\nhardware acceleration with GPUs.\n\nFrom within the runtimes, you can track model training sessions, package and\nreuse models easily with [MLflow](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) , an open source machine learning platform\ncreated by Databricks and included as a managed service within the Lakehouse.\nIt provides a centralized location from which to manage models and package\ncode in an easily reusable way.\n\nTraining these models often involves the use of features housed in a centralized\nfeature store. Fortunately, Databricks has a built-in feature store that allows you\nto create new features, explore and re-use existing features, select features for\ntraining and scoring machine learning models, and publish features to low-latency\nonline stores for real-time inference.\n\nIf you are looking to get a head start, [AutoML](https://databricks.com/blog/2022/04/18/supercharge-your-machine-learning-projects-with-databricks-automl-now-generally-available.html) allows for low to no-code\nexperimentation by pointing to your data set and automatically training models\nand tuning hyperparameters to save both novice and advanced users precious\ntime in the machine learning process.\n\n\nAutoML will also report back metrics related to the model training results as well\nas the code needed to repeat the training already custom-tailored to your data\nset. 
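Underneath, experiments like these are tracked with MLflow, and the same pattern is available for hand-written training code. A minimal sketch follows; the synthetic dataset and parameter values are hypothetical stand-ins.

```python
import mlflow
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hypothetical training data standing in for a feature-store table
X, y = make_classification(n_samples=5000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run(run_name="churn-baseline"):
    params = {"n_estimators": 200, "max_depth": 8}
    model = RandomForestClassifier(**params, random_state=42).fit(X_train, y_train)

    # Log parameters, metrics and the fitted model for later comparison and serving
    acc = accuracy_score(y_test, model.predict(X_test))
    mlflow.log_params(params)
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, artifact_path="model")
```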
This glass box approach ensures that you are never trapped or suffer from\nvendor lock-in.\n\nIn that regard, the Lakehouse supports the industry’s widest range of data tools,\ndevelopment environments, and a thriving ISV ecosystem so you can make your\nworkspace your own and put out your best work.\n\n##### Compute platform\n\n**Any ML workload optimized and accelerated**\n\n**Databricks Machine Learning Runtime**\n\n- Optimized and preconfigured ML frameworks\n\n- Turnkey distribution ML\n\n- Built-in AutoML\n\n- GPU support out of the box\n\n\nBuilt-in **ML frameworks**\nand **model explainability**\n\nBuilt-in support for **AutoML**\nand **hyperparameter tuning**\n\n\nBuilt-in support for\n**distributed training**\n\nBuilt-in support for\n**hardware accelerators**\n\n\n-----\n\n#### Deploy your models to production\n\nExploring and creating your machine learning models\ntypically represents only part of the task. Once the\nmodels exist and perform well, they must become\npart of a pipeline that keeps models updated,\nmonitored and available for use by others.\n\n**Webhooks** allow registering of\n\n\nDatabricks can help here by providing a world-class\nexperience for model versioning, monitoring and\nserving within the same platform that you can use\nto generate the models themselves. This means you\ncan make all your ML pipelines in the same place,\nmonitor them for drift, retrain them with new data,\nand promote and serve them easily and at scale.\n\nThroughout the ML lifecycle, rest assured knowing\nthat lineage and governance are being tracked the\nentire way. This means regulatory compliance and\nsecurity woes are significantly reduced, potentially\nsaving costly issues down the road.\n\n\ncallbacks on events like stage\n\ntransitions to integrate with CI/CD\n\nautomation.\n\n**Tags** allow storing deployment\n\n— specific metadata with model\n\nversions, e.g., whether the\n\ndeployment was successful.\n\n\n**Model lifecycle management**\n\nStaging Production Archived\n\n\nLogged\nmodel\n\n**Comments** allow communication\n\nand collaboration between\n\nteammates when reviewing\n\nmodel versions.\n\n\n-----\n\n**Learn more**\n\n[Databricks Machine Learning](https://databricks.com/product/machine-learning)\n\n[Databricks Data Science](https://databricks.com/product/data-science)\n\n[Databricks ML Runtime Documentation](https://docs.databricks.com/runtime/mlruntime.html)\n\n\n-----\n\n**CHAPTER**\n\n# 11\n\n\n### Databricks Technology Partners and the modern data stack\n\nDatabricks Technology Partners integrate their solutions with Databricks to\nprovide complementary capabilities for ETL, data ingestion, business intelligence,\nmachine learning and governance. These integrations allow customers to leverage\nthe Databricks Lakehouse Platform’s reliability and scalability to innovate faster\nwhile deriving valuable data insights. Use preferred analytical tools with optimized\nconnectors for fast performance, low latency and high user concurrency to your\ndata lake.\n\n\n-----\n\nWith [Partner Connect](https://databricks.com/partnerconnect) , you can bring together all your data, analytics and AI tools on one open platform. 
Databricks provides a fast and easy way to connect your existing\ntools to your lakehouse using validated integrations and helps you discover and try new solutions.\n\n**Databricks thrives within your modern data stack**\n\n**BI and dashboards** **Machine learning** **Data science**\n\n\n**Data governance**\n\n**Data pipelines**\n\n**Data ingestion**\n\n\nData Data Data\nwarehousing engineering streaming\n\n**Unity Catalog**\n\n\nData science\nand ML\n\n\n**Consulting**\n**and SI partners**\n\n\n**Delta Lake**\n\n**Cloud Data Lake**\n\n**Learn more**\n\n\n[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n\n[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n\n\n[Partner Connect](https://databricks.com/partnerconnect)\n\n[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n\n\n-----\n\n**CHAPTER**\n\n### Get started with the Databricks Lakehouse Platform\n# 12\n\n\n-----\n\n#### Databricks Trial\n\nGet a collaborative environment for data teams to build solutions together with interactive\nnotebooks to use Apache Spark TM , SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras,\nscikit-learn and more.\n\n**•** Available as a 14-day full trial in your own cloud or as a lightweight trial\nhosted by Databricks\n\n**[Try Databricks for free](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n\n\n**[Databricks documentation](https://databricks.com/documentation)**\n\nGet detailed documentation to get started with\nthe Databricks Lakehouse Platform on your cloud\nof choice: Databricks on AWS, Azure Databricks\nand [Databricks on Google Cloud](https://docs.gcp.databricks.com/?_gl=1*16ovt38*_gcl_aw*R0NMLjE2NTI1NDYxNjIuQ2owS0NRandwdjJUQmhEb0FSSXNBTEJuVm5saU9ydGpfX21uT1U5NU5iRThSbmI5a3o2OGdDNUY0UTRzYThtTGhVZHZVb0NhTkRBMmlWc2FBcEN6RUFMd193Y0I.&_ga=2.135042808.863708747.1652113196-1440404449.1635787641&_gac=1.225252968.1652546163.Cj0KCQjwpv2TBhDoARIsALBnVnliOrtj__mnOU95NbE8Rnb9kz68gC5F4Q4sa8mLhUdvUoCaNDA2iVsaApCzEALw_wcB) .\n\n**[Databricks Demo Hub](https://databricks.com/discover/demos)**\n\nGet a firsthand look at Databricks from the\npractitioner’s perspective with these simple\non-demand videos. Each demo is paired with\nrelated materials — including notebooks, videos\nand eBooks — so that you can try it out for\nyourself on Databricks.\n\n\n**[Databricks Academy](https://databricks.com/learn/training/home)**\n\nWhether you are new to the data lake or building on\nan existing skill set, you can find a curriculum tailored\nto your role or interest. With training and certification\nthrough Databricks Academy, you will learn to master\nthe Databricks Lakehouse Platform for all your big\ndata analytics projects.\n\n**[Databricks Community](https://community.databricks.com/)**\n\n\n**[Databricks Labs](https://databricks.com/learn/labs)**\n\nDatabricks Labs are projects created by the\nfield to help customers get their use cases\ninto production faster.\n\n**[Databricks customers](https://databricks.com/customers)**\n\nDiscover how innovative companies across\nevery industry are leveraging the Databricks\nLakehouse Platform.\n\n\nGet answers, network with peers and solve\nthe world’s toughest problems, together.\n\n\n-----\n\n#### About Databricks\n\nDatabricks is the data and AI company. 
More than 7,000\norganizations worldwide — including Comcast, Condé Nast,\nH&M and over 40% of the Fortune 500 — rely on the Databricks\nLakehouse Platform to unify their data, analytics and AI. Databricks\nis headquartered in San Francisco, with offices around the globe.\nFounded by the original creators of Apache Spark™, Delta Lake\nand MLflow, Databricks is on a mission to help data teams solve the\nworld’s toughest problems. To learn more, follow Databricks on\n[Twitter](https://twitter.com/databricks) **,** [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n© Databricks 2022. All rights reserved. Apache, Apache Spark, Spark and the Spark\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "##### Guide\n\n## 6 Strategies for Building Personalized Customer Experiences\n\n\n-----\n\n### Contents\n\n**Introduction** ................................................................................................................................................................................................................. **3**\n\n**1.** **Building a Foundation for Personalization**\nLeveraging ML-Based Customer Entity Resolution ............................................................................................................................... **4**\n\n**2.** **Estimating Customer Lifetime Value**\nBuilding Brand Loyalty With Data ................................................................................................................................................................. **6**\n\n**3.** **Mitigating Customer Churn**\nBalancing Acquisition and Retention .......................................................................................................................................................... **10**\n\n**4.** **Streamlining Customer Analysis and Targeting**\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n\n**5.** **Assessing Consumer Interest Data**\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n\n**6.** **Delivering Personalized Customer Journeys**\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n\n**Conclusion**\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n\n\n-----\n\n### Introduction\n\nIn today’s experience-driven world, the most beloved brands are the ones that\nknow their customers. Customers are loyal to brands that recognize their needs\nand preferences — and tailor user journeys and engagements accordingly.\n\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\nbuying from a brand that personalizes the shopping and user experience to the\nwants and needs of the customer. 
And as organizations pursue omnichannel\nexcellence, these same high expectations of online experiences also extend to\nbrick-and-mortar locations — revealing for many merchants that personalized\nengagement is fundamental to attracting customers and expanding share of wallet.\n\nBut achieving a 360-degree view of your customers to serve personalized\nexperiences requires integrating various types of data — including demographics,\nbehavioral and transactional — to develop robust profiles. This guide focuses on six\nactionable strategic pillars for businesses to leverage automation, real-time data,\nAI-driven analysis and well-tuned ML models to architect and deliver customized\ncustomer experiences at every touch point.\n\n\n# 76%\n\nof consumers are more\nlikely to purchase due to\npersonalization\n\n\n# 76%\n\n\n-----\n\n### Building a Foundation for Personalization\n\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\n\n\nTo create truly personalized interactions, you need actionable insights\nabout your customers. Start by establishing a common customer profile and\naccurately linking together customer records across disparate data sets.\n\nGet a 360-degree view of your target customer by bringing together:\n\n- Sales and traffic-driven first-party data\n\n- Product ratings and surveys\n\n- Customer surveys and support center calls\n\n- Third-party data purchased from data aggregators and online trackers\n\n- Zero-party data provided by customers themselves\n\nLocation\n\n\n**C A S E S T U DY**\n\n**Personalizing‌ ‌experiences‌ with‌ ‌data‌ ‌and‌ ‌ML‌**\n\nGrab is the largest online-to-offline platform in Southeast Asia and\nhas generated over 6 billion transactions for transport, food and\ngrocery delivery, and digital payments. Grab uses Databricks to create\nsophisticated customer segmentation and recommendation engines\nthat can now ingest and optimize thousands of user-generated signals\nand data sources simultaneously, enhancing data integrity and security,\nand reducing weeks of work to only hours.\n\n[Get the full story](https://www.databricks.com/customers/grab)\n\n\n\nDemographics\n\n\nOrders\n\nNetwork/\nUsage\n\n\n“The C360 platform empowered teams to create\nconsumer features at scale, which in turn allows\nfor these features to be extended to other markets\nand used by other teams. This helps to reduce the\nengineering overhead and costs exponentially.”\n\n**N I K H I L DWA R A K A N AT H**\nHead of Analytics, Grab\n\n\nSocial\n\nApps/\nClickstream\n\n|Col1|Col2|Col3|Col4|Col5|Col6|\n|---|---|---|---|---|---|\n|||||||\n||Cus 3|t 6|o|mer 0||\n|||||||\n|||||||\n\n\n\nService Call/\nRecords\n\n\nCustomer\n360\n\n\nBilling\n\nDevices\n\n\n-----\n\nGiven the different data sources and data types, automated matching can still\nbe incredibly challenging due to inconsistent formats, misinterpretation of data,\nand entry errors across various systems. And even if inconsistent, all that data\nmay be perfectly valid — but to accurately connect the millions of customer\nidentities most retailers manage, businesses must lean on automation.\n\nIn a machine learning (ML) approach to entity resolution, text attributes like\nname, address and phone number are translated into numerical representations\nthat can be used to quantify the degree of similarity between any two attribute\nvalues. But your ability to train such a model depends on your access to\naccurately labeled training data. 
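To make the featurization step concrete, here is a toy pair-scoring sketch. It is illustrative only and does not reflect Zingg’s internals; the record fields and values are hypothetical.

```python
from difflib import SequenceMatcher

def similarity(a: str, b: str) -> float:
    """Normalized string similarity in [0, 1]."""
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()

def pair_features(rec_a: dict, rec_b: dict) -> list[float]:
    """Turn a candidate record pair into numeric features, one per attribute."""
    return [
        similarity(rec_a["name"], rec_b["name"]),
        similarity(rec_a["address"], rec_b["address"]),
        similarity(rec_a["phone"], rec_b["phone"]),
    ]

a = {"name": "Jon Smith",  "address": "12 Main St.",    "phone": "555-0101"}
b = {"name": "John Smith", "address": "12 Main Street", "phone": "5550101"}

# Three similarity scores that would feed a match / no-match classifier
print(pair_features(a, b))
```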
It’s a time-consuming exercise, but if done right,\nthe model learns to reflect the judgments of the human reviewers.\n\nMany organizations rely on libraries encapsulating this knowledge to build their\napplications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\nbringing together ML-based approaches to intelligent candidate pair generation\nand pair-scoring. Oriented toward the construction of custom workflows, Zingg\npresents these capabilities within the context of commonly employed steps\nsuch as training data label assignment, model training, data set deduplication,\nand (cross-data set) record matching.\n\nBuilt as a native Apache Spark TM application, Zingg scales well to apply these\ntechniques to enterprise-sized data sets. Organizations can then use Zingg in\ncombination with platforms such as Databricks Lakehouse to provide the back\nend to human-in-the-middle workflow applications that automate the bulk of\nthe entity resolution work and present data experts with a more manageable\nset of edge case pairs to interpret.\n\n\nAs an active-learning solution, models can be retrained to take advantage of\nthis additional human input to improve future predictions and further reduce\nthe number of cases requiring expert review. Finally, these technologies can be\nassembled to enable their own enterprise-scaled customer entity resolution\nworkflow applications.\n\n**Need help building your foundation for a**\n**360-degree view of your customers?**\n\nGet pre-built code sample data and step-by-step instructions\nin a Databricks notebook in the **Customer Entity Resolution**\n**Solution Accelerator.**\n\n**•** Translating text attributes (like name, address, phone number)\ninto quantifiable numerical representations\n\n**•** Training ML models to determine if these numerical labels\nform a match\n\n**•** Scoring the confidence of each match\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-entity-resolution)**\n\n\n-----\n\n### Estimating Customer Lifetime Value\n\nBuilding brand loyalty to drive share of wallet with data\n\n\nOnce you’ve set up a 360-degree view of the customer, the next challenge\nis how to spend money to profitably grow the brand. The goal is to spend\nmarketing dollars on activities that attract loyal customers and avoid spending on\nunprofitable customers or activities that damage the brand. Keep in mind, that\nmaking decisions solely based on ROI isn’t the answer. This one-track approach\ncould ultimately weaken your brand equity and make you more dependent on\nlowering your price through promotions as a way to generate sales.\n\n**C A S E S T U DY**\n\n\n**Identifying and engaging brand loyalists**\n\nToday’s customer has overwhelmingly abundant options in products and\nservices to choose from. That’s why personalizing customer experiences is so\nimportant, as it increases revenue, marketing efficiency and customer retention.\n\nNot every customer carries the same potential for profitability. Different\ncustomers derive different value from your products and services, which directly\ntranslates into differences in the overall amount of value a business can expect\nin return. 
Mutually beneficial relationships carefully align customer acquisition\ncost (CAC) and retention rates with the total revenue or customer lifetime value\n(CLV).\n\n\n**Predicting and increasing customer lifetime value with ML**\n\n\nKolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\nattracts over 10 million monthly active users. With Databricks, they\nachieved a 30% increase in player LTV, improved data team productivity\nby 3x, and reduced ML model-to-production time by 40x.\n\n[Get the full story](https://databricks.com/customers/kolibri-games)\n\nWithin your existing customer base are people ranging from brand loyalists to\nbrand transients. Brand loyalists are highly engaged with your brand, are willing\nto share their experience with others, and are the most likely to purchase\nagain. Brand transients have no loyalty to your brand and shop based on price.\nYour focus should be on growing the group of brand loyalists while minimizing\ninteractions with brand transients.\n\n\n**Calculating customers’ lifetime intent**\n\nTo assess the remaining lifetime in a customer relationship, businesses must\n\ncarefully examine the transactional signals and other indicators from previous\ncustomer engagements and transactions.\n\nFor example, if a frequent customer slows down their buying habits — or simply\ndoesn’t make a purchase for an extended period of time — it may signal the\nupcoming end of the relationship. However, in the case of another customer\nwho engages infrequently, the same extended absence may not signal anything\nnotable. The infrequent buyer may continue to purchase even after a long pause\nin activity.\n\n\n-----\n\nCustomer A\n\nCustomer B\n\nCustomer C\n\n\nPast Future\n\nDifferent customers with the same number of transactions, but signaling different lifetime intent. The probability of re-engagement (P_alive) relative to a customer’s history of purchases.\n\n\nEvery customer relationship with a business has a lifespan. Understanding what\npoint in the lifespan at a given time provides critical insight to inform marketing\nand sales tactics. By proactively discovering shifts in the relationship, you can\nadapt how to respond to each customer at the optimal time. For example, a\ncertain signal might prompt a change in how to deliver products and services,\nwhich could help maximize revenue.\n\nTransactional signals can be used to estimate the probability that a customer\nis active and likely to return in the future. Popularized as the Buy ’til You Die\n(BTYD) model, analysts can compare a customer’s frequency and recency of\n\nengagement to similar patterns across their user population to accurately\npredict individual CLV.\n\n\nThe mathematics behind these predictive CLV models is complex, but the logic\nbehind these critical models is accessible through a popular Python library\nnamed Lifetimes, which allows the input of simple summary metrics in order to\nderive customer-specific lifetime estimates.\n\n**C A S E S T U DY**\n\n**How personalized experiences keep customers coming**\n**back for more**\n\nPublicis Groupe empowers brands to transform retail experiences with\ndigital technologies, but data challenges and team silos stood in the\nway of delivering the personalization that their customers required.\nSee how they use Databricks to create a single customer view that\nallows them to drive customer loyalty and retention. 
As a result, they’ve\nseen a 45%–50% increase in customer campaign revenue.\n\n[Get the full story](https://databricks.com/customers/publicis-groupe)\n\n\n-----\n\n**Delivering customer lifetime estimates to the business**\n\n\nSpark natively distributes this work across a multi-server environment, enabling\nconsistent, accurate and efficient analysis. Spark’s flexibility allows models to\nadapt in real time as new information is ingested, eliminating the bottlenecks\nthat come with manual data mapping and profile building.\n\nWith per customer metrics calculated, the Lifetimes library can be used to train\nmultiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\npredict engagements over time using proprietary data can take several months\nand thousands of training runs. [Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\nbusinesses tap into the infrastructure behind their Spark environments and\ndistribute the training outputs across models.\n\n\nUsing the Lifetimes library to calculate customer-specific probabilities at speed\nand scale can be challenging — from processing large volumes of transaction\ndata to deriving data curves and value distribution patterns and, eventually,\nto integration with business initiatives. But with the proper approach, you can\nresolve all of them.\n\nThese models depend on three key per customer metrics:\n\n**FREQUENCY**\nThe number of times within a given time period in which a repeat\ntransaction is observed\n\n**AGE**\nThe length of time between the occurrence of an initial transaction\nto the end of a given time period\n\n**RECENCY**\n\nThe “age” of a customer (how long they’ve engaged with a brand)\nat the time of their latest repeat transaction\n\n\n-----\n\n**Solution deployment**\n\n\nOnce properly trained, these models can determine the probability that a\ncustomer will re-engage, as well as the number of engagements a business\ncan expect from that customer over time. But the real challenge is putting\nthese predictive capabilities into the hands of those that determine\ncustomer engagement.\n\nMatrices illustrating the probability a customer is alive (left) and the number of future\npurchases in a 30-day window given a customer’s frequency and recency metrics (right).\n\n\nBusinesses need a way to develop and deploy solutions in a highly scalable\nenvironment with a limited upfront cost. Databricks Solution Accelerators\nleverage real-world sample data sets and pre-built code to show how raw data\ncan be transformed into real solutions — including step-by-step instructions\nready to go in a Databricks notebook.\n\n**Need help determining your customers’**\n**lifetime value?**\n\nUse the **Customer Lifetime Value Accelerator** to\n\n**•** Ingest sample retail data\n\n**•** Use pre-built code to develop visualizations and explore\npast purchase behavior\n\n**•** Apply machine learning to predict the likelihood and\nnature of future purchases\n\n**[GET THE ACCELERATOR](https://databricks.com/solutions/accelerators/customer-lifetime-value)**\n\n\n-----\n\n### Mitigating Customer Churn\n\nBalancing acquisition and retention with personalized experiences\n\n\nThere are no guarantees of success. With a bevy of options at their disposal,\ncustomer churn is a reality that companies face and are focused on overcoming\nevery day. 
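The lifetime estimates described in the previous section feed directly into that effort: a customer whose probability of being “alive” is falling is a churn risk worth acting on. A minimal sketch with the Lifetimes library is shown below; the transaction file and column names are assumptions.

```python
import pandas as pd
from lifetimes import BetaGeoFitter
from lifetimes.utils import summary_data_from_transaction_data

# Hypothetical transaction log: one row per purchase
transactions = pd.read_csv("transactions.csv")  # columns: customer_id, order_date

# Derive per-customer frequency / recency / age (T) summary metrics
summary = summary_data_from_transaction_data(
    transactions, customer_id_col="customer_id", datetime_col="order_date"
)

# Fit a BG/NBD model and score each customer
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(summary["frequency"], summary["recency"], summary["T"])

summary["p_alive"] = bgf.conditional_probability_alive(
    summary["frequency"], summary["recency"], summary["T"]
)
summary["expected_purchases_30d"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    30, summary["frequency"], summary["recency"], summary["T"]
)

# Customers with the lowest probability of being alive are candidates for re-engagement
print(summary.sort_values("p_alive").head())
```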
One [recent analysis](https://info.recurly.com/annual-subscription-billling-metrics-report?submissionGuid=3c21cde7-5f58-4d86-9218-332d697e7b3e) of consumer-oriented subscription services\nestimated a segment average 7.2% monthly rate of churn. When narrowed to\nbrands focused on consumer goods, that rate jumped to 10.0%. This figure\ntranslates to a lifetime of 10 months for the average subscription box service,\nleaving businesses of this kind with little time to recover acquisition costs and\nbring subscribers to net profitability.\n\n**C A S E S T U DY**\n##### Riot Games\n\n**Creating an optimal in-game experience for League of Legends**\n\nRiot Games is one of the top PC game developers in the world, with over\n100 million monthly active users, 500 billion data points, and over 26\npetabytes of data and counting. They turned to Databricks to build a more\n\nefficient and scalable way to leverage data and improve the overall gaming\nexperience — ensuring customer engagement and reducing churn.\n\n[Get the full story](https://www.databricks.com/customers/riot-games)\n\nOrganizations must take an honest look at the cost of acquisition relative to a\ncustomer’s lifetime value (LTV) earned. These figures need to be brought into a\n\nhealthy balance and treated as a “chronic condition” [to be managed.](https://retailtouchpoints.com/features/trend-watch/can-subscription-retail-solve-its-customer-retention-problem)\n\n\n**Understanding attrition predictability through subscriptions:**\n**Examining retention-based acquisition variables**\n\nPublic data for subscription services is extremely hard to come by. KKBox, a\nTaiwan-based music streaming service, recently released over two years of\nanonymized [subscription data](https://www.kaggle.com/c/kkbox-churn-prediction-challenge) to examine customer churn. Through analyzing\nthe data, we uncover customer dynamics familiar to any subscription provider.\n\nMost subscribers join the KKBox service through a 30-day trial offer. Customers\nthen appear to enlist in one-year subscriptions, which provide the service with\na steady flow of revenue. Subscribers typically churn at the end of the 30-day\ntrial and at regular one-year intervals.\n\nThe Survival Rate reflects the proportion of the initial (Day 1) subscriber population that is\nretained over time, first at the roll-to-pay milestone, and then at the renewal milestone.\n\n\n-----\n\nBy Initial Payment Method\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers registering via different payment methods.\n\nBy Initial Payment Plan Days\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers selecting different initial payment methods and terms/days.\n\n\nThis pattern of high initial drop-off, followed by a period of slower but continuing\ndrop-off cycles makes intuitive sense. Where it gets interesting is when the\ndata changes. The patterns of customer churn become vastly different as time\npasses and new or changing elements are introduced (e.g., payment methods\nand options, membership tiers, etc.).\n\nBy Registration Channel\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers registering via different channels.\n\n\n-----\n\nThese patterns seem to indicate that KKBox _could_ potentially differentiate\nbetween customers based on their lifetime potential, using only the information\navailable at subscriber acquisition. 
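One hedged way to derive this kind of multiplier from acquisition-time attributes is a proportional hazards model, sketched here with the lifelines library; the subscriber table and its columns are assumptions.

```python
import pandas as pd
from lifelines import CoxPHFitter

# Hypothetical per-subscriber table: tenure in days, whether they churned,
# and attributes known at acquisition time
subs = pd.read_csv("subscriptions.csv")  # columns: tenure_days, churned, payment_method, reg_channel

# One-hot encode the acquisition-time attributes
features = pd.get_dummies(subs, columns=["payment_method", "reg_channel"], drop_first=True)

# Fit the proportional hazards model; exp(coef) in the summary gives the
# per-attribute risk multipliers relative to the baseline
cph = CoxPHFitter()
cph.fit(features, duration_col="tenure_days", event_col="churned")
cph.print_summary()
```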
In the same way, non-subscription businesses\ncould use similar data techniques to get an accurate illustration of the total\nlifetime value of a particular customer, even before collecting historical data.\n\nThis information can help businesses target certain shoppers with effective\ndiscounts or promotions as early as trial registration. Nevertheless, it’s always\nimportant to consider more than individual data points.\n\nThe baseline risk of customer attrition over a subscription lifespan.\n\n\nThe channel and payment method multipliers combine to explain a customer’s risk of attrition\nat various points in time. The higher the value, the higher the proportional risk of churn in the\nassociated period.\n\n\n-----\n\n**Applying churn analytics to your data**\n\nThis analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n**2)** to paint a quantitative picture of the specific factors that explain that risk,\ngiving analysts a clearer understanding of what to focus on, what to ignore and\nwhat to investigate further. The main challenge is organizing the input data.\n\nThe data required to examine customer attrition may be scattered across\nmultiple systems, making an integrated analysis difficult. [Data lakes](https://databricks.com/discover/data-lakes/introduction) support\nthe creation of transparent, sustainable data processing pipelines that are\nflexible, scalable and highly cost-efficient. Remember that **churn is a chronic**\n**condition to be managed** , and attrition data should be periodically revisited to\nmaintain alignment between acquisition and retention efforts.\n\n**Need help predicting customer churn?**\n\nUse the **Subscriber Churn Prediction Accelerator** to analyze\nbehavioral data, identify subscribers with an increased risk of\ncancellation, and predict attrition. Machine learning lets you\nquantify a user’s likelihood to churn, identifying factors that\nexplain the risk.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n\n\n-----\n\n### Streamlining Customer Analysis and Targeting\n\nCreating efficient and highly targeted customer experiences with behavioral data\n\n\nEffective targeting comes down to one fundamental element: the cost of\ndelivering a good or service relative to what a consumer is willing to pay.\n\nIn the earliest applications of segmentation, manufacturers recognized that\nspecialized product lines targeting specific consumer groups could help\nbrands stand out against competitors.\n\n**C A S E S T U DY**\n\n**Finding that special something every time**\n\nPandora is a jewelry company with global reach. They built their master\nconsumer view (MCV) dashboard on the Databricks Lakehouse Platform,\ngiving them the insights necessary to deliver highly targeted messaging\nand personalization — resulting in 80% growth in email marketing\nsuccess, a 50% increase in click-to-open rate across 65 million emails,\nand 255M DKK (Danish Krone) in quarterly revenue.\n\n[Get the full story](https://www.databricks.com/customers/pandora)\n\nThis mode of thinking extends beyond product development and into every\ncustomer-oriented business function, requiring specific means of ideation,\nproduction and delivery. The work put into segmentation doesn’t need to be\na gamble. Scrutinizing customers and testing responsiveness is an ongoing\nprocess. 
Organizations must analyze and adapt to shifting markets, changing\nconsumer demand and evolving business objectives.\n\n\n**C A S E S T U DY**\n\n**Powering insight-driven dashboards to increase customer**\n**acquisition**\n\nBagelcode is a global game company with more than 50 million global\nusers. By using the Databricks Lakehouse Platform, they are now able to\nsupport more diversified indicators, such as a user’s level of frequency\nand the amount of time they use a specific function for each game,\nenabling more well-informed responses. In addition, the company is\nmitigating customer churn by better predicting gamer behavior and\nproviding personalized experiences at scale.\n\n[Get the full story](https://www.databricks.com/customers/bagelcode)\n\n“Thanks to Databricks Lakehouse, we can support\nreal-time business decision-making based on data\nanalysis results that are automatically updated on\nan hourly and daily basis, even as data volumes have\nincreased by nearly 1,000 times.”\n\n**J O O H Y U N K I M**\nVice President, Data and AI, Bagelcode\n\n\n-----\n\nA brand’s goal with segmentation should be to define a shared customer\nperspective on customers, allowing the organization to engage users consistently\nand cohesively. But any adjustments to customer engagement require careful\nconsideration of [organizational change concerns](https://www.researchgate.net/publication/45348436_Bridging_the_segmentation_theorypractice_divide) .\n\n**C A S E S T U DY**\n\n**Responding to global demand shifts with ease**\n\nReckitt produces some of the world’s most recognizable and trusted\nconsumer brands in hygiene, health and nutrition. With Databricks\nLakehouse on Azure, they’re able to meet the needs of billions of\nconsumers worldwide by surfacing real-time, highly accurate, deep\ncustomer insights, leading to a better understanding of trends and\ndemand, allowing them to provide best-in-class experiences in\nevery market.\n\n[Get the full story](https://www.databricks.com/customers/reckitt)\n\n\n**A segmentation walk-through: Grocery chain promotions**\n\nA promotions management team for a large grocery chain is responsible for\nrunning a number of promotional campaigns, each of which is intended to drive\ngreater overall sales. Today, these marketing campaigns include leaflets and\ncoupons mailed to individual households, manufacturer coupon matching,\nin-store discounts and the stocking of various private-label alternatives to\npopular national brands.\n\nRecognizing uneven response rates between households, the team is eager to\ndetermine if customers might be segmented based on their responsiveness\nto these promotions. They anticipate that such segmentation may allow the\npromotions management team to better target individual households, driving\noverall higher response rates for each promotional dollar spent.\n\nUsing historical data from point-of-sale systems along with campaign\ninformation from their promotions management systems, the team derives\na number of features that capture the behavior of various households with\nregard to promotions. Applying standard data preparation techniques, the data\nis organized for analysis and using a variety of clustering algorithms, such as\nk-means and hierarchical clustering, the team settles on two potentially useful\ncluster designs.\n\n\n-----\n\nOverlapping segment designs separating households based on their responsiveness to\nvarious promotional offerings. 
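To illustrate the clustering step in the walk-through above, here is a minimal sketch using scikit-learn's k-means. The household features are hypothetical stand-ins for the promotion-responsiveness metrics the team would derive from point-of-sale and promotions data.

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Hypothetical per-household features derived from POS and promotions systems.
households = pd.DataFrame({
    "coupon_redemption_rate": [0.02, 0.30, 0.25, 0.01, 0.15, 0.28],
    "leaflet_response_rate":  [0.00, 0.20, 0.18, 0.02, 0.05, 0.22],
    "private_label_share":    [0.10, 0.45, 0.40, 0.05, 0.25, 0.38],
    "instore_discount_share": [0.05, 0.35, 0.30, 0.08, 0.20, 0.33],
})

# Standardize the features, then cluster households by promotion responsiveness.
X = StandardScaler().fit_transform(households)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=42).fit(X)
households["cluster"] = kmeans.labels_

# Profile the clusters to look for behavioral differences between segments.
print(households.groupby("cluster").mean())
```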
Profiling of clusters to identify differences in behavior across clusters.\n\n**Assessing results**\n\n\nComparing households by demographic factors not used in developing the\nclusters themselves, some interesting patterns separating cluster members\nby age and other factors are identified. While this information may be useful\n\nin not only predicting cluster membership and designing more effective\ncampaigns targeted to specific groups of households, the team recognizes\nthe need to collect additional demographic data before putting too much\nemphasis on these results.\n\n\nWith profiling, marketers can discern those customer households in the\nhighlighted example fall into two groups: those who are responsive to coupons\nand mailed leaflets, and those who are not. Further divisions show differing\ndegrees of responsiveness to other promotional offers.\n\n\n-----\n\n**Need help segmenting your customers for**\n**more targeted marketing?**\n\nUse the **Customer Segmentation Accelerator** and drive\nbetter purchasing predictions based on behaviors. Through\nsales data, campaigns and promotions systems, you can\nbuild useful customer clusters to effectively target various\nhouseholds with different promos and offers.\n\nAge-based differences in cluster composition of behavior-based customer segments.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\nThe results of the analysis now drive a dialog between the data scientists and\nthe promotions management team. Based on initial findings, a revised analysis\nwill be performed focused on what appear to be the most critical features\ndifferentiating households as a means to simplify the cluster design and evaluate\noverall cluster stability. Subsequent analyses will also examine the revenue\n\ngenerated by various households to understand how changes in promotional\nengagement may impact customer spending.\n\nUsing this information, the team believes they will have the ability to make a case\nfor change to upper management. Should a change in promotions targeting be\napproved, the team makes plans to monitor household spending, promotions\nspend and campaign responsiveness rates using much of the same data used in\nthis analysis. This will allow the team to assess the impact of these efforts and\nidentify when the segmentation design needs to be revisited.\n\n\n-----\n\n#### Assessing Consumer Interest Data to Inform Engagement Strategies\n\nFine-tuning ML recommendations to boost conversions\n\n\nPersonalization is a [journey](https://www.bcg.com/publications/2021/the-fast-track-to-digital-marketing-maturity) . To operationalize personalized experiences, it’s\nimportant to identify high-value audiences who have the highest likelihood of\nspecific actions. Here’s where **propensity scoring** comes in.\n\nSpecifically, this process allows companies to estimate customers’ potential\nreceptiveness to an offer or to content related to a subset of products, and\ndetermine which messaging to apply. 
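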
Calculating propensity scores requires\nassessment of past interactions and data points (e.g., frequency of purchases,\npercentage of spend associated with a particular product category, days since\nlast purchase and other historical data).\n\nDatabricks provides critical capabilities for propensity scoring (like the Feature\nStore, AutoML and MLflow) to help businesses answer three key considerations\nand develop a robust process:\n\n**1.** How to maintain the significant number of features used\nto train propensity models\n\n**2.** How to rapidly train models aligned with new campaigns\n\n**3.** How to rapidly re-deploy models, retrained as customer\npatterns drift, into the scoring pipeline\n\n**Boosting model training efficiency**\n\nWith the [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) , data scientists can easily reuse features\ncreated by others.\n\n\nThe feature store is a centralized repository that enables the persistence,\ndiscovery and sharing of features across various model training exercises.\nAs features are captured, lineage and other metadata are captured. Standard\nsecurity models ensure that only permitted users and processes may\nemploy these features, enforcing the organization’s data access policies on\ndata science processes.\n\n**Extracting the complexities of ML**\n\n[Databricks AutoML](https://docs.databricks.com/applications/machine-learning/automl.html) allows you to quickly generate models by leveraging industry\nbest practices. As a glass box solution, AutoML first generates a collection of\nnotebooks representing various aligned model variations. In addition to iteratively\ntraining models, AutoML allows you to access the notebooks associated with each\nmodel, creating an editable starting point for further exploration.\n\n**Streamlining the overall ML lifecycle**\n\n[MLflow](https://docs.databricks.com/applications/mlflow/index.html) is an open source machine learning model repository, managed within the\nDatabricks Lakehouse. This repository enables tracking and analysis of the various\nmodel iterations generated by both AutoML and custom training cycles alike.\n\nWhen used in combination with the Databricks Feature Store, models persisted\nwith MLflow can retain knowledge of the features used during training. As models\nare retrieved, this same information allows the model to retrieve relevant features\nfrom the Feature Store, greatly simplifying the scoring workflow and enabling\nrapid deployment.\n\n\n-----\n\n**How to build a propensity scoring workflow with Databricks**\n\nUsing these features in combination, many organizations implement propensity\nscoring as part of a three-part workflow:\n\n**1.** Data engineers work with data scientists to define features relevant\nto the propensity scoring exercise and persist these to the Feature Store.\nDaily or even real-time feature engineering processes are then defined\nto calculate up-to-date feature values as new data inputs arrive.\n\nModel Training\nand Deployment\n\n\n**2.** As part of the inference workflow, customer identifiers are presented to\npreviously trained models in order to generate propensity scores based on\nthe latest features available. 
Feature Store information captured with the\nmodel allows data engineers to retrieve these features and easily generate\nthe desired scores, which can then be used for analysis within Databricks\nLakehouse or published to downstream marketing systems.\n\n**3.** In the model-training workflow, data scientists periodically retrain the\npropensity score models to capture shifts in customer behaviors. As these\nmodels are persisted to MLflow, change management processes are used\nto evaluate and elevate those models that meet organizational criteria to\nproduction status. In the next iteration of the inference workflow, the latest\nproduction version of each model is retrieved to generate customer scores.\n\nA three-part propensity scoring workflow: feature engineering ETL jobs populate the Feature Store\nwith profiles built from sales, promotions and customer data; model training and deployment\nconsume those features; and a score generation and publication ETL step delivers the resulting\nscores to downstream applications.\n\n**Need help assessing interest from your**\n**target audience?**\n\nUse the **Propensity Scoring Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\n-----\n\n### Delivering Personalized Customer Journeys\n\nStrategies for crafting a real-time recommendation engine\n\n\nAs the economy continues to weather unpredictable disruptions, shortages and\nshifts in demand, delivering personalized customer experiences at speed and scale\nwill require adaptability on the ground and within a company’s operational tech stack.\n\n\n**C A S E S T U DY**\n\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\nstrategy and operations, allowing them to create a “golden customer\nrecord” that improves all decision-making from forecasting demand to\npowering their global loyalty program.\n\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\n\n\n“Databricks Lakehouse allows every division in our\norganization — from automotive to retail — to gain\na unified view of our customer across businesses.\nWith these insights, we can optimize everything from\nforecasting and supply chain, to powering our loyalty\nprogram through personalized marketing campaigns,\ncross-sell strategies and offers.”\n\n**D M I T R I Y D O V G A N**\nHead of Data Science, Al-Futtaim Group\n\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\nsafety and community, brands most attuned to changing needs and sentiments\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. 
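Tying back to the three-part propensity scoring workflow described above, the sketch below shows roughly how the pieces fit together, assuming the Databricks workspace Feature Store client (`databricks.feature_store`). The table, column and model names are hypothetical, and `labels_df` / `customers_df` are assumed to be existing Spark DataFrames.

```python
import mlflow.sklearn
from sklearn.linear_model import LogisticRegression
from databricks.feature_store import FeatureStoreClient, FeatureLookup

fs = FeatureStoreClient()

# (1) Feature engineering jobs keep this Feature Store table up to date.
lookups = [FeatureLookup(table_name="main.marketing.customer_features",
                         lookup_key="customer_id")]

# (3) Model training: join labels to features and log the model with its feature lineage.
training_set = fs.create_training_set(
    df=labels_df,                       # assumed Spark DataFrame: customer_id + label
    feature_lookups=lookups,
    label="responded_to_offer",
    exclude_columns=["customer_id"])
train_pdf = training_set.load_df().toPandas()
model = LogisticRegression(max_iter=1000).fit(
    train_pdf.drop("responded_to_offer", axis=1), train_pdf["responded_to_offer"])
fs.log_model(model, "propensity_model", flavor=mlflow.sklearn,
             training_set=training_set,
             registered_model_name="campaign_propensity")

# (2) Inference: score_batch retrieves each customer's latest features by key.
scores = fs.score_batch("models:/campaign_propensity/Production", customers_df)
```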
While some segments gained\nbusiness and many lost, organizations that had already begun the journey toward\nimproved customer experience saw better outcomes, closely mirroring patterns\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\n\n\n**Creating a unified view across 200+ brands**\n\nAs a driving force for economic growth in the Middle East, Al-Futtaim\nimpacts the lives of millions of people across the region through the\ndistribution and operations of global brands like Toyota, IKEA, Ace\nHardware and Marks & Spencer.\n\nAl-Futtaim’s focus is to harness their data to improve all areas of the\nbusiness, from streamlining the supply chain to optimizing marketing\nstrategies. But with the brands capturing such a wide variety of data,\nAl-Futtaim’s legacy systems struggled to provide a single view into\nthe customer due to data silos and the inability to scale efficiently to\nmeet analytical needs.\n\n\n-----\n\nThe personalization of customer experiences will remain a key focus for B2C\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\nlong-established players.\n\n**Focus on the customer journey**\n\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\n\n**C A S E S T U DY**\n\n**Personalizing the beauty product shopping experience**\n\nFlaconi wanted to leverage data and AI to become the No. 1 online\nbeauty product destination in Europe. However, they struggled with\nmassive volumes of streaming data and with infrastructure complexity\nthat was resource-intensive and costly to scale. 
See how they used\nDatabricks to increase time-to-market by 200x, reduce staff costs by\n40% and increase net order income.\n\nGet the full story\n\n\n¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\nExperience Performance Index in 2007-09.\n\nSource: Forrester Customer Experience Performance Index (2007-09); press search\n\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\n\n\n-----\n\nCareful consideration of how customers interact with various assets — and how\nthese interactions may be interpreted as expressions of preference — can unlock\na wide range of data that enables personalization.\n\n\nThe complexity of these engines requires that they be deployed thoughtfully, using\nlimited pilots and customer response assessments. And in those assessments,\nit’s important to keep in mind that there is no expectation of perfection — only\nincremental improvement over the prior solution.\n\n\n**C A S E S T U DY**\n\n**Need help generating personalized**\n**recommendations?**\n\n\n**Connecting shoppers to savings with data-driven**\n**personalization‌**\n\n\nUse the **Recommendation Engines Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nFlipp is an online marketplace that aggregates weekly shopping circulars,\nso consumers get deals and discounts without clipping coupons. Siloed\ncustomer data sources once made getting insights difficult. Now with\nDatabricks, Flipp’s data teams can access and democratize data, helping\nthem do their jobs more effectively while bringing better deals to users,\nmore meaningful insights to partners, and a 10% jump in foot traffic to\nbrick-and-mortar retailers.\n\nGet the full story\n\nThe engines we use to serve content based on customer preferences are known\nas recommenders. With some recommenders, a heavy focus on the shared\npreferences of similar customers helps define what recommendations will actually\nmake an impact. With others, it can be more useful to focus on the properties of\nthe content itself (e.g., product descriptions).\n\n\n-----\n\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\n\n\nProviding deep, effective personalized experiences to customers depends\non a brand’s ability to intelligently leverage consumer and market data from a\nwide variety of sources to fuel faster, smarter decisions — without sacrificing\naccuracy for speed. 
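As one concrete example of the "shared preferences of similar customers" approach described above (collaborative filtering), the sketch below trains an ALS recommender with Spark MLlib. It is illustrative only: the interaction data is made up, and `spark` is assumed to be an active SparkSession, as in a Databricks notebook.

```python
from pyspark.ml.recommendation import ALS

# Hypothetical implicit-feedback interactions: customer, product, purchase count.
interactions = spark.createDataFrame(
    [(1, 10, 3.0), (1, 11, 1.0), (2, 10, 5.0), (2, 12, 2.0), (3, 11, 4.0)],
    ["customer_id", "product_id", "purchase_count"])

als = ALS(userCol="customer_id", itemCol="product_id", ratingCol="purchase_count",
          implicitPrefs=True, rank=16, regParam=0.1, coldStartStrategy="drop")
model = als.fit(interactions)

# Top-5 product recommendations per customer, scored from shared preference patterns.
model.recommendForAllUsers(5).show(truncate=False)
```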
The Databricks Lakehouse Platform is purpose-built for\nexactly that, offering a scalable data architecture that unifies all your data,\nanalytics and AI to deliver unforgettable customer experiences.\n\nCreated on open source and open standards, Databricks offers a robust\nand cost-effective platform for brands to collaborate with partners, clients,\nmanufacturers and distributors to unleash more innovation and efficiencies\nat every touch point. Businesses can rapidly ingest available data in real time,\n\n\nat scale, and create accessible, data-driven insights that enable actionable\nstrategies across the value chain.\n\nDatabricks is a multicloud platform, designed for quick enterprise development.\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\ntheir company’s operational health and the evolving needs of their customers\n— all while empowering teams to easily unify data efforts, perform fine-grained\nanalyses and streamline cross-functional data operations using a single,\nsophisticated solution.\n\n\n###### Learn more about Databricks Lakehouse for industries\n like Retail & Consumer Goods, Media & Entertainment\n and more at databricks.com/solutions\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n\nis headquartered in San Francisco, with offices around the globe. Founded by\n\nthe original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n\na mission to help data teams solve the world’s toughest problems. To learn more,\n\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=563736421186&utm_term=databricks%20free%20trial&gclid=Cj0KCQjwpeaYBhDXARIsAEzItbHzQGCu2K58-lnVCepMI5MYP6jTXkgfvqmzwAMqrlVwVOniebOE43UaAk3OEALw_wcB)**\n\n##### Contact us for a personalized demo databricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "#### eBook\n\n# Big Book of Retail\n & Consumer Goods Use Cases\n\n##### Driving real-time decisions\n with the Lakehouse\n\n\n-----\n\n### Contents (1/2) C H A P T E R 1 : \u0007 Introduction 4\n\n**C H A P T E R 2 :** \u0007 **Modern Data Platform for Real-Time Retail** 6\n\nCommon challenges 6\n\nThe Lakehouse for Retail 8\n\n**C H A P T E R 3 :** **Use Case: Real-Time Supply Chain Data** \u0007 12\n\nCase Study: Gousto 14\n\nCase Study: ButcherBox 14\n\n**C H A P T E R 4 :** \u0007 **Use Case: Truck Monitoring** 15\n\nCase Study: Embark 16\n\n**C H A P T E R 5 :** **Use Case: Inventory Allocation** \u0007 17\n\nCase Study: H&M 19\n\nCase Study: Edmunds 19\n\n**C H A P T E R 6 :** **Use Case: Point of Sale and Clickstream** \u0007 20\n\n**C H A P T E R 7 :** **Use Case: On-Shelf Availability** \u0007 22\n\nCase Study: Reckitt 25\n\n**C H A P T E R 8 :** **�Use Case: Customer and Vehicle Identification** 26\n\n**C H A P T E R 9 :** \u0007 **Use 
Case: Recommendation Engines** 28\n\nCase Study: Wehkamp 31\n\nCase Study: Columbia 31\n\nCase Study: Pandora 31\n\n**C H A P T E R 1 0 :** \u0007 **Use Case: Perpetual Inventory** 32\n\n**C H A P T E R 1 1 :** \u0007 **Use Case: Automated Replenishments** 34\n\n\n-----\n\n### Contents (2/2) C H A P T E R 1 2 : \u0007 Use Case: Fresh Food Forecasting 36\n\nCase Study: ButcherBox 37\n\nCase Study: Sam’s Club 37\n\n**C H A P T E R 1 3 :** \u0007 **Use Case: Propensity-to-Buy** 38\n\n**C H A P T E R 1 4 :** \u0007 **Use Case: Next Best Action** 41\n\n**C H A P T E R 1 5 :** **Customers That Innovate With Databricks Lakehouse for Retail** \u0007 43\n\n**C H A P T E R 1 6 :** \u0007 **Conclusion** 43\n\n\n-----\n\n**CHAPTER 1:**\n### Introduction\n\n\nRetailers are increasingly being challenged to make time-sensitive decisions in their operations. Consolidating\n\ne-commerce orders. Optimizing distribution to ensure item availability. Routing delivery vehicles. These\n\ndecisions happen thousands of times daily and have a significant financial impact. Retailers need real-time data\n\nto support these decisions, but legacy systems are limited to data that’s hours or days old.\n\n**When seconds matter, only the Lakehouse delivers better decisions**\n\nRetail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n\nThe integration of physical and e-commerce customer experiences into an omnichannel journey has been\n\nhappening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n\npurchasing patterns.\n\nIn reaction to these industry changes, retailers have responded with significant, rapid investments — including\n\nstronger personalization, order fulfillment, and delivery and loyalty systems. While these new targeted\n\ncapabilities have addressed the immediate need — and created expectations of making decisions in real\n\ntime — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n\nUnfortunately, most legacy systems are only able to process information in hours or days.\n\nThe delays caused by waiting for data are leading to significant risks and costs for the industry.\n\n**Grocers** need to consolidate order picking to achieve profitability in e-commerce, but this requires up-to-\n\nthe-minute order data. Not having this information causes them to spend more resources on having people\n\npick orders separately, at a higher operating cost.\n\n**Apparel retailers** must be able to present the correct available inventory on their website. This requires\n\nthat in-store sales be immediately reflected in their online systems. Inaccurate information can lead to lost\n\nsales, or worse, the customer becoming unsatisfied and moving to different retailers.\n\n\n-----\n\n**Convenience fuel retailers** must collaborate with distribution centers, direct-to-store delivery distributors\n\nand other partners. Having delayed data can lead to out-of-stocks, costing stores thousands of dollars per\n\nweek.\n\nThe margin of error in retail has always been razor thin, but with a pandemic and inflationary pressures, it’s at\n\nzero. 
Reducing the error rate requires better predictions and real-time data.\n\n**Use Case Guide**\n\nIn this use case guide, we show how the Databricks Lakehouse for Retail is helping leading organizations take\n\n**all of their data in a single lakehouse architecture, streamline their data engineering and management,**\n\n**make it ready for SQL and ML/AI** , and **do so very fast within their own cloud infrastructure environment**\n\n**based on open source and open standards** . These capabilities are all delivered at world-record-setting\n\nperformance, while achieving a market-leading total cost of ownership.\n\nDatabricks Lakehouse for Retail has become the industry standard for enabling retailers to drive decisions\n\nin real time. This use case guide also highlights common use cases across the industry, and offers additional\n\nresources in the form of Solution Accelerators and reference architectures to help as you embark on your own\n\njourney to drive better customer experiences with data and AI.\n\n\n-----\n\n**CHAPTER 2:**\n### Modern Data Platform\n for Real-Time Retail\n\n\nRetailers continue to adapt to rapidly shifting dynamics across the omnichannel. In navigating these\n\nchanges, retailers are increasingly focused on improving the real-time availability of data and insights, and\n\nperforming advanced analytics delivered within tight business service windows.\n\n**Common challenges**\n\nIn response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n\nin modernizing distribution centers, partnering with delivery companies, and investing in customer\n\nengagement systems.\n\nWarehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n\ndistribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n\nthat became accustomed to having fast, same-day, and sometimes even overnight delivery options\n\nduring the pandemic now expect them as the norm. Retailers understand that the shipping and delivery\n\nexperience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n\n## $41B Market | Retail Warehouse Automation\n\nYet while retailers modernize different areas of their operations, they’re constrained by a single point of\n\nweakness, as they are reliant on legacy data platforms to bring together all of this data.\n\nPowering real-time decisions in modern retail requires real-time ingestion of data, transformation,\n\ngovernance of information, and powering business intelligence and predictive analytics all within the time\n\nrequired by retail operations.\n\n\n-----\n\n**Ingesting large volumes of transactional data in real time.** The biggest blocker to crucial insights\n\nis the ability to ingest data from transaction systems in real time. Transaction logs from point-of-sale\n\nsystems, clickstreams, mobile applications, advertising and promotions, as well as inventory, logistics\n\nand other systems, are constantly streaming data. Big data sets need to be ingested, cleansed and\n\naggregated and integrated with each other before they can be used. The problem? Retailers have used\n\nlegacy data warehouses that are built around batch processing. And worse, increasing the frequency\n\nof how often data is processed leads to a “hockey stick” in costs. As a result of these limitations,\n\nmerchants resort to ingesting data nightly to deal with the large volumes of data and integration with\n\nother data sets. The result? 
Accurate data to drive decisions can be delayed by days.\n\n**Performing fine-grained analysis at scale within tight time windows.** Retailers have accepted a\n\ntrade-off when performing analysis. Predictions can be detailed and accurate, or they can be fast.\n\nRunning forecasts or price models at a day, store and SKU level can improve accuracy by 10% or more,\n\nbut doing so requires tens of millions of model calculations that need to be performed in narrow service\n\nwindows. This is well beyond the capability of legacy data platforms. As a result, companies have been\n\nforced to accept the trade-off and live with less accurate predictions.\n\n**Powering real-time decisions on the front line.** Data is only useful if it drives decisions, but serving\n\nreal-time data to thousands of employees is a daunting task. While data warehouses are capable\n\nof serving reports to large groups of users, they’re still limited to stale data. Most retailers limit the\n\nfrequency of reports to daily or weekly updates and depend on the staff to use their best judgment for\n\ndecisions that are more frequent.\n\n**Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is\n\nfocused on delivering personalized experiences throughout the omnichannel. Retailers have access to\n\na trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation\n\nstruggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to\n\ndeliver personalized experiences at scale to win in retail.\n\n\n-----\n\n###### The Lakehouse for Retail\n\nDatabricks Lakehouse for Retail solves these core challenges. The Lakehouse unlocks the ability to unify\n\nall types of data — from images to structured data — in real time, provide enterprise-class management\n\nand governance, and then immediately turn that data into actionable insights with real-time reporting and\n\npredictive analytics. It does this with record-setting speed and industry-leading total cost of ownership\n\n(TCO) in a platform-as-a-service (PaaS) that allows customers to solve these pressing problems.\n\nThe Lakehouse for Retail: data from all of your sources (competitive activity, e-commerce, mobile applications, video and images, point of sale, distribution and logistics, customer and loyalty, and delivery and partner systems) arrives in any structure or frequency, whether structured, semi-structured or unstructured, batch or real-time. The Data Lakehouse processes, manages and queries all of this data with reliable, real-time processing plus data management and governance on any cloud, and serves capabilities for any persona (ad hoc data science, production machine learning, BI reporting and dashboarding, real-time applications) along with data sharing and collaboration across internal teams, customers and partners.\n\n\n-----\n\n**Reference Architecture**\n\nAt the core of the Databricks Lakehouse for Retail is technology that enables retailers to avoid the trade-offs\n\nbetween speed and accuracy. Technology such as Delta Lake enables the Lakehouse — a new paradigm\n\nthat combines the best elements of data warehouses and data lakes — to directly address these factors by\n\nenabling you to unify all of your data — structured and unstructured, batch and real-time — in one centrally\n\nmanaged and governed location. 
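A minimal sketch of the ingestion side of this architecture, assuming a Databricks environment where Auto Loader is available; the paths and table name are hypothetical. Raw point-of-sale files are streamed continuously into a Bronze Delta table, from which Silver and Gold refinements can then be built.

```python
# Stream raw files from a landing zone into a Bronze Delta table with Auto Loader.
bronze_stream = (
    spark.readStream.format("cloudFiles")                  # Databricks Auto Loader
         .option("cloudFiles.format", "json")
         .option("cloudFiles.schemaLocation", "/Volumes/retail/pos/_schema")
         .load("/Volumes/retail/pos/raw/")                 # hypothetical landing zone
)

(bronze_stream.writeStream
    .option("checkpointLocation", "/Volumes/retail/pos/_checkpoints/bronze")
    .outputMode("append")
    .toTable("retail.pos_bronze"))                         # Bronze Delta table
```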
Once in the Lakehouse, e-commerce systems, reporting users, analysts,\n\ndata scientists and data engineers can all leverage this information to serve models for applications and\n\npower real-time reporting, advanced analytics, large-scale forecasting models and more.\n\nReference architecture: data from edge, hybrid and cloud sources is replicated into the Lakehouse in batch and real time, refined from raw data (Bronze tables) to clean data (Silver tables) to refined data (Gold tables), and served via machine learning operations (tracking, registry) and REST model serving to business applications, Power BI and other real-time applications.\n\n\n-----\n\n###### How it works\n\nThe Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends\n\nsimplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is\n\ndifferentiated capabilities that help retailers win.\n\n| | Data in real time | Use all of your data | Robust data management | Real-time reporting | Time-sensitive machine learning |\n|---|---|---|---|---|---|\n| **LEGACY DATA WAREHOUSE** | **No.** Data warehouses are batch oriented, restricting data updates to hours or days. | **No.** Data warehouses have very limited support for unstructured data. | **Limited.** EDWs support the management of structured data. | **No.** EDWs offer quick access to reports on old data. | **No.** EDWs must extract data and send it to a third party for machine learning. |\n| **DATA LAKES (HADOOP)** | **No.** Data lakes are batch oriented. | **Yes.** Data lakes offer support for all types of data. | **No.** Data lakes lack enterprise-class data management tools. | **No.** Data lakes were not designed for reporting, let alone real-time reporting. | **No.** Data lakes are able to support large analytics, but lack the ability to meet business SLAs. |\n| **LAKEHOUSE FOR RETAIL** | **Yes.** Support for real-time streaming data. | **Yes.** Supports all types of data in a centrally managed platform. | **Yes.** Delta and Unity Catalog offer native data management and governance of all data types. | **Yes.** Data views can be materialized, enabling front-line employees with real-time data. | **Yes.** The Lakehouse can scale to process the most demanding predictions within business SLAs. |\n\n\n-----\n\n**Data in real time.** Retail operates in real time and so should your data. The Lakehouse offers support\n\nfor streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce\n\nand point-of-sale data. And Delta Lake enables this world-record-leading performance while\n\nmaintaining support for ACID transactions.\n\n**Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images\n\nand a growing variety of other data sources. This data is extremely powerful in helping to improve our\n\nunderstanding of consumer behavior and operations. The Lakehouse for Retail enables companies\n\nto take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse\n\narchitecture.\n\n**Robust data management and governance** that companies need to protect sensitive data, but\n\nwas lacking from earlier big data systems. 
The Lakehouse offers transactional integrity with ACID\n\ncompliance, detailed data security, schema enforcement, time travel, data lineage and more. Moving\n\nto a modern data architecture does not require sacrificing enterprise maturity.\n\n**\u0007High-performance predictive analytics.** Machine learning models, such as demand forecasting\n\nor recommendation engines, can be run in hours without compromising accuracy. The Lakehouse\n\ncan scale to support tens of millions of predictions in tight windows, unlocking critical and time-\n\nsensitive analytics such as allocating inventory, optimizing load tenders and logistics, calculating item\n\navailability and out-of-stocks, and delivering highly personalized predictions.\n\n**Value with Databricks**\n\nBy using Databricks to build and support your lakehouse, you can empower your business with even more\n\nspeed, agility and cost savings. The flexibility of the Databricks Lakehouse Platform means that you can\n\nstart with the use case that will have the most impact on your business. As you implement the pattern, you\n\nwill find that you’re able to tackle use cases quicker and more easily than before. To get you started, this\n\nguidebook contains the use cases we most commonly see across the Retail and Consumer Goods industry.\n\n\n-----\n\n**CHAPTER 3**\n### Use Case:\n Real-Time Supply\n Chain Data\n\n\n**Overview**\n\nAs companies see a surge in demand from e-commerce and delivery services, and seek increasing\n\nefficiencies with plant or distribution centers, real-time data is becoming a key part of the technical\n\nroadmap. Real-time supply chain data allows customers to deal with problems as they happen and before\n\nitems are sent downstream or shipped to consumers, which is the first step in enabling a supply chain\n\ncontrol tower.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nManufacturers Distributors Logistics Restaurants\n\n\n**Challenges**\n\n**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is\n\nhappening and when a customer can act on it\n\n**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies\n\nhave the added pressure to take immediate action on it\n\n**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain\n\n\n-----\n\n**Value with the Databricks Lakehouse**\n\nDatabricks has enabled real-time streaming of supply chain data across a variety of customers for specific\n\nplant operations or as part of a supply chain control tower.\n\n**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000%\n\nimprovement in speed to data, with greater reliability\n\n**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers report\n\nthat they were able to move from batch to real-time at neutral costs\n\n**�Simplified architecture and maintenance** — leveraging Delta for ingestion streamlines the pattern for\n\nreal-time data ingestions. 
Customers frequently report that the amount of code required to support\n\nstreaming ingestion is 50% less than previous solutions.\n\n**\u0007Immediate enablement of additional use cases** — customers can now prevent problems as they’re\n\nhappening, predict and prevent issues, and even gain days on major changes such as production\n\nschedules between shifts\n\n**Solution overview**\n\nDatabricks allows for both streaming and batch data sets to be ingested and made available to enable\n\nreal-time supply chain use cases. Delta Lake simplifies the change data capture process while providing\n\nACID transactions and scalable metadata handling, and unifying streaming and batch data processing. And\n\nDelta Lake supports versioning and enables rollbacks, full historical audit trails, and reproducible machine\n\nlearning experiments.\n\n**Typical use case data sources include:**\n\nSupply planning, procurement, manufacturing execution, warehousing, order fulfillment, shop floor/historian\n\ndata, IoT sensor, transportation management\n\n\n-----\n\n**CASE STUDY**\n\nWith Databricks, Gousto was able to implement real-time visibility in their supply chain. Gousto moved from\n\ndaily batch updates to near real-time streaming data, utilizing Auto Loader and Delta Lake. The platform\n\nprovided by Databricks has allowed Gousto to respond to increased demand during the coronavirus\n\noutbreak by providing real-time insight into performance on the factory picking lines.\n\n**CASE STUDY**\n\nAs a young e-commerce company, ButcherBox needed to act nimbly to make the most of the data from its\n\nhundreds of thousands of subscribers. With Databricks Lakehouse, the company could pull 18 billion rows of\n\ndata in under three minutes.\n\nNow, ButcherBox has a near real-time understanding of its customers, and can also act proactively to\n\naddress any logistical and delivery issues.\n\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n-----\n\n**CHAPTER 4**\n### Use Case: Truck Monitoring\n\n\nWith many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n\nof trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n\nmanner. 
Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n\ndelays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics\n\n\n**Challenges**\n\n**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n\n\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issues\n\n\u0007Not having effective automation and AI increases the risk of human error, which can result in vehicular\n\naccidents or shipment delays\n\n\n-----\n\n**Value with the Databricks Lakehouse**\n\nDatabricks empowers companies to get real-time insights into their fleet performance, from manufacturing\n\nto delivery.\n\n**Near real-time insights** — the greater speed to data means a quicker response to issues and the\n\nability to monitor driver safety more immediately\n\n**Ability to scale** — although consumer demands are constantly evolving, Databricks can handle fleet\n\nexpansion without sacrificing data quality and speed\n\n**Optimizing with AI/ML** — implementing AI and ML models can lead to more effective route monitoring,\n\nproactive maintenance and reduced risk of accidents\n\n**Solution overview**\n\nDatabricks enables better truck monitoring, quickly ingesting data on everything from vehicle manufacturing\n\nto route optimization. This results in a more complete and real-time view of a company’s fleet, and these\n\nanalytics provide companies with the tools they need to scale and improve their operations.\n\n**Typical use case data sources include:**\n\nSupply planning, transportation management, manufacturing, predictive maintenance\n\n**CASE STUDY**\n\nWith 94% of vehicular accidents attributed to human error, Embark used the Databricks Lakehouse Platform\n\nto unlock thousands of hours of recorded data from its trucks and then collaboratively analyze that data\n\nvia dashboards. This has resulted in more efficient ML model training as Embark speeds toward fully\n\nautonomous trucks.\n\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n-----\n\n**CHAPTER 5**\n### Use Case: Inventory Allocation\n\n\n**Overview**\n\nReplenishment planning is the process of determining what needs to go where. It is used by replenishment\n\nplanning, distributors and consumer goods companies performing vendor-managed replenishment (VMR) or\n\nvendor-managed inventory (VMI) to make daily decisions on which product needs to be sent to which store\n\nand on what day.\n\nReplenishment is challenging for companies because it deals with rapidly changing data and the need to\n\nmake complex decisions on that data in narrow service windows. Retailers need to stream in real-time sales\n\ndata to signal how much of a product has been sold in order. Inaccurate sales data leads to an insufficient\n\nnumber of products being sent to stores. This results in lost sales and low customer satisfaction.\n\nInventory allocation is a process that might be performed multiple times a day during peak seasons, or\n\ndaily during slower seasons. 
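Allocation decisions of this kind are typically driven by fine-grained, store- and item-level forecasts. The sketch below shows one common way to parallelize that work with Spark, assuming a `sales` DataFrame with hypothetical `store`, `item`, `ds` (date) and `y` (units sold) columns; the moving-average model is only a placeholder for whatever per-series forecaster is actually used.

```python
import pandas as pd

def forecast_one_series(pdf: pd.DataFrame) -> pd.DataFrame:
    # Placeholder model: a 7-day moving average projected one day ahead.
    # In practice, a per-series model (e.g., Prophet or ARIMA) would be fit here.
    pdf = pdf.sort_values("ds")
    yhat = pdf["y"].rolling(7, min_periods=1).mean().iloc[-1]
    return pd.DataFrame({
        "store": [pdf["store"].iloc[0]],
        "item": [pdf["item"].iloc[0]],
        "forecast_next_day": [float(yhat)],
    })

# Run one forecast per store/item group in parallel across the cluster.
forecasts = (
    sales.groupBy("store", "item")
         .applyInPandas(forecast_one_series,
                        schema="store string, item string, forecast_next_day double")
)
forecasts.write.mode("overwrite").saveAsTable("retail.store_item_forecasts")
```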
Companies need the ability to scale to perform tens of millions of predictions\n\nmultiple times a day — on demand and dynamically — during peak season without paying a premium for\n\nthis capability throughout the year.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Restaurants\n\n\n-----\n\n**Challenges**\n\n\u0007Customers must complete tens of millions of inventory allocation predictions within tight time windows.\n\nThis information is used to determine which products get put on trucks and go to specific stores.\n\n\u0007Traditional inventory allocation rules cause trade-offs in accuracy in order to calculate all possibilities in\n\nthe service windows\n\n\u0007Legacy tools have rudimentary capabilities and have limited ability to consider flavors, sizes and other\n\nattributes that may be more or less popular by store\n\n**Value with Databricks**\n\nCustomers are able to complete inventory allocation models within SLAs with no trade-off for accuracy.\n\n\u0007 **Speed —** on average, customers moving to Databricks for demand forecasting report a double-digit\n\nimprovement in forecast accuracy\n\n\u0007 **Ability to scale** and perform fine-grained (day, store, item) level allocations\n\n\u0007 **Provide more robust allocations** by incorporating causal factors that may increase demand, or include\n\ninformation on flavors or apparel sizes for specific stores\n\n**Solution overview**\n\nThe objective of inventory allocation is to quickly determine when to distribute items and where — from\n\nwarehouses and distribution centers to stores. Inventory allocation begins by looking at the consumption\n\nrate of products, the available inventory and the shipping schedules, and then using this information to\n\ncreate an optimized manifest of what items should be carried on which trucks, at what point, and at what\n\ntime. This becomes the plan for route accounting systems that arrange deliveries.\n\nInventory allocation also deals with trade-offs related to scarcity of items. If an item has not been available\n\nin a store for a long time, that store may receive heightened priority for the item in the allocation.\n\n\n-----\n\nHOW TO GET STARTED\n\n\n**Typical use case data sources include:** point of sale, digital sales, replenishment data, modeled safety\n\nstock, promotions data, weather\n\n**View our webinar covering demand forecasting with Starbucks and then read our blog about**\n\n**demand forecasting.**\n\n**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n\nOur most popular notebook at Databricks. This blog walks you through the business and technical\n\nchallenges of performing demand forecasting and explains how we approached solving it.\n\n**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n\nVideo and Q&A from our webinar with Starbucks.\n\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n**CASE STUDY**\n\nH&M turned to the Databricks Lakehouse Platform to simplify its infrastructure management, enable\n\nperformant data pipelines at scale, and simplify the machine learning lifecycle. 
The result was a more data-\n\ndriven organization that could better forecast operations to streamline costs and boost revenue.\n\n**CASE STUDY**\n\nEdmunds is on a mission to make car shopping an easy experience for all. With the Databricks Lakehouse\n\nPlatform, they are able to simplify access to their disparate data sources and build ML models that make\n\npredictions off data streams. With real-time insights, they can ensure that the inventory of vehicle listings\n\non their website is accurate and up to date, improving overall customer satisfaction.\n\n\n-----\n\n**CHAPTER 6**\n### Use Case: Point of Sale\n and Clickstream\n\n\n**Overview**\n\nDisruptions in the supply chain — from reduced product supply and diminished warehouse capacity —\n\ncoupled with rapidly shifting consumer expectations for seamless omnichannel experiences are driving\n\nretailers to rethink how they use data to manage their operations. Historically, point-of-sale (POS) systems\n\nrecorded all in-store transactions, but were traditionally kept in a system that was physically in the store.\n\nThis would result in a delay in actionable insights. And now with consumers increasingly shopping online, it’s\n\ncrucial to not only collect and analyze that clickstream data quickly, but also unify it with POS data to get a\n\ncomplete and real-time snapshot of each customer’s shopping behavior.\n\nNear real-time availability of information means that retailers can continuously update their estimates of\n\nitem availability. No longer is the business managing operations based on their knowledge of inventory\n\nstates as they were a day prior, but instead is taking actions based on their knowledge of inventory states as\n\nthey are now.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce\n\n**Challenges**\n\n\u0007Retailers with legacy POS systems in their brick-and-mortar stores are working with siloed and\n\nincomplete sales data\n\n\u0007Both POS and clickstream data need to be unified and ingested in real time\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\n\n**Value with Databricks**\n\nDatabricks brings POS and clickstream data together for a unified data source that leads to real-time\n\ninsights and a clearer understanding of customer behavior.\n\n\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n\nclickstream data\n\n\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n\ncustomer purchasing behaviors and trends\n\n\nto have them perform a free proof-of-\n\n\nconcept with your real-time data.\n\n\n\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n\n\n-----\n\n**CHAPTER 7**\n### Use Case: On-Shelf Availability\n\n\n**Overview**\n\nEnsuring the availability of a product on shelf is the single largest problem in retail. Retailers globally are\n\nmissing out on nearly $1 trillion in sales because they don’t have on hand what customers want to buy in\n\ntheir stores. Shoppers encounter out-of-stock scenarios as often as one in three shopping trips. All told,\n\nworldwide, shoppers experience $984 billion worth of out-of-stocks, $144.9 billion in North America alone,\n\naccording to industry research firm IHL.\n\nIn the past, if a customer faced an out-of-stock, they would most likely select a substitute item. The cost\n\nof going to another store prevented switching. 
Today, e-commerce loyalty members, such as those who\n\nbelong to Walmart+ and Amazon Prime, are 52% more likely than other consumers to purchase out-of-stock\n\nitems online. It is believed that a quarter of Amazon’s retail revenue comes from customers who first tried to\n\nbuy a product in-store. In all, an estimated $36 billion is lost to brick-and-mortar competition, and another\n\n$34.8 billion is lost to Amazon or another e-retailer, according to IHL.\n\nOn-shelf availability takes on a different meaning in pure e-commerce applications. An item can be\n\nconsidered in stock when it is actually in a current customer’s basket. If another customer places the same\n\nitem in their basket, there is the possibility that the first customer will purchase the last available item\n\nbefore the second customer. This problem is exacerbated by retailers who use stores to keep inventory. In\n\nthese situations, customers may order an item that is picked for delivery at a much later time. The window\n\nbetween ordering and picking creates the probability of out-of-stocks.\n\nOn-shelf availability predicts the depletion of inventory by item, factors in safety stock levels and\n\nreplenishment points, and generates a signal that suggests an item may be out of stock. This information is\n\nused to generate alerts to retail staff, distributors, brokers and consumer goods companies. Every day, tens\n\nof thousands of people around the world do work that is generated by these algorithms.\n\nThe sheer volume of data used to calculate on-shelf availability prevents most companies from analyzing\n\nall of their products. Companies have between midnight and 4 AM to collect all of the needed information\n\nand run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n\nthe priority categories or products to analyze, which means a significant percentage of their unavailable\n\nproducts will not be proactively addressed.\n\n\n-----\n\nOne of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n\nWhile some retailers are investing in computer vision and robots, and others employ the use of people to\n\nmanually survey item availability, most retailers default to a signal of determining when an item has not been\n\nscanned in an acceptable time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nE-commerce Direct to\nConsumer\n\n\n**Challenges**\n\nThe biggest challenge to generating on-shelf availability alerts is time. Companies may receive their final sales\n\ndata from the preceding day shortly after midnight. They have less than 4 hours from that point to ingest large\n\nvolumes of t-log data and calculate probabilities of item availability. Most firms are encumbered by a data\n\nwarehouse process that only releases data after it has been ingested and aggregates have been calculated, a\n\nprocess that can require multiple hours per night.\n\nFor this reason, most firms make sacrifices in their analysis. They may alternate categories they analyze by\n\ndifferent days, prioritize only high-impact SKUs, or run analysis at higher-level and less-accurate aggregate\n\nlevels. 
Among the challenges:\n\n\u0007Processing large volumes of highly detailed data and running millions of models in a narrow time window\n\n\u0007Companies are spending hundreds of thousands of dollars annually to generate these daily alerts for a\n\nfew categories\n\n\u0007Dealing with false positives and negatives in predictions\n\n\u0007Distributing information quickly and efficiently to internal systems and external partners\n\n\n-----\n\n**Value with Databricks**\n\nDatabricks enables customers to generate on-shelf availability (OSA) predictions at scale with no\n\ncompromises.\n\n**\u0007** Delta removes the data processing bottleneck. Delta enables retailers to stream in real time or to batch\n\nprocess large volumes of highly detailed and frequently changing point-of-sale transaction data.\n\n**\u0007** Easily scale to process all OSA predictions within tight service windows using Apache Spark™\n\n**\u0007** Manage features and localize models with additional causal data to improve accuracy with MLflow\n\n**\u0007** Easily deploy information via streams, through APIs for mobile applications or partners, or to Delta for\n\nreporting\n\n**\u0007** Enable retailers to monetize their data by directly licensing OSA alerts\n\n**Solution overview**\n\nDatabricks enables companies to perform on-shelf availability analysis without making compromises to the\n\nbreadth or quality of predictions.\n\nIt begins with Delta Lake — a nearly perfect platform for ingesting and managing t-log data. One of the\n\nbiggest challenges with t-log data is the frequent changes to a transaction that can occur within\n\na day. Delta Lake simplifies this with transaction awareness using a transaction log, and creates additional\n\nmetadata for easier retrieval. Data is made available in a fraction of the time needed in data warehouse-\n\nbased systems. This is why the largest retailers in the world are using Delta Lake for processing t-log data.\n\nOnce data is available, users need to generate predictions about item availability on the shelf. With its\n\nextremely performant engine and the ability to distribute computation across countless nodes, Spark\n\nprovides the perfect platform for calculating out-of-stocks. Customers no longer need to run in aggregate\n\nor against a subset of data.\n\n\n-----\n\n**HOW TO GET STARTED**\n\n[Solution Accelerator:](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\n[On-Shelf Availability](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\nIn this solution, we show how the\n\nDatabricks Lakehouse Platform enables\n\nreal-time insights to rapidly respond\n\n\nAnd lastly, data is only useful if it drives better outcomes. Databricks can write the resulting data into Delta\n\nLake for further reporting, or to any downstream application via APIs, feeds or other integrations. 
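To make the flow above concrete, here is a minimal sketch (not taken from the eBook) of streaming point-of-sale t-log records into a Bronze Delta table and then flagging likely out-of-stock items. The table names, columns, landing path and the simple "hours since last scan" rule are illustrative assumptions standing in for the probabilistic OSA models described above.

```python
# Minimal sketch (assumed names/paths): stream t-log records into a Bronze Delta
# table, then flag items whose last scan looks stale for their sales velocity.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# 1. Continuously ingest point-of-sale JSON files into a Bronze Delta table.
(spark.readStream
      .schema("store_id STRING, item_id STRING, qty INT, scan_ts TIMESTAMP")
      .json("/mnt/raw/pos_tlog/")                       # assumed landing path
      .writeStream
      .format("delta")
      .option("checkpointLocation", "/mnt/chk/pos_bronze")
      .toTable("retail.pos_bronze"))

# 2. Scoring job: a simple stale-scan heuristic standing in for the OSA models.
recent = (spark.table("retail.pos_bronze")
               .where(F.col("scan_ts") >= F.current_timestamp() - F.expr("INTERVAL 7 DAYS")))
features = (recent.groupBy("store_id", "item_id")
                  .agg(F.max("scan_ts").alias("last_scan_ts"),
                       F.count("*").alias("scans_7d")))
alerts = features.where(
    F.col("last_scan_ts") < F.current_timestamp() - F.expr("INTERVAL 6 HOURS"))

# 3. Write alerts back to Delta so reports, APIs or partner feeds can pick them up.
alerts.write.mode("overwrite").saveAsTable("retail.osa_alerts")
```

The resulting alerts table is what downstream systems and partners would consume, as the chapter goes on to describe.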
Users can\n\nfeed their predictive alerts to downstream retail operations systems or even to external partners within the\n\ntightest service windows, and in enough time to drive actions on that day.\n\n**Typical use case data sources include:** point-of-sale data, replenishment data, safety stock calculations,\n\nmanual inventory data (optional), robotic or computer vision inventory data (optional)\n\n**CASE STUDY**\n\nReckitt distributes its products to millions of consumers in over 60 countries, which was causing the\n\norganization to struggle with the complexity of forecast demand, especially with large volumes of different\n\ntypes of data across many disjointed pipelines. Thanks to the Databricks Lakehouse Platform, Reckitt now\n\nuses predictive analytics, product placement and business forecasting to better support neighborhood\n\ngrocery stores.\n\n\nto demand, drive more sales by\n\nensuring stock is available on shelf, and\n\nscale out your forecasting models to\n\naccommodate any size operation.\n\n\n-----\n\n**CHAPTER 8**\n### Use Case: Customer and Vehicle Identification\n\n\n**Overview**\n\nCOVID-19 led to increased consumer demand for curbside pickup, drive-through and touchless payment\n\noptions. Retailers that were able to implement these new services have been able to differentiate overall\n\ncustomer experiences and mitigate catastrophic hits on revenue levels.\n\nFor retailers to create a seamless contactless experience for customers, they need real-time data to\n\nknow when a customer has arrived and where they’re located, as well as provide updates throughout the\n\npickup journey. And through the use of computer vision, they can capture that data by employing optical\n\nrecognition on images to read vehicle license plates.\n\nRetailers can also use information captured from license plates to make recommendations on buying\n\npatterns. Looking ahead, facial recognition also has the potential to provide retailers with valuable\n\ninformation to better serve their customers in real time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDrive-Through\nFood Retailers\n\n\n**Challenges**\n\n\u0007Ineffective data processing can lead to suboptimal order preparation timing\n\n\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status\n\n\n-----\n\n**Value with Databricks**\n\nDatabricks makes it possible to not only identify customers and vehicles in real time but also provide real-\n\ntime communications throughout the entire shopping and curbside or drive-through experience.\n\n\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order\n\npreparation timing\n\n\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensure\n\neach subsequent visit is equally as or more seamless than the last\n\n\u0007 **Optimizing with AI/ML** — implementing AI and ML models can lead to more effective geofencing,\n\nvehicle identification and order prediction\n\n**CASE STUDY**\n\n**CASE STUDY**\n\n\n-----\n\n**CHAPTER 9**\n### Use Case: Recommendation Engines\n\n\n**Overview**\n\nCustomers that feel understood by a retailer are more likely to spend more per purchase, purchase more\n\nfrequently with that retailer, and deliver higher profitability per customer. 
The way that retailers achieve this\n\nis by recommending products and services that align with customer needs.\n\nProviding an experience that makes customers feel understood helps retailers stand out from the crowd\n\nof mass merchants and build loyalty. This was true before COVID, but shifting consumer preferences make\n\nthis more critical than ever for retail organizations. With research showing the cost of customer acquisition\n\nis as much as five times as retaining existing ones, organizations looking to succeed in the new normal must\n\ncontinue to build deeper connections with existing customers in order to retain a solid consumer base.\n\nThere is no shortage of options and incentives for today’s consumers to rethink long-established patterns\n\nof spending.\n\nRecommendation engines are used to create personalized experiences for users across retail channels.\n\nThese recommendations are generated based on the data collected from purchases, items interacted\n\nwith, users’ behavior across physical and digital channels, and other data such as from customer service\n\ninteractions and reviews. Leveraging a Customer 360 architecture that collects all user clickstream and\n\nbehavioral data, marketers are able to create recommendations that are integrated with other business\n\nobjectives such as highlighting items that are on promotion or product availability.\n\nCreating recommendations is not a monolithic activity. Recommendation engines are used to personalize\n\nthe customer experience in every possible area of consumer engagement, from proactive notifications and\n\noffers, to landing page optimization, suggested products, automated shipment recommendations, cross-sell\n\nand upsell, and even suggestions for complementary items after the purchase.\n\n\n-----\n\n**R E L E V A N T F O R**\n\n\nRetail E-commerce Direct to\nConsumer\n\n\nMedia Telecom Financial Services\n(any B2B or B2C\ncompany)\n\n\n**Challenges**\n\nRecommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n\nbut traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n\nrecommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n\nirrelevant.\n\n**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n\nis improved by having recent data, and yet most systems struggle to handle the large volumes of\n\ninformation involved.\n\n**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n\ntouchpoints in one place are critical to enabling this use case. More data, including transaction and\n\nclickstream data, is critical for driving accuracy and precision in messaging.\n\n**Processing speed.** Retailers need to be able to frequently refresh models based on constantly\n\nchanging dynamics, and deliver real-time recommendations via APIs.\n\n**Automation.** This is an “always-on” use case where automation is essential for scalability and\n\nresponsiveness based on frequent model updates.\n\n\n-----\n\nMany firms choose to use recommender systems from Amazon or Google. Using these systems trains\n\nthe general recommendation engine in a way that helps competitors improve the accuracy of their own\n\nrecommendations.\n\n**Value with Databricks**\n\nRecommendations are one of the most critical capabilities that a retailer maintains. 
This is a capability that\n\nretailers must own, and Databricks provides a solid platform for enabling this.\n\nUsing Databricks as the foundation for their Customer 360 architecture to deliver omnichannel\n\npersonalization, sample value metrics from a media agency include:\n\n**200% ROI for 70% of retailers** engaging in advanced personalization\n\n**10% improvement** in conversions\n\n**35% improvement** in purchase frequency\n\n**37% improvement** in customer lifetime value\n\n**Solution overview**\n\nRecommendations are only as good as the data that powers them. Delta Lake provides the best platform for\n\ncapturing and managing huge volumes of highly atomic and frequently changing data. It allows organizations\n\nto combine various sources of data in a timely and efficient manner, from transactions, demographics and\n\npreference information across products, to clickstream, digital journey and marketing analytics data to bring\n\na 360 view of customer interactions to enable omnichannel personalization.\n\nBy identifying changes in user behavior or engagement, retailers are able to detect early signals that\n\nindicate a propensity to buy or a change in preferences, and recommend products and services that will\n\nkeep consumers engaged.\n\n\n-----\n\n**Typical use case data sources include:** Customer 360 data, CRM, loyalty data, transaction data,\n\nclickstream data, mobile data:\n\n**Engagement data** — transaction log data, clickstream data, promotion interaction\n\n**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n\nchildren, location\n\n**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n\nto churn\n\n**CASE STUDY**\n\nFor Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n\nfor help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n\npersonalized to each of their customers.\n\n**CASE STUDY**\n\nColumbia’s legacy ETL was unable to support batch and real-time use cases at scale. After migrating to\n\nDatabricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n\nbusiness decisions.\n\n**CASE STUDY**\n\nPandora wanted to drive stronger online engagement with their customers, so they used the Databricks\n\nLakehouse Platform to create more personalized experiences and boost both click-to-open rates and\n\nquarterly revenue.\n\n\nHOW TO GET STARTED\n\nDatabricks has created [four](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n\n[Recommendation Engine accelerators,](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n\nwith content-based and collaborative\n\nfilter methods, and both item-\n\nand user-based analysis. These\n\naccelerators have been further refined\n\nto be highly performant to enable\n\nfrequent retraining of models.\n\nTo begin working on recommendation\n\nengines, contact your Databricks\n\naccount team.\n\n\n-----\n\n**CHAPTER 10**\n### Use Case: Perpetual Inventory\n\n\n**Overview**\n\nWith the rapid adoption of digital channels for retail, staying on top of your inventory is crucial to meeting\n\ncustomer demand. As a result, the periodic inventory system is now outdated — instead, using a perpetual\n\ninventory model allows businesses to perform immediate and real-time tracking of sales and inventory\n\nlevels. 
This has the added benefit of reducing labor costs and human error, ensuring that you always have an\n\naccurate overview of your inventory and can better forecast demand to avoid costly stockouts.\n\nThe key to building a perpetual inventory system is real-time data. By capturing real-time transaction\n\nrecords related to sold inventory, retailers can make smarter inventory decisions that streamline operations\n\nand lower overall costs.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Supply Chain\n\n\nInventory\nManagement\n\n\n**Challenges**\n\n**\u0007** Companies need to scale to handle ever-increasing inventory and the data associated with the products\n\n**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n\nview of inventory\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n**Value with Databricks**\n\nDatabricks enables real-time inventory updates, giving businesses the insights they need to properly\n\nmanage inventory and to forecast more accurately.\n\n**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n\nthe latest sales data\n\n**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n\ncompanies know they’re getting the most accurate information at any point\n\n**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n\nmanagement costs\n\n\n-----\n\n**CHAPTER 11**\n### Use Case: Automated\n Replenishments\n\n\n**Overview**\n\nCustomers favor convenience more than ever when it comes to their goods, and automated replenishments\n\nhelp meet that need. Whether it’s through a connected device or smartphone app, real-time data plays a\n\nkey role in ensuring consumers get a refill automatically delivered at the right time.\n\nOn the manufacturing side, this real-time data can also help with vendor-managed replenishment (VMR),\n\nreducing the time needed to forecast, order and receive thousands of items.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Direct to\nCustomer\n\n\n**Challenges**\n\n**\u0007** Being able to ingest large amounts of data quickly is crucial to actually fulfilling the\n\nreplenishment orders\n\nWith VMR, there may be a disconnect between the vendor and customer, resulting in a forecast\n\nfor replenishment even when the customer can’t fulfill that order\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n**Value with Databricks**\n\nDatabricks enables real-time inventory updates, giving businesses the insights they need to properly\n\nmanage inventory and to forecast more accurately.\n\n**\u0007Near real-time insights** — the greater speed to data means businesses can stay on top of\n\ncustomer needs\n\n**\u0007Scalability** — companies can scale with Databricks to handle thousands of SKUs, each with its own\n\nunique properties and expiry dates\n\n**\u0007Optimizing with AI/ML** — using AI and ML can lead to better forecasting and predictions\n\n\n-----\n\n**CHAPTER 12**\n### Use Case: Fresh Food Forecasting\n\n\n**Overview**\n\nFresh food typically accounts for up to 40% of revenue for grocers, and plays an important role in driving\n\nstore traffic. 
But fresh food is also incredibly complex to manage — prices can be volatile, there is a wide\n\nrange of suppliers to work with and the products expire, which creates significant amounts of waste.\n\nIn order to avoid losing significant revenue, businesses need to properly forecast when food is nearing its\n\nsell-by date, the current levels of customer demand (also taking into account seasonality), and the proper\n\ntiming for replenishing food stock. Being able to tap into real-time data is key to staying on top of the ever-\n\nchanging needs around fresh food.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce Distributors Logistics Restaurants\n\n**Challenges**\n\n**\u0007** Because of the perishable nature of fresh food, customers need to be able to ingest data quickly\n\nenough to conduct daily forecasting and daily replenishment\n\n**\u0007** Customers are running aggregate-level forecasts, which are less accurate than fine-grained forecasting\n\n**\u0007** Customers are forced to compromise on what they can analyze\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team to get\n\nstarted with inventory allocation. Databricks\n\ndoes not have a Solution Accelerator.\n\nView our webinar covering demand forecasting\n\nwith Starbucks and then read our blog about\n\ndemand forecasting.\n\n[Fine-grained time series forecasting at scale.](https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html)\n\nThis blog details the importance of time series\n\nforecasting, walks through building a simple\n\nmodel to show the use of Facebook Prophet, and\n\nthen shows off the combination of Facebook\n\nProphet and Apache Spark to scale to hundreds\n\nof models.\n\n[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)\n\nVideo and Q&A from our webinar with Starbucks\n\n\n**Value with Databricks**\n\nCustomers average a double-digit improvement in forecast accuracy, leading to a reduction in lost sales and in\nspoiled products, as well as lower inventory and handling costs.\n\n**\u0007Improved accuracy** — on average, customers moving to Databricks for demand forecasting report a\n\ndouble-digit improvement in forecast accuracy\n\n**\u0007Ability to scale and perform fine-grained (day, store, item) level forecasts** — rapidly scale to tens of\n\nmillions of model iterations in narrow service windows. Companies need accurate demand forecasts in a\nfew hours.\n\n**\u0007Eliminate compromises on what to analyze** — customers do not need to select winners or losers among\n\nthe products they forecast. They can predict demand for all products as frequently as required.\n\n**Solution overview:**\n\nDatabricks is well suited to handling forecasting for fresh food at scale. Forecasting begins with the Databricks\nSolution Accelerator. It enables companies to rapidly build fine-grained forecasting of items — forecasting that\ncan be efficiently scaled to tens of millions of predictions in tight service windows.\n\n**Typical use case data sources include:** historic point-of-sale data, shipment data, promotions, pricing,\n\nexpiration dates and weather.\n\n**CASE STUDY**\n\nButcherBox faced the complex challenges of securing inventory with enough lead time, meeting highly variable\ncustomer order preferences and unpredictable customer sign-ups, and managing delivery logistics. 
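As an illustration of the fine-grained forecasting pattern referenced in the blog above (one Facebook Prophet model per store and item, distributed with Apache Spark), the following minimal sketch shows the general shape of such a job. The table name, column names and 14-day horizon are assumptions for illustration, not details from the eBook or the Solution Accelerator.

```python
# Minimal sketch (assumed table/columns): fit one Prophet model per store/item
# group in parallel with Spark and write the forecasts back to Delta.
import pandas as pd
from prophet import Prophet          # packaged as "fbprophet" in older releases
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

history = spark.table("retail.daily_sales")   # store_id, item_id, sale_date, units
result_schema = "store_id STRING, item_id STRING, ds TIMESTAMP, yhat DOUBLE"

def forecast_group(pdf: pd.DataFrame) -> pd.DataFrame:
    # Prophet expects the history as two columns named ds (date) and y (value).
    train = pdf.rename(columns={"sale_date": "ds", "units": "y"})[["ds", "y"]]
    model = Prophet(weekly_seasonality=True, daily_seasonality=False)
    model.fit(train)
    forecast = model.predict(model.make_future_dataframe(periods=14))[["ds", "yhat"]]
    forecast["store_id"] = pdf["store_id"].iloc[0]
    forecast["item_id"] = pdf["item_id"].iloc[0]
    return forecast[["store_id", "item_id", "ds", "yhat"]]

# Each (store, item) group is sent to a worker, fitted independently, and the
# per-group forecasts are unioned into one result table.
(history.groupBy("store_id", "item_id")
        .applyInPandas(forecast_group, schema=result_schema)
        .write.mode("overwrite")
        .saveAsTable("retail.demand_forecasts"))
```

Because each group is fitted independently, the job scales to millions of store/item combinations by adding workers, which is what makes the tight nightly service windows described above achievable.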
With\nDatabricks, the company was able to create a predictive solution to adapt quickly and integrate tightly with the\nrest of its data estate.\n\n\non demand forecasting.\n\n**CASE STUDY**\n\nSam’s Club needed to build out an enterprise-scale data platform to handle the billions of transactions and\ntrillions of events going through the company. Find out how Databricks became a key component in the shift\nfrom on premises Hadoop clusters to a cloud based platform\n\n\n-----\n\n**CHAPTER 13**\n### Use Case: Propensity-to-Buy\n\n\n**Overview**\n\nCustomers often have repeatable purchase patterns that may not be noticed upon initial observation.\n\nWhile we know that commuting office workers are likely to purchase coffee at a coffee shop on weekday\n\nmornings, do we understand why they visit on Thursday afternoons? And more importantly, how do we\n\npredict these buying moments when customers are not in our stores?\n\nThe purpose of a propensity-to-buy model is to predict when a customer is predisposed to make a\n\npurchase and subsequently act on that information by engaging customers. Traditional propensity-to-buy\n\nmodels leveraged internal sales and loyalty data to identify patterns of consumption. These models are\n\nuseful, but are limited in understanding the full behavior of customers. More advanced propensity-to-buy\n\nmodels are now incorporating alternative data sets to identify trips to competing retailers, competitive scan\n\ndata from receipts, and causal data that helps to explain when and why customers make purchases.\n\nPropensity-to-buy models create a signal that is sent to downstream systems such as those for promotion\n\nmanagement, email and mobile alerts, recommendations and others.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce Direct to\nConsumer\n\n\n-----\n\n**Challenges**\n\n**\u0007** Customers do not want to be inundated with messages from retailers. Companies need to limit their\n\noutreach to customers to avoid angering them.\n\nCompanies need to traverse and process vast sums of customer data and generate probabilities of\n\npurchase frequently\n\nCompanies need to look at external data that helps build a propensity-to-buy model that captures the full\n\nshare of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Value with Databricks**\n\n**\u0007** Databricks allows companies to efficiently traverse huge volumes of customer data over time, and\n\nefficiently synthesize this into data for analysis\n\n**\u0007** Companies need to traverse and process vast sums of customer data and generate probabilities of\n\npurchase frequency\n\n**\u0007** Companies need to look at external data that helps build a propensity-to-buy model that captures the full\n\nshare of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Solution overview:**\n\nPropensity-to-buy analytics determine the signals that indicate the probability a customer is in a buying\n\nmoment. Historic propensity models relied on sales data to identify buying patterns, but newer approaches are\n\nincorporating behavioral data. Proximity to a coffee shop might push a consumer over the threshold of a buying\n\nmoment. Traditional, batch-oriented operations are insufficient to solve this problem. 
If you wait until that night,\n\nor even later in the day you have lost the opportunity to act\n\n\n-----\n\n**HOW TO GET STARTED**\n\nTo begin working on propensity-to-\n\nbuy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n\nWith the propensity to buy, speed becomes a critical force in determining key inflection points. Databricks\n\nenables marketers to ingest data in real time and update probabilities. Lightweight queries can be automated\n\nto refresh models, and the resulting data can be fed automatically to downstream promotions, web or mobile\n\nsystems, where the consumer can be engaged.\n\nAs this data is streamed into Delta Lake, data teams can quickly capture the data for broader analysis.\n\nCalculating a propensity to buy requires traversing interactions that are episodic in nature, and span broad\n\nperiods of time. Delta Lake helps simplify this with scalable metadata handling, ACID transactions and data\n\nskipping. Delta Lake even manages schema evolution to provide users with flexibility as their needs evolve.\n\n**Typical use case data sources include:** point-of-sale data with tokens, loyalty data, e-commerce sales data,\n\nmobile application data, competitive scan or receipt data (optional), place of interest data (optional)\n\n\n-----\n\n**CHAPTER 14**\n### Use Case: Next Best Action\n\n\n**Overview**\n\nThe e-commerce boom over the last couple of years has given consumers ample choice for digital\n\nshopping options. If your business isn’t engaging customers at every point in their purchasing journey, you\n\nrisk losing them to a competitor. By applying AI/ML to automatically determine — in real time — the next\n\nbest action for customers, you can greatly increase your conversion rates.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDirect to\nConsumer\n\n\nE-commerce\n\n\n**Challenges**\n\nSiloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n\nresulting in suboptimal recommendations for the next best action\n\nCompanies need to ingest large amounts of data in real time and then take action on it immediately\n\nMany businesses still struggle with training their ML models to properly determine the next best action\n\n(and self-optimize based on the results)\n\n\n-----\n\n**HOW TO GET STARTED**\n\nTo begin working on propensity-to-\n\nbuy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n\n**Value with Databricks:**\n\nDatabricks provides all the tools needed to **process large volumes of data and find the next best**\n\n**action** at any given point in the customer journey\n\n**Near real-time insights** — the greater speed to data means businesses can react immediately to\n\ncustomer actions\n\n**Single source of truth** — break down data silos by unifying all of a company’s customer data (including\n\nbasic information, transactional data, online behavior/purchase history, and more) to get a complete\n\ncustomer profile\n\n**Optimizing with AI/ML** — use AI to create self-optimizing ML models that are trained to find the best next\n\nstep for customers\n\n\n-----\n\n**CHAPTER 15**\n### Customers That Innovate With Databricks Lakehouse for Retail\n\n\nSome of the top retail and consumer packaged goods 
companies in the world turn to Databricks Lakehouse\n\nfor Retail to deliver real-time experiences to their customers.\n\nToday, data is at the core of every innovation in the retail and consumer packaged goods industry.\n\nDatabricks Lakehouse for Retail enables companies across every sector of retail and consumer goods to\n\nharness the power of real-time data and analytics to solve strategic challenges and deliver more engaging\n\nexperiences to customers.\n\nGet started with a free trial of Lakehouse for Retail and start building better data applications today.\n\n**[Start your free trial](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo at: [databricks.com/contact](http://databricks.com/contact\r)\n\n\n-----\n\n###### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "**eBook**\n\n# Accelerate Digital Transformation in Insurance With Data, Analytics and AI\n\n### Real-world use cases with Databricks Lakehouse\n\n\n-----\n\n## Contents\n\nIntroduction ................................................................................................................................................................................................................ **03**\n\nThree Trends Driving Transformation in Insurance .............................................................................................................................. **05**\n\nThe Need for Modern Data Infrastructure ................................................................................................................................................. **06**\n\nCommon Challenges Insurers Face Using Legacy Technology ...................................................................................................... **08**\n\nWhy Lakehouse for Insurance ............................................................................................................................................................................ **10**\n\nKey Use Cases for Insurance:\n\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\n\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... 
**15**\n\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\n\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\n\nGlobal Regulatory Impact in Insurance ......................................................................................................................................................... **18**\n\n**I N D U S T R Y S O L U T I O N S :** Get Started With Accelerators, Brickbuilders and Enablers ............................................................ **19**\n\nGet Started With Industry Solutions ............................................................................................................................................................. **20**\n\nConclusion ................................................................................................................................................................................................................... **26**\n\n\n-----\n\n## Introduction\n\nWith the rapid advancement of technology, rising consumer expectations, and strong competition between insuretechs and incumbents resulting\nfrom the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. Today, new\ninsights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\ndata from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\nclickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.\n\nConsumers want stronger reassurance for what they value most: financial security and greater peace of mind.\nInsurers have always prided themselves on delivering such protection and security. However, customer needs\nhave changed, and insurers that move most swiftly to satisfy them will be in the best position to navigate\nchallenging times. The bottom line is that insurers must adapt to these changes and meet the evolving needs of\ntheir customers to remain competitive.\n\nData-driven insurers will seek opportunities to improve the customer experience, develop more sophisticated\npricing models, and increase their operational resilience. More than ever, the total cost of ownership (TCO) of\ndigital investments and enterprise data strategy has become a top priority for boards and senior executives\nin the insurance industry. So, what does this mean from a data and analytics perspective? It all comes down\nto having one reliable source of truth for data, which is derived from batch and streaming data, structured and\nunstructured data, from multiple clouds and jurisdictions.\n\n\nIn a regulated and risk-averse industry where data sharing was once seen as optional, it has now become\nfundamental. 
To compete in the digital economy, insurers need an open and secure approach to data sharing.\nDatabricks Lakehouse for Insurance plays a critical role in helping insurance providers accelerate innovation and\ntransform their businesses, resulting in significant operational efficiencies and improved customer experiences\nat a fraction of the cost of data warehouses. This eBook provides an in-depth exploration of key challenges\nand common use cases in the insurance industry. Most importantly, you will gain insight into how Databricks\nLakehouse can unlock the true value of your data through practical Solution Accelerators and a wide range of\npartners available to assist you on your journey.\n\n\n**The future of insurance will**\n\n**become increasingly data-driven,**\n\n**and analytics enabled.”**\n\n**[EY’s](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)** [“Five principles for the future of protection”](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)\n\n\n-----\n\nThe Lakehouse reference architecture below illustrates a sample framework upon\nwhich insurers can build. Moving from left to right in the diagram, the first layer\nrepresents various data sources such as on-premises systems, web and mobile\napplications, IoT sensors, enterprise data warehouses, and third-party APIs. Data\nis then ingested through automated data pipelines, and processed within the\nLakehouse platform across three layers (Bronze, Silver and Gold). These layers\nare responsible for data preparation, including ML model registry, centralized\n\n\ngovernance, workflow orchestration, and job scheduling. They ensure a compliant\nand secure infrastructure that sits atop the cloud layer (or multiple clouds),\neliminating the need for data duplication. Finally, the transformed data is delivered\nas actionable insights and supports use cases such as automated reporting,\nbusiness analytics, customer 360, and claims analytics. These use cases not only\nmitigate risk but also drive revenue.\n\n\n**Data Sources**\n\n**On-Premises**\n**Servers**\n\n\n**Ingestion**\n\n\n**Lakehouse for Financial Services**\n\n**Bronze Layer** **Silver Layer** **Gold Layer**\n\n\n**Serving**\n\n**Automated**\n**Reporting**\n\n\n**Web and Mobile**\n**Applications**\n\n\n**Business Analytics**\n**and Interactive**\n**Dashboards**\n\n\n**Raw Entity Data**\n\n\n**Curated Feature**\n**Sets**\n\n\n**Aggregated**\n**Business Views**\n\n\n**Automated Data Pipelines**\n**(Batch or Streaming)**\n\n**Collaborative**\n**Data Source**\n\n\n**Internet-of-Things**\n**(IoT) Devices**\n\n\n**Enterprise Data**\n**Warehouses**\n\n\n**Third-Party APIs**\n**and Services**\n\n\n**ML Model**\n**Registry**\n\n\n**Centralized Data**\n**Governance**\n\n\n**Workflow**\n**Orchestration**\n\n\n**Productionized**\n**Referenced Data**\n**and Models**\n\n**Job Scheduling**\n\n\n-----\n\n## Three Trends Driving Transformation in Insurance\n\nOver the next decade, technology-enabled insurance companies will bear little resemblance to today’s organizations.\nThe following three trends are driving this transformation in the insurance industry:\n\n\n**The rapid emergence of large language**\n**models and generative AI**\n\nIn recent years, there has been a significant\nbreakthrough in the field of artificial intelligence with\nthe emergence of large language models (LLMs)\nand generative AI. 
These models, such as GPT-4 and\nits predecessors, Databricks Dolly and others are\nbuilt using deep learning techniques and massive\namounts of training data, enabling them to generate\nhuman-like text and perform a wide range of natural\nlanguage processing tasks. LLMs and generative AI\ncan help insurance companies automate repetitive\ntasks such as underwriting, claims processing,\n\nand customer service, improving efficiency and\nreducing costs. They can also help insurers to better\nunderstand customer needs and preferences,\nleading to more personalized products and services.\nHowever, as with any disruptive technology, the\nadoption of LLMs and generative AI will require\ncareful consideration of ethical and regulatory\nissues, such as data privacy and algorithmic bias.\n\n\n**Transformed ecosystems**\n**and open insurance**\n\n[According to EY](https://assets.ey.com/content/dam/ey-sites/ey-com/en_gl/topics/insurance/ey-2022-global-insurance-outlook-report.pdf) , leading companies leverage\ninsurtechs in their ecosystems to achieve high\nmargins in commoditized products. Open insurance,\nwhich involves sharing and managing insurancerelated data through APIs, is more than an item in\nthe regulatory agenda. It can give consumers access\nto better products and accurate pricing, as well as\nenable them to execute transactions more easily.\nIn its [annual Chief Data Officer Survey](https://www.gartner.com/smarterwithgartner/data-sharing-is-a-business-necessity-to-accelerate-digital-business) , Gartner\nfound that organizations that promote external data\nsharing have three times the measurable economic\n\nbenefit across a variety of performance metrics\ncompared to their peers.\n\n\n**Revised target operating model**\n**with a focus on talent**\n\nDemographic shifts and perennial cost pressures\nmake it critical for insurers to attract and retain\ntalent. Consequently, it’s important for insurers\nto equip their workforces with the right tools\nand technologies to help them identify business\nprocesses that can be optimized to differentiate\nthemselves from their competitors, with an emphasis\non moments that matter in the customer journey,\naccording to EY. Recent research from Deloitte\nhighlights the advantages of upskilling and building\na future-ready workforce. One of the benefits\n\nof AI adoption in the workforce is that it enables\norganizations to automate a wide range of business\nprocesses, boosting speed and efficiency. But what’s\neven more important is that it enables employees to\nfocus on higher-value work, according to Deloitte.\n\n\n-----\n\n## The Need for Modern Data Infrastructure\n\n**Insurers turning to cloud and data analytics**\n\n\nThe insurance industry has undergone significant changes over the years, and\none of the areas that has evolved the most is data management. With the\ngrowing need for advanced analytics and digital transformation, many insurance\ncompanies are turning to cloud technology and modern data infrastructures\nto enhance their data management strategies. The benefits of adopting cloud\ntechnology are numerous, particularly the ability to efficiently store and quickly\naccess vast amounts of data, which is crucial in a heavily regulated and datadriven industry like insurance. 
Additionally, the flexibility of the cloud enables\ninsurers to scale costs, adapt to changing work environments, and meet evolving\ncustomer and business requirements.\n\nFurthermore, insurance providers can leverage the cloud to analyze customer\ndata at scale, gaining insights into behaviors that drive hyper-personalization,\ndynamic pricing and underwriting, and form the foundation for claims automation.\nBy implementing advanced analytics, insurers can innovate more easily, scale their\nbusinesses, and bring new products to market more quickly.\n\nTo remain competitive, insurance companies must increase their investment in\ncloud technology and data analytics, as this will accelerate insightful decision-making across various functions such as claims management, underwriting,\npolicy administration, and customer satisfaction. Overall, the adoption of cloud\ntechnology and data analytics is imperative for insurance providers to enhance\noperational efficiency, improve business processes, and stay relevant in today’s\nfast-paced business landscape.\n\n\n-----\n\n**Let’s take a closer look at a few examples:**\n\n\n**Auto insurers** need to integrate new data sources, such as weather and traffic,\nto build solutions capable of real-time processing. This enables them to alert\nemergency services promptly and gain a better understanding of drivers’ driving\npatterns. It also enables the development of sophisticated machine learning-based risk assessment, underwriting and claims models.\n\n**Commercial insurance**, including property, general liability, cyber insurance and\nbusiness income insurance, utilizes ML-based automation of actuarial models.\nThis automation facilitates underwriting, claims forecasting and dynamic pricing\nfor their customers. Another notable trend in recent years is the use of IoT-based alerting for sensitive or valuable commodities. For example, in the case of\nvaccines, IoT sensors can monitor the temperature in real time and send alerts to\nthe appropriate team or person if the temperature exceeds acceptable thresholds.\nThis is crucial as vaccines must be stored within specific temperature ranges.\n\nIn **life insurance**, complex ML models can be employed to create a profile of\nthe customer’s lifestyle and, importantly, detect any changes to it. This deeper\nunderstanding and 360-degree view of the customer enable more customized\nunderwriting and pricing based on the policyholder’s current health, lifestyle and\neating habits.\n\n\n|Type of Data Source|Typical Vendors|Claims Automation and Transformation|Dynamic Pricing and Underwriting|Anomaly Detection and Fraudulent Claims|Customer 360 and Hyper-Personalization|\n|---|---|---|---|---|---|\n|Policy data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork|||||\n|Claims data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork, TransUnion|||||\n|Real-time ingestions|Cambridge Mobile Telematics, Zendrive, Custom|||||\n|Alternative / Supplemental data|Experian, Equifax, Verisk, IBM Weather|||||\n|Marketing data|Salesforce, HubSpot, Google Analytics|||||\n\n\n**Figure 1.** Innovating with data and analytics — high-priority business use cases made possible and key data sources from popular insurance vendors\n\n\n-----\n\n## Common Challenges Insurers Face Using Legacy Technology\n\n\nModernization is not an easy process for insurers, and while transforming IT\necosystems is necessary to improve business outcomes, ensuring business\ncontinuity is absolutely critical. 
However, the volume of data they collect, along\nwith changes in user behavior and legacy systems that can’t handle this amount of\ndata, are forcing insurance providers to accelerate their modernization journeys.\n\nInsurance providers face several challenges when using legacy technology, including:\n\n**Legacy on-premises systems:** Legacy on-premises systems are not only\nexpensive to maintain, but they also store large amounts of big data in silos across\nthe business. This makes it difficult to access the data, hindering data analytics\nefforts and limiting executives’ ability to make informed business decisions.\n\n**Ingesting large volumes of transactional data in real time:** The inability to\ningest data from transaction systems in real time is a major obstacle to obtaining\ncritical insights. Transaction logs from operations such as policy administration,\nenrollment and claims constantly stream data. However, many insurance\ncompanies still rely on legacy data warehouses built around batch processing,\nwhich is not suitable for ingesting and integrating large data sets. As a result,\ninsurers often opt to ingest data nightly, leading to delays in receiving accurate\ndata for decision-making.\n\n\n**Performing fine-grained analysis at scale within tight time frames:** Legacy\ntechnology forces insurers to make a trade-off when analyzing data for user intent.\nThey can choose between detailed and accurate predictions or fast predictions.\nRunning detailed forecasts can improve accuracy, but it requires performing\nmillions of model calculations within narrow service windows, which exceeds the\ncapability of legacy data platforms. Consequently, insurance companies have to\naccept less accurate predictions.\n\n**Powering real-time decisions on the front line:** Serving real-time data to\nthousands of workers is a complex task. While data warehouses can serve reports\nto large groups of users, they are limited to providing stale data. As a result, most\ninsurers only provide daily or weekly updates to reports and rely on employees’\njudgment for more frequent decisions.\n\n**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim\nto deliver personalized experiences across every channel, both digital and offline.\nWhile insurance providers have access to vast amounts of customer data, off-theshelf tools for personalization and customer segmentation struggle to handle such\nhigh volumes, leading to inaccurate analytics. To succeed in the insurance industry,\ncompanies must deliver personalized experiences at scale.\n\n\n-----\n\nDatabricks Lakehouse for Insurance addresses the key challenges faced across the\ninsurance value chain. The lakehouse enables the integration of various data types,\nincluding images and structured data, in real time. It offers robust management\nand governance capabilities, and rapidly transforms data into actionable insights\n\n\nthrough real-time reporting and predictive analytics. 
This platform-as-a-service\nsolution delivers exceptional speed and industry-leading total cost of ownership,\nproviding insurers with faster insights to enhance the customer experience and\ngain a competitive edge.\n\n\n**Product**\n**Development &**\n**Feature Selection**\n\n\n**Application**\n**Review &**\n**Submission**\n\n\n**Policy Issue,**\n**Service &**\n**Administration**\n\n\n**Sales & Lead**\n**Management**\n\n**Hyperpersonalization/**\n**life events**\n\n\n**Underwriting**\n**and Pricing**\n\n**UW rules**\n**guidelines &**\n**technical pricing**\n\n\n**Rating Offer &**\n**Endorsements**\n\n**Evaluate**\n**rate options,**\n**pricing and**\n**endorsements**\n\n\n**Claims**\n\n\n**Coverage/** **Review policy**\n**features/riders** **documents**\n**(submission)**\n\n\n**Omnichannel** **Fraud, frequency,**\n**severity and**\n**reserves**\n\n\n**We continuously develop solution accelerators and enablers to accelerate the time to market.**\n\n\n\n**•** Dynamic segmentation\n\n**•** Personas\n\n**•** Hyper-personalization\n\n**•** Intelligent automation\n\n\n\n**•** Product architecture and\nmanufacturing\n\n**•** Configurable products\n\n**•** Competitor rates\n\n\n\n**•** Reflexive questionnaire\n\n**•** LLM assistance for\ndocument summarization\n\n**•** NLP for unstructured data\n\n\n\n**•** Evaluation of risk within\nappetite\n\n**•** Validation of UW\nrequirements\n\n**•** Straight-through\nprocessing optimization\n\n**•** Risk assessment via\nactuarial pricing\n\n**•** Triaging of risk to\nunderwriter SME for policy/\nexposure changes\n\n\n\n**•** Predict loss cost\n(frequency and severity)\n\n**•** Computer vision on images\nto identify loss\n\n**•** Auto-adjudication and\ntriaging of claims to claim\nadjuster\n\n**•** Tailor communication by\nsegment (e.g., email, text,\nmail, or omnichannel)\n\n**•** Identify Fraud, Waste and\nAbuse, route to ICU\n\n\n**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs)\n\n\n-----\n\n## Why Lakehouse for Insurance\n\nDatabricks Lakehouse for Insurance combines simplicity, flexibility and reusability, enabling insurers to meet the demands of the market with speed and agility. It offers\nbest-in-industry performance and serves as a modern data architecture that provides differentiated capabilities for insurers to thrive in a competitive industry.\n\n\n\n**•** Insurance companies can store any type of\ndata using Databricks Lakehouse for Insurance,\nleveraging the low-cost object storage supported\nby cloud providers. This helps break down data\nsilos that hinder efforts to aggregate data for\nadvanced analytics, such as claim triaging and\nfraud identification, regulatory reporting, or\ncompute-intensive risk workloads. Another critical\nfeature is the time-travel capabilities of the\nlakehouse architecture, allowing insurers to access\nany historical version of their data.\n\n\n\n**•** Supporting streaming use cases, such as\nmonitoring transaction data, is easier with the\nlakehouse. It utilizes Apache Spark ™ as the data\nprocessing engine and Delta Lake as the storage\nlayer. Spark enables seamless switching between\nbatch and streaming workloads with just a single\nline of code. Delta Lake’s native support for ACID\ntransactions ensures reliable and high-performing\nstreaming workloads.\n\n\n\n**•** For both machine learning and non-machine\nlearning insurance models, a comprehensive\ngovernance framework is provided. 
Data, code,\nlibraries and models are linked and independently\nversion controlled using technologies like Delta\nLake and MLflow. Delta Lake ensures stability by\nallowing insurance companies to declare their\nexpectations for data quality upfront. MLflow\nenables training models in any language and\ndeploying them anywhere, minimizing the need for\ncomplex handoffs between data science practices,\nindependent validation units and operational teams.\n\n\n-----\n\n**Level-up value with Databricks Lakehouse for insurance**\n\nBuilding your data lakehouse with the Databricks Lakehouse Platform empowers your organization with the speed, agility and flexibility needed to address critical insurance\nuse cases that have a significant impact on your customers and your business. Additionally, it helps lower the total cost of ownership (TCO).\n\nWith a modern and unified data architecture, the Databricks platform enables the implementation of your data, analytics and AI strategy at scale on a unified and modern\ncloud data architecture. The key benefits include:\n\n\n**1. Cost and complexity reduction**\n\nThe Databricks Lakehouse provides an open, simple\nand unified cloud data management architecture\nthat streamlines operational inefficiencies, reduces\nIT infrastructure costs, and enhances productivity\nacross teams.\n\n\n**2. Enhanced risk management and control**\n\nBy unlocking the value of enterprise data, the\nplatform helps reduce corporate governance and\nsecurity risks. It facilitates data-driven decisionmaking through governed discovery, access and\ndata sharing.\n\n\n**3. Accelerated innovation**\n\nThe platform enables the acceleration of digital\ntransformation, modernization and cloud migration\ninitiatives, fostering new growth opportunities\nand driving innovation for improved customer and\nworkforce experiences.\n\n\nTo help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n\n\n-----\n\n**Reference Architecture for Smart Claims**\n\n\n**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n\nor incrementally through change data capture (CDC). These\n\ninclude structured and unstructured data sets like images, text,\n\nand video, such as IoT sensor data, operational data like claims\n\nand policies, and on-prem or third-party data such as from\n\ncredit bureaus, weather, and driving records. Partner Connect\n\noffers a range of ingest tools from different vendors that you can\n\ndirectly use from the Databricks portal.\n\n\n**2.** \u0007Delta Live Tables (DLT) is the preferred ETL\n\npath to transform the data based on business\n\nrequirements. All the data resides in cloud storage,\n\nwhere Delta refines it into Bronze, Silver and Gold\n\nzones of a medallion pipeline blueprint. 
Databricks\n\nWorkflows provide orchestration of the various\n\ndependent tasks, with advanced capabilities like\n\n\n**3.** \u0007Databricks SQL, with Photon\n\nand serverless options, caters\n\nto BI consumption use cases to\n\nrefresh a dashboard monitoring\n\nkey metrics and KPIs, with\n\nquery history and alerts on\n\ncritical events.\n\n\n**4.** \u0007Databricks ML Runtime,\n\nMLFlow, along with\n\nFeature Store, Auto ML,\n\nand real-time Model\n\nServing enable ML\n\nuse cases to provide\n\n\n**5.** \u0007Delta Sharing provides\n\na secure and governed\n\nway of sharing data\n\ninternally and externally\n\nwithout copying it,\n\nusing Unity Catalog.\n\n\npredictive insights.\n\n\nretry, repair and job status notifications.\n\n\n-----\n\n**Secure data sharing with Delta Lake**\n\nAt the heart of Databricks Lakehouse for Insurance is a technology that allows insurers to overcome the trade-offs between speed and accuracy. Technologies like Delta\nLake enable the lakehouse, which combines the strengths of data warehouses and data lakes, to directly address these challenges. With Delta Lake, insurance providers can\nunify all their data — structured and unstructured, batch and real-time — in one centrally managed and governed location.\n\nOnce the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information.\nThey can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases.\n\n**Business intelligence**\n\n**Streaming**\n\n**Centralized**\n**governance**\n\n\n##### Lakehouse Platform\n\n\n**Data science / ML**\n\n**One copy**\n**of data**\n\n**Data warehouse**\n\n**Orchestration**\n\n\n-----\n\n**K E Y U S E C A S E**\n\n## Claims automation and transformation\n\n**Overview**\n\n\nInsurers are entering a new era of claims transformation, supported by evolving technological advancements\nand increasing data availability. Leveraging the Databricks Lakehouse, organizations can deal with the massive\namount of structured and unstructured data coming in from different sources, in different formats, and time\nframes. Every touchpoint in the claims journey — beginning even before an incident occurs — can be supported\nby a combination of technology and human intervention that seamlessly expedites the process.\n\n**Business problem**\n\nMissing data, or data that is “not in good order” and needs to be corrected before processing, leads to claims\nleakage and inefficient processes in triaging claims to the right resource.\n\n**Solution/value with Databricks**\n\nEnable triaging of claims and resources by leveraging big data processing and integrated ML and AI capabilities,\nincluding MLflow model lifecycle management.\n\n**Business outcomes and benefits**\n\n**•** Decrease in annual claims payout\n\n**•** Increase in claim fraud detection/prevention\n\n**•** Improve efficiencies by 15%\n\n**“Applying AI as broadly, as aggressively**\n\n**and as enthusiastically as possible. 
No part**\n\n**of our business should be untouched by it.”**\n\n— \u0007Masashi Namatame, Group Chief Digital Officer,\nManaging Executive Officer, Tokio Marine\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**Tokio Marine: Striving to**\n**become Al-driven**\n\nInsurers of all types now routinely use AI\nmodels to drive underwriting, streamline claims\nprocessing and accelerate claims adjudication,\nprotect against insurance fraud, and improve\nrisk forecasting, for example. Tokio Marine —\nJapan’s oldest insurance company, which has\ndone business since 1879 — has been applying\nadvanced uses of AI, particularly in its auto\ninsurance business, says Masashi Namatame,\nGroup Chief Digital Officer and Managing\nExecutive Officer at Tokio Marine: “To assess\ncollision damages, the company uses an AIbased computer vision solution to analyze\nphotos from accident scenes.” Comparing these\nwith what he describes as “thousands or even\nmillions” of photos of past analogous incidents,\nthe model produces liability assessments of the\nparties involved and projects anticipated repair\ncosts. AI has also provided the company with\ntangible benefits in online sales — especially in\npersonalized product recommendations and\ncontract writing, according to Namatame. Read\nthe case study in the [MIT CIO vision 2025 report](https://www.databricks.com/resources/whitepaper/mit-cio-vision-2025) .\n\n\n-----\n\n**K E Y U S E C A S E**\n## Dynamic pricing and underwriting\n\n**Overview**\n\n\nIn modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\ncarriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\nThis involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\ngeolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\noffers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n\n**Business problem**\n\nActuaries are spending valuable time on low-value activities, which hampers agility and advanced analytical\ncapabilities in pricing and underwriting, hindering improvements in risk and pricing modeling.\n\n**Solution/value with Databricks**\n\n**•** Unified cloud-native platform\n\n**•** Scalability for ingesting IoT data from millions of trips, expanding the customer base\n\n**•** Reduced total cost of ownership compared to legacy Hadoop systems\n\n**•** Usage-based pricing, leading to lower premiums for customers and reduced risk for insurance carriers, thereby\nlowering loss ratios\n\n**•** Enables the creation of a digitally enabled, end-to-end underwriting experience\n\n**Business outcomes and benefits**\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**American financial services**\n**mutual organization**\n\nThis organization aimed to leverage the vast\namounts of structured and unstructured data\nit collected to enhance its underwriting and\ndecision-making processes, enabling greater\nefficiency and effectiveness. However, the\ncompany’s legacy infrastructure struggled\nto scale with the increasing data volume and\nprocessing demands, limiting its ability to\nanalyze the data and derive actionable insights.\n\nWith Databricks, the insurer centralized\neverything on one unified Lakehouse platform,\n\nsupporting all operational and analytical\nuse cases. 
This allowed them to analyze\nbroader sets of data for superior underwriting\nperformance and create a digitally empowered,\nend-to-end underwriting experience.\n\n\n\n**•** Improve competitive position\n\n**•** Decrease combined ratio\n\n**•** 15% improvement in efficiencies\n\n\n-----\n\n**K E Y U S E C A S E**\n## Anomaly detection and fraudulent claims\n\n**Overview**\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**One of the largest U.S.**\n**insurance companies and a**\n**leading small business insurer**\n\nThe increasing availability of data and market\ncompetition challenge insurance providers to\noffer better pricing to their customers. This\nU.S.-based insurer, with hundreds of millions of\ninsurance records to analyze for downstream\nML, realized that their legacy batch analysis\nprocess was slow and inaccurate, providing\nlimited insight for predicting the frequency\nand severity of claims. With Databricks, they\nwere able to scale up the use of deep learning\nmodels, resulting in more accurate pricing\npredictions and increased revenue from\nclaims. By leveraging Databricks Lakehouse,\nthey harmonized data, analytics and AI at\nscale, enabling accurate pricing predictions\nand supporting various use cases from vehicle\ntelematics to actuarial modeling.\n\n\nFraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\nAmerican consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\nin 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\nchange to support new channels and services, offering transactional features and facilitating payments through\ndigital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\nconsumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\nmodels. It often involves a complex decision science process that combines a rules engine with a robust and\nscalable machine learning platform.\n\n**Business problem**\n\nInsurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n\n**Solution/value with Databricks**\n\nModernized approaches in insurance require full digital transformation, including the adoption of usagebased pricing to reduce premiums. Insurance providers now consume data from the largest mobile telematics\nproviders (e.g., CMT) to obtain granular sensor and trip summaries for users of online insurance applications.\nThis data is crucial not only for pricing but also for underwriting scenarios to mitigate risks for carriers.\n\n**$1 of fraud costs companies 3.36x in chargeback,**\n**replacement and operational costs**\n\n\n[Lexis Nexis](https://risk.lexisnexis.com/insights-resources/research/2020-true-cost-of-fraud-retail)\n\n\n-----\n\n**K E Y U S E C A S E**\n\n## Customer 360 and hyper-personalization\n\n\n**Overview**\n\nWinning the hearts and minds of your customers\nstarts with personalizing the user experience. The\nability to offer complementary products to meet\nthe needs of your customers lets you build deeper\nrelationships with them and engender their loyalty.\nIn addition, a better understanding of the finer\ndetails within accounts allows you to offer them\nmore personalized products. 
To do this, you need\n360-degree customer views, which requires you to\nlocate and consolidate all your customers’ contact\ndata from every digital tool that you use and house\nit in one central location. With Databricks Lakehouse,\ninsurers can “hyper-personalize,” increase\ncross-sell/upsell opportunities, enhance customer\n360 and bring new products to market faster.\n\n**Business problem**\n\nThe inability to reconcile customer records across\ndifferent lines of business limits real-time customer\ninsights necessary for upselling and cross-selling.\nSiloed data makes it challenging to create accurate\nand comprehensive customer profiles, resulting in\nsuboptimal recommendations for the next best action.\n\n\n**Solution/value with Databricks**\n\nDatabricks provides the tools needed to process\nlarge volumes of data and determine the next best\naction at any point in the customer journey.\n\n**•** Eliminates data silos by unifying all customer data,\nincluding basic information, transactional data,\nonline behavior/purchase history, etc., to create\ncomplete customer profiles\n\n**•** Integrated data security ensures that security\nmeasures are incorporated at every layer of the\nDatabricks Lakehouse Platform\n\n**•** Delta improves data quality, providing a single\nsource of truth for real-time streams and ensuring\nreliable and high-quality data for data teams\n\n**•** Integrated ML and AI capabilities utilize AI to\ncreate self-optimizing ML models that determine\nthe next best step for each customer\n\n**•** MLflow model lifecycle management helps manage\nthe entire machine learning lifecycle reliably,\nsecurely and at scale\n\n\n**Business outcomes and benefits**\n\n**•** Use AI, ML, automation and real-time data to\ngain deeper customer insights and understand\ntheir needs\n\n**•** Improve competitive positioning\n\n**•** Enhance the customer experience\n\n**C U S T O M E R C A S E S T U D Y**\n\n**160-year-old U.S.**\n**insurance company**\n\nThis insurance provider underwent a significant\ndigital transformation to provide a more\npersonalized financial services experience to\nits 10,000 advisors and millions of customers\nacross various touchpoints. Recognizing the\nimportance of becoming data-driven, the\ncompany leveraged Databricks in its client\n360 platform to aggregate transactional and\nbehavioral data, along with core attributes,\nproviding business users with next-best-action\nrecommendations for seamless customer\nengagement.\n\n\n-----\n\n## Global Regulatory Impact in Insurance\n\n\n**Navigating global regulations**\n**with technical implementation**\n\nDigital innovation continues to reshape the insurance sector. The pace and scale\nof technological change are likely to increase due to factors such as artificial\nintelligence (AI), cloud computing, and the entry of new players like insurtechs,\ne-tailers, and manufacturers from outside the insurance industry.\n\nTo succeed and thrive in today’s economic environment, insurers should prioritize\nupgrading their infrastructure and technology, rather than solely focusing on\ntransforming operations. 
For example, migrating from on-premises systems to the\ncloud can bring significant benefits, according to global consultancy [Deloitte](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf) [.](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf)\n\nAs insurers upgrade their compliance processes to meet new global regulations,\nsuch as IFRS 17 and LDTI, the impact of regulatory updates becomes more\ncomplex for organizations operating across multiple jurisdictions. Instead of\nmerely responding to regulatory and industry requirements, insurance companies\nshould make data-focused investments that help them anticipate and meet the\nexpectations of distributors and policyholders.\n\n\n**IFRS-17**\n\nIFRS 17 is an International Finance Reporting Standard (IFRS) for\ninsurance contracts. IFRS 17 aims to standardize insurance accounting\nby providing consistent principles for all facets of accounting for\ninsurance contracts. IFRS 17 removes existing inconsistencies so\nanalysts, investors and others can more easily compare companies,\ncontracts and industries.\n\n**LDTI for long-duration contracts**\n\nThe Financial Accounting Standards Board long-duration targeted\nimprovements (LDTI) introduced changes to the U.S. GAAP accounting\nmodel to simplify and improve the financial reporting of long-duration\ncontracts, including providing financial statement users with more\ntimely and relevant information about those contracts.\n\n\nIt is crucial for insurers to redirect their focus toward developing advanced data\nmanagement and utilization capabilities that offer better insights and improved\nperformance. These investments serve as not only a foundation for regulatory\ncompliance but also a starting point for more comprehensive and proactive\ntransformation initiatives.\n\n\n-----\n\n**I N D U S T R Y S O L U T I O N S**\n\n## Get Started With Accelerators, Brickbuilders and Enablers\n\nInsurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n\n**Adoption challenges**\n\n\nNumerous challenges hinder organizations from developing and implementing the\nnecessary technical solutions to enhance their operational effectiveness, increase\nrevenue, and stay competitive. These challenges include:\n\n**•** Lack of technical skills (data scientists/data engineers): Companies often\nstruggle to find employees proficient in Python or Scala, or individuals who\npossess extensive experience in data science.\n\n\n\n**•** Business problems require in-depth data science and industry knowledge:\nBusinesses seek solutions tailored to address specific problems, rather than\ngeneric technical features.\n\n**•** Companies seek actionable insights: Organizations prefer readily applicable\npatterns that can be quickly implemented, rather than custom data science\nsolutions that come with potential costs and risks of implementation failure.\n\n\n**What are accelerators/enablers?**\n\n\n**Solution Accelerators**\n\nSave hours on discovery, design, development and\ntesting with Databricks Solution Accelerators. Our\npurpose-built guides, including fully functional\nnotebooks and best practices, expedite results for\nyour most common and high-impact use cases. 
With\nthese accelerators, you can go from idea to proof of\nconcept (PoC) in as little as two weeks.\n\n\n**Brickbuilders**\n\nBrickbuilder Solutions are data and AI solutions\ndesigned by leading consulting companies to\naddress industry-specific business requirements.\nBuilt on the Databricks Lakehouse Platform and\nbacked by the industry experience of these\nconsultancies, businesses can have confidence\nin solutions tailored to their specific use cases.\nBrickbuilder Solutions can be implemented at any\nstage of the customer journey.\n\n\n**Solution Enablers**\n\nSolution enablers consist of targeted collections\nof notebooks and materials, such as webinars and\nblog posts, designed to support larger solutions.\nThey aim to solve pain points or address specific\nlayers of business capabilities, such as resolving data\ningestion challenges.\n\n\n-----\n\n## Get Started With Industry Solutions\n\n\n**Claims transformation:**\n**automation and fraud prevention**\n\nInsurers are entering a new era of claims transformation, supported by evolving\ntechnological advancements and growing data availability. The end-to-end claims\nprocess, from extracting relevant information from documentation submitted\nwhen filing a claim to triaging and routing claims and the underwriting process,\nis ripe for digital transformation. By leveraging the Databricks Lakehouse,\norganizations can handle millions of data points coming in different formats and\ntime frames, from various sources, at an unprecedented volume. Every touchpoint\nin the claims journey, starting even before an incident occurs, will be supported by\na combination of technology and human intervention that seamlessly expedites\nthe process. Personalizing the claims experience by anticipating needs, providing\nreal-time status alerts, and reducing friction in the process increases customer\nloyalty and retention.\n\n\n**Customer/Partner Successes**\n\n**Accelerate underwriting through collaboration and efficient ML**\n\nA leading P&C insurer took full advantage of the MongoDB and Databricks\nintegration, leveraging both platforms to foster collaboration between their data\nand developer teams. The integration provides a more natural development\nexperience for Spark users and exposes all of Spark’s libraries. This allows\nMongoDB data to be materialized as DataFrames and data sets for analysis\nusing machine learning, graph, streaming and SQL APIs. The insurer also benefits\nfrom automatic schema inference. With this integration, the insurer was able to\ntrain and observe their ML models (MongoDB Atlas Charts) more efficiently and\nincorporate them into business applications.\n\nAs a result, crucial underwriting processes that previously took days are now executed\nin seconds. 
In addition to the time and cost savings, the company can provide a more\nimmediate response to customers within its digital experience platform.\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[F R A U D D E T E C T I O N](https://notebooks.databricks.com/notebooks/FSI/geospatial_analysis/index.html#geospatial_analysis_1-0.html)**\n\n**Claims processing is the process whereby an insurer receives,**\n\n\n**verifies and processes a claim report submitted by a policyholder.**\n\n**It accounts for** **[70% of a property insurer’s expenses](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)** **and is a**\n\n**criticial component of customer satisfaction with their carrier.”**\n\n\n**[C L A I M S A U T O M AT I O N E N A B L E R](https://www.databricks.com/blog/2023/02/01/design-patterns-batch-processing-financial-services.html)**\n\n\n[Laying the](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n[Foundation for](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n[Claims Automation](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n\n\n**[C A R C L A I M S I M A G E C L A S S I F I C AT I O N](https://github.com/databricks-industry-solutions/car-classification)**\n\n\n**Deloitte,** [”Preserving the human touch in insurance claims transformations”](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)\n\n**[S M A R T C L A I M S : C L A I M S A U T O M AT I O N](https://www.databricks.com/blog/2023/04/03/claims-automation-databricks-lakehouse.html)**\n\n\n-----\n\n**Risk management:**\n**dynamic pricing and underwriting**\n\nModernized approaches at insurance carriers require a full digital transformation,\nand one aspect of this transformation involves dynamic pricing and underwriting\nto reduce premiums. Insurance providers are now consuming data from the largest\nmobile telematics providers to obtain the most granular sensor and trip summaries\nfor users of online insurance applications. Not only is this data critical for pricing,\nbut it is also critical for underwriting scenarios to de-risk carriers. Dynamic pricing\nand underwriting automate routine tasks and provide teams with alternative\ndata sources to empower actuarial and underwriting professionals to become\n“exponential.” This allows teams to focus on key aspects of risk selection and\nanalysis that drive competitive advantage and market differentiation. By leveraging\npersonalized data points, insurers can deliver near real-time underwriting\ndecisions for life insurance applicants, reducing policy abandonment and costs.\n\n\n**Customer/Partner Successes**\n\n**Automated extraction of medical risk factors for life insurance underwriting**\n**(John Snow Labs)**\n\nLife insurance underwriting considers an applicant’s medical risk factors in\naddition to mortality risk characteristics. These risk factors are often found\nin free-text documents. New insurance-specific natural language processing\n(NLP) models can automatically extract relevant medical history and risk factors\nfrom such documents. Forward-thinking companies are embracing accelerated\nunderwriting, which utilizes new data along with algorithmic tools and modeling\ntechniques to quickly assess and group applicants without requiring bodily fluids,\nphysician’s notes, and so on. 
This joint Solution Accelerator from Databricks and\nJohn Snow Labs simplifies the implementation of this approach, creating a faster,\nmore consistent, and scalable underwriting experience.\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[R I S K M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/market-risk)**\n\n**Risk is highly influenced by behavior, and 80% of morbidity in**\n\n\n**healthcare risk is driven by factors such as smoking, drinking**\n\n**alcohol, physical activity and diet. In the case of driving,**\n\n**60% of fatal accidents are a result of behavior alone. If insurers**\n\n**can change customer behaviors and help them make better**\n\n**choices, then the risk curve shifts.”**\n\n\n**[A C T U A R I A L W O R K B E N C H](https://github.com/koernigo/databricksActuarialWorkbench)**\n\n**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n\n\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n\n\n**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n\n\n-----\n\n**Product distribution:**\n**segmentation and personalization**\n\nThe most forward-thinking and data-driven insurers are\nfocused on achieving personalization at scale. They are\nexploring new partnerships and business models to create\nintegrated, value-added experiences that prioritize the\noverall health and financial wellness of their customers,\nrather than just their insurance needs. These insurers\nare investing in new data sources, analytics platforms,\nand artificial intelligence (AI)-powered decision engines\nthat enable them to connect producers with like-minded\ncustomers or engage customers with enticing offers\nand actionable steps based on their previous choices.\nThe outcome is more efficient and effective service\nfrom producers, trusted and convenient interactions for\nconsumers, and increased customer engagement and\ngrowth for insurers in an increasingly digital-oriented world.\n\n\n**Customer/Partner Successes**\n\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\nusing a smart data model. 
Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n\nWith Persona 360, you can:\n\n**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n1,695+ attributes and segments\n\n**•** Seamlessly connect the workflows of data scientists (via Databricks) and marketing specialists (via\nPersona 360), making it easy for data experts to incorporate their findings and enabling nontechnical\nusers to comprehend and activate the data\n\n**•** Leverage tools that can increase engagement by 37% and conversion rates by 45% through\npersonalized campaigns\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[N E X T B E S T O F F E R](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n**Demand for hyper-personalized and real-time risk protection**\n\n\n**requires broad adoption of artificial** **intelligence (AI), machine**\n\n**learning and digital platforms.**\n\n**EY,** [”Nine customer types defining the next wave of insurance”](https://www.ey.com/en_us/insurance/nine-customer-types-defining-the-next-wave-of-insurance)\n\n\n**[C U S T O M E R L I F E T I M E VA L U E (C LT V )](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n\n**[C U S T O M E R S E G M E N TAT I O N](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\n\n[The Impact of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[Analytics and AI](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[on the Future of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[Insurance](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n\n\n**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n\n**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n\n\n-----\n\n**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n**by insurance provider type**\n\n\n\n\n\n\n\n\n\n|Product distribution Personalization Given the volume of data required, the complexity of operating AI from experiments (POCs) to enterprise scale data pipelines, combined with strict data and privacy regulations on the use of customer data on cloud infrastructure, the Lakehouse has quickly emerged as the strategic platform to accelerate digital transformation.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Next best offer Customers have different needs at each stage of the buyer journey. Choose the right recommender model for your scenario to find the next best action at any given point in the customer journey.|||||\n|Customer Analyzing customer lifetime value is critical to improving marketing decision-making, campaign ROI and lifetime value customer retention. Learn how to identify your most valuable customers with Databricks’ Customer Lifetime Value Solution Accelerator.|||||\n|Churn prediction Earning loyalty and getting the largest number of customers to stick around is something that is in your best interest as well as your customers’ best interest. 
Develop an understanding of how a customer lifetime should progress and examine where in that lifetime journey customers are likely to churn so you can effectively manage retention and reduce your churn rate.|||||\n|Customer Personalization is touted as the gold standard of customer engagement. Using sales data, campaigns segmentation and promotions systems, this solution helps you create advanced customer segments to drive better purchasing predictions based on behaviors.|||||\n|Reputation Harness the Databricks Lakehouse Platform to build a risk engine that can analyze customer feedback management securely and in realtime to power an early assessment of reputation risks.|||||\n\n\n-----\n\n|Anomaly detection and fraudulent claims Anomaly Anomaly detection is the technique of identifying rare events or observations which can raise suspicions detection by being statistically different from the rest of the observations.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Fraudulent A large-scale fraud prevention system is usually a complex ecosystem made of various controls (all with claims critical SLAs), a mix of traditional rules and AI and a patchwork of technologies between proprietary on- premises systems and open source cloud technologies.|||||\n\n\n\n\n\n\n|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Underwriting Machine learning provides a decision support system for underwriting processes to help you improve your automation underwriting outcomes.|||||\n|Actuarial You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine workbench Learning (ML) for underwriting, claims forecasting, etc.|||||\n\n\n-----\n\n|Claims transformation Anomaly detection Preempt fraud with rule-based patterns and select ML algorithms for reliable fraud detection. Use and claims fraud anomaly detection and fraud prediction to respond to bad actors rapidly.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Car claims image By applying transfer learning on pre-trained neural networks, Databricks helps insurance companies classification kickstart their AI/computer vision journeys toward claim assessment and damage estimation.|||||\n|Claims automation Insurers are entering a new era of claims transformation, supported by evolving technological advancement and growing data availability. You can simplify and scale your claims lifecycle with data and AI.|||||\n|Medical claims Using advanced natural language processing, you can extract text from medical records and enable automation.|||||\n|Guidewire claims Data ingestion enabler for distributed ledger technology that has predefined schemas and mapping to/ center data from Guidewire data format. integration|||||\n\n\n-----\n\n## Conclusion\n\nToday, data and AI are at the center of every innovation in the insurance industry. 
Databricks Lakehouse for\nInsurance empowers insurance providers to leverage the potential of data and analytics to address strategic\nchallenges, make informed decisions, mitigate risks, enhance customer experiences, and accelerate innovation.\n\n**Customers that innovate with Databricks Lakehouse for Insurance**\n\nSome of the top property and casualty, life and health insurance companies and reinsurers in the world turn\nto Databricks Lakehouse to harness the power of data and analytics to solve strategic challenges and make\nsmarter decisions that minimize risk, deliver superior customer experiences and fast-track innovation.\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000 organizations worldwide — including\n\nComcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is headquartered in San Francisco,\n\nwith offices around the globe. Founded by the original creators of Apache Spark ™ , Delta\n\nLake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest\n\nproblems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , LinkedIn and [Facebook](https://www.facebook.com/databricksinc) .\n\n#### Begin your journey with a free trial of Databricks Lakehouse for Insurance and start developing advanced data and AI applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n\n###### Contact us for a personalized demo at:\n dbricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "```\nTECHNICAL GUIDE\n\n```\n\n# Solving Common Data Challenges \n\n\n#### Startups and Digital Native Businesses\n\n\n-----\n\n### Table of Contents\n\n\n# 01\n```\nCHALLENGE:\n \u0003\n\n###### Creating a unified data architecture for data quality, governance and efficiency\n\n# 03\nCHALLENGE:\n \u0003\n\n###### Building effective machine learning operations\n\n```\n\n# 02\n```\nCHALLENGE:\n \u0003\n\n###### Building a data architecture to support scale and performance\n\n# 04\nSUMMARY:\n\n###### The Databricks Lakehouse Platform addresses these challenges\n\n```\n\n-----\n\n**I N T R O D U C T I O N**\n\n\nThis guide shares how the lakehouse architecture can increase\nproductivity and cost-efficiently support all your data, analytics\nand AI workloads, and flexibly scale with the pace of growth\nfor your company. Read the entire guide or dive straight into a\nspecific challenge.\n\nWith the advent of cloud infrastructure, a new generation of\nstartups has rapidly built and scaled their businesses. The use of\ncloud infrastructure, once seen as innovative, has now become\ntable stakes. The differentiator for the fastest-moving startups\nand digital natives now comes from the effective use of data\nat scale, primarily analytics and AI. Digital natives — defined\nas fast-moving, lean, and technically savvy, born-in-the-cloud\norganizations — are beginning to focus on new data-driven use\ncases such as real-time machine learning and personalized\ncustomer experiences.\n\nTo pursue these new data-intensive use cases and initiatives,\norganizations must look beyond the technologies that delivered\nthem to this point in time. 
Over time, these technologies, such as transactional databases, streaming/batch pipelines and first-generation analytics engines, have led to brittle systems that are not cost-efficient and require time-consuming administration and engineering toil. In addition to growing maintenance needs, data is often stored in disparate locations and formats, with little or no governance, making real-time use cases, analytics and AI difficult or impossible.

This guide examines some of the biggest data challenges and solutions for startups and for scaling digital native businesses that have reached the point where an end-to-end modern data platform is a smart investment. Some key considerations include:

**Consolidating on a unified data platform**
As mentioned above, siloed data storage and management add administrative and financial cost. You can benefit significantly when you unify your data in one location with a flexible architecture that scales with your needs and delivers performance for future success. For this, you will want an open platform that supports all your data, including batch and streaming workloads, data analytics and machine learning. With data unification, you create a more efficient, integrated approach to ingesting, cleaning and organizing your data. You also need automation to make data analysis easier for the nontechnical users in the company. But broader data access also means more focus on security, privacy, compliance and access control, which can create overhead for a growing company.

**Scaling up capacity and increasing performance and usability of the data solutions**
Data teams at growing digital native organizations find it time-intensive and costly to handle the growing volume and velocity of their data being ingested from multiple sources, across multiple clouds. You now need a unified and simplified platform that can instantly scale up capacity and deliver more computing power on demand to free up your data teams to produce outputs more quickly. This lowers the total cost for the overall infrastructure by eliminating redundant licensing, infrastructure and administration costs.

**Building effective machine learning operations**
For data teams beginning their machine learning journeys, training ML models can quickly become complex to manage. Many teams with disparate coding needs for the entire model lifecycle suffer inefficiencies from transferring data and code across many separate services. To build and manage effective ML operations, consider an end-to-end MLOps environment that brings all data together in one place and incorporates managed services for experiment tracking, model training, feature development and feature and model serving.

-----

# 01
```
CHALLENGE:

## Create a unified data architecture for data quality, governance and efficiency

```

-----

```
CHALLENGE 01

### Create a unified data architecture for data quality, governance and efficiency

```
As cloud-born companies grow, data volumes rapidly increase, leading to new challenges and use cases. Among the challenges:

Application stacks optimized for transaction use cases aren’t able to handle the volume, velocity and variety of data that modern data teams require. For example, this leads to query performance issues as data volume grows.

Data silos develop as each team within an organization chooses different ETL/ELT and storage solutions for their needs. 
As the\norganization grows and changes, these pipelines\nand storage solutions become brittle, hard to\nmaintain and nearly impossible to integrate.\n\n\nThese data silos lead to discoverability,\nintegration and access issues, which prevent\nteams from leveraging the full value of the\norganization’s available data.\n\nData governance is hard. Disparate ETL/ELT\nand storage solutions lead to governance,\ncompliance, auditability and access control\nchallenges, which expose organizations to\ntremendous risk.\n\n\nThe Databricks Lakehouse Platform provides\na unified set of tools for building, deploying,\nsharing and maintaining data solutions at scale.\nIt integrates with cloud storage and the security\nin your cloud account, manages and deploys\ncloud infrastructure on your behalf. Your data\npractitioners no longer need separate storage\nsystems for their data. And you don’t have to rely\non your cloud provider for security. The lakehouse\nhas its own robust security built into the platform.\n\n\nFor all the reasons above, the most\nconsistent advice from successful data\npractitioners is to create a “single source\nof truth” by unifying all data on a single\nplatform. With the Databricks Lakehouse\nPlatform, you can unify all your data on one\nplatform, reducing data infrastructure costs\nand compute. You don’t need excess data\ncopies and you can retire expensive\nlegacy infrastructure.\n```\n 01\n\n```\n\n-----\n\n```\nCUSTOMER STORY: GRAMMARLY\n\n### Helping 30 million people and 50,000 teams communicate more effectively\n\n```\n\nWhile its business is based on analytics, [Grammarly](http://www.grammarly.com)\n\nfor many years relied on a homegrown analytics\n\nplatform to drive its AI writing assistant to\n\nhelp users improve multiple aspects of written\n\ncommunications. As teams developed their own\n\nrequirements, data silos inevitably emerged as\n\ndifferent business areas implemented analytics\n\ntools individually.\n\n“Every team decided to solve their analytics\n\nneeds in the best way they saw fit,” said Chris\n\nLocklin, Engineering Manager, Data Platforms,\n\nat Grammarly. “That created challenges in\n\nconsistency and knowing which data set\n\nwas correct.”\n\nTo better scale and improve data storage and\n\nquery capabilities, Grammarly brought all its\n\nanalytical data into the Databricks Lakehouse\n\nPlatform and created a central hub for all data\n\nproducers and consumers across the company.\n\nGrammarly had several goals with the lakehouse,\n\nincluding better access control, security, ingestion\n\n\nflexibility, reducing costs and fueling collaboration. “Access control in a\n\ndistributed file system is difficult, and it only gets more complicated as\n\nyou ingest more data sources,” said Locklin. To manage access control,\n\nenable end-to-end observability and monitor data quality, Grammarly\n\nrelies on the data lineage capabilities within Unity Catalog. “Data lineage\n\nallows us to effectively monitor usage of our data and ensure it upholds\n\nthe standards we set as a data platform team,” said Locklin. “Lineage is\n\nthe last crucial piece for access control.”\n\nData analysts within Grammarly now have a consolidated interface for\n\nanalytics, which leads to a single source of truth and confidence in the\n\naccuracy and availability of all data managed by the data platform team.\n\nHaving a consistent data source across the company also resulted in\n\ngreater speed and efficiency and reduced costs. 
Data practitioners\n\nexperienced 110% faster querying at 10% of the cost to ingest compared\n\nto a data warehouse. Grammarly can now make its 5 billion daily events\n\navailable for analytics in under 15 minutes rather than 4 hours. Migrating\n\noff its rigid legacy infrastructure gave Grammarly the flexibility to do\n\nmore and the confidence that the platform will evolve with its needs.\n\nGrammarly is now able to sustain a flexible, scalable and highly secure\n\nanalytics platform that helps 30 million people and 50,000 teams\n\nworldwide write more effectively every day.\n\n[Read the full story here.](https://www.databricks.com/customers/grammarly)\n\n\n-----\n\n###### How to unify the data infrastructure with Databricks\n\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\nis composed of two primary parts:\n\n- The infrastructure to deploy, configure and\nmanage the platform and services\n\n\nYou can build a Databricks workspace by configuring\nsecure integrations between the Databricks platform\nand your cloud account, and then Databricks deploys\ntemporary Apache Spark™/Photon clusters using cloud\nresources in your account to process and store data\nin object storage and other integrated services you\ncontrol. Here are three steps to get started with the\nDatabricks Lakehouse Platform:\n\n**Understand the architecture**\nThe lakehouse provides a unified architecture,\nmeaning that all data is stored in the same\naccessible place. The diagram shows how data\ncomes in from sources like a customer relationship\nmanagement (CRM) system, an enterprise resource\nplanning (ERP) system, websites or unstructured\ncustomer emails.\n\n**Optimize the storage layer**\nAll data is stored in cloud storage while Databricks\nprovides tooling to assist with ingestion, such as\nAuto Loader, and we recommend [open-source](https://delta.io/)\n[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\nDelta optimized storage layer that provides the\nfoundation for storing data and tables in the\nDatabricks Lakehouse Platform. Having all your\ndata in the same optimized, open storage keeps\nall your use cases in the same place, thus enabling\ncollaboration and removing software tool overhead.\n\n\n\n- the customer-owned infrastructure managed in\ncollaboration by Databricks and the customer.\n\n\nThe lakehouse handles all varieties of data (structured, semi-structured, unstructured),\nas well as all velocities of data (streaming, batch or somewhere in the middle).\n\n[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n\n\n-----\n\nThe Databricks Lakehouse organizes data stored with Delta Lake in cloud object\nstorage with familiar concepts like database, tables and views. Delta Lake extends\nParquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\nscalable metadata handling. 
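To make that concrete, creating and querying a Delta table from a Databricks notebook takes only a few lines. This is a minimal sketch, not the guide's own example: the three-level table name (`main.sales.orders`) and the generated sample data are placeholders, and `spark` is the SparkSession that Databricks notebooks provide automatically.

```python
from pyspark.sql import functions as F

# Placeholder sample data; in practice this would come from CRM, ERP or other sources.
orders = spark.range(1000).withColumn("amount", F.rand() * 100)

# Write a managed Delta table; the transaction log gives it ACID guarantees.
orders.write.format("delta").mode("overwrite").saveAsTable("main.sales.orders")

# Query the table in batch...
spark.table("main.sales.orders").agg(F.sum("amount").alias("total")).show()

# ...and read the very same table incrementally as a stream, since Delta
# supports both batch and streaming access to a single copy of the data.
orders_stream = spark.readStream.table("main.sales.orders")
```
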
Delta Lake is fully compatible with Apache Spark APIs,\nand was developed for tight integration with Structured Streaming, allowing you to\neasily use a single copy of data for both batch and streaming operations to provide\nincremental processing at scale.This model combines many of the benefits of a data\nwarehouse with the scalability and flexibility of a data lake.\n\nTo learn more about the optimized storage layer that provides the foundation for\nstoring data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n\nThe first step in unifying your data architecture is setting up how data is to be\naccessed and used across the organization. We’ll discuss this as a series of steps:\n\n**1** Set up governance with Unity Catalog\n\n**2** Grant secure access to the data\n\n\n###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n\n[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\n**3** Capture audit logs\n\n**4** View data lineage\n\n**5** Set up data sharing\n\n\n-----\n\n**Configure unified governance**\nDatabricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\nmeans that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\nis secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\npersonas and automatically captures user-level audit logs that record access to your data.\n\nData stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\nlanguages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n\nTo set up Unity Catalog for your organization,\nyou do the following:\n\n\n**1** Configure an S3 bucket and IAM role that\nUnity Catalog can use to store and access\ndata in your AWS account.\n\n**2** Create a metastore for each region in\n\nwhich your organization operates, and\nattach workspaces to the metastore. 
Each\nworkspace will have the same view of the\ndata you manage in Unity Catalog.\n\n\n**3** If you have a new account, add users,\ngroups and service principals to your\nDatabricks account.\n\n**4** Next, create and grant access to\n\ncatalogs, schemas and tables.\n\n\nFor complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n\n\n-----\n\n###### How Unity Catalog works\n\n\nYou will notice that the hierarchy of primary data\nobjects in Unity Catalog flows from metastore to table:\n\n**Metastore** is the top-level container for metadata.\nEach metastore exposes a three-level namespace\n(catalog.schema.table) that organizes your data.\n\n\n**Metastore** **Catalog** **Schemas**\n\n\n**Views**\n\n**Managed**\n**Tables**\n\n\n**Catalog** is the first layer of the object hierarchy, used\nto organize your data assets.\n\n\n**Schemas** , also known as databases, are the second\nlayer of the object hierarchy and contain tables and\nviews.\n\n**Table** is the lowest level in the object hierarchy, and\ntables can be external (stored in external locations in\nyour cloud storage of choice) or managed (stored in a\nstorage container in your cloud storage that you create\n\nexpressly for Databricks). You can also create readonly **Views** from tables.\n\n\n**External**\n**tables**\n\nThe diagram below represents the file system\nhierarchy of a single storage bucket:\n\n\n-----\n\nUnity Catalog uses the identities in the Databricks\naccount to resolve users, service principals, and groups\nand to enforce permissions. To configure identities in\nthe account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\nservice principals, and groups when you create\n[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n\nUnity Catalog users, service principals, and groups\nmust also be added to workspaces to access Unity\nCatalog data in a notebook, a Databricks SQL query,\nData Explorer or a REST API command. The assignment\nof users, service principals, and groups to workspaces\nis called identity federation. All workspaces attached\nto a Unity Catalog metastore are enabled for identity\nfederation.\n\nSecurable objects in Unity Catalog are hierarchical,\nmeaning that granting a privilege on a catalog or schema\nautomatically grants the privilege to all current and\nfuture objects within the catalog or schema. For more\non granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\nA common scenario is to set up a schema per team\nwhere only that team has USE SCHEMA and CREATE on\nthe schema. This means that any tables produced by\nteam members can only be shared within the team.\nData Explorer uses the privileges configured by Unity\nCatalog administrators to ensure that users are only\nable to see catalogs, databases, tables and views that\nthey have permission to query.\n\n\n[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\nmany Unity Catalog features. 
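The schema-per-team scenario described above can be expressed with standard ANSI SQL grants. The following is a minimal sketch, assuming a Unity Catalog-enabled workspace; the catalog, schema and group names are placeholders you would replace with your own.

```python
# Sketch of the schema-per-team pattern; "main", "risk_team" and the group are placeholders.
spark.sql("CREATE CATALOG IF NOT EXISTS main")
spark.sql("CREATE SCHEMA IF NOT EXISTS main.risk_team")

# Let the team discover the catalog and work only inside its own schema.
spark.sql("GRANT USE CATALOG ON CATALOG main TO `risk-team`")
spark.sql("GRANT USE SCHEMA, CREATE TABLE ON SCHEMA main.risk_team TO `risk-team`")

# Privileges are inherited downward: granting SELECT at the schema level
# covers all current and future tables in that schema.
spark.sql("GRANT SELECT ON SCHEMA main.risk_team TO `risk-team`")
```

Data Explorer then shows each user only the objects these grants allow them to query.
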
Use Data Explorer to view\nschema details, preview sample data, and see table\ndetails and properties. Administrators can view and\nchange owners. Admins and data object owners can grant\nand revoke permissions through this interface.\n\n**Set up secure access**\nIn Unity Catalog, data is secure by default. Initially, users\nhave no access to data in a metastore. Access can\nbe granted by either a metastore admin, the owner of\nan object, or the owner of the catalog or schema that\ncontains the object. Securable objects in Unity Catalog\nare hierarchical and privileges are inherited downward.\n\nUnity Catalog’s security model is based on standard ANSI\nSQL and allows administrators to grant permissions in\ntheir existing data lake using familiar syntax, at the level of\ncatalogs, databases (schema), tables and views. Privileges\nand metastores are shared across workspaces, allowing\nadministrators to set secure permissions once against\n\ngroups synced from identity providers and know that\nend users only have access to the proper data in any\nDatabricks workspace they enter.\n\n\n-----\n\n```\nCUSTOMER STORY: BUTCHERBOX\n\n### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n\n```\n\nAs a young e-commerce company,\n\n[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n\ncustomers’ needs change, which means it is\n\nconstantly considering behavioral patterns,\n\ndistribution center efficiency, a growing list of\n\nmarketing and communication channels, and\n\norder processing systems.\n\nThe meat and seafood subscription company\n\ncollects data on hundreds of thousands\n\nof subscribers. It deployed the Databricks\n\nLakehouse Platform to gain visibility across\n\nits diverse range of data systems and enable\n\nits analytics team to securely view and\n\nexport data in the formats needed.\n\nWith so much data feeding in from different\n\nsources — from email systems to its website\n\n— the data team at ButcherBox quickly\n\ndiscovered that data silos were a significant\n\n\n“We knew we needed to migrate from our legacy data warehouse\n\nenvironment to a data analytics platform that would unify our\n\ndata and make it easily accessible for quick analysis to improve\n\nsupply chain operations, forecast demand and, most importantly,\n\nkeep up with our growing customer base,” explained Jake Stone,\n\nSenior Manager, Business Analytics, at ButcherBox.\n\nThe platform allows analysts to share builds and iterate on a\n\nproject without getting into the code. Querying a table of 18\n\nbillion rows would have been problematic with a traditional\n\nplatform. With Databricks, ButcherBox can do it in three minutes.\n\n“Delta Lake provides us with a single source of truth for all of\n\nour data,” said Stone. 
“Now our data engineers are able to build\n\nreliable data pipelines that thread the needle on key topics such\n\nas inventory management, allowing us to identify in near real-\n\ntime what our trends are so we can figure out how to effectively\n\nmove inventory.”\n\n[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\nproblem because they blocked complete\n\nvisibility into critical insights needed to make\n\nstrategic and marketing decisions.\n\n\n-----\n\n**Set up secure data sharing**\nDatabricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\nto share data with other entities regardless of their\ncomputing platforms. Delta Sharing is integrated with\nUnity Catalog. Your data must be registered with Unity\nCatalog to manage, govern, audit and track usage of the\nshared data on the Lakehouse Platform. The primary\nconcepts of Delta Sharing are shares (read-only\ncollections of tables and table partitions to be shared)\nand recipients (objects that associate an organization\nwith a credential or secure sharing identifier).\n\nAs a data provider, you generate a token and share\nit securely with the recipient. They use the token to\nauthenticate and get read access to the tables you’ve\nincluded in the shares you’ve given them access\nto. Recipients access the shared data in read-only\nformat. Whenever the data provider updates data\ntables in their own Databricks account, the updates\nappear in near real-time in the recipient’s system.\n\n\n**Capture audit logs**\nUnity Catalog captures an audit log of actions\nperformed against the metastore. To access audit\nlogs for Unity Catalog events, you must enable and\nconfigure audit logs for your account. Audit logs for\neach workspace and account-level activities are\ndelivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n\n**View data lineage**\nYou can use Unity Catalog to capture runtime data\nlineage across queries in any language executed on\na Databricks cluster or SQL warehouse. Lineage can\nbe visualized in Data Explorer in near real-time and\nretrieved with the Databricks REST API. Lineage is\naggregated across all workspaces attached to Unity\nCatalog and captured down to the column level, and\nincludes notebooks, workflows and dashboards related\nto the query. To understand the requirements and how\nto capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n\n\nUnity Catalog Metastore\n\n\nCatalog\n\n\nData providers can use Databricks audit logging to\nmonitor the creation and modification of shares,\nand recipients can monitor recipient activity on\nshares. 
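As a concrete illustration of the shares-and-recipients model described under "Set up secure data sharing," the provider-side setup can be sketched in SQL from a notebook. This is a minimal sketch under assumed names: the share, table and recipient identifiers below are placeholders, and the token-based activation link is exchanged out of band.

```python
# Provider side: create a share, add a table to it, and register a recipient.
# "insurance_share", "main.claims.policies" and "partner_org" are placeholder names.
spark.sql("CREATE SHARE IF NOT EXISTS insurance_share")
spark.sql("ALTER SHARE insurance_share ADD TABLE main.claims.policies")

spark.sql("CREATE RECIPIENT IF NOT EXISTS partner_org")
spark.sql("GRANT SELECT ON SHARE insurance_share TO RECIPIENT partner_org")

# Review what has been shared and who can read it.
spark.sql("SHOW ALL IN SHARE insurance_share").show()
spark.sql("SHOW GRANTS ON SHARE insurance_share").show()
```

Creation and modification of shares and recipients also show up in the audit log, which is how providers monitor sharing activity.
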
Data recipients who use shared data in a\nDatabricks account can use Databricks audit logging\nto understand who is accessing which data.\n\n\n-----\n\n###### Resources:\n\n- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n\n- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n\n- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n\n- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n\n- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n\n\n###### Key Takeaways\n\n- With the Databricks Lakehouse Platform, you can\nunify and simplify all your data on one platform\nto better scale and improve data storage and\nquery capabilities\n\n- The lakehouse helps reduce data infrastructure\nand compute costs. You don’t need excess\ndata copies and can retire expensive legacy\ninfrastructure.\n\n\nLeverage Delta Lake as the open format\nstorage layer to deliver reliability, security and\nperformance on your data lake — for both\nstreaming and batch operations — replacing\ndata silos with a single home for structured,\nsemi-structured and unstructured data\n\nWith Unity Catalog you can centralize\ngovernance for all data and AI assets including\nfiles, tables, machine learning models and\ndashboards in your lakehouse on any cloud\n\nThe Databricks Lakehouse Platform is open\nsource with multicloud flexibility so that you can\nuse your data however and wherever you want —\nno vendor lock-in\n\n\n-----\n\n# 02\n```\nCHALLENGE: \u0003\n\n## Build your data architecture to support scale and performance\n\n```\n\n-----\n\n```\nCHALLENGE 02\n\n### Build your data architecture to support scale and performance\n\n```\nAs modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\nthe increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\nsuddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\ncost-effective. 
The relational databases and traditional data warehouses that met the needs of the businesses once\nupon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n\nHere are some common challenges around managing data and performance at scale:\n\n\n**Volume and velocity** — Exponentially\nincreasing data sources, and the speed at\nwhich they capture and create data.\n\n**Latency requirements** — The demands of\ndownstream applications and users have\nevolved (people want data and the results\nfrom the data faster).\n\n\n**Governance** — Cataloging, auditing, securing and\nreporting on data is burdensome at scale when\nusing old systems not built with data access\ncontrols and compliance in mind.\n\n**Multicloud** is really hard.\n\n\n**Data storage** — Storing data in the wrong\nformat is slow to access, query and is\nexpensive at scale.\n\n\n**Data format** — Supporting structured, semistructured and unstructured data formats\nis now a requirement. Most data storage\nsolutions are designed to handle only one type\nof data, requiring multiple products\nto be stitched together.\n\n```\n02\n\n```\n\n-----\n\n###### Lakehouse solves scale and performance challenges\n\n\nThe solution for growing digital companies is a unified\nand simplified platform that can instantly scale up\ncapacity to deliver more computing power on demand,\nfreeing up teams to go after the much-needed data\nand produce outputs more quickly. With a lakehouse,\nthey can replace their data silos with a single home for\ntheir structured, semi-structured and unstructured\ndata. Users and applications throughout the enterprise\nenvironment can connect to the same single copy of\nthe data to drive diverse workloads.\n\nThe lakehouse architecture is cost-efficient for\nscaling, lowering the total cost of ownership for the\noverall infrastructure by consolidating all data estate\nand use cases onto a single platform and eliminating\nredundant licensing, infrastructure and administration\ncosts. Unlike other warehouse options that can only\nscale horizontally, the Databricks Lakehouse can scale\nhorizontally and vertically based on workload demands.\n\nWith the Databricks Lakehouse, you can optimize the\ncompute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\nresearch by the Barcelona Supercomputing Center.\nAnd your data teams are more productive by focusing\non more strategic initiatives versus managing multiple\ndata solutions.\n\n```\nCUSTOMER STORY: RIVIAN\n\n### Driving into the future of electric transportation\n\n```\n```\nCUSTOMER STORY: RIVIAN\n\n```\n\nWith more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n\nday, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n\nlegacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n\ndecreased output, prevented collaboration and increased operational costs. 
Rivian chose to modernize its data\n\ninfrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n\ndownstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n\nactionable insights for different use cases, from predictive maintenance to smarter product development.\n\n“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n\nperformant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n\nWassym Bensaid, Vice President of Software Development at Rivian.\n\nFor instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n\naccelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n\nroll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n\nconnected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n\nsmart features and the control that drivers have over them. By leveraging the Databricks Lakehouse Platform, Rivian\n\nhas seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n\n[Read the full story here.](https://www.databricks.com/customers/rivian)\n\n\n-----\n\n###### How to ensure scalability and performance with Databricks\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\nscalability and performance for your data architecture\nbased on the following features and capabilities:\n\n- A simplified and cost-efficient architecture that\nincreases productivity\n\n- A platform that ensures reliable, high performing\nETL workloads — for streaming and batch data\n— while Databricks automatically manages your\ninfrastructure\n\n- The ability to ingest, transform and query all your\ndata in one place, and scale on demand with\nserverless compute\n\n- Enables real-time data access for all data,\nanalytics and AI use cases\n\n\n-----\n\nThe following section will provide a short series of\nsteps for understanding the key components of the\nDatabricks Lakehouse Platform.\n\n\n**Step 2**\n**Understand the common Delta Lake operations**\nThe Databricks Lakehouse Platform simplifies the\nentire data lifecycle, from data ingestion to monitoring\nand governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\nopen-source storage system based on the Delta\nformat providing reliability through ACID transactions\nand scalable metadata handling. Large quantities of\nraw files in blob storage can be converted to Delta to\norganize and store the data cheaply. This allows for\nflexibility of data movement while being performant\nand less expensive.\n\n\n**Step 1**\n**Get a trial Databricks account**\nStart your 14-day free trial with Databricks on\nAWS in a few easy steps.\n[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . 
During the 14day free trial, all Databricks usage is free, but Databricks\nuses compute and S3 storage resources in your cloud\nprovider account.\n\n\nand writing data can occur simultaneously without risk\nof many queries resulting in performance degradation\nor deadlock for business-critical workloads.\n\nThis means that users and applications throughout\nthe enterprise environment can connect to the same\nsingle copy of the data to drive diverse workloads, with\nall viewers guaranteed to receive the most current\nversion of the data at the time their query executes.\nWith performance features like indexing, Delta Lake\ncustomers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n[up to 48x faster.](https://www.databricks.com/customers/columbia)\n\n\n[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\nand learn how to create, manage and query tables.\nWith support for ACID transactions and schema\nenforcement, Delta Lake provides the reliability that\ntraditional data lakes lack. This enables you to scale\nreliable data insights throughout the organization and\nrun analytics and other data projects directly on your\ndata lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n\nDelta Lake transactions use log files stored alongside\ndata files to provide ACID guarantees at a table level.\nBecause the data and log files backing Delta Lake\ntables live together in cloud object storage, reading\n\n\n-----\n\nAll data in Delta Lake is stored in open Apache Parquet\nformat, allowing data to be read by any compatible\nreader. APIs are open and compatible with Apache\nSpark, so you have access to a vast open-source\necosystem to avoid data lock-in from proprietary\nformats and conversions, which have embedded and\nadded costs.\n\n###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n\n — Steve Pulec, Chief Technology Officer, YipitData\n\n[Learn more](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n**Step 3**\n**Ingest data efficiently at scale**\nWith a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\nfrom hundreds of data sources for analytics, AI and\nstreaming applications into one place.\n\nDatabricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\ndata ingestion. To ingest any file that can land in a data\nlake, Auto Loader incrementally and automatically\nprocesses new data files as they arrive in cloud storage\nin scheduled or continuous jobs. Auto Loader scales to\nsupport near real-time ingestion of millions of files\nper hour.\n\nFor pushing data in Delta Lake, the SQL command\n[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\ninto Delta Lake. COPY INTO is best used when the input\ndirectory contains thousands of files or fewer, and the\nuser prefers SQL. COPY INTO can be used over JDBC\nto push data into Delta Lake at your convenience.\n\n\n**Step 4**\n**Leverage production-ready tools**\n**to automate ETL pipelines**\nOnce the raw data is ingested, Databricks provides\na suite of production-ready tools that allow data\nprofessionals to quickly develop and deploy extract,\n\ntransform and load (ETL) pipelines. 
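Before moving on, the ingestion pattern from Step 3 can be made concrete with a short sketch. The following is a minimal, hypothetical example (not taken from the guide) of Auto Loader incrementally loading raw JSON files into a Delta table; the bucket paths and table name are placeholders.

```python
# A minimal Auto Loader sketch (not from the guide): incrementally ingest raw JSON
# files into a Delta table. The paths and table name are hypothetical placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # provided automatically in Databricks notebooks

raw_path = "s3://example-bucket/raw/events/"
checkpoint_path = "s3://example-bucket/_checkpoints/events/"

(
    spark.readStream.format("cloudFiles")                   # Auto Loader source
    .option("cloudFiles.format", "json")                    # format of the arriving files
    .option("cloudFiles.schemaLocation", checkpoint_path)   # where the inferred schema is tracked
    .load(raw_path)
    .writeStream
    .option("checkpointLocation", checkpoint_path)
    .trigger(availableNow=True)                             # process new files, then stop (scheduled-job style)
    .toTable("bronze_events")                               # writes out as a Delta table
)
```

For the batch-oriented COPY INTO path described above, an equivalent statement can be issued with `spark.sql` (or in a SQL notebook cell) against the same target table.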
Databricks SQL\nallows analysts to run SQL queries against the same\ntables used in production ETL workloads, allowing for\nreal-time business intelligence at scale.\n\nWith your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n[your first extract, transform and load (ETL) pipelines](https://docs.databricks.com/getting-started/etl-quick-start.html)\nfor data orchestration and learn how easy it is to create\na cluster, create a Databricks notebook, configure\n[Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for ingestion into [Delta Lake](https://docs.databricks.com/delta/index.html) , process and\ninteract with the data, and schedule a job.\n\n\nDatabricks supports workloads in SQL, Python, Scala\nand R, allowing users with diverse skill sets and\ntechnical backgrounds to leverage their knowledge\nto derive analytic insights. You can use all languages\nsupported by Databricks to define production jobs, and\nnotebooks can leverage a combination of languages.\n\nThis means that you can promote queries written by\nSQL analysts for last-mile ETL into production data\nengineering code with almost no effort. Queries and\nworkloads defined by personas across the organization\nleverage the same data sets, so there’s no need to\nreconcile field names or make sure dashboards are up\nto date before sharing code and results with\nother teams.\n\n\n-----\n\nWith [Delta Live Tables](https://www.databricks.com/product/delta-live-tables) (DLT), data professionals have\na framework that uses a simple declarative approach\nto build ETL and ML pipelines on batch or streaming\ndata while automating operational complexities such\nas infrastructure management, task orchestration,\nerror handling and recovery, retries, and performance\noptimization.\n\nDelta Live Tables extends functionality in Apache Spark\nStructured Streaming and allows you to write just a\nfew lines of declarative Python or SQL to deploy a\nproduction-quality data pipeline with:\n\n- [Autoscaling compute infrastructure](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#auto-scaling) for cost savings\n\n- Data quality checks with [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html)\n\n- Automatic [schema evolution](https://docs.databricks.com/ingestion/auto-loader/schema.html) handling\n\n- Monitoring via metrics in the [event log](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-event-log.html)\n\nWith DLT, engineers can also treat their data as code\nand apply software engineering best practices like\ntesting, monitoring and documentation to deploy\nreliable pipelines at scale. You can easily define end-toend data pipelines in SQL or Python and automatically\nmaintain all data dependencies across the pipeline and\nreuse ETL pipelines with environment-independent\ndata management.\n\n```\nCUSTOMER STORY: ABNORMAL SECURITY\n\n### Stopping sophisticated ransomware in its tracks\n\n```\n```\nCUSTOMER STORY: ABNORMAL SECURITY\n\n```\n\nThe increase in email phishing and ransomware attacks requires the type of protection that can scale and evolve\n\nto meet the challenges of modern cyberattacks. 
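As an illustration of the declarative Delta Live Tables approach described above, here is a minimal sketch of a two-table pipeline with one expectation; the landing path, table names and the `amount` column are hypothetical and not part of the guide.

```python
# A minimal Delta Live Tables sketch: a bronze ingestion table and a cleaned silver table
# with one data quality expectation. The landing path, table names and `amount` column
# are hypothetical placeholders.
import dlt
from pyspark.sql import functions as F


@dlt.table(comment="Raw orders ingested incrementally with Auto Loader")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")        # `spark` is provided by the DLT runtime
        .option("cloudFiles.format", "json")
        .load("/Volumes/example/raw/orders")         # hypothetical landing location
    )


@dlt.table(comment="Orders with basic quality checks applied")
@dlt.expect_or_drop("valid_amount", "amount > 0")    # rows failing the expectation are dropped
def orders_silver():
    return dlt.read_stream("orders_bronze").withColumn("processed_at", F.current_timestamp())
```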
[Abnormal Security](https://abnormalsecurity.com/) , a cloud-native email security provider, knew\n\nthat scalability would become a major focus to stay ahead of attack strategies with frequent product updates.\n\nAbnormal also required a data analytics infrastructure robust enough to meet the scale requirements for its data\n\npipelines and constantly refined ML models.\n\n“We were spending too much time managing our Spark infrastructure,” said Carlos Gasperi, Software Engineer at\n\nAbnormal Security. “What we needed to be doing with that time was building the pipelines that would make the\n\nproduct better.”\n\nThe company implemented the Databricks Lakehouse Platform, which simplified its data architecture and\n\nmaximized the performance of data pipelines and analytics. Data practitioners are now able to ingest data\n\ndirectly from S3 and query it in near real-time with the help of Delta Lake, an open-format storage layer that\n\ndelivers reliability, security and performance on the data lake for both streaming and batch operations. With\n\nDatabricks SQL, data scientists are then able to create visualizations using rich dashboards to drive product\n\ndecisions and improve detection efficacy.\n\nDatabricks also provided the collaborative environment that Abnormal’s data teams needed to increase their\n\nproductivity and work in the same space without constantly competing for compute resources.\n\nWith Databricks, Abnormal has seen a 20% reduction in successful email attacks, a 40% reduction in\n\ninfrastructure costs and a 30% increase in productivity. [Read the full story here.](https://www.databricks.com/customers/abnormal)\n\n\n-----\n\nDelta Live Tables Enhanced Autoscaling is designed to handle streaming workloads\nthat trigger intermittently and are unpredictable. It optimizes cluster utilization\nby only scaling up to the necessary number of nodes while maintaining end-to-end SLAs, and gracefully shuts down nodes when utilization is low to avoid\nunnecessary idle node capacity.\n\n\nDelta Live Tables helps prevent bad data from flowing into tables through validation,\nintegrity checks and predefined error policies. In addition, you can monitor data\nquality trends over time to get insight into how your data is evolving and where\nchanges may be necessary.\n\n\n-----\n\n**Step 5**\n**Use Databricks SQL for serverless compute**\n[Databricks SQL (DB SQL)](https://www.databricks.com/product/databricks-sql) is a serverless data\nwarehouse on the Lakehouse Platform for running your\nSQL and BI applications at scale with up to 12x better\nprice/performance. It’s imperative for younger, growing\ncompanies to reduce resource contention, and one way\nto accomplish that is with serverless compute. Running\nserverless removes the need to manage, configure or\nscale cloud infrastructure on the lakehouse, freeing up\nyour data team for what they do best.\n\n\nSee for yourself in this tutorial on [how to run and visualize](https://docs.databricks.com/sql/get-started/user-quickstart.html)\n[a query in Databricks SQL](https://docs.databricks.com/sql/get-started/user-quickstart.html) and create dashboards on data\nstored in your data lake.\n\nThe Databricks SQL REST API supports services to\nmanage queries and dashboards, query history and SQL\nwarehouses.\n\n\nDatabricks SQL warehouses provide instant, elastic\nSQL compute — decoupled from storage — and will\nautomatically scale to provide unlimited concurrency\nwithout disruption, for high concurrency use cases. 
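The guide mentions the Databricks SQL REST API only in passing. As one hedged illustration (using the SQL Statement Execution endpoint rather than the query and dashboard management endpoints it lists), the sketch below runs a single statement against a SQL warehouse; the workspace URL, token, warehouse ID and query are hypothetical placeholders.

```python
# A minimal sketch: run one query on a Databricks SQL warehouse over REST.
# The host, token, warehouse ID and query are hypothetical placeholders.
import requests

host = "https://example-workspace.cloud.databricks.com"
token = "dapi-example-token"  # personal access token (hypothetical)

response = requests.post(
    f"{host}/api/2.0/sql/statements",
    headers={"Authorization": f"Bearer {token}"},
    json={
        "warehouse_id": "1234567890abcdef",              # hypothetical SQL warehouse ID
        "statement": "SELECT current_date() AS today",   # example statement
        "wait_timeout": "30s",                           # wait synchronously up to 30 seconds
    },
)
response.raise_for_status()
print(response.json().get("result"))
```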
DB\nSQL has data governance and security built in. Handle\nhigh concurrency with fully managed load balancing\nand scaling of compute resources.\n\n\n-----\n\n**Faster queries with Photon**\n[Photon](https://www.databricks.com/product/photon) is a new vectorized query engine designed\nto deliver dramatic infrastructure cost savings and\naccelerate all data and analytics workloads: data\ningestion, ETL, streaming, interactive queries, data\nscience and machine learning.\n\nPhoton is used by default in Databricks SQL. To\nenable Photon acceleration, select the **Use Photon**\n**Acceleration** checkbox when you create the cluster.\nIf you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\nset runtime_engine to PHOTON.\n\nPhoton supports a number of instance types on\nthe driver and worker nodes. Photon instance types\nconsume DBUs at a different rate than the same\ninstance type running the non-Photon runtime. For\nmore information about Photon instances and DBU\nconsumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)\n\nPhoton will seamlessly coordinate work and resources\nand transparently accelerate portions of your SQL and\nSpark queries. No tuning or user intervention required.\nPhoton is compatible with Apache Spark APIs, so\ngetting started is as easy as turning it on — no code\nchange and no lock- in. Written entirely in C++, Photon\nprovides an additional [2x speedup over Apache Spark](https://www.databricks.com/product/photon)\nper the TPC-DS 1TB benchmark, and customers have\nobserved 3x–8x speedups on average.\n\n\nWith Photon, typical customers are seeing up to [80% TCO savings](https://www.databricks.com/blog/2022/08/03/announcing-photon-engine-general-availability-on-the-databricks-lakehouse-platform.html#:~:text=Up%20to%2080%25%20TCO%20cost%20savings%20%2830%25%20on,Photon%203-8x%20faster%20queries%20on%20interactive%20SQL%20workloads) over traditional\nDatabricks Runtime (Apache Spark) and up to 85% reduction in VM compute hours.\n\nLearn how to connect BI tools to Databricks SQL\ncompute resources with the following user guides:\n\n\n[Queries](https://docs.databricks.com/sql/user/queries/index.html)\n\n[Visualizations](https://docs.databricks.com/sql/user/visualizations/index.html)\n\n\n[Favorites and tags](https://docs.databricks.com/sql/user/favorites-tags.html)\n\n[Workspace browser](https://docs.databricks.com/sql/user/workspace-browser/index.html)\n\n\n[Dashboards](https://docs.databricks.com/sql/user/dashboards/index.html)\n\n[Alerts](https://docs.databricks.com/sql/user/alerts/index.html)\n\n\n-----\n\n**Step 6**\n**Orchestrate workflows**\nDatabricks provides a comprehensive suite of tools and integrations to support your\ndata processing workflows.\n\nDatabricks [Workflows](https://www.databricks.com/product/workflows) removes operational overhead by offering fully managed\norchestration service for all your teams, so you can focus on your workflows, not on\nmanaging your infrastructure. Orchestrate diverse workloads for the full lifecycle\nincluding Delta Live Tables, [Jobs](https://docs.databricks.com/workflows/index.html) for SQL, [Spark](https://www.databricks.com/product/spark) , notebooks, dbt, ML models and more.\n\nHere’s a tutorial on how to [create your first workflow with a Databricks job](https://docs.databricks.com/workflows/jobs/jobs-quickstart.html) . 
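To make the Photon note above concrete, here is a hedged sketch of creating a Photon-enabled cluster through the Clusters API, as the text suggests, by setting runtime_engine to PHOTON; the workspace URL, token, runtime version and node type are hypothetical placeholders.

```python
# A minimal sketch: create a Photon-enabled cluster through the Clusters API by setting
# runtime_engine to PHOTON. The host, token, runtime version and node type are hypothetical.
import requests

host = "https://example-workspace.cloud.databricks.com"
token = "dapi-example-token"

cluster_spec = {
    "cluster_name": "photon-demo",
    "spark_version": "13.3.x-scala2.12",   # choose a runtime available in your workspace
    "node_type_id": "i3.xlarge",           # choose a Photon-supported instance type
    "num_workers": 2,
    "runtime_engine": "PHOTON",            # enables Photon acceleration
}

response = requests.post(
    f"{host}/api/2.0/clusters/create",
    headers={"Authorization": f"Bearer {token}"},
    json=cluster_spec,
)
response.raise_for_status()
print(response.json()["cluster_id"])
```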
You will\nlearn how to create notebooks, create and run a job, view the run details, and run jobs\nwith different parameters.\n\n\n-----\n\n**Step 7**\n**Run an end-to-end analytics pipeline**\nThis is where you can see how everything works together to run efficiently at scale. First\ntake the quickstart: [Running end-to-end lakehouse analytics pipelines](https://docs.databricks.com/getting-started/lakehouse-e2e.html) , where you\nwill write to and read data from an external location managed by Unity Catalog and\nconfigure Auto Loader to ingest data to Unity Catalog.\n\n###### Resources:\n\n- [Databricks Lakehouse free trial](https://www.databricks.com/try-databricks?itm_data=DataLakehouse-HeroCTA-Trial#account)\n\n- [The Lakehouse for companies born in the cloud](https://www.databricks.com/solutions/audience/digital-native)\n\n- [How DuPont achieved 11x latency reduction and 4x cost reduction with Photon](https://www.databricks.com/blog/2022/10/04/how-dupont-achieved-11x-latency-reduction-and-4x-cost-reduction-photon.html)\n\n- [Apache Spark on Databricks](https://docs.databricks.com/spark/index.html)\n\n- [Discover Lakehouse solutions](https://www.databricks.com/solutions)\n\n- [Databricks documentation](https://docs.databricks.com/)\n\n\n###### “Databricks Workflows allows our analysts to easily create, run, monitor and repair data pipelines without managing any infrastructure. This enables them to have full autonomy in designing and improving ETL processes that produce must-have insights for our clients. We are excited to move our Airflow pipelines over to Databricks Workflows.”\n —Anup Segu, Senior Software Engineer, YipitData\n\n[Learn more.](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n# 03\n```\nCHALLENGE: \u0003\n\n## Building effective machine-learning operations\n\n```\n\n-----\n\n```\nCHALLENGE 03\n\n### Building effective machine-learning operations\n\n```\nGrowing startups and digital native companies face several challenges when they\nstart building, maintaining and scaling machine learning operations (MLOps) for their\ndata science teams.\n\n\nMLOps is different from DevOps. DevOps practices\nand tooling alone are insufficient because ML\napplications rely on an assortment of artifacts (e.g.,\nmodels, data, code) that can each require different\nmethods of experiment tracking, model training,\nfeature development, governance, feature and\nmodel serving.\n\nFor data teams beginning their machine learning\njourneys, the challenge of training models can\nbe labor-intensive and not cost-effective because\nthe data has to be converted into features and\n\ntrained on a separate machine learning platform\n\n\nData teams often perform development in\ndisjointed, siloed stacks spanning DataOps,\nModelOps and DevOps\n\nDevelopment and training environment\ndisconnect. Moving code and data between\npersonal development environments and\nmachine learning platforms for model training\nat scale is error prone and cumbersome. The\n“it worked on my machine” problem.\n\nGathering high-quality data. Data that is siloed\nacross the organization is hard to discover,\ncollect, clean and use. This leads to stale data\nand delays in development of models.\n\n\nSee **Create a unified data architecture.**\n```\n 03\n\n```\n\n-----\n\n###### Siloed stacks spanning DataOps, ModelOps and DevOps\n\nWhen data engineers help ingest, refine and prep\ndata, they do so on their own stack. 
This data has\nto be converted into features and then trained on\na separate machine learning platform. This cross-\nplatform handoff often results in data staleness,\ndifficulty in maintaining versions, and eventually,\npoorly performing models. Even after you have\ntrained your model, you have to deal with yet another\ntech stack for model deployment. It’s challenging\nto serve features in real time and difficult to trace\nproblems in production back to the data.\n\nThe downstream business impact is massive —\nlonger and more expensive projects, and lower\nmodel accuracy in production leading to declining\nbusiness metrics.\n\nIf you are looking at launching or scaling your\nMLOps, you should probably focus on an incremental\nstrategy. At Databricks, we see firsthand how\ncustomers develop their MLOps approaches across\na huge variety of teams and businesses. [Check out](https://www.youtube.com/watch?v=JApPzAnbfPI)\n[this Data +AI Summit session](https://www.youtube.com/watch?v=JApPzAnbfPI) to learn more about\nbuilding robust MLOps practices.\n\n\n###### Databricks solution:\n\nDatabricks Machine Learning is an integrated\nend-to-end machine learning environment\nincorporating managed services for experiment\ntracking, model training, feature development and\nmanagement, and model serving. The capabilities\nof Databricks map directly to the steps of model\ndevelopment and deployment. With Databricks\nMachine Learning, you can:\n\n\nTrain models either manually or with AutoML\n\nTrack training parameters and models using\nexperiments with MLflow tracking\n\nCreate feature tables and access them for model\ntraining and inference\n\nShare, manage and serve models using MLflow\nModel Registry\n\nDeploy models for Serverless Real-time Inference\n\n\n-----\n\n###### Use MLOps on the Databricks Lakehouse Platform\n\nTo gain efficiencies and reduce costs, many smaller\ndigital companies are employing machine learning\noperations. MLOps is a set of processes and\nautomation for managing models, data and code, and\nunique library dependencies to improve performance\nstability and long-term efficiency in ML systems.\n\nTo describe it simply, MLOps = ModelOps + DataOps +\nDevOps. The aim of MLOps is to improve the long-term\nperformance, stability and success rate of ML systems\nwhile maximizing the efficiency of the teams who\nbuild them.\n\n\nNot only does MLOps improve organizational efficiency,\nit also allows the models to iterate faster and react\nto real-life changes in the data. This ability separates\ncompanies that can grow to meet their customer’s\nchallenges in a reactive manner versus those that will\nspend significant time on data updates/processes and\nmiss the opportunity to do something with\ntheir models.\n\nThe absence of MLOps is typically marked by an\noverabundance of manual processes which are slower\n\n\nand more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck,\ncapping the ability for a data team to take on new projects. The process is complex. In larger organizations,\nseveral specialists and stakeholders can be involved in one ML project. But data practitioners at smaller digital\nnatives and high-growth startups may be forced to wear several hats.\n\n\n-----\n\nAnd once an ML project goes into production, the\nMLOps continues, since the models, data and code\nchange over time due to regulatory and business\nrequirements. But the ML system must be resilient and\nflexible. 
Addressing these challenges with a defined\nMLOps strategy can dramatically reduce the iteration\ncycle of delivering models to production.\n\n\n-----\n\n###### Steps in machine learning model development and deployment:\n\n\n**Step 1**\n**Data preparation**\nManually preparing and labeling data is a thankless,\ntime-consuming job. With Databricks, teams can\nlabel data with human effort, machine learning\nmodels in Databricks, or a combination of both.\nTeams can also employ a [model-assisted labeling](https://labelbox.com/product/automation )\nworkflow that allows humans to easily inspect and\ncorrect a model’s predicted labels. This process can\ndrastically reduce the amount of unstructured data\nyou need to achieve strong model performance.\n\nThe [Databricks Runtime for Machine Learning](https://docs.databricks.com/runtime/mlruntime.html) is a\nready-to-go environment with many external\nlibraries, including TensorFlow, PyTorch, Horovod,\nscikit-learn and XGBoost. It provides\nextensions to improve performance, including GPU\nacceleration in XGBoost, distributed deep\nlearning using HorovodRunner, and model\ncheckpointing.\n\nTo use Databricks Runtime ML, select the ML version\nof the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\naccess data in Unity Catalog for machine learning\nworkflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\nisolation clusters are not compatible with Databricks\nRuntime for Machine Learning.\n\n\nMachine learning applications often\nneed to use shared storage for data\nloading and model checkpointing. You\ncan load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\nfiles. A table is a collection of\nstructured data stored as a directory\non cloud object storage.\n\nFor [data preprocessing](https://docs.databricks.com/machine-learning/preprocess-data/index.html) , you can\nuse [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) to create\nnew features, explore and reuse\nexisting features, track lineage and\nfeature creation code, and publish\nfeatures to low-latency online stores\nfor real-time inference. The Feature\nStore is a centralized repository\nthat enables data scientists to find\nand share features. It ensures that\nthe same code used to compute\nthe feature values is used for model\ntraining and inference. 
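A minimal sketch of the Feature Store workflow described above follows; the source table, feature table name and aggregations are hypothetical, and it assumes the Spark session that Databricks notebooks provide.

```python
# A minimal Databricks Feature Store sketch: compute per-customer features and register
# them as a feature table. Source table, feature table name and columns are hypothetical.
from databricks.feature_store import FeatureStoreClient
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()  # the notebook-provided session in Databricks
fs = FeatureStoreClient()

customer_features = (
    spark.table("sales.orders")
    .groupBy("customer_id")
    .agg(
        F.count("*").alias("order_count"),
        F.sum("amount").alias("lifetime_value"),
    )
)

fs.create_table(
    name="ml.customer_features",        # hypothetical feature table
    primary_keys=["customer_id"],
    df=customer_features,               # writes the features and registers the table
    description="Per-customer order counts and lifetime value",
)
```

The same feature table can then be looked up at training time and reused at inference time, which is what keeps the feature computation consistent across both.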
The Feature\nStore library is available only on\nDatabricks Runtime for Machine\nLearning and is accessible through\nDatabricks notebooks and workflows.\n\n\n###### Resources:\n\n- [The Comprehensive Guide to Feature Stores](https://www.databricks.com/resources/ebook/the-comprehensive-guide-to-feature-stores)\n\n- [Load data for machine learning and deep learning](https://docs.databricks.com/machine-learning/load-data/index.html)\n\n- [Preprocess data for machine learning and](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n[deep learning](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n\n\n-----\n\nC `USTOMER STORY: ZIPLINE`\n\n### Data-driven drones deliver lifesaving medical aid around the world\n\n\nAutomated logistics and delivery system\n\nprovider [Zipline](https://www.flyzipline.com/ ) is redefining logistics by using\n\ncutting-edge drone technology and a global\n\nautonomous logistics network to save lives\n\n\ninformation they need to accurately measure success, find\n\nthe metrics that relate to customer experiences or logistics,\n\nand improve on them exponentially as more data is ingested\n\nand machine learning models are refined.\n\n\nby giving remote communities access to\n\n\nemergency and preparatory medical aid and\n\nresources, regardless of where they are in the\n\nworld.\n\nDoing so requires the ability to ingest and\n\nanalyze huge chunks of time series data in real\n\ntime. This data is produced every time a drone\n\ntakes flight and includes performance data,\n\nin-flight battery management, regional weather\n\npatterns, geographic obstacles, landing errors\n\nand a litany of other information that must be\n\nprocessed.\n\n\n“About 30% of the deliveries we do are lifesaving emergency\n\ndeliveries, where the product being delivered does not exist\n\nat the hospital. We have to be fast, and we have to be able\n\nto rely on all the different kinds of data to predict failures\n\nbefore they occur so that we can guarantee a really, really\n\nhigh service level to the people who are literally depending\n\non us with their lives,” said Zipline CEO Keller Rinaudo.\n\n“Databricks gives us confidence in our operations, and\n\nenables us to continuously improve our technology, expand\n\nour impact, and provide lifesaving aid where and when it’s\n\nneeded, every single day.”\n\n[Read full story here.](https://www.databricks.com/customers/zipline)\n\n\nEvery Zipline flight generates a gigabyte of data\n\nwith potential life-or-death consequences,\n\nbut accessing and federating the data for both\n\ninternal and external decision-making was\n\nchallenging. With Databricks as the common\n\nplatform, Zipline’s data team can access all the\n\n\n-----\n\n**Step 2**\n**Model training**\nFor training machine learning and deep learning\nmodels, you can use [AutoML](https://docs.databricks.com/machine-learning/automl/index.html) , which automatically\nprepares a data set for model training, performs a set\nof trials using open-source libraries such as scikit-learn\nand XGBoost, and creates a Python notebook with\nthe source code for each trial run so you can review,\nreproduce and modify the code.\n\nIn Databricks, [notebooks](https://docs.databricks.com/notebooks/index.html) are the primary tool for\ncreating data science and machine learning workflows\nand collaborating with colleagues. 
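As a concrete sketch of the AutoML entry point mentioned above (the table name and target column are hypothetical placeholders, not from the guide):

```python
# A minimal AutoML sketch: launch a classification experiment on a training table.
# The table name and target column are hypothetical placeholders.
from databricks import automl
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()          # notebook-provided session in Databricks
training_df = spark.table("ml.churn_training")      # hypothetical training table

summary = automl.classify(
    dataset=training_df,
    target_col="churned",       # label column to predict
    timeout_minutes=30,         # stop the experiment after 30 minutes
)

# Every trial is logged to MLflow; the best model can be reviewed, reproduced or registered.
print(summary.best_trial.model_path)
```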
Databricks\nnotebooks provide real-time coauthoring in multiple\nlanguages, automatic versioning and built-in data\nvisualizations.\n\n\n###### Resources:\n\n- [Model training examples](https://docs.databricks.com/machine-learning/train-model/index.html)\n\n- [Training models with Feature Store](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n\n- [Best practices for deep learning on Databricks](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n\n- [Machine learning quickstart notebook](https://docs.databricks.com/machine-learning/train-model/ml-quickstart.html)\n\n\n-----\n\n###### Resources:\n\n- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n\n- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n\n- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n\n- [Track ML Model training data with Delta Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n\n- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)\n\n\n**Step 3**\n**Track model development**\nThe model development process is iterative, and can\nbe challenging. You can use [MLflow tracking](https://mlflow.org/docs/latest/tracking.html) to help\nyou keep track of the model development process,\nincluding parameter settings or combinations you have\ntried and how they affected the model’s performance.\n\nMLflow tracking uses experiments and runs to log\nand track your model development. A run is a single\nexecution of model code. An experiment is a collection\nof related runs. Within an experiment, you can compare\nand filter runs to understand how your model performs\nand how its performance depends on the parameter\nsettings, input data, etc.\n\nMLflow can automatically log training code written\nin many ML frameworks. This is the easiest way to\nget started using MLflow tracking. With MLflow’s\nautologging capabilities, a single line of code\nautomatically logs the resulting model.\n\n\nA hosted version of MLflow Model Registry can help\n[manage the full lifecycle](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) of MLflow models. You can\napply webhooks to automatically trigger actions based\non registry events. For example, you can trigger CI\nbuilds when a new model version is created or notify\nyour team members through Slack each time a model\ntransition to production is requested. This promotes\na traceable version control work process. You can\nleverage this feature for web traffic A/B testing, with\ntraffic funneled to versions of deployed models for more\nprecise population studies.\n\n\n**Step 4**\n**Deploy machine learning models**\nYou can use MLflow to deploy models for batch or\nstreaming inference or to set up a REST endpoint to\nserve the model. Simplify your model deployment by\nregistering models to [the MLflow Model Registry](https://docs.databricks.com/mlflow/model-registry.html) . 
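A minimal sketch of MLflow autologging and model registration follows, assuming scikit-learn and a hypothetical registered model name.

```python
# A minimal MLflow tracking sketch: autolog a scikit-learn run, then register the model.
# The registered model name "churn_classifier" is a hypothetical placeholder.
import mlflow
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

mlflow.autolog()  # one line: parameters, metrics and the fitted model are logged automatically

X, y = make_classification(n_samples=1_000, n_features=20, random_state=42)

with mlflow.start_run() as run:
    RandomForestClassifier(n_estimators=200, random_state=42).fit(X, y)

# Register the autologged model so it can be managed through the Model Registry
mlflow.register_model(f"runs:/{run.info.run_id}/model", "churn_classifier")
```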
After\nyou have registered your model, you can [automatically](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb)\n[generate a notebook](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb) for batch inference or configure\nthe model for online serving with Serverless RealTime Inference or [Classic MLflow Model Serving on](https://docs.databricks.com/archive/classic-model-serving/model-serving.html)\n\n[Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html) . For model inference for deep learning\napplications, Databricks recommends the following\nworkflow.\n\nTo debug and tune model inference on Databricks,\nusing GPUs (graphics processing units) can efficiently\noptimize the running speed for model inference. As\nGPUs and other accelerators become faster, it is\nimportant that the data input pipeline keep up with\ndemand. The data input pipeline reads the data into\nSpark DataFrames, transforms it and loads it as the\ninput for model inference.\n\n\n-----\n\n```\nCUSTOMER STORY: ITERABLE\n\n### Optimizing touch points across the entire customer journey\n\n```\n“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meet\n\nrising consumer demands for more personalized experiences that drive revenue and results.” —Sinéad Cheung,\n\nPrincipal Product Manager, [Iterable](https://iterable.com/)\n\nCaptivating an audience and understanding customer journeys are essential to creating deeper brand- customer\n\nconnections that drive growth, loyalty and revenue. From helping medical practitioners build trust with new\n\npatients to ensuring that food delivery users feel connected to their culinary community, Iterable helps more\n\nthan 1,000 brands optimize and humanize their marketing in today’s competitive landscape.\n\nThis need to build personalized and automated customer experiences for its clients drove the company to find a\n\nfully managed platform that would simplify infrastructure management, make collaboration possible, and give it\n\nthe ability to scale for analytics and AI.\n\nWith Databricks Lakehouse, Iterable can harness diverse, complex data sets — including conversion events,\n\nunique user labels, engagement patterns and business insights — and facilitate rapid prototyping of machine\n\nlearning models that deliver top-notch and personalized user experiences for higher-converting marketing\n\ncampaigns. [Read the full story here.](https://www.databricks.com/customers/iterable)\n\n\n-----\n\n###### ML Stages\n\nML workflows include the following key assets: code,\nmodels and data. These assets need to be developed\n(dev), tested (staging) and deployed (production).\nEach stage needs to operate within an execution\nenvironment. So the execution environments, code,\nmodels and data are divided into dev, staging and\nproduction.\n\nML project code is often stored in a version control\nrepository (such as Git), with most organizations using\nbranches corresponding to the lifecycle phases of\ndevelopment, staging or production.\n\nSince model lifecycles do not correspond one-toone with code lifecycles, it makes sense for model\nmanagement to have its own service. MLflow and its\nModel Registry support managing model artifacts\ndirectly via UI and APIs. 
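To show what the generated batch-inference flow boils down to, here is a minimal sketch that loads a registered model by stage and scores a Spark table; the model name, table names and feature columns are hypothetical placeholders.

```python
# A minimal batch-inference sketch: load a registered model by stage and score a table
# with a Spark UDF. Model name, table names and feature columns are hypothetical.
import mlflow
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()                      # notebook-provided session in Databricks

model_uri = "models:/churn_classifier/Production"               # stage-based Model Registry URI
predict = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri)   # wraps the model as a Spark UDF

feature_cols = ["order_count", "lifetime_value"]                # hypothetical model inputs

scored = spark.table("ml.customers_to_score").withColumn(
    "churn_prediction", predict(*feature_cols)
)
scored.write.mode("overwrite").saveAsTable("ml.customer_churn_predictions")
```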
The loose coupling of model\nartifacts and code provides flexibility to update\nproduction models without code changes, streamlining\nthe deployment process in many cases.\n\nDatabricks recommends creating separate\nenvironments for the different stages of ML code and\nmodel development with clearly defined transitions\nbetween stages. The recommended MLOps workflow is\nbroken into these three stages:\n\n\n[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\nis experimentation. Data scientists develop features\nand models and run experiments to optimize model\nperformance. The output of the development process is\nML pipeline code that can include feature computation,\nmodel training inference and monitoring\n\n\n-----\n\n[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\nThis stage focuses on testing the ML pipeline code\nfor production readiness, including code for model\ntraining as well as feature engineering pipelines and\ninference code. The output of the staging process is a\nrelease branch that triggers the CI/CD system to start\nthe production stage.\n\n\n-----\n\n[Production](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#production-stage)\nML engineers own the production environment\nwhere ML pipelines are deployed. These pipelines\ncompute fresh feature values, train and test new model\nversions, publish predictions to downstream tables\nor applications, and monitor the entire process to\navoid performance degradation and instability. Data\nscientists have visibility to test results, logs, model\nartifacts and production pipeline status to allow them\nto identify and diagnose problems in production.\n\nThe Databricks Machine Learning home page provides\nquick access to all the machine learning resources. To\naccess this page, move your mouse or pointer over\nthe left sidebar in the Databricks workspace. 
From\nthe persona switcher at the top of the sidebar, select\n\nMachine Learning.\n\nFrom the shortcuts menu, you can create\na [notebook](https://docs.databricks.com/notebooks/index.html) , [start AutoML](https://docs.databricks.com/machine-learning/automl/index.html) or open a [tutorial notebook](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) .\nThe center of the screen includes any recently viewed\nitems, and the sidebar provides quick access to\nthe [Experiments page](https://docs.databricks.com/mlflow/tracking.html#mlflow-experiments) , [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) and\n[Model Registry.](https://docs.databricks.com/mlflow/model-registry.html)\nNew users can get started with a series of [tutorials](https://docs.databricks.com/machine-learning/tutorial/index.html)\nthat illustrate how to use Databricks throughout the\n\n\n-----\n\n###### Resources:\n\n- [MLOps Virtual Event: Standardizing MLOps at Scale](https://www.databricks.com/p/webinar/mlops-virtual-event)\n\n- [Virtual Event — Automating the ML Lifecycle With](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n[Databricks Machine Learning](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n\n- [MLOps Virtual Event “Operationalizing Machine](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n[Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n\n- [The Big Book of MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)\n\n- [Machine learning on Databricks](https://www.databricks.com/product/machine-learning)\n\n- [Watch the demos](https://www.databricks.com/discover/demos)\n\n\nML lifecycle or access the [in-product quickstart](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html)\nfor a model-training tutorial notebook that steps\nthrough loading data, training and tuning a model,\ncomparing and analyzing model performance and\nusing the model for inference.\n\nAlso be sure to download the [Big Book of MLOps](https://www.databricks.com/p/thank-you/the-big-book-of-mlops) to\nlearn how your organization can build a robust MLOPs\npractice incrementally.\n\n\n-----\n\n# 04\n```\nSUMMARY: \u0003\n\n## The Databricks Lakehouse Platform addresses these challenges\n 04\n\n```\n\n-----\n\n### Summary\n\nWe’ve organized the common data challenges for startups and growing digital native\n\nbusinesses into three main buckets: Building a **unified data architecture** — one that\n\nsupports **scalability and performance** ; and building effective **machine learning**\n\n**operations** , all with an eye on cost efficiency and increased productivity.\n\nThe Lakehouse Platform provides an efficient and scalable architecture that solves\nthese challenges and will support your data, analytics and AI workloads now and as\nyou scale.\n\nWith [Databricks](https://www.databricks.com/) you can unify all your data with cost-efficient architecture for highly\nperformant digital native applications and analytic workloads — designed to scale as\nyou grow. Use your data however and wherever you want with open-source flexibility,\nleverage open formats, APIs and your tools of choice. 
Ensure reliable, high-performing\ndata workloads while Databricks automatically manages your infrastructure as you\nscale. Leverage serverless Databricks SQL to increase productivity and scale on\ndemand with up to 12x better price/performance.\n\nEasily access data for ML models and accelerate the full ML lifecycle from\nexperimentation to production.\n\nDiscover more about the lakehouse for companies born in the cloud **.**\n\n\n-----\n\n### Get started with Databricks Trial\n\nGet a collaborative environment for data teams to build\nsolutions together with interactive notebooks to use\nApache Spark™, SQL, Python, Scala, Delta Lake, MLflow,\nTensorFlow, Keras, scikit-learn and more.\n\n\n### Get started with About Databricks Trial Databricks\n\nGet a collaborative environment for data teams to build Databricks is the lakehouse company. More than 7,000\nsolutions together with interactive notebooks to use organizations worldwide — including Comcast, Condé\nApache Spark™, SQL, Python, Scala, Delta Lake, MLflow, Nast and over 50% of the Fortune 500 — rely on the\nTensorFlow, Keras, scikit-learn and more. Databricks Lakehouse Platform to unify their data,\n\nanalytics and AI. Databricks is headquartered in San\n\nAvailable as a 14-day full trial in your own cloud or as\n\nFrancisco, with offices around the globe. Founded by\n\na lightweight trial hosted by Databricks.\n\nthe original creators of Apache Spark™, Delta Lake and\nMLflow, Databricks is on a mission to help data teams\nsolve the world’s toughest problems. To learn more,\nfollow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n**[TRY DATABRICKS FOR FREE](https://www.databricks.com/try-databricks?itm_data=H#account)**\n\n\n\n- Available as a 14-day full trial in your own cloud or as\na lightweight trial hosted by Databricks.\n\n\n© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "**EBOOK**\n\n# Four Forces Driving Intelligent Manufacturing\n\n### A data-driven business built on Lakehouse for Manufacturing\n\n\n-----\n\n## Contents\n\nIntroduction .................................................................................................................................................................................................................................................. **03**\n\nThe four driving forces of change ..................................................................................................................................................................................................... **04**\n\nDigital transformation is not a destination, it’s a journey .......................................................................................................................................................... **05**\n\nManufacturing – use case maturity matrix ...................................................................................................................................................................................... 
**06**\n\nThe foundations for data-driven manufacturing ............................................................................................................................................................................ **07**\n\nDRIVING FORCE NO. 1\nThe shift from manufacturing to Intelligent Manufacturing ...................................................................................................................................................... **08**\n\nDRIVING FORCE NO. 2\nTransparency, visibility, data: optimizing the supply chain ........................................................................................................................................................ **10**\n\nDRIVING FORCE NO. 3\nFuture opportunities for manufacturing business models ......................................................................................................................................................... **13**\n\nDRIVING FORCE NO. 4\nThe focus on sustainability ....................................................................................................................................................................................................................... **15**\n\nLeveraging the Databricks Lakehouse for Manufacturing ........................................................................................................................................................... **17**\n\nThe building blocks of Lakehouse for Manufacturing .................................................................................................................................................................... **18**\n\nManufacturers’ end goals .......................................................................................................................................................................................................................... **19**\n\n2 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Introduction\n\n##### Manufacturing has always been an evolutionary business, grounded upon infrastructure, business processes, and manufacturing operations built over decades in a continuum of successes, insights and learnings. The methods and processes used to approach the development, release and optimization of products and capital spend are the foundation of the industry’s evolution.\n\n\nBut today it’s data- and AI-driven businesses that\nare being rewarded because they’re using process\nand product optimization not previously possible,\nable to forecast and sense supply chain demand,\nand, crucially, introduce new forms of revenue\nbased upon service rather than product.\n\nThe drivers for this evolution will be the emergence\nof what we refer to as “Intelligent Manufacturing”\nthat has been enabled by the rise of computational\npower at the Edge and in the Cloud. As well as\nnew levels of connectivity speed enabled by 5G\nand fiber optic, combined with increased use of\nadvanced analytics and machine learning (ML).\n\n\nYet, even with all the technological advances\nenabling these new data-driven businesses,\nchallenges exist.\n\nMcKinsey’s recent research with the World\nEconomic Forum estimates the value creation\npotential of manufacturers and suppliers that\nimplement Industry 4.0 in their operations\nat USD$37 trillion by 2025. 
Truly a huge number.\nBut the challenge that most companies still\nstruggle with is the move from piloting point\nsolutions to delivering sustainable impact at scale.\n[Only 30% of companies are capturing value from](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n[Industry 4.0 solutions in manufacturing today.](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n\n\n##### 80% of manufacturers\n[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n\n##### 57% of manufacturing leaders feel their organization\n[lacks skilled workers to support](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n[their smart manufacturing plans](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n\n[A lack of supply chain](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n[integration could stall smart](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n[factory initiatives for](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf) **[3 in 5](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)**\n##### manufacturers by 2025\n\n\n3 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The four driving forces of change\n\n###### Over the last two years, demand imbalances and supply chain swings have added a sense of urgency for manufacturers to digitally transform. But in truth, the main challenges facing the industry have existed, and will continue to exist, outside these recent exceptional circumstances. Manufacturers will always strive for greater levels of visibility across their supply chain, always seek to optimize and streamline operations to improve margins. In the continuing quest for improved efficiency, productivity, adaptability and resilience, manufacturers are commonly tackling these major challenges:\n\n\n###### Skills and production gaps\n\nThe rise of the digital economy is demanding a new set of skills.\nFor today’s Intelligent Manufacturing organizations, there’s a fundamental\nneed for computer and programming skills for automation, along\nwith critical-thinking abilities. Also important is the ability to use\ncollaboration systems and new advanced assistance tools, such as\nautomation, virtual reality (VR) and augmented reality (AR). 
The deficit\nof workers with these skills is of critical concern to manufacturers.\n\nIn addition, the industry dynamics are pushing companies to increase\nand refine both partner/supplier relationships, optimize internal\noperations and build robust supply chains that do not rely upon\nsafety stock to weather supply chain swings. Historical focus on\noperational use cases is now extending to building agile supply chains.\n\n###### Supply chain volatility\n\nIf the events of the last few years proved anything, it’s that supply\nchains need to be robust and resilient. Historically, supply chain volatility\nwas smoothed by holding “safety stock,” which added costs without\nfinancial value. Then the pendulum swung to “just in time delivery,”\nwhere efficient use of working capital disregarded demand risks.\n\nRecent experiences have highlighted that demand sensing is needed\nin addition to safety stock for high-risk parts or raw materials. The ability\nto monitor, predict and respond to external factors – including natural\ndisasters, shipping and warehouse constraints, and geopolitical disruption\n– is vital to reduce risk and promote agility. Many of these external\ndata sources leverage unstructured data (news, social posts, videos\nand images), and being able to manage both structured and unstructured\ndata available to measure and analyze this volatility is key.\n\n\n###### Need for new and additional sources of revenue\n\nManufacturers’ growth historically has been limited\nto new product introduction rate or expansion into\nnew geographies. The emergence of “equipment\nas-a-service” is changing that dynamic. It’s pivoting\nthe business from product-centric growth to one\nleveraging added services, which are not slaves to the\nproduct development introduction cycle and can be highly\ndifferentiated depending on the market segment and types\nof products. Real-time data plays an outsize role, as now\nbusinesses are in unison with use cases such as predictive\nmaintenance, stock replenishment and worker safety.\n\n###### An increased focus on sustainability\n\nManufacturers have always focused on efficiency,\nbut they’re increasingly seeing the value chain as circular.\nIt’s no longer enough to consider an organization’s own\ncarbon footprint – it needs to also include indirect\nemissions and other environmental impacts from the\nactivities it doesn’t own or control. 
This requires a 360-degree view of sustainability, which includes both internal and external factors in measuring compliance with ESG programs.\n\n**This eBook will look closer at these four key challenges and their associated use cases, as well as some of the most effective technologies and solutions that can be implemented to respond to them.**\n\n\n4 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Digital transformation is not a destination, it’s a journey\n\n##### Digitalization is reshaping many areas of manufacturing and logistics: product design, production and quality of goods, as well as sustainability and energy output.\n\nThis transition from manual operations to automated solutions is enhancing and optimizing operational efficiency and decision-making, while also making supply chains more frictionless and reliable, as well as enabling organizations to become more responsive and adaptable to market and customer needs.\n\nThis disruption has been driven by a rush of new technologies including artificial intelligence, machine learning, advanced analytics, digital twins, Internet of Things (IoT), and automation. These, in turn, have been enabled by the greater network capabilities of 5G. Industry 4.0 is well underway. Intelligent Manufacturing isn’t the future, it’s what competitive organizations have established today.\n\n\n## The data and AI maturity curve\n### From descriptive to prescriptive\n\n[Figure: the analytics maturity curve, progressing from raw data, cleaned data, reports and ad hoc queries (“What happened?”, “Why did it happen?”) through data exploration and predictive modeling (“What will happen?”) to prescriptive analytics (“How can we make it happen?”).]\n\n\n5 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Manufacturing – use case maturity matrix\n\n[Figure: maturity matrix of 23 manufacturing use cases (EDW offload, Product 360, Voice of customer insights, Testing & simulation optimization, Supplier 360, Spend analytics, Sourcing event optimization, Process & quality monitoring, Process 360, Equipment predictive maintenance, Quality & yield optimization, Supply chain 360, Demand analytics, Inventory visibility & tracking, Inventory optimization, Logistics route optimization, Customer 360, Marketing & sales personalization, Recommendation engine, Asset/Vehicle 360, Connected asset & value-added services, Quality event detection & traceability, Asset predictive maintenance), plotted across the value chain (Design, Purchasing, Manufacturing, Supply Chain, Marketing & Sales, Service) and maturity stages (Awareness, Exploration, Optimization, Transformation), with a peer competitive scale ranging from standard, common and strategic among the peer group to new innovations.]\n\nThat is not to say that the digital transformation journey is simple. Replacing legacy systems, breaking down data and organizational silos, bridging the gap between operational technology (OT) and informational technology (IT), reskilling workforces, and much more requires a clear and determined digitalization strategy, and the ability to reach new levels of IT and data maturity.\n\nMuch of the aforementioned transformation requires a foundation of effective data management and architecture to be in place. Without this ability to control the vast amounts of structured data (highly organized and easily decipherable) and unstructured data (qualitative, no predefined data model), manufacturers cannot generate actionable insights from their data, derive value from machine learning, monitor and analyze supply chains, or coordinate decisions across the business.\n\n\n6 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The foundations for data-driven manufacturing\n\n###### Cloud-native platforms\n\nImprove data management, enhance data analytics and expand the use of enterprise data, including streaming structured and unstructured data\n\n###### Technology-enabled collaboration\n\nDemocratize analytics and ML capabilities – ensure the right users have access to the right data driving business value\n\n###### The ability to scale machine learning use cases\n\nA central place to store and discover ML models, enabling greater collaboration between ML, data and business users\n\n###### Open standards and open data architectures\n\nLeverage open source standards and open data formats to accelerate innovation and enable the integration of best-of-breed, third-party tools and services\n\n\n##### 95% agree that\n[digital transformation in manufacturing is essential to their company’s future success](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n\n[Global spending on digital transformation is forecast to reach](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n##### USD$2.8 trillion by 2025\n\n##### 85% have accelerated\n[their digital transformation strategies since 2020](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n\n\n7 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 1\n\n## The shift from manufacturing to Intelligent Manufacturing\n\n##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion. The immediate response would be to point the finger at the pandemic. Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n\nYet the reasons for the lack of manufacturing talent today are manifold, and COVID-19 has only contributed to an existing problem. 
For instance,\nmany highly experienced baby boomers are\nretiring from the workforce, leaving fewer people\nwith the in-depth knowledge of custom equipment\nand machines. Meanwhile, younger generations\nhave a poor perception of what manufacturing jobs\nare like and are reluctant to step into the industry.\nMeaning not only a problem with retaining skills,\nbut also attracting them.\n\nAnd, of course, there is a growing gap between\nthe current capabilities of industrial workers and\nthe skill sets needed for today’s data-driven,\nsensor-filled, 5G-enabled Intelligent Manufacturing.\n\n\nWith the drive to optimize operations, stabilize\nsupply chains and reinvent business models\nthrough equipment-as-a-service, the skill sets\nhave radically changed from even a decade ago.\n\nIntelligent Manufacturing’s use cases are placing\na high demand on robotics programmers and\ntechnicians, cybersecurity experts, digital twin\narchitects, supply network analysts, and people\nwho can leverage AI and machine learning\nalgorithms because deployment of these common\nuse cases is producing multiples of returns for\nthose embracing Intelligent Manufacturing.\n\n\n8 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Those manufacturers with a strategy for upskilling existing talent, while also changing the perception of the incoming workforce, need to take advantage of the following use cases:\n\n\n##### 44% report difficulty\n[hiring manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[talent with the required](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[digital expertise](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n\n##### 83% of manufacturing workers are interested\n[in learning new digital skills](https://www.mendix.com/press/welcome-news-to-jumpstart-the-post-pandemic-economy-mendix-survey-shows-78-of-u-s-manufacturing-workers-want-to-help-with-digital-transformation/)\n\n##### 56% of Gen Z say\n[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[changed their perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[of manufacturing. 77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n\n### Proof through customer success\n\n##### Watch our case study\n\n\n###### Digital twins\n\nIngesting information from sensors and other data sources,\nthese virtual replicas of physical assets create models\nto which a layer of visualization can be applied. This enables\nusers to predict failures, assess performance and reveal\nopportunities for optimization. Digital twins unlock the ability\nfor manufacturers to monitor and manage production remotely,\nas well as explore “what-if” scenarios.\n\n###### Process and quality optimization\n\nProcess and quality optimization generally covers the\noptimization of equipment, operating procedures, and control\nloops. It requires access to accurate, up-to-date data about\nconditions, collected through IoT devices to monitor every\naspect. 
The introduction of deep learning architectures is\nenabling manufacturing machinery to identify visual clues\nthat are indicative of quality issues in manufactured goods,\nwhile digital twins can be used to spot inefficiencies without\nthe need to pause production.\n\n###### Throughput optimization\n\nIncreasing throughput is critical for meeting delivery schedules,\nand manufacturers are always looking for ways to identify\nand eliminate bottlenecks, reduce inventory and increase\nthe utilization of assets. Throughput optimization makes\nuse of data-driven algorithms to identify, rank and resolve\nlabor, equipment or inventory bottlenecks.\n\n\n###### Equipment predictive maintenance\n\nRather than wait for a piece of equipment to fail or\nstick to a fixed schedule, predictive maintenance adopts\na predictive approach to equipment maintenance.\nBy monitoring real-time data collected from hundreds\nof IoT sensors, machine learning techniques can detect\nanomalies in operations and possible defects in equipment\nand processes. Predictive maintenance correlates data across\nmany more dimensions than traditional inspection techniques,\nto anticipate failures and prevent costly breakdowns.\n\n###### Quality and yield optimization (with computer vision)\n\nQuality assurance focuses on the use of data analytics,\nAI and machine learning to identify and prevent defects\nduring the manufacturing process. [This type of edge AI](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n[is an approach that can increase productivity by 50%](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n[and detection rates by up to 90%.](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process) Making use of image\nrecognition and machine learning, computer vision\ncan automate visual inspections, detecting faults\nand imperfections faster and more cost effectively\nthan manual approaches.\n\n\n9 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 2\n\n## Transparency, visibility, data: optimizing the supply chain\n\n##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n\n\nSuch resiliency requires a combination\nof technologies and solutions. For example,\ndecision support tools with predictive capabilities\n– to monitor the supply chain and analyze\nwhat-if scenarios. 
Demand sensing and forecasting\nin combination with enterprise critical systems\n(ERP) needs to combine data from a wide variety\nof sources.\n\n10 Four Forces Driving Intelligent Manufacturing\n\n\nWorking together, combining millions of data points\nfrom across organizations’ operations along with\nother external sources, these technologies can\nbe used to optimize supply chains, reduce costs\nand improve customer service and loyalty.\nHowever, achieving this – embracing the latest\nin AI, machine learning and predictive analytics –\nmeans being able to manage and maintain\na flow of accurate, relevant data and to be able\nto translate this data into actionable insights.\n\n\n-----\n\n#### Successful supply chain optimization depends on up-to-the-minute, end-to-end visibility that can be applied across all stages of the supply chain, from design to planning to execution. This will incorporate a range of solutions that can include:\n\n\n###### Demand, inventory, logistics\n\n\n###### Purchasing\n\n**Spend analytics:** Most obviously, transparency and insight into where\ncash is spent is vital for identifying opportunities to reduce external\nspending across supply markets, suppliers and locations. However, spend\nanalytics are also hugely important to supply chain agility and resilience.\nThis requires a single source of data truth for finance and procurement\ndepartments. For example, integrating purchase order, invoice,\naccounts payable, and general-ledger account data to create a level of\ntransparency, visibility and consistency to inform supplier discussions\nand deploy strategies to manage cash better during times\nof disruption.\n\n###### Cross supply chain collaboration\n\n**Supply chain 360:** With real-time insights and aggregated supply\nchain data in a single business intelligence dashboard, manufacturers\nare empowered with greater levels of visibility, transparency\nand insights for more informed decision-making. This dashboard\ncan be used to identify risks and take corrective steps,\nassess suppliers, control costs and more.\n\n\n**Demand analytics:** By collecting and analyzing millions –\nif not billions – of data points about market and customer\nbehavior and product performance, manufacturers can\nuse this understanding to improve operations and support\nstrategic decisions that affect the demand of products\nand services. 
[Around 80% say that using this form of data](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[analysis has improved decision-making, while 26% say](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[having this level of know-how to predict, shape and meet](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[demands has increased their profits.](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n\n**Inventory visibility and tracking:**\nInventory visibility is the ability to view and track\ninventory in real time, with insights into SKU stock levels\nand which warehouse or fulfillment center it is stored at.\nWith complete oversight of inventory across multiple\nchannels, this helps improve supply chain efficiency,\ndemand forecasting and order accuracy, while ultimately\nenhancing the customer experience.\n\n\n**Inventory optimization:** The practice of having the right\namount of available inventory to meet demand, both in the\npresent and the future, enables manufacturers to address\ndemand expectations, and reduce the costs of common\ninventory issues. Inventory optimization incorporates\ndata for demand forecasting, inventory strategy and\nstock replenishment. With the addition of AI reinforced\nlearning models, this can help improve demand prediction,\nrecommend stock levels, and automatically order\nraw materials to fulfill orders, while also detecting\nand responding to shifts in demand.\n\n**Logistics route optimization:** Using AI, route optimization\ncan help manufacturers go beyond normal route planning\nand include parameters to further drive logistics efficiency.\nWhat-if scenarios present route options that help cut\ntransportation costs, boost productivity and execute\non-time deliveries.\n\n\n**Supply chain network design:** By building and modeling the supply\nchain, it enables manufacturers to understand the costs and time\nto bring goods and services to market. Supply chain network design\nhelps to evaluate delivery at the lowest possible cost, optimal sources\nand inventory deployment, as well as define distribution strategies.\n\n11 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n[Successfully implementing AI-enabled supply](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n[chain management has enabled early adopters to](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n##### improve logistics costs by 15%, inventory levels by 35%, and service levels by 65%\n\n Only 6% of companies believe\n[they’ve achieved full supply chain visibility](https://www.supplychaindive.com/news/supply-chain-visibility-failure-survey-geodis/517751/\r)\n\n##### 57% believe that supply chain management \n[gives them a competitive edge that enables them](https://financesonline.com/supply-chain-statistics/\r)\n[to further develop their business](https://financesonline.com/supply-chain-statistics/\r)\n\n### Supply chain optimization case study\n\n##### Watch our case study\n\n12 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 3\n\n## Future opportunities for manufacturing business models\n\n##### When looking at the rapid evolution and growth of e-commerce, manufacturers have some catching up to do. Particularly when it comes to embracing new and customer-centric business models. 
For example, when shifting from a product to a service mindset, the product lifecycle becomes more holistic and the client relationship is maintained beyond the point of purchase.\n\n\nThese new opportunities are forming part\nof a longer-term industry shift from the sale\nof goods (CapEx) to recurring revenue streams,\nsuch as through Equipment-as-a-Service (EaaS)\nmodels. While this approach is not new to many\n(Rolls-Royce’s “Power-by-the-Hour” engine\nsubscription model has been around since 1962),\ncustomer demand, advances in industrial IoT\ntechnology, and a continuing decline in\nsales and margins have seen EaaS emerge\nas an imperative for manufacturers.\n\n\nOpening up some of these new revenue streams,\nof course, demands operational flexibility, but more\nimportantly, digital maturity. This means cloud\ntechnologies that allow employees new levels\nof access to data, the ability to work anywhere,\nand adapt rapidly to new needs. The introduction\nof a microservices architecture, to allow the agile\ndevelopment and deployment of new IT services.\nAnd the democratization of data, so the entire\norganization and its ecosystem of partners\nand suppliers have access to information\nabout market demand, operations, production,\nlogistics and transportation.\n\n\n13 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n##### By 2023, 20% of industrial equipment manufacturers will\n[support EaaS with remote](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n[Industrial IoT capabilities](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n\n##### In 2025, the global EaaS market is estimated\n[to grow to $131B compared](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n[to $22B in 2019](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n\n##### In the U.S., 34% said\n[pay-per-use models represent](https://relayr.io/pr-forsa-survey/)\n[a big or a very big competitive](https://relayr.io/pr-forsa-survey/)\n[advantage, while 29% consider](https://relayr.io/pr-forsa-survey/)\n[it a slight advantage](https://relayr.io/pr-forsa-survey/)\n\n### Equipment as a service case study\n\n##### Read our case study\n\n\n### This level of visibility and collaboration is not only beneficial to lower maintenance costs, capital expenditure and human capital management, but also in empowering all stakeholders to make smarter and more informed decisions.\n\n\n###### Connected assets\n\nThe digital connectivity of high-value\nphysical assets is helping to drive a\nmore efficient use of assets and cost\nsavings. Connected assets can provide\ncontinuous, real-time data on their\noperating conditions, even if they are on\nthe other side of the world. Connected\nassets can also be used as the foundation\nof as-a-service business models to\ntrack the usage of rented machines, and\nfor automakers to use with connected\nvehicles and electrification strategies.\n\n\n###### Quality event detection and traceability\n\nManufacturers are increasingly seeking\nend-to-end supply chain traceability —\nto be able to identify and trace\nthe history, distribution, location\nand application of products, parts\nand materials. 
With event-based\ntraceability, typically using blockchain\nledgers, manufacturers can record\nevents along the supply chain.\nThis can help aid legal compliance,\nsupport quality assurance and brand\ntrust, and provide full supply chain\nvisibility for better risk management.\n\n\n###### Demand-driven manufacturing\n\n**Equipment-as-a-Service:**\nStartup organizations without\nthe in-house infrastructure can\nuse a third-party to realize their\nconcepts, while manufacturers\nwith the production capabilities\ncan ensure minimal downtime\nfor their assets. This involves\ngreater risk for the manufacturer,\nbut also the potential for higher\nand annuitized revenues.\n\n\n14 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 4\n\n## The focus on sustainability\n\n##### It’s an inescapable truth that Earth’s resources are finite, and we need to change our present, linear business models for something that minimizes our use of resources and eliminates waste. Manufacturers need to take a more sustainable approach, where they can limit their negative environmental impacts, while also conserving energy and natural resources.\n\n\nWhen looking at the entire manufacturing\nvalue chain, there are many areas where\nmore sustainable practices can deliver\nmeasurable change. Products can be\ndesigned in a way that reduces waste\nand increases their longevity; materials\ncan be selected and sourced in a more\nethical way; operational efficiency and\ngreen energy can improve production;\nand the introduction of sustainable\npractices for transportation and\nshipping can help reduce carbon\nfootprints. [These are part of the move](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[toward more circular business models](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[and establishing what PwC has called the](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[four Rs of the circular economy: Reduce,](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[Refurbish/Reuse, Recycle and Recover.](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n\n\nThere are a number of business\noperating models that employ the four\nRs and support the circular economy.\nSharing platforms and aaS models help\noptimize manufacturing capacity and\nenable businesses to rent rather than\nbuy the machinery and equipment\nthey need. Product use extension helps\nextend the lifecycle of products through\nrepair and refurbishment, while resource\nrecovery means recovering raw materials\nfrom end-of-life products.\n\nAchieving this means establishing\na redesigned supply chain that\nleverages many use cases, technologies\nand solutions we covered earlier.\n\n\nIt will require greater levels of\ncollaboration between suppliers\nand vendors. 
It will require optimizing production lines and transportation. It will require greater levels of customer engagement to extend product lifecycles and close the loop of the supply chain.\n\nBut most of all, it will require data, to provide visibility and intelligence across the network, and to be able to make the decisions to improve efficiency in the present, as well as longer-term decisions based on a broad view of sustainability impacts.\n\n\n15 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Sustainability Solution Accelerator\n\n##### Read now\n\n\n[The manufacturing industry alone is responsible for](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[54% of the](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n##### world’s energy consumption\n[and](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[20% of carbon emissions](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n\n##### 80% of the world’s leading companies\n[are now incorporating sustainability into their operations and goals](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n\n##### 78% of industrial, manufacturing and metals organizations now report on sustainability — up from 68% in 2017\n\n\n16 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Leveraging the Databricks Lakehouse for Manufacturing\n\nOur open, simple and collaborative Lakehouse for Manufacturing enables automotive, electronics, industrial, and transportation & logistics organizations to unlock more value and transform how they use data and AI.\n\n[Figure: Lakehouse for Manufacturing reference architecture. All your sources, in any structure or frequency (Competitor News & Social, Consumer Devices, Video & Images, IoT & Shop Floor, Enterprise Resource Planning, Sales Transaction & Syndicated, Inventory & Logistics), are ingested as structured, semi-structured and unstructured data, in batch or real time, into the Data Lakehouse (process, manage and query all your data, on any cloud). Reliable, real-time processing powers analytics capabilities for any use case or persona: Ad Hoc Data Science (low-cost, rapid experimentation with new data and models), Production Machine Learning (high-volume, fine-grained analysis at scale served in the tightest of service windows), BI Reporting and Dashboarding (power real-time dashboarding directly, or feed data to a data warehouse for high-concurrency reporting), and Real-Time Applications (provide real-time data to downstream applications or power applications via APIs).]\n\nLakehouse enables a real-time data-driven business with the ability to ingest structured, semi-structured and unstructured data from ERP, SCM, IoT, social or other sources in your value chain so that predictive AI and ML insights can be realized. This enables manufacturers to operate their business in real time, deliver more accurate analytics that leverage all their data, and drive collaboration and innovation across their value chain. Most important for capital-intensive manufacturing businesses, it enables them to move quickly from proof-of-concept (PoC) ideation to ROI.\n\n\n17 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The building blocks of Lakehouse for Manufacturing\n\n###### Real Time\n\nMake data-informed decisions\n\n###### Solution Accelerators\n\nAccelerate the possibilities of capabilities\n\n###### Partner Solutions\n\nAccelerate the creation of insights\n\n###### Speed\n\nDelivering fast ROI\n\n**Real-time data to make informed decisions:** The Lakehouse Platform streamlines data ingestion and management in a way that makes it easy to automate and secure data with fast, real-time performance. This means you can consolidate and enhance data from across the organization and turn it into accessible, actionable insights.\n\n**Solution Accelerators for new capabilities:** Through our Solution Accelerators, manufacturers can easily access and deploy common and high-impact use cases. For manufacturers restricted by time and resources, these accelerators provide the tools and pre-built code to deliver PoCs in less than two weeks.\n\n**Pre-built applications to deliver solutions faster:** We make it easy for you to discover data, analytics and AI tools, using pre-built integrations to connect with partner solutions, integrating them (and existing solutions) into the Lakehouse Platform to rapidly expand capabilities in a few clicks.\n\n**The speed to deliver fast ROI:** With faster data ingestion and access to insights, combined with easier, quicker deployments, this means accelerated digital transformation and higher ROI.\n\n\n18 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Manufacturers’ end goals\n\n##### Intelligent Manufacturing leaders combine familiar manufacturing techniques with newer, value-producing and differentiating data-led use cases.\n\nThis means making use of IIoT, cloud computing, data analytics, machine learning and more to create an end-to-end digital ecosystem across the entire value chain and build scalable architectures that take data from edge to AI. It means embracing automation and robotics, optimizing how organizations use assets and augmenting the capabilities of workforces, and introducing new levels of connectivity to accelerate performance. Not to mention opening the door to new platform and as-a-service business models with the potential to generate new revenue streams.\n\nAlso key to the data-driven transformation of manufacturing is visibility: a 360-degree, end-to-end view of the supply chain. Not only is this critical for the efficiency, optimization and profitability of operations, it is needed to be able to take new strides in sustainability.\n\nOf course, better data management is not only about unlocking insight, empowering AI, and enabling decision-making. It’s also about governance: acknowledging format issues, adhering to compliance, protecting IP, ensuring data security. 
All this needs to be taken into\nconsideration when bringing onboard an ISV to establish a modern,\nunified architecture for data and AI.\n\n19 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\nincluding Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\nis headquartered in San Francisco, with offices around the globe. Founded by\nthe original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\nmission to help data teams solve the world’s toughest problems. To learn more,\nfollow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\nGet started with a free trial of Databricks and\nstart building data applications today\n\n##### Start your free trial\n\nTo learn more, visit us at:\n\n**[Databricks for Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "## Driving Innovation and Transformation in the Federal Government With Data + AI\n\nEmpowering the federal government\nto efficiently deliver on mission objectives\nand better serve citizens\n\n\n-----\n\n### Contents\n\nState of the union: Data and AI in the federal government **03**\n\nRecognizing the opportunity for data and AI **04**\n\nChallenges to innovation **07**\n\nThe Databricks Lakehouse Platform: Modernizing the federal government to achieve mission objectives **09**\n\nCustomer story: U.S. Citizenship and Immigration Services **13**\n\nConclusion **15**\n\n\n-----\n\n### State of the union: Data and AI in the federal government\n\nFor the private sector, the growth, maturation and application of data analytics and\n\nartificial intelligence (AI) have driven innovation. This has resulted in solutions that have\n\nhelped to improve efficiencies in everything from optimizing supply chains to accelerating\n\ndrug development to creating personalized customer experiences and much more.\n\nUnfortunately, the federal government and many of its agencies are just beginning to take\n\nadvantage of the benefits that data, analytics and AI can deliver. This inability to innovate\n\nis largely due to aging technology investments, resulting in a sprawl of legacy systems\n\nsiloed by agencies and departments.\n\nAdditionally, the government is one of the largest employers in the world, which introduces\n\nsignificant complexity, operational inefficiencies and a lack of transparency that limit the\n\nability of its agencies to leverage the data at their disposal for even basic analytics – let\n\nalone advanced data analytic techniques, such as machine learning.\n\n\n-----\n\n### Recognizing the opportunity for data and AI\n\nThe opportunity for the federal government to leverage data analytics and AI cannot be\n\noverstated. 
With access to some of the largest current and historical data sets available to the\n\n\nUnited States — and with vast personnel resources and some of the best private sector use\n\ncases and applications of AI available in the world — the federal government has the ability to\n\ntransform the efficiency and effectiveness of many of its agencies.\n\nIn fact, the federal government plans to spend $4.3 billion in artificial intelligence research and\n\ndevelopment across agencies in fiscal year 2023, according to a recent report from Bloomberg\n\nGovernment. These priorities are validated by a recent Gartner study of government CIOs\n\nacross all levels (including state and local), confirming that the top game-changing technologies\n\nare AI, data analytics and the cloud.\n\nAnd as an indication of the potential impact, a recent study by Deloitte shows the government\n\ncan save upward of $3 billion annually on the low end to more than $41 billion annually on the\n\nhigh end from data-driven automation and AI.\n\nSources:\n\n[• Gartner Survey Finds Government CIOs to Focus Technology Investments on Data Analytics and Cybersecurity in 2019](https://www.gartner.com/en/newsroom/press-releases/2019-01-23-gartner-survey-finds-government-cios-to-focus-technol)\n\n[• Administration Projects Agencies Will Spend $1 Billion on Artificial Intelligence Next Year](https://www.nextgov.com/emerging-tech/2019/09/administration-projects-agencies-will-spend-1-billion-artificial-intelligence-next-year/159781/)\n\n\nInvestment in AI to\n\nautomate repetitive tasks\n\ncan improve efficiencies\n\nacross government agencies,\n\nwhich could save **96.7**\n#### million federal hours annually, with a potential\n\nsavings of **$3.3 billion.**\n\n**WILLIAM EGGERS, PETER VIECHNICKI**\n\n**AND DAVID SCHATSKY**\n\n[Deloitte Insights](https://www2.deloitte.com/us/en/insights/focus/cognitive-technologies/artificial-intelligence-government.html)\n\n\n-----\n\n**An increased focus on cloud, analytics and AI = operational efficiency**\n\n1. AI/ML\n2. Data Analytics\n3. Cloud\n\n**$1B** **TOP PRIORITIES** **$41B+**\n\nData and AI Research and Government CIOs’ top Estimated government\nDevelopment Initiative game-changing technologies savings from data-driven\nautomation\n\n**U.S. 
Government**\n\nFortunately, the President’s Management Agenda (PMA) has recognized the need to modernize their existing infrastructure, federate data for easier access and build more advanced data analytics capabilities by establishing mandates for modernization, data openness and the progression of AI innovations.\n\n**IT Modernization Act**\n\nAllows agencies to invest in modern technology solutions to improve service to the public, secure sensitive systems and data, and save taxpayer dollars.\n\n**Federal Data Strategy**\n\nA 10-year vision for how the federal government will accelerate the use of data to achieve its mission, serve the public and steward resources, while protecting security, privacy and confidentiality.\n\n**AI Executive Order**\n\nMakes AI a top research and development priority for federal agencies, provides a shared ethics framework for developing and using AI, and expands job rotation programs to increase the number of AI experts at agencies.\n\nThis will put agencies in a better position to leverage the scale of the cloud and democratize secure access to data in order to enable downstream business intelligence and AI use cases. The end result will be transformative innovation that can not only improve the operational efficiencies of each agency, but also support the delivery of actionable insights in real time for more informed decision-making. This benefits citizens in the form of better services, stronger national security and smarter resource management.\n\n\n-----\n\nTop data and AI use cases in the government\n\nAcross the federal government, data and AI is providing the insights and predictive capabilities to thwart cyberattacks and national threats, provide better social services more efficiently, and improve the delivery and quality of healthcare services.\n\n**H E A LT H C A R E**\n\nImprove the delivery and quality of healthcare services for citizens with powerful analytics and a 360° view of patients.\n\n- Patient 360 - Insurance management\n\n- Population health - Genomics\n\n- Supply chain optimization - Drug discovery and delivery\n\n**H O M E L A N D S E C U R I T Y**\n\nDetect and prevent criminal activities and national threats with real-time analytics and data-driven decision-making.\n\n- Customs and border protection - Counter-terrorism\n\n- Immigration and citizenship - Federal emergency aid management\n\n**D E F E N S E**\n\nApply the power of predictive analytics to geospatial, IoT and surveillance data to improve operations\n\n**E N E R G Y**\n\nImprove energy management with data insights that ensure energy resiliency and sustainability.\n\n- Security of energy infrastructure - Energy exploration\n\n- Smarter energy management - Electrical grid reliability\n\n**C O M M E R C E**\n\nProactively detect anomalies with machine learning to mitigate risk and prevent fraudulent activity.\n\n- Tax fraud and collection - Grants management\n\n- Process and operations management - Customer 360\n\n**I N T E L L I G E N C E C O M M U N I T Y**\n\nLeverage real-time insights to make informed decisions that can impact the safety of our citizens and the world.\n\n- Threat detection - Intelligence surveillance and reconnaissance\n\n- 
Neutralize cyberattacks - Social media analytics\n\n\nand protect the nation.\n\n- Logistics - Surveillance and reconnaissance\n\n- Predictive maintenance - Law enforcement and readiness\n\n\n-----\n\n### Challenges to innovation\n\nThe opportunity to drive innovation throughout the federal government is massive and\n\nhas implications for every U.S. citizen. But there are several critical barriers preventing\n\n\nTen of the existing legacy systems\nmost in need of modernization\ncost about **$337 million a year**\nto operate and maintain.\n\n\nagencies from making the progress needed to realize the value of their data and delivering\n\nthose innovations.\n\n**THE GOVERNMENT ACCOUNTABILITY OFFICE,**\n\n**INFORMATION TECHNOLOGY REPORT TO CONGRESS, JUNE 2019**\n\nThe complexities and impact of legacy data warehouses and marts\n\nMultiple federal agencies are burdened with a legacy IT infrastructure that is being left\n\n\nbehind by the technological advancements seen in the private sector. This infrastructure\n\nis traditionally built with on-premises data warehouses and data marts that are highly\n\ncomplex to maintain, costly to scale as compute is coupled with storage, limited from a\n\ndata science perspective, and they lack support for the growing volumes of unstructured\n\ndata. This inhibits data-driven innovation and blocks the use of AI, leaving agencies to\n\nsearch for data science tools to fill the gaps.\n\nInfrastructure also becomes harder and more expensive to maintain as it ages. Over time,\n\nthese environments become more complex due to their need for specialized patches and\n\nupdates that keep these systems available while doing nothing to solve the issues of poor\n\ninteroperability, ever-decreasing processing speeds, and an inability to scale – all of which\n\nare critically necessary to support today’s more data-intensive use cases. For example,\n\nsystems at the departments of Education, Health and Human Services, Treasury, and Social\n\nSecurity are over 40 years old.¹ This is causing pain in a variety of areas.\n\n\noften requires significant customization and, even then, there is still a chance that the final\n\nintegration won’t be successful. These systems also keep personnel from spending their\n\nenergy and resources on emerging technologies such as AI.\n\nAnd data reliability is a big concern. Replication of data occurs across data marts as\n\nvarious teams try to access and explore it, creating data management and governance\n\nchallenges. Without a single source of truth, teams struggle with data inconsistencies,\n\nwhich can result in inaccurate analysis and model performance that is only compounded\n\nover time.\n\nThankfully, there are initiatives in place, such as the Data Center and Cloud Optimization\n\nInitiative Program Management Office (DCCOI PMO), which are investing in modernizing IT\n\ninfrastructure for federal agencies.²\n\n\nMaintaining these systems requires a massive investment of both time and money\n\ncompared to modern cloud-based systems. 
For the technical teams that are tasked with\n\n\ntrying to integrate any of these legacy systems with third-party tooling or services, this\n\n\n[¹ Agencies Need to Develop Modernization Plans for Critical Legacy Systems](https://www.gao.gov/assets/gao-19-471.pdf)\n\n[² IT Modernization](https://www.gsa.gov/technology/government-it-initiatives/data-center-optimization-initiative-dcoi)\n\n\n-----\n\nData is critical … and complicated\n\nData is both the greatest asset and one of the greatest challenges that federal agencies must\n\nlearn to manage. While the volume and usefulness of data collected by federal agencies are\n\nnot in question, much of it is locked in legacy source systems, comes in diverse structured\n\n\nData silos hamper any data-driven advancements\n\nIn any data-driven organization, the need to have trusted, timely and efficient access to\n\ndata is critical. For the data teams responsible for driving the digital transformation of\n\nfederal agencies, the challenges they face are myriad.\n\n\nand unstructured formats, and is subject to a variety of governance models.\n\nWe have already seen how existing, legacy infrastructure, as well as the integration of\n\n\nNot only is this data siloed and very difficult to integrate, but the data volumes collected\n\nby federal agencies are massive. At Health and Human Services, for example, or the\n\nDepartment of Veterans Affairs, healthcare data sets will be sized by population and include\n\nelectronic health records, clinical data, imaging and more. For the Department of Defense\n\n\nfragmented data sources, will strain data engineering teams trying to deliver high-quality\n\ndata at scale. Their challenge includes developing the right data pipelines that will take\n\nthe massive volumes of raw data coming from fragmented sources into one centralized\n\nlocation with clean, secure and compliant data for agency decision-makers.\n\n\nand the Department of Homeland Security, data includes everything from mapping, satellite\n\n\nData scientists and analysts alike must have the right toolset to collaboratively investigate,\n\nextract and report meaningful insights from this data. Unfortunately, data silos extend\n\nto organizational silos, which make collaboration inside an agency as well as between\n\nagencies very difficult. With different groups of data teams leveraging their own coding\n\nand analytical tools, communicating insights and working across teams — let alone\n\nacross agencies — is almost impossible. This lack of collaboration can drastically limit\n\nthe capabilities of any data analytics or AI initiatives — from the deployment of shared\n\nbusiness intelligence (BI) reports and dashboards for data investigation and decision-\n\nmaking to the training of machine learning models to automate processes and make\n\npredictions. Compounding these challenges is an overall lack of data science expertise and\n\nskills within federal agencies. As a result, even with access to their data, without intuitive\n\ntooling it’s very difficult to deliver advanced analytic use cases with ML and AI.\n\nOrganizational silos also impact the effectiveness of data analysts, who are responsible\n\nfor analyzing and reporting insights from the data to better inform subject-matter experts\n\nor policy — and decision-makers. 
Without a data platform that eliminates these silos and\n\nenables visualization of and reporting on shared data, data analysts will be limited in how\n\nthey are able to drive the organizational and policy agendas of their respective agencies.\n\n\nimagery and intelligence data to payroll and human resources data. The Social Security\n\nAdministration and Internal Revenue Service manage personal data for every single citizen in\n\nthe United States.\n\nCombining these various forms of data from disparate legacy systems that are not\n\nintegrated — and doing it across different government agencies and departments — can be\n\nslow and error prone, hindering downstream analytics and actionable insights. The teams\n\nthat are responsible for this are faced with not only integrating these data sources, but also\n\nmanaging the entire ETL workflow in order to enable the application of basic analytics, let\n\nalone machine learning and AI.\n\n\n-----\n\n**THE DATABRICKS LAKEHOUSE PLATFORM:**\n### Modernizing the federal government to achieve mission objectives\n\n\nDatabricks provides federal agencies with a Lakehouse Platform that combines the best of data warehouses and data\n\nlakes — to store and manage all your data for all your analytics workloads. Databricks federates all data and democratizes\n\naccess for downstream use cases, empowering federal agencies to unlock the full potential of their data to deliver on\n\ntheir mission objectives and better serve citizens.\n\n\nFederal agencies that are\npowering impactful innovations\nwith Databricks Lakehouse\n\n\nLakehouse offers a single solution for all major data workloads, whether structured or unstructured, and supports use\n\n\ncases from streaming analytics to BI, data science and AI.\n\n\nUsing predictive\nanalytics for better\npassenger safety and\nexperience\n\nEnabling operational\nefficiencies through\nprocess automation\nto streamline the path\nto citizenship\n\n\nAll your\ngovernment data\n\n\nReliable, Analytics capabilities\nreal-time processing for every use case\n\nAD HOC\nDATA SCIENCE\n\n\nHealth\n\nSurveillance\n\nSocial Security\n\nDemographics\n\nCrime\n\nAudio/Visual\n\nGeospatial\n\n\nStructured batch\n\nUnstructured stream\n\nStructured batch\n\nStructured batch\n\nUnstructured batch\n\nUnstructured stream\n\nUnstructured stream\n\n\nPRODUCTION\nMACHINE LEARNING\n\n\n**DATA LAKEHOUSE**\n\nProcess, manage\nand query all your data\n\n\nBI REPORTING AND\nSCORECARDING\n\n\nLeveraging advanced\nanalytics to improve\noutcomes for patients\nthrough Medicare and\nMedicaid services\n\n\nThe Databricks Lakehouse Platform has three unique characteristics that address head-on the biggest challenges that\n\nfederal agencies are facing:\n\n\nIt offers simplicity with regard to data\n\nmanagement, in that the Databricks\n\nLakehouse is architected to support all\n\nof an agency’s data workloads on one\n\n\nIt is built on open standards so\n\nthat any existing investments\n\nin tooling or resources can\n\nremain effective\n\n\nAnd it’s collaborative, enabling\n\nagency data engineers, analysts\n\nand data scientists to work\n\ntogether much more easily\n\n\ncommon platform\n\n\n-----\n\nManaging federal data with a unified approach\n\n\nDatabricks enables aggregation and processing of massive collections of diverse and\n\nsensitive agency data that currently exists in silos, both structured and unstructured. 
As\n\nwe’ve seen, for many agencies this would be incredibly difficult with the infrastructure\n\nchallenges they are experiencing. The Databricks Lakehouse leverages Delta Lake to unify\n\n\nBy providing a unified data foundation for business intelligence, data science and machine\n\nlearning, federal agencies can add reliability, performance and quality to existing data lakes\n\nwhile simplifying data engineering and infrastructure management with automation to\n\nsimplify the development and management of data pipelines.\n\n\nthe very large and diverse amounts of data that government agencies are working with.\n\nDelta Lake is an open format, centralized data storage layer that delivers reliability, security\n\nand performance — for both streaming and batch operations.\n\nThe Lakehouse Platform combines the best elements of data lakes and data warehouses — delivering the data management and performance\ntypically found in data warehouses with the low-cost, flexible object stores offered by data lakes\n\n\n-----\n\nBreak down the institutional silos limiting collaboration\n\nFoster collaboration at every step with the latest machine learning tools that allow everyone\n\nto work and build value together — from data scientists to researchers to business\n\ndecision-makers. Close the glaring skills gap within these government organizations by\n\nproviding tooling that simplifies the ML lifecycle and empowers the data teams that do not\n\nhave the data science expertise to still be productive with their data through integrating BI\n\ntools and SQL analytics capabilities.\n\nEmpower data scientists with an intuitive and interactive workspace where they can easily\n\ncollaborate on data, share models and code, and manage the entire machine learning\n\nlifecycle in one place. Databricks notebooks natively support Python, R, SQL and Scala so\n\npractitioners can work together with the languages and libraries of their choice.\n\nDeliver on mission objectives with powerful analytics across agencies\n\nThe Databricks Lakehouse Platform includes a business intelligence capability — Databricks\n\nSQL. Databricks SQL allows data analysts and users to query and run reports against all of\n\nan agency’s unified data. Databricks SQL integrates with BI tools, like Tableau and Microsoft\n\nPower BI, and complements any existing BI tools with a SQL-native interface, allowing data\n\nanalysts and data scientists to query data directly within Databricks.\n\nAdditionally, with Databricks SQL, the data team can turn insights from real-world data into\n\n\npowerful visualizations designed for machine learning. Visualizations can then be turned\n\ninto interactive dashboards to share insights with peers across agencies, policymakers,\n\n\nEasily create visualizations and share dashboards via integrations with BI tools, like Tableau and Microsoft Power BI\n\n\nregulators and decision-makers.\n\n\n-----\n\nEnsure data security and compliance at scale\n\nDatabricks is fully aware of the sensitivity of the data that many of our federal agencies are\n\nresponsible for. From national security and defense data to individual health and financial\n\ninformation to national infrastructure and energy data — all of it is critical. Data is protected\n\nat every level of the platform through deep integration with fine-grained, cloud-provider\n\naccess control mechanisms. The Databricks Lakehouse is a massively secure and scalable\n\nmulticloud platform running millions of machines every day. 
It is independently audited\n\nand compliant with FedRAMP security assessment protocols on the Azure cloud and can\n\nprovide a HIPAA-compliant deployment on both AWS and Azure clouds.\n\nThe platform’s administration capabilities include tools to manage user access, control\n\nspend, audit usage, and analyze activity across every workspace, all while seamlessly\n\nenforcing user and data governance, at any scale.\n\nWith complete AWS accreditation, Databricks runs across all major networks including\n\nGovCloud, SC2S, C2S and commercial; all networks, including public, NIPR, SIPR and JWICS;\n\nand ATOs, including FISMA, IL5, IL6, ICD 503 INT-A and INT-B.\n\n\n-----\n\n**CUSTOMER STORY: U.S. CITIZENSHIP AND IMMIGRATION SERVICES**\n### Streamlining the path to citizenship with data\n\n##### 24x faster\n\nquery\nperformance\n\n\n##### 10 minutes\n\nto process tables\nwith 120 million rows\n\n\n##### 40 million\n\napplications\nprocessed\n\n\nThe U.S. Citizenship and Immigration Services (USCIS) gains actionable insights from\n\ndashboards via Tableau to better understand how to streamline operations and more quickly\n\nprocess immigration and employment applications as well as petitions. Today, their data\n\nanalyst team has over 6,000 Tableau dashboards running — all powered by Databricks.\n\nThe U.S. Citizenship and Immigration Services is the government agency that oversees\n\n\nlawful immigration to the United States. Over the last decade, the volume of immigration-\n\nand citizenship-related applications has skyrocketed across naturalizations, green cards,\n\nemployment authorizations and other categories. With millions of applications and petitions\n\nflooding the USCIS, processing delays were reaching crisis levels — with overall case\n\nprocessing times increasing 91% since FY2014.\n\n\n-----\n\nProcessing delays fueled by on-premises, legacy architecture\n\nCore to these issues was an on-premises, legacy architecture that was complex, slow and\n\ncostly to scale. By migrating to AWS and Databricks, USCIS adopted a unified approach\n\nto data analytics with more big data processing power and the federation of data\n\nacross dozens of disparate sources. This has unlocked operational efficiencies and new\n\n\nA new era of data-driven innovation improves operations\n\nUSCIS now has the ability to understand their data more quickly, which has unlocked new\n\nopportunities for innovation. With Databricks, they are able to run queries in 19 minutes,\n\nsomething that used to take an entire day — a 24x performance gain. This means they are\n\nspending far less time troubleshooting and more time creating value.\n\n\nopportunities for their entire data organization to drive business intelligence and fuel ML\n\ninnovations designed to streamline application and petition processes.\n\nRemoving complexities with a fully managed cloud platform\n\n\nSince migrating to the cloud and integrating Databricks into their data analytics workflows,\n\nUSCIS has been able to make smarter decisions that help streamline processes and\n\nleverage ML to reduce application processing times. 
These newfound efficiencies and\n\ncapabilities have allowed them to scale their data footprint from about 30 data sources to\n\n75 without issue.\n\nDatabricks provided USCIS with significant impact where it mattered most — faster\n\nprocessing speeds that enabled data analysts to deliver timely reports to decision-\n\n\nWe discovered Databricks, and\nthe light bulb really clicked for\nus on what we needed to do\nmoving forward to stay relevant.\n\n\nmakers — and that freed up data scientists to build ML models to help improve operations.\n\nLeveraging the efficiencies of the cloud and Delta Lake, they were able to easily provision a\n\n\n26-node cluster within minutes and ingest tables with 120 million rows into S3 in under 10\n\nminutes. Prior to Databricks, performing the same processes would have taken somewhere\n\n\n**SHAWN BENJAMIN**\n\n**CHIEF OF DATA AND BUSINESS INTELLIGENCE, USCIS**\n\n\nbetween two and three hours.\n\n\n-----\n\n### Conclusion\n\nEnabling federal agencies to take advantage of data analytics and AI will help them execute\n\ntheir missions both effectively and efficiently. The Databricks Lakehouse Platform will unify\n\ndata, analytics and AI workloads, making agencies data-driven and giving policymakers\n\naccess to deeper, more meaningful insights for decision-making. It will also eliminate data\n\nsilos and increase communication and collaboration across agencies to ensure the best\n\nresults for all citizens.\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 5,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M, and over 40% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe. Founded by the original\n\ncreators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems.\n\nGet started with a free trial of Databricks and\nstart building data applications today\n\n**START YOUR FREE TRIAL**\n\nTo learn more, visit us at: **dbricks.co/federal**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Data-AI-in-Fed-Gov-Ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**eBook**\n\n# Cybersecurity in Financial Services\n\n### Protecting financial institutions with advanced analytics and AI\n\n\n-----\n\n## Contents\n\nThe State of the Industry .................................................................................................................................................................................... **03**\n\nA New Commitment to Cybersecurity ....................................................................................................................................................... **04**\n\nThe Biggest Challenge With Security Analytics ..................................................................................................................................... **05**\n\nJourney of SecOps: Destination Lakehouse ............................................................................................................................................ **06**\n\nRethinking Cybersecurity in Financial Services With Databricks Lakehouse ......................................................................... 
**07**\n\nLakehouse in Financial Services ..................................................................................................................................................................... **08**\n\nLakehouse and SIEM: The Pattern for Cloud-Scale Security Operations .................................................................................. **12**\n\nCommon Use Cases ................................................................................................................................................................................................ **14**\n\nGetting Started With Databricks for Cybersecurity ............................................................................................................................. **15**\n\n\n-----\n\n**I N T R O D U C T I O N**\n\n## The State of the Industry\n\n\nCloud, cost and complexity of customer data and cybersecurity are\ntop of mind for every financial services security leader today. As\nfinancial services institutions (FSIs) continue to accelerate their digital\ntransformation, cybercriminals, fraudsters and state-sponsored actors\ncontinue with more sophisticated threats. The impact of these attacks\nranges from the exposure of highly sensitive data to the disruption\nof services and the exploitation of backdoors for future attacks — all\nresulting in both financial and non-financial costs. Responding quickly\nto potential threats requires security tools capable of analyzing billions\nof threat signals in real-time.\n\nRecently, it seems like every week reveals a new data breach or ransomware assault,\nand the cost is skyrocketing: more than $4 million per incident, up 10 percent from\n2020, and about $401 million for a substantial [breach at a large corporation](https://www.ibm.com/security/data-breach) .\n\n\n**Cybersecurity is no longer just a back-office cost and now**\n**poses critical business risks, such as:**\n\n**•** Operational disruption\n\n**•** Material customer loss\n\n**•** Increase in insurance premiums\n\n**•** Lawsuits or fines\n\n**•** Systemic destabilization\n\n**•** Credit downgrade\n\n**•** Reputational damage\n\nSource: Navigating Cyber 2022, FS-ISAC, Annual Cyber Threat Review and Predictions\n\n\n-----\n\n## A New Commitment to Cybersecurity\n\n\nIt comes as no surprise that in recent years FSIs have seen an amplified\ncommitment to cybersecurity. As business leaders look to new solutions, large\nportions of IT budgets are now devoted to leveraging data and AI to thwart\ncyberattacks.\n\nFurthermore, regulators are taking notice of the increased risk of cybersecurity\nthreats. 
Growing geopolitical tensions have also prompted federal agencies such\nas the Cybersecurity and Infrastructure Security Agency and the Federal Bureau\nof Investigation [to warn](https://www.wsj.com/livecoverage/russia-ukraine-latest-news-2022-04-05/card/banks-haven-t-seen-rise-in-cyberattacks-from-russia-yet-p3F5ebzAhTauVjsNx46E) that “tough sanctions imposed on Russia could prompt a\nspate of cyberattacks against critical infrastructure such as banks.” Additionally,\nthe Securities and Exchange Commission released its [2022 Exam Priorities](https://www.sec.gov/news/press-release/2022-57) , which\ninclude information security, and specifically “how firms are safeguarding their\ncustomers’ records and assets from cyber threats, including oversight of thirdparty providers, identification of red flags related to identity theft, response to\nincidents, including to ransomware attacks and management of operational risk in\nlight of ‘a dispersed workforce.’”\n\nHowever, as is often the case, implementing new cybersecurity strategies and\nprocesses is easier said than done.\n\n\n**Cybersecurity needs a transformation**\n**... breaches, cost and complexity are growing**\n\n\n## 100%\nof organizations surveyed have had\nbreaches.\n**The average breach costs $4M**\n\n## 85%\n**will increase their cyber budget**\nnext FY. Cybersecurity industry will\ngrow to $366B by ‘28\n\n\n## 67%\nof organizations were **breached at**\n**least three times** . A mega breach\ncosts $401M.\n\n**Cost, Complexity, Cloud**\n\n- \u0007Hundreds of tools with expanding\nfootprints\n\n- \u0007Data locked in vendor proprietary\ntools\n\n- \u0007Humans compensating for\nanalytical and integration\ndeficiencies\n\n\nIn this eBook, we’ll take a closer look at the challenges associated with replacing\nthe infrastructure of a legacy data analytics system, and how financial institutions\nare solving them with Databricks.\n\n\n-----\n\n## The Biggest Challenge With Security Analytics\n\n\nFor many FSIs, on-premises security incident and event management (SIEM)\ntechnologies have been the go-to solution for threat detection, analysis and\ninvestigations. However, these legacy technologies were built for a world where big\ndata was measured in gigabytes, not today’s terabytes or petabytes. This means\nthat not only are legacy SIEMs unable to scale to today’s data volumes, but they\nare also unable to serve the modern, distributed enterprise.\n\nBy now, the advantages of moving to the cloud are no secret to anyone. For FSIs,\nscalability, simplicity, efficiency and cost are absolutely essential components of\nsuccess. Many within FinServ are looking to cloud computing to make this possible,\nadding detection and response in the cloud to the security team’s responsibility.\n\nBecause legacy SIEMs predate the emergence of cloud, artificial intelligence and\nmachine learning (AI/ML) in the mainstream, they’re unable to address the complex\ndata and AI-driven analytics needed for threat detection, threat hunting, in-stream\nthreat intelligence enrichment, analytical automation and analyst collaboration.\n\nIn other words, legacy SIEMs are no longer suitable for the modern enterprise or\nthe current threat landscape.\n\n\n**Counting the Financial Cost of Legacy SIEMs**\n\nThe financial cost of the continued use of legacy SIEMs continues to rise because\nmost SIEM providers charge their customers based on the volume of data\ningested. 
While some legacy technologies are available in the cloud, they’re either\nnot designed to be cloud-native applications or confined to a single cloud service\nprovider. As a result, security teams have to employ multiple tools for detection,\ninvestigation and response — or pay exorbitant egress charges for data transiting\nfrom one cloud provider to another. This causes operational slowdowns, errors\ndriven by complexity, and inconsistent implementation of security policies.\n\nA lack of support for multiple clouds also means an increase in maintenance\noverhead. Security staff members are often stressed because analysts have to\nlearn different tools for different cloud platforms. For some, it also creates an\nimplicit cloud vendor lock-in, meaning that security teams are unable to support\nmissions because their tools are not portable across multiple cloud providers.\n\nCollectively, these drawbacks to legacy SIEMs result in a much weaker security\nposture for FSIs.\n\n\n-----\n\n## Journey of SecOps: Destination Lakehouse\n\nHow did security analytics get to this point? In the early days, there was a need to aggregate alerts from antiviruses and intrusion detection systems. SIEMs were born, built\non data warehouses, relational databases or NoSQL database management systems. But as incident investigation needs evolved, those data warehouses weren’t able to\nhandle the volume and variety of data, which led to the development of data lakes. Data lakes were cost-effective and scalable but didn’t have strong data governance and\ndata hygiene, earning them the moniker of “data swamps.” Simply integrating the two tech stacks is really complicated because of varying governance models, data silos\nand inconsistent use case support. Fast-forward to today, security teams now need AI/ML at scale in a multicloud world.\n\nWhy choose one or the other? The lakehouse architecture has emerged in recent years to help address these concerns with a single unified architecture for all your threat\ndata, analytics and AI in the cloud. The governance and transactional capabilities of the data warehouse, the scale and flexibility of a data lake, AI/ML from the ground up\nand multicloud native deployments in one platform – this is a modern architecture called the lakehouse (data lake and data warehouse).\n\n**Current Challenges** **Introducing the Data Lakehouse**\n\n\n**Cloud Storage**\nNo support for\nanalytics or\ninvestigations\n\n**SIEMs**\nNo attack chaining.\nPoor for high\ncardinality search.\n\n\n**UBA tools**\nNo historical search,\nblackbox,\nproprietary storage\n\n**No SIEM/Log**\nsolution is\nmulticloud\nnative\n\n\n**Curated Alerts** **Cloud-scale**\n**search**\n\n**ML/AI** **Multicloud**\n\n\n-----\n\n## Rethinking Cybersecurity in Financial Services With Databricks Lakehouse\n\nDatabricks introduced the first data lakehouse platform to the industry, and today over 7,000 customers use it worldwide. With Databricks Lakehouse, FSIs that are ready to\nmodernize their data infrastructure and analytics capabilities for better protection against cyber threats now have one cost-effective solution that addresses the needs of\nall their teams.\n\nThe Databricks Lakehouse Platform combines the best elements of data lakes and data warehouses, delivering the low-cost, flexible object stores offered by data lakes and\nthe data management and performance typically found in data warehouses. 
This unified platform simplifies existing architecture by eliminating the data silos that traditionally\nseparate analytics, data science and ML. It’s built on open source, open data and open standards to maximize flexibility, and its inherent collaborative capabilities accelerate\nthe ability to work across teams and innovate faster. Moreover, because it’s multicloud, it works the same way no matter which cloud provider is used.\n\nETL and Enrichment\n\n**Proof Point**\n\n**Firewall**\n\n**Antivirus**\n\n\n-----\n\n## Lakehouse in Financial Services\n\nBy unifying data with analytics and AI, Lakehouse allows FSIs to easily access all their data for downstream advanced analytics capabilities to support complex security\nuse cases. Lakehouse facilitates collaboration between threat intelligence teams and cyber operations, enables security operations teams to detect advanced threats, and\nreduces human resource burnout through analytical automation and collaboration. Importantly, Lakehouse also accelerates investigations from days to minutes.\n\nAlong with a more modern architecture, the Lakehouse Platform includes Delta Lake, which unifies all security data in a transactional data lake to feed advanced analytics.\nThe analytics and collaboration are done in notebooks, and security teams can use multiple languages — SQL, Python, R and Scala — in the same notebook. This makes\nit easy for security practitioners to explore data and develop advanced analytics and reporting using their favorite methods. Additionally, a separation of compute from\nstorage means performance at scale without impacting overall storage costs.\n\n\n-----\n\n**C A S E S T U D Y**\n\n**When It Comes to Security, Data Is the Best Defense***\n\n**Protecting HSBC’s 40 million customers begins with collecting and processing data from billions**\n**of signals to make previously impossible threat detection possible**\n\nsecurity operation departments, creating an enhanced relationship that results\nin better defenses, insight into the security posture of the organization, and the\nability to respond at the pace of the adversary.\n\n\nThe old way of thinking about security — stronger locks, higher walls — is outdated\nand ineffective. “When defending an organization, too often we just focus heavily\non tools, technology, and reactive scenarios,” said T.J. Campana, managing director\nof global defense and chief technology officer at HSBC, the multinational bank. “But\nthe security business is a data business. And the data always has a story to tell us.”\n\nThe quality of security, he added, is proportional to the information that can be\n\ndistilled from petabytes of data that endlessly flows through company networks.\nThat means “empowering people to get the right insights, in the right way to\nquickly prevent, detect, and respond to threats, wherever and whenever they\noccur,” said George Webster, executive director of global cybersecurity science\nand analytics at HSBC.\n\nIf a big organization is made up of tens of millions of parts that must click together\nseamlessly, security keeps those seals tight. Data gathering, analytical tools, and\nhuman intellect work together as one. This involves fusing the data science and\n\n\nBut working across years of data at petabyte scale is not an easy task, especially\nwhen a long time is measured in minutes and the adversary is constantly working\nagainst you. 
To put this in perspective, the security teams at HSBC intake 10 times\nthe amount of data contained in all of the books in the U.S. Library of Congress\nevery day, and must process months, if not years, of data at a time. That is where\ninnovative design, smart people, and leveraging the right technology come into\nplay. “We have to break the paradigm of the tool being the end goal of defense\nand instead view the tools as an enabler of our people,” said Webster. “It is always\nabout the people,” added Campana.\n\nHSBC turned away from the common security paradigm by leveraging the big data\nprocessing techniques from Azure Databricks. In many ways, their open source\nDelta Lake is the key enabler, with Spark being the engine. Delta Lake allows these\nteams to structure, optimize, and unlock data at scale, while Spark allows multiple\ncomplex programs to seamlessly crunch through the data. This enables HSBC’s\nsecurity teams to constantly evolve their defenses, create new capabilities at\npace, and perform investigations that were previously impossible. When a new\nthreat emerges, the bank doesn’t have the luxury to wait for the security market to\nidentify, respond, and mitigate. Instead, the bank turns to its people and creates\nwhat is needed at breathtaking speed.\n\n\n-----\n\n**C A S E S T U D Y : C O N T I N U E D**\n\n\nIt’s an essential function for HSBC, which needs to continually think about how to\nkeep more than 40 million customers in 64 countries and territories safe. Taken\ntogether, it’s an all-brains-on-deck moment with data and people guiding the\nship. It’s also a tall task for a company as massive and multifaceted as HSBC.\nHeadquartered in the UK, it is one of the largest global banks (total assets: a\nwhopping $2.968 trillion), with operations across Africa, Europe, Asia, and the\nAmericas. It’s also the largest bank in Hong Kong and even prints some of the local\ncurrency, which bears the HSBC name.\n\nThe bank’s cybersecurity approach involves fusing the data science and security\noperation departments, creating an enhanced relationship that results in more\nefficient threat discovery, rapid development of operational use cases and AI\nmodels. This enables the continuous creation of capabilities that stop adversaries\nbefore they even start. “We have to get out of the mindset that security is a walled\ngarden,” said Webster. “We must create truly collaborative environments for our\npeople to enable the business to operate,” said Campana.\n\nStaffing this symbiotic power center will be someone Campana optimistically calls\n“the analyst of the future,” a description that’s both mindset and skillset: threat\nhunter and data scientist.\n\nIn addition, when another organization is hit by cybercrime, HSBC analyzes it\nto understand how it may have responded and then improves its defenses\naccordingly. That’s in contrast to the industry norm; a Ponemon survey revealed\n\n\nthat 47 percent of organizations have not assessed the readiness of their incident\nresponse teams. That means the first time they test their plans will be at the worst\npossible time — in the middle of a cyber attack.\n\nThe proactive approach is a far cry from the old reactive conveyor belt model of\nsecurity when alert tickets were received from tooling and processed in a slow\nand linear way. Today, cross-disciplinary security teams don’t just react; they\ncontinually search for the signals in the noise — tiny aberrations that indicate\nsomething’s not right – and send up red flags in real-time. 
“We’re scanning\nhundreds of billions of signals per day. I cannot wait. We need situational\nawareness right now,” said Campana.\n\nThat increased speed is critical for threat assessment. Information theft may be\nthe most expensive and fastest-rising consequence of cybercrime, but data is not\nthe only target. Core systems are being hacked in a dangerous trend to disrupt\nand destroy. Regulators are also increasingly asking banks for controls in place to\ndetect and preempt financial crimes. That’s where big data tooling like Delta Lake\nand Spark shine, and where it will continually be called on to address the security\nneeds of new initiatives.\n\n“Digital security is about organically adjusting to risks,” said Webster. “It’s a journey\nof continual discovery with one central goal: to protect customers. They want\nthings easy and they want them quick. It’s our job to make sure that it’s secure.”\n\n*This story previously appeared in [WIRED Brand Lab for Databricks](https://www.wired.com/sponsored/story/when-it-comes-to-security-data-is-the-best-defense/) .\n\n\n-----\n\n**Advantages of a Lakehouse**\n\n\n**A cost-efficient upgrade**\n\nDatabricks customers only pay for the data they\nanalyze, not for what they collect. This means that\nsecurity teams can collect any amount of data\nwithout worrying about ingest-based pricing, and\nonly pay for the data that’s actually used for analysis\n— for example, an incident investigation or a data\ncall for an audit. This pricing model enables security\nteams to collect data that was previously out of\nreach, such as netflow data, endpoint detection and\nresponse data, and application and services data.\n\nFurther, Databricks is a fully managed service,\nmeaning that security teams don’t have to\npre-commit to hardware capital expenditures.\nWith no hardware to manage and no big data\nimplementations to maintain, security teams\ncan significantly reduce their management and\nmaintenance costs.\n\n\n**Multicloud**\n\nDatabricks is cloud-native on AWS, Microsoft Azure\nand Google Cloud. This creates freedom for the\nsecurity teams to use whatever cloud provider they\nlike. Additionally, teams can acquire and maintain\noperational consistency across all providers when\nthey have multiple cloud footprints. This enables\nconsistent policy implementation, reduced\ncomplexity for staff and increased efficiency.\n\nAdditionally, Databricks enables faster detection,\ninvestigation and response across the enterprise\nbecause analytics can be reused across the\nmajor cloud providers through a unified platform\nthat centralizes data for easy sharing and fosters\ncollaboration across teams.\n\n\n**Enterprise security and**\n**360° risk management**\n\nThe Lakehouse Platform is easy to set up, manage,\nscale and, most importantly, secure. This is because\nLakehouse easily integrates with existing security\nand management tools, enabling users to extend\ntheir policies for peace of mind and greater control.\n\nWith multicloud management, security admins and\ndata teams get a consistent experience across all\nmajor cloud providers. 
This saves valuable time\nand the resources required to upskill talent on\nproprietary services for data, analytics and AI.\n\nSecurity, risk and compliance leaders are also\nable to give team members a range of security\npermissions that come with thorough audit trails.\nThis allows teams to quickly spin up and wind down\ncollaborative workspaces for any project and to\nmanage use cases from end to end — from enabling\nuser access and controlling spend to auditing usage\nand analyzing activity across every workspace to\nenforce user and data governance.\n\n\n-----\n\n## Lakehouse and SIEM: The Pattern for Cloud-Scale Security Operations\n\n\nAccording to George Webster, head of cybersecurity sciences and analytics at\nHSBC, Lakehouse and SIEM is the pattern for security operations. What does\nit look like? It leverages the strengths of the two components: Lakehouse for\nmulticloud native storage and analytics, SIEM for security operations workflows.\nFor Databricks customers like HSBC, there are two general patterns for this\nintegration that are both underpinned by what Webster calls the cybersecurity\ndata lake with Lakehouse.\n\nIn the first pattern, Lakehouse stores all the data for the maximum retention\nperiod. A subset of the data is then sent to the SIEM and stored for a fraction of\nthe time. This pattern has the advantage of allowing analysts to query near-term\n\n\ndata using the SIEM while having the ability to do historical analysis and more\nsophisticated analytics in Databricks. It also lets them manage any licensing or\nstorage costs for the SIEM deployment.\n\nThe second pattern is to send the highest-volume data sources to Databricks —\nfor example, cloud-native logs, endpoint threat detection and response logs, DNS\ndata and network events. Low-volume data sources such as alerts, e-mail logs\nand vulnerability scan data go to the SIEM. This pattern enables Tier 1 analysts to\nquickly handle high-priority alerts in the SIEM. Threat-hunt teams and investigators\ncan leverage the advanced analytical capabilities of Databricks. This pattern has a\ncost-benefit of offloading processing, ingestion and storage from the SIEM.\n\n\n-----\n\n**Databricks and Splunk:**\n**A Case Study in Cost-Savings**\n\nDatabricks integrates with your preferred SIEM, like\nSplunk, and the Splunk-certified Databricks add-on\ncan be used to meet SOC needs without changing\nthe user interface. This example features a global\nfinancial institution’s security operation, where\nthe organization grew throughput from 25TB per\nday with only 180 days lookback, to 100TB per day\nwith 395 days lookback using the Databricks SIEM\naugmentation. The total cost of ownership savings,\nincluding infrastructure and license costs, saved tens\nof millions (more than $80mn per year) in cloud costs.\n\n\n##### FinServ Security Operations\n\nDatabricks + Splunk **Drastically** Lowered Costs\n\n**CURRENT STATE** **FUTURE OPTION**\n\n100\n\n75\n\n\n**Throughput**\nTB per day\n\n**Lookback**\n**period**\nDays\n\n\n50\n\n\n**100**\n\n\n25\n\n**25**\n\n0\n\nSplunk only Splunk + Databricks\n\n**395**\n\n**180**\n\nSplunk only Splunk + Databricks\n\nTCO savings with Splunk and Databricks vs. 
Splunk only solution: $81M\n\n\n-----\n\n## Common Use Cases\n\nAs FSIs focus on modernizing their data analytics and warehousing capabilities, the Databricks Lakehouse Platform\nbrings a new level of empowerment to FSIs, allowing them to unlock the full potential of their data to deliver on their\nobjectives and better serve their customers.\n\n**Common use cases include:**\n\n\n\n**•** **Threat hunting:** Empower security teams to\nproactively detect and discover advanced\nthreats using months or years of data\n\n**•** **Incident investigation:** Gain complete visibility\nacross network, endpoint, cloud and application\ndata to respond to incidents\n\n**•** **Phishing threat detection:** Uncover social\nengineering attacks that are often used to steal\nuser data, including log-in credentials and credit\ncard numbers\n\n**•** **Supply chain monitoring:** Leverage ML to\nidentify suspicious behavior within your software\nsupply chain\n\n\n\n**•** **Ransomware detection:** Scope the impact\nand spread of ransomware attacks to inform\ncomplete mitigation and remediation\n\n**•** **Credentials-abuse detection:** Identify and\ninvestigate anomalous credential usage across\nyour infrastructure\n\n**•** **Insider-threats detection:** Find and respond\nto malicious threats from people within an\norganization who have inside information about\nsecurity practices, data and computer systems\n\n**•** **Network traffic analysis:** Examine real-time\nnetwork availability and activity to identify\nanomalies, vulnerabilities and malware\n\n\n\n**•** **Analytics automation:** Automatically\ncontextualize and enrich multiple streaming and\nbatch analytics to accelerate analyst workflows\nand decision-making\n\n**•** **Augmenting anti-money laundering practices**\n**(AML):** Using structured and unstructured\ndata to maintain a list of politically exposed\nindividuals, often referred to as PEP, to augment a\nbank’s AML processes. This includes pulling data\nfrom an organization externally (keeping the PEP\nlist up-to-date including out-of-country officials\nand diplomats) as well as internally (including\ncritical personnel, network admins, etc.) who\nneed extra scrutiny.\n\n\n-----\n\n## Getting Started With Databricks for Cybersecurity\n\nGetting up and running on Databricks to address your cybersecurity needs is easy with our Solution\nAccelerators. Databricks Solution Accelerators are highly optimized, fully functional analytics solutions that\nprovide customers with a fast start to solving their data problems.\n\n**•** [Cybersecurity analytics and AI at scale with Splunk and Databricks](https://databricks.com/solutions/accelerators/cybersecurity-analytics-and-ai) : Rapidly detect threats,\ninvestigate the impact and reduce risks with the Databricks add-on for Splunk\n\n**•** [Threat detection at scale with DNS analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html) : Recognize cybercriminals using DNS,\nthreat intelligence feeds and ML\n\nDatabricks Solution Accelerators are free. 
Join the hundreds of Databricks customers using Solution\nAccelerators to drive better outcomes in their businesses.\n\nIf you’d like to learn more about how we are helping financial services institutions securely leverage data and AI,\nplease visit us at [dbricks.co/fiserv](https://databricks.com/solutions/industries/financial-services) or reach out to us at [cybersecurity@databricks.com](mailto:cybersecurity%40databricks.com?subject=) .\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including\n\nComcast, Condé Nast, Acosta and over 40% of the Fortune 500 — rely on the Databricks\n\nLakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San\n\nFrancisco, with offices around the globe. Founded by the original creators of Apache Spark,™\n\nDelta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n#### Get started with a free trial of Databricks and start building data applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n\n###### To learn more, visit us at:\n dbricks.com/fiserv\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-eBook-finServ-cyber.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "**EBOOK**\n\n## Why the Data Lakehouse Is Your Next Data Warehouse\n\n\n-----\n\n### Contents\n\nPreface .......................................................................................................................................................................................................................................... **3**\n\nIntroduction ............................................................................................................................................................................................................................. **4**\n\nOur Approach: The Databricks Lakehouse Platform ................................................................................................................................... **5**\n\nIntroducing Databricks SQL: The Best Data Warehouse Is a Lakehouse ...................................................................................... **6**\n\nWhy Databricks SQL? ............................................................................................................................................................................................... 6\n\nCommon use cases .................................................................................................................................................................................................... 7\n\nThe Inner Workings of the Lakehouse ................................................................................................................................................................... **8**\n\n**PA R T 1 :** Storage layer .............................................................................................................................................................................................. 
8\n\n**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13\n\n**PA R T 3 :** Consumption layer ................................................................................................................................................................................ 19\n\nConclusion ............................................................................................................................................................................................................................. **24**\n\nCustomer Stories ............................................................................................................................................................................................................... **25**\n\n\n-----\n\n### Preface\n\nHistorically, data teams have had to resort to a bifurcated architecture to run traditional\nBI and analytics workloads, copying subsets of the data already stored in their data lake\nto a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\ngovernance inherent in proprietary architectures.\n\nOur customers have asked us to simplify their data architecture. We decided to accelerate\nour investments to do just that.\n\n\nWe introduced [Databricks SQL](https://databricks.com/product/databricks-sql) to simplify and provide data warehousing capabilities and\nfirst-class support for SQL on the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) , for all your existing tools.\nWe use the term “lakehouse” to reflect our customers’ desire to combine the best of data\nwarehouses and data lakes. With the lakehouse, you can now establish one source of truth\nfor all data and enable all workloads from AI to BI on one platform. And we want to provide\nyou with ease-of-use and state-of-the-art performance at the lowest cost.\n\n\n**Reynold Xin**\n\nOriginal Creator of Apache Spark, TM\nCo-founder and Chief Architect,\nDatabricks\n\n\nThis eBook covers how we went back to the drawing board to build Databricks SQL — the\nlast mile of enabling data warehousing capabilities for your existing data lakes — as part of\nthe Databricks Lakehouse Platform.\n\n\n-----\n\n### Introduction\n\n\nMost organizations operate their business with a complex data architecture that\ncombines data warehouses and data lakes. For one thing, data lakes are great\nfor machine learning (ML). They support open formats and a large ecosystem.\nBut data lakes have poor support for business intelligence (BI) and suffer\ncomplex data quality problems. Data warehouses, on the other hand, are great\nfor BI applications. But they have limited support for ML workloads, can’t handle\nnatural language data, large-scale structured data, or raw, video, audio or image\nfiles, and are proprietary systems with only a SQL interface.\n\nAs a result, data is moved around the organization through data pipelines and\nsystems that create a multitude of data silos. A large amount of time is spent\nmaintaining these pipelines and systems rather than creating new value from\ndata, and downstream consumers struggle to get a single source of truth of the\ndata due to the inherent siloing of data that takes place. 
The situation becomes\nvery expensive, and decision-making speed and quality are negatively affected.\n\nUnifying these systems can be transformational in how we think about data.\n\n\n##### The need for simplification\n\nIt is time for a new data architecture that can meet both today’s and tomorrow’s\nneeds. Without any compromise. Advanced analytics and ML are one of the\nmost strategic priorities for data-driven organizations today, and the amount\nof unstructured data is growing exponentially. So it makes sense to position\nthe data lake as the center of the data infrastructure. However, for this to be\nachievable, the data lake needs to adopt the strengths of data warehouses.\n\nThe answer is the [lakehouse](https://databricks.com/blog/2021/05/19/evolution-to-the-data-lakehouse.html) , an open data architecture enabled by a new open\nand standardized system design: one that implements data structure and data\nmanagement features similar to those in a data warehouse, directly on the lowcost storage used for data lakes.\n\n**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**\n\n##### Building the Data Lakehouse\n[Bill Immon, Father of the Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)\n\n\n-----\n\n### Our Approach: The Databricks Lakehouse Platform\n\nOur customers have asked us for simplification. This is why we’ve embarked on\nthis journey to deliver one simple, open and collaborative platform for all your\ndata, AI and BI workloads on your existing data lakes.\n\nThe [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) greatly simplifies data architectures by\ncombining the data management and performance typically found in data\nwarehouses with the low-cost, flexible object stores offered by data lakes.\n\nIt’s built on open source and open standards to maximize flexibility, and lets you\nstore all your data — structured, semi-structured and unstructured — in your\nexisting data lake while still getting the data quality, performance, security and\ngovernance you’d expect from a data warehouse. Data only needs to exist once\nto support all of your data, AI and BI workloads on one common platform\n— establishing one source of truth.\n\nFinally, the Lakehouse Platform provides tailored and collaborative\nexperiences so data engineers, data scientists and analysts can work together\non one common platform across the entire data lifecycle — from ingestion to\nconsumption and the serving of data products — and innovate faster.\n\nLet’s look at how, with the right data structures and data management\ncapabilities in place, we can now deliver data warehouse and analytics\ncapabilities on your lakehouse. 
That’s where Databricks SQL (DB SQL) comes in.\n\n**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n\n\n-----\n\n### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n\n\nDatabricks SQL is a serverless data warehouse on the Databricks Lakehouse\nPlatform that lets you run all your SQL and BI applications at scale with up to 12x\nbetter price/performance, a unified governance model, open formats and APIs,\nand your tools of choice — no vendor lock-in. Reduce resource management\noverhead with serverless compute, and easily ingest, transform and query\nall your data in place to deliver real-time business insights faster. In fact, DB\nSQL now holds the new world record in 100TB TPC-DS, the gold standard\nperformance benchmark for data warehousing.\n\nBuilt on open standards and APIs, the lakehouse provides an open, simplified and\nmulticloud architecture that brings the best of data warehousing and data lakes\ntogether, and integrations with a rich ecosystem for maximum flexibility.\n\n\n##### Why Databricks SQL?\n\nBest Price/Performance\nLower costs, get world-class performance, and eliminate the need to manage,\nconfigure or scale cloud infrastructure with serverless.\n\nBuilt-In Governance\nEstablish one single copy for all your data using open standards, and one unified\ngovernance layer across all data teams using standard SQL.\n\nRich Ecosystem\nUse SQL and any tool like Fivetran, dbt, Power BI or Tableau along with Databricks\nto ingest, transform and query all your data in place.\n\nBreak Down Silos\nEmpower every analyst to access the latest data faster for downstream real-time\nanalytics, and go effortlessly from BI to ML.\n\n**[WATCH A DEMO](https://databricks.com/discover/demos/databricks-sql)**\n\n\n-----\n\n### Common use cases\n\nThousands of customers like [Atlassian](https://www.google.com/search?q=atlassian+databricks+keynote&oq=atlassian+databricks+keynote&aqs=chrome..69i57j69i60j69i65l3j69i60j69i64l2.6409j0j1&sourceid=chrome&ie=UTF-8#:~:text=12%3A26,May%2026%2C%202021) , [SEGA](https://youtu.be/SzeXHcwPDSE) and [Punchh](https://databricks.com/customers-4/punchh) are using Databricks SQL to enable self-served analytics\nfor hundreds of analysts across their organizations, and to build custom data applications to better serve their\ncustomers. Below are some examples of use cases for Databricks SQL.\n\n**At Atlassian, we have proven**\n\n\n**Query data lake data with** **Collaboratively explore** **Build rich and custom**\n**your BI tools of choice** **the freshest data** **data applications**\n\n\n**that there is no longer a need**\n\n**for two separate data things.**\n\n**Technology has advanced**\n\n**far enough for us to consider**\n\n**one single unified lakehouse**\n\n**architecture.**\n\n**Rohan Dhupelia**\nData Platform Senior Manager,\nAtlassian\n\n\nEnable business analysts to\ndirectly query data lake data\nusing their favorite BI tool and\navoid data silos. Reengineered\nand optimized connectors\nensure fast performance,\nlow latency and high user\nconcurrency to your data lake.\nNow analysts can use the best\ntool for the job on one single\nsource of truth for your data.\n\n\nEmpower every analyst and SQL\nprofessional in your organization\nto quickly find and share new\ninsights by providing them with\na collaborative and self-served\nanalytics experience. 
Confidently\nmanage data permissions with\nfine-grained governance, share and\nreuse queries, and quickly analyze\nand share results using interactive\nvisualizations and dashboards.\n\n\nBuild more effective and\ntailored data applications\nfor your own organization or\nyour customers. Benefit from\nthe ease of connectivity,\nmanagement and better price/\nperformance of DB SQL to\nsimplify development of dataenhanced applications at scale,\nall served from your data lake.\n\n\n-----\n\n### The Inner Workings of the Lakehouse\n\n\nIn the next chapter, we’ll unpack the three foundational layers of the Databricks\nLakehouse Platform and how we went back to the drawing board to build this\nexperience. Specifically, we’ll dive into how we built Databricks SQL to deliver\nanalytics and data warehousing workloads on your lakehouse.\n\n\nThose layers are:\n\n**1 .** The storage layer, or how we store and govern data\n\n**2 .** The compute layer, or how we process queries\n\n**3 .** The consumption layer, or the tools you can use to interface with the system\n\n\n###### PART 1: STORAGE LAYER\n\nIn order to bring the best of data lakes and data\nwarehouses, we needed to support the openness\nand flexibility of data lakes, as well as the quality,\nperformance and governance you’d expect from a\ndata warehouse.\n\n\n**Storage layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake Open format|Data Warehouse Closed, proprietary format|Data Lakehouse Open format|\n|---|---|---|\n|Low quality, “data swamp”|High-quality, reliable data|High-quality, reliable data|\n|File-level access control|Fine-grained governance (tables row/columnar level)|Fine-grained governance (tables row/columnar level)|\n|All data types|Structured only|All data types|\n|Requires manually specifying how to lay out data|Automatically lays out data to query efficiently|Automatically lays out data to query efficiently|\n\n\n-----\n\n##### Transactional guarantees for your data lake\n\n\nThe open source format [Delta Lake](https://delta.io/) — based on Parquet — solves historical data\nlake challenges around data quality and reliability. It is the foundation for the\nlakehouse, and Databricks SQL stores and processes data using Delta Lake.\n\nFor example, it provides ACID transactions to ensure that every operation either\nfully succeeds or fully aborts for later retries — without requiring new data\npipelines to be created. It unifies batch and streaming pipelines so you can\neasily merge existing and new data at the speed required for your business. With\nTime Travel, Delta Lake automatically records all past transactions, so it’s easy\nto access and use previous versions of your data for compliance needs or for\nML applications. Advanced indexing, caching and auto-tuning allow optimization\nof Delta tables for the best query performance. Delta Lake also acts as the\nfoundation for fine-grained, role-based access controls on the lakehouse.\n\nAs a result, Delta Lake allows you to treat tables in Databricks SQL just like you\ntreat tables in a database: updates, inserts and merges can take place with high\nperformance at the row level. This is particularly useful if you are inserting new\n\n\ndata rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n(e.g., for compliance laws such as GDPR). 
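To make the row-level operations and Time Travel described above concrete, here is a minimal sketch expressed as Spark SQL from a Python notebook. The table and column names (`main.sales.orders`, `orders_updates`, `customer_id`) are illustrative placeholders, not taken from this eBook, and `spark` is the session a Databricks notebook provides.

```python
# Minimal sketch of Delta Lake DML and Time Travel; all names are placeholders.

# Upsert: merge a batch of new or changed rows into an existing Delta table.
spark.sql("""
    MERGE INTO main.sales.orders AS t
    USING orders_updates AS s
    ON t.order_id = s.order_id
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
""")

# Time Travel: read the table as of an earlier version, e.g., for an audit
# or to reproduce a training set.
previous_snapshot = spark.sql(
    "SELECT * FROM main.sales.orders VERSION AS OF 42"
)

# Row-level redaction (e.g., GDPR): delete one customer's records in place.
spark.sql("DELETE FROM main.sales.orders WHERE customer_id = 'C-123'")
```

Because each statement is an ACID transaction on the same Delta table, the upsert, the historical read and the redaction can run side by side without coordinating separate pipelines.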
Furthermore, Delta Lake provides you\nwith one open and standard format — not only for SQL but also for Python, Scala\nand other languages — so you can run all analytical and ML use cases on the\nsame data.\n\n**Delta Lake provides the key**\n\nAn open format storage layer built for lake-first architecture\n\nACID transactions, Time Travel, highly available\n\nAdvanced indexing, caching, auto-tuning\n\nFine-grained, role-based access controls\n\nStreaming & batch, analytics & ML\n\nPython, SQL, R, Scala\n\nDelta Lake brings data quality, performance and governance to the lakehouse\n\n**[DOWNLOAD NOW](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)**\n##### Delta Lake: The Definitive Guide\n[by O’Reilly](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)\n\n\n-----\n\n##### A framework for building a curated data lake\n\n\nWith the ability to ingest petabytes of data with auto-evolving schemas, Delta\nLake helps turn raw data into actionable data by incrementally and efficiently\nprocessing data as it arrives from files or streaming sources like Kafka, Kinesis,\nEvent Hubs, DBMS and NoSQL. It can also automatically and efficiently track data\nas it arrives with no manual intervention, as well as infer schema, detect column\nchanges for structured and unstructured data formats, and prevent data loss by\nrescuing data columns that don’t meet data quality specifications. And now with\n[Partner Connect](https://www.databricks.com/partnerconnect) , it’s never been easier to bring in critical business data from\nvarious sources.\n\nAs you refine the data, you can add more structure to it. Databricks recommends\nthe Bronze, Silver and Gold pattern. It lets you easily merge and transform new\nand existing data — in batch or streaming — while benefiting from the low-cost,\nflexible object storage offered by data lakes. Bronze is the initial landing zone\nfor the pipeline. We recommend copying data that’s as close to its raw form as\npossible to easily replay the whole pipeline from the beginning, if needed. Silver\nis where the raw data gets cleansed (think data quality checks), transformed\nand potentially enriched with external data sets. Gold is the production-grade\ndata that your entire company can rely on for business intelligence, descriptive\nstatistics, and data science/machine learning.\n\n\nBy the time you get to Gold, the tables are high-value business-level metrics\nthat have all the schema enforcement and constraints applied. This way, you can\nretain the flexibility of the data lake at the Bronze and Silver levels, and then use\nthe Gold level for high-quality business data.\n\nAuto Loader\n\n\nBRONZE\n\n\nSILVER GOLD\n\n\nStructured Streaming\n\nBatch\n\nCOPY INTO\n\nPartners\n\n\nRaw ingestion Filtered, cleaned Business-level\nand history and augmented aggregates\n\n|Col1|Col2|\n|---|---|\n||R|\n\n\n**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**\n\n\n-----\n\n##### An aside on batch and streaming data pipelines\n\n\nThe best way to set up and run data pipelines in the Bronze/Silver/Gold\npattern recommended on the previous page is in Delta Live Tables (DLT).\nDLT makes it easy to build and manage reliable batch and streaming\ndata pipelines that deliver high-quality data. 
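As a sketch of what the Bronze-to-Silver portion of that pattern can look like in Delta Live Tables, the Python below declares a Bronze table ingested with Auto Loader and a Silver table guarded by a simple expectation. The cloud path, table names and expectation are assumptions for illustration, and this code runs inside a DLT pipeline rather than a plain notebook.

```python
# Sketch of a Bronze/Silver Delta Live Tables pipeline; the source path, table
# names and data-quality expectation are illustrative placeholders.
import dlt
from pyspark.sql import functions as F


@dlt.table(comment="Raw events landed as close to source form as possible (Bronze).")
def bronze_events():
    return (
        spark.readStream.format("cloudFiles")          # Auto Loader
        .option("cloudFiles.format", "json")
        .load("s3://example-bucket/raw/events/")       # hypothetical landing zone
    )


@dlt.table(comment="Cleansed and lightly enriched events (Silver).")
@dlt.expect_or_drop("valid_event_id", "event_id IS NOT NULL")
def silver_events():
    return (
        dlt.read_stream("bronze_events")
        .withColumn("ingested_at", F.current_timestamp())
    )
```

A Gold table would typically follow the same shape, aggregating Silver into the business-level metrics the rest of the company consumes.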
It helps data engineering\nteams simplify ETL development and management with declarative\npipeline development, automatic data testing, and deep visibility for\nmonitoring and recovery.\n\nThe fact that you can run all your batch and streaming pipelines together\nin one simple, declarative framework makes data engineering easy on the\nDatabricks Lakehouse Platform. We regularly talk to customers who have\nbeen able to reduce pipeline development time from weeks — or months\n— to mere minutes with Delta Live Tables. And by the way, even data\n\n\nanalysts can easily interrogate DLT pipelines for the queries they need\nto run, without knowing any sort of specialized programming language\nor niche skills.\n\nOne of the top benefits of DLT, and Delta Lake in general, is that it is built\nwith streaming pipelines in mind. Today, the world operates in real time, and\nbusinesses are increasingly expected to analyze and respond to their data in\nreal time. With streaming data pipelines built on DLT, analysts can easily access,\nquery and analyze data with greater accuracy and actionability than with\nconventional batch processing. Delta Live Tables makes real-time analytics a\nreality for our customers.\n\n\n-----\n\n##### Fine-grained governance on the lakehouse\n\nDelta Lake is the foundation for open and secure [data sharing](https://databricks.com/blog/2021/05/26/introducing-delta-sharing-an-open-protocol-for-secure-data-sharing.html) and governance\non the lakehouse. It underpins the [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (in preview), which\nprovides fine-grained governance across clouds, data and ML assets. Among the\nbenefits of the Unity Catalog, it allows you to:\n\n**• Discover, audit and govern data assets in one place:** A user-friendly\ninterface, automated data lineage across tables, columns, notebooks,\nworkflows and dashboards, role-based security policies, table or\ncolumn-level tags, and central auditing capabilities make it easy for\ndata stewards to discover, manage and secure data access to meet\ncompliance and privacy needs directly on the lakehouse.\n\n\n\n**• Grant and manage permissions using SQL:** Unity Catalog brings finegrained centralized governance to data assets across clouds through the\nopen standard SQL DCL. This means database administrators can easily\ngrant permission to arbitrary, user-specific views, or set permissions on\nall columns tagged together, using familiar SQL.\n\n**• Centrally manage and audit shared data across organizations:** Every\norganization needs to share data with customers, partners and suppliers\nto better collaborate and to unlock value from their data. Unity Catalog\nbuilds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\nshared assets within and across organizations.\n\n\nThe Unity Catalog makes it easy for data stewards to discover, manage and secure data access\nto meet compliance and privacy needs on the lakehouse.\n\n**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n\n\n-----\n\n###### PART 2: COMPUTE LAYER\n\n\nThe next layer to look at is the compute layer, or how we process queries.\n\nApache Spark TM has been the de facto standard for data lake compute. 
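(An aside on the governance bullets above: the SQL DCL grants Unity Catalog supports look roughly like the following minimal sketch, issued here through `spark.sql`. The catalog, schema, table and group names are placeholders, not objects from this eBook.)

```python
# Minimal sketch of Unity Catalog governance with standard SQL DCL.
# Catalog/schema/table and principal names are illustrative placeholders.

# Let an analyst group read a Gold table...
spark.sql("GRANT SELECT ON TABLE main.gold.daily_revenue TO `data-analysts`")

# ...allow them to browse the schema it lives in...
spark.sql("GRANT USE SCHEMA ON SCHEMA main.gold TO `data-analysts`")

# ...and review what access is currently in place.
spark.sql("SHOW GRANTS ON TABLE main.gold.daily_revenue").show()
```

With that governance aside covered, back to the compute layer and Spark.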
It’s great\nfor processing terabytes and petabytes of data cheaply, but historically Spark\nSQL uses a nonstandard syntax and can be difficult to configure.\n\n\nData warehouses, on the other hand, tend to support short running queries\nreally well, especially when you have a lot of users issuing queries concurrently.\nThey tend to be easier to set up, but don’t necessarily scale or they become\ntoo costly.\n\n\n**Compute layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake High performance for large jobs (TBs to PBs)|Data Warehouse High concurrency|Data Lakehouse High performance for large jobs (TBs to PBs)|\n|---|---|---|\n|Economical|Scaling is exponentially more expensive|Economical|\n|High operational complexity|Ease of use|Ease of use|\n||||\n\n\nA popular belief is that large workloads require a drastically different system\nthan low latency, high concurrency workloads. For example, there’s the classic\ntrade-off in computer systems between latency and throughput.\n\nBut after spending a lot of time analyzing these systems, we found that it was\npossible to simultaneously improve large query performance and concurrency\n\n\nand latency. Although the classic trade-offs definitely existed, they were only\nexplicit when we optimized the system to the very theoretical optimal. It turned\nout the vast majority of software — and this includes all data warehouse systems\nand Databricks — were far away from optimal.\n\n\n-----\n\n##### Simplified administration and instant, elastic SQL compute — decoupled from storage\n\n\nTo achieve world-class performance for analytics on the lakehouse, we chose to\ncompletely rebuild the compute layer. But performance isn’t everything. We also\nwant it to be simple to administer and cheaper to use. Databricks SQL leverages\nserverless SQL warehouses that let you get started in seconds, and it’s powered\nby a new native MPP vectorized engine: Photon.\n\nDatabricks SQL warehouses are optimized and elastic SQL compute resources.\nJust pick the cluster size and Databricks automatically determines the best\ninstance types and VMs configuration for the best price/performance. This\nmeans you don’t have to worry about estimating peak demand or paying too\nmuch by overprovisioning. You just need to click a few buttons to operate.\nTo further streamline the experience, simply use [Databrick SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) .\nWith the serverless capability, queries start rapidly with zero infrastructure\nmanagement or configuration overhead. This lowers your total cost, as you pay\nonly for what you consume without idle time or overprovisioned resources.\n\n\nSince CPU clock speeds have plateaued, we also wanted to find new ways to\nprocess data faster, beyond raw compute power. One of the most impactful\nmethods has been to improve the amount of data that can be processed in\nparallel. However, data processing engines need to be specifically architected to\ntake advantage of this parallelism. So, from the ground up, we built [Photon](https://databricks.com/product/photon) , a new\nC++ based vectorized query processing engine that dramatically improves query\nperformance while remaining fully compatible with open Spark APIs. Databricks\nSQL warehouses are powered by Photon, which seamlessly coordinates work and\nresources and transparently accelerates portions of your SQL queries directly on\nyour data lake. 
No need to move the data to a data warehouse.\n\n**[READ NOW](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)**\n##### Photon: A Fast Query Engine for Lakehouse Systems\n\n[SIGMOD 2022 Best Industry Paper Award](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)\n\n\n-----\n\n**Did you know?**\n\nDatabricks SQL warehouses scale automatically throughout the day to\nbetter suit your business needs. Administration is simplified by identifying\nhow many clusters can scale out with min and max, and Databricks SQL will\nauto-scale as needed. This ensures that you have ample compute to serve\nyour needs, without overprovisioning. Administrators appreciate the ability\nto have better control over consumption costs, while users appreciate that\ntheir queries process as fast and efficiently as possible. For most BI and\nanalytics use cases, using medium-size warehouses with scaling is a great\nbalance of price/performance that fits most business needs.\n\nIn the next section, we will discuss examples of Databricks SQL performance results\non large-scale analytic workloads as well as highly concurrent workloads.\n\n\nRunning Scheduled Starting Cluster Scale\n\n\n-----\n\n##### Large query performance: the fastest data warehouse\n\n\nThe industry standard benchmark used by data warehouses is TPC-DS. It includes\n100 queries that range from very simple to very sophisticated to simulate decision\nsupport workloads. This benchmark was created by a committee formed by\ndata warehousing vendors. The chart at right shows price/performance results\nrunning the 100TB version of TPC-DS, since for large workloads the numbers that\nultimately matter pertain to the performance cost. As you can see, Databricks SQL\noutperforms all cloud data warehouses we have measured.\n\n**[LEARN MORE](https://dbricks.co/benchmark)**\n\n**Did you know?**\n\n\n**$2,000**\n\n**$1,791**\n\n**$1,500**\n\n**$1,000**\n\n**$952**\n\n\n**$500**\n\n\n**$242**\n**$146**\n\n\n**$358**\n\n\n**$0**\nDatabricks SQL Databricks SQL Cloud Data Cloud Data Cloud Data\nSpot On-Demand Warehouse 1 Warehouse 2 Warehouse 3\n\nSystem\n\n100TB TPC-DS price/performance benchmark (lower is better).\n\n\nDatabricks SQL has set a [new world record in](http://tpc.org/5013)\n[100TB TPC-DS](http://tpc.org/5013) , the gold standard performance\nbenchmark for data warehousing. Databricks\nSQL outperformed the previous record by 2.2x.\nAnd this result has been formally audited and\nreviewed by the TPC council.\n\n\n-----\n\n##### Highly concurrent analytics workloads\n\nBeyond large queries, it is also common for highly concurrent analytics workloads\nto execute over small data sets. To optimize concurrency, we used the same\nTPC-DS benchmark, but on a much smaller scale (10GB) and with 32 concurrent\nstreams. We analyzed the results to identify and remove bottlenecks, and\nbuilt hundreds of optimizations to improve concurrency. 
Databricks SQL now\noutperforms some of the best cloud data warehouses for both large queries and\nsmall queries with lots of users.\n\nReal-world workloads, however, are not just about either large or small queries.\nDatabricks SQL also provides intelligent workload management with a dual\nqueuing system and highly parallel reads.\n\n\n16,523\n\n12,248\n\n###### ~3X\n\n4,672\n\n\n11,690\n\n\nJuly 2020\n\n\nJan 2021 Oct 2022\n\n\nCLOUD DW X SQL WAREHOUSE X - L SIZE\n\n10GB TPC-DS queries/hr at 32 concurrent streams (higher is better).\n\n\n-----\n\n##### Intelligent workload management with smart queuing system\n\nReal-world workloads typically include a mix of small and large queries. Therefore\nthe smart queuing and load balancing capabilities of Databricks SQL need to\naccount for that too. Databrick SQL uses a smart dual queuing system (in preview)\nthat prioritizes small queries over large, as analysts typically care more about the\nlatency of short queries than large ones.\n\n\n##### Highly parallel reads with improved I/O performance\n\nIt is common for some tables in a lakehouse to be composed of many files — for\nexample, in streaming scenarios such as IoT ingest when data arrives continuously.\nIn legacy systems, the execution engine can spend far more time listing these\nfiles than actually executing the query. Our customers told us they do not want to\nsacrifice performance for data freshness. With async and highly parallel I/O, when\nexecuting a query, Databricks SQL now automatically reads the next blocks of data\nfrom cloud storage while the current block is being processed. This considerably\nincreases overall query performance on small files (by 12x for 1MB files) and “cold\ndata” (data that is not cached) use cases as well.\n\n**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**\n\n\n-----\n\n###### PART 3: CONSUMPTION LAYER\n\n\nThe third layer of the Databricks Lakehouse Platform would similarly have to bridge\nthe best of both data lakes and data warehouses. In the lakehouse, you would\nhave to be able to work seamlessly with your tools of choice — whether you are a\nbusiness analyst, data scientist, or ML or data engineer.\n\n\nThe lakehouse must treat Python, Scala, R and SQL programming languages\nand ecosystems as first-class citizens to truly unify data engineering, ML and BI\nworkloads in one place.\n\n\n**Consumption layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake Notebooks (great for data scientists)|Data Warehouse Lack of support for data science/ML|Data Lakehouse Notebooks (great for data scientists)|\n|---|---|---|\n|Openness with rich ecosystem (Python, R, Scala)|Limited to SQL only|Openness with rich ecosystem (Python, R, Scala)|\n|BI/SQL not 1st-class citizen|BI/SQL 1st-class citizen|BI/SQL 1st-class citizen|\n||||\n\n\n-----\n\n##### A platform for your tools of choice\n\n\nAt Databricks we believe strongly in open platforms and meeting our customers where they are. We work very\nclosely with a large number of software vendors to make sure you can easily use your tools of choice\non Databricks, like [Tableau](https://databricks.com/blog/2021/05/07/improved-tableau-databricks-connector-with-azure-ad-authentication-support.html) , [Power BI](https://databricks.com/blog/2021/02/26/announcing-general-availability-ga-of-the-power-bi-connector-for-databricks.html) or [dbt](https://databricks.com/blog/2021/12/06/deploying-dbt-on-databricks-just-got-even-simpler.html) . 
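Alongside those BI tools, the same SQL warehouses are reachable from any programmatic client. Below is a minimal, hedged sketch using the open source Databricks SQL Connector for Python; the hostname, HTTP path and token are placeholders you would take from your own workspace and warehouse.

```python
# Minimal sketch using the Databricks SQL Connector for Python
# (pip install databricks-sql-connector). All connection values are placeholders.
from databricks import sql

with sql.connect(
    server_hostname="adb-1234567890123456.7.azuredatabricks.net",  # placeholder
    http_path="/sql/1.0/warehouses/0123456789abcdef",              # placeholder
    access_token="<personal-access-token>",                        # placeholder
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT current_date() AS today")
        for row in cursor.fetchall():
            print(row)
```

The warehouse hostname and HTTP path used here are typically the same connection details that Tableau, Power BI or dbt point at, so scripts and services share the identical entry point as interactive BI.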
With [Partner Connect](https://www.databricks.com/partnerconnect), it’s easier than ever to connect with\nyour favorite tools, easier to get data in, easier to authenticate using single sign-on, and of course, with all the\nconcurrency and performance improvements, we make sure that the direct and live query experience is great.\n\n\n**Now more than ever, organizations need a data strategy that enables speed and agility to be adaptable. As organizations are rapidly moving their data to the cloud, we’re seeing growing interest in doing analytics on the data lake. The introduction of Databricks SQL delivers an entirely new experience for customers to tap into insights from massive volumes of data with the performance, reliability and scale they need. We’re proud to partner with Databricks to bring that opportunity to life.**\n\n**Francois Ajenstat**\nChief Product Officer, Tableau\n\n\n[Ecosystem graphic: partner BI and SQL tools, plus any other Apache Spark-compatible client]\n\n\n-----\n\n##### Faster BI results retrieval with Cloud Fetch\n\nOnce query results are computed, cloud data warehouses often collect and\nstream back results to BI clients on a single thread. This can create a bottleneck\nand greatly slows down the experience if you are fetching anything more than a\nfew megabytes of results. To provide analysts with the best experience\nfrom their favorite BI tools, we also needed to speed up how the system delivers\nresults to BI tools like Power BI or Tableau once computed.\n\nThat’s why we’ve reimagined this approach with a new architecture called\n[Cloud Fetch](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html). For large results, Databricks SQL now writes results in parallel across\nall of the compute nodes to cloud storage, and then sends the list of files using\npre-signed URLs back to the client. The client can then download all the data\nfrom cloud storage in parallel. This approach provides up to 10x performance\nimprovement in real-world scenarios.\n\n\n[Diagram: with Cloud Fetch, the SQL endpoint cluster writes results to cloud storage and the BI client retrieves them via parallel data transfers (customer benchmark: Tableau extract)]\n\n\nCloud Fetch enables faster, higher-bandwidth connectivity to and from your BI tools.\n**[LEARN MORE](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html)**\n\n\n-----\n\n##### A first-class SQL development experience\n\nIn addition to supporting your favorite tools, we\nare also focused on providing a native first-class\nSQL development experience. We’ve talked to\nhundreds of analysts using various SQL editors\nlike SQL Workbench every day, and worked with\nthem to provide the dream set of capabilities\nfor SQL development.\n\nFor example, Databricks SQL now supports\n[standard ANSI SQL](https://databricks.com/blog/2021/11/16/evolution-of-the-sql-language-at-databricks-ansi-standard-by-default-and-easier-migrations-from-data-warehouses.html), so you don’t need to learn a\nspecial SQL dialect. 
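For a concrete sense of that standard ANSI SQL experience, here is a minimal sketch of querying a Databricks SQL warehouse from Python with the open source `databricks-sql-connector` package; the hostname, HTTP path, access token and table name below are placeholders you would replace with your own.\n\n```python\n# Minimal sketch: run standard ANSI SQL against a Databricks SQL warehouse from Python.\n# Requires `pip install databricks-sql-connector`; all connection details below are placeholders.\nfrom databricks import sql\n\nwith sql.connect(\n    server_hostname='<workspace-host>.cloud.databricks.com',   # placeholder\n    http_path='/sql/1.0/warehouses/<warehouse-id>',             # placeholder\n    access_token='<personal-access-token>',                     # placeholder\n) as connection:\n    with connection.cursor() as cursor:\n        # Plain ANSI SQL -- no proprietary dialect required.\n        cursor.execute(\n            '''\n            SELECT order_date, SUM(amount) AS total_sales\n            FROM samples.demo.orders   -- placeholder table\n            GROUP BY order_date\n            ORDER BY order_date\n            '''\n        )\n        for row in cursor.fetchall():\n            print(row)\n```\n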
Query tabs allow you to work\non multiple queries at once, autosave gives you\npeace of mind so you never have to worry about\nlosing your drafts, integrated history lets you\neasily look at what you have run in the past, and\nintelligent auto-complete understands subqueries\nand aliases for a delightful experience.\n\n\nThe built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n\n\n-----\n\nFinally, with Databricks SQL, analysts can easily\nmake sense of query results through a wide variety\nof rich visualizations and quickly build dashboards\nwith an intuitive drag-and-drop interface. To keep\neveryone current, dashboards can be shared and\nconfigured to automatically refresh, as well as to\nalert the team to meaningful changes in the data.\n\n\nEasily combine visualizations to build rich dashboards that can be shared with stakeholders.\n\n\n-----\n\n### Conclusion\n\nDatabricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\ninto actionable data, combining the flexibility and openness of data lakes\nwith the reliability and performance of data warehouses. The Unity Catalog\nprovides fine-grained governance on the lakehouse across all clouds using\none friendly interface and standard SQL.\n\nDatabricks SQL also holds the [new world record in 100TB TPC-DS](https://dbricks.co/benchmark) , the gold\nstandard performance benchmark for data warehousing. It is powered by\nPhoton, the new vectorized query engine for the lakehouse, and by SQL\nwarehouses for instant, elastic compute decoupled from storage.\n\nFinally, Databricks SQL offers a native first-class SQL development\nexperience, with a built-in SQL editor, rich visualizations and dashboards,\nand integrates seamlessly with your favorite BI- and SQL-based tools for\nmaximum productivity.\n\n\nDatabricks SQL under the hood.\n\n\n-----\n\n### Atlassian\n\n\nAtlassian is a leading provider of collaboration, development and issue-tracking\n\nsoftware for teams. With over 150,000 global customers (including 85 of the Fortune\n\n100), Atlassian is advancing the power of collaboration with products including Jira,\n\nConfluence, Bitbucket, Trello and more.\n\nUSE CASE\n\nAtlassian uses the Databricks Lakehouse Platform to democratize data across the enterprise and drive\ndown operational costs. Atlassian currently has a number of use cases focused on putting the\ncustomer experience at the forefront.\n\n**Customer support and service experience**\nWith the majority of their customers being server-based (using products like Jira and Confluence),\nAtlassian set out to move those customers into the cloud to leverage deeper insights that enrich the\ncustomer support experience.\n\n**Marketing personalization**\nThe same insights could also be used to deliver personalized marketing emails to drive\nengagement with new features and products.\n\n**Anti-abuse and fraud detection**\nThey can predict license abuse and fraudulent behavior through anomaly detection and\npredictive analytics.\n\n\n-----\n\nSOLUTION AND BENEFITS\n\nAtlassian is using the Databricks Lakehouse Platform to enable data democratization at scale, both internally\nand externally. They have moved from a data warehousing paradigm to standardization on Databricks,\nenabling the company to become more data driven across the organization. 
Over 3,000 internal users in\nareas ranging from HR and marketing to finance and R&D — more than half the organization — are accessing\ninsights from the platform on a monthly basis via open technologies like Databricks SQL. Atlassian is also\nusing the platform to drive more personalized support and service experiences to their customers.\n\n**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\nfinance, sales, support and R&D\n\n**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n\n**•** MLflow streamlines MLOps for faster delivery\n\n**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n\nWith cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\naccess all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\nAlready the company has:\n\n**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\njobs from EMR to Databricks with minimal effort and low-code change\n\n**•** Decreased delivery time by 30% with shorter dev cycles\n\n**•** Reduced data team dependencies by 70% with more self-service enabled throughout the organization\n\n**[LEARN MORE](https://www.youtube.com/watch?v=Xo1U617T-mU)**\n\n\n**At Atlassian, we need to ensure**\n**teams can collaborate well**\n**across functions to achieve**\n**constantly evolving goals. A**\n**simplified lakehouse architecture**\n**would empower us to ingest high**\n**volumes of user data and run the**\n**analytics necessary to better**\n**predict customer needs and**\n**improve the experience of our**\n**customers. A single, easy-to-use**\n**cloud analytics platform allows**\n**us to rapidly improve and build**\n**new collaboration tools based on**\n**actionable insights.**\n\n**Rohan Dhupelia**\nData Platform Senior Manager, Atlassian\n\n\n-----\n\n### ABN AMRO\n\n\nAs an established bank, ABN AMRO wanted to modernize their business but were hamstrung\n\nby legacy infrastructure and data warehouses that complicated access to data across various\n\nsources and created inefficient data processes and workflows. Today, Azure Databricks\n\nempowers ABN AMRO to democratize data and AI for a team of 500+ empowered engineers,\n\nscientists and analysts who work collaboratively on improving business operations and\n\nintroducing new go-to-market capabilities across the company.\n\nUSE CASE\n\nABN AMRO uses the Databricks Lakehouse Platform to deliver financial services transformation on a global scale,\nproviding automation and insight across operations.\n\n**Personalized finance**\nABN AMRO leverages real-time data and customer insights to provide products and services tailored to\ncustomers’ needs. For example, they use machine learning to power targeted messaging within their automated\nmarketing campaigns to help drive engagement and conversion.\n\n**Risk management**\nUsing data-driven decision-making, they are focused on mitigating risk for both the company and their\ncustomers. For example, they generate reports and dashboards that internal decision makers and leaders use to\nbetter understand risk and keep it from impacting ABN AMRO’s business.\n\n**Fraud detection**\nWith the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\nimpacts their customers. 
Among the activities they’re trying to address are money laundering and fake credit\ncard applications.\n\n\n-----\n\nSOLUTION AND BENEFITS\n\nToday, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\nscientists and analysts who work collaboratively on improving business operations and introducing new\ngo-to-market capabilities across the company.\n\n**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\ndownstream analytics\n\n**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\nthrough reports and dashboards\n\n**•** MLflow speeds deployment of new models that improve the customer experience — with new use\ncases delivered in under two months\n\n\n**Databricks has changed the way**\n**we do business. It has put us in**\n**a better position to succeed in**\n**our data and AI transformation**\n**as a company by enabling data**\n**professionals with advanced data**\n**capabilities in a controlled and**\n**scalable way.**\n\n**Stefan Groot**\nHead of Analytics Engineering,\nABN AMRO\n\n\n#### 10x faster\n\ntime to market — use cases\ndeployed in two months\n\n\n#### 100+ \n\nuse cases to be delivered\nover the coming year\n\n\n#### 500+\n\nempowered business\nand IT users\n\n\n**[LEARN MORE](https://databricks.com/customers/abn-amro)**\n\n\n-----\n\n### SEGA Europe\n\n**Improving the player experience**\n\n# “ is at the heart of everything\n\n**we do, and we very much**\n**see Databricks as a key**\n**partner, supporting us to drive**\n**forward the next generation of**\n**community gaming.**\n\n**Felix Baker**\nData Services Manager, SEGA Europe\n\n\nSEGA® Europe, the worldwide leader in interactive entertainment, is using the Databricks\n\nLakehouse Platform to personalize the player experience and build its own machine\n\nlearning algorithm to help target and tailor games for over 30 million of its customers.\n\nAs housebound gamers looked to pass the time during the first lockdowns of 2020, some SEGA Europe\ntitles, including Football Manager,™ saw over double the number of sales during the first lockdown\ncompared to the year before. Furthermore, a number of SEGA titles experienced a more than 50% increase\nin players over the course of the COVID-19 pandemic. With more anonymized data being collected through\nan analytics pipeline than ever before, the team needed a dedicated computing resource to handle the\nsheer volume of data, extract meaningful insights from it and enable the data science team to improve\ngeneral workflow.\n\n**[LEARN MORE](https://www.youtube.com/watch?v=SzeXHcwPDSE)**\n\n\n-----\n\n### About Databricks\n\nDatabricks is the lakehouse company. More than 7,000 organizations\n\nworldwide — including Comcast, Condé Nast and over 50% of the\n\nFortune 500 — rely on the Databricks Lakehouse Platform to unify their\n\ndata, analytics and AI. Databricks is headquartered in San Francisco,\n\nwith offices around the globe. Founded by the original creators of\n\nApache Spark, TM Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. 
To learn more, follow\n\nDatabricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "# Big Book of Data and AI Use Cases for the Public Sector\n\n### Best practices, customer stories and solution templates for government agencies interested in building on the Lakehouse\n\n\n-----\n\n## Contents\n\nThe State of Data and AI in the Government .......................................................................................... 3\n\nThe Need for a Modern Data Architecture ............................................................................................. 5\n\nIntroducing the Lakehouse for Public Sector ......................................................................................... 6\n\n**U S E C A S E :** Cybersecurity ........................................................................................................................... 9\n\n**U S E C A S E :** Predictive Maintenance .......................................................................................................... 12\n\n**U S E C A S E :** Fraud Detection ....................................................................................................................... 15\n\n**U S E C A S E :** Money Laundering ................................................................................................................. 17\n\n**U S E C A S E :** Entity Analytics ...................................................................................................................... 19\n\n**U S E C A S E :** Geospatial Analytics .............................................................................................................. 21\n\n**U S E C A S E :** Public Health Management .................................................................................................. 24\n\nConclusion ................................................................................................................................................. 26\n\n\n-----\n\n## The State of Data and AI in the Government\n\n###### Over the last decade, data and AI have redefined every industry on the planet. Retailers have improved the shopping experience with personalized recommendations, financial institutions have strengthened risk management through the use of advanced analytics, and the healthcare industry is tapping into the power of machine learning to predict and prevent chronic disease. The public sector is no exception.\n\n\nIn 2018, the U.S. Federal Government embarked on one of its most ambitious\nefforts since putting a man on the moon — embedding data into all aspects of\ndecision-making. By enacting the Evidence-Based Policymaking Act of 2018,\nCongress set in motion requirements for agencies to modernize their data and\nanalytics capabilities, including the appointment of agency-level chief data\nofficers. 
A year later came the Federal Data Strategy, which provided further\nguidance for how agencies should manage and use data by 2030.\n\n\nWith all of this guidance, agencies are starting to make meaningful improvements\nto their data strategy, but when it comes to innovating with data, agencies still\nlag behind the private sector. This begs the question: what’s standing in the way?\nThe hurdles aren’t due to a lack of effort on the part of agency leaders. In fact,\nthey can largely be attributed to a patchwork of legacy technologies that have\nbeen amassed over the last 30 to 40 years. While these hurdles stand in the\nway, a number of innovative agencies are making significant progress as they\nembrace new data and AI capabilities.\n\n\n-----\n\nFederal spending on artificial intelligence rose to [nearly $1 billion](https://www.federaltimes.com/thought-leadership/2021/09/28/why-the-government-market-for-artificial-intelligence-technology-is-expanding/) in 2020, up\n50% from 2018. There’s a good reason for this level of spend: Deloitte recently\npublished a report, “AI-augmented Government,” that estimates the federal\ngovernment could free up as many as 1.2 billion hours of work and save up to\n$41.1 billion annually through the use of AI-driven automation. Early adopters\nof advanced analytics are starting to see the fruits of their labor. For example,\n[USCIS modernized their analytics stack](https://databricks.com/customers/uscis) on Databricks to accelerate insights\non applicants by 24x, automate the processing of millions of applications,\nand reduce appointment no-show rates with predictive analytics. The [Orange County Courts](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/) also recently shared how they are automating legacy paper-based workflows with machine learning.\n\nIn this eBook, we explore the hurdles of legacy technologies and how a modern\ndata lakehouse can help agencies unlock innovative data and analytics use cases\nat all levels of government. Over the following seven example use cases, covering\neverything from cyber threat detection to improving public health,\nwe demonstrate how the Databricks Lakehouse for Public Sector is critical to\nimproving citizen services and delivering on mission objectives. This guide also\nincludes resources in the form of Solution Accelerators, reference architectures\nand real-world customer stories to help as you embark on your own journey to\ndrive a safer and more prosperous nation through the use of data and AI.\n\n\n**An increased focus on cloud, analytics and AI = operational efficiency**\n\n[Infographic, U.S. Government: a $1B Data and AI Research and Development Initiative; Government CIOs’ top game-changing technologies (1. AI/ML, 2. Data Analytics, 3. Cloud); an estimated $41B+ in government savings from data-driven automation]\n\n\n-----\n\n## The Need for a Modern Data Architecture\n\n###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decision-making with predictive insights that can better keep pace with the dynamic needs of citizens and global communities. 
That being said, there are a number of barriers standing in their way.\n\n##### Common challenges\n\n\nMany government agencies are burdened with a legacy IT infrastructure that is\nbuilt with on-premises data warehouses that are complex to maintain, are costly\nto scale as compute is coupled with storage, and lack support for unstructured\ndata and advanced analytics. This severely inhibits data-driven innovation.\nMaintaining these systems requires a massive investment of both time and\nmoney compared to modern cloud-based systems and creates a number of\navoidable challenges:\n\n\ngovernment is often done in weekly or daily batches, but decision-making\nneeds to happen in real time. Critical events like cyber attacks and health\npandemics can’t wait a week.\n\n**Lack of citizen insights**\n\nWhen data is siloed, teams get an incomplete view of the citizen,\nresulting in missed opportunities to improve the delivery of services that\nimpact the quality of life for their constituents.\n\n\n**Lack of reliability**\n\n\nSiloed systems result in data replication as teams spin up new data marts\nto support their one-off use cases. Without a single source of truth, teams\nstruggle with data inconsistencies, which can result in inaccurate analysis\nand model performance that is only compounded over time.\n\n**Lack of agility**\n\nDisjointed analytics tools and legacy infrastructure hinder the ability of\nteams to conduct real-time analytics. Most data processing in the\n\n\n**Lack of productivity**\n\nData scientists and data analysts alike must have the right tool set to\ncollaboratively investigate, extract and report meaningful insights from\ntheir data. Unfortunately, data silos lead to organizational silos, which make\ncollaboration inside an agency as well as between agencies very difficult.\nWith different groups of data teams leveraging their own coding and\nanalytical tools, communicating insights and working across teams —\nlet alone across agencies — is almost impossible. This lack of collaboration\ncan drastically limit the capabilities of any data analytics or AI initiative.\n\n\n-----\n\n## Introducing the Lakehouse for Public Sector\n\n\nThe reason that the Databricks Lakehouse is\nable to deliver the simplicity, flexibility and\nspeed that a government agency requires is\nthat it fundamentally reimagines the modern\ndata architecture. Databricks provides federal,\nstate and local agencies with a cloud-native\nLakehouse Platform that combines the best\nof data warehouses and data lakes — to store\nand manage all your data for all your analytics\nworkloads. With this modern architecture,\nagencies can federate all their data and\ndemocratize access for downstream use\ncases, empowering their teams to deliver on\ntheir mission objectives by unlocking the full\npotential of their data.\n\n\n**Delivering real-time data insight in support of the mission**\n\n- Fraud, Waste & Abuse\n\n- Cybersecurity\n\n- Medicaid Dashboards &\nReporting\n\n- Process Improvement\n\n- Predictive Maintenance\n\n- SCM & Demand Forecasting\n\n- Smart Military/Censor Data\n\n- Military Heatlh\n\n- COVID Response/Decision\nSupport\n\n- Smart Cities/Connected\nVehicles\n\n- Citizen Engagement\n\n- Data-Driven Decision-Making\n\n\n-----\n\n**Federate all of your agency’s data**\n\nAny type of data can be stored because, like a data lake, the Databricks\nLakehouse is built using the low-cost object storage supported by cloud\nproviders. 
Leveraging this capability helps break down the data silos that\nhinder efforts to aggregate data for advanced analytics (e.g., predictive\nmaintenance) or compute-intensive workloads like detecting cyber\nthreats across billions of signals. Probably even more important is the\nability of the lakehouse architecture to travel back in time, ensuring full\naudit compliance and high governance standards for analytics and AI.\n\n**Power real-time decision-making**\n\nStreaming use cases such as IoT analytics or disease spread tracking is\nsimpler to support because the lakehouse uses Apache Spark TM as the\ndata processing engine and Delta Lake as a storage layer. With Spark,\nyou can toggle between batch and streaming workloads with just a line\nof code. With Delta Lake, native support for ACID transactions means\nthat you can deploy streaming workloads without the overhead of\ncommon reliability and performance issues. These capabilities make\nreal-time analytics possible.\n\n\n**Unlock collaborative analytics for all personas**\n\nThe Databricks Lakehouse for Public Sector is your one-stop shop for\nall your analytics and AI. The platform includes a business intelligence\ncapability — Databricks SQL — that empowers data analysts to query and run\nreports against all of an agency’s unified data. Databricks SQL integrates with\nBI tools like Tableau and Microsoft Power BI and complements any existing BI\ntools with a SQL-native interface, allowing data analysts and data scientists\nto query data directly within Databricks and build powerful dashboards.\n\n\n-----\n\n**Deliver on your mission with predictive insights**\nIn the same environment, data scientists can build, share and collaborate\non machine learning models for advanced use cases like fraud detection\nor geospatial analytics. Additionally, MLflow, an open source toolkit for\nmanaging the ML lifecycle, is built into the Lakehouse so data scientists\ncan manage everything in one place. Databricks natively supports Python,\nR, SQL and Scala so practitioners can work together with the languages and\nlibraries of their choice, reducing the need for separate tools. With these\ncapabilities, data teams can turn insights from real-world data into powerful\nvisualizations designed for machine learning. Visualizations can then be\nturned into interactive dashboards to share insights with peers across\nagencies, policymakers, regulators and decision-makers.\n\n\n##### Customers That Innovate With Databricks Lakehouse for Public Sector\n\nSome of the top government agencies in the world turn to the\nDatabricks Lakehouse for Public Sector to bring analytics and AI-driven\nautomation and innovation to the communities they serve.\n\n\n-----\n\n###### USE CASE:\n## Cybersecurity\n\n##### Overview\n\n\n**Limited window of data**\nGiven the high cost of storage, most agencies retain only a few weeks of threat\ndata. This can be a real problem in scenarios where a perpetrator gains access\nto a network but waits months before doing anything malicious. Without a long\nhistorical record, security teams can’t analyze cyberattacks over long-term\nhorizons or conduct deep forensic reviews.\n\n##### Solution overview\n\nFor government agencies that are ready to modernize their security data\ninfrastructure and analyze data at petabyte-scale more cost-effectively,\nDatabricks provides an open lakehouse platform that augments existing SIEMs\nto help democratize access to data for downstream analytics and AI. 
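As a minimal sketch of the earlier point that Spark lets you toggle between batch and streaming workloads with a line of code, assuming a Databricks notebook where `spark` is already defined and a hypothetical Delta table of DNS events:\n\n```python\n# Minimal sketch: the same Delta table read as a batch DataFrame or as a stream.\n# Assumes an active SparkSession (`spark`); `security.dns_events` is a hypothetical table name.\n\n# Batch: query the full history of events.\nbatch_df = spark.read.table('security.dns_events')\n\n# Streaming: continuously process new events as they arrive in the same table.\nstream_df = spark.readStream.table('security.dns_events')\n\nquery = (\n    stream_df.groupBy('domain')\n    .count()\n    .writeStream.outputMode('complete')\n    .format('memory')\n    .queryName('dns_counts')   # results can be queried with SQL while the stream runs\n    .start()\n)\n```\n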
Built\non Apache Spark and Delta Lake, Databricks is optimized to process large\nvolumes of streaming and historic data for real-time threat analysis and incident\nresponse. Security teams can query threat data going years into the past in just\nminutes and build ML models to detect new threat patterns and reduce false\npositives. Additionally, Databricks created a Splunk-certified add-on to augment\nSplunk for Enterprise Security (ES) for cost-efficient log and retention expansion.\n\n\nCyberattacks from bad actors and nation states are a huge and growing threat\nto government agencies. Recent large-scale attacks like the ones on SolarWinds,\nlog4j, Colonial Pipeline and HAFNIUM highlight the sophistication and increasing\nfrequency of broad-reaching cyberattacks. Data breaches cost the federal\ngovernment more than $4 million per incident in 2021 and threaten national\nsecurity. Staying ahead of the next threat requires continuous monitoring of\nsecurity data from an agency’s entire attack surface before, during and after\nan incident.\n\n##### Challenges\n\n**Scaling existing SIEM solutions**\nAgencies looking to expand existing SIEM tools for today’s petabytes of data can\nexpect increased licensing, storage, compute and integration resources resulting\nin tens of millions of dollars in additional costs per year.\n\n**Rules-based systems**\nMany legacy SIEM tools lack the critical analytics capabilities — such as\nadvanced analytics, graph processing and machine learning — needed to detect\nunknown threat patterns or deliver on a broader set of security use cases like\nbehavioral analytics.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator: Detect Criminal](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n[Threats Using DNS Analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n\nDetecting criminals and nation states through DNS analytics. In order to address\ncommon cybersecurity challenges such as deployment complexity, tech\nlimitation and cost, security teams need a real-time data analytics platform that\ncan handle cloud scale, analyze data wherever it is, natively support streaming\nand batch analytics, and have collaborative content development capabilities.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=5BRGqxq4iQw)**\n\n**Fighting Cyber Threats in Real Time**\nSince partnering with Databricks, HSBC has reduced costs, accelerated threat\ndetection and response, and improved their security posture. Not only can\nthey process all of their required data, but they’ve also increased online query\nretention from just days to months at petabyte scale. HSBC is now able to\nexecute 2-3x more threat hunts per analyst.\n\n\n[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n\nDesigned for cloud-scale security operations, the add-on provides Splunk\nanalysts with access to all data stored in the Lakehouse. 
Bidirectional pipelines\nbetween Splunk and Databricks allow agency analysts to integrate directly into\nSplunk visualizations and security workflows.\n\n\n-----\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Predictive Maintenance\n\n##### Overview\n\n\n**Integrating unstructured data**\nEquipment data doesn’t just come in the form of IoT data. Agencies can gather\nrich unstructured signals like audio, visual (e.g., video inspections) and text\n(e.g., maintenance logs). Most legacy data architectures are unable to integrate\nstructured and unstructured data sources.\n\n**Operationalizing machine learning**\nMost agencies lack the advanced analytics tools needed to build models that\ncan predict potential equipment failures. Those that do typically have their\ndata scientists working in a siloed set of tools, resulting in unnecessary data\nreplication and inefficient workflows.\n\n##### Solution overview\n\nThe Databricks Lakehouse is tailor-made for building IoT applications at scale.\nWith Databricks, agencies can easily manage large streaming volumes of small\nfiles, with ACID transaction guarantees and reduced job fails compared to\ntraditional data warehouse architectures. Additionally, the Lakehouse is cloud\nnative and built on Apache Spark, so scaling for petabytes of data is not an issue.\nWith the Lakehouse, agencies can bring together all of their structured and\nunstructured data with a unified set of tooling for data engineering, model building\nand production rollout. With these capabilities, operations teams can quickly\ndetect and act on pending equipment failures before they affect performance.\n\n\nPredictive maintenance is oftentimes associated with the manufacturing sector,\nbut in reality it extends far beyond the factory floor. Consider this for a moment:\nthe U.S. Government operates a fleet of over [640,000 vehicles](https://www.government-fleet.com/301786/federal-vs-state-local-fleets) including public\nbuses, postal delivery trucks, drones, helicopters and jet fighters. Many of these\nvehicles — like multimillion-dollar aircraft — contain sensors that generate\nmassive amounts of data on the use and conditions of various components. And\nit’s not just vehicles. Modern public utilities stream data through connected IoT\ndevices. All of this data can be analyzed to identify the root cause of a failure\nand predict future maintenance, helping to avoid costly repairs and critical\nassets from being out of service.\n\n##### Challenges\n\n**Managing IoT data at scale**\nWith billions of sensors generating information, most data systems are unable to\nhandle the sheer volume of data. 
Before agencies can even start analyzing their\ndata, legacy data warehouse–based tools require preprocessing of data, making\nreal-time analysis impossible.\n\n\n-----\n\n##### How to get started\n\n\n**Solution Accelerator: Predictive Maintenance**\nLearn how to ingest real-time IoT data from field devices, perform complex\ntime series processing in Delta Lake and leverage machine learning to build\npredictive maintenance models.\n\n[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)\n\n[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)\n\n[Part 3: Use ML to predict maintenance](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)\n\n\n[Watch the Demo: Predictive Maintenance on Azure Databricks](https://vimeo.com/580864758/5a5bc42bb9)\n\n##### Customer story\n\n**[LEARN MORE](https://www.tallan.com/blog/client-stories/dc-water/)**\n\n**Protecting the Water Supply for 700,000 Residents**\nUtilizing machine learning for predictive analytics to help stop water main\nbreaks before they occur, potentially saving hundreds of thousands of dollars\nin repairs while reducing service interruption.\n\n\n-----\n\n##### Reference architecture\n\n[Diagram: weather sensor readings (semi-structured), wind turbine telematics (semi-structured) and maintenance logs (unstructured) stream in real time into the Databricks Lakehouse Platform, flowing through Bronze (append raw data), Silver (merge data) and Gold (join streams and analyze data) layers that turn granular readings into enriched and aggregated hourly readings; the output is used to build the predictive maintenance model and power real-time dashboards for optimizing performance]\n\n\n-----\n\n###### USE CASE:\n## Fraud Detection\n\n\n##### Overview\n\nAccording to [McKinsey & Company](https://www.mckinsey.com/~/media/McKinsey/Industries/Public%20Sector/Our%20Insights/Cracking%20down%20on%20government%20fraud%20with%20data%20analytics/Cracking-down-on-government-fraud-with-data-analytics-vF.pdf), more than half of the federal government’s\nmonetary losses to fraud, waste and abuse go undetected and total tens of\nbillions of dollars. Financial fraud comes in many forms, from individuals taking\nadvantage of relief programs to complex networks of criminal organizations\nworking together to falsify medical claims and rebate forms. Investigative teams\nhoping to stay ahead of fraudsters need advanced analytics techniques so they\ncan detect anomalous behavior buried in a sea of data.\n\n##### Challenges\n\n**Lack of machine learning**\nA rules-based approach is not enough. 
Bad actors are getting more and more\nsophisticated in how they take advantage of government programs, necessitating\nan AI-driven approach.\n\n**Unreliable data**\nGetting high-quality, clean data and maintaining a rich feature store is critical\nfor identifying ever-evolving fraud patterns while maintaining a strict record of\nprevious data points.\n\n\n##### Solution overview\n\nThe Databricks Lakehouse enables teams to develop complex ML models with\nhigh governance standards and bridge the gap between data science and\ntechnology to address the challenge of analyzing large volumes of data at scale\n— 40 billion financial transactions a year are made in the United States alone.\nAdditionally, Databricks makes it possible to combine modern AI techniques\nwith the legacy rules-based methods that underpin current approaches to fraud\ndetection all within a common and efficient Spark-based orchestration engine.\n\n##### How to get started\n\n[Solution Accelerator: Fraud Detection](https://databricks.com/blog/2021/01/19/combining-rules-based-and-ai-models-to-combat-financial-fraud.html)\n\nDue to an ever-changing landscape, building a financial fraud detection\nframework often goes beyond just creating a highly accurate machine learning\nmodel. Oftentimes it involves a complex-decision science setup that combines\na rules engine with a need for a robust and scalable machine learning platform.\nIn this example, we show how to build a holistic fraud detection solution on\nDatabricks using data from a financial institution.\n\n\n**Analytics at scale**\nTraining complex ML models with hundreds of features on gigabytes of\nstructured, semi-structured and unstructured data can be impossible without a\nhighly scalable and distributed infrastructure.\n\n\n-----\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=Ca1MMNpBSHM)**\n\n**Identifying Financial Fraud at Scale**\nProcesses hundreds of billions of market events\nper day on the Databricks Lakehouse and uses\nthe power of machine learning to identify illicit\nactivity in near real-time.\n\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Money Laundering\n\n##### Overview\n\n\nApproximately [$300 billion](https://home.treasury.gov/system/files/136/2018NMLRA_12-18.pdf) is laundered through the United States each year,\nand with criminal organizations — both at home and abroad — implementing\nincreasingly sophisticated methods for laundering funds, it’s getting harder to\nstop. While the federal government continues to apply pressure on the financial\nsector through heightened regulation, more is needed to combat laundering.\nModern AI techniques such as graph analytics and computer vision can be\nused to process different types of structured (e.g., financial transactions) and\nunstructured (e.g., real estate images) data and identify illicit behavior. This\nallows investigative teams to automate labor-intensive activities like confirming\na residential address or reviewing transaction histories, and instead dig into\npriority threats.\n\n##### Challenges\n\n**Complex data science**\nModern anti-money laundering (AML) practices require multiple ML capabilities\nsuch as entity resolution, computer vision and graph analytics on entity\nmetadata, which is typically not supported by any one data platform.\n\n\n**Time-consuming false positives**\nAny reported suspicious activity must be investigated manually to ensure\naccuracy. 
Many legacy solutions generate a high number of false positives or fail\nto identify unknown patterns, resulting in wasted effort by investigators.\n\n##### Solution overview\n\nAML solutions face the operational burden of processing billions of transactions\na day. The Databricks Lakehouse Platform combines the low storage cost\nbenefits of cloud data lakes with the robust transaction capabilities of data\nwarehouses, making it the ideal foundation for building AML analytics at massive\nscale. At the core of Databricks is Delta Lake, which can store and combine\nboth unstructured and structured data to build entity relationships; moreover,\nDatabricks Delta Engine provides efficient access using the new Photon compute\nto speed up BI queries on tables spanning billions of transactions. On top of\nthese capabilities, ML is a first-class citizen in the Lakehouse, which means\nanalysts and data scientists do not waste time subsampling or moving data to\nshare dashboards and stay one step ahead of bad actors.\n\n\n**Model transparency**\nAlthough AI can be used to address many money laundering use cases, the lack\nof transparency in the development of ML models offers little explainability,\ninhibiting broader adoption.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator: Modern](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n[Anti-Money Laundering Techniques](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n\n\nLakehouse Platform leveraging a series of next-gen machine learning techniques\nincluding NLP, computer vision, entity resolution and graph analytics. This\napproach helps teams better adapt to the reality of modern laundering practices.\n\n\nCurrent anti-money laundering practices bear little resemblance to those of the\nlast decade. In today’s digital world, financial institutions are processing billions\nof transactions daily, increasing the surface area of money laundering. With this\naccelerator, we demonstrate how to build a scalable AML solution on the\n\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Entity Analytics\n\n##### Overview\n\n\n**No machine learning capabilities**\nEntity resolution typically relies on basic rules-based logic to compare records\n(e.g., matching on name and address), but with messy, large volumes of data,\nadvanced analytics is needed to improve accuracy and accelerate efforts.\n\n##### Solution overview\n\nThe Databricks Lakehouse is an ideal platform for building entity analytics at\nscale. With support for a wide range of data formats and a rich and extensible\nset of data transformation and ML capabilities, Databricks enables agencies to\nbring together all of their data in a central location and move beyond simple\nrules-based methods for entity resolution. Data teams can easily explore\ndifferent machine learning techniques like natural language processing,\nclassification and graph analytics to automate entity matching. And one-click\nprovisioning and deprovisioning of cloud resources makes it easy for teams to\ncost-effectively allocate the necessary compute resources for any size job so\nthey can uncover findings faster.\n\n\nEntity analytics aims to connect disparate data sources to build a full view of\na person or an organization. This has many applications in the public sector,\nsuch as fraud detection, national security and population health. 
For example,\nMedicare fraud teams need to understand which prescriptions are filled, claims\nfiled and facilities visited across geographies to uncover suspicious behavior.\nBefore teams can even look for suspicious behavior, they must first determine\nwhich records are associated. In the United States, nearly 50,000 people share\nthe name John Smith (and there are thousands of others with similar names).\nImagine trying to identify the right John Smith for this type of analysis. That’s no\neasy task.\n\n##### Challenges\n\n**Disjointed data**\nManaging complex and brittle ETL pipelines in order to cleanse and join data\nacross siloed systems and data stores.\n\n\n**Compute intensive**\nIdentifying related entities across population-level data sets requires massive\ncompute power that far outstrips legacy on-prem data architectures.\n\n\n-----\n\n##### How to get started\n\n[Virtual Workshop: Entity Analytics](https://drive.google.com/file/d/1wGGT9Fn5EZF5Rgrabuttt1xdua5csrBa/view?usp=sharing)\n\nLearn from Databricks experts on how entity analytics is being deployed\nin the public sector and watch a demo that shows how to use ML to link\npayments and treatments across millions of records in a public CMS data set.\n\n[Solution Accelerator:](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n[Machine Learning-Based Item Matching](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n\nWhile focused on retail, this accelerator has applications for any organization\nworking on entity matching, especially as it relates to items that might be stored\nacross locations. In this notebook, we demonstrate how to use machine learning\nand the Databricks Lakehouse Platform to resolve differences between product\ndefinitions and descriptions, and determine which items are likely pairs and\nwhich are distinct across disparate data sets.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na21/entity-resolution-using-patient-records-at-cmmi)**\n\nIn this talk, NewWave shares the specifics on CMS’s entity resolution use case,\nthe ML necessary for this data and the unique uses of Databricks in providing\nthis capability.\n\n##### Sample workflow\n\n\n-----\n\n###### USE CASE:\n## Geospatial Analytics\n\n##### Overview\n\n\n**Broad range of analytics capabilities**\nEnterprises require a diverse set of data applications — including SQL-based\nanalytics, real-time monitoring, data science and machine learning — to support\ngeospatial workloads given the diverse nature of the data and use cases.\n\n##### Solution overview\n\nWith Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\nworkloads, as it provides a single source of truth for all types of structured,\nunstructured, streaming and batch data, enabling seamless spatio-temporal\nunification and cross-querying with tabular and raster-based data. Built on\nApache Spark, the Lakehouse easily scales for data sets consisting of billions\nof rows of data with distributed processing in the cloud. To expand on the core\ncapabilities of the Lakehouse, Databricks has introduced the Mosaic library,\nan extension to the Apache Spark framework, built for fast and easy processing\nof large geospatial data sets. 
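As a rough, hypothetical sketch of a point-in-polygon join with Mosaic's Python bindings, assuming the setup call and ST_ function names follow Mosaic's documented interface (verify against current documentation) and using placeholder table and column names:\n\n```python\n# Hypothetical sketch of a point-in-polygon join using Mosaic's ST_ expressions.\n# Assumes Databricks with the databricks-mosaic package installed and an active SparkSession.\nimport mosaic as mos\nfrom pyspark.sql import functions as F\n\nmos.enable_mosaic(spark, dbutils)  # registers Mosaic's spatial functions\n\n# Points: e.g., vehicle GPS pings with longitude/latitude columns (placeholder table).\npoints = spark.table('geo.vehicle_pings').withColumn(\n    'geom', mos.st_point(F.col('longitude'), F.col('latitude'))\n)\n\n# Polygons: e.g., city zones stored as WKT geometries (placeholder table).\nzones = spark.table('geo.city_zones').withColumn(\n    'zone_geom', mos.st_geomfromwkt(F.col('wkt'))\n)\n\n# Keep only the pings that fall inside a zone.\njoined = points.join(zones, mos.st_contains(zones.zone_geom, points.geom))\n```\n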
Popular frameworks such as Apache Sedona or\nGeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\nLakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\nto support all types of geospatial use cases.\n\n\nEvery day billions of handheld and IoT devices, along with thousands of\nairborne and satellite remote sensing platforms, generate hundreds of exabytes\nof location-aware data. This boom of geospatial big data combined with\nadvancements in machine learning is enabling government agencies to develop\nnew capabilities. The potential use cases for geospatial analytics and AI touch\nevery part of the government, including disaster recovery (e.g., flood/earthquake\nmapping), defense and intel (e.g., detecting threats using drone footage),\ninfrastructure (e.g., public transportation planning), civilian safety (e.g., crime\nprediction), public health (e.g., disease spread tracking), and much more. Every\nagency at the state and federal level needs to consider how they can tap into\ngeospatial data.\n\n##### Challenges\n\n**Massive volumes of geospatial data**\nWith the proliferation of low-cost sensor arrays, GPS technologies and highresolution imaging organizations are collecting tens of TBs of geospatial data\ndaily, outpacing their ability to store and process this data at scale.\n\n\n**Compute-intensive spatial workloads**\nGeospatial data is complex in structure, with various formats not well suited for\nlegacy data warehouses, as well as being compute intensive, with geospatialspecific transformations and queries requiring hours and hours of compute.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n[Mosaic for Geospatial Analytics](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n\nBuild a Lakehouse to support all of your geospatial analytics and AI use cases\nwith the Mosaic library. Mosaic provides a number of capabilities including easy\nconversion between common spatial data encodings, constructors to easily\ngenerate new geometries from Spark native data types, many of the OGC SQL\nstandard ST_ functions implemented as Spark Expressions for transforming,\naggregating and joining spatial data sets, and optimizations for performing pointin-polygon joins using an approach we codeveloped with Ordnance Survey —\nall provided with the flexibility of a Scala, SQL or Python API.\n\n[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n\nLearn how to build powerful geospatial insights and visualizations with a\nLakehouse for all your geospatial data processing, analytics and AI.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**\n\n**Analyzing Flight Data to Improve Aviation**\nTo help airlines better serve their millions of passengers, USDOT built a\nmodern analytics architecture on Databricks that incorporates data such as\nweather, flight, aeronautical and surveillance information. 
With this new\nplatform, they reduced compute costs by 90% and can now power use cases\nsuch as predicting air cargo traffic patterns, flight delays and the financial\nimpact of flight cancellations.\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=LP198QMdDbY&t=1070s)**\n\n**Customer Story: Flood Prediction With Machine Learning**\nIn an effort to improve the safety of civil projects, Stantec built a machine\nlearning model on Databricks leveraging large volumes of weather and geological\ndata — oftentimes consisting of trillions of data points — to predict the impact\nof flash floods on various regions and adjust civil planning accordingly.\n\n\n-----\n\n##### Reference architecture\n\nMosaic Kepler Magics\nGeometry Display Functions\nfor Map Display\n\nESRI Java API for\nGeometry Operations\n\n\nBuilt-In Indexing\nSystem Support\n\n\nJTS Java API for\nGeometry Operations\n\n\n-----\n\n###### USE CASE:\n## Public Health Management\n\n##### Overview\n\n\nIn their lifetime, every human is expected to generate a million gigabytes of\nhealth data spanning electronic health records, medical images, claims, wearable\ndata, genomics and more. This data is critical to understanding the health of\nthe individual, but when aggregated and analyzed across large populations,\ngovernment agencies can glean important insights like disease trends, the\nimpact of various treatment guidelines and the effectiveness of resources. By\nadding in [Social Determinants of Health (SDOH)](https://databricks.com/blog/2022/04/18/increasing-healthcare-equity-with-data.html) data — such as geographical\nlocation, income level, education, housing — agencies can better identify\nunderserved communities and the critical factors that contribute to positive\nhealth outcomes.\n\n##### Challenges\n\n**Rapidly growing health data**\nHealthcare data is growing exponentially. Unfortunately, legacy on-premises data\narchitectures are complex to manage and too costly to scale for populationscale analytics.\n\n\n**Complexities of ML in healthcare**\nThe legacy analytics platforms that underpin healthcare lack the robust data\nscience capabilities needed for predictive health use cases like disease risk\nscoring. There’s also the challenge of managing reproducibility, which is critical\nwhen building ML models that can impact patient outcomes.\n\n##### Solution overview\n\nThe Databricks Lakehouse enables public health agencies to bring together all\ntheir research and patient data in a HIPAA-certified environment and marry it\nwith powerful analytics and AI capabilities to deliver real-time and predictive\ninsights at population scale. The Lakehouse eliminates the need for legacy\ndata architectures, which have historically inhibited innovation in patient care\nby creating data silos and making advanced analytics difficult. 
Databricks led\nopen source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\nthat make it easy to ingest and prepare healthcare-specific data modalities for\ndownstream analytics.\n\n\n**Fragmented patient data**\nIt is widely accepted that over 80% of medical data is unstructured, yet most\norganizations still focus their attention on data warehouses designed to only\nsupport structured data and SQL-based analytics.\n\n\n-----\n\n##### How to get started\n\n\n[Solution Accelerator: NLP for Healthcare](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n\nOur joint solutions with John Snow Labs bring together the power of Spark NLP\nfor Healthcare with the collaborative analytics and AI capabilities of Databricks.\nInformatics teams can ingest raw unstructured medical text files into Databricks,\nextract meaningful insights using natural language processing techniques,\nand make the data available for downstream analytics. We have specific NLP\nsolutions for [extracting oncology insights](https://databricks.com/solutions/accelerators/nlp-oncology) from lab reports, automating the\nde-identification of PHI and [identifying adverse drug events](https://databricks.com/blog/2022/01/17/improving-drug-safety-with-adverse-event-detection-using-nlp.html).\n\n[Solution Accelerator: Disease Risk Prediction](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n\nOne of the most powerful tools for identifying patients at risk for a chronic\ncondition is the analysis of real-world data (RWD). This Solution Accelerator\nnotebook provides a template for building a machine learning model that\nassesses the risk of a patient for a given condition within a given window of time\nbased on a patient’s encounter history and demographics information.\n\n\n[Demo: Real-Time COVID-19 Contact Tracing](https://www.youtube.com/watch?v=_ltDF2obiSc)\n\nThe Databricks COVID-19 surveillance solution takes a data-driven approach to\nadaptive response, applying predictive analytics to COVID-19 data sets to\nhelp drive more effective shelter-in-place policies.\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n\n**From Vaccine Management to ICU Planning**\nDuring the pandemic, the Chesapeake Regional Information System for our\nPatients implemented a modern data architecture on Databricks to address\ncritical reporting needs. This allowed them to analyze 400 billion data points\nfor innovative use cases like real-time disease spread tracking, vaccine\ndistribution and prioritizing vulnerable populations.\n\n\n-----\n\n## Conclusion\n\nToday, data is at the core of how government agencies operate and AI is at the\n\nforefront of driving innovation into the future. 
The Databricks Lakehouse for\n\nPublic Sector enables government agencies at the federal, state and local level\n\nto harness the full power of data and analytics to solve strategic challenges and\n\nmake smarter decisions that improve the safety and quality of life of all citizens.\n\nGet started with a free trial of Databricks Lakehouse and start building better\n\ndata applications today.\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\n###### Contact us for a personalized demo databricks.com/contact\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\nunify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\nmission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "###### EBOOK\n\n# Lakehouse for Manufacturing\n\n###### Build a connected customer experience, optimize operations and unify your data ecosystem\n\n\n-----\n\n## Contents\n\nIntroduction .......................................................................................................................... **3**\n\nManufacturing Transformation Trends .............................................................................. **5**\n\nManufacturing Data Challenges ......................................................................................... **9**\n\nDatabricks Lakehouse for Manufacturing ....................................................................... **10**\n\nBuilding Innovative Solutions on the Lakehouse ............................................................. **12**\n\n**SOLUTION:** Part-Level Demand Forecasting ....................................................................... 12\n\n**SOLUTION:** Overall Equipment Effectiveness & KPI Monitoring ............................................. 14\n\n**SOLUTION:** Digital Twins ................................................................................................... 15\n\n**SOLUTION:** Computer Vision ............................................................................................ 16\n\nAn Ecosystem on the Lakehouse for Manufacturing ...................................................... **17**\n\n**SOLUTION:** Avanade Intelligent Manufacturing .................................................................. **18**\n\n**SOLUTION:** DataSentics Quality Inspector ........................................................................ **18**\n\nSOLUTION: Tredence Predictive Supply Risk Management ................................................. **19**\n\nLeading Manufacturing Companies That Choose Us ................................................... **20**\n\n\n-----\n\n## Introduction\n\nMarket conditions in manufacturing are more challenging than ever. Operating margins\nand growth are impacted by the rising cost of labor, materials, energy and transportation, all\npeaking at the same time. 
Disruptive events in the supply chain are increasing in frequency\nand intensity, leading to significant revenue losses and damaged brand reputation.\n\nEffective acquisition and retention of next-generation talent is a considerable issue for\nmanufacturers. There are more jobs in the industry than there are people to do them, further\ncompounding the problem of slower than expected industrial productivity growth over the\nlast 15 years. The industry is also one of the largest consumers of energy, and faces a direct\nchallenge of transforming operations to be more sustainable as governments are prioritizing\nnet-zero policies that require a step change in energy efficiency and transition to low-carbon\nenergy sources.\n\nThe manufacturing industry generates massive amounts of new data every day — estimated\nto be two to four times more in size than in industries such as communications, media,\nretail and financial services. This explosion of data has opened the door for the global\nmanufacturing ecosystem to boost productivity, quality, sustainability and growth beyond\nwhat was previously thought possible.\n\nUnfortunately, legacy data warehouse-based architectures weren’t built for the massive\nvolumes and type of data coming in through today’s factories, products, processes and\nworkers, let alone to support the advanced AI/ML use cases required to meet the customer\nexpectations of shorter lead times, reliable delivery and smarter products.\n\n\n-----\n\nFor that, companies need to adopt a modern data architecture that provides the speed, scale and\ncollaboration needed by broad teams of data engineers, data scientists, and analysts. Manufacturers need\na comprehensive data platform that can not only handle massive volumes of data, but effectively and\nseamlessly operationalize the value from data, analytics and AI.\n\nThis is achieved by:\n\nRemoving data silos by placing all data, regardless of type or frequency, in a single, open\narchitecture — including unstructured data from sensors, telemetry, natural language logs,\nvideos and images — helping you to gain end-to-end visibility into your business\n\nEnsuring your data is “always on” so that the freshest and highest quality data is available for\nall for the full spectrum of enterprise analytics and AI/ML use cases, allowing you to drive ITOT convergence\n\nHaving a comprehensive open architecture so IT and data teams can move with agility\nto bring AI and ML to where it’s needed, when it’s needed, including in connectivityconstrained environments\n\nMaintaining fine-grained governance and access control on your data assets, protecting\n\nsensitive intellectual property and customer data\n\nThe Databricks Lakehouse for Manufacturing does just this. It’s a comprehensive approach that empowers\nteams in the industry to collaborate and innovate around data, analytics and AI. It eliminates the technical\nlimitations of legacy technologies and gives data teams the ability to drive deeper, end-to-end insight\ninto supply chains, automate processes to reduce costs and grow productivity, and achieve sustainable\ntransformation for a more prosperous future. Welcome to the Lakehouse for Manufacturing.\n\n\n-----\n\n## Manufacturing Transformation Trends\n\n\nThe future of manufacturing is smart, sustainable and service oriented. 
Today’s\nforward-thinking leaders are preparing the foundation they need to support that\nfuture by leveraging fast and connected data from all corners of the enterprise.\nThere are four key trends driving transformation in manufacturing:\n\n**Boosting industrial productivity through automation**\n\nA spike in labor costs, as well as the cost of energy and materials, puts significant\npressure on operating margins. At the same time, industrial productivity has\nplateaued — it is at the same level today as it was in the late 2000s. In the face\nof these macro challenges and economic uncertainty, there has never been a\nmore burning need to reduce costs and improve productivity through greater\nvisibility and automation.\n\nThe industry has made strides in collecting data from machines and performing\npredictive analytics on sensor readings, with 47% of manufacturers citing the\nuse of predictive maintenance to reduce operational costs with considerable\nupside ahead.\n\nHowever, there is an entirely different class of unstructured data in the form of\nimages, videos and LiDAR that is opening the door to game-changing automation\nin quality inspection, flow optimization and production scheduling. Historically,\nthese critical processes have depended on manual and visual inspection of\nproducts and operations, which is resource intensive and less accurate than\nML-driven computer vision techniques. This untapped data and capability\nis allowing manufacturers to deliver higher product quality and deliver on\nproduction demands using fewer resources. Andrew Ng, a machine learning\n\n\npioneer, rightly describes the massive opportunity for these technologies in\nhis quote: “It is incumbent on every CEO in any manufacturing or industrial\nautomation company to figure out how to make deep learning technology work\nfor your business.”\n\n**CUSTOMER STORY SPOTLIGHT:**\n##### Corning\n\n#### $2 million in cost avoidance through \n\nmanufacturing upset event reduction\n\n**Driving Better Efficiency in Manufacturing Process With ML**\n\nCorning has been one of the world’s leading innovators in materials science for\n\nnearly 200 years. Delivering high-quality products is a key objective across the\n\ncompany’s manufacturing facilities around the world, and it’s always on a mission\n\nto explore how ML can help deliver on that goal. Databricks has been central\n\nto the company’s digital transformation, as it provides a simplified and unified\n\nplatform where teams can centralize all data and ML work. 
Now, they can train\n\nmodels, register them in MLflow, generate all additional artifacts — like exported\n\nformats — and track them in the same place as the base model.\n\n[LEARN MORE](https://www.databricks.com/blog/2023/01/05/how-corning-built-end-end-ml-databricks-lakehouse-platform.html)\n\n\n-----\n\n**Gaining end-to-end operations and**\n**supply chain visibility**\n\nModern customer expectations are forcing manufacturers to focus on more\ncustomer-centric KPIs: quality, on-time commitments and speed of delivery.\nThat’s not to say that asset and labor efficiency are less important — however,\nwith customer expectations of shorter lead times and more reliable delivery,\nthe success measures in manufacturing are shifting to a mantra of “measure\nwhat your customer values.”\n\nHigh-performing manufacturers that embed this deep into their operational\nplaybook also perform best on productivity and ROIC growth results, as\nevidenced in a recent study by the World Economic Forum and the International\nCentre of Industrial Transformation. The problem? In a post-pandemic world,\noperations and supply chains are persistently constrained, with increasing\ndisruptions, spiraling costs and unpredictable performance. The business\nimpact is considerable — studies have shown that a 30-day disruption can\nreduce EBITDA by 5% and impact annual revenue by as much as 20%.\n\nManufacturing companies need to be able to deliver on customer expectations,\ncommitments and service levels, all while lowering costs and increasing\nproductivity. Manufacturers need an enterprise data platform that can provide\nreal-time visibility into order flows, production processes, supplier performance,\ninventory and logistics execution, breaking down departmental silos to maximize\ncustomer responsiveness, improve manufacturing agility and boost performance.\n\n\n**Transforming your business model through**\n**tech-fueled services**\n\nServitization, defined as the process of building revenue streams from services,\nhas been trending for some time. The adaptation of the business model has\nbeen considerably profitable: on average, services account for ~30% of industrial\nmanufacturing companies but contribute 60%+ of profit.\n\nIn aftersale services, a clear customer preference for business outcome-based\nofferings has emerged in almost every corner of the manufacturing industry.\nThe use of data, analytics and AI is foundational to delivering more personalized\ncustomer outcomes, proactive field service delivery and differentiated missioncritical applications to their customers.\n\nWith greater autonomy, connectivity and sensorization, manufacturers operate\nin a paradigm where their products generate more and more data every second,\nopening up numerous new addressable opportunities for value creation. The\nbusiness of manufacturing is no longer linear, and manufacturers will need to\nreimagine their businesses to go beyond merely providing the primary unit of\nproduction — the next SKU, machine, vehicle or airplane — and leverage this data\nto operate a platform business with higher growth, stickier revenue streams and\ngreater resilience to demand shocks.\n\n\n-----\n\n**CUSTOMER STORY SPOTLIGHT:**\n##### Rolls-Royce\n\n**Aerospace Goes Green With Data and AI**\n\nWhile most people think of luxury cars when they hear “Rolls-Royce,” the\n\nCivil Aerospace branch is its own company, having separated from the car\n\nmanufacturing arm in 1971. 
The now wildly successful manufacturer of commercial\n\nairplane engines is a leader in its industry for innovation. Today, Rolls-Royce\n\n\n_“We employed Databricks to optimize inventory planning using data and analytics,_\n_positioning parts where they need to be, based on the insight we gain from our_\n_connected engines in real time and usage patterns we see in our service network. This_\n_has helped us minimize risks to engine availability, reduce lead times for spare parts_\n_and drive more efficiency in stock turns — all of this enables us to deliver TotalCare,_\n_the aviation industry’s leading Power-by-the-Hour (PBH) maintenance program.”_\n\n**S T U A R T H U G H E S**\n\nChief Information and Digital Officer\nRolls-Royce Civil Aerospace\n\n\nobtains information directly from the airlines’ engines and funnels it into the\n\nDatabricks platform. This gives the company insights into how the engines are\n\nperforming and ways to improve maintenance schedules, translating to less\n\ndowntime, delays, and rerouting — all of which reduce carbon footprint.\n\n[LEARN MORE](https://www.wired.com/sponsored/story/how-tech-is-helping-to-save-the-world/)\n\n\n-----\n\n**Driving a more sustainable approach**\n**to manufacturing**\n\nGlobal efforts on reducing greenhouse gas (GHG)\nemissions are accelerating, with over 70 countries\nrepresenting more than 75% of global emissions\nhaving signed agreements to reach net-zero\nemissions by 2050. Manufacturing-centric sectors\nare critical to achieving net-zero sustainability\ncommitments around the world, as they represent\nover 50% of global energy consumption and\ncontribute to ~25% of global emissions.\n\nThose at the forefront of data, analytics and\nAI are setting science-based targets and are\ndriving favorable sustainability outcomes today\nby deriving better insights from their operations,\nsupply chains and the outcomes that their\nproducts generate for their end customers.\n\n\n**CUSTOMER STORY SPOTLIGHT:**\n##### Shell\n\n**Delivering Innovative Energy Solutions for a Cleaner World**\n\n\nShell has been at the forefront of creating a cleaner tomorrow by investing in digital\n\ntechnologies to tackle climate change and become a net-zero emissions energy\n\nbusiness. Across the business, they are turning to data and AI to improve operational\n\nefficiencies, drive customer engagement, and tap into new innovations like renewable\n\nenergy. Hampered by large volumes of data, Shell chose Databricks to be one of\n\nthe foundational components of its Shell.ai platform. Today, Databricks empowers\n\nhundreds of Shell’s engineers, scientists and analysts to innovate together as part of\n\ntheir ambition to deliver cleaner energy solutions more rapidly and efficiently.\n\n[LEARN MORE](https://www.google.com/url?q=https://www.databricks.com/customers/shell&sa=D&source=editors&ust=1679097620349908&usg=AOvVaw00lb46oTfGRpOREXOI1Ue3)\n\n_“Shell has been undergoing a digital transformation as part of our ambition to deliver more_\n_and cleaner energy solutions. As part of this, we have been investing heavily in our data lake_\n_architecture. Our ambition has been to enable our data teams to rapidly query our massive_\n_data sets in the simplest possible way. The ability to execute rapid queries on petabyte_\n_scale data sets using standard BI tools is a game changer for us. 
Our co-innovation_\n_approach with Databricks has allowed us to influence the product road map, and we are_\n_excited to see this come to market.”_\n\n\n### Millions\nof dollars saved in\npotential engine\nrepair costs\n\ndata team\n### 250\nmembers supporting\n160+ high-value use\ncases\n\nfaster –\n### 9x\n5 minutes to validate\na label, reduced from\n45 minutes\n\n\n**D A N I E L J E AV O N S**\nGeneral Manager – Advanced Analytics CoE\n\nShell\n\n\n-----\n\n## Manufacturing Data Challenges\n\n\n**Massive unstructured/OT data volumes**\n\nThe industry is seeing immense growth in data volumes: much of this massive\ngrowth is due to semi-structured and unstructured data from connected workers,\nbuildings, vehicles and factories. This growth in multi-modal data from IoT sensors,\nprocess historians, product telemetry, images, cameras and perception systems\nhas outpaced legacy data warehouse-centric technologies. On-prem and cloud\ndata warehouse tech-based architectures are too complex and too costly for the\nlarge and heterogeneous data sets prevalent in the industry.\n\n**Driving IT-OT convergence**\n\nThe success and pace of data modernization efforts in manufacturing is so often\nmuted by critical data being stuck in multiple closed systems and proprietary\nformats, making it difficult and cost-prohibitive to extract the full potential of IT\nand OT data sets. In addition, data quality issues such as outdated or inaccurate\ndata can often lead to a disjointed and incomplete view of customers, operations\nand assets. For years, companies have lacked a common foundation for complex\nand heterogeneous manufacturing data — from IoT-generated data streams to\nfinancial metrics stored in ERP applications — and it has impacted their ability to\nprovide the freshest, highest-quality and most complete data for analytics.\n\n\n**Bringing AI/ML to where it’s needed**\n\nTo realize the promise of AI/ML in manufacturing, machine learning models need\nto be brought as close to the decision as possible, often at the edge in facilities\nand locations with limited or intermittent connectivity to the internet or cloud.\nThis requires deployment flexibility to on-premises or edge devices, with an\nexperience comparable to that in the cloud.\n\n**Inability to innovate at scale**\n\nCDOs want to be able to quickly and efficiently reproduce successes at global\nscale. Technical and business users want to simply and quickly know what data\nsets are available to solve the business issue at hand. Analysts want flexibility to\nuse the tools they are most familiar with in order to stay responsive to business\nneeds. Fragmented approaches to architecture and tooling make scaling\nbusiness impact very difficult, which results in talent churn, slower development\nand duplicative efforts — all leading to higher costs.\n\n\n-----\n\n## Databricks Lakehouse for Manufacturing\n\n**Deliver personalized outcomes and frictionless experiences**\n\n**Millions of assets streaming IoT data**\n\n**5%–10% reduction in unplanned downtime and cost**\n\n**Accurate prices across 1,000s of locations and millions of dealers**\n\n**200%+ increase in offer conversion rates**\n\nWith Databricks Lakehouse for Manufacturing, manufacturers can gain a\nsingle view of their customers that combines data from each stage of the\ncustomer journey. 
With a 360-degree view in place, manufacturers can drive\nmore differentiated sales strategies and precise service outcomes in the\nfield, delivering higher revenue growth, profitability and CSAT scores.\n\nWith the Databricks Lakehouse, you can analyze product telemetry data,\ncustomer insights and service networks to deliver highest uptime, quality of\nservice and economic value through the product lifecycle.\n\n**Optimize the supply chain, production processes and fulfillment logistics**\n\n**with real-time analytics and AI.**\n\nThe Databricks Lakehouse for Manufacturing is the only enterprise data platform\nthat helps manufacturing organizations optimize their supply chains, boost\nproduct innovation, increase operational efficiencies, predict fulfillment needs\nand reduce overall costs.\n\n\n-----\n\n**Gain real-time insight for agile manufacturing and logistics**\n\n**30%–50% improvement in forecast accuracy**\n\n**90% lower cost for new manufacturing line**\n\n**4%–8% reduction in logistics costs**\n\n**10% improvement in carbon footprint**\n\nThe Databricks Lakehouse lets you build a resilient and predictive supply\nchain by eliminating the trade-off between accuracy or depth of analysis\nand time. With scalable, fine-grained forecasts to predict or sense demand,\nor perform supply chain planning and optimization, Databricks improves\naccuracy of decisions, leading to higher revenue growth and lower costs.\n\nThe lakehouse provides an “always on” architecture that makes IT-OT\nconvergence a reality, by continuously putting all data to work regardless of the\nfrequency at which it arrives (periodic, event-driven or real-time streaming)\nand creates valuable data products that can empower decision makers. This\ncreates real-time insight into performance with data from connected factory\nequipment, order flows and production processes to drive the most effective\nresource scheduling.\n\n\n**Empower the manufacturing workforce of the future**\n\n**25% improvement in data team productivity**\n\n**50x faster time to insight**\n\n**50% reduction in workplace injuries**\n\nWith Databricks, manufacturers can increase the impact and decrease the\ntime-to-value of their data assets, ultimately making data and AI central to every\npart of their operation. And by empowering data teams across engineering,\nanalytics and AI to work together, Databricks frees up employees to self-serve\nand focus on realizing maximum business value — improving product quality,\nreducing downtime and exceeding customer expectations.\n\n**Execute product innovation at the speed of data**\n\n**90% decrease in time to market of new innovations**\n\n**20x faster data processing of vehicle and road data**\n\nIt is critical that manufacturers are offering the most desirable value\npropositions so end consumers don’t look elsewhere. By tapping into product\nperformance and attribute data along with market trends and operations\ninformation, manufacturers can make strategic decisions.\n\nWith Databricks, manufacturers can decrease time to market with new products\nto increase sales by analyzing customer behavior and insights (structured,\nunstructured and semi-structured), product telemetry (streaming, RFID, computer\nvision) and digital twins, and leveraging that data to drive product decisions.\n\n\n-----\n\n## Building Innovative Solutions on the Lakehouse\n\n\nThe flexibility of the Databricks Lakehouse Platform means that you can start\nwith the use case that will have the most impact on your business. 
Through\nour experience working with some of the largest and most cutting-edge\nmanufacturers in the world, we’ve developed Solution Accelerators based\non the most common needs of manufacturers to help you get started. These\npurpose-built guides — fully functional notebooks and best practices — speed\nup results across your most common and high-impact use cases. Go from idea\nto proof of concept (PoC) in as little as two weeks. Check out the full list of\nSolution Accelerators [here](https://www.databricks.com/solutions/accelerators).\n\n**S O L U T I O N**\n**Part-Level Demand**\n**Forecasting**\n\n\nDemand forecasting is a critical business process for manufacturing and\nsupply chains. McKinsey estimates that over the next 10 years, supply\nchain disruptions can cost close to half (~45%) of a year’s worth of profits\nfor companies. Having accurate and up-to-date forecasts is vital to plan\nthe scaling of manufacturing operations, ensure sufficient inventory and\nguarantee customer fulfillment.\n\nIn recent years, manufacturers have been investing heavily in quantitative-based forecasting that is driven by historical data and powered using either\nstatistical or machine learning techniques. Benefits include:\n\n**•** Better sales planning and revenue forecasting\n\n**•** Optimized safety stock to maximize turn-rates and\nservice-delivery performance\n\n**•** Improved production planning by tracing back\nproduction outputs to raw material levels\n\n**A disruption lasting just 30 days or less could**\n\n**equal losses of** **3%-5% of EBITDA.**\n\n\n-----\n\nDatabricks Lakehouse can enable large-scale forecasting solutions to help\nmanufacturers navigate the most common data challenges when trying to\nforecast demand.\n\n**C O M M O N U S E C A S E S :**\n\nScalable, accurate forecasts across large numbers of store-item\ncombinations experiencing intermittent demand\n\nAutomated model selection to ensure the best model is selected\nfor each store-item combination\n\nMetrics to identify the optimal frequency with which to generate\nnew predictions\n\nManage material shortages and predict overplanning\n\n**Try our** **[Parts-Level Solution Accelerator](https://www.databricks.com/solutions/accelerators/demand-forecasting)** **to facilitate**\n\n**fine-grained demand forecasts and planning.**\n\n\n-----\n\n**S O L U T I O N**\n**Overall Equipment Effectiveness**\n**& KPI Monitoring**\n\n\nThe need to monitor and measure manufacturing equipment performance is\ncritical for operational teams within manufacturing. Today, Overall Equipment\nEffectiveness (OEE) is considered the standard for measuring manufacturing\nequipment productivity. According to Engineering USA, an OEE value of 85% or\nabove is considered world-leading. However, many manufacturers typically achieve\nbetween 40% and 60%.
Reasons for underachievement often include:\n\n**•** Delayed inputs due to manual processes that are prone to human error\n\n**•** Bottlenecks created by data silos, impeding the flow of fresh data to\nstakeholders\n\n**•** A lack of collaboration capabilities, keeping stakeholders from working on the\nsame information at the same time\n\n**Poor OEE value** **can be a result of poor parts quality, slow**\n**production performance and production availability issues.**\n\nDatabricks Lakehouse can help manufacturers maneuver through the\nchallenges of ingesting and converging operational technology (OT) data with\ntraditional data from IT systems to build forecasting solutions.\n\n**C O M M O N U S E C A S E S**\n\nIncrementally ingest and process sensor data from IoT devices\nin a variety of formats\n\nCompute and surface KPIs and metrics to drive valuable insights\n\nOptimize plant operations with data-driven decisions\n\n**Try our** **[Solution Accelerator for OEE and KPI Monitoring](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)** **for**\n**performant and scalable end-to-end monitoring.**\n\n\n-----\n\nMarket dynamics and volatility are requiring manufacturers to bring products to\nmarket more quickly, optimize production processes and build agile supply chains\nat scale at a lower price. To do so, many manufacturers have turned to building\ndigital twins, which are virtual representations of objects, products, pieces of\nequipment, people, processes or even complete manufacturing ecosystems.\n\nDigital twins provide insights — derived from sensors (often IoT or IIoT) that\nare embedded in the original equipment — that have the potential to transform\nthe manufacturing industry by driving greater efficiency, reducing costs and\nimproving quality.\n\n\n**S O L U T I O N**\n**Digital Twins**\n\n\n**Digital twin technologies can improve product**\n\n**quality by** **up to 25%.**\n\nDatabricks Lakehouse can bring digital twins to life through fault-tolerant\nprocessing of streaming workloads generated by IoT sensor data and complex\nevent processing (important for modeling physical processes).\n\n**C O M M O N U S E C A S E S**\n\nProcess real-world data in real time\n\nCompute insights at scale and deliver to multiple downstream applications\n\nOptimize plant operations with data-driven decisions\n\n**Try our** **[Solution Accelerator for Digital Twins](https://www.databricks.com/solutions/accelerators/digital-twins)** **to accelerate**\n**time to market of new innovations.**\n\n\n-----\n\n**S O L U T I O N**\n**Computer Vision**\n\nThe rise in computer vision has been fueled by the rapid developments in\nneural network technologies, which use AI to better understand and interpret\nimages with near-perfect precision. In manufacturing, computer vision can\ntransform operations by, for example, identifying product defects to improve\nquality control, detecting safety hazards on the production floor, and tracking\nand managing inventory levels.\n\n**As per the American Society for Quality, cost of poor quality for**\n\n**companies can be as high as** **20% of revenue.**\n\n\nDatabricks Lakehouse can easily ingest complex, unstructured image and video\ndata at massive scale. 
Through the most popular computer vision libraries, data\nteams can scale AI models that leverage computer vision to recognize patterns,\ndetect objects and make predictions with 99% accuracy.\n\n**C O M M O N U S E C A S E S**\n\nQuickly identify defects and ensure that products and processes meet\nquality standards\n\nAutomate positioning and guidance to ensure that parts and products are\nproperly aligned and assembled\n\nPredict maintenance issues to reduce downtime and maintenance costs,\nimprove parts reliability, and increase safety for workers\n\n**Try our** **[Solution Accelerator for Computer Vision](https://www.databricks.com/blog/2021/12/17/enabling-computer-vision-applications-with-the-data-lakehouse.html)** **to improve**\n**efficiency, reduce costs and enhance overall safety.**\n\n\n-----\n\n## An Ecosystem on the Lakehouse for Manufacturing\n\nWe’ve partnered with leading consulting firms and\nindependent software vendors to deliver innovative,\nmanufacturing-specific solutions. Databricks\nBrickbuilder Solutions help you cut costs and\nincrease value from your data. Backed by decades\nof industry expertise — and built for the Databricks\nLakehouse Platform — Brickbuilder Solutions are\ntailored to your exact needs.\n\nWe also work with technology partners like Alteryx,\nAtScale, Fivetran, Microsoft Power BI, Qlik, Sigma,\nSimplement, Tableau and ThoughtSpot to accelerate\nthe availability and value of data. This allows\nbusinesses to unify data from complex source\nsystems and operationalize it for analytics, AI and\nML on the Databricks Lakehouse Platform.\n\n\n-----\n\n**S O L U T I O N**\n**Avanade Intelligent Manufacturing**\n\nEvery year, businesses lose millions of dollars due to equipment failure,\nunscheduled downtime and lack of control in maintenance scheduling. Along\nwith lost dollars, businesses will experience lower employee morale when\nstations are in and out of service. Avanade’s Intelligent Manufacturing solution\nsupports connected production facilities and assets, workers, products and\nconsumers to create value through enhanced insights and improved outcomes.\nManufacturers can harness data to drive interoperability and enhanced insights\nat scale using analytics and AI. Outcomes include improvements across\nproduction (e.g., uptime, quantity and yield), better experiences for workers,\nand greater insight into what customers want.\n\n**Try our joint solution,** **[Intelligent Manufacturing](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/avanade-intelligent-manufacturing)** **, to drive value and**\n**operationalize team coordination and productivity.**\n\n\n**S O L U T I O N**\n**DataSentics Quality Inspector**\n\nQuality control is a crucial aspect of any production process, but traditional\nmethods can be time-consuming and prone to human error. Quality\nInspector by DataSentics, an Atos company, offers a solution that is\nboth efficient and reliable. With out-of-the-box models for visual quality\ninspection, which are tailored to meet specific business requirements,\norganizations will experience stable, scalable quality control that’s easy to\nimprove over time. 
Quality Inspector is an end-to-end solution that can be\nseamlessly integrated into an existing setup, delivering high performance\nand reliability.\n\n**Try our joint solution,** **[Quality Inspector](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to automate production quality**\n**control with an increase in accuracy and quicker time to value.**\n\n\n-----\n\n_Diagram: Tredence PSRM — predict supply risk, real-time shipment visibility, delay alerts._\n\n\n**S O L U T I O N**\n**Tredence Predictive Supply Risk Management**\n\nCustomers today are faced with multiple supply risks, including lack of\nin-transit visibility and disruptions caused by weather and local events,\namong others. Tredence’s Predictive Supply Risk Management solution, built on\nthe Databricks Lakehouse Platform, helps businesses meet supply risk\nchallenges by providing a scalable, cloud-based solution that can be\ntailored to the specific needs of each organization. The platform’s flexibility\nand scalability allow businesses to keep pace with changing regulations\nand customer demands, while its comprehensive suite of tools helps\nidentify and mitigate risks across the enterprise.\n\n**Try our joint solution,** **[Predictive Supply Risk Management](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to**\n**predict order delays, identify root causes and quantify supply**\n**chain impact.**\n\nVisit our [site](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview) to learn more about our Databricks Partner Solutions.\n\n\n-----\n\n## Leading Manufacturing Companies That Choose Us\n\n\n-----\n\nDatabricks is the lakehouse company. More than 9,000 organizations worldwide\n\n— including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the\n\nDatabricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe. Founded by the\n\noriginal creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission\n\nto help data teams solve the world’s toughest problems.
To learn more, follow\n\nDatabricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n###### Get started with a free trial of Databricks and start building data applications today\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=634147899783&utm_term=try%20databricks&gclid=CjwKCAiAr4GgBhBFEiwAgwORrTnkJaDf9SpIDy2RxOV28a2G2HtUDvJnLXiVWBsqcAWa_XmSvabkVRoCiwgQAvD_BwE#account)**\n\nTo learn more, visit us at:\n**[Manufacturing Industry Solutions](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Lakehouse-for-Manufacturing.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**2 0 2 0 E D I T I O N** | U P D AT E D\n\n# Standardizing the Machine Learning Lifecycle\n\n### From experimentation to production with MLflow\n\n[��](https://mlflow.org)\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\n#### Contents\n\nChapter 1: \u0007Machine Learning\nLifecycle Challenges 3\n\nChapter 2: \u0007Applying Good Engineering\nPrinciples to Machine Learning 7\n\nChapter 3: \u0007Introducing MLflow 9\n\nChapter 4: \u0007A Closer Look at MLflow\nModel Registry 16\n\nChapter 5: \u0007Making Organizations\nSuccessful with ML 19\n\nChapter 6: \u0007Introducing the Unified\nData Analytics Platform 20\n\nChapter 7: \u0007Standardizing the Machine\nLearning Lifecycle on Databricks 25\n\nChapter 8: \u0007Getting Started 26\n\nChapter 9: \u0007Comparison Matrix 27\n\n\n#### Preface\n\n##### Technology changes quickly. Data science and machine learning (ML) are moving\n even faster. In the short time since we first published this eBook, businesses across industries have rapidly matured their machine learning operations (MLOps) — implementing ML applications and moving their first models into production. This has turned ML models into corporate assets that need to be managed across the lifecycle.\n\n That’s why MLflow, an open-source platform developed by Databricks, has emerged\n as a leader in automating the end-to-end ML lifecycle. With 1.8 million 1 downloads a month — and growing support in the developer community — this open-source platform is simplifying the complex process of standardizing and productionizing MLOps. This updated eBook explores the advantages of MLflow and introduces you to the newest component: MLflow Model Registry. You’ll also discover how MLflow fits into the Databricks Unified Data Analytics Platform for data engineering, science and analytics.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 1: **\u0007** **Machine Learning**\n\n#### Lifecycle Challenges\n\n\nBuilding machine learning models is hard. Putting them into production is harder. Enabling others — data\n\nscientists, engineers or even yourself — to reproduce your pipeline and results is equally challenging. How\n\nmany times have you or your peers had to discard previous work because it was either not documented\n\nproperly or too difficult to replicate?\n\nGetting models up to speed in the first place is significant enough that it can be easy to overlook long-\n\nterm management. What does this involve in practice? 
In essence, we have to compare the results of\n\ndifferent versions of ML models along with corresponding artifacts — code, dependencies, visualizations,\n\nintermediate data and more — to track what’s running where, and to redeploy and roll back updated models\n\nas needed. Each of these requires its own specific tools, and it’s these changes that make the ML lifecycle\n\nso challenging compared with traditional software development lifecycle (SDLC) management.\n\nThis represents a serious shift and creates challenges compared with a more traditional software\n\ndevelopment lifecycle for the following reasons:\n\n\nThe diversity and number of ML\n\ntools involved, coupled with a\n\nlack of standardization across\n\nML libraries and frameworks\n\n\nThe continuous nature of ML\n\ndevelopment, accompanied by a\n\nlack of tracking and management\n\ntools for machine learning models\n\nand experiments\n\n\nThe complexity of productionizing\n\nML models due to the lack of\n\nintegration among data pipelines,\n\nML environments and production\n\nservices\n\n\nLet’s look at each of these areas in turn.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The diversity and number of ML tools involved\n\n\nWhile the traditional software development process leads to the\n\nrationalization and governance of tools and platforms used for developing and\n\nmanaging applications, the ML lifecycle relies on data scientists’ ability to use\n\nmultiple tools, whether for preparing data and training models, or deploying\n\nthem for production use. Data scientists will seek the latest algorithms from\n\n\nHowever, due to the variety of available tools and the lack of detailed tracking,\n\nteams often have trouble getting the same code to work again in the same way.\n\nReproducing the ML workflow is a critical challenge, whether a data scientist\n\nneeds to pass training code to an engineer for use in production or go back to\n\npast work to debug a problem.\n\n\nthe most up-to-date ML libraries and frameworks available to compare results\n\nand improve performance.\n\n**PREP DATA** **BUILD MODEL** **DEPLOY MODEL**\n\nAzure ML\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The continuous nature of ML development\n\nTechnology never stands still. New data, algorithms,\n\nlibraries and frameworks impact model performance\n\ncontinuously and, thus, need to be tested. Therefore,\n\nmachine learning development requires a continuous\n\n\napproach, along with tracking capabilities to\n\ncompare and reproduce results. The performance\n\nof ML models depends not only on the algorithms\n\nused, but also on the quality of the data sets and the\n\nparameter values for the models.\n\n\n**P R E P**\n**D ATA**\n\n**B U I L D**\n**M O D E L**\n\n\nWhether practitioners work alone or on teams, it’s\n\nstill very difficult to track which parameters, code\n\nand data went into each experiment to produce a\n\nmodel, due to the intricate nature of the ML\n\nlifecycle itself.\n\n**D E P L O Y**\n**M O D E L**\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The complexity of productionizing ML models\n\n\nIn software development, the architecture is set early on, based on the target\n\napplication. Once the infrastructure and architecture have been chosen, they\n\nwon’t be updated or changed due to the sheer amount of work involved in\n\nrebuilding applications from scratch. 
Modern developments, such as the move\n\nto microservices, are making this easier, but for the most part, SDLC focuses on\n\nmaintaining and improving what already exists.\n\n\nOne of today’s key challenges is to effectively transition models from\n\nexperimentation to staging and production — without needing to rewrite the code\n\nfor production use. This is time-consuming and risky as it can introduce new\n\nbugs. There are many solutions available to productionize a model quickly, but\n\npractitioners need the ability to choose and deploy models across any platform,\n\nand scale resources as needed to manage model inference effectively on big data,\n\nin batch or real time.\n\n\nWith machine learning the first goal is to build a model. And keep in mind: a\n\nmodel’s performance in terms of accuracy and sensitivity is agnostic from the\n\ndeployment mode. However, models can be heavily dependent on latency, and\n\nthe chosen architecture requires significant scalability based on the business\n\napplication. End-to-end ML pipeline designs can be great for batch analytics and\n\nlooking at streaming data, but they can involve different approaches for real-time\n\nscoring when an application is based on a microservice architecture working via\n\nREST APIs, etc.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 2: **\u0007** **Applying Good Engineering**\n\n#### Principles to Machine Learning\n\n\nMany data science and machine learning projects fail due to preventable issues that have been resolved\n\nin software engineering for more than a decade. However, those solutions need to be adapted due to key\n\ndifferences between developing code and training ML models.\n\n- \u0007 **Expertise, code and data** — With the addition of data, data science and ML, code not only needs to deal\n\nwith data dependencies but also handle the inherent nondeterministic characteristics of statistical\n\nmodeling. ML models are not guaranteed to behave the same way when trained twice, unlike traditional\n\ncode, which can be easily unit tested.\n\n- \u0007 **Model artifacts** — In addition to application code, ML products and features also depend on models\n\nthat are the result of a training process. Those model artifacts can often be large — on the order of\n\ngigabytes — and often need to be served differently from code itself.\n\n- \u0007 **Collaboration** — In large organizations, models that are deployed in an application are usually not trained\n\nby the same people responsible for the deployment. Handoffs between experimentation, testing and\n\nproduction deployments are similar but not identical to approval processes in software engineering.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The need for standardization\n\nSome of the world’s largest tech companies have already begun solving these problems internally with\n\ntheir own machine learning platforms and lifecycle management tools. 2 These internal platforms have\n\nbeen extremely successful and are designed to accelerate the ML lifecycle by standardizing the process of\n\ndata preparation, model training, and deployment via APIs built for data scientists. The platforms not only\n\nhelp standardize the ML lifecycle but also play a major role in retaining knowledge and best practices, and\n\nmaximizing data science team productivity and collaboration, thereby leading to greater ROI.\n\nInternally driven strategies still have limitations. 
First, they are limited to a few algorithms or frameworks.\n\nAdoption of new tools or libraries can lead to significant bottlenecks. Of course, data scientists always\n\nwant to try the latest and the best algorithms, libraries and frameworks — the most recent versions of\n\nPyTorch, TensorFlow and so on. Unfortunately, production teams cannot easily incorporate these into\n\nthe custom ML platform without significant rework. The second limitation is that each platform is tied\n\nto a specific company’s infrastructure. This can limit sharing of efforts among data scientists. As each\n\nframework is so specific, options for deployment can be limited.\n\nThe question then is: Can similar benefits to these systems be provided in an open manner? This evaluation\n\nmust be based on the widest possible mix of tools, languages, libraries and infrastructures. Without this\n\napproach, it will be very difficult for data scientists to evolve their ML models and keep pace with industry\n\ndevelopments. Moreover, by making it available as open source, the wider industry will be able to join in and\n\ncontribute to ML’s wider adoption. This also makes it easier to move between various tools and libraries\n\nover time.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 3: **\u0007** **Introducing MLflow**\n\n**M AT E I Z A H A R I A**\n\nCo-founder and Chief Technologist at Databricks\n\n\nAt Databricks, we believe that there should be a better way to manage the ML lifecycle. So in June 2018,\n\nwe unveiled [MLflow](https://mlflow.org/) , an open-source machine learning platform for managing the complete ML lifecycle.\n\n###### “MLflow is designed to be a cross-cloud, modular, API-first framework, to work well with\n all popular ML frameworks and libraries. It is open and extensible by design, and platform\n agnostic for maximum flexibility.”\n\nWith MLflow, data scientists can now package code as reproducible runs, execute and\n\ncompare hundreds of parallel experiments, and leverage any hardware or software platform\n\nfor training, hyperparameter tuning and more. Also, organizations can deploy and manage\n\nmodels in production on a variety of clouds and serving platforms.\n\n###### “ With MLflow, data science teams can systematically package and reuse models\n across frameworks, track and share experiments locally or in the cloud, and deploy\n models virtually anywhere,” says Zaharia. “The flurry of interest and contributions we’ve\n seen from the data science community validates the need for an open-source framework to\n streamline the machine learning lifecycle.”\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Key benefits\n\n**EXPERIMENT TRACKING** As mentioned previously, getting ML models to perform takes significant trial and error, and continuous configuration, building, tuning, testing,\n\netc. Therefore, it is imperative to allow data science teams to track all that goes into a specific run, along with the results. With MLflow, data scientists can quickly record\n\nruns and keep track of model parameters, results, code and data from each experiment, all in one place.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Key benefits\n\n\n**FLEXIBLE DEPLOYMENT** There is virtually no limit to what machine learning can\n\ndo for your business. 
However, there are different ways to architect ML applications\n\nfor production, and various tools can be used for deploying models, which often\n\nlead to code rewrites prior to deploying ML models into production. With MLflow,\n\nyour data scientists can quickly download or deploy any saved models to various\n\nplatforms — locally or in the cloud — from experimentation to production.\n\n\n**REPRODUCIBLE PROJECTS** The ability to reproduce a project — entirely or just\n\nparts of it — is key to data science productivity, knowledge sharing and, hence,\n\naccelerating innovation. With MLflow, data scientists can build and package\n\ncomposable projects, capture dependencies and code history for reproducible\n\nresults, and quickly share projects with their peers.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Key benefits\n\n**MODEL MANAGEMENT** Use one central place to share ML models, collaborate on moving them from experimentation to online testing and production, integrate with\n\napproval and governance workflows, and monitor ML deployments and their performance. This is powered by the latest MLflow component, MLflow Model Registry.\n\n**M O D E L D E P L O Y M E N T A N D M O N I T O R I N G**\n\n**I N - L I N E C O D E**\n\n��\n\n**M L L I B R A R I E S**\n\n###### Model Format\n\n**C O N TA I N E R S**\n\n\n**F L AV O R 1**\n\n\n**F L AV O R 2**\n\n**B AT C H A N D S T R E A M S C O R I N G**\n\n\nSimple model flavors\nusable by many tools\n\n\n**C L O U D I N F E R E N C E S E R V I C E S**\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Use case examples\n\nLet‘s examine three use cases to explore how users can leverage some of the MLflow components.\n\n\n**EXPERIMENT TRACKING** A European energy\n\ncompany is using MLflow to track and update\n\nhundreds of energy-grid models. This company’s\n\ngoal is to build a time-series model for every major\n\nenergy producer (e.g., power plant) and consumer\n\n(e.g., factory), monitor these models using standard\n\nmetrics, and combine the predictions to drive\n\nbusiness processes, such as pricing. Because a\n\nsingle team is responsible for hundreds of models,\n\npossibly using different ML libraries, it’s important to\n\nhave a standard development and tracking process.\n\nThe team has standardized on Jupyter notebooks\n\nfor development, MLflow Tracking for metrics, and\n\nDatabricks Jobs for inference.\n\n\n**REPRODUCIBLE PROJECTS** An online marketplace\n\nis using MLflow to package deep learning jobs using\n\nKeras and run them in the cloud. Each data scientist\n\ndevelops models locally on a laptop using a small\n\ndata set, checks them into a Git repository with\n\nan MLproject file, and submits remote runs of the\n\nproject to GPU instances in the cloud for large-scale\n\ntraining or hyperparameter search. Using MLflow\n\nProjects makes it easy to create the same software\n\nenvironment in the cloud and share project code\n\namong data scientists.\n\n\n**MODEL PACKAGING** An e-commerce site’s data\n\nscience team is using MLflow Model Registry to\n\npackage recommendation models for use by\n\napplication engineers. This presents a technical\n\nchallenge because the recommendation\n\napplication includes both a standard, off-the-shelf\n\nrecommendation model and custom business logic\n\nfor pre- and post-processing. For example, the\n\napplication might include custom code to ensure the\n\nrecommended items are diverse. 
This business logic\n\nneeds to change in sync with the model, and the data\n\nscience team wants to control both the business logic\n\nand the model, without having to submit a patch to\n\nthe web application each time the logic has to change.\n\nMoreover, the team wants to A/B test distinct models\n\nwith distinct versions of the processing logic. The\n\nsolution was to package both the recommendation\n\nmodel and the custom logic using the python_\n\nfunction flavor in an MLflow Model, which can then\n\nbe deployed and tested as a single unit.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Open and extensible by design\n\nSince we [unveiled](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) and open sourced MLflow in June 2018 at the Spark + AI Summit in San Francisco, community engagement and contributions have led to an impressive\n\narray of new features and integrations:\n\n\n**SUPPORT FOR MULTIPLE**\n\n**PROGRAMMING LANGUAGES**\n\nTo give developers a choice, MLflow supports R,\n\nPython, Java and Scala, along with a REST server\n\ninterface that can be used from any language.\n\n\n**INTEGRATION WITH POPULAR ML**\n\n**LIBRARIES AND FRAMEWORKS**\n\nMLflow has built-in integrations with the most popular\n\nmachine learning libraries — such as scikit-learn,\n\nTensorFlow, Keras, PyTorch, H2O, and Apache Spark™\n\nMLlib — to help teams build, test and deploy machine\n\nlearning applications.\n\n\n**CROSS-CLOUD SUPPORT**\n\nOrganizations can use MLflow to quickly deploy\n\nmachine learning models to multiple cloud services,\n\nincluding Databricks, Azure Machine Learning and\n\nAmazon SageMaker, depending on their needs.\n\nMLflow leverages AWS S3, Google Cloud Storage and\n\nAzure Data Lake Storage, allowing teams to easily\n\ntrack and share artifacts from their code.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Rapid community adoption\n\n## 2.5M\n#### monthly downloads\n\n## 200+\n#### code contributors\n\n\n## 100+\n#### contributing organizations\n\n\nOrganizations using and contributing to MLflow\n\nSource: [mlflow.org](https://mlflow.org)\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 4: **\u0007** **A Closer Look at**\n\n#### MLflow Model Registry\n\n\nMLflow originally introduced the ability to [track metrics, parameters and artifacts](https://www.mlflow.org/docs/latest/tracking.html#) as part of experiments,\n\n[package models and reproducible ML projects](https://www.mlflow.org/docs/latest/projects.html) , and [deploy models to batch or to real-time serving platforms](https://www.mlflow.org/docs/latest/models.html) .\n\nThe latest MLflow component — MLflow Model Registry — builds on MLflow’s original capabilities to\n\nprovide organizations with one central place to share ML models, collaborate on moving them from\n\nexperimentation to testing and production, and implement approval and governance workflows.\n\n��\n\n\n**Model Registry**\n\n\n**D O W N S T R E A M**\n\n\n��\n\n**Tracking Server**\n\n\nData Scientists\n\n**Staging**\n\n\nData Engineers\n\n**Production** **Archived**\n\n**A U T O M AT E D J O B S**\n\n\n**Parameters**\n\n\n**Metrics** **Artifacts**\n\n\nThe Model Registry gives MLflow users new\n\n\ntools for sharing, reviewing and managing\n\nML models throughout their lifecycle\n\n\n**Metadata** **Models**\n\n**R E S T S E R V I N G**\n\n**R E V I E W E R S + C I / C D T O O L S**\n\nThe MLflow Model Registry 
complements the MLflow offering and is designed to help organizations\n\nimplement good engineering principles with machine learning initiatives, such as collaboration,\n\ngovernance, reproducibility and knowledge management. The next few pages highlight some of the key\n\nfeatures of this new component.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\n###### One hub for managing ML models collaboratively\n\nBuilding and deploying ML models is a team sport. Not only are the responsibilities\n\nalong the machine learning model lifecycle often split across multiple people\n\n(e.g., data scientists train models whereas production engineers deploy them),\n\nbut also at each lifecycle stage, teams can benefit from collaboration and sharing\n\n\n###### Flexible CI/CD pipelines to manage stage transitions\n\nMLflow lets you manage your models’ lifecycles either manually or through\n\nautomated tools. Analogous to the approval process in software engineering,\n\nusers can manually request to move a model to a new lifecycle stage (e.g., from\n\nstaging to production), and review or comment on other users’ transition requests.\n\n\n(e.g., a fraud model built in one part of the organization could be reused in others).\n\nAlternatively, you can use the Model Registry’s API to plug in continuous integration\n\n\nMLflow facilitates sharing of expertise and knowledge across teams by making ML\n\nmodels more discoverable and providing collaborative features to jointly improve\n\non common ML tasks. Simply register an MLflow model from your experiments to\n\n\nand deployment (CI/CD) tools, such as Jenkins, to automatically test and transition\n\nyour models. Each model also links to the experiment run that built it — in MLflow\n\nTracking — to let you easily review models.\n\n\nget started. 
The MLflow Model Registry will then let you track multiple versions\n\nof the model and mark each one with a lifecycle stage: development, staging,\n\nproduction or archived.\n\n\nSample machine learning\nmodels displayed via the\nMLflow Model Registry\ndashboard\n\n\nThe machine learning model\npage view in MLflow, showing\nhow users can request and\nreview changes to a model’s\nstage\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Visibility and governance for the full ML lifecycle\n\nIn large enterprises, the number of ML models that are in development, staging\n\nand production at any given point in time may be in the hundreds or thousands.\n\nHaving full visibility into which models exist, what stages they are in and who\n\nhas collaborated on and changed the deployment stages of a model allows\n\norganizations to better manage their ML efforts.\n\nMLflow provides full visibility and enables governance by keeping track of each\n\nmodel’s history and managing who can approve changes to the model’s stages.\n\nIdentify versions, stages and\nauthors of each model\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 5: **\u0007** **Making Organizations**\n\n#### Successful with ML\n\n\nStandardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\n\ntrack experiments, compare results, reproduce runs and productionize faster.\n\nIn addition to increasing data science team productivity and collaboration and applying good engineering\n\npractices to machine learning, organizations also need to do the following:\n\n\n**Reliably ingest, ETL and**\n\n**catalog big data**\n\n\n**Work with state-of-the-art**\n\n**ML frameworks and tools**\n\n\n**Easily scale compute from**\n\n**single to multi-node**\n\n\nDatabricks excels at all the above. Learn more at [databricks.com](https://databricks.com)\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 6: **\u0007** **Introducing the Unified**\n\n#### Data Analytics Platform\n\n\nDatabricks accelerates innovation by unifying data science, engineering and business. 
Through a fully\n\nmanaged, cloud-based service built by the original creators of Apache Spark, Delta Lake and MLflow, the\n\nDatabricks Unified Data Analytics Platform lowers the barrier for enterprises to innovate with AI and\n\naccelerates their innovation.\n\n**DATA ENGINEERS** **DATA SCIENTISTS** **ML ENGINEERS** **DATA ANALYSTS**\n\n\n**BI INTEGRATIONS**\n\n**Access all your data**\n\n\n**DATA SCIENCE WORKSPACE**\n\n**Collaboration across the lifecycle**\n\n**UNIFIED DATA SERVICE**\n\n**High-quality data with great performance**\n\n\n\n**ENTERPRISE CLOUD SERVICE**\n\n**A simple, scalable and secure managed service**\n\n##### RAW DATA LAKE\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\n###### Data engineering\n\nSpeed up the preparation of high-quality\n\ndata, essential for best-in-class ML\n\napplications, at scale\n\n\n###### Data science\n\nCollaboratively explore large data sets,\n\nbuild models iteratively and deploy across\n\nmultiple platforms\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Providing managed MLflow on Databricks\n\nMLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\n\npackaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\n\nBy using MLflow as part of Databricks, data scientists can:\n\n\n**WORKSPACES**\n\nBenefit from a streamlined\n\nexperiment tracking experience\n\nwith Databricks Workspace and\n\ncollaborative Notebooks\n\n\n**BIG DATA SNAPSHOTS**\n\nTrack large-scale data that fed\n\nthe models, along with all the\n\nother model parameters, then\n\n\n**JOBS**\n\nEasily initiate jobs remotely, from\n\nan on-premises environment or\n\nfrom Databricks notebooks\n\n\n**SECURITY**\n\nTake advantage of one common\n\nsecurity model for the entire\n\nmachine learning lifecycle\n\n\nreproduce training runs reliably\n\n\nRead our [blog](https://databricks.com/blog/2019/03/06/managed-mlflow-on-databricks-now-in-public-preview.html) to learn more about these integrations.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Getting data ready for ML with Delta Lake\n\nDelta Lake is a storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions and scalable metadata handling, and it unifies streaming and batch\n\ndata processing. 
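As a small illustration of the data reproducibility discussed in this section, the sketch below (an assumption-laden example, not from the eBook) writes training features as a Delta table and later re-reads the exact snapshot used for training via Delta time travel. It assumes an environment where Delta Lake is preconfigured and a SparkSession named `spark` already exists (for example, a Databricks cluster); the path and column name are placeholders.

```python
# Placeholder location for the curated training features
features_path = "/tmp/delta/training_features"

# Persist the features as a Delta table (ACID writes, schema enforcement)
features = spark.range(1000).withColumnRenamed("id", "customer_id")
features.write.format("delta").mode("overwrite").save(features_path)

# Reproduce a training run later against the exact snapshot originally used,
# reading by version number (timestampAsOf works the same way)
snapshot = (
    spark.read.format("delta")
    .option("versionAsOf", 0)
    .load(features_path)
)
print(snapshot.count())
```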
Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs.\n\nBy using Delta Lake, data engineers and data scientists can keep track of data used for model training.\n\nFiles ML Runtime\n\n- \u0007Schema enforced high\n\nquality data\n\n\n\n- Optimized performance\n\n��\n\n- \u0007Full data lineage /\n\ngovernance\n\n- \u0007reproductibility through\n\ntime travel\n\n\nStreaming\n\nBatch\n\n\nIngestion\n\nTables\n\n\nIngestion\n\n\nData\n\nCatalog\n\n\nData\n\n\nFeature\n\nStore\n\n\nFeature\n\n\n**Y O U R E X I S T I N G D E LTA L A K E**\n\n\n3rd Party Data\n\nMarketplace\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Ready-to-use ML environments\n\nDatabricks Runtime for Machine Learning provides data scientists and ML practitioners with on-demand access to ready-to-use machine learning clusters that are\n\npreconfigured with the latest and most popular machine learning frameworks, including TensorFlow, Keras, PyTorch, scikit-learn, XGBoost and Horovod.\n\nBy using the Databricks Runtime for ML, data scientists can get to results faster with one-click access to ML clusters, optimized performance on popular ML algorithms,\n\nand simplified distributed deep learning on Horovod and GPUs. It also supports Conda for further customization.\n\n\n**P A C K A G E S A N D O P T I M I Z E S M O S T**\n\n**C O M M O N M L F R A M E W O R K S**\n\n\n**C U S T O M I Z E D E N V I R O N M E N T S**\n\n**U S I N G C O N D A**\n\n\n**C U S T O M I Z E D E N V I R O N M E N T S**\n\n\nrequirements.txt\nconda.yaml\n\n\n**...**\n\n\n**B U I LT- I N O P T I M I Z AT I O N F O R**\n\n**D I S T R I B U T E D D E E P L E A R N I N G**\n\nDistribute and Scale any Single-Machine\nML Code to thousands of machines\n\n\n**B U I LT- I N A U T O M L A N D**\n\n**E X P E R I M E N T T R A C K I N G**\n\n\nMachine\n\nLearning\n\n\nMachine\n\n\n\nAuto ML and Tracking /\nVisualizations with MLflow\n\n\nConda-\n\nBased\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\nCHAPTER 7: **\u0007** **Standardizing the**\n\n#### Machine Learning\n Lifecycle on Databricks\n\n**B U I L D M O D E L**\n**P R E P D ATA**\n\n��\n\nAzure ML\n\n**D E P L O Y M O D E L**\n\n��\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\nCHAPTER 8: **\u0007** **Getting Started**\nTake the next step toward standardizing your ML lifecycle — test drive MLflow and the\n\nDatabricks Unified Data Analytics Platform.\n\n**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z E D D E M O](https://databricks.com/contact)**\n\n**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\nCHAPTER 8: **\u0007** **Comparison Matrix**\n\n|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W   Self-hosted             
   |M A N A G E D M L F L O W O N D ATA B R I C K S   Fully managed    With remote execution             |\n|---|---|---|\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_improper_payments_eBook_v4_image.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "### Technical Migration Guide\n\n# Strategies to Evolve Your Data Warehouse to the Databricks Lakehouse\n\n\n-----\n\n## Contents Lakehouse Architecture 3\n\nThe Databricks Lakehouse Platform 4\n\nBusiness Value 5\n\nSingle source of truth 5\n\nData team 6\n\nFuture-proof 6\n\nMigration to Lakehouse 7\n\nOverview 7\n\nMigration strategy 8\n\nMigration planning 9\n\nELT approach 12\n\nAgile modernization 15\n\nSecurity and data governance 17\n\nTeam involvement 19\n\nConclusion 19\n\n\n-----\n\n## Lakehouse Architecture\n\n\nData warehouses were designed to provide a central data repository\n\nwith analytic compute capabilities to help business leaders\n\nget analytical insights, support decision-making and business\n\nintelligence (BI). Legacy on-premises data warehouse architectures\n\nare difficult to scale and make it difficult for data teams to keep up\n\nwith the exponential growth of data. Oftentimes data teams publish\n\nand use a subset of well-defined data for development and testing.\n\nThis slows down both innovation and time to insight.\n\nCloud data warehouses (CDW) were an attempt to tackle the\n\non-premises data warehouse challenges. CDWs removed the\n\nadministrative burden of tasks such as setup, upgrades and\n\nbackups. CDWs also improved scalability and introduced cloud’s\n\npay-as-you-go model to reduce cost. CDWs leverage a proprietary\n\ndata format to achieve cloud-scale and performance; however, this\n\nalso leads to customers locked into these formats with difficult\n\n\nBut enterprise data teams don’t need a better data warehouse.\n\nThey need an innovative, simple solution that provides reliable\n\nperformance, elastic scale and allows self-service to unblock\n\nanalytics to access all data at a reasonable cost. The answer is\n\nthe lakehouse.\n\nThe lakehouse pattern represents a paradigm shift from traditional\n\non-premises data warehouse systems that are expensive and\n\ncomplex to manage. It uses an open data management architecture\n\nthat combines the flexibility, cost-efficiency and scale of data\n\nlakes with the data management and ACID semantics of data\n\nwarehouses. A lakehouse pattern enables data transformation,\n\ncleansing and validation to support both business intelligence and\n\nmachine learning (ML) users on all data. Lakehouse is cloud-centric\n\nand unifies a complete up-to-date data set for teams, allowing\n\ncollaboration across an organization.\n\n\npaths to support use cases outside the data warehouse itself\n\n(i.e., machine learning). Customers often find themselves with a\n\nbifurcated architecture, which ultimately leads to a more costly and\n\ncomplex data platform over time.\n\n\n-----\n\n## The Databricks Lakehouse Platform\n\nThe Databricks Lakehouse Platform is **simple** ; it unifies your data, governance, analytics\n\nand AI on one platform. 
It’s **open** — the open source format Delta Lake unifies your data\n\necosystem with open standards and data formats. Databricks is **multicloud** — delivering\n\none **consistent experience across all clouds** so you don’t need to reinvent the wheel for\n\nevery cloud platform that you’re using to support your data and AI efforts.\n\nDatabricks SQL stores and processes data using Delta Lake to simplify and enhance\n\ndata warehousing capabilities. Analysts can use their favorite language, SQL, popular\n\ntransformation tools such as dbt, and preferred BI tools like Power BI and Tableau to\n\nanalyze data. The built-in query editor reduces contextual switching and improves\n\nproductivity. Administrators enjoy simplified workload management via serverless\n\ncompute and auto-scaling to meet high-concurrency workload needs. All this at a\n\nfraction of the cost of traditional data warehouses.\n\n\n###### Lakehouse Platform\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData S�ien��\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData relia)ility and .erfor2ance\n\nCloud Data Lake\nAll structured and unstructured data\n\nSimple Open Multicloud\n\n\n-----\n\n## Business Value\n\n#### Single source of truth\n\nDatabricks Delta Lake leverages cloud-based blob storage to provide an infinitely\n\nscalable storage layer where you can store all your data, including raw and historical data,\n\nalongside structured data tables in the data warehouse. The lakehouse pattern avoids\n\ndata silos and shares the same elastic scale and governance across all use cases: BI, data\n\nengineering, streaming and AI/ML. This means that data engineering teams don’t have to\n\nmove data to a proprietary data warehouse for business analysts or create a separate\n\ndata store to support data science.\n\nInstead, data teams can access the open format Delta tables directly and combine data\n\nsets in the lakehouse, as needed. Data scientists can also work collaboratively on common\n\ndata with access to versioned history to facilitate repeatable experiments. A single source\n\nof truth facilitates moving from descriptive to predictive analytics.\n\n\n-----\n\n#### Data team\n\n\nWith central data governance and fine-grained access control\n\ncapabilities to secure the lakehouse, you can enable self-service\n\nSQL analytics for everyone on the Databricks Lakehouse Platform.\n\nThis allows each team to be more agile and innovate faster.\n\n**Data Analysts** — Using the Databricks SQL editor\n\nor their tools of choice (DBT, Power BI, Tableau), SQL\n\nanalysts can leverage familiar toolsets.\n\n**Data Engineers** — Utilizing Delta Lake as a unified\n\nstorage layer, data engineering teams can eliminate\n\nduplicate data and ETL jobs that move data across\n\nvarious systems. Databricks supports both batch and\n\nstreaming workloads to reduce bottlenecks and serve\n\nthe most up-to-date data to downstream users and\n\napplications.\n\n**Administrators** — The pay-as-you-go, decentralized\n\ncompute resource allows each team to run their\n\n\nThe Databricks Lakehouse Platform provides a reliable ETL and data\n\nmanagement framework to simplify ETL pipelines. Data teams can\n\nbuild end-to-end data transformations in a single pipeline instead of\n\nmany small ETL tasks. Databricks supports data quality enforcement\n\nto ensure reliability with auto-scalable infrastructure. Your teams\n\ncan onboard new data sources quickly to power new use cases with\n\nfresh data. 
This not only allows your team to efficiently and reliably\n\ndeliver high-quality data in a timely manner, it also reduces ETL\n\nworkload cost significantly.\n\n#### Future-proof\n\nUnlike CDWs that lock customers in, Databricks offers an open\n\nplatform with open standards, open protocols and open data\n\nformats. It supports a full range of popular languages (SQL, Python,\n\nR, Scala) and popular BI tools. You can leverage the performant\n\nand low-cost distributed compute layer for data processing — or\n\nuse a variety of tools and engines to efficiently access the data via\n\nDatabricks APIs. Databricks also allows data consumption with a rich\n\npartner ecosystem. Teams can handle all existing BI and AI use cases\n\nwith the flexibility to support future use cases as they emerge.\n\n\nworkload in isolated environments without worrying\n\nabout contention. Serverless SQL endpoint frees your\n\nteam from infrastructure management challenges.\n\n\n-----\n\n## Migration to Lakehouse\n\n#### Overview\n\nA lakehouse is the ideal data architecture for data-driven organizations. It combines the\n\nbest qualities of data warehouses and data lakes to provide a single solution for all major\n\ndata workloads and supports use cases from streaming analytics to BI, data science and\n\nAI. The Databricks Lakehouse Platform leverages low-cost, durable cloud storage and\n\nonly consumes (charges for) compute resources when workloads are running. This pay-\n\n\n**C U S T O M E R S T O R Y**\n##### Building the Lakehouse\n at Atlassian\n\n[Watch now](https://www.youtube.com/watch?v=Xo1U617T-mU)\n\n\nas-you-go model means compute resources are automatically shut down if no processing\n\nis needed. Data teams can use small clusters that can power individual workloads\n\nthey plan to migrate. They can make the choice to leverage serverless SQL endpoints\n\nand completely free data teams from infrastructure capacity planning and cluster\n\nmaintenance. The auto-scaling, elastic nature of Databricks clusters leads to significant\n\nsavings on infrastructure cost and maintenance. Organizations typically achieve 50% TCO\n\nsavings compared to other cloud data warehouses.\n\nData warehouse migration is never an easy task. Databricks aims to mitigate the things\n\nthat can go wrong in these demanding migration projects. The Databricks Lakehouse\n\nPlatform provides many out-of-the-box features to mitigate migration risks.\n\n**C U S T O M E R S T O R Y**\n##### Driving Freight Transportation Into the Future\n\n[Read more](https://databricks.com/customers/jbhunt)\n\n\n-----\n\n#### Migration strategy\n\n\nMigration is a huge effort and very expensive. Yet, almost every\n\nenterprise has to migrate to new platforms every 3–5 years because\n\nthe old platform cannot support new use cases, catch up with\n\ndata growth or meet scaling needs. To get better ROI on migration,\n\nimplement a migration strategy that can reduce future re-platform\n\nneeds and extend to your future data and AI strategy.\n\nUse the opportunity of a data migration to standardize your data\n\nin open Delta format to allow existing and future tools to access\n\nit directly without moving or converting it. Merge your siloed\n\ndata warehouses into the unified storage layer in the Databricks\n\nLakehouse Platform — without worrying about storage capacity.\n\nThe unified storage layer allows your team to deploy a unified data\n\ngovernance on top to secure all data access consistently. 
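The "standardize your data in open Delta format" step mentioned above can often be done in place, without copying data. A minimal sketch follows; the Parquet path and partition column are hypothetical, and it assumes the delta-spark package is available on the cluster.

```python
# Minimal sketch: convert an existing partitioned Parquet directory to Delta in
# place, so existing and future tools can read it directly. Path and partition
# column are hypothetical.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

DeltaTable.convertToDelta(
    spark,
    "parquet.`/mnt/warehouse/sales_orders`",
    "order_date DATE",  # partition schema of the existing Parquet layout
)

# Equivalent SQL form (run one or the other, not both):
# CONVERT TO DELTA parquet.`/mnt/warehouse/sales_orders` PARTITIONED BY (order_date DATE)
```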
Simplify\n\nyour data governance story with Databricks Unity Catalog.\n\n\nMove toward a single, consistent approach to data pipelining\n\nand refinement. Merge batch and streaming into a single end-\n\nto-end pipeline to get fresher data and provide more real-time\n\ndecisions. Take a metadata-driven approach to align the dataflow\n\nwith business processes and have data validation and quality\n\ncheck built-in. Through a series of curation and refinement steps,\n\nthe output results in highly consumable and trusted data for\n\ndownstream use cases.\n\nThe lakehouse architecture makes it possible for the organization\n\nto create “data assets” by taking a stepwise approach to improving\n\ndata and serving all essential use cases. Encourage your BI/analyst\n\nteam to leverage Databricks serverless endpoints for self-serve\n\nand agility. Each team can evaluate their top priority workloads and\n\nmigrate them in parallel to speed up migration.\n\nTake advantage of Databricks’ rich partner ecosystem. Your favorite\n\npartners are likely already integrated via Partner Connect and\n\ncan be set up with a few clicks. There are also many ISV and SI\n\nconsulting partners who can help your migration journey.\n\n\n-----\n\n#### Migration planning\n\nMigrating a data warehouse to the cloud can be time consuming and challenging for your\n\ndata teams. It’s important to agree on the data architecture, migration strategy and process/\n\nframeworks to be used before undertaking a data migration. Databricks provides Migration\n\nAssessment and Architecture Review sessions to develop a joint migration roadmap. This\n\nprocess is designed to help organizations to successfully migrate to a lakehouse architecture.\n\nBased on information collected and business objectives, the Databricks team will work with\n\ncustomers to propose a target architecture and provide a tailored migration roadmap.\n\nThese assessments help get a full picture of current data systems and the future vision. They\n\nclarify what you are migrating and do proper use case discovery. This includes identifying\n\nworkloads and data source dependency, for example:\n\nSample migration assessment checklist:\n\nIdentify upstream data sources and workload dependencies\n\nIdentify active/inactive data sets and database objects\n\nIdentify downstream application dependencies and data freshness requirements\n\nDefine a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n\nDefine security requirements and data governance\n\nClarify access management need, document needed permissions per user/group\n\nOutline current tooling (ingestion, ETL and BI) and what’s needed\n\n\n-----\n\nIt’s important to identify key stakeholders and keep them engaged during the migration to\n\nmake sure they are aligned with the overall objectives. The workload assessment result will\n\nbe reviewed with key stakeholders. Through the review process, data teams can get a better\n\nunderstanding of which workloads can most benefit from modernization.\n\nDatabricks often works with partners to provide a workload assessment and help customers\n\nunderstand their migration complexity and properly plan a budget. Databricks also partners\n\nwith third-party vendors that provide migration tools to securely automate major migration\n\ntasks. 
Databricks Partner Connect makes it easy to connect with this ecosystem of tools to\n\nhelp with the migration, including:\n\n\u0007Code conversion tooling that can automatically translate 70%–95% of the SQL code in\n\nyour current system to Databricks optimized code with Delta and other best practices\n\n\u0007Converters that automate multiple GUI-based ETL/ELT platform conversion to reduce\n\nmigration time and cost\n\n\u0007Data migration tools that can migrate data from on-premises storage to cloud storage\n\n2x–3x faster than what was previously possible\n\n\n-----\n\n#### We can use Automated conversion for most workload types\n\n###### EDWs\n\n\nOpen Cloud Storage\nADLS, S3, GCP Storage\n\nDatabricks Tables, �ie�s\n\nSpark SQL Databricks Notebooks\n\nSpark SQL � little bit o� Python or Scal�\n\nRuns on Databricks JDBC/ODBC\n\nDatabricks permissions- Table ACLs\n\nCredential Pass-throughs to Files\n\nBig Data ETL tools, Databricks Notebooks\n\nAir5o� DAGs, ADF, Databricks Job\nand any other Enterprise Schedulers\n\n\nData Migration\n\nMetastore Migration\n\nSQL Migration\n\nSecurity\n\nETL Tools\n\n\nDB locked �ormats on Disks\n\nDatabases, Tables, �ie�s\n\nAd-hoc SQL �ueries\n\nT-SQL, PL/SQL, BTEQ\n\nReports �rom PB`, Tableau etc^\n\nGRANTs, Roles\n\nExternal tables- File permissions\n\nData Stage, Po�erCenter, Ab `nitio etc^\n\n\nOrchestration ETL Schedulers\n\n\n-----\n\n#### ELT approach\n\nThe separation of storage and compute makes ELT on lakehouse a better choice than traditional\n\nETL. You can ingest all raw data to Delta Lake, leverage low-cost storage and create a Medallion\n\ndata implementation from raw/Bronze to curated/Gold depending on what’s needed to support\n\nuse cases. During ingestion, basic data validation can occur, but establishing a Bronze data layer is\n\nthe foundation of a single-pane-of-glass for the business. Teams can leverage compute resources\n\nas needed without a fixed compute infrastructure. Establishing a Silver layer further enriches data\n\nby exploring and applying transformations. ELT allows data teams to break pipelines into smaller\n\n“migrations,” starting with a simple workload, then improving the pipeline design iteratively.\n\n**I M P R O V E D ATA Q U A L I T Y**\n\nData B r o n z e Ta b l e s S i lv e r Ta b l e s G o l d Ta b l e s\n\nStreaming Analytics\n\nCSV TXT JSON\n\n\nD ata �a �e\n\n\nRaw\nintegration\n\n\nFiltered, Cleaned,\nAugmented\n\n\nBusiness-level\nAggregates\n\n\nReuorting\n\n\n-----\n\nWe highly recommend leveraging [Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) , a new cloud-native managed\n\nservice in the Databricks Lakehouse Platform that provides a reliable ETL framework to\n\nmodernize your data pipeline at scale. Instead of migrating multiple ETL tasks one by one in\n\na traditional data warehouse, you can focus on source and expected output, and create your\n\nentire dataflow graph declaratively. Delta Live Tables offers:\n\n\u0007A metadata-driven approach — You just specify what data should be in each table or view\n\nrather than the details of how processing should be done\n\n\u0007An end-to-end data pipeline with data quality and freshness checks, end-to-end\n\nmonitoring/visibility, error recovery, and lineage, which reduces the strain on data\n\nengineering teams and improves time-to-value in building data pipelines\n\n\u0007Automatic management of all the dependencies within the pipeline. 
This ensures all tables\n\nare populated correctly, whether continuously or on a regular schedule. For example,\n\nupdating one table will automatically trigger all downstream table updates to keep data\n\nup-to-date.\n\n\u0007All pipelines are built code-first, which makes editing, debugging and testing of data\n\npipelines simpler and easier. DLT can also automatically recover from common error\n\nconditions, reducing operational overhead.\n\n\n-----\n\n#### Agile modernization\n\n\nAgile development allows teams to move quickly knowing migrated\n\npipelines can be revisited at a later cycle and evolving data models\n\nare supported within the architecture. Allowing business impact to\n\ndrive priorities via an agile approach helps mitigate migration risks.\n\nPrioritizing and selecting use cases where modernization brings\n\nbusiness benefits quickly is a good starting point. Focus on the 20%\n\nof workloads that consume 80% of budget. By breaking workflows\n\ndown into components and managing data stories, teams can adjust\n\npriorities over time. Changes can be made in collaboration with the\n\nuser community to fit the business definition of value.\n\nMigrating to a lakehouse architecture leverages separation of storage\n\nand compute to remove resource contention between ETL and BI\n\nworkloads. As a result, the migration process can be more agile,\n\nallowing you to evolve your design iteratively without big-bang effort:\n\n\u0007Reduce time during the initial phase on full capacity plan and\n\n\nAll of this allows you to take a more iterative and business-focused\n\napproach for migration instead of a full planning, execution, test/\n\nvalidation approach. Here are more approaches that help facilitate\n\nthis phased implementation:\n\n\u0007Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . Auto Loader helps to ingest\n\nnew data into pipelines quicker to get data in near real-time.\n\n\u0007Delta Live Tables (DLT) improves data quality during data\n\ntransformation and automatically scales to address data volume\n\nchange. DLT can also support schema evolution and quarantine\n\nbad data or data that needs to be reprocessed at a later stage.\n\n\u0007Use dedicated clusters to isolate workloads, lower the total cost\n\nof ownership and improve overall performance. By using multiple\n\nclusters, we can shut down resources when not in use and move\n\naway from managing fixed resources in a single large cluster.\n\n\nscoping\n\n\u0007Flexible cloud infrastructure and unlimited, autoscaling storage\n\n\u0007Workload management is much simpler, you can isolate each\n\nworkload with a dedicated compute resource, without worrying\n\nabout managing workload contention\n\n\u0007Auto-scale and tear down the compute resources after the job\n\nis done to achieve cost efficiency\n\n\n-----\n\nLeverage Databricks’ deep bench of expertise to build reusable assets along the migration:\n\n\u0007Create a migration factory for iterative migration process\n\n\u0007Determine and implement a security and governance framework\n\n\u0007Establish a to-be environment and move use cases/workloads in logical units\n\n\u0007Prove business value and scale over time\n\n\u0007Add new functionality continuously so important business requirements are not left on hold during migration\n\nTake this iterative and templated approach. Migration speed will accelerate. 
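To make the Delta Live Tables approach described above more concrete, here is a minimal, hedged sketch of a declarative pipeline with data quality expectations. Dataset names and paths are hypothetical, and the `dlt` module and `spark` session are provided by the DLT pipeline runtime rather than a standalone script.

```python
# Minimal DLT sketch (dataset names and paths are hypothetical): declare what
# each table should contain plus quality expectations; DLT manages dependencies,
# monitoring and recovery. `dlt` and `spark` are supplied by the pipeline runtime.
import dlt
from pyspark.sql.functions import col

@dlt.table(comment="Raw orders ingested incrementally with Auto Loader")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/mnt/landing/orders/")
    )

@dlt.table(comment="Cleaned orders")
@dlt.expect_or_drop("valid_order_id", "order_id IS NOT NULL")
@dlt.expect_or_drop("positive_amount", "amount > 0")
def orders_silver():
    return (
        dlt.read_stream("orders_bronze")
        .select("order_id", "customer_id", col("amount").cast("double"))
    )
```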
Customers can\n\nfinish migration 15%–20% faster and reduce the amount of tech debt created during the migration.\n\n\n“ M a k e i t w o r k ”\n\nPa r e l l e l i z e t h e\nB u i l d F o u n d at i o n s “ M a k e i t w o r k r i @ h t ”\ni t e r at i o n s\n\n“ M a k e i t w o r k >a s t 2\n\n\nFull %i\"ecycle %ig�t�ou�e /or�load�\n\nLeverage Databricks’ deep\n\nbench of expertise to build\n\nout some **templates for the**\n\n**most effective Databricks**\n\n**implementation.**\n\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nTake an **iterative, bite-sized**\n\n**approach** to migration, reduce tech\n\ndebt and rework, and bring forward\n\nthe value of the solution earlier.\n\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\n\n-----\n\nTo maximize the value of your lakehouse, you should consider retiring\n\nsome legacy architecture design patterns. Leverage the migration\n\nprocess to simplify data warehousing tasks. Regardless of how you\n\ncomplete your migration, you could utilize lakehouse strengths to\n\nimprove architectural patterns:\n\n\u0007Merge your siloed data warehouses on your unified lakehouse\n\nplatform and unify data access and data governance via Unity\n\nCatalog. The lakehouse architecture provides a unified storage\n\nlayer for all your data where there is no physical boundary\n\nbetween data. There is no need to keep data copies for each\n\nsystem using the data set. Clean up and remove jobs that are\n\ncreated to keep data in sync across various data systems.\n\nKeep a single copy of raw data in your lakehouse as a single\n\nsource of truth.\n\n\u0007The Databricks Lakehouse Platform allows you to merge batch\n\nand streaming into a single system to build a simple continuous\n\n\n\u0007Simplify your workload isolation and management by running jobs\n\nin dedicated clusters. Separating storage and compute allows you\n\nto easily isolate each task with isolated compute resources. There\n\nis no need to squeeze them into a single large data appliance\n\nand spend lots of time managing and coordinating resources.\n\nLeverage the elasticity of the Databricks compute layer to\n\nautomatically handle workload concurrency changes at peak time\n\ninstead of paying for over-provisioned resources for most of the\n\ntime. This greatly simplifies the workload management effort the\n\ntraditional data warehouses require.\n\n\u0007Simplify disaster recovery. Storage and compute separation\n\nallows easy disaster recovery. The cloud storage provides very\n\ngood data redundancy and supports automated replication\n\nto another region. Customers can spin up compute resources\n\nquickly in another region and maintain service availability in case\n\nof an outage.\n\n\ndata flow model to process data as it arrives. Process data in\n\nnear real-time and enable data-driven decisions with the most\n\nrecent updates.\n\n\n-----\n\n#### Security and data governance\n\n\nSecurity is paramount in any data-driven organization. Data security\n\nshould enforce the business needs for both internal and external\n\ndata, so the lakehouse should be set up to meet your organization’s\n\nsecurity requirements. 
Databricks provides built-in security to\n\nprotect your data during and after migration.\n\n\u0007Encrypt data at rest and in-transit, using a cloud-managed key\n\nor your own\n\n\u0007Set up a custom network policy, use IP range to control access\n\n\u0007Leverage Private Link to limit network traffic to not traverse the\n\npublic internet\n\n\nThe challenge with the traditional data warehouse and data lake\n\narchitecture is that data is stored in multiple stores and your data\n\nteam also needs to manage data access and data governance\n\ntwice. The lakehouse pattern uses unified storage which simplifies\n\ngovernance. The Databricks Lakehouse Platform provides a unified\n\ngovernance layer across all your data teams. Migrating to Databricks\n\nUnity Catalog provides data discovery, data lineage, role-based\n\nsecurity policies, table or row/column-level access control, and\n\ncentral auditing capabilities that make the data platform easy for\n\ndata stewards to confidently manage and secure data access to\n\nmeet compliance and privacy needs, directly on the lakehouse.\n\n\n\u0007Enable SSO, integrate with active directory and other IdPs\n\n\u0007Control data access to database objects using RBAC\n\n\u0007Enable audit logs to monitor user activities\n\n\n-----\n\nA-�it Log\n\nAcco-nt Level$\nUser Management\n\nCre�entials\n\n##### Centralized Governance\n\nACL Store\n\nAccess Control\n\n\nMetastore\n\nLineage Explorer\n\nData Explorer\n\n\n-----\n\n#### Team involvement\n\nPlan to educate and train your team iteratively throughout the\n\nmigration process. As new workloads are migrated, new teams will\n\ngain exposure to the lakehouse pattern. Plan to ramp up new team\n\nmembers as the migration process progresses, developing a data\n\nCenter of Excellence within the organization. Databricks provides\n\na cost effective platform for ad hoc work to be performed. A\n\nsandbox environment can be leveraged for teams to get exposure\n\nto Databricks technology and get hands-on experience. Databricks\n\nalso provides [learning path](https://databricks.com/learn/training/home) training for customers. Encourage teams\n\nto get hands-on experience relevant to their immediate tasks, gain\n\n\n#### Conclusion\n\nData warehouse migration touches many business areas and\n\nimpacts many teams, but the Databricks Lakehouse Platform\n\nsimplifies this transition, reduces risks and accelerates your ROI.\n\nThe Databricks Business Value Consulting team can work with you\n\nto quantify the impact of your use cases to both data and business\n\nteams. And the Databricks team of solution architects, professional\n\nservices, and partners are ready to help.\n\nReach out to your Databricks account team or send a message to\n\n[sales@databricks.com](mailto:sales%40databricks.com?subject=) to get started.\n\n\nexposure to new things and try new ideas.\n\n#### Additional resources\n\n[Migrate to Databricks](https://databricks.com/solutions/migration)\n\n[Modernize Your Data Warehouse](https://databricks.com/p/webinar/apj-modernize-your-data-warehouse)\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe. 
Founded by the original\n\ncreators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "**The**\n**Delta Lake**\n**Series**\n**Lakehouse**\n\nCombining the best elements of\ndata lakes and data warehouses\n\n\n-----\n\n###### Here’s what\n#### What’s \n###### you’ll find inside\n#### inside?\n\n\nThe Delta Lake Series of eBooks is published\n\n\nby Databricks to help leaders and practitioners\n\nunderstand the full capabilities of Delta Lake as\n\n\n**Introduction**\n**What is Delta Lake?**\n\n\nwell as the landscape it resides in. This eBook,\n\n\n**The Delta Lake Series — Lakehouse** , focuses\n\non lakehouse.\n\n\n**Chapter** **01**\n\n##### 02 Chapter\n 03 Chapter\n\n\nWhat Is\na Lakehouse?\n\nDiving Deep Into the Inner Workings\nof the Lakehouse and Delta Lake\n\nUnderstanding\nDelta Engine\n\n\n#### What’s next?\n\nAfter reading this eBook, you’ll not only\n\n\nunderstand what Delta Lake offers, but you’ll\n\nalso understand how its features result in\n\nsubstantial performance improvements.\n\n\n-----\n\n#### What is Delta Lake?\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\n\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\n\ncompatible with Apache Spark™ APIs.\n\nAt Databricks, we’ve seen how Delta Lake can bring reliability, performance and\n\nlifecycle management to data lakes. Our customers have found that Delta Lake\n\nsolves for challenges around malformed data ingestion, difficulties deleting data for\n\ncompliance, or issues modifying data for data capture.\n\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\n\nyour data lake and the rate that teams can leverage that data with a secure and\n\nscalable cloud service.\n\n\n-----\n\n**What Is a Lakehouse?**\n### CHAPTER 01\n\n\n-----\n\n**What Is a**\n**Lakehouse?**\n# 01\n\nOver the past few years at Databricks, we’ve seen a new data management architecture\n\nthat emerged independently across many customers and use cases: the **lakehouse.**\n\nIn this chapter, we’ll describe this new architecture and its advantages over previous\n\napproaches.\n\nData warehouses have a long history of decision support and business intelligence\n\napplications. Since its inception in the late 1980s, data warehouse technology\n\ncontinued to evolve and MPP architectures led to systems that were able to handle\n\nlarger data sizes.\n\nBut while warehouses were great for structured data, a lot of modern enterprises\n\nhave to deal with unstructured data, semi-structured data, and data with high variety,\n\nvelocity and volume. 
Data warehouses are not suited for many of these use cases, and\n\nthey are certainly not the most cost-efficient.\n\nAs companies began to collect large amounts of data from many different sources,\n\narchitects began envisioning a single system to house data for many different\n\nanalytic products and workloads.\n\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n\nin a variety of formats. While suitable for storing data, data lakes lack some critical\n\nfeatures: They do not support transactions, they do not enforce data quality, and their\n\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\n\n\n-----\n\n**A lakehouse combines the best elements**\n**of data lakes and data warehouses**\n\nA lakehouse is a new data architecture that combines the best elements of data lakes\n\nand data warehouses.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\n\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\n\nwarehouses.\n\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\n\n\nrequire systems for diverse data applications including SQL analytics, real-time\n\nmonitoring, data science and machine learning. Most of the recent advances in\n\nAI have been in better models to process unstructured data (text, images, video,\n\naudio), but these are precisely the types of data that a data warehouse is not\n\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\n\nwarehouses, and other specialized systems such as streaming, time-series, graph\n\nand image databases. Having a multitude of systems introduces complexity and,\n\nmore importantly, introduces delay as data professionals invariably need to move\n\nor copy data between different systems.\n\n\nLakehouses are enabled by a new system design: implementing similar data struc-\n\ntures and data management features to those in a data warehouse, directly on the\n\nkind of low-cost storage used for data lakes. They are what you would get if you had\n\nto redesign data warehouses in the modern world, now that cheap and highly reliable\n\nstorage (in the form of object stores) are available.\n\nA lakehouse has the following key features:\n\n- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n\nbe reading and writing data concurrently. Support for ACID transactions ensures\n\nconsistency as multiple parties concurrently read or write data, typically using\n\nSQL.\n\n\n-----\n\n- **\u0007Schema enforcement and governance:** The lakehouse should have a way to\n\nsupport schema enforcement and evolution, supporting DW schema paradigms\n\nsuch as star/snowflake-schemas. The system should be able to reason about data\n\nintegrity, and it should have robust governance and auditing mechanisms.\n\n- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n\nreduces staleness and improves recency, reduces latency and lowers the cost of\n\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n\ncompute use separate clusters, thus these systems are able to scale to many more\n\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\n\nthis property.\n\n- **\u0007Openness:** The storage formats they use are open and standardized, such as\n\nParquet, and they provide an API so a variety of tools and engines, including\n\nmachine learning and Python/R libraries, can efficiently access the data directly.\n\n- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n\nThe lakehouse can be used to store, refine, analyze and access data types needed\n\nfor many new data applications, including images, video, audio, semi-structured\n\ndata, and text.\n\n- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n\nanalytics. Multiple tools might be needed to support all these workloads, but they all\n\nrely on the same data repository.\n\n- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n\nSupport for streaming eliminates the need for separate systems dedicated to\n\nserving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\n\nfeatures. Tools for security and access control are basic requirements. Data governance\n\ncapabilities including auditing, retention and lineage have become essential particularly\n\nin light of recent privacy regulations. Tools that enable data discovery such as data\n\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n\nfeatures only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**\n\n**Abstract**\n\nCloud object stores such as Amazon S3 are some of the largest and most\n\ncost-effective storage systems on the planet, making the main attractive\n\ntarget to store large data warehouses and data lakes. Unfortunately, their\n\nimplementation as key-value stores makes it difficult to achieve ACID\n\ntransactions and high performance: Metadata operations, such as listing\n\nobjects, are expensive, and consistency guarantees are limited. In this paper,\n\nwe present Delta Lake, an open source ACID table storage layer over cloud\n\nobject stores initially developed at Databricks. Delta Lake uses a transaction log\n\nthat is compacted into Apache Parquet format to provide ACID properties, time\n\ntravel, and significantly faster metadata operations for large tabular data sets\n\n(e.g., the ability to quickly search billions of table partitions for those relevant\n\nto a query). It also leverages this design to provide high-level features such\n\nas automatic data layout optimization, upserts, caching, and audit logs. Delta\n\nLake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and\n\nother systems. 
Delta Lake is deployed at thousands of Databricks customers\n\nthat process exabytes of data per day, with the largest instances managing\n\nexabyte-scale data sets and billions of objects.\n\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n\nZhu, Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja\n\nŁuszczak, Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n\nBoncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n\nRead the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n\n\n-----\n\n**Some early examples**\n\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n\nMicrosoft’s Azure Synapse Analytics service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n\nenables a similar lakehouse pattern. Other managed services such as BigQuery and\n\nRedshift Spectrum have some of the lakehouse features listed above, but they are\n\nexamples that focus primarily on BI and other SQL applications.\n\nCompanies that want to build and implement their own systems have access to open\n\nsource file formats (Delta Lake, Apache Iceberg, Apache Hudi) that are suitable for\n\nbuilding a lakehouse.\n\nMerging data lakes and data warehouses into a single system means that data teams\n\ncan move faster as they are able to use data without needing to access multiple systems.\n\nThe level of SQL support and integration with BI tools among these early lakehouses\n\nis generally sufficient for most enterprise data warehouses. Materialized views and\n\n\nA note about technical building blocks. While distributed file systems can be\n\nused for the storage layer, object stores are more commonly used in lakehouses.\n\nObject stores provide low-cost, highly available storage that excels at massively\n\nparallel reads — an essential requirement for modern data warehouses.\n\n**From BI to AI**\n\nThe lakehouse is a new data management architecture that radically simplifies\n\nenterprise data infrastructure and accelerates innovation in an age when\n\nmachine learning is poised to disrupt every industry. In the past, most of the\n\ndata that went into a company’s products or decision-making was structured\n\ndata from operational systems, whereas today, many products incorporate\n\nAI in the form of computer vision and speech models, text mining and others.\n\nWhy use a lakehouse instead of a data lake for AI? A lakehouse gives you data\n\nversioning, governance, security and ACID properties that are needed even for\n\nunstructured data.\n\n\nstored procedures are available, but users may need to employ other mechanisms that\n\n\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\n\nimportant for “lift and shift scenarios,” which require systems that achieve semantics\n\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\n\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n\nlibraries) for non-BI workloads like data science and machine learning. 
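As a small illustration of using the same governed tables for non-BI work, the sketch below pulls a Delta table into pandas for exploratory data science. The catalog, schema and column names are hypothetical.

```python
# Minimal sketch (table and column names are hypothetical): feed a data science
# workload directly from a Delta table, without a separate copy of the data.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

features_pdf = (
    spark.read.table("ml.features.customer_features")
    .select("tenure", "monthly_spend", "churned")
    .toPandas()
)
print(features_pdf.describe())
```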
Data\n\nexploration and refinement are standard for many analytic and data science\n\napplications. Delta Lake is designed to let users incrementally improve the quality of\n\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized\n\nsystems (such as data warehouses) that have years of investments and real-\n\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n\nnotebooks) over others so lakehouses will also need to improve their UX and their\n\nconnectors to popular tools so they can appeal to a variety of personas. These\n\nand other issues will be addressed as the technology continues to mature and\n\ndevelop. Over time, lakehouses will close these gaps while retaining the core\n\nproperties of being simpler, more cost-efficient and more capable of serving\n\ndiverse data applications.\n\n\ndata in their lakehouse until it is ready for consumption.\n\n\n-----\n\n**Diving Deep Into the Inner Workings**\n**of the Lakehouse and Delta Lake**\n\n### CHAPTER 02\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n# 02\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n\nadopting the lakehouse pattern. The blog created a massive amount of interest\n\nfrom technology enthusiasts. While lots of people praised it as the next-generation\n\ndata architecture, some people thought the lakehouse is the same thing as\n\nthe data lake. Recently, several of our engineers and founders wrote a research\n\npaper that describes some of the core technological challenges and solutions that\n\nset the lakehouse architecture apart from the data lake, and it was accepted and\n\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\n\ncan read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\n\nthey would have said faster horses.” The crux of this statement is that people often\n\nenvision a better solution to a problem as an evolution of what they already know\n\nrather than rethinking the approach to the problem altogether. In the world of data\n\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\n\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\n\nobject stores like Amazon S3 have become some of the largest and most cost-\n\neffective storage systems in the world, which makes them an attractive platform to\n\nstore data warehouses and data lakes. However, their nature as key-value stores\n\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\n\nperformance is hampered by expensive metadata operations (e.g., listing objects)\n\nand limited consistency guarantees.\n\nBased on the characteristics of cloud object stores, three approaches have emerged.\n\n**1. Data lakes**\n\nThe first is directories of files (i.e., data lakes) that store the table as a collection\n\nof objects, typically in columnar format such as Apache Parquet. 
It’s an attractive\n\napproach because the table is just a group of objects that can be accessed from\n\na wide variety of tools without a lot of additional data stores or systems. However,\n\nboth performance and consistency problems are common. Hidden data corruption\n\nis common due to failed transactions, eventual consistency leads to inconsistent\n\nqueries, latency is high, and basic management capabilities like table versioning and\n\naudit logs are unavailable.\n\n**2. Custom storage engines**\n\nThe second approach is custom storage engines, such as proprietary systems built for\n\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\n\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\n\nservice that’s able to provide a single source of truth. However, all I/O operations need\n\nto connect to this metadata service, which can increase cloud resource costs and\n\nreduce performance and availability. Additionally, it takes a lot of engineering work to\n\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\n\nand PyTorch, which can be challenging for data teams that use a variety of computing\n\nengines on their data. Engineering challenges can be exacerbated by unstructured\n\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. Finally, and most egregiously, the proprietary metadata service locks\n\ncustomers into a specific service provider, leaving customers to contend with\n\nconsistently high prices and expensive, time-consuming migrations if they decide to\n\nadopt a new approach later.\n\n**3. Lakehouse**\n\nWith Delta Lake, an open source ACID table storage layer atop cloud object stores,\n\nwe sought to build a car instead of a faster horse with not just a better data store,\n\nbut a fundamental change in how data is stored and used via the lakehouse. A\n\nlakehouse is a new architecture that combines the best elements of data lakes and\n\ndata warehouses. Lakehouses are enabled by a new system design: implementing\n\nsimilar data structures and data management features to those in a data warehouse,\n\ndirectly on the kind of low-cost storage used for data lakes. They are what you would\n\nget if you had to redesign storage engines in the modern world, now that cheap and\n\nhighly reliable storage (in the form of object stores) are available.\n\nDelta Lake maintains information about which objects are part of a Delta table in an\n\nACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n\nthe cloud object store. This design allows clients to update multiple objects at once,\n\nreplace a subset of the objects with another, etc., in a serializable manner that still\n\nachieves high parallel read/write performance from the objects. The log also provides\n\nsignificantly faster metadata operations for large tabular data sets. Additionally, Delta\n\nLake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n\nsnapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n\ncaching, and audit logs. 
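Two of the capabilities just listed, time travel and upserts, look roughly like the following in practice. This is a hedged sketch: the table path, update source and join key are hypothetical, and it assumes the delta-spark package is available.

```python
# Minimal sketch (paths and column names are hypothetical): time travel on a
# Delta table, and an upsert of a batch of changes via MERGE.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Time travel: read the table as of an earlier version to inspect or roll back.
v0 = spark.read.format("delta").option("versionAsOf", 0).load("/mnt/delta/customers")

# Upsert: merge a batch of updates into the current table state.
updates = spark.read.json("/mnt/raw/customer_updates/")
target = DeltaTable.forPath(spark, "/mnt/delta/customers")
(target.alias("t")
    .merge(updates.alias("u"), "t.customer_id = u.customer_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute())
```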
Together, these features improve both the manageability and\n\nperformance of working with data in cloud object stores, ultimately opening the door\n\nto the lakehouse architecture that combines the key features of data warehouses and\n\ndata lakes to create a better, simpler data architecture.\n\n\n-----\n\nToday, Delta Lake is used across thousands of Databricks customers, processing\n\nexabytes of structured and unstructured data each day, as well as many organizations\n\nin the open source community. These use cases span a variety of data sources and\n\napplications. The data types stored include Change Data Capture (CDC) logs from\n\nenterprise OLTP systems, application logs, time-series data, graphs, aggregate\n\ntables for reporting, and image or feature data for machine learning. The applications\n\ninclude SQL workloads (most commonly), business intelligence, streaming, data\n\nscience, machine learning and graph analytics. Overall, Delta Lake has proven itself to\n\nbe a good fit for most data lake applications that would have used structured storage\n\nformats like Parquet or ORC, and many traditional data warehousing workloads.\n\nAcross these use cases, we found that customers often use Delta Lake to significantly\n\nsimplify their data architecture by running more workloads directly against cloud\n\nobject stores, and increasingly, by creating a lakehouse with both data lake and\n\ntransactional features to replace some or all of the functionality provided by message\n\nqueues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n\nAmazon Redshift).\n\n**[In the research paper,](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **the authors explain:**\n\n- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance\n\nThrough the paper, you’ll gain a better understanding of Delta Lake and how it\n\nenables a wide range of DBMS-like performance and management features for data\n\nheld in low-cost cloud storage. As well as how the Delta Lake storage format and\n\naccess protocols make it simple to operate, highly available, and able to deliver high-\n\nbandwidth access to the object store.\n\n\n-----\n\n**Understanding Delta Engine**\n\n### CHAPTER 03\n\n\n-----\n\n**Understanding**\n**Delta Engine**\n# 03\n\nThe Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n\nengine to take advantage of modern CPU architecture with optimizations to Spark\n\n3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n\nRuntime 7.0. Together, these features significantly accelerate query performance on\n\ndata lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n\nadopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n\n**Scaling execution performance**\n\nOne of the big hardware trends over the last several years is that CPU clock speeds\n\nhave plateaued. 
The reasons are outside the scope of this chapter, but the takeaway\n\nis that we have to find new ways to process data faster beyond raw compute power.\n\nOne of the most impactful methods has been to improve the amount of data that can\n\nbe processed in parallel. However, data processing engines need to be specifically\n\narchitected to take advantage of this parallelism.\n\nIn addition, data teams are being given less and less time to properly model data as\n\nthe pace of business increases. Poorer modeling in the interest of better business\n\nagility drives poorer query performance. Naturally, this is not a desired state, and\n\norganizations want to find ways to maximize both agility and performance.\n\n\n-----\n\n**Announcing Delta Engine for**\n**high-performance query execution**\n\nDelta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n\nworkloads through three components: an improved query optimizer, a caching\n\nlayer that sits between the execution layer and the cloud object storage, and a native\n\nvectorized execution engine that’s written in C++.\n\nThe improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n\noptimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n\nstatistics to deliver up to 18x increased performance in star schema workloads.\n\nDelta Engine’s caching layer automatically chooses which input data to cache for the\n\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\n\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\n\ndata teams today is the native execution engine, which we call Photon. (We know.\n\n\n-----\n\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\nDatabricks has been built to maximize the performance from the new changes in\n\nmodern cloud hardware. It brings performance improvements to all workload types\n\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\n\nBy linking these three components together, we think it will be easier for customers\n\nto understand how improvements in multiple places within the Databricks code\n\naggregate into significantly faster performance for analytics workloads on data lakes.\n\nWe’re excited about the value that Delta Engine delivers to our customers. 
While the\n\ntime and cost savings are already valuable, its role in the lakehouse pattern supports\n\nnew advances in how data teams design their data architectures for increased\n\nunification and simplicity.\n\nFor more information on the Delta Engine, watch this keynote address from\n\n[Spark + AI Summit 2020:](https://www.youtube.com/watch?v=o54YMz8zvCY) [Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n\n\n-----\n\n## What’s next?\n\n\nNow that you understand Delta Lake and how its features can improve\n\nperformance, it may be time to take a look at some additional resources.\n\n**Data + AI Summit Europe 2020 >**\n\n- [Photon Technical Deep Dive: How to Think Vectorized](https://databricks.com/session_eu20/photon-technical-deep-dive-how-to-think-vectorized)\n\n\n**Explore subsequent eBooks in the collection >**\n\n- The Delta Lake Series — Fundamentals and Performance\n\n- The Delta Lake Series — Features\n\n- The Delta Lake Series — Streaming\n\n- The Delta Lake Series — Customer Use Cases\n\n\n\n- [MLflow, Delta Lake and Lakehouse Use Cases Meetup and AMA](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup)\n\n- [Common Strategies for Improving Performance on Your Delta Lakehouse](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n\n\n\n- [Achieving Lakehouse Models With Spark 3.0](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0)\n\n- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n\n\n**Do a deep dive into Delta Lake >**\n\n- [Analytics on the Data Lake With Tableau and the Lakehouse Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n\n- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n\n\n**Vodcasts and podcasts >**\n\n\n\n- [Welcome to Lakehouse. Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)\n\n- [Data Brew by Databricks | Season 1: Lakehouses](https://databricks.com/discover/data-brew)\n\n\n**[Try Databricks for free >](https://databricks.com/product/delta-lake-on-databricks)**\n**[Learn more >](https://databricks.com/product/delta-lake-on-databricks)**\n\n\n\n- [Data Alone Is Not Enough: The Evolution of Data Architectures](https://a16z.com/2020/10/22/data-alone-is-not-enough-the-evolution-of-data-architectures/)\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**EBOOK**\n\n# All Roads Lead to the Lakehouse\n\n#### A deep dive into data ingestion with the lakehouse\n\n\n-----\n\n## Contents\n\nIntroduction...................................................................................................................................................................................................................... **03**\n\nLife of a Data Engineer ............................................................................................................................................................................................... 
**04**\n\nIngesting From Cloud Object Stores...................................................................................................................................................................... **05**\n\nCOPY INTO ......................................................................................................................................................................................................... **06**\n\nAuto Loader ....................................................................................................................................................................................................... **09**\n\nIngesting Data From External Applications .......................................................................................................................................................... **13**\n\nPartner Connect ............................................................................................................................................................................................... **13**\n\n\n-----\n\n### Introduction\n\nOrganizations today are inundated with data siloed across various on-premises\napplication systems, databases, data warehouses and SaaS applications. This\nfragmentation makes it difficult to support new use cases for analytics or machine\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\narchitecture built on top of Delta Lake, an open format storage layer.\n\nThe first thing data engineers need to do to support the lakehouse architecture is to\nefficiently move data from various systems into their lakehouse. Ingesting data is a\ncritical first step in the data engineering and management lifecycle.\n\n\n-----\n\n### Life of a Data Engineer\n\nThe primary focus of data engineers is to provide timely and reliable data to downstream\n\ndata teams at an organization. Requests for data can come from a variety of teams, and for\n\n\na variety of data types. For example:\n\n**•** Marketing team requests for Facebook and Google ad data in order to analyze spend and\n\nbetter allocate their budget for ads\n\n**•** Security team looking to get access to a table with low latency security data from Kafka,\n\nin order to run rules to detect intrusions into the network\n\n**•** Sales operations requesting customer data from Salesforce to enrich existing tables\n\n**•** Finance team hoping to find a way to automatically ingest critical data from Google\n\nSheets or transaction data from AWS Kinesis\n\nIn each of these common scenarios, data engineers must create usable and easily\n\nqueryable tables from semi-structured and unstructured data. Beyond writing queries to\n\nretrieve and transform all this data, the data engineering team must also be concerned\n\nwith performance, because running these queries on an ongoing basis can be a big load on\n\nthe system.\n\nData engineers face the challenge of constant requests and ongoing business\n\n\n###### W H AT I S \n D E LTA L A K E ?\n\nBefore thinking about ingestion into Delta Lake, it’s important to\n\nunderstand why ingesting into Delta Lake is the right solution in\n\nthe first place. [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format data management\n\nlayer that brings data warehouse capabilities to your open data\n\nlake. Across industries, enterprises have enabled true collaboration\n\namong their data teams with a reliable single source of truth\n\nenabled by Delta Lake. 
By delivering quality, reliability, security and\n\nperformance on your data lake — for both streaming and batch\n\noperations — Delta Lake eliminates data silos and makes analytics\n\naccessible across the enterprise. With Delta Lake, customers can\n\nbuild a cost-efficient, highly scalable lakehouse that eliminates\n\ndata silos and provides self-serving analytics to end users.\n\n\nrequirements, as well as an ever-changing ecosystem. As business requirements change,\n\nso do the requirements around schemas, necessitating custom code to handle the\n\nchanges. With all of these challenges, the work of a data engineer is extremely critical, and\n\nincreasingly complex, with many steps involved before getting data to a state where it can\n\nactually be queried by the business stakeholders. So how do data engineers get the data\n\nthat each of these teams need at the frequency, with the freshness, and in the format\n\nrequired?\n\n\n-----\n\n### Ingesting From Cloud Object Stores\n\nThere are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n\ncloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. Typically, customers are looking to migrate\n\nexisting tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,\n\n[COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) , and [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . We will focus on Auto Loader and COPY INTO here.\n\n\n**Auto Loader**\n\nAuto Loader is an optimized data ingestion tool that incrementally and efficiently\n\nprocesses new data files as they arrive in cloud storage with minimal DevOps effort. You\n\njust need to provide a source directory path and start a streaming job. The new structured\n\nstreaming source, called “cloudFiles”, will automatically set up file notification services that\n\n\n**COPY INTO**\n\nCOPY INTO is a SQL command that allows you to perform batch file ingestion into Delta\n\nLake. COPY INTO is a command that ingests files with exactly-once semantics, best used\n\nwhen the input directory contains thousands of files or fewer, and the user prefers SQL.\n\nCOPY INTO can be used over JDBC to push data into Delta Lake at your convenience.\n\n\nsubscribe file events from the input directory and process new files as they arrive, with the\n\noption of also processing existing files in that directory. Auto Loader has interfaces through\n\nPython and Scala, and can be used with SQL through Delta Live Tables.\n\n\n-----\n\n##### COPY INTO\n\n\nCOPY INTO is a powerful yet simple SQL command that allows you to perform batch file\n\ningestion into Delta Lake and perform many of the use cases outlined in this section. COPY\n\nINTO can be run once, in an ad hoc manner, and can be scheduled through Databricks jobs.\n\n```\nFILEFORMAT = CSV\nFORMAT_OPTIONS (‘header’ = ‘true’)\n\n```\n\nWhile COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n\n\nevents by using cloud functions such as AWS Lambda or through orchestrators like Apache\n\nAirflow. COPY INTO supports incremental appends and simple transformations.\n\nCOPY INTO is a great command to use when your source directory contains a small number\n\nof files (i.e., thousands of files or less). 
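As a rough sketch that is not taken from this eBook, the same COPY INTO pattern can also be submitted from a Python notebook or a scheduled Databricks job through `spark.sql`; the table name and bucket path below are placeholders.

```
# Hypothetical sketch: running COPY INTO from Python. `spark` is the SparkSession
# that Databricks notebooks and jobs provide; table and path names are placeholders.
result = spark.sql("""
    COPY INTO my_catalog.my_schema.my_delta_table
    FROM 's3://my-bucket/landing/csv_files'
    FILEFORMAT = CSV
    FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true')
""")
# COPY INTO is idempotent, so re-running this cell only loads newly arrived files.
result.show()
```

Because COPY INTO keeps track of the files it has already loaded, scheduling a cell like this as a job gives you simple incremental ingestion without any extra bookkeeping.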
To ingest a larger number of files, we recommend\n\nAuto Loader, which we will cover later in this eBook.\n\n**Common Use Cases for COPY INTO**\n\n**Ingesting data to a new Delta table**\n\nA common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n\ntable. To copy data into a new Delta table, users can use CREATE TABLE command first,\n\nfollowed by COPY INTO.\n\nStep 1: `CREATE TABLE` `my_table (id` `INT` `, name STRING, age` `INT` `);`\nStep 2 1 : `COPY INTO` `my_table`\n```\n FROM ‘s3://my_bucket/my_path’ WITH (\n CREDENTIAL (\n AWS_ACCESS_KEY = ‘*****’,\n AWS_SECRET_KEY = ‘*****’,\n AWS_SESSION_TOKEN = ‘*****’\n )\n ENCRYPTION (\n TYPE = ‘AWS_SSE_C’,\n MASTER_KEY = ‘*****’\n\n```\n\nThe code block above covers the AWS temporary in-line credential format. When you use\n\nin-line credentials in Azure and AWS, the following parameters are required for each type of\n\ncredential and encryption:\n\n\n|Credential Name|Required Parameters|\n|---|---|\n|AWS temporary credentials|AWS_ACCESS_KEY AWS_SECRET_KEY|\n||AWS_SESSION_TOKEN|\n|Azure SAS token|AZURE_SAS_TOKEN|\n\n\n\n\n\n|Encryption Name|Required Parameters|\n|---|---|\n|AWS server-side encryption with customer-provided encryption key|TYPE = ‘AWS_SSE_C’ MASTER_KEY|\n|Azure client-provided encryption key|ATYPE = ‘AZURE_CSE’ MASTER_KEY|\n\n\n**Appending data to your Delta table**\n\nTo append data to a Delta table, users can leverage the COPY INTO command. COPY INTO\n\nis a powerful SQL command that is idempotent and incremental. When using COPY INTO,\n\nusers point to a location of files, and once those files are ingested, Delta Lake will keep\n\n1 If you only have temporary access to a cloud object store, you can use temporary in-line credentials to ingest data from\nthe cloud object store. When you are an admin or with ANY FILE access, and the instance profile has been set for the\ncloud object store, you do not need to specify credentials in-line for COPY INTO.\n\n\n-----\n\ntrack of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n\nget idempotency with COPY INTO, which means users are prevented from ingesting the\n\nsame data twice to the same table.\n```\n COPY INTO table_identifier\n FROM [ file_location | ( SELECT expression_list FROM file_location)]\n FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n [ FILES = [file_name [,...] | PATTERN = ‘regex_pattern’ ]\n [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n\n```\nOne of the main benefits of COPY INTO is that users don’t have to worry about providing a\n\nschema, because the schema is automatically inferred from your data files. Here is a very\n\nsimple example of how you would ingest data from CSV files that have headers, where you\n\nleave the tool to infer the schema and the proper data types. 
It’s as simple as that.\n```\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n\n```\n**Using COPY INTO without an existing table** 2\n\n```\n CREATE TABLE my_delta_table (dummy string);\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n FORMAT_OPTIONS (\n ‘header’ = ‘true’ ,\n ‘inferSchema’ = ‘true’ ,\n ‘mergeSchema’ = ‘true’\n )\n COPY_OPTIONS ( ‘overwrite’ = ‘true’ , ‘overwriteSchema’ = ‘true’ )\n\n```\n**Ingesting a CSV file without headers**\n\nIf you are looking to ingest a CSV file that doesn’t have headers, columns will be named as\n\n_c0 or _c1, with the index of the column. You can use the double colon syntax to cast the\n\ndata type that you want and then alias these columns to whatever you want to call them.\n```\n COPY INTO my_delta_table\n FROM ( SELECT\n _c0::int as key,\n _c1::double value,\n _c2::timestamp event_time\n FROM ‘s3://my-bucket/path/to/csv_files’ )\n FILEFORMAT = CSV\n\n```\n\nIn the most common case, in order to use COPY INTO, a table definition is required.\n\nHowever, if you would like to get started quickly and don’t have an existing table or require\n\na specific schema, you can create your table with a dummy schema. Then, once you run\n\nCOPY INTO, you can overwrite the table and overwrite the schema. COPY INTO will actually\n\ninfer the data types, and then change your Delta table to have the required schema.\n\n2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n\n\n-----\n\n**Evolving schema over time for CSV files** 3\n\nWhen ingesting CSV files that have a different number of columns than your existing table,\n\nyou can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n\nas FORMAT_OPTIONS and COPY_OPTIONS. FORMAT_OPTIONS applies to the source data.\n\nOnce “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n\nfiles and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n\nwhen you’re running the COPY INTO command. When “mergeSchema” is provided as a\n\ncopy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n\nevolution only allows the addition of new columns. Data type changes for existing columns\n\nare not supported.\n```\n COPY INTO my_delta_table\n FROM (SELECT\n _C0::int as key,\n _C1::double value,\n _C2::timestamp event_time,\n ...\n FROM ‘s3://my-bucket/path/to/csv_files’ )\n FILEFORMAT = CSV\n FORMAT_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n COPY_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n\n```\n\n**Fixing bad data**\n\nIf you find that there is a mistake in the source data file and some of the data you ingested\n\nis bad, you can use RESTORE on your Delta table and set it to the timestamp or version of\n\nthe Delta table that you want to roll back to (e.g., to restore to yesterday’s data). Then you\n\ncan rerun your COPY INTO command.\n\nAlternatively, if running a RESTORE is not possible, COPY INTO supports reloading files by\n\nthe use of the “force” copy option. 
You can manually remove the old data from your Delta\n\nLake table by running a DELETE operation and then using COPY INTO with “force” = “true”.\n\nYou can use the PATTERN keyword to provide a file name pattern, or you can specify the file\n\nnames with the FILES keyword to reload a subset of files in conjunction with “force”.\n```\n RESTORE my_delta_table TO TIMESTAMP AS OF date_sub(current_date(),\n 1);\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n PATTERN = ‘2021-09-08*.csv’\n FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n COPY_OPTIONS ( ‘force’ = ‘true’ )\n\n```\n3 Limitation: schema evolution with “mergeSchema” in COPY_OPTIONS does not work in Databricks SQL workspace or\nclusters enabled with table ACLs.\n\n\n-----\n\n##### Auto Loader\n\n\nWhile COPY INTO can solve a lot of the key use cases our customers face, due to its\n\nlimitations (scalability), there are many scenarios where we recommend Auto Loader\n\nfor data ingestion. Auto Loader is a data source on Databricks that incrementally and\n\nefficiently processes new data files as they arrive in cloud storage with minimal DevOps\n\neffort. Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n\nAuto Loader is an incremental streaming source that provides exactly-once ingestion\n\nguarantees. It keeps track of which files have been ingested using a durable key-value store.\n\nIt can discover new files very efficiently and is extremely scalable. Auto Loader has been\n\nbattle tested. We have seen customers running Auto Loader on millions of files an hour, and\n\npetabytes of data per day.\n\nTo use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating\n\nthat you will use Auto Loader to load files from the cloud object stores. Next, you specify\n\nthe format of the file — for example, JSON — as an option to Auto Loader, and you specify\n\nwhere to load it from.\n```\n df = spark.readStream.format( “cloudFiles” )\n .option( “cloudfiles.format” , “json” )\n .load( “/path/to/table” )\n\n```\nUnder the hood, when data lands in your cloud storage, Auto Loader discovers files either\n\nthrough directory listing or file notifications. Given permissions to the underlying storage\n\nbucket or container, Auto Loader can list the directory that you want to load data from\n\nin an efficient and scalable manner and load data immediately. Alternatively, Auto Loader\n\ncan also automatically set up file notifications on your storage account, which allows it\n\n\nfrom queues, deduplicate these notifications using its key-value store and then process\n\nthe underlying files. If there are any failures, Auto Loader will replay what hasn’t been\n\nprocessed, giving you exactly-once semantics.\n\nDirectory listing mode is very easy to get started with. If your files are uploaded to your\n\ncloud storage system in a lexicographical order, Auto Loader will optimize the discovery of\n\nfiles by starting directory listing from the latest uploaded files, saving you both time and\n\nmoney. If files cannot be uploaded in a lexicographical order and you need Auto Loader\n\nto scale to high volumes, Databricks recommends using the file notification mode. 
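As a minimal sketch (not one of the eBook's own examples), file notification mode can be requested explicitly with the cloudFiles.useNotifications option; the paths below are placeholders.

```
# Hypothetical sketch: asking Auto Loader to discover files through notifications
# instead of directory listing. Paths are placeholders; `spark` is the SparkSession
# provided by Databricks notebooks.
df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "json")
      .option("cloudFiles.useNotifications", "true")   # use queue + notification services
      .option("cloudFiles.schemaLocation", "/tmp/schema_location")
      .load("s3://my-bucket/landing/json_files"))
```

Given the appropriate permissions on the storage account, Auto Loader provisions the notification and queue services for you when this option is set.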
Cloud\n\nservices such as AWS Kinesis Firehose, AWS DMS and Azure Data Factory can be configured\n\nto upload files in a lexical order, typically by providing the upload time of records in the file\n\npath, such as /base/path/yyyy/MM/dd/HH/file.format.\n\n**Common Use Cases for Auto Loader**\n\n**New to Auto Loader**\n\nAs a new user to the Databricks Lakehouse, you’ll want to ingest data from cloud object\n\nstores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n\nexample using Python to demonstrate the ease and flexibility of Auto Loader with a few\n\ndefined options. You can run the code in a notebook.\n```\n stream = spark.readStream \\\n .format( “cloudFiles” ) \\\n .option( “cloudFiles.format” , “csv” ) \\\n .option( “cloudFiles.schemaLocation” , schema_location) \\\n .load(raw_data_location)\n\n```\n\nto efficiently discover newly arriving files. When a file lands in file notification mode, the\n\ncloud storage system sends a notification to a queuing system. For example, in AWS, S3\n\nwill send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n\nOn Google, it’ll be sent to Pub/Sub. Auto Loader can then fetch these event notifications\n\n\n-----\n\nIn order to write to a Delta table from the stream, follow the example below:\n```\n stream.writeStream \\\n .option( “mergeSchema” , “true” ) \\\n .option( “checkpointLocation” , checkpoint_location) \\\n .start(target_delta_table_location)\n\n```\n**Migrating to Auto Loader**\n\nAs a Spark user, you may be using an existing Spark structured streaming to process data.\n\nTo migrate to Auto Loader, all a user needs to do is take existing streaming code and turn\n\ntwo lines of it into ‘cloudFiles’, specifying the file format within an option.\n\n\n**Migrating a livestreaming pipeline**\n\nMigrating a livestreaming pipeline can be challenging, but with Auto Loader, as with COPY\n\nINTO, you can specify a timestamp when the source files are updated or created and Auto\n\nLoader will ingest all modified data after that point.\n```\n df = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.format” , “json” )\n .option( “modifiedAfter” , “2021-09-09 00:00:00” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n**Schema inference and evolution**\n\nAuto Loader provides schema inference and management capabilities. With a schema\n\nlocation specified, Auto Loader can store the changes to the inferred schema over time. For\n\nfile formats like JSON and CSV, where the schemas can get fuzzy, schema inference on Auto\n\nLoader can automatically infer data types or treat everything as a string.\n\nWhen data does not match your schema (e.g., an unknown column or format), Auto Loader\n\nhas a data rescue capability that will “rescue” all data in a separate column, stored as a\n\nJSON string, to investigate later. 
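To make the rescue behavior concrete, here is a hedged sketch (not from the eBook) that reads with a schema location and then filters for rescued records; the paths are placeholders.

```
# Hypothetical sketch: schema inference with a schema location, plus a look at the
# _rescued_data column where non-conforming records are captured as JSON strings.
# Paths are placeholders; `spark` is the SparkSession in a Databricks notebook.
df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "csv")
      .option("cloudFiles.schemaLocation", "/tmp/schema_location")
      .load("s3://my-bucket/landing/csv_files"))

rescued = df.filter("_rescued_data IS NOT NULL")  # rows that did not match the schema
```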
See [rescued data column](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader-schema.html#rescued-data-column) for more details.\n\nAuto Loader supports three schema evolution modes: add new columns as they are\n\ndiscovered, fail if an unexpected column is seen, or rescue new columns.\n\n```\ndf = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.\nformat” , “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n```\ndf = spark.readStream\n .format( “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n\nOnce it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n\nLoader can scale to trillions of files, unlike the open-source file streaming source. One of\n\nthe ways that Auto Loader does this is with asynchronous backfills. Instead of needing\n\nto discover files first, then plan, Auto Loader discovers and processes files concurrently,\n\nmaking it much more efficient and leading to cost reductions in compute resources.\n\n\n-----\n\n**Fixing a file that was processed with Auto Loader**\n\nTo fix a file that was already processed, Auto Loader supports an option called\n\n‘allowOverwrites’. With this option, Auto Loader can re-ingest and reprocess a file with a\n\nnew timestamp. If you want to enable this option in an existing Auto Loader stream, you\n\nneed to stop and restart the Auto Loader stream with the enabled option.\n```\n df = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.format” , “json” )\n .schema(schema)\n .option( “cloudFiles.allowOverwrites” , “true” )\n .options(format_options)\n .load( “/path/to/table” )\n\n```\n**Discover missing data**\n\nWhile event notification is a very scalable method to collect all data, it relies on cloud\n\nservices, which are distributed systems and are not always reliable. With Auto Loader, you\n\ncan additionally specify a backfill interval, where Auto Loader will perform asynchronous\n\nbackfills at whatever interval you set up. This can be enabled with a once trigger,\n\n```\n df = spark.readStream\n .format(“cloudFiles”)\n .option(“cloudFiles.format”, “json”)\n .schema(schema)\n .option( “cloudFiles.backfillInterval” , “1 week” )\n .options(format_options)\n .load(“/path/to/table”)\n .writeStream\n .trigger(Trigger.AvailableNow())\n .option(“checkpointLocation”, checkpointDir)\n .start()\n\n```\nThe trigger tells Auto Loader how frequently to process incoming data. A processing time\n\ntrigger will have Auto Loader run continuously and schedule micro-batches at the trigger\n\ninterval which you have set. The “Once” and “AvailableNow” triggers instruct Auto Loader to\n\nprocess all new data that has been added until the start of your application. Once the data\n\nis processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n\nprocess all the new data in a single micro-batch, which requires it to first discover all the\n\nnew files. With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n\nand perform rate limiting, which makes it a preferable alternative to Trigger Once.\n\n\nprocessing time trigger and available now trigger. 
The following example shows how to use\n\nbackfill internal and trigger availableNow together:\n\n\n-----\n\n**Using Auto Loader in SQL with Delta Live Tables**\n\nDelta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n\nframework to develop, test, monitor, manage and operationalize data pipelines at scale to\n\ndrive insights for data science, machine learning and analytics. Auto Loader is available in\n\nDelta Live Tables.\n\n```\nCREATE INCREMENTAL LIVE TABLE\n autoloader_test\nAS\nSELECT\n *,\n id + id2 AS new_id\nFROM\n CLOUD_FILES (\n “some/cloud/path” , – the path to the data\n “json” – the file format\n );\n\n```\n\n**Live Tables understands**\n\n**and coordinates data flow**\n\n**between your queries**\n\n\n-----\n\n### Ingesting Data From External Applications\n\nWhile Auto Loader and COPY INTO are powerful tools, not all data is available as files\n\nin cloud object stores. In order to enable a lakehouse, it is critical to incorporate all of\n\nyour data and break down the silos between sources and downstream teams. To do this,\n\ncustomers need to discover and connect a broad set of data, BI and AI tools, and systems\n\nto the data within their lakehouse.\n\n##### Partner Connect\n\nHistorically, stitching multiple enterprise tools and data sources together has been a burden\n\non the end user, making it very complicated and expensive to execute at any scale. Partner\n\nConnect solves this challenge by making it easy for you to integrate data, analytics and AI\n\ntools directly within their Databricks Lakehouse. It also allows you to discover new, pre-\n\nvalidated solutions from Databricks partners that support your expanding analytics needs.\n\nTo ingest into the lakehouse, select the partner tile in Partner Connect via the left\n\nnavigation bar in Databricks. Partner Connect will automatically configure resources such\n\nas clusters, tokens and connection files for you to connect with your data ingestion tools\n\nof choice. You can finish signing up for a trial account on the partner’s website or directly\n\nlog in if you already used Partner Connect to create a trial account. Once you log in, you will\n\nsee that Databricks is already configured as a destination in the partner portal and ready\n\nto be used.\n\n\n-----\n\n**Common Use Case for Partner Connect**\n\n**Ingest Salesforce data via Fivetran into Delta Lake**\n\nClicking on the Fivetran tile in Partner Connect starts an automated workflow between\n\nthe two products. Databricks automatically provisions a SQL endpoint and associated\n\ncredentials for Fivetran to interact with, and passes the user’s identity and the SQL\n\n\nendpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n\nDatabricks destination is automatically created. This destination is configured to ingest into\n\nDelta via the SQL endpoint that was auto-configured by Partner Connect.\n\nThe customer now selects their choice of data source in Fivetran from hundreds of pre-\n\nbuilt connectors — for example, Salesforce. The user authenticates to the Salesforce\n\nsource, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n\n\n-----\n\n(in this case the Account & Contact objects) and starts the initial sync. This automation\n\nhas saved users dozens of manual steps and copying/pasting of configuration if they\n\nmanually set up the connection. 
It also protects the user from making any unintentional\n\nconfiguration errors and spending time debugging those errors. The Salesforce tables\n\nare now available to query, join and analyze in Databricks SQL. Watch the [demo](https://databricks.com/partnerconnect#partner-demos) for more\n\ndetails or check out the [Partner Connect guide](https://docs.databricks.com/integrations/partner-connect/index.html?_gl=1*1mz2ts6*_gcl_aw*R0NMLjE2MzY2NzU1NDcuQ2p3S0NBaUFtN09NQmhBUUVpd0FydkdpM0ZHS3ptZTR5Z2YzR3E4ajVrYTNaUExOUEFnaTZIMnNRU05EMC1RYzl0dGxXQjl6ajRuNU14b0N0OGdRQXZEX0J3RQ..&_ga=2.83627156.328510291.1641248936-1825366797.1612985070) to learn more.\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 5,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n\nunify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the\n\nglobe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a\n\nmission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , LinkedIn and Facebook .\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "# 2023 State\n of Data + AI\n```\nPowered by the Databricks Lakehouse\n\n```\n2023 STATE OF DATA + AI\n\n\n-----\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||W|e’|r|e|in||th|e|||||||\n|||||||go|l|de|n|a|ge||of|||||||\n|||||||||||||||||||||\n|||||||d|a|ta|a|n|d|A|I|||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n\n\n-----\n\nINTRO\n\nIn the 6 months since ChatGPT launched, the world has woken up to the vast potential\nof AI. The unparalleled pace of AI discoveries, model improvements and new products\non the market puts data and AI strategy at the top of conversations across every\norganization around the world. We believe that AI will usher in the next generation of\nproduct and software innovation, and we’re already seeing this play out in the market.\nThe next generation of winning companies and executives will be those who understand\nand leverage AI.\n\nIn this report, we examine patterns and trends in data and AI adoption across more\nthan 9,000 global Databricks customers. By unifying business intelligence (BI) and AI\napplications across companies’ entire data estates, the Databricks Lakehouse provides\na unique vantage point into the state of data and AI, including which products and\ntechnologies are the fastest growing, the types of data science and machine learning\n(DS/ML) applications being developed and more.\n\n\n-----\n\n```\nHere are the major stories we uncovered:\n\n```\n\nCompanies are adopting\nmachine learning and large\nlanguage models (LLMs)\nat a rapid pace. Natural\nlanguage processing (NLP)\nis dominating use cases,\nwith an accelerated focus\non LLMs.\n\n\nOrganizations are investing in\ndata integration products as\nthey prioritize more DS/ML\ninitiatives. 
50% of our fastestgrowing products represent\nthe data integration category.\n\n\nOrganizations are increasingly\nusing the Lakehouse for data\nwarehousing, as evidenced\nby the high growth of data\nintegration tools dbt and\nFivetran, and the accelerated\nadoption of Databricks SQL.\n\n\nWe hope that by sharing these trends, data leaders will be able to benchmark\ntheir organizations and gain insights that help inform their strategies for an\nera defined by data and AI.\n\n\n-----\n\n```\nSummary of\n\nKey Findings\n DATA SCIENCE AND MACHINE LEARNING:\n\n NLP AND LLMS ARE IN HIGH DEMAND\n 1\n\n```\n**•** The number of companies using SaaS LLM APIs (used to access\nservices like ChatGPT) has grown 1310% between the end of\nNovember 2022 and the beginning of May 2023\n\n**•** NLP accounts for 49% of daily Python data science library usage,\nmaking it the most popular application\n\n**•** Organizations are putting substantially more models into production\n(411% YoY growth) while also increasing their ML experimentation\n(54% YoY growth)\n\n**•** Organizations are getting more efficient with ML; for every three\n\nexperimental models, roughly one is put into production, compared\nto five experimental models a year prior\n\n\n-----\n\n```\nFASTEST-GROWING DATA\nAND AI PRODUCTS\n\n```\n```\nADOPTION AND\nMIGRATION TRENDS\n\n```\n61% of customers migrating to the\nLakehouse are coming from onprem and cloud data warehouses\n\nThe volume of data in Delta Lake\nhas grown 304% YoY\n\nThe Lakehouse is increasingly\nbeing used for data warehousing,\nincluding serverless data\nwarehousing with Databricks\nSQL, which grew 144% YoY\n\n\nBI is the top data and AI market, but\ngrowth trends in other markets show that\ncompanies are increasingly looking at\nmore advanced data use cases\n\nThe fastest-growing data and AI product\nis dbt, which grew 206% YoY by number\nof customers\n\nData integration is the fastest-growing\ndata and AI market on the Databricks\nLakehouse with 117% YoY growth\n\n\n-----\n\n```\nMethodology: How did Databricks\n\ncreate this report?\n\n```\nThe _2023 State of Data + AI_ is built from fully-aggregated, anonymized data\ncollected from our customers based on how they are using the Databricks\nLakehouse and its broad ecosystem of integrated tools. This report focuses\non machine learning adoption, data architecture (integrations and migrations)\nand use cases. The customers in this report represent every major industry\nand range in size from startups to many of the world’s largest enterprises.\n\nUnless otherwise noted, this report presents and analyzes data from February 1,\n2022, to January 31, 2023, and usage is measured by number of customers.\nWhen possible, we provide YoY comparisons to showcase growth trends over time.\n\n\n-----\n\n```\nData Science and\n\nMachine Learning\nNATURAL LANGUAGE PROCESSING AND LARGE\nLANGUAGE MODELS ARE IN HIGH DEMAND\n\n```\nAcross all industries, companies leverage data science and\nmachine learning (DS/ML) to accelerate growth, improve\npredictability and enhance customer experiences. Recent\nadvancements in large language models (LLMs) are propelling\ncompanies to rethink AI within their own data strategies.\nGiven the rapidly evolving DS/ML landscape, we wanted to\nunderstand several aspects of the market:\n\n- Which types of DS/ML applications are companies investing\nin? 
In particular, given the recent buzz, what does the data around LLMs look like?\n\n- Are companies making headway on operationalizing their machine learning models (MLOps)?\n\n\n-----\n\n```\n SPECIALIZED PYTHON DS/ML LIBRARIES FROM FEBRUARY 2022 TO JANUARY 2023\n\n```\n[Chart] Categories: Natural Language Processing, Simulations & Optimizations, Recommender Systems, Time Series, Speech Recognition, Industry Data Modeling, Graph, Geospatial, Computer Vision, Anomaly Detection & Segmentation\n\nNote: This chart reflects the unique number of notebooks using ML libraries per day in each of the categories. It includes libraries used for the particular problem-solving use cases mentioned. It does not include libraries used in tooling for data preparations and modeling.\n\n\n-----\n\n```\nNatural language processing dominates\nmachine learning use cases\n\n```\nTo understand how organizations are applying AI and ML within the Lakehouse, we aggregated the usage of specialized Python libraries, which include NLTK, Transformers and FuzzyWuzzy, into popular data science use cases. 1 We look at data from these libraries because Python is on the cutting edge of new developments in ML, advanced analytics and AI, and has consistently ranked as one of the [most popular programming languages](https://www.tiobe.com/tiobe-index/) in recent years.\n\nOur most popular use case is natural language processing (NLP), a rapidly growing field that enables businesses to gain value from unstructured textual data. This opens the door for users to accomplish tasks that were previously too abstract for code, such as summarizing content or extracting sentiment from customer reviews. In our data set, 49% of libraries used are associated with NLP. LLMs also fall within this bucket. Given the innovations launched in recent months, we expect to see NLP take off even more in coming years as it is applied to use cases like chatbots, research assistance, fraud detection, content generation and more.\n\nOur second most popular DS/ML application is simulations and optimization, which accounts for 30% of all use cases. This signals organizations are using data to model prototypes and solve problems cost-effectively.\n\n```\n In our data set, 49% of\n specialized Python libraries\n used are associated with NLP\n\n```\nMany of the DS/ML use cases are predominantly leveraged by specific industries. While they take up a smaller share of the total, they are mission-critical for many organizations. For example, time series includes forecasting, a use case that is especially popular in industries such as Retail and CPG, which rely heavily on the ability to forecast the need for every item in every store.\n\n\n1. 
This data does not include general-purpose ML libraries, including scikit-learn or TensorFlow.\n\n\n-----\n\n```\n USE OF LARGE LANGUAGE MODELS (LLMS)\n\n```\nWe have rolled these libraries up into groupings based on the type of functionality they provide.\n\n[Chart] Use of transformer-related libraries, SaaS LLM APIs and LLM tools from February 2022 to May 2023, annotated with the ChatGPT launch (2022) and the Dolly launch (March 24, 2023).\n\nNote: There are several popular types of Python libraries that are commonly used for LLMs. These libraries provide pretrained models and tools for building, training and deploying LLMs.\n\n\n-----\n\n```\nLarge language models are\nthe “it” tool\n\n```\nLLMs are currently one of the hottest and most-watched areas in the field of NLP. LLMs have been instrumental in enabling machines to understand, interpret and generate human language in a way that was previously impossible, powering everything from machine translation to content creation to virtual assistants and chatbots.\n\nTransformer-related libraries have been growing in popularity even before ChatGPT thrust LLMs into the public consciousness. Within the last 6 months, our data shows two accelerating trends: organizations are building their own LLMs, which models like [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) show can be quite accessible and inexpensive. And, they are using proprietary models like ChatGPT. Transformer-related libraries, such as Hugging Face, which are used to train LLMs, have the highest adoption within the Lakehouse.\n\nThe second most popular type is SaaS LLMs, which are used to access models like OpenAI. This category has grown exponentially in parallel with the [launch of ChatGPT](https://openai.com/blog/chatgpt): the number of Lakehouse customers using SaaS LLMs has grown\n\n\nOrganizations can leverage LLMs either by using SaaS LLM APIs to call services like ChatGPT from OpenAI or by operating their own LLMs in-house.\n\nThinking of building your own modern LLM application? 
This approach could entail\nthe use of specialized transformer-related\nPython libraries to train the model, as well as\nLLM tools like LangChain to develop prompt\ninterfaces or integrations to other systems.\n```\nLLM DEFINITIONS\n\n```\n**◊** **Transformer-related libraries:**\nPython libraries used to train LLMs\n(example: Hugging Face)\n\n**◊** **SaaS LLM APIs:** Libraries used to access\nLLMs as a service (example: OpenAI)\n\n**◊** **LLM tools:** Toolchains for working\nwith and building proprietary LLMs\n(example: LangChain)\n\n\nan impressive 1310% between the end of November 2022 and\nthe beginning of May 2023. (In contrast, transformer-related\nlibraries grew 82% in this same period.)\n\n\n-----\n\n```\n ac e ea g e pe e a o a d p oduc o\ntake off across industries\n\n```\n\nThe increasing demand for ML solutions and the growing\navailability of technologies have led to a significant\nincrease in experimentation and production, two distinct\nparts of the ML model lifecycle. We look at the _logging_ and\n_registering_ of models in MLflow, an open source platform\ndeveloped by Databricks, to understand how ML is\ntrending and being adopted within organizations.\n```\n LOGGED MODELS AND\n\n ML EXPERIMENTATION\n\n```\nDuring the experimentation phase of ML, data scientists\ndevelop models designed to solve given tasks. After training\nthe models, they test them to evaluate their accuracy,\nprecision, recall (the percentage of correctly predicted\npositive instances out of all actual positive instances), and\nmore. These metrics are logged (recorded) in order to analyze\nthe various models’ performance and identify which approach\nworks best for the given task.\n\nWe have chosen logged models as a proxy to measure ML\nexperimentation because the MLflow Tracking Server is\n\ndesigned to facilitate experiment tracking and reproducibility.\n\n\nMLflow Model Registry launched in May 2021. Overall, the\nnumber of logged models has grown 54% since February\n2022, while the number of registered models has grown\n411% over the same period. This growth in volume suggests\norganizations are understanding the value of investing in\nand allocating more people power to ML.\n```\nREGISTERED MODELS AND ML PRODUCTION\n\n```\nProduction models have undergone the experimentation\nphase and are then deployed in real-world applications. They\nare typically used to make predictions or decisions based on\nnew data. Registering a model is the process of recording and\nstoring metadata about a trained model in a centralized location\nthat allows users to easily access and reuse existing models.\nRegistering models prior to production enables organizations to\nensure consistency and reliability in model deployment and scale.\n\nWe have chosen registered models to represent ML production\nbecause the MLflow Model Registry is designed to manage\nmodels that have left the experimentation phase through the\n\nrest of their lifecycle.\n\n\n-----\n\ng y yi p\n\nwas registered. Recent advances in ML, such as improved\nopen source libraries like MLflow and Hugging Face, have\n\nradically simplified building and putting models into\nproduction. The result is that 34% of logged models are\nnow candidates for production today, an improvement\nfrom over 20% just a year ago.\n\n\nbefore committing an ML model to production. We wanted\nto understand, “How many models do data scientists\n\nexperiment with before moving to production?”\n\nOur data shows the ratio of logged to registered models\nis 2.9 : 1 as of January 2023. 
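To make the logged-versus-registered distinction concrete, here is a minimal MLflow sketch (not taken from the report); the experiment path, model name and toy training data are placeholders.

```
# Hypothetical sketch of the two MLflow steps described above: log a model while
# experimenting, then register the chosen candidate. Names and data are placeholders.
import mlflow
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=200, n_features=4, random_state=0)

mlflow.set_experiment("/Shared/churn_experiments")
with mlflow.start_run() as run:
    model = LogisticRegression().fit(X, y)
    mlflow.log_metric("accuracy", model.score(X, y))        # experiment tracking
    mlflow.sklearn.log_model(model, artifact_path="model")  # logged model

# Only the best candidate is promoted: registering records it in the Model Registry.
mlflow.register_model(f"runs:/{run.info.run_id}/model", "churn_model")
```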
This means that for roughly every three experimental models, one model will get registered as a candidate for production. This ratio has improved significantly from just a year prior, when we\n\n\n```\n RATIO OF LOGGED VS. REGISTERED MODELS\n\n```\n[Chart] Number of models by month, February 2022 to January 2023. Ratio of Logged to Registered Models in Jan 2023: 2.9 : 1\n\n\n-----\n\n```\nThe Modern Data\nand AI Stack\n\n```\nOver the last several years, the trend toward building open, unified data architectures has played out in our own data. We see that data leaders are opting to preserve choice, leverage the best products and deliver innovation across their organizations by democratizing access to data for more people.\n\n\n-----\n\n```\n FASTEST-GROWING DATA AND AI PRODUCTS\n\n```\nYear-over-year growth by number of customers:\n\n- dbt 206%\n- Fivetran 181%\n- Informatica 174%\n- Qlik Data Integration 152%\n- Esri 145%\n- Looker 141%\n- Hugging Face 110%\n- Lytics 101%\n- Great Expectations 100%\n- Kepler.gl 95%\n\n\n-----\n\n```\nDBT IS THE FASTEST-GROWING DATA\nAND AI PRODUCT OF 2023\n\n```\nAs companies move quickly to develop more advanced use cases with their data, they are investing in newer products that produce trusted data sets for reporting, ML modeling and operational workflows. Hence, we see the rapid rise of data integration products. dbt, a data transformation tool, and Fivetran, which automates data pipelines, are our two fastest-growing data and AI products. This suggests a new era of the data integration market with challenger tools making headway as companies shift to prioritize DS/ML initiatives. With Great Expectations from Superconductive in the ninth spot, a full 50% of our fastest-growing products represent the data integration category.\n\n\n-----\n\n```\n GROWTH OF DATA AND AI MARKETS\n\n```\n[Chart] Number of customers by month, February 2022 to January 2023, for Business Intelligence, Data Governance & Security, Data Science & Machine Learning, and Data Integration.\n\nNote: In this chart, we count the number of customers deploying one or more data and AI products in each category. 
These four\ncategories do not encompass all products Databricks products such as Unity Catalog are not included in this data\n\n\n-----\n\n```\n a a a d a e s bus ess e ge ce s\nstandard, organizations invest in their machine\nlearning foundation\n\n```\n\nTo understand how organizations are prioritizing their data\ninitiatives, we aggregated all data and AI products on the\nDatabricks Lakehouse and categorized them into four\ncore markets: BI, data governance and security, DS/ML,\nand data integration. Our data set confirms that BI tools\nare more widely adopted across organizations relative to\nmore nascent categories — and they continue to grow,\nwith a 66% YoY increase in adoption. This aligns with the\nbroader trend of more organizations performing data\nwarehousing on a Lakehouse, covered in the next section,\nViews from the Lakehouse.\n\n\nWhile BI is often where organizations start their data\njourney, companies are increasingly looking at more\nadvanced data and AI use cases.\n```\nDEMAND FOR DATA INTEGRATION PRODUCTS\n\nIS GROWING FAST\n\n```\nWe see the fastest growth in the data integration market.\nThese tools enable a company to integrate vast amounts\nof upstream and downstream data in one consolidated\nview. Data integration products ensure that all BI and DS/\nML initiatives are built on solid foundation.\n\nWhile it’s easier for smaller markets to experience\nfaster growth, at 117% YoY increased adoption, the data\nintegration market is growing substantially faster than BI.\nThis trend dovetails with the rapid growth of ML adoption\nwe see across the Lakehouse, covered in the DS/ML\nsection of the report.\n\n```\nData integration is the\nfastest-growing market,\n\n with 117% YoY growth\n\n```\n\n-----\n\n```\nViews from\nthe Lakehouse\nMIGRATION AND DATA\n\nFORMAT TRENDS\n\n```\nData migration is a major undertaking: it can be risky,\nexpensive and delay companies’ timelines. It’s not a\ntask to jump into lightly. As organizations run into the\nlimitations, scalability challenges and the cost burden\nof legacy data platforms, they are increasingly likely\nto migrate to a new type of architecture.\n\n\n-----\n\n```\nMigration trends:\n\nthe best data warehouse\n\nis a Lakehouse\n\n```\nThe Lakehouse Platform is an attractive\nalternative to traditional data warehouses\nbecause it supports advanced use cases and\nDS/ML, allowing organizations to boost their\noverall data strategy. As evidenced by the most\npopular data and AI products, with BI and data\nintegration tools at the top, organizations are\nincreasingly using the data lakehouse for data\nwarehousing. To better understand which legacy\nplatforms organizations are moving away from,\n\nwe look at the migrations of new customers\nto Databricks.\n\nAn interesting takeaway is that roughly half of the\ncompanies moving to the Lakehouse are coming\nfrom data warehouses. 
This includes the 22%\nthat are moving from cloud data warehouses.\nIt also demonstrates a growing focus on running\ndata warehousing workloads on a Lakehouse\nand unifying data platforms to reduce cost.\n\n```\n SOURCE OF NEW CUSTOMER \u0003\n\n MIGRATIONS TO DATABRICKS\n\n```\n```\n12%\n\n```\n```\n39%\n\n```\n```\n27%\n\n```\n```\n22%\n\n```\n\n-----\n\n```\nRising tides: the volume\n\nof data in Delta Lake\n\nhas grown 304% YoY\n\n```\nAs the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf) , an increasingly\nlarge proportion is in the form of semi-structured\nand unstructured data. Previously, organizations\nhad to manage multiple different platforms for\ntheir structured, unstructured and semi-structured\ndata, which caused unnecessary complexity and\nhigh costs. The Lakehouse solves this problem by\nproviding a unified platform for all data types\nand formats.\n\nDelta Lake is the foundation of the Databricks\nLakehouse. The Delta Lake format encompasses\nstructured, unstructured and semi-structured\ndata. Use has surged over the past 2 years.\nWhen compared to the steady, flat or declining\ngrowth in other storage formats (e.g., text, JSON\nand CSV), our data shows that a growing number\nof organizations are turning to Delta Lake to manage\ntheir data. In June 2022, Delta Lake surpassed\nParquet as the most popular data lake source,\nreaching 304% YoY growth.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|Col1|VO|LUME|Col4|OF|Col6|DAT|Col8|A M|ANAG|ED,|Col12|Col13|Col14|Col15|Col16|Col17|Col18|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||BY|STO||RAG||E FO||RMA|T|||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|ata||||||||||||||||||\n|e of D||||||||||||||||||\n|Volum||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n||Jan|||||||J|an|||Jan||||Ja||\n|||||Jan||||||||||||||\n|2|019|||2020||||20|21|||2022||||202||\n|||||||||Delta|Te|xt||CSV||Av||ro||\n|||||||||Parquet|OR|C||JSON||||||\n|||||||||||||||||||\n\n\n-----\n\n```\n g g ,\nwith emphasis on serverless\n\n```\n\nOver the past 2 years, companies have vastly increased their usage\nof data warehousing on the Lakehouse Platform. 
This is especially\ndemonstrated by use of Databricks SQL ­— the serverless data\nwarehouse on the Lakehouse — which shows 144% YoY growth.\nThis suggests that organizations are increasingly ditching traditional\ndata warehouses and are able to perform all their BI and analytics\non a Lakehouse.\n\n```\n Data \nWarehouse\n\n```\n```\nData \n\n```\n```\nLakehouse\nPlatform\n\n```\n```\nLakehouse\n\n```\n\n\n\n\n\n\n\n\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||||||||||||||||||||||||||\n||DA|TA W|ARE|HOUS|ING||||||||||||||||||||\n||ON|LAK|EHO|USE|WIT L|H|rs||||||||||||||||||\n||DA|TABR|ICK|S SQ|||||||||||||||||||||\n||||||||ustome||||||||||||||||||\n||||||||r of C||||||||||||||||||\n||Note: T as a re|here is a sult of th|spike in e ungat|Octobe ed previ|r 2021 ew||Numbe||||||||||||||||||\n||launch Genera|of Datab l Availab|ricks SQ ility in D|L, follow ecembe|ed by r 2021.||||||||||||||||||||\n||Data c of Dec|onsisten ember d|tly dips i ue to se|n the las asonalit|t week y.||J 2|an 021||Jul 202||y 1||Jan 2022||||July 2022||||Jan 2023|||\n\n\n-----\n\nCONCLUSION\n```\nGeneration AI\n\n```\nWe’re excited that companies are progressing into more\nadvanced ML and AI use cases, and the modern data and\nAI stack is evolving to keep up. Along with the rapid growth\nof data integration tools (including our fastest growing,\ndbt), we’re seeing the rapid rise of NLP and LLM usage in\nour own data set, and there’s no doubt that the next few\nyears will see an explosion in these technologies. It’s never\nbeen more clear: the companies that harness the power\nof DS/ML will lead the next generation of data.\n\n\n-----\n\n```\nAbout Databricks\n\n```\nDatabricks is the data and AI company. More than 9,000\norganizations worldwide — including Comcast, Condé Nast, and\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\nPlatform to unify their data, analytics and AI. Databricks is\nheadquartered in San Francisco, with offices around the globe.\nFounded by the original creators of Apache Spark™, Delta Lake\nand MLflow, Databricks is on a mission to help data teams solve\nthe world’s toughest problems. To learn more, follow Databricks\non Twitter, LinkedIn and Instagram.\n\n[DISCOVER LAKEHOUSE](https://www.databricks.com/product/data-lakehouse)\n\n© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation | Terms of Use\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "**eBook**\n\n# Making Your Digital Twin Come to Life\n\n##### With the Lakehouse for Manufacturing and Tredence\n\n\n-----\n\n### Contents\n\nIntroduction ................................................................................................................................................................................................................ **03**\n\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\n\nWhat Are Digital Twins? 
........................................................................................................................................................................................ **07**\n\nDigital Twin Architectures .................................................................................................................................................................................. **08**\n\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\n\nWhy Is Manufacturing Struggling With Data and AI? ............................................................................................................................ **12**\n\nWhy Databricks for Digital Twins? ................................................................................................................................................................... **13**\n\nWhy Tredence for Digital Twins? ...................................................................................................................................................................... **14**\n\nUsing Digital Twins to Drive Insights .............................................................................................................................................................. **15**\n\n\n-----\n\n### Introduction\n\n\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\ncost-effective and are now an imperative in today’s data-driven businesses.\n\nToday’s manufacturing industries are expected to streamline and optimize all the processes in their value\nchain from product development and design, through operations and supply chain optimization to obtaining\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\n\n\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[“profit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[was implemented. 
Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[approximately 10 hours.”](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n\n\n**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 10%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 50%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 25%\n\n\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n-----\n\n**Introduction (continued)**\n\n\n**Digital twin market growth rate accelerates**\n\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\nis forecasted to reach $48 billion in 2026. 
This figure is up from $3.1 billion in 2020\nat a CAGR of 58%, riding on the wave of Industry 4.0.\n\n\n**But challenges remain**\n\nThe most common challenges faced by the manufacturing industry that digital\ntwins are addressing include:\n\n**•** Product designs are more complex, resulting in higher cost and increasingly\nlonger development times\n\n**•** The supply chain is opaque\n\n**•** Production lines are not optimized – performance variations, unknown defects\nand the projection of operating cost is obscure\n\n**•** Poor quality management – overreliance on theory, managed by\nindividual departments\n\n**•** Reactive maintenance costs are too high, resulting in excessive downtime or\nprocess disruptions\n\n**•** Incongruous collaborations between departments\n\n**•** Invisibility of customer demand for gathering real-time feedback\n\n\nThe growth rate for digital twins is staggering with common adoption reported\nto be in the 25-40% CAGR growth rate.\n\n\n-----\n\n### Digital Twins Bring Broad Benefits to Manufacturing\n\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\nwould have come at significant costs without digital twin technology.\n\n**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**\n\n\n\n**•** Product design and development is performed with\nless cost and is completed in less time as iterative\nsimulations, using multiple constraints, deliver the\nbest or most optimized design. All commercial\naircraft are designed using digital twins.\n\n**•** Digital twins provide the awareness of how long\ninventory will last, when to replenish and how to\nminimize the supply chain disruptions. The oil and gas\nindustry, for example, uses supply chain–oriented\ndigital twins to reduce supply chain bottlenecks in\nstorage and midstream delivery, schedule tanker\noff-loads and model demand with externalities.\n\n\n\n**•** Continuous quality checks on produced items\nwith ML/AI generated feedback pre-emptively\nassuring improved product quality. Final paint\ninspection in the automotive industry, for example,\nis performed with computer vision built on top of\ndigital twin technology.\n\n**•** Striking the sweet spot between when to replace\na part before the process degrades or breaks\ndown and utilizing the components to their fullest,\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\nbuilding an asset performance management suite.\n\n\n\n**•** Digital twins create the opportunity to have\nmultiple departments in sync by providing\nnecessary instructions modularly to attain\na required throughput. Digital twins are the\nbackbone of kaizen events that optimize\nmanufacturing process flow.\n\n**•** Customer feedback loops can be modeled through\ninputs, from point of sale customer behavior,\nbuying preferences, or product performance and\nthen integrated into the product development\nprocess, forming a closed loop providing an\nimproved product design.\n\n\n-----\n\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\n\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\ndeployment, but typically offer higher and longer-lasting value.\n\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\n\n\nImprove product quality\n\nReduce manufacturing costs\n\nReduce unplanned downtime\n\nIncrease throughput\n\nEnsure safe manufacturing\n\nTest new design ideas\n\nDevelop product enhancements\n\nDigital transformation of enterprise\n\nSpeed new product introduction\n\nReduce planned downtime\n\nMeet new regulatory challenges\n\nTraining for new manufacturing processes\n\nDesign changes to production line\n\nProvide service to end users customers\n\nUpdate products in the field\n\n\n**34%**\n\n\n**30%**\n\n**28%**\n**25%**\n\n**24%**\n\n\n**16%**\n\n**14%**\n\n**13%**\n\n**13%**\n\n**11%**\n**10%**\n\n**8%**\n**8%**\n\n\nCan you imagine the cost to change\nan oil refinery’s crude distillation\nunit process conditions to improve\nthe output of diesel one week\nand gasoline the next to address\nchanges in demand and ensure\nmaximum economic value? Can you\nimagine how to replicate an even\nsimple supply chain to model risk?\n\n\n**5%**\n\n\n**1%**\n\n\n-----\n\n### What Are Digital Twins?\n\n\nKnowing the business challenges and benefits digital twins deliver, let’s turn to\nthe basics and explore what digital twins are and how a modern data stack is\nnecessary to build effective and timely digital twins. The classic definition of\ndigital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .”\n\n\nFor a discrete or continuous manufacturing process, a digital twin gathers system\nand processes state data with the help of various IoT sensors [operational\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\nvirtual model which is then used to run simulations, study performance issues and\ngenerate possible insights.\n\n\n**Types of Digital Twins**\n\n\n-----\n\n### Digital Twin Architectures\n\nClassic digital twins have been physics-based models of specific systems. 
More recently,\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\n\n\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\nthe industrial environment.\n\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\n\n**Data-Driven Operational Digital Twins: Maturity Journey**\n\n**AI**\n\nSimulate & Optimize\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n\n# 6-8 18-24\n## years to months\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n\n\n**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[using a digital twin, which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n\n# 20% to 25%\n\n\n**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[using a digital twin, which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n\n\nIdentify next best action and\nintegrate with actuation systems\n\n\n**IoT**\n\n**Edge/**\n**Cloud**\n\n\n**Digital Twins**\n\n**ERP**\n\n\nPredict & Diagnose\n\n|Col1|I i|\n|---|---|\n\n\n\nPredictive maintenance, process\nimprovements and Root Causing\n\n\nMonitor & Alert\n\n|Col1|P i|\n|---|---|\n\n\nReal-time operations monitoring\nand alerting\n\n\n-----\n\n### How to Build a Digital Twin\n\n\nA data architecture capability is needed to capture\nand collect the ever-expanding volume and variety\nof data streaming in real time from example\nprotocols, such as ABB Total Flow, Allen Bradley,\nEmerson, Fanuc, GE, Hitachi and Mitsubishi.\n\n\nData collection, data analytics, application\nenablement and data integration orchestrate the\ntime-series data stream and transfer to the cloud.\nAzure IoT Hub is used to securely ingest data from\nedge to cloud.\n\n\nCloud infrastructure and analytics capabilities are\noffered within the flexibility of the cloud. 
Azure\nDigital Twin is used to model and visualize process\nworkflows. Databricks MLflow and Delta Lake scale to\ndeliver real-time predictive analytics.\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Digital Twins: Technical Architecture**\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Building a digital twin doesn’t have to be a daunting task. Below are some simplistic steps:**\n\n\n**System and use case discovery**\n**and blueprinting**\n\n**•** Identify priority plant processes and systems\nto model, with focused use cases (e.g., asset\nmaintenance, energy management, process\nmonitoring/optimization, etc.)\n\n**•** Develop a validated process outline, blueprint and\nkey performance indicators\n\n**•** Develop a set of process variables, control\nvariables and manipulated variables\n\n**•** Design control loop\n\n**•** Validate and document process and asset FMEA\nfor all assets and sub-systems\n\n\n**Technology infrastructure requirements**\n\n**•** Technical edge infrastructure onsite — to sense,\ncollect and transmit real-time information\n\n**•** Clean, reliable data availability in the cloud\n\n**•** Data processing and analytics platform — to\ndesign, develop and implement solutions\n\n**•** Stream processing and deployment of models for\npredictions and soft sensing\n\n\n**Visualization delivered**\n\n**•** Information communication — visual\nrepresentation of digital twin along with remote\ncontrolling functions (e.g., Power BI dashboards,\ntime series insights, web app-based digital\ntwin portals)\n\n**•** Closed-loop feedback — to send the insights and\nactions back to form a closed loop — Azure – Event\nGrid and Event Hub with connection from IoT Hub to\nAzure IoT edge devices and control systems is used\n\n\n\n**•** Edge platform to orchestrate the data, insights and\nactions between the cloud and site IT systems\n\n**•** Cloud to edge integration — to enable seamless\nmonitoring, alerting and integration with plant\nOT/IT systems\n\n\n-----\n\n### Why Is Manufacturing Struggling With Data and AI?\n\n**Challenge** **Root Cause** **Goal**\n\n\nAggregate high volumes and velocities of\n\nstructured and unstructured data to power\n\npredictive analytics (e.g., images, IoT, ERP/SCM)\n\nData architectures that scale for TBs /PBs of\n\nenterprise IT and OT data\n\n\nSiloed data from systems designed\n**Siloed data across the value chain**\n\nfor on-premises 30 years ago\n\n\nSiloed data from systems designed\n**Siloed data across the value chain**\n\n\nLegacy architectures such as data\n\nhistorians that can’t handle semi-structured\n\nor unstructured data\n\n\n**Unable to scale enterprise data sets**\n\n\nAddress manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\ngranular supply chain issues in the real world\n\n\nAddress manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\n\n**Can’t meet intellectual property**\n\n\n**Can’t meet intellectual property** Data lineage established across organizational\n\nSystems that do not establish data lineage\n**requirements** silos and disjointed workflows\n\n\nsilos and disjointed workflows\n\n\n### Data architecture is the root cause of this struggle.\n\n\n-----\n\n### Why Databricks for Digital Twins?\n\n\nLakehouse for Manufacturing’s simple, open and collaborative data platform consolidates and enhances data\nfrom across the organization and turns it into accessible, actionable insights. 
Scalable machine learning powers\ndigital twins with predictive insights across the value chain from product development to optimizing operations\nto building agile supply chains to robust customer insights.\n\n\nDatabricks open Lakehouse\n\nPlatform has shown time and\n\nagain that it is the foundational\n\nenabling technology to power\n\ndigital twins for manufacturing. But\n\nthe real power is the Databricks\n\npartnership with Tredence that\n\nspeeds implementation for\n\ntailored use cases that deliver\n\nsuperior ROI in less time.”\n\n**Dr. Bala Amavasai** ,\n\nManufacturing CTO, Databricks\n\n\n**Supports Real-Time**\n**Decisions**\n\nLakehouse for Manufacturing\nleverages any enterprise data\nsource — from business critical\nERP data to edge sensor data in\none integrated platform, making it\neasy to automate and secure data\nwith fast, real-time performance.\n\n\n**Faster and More**\n**Accurate Analysis**\n\nThe true benefits of digital twins\nare not the business intelligence\ndashboards, but machine\nlearning insights generated\nfrom incorporating real-time\ndata. Scalable and shareable\nnotebook-based machine learning\naccelerates ROI.\n\n\n**Open Data Sharing**\n**and Collaboration**\n\nDrive stronger customer insights\nand greater service with partners\nleveraging open and secure\ndata collaboration between\ndepartments or your supply chain\ndelivering faster ROI.\n\n\n-----\n\n### Why Tredence for Digital Twins?\n\n\nOver the last few years, Tredence’s unique Manufacturing and Supply Chain practice has coupled functional\nexpertise with cutting-edge AI-driven solutions to create measurable business impact for their customers.\nNow, Tredence’s partnership with Databricks is all set to unlock the power of real-time analytics and actions, to\nfurther strengthen their ‘’last mile impact’’ vision.\n\n\nTredence is excited to\n\nco-innovate with Databricks to\n\ndeliver the solutions required for\n\nenterprises to create digital twins\n\nfrom the ground up and implement\n\nthem swiftly to maximize their ROI.\n\nOur partnership enables clients to\n\nget the most out of Tredence’s data\n\nscience capabilities to build decision\n\nintelligence around manufacturing\n\nprocesses and Databricks’\n\nLakehouse Platform to realize the full\n\npromise of digital twins.”\n\n**Naresh Agarwal** ,\n\nHead of Industrials, Tredence\n\n\n**Global Reach**\n\nTredence offers a global team with\nthe subject matter expertise that\ndelivers practitioner and useroriented solutions to identify\nand solve for challenges in\ndigital transformation design\nand implementation.\n\n\n**Purpose-Built Solutions**\n\nAdopt contextual edge to cloud,\npurpose-built AIoT solutions\nthat unify your ecosystems with\nconnected insights and enhance\nproductivity, while enabling\nefficient cost structures.\n\n\n**Focused Dedication**\n\nA dedicated centre of excellence\n(CoE) for AIoT and smart\nmanufacturing solutions —\nserving the entire manufacturing\nvalue chain from product\ndevelopment to manufacturing and\ndownstream operations.\n\n\n-----\n\n### Using Digital Twins to Drive Insights\n\n\n**Use Case**\n\n**Predictive Maintenance**\n\n- \u0007Rolls-Royce sought to use real-time\nengine data to reduce unplanned\nmaintenance and downtime\n\n- \u0007Legacy systems were unable to\nscale data ingestion of engine\nsensor data in real time for ML\n\n**Impact**\n\n\n**Why Databricks?**\n\n- \u0007The Lakehouse Platform on Azure unifies in-flight data\nstreams with external environmental conditions data to\npredict engine 
performance issues\n\n- \u0007Delta Lake underpins ETL pipelines that feed ML workloads\nacross use cases\n\n- \u0007MLflow speeds deployment of new models and reduces\nincidents of grounded planes\n\n\nRolls-Royce uses Databricks\nto drive insights around predictive\nmaintenance, improving\nairframe reliability and reducing\ncarbon emissions.\n\n\n#### 22 million tons\nof carbon emissions saved\n\n\n#### 5% reduction\nin unplanned airplane groundings\n\n\n#### Millions of pounds\nin inventory cost savings from a 50%\nimprovement in maintenance efficiency\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé\n\nNast, Acosta and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data,\n\nanalytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the\n\noriginal creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n###### Get started with a free trial of Databricks and start building data applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n\nTo learn more, visit us at:\n\n**[databricks.com/manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "### EBOOK\n\n# A Compact Guide to Large Language Models\n\n\n-----\n\nSECTION 1\n## Introduction\n\n##### Definition of large language models (LLMs)\n\nLarge language models are AI systems that are designed to process and analyze\nvast amounts of natural language data and then use that information to generate\nresponses to user prompts. These systems are trained on massive data sets\nusing advanced machine learning algorithms to learn the patterns and structures\nof human language, and are capable of generating natural language responses to\na wide range of written inputs. Large language models are becoming increasingly\nimportant in a variety of applications such as natural language processing,\nmachine translation, code and text generation, and more.\n\nWhile this guide will focus on language models, it’s important to understand that\nthey are only one aspect under a larger generative AI umbrella. 
Other noteworthy\ngenerative AI implementations include projects such as art generation from text,\naudio and video generation, and certainly more to come in the near future.\n\n\n-----\n\n##### Extremely brief historical background and development of LLMs\n\n\n###### 1950s–1990s\nInitial attempts are made to map hard rules around languages and\nfollow logical steps to accomplish tasks like translating a sentence\nfrom one language to another.\n\nWhile this works sometimes, strictly defined rules only work for\nconcrete, well-defined tasks that the system has knowledge about.\n\n###### 1990s \nLanguage models begin evolving into statistical models and\nlanguage patterns start being analyzed, but larger-scale projects\nare limited by computing power.\n\n###### 2000s \nAdvancements in machine learning increase the complexity of\nlanguage models, and the wide adoption of the internet sees an\n\nenormous increase in available training data.\n\n###### 2012 \nAdvancements in deep learning architectures and larger data sets\nlead to the development of GPT (Generative Pre-trained Transformer).\n\n\n###### 2018\nGoogle introduces BERT (Bidirectional Encoder Representations\nfrom Transformers), which is a big leap in architecture and paves\nthe way for future large language models.\n\n###### 2020\nOpenAI releases GPT-3, which becomes the largest model at\n175B parameters and sets a new performance benchmark for\nlanguage-related tasks.\n\n###### 2022\nChatGPT is launched, which turns GPT-3 and similar models into\na service that is widely accessible to users through a web interface\nand kicks off a huge increase in public awareness of LLMs and\ngenerative AI.\n\n###### 2023\nOpen source LLMs begin showing increasingly impressive results\nwith releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna.\nGPT-4 is also released, setting a new benchmark for both parameter\nsize and performance.\n\n\n-----\n\nSECTION 2\n## Understanding Large Language Models\n\n\n##### What are language models and how do they work?\n\nLarge language models are advanced artificial intelligence systems that take\nsome input and generate humanlike text as a response. They work by first\nanalyzing vast amounts of data and creating an internal structure that models\nthe natural language data sets that they’re trained on. Once this internal\nstructure has been developed, the models can then take input in the form of\nnatural language and approximate a good response.\n\n##### If they’ve been around for so many years, why are they just now making headlines?\n\nA few recent advancements have really brought the spotlight to generative AI\nand large language models:\n\n**A D VA N C E M E N T S I N T E C H N I Q U E S**\nOver the past few years, there have been significant advancements in the\ntechniques used to train these models, resulting in big leaps in performance.\nNotably, one of the largest jumps in performance has come from integrating\nhuman feedback directly into the training process.\n\n\n**I N C R E A S E D A C C E S S I B I L I T Y**\nThe release of ChatGPT opened the door for anyone with internet access\nto interact with one of the most advanced LLMs through a simple web\ninterface. 
This brought the impressive advancements of LLMs into the\nspotlight, since previously these more powerful LLMs were only available\nto researchers with large amounts of resources and those with very deep\ntechnical knowledge.\n\n**G R O W I N G C O M P U TAT I O N A L P O W E R**\nThe availability of more powerful computing resources, such as graphics\nprocessing units (GPUs), and better data processing techniques allowed\nresearchers to train much larger models, improving the performance of\nthese language models.\n\n**I M P R O V E D T R A I N I N G D ATA**\nAs we get better at collecting and analyzing large amounts of data, the\n\nmodel performance has improved dramatically. In fact, Databricks showed\nthat you can get amazing results training a relatively small model with a\nhigh-quality data set with [Dolly 2.0](https://huggingface.co/databricks/dolly-v2-12b) (and we released the data set as well\nwith the databricks-dolly-15k [data set](http://databricks/databricks-dolly-15k) ).\n\n\n-----\n\n##### So what are organizations using large language models for?\n\nHere are just a few examples of common use cases for large language models:\n\n**C H AT B O T S A N D V I R T U A L A S S I S TA N T S**\nOne of the most common implementations, LLMs can be used by\norganizations to provide help with things like customer support,\ntroubleshooting, or even having open-ended conversations with userprovided prompts.\n\n**C O D E G E N E R AT I O N A N D D E B U G G I N G**\nLLMs can be trained on large amounts of code examples and give\nuseful code snippets as a response to a request written in natural language.\nWith the proper techniques, LLMs can also be built in a way to reference\nother relevant data that it may not have been trained with, such as a\ncompany’s documentation, to help provide more accurate responses.\n\n**S E N T I M E N T A N A LY S I S**\nOften a hard task to quantify, LLMs can help take a piece of text and gauge\nemotion and opinions. This can help organizations gather the data and\n\nfeedback needed to improve customer satisfaction.\n\n\n**L A N G U A G E T R A N S L AT I O N**\nGlobalize all your content without hours of painstaking work by simply\nfeeding your web pages through the proper LLMs and translating them to\ndifferent languages. As more LLMs are trained in other languages, quality\nand availability will continue to improve.\n\n**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\nEntire customer calls or meetings could be efficiently summarized so that\nothers can more easily digest the content. LLMs can take large amounts of\ntext and boil it down to just the most important bytes.\n\n**C O N T E N T G E N E R AT I O N**\nStart with a detailed prompt and have an LLM develop an outline for you.\nThen continue on with those prompts and LLMs can generate a good first\ndraft for you to build off. Use them to brainstorm ideas, and ask the LLM\nquestions to help you draw inspiration from.\n\n**_Note:_** Most LLMs are _not_ trained to be fact machines. 
They know how to use\nlanguage, but they might not know who won the big sporting event last year.\nIt’s always important to fact check and understand the responses before\n\nusing them as a reference.\n\n\n**T E X T C L A S S I F I C AT I O N A N D C L U S T E R I N G**\nThe ability to categorize and sort large volumes of data enables the\nidentification of common themes and trends, supporting informed\ndecision-making and more targeted strategies.\n\n\n-----\n\nSECTION 3\n## Applying Large Language Models\n\n\nThere are a few paths that one can take when looking to apply large language\nmodels for their given use case. Generally speaking, you can break them down\ninto two categories, but there’s some crossover between each. We’ll briefly cover\nthe pros and cons of each and what scenarios fit best for each.\n\n##### Proprietary services\n\nAs the first widely available LLM powered service, OpenAI’s ChatGPT was the\nexplosive charge that brought LLMs into the mainstream. ChatGPT provides\na nice user interface (or API) where users can feed prompts to one of many\nmodels (GPT-3.5, GPT-4, and more) and typically get a fast response. These are\namong the highest-performing models, trained on enormous data sets, and are\ncapable of extremely complex tasks both from a technical standpoint, such as\ncode generation, as well as from a creative perspective like writing poetry in a\nspecific style.\n\nThe downside of these services is the absolutely enormous amount of compute\nrequired not only to train them (OpenAI has said GPT-4 cost them over $100\nmillion to develop) but also to serve the responses. For this reason, these\nextremely large models will likely always be under the control of organizations,\n\n\nand require you to send your data to their servers in order to interact with their\nlanguage models. This raises privacy and security concerns, and also subjects\nusers to “black box” models, whose training and guardrails they have no control\nover. Also, due to the compute required, these services are not free beyond a\nvery limited use, so cost becomes a factor in applying these at scale.\n\nIn summary: Proprietary services are great to use if you have very complex tasks,\nare okay with sharing your data with a third party, and are prepared to incur\ncosts if operating at any significant scale.\n\n##### Open source models\n\nThe other avenue for language models is to go to the open source community,\nwhere there has been similarly explosive growth over the past few years.\nCommunities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n\nfrom contributors that can help solve tons of specific use cases such as text\ngeneration, summarization and classification. The open source community has\nbeen quickly catching up to the performance of the proprietary models, but\nultimately still hasn’t matched the performance of something like GPT-4.\n\n\n-----\n\nIt does currently take a little bit more work to grab an open source model and\nstart using it, but progress is moving very quickly to make them more accessible\nto users. 
On Databricks, for example, we’ve made [improvements to open source](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html)\n[frameworks](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html) like MLflow to make it very easy for someone with a bit of Python\nexperience to pull any Hugging Face transformer model and use it as a Python\nobject. Oftentimes, you can find an open source model that solves your specific\nproblem that is **orders of magnitude** smaller than ChatGPT, allowing you to bring\nthe model into your environment and host it yourself. This means that you can\nkeep the data in your control for privacy and governance concerns as well as\nmanage your costs.\n\n\n##### Conclusion and general guidelines\n\nUltimately, every organization is going to have unique challenges to overcome,\nand there isn’t a one-size-fits-all approach when it comes to LLMs. As the world\nbecomes more data driven, everything, including LLMs, will be reliant on having\na strong foundation of data. LLMs are incredible tools, but they have to be used\nand implemented on top of this strong data foundation. Databricks brings both\nthat strong data foundation as well as the integrated tools to let you use and\nfine-tune LLMs in your domain.\n\n\nAnother huge upside to using open source models is the ability to fine-tune\nthem to your own data. Since you’re not dealing with a black box of a proprietary\nservice, there are techniques that let you take open source models and train\nthem to your specific data, greatly improving their performance on your\nspecific domain. We believe the future of language models is going to move\nin this direction, as more and more organizations will want full control and\nunderstanding of their LLMs.\n\n\n-----\n\nSECTION 4\n## So What Do I Do Next If I Want to Start Using LLMs?\n\n\nThat depends where you are on your journey! Fortunately, we have a few paths\nfor you.\n\nIf you want to go a little deeper into LLMs but aren’t quite ready to do it yourself,\nyou can watch one of Databricks’ most talented developers and speakers go\nover these concepts in more detail during the on-demand talk “ [How to Build](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n[Your Own Large Language Model Like Dolly.](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly) ”\n\nIf you’re ready to dive a little deeper and expand your education and\nunderstanding of LLM foundations, we’d recommend checking out our\n[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\nand dive into the theory behind foundation models.\n\nIf your hands are already shaking with excitement and you already have some\nworking knowledge of Python and Databricks, we’ll provide some great examples\nwith sample code that can get you up and running with LLMs right away!\n\n\n###### Getting started with NLP using Hugging Face transformers pipelines\n\n Fine-Tuning Large Language Models with Hugging Face and DeepSpeed\n\n Introducing AI Functions: Integrating Large Language Models with Databricks SQL\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. 
More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\n#### Contact us for a personalized demo: databricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "# Building Reliable Data Lakes at Scale With Delta Lake\n\n\n-----\n\n## Contents\n\n#### Data Engineering Drivers 2\n\n Data Pipeline Key Goals 4\n\n Apache Spark™: The First Unified Analytics Engine 5\n\n Data Reliability Challenges With Data Lakes 6\n\n Delta Lake: A New Storage Layer 7\n\n Delta Lake: Key Features 8\n\n Getting Started With Delta Lake 10\n\n\n-----\n\n## Drivers\n\n#### Data Engineering Drivers\n\nData engineering professionals are needing to respond to several different drivers.\n\nChief among the drivers they face are:\n\n**Rise of Advanced Analytics** — Advanced analytics, including methods\n\nbased on machine learning techniques, have evolved to such a degree that\n\norganizations seek to derive far more value from their corporate assets.\n\n**Widespread Adoption** — Once the province of leading edge, high-tech\n\ncompanies, these advanced approaches are being adopted across a\n\nmultitude of industries from retail to hospitality to healthcare and across\n\nprivate as well as public sector organizations. This is further driving the need\n\nfor strong data engineering practices.\n\n**Regulation** — With the growth of data generation and data collection,\n\nthere is increased interest in how the data is protected and managed.\n\nRegulatory regimes such as GDPR (General Data Protection Regulation)\n\nfrom the EU and other jurisdictions mandate very specific ways in which\n\ndata must be managed.\n\n\n-----\n\n## Drivers\n\n**Technology Innovation** — The move to cloud-based analytics architectures\n\nthat is now well underway is being propelled further by innovations such as\n\nanalytics-focused chipsets, pipeline automation and the unification of data\n\nand machine learning. All these offer data professionals new approaches for\n\ntheir data initiatives.\n\n**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n\nalso subject to increasing scrutiny. There is also a greater understanding of\n\ndata as a valuable asset. 
Deriving value from data must be done in a manner\n\nthat is financially responsible and actually value adding to the enterprise and\n\nmeeting ROI hurdles.\n\n**Role Evolution** — Reflecting the importance of managing the data and\n\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\n\nmore prominent and newer roles such as Data Curator are emerging.\n\nThey must balance the needs of governance, security and democratization.\n\n\n-----\n\n## Key Goals\n\n#### Data Pipeline Key Goals\n\nMaking quality data available in a reliable manner is a major determinant of success for data\n\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\n\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\n\nresponsibility need to take account of a broad set of dependencies and requirements as they\n\ndesign and build their data pipelines.\n\nThree primary goals that data engineers typically seek to address as they work to enable the\n\nanalytics professionals in their organizations are:\n\n**Deliver quality data in less time** — When it comes to data, quality and timeliness\n\nare key. Data with gaps or errors (which can arise for many reasons) is\n\n“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n\nusers. Equally well, many applications require up-to-date information (who\n\nwants to use last night’s closing stock price or weather forecast) and are of\n\nlimited value without it.\n\n**Enable faster queries** — Wanting fast responses to queries is natural enough\n\nin today’s “New York minute,” online world. Achieving this is particularly\n\ndemanding when the queries are based on very large data sets.\n\n**Simplify data engineering at scale** — It is one thing to have high reliability and\n\nperformance in a limited, development or test environment. What matters\n\nmore is the ability to have robust, production data pipelines at scale without\n\nrequiring high operational overhead.\n\n\n-----\n\n### ™\n## Apache Spark\n\n#### Apache Spark ™ : The First Unified Analytics Engine\n\nOriginally developed at UC Berkeley in 2009, Apache Spark can be\n\nconsidered the first unified analytics engine. Uniquely bringing data\n\n\nand AI technologies together, Spark comes packaged with higher-level\n\nlibraries, including support for SQL queries, streaming data, machine\n\nlearning and graph processing. These standard libraries increase\n\ndeveloper productivity and can be seamlessly combined to create\n\n\nCustomer\nData\n\nEmails/\nWeb Pages\n\n\nClick\nStreams\n\nVideo/\nSpeech\n\n...\n\nSensor\nData (IoT)\n\n\ncomplex workflows.\n\n\n#### Big Data Processing\n\n\n#### Machine Learning\n\n\nSince its release, Apache Spark, has seen rapid adoption by\n\nenterprises across a wide range of industries. 
Internet powerhouses\n\n\nETL + SQL + Streaming MLlib + SparkR\n\n\nsuch as Netflix, Yahoo and eBay have deployed Spark at massive scale,\n\n\ncollectively processing multiple petabytes of data on clusters of over\n\n8,000 nodes making it the de facto choice for new analytics initiatives.\n\nIt has quickly become the largest open source community in big data,\n\nwith over 1000 contributors from 250+ organizations.\n\n\n##### While Spark has had a significant impact in taking data analytics to the next level, practitioners continue to face data reliability and performance challenges with their data lakes.\n\n\n-----\n\n## Data Reliability Challenges With Data Lakes\n\n\n**Failed Writes** — If a production job that is writing data experiences failures which\n\nare inevitable in large distributed environments, it can result in data corruption\n\nthrough partial or multiple writes. What is needed is a mechanism that is able to\n\nensure that either a write takes place completely or not at all (and not multiple times,\n\nadding spurious data). Failed jobs can impose a considerable burden to recover\n\nto a clean state.\n\n\n**Schema Mismatch** — When ingesting content from multiple sources, typical of\n\nlarge, modern big data environments, it can be difficult to ensure that the same\n\ndata is encoded in the same way i.e., the schema matches. A similar challenge\n\narises when the formats for data elements are changed without informing the\n\ndata engineering team. Both can result in low quality, inconsistent data that\n\nrequires cleaning up to improve its usability. The ability to observe and enforce\n\nschema would serve to mitigate this.\n\n\n**Lack of Consistency** — In a complex big data environment, one may be interested\n\nin considering a mix of both batch and streaming data. Trying to read data while\n\nit is being appended to provides a challenge since on the one hand there is a\n\ndesire to keep ingesting new data while on the other hand anyone reading the\n\ndata prefers a consistent view. This is especially an issue when there are multiple\n\nreaders and writers at work. It is undesirable and impractical, of course, to\n\nstop read access while writes complete or stop write access while reads are\n\nin progress.\n\n\n-----\n\n## Delta Lake: A New Storage Layer\n\n[Delta Lake](https://delta.io/) is an open source storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable metadata handling, and unifies\n\nstreaming and batch data processing. Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs. Raw data is ingested\n\nfrom various batch and streaming input sources. Simple, reliable data pipelines help create a curated data lake containing tables of differing degrees of\n\nrefinement based on business needs. The data in these tables is then made available via the standard Spark APIs or special connectors for various use cases\n\nsuch as machine learning, SQL analytics or feeding to a data warehouse.\n\nStreaming\n\n###### Analytics and Machine Learning\n\n\nBatch\n\n\nIngestion Tables Refined Tables\n(Bronze) (Silver)\n\n\nFeature/Agg Data Store\n(Gold)\n\n\n###### Your Existing Data Lake\n\n\n-----\n\n## Delta Lake: Key Features\n\n\n**ACID Transactions —** Data lakes typically have multiple data pipelines reading\n\nand writing data concurrently, and data engineers have to go through a tedious\n\nprocess to ensure data integrity, due to the lack of transactions. 
Delta Lake\n\nbrings ACID transactions to your data lakes. It provides serializability, the\n\n\n**Scalable Metadata Handling —** In big data, even the metadata itself can be “big\n\ndata.” Delta Lake treats metadata just like data, leveraging Spark’s distributed\n\nprocessing power to handle all its metadata. As a result, Delta Lake can handle\n\npetabyte-scale tables with billions of partitions and files at ease.\n\n\nstrongest level of isolation level.\n\n\n**Time Travel (data versioning) —** Delta Lake provides snapshots of data enabling\n\ndevelopers to access and revert to earlier versions of data for audits, rollbacks or\n\nto reproduce experiments. For further details, please see this [documentation](https://www.google.com/url?q=https://docs.delta.io/latest/delta-batch.html%23-deltatimetravel&sa=D&source=editors&ust=1666305658154469&usg=AOvVaw0Zh1svr9wsqkIDKGQTgtLh) .\n\n\n**Schema Enforcement —** Delta Lake provides the ability to specify your schema\n\nand enforce it. This helps ensure that the data types are correct and required\n\ncolumns are present, preventing bad data from causing data corruption.\n\n\n-----\n\n## Delta Lake: Key Features\n\nParquet\n\n\n**Open Format —** All data in Delta Lake is stored in Apache Parquet format,\n\nenabling Delta Lake to leverage the efficient compression and encoding schemes\n\nthat are native to Parquet.\n\n**Unified Batch and Streaming Source and Sink** — A table in Delta Lake is both a\n\nbatch table, as well as a streaming source and sink. Streaming data ingest, batch\n\nhistoric backfill, and interactive queries all just work out of the box.\n\n\n**Schema Evolution —** Big data is continuously changing. Delta Lake\n\nenables you to make changes to a table schema that can be applied\n\nautomatically, without the need for cumbersome DDL.\n\n**100% Compatible With Apache Spark API —** Developers can use Delta\n\nLake with their existing data pipelines with minimal change as it is fully\n\ncompatible with Spark, the commonly used big data processing engine.\n\n\n-----\n\n## Getting Started With Delta Lake\n\n**Getting started with Delta Lake is easy. Specifically, to create a Delta table simply specify Delta instead of using Parquet.**\n\n\n#### Instead of parquet ...\n```\ndataframe\n.write\n.format(“ parquet ”)\n.save(“/data”)\n\n```\n\n#### … simply say delta\n```\ndataframe\n.write\n.format(“ delta ”)\n.save(“/data”)\n\n```\n\n##### Learn more about Delta Lake :\n\n[Delta Lake Blogs](https://delta.io/blog)\n\nDelta Lake Tutorials\n\n[Delta Lake Integrations](https://delta.io/integrations/)\n\n**For more information, please refer to the** **[documentation](https://docs.delta.io/latest/index.html)** **.**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "#### eBook\n\n# The CDP Build vs Buy Guide:\n\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\n\n\n-----\n\n## The Need for a Customer Data Platform\n\n\nOrganizations need to deliver personalized experiences to their customers to stay ahead\nof the curve — that means they need a customer data platform (CDP). Through a CDP, data\nfrom every touch point, along with third-party information, is brought together to provide\na unified view of the customer. 
This enables your marketing team to analyze, identify and\nactivate customers with targeted content.\n\nThe key question for all IT teams at these organizations is whether to build or to buy.\n\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\nfastest path to a solution.\n\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\nexisting marketing and analytics systems.. The cost of adding another system to the\nlandscape and the redundancy of sensitive customer data creates a governance challenge\nthat has immediate consequences.\n\n**Critical IT Needs** **Critical Business Needs**\n\n\nKeep control of data access and\ngovernance; ability to architecture a\ncustomer data stack with decisions on\nwhere data is stored and where queries\nare executed\n\n\nGet customer data access via a no-code\ninterface to generate insights; build customer\nexperiences and activate data within\nbusiness applications\n\n\n-----\n\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\nside or the other unaddressed — which is why so many organizations who have built a CDP\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\n\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\n**both sides of the debate and provide organizations a third choice of both building and**\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\nthe business with no-code and ease of use interface along with the flexibility and centralized\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\nbuying, we’ve opened the door to finding the right balance of approaches for our customer\norganizations, helping organizations find greater success in their personalization journey.\n\n**“We made an attempt to internally build a CDP platform and while we**\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\n**or offer a campaign interface to our product marketers that could empower**\n**them to create and manage those journeys. It was going to take at least two**\n**years for us to build all of that functionality in house.”**\n\n– Sravan Gupta, Senior Manager of GTM Systems, Atlassian\n\n\n-----\n\n## Combining the Build and Buy Approaches\n\n\nBringing together the best of build and buy involves the deployment of the CDP alongside or\nwithin the lakehouse platform. There are three approaches to this:\n\n**Bundled** **Composable**\n\n**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\n\n\nCompute\n\nStorage\n\n\nCompute\n\nStorage\n(Local & Views)\n\n\nQuery\nVirtualization\n\nMetadata\n\n\nData Copy\n\n\nLakehouse\n\nStorage\n\n\nLakehouse\n\n\nLakehouse\n\n\nCompute Compute\n\nStorage Storage\n\n\n-----\n\nDeployment Type\n\n**Bundled**\n\n**Composable –**\n**Hybrid**\n\n**Composable –**\n**Lakehouse-Only**\n\n\nDescription\n\nThe CDP and the lakehouse are managed as two separate systems. 
Connectors in either system (as well as\nthird-party tools) allow data to be exchanged, typically as part of an ad hoc or batch process. This approach\nallows the organization to leverage the functionality of both systems but data is duplicated making governance\nan on-going concern.\n\nThe CDP and the lakehouse are managed as two separate systems, but deeper integrations between the two\nallow the organization to decide within which system a specific dataset should reside. Real-time integrations\nbetween the systems allow CDP users to select information assets in the lakehouse and generate queries\nspanning data on either side of the platform divide. This approach minimizes the need for data duplication\nwhich simplifies data governance, even though it must be implemented within two separate systems.\n\nAll CDP information assets reside within the lakehouse. User interfaces built on other technologies, directly\ninteract with the lakehouse for access to data. This approach minimizes redundancy and allows organizations\nto implement a centralized data governance strategy for all consumers of customer-relevant data.\n\n\n-----\n\n## Deployment Architectures \n\n\nThe choice of which of these deployment architectures is best depends on the functional\nrequirements of a specific organization. Each has its benefits, and in the case of parallel\nand federated deployments, organizations can easily transition between deployment\narchitectures over time. The following table captures many of the typical benefits\nassociated with the different deployment architectures.\n\n\nBundled CDP\nDeployment Composable CDPHybrid Composable CDPLakehouse-Only\n\n\nTypical\nUser\n\n**IT**\n\n\nComponent\n\nDigital Touchpoints\n\nData Modeling\n\nIdentity Resolution\n\nData Governance\n\n\nDescription\n\nCollect and integrate\ndata from digital\nchannels (website,\napp, etc.)\n\nUnify and model data\nto make it usable by\nother applications\n\nDeduplicate records to\nbuild a private ID graph\nwith a single view of\nthe customer\n\nControl data access\nand permitted actions\non the data\n\n\nIncluded with CDP\nvia a tag\n\nSometimes included\nwith CDP\n\nPrimarily with CDP\nor other tools (MDM,\nLakehouse)\n\nIncluded with CDP\n\n\nWorks with any digital\ntouchpoint collection\nsystem\n\nEither within the CDP\nor in Lakehouse via\nreal-time integration\n\nCDP, MDM, or\nLakehouse\n\nBoth CDP and\nLakehouse\n\n\nWorks with any digital\ntouchpoint collection\nsystem\n\nUnified environment with\nminimal data replication\nin and centralized\ngovernance in Lakehouse\n\nBuilt with Lakehouse and\nadditional tools\n\nManaged centrally from\nLakehouse\n\n\n-----\n\nBundled CDP\nDeployment Composable CDPHybrid Composable CDPLakehouse-Only\n\n\nTypical\nUser\n\n**Business**\n\n\nComponent\n\nPredictive Scoring\n\nMarketing Audience\nSegments\n\nCustomer Journey\nOrchestration\n\nData Activations\n\nAnalytics\n\n\nDescription\n\nCreate and execute\nmodels predicting\nuser behaviors such as\npurchase or churn\n\nUse a self-service UI\nto build rule-based\nor model-based\naudiences\n\nDefine and optimize\nthe customer journey\nand interactions with\nthe brand across every\nchannel and every\nphase of the customer\nlifecycle\n\nIntegrate seamlessly\nwith delivery systems\nfor both inbound and\noutbound customer\nexperiences\n\nUnderstand audience\nand customer journey\nperformance\n\n\nIncluded with CDP\nwith supplement\nscoring from\nLakehouse\n\nIncluded with CDP\n\nSometimes included\nwith CDP\n\nIncluded with CDP\n\nSometimes 
included\nwith CDP\n\n\nCDP, or automatically\npresent with Lakehouse\n\nIncluded with CDP\n\nCDP, marketing\nautomation, or\nadditional tools\n\nIncluded with CDP\n\nSometimes included\nwith CDP or built\nwith Lakehouse and\nadditional tools\n\n\nAutomatically present\nwith Lakehouse\n\nIncluded with CDP\n\nCDP, marketing\nautomation, or\nadditional tools\n\nCDP, or additional tools\n\nBuilt with Lakehouse\nand additional tools\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000 organizations worldwide —\nincluding Comcast, Condé Nast, H&M, and over 50% of the Fortune 500 — rely on\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\nis headquartered in San Francisco, with offices around the globe. Founded by the\noriginal creators of Apache SparkTM, Delta Lake and MLflow, Databricks is on a\nmission to help data teams solve the world’s toughest problems.\n\n## About ActionIQ\n\nAIQ brings order to CX chaos. Our Customer Experience Hub empowers\neveryone to be a CX champion by giving business teams the freedom to explore\nand action on customer data while helping technical teams regain control of\nwhere data lives and how it’s used.\n\n**[Get in touch](https://www.actioniq.com/get-started/)** with our experts to learn more.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf", + "2024-09-19T16:57:19Z" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "content", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "parser_status", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "doc_uri", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "last_modified", + "type": "\"timestamp\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "from cookbook.data_pipeline.parse_docs import (\n", + " load_files_to_df,\n", + " apply_parsing_fn,\n", + " check_parsed_df_for_errors,\n", + " check_parsed_df_for_empty_parsed_files\n", + ")\n", + "from cookbook.data_pipeline.utils.typed_dicts_to_spark_schema import typed_dicts_to_spark_schema\n", + "from cookbook.databricks_utils import get_table_url\n", + "\n", + "# Tune this parameter to optimize performance. 
More partitions will improve performance, but may cause out of memory errors if your cluster is too small.\n", + "NUM_PARTITIONS = 50\n", + "\n", + "# Load the UC Volume files into a Spark DataFrame\n", + "raw_files_df = load_files_to_df(\n", + " spark=spark,\n", + " source_path=source_config.volume_path,\n", + ").repartition(NUM_PARTITIONS)\n", + "\n", + "# Apply the parsing UDF to the Spark DataFrame\n", + "parsed_files_df = apply_parsing_fn(\n", + " raw_files_df=raw_files_df,\n", + " # Modify this function to change the parser, extract additional metadata, etc\n", + " parse_file_fn=file_parser,\n", + " # The schema of the resulting Delta Table will follow the schema defined in ParserReturnValue\n", + " parsed_df_schema=typed_dicts_to_spark_schema(ParserReturnValue),\n", + ")\n", + "\n", + "# Write to a Delta Table\n", + "parsed_files_df.write.mode(\"overwrite\").option(\"overwriteSchema\", \"true\").saveAsTable(\n", + " output_config.parsed_docs_table\n", + ")\n", + "\n", + "# Get resulting table\n", + "parsed_files_df = spark.table(output_config.parsed_docs_table)\n", + "parsed_files_no_errors_df = parsed_files_df.filter(\n", + " parsed_files_df.parser_status == \"SUCCESS\"\n", + ")\n", + "\n", + "# Show successfully parsed documents\n", + "print(f\"Parsed {parsed_files_no_errors_df.count()} / {parsed_files_df.count()} documents successfully. Inspect `parsed_files_no_errors_df` or visit {get_table_url(output_config.parsed_docs_table)} to see all parsed documents, including any errors.\")\n", + "display(parsed_files_no_errors_df.toPandas())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "51699851-1785-4c25-8c4b-01529341809a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Show any parsing failures or successfully parsed files that resulted in an empty document." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "20230e25-bdab-41f5-bda1-a61d7db60cb4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "All documents were parsed.\nAll documents produced non-null parsing results.\n" + ] + } + ], + "source": [ + "\n", + "# Any documents that failed to parse\n", + "is_error, msg, failed_docs_df = check_parsed_df_for_errors(parsed_files_df)\n", + "if is_error:\n", + " display(failed_docs_df.toPandas())\n", + " raise Exception(msg)\n", + " \n", + "# Any documents that returned empty parsing results\n", + "is_error, msg, empty_docs_df = check_parsed_df_for_empty_parsed_files(parsed_files_df)\n", + "if is_error:\n", + " display(empty_docs_df.toPandas())\n", + " raise Exception(msg)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e21c84e8-7682-4a7a-86fc-7f4f990bb490", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### Pipeline step 2: Compute chunks of documents\n", + "\n", + "In this step, we will split our documents into smaller chunks so they can be indexed in our vector database."
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eecd460c-f287-47ce-98f1-cea78a1f3f64", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "##### ✅✏️ Chunking logic.\n", + "\n", + "We provide a default implementation of a recursive text splitter. To create your own chunking logic, adapt the `get_recursive_character_text_splitter()` function inside `cookbook.data_pipeline.recursive_character_text_splitter.py`." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "02c40228-f933-4af8-9121-ed2efa0985dd", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Chunk size in tokens: 1024 and chunk overlap in tokens: 256 are valid. Using 16.0% (1280 tokens) of the 8192 token context window.\n" + ] + } + ], + "source": [ + "from cookbook.data_pipeline.recursive_character_text_splitter import (\n", + " get_recursive_character_text_splitter,\n", + ")\n", + "\n", + "# Get the chunking function\n", + "recursive_character_text_splitter_fn = get_recursive_character_text_splitter(\n", + " model_serving_endpoint=chunking_config.embedding_model_endpoint,\n", + " chunk_size_tokens=chunking_config.chunk_size_tokens,\n", + " chunk_overlap_tokens=chunking_config.chunk_overlap_tokens,\n", + ")\n", + "\n", + "# Determine which columns to propagate from the docs table to the chunks table.\n", + "\n", + "# Get the columns from the parser except for the content\n", + "# You can modify this to adjust which fields are propagated from the docs table to the chunks table.\n", + "propagate_columns = [\n", + " field.name\n", + " for field in typed_dicts_to_spark_schema(ParserReturnValue).fields\n", + " if field.name != \"content\"\n", + "]\n", + "\n", + "# If you want to implement retrieval strategies such as presenting the entire document vs. the chunk to the LLM, include `content`, which contains the doc's full parsed text. By default this is not included because the size of `content` can be quite large and cause performance issues.\n", + "# propagate_columns = [\n", + "#     field.name\n", + "#     for field in typed_dicts_to_spark_schema(ParserReturnValue).fields\n", + "# ]" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b17add2c-e7f0-4903-8ae9-40ca0633a8d5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "🚫✏️ Run the chunking function within Spark" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0dfa90f8-c4dc-4485-8fa8-dcd4c7d40618", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Applying chunking UDF to 29 documents using Spark - this may take a long time if you have many documents...\nCreated 581 chunks. 
Inspect `chunked_docs_df` or visit https://adb-984752964297111.11.azuredatabricks.net/explore/data/casaman_ssa/demos/test_product_docs_docs_chunked__v2 to see the results.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
chunk_idcontent_chunkedparser_statusdoc_urilast_modified
c2c667a9122ae783d9d5bfef443e7c5e**EBOOK**\n", + "\n", + "## The Big Book of Data Engineering 2nd Edition\n", + "\n", + "A collection of technical\n", + "blogs, including code\n", + "samples and notebooks\n", + "\n", + "##### With all-new content\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Contents\n", + "\n", + "**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n", + "\n", + "**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n", + "\n", + "**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n", + "\n", + "**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n", + "\n", + "**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n", + "\n", + "**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
ba5806f0679d7bbc4a72328d25697ece**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n", + "\n", + "**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n", + "\n", + "**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n", + "\n", + "**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n", + "\n", + "**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n", + "\n", + "**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n", + "\n", + "**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
28de646b6bc9ea25b7bb33e1d80de127**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n", + "\n", + "**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n", + "\n", + "**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n", + "\n", + "**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n", + "\n", + "**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n", + "\n", + "**4 . 1** Akamai .................................................................................................................................................................................................... 77SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
61a3db616e17315f75ec0e473cc055ba**4 . 1** Akamai .................................................................................................................................................................................................... 77\n", + "\n", + "**4 . 2** Grammarly ........................................................................................................................................................................................... 80\n", + "\n", + "**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n", + "\n", + "**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n", + "\n", + "**4 . 5** Rivian .................................................................................................................................................................................................... 90SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
87ad37fc256c9d60868a865a4ea85bf4**4 . 5** Rivian .................................................................................................................................................................................................... 90\n", + "\n", + "**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 01\n", + "\n", + "\n", + "### Introduction to Data Engineering on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "Organizations realize the value data plays as a strategic asset for various\n", + "business-related initiatives, such as growing revenues, improving the customer\n", + "experience, operating efficiently or improving a product or service. However,\n", + "accessing and managing data for these initiatives has become increasingly\n", + "complex. Most of the complexity has arisen with the explosion of data volumes\n", + "and data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\n", + "to increase, 73% of the data goes unused for analytics or decision-making. In\n", + "order to try and decrease this percentage and make more data usable, data\n", + "engineering teams are responsible for building data pipelines to efficiently and\n", + "reliably deliver data. But the process of building these complex data pipelines\n", + "comes with a number of difficulties:\n", + "\n", + "**•** In order to get data into a data lake, data engineers are required\n", + "to spend immense time hand-coding repetitive data ingestion tasks\n", + "\n", + "**•** Since data platforms continuously change, data engineers\n", + "spend time building and maintaining, and then rebuilding, complex\n", + "scalable infrastructure\n", + "\n", + "**•** As data pipelines become more complex, data engineers are\n", + "required to find reliable tools to orchestrate these pipelines\n", + "\n", + "**•** With the increasing importance of real-time data, low latency data\n", + "pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "**•** Finally, with all pipelines written, data engineers need to constantly\n", + "focus on performance, tuning pipelines and architectures to meet SLAs\n", + "\n", + "\n", + "**How can Databricks help?**\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The Lakehouse Platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability to\n", + "drive valuable insights.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
bf114a736c5b9b473f4e1c81c2bbaa5e**•** Since data platforms continuously change, data engineers\n", + "spend time building and maintaining, and then rebuilding, complex\n", + "scalable infrastructure\n", + "\n", + "**•** As data pipelines become more complex, data engineers are\n", + "required to find reliable tools to orchestrate these pipelines\n", + "\n", + "**•** With the increasing importance of real-time data, low latency data\n", + "pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "**•** Finally, with all pipelines written, data engineers need to constantly\n", + "focus on performance, tuning pipelines and architectures to meet SLAs\n", + "\n", + "\n", + "**How can Databricks help?**\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The Lakehouse Platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "Lakehouse Platform\n", + "\n", + "**One platform to support multiple personas**\n", + "\n", + "\n", + "**BI & Data**\n", + "**Warehousing**\n", + "\n", + "\n", + "**Data**\n", + "**Engineering**\n", + "\n", + "\n", + "**Data**\n", + "**Streaming**\n", + "\n", + "\n", + "**Data**\n", + "**Science & ML**\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "\n", + "**Unity Catalog**\n", + "**Fine-grained governance for data and AI**\n", + "\n", + "**Delta Lake**\n", + "**Data reliability and performance**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "All Raw Data (Logs, Texts, Audio, Video, Images)\n", + "\n", + "\n", + "Figure 1\n", + "The Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key differentiators for successful data engineering**\n", + "**with Databricks**\n", + "\n", + "By simplifying on a lakehouse architecture, data engineers need an\n", + "enterprise-grade and enterprise-ready approach to building data pipelines.\n", + "To be successful, a data engineering solution team must embrace these eight\n", + "key differentiating capabilities:\n", + "\n", + "**Data ingestion at scale**\n", + "With the ability to ingest petabytes of data with auto-evolving schemas,\n", + "data engineers can deliver fast, reliable, scalable and automatic data for\n", + "analytics, data science or machine learning. This includes:\n", + "\n", + "**•** Incrementally and efficiently processing data as it arrives\n", + "from files or streaming sources like Kafka, DBMS and NoSQL\n", + "\n", + "**•** Automatically inferring schema and detecting column\n", + "changes for structured and unstructured data formats\n", + "\n", + "**•** Automatically and efficiently tracking data as it arrives with\n", + "\n", + "no manual intervention\n", + "\n", + "**•** Preventing data loss by rescuing data columns\n", + "\n", + "\n", + "**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. 
This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
d85d526722f3ca9735bc45d98a9ad449**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified orchestration of data workflows**\n", + "Simple, clear and reliable orchestration of data processing tasks for data,\n", + "analytics and machine learning pipelines with the ability to run multiple\n", + "non-interactive tasks as a directed acyclic graph (DAG) on a Databricks\n", + "compute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\n", + "in a DAG using Databricks Workflows, an orchestration tool included in the\n", + "lakehouse with no need to maintain or pay for an external orchestration service.\n", + "\n", + "**•** Easily create and manage multiple tasks with dependencies via UI,\n", + "API or from your IDE\n", + "\n", + "**•** Have full observability to all workflow runs and get alerted when\n", + "tasks fail for fast troubleshooting and efficient repair and rerun\n", + "\n", + "**•** Leverage high reliability of 99.95% uptime\n", + "\n", + "**•** Use performance optimization clusters that parallelize jobs and\n", + "minimize data movement with cluster reuse\n", + "\n", + "**Data quality validation and monitoring**\n", + "Improve data reliability throughout the data lakehouse so data teams can\n", + "confidently trust the information for downstream initiatives by:\n", + "\n", + "**•** Defining data quality and integrity controls within the pipeline\n", + "with defined data expectations\n", + "\n", + "**•** Addressing data quality errors with predefined policies\n", + "(fail, drop, alert, quarantine)\n", + "\n", + "**•** Leveraging the data quality metrics that are captured, tracked\n", + "and reported for the entire data pipeline\n", + "\n", + "\n", + "Data\n", + "Sources\n", + "\n", + "Data\n", + "Warehouses\n", + "\n", + "On-premises\n", + "Systems\n", + "\n", + "SaaS\n", + "Applications\n", + "\n", + "Machine &\n", + "Application Logs\n", + "\n", + "Application\n", + "Events\n", + "\n", + "Mobile & IoT\n", + "Data\n", + "\n", + "\n", + "Cloud\n", + "Storage\n", + "\n", + "Messag\n", + "e Buses\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "**Workflows** for end-to-end orchestration\n", + "\n", + "\n", + "Real-Time BI Apps\n", + "\n", + "Real-Time AI Apps\n", + "\n", + "\n", + "Real-Time Analytics with\n", + "**Databricks SQL**\n", + "\n", + "Real-Time Machine Learning\n", + 
"with\n", + "**Databricks ML**\n", + "\n", + "\n", + "Streaming ETL with\n", + "**Delta Live Tables**\n", + "\n", + "\n", + "Predictive\n", + "Maintenance\n", + "\n", + "\n", + "Personalized\n", + "Offers\n", + "\n", + "\n", + "Patient\n", + "Diagnostics\n", + "\n", + "\n", + "Real-Time Operational\n", + "Apps\n", + "\n", + "\n", + "Real-Time Applications with\n", + "**Spark Structured Streaming**\n", + "\n", + "**Photon** for lightning-fast data processing\n", + "\n", + "**Unity Catalog** for data governance and sharing\n", + "\n", + "**Delta Lake** for open and reliable data storage\n", + "\n", + "\n", + "Alerts Detection Fraud\n", + "\n", + "\n", + "Dynamic\n", + "Pricing\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 2\n", + "A unified set of tools for real-time data processing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fault tolerant and automatic recovery**\n", + "Handle transient errors and recover from most common error conditions\n", + "occurring during the operation of a pipeline with fast, scalable automatic\n", + "recovery that includes:\n", + "\n", + "**•** Fault tolerant mechanisms to consistently recover the state of data\n", + "\n", + "**•** The ability to automatically track progress from the source with\n", + "checkpointing\n", + "\n", + "**•** The ability to automatically recover and restore the data pipeline stateSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
31bbc89514393b32579d699cbd8173e7Predictive\n", + "Maintenance\n", + "\n", + "\n", + "Personalized\n", + "Offers\n", + "\n", + "\n", + "Patient\n", + "Diagnostics\n", + "\n", + "\n", + "Real-Time Operational\n", + "Apps\n", + "\n", + "\n", + "Real-Time Applications with\n", + "**Spark Structured Streaming**\n", + "\n", + "**Photon** for lightning-fast data processing\n", + "\n", + "**Unity Catalog** for data governance and sharing\n", + "\n", + "**Delta Lake** for open and reliable data storage\n", + "\n", + "\n", + "Alerts Detection Fraud\n", + "\n", + "\n", + "Dynamic\n", + "Pricing\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 2\n", + "A unified set of tools for real-time data processing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fault tolerant and automatic recovery**\n", + "Handle transient errors and recover from most common error conditions\n", + "occurring during the operation of a pipeline with fast, scalable automatic\n", + "recovery that includes:\n", + "\n", + "**•** Fault tolerant mechanisms to consistently recover the state of data\n", + "\n", + "**•** The ability to automatically track progress from the source with\n", + "checkpointing\n", + "\n", + "**•** The ability to automatically recover and restore the data pipeline state\n", + "\n", + "**Data pipeline observability**\n", + "Monitor overall data pipeline status from a dataflow graph dashboard and\n", + "visually track end-to-end pipeline health for performance, quality and latency.\n", + "Data pipeline observability capabilities include:\n", + "\n", + "**•** A high-quality, high-fidelity lineage diagram that provides visibility\n", + "into how data flows for impact analysis\n", + "\n", + "**•** Granular logging with performance and status of the data pipeline\n", + "at a row level\n", + "\n", + "**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n", + "\n", + "\n", + "**Automatic deployments and operations**\n", + "Ensure reliable and predictable delivery of data for analytics and machine\n", + "learning use cases by enabling easy and automatic data pipeline deployments\n", + "and rollbacks to minimize downtime. Benefits include:\n", + "\n", + "**•** Complete, parameterized and automated deployment for the\n", + "continuous delivery of data\n", + "\n", + "**•** End-to-end orchestration, testing and monitoring of data pipeline\n", + "deployment across all major cloud providers\n", + "\n", + "**Migrations**\n", + "Accelerating and de-risking the migration journey to the lakehouse, whether\n", + "from legacy on-prem systems or disparate cloud services.\n", + "\n", + "The migration process starts with a detailed discovery and assessment to\n", + "get insights on legacy platform workloads and estimate migration as well as\n", + "Databricks platform consumption costs. Get help with the target architecture\n", + "and how the current technology stack maps to Databricks, followed by a\n", + "phased implementation based on priorities and business needs. 
Throughout\n", + "this journey companies can leverage:\n", + "\n", + "**•** Automation tools from Databricks and its ISV partners\n", + "\n", + "**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n", + "\n", + "**•** Databricks Professional Services and training\n", + "\n", + "This is the recommended approach for a successful migration, whereby\n", + "customers have seen a 25-50% reduction in costs and 2-3x faster time to value\n", + "for their use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified governance**\n", + "With Unity Catalog, data engineering and governance teams benefit from an\n", + "enterprisewide data catalog with a single interface to manage permissions,\n", + "centralize auditing, automatically track data lineage down to the column level,\n", + "and share data across platforms, clouds and regions. Benefits:\n", + "\n", + "**•** Discover all your data in one place, no matter where it lives,\n", + "and centrally manage fine-grained access permissions using an\n", + "ANSI SQL-based interface\n", + "\n", + "**•** Leverage automated column-level data lineage to perform impact\n", + "analysis of any data changes across the pipeline and conduct\n", + "root cause analysis of any errors in the data pipelines\n", + "\n", + "**•** Centrally audit data entitlements and access\n", + "\n", + "**•** Share data across clouds, regions and data platforms,\n", + "while maintaining a single copy of your data in your cloud storage\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 3\n", + "The Databricks Lakehouse Platform integrates with a large collection of technologies\n", + "\n", + "\n", + "**A rich ecosystem of data solutions**\n", + "The Databricks Lakehouse Platform is built on open source technologies and\n", + "uses open standards so leading data solutions can be leveraged with anything\n", + "you build on the lakehouse. A large collection of technology partners make it\n", + "easy and simple to integrate the technologies you rely on when migrating to\n", + "Databricks and to know you are not locked into a closed data technology stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Conclusion**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
7ef598df8d413a6a97a8acbf6316ff8c**•** Leverage automated column-level data lineage to perform impact\n", + "analysis of any data changes across the pipeline and conduct\n", + "root cause analysis of any errors in the data pipelines\n", + "\n", + "**•** Centrally audit data entitlements and access\n", + "\n", + "**•** Share data across clouds, regions and data platforms,\n", + "while maintaining a single copy of your data in your cloud storage\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 3\n", + "The Databricks Lakehouse Platform integrates with a large collection of technologies\n", + "\n", + "\n", + "**A rich ecosystem of data solutions**\n", + "The Databricks Lakehouse Platform is built on open source technologies and\n", + "uses open standards so leading data solutions can be leveraged with anything\n", + "you build on the lakehouse. A large collection of technology partners make it\n", + "easy and simple to integrate the technologies you rely on when migrating to\n", + "Databricks and to know you are not locked into a closed data technology stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Conclusion**\n", + "\n", + "As organizations strive to become data-driven, data engineering is a focal\n", + "point for success. To deliver reliable, trustworthy data, data engineers shouldn’t\n", + "need to spend time manually developing and maintaining an end-to-end\n", + "ETL lifecycle. Data engineering teams need an efficient, scalable way to\n", + "simplify ETL development, improve data reliability and manage operations.\n", + "\n", + "As described, the eight key differentiating capabilities simplify the\n", + "management of the ETL lifecycle by automating and maintaining all data\n", + "dependencies, leveraging built-in quality controls with monitoring and by\n", + "providing deep visibility into pipeline operations with automatic recovery.\n", + "Data engineering teams can now focus on easily and rapidly building reliable\n", + "end-to-end production-ready data pipelines using only SQL or Python\n", + "for batch and streaming that deliver high-value data for analytics, data\n", + "science or machine learning.\n", + "\n", + "\n", + "**Follow proven best practices**\n", + "\n", + "In the next section, we describe best practices for data engineering\n", + "end-to end use cases drawn from real-world examples. 
From data ingestion\n", + "and real-time processing to analytics and machine learning, you’ll learn\n", + "how to translate raw data into actionable data.\n", + "\n", + "As you explore the rest of this guide, you can find data sets and code\n", + "samples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\n", + "get your hands dirty as you explore all aspects of the data lifecycle on the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### Guidance and Best Practices\n", + "\n", + "**2.1** Top 5 Databricks Performance Tips\n", + "\n", + "**2.2** How to Profile PySpark\n", + "\n", + "**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n", + "\n", + "**2.4** Streaming in Production: Collected Best Practices\n", + "\n", + "**2.5** Streaming in Production: Collected Best Practices, Part 2\n", + "\n", + "**2.6** Building Geospatial Data Products\n", + "\n", + "**2.7** Data Lineage With Unity Catalog\n", + "\n", + "**2.8** Easy Ingestion to Lakehouse With COPY INTO\n", + "\n", + "**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n", + "\n", + "**2.10** Best Practices for Cross-Government Data Sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.1\n", + "\n", + "**Top 5 Databricks Performance Tips**\n", + "\n", + "by **B R YA N S M I T H** and **R O B S A K E R**\n", + "\n", + "March 10, 2022\n", + "\n", + "\n", + "As solutions architects, we work closely with customers every day to help them\n", + "get the best performance out of their jobs on Databricks — and we often end\n", + "up giving the same advice. It’s not uncommon to have a conversation with a\n", + "customer and get double, triple, or even more performance with just a few\n", + "tweaks. So what’s the secret? How are we doing this? Here are the top 5 things\n", + "we see that can make a huge impact on the performance customers get\n", + "from Databricks.\n", + "\n", + "Here’s a TLDR:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
03da766c8ecba0e1ef483a5639ff3aed**2.8** Easy Ingestion to Lakehouse With COPY INTO\n", + "\n", + "**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n", + "\n", + "**2.10** Best Practices for Cross-Government Data Sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.1\n", + "\n", + "**Top 5 Databricks Performance Tips**\n", + "\n", + "by **B R YA N S M I T H** and **R O B S A K E R**\n", + "\n", + "March 10, 2022\n", + "\n", + "\n", + "As solutions architects, we work closely with customers every day to help them\n", + "get the best performance out of their jobs on Databricks — and we often end\n", + "up giving the same advice. It’s not uncommon to have a conversation with a\n", + "customer and get double, triple, or even more performance with just a few\n", + "tweaks. So what’s the secret? How are we doing this? Here are the top 5 things\n", + "we see that can make a huge impact on the performance customers get\n", + "from Databricks.\n", + "\n", + "Here’s a TLDR:\n", + "\n", + "**•** **Use larger clusters.** It may sound obvious, but this is the number\n", + "one problem we see. It’s actually not any more expensive to use a large\n", + "cluster for a workload than it is to use a smaller one. It’s just faster.\n", + "If there’s anything you should take away from this article, it’s this.\n", + "\n", + "Read section 1. Really.\n", + "\n", + "**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\n", + "to learn more. You won’t regret it.\n", + "\n", + "\n", + "\n", + "**•** **Clean out your configurations** . Configurations carried from one\n", + "Apache Spark™ version to the next can cause massive problems. Clean up!\n", + "Read section 3 to learn more.\n", + "\n", + "**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\n", + "correctly, if at all. See Section 4 to learn more.\n", + "\n", + "**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\n", + "you’re writing Spark code, jump to section 5.\n", + "\n", + "**•** **Bonus tip! Table design is super important** . We’ll go into this in a future\n", + "blog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n", + "\n", + "**1. Give your clusters horsepower!**\n", + "\n", + "This is the number one mistake customers make. Many customers create tiny\n", + "clusters of two workers with four cores each, and it takes forever to do anything.\n", + "The concern is always the same: they don’t want to spend too much money on\n", + "larger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n", + "**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n", + "\n", + "\n", + "-----\n", + "\n", + "The key is that you’re renting the cluster for the length of the workload. So, if\n", + "you spin up that two worker cluster and it takes an hour, you’re paying for those\n", + "workers for the full hour. However, if you spin up a four worker cluster and it takes\n", + "only half an hour, the cost is actually the same! 
And that trend continues as long\n", + "as there’s enough work for the cluster to do.\n", + "\n", + "Here’s a hypothetical scenario illustrating the point:\n", + "\n", + "**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n", + "\n", + "1 $1 2 $2\n", + "\n", + "2 $2 1 $2\n", + "\n", + "4 $4 0.5 $2\n", + "\n", + "8 $8 0.25 $2SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
63a5c596165f1ac1b23faf5431a91677-----\n", + "\n", + "The key is that you’re renting the cluster for the length of the workload. So, if\n", + "you spin up that two worker cluster and it takes an hour, you’re paying for those\n", + "workers for the full hour. However, if you spin up a four worker cluster and it takes\n", + "only half an hour, the cost is actually the same! And that trend continues as long\n", + "as there’s enough work for the cluster to do.\n", + "\n", + "Here’s a hypothetical scenario illustrating the point:\n", + "\n", + "**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n", + "\n", + "1 $1 2 $2\n", + "\n", + "2 $2 1 $2\n", + "\n", + "4 $4 0.5 $2\n", + "\n", + "8 $8 0.25 $2\n", + "\n", + "Notice that the total cost of the workload stays the same while the real-world\n", + "time it takes for the job to run drops significantly. So, bump up your Databricks\n", + "cluster specs and speed up your workloads without spending any more money. It\n", + "\n", + "can’t really get any simpler than that.\n", + "\n", + "**2. Use Photon**\n", + "\n", + "Our colleagues in engineering have rewritten the Spark execution engine in C++\n", + "and dubbed it Photon. The results are impressive!\n", + "\n", + "\n", + "Beyond the obvious improvements due to running the engine in native code,\n", + "they’ve also made use of CPU-level performance features and better memory\n", + "\n", + "management. On top of this, they’ve rewritten the Parquet writer in C++. So this\n", + "makes writing to Parquet and Delta (based on Parquet) super fast as well!\n", + "\n", + "But let’s also be clear about what Photon is speeding up. It improves\n", + "computation speed for any built-in functions or operations, as well as writes to\n", + "Parquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n", + "(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\n", + "spending most of its time reading from an ancient on-prem database? Won’t\n", + "help there either, unfortunately.\n", + "\n", + "\n", + "-----\n", + "\n", + "The good news is that it helps where it can. So even if part of your job can’t be\n", + "sped up, it will speed up the other parts. Also, most jobs are written with the\n", + "native operations and spend a lot of time writing to Delta, and Photon helps a lot\n", + "there. So give it a try. You may be amazed by the results!\n", + "\n", + "**3. Clean out old configurations**\n", + "\n", + "You know those Spark configurations you’ve been carrying along from version to\n", + "version and no one knows what they do anymore? They may not be harmless.\n", + "We’ve seen jobs go from running for hours down to minutes simply by cleaning\n", + "out old configurations. There may have been a quirk in a particular version of\n", + "Spark, a performance tweak that has not aged well, or something pulled off\n", + "some blog somewhere that never really made sense. At the very least, it’s worth\n", + "revisiting your Spark configurations if you’re in this situation. Often the default\n", + "configurations are the best, and they’re only getting better. Your configurations\n", + "may be holding you back.\n", + "\n", + "**4. 
The Delta Cache is your friend**\n", + "\n", + "This may seem obvious, but you’d be surprised how many people are not using\n", + "the [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\n", + "the workers’ SSDs for faster access.\n", + "\n", + "\n", + "If you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\n", + "by default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\n", + "your “hot” tables when you’re starting an endpoint. This will ensure blazing fast\n", + "speeds for any queries on those tables.\n", + "\n", + "If you’re using regular clusters, be sure to use the i3 series on Amazon Web\n", + "Services (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\n", + "all have fast SSDs and caching enabled by default.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
1ff1e7336923f2d0af4396b7ae44350eThis may seem obvious, but you’d be surprised how many people are not using\n", + "the [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\n", + "the workers’ SSDs for faster access.\n", + "\n", + "\n", + "If you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\n", + "by default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\n", + "your “hot” tables when you’re starting an endpoint. This will ensure blazing fast\n", + "speeds for any queries on those tables.\n", + "\n", + "If you’re using regular clusters, be sure to use the i3 series on Amazon Web\n", + "Services (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\n", + "all have fast SSDs and caching enabled by default.\n", + "\n", + "Of course, your mileage may vary. If you’re doing BI, which involves reading the\n", + "same tables over and over again, caching gives an amazing boost. However, if\n", + "you’re simply reading a table once and writing out the results as in some ETL\n", + "jobs, you may not get much benefit. You know your jobs better than anyone.\n", + "Go forth and conquer.\n", + "\n", + "\n", + "-----\n", + "\n", + "**5. Be aware of lazy evaluation**\n", + "\n", + "\n", + "However, there is a catch here. Every time you try to display or write out\n", + "results, it runs the execution plan again. Let’s look at the same block of code\n", + "but extend it and do a few more operations.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ".filter(...)\n", + ")\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "\n", + "_# Unfortunately this will run the plan again, including filtering, joining,_\n", + "_etc_\n", + "df2.display()\n", + "\n", + "_# So will this…_\n", + "df2.count()\n", + "—------\n", + "\n", + "\n", + "If you’re a data analyst or data scientist only using SQL or doing BI you can skip\n", + "this section. However, if you’re in data engineering and writing pipelines or doing\n", + "processing using Databricks/Spark, read on.\n", + "\n", + "When you’re writing Spark code like select, groupBy, filter, etc., you’re really\n", + "building an execution plan. You’ll notice the code returns almost immediately when\n", + "you run these functions. That’s because it’s not actually doing any computation. So\n", + "even if you have petabytes of data, it will return in less than a second.\n", + "\n", + "However, once you go to write your results out you’ll notice it takes longer. This\n", + "is due to lazy evaluation. It’s not until you try to display or write results that your\n", + "execution plan is actually run.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + "\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "—------\n", + "\n", + "\n", + "-----\n", + "\n", + "The developer of this code may very well be thinking that they’re just printing\n", + "out results three times, but what they’re really doing is kicking off the same\n", + "processing three times. Oops. That’s a lot of extra work. 
This is a very common\n", + "mistake we run into. So why is there lazy evaluation, and what do we do about it?\n", + "\n", + "In short, processing with lazy evaluation is way faster than without it.\n", + "Databricks/Spark looks at the full execution plan and finds opportunities\n", + "for optimization that can reduce processing time by orders of magnitude.\n", + "So that’s great, but how do we avoid the extra computation? The answer\n", + "is pretty straightforward: save computed results you will reuse.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
0b935d4987169eb45d9abb94dceb2ad6—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + "\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "—------\n", + "\n", + "\n", + "-----\n", + "\n", + "The developer of this code may very well be thinking that they’re just printing\n", + "out results three times, but what they’re really doing is kicking off the same\n", + "processing three times. Oops. That’s a lot of extra work. This is a very common\n", + "mistake we run into. So why is there lazy evaluation, and what do we do about it?\n", + "\n", + "In short, processing with lazy evaluation is way faster than without it.\n", + "Databricks/Spark looks at the full execution plan and finds opportunities\n", + "for optimization that can reduce processing time by orders of magnitude.\n", + "So that’s great, but how do we avoid the extra computation? The answer\n", + "is pretty straightforward: save computed results you will reuse.\n", + "\n", + "\n", + "This works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\n", + "benefit greatly from lazy evaluation, but it’s something a lot of customers trip\n", + "over. So be aware of its existence and save results you reuse in order to avoid\n", + "unnecessary computation.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "Let’s look at the same block of code again, but this time let’s avoid the\n", + "recomputation:\n", + "\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + ")\n", + "\n", + "_# save it_\n", + "df2.write.save(path)\n", + "\n", + "_# load it back in_\n", + "df3 = spark.read.load(path)\n", + "\n", + "_# now use it_\n", + "df3.display()\n", + "\n", + "_# this is not doing any extra computation anymore. No joins, filtering,_\n", + "_etc. It’s already done and saved._\n", + "df3.display()\n", + "\n", + "_# nor is this_\n", + "df3.count()\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.2 \u0007\n", + "\n", + "**How to Profile PySpark**\n", + "\n", + "by **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n", + "\n", + "October 6, 2022\n", + "\n", + "\n", + "In Apache Spark™, declarative Python APIs are supported for big data workloads.\n", + "They are powerful enough to handle most common use cases. Furthermore,\n", + "PySpark UDFs offer more flexibility since they enable users to run arbitrary\n", + "Python code on top of the Apache Spark™ engine. Users only have to state\n", + "“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\n", + "PySpark easier to use, but it can be difficult to identify performance bottlenecks\n", + "and apply custom optimizations.\n", + "\n", + "To address the difficulty mentioned above, PySpark supports various profiling\n", + "tools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n", + "[implementations](https://docs.python.org/3/library/profile.html) . 
PySpark Profilers provide information such as the number\n", + "of function calls, total time spent in the given function, and filename, as well\n", + "as line number to help navigation. That information is essential to exposing\n", + "tight loops in your PySpark programs, and allowing you to make performance\n", + "\n", + "improvement decisions.\n", + "\n", + "\n", + "**Driver profiling**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
a0691e5c37475a0f10612ea46afc205eTo address the difficulty mentioned above, PySpark supports various profiling\n", + "tools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n", + "[implementations](https://docs.python.org/3/library/profile.html) . PySpark Profilers provide information such as the number\n", + "of function calls, total time spent in the given function, and filename, as well\n", + "as line number to help navigation. That information is essential to exposing\n", + "tight loops in your PySpark programs, and allowing you to make performance\n", + "\n", + "improvement decisions.\n", + "\n", + "\n", + "**Driver profiling**\n", + "\n", + "PySpark applications run as independent sets of processes on a cluster,\n", + "coordinated by the SparkContext object in the driver program. On the driver\n", + "side, PySpark is a regular Python process; thus, we can profile it as a normal\n", + "Python program using cProfile as illustrated below:\n", + "\n", + "import cProfile\n", + "\n", + "with cProfile.Profile() as pr:\n", + "_# Your code_\n", + "\n", + "pr.print_stats()\n", + "\n", + "**Workers profiling**\n", + "\n", + "Executors are distributed on worker nodes in the cluster, which introduces\n", + "complexity because we need to aggregate profiles. Furthermore, a Python worker\n", + "process is spawned per executor for PySpark UDF execution, which makes the\n", + "profiling more intricate.\n", + "\n", + "\n", + "-----\n", + "\n", + "The UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\n", + "and becomes a major tool to profile workers for PySpark applications. We’ll\n", + "illustrate how to use the UDF profiler with a simple Pandas UDF example.\n", + "\n", + "Firstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n", + "```\n", + " sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n", + " 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n", + " by 'id' )\n", + " ).withColumn( 'v' , rand())\n", + "\n", + "```\n", + "Later, we will group by the id column, which results in 8 groups with 1,000 rows\n", + "per group.\n", + "\n", + "The Pandas UDF plus_one is then created and applied as shown below:\n", + "```\n", + " import pandas as pd\n", + " def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf.apply( lambda x: x + 1 , axis= 1 )\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "Executing the example above and running sc.show_profiles() prints the\n", + "following profile. The profile below can also be dumped to disk by sc.dump_\n", + "profiles(path).\n", + "\n", + "The UDF id in the profile (271, highlighted above) matches that in the Spark plan\n", + "for res. The Spark plan can be shown by calling res.explain() .\n", + "\n", + "\n", + "Note that plus_one takes a pandas DataFrame and returns another pandas\n", + "DataFrame. For each group, all columns are passed together as a pandas\n", + "DataFrame to the plus_one UDF, and the returned pandas DataFrames are\n", + "combined into a PySpark DataFrame.\n", + "\n", + "\n", + "-----\n", + "\n", + "The first line in the profile’s body indicates the total number of calls that were\n", + "monitored. 
The column heading includes\n", + "\n", + "**•** ncalls , for the number of calls.\n", + "\n", + "**•** tottime , for the total time spent in the given function (excluding time\n", + "spent in calls to sub-functions)\n", + "\n", + "**•** percall , the quotient of tottime divided by ncalls\n", + "\n", + "**•** cumtime , the cumulative time spent in this and all subfunctions (from\n", + "invocation till exit)\n", + "\n", + "**•** percall , the quotient of cumtime divided by primitive calls\n", + "\n", + "**•** filename:lineno(function) , which provides the respective information\n", + "for each functionSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
dd07dacad46874c8f7a92f1c7ad7099d-----\n", + "\n", + "The first line in the profile’s body indicates the total number of calls that were\n", + "monitored. The column heading includes\n", + "\n", + "**•** ncalls , for the number of calls.\n", + "\n", + "**•** tottime , for the total time spent in the given function (excluding time\n", + "spent in calls to sub-functions)\n", + "\n", + "**•** percall , the quotient of tottime divided by ncalls\n", + "\n", + "**•** cumtime , the cumulative time spent in this and all subfunctions (from\n", + "invocation till exit)\n", + "\n", + "**•** percall , the quotient of cumtime divided by primitive calls\n", + "\n", + "**•** filename:lineno(function) , which provides the respective information\n", + "for each function\n", + "\n", + "Digging into the column details: plus_one is triggered once per group, 8 times\n", + "in total; _arith_method of pandas Series is called once per row, 8,000 times\n", + "in total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\n", + "row, thus suffering from high invocation overhead.\n", + "\n", + "We can reduce such overhead by substituting the pandas.DataFrame.apply\n", + "with pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\n", + "follows:\n", + "```\n", + " import pandas as pd\n", + " def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf + 1\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n", + " schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "The updated profile is as shown below.\n", + "\n", + "We can summarize the optimizations as follows:\n", + "\n", + "**•** Arithmetic operation from 8,000 calls to 8 calls\n", + "\n", + "**•** Total function calls from 2,898,160 calls to 2,384 calls\n", + "\n", + "**•** Total execution time from 2.300 seconds to 0.004 seconds\n", + "\n", + "The short example above demonstrates how the UDF profiler helps us deeply\n", + "understand the execution, identify the performance bottleneck and enhance\n", + "the overall performance of the user-defined function.\n", + "\n", + "The UDF profiler was implemented based on the executor-side profiler,\n", + "which is designed for PySpark RDD API. The executor-side profiler is available\n", + "in all active Databricks Runtime versions.\n", + "\n", + "\n", + "-----\n", + "\n", + "Both the UDF profiler and the executor-side profiler run on Python workers.\n", + "They are controlled by the spark.python.profile Spark configuration, which\n", + "is false by default. We can enable that Spark configuration on a Databricks\n", + "Runtime cluster as shown below.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "PySpark profilers are implemented based on cProfile; thus, the profile reporting\n", + "relies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\n", + "collecting profile reports from Python workers.\n", + "\n", + "Powerful profilers are provided by PySpark in order to identify hot loops and\n", + "suggest potential improvements. They are easy to use and critical to enhance\n", + "the performance of PySpark programs. 
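The screenshot of the cluster configuration is not reproduced here. As a hedged sketch: `spark.python.profile` is read when the SparkContext is created, so on Databricks it is typically added to the cluster's Spark config (`spark.python.profile true`), while a self-managed session would set it on the SparkConf up front, for example:

```
from pyspark import SparkConf
from pyspark.sql import SparkSession

# spark.python.profile must be set before the SparkContext exists,
# so configure it here rather than with spark.conf.set() at runtime.
conf = SparkConf().set("spark.python.profile", "true")
spark = SparkSession.builder.config(conf=conf).getOrCreate()
```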
The UDF profiler, which is available\n", + "starting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\n", + "challenges and brings insights to user-defined functions.\n", + "\n", + "In addition, there is an ongoing effort in the Apache Spark™ open source\n", + "community to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\n", + "more information.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.3 \u0007\n", + "\n", + "**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n", + "**and Apache Kafka**\n", + "\n", + "by **F R A N K M U N Z**\n", + "\n", + "August 9, 2022SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
1bb64e9939be901e7a31554b2a84b4b3Powerful profilers are provided by PySpark in order to identify hot loops and\n", + "suggest potential improvements. They are easy to use and critical to enhance\n", + "the performance of PySpark programs. The UDF profiler, which is available\n", + "starting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\n", + "challenges and brings insights to user-defined functions.\n", + "\n", + "In addition, there is an ongoing effort in the Apache Spark™ open source\n", + "community to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\n", + "more information.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.3 \u0007\n", + "\n", + "**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n", + "**and Apache Kafka**\n", + "\n", + "by **F R A N K M U N Z**\n", + "\n", + "August 9, 2022\n", + "\n", + "\n", + "[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\n", + "approach for creating reliable data pipelines and fully manages the underlying\n", + "infrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\n", + "actionable insights derived from near real-time data. Delta Live Tables enables\n", + "low-latency streaming data pipelines to support such use cases with low\n", + "latencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n", + "[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n", + "\n", + "This article will walk through using DLT with Apache Kafka while providing the\n", + "required Python code to ingest streams. The recommended system architecture\n", + "will be explained, and related DLT settings worth considering will be explored\n", + "along the way.\n", + "\n", + "**Streaming platforms**\n", + "\n", + "Event buses or message buses decouple message producers from consumers.\n", + "A popular streaming use case is the collection of click-through data from\n", + "users navigating a website where every user interaction is stored as an event in\n", + "\n", + "\n", + "Apache Kafka. The event stream from Kafka is then used for real-time streaming\n", + "data analytics. Multiple message consumers can read the same data from Kafka\n", + "and use the data to learn about audience interests, conversion rates, and bounce\n", + "reasons. The real-time, streaming event data from the user interactions often\n", + "also needs to be correlated with actual purchases stored in a billing database.\n", + "\n", + "**Apache Kafka**\n", + "\n", + "[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\n", + "topic, an append-only distributed log of events where messages are buffered for\n", + "a certain amount of time. Although messages in Kafka are not deleted once they\n", + "are consumed, they are also not stored indefinitely. The message retention for\n", + "\n", + "Kafka can be configured per topic and defaults to 7 days. 
Expired messages will\n", + "be deleted eventually.\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to many other event busses or messaging systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming data pipelines**\n", + "\n", + "\n", + "In a data flow pipeline, Delta Live Tables and their dependencies can be declared\n", + "with a standard SQL Create Table As Select (CTAS) statement and the DLT\n", + "keyword “live.”SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
bc24283f816ae54daa94d5fcb0bd9f6e**Apache Kafka**\n", + "\n", + "[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\n", + "topic, an append-only distributed log of events where messages are buffered for\n", + "a certain amount of time. Although messages in Kafka are not deleted once they\n", + "are consumed, they are also not stored indefinitely. The message retention for\n", + "\n", + "Kafka can be configured per topic and defaults to 7 days. Expired messages will\n", + "be deleted eventually.\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to many other event busses or messaging systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming data pipelines**\n", + "\n", + "\n", + "In a data flow pipeline, Delta Live Tables and their dependencies can be declared\n", + "with a standard SQL Create Table As Select (CTAS) statement and the DLT\n", + "keyword “live.”\n", + "\n", + "When developing DLT with Python, the @dlt.table decorator is used to create a\n", + "Delta Live Table. To ensure the data quality in a pipeline, DLT uses [Expectations](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html)\n", + "which are simple SQL constraints clauses that define the pipeline’s behavior with\n", + "invalid records.\n", + "\n", + "Since streaming workloads often come with unpredictable data volumes,\n", + "Databricks employs [enhanced autoscaling](https://databricks.com/blog/2022/06/29/delta-live-tables-announces-new-capabilities-and-performance-optimizations.html) for data flow pipelines to minimize the\n", + "overall end-to-end latency while reducing cost by shutting down unnecessary\n", + "infrastructure.\n", + "\n", + "**Delta Live Tables** are fully recomputed, in the right order, exactly once for each\n", + "pipeline run.\n", + "\n", + "In contrast, **streaming Delta Live Tables** are stateful, incrementally computed\n", + "and only process data that has been added since the last pipeline run. If the\n", + "query which defines a streaming live tables changes, new data will be processed\n", + "based on the new query but existing data is not recomputed. Streaming live\n", + "tables always use a streaming source and only work over append-only streams,\n", + "such as Kafka, Kinesis, or Auto Loader. Streaming DLTs are based on top of Spark\n", + "Structured Streaming.\n", + "\n", + "\n", + "You can chain multiple streaming pipelines, for example, workloads with very\n", + "large data volume and low latency requirements.\n", + "\n", + "**Direct ingestion from streaming engines**\n", + "\n", + "Delta Live Tables written in Python can directly ingest data from an event bus like\n", + "Kafka using Spark Structured Streaming. You can set a short retention period for\n", + "the Kafka topic to avoid compliance issues, reduce costs and then benefit from\n", + "the cheap, elastic and governable storage that Delta provides.\n", + "\n", + "As a first step in the pipeline, we recommend ingesting the data as is to a Bronze\n", + "(raw) table and avoid complex transformations that could drop important data.\n", + "Like any Delta table the Bronze table will retain the history and allow it to perform\n", + "GDPR and other compliance tasks.\n", + "\n", + "Ingest streaming data from Apache Kafka\n", + "\n", + "\n", + "-----\n", + "\n", + "When writing DLT pipelines in Python, you use the @dlt.table annotation\n", + "to create a DLT table. 
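The Expectations mentioned above attach to the same decorator-based Python API. A minimal sketch, in which the downstream table name and the constraint are illustrative rather than part of the original example:

```
import dlt

# Illustrative only: table name and constraint are hypothetical.
@dlt.table(comment="Kafka events with a basic quality check applied")
@dlt.expect_or_drop("valid_payload", "value IS NOT NULL")   # drop records that violate the constraint
def kafka_bronze_clean():
    return dlt.read_stream("kafka_bronze")
```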
There is no special attribute to mark streaming DLTs in\n", + "Python; simply use spark.readStream() to access the stream. Example code\n", + "for creating a DLT table with the name kafka_bronze that is consuming data\n", + "from a Kafka topic looks as follows:\n", + "\n", + "import dlt\n", + "from pyspark.sql.functions import - \n", + "from pyspark.sql.types import - \n", + "\n", + "TOPIC = \"tracker-events\"\n", + "KAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n", + "_# subscribe to TOPIC at KAFKA_BROKER_\n", + "raw_kafka_events = (spark.readStream\n", + ". format ( \"kafka\" )\n", + ".option( \"subscribe\" , TOPIC)\n", + ".option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n", + ".option( \"startingOffsets\" , \"earliest\" )\n", + ".load()\n", + ")SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
9eccab88cb97885330ddb5b2d5e96a79Ingest streaming data from Apache Kafka\n", + "\n", + "\n", + "-----\n", + "\n", + "When writing DLT pipelines in Python, you use the @dlt.table annotation\n", + "to create a DLT table. There is no special attribute to mark streaming DLTs in\n", + "Python; simply use spark.readStream() to access the stream. Example code\n", + "for creating a DLT table with the name kafka_bronze that is consuming data\n", + "from a Kafka topic looks as follows:\n", + "\n", + "import dlt\n", + "from pyspark.sql.functions import - \n", + "from pyspark.sql.types import - \n", + "\n", + "TOPIC = \"tracker-events\"\n", + "KAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n", + "_# subscribe to TOPIC at KAFKA_BROKER_\n", + "raw_kafka_events = (spark.readStream\n", + ". format ( \"kafka\" )\n", + ".option( \"subscribe\" , TOPIC)\n", + ".option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n", + ".option( \"startingOffsets\" , \"earliest\" )\n", + ".load()\n", + ")\n", + "\n", + "**@dlt.table(table_properties={** **\"pipelines.reset.allowed\"** **:** **\"false\"** **})**\n", + "```\n", + " def kafka_bronze ():\n", + "\n", + "```\n", + "return raw_kafka_events\n", + "\n", + "pipelines.reset.allowed\n", + "\n", + "Note that event buses typically expire messages after a certain period of time,\n", + "whereas Delta is designed for infinite retention.\n", + "\n", + "This might lead to the effect that source data on Kafka has already been deleted\n", + "when running a full refresh for a DLT pipeline. In this case, not all historic data\n", + "could be backfilled from the messaging platform, and data would be missing in\n", + "DLT tables. To prevent dropping data, use the following DLT table property:\n", + "\n", + "\n", + "pipelines.reset.allowed=false\n", + "\n", + "Setting pipelines.reset.allowed to false prevents refreshes to the table but\n", + "does not prevent incremental writes to the tables or new data from flowing into\n", + "the table.\n", + "\n", + "**Checkpointing**\n", + "\n", + "If you are an experienced Spark Structured Streaming developer, you will notice\n", + "the absence of checkpointing in the above code. 
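Before moving on to checkpointing, here is a cleaned-up, copy-paste-friendly rendering of the ingestion code above, with the wildcard imports restored and the decorator attached directly to the table function. All names come from the example itself; `KAFKA_SERVER` is expected to be set as a Spark configuration value.

```
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

TOPIC = "tracker-events"
KAFKA_BROKER = spark.conf.get("KAFKA_SERVER")

# subscribe to TOPIC at KAFKA_BROKER
raw_kafka_events = (
    spark.readStream.format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", KAFKA_BROKER)
    .option("startingOffsets", "earliest")
    .load()
)

# pipelines.reset.allowed=false prevents a full refresh from dropping data
# that has already expired on the Kafka broker
@dlt.table(table_properties={"pipelines.reset.allowed": "false"})
def kafka_bronze():
    return raw_kafka_events
```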
In Spark Structured Streaming\n", + "checkpointing is required to persist progress information about what data has\n", + "been successfully processed and upon failure, this metadata is used to restart a\n", + "failed query exactly where it left off.\n", + "\n", + "Whereas checkpoints are necessary for failure recovery with exactly-once\n", + "guarantees in Spark Structured Streaming, DLT handles state automatically\n", + "without any manual configuration or explicit checkpointing required.\n", + "\n", + "**Mixing SQL and Python for a DLT pipeline**\n", + "\n", + "A DLT pipeline can consist of multiple notebooks but one DLT notebook is\n", + "required to be written entirely in either SQL or Python (unlike other Databricks\n", + "notebooks where you can have cells of different languages in a single notebook).\n", + "\n", + "Now, if your preference is SQL, you can code the data ingestion from Apache\n", + "Kafka in one notebook in Python and then implement the transformation logic of\n", + "your data pipelines in another notebook in SQL.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Schema mapping**\n", + "\n", + "When reading data from messaging platform, the data stream is opaque and a\n", + "schema has to be provided.\n", + "\n", + "The Python example below shows the schema definition of events from a fitness\n", + "tracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n", + "\n", + "event_schema = StructType([ \\\n", + "StructField( \"time\" , TimestampType(), True ) , \\\n", + "StructField( \"version\" , StringType(), True ), \\\n", + "StructField( \"model\" , StringType(), True ) , \\\n", + "StructField( \"heart_bpm\" , IntegerType(), True ), \\\n", + "StructField( \"kcal\" , IntegerType(), True ) \\\n", + "])SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
420316efc848ce013053d1e9f161a508-----\n", + "\n", + "**Schema mapping**\n", + "\n", + "When reading data from messaging platform, the data stream is opaque and a\n", + "schema has to be provided.\n", + "\n", + "The Python example below shows the schema definition of events from a fitness\n", + "tracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n", + "\n", + "event_schema = StructType([ \\\n", + "StructField( \"time\" , TimestampType(), True ) , \\\n", + "StructField( \"version\" , StringType(), True ), \\\n", + "StructField( \"model\" , StringType(), True ) , \\\n", + "StructField( \"heart_bpm\" , IntegerType(), True ), \\\n", + "StructField( \"kcal\" , IntegerType(), True ) \\\n", + "])\n", + "\n", + "_# temporary table, visible in pipeline but not in data browser,_\n", + "_# cannot be queried interactively_\n", + "**@dlt.table(comment=** **\"real schema for Kakfa payload\"** **,**\n", + "**temporary=** **True** **)**\n", + "```\n", + " def kafka_silver ():\n", + "\n", + "```\n", + "return (\n", + "_# kafka streams are (timestamp,value)_\n", + "_# value contains the kafka payload_\n", + "\n", + "dlt.read_stream( \"kafka_bronze\" )\n", + ".select(col( \"timestamp\" ),from_json(col( \"value\" )\n", + ".cast( \"string\" ), event_schema).alias( \"event\" ))\n", + ".select( \"timestamp\" , \"event.*\" )\n", + "\n", + "\n", + "**Benefits**\n", + "\n", + "Reading streaming data in DLT directly from a message broker minimizes the\n", + "architectural complexity and provides lower end-to-end latency since data is\n", + "directly streamed from the messaging broker and no intermediary step is involved.\n", + "\n", + "**Streaming ingest with cloud object store intermediary**\n", + "\n", + "For some specific use cases, you may want to offload data from Apache Kafka,\n", + "e.g., using a Kafka connector, and store your streaming data in a cloud object\n", + "intermediary. In a Databricks workspace, the cloud vendor-specific objectstore can then be mapped via the Databricks Files System (DBFS) as a cloudindependent folder. Once the data is offloaded, [Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) can\n", + "ingest the files.\n", + "\n", + "Auto Loader can ingest data with a single line of SQL code. The syntax to ingest\n", + "JSON files into a DLT table is shown below (it is wrapped across two lines for\n", + "readability).\n", + "\n", + "_-- INGEST with Auto Loader_\n", + "create or replace streaming live table raw\n", + "as select `*` FROM cloud_files(\"dbfs:/data/twitter\", \"json\")\n", + "\n", + "\n", + "-----\n", + "\n", + "Note that Auto Loader itself is a streaming data source and all newly arrived files\n", + "will be processed exactly once, hence the streaming keyword for the raw table\n", + "that indicates data is ingested incrementally to that table.\n", + "\n", + "Since offloading streaming data to a cloud object store introduces an additional\n", + "step in your system architecture it will also increase the end-to-end latency\n", + "and create additional storage costs. 
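For completeness, the schema-mapping example above rendered as runnable code (the snippet as printed is missing its closing parentheses and imports):

```
import dlt
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import (
    StructType, StructField, TimestampType, StringType, IntegerType,
)

event_schema = StructType([
    StructField("time", TimestampType(), True),
    StructField("version", StringType(), True),
    StructField("model", StringType(), True),
    StructField("heart_bpm", IntegerType(), True),
    StructField("kcal", IntegerType(), True),
])

# temporary table: visible in the pipeline but not in the data browser,
# and it cannot be queried interactively
@dlt.table(comment="real schema for Kafka payload", temporary=True)
def kafka_silver():
    return (
        # Kafka streams are (timestamp, value); value contains the Kafka payload
        dlt.read_stream("kafka_bronze")
        .select(
            col("timestamp"),
            from_json(col("value").cast("string"), event_schema).alias("event"),
        )
        .select("timestamp", "event.*")
    )
```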
Keep in mind that the Kafka connector\n", + "writing event data to the cloud object store needs to be managed, increasing\n", + "operational complexity.\n", + "\n", + "Therefore Databricks recommends as a best practice to directly access event\n", + "bus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n", + "\n", + "**Other event buses or messaging systems**\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to other event buses or messaging systems. DLT supports any data\n", + "source that Databricks Runtime directly supports.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
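The Auto Loader ingestion shown above in SQL can also be expressed with the DLT Python API. A minimal sketch, assuming the same path and format as the SQL example; everything else is illustrative:

```
import dlt

@dlt.table(comment="Raw JSON files ingested incrementally with Auto Loader")
def raw():
    return (
        spark.readStream.format("cloudFiles")        # cloudFiles = Auto Loader source
        .option("cloudFiles.format", "json")
        .load("dbfs:/data/twitter")
    )
```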
e6e44af0aaa9b015f23cd88d6bc493f9Since offloading streaming data to a cloud object store introduces an additional\n", + "step in your system architecture it will also increase the end-to-end latency\n", + "and create additional storage costs. Keep in mind that the Kafka connector\n", + "writing event data to the cloud object store needs to be managed, increasing\n", + "operational complexity.\n", + "\n", + "Therefore Databricks recommends as a best practice to directly access event\n", + "bus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n", + "\n", + "**Other event buses or messaging systems**\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to other event buses or messaging systems. DLT supports any data\n", + "source that Databricks Runtime directly supports.\n", + "\n", + "**Amazon Kinesis**\n", + "In Kinesis, you write messages to a fully managed serverless stream. Same as\n", + "Kafka, Kinesis does not permanently store messages. The default message\n", + "retention in Kinesis is one day.\n", + "\n", + "When using Amazon Kinesis, replace format(“kafka”) with format(“kinesis”) in the\n", + "Python code for streaming ingestion above and add Amazon Kinesis-specific\n", + "settings with option(). For more information, check the section about Kinesis\n", + "Integration in the Spark Structured Streaming documentation.\n", + "\n", + "\n", + "**Azure Event Hubs**\n", + "\n", + "For Azure Event Hubs settings, check the official [documentation at Microsoft](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-kafka-spark-tutorial) and\n", + "the article [Delta Live Tables recipes: Consuming from Azure Event Hubs](https://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html) .\n", + "\n", + "**Summary**\n", + "\n", + "DLT is much more than just the “T” in ETL. With DLT, you can easily ingest from\n", + "streaming and batch sources, cleanse and transform data on the Databricks\n", + "Lakehouse Platform on any cloud with guaranteed data quality.\n", + "\n", + "Data from Apache Kafka can be ingested by directly connecting to a Kafka broker\n", + "from a DLT notebook in Python. Data loss can be prevented for a full pipeline\n", + "refresh even when the source data in the Kafka streaming layer expired.\n", + "\n", + "**Get started**\n", + "\n", + "If you are a Databricks customer, simply follow the [guide to get started](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) . Read the\n", + "release notes to learn more about what’s included in this GA release. If you are\n", + "not an existing Databricks customer, [sign up for a free trial](https://www.databricks.com/try-databricks) , and you can view our\n", + "detailed [DLT pricing here](https://www.databricks.com/product/pricing) .\n", + "\n", + "Join the conversation in the [Databricks Community](https://community.databricks.com/s/topic/0TO8Y000000VJEhWAO/summit22) where data-obsessed peers\n", + "are chatting about Data + AI Summit 2022 announcements and updates. Learn.\n", + "Network.\n", + "\n", + "Last but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\n", + "summit. 
In that session, I walk you through the code of another streaming data\n", + "example with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\n", + "Hugging Face sentiment analysis.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.4 \u0007\n", + "\n", + "**Streaming in Production: Collected Best Practices**\n", + "\n", + "by **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "December 12, 2022\n", + "\n", + "\n", + "Releasing any data pipeline or application into a production state requires\n", + "planning, testing, monitoring, and maintenance. Streaming pipelines are no\n", + "different in this regard; in this blog we present some of the most important\n", + "considerations for deploying streaming pipelines and applications to a\n", + "production environment.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
80cc83dcbd5eb42991856a090e952ce8Last but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\n", + "summit. In that session, I walk you through the code of another streaming data\n", + "example with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\n", + "Hugging Face sentiment analysis.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.4 \u0007\n", + "\n", + "**Streaming in Production: Collected Best Practices**\n", + "\n", + "by **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "December 12, 2022\n", + "\n", + "\n", + "Releasing any data pipeline or application into a production state requires\n", + "planning, testing, monitoring, and maintenance. Streaming pipelines are no\n", + "different in this regard; in this blog we present some of the most important\n", + "considerations for deploying streaming pipelines and applications to a\n", + "production environment.\n", + "\n", + "At Databricks, we offer two different ways of building and running streaming\n", + "pipelines and applications — [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) and [Databricks Workflows](https://www.databricks.com/product/workflows) .\n", + "DLT is our flagship, fully managed ETL product that supports both batch and\n", + "streaming pipelines. It offers declarative development, automated operations,\n", + "data quality, advanced observability capabilities, and more. Workflows enable\n", + "customers to run Apache Spark™ workloads in Databricks’ optimized runtime\n", + "environment (i.e., Photon) with access to unified governance (Unity Catalog) and\n", + "storage (Delta Lake). Regarding streaming workloads, both DLT and Workflows\n", + "\n", + "share the same core streaming engine — Spark Structured Streaming. In the\n", + "case of DLT, customers program against the DLT API and DLT uses the Structured\n", + "Streaming engine under the hood. In the case of Jobs, customers program\n", + "against the Spark API directly.\n", + "\n", + "\n", + "The recommendations in this blog post are written from the Structured\n", + "Streaming engine perspective, most of which apply to both DLT and Workflows\n", + "(although DLT does take care of some of these automatically, like Triggers and\n", + "Checkpoints). We group the recommendations under the headings “Before\n", + "Deployment” and “After Deployment” to highlight when these concepts will\n", + "need to be applied and are releasing this blog series with this split between\n", + "the two. There will be additional deep-dive content for some of the sections\n", + "beyond as well. We recommend reading all sections before beginning work\n", + "to productionalize a streaming pipeline or application, and revisiting these\n", + "recommendations as you promote it from dev to QA and eventually production.\n", + "\n", + "**Before deployment**\n", + "\n", + "There are many things you need to consider when creating your streaming\n", + "application to improve the production experience. Some of these topics, like\n", + "unit testing, checkpoints, triggers, and state management, will determine how\n", + "your streaming application performs. 
Others, like naming conventions and how\n", + "many streams to run on which clusters, have more to do with managing multiple\n", + "streaming applications in the same environment.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unit testing**\n", + "\n", + "\n", + "The cost associated with finding and fixing a bug goes up exponentially\n", + "the farther along you get in the SDLC process, and a Structured Streaming\n", + "application is no different. When you’re turning that prototype into a hardened\n", + "production pipeline you need a CI/CD process with built-in tests. So how do you\n", + "create those tests?\n", + "\n", + "At first you might think that unit testing a streaming pipeline requires something\n", + "special, but that isn’t the case. The general guidance for streaming pipelines is\n", + "no different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . It starts by\n", + "organizing your code so that it can be unit tested effectively:\n", + "\n", + "**•** Divide your code into testable chunks\n", + "\n", + "**•** Organize your business logic into functions calling other functions.\n", + "If you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n", + "[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\n", + "multiple functions that can be individually tested.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
9c08dab20cbc4981e1fa79ec23a29b7cAt first you might think that unit testing a streaming pipeline requires something\n", + "special, but that isn’t the case. The general guidance for streaming pipelines is\n", + "no different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . It starts by\n", + "organizing your code so that it can be unit tested effectively:\n", + "\n", + "**•** Divide your code into testable chunks\n", + "\n", + "**•** Organize your business logic into functions calling other functions.\n", + "If you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n", + "[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\n", + "multiple functions that can be individually tested.\n", + "\n", + "**•** Do not code in dependencies on the global state or external systems\n", + "\n", + "**•** Any function manipulating a DataFrame or data set should be organized\n", + "to take the DataFrame/data set/configuration as input and output the\n", + "DataFrame/data set\n", + "\n", + "Once your code is separated out in a logical manner you can implement unit\n", + "tests for each of your functions. Spark-agnostic functions can be tested like any\n", + "other function in that language. For testing UDFs and functions with DataFrames\n", + "and data sets, there are multiple Spark testing frameworks available. These\n", + "\n", + "\n", + "frameworks support all of the DataFrame/data set APIs so that you can easily\n", + "create input, and they have specialized assertions that allow you to compare\n", + "DataFrame content and schemas. Some examples are:\n", + "\n", + "**•** The built-in Spark test suite, designed to test all parts of Spark\n", + "\n", + "**•** spark-testing-base, which has support for both Scala and Python\n", + "\n", + "**•** spark-fast-tests, for testing Scala Spark 2 & 3\n", + "\n", + "**•** chispa, a Python version of spark-fast-tests\n", + "\n", + "Code examples for each of these libraries can be found [here](https://github.com/alexott/spark-playground/tree/master/testing) .\n", + "\n", + "But wait! I’m testing a streaming application here — don’t I need to make\n", + "streaming DataFrames for my unit tests? The answer is no; you do not! Even\n", + "though a streaming DataFrame represents a data set with no defined ending,\n", + "when functions are executed on it they are executed on a microbatch — a\n", + "discrete set of data. You can use the same unit tests that you would use for a\n", + "batch application, for both stateless and stateful streams. One of the advantages\n", + "of Structured Streaming over other frameworks is the ability to use the same\n", + "transformation code for both streaming and with other batch operations for\n", + "the same sink. This allows you to simplify some operations, like backfilling\n", + "data, for example, where rather than trying to sync the logic between two\n", + "different applications, you can just modify the input sources and write to the\n", + "same destination. 
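As a hedged illustration of the guidance above, a transformation written as "DataFrame in, DataFrame out" can be unit tested with ordinary batch DataFrames. The function and test below are hypothetical, and chispa is one of the frameworks listed above:

```
from chispa import assert_df_equality
from pyspark.sql import DataFrame, SparkSession
from pyspark.sql.functions import col

def flag_large_orders(df: DataFrame) -> DataFrame:
    # Business logic kept free of global state: DataFrame in, DataFrame out
    return df.withColumn("is_large", col("amount") >= 75)

def test_flag_large_orders():
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    source = spark.createDataFrame([(100.0,), (50.0,)], ["amount"])
    expected = spark.createDataFrame(
        [(100.0, True), (50.0, False)], ["amount", "is_large"]
    )
    # ignore_nullable: hand-built and derived schemas may differ only in nullability
    assert_df_equality(flag_large_orders(source), expected, ignore_nullable=True)
```

The same function can then be reused unchanged inside a streaming query or a foreachBatch handler.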
If the sink is a Delta table, you can even do these operations\n", + "concurrently if both processes are append-only operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Triggers**\n", + "\n", + "\n", + "process a microbatch in order to maximize resource utilization, but setting the\n", + "interval longer would make sense if your stream is running on a shared cluster\n", + "and you don’t want it to constantly take the cluster resources.\n", + "\n", + "If you do not need your stream to run continuously, either because data doesn’t\n", + "come that often or your SLA is 10 minutes or greater, then you can use the\n", + "Trigger.Once option. This option will start up the stream, check for anything new\n", + "since the last time it ran, process it all in one big batch, and then shut down.\n", + "Just like with a continuously running stream when using Trigger.Once, the\n", + "checkpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
64979eeceeec57cd56c9ae0a8c21ca10-----\n", + "\n", + "**Triggers**\n", + "\n", + "\n", + "process a microbatch in order to maximize resource utilization, but setting the\n", + "interval longer would make sense if your stream is running on a shared cluster\n", + "and you don’t want it to constantly take the cluster resources.\n", + "\n", + "If you do not need your stream to run continuously, either because data doesn’t\n", + "come that often or your SLA is 10 minutes or greater, then you can use the\n", + "Trigger.Once option. This option will start up the stream, check for anything new\n", + "since the last time it ran, process it all in one big batch, and then shut down.\n", + "Just like with a continuously running stream when using Trigger.Once, the\n", + "checkpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.\n", + "\n", + "Spark has a new version of Trigger.Once called Trigger.AvailableNow. While\n", + "Trigger.Once will process everything in one big batch, which depending on your\n", + "data size may not be ideal, Trigger.AvailableNow will split up the data based on\n", + "maxFilesPerTrigger and maxBytesPerTrigger settings. This allows the data to be\n", + "processed in multiple batches. Those settings are ignored with Trigger.Once.\n", + "You can see examples for setting triggers [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) .\n", + "\n", + "**Pop quiz —** how do you turn your streaming process into a batch process\n", + "that automatically keeps track of where it left off with just one line of code?\n", + "\n", + "**Answer —** change your processing time trigger to Trigger.Once/Trigger.\n", + "AvailableNow! Exact same code, running on a schedule, that will neither miss nor\n", + "reprocess any records.\n", + "\n", + "\n", + "Now that you know your code works, you need to determine how often your\n", + "stream will look for new data. This is where [triggers](https://docs.databricks.com/structured-streaming/triggers.html) come in. Setting a trigger is\n", + "one of the options for the writeStream command, and it looks like this:\n", + "\n", + "_// Scala/Java_\n", + ".trigger(Trigger.ProcessingTime( \"30 seconds\" ))\n", + "\n", + "_# Python_\n", + ".trigger(processingTime= '30 seconds' )\n", + "\n", + "In the above example, if a microbatch completes in less than 30 seconds,\n", + "then the engine will wait for the rest of the time before kicking off the next\n", + "microbatch. If a microbatch takes longer than 30 seconds to complete, then the\n", + "engine will start the next microbatch immediately after the previous one finishes.\n", + "\n", + "The two factors you should consider when setting your trigger interval are how\n", + "long you expect your stream to process a microbatch and how often you want\n", + "the system to check for new data. You can lower the overall processing latency\n", + "by using a shorter trigger interval and increasing the resources available for\n", + "the streaming query by adding more workers or using compute or memory\n", + "optimized instances tailored to your application’s performance. These increased\n", + "resources come with increased costs, so if your goal is to minimize costs, then a\n", + "longer trigger interval with less compute can work. 
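A hedged sketch of the trigger variants discussed above, using the built-in rate source so the example is self-contained; the checkpoint path and target table name are illustrative:

```
df = spark.readStream.format("rate").option("rowsPerSecond", 10).load()  # demo source

query = (
    df.writeStream.format("delta")
    .queryName("rate_demo")                                      # see "Name your stream" below
    .option("checkpointLocation", "/tmp/checkpoints/rate_demo")  # illustrative path
    # .trigger(processingTime="30 seconds")  # continuously running, 30-second cadence
    .trigger(availableNow=True)              # process the backlog in bounded batches, then stop
    .toTable("rate_demo_bronze")             # illustrative table name
)
```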
Normally you would not set a\n", + "trigger interval longer than what it would typically take for your stream to\n", + "\n", + "\n", + "-----\n", + "\n", + "**Name your stream**\n", + "\n", + "\n", + "You name your children, you name your pets, now it’s time to name your streams.\n", + "There’s a writeStream option called .queryName that allows you to provide a\n", + "friendly name for your stream. Why bother? Well, suppose you don’t name it. In\n", + "that case, all you’ll have to go on in the Structured Streaming tab in the Spark UI\n", + "is the string and the unintelligible guid that is automatically generated\n", + "as the stream’s unique identifier. If you have more than one stream running on a\n", + "cluster, and all of them have and unintelligible strings as identifiers,\n", + "how do you find the one you want? If you’re exporting metrics how do you tell\n", + "which is which?\n", + "\n", + "Make it easy on yourself, and name your streams. When you’re managing them in\n", + "production you’ll be glad you did, and while you’re at it, go and name your batch\n", + "queries in any foreachBatch() code you have.\n", + "\n", + "**Fault tolerance**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
ea7bdc029f34b7ff3e5937db55b11a2d-----\n", + "\n", + "**Name your stream**\n", + "\n", + "\n", + "You name your children, you name your pets, now it’s time to name your streams.\n", + "There’s a writeStream option called .queryName that allows you to provide a\n", + "friendly name for your stream. Why bother? Well, suppose you don’t name it. In\n", + "that case, all you’ll have to go on in the Structured Streaming tab in the Spark UI\n", + "is the string and the unintelligible guid that is automatically generated\n", + "as the stream’s unique identifier. If you have more than one stream running on a\n", + "cluster, and all of them have and unintelligible strings as identifiers,\n", + "how do you find the one you want? If you’re exporting metrics how do you tell\n", + "which is which?\n", + "\n", + "Make it easy on yourself, and name your streams. When you’re managing them in\n", + "production you’ll be glad you did, and while you’re at it, go and name your batch\n", + "queries in any foreachBatch() code you have.\n", + "\n", + "**Fault tolerance**\n", + "\n", + "How does your stream recover from being shut down? There are a few different\n", + "cases where this can come into play, like cluster node failures or intentional\n", + "halts, but the solution is to set up checkpointing. Checkpoints with write-ahead\n", + "logs provide a degree of protection from your streaming application being\n", + "interrupted, ensuring it will be able to pick up again where it last left off.\n", + "\n", + "Checkpoints store the current offsets and state values (e.g., aggregate values) for\n", + "your stream. Checkpoints are stream specific so each should be set to its own\n", + "location. Doing this will let you recover more gracefully from shutdowns, failures\n", + "from your application code or unexpected cloud provider failures or limitations.\n", + "\n", + "\n", + "To configure checkpoints, add the checkpointLocation option to your stream\n", + "definition:\n", + "\n", + "_// Scala/Java/Python_\n", + "streamingDataFrame.writeStream\n", + ".format( \"delta\" )\n", + ".option( \"path\" , \"\" )\n", + ".queryName( \"TestStream\" )\n", + ".option( \"checkpointLocation\" , \"\" )\n", + ".start()\n", + "\n", + "To keep it simple — every time you call .writeStream, you must specify the\n", + "checkpoint option with a unique checkpoint location. Even if you’re using\n", + "foreachBatch and the writeStream itself doesn’t specify a path or table option,\n", + "you must still specify that checkpoint. It’s how Spark Structured Streaming gives\n", + "you hassle-free fault tolerance.\n", + "\n", + "Efforts to manage the checkpointing in your stream should be of little concern\n", + "in general. As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454) , “The simplest way to perform streaming\n", + "analytics is not having to reason about streaming at all.” That said, one setting\n", + "\n", + "deserves mention as questions around the maintenance of checkpoint files\n", + "come up occasionally. Though it is an internal setting that doesn’t require direct\n", + "configuration, the setting spark.sql.streaming.minBatchesToRetain (default 100)\n", + "controls the number of checkpoint files that get created. Basically, the number\n", + "of files will be roughly this number times two, as there is a file created noting the\n", + "offsets at the beginning of the batch (offsets, a.k.a write ahead logs) and another\n", + "on completing the batch (commits). 
The number of files is checked periodically\n", + "for cleanup as part of the internal processes. This simplifies at least one aspect\n", + "of long-term streaming application maintenance for you.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
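To make the foreachBatch point above concrete: even when the per-batch function handles the write itself, the outer writeStream still needs its own checkpoint location and benefits from a query name. A minimal sketch with illustrative names and paths:

```
streaming_df = spark.readStream.format("rate").load()  # placeholder source

def write_batch(batch_df, batch_id):
    # Hypothetical per-microbatch logic; sink table name is illustrative
    batch_df.write.format("delta").mode("append").saveAsTable("events_sink")

query = (
    streaming_df.writeStream
    .queryName("events_sink_writer")                                  # friendly name in the Spark UI
    .foreachBatch(write_batch)
    .option("checkpointLocation", "/tmp/checkpoints/events_sink")     # still required with foreachBatch
    .start()
)
```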
701356b6bf73f801bad3ac82f9c7f2adEfforts to manage the checkpointing in your stream should be of little concern\n", + "in general. As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454) , “The simplest way to perform streaming\n", + "analytics is not having to reason about streaming at all.” That said, one setting\n", + "\n", + "deserves mention as questions around the maintenance of checkpoint files\n", + "come up occasionally. Though it is an internal setting that doesn’t require direct\n", + "configuration, the setting spark.sql.streaming.minBatchesToRetain (default 100)\n", + "controls the number of checkpoint files that get created. Basically, the number\n", + "of files will be roughly this number times two, as there is a file created noting the\n", + "offsets at the beginning of the batch (offsets, a.k.a write ahead logs) and another\n", + "on completing the batch (commits). The number of files is checked periodically\n", + "for cleanup as part of the internal processes. This simplifies at least one aspect\n", + "of long-term streaming application maintenance for you.\n", + "\n", + "\n", + "-----\n", + "\n", + "It is also important to note that some changes to your application code can\n", + "invalidate the checkpoint. Checking for any of these changes during code\n", + "reviews before deployment is recommended. You can find examples of changes\n", + "where this can happen in [Recovery Semantics after Changes in a Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query)\n", + "[Query](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query) . Suppose you want to look at checkpointing in more detail or consider\n", + "whether asynchronous checkpointing might improve the latency in your\n", + "streaming application. In that case, these are covered in greater depth in\n", + "[Speed Up Streaming Queries With Asynchronous State Checkpointing](https://www.databricks.com/blog/2022/05/02/speed-up-streaming-queries-with-asynchronous-state-checkpointing.html) .\n", + "\n", + "**State management and RocksDB**\n", + "\n", + "Stateful streaming applications are those where current records may depend\n", + "on previous events, so Spark has to retain data in between microbatches.\n", + "The data it retains is called state, and Spark will store it in a state store and\n", + "read, update and delete it during each microbatch. Typical stateful operations\n", + "are streaming aggregations, streaming dropDuplicates, stream-stream joins,\n", + "mapGroupsWithState, or flatMapGroupsWithState. Some common types of\n", + "examples where you’ll need to think about your application state could be\n", + "sessionization or hourly aggregation using group by methods to calculate\n", + "\n", + "business metrics. Each record in the state store is identified by a key that is used\n", + "as part of the stateful computation, and the more unique keys that are required\n", + "the larger the amount of state data that will be stored.\n", + "\n", + "When the amount of state data needed to enable these stateful operations\n", + "grows large and complex, it can degrade your workloads’ performance, leading\n", + "to increased latency or even failures. A typical indicator of the state store being\n", + "\n", + "\n", + "the culprit of added latency is large amounts of time spent in garbage collection\n", + "(GC) pauses in the JVM. 
If you are monitoring the microbatch processing time,\n", + "this could look like a continual increase or wildly varying processing time across\n", + "microbatches.\n", + "\n", + "The default configuration for a state store, which is sufficient for most general\n", + "streaming workloads, is to store the state data in the executors’ JVM memory.\n", + "Large number of keys (typically millions, see the Monitoring & Instrumentation\n", + "section in part 2 of this blog) can add excessive memory pressure on the\n", + "machine memory and increase the frequency of hitting these GC pauses as it\n", + "tries to free up resources.\n", + "\n", + "On the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\n", + "use [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\n", + "memory pressure. RocksDB is an embeddable persistent key-value store for fast\n", + "storage. It features high performance through a log-structured database engine\n", + "written entirely in C++ and optimized for fast, low-latency storage.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
e29cc632dc23cfaf8a58cd45fbe25eb9The default configuration for a state store, which is sufficient for most general\n", + "streaming workloads, is to store the state data in the executors’ JVM memory.\n", + "Large number of keys (typically millions, see the Monitoring & Instrumentation\n", + "section in part 2 of this blog) can add excessive memory pressure on the\n", + "machine memory and increase the frequency of hitting these GC pauses as it\n", + "tries to free up resources.\n", + "\n", + "On the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\n", + "use [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\n", + "memory pressure. RocksDB is an embeddable persistent key-value store for fast\n", + "storage. It features high performance through a log-structured database engine\n", + "written entirely in C++ and optimized for fast, low-latency storage.\n", + "\n", + "Leveraging RocksDB as the state store provider still uses machine memory\n", + "but no longer occupies space in the JVM and makes for a more efficient\n", + "state management system for large amounts of keys. This doesn’t come for\n", + "free, however, as it introduces an extra step in processing every microbatch.\n", + "Introducing RocksDB shouldn’t be expected to reduce latency except when it is\n", + "related to memory pressure from state data storage in the JVM. The RocksDBbacked state store still provides the same degree of fault tolerance as the\n", + "regular state storage as it is included in the stream checkpointing.\n", + "\n", + "\n", + "-----\n", + "\n", + "RocksDB configuration, like checkpoint configuration, is minimal by design and so\n", + "you only need to declare it in your overall Spark configuration:\n", + "\n", + "spark.conf. set (\n", + "\"spark.sql.streaming.stateStore.providerClass\" ,\n", + "\"com.databricks.sql.streaming.state.RocksDBStateStoreProvider\" )\n", + "\n", + "If you are monitoring your stream using the streamingQueryListener class, then\n", + "you will also notice that RocksDB metrics will be included in the stateOperators\n", + "field. For more detailed information on this see the [RocksDB State Store Metrics](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics)\n", + "[section](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics) of “Structured Streaming in Production.”\n", + "\n", + "It’s worth noting that large numbers of keys can have other adverse impacts in\n", + "addition to raising memory consumption, especially with unbounded or nonexpiring state keys. With or without RocksDB, the state from the application\n", + "also gets backed up in checkpoints for fault tolerance. So it makes sense that\n", + "if you have state files being created so that they will not expire, you will keep\n", + "accumulating files in the checkpoint, increasing the amount of storage required\n", + "and potentially the time to write it or recover from failures as well. For the data\n", + "in memory (see the Monitoring & Instrumentation section in part 2 of this blog)\n", + "\n", + "this situation can lead to somewhat vague out-of-memory errors, and for the\n", + "checkpointed data written to cloud storage you might observe unexpected\n", + "and unreasonable growth. 
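For copy-paste convenience, the RocksDB state store configuration shown above, reformatted:

```
spark.conf.set(
    "spark.sql.streaming.stateStore.providerClass",
    "com.databricks.sql.streaming.state.RocksDBStateStoreProvider",
)
```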
Unless you have a business need to retain streaming\n", + "state for all the data that has been processed (and that is rare), read the [Spark](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)\n", + "[Structured Streaming documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) and make sure to implement your stateful\n", + "operations so that the system can drop state records that are no longer needed\n", + "(pay close attention to dropDuplicates and stream-stream joins).\n", + "\n", + "\n", + "**Running multiple streams on a cluster**\n", + "\n", + "Once your streams are fully tested and configured, it’s time to figure out how to\n", + "organize them in production. It’s a common pattern to stack multiple streams on\n", + "the same Spark cluster to maximize resource utilization and save cost. This is fine\n", + "to a point, but there are limits to how much you can add to one cluster before\n", + "performance is affected. The driver has to manage all of the streams running on\n", + "the cluster, and all streams will compete for the same cores across the workers.\n", + "You need to understand what your streams are doing and plan your capacity\n", + "appropriately to stack effectively.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
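As one illustration of letting the system drop state that is no longer needed, a watermark bounds how long deduplication state is retained. The column names and interval below are illustrative, and the rate source is used only to keep the sketch self-contained:

```
events = spark.readStream.format("rate").load()    # demo source with "timestamp" and "value" columns

deduped = (
    events
    .withWatermark("timestamp", "10 minutes")      # state older than the watermark can be dropped
    .dropDuplicates(["value", "timestamp"])        # dedup key includes the event-time column
)
```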
3198901ba583ef2045fa27542ef5cd04**Running multiple streams on a cluster**\n", + "\n", + "Once your streams are fully tested and configured, it’s time to figure out how to\n", + "organize them in production. It’s a common pattern to stack multiple streams on\n", + "the same Spark cluster to maximize resource utilization and save cost. This is fine\n", + "to a point, but there are limits to how much you can add to one cluster before\n", + "performance is affected. The driver has to manage all of the streams running on\n", + "the cluster, and all streams will compete for the same cores across the workers.\n", + "You need to understand what your streams are doing and plan your capacity\n", + "appropriately to stack effectively.\n", + "\n", + "Here is what you should take into account when you’re planning on stacking\n", + "multiple streams on the same cluster:\n", + "\n", + "**•** Make sure your driver is big enough to manage all of your streams. Is your\n", + "driver struggling with a high CPU utilization and garbage collection? That\n", + "means it’s struggling to manage all of your streams. Either reduce the\n", + "number of streams or increase the size of your driver.\n", + "\n", + "**•** Consider the amount of data each stream is processing. The more data\n", + "you are ingesting and writing to a sink, the more cores you will need in\n", + "order to maximize your throughput for each stream. You’ll need to reduce\n", + "the number of streams or increase the number of workers depending on\n", + "how much data is being processed. For sources like Kafka you will need to\n", + "configure how many cores are being used to ingest with the minPartitions\n", + "option if you don’t have enough cores for all of the partitions across all of\n", + "your streams.\n", + "\n", + "\n", + "-----\n", + "\n", + "**•** Consider the complexity and data volume of your streams. If all of the\n", + "streams are doing minimal manipulation and just appending to a sink, then\n", + "each stream will need fewer resources per microbatch and you’ll be able to\n", + "stack more. If the streams are doing stateful processing or computation/\n", + "memory-intensive operations, that will require more resources for good\n", + "performance and you’ll want to stack fewer streams.\n", + "\n", + "**•** Consider [scheduler pools](https://spark.apache.org/docs/latest/job-scheduling.html#fair-scheduler-pools) . When stacking streams they will all be\n", + "contending for the same workers and cores, and one stream that needs a\n", + "lot of cores will cause the other streams to wait. Scheduler pools enable\n", + "you to have different streams execute on different parts of the cluster.\n", + "This will enable streams to execute in parallel with a subset of the available\n", + "resources.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "Some of the ideas we’ve addressed here certainly deserve their own time\n", + "and special treatment with a more in-depth discussion, which you can look\n", + "forward to in later deep dives. However, we hope these recommendations are\n", + "useful as you begin your journey or seek to enhance your production streaming\n", + "experience. Be sure to continue with the next post, “Streaming in Production:\n", + "Collected Best Practices, Part 2.”\n", + "\n", + "**[Review Databrick’s Structured Streaming Getting Started Guide](https://www.databricks.com/spark/getting-started-with-apache-spark/streaming)**\n", + "\n", + "\n", + "\n", + "**•** Consider your SLA. 
If you have mission critical streams, isolate them as a\n", + "best practice so lower-criticality streams do not affect them.\n", + "\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "On Databricks we typically see customers stack between 10-30 streams on a\n", + "cluster, but this varies depending on the use case. Consider the factors above so\n", + "that you can have a good experience with performance, cost and maintainability.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.5 \u0007\n", + "\n", + "**Streaming in Production: Collected Best Practices, Part 2**\n", + "\n", + "by **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "January 10, 2023\n", + "\n", + "\n", + "In our two-part blog series titled “Streaming in Production: Collected Best\n", + "Practices,” this is the second article. Here we discuss the “After Deployment”\n", + "considerations for a Structured Streaming Pipeline. The majority of the\n", + "suggestions in this post are relevant to both Structured Streaming Jobs and\n", + "Delta Live Tables (our flagship and fully managed ETL product that supports\n", + "both batch and streaming pipelines).\n", + "\n", + "**After deployment**\n", + "\n", + "After the deployment of your streaming application, there are typically three\n", + "main things you’ll want to know:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
ca43835d6b287f050d49c4ff6a5c01eeOn Databricks we typically see customers stack between 10-30 streams on a\n", + "cluster, but this varies depending on the use case. Consider the factors above so\n", + "that you can have a good experience with performance, cost and maintainability.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.5 \u0007\n", + "\n", + "**Streaming in Production: Collected Best Practices, Part 2**\n", + "\n", + "by **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "January 10, 2023\n", + "\n", + "\n", + "In our two-part blog series titled “Streaming in Production: Collected Best\n", + "Practices,” this is the second article. Here we discuss the “After Deployment”\n", + "considerations for a Structured Streaming Pipeline. The majority of the\n", + "suggestions in this post are relevant to both Structured Streaming Jobs and\n", + "Delta Live Tables (our flagship and fully managed ETL product that supports\n", + "both batch and streaming pipelines).\n", + "\n", + "**After deployment**\n", + "\n", + "After the deployment of your streaming application, there are typically three\n", + "main things you’ll want to know:\n", + "\n", + "**•** How is my application running?\n", + "\n", + "**•** Are resources being used efficiently?\n", + "\n", + "**•** How do I manage any problems that come up?\n", + "\n", + "We’ll start with an introduction to these topics, followed by a deeper dive later in\n", + "this blog series.\n", + "\n", + "\n", + "**Monitoring and instrumentation (How is my application running?)**\n", + "\n", + "Streaming workloads should be pretty much hands-off once deployed to\n", + "production. However, one thing that may sometimes come to mind is: “how is my\n", + "application running?” Monitoring applications can take on different levels and\n", + "forms depending on:\n", + "\n", + "**•** the metrics collected for your application (batch duration/latency,\n", + "throughput, …)\n", + "\n", + "**•** where you want to monitor the application from\n", + "\n", + "At the simplest level, there is a streaming dashboard ( [A Look at the New](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html)\n", + "[Structured Streaming UI](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) ) and built-in logging directly in the Spark UI that can be\n", + "used in a variety of situations.\n", + "\n", + "This is in addition to setting up failure alerts on jobs running streaming\n", + "workloads.\n", + "\n", + "If you want more fine-grained metrics or to create custom actions based on\n", + "these metrics as part of your code base, then the StreamingQueryListener is\n", + "better aligned with what you’re looking for.\n", + "\n", + "\n", + "-----\n", + "\n", + "If you want the Spark metrics to be reported (including machine level traces for\n", + "drivers or workers) you should use the platform’s [metrics sink](https://spark.apache.org/docs/latest/monitoring.html#metrics) .\n", + "\n", + "The Apache Spark Structured Streaming UI\n", + "\n", + "\n", + "Another point to consider is where you want to surface these metrics for\n", + "observability. There is a Ganglia dashboard at the cluster level, integrated partner\n", + "applications like [Datadog](https://www.datadoghq.com/blog/databricks-monitoring-datadog/) for monitoring streaming workloads, or even more open\n", + "source options you can build using tools like Prometheus and Grafana. 
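For teams that want to act on the StreamingQueryListener mentioned above, here is a minimal sketch of wiring one up in a notebook. It assumes a recent runtime (Python support for the listener arrived with Spark 3.4 and recent Databricks Runtimes), and the `ProgressLogger` name and `print`-based output are purely illustrative; in practice you would forward these values to whichever monitoring backend you use.

```python
from pyspark.sql.streaming import StreamingQueryListener


class ProgressLogger(StreamingQueryListener):
    """Reports a few per-microbatch metrics; swap print() for your metrics client."""

    def onQueryStarted(self, event):
        print(f"stream started: name={event.name} id={event.id}")

    def onQueryProgress(self, event):
        p = event.progress
        # numInputRows, batchId and durationMs are part of the standard progress payload
        print(
            f"{p.name} batch={p.batchId} rows={p.numInputRows} "
            f"triggerMs={p.durationMs.get('triggerExecution')}"
        )

    def onQueryTerminated(self, event):
        print(f"stream terminated: id={event.id} exception={event.exception}")


# Register once per SparkSession; every stream started from this session reports here.
spark.streams.addListener(ProgressLogger())
```

The dashboards mentioned above (the Spark UI, Ganglia, Datadog, or a Prometheus/Grafana stack) remain the place to visualize whatever such a listener emits.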
Each\n", + "has advantages and disadvantages to consider around cost, performance, and\n", + "maintenance requirements.\n", + "\n", + "Whether you have low volumes of streaming workloads where interactions in the\n", + "UI are sufficient or have decided to invest in a more robust monitoring platform,\n", + "you should know how to observe your production streaming workloads. Further\n", + "“Monitoring and Alerting” posts later in this series will contain a more thorough\n", + "discussion. In particular, we’ll see different measures on which to monitor\n", + "streaming applications and then later take a deeper look at some of the tools\n", + "you can leverage for observability.\n", + "\n", + "**Application optimization (Are resources being used effectively?**\n", + "\n", + "**Think “cost”)**\n", + "\n", + "The next concern we have after deploying to production is “is my applicationSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
1d13998a4c717e8522f9b108d682adc2Whether you have low volumes of streaming workloads where interactions in the\n", + "UI are sufficient or have decided to invest in a more robust monitoring platform,\n", + "you should know how to observe your production streaming workloads. Further\n", + "“Monitoring and Alerting” posts later in this series will contain a more thorough\n", + "discussion. In particular, we’ll see different measures on which to monitor\n", + "streaming applications and then later take a deeper look at some of the tools\n", + "you can leverage for observability.\n", + "\n", + "**Application optimization (Are resources being used effectively?**\n", + "\n", + "**Think “cost”)**\n", + "\n", + "The next concern we have after deploying to production is “is my application\n", + "\n", + "using resources effectively?” As developers, we understand (or quickly learn) the\n", + "distinction between working code and well-written code. Improving the way your\n", + "code runs is usually very satisfying, but what ultimately matters is the overall\n", + "cost of running it. Cost considerations for Structured Streaming applications will\n", + "be largely similar to those for other Spark applications. One notable difference\n", + "is that failing to optimize for production workloads can be extremely costly,\n", + "as these workloads are frequently “always-on” applications, and thus wasted\n", + "expenditure can quickly compound. Because assistance with cost optimization is\n", + "\n", + "\n", + "-----\n", + "\n", + "frequently requested, a separate post in this series will address it. The key points\n", + "that we’ll focus on will be efficiency of usage and sizing.\n", + "\n", + "Getting the cluster sizing right is one of the most significant differences between\n", + "efficiency and wastefulness in streaming applications. This can be particularly\n", + "tricky because in some cases it’s difficult to estimate the full load conditions of\n", + "the application in production before it’s actually there. In other cases, it may be\n", + "difficult due to natural variations in volume handled throughout the day, week, or\n", + "year. When first deploying, it can be beneficial to oversize slightly, incurring the\n", + "extra expense to avoid inducing performance bottlenecks. Utilize the monitoring\n", + "tools you chose to employ after the cluster has been running for a few weeks\n", + "to ensure proper cluster utilization. For example, are CPU and memory levels\n", + "being used at a high level during peak load or is the load generally small and the\n", + "cluster may be downsized? Maintain regular monitoring of this and keep an eye\n", + "out for changes in data volume over time; if either occurs, a cluster resize may be\n", + "required to maintain cost-effective operation.\n", + "\n", + "As a general guideline, you should avoid excessive shuffle operations, joins, or an\n", + "excessive or extreme watermark threshold (don’t exceed your needs), as each\n", + "can increase the number of resources you need to run your application. A large\n", + "watermark threshold will cause Structured Streaming to keep more data in the\n", + "state store between batches, leading to an increase in memory requirements\n", + "across the cluster. Also, pay attention to the type of VM configured — are you\n", + "using memory-optimized for your memory-intense stream? Compute-optimized\n", + "for your computationally-intensive stream? 
If not, look at the utilization levels for each and consider trying a machine type that could be a better fit. Newer families of servers from cloud providers with more optimal CPUs often lead to faster execution, meaning you might need fewer of them to meet your SLA.

**Troubleshooting (How do I manage any problems that come up?)**

The last question we ask ourselves after deployment is “how do I manage any problems that come up?” As with cost optimization, troubleshooting streaming applications in Spark often looks the same as for other applications, since most of the mechanics remain the same under the hood. For streaming applications, issues usually fall into two categories — failure scenarios and latency scenarios.

**Failure scenarios**

Failure scenarios typically manifest with the stream stopping with an error, executors failing, or a driver failure causing the whole cluster to fail. Common causes for this are:

**•** Too many streams running on the same cluster, causing the driver to be overwhelmed. On Databricks, this can be seen in Ganglia, where the driver node will show up as overloaded before the cluster fails.

**•** Too few workers in a cluster, or a worker size with too small of a core-to-memory ratio, causing executors to fail with an Out Of Memory error. This can also be seen on Databricks in Ganglia before an executor fails, or in the Spark UI under the executors tab.

**•** Using a collect to send too much data to the driver, causing it to fail with an Out Of Memory error.

-----

**Latency scenarios**

For latency scenarios, your stream will not execute as fast as you want or expect. A latency issue can be intermittent or constant. Too many streams or too small of a cluster can be the cause of this as well. Some other common causes are:

**•** Data skew — when a few tasks end up with much more data than the rest of the tasks. With skewed data, these tasks take longer to execute than the others, often spilling to disk. Your stream can only run as fast as its slowest task.

**•** Executing a stateful query without defining a watermark, or defining a very long one, will cause your state to grow very large, slowing down your stream over time and potentially leading to failure (a short code sketch follows below).

**•** A poorly optimized sink. For example, performing a merge into an over-partitioned Delta table as part of your stream.

**•** Stable but high latency (batch execution time). Depending on the cause, adding more workers to increase the number of cores concurrently available for Spark tasks can help. Increasing the number of input partitions and/or decreasing the load per core through batch size settings can also reduce the latency.

Just like troubleshooting a batch job, you’ll use Ganglia to check cluster utilization and the Spark UI to find performance bottlenecks. There is a specific [Structured Streaming tab](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) in the Spark UI created to help monitor and troubleshoot streaming applications. On that tab each stream that is running will be listed, and you’ll see your stream name if you named your stream (streams you didn’t name are listed as well). You’ll also see a stream ID that will be visible on the Jobs tab of the Spark UI, so that you can tell which jobs are for a given stream.

You’ll notice above we said which jobs are for a given stream. It’s a common misconception that if you were to look at a streaming application in the Spark UI you would just see one job in the Jobs tab running continuously. Instead, depending on your code, you will see one or more jobs that start and complete for each microbatch. Each job will have the stream ID from the Structured Streaming tab and a microbatch number in the description, so you’ll be able to tell which jobs go with which stream.
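To make the watermark and stream-naming points above concrete, here is a minimal sketch of a stateful aggregation whose state is bounded by `withWatermark` and whose query is named so it shows up clearly in the Structured Streaming tab. The table names, checkpoint path and the 30-minute threshold are illustrative, not taken from the article.

```python
from pyspark.sql import functions as F

events = spark.readStream.table("iot.raw_events")  # hypothetical streaming source table

counts = (
    events
    # Bound state growth: events arriving more than 30 minutes late are dropped
    # instead of being kept in the state store indefinitely.
    .withWatermark("event_time", "30 minutes")
    .groupBy(F.window("event_time", "10 minutes"), "device_id")
    .count()
)

query = (
    counts.writeStream
    .queryName("device_counts_10m")  # appears by name in the Structured Streaming tab
    .outputMode("append")
    .option("checkpointLocation", "/Volumes/iot/checkpoints/device_counts_10m")  # hypothetical path
    .toTable("iot.device_counts_10m")
)
```

A tighter watermark means less state held between microbatches, at the cost of dropping later-arriving data; with the query named as above, its jobs are also easy to pick out in the Jobs tab.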
You can click into those jobs to find the\n", + "longest running stages and tasks, check for disk spills, and search by Job ID in\n", + "the SQL tab to find the slowest queries and check their explain plans.\n", + "\n", + "The Jobs tab in the Apache Spark UI\n", + "\n", + "\n", + "-----\n", + "\n", + "If you click on your stream in the Structured Streaming tab you’ll see how much\n", + "time the different streaming operations are taking for each microbatch, such as\n", + "adding a batch, query planning and committing (see earlier screenshot of the\n", + "Apache Spark Structured Streaming UI). You can also see how many rows are\n", + "being processed as well as the size of your state store for a stateful stream.\n", + "This can give insights into where potential latency issues are.\n", + "\n", + "We will go more in-depth with troubleshooting later in this blog series, where\n", + "we’ll look at some of the causes and remedies for both failure scenarios and\n", + "latency scenarios as we outlined above.\n", + "\n", + "**Conclusion**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
You may have noticed that many of the topics covered here are very similar to how other production Spark applications should be deployed. Whether your workloads are primarily streaming applications or batch processes, the majority of the same principles will apply. We focused more on things that become especially important when building out streaming applications, but as we’re sure you’ve noticed by now, the topics we discussed should be included in most production deployments.

Across the majority of industries in the world today, information is needed faster than ever, but that won’t be a problem for you. With Spark Structured Streaming you’re set to make it happen at scale in production. Be on the lookout for more in-depth discussions on some of the topics we’ve covered in this blog, and in the meantime keep streaming!

**[Review Databricks Structured Streaming in Production Documentation](https://docs.databricks.com/structured-streaming/production.html)**

**Start experimenting with these free Databricks notebooks.**

-----

SECTION 2.6

**Building Geospatial Data Products**

by **MILOS COLIC**

January 6, 2023

Geospatial data has been driving innovation for centuries, through the use of maps, cartography and, more recently, digital content. For example, the oldest known map was found etched in a piece of mammoth tusk and dates to [approximately 25,000 BC](https://en.wikipedia.org/wiki/History_of_cartography). This makes geospatial data one of the oldest data sources used by society to make decisions. A more recent example, labeled as the birth of spatial analysis, is that of Charles Picquet in 1832, who used geospatial data to analyze [Cholera outbreaks in Paris](https://gallica.bnf.fr/ark:/12148/bpt6k842918.image); a couple of decades later John Snow in 1854 followed the same approach for [Cholera outbreaks in London](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak). These two individuals used geospatial data to solve one of the toughest problems of their times and in effect save countless lives.
Fast-forwarding to the\n", + "20th century, the concept of [Geographic Information Systems (GIS)](https://education.nationalgeographic.org/resource/geographic-information-system-gis) was [first](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf)\n", + "[introduced](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf) in 1967 in Ottawa, Canada, by the Department of Forestry and\n", + "Rural Development.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
e010236328d56ff6a8034a16f0a7902aToday we are in the midst of the cloud computing industry revolution —\n", + "supercomputing scale available to any organization, virtually infinitely scalable\n", + "for both storage and compute. Concepts like [data mesh](https://www.databricks.com/blog/2022/10/19/building-data-mesh-based-databricks-lakehouse-part-2.html) and [data marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html)\n", + "are emerging within the data community to address questions like platform\n", + "federation and interoperability. How can we adopt these concepts to geospatial\n", + "data, spatial analysis and GIS systems? By adopting the concept of data\n", + "products and approaching the design of geospatial data as a product.\n", + "\n", + "\n", + "In this blog we will provide a point of view on how to design scalable geospatial\n", + "data products that are modern and robust. We will discuss how Databricks\n", + "Lakehouse Platform can be used to unlock the full potential of geospatial\n", + "products that are one of the most valuable assets in solving the toughest\n", + "problems of today and the future.\n", + "\n", + "**What is a data product? And how to design one?**\n", + "\n", + "The most broad and the most concise definition of a “data product” was coined\n", + "by DJ Patil (the first U.S. Chief Data Scientist) in _Data Jujitsu: The Art of Turning_\n", + "_Data into Product:_ “a product that facilitates an end goal through the use of\n", + "data.” The complexity of this definition (as admitted by Patil himself) is needed to\n", + "encapsulate the breadth of possible products, to include dashboards, reports, Excel\n", + "\n", + "spreadsheets, and even CSV extracts shared via emails. You might notice that the\n", + "examples provided deteriorate rapidly in quality, robustness and governance.\n", + "\n", + "What are the concepts that differentiate a successful product versus an\n", + "unsuccessful one? Is it the packaging? Is it the content? Is it the quality of the\n", + "content? Or is it only the product adoption in the market? Forbes defines the\n", + "10 must-haves of a successful product. A good framework to summarize this is\n", + "through the value pyramid.\n", + "\n", + "\n", + "-----\n", + "\n", + "Figure 1: Product value pyramid (source)\n", + "\n", + "The value pyramid provides a priority on each aspect of the product. Not every\n", + "value question we ask about the product carries the same amount of weight. If\n", + "the output is not useful none of the other aspects matter — the output isn’t really\n", + "a product but becomes more of a data pollutant to the pool of useful results.\n", + "Likewise, scalability only matters after simplicity and explainability are addressed.\n", + "\n", + "How does the value pyramid relate to the data products? Each data output, in\n", + "order to be a data product:\n", + "\n", + "**•** **Should have clear usefulness.** The amount of the data society is\n", + "generating is rivaled only by the amount of data pollutants we are\n", + "generating. These are outputs lacking clear value and use, much less a\n", + "strategy for what to do with them.\n", + "\n", + "\n", + "\n", + "**•** **Should be explainable.** With the emergence of AI/ML, explainability has\n", + "become even more important for data driven decision-making. Data\n", + "is as good as the metadata describing it. 
Think of it in terms of food —\n", + "taste does matter, but a more important factor is the nutritional value\n", + "of ingredients.\n", + "\n", + "**•** **Should be simple.** An example of product misuse is using a fork to eat\n", + "cereal instead of using a spoon. Furthermore, simplicity is essential but\n", + "not sufficient — beyond simplicity the products should be intuitive.\n", + "Whenever possible both intended and unintended uses of the data\n", + "should be obvious.\n", + "\n", + "**•** **Should be scalable.** Data is one of the few resources that grows with\n", + "use. The more data you process the more data you have. If both inputs\n", + "and outputs of the system are unbounded and ever-growing, then the\n", + "system has to be scalable in compute power, storage capacity and\n", + "compute expressive power. Cloud data platforms like Databricks are in\n", + "a unique position to answer for all of the three aspects.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
cade53da742fa49ccab64c5a0370b569**•** **Should be explainable.** With the emergence of AI/ML, explainability has\n", + "become even more important for data driven decision-making. Data\n", + "is as good as the metadata describing it. Think of it in terms of food —\n", + "taste does matter, but a more important factor is the nutritional value\n", + "of ingredients.\n", + "\n", + "**•** **Should be simple.** An example of product misuse is using a fork to eat\n", + "cereal instead of using a spoon. Furthermore, simplicity is essential but\n", + "not sufficient — beyond simplicity the products should be intuitive.\n", + "Whenever possible both intended and unintended uses of the data\n", + "should be obvious.\n", + "\n", + "**•** **Should be scalable.** Data is one of the few resources that grows with\n", + "use. The more data you process the more data you have. If both inputs\n", + "and outputs of the system are unbounded and ever-growing, then the\n", + "system has to be scalable in compute power, storage capacity and\n", + "compute expressive power. Cloud data platforms like Databricks are in\n", + "a unique position to answer for all of the three aspects.\n", + "\n", + "**•** **Should generate habits.** In the data domain we are not concerned\n", + "with customer retention as is the case for the retail products. However,\n", + "the value of habit generation is obvious if applied to best practices.\n", + "The systems and data outputs should exhibit the best practices and\n", + "promote them — it should be easier to use the data and the system in\n", + "the intended way than the opposite.\n", + "\n", + "The geospatial data should adhere to all the aforementioned aspects — any data\n", + "products should. On top of this tall order, geospatial data has some specific needs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Geospatial data standards**\n", + "\n", + "\n", + "\n", + "**•** **“Advocate the understanding and use of geospatial data standards**\n", + "**within other sectors of government.”** — Value pyramid applies to\n", + "the standards as well — concepts like ease of adherence (usefulness/\n", + "simplicity), purpose of the standard (explainability/usefulness), adoption\n", + "(habit generation) are critical for the value generation of a standard.\n", + "\n", + "A critical tool for achieving the data standards mission is the [FAIR](https://www.go-fair.org/fair-principles/) data\n", + "principles:\n", + "\n", + "**•** **Findable** — The first step in (re)using data is to find them. Metadata\n", + "and data should be easy to find for both humans and computers.\n", + "Machine-readable metadata are essential for automatic discovery of\n", + "data sets and services.\n", + "\n", + "**•** **Accessible** — Once the user finds the required data, she/he/they\n", + "need to know how they can be accessed, possibly including\n", + "authentication and authorization.\n", + "\n", + "**•** **Interoperable** — The data usually needs to be integrated with\n", + "other data. In addition, the data needs to interoperate with\n", + "applications or workflows for analysis, storage, and processing.\n", + "\n", + "**•** **Reusable** — The ultimate goal of FAIR is to optimize the reuse of data.\n", + "To achieve this, metadata and data should be well-described so that\n", + "they can be replicated and/or combined in different settings.\n", + "\n", + "\n", + "Geospatial data standards are used to ensure that geographic data is collected,\n", + "organized, and shared in a consistent and reliable way. 
These standards can include guidelines for things like data formatting, coordinate systems, map projections, and metadata. Adhering to standards makes it easier to share data between different organizations, allowing for greater collaboration and broader access to geographic information.

The Geospatial Commission (UK government) has defined the UK Geospatial Data Standards Register as a central repository for data standards to be applied in the case of geospatial data. Furthermore, the mission of this registry is to:

**•** **“Ensure UK geospatial data is more consistent and coherent and usable across a wider range of systems.”** — These concepts are a callout for the importance of explainability, usefulness and habit generation (possibly other aspects of the value pyramid).

**•** **“Empower the UK geospatial community to become more engaged with the relevant standards and standards bodies.”** — Habit generation within the community is as important as the robust and critical design of the standard. If not adopted, standards are useless.

-----

We share the belief that the FAIR principles are crucial for the design of scalable data products we can trust. To be fair, FAIR is based on common sense, so why is it key to our considerations? _“What I see in FAIR is not new in itself, but what it does well is to articulate, in an accessible way, the need for a holistic approach to data improvement. This ease in communication is why FAIR is being used increasingly widely as an umbrella for data improvement — and not just in the geospatial community.”_ — [A FAIR wind sets our course for data improvement](https://geospatialcommission.blog.gov.uk/2022/03/02/a-fair-wind-sets-our-course-for-data-improvement/).

To further support this approach, the [Federal Geographic Data Committee](https://www.fgdc.gov/standards) has developed the [National Spatial Data Infrastructure (NSDI) Strategic Plan](https://www.fgdc.gov/nsdi-plan/nsdi-strategic-plan-2021-2024.pdf), which covers the years 2021-2024 and was approved in November 2020. The goals of the NSDI are in essence the FAIR principles and convey the same message of designing systems that promote the circular economy of data — data products that flow between organizations following common standards and in each step through the data supply chain unlock new value and new opportunities. The fact that these principles are permeating different jurisdictions and are adopted across different regulators is a testament to the robustness and soundness of the approach.

The FAIR concepts weave really well together with data product design. In fact, FAIR traverses the whole product value pyramid and forms a value cycle. By adopting both the value pyramid and the FAIR principles we design data products with both an internal and an external outlook. This promotes data reuse as opposed to data accumulation.

Why do FAIR principles matter for geospatial data and geospatial data products?
FAIR is transcendent to geospatial data; in fact, it is transcendent to data itself. It is a simple yet coherent system of guiding principles for good design, and that good design can be applied to anything, including geospatial data and geospatial systems.

Figure 2: NSDI Strategic Goals

-----

**Grid index systems**

In traditional GIS solutions, the performance of spatial operations is usually achieved by building tree structures ([KD trees](https://en.wikipedia.org/wiki/K-d_tree), [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces), [Quad trees](https://en.wikipedia.org/wiki/Quadtree), etc.). The issue with tree approaches is that they eventually break the scalability principle — when the data is too big, the computation required to build the tree becomes too long and defeats the purpose. This also negatively affects the accessibility of data; if we cannot construct the tree we cannot access the complete data, and in effect we cannot reproduce the results. In this case, grid index systems provide a solution.
999a832f6cc41314899f6ef6c3cb3f06Figure 2:\n", + "NDSI Strategic Goals\n", + "\n", + "\n", + "-----\n", + "\n", + "**Grid index systems**\n", + "\n", + "In traditional GIS solutions’ performance of spatial operations are usually\n", + "achieved by building tree structures ( [KD trees](https://en.wikipedia.org/wiki/K-d_tree) , [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces) , [Quad trees](https://en.wikipedia.org/wiki/Quadtree) , etc).\n", + "The issue with tree approaches is that they eventually break the scalability\n", + "principle — when the data is too big to be processed in order to build the tree\n", + "and the computation required to build the tree is too long and defeats the\n", + "purpose. This also negatively affects the accessibility of data; if we cannot\n", + "construct the tree we cannot access the complete data and in effect we cannot\n", + "reproduce the results. In this case, grid index systems provide a solution.\n", + "\n", + "\n", + "Grid index systems are built from the start with the scalability aspects of the\n", + "geospatial data in mind. Rather than building the trees, they define a series of\n", + "grids that cover the area of interest. In the case of [H3](https://h3geo.org/) (pioneered by Uber),\n", + "the grid covers the area of the Earth; in the case of local grid index systems\n", + "(e.g., [British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) ) they may only cover the specific area of interest.\n", + "These grids are composed of cells that have unique identifiers. There is a\n", + "mathematical relationship between location and the cell in the grid. This makes\n", + "the grid index systems very scalable and parallel in nature.\n", + "\n", + "\n", + "Figure 4: Grid Index Systems (H3, British National Grid)\n", + "\n", + "\n", + "-----\n", + "\n", + "Another important aspect of grid index systems is that they are open source,\n", + "allowing index values to be universally leveraged by data producers and\n", + "consumers alike. Data can be enriched with the grid index information at any\n", + "step of its journey through the data supply chain. This makes the grid index\n", + "systems an example of community driven data standards. Community driven\n", + "data standards by nature do not require enforcement, which fully adheres\n", + "to the habit generation aspect of value pyramid and meaningfully addresses\n", + "interoperability and accessibility principles of FAIR.\n", + "\n", + "\n", + "Databricks has recently announced [native support for the H3 grid index system](https://www.databricks.com/blog/2022/09/14/announcing-built-h3-expressions-geospatial-processing-and-analytics.html)\n", + "following the same value proposition. Adopting common industry standards\n", + "driven by the community is the only way to properly drive habit generation and\n", + "interoperability. To strengthen this statement, organizations like [CARTO](https://carto.com/blog/hexagons-for-location-intelligence/) , [ESRI](https://www.esri.com/arcgis-blog/products/bus-analyst/analytics/using-uber-h3-hexagons-arcgis-business-analyst-pro/)\n", + "and [Google](https://opensource.googleblog.com/2017/12/announcing-s2-library-geometry-on-sphere.html) have been promoting the usage of grid index systems for scalable\n", + "GIS system design. 
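As a rough sketch of what indexing a dataset with H3 can look like on Databricks (assuming a runtime where the built-in `h3_longlatash3` expression is available, roughly DBR 11.3+, and an illustrative table with `lon`/`lat` columns):

```python
from pyspark.sql import functions as F

points = spark.table("geo.pickup_points")  # hypothetical table with lon/lat columns

indexed = points.withColumn(
    "h3_cell",
    # h3_longlatash3(longitude, latitude, resolution); resolution 9 cells are
    # a few hundred meters across; pick the resolution that fits your use case.
    F.expr("h3_longlatash3(lon, lat, 9)"),
)

# Once every row carries a cell id, many expensive geometric operations become
# plain equality joins and group-bys on the index column.
cell_counts = indexed.groupBy("h3_cell").count()
```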
In addition, Databricks Labs project [Mosaic](https://databrickslabs.github.io/mosaic/) supports the\n", + "[British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) as the standard grid index system that is widely used in\n", + "the UK government. Grid index systems are key for the scalability of geospatial\n", + "data processing and for properly designing solutions for complex problems\n", + "(e.g., figure 5 — flight holding patterns using H3).\n", + "\n", + "**Geospatial data diversity**\n", + "\n", + "Geospatial data standards spend a solid amount of effort regarding data\n", + "format standardization, and format for that matter is one of the most\n", + "important considerations when it comes to interoperability and reproducibility.\n", + "Furthermore, if the reading of your data is complex — how can we talk about\n", + "simplicity? Unfortunately geospatial data formats are typically complex, as\n", + "data can be produced in a number of formats including both open sourceSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
62d0abb59a0dcbc090a459f7bcd48faf**Geospatial data diversity**\n", + "\n", + "Geospatial data standards spend a solid amount of effort regarding data\n", + "format standardization, and format for that matter is one of the most\n", + "important considerations when it comes to interoperability and reproducibility.\n", + "Furthermore, if the reading of your data is complex — how can we talk about\n", + "simplicity? Unfortunately geospatial data formats are typically complex, as\n", + "data can be produced in a number of formats including both open source\n", + "\n", + "and vendor-specific formats. Considering only vector data, we can expect\n", + "data to arrive in WKT, WKB, GeoJSON, web CSV, CSV, Shape File, GeoPackage,\n", + "and many others. On the other hand, if we are considering raster data we can\n", + "expect data to arrive in any number of formats such as GeoTiff, netCDF, GRIB, or\n", + "GeoDatabase; for a comprehensive list of formats please consult this [blog](https://gisgeography.com/gis-formats/) .\n", + "\n", + "\n", + "Figure 5: Example of using H3 to express flight holding patterns\n", + "\n", + "\n", + "-----\n", + "\n", + "Geospatial data domain is so diverse and has organically grown over the years\n", + "around the use cases it was addressing. Unification of such a diverse ecosystem\n", + "is a massive challenge. A recent effort by the Open Geospatial Consortium\n", + "(OGC) to standardize to [Apache Parquet](https://parquet.apache.org/) and its geospatial schema specification\n", + "[GeoParquet](https://geoparquet.org/) is a step in the right direction. Simplicity is one of the key aspects\n", + "of designing a good scalable and robust product — unification leads to simplicity\n", + "and addresses one of the main sources of friction in the ecosystem — the data\n", + "ingestion. Standardizing to GeoParquet brings a lot of value that addresses all of\n", + "the aspects of FAIR data and value pyramid.\n", + "\n", + "Figure 6: Geoparquet as a geospatial standard data format\n", + "\n", + "\n", + "Why introduce another format into an already complex ecosystem? GeoParquet\n", + "isn’t a new format — it is a schema specification for Apache Parquet format that\n", + "is already widely adopted and used by the industry and the community. Parquet\n", + "as the base format supports binary columns and allows for storage of arbitrary\n", + "data payload. At the same time the format supports structured data columns\n", + "that can store metadata together with the data payload. This makes it a choice\n", + "that promotes interoperability and reproducibility. Finally, [Delta Lake](https://delta.io/) format\n", + "has been built on top of parquet and brings [ACID](https://en.wikipedia.org/wiki/ACID) properties to the table. ACID\n", + "properties of a format are crucial for reproducibility and for trusted outputs. In\n", + "addition, Delta is the format used by scalable data sharing solution [Delta Sharing](https://www.databricks.com/product/delta-sharing) .\n", + "\n", + "Delta Sharing enables enterprise scale data sharing between any public cloud\n", + "using Databricks (DIY options for private cloud are available using open source\n", + "building blocks). Delta Sharing completely abstracts the need for custom built\n", + "Rest APIs for exposing data to other third parties. Any data asset stored in Delta\n", + "(using GeoParquet schema) automatically becomes a data product that can be\n", + "exposed to external parties in a controlled and governed manner. 
Delta Sharing\n", + "has been built from the ground up with [security best practices in mind](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html?utm_source=bambu&utm_medium=social&utm_campaign=advocacy&blaid=3352307) .\n", + "\n", + "\n", + "-----\n", + "\n", + "Figure 7: Delta Sharing simplifying data access in the ecosystem\n", + "\n", + "**Circular data economy**\n", + "\n", + "\n", + "Borrowing the concepts from the sustainability domain, we can define a circular\n", + "data economy as a system in which data is collected, shared, and used in a way\n", + "that maximizes its value while minimizing waste and negative impacts, such as\n", + "unnecessary compute time, untrustworthy insights, or biased actions based\n", + "data pollutants. Reusability is the key concept in this consideration — how can\n", + "we minimize the \"reinvention of the wheel.\" There are countless data assets out\n", + "in the wild that represent the same area, same concepts with just ever slight\n", + "alterations to better match a specific use case. Is this due to the actualSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
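To illustrate the consumer side of that flow, the open source `delta-sharing` client reads a shared table directly from the profile file a provider distributes. The profile path and share/schema/table names below are hypothetical.

```python
import delta_sharing

# Profile file distributed by the data provider (hypothetical location)
profile = "/dbfs/FileStore/shares/geo_provider.share"

# URL format: <profile-path>#<share>.<schema>.<table>; names are illustrative
table_url = f"{profile}#geo_share.reference.admin_boundaries"

# Small tables can be pulled into pandas; load_as_spark keeps large tables distributed
# (the latter needs the Delta Sharing Spark connector on the cluster).
boundaries_pdf = delta_sharing.load_as_pandas(table_url)
boundaries_sdf = delta_sharing.load_as_spark(table_url)
```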
d142161c1b5bc29be7c1b51d12681385-----\n", + "\n", + "Figure 7: Delta Sharing simplifying data access in the ecosystem\n", + "\n", + "**Circular data economy**\n", + "\n", + "\n", + "Borrowing the concepts from the sustainability domain, we can define a circular\n", + "data economy as a system in which data is collected, shared, and used in a way\n", + "that maximizes its value while minimizing waste and negative impacts, such as\n", + "unnecessary compute time, untrustworthy insights, or biased actions based\n", + "data pollutants. Reusability is the key concept in this consideration — how can\n", + "we minimize the \"reinvention of the wheel.\" There are countless data assets out\n", + "in the wild that represent the same area, same concepts with just ever slight\n", + "alterations to better match a specific use case. Is this due to the actual\n", + "\n", + "\n", + "optimizations or due to the fact it was easier to create a new copy of the assets\n", + "than to reuse the existing ones? Or was it too hard to find the existing data\n", + "assets, or maybe it was too complex to define data access patterns.\n", + "\n", + "Data asset duplication has many negative aspects in both FAIR considerations\n", + "and data value pyramid considerations — having many disparate similar (but\n", + "different) data assets that represent the same area and same concepts can\n", + "deteriorate simplicity considerations of the data domain — it becomes hard\n", + "to identify the data asset we actually can trust. It can also have very negative\n", + "\n", + "\n", + "-----\n", + "\n", + "implications toward habit generation. Many niche communities will emerge\n", + "that will standardize to themselves ignoring the best practices of the wider\n", + "ecosystem, or worse yet they will not standardize at all.\n", + "\n", + "In a circular data economy, data is treated as a valuable resource that can be\n", + "used to create new products and services, as well as improving existing ones.\n", + "This approach encourages the reuse and recycling of data, rather than treating it\n", + "as a disposable commodity. Once again, we are using the sustainability analogy\n", + "in a literal sense — we argue that this is the correct way of approaching the\n", + "problem. Data pollutants are a real challenge for organizations both internally and\n", + "externally. An article by The Guardian states that less than 1% of collected data is\n", + "actually analyzed. There is too much data duplication, the majority of data is hard\n", + "to access and deriving actual value is too cumbersome. Circular data economy\n", + "promotes best practices and reusability of existing data assets allowing for a more\n", + "consistent interpretation and insights across the wider data ecosystem.\n", + "\n", + "\n", + "Figure 8: Databricks Marketplace\n", + "\n", + "\n", + "-----\n", + "\n", + "Interoperability is a key component of FAIR data principles, and from\n", + "interoperability a question of circularity comes to mind. How can we design an\n", + "ecosystem that maximizes data utilization and data reuse? Once again, FAIR\n", + "together with the value pyramid holds answers. Findability of the data is key to\n", + "the data reuse and to solving for data pollution. With data assets that can be\n", + "discovered easily we can avoid the recreation of same data assets in multiple\n", + "places with just slight alteration. Instead we gain a coherent data ecosystem\n", + "with data that can be easily combined and reused. 
Databricks has recently announced the [Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html). The idea behind the marketplace is in line with the original definition of a data product by DJ Patil. The marketplace will support sharing of data sets, notebooks, dashboards, and machine learning models. The critical building block for such a marketplace is the concept of Delta Sharing — the scalable, flexible and robust channel for sharing any data — geospatial data included.

Designing scalable data products that will live in the marketplace is crucial. In order to maximize the value add of each data product, one should strongly consider the FAIR principles and the product value pyramid. Without these guiding principles we will only increase the issues that are already present in the current systems. Each data product should solve a unique problem and should solve it in a simple, reproducible and robust way.

**You can read more on how the Databricks Lakehouse Platform can help you accelerate time to value from your data products in the eBook: [A New Approach to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing).**
7947b116f21540425c77af4e75583f06Designing scalable data products that will live in the marketplace is crucial.\n", + "In order to maximize the value add of each data product one should strongly\n", + "consider FAIR principles and the product value pyramid. Without these guiding\n", + "principles we will only increase the issues that are already present in the\n", + "current systems. Each data product should solve a unique problem and should\n", + "solve it in a simple, reproducible and robust way.\n", + "\n", + "**You can read more on how Databricks Lakehouse**\n", + "**Platform can help you accelerate time to value from**\n", + "**your data products in the eBook:** **[A New Approach](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)**\n", + "**[to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)** **.**\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.7 \u0007\n", + "\n", + "**Data Lineage With Unity Catalog**\n", + "\n", + "by **P A U L R O O M E , TA O F E N G A N D S A C H I N T H A K U R**\n", + "\n", + "June 8, 2022\n", + "\n", + "\n", + "This blog will discuss the importance of data lineage, some of the common\n", + "use cases, our vision for better data transparency and data understanding with\n", + "data lineage.\n", + "\n", + "**What is data lineage and why is it important?**\n", + "\n", + "Data lineage describes the transformations and refinements of data from source\n", + "to insight. Lineage includes capturing all the relevant metadata and events\n", + "associated with the data in its lifecycle, including the source of the data set,\n", + "what other data sets were used to create it, who created it and when, what\n", + "transformations were performed, what other data sets leverage it, and many other\n", + "events and attributes. With a data lineage solution, data teams get an end-to-end\n", + "view of how data is transformed and how it flows across their data estate.\n", + "\n", + "As more and more organizations embrace a data-driven culture and set up\n", + "processes and tools to democratize and scale data and AI, data lineage is\n", + "becoming an essential pillar of a pragmatic data management and governance\n", + "strategy.\n", + "\n", + "To understand the importance of data lineage, we have highlighted some of the\n", + "common use cases we have heard from our customers below.\n", + "\n", + "\n", + "**Impact analysis**\n", + "Data goes through multiple updates or revisions over its lifecycle, and\n", + "understanding the potential impact of any data changes on downstream\n", + "consumers becomes important from a risk management standpoint. With data\n", + "lineage, data teams can see all the downstream consumers — applications,\n", + "dashboards, machine learning models or data sets, etc. — impacted by data\n", + "changes, understand the severity of the impact, and notify the relevant\n", + "stakeholders. Lineage also helps IT teams proactively communicate data\n", + "migrations to the appropriate teams, ensuring business continuity.\n", + "\n", + "**Data understanding and transparency**\n", + "Organizations deal with an influx of data from multiple sources, and building\n", + "a better understanding of the context around data is paramount to ensure\n", + "the trustworthiness of the data. 
Data lineage is a powerful tool that enables\n", + "data leaders to drive better transparency and understanding of data in their\n", + "organizations. Data lineage also empowers data consumers such as data scientists,\n", + "data engineers and data analysts to be context-aware as they perform analyses,\n", + "resulting in better quality outcomes. Finally, data stewards can see which data sets\n", + "are no longer accessed or have become obsolete to retire unnecessary data and\n", + "ensure data quality for end business users .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Debugging and diagnostics**\n", + "You can have all the checks and balances in place, but something will eventually\n", + "break. Data lineage helps data teams perform a root cause analysis of any errors\n", + "in their data pipelines, applications, dashboards, machine learning models, etc.,\n", + "by tracing the error to its source. This significantly reduces the debugging time,\n", + "saving days, or in many cases, months of manual effort.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
**Compliance and audit readiness**
Many compliance regulations, such as the General Data Protection Regulation (GDPR), California Consumer Privacy Act (CCPA), Health Insurance Portability and Accountability Act (HIPAA), Basel Committee on Banking Supervision (BCBS) 239, and Sarbanes-Oxley Act (SOX), require organizations to have a clear understanding and visibility of data flow. As a result, data traceability becomes a key requirement in order for their data architecture to meet legal regulations. Data lineage helps organizations be compliant and audit-ready, thereby alleviating the operational overhead of manually creating the trails of data flows for audit reporting purposes.

**Effortless transparency and proactive control with data lineage**

The [lakehouse](https://www.databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) provides a pragmatic data management architecture that substantially simplifies enterprise data infrastructure and accelerates innovation by unifying your data warehousing and AI use cases on a single platform. We believe data lineage is a key enabler of better data transparency and data understanding in your lakehouse, surfacing the relationships between data, jobs, and consumers, and helping organizations move toward proactive data management practices.
For example:\n", + "\n", + "**•** As the owner of a dashboard, do you want to be notified next time that a\n", + "table your dashboard depends upon wasn’t loaded correctly?\n", + "\n", + "**•** As a machine learning practitioner developing a model, do you want to be\n", + "alerted that a critical feature in your model will be deprecated soon?\n", + "\n", + "**•** As a governance admin, do you want to automatically control access to\n", + "data based on its provenance?\n", + "\n", + "All of these capabilities rely upon the automatic collection of data lineage across\n", + "all use cases and personas — which is why the lakehouse and data lineage are a\n", + "powerful combination.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data lineage for tables\n", + "\n", + "Data lineage for table columns\n", + "\n", + "\n", + "Data Lineage for notebooks, workflows, dashboards\n", + "\n", + "**Built-in security:** Lineage graphs in Unity Catalog are privilege-aware and share\n", + "the same permission model as Unity Catalog. If users do not have access to\n", + "a table, they will not be able to explore the lineage associated with the table,\n", + "adding an additional layer of security for privacy considerations.\n", + "\n", + "**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\n", + "in near real-time, and retrieved via REST API to support integrations with our\n", + "catalog partners.\n", + "\n", + "**Getting started with data lineage in Unity Catalog**\n", + "\n", + "Data lineage is available with Databricks Premium and Enterprise tiers for\n", + "no additional cost. If you already are a Databricks customer, follow the data\n", + "lineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. If you are not an existing Databricks\n", + "customer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.8\n", + "\n", + "**Easy Ingestion to Lakehouse With COPY INTO**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
c73fcf9efc9ef252aeab38bb4602e795**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\n", + "in near real-time, and retrieved via REST API to support integrations with our\n", + "catalog partners.\n", + "\n", + "**Getting started with data lineage in Unity Catalog**\n", + "\n", + "Data lineage is available with Databricks Premium and Enterprise tiers for\n", + "no additional cost. If you already are a Databricks customer, follow the data\n", + "lineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. If you are not an existing Databricks\n", + "customer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.8\n", + "\n", + "**Easy Ingestion to Lakehouse With COPY INTO**\n", + "\n", + "by **A E M R O A M A R E , E M M A L I U , A M I T K A R A** and **J A S R A J D A N G E**\n", + "\n", + "January 17, 2023\n", + "\n", + "\n", + "A new data management architecture known as the data lakehouse emerged\n", + "independently across many organizations and use cases to support AI and BI\n", + "directly on vast amounts of data. One of the key success factors for using the\n", + "data lakehouse for analytics and machine learning is the ability to quickly and\n", + "easily ingest data of various types, including data from on-premises storage\n", + "platforms (data warehouses, mainframes), real-time streaming data, and bulk\n", + "data assets.\n", + "\n", + "As data ingestion into the lakehouse is an ongoing process that feeds the\n", + "proverbial ETL pipeline, you will need multiple options to ingest various formats,\n", + "types and latency of data. For data stored in cloud object stores such as AWS\n", + "S3, Google Cloud Storage and Azure Data Lake Storage, Databricks offers\n", + "Auto Loader, a natively integrated feature, that allows data engineers to ingest\n", + "millions of files from the cloud storage continuously. In other streaming cases\n", + "\n", + "(e.g., IoT sensor or clickstream data), Databricks provides native connectors\n", + "for Apache Spark Structured Streaming to quickly ingest data from popular\n", + "message queues, such as [Apache Kafka](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html?_ga=2.117268486.126296912.1643033657-734003504.1641217794) , Azure Event Hubs or AWS Kinesis at low\n", + "latencies. Furthermore, many customers can leverage popular ingestion tools\n", + "\n", + "\n", + "that integrate with Databricks, such as Fivetran — to easily ingest data from\n", + "enterprise applications, databases, mainframes and more into the lakehouse.\n", + "Finally, analysts can use the simple “COPY INTO” command to pull new data into\n", + "the lakehouse automatically, without the need to keep track of which files have\n", + "already been processed.\n", + "\n", + "This blog focuses on COPY INTO, a simple yet powerful SQL command that allows\n", + "you to perform batch file ingestion into Delta Lake from cloud object stores.\n", + "It’s idempotent, which guarantees to ingest files with exactly-once semantics\n", + "when executed multiple times, supporting incremental appends and simple\n", + "transformations. It can be run once, in an ad hoc manner, and can be scheduled\n", + "through Databricks Workflows. 
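Because COPY INTO is plain SQL, one lightweight way to run it ad hoc or on a schedule from outside a notebook is the Databricks SQL Connector for Python. The following is a minimal sketch, not part of the original example set: the connection details, table name and bucket path are placeholders you would replace with your own.

```python
# Sketch only: run COPY INTO through the Databricks SQL Connector for Python
# (pip install databricks-sql-connector). Hostname, HTTP path, token, table
# name and bucket path are placeholders.
from databricks import sql

COPY_STMT = """
  COPY INTO my_example_data
  FROM 's3://my-bucket/exampleData'
  FILEFORMAT = CSV
  FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true', 'mergeSchema' = 'true')
  COPY_OPTIONS ('mergeSchema' = 'true')
"""

with sql.connect(
    server_hostname="<workspace-hostname>",
    http_path="<sql-warehouse-http-path>",
    access_token="<personal-access-token>",
) as connection:
    with connection.cursor() as cursor:
        cursor.execute(COPY_STMT)
        print(cursor.fetchall())  # COPY INTO reports how many rows/files were ingested
```

Because COPY INTO is idempotent, re-running this script only picks up files that have not yet been ingested.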
In recent Databricks [Runtime releases](https://docs.databricks.com/release-notes/runtime/releases.html) , COPY\n", + "INTO introduced new functionalities for data preview, validation, enhanced error\n", + "handling, and a new way to copy into a schemaless Delta Lake table so that users\n", + "\n", + "can get started quickly, completing the end-to-end user journey to ingest from\n", + "cloud object stores. Let’s take a look at the popular COPY INTO use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**1. Ingesting data for the first time**\n", + "\n", + "\n", + "The default for data validation is to parse all the data in the source directory to\n", + "ensure that there aren’t any issues, but the rows returned for preview are limited.\n", + "Optionally, you can provide the number of rows to preview after VALIDATE.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
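As a rough illustration of the row-limited preview mentioned above, assuming the `VALIDATE <n> ROWS` form described in the COPY INTO documentation (table name and path are placeholders), a sketch run from a notebook might look like this:

```python
# Sketch only: preview-validate a limited number of rows before ingesting.
# Assumes the VALIDATE <n> ROWS form; table name and S3 path are placeholders.
preview = spark.sql("""
  COPY INTO my_example_data
  FROM 's3://my-bucket/exampleData'
  FILEFORMAT = CSV
  VALIDATE 10 ROWS
  FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true')
  COPY_OPTIONS ('mergeSchema' = 'true')
""")
display(preview)  # shows the preview rows; nothing is written to the table
```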
025b935f36585c781431ea3cee33f9b6can get started quickly, completing the end-to-end user journey to ingest from\n", + "cloud object stores. Let’s take a look at the popular COPY INTO use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**1. Ingesting data for the first time**\n", + "\n", + "\n", + "The default for data validation is to parse all the data in the source directory to\n", + "ensure that there aren’t any issues, but the rows returned for preview are limited.\n", + "Optionally, you can provide the number of rows to preview after VALIDATE.\n", + "\n", + "The COPY_OPTION “mergeSchema” specifies that it is okay to evolve the schema\n", + "of your target Delta table. Schema evolution only allows the addition of new\n", + "columns, and does not support data type changes for existing columns. In other\n", + "use cases, you can omit this option if you intend to manage your table schema\n", + "more strictly as your data pipeline may have strict schema requirements and\n", + "may not want to evolve the schema at all times. However, our target Delta table\n", + "in the example above is an empty, columnless table at the moment; therefore,\n", + "we have to specify the COPY_OPTION “mergeSchema” here.\n", + "\n", + "Figure 1: COPY INTO VALIDATE mode output\n", + "\n", + "\n", + "COPY INTO requires a table to exist as it ingests the data into a target Delta\n", + "table. However, you have no idea what your data looks like. You first create an\n", + "empty Delta table.\n", + "```\n", + " CREATE TABLE my_example_data;\n", + "\n", + "```\n", + "Before you write out your data, you may want to preview it and ensure the\n", + "data looks correct. The COPY INTO Validate mode is a new feature in\n", + "Databricks Runtime [10.3](https://docs.databricks.com/release-notes/runtime/10.3.html) and above that allows you to preview and validate\n", + "source data before ingesting many files from the cloud object stores.\n", + "These validations include:\n", + "\n", + "**•** if the data can be parsed\n", + "\n", + "**•** the schema matches that of the target table or if the schema\n", + "needs to be evolved\n", + "\n", + "**•** all nullability and check constraints on the table are met\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "\n", + "-----\n", + "\n", + "**2. Configuring COPY INTO**\n", + "\n", + "\n", + "Figure 2 shows the validate output that the header is properly parsed.\n", + "\n", + "Figure 2: COPY INTO VALIDATE mode output with enabled header and inferSchema\n", + "\n", + "**3. Appending data to a Delta table**\n", + "\n", + "Now that the preview looks good, we can remove the VALIDATE keyword and\n", + "execute the COPY INTO command.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n", + "'true' )\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "\n", + "When looking over the results of VALIDATE (see Figure 1), you may notice that\n", + "your data doesn’t look like what you want. Aren’t you glad you previewed your\n", + "data set first? The first thing you notice is the column names are not what is\n", + "specified in the CSV header. What’s worse, the header is shown as a row in your\n", + "data. 
You can configure the CSV parser by specifying FORMAT_OPTIONS.\n", + "Let’s add those next.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n", + "'true' )\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "When using the FORMAT OPTION, you can tell COPY INTO to infer the data types\n", + "of the CSV file by specifying the inferSchema option; otherwise, all default\n", + "data types are STRINGs. On the other hand, binary file formats like AVRO and\n", + "PARQUET do not need this option since they define their own schema. AnotherSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
f0435ebff1911d8a7202adcdcd4eb6ccCOPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n", + "'true' )\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "When using the FORMAT OPTION, you can tell COPY INTO to infer the data types\n", + "of the CSV file by specifying the inferSchema option; otherwise, all default\n", + "data types are STRINGs. On the other hand, binary file formats like AVRO and\n", + "PARQUET do not need this option since they define their own schema. Another\n", + "\n", + "option, “mergeSchema” states that the schema should be inferred over a\n", + "comprehensive sample of CSV files rather than just one. The comprehensive list\n", + "of format-specific options can be found in the [documentation](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/delta-copy-into#format-options) .\n", + "\n", + "\n", + "-----\n", + "\n", + "COPY INTO keeps track of the state of files that\n", + "have been ingested. Unlike commands like INSERT\n", + "INTO, users get idempotency with COPY INTO,\n", + "which means users won’t get duplicate data in\n", + "the target table when running COPY INTO multiple\n", + "times from the same source data.\n", + "\n", + "COPY INTO can be run once, in an ad hoc manner,\n", + "and can be scheduled with Databricks Workflows.\n", + "While COPY INTO does not support low latencies\n", + "for ingesting natively, you can trigger COPY INTO\n", + "through orchestrators like Apache Airflow.\n", + "\n", + "\n", + "Figure 3: Databricks workflow UI to schedule a task\n", + "\n", + "\n", + "-----\n", + "\n", + "**4. Secure data access with COPY INTO**\n", + "\n", + "COPY INTO supports secure access in several ways. In this section, we want to\n", + "highlight two new options you can use in both [Databricks SQL](https://dbricks.co/dbsql) and notebooks\n", + "from recent releases:\n", + "\n", + "**Unity Catalog**\n", + "With the general availability of Databrick Unity Catalog, you can use COPY INTO\n", + "to ingest data to Unity Catalog managed or external tables from any source and\n", + "file format supported by COPY INTO. Unity Catalog also adds new options for\n", + "configuring secure access to raw data, allowing you to use Unity Catalog external\n", + "locations or storage credentials to access data in cloud object storage. Learn\n", + "more about how to use [COPY INTO with Unity Catalog](https://docs.databricks.com/ingestion/copy-into/unity-catalog.html#use-copy-into-to-load-data-with-unity-catalog) .\n", + "\n", + "**Temporary Credentials**\n", + "What if you have not configured Unity Catalog or instance profile? How about\n", + "data from a trusted third party bucket? Here is a convenient COPY INTO feature\n", + "that allows you to [ingest data with inline temporary credentials](https://docs.databricks.com/ingestion/copy-into/temporary-credentials.html) to handle the ad\n", + "hoc bulk ingestion use case.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath' WITH (\n", + "CREDENTIAL (AWS_ACCESS_KEY `=` '...' , AWS_SECRET_KEY `=` '...' , AWS_SESSION_\n", + "TOKEN `=` '...' )\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "\n", + "\n", + "**5. Filtering files for ingestion**\n", + "\n", + "What about ingesting a subset of files where the filenames match a pattern? 
You\n", + "can apply glob patterns — a glob pattern that identifies the files to load from the\n", + "source directory. For example, let’s filter and ingest files which contain the word\n", + "`raw_data` in the filename below.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*raw_data*.csv'\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' )\n", + "\n", + "**6. Ingest files in a time period**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
375f71f98c2fbe67e42a245d06b4d39bCOPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath' WITH (\n", + "CREDENTIAL (AWS_ACCESS_KEY `=` '...' , AWS_SECRET_KEY `=` '...' , AWS_SESSION_\n", + "TOKEN `=` '...' )\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "\n", + "\n", + "**5. Filtering files for ingestion**\n", + "\n", + "What about ingesting a subset of files where the filenames match a pattern? You\n", + "can apply glob patterns — a glob pattern that identifies the files to load from the\n", + "source directory. For example, let’s filter and ingest files which contain the word\n", + "`raw_data` in the filename below.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*raw_data*.csv'\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' )\n", + "\n", + "**6. Ingest files in a time period**\n", + "\n", + "In data engineering, it is frequently necessary to ingest files that have been\n", + "modified before or after a specific timestamp. Data between two timestamps\n", + "may also be of interest. The ‘modifiedAfter’ and ‘modifiedBefore’ format options\n", + "offered by COPY INTO allow users to ingest data from a chosen time window into\n", + "a Delta table.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*raw_data_*.csv'\n", + "FORMAT_OPTIONS( 'header' `=` 'true' , 'modifiedAfter' `=` '2022-09-12T10:53:11.000+0000' )\n", + "\n", + "\n", + "-----\n", + "\n", + "**7. Correcting data with the force option**\n", + "\n", + "Because COPY INTO is by default idempotent, running the same query against\n", + "the same source files more than once has no effect on the destination table\n", + "after the initial execution. You must propagate changes to the target table\n", + "because, in real-world circumstances, source data files in cloud object storage\n", + "may be altered for correction at a later time. In such a case, it is possible to first\n", + "erase the data from the target table before ingesting the more recent data files\n", + "from the source. For this operation you only need to set the copy option ‘force’\n", + "to ‘true’.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*raw_data_2022*.csv'\n", + "FORMAT_OPTIONS( 'header' `=` 'true' )\n", + "COPY_OPTIONS ( 'force' `=` 'true' )\n", + "\n", + "\n", + "**8. Applying simple transformations**\n", + "\n", + "What if you want to rename columns? Or the source data has changed and a\n", + "previous column has been renamed to something else? You don’t want to ingest\n", + "that data as two separate columns, but as a single column. We can leverage the\n", + "SELECT statement in COPY INTO to perform simple transformations.\n", + "\n", + "COPY INTO demo.my_example_data\n", + "FROM ( SELECT concat(first_name, \" \", last_name) as full_name,\n", + "`*` EXCEPT (first_name, last_name)\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*.csv'\n", + "FORMAT_OPTIONS( 'header' `=` 'true' )\n", + "COPY_OPTIONS ( 'force' `=` 'true' )\n", + "\n", + "**9. Error handling and observability with COPY INTO**\n", + "\n", + "**Error handling:**\n", + "How about ingesting data with file corruption issues? 
Common examples of file\n", + "corruption are:\n", + "\n", + "**•** Files with an incorrect file format\n", + "\n", + "**•** Failure to decompress\n", + "\n", + "**•** Unreadable files (e.g., invalid Parquet)\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
6dc0f31a19a017a8eebf29308cb0bac0COPY INTO demo.my_example_data\n", + "FROM ( SELECT concat(first_name, \" \", last_name) as full_name,\n", + "`*` EXCEPT (first_name, last_name)\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*.csv'\n", + "FORMAT_OPTIONS( 'header' `=` 'true' )\n", + "COPY_OPTIONS ( 'force' `=` 'true' )\n", + "\n", + "**9. Error handling and observability with COPY INTO**\n", + "\n", + "**Error handling:**\n", + "How about ingesting data with file corruption issues? Common examples of file\n", + "corruption are:\n", + "\n", + "**•** Files with an incorrect file format\n", + "\n", + "**•** Failure to decompress\n", + "\n", + "**•** Unreadable files (e.g., invalid Parquet)\n", + "\n", + "\n", + "-----\n", + "\n", + "COPY INTO’s format option ignoreCorruptFiles helps skip those files while\n", + "processing. The result of the COPY INTO command returns the number of files\n", + "skipped in the num_skipped_corrupt_files column. In addition, these corrupt\n", + "files aren’t tracked by the ingestion state in COPY INTO, therefore they can be\n", + "reloaded in a subsequent execution once the corruption is fixed. This option is\n", + "available in Databricks [Runtime 11.0+](https://docs.databricks.com/release-notes/runtime/11.0.html) .\n", + "\n", + "You can see which files have been detected as corrupt by running COPY INTO in\n", + "VALIDATE mode.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE ALL\n", + "FORMAT_OPTIONS( 'ignoreCorruptFiles' `=` 'true' )\n", + "\n", + "**Observability:**\n", + "In Databricks Runtime 10.5, [file metadata column](https://docs.databricks.com/ingestion/file-metadata-column.html) was introduced to provide\n", + "input file metadata information, which allows users to monitor and get key\n", + "properties of the ingested files like path, name, size and modification time, by\n", + "querying a hidden STRUCT column called _metadata. To include this information\n", + "in the destination, you must explicitly reference the _metadata column in your\n", + "query in COPY INTO.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM (\n", + "SELECT `*` , _metadata source_metadata FROM 's3://my-bucket/\n", + "exampleDataPath'\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "\n", + "\n", + "**How does it compare to Auto Loader?**\n", + "\n", + "COPY INTO is a simple and powerful command to use when your source\n", + "directory contains a small number of files (i.e., thousands of files or less), and if\n", + "you prefer SQL. In addition, COPY INTO can be used over JDBC to push data into\n", + "Delta Lake at your convenience, a common pattern by many ingestion partners.\n", + "To ingest a larger number of files both in streaming and batch we recommend\n", + "using [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) . 
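For a rough sense of the Auto Loader alternative just mentioned, a minimal sketch in Python follows; the paths, schema/checkpoint locations and target table name are placeholders, not values from the examples in this section.

```python
# Sketch only: Auto Loader ingestion for large or continuously arriving file
# volumes. Paths, schema/checkpoint locations and table name are placeholders.
(
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("header", "true")
    .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/exampleData")
    .load("s3://my-bucket/exampleDataPath")
    .writeStream
    .option("checkpointLocation", "s3://my-bucket/_checkpoints/exampleData")
    .trigger(availableNow=True)  # batch-like incremental run; omit for continuous streaming
    .toTable("my_example_data")
)
```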
In addition, for a modern data pipeline based on [medallion](https://www.databricks.com/glossary/medallion-architecture)\n", + "[architecture](https://www.databricks.com/glossary/medallion-architecture) , we recommend using Auto Loader in [Delta Live Tables pipelines](https://docs.databricks.com/ingestion/auto-loader/dlt.html) ,\n", + "leveraging advanced capabilities of automatic error handling, quality control,\n", + "data lineage and setting [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html) in a declarative approach.\n", + "\n", + "**How to get started?**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
f319ce9e48ca97f0ca3c7c1441bc406b**How to get started?**\n", + "\n", + "To get started, you can go to **[Databricks SQL](https://dbricks.co/dbsql)** query editor, update and run the\n", + "example SQL commands to ingest from your cloud object stores. Check out\n", + "the options in No. 4 to establish secure access to your data for querying it in\n", + "Databricks SQL. To get familiar with COPY INTO in Databricks SQL, you can also\n", + "follow this [quickstart tutorial.](https://docs.databricks.com/ingestion/copy-into/tutorial-dbsql.html)\n", + "\n", + "As an alternative, you can use this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/db-385-demo_copy_into.html) in Data Science & Engineering and\n", + "Machine Learning workspaces to learn most of the COPY INTO features in this\n", + "blog, where source data and target Delta tables are generated in DBFS.\n", + "\n", + "More tutorials for COPY INTO can be found [here](https://docs.databricks.com/ingestion/copy-into/index.html) .\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.9 \u0007\n", + "\n", + "**Simplifying Change Data Capture With Databricks Delta Live Tables**\n", + "\n", + "by **M O J G A N M A Z O U C H I**\n", + "\n", + "April 25, 2022\n", + "\n", + "\n", + "This guide will demonstrate how you can leverage change data capture in Delta\n", + "Live Tables pipelines to identify new records and capture changes made to the\n", + "data set in your data lake. Delta Live Tables pipelines enable you to develop\n", + "scalable, reliable and low latency data pipelines, while performing change data\n", + "capture in your data lake with minimum required computation resources and\n", + "seamless out-of-order data handling.\n", + "\n", + "**Note:** We recommend following [Getting Started with Delta Live Tables](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables)\n", + "which explains creating scalable and reliable pipelines using Delta Live Tables\n", + "(DLT) and its declarative ETL definitions.\n", + "\n", + "**Background on change data capture**\n", + "\n", + "Change data capture ( [CDC](https://en.wikipedia.org/wiki/Change_data_capture) ) is a process that identifies and captures incremental\n", + "changes (data deletes, inserts and updates) in databases, like tracking customer,\n", + "order or product status for near-real-time data applications. CDC provides real-time data evolution by processing data in a continuous incremental fashion as\n", + "new events occur.\n", + "\n", + "\n", + "Since [over 80% of organizations plan on implementing multicloud strategies](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/)\n", + "[by 2025](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/) , choosing the right approach for your business that allows seamless\n", + "real-time centralization of all data changes in your ETL pipeline across multiple\n", + "environments is critical.\n", + "\n", + "By capturing CDC events, Databricks users can re-materialize the source table\n", + "as Delta Table in Lakehouse and run their analysis on top of it, while being able\n", + "to combine data with external systems. 
The MERGE INTO command in Delta Lake\n", + "on Databricks enables customers to efficiently upsert and delete records in\n", + "their data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) .\n", + "This is a common use case that we observe many of Databricks customers are\n", + "leveraging Delta Lakes to perform, and keeping their data lakes up to date with\n", + "real-time business data.\n", + "\n", + "While Delta Lake provides a complete solution for real-time CDC synchronization\n", + "in a data lake, we are now excited to announce the change data capture feature\n", + "in Delta Live Tables that makes your architecture even simpler, more efficient and\n", + "scalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
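For context on what the APPLY CHANGES INTO feature discussed below automates, a hand-rolled MERGE INTO upsert looks roughly like the sketch that follows. Table and column names are illustrative, and the inner query is the manual deduplication step (keep only the latest change per key) that out-of-order source data otherwise requires.

```python
# Illustrative only: a manual MERGE INTO upsert with explicit deduplication by
# key and sequence column. Table and column names are placeholders.
spark.sql("""
  MERGE INTO silver_customers AS t
  USING (
    SELECT id, firstname, lastname, email, address, operation
    FROM (
      SELECT *,
             ROW_NUMBER() OVER (PARTITION BY id ORDER BY operation_date DESC) AS rn
      FROM bronze_customer_updates
    ) AS ranked
    WHERE rn = 1
  ) AS s
  ON t.id = s.id
  WHEN MATCHED AND s.operation = 'DELETE' THEN DELETE
  WHEN MATCHED THEN UPDATE SET
    t.firstname = s.firstname, t.lastname = s.lastname,
    t.email = s.email, t.address = s.address
  WHEN NOT MATCHED AND s.operation != 'DELETE' THEN
    INSERT (id, firstname, lastname, email, address)
    VALUES (s.id, s.firstname, s.lastname, s.email, s.address)
""")
```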
00ed295603d6779afac889a6d0524a93By capturing CDC events, Databricks users can re-materialize the source table\n", + "as Delta Table in Lakehouse and run their analysis on top of it, while being able\n", + "to combine data with external systems. The MERGE INTO command in Delta Lake\n", + "on Databricks enables customers to efficiently upsert and delete records in\n", + "their data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) .\n", + "This is a common use case that we observe many of Databricks customers are\n", + "leveraging Delta Lakes to perform, and keeping their data lakes up to date with\n", + "real-time business data.\n", + "\n", + "While Delta Lake provides a complete solution for real-time CDC synchronization\n", + "in a data lake, we are now excited to announce the change data capture feature\n", + "in Delta Live Tables that makes your architecture even simpler, more efficient and\n", + "scalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.\n", + "\n", + "Earlier CDC solutions with Delta tables were using MERGE INTO operation, which\n", + "requires manually ordering the data to avoid failure when multiple rows of the\n", + "source data set match while attempting to update the same rows of the target\n", + "\n", + "\n", + "-----\n", + "\n", + "Delta table. To handle the out-of-order data, there was an extra step required to\n", + "preprocess the source table using a foreachBatch implementation to eliminate\n", + "the possibility of multiple matches, retaining only the latest change for each\n", + "key (see the [change data capture example](https://www.databricks.com/blog/2022/04/25/simplifying-change-data-capture-with-databricks-delta-live-tables.html#) ). The new APPLY CHANGES INTO\n", + "operation in DLT pipelines automatically and seamlessly handles out-of-order\n", + "data without any need for data engineering manual intervention.\n", + "\n", + "**CDC with Databricks Delta Live Tables**\n", + "\n", + "In this blog, we will demonstrate how to use the APPLY CHANGES INTO command\n", + "in Delta Live Tables pipelines for a common CDC use case where the CDC data\n", + "is coming from an external system. A variety of CDC tools are available such\n", + "as Debezium, Fivetran, Qlik Replicate, Talend, and StreamSets. While specific\n", + "implementations differ, these tools generally capture and record the history\n", + "of data changes in logs; downstream applications consume these CDC logs. In\n", + "our example, data is landed in cloud object storage from a CDC tool such as\n", + "Debezium, Fivetran, etc.\n", + "\n", + "We have data from various CDC tools landing in a cloud object storage or a\n", + "message queue like Apache Kafka. Typically we see CDC used in an ingestion\n", + "to what we refer as the medallion architecture. A medallion architecture is a\n", + "data design pattern used to logically organize data in a Lakehouse, with the\n", + "goal of incrementally and progressively improving the structure and quality of\n", + "data as it flows through each layer of the architecture. Delta Live Tables allows\n", + "you to seamlessly apply changes from CDC feeds to tables in your Lakehouse;\n", + "combining this functionality with the medallion architecture allows for\n", + "\n", + "\n", + "incremental changes to easily flow through analytical workloads at scale. 
Using\n", + "CDC together with the medallion architecture provides multiple benefits to users\n", + "since only changed or added data needs to be processed. Thus, it enables users\n", + "to cost-effectively keep Gold tables up-to-date with the latest business data.\n", + "\n", + "**NOTE:** The example here applies to both SQL and Python versions of CDC\n", + "and also on a specific way to use the operations; to evaluate variations,\n", + "please see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n", + "\n", + "**Prerequisites**\n", + "\n", + "To get the most out of this guide, you should have a basic familiarity with:\n", + "\n", + "**•** SQL or Python\n", + "\n", + "**•** Delta Live Tables\n", + "\n", + "**•** Developing ETL pipelines and/or working with Big Data systems\n", + "\n", + "**•** Databricks interactive notebooks and clusters\n", + "\n", + "**•** You must have access to a Databricks Workspace with permissions\n", + "to create new clusters, run jobs, and save data to a location on\n", + "external cloud object storage or [DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
bad7fff4d3f967a085f6f130ed286489**NOTE:** The example here applies to both SQL and Python versions of CDC\n", + "and also on a specific way to use the operations; to evaluate variations,\n", + "please see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n", + "\n", + "**Prerequisites**\n", + "\n", + "To get the most out of this guide, you should have a basic familiarity with:\n", + "\n", + "**•** SQL or Python\n", + "\n", + "**•** Delta Live Tables\n", + "\n", + "**•** Developing ETL pipelines and/or working with Big Data systems\n", + "\n", + "**•** Databricks interactive notebooks and clusters\n", + "\n", + "**•** You must have access to a Databricks Workspace with permissions\n", + "to create new clusters, run jobs, and save data to a location on\n", + "external cloud object storage or [DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)\n", + "\n", + "**•** For the pipeline we are creating in this blog, “Advanced” product\n", + "edition which supports enforcement of data quality constraints,\n", + "needs to be selected\n", + "\n", + "\n", + "-----\n", + "\n", + "**The data set**\n", + "\n", + "Here we are consuming realistic looking CDC data from an external database. In\n", + "this pipeline, we will use the [Faker](https://github.com/joke2k/faker) library to generate the data set that a CDC\n", + "tool like Debezium can produce and bring into cloud storage for the initial ingest\n", + "in Databricks. Using [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) we incrementally load the messages from cloud\n", + "object storage, and store them in the Bronze table as it stores the raw messages.\n", + "The Bronze tables are intended for data ingestion which enable quick access to a\n", + "single source of truth. Next we perform APPLY CHANGES INTO from the cleaned\n", + "Bronze layer table to propagate the updates downstream to the Silver table. As\n", + "data flows to Silver tables, generally it becomes more refined and optimized\n", + "(“just-enough”) to provide an enterprise a view of all its key business entities.\n", + "See the diagram below.\n", + "\n", + "\n", + "This blog focuses on a simple example that requires a JSON message with\n", + "four fields of customer’s name, email, address and id along with the two fields:\n", + "operation (which stores operation code (DELETE, APPEND, UPDATE, CREATE) and\n", + "operation_date (which stores the date and timestamp for the record came for\n", + "each operation action) to describe the changed data.\n", + "\n", + "To generate a sample data set with the above fields, we are using a Python\n", + "package that generates fake data, Faker. You can find the notebook related to this\n", + "data generation section [here](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/1-cdc-data-generator.html) . In this notebook we provide the name and storage\n", + "location to write the generated data there. We are using the DBFS functionality of\n", + "Databricks; see the [DBFS documentation](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) to learn more about how it works. 
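As a rough sketch of the kind of records such a generator produces (field names follow the message layout described above; the linked notebook's actual implementation uses a PySpark user-defined function and differs in detail):

```python
# Sketch only: generate CDC-style records with Faker and land them where the
# pipeline expects them ("/tmp/demo/cdc_raw/customers"). Field names follow the
# message layout described above.
import random
from datetime import datetime, timezone
from faker import Faker

fake = Faker()

def fake_cdc_event():
    return {
        "id": fake.uuid4(),
        "firstname": fake.first_name(),
        "lastname": fake.last_name(),
        "email": fake.email(),
        "address": fake.address().replace("\n", ", "),
        "operation": random.choice(["APPEND", "UPDATE", "DELETE"]),
        "operation_date": datetime.now(timezone.utc).isoformat(),
    }

events = [fake_cdc_event() for _ in range(100)]
spark.createDataFrame(events).write.mode("append").json("/tmp/demo/cdc_raw/customers")
```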
Then,\n", + "we use a PySpark user-defined function to generate the synthetic data set for\n", + "each field, and write the data back to the defined storage location, which we will\n", + "refer to in other notebooks for accessing the synthetic data set.\n", + "\n", + "**Ingesting the raw data set using Auto Loader**\n", + "\n", + "According to the medallion architecture paradigm, the Bronze layer holds the\n", + "most raw data quality. At this stage we can incrementally read new data using\n", + "Auto Loader from a location in cloud storage. Here we are adding the path to our\n", + "generated data set to the configuration section under pipeline settings, which\n", + "allows us to load the source path as a variable. So now our configuration under\n", + "pipeline settings looks like below:\n", + "\n", + "\"configuration\" : {\n", + "\"source\" : \"/tmp/demo/cdc_raw\"\n", + "\n", + "\n", + "-----\n", + "\n", + "Then we load this configuration property in our notebooks.\n", + "\n", + "Let’s take a look at the Bronze table we will ingest, a. In SQL, and b. Using Python\n", + "\n", + "**A . S Q L**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
42209110d6df5aaf79390f53667c369f**Ingesting the raw data set using Auto Loader**\n", + "\n", + "According to the medallion architecture paradigm, the Bronze layer holds the\n", + "most raw data quality. At this stage we can incrementally read new data using\n", + "Auto Loader from a location in cloud storage. Here we are adding the path to our\n", + "generated data set to the configuration section under pipeline settings, which\n", + "allows us to load the source path as a variable. So now our configuration under\n", + "pipeline settings looks like below:\n", + "\n", + "\"configuration\" : {\n", + "\"source\" : \"/tmp/demo/cdc_raw\"\n", + "\n", + "\n", + "-----\n", + "\n", + "Then we load this configuration property in our notebooks.\n", + "\n", + "Let’s take a look at the Bronze table we will ingest, a. In SQL, and b. Using Python\n", + "\n", + "**A . S Q L**\n", + "\n", + "SET spark.source;\n", + "CREATE STREAMING LIVE TABLE customer_bronze\n", + "(\n", + "address string ,\n", + "email string ,\n", + "id string ,\n", + "firstname string ,\n", + "lastname string ,\n", + "operation string ,\n", + "operation_date string ,\n", + "_rescued_data string\n", + ")\n", + "TBLPROPERTIES ( \"quality\" = \"bronze\" )\n", + "COMMENT \"New customer data incrementally ingested from cloud object\n", + "storage landing zone\"\n", + "AS\n", + "SELECT *\n", + "FROM cloud_files( \"${source}/customers\" , \"json\" , map( \"cloudFiles.\n", + "inferColumnTypes\" , \"true\" ));\n", + "\n", + "\n", + "**B . P Y T H O N**\n", + "\n", + "import dlt\n", + "from pyspark.sql.functions import *\n", + "from pyspark.sql.types import *\n", + "\n", + "source = spark.conf.get( \"source\" )\n", + "\n", + "@dlt.table(name= \"customer_bronze\" ,\n", + "comment = \"New customer data incrementally ingested from\n", + "cloud object storage landing zone\" ,\n", + "table_properties={\n", + "\"quality\" : \"bronze\"\n", + "}\n", + ")\n", + "```\n", + " def customer_bronze ():\n", + "\n", + "```\n", + "return (\n", + "spark.readStream. format ( \"cloudFiles\" ) \\\n", + ".option( \"cloudFiles.format\" , \"json\" ) \\\n", + ".option( \"cloudFiles.inferColumnTypes\" , \"true\" ) \\\n", + ".load( f\" {source} /customers\" )\n", + ")\n", + "\n", + "The above statements use the Auto Loader to create a streaming live table\n", + "called customer_bronze from json files. When using Auto Loader in Delta Live\n", + "\n", + "Tables, you do not need to provide any location for schema or checkpoint, as\n", + "those locations will be managed automatically by your DLT pipeline.\n", + "\n", + "Auto Loader provides a Structured Streaming source called cloud_files in\n", + "SQL and cloudFiles in Python, which takes a cloud storage path and format as\n", + "parameters.\n", + "\n", + "To reduce compute costs, we recommend running the DLT pipeline in\n", + "Triggered mode as a micro-batch assuming you do not have very low latency\n", + "requirements.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Expectations and high-quality data**\n", + "\n", + "In the next step to create a high-quality, diverse, and accessible data set,\n", + "we impose quality check expectation criteria using Constraints. Currently,\n", + "a constraint can be either retain, drop, or fail. For more detail see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html) . All\n", + "constraints are logged to enable streamlined quality monitoring.\n", + "\n", + "**A . 
S Q L**\n", + "\n", + "CREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(\n", + "CONSTRAINT valid_id EXPECT (id IS NOT NULL ) ON VIOLATION DROP ROW ,\n", + "CONSTRAINT valid_address EXPECT (address IS NOT NULL ),\n", + "CONSTRAINT valid_operation EXPECT (operation IS NOT NULL ) ON VIOLATION\n", + "DROP ROW\n", + ")\n", + "TBLPROPERTIES (\"quality\" `=` \"silver\")\n", + "COMMENT \"Cleansed bronze customer view (i.e. what will become Silver)\"\n", + "AS SELECT `*`\n", + "FROM STREAM(LIVE.customer_bronze);\n", + "\n", + "**B . P Y T H O N**\n", + "```\n", + " @dlt.view(name= \"customer_bronze_clean_v\" ,\n", + " comment= \"Cleansed bronze customer view (i.e. what will become Silver)\" )\n", + "\n", + "```SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
16725d26cc4892da8c004b7b86e4208e**A . S Q L**\n", + "\n", + "CREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(\n", + "CONSTRAINT valid_id EXPECT (id IS NOT NULL ) ON VIOLATION DROP ROW ,\n", + "CONSTRAINT valid_address EXPECT (address IS NOT NULL ),\n", + "CONSTRAINT valid_operation EXPECT (operation IS NOT NULL ) ON VIOLATION\n", + "DROP ROW\n", + ")\n", + "TBLPROPERTIES (\"quality\" `=` \"silver\")\n", + "COMMENT \"Cleansed bronze customer view (i.e. what will become Silver)\"\n", + "AS SELECT `*`\n", + "FROM STREAM(LIVE.customer_bronze);\n", + "\n", + "**B . P Y T H O N**\n", + "```\n", + " @dlt.view(name= \"customer_bronze_clean_v\" ,\n", + " comment= \"Cleansed bronze customer view (i.e. what will become Silver)\" )\n", + "\n", + "```\n", + "\n", + "**Using APPLY CHANGES INTO statement to propagate changes to**\n", + "\n", + "**downstream target table**\n", + "\n", + "Prior to executing the Apply Changes Into query, we must ensure that a target\n", + "streaming table which we want to hold the most up-to-date data exists. If it\n", + "does not exist we need to create one. Below cells are examples of creating a\n", + "target streaming table. Note that at the time of publishing this blog, the target\n", + "streaming table creation statement is required along with the Apply Changes\n", + "Into query, and both need to be present in the pipeline — otherwise your table\n", + "creation query will fail.\n", + "\n", + "**A . S Q L**\n", + "\n", + "CREATE STREAMING LIVE TABLE customer_silver\n", + "TBLPROPERTIES (\"quality\" `=` \"silver\")\n", + "COMMENT \"Clean, merged customers\";\n", + "\n", + "**B . P Y T H O N**\n", + "\n", + "dlt.create_target_table(name= \"customer_silver\" ,\n", + "comment= \"Clean, merged customers\" ,\n", + "table_properties={\n", + "\"quality\" : \"silver\"\n", + "\n", + "```\n", + "@dlt.expect_or_drop( \"valid_id\" , \"id IS NOT NULL\" )\n", + "@dlt.expect( \"valid_address\" , \"address IS NOT NULL\" )\n", + "@dlt.expect_or_drop( \"valid_operation\" , \"operation IS NOT NULL\" )\n", + "def customer_bronze_clean_v ():\n", + " return dlt.read_stream( \"customer_bronze\" ) \\\n", + "\n", + "```\n", + "`.select(` `\"address\"` `,` `\"email\"` `,` `\"id\"` `,` \"firstname\" `,` `\"lastname\"` `,`\n", + "```\n", + "\"operation\" , \"operation_date\" , \"_rescued_data\" )\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "Now that we have a target streaming table, we can propagate changes to the\n", + "downstream target table using the Apply Changes Into query. While CDC feed\n", + "comes with INSERT, UPDATE and DELETE events, DLT default behavior is to apply\n", + "INSERT and UPDATE events from any record in the source data set matching\n", + "on primary keys, and sequenced by a field which identifies the order of events.\n", + "More specifically it updates any row in the existing target table that matches\n", + "the primary key(s) or inserts a new row when a matching record does not exist\n", + "in the target streaming table. We can use APPLY AS DELETE WHEN in SQL, or its\n", + "equivalent apply_as_deletes argument in Python to handle DELETE events.\n", + "\n", + "In this example we used \"id\" as my primary key, which uniquely identifies the\n", + "customers and allows CDC events to apply to those identified customer records\n", + "in the target streaming table. 
Since \"operation_date\" keeps the logical order of\n", + "CDC events in the source data set, we use \"SEQUENCE BY operation_date\" in\n", + "SQL, or its equivalent \"sequence_by = col(\"operation_date\")\" in Python to handle\n", + "change events that arrive out of order. Keep in mind that the field value we use\n", + "with SEQUENCE BY (or sequence_by) should be unique among all updates to\n", + "the same key. In most cases, the sequence by column will be a column with\n", + "timestamp information.\n", + "\n", + "Finally we used \"COLUMNS * EXCEPT (operation, operation_date, _rescued_\n", + "data)\" in SQL, or its equivalent \"except_column_list\"= [\"operation\", \"operation_\n", + "date\", \"_rescued_data\"] in Python to exclude three columns of \"operation\",\n", + "\"operation_date\", \"_rescued_data\" from the target streaming table. By default all\n", + "the columns are included in the target streaming table, when we do not specify\n", + "the \"COLUMNS\" clause.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
23039b84f7a58b633ecbba062199cfb8In this example we used \"id\" as my primary key, which uniquely identifies the\n", + "customers and allows CDC events to apply to those identified customer records\n", + "in the target streaming table. Since \"operation_date\" keeps the logical order of\n", + "CDC events in the source data set, we use \"SEQUENCE BY operation_date\" in\n", + "SQL, or its equivalent \"sequence_by = col(\"operation_date\")\" in Python to handle\n", + "change events that arrive out of order. Keep in mind that the field value we use\n", + "with SEQUENCE BY (or sequence_by) should be unique among all updates to\n", + "the same key. In most cases, the sequence by column will be a column with\n", + "timestamp information.\n", + "\n", + "Finally we used \"COLUMNS * EXCEPT (operation, operation_date, _rescued_\n", + "data)\" in SQL, or its equivalent \"except_column_list\"= [\"operation\", \"operation_\n", + "date\", \"_rescued_data\"] in Python to exclude three columns of \"operation\",\n", + "\"operation_date\", \"_rescued_data\" from the target streaming table. By default all\n", + "the columns are included in the target streaming table, when we do not specify\n", + "the \"COLUMNS\" clause.\n", + "\n", + "\n", + "**A . S Q L**\n", + "\n", + "APPLY CHANGES INTO LIVE.customer_silver\n", + "FROM stream(LIVE.customer_bronze_clean_v)\n", + "KEYS (id)\n", + "APPLY AS DELETE WHEN operation `=` \"DELETE\"\n", + "SEQUENCE BY operation_date\n", + "COLUMNS `*` EXCEPT (operation, operation_date,\n", + "_rescued_data);\n", + "\n", + "**B . P Y T H O N**\n", + "```\n", + " dlt.apply_changes(\n", + " target = \"customer_silver\",\n", + " source = \"customer_bronze_clean_v\",\n", + " keys = [\"id\"],\n", + " sequence_by = col(\"operation_date\"),\n", + " apply_as_deletes = expr(\"operation = 'DELETE'\"),\n", + " except_column_list = [\"operation\", \"operation_date\", \"_rescued_data\"])\n", + "\n", + "```\n", + "To check out the full list of available clauses see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#requirements) .\n", + "\n", + "Please note that, at the time of publishing this blog, a table that reads from the\n", + "target of an APPLY CHANGES INTO query or apply_changes function must be a\n", + "live table, and cannot be a streaming live table.\n", + "\n", + "A [SQL](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-retail-dlt-cdc-sql.html) and [Python](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-Retail_DLT_CDC_Python.html) notebook is available for reference for this section. Now that\n", + "we have all the cells ready, let’s create a pipeline to ingest data from cloud object\n", + "storage. 
Open Jobs in a new tab or window in your workspace, and select “Delta\n", + "Live Tables.”\n", + "\n", + "\n", + "-----\n", + "\n", + "The pipeline associated with this blog has the following DLT pipeline settings:\n", + "\n", + "{\n", + "\"clusters\" : [\n", + "{\n", + "\"label\" : \"default\" ,\n", + "\"num_workers\" : 1\n", + "}\n", + "],\n", + "\"development\" : true ,\n", + "\"continuous\" : false ,\n", + "\"edition\" : \"advanced\" ,\n", + "\"photon\" : false ,\n", + "\"libraries\" : [\n", + "{\n", + "\"notebook\" : {\n", + "\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\n", + "notebooks/1-CDC_DataGenerator\"\n", + "}\n", + "},\n", + "{\n", + "\"notebook\" : {\n", + "\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\n", + "notebooks/2-Retail_DLT_CDC_sql\"\n", + "}\n", + "}\n", + "],\n", + "\"name\" : \"CDC_blog\" ,\n", + "\"storage\" : \"dbfs:/home/mydir/myDB/dlt_storage\" ,\n", + "\"configuration\" : {\n", + "\"source\" : \"/tmp/demo/cdc_raw\" ,\n", + "\"pipelines.applyChangesPreviewEnabled\" : \"true\"\n", + "},\n", + "\"target\" : \"my_database\"\n", + "\n", + "\n", + "1. Select “Create Pipeline” to create a new pipeline\n", + "\n", + "2. Specify a name such as “Retail CDC Pipeline”SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
24fa9012ddc358cd759a98159e4681101. Select “Create Pipeline” to create a new pipeline\n", + "\n", + "2. Specify a name such as “Retail CDC Pipeline”\n", + "\n", + "3. Specify the Notebook Paths that you already created earlier, one for the\n", + "generated data set using Faker package, and another path for the ingestion\n", + "of the generated data in DLT. The second notebook path can refer to the\n", + "notebook written in SQL, or Python depending on your language of choice.\n", + "\n", + "4. To access the data generated in the first notebook, add the data set path in\n", + "configuration. Here we stored data in “/tmp/demo/cdc_raw/customers”, so\n", + "we set “source” to “/tmp/demo/cdc_raw/” to reference “source/customers” in\n", + "our second notebook.\n", + "\n", + "5. Specify the Target (which is optional and referring to the target database),\n", + "where you can query the resulting tables from your pipeline\n", + "\n", + "6. Specify the Storage Location in your object storage (which is optional), to\n", + "access your DLT produced data sets and metadata logs for your pipeline\n", + "\n", + "7. Set Pipeline Mode to Triggered. In Triggered mode, DLT pipeline will consume\n", + "new data in the source all at once, and once the processing is done it will\n", + "terminate the compute resource automatically. You can toggle between\n", + "Triggered and Continuous modes when editing your pipeline settings. Setting\n", + "“continuous”: false in the JSON is equivalent to setting the pipeline to\n", + "Triggered mode.\n", + "\n", + "8. For this workload you can disable the autoscaling under Autopilot Options,\n", + "and use only one worker cluster. For production workloads, we recommend\n", + "enabling autoscaling and setting the maximum numbers of workers needed\n", + "for cluster size.\n", + "\n", + "9. Select “Start”\n", + "\n", + "10. Your pipeline is created and running now!\n", + "\n", + "\n", + "-----\n", + "\n", + "You can check out our previous deep dive on the topic [here](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) . Try this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/3-retail-dlt-cdc-monitoring.html)\n", + "to see pipeline observability and data quality monitoring on the example DLT\n", + "pipeline associated with this blog.\n", + "\n", + "**Conclusion**\n", + "\n", + "In this blog, we showed how we made it seamless for users to efficiently\n", + "implement change data capture (CDC) into their lakehouse platform with Delta\n", + "Live Tables (DLT). DLT provides built-in quality controls with deep visibility into\n", + "pipeline operations, observing pipeline lineage, monitoring schema, and quality\n", + "checks at each step in the pipeline. DLT supports automatic error handling and\n", + "best in class auto-scaling capability for streaming workloads, which enables\n", + "users to have quality data with optimum resources required for their workload.\n", + "\n", + "Data engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n", + "[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. 
This new capability lets\n", + "your ETL pipelines easily identify changes and apply those changes across tens\n", + "of thousands of tables with low-latency support.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
3725239f09e02ff819d5ef17ec0f1c5a**Conclusion**\n", + "\n", + "In this blog, we showed how we made it seamless for users to efficiently\n", + "implement change data capture (CDC) into their lakehouse platform with Delta\n", + "Live Tables (DLT). DLT provides built-in quality controls with deep visibility into\n", + "pipeline operations, observing pipeline lineage, monitoring schema, and quality\n", + "checks at each step in the pipeline. DLT supports automatic error handling and\n", + "best in class auto-scaling capability for streaming workloads, which enables\n", + "users to have quality data with optimum resources required for their workload.\n", + "\n", + "Data engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n", + "[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. This new capability lets\n", + "your ETL pipelines easily identify changes and apply those changes across tens\n", + "of thousands of tables with low-latency support.\n", + "\n", + "**Ready to get started and try out CDC in Delta Live Tables for yourself?**\n", + "Please watch this [webinar](https://www.databricks.com/p/webinar/tackle-data-transformation) to learn how Delta Live Tables simplifies the\n", + "complexity of data transformation and ETL, and see our [Change data capture](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE)\n", + "[with Delta Live Tables](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE) document, official [github](https://github.com/databricks/delta-live-tables-notebooks) and follow the steps in this\n", + "[video](https://vimeo.com/700994477) to create your pipeline!\n", + "\n", + "\n", + "**DLT pipeline lineage observability and data quality**\n", + "**monitoring**\n", + "\n", + "All DLT pipeline logs are stored in the pipeline’s storage location. You can specify\n", + "your storage location only when you are creating your pipeline. Note that once\n", + "the pipeline is created you can no longer modify storage location.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.10 \u0007\n", + "\n", + "**Best Practices for Cross-Government Data Sharing**\n", + "\n", + "by **M I L O S C O L I C , P R I T E S H P AT E L , R O B E R T W H I F F I N , R I C H A R D J A M E S W I L S O N ,**\n", + "\n", + "**M A R C E L L F E R E N C Z** and **E D W A R D K E L LY**\n", + "\n", + "February 21, 2023SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
SECTION 2.10

**Best Practices for Cross-Government Data Sharing**

by **Milos Colic, Pritesh Patel, Robert Whiffin, Richard James Wilson, Marcell Ferencz** and **Edward Kelly**

February 21, 2023

Government data exchange is the practice of sharing data between different government agencies, and often with partners in commercial sectors. Governments can share data for various reasons, such as to improve the efficiency of government operations, provide better services to the public, or support research and policymaking. In addition, data exchange in the public sector can involve sharing data with the private sector or receiving data from it. The considerations span multiple jurisdictions and almost all industries. In this blog, we will address the needs disclosed as part of national data strategies and how modern technologies, particularly Delta Sharing, Unity Catalog and clean rooms, can help you design, implement and manage a future-proof and sustainable data ecosystem.

**Data sharing and public sector**

“The miracle is this: the more we share the more we have.” — [Leonard Nimoy](https://en.wikipedia.org/wiki/Leonard_Nimoy)

This is probably the quote about sharing that applies most profoundly to the topic of data sharing, to the extent that the purpose of sharing data is to create new information, new insights and new data. The importance of data sharing is amplified even further in the government context, where federation between departments allows for increased focus. Still, that very same federation introduces challenges around data completeness, data quality, data access, security and control, [FAIR](https://en.wikipedia.org/wiki/FAIR_data)-ness of data, etc. These challenges are far from trivial and require a strategic, multifaceted approach to be addressed appropriately. Technology, people, processes, legal frameworks, etc., all require dedicated consideration when designing a robust data sharing ecosystem.

[The National Data Strategy](https://www.gov.uk/government/publications/uk-national-data-strategy/national-data-strategy) (NDS) by the UK government outlines five actionable missions through which we can materialize the value of data for the citizen and society-wide benefits.

It comes as no surprise that each and every one of the missions is strongly related to the concept of data sharing, or more broadly, data access both within and outside of government departments:

**1. Unlocking the value of the data across the economy** — Mission 1 of the NDS aims to assert government and the regulators as enablers of value extraction from data through the adoption of best practices. The UK data economy was estimated to be near [£125 billion in 2021](https://www.gov.uk/government/publications/uks-digital-strategy/uk-digital-strategy) with an upwards trend. In this context, it is essential to understand that government-collected and provided open data can be crucial for addressing many of the challenges across all industries.
For example, insurance providers can better assess the risk of insuring properties by ingesting and integrating [Flood areas](https://environment.data.gov.uk/flood-monitoring/doc/reference#flood-areas) provided by [DEFRA](https://www.gov.uk/government/organisations/department-for-environment-food-rural-affairs). On the other hand, capital market investors could better understand the risk of their investments by ingesting and integrating the [Inflation Rate Index](https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l55o/mm23) by [ONS](https://www.ons.gov.uk/). Conversely, it is crucial for regulators to have well-defined data access and data sharing patterns for conducting their regulatory activities. This clarity truly enables the economic actors that interact with government data.

**2. Securing a pro-growth and trusted data regime** — The key aspect of Mission 2 is data trust, or more broadly, adherence to data quality norms. Data quality considerations become further amplified for data sharing and data exchange use cases, where we are considering the whole ecosystem at once and quality implications transcend the boundaries of our own platform. This is precisely why we have to adopt “data sustainability.” What we mean by sustainable data products are data products that harness existing sources instead of reinventing the same or similar assets, avoid the accumulation of unnecessary data (data pollutants) and anticipate future uses.

Ungoverned and unbounded data sharing could negatively impact data quality and hinder the growth and value of data. The quality of how the data is shared should be a key consideration of data quality frameworks. For this reason, we require a solid set of standards and best practices for data sharing, with governance and quality assurance built into the process and technologies. Only this way can we ensure the sustainability of our data and secure a pro-growth trusted data regime.

**3. Transforming government’s use of data to drive efficiency and improve public services** — “By 2025 data assets are organized and supported as products, regardless of whether they’re used by internal teams or external customers… Data products continuously evolve in an agile manner to meet the needs of consumers… these products provide data solutions that can more easily and repeatedly be used to meet various business challenges and reduce the time and cost of delivering new AI-driven capabilities.” — [The data-driven enterprise of 2025](https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-data-driven-enterprise-of-2025) by McKinsey. AI and ML can be powerful enablers of digital transformation for both the public and private sectors.

AI, ML, reports and dashboards are just a few examples of data products and services that extract value from data. The quality of these solutions is directly reflected in the quality of the data used to build them and in our ability to access and leverage available data assets both internally and externally. Whilst there is a vast amount of data available for us to build new intelligent solutions for driving efficiency — for better processes, better decision-making and better policies — there are numerous barriers that can trap the data, such as legacy systems, data silos, fragmented standards, proprietary formats, etc. Modeling data solutions as data products and standardizing them to a unified format allows us to abstract away such barriers and truly leverage the data ecosystem.
**4. Ensuring the security and resilience of the infrastructure on which data relies** — Reflecting on the vision for 2025: it isn’t that far from now, and even in the not-so-distant future we will be required to rethink our approach to data — more specifically, what is our digital supply chain infrastructure, our data sharing infrastructure? Data and data assets are products and should be managed as products. If data is a product, we need a coherent and unified way of providing those products.

If data is to be used across industries and across both private and public sectors, we need an open protocol that drives adoption and habit generation. To drive adoption, the technologies we use must be resilient, robust, trusted and usable by and for all. Vendor lock-in, platform lock-in and cloud lock-in are all barriers to achieving this vision.

**5. Championing the international flow of data** — Data exchange between jurisdictions and across governments will likely be one of the most transformative applications of data at scale. Some of the world’s toughest challenges depend on the efficient exchange of data between governments — prevention of criminal activities, counterterrorism activities, net-zero emission goals, international trade; the list goes on and on. Some steps in this direction are already materializing: the U.S. federal government and the UK government have agreed on data exchange for countering serious crime activities. This is a true example of championing the international flow of data and using data for good. It is imperative that for these use cases we approach data sharing from a security-first angle. Data sharing standards and protocols need to adhere to security and privacy best practices.

While originally built with a focus on the UK government and how to better integrate data as a key asset of a modern government, these concepts apply in a much wider global public sector context. In the same spirit, the U.S. Federal Government proposed the [Federal Data Strategy](https://strategy.data.gov/overview/) as a collection of principles, practices, action steps and a timeline through which government can leverage the full value of Federal data for mission, service and the public good.

The principles are grouped into three primary topics:

**•** **Ethical governance** — Within the domain of ethics, the sharing of data is a fundamental tool for promoting transparency, accountability and explainability of decision-making. It is practically impossible to uphold ethics without some form of audit conducted by an independent party. Data (and metadata) exchange is a critical enabler for the continuous, robust processes that ensure we are using the data for good and that we are using data we can trust.

**•** **Conscious design** — These principles are strongly aligned with the idea of data sustainability. The guidelines promote forward thinking around usability and interoperability of the data and user-centric design principles for sustainable data products.

**•** **Learning culture** — Data sharing, or alternatively knowledge sharing, has an important role in building a scalable learning ecosystem and learning culture. Data is front and center of knowledge synthesis, and from a scientific angle, data proves factual knowledge. Another critical component of knowledge is the “Why?”, and data is what we need to address the “Why?” component of any decision we make — which policy to enforce, who to sanction, who to support with grants, how to improve the efficiency of government services, how to better serve citizens and society.
In contrast to the aforementioned qualitative analysis of the value of data sharing across governments, the European Commission forecasts that the economic value of the European data economy will [exceed €800 billion by 2027](https://commission.europa.eu/strategy-and-policy/priorities-2019-2024/europe-fit-digital-age/european-data-strategy_en) — roughly the same size as the [Dutch economy in 2021](https://ec.europa.eu/eurostat/databrowser/view/NAMA_10_GDP/default/table?lang=en&category=na10.nama10.nama_10_ma)! Furthermore, they predict more than 10 million data professionals in Europe alone. The technology and infrastructure to support the data society have to be accessible to all, interoperable, extensible, flexible and open. Imagine a world in which you’d need a different truck to transport products between different warehouses because each road requires a different set of tires — the whole supply chain would collapse. When it comes to data, we often experience the “one set of tires for one road” paradox. REST APIs and data exchange protocols have been proposed in the past but have failed to address the need for simplicity, ease of use and the cost of scaling up with the number of data products.

**Delta Sharing — the new data highway**

Delta Sharing provides an open protocol for secure data sharing to any computing platform. The protocol is based on the Delta data format and is agnostic to the cloud of choice.

Delta is an open source data format that avoids vendor, platform and cloud lock-in, thus fully adhering to the principles of data sustainability, the conscious design of the U.S. Federal Data Strategy and mission 4 of the UK National Data Strategy. Delta provides a governance layer on top of the Parquet data format, along with many performance optimizations not available in Parquet out of the box. The openness of the data format is a critical consideration; it is the main factor for driving habit generation and the adoption of best practices and standards.

Delta Sharing is a protocol based on a lean set of REST APIs to manage sharing, permissions and access to any data asset stored in Delta or Parquet formats. The protocol defines two main actors, the data provider (data supplier, data owner) and the data recipient (data consumer). The recipient, by definition, is agnostic to the data format at the source. Delta Sharing provides the necessary abstractions for governed data access in many different languages and tools.
Delta Sharing is uniquely positioned to answer many of the challenges of data sharing in a scalable manner within the context of highly regulated domains like the public sector:

**• Privacy and security concerns** — Personally identifiable data or otherwise sensitive or restricted data is a major part of the data exchange needs of a data-driven and modernized government. Given the sensitive nature of such data, it is paramount that the governance of data sharing is maintained in a coherent and unified manner. Any unnecessary process or technological complexity increases the risk of over-sharing data. With this in mind, Delta Sharing has been designed with [security best practices](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html) from its very inception. The protocol provides end-to-end encryption, short-lived credentials, and accessible and intuitive audit and governance features. All of these capabilities are available in a centralized way across all your Delta tables across all clouds.

**• Quality and accuracy** — Another challenge of data sharing is ensuring that the data being shared is of high quality and accuracy. Given that the underlying data is stored as Delta tables, we can guarantee that the [transactional nature of data](https://docs.delta.io/latest/concurrency-control.html#concurrency-control) is respected; Delta ensures the ACID properties of data. Furthermore, Delta supports [data constraints](https://docs.delta.io/latest/delta-constraints.html#constraints) to guarantee data quality requirements at storage. Unfortunately, other formats such as [CSV](https://en.wikipedia.org/wiki/Comma-separated_values), [CSVW](https://csvw.org/), [ORC](https://www.google.com/search?q=orc+data+format&rlz=1C5GCEM_enGB931GB932&ei=CzHRY6KqI4S78gL7hoigCw&oq=ORC+da&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQARgAMgUIABCRAjIFCAAQkQIyBQgAEIAEMgUIABCABDIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjoKCAAQRxDWBBCwAzoHCAAQsAMQQzoNCAAQ5AIQ1gQQsAMYAToPCC4Q1AIQyAMQsAMQQxgCOgwILhDIAxCwAxBDGAI6FQguEMcBENEDENQCEMgDELADEEMYAjoECAAQQzoGCAAQChBDOgoIABCxAxCDARBDOgcIABCxAxBDSgQIQRgASgQIRhgBUCxY3AJg3QxoAXABeACAAW6IAbgCkgEDMC4zmAEAoAEByAETwAEB2gEGCAEQARgJ2gEGCAIQARgI&sclient=gws-wiz-serp), [Avro](https://en.wikipedia.org/wiki/Apache_Avro), [XML](https://en.wikipedia.org/wiki/XML), etc., do not have such properties without significant additional effort. The issue is made even more pronounced by the fact that data quality cannot be ensured in the same way on both the data provider and data recipient side without an exact reimplementation of the source systems. It is critical to embed quality and metadata together with data so that quality travels together with the data. Any decoupled approach to managing data, metadata and quality separately increases the risk of sharing and can lead to undesirable outcomes.

**• Lack of standardization** — Another challenge of data sharing is the lack of standardization in how data is collected, organized and stored. This is particularly pronounced in the context of governmental activities. While governments have proposed standard formats (e.g., the Office for National Statistics [promotes usage of CSVW](https://www.ons.gov.uk/aboutus/transparencyandgovernance/datastrategy/datastandards#metadata-exchange)), aligning all private and public sector companies to the standards proposed by such initiatives is a massive challenge. Other industries may have different requirements for scalability, interoperability, format complexity, lack of structure in data, etc. Most of the currently advocated standards are lacking in multiple such aspects. Delta is the most mature candidate for assuming a central role in the standardization of data exchange formats. It has been built as a transactional and scalable data format, it supports structured, semi-structured and unstructured data, it stores the data schema and metadata together with the data, and it provides a scalable, enterprise-grade sharing protocol through Delta Sharing. Finally, Delta is one of the most popular open source projects in the ecosystem and, since May 2022, has surpassed [7 million monthly downloads](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/).
**• Cultural and organizational barriers** — These challenges can be summarized by one word: friction. Unfortunately, it’s a common problem for civil servants to struggle to obtain access to both internal and external data due to over-cumbersome processes, policies and outdated standards. The principles we use to build our data platforms and our data sharing platforms have to be self-promoting, have to drive adoption and have to generate habits that adhere to best practices.

If there is friction with standard adoption, the only way to ensure standards are respected is by enforcement, and that itself is yet another barrier to achieving data sustainability. Organizations have already adopted Delta Sharing in both the private and public sectors. For example, [U.S. Citizenship and Immigration Services](https://www.uscis.gov/) (USCIS) uses Delta Sharing to satisfy several [interagency data-sharing](https://delta.io/blog/2022-12-08-data-sharing-across-government-delta-sharing/) requirements. Similarly, Nasdaq describes Delta Sharing as the “[future of financial data sharing](https://www.nasdaq.com/articles/delta-sharing-protocol%3A-the-evolution-of-financial-data-sharing-2021-05-26),” and that future is open and governed.

**• Technical challenges** — Federation at the government scale, or even further across multiple industries and geographies, poses technical challenges. Each organization within this federation owns its platform and drives technological, architectural, platform and tooling choices. How can we promote interoperability and data exchange in this vast, diverse technological ecosystem? The data is the only viable integration vehicle. As long as the data formats we utilize are scalable, open and governed, we can use them to abstract away from individual platforms and their intrinsic complexities.

The Delta format and Delta Sharing solve this wide array of requirements and challenges in a scalable, robust and open way. This positions Delta Sharing as the strongest choice for unification and simplification of the protocol and mechanism through which we share data across both private and public sectors.

**Data sharing through data clean rooms**

Taking the complexities of data sharing within a highly regulated space and the public sector one step further — what if we need to share the knowledge contained in the data without ever granting direct access to the source data to external parties? Such requirements may prove achievable and desirable where the data sharing risk appetite is very low.

In many public sector contexts, there are concerns that combining the data that describes citizens could lead to a big brother scenario where simply too much data about an individual is concentrated in a single data asset. If it were to fall into the wrong hands, such a hypothetical data asset could lead to immeasurable consequences for individuals, and trust in public sector services could erode. On the other hand, the value of a 360 view of the citizen could accelerate important decision-making and immensely improve the quality of policies and services provided to citizens.

[Data clean rooms](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html) address this particular need. With data clean rooms you can share data with third parties in a privacy-safe environment. With Unity Catalog, you can enable fine-grained access controls on the data and meet your privacy requirements. In this architecture, the data participants never get access to the raw data. The only outputs from the clean rooms are those data assets generated in a pre-agreed, governed and fully controlled manner that ensures compliance with the requirements of all parties involved.
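As a rough illustration of the kind of fine-grained, Unity Catalog-governed access such a collaboration relies on, the sketch below grants an external group access only to a pre-agreed output table and narrows what it can see through a dynamic view. The catalog, schema, table and principal names are hypothetical, and this is only a fragment of a clean room setup, not the full architecture.

```python
# Minimal sketch: Unity Catalog fine-grained access control around a shared output.
# Catalog/schema/table and principal names are hypothetical examples.
spark.sql("GRANT USE CATALOG ON CATALOG gov_exchange TO `partner_analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA gov_exchange.cleanroom_outputs TO `partner_analysts`")

# Expose only the pre-agreed, aggregated output table -- never the raw source data.
spark.sql("""
    GRANT SELECT ON TABLE gov_exchange.cleanroom_outputs.flood_risk_summary
    TO `partner_analysts`
""")

# A dynamic view can further restrict rows to members of the partner group.
spark.sql("""
    CREATE OR REPLACE VIEW gov_exchange.cleanroom_outputs.flood_risk_summary_shared AS
    SELECT region, risk_band, property_count
    FROM gov_exchange.cleanroom_outputs.flood_risk_summary
    WHERE is_account_group_member('partner_analysts')
""")
```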
Finally, data clean rooms and Delta Sharing can address hybrid on-premises/off-premises deployments, where the data with the most restricted access remains on premises while less restricted data is free to leverage the power of the cloud offerings. In such a scenario, there may be a need to combine the power of the cloud with the restricted data to solve advanced use cases where capabilities are unavailable on the on-premises data platforms. Data clean rooms can ensure that no physical copies of the raw restricted data are created, that results are produced within the clean room’s controlled environment, and that results are shared back to the on-premises environment (if the results maintain the restricted access within the defined policies) or are forwarded to any other compliant and predetermined destination system.
**Citizen value of data sharing**

Every decision made by the government is a decision that affects its citizens. Whether the decision is a change to a policy, granting a benefit or preventing crime, it can significantly influence the quality of our society. Data is a key factor in making the right decisions and justifying the decisions made. Simply put, we can’t expect high-quality decisions without high-quality data and a complete view of the data (within the permitted context). Without data sharing, we will remain in a highly fragmented position where our ability to make those decisions is severely limited or even completely compromised. In this blog, we have covered several technological solutions available within the lakehouse that can derisk and accelerate how government leverages the data ecosystem in a sustainable and scalable way.

For more details on the industry use cases that Delta Sharing is addressing, please consult the [A New Approach to Data Sharing](https://www.databricks.com/product/unity-catalog) eBook.

**Start experimenting with these free Databricks notebooks.**

**SECTION**

# 03

### Ready-to-Use Notebooks and Data Sets

This section includes several Solution Accelerators — free, ready-to-use examples of data solutions from different industries ranging from retail to manufacturing and healthcare. Each of the following scenarios includes notebooks with code and step-by-step instructions to help you get started. Get hands-on experience with the Databricks Lakehouse Platform by trying the following for yourself:

**Digital Twins**
Leverage digital twins — virtual representations of devices and objects — to optimize operations and gain insights
**[Explore the Solution](https://databricks.com/solutions/accelerators/digital-twins)**

**Overall Equipment Effectiveness**
Ingest equipment sensor data for metric generation and data-driven decision-making
**[Explore the Solution](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)**

**Real-time point of sale analytics**
Calculate current inventories for various products across multiple store locations with Delta Live Tables
**[Explore the Solution](https://www.databricks.com/solutions/accelerators/real-time-point-of-sale-analytics)**

**Recommendation Engines for Personalization**
Improve customers’ user experience and conversion with personalized recommendations
**[Explore the Solution](https://www.databricks.com/solutions/accelerators/recommendation-engines)**

**Understanding Price Transparency Data**
Efficiently ingest large healthcare data sets to create price transparency for a better understanding of healthcare costs
**[Explore the Solution](https://www.databricks.com/solutions/accelerators/price-transparency-data)**

Additional Solution Accelerators with ready-to-use notebooks can be found here: **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)**
**SECTION**

# 04

### Case Studies

**4.1** Akamai
**4.2** Grammarly
**4.3** Honeywell
**4.4** Wood Mackenzie
**4.5** Rivian
**4.6** AT&T

SECTION 4.1

**Akamai delivers real-time security analytics using Delta Lake**

###### <1
**Min ingestion time, reduced from 15 min**

###### >85%
**Of queries have a response time of 7 seconds or less**

**INDUSTRY**
[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)

**SOLUTION**
[Threat Detection](https://databricks.com/solutions/accelerators/threat-detection)

**PLATFORM USE CASE**
Delta Lake, Data Streaming, Photon, [Databricks SQL](https://databricks.com/product/databricks-sql)

**CLOUD**
[Azure](https://www.databricks.com/product/azure)

Akamai runs a pervasive, highly distributed content delivery network (CDN). Its CDN uses approximately 345,000 servers in more than 135 countries and over 1,300 networks worldwide to route internet traffic for some of the largest enterprises in media, commerce, finance, retail and many other industries. About 30% of the internet’s traffic flows through Akamai servers. Akamai also provides cloud security solutions.

In 2018, the company launched a web security analytics tool that offers Akamai customers a single, unified interface for assessing a wide range of streaming security events and performing analysis of those events. The web analytics tool helps Akamai customers take informed actions on security events in real time. Akamai is able to stream massive amounts of data and meet the strict SLAs it provides to customers by leveraging Delta Lake and the Databricks Lakehouse Platform for the web analytics tool.

**Ingesting and streaming enormous amounts of data**

Akamai’s web security analytics tool ingests approximately 10GB of data related to security events per second. Data volume can increase significantly when retail customers conduct a large number of sales — or on big shopping days like Black Friday or Cyber Monday. The web security analytics tool stores several petabytes of data for analysis purposes. Those analyses are performed to protect Akamai’s customers and provide them with the ability to explore and query security events on their own.
The web security analytics tool initially relied on an on-premises architecture running Apache Spark™ on Hadoop. Akamai offers strict service level agreements (SLAs) to its customers of 5 to 7 minutes from when an attack occurs until it is displayed in the tool. The company sought to improve ingestion and query speed to meet those SLAs. “Data needs to be as real-time as possible so customers can see what is attacking them,” says Tomer Patel, Engineering Manager at Akamai. “Providing queryable data to customers quickly is critical. We wanted to move away from on-prem to improve performance and our SLAs so the latency would be seconds rather than minutes.”

**Delta Lake allows us to not only query the data better but to also acquire an increase in the data volume. We’ve seen an 80% increase in traffic and data in the last year, so being able to scale fast is critical.**

**Tomer Patel**
Engineering Manager, Akamai

After conducting proofs of concept with several companies, Akamai chose to base its streaming analytics architecture on Spark and the Databricks Lakehouse Platform. “Because of our scale and the demands of our SLA, we determined that Databricks was the right solution for us,” says Patel. “When we consider storage optimization and data caching, if we went with another solution, we couldn’t achieve the same level of performance.”

**Improving speed and reducing costs**

Today, the web security analytics tool ingests and transforms data, stores it in cloud storage, and sends the location of the file via Kafka. It then uses a Databricks Job as the ingest application. Delta Lake, the open source storage format at the base of the Databricks Lakehouse Platform, supports real-time querying on the web security analytics data. Delta Lake also enables Akamai to scale quickly. “Delta Lake allows us to not only query the data better but to also acquire an increase in the data volume,” says Patel. “We’ve seen an 80% increase in traffic and data in the last year, so being able to scale fast is critical.”
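The paragraph above describes the ingest pattern at a high level. The sketch below is an illustrative Structured Streaming job following that shape, not Akamai's actual code: it consumes file locations from a Kafka topic and appends the referenced files to a Delta table for querying. The topic, brokers, paths and table name are hypothetical.

```python
# Illustrative sketch of the described pattern: file locations arrive on a Kafka
# topic, and a Databricks job ingests the referenced files into a Delta table.
# Topic, brokers, paths and table name are hypothetical examples.
from pyspark.sql import functions as F

file_pointers = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "broker-1:9092")
    .option("subscribe", "security-event-files")
    .load()
    .select(F.col("value").cast("string").alias("file_path"))
)

def ingest_batch(batch_df, batch_id):
    # Each micro-batch carries a small set of file pointers, not the events themselves.
    paths = [row.file_path for row in batch_df.collect()]
    if paths:
        events = spark.read.json(paths)
        events.write.format("delta").mode("append").saveAsTable("security.web_events")

(file_pointers.writeStream
    .foreachBatch(ingest_batch)
    .option("checkpointLocation", "/checkpoints/security_web_events")
    .start())
```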
Akamai also uses Databricks SQL (DBSQL) and Photon, which provide extremely fast query performance. Patel added that Photon provided a significant boost to query performance. Overall, Databricks’ streaming architecture combined with DBSQL and Photon enables Akamai to achieve real-time analytics, which translates to real-time business benefits.

Patel says he likes that Delta Lake is open source, as the company has benefitted from a community of users working to improve the product. “The fact that Delta Lake is open source and there’s a big community behind it means we don’t need to implement everything ourselves,” says Patel. “We benefit from fixed bugs that others have encountered and from optimizations that are contributed to the project.” Akamai worked closely with Databricks to ensure Delta Lake can meet the scale and performance requirements Akamai defined. These improvements have been contributed back to the project (many of which were made available as part of Delta Lake 2.0), so any user running Delta Lake now benefits from the technology being tested at such a large scale in a real-world production scenario.
**Meeting aggressive requirements for scale, reliability and performance**

Using Spark Structured Streaming on the Databricks Lakehouse Platform enables the web security analytics tool to stream vast volumes of data and provide low-latency, real-time analytics-as-a-service to Akamai’s customers. That way Akamai is able to make security event data available to customers within the SLA of 5 to 7 minutes from when an attack occurs. “Our focus is performance, performance, performance,” says Patel. “The platform’s performance and scalability are what drive us.”

Using the Databricks Lakehouse Platform, it now takes under 1 minute to ingest the security event data. “Reducing ingestion time from 15 minutes to under 1 minute is a huge improvement,” says Patel. “It benefits our customers because they can see the security event data faster and they have a view of what exactly is happening as well as the capability to filter all of it.”

Akamai’s biggest priority is to provide customers with a good experience and fast response times. To date, Akamai has moved about 70% of security event data from its on-prem architecture to Databricks, and the SLA for customer query and response time has improved significantly as a result. “Now, with the move to Databricks, our customers experience much better response time, with over 85% of queries completing under 7 seconds.” Providing that kind of real-time data means Akamai can help its customers stay vigilant and maintain an optimal security configuration.

SECTION 4.2

**Grammarly uses Databricks Lakehouse to improve user experience**

###### 110%
**Faster querying, at 10% of the cost to ingest, than a data warehouse**

###### 5 billion
**Daily events available for analytics in under 15 minutes**

Grammarly’s mission is to improve lives by improving communication. The company’s trusted AI-powered communication assistance provides real-time suggestions to help individuals and teams write more confidently and achieve better results. Its comprehensive offerings — [Grammarly Premium](https://www.grammarly.com/premium), [Grammarly Business](https://www.grammarly.com/business), [Grammarly for Education](https://www.grammarly.com/edu) and [Grammarly for Developers](https://developer.grammarly.com/) — deliver leading communication support wherever writing happens. As the company grew over the years, its legacy, homegrown analytics system made it challenging to evaluate large data sets quickly and cost-effectively.

By migrating to the Databricks Lakehouse Platform, Grammarly is now able to sustain a flexible, scalable and highly secure analytics platform that helps 30 million people and 50,000 teams worldwide write more effectively every day.
**INDUSTRY**
[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)

**SOLUTION**
Recommendation Engines, Advertising Effectiveness, Customer Lifetime Value

**PLATFORM USE CASE**
Lakehouse, Delta Lake, Unity Catalog, [Machine Learning, ETL](https://www.databricks.com/product/machine-learning)

**CLOUD**
[AWS](https://www.databricks.com/product/aws)

**Harnessing data to improve communications for millions of users and thousands of teams**

When people use Grammarly’s AI communication assistance, they receive suggestions to help them improve multiple dimensions of communication, including spelling and grammar correctness, clarity and conciseness, word choice, style, and tone. Grammarly receives feedback when users accept, reject or ignore its suggestions through app-created events, which total about 5 billion events per day.

Historically, Grammarly relied on a homegrown legacy analytics platform and leveraged an in-house SQL-like language that was time-intensive to learn and made it challenging to onboard new hires. As the company grew, Grammarly data analysts found that the platform did not sufficiently meet the needs of its essential business functions, especially marketing, sales and customer success. Analysts found themselves copying and pasting data from spreadsheets because the existing system couldn’t effectively ingest the external data needed to answer questions such as, “Which marketing channel delivers the highest ROI?” Reporting proved challenging because the existing system didn’t support Tableau dashboards, and company leaders and analysts needed to ensure they could make decisions quickly and confidently.

**Databricks Lakehouse has given us the flexibility to unleash our data without compromise. That flexibility has allowed us to speed up analytics to a pace we’ve never achieved before.**

**Chris Locklin**
Engineering Manager, Data Platforms, Grammarly

Grammarly also sought to unify its data warehouses in order to scale and improve data storage and query capabilities. As it stood, large Amazon EMR clusters ran 24/7 and drove up costs. With the various data sources, the team also needed to maintain access control. “Access control in a distributed file system is difficult, and it only gets more complicated as you ingest more data sources,” says Chris Locklin, Engineering Manager, Data Platforms at Grammarly. Meanwhile, reliance on a single streaming workflow made collaboration among teams challenging. Data silos emerged as different business areas implemented analytics tools individually. “Every team decided to solve their analytics needs in the best way they saw fit,” says Locklin. “That created challenges in consistency and knowing which data set was correct.”

As its data strategy was evolving, Grammarly’s priority was to get the most out of analytical data while keeping it secure. This was crucial because security is Grammarly’s number-one priority and most important feature, both in how it protects its users’ data and how it ensures its own company data remains secure. To accomplish that, Grammarly’s data platform team sought to consolidate data and unify the company on a single platform. That meant sustaining a highly secure infrastructure that could scale alongside the company’s growth, improving ingestion flexibility, reducing costs and fueling collaboration.

**Improving analytics, visualization and decision-making with the lakehouse**

After conducting several proofs of concept to enhance its infrastructure, Grammarly migrated to the Databricks Lakehouse Platform. Bringing all the analytical data into the lakehouse created a central hub for all data producers and consumers across Grammarly, with Delta Lake at the core.
Using the lakehouse architecture, data analysts within Grammarly now have a consolidated interface for analytics, which leads to a single source of truth and confidence in the accuracy and availability of all data managed by the data platform team. Across the organization, teams are using Databricks SQL to conduct queries within the platform on both internally generated product data and external data from digital advertising platform partners. Now, they can easily connect to Tableau and create dashboards and visualizations to present to executives and key stakeholders.

“Security is of utmost importance at Grammarly, and our team’s number-one objective is to own and protect our analytical data,” says Locklin. “Other companies ask for your data, hold it for you, and then let you perform analytics on it. Just as Grammarly ensures our users’ data always remains theirs, we wanted to ensure our company data remained ours. Grammarly’s data stays inside of Grammarly.”

With its data consolidated in the lakehouse, different areas of Grammarly’s business can now analyze data more thoroughly and effectively. For example, Grammarly’s marketing team uses advertising to attract new business. Using Databricks, the team can consolidate data from various sources to extrapolate a user’s lifetime value, compare it with customer acquisition costs and get rapid feedback on campaigns. Elsewhere, data captured from user interactions flows into a set of tables used by analysts for ad hoc analysis to inform and improve the user experience.

By consolidating data onto one unified platform, Grammarly has eliminated data silos. “The ability to bring all these capabilities, data processing and analysis under the same platform using Databricks is extremely valuable,” says Sergey Blanket, Head of Business Intelligence at Grammarly. “Doing everything from ETL and engineering to analytics and ML under the same umbrella removes barriers and makes it easy for everyone to work with the data and each other.”

To manage access control, enable end-to-end observability and monitor data quality, Grammarly relies on the data lineage capabilities within Unity Catalog. “Data lineage allows us to effectively monitor usage of our data and ensure it upholds the standards we set as a data platform team,” says Locklin. “Lineage is the last crucial piece for access control. It allows analysts to leverage data to do their jobs while adhering to all usage standards and access controls, even when recreating tables and data sets in another environment.”

**Faster time to insight drives more intelligent business decisions**

Using the Databricks Lakehouse Platform, Grammarly’s engineering teams now have a tailored, centralized platform and a consistent data source across the company, resulting in greater speed and efficiency and reduced costs. The lakehouse architecture has led to 110% faster querying, at 10% of the cost to ingest, than a data warehouse. Grammarly can now make its 5 billion daily events available for analytics in under 15 minutes rather than 4 hours, enabling low-latency data aggregation and query optimization. This allows the team to quickly receive feedback about new features being rolled out and understand if they are being adopted as expected. Ultimately, it helps them understand how groups of users engage with the UX, improving the experience and ensuring features and product releases bring the most value to users. “Everything my team does is focused on creating a rich, personalized experience that empowers people to communicate more effectively and achieve their potential,” says Locklin.
f43621b04d372aa556042b5b86d86dfd**Faster time to insight drives more intelligent**\n", + "**business decisions**\n", + "\n", + "Using the Databricks Lakehouse Platform, Grammarly’s engineering teams now\n", + "have a tailored, centralized platform and a consistent data source across the\n", + "company, resulting in greater speed and efficiency and reduced costs. The\n", + "lakehouse architecture has led to 110% faster querying, at 10% of the cost to\n", + "ingest, than a data warehouse. Grammarly can now make its 5 billion daily events\n", + "available for analytics in under 15 minutes rather than 4 hours, enabling lowlatency data aggregation and query optimization. This allows the team to quickly\n", + "\n", + "receive feedback about new features being rolled out and understand if they are\n", + "being adopted as expected. Ultimately, it helps them understand how groups\n", + "of users engage with the UX, improving the experience and ensuring features\n", + "and product releases bring the most value to users. “Everything my team does\n", + "is focused on creating a rich, personalized experience that empowers people to\n", + "communicate more effectively and achieve their potential,” says Locklin.\n", + "\n", + "\n", + "Moving to the lakehouse architecture also solved the challenge of access control\n", + "over distributed file systems, while Unity Catalog enabled fine-grained, rolebased access controls and real-time data lineage. “Unity Catalog gives us the\n", + "ability to manage file permissions with more flexibility than a database would\n", + "allow,” says Locklin. “It solved a problem my team couldn’t solve at scale. While\n", + "using Databricks allows us to keep analytical data in-house, Unity Catalog helps\n", + "us continue to uphold the highest standards of data protection by controlling\n", + "access paradigms inside our data. That opens a whole new world of things that\n", + "we can do.”\n", + "\n", + "Ultimately, migrating to the Databricks Lakehouse Platform has helped\n", + "Grammarly to foster a data-driven culture where employees get fast access\n", + "to analytics without having to write complex queries, all while maintaining\n", + "Grammarly’s enterprise-grade security practices. “Our team’s mission is to help\n", + "Grammarly make better, faster business decisions,” adds Blanket. “My team\n", + "would not be able to effectively execute on that mission if we did not have a\n", + "platform like Databricks available to us.” Perhaps most critically, migrating off its\n", + "rigid legacy infrastructure gives Grammarly the adaptability to do more while\n", + "knowing the platform will evolve as its needs evolve. “Databricks has given us the\n", + "flexibility to unleash our data without compromise,” says Locklin. “That flexibility\n", + "has allowed us to speed up analytics to a pace we’ve never achieved before.”\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.3\n", + "**Honeywell selects Delta Live Tables for streaming data**\n", + "\n", + "Companies are under growing pressure to reduce energy use, while at the same time\n", + "\n", + "they are looking to lower costs and improve efficiency. Honeywell delivers industry-\n", + "\n", + "specific solutions that include aerospace products and services, control technologies\n", + "\n", + "for buildings and industry, and performance materials globally. 
Honeywell’s Energy and Environmental Solutions division uses IoT sensors and other technologies to help businesses worldwide manage energy demand, reduce energy consumption and carbon emissions, optimize indoor air quality, and improve occupant well-being.

Accomplishing this requires Honeywell to collect vast amounts of data. Using Delta Live Tables on the Databricks Lakehouse Platform, Honeywell’s data team can now ingest billions of rows of sensor data into Delta Lake and automatically build SQL endpoints for real-time queries and multilayer insights into data at scale — helping Honeywell improve how it manages data and extract more value from it, both for itself and for its customers.

**INDUSTRY**
[Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)

**PLATFORM USE CASE**
Lakehouse, Delta Lake, Delta Live Tables

**CLOUD**
[Azure](https://databricks.com/product/azure)

**Databricks helps us pull together many different data sources, do aggregations, and bring the significant amount of data we collect from our buildings under control so we can provide customers value.**

**Dr. Chris Inkpen**
Global Solutions Architect, Honeywell Energy and Environmental Solutions

-----

**Processing billions of IoT data points per day**
Honeywell’s solutions and services are used in millions of buildings around the world. Helping its customers create buildings that are safe, more sustainable and productive can require thousands of sensors per building. Those sensors monitor key factors such as temperature, pressure, humidity and air quality. In addition to the data collected by sensors inside a building, data is also collected from outside, such as weather and pollution data. Another data set consists of information about the buildings themselves — such as building type, ownership, floor plan, square footage of each floor and square footage of each room. That data set is combined with the two disparate data streams, adding up to a lot of data across multiple structured and unstructured formats, including images and video streams, telemetry data, event data, etc. At peaks, Honeywell ingests anywhere between 200 to 1,000 events per second for any building, which equates to billions of data points per day. Honeywell’s existing data infrastructure was challenged to meet such demand. It also made it difficult for Honeywell’s data team to query and visualize its disparate data so it could provide customers with fast, high-quality information and analysis.

**ETL simplified: high-quality, reusable data pipelines**

With Delta Live Tables (DLT) on the Databricks Lakehouse Platform, Honeywell’s data team can now ingest billions of rows of sensor data into Delta Lake and automatically build SQL endpoints for real-time queries and multilayer insights into data at scale. “We didn’t have to do anything to get DLT to scale,” says Dr. Chris Inkpen, Global Solutions Architect at Honeywell Energy and Environmental Solutions. “We give the system more data, and it copes. Out of the box, it’s given us the confidence that it will handle whatever we throw at it.”

Honeywell credits the Databricks Lakehouse Platform for helping it to unify its vast and varied data — batch, streaming, structured and unstructured — into one platform. “We have many different data types. The Databricks Lakehouse Platform allows us to use things like Apache Kafka and Auto Loader to load and process multiple types of data and treat everything as a stream of data, which is awesome. Once we’ve got structured data from unstructured data, we can write standardized pipelines.”

Honeywell data engineers can now build and leverage their own ETL pipelines with Delta Live Tables and gain insights and analytics quickly. ETL pipelines can be reused regardless of environment, and data can run in batches or streams. It’s also helped Honeywell’s data team transition from a small team to a larger team. “When we wrote our first few pipelines before DLT existed, only one person could work in one part of the functionality. Now that we’ve got DLT and the ability to have folders with common functionality, we’ve got a really good platform where we can easily spin off different pipelines.”

DLT also helped Honeywell establish standard log files to monitor and cost-justify its product pipelines. “Utilizing DLT, we can analyze which parts of our pipeline need optimization,” says Inkpen. “With standard pipelines, that was much more chaotic.”
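The DLT pattern described above is simple to picture in code: each table in the pipeline is a decorated Python function, and Auto Loader handles incremental ingestion of raw files. The snippet below is a minimal, hypothetical sketch of that shape — the table names, volume path and expectation rule are assumptions for illustration, not Honeywell’s actual pipeline.

```python
# Illustrative sketch only — names, paths and the data-quality rule are assumptions.
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw IoT sensor events ingested incrementally with Auto Loader")
def raw_sensor_events():
    # `spark` is provided by the DLT runtime; "cloudFiles" is Auto Loader
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/Volumes/<catalog>/<schema>/raw_sensor_events/")
    )

@dlt.table(comment="Cleaned sensor readings ready for downstream SQL analytics")
@dlt.expect_or_drop("has_reading", "temperature IS NOT NULL")
def clean_sensor_events():
    # Streaming read of the upstream DLT table; rows failing the expectation are dropped
    return (
        dlt.read_stream("raw_sensor_events")
        .withColumn("ingested_at", F.current_timestamp())
    )
```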
-----

**Enabling ease, simplicity and scalability across the infrastructure**

Delta Live Tables has helped Honeywell’s data team consistently query complex data while offering simplicity of scale. It also enables end-to-end data visualization of Honeywell’s data streams as they flow into its infrastructure, are transformed, and then flow out. “Ninety percent of our ETL is now captured in diagrams, so that’s helped considerably and improves data governance. DLT encourages — and almost enforces — good design,” says Inkpen.

Using the lakehouse as a shared workspace has helped promote teamwork and collaboration at Honeywell. “The team collaborates beautifully now, working together every day to divvy up the pipeline into their own stories and workloads,” says Inkpen.

Meanwhile, the ability to manage streaming data with low latency and better throughput has improved accuracy and reduced costs. “Once we’ve designed something using DLT, we’re pretty safe from scalability issues — certainly a hundred times better than if we hadn’t written it in DLT,” says Inkpen. “We can then go back and look at how we can take a traditional job and make it more performant and less costly. We’re in a much better position to try and do that from DLT.”

Using Databricks and DLT also helps the Honeywell team perform with greater agility, which allows them to innovate faster while empowering developers to respond to user requirements almost immediately. “Our previous architecture made it impossible to know what bottlenecks we had and what we needed to scale. Now we can do data science in near real-time.”

Ultimately, Honeywell can now more quickly provide its customers with the data and analysis they need to make their buildings more efficient, healthier and safer for occupants. “I’m continuously looking for ways to improve our lifecycles, time to market, and data quality,” says Inkpen. “Databricks helps us pull together many different data sources, do aggregations, and bring the significant amount of data we collect from our buildings under control so we can provide customers value.”

**Ready to get started? Learn more about [Delta Live Tables here](https://www.databricks.com/product/delta-live-tables).**
-----

SECTION 4.4
**Wood Mackenzie helps customers transition to a more sustainable future**
###### 12 Billion
**Data points processed each week**

###### 80-90%
**Reduction in processing time**

###### Cost Savings
**In operations through workflow automation**

Wood Mackenzie offers customized consulting and analysis for a wide range of clients in the energy and natural resources sectors. Founded in Edinburgh, the company first cultivated deep expertise in upstream oil and gas, then broadened its focus to deliver detailed insight for every interconnected sector of the energy, chemicals, metals and mining industries.

Today it sees itself playing an important role in the transition to a more sustainable future. Using Databricks Workflows to automate ETL pipelines helps Wood Mackenzie ingest and process massive amounts of data. Using a common workflow provided higher visibility to engineering team members, encouraging better collaboration. With an automated, transparent workflow in place, the team saw improved productivity and data quality and an easier path to fix pipeline issues when they arise.

**INDUSTRY**
[Energy and Utilities](https://www.databricks.com/solutions/industries/oil-and-gas)

**PLATFORM USE CASE**
Lakehouse, Workflows

**CLOUD**
[AWS](https://www.databricks.com/product/aws)

-----

**Delivering insights to the energy industry**

Fulfilling Wood Mackenzie’s mission, the Lens product is a data analytics platform built to deliver insights at key decision points for customers in the energy sector. Feeding into Lens are vast amounts of data collected from various data sources and sensors used to monitor energy creation, oil and gas production, and more. Those data sources update about 12 billion data points every week that must be ingested, cleaned and processed as part of the input for the Lens platform. Yanyan Wu, Vice President of Data at Wood Mackenzie, manages a team of big data professionals that build and maintain the ETL pipeline that provides input data for Lens. The team is leveraging the Databricks Lakehouse Platform and uses Apache Spark™ for parallel processing, which provides greater performance and scalability benefits compared to an earlier single-node system working sequentially. “We saw a reduction of 80-90% in data processing time, which results in us providing our clients with more up-to-date, more complete and more accurate data,” says Wu.

**Our mission is to transform the way we power the planet. Our clients in the energy sector need data, consulting services and research to achieve that transformation. Databricks Workflows gives us the speed and flexibility to deliver the insights our clients need.**

**Yanyan Wu**
Vice President of Data, Wood Mackenzie

**Improved collaboration and transparency with a common workflow**

The data pipeline managed by the team includes several stages for standardizing and cleaning raw data, which can be structured or unstructured and may be in the form of PDFs or even handwritten notes.
Different members of the data team are responsible for different parts of the pipeline, and there is a dependency between the processing stages each team member owns. Using [Databricks Workflows](https://www.databricks.com/product/workflows), the team defined a common workstream that the entire team uses. Each stage of the pipeline is implemented in a Python notebook, which is run as a job in the main workflow.

Each team member can now see exactly what code is running on each stage, making it easy to find the cause of any issue. Knowing who owns the part of the pipeline that originated the problem makes fixing issues much faster. “Without a common workflow, different members of the team would run their notebooks independently, not knowing that failure in their run affected stages downstream,” says Meng Zhang, Principal Data Analyst at Wood Mackenzie. “When trying to rerun notebooks, it was hard to tell which notebook version was initially run and the latest version to use.”

-----

Using Workflows’ alerting capabilities to notify the team when a workflow task fails ensures everyone knows a failure occurred and allows the team to work together to resolve the issue quickly. The definition of a common workflow created consistency and transparency that made collaboration easier. “Using Databricks Workflows allowed us to encourage collaboration and break up the walls between different stages of the process,” explains Wu. “It allowed us all to speak the same language.”

Creating transparency and consistency is not the only advantage the team saw. Using Workflows to automate notebook runs also led to cost savings compared to running interactive notebooks manually.

**Improved code development productivity**

The team’s ETL pipeline development process involves iteration on PySpark notebooks. Leveraging [interactive notebooks](https://www.databricks.com/product/collaborative-notebooks) in the Databricks UI makes it easy for data professionals on the team to manually develop and test a notebook. Because Databricks Workflows supports running notebooks as a task type (along with Python files, JAR files and other types), when the code is ready for production, it’s easy and cost-effective to automate it by adding it to a workflow. The workflow can then be easily revised by adding or removing any steps to or from the defined flow. This way of working keeps the benefit of manually developing notebooks with the interactive notebook UI while leveraging the power of automation, which reduces potential issues that may happen when running notebooks manually.

The team has gone even further in increasing productivity by developing a CI/CD process. “By connecting our source control code repository, we know the workflow always runs the latest code version we committed to the repo,” explains Zhang. “It’s also easy to switch to a development branch to develop a new feature, fix a bug and run a development workflow. When the code passes all tests, it is merged back to the main branch and the production workflow is automatically updated with the latest code.”

Going forward, Wood Mackenzie plans to optimize its use of Databricks Workflows to automate machine learning processes such as model training, model monitoring and handling model drift. The firm uses ML to improve its data quality and extract insights to provide more value to its clients. “Our mission is to transform how we power the planet,” Wu says. “Our clients in the energy sector need data, consulting services and research to achieve that transformation. Databricks Workflows gives us the speed and flexibility to deliver the insights our clients need.”
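The pattern Wood Mackenzie describes — one notebook task per pipeline stage, explicit dependencies between stages, and an alert when a task fails — maps directly onto a multi-task Databricks job. The sketch below shows one way to define such a job through the Jobs API 2.1; the job name, notebook paths, cluster ID, workspace host, token and e-mail address are placeholders for illustration, not details from the case study.

```python
# Hypothetical sketch: one notebook task per pipeline stage, a dependency between
# stages, and an e-mail alert on failure. All identifiers below are placeholders.
import requests

job_spec = {
    "name": "lens_etl_pipeline",
    "email_notifications": {"on_failure": ["data-team@example.com"]},
    "tasks": [
        {
            "task_key": "standardize_raw_data",
            "notebook_task": {"notebook_path": "/Repos/lens/etl/01_standardize"},
            "existing_cluster_id": "<cluster-id>",
        },
        {
            "task_key": "clean_and_publish",
            "depends_on": [{"task_key": "standardize_raw_data"}],
            "notebook_task": {"notebook_path": "/Repos/lens/etl/02_clean_and_publish"},
            "existing_cluster_id": "<cluster-id>",
        },
    ],
}

resp = requests.post(
    "https://<workspace-host>/api/2.1/jobs/create",
    headers={"Authorization": "Bearer <personal-access-token>"},
    json=job_spec,
)
resp.raise_for_status()
print(resp.json())  # returns {"job_id": ...} on success
```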
-----

SECTION 4.5
**Rivian redefines driving experience with the Databricks Lakehouse**

###### 250 platform users
**A 50x increase from a year ago**

Rivian is preserving the natural world for future generations with revolutionary Electric Adventure Vehicles (EAVs). With over 25,000 EAVs on the road generating multiple terabytes of IoT data per day, the company is using data insights and machine learning to improve vehicle health and performance. However, with legacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance — slowing its ability to be truly data driven.

Since moving to the Databricks Lakehouse Platform, Rivian can now understand how a vehicle is performing and how this impacts the driver using it. Equipped with these insights, Rivian is innovating faster, reducing costs, and ultimately, delivering a better driving experience to customers.

**INDUSTRY**
[Manufacturing](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)

**SOLUTION**
Predictive Maintenance, Scaling ML Models for IoT, Data-Driven ESG

**PLATFORM**
[Lakehouse](https://www.databricks.com/product/data-lakehouse), [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks), [Unity Catalog](https://www.databricks.com/product/unity-catalog)

**CLOUD**
[AWS](https://www.databricks.com/product/aws)

-----

**Struggling to democratize data on a legacy platform**

Building a world that will continue to be enjoyed by future generations requires a shift in the way we operate. At the forefront of this movement is Rivian — an electric vehicle manufacturer focused on shifting our planet’s energy and transportation systems entirely away from fossil fuel. Today, Rivian’s fleet includes personal vehicles and involves a partnership with Amazon to deliver 100,000 commercial vans. Each vehicle uses IoT sensors and cameras to capture petabytes of data ranging from how the vehicle drives to how various parts function. With all this data at its fingertips, Rivian is using machine learning to improve the overall customer experience with predictive maintenance so that potential issues are addressed before they impact the driver.

Before Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that decreased output, prevented collaboration and increased operational costs. It had 30 to 50 large and operationally complicated compute clusters at any given time, which was costly. Not only was the system difficult to manage, but the company experienced frequent cluster outages as well, forcing teams to dedicate more time to troubleshooting than to data analysis. Additionally, data silos created by disjointed systems slowed the sharing of data, which further contributed to productivity issues. Required data languages and specific expertise of toolsets created a barrier to entry that limited developers from making full use of the data available. Jason Shiverick, Principal Data Scientist at Rivian, said the biggest issue was the data access. “I wanted to open our data to a broader audience of less technical users so they could also leverage data more easily.”

Rivian knew that once its EAVs hit the market, the amount of data ingested would explode. In order to deliver the reliability and performance it promised, Rivian needed an architecture that would not only democratize data access, but also provide a common platform to build innovative solutions that can help ensure a reliable and enjoyable driving experience.

**Databricks Lakehouse empowers us to lower the barrier of entry for data access across our organization so we can build the most innovative and reliable electric vehicles in the world.**

**Wassym Bensaid**
Vice President of Software Development, Rivian
-----

**Predicting maintenance issues with Databricks Lakehouse**

Rivian chose to modernize its data infrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all of its data into a common view for downstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver actionable insights for different use cases, from predictive maintenance to smarter product development. Venkat Sivasubramanian, Senior Director of Big Data at Rivian, says, “We were able to build a culture around an open data platform that provided a system for really democratizing data and analysis in an efficient way.” Databricks’ flexible support of all programming languages and seamless integration with a variety of toolsets eliminated access roadblocks and unlocked new opportunities. Wassym Bensaid, Vice President of Software Development at Rivian, explains, “Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build performant data pipelines, and extract actionable business and product insights via visual dashboards.”

Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric accelerometer data to understand all EAV motions. This core recording data includes information about pitch, roll, speed, suspension and airbag activity, to help Rivian understand vehicle performance, driving patterns and connected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of smart features and the control that drivers have over them. Designed to take the stress out of long drives and driving in heavy traffic, features like adaptive cruise control, lane change assist, automatic emergency driving, and forward collision warning can be honed over time to continuously optimize the driving experience for customers.

Secure data sharing and collaboration was also facilitated with the Databricks Unity Catalog. Shiverick describes how unified governance for the lakehouse benefits Rivian productivity. “Unity Catalog gives us a truly centralized data catalog across all of our different teams,” he said. “Now we have proper access management and controls.” Venkat adds, “With Unity Catalog, we are centralizing data catalog and access management across various teams and workspaces, which has simplified governance.” End-to-end version controlled governance and auditability of sensitive data sources, like the ones used for autonomous driving systems, produces a simple but secure solution for feature engineering. This gives Rivian a competitive advantage in the race to capture the autonomous driving grid.

-----

**Accelerating into an electrified and sustainable world**
By scaling its capacity to deliver valuable data insights with speed, efficiency and cost-effectiveness, Rivian is primed to leverage more data to improve operations and the performance of its vehicles to enhance the customer experience. Venkat says, “The flexibility that lakehouse offers saves us a lot of money from a cloud perspective, and that’s a huge win for us.” With Databricks Lakehouse providing a unified and open source approach to data and analytics, the Vehicle Reliability Team is able to better understand how people are using their vehicles, and that helps to inform the design of future generations of vehicles. By leveraging the Databricks Lakehouse Platform, they have seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.

Shiverick explains, “From a reliability standpoint, we can make sure that components will withstand appropriate lifecycles. It can be as simple as making sure door handles are beefy enough to endure constant usage, or as complicated as predictive and preventative maintenance to eliminate the chance of failure in the field. Generally speaking, we’re improving software quality based on key vehicle metrics for a better customer experience.”

From a design optimization perspective, Rivian’s unobstructed data view is also producing new diagnostic insights that can improve fleet health, safety, stability and security. Venkat says, “We can perform remote diagnostics to triage a problem quickly, or have a mobile service come in, or potentially send an OTA to fix the problem with the software. All of this needs so much visibility into the data, and that’s been possible with our partnership and integration on the platform itself.” With developers actively building vehicle software, Rivian can continue to improve issues along the way.

Moving forward, Rivian is seeing rapid adoption of Databricks Lakehouse across different teams — increasing the number of platform users from 5 to 250 in only one year. This has unlocked new use cases including using machine learning to optimize battery efficiency in colder temperatures, increasing the accuracy of autonomous driving systems, and serving commercial depots with vehicle health dashboards for early and ongoing maintenance. As more EAVs ship, and its fleet of commercial vans expands, Rivian will continue to leverage the troves of data generated by its EAVs to deliver new innovations and driving experiences that revolutionize sustainable transportation.

-----

SECTION 4.6
**Migrating to the cloud to better serve millions of customers**

###### 300%
**ROI from OpEx savings and cost avoidance**

###### 3X
**Faster delivery of ML/data science use cases**

Consistency in innovation is what keeps customers with a telecommunications company and is why AT&T is ranked among the best. However, AT&T’s massive on-premises legacy Hadoop system proved complex and costly to manage, impeding operational agility and efficiency and straining engineering resources. The need to pivot to the cloud to better support hundreds of millions of subscribers was apparent.

Migrating from Hadoop to Databricks on the Azure cloud, AT&T experienced significant savings in operating costs. Additionally, the new cloud-based environment has unlocked access to petabytes of data for correlative analytics and an AI-as-a-Service offering for 2,500+ users across 60+ business units. AT&T can now leverage all its data — without overburdening its engineering team or exploding operational costs — to deliver new features and innovations to its millions of end users.

**INDUSTRY**
[Communication Service Providers](https://www.databricks.com/solutions/industries/telco-industry-solutions)

**SOLUTION**
Customer Retention, Subscriber Churn Prediction, Threat Detection

**PLATFORM**
Lakehouse, Data Science, Machine Learning, [Data Streaming](https://www.databricks.com/product/data-streaming)

**CLOUD**
[Azure](https://www.databricks.com/product/azure)

-----

**Hadoop technology adds operational complexity and unnecessary costs**
AT&T is a technology giant with hundreds of millions of subscribers and ingests 10+ petabytes [[a]](https://www.databricks.com/blog/2022/04/11/data-att-modernization-lakehouse.html) of data across the entire data platform each day. To harness this data, it has a team of 2,500+ data users across 60+ business units to ensure the business is data powered — from building analytics to ensure decisions are based on the best data-driven situation awareness to building ML models that bring new innovations to its customers. To support these requirements, AT&T needed to democratize and establish a data single version of truth (SVOT) while simplifying infrastructure management to increase agility and lower overall costs.

However, physical infrastructure was too resource intensive. The combination of a highly complex hardware setup (12,500 data sources and 1,500+ servers) coupled with an on-premises Hadoop architecture proved complex to maintain and expensive to manage. Not only were the operational costs to support workloads high, but there were also additional capital costs around data centers, licensing and more. Up to 70% of the on-prem platform had to be prioritized to ensure 50K data pipeline jobs succeeded and met SLAs and data quality objectives. Engineers’ time was focused on managing updates, fixing performance issues or simply provisioning resources rather than focusing on higher-valued tasks. The resource constraints of physical infrastructure also drove serialization of data science activities, slowing innovation. Another hurdle faced in operationalizing petabytes of data was the challenge of building streaming data pipelines for real-time analytics, an area that was key to supporting innovative use cases required to better serve its customers.

With these deeply rooted technology issues, AT&T was not in the best position to achieve its goals of increasing its use of insights for improving its customer experience and operating more efficiently. “To truly democratize data across the business, we needed to pivot to a cloud-native technology environment,” said Mark Holcomb, Distinguished Solution Architect at AT&T. “This has freed up resources that had been focused on managing our infrastructure and move them up the value chain, as well as freeing up capital for investing in growth-oriented initiatives.”

**The migration from Hadoop to Databricks enables us to bring more value to our customers and do it more cost-efficiently and much faster than before.**

**Mark Holcomb**
Distinguished Solution Architect, AT&T

**A seamless migration journey to Databricks**

As part of its due diligence, AT&T ran a comprehensive cost analysis and concluded that Databricks was both the fastest and achieved the best price/performance for data pipelines and machine learning workloads. AT&T knew the migration would be a massive undertaking. As such, the team did a lot of upfront planning — they prioritized migrating their largest workloads first to immediately reduce their infrastructure footprint. They also decided to migrate their data before migrating users to ensure a smooth transition and experience for their thousands of data practitioners.
-----

They spent a year deduplicating and synchronizing data to the cloud before migrating any users. This was a critical step in ensuring the successful migration of such a large, complex multi-tenant environment of 2,500+ users from 60+ business units and their workloads. The user migration process occurred over nine months and enabled AT&T to retire on-premises hardware in parallel with migration to accelerate savings as early as possible. Plus, due to the horizontal, scalable nature of Databricks, AT&T didn’t need to have everything in one contiguous environment. Separating data and compute across multiple accounts and workspaces ensured analytics worked seamlessly without any API call limits or bandwidth issues, with consumption clearly attributed to the 60+ business units.

All in all, AT&T migrated over 1,500 servers, more than 50,000 production CPUs, 12,500 data sources and 300 schemas. The entire process took about two and a half years. And it was able to manage the entire migration with the equivalent of 15 full-time internal resources. “Databricks was a valuable collaborator throughout the process,” said Holcomb. “The team worked closely with us to resolve product features and security concerns to support our migration timeline.”

**Databricks reduces TCO and opens new paths to innovation**

One of the immediate benefits of moving to Databricks was huge cost savings. AT&T was able to rationalize about 30% of its data by identifying and not migrating underutilized and duplicate data. And prioritizing the migration of the largest workloads allowed half the on-prem equipment to be rationalized during the course of the migration. “By prioritizing the migration of our most compute-intensive workloads to Databricks, we were able to significantly drive down costs while putting us in position to scale more efficiently moving forward,” explained Holcomb. The result is an anticipated 300% five-year migration ROI from OpEx savings and cost avoidance (e.g., not needing to refresh data center hardware).

With data readily available and the means to analyze data at any scale, teams of citizen data scientists and analysts can now spend more time innovating, instead of serializing analytics efforts or waiting on engineering to provide the necessary resources — or having data scientists spend their valuable time on less complex or less insightful analyses. Data scientists are now able to collaborate more effectively and speed up machine learning workflows so that teams can deliver value more quickly, with a 3x faster time to delivery for new data science use cases.

“Historically you would have had operations in one system and analytics in a separate one,” said Holcomb. “Now we can do more use cases like operational analytics in a platform that fosters cross-team collaboration, reduces cost and improves the consistency of answers.” Since migrating to Databricks, AT&T now has a single version of truth to create new data-driven opportunities, including a self-serve AI-as-a-Service analytics platform that will enable new revenue streams and help it continue delivering exceptional innovations to its millions of customers.
-----

#### About Databricks

Databricks is the data and AI company. More than 9,000 organizations worldwide — including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/).

**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**

Contact us for a personalized demo
**databricks.com/contact**

-----
##### The Delta Lake Series Complete Collection

-----

### What is Delta Lake?

[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully compatible with Apache Spark™ APIs.

At Databricks, we’ve seen how Delta Lake can bring reliability, performance and lifecycle management to data lakes. With Delta Lake, there will be no more malformed data ingestion, difficulties deleting data for compliance, or issues modifying data for data capture.

With Delta Lake, you can accelerate the velocity that high-quality data can get into your data lake and the rate that teams can leverage that data with a secure and scalable cloud service.

In this eBook, the Databricks team has compiled all of their insights into a comprehensive format so that you can gain a full understanding of Delta Lake and its capabilities.

-----

**Here’s what you’ll find inside**

Chapter 01 — Fundamentals and Performance: The Fundamentals of Delta Lake: Why Reliability and Performance Matter; Unpacking the Transaction Log; How to Use Schema Enforcement and Evolution; Delta Lake DML Internals; How Delta Lake Quickly Processes Petabytes With Data Skipping and Z-Ordering

Chapter 02 — Features: Why Use MERGE With Delta Lake?; Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs; Time Travel for Large-Scale Data Lakes; Easily Clone Your Delta Lake for Testing, Sharing and ML Reproducibility; Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0

Chapter 03 — Lakehouse: A lakehouse combines the best elements of data lakes and data warehouses; Diving Deep Into the Inner Workings of the Lakehouse and Delta Lake; Understanding Delta Engine

Chapter 04 — Streaming: How Delta Lake Solves Common Pain Points in Streaming; USE CASE #1: Simplifying Streaming Stock Data Analysis Using Delta Lake; USE CASE #2: How Tilting Point Does Streaming Ingestion Into Delta Lake; USE CASE #3: Building a Quality of Service Analytics Solution for Streaming Video Services

Chapter 05 — Customer Use Cases: Healthdirect Australia; Comcast; Banco Hipotecario; Viacom18
d924d3d8e4e14883aedaebf75db94374DELETE + VACUUM: Cleaning up old data files\n", + "\n", + "\n", + "Common challenges with changing data\n", + "\n", + "\n", + "to define tables in the Hive metastore\n", + "\n", + "\n", + "stock analysis solution with Delta Lake\n", + "\n", + "\n", + "acquisition and retention\n", + "\n", + "\n", + "Analyze streaming stock data in real time 69 **•** **USE CASE #3:** Banco Hipotecario Personalizes the Banking Viacom18 101\n", + "\n", + "How Tilting Point Does Streaming Ingestion Into Delta Lake Experience With Data and ML 71 Growth in subscribers and terabytes of viewing data push Hadoop to its limits 102\n", + "\n", + "\n", + "Working with Time Travel\n", + "\n", + "\n", + "Create or replace tables\n", + "\n", + "\n", + "Analyze streaming stock data in real time 69\n", + "\n", + "\n", + "Viacom18\n", + "\n", + "\n", + "1. Using a timestamp\n", + "\n", + "\n", + "Explicitly alter the table schema\n", + "\n", + "\n", + "How Tilting Point Does Streaming Ingestion Into Delta Lake\n", + "\n", + "\n", + "Growth in subscribers and terabytes of viewing data push Hadoop to its limits\n", + "\n", + "\n", + "\n", + "- Enabling Spark SQL DDL and DML in Delta Lake Scala syntax 36\n", + "on Apache Spark 3.0 Python syntax 37\n", + "\n", + "\n", + "How data flows and associated challenges 72 **•** **USE CASE #4:** Viacom18 Migrates From Hadoop to Rapid data processing for analytics\n", + "\n", + "Leveraging Structured Streaming with blob store as Databricks to Deliver More Engaging Experiences 72 and ML with Databricks 103\n", + "\n", + "\n", + "Scala syntax\n", + "\n", + "\n", + "Support for SQL Insert, Delete, Update and Merge\n", + "\n", + "Automatic and incremental Presto/Athena manifest generation\n", + "\n", + "Configuring your table through table properties\n", + "\n", + "Support for adding user-defined metadata\n", + "\n", + "in Delta Lake table commits 48\n", + "\n", + "Other highlights 49\n", + "\n", + "Lakehouse 50\n", + "\n", + "What Is a\n", + "\n", + "Lakehouse? 51\n", + "\n", + "\n", + "How data flows and associated challenges 72\n", + "\n", + "\n", + "Rapid data processing for analytics\n", + "\n", + "\n", + "Python syntax\n", + "\n", + "\n", + "Leveraging Structured Streaming with blob store as\n", + "\n", + "\n", + "and ML with Databricks\n", + "\n", + "\n", + "SQL syntax 37\n", + "\n", + "2. Using a version number\n", + "\n", + "Scala syntax\n", + "\n", + "\n", + "source and Delta Lake tables as sink\n", + "\n", + "\n", + "Leveraging viewer data to power personalized viewing experiences 104\n", + "\n", + "\n", + "DELETE: Performance tuning tips 18\n", + "\n", + "Delta Lake DML: MERGE **Chapter** 18\n", + "\n", + "Here’s how an upsert works: 18\n", + "\n", + "MERGE: Under the hood 19\n", + "\n", + "MERGE: Performance tuning tips **03** 19\n", + "\n", + "\n", + "DELETE: Performance tuning tips\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "\n", + "Building a Quality of Service Analytics Solution for Streaming Video Services 75\n", + "\n", + "Databricks Quality of Service solution overview 76\n", + "\n", + "Video QoS solution architecture 77\n", + "\n", + "Making your data ready for analytics 79\n", + "\n", + "Video applications events 80\n", + "\n", + "CDN logs 81\n", + "\n", + "\n", + "Delta Lake DML: MERGE\n", + "\n", + "\n", + "\n", + "- What Is a Lakehouse? 
Python syntax 38\n", + "\n", + "- Diving Deep Into the Inner Workings of the SQL syntax 38\n", + "Lakehouse and Delta Lake Audit data changes 39\n", + "\n", + "\n", + "Here’s how an upsert works:\n", + "\n", + "\n", + "Python syntax\n", + "\n", + "\n", + "MERGE: Under the hood\n", + "\n", + "\n", + "SQL syntax\n", + "\n", + "\n", + "MERGE: Performance tuning tips\n", + "\n", + "\n", + "Audit data changes\n", + "\n", + "\n", + "How Delta Lake Quickly\n", + "\n", + "\n", + "\n", + "- Understanding Delta Engine Reproduce experiments and reports 39\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fundamentals and Performance**\n", + "Boost data reliability for machine learning and\n", + "business intelligence with Delta Lake\n", + "\n", + "## CHAPTER 01\n", + "\n", + "\n", + "-----\n", + "\n", + "**The Fundamentals of Delta**\n", + "**Lake: Why Reliability and**\n", + "**Performance Matter**\n", + "\n", + "When it comes to data reliability, performance — the speed at which your programs\n", + "run — is of utmost importance. Because of the ACID transactional protections that\n", + "Delta Lake provides, you’re able to get the reliability and performance you need.\n", + "\n", + "With Delta Lake, you can stream and batch concurrently, perform CRUD operations,\n", + "and save money because you’re now using fewer VMs. It’s easier to maintain your data\n", + "engineering pipelines by taking advantage of streaming, even for batch jobs.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
46eacc3b731787c7cd2915fa2047764aPython syntax\n", + "\n", + "\n", + "MERGE: Under the hood\n", + "\n", + "\n", + "SQL syntax\n", + "\n", + "\n", + "MERGE: Performance tuning tips\n", + "\n", + "\n", + "Audit data changes\n", + "\n", + "\n", + "How Delta Lake Quickly\n", + "\n", + "\n", + "\n", + "- Understanding Delta Engine Reproduce experiments and reports 39\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fundamentals and Performance**\n", + "Boost data reliability for machine learning and\n", + "business intelligence with Delta Lake\n", + "\n", + "## CHAPTER 01\n", + "\n", + "\n", + "-----\n", + "\n", + "**The Fundamentals of Delta**\n", + "**Lake: Why Reliability and**\n", + "**Performance Matter**\n", + "\n", + "When it comes to data reliability, performance — the speed at which your programs\n", + "run — is of utmost importance. Because of the ACID transactional protections that\n", + "Delta Lake provides, you’re able to get the reliability and performance you need.\n", + "\n", + "With Delta Lake, you can stream and batch concurrently, perform CRUD operations,\n", + "and save money because you’re now using fewer VMs. It’s easier to maintain your data\n", + "engineering pipelines by taking advantage of streaming, even for batch jobs.\n", + "\n", + "Delta Lake is a storage layer that brings reliability to your data lakes built on HDFS and\n", + "cloud object storage by providing ACID transactions through optimistic concurrency\n", + "control between writes and snapshot isolation for consistent reads during writes.\n", + "Delta Lake also provides built-in data versioning for easy rollbacks and reproducing\n", + "reports.\n", + "\n", + "In this chapter, we’ll share some of the common challenges with data lakes as well as\n", + "the Delta Lake features that address them.\n", + "\n", + "**Challenges with data lakes**\n", + "Data lakes are a common element within modern data architectures. They serve as a\n", + "central ingestion point for the plethora of data that organizations seek to gather and\n", + "mine. While a good step forward in getting to grips with the range of data, they run\n", + "into the following common problems:\n", + "\n", + "\n", + "-----\n", + "\n", + "**1. \u0007Reading and writing into data lakes is not reliable.** Data engineers often run into\n", + "the problem of unsafe writes into data lakes that cause readers to see garbage\n", + "data during writes. They have to build workarounds to ensure readers always see\n", + "consistent data during writes.\n", + "\n", + "**2. \u0007The data quality in data lakes is low.** Dumping unstructured data into a data\n", + "lake is easy, but this comes at the cost of data quality. Without any mechanisms\n", + "for validating schema and the data, data lakes suffer from poor data quality. As a\n", + "consequence, analytics projects that strive to mine this data also fail.\n", + "\n", + "**3. Poor performance with increasing amounts of data.** As the amount of data\n", + "that gets dumped into a data lake increases, the number of files and directories\n", + "also increases. Big data jobs and query engines that process the data spend a\n", + "significant amount of time handling the metadata operations. This problem is more\n", + "pronounced in the case of streaming jobs or handling many concurrent batch jobs.\n", + "\n", + "**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\n", + "build complicated pipelines to read entire partitions or tables, modify the data and\n", + "write them back. 
Such pipelines are inefficient and hard to maintain.\n", + "\n", + "Because of these challenges, many big data projects fail to deliver on their vision or\n", + "sometimes just fail altogether. We need a solution that enables data practitioners to\n", + "make use of their existing data lakes, while ensuring data quality.\n", + "\n", + "**Delta Lake’s key functionalities**\n", + "Delta Lake addresses the above problems to simplify how you build your data lakes.\n", + "Delta Lake offers the following key functionalities:\n", + "\n", + "**• ACID transactions:** Delta Lake provides ACID transactions between multiple\n", + "writes. Every write is a transaction, and there is a serial order for writes recorded in\n", + "a transaction log. The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
67b604e572d4a8d01ec4adef4baee177**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\n", + "build complicated pipelines to read entire partitions or tables, modify the data and\n", + "write them back. Such pipelines are inefficient and hard to maintain.\n", + "\n", + "Because of these challenges, many big data projects fail to deliver on their vision or\n", + "sometimes just fail altogether. We need a solution that enables data practitioners to\n", + "make use of their existing data lakes, while ensuring data quality.\n", + "\n", + "**Delta Lake’s key functionalities**\n", + "Delta Lake addresses the above problems to simplify how you build your data lakes.\n", + "Delta Lake offers the following key functionalities:\n", + "\n", + "**• ACID transactions:** Delta Lake provides ACID transactions between multiple\n", + "writes. Every write is a transaction, and there is a serial order for writes recorded in\n", + "a transaction log. The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n", + "\n", + "\n", + "-----\n", + "\n", + "[concurrency control](https://en.wikipedia.org/wiki/Optimistic_concurrency_control) , which is ideally suited for data lakes since multiple writes\n", + "trying to modify the same files don’t happen that often. In scenarios where\n", + "there is a conflict, Delta Lake throws a concurrent modification exception for\n", + "users to handle them and retry their jobs. Delta Lake also offers the highest level\n", + "of isolation possible ( [serializable isolation](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable) ) that allows engineers to continuously\n", + "keep writing to a directory or table and consumers to keep reading from the same\n", + "directory or table. Readers will see the latest snapshot that existed at the time the\n", + "reading started.\n", + "\n", + "**• \u0007Schema management:** Delta Lake automatically validates that the schema of the\n", + "DataFrame being written is compatible with the schema of the table. Columns that\n", + "are present in the table but not in the DataFrame are set to null. If there are extra\n", + "columns in the DataFrame that are not present in the table, this operation throws\n", + "an exception. Delta Lake has DDL to add new columns explicitly and the ability to\n", + "update the schema automatically.\n", + "\n", + "**• \u0007Scalable metadata handling:** Delta Lake stores the metadata information of\n", + "a table or directory in the transaction log instead of the metastore. This allows\n", + "Delta Lake to list files in large directories in constant time and be efficient while\n", + "reading data.\n", + "\n", + "**• Data versioning and time travel:** Delta Lake allows users to read a previous\n", + "snapshot of the table or directory. When files are modified during writes, Delta\n", + "Lake creates newer versions of the files and preserves the older versions. When\n", + "\n", + "\n", + "users want to read the older versions of the table or directory, they can provide\n", + "a timestamp or a version number to Apache Spark’s read APIs, and Delta Lake\n", + "constructs the full snapshot as of that timestamp or version based on the\n", + "information in the transaction log. 
This allows users to reproduce experiments and\n", + "reports and also revert a table to its older versions, if needed.\n", + "\n", + "**• Unified batch and streaming sink:** Apart from batch writes, Delta Lake can also\n", + "be used as an efficient streaming sink with [Apache Spark’s structured streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) .\n", + "Combined with ACID transactions and scalable metadata handling, the efficient\n", + "streaming sink enables lots of near real-time analytics use cases without having to\n", + "maintain a complicated streaming and batch pipeline.\n", + "\n", + "**• Record update and deletion:** Delta Lake will support merge, update and delete\n", + "DML commands. This allows engineers to easily upsert and delete records in data\n", + "lakes and simplify their change data capture and GDPR use cases. Since Delta Lake\n", + "tracks and modifies data at file-level granularity, it is much more efficient than\n", + "reading and overwriting entire partitions or tables.\n", + "\n", + "**• Data expectations (coming soon):** Delta Lake will also support a new API to set\n", + "data expectations on tables or directories. Engineers will be able to specify a\n", + "boolean condition and tune the severity to handle data expectations. When Apache\n", + "Spark jobs write to the table or directory, Delta Lake will automatically validate\n", + "the records and when there is a violation, it will handle the records based on the\n", + "severity provided.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unpacking the**\n", + "**Transaction Log**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
2e8fa5b824c302814cf399c84115dbd0**• Record update and deletion:** Delta Lake will support merge, update and delete\n", + "DML commands. This allows engineers to easily upsert and delete records in data\n", + "lakes and simplify their change data capture and GDPR use cases. Since Delta Lake\n", + "tracks and modifies data at file-level granularity, it is much more efficient than\n", + "reading and overwriting entire partitions or tables.\n", + "\n", + "**• Data expectations (coming soon):** Delta Lake will also support a new API to set\n", + "data expectations on tables or directories. Engineers will be able to specify a\n", + "boolean condition and tune the severity to handle data expectations. When Apache\n", + "Spark jobs write to the table or directory, Delta Lake will automatically validate\n", + "the records and when there is a violation, it will handle the records based on the\n", + "severity provided.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unpacking the**\n", + "**Transaction Log**\n", + "\n", + "The transaction log is key to understanding Delta Lake because it is the common thread\n", + "that runs through many of its most important features, including ACID transactions,\n", + "scalable metadata handling, time travel and more. The Delta Lake transaction log is\n", + "an ordered record of every transaction that has ever been performed on a Delta Lake\n", + "table since its inception.\n", + "\n", + "Delta Lake is built on top of [Apache Spark](https://databricks.com/spark/about) to allow multiple readers and writers of a\n", + "given table to work on the table at the same time. To show users correct views of the\n", + "data at all times, the transaction log serves as a single source of truth: the central\n", + "repository that tracks all changes that users make to the table.\n", + "\n", + "When a user reads a Delta Lake table for the first time or runs a new query on an\n", + "open table that has been modified since the last time it was read, Spark checks the\n", + "transaction log to see what new transactions are posted to the table. Then, Spark\n", + "updates the end user’s table with those new changes. This ensures that a user’s\n", + "version of a table is always synchronized with the master record as of the most recent\n", + "query and that users cannot make divergent, conflicting changes to a table.\n", + "\n", + "In this chapter, we’ll explore how the Delta Lake transaction log offers an elegant\n", + "solution to the problem of multiple concurrent reads and writes.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Implementing atomicity to ensure**\n", + "**operations complete fully**\n", + "Atomicity is one of the four properties of ACID transactions that guarantees that\n", + "operations (like an INSERT or UPDATE) performed on your [data lake](https://databricks.com/glossary/data-lake) either complete\n", + "fully or don’t complete at all. Without this property, it’s far too easy for a hardware\n", + "failure or a software bug to cause data to be only partially written to a table, resulting\n", + "in messy or corrupted data.\n", + "\n", + "The transaction log is the mechanism through which Delta Lake is able to offer\n", + "the guarantee of atomicity. For all intents and purposes, if it’s not recorded in the\n", + "transaction log, it never happened. 
By only recording transactions that execute fully\n", + "and completely, and using that record as the single source of truth, the Delta Lake\n", + "transaction log allows users to reason about their data and have peace of mind about\n", + "its fundamental trustworthiness, at petabyte scale.\n", + "\n", + "**Dealing with multiple concurrent reads and writes**\n", + "But how does Delta Lake deal with multiple concurrent reads and writes? Since Delta\n", + "Lake is powered by Apache Spark, it’s not only possible for multiple users to modify a\n", + "\n", + "\n", + "table at once — it’s expected. To handle these situations, Delta Lake employs **optimistic**\n", + "**concurrency control** .\n", + "\n", + "Optimistic concurrency control is a method of dealing with concurrent transactions\n", + "that assumes the changes made to a table by different users can complete without\n", + "conflicting with one another. It is incredibly fast because when dealing with petabytes\n", + "of data, there’s a high likelihood that users will be working on different parts of the data\n", + "altogether, allowing them to complete non-conflicting transactions simultaneously.\n", + "\n", + "Of course, even with optimistic concurrency control, sometimes users do try to\n", + "modify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\n", + "for that. Delta Lake handles these cases by implementing a rule of mutual exclusion,\n", + "then it attempts to solve any conflict optimistically.\n", + "\n", + "This protocol allows Delta Lake to deliver on the ACID principle of isolation, which\n", + "ensures that the resulting state of the table after multiple, concurrent writes is the\n", + "same as if those writes had occurred serially, in isolation from one another.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
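The conflict handling described above surfaces in application code as a retryable exception. The following is a minimal sketch, not part of the original text, assuming a Databricks-style notebook where spark is already defined, a hypothetical Delta table at /tmp/delta/events, an updates_df DataFrame, and the delta.exceptions module shipped with recent delta-spark releases:

import time
from delta.exceptions import ConcurrentModificationException

def commit_with_retry(write_fn, max_retries=3):
    # Run a Delta write and retry if optimistic concurrency control rejects it
    # because another transaction committed a conflicting change first.
    for attempt in range(max_retries):
        try:
            write_fn()
            return
        except ConcurrentModificationException:
            time.sleep(2 ** attempt)  # back off, then rerun against the newer snapshot
    raise RuntimeError("Gave up after repeated write conflicts")

# Example: overwrite a single day of data, an operation that can conflict with
# other writers touching the same files. updates_df is assumed to exist.
commit_with_retry(
    lambda: updates_df.write.format("delta")
        .mode("overwrite")
        .option("replaceWhere", "date = '2024-01-01'")
        .save("/tmp/delta/events")
)

Blind appends rarely conflict; operations that read before writing, such as overwrites, deletes and merges, are the ones most likely to hit this path.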
9afc565db7357152dfe1cfe399ae37e8table at once — it’s expected. To handle these situations, Delta Lake employs **optimistic**\n", + "**concurrency control** .\n", + "\n", + "Optimistic concurrency control is a method of dealing with concurrent transactions\n", + "that assumes the changes made to a table by different users can complete without\n", + "conflicting with one another. It is incredibly fast because when dealing with petabytes\n", + "of data, there’s a high likelihood that users will be working on different parts of the data\n", + "altogether, allowing them to complete non-conflicting transactions simultaneously.\n", + "\n", + "Of course, even with optimistic concurrency control, sometimes users do try to\n", + "modify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\n", + "for that. Delta Lake handles these cases by implementing a rule of mutual exclusion,\n", + "then it attempts to solve any conflict optimistically.\n", + "\n", + "This protocol allows Delta Lake to deliver on the ACID principle of isolation, which\n", + "ensures that the resulting state of the table after multiple, concurrent writes is the\n", + "same as if those writes had occurred serially, in isolation from one another.\n", + "\n", + "\n", + "-----\n", + "\n", + "As all the transactions made on Delta Lake tables are stored directly to disk, this\n", + "process satisfies the ACID property of durability, meaning it will persist even in the\n", + "event of system failure.\n", + "\n", + "**Time travel, data lineage and debugging**\n", + "Every table is the result of the sum total of all the commits recorded in the Delta Lake\n", + "transaction log — no more and no less. The transaction log provides a step-by-step\n", + "instruction guide, detailing exactly how to get from the table’s original state to its\n", + "current state.\n", + "\n", + "Therefore, we can recreate the state of a table at any point in time by starting with\n", + "an original table, and processing only commits made after that point. This powerful\n", + "ability is known as “time travel,” or data versioning, and can be a lifesaver in any number\n", + "\n", + "\n", + "of situations. For more information, please refer to [Introducing Delta Time Travel for](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n", + "[Large-Scale Data Lakes](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) and [Getting Data Ready for Data Science With Delta Lake and](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n", + "[MLflow.](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n", + "\n", + "As the definitive record of every change ever made to a table, the Delta Lake\n", + "transaction log offers users a verifiable data lineage that is useful for governance,\n", + "audit and compliance purposes. It can also be used to trace the origin of an\n", + "inadvertent change or a bug in a pipeline back to the exact action that caused it. 
Users\n", + "can run the [DESCRIBE HISTORY](https://docs.delta.io/latest/delta-utility.html#describe-history) command to see metadata around the changes\n", + "that were made.\n", + "\n", + "**Want to learn more about Delta Lake’s transaction log?**\n", + "\n", + "Read our blog post > Watch our tech talk >\n", + "\n", + "\n", + "-----\n", + "\n", + "**How to Use Schema**\n", + "**Enforcement and**\n", + "**Evolution**\n", + "\n", + "As business problems and requirements evolve over time, so does the structure of\n", + "your data. With Delta Lake, incorporating new columns or objects is easy; users have\n", + "access to simple semantics to control the schema of their tables. At the same time,\n", + "it is important to call out the importance of schema enforcement to prevent users\n", + "from accidentally polluting their tables with mistakes or garbage data in addition to\n", + "schema evolution, which enables them to automatically add new columns of rich data\n", + "when those columns belong.\n", + "\n", + "**Schema enforcement rejects any new columns or other schema changes that**\n", + "**aren’t compatible with your table.** By setting and upholding these high standards,\n", + "analysts and engineers can trust that their data has the highest levels of integrity and\n", + "can reason about it with clarity, allowing them to make better business decisions.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
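The commit history that underpins the lineage, audit and debugging workflow described above, and that also records schema changes, can be pulled up directly with the DESCRIBE HISTORY command mentioned earlier. A minimal sketch, assuming a notebook where spark is defined and an existing Delta table named events (both illustrative assumptions):

# Each row of the history is one commit: its version, timestamp, the operation
# (WRITE, MERGE, DELETE, ...) and the parameters it ran with.
history_df = spark.sql("DESCRIBE HISTORY events")
history_df.select("version", "timestamp", "operation", "operationParameters").show(truncate=False)

The same information is available through DeltaTable.forName(spark, "events").history().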
710f6a1f26a207f904d2d61197c5e9c0**Want to learn more about Delta Lake’s transaction log?**\n", + "\n", + "Read our blog post > Watch our tech talk >\n", + "\n", + "\n", + "-----\n", + "\n", + "**How to Use Schema**\n", + "**Enforcement and**\n", + "**Evolution**\n", + "\n", + "As business problems and requirements evolve over time, so does the structure of\n", + "your data. With Delta Lake, incorporating new columns or objects is easy; users have\n", + "access to simple semantics to control the schema of their tables. At the same time,\n", + "it is important to call out the importance of schema enforcement to prevent users\n", + "from accidentally polluting their tables with mistakes or garbage data in addition to\n", + "schema evolution, which enables them to automatically add new columns of rich data\n", + "when those columns belong.\n", + "\n", + "**Schema enforcement rejects any new columns or other schema changes that**\n", + "**aren’t compatible with your table.** By setting and upholding these high standards,\n", + "analysts and engineers can trust that their data has the highest levels of integrity and\n", + "can reason about it with clarity, allowing them to make better business decisions.\n", + "\n", + "On the flip side of the coin, schema evolution complements enforcement by making it\n", + "easy for intended schema changes to take place automatically. After all, it shouldn’t\n", + "be hard to add a column.\n", + "\n", + "Schema enforcement is the yin to schema evolution’s yang. When used together,\n", + "these features make it easier than ever to block out the noise and tune in to the signal.\n", + "\n", + "**Understanding table schemas**\n", + "Every DataFrame in Apache Spark contains a schema, a blueprint that defines the\n", + "shape of the data, such as data types and columns, and metadata. With Delta Lake,\n", + "the table’s schema is saved in JSON format inside the transaction log.\n", + "\n", + "\n", + "-----\n", + "\n", + "**What is schema enforcement?**\n", + "Schema enforcement, or schema validation, is a safeguard in Delta Lake that ensures\n", + "data quality by rejecting writes to a table that don’t match the table’s schema.\n", + "\n", + "Like the front-desk manager at a busy restaurant who only accepts reservations, it\n", + "checks to see whether each column of data inserted into the table is on its list of\n", + "expected columns (in other words, whether each one has a “reservation”), and rejects\n", + "any writes with columns that aren’t on the list.\n", + "\n", + "**How does schema enforcement work?**\n", + "Delta Lake uses **schema validation on write,** which means that all new writes to a\n", + "table are checked for compatibility with the target table’s schema at write time. If the\n", + "schema is not compatible, Delta Lake cancels the transaction altogether (no data is\n", + "written), and raises an exception to let the user know about the mismatch.\n", + "\n", + "To determine whether a write to a table is compatible, Delta Lake uses the following\n", + "rules. 
The DataFrame to be written cannot contain:\n", + "\n", + "**• Any additional columns that are not present in the target table’s schema.**\n", + "Conversely, it’s OK if the incoming data doesn’t contain every column in the table —\n", + "those columns will simply be assigned null values.\n", + "\n", + "**• \u0007Column data types that differ from the column data types in the target table.**\n", + "If a target table’s column contains StringType data, but the corresponding column\n", + "in the DataFrame contains IntegerType data, schema enforcement will raise an\n", + "exception and prevent the write operation from taking place.\n", + "\n", + "**• Column names that differ only by case.** This means that you cannot have columns\n", + "such as “Foo” and “foo” defined in the same table. While Spark can be used in case\n", + "sensitive or insensitive (default) mode, Delta Lake is case-preserving but insensitive\n", + "when storing the schema. [Parquet](https://databricks.com/glossary/what-is-parquet) is case sensitive when storing and returning\n", + "column information. To avoid potential mistakes, data corruption or loss issues (which\n", + "we’ve personally experienced at Databricks), we decided to add this restriction.\n", + "\n", + "\n", + "-----\n", + "\n", + "Rather than automatically adding the new columns, Delta Lake enforces the schema,\n", + "and stops the write from occurring. To help identify which column(s) caused the\n", + "mismatch, Spark prints out both schemas in the stack trace for comparison.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
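To make the write-time validation rules above concrete, here is a small sketch that is not from the original text; the table name events_demo and its columns are illustrative, and it assumes a notebook where spark is already defined. The mismatched append is cancelled with an AnalysisException, and opting in to schema evolution (covered next) lets the same write succeed:

from pyspark.sql.utils import AnalysisException

# A target table with two columns.
spark.createDataFrame([(1, "click")], ["id", "eventType"]) \
    .write.format("delta").mode("overwrite").saveAsTable("events_demo")

# Incoming data carries an extra column that is not in the target schema.
new_data = spark.createDataFrame([(2, "view", "mobile")], ["id", "eventType", "device"])

try:
    new_data.write.format("delta").mode("append").saveAsTable("events_demo")
except AnalysisException as err:
    # Schema enforcement cancels the transaction and prints both schemas.
    print("Write rejected by schema enforcement:", err)

# Explicitly requesting schema evolution appends the new column instead of failing.
new_data.write.format("delta").mode("append") \
    .option("mergeSchema", "true").saveAsTable("events_demo")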
**How is schema enforcement useful?**
Because it’s such a stringent check, schema enforcement is an excellent tool to use
as a gatekeeper for a clean, fully transformed data set that is ready for production or
consumption. It’s typically enforced on tables that directly feed:

- Machine learning algorithms

- BI dashboards

- Data analytics and visualization tools

- Any production system requiring highly structured,
strongly typed, semantic schemas

In order to prepare their data for this final hurdle, many users employ a simple multi-hop
architecture that progressively adds structure to their tables. To learn more, take
a look at [Productionizing Machine Learning With Delta Lake.](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)

**What is schema evolution?**
Schema evolution is a feature that allows users to easily change a table’s current
schema to accommodate data that is changing over time. Most commonly, it’s used
when performing an append or overwrite operation, to automatically adapt the
schema to include one or more new columns.

**How does schema evolution work?**
Following up on the example from the previous section, developers can
easily use schema evolution to add the new columns that were previously
rejected due to a schema mismatch. Schema evolution is activated by adding
.option("mergeSchema", "true") to your .write or .writeStream
Spark command, as shown in the following example.

# Add the mergeSchema option
loans.write.format("delta") \
  .option("mergeSchema", "true") \
  .mode("append") \
  .save(DELTALAKE_SILVER_PATH)

By including the mergeSchema option in your query, any columns that are present
in the DataFrame but not in the target table are automatically added to the end of the
schema as part of a write transaction. Nested fields can also be added, and these
fields will get added to the end of their respective struct columns as well.

Data engineers and scientists can use this option to add new columns (perhaps a
newly tracked metric, or a column of this month’s sales figures) to their existing ML
production tables without breaking existing models that rely on the old columns.

The following types of schema changes are eligible for schema evolution during table
appends or overwrites:

- Adding new columns (this is the most common scenario)

- Changing of data types from NullType → any other type, or upcasts from ByteType
→ ShortType → IntegerType

Other changes, not eligible for schema evolution, require that the schema and data
are overwritten by adding .option("overwriteSchema", "true"). Those
changes include:

- Dropping a column

- Changing an existing column’s data type (in place)

- Renaming column names that differ only by case (e.g., “Foo” and “foo”)

-----

Finally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE ) is fully
supported, allowing users to perform the following actions on table schemas:

- Adding columns

- Changing column comments
1e592fd72acb0cf9b59313daed7183c1The following types of schema changes are eligible for schema evolution during table\n", + "\n", + "appends or overwrites:\n", + "\n", + "- Adding new columns (this is the most common scenario)\n", + "\n", + "- \u0007Changing of data types from NullType → any other type, or upcasts from ByteType\n", + "\n", + "→ ShortType → IntegerType\n", + "\n", + "Other changes, not eligible for schema evolution, require that the schema and data\n", + "\n", + "are overwritten by adding .option(“overwriteSchema”,“true”) . Those\n", + "\n", + "changes include:\n", + "\n", + "- Dropping a column\n", + "\n", + "- Changing an existing column’s data typeC (in place)\n", + "\n", + "- \u0007Renaming column names that differ onlyC by case (e.g., “Foo” and “foo”)\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE ) is fully\n", + "supported, allowing users to perform the following actions on table schemas:\n", + "\n", + "- Adding columns\n", + "\n", + "- Changing column comments\n", + "\n", + "- Setting table properties that define the behavior of the table, such as setting the\n", + "retention duration of the transaction log\n", + "\n", + "**How is schema evolution useful?**\n", + "Schema evolution can be used anytime you _intend_ to change the schema of your table\n", + "(as opposed to where you accidentally added columns to your DataFrame that shouldn’t\n", + "be there). It’s the easiest way to migrate your schema because it automatically adds the\n", + "correct column names and data types, without having to declare them explicitly.\n", + "\n", + "**Summary**\n", + "Schema enforcement rejects any new columns or other schema changes that\n", + "aren’t compatible with your table. By setting and upholding these high standards,\n", + "analysts and engineers can trust that their data has the highest levels of integrity and\n", + "can reason about it with clarity, allowing them to make better business decisions.\n", + "On the flip side of the coin, schema evolution complements enforcement by making it\n", + "easy for intended schema changes to take place automatically. After all, it shouldn’t\n", + "be hard to add a column.\n", + "\n", + "Schema enforcement is the yin to schema evolution’s yang. When used together, these\n", + "features make it easier than ever to block out the noise and tune in to the signal.\n", + "\n", + "**Want to learn more about schema enforcement and evolution?**\n", + "\n", + "Read our blog post > Watch our tech talk >\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake**\n", + "**DML Internals**\n", + "\n", + "Delta Lake supports data manipulation language (DML) commands including UPDATE,\n", + "DELETE and MERGE. These commands simplify change data capture (CDC), audit and\n", + "governance, and GDPR/CCPA workflows, among others.\n", + "\n", + "In this chapter, we will demonstrate how to use each of these DML commands,\n", + "describe what Delta Lake is doing behind the scenes, and offer some performance\n", + "tuning tips for each one.\n", + "\n", + "**Delta Lake DML: UPDATE**\n", + "You can use the UPDATE operation to selectively update any rows that match a\n", + "filtering condition, also known as a predicate. The code below demonstrates how\n", + "to use each type of predicate as part of an UPDATE statement. 
Note that Delta Lake\n", + "offers APIs for Python, Scala and SQL, but for the purposes of this eBook, we’ll include\n", + "only the SQL code.\n", + "\n", + "-- Update events\n", + "\n", + "UPDATE events SET eventType= ‘click’ WHERE buttonPress = 1\n", + "\n", + "\n", + "-----\n", + "\n", + "**UPDATE: Under the hood**\n", + "Delta Lake performs an UPDATE on a table in two steps:\n", + "\n", + "1. Find and select the files containing data that match the predicate and, therefore,\n", + "need to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up\n", + "this process.\n", + "\n", + "2. \u0007Read each matching file into memory, update the relevant rows, and write out the\n", + "result into a new data file.\n", + "\n", + "Once Delta Lake has executed the UPDATE successfully, it adds a commit in the\n", + "transaction log indicating that the new data file will be used in place of the old one\n", + "from now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned”\n", + "— recorded as a data file that applied to an older version of the table, but not the\n", + "current version. Delta Lake is able to use it to provide data versioning and time travel.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
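Because the old files are only tombstoned rather than deleted, earlier snapshots stay queryable. A minimal time travel sketch, assuming spark is defined and a hypothetical table path /tmp/delta/events whose history includes versions 11 and 12:

EVENTS_PATH = "/tmp/delta/events"  # hypothetical table location

# Read two historical snapshots by commit version...
v11 = spark.read.format("delta").option("versionAsOf", 11).load(EVENTS_PATH)
v12 = spark.read.format("delta").option("versionAsOf", 12).load(EVENTS_PATH)

# ...or pin a snapshot by timestamp instead of a version number.
as_of_jan = (
    spark.read.format("delta")
    .option("timestampAsOf", "2024-01-01 00:00:00")
    .load(EVENTS_PATH)
)

# Rows that exist in version 12 but not in version 11, which is the same
# comparison the chapter goes on to express in SQL with VERSION AS OF.
v12.exceptAll(v11).show()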
b7dca6105fca69505d8c6fc54e87219dUPDATE events SET eventType= ‘click’ WHERE buttonPress = 1\n", + "\n", + "\n", + "-----\n", + "\n", + "**UPDATE: Under the hood**\n", + "Delta Lake performs an UPDATE on a table in two steps:\n", + "\n", + "1. Find and select the files containing data that match the predicate and, therefore,\n", + "need to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up\n", + "this process.\n", + "\n", + "2. \u0007Read each matching file into memory, update the relevant rows, and write out the\n", + "result into a new data file.\n", + "\n", + "Once Delta Lake has executed the UPDATE successfully, it adds a commit in the\n", + "transaction log indicating that the new data file will be used in place of the old one\n", + "from now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned”\n", + "— recorded as a data file that applied to an older version of the table, but not the\n", + "current version. Delta Lake is able to use it to provide data versioning and time travel.\n", + "\n", + "**UPDATE + Delta Lake time travel = Easy debugging**\n", + "Keeping the old data files turns out to be very useful for debugging because you can\n", + "use Delta Lake “time travel” to go back and query previous versions of a table at any\n", + "\n", + "\n", + "time. In the event that you update your table incorrectly and want to figure out what\n", + "happened, you can easily compare two versions of a table to one another to see what\n", + "has changed.\n", + "\n", + "SELECT - FROM events VERSION AS OF 11 EXCEPT ALL SELECT\n", + "\n", + "- FROM mytable VERSION AS OF 12\n", + "\n", + "**UPDATE: Performance tuning tips**\n", + "The main way to improve the performance of the UPDATE command on Delta Lake\n", + "is to add more predicates to narrow down the search space. The more specific the\n", + "search, the fewer files Delta Lake needs to scan and/or modify.\n", + "\n", + "**Delta Lake DML: DELETE**\n", + "You can use the DELETE command to selectively delete rows based upon a\n", + "predicate (filtering condition).\n", + "\n", + "DELETE FROM events WHERE date < ‘2017-01-01’\n", + "\n", + "\n", + "-----\n", + "\n", + "In the event that you want to revert an accidental DELETE operation, you can use time\n", + "travel to roll back your table to the way it was.\n", + "\n", + "**DELETE: Under the hood**\n", + "DELETE works just like UPDATE under the hood. Delta Lake makes two scans of\n", + "the data: The first scan is to identify any data files that contain rows matching the\n", + "predicate condition. The second scan reads the matching data files into memory,\n", + "at which point Delta Lake deletes the rows in question before writing out the newly\n", + "clean data to disk.\n", + "\n", + "After Delta Lake completes a DELETE operation successfully, the old data files are\n", + "not deleted entirely — they’re still retained on disk, but recorded as “tombstoned” (no\n", + "longer part of the active table) in the Delta Lake transaction log. Remember, those old\n", + "files aren’t deleted immediately because you might still need them to time travel back\n", + "to an earlier version of the table. If you want to delete files older than a certain time\n", + "period, you can use the VACUUM command.\n", + "\n", + "**DELETE + VACUUM: Cleaning up old data files**\n", + "Running the VACUUM command permanently deletes all data files that are:\n", + "\n", + "1. 
No longer part of the active table and
2. Older than the retention threshold, which is seven days by default

Delta Lake does not automatically VACUUM old files — you must run the command
yourself, as shown below. If you want to specify a retention period that is different
from the default of seven days, you can provide it as a parameter.

from delta.tables import *

# vacuum files older than 30 days (720 hours)
deltaTable.vacuum(720)

-----
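The chapter sticks to SQL, but the same DELETE and UPDATE operations (and the VACUUM call above) are also exposed through the DeltaTable Python API. A brief sketch under the same assumptions as the earlier examples, with spark already defined and a hypothetical table path:

from delta.tables import DeltaTable

delta_table = DeltaTable.forPath(spark, "/tmp/delta/events")  # hypothetical path

# Equivalent of: DELETE FROM events WHERE date < '2017-01-01'
delta_table.delete("date < '2017-01-01'")

# Equivalent of: UPDATE events SET eventType = 'click' WHERE buttonPress = 1
delta_table.update(condition="buttonPress = 1", set={"eventType": "'click'"})

# Permanently remove tombstoned files older than the default 7-day retention.
delta_table.vacuum()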
fefdd902aa67221084956739775933ef**DELETE + VACUUM: Cleaning up old data files**\n", + "Running the VACUUM command permanently deletes all data files that are:\n", + "\n", + "1. No longer part of the active table and\n", + "2. \u0007Older than the retention threshold, which is seven days by default\n", + "\n", + "Delta Lake does not automatically VACUUM old files — you must run the command\n", + "yourself, as shown below. If you want to specify a retention period that is different\n", + "from the default of seven days, you can provide it as a parameter.\n", + "\n", + "from delta.tables import - deltaTable.\n", + "\n", + "# vacuum files older than 30 days(720 hours)\n", + "\n", + "deltaTable.vacuum( 720 )\n", + "\n", + "\n", + "-----\n", + "\n", + "**DELETE: Performance tuning tips**\n", + "Just like with the UPDATE command, the main way to improve the performance of\n", + "a DELETE operation on Delta Lake is to add more predicates to narrow down the\n", + "search space. The Databricks managed version of Delta Lake also features other\n", + "performance enhancements like improved [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) , the use of bloom filters, and\n", + "[Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering) (multi-dimensional clustering). [Read more about Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n", + "[on Databricks.](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n", + "\n", + "**Delta Lake DML: MERGE**\n", + "The Delta Lake MERGE command allows you to perform upserts, which are a mix of\n", + "an UPDATE and an INSERT. To understand upserts, imagine that you have an existing\n", + "table (aka a target table), and a source table that contains a mix of new records and\n", + "updates to existing records.\n", + "\n", + "\n", + "**Here’s how an upsert works:**\n", + "\n", + "- When a record from the source table matches a preexisting record in the target\n", + "table, Delta Lake updates the record.\n", + "\n", + "- When there is no such match, Delta Lake inserts the new record.\n", + "\n", + "The Delta Lake MERGE command greatly simplifies workflows that can be complex\n", + "and cumbersome with other traditional data formats like Parquet. Common scenarios\n", + "where merges/upserts come in handy include change data capture, GDPR/CCPA\n", + "compliance, sessionization, and deduplication of records.\n", + "\n", + "**For more information about upserts, read:**\n", + "\n", + "[Efficient Upserts Into Data Lakes With Databricks Delta](https://databricks.com/blog/2019/03/19/efficient-upserts-into-data-lakes-databricks-delta.html)\n", + "\n", + "[Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs](https://databricks.com/blog/2019/10/03/simple-reliable-upserts-and-deletes-on-delta-lake-tables-using-python-apis.html)\n", + "\n", + "[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**MERGE: Under the hood**\n", + "Delta Lake completes a MERGE in two steps:\n", + "\n", + "1. Perform an inner join between the target table and source table to select all files\n", + "that have matches.\n", + "2. 
Perform an outer join between the selected files in the target and source tables\n", + "and write out the updated/deleted/inserted data.\n", + "\n", + "The main way that this differs from an UPDATE or a DELETE under the hood is that\n", + "Delta Lake uses joins to complete a MERGE. This fact allows us to utilize some unique\n", + "strategies when seeking to improve performance.\n", + "\n", + "**MERGE: Performance tuning tips**\n", + "To improve performance of the MERGE command, you need to determine which of the\n", + "two joins that make up the merge is limiting your speed.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
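As a concrete illustration of the upsert pattern described above, here is a hedged sketch using the DeltaTable merge builder; the customers table, its path and the customer_id key are illustrative assumptions:

from delta.tables import DeltaTable

target = DeltaTable.forPath(spark, "/tmp/delta/customers")  # hypothetical target table
updates_df = spark.createDataFrame(
    [(1, "alice@new.example"), (42, "newuser@example.com")],
    ["customer_id", "email"],
)  # a mix of updates to existing records and brand-new records

(
    target.alias("t")
    .merge(updates_df.alias("s"), "t.customer_id = s.customer_id")
    .whenMatchedUpdateAll()      # matched records are updated in place
    .whenNotMatchedInsertAll()   # unmatched records are inserted as new rows
    .execute()
)

Narrowing the merge condition with additional predicates, for example a date range on the target, is the Python-side equivalent of the tuning advice above.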
e8a30d8cf3f0c5e33c90468fbd5804db[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**MERGE: Under the hood**\n", + "Delta Lake completes a MERGE in two steps:\n", + "\n", + "1. Perform an inner join between the target table and source table to select all files\n", + "that have matches.\n", + "2. Perform an outer join between the selected files in the target and source tables\n", + "and write out the updated/deleted/inserted data.\n", + "\n", + "The main way that this differs from an UPDATE or a DELETE under the hood is that\n", + "Delta Lake uses joins to complete a MERGE. This fact allows us to utilize some unique\n", + "strategies when seeking to improve performance.\n", + "\n", + "**MERGE: Performance tuning tips**\n", + "To improve performance of the MERGE command, you need to determine which of the\n", + "two joins that make up the merge is limiting your speed.\n", + "\n", + "If the inner join is the bottleneck (i.e., finding the files that Delta Lake needs to rewrite\n", + "takes too long), try the following strategies:\n", + "\n", + "- Add more predicates to narrow down the search space.\n", + "\n", + "- Adjust shuffle partitions.\n", + "\n", + "- Adjust broadcast join thresholds.\n", + "\n", + "- Compact the small files in the table if there are lots of them, but don’t compact them\n", + "into files that are too large, since Delta Lake has to copy the entire file to rewrite it.\n", + "\n", + "\n", + "**On Databricks’ managed Delta Lake, use Z-Order optimize to exploit the**\n", + "**locality of updates.**\n", + "\n", + "On the other hand, if the outer join is the bottleneck (i.e., rewriting the actual files\n", + "themselves takes too long), try the strategies below.\n", + "\n", + "- **Adjust shuffle partitions:** Reduce files by enabling automatic repartitioning\n", + "before writes (with Optimized Writes in Databricks Delta Lake).\n", + "\n", + "- **\u0007Adjust broadcast thresholds:** If you’re doing a full outer join, Spark cannot do a\n", + "broadcast join, but if you’re doing a right outer join, Spark can do one, and you can\n", + "adjust the broadcast thresholds as needed.\n", + "\n", + "- **Cache the source table / DataFrame:** Caching the source table can speed up the\n", + "second scan, but be sure not to cache the target table, as this can lead to cache\n", + "coherency issues.\n", + "\n", + "Delta Lake supports DML commands including UPDATE, DELETE and MERGE INTO, which\n", + "greatly simplify the workflow for many common big data operations. In this chapter, we\n", + "demonstrated how to use these commands in Delta Lake, shared information about\n", + "how each one works under the hood, and offered some performance tuning tips.\n", + "\n", + "**Want a deeper dive into DML internals, including snippets of code?**\n", + "\n", + "[Read our blog post >](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake Quickly**\n", + "**Processes Petabytes With**\n", + "**Data Skipping and Z-Ordering**\n", + "\n", + "Delta Lake is capable of sifting through petabytes of data within seconds. 
Much of this\n", + "speed is owed to two features: (1) data skipping and (2) Z-Ordering.\n", + "\n", + "Combining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the\n", + "amount of data that needs to be scanned to answer selective queries against large\n", + "Delta tables, which typically translates into substantial runtime improvements and\n", + "cost savings.\n", + "\n", + "Using Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud\n", + "data lakes can be queried in a matter of seconds by skipping files not relevant to\n", + "the query. For example, 93.2% of the records in a 504 TB data set were skipped for a\n", + "typical query in a real-world cybersecurity analysis use case, reducing query times by\n", + "up to two orders of magnitude. In other words, Delta Lake can speed up your queries\n", + "by as much as 100x.\n", + "\n", + "**Want to see data skipping and Z-Ordering in action?**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
45f7948c2a554f1131404199c2ea647aDelta Lake is capable of sifting through petabytes of data within seconds. Much of this\n", + "speed is owed to two features: (1) data skipping and (2) Z-Ordering.\n", + "\n", + "Combining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the\n", + "amount of data that needs to be scanned to answer selective queries against large\n", + "Delta tables, which typically translates into substantial runtime improvements and\n", + "cost savings.\n", + "\n", + "Using Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud\n", + "data lakes can be queried in a matter of seconds by skipping files not relevant to\n", + "the query. For example, 93.2% of the records in a 504 TB data set were skipped for a\n", + "typical query in a real-world cybersecurity analysis use case, reducing query times by\n", + "up to two orders of magnitude. In other words, Delta Lake can speed up your queries\n", + "by as much as 100x.\n", + "\n", + "**Want to see data skipping and Z-Ordering in action?**\n", + "\n", + "Apple’s Dominique Brezinski and Databricks’ Michael Armbrust demonstrated how to\n", + "\n", + "use Delta Lake as a unified solution for data engineering and data science in the context\n", + "\n", + "of cybersecurity monitoring and threat response. Watch their keynote speech, Threat\n", + "\n", + "[Detection and Response at Scale.](https://databricks.com/session/keynote-from-apple)\n", + "\n", + "\n", + "-----\n", + "\n", + "AND / OR / NOT are also supported as well as “literal op column” predicates.\n", + "\n", + "Even though data skipping kicks in when the above conditions are met, it may not\n", + "always be effective. But, if there are a few columns that you frequently filter by and\n", + "want to make sure that’s fast, then you can explicitly optimize your data layout with\n", + "respect to skipping effectiveness by running the following command:\n", + "\n", + "OPTIMIZE [ WHERE ]\n", + "ZORDER BY ( [, …])\n", + "\n", + "**Exploring the details**\n", + "Apart from partition pruning, another common technique that’s used in the data\n", + "warehousing world, but which Spark currently lacks, is I/O pruning based on [small](https://dl.acm.org/doi/10.5555/645924.671173)\n", + "[materialized aggregates](https://dl.acm.org/doi/10.5555/645924.671173) . In short, the idea is to keep track of simple statistics such\n", + "as minimum and maximum values at a certain granularity that are correlated with I/O\n", + "granularity. And we want to leverage those statistics at query planning time in order\n", + "to avoid unnecessary I/O.\n", + "\n", + "This is exactly what Delta Lake’s [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) feature is about. As new data is\n", + "inserted into a Delta Lake table, file-level min/max statistics are collected for all\n", + "columns (including nested ones) of supported types. 
Then, when there’s a lookup\n", + "query against the table, Delta Lake first consults these statistics in order to determine\n", + "which files can safely be skipped.\n", + "\n", + "**Want to learn more about data skipping and Z-Ordering, including**\n", + "**how to apply it within a cybersecurity analysis?**\n", + "\n", + "[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)\n", + "\n", + "\n", + "**Using data skipping and Z-Order clustering**\n", + "Data skipping and Z-Ordering are used to improve the performance of needle-in-thehaystack queries against huge data sets. Data skipping is an automatic feature of\n", + "Delta Lake, kicking in whenever your SQL queries or data set operations include filters\n", + "of the form “column op literal,” where:\n", + "\n", + "- column is an attribute of some Delta Lake table, be it top-level or nested, whose\n", + "data type is string / numeric / date/ timestamp\n", + "\n", + "- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n", + "\n", + "\n", + "- literal is an explicit (list of) value(s) of the same data type as a column\n", + "\n", + "\n", + "-----
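The OPTIMIZE ... ZORDER BY command quoted above lost its table and column placeholders. As a sketch, on Databricks it can be issued from PySpark as shown below; the table name `connections`, the partition predicate and the Z-Order columns are hypothetical.

```python
# Hypothetical table, predicate and columns; ZORDER co-locates related values so that
# data skipping on the file-level min/max statistics becomes more effective.
spark.sql("""
    OPTIMIZE connections
    WHERE date >= '2024-01-01'
    ZORDER BY (src_ip, dst_ip)
""")
```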
862d197d353a337dcbecd6b04a9c76ab[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)\n", + "\n", + "\n", + "**Using data skipping and Z-Order clustering**\n", + "Data skipping and Z-Ordering are used to improve the performance of needle-in-thehaystack queries against huge data sets. Data skipping is an automatic feature of\n", + "Delta Lake, kicking in whenever your SQL queries or data set operations include filters\n", + "of the form “column op literal,” where:\n", + "\n", + "- column is an attribute of some Delta Lake table, be it top-level or nested, whose\n", + "data type is string / numeric / date/ timestamp\n", + "\n", + "- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n", + "\n", + "\n", + "- literal is an explicit (list of) value(s) of the same data type as a column\n", + "\n", + "\n", + "-----\n", + "\n", + "**Features**\n", + "Use Delta Lake’s robust features\n", + "to reliably manage your data\n", + "\n", + "## CHAPTER 02\n", + "\n", + "\n", + "-----\n", + "\n", + "**Why Use MERGE**\n", + "**With Delta Lake?**\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , the next-generation engine built on top of Apache Spark, supports the\n", + "MERGE command, which allows you to efficiently upsert and delete records in your\n", + "data lakes.\n", + "\n", + "MERGE dramatically simplifies how a number of common data pipelines can be built\n", + "-- all the complicated multi-hop processes that inefficiently rewrote entire partitions\n", + "can now be replaced by simple MERGE queries.\n", + "\n", + "This finer-grained update capability simplifies how you build your big data\n", + "pipelines for various use cases ranging from change data capture to GDPR. You\n", + "no longer need to write complicated logic to overwrite tables and overcome a lack\n", + "of snapshot isolation.\n", + "\n", + "With changing data, another critical capability required is the ability to roll back, in\n", + "case of bad writes. Delta Lake also offers [rollback capabilities with the Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n", + "[feature](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) , so that if you do a bad merge, you can easily roll back to an earlier version.\n", + "\n", + "In this chapter, we’ll discuss common use cases where existing data might need to be\n", + "updated or deleted. We’ll also explore the challenges inherent to upserts and explain\n", + "how MERGE can address them.\n", + "\n", + "\n", + "-----\n", + "\n", + "**When are upserts necessary?**\n", + "There are a number of common use cases where existing data in a data lake needs to\n", + "be updated or deleted:\n", + "\n", + "- \u0007 **General Data Protection Regulation (GDPR) compliance:** With the introduction of\n", + "the right to be forgotten (also known as data erasure) in GDPR, organizations must\n", + "remove a user’s information upon request. This data erasure includes deleting user\n", + "information in the data lake as well.\n", + "\n", + "- **Change data capture from traditional databases:** In a service-oriented\n", + "architecture, typically web and mobile applications are served by microservices\n", + "built on traditional SQL/NoSQL databases that are optimized for low latency. 
One\n", + "of the biggest challenges organizations face is joining data across these various\n", + "siloed data systems, and hence data engineers build pipelines to consolidate\n", + "all data sources into a central data lake to facilitate analytics. These pipelines\n", + "often have to periodically read changes made on a traditional SQL/NoSQL table\n", + "and apply them to corresponding tables in the data lake. Such changes can take\n", + "various forms: Tables with slowly changing dimensions, change data capture of all\n", + "inserted/updated/deleted rows, etc.\n", + "\n", + "- \u0007 **Sessionization:** Grouping multiple events into a single session is a common use\n", + "case in many areas ranging from product analytics to targeted advertising to\n", + "predictive maintenance. Building continuous applications to track sessions and\n", + "recording the results that write into data lakes is difficult because data lakes have\n", + "always been optimized for appending data.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
ce66f1cf55c56b762dad363ec5618ccf- **Change data capture from traditional databases:** In a service-oriented\n", + "architecture, typically web and mobile applications are served by microservices\n", + "built on traditional SQL/NoSQL databases that are optimized for low latency. One\n", + "of the biggest challenges organizations face is joining data across these various\n", + "siloed data systems, and hence data engineers build pipelines to consolidate\n", + "all data sources into a central data lake to facilitate analytics. These pipelines\n", + "often have to periodically read changes made on a traditional SQL/NoSQL table\n", + "and apply them to corresponding tables in the data lake. Such changes can take\n", + "various forms: Tables with slowly changing dimensions, change data capture of all\n", + "inserted/updated/deleted rows, etc.\n", + "\n", + "- \u0007 **Sessionization:** Grouping multiple events into a single session is a common use\n", + "case in many areas ranging from product analytics to targeted advertising to\n", + "predictive maintenance. Building continuous applications to track sessions and\n", + "recording the results that write into data lakes is difficult because data lakes have\n", + "always been optimized for appending data.\n", + "\n", + "- **\u0007De-duplication:** A common data pipeline use case is to collect system logs into a\n", + "Delta Lake table by appending data to the table. However, often the sources can\n", + "generate duplicate records and downstream de-duplication steps are needed to\n", + "take care of them.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Why upserts into data lakes have**\n", + "**traditionally been challenging**\n", + "Since data lakes are fundamentally based on files, they have always been optimized\n", + "for appending data rather than for changing existing data. Hence, building the above\n", + "use case has always been challenging.\n", + "\n", + "Users typically read the entire table (or a subset of partitions) and then overwrite\n", + "them. Therefore, every organization tries to reinvent the wheel for their requirement\n", + "by handwriting complicated queries in SQL, Spark, etc. This approach is:\n", + "\n", + "- **\u0007Inefficient:** Reading and rewriting entire partitions (or entire tables) to update a few\n", + "records causes pipelines to be slow and costly. Hand-tuning the table layout and\n", + "query optimization is tedious and requires deep domain knowledge.\n", + "\n", + "- **\u0007Possibly incorrect:** Handwritten code modifying data is very prone to logical and\n", + "human errors. For example, multiple pipelines concurrently modifying the same table\n", + "without any transactional support can lead to unpredictable data inconsistencies\n", + "and in the worst case, data losses. Often, even a single handwritten pipeline can\n", + "easily cause data corruptions due to errors in encoding the business logic.\n", + "\n", + "- **\u0007Hard to maintain:** Fundamentally such handwritten code is hard to understand,\n", + "keep track of and maintain. 
In the long term, this alone can significantly increase\n", + "the organizational and infrastructural costs.\n", + "\n", + "**Introducing MERGE in Delta Lake**\n", + "With Delta Lake, you can easily address the use cases above without any of the\n", + "aforementioned problems using the following MERGE command:\n", + "\n", + "MERGE INTO\n", + "\n", + "USING\n", + "\n", + "ON\n", + "\n", + "[ WHEN MATCHED [ AND ] THEN ]\n", + "\n", + "\n", + "\n", + "[ WHEN NOT MATCHED [ AND ] THEN ]\n", + "\n", + "where\n", + "\n", + "=\n", + "\n", + "DELETE |\n", + "\n", + "UPDATE SET - |\n", + "\n", + "UPDATE SET column1 = value1 [, column2 = value2 ...]\n", + "\n", + "=\n", + "\n", + "INSERT - |\n", + "\n", + "INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])\n", + "\n", + "Let’s understand how to use MERGE with a simple example. Suppose you have a\n", + "[slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses.\n", + "Furthermore, you have a table of new addresses for both existing and new users. To\n", + "merge all the new addresses to the main user table, you can run the following:\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING updates\n", + "\n", + "ON users.userId = updates.userId\n", + "\n", + "WHEN MATCHED THEN\n", + "\n", + "UPDATE SET address = updates.addresses\n", + "\n", + "WHEN NOT MATCHED THEN\n", + "INSERT (userId, address) VALUES (updates.userId, updates.address)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
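For reference, the same upsert can be expressed through the Delta Lake Python API. This is a minimal sketch that assumes an active Spark session `spark`, a `users` Delta table registered in the metastore, and an `updates` DataFrame with `userId` and `address` columns, mirroring the SQL example above.

```python
from delta.tables import DeltaTable

# Upsert new addresses into the users table (Python equivalent of the MERGE above)
users = DeltaTable.forName(spark, "users")

(users.alias("users")
    .merge(updates.alias("updates"), "users.userId = updates.userId")
    .whenMatchedUpdate(set={"address": "updates.address"})
    .whenNotMatchedInsert(values={
        "userId": "updates.userId",
        "address": "updates.address",
    })
    .execute())
```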
b302cede7dc4417b382ced696712982awhere\n", + "\n", + "=\n", + "\n", + "DELETE |\n", + "\n", + "UPDATE SET - |\n", + "\n", + "UPDATE SET column1 = value1 [, column2 = value2 ...]\n", + "\n", + "=\n", + "\n", + "INSERT - |\n", + "\n", + "INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])\n", + "\n", + "Let’s understand how to use MERGE with a simple example. Suppose you have a\n", + "[slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses.\n", + "Furthermore, you have a table of new addresses for both existing and new users. To\n", + "merge all the new addresses to the main user table, you can run the following:\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING updates\n", + "\n", + "ON users.userId = updates.userId\n", + "\n", + "WHEN MATCHED THEN\n", + "\n", + "UPDATE SET address = updates.addresses\n", + "\n", + "WHEN NOT MATCHED THEN\n", + "INSERT (userId, address) VALUES (updates.userId, updates.address)\n", + "\n", + "This will perform exactly what the syntax says -- for existing users (i.e., MATCHED\n", + "clause), it will update the address column, and for new users (i.e., NOT MATCHED\n", + "clause) it will insert all the columns. For large tables with TBs of data, this Delta Lake\n", + "MERGE operation can be orders of magnitude faster than overwriting entire partitions\n", + "or tables since Delta Lake reads only relevant files and updates them. Specifically,\n", + "Delta Lake's MERGE has the following advantages:\n", + "\n", + "\n", + "\n", + "[ WHEN MATCHED [ AND ] THEN ]\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simplifying use cases with MERGE**\n", + "**Deleting data due to GDPR**\n", + "Complying with the “right to be forgotten” clause of GDPR for data in data lakes cannot\n", + "get any easier. You can set up a simple scheduled job with an example code, like\n", + "below, to delete all the users who have opted out of your service.\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING opted_out_users\n", + "\n", + "ON opted_out_users.userId = users.userId\n", + "\n", + "WHEN MATCHED THEN DELETE\n", + "\n", + "**Applying change data from databases**\n", + "You can easily apply all data changes — updates, deletes, inserts — generated from an\n", + "external database into a Delta Lake table with the MERGE syntax as follows:\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING (\n", + "\n", + "SELECT userId, latest.address AS address, latest.deleted AS deleted FROM\n", + "\n", + "(\n", + "\n", + "SELECT userId, MAX(struct(TIME, address, deleted)) AS latest\n", + "\n", + "FROM changes GROUP BY userId\n", + "\n", + ")\n", + "\n", + ") latestChange\n", + "\n", + "ON latestChange.userId = users.userId\n", + "\n", + "WHEN MATCHED AND latestChange.deleted = TRUE THEN\n", + "\n", + "DELETE\n", + "\n", + "WHEN MATCHED THEN\n", + "\n", + "UPDATE SET address = latestChange.address\n", + "\n", + "WHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n", + "\n", + "INSERT (userId, address) VALUES (userId, address)\n", + "\n", + "\n", + "\n", + "- **\u0007Fine-grained:** The operation rewrites data at the granularity of files and not\n", + "partitions. This eliminates all the complications of rewriting partitions, updating\n", + "the Hive metastore with MSCK and so on.\n", + "\n", + "- **\u0007Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to\n", + "rewrite, thus eliminating the need to hand-optimize your pipeline. 
Furthermore,\n", + "Delta Lake with all its I/O and processing optimizations makes all the reading and\n", + "writing data by MERGE significantly faster than similar operations in Apache Spark.\n", + "\n", + "- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\n", + "concurrent writers update the data correctly with ACID transactions, and concurrent\n", + "readers always see a consistent snapshot of the data.\n", + "\n", + "Here is a visual explanation of how MERGE compares with handwritten pipelines.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
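The GDPR delete above can also be written with the Delta Lake Python API. A minimal sketch, assuming a `users` Delta table and an `opted_out_users` DataFrame containing the userIds to erase:

```python
from delta.tables import DeltaTable

# Delete every user that appears in opted_out_users (Python equivalent of the MERGE above)
users = DeltaTable.forName(spark, "users")

(users.alias("users")
    .merge(opted_out_users.alias("opted_out"), "opted_out.userId = users.userId")
    .whenMatchedDelete()
    .execute())
```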
2a2ccd995214417b940431182cb57108UPDATE SET address = latestChange.address\n", + "\n", + "WHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n", + "\n", + "INSERT (userId, address) VALUES (userId, address)\n", + "\n", + "\n", + "\n", + "- **\u0007Fine-grained:** The operation rewrites data at the granularity of files and not\n", + "partitions. This eliminates all the complications of rewriting partitions, updating\n", + "the Hive metastore with MSCK and so on.\n", + "\n", + "- **\u0007Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to\n", + "rewrite, thus eliminating the need to hand-optimize your pipeline. Furthermore,\n", + "Delta Lake with all its I/O and processing optimizations makes all the reading and\n", + "writing data by MERGE significantly faster than similar operations in Apache Spark.\n", + "\n", + "- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\n", + "concurrent writers update the data correctly with ACID transactions, and concurrent\n", + "readers always see a consistent snapshot of the data.\n", + "\n", + "Here is a visual explanation of how MERGE compares with handwritten pipelines.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Updating session information from streaming**\n", + "**pipelines**\n", + "If you have streaming event data flowing in and if you want to sessionize the streaming\n", + "event data and incrementally update and store sessions in a Delta Lake table, you\n", + "can accomplish this using the foreachBatch in Structured Streaming and MERGE.\n", + "For example, suppose you have a Structured Streaming DataFrame that computes\n", + "updated session information for each user. You can start a streaming query that\n", + "applies all the sessions update to a Delta Lake table as follows (Scala).\n", + "\n", + "streamingSessionUpdatesDF.writeStream\n", + "\n", + ".foreachBatch { (microBatchOutputDF: DataFrame , batchId: Long ) =>\n", + "\n", + "microBatchOutputDF.createOrReplaceTempView(“updates”)\n", + "\n", + "microBatchOutputDF.sparkSession.sql(s”””\n", + "\n", + "MERGE INTO sessions\n", + "\n", + "USING updates\n", + "\n", + "ON sessions.sessionId = updates.sessionId\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *\n", + "\n", + "WHEN NOT MATCHED THEN INSERT * “”” )\n", + "\n", + "}.start()\n", + "\n", + "For a complete working example of each Batch and MERGE, see this notebook\n", + "( [Azure](https://docs.azuredatabricks.net/_static/notebooks/merge-in-streaming.html) | [AWS](https://docs.databricks.com/_static/notebooks/merge-in-streaming.html) ).\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Addressing GDPR and CCPA Scenarios With Delta Lake and Apache Spark](https://www.youtube.com/watch?v=tCPslvUjG1w)\n", + "\n", + "[Tech Talk | Using Delta as a Change Data Capture Source](https://www.youtube.com/watch?v=7y0AAQ6qX5w)\n", + "\n", + "[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n", + "\n", + "[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n", + "\n", + "[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simple, Reliable Upserts and**\n", + "**Deletes on Delta Lake Tables**\n", + "**Using Python APIs**\n", + "\n", + "In this chapter, we will 
demonstrate how to use Python and the new Python APIs in Delta\n", + "Lake within the context of an on-time flight performance scenario. We will show how\n", + "to upsert and delete data, query old versions of data with time travel, and vacuum\n", + "older versions for cleanup.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
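The streaming example above is shown in Scala; a Python sketch of the same foreachBatch plus MERGE pattern follows, assuming `streamingSessionUpdatesDF` is a streaming DataFrame and `sessions` is an existing Delta table keyed by `sessionId`.

```python
# Upsert each micro-batch of session updates into the sessions Delta table
def upsert_sessions(micro_batch_df, batch_id):
    micro_batch_df.createOrReplaceTempView("updates")
    # Use the micro-batch's own session so the temp view is visible to the MERGE
    micro_batch_df.sparkSession.sql("""
        MERGE INTO sessions
        USING updates
        ON sessions.sessionId = updates.sessionId
        WHEN MATCHED THEN UPDATE SET *
        WHEN NOT MATCHED THEN INSERT *
    """)

streamingSessionUpdatesDF.writeStream.foreachBatch(upsert_sessions).start()
```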
73e577990b7c3a09d1aafcdc124d9ed0[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n", + "\n", + "[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n", + "\n", + "[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simple, Reliable Upserts and**\n", + "**Deletes on Delta Lake Tables**\n", + "**Using Python APIs**\n", + "\n", + "In this chapter, we will demonstrate how to use Python and the new Python APIs in Delta\n", + "Lake within the context of an on-time flight performance scenario. We will show how\n", + "to upsert and delete data, query old versions of data with time travel, and vacuum\n", + "older versions for cleanup.\n", + "\n", + "**How to start using Delta Lake**\n", + "The Delta Lake package is installable through PySpark by using the --packages\n", + "option. In our example, we will also demonstrate the ability to VACUUM files and execute\n", + "Delta Lake SQL commands within Apache Spark. As this is a short demonstration, we\n", + "will also enable the following configurations:\n", + "\n", + "\u0007spark.databricks.delta.retentionDurationCheck.enabled=false\n", + "\n", + "to allow us to vacuum files shorter than the default retention duration of seven days.\n", + "Note, this is only required for the SQL command VACUUM\n", + "\n", + "\u0007spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension\n", + "\n", + "to enable Delta Lake SQL commands within Apache Spark; this is not required for\n", + "Python or Scala API calls.\n", + "\n", + "# Using Spark Packages\n", + "\n", + "./bin/pyspark --packages io.delta:delta-core_2.11:0.4.0 --conf “spark.\n", + "\n", + "databricks.delta.retentionDurationCheck.enabled=false” --conf “spark.\n", + "\n", + "sql.extensions=io.delta.sql.DeltaSparkSessionExtension”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Loading and saving our Delta Lake data**\n", + "This scenario will be using the On-Time Flight Performance or Departure Delays data\n", + "set generated from the RITA BTS Flight Departure Statistics; some examples of this data\n", + "in action include the and OnTime Flight Performance with GraphFrames for Apache Spark™. 
Within PySpark, start [2014 Flight Departure Performance via d3.js Crossfilter](https://dennyglee.com/2014/06/06/2014-flight-departure-performance-via-d3-js-crossfilter/)\n", + "by reading the data set.\n", + "\n", + "\u0007# Location variables\n", + "\n", + "\n", + "/departureDelays.delta$ ls l\n", + "\n", + ".\n", + "\n", + "..\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -df6f69ea-e6aa- 424b -bc0e-f3674c4f1906-c000.snappy.parquet\n", + "\n", + "part- 00001 -711bcce3-fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n", + "\n", + "part- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n", + "\n", + "Part- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n", + "\n", + "\n", + "tripdelaysFilePath = “/root/data/departuredelays.csv”\n", + "\n", + "pathToEventsTable = “/root/deltalake/departureDelays.delta”\n", + "\n", + "Now, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n", + "\n", + "# Read flight delay data\n", + "\n", + "\n", + "departureDelays = spark.read \\\n", + "\n", + ".option( “header” , “true” ) \\\n", + "\n", + ".option( “inferSchema” , “true” ) \\SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
2e0b5f9b3069618ee6c293d23c466506part- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n", + "\n", + "Part- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n", + "\n", + "\n", + "tripdelaysFilePath = “/root/data/departuredelays.csv”\n", + "\n", + "pathToEventsTable = “/root/deltalake/departureDelays.delta”\n", + "\n", + "Now, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n", + "\n", + "# Read flight delay data\n", + "\n", + "\n", + "departureDelays = spark.read \\\n", + "\n", + ".option( “header” , “true” ) \\\n", + "\n", + ".option( “inferSchema” , “true” ) \\\n", + "\n", + ".csv(tripdelaysFilePath)\n", + "\n", + "Next, let’s save our departureDelays data set to a Delta Lake table. By saving this table\n", + "to Delta Lake storage, we will be able to take advantage of its features including ACID\n", + "transactions, unified batch and streaming and time travel.\n", + "\n", + "# Save flight delay data into Delta Lake format\n", + "\n", + "departureDelays \\\n", + "\n", + ".write \\\n", + "\n", + "\n", + "# Load flight delay data in Delta Lake format\n", + "\n", + "delays_delta = spark \\\n", + "\n", + ".read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load( “departureDelays.delta” )\n", + "\n", + "# Create temporary view\n", + "\n", + "delays_delta.createOrReplaceTempView(“delays_delta”)\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "spark.sql(“select count(1) from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’”).show()\n", + "\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".mode( “overwrite” ) \\\n", + "\n", + ".save( “departureDelays.delta” )\n", + "\n", + "Note, this approach is similar to how you would normally save Parquet data; instead\n", + "of specifying format(“parquet”) , you will now specify format(“delta”) . If\n", + "you were to take a look at the underlying file system, you will notice four files created\n", + "for the departureDelays Delta Lake table.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, lets determine the number of flights originating from Seattle to San Francisco; in\n", + "this data set, there are 1698 flights.\n", + "\n", + "**In-place conversion to Delta Lake**\n", + "If you have existing Parquet tables, you have the ability to convert them to Delta Lake\n", + "format in place, thus not needing to rewrite your table. To convert the table, you can\n", + "run the following commands.\n", + "\n", + "\n", + "deltaTable DeltaTable .forPath(spark, pathToEventsTable\n", + "\n", + ")\n", + "\n", + "# Delete all on-time and early flights\n", + "\n", + "deltaTable. delete ( “delay < 0” )\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "spark.sql( “select count(1) from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’” ).show()\n", + "\n", + "\n", + "from delta.tables import - \n", + "\n", + "# Convert non partitioned parquet table at path ‘/path/to/table’\n", + "\n", + "deltaTable = DeltaTable .convertToDelta(spark, “parquet.`/path/to/\n", + "\n", + "table`” )\n", + "\n", + "# Convert partitioned parquet table at path ‘/path/to/table’ and\n", + "\n", + "partitioned by integer column named ‘part’\n", + "\n", + "\n", + "After we delete (more on this below) all of the on-time and early flights, as you can\n", + "see from the preceding query there are 837 late flights originating from Seattle to\n", + "San Francisco. 
If you review the file system, you will notice there are more files even\n", + "though you deleted data.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -a2a19ba4- 17e 9- 4931 - 9bbf - 3c9d4997780 b-c000.snappy.parquetSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
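Here is a consolidated, runnable sketch of the load, save, query and delete flow above, with straight quotes. It assumes an active Spark session `spark`, uses the paths given in the text, and writes to `pathToEventsTable` throughout, which is an assumption on my part since the snippets alternate between a relative and an absolute path.

```python
from delta.tables import DeltaTable

tripdelaysFilePath = "/root/data/departuredelays.csv"
pathToEventsTable = "/root/deltalake/departureDelays.delta"

# Read the flight delay CSV and save it in Delta Lake format
departureDelays = (spark.read
    .option("header", "true")
    .option("inferSchema", "true")
    .csv(tripdelaysFilePath))

(departureDelays.write
    .format("delta")
    .mode("overwrite")
    .save(pathToEventsTable))

# Reload the data, now backed by Delta Lake, and register a temporary view
delays_delta = spark.read.format("delta").load(pathToEventsTable)
delays_delta.createOrReplaceTempView("delays_delta")

# How many flights are between Seattle and San Francisco?
spark.sql("select count(1) from delays_delta "
          "where origin = 'SEA' and destination = 'SFO'").show()

# Delete all on-time and early flights
deltaTable = DeltaTable.forPath(spark, pathToEventsTable)
deltaTable.delete("delay < 0")
```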
9f6993ce559f866ae52a5b376d3ad1ebdestination = ‘SFO’” ).show()\n", + "\n", + "\n", + "from delta.tables import - \n", + "\n", + "# Convert non partitioned parquet table at path ‘/path/to/table’\n", + "\n", + "deltaTable = DeltaTable .convertToDelta(spark, “parquet.`/path/to/\n", + "\n", + "table`” )\n", + "\n", + "# Convert partitioned parquet table at path ‘/path/to/table’ and\n", + "\n", + "partitioned by integer column named ‘part’\n", + "\n", + "\n", + "After we delete (more on this below) all of the on-time and early flights, as you can\n", + "see from the preceding query there are 837 late flights originating from Seattle to\n", + "San Francisco. If you review the file system, you will notice there are more files even\n", + "though you deleted data.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -a2a19ba4- 17e 9- 4931 - 9bbf - 3c9d4997780 b-c000.snappy.parquet\n", + "\n", + "part-00000-df6f69ea-e6aa-424b-bc0e-f3674c4f1906-c000.snappy.parquet\n", + "\n", + "part- 00001 - 711bcce3 -fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n", + "\n", + "part- 00001 -a0423a18- 62eb - 46b3 -a82f-ca9aac1f1e93-c000.snappy.parquet\n", + "\n", + "part- 00002 - 778ba97d - 89b8 - 4942 -a495-5f6238830b68-c000.snappy.parquet\n", + "\n", + "part- 00002 -bfaa0a2a- 0a31 - 4abf -aa63- 162402f802cc -c000.snappy.parquet\n", + "\n", + "part- 00003 - 1a791c4a - 6f11 - 49a8 -8837- 8093a3220581 -c000.snappy.parquet\n", + "\n", + "part- 00003 -b0247e1d-f5ce- 4b45 - 91cd - 16413c784a66 -c000.snappy.parquet\n", + "\n", + "\n", + "partitionedDeltaTable = DeltaTable .convertToDelta(spark,\n", + "\n", + "“parquet.`/path/to/table`”, “part int” )\n", + "\n", + "**Delete our flight data**\n", + "To delete data from a traditional data lake table, you will need to:\n", + "\n", + "1. Select all of the data from your table not including the rows you want to delete\n", + "2. Create a new table based on the previous query\n", + "3. Delete the original table\n", + "4. Rename the new table to the original table name for downstream dependencies\n", + "\n", + "Instead of performing all of these steps, with Delta Lake, we can simplify this process\n", + "by running a DELETE statement. To show this, let’s delete all of the flights that had\n", + "arrived early or on-time (i.e., delay < 0).\n", + "\n", + "\n", + "from delta.tables import - \n", + "\n", + "from pyspark.sql.functions import - \n", + "\n", + "# Access the Delta Lake table\n", + "\n", + "\n", + "-----\n", + "\n", + "In traditional data lakes, deletes are performed by rewriting the entire table\n", + "excluding the values to be deleted. With Delta Lake, deletes are instead performed\n", + "by selectively writing new versions of the files containing the data to be deleted and\n", + "only marks the previous files as deleted. This is because Delta Lake uses multiversion\n", + "concurrency control (MVCC) to do atomic operations on the table: For example, while\n", + "one user is deleting data, another user may be querying the previous version of the\n", + "table. 
This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\n", + "and query previous versions as we will see later.\n", + "\n", + "**Update our flight data**\n", + "To update data from your traditional Data Lake table, you will need to:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
ac06a55857e208aef4a47cafe26630cffrom delta.tables import - \n", + "\n", + "from pyspark.sql.functions import - \n", + "\n", + "# Access the Delta Lake table\n", + "\n", + "\n", + "-----\n", + "\n", + "In traditional data lakes, deletes are performed by rewriting the entire table\n", + "excluding the values to be deleted. With Delta Lake, deletes are instead performed\n", + "by selectively writing new versions of the files containing the data to be deleted and\n", + "only marks the previous files as deleted. This is because Delta Lake uses multiversion\n", + "concurrency control (MVCC) to do atomic operations on the table: For example, while\n", + "one user is deleting data, another user may be querying the previous version of the\n", + "table. This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\n", + "and query previous versions as we will see later.\n", + "\n", + "**Update our flight data**\n", + "To update data from your traditional Data Lake table, you will need to:\n", + "\n", + "1. Select all of the data from your table not including the rows you want to modify\n", + "2. Modify the rows that need to be updated/changed\n", + "3. Merge these two tables to create a new table\n", + "4. Delete the original table\n", + "5. Rename the new table to the original table name for downstream dependencies\n", + "\n", + "Instead of performing all of these steps, with Delta Lake, we can simplify this\n", + "process by running an UPDATE statement. To show this, let’s update all of the flights\n", + "originating from Detroit to Seattle.\n", + "\n", + "\n", + "With the Detroit flights now tagged as Seattle flights, we now have 986 flights\n", + "originating from Seattle to San Francisco. If you were to list the file system for\n", + "your departureDelays folder (i.e., $../departureDelays/ls -l ), you will\n", + "notice there are now 11 files (instead of the 8 right after deleting the files and the four\n", + "files after creating the table).\n", + "\n", + "**Merge our flight data**\n", + "A common scenario when working with a data lake is to continuously append data to\n", + "your table. This often results in duplicate data (rows you do not want to be inserted\n", + "into your table again), new rows that need to be inserted, and some rows that need to\n", + "be updated. With Delta Lake, all of this can be achieved by using the merge operation\n", + "(similar to the SQL MERGE statement).\n", + "\n", + "Let’s start with a sample data set that you will want to be updated, inserted or\n", + "de-duplicated with the following query.\n", + "\n", + "\n", + "# Update all flights originating from Detroit to now be\n", + "\n", + "\n", + "originating from Seattle\n", + "\n", + "deltaTable.update(“origin = ‘DTW’”, { “origin”: “’SEA’” } )\n", + "\n", + "\n", + "# What flights between SEA and SFO for these date periods\n", + "\n", + "spark.sql( “select * from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n", + "\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "\n", + "The output of this query looks like the following table. 
Note, the color-coding has been\n", + "added to clearly identify which rows are de-duplicated (blue), updated (yellow) and\n", + "inserted (green).\n", + "\n", + "\n", + "spark.sql( “select count(1) from delays_delta where origin = ‘SEA’\n", + "\n", + "and destination = ‘SFO’” ).show()\n", + "\n", + "\n", + "-----\n", + "\n", + "Next, let’s generate our own merge_table that contains data we will insert, update\n", + "or de-duplicate with the following code snippet.\n", + "\n", + "items = [( 1010710 , 31 , 590 , ‘SEA’, ‘SFO’), ( 1010521 , 10 , 590 ,\n", + "\n", + "‘SEA’ , ‘SFO’ ),\n", + "\n", + "(1010822, 31, 590, ‘SEA’, ‘SFO’)]\n", + "\n", + "\n", + "With Delta Lake, this can be easily achieved via a merge statement as noted in the\n", + "following code snippet.\n", + "\n", + "# Merge merge_table with flights\n", + "\n", + "deltaTable. alias( “flights” ) \\\n", + "\n", + ".merge(merge_table. alias ( “updates”),”flights.date =\n", + "\n", + "updates.date” ) \\SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
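A consolidated sketch of the update and merge steps above, assuming `spark` is the active session and `deltaTable` was obtained earlier with `DeltaTable.forPath(spark, pathToEventsTable)`:

```python
# Update all flights originating from Detroit (DTW) to originate from Seattle (SEA);
# the new value is a SQL expression string, hence the nested quotes.
deltaTable.update("origin = 'DTW'", {"origin": "'SEA'"})

# Build the merge_table with rows to update, de-duplicate and insert
cols = ["date", "delay", "distance", "origin", "destination"]
items = [
    (1010710, 31, 590, "SEA", "SFO"),
    (1010521, 10, 590, "SEA", "SFO"),
    (1010822, 31, 590, "SEA", "SFO"),
]
merge_table = spark.createDataFrame(items, cols)

# Merge merge_table into flights: update matched dates, insert new ones
(deltaTable.alias("flights")
    .merge(merge_table.alias("updates"), "flights.date = updates.date")
    .whenMatchedUpdate(set={"delay": "updates.delay"})
    .whenNotMatchedInsertAll()
    .execute())
```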
fd039f130d7c2e0fb95ead61b4197d3bspark.sql( “select count(1) from delays_delta where origin = ‘SEA’\n", + "\n", + "and destination = ‘SFO’” ).show()\n", + "\n", + "\n", + "-----\n", + "\n", + "Next, let’s generate our own merge_table that contains data we will insert, update\n", + "or de-duplicate with the following code snippet.\n", + "\n", + "items = [( 1010710 , 31 , 590 , ‘SEA’, ‘SFO’), ( 1010521 , 10 , 590 ,\n", + "\n", + "‘SEA’ , ‘SFO’ ),\n", + "\n", + "(1010822, 31, 590, ‘SEA’, ‘SFO’)]\n", + "\n", + "\n", + "With Delta Lake, this can be easily achieved via a merge statement as noted in the\n", + "following code snippet.\n", + "\n", + "# Merge merge_table with flights\n", + "\n", + "deltaTable. alias( “flights” ) \\\n", + "\n", + ".merge(merge_table. alias ( “updates”),”flights.date =\n", + "\n", + "updates.date” ) \\\n", + "\n", + ".whenMatchedUpdate(set = { “delay” : “updates.delay” } ) \\\n", + "\n", + ".whenNotMatchedInsertAll() \\\n", + "\n", + ".execute()\n", + "\n", + "# What flights between SEA and SFO for these date periods\n", + "\n", + "spark.sql( “select * from delays_delta where origin = ‘SEA’ and\n", + "\n", + "destination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n", + "\n", + "\n", + "cols = [ ‘date’ , ‘delay’ , ‘distance’ , ‘origin’ , ‘destination’ ]\n", + "\n", + "\n", + "merge_table = spark.createDataFrame(items, cols)\n", + "\n", + "merge_table.toPandas()\n", + "\n", + "In the preceding table ( merge_table ), there are three rows with a unique date value:\n", + "\n", + "1. 1010521: This row needs to _update_ the _flights_ table with a new delay value (yellow)\n", + "2. 1010710: This row is a _duplicate_ (blue)\n", + "3. 1010832: This is a new row to be _inserted_ (green)\n", + "\n", + "\n", + "All three actions of de-duplication, update and insert were efficiently completed with\n", + "one statement.\n", + "\n", + "**View table history**\n", + "As previously noted, after each of our transactions (delete, update), there were more\n", + "files created within the file system. This is because for each transaction, there are\n", + "different versions of the Delta Lake table.\n", + "\n", + "\n", + "-----\n", + "\n", + "This can be seen by using the DeltaTable.history() method as noted below\n", + "\n", + "Note: You can also perform the same task with SQL:\n", + "\n", + "spark.sql(“DESCRIBE HISTORY ‘” + pathToEventsTable + “’”).show()\n", + "\n", + "As you can see, there are three rows representing the different versions of the table\n", + "(below is an abridged version to help make it easier to read) for each of the operations\n", + "(create table, delete and update):\n", + "\n", + "**Travel back in time with table history**\n", + "With Time Travel, you can review the Delta Lake table as of the version or timestamp.\n", + "To view historical data, specify the version or timestamp option; in the following code\n", + "snippet, we will specify the version option.\n", + "\n", + "\n", + "# Load DataFrames for each version\n", + "\n", + "dfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "0 ).load( “departureDelays.delta” )\n", + "\n", + "dfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n", + "\n", + "1 ).load( “departureDelays.delta” )\n", + "\n", + "dfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "2 ).load( “departureDelays.delta” )\n", + "\n", + "# Calculate the SEA to SFO flight counts for each version of history\n", + "\n", + "cnt0 = dfv0. where( “origin = ‘SEA’”). 
where ( “destination = ‘SFO’” ).count()\n", + "\n", + "cnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
230f954a58a1e8798d1b7d8266b0f3ed# Load DataFrames for each version\n", + "\n", + "dfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "0 ).load( “departureDelays.delta” )\n", + "\n", + "dfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n", + "\n", + "1 ).load( “departureDelays.delta” )\n", + "\n", + "dfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "2 ).load( “departureDelays.delta” )\n", + "\n", + "# Calculate the SEA to SFO flight counts for each version of history\n", + "\n", + "cnt0 = dfv0. where( “origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "cnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "cnt2 = dfv2. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "# Print out the value\n", + "\n", + "print ( “SEA -> SFO Counts: Create Table: %s, Delete: %s, Update: %s” %\n", + "\n", + "(cnt0, cnt1, cnt2))\n", + "\n", + "## Output\n", + "\n", + "SEA -> SFO Counts : Create Table: 1698 , Delete: 837, Update: 986\n", + "\n", + "Whether for governance, risk management and compliance (GRC) or rolling back\n", + "errors, the Delta Lake table contains both the metadata (e.g., recording the fact that a\n", + "delete had occurred with these operators) and data (e.g., the actual rows deleted). But\n", + "how do we remove the data files either for compliance or size reasons?\n", + "\n", + "**Clean up old table versions with vacuum**\n", + "The [Delta Lake vacuum](https://docs.delta.io/0.7.0/delta-utility.html#vacuum) method will delete all of the rows (and files) by default that are\n", + "older than seven days’ reference. If you were to view the file system, you’ll notice the\n", + "11 files for your table.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 - 5e52736b -0e63- 48f3 - 8d56 - 50f7cfa0494d -c000.snappy.parquet\n", + "\n", + "part- 00000 - 69eb53d5 - 34b4 - 408f -a7e4- 86e000428c37 -c000.snappy.parquet\n", + "\n", + "\n", + "-----\n", + "\n", + "part- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n", + "\n", + "part- 00001 - 20893eed - 9d4f - 4c1f -b619- 3e6ea1fdd05f -c000.snappy.parquet\n", + "\n", + "part- 00001 - 9b68b9f6 - bad3 - 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n", + "\n", + "part- 00001 - d4823d2e - 8f9d - 42e3 - 918d - 4060969e5844 -c000.snappy.parquet\n", + "\n", + "part- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n", + "\n", + "part- 00002 - 3027786c - 20a9 - 4b19 - 868d -dc7586c275d4-c000.snappy.parquet\n", + "\n", + "part- 00002 -f2609f27- 3478 - 4bf9 -aeb7- 2c78a05e6ec1 -c000.snappy.parquetSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
ca33594916a7cf4ce26423a0d3d5f337part- 00001 - 9b68b9f6 - bad3 - 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n", + "\n", + "part- 00001 - d4823d2e - 8f9d - 42e3 - 918d - 4060969e5844 -c000.snappy.parquet\n", + "\n", + "part- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n", + "\n", + "part- 00002 - 3027786c - 20a9 - 4b19 - 868d -dc7586c275d4-c000.snappy.parquet\n", + "\n", + "part- 00002 -f2609f27- 3478 - 4bf9 -aeb7- 2c78a05e6ec1 -c000.snappy.parquet\n", + "\n", + "part- 00003 - 850436a6 -c4dd- 4535 -a1c0- 5dc0f01d3d55 -c000.snappy.parquet\n", + "\n", + "Part- 00003 -b9292122- 99a7 -4223-aaa9- 8646c281f199 -c000.snappy.parquet\n", + "\n", + "To delete all of the files so that you only keep the current snapshot of data, you will specify a\n", + "small value for the vacuum method (instead of the default retention of 7 days).\n", + "\n", + "# Remove all files older than 0 hours old.\n", + "\n", + "deltaTable.vacuum( 0 )\n", + "\n", + "Note , you perform the same task via SQL syntax:¸\n", + "\n", + "# Remove all files older than 0 hours old\n", + "\n", + "spark.sql(“VACUUM ‘” + pathToEventsTable + “‘ RETAIN 0 HOURS”)\n", + "\n", + "Once the vacuum has completed, when you review the file system you will notice fewer\n", + "files as the historical data has been removed.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n", + "\n", + "part- 00001 - 9b68b9f6 -bad3- 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n", + "\n", + "part- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n", + "\n", + "part- 00003 -b9292122- 99a7 - 4223 -aaa9- 8646c281f199 -c000.snappy.parquet\n", + "\n", + "Note, the ability to time travel back to a version older than the retention period is lost\n", + "after running vacuum.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Time Travel for**\n", + "**Large-Scale Data Lakes**\n", + "\n", + "Time travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . [Delta Lake](https://delta.io/) is an [open-source storage](https://github.com/delta-io/delta)\n", + "[layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable\n", + "metadata handling, and unifies streaming and batch data processing. Delta Lake runs on\n", + "top of your existing data lake and is fully compatible with Apache Spark APIs.\n", + "\n", + "With this feature, Delta Lake automatically versions the big data that you store in your\n", + "data lake, and you can access any historical version of that data. This temporal data\n", + "management simplifies your data pipeline by making it easy to audit, roll back data\n", + "in case of accidental bad writes or deletes, and reproduce experiments and reports.\n", + "\n", + "Your organization can finally standardize on a clean, centralized, versioned big data\n", + "repository in your own cloud storage for your analytics.\n", + "\n", + "**Common challenges with changing data**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
88cd3f53e2c6a1283a4224629fc87774Time travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . [Delta Lake](https://delta.io/) is an [open-source storage](https://github.com/delta-io/delta)\n", + "[layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable\n", + "metadata handling, and unifies streaming and batch data processing. Delta Lake runs on\n", + "top of your existing data lake and is fully compatible with Apache Spark APIs.\n", + "\n", + "With this feature, Delta Lake automatically versions the big data that you store in your\n", + "data lake, and you can access any historical version of that data. This temporal data\n", + "management simplifies your data pipeline by making it easy to audit, roll back data\n", + "in case of accidental bad writes or deletes, and reproduce experiments and reports.\n", + "\n", + "Your organization can finally standardize on a clean, centralized, versioned big data\n", + "repository in your own cloud storage for your analytics.\n", + "\n", + "**Common challenges with changing data**\n", + "\n", + "- **Audit data changes:** Auditing data changes is critical both in terms of data\n", + "compliance as well as simple debugging to understand how data has changed over\n", + "time. Organizations moving from traditional data systems to big data technologies\n", + "and the cloud struggle in such scenarios.\n", + "\n", + "- **Reproduce experiments and reports:** During model training, data scientists\n", + "run various experiments with different parameters on a given set of data. When\n", + "scientists revisit their experiments after a period of time to reproduce the models,\n", + "typically the source data has been modified by upstream pipelines. A lot of times,\n", + "they are caught unaware by such upstream data changes and hence struggle to\n", + "reproduce their experiments. Some scientists and organizations engineer best\n", + "\n", + "\n", + "-----\n", + "\n", + "practices by creating multiple copies of the data, leading to increased storage\n", + "costs. The same is true for analysts generating reports.\n", + "\n", + "- **Rollbacks:** Data pipelines can sometimes write bad data for downstream consumers.\n", + "\n", + "This can happen because of issues ranging from infrastructure instabilities to messy\n", + "data to bugs in the pipeline. For pipelines that do simple appends to directories or a\n", + "table, rollbacks can easily be addressed by date-based partitioning. With updates\n", + "and deletes, this can become very complicated, and data engineers typically have\n", + "to engineer a complex pipeline to deal with such scenarios.\n", + "\n", + "**Working with Time Travel**\n", + "Delta Lake’s time travel capabilities simplify building data pipelines for the above use\n", + "cases. Time Travel in Delta Lake improves developer productivity tremendously. It helps:\n", + "\n", + "- Data scientists manage their experiments better\n", + "\n", + "- Data engineers simplify their pipelines and roll back bad writes\n", + "\n", + "- Data analysts do easy reporting\n", + "\n", + "Organizations can finally standardize on a clean, centralized, versioned big data\n", + "repository in their own cloud storage for analytics. We are thrilled to see what you will\n", + "be able to accomplish with this feature.\n", + "\n", + "As you write into a Delta Lake table or directory, every operation is automatically\n", + "versioned. 
You can access the different versions of the data two different ways:\n", + "\n", + "**1. Using a timestamp**\n", + "**Scala syntax**\n", + "You can provide the timestamp or date string as an option to DataFrame reader:\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” )\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "\n", + "-----\n", + "\n", + "**Python syntax**\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” ) \\\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01”\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01 01:30:00.000”SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
5d0a6e641d2854282bc57f0b4249014aval df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” )\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "\n", + "-----\n", + "\n", + "**Python syntax**\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” ) \\\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01”\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01 01:30:00.000”\n", + "\n", + "If the reader code is in a library that you don’t have access to, and if you are passing\n", + "input parameters to the library to read data, you can still travel back in time for a table\n", + "by passing the timestamp in yyyyMMddHHmmssSSS format to the path:\n", + "\n", + "val inputPath = “/path/to/my/table@20190101000000000”\n", + "\n", + "val df = loadData(inputPath)\n", + "\n", + "// Function in a library that you don’t have access to\n", + "\n", + "def loadData(inputPath : String ) : DataFrame = {\n", + "\n", + "spark.read\n", + "\n", + ".format(“delta”)\n", + "\n", + ".load(inputPath)\n", + "\n", + "}\n", + "\n", + "inputPath = “/path/to/my/table@20190101000000000”\n", + "\n", + "df = loadData(inputPath)\n", + "\n", + "# Function in a library that you don’t have access to\n", + "\n", + "def loadData(inputPath):\n", + "\n", + "return spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load(inputPath)\n", + "\n", + "\n", + "-----\n", + "\n", + "**2. Using a version number**\n", + "In Delta Lake, every write has a version number, and you can use the version number\n", + "to travel back in time as well.\n", + "\n", + "**Scala syntax**\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".option( “versionAsOf” , “5238” )\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".load( “/path/to/my/table@v5238” )\n", + "\n", + "**Python syntax**\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".option( “versionAsOf” , “5238” ) \\\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load( “/path/to/my/table@v5238” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table VERSION AS OF 5238\n", + "\n", + "\n", + "-----\n", + "\n", + "**Audit data changes**\n", + "You can look at the history of table changes using the DESCRIBE HISTORY command\n", + "or through the UI.\n", + "\n", + "**Reproduce experiments and reports**\n", + "Time travel also plays an important role in machine learning and data science.\n", + "Reproducibility of models and experiments is a key consideration for data scientists\n", + "because they often create hundreds of models before they put one into production,\n", + "and in that time-consuming process would like to go back to earlier models. However,\n", + "because data management is often separate from data science tools, this is really\n", + "hard to accomplish.\n", + "\n", + "Databricks solves this reproducibility problem by integrating Delta Lake’s Time\n", + "Travel capabilities with [MLflow](https://mlflow.org/) , an open-source platform for the machine learning\n", + "lifecycle. 
For reproducible machine learning training, you can simply log aSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
711c60d6bd61a75a8e06d2833cbe0ac5.format( “delta” ) \\\n", + "\n", + ".load( “/path/to/my/table@v5238” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table VERSION AS OF 5238\n", + "\n", + "\n", + "-----\n", + "\n", + "**Audit data changes**\n", + "You can look at the history of table changes using the DESCRIBE HISTORY command\n", + "or through the UI.\n", + "\n", + "**Reproduce experiments and reports**\n", + "Time travel also plays an important role in machine learning and data science.\n", + "Reproducibility of models and experiments is a key consideration for data scientists\n", + "because they often create hundreds of models before they put one into production,\n", + "and in that time-consuming process would like to go back to earlier models. However,\n", + "because data management is often separate from data science tools, this is really\n", + "hard to accomplish.\n", + "\n", + "Databricks solves this reproducibility problem by integrating Delta Lake’s Time\n", + "Travel capabilities with [MLflow](https://mlflow.org/) , an open-source platform for the machine learning\n", + "lifecycle. For reproducible machine learning training, you can simply log a\n", + "\n", + "\n", + "timestamped URL to the path as an MLflow parameter to track which version of the\n", + "data was used for each training job.\n", + "\n", + "This enables you to go back to earlier settings and data sets to reproduce earlier\n", + "models. You neither need to coordinate with upstream teams on the data nor worry\n", + "about cloning data for different experiments. This is the power of unified analytics,\n", + "whereby data science is closely married with data engineering.\n", + "\n", + "**Rollbacks**\n", + "Time travel also makes it easy to do rollbacks in case of bad writes. For example, if\n", + "your GDPR pipeline job had a bug that accidentally deleted user information, you can\n", + "easily fix the pipeline:\n", + "\n", + "INSERT INTO my_table\n", + "\n", + "SELECT - FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n", + "\n", + "WHERE userId = 111\n", + "\n", + "\n", + "-----\n", + "\n", + "You can also fix incorrect updates as follows:\n", + "\n", + "# Will use the latest version of the table for all operations below\n", + "\n", + "MERGE INTO my_table target\n", + "\n", + "\n", + "USING my_table TIMESTAMP AS OF date_sub(current_date(), 1 ) source\n", + "\n", + "ON source.userId = target.userId\n", + "\n", + "WHEN MATCHED THEN UPDATE SET - \n", + "\n", + "If you simply want to roll back to a previous version of your table, you can do so with\n", + "either of the following commands:\n", + "\n", + "RESTORE TABLE my_table VERSION AS OF [version_number]\n", + "\n", + "RESTORE TABLE my_table TIMESTAMP AS OF [timestamp]\n", + "\n", + "**Pinned view of a continuously updating**\n", + "**Delta Lake table across multiple downstream jobs**\n", + "With AS OF queries, you can now pin the snapshot of a continuously updating Delta\n", + "Lake table for multiple downstream jobs. 
Consider a situation where a Delta Lake table\n", + "is being continuously updated, say every 15 seconds, and there is a downstream job\n", + "that periodically reads from this Delta Lake table and updates different destinations.\n", + "In such scenarios, typically you want a consistent view of the source Delta Lake table\n", + "so that all destination tables reflect the same state.\n", + "\n", + "You can now easily handle such scenarios as follows:\n", + "\n", + "version = spark.sql( “SELECT max(version) FROM (DESCRIBE HISTORY\n", + "\n", + "my_table)” ).collect()\n", + "\n", + "\n", + "data = spark.table( “my_table@v%s” % version[ 0 ][ 0 ]data.where\n", + "\n", + "( “event_type = e1” ).write.jdbc( “table1” )\n", + "\n", + "data.where( “event_type = e2” ).write.jdbc( “table2” )\n", + "\n", + "...\n", + "\n", + "data.where( “event_type = e10” ).write.jdbc( “table10” )\n", + "\n", + "**Queries for time series analytics made simple**\n", + "Time travel also simplifies time series analytics. For example, if you want to find out\n", + "how many new customers you added over the last week, your query could be a very\n", + "simple one like this:\n", + "\n", + "SELECT count( distinct userId) - (\n", + "\n", + "SELECT count( distinct userId)\n", + "\n", + "FROM my_table TIMESTAMP AS OF date_sub( current_date (), 7))\n", + "\n", + "FROM my_table\n", + "\n", + "**Additional resources**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
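The pinned-snapshot snippet above is garbled in the extracted text; here is a minimal PySpark sketch of the same idea, using the Delta `history()` API to find the latest version instead of the `DESCRIBE HISTORY` subquery, with a hypothetical JDBC destination and illustrative `event_type` values:

```python
from delta.tables import DeltaTable

# Pin one version of the continuously updated source table, then have every
# downstream write read that same snapshot so all destinations agree.
latest = (
    DeltaTable.forName(spark, "my_table")
    .history()
    .selectExpr("max(version) AS v")
    .collect()[0]["v"]
)

data = spark.sql(f"SELECT * FROM my_table VERSION AS OF {latest}")

jdbc_url = "jdbc:postgresql://host:5432/db"        # hypothetical destination
props = {"user": "analytics", "password": "..."}   # hypothetical credentials

data.where("event_type = 'e1'").write.jdbc(jdbc_url, "table1", properties=props)
data.where("event_type = 'e2'").write.jdbc(jdbc_url, "table2", properties=props)
# ... one write per event_type, all reading the identical pinned snapshot ...
```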
aa0e375a10a06dfdfa31a3d111220cd6my_table)” ).collect()\n", + "\n", + "\n", + "data = spark.table( “my_table@v%s” % version[ 0 ][ 0 ]data.where\n", + "\n", + "( “event_type = e1” ).write.jdbc( “table1” )\n", + "\n", + "data.where( “event_type = e2” ).write.jdbc( “table2” )\n", + "\n", + "...\n", + "\n", + "data.where( “event_type = e10” ).write.jdbc( “table10” )\n", + "\n", + "**Queries for time series analytics made simple**\n", + "Time travel also simplifies time series analytics. For example, if you want to find out\n", + "how many new customers you added over the last week, your query could be a very\n", + "simple one like this:\n", + "\n", + "SELECT count( distinct userId) - (\n", + "\n", + "SELECT count( distinct userId)\n", + "\n", + "FROM my_table TIMESTAMP AS OF date_sub( current_date (), 7))\n", + "\n", + "FROM my_table\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Diving Into Delta Lake: Unpacking the Transaction Log](https://databricks.com/discover/diving-into-delta-lake-talks/unpacking-transaction-log)\n", + "\n", + "[Tech Talk | Getting Data Ready for Data Science With Delta Lake and MLflow](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks/getting-data-ready-data-science-delta-lake-mlflow)\n", + "\n", + "[Data + AI Summit Europe 2020 | Data Time Travel by Delta Time Machine](https://databricks.com/session_eu20/data-time-travel-by-delta-time-machine-2)\n", + "\n", + "[Spark + AI Summit NA 2020 | Machine Learning Data Lineage With](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "[MLflow and Delta Lake](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "[Productionizing Machine Learning With Delta Lake](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Easily Clone Your Delta Lake**\n", + "**for Testing, Sharing and ML**\n", + "**Reproducibility**\n", + "\n", + "Delta Lake has a feature called **Table Cloning** , which makes it easy to test, share and\n", + "recreate tables for ML reproducibility. Creating copies of tables in a data lake or data\n", + "warehouse has several practical uses. However, given the volume of data in tables\n", + "in a data lake and the rate of its growth, making physical copies of tables is an\n", + "expensive operation.\n", + "\n", + "Delta Lake now makes the process simpler and cost-effective with the help of\n", + "table clones.\n", + "\n", + "**What are clones?**\n", + "Clones are replicas of a source table at a given point in time. They have the same\n", + "metadata as the source table: same schema, constraints, column descriptions, statistics\n", + "and partitioning. However, they behave as a separate table with a separate lineage\n", + "or history. Any changes made to clones only affect the clone and not the source. Any\n", + "changes that happen to the source during or after the cloning process also do not get\n", + "reflected in the clone due to Snapshot Isolation. In Delta Lake we have two types of\n", + "clones: shallow or deep.\n", + "\n", + "**Shallow clones**\n", + "A _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the\n", + "table being cloned; the data files of the table itself are not copied. 
This type of cloning\n", + "does not create another physical copy of the data resulting in minimal storage costs.\n", + "Shallow clones are inexpensive and can be extremely fast to create.\n", + "\n", + "\n", + "-----\n", + "\n", + "These clones are not self-contained and depend on the source from which they were\n", + "cloned as the source of data. If the files in the source that the clone depends on are removed,\n", + "for example with VACUUM, a shallow clone may become unusable. Therefore, shallow\n", + "clones are typically used for short-lived use cases such as testing and experimentation.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
82ac667cde1a486cfd40d0b4dcfae632**Shallow clones**\n", + "A _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the\n", + "table being cloned; the data files of the table itself are not copied. This type of cloning\n", + "does not create another physical copy of the data resulting in minimal storage costs.\n", + "Shallow clones are inexpensive and can be extremely fast to create.\n", + "\n", + "\n", + "-----\n", + "\n", + "These clones are not self-contained and depend on the source from which they were\n", + "cloned as the source of data. If the files in the source that the clone depends on are removed,\n", + "for example with VACUUM, a shallow clone may become unusable. Therefore, shallow\n", + "clones are typically used for short-lived use cases such as testing and experimentation.\n", + "\n", + "**Deep clones**\n", + "Shallow clones are great for short-lived use cases, but some scenarios require a\n", + "separate and independent copy of the table’s data. A deep clone makes a full copy of\n", + "the metadata and the data files of the table being cloned. In that sense, it is similar in\n", + "functionality to copying with a CTAS command ( CREATE TABLE.. AS… SELECT… ).\n", + "But it is simpler to specify since it makes a faithful copy of the original table at the\n", + "specified version, and you don’t need to re-specify partitioning, constraints and other\n", + "information as you have to do with CTAS. In addition, it is much faster, robust and can\n", + "work in an incremental manner against failures.\n", + "\n", + "With deep clones, we copy additional metadata, such as your streaming application\n", + "transactions and COPY INTO transactions, so you can continue your ETL applications\n", + "exactly where it left off on a deep clone.\n", + "\n", + "**Where do clones help?**\n", + "Sometimes I wish I had a clone to help with my chores or magic tricks. However, we’re\n", + "not talking about human clones here. There are many scenarios where you need a\n", + "copy of your data sets — for exploring, sharing or testing ML models or analytical\n", + "queries. Below are some examples of customer use cases.\n", + "\n", + "**Testing and experimentation with a production table**\n", + "When users need to test a new version of their data pipeline they often have to rely\n", + "on sample test data sets that are not representative of all the data in their production\n", + "environment. Data teams may also want to experiment with various indexing techniques\n", + "to improve the performance of queries against massive tables. These experiments and\n", + "\n", + "\n", + "tests cannot be carried out in a production environment without risking production\n", + "data processes and affecting users.\n", + "\n", + "It can take many hours or even days, to spin up copies of your production tables for a test\n", + "or a development environment. Add to that, the extra storage costs for your development\n", + "environment to hold all the duplicated data — there is a large overhead in setting a test\n", + "environment reflective of the production data. 
With a shallow clone, this is trivial:\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE delta.`/some/test/location` SHALLOW CLONE prod.events\n", + "\n", + "# Python\n", + "\n", + "DeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n", + "\n", + "isShallow=True)\n", + "\n", + "// Scala\n", + "\n", + "DeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n", + "\n", + "isShallow=true)\n", + "\n", + "After creating a shallow clone of your table in a matter of seconds, you can start\n", + "running a copy of your pipeline to test out your new code, or try optimizing your table\n", + "in different dimensions to see how you can improve your query performance, and much\n", + "much more. These changes will only affect your shallow clone, not your original table.\n", + "\n", + "**Staging major changes to a production table**\n", + "Sometimes, you may need to perform some major changes to your production table.\n", + "These changes may consist of many steps, and you don’t want other users to see the\n", + "changes that you’re making until you’re done with all of your work. A shallow clone can\n", + "help you out here:\n", + "\n", + "\n", + "-----\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n", + "\n", + "DELETE FROM temp.staged_changes WHERE event_id is null;\n", + "\n", + "UPDATE temp.staged_changes SET change_date = current_date()\n", + "\n", + "WHERE change_date is null;\n", + "\n", + "...\n", + "\n", + "-- Perform your verificationsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
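A small usage sketch of the testing workflow above, driven from PySpark with `spark.sql`; the table name and test location are the hypothetical ones from the excerpt:

```python
# Zero-copy test clone of the production table; only metadata is copied.
spark.sql("CREATE TABLE delta.`/some/test/location` SHALLOW CLONE prod.events")

# Point the pipeline under test at the clone; prod.events is never touched.
test_df = spark.read.format("delta").load("/some/test/location")
# ... run the new pipeline logic or indexing experiments against the clone ...
```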
3b5c7eb5038ea40cfd3fa6b26ef3f196After creating a shallow clone of your table in a matter of seconds, you can start\n", + "running a copy of your pipeline to test out your new code, or try optimizing your table\n", + "in different dimensions to see how you can improve your query performance, and much\n", + "much more. These changes will only affect your shallow clone, not your original table.\n", + "\n", + "**Staging major changes to a production table**\n", + "Sometimes, you may need to perform some major changes to your production table.\n", + "These changes may consist of many steps, and you don’t want other users to see the\n", + "changes that you’re making until you’re done with all of your work. A shallow clone can\n", + "help you out here:\n", + "\n", + "\n", + "-----\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n", + "\n", + "DELETE FROM temp.staged_changes WHERE event_id is null;\n", + "\n", + "UPDATE temp.staged_changes SET change_date = current_date()\n", + "\n", + "WHERE change_date is null;\n", + "\n", + "...\n", + "\n", + "-- Perform your verifications\n", + "\n", + "Once you’re happy with the results, you have two options. If no other change has\n", + "been made to your source table, you can replace your source table with the clone.\n", + "If changes have been made to your source table, you can merge the changes into\n", + "your source table.\n", + "\n", + "-- If no changes have been made to the source\n", + "\n", + "REPLACE TABLE prod.events CLONE temp.staged_changes;\n", + "\n", + "-- If the source table has changed\n", + "\n", + "MERGE INTO prod.events USING temp.staged_changes\n", + "\n", + "ON events.event_id <=> staged_changes.event_id\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *;\n", + "\n", + "-- Drop the staged table\n", + "\n", + "DROP TABLE temp.staged_changes;\n", + "\n", + "**Machine learning result reproducibility**\n", + "Coming up with an effective ML model is an iterative process. Throughout this process\n", + "of tweaking the different parts of the model, data scientists need to assess the\n", + "accuracy of the model against a fixed data set.\n", + "\n", + "This is hard to do in a system where the data is constantly being loaded or updated. A\n", + "snapshot of the data used to train and test the model is required. 
This snapshot allows\n", + "the results of the ML model to be reproducible for testing or model governance purposes.\n", + "\n", + "\n", + "-----\n", + "\n", + "We recommend leveraging [Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) to run multiple experiments across a snapshot; an\n", + "example of this in action can be seen in [Machine Learning Data Lineage With MLflow](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "[and Delta Lake.](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "Once you’re happy with the results and would like to archive the data for later retrieval,\n", + "for example, next Black Friday, you can use deep clones to simplify the archiving process.\n", + "MLflow integrates really well with Delta Lake, and the autologging feature (mlflow.spark.\n", + "autolog() ) will tell you which version of the table was used to run a set of experiments.\n", + "\n", + "# Run your ML workloads using Python and then\n", + "\n", + "DeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n", + "\n", + "store_bf2020”)\n", + "\n", + "**Data migration**\n", + "A massive table may need to be moved to a new, dedicated bucket or storage system\n", + "for performance or governance reasons. The original table will not receive new\n", + "updates going forward and will be deactivated and removed at a future point in time.\n", + "Deep clones make the copying of massive tables more robust and scalable.\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n", + "\n", + "ALTER TABLE prod.events SET LOCATION ‘zz://my-new-bucket/events’;\n", + "\n", + "With deep clones, since we copy your streaming application transactions and\n", + "COPY INTO transactions, you can continue your ETL applications from exactly where\n", + "it left off after this migration!SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
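A short sketch combining the two ideas above (Spark autologging plus a deep clone of a pinned version); the table name, target name and version number are the illustrative ones quoted in the excerpt:

```python
import mlflow
from delta.tables import DeltaTable

# Autologging records which Delta table version each Spark read used,
# so an experiment can later be traced back to an exact snapshot.
mlflow.spark.autolog()

features = spark.table("feature_store")
# ... run ML experiments against `features` ...

# Archive that snapshot as an independent deep clone for later retrieval
# (version 128 and the target name come from the excerpt above).
DeltaTable.forName(spark, "feature_store").cloneAtVersion(128, "feature_store_bf2020")
```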
6a1474e01a1c53beb43c0e56be508317# Run your ML workloads using Python and then\n", + "\n", + "DeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n", + "\n", + "store_bf2020”)\n", + "\n", + "**Data migration**\n", + "A massive table may need to be moved to a new, dedicated bucket or storage system\n", + "for performance or governance reasons. The original table will not receive new\n", + "updates going forward and will be deactivated and removed at a future point in time.\n", + "Deep clones make the copying of massive tables more robust and scalable.\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n", + "\n", + "ALTER TABLE prod.events SET LOCATION ‘zz://my-new-bucket/events’;\n", + "\n", + "With deep clones, since we copy your streaming application transactions and\n", + "COPY INTO transactions, you can continue your ETL applications from exactly where\n", + "it left off after this migration!\n", + "\n", + "**Data sharing**\n", + "In an organization, it is often the case that users from different departments are\n", + "looking for data sets that they can use to enrich their analysis or models. You may\n", + "want to share your data with other users across the organization. But rather than\n", + "setting up elaborate pipelines to move the data to yet another store, it is often easier\n", + "and economical to create a copy of the relevant data set for users to explore and\n", + "\n", + "\n", + "-----\n", + "\n", + "**Looks awesome! Any gotchas?**\n", + "Just to reiterate some of the gotchas mentioned above as a single list, here’s what you\n", + "should be wary of:\n", + "\n", + "- \u0007 \u0007Clones are executed on a snapshot of your data. Any changes that are made to\n", + "the source table after the cloning process starts will not be reflected in the\n", + "clone.\n", + "\n", + "- \u0007 \u0007Shallow clones are not self-contained tables like deep clones. If the data is\n", + "deleted in the source table (for example through VACUUM), your shallow clone\n", + "may not be usable.\n", + "\n", + "- \u0007 \u0007Clones have a separate, independent history from the source table. Time travel\n", + "queries on your source table and clone may not return the same result.\n", + "\n", + "- \u0007 \u0007Shallow clones do not copy stream transactions or COPY INTO metadata. Use\n", + "deep clones to migrate your tables and continue your ETL processes from\n", + "where it left off.\n", + "\n", + "**How can I use it?**\n", + "Shallow and deep clones support new advances in how data teams test and manage\n", + "their modern cloud data lakes and warehouses. Table clones can help your team\n", + "implement production-level testing of their pipelines, fine-tune their indexing for optimal\n", + "query performance, create table copies for sharing — all with minimal overhead and\n", + "expense. If this is a need in your organization, we hope you will take table cloning for\n", + "a spin and give us your feedback — we look forward to hearing about new use cases and\n", + "extensions you would like to see in the future.\n", + "\n", + "**Additional resource**\n", + "\n", + "[Simplifying Disaster Recovery With Delta Lake](https://databricks.com/session_na20/simplifying-disaster-recovery-with-delta-lake)\n", + "\n", + "\n", + "test the data to see if it is a fit for their needs without affecting your own production\n", + "systems. 
Here deep clones again come to the rescue.\n", + "\n", + "-- The following code can be scheduled to run at your convenience\n", + "\n", + "CREATE OR REPLACE TABLE data_science.events CLONE prod.events;\n", + "\n", + "**Data archiving**\n", + "For regulatory or archiving purposes, all data in a table needs to be preserved for a\n", + "certain number of years, while the active table retains data for a few months. If you\n", + "want your data to be updated as soon as possible, but you have a requirement to keep\n", + "data for several years, storing this data in a single table and performing time travel\n", + "may become prohibitively expensive.\n", + "\n", + "In this case, archiving your data in a daily, weekly or monthly manner is a better\n", + "solution. The incremental cloning capability of deep clones will really help you here.\n", + "\n", + "-- The following code can be scheduled to run at your convenience\n", + "\n", + "CREATE OR REPLACE TABLE archive.events CLONE prod.events;\n", + "\n", + "Note that this table will have an independent history compared to the source table,\n", + "therefore, time travel queries on the source table and the clone may return different\n", + "results based on your frequency of archiving.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Enabling Spark SQL DDL**\n", + "**and DML in Delta Lake on**\n", + "**Apache Spark 3.0**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
85528e52b21fbee2e55b92c64392467a**Data archiving**\n", + "For regulatory or archiving purposes, all data in a table needs to be preserved for a\n", + "certain number of years, while the active table retains data for a few months. If you\n", + "want your data to be updated as soon as possible, but you have a requirement to keep\n", + "data for several years, storing this data in a single table and performing time travel\n", + "may become prohibitively expensive.\n", + "\n", + "In this case, archiving your data in a daily, weekly or monthly manner is a better\n", + "solution. The incremental cloning capability of deep clones will really help you here.\n", + "\n", + "-- The following code can be scheduled to run at your convenience\n", + "\n", + "CREATE OR REPLACE TABLE archive.events CLONE prod.events;\n", + "\n", + "Note that this table will have an independent history compared to the source table,\n", + "therefore, time travel queries on the source table and the clone may return different\n", + "results based on your frequency of archiving.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Enabling Spark SQL DDL**\n", + "**and DML in Delta Lake on**\n", + "**Apache Spark 3.0**\n", + "\n", + "The release of [Delta Lake 0.7.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) coincided with the release of [Apache Spark 3.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) , thus\n", + "enabling a new set of features that were simplified using Delta Lake from SQL. Here\n", + "are some of the key features.\n", + "\n", + "**Support for SQL DDL commands**\n", + "**to define tables in the** **[Hive metastore](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)**\n", + "You can now define Delta tables in the [Hive](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore) metastore and use the table name in all\n", + "SQL operations when creating (or replacing) tables.\n", + "\n", + "**Create or replace tables**\n", + "\n", + "-- Create table in the metastore\n", + "\n", + "CREATE TABLE events (\n", + "\n", + "date DATE,\n", + "\n", + "eventId STRING,\n", + "\n", + "eventType STRING,\n", + "\n", + "data STRING)\n", + "\n", + "USING DELTA\n", + "\n", + "PARTITIONED BY (date)\n", + "\n", + "LOCATION ‘/delta/events’\n", + "\n", + "-- If a table with the same name already exists, the table is replaced\n", + "\n", + "with\n", + "\n", + "the new configuration, else it i s created\n", + "\n", + "CREATE OR REPLACE TABLE events (\n", + "\n", + "\n", + "-----\n", + "\n", + "date DATE,\n", + "\n", + "eventId STRING,\n", + "\n", + "eventType STRING,\n", + "\n", + "data STRING)\n", + "\n", + "\n", + "INSERT INTO events SELECT * FROM newEvents\n", + "\n", + "-- To atomically replace all of the data in a table, you can use\n", + "\n", + "overwrite mode\n", + "\n", + "INSERT OVERWRITE events SELECT * FROM newEvents\n", + "\n", + "\n", + "USING DELTA\n", + "\n", + "\n", + "PARTITIONED BY (date)\n", + "\n", + "LOCATION ‘/delta/events’\n", + "\n", + "**Explicitly alter the table schema**\n", + "\n", + "-- Alter table and schema\n", + "\n", + "\n", + "-- Delete events\n", + "\n", + "DELETE FROM events WHERE date < ‘2017-01-01’\n", + "\n", + "-- Update events\n", + "\n", + "UPDATE events SET eventType = ‘click’ WHERE eventType = ‘click’\n", + "\n", + "\n", + "ALTER TABLE table_name ADD COLUMNS (\n", + "\n", + "\n", + "col_name data_type\n", + "\n", + "[COMMENT 
col_comment]\n", + "\n", + "[FIRST|AFTER colA_name],\n", + "\n", + "...)\n", + "\n", + "You can also use the Scala/Java/Python APIs:\n", + "\n", + "- DataFrame.saveAsTable(tableName) and DataFrameWriterV2\n", + "APIs ( [#307](https://github.com/delta-io/delta/issues/307) ).\n", + "\n", + "- \u0007DeltaTable.forName(tableName) API to create instances of\n", + "io.delta.tables .DeltaTable which is useful for executing\n", + "Update/Delete/Merge operations in Scala/Java/Python.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
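A brief sketch of the Python API route mentioned above, assuming the `events` Delta table from the SQL examples and an active SparkSession named `spark`:

```python
from delta.tables import DeltaTable

events = DeltaTable.forName(spark, "events")

# Equivalent of: DELETE FROM events WHERE date < '2017-01-01'
events.delete("date < '2017-01-01'")

# Equivalent of an UPDATE that rewrites a mistyped event type
# (the condition value here is illustrative).
events.update(
    condition="eventType = 'clck'",
    set={"eventType": "'click'"},
)
```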
4d04b69d8db8500adb050d5a90426616LOCATION ‘/delta/events’\n", + "\n", + "**Explicitly alter the table schema**\n", + "\n", + "-- Alter table and schema\n", + "\n", + "\n", + "-- Delete events\n", + "\n", + "DELETE FROM events WHERE date < ‘2017-01-01’\n", + "\n", + "-- Update events\n", + "\n", + "UPDATE events SET eventType = ‘click’ WHERE eventType = ‘click’\n", + "\n", + "\n", + "ALTER TABLE table_name ADD COLUMNS (\n", + "\n", + "\n", + "col_name data_type\n", + "\n", + "[COMMENT col_comment]\n", + "\n", + "[FIRST|AFTER colA_name],\n", + "\n", + "...)\n", + "\n", + "You can also use the Scala/Java/Python APIs:\n", + "\n", + "- DataFrame.saveAsTable(tableName) and DataFrameWriterV2\n", + "APIs ( [#307](https://github.com/delta-io/delta/issues/307) ).\n", + "\n", + "- \u0007DeltaTable.forName(tableName) API to create instances of\n", + "io.delta.tables .DeltaTable which is useful for executing\n", + "Update/Delete/Merge operations in Scala/Java/Python.\n", + "\n", + "**Support for SQL Insert, Delete, Update and Merge**\n", + "One of the most frequent questions through our [Delta Lake Tech Talks](https://databricks.com/discover/diving-into-delta-lake-talks) was when\n", + "would DML operations such as delete, update and merge be available in Spark SQL?\n", + "Wait no more, these operations are now available in SQL! Below are examples of how\n", + "you can write delete, update and merge (insert, update, delete and de-duplication\n", + "operations using Spark SQL).\n", + "\n", + "-- Using append mode, you can atomically add new data to an existing\n", + "\n", + "Delta table\n", + "\n", + "\n", + "-- Upsert data to a target Delta\n", + "\n", + "-- table using merge\n", + "\n", + "MERGE INTO events\n", + "\n", + "USING updates\n", + "\n", + "ON events.eventId = updates.eventId\n", + "\n", + "WHEN MATCHED THEN UPDATE\n", + "\n", + "SET events.data = updates.data\n", + "\n", + "WHEN NOT MATCHED THEN INSERT (date, eventId, data)\n", + "\n", + "VALUES (date, eventId, data)\n", + "\n", + "It is worth noting that the merge operation in Delta Lake supports more advanced\n", + "syntax than standard ANSI SQL syntax. For example, merge supports\n", + "\n", + "- \u0007 \u0007Delete actions -- Delete a target when matched with a source row. For example,\n", + "“... WHEN MATCHED THEN DELETE ...”\n", + "\n", + "- \u0007 \u0007Multiple matched actions with clause conditions -- Greater flexibility when target\n", + "and source rows match. For example:\n", + "\n", + "...\n", + "\n", + "WHEN MATCHED AND events.shouldDelete THEN DELETE\n", + "\n", + "WHEN MATCHED THEN UPDATE SET events.data = updates.data\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007Star syntax [-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) Shorthand for setting target column value with the similarly named\n", + "sources column. For example:\n", + "\n", + "\n", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "such as automated manifest generation. 
For example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY) , you can\n", + "block deletes and updates in a Delta table using delta.appendOnly=true .\n", + "\n", + "\n", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "WHEN MATCHED THEN SET *\n", + "\n", + "\n", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "WHEN NOT MATCHED THEN INSERT *\n", + "\n", + "-- equivalent to updating/inserting with event .date = updates.date,\n", + "\n", + "events.eventId = updates.eventId, event .data = updates.dataSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
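The same upsert, including the conditional DELETE clause and catch-all UPDATE described above, can also be expressed with the Python merge builder; a sketch assuming a hypothetical `updates` table as the change source:

```python
from delta.tables import DeltaTable

events = DeltaTable.forName(spark, "events")
updates = spark.table("updates")  # hypothetical source of changes

(
    events.alias("events")
    .merge(updates.alias("updates"), "events.eventId = updates.eventId")
    .whenMatchedDelete(condition="events.shouldDelete")   # conditional delete action
    .whenMatchedUpdate(set={"data": "updates.data"})      # fallback matched action
    .whenNotMatchedInsert(
        values={
            "date": "updates.date",
            "eventId": "updates.eventId",
            "data": "updates.data",
        }
    )
    .execute()
)
```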
6abd3284045bf3c58bcecc212ea2a929[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "such as automated manifest generation. For example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY) , you can\n", + "block deletes and updates in a Delta table using delta.appendOnly=true .\n", + "\n", + "\n", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "WHEN MATCHED THEN SET *\n", + "\n", + "\n", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "WHEN NOT MATCHED THEN INSERT *\n", + "\n", + "-- equivalent to updating/inserting with event .date = updates.date,\n", + "\n", + "events.eventId = updates.eventId, event .data = updates.data\n", + "\n", + "**Automatic and incremental Presto/Athena manifest**\n", + "**generation**\n", + "As noted in [Query Delta Lake Tables From Presto and Athena, Improved Operations](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "[Concurrency, and Merge Performance,](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) Delta Lake supports other processing engines\n", + "to read Delta Lake by using manifest files; the manifest files contain the list of the\n", + "most current version of files as of manifest generation. As described in the preceding\n", + "chapter, you will need to:\n", + "\n", + "- Generate a Delta Lake manifest file\n", + "\n", + "- Configure Presto or Athena to read the generated manifests\n", + "\n", + "- Manually re-generate (update) the manifest file\n", + "\n", + "New for Delta Lake 0.7.0 is the capability to update the manifest file automatically\n", + "with the following command:\n", + "\n", + "ALTER TABLE delta.`pathToDeltaTable`\n", + "\n", + "SET TBLPROPERTIES(\n", + "\n", + "delta.compatibility.symlinkFormatManifest.enabled=true\n", + "\n", + ")\n", + "\n", + "**Configuring your table through table properties**\n", + "With the ability to set table properties on your table by using ALTER TABLE SET\n", + "TBLPROPERTIES, you can enable, disable or configure many features of Delta Lake\n", + "\n", + "\n", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "You can also easily control the history of your Delta Lake table retention by the\n", + "following [properties](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html) :\n", + "\n", + "- \u0007 delta.logRetentionDuration: Controls how long the history for a table\n", + "(i.e., transaction log history) is kept. By default, 30 days of history is kept, but you may\n", + "want to alter this value based on your requirements (e.g., GDPR historical context)\n", + "\n", + "- \u0007delta.deletedFileRetentionDuration: Controls how long ago a file\n", + "must have been deleted before being a candidate for VACUUM. By default, data\n", + "files older than seven days are deleted.\n", + "\n", + "As of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to\n", + "configure these properties.\n", + "\n", + "ALTER TABLE delta. 
`pathToDeltaTable`\n", + "\n", + "SET TBLPROPERTIES(\n", + "\n", + "delta.logRetentionDuration = “interval “\n", + "\n", + "delta.deletedFileRetentionDuration = “interval “\n", + "\n", + ")\n", + "\n", + "**Support for adding user-defined metadata**\n", + "**in Delta Lake table commits**\n", + "You can specify user-defined strings as metadata in commits made by Delta\n", + "Lake table operations, either using the DataFrameWriter option userMetadata or\n", + "the SparkSession configuration spark.databricks.delta.commitInfo.\n", + "userMetadata .SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
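A hedged PySpark sketch of both configuration paths described above; the retention intervals and the appended DataFrame are illustrative, while the metadata string is the one used later in the excerpt:

```python
# Configure history and deleted-file retention on an existing table
# ('interval 30 days' / 'interval 7 days' are illustrative values).
spark.sql("""
  ALTER TABLE delta.`/path/to/my/table`
  SET TBLPROPERTIES (
    'delta.logRetentionDuration' = 'interval 30 days',
    'delta.deletedFileRetentionDuration' = 'interval 7 days'
  )
""")

df = spark.table("updates_to_apply")  # hypothetical DataFrame being appended

# Attach user-defined metadata to a single commit via the writer option...
(
    df.write.format("delta")
    .mode("append")
    .option("userMetadata", "GDPR: DELETE Request 1x891jb23")
    .save("/path/to/my/table")
)

# ...or to every commit in the session via the Spark conf.
spark.conf.set(
    "spark.databricks.delta.commitInfo.userMetadata",
    "GDPR: DELETE Request 1x891jb23",
)
```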
0f2e4c9a4079b11e8fa2e29dbb6b7555- \u0007delta.deletedFileRetentionDuration: Controls how long ago a file\n", + "must have been deleted before being a candidate for VACUUM. By default, data\n", + "files older than seven days are deleted.\n", + "\n", + "As of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to\n", + "configure these properties.\n", + "\n", + "ALTER TABLE delta. `pathToDeltaTable`\n", + "\n", + "SET TBLPROPERTIES(\n", + "\n", + "delta.logRetentionDuration = “interval “\n", + "\n", + "delta.deletedFileRetentionDuration = “interval “\n", + "\n", + ")\n", + "\n", + "**Support for adding user-defined metadata**\n", + "**in Delta Lake table commits**\n", + "You can specify user-defined strings as metadata in commits made by Delta\n", + "Lake table operations, either using the DataFrameWriter option userMetadata or\n", + "the SparkSession configuration spark.databricks.delta.commitInfo.\n", + "userMetadata .\n", + "\n", + "In the following example, we are deleting a user (1xsdf1) from our data lake per user\n", + "request. To ensure we associate the user’s request with the deletion, we have also\n", + "added the DELETE request ID into the userMetadata.\n", + "\n", + "\n", + "-----\n", + "\n", + "SET spark.databricks.delta.commitInfo.userMetadata={\n", + "\n", + "“GDPR”:”DELETE Request 1x891jb23”\n", + "\n", + "\n", + "There were a lot of great questions during the AMA concerning structured streaming\n", + "and using trigger.once .\n", + "\n", + "\n", + "};\n", + "\n", + "\n", + "For more information, some good resources explaining this concept include:\n", + "\n", + "- [Running Streaming Jobs Once a Day for 10x Cost Savings](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n", + "\n", + "- [Beyond Lambda: Introducing Delta Architecture](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0) : Specifically the cost vs. 
latency\n", + "trade-off discussed here .\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Delta Lake 0.7.0 + Spark 3.0 AMA](https://www.youtube.com/watch?v=xzKqjCB8SWU)\n", + "\n", + "[Tech Talks | Apache Spark 3.0 + Delta Lake](https://www.youtube.com/watch?v=x6RqJYqLoPI&list=PLTPXxbhUt-YWnAgh3RE8DOb46qZF57byx)\n", + "\n", + "[Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0](https://databricks.com/blog/2020/08/27/enabling-spark-sql-ddl-and-dml-in-delta-lake-on-apache-spark-3-0.html)\n", + "\n", + "\n", + "DELETE FROM user_table WHERE user_id = ‘1xsdf1’\n", + "\n", + "When reviewing the [history](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine) operations of the user table (user_table), you can easily\n", + "identify the associated deletion request within the transaction log.\n", + "\n", + "**Other highlights**\n", + "Other highlights for the Delta Lake 0.7.0 release include:\n", + "\n", + "- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop\n", + "3.2 libraries which enables support for Azure Data Lake Storage Gen2.\n", + "\n", + "- Improved support for streaming one-time triggers — With Spark 3.0, we now\n", + "ensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) ( Trigger.Once ) processes all outstanding data\n", + "in a Delta Lake table in a single micro-batch even if rate limits are set with the\n", + "DataStreamReader option maxFilesPerTrigger.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Lakehouse**\n", + "Combining the best elements of data\n", + "lakes and data warehouses\n", + "\n", + "## CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a**\n", + "**Lakehouse?**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
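A minimal structured-streaming sketch of the Trigger.Once behavior noted above, with hypothetical input, output and checkpoint paths; with Spark 3.0 and Delta Lake 0.7.0, the single run drains all outstanding data even though `maxFilesPerTrigger` is set:

```python
query = (
    spark.readStream.format("delta")
    .option("maxFilesPerTrigger", 1000)   # rate limit is ignored by Trigger.Once
    .load("/delta/events")
    .writeStream.format("delta")
    .option("checkpointLocation", "/delta/events_agg/_checkpoint")
    .trigger(once=True)                   # process everything, then stop
    .start("/delta/events_agg")
)
query.awaitTermination()
```

Scheduling this job daily gives the "streaming once a day" cost savings referenced in the resources above.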
b9ecac60d86210f02ec6120208247874**Other highlights**\n", + "Other highlights for the Delta Lake 0.7.0 release include:\n", + "\n", + "- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop\n", + "3.2 libraries which enables support for Azure Data Lake Storage Gen2.\n", + "\n", + "- Improved support for streaming one-time triggers — With Spark 3.0, we now\n", + "ensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) ( Trigger.Once ) processes all outstanding data\n", + "in a Delta Lake table in a single micro-batch even if rate limits are set with the\n", + "DataStreamReader option maxFilesPerTrigger.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Lakehouse**\n", + "Combining the best elements of data\n", + "lakes and data warehouses\n", + "\n", + "## CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a**\n", + "**Lakehouse?**\n", + "\n", + "Over the past few years at Databricks, we’ve seen a new data management architecture\n", + "that emerged independently across many customers and use cases: the **lakehouse.**\n", + "In this chapter, we’ll describe this new architecture and its advantages over previous\n", + "approaches.\n", + "\n", + "Data warehouses have a long history of decision support and business intelligence\n", + "applications. Since its inception in the late 1980s, data warehouse technology\n", + "continued to evolve and MPP architectures led to systems that were able to handle\n", + "larger data sizes.\n", + "\n", + "But while warehouses were great for structured data, a lot of modern enterprises\n", + "have to deal with unstructured data, semi-structured data, and data with high variety,\n", + "velocity and volume. Data warehouses are not suited for many of these use cases, and\n", + "they are certainly not the most cost-efficient.\n", + "\n", + "As companies began to collect large amounts of data from many different sources,\n", + "architects began envisioning a single system to house data for many different\n", + "analytic products and workloads.\n", + "\n", + "About a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n", + "in a variety of formats. While suitable for storing data, data lakes lack some critical\n", + "features: They do not support transactions, they do not enforce data quality, and their\n", + "lack of consistency / isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "\n", + "-----\n", + "\n", + "**A lakehouse combines the best elements**\n", + "**of data lakes and data warehouses**\n", + "A lakehouse is a new data architecture that combines the best elements of data lakes\n", + "and data warehouses.\n", + "\n", + "Lakehouses are enabled by a new system design: implementing similar data structures and data management features to those in a data warehouse, directly on the\n", + "kind of low-cost storage used for data lakes. They are what you would get if you had\n", + "to redesign data warehouses in the modern world, now that cheap and highly reliable\n", + "storage (in the form of object stores) are available.\n", + "\n", + "A lakehouse has the following key features:\n", + "\n", + "- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n", + "be reading and writing data concurrently. 
Support for ACID transactions ensures\n", + "consistency as multiple parties concurrently read or write data, typically using SQL.\n", + "\n", + "\n", + "and batch and streaming jobs. For these reasons, many of the promises of data lakes\n", + "have not materialized and, in many cases, lead to a loss of many of the benefits of data\n", + "warehouses.\n", + "\n", + "The need for a flexible, high-performance system hasn’t abated. Companies\n", + "require systems for diverse data applications including SQL analytics, real-time\n", + "monitoring, data science and machine learning. Most of the recent advances in\n", + "AI have been in better models to process unstructured data (text, images, video,\n", + "audio), but these are precisely the types of data that a data warehouse is not\n", + "optimized for.\n", + "\n", + "A common approach is to use multiple systems — a data lake, several data\n", + "warehouses, and other specialized systems such as streaming, time-series, graph\n", + "and image databases. Having a multitude of systems introduces complexity and,\n", + "more importantly, introduces delay as data professionals invariably need to move\n", + "or copy data between different systems.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
accf6ad13717062292245537ffbd0249- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n", + "be reading and writing data concurrently. Support for ACID transactions ensures\n", + "consistency as multiple parties concurrently read or write data, typically using SQL.\n", + "\n", + "\n", + "and batch and streaming jobs. For these reasons, many of the promises of data lakes\n", + "have not materialized and, in many cases, lead to a loss of many of the benefits of data\n", + "warehouses.\n", + "\n", + "The need for a flexible, high-performance system hasn’t abated. Companies\n", + "require systems for diverse data applications including SQL analytics, real-time\n", + "monitoring, data science and machine learning. Most of the recent advances in\n", + "AI have been in better models to process unstructured data (text, images, video,\n", + "audio), but these are precisely the types of data that a data warehouse is not\n", + "optimized for.\n", + "\n", + "A common approach is to use multiple systems — a data lake, several data\n", + "warehouses, and other specialized systems such as streaming, time-series, graph\n", + "and image databases. Having a multitude of systems introduces complexity and,\n", + "more importantly, introduces delay as data professionals invariably need to move\n", + "or copy data between different systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Schema enforcement and governance:** The lakehouse should have a way to\n", + "support schema enforcement and evolution, supporting DW schema paradigms\n", + "such as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\n", + "[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\n", + "\n", + "- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n", + "reduces staleness and improves recency, reduces latency and lowers the cost of\n", + "having to operationalize two copies of the data in both a data lake and a warehouse.\n", + "\n", + "- **\u0007Storage is decoupled from compute:** In practice, this means storage and compute\n", + "use separate clusters, thus these systems are able to scale to many more\n", + "concurrent users and larger data sizes. Some modern data warehouses also have\n", + "this property.\n", + "\n", + "- **\u0007Openness:** The storage formats they use are open and standardized, such as\n", + "Parquet, and they provide an API so a variety of tools and engines, including\n", + "machine learning and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n", + "The lakehouse can be used to store, refine, analyze and access data types needed\n", + "for many new data applications, including images, video, audio, semi-structured\n", + "data, and text.\n", + "\n", + "- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n", + "analytics. 
Multiple tools might be needed to support all these workloads, but they all\n", + "rely on the same data repository.\n", + "\n", + "- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional\n", + "features. Tools for security and access control are basic requirements. Data governance\n", + "capabilities including auditing, retention and lineage have become essential particularly\n", + "in light of recent privacy regulations. Tools that enable data discovery such as data\n", + "catalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n", + "features only need to be implemented, tested and administered for a single system.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Read the research**\n", + "**Delta Lake: High-Performance ACID**\n", + "**Table Storage Over Cloud Object Stores**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
9b85a3fa086f1fa4e09197bc46d91dab- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n", + "analytics. Multiple tools might be needed to support all these workloads, but they all\n", + "rely on the same data repository.\n", + "\n", + "- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional\n", + "features. Tools for security and access control are basic requirements. Data governance\n", + "capabilities including auditing, retention and lineage have become essential particularly\n", + "in light of recent privacy regulations. Tools that enable data discovery such as data\n", + "catalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n", + "features only need to be implemented, tested and administered for a single system.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Read the research**\n", + "**Delta Lake: High-Performance ACID**\n", + "**Table Storage Over Cloud Object Stores**\n", + "\n", + "**Abstract**\n", + "Cloud object stores such as Amazon S3 are some of the largest and most costeffective storage systems on the planet, making the main attractive target to\n", + "store large data warehouses and data lakes. Unfortunately, their implementation\n", + "as key-value stores makes it difficult to achieve ACID transactions and high\n", + "performance: Metadata operations, such as listing objects, are expensive, and\n", + "consistency guarantees are limited. In this paper, we present Delta Lake, an\n", + "open source ACID table storage layer over cloud object stores initially developed\n", + "at Databricks. Delta Lake uses a transaction log that is compacted into Apache\n", + "Parquet format to provide ACID properties, time travel, and significantly faster\n", + "metadata operations for large tabular data sets (e.g., the ability to quickly search\n", + "billions of table partitions for those relevant to a query). It also leverages this\n", + "design to provide high-level features such as automatic data layout optimization,\n", + "upserts, caching, and audit logs. Delta Lake tables can be accessed from Apache\n", + "Spark, Hive, Presto, Redshift, and other systems. 
Delta Lake is deployed at\n", + "thousands of Databricks customers that process exabytes of data per day, with\n", + "the largest instances managing exabyte-scale data sets and billions of objects.\n", + "\n", + "Authors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu,\n", + "Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja Łuszczak,\n", + "Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi,\n", + "Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n", + "\n", + "Read the full research paper on the [inner workings of the lakehouse](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores) [.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Some early examples**\n", + "The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n", + "Microsoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n", + "enables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\n", + "[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\n", + "examples that focus primarily on BI and other SQL applications.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
0c1a7a0ab76b4274b45f53089582bed3**Some early examples**\n", + "The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n", + "Microsoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n", + "enables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\n", + "[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\n", + "examples that focus primarily on BI and other SQL applications.\n", + "\n", + "Companies that want to build and implement their own systems have access to open\n", + "source file formats (Delta Lake, [Apache Iceberg](https://iceberg.apache.org) , [Apache Hudi](https://hudi.apache.org) ) that are suitable for\n", + "building a lakehouse.\n", + "\n", + "Merging data lakes and data warehouses into a single system means that data teams\n", + "can move faster as they are able to use data without needing to access multiple systems.\n", + "The level of SQL support and integration with BI tools among these early lakehouses\n", + "is generally sufficient for most enterprise data warehouses. Materialized views and\n", + "stored procedures are available, but users may need to employ other mechanisms that\n", + "aren’t equivalent to those found in traditional data warehouses. The latter is particularly\n", + "important for “ [lift and shift scenarios](https://whatis.techtarget.com/definition/lift-and-shift) ,” which require systems that achieve semantics\n", + "that are almost identical to those of older, commercial data warehouses.\n", + "\n", + "What about support for other types of data applications? Users of a lakehouse have\n", + "access to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n", + "libraries) for non-BI workloads like data science and machine learning. Data\n", + "exploration and refinement are standard for many analytic and data science\n", + "applications. Delta Lake is designed to let users incrementally improve the quality of\n", + "data in their lakehouse until it is ready for consumption.\n", + "\n", + "\n", + "A note about technical building blocks. While distributed file systems can be\n", + "used for the storage layer, object stores are more commonly used in lakehouses.\n", + "Object stores provide low-cost, highly available storage that excels at massively\n", + "parallel reads — an essential requirement for modern data warehouses.\n", + "\n", + "**From BI to AI**\n", + "The lakehouse is a new data management architecture that radically simplifies\n", + "enterprise data infrastructure and accelerates innovation in an age when\n", + "machine learning is poised to disrupt every industry. In the past, most of the\n", + "data that went into a company’s products or decision-making was structured\n", + "data from operational systems, whereas today, many products incorporate\n", + "AI in the form of computer vision and speech models, text mining and others.\n", + "Why use a lakehouse instead of a data lake for AI? 
A lakehouse gives you data\n", + "versioning, governance, security and ACID properties that are needed even for\n", + "unstructured data.\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specialized\n", + "systems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "and other issues will be addressed as the technology continues to mature and\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "diverse data applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
8375eac494bff392a37d6dff7c40c1b1Current lakehouses reduce cost, but their performance can still lag specialized\n", + "systems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "and other issues will be addressed as the technology continues to mature and\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "diverse data applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "paper that describes some of the core technological challenges and solutions that\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. You\n", + "can read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "storage, this pattern has been playing out for years. Vendors continue to try to reinvent\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "object stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,\n", + "performance is hampered by expensive metadata operations (e.g., listing objects)\n", + "and limited consistency guarantees.\n", + "\n", + "Based on the characteristics of cloud object stores, three approaches have emerged.\n", + "\n", + "**1. Data lakes**\n", + "The first is directories of files (i.e., data lakes) that store the table as a collection\n", + "of objects, typically in columnar format such as Apache Parquet. 
It’s an attractive\n", + "approach because the table is just a group of objects that can be accessed from\n", + "a wide variety of tools without a lot of additional data stores or systems. However,\n", + "both performance and consistency problems are common. Hidden data corruption\n", + "is common due to failed transactions, eventual consistency leads to inconsistent\n", + "queries, latency is high, and basic management capabilities like table versioning and\n", + "audit logs are unavailable.\n", + "\n", + "**2. Custom storage engines**\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. Finally, and most egregiously, the proprietary metadata service locks\n", + "customers into a specific service provider, leaving customers to contend with\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "adopt a new approach later.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
106451175b1a3fe452158b21f2f224b8**2. Custom storage engines**\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. Finally, and most egregiously, the proprietary metadata service locks\n", + "customers into a specific service provider, leaving customers to contend with\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "adopt a new approach later.\n", + "\n", + "**3. Lakehouse**\n", + "With Delta Lake, an open source ACID table storage layer atop cloud object stores,\n", + "we sought to build a car instead of a faster horse with not just a better data store,\n", + "but a fundamental change in how data is stored and used via the lakehouse. A\n", + "lakehouse is a new architecture that combines the best elements of data lakes and\n", + "data warehouses. Lakehouses are enabled by a new system design: implementing\n", + "similar data structures and data management features to those in a data warehouse,\n", + "directly on the kind of low-cost storage used for data lakes. They are what you would\n", + "get if you had to redesign storage engines in the modern world, now that cheap and\n", + "highly reliable storage (in the form of object stores) are available.\n", + "\n", + "Delta Lake maintains information about which objects are part of a Delta table in an\n", + "ACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n", + "the cloud object store. This design allows clients to update multiple objects at once,\n", + "replace a subset of the objects with another, etc., in a serializable manner that still\n", + "achieves high parallel read/write performance from the objects. The log also provides\n", + "significantly faster metadata operations for large tabular data sets. Additionally, Delta\n", + "Lake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n", + "snapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n", + "caching, and audit logs. Together, these features improve both the manageability and\n", + "performance of working with data in cloud object stores, ultimately opening the door\n", + "to the lakehouse architecture that combines the key features of data warehouses and\n", + "data lakes to create a better, simpler data architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "Today, Delta Lake is used across thousands of Databricks customers, processing\n", + "exabytes of structured and unstructured data each day, as well as many organizations\n", + "in the open source community. 
These use cases span a variety of data sources and\n", + "applications. The data types stored include Change Data Capture (CDC) logs from\n", + "enterprise OLTP systems, application logs, time-series data, graphs, aggregate\n", + "tables for reporting, and image or feature data for machine learning. The applications\n", + "include SQL workloads (most commonly), business intelligence, streaming, data\n", + "science, machine learning and graph analytics. Overall, Delta Lake has proven itself to\n", + "be a good fit for most data lake applications that would have used structured storage\n", + "formats like Parquet or ORC, and many traditional data warehousing workloads.\n", + "\n", + "Across these use cases, we found that customers often use Delta Lake to significantly\n", + "simplify their data architecture by running more workloads directly against cloud\n", + "object stores, and increasingly, by creating a lakehouse with both data lake and\n", + "transactional features to replace some or all of the functionality provided by message\n", + "queues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n", + "Amazon Redshift).\n", + "\n", + "**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n", + "\n", + "- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performanceSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
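To make the capabilities described above concrete, here is a minimal PySpark sketch (not taken from the paper) of two of them: querying a point-in-time snapshot with time travel and upserting changes with MERGE. The /delta/events path and the event_id join key are illustrative assumptions.

# Illustrative sketch only; '/delta/events' and 'event_id' are placeholder names.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Time travel: read the table as it existed at an earlier version.
df_v0 = (spark.read
    .format('delta')
    .option('versionAsOf', 0)
    .load('/delta/events'))

# Upsert: merge corrected or late-arriving records into the same table.
updates = spark.read.format('delta').load('/delta/events_updates')
updates.createOrReplaceTempView('updates')
spark.sql("""
    MERGE INTO delta.`/delta/events` AS target
    USING updates AS source
    ON target.event_id = source.event_id
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
""")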
58289f2c000adf6c7a0dac805e19949bAcross these use cases, we found that customers often use Delta Lake to significantly\n", + "simplify their data architecture by running more workloads directly against cloud\n", + "object stores, and increasingly, by creating a lakehouse with both data lake and\n", + "transactional features to replace some or all of the functionality provided by message\n", + "queues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n", + "Amazon Redshift).\n", + "\n", + "**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n", + "\n", + "- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performance\n", + "\n", + "Through the paper, you’ll gain a better understanding of Delta Lake and how it\n", + "enables a wide range of DBMS-like performance and management features for data\n", + "held in low-cost cloud storage. As well as how the Delta Lake storage format and\n", + "access protocols make it simple to operate, highly available, and able to deliver highbandwidth access to the object store.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding**\n", + "**Delta Engine**\n", + "\n", + "The Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n", + "engine to take advantage of modern CPU architecture with optimizations to Spark\n", + "3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n", + "Runtime 7.0. Together, these features significantly accelerate query performance on\n", + "data lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n", + "adopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n", + "\n", + "**Scaling execution performance**\n", + "One of the big hardware trends over the last several years is that CPU clock speeds\n", + "have plateaued. The reasons are outside the scope of this chapter, but the takeaway\n", + "is that we have to find new ways to process data faster beyond raw compute power.\n", + "One of the most impactful methods has been to improve the amount of data that can\n", + "be processed in parallel. However, data processing engines need to be specifically\n", + "architected to take advantage of this parallelism.\n", + "\n", + "In addition, data teams are being given less and less time to properly model data as\n", + "the pace of business increases. Poorer modeling in the interest of better business\n", + "agility drives poorer query performance. 
Naturally, this is not a desired state, and\n", + "organizations want to find ways to maximize both agility and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Announcing Delta Engine for**\n", + "**high-performance query execution**\n", + "Delta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n", + "workloads through three components: an improved query optimizer, a caching\n", + "layer that sits between the execution layer and the cloud object storage, and a native\n", + "vectorized execution engine that’s written in C++.\n", + "\n", + "The improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n", + "optimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n", + "statistics to deliver up to 18x increased performance in star schema workloads.\n", + "\n", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "modern cloud hardware. It brings performance improvements to all workload types\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "By linking these three components together, we think it will be easier for customers\n", + "to understand how improvements in multiple places within the Databricks code\n", + "aggregate into significantly faster performance for analytics workloads on data lakes.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
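Because Delta Engine remains fully compatible with open Spark APIs, none of these optimizations require code changes. The hypothetical star schema below (a sales fact table joined to customer and product dimensions) is only meant to illustrate the kind of query the improved optimizer targets; the table and column names are assumptions.

# Illustrative star-schema query; table and column names are assumptions.
result = spark.sql("""
    SELECT c.region, p.category, SUM(s.amount) AS revenue
    FROM sales s
    JOIN customers c ON s.customer_id = c.customer_id
    JOIN products p ON s.product_id = p.product_id
    GROUP BY c.region, p.category
""")
result.show()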
5926bb5ce8f74c7fafe652fb85efc82cDelta Engine’s caching layer automatically chooses which input data to cache for the\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "modern cloud hardware. It brings performance improvements to all workload types\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "By linking these three components together, we think it will be easier for customers\n", + "to understand how improvements in multiple places within the Databricks code\n", + "aggregate into significantly faster performance for analytics workloads on data lakes.\n", + "\n", + "We’re excited about the value that Delta Engine delivers to our customers. While the\n", + "time and cost savings are already valuable, its role in the lakehouse pattern supports\n", + "new advances in how data teams design their data architectures for increased\n", + "unification and simplicity.\n", + "\n", + "For more information on the Delta Engine, watch this keynote address from\n", + "[Spark + AI Summit 2020: Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming**\n", + "Using Delta Lake to express\n", + "computation on streaming data\n", + "\n", + "## CHAPTER 04\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake Solves Common**\n", + "**Pain Points in Streaming**\n", + "\n", + "The pain points of a traditional streaming and data warehousing solution can be\n", + "broken into two groups: data lake and data warehouse pains.\n", + "\n", + "**Data lake pain points**\n", + "While data lakes allow you to flexibly store an immense amount of data in a file system,\n", + "there are many pain points including (but not limited to):\n", + "\n", + "- Consolidation of streaming data from many disparate systems is difficult.\n", + "\n", + "- Updating data in a data lake is nearly impossible, and much of the streaming\n", + "data needs to be updated as changes are made. This is especially important in\n", + "scenarios involving financial reconciliation and subsequent adjustments.\n", + "\n", + "- Query speeds for a data lake are typically very slow.\n", + "\n", + "- Optimizing storage and file sizes is very difficult and often requires complicated logic.\n", + "\n", + "**Data warehouse pain points**\n", + "The power of a data warehouse is that you have a persistent performant store of your\n", + "data. 
But the pain points for building modern continuous applications include (but are\n", + "not limited to):\n", + "\n", + "- Constrained to SQL queries (i.e., no machine learning or advanced analytics).\n", + "\n", + "- Accessing streaming data and stored data together is very difficult, if at all possible.\n", + "\n", + "- Data warehouses do not scale very well.\n", + "\n", + "- Tying compute and storage together makes using a warehouse very expensive.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake on Databricks solves these issues**\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) is a unified data management system that brings data reliability and\n", + "performance optimizations to cloud data lakes. More succinctly, Delta Lake combines\n", + "the advantages of data lakes and data warehouses with Apache Spark™ to allow you\n", + "to do incredible things.\n", + "\n", + "- Delta Lake, along with Structured Streaming, makes it possible to analyze\n", + "streaming and historical data together at high speeds.\n", + "\n", + "- When Delta Lake tables are used as sources and destinations of streaming big\n", + "data, it is easy to consolidate disparate data sources.\n", + "\n", + "- Upserts are supported on Delta Lake tables.\n", + "\n", + "- Delta Lake is ACID compliant, making it easy to create a compliant data solution.\n", + "\n", + "- Easily include machine learning scoring and advanced analytics into ETL\n", + "and queries.\n", + "\n", + "- Decouples compute and storage for a completely scalable solution.\n", + "\n", + "In the following use cases, we’ll share what this looks like in practice.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simplifying Streaming Stock**\n", + "**Data Analysis Using Delta Lake**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
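Before walking through the stock example, here is a minimal sketch of the first point in the list above: the same Delta Lake table (the /delta/stocksDailyPrices path used later in this section) can be read both as a static DataFrame for historical analysis and as a streaming source for continuous processing.

# The same Delta table serves batch (historical) and streaming reads.
historical_df = spark.read.format('delta').load('/delta/stocksDailyPrices')

streaming_df = (spark.readStream
    .format('delta')
    .load('/delta/stocksDailyPrices'))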
b9e920255e44aed01952f834a693b695- Delta Lake, along with Structured Streaming, makes it possible to analyze streaming and historical data together at high speeds.

- When Delta Lake tables are used as sources and destinations of streaming big data, it is easy to consolidate disparate data sources.

- Upserts are supported on Delta Lake tables.

- Delta Lake is ACID compliant, making it easy to create a compliant data solution.

- Easily include machine learning scoring and advanced analytics into ETL and queries.

- Decouples compute and storage for a completely scalable solution.

In the following use cases, we’ll share what this looks like in practice.

-----

**Simplifying Streaming Stock**
**Data Analysis Using Delta Lake**

Real-time analysis of stock data is a complicated endeavor. After all, there are many challenges in maintaining a streaming system and ensuring transactional consistency of legacy and streaming data concurrently.

Thankfully, [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) helps solve many of the pain points of building a streaming system to analyze stock data in real time. In this section, we’ll share how to simplify the streaming of stock data analysis using Delta Lake.

In the following diagram, you can see a high-level architecture that simplifies this problem. We start by ingesting two different sets of data into two Delta Lake tables. The two data sets are stock prices and fundamentals.

After ingesting the data into their respective tables, we then join the data in an ETL process and write the data out into a third Delta Lake table for downstream analysis.

Delta Lake helps solve these problems by combining the scalability, streaming and access to the advanced analytics of Apache Spark with the performance and ACID compliance of a data warehouse.

-----

# Create Fundamental Data (Databricks Delta table)
dfBaseFund = (spark
    .read
    .format('delta')
    .load('/delta/stocksFundamentals'))

# Create Price Data (Databricks Delta table)
dfBasePrice = (spark
    .read
    .format('delta')
    .load('/delta/stocksDailyPrices'))

**Implement your streaming**
**stock analysis solution with Delta Lake**
Delta Lake and Apache Spark do most of the work for our solution; you can try out the full [notebook](https://pages.databricks.com/rs/094-YMS-629/images/streaming-stock-data-analysis-setup.html) and follow along with the code samples below.

As noted in the preceding diagram, we have two data sets to process — one for fundamentals and one for price data. To create our two Delta Lake tables, we specify the .format('delta') against our Databricks File System ( [DBFS](https://docs.databricks.com/data/databricks-file-system.html) ) locations.

-----

While we’re updating the stocksFundamentals and stocksDailyPrices tables, we will consolidate this data through a series of ETL jobs into a consolidated view ( stocksDailyPricesWFund ).

With the following code snippet, we can determine the start and end date of available data and then combine the price and fundamentals data for that date range into DBFS.

# Determine start and end date of available data
row = dfBasePrice.agg(
    func.max(dfBasePrice.price_date).alias('maxDate'),
    func.min(dfBasePrice.price_date).alias('minDate')
).collect()[0]
startDate = row['minDate']
endDate = row['maxDate']

# Define our date range function

# Save data to DBFS
dfPriceWFund
    .write
    .format('delta')
    .mode('append')SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
f6fa995d4c37ab485d6b8180de8b831bWith the following code snippet, we can determine the start and end date of available data and then combine the price and fundamentals data for that date range into DBFS.

# Determine start and end date of available data
row = dfBasePrice.agg(
    func.max(dfBasePrice.price_date).alias('maxDate'),
    func.min(dfBasePrice.price_date).alias('minDate')
).collect()[0]
startDate = row['minDate']
endDate = row['maxDate']

# Define our date range function
def daterange(start_date, end_date):
    for n in range(int((end_date - start_date).days)):
        yield start_date + datetime.timedelta(n)

# Define combinePriceAndFund information by date
def combinePriceAndFund(theDate):
    dfFund = dfBaseFund.where(dfBaseFund.price_date == theDate)
    dfPrice = dfBasePrice.where(
        dfBasePrice.price_date == theDate
    ).drop('price_date')
    # Drop the updated column
    dfPriceWFund = dfPrice.join(dfFund, ['ticker']).drop('updated')
    # Save data to DBFS
    (dfPriceWFund
        .write
        .format('delta')
        .mode('append')
        .save('/delta/stocksDailyPricesWFund'))

# Loop through dates to complete fundamentals + price ETL process
for single_date in daterange(
    startDate, (endDate + datetime.timedelta(days=1))
):
    print('Starting ' + single_date.strftime('%Y-%m-%d'))
    start = datetime.datetime.now()
    combinePriceAndFund(single_date)
    end = datetime.datetime.now()
    print(end - start)

Now we have a stream of consolidated fundamentals and price data that is being pushed into [DBFS](https://docs.databricks.com/data/databricks-file-system.html) in the /delta/stocksDailyPricesWFund location. We can build a Delta Lake table by specifying .format('delta') against that DBFS location.

dfPriceWithFundamentals = (spark
    .readStream
    .format('delta')
    .load('/delta/stocksDailyPricesWFund'))

# Create temporary view of the data
dfPriceWithFundamentals.createOrReplaceTempView('priceWithFundamentals')

-----

Now that we have created our initial Delta Lake table, let’s create a view that will allow us to calculate the price/earnings ratio in real time (because of the underlying streaming data updating our Delta Lake table).

%sql
CREATE OR REPLACE TEMPORARY VIEW viewPE AS
select ticker,
    price_date,
    first(close) as price,
    (close/eps_basic_net) as pe
from priceWithFundamentals
where eps_basic_net > 0
group by ticker, price_date, pe

**Analyze streaming stock data in real time**
With our view in place, we can quickly analyze our data using Spark SQL.

%sql
select *
from viewPE
where ticker == 'AAPL'
order by price_date

-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
e44bb96fbdeb00e80da1083d9176e45a-----

Now that we have created our initial Delta Lake table, let’s create a view that will allow us to calculate the price/earnings ratio in real time (because of the underlying streaming data updating our Delta Lake table).

%sql
CREATE OR REPLACE TEMPORARY VIEW viewPE AS
select ticker,
    price_date,
    first(close) as price,
    (close/eps_basic_net) as pe
from priceWithFundamentals
where eps_basic_net > 0
group by ticker, price_date, pe

**Analyze streaming stock data in real time**
With our view in place, we can quickly analyze our data using Spark SQL.

%sql
select *
from viewPE
where ticker == 'AAPL'
order by price_date

-----

As the underlying source of this consolidated data set is a Delta Lake table, this view isn’t just showing the batch data but also any new streams of data that are coming in as per the following streaming dashboard.

Underneath the covers, Structured Streaming isn’t just writing the data to Delta Lake tables but also keeping the state of the distinct number of keys (in this case ticker symbols) that need to be tracked.

Because you are using Spark SQL, you can execute aggregate queries at scale and in real time.

%sql
SELECT ticker, AVG(close) as Average_Close
FROM priceWithFundamentals
GROUP BY ticker
ORDER BY Average_Close

In closing, we demonstrated how to simplify streaming stock data analysis using [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . By combining Spark Structured Streaming and Delta Lake, we can use the Databricks integrated workspace to create a performant, scalable solution that has the advantages of both data lakes and data warehouses.

The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) removes the data engineering complexities commonly associated with streaming and transactional consistency, enabling data engineering and data science teams to focus on understanding the trends in their stock data.

-----

**How Tilting Point Does Streaming**
**Ingestion Into Delta Lake**

Tilting Point is a new-generation games partner that provides top development studios with expert resources, services and operational support to optimize high-quality live games for success. Through its user acquisition fund and its world-class technology platform, Tilting Point funds and runs performance marketing management and live games operations to help developers achieve profitable scale.

By leveraging Delta Lake, Tilting Point is able to ensure quality data and make it readily available for analytics to improve the business. Diego Link, VP of Engineering at Tilting Point, provided insights for this use case.

The team at Tilting Point was running daily and hourly batch jobs for reporting on game analytics. 
They wanted to make their reporting near real-time, getting insights\n", + "within 5–10 minutes.\n", + "\n", + "They also wanted to make their in-game LiveOps decisions based on real-time player\n", + "behavior for giving real-time data to a bundles-and-offer system, provide up-to-theminute alerting on LiveOPs changes that actually might have unforeseen detrimental\n", + "effects and even alert on service interruptions in game operations. The goal was to\n", + "ensure that the game experience was as robust as possible for their players.\n", + "\n", + "Additionally, they had to store encrypted Personally Identifiable Information (PII) data\n", + "separately in order to maintain GDPR compliance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How data flows and associated challenges**\n", + "Tilting Point has a proprietary software development kit that developers integrate\n", + "with to send data from game servers to an ingest server hosted in AWS. This service\n", + "removes all PII data and then sends the raw data to an Amazon Firehose endpoint.\n", + "Firehose then dumps the data in JSON format continuously to S3.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
feafe0caaf0a289198396ca22ff931c1The team at Tilting Point was running daily and hourly batch jobs for reporting on game analytics. They wanted to make their reporting near real-time, getting insights within 5–10 minutes.

They also wanted to make their in-game LiveOps decisions based on real-time player behavior, feed real-time data to a bundles-and-offers system, provide up-to-the-minute alerting on LiveOps changes that might have unforeseen detrimental effects, and even alert on service interruptions in game operations. The goal was to ensure that the game experience was as robust as possible for their players.

Additionally, they had to store encrypted Personally Identifiable Information (PII) data separately in order to maintain GDPR compliance.

-----

**How data flows and associated challenges**
Tilting Point has a proprietary software development kit that developers integrate with to send data from game servers to an ingest server hosted in AWS. This service removes all PII data and then sends the raw data to an Amazon Firehose endpoint. Firehose then dumps the data in JSON format continuously to S3.

To clean up the raw data and make it available quickly for analytics, the team considered pushing the continuous data from Firehose to a message bus (e.g., Kafka, Kinesis) and then using [Apache Spark’s Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) to continuously process data and write to Delta Lake tables.

While that architecture sounds ideal for the low-latency requirement of processing data in seconds, Tilting Point didn’t have such low latency needs for their ingestion pipeline. They wanted to make the data available for analytics in a few minutes, not seconds. Hence they decided to simplify their architecture by eliminating a message bus and instead use S3 as a continuous source for their structured streaming job.

But the key challenge in using S3 as a continuous source is identifying files that changed recently.

Listing all files every few minutes has two major issues:

- **Higher latency:** Listing all files in a directory with a large number of files has high overhead and increases processing time.

- **Higher cost:** Listing lots of files every few minutes can quickly add to the S3 cost.

**Leveraging Structured Streaming with blob store as**
**source and Delta Lake tables as sink**
To continuously stream data from cloud blob storage like S3, Tilting Point uses [Databricks’ S3-SQS source](https://docs.databricks.com/spark/latest/structured-streaming/sqs.html#optimized-s3-file-source-with-sqs) . The S3-SQS source provides an easy way to incrementally stream data from S3 without the need to write any state management code on what files were recently processed.

-----

This is how Tilting Point’s ingestion pipeline looks:

- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information to SQS via SNS.

- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3-SQS source reads the new file names that arrived in S3 from SQS and uses that information to read the actual file contents in S3. Example code is shown below:

(spark.readStream
    .format('s3-sqs')
    .option('fileFormat', 'json')
    .option('queueUrl', ...)
    .schema(...)
    .load())

- Tilting Point’s structured streaming job then cleans up and transforms the data. Based on the game data, the streaming job uses the foreachBatch API of Spark streaming and writes to 30 different Delta Lake tables.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
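The foreachBatch routing itself is not shown in the excerpt above, so the following is only a hedged sketch of that pattern: each micro-batch is split by an assumed event_type column and appended to a per-type Delta table. raw_events_df stands in for the DataFrame returned by the S3-SQS readStream, and the table names are illustrative.

# Hedged sketch of the foreachBatch pattern; schema and names are assumptions.
def write_to_delta_tables(micro_batch_df, batch_id):
    event_types = [r['event_type'] for r in
                   micro_batch_df.select('event_type').distinct().collect()]
    for event_type in event_types:
        # Append this event type's rows to its own Delta table.
        (micro_batch_df
            .filter(micro_batch_df.event_type == event_type)
            .write
            .format('delta')
            .mode('append')
            .saveAsTable('game_analytics.' + event_type))

(raw_events_df.writeStream
    .foreachBatch(write_to_delta_tables)
    .option('checkpointLocation', '/checkpoints/game_events')
    .start())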
4cffec7831b4b93dde76b2fc65f0ac9b- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information to SQS via SNS.

- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3-SQS source reads the new file names that arrived in S3 from SQS and uses that information to read the actual file contents in S3. Example code is shown below:

(spark.readStream
    .format('s3-sqs')
    .option('fileFormat', 'json')
    .option('queueUrl', ...)
    .schema(...)
    .load())

- Tilting Point’s structured streaming job then cleans up and transforms the data. Based on the game data, the streaming job uses the foreachBatch API of Spark streaming and writes to 30 different Delta Lake tables.

- The streaming job produces lots of small files, which affects the performance of downstream consumers. So, an optimize job runs daily to compact small files in the table and store them at the right file sizes so that consumers of the data have good performance while reading the data from Delta Lake tables. Tilting Point also runs a weekly optimize job for a second round of compaction.

Architecture showing continuous data ingest into Delta Lake tables

-----

The above Delta Lake ingestion architecture helps in the following ways:

- **Incremental loading:** The S3-SQS source incrementally loads the new files in S3. This helps quickly process the new files without too much overhead in listing files.

- **No explicit file state management:** There is no explicit file state management needed to look for recent files.

- **Lower operational burden:** Since we use S3 as a checkpoint between Firehose and Structured Streaming jobs, the operational burden to stop streams and reprocess data is relatively low.

- **Reliable ingestion:** Delta Lake uses [optimistic concurrency control](https://docs.databricks.com/delta/optimizations/isolation-level.html) to offer ACID transactional guarantees. This helps with reliable data ingestion.

- **File compaction:** One of the major problems with streaming ingestion is tables ending up with a large number of small files that can affect read performance. Before Delta Lake, we had to set up a different table to write the compacted data. With Delta Lake, thanks to ACID transactions, we can compact the files and rewrite the data back to the same table safely.

- **Snapshot isolation:** Delta Lake’s snapshot isolation allows us to expose the ingestion tables to downstream consumers while data is being appended by a streaming job and modified during compaction.

- **Rollbacks:** In case of bad writes, [Delta Lake’s Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) helps us roll back to a previous version of the table.

In this section, we walked through Tilting Point’s use cases and how they use Databricks’ S3-SQS source to stream data into Delta Lake tables efficiently, without much operational overhead, making good quality data readily available for analytics.

-----

**Building a Quality of Service**
**Analytics Solution for Streaming**
**Video Services**

As traditional pay TV [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/) , content owners have embraced direct-to-consumer (D2C) subscription and ad-supported streaming for monetizing their libraries of content. For companies whose entire business model revolved around producing great content, which they then licensed to distributors, the shift to now owning the entire glass-to-glass experience has required new capabilities, such as building media supply chains for content delivery to consumers, supporting apps for a myriad of devices and operating systems, and performing customer relationship functions like billing and customer service.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
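To make the compaction and rollback steps described above concrete, here is an illustrative sketch of both: compaction with OPTIMIZE (a Databricks command) and a rollback performed by reading a known-good version via time travel and overwriting the table. The path and version number are placeholders, not Tilting Point's actual values.

# Daily compaction of small files (Databricks OPTIMIZE command); path is a placeholder.
spark.sql("OPTIMIZE delta.`/delta/player_events`")

# Rollback sketch: time-travel to a known-good version and overwrite the table.
good_snapshot = (spark.read
    .format('delta')
    .option('versionAsOf', 41)
    .load('/delta/player_events'))

(good_snapshot.write
    .format('delta')
    .mode('overwrite')
    .save('/delta/player_events'))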
ab4dfbd6fe492203c62a8b3f60e4ad55In this section, we walked through Tilting Point’s use cases and how they use Databricks’ S3-SQS source to stream data into Delta Lake tables efficiently, without much operational overhead, making good quality data readily available for analytics.

-----

**Building a Quality of Service**
**Analytics Solution for Streaming**
**Video Services**

As traditional pay TV [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/) , content owners have embraced direct-to-consumer (D2C) subscription and ad-supported streaming for monetizing their libraries of content. For companies whose entire business model revolved around producing great content, which they then licensed to distributors, the shift to now owning the entire glass-to-glass experience has required new capabilities, such as building media supply chains for content delivery to consumers, supporting apps for a myriad of devices and operating systems, and performing customer relationship functions like billing and customer service.

With most services renewing on a monthly basis, subscription service operators need to prove value to their subscribers at all times. General quality of streaming video issues (encompassing buffering, latency, pixelation, jitter, packet loss and the blank screen) have significant business impacts, whether it’s increased [subscriber churn](https://www.streamingmedia.com/Articles/ReadArticle.aspx?ArticleID=112209) or [decreased video engagement](https://www.tvtechnology.com/opinions/why-buffering-remains-every-video-providers-worst-nightmare) .

When you start streaming, you realize there are so many places where breaks can happen and the viewer experience can suffer. There may be an issue at the source in the servers on-premises or in the cloud; in transit at either the CDN level or ISP level or the viewer’s home network; or at the playout level with player/client issues. What breaks at n x 10⁴ concurrent streamers is different from what breaks at n x 10⁵ or n x 10⁶. There is no pre-release testing that can quite replicate real-world users and their ability to push even the most redundant systems to their breaking point as they channel surf, click in and out of the app, sign on from different devices simultaneously and so on. And because of the nature of TV, things will go wrong during the most important, high-profile events drawing the largest audiences. If you start [receiving complaints on social media](https://downdetector.com/) , how can you tell if they are unique to that one user or rather regional or a national issue? If national, is it across all devices or only certain types (e.g., possibly the OEM updated the OS on an older device type, which ended up causing compatibility issues with the client)?

Identifying, remediating and preventing viewer quality of experience issues becomes a big data problem when you consider the number of users, the number of actions they are taking and the number of handoffs in the experience (servers to CDN to ISP to home network to client). 
Quality of Service (QoS) helps make sense of these streams\n", + "of data so you can understand what is going wrong, where and why. Eventually you\n", + "can get into predictive analytics around what could go wrong and how to remediate\n", + "it before anything breaks.\n", + "\n", + "**Databricks Quality of Service solution overview**\n", + "The aim of this solution is to provide the core for any streaming video platform that\n", + "wants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n", + "[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\n", + "a Unified Data Analytics Platform for both the real-time insights and the advanced\n", + "analytics capabilities.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
bbbd4003f5c1346b6d1798b187bd59deIdentifying, remediating and preventing viewer quality of experience issues becomes\n", + "a big data problem when you consider the number of users, the number of actions\n", + "they are taking and the number of handoffs in the experience (servers to CDN to ISP to\n", + "home network to client). Quality of Service (QoS) helps make sense of these streams\n", + "of data so you can understand what is going wrong, where and why. Eventually you\n", + "can get into predictive analytics around what could go wrong and how to remediate\n", + "it before anything breaks.\n", + "\n", + "**Databricks Quality of Service solution overview**\n", + "The aim of this solution is to provide the core for any streaming video platform that\n", + "wants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n", + "[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\n", + "a Unified Data Analytics Platform for both the real-time insights and the advanced\n", + "analytics capabilities.\n", + "\n", + "[By using Databricks](https://databricks.com/customers) , streaming platforms can get faster insights by always\n", + "leveraging the most complete and recent data sets powered by robust and reliable\n", + "data pipelines. This decreases time to market for new features by accelerating\n", + "data science using a collaborative environment. It provides support for managing\n", + "the end-to-end machine learning lifecycle and reduces operational costs across\n", + "all cycles of software development by having a unified platform for both data\n", + "engineering and data science.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Video QoS solution architecture**\n", + "With complexities like low-latency monitoring alerts and highly scalable infrastructure\n", + "required for peak video traffic hours, the straightforward architectural choice was\n", + "the Delta Architecture — both standard big data architectures like Lambda and Kappa\n", + "Architectures have disadvantages around the operational effort required to maintain\n", + "multiple types of pipelines (streaming and batch) and lack support for a unified data\n", + "engineering and data science approach.\n", + "\n", + "The Delta Architecture is the next-generation paradigm that enables all the data\n", + "personas in your organization to be more productive:\n", + "\n", + "- Data engineers can develop data pipelines in a cost-efficient manner\n", + "continuously without having to choose between batch and streaming\n", + "\n", + "- Data analysts can get near real-time insights and faster answers to their BI queries\n", + "\n", + "- Data scientists can develop better machine learning models using more reliable data\n", + "sets with support for time travel that facilitates reproducible experiments and reports Delta Architecture using the “multi-hop” approach for data pipelines\n", + "\n", + "\n", + "-----\n", + "\n", + "Writing data pipelines using the Delta Architecture follows the best practices of\n", + "having a multi-layer “multi-hop” approach where we progressively add structure to\n", + "data: “Bronze” tables or Ingestion tables are usually raw data sets in the native format\n", + "(JSON, CSV or txt), “Silver” tables represent cleaned/transformed data sets ready for\n", + "reporting or data science, and “Gold” tables are the final presentation layer.\n", + "\n", + "For 
the pure streaming use cases, the option of materializing the DataFrames in\n", + "intermediate Delta Lake tables is basically just a trade-off between latency/SLAs and\n", + "cost (an example being real-time monitoring alerts vs. updates of the recommender\n", + "system based on new content).\n", + "\n", + "A streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n", + "\n", + "The number of “hops” in this approach is directly impacted by the number of consumers\n", + "downstream, complexity of the aggregations (e.g., Structured Streaming enforces\n", + "certain limitations around chaining multiple aggregations) and the maximization of\n", + "operational efficiency.\n", + "\n", + "The QoS solution architecture is focused around best practices for data processing\n", + "and is not a full video-on-demand (VoD) solution — with some standard components\n", + "like the “front door” service Amazon API Gateway being avoided from the high-level\n", + "architecture in order to keep the focus on data and analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "High-level architecture for the QoS platform\n", + "\n", + "\n", + "**Making your data ready for analytics**\n", + "Both sources of data included in the QoS solution (application events and CDN logs)\n", + "are using the JSON format, great for data exchange — allowing you to represent\n", + "complex nested structures, but not scalable and difficult to maintain as a storage\n", + "format for your data lake / analytics system.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
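As a minimal illustration of one "hop" in this approach, the sketch below incrementally refines a Bronze Delta table of raw JSON events into a Silver table, extracting the timestamp and message type that downstream consumers filter on. The paths, JSON field names and columns are assumptions, not part of the original solution.

# One Bronze-to-Silver hop; paths and JSON field names are illustrative.
from pyspark.sql.functions import col, get_json_object, to_timestamp

bronze_df = (spark.readStream
    .format('delta')
    .load('/delta/bronze/app_events'))

silver_df = (bronze_df
    .withColumn('event_ts', to_timestamp(get_json_object(col('value'), '$.timestamp')))
    .withColumn('event_type', get_json_object(col('value'), '$.type')))

(silver_df.writeStream
    .format('delta')
    .option('checkpointLocation', '/checkpoints/silver_app_events')
    .outputMode('append')
    .start('/delta/silver/app_events'))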
ad49d73d4d4b66a958ad88f05e980ce7A streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n", + "\n", + "The number of “hops” in this approach is directly impacted by the number of consumers\n", + "downstream, complexity of the aggregations (e.g., Structured Streaming enforces\n", + "certain limitations around chaining multiple aggregations) and the maximization of\n", + "operational efficiency.\n", + "\n", + "The QoS solution architecture is focused around best practices for data processing\n", + "and is not a full video-on-demand (VoD) solution — with some standard components\n", + "like the “front door” service Amazon API Gateway being avoided from the high-level\n", + "architecture in order to keep the focus on data and analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "High-level architecture for the QoS platform\n", + "\n", + "\n", + "**Making your data ready for analytics**\n", + "Both sources of data included in the QoS solution (application events and CDN logs)\n", + "are using the JSON format, great for data exchange — allowing you to represent\n", + "complex nested structures, but not scalable and difficult to maintain as a storage\n", + "format for your data lake / analytics system.\n", + "\n", + "\n", + "In order to make the data directly queryable across the entire organization, the\n", + "Bronze to Silver pipeline (the “make your data available to everyone” pipeline) should\n", + "transform any raw formats into Delta Lake and include all the quality checks or data\n", + "masking required by any regulatory agencies.\n", + "\n", + "\n", + "-----\n", + "\n", + "Raw format of the app events\n", + "\n", + "**Video applications events**\n", + "Based on the architecture, the video application events are pushed directly to\n", + "Kinesis Streams and then just ingested to a Delta Lake append-only table without\n", + "any changes to the schema.\n", + "\n", + "Using this pattern allows a high number of consumers downstream to process the\n", + "data in a streaming paradigm without having to scale the throughput of the Kinesis\n", + "stream. As a side effect of using a Delta Lake table as a sink (which supports [optimize](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-optimize.html) !),\n", + "we don’t have to worry about the way the size of the processing window will impact the\n", + "number of files in your target table — known as the “small files” issue in the big data world.\n", + "\n", + "Both the timestamp and the type of message are being extracted from the JSON\n", + "event in order to be able to partition the data and allow consumers to choose the\n", + "type of events they want to process. 
Again combining a single Kinesis stream for the events with a Delta Lake “Events” table reduces the operational complexity while making things easier for scaling during peak hours.

All the details are extracted from JSON for the Silver table

-----

**CDN logs**
The CDN logs are delivered to S3, so the easiest way to process them is the Databricks Auto Loader, which incrementally and efficiently processes new data files as they arrive in S3 without any additional setup.

auto_loader_df = (spark.readStream.format('cloudFiles')
    .option('cloudFiles.format', 'json')
    .option('cloudFiles.region', region)
    .load(input_location))

anonymized_df = (auto_loader_df
    .select('*', ip_anonymizer('requestip').alias('ip'))
    .drop('requestip')
    .withColumn('origin', map_ip_to_location(col('ip'))))

(anonymized_df.writeStream
    .option('checkpointLocation', checkpoint_location)
    .format('delta')
    .table(silver_database + '.cdn_logs'))

As the logs contain IPs — considered personal data under the GDPR regulations — the “make your data available to everyone” pipeline has to include an anonymization step. Different techniques can be used, but we decided to just strip the last octet from IPv4 and the last 80 bits from IPv6. On top of that, the data set is also enriched with information around the origin country and the ISP provider, which will be used later in the Network Operation Centers for localization.

-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
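The ip_anonymizer UDF referenced above is not defined in the excerpt; the following is only a hedged sketch of such a function, zeroing the last octet of an IPv4 address (the IPv6 handling described in the text is omitted for brevity).

# Hedged sketch of an ip_anonymizer-style UDF; not the original implementation.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

@udf(returnType=StringType())
def ip_anonymizer(ip):
    if ip is None:
        return None
    parts = ip.split('.')
    if len(parts) == 4:
        # IPv4: replace the last octet so the address is no longer identifying.
        return '.'.join(parts[:3] + ['0'])
    return ip  # IPv6 (and anything else) left untouched in this sketch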
79c3bd3bc2ddef2dc23c74442ac386f7anonymizer( ‘requestip’ ). alias ( ‘ip’ ))\\\n", + "\n", + ".drop( ‘requestip’ )\\\n", + "\n", + ".withColumn( “origin” , map_ip_to_location(col( ‘ip’ )))\n", + "\n", + "anonymized_df.writeStream \\\n", + "\n", + ".option( ‘checkpointLocation’ , checkpoint_location)\\\n", + "\n", + ".format( ‘delta’ ) \\\n", + "\n", + ".table(silver_database + ‘.cdn_logs’ )\n", + "\n", + "As the logs contain IPs — considered personal data under the GDPR regulations — the\n", + "“make your data available to everyone” pipeline has to include an anonymization step.\n", + "Different techniques can be used, but we decided to just strip the last octet from IPv4\n", + "and the last 80 bits from IPv6. On top, the data set is also enriched with information\n", + "around the origin country and the ISP provider, which will be used later in the Network\n", + "Operation Centers for localization.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Creating the Dashboard /**\n", + "**Virtual Network Operation Centers**\n", + "Streaming companies need to monitor network performance and the user experience\n", + "as near real-time as possible, tracking down to the individual level with the ability to\n", + "abstract at the segment level, easily defining new segments such as those defined by\n", + "geos, devices, networks and/or current and historical viewing behavior.\n", + "\n", + "For streaming companies that has meant adopting the concept of Network Operation\n", + "Centers (NOC) from telco networks for monitoring the health of the streaming\n", + "experience for their users at a macro level, flagging and responding to any issues\n", + "early on. At their most basic, NOCs should have dashboards that compare the current\n", + "experience for users against a performance baseline so that the product teams can\n", + "quickly and easily identify and attend to any service anomalies.\n", + "\n", + "In the QoS solution we have incorporated a [Databricks dashboard](https://docs.databricks.com/notebooks/dashboards.html) . BI tools can also\n", + "be effortlessly connected in order to build more complex visualizations, but based\n", + "on customer feedback, built-in dashboards are, most of the time, the fastest way to\n", + "present the insights to business users.\n", + "\n", + "The aggregated tables for the NOC will basically be the Gold layer of our Delta\n", + "Architecture — a combination of CDN logs and the application events. 
Example of Network Operations Center dashboard\n", + "\n", + "\n", + "-----\n", + "\n", + "The dashboard is just a way to visually package the results of SQL queries or Python\n", + "/ R transformation — each notebook supports multiple dashboards so in case of\n", + "multiple end users with different requirements we don’t have to duplicate the code —\n", + "as a bonus the refresh can also be scheduled as a Databricks job.\n", + "\n", + "Visualization of the results of a SQL query\n", + "\n", + "Loading time for videos (time to first frame) allows better understanding of the\n", + "performance for individual locations of your CDN — in this case the AWS CloudFront\n", + "Edge nodes — which has a direct impact in your strategy for improving this KPI —\n", + "either by spreading the user traffic over multi-CDNs or maybe just implementing a\n", + "dynamic origin selection in case of AWS CloudFront using Lambda@Edge.\n", + "\n", + "\n", + "-----\n", + "\n", + "Failure to understand the reasons for high levels of buffering — and the poor video\n", + "quality experience that it brings — has a significant impact on subscriber churn rate.\n", + "On top of that, advertisers are not willing to spend money on ads responsible for\n", + "reducing the viewer engagement — as they add extra buffering on top, so the profits\n", + "on the advertising business usually are impacted too. In this context, collecting as\n", + "much information as possible from the application side is crucial to allow the analysis\n", + "to be done not only at video level but also browser or even type / version of application.\n", + "\n", + "On the content side, events for the application can provide useful information about\n", + "user behavior and overall quality of experience. How many people that paused a video\n", + "have actually finished watching that episode / video? What caused the stoppage: The\n", + "quality of the content or delivery issues? Of course, further analyses can be done by\n", + "linking all the sources together (user behavior, performance of CDNs /ISPs) to not only\n", + "create a user profile but also to forecast churn.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
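As an illustration of such a Gold-layer aggregation, the sketch below computes the average and 95th-percentile time to first frame per CDN edge location from the Silver CDN logs; the table path and column names (edge_location, time_to_first_frame_ms) are assumptions rather than the solution's actual schema.

# Illustrative Gold aggregation for the NOC dashboard; names are assumptions.
from pyspark.sql import functions as F

gold_ttff = (spark.read
    .format('delta')
    .load('/delta/silver/cdn_logs')
    .groupBy('edge_location')
    .agg(F.avg('time_to_first_frame_ms').alias('avg_ttff_ms'),
         F.expr('percentile_approx(time_to_first_frame_ms, 0.95)').alias('p95_ttff_ms')))

(gold_ttff.write
    .format('delta')
    .mode('overwrite')
    .save('/delta/gold/noc_ttff_by_edge'))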
ae2829e8bde14a57789f9914fb41bddd-----\n", + "\n", + "Failure to understand the reasons for high levels of buffering — and the poor video\n", + "quality experience that it brings — has a significant impact on subscriber churn rate.\n", + "On top of that, advertisers are not willing to spend money on ads responsible for\n", + "reducing the viewer engagement — as they add extra buffering on top, so the profits\n", + "on the advertising business usually are impacted too. In this context, collecting as\n", + "much information as possible from the application side is crucial to allow the analysis\n", + "to be done not only at video level but also browser or even type / version of application.\n", + "\n", + "On the content side, events for the application can provide useful information about\n", + "user behavior and overall quality of experience. How many people that paused a video\n", + "have actually finished watching that episode / video? What caused the stoppage: The\n", + "quality of the content or delivery issues? Of course, further analyses can be done by\n", + "linking all the sources together (user behavior, performance of CDNs /ISPs) to not only\n", + "create a user profile but also to forecast churn.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Creating (near) real-time alerts**\n", + "When dealing with the velocity, volume and variety of data generated in video\n", + "streaming from millions of concurrent users, dashboard complexity can make it\n", + "harder for human operators in the NOC to focus on the most important data at the\n", + "moment and zero-in on root cause issues. With this solution, you can easily set up\n", + "automated alerts when performance crosses certain thresholds that can help the\n", + "human operators of the network as well as set off automatic remediation protocols\n", + "via a Lambda function. For example:\n", + "\n", + "- If a CDN is having latency much higher than baseline (e.g., if it’s more than 10%\n", + "latency vs. baseline average), initiate automatic CDN traffic shifts.\n", + "\n", + "- If more than [some threshold, e.g., 5%] of clients report playback errors, alert the\n", + "product team that there is likely a client issue for a specific device.\n", + "\n", + "- If viewers on a certain ISP are having higher-than-average buffering and\n", + "pixelation issues, alert frontline customer representatives on responses and ways\n", + "to decrease issues (e.g., set stream quality lower).\n", + "\n", + "From a technical perspective, generating real-time alerts requires a streaming\n", + "engine capable of processing data real time and publish-subscribe service to push\n", + "notifications.\n", + "\n", + "\n", + "updates of web applications) or Amazon SQS for other consumers. 
The [custom foreach writer](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html) option makes writing a pipeline that sends email notifications based\n", + "on a rule-based engine (e.g., validating the percentage of errors for each individual\n", + "type of app over a period of time) really straightforward.\n", + "\n", + "def send_error_notification(row):\n", + "    sns_client = boto3.client('sns', region)\n", + "    error_message = 'Number of errors for the App has exceeded the threshold {}'.format(row['percentage'])\n", + "    response = sns_client.publish(\n", + "        TopicArn=,\n", + "        Message=error_message,\n", + "        Subject=,\n", + "        MessageStructure='string')\n", + "\n", + "# Structured Streaming Job\n", + "getKinesisStream('player_events')\\\n", + "    .selectExpr('type', 'app_type')\\\n", + "    .groupBy('app_type')\\\n", + "    .apply(calculate_error_percentage)\\\n", + "    .where('percentage > {}'.format(threshold))\\\n", + "    .writeStream\\\n", + "    .foreach(send_error_notification)\\\n", + "    .start()\n", + "\n", + "\n", + "Integrating microservices using Amazon SNS and Amazon SQS\n", + "\n", + "Sending email notifications using AWS SNS\n", + "\n", + "The QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\n", + "by using Amazon SNS and its integrations with Amazon Lambda (see below for the\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
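Because the extracted snippet above leaves the topic ARN blank and relies on helpers that are not shown (`getKinesisStream`, `calculate_error_percentage`), here is a hedged, self-contained sketch of the same alerting pattern. It reads from an assumed bronze Delta table of player events instead of Kinesis, and the ARN, region, threshold and column names (`type`, `app_type`) are placeholders.

```python
import boto3
from pyspark.sql import functions as F

SNS_TOPIC_ARN = "arn:aws:sns:us-east-1:123456789012:qos-alerts"  # placeholder
ERROR_THRESHOLD = 5.0                                            # alert above a 5% error rate

def send_error_notification(row):
    # Called once per emitted row whose error percentage crossed the threshold.
    boto3.client("sns", region_name="us-east-1").publish(
        TopicArn=SNS_TOPIC_ARN,
        Subject="QoS alert",
        Message="Error rate for app type {} exceeded the threshold: {:.1f}%".format(
            row["app_type"], row["percentage"]),
    )

player_events = spark.readStream.table("bronze.player_events")  # assumed schema: type, app_type, ...

error_rates = (
    player_events
    .groupBy("app_type")
    .agg((100.0 * F.sum(F.when(F.col("type") == "error", 1).otherwise(0)) / F.count("*"))
         .alias("percentage"))
    .where(F.col("percentage") > ERROR_THRESHOLD)
)

(error_rates.writeStream
    .outputMode("complete")            # a streaming aggregation needs complete/update output mode
    .foreach(send_error_notification)  # per-row sink, as in the foreach writer pattern above
    .start())
```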
32aae84b9afcab02dd16df7cda65615fSubject =,\n", + "\n", + "MessageStructure = ‘string’ )\n", + "\n", + "# Structured Streaming Job\n", + "\n", + "getKinesisStream( “player_events” )\\\n", + "\n", + ".selectExpr( “type” , “app_type” )\\\n", + "\n", + ".groupBy( “app_type” )\\\n", + "\n", + ".apply(calculate_error_percentage)\\\n", + "\n", + ". where ( “percentage > {}” .format(threshold)) \\\n", + "\n", + ".writeStream\\\n", + "\n", + ". foreach (send_error_notification)\\\n", + "\n", + ".start()\n", + "\n", + "\n", + "Integrating microservices using Amazon SNS and Amazon SQS\n", + "\n", + "Sending email notifications using AWS SNS\n", + "\n", + "The QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\n", + "by using Amazon SNS and its integrations with Amazon Lambda (see below for the\n", + "\n", + "\n", + "-----\n", + "\n", + "On top of the basic email use case, the Demo Player includes three widgets updated\n", + "in real time using AWS AppSync: the number of active users, the most popular videos\n", + "and the number of users concurrently watching a video.\n", + "\n", + "Updating the application with the results of real-time aggregations\n", + "\n", + "The QoS solution is applying a similar approach — Structured Streaming and Amazon\n", + "SNS — to update all the values allowing for extra consumers to be plugged in using AWS\n", + "SQS. This is a common pattern when huge volumes of events have to be enhanced and\n", + "analyzed; pre-aggregate data once and allow each service (consumer) to make their\n", + "own decision downstream.\n", + "\n", + "**Next steps: machine learning**\n", + "Manually making sense of the historical data is important but is also very slow. If\n", + "we want to be able to make automated decisions in the future, we have to integrate\n", + "machine learning algorithms.\n", + "\n", + "As a Unified Data Platform, Databricks empowers data scientists to build better data\n", + "science products using features like Runtime for Machine Learning with built-in\n", + "or the integration with MLflow, the end-toend machine learning lifecycle management tool. support for [Hyperopt](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/index.html#hyperopt-overview) / [Horvod](https://docs.databricks.com/applications/machine-learning/train-model/distributed-training/horovod-runner.html) / [AutoML](https://databricks.com/product/automl-on-databricks)\n", + "\n", + "\n", + "-----\n", + "\n", + "We have already explored a few important use cases across our customer base while\n", + "focusing on the possible extensions to the QoS solution.\n", + "\n", + "**Point-of-failure prediction and remediation**\n", + "As D2C streamers reach more users, the costs of even momentary loss of service\n", + "increases. ML can help operators move from reporting to prevention by forecasting\n", + "where issues could come up and remediating before anything goes wrong (e.g.,\n", + "a spike in concurrent viewers leads to switching CDNs to one with more capacity\n", + "automatically).\n", + "\n", + "**Customer churn**\n", + "Critical to growing subscription services is keeping the subscribers you have. By\n", + "understanding the quality of service at the individual level, you can add QoS as a\n", + "variable in churn and customer lifetime value models. 
Additionally, you can create\n", + "customer cohorts for those who have had video quality issues in order to test\n", + "proactive messaging and save offers.\n", + "\n", + "\n", + "**Getting started with the Databricks streaming video**\n", + "**QoS solution**\n", + "Providing consistent quality in the streaming video experience is table stakes at this\n", + "point to keep fickle audiences with ample entertainment options on your platform.\n", + "With this solution we have sought to create a quick start for most streaming video\n", + "platform environments to embed this QoS real-time streaming analytics solution in\n", + "a way that:\n", + "1. Scales to any audience size\n", + "2. Quickly flags quality performance issues at key parts of the distribution workflow\n", + "3. Is flexible and modular enough to easily customize for your audience and your\n", + "needs, such as creating new automated alerts or enabling data scientists to test\n", + "and roll out predictive analytics and machine learningSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
bcc6aa2a7284ca86601537158789cd67**Customer churn**\n", + "Critical to growing subscription services is keeping the subscribers you have. By\n", + "understanding the quality of service at the individual level, you can add QoS as a\n", + "variable in churn and customer lifetime value models. Additionally, you can create\n", + "customer cohorts for those who have had video quality issues in order to test\n", + "proactive messaging and save offers.\n", + "\n", + "\n", + "**Getting started with the Databricks streaming video**\n", + "**QoS solution**\n", + "Providing consistent quality in the streaming video experience is table stakes at this\n", + "point to keep fickle audiences with ample entertainment options on your platform.\n", + "With this solution we have sought to create a quick start for most streaming video\n", + "platform environments to embed this QoS real-time streaming analytics solution in\n", + "a way that:\n", + "1. Scales to any audience size\n", + "2. Quickly flags quality performance issues at key parts of the distribution workflow\n", + "3. Is flexible and modular enough to easily customize for your audience and your\n", + "needs, such as creating new automated alerts or enabling data scientists to test\n", + "and roll out predictive analytics and machine learning\n", + "\n", + "To get started, download the notebooks for the [Databricks streaming video QoS](https://databricks.com/notebooks/QoS/index.html#00.config.html)\n", + "[solution](https://databricks.com/notebooks/QoS/index.html#00.config.html) . For more guidance on how to unify batch and streaming data into a single\n", + "system, view the [Delta Architecture webinar](https://pages.databricks.com/201908-WB-Delta-Architecture-A-Step-Beyond-Lambda-Architecture_Reg.html) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Customer Use Cases**\n", + "See how customers are using\n", + "Delta Lake to rapidly innovate\n", + "\n", + "## CHAPTER 05\n", + "\n", + "\n", + "-----\n", + "\n", + "**Healthdirect Australia**\n", + "Provides Personalized and Secure Online\n", + "Patient Care With Databricks\n", + "\n", + "As the shepherds of the National Health Services Directory (NHSD), Healthdirect\n", + "is focused on leveraging terabytes of data covering time-driven, activity-based\n", + "healthcare transactions to improve health care services and support. With\n", + "governance requirements, siloed teams and a legacy system that was difficult\n", + "to scale, they moved to Databricks. This boosted data processing for downstream\n", + "machine learning while improving data security to meet HIPAA requirements.\n", + "\n", + "**Spotlight on Healthdirect**\n", + "**Industry:** Healthcare and life sciences\n", + "6x\n", + "Improvement in data processing\n", + "20M\n", + "Records ingested in minutes\n", + "\n", + "**Data quality and governance issues, silos, and the**\n", + "**inability to scale**\n", + "Due to regulatory pressures, Healthdirect Australia set forth to improve overall data\n", + "quality and ensure a level of governance on top of that, but they ran into challenges\n", + "when it came to data storage and access. On top of that, data silos were blocking the\n", + "team from efficiently preparing data for downstream analytics. These disjointed data\n", + "\n", + "\n", + "-----\n", + "\n", + "sources impacted the consistency of data reads, as data was oftentimes out-of-sync\n", + "between the various systems in their stack. The low-quality data also led to higher\n", + "error rates and processing inefficiencies. 
This fragmented architecture created\n", + "significant operational overhead and limited their ability to have a comprehensive\n", + "view of the patient.\n", + "\n", + "Further, they needed to ingest over 1 billion data points due to a changing landscape\n", + "of customer demand such as bookings, appointments, pricing, eHealth transaction\n", + "activity, etc. — estimated at over 1TB of data.\n", + "\n", + "“We had a lot of data challenges. We just couldn’t process efficiently enough. We\n", + "were starting to get batch overruns. We were starting to see that a 24-hour window\n", + "isn’t the most optimum time in which we want to be able to deliver healthcare data\n", + "and services,” explained Peter James, Chief Architect at Healthdirect Australia.\n", + "\n", + "Ultimately, Healthdirect realized they needed to modernize their end-to-end process\n", + "and tech stack to properly support the business.\n", + "\n", + "**Modernizing analytics with Databricks and Delta Lake**\n", + "Databricks provides Healthdirect Australia with a Unified Data Platform that simplifies\n", + "data engineering and accelerates data science innovation. The notebook environment\n", + "enables them to make content changes in a controlled fashion rather than having to\n", + "run bespoke jobs each time.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
5f58acfe64720103ee19410297467f1aFurther, they needed to ingest over 1 billion data points due to a changing landscape\n", + "of customer demand such as bookings, appointments, pricing, eHealth transaction\n", + "activity, etc. — estimated at over 1TB of data.\n", + "\n", + "“We had a lot of data challenges. We just couldn’t process efficiently enough. We\n", + "were starting to get batch overruns. We were starting to see that a 24-hour window\n", + "isn’t the most optimum time in which we want to be able to deliver healthcare data\n", + "and services,” explained Peter James, Chief Architect at Healthdirect Australia.\n", + "\n", + "Ultimately, Healthdirect realized they needed to modernize their end-to-end process\n", + "and tech stack to properly support the business.\n", + "\n", + "**Modernizing analytics with Databricks and Delta Lake**\n", + "Databricks provides Healthdirect Australia with a Unified Data Platform that simplifies\n", + "data engineering and accelerates data science innovation. The notebook environment\n", + "enables them to make content changes in a controlled fashion rather than having to\n", + "run bespoke jobs each time.\n", + "\n", + "“Databricks has provided a big uplift for our teams and our data operations,” said\n", + "James. “The analysts were working directly with the data operations teams. They are\n", + "able to achieve the same pieces of work together within the same time frames that\n", + "used to take twice as long. They’re working together, and we’re seeing just a massive\n", + "acceleration in the speed at which we can deliver service.”\n", + "\n", + "\n", + "-----\n", + "\n", + "With Delta Lake, they’ve created logical data zones: Landing, Raw, Staging and Gold.\n", + "Within these zones, they store their data “as is,” in their structured or unstructured\n", + "state, in Delta Lake tables. From there, they use a metadata-driven schema and hold\n", + "the data within a nested structure within that table. What this allows them to do is\n", + "handle data consistently from every source and simplifies the mapping of data to the\n", + "various applications pulling the data.\n", + "\n", + "Meanwhile, through Structured Streaming, they were able to convert all of their\n", + "ETL batch jobs into streaming ETL jobs that could serve multiple applications\n", + "consistently. Overall, the advent of Spark Structured Streaming, Delta Lake and the\n", + "Databricks Unified Data Platform provides significant architectural improvements\n", + "that have boosted performance, reduced operational overheads and increased\n", + "process efficiencies.\n", + "\n", + "\n", + "**Faster data pipelines result in better patient-driven**\n", + "**healthcare**\n", + "As a result of the performance gains delivered by Databricks and the improved data\n", + "reliability through Delta Lake, Healthdirect Australia realized improved accuracy of\n", + "their fuzzy name match algorithm from less than 80% with manual verification to 95%\n", + "and no manual intervention.\n", + "\n", + "The processing improvements with Delta Lake and Structured Streaming allowed\n", + "them to process more than 30,000 automated updates per month. 
Prior to Databricks,\n", + "they had to use unreliable batch jobs that were highly manual to process the same\n", + "number of updates over a span of 6 months — a 6x improvement in data processing.\n", + "\n", + "“Databricks delivered the time to market as well as the analytics and operational\n", + "uplift that we needed in order to be able to meet the new demands of the\n", + "healthcare sector.”\n", + "\n", + "– Peter James, Chief Architect, Healthdirect Australia\n", + "\n", + "\n", + "-----\n", + "\n", + "They were also able to increase their data load rate to 1 million records per minute,\n", + "loading their entire 20 million record data set in 20 minutes. Before the adoption\n", + "of Databricks, this used to take more than 24 hours to process the same 1 million\n", + "transactions, blocking analysts from making swift decisions to drive results.\n", + "\n", + "Last, data security, which was critical to meet compliance requirements, was greatly\n", + "improved. Databricks provides standard security accreditations like HIPAA, and\n", + "Healthdirect was able to use Databricks to meet Australia’s security requirements.\n", + "This yielded significant cost reductions and gave them continuous data assurance\n", + "by monitoring changes to access privileges like changes in roles, metadata-level\n", + "security changes, data leakage, etc.\n", + "\n", + "“Databricks delivered the time to market as well as the analytics and operational\n", + "uplift that we needed in order to be able to meet the new demands of the healthcare\n", + "sector,” said James.\n", + "\n", + "With the help of Databricks, they have proven the value of data and analytics and how\n", + "it can impact their business vision. With transparent access to data that boasts\n", + "well-documented lineage and quality, participation across various business and\n", + "analyst groups has increased — empowering teams to collaborate and more\n", + "easily and quickly extract value from their data with the goal of improving\n", + "healthcare for everyone.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Comcast**\n", + "Uses Delta Lake and MLflow to\n", + "Transform the Viewer ExperienceSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
59a7715022c21c9e2cf313c9bc094c0bLast, data security, which was critical to meet compliance requirements, was greatly\n", + "improved. Databricks provides standard security accreditations like HIPAA, and\n", + "Healthdirect was able to use Databricks to meet Australia’s security requirements.\n", + "This yielded significant cost reductions and gave them continuous data assurance\n", + "by monitoring changes to access privileges like changes in roles, metadata-level\n", + "security changes, data leakage, etc.\n", + "\n", + "“Databricks delivered the time to market as well as the analytics and operational\n", + "uplift that we needed in order to be able to meet the new demands of the healthcare\n", + "sector,” said James.\n", + "\n", + "With the help of Databricks, they have proven the value of data and analytics and how\n", + "it can impact their business vision. With transparent access to data that boasts\n", + "well-documented lineage and quality, participation across various business and\n", + "analyst groups has increased — empowering teams to collaborate and more\n", + "easily and quickly extract value from their data with the goal of improving\n", + "healthcare for everyone.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Comcast**\n", + "Uses Delta Lake and MLflow to\n", + "Transform the Viewer Experience\n", + "\n", + "**Spotlight on Comcast**\n", + "**Industry:** Media and entertainment\n", + "10x\n", + "Reduction in overall compute costs to process data\n", + "90%\n", + "Reduction in required DevOps resources to manage infrastructure\n", + "Reduced\n", + "Deployment times from weeks to minutes\n", + "\n", + "As a global technology and media company connecting millions of customers to\n", + "personalized experiences, Comcast struggled with massive data, fragile data pipelines\n", + "\n", + "and poor data science collaboration. With Databricks — leveraging Delta Lake and MLflow\n", + "— they can build performant data pipelines for petabytes of data and easily manage the\n", + "lifecycle of hundreds of models to create a highly innovative, unique and award-winning\n", + "viewer experience using voice recognition and machine learning.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Infrastructure unable to support data and ML needs**\n", + "Instantly answering a customer’s voice request for a particular program while turning\n", + "billions of individual interactions into actionable insights, strained Comcast’s IT\n", + "infrastructure and data analytics and data science teams. To make matters more\n", + "complicated, Comcast needed to deploy models to a disjointed and disparate range\n", + "of environments: cloud, on-premises and even directly to devices in some instances.\n", + "\n", + "- **Massive data:** Billions of events generated by the entertainment system and 20+\n", + "million voice remotes, resulting in petabytes of data that need to be sessionized\n", + "for analysis.\n", + "\n", + "- **Fragile pipelines:** Complicated data pipelines that frequently failed and were\n", + "hard to recover. 
Small files were difficult to manage, slowing data ingestion for\n", + "downstream machine learning.\n", + "\n", + "- **Poor collaboration:** Globally dispersed data scientists working in different\n", + "scripting languages struggled to share and reuse code.\n", + "\n", + "- **Manage management of ML models:** Developing, training and deploying hundreds\n", + "of models was highly manual, slow and hard to replicate, making it difficult to scale.\n", + "\n", + "- **Friction between dev and deployment:** Dev teams wanted to use the latest tools\n", + "and models while ops wanted to deploy on proven infrastructure.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Automated infrastructure, faster data**\n", + "**pipelines with Delta Lake**\n", + "Comcast realized they needed to modernize their entire approach to analytics from\n", + "data ingest to the deployment of machine learning models to delivering new features\n", + "that delight their customers. Today, the Databricks Unified Data Platform enables\n", + "Comcast to build rich data sets and optimize machine learning at scale, streamline\n", + "workflows across teams, foster collaboration, reduce infrastructure complexity, and\n", + "deliver superior customer experiences.\n", + "\n", + "- **Simplified infrastructure management:** Reduced operational costs through\n", + "automated cluster management and cost management features such as\n", + "autoscaling and spot instances.\n", + "\n", + "\n", + "\n", + "- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\n", + "initial processing of the raw telemetry from video and voice applications and devices.\n", + "\n", + "- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\n", + "and reliable ingestion at scale.\n", + "\n", + "- **Collaborative workspaces:** Interactive notebooks improve cross-team\n", + "collaboration and data science creativity, allowing Comcast to greatly accelerate\n", + "model prototyping for faster iteration.\n", + "\n", + "- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\n", + "and model serving via the Kubeflow environment, allowing them to track and\n", + "manage hundreds of models with ease.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
97c9548f51ef028ef3850848b3a03fe8- **Simplified infrastructure management:** Reduced operational costs through\n", + "automated cluster management and cost management features such as\n", + "autoscaling and spot instances.\n", + "\n", + "\n", + "\n", + "- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\n", + "initial processing of the raw telemetry from video and voice applications and devices.\n", + "\n", + "- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\n", + "and reliable ingestion at scale.\n", + "\n", + "- **Collaborative workspaces:** Interactive notebooks improve cross-team\n", + "collaboration and data science creativity, allowing Comcast to greatly accelerate\n", + "model prototyping for faster iteration.\n", + "\n", + "- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\n", + "and model serving via the Kubeflow environment, allowing them to track and\n", + "manage hundreds of models with ease.\n", + "\n", + "- **Reliable ETL at scale:** Delta Lake provides efficient analytics pipelines at scale\n", + "that can reliably join historic and streaming data for richer insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering personalized experiences with ML**\n", + "In the intensely competitive entertainment industry, there is no time to press the\n", + "Pause button. Armed with a unified approach to analytics, Comcast can now fastforward into the future of AI-powered entertainment — keeping viewers engaged and\n", + "delighted with competition-beating customer experiences.\n", + "\n", + "- **Emmy-winning viewer experience:** Databricks helps enable Comcast to create\n", + "a highly innovative and award-winning viewer experience with intelligent voice\n", + "commands that boosts engagement.\n", + "\n", + "- **Reduced compute costs by 10x:** Delta Lake has enabled Comcast to optimize data\n", + "ingestion, replacing 640 machines with 64 while improving performance. Teams\n", + "can spend more time on analytics and less time on infrastructure management.\n", + "\n", + "- **Less DevOps:** Reduced the number of DevOps full-time employees required for\n", + "onboarding 200 users from 5 to 0.5.\n", + "\n", + "- **Higher data science productivity:** Fostered collaboration between global data\n", + "scientists by enabling different programming languages through a single\n", + "interactive workspace. Also, Delta Lake has enabled the data team to use data at\n", + "any point within the data pipeline, allowing them to act more quickly in building\n", + "and training new models.\n", + "\n", + "- **Faster model deployment:** Reduced deployment times from weeks to minutes as\n", + "operations teams deployed models on disparate platforms.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Banco Hipotecario**\n", + "Personalizes the Banking\n", + "Experience With Data and ML\n", + "\n", + "Banco Hipotecario — a leading Argentinian commercial bank — is on a mission\n", + "to leverage machine learning to deliver new insights and services that will delight\n", + "customers and create upsell opportunities. 
With a legacy analytics and data\n", + "warehousing system that was rigid and complex to scale, they turned to Databricks\n", + "to unify data science, engineering and analytics.\n", + "\n", + "As a result of this partnership, they were able to significantly increase customer\n", + "acquisition and cross-sells while lowering the cost for acquisition, greatly impacting\n", + "overall customer retention and profitability.\n", + "\n", + "**Spotlight on Banco Hipotecario**\n", + "**Industry:** Financial services\n", + "35%\n", + "\n", + "Reduction in cost of acquisition\n", + "**Technical use cases:** Ingest and ETL, machine learning and SQL Analytics\n", + "\n", + "\n", + "-----\n", + "\n", + "**Legacy analytics tools are slow, rigid and**\n", + "**impossible to scale**\n", + "Banco Hipotecario set forth to increase customer acquisition by reducing risk and\n", + "improving the customer experience. With data analytics and machine learning\n", + "anchoring their strategy, they hoped to influence a range of use cases from fraud\n", + "detection and risk analysis to serving product recommendations to drive upsell and\n", + "cross-sell opportunities and forecast sales.\n", + "\n", + "Banco Hipotecario faced a number of the challenges that often come along with\n", + "outdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n", + "— the list goes on.\n", + "\n", + "“In order to execute on our data analytics strategy, new technologies were needed\n", + "in order to improve data engineering and boost data science productivity,” said\n", + "Daniel Sanchez, Enterprise Data Architect at Banco Hipotecario. “The first steps we\n", + "took were to move to a cloud-based data lake, which led us to Azure Databricks\n", + "and Delta Lake.”\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
570d73760acc1c981d74f3474639d57e-----\n", + "\n", + "**Legacy analytics tools are slow, rigid and**\n", + "**impossible to scale**\n", + "Banco Hipotecario set forth to increase customer acquisition by reducing risk and\n", + "improving the customer experience. With data analytics and machine learning\n", + "anchoring their strategy, they hoped to influence a range of use cases from fraud\n", + "detection and risk analysis to serving product recommendations to drive upsell and\n", + "cross-sell opportunities and forecast sales.\n", + "\n", + "Banco Hipotecario faced a number of the challenges that often come along with\n", + "outdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n", + "— the list goes on.\n", + "\n", + "“In order to execute on our data analytics strategy, new technologies were needed\n", + "in order to improve data engineering and boost data science productivity,” said\n", + "Daniel Sanchez, Enterprise Data Architect at Banco Hipotecario. “The first steps we\n", + "took were to move to a cloud-based data lake, which led us to Azure Databricks\n", + "and Delta Lake.”\n", + "\n", + "\n", + "-----\n", + "\n", + "**A unified platform powers the data lake**\n", + "**and easy collaboration**\n", + "Banco Hipotecario turned to Databricks to modernize their data warehouse\n", + "environment, improve cross-team collaboration, and drive data science innovation.\n", + "Fully managed in Microsoft Azure, they were able to easily and reliably ingest massive\n", + "volumes of data, spinning up their whole infrastructure in 90 days. With Databricks’\n", + "automated cluster management capabilities, they are able to scale clusters ondemand to support large workloads.\n", + "\n", + "Delta Lake has been especially useful in bringing reliability and performance to Banco\n", + "Hipotecario’s data lake environment. With Delta Lake, they are now able to build\n", + "reliable and performant ETL pipelines like never before.\n", + "\n", + "\n", + "Meanwhile, performing SQL Analytics on Databricks has helped them do data\n", + "exploration, cleansing and generate data sets in order to create models, enabling the\n", + "team to deploy their first model within the first three months, and the second model\n", + "generated was rolled out in just two weeks.\n", + "\n", + "At the same time, data scientists were finally able to collaborate, thanks to interactive\n", + "notebooks; this meant faster builds, training and deployment. And MLflow streamlined\n", + "the ML lifecycle and removed the overreliance on data engineering.\n", + "\n", + "“Databricks gives our data scientists the means to easily create our own experiments\n", + "and deploy them to production in weeks, rather than months,” said Miguel Villalba,\n", + "Head of Data Engineering and Data Science.\n", + "\n", + "\n", + "-----\n", + "\n", + "**An efficient team maximizes customer**\n", + "**acquisition and retention**\n", + "Since moving to Databricks, the data team at Banco Hipotecario could not be happier,\n", + "as Databricks has unified them across functions in an integrated fashion.\n", + "\n", + "The results of data unification and markedly improved collaboration and autonomy\n", + "cannot be overstated. 
Since deploying Databricks, Banco Hipotecario has increased\n", + "their cross-sell into new products by a whopping 90%, while machine learning has\n", + "reduced the cost of customer acquisition by 35%.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Viacom18**\n", + "Migrates From Hadoop to Databricks to\n", + "Deliver More Engaging Experiences\n", + "\n", + "Viacom18 Media Pvt. Ltd. is one of India’s fastest-growing entertainment networks\n", + "with 40x growth over the past decade. They offer multi-platform, multigenerational\n", + "and multicultural brand experiences to 600+ million monthly viewers.\n", + "\n", + "In order to deliver more engaging experiences for their millions of viewers, Viacom18\n", + "migrated from their Hadoop environment due to its inability to process data at scale\n", + "efficiently. With Databricks, they have streamlined their infrastructure management,\n", + "increased data pipeline speeds and increased productivity among their data teams.\n", + "\n", + "Today, Viacom18 is able to deliver more relevant viewing experiences to their\n", + "subscribers, while identifying opportunities to optimize the business and drive\n", + "greater ROI.\n", + "\n", + "**Spotlight on Viacom18**\n", + "**Industry:** Media and entertainment\n", + "26%\n", + "Increase in operational efficiency lowers overall costs\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
8165c97cf16d36d736f0cfa3da080901-----\n", + "\n", + "**Viacom18**\n", + "Migrates From Hadoop to Databricks to\n", + "Deliver More Engaging Experiences\n", + "\n", + "Viacom18 Media Pvt. Ltd. is one of India’s fastest-growing entertainment networks\n", + "with 40x growth over the past decade. They offer multi-platform, multigenerational\n", + "and multicultural brand experiences to 600+ million monthly viewers.\n", + "\n", + "In order to deliver more engaging experiences for their millions of viewers, Viacom18\n", + "migrated from their Hadoop environment due to its inability to process data at scale\n", + "efficiently. With Databricks, they have streamlined their infrastructure management,\n", + "increased data pipeline speeds and increased productivity among their data teams.\n", + "\n", + "Today, Viacom18 is able to deliver more relevant viewing experiences to their\n", + "subscribers, while identifying opportunities to optimize the business and drive\n", + "greater ROI.\n", + "\n", + "**Spotlight on Viacom18**\n", + "**Industry:** Media and entertainment\n", + "26%\n", + "Increase in operational efficiency lowers overall costs\n", + "\n", + "\n", + "-----\n", + "\n", + "**Growth in subscribers and terabytes of viewing data**\n", + "**push Hadoop to its limits**\n", + "Viacom18, a joint venture between Network18 and ViacomCBS, is focused on\n", + "providing its audiences with highly personalized viewing experiences. The core\n", + "of this strategy requires implementing an enterprise data architecture that enables\n", + "the building of powerful customer analytics on daily viewer data. But with millions of\n", + "consumers across India, the sheer amount of data was tough to wrangle: They were\n", + "tasked with ingesting and processing over 45,000 hours of daily content on VOOT\n", + "(Viacom18’s on-demand video subscription platform), which easily generated 700GB\n", + "to 1TB of data per day.\n", + "\n", + "“Content is at the heart of what we do,” explained Parijat Dey, Viacom18’s Assistant\n", + "Vice President of Digital Transformation and Technology. “We deliver personalized\n", + "content recommendations across our audiences around the world based on\n", + "individual viewing history and preferences in order to increase viewership and\n", + "customer loyalty.”\n", + "\n", + "Viacom18’s data lake, which was leveraging on-premises Hadoop for operations,\n", + "wasn’t able to optimally process 90 days of rolling data within their management’s\n", + "defined SLAs, limiting their ability to deliver on their analytics needs, which impacted\n", + "not only the customer experience but also overall costs.\n", + "\n", + "To meet this challenge head-on, Viacom18 needed a modern data warehouse with the\n", + "ability to analyze data trends for a longer period of time instead of daily snapshots. They\n", + "also needed a platform that simplified infrastructure by allowing their team to easily\n", + "provision clusters with features like auto-scaling to help reduce compute costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Rapid data processing for analytics**\n", + "**and ML with Databricks**\n", + "To enable the processing power and data science capabilities they required, Viacom18\n", + "partnered with Celebal Technologies, a premier Salesforce, data analytics and big data\n", + "consulting organization based in India. 
The team at Celebal leveraged Azure Databricks\n", + "to provide Viacom18 with a unified data platform that modernizes its data warehousing\n", + "capabilities and accelerates data processing at scale.\n", + "\n", + "The ability to cache data within Delta Lake resulted in the much-needed acceleration\n", + "of queries, while cluster management with auto-scaling and the decoupling of\n", + "\n", + "\n", + "storage and compute simplified Viacom18’s infrastructure management and\n", + "optimized operational costs. “Delta Lake has created a streamlined approach to\n", + "the management of data pipelines,” explained Dey. “This has led to a decrease in\n", + "operational costs while speeding up time-to-insight for downstream analytics and\n", + "data science.”\n", + "\n", + "The notebooks feature was an unexpected bonus for Viacom18, as a common workspace\n", + "gave data teams a way to collaborate and increase productivity on everything from\n", + "model training to ad hoc analysis, dashboarding and reporting via PowerBI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Leveraging viewer data to power personalized**\n", + "**viewing experiences**\n", + "Celebal Technologies and Databricks have enabled Viacom18 to deliver innovative\n", + "customer solutions and insights with increased cross-team collaboration and\n", + "productivity. With Databricks, Viacom18’s data team is now able to seamlessly\n", + "navigate their data while better serving their customers.\n", + "\n", + "“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data\n", + "and deliver customer behavioral and engagement insights to the analysts and data\n", + "scientists,” said Dey.\n", + "\n", + "In addition to performance gains, the faster query times have also lowered the overall\n", + "cost of ownership, even with daily increases in data volumes. “Azure Databricks has\n", + "greatly streamlined processes and improved productivity by an estimated 26%,”\n", + "concluded Dey.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
d8efcb2139fbb03f80ddfc58b319efc1The notebooks feature was an unexpected bonus for Viacom18, as a common workspace\n", + "gave data teams a way to collaborate and increase productivity on everything from\n", + "model training to ad hoc analysis, dashboarding and reporting via PowerBI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Leveraging viewer data to power personalized**\n", + "**viewing experiences**\n", + "Celebal Technologies and Databricks have enabled Viacom18 to deliver innovative\n", + "customer solutions and insights with increased cross-team collaboration and\n", + "productivity. With Databricks, Viacom18’s data team is now able to seamlessly\n", + "navigate their data while better serving their customers.\n", + "\n", + "“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data\n", + "and deliver customer behavioral and engagement insights to the analysts and data\n", + "scientists,” said Dey.\n", + "\n", + "In addition to performance gains, the faster query times have also lowered the overall\n", + "cost of ownership, even with daily increases in data volumes. “Azure Databricks has\n", + "greatly streamlined processes and improved productivity by an estimated 26%,”\n", + "concluded Dey.\n", + "\n", + "Overall, Dey cites the migration from Hadoop to Databricks has delivered significant\n", + "business value — reducing the cost of failure, accelerating processing speeds at\n", + "scale, and simplifying ad hoc analysis for easier data exploration and innovations that\n", + "deliver highly engaging customer experiences.\n", + "\n", + "\n", + "-----\n", + "\n", + "# What’s next?\n", + "\n", + "Now that you understand Delta Lake, it may be time to take a look\n", + "at some additional resources.\n", + "\n", + "**Do a deep dive into Delta Lake >**\n", + "\n", + "- [Getting Started With Delta Lake Tech Talk Series](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks)\n", + "\n", + "- [Diving Into Delta Lake Tech Talk Series](https://databricks.com/discover/diving-into-delta-lake-talks)\n", + "\n", + "- [Visit the site](https://databricks.com/product/delta-lake-on-databricks) for additional resources\n", + "\n", + "**[Try Databricks for free >](https://databricks.com/try-databricks)**\n", + "**[Learn more >](https://pages.databricks.com/delta-lake-open-source-reliability-for-data-lakes-reg.html)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf2024-09-19T16:57:19Z
b6cb59222f689e17c785b0d37ad018bb**EBOOK**\n", + "\n", + "# All Roads Lead to the Lakehouse\n", + "\n", + "#### A deep dive into data ingestion with the lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction\n", + "\n", + "Life of a Data Engineer\n", + "\n", + "Ingesting From Cloud Object Stores\n", + "\n", + "COPY INTOSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf2024-09-19T16:57:19Z
e49a7d2e3bd1f6a60e1306c0186dcdd5COPY INTO\n", + "\n", + "Auto Loader\n", + "\n", + "Ingesting Data From External Applications\n", + "\n", + "Partner Connect\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "Organizations today are inundated with data siloed across various on-premises\n", + "application systems, databases, data warehouses and SaaS applications. This\n", + "fragmentation makes it difficult to support new use cases for analytics or machine\n", + "learning, so many IT teams are now centralizing all of their data with a lakehouse\n", + "architecture built on top of Delta Lake, an open format storage layer.\n", + "\n", + "The first thing data engineers need to do to support the lakehouse architecture is to\n", + "efficiently move data from various systems into their lakehouse. Ingesting data is a\n", + "critical first step in the data engineering and management lifecycle.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Life of a Data Engineer\n", + "\n", + "The primary focus of data engineers is to provide timely and reliable data to downstream\n", + "data teams at an organization. Requests for data can come from a variety of teams, and for\n", + "a variety of data types. For example:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf2024-09-19T16:57:19Z
292dc167706156bfcc1bfad9b793a6e7-----\n", + "\n", + "### Introduction\n", + "\n", + "Organizations today are inundated with data siloed across various on-premises\n", + "application systems, databases, data warehouses and SaaS applications. This\n", + "fragmentation makes it difficult to support new use cases for analytics or machine\n", + "learning, so many IT teams are now centralizing all of their data with a lakehouse\n", + "architecture built on top of Delta Lake, an open format storage layer.\n", + "\n", + "The first thing data engineers need to do to support the lakehouse architecture is to\n", + "efficiently move data from various systems into their lakehouse. Ingesting data is a\n", + "critical first step in the data engineering and management lifecycle.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Life of a Data Engineer\n", + "\n", + "The primary focus of data engineers is to provide timely and reliable data to downstream\n", + "\n", + "data teams at an organization. Requests for data can come from a variety of teams, and for\n", + "\n", + "\n", + "a variety of data types. For example:\n", + "\n", + "**•** Marketing team requests for Facebook and Google ad data in order to analyze spend and\n", + "\n", + "better allocate their budget for ads\n", + "\n", + "**•** Security team looking to get access to a table with low latency security data from Kafka,\n", + "\n", + "in order to run rules to detect intrusions into the network\n", + "\n", + "**•** Sales operations requesting customer data from Salesforce to enrich existing tables\n", + "\n", + "**•** Finance team hoping to find a way to automatically ingest critical data from Google\n", + "\n", + "Sheets or transaction data from AWS Kinesis\n", + "\n", + "In each of these common scenarios, data engineers must create usable and easily\n", + "\n", + "queryable tables from semi-structured and unstructured data. Beyond writing queries to\n", + "\n", + "retrieve and transform all this data, the data engineering team must also be concerned\n", + "\n", + "with performance, because running these queries on an ongoing basis can be a big load on\n", + "\n", + "the system.\n", + "\n", + "Data engineers face the challenge of constant requests and ongoing business\n", + "\n", + "\n", + "###### W H AT I S \n", + " D E LTA L A K E ?\n", + "\n", + "Before thinking about ingestion into Delta Lake, it’s important to\n", + "\n", + "understand why ingesting into Delta Lake is the right solution in\n", + "\n", + "the first place. [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format data management\n", + "\n", + "layer that brings data warehouse capabilities to your open data\n", + "\n", + "lake. Across industries, enterprises have enabled true collaboration\n", + "\n", + "among their data teams with a reliable single source of truth\n", + "\n", + "enabled by Delta Lake. By delivering quality, reliability, security and\n", + "\n", + "performance on your data lake — for both streaming and batch\n", + "\n", + "operations — Delta Lake eliminates data silos and makes analytics\n", + "\n", + "accessible across the enterprise. With Delta Lake, customers can\n", + "\n", + "build a cost-efficient, highly scalable lakehouse that eliminates\n", + "\n", + "data silos and provides self-serving analytics to end users.\n", + "\n", + "\n", + "requirements, as well as an ever-changing ecosystem. 
As business requirements change,\n", + "\n", + "so do the requirements around schemas, necessitating custom code to handle the\n", + "\n", + "changes. With all of these challenges, the work of a data engineer is extremely critical, and\n", + "\n", + "increasingly complex, with many steps involved before getting data to a state where it can\n", + "\n", + "actually be queried by the business stakeholders. So how do data engineers get the data\n", + "\n", + "that each of these teams need at the frequency, with the freshness, and in the format\n", + "\n", + "required?\n", + "\n", + "\n", + "-----\n", + "\n", + "### Ingesting From Cloud Object Stores\n", + "\n", + "There are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n", + "\n", + "cloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. Typically, customers are looking to migrate\n", + "\n", + "existing tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf2024-09-19T16:57:19Z
8b3547e698ff3b7bcbffb21d2cc6b55achanges. With all of these challenges, the work of a data engineer is extremely critical, and\n", + "\n", + "increasingly complex, with many steps involved before getting data to a state where it can\n", + "\n", + "actually be queried by the business stakeholders. So how do data engineers get the data\n", + "\n", + "that each of these teams need at the frequency, with the freshness, and in the format\n", + "\n", + "required?\n", + "\n", + "\n", + "-----\n", + "\n", + "### Ingesting From Cloud Object Stores\n", + "\n", + "There are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n", + "\n", + "cloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. Typically, customers are looking to migrate\n", + "\n", + "existing tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,\n", + "\n", + "[COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) , and [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . We will focus on Auto Loader and COPY INTO here.\n", + "\n", + "\n", + "**Auto Loader**\n", + "\n", + "Auto Loader is an optimized data ingestion tool that incrementally and efficiently\n", + "\n", + "processes new data files as they arrive in cloud storage with minimal DevOps effort. You\n", + "\n", + "just need to provide a source directory path and start a streaming job. The new structured\n", + "\n", + "streaming source, called “cloudFiles”, will automatically set up file notification services that\n", + "\n", + "\n", + "**COPY INTO**\n", + "\n", + "COPY INTO is a SQL command that allows you to perform batch file ingestion into Delta\n", + "\n", + "Lake. COPY INTO is a command that ingests files with exactly-once semantics, best used\n", + "\n", + "when the input directory contains thousands of files or fewer, and the user prefers SQL.\n", + "\n", + "COPY INTO can be used over JDBC to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "subscribe file events from the input directory and process new files as they arrive, with the\n", + "\n", + "option of also processing existing files in that directory. Auto Loader has interfaces through\n", + "\n", + "Python and Scala, and can be used with SQL through Delta Live Tables.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### COPY INTO\n", + "\n", + "\n", + "COPY INTO is a powerful yet simple SQL command that allows you to perform batch file\n", + "\n", + "ingestion into Delta Lake and perform many of the use cases outlined in this section. COPY\n", + "\n", + "INTO can be run once, in an ad hoc manner, and can be scheduled through Databricks jobs.\n", + "\n", + "```\n", + "FILEFORMAT = CSV\n", + "FORMAT_OPTIONS (‘header’ = ‘true’)\n", + "\n", + "```\n", + "\n", + "While COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n", + "\n", + "\n", + "events by using cloud functions such as AWS Lambda or through orchestrators like Apache\n", + "\n", + "Airflow. COPY INTO supports incremental appends and simple transformations.\n", + "\n", + "COPY INTO is a great command to use when your source directory contains a small number\n", + "\n", + "of files (i.e., thousands of files or less). 
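Based on the Auto Loader description above, a minimal Python sketch of what a "cloudFiles" stream looks like; the bucket paths, schema and checkpoint locations and target table name are placeholders, and `trigger(availableNow=True)` is just one way to run it as a periodic batch rather than a continuous stream.

```python
# Minimal Auto Loader sketch: the "cloudFiles" source incrementally discovers and
# processes new files as they land in the input path. Paths and table names below
# are placeholders, not values from this cookbook.
bronze_events = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/events")  # schema tracking/inference
    .load("s3://my-bucket/raw/events/")
)

(bronze_events.writeStream
    .option("checkpointLocation", "s3://my-bucket/_checkpoints/events")
    .trigger(availableNow=True)  # process everything available, then stop; omit for continuous mode
    .table("bronze.events"))
```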
To ingest a larger number of files, we recommend\n", + "\n", + "Auto Loader, which we will cover later in this eBook.\n", + "\n", + "**Common Use Cases for COPY INTO**\n", + "\n", + "**Ingesting data to a new Delta table**\n", + "\n", + "A common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n", + "\n", + "table. To copy data into a new Delta table, users can use CREATE TABLE command first,\n", + "\n", + "followed by COPY INTO.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf2024-09-19T16:57:19Z
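Before the credentialed example in the next section, a minimal sketch of the two-step flow just described, run from a notebook via `spark.sql()`. Table, path and column names are placeholders, and the cluster is assumed to already have access to the bucket (for example through an instance profile), so no inline credentials are needed, as the footnote below notes.

```python
# Step 1: create the target Delta table (placeholder schema).
spark.sql("CREATE TABLE IF NOT EXISTS my_table (id INT, name STRING, age INT)")

# Step 2: batch-ingest CSV files into it with COPY INTO (exactly-once per file).
spark.sql("""
    COPY INTO my_table
    FROM 's3://my_bucket/my_path'
    FILEFORMAT = CSV
    FORMAT_OPTIONS ('header' = 'true')
""")
```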
86f91de7df1d57274e8dc263ee0837a3```\n", + "FILEFORMAT = CSV\n", + "FORMAT_OPTIONS (‘header’ = ‘true’)\n", + "\n", + "```\n", + "\n", + "While COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n", + "\n", + "\n", + "events by using cloud functions such as AWS Lambda or through orchestrators like Apache\n", + "\n", + "Airflow. COPY INTO supports incremental appends and simple transformations.\n", + "\n", + "COPY INTO is a great command to use when your source directory contains a small number\n", + "\n", + "of files (i.e., thousands of files or less). To ingest a larger number of files, we recommend\n", + "\n", + "Auto Loader, which we will cover later in this eBook.\n", + "\n", + "**Common Use Cases for COPY INTO**\n", + "\n", + "**Ingesting data to a new Delta table**\n", + "\n", + "A common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n", + "\n", + "table. To copy data into a new Delta table, users can use CREATE TABLE command first,\n", + "\n", + "followed by COPY INTO.\n", + "\n", + "Step 1: `CREATE TABLE` `my_table (id` `INT` `, name STRING, age` `INT` `);`\n", + "Step 2 1 : `COPY INTO` `my_table`\n", + "```\n", + " FROM ‘s3://my_bucket/my_path’ WITH (\n", + " CREDENTIAL (\n", + " AWS_ACCESS_KEY = ‘*****’,\n", + " AWS_SECRET_KEY = ‘*****’,\n", + " AWS_SESSION_TOKEN = ‘*****’\n", + " )\n", + " ENCRYPTION (\n", + " TYPE = ‘AWS_SSE_C’,\n", + " MASTER_KEY = ‘*****’\n", + "\n", + "```\n", + "\n", + "The code block above covers the AWS temporary in-line credential format. When you use\n", + "\n", + "in-line credentials in Azure and AWS, the following parameters are required for each type of\n", + "\n", + "credential and encryption:\n", + "\n", + "\n", + "|Credential Name|Required Parameters|\n", + "|---|---|\n", + "|AWS temporary credentials|AWS_ACCESS_KEY AWS_SECRET_KEY|\n", + "||AWS_SESSION_TOKEN|\n", + "|Azure SAS token|AZURE_SAS_TOKEN|\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Encryption Name|Required Parameters|\n", + "|---|---|\n", + "|AWS server-side encryption with customer-provided encryption key|TYPE = ‘AWS_SSE_C’ MASTER_KEY|\n", + "|Azure client-provided encryption key|ATYPE = ‘AZURE_CSE’ MASTER_KEY|\n", + "\n", + "\n", + "**Appending data to your Delta table**\n", + "\n", + "To append data to a Delta table, users can leverage the COPY INTO command. COPY INTO\n", + "\n", + "is a powerful SQL command that is idempotent and incremental. When using COPY INTO,\n", + "\n", + "users point to a location of files, and once those files are ingested, Delta Lake will keep\n", + "\n", + "1 If you only have temporary access to a cloud object store, you can use temporary in-line credentials to ingest data from\n", + "the cloud object store. When you are an admin or with ANY FILE access, and the instance profile has been set for the\n", + "cloud object store, you do not need to specify credentials in-line for COPY INTO.\n", + "\n", + "\n", + "-----\n", + "\n", + "track of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n", + "\n", + "get idempotency with COPY INTO, which means users are prevented from ingesting the\n", + "\n", + "same data twice to the same table.\n", + "```\n", + " COPY INTO table_identifier\n", + " FROM [ file_location | ( SELECT expression_list FROM file_location)]\n", + " FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n", + " [ FILES = [file_name [,...] 
| PATTERN = ‘regex_pattern’ ]\n", + " [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n", + " [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n", + "\n", + "```\n", + "One of the main benefits of COPY INTO is that users don’t have to worry about providing a\n", + "\n", + "schema, because the schema is automatically inferred from your data files. Here is a very\n", + "\n", + "simple example of how you would ingest data from CSV files that have headers, where you
279384a45f18d48f7cfb7d752294e9fdtrack of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n", + "\n", + "get idempotency with COPY INTO, which means users are prevented from ingesting the\n", + "\n", + "same data twice to the same table.\n", + "```\n", + " COPY INTO table_identifier\n", + " FROM [ file_location | ( SELECT expression_list FROM file_location)]\n", + " FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n", + " [ FILES = [file_name [,...] | PATTERN = ‘regex_pattern’ ]\n", + " [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n", + " [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n", + "\n", + "```\n", + "One of the main benefits of COPY INTO is that users don’t have to worry about providing a\n", + "\n", + "schema, because the schema is automatically inferred from your data files. Here is a very\n", + "\n", + "simple example of how you would ingest data from CSV files that have headers, where you\n", + "\n", + "leave the tool to infer the schema and the proper data types. It’s as simple as that.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n", + "\n", + "```\n", + "**Using COPY INTO without an existing table** 2\n", + "\n", + "```\n", + " CREATE TABLE my_delta_table (dummy string);\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS (\n", + " ‘header’ = ‘true’ ,\n", + " ‘inferSchema’ = ‘true’ ,\n", + " ‘mergeSchema’ = ‘true’\n", + " )\n", + " COPY_OPTIONS ( ‘overwrite’ = ‘true’ , ‘overwriteSchema’ = ‘true’ )\n", + "\n", + "```\n", + "**Ingesting a CSV file without headers**\n", + "\n", + "If you are looking to ingest a CSV file that doesn’t have headers, columns will be named as\n", + "\n", + "_c0 or _c1, with the index of the column. You can use the double colon syntax to cast the\n", + "\n", + "data type that you want and then alias these columns to whatever you want to call them.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM ( SELECT\n", + " _c0::int as key,\n", + " _c1::double value,\n", + " _c2::timestamp event_time\n", + " FROM ‘s3://my-bucket/path/to/csv_files’ )\n", + " FILEFORMAT = CSV\n", + "\n", + "```\n", + "\n", + "In the most common case, in order to use COPY INTO, a table definition is required.\n", + "\n", + "However, if you would like to get started quickly and don’t have an existing table or require\n", + "\n", + "a specific schema, you can create your table with a dummy schema. Then, once you run\n", + "\n", + "COPY INTO, you can overwrite the table and overwrite the schema. COPY INTO will actually\n", + "\n", + "infer the data types, and then change your Delta table to have the required schema.\n", + "\n", + "2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving schema over time for CSV files** 3\n", + "\n", + "When ingesting CSV files that have a different number of columns than your existing table,\n", + "\n", + "you can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n", + "\n", + "as FORMAT_OPTIONS and COPY_OPTIONS. 
FORMAT_OPTIONS applies to the source data.\n", + "\n", + "Once “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n", + "\n", + "files and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n", + "\n", + "when you’re running the COPY INTO command. When “mergeSchema” is provided as a\n", + "\n", + "copy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n", + "\n", + "evolution only allows the addition of new columns. Data type changes for existing columns
4e8ea1d153355dc7b0defb3cc23841bd2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving schema over time for CSV files** 3\n", + "\n", + "When ingesting CSV files that have a different number of columns than your existing table,\n", + "\n", + "you can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n", + "\n", + "as FORMAT_OPTIONS and COPY_OPTIONS. FORMAT_OPTIONS applies to the source data.\n", + "\n", + "Once “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n", + "\n", + "files and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n", + "\n", + "when you’re running the COPY INTO command. When “mergeSchema” is provided as a\n", + "\n", + "copy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n", + "\n", + "evolution only allows the addition of new columns. Data type changes for existing columns\n", + "\n", + "are not supported.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM (SELECT\n", + " _C0::int as key,\n", + " _C1::double value,\n", + " _C2::timestamp event_time,\n", + " ...\n", + " FROM ‘s3://my-bucket/path/to/csv_files’ )\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n", + " COPY_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n", + "\n", + "```\n", + "\n", + "**Fixing bad data**\n", + "\n", + "If you find that there is a mistake in the source data file and some of the data you ingested\n", + "\n", + "is bad, you can use RESTORE on your Delta table and set it to the timestamp or version of\n", + "\n", + "the Delta table that you want to roll back to (e.g., to restore to yesterday’s data). Then you\n", + "\n", + "can rerun your COPY INTO command.\n", + "\n", + "Alternatively, if running a RESTORE is not possible, COPY INTO supports reloading files by\n", + "\n", + "the use of the “force” copy option. You can manually remove the old data from your Delta\n", + "\n", + "Lake table by running a DELETE operation and then using COPY INTO with “force” = “true”.\n", + "\n", + "You can use the PATTERN keyword to provide a file name pattern, or you can specify the file\n", + "\n", + "names with the FILES keyword to reload a subset of files in conjunction with “force”.\n", + "```\n", + " RESTORE my_delta_table TO TIMESTAMP AS OF date_sub(current_date(),\n", + " 1);\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " PATTERN = ‘2021-09-08*.csv’\n", + " FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n", + " COPY_OPTIONS ( ‘force’ = ‘true’ )\n", + "\n", + "```\n", + "3 Limitation: schema evolution with “mergeSchema” in COPY_OPTIONS does not work in Databricks SQL workspace or\n", + "clusters enabled with table ACLs.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Auto Loader\n", + "\n", + "\n", + "While COPY INTO can solve a lot of the key use cases our customers face, due to its\n", + "\n", + "limitations (scalability), there are many scenarios where we recommend Auto Loader\n", + "\n", + "for data ingestion. Auto Loader is a data source on Databricks that incrementally and\n", + "\n", + "efficiently processes new data files as they arrive in cloud storage with minimal DevOps\n", + "\n", + "effort. 
Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n", + "\n", + "Auto Loader is an incremental streaming source that provides exactly-once ingestion\n", + "\n", + "guarantees. It keeps track of which files have been ingested using a durable key-value store.\n", + "\n", + "It can discover new files very efficiently and is extremely scalable. Auto Loader has been\n", + "\n", + "battle tested. We have seen customers running Auto Loader on millions of files an hour, and\n", + "\n", + "petabytes of data per day.\n", + "\n", + "To use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating
468a232d6502fe03815a58da27ad126flimitations (scalability), there are many scenarios where we recommend Auto Loader\n", + "\n", + "for data ingestion. Auto Loader is a data source on Databricks that incrementally and\n", + "\n", + "efficiently processes new data files as they arrive in cloud storage with minimal DevOps\n", + "\n", + "effort. Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n", + "\n", + "Auto Loader is an incremental streaming source that provides exactly-once ingestion\n", + "\n", + "guarantees. It keeps track of which files have been ingested using a durable key-value store.\n", + "\n", + "It can discover new files very efficiently and is extremely scalable. Auto Loader has been\n", + "\n", + "battle tested. We have seen customers running Auto Loader on millions of files an hour, and\n", + "\n", + "petabytes of data per day.\n", + "\n", + "To use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating\n", + "\n", + "that you will use Auto Loader to load files from the cloud object stores. Next, you specify\n", + "\n", + "the format of the file — for example, JSON — as an option to Auto Loader, and you specify\n", + "\n", + "where to load it from.\n", + "```\n", + " df = spark.readStream.format( “cloudFiles” )\n", + " .option( “cloudfiles.format” , “json” )\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "Under the hood, when data lands in your cloud storage, Auto Loader discovers files either\n", + "\n", + "through directory listing or file notifications. Given permissions to the underlying storage\n", + "\n", + "bucket or container, Auto Loader can list the directory that you want to load data from\n", + "\n", + "in an efficient and scalable manner and load data immediately. Alternatively, Auto Loader\n", + "\n", + "can also automatically set up file notifications on your storage account, which allows it\n", + "\n", + "\n", + "from queues, deduplicate these notifications using its key-value store and then process\n", + "\n", + "the underlying files. If there are any failures, Auto Loader will replay what hasn’t been\n", + "\n", + "processed, giving you exactly-once semantics.\n", + "\n", + "Directory listing mode is very easy to get started with. If your files are uploaded to your\n", + "\n", + "cloud storage system in a lexicographical order, Auto Loader will optimize the discovery of\n", + "\n", + "files by starting directory listing from the latest uploaded files, saving you both time and\n", + "\n", + "money. If files cannot be uploaded in a lexicographical order and you need Auto Loader\n", + "\n", + "to scale to high volumes, Databricks recommends using the file notification mode. Cloud\n", + "\n", + "services such as AWS Kinesis Firehose, AWS DMS and Azure Data Factory can be configured\n", + "\n", + "to upload files in a lexical order, typically by providing the upload time of records in the file\n", + "\n", + "path, such as /base/path/yyyy/MM/dd/HH/file.format.\n", + "\n", + "**Common Use Cases for Auto Loader**\n", + "\n", + "**New to Auto Loader**\n", + "\n", + "As a new user to the Databricks Lakehouse, you’ll want to ingest data from cloud object\n", + "\n", + "stores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n", + "\n", + "example using Python to demonstrate the ease and flexibility of Auto Loader with a few\n", + "\n", + "defined options. 
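\n", + "\n", + "Before that notebook example (shown next), one note on the file notification mode described above: it is switched on with a dedicated Auto Loader option. A minimal sketch, assuming the queue and notification permissions are already configured in your cloud account (the path is illustrative):\n", + "```\n", + " # Sketch: use cloud file notifications instead of directory listing to discover new files.\n", + " df = (spark.readStream.format('cloudFiles')\n", + "   .option('cloudFiles.format', 'json')\n", + "   .option('cloudFiles.useNotifications', 'true')\n", + "   .load('/path/to/table'))\n", + "\n", + "```\n", + "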
You can run the code in a notebook.\n", + "```\n", + " stream = spark.readStream \\\n", + "   .format('cloudFiles') \\\n", + "   .option('cloudFiles.format', 'csv') \\\n", + "   .option('cloudFiles.schemaLocation', schema_location) \\\n", + "   .load(raw_data_location)\n", + "\n", + "```\n", + "\n", + "to efficiently discover newly arriving files. When a file lands in file notification mode, the\n", + "\n", + "cloud storage system sends a notification to a queuing system. For example, in AWS, S3\n", + "\n", + "will send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n", + "\n", + "On Google, it’ll be sent to Pub/Sub. Auto Loader can then fetch these event notifications\n", + "\n", + "\n", + "-----
03ec31bf5ade7d7b5e7d464b53e164b3stores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n", + "\n", + "example using Python to demonstrate the ease and flexibility of Auto Loader with a few\n", + "\n", + "defined options. You can run the code in a notebook.\n", + "```\n", + " stream = spark.readStream \\\n", + " .format( “cloudFiles” ) \\\n", + " .option( “cloudFiles.format” , “csv” ) \\\n", + " .option( “cloudFiles.schemaLocation” , schema_location) \\\n", + " .load(raw_data_location)\n", + "\n", + "```\n", + "\n", + "to efficiently discover newly arriving files. When a file lands in file notification mode, the\n", + "\n", + "cloud storage system sends a notification to a queuing system. For example, in AWS, S3\n", + "\n", + "will send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n", + "\n", + "On Google, it’ll be sent to Pub/Sub. Auto Loader can then fetch these event notifications\n", + "\n", + "\n", + "-----\n", + "\n", + "In order to write to a Delta table from the stream, follow the example below:\n", + "```\n", + " stream.writeStream \\\n", + " .option( “mergeSchema” , “true” ) \\\n", + " .option( “checkpointLocation” , checkpoint_location) \\\n", + " .start(target_delta_table_location)\n", + "\n", + "```\n", + "**Migrating to Auto Loader**\n", + "\n", + "As a Spark user, you may be using an existing Spark structured streaming to process data.\n", + "\n", + "To migrate to Auto Loader, all a user needs to do is take existing streaming code and turn\n", + "\n", + "two lines of it into ‘cloudFiles’, specifying the file format within an option.\n", + "\n", + "\n", + "**Migrating a livestreaming pipeline**\n", + "\n", + "Migrating a livestreaming pipeline can be challenging, but with Auto Loader, as with COPY\n", + "\n", + "INTO, you can specify a timestamp when the source files are updated or created and Auto\n", + "\n", + "Loader will ingest all modified data after that point.\n", + "```\n", + " df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.format” , “json” )\n", + " .option( “modifiedAfter” , “2021-09-09 00:00:00” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "**Schema inference and evolution**\n", + "\n", + "Auto Loader provides schema inference and management capabilities. With a schema\n", + "\n", + "location specified, Auto Loader can store the changes to the inferred schema over time. For\n", + "\n", + "file formats like JSON and CSV, where the schemas can get fuzzy, schema inference on Auto\n", + "\n", + "Loader can automatically infer data types or treat everything as a string.\n", + "\n", + "When data does not match your schema (e.g., an unknown column or format), Auto Loader\n", + "\n", + "has a data rescue capability that will “rescue” all data in a separate column, stored as a\n", + "\n", + "JSON string, to investigate later. 
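\n", + "\n", + "A minimal sketch of that rescue behavior, assuming the documented cloudFiles.schemaLocation and cloudFiles.schemaEvolutionMode options and the default _rescued_data column (paths are illustrative, so treat this as a sketch rather than a drop-in pipeline):\n", + "```\n", + " # Sketch: infer and track the schema, and rescue non-conforming data instead of failing.\n", + " df = (spark.readStream.format('cloudFiles')\n", + "   .option('cloudFiles.format', 'json')\n", + "   .option('cloudFiles.schemaLocation', '/path/to/schema')\n", + "   .option('cloudFiles.schemaEvolutionMode', 'rescue')\n", + "   .load('/path/to/table'))\n", + "\n", + " # Rows whose fields did not match the inferred schema carry them here as a JSON string.\n", + " bad_rows = df.filter('_rescued_data IS NOT NULL')\n", + "\n", + "```\n", + "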
See [rescued data column](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader-schema.html#rescued-data-column) for more details.\n", + "\n", + "Auto Loader supports three schema evolution modes: add new columns as they are\n", + "\n", + "discovered, fail if an unexpected column is seen, or rescue new columns.\n", + "\n", + "```\n", + "df = (spark.readStream\n", + " .format('cloudFiles')\n", + " .option('cloudFiles.format', 'json')\n", + " .options(**format_options)\n", + " .schema(schema)\n", + " .load('/path/to/table'))\n", + "\n", + "```\n", + "```\n", + "df = (spark.readStream\n", + " .format('json')\n", + " .options(**format_options)\n", + " .schema(schema)\n", + " .load('/path/to/table'))\n", + "\n", + "```\n", + "\n", + "Once it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n", + "\n", + "Loader can scale to trillions of files, unlike the open-source file streaming source. One of\n", + "\n", + "the ways that Auto Loader does this is with asynchronous backfills. Instead of needing
a7ce16b7acddcb62a9d5d9081ed85761Auto Loader supports three schema evolution modes: add new columns as they are\n", + "\n", + "discovered, fail if an unexpected column is seen, or rescue new columns.\n", + "\n", + "```\n", + "df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.\n", + "format” , “json” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "```\n", + "df = spark.readStream\n", + " .format( “json” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "\n", + "Once it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n", + "\n", + "Loader can scale to trillions of files, unlike the open-source file streaming source. One of\n", + "\n", + "the ways that Auto Loader does this is with asynchronous backfills. Instead of needing\n", + "\n", + "to discover files first, then plan, Auto Loader discovers and processes files concurrently,\n", + "\n", + "making it much more efficient and leading to cost reductions in compute resources.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fixing a file that was processed with Auto Loader**\n", + "\n", + "To fix a file that was already processed, Auto Loader supports an option called\n", + "\n", + "‘allowOverwrites’. With this option, Auto Loader can re-ingest and reprocess a file with a\n", + "\n", + "new timestamp. If you want to enable this option in an existing Auto Loader stream, you\n", + "\n", + "need to stop and restart the Auto Loader stream with the enabled option.\n", + "```\n", + " df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.format” , “json” )\n", + " .schema(schema)\n", + " .option( “cloudFiles.allowOverwrites” , “true” )\n", + " .options(format_options)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "**Discover missing data**\n", + "\n", + "While event notification is a very scalable method to collect all data, it relies on cloud\n", + "\n", + "services, which are distributed systems and are not always reliable. With Auto Loader, you\n", + "\n", + "can additionally specify a backfill interval, where Auto Loader will perform asynchronous\n", + "\n", + "backfills at whatever interval you set up. This can be enabled with a once trigger,\n", + "\n", + "```\n", + " df = spark.readStream\n", + " .format(“cloudFiles”)\n", + " .option(“cloudFiles.format”, “json”)\n", + " .schema(schema)\n", + " .option( “cloudFiles.backfillInterval” , “1 week” )\n", + " .options(format_options)\n", + " .load(“/path/to/table”)\n", + " .writeStream\n", + " .trigger(Trigger.AvailableNow())\n", + " .option(“checkpointLocation”, checkpointDir)\n", + " .start()\n", + "\n", + "```\n", + "The trigger tells Auto Loader how frequently to process incoming data. A processing time\n", + "\n", + "trigger will have Auto Loader run continuously and schedule micro-batches at the trigger\n", + "\n", + "interval which you have set. The “Once” and “AvailableNow” triggers instruct Auto Loader to\n", + "\n", + "process all new data that has been added until the start of your application. Once the data\n", + "\n", + "is processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n", + "\n", + "process all the new data in a single micro-batch, which requires it to first discover all the\n", + "\n", + "new files. 
With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n", + "\n", + "and perform rate limiting, which makes it a preferable alternative to Trigger Once.\n", + "\n", + "\n", + "processing time trigger and available now trigger. The following example shows how to use\n", + "\n", + "backfill interval and trigger availableNow together:\n", + "\n", + "\n", + "-----\n", + "\n", + "**Using Auto Loader in SQL with Delta Live Tables**\n", + "\n", + "Delta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n", + "\n", + "framework to develop, test, monitor, manage and operationalize data pipelines at scale to\n", + "\n", + "drive insights for data science, machine learning and analytics. Auto Loader is available in\n", + "\n", + "Delta Live Tables.
416b7ff99129f37d49da101b799d54a1process all new data that has been added until the start of your application. Once the data\n", + "\n", + "is processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n", + "\n", + "process all the new data in a single micro-batch, which requires it to first discover all the\n", + "\n", + "new files. With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n", + "\n", + "and perform rate limiting, which makes it a preferable alternative to Trigger Once.\n", + "\n", + "\n", + "processing time trigger and available now trigger. The following example shows how to use\n", + "\n", + "backfill internal and trigger availableNow together:\n", + "\n", + "\n", + "-----\n", + "\n", + "**Using Auto Loader in SQL with Delta Live Tables**\n", + "\n", + "Delta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n", + "\n", + "framework to develop, test, monitor, manage and operationalize data pipelines at scale to\n", + "\n", + "drive insights for data science, machine learning and analytics. Auto Loader is available in\n", + "\n", + "Delta Live Tables.\n", + "\n", + "```\n", + "CREATE INCREMENTAL LIVE TABLE\n", + " autoloader_test\n", + "AS\n", + "SELECT\n", + " *,\n", + " id + id2 AS new_id\n", + "FROM\n", + " CLOUD_FILES (\n", + " “some/cloud/path” , – the path to the data\n", + " “json” – the file format\n", + " );\n", + "\n", + "```\n", + "\n", + "**Live Tables understands**\n", + "\n", + "**and coordinates data flow**\n", + "\n", + "**between your queries**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Ingesting Data From External Applications\n", + "\n", + "While Auto Loader and COPY INTO are powerful tools, not all data is available as files\n", + "\n", + "in cloud object stores. In order to enable a lakehouse, it is critical to incorporate all of\n", + "\n", + "your data and break down the silos between sources and downstream teams. To do this,\n", + "\n", + "customers need to discover and connect a broad set of data, BI and AI tools, and systems\n", + "\n", + "to the data within their lakehouse.\n", + "\n", + "##### Partner Connect\n", + "\n", + "Historically, stitching multiple enterprise tools and data sources together has been a burden\n", + "\n", + "on the end user, making it very complicated and expensive to execute at any scale. Partner\n", + "\n", + "Connect solves this challenge by making it easy for you to integrate data, analytics and AI\n", + "\n", + "tools directly within their Databricks Lakehouse. It also allows you to discover new, pre-\n", + "\n", + "validated solutions from Databricks partners that support your expanding analytics needs.\n", + "\n", + "To ingest into the lakehouse, select the partner tile in Partner Connect via the left\n", + "\n", + "navigation bar in Databricks. Partner Connect will automatically configure resources such\n", + "\n", + "as clusters, tokens and connection files for you to connect with your data ingestion tools\n", + "\n", + "of choice. You can finish signing up for a trial account on the partner’s website or directly\n", + "\n", + "log in if you already used Partner Connect to create a trial account. 
Once you log in, you will\n", + "\n", + "see that Databricks is already configured as a destination in the partner portal and ready\n", + "\n", + "to be used.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Common Use Case for Partner Connect**\n", + "\n", + "**Ingest Salesforce data via Fivetran into Delta Lake**\n", + "\n", + "Clicking on the Fivetran tile in Partner Connect starts an automated workflow between\n", + "\n", + "the two products. Databricks automatically provisions a SQL endpoint and associated\n", + "\n", + "credentials for Fivetran to interact with, and passes the user’s identity and the SQL\n", + "\n", + "\n", + "endpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n", + "\n", + "Databricks destination is automatically created. This destination is configured to ingest into\n", + "\n", + "Delta via the SQL endpoint that was auto-configured by Partner Connect.\n", + "\n", + "The customer now selects their choice of data source in Fivetran from hundreds of pre-\n", + "\n", + "built connectors — for example, Salesforce. The user authenticates to the Salesforce\n", + "\n", + "source, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "(in this case the Account & Contact objects) and starts the initial sync. This automation\n", + "\n", + "has saved users dozens of manual steps and copying/pasting of configuration if theySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf2024-09-19T16:57:19Z
aef326197a1a461981dba1e157a1928fClicking on the Fivetran tile in Partner Connect starts an automated workflow between\n", + "\n", + "the two products. Databricks automatically provisions a SQL endpoint and associated\n", + "\n", + "credentials for Fivetran to interact with, and passes the user’s identity and the SQL\n", + "\n", + "\n", + "endpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n", + "\n", + "Databricks destination is automatically created. This destination is configured to ingest into\n", + "\n", + "Delta via the SQL endpoint that was auto-configured by Partner Connect.\n", + "\n", + "The customer now selects their choice of data source in Fivetran from hundreds of pre-\n", + "\n", + "built connectors — for example, Salesforce. The user authenticates to the Salesforce\n", + "\n", + "source, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "(in this case the Account & Contact objects) and starts the initial sync. This automation\n", + "\n", + "has saved users dozens of manual steps and copying/pasting of configuration if they\n", + "\n", + "manually set up the connection. It also protects the user from making any unintentional\n", + "\n", + "configuration errors and spending time debugging those errors. The Salesforce tables\n", + "\n", + "are now available to query, join and analyze in Databricks SQL. Watch the [demo](https://databricks.com/partnerconnect#partner-demos) for more\n", + "\n", + "details or check out the [Partner Connect guide](https://docs.databricks.com/integrations/partner-connect/index.html?_gl=1*1mz2ts6*_gcl_aw*R0NMLjE2MzY2NzU1NDcuQ2p3S0NBaUFtN09NQmhBUUVpd0FydkdpM0ZHS3ptZTR5Z2YzR3E4ajVrYTNaUExOUEFnaTZIMnNRU05EMC1RYzl0dGxXQjl6ajRuNU14b0N0OGdRQXZEX0J3RQ..&_ga=2.83627156.328510291.1641248936-1825366797.1612985070) to learn more.\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 5,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n", + "\n", + "unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the\n", + "\n", + "globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a\n", + "\n", + "mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , LinkedIn and Facebook .\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf2024-09-19T16:57:19Z
-----\n", + "\n", + "# TABLE OF CONTENTS\n", + "\n", + "\n", + "##### Welcome to Data, Analytics and AI ....... 02\n", + "\n", + "**Do you know what you’re getting into?** ............................................ **02**\n", + "\n", + "**How to use this book** .......................................................................... **02**\n", + "\n", + "##### Business Value .......................................................................... 03\n", + "\n", + "**Talking to the business (feels like combat)** ..................................... **03**\n", + "\n", + "**Creating Value Alignment** ................................................................. **03**\n", + "\n", + "**Goals and Outcomes** ......................................................................... **04**\n", + "\n", + "##### Ultimate Class Build Guide .................................. 04\n", + "\n", + "**Creating a character** .......................................................................... **04**\n", + "\n", + "- Data Engineers ................................................................................. **04**\n", + "\n", + "- Data Scientists ................................................................................. **05**\n", + "\n", + "- Data Analysts ................................................................................... **05**\n", + "\n", + "##### Diving In ............................................................................................... 05\n", + "\n", + "**Producing game data** ........................................................................ **05**\n", + "\n", + "**And receiving it in cloud** ................................................................... **08**\n", + "\n", + "**Getting data from your game to the cloud** ..................................... **08**\n", + "\n", + "##### The Value of Data Throughout the Game Developer Lifecycle ................................... 09\n", + "\n", + "**Lifecycle overview** ............................................................................. **09**\n", + "\n", + "**Use data to develop a next-generation customer experience** ..... **09**\n", + "\n", + "##### Getting Started with Gaming Use Cases .............................................................. 10\n", + "\n", + "**Where do I start? Start with Game Analytics** ................................. **10**\n", + "\n", + "**Understand your audience** ............................................................... **11**\n", + "\n", + "- Player Segmentation ........................................................................ **11**\n", + "\n", + "- Player Lifetime Value ....................................................................... **12**\n", + "\n", + "- Social Media Monitoring .................................................................. **12**\n", + "\n", + "- Player Feedback Analysis ................................................................ **13**\n", + "\n", + "- Toxicity Detection ............................................................................ **13**\n", + "\n", + "**Find your audience** ............................................................................ **14**\n", + "\n", + "**Activating Your Playerbase** ............................................................... **15**\n", + "\n", + "- Player Recommendations ................................................................ **15**\n", + "\n", + "- Next Best Offer/Action ..................................................................... **15**\n", + "\n", + "- Churn Prediction & Prevention ........................................................ **16**\n", + "\n", + "- Real-time Ad Targeting .................................................................... **16**\n", + "\n", + "**Operational Use Cases** ...................................................................... **17**\n", + "\n", + "- Anomaly Detection ........................................................................... **17**\n", + "\n", + "- Build Pipeline .................................................................................... **17**\n", + "\n", + "- Crash Analytics ................................................................................. **18**\n", + "\n", + "##### Things to Look Forward To ..................................... 19\n", + "\n", + " Appendix .............................................................................................. 21\n", + "\n", + "**Ultimate Class Build Guide** ............................................................... **21**\n", + "\n", + "- Creating a Character ........................................................................ **21**\n", + "\n", + "- Data Engineers ................................................................................. **21**\n", + "\n", + "- Data Scientists ................................................................................. **21**\n", + "\n", + "- Data Analysts ................................................................................... **22**\n", + "\n", + "**Data Access and the Major Cloud Providers** ................................ 
**23**\n", + "\n", + "- Cloud Rosetta Stone \b�������������������������������������������������������������������������� **23**\n", + "\n", + "- Jargon Glossary \b������������������������������������������������������������������������������������ **23**\n", + "\n", + "- Getting started with the major cloud providers \b������������������� **23**\n", + "\n", + "**Getting Started with Detailed Use Cases** \b���������������������������������� **25**\n", + "\n", + "- Game analytics \b������������������������������������������������������������������������������������� **25**\n", + "\n", + "- Player Segmentation \b�������������������������������������������������������������������������� **25**\n", + "\n", + "- Player Lifetime Value \b�������������������������������������������������������������������������� **26**\n", + "\n", + "- Social Media Monitoring \b������������������������������������������������������������������� **28**\n", + "\n", + "- Player Feedback Analysis \b���������������������������������������������������������������� **29**\n", + "\n", + "- Toxicity Detection \b������������������������������������������������������������������������������� **30**\n", + "\n", + "- Multi-Touch Attribution and Media Mix Modeling \b���������������� **31**\n", + "\n", + "- Player Recommendations \b���������������������������������������������������������������� **32**\n", + "\n", + "- Next Best Offer/Action \b���������������������������������������������������������������������� **33**\n", + "\n", + "- Churn Prediction & Prevention \b����������������������������������������������������� **34**\n", + "\n", + "- Real-time Ad Targeting \b���������������������������������������������������������������������� **35**\n", + "\n", + "**Getting Started with Operational Use Cases** \b�������������������������� **36**\n", + "\n", + "- Anomaly Detection \b����������������������������������������������������������������������������� **36**\n", + "\n", + "- Build Pipeline \b����������������������������������������������������������������������������������������������������� **37**\n", + "\n", + "- Crash Analytics \b������������������������������������������������������������������������������������� **39**\n", + "\n", + "\n", + "Multi-Touch Attribution \b��������������������������������������������������������������������� **14**\n", + "\n", + "\n", + "-----\n", + "\n", + "# Welcome to Data, Analytics, and AI\n", + "\n", + "\n", + "### Do you know what you’re getting into?\n", + "\n", + "You may have heard the stories of game studios spending\n", + "\n", + "countless hours trying to more effectively acquire, engage,\n", + "\n", + "and retain players. Well, did you know that data, analytics,\n", + "\n", + "and AI plays a central role in the development and operation\n", + "\n", + "of today’s top-grossing video games? Studios globally\n", + "\n", + "struggle with fragmented views of their audience, with data\n", + "\n", + "often outpacing legacy technologies. Today, the need for real-\n", + "\n", + "time capabilities and the leap from descriptive to predictive\n", + "\n", + "analytics has made it so that data, analytics, and AI are no\n", + "\n", + "longer a “nice-to-have”, but table stakes for success.\n", + "\n", + "The objective of this handbook is to guide you on the\n", + "\n", + "role data, analytics, and AI plays in the development\n", + "\n", + "and operations of video games. 
We’ll cover who the key\n", + "\n", + "stakeholders are and how to align people across businessSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
4aabe7bca7a25a31b9106e788373642b# Welcome to Data, Analytics, and AI\n", + "\n", + "\n", + "### Do you know what you’re getting into?\n", + "\n", + "You may have heard the stories of game studios spending\n", + "\n", + "countless hours trying to more effectively acquire, engage,\n", + "\n", + "and retain players. Well, did you know that data, analytics,\n", + "\n", + "and AI plays a central role in the development and operation\n", + "\n", + "of today’s top-grossing video games? Studios globally\n", + "\n", + "struggle with fragmented views of their audience, with data\n", + "\n", + "often outpacing legacy technologies. Today, the need for real-\n", + "\n", + "time capabilities and the leap from descriptive to predictive\n", + "\n", + "analytics has made it so that data, analytics, and AI are no\n", + "\n", + "longer a “nice-to-have”, but table stakes for success.\n", + "\n", + "The objective of this handbook is to guide you on the\n", + "\n", + "role data, analytics, and AI plays in the development\n", + "\n", + "and operations of video games. We’ll cover who the key\n", + "\n", + "stakeholders are and how to align people across business\n", + "\n", + "units. Then we’ll talk through strategies to help you\n", + "\n", + "successfully advocate for data, analytics, and AI projects\n", + "\n", + "internally. Finally, we dive deep through the most common\n", + "\n", + "use cases. We want to give you enough information to feel\n", + "\n", + "\n", + "well as helpful tips when operating as or working with one of\n", + "\n", + "these classes.\n", + "\n", + "We follow this with the fundamentals for building a Proof\n", + "\n", + "of Concept (POC) or Minimum Viable Product (MVP). That\n", + "\n", + "is, connecting to the cloud; accessing your data; and\n", + "\n", + "most importantly, being able to represent the value you’re\n", + "\n", + "seeking to unlock as you sell your project into your team and\n", + "\n", + "broader organization.\n", + "\n", + "Finally, we’ll dive into the most common use cases for data,\n", + "\n", + "analytics, and AI within game development. Similar to a tech-\n", + "\n", + "tree in a video game, we begin with the most basic use cases\n", + "\n", + "- setting up your game analytics. Then we progress through\n", + "\n", + "more advanced data use cases such as player segmentation,\n", + "\n", + "assessing lifetime value, detecting and mitigating toxicity,\n", + "\n", + "multi-touch attribution, recommendation engines, player\n", + "\n", + "churn prediction and prevention, and more.\n", + "\n", + "Don’t forget to review the Appendix. You’ll find a handy\n", + "\n", + "“ Jargon Glossary ”, “ Cloud Rosetta Stone ”, and “ get started\n", + "\n", + "guide for the three major cloud providers ”. All incredibly\n", + "\n", + "helpful assets to keep as hotkeys.\n", + "\n", + "\n", + "empowered to make a demonstrable impact. Just by reading\n", + "\n", + "this you are adding incredible insight and value to yourself as\n", + "\n", + "\n", + "an industry professional. Quest on!\n", + "\n", + "### How to use this book\n", + "\n", + "This book is primarily intended for technical professionals\n", + "\n", + "who are engaging with data within game studios. 
No\n", + "\n", + "matter your role in the gaming industry, you will be able to\n", + "\n", + "glean key takeaways that will make you more effective in\n", + "\n", + "your individual role and within the larger team — be that\n", + "\n", + "production, art, engineering, marketing, or otherwise.\n", + "\n", + "Begin your journey by reviewing the “ **Data, Analytics, and AI**\n", + "\n", + "**Ground Rules** ” section to the right, which presents some\n", + "\n", + "rules and guidelines for interpreting\n", + "\n", + "the role that data plays in the game development lifecycle.\n", + "\n", + "Next, it’s time to learn about the key professions (aka\n", + "\n", + "character classes) that interact and engage with data,\n", + "\n", + "analytics, and AI on a consistent basis within a game studio.\n", + "\n", + "This section breaks down each of the classes, providing an\n", + "\n", + "\n", + "**Data, Analytics, and AI Ground Rules**\n", + "\n", + "This guide assumes you understand the following:\n", + "\n", + "- You understand the basics of data, analytics, and AI:\n", + "\n", + "How and why data is stored in a system, why data\n", + "\n", + "is transformed, the different types of output that\n", + "\n", + "data can feed into — such as a report, an analysis\n", + "\n", + "answering a question, or a machine learning model.\n", + "\n", + "If this is the first time you’re creating a character,
63873e31dde7e8f550e5f42350b7b441Begin your journey by reviewing the “ **Data, Analytics, and AI**\n", + "\n", + "**Ground Rules** ” section to the right, which presents some This\n", + "\n", + "section presents some rules and guidelines for interpreting\n", + "\n", + "the role that data plays in the game development lifecycle.\n", + "\n", + "Next, it’s time to learn about the key professions (aka\n", + "\n", + "character classes) that interact and engage with data,\n", + "\n", + "analytics, and AI on a consistent basis within a game studio.\n", + "\n", + "This section breaks down each of the classes, providing an\n", + "\n", + "\n", + "**Data, Analytics, and AI Ground Rules**\n", + "\n", + "This guide assumes you understand the following:\n", + "\n", + "- You understand the basics of data, analytics, and AI:\n", + "\n", + "How and why data is stored in a system, why data\n", + "\n", + "is transformed, the different types of output that\n", + "\n", + "data can feed into — such as a report, an analysis\n", + "\n", + "answering a question, or a machine learning model.\n", + "\n", + "If this is the first time you’re creating a character,\n", + "\n", + "we highly recommend reviewing our data, analytics,\n", + "\n", + "and AI tutorial — aka getting started training and\n", + "\n", + "documentation, available at [dbricks.co/training](https://www.databricks.com/learn/training/home)\n", + "\n", + "- You have a basic understanding of cloud\n", + "\n", + "infrastructure. Specifically what it is, who are the\n", + "\n", + "key players, and associated terms (e.g., virtual\n", + "\n", + "machines, APIs, applications)\n", + "\n", + "- You are generally aware of the game development\n", + "\n", + "lifecycle; pre-production, production, testing/QA,\n", + "\n", + "launch, operation\n", + "\n", + "\n", + "overview of each character’s strengths and weaknesses as\n", + "\n", + "\n", + "-----\n", + "\n", + "# Business Value\n", + "\n", + "\n", + "Demonstrating business value is important when working\n", + "\n", + "on data, analytics, and AI projects because it helps ensure\n", + "\n", + "that the efforts of the project are aligned with the goals\n", + "\n", + "and objectives of the business. By showing how the project\n", + "\n", + "can positively impact a game’s key performance indicators\n", + "\n", + "(KPIs) and bottom-line metrics, such as game revenue, player\n", + "\n", + "satisfaction, and operational efficiency, studio stakeholders\n", + "\n", + "are more likely to support and invest in the project.\n", + "\n", + "Additionally, demonstrating business value can help justify\n", + "\n", + "the resources, time, and money that are required to execute\n", + "\n", + "the project, and can also help prioritize which projects should\n", + "\n", + "be pursued. By focusing on business value, data, analytics,\n", + "\n", + "and AI projects can become strategic initiatives that\n", + "\n", + "contribute to the long-term success of your game studio.\n", + "\n", + "### Talking to the business (feels like combat)\n", + "\n", + "While we highly encourage everyone to read this section,\n", + "\n", + "you may already feel confident understanding the needs and\n", + "\n", + "concerns of your internal stakeholders, and how to sell-in a\n", + "\n", + "project successfully. If so, feel free to skip this section.\n", + "\n", + "We would love to dive into the data to explore and discover\n", + "\n", + "as much as possible, unfortunately in most environments,\n", + "\n", + "we are limited by resources and time. 
Understanding both\n", + "\n", + "the businesses pain points and strategic goals is crucial to\n", + "\n", + "choosing projects that will benefit the business, create value\n", + "\n", + "and make your message much easier to sell.\n", + "\n", + "Whenever we embark on a proof-of-concept (PoC) or\n", + "\n", + "minimum viable product (MVP) — to prove out a new\n", + "\n", + "**Questions to ask:**\n", + "\n", + "- What other strategic goals and pain points can\n", + "\n", + "you list out and how would you prioritize them as\n", + "\n", + "a business leader?\n", + "\n", + "- Does your prioritization match how your team,\n", + "\n", + "manager and/or leadership would prioritize?\n", + "\n", + "Typically the closer the match, the easier initial\n", + "\n", + "projects will be to “sell”.\n", + "\n", + "\n", + "methodology or technology — we will need to pitch it back\n", + "\n", + "for adoption. The technology could be revolutionary and\n", + "\n", + "absolutely amazing, but without the value proposition and tie\n", + "\n", + "back to goals, it is likely to land flat or fail to be adopted.\n", + "\n", + "It is key to talk to your stakeholders to understand theirSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
772c9eca2e71c36590bdc92e8d2d54ffthe businesses pain points and strategic goals is crucial to\n", + "\n", + "choosing projects that will benefit the business, create value\n", + "\n", + "and make your message much easier to sell.\n", + "\n", + "Whenever we embark on a proof-of-concept (PoC) or\n", + "\n", + "minimum viable product (MVP) — to prove out a new\n", + "\n", + "**Questions to ask:**\n", + "\n", + "- What other strategic goals and pain points can\n", + "\n", + "you list out and how would you prioritize them as\n", + "\n", + "a business leader?\n", + "\n", + "- Does your prioritization match how your team,\n", + "\n", + "manager and/or leadership would prioritize?\n", + "\n", + "Typically the closer the match, the easier initial\n", + "\n", + "projects will be to “sell”.\n", + "\n", + "\n", + "methodology or technology — we will need to pitch it back\n", + "\n", + "for adoption. The technology could be revolutionary and\n", + "\n", + "absolutely amazing, but without the value proposition and tie\n", + "\n", + "back to goals, it is likely to land flat or fail to be adopted.\n", + "\n", + "It is key to talk to your stakeholders to understand their\n", + "\n", + "perception of pain points and positions on potential projects\n", + "\n", + "to add value. Much like stopping at the Tavern when the\n", + "\n", + "adventuring party gets to town, these can be informal\n", + "\n", + "conversations where you socialize potential solutions while\n", + "\n", + "gathering information about what matters.\n", + "\n", + "### Creating value alignment\n", + "\n", + "So what are your strategic goals and pain points and how\n", + "\n", + "might they be addressed through a use case from a PoC or\n", + "\n", + "MVP leveraging your data?\n", + "\n", + "A few examples of strategic goals that are top of mind for our\n", + "\n", + "customers at the beginning of any fiscal or calendar year:\n", + "\n", + "- Reduce costs\n", + "\n", + "- Simplify your infrastructure\n", + "\n", + "- Acquire more players\n", + "\n", + "- Monetize your playerbase\n", + "\n", + "- Retain your players (aka prevent churn)\n", + "\n", + "Here are four ways the Databricks Lakehouse can provide\n", + "\n", + "value that aligns with your strategic goals and pain points:\n", + "\n", + "`1.` **\u0007Improved collaboration:** Databricks platform allows\n", + "\n", + "everyone to share and collaborate on data, notebooks and\n", + "\n", + "models between data scientists, engineers and business\n", + "\n", + "users. This enables for a more efficient and streamlined\n", + "\n", + "process for data analysis and decision making.\n", + "\n", + "`2.` **Find and explore your data:** The data in the Lakehouse is\n", + "\n", + "cataloged and accessible, which enables business users\n", + "\n", + "to explore and query the data easily and discover insights\n", + "\n", + "by themselves.\n", + "\n", + "`3.` **\u0007Uncover actionable business insights:** By putting\n", + "\n", + "your game’s data into a Lakehouse architecture, it\n", + "\n", + "can be better analyzed using various tools provided\n", + "\n", + "by Databricks such as SQL, dashboards, notebooks,\n", + "\n", + "visualization and machine learning to better understand\n", + "\n", + "your playerbase, providing valuable insights into player\n", + "\n", + "behavior and performance. 
These insights can help the\n", + "\n", + "\n", + "-----\n", + "\n", + "and retention, and use that information to improve the\n", + "\n", + "game and grow monetization.\n", + "\n", + "`4.` **Lead with data-driven decisions:** A Lakehouse\n", + "\n", + "architecture provides a single source of truth for your\n", + "\n", + "organization’s data. Data engineers write once, data\n", + "\n", + "analysts interpret the data, and data scientists can run\n", + "\n", + "machine learning models on the same data.\n", + "\n", + "_This cannot be understated in the value this provides an_\n", + "\n", + "_organization from a total cost of ownership perspective._\n", + "\n", + "With the ability to access and analyze all the data in one\n", + "\n", + "place, the business can make unified data-driven decisions,\n", + "\n", + "rather than relying on intuition or fragmented data.\n", + "\n", + "### Goals and outcomes\n", + "\n", + "Like many projects, starting with a strong foundation of ‘what\n", + "\n", + "success looks like’ will significantly improve your likelihood\n", + "\n", + "of achieving your objectives. Here are a few best-practices\n", + "\n", + "we recommend:\n", + "\n", + "`1.` **Set goals:** Define your hypothesis, then use your data\n", + "\n", + "and process to prove or disprove your hypothesis. You
3f363efc9a09828918cfc25136a598b3`4.` **\u0007Lead with data-driven decisions:** A Lakehouse\n", + "\n", + "architecture provides a single source of truth for your\n", + "\n", + "organization’s data. Data engineers write once, data\n", + "\n", + "analysts interpret the data, and data scientists can run\n", + "\n", + "machine machine learning models on the same data.\n", + "\n", + "_This cannot be understated in the value this provides an_\n", + "\n", + "_organization from a total cost of ownership perspective._\n", + "\n", + "With the ability to access and analyze all the data in one\n", + "\n", + "place, the business can make unified data-driven decisions,\n", + "\n", + "rather than relying on intuition or fragmented data.\n", + "\n", + "### Goals and outcomes\n", + "\n", + "Like many projects, starting with a strong foundation of ‘what\n", + "\n", + "success looks like’ will significantly improve your likelihood\n", + "\n", + "of achieving your objectives. Here are a few best-practices\n", + "\n", + "we recommend:\n", + "\n", + "`1.` **Set goals:** Define your hypothesis, then use your data\n", + "\n", + "and process to prove or disprove your hypothesis. You\n", + "\n", + "have a goal in mind, make it part of the experiment. If\n", + "\n", + "the outcome differs from the expectation, that is part of\n", + "\n", + "experiments and we can learn from it to improve the next\n", + "\n", + "experiment. This is all about shortening the feedback loop\n", + "\n", + "\n", + "project appropriately. For example, are you doing this as\n", + "\n", + "a side project? Do you have 2 sprints to show progress?\n", + "\n", + "It’s important to scope your project based on the time,\n", + "\n", + "resources, and quality needed for the said project to be a\n", + "\n", + "success.\n", + "\n", + "`3.` **Scope down:** Ruthlessly control scope for any PoC or\n", + "\n", + "MVP. Prioritization is your best friend. Stakeholders and\n", + "\n", + "your own internal team will naturally want to increase\n", + "\n", + "scope because there’s no shortage of good ideas. But by\n", + "\n", + "controlling scope, you improve your chances of shipping\n", + "\n", + "on time and on budget. Don’t let perfection be the enemy\n", + "\n", + "of good. There are always exceptions to this, but that is\n", + "\n", + "what the next sprint is for.\n", + "\n", + "`4.` **Deliver on time:** Recovering lost goodwill is incredibly\n", + "\n", + "difficult - strive to always deliver on time. Make sure your\n", + "\n", + "goals, constraints and scope creep will not explode your\n", + "\n", + "timeline as creating tight feedback loops and iteration\n", + "\n", + "cycles is what will make you more agile than the competition.\n", + "\n", + "`5.` **Socialize early, and often:** Show quantifiable value as\n", + "\n", + "quickly as possible, both to your immediate team and\n", + "\n", + "business stakeholders. Measure the value as frequently\n", + "\n", + "as makes sense, and socialize early and often to promote\n", + "\n", + "visibility of the project and ensure tight alignment across\n", + "\n", + "teams. This will empower you to create tighter feedback\n", + "\n", + "loops that will help improve any future iterations of your\n", + "\n", + "product, platform, or technology.\n", + "\n", + "\n", + "between insight and action.\n", + "\n", + "# Ultimate Class Build Guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "Have you rolled your character already? 
Data engineers, data scientists, and data analysts form the heart of mature game data teams. Though, depending on studio size and resources, game developers may also be pulled in from time to time to perform data engineering and/or data science tasks. For the sake of this guide, we'll keep focus on the roles of data engineers, data scientists, and data analysts. There are many aspects to these roles, but they can be summarized in that Data Engineers create and maintain critical data workflows, Data Analysts interpret data and create reports that keep the business teams running seamlessly, and Data Scientists are responsible for making sense of large amounts of data. Depending on the size of the organization, individuals may be required to multiclass in order to address the needs of the team. In smaller studios, it's often developers who wear multiple hats, including those in data engineering, analytics and data science. Key characters include:

**Data Engineers**

Data engineers build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret. Their ultimate goal is to make data accessible so that teams can use it to evaluate and optimize a goal or objective.

**Data Scientists**

Data scientists determine the questions their team should be asking and figure out how to answer those questions using data. They often develop predictive models for theorizing and forecasting.

**Data Analysts**

A data analyst reviews data to identify key insights into a game studio's customers and ways the data can be used to solve problems.

[…] to report on the health of a title or building a recommendation engine for your players, this guide will help you better understand the unique classes required to develop and maintain an effective data, analytics, and AI platform.

**Learn more about these character classes**

# Diving In

Before we get to the primary use cases of game data, analytics, and AI, we need to cover some basics. That is, the different types of game data and how they are produced. And the subsequent receiving of that data in the cloud to collect, clean, and prepare for analysis.

### Producing game data…

Speaking in generalities, there are four buckets of data as it relates to your video game.

**1. Game Telemetry**

Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine.
And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.

Some of the primary metrics that are typically tracked in game telemetry include (a sample event payload follows this list):

- **Player engagement:** Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.
- **Game progress:** Monitor player progress through different levels and milestones in the game.
- **In-game purchases:** Track the number and value of in-game purchases made by players.
- **Player demographics:** Collect demographic information about players, such as age, gender, location, and device type.
- **Session length:** Monitor the length of each player session, and how often players return to the game.
- **Retention:** Track the percentage of players who return to the game after their first session.
- […] such as the types of actions taken, the number of deaths, and the use of power-ups.
- **User Acquisition:** Track the number of new players acquired through different marketing channels.
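To make these metrics concrete, here is a minimal sketch of what a single telemetry event emitted by the game client might look like. The field names are purely illustrative, not a required schema.

```python
# Illustrative telemetry event payload (field names are hypothetical, not a required schema).
telemetry_event = {
    "event_type": "session_end",          # e.g., session_start, level_complete, purchase
    "player_id": "player-12345",
    "session_id": "c0ffee-0001",
    "timestamp": "2023-04-01T18:23:45Z",  # event time, used later to order events
    "platform": "ios",                    # device / demographic context
    "level": 12,                          # game progress
    "session_length_seconds": 1860,       # session length
    "purchases_usd": 4.99,                # in-game purchases made during the session
}
```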
**2. Business KPIs**

The second bucket of data is business key performance indicators (or KPIs). Business KPIs are metrics that measure the performance and success of a video game from a business perspective. The primary data sources for business KPIs include game telemetry, stores, and marketplaces. These KPIs help game studios understand the financial and operational performance of their games and make informed decisions about future development and growth.

Some of the primary business metrics that are typically tracked include (a short calculation sketch follows this list):

- **Revenue:** Track the total revenue generated by the game, including sales of the game itself, in-game purchases, and advertising.
- **Player Acquisition Cost (CAC):** Calculate the cost of acquiring a new player, including marketing and advertising expenses.
- **Lifetime Value (LTV):** Estimate the amount of revenue a player will generate over the course of their time playing the game.
- **Player Retention:** Track the percentage of players who continue to play the game over time, and how long they play for.
- **Engagement:** Measure the level of engagement of players with the game, such as the number of sessions played, time spent playing, and in-game actions taken.
- **User Acquisition:** Track the number of new players acquired through different marketing channels and the cost of acquiring each player.
- **Conversion Rate:** Measure the percentage of players who make an in-game purchase or complete a specific action.
- **Gross Margin:** Calculate the profit generated by the game after subtracting the cost of goods sold, such as the cost of game development and server hosting.
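As a rough illustration of how a few of these KPIs fall out of raw records, the sketch below computes revenue, ARPU, conversion rate, and CAC from a tiny, made-up table of per-player spend; the column names and numbers are hypothetical.

```python
import pandas as pd

# Hypothetical per-player revenue records (column names are made up for this sketch).
players = pd.DataFrame({
    "player_id": [1, 2, 3, 4, 5],
    "revenue_usd": [0.0, 4.99, 0.0, 19.99, 0.0],  # lifetime spend per player
})

marketing_spend_usd = 50.0  # assumed total acquisition spend for this cohort

revenue = players["revenue_usd"].sum()
arpu = revenue / len(players)                          # average revenue per user
conversion_rate = (players["revenue_usd"] > 0).mean()  # share of players who purchased
cac = marketing_spend_usd / len(players)               # player acquisition cost

print(f"Revenue: ${revenue:.2f}, ARPU: ${arpu:.2f}, "
      f"Conversion: {conversion_rate:.0%}, CAC: ${cac:.2f}")
```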
**3. Game Services**

Similar to game telemetry, game services provide critical infrastructure that requires careful monitoring and management. These services include things like game server hosting, […] and more. Here the source of data is the game services used.

Some of the common metrics game teams typically track for these services include (a small concurrency calculation follows this list):

- **Concurrent Players:** Track the number of players who are simultaneously connected to the game servers to ensure that the servers have enough capacity to handle the player demand.
- **Server Availability:** Monitor the uptime and downtime of the game servers to ensure that players have access to the game when they want to play, particularly important for global live service games where demand fluctuates throughout the day.
- **Latency:** Measure the time it takes for data to travel from the player's device to the game server and back, to ensure that players have a smooth and responsive gaming experience.
- **Network Bandwidth:** Monitor the amount of data being transmitted between the player's device and the game server to ensure that players have a high-quality gaming experience, even on slow internet connections.
- **Live Operations:** Monitor the success of in-game events, promotions, and other live operations to understand what resonates with players and what doesn't.
- **Player Feedback:** Monitor player feedback and reviews, including ratings and comments on social media, forums, and app stores, to understand what players like and dislike about the game.
- **Chat Activity:** Track the number of messages and interactions between players in the game's chat channels to understand the level of social engagement and community building in the game.
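For example, concurrent players can be derived from plain session records by counting session starts and ends over time. Here is a minimal pandas sketch with made-up timestamps.

```python
import pandas as pd

# Hypothetical session records: one row per session with start and end times.
sessions = pd.DataFrame({
    "player_id": [1, 2, 3, 4],
    "start": pd.to_datetime(["2023-04-01 18:00", "2023-04-01 18:05",
                             "2023-04-01 18:10", "2023-04-01 18:40"]),
    "end":   pd.to_datetime(["2023-04-01 18:30", "2023-04-01 19:00",
                             "2023-04-01 18:20", "2023-04-01 19:10"]),
})

# +1 at each session start, -1 at each session end; a running sum over the sorted
# change points gives the number of concurrent players at any moment.
events = pd.concat([
    pd.DataFrame({"time": sessions["start"], "delta": 1}),
    pd.DataFrame({"time": sessions["end"], "delta": -1}),
]).sort_values("time")
events["concurrent"] = events["delta"].cumsum()

print("Peak concurrent players:", events["concurrent"].max())
```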
**4. Data beyond the game**

The last bucket comes from data sources beyond the video game. These typically include the following:

- **Social Media Data:** Social media platforms, such as Facebook, Twitter, TikTok and Instagram, can provide valuable insights into player behavior, feedback and preferences, as well as help game teams understand how players are talking about their games online with different communities.
- **Forum Data:** Online forums and discussion boards, such as Reddit and Discord, can be rich sources of player feedback and opinions about the game.
- **Player Reviews:** Ratings and reviews on app stores, such as Steam, Epic, Google Play and the Apple App Store, can provide valuable feedback on player experiences and help game teams identify areas for improvement.
- **Third-Party Data:** Third-party data sources, such as market research firms and industry data providers, can provide valuable insights into broader gaming trends and help game teams make informed decisions about their games and marketing strategies.

#### The secret to success is bringing all of the disparate data sources together, so you have as complete a 360-degree view as possible of what's happening in and around your game.

This is a lot of data. And it's no wonder that studios globally struggle with fragmented views of their audience, with data often outpacing legacy technologies. Today, the need for real-time capabilities and the leap from descriptive to predictive analytics has made it so that data, analytics, and AI are now table stakes for a game to be successful. Tapping into these four buckets of data sources, you'll find actionable insights that drive better understanding of your playerbase, more efficient acquisition, stronger and longer lasting engagement, and monetization that deepens the relationship with your players. That's what we're going to dig into throughout the rest of this book.

**Let's begin with how to get data out of your game!**

There are a variety of ways to get data out of the game and into cloud resources. In this section, we will provide resources for producing data streams in Unity and Unreal. In addition, we will also provide a generic approach that will work for any game engine, as long as you are able to send HTTP requests.

**Unity**

Since Unity supports C#, you would use a .NET SDK from the cloud provider of your choice. All three major cloud providers have .NET SDKs to use, and I have linked the documentation for each below. No matter the cloud provider, if you want to use an SDK you install it through the NuGet package manager into your Unity project. [A walkthrough of how to implement the .NET SDK using AWS](https://www.youtube.com/watch?v=yv4ynyCytdU) is provided here.

- **AWS:** [AWS .NET SDK - Unity considerations](https://docs.aws.amazon.com/sdk-for-net/v3/developer-guide/unity-special.html)
- **GCP:** [GCP .NET SDK Documentation](https://cloud.google.com/dotnet/docs/reference)
- **Azure:** [Azure .NET SDK Overview](https://learn.microsoft.com/en-us/dotnet/azure/sdk/azure-sdk-for-dotnet)
- **Kafka (Open-source alternative):** [Kafka .NET connector](https://github.com/confluentinc/confluent-kafka-dotnet)

From here, the SDK is used to send data to a messaging service.
These messaging services will be covered in more detail in the next section.

**Unreal Engine**

Unreal supports development with C++, so you could use C++ SDKs or Blueprint interfaces to those SDKs. The resources for each SDK are provided here:

- **AWS:** [How to integrate the AWS C++ SDK with Unreal Engine](https://aws.amazon.com/blogs/gametech/how-to-integrate-the-aws-c-sdk-with-unreal-engine/)
- **Azure:** [Azure C++ SDK with PlayFab](https://learn.microsoft.com/en-us/gaming/playfab/sdks/unreal/)
- **Kafka (Open-source alternative):** [Getting started with Kafka and C++](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)

Just like with the Unity example above, from here the data is sent to a messaging/streaming service.

Other engines may not support C++ or C#, but there is still a way to get your data into the cloud, no matter the language! By hitting an API Gateway with an HTTP POST request, you are able to send data to cloud services from many more types of applications. A sample high-level architecture of this solution in AWS and Azure can be seen below:

_[Architecture diagrams: AWS and Azure]_
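To illustrate the engine-agnostic path, the sketch below posts a telemetry event to an API Gateway endpoint over HTTP. The endpoint URL, API key header, and payload fields are placeholders for whatever contract your gateway exposes; Python is used only for brevity, since any language that can issue an HTTP POST works the same way.

```python
import json
import requests  # any HTTP client in any language works the same way

# Placeholder endpoint and key; substitute the URL and auth scheme of your own API Gateway.
ENDPOINT = "https://example.execute-api.us-east-1.amazonaws.com/prod/telemetry"
API_KEY = "replace-me"

event = {
    "event_type": "level_complete",
    "player_id": "player-12345",
    "level": 7,
    "timestamp": "2023-04-01T18:23:45Z",
}

response = requests.post(
    ENDPOINT,
    headers={"Content-Type": "application/json", "x-api-key": API_KEY},
    data=json.dumps(event),
    timeout=5,
)
response.raise_for_status()  # surface gateway errors instead of silently dropping events
```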
Once the data has been sent from the game into an event-streaming service, how do we get that data to a more permanent home? Here we will start by outlining what these messaging services do and how we can use them to point our data to a desired location.

Messaging services ingest real-time event data, being streamed to them from a number of different sources, and then send them to their appropriate target locations. These target locations can be databases, compute clusters or cloud object stores.
A key property of the messaging services is to preserve the time at which the events arrive, so that the order in which events occurred is always known. Examples of cloud messaging services include AWS Kinesis Firehose, Google PubSub, and Azure Event Hubs Messaging. If you prefer to use open-source products, Apache Kafka is a very popular open-source alternative.

### Getting data from your game to the cloud

Moving to the cloud platform part of the journey involves building a gaming Lakehouse. The gaming Lakehouse allows gaming companies to store, manage, and analyze large volumes of gaming data, such as player behavior, performance metrics, and financial transactions, to gain valuable insights and make data-driven decisions to improve their business outcomes.

**Next, here are the basics of the Databricks platform, simplified.**

**Data Ingestion:**

- Data can be ingested into the Gaming Lakehouse using various built-in data ingestion capabilities provided by Databricks, such as Structured Streaming and Delta Live Tables, which provide a single, simple API that handles both streaming and batch pipelines.
- Data can be ingested in real-time or batch mode from various sources such as game clients, servers or APIs, then cleaned, transformed and enriched with additional data sources, making it ready for analysis (a minimal streaming ingestion sketch follows this list).
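As a sketch of what that ingestion step can look like on Databricks, the snippet below uses Structured Streaming to read game events from a Kafka topic and append them into a Delta table. The broker address, topic, schema, checkpoint path, and table name are all placeholders.

```python
from pyspark.sql import SparkSession
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import StringType, StructField, StructType, TimestampType

spark = SparkSession.builder.getOrCreate()

# Expected shape of the JSON telemetry events (fields are illustrative).
event_schema = StructType([
    StructField("event_type", StringType()),
    StructField("player_id", StringType()),
    StructField("timestamp", TimestampType()),
])

raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "broker:9092")  # placeholder broker
    .option("subscribe", "game_telemetry")             # placeholder topic
    .load()
)

# Kafka delivers bytes; decode and parse the JSON payload into columns.
events = raw.select(from_json(col("value").cast("string"), event_schema).alias("e")).select("e.*")

(
    events.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/game_telemetry")  # placeholder path
    .toTable("main.game_analytics.raw_telemetry")                     # placeholder table
)
```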
**Data Storage:**

- Data is stored in object storage such as S3, Azure Storage or GCP Buckets using Delta Lake.
- Delta Lake is an open-source storage framework that makes it easy to maintain data consistency and track changes.

**Data Governance & Cataloging:**

- Unity Catalog in Databricks provides tools for data governance that help with compliance and controlling access to data in the lake.
- Unity Catalog also allows you to track data lineage, auditing and data discovery with the use of data catalogs and governance.
- Metadata about the data, including the structure, format, and location of the data, can be stored in a data catalog.

**Data Quality:**

- The Databricks platform enables you to validate, clean and enrich data using built-in libraries and rule-based validation using Delta Live Tables.
- It also allows tracking data quality issues and missing values by using Delta Live Tables.

**Data Security:**

- Databricks provides a comprehensive security model to secure data stored in the lake.
- Access to data can be controlled through robust access controls on objects such as catalogs, schemas, tables, rows, columns, models, experiments, and clusters.

**Analytics:**

- The processed data can be analyzed using various tools provided by Databricks such as SQL dashboards, notebooks, visualizations and ML.
- Game studios can gain insights into player performance and behavior to better engage players and improve their games.

**Get started with your preferred cloud**

# The Value of Data Throughout the Game Development Lifecycle

### Lifecycle overview

Over the last decade, the way games have been developed and monetized has changed dramatically. Most if not all top grossing games are now built using a games-as-a-service strategy, meaning titles are shipped in cycles of constant iteration to increase engagement and monetization of players over time.
Games-as-a-Service models have the ability to create sticky, high-margin games, but they also heavily depend on cloud-based services such as game play analytics, multiplayer servers and matchmaking, player relationship management, performance marketing and more.

Data plays an integral role in the development and operation of video games. Teams need tools and services to optimize player lifetime value (LTV) with databases that can process terabytes to petabytes of evolving data, analytics solutions that can access that data with near real-time latency, and machine learning (ML) models that can translate insights into actionable and innovative gameplay features.

A game's development lifecycle is unique to each studio. With different skillsets, resources, and genres of games, there is no one model. Below is a simplified view of a game development lifecycle for a studio running a games-as-a-service model.

**Game Development Lifecycle**

_Games-as-a-Service (GaaS) / Games-as-a-Community (GaaC)_

`1.` **Pre-Production:** Brainstorm how to give life to the many ideas laid out in the planning phase

`2.` **Production:** Most of the time, effort, and resources spent on developing video games are spent in the production stage

`3.` **Testing:** Every feature and mechanic in the game needs to be tested for game loop and quality control

`4.` **Launch:** Whether developing alongside the community with alpha and beta releases, or launching into general availability, a game launch is a critical milestone

`5.` **Operation:** As studios increasingly adopt games-as-a-service models, the ongoing operation of a video game is as critical as the launch itself

_[Diagram: lifecycle phases spanning discovery & compatibility, integration, release, publish awareness, onboarding, build & test, flighting and experimentation, and operate / measure / engage / monetize]_

What's important to remember is that throughout your title's development lifecycle, there is data that can help you better understand your audience, more effectively find and acquire players, and more easily activate and engage them. Whether using game play data to optimize creative decision making during pre-production, tapping machine learning models to predict and prevent churn, or identifying the next best offer or action for your players in real-time, **data is your friend**.

### Use data to develop a next-generation customer experience

In the game industry, customer experience (CX) is an important factor that can impact a player's enjoyment of a game and the length they choose to play that game over time. In today's highly competitive and fast-paced games industry, a game studio's ability to deliver exceptional and seamless customer experiences can be a strategic differentiator when it comes to cutting through the noise and winning a gamer's
[…] can help drive value through customer experience:

`1.` **Personalization:** Game studios can use data analytics and machine learning to personalize the game experience for each player based on their preferences and behavior. This can include personalized recommendations for content, in-game events, and other features that are tailored to the player's interests.

`2.` **Omnichannel support:** Players often use multiple channels, such as social media, forums, and in-game support, to communicate with game studios. Next generation customer experience involves providing a seamless and integrated support experience across all these channels in near-real time.

`3.` **Continuous improvement:** Game studios can use data and feedback from players to continuously improve the game, gathering feedback on new features and using it to refine and optimize the game over time.

In summary, defining what a next generation customer experience looks like for your game is important because it can help you create a more personalized, seamless, and enjoyable experience for your players, which can lead to increased engagement, monetization, and loyalty. There are many ways teams can use data throughout a game's development lifecycle, but far and away the most valuable focus area will be in building and refining the customer experience.

Throughout the rest of this guide, we will dig into the most common use cases for data, analytics, and AI in game development, starting with where we recommend everyone begins: game analytics.

# Getting Started with Gaming Use Cases

### Where do I start? Start with game analytics

**Overview**

Big question: Where's the best place to start when it comes to game data, analytics, and AI? For most game studios, the best place to start is with game analytics. Setting up a dashboard for your game analytics that helps you correlate data across disparate sources is infinitely valuable in a world where there is no one gaming data source to rule them all. An effective dashboard should include your game telemetry data, data from any game services you're running, and data sources outside of your game such as stores, marketplaces, and social media.
See below.

_[Diagram: Data Sources: Game Telemetry, Game Services, Other Sources]_

**What we're trying to solve/achieve**

Getting a strong foundation in game analytics unlocks more advanced data, analytics, and AI use cases. For example, concurrent player count plus store and marketplace data […] and lifetime value. Usage telemetry combined with crash reporting and social media listening helps you more quickly uncover where players might be getting frustrated. And correlating chat logs, voice transcriptions, and/or Discord and Reddit forums can help you identify disruptive behavior before it gets out of hand, giving you the tools to take actionable steps to mitigate toxicity within your community.

**Get started and set up your Analytics Dashboard**

### Understand your audience

With your analytics pipelines set up, the first area of focus is to better understand your audience. This can help you inform a variety of key business decisions, from the highest macro order of "what game(s) to develop", to how to market and monetize those games, and how to optimize the player experience.

By understanding the demographics, preferences, and behaviors of their audience, a game studio can create games that are more likely to appeal to their target market and be successful. You can also use this understanding to tailor your marketing and monetization strategies to the needs and preferences of your players.

Additionally, understanding your audience can help you identify potential pain points or areas for improvement within your games, allowing you to proactively make changes to address these issues and improve the player experience before a player potentially churns. […] that are relevant and engaging to your players, giving you tools to effectively market and monetize with your audience.

**Let's start with Player Segmentation.**

##### Player Segmentation

**Overview**

Player segmentation is the practice of dividing players into groups based on shared characteristics or behaviors. Segmentation has a number of benefits. You can better understand your players, create more personalized content, improve player retention, and optimize monetization, all of which contributes to an improved player experience.

**What we're trying to solve/achieve**

The primary objective of segmentation is to ensure you're not treating your entire playerbase the exact same. Humans are different, and your players have different motivations, preferences and behaviors. Recognizing this and engaging with them in a way that meets them where they're at is one of the most impactful ways you can cultivate engagement with your game. As we mentioned above, the benefits of segmentation are broad reaching. Through better understanding of your playerbase, you can better personalize experiences, tailoring content and customer experience to specific groups of players that increases engagement and satisfaction. Better understanding of your players also helps in improving player retention.
By identifying common characteristics of players who are at risk of churning (i.e., stopping play), you can develop targeted strategies that only reach specific audiences.

Create advanced customer segments to build out more effective user stories, and identify potential purchasing predictions based on behaviors. Leverage existing sales data, campaigns and promotions systems to create robust segments with actionable behavior insights to inform your product roadmap. You can then use this information to build useful customer clusters that are targetable with different promos and offers to drive more efficient acquisition and deeper engagement with existing players.
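One lightweight way to bootstrap segments is to cluster players on a few behavioral features. The sketch below applies k-means from scikit-learn to made-up columns; it illustrates the approach rather than a production pipeline, and the feature names and number of clusters are assumptions.

```python
import pandas as pd
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Hypothetical per-player behavioral features.
players = pd.DataFrame({
    "player_id": [1, 2, 3, 4, 5, 6],
    "sessions_per_week": [1, 14, 3, 20, 2, 12],
    "avg_session_minutes": [10, 55, 20, 70, 15, 40],
    "lifetime_spend_usd": [0.0, 39.99, 0.0, 120.0, 4.99, 59.99],
})

features = players[["sessions_per_week", "avg_session_minutes", "lifetime_spend_usd"]]
scaled = StandardScaler().fit_transform(features)  # keep features on comparable scales

# Three segments is arbitrary here; in practice you would tune k (e.g., elbow method).
players["segment"] = KMeans(n_clusters=3, n_init=10, random_state=42).fit_predict(scaled)

# Inspect the average behavior of each segment to give it a human-readable label.
print(players.groupby("segment")[features.columns].mean())
```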
**Get started with Player Segmentation**

##### Player Lifetime Value

**Overview**

Player lifetime value (LTV) is a measure of the value that a player brings to a game over the lifetime they play that game. It is typically calculated by multiplying the average revenue per user (ARPU) by the average player lifespan. For example, if the average player spends $50 per year and plays the game for 2 years, their LTV would be $50 * 2 = $100.
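The same arithmetic as a tiny helper, using the numbers from the example above:

```python
def lifetime_value(arpu_per_year: float, avg_lifespan_years: float) -> float:
    """LTV as described above: average revenue per user times average player lifespan."""
    return arpu_per_year * avg_lifespan_years

# The worked example from the text: $50 per year for 2 years.
print(lifetime_value(50.0, 2.0))  # 100.0
```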
**What we're trying to solve/achieve**

Game studios care about LTV because it helps them understand the long-term value of their players and make informed decisions about how to invest in player acquisition and retention. For example, if the LTV of a player is higher than the cost of acquiring them (e.g., through advertising), it may be worth investing more in player acquisition. On the other hand, if the LTV of a player is lower than the cost of acquiring them, it may be more cost-effective to focus on retaining existing players rather than acquiring new ones.

LTV is one of the more important metrics that game studios, particularly those building live service games, can use to understand the value of their players. It is important to consider other metrics as well, such as player retention, monetization, and engagement.

**Get started with Player Lifetime Value**

##### Social Media Monitoring

**Overview**

As the great Warren Buffett once said, "It takes 20 years to build a reputation and five minutes to ruin it. If you think about that, you'll do things differently." Now more than ever, people are able to use social media and instantly amplify their voices to thousands of people who share similar interests and hobbies. Take Reddit as an example. r/gaming, the largest video game community (also called a subreddit), has over 35 million members with nearly 500 new posts and 10,000 new comments per day, while over 120 game-specific subreddits have more than 10,000 members each, the largest being League of Legends with over 700,000 members. The discourse that takes place on online social platforms generates massive amounts of raw and organic data that can be used to understand how customers think and discover exactly what they want.

The act and process of monitoring content online across the internet and social media for keyword mentions and trends for downstream processing and analytics is called media monitoring. By applying media monitoring to social media platforms, game developers are able to gain new advantages that previously might not have been possible, including:

- Programmatically aggregate product ideas for new feature prioritization
- Promote a better user experience by automatically responding to positive or negative comments
- Understand the top influencers in the industry who can sway public opinion
- Monitor broader industry trends and emerging segments such as free-to-play games
- Detect and react to controversies or crises as they begin
- Get organic and unfiltered feedback of games and features
- Understand customer sentiment at scale
- Make changes faster to keep customer satisfaction high and prevent churn

By failing to monitor, understand, and act on what customers are saying about the games and content you release, as well as broader industry trends, you risk those customers leaving for a better experience that meets the demands and requirements of what customers want.

**What we're trying to solve/achieve**

By monitoring and listening to what existing and potential customers are saying on social media, game developers are able to get a natural and organic understanding of how customers actually feel about the games and products they release, or gauge consumer interest before investing time and money in a new idea. The main process for social media monitoring is to gather data from different social media platforms, such as Twitter or YouTube, process those comments or tweets, then take action on the processed data. While customer feedback can be manually discovered and processed in search of certain keyword mentions or feedback, it is a much better idea to automate it and do it programmatically.
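A toy sketch of that gather-process-act loop, using a hard-coded list of comments in place of a real platform API (which you would swap in per platform) and a simple keyword filter as the processing step:

```python
# Toy gather -> process -> act loop. In practice the "gather" step would call a
# platform API or ingest an export; here it is a hard-coded list for illustration.
comments = [
    "Love the new season, matchmaking feels way better!",
    "Crashes on startup since the last patch, please fix.",
    "The new skin prices are way too high.",
]

NEGATIVE_KEYWORDS = {"crash", "crashes", "bug", "fix", "too high", "refund"}

def needs_attention(comment: str) -> bool:
    text = comment.lower()
    return any(keyword in text for keyword in NEGATIVE_KEYWORDS)

for comment in comments:
    if needs_attention(comment):
        # "Act" step: route to the support or community team (here we just print).
        print("FLAG FOR FOLLOW-UP:", comment)
```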
**Get started with Social Media Monitoring**

##### Player Feedback Analysis

**Overview**

Player feedback analysis is the process of collecting, analyzing, and acting on player feedback to inform game development. It involves collecting player feedback from multiple sources, such as in-game surveys, customer support tickets, social media, marketplace reviews, and forums, and using data analytics tools to identify patterns, trends, and insights. The goal of player feedback analysis is to better understand player needs, preferences, and pain points, and use this information to inform game development decisions and improve the overall player experience.

Player feedback analysis is an important part of game development as it helps ensure that the game continues to meet player needs and expectations. By regularly collecting and analyzing player feedback, game studios can make data-driven decisions to improve the game, increase player engagement and retention, and ultimately drive success and growth.

For this use case, we're going to focus on taking online reviews for your video game and categorizing the different topics players are talking about (bucketing topics) in order to better understand the themes (via positive or negative sentiment) affecting your community.

**What we're trying to solve/achieve**

This is incredibly helpful, providing data-driven customer insight into your development process. Whether used in pre-production, such as looking at reviews of games that are similar to learn where those games have strengths and weaknesses, or using player feedback analysis with a live service title to identify themes that can apply to your product roadmap, player feedback analysis helps teams better support and cultivate engagement with the player community.

Ultimately, player feedback analysis does two things. 1) It can help you stack rank themes according to positive and negative sentiment, and 2) you can weight those themes according to impact on player engagement, toxicity, monetization, churn, and more. We've all read reviews that are overly positive, or overly negative. The process of player feedback analysis helps to normalize feedback across the community (keeping in mind, only for those who have written a review), so you're not over indexing on one review, or a single theme that may seem in the moment very pressing.
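A minimal sketch of the bucketing idea: assign each review to one or more topics by keyword match and tally sentiment per topic. A real implementation would use topic modeling or an LLM plus a sentiment model; the reviews, keywords, and labels below are made up for illustration.

```python
from collections import defaultdict

# Illustrative reviews with a pre-computed sentiment label (e.g., from a sentiment model).
reviews = [
    ("Matchmaking takes forever in ranked.", "negative"),
    ("Love the new map, performance is smooth.", "positive"),
    ("Server lag ruined my weekend games.", "negative"),
    ("Great story content in the latest update.", "positive"),
]

# Keyword-based topic buckets; a production system would learn these instead.
TOPIC_KEYWORDS = {
    "matchmaking": ["matchmaking", "ranked", "queue"],
    "performance": ["lag", "performance", "fps", "server"],
    "content": ["map", "story", "update", "content"],
}

topic_sentiment = defaultdict(lambda: {"positive": 0, "negative": 0})
for text, sentiment in reviews:
    lowered = text.lower()
    for topic, keywords in TOPIC_KEYWORDS.items():
        if any(keyword in lowered for keyword in keywords):
            topic_sentiment[topic][sentiment] += 1

# Stack rank themes by how much negative sentiment they attract.
for topic, counts in sorted(topic_sentiment.items(), key=lambda kv: -kv[1]["negative"]):
    print(topic, counts)
```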
e3f8982a30fe6e2560236fad0446093aare overly positive, or overly negative. The process of player\n", + "\n", + "feedback analysis helps to normalize feedback across the\n", + "\n", + "community (keeping in mind, only for those who have written\n", + "\n", + "a review), so you’re not over indexing on one review, or a\n", + "\n", + "\n", + "**HATCH** **HATCH**\n", + "**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\n", + "\n", + "\n", + "**HATCH** **HATCH**\n", + "**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\n", + "\n", + "**CAMPING** **CAMPING**\n", + "\n", + "\n", + "**FARMING** **FARMING**\n", + "\n", + "\n", + "**CAMPING** **CAMPING**\n", + "\n", + "\n", + "**BEING AWAY**\n", + "**FROM**\n", + "**KEYBOARD**\n", + "**(AFK)**\n", + "\n", + "\n", + "**CAMPING**\n", + "\n", + "**DRIBBLING** **TUNNELING**\n", + "\n", + "\n", + "**LOBBY**\n", + "**DODGING**\n", + "\n", + "**BODY**\n", + "**BLOCKING**\n", + "\n", + "**FACE**\n", + "**SLUGGING** **CAMPING**\n", + "\n", + "\n", + "**Killers**\n", + "\n", + "\n", + "single theme that may seem in the moment very pressing.\n", + "\n", + "In addition to the [personal toll](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) that toxic behavior can have\n", + "\n", + "\n", + "**Get started with Player Feedback Analysis**\n", + "\n", + "\n", + "on gamers and the community -- an issue that cannot be\n", + "\n", + "\n", + "-----\n", + "\n", + "game studios. For example, a study from [Michigan State](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity)\n", + "\n", + "\n", + "[University](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) revealed that 80% of players recently experienced\n", + "\n", + "toxicity, and of those, 20% reported leaving the game due to\n", + "\n", + "these interactions. Similarly, a study from [Tilburg University](https://arno.uvt.nl/show.cgi?fid=145375)\n", + "\n", + "showed that having a disruptive or toxic encounter in the first\n", + "\n", + "session of the game led to players being over three times\n", + "\n", + "more likely to leave the game without returning. Given that\n", + "\n", + "player retention is a top priority for many studios, particularly\n", + "\n", + "as game delivery transitions from physical media releases to\n", + "\n", + "long-lived services, it’s clear that toxicity must be curbed.\n", + "\n", + "Compounding this issue related to churn, some companies\n", + "\n", + "face challenges related to toxicity early in development,\n", + "\n", + "even before launch. For example, [Amazon’s Crucible](https://www.wired.com/story/amazon-crucible-release-first-big-videogame/) was\n", + "\n", + "released into testing without text or voice chat due in part\n", + "\n", + "to not having a system in place to monitor or manage toxic\n", + "\n", + "\n", + "In this section, we’re going to talk about how to use your data\n", + "\n", + "to more effectively find your target audience across the web.\n", + "\n", + "Whether you’re engaging in paid advertising, influencer or\n", + "\n", + "referral marketing, PR, cross promotion, community building,\n", + "\n", + "etc - use data to separate activity from impact. You want\n", + "\n", + "to focus on the channels and strategies that leverage your\n", + "\n", + "resources most effectively, be that time or money.\n", + "\n", + "Say you have a cohort of highly engaged players who are\n", + "\n", + "spending money on your title, and you want to find more\n", + "\n", + "gamers just like that. 
Doing an analysis on the demographic\n", + "\n", + "and behavioral data of this cohort will give you the\n", + "\n", + "information needed to use an ad platform (such as Meta,\n", + "\n", + "Google, or Unity) to do lookalike modeling and target those\n", + "\n", + "potential gamers for acquisition.\n", + "\n", + "\n", + "gamers and interactions. This illustrates that the scale of\n", + "\n", + "\n", + "the gaming space has far surpassed most teams’ ability to\n", + "\n", + "manage such behavior through reports or by intervening in\n", + "\n", + "disruptive interactions. Given this, it’s essential for studiosSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
7db6c2bd164244acf22349256c2cf9c3to more effectively find your target audience across the web.\n", + "\n", + "Whether you’re engaging in paid advertising, influencer or\n", + "\n", + "referral marketing, PR, cross promotion, community building,\n", + "\n", + "etc - use data to separate activity from impact. You want\n", + "\n", + "to focus on the channels and strategies that leverage your\n", + "\n", + "resources most effectively, be that time or money.\n", + "\n", + "Say you have a cohort of highly engaged players who are\n", + "\n", + "spending money on your title, and you want to find more\n", + "\n", + "gamers just like that. Doing an analysis on the demographic\n", + "\n", + "and behavioral data of this cohort will give you the\n", + "\n", + "information needed to use an ad platform (such as Meta,\n", + "\n", + "Google, or Unity) to do lookalike modeling and target those\n", + "\n", + "potential gamers for acquisition.\n", + "\n", + "\n", + "gamers and interactions. This illustrates that the scale of\n", + "\n", + "\n", + "the gaming space has far surpassed most teams’ ability to\n", + "\n", + "manage such behavior through reports or by intervening in\n", + "\n", + "disruptive interactions. Given this, it’s essential for studios\n", + "\n", + "to integrate analytics into games early in the development\n", + "\n", + "lifecycle and then design for the ongoing management of\n", + "\n", + "toxic interactions.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Toxicity in gaming is clearly a multifaceted issue that\n", + "\n", + "has become a part of video game culture and cannot be\n", + "\n", + "addressed universally in a single way. That said, addressing\n", + "\n", + "toxicity within in-game chat can have a huge impact given\n", + "\n", + "the frequency of toxic behavior and the ability to automate\n", + "\n", + "the detection of it using natural language processing (NLP). In\n", + "\n", + "summary, by leveraging machine learning to better identify\n", + "\n", + "disruptive behavior so that better-informed decisions\n", + "\n", + "around handling actions can be made.\n", + "\n", + "**Get started with Toxicity Detection**\n", + "\n", + "\n", + "##### Multi-Touch Attribution\n", + "\n", + "**Overview**\n", + "\n", + "Multi-touch attribution is a method of attributing credit to\n", + "\n", + "different marketing channels or touchpoints that contribute to\n", + "\n", + "a sale or conversion. In other words, it is a way of understanding\n", + "\n", + "how different marketing efforts influence a customer’s decision\n", + "\n", + "to make a purchase or take a desired action.\n", + "\n", + "There are a variety of different attribution models that can\n", + "\n", + "be used to assign credit to different touchpoints, each with\n", + "\n", + "its own strengths and limitations. For example, the last-\n", + "\n", + "click model attributes all credit to the last touchpoint that\n", + "\n", + "the customer interacted with before making a purchase,\n", + "\n", + "while the first-click model attributes all credit to the first\n", + "\n", + "touchpoint. 
Other models, such as the linear model or\n", + "\n", + "the time decay model, distribute credit across multiple\n", + "\n", + "touchpoints based on different algorithms.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Multi-touch attribution can be useful for game studios because\n", + "\n", + "it can help them understand which marketing channels or\n", + "\n", + "efforts are most effective at driving conversions and inform their\n", + "\n", + "marketing strategy. However, it is important to choose the right\n", + "\n", + "attribution model for your title based on your business model\n", + "\n", + "(one-time purchase, subscription, free-to-play, freemium,\n", + "\n", + "in-game advertising, etc.) and regularly review and optimize your\n", + "\n", + "attribution efforts to ensure they are accurate and effective.\n", + "\n", + "**Get started with Multi-Touch Attribution**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Activating Your Playerbase\n", + "\n", + "So far, we’ve discussed how to better understand your\n", + "\n", + "players, and how to acquire more of your target audience.\n", + "\n", + "Next, we’re going to dig into how to better activate your\n", + "\n", + "players to create a more engaged and loyal playerbase that\n", + "\n", + "stays with your game for the long-term. Here, we’re going to\n", + "\n", + "focus on strategies that differentiate your gamer experience.\n", + "\n", + "##### Player Recommendations\n", + "\n", + "\n", + "and make in-game purchases. Additionally, personalized\n", + "\n", + "recommendations can help improve the overall player\n", + "\n", + "experience and increase satisfaction.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
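As a rough, illustrative sketch of the attribution models described above under Multi-Touch Attribution (last-click, first-click, linear, and time decay), the snippet below assigns credit for a single hypothetical player journey. The channel names, the journey itself, and the seven-day half-life are made-up stand-ins for whatever your own marketing event data contains.

```python
from collections import defaultdict

# Hypothetical touchpoint journey for one converting player:
# (channel, days_before_conversion)
journey = [("social_ad", 14), ("influencer_video", 9), ("cross_promo", 3), ("search_ad", 1)]

def last_click(journey):
    # All credit goes to the final touchpoint before conversion.
    return {journey[-1][0]: 1.0}

def first_click(journey):
    # All credit goes to the first touchpoint.
    return {journey[0][0]: 1.0}

def linear(journey):
    # Credit is split evenly across every touchpoint.
    share = 1.0 / len(journey)
    credit = defaultdict(float)
    for channel, _ in journey:
        credit[channel] += share
    return dict(credit)

def time_decay(journey, half_life_days=7.0):
    # Touchpoints closer to the conversion receive exponentially more credit.
    weights = [(channel, 0.5 ** (days / half_life_days)) for channel, days in journey]
    total = sum(w for _, w in weights)
    credit = defaultdict(float)
    for channel, w in weights:
        credit[channel] += w / total
    return dict(credit)

for name, model in [("last-click", last_click), ("first-click", first_click),
                    ("linear", linear), ("time-decay", time_decay)]:
    print(name, model(journey))
```

In practice these per-conversion credits would be summed across every converting player and compared against spend per channel before drawing any conclusions about which marketing efforts are working.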
aabb362105ded42be4f95f2d29ae3027attribution model for your title based on your business model\n", + "\n", + "(one-time purchase, subscription, free-to-play, freemium,\n", + "\n", + "in-game advertising, etc.) and regularly review and optimize your\n", + "\n", + "attribution efforts to ensure they are accurate and effective.\n", + "\n", + "**Get started with Multi-Touch Attribution**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Activating Your Playerbase\n", + "\n", + "So far, we’ve discussed how to better understand your\n", + "\n", + "players, and how to acquire more of your target audience.\n", + "\n", + "Next, we’re going to dig into how to better activate your\n", + "\n", + "players to create a more engaged and loyal playerbase that\n", + "\n", + "stays with your game for the long-term. Here, we’re going to\n", + "\n", + "focus on strategies that differentiate your gamer experience.\n", + "\n", + "##### Player Recommendations\n", + "\n", + "\n", + "and make in-game purchases. Additionally, personalized\n", + "\n", + "recommendations can help improve the overall player\n", + "\n", + "experience and increase satisfaction.\n", + "\n", + "Game studios can use a variety of techniques to create player\n", + "\n", + "recommendations, such as machine learning algorithms,\n", + "\n", + "collaborative filtering, and manual curation. It is important\n", + "\n", + "to regularly review and optimize these recommendations to\n", + "\n", + "ensure that they are effective and relevant to players.\n", + "\n", + "**Get started with Player Recommendations**\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Player recommendations are suggestions for content or actions\n", + "\n", + "\n", + "that a game studio makes to individual players based on their\n", + "\n", + "interests and behaviors. These recommendations can be used\n", + "\n", + "to promote specific in-game items, encourage players to try\n", + "\n", + "new features, or simply provide a personalized experience.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Player recommendations matter to game studios because\n", + "\n", + "they can help improve player retention, engagement, and\n", + "\n", + "monetization. By providing players with recommendations\n", + "\n", + "that are relevant and engaging, studios can increase the\n", + "\n", + "likelihood that players will continue to play their games\n", + "\n", + "\n", + "##### Next Best Offer/Action\n", + "\n", + "**Overview**\n", + "\n", + "Next best offer (NBO) and next best action (NBA) are\n", + "\n", + "techniques that businesses use to make personalized\n", + "\n", + "recommendations to their customers. NBO refers to the\n", + "\n", + "practice of recommending the most relevant product or\n", + "\n", + "service to a customer based on their past purchases and\n", + "\n", + "behaviors. NBA refers to the practice of recommending the\n", + "\n", + "most relevant action or interaction to a customer based on\n", + "\n", + "the same information.\n", + "\n", + "\n", + "-----\n", + "\n", + "in-game purchase to a player based on their past spending\n", + "\n", + "habits and the items they have shown an interest in. 
They\n", + "\n", + "might use NBA to recommend a specific level or event to a\n", + "\n", + "player based on their progress and interests.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "It’s important to remember that next best offer is a specific\n", + "\n", + "use case within personalization that involves making\n", + "\n", + "recommendations to players on the most valuable in-game\n", + "\n", + "item or action they should take next. For example, a next\n", + "\n", + "best offer recommendation in a mobile game might suggest\n", + "\n", + "that a player purchase a specific in-game currency or unlock\n", + "\n", + "a new character.\n", + "\n", + "Both NBO and NBA can be used to improve customer\n", + "\n", + "retention, engagement, and monetization by providing\n", + "\n", + "personalized recommendations that are more likely to be\n", + "\n", + "relevant and appealing to individual customers. They can be\n", + "\n", + "implemented using a variety of techniques, such as machine\n", + "\n", + "learning algorithms or manual curation.\n", + "\n", + "**Get started with Next Best Offer/Action**\n", + "\n", + "##### Churn Prediction & Prevention\n", + "\n", + "**Overview**\n", + "\n", + "Video games live and die by their player base. For Games-\n", + "\n", + "\n", + "may overwhelm the ability of these players to consume,\n", + "\n", + "reinforcing the overall problem of player churn.\n", + "\n", + "At some point, it becomes critical for teams to take a cold,\n", + "\n", + "hard look at the cost of acquisition relative to the subscriberSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
228e006f7573d58e282d8049ca8f2351item or action they should take next. For example, a next\n", + "\n", + "best offer recommendation in a mobile game might suggest\n", + "\n", + "that a player purchase a specific in-game currency or unlock\n", + "\n", + "a new character.\n", + "\n", + "Both NBO and NBA can be used to improve customer\n", + "\n", + "retention, engagement, and monetization by providing\n", + "\n", + "personalized recommendations that are more likely to be\n", + "\n", + "relevant and appealing to individual customers. They can be\n", + "\n", + "implemented using a variety of techniques, such as machine\n", + "\n", + "learning algorithms or manual curation.\n", + "\n", + "**Get started with Next Best Offer/Action**\n", + "\n", + "##### Churn Prediction & Prevention\n", + "\n", + "**Overview**\n", + "\n", + "Video games live and die by their player base. For Games-\n", + "\n", + "\n", + "may overwhelm the ability of these players to consume,\n", + "\n", + "reinforcing the overall problem of player churn.\n", + "\n", + "At some point, it becomes critical for teams to take a cold,\n", + "\n", + "hard look at the cost of acquisition relative to the subscriber\n", + "\n", + "lifetime value (LTV) earned. These figures need to be brought\n", + "\n", + "into a healthy balance, and retention needs to be actively\n", + "\n", + "managed, not as a point-in-time problem to be solved, but\n", + "\n", + "as a “chronic condition” which needs to be managed for the\n", + "\n", + "ongoing health of the title.\n", + "\n", + "Headroom for continued acquisition-driven growth can\n", + "\n", + "be created by carefully examining why some players leave\n", + "\n", + "and some players stay. When centered on factors known\n", + "\n", + "at the time of acquisition, gaming studios may have the\n", + "\n", + "opportunity to rethink key aspects of their acquisition\n", + "\n", + "strategy that promote higher average retention rates, which\n", + "\n", + "can lead to higher average revenue per user.\n", + "\n", + "**Prerequisites for use case**\n", + "\n", + "This use case assumes a certain level of existing data\n", + "\n", + "collection infrastructure in the studio. Notably, a studio ready\n", + "\n", + "to implement a churn prediction and prevention model\n", + "\n", + "should have\n", + "\n", + "- A cloud environment where player data is stored\n", + "\n", + "- This source data should contain player behavior and\n", + "\n", + "session telemetry events from within the game. This is\n", + "\n", + "the foundation that insights can be built on top of.\n", + "\n", + "\n", + "as-a-Service (GaaS) titles, engagement is the most\n", + "\n", + "\n", + "important metric a team can measure. Naturally, proactively\n", + "\n", + "preventing churn is critical to sustained engagement and\n", + "\n", + "\n", + "**Get started with Churn Prediction & Prevention**\n", + "\n", + "\n", + "growth. Through churn prediction and prevention, you will\n", + "\n", + "\n", + "be able to analyze behavioral data to identify subscribers\n", + "\n", + "with an increased risk of churn. Next, you will use machine\n", + "\n", + "learning to quantify the likelihood of a subscriber to churn, as\n", + "\n", + "well as indicate which factors create that risk.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Balancing customer acquisition and retention is critical.\n", + "\n", + "This is the central challenge to the long-term success of\n", + "\n", + "any live service game. 
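As a minimal sketch of the churn-scoring step described in the overview above, the snippet below trains a simple classifier on per-player behavioral features and surfaces both a churn-risk score and the factors the model leans on. The `player_features.parquet` file, the feature names, and the `churned_within_30d` label are hypothetical placeholders for tables you would derive from your own behavioral and session telemetry.

```python
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.model_selection import train_test_split

# Hypothetical per-player features derived from behavioral/session telemetry.
df = pd.read_parquet("player_features.parquet")  # assumed to exist in your environment
features = ["sessions_last_7d", "avg_session_minutes", "days_since_last_session",
            "friends_online_ratio", "purchases_last_30d"]

X_train, X_test, y_train, y_test = train_test_split(
    df[features], df["churned_within_30d"], test_size=0.2, random_state=42)

model = GradientBoostingClassifier().fit(X_train, y_train)

# Likelihood of churn per player, plus which features drive the model overall.
churn_risk = pd.Series(model.predict_proba(X_test)[:, 1],
                       index=X_test.index, name="churn_risk")
importances = pd.Series(model.feature_importances_,
                        index=features).sort_values(ascending=False)
print(importances)
```

Feature importances only give a coarse first read on which factors create risk; a fuller treatment would validate the model and look at per-player explanations before acting on the scores.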
This is particularly challenging in that successful customer acquisition strategies needed to get games to scale tend to be followed by service disruptions or declines in quality and customer experience, accelerating player abandonment. To replenish lost subscribers, the acquisition engine continues to grind and expenses mount. As games reach for customers beyond the core playerbase they may have initially targeted, the title may not resonate

##### Real-time Ad Targeting

**Overview**

Real-time ad targeting in the context of game development focuses on using data to deliver personalized and relevant advertisements to players in near real-time, while they are playing a game. Real-time targeting is performance based, using highly personalized messaging which is achieved by using data to precisely determine the most opportune moments to display ads, based on factors such as player behavior, game state, and other contextual information. Knowing when to send those ads is based on data. This use case is specific to titles using in-game advertising as a business model. It’s important to note that in-game real-time ad targeting requires a sophisticated tech stack, with

-----

with the bigger ad ecosystem, ad networks and partners. The Databricks Lakehouse platform is an optimal foundation as it already contains many of the connectors required to enable this use case.

**What we’re trying to solve/achieve**

The goal of in-game real-time ad targeting is to provide a more immersive and relevant advertising experience for players, while also increasing the effectiveness of the ads for advertisers. By delivering targeted ads that are relevant to each player’s interests, game developers can create a more enjoyable and personalized gaming experience, which can help to reduce churn and increase the lifetime value of each player. Additionally, real-time ad targeting can also help game developers monetize their games more effectively, as advertisers are willing to pay a premium for hyper-targeted and engaged audiences.

**Get started with Real-time Ad Targeting**

### Operational use cases

In the game development industry, operational analytics are essential for ensuring a smooth and efficient production process.
One common use case is anomaly detection, where data analytics is utilized to identify any unusual patterns or behaviors in the game, such as crashes or performance issues. This helps developers quickly identify and fix problems, improving the overall quality of the game. Another example is build pipelines, where data analytics can be used to monitor and optimize the process of creating new builds of the game. By tracking key metrics such as build time, error rates, and resource utilization, developers can make informed decisions about how to optimize the build process for maximum efficiency. Other operational use cases in game development include tracking player behavior, measuring server performance, and analyzing sales and marketing data. Let’s explore a few of these below.

**Overview**

Anomaly detection plays an important role in the operation of a live service video game by helping to identify and diagnose unexpected behaviors in real-time. By identifying patterns and anomalies in player behavior, system performance, and network traffic, this information can then be used to detect and diagnose server crashes, performance bottlenecks, and hacking attempts. The ability to understand if there will be an issue before it becomes widespread is immensely valuable. Without anomaly detection, which is a form of advanced analytics, you’re always in a reactive (rather than proactive) state. Anomaly detection is a type of quality of service solution.

**What we’re trying to solve/achieve**

The goal of anomaly detection is to ensure that players have a stable and enjoyable gaming experience. This has an impact across your game, from reducing downtime, to minimizing player churn, and improving your game’s reputation and revenue. Additionally, the insights gained from anomaly detection can also be used to mitigate cheating and disruptive behavior.

**Get started with Anomaly Detection**

##### Build Pipeline

**Overview**

A build pipeline is a set of automated processes that are used to compile and assemble the code, assets, and resources that make up a game project. The build pipeline typically includes several stages, such as code compilation, optimization, testing, and release. The purpose of a build pipeline is to streamline the game development process and ensure that each stage of development is completed efficiently and effectively. A build pipeline can be configured to run automatically, so that new builds are generated whenever changes are made to the code or assets. This helps to ensure that the game is always up-to-date and ready for testing and release. The logs are collected in near-real time from build servers. A simplified example: Dev X is committing code on title Y, submitted on day Z, along with the log files from the pipeline and build server. Builds typically take multiple hours to complete, requiring significant amounts of compute via build farms. Being able to

-----

are wasting compute, and being able to predict which builds will fail as they go through the pipeline are ways to curb operational expenses.

**What we’re trying to solve/achieve**

With this use case, we’re seeking to reduce wasted compute and build a foundational view of what was developed, by whom, when, and how testing performed.
In an ideal state, our\n", + "\n", + "automated build pipeline could send a notification to the\n", + "\n", + "developer with a confidence metric on the build making it\n", + "\n", + "through, allowing them to decide whether to continue or\n", + "\n", + "move another build through the pipeline. Often, developers\n", + "\n", + "do not have clear visibility until the build has completed\n", + "\n", + "or failed. By providing more insight to devs into the build\n", + "\n", + "pipeline process, we can increase the rate at which builds\n", + "\n", + "are completed efficiently and effectively.\n", + "\n", + "**Get started with Build Pipeline**\n", + "\n", + "##### Crash Analytics\n", + "\n", + "\n", + "resources were being used. How long crash testing takes\n", + "\n", + "can vary, depending on the game’s business model, amount\n", + "\n", + "of content, and scope. For a title with a one-time release,\n", + "\n", + "where there is a large amount of content and a complex\n", + "\n", + "storyline, the chances of hidden crashes causing errors while\n", + "\n", + "in development are high, making it require more time to\n", + "\n", + "perform testing before the game can be published. For titles\n", + "\n", + "built in a game-as-a-service model, i.e. a game shipped in\n", + "\n", + "cycles of constant iteration, crash detection should be doneSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
0d75377e5be40a4a0c15fdaea8622460through, allowing them to decide whether to continue or\n", + "\n", + "move another build through the pipeline. Often, developers\n", + "\n", + "do not have clear visibility until the build has completed\n", + "\n", + "or failed. By providing more insight to devs into the build\n", + "\n", + "pipeline process, we can increase the rate at which builds\n", + "\n", + "are completed efficiently and effectively.\n", + "\n", + "**Get started with Build Pipeline**\n", + "\n", + "##### Crash Analytics\n", + "\n", + "\n", + "resources were being used. How long crash testing takes\n", + "\n", + "can vary, depending on the game’s business model, amount\n", + "\n", + "of content, and scope. For a title with a one-time release,\n", + "\n", + "where there is a large amount of content and a complex\n", + "\n", + "storyline, the chances of hidden crashes causing errors while\n", + "\n", + "in development are high, making it require more time to\n", + "\n", + "perform testing before the game can be published. For titles\n", + "\n", + "built in a game-as-a-service model, i.e. a game shipped in\n", + "\n", + "cycles of constant iteration, crash detection should be done\n", + "\n", + "continuously, since errors in newly released content might\n", + "\n", + "affect the base game and lead to crashes.\n", + "\n", + "Increasingly, titles are being released in alpha (where\n", + "\n", + "developers do the testing), closed beta (which includes a\n", + "\n", + "limited group of testers/sample-users who do the gameplay\n", + "\n", + "testing) and open betas (where anyone interested can register\n", + "\n", + "to try the game). All of which happens before the game is\n", + "\n", + "“officially” released. Regardless of alpha, beta, or GA, players\n", + "\n", + "may stumble over game crashes, which triggers crash reports\n", + "\n", + "that are sent to the developers for fixing. But sometimes, it\n", + "\n", + "can be challenging to understand the issue that caused the\n", + "\n", + "crash from crash reports provided by your game’s platform.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Ultimately, the purpose of crash analytics is to identify the\n", + "\n", + "root cause of a crash, and help you take steps to prevent\n", + "\n", + "similar crashes from happening in the future. This feedback\n", + "\n", + "loop can be tightened through automation in the data\n", + "\n", + "pipeline. For example, by tracking crashes caused on builds\n", + "\n", + "from committers, the data can provide build suggestions\n", + "\n", + "to improve crash rate. Furthermore, teams can automate\n", + "\n", + "deduplication when multiple players experience the same\n", + "\n", + "errors, helping to reduce noise in the alerts received.\n", + "\n", + "**Get started with Crash Analytics**\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Games crash, it is a fact of game development. The\n", + "\n", + "combination of drivers, hardware, software, and\n", + "\n", + "configurations create unique challenges in tracking, resolving\n", + "\n", + "and managing the user experience.\n", + "\n", + "Crash analytics and reporting is the process of collecting\n", + "\n", + "information about crashes or unexpected failures in a\n", + "\n", + "software application, in this case, a video game. 
A crash\n", + "\n", + "report typically includes information about the state of the\n", + "\n", + "game at the time of the crash, such as what the player was\n", + "\n", + "\n", + "-----\n", + "\n", + "# Things to look forward to\n", + "\n", + "\n", + "This eBook was created to help game developers better\n", + "\n", + "wrap their heads around the general concepts in which data,\n", + "\n", + "analytics, and AI can be used to support the development\n", + "\n", + "and growth of video games. **If you only have 5 minutes,**\n", + "\n", + "**these takeaways are critical to your success** .\n", + "\n", + "For more information on advanced data, analytics, and AI use\n", + "\n", + "cases, as well as education resources, we highly recommend\n", + "\n", + "Databricks training portal [dbricks.co/training](http://dbricks.co/training) .\n", + "\n", + "**Top takeaways:**\n", + "\n", + "If you take nothing else from this guide, here are the most\n", + "\n", + "important takeaways we want to leave with you on your journey.\n", + "\n", + "`1.` **Data is fundamental. Data, analytics, and AI play a role**\n", + "\n", + "throughout the entire game development lifecycle - from\n", + "\n", + "discovery to pre-production, development to operating\n", + "\n", + "a game as a live service. Build better games, cultivateSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
9036a5784356944e4eb45e43a0f91ec0wrap their heads around the general concepts in which data,\n", + "\n", + "analytics, and AI can be used to support the development\n", + "\n", + "and growth of video games. **If you only have 5 minutes,**\n", + "\n", + "**these takeaways are critical to your success** .\n", + "\n", + "For more information on advanced data, analytics, and AI use\n", + "\n", + "cases, as well as education resources, we highly recommend\n", + "\n", + "Databricks training portal [dbricks.co/training](http://dbricks.co/training) .\n", + "\n", + "**Top takeaways:**\n", + "\n", + "If you take nothing else from this guide, here are the most\n", + "\n", + "important takeaways we want to leave with you on your journey.\n", + "\n", + "`1.` **Data is fundamental. Data, analytics, and AI play a role**\n", + "\n", + "throughout the entire game development lifecycle - from\n", + "\n", + "discovery to pre-production, development to operating\n", + "\n", + "a game as a live service. Build better games, cultivate\n", + "\n", + "deeper player engagements, and operate more effectively\n", + "\n", + "\n", + "by utilizing the full potential of your data.\n", + "\n", + "`2.` **Define your goals.** Start by establishing the goals of what\n", + "\n", + "you’re hoping to learn and or understand around your\n", + "\n", + "game. Clear goals make it easier to identify key metrics\n", + "\n", + "to track, example goals include; developing high-quality\n", + "\n", + "games that provide engaging and satisfying player\n", + "\n", + "experiences, increasing player engagement and retention\n", + "\n", + "by analyzing and improving gameplay and mechanics, and\n", + "\n", + "building a strong and positive brand reputation through\n", + "\n", + "effective marketing and community outreach.\n", + "\n", + "`3.` **Identify and understand your data sources.** Spend time\n", + "\n", + "to identify and understand the breadth of data sources\n", + "\n", + "you are already collecting, be that game telemetry,\n", + "\n", + "marketplace, game services, or sources beyond the game\n", + "\n", + "like social media. It is critical to collect the right data, and\n", + "\n", + "track the right metrics based on the goals and objectives\n", + "\n", + "you have set for your game.\n", + "\n", + "`4.` **Start small, and iterate quickly.** Recognize that goals and\n", + "\n", + "objectives evolve as you learn more about the interaction\n", + "\n", + "\n", + "-----\n", + "\n", + "are most effective when scoped small with tight feedback\n", + "\n", + "loops, allowing you to quickly adapt with your community\n", + "\n", + "and alongside shifting market conditions.\n", + "\n", + "`5.` **Game analytics forms the foundation.** Start by getting a\n", + "\n", + "game analytics dashboard up and running. The process of\n", + "\n", + "building out a dashboard will naturally require connecting\n", + "\n", + "and transforming your data in a way to unlock more\n", + "\n", + "advanced use cases down the road.\n", + "\n", + "`6.` **Plan and revisit your data strategy frequently.** Once\n", + "\n", + "dashboarding is set up, you’ll have a better picture of what\n", + "\n", + "downstream data use cases make the most sense for\n", + "\n", + "your game and business objectives. 
As you move to use\n", + "\n", + "cases such as player segmentation, churn analysis, and\n", + "\n", + "player lifetime value, revisit your data strategy frequently\n", + "\n", + "to ensure you’re spending time on use cases that drive\n", + "\n", + "actionable insights for you and your team.\n", + "\n", + "`7.` **Show value broad and wide.** Whether your data strategy\n", + "\n", + "is new or well established on the team, build the habit\n", + "\n", + "of communicating broadly to stakeholders across the\n", + "\n", + "company. Early in the process, it is important to gather\n", + "\n", + "critical feedback on what data is helpful and where there\n", + "\n", + "are opportunities for improvement. The worst thing that\n", + "\n", + "can happen is you create something that no one uses.\n", + "\n", + "That is a waste of everyone’s time and money.\n", + "\n", + "`8.` **Ask for help.** Engage with your technical partners. There\n", + "\n", + "are humans who can help ensure you’re developing your\n", + "\n", + "data and analytics platform in a way that is efficient and\n", + "\n", + "effective. There are numerous partners with domain\n", + "\n", + "expertise in data science and data engineering that can\n", + "\n", + "accelerate your data journey - here is our recommendedSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
c9711ea58e3c015192f3db3f146bafabplayer lifetime value, revisit your data strategy frequently\n", + "\n", + "to ensure you’re spending time on use cases that drive\n", + "\n", + "actionable insights for you and your team.\n", + "\n", + "`7.` **Show value broad and wide.** Whether your data strategy\n", + "\n", + "is new or well established on the team, build the habit\n", + "\n", + "of communicating broadly to stakeholders across the\n", + "\n", + "company. Early in the process, it is important to gather\n", + "\n", + "critical feedback on what data is helpful and where there\n", + "\n", + "are opportunities for improvement. The worst thing that\n", + "\n", + "can happen is you create something that no one uses.\n", + "\n", + "That is a waste of everyone’s time and money.\n", + "\n", + "`8.` **Ask for help.** Engage with your technical partners. There\n", + "\n", + "are humans who can help ensure you’re developing your\n", + "\n", + "data and analytics platform in a way that is efficient and\n", + "\n", + "effective. There are numerous partners with domain\n", + "\n", + "expertise in data science and data engineering that can\n", + "\n", + "accelerate your data journey - here is our recommended\n", + "\n", + "partner list for [data, analytics, and AI workloads](https://www.databricks.com/company/partners/consulting-and-si) .\n", + "\n", + "`9.` **Participate in the community.** The community for game\n", + "\n", + "analytics is large and growing. It is important to research and\n", + "\n", + "\n", + "your needs and interests. Here are a few of our favorites:\n", + "\n", + "`a.` [IGDA Game Analytics](https://igda.org/sigs/analytics/) : The IGDA has a number of\n", + "\n", + "Special Interest Groups that bring together user\n", + "\n", + "researchers, designers, data engineers and data\n", + "\n", + "scientists focused on understanding player behavior\n", + "\n", + "and experiences. They offer resources and events\n", + "\n", + "for those working in games user research, including a\n", + "\n", + "yearly Games User Research Summit.\n", + "\n", + "`b.` [Data Science Society](https://www.datasciencesociety.net/) : The Data Science Society is a\n", + "\n", + "global community of data scientists and engineers.\n", + "\n", + "While not specifically focused on game development,\n", + "\n", + "they offer a wealth of resources and opportunities for\n", + "\n", + "learning, networking, and collaboration in the field of\n", + "\n", + "data science.\n", + "\n", + "`c.` [Hugging Face](https://huggingface.co/) : is hub of open source models for Natural\n", + "\n", + "Language Processing, computer vision, and other fields\n", + "\n", + "where AI plays its role. They also provide an online\n", + "\n", + "platform where users can access pre-trained models\n", + "\n", + "and tools, share their own models and datasets, and\n", + "\n", + "collaborate with other developers in the community.\n", + "\n", + "`d.` [Data Engineering subreddit](https://www.reddit.com/r/dataengineering/) : The Data Engineering\n", + "\n", + "subreddit is a forum for data engineers to discuss\n", + "\n", + "topics related to building and managing data pipelines,\n", + "\n", + "data warehousing, and related technologies. While\n", + "\n", + "not specifically focused on game development, it\n", + "\n", + "can be a valuable resource for those working on data\n", + "\n", + "engineering in the gaming industry.\n", + "\n", + "`10. 
\u0007` **Go beyond dashboards.** Looking at dashboards is only the\n", + "\n", + "first step in your data journey. Imagine how the output of\n", + "\n", + "your data can be presented in a way to help stakeholders\n", + "\n", + "across your company achieve more. For example, dropping\n", + "\n", + "data into an application that can help game designers\n", + "\n", + "make balancing decisions based on player events.\n", + "\n", + "\n", + "-----\n", + "\n", + "# APPENDIX Ultimate class build guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "The heart and soul of mature data teams are formed by this\n", + "\n", + "trio of classes. There are many aspects to these roles, but\n", + "\n", + "they can be summarized in that Data Engineers create and\n", + "\n", + "maintain critical data workflows, Data Analysts interpret data\n", + "\n", + "and create reports that keep the business teams running\n", + "\n", + "seamlessly, and Data Scientists are responsible for making\n", + "\n", + "sense of large amounts of data. Depending on the size of\n", + "\n", + "the organization, individuals may be required to multiclassSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
1ce1d861d15136fd48438be91479e567engineering in the gaming industry.\n", + "\n", + "`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n", + "\n", + "first step in your data journey. Imagine how the output of\n", + "\n", + "your data can be presented in a way to help stakeholders\n", + "\n", + "across your company achieve more. For example, dropping\n", + "\n", + "data into an application that can help game designers\n", + "\n", + "make balancing decisions based on player events.\n", + "\n", + "\n", + "-----\n", + "\n", + "# APPENDIX Ultimate class build guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "The heart and soul of mature data teams are formed by this\n", + "\n", + "trio of classes. There are many aspects to these roles, but\n", + "\n", + "they can be summarized in that Data Engineers create and\n", + "\n", + "maintain critical data workflows, Data Analysts interpret data\n", + "\n", + "and create reports that keep the business teams running\n", + "\n", + "seamlessly, and Data Scientists are responsible for making\n", + "\n", + "sense of large amounts of data. Depending on the size of\n", + "\n", + "the organization, individuals may be required to multiclass\n", + "\n", + "in order to address needs of the team. In smaller studios, it’s\n", + "\n", + "often developers who wear multiple hats, including those in\n", + "\n", + "data engineering, analytics and data science.\n", + "\n", + "Whether you’re looking to stand-up an analytics dashboard\n", + "\n", + "to report on the health of a title or building a recommendation\n", + "\n", + "engine for your players, this guide will help you better\n", + "\n", + "understand the unique classes required to develop and\n", + "\n", + "maintain an effective data, analytics, and AI platform.\n", + "\n", + "##### Data Engineers\n", + "\n", + "\n", + "**Goals and Priorities of Data Engineers**\n", + "\n", + "- Enable access to usable data for real-time insights — data\n", + "\n", + "that both enables timely decision-making and is accurate\n", + "\n", + "and reproducible\n", + "\n", + "- Increase user confidence and trust in data. This involves\n", + "\n", + "ensuring high consistency and reliability in ETL processes\n", + "\n", + "- Limit the issues and failures experienced by other\n", + "\n", + "engineers and data scientists, allowing those roles to\n", + "\n", + "focus less on troubleshooting and more on drawing\n", + "\n", + "meaningful conclusions from data and building new\n", + "\n", + "products / features\n", + "\n", + "**What Data Engineers care about:**\n", + "\n", + "- Enabling access to data for real-time insights — data that\n", + "\n", + "both enables timely decision-making and is accurate and\n", + "\n", + "reproducible\n", + "\n", + "- Building high-performance, reliable and scalable pipelines\n", + "\n", + "for data processing\n", + "\n", + "- Delivering data for consumption from a variety of sources\n", + "\n", + "by Data Analysts and Data Scientists against tight SLAs\n", + "\n", + "- A Data Engineer’s biggest challenge? Collaboration\n", + "\n", + "across teams\n", + "\n", + "\n", + "Data engineers build systems that collect, manage, and\n", + "\n", + "\n", + "convert source data into usable information for data\n", + "\n", + "scientists and business analysts to interpret. 
Their ultimate\n", + "\n", + "goal is to make data accessible so that teams can use it to\n", + "\n", + "evaluate and optimize a goal or objective.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Data Engineers are responsible for data migration,\n", + "\n", + "manipulation, and integration of data (joining dissimilar\n", + "\n", + "data systems)\n", + "\n", + "- Setup and maintenance of ETL pipelines to convert\n", + "\n", + "source data into actionable data for insights. It is the\n", + "\n", + "responsibility of the data engineer to make sure these\n", + "\n", + "pipelines run efficiently and are well orchestrated.\n", + "\n", + "- The Data Engineer sets up the workflow process\n", + "\n", + "to orchestrate pipelines for the studio’s data and\n", + "\n", + "continuously validates it\n", + "\n", + "- Managing workflows to enable data scientists and data\n", + "\n", + "analysts, and ensuring workflows are well-integrated with\n", + "\n", + "different parts of the studio (e.g., marketing, test/QA, etc)\n", + "\n", + "\n", + "##### Data Scientists\n", + "\n", + "Data scientists determine the questions their team should\n", + "\n", + "be asking and figure out how to answer those questions\n", + "\n", + "using data. They often develop predictive models for\n", + "\n", + "theorizing and forecasting.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Responsible for making sense of the large amounts of data\n", + "\n", + "collected for a given game title, such as game telemetry,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
daee446a9e3d5402fc1e2ae7ee387d8d- Setup and maintenance of ETL pipelines to convert\n", + "\n", + "source data into actionable data for insights. It is the\n", + "\n", + "responsibility of the data engineer to make sure these\n", + "\n", + "pipelines run efficiently and are well orchestrated.\n", + "\n", + "- The Data Engineer sets up the workflow process\n", + "\n", + "to orchestrate pipelines for the studio’s data and\n", + "\n", + "continuously validates it\n", + "\n", + "- Managing workflows to enable data scientists and data\n", + "\n", + "analysts, and ensuring workflows are well-integrated with\n", + "\n", + "different parts of the studio (e.g., marketing, test/QA, etc)\n", + "\n", + "\n", + "##### Data Scientists\n", + "\n", + "Data scientists determine the questions their team should\n", + "\n", + "be asking and figure out how to answer those questions\n", + "\n", + "using data. They often develop predictive models for\n", + "\n", + "theorizing and forecasting.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Responsible for making sense of the large amounts of data\n", + "\n", + "collected for a given game title, such as game telemetry,\n", + "\n", + "business KPIs, game health and quality, and sources\n", + "\n", + "beyond the game such as social media listening\n", + "\n", + "- The analytics portion of a Data Scientist’s job means\n", + "\n", + "looking at new and existing data to try and discover new\n", + "\n", + "things within it\n", + "\n", + "- The engineering component may include writing out\n", + "\n", + "pipeline code and deploying it to a repository\n", + "\n", + "- Data Scientists are responding for building, maintaining, and\n", + "\n", + "monitoring models used for analytics and/or data products\n", + "\n", + "\n", + "-----\n", + "\n", + "**Goals and Priorities:**\n", + "\n", + "- Developing new business capabilities (such as behavioral\n", + "\n", + "segmentation, churn prediction, recommendations) and\n", + "\n", + "optimizing processes around those capabilities\n", + "\n", + "- Increase ROI by building algorithms and tools that are\n", + "\n", + "maintainable and reusable\n", + "\n", + "- Exploring (or further expanding) the use of machine\n", + "\n", + "learning models for specific use cases\n", + "\n", + "- Bridges the gap between engineering and analytics,\n", + "\n", + "between the technology teams and business teams\n", + "\n", + "- Provides business side of studio with data that is crucial\n", + "\n", + "in decision-making, for example a churn model that helps\n", + "\n", + "predict the impact of a new feature set\n", + "\n", + "**What Data Scientists care about:**\n", + "\n", + "- Creating exploratory analysis or models to accurately\n", + "\n", + "predict business metrics, e.g., customer spend, churn,\n", + "\n", + "etc., and provide data-driven recommendations\n", + "\n", + "- Enable team with actionable insights that are easy to\n", + "\n", + "understand and well curated\n", + "\n", + "- Create and move models from experimentation to\n", + "\n", + "production\n", + "\n", + "- A Data Scientist’s biggest challenge? 
Keeping up with\n", + "\n", + "advancements and innovation in data science, and\n", + "\n", + "knowing which tools and libraries to use\n", + "\n", + "##### Data Analysts\n", + "\n", + "A data analyst reviews data to identify key insights into a\n", + "\n", + "game studio’s customers and ways the data can be used to\n", + "\n", + "solve problems.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Often serves as the go-to point of contact for non-\n", + "\n", + "\n", + "\n", + "- Analysts often interpret data and create reports or other\n", + "\n", + "documentation for studio leadership\n", + "\n", + "- Analysts typically are responsible for mining and\n", + "\n", + "compiling data\n", + "\n", + "- Streamline and or simplify processes when possible\n", + "\n", + "**Goals and Priorities:**\n", + "\n", + "- Empower stakeholder and business teams with\n", + "\n", + "actionable data\n", + "\n", + "- “Catch things before they break”. Proactively mitigate\n", + "\n", + "potential data issues before they occur (for internal and\n", + "\n", + "external customers)\n", + "\n", + "- Analysts are often recruited to assist other teams (i.e., BI\n", + "\n", + "teams) with their domain knowledge\n", + "\n", + "- Driving business impact through documentation and\n", + "\n", + "reliable data\n", + "\n", + "**What Data Analysts care about:**\n", + "\n", + "- Easy access to high quality data.\n", + "\n", + "- Quickly find insights from data with SQL queries and\n", + "\n", + "interactive visualizations.\n", + "\n", + "- The ability to easily share insights and while creatingSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
- Often serves as the go-to point of contact for non-technical business / operations colleagues for data access / analysis questions
- Analysts often interpret data and create reports or other documentation for studio leadership
- Analysts typically are responsible for mining and compiling data
- Streamline and/or simplify processes when possible

**Goals and Priorities:**

- Empower stakeholders and business teams with actionable data
- "Catch things before they break." Proactively mitigate potential data issues before they occur (for internal and external customers)
- Analysts are often recruited to assist other teams (i.e., BI teams) with their domain knowledge
- Driving business impact through documentation and reliable data

**What Data Analysts care about:**

- Easy access to high quality data.
- Quickly find insights from data with SQL queries and interactive visualizations.
- The ability to easily share insights while creating impactful assets for others to consume (dashboards, reports).
- A Data Analyst's biggest challenge? Working with complex processes and complicated technologies that are filled with messy data. While fighting these challenges, Analysts are often left alone or forced through paths that prevent collaboration with others across the team/organization.
- Untrustworthy data: often Analysts get asked to provide answers to leadership that will leverage the data to determine the direction of the company. When the data is untrustworthy or incorrect due to the previously mentioned challenges, this can eventually lead to a lack of trust in the data teams from leadership or the business.

# Data access and the major cloud providers

### Cloud Rosetta Stone

[AWS / Azure / GCP Service Comparison - Click Here](https://cloud.google.com/free/docs/aws-azure-gcp-service-comparison)

If you are newer to the cloud computing space, it is easy to get lost among the hundreds of different services offered by the three major cloud providers. The table below is meant to highlight the important data, analytics, and AI services used by the hyperscale service providers Amazon, Microsoft, and Google. In addition, it aims to pair up services from different cloud providers that serve the same purpose.

### Getting started with the major cloud providers

Here are some quick ways to get started with the three major cloud providers: AWS, Azure, and GCP.

**AWS:**

1. **[Create an AWS account](https://portal.aws.amazon.com/billing/signup):** The first step is to create an account on the AWS website. This will give you access to the AWS Management Console, which is the web-based interface for managing your AWS resources.
2. **Use the AWS free tier:** AWS offers a free tier of service that provides a limited amount of free resources each month. This is a great way to get started and try out various AWS services without incurring any charges.
3. **Explore the AWS Management Console:** Once you have an account and are logged in, take some time to explore the AWS Management Console and familiarize yourself with the various services that are available.
4. **Next you can search for Databricks:** In the AWS Management Console, use the search bar in the top-left corner of the page and search for "Databricks".
5. **Navigate to the Databricks page:** Once you have found the Databricks page, you can access it to get started with the Databricks service.
6. **Launch Databricks Workspace:** To launch the Databricks Workspace on AWS, you can use the CloudFormation template provided by Databricks. The Databricks CloudFormation template creates an IAM role, security group, and Databricks Workspace in your AWS account.

**Azure:**

1. **[Create an Azure account](https://azure.microsoft.com/en-us/free/gaming/):** The first step is to create an account on the Azure portal. This will give you access to the Azure portal, which is the web-based interface for managing your Azure resources.

|Service Type|Service Description|AWS Service|Azure Service|GCP Service|
|---|---|---|---|---|
|Storage|Object storage for various file types and artifacts (CSV, JSON, Delta, JAR). Objects can be retrieved by other services|Amazon Simple Storage Service (S3)|Azure Blob Storage|Google Cloud Storage|
|Compute|High-performance VMs to run applications. Platform where data transformations are run in Big Data apps.|Amazon Elastic Compute Cloud (EC2)|Azure Virtual Machines|Google Compute Engine|
|Messaging|Real-time event streaming services to write data to object stores or data warehouses. One OSS version is Kafka|Amazon Kinesis|Azure Service Bus Messaging|Google Pub/Sub|
|Data Warehouse|Traditional data storage layer for structured data, to then be used by data analysts. Often used to read from a Data Lake, which acts as a single source of truth|Redshift or Databricks|Synapse or Databricks|BigQuery or Databricks|
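The table pairs up equivalent services; in practice, the main difference a data team sees day to day is the storage path scheme. A minimal sketch, assuming a Databricks notebook where `spark` is predefined; the bucket and container names are hypothetical placeholders:

```python
# Minimal sketch: the same Spark DataFrame API reads from any of the three
# object stores in the table above; only the path scheme changes.
raw_events_aws = spark.read.json("s3://my-game-telemetry/raw/events/")                                   # AWS S3
raw_events_azure = spark.read.json("abfss://telemetry@mygamestore.dfs.core.windows.net/raw/events/")     # Azure ADLS
raw_events_gcp = spark.read.json("gs://my-game-telemetry/raw/events/")                                   # Google Cloud Storage

# Land the raw data in a Delta table so downstream analytics, SQL, and ML
# all read from the same single source of truth.
raw_events_aws.write.format("delta").mode("append").saveAsTable("game_analytics.raw_events")
```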
**Jargon Glossary**

|Term|Definition|
|---|---|
|CDP|Customer Data Platform (CDP). A CDP is a piece of software that combines data from multiple tools to create a single centralized customer database containing data on all touch points and interactions with your product or service.|
|ETL|Extract, Transform, Load. In computing, extract, transform, load is a three-phase process where data is extracted, transformed and loaded into an output data container. The data can be collated from one or more sources and it can also be output to one or more destinations.|
|KPI|Key Performance Indicator, a quantifiable measure of performance over time for a specific objective. KPIs provide targets for teams to shoot for, milestones to gauge progress, and insights that help people across the organization make better decisions.|
|POC|Proof of Concept (PoC). A proof of concept is a prototype or initial implementation of a solution that is developed to demonstrate the feasibility of a concept or idea. It is often used to test the effectiveness of a new tool or approach to data analysis or machine learning before investing in a full-scale implementation.|
|MVP|Minimum Viable Product (MVP). An MVP refers to the smallest possible solution that can be delivered to meet a specific business need. The goal of an MVP is to quickly validate assumptions and prove the potential value of a larger project. By delivering a smaller solution first, stakeholders can gain confidence in the project and see a return on investment sooner, while also providing feedback to improve the larger project.|
|ROI|Return on investment (ROI), which is calculated by dividing the profit earned on an investment by the cost of that investment.|
|Serverless computing|Using compute platforms that are completely managed by service providers. When using serverless computing, you simply execute queries or deploy applications and the service provider (AWS, Databricks, etc.) handles necessary server maintenance.|
|VPC|Virtual Private Cloud. A VPC is a virtual cloud networking environment, which helps organize and give you control of your resources. You also define how resources within your VPC can communicate with other regions, VPCs, and the public internet with traffic rules and security groups.|

2. **Take Azure tutorials:** Azure provides tutorials, documentation, and sample templates to help you get started. These resources can help you understand the basics of Azure and how to use its services.
3. **You can search for Databricks:** In the Azure portal, use the search bar at the top of the page and search for "Databricks".
4. **Navigate to the Databricks page:** Once you have found the Databricks page, you can access it to get started with the Databricks service.
5. **Create a new Databricks workspace:** To create a new Databricks workspace, you can use the Azure portal, Azure CLI or Azure PowerShell. Once created, you'll be able to access your Databricks Workspace through the Azure portal.
6. **Other Azure Services:** Once you have a Databricks workspace set up, you can easily connect it to other Azure services such as Azure Storage, Event Hubs, Azure Data Lake Storage, Azure SQL and Cosmos DB, for example.

**GCP:**

1. **[Create a GCP account](https://console.cloud.google.com/freetrial):** The first step is to create an account on the GCP portal. This will give you access to the GCP Console, which is the web-based interface for managing your GCP resources.
2. **Explore the GCP Console:** Once you have an account and are logged in, take some time to explore the GCP Console and familiarize yourself with the various services that are available.
3. **Search for Databricks:** In the GCP Console, use the search bar in the top-left corner of the page and search for "Databricks".
4. **Navigate to the Databricks page:** Once you have found the Databricks page, you can access it to get started with the Databricks service.
5. **Create a new Databricks workspace:** To create a new Databricks workspace, you can use the GCP Console or the gcloud command-line tool. Once created, you'll be able to access your Databricks Workspace through the GCP Console.
# Detailed Use Cases

### Getting started with game analytics

Fortunately, standing up an effective analytics dashboard is getting easier. It all starts with getting your data into an architecture that sets your team up for success. Selecting any of the major cloud providers — [AWS](https://portal.aws.amazon.com/billing/signup), [Azure](https://azure.microsoft.com/en-us/free/gaming/), [GCP](https://console.cloud.google.com/freetrial) — you can land all your data into a cloud data lake, then use the Databricks Lakehouse architecture to run real-time and reliable processing. Databricks can then help you visualize that data in a dashboard, or send it to a visual analytics platform, such as Tableau.

1. **Sign up for a Databricks account:** You'll need to create an account on the Databricks website in order to use the platform.
2. **Access the Databricks portal:** Interact with the Databricks platform and run tasks such as creating clusters, running jobs, and accessing data.
3. **Set up a development environment:** You'll need a development environment where you can write and test your code, whether you're using a local IDE or the Databricks Workspace.
4. **Collect data:** Once you have your development environment set up, you can start collecting data from your game. This can involve integrating or building an SDK into your game code, or using another tool to send data to cloud storage.
5. **Process and analyze the data:** Once you have collected your data, you can use Databricks to process and analyze it. This can involve cleaning and transforming the data, running queries or machine learning algorithms, or creating visualizations (a minimal sketch follows this section).
6. **Monitor and optimize:** Regularly monitor your analytics to ensure that they are accurate and relevant, and use the insights you gain to optimize your game.

Keep in mind that these are just general steps to get started with Databricks for game analytics. The specific steps you'll need to take will depend on your specific use case and needs. If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://databricks.com/company/contact) to us.
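As referenced in step 5, here is a minimal sketch of a first analysis pass, assuming game telemetry has already been landed in a Delta table; the `game_analytics.raw_events` name and its columns are hypothetical:

```python
# Minimal sketch (hypothetical table and column names): compute daily active
# users (DAU) from raw game telemetry stored in a Delta table.
from pyspark.sql import functions as F

events = spark.table("game_analytics.raw_events")  # assumed columns: player_id, event_type, event_ts

dau = (
    events
    .withColumn("event_date", F.to_date("event_ts"))
    .groupBy("event_date")
    .agg(F.countDistinct("player_id").alias("daily_active_users"))
    .orderBy("event_date")
)

# Persist for dashboards built with Databricks SQL or an external BI tool.
dau.write.format("delta").mode("overwrite").saveAsTable("game_analytics.daily_active_users")
```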
**Tips / Best Practices**

- **Define your goals:** What do you want to learn from your analytics data? Having clear goals will help you focus on collecting the right data and making meaningful use of it.
- **Plan your data collection:** Determine what data you need to collect, how you will collect it, and how you will store it.
- **Consider privacy:** Make sure you are transparent with your players about what data you are collecting and how you will use it, and give them the option to opt out if they wish.
- **Use analytics to inform design:** Leverage your analytics data to inform decisions around game design, such as any balance changes or new content targeting a specific audience.
- **Monitor and test your analytics implementation:** Regularly check your analytics to ensure that data is being collected correctly, and conduct tests to validate the accuracy of your data.
- **Visualize your data:** Dashboarding your data is one of the most effective ways to quickly and effectively make sense of what's happening at a given moment in time.
- **Use data to improve player retention:** Analyze player behavior and use the insights you gain to improve player retention, such as by identifying and addressing pain points or by providing personalized content.
- **Collaborate with your team:** Share your analytics findings with your team and encourage them to use the data to inform their work.
- **Keep it simple:** Don't try to collect too much data or create overly complex analytics systems. Keep it simple and focused on your goals.
- **Start where you are:** If you've yet to gather all of your data, don't go build some fancy model. Start with the data you have available to you and build from there.

### Getting started with Player Segmentation

Player segmentation is crucial to studios as it allows them to better understand their audience and tailor their game experience to meet their specific needs and preferences. By dividing players into different segments based on factors such as demographics, playing styles, and in-game behavior, studios can gain valuable insights into what motivates and engages their players. This information can then be used to design games that not only provide a more enjoyable experience for players, but also drive player retention and increase revenue for the studio. In a competitive industry where player satisfaction is key to success, player segmentation is an essential tool for studios to stay ahead of the game.

Start by evaluating segmentation goals such as:

- **Personalize the experience:** Changing or creating experience designs specific to the player.
- **Create relevant content:** Surface the best content to players based on features and behaviors that will matter the most depending on the player's place in the game's life cycle.
- **Monetization:** Create tailored monetization strategies that effectively reach and convert each player group. For example, you may have a group of highly engaged players who are more likely to make in-app purchases, while another group is less likely to spend money but may be more receptive to advertisements.

The next steps would be to identify, collect and analyze player data.
By gathering information on player behavior, preferences, and demographics, you can gain insights into their motivations, pain points, and what drives their engagement with your game.

There are multiple types of player data to collect, including:

- **Player Behavior:** Track player behavior and actions within your game to gain insights into their play style, preferences, and patterns.
- **Surveys:** Ask players directly about their preferences, motivations, and feedback through in-game surveys, email questionnaires, or other forms of direct communication.
- **Focus groups:** Gather a small group of players to discuss and provide feedback on specific aspects of your game and player experience.
- **Social media listening:** Monitor social media platforms to gather insights into how players are engaging with and talking about your game.

**[Customer Segmentation solution accelerator](https://www.databricks.com/solutions/accelerators/customer-segmentation)**

**Tips / Best Practices**

- **Define your segmentation goals:** Determine what you want to learn about your players and why. This will help you focus your analysis and ensure that your segments are meaningful and actionable.
- **Use meaningful criteria:** Choose criteria that are relevant to your goals and that differentiate players in meaningful ways. This could include demographic information, in-game behavior, spending habits, or a combination of factors.
- **Analyze player data:** Use data from your players to inform your segmentation strategy. This could include data on in-game behavior, spending habits, or demographic information.
- **Use multiple methods:** We recommend using a combination of methods, such as clustering, to create segments that are statistically meaningful and actionable for your game (see the sketch after this list).
- **Validate your segments:** Test your segments to ensure that they accurately reflect the differences you observed in your player data. This could involve comparing the segments to each other, or validating the segments against external data sources.
- **Consider ethical and privacy concerns:** Ensure that your segmentation strategy is ethical and complies with privacy laws and regulations. This could involve anonymizing your player data, obtaining consent from players, or other measures to protect player privacy.
- **Monitor and refine your segments:** Regularly review your segments to ensure that they remain relevant and meaningful. Refine your segments as necessary to reflect changes in your player data or your goals.
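As referenced in the "use multiple methods" tip, here is a minimal sketch of behavioral clustering with Spark ML K-Means; the feature table and column names are hypothetical, and the Customer Segmentation solution accelerator covers a fuller workflow:

```python
# Minimal sketch (hypothetical feature table): cluster players into behavioral
# segments with K-Means from Spark ML.
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.clustering import KMeans
from pyspark.ml import Pipeline

players = spark.table("game_analytics.player_features")  # e.g., sessions_per_week, avg_session_minutes, total_spend

assembler = VectorAssembler(
    inputCols=["sessions_per_week", "avg_session_minutes", "total_spend"],
    outputCol="features_raw",
)
scaler = StandardScaler(inputCol="features_raw", outputCol="features")
kmeans = KMeans(featuresCol="features", predictionCol="segment", k=4, seed=42)

model = Pipeline(stages=[assembler, scaler, kmeans]).fit(players)
segmented = model.transform(players).select("player_id", "segment")
segmented.write.format("delta").mode("overwrite").saveAsTable("game_analytics.player_segments")
```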
### Getting Started with Player Lifetime Value

Assuming you've followed the steps to collecting, storing, and preparing your player data for analysis: to calculate player lifetime value (LTV), the quick and dirty way of assessing overall player LTV is to divide the total revenue by the total number of registered players. Note, LTV is a critical calculation for return on investment, which is player lifetime spend versus the amount spent on player acquisition. Ideally, you want lifetime spend to be equal to or more than the cost of acquisition.

As long as your game and its community are currently active, any player lifetime value calculations should be considered models, not exact numbers. This is because many of the players you're considering are likely actively registered and actively playing, so the exact player LTV number is a moving target.

*(Figure: LTV estimation approaches ordered by accuracy — historical averages and benchmarks, simple predictive models, advanced predictive models.)*

But these simple models are not entirely accurate, since they don't take into account the players who are registered but have yet to generate any revenue. Instead, a data-driven approach pivoted around player segmentation or cohorts will generally yield more actionable insight, far more than calculating a single LTV for the entire player base.

You can define your game's cohorts in multiple ways. Perhaps the most obvious in terms of calculating LTV is going by daily active cohorts, or users who joined your game on the same day. You could also organize cohorts by users who joined your game through a certain ad campaign or promotional effort, by country or geographic location, or by the type of device used.
Use the\n", + "\n", + "insights you gain to optimize your LTV calculations.\n", + "\n", + "**Be aware of outside factors:** Your calculations should\n", + "\n", + "consider the many outside factors that can affect your\n", + "\n", + "LTV, such as the virality of your game, any spikes or surge\n", + "\n", + "in visitors due to unexpected promotions (influencers,\n", + "\n", + "reviewers talking about your game), any significant changes\n", + "\n", + "to your game that users respond well to, and other organic\n", + "\n", + "lifts that are difficult to predict with existing data.\n", + "\n", + "\n", + "The first calculation is relatively simple. We suggest using\n", + "\n", + "average revenue per user (ARPU), which is a game’s daily\n", + "\n", + "revenue divided by the number of active users, to help you\n", + "\n", + "calculate lifetime value. First, you’ll need to define what is\n", + "\n", + "an active player using retention values; which can be set to\n", + "\n", + "a week, multi-day, or multi-week period of time depending\n", + "\n", + "on how your game has performed to date. You can then look\n", + "\n", + "at the number of users who churn on a given day, averaging\n", + "\n", + "with the number of days from the player’s first visit to the\n", + "\n", + "current date (or the specific date you’ve considered the end\n", + "\n", + "for said exercise). This is your playerbase lifetime value (note\n", + "\n", + "not Player Lifetime Value). To get Lifetime Value, divide daily\n", + "\n", + "revenue by the number of daily active users, and multiply\n", + "\n", + "that by the Lifetime Value to get your player LTV.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
d3bae8db08e6aad1fcd5f5954ef6c4f9lifts that are difficult to predict with existing data.\n", + "\n", + "\n", + "The first calculation is relatively simple. We suggest using\n", + "\n", + "average revenue per user (ARPU), which is a game’s daily\n", + "\n", + "revenue divided by the number of active users, to help you\n", + "\n", + "calculate lifetime value. First, you’ll need to define what is\n", + "\n", + "an active player using retention values; which can be set to\n", + "\n", + "a week, multi-day, or multi-week period of time depending\n", + "\n", + "on how your game has performed to date. You can then look\n", + "\n", + "at the number of users who churn on a given day, averaging\n", + "\n", + "with the number of days from the player’s first visit to the\n", + "\n", + "current date (or the specific date you’ve considered the end\n", + "\n", + "for said exercise). This is your playerbase lifetime value (note\n", + "\n", + "not Player Lifetime Value). To get Lifetime Value, divide daily\n", + "\n", + "revenue by the number of daily active users, and multiply\n", + "\n", + "that by the Lifetime Value to get your player LTV.\n", + "\n", + "It’s important to note that while calculating player lifetime\n", + "\n", + "value, the term is not entirely accurate since most player\n", + "\n", + "lifetimes are not over (particularly true for live service\n", + "\n", + "games). But for the purpose of modeling, we recommend\n", + "\n", + "keeping the amount of time that you consider a lifetime\n", + "\n", + "relatively short, allowing you to extrapolate. Keeping the time\n", + "\n", + "period shorter helps mitigate inaccuracies, specifically, the\n", + "\n", + "longer you stretch out what you consider a lifetime the more\n", + "\n", + "likely you are to collect inactive users in your count.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Social Media Monitoring\n", + "\n", + "Social media monitoring has three primary components:\n", + "\n", + "collecting the data, processing the results, and taking action\n", + "\n", + "on the findings. When it comes to collecting the data, whether\n", + "\n", + "you’re looking for tweets, YouTube comments, or Reddit\n", + "\n", + "posts, it can be very easy to get started since many social\n", + "\n", + "media platforms such as Twitter, YouTube, and Reddit all\n", + "\n", + "provide their own detailed and comprehensive APIs making it\n", + "\n", + "easy to start gathering data from those platforms with proper\n", + "\n", + "documentation and code examples to help along the way.\n", + "\n", + "Once the data has been collected, the next step is to process\n", + "\n", + "it and prepare it to be used in the next step. Processing your\n", + "\n", + "data can range in complexity from a simple keywords filter\n", + "\n", + "or more complicated approach such as filtering by location,\n", + "\n", + "removing emojis, and censoring and substituting words. With\n", + "\n", + "the data collected and processed, it can move to the final\n", + "\n", + "stage and be analyzed for downstream use and actionable\n", + "\n", + "insights by applying sentiment analysis or text mining.\n", + "\n", + "If a game studio is looking to save time and have the above\n", + "\n", + "steps performed for them, it may be appealing to buy a\n", + "\n", + "pre-built tool. 
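A minimal sketch of the ARPU-based estimate described above, assuming hypothetical revenue and activity tables; it illustrates the arithmetic only, not the full Lifetime Value solution accelerator:

```python
# Minimal sketch (hypothetical tables/columns): average observed lifetime per
# daily join cohort multiplied by ARPU on the cohort's join date.
from pyspark.sql import functions as F

revenue = spark.table("game_analytics.daily_revenue")    # columns: event_date, revenue
activity = spark.table("game_analytics.daily_activity")  # columns: event_date, player_id, first_seen_date, last_seen_date

# ARPU = daily revenue / daily active users
dau = activity.groupBy("event_date").agg(F.countDistinct("player_id").alias("dau"))
arpu = revenue.join(dau, "event_date").withColumn("arpu", F.col("revenue") / F.col("dau"))

# Average observed lifetime (in days) per daily join cohort
lifetime = (
    activity.groupBy("first_seen_date")
    .agg(F.avg(F.datediff("last_seen_date", "first_seen_date")).alias("avg_lifetime_days"))
)

# Rough LTV estimate per cohort: ARPU x average lifetime
ltv = (
    lifetime.join(arpu, lifetime.first_seen_date == arpu.event_date)
    .withColumn("estimated_ltv", F.col("arpu") * F.col("avg_lifetime_days"))
    .select("first_seen_date", "estimated_ltv")
)
```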
### Getting Started with Social Media Monitoring

Social media monitoring has three primary components: collecting the data, processing the results, and taking action on the findings. When it comes to collecting the data, whether you're looking for tweets, YouTube comments, or Reddit posts, it can be very easy to get started, since many social media platforms such as Twitter, YouTube, and Reddit provide their own detailed and comprehensive APIs, with proper documentation and code examples to help along the way.

Once the data has been collected, the next step is to process it and prepare it for use. Processing your data can range in complexity from a simple keyword filter to a more complicated approach such as filtering by location, removing emojis, and censoring and substituting words. With the data collected and processed, it can move to the final stage and be analyzed for downstream use and actionable insights by applying sentiment analysis or text mining.

If a game studio is looking to save time and have the above steps performed for them, it may be appealing to buy a pre-built tool. The primary benefit of buying an off-the-shelf solution is that it is often faster and easier to get started with, and the development of the tool is handled by a third party with experience in building media monitoring solutions. On the other hand, building your own custom solution will provide more flexibility and control. Many pre-built media monitoring tools might not have the capabilities required to effectively process video, audio, and image data, and may not let you control the frequency with which data is processed, whether near real-time or batch. Additionally, pre-built solutions tend to take a generalist approach to NLP, whether it be keyword extraction, topic filtering, or sentiment analysis, which often leads to poor results and feedback, especially for an industry as unique as gaming, where industry-specific slang or terminology is frequently used. Overall, building your own media monitoring tool will provide greater control and flexibility, leading to a better tailored return on investment, and luckily Databricks makes it even easier to get started.

With the Databricks Lakehouse platform, all data engineering, data science, machine learning, and data analytics can be done in a single place without having to stitch multiple systems and tools together.

Data engineers can use Workflows and Jobs to call social media platform APIs on a scheduled basis and use Delta Live Tables to create declarative data pipelines for cleaning and processing the data that comes in. Data scientists can use tools such as ML-specific Databricks runtimes (DBRs) that come with many of the most popular and common libraries already installed, MLflow, which makes model development, tracking, and serving easy and efficient, and various other tools such as AutoML and Bamboolib. Data analysts are able to create real-time alerts, dashboards, and visualizations using Databricks SQL. Each of the three personas will be able to effectively collaborate with each other and integrate each piece of their work into the broader data architecture.

If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://databricks.com/company/contact) to us.
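A minimal Delta Live Tables sketch of the "clean and process incoming data" step, assuming a scheduled job has already landed raw posts as JSON files in a Volume; the path, keywords, and schema are illustrative assumptions:

```python
# Minimal Delta Live Tables sketch: ingest raw social posts dropped into cloud
# storage by a scheduled API job, then keep only posts that mention our titles.
import dlt
from pyspark.sql import functions as F

GAME_KEYWORDS = ["mygame", "my game studio"]  # illustrative placeholders

@dlt.table(comment="Raw social media posts landed by a scheduled ingestion job")
def social_posts_raw():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/Volumes/main/social/raw_posts/")  # hypothetical volume path
    )

@dlt.table(comment="Posts mentioning our titles, ready for sentiment analysis")
def social_posts_filtered():
    pattern = "(?i)" + "|".join(GAME_KEYWORDS)
    return (
        dlt.read_stream("social_posts_raw")
        .withColumn("text_clean", F.lower(F.col("text")))
        .filter(F.col("text_clean").rlike(pattern))
    )
```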
It is also useful to\n", + "\n", + "monitor mentions of key company personnel such as\n", + "\n", + "executives or public facing employees.\n", + "\n", + "- While follower count does matter on platforms such as\n", + "\n", + "Twitter, don’t ignore users with low-follower counts. It only\n", + "\n", + "takes one or two re-tweets from other users to become a\n", + "\n", + "large issue.\n", + "\n", + "- On social media, customers can see through generic\n", + "\n", + "corporate responses to complaints, so it is important\n", + "\n", + "to get a clear understanding of the issue and provide a\n", + "\n", + "clear response.\n", + "\n", + "### Getting Started with Player Feedback Analysis\n", + "\n", + "The easiest place to start is gathering your data. With\n", + "\n", + "accounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n", + "\n", + "Nintendo (or whatever platform you’re using), identify the ID\n", + "\n", + "for your game(s), and pull the reviews corresponding to that\n", + "\n", + "game into Databricks through an API call.\n", + "\n", + "\n", + "From here, you clean the data using some of the pre-\n", + "\n", + "processing available in Python that removes any emojis and\n", + "\n", + "ASCII characters. Once complete, run through Spark NLPSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
8a4c09960b51191088f561ef1575b018- While follower count does matter on platforms such as\n", + "\n", + "Twitter, don’t ignore users with low-follower counts. It only\n", + "\n", + "takes one or two re-tweets from other users to become a\n", + "\n", + "large issue.\n", + "\n", + "- On social media, customers can see through generic\n", + "\n", + "corporate responses to complaints, so it is important\n", + "\n", + "to get a clear understanding of the issue and provide a\n", + "\n", + "clear response.\n", + "\n", + "### Getting Started with Player Feedback Analysis\n", + "\n", + "The easiest place to start is gathering your data. With\n", + "\n", + "accounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n", + "\n", + "Nintendo (or whatever platform you’re using), identify the ID\n", + "\n", + "for your game(s), and pull the reviews corresponding to that\n", + "\n", + "game into Databricks through an API call.\n", + "\n", + "\n", + "From here, you clean the data using some of the pre-\n", + "\n", + "processing available in Python that removes any emojis and\n", + "\n", + "ASCII characters. Once complete, run through Spark NLP\n", + "\n", + "pipeline which does the basic natural language processing\n", + "\n", + "steps such as normalization, stemming, lemmatization. We\n", + "\n", + "recommend running through pre-trained models, such as Word\n", + "\n", + "Embeddings and Named Entity Recognition models from John\n", + "\n", + "Snow Labs. This should complete the pipeline and generates\n", + "\n", + "the aspects for the reviews provided by the community.\n", + "\n", + "This data is then loaded into a Delta table for further analysis,\n", + "\n", + "such as using a visual dashboard (built on SQL queries inside\n", + "\n", + "Databricks) to analyze and understand the aspects the\n", + "\n", + "community is talking about, which can then be shared back\n", + "\n", + "with the development team for analysis and action. This is a\n", + "\n", + "great exercise to run once per month.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Check for word groupings:** Make sure your word groupings\n", + "\n", + "are accurate to improve the analysis. For example, if your\n", + "\n", + "game is called Football Manager, and the shorthand is FM,\n", + "\n", + "make sure both of those are grouped appropriately.\n", + "\n", + "- **Leverage domain knowledge:** Clean the reviews based\n", + "\n", + "on your domain knowledge. There are generic steps one\n", + "\n", + "could take, but that will not be as effective as someone\n", + "\n", + "with domain, and specific game knowledge of your title.\n", + "\n", + "- **Experiment with models:** Feel free to try multiple pre-\n", + "\n", + "trained models, and or tweak the pre-trained models\n", + "\n", + "based on your understanding of the domain to improve\n", + "\n", + "the accuracy of your results.\n", + "\n", + "- **Work one title at a time:** This process works best when\n", + "\n", + "pulling reviews for a single title, specifically one version of\n", + "\n", + "one title at a time.\n", + "\n", + "- **Let the model to the heavy lift, but use humans to double-**\n", + "\n", + "**check:** The sentiment corresponding to the aspects in the\n", + "\n", + "model will be labeled as Positive or Negative. 
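A minimal sketch of the Spark NLP preprocessing described above, assuming the pulled reviews sit in a hypothetical `game_analytics.player_reviews` table; the model choices are illustrative, not the exact pipeline from the guide:

```python
# Minimal Spark NLP sketch: document assembly, tokenization, normalization,
# and lemmatization with a pre-trained John Snow Labs lemmatizer.
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel
from pyspark.ml import Pipeline

reviews = spark.table("game_analytics.player_reviews")  # assumed column: review_text

document = DocumentAssembler().setInputCol("review_text").setOutputCol("document")
tokenizer = Tokenizer().setInputCols(["document"]).setOutputCol("token")
normalizer = Normalizer().setInputCols(["token"]).setOutputCol("normalized")
lemmatizer = LemmatizerModel.pretrained().setInputCols(["normalized"]).setOutputCol("lemma")

nlp_pipeline = Pipeline(stages=[document, tokenizer, normalizer, lemmatizer])
processed = nlp_pipeline.fit(reviews).transform(reviews)

# Persist the processed text for aspect/sentiment analysis and dashboarding.
processed.select("review_text", "lemma.result").write.format("delta").mode("overwrite") \
    .saveAsTable("game_analytics.review_aspects_input")
```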
In the case\n", + "\n", + "of a neutral review, the model will do its best to determine\n", + "\n", + "whether that is more positive or negative. A best practice\n", + "\n", + "is to spend time going back through the aspects early to\n", + "\n", + "determine model accuracy and make updates accordingly.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Toxicity Detection\n", + "\n", + "Our recommendation on tackling the toxicity issue is\n", + "\n", + "to leverage cloud-agnostic and flexible tooling that can\n", + "\n", + "consume chat data from a variety of sources, such as chat\n", + "\n", + "logs, voice transcriptions, or sources like discord and reddit\n", + "\n", + "forums. No matter if the data is in log form from game\n", + "\n", + "servers or events from a message system, Databricks can\n", + "\n", + "provide quick and easy ways to ingest the data.\n", + "\n", + "Leveraging a simplified architecture like the diagramSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
66941af18e24e0318ab56ffb753bbeab**check:** The sentiment corresponding to the aspects in the\n", + "\n", + "model will be labeled as Positive or Negative. In the case\n", + "\n", + "of a neutral review, the model will do its best to determine\n", + "\n", + "whether that is more positive or negative. A best practice\n", + "\n", + "is to spend time going back through the aspects early to\n", + "\n", + "determine model accuracy and make updates accordingly.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Toxicity Detection\n", + "\n", + "Our recommendation on tackling the toxicity issue is\n", + "\n", + "to leverage cloud-agnostic and flexible tooling that can\n", + "\n", + "consume chat data from a variety of sources, such as chat\n", + "\n", + "logs, voice transcriptions, or sources like discord and reddit\n", + "\n", + "forums. No matter if the data is in log form from game\n", + "\n", + "servers or events from a message system, Databricks can\n", + "\n", + "provide quick and easy ways to ingest the data.\n", + "\n", + "Leveraging a simplified architecture like the diagram\n", + "\n", + "above shows no matter the source, getting chat data for\n", + "\n", + "inferencing and model development can be as simple. While\n", + "\n", + "we leveraged a pre-built model from John Snow Labs to\n", + "\n", + "accelerate development, you can bring the ML framework of\n", + "\n", + "your choice to the platform.\n", + "\n", + "**[Gaming Toxicity solution accelerator](https://notebooks.databricks.com/notebooks/CME/Toxicity_Detection_in_Gaming/index.html)**\n", + "\n", + "\n", + "**Tips / Best Practices - things to consider**\n", + "\n", + "- **Define what toxic and disruptive behavior looks**\n", + "\n", + "**like within your community:** Clearly define what you\n", + "\n", + "consider to be toxic behavior, as this will determine how\n", + "\n", + "you measure and detect it. This might include things like\n", + "\n", + "hateful language, harassment, or cheating.\n", + "\n", + "- **Collect relevant data:** Make sure you are collecting the\n", + "\n", + "right data to help you detect toxicity. This might include\n", + "\n", + "data on in-game chat, player reports, and other sources.\n", + "\n", + "- **Use machine learning:** Use machine learning algorithms\n", + "\n", + "to analyze your data and identify patterns of toxic\n", + "\n", + "behavior. This will allow you to more accurately detect\n", + "\n", + "toxicity and prioritize cases for review.\n", + "\n", + "- **Test and optimize:** Regularly review and test your toxicity\n", + "\n", + "detection systems to ensure they are accurate and\n", + "\n", + "effective. Use experimentation methods such as A/B\n", + "\n", + "testing to see how different strategies impact toxicity rates.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with your\n", + "\n", + "players about how you are detecting toxicity, and give\n", + "\n", + "them the option to opt out if they wish.\n", + "\n", + "- **Take action:** When toxic behavior is detected, take\n", + "\n", + "appropriate action to address it. The health and wellness\n", + "\n", + "of your community depends on it. This might involve\n", + "\n", + "banning players, issuing warnings, or taking other\n", + "\n", + "disciplinary measures.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Multi-Touch Attribution and Media Mix Modeling\n", + "\n", + "To get started with multi-touch attribution, you need to first\n", + "\n", + "select an attribution model. 
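A minimal ingestion sketch for the "quick and easy ways to ingest the data" point, using Auto Loader to stream hypothetical chat-log JSON into a bronze Delta table that a toxicity model can score downstream; the paths and schema are assumptions:

```python
# Minimal sketch: continuously load raw in-game chat logs with Auto Loader so a
# toxicity classifier (e.g., a pre-trained model of your choice) can score them later.
from pyspark.sql import functions as F

chat_stream = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/main/toxicity/_schema/")  # hypothetical
    .load("/Volumes/main/toxicity/raw_chat_logs/")                           # hypothetical
    .select("match_id", "player_id", "message", F.col("sent_at").cast("timestamp"))
)

(
    chat_stream.writeStream
    .option("checkpointLocation", "/Volumes/main/toxicity/_checkpoints/bronze/")
    .trigger(availableNow=True)
    .toTable("toxicity.chat_messages_bronze")  # scored by the model in a later step
)
```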
There are a variety of different\n", + "\n", + "attribution models to choose from, each with its own\n", + "\n", + "\n", + "attribution credit according to your chosen model (above).\n", + "\n", + "We highly recommend you regularly review and test your\n", + "\n", + "attribution efforts to ensure they are accurate and effective.\n", + "\n", + "Use experimentation methods such as A/B testing to see\n", + "\n", + "how different strategies impact conversion rates.\n", + "\n", + "**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**\n", + "\n", + "\n", + "strengths and limitations.\n", + "\n", + "\n", + "`1.` **Last-click model:** This model attributes all credit to the\n", + "\n", + "last touchpoint that the customer interacted with before\n", + "\n", + "making a purchase or taking a desired action.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
5cc40222e7110f4588869738b6e75f20To get started with multi-touch attribution, you need to first\n", + "\n", + "select an attribution model. There are a variety of different\n", + "\n", + "attribution models to choose from, each with its own\n", + "\n", + "\n", + "attribution credit according to your chosen model (above).\n", + "\n", + "We highly recommend you regularly review and test your\n", + "\n", + "attribution efforts to ensure they are accurate and effective.\n", + "\n", + "Use experimentation methods such as A/B testing to see\n", + "\n", + "how different strategies impact conversion rates.\n", + "\n", + "**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**\n", + "\n", + "\n", + "strengths and limitations.\n", + "\n", + "\n", + "`1.` **Last-click model:** This model attributes all credit to the\n", + "\n", + "last touchpoint that the customer interacted with before\n", + "\n", + "making a purchase or taking a desired action.\n", + "\n", + "`2.` **First-click model:** This model attributes all credit to the\n", + "\n", + "first touchpoint that the customer interacted with.\n", + "\n", + "`3.` **Linear model:** This model attributes equal credit to each\n", + "\n", + "touchpoint that the customer interacted with.\n", + "\n", + "`4.` **Time decay model:** This model attributes more credit to\n", + "\n", + "touchpoints that are closer in time to the purchase\n", + "\n", + "or desired action.\n", + "\n", + "`5.` **Position-based model:** This model attributes a portion of\n", + "\n", + "the credit to the first and last touchpoints, and the remainder\n", + "\n", + "is distributed evenly among the other touchpoints.\n", + "\n", + "`6.` **Custom model:** Some businesses create their own\n", + "\n", + "attribution model based on specific business needs or goals.\n", + "\n", + "Each attribution model has its own strengths and limitations,\n", + "\n", + "and the right model for a particular video game will depend\n", + "\n", + "on a variety of factors, including the goals of your title, the\n", + "\n", + "customer journey, and the types of marketing channels being\n", + "\n", + "used. It is important to carefully consider the pros and cons\n", + "\n", + "of each model and choose the one that best aligns with the\n", + "\n", + "needs of your game.\n", + "\n", + "Next, you’re going to want to set up tracking. In order to\n", + "\n", + "attribute credit to different touchpoints, you’ll need to set up\n", + "\n", + "tracking to capture data on customer interactions. This might\n", + "\n", + "involve integrating tracking code into the game, or using a\n", + "\n", + "third-party tracking tool.\n", + "\n", + "With tracking set up, you’ll start collecting data on player\n", + "\n", + "interactions and be able to use that information to calculate\n", + "\n", + "\n", + "**Tips / Best Practices - things to consider**\n", + "\n", + "- **Define clear goals:** Sounds simple, but by clearly defining\n", + "\n", + "the goals of your acquisition campaign and what success\n", + "\n", + "looks like, you will be able to guide your decision-making\n", + "\n", + "and ensure that you are measuring the right metrics -\n", + "\n", + "such as cost per install, return on ad spend, conversion\n", + "\n", + "rate, lifetime value, retention rate, and more.\n", + "\n", + "- **Use a data-driven approach:** Use data to inform your\n", + "\n", + "decision-making. 
Collect data on all touchpoints in the\n", + "\n", + "player journey, including ad impressions, clicks, installs,\n", + "\n", + "and in-game actions.\n", + "\n", + "- **Choose the right attribution model:** Select the right\n", + "\n", + "attribution model that accurately reflects the player\n", + "\n", + "journey for your specific genre of game. This can be a\n", + "\n", + "complex process. A couple of things to keep in mind\n", + "\n", + "- Consider the touchpoints that are most important for\n", + "\n", + "your player journey, such as first ad impression, first\n", + "\n", + "click, or first in-game action\n", + "\n", + "- Consider the business goals you’re trying to achieve.\n", + "\n", + "For example, if you are focused on maximizing return\n", + "\n", + "on investment, a last-click attribution model may be\n", + "\n", + "most appropriate. On the other hand, if you are looking\n", + "\n", + "to understand the impact of each touchpoint, a multi-SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf2024-09-19T16:57:21Z
c1ff59b253186b6ec6769a2984f8461a- **Use a data-driven approach:** Use data to inform your\n", + "\n", + "decision-making. Collect data on all touchpoints in the\n", + "\n", + "player journey, including ad impressions, clicks, installs,\n", + "\n", + "and in-game actions.\n", + "\n", + "- **Choose the right attribution model:** Select the right\n", + "\n", + "attribution model that accurately reflects the player\n", + "\n", + "journey for your specific genre of game. This can be a\n", + "\n", + "complex process. A couple of things to keep in mind\n", + "\n", + "- Consider the touchpoints that are most important for\n", + "\n", + "your player journey, such as first ad impression, first\n", + "\n", + "click, or first in-game action\n", + "\n", + "- Consider the business goals you’re trying to achieve.\n", + "\n", + "For example, if you are focused on maximizing return\n", + "\n", + "on investment, a last-click attribution model may be\n", + "\n", + "most appropriate. On the other hand, if you are looking\n", + "\n", + "to understand the impact of each touchpoint, a multi-\n", + "\n", + "touch attribution model may be more appropriate.\n", + "\n", + "- Consider the data you have available, including ad\n", + "\n", + "impressions, clicks, installs, and in-game actions.\n", + "\n", + "- **Continuously monitor and optimize:** Continuously\n", + "\n", + "monitor and optimize your acquisition campaigns based on\n", + "\n", + "the data. Test different approaches, make adjustments as\n", + "\n", + "needed, and use A/B testing to determine what works best.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Player Recommendations\n", + "\n", + "Recommendations is an advanced use case. We don’t\n", + "\n", + "recommend (hehe) that you start here, instead, we’re\n", + "\n", + "assuming that you’ve done the work to set up your game\n", + "\n", + "analytics (collecting, cleaning, and preparing data for analysis)\n", + "\n", + "and that you’ve done basic segmentation to place your\n", + "\n", + "players in cohorts based on their interests and behaviors.\n", + "\n", + "Recommendations can come in many forms for video games.\n", + "\n", + "For this context, we’re going to focus on the wide-and-deep\n", + "\n", + "learning for recommender systems, which has the ability\n", + "\n", + "to both memorize and generalize recommendations based\n", + "\n", + "on player behavior and interactions. First [introduced by](https://arxiv.org/abs/1606.07792)\n", + "\n", + "[Google](https://arxiv.org/abs/1606.07792) for use in its Google Play app store, the wide-and-\n", + "\n", + "deep machine learning (ML) model has become popular in a\n", + "\n", + "variety of online scenarios for its ability to personalize user\n", + "\n", + "engagements, even in ‘cold start problem’ scenarios with\n", + "\n", + "sparse data inputs.\n", + "\n", + "The goal with wide-and-deep recommenders is to provide\n", + "\n", + "\n", + "**Understanding the model design**\n", + "\n", + "To understand the concept of wide-and-deep recommend­\n", + "\n", + "ations, it’s best to think of it as two separate, but collaborating,\n", + "\n", + "engines. The wide model, often referred to in the literature as\n", + "\n", + "the linear model, memorizes users and their past choices. 
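A minimal sketch of assigning credit under the linear model (number 3 above), assuming a hypothetical touchpoint table; a last-click variant would instead keep only the latest touchpoint per converting player. The Multi-Touch Attribution solution accelerator covers more complete, data-driven approaches.

```python
# Minimal sketch (hypothetical table/columns): split each conversion's credit
# equally across the touchpoints in that player's journey (linear attribution).
from pyspark.sql import functions as F
from pyspark.sql.window import Window

touchpoints = spark.table("marketing.touchpoints")  # columns: player_id, channel, touch_ts, converted

journey = Window.partitionBy("player_id")

linear_credit = (
    touchpoints.filter(F.col("converted") == True)
    .withColumn("touch_count", F.count("*").over(journey))
    .withColumn("credit", F.lit(1.0) / F.col("touch_count"))
)

# Credit earned by each marketing channel under the linear model
channel_credit = linear_credit.groupBy("channel").agg(F.sum("credit").alias("attributed_conversions"))
```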
Its inputs may consist simply of a user identifier and a product identifier, though other attributes relevant to the pattern (such as time of day) may also be incorporated.

The deep portion of the model, so named as it is a deep neural network, examines the generalizable attributes of a user and their choices. From these, the model learns the broader characteristics that tend to favor user selections.

Together, the wide and deep submodels are trained on historical product selections by individual users to predict future selections. The end result is a single model capable of calculating the probability with which a user will purchase a given item, given both memorized past choices and generalizations about a user's preferences. These probabilities form the basis for user-specific rankings, which can be used for making recommendations.

**Building the model**

The intuitive logic of the wide-and-deep recommender belies the complexity of its actual construction. Inputs must be defined separately for each of the wide and deep portions of the model, and each must be trained in a coordinated manner to arrive at a single output, yet tuned using optimizers specific to the nature of each submodel. Thankfully, the [TensorFlow DNNLinearCombinedClassifier estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier) provides a pre-packaged architecture, greatly simplifying the assembly of an overall model.

*Figure: the wide sub-model and the deep sub-model each consume the user's identity and attributes (User A) and the product's identity and attributes (Product B); the combined wide & deep model outputs the probability of User A selecting Product B.*
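As a rough illustration of the estimator named above, the sketch below wires a wide (linear) side and a deep (DNN) side into a single `DNNLinearCombinedClassifier`. The feature names, bucket sizes, embedding dimensions, and hidden-unit counts are placeholders, not values taken from this guide.

```python
import tensorflow as tf

# Hypothetical feature names; substitute your own player and product attributes.
user_id = tf.feature_column.categorical_column_with_hash_bucket("user_id", hash_bucket_size=100_000)
product_id = tf.feature_column.categorical_column_with_hash_bucket("product_id", hash_bucket_size=50_000)
hour_of_day = tf.feature_column.numeric_column("hour_of_day")

# Wide (linear) side: memorizes specific user x product co-occurrences.
wide_columns = [
    user_id,
    product_id,
    tf.feature_column.crossed_column(["user_id", "product_id"], hash_bucket_size=1_000_000),
]

# Deep side: dense embeddings of the same identifiers plus generalizable context.
deep_columns = [
    tf.feature_column.embedding_column(user_id, dimension=32),
    tf.feature_column.embedding_column(product_id, dimension=32),
    hour_of_day,
]

estimator = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide_columns,
    linear_optimizer="Ftrl",        # default optimizer for the sparse, wide side
    dnn_feature_columns=deep_columns,
    dnn_optimizer="Adagrad",        # default optimizer for the dense, deep side
    dnn_hidden_units=[128, 64, 32],
)

# estimator.train(input_fn=train_input_fn)
# estimator.evaluate(input_fn=eval_input_fn)
```

The Ftrl and Adagrad optimizers shown are the estimator's documented defaults for the sparse wide side and the dense deep side, which is why each submodel can be tuned separately while still producing a single output.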
**Training**

The challenge for most teams is then training the recommender on the large number of user-product combinations found within their data. Using [Petastorm](https://petastorm.readthedocs.io/en/latest/), an open-source library for serving large datasets assembled in Apache Spark™ to TensorFlow (and other ML libraries), one can cache the data on high-speed, temporary storage and then read that data in manageable increments to the model during training. In doing so, we limit the memory overhead associated with the training exercise while preserving performance.
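As a minimal sketch of that pattern, the snippet below converts a Spark DataFrame of training examples into a cached TensorFlow dataset with Petastorm's Spark converter. The cache path and the `training_df` DataFrame are assumptions standing in for your own storage location and feature table.

```python
from petastorm.spark import SparkDatasetConverter, make_spark_converter

# Point Petastorm's cache at fast, temporary storage (path is illustrative).
spark.conf.set(
    SparkDatasetConverter.PARENT_CACHE_DIR_URL_CONF,
    "file:///dbfs/tmp/petastorm/cache",
)

# `training_df` is assumed to be the Spark DataFrame of user/product training examples.
converter = make_spark_converter(training_df)

# Stream the cached data to TensorFlow in manageable batches.
with converter.make_tf_dataset(batch_size=1024) as dataset:
    # Feed `dataset` into the model's training loop / input_fn here.
    ...

converter.delete()  # remove the temporary cache once training is finished
```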
**Tuning**

Tuning the model becomes the next challenge. Various model parameters control its ability to arrive at an optimal solution. The most efficient way to work through the potential parameter combinations is simply to iterate through some number of training cycles, comparing the models' evaluation metrics with each run to identify the ideal parameter combinations. By distributing these trials, we can parallelize this work across many compute nodes, allowing the optimizations to be performed in a timely manner.
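The guide doesn't prescribe a specific tool for distributing these trials; one common choice on Databricks is Hyperopt with `SparkTrials`, sketched below. The search space and the `train_and_evaluate` helper are hypothetical placeholders for your own training loop.

```python
from hyperopt import SparkTrials, fmin, hp, tpe

# Illustrative search space over two model parameters.
search_space = {
    "embedding_dim": hp.choice("embedding_dim", [16, 32, 64]),
    "dnn_learning_rate": hp.loguniform("dnn_learning_rate", -6, -2),
}

def objective(params):
    # `train_and_evaluate` is a hypothetical helper that trains the wide-and-deep
    # model with `params` and returns the validation metric to minimize.
    return train_and_evaluate(params)

# SparkTrials fans the training cycles out across the cluster's worker nodes.
best_params = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=32,
    trials=SparkTrials(parallelism=8),
)
```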
**Deploying**

Finally, we need to deploy the model for integration with various retail applications. Leveraging [MLflow](https://www.mlflow.org/) allows us to both persist our model and package it for deployment across a wide variety of microservices layers, including Azure Machine Learning, AWS SageMaker, Kubernetes and Databricks Model Serving.

While this seems like a large number of technologies to bring together just to build a single model, Databricks integrates all of these technologies within a single platform, providing data scientists, data engineers and [MLOps](https://www.databricks.com/glossary/mlops) engineers a unified experience. The pre-integration of these technologies means various personas can work faster and leverage additional capabilities, such as the [automated tracking](https://docs.databricks.com/machine-learning/automl-hyperparam-tuning/index.html#automated-mlflow-tracking) of models, to enhance the transparency of the organization's model building efforts.

To see an end-to-end example of how a wide-and-deep recommender model may be built on Databricks, please check out the following notebooks: [Get the notebook](https://d1r5llqwmkrl74.cloudfront.net/notebooks/RCG/Wide_and_Deep/index.html#Wide_and_Deep_1.html)

**[Recommendation Engines solution accelerator](https://www.databricks.com/solutions/accelerators/recommendation-engines)**
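For the deployment step described above, a minimal MLflow sketch might look like the following; the parameter values, metric names, and registered model name are illustrative, and `model` / `eval_metrics` are assumed to come from the training and tuning steps.

```python
import mlflow

# `model` and `eval_metrics` are assumed to come from the training and tuning steps.
with mlflow.start_run() as run:
    mlflow.log_params({"dnn_hidden_units": "128,64,32", "embedding_dim": 32})
    mlflow.log_metric("val_auc", eval_metrics["auc"])
    # Persist the trained model with the run; the flavor depends on how it was built.
    mlflow.tensorflow.log_model(model, artifact_path="model")

# Register the model so it can be promoted and served (e.g. Databricks Model Serving).
mlflow.register_model(f"runs:/{run.info.run_id}/model", "wide_and_deep_recommender")
```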
**Tips / Best Practices - things to consider**

- **Use data to inform recommendations:** Use data from your analytics, player feedback, and other sources to understand what players like and dislike. This will help you create recommendations that are more likely to be relevant and engaging for individual players.

- **Segment your players:** Consider segmenting your players based on characteristics such as playstyle, spending habits, and demographic information. This will allow you to create more targeted recommendations for different groups of players.

- **Consider the player's current context:** When creating recommendations, consider the player's current context, such as what they are doing in the game and what content they have already consumed. This will help you create recommendations that are more likely to be relevant and timely.

- **Test and optimize your recommendations:** Use experimentation methods such as A/B testing to see how different recommendations perform with different player segments. Use the insights you gain to optimize your recommendations.

- **Be transparent:** Make sure you are transparent with players about how you are creating recommendations and give them the option to opt out if they wish.

- **Use recommendations to improve the player experience:** Use personalized recommendations to improve the player experience and increase engagement and satisfaction.

### Getting Started with Next Best Offer/Action

Since NBO/NBA is a specific use case of personalization, how a team might get started implementing this will look very similar to how they would with broader personalization activities. Begin by ensuring you are appropriately collecting player data (behavior, preferences, in-game purchases, etc.) and storing it in your cloud data lake using a service such as Delta Lake from Databricks. From here, you'll use Databricks to clean, transform, and prepare the data for analysis. This may include aggregating data from multiple sources, removing duplicates and outliers, and transforming the data into a format suitable for analysis. As you analyze the player data, seek to identify patterns and trends in player behavior and preferences that will give you signal on which actions are more likely to be successful.

From here, you can build a recommendation model based on the player data analysis, and incorporate information on in-game items and player preferences to make personalized recommendations.

If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://www.databricks.com/company/contact) to us.
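As a rough sketch of the preparation step described above (aggregating events, removing duplicates, and writing a clean feature table), the PySpark snippet below could run in a Databricks notebook. The table and column names are hypothetical.

```python
from pyspark.sql import functions as F

# Hypothetical bronze-layer event tables.
purchases = spark.read.table("game_bronze.in_game_purchases")
sessions = spark.read.table("game_bronze.player_sessions")

# Aggregate behavior per player, drop duplicate events, and persist a clean feature table.
player_features = (
    purchases.dropDuplicates(["purchase_id"])
    .groupBy("player_id")
    .agg(
        F.sum("amount").alias("total_spend"),
        F.count("purchase_id").alias("purchase_count"),
    )
    .join(
        sessions.groupBy("player_id").agg(F.avg("session_minutes").alias("avg_session_minutes")),
        on="player_id",
        how="left",
    )
)

player_features.write.format("delta").mode("overwrite").saveAsTable("game_silver.player_features")
```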
**Tips / Best Practices**

- **Define your goals:** Like every use case, starting with clearly defined goals helps to ensure your implementation of NBO and NBA will be as effective and efficient as possible. Your goals will also help you determine what data to collect and how it will be used.

- **Collect relevant data:** Based on your goals, make sure you are collecting the right data to inform your NBO and NBA recommendations. This might include data on player behavior, engagement, and spending habits.

- **Leverage machine learning to scale your recommendations:** Use machine learning algorithms to analyze your data and make personalized recommendations to your players. This will allow you to identify trends and patterns that might not be immediately apparent.

- **Test and optimize:** THIS IS CRITICAL. Use experimentation methods such as A/B testing to see how different recommendations perform with different player segments. Past performance is not a perfect indicator of future success. Consistent testing allows you to tune your NBO and NBA recommendations so they evolve with your playerbase.

- **Consider the player's context:** When making recommendations, consider the player's current context, such as what they are doing in the game and what content they have already consumed. This will help you create recommendations that are more likely to be relevant and timely.

- **Be transparent:** Make sure you are transparent with your players about how you are using their data to make recommendations, and give them the option to opt out if they wish.

- **Collaborate with your team:** Share your NBO and NBA efforts with your team and encourage them to use the data to inform their work.

### Getting Started with Churn Prediction & Prevention

The exciting part of this analysis is that not only does it help to quantify the risk of customer churn, but it paints a quantitative picture of exactly which factors explain that risk. It's important that we not draw too rash a conclusion with regard to the causal linkage between a particular attribute and its associated hazard, but it's an excellent starting point for identifying where an organization needs to focus its attention for further investigation.

The hard part in this analysis is not the analytic techniques. The Kaplan-Meier curves and Cox Proportional Hazards models used to perform the analysis above are well established and widely supported across analytics platforms. The principal challenge is organizing the input data.

The vast majority of subscription services are fairly new as businesses. As such, the data required to examine customer attrition may be scattered across multiple systems, making an integrated analysis more difficult. Data lakes are a starting point for solving this problem, but the complex transformations required to cleanse and restructure data that has evolved as the business itself has (often rapidly) evolved require considerable processing power.
This is certainly the case with the KKBox information assets and is a point noted by the data provider in their public challenge.

The key to successfully completing this work is the establishment of transparent, maintainable data processing pipelines executed on an elastically scalable (and therefore cost-efficient) infrastructure, a key driver behind the [Delta Lake pattern](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html). While most organizations may not be overly cost-conscious in their initial approach, it's important to remember the point made above that churn is a chronic condition to be managed. As such, this is an analysis that should be periodically revisited to ensure acquisition and retention practices are aligned.

To support this, we are making the code behind our analysis available for download and review. If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://www.databricks.com/company/contact) to us.

**[Churn Prediction solution accelerator](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**
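The survival-analysis techniques named above (Kaplan-Meier and Cox Proportional Hazards) can be sketched with the `lifelines` library, which the guide does not name but is one widely used option. The subscription table and its columns are hypothetical.

```python
from lifelines import CoxPHFitter, KaplanMeierFitter

# Hypothetical subscription history: one row per subscriber with observed tenure,
# a churn flag, and numeric attributes whose hazard we want to quantify.
subscriptions = spark.read.table("game_silver.subscription_history").toPandas()

# Kaplan-Meier curve: the share of subscribers still active over time.
kmf = KaplanMeierFitter()
kmf.fit(durations=subscriptions["tenure_days"], event_observed=subscriptions["churned"])
kmf.plot_survival_function()

# Cox Proportional Hazards: which attributes explain the churn risk.
cph = CoxPHFitter()
cph.fit(
    subscriptions[["tenure_days", "churned", "auto_renew", "plan_price"]],
    duration_col="tenure_days",
    event_col="churned",
)
cph.print_summary()
```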
**Tips / Best Practices**

- **Define churn:** Clearly define what you consider to be player churn, as this will determine how you measure and predict it. For example, you might consider churn to be when a player stops playing your game for a certain number of days, or when they uninstall it.

- **Collect relevant data:** Make sure you are collecting the right data to help you predict and prevent churn. This might include data on player behavior, engagement, and spending habits.

- **Use machine learning:** Use machine learning algorithms to analyze your data and predict which players are at risk of churning. This will allow you to identify trends and patterns that might not be immediately apparent.

- **Test and optimize:** Use experimentation methods such as A/B testing to see how different strategies impact churn rates. Use the insights you gain to optimize your churn prevention efforts.

- **Focus on retention:** Implement retention strategies that are tailored to the needs and preferences of your players. This might involve providing personalized content, addressing pain points, or offering incentives to continue playing.

- **Be transparent:** Make sure you are transparent with your players about how you are using their data to predict and prevent churn, and give them the option to opt out if they wish.

- **Collaborate with your team:** Share your churn prediction and prevention efforts with your team and encourage them to use the data to inform their work.

### Getting Started with Real-time Ad Targeting

Typically, implementing a real-time ad targeting strategy begins outside of your game (in services such as Google Ads or Unity Advertising), where your game becomes the delivery point for the advertisement.
Here, you will need to integrate with ad networks that provide real-time ad targeting capabilities. That will allow you to access a range of available ad assets to dynamically select and display the most relevant ads to players. Both Google AdMob and Unity Ads are great for banner ads, native ads, and rewarded video ads. Your role is to ensure that the data you're collecting is fed back into the advertising platform to better serve targeted ads to your playerbase.

To use a service like Databricks to manage the data needed to provide real-time ad targeting in your application, you can follow the steps below:

1. **Collect and store player data:** Collect data on player behavior, preferences, and demographics, and store it in a data lake using Databricks. Popular analytics tools such as Google Analytics or Mixpanel can be integrated into the game to collect data on player behavior. These tools, just like tracking website traffic, can track in-game events, provide insights on player behavior and demographics, and give you access to detailed reports and dashboards. Another option is to build in-house tracking systems to collect data on player behavior: logging events (e.g. in-game purchases or player actions) and activities such as "at which level does a player quit playing," and storing this in a database for analysis. The downside of building in-house tracking systems is that you will need to host and maintain your own logging servers.

2. **Prepare the data:** Use Databricks to clean, transform, and prepare the player data for analysis. This may include aggregating data from multiple sources, removing duplicates and outliers, and transforming the data into a format suitable for analysis.

3. **Analyze the data:** Use Databricks' built-in machine learning and data analytics capabilities to analyze the player data and identify patterns and trends.

4. **Create audience segments:** Based on the analysis, use Databricks to create audience segments based on common characteristics such as interests, behaviors, and preferences (see the sketch after this list).

5. **Integrate with the ad server:** When an ad opportunity presents itself within the game, a call is made to the ad server. This call includes information about the player, such as the audience segment that they belong to. The ad server then uses this information to decide what ad to deliver to the player.

6. **Monitor and optimize:** Use Databricks to monitor the performance of the ad targeting and make optimizations as needed, such as adjusting the audience segments or adjusting the targeting algorithms.

By using a service like Databricks to manage the data needed for real-time ad targeting, game developers can effectively leverage their player data to create more personalized and engaging experiences, increase revenue, and reduce churn.

If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://www.databricks.com/company/contact) to us.
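One way to implement the segmentation step (step 4) is a simple clustering job with Spark ML, sketched below under the assumption that a player feature table already exists; the table names, feature columns, and number of clusters are illustrative.

```python
from pyspark.ml.clustering import KMeans
from pyspark.ml.feature import VectorAssembler

# Hypothetical feature table produced by the preparation step.
features = spark.read.table("game_silver.player_features")

assembler = VectorAssembler(
    inputCols=["total_spend", "purchase_count", "avg_session_minutes"],
    outputCol="features",
)
assembled = assembler.transform(features)

# Cluster players into a handful of segments.
kmeans = KMeans(k=5, seed=42, featuresCol="features", predictionCol="segment")
segments = kmeans.fit(assembled).transform(assembled)

# Persist segment assignments so the ad server integration can look them up.
segments.select("player_id", "segment").write.format("delta") \
    .mode("overwrite").saveAsTable("game_gold.player_segments")
```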
**Tips / Best Practices**

- **Focus on player data:** Make player data the center of your targeting strategy by collecting and storing comprehensive information on player behavior, preferences, and demographics. Here, it's critical to ensure the game code data trackers are properly implemented in order to collect this data (see the Game Analytics section for detail).

- **Segment your audience:** Create audience segments based on common characteristics such as interests, behaviors, and preferences, and use these segments to deliver targeted ads.

- **Test and iterate:** Continuously test and iterate your targeting strategy to refine your audience segments and improve targeting accuracy.

- **Balance relevance and privacy:** Balance the need for relevant, personalized ads with players' privacy by only collecting and using data that is necessary for targeting and by obtaining player consent.

- **Monitor performance:** Regularly monitor the performance of your targeting strategy to ensure that it is delivering the desired results and make optimizations as needed.

- **Partner with the right ad platform:** Choose an ad platform that is well-suited to your needs and aligns with your goals, and work closely with them to ensure that your targeting strategy is delivering the best results.

# Operational use cases

### Anomaly Detection

The first thing is to begin collecting the data (game server and client logs) out of your project, then consume it into Databricks Delta to have a continuous anomaly detection model running. Focus this on the key pieces of information you want to monitor; for live service games, this is going to be infrastructure and network-related metrics such as ping and server health (number of clients connected, server uptime, server usage, CPU/RAM, number of sessions, time of sessions).

Once the model is ingesting and is tuned specifically for those metrics, you would build out alerts or notifications based on these specific metrics hitting a threshold that you define as needing attention.
From here, you can build out automated systems to mitigate those effects, such as migrating players to a different server, canceling matches, scaling infrastructure, or creating tickets for admins to review.

If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://www.databricks.com/company/contact) to us.
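A very small sketch of the threshold-based monitoring described above, using a z-score over per-server ping statistics in PySpark; the table names, column names, and the 3-standard-deviation cutoff are assumptions, not values from the guide.

```python
from pyspark.sql import functions as F

# Hypothetical Delta table of server health metrics streamed in from game servers.
metrics = spark.read.table("game_bronze.server_metrics")

# Per-server baseline for the metric being watched (ping, in this sketch).
baseline = metrics.groupBy("server_id").agg(
    F.avg("ping_ms").alias("ping_mean"),
    F.stddev("ping_ms").alias("ping_std"),
)

# Flag readings more than 3 standard deviations above the server's baseline.
anomalies = (
    metrics.join(baseline, "server_id")
    .withColumn("z_score", (F.col("ping_ms") - F.col("ping_mean")) / F.col("ping_std"))
    .filter(F.col("z_score") > 3)
)

# Downstream, these rows can drive alerts or automated mitigation jobs.
anomalies.write.format("delta").mode("append").saveAsTable("game_gold.ping_anomalies")
```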
**Tips / Best Practices**

- **Define the problem and objectives clearly:** Before implementing an anomaly detection solution, it is important to define the problem you are trying to solve and your specific objectives. This will help ensure that you have the right data sources and use the appropriate algorithms to achieve your goals.

- **Choose the right data sources:** To effectively detect anomalies, you need to have the right data sources. Consider data from player behavior, system performance, and network traffic, as well as any other data sources that are relevant to your problem and objectives.

- **Clean and preprocess the data:** To ensure that the data you use for anomaly detection is accurate and meaningful, it is important to clean and preprocess the data. This includes removing any irrelevant or invalid data, handling missing values, and normalizing the data if necessary.

- **Choose the right algorithms:** There are many algorithms that can be used for anomaly detection, including statistical methods, machine learning algorithms, and rule-based systems. Choose the algorithms that are best suited to your data and problem, and that provide the right level of accuracy, speed, and scalability.

- **Validate the results:** Before deploying the anomaly detection solution in production, it is important to validate the results by testing the solution on a small subset of data and comparing the results to expected outcomes.

- **Monitor and update the solution:** Once the anomaly detection solution is deployed, it is important to monitor its performance and accuracy, and update the solution as needed. This may include retraining the algorithms, adding or removing data sources, and updating the parameters and thresholds used by the algorithms.

Additionally, there are some key gotchas to look out for when implementing an anomaly detection solution:

- **Avoid overfitting:** Overfitting occurs when the anomaly detection solution is too complex and learns the noise in the data rather than the underlying patterns. To avoid overfitting, it is important to choose algorithms that are appropriate for the size and complexity of the data, and to validate the results using a separate test dataset.

- **False positive and false negative results:** False positive and false negative results can occur when the anomaly detection solution is not properly calibrated, or when the solution is applied to data that is significantly different from the training data. To minimize the risk of false positive and false negative results, it is important to validate the results using a separate test dataset, and to fine-tune the parameters and thresholds used by the algorithms as needed.

- **Scalability:** Scalability can be a concern when implementing an anomaly detection solution, especially when dealing with large amounts of data. To ensure that the solution can scale to meet the demands of a growing player base, it is important to choose algorithms that are fast and scalable, and to deploy the solution using a scalable infrastructure.

### Getting Started with Build Pipeline

An operational goal that game projects have is to make sure game builds are generated and delivered quickly and efficiently to internal testers and external users.

A few of the key metrics and capabilities to consider when analyzing your build pipelines are below:
- **Build time and speed:** This includes metrics such as the time it takes to create a build, the number of builds, and compute spent.

- **Build size and storage:** The size of the builds, the amount of storage used, and network costs.

- **Bug tracking and resolution:** This includes metrics such as the number of bugs reported, the time it takes to resolve them, and the number of bugs that are resolved in each build.

- **Code quality and efficiency:** This includes metrics such as code complexity, code duplication, and the number of code lines written.

- **Collaboration and communication:** Such as the number of code reviews, the number of team meetings, and the number of code commits.

- **Advanced capabilities:** Such as predicting build failure in real time to reduce spend, and combining build data with crash analytics (see below) to have "commit to build" visibility for accelerated bug fixing.

Before you start implementing your build pipeline, it's important to define your requirements. What are the key goals of your build pipeline? Choosing the right CI/CD tools is critical to its success. There are many different tools available, including Jenkins, Azure DevOps, Perforce, GitLab and more. When choosing a CI/CD tool, consider factors such as ease of use, scalability, and cost. In addition, consider the specific needs of your game project, and choose a tool that can meet those needs.

The general recommendation is to look at automating your build process early. Once you've chosen your CI/CD tools, you can automate your build process by setting up a build server, configuring your CI/CD tool, and creating a script to build your game project. The build process should be automated as much as possible, and it should include steps to compile your code, run automated tests, and generate a build of your project.

Once you have automated your build process, often the next step is to implement CD (continuous delivery). This involves automating the delivery of your game builds to stakeholders, such as QA testers, beta testers, or end users via publishing platforms. CD can help ensure that stakeholders have access to the latest version of your game as soon as possible, allowing them to provide feedback and help drive the development process forward.

Finally, it's important to monitor and measure your build pipeline to ensure that it's working as expected. This can involve using tools such as Databricks dashboards to visualize the status of your pipeline, or using metrics such as build times, test results, and deployment success rates to evaluate its performance. By monitoring and measuring your build pipeline, you can identify areas for improvement and make changes as needed to ensure that it continues to meet your needs.

If you have any questions about how Databricks can integrate into your DevOps solution, please don't hesitate to [reach out](https://www.databricks.com/company/contact) to us.
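As a small sketch of the monitoring step above, the snippet below aggregates a hypothetical Delta table of CI/CD build runs into weekly KPIs that could back a Databricks SQL dashboard; the table and column names are placeholders for whatever your CI/CD tool exports.

```python
from pyspark.sql import functions as F

# Hypothetical Delta table populated from the CI/CD tool's webhooks or export job.
builds = spark.read.table("devops_bronze.build_runs")

# Weekly build-pipeline KPIs: build time, volume, failure rate, artifact size.
build_kpis = builds.groupBy(F.date_trunc("week", "started_at").alias("week")).agg(
    F.avg(F.col("duration_seconds") / 60).alias("avg_build_minutes"),
    F.count("*").alias("build_count"),
    F.avg(F.when(F.col("status") == "FAILED", 1).otherwise(0)).alias("failure_rate"),
    F.avg("artifact_size_mb").alias("avg_artifact_size_mb"),
)

# Surface the result in a Databricks SQL dashboard or alert.
build_kpis.write.format("delta").mode("overwrite").saveAsTable("devops_gold.build_kpis")
```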
**Tips / Best Practices**

- **Seek to automate early and often:** Automate as much of the build process as possible, from checking code into version control to generating builds and distributing them to stakeholders. This can help reduce errors and save time, allowing game teams to focus on higher-value tasks.

- **Version control, version control, version control:** Use a version control system to manage the source code and other assets. This ensures that changes to the codebase are tracked and can be easily undone if needed.

- **Implement continuous integration and delivery:** Continuous integration (CI) involves automatically building and testing after code changes are checked into version control. With CI, new changes to the codebase do not break existing functionality. By automating the build process, CI helps to reduce errors and save time. CD, on the other hand, involves automatically delivering builds to stakeholders, such as QA testers, beta testers, or end users, after they have passed the automated tests. By combining CI and CD, a video game project can ensure that builds are generated and delivered quickly and efficiently, without the need for manual intervention.

- **Build for scalability:** As your game project grows, you will need a build pipeline solution that is scalable and can handle the needs of your game team.

- **Integration with other tools:** Integrate the build pipeline solution with other tools and systems, such as issue tracking, testing, and deployment tools, to ensure a smooth and efficient workflow.

**Reference Architecture**

*Figure: reference architecture in which game infrastructure data flows into Databricks, with Databricks SQL serving dashboards in Power BI and AWS QuickSight.*

### Getting Started with Crash Analytics

Building a pipeline for a holistic view to support crash analytics means data coming from multiple different sources, at different velocities, that must be joined together. The number of data sources depends on your game project's publishing platforms; some may come from console providers such as Sony, Xbox, and Nintendo, or PC platforms like Steam, Epic Games Marketplace, GoG and many others.

**High level steps**

- Determine what platforms your game is running on and how to interface with them to collect data.

- **Collect crash data:** Implement crash reporting tools in your game to collect data on crashes. The source data may be delivered in varying formats such as JSON or CSV.

- **Load crash data into Databricks:** Use Databricks' data ingestion tools to load the crash data into your workspace. This could involve using Databricks' built-in data source connectors, or programmatically ingesting files to load the data (a minimal ingestion sketch follows below).

- **Transform and clean the crash data:** Use Databricks' data processing and transformation tools to clean and prepare the crash data for analysis. This could involve using Databricks capabilities like DLT, or using SQL to perform custom transformations.

- **Visualize crash data:** Use Databricks' dashboarding tools to create visualizations that help you understand the patterns and trends in your crash data. This could involve using Databricks' built-in visualization tools, or integrating with external visualization tools like Tableau or Power BI.

- **Analyze crash data:** Use Databricks' machine learning and statistical analysis tools to identify the root causes of crashes. This could involve using Spark MLlib or many of the popular tools to build machine learning models, or using SQL to perform custom analyses.

- **Monitor and refine your pipeline:** Regularly review your pipeline to ensure that it remains relevant and useful. Refine your pipeline as necessary to reflect changes in your crash data or your goals.

If you have any questions about how this solution can be deployed in your environment, please don't hesitate to [reach out](https://www.databricks.com/company/contact) to us.
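The ingestion and analysis steps above might look roughly like the following; the volume path, table names, and crash-report fields are hypothetical and will differ per platform and crash reporting tool.

```python
from pyspark.sql import functions as F

# Hypothetical landing path where the platforms' crash reports arrive as JSON.
crash_raw = spark.read.json("/Volumes/game/crash_reports/raw/")

# Deduplicate and land the reports in a bronze Delta table.
crashes = crash_raw.dropDuplicates(["crash_id"]).withColumn("ingested_at", F.current_timestamp())
crashes.write.format("delta").mode("append").saveAsTable("game_bronze.crash_reports")

# Rank the most common crash signatures per build to prioritize fixes.
top_crashes = (
    spark.read.table("game_bronze.crash_reports")
    .groupBy("build_id", "exception_signature")
    .agg(
        F.count("*").alias("occurrences"),
        F.countDistinct("player_id").alias("players_affected"),
    )
    .orderBy(F.desc("occurrences"))
)
top_crashes.show(20, truncate=False)
```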
**Tips / Best Practices**

- **Automated collection and aggregation of crash reports:** Collecting crash reports should be an automated process that is integrated into the output of the build pipeline for the game. The crash reports should be automatically aggregated and made available for analysis in near real-time.

- **Clear reporting and prioritization of issues:** The solution should provide clear reporting on the most common issues and allow game developers to prioritize fixing the most impactful problems first.

- **Integration with other analytics tools:** The crash analytics solution should integrate with other analytics tools, such as player behavior tracking, to provide a more complete picture of how crashes are impacting the player experience.

- **Flexibility and scalability:** As the game grows, the solution should be able to scale to accommodate an increasing number of players and crashes.

- **Data privacy and security:** It's important to consider data privacy and security when implementing a crash analytics solution. This may involve implementing measures to anonymize crash reports, or taking steps to ensure that sensitive information is not included in the reports.

Additionally, there are some key gotchas to look out for when implementing a crash analytics solution:

- **Data privacy and security:** Ensure that crash reports do not contain sensitive information that could be used to identify individual players.

- **Scalability:** As the number of players and crashes increases, it may become difficult to manage and analyze the growing volume of data.

- **Integration with other tools:** Be aware when integrating crash analytics with other tools and systems, especially if the tools use different data formats or data structures.

- **Prioritization of issues:** Determine which crashes are the most impactful and prioritize fixes accordingly. This can be a complex process, especially if there are a large number of different crash types and causes.

**Reference Architecture**

*Figure: reference architecture connecting game infrastructure to Databricks SQL, with dashboards in Power BI and AWS QuickSight.*
-----

### Executive Guide

# Transform and Scale Your Organization With Data and AI

#### A guide for CIOs, CDOs, and data and AI executives

**AUTHOR:**

Chris D'Agostino, Global Field CTO, Databricks

**EDITORS:**

Manveer Sahota, Jessica Barbieri, Toby Balfre

## Contents

**Chapter 1:** Executive Summary

**Chapter 2:** Define the Strategy

1. Establish the goals and business value
2. Identify and prioritize use cases
3. Build successful data teams
4. Deploy a modern data stack
5. Improve data governance and compliance
6. Democratize access to quality data
7. Dramatically increase productivity of your workforce
8. Make informed build vs. buy decisions
9. Allocate, monitor and optimize costs
10. Move to production and scale adoption

**Chapter 3:** Conclusion

**CHAPTER 1:**

## Executive Summary

Data and AI leaders are faced with the challenge of future-proofing their architecture and platform investments. The Lakehouse implementation from Databricks combines the best features of EDWs and data lakes by enabling all their workloads using open source and open standards — avoiding the vendor lock-in, black box design and proprietary data formats of other cloud vendors.

It's not surprising that many industry experts say data is the most valuable resource in the modern economy — some even go so far as to describe it as the "new oil." But at Databricks, we think of data as water. Its core compound never changes, and it can be transformed to whatever use case is desired, with the ability to get it back to its original form. Furthermore, just as water is essential to life, data is now essential to survival, competitive differentiation and innovation for every business. Clearly, the impact and importance of data are growing exponentially in both our professional and personal lives, while artificial intelligence (AI) is being infused in more of our daily digital interactions. The explosion in data availability over the last decade and the forecast for growth at a compounded [annual growth rate (CAGR) of 23%](https://www.google.com/url?q=https://www.idc.com/getdoc.jsp?containerId%3DprUS47560321&sa=D&source=docs&ust=1651117260200496&usg=AOvVaw3jdZ_6YHlXGQlUMJK8ULux) over 2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning (ML) environments — have caused a major shift in how organizations leverage data and AI to improve data governance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.

Every organization is working to improve business outcomes while effectively managing a variety of risks — including economic, compliance, security and fraud, financial, reputational, operational and competitive risk. Your organization's data and the systems that process it play a critical role in not only enabling your financial goals but also in minimizing these seven key business risks.

Businesses have realized that their legacy information technology (IT) platforms are not able to scale and meet the increasing demands for better data analytics. As a result, they are looking to transform how their organizations use and process data. Successful data transformation initiatives for data, analytics and AI involve not only the design of hardware and software systems but also the alignment of people, processes and platforms. These initiatives always require a major financial investment and, therefore, need to yield a significant return on investment (ROI) — one that starts in months, not years.

To guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite. Despite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges to deliver on their data strategy — including how to deploy a modern data architecture, leverage data efficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and identify and execute on AI opportunities.

To successfully lead data and AI transformation initiatives, organizations need to develop and execute a comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the full potential of all their data, and future-proof their investments to provide the greatest ROI. Today, organizations have the option of moving away from closed, proprietary systems offered by a variety of cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using industry standards.

At Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and we've hired industry experts and thought leaders to help organizations better understand the steps involved in successful digital transformation initiatives. We are the first vendor to propose the data lakehouse architecture, which decouples data storage from compute while providing the best price/performance metrics for all your data workloads — including data warehousing.
We have captured the lessons learned and summarized them in this series of Executive Guides — which are designed to serve as blueprints for CIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation initiatives for data, analytics and AI using a _modern data stack_. Databricks is the first company to deliver a unified data platform that realizes the data lakehouse architecture and enables the data personas in your organization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as shown in Figure 1.\n", + "\n", + "[Figure 1: The Databricks Lakehouse Platform. Workloads for Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML run on Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and a cloud data lake (all structured and unstructured data)]\n", + "\n", + "-----\n", + "\n", + "**The lakehouse architecture benefits organizations in several ways:**\n", + "\n", + "**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n", + "\n", + "**2.** It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n", + "\n", + "**3.** It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
4df144a0314dfaf639ae04e7ebb499d8organization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as shown in Figure 1.\n", + "\n", + "[Figure 1: The Databricks Lakehouse Platform. Workloads for Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML run on Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and a cloud data lake (all structured and unstructured data)]\n", + "\n", + "-----\n", + "\n", + "**The lakehouse architecture benefits organizations in several ways:**\n", + "\n", + "**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n", + "\n", + "**2.** It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n", + "\n", + "**3.** It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.\n", + "\n", + "Our intention is to present key considerations and equip you with the knowledge to ask informed questions, make the most critical decisions early in the process, and develop the comprehensive strategy that most organizations lack.\n", + "\n", + "In addition, we have created an easy-to-follow Data and AI Maturity Model and provided a comprehensive professional services offering that organizations can leverage to measure their readiness, reskill their staff and track progress as they embark on their data transformation initiative.\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 2:**\n", + "## Define the Strategy\n", + "\n", + "The most critical step to enable data, analytics and AI at scale is to develop a comprehensive and executable strategy for how your organization will leverage people, processes and platforms to drive measurable business results against your corporate priorities. The strategy serves as a set of principles that every member of your organization can refer to when making decisions. The strategy should cover the roles and responsibilities of teams within your organization for how you capture, store, curate and process data to run your business — including the internal and external resources (labor and budget) needed to be successful.\n", + "\n", + "[Figure 2: The 10 steps to a winning data and AI strategy: establish the goals and business value; identify and prioritize use cases; build successful data teams; deploy a modern data architecture; ease data governance and compliance; democratize access to quality data; simplify the user experience; make informed build vs. buy decisions; allocate, monitor and optimize costs; move to production and drive adoption]\n", + "\n", + "-----\n", + "\n", + "#### Here are 10 key considerations\n", + "\n", + "**1.** Secure buy-in and alignment on the overall business goals, timeline and appetite for the initiative.\n", + "\n", + "**2.** Identify, evaluate and prioritize use cases that actually provide a significant ROI.\n", + "\n", + "**3.** Create high-performing teams and empower your business analyst, data scientist, machine learning and data engineering talent.\n", + "\n", + "**4.** Future-proof your technology investment with a modern data architecture.\n", + "\n", + "**5.** Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California Consumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n", + "\n", + "**6.** Implement needed policies, procedures and technology to guarantee data quality and enable secure data access and the sharing of all your data across the organization.\n", + "\n", + "**7.** Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n", + "\n", + "**8.** Make informed build vs. buy decisions and ensure you are focusing your limited resources on the most important problems.\n", + "\n", + "**9.** Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n", + "\n", + "**10.** Codify best practices for moving into production and how to measure progress, rate of adoption and user satisfaction.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
e47cfe4574f92d0f241f9535db8cbac4and data engineering talent.\n", + "\n", + "**4.** \u0007Future-proof your technology investment with a modern data architecture.\n", + "\n", + "**5.** \u0007Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California\n", + "\n", + "Consumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n", + "\n", + "**6.** \u0007Implement needed policies, procedures and technology to guarantee data quality and enable secure\n", + "\n", + "data access and the sharing of all your data across the organization.\n", + "\n", + "**7.** \u0007Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n", + "\n", + "**8.** \u0007Make informed build vs. buy decisions and ensure you are focusing your limited resources on the most\n", + "\n", + "important problems.\n", + "\n", + "**9.** \u0007Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n", + "\n", + "**10.** \u0007Codify best practices for moving into production and how to measure progress, rate of adoption and\n", + "\n", + "user satisfaction.\n", + "\n", + "The strategy should clearly answer these 10 topics and more, and should be captured in a living document,\n", + "\n", + "owned and governed by the CDO and made available for everyone in the organization to review and provide\n", + "\n", + "feedback on. The strategy will evolve based on the changing market/political conditions, evolving business,\n", + "\n", + "the technology landscape or a combination of any of these — but it should serve as the North Star for\n", + "\n", + "how you will navigate the many decisions and trade-offs that you will need to make over the course of the\n", + "\n", + "transformation.\n", + "\n", + "\n", + "This guide takes a stepwise approach to\n", + "\n", + "addressing each of these 10 topics.\n", + "\n", + "\n", + "-----\n", + "\n", + "Studies have shown that data scientists spend 80%\n", + "\n", + "of their time collecting and compiling data sets\n", + "\n", + "\n", + "#### 1. Establish the goals and business value\n", + "\n", + "Most organizations on a data, analytics and AI journey establish a set of goals for the resulting investment.\n", + "\n", + "The goals generally fall into one of three categories:\n", + "\n", + "**1.** **Business outcomes**\n", + "\n", + "**2.** **People**\n", + "\n", + "**3.** **Technology**\n", + "\n", + "\n", + "and only 20% of their time developing insights and\n", + "\n", + "\n", + "In terms of business outcomes, organizations need to adapt more quickly to market opportunities and\n", + "\n", + "emerging risks, and their legacy-based information systems make that difficult to achieve. As a result,\n", + "\n", + "business leaders see the digital transformation as an opportunity to build a new technology foundation\n", + "\n", + "from which to run their business and increase business value. One that is more agile, scalable, secure and\n", + "\n", + "easier to use — making the organization better positioned to adapt, innovate and thrive in the modern and\n", + "\n", + "dynamic economy.\n", + "\n", + "For organizations today, people are one of their most valuable assets — you cannot succeed in data,\n", + "\n", + "analytics and AI without them. The battle for top talent is as fierce as ever, and the way that people work\n", + "\n", + "impacts your ability to hire and retain the skills you need to succeed. 
It is important to make sure that\n", + "\n", + "employees work in a frictionless data environment, to the extent possible, so they feel productive each day\n", + "\n", + "and can do their best work.\n", + "\n", + "Finally, from a technology perspective, organizations have grown tired of the high costs associated with\n", + "\n", + "complex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The\n", + "\n", + "industry trend is to move away from large capital expenditures (capex) to pay for network and server\n", + "\n", + "capacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex)\n", + "\n", + "approach. Your data analytics environment should support this trend as well — using open standards, low-\n", + "\n", + "cost storage and on-demand compute that efficiently spins up to perform data workloads and spins down\n", + "\n", + "once they are complete.\n", + "\n", + "\n", + "algorithms. Organizations that are able to invert\n", + "\n", + "these numbers benefit in two ways — happier\n", + "\n", + "employees and improved time to market for use\n", + "\n", + "cases. These employers create more favorable\n", + "\n", + "working environments and lower the risk of burnout\n", + "\n", + "and the resulting regrettable attrition.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
a3c69e2621c59adc16c4f9b279b5bc3cand can do their best work.\n", + "\n", + "Finally, from a technology perspective, organizations have grown tired of the high costs associated with\n", + "\n", + "complex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The\n", + "\n", + "industry trend is to move away from large capital expenditures (capex) to pay for network and server\n", + "\n", + "capacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex)\n", + "\n", + "approach. Your data analytics environment should support this trend as well — using open standards, low-\n", + "\n", + "cost storage and on-demand compute that efficiently spins up to perform data workloads and spins down\n", + "\n", + "once they are complete.\n", + "\n", + "\n", + "algorithms. Organizations that are able to invert\n", + "\n", + "these numbers benefit in two ways — happier\n", + "\n", + "employees and improved time to market for use\n", + "\n", + "cases. These employers create more favorable\n", + "\n", + "working environments and lower the risk of burnout\n", + "\n", + "and the resulting regrettable attrition.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Executive buy-in and support**\n", + "\n", + "Large organizations are difficult to change — but it’s not impossible. In order to be successful, you need\n", + "\n", + "to have unwavering buy-in and support from the highest levels of management — including the CEO and\n", + "\n", + "board of directors. With this support, you have the leverage you need to develop the strategy, decide on\n", + "\n", + "an architecture and implement a solution that can truly change the way your business is run. Without it,\n", + "\n", + "you have a very expensive science project that has little hope of succeeding. Why? Because the majority\n", + "\n", + "of people in your organization are busy doing their day jobs. The added work to support the initiative must\n", + "\n", + "be offset by a clear articulation of the resulting benefits — not only for the business but for the personnel\n", + "\n", + "within it. The transformation should result in a positive change to how people do their jobs on a daily basis.\n", + "\n", + "Transformation for data, analytics and AI needs to be a company-wide initiative that has the support from\n", + "\n", + "all the leaders. Even if the approach is to enable data and AI one business unit (BU) at a time, the plan needs\n", + "\n", + "to be something that is fully embraced in order to succeed. Ideally, the senior-most executives serve as\n", + "\n", + "vocal proponents.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolve to an AI-first company — not just a data-first company**\n", + "\n", + "Data and AI transformations should truly transform the way organizations use data, not just evolve it. For\n", + "\n", + "decades, businesses have operated using traditional business processes and leveraged Structured Query\n", + "\n", + "Language (SQL) and business intelligence (BI) tools to query, manipulate and report on a subset of their\n", + "\n", + "data. There are five major challenges with this approach:\n", + "\n", + "**1.** \u0007A true self-assessment of where your organization is on the AI maturity curve. 
Most organizations will use pockets of success with analytics and AI to move higher up the maturity curve, but in reality the ability to replicate and scale the results is nearly impossible.\n", + "\n", + "[Figure 3: The Data Maturity Curve. Tech leaders are to the right of the curve, which runs from Clean Data, Reports and Ad Hoc Queries (what happened?) through Data Exploration and Predictive Modeling (what will happen?) to Prescriptive Analytics (how should we respond?) and Automated Decision-Making (automatically make the best decision), moving from hindsight to foresight as data and AI maturity increases]\n", + "\n", + "-----\n", + "\n", + "**2.** Data volumes and types have outgrown even the most modern approaches to SQL-based data processing.\n", + "\n", + "**3.** These large data volumes also make it nearly impossible for your workforce to continue to programmatically state, in a priority manner, how data insights can be achieved or how the business should react to changing data.\n", + "\n", + "**4.** Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the number of people needed to respond to every piece of data flowing into your environment. MachinesSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
d7d8ac7c2c7123ddb5a774d91fdb8b1f[Figure 3: The Data Maturity Curve. Tech leaders are to the right of the curve, which runs from Clean Data, Reports and Ad Hoc Queries (what happened?) through Data Exploration and Predictive Modeling (what will happen?) to Prescriptive Analytics (how should we respond?) and Automated Decision-Making (automatically make the best decision), moving from hindsight to foresight as data and AI maturity increases]\n", + "\n", + "-----\n", + "\n", + "**2.** Data volumes and types have outgrown even the most modern approaches to SQL-based data processing.\n", + "\n", + "**3.** These large data volumes also make it nearly impossible for your workforce to continue to programmatically state, in a priority manner, how data insights can be achieved or how the business should react to changing data.\n", + "\n", + "**4.** Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the number of people needed to respond to every piece of data flowing into your environment. Machines scale, people do not.\n", + "\n", + "**5.** Advances in machine learning and AI have simplified the steps and reduced the expertise needed to gain game-changing insights. For these reasons, plus many others, the organizations that thrive in the 21st century will do so based on their ability to leverage all the data at their disposal. Traditional ways of processing and managing data will not work. Using ML and AI will empower your workforce to leverage data to make better decisions for managing risk, helping your organization succeed in the modern economy.\n", + "\n", + "**Go “all in” on the cloud**\n", + "\n", + "The COVID-19 pandemic has caused rapid adoption of cloud-based solutions for collaboration and videoconferencing — and organizations are now using this time to reevaluate their use of on-premises and cloud-based services. The cloud vendors provide many benefits to organizations, including Infrastructure as a Service (IaaS), Platform as a Service (PaaS) and Software as a Service (SaaS) solutions. These benefits, especially when combined with the use of open source software (OSS), increase the speed at which organizations can use the latest technologies while also reducing their capex in these budget-conscious times.\n", + "\n", + "For AWS, Microsoft, Google and other cloud providers, the game is about data acquisition. The more corporate data that resides in a specific cloud, the more sticky the customer is to the vendor. At the same time, multicloud support is both a selling point and an aspirational goal for many organizations. 
Companies\n", + "\n", + "are well aware of vendor lock-in and want to abstract their applications so they can be moved across\n", + "\n", + "clouds if there is a compelling business reason.\n", + "\n", + "\n", + "-----\n", + "\n", + "Approaching your technology choices with a multicloud point of view gives the organization more sovereignty\n", + "\n", + "over the data — flexibility to run workloads anywhere, ease of integration when acquiring businesses that\n", + "\n", + "run on different cloud providers and simplified compliance with emerging regulations that may require\n", + "\n", + "companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n", + "\n", + "As a result, data portability and the ability to run workloads on different cloud providers are becoming\n", + "\n", + "increasingly important.\n", + "\n", + "**Modernize business applications**\n", + "\n", + "As organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n", + "\n", + "approach. The majority of on-premises applications are not built with the cloud in mind. They usually\n", + "\n", + "differ in the way that they handle security, resiliency, scalability and failover. Their application designs\n", + "\n", + "often store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n", + "\n", + "CCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n", + "\n", + "therefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\n", + "\n", + "services and APIs to easily provide access to an application’s functionality.\n", + "\n", + "Cloud-based architectures, commodity databases and software application development frameworks makeSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
25ef18d715b47231f6594d1da80303e9companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n", + "\n", + "As a result, data portability and the ability to run workloads on different cloud providers are becoming\n", + "\n", + "increasingly important.\n", + "\n", + "**Modernize business applications**\n", + "\n", + "As organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n", + "\n", + "approach. The majority of on-premises applications are not built with the cloud in mind. They usually\n", + "\n", + "differ in the way that they handle security, resiliency, scalability and failover. Their application designs\n", + "\n", + "often store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n", + "\n", + "CCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n", + "\n", + "therefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\n", + "\n", + "services and APIs to easily provide access to an application’s functionality.\n", + "\n", + "Cloud-based architectures, commodity databases and software application development frameworks make\n", + "\n", + "it easier for developers to build scalable, secure end-to-end applications to run all your internal business\n", + "\n", + "processes. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n", + "\n", + "a backing database) has become straightforward with the latest tooling available to your application\n", + "\n", + "development teams.\n", + "\n", + "As a first step, organizations should inventory their business-critical applications, prioritize them based\n", + "\n", + "on business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n", + "\n", + "applications that generate and store a significant amount of the data consumed within an organization. Using\n", + "\n", + "a consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n", + "\n", + "\n", + "“We are on an amazing journey. Being among\n", + "\n", + "the fastest-growing enterprise software cloud\n", + "\n", + "companies on record was unimaginable when\n", + "\n", + "we started Databricks. To get here, we’ve stayed\n", + "\n", + "focused on the three big bets we made when\n", + "\n", + "founding the company — cloud, open source\n", + "\n", + "and machine learning. Fast-forward seven years,\n", + "\n", + "thousands of data teams around the globe are\n", + "\n", + "working better together on Databricks.”\n", + "\n", + "**Ali Ghodsi**\n", + "\n", + "Co-founder and CEO\n", + "\n", + "Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "The next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n", + "\n", + "A good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n", + "\n", + "other applications within your environment to store copies of the data — unless absolutely necessary for\n", + "\n", + "performance reasons. 
In this case, it is best to “cache” the data for use in the non-SOR application and sync\n", + "\n", + "the data from the actual SOR.\n", + "\n", + "Data from these SORs should be made available in three ways:\n", + "\n", + "**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n", + "\n", + "**2.** \u0007Ensure that copies of the data land in the data lake.\n", + "\n", + "**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n", + "\n", + "consumption by downstream applications.\n", + "\n", + "**Move toward real-time decisioning**\n", + "\n", + "The value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n", + "\n", + "and the second is to view data as an individual event. This so-called “time value of data” is an important\n", + "\n", + "concept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n", + "\n", + "the same data platform.\n", + "\n", + "On the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n", + "\n", + "aggregate data provides the ability to look back in time and see the complete history of an aspect of your\n", + "\n", + "business and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\n", + "\n", + "newly created or arriving data event gives you the opportunity to make decisions — in the moment — thatSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
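The three system-of-record (SOR) patterns described in this chunk (REST APIs, copies landed in the data lake, and real-time CDC streams) map naturally onto Spark Structured Streaming on the lakehouse. Below is a minimal, hypothetical PySpark sketch of the third pattern; the Kafka topic `orders_cdc`, the schema and the Delta table `bronze.orders_cdc` are illustrative assumptions, not code from the guide.

```python
# Hypothetical sketch: landing change-data-capture (CDC) events from a
# system-of-record application into the data lake with Spark Structured Streaming.
# Topic, schema and table names are illustrative, not from the guide.
from pyspark.sql import SparkSession
from pyspark.sql.functions import from_json, col
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

spark = SparkSession.builder.getOrCreate()

cdc_schema = StructType([
    StructField("order_id", StringType()),
    StructField("op", StringType()),        # INSERT / UPDATE / DELETE
    StructField("payload", StringType()),   # serialized row image
    StructField("event_ts", TimestampType()),
])

# Read the raw CDC events from a (hypothetical) Kafka topic.
raw_events = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker:9092")
    .option("subscribe", "orders_cdc")
    .load()
)

parsed = raw_events.select(
    from_json(col("value").cast("string"), cdc_schema).alias("evt")
).select("evt.*")

# Append the change events to a bronze Delta table; downstream batch jobs,
# streaming jobs and SQL consumers all read from the same table.
(
    parsed.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/orders_cdc_bronze")
    .trigger(processingTime="1 minute")
    .toTable("bronze.orders_cdc")
)
```

Because the change events land in an open Delta table, the same data also satisfies the second pattern: batch jobs and SQL users can query it without maintaining a separate copy.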
a20cadbb79e7462225d18454eb8193d4**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n", + "\n", + "consumption by downstream applications.\n", + "\n", + "**Move toward real-time decisioning**\n", + "\n", + "The value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n", + "\n", + "and the second is to view data as an individual event. This so-called “time value of data” is an important\n", + "\n", + "concept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n", + "\n", + "the same data platform.\n", + "\n", + "On the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n", + "\n", + "aggregate data provides the ability to look back in time and see the complete history of an aspect of your\n", + "\n", + "business and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\n", + "\n", + "newly created or arriving data event gives you the opportunity to make decisions — in the moment — that\n", + "\n", + "can positively affect your ability to reduce risk, better service your customers or lower your operating costs.\n", + "\n", + "The goal is to act immediately — with reliability and accuracy — upon the arrival of a new streaming event.\n", + "\n", + "This “time value of data” is shown in Figure 4 on the next page.\n", + "\n", + "\n", + "-----\n", + "\n", + "For example, real-time processing of clickstream data from your customer-facing mobile application can\n", + "\n", + "indicate when the customer is having trouble and may need to call into your call center. This insight gives\n", + "\n", + "you the opportunity to interject with a digital assistant or to pass on “just-in-time” information to your call\n", + "\n", + "center agents — improving the customer experience and lowering customer churn.\n", + "\n", + "Data, analytics and AI rely on the ”time value of data” — a powerful concept that allows you to train your\n", + "\n", + "machine learning models using historical data and provides you with the ability to make real-time decisions\n", + "\n", + "as new events take place. For example, credit card fraud models can use deep historical data about a given\n", + "\n", + "customer’s buying patterns (location, day of week, time of day, retailer, average purchase amount, etc.) to\n", + "\n", + "build rich models that are then executed for each new credit card transaction. 
This real-time execution, combined with historical data, enables the best possible customer experience.\n", + "\n", + "#### Time Value of Data\n", + "\n", + "The Databricks Lakehouse Platform allows you to combine real-time streaming and batch processing using one architecture and a consistent set of programming APIs.\n", + "\n", + "[Figure 4: Time Value of Data. The value of an individual data record is very high once created but decreases over time, while the value of data records in aggregate increases over time; workloads range from real-time decisioning and real-time analysis to trend analysis and model training]\n", + "\n", + "-----\n", + "\n", + "**Land** **_all_** **data in a data lake**\n", + "\n", + "In order to effectively drive data, analytics and AI adoption, relevant data needs to be made available to the user as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to access. Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data warehouse, with predefined schemas that are designed to allow you to ask very specific questions about that data only. What do you do when you want to ask a different question? To further complicate matters, how do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores? How do you find new insights as quickly as possible?\n", + "\n", + "The overall goal is to gain insights from the data as quickly as possible — which can happen at any step along the data pipeline — including raw, refined and curated data states.\n", + "\n", + "This phenomenon has led to the concept known as the four Vs of data — specifically, _volume_, _velocity_, _variety_ and _veracity_. Data-, analytics- and AI-driven organizations need to be able to store and processSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
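To make the "consistent set of programming APIs" point concrete, here is a small, hypothetical PySpark sketch in the spirit of the credit card example in this chunk: the same Delta table is read once as a batch DataFrame for historical trend analysis and once as a stream for per-event scoring. The table and column names, and the simple threshold standing in for a fraud model, are illustrative assumptions rather than code from the guide.

```python
# Hypothetical sketch of the "time value of data" idea: the same Delta table
# serves both batch (aggregate, historical) and streaming (per-event) workloads
# through the same DataFrame API. Names are illustrative only.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

# Batch view: value in aggregate, used for trend analysis and model training.
daily_spend = (
    spark.read.table("bronze.card_transactions")
    .groupBy("customer_id", F.to_date("event_ts").alias("day"))
    .agg(F.sum("amount").alias("daily_spend"))
)

# Streaming view: value of the individual event, scoring each new transaction
# as it arrives from the same table.
scored_stream = (
    spark.readStream.table("bronze.card_transactions")
    .withColumn("high_value", F.col("amount") > 1000)  # stand-in for a fraud model call
)

query = (
    scored_stream.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/txn_scoring")
    .toTable("silver.scored_transactions")
)
```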
5decf6a290526bf6b3497f886667a551user as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to\n", + "\n", + "access. Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data\n", + "\n", + "warehouse, with predefined schemas that are designed to allow you to ask very specific questions about\n", + "\n", + "that data only. What do you do when you want to ask a different question? To further complicate matters,\n", + "\n", + "how do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores?\n", + "\n", + "How do you find new insights as quickly as possible?\n", + "\n", + "The overall goal is to gain insights from the data as quickly as possible — which can happen at any step\n", + "\n", + "along the data pipeline — including raw, refined and curated data states.\n", + "\n", + "This phenomenon has led to the concept known as the four Vs of data — specifically, _volume_ , _velocity_ ,\n", + "\n", + "_variety_ and _veracity_ . Data-, analytics- and AI-driven organizations need to be able to store and process\n", + "\n", + "all their data, regardless of size, shape or speed. In addition, data lineage and provenance are critical to\n", + "\n", + "knowing whether or not you can trust the data.\n", + "\n", + "**Change the way people work**\n", + "\n", + "When done correctly, organizations get value from data, analytics and AI in three ways — infrastructure\n", + "\n", + "savings, productivity gains and business-impacting use cases. Productivity gains require a true focus on\n", + "\n", + "minimizing the number of steps needed to produce results with data. This can be accomplished by:\n", + "\n", + "**1.** \u0007 Making data more accessible and ensuring it can be trusted\n", + "\n", + "**2.** Minimizing the number of tools/systems needed to perform work\n", + "\n", + "**3.** Creating a flywheel effect by leveraging the work of others\n", + "\n", + "\n", + "“We believe that the data lakehouse architecture\n", + "\n", + "presents an opportunity comparable to the one\n", + "\n", + "we saw during early years of the data warehouse\n", + "\n", + "market. The unique ability of the lakehouse to\n", + "\n", + "manage data in an open environment, blend all\n", + "\n", + "varieties of data from all parts of the enterprise and\n", + "\n", + "combine the data science focus of the data lake\n", + "\n", + "with the end-user analytics of the data warehouse\n", + "\n", + "will unlock incredible value for organizations.”\n", + "\n", + "**Bill Inmon**\n", + "\n", + "The father of the data warehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "In large organizations, it’s understandable why application and data silos are prevalent. Each business unit\n", + "\n", + "is laser-focused on achieving their goals, and the use of information technology is viewed as an enabler.\n", + "\n", + "Systems and applications get built over time to satisfy specific needs within a line of business. As a result,\n", + "\n", + "it’s not surprising to learn that employees must jump through a large number of hoops to get access to the\n", + "\n", + "data they need to do their jobs. 
It should be as simple as getting your identity and PC.\n", + "\n", + "With Databricks, users can collaborate and perform\n", + "\n", + "\n", + "A primary goal of your data and AI transformation should be to focus on improving the user experience —\n", + "\n", + "in other words, improving how your entire organization interacts with data. Data must be easily discoverable\n", + "\n", + "with default access to users based on their role(s) — with a simple process to compliantly request access to\n", + "\n", + "data sets that are currently restricted. The tooling you make available should satisfy the principal needs of\n", + "\n", + "the various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n", + "\n", + "Finally, the results of the work performed by a user or system upstream should be made available to users\n", + "\n", + "and systems downstream as “data assets” that can drive business value.\n", + "\n", + "Organizations that maximize the productivity of their workforce and enable employees to do their best work\n", + "\n", + "under optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n", + "\n", + "**Minimize time in the “seam”**\n", + "\n", + "As you begin your data transformation, it is important to know that the longer it takes, the more risk and\n", + "\n", + "cost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n", + "\n", + "to a modern data stack will require you to operate in two environments simultaneously, the old and the new,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
e98460a27fdbfdd72025b6718dc50b06with default access to users based on their role(s) — with a simple process to compliantly request access to\n", + "\n", + "data sets that are currently restricted. The tooling you make available should satisfy the principal needs of\n", + "\n", + "the various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n", + "\n", + "Finally, the results of the work performed by a user or system upstream should be made available to users\n", + "\n", + "and systems downstream as “data assets” that can drive business value.\n", + "\n", + "Organizations that maximize the productivity of their workforce and enable employees to do their best work\n", + "\n", + "under optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n", + "\n", + "**Minimize time in the “seam”**\n", + "\n", + "As you begin your data transformation, it is important to know that the longer it takes, the more risk and\n", + "\n", + "cost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n", + "\n", + "to a modern data stack will require you to operate in two environments simultaneously, the old and the new,\n", + "\n", + "for some period of time. This will have a series of momentary adverse effects on your business:\n", + "\n", + "\u0007It will increase your operational costs substantially, as you will run two sets of infrastructure\n", + "\n", + "\u0007It will increase your data governance risk, since you will have multiple copies of your data sitting in two\n", + "\n", + "very different ecosystems\n", + "\n", + "\n", + "their work more efficiently, regardless of their\n", + "\n", + "persona or role. The user experience is designed\n", + "\n", + "to support the workloads of data analysts, SQL\n", + "\n", + "developers, data engineers, data scientists and\n", + "\n", + "machine learning professionals.\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007It increases the cyberattack footprint and vectors, as the platforms will likely have very different security\n", + "\n", + "models and cyber defenses\n", + "\n", + "\u0007It will cause strain on your IT workforce due to the challenges of running multiple environments\n", + "\n", + "\u0007It will require precise communications to ensure that your business partners know which environment to\n", + "\n", + "use and for what data workloads\n", + "\n", + "To mitigate some of the strain on the IT workforce, some organizations hire staff augmentation firms to\n", + "\n", + "“keep the lights on” for the legacy systems while the new systems are being implemented and rolled out.\n", + "\n", + "It’s important to remember this is a critical but short-lived experience for business continuity.\n", + "\n", + "**Shut down legacy platforms**\n", + "\n", + "In keeping with the goal of minimizing time in the seam, the project plan and timeline must include the\n", + "\n", + "steps and sequencing for shutting down legacy platforms. For example, many companies migrate their on-\n", + "\n", + "premises Apache Hadoop data lake to a cloud-based object store. 
The approach for shutting down the on-\n", + "\n", + "premises Hadoop system is generally as follows:\n", + "\n", + "**1.** \u0007Identify the stakeholders (business and IT) who own the jobs that run in the Hadoop environment.\n", + "\n", + "**2.** \u0007Declare that no changes can be made to the Hadoop environment — with the exception of emergency\n", + "\n", + "fixes or absolutely critical new business use cases.\n", + "\n", + "**3.** \u0007Inventory the data flow paths that feed data into the Hadoop environment.\n", + "\n", + "**4.** \u0007Identify the source systems that feed the data.\n", + "\n", + "**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n", + "\n", + "**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n", + "\n", + "**7.** \u0007Determine the downstream consumers of the output from the jobs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n", + "\n", + "**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n", + "\n", + "architecture.\n", + "\n", + "**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n", + "\n", + "working smoothly.\n", + "\n", + "**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. Look for smoke.\n", + "\n", + "**12.** \u0007Rinse and repeat — until all jobs are migrated.\n", + "\n", + "**13.** \u0007Shut down the Hadoop cluster.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
62110994a64bb010e27d3ddcc1b3a3d6**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n", + "\n", + "**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n", + "\n", + "**7.** \u0007Determine the downstream consumers of the output from the jobs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n", + "\n", + "**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n", + "\n", + "architecture.\n", + "\n", + "**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n", + "\n", + "working smoothly.\n", + "\n", + "**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. Look for smoke.\n", + "\n", + "**12.** \u0007Rinse and repeat — until all jobs are migrated.\n", + "\n", + "**13.** \u0007Shut down the Hadoop cluster.\n", + "\n", + "A similar model can also be applied to legacy on-premises enterprise data warehouses.\n", + "\n", + "You can follow the same process for other legacy systems in your environment. Some of these systems\n", + "\n", + "may be more complex and require the participation of more stakeholders to identify the fastest way to\n", + "\n", + "rationalize the data and processes. It is important, however, to make sure that the organization has the\n", + "\n", + "fortitude to hold the line when there is pressure to make changes to the legacy environments or extend\n", + "\n", + "their lifespan. Setting firm dates for when these legacy systems will be retired will serve as a forcing function\n", + "\n", + "for teams when they onboard to the new modern data architecture. Having the executive buy-in from page\n", + "\n", + "9 plays a crucial role in seeing the shutdown of legacy platforms through.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 2. Identify and prioritize use cases\n", + "\n", + "An important next step in enabling data, analytics and AI to transform your business is to identify use cases\n", + "\n", + "that drive business value — while prioritizing the ones that are achievable under the current conditions\n", + "\n", + "(people, processes, data and infrastructure). There are typically hundreds of use cases within an organization\n", + "\n", + "that could benefit from better data and AI — but not all use cases are of equal importance or feasibility.\n", + "\n", + "Leaders require a systematic approach for identifying, evaluating, prioritizing and implementing use cases.\n", + "\n", + "**Establish the list of potential use cases**\n", + "\n", + "The first step is to ideate by bringing together various stakeholders from across the organization and\n", + "\n", + "understand the overall business drivers — especially those that are monitored by the CEO and board of\n", + "\n", + "directors. The second step is to identify use case opportunities in collaboration with business stakeholders,\n", + "\n", + "and understand the business processes and the data required to implement the use case. After steps one and\n", + "\n", + "two, the next step is to prioritize these cases by calculating the expected ROI. 
To avoid this becoming a pet\n", + "\n", + "project within the data/IT teams, it’s important to have a line of business champion at the executive level.\n", + "\n", + "There needs to be a balance between use cases that are complex and ones that are considered low-\n", + "\n", + "hanging fruit. For example, determining if a web visitor is an existing or net new customer requires a fairly\n", + "\n", + "straightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n", + "\n", + "given individual or household. However, developing a sophisticated credit card fraud model that takes into\n", + "\n", + "account geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n", + "\n", + "to perform the analytics.\n", + "\n", + "In terms of performance, thought should be given to the speed at which the use case must execute. In\n", + "\n", + "general, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n", + "\n", + "cases into three categories:\n", + "\n", + "**1.** Sub-second response\n", + "\n", + "**2.** Multi-second response\n", + "\n", + "**3.** Multi-minute response\n", + "\n", + "\n", + "-----\n", + "\n", + "Being pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n", + "\n", + "engineering the design and infrastructure.\n", + "\n", + "**Thinking in terms of “data assets”**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
6e20bb6a8fb31697144f9de8e058686dstraightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n", + "\n", + "given individual or household. However, developing a sophisticated credit card fraud model that takes into\n", + "\n", + "account geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n", + "\n", + "to perform the analytics.\n", + "\n", + "In terms of performance, thought should be given to the speed at which the use case must execute. In\n", + "\n", + "general, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n", + "\n", + "cases into three categories:\n", + "\n", + "**1.** Sub-second response\n", + "\n", + "**2.** Multi-second response\n", + "\n", + "**3.** Multi-minute response\n", + "\n", + "\n", + "-----\n", + "\n", + "Being pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n", + "\n", + "engineering the design and infrastructure.\n", + "\n", + "**Thinking in terms of “data assets”**\n", + "\n", + "Machine learning algorithms require data — data that is readily available, of high quality and relevant — to\n", + "\n", + "perform the experiments, train the models, and then execute the model when it is deployed to production.\n", + "\n", + "The quality and veracity of the data used to perform these machine learning steps are key to deploying\n", + "\n", + "models into production that produce a tangible ROI.\n", + "\n", + "It is critical to understand what steps are needed in order to make the data available for a given use case.\n", + "\n", + "One point to consider is to prioritize use cases that make use of similar or adjacent data. If your engineering\n", + "\n", + "teams need to perform work to make data available for one use case, then look for opportunities to have the\n", + "\n", + "engineers do incremental work in order to surface data for adjacent use cases.\n", + "\n", + "Mature data and AI companies embrace the concept of “data assets” or “data products” to indicate\n", + "\n", + "the importance of adopting a design strategy and data asset roadmap for the organization. Taking this\n", + "\n", + "approach helps stakeholders avoid fit-for-purpose data sets that drive only a single use case — and raise\n", + "\n", + "the level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n", + "\n", + "roadmap helps data source owners understand the priority and complexity of the data assets that need to\n", + "\n", + "be created. Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n", + "\n", + "influences the design of business applications and other systems within the organization.\n", + "\n", + "**Determine the highest impact/priority**\n", + "\n", + "As shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n", + "\n", + "account three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n", + "\n", + "whether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n", + "\n", + "reduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n", + "\n", + "data science talent readily available, to implement the use case. 
The ROI score indicates whether or not the\n", + "\n", + "organization can easily measure the impact to the P/L.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
0227c134834456af92ed318a9952e270the level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n", + "\n", + "roadmap helps data source owners understand the priority and complexity of the data assets that need to\n", + "\n", + "be created. Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n", + "\n", + "influences the design of business applications and other systems within the organization.\n", + "\n", + "**Determine the highest impact/priority**\n", + "\n", + "As shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n", + "\n", + "account three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n", + "\n", + "whether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n", + "\n", + "reduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n", + "\n", + "data science talent readily available, to implement the use case. The ROI score indicates whether or not the\n", + "\n", + "organization can easily measure the impact to the P/L.\n", + "\n", + "\n", + "-----\n", + "\n", + "|= Scored by business stakeholders = Scored by technology stakeholders|Col2|SCORING GUIDELINES (RELATIVE SCORING)|Col4|Col5|\n", + "|---|---|---|---|---|\n", + "|||1 = LOW SCORE, DO LATER|5 = AVERAGE, NICE TO HAVE|10 = HIGH, MUST HAVE|\n", + "|Strategic Importance Score How important is it to business success?|Business Alignment|Not required for any corporate goals|Not required for immediate corporate goals|Required for immediate corporate goals|\n", + "||Business Driver|Does not drive growth/profitability (P&L) or competitiveness|Could drive some growth/profitability (P&L)|Significantly drives growth/profitability (P&L) and competitiveness|\n", + "||IT Foundation|No BI/IT dependencies|BI/IT best practice|BI/IT foundational element|\n", + "|Feasibility Score What is the current data and AI readiness?|Data Access and Trust Adjusting Based on Availability|Low awareness of available data (internal and external) or the problems it can solve|Some ingestion and exploration of large-scale data is possible|Large-scale data is available for exploration in the cloud|\n", + "||Delivery (Data Engineers, Data Scientists, Data Analysts)|Limited in-house resources|Hiring plan for data science and engineering resources, few available in-house|Scaled data science, engineering, cloud and deployment organization|\n", + "||Architecture|Current thinking on architecture resembles on-prem traditional data warehousing solution with batch processes rather than a data lakehouse approach|Architecture has been built and tested, some use cases are underway with multiple data sources now available in the cloud|The platform is utilized at scale across the business and is able to evolve to meet the demands of new business lines and services driven by data|\n", + "|ROI Score How tangible and large is the ROI?|ROI Potential|Mostly productivity gains, “soft intangible benefits”|Some P&L impact, not easily tangible|Significant P&L impact, “hard measured benefits”|\n", + "\n", + "\n", + "**Figure 5:**\n", + "Methodology for scoring use cases\n", + "**Ensure business and technology leadership alignment**\n", + "\n", + "Prioritizing use cases requires striking a balance between offensive- and defensive-oriented use cases.\n", + "\n", + "It is important for executives to evaluate use 
cases in terms of opportunity growth (offensive) and risk\n", + "\n", + "reduction (defensive). For example, data governance and compliance use cases should take priority\n", + "\n", + "over offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n", + "\n", + "acquisition of a new customer.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Professional Services team can\n", + "\n", + "help customers identify revenue-generating and\n", + "\n", + "cost-saving opportunities for data and AI use cases\n", + "\n", + "that provide a significant ROI when adopting the\n", + "\n", + "\n", + "#### 3. Build successful data teams\n", + "\n", + "In order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n", + "\n", + "performing teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n", + "\n", + "training and leadership. Digital transformations require executive-level support and are likely to fail without\n", + "\n", + "it — especially in large organizations.\n", + "\n", + "However, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n", + "\n", + "an enterprise level. In other words, they must also evolve their company culture into one that embraces data,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
69c4fe9bf7ab670ec15a30cf31ea26f5over offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n", + "\n", + "acquisition of a new customer.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Professional Services team can\n", + "\n", + "help customers identify revenue-generating and\n", + "\n", + "cost-saving opportunities for data and AI use cases\n", + "\n", + "that provide a significant ROI when adopting the\n", + "\n", + "\n", + "#### 3. Build successful data teams\n", + "\n", + "In order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n", + "\n", + "performing teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n", + "\n", + "training and leadership. Digital transformations require executive-level support and are likely to fail without\n", + "\n", + "it — especially in large organizations.\n", + "\n", + "However, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n", + "\n", + "an enterprise level. In other words, they must also evolve their company culture into one that embraces data,\n", + "\n", + "data literacy, collaboration, experimentation and agile principles. We define these companies as “data native.”\n", + "\n", + "\n", + "lakehouse architecture.\n", + "\n", + "**Chief information officers and chief data officers — two sides of the data coin**\n", + "\n", + "Data native companies generally have a single, accountable executive who is responsible for areas such\n", + "\n", + "as data science, business analytics, data strategy, data governance and data management. The data\n", + "\n", + "management aspects include registering data sets in a data catalog, tracing data lineage as data sets flow\n", + "\n", + "through the environment, performing data quality checks and scanning for sensitive data in the clear.\n", + "\n", + "Many organizations are rapidly adding the chief data officer (CDO) role to their executive ranks in order\n", + "\n", + "to oversee and manage these responsibilities. The CDO works closely with CIOs and other business\n", + "\n", + "stakeholders to establish the overall project plan, design and implementation — and to align project\n", + "\n", + "management, product management, business analysis, data engineering, data scientist and machine\n", + "\n", + "learning talent.\n", + "\n", + "The CDO and CIO will need to build a broad coalition of support from stakeholders who are incentivized to\n", + "\n", + "make the transformation a success and help drive organization-wide adoption. To do this, the stakeholders\n", + "\n", + "must understand the benefits of — and their role and responsibilities in — supporting the initiative.\n", + "\n", + "\n", + "-----\n", + "\n", + "There are two organizational constructs that are found in most successful data native companies. The first is\n", + "\n", + "the creation of an _AI/ML center of excellence_ (COE) that is designed to establish in-house expertise around\n", + "\n", + "ML and AI, and which is then used to educate the rest of the organization on best practices. 
The second is\n", + "\n", + "the formation of a _data and AI transformation steering committee_ that will oversee and guide decisions and\n", + "\n", + "priorities for the transformative data, analytics and AI initiatives, plus help remove obstacles.\n", + "\n", + "Furthermore, CDOs need to bring their CIOs along early in the journey.\n", + "\n", + "**Creating an AI/ML COE**\n", + "\n", + "Data science is a fast-evolving discipline with an ever-growing set of frameworks and algorithms to enable\n", + "\n", + "everything from statistical analysis to supervised learning to deep learning using neural networks. While it is\n", + "\n", + "difficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n", + "\n", + "document, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n", + "\n", + "However, the general distinction is that data science is used to produce insights, machine learning is used to\n", + "\n", + "produce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n", + "\n", + "is expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n", + "\n", + "various data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n", + "\n", + "set of questions.\n", + "\n", + "Organizations wanting to build a data science competency should consider hiring talent into a centralized\n", + "\n", + "organization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n", + "\n", + "data science. The COE works with the rest of the organization to educate and promote the appropriate use\n", + "\n", + "of data science for various use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "A common approach is to have the COE report into the CDO, but still have data scientists dotted line intoSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
2cd6d562ff9fe9da4014a21f3f129fd5difficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n", + "\n", + "document, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n", + "\n", + "However, the general distinction is that data science is used to produce insights, machine learning is used to\n", + "\n", + "produce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n", + "\n", + "is expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n", + "\n", + "various data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n", + "\n", + "set of questions.\n", + "\n", + "Organizations wanting to build a data science competency should consider hiring talent into a centralized\n", + "\n", + "organization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n", + "\n", + "data science. The COE works with the rest of the organization to educate and promote the appropriate use\n", + "\n", + "of data science for various use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "A common approach is to have the COE report into the CDO, but still have data scientists dotted line into\n", + "\n", + "the business units or department. Using this approach, you achieve two goals:\n", + "\n", + "\u0007The data scientists are closer to the business stakeholders, have a better understanding of the data\n", + "\n", + "within a business unit and can help identify use cases that drive value\n", + "\n", + "\u0007Having the data scientists reporting into the CDO provides a structure that encourages collaboration\n", + "\n", + "and consistency in how work is performed among the cohort and brings that to the entire organization\n", + "\n", + "**Data and AI transformation steering committee**\n", + "\n", + "The purpose of the steering committee is to provide governance and guidance to the data transformation\n", + "\n", + "initiative. The CDO and CIO should co-chair the committee along with one business executive who can be\n", + "\n", + "a vocal advocate and help drive adoption. The level of executive engagement is critical to success of the\n", + "\n", + "initiative.\n", + "\n", + "The steering committee should meet regularly with leaders from across the organization to hear status\n", + "\n", + "reports and resolve any conflicts and remove obstacles, if possible. 
The leaders should represent a broad\n", + "\n", + "group of stakeholders, including:\n", + "\n", + "\u0007\n", + "**Program/project management:** To report the status of progress for deploying the new data\n", + "\n", + "ecosystem and driving adoption through use cases\n", + "\n", + "\u0007\n", + "**Business partners:** To provide insight and feedback on how easy or difficult it is to drive adoption\n", + "\n", + "of the platform\n", + "\n", + "\u0007\n", + "**Engineering:** To report the status of the implementation and what technology trade-offs need\n", + "\n", + "to be made\n", + "\n", + "\u0007\n", + "**Data science:** To report on the progress made by the COE on educating the organization about\n", + "\n", + "use cases for ML and to report the status of various implementations\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007\n", + "**InfoSec:** To review the overall security, including network, storage, application and data\n", + "\n", + "encryption and tokenization\n", + "\n", + "\u0007\n", + "**Architecture:** To oversee that the implementation adheres to architectural standards\n", + "\n", + "and guardrails\n", + "\n", + "\u0007\n", + "**Risk, compliance and legal:** To oversee the approach to data governance\n", + "\n", + "and ethics in ML\n", + "\n", + "\u0007\n", + "**User experience:** To serve as the voice of the end users who will perform their jobs using\n", + "\n", + "the new data ecosystem\n", + "\n", + "\u0007\n", + "**Communication:** To provide up-to-date communications to the organization about next\n", + "\n", + "steps and how to drive adoption\n", + "\n", + "**Partnering with architecture and InfoSec**\n", + "\n", + "Early on, the CDO and CIO should engage the engineering and architecture community within the\n", + "\n", + "organization to ensure that everyone understands the technical implications of the overall strategy. This\n", + "\n", + "minimizes the chances that the engineering teams will build separate and competing data platforms. In\n", + "\n", + "regulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n", + "\n", + "The EA is responsible for validating that the overall technology design and data management features\n", + "\n", + "support the performance and regulatory compliance requirements — specifically, whether the proposed\n", + "\n", + "design can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n", + "\n", + "variety and veracity (four Vs) of the data environment.\n", + "\n", + "\n", + "It is important to fully understand which\n", + "\n", + "environments and accounts your data is stored\n", + "\n", + "in. The goal is to minimize the number of copies ofSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
8a601498740072e4936b24cddac2e38f\u0007\n", + "**Communication:** To provide up-to-date communications to the organization about next\n", + "\n", + "steps and how to drive adoption\n", + "\n", + "**Partnering with architecture and InfoSec**\n", + "\n", + "Early on, the CDO and CIO should engage the engineering and architecture community within the\n", + "\n", + "organization to ensure that everyone understands the technical implications of the overall strategy. This\n", + "\n", + "minimizes the chances that the engineering teams will build separate and competing data platforms. In\n", + "\n", + "regulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n", + "\n", + "The EA is responsible for validating that the overall technology design and data management features\n", + "\n", + "support the performance and regulatory compliance requirements — specifically, whether the proposed\n", + "\n", + "design can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n", + "\n", + "variety and veracity (four Vs) of the data environment.\n", + "\n", + "\n", + "It is important to fully understand which\n", + "\n", + "environments and accounts your data is stored\n", + "\n", + "in. The goal is to minimize the number of copies of\n", + "\n", + "your data and to keep the data within your cloud\n", + "\n", + "account — and not the vendor’s.\n", + "\n", + "Make sure the architecture and security model for\n", + "\n", + "protecting data is well understood.\n", + "\n", + "\n", + "-----\n", + "\n", + "From an InfoSec perspective, the CDO must work to ensure that the proper controls and security are\n", + "\n", + "applied to the new data ecosystem and that the authentication, authorization and access control methods\n", + "\n", + "meet all the data governance requirements. An industry best practice is to enable self-service registration\n", + "\n", + "of data sets, by the data owner, and support the assignment of security groups or roles to help automate\n", + "\n", + "the access control process. This allows data sets to be accessible only to the personnel that belong to a\n", + "\n", + "given group. The group membership could be based primarily on job function or role within the organization.\n", + "\n", + "This approach provides fast onboarding of new employees, but caution should be taken not to proliferate\n", + "\n", + "too many access control groups — in other words, do not get too fine grained with group permissions, as\n", + "\n", + "they will become increasingly difficult to manage. A better strategy is to be more coarse-grained and use\n", + "\n", + "row- and column-level security sparingly.\n", + "\n", + "**Centralized vs. federated labor strategy**\n", + "\n", + "In most organizations today, managers work in silos, making decisions with the best intentions but focused\n", + "\n", + "on their own functional areas. The primary risk to the status quo is that there will be multiple competing and\n", + "\n", + "conflicting approaches to creating enterprise data and AI platforms. This duplication of effort will waste time\n", + "\n", + "and money and potentially erode the confidence and motivation of the various teams. 
While it certainly is\n", + "\n", + "beneficial to compare and contrast different approaches to implementing an architecture, the approaches\n", + "\n", + "should be strictly managed, with everyone designing for the same goals and requirements — as described in\n", + "\n", + "this strategy document and adhering to the architectural principles and best practices.\n", + "\n", + "Even still, the roles of the CDO and CIO together should deliver a data analytics and AI platform with the\n", + "\n", + "least amount of complexity as possible, and one that can easily scale across the organization. It is very\n", + "\n", + "challenging to merge disparate data platform efforts into a single, cohesive design. It is best to get out\n", + "\n", + "in front of this wave of innovation and take input from the various teams to create a single, centralized\n", + "\n", + "platform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n", + "\n", + "modern data stack — while ensuring that there is no duplication of effort when implementing the platform\n", + "\n", + "components. Figure 6 shows one possible structure.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 6:**\n", + "Centralized teams with matrixed responsibilities\n", + "\n", + "\n", + "**Data Scientist**\n", + "Model and predict with data\n", + "\n", + "**Data Analyst**\n", + "Visualize and describe data\n", + "\n", + "\n", + "**Team A ($1.1M)** **Team B ($1.3M)** **Team C ($1.5M)**\n", + "\n", + "**Data Engineer**\n", + "Store, process, maintain data\n", + "\n", + "**Business Partners**\n", + "**and Domain Experts**\n", + "\n", + "\n", + "Centralize data scientists under CDO — embed in lines of business for day-to-day taskingSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
46976d7e483261a09e448b88ed2dab97in front of this wave of innovation and take input from the various teams to create a single, centralized\n", + "\n", + "platform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n", + "\n", + "modern data stack — while ensuring that there is no duplication of effort when implementing the platform\n", + "\n", + "components. Figure 6 shows one possible structure.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 6:**\n", + "Centralized teams with matrixed responsibilities\n", + "\n", + "\n", + "**Data Scientist**\n", + "Model and predict with data\n", + "\n", + "**Data Analyst**\n", + "Visualize and describe data\n", + "\n", + "\n", + "**Team A ($1.1M)** **Team B ($1.3M)** **Team C ($1.5M)**\n", + "\n", + "**Data Engineer**\n", + "Store, process, maintain data\n", + "\n", + "**Business Partners**\n", + "**and Domain Experts**\n", + "\n", + "\n", + "Centralize data scientists under CDO — embed in lines of business for day-to-day tasking\n", + "\n", + "Centralize data engineers under CIO/CTO — initially as an enterprise function\n", + "\n", + "**Hiring, training and upskilling your talent**\n", + "\n", + "While this guide does not cover recruiting strategies, it is important to note that data engineering and data\n", + "\n", + "science talent is very difficult to find in this competitive market. As a result, every organization should\n", + "\n", + "consider what training and upskilling opportunities exist for their current staff. A large number of online\n", + "\n", + "courses, at relatively low cost, teach the fundamentals of data science and AI. It will still be important to\n", + "\n", + "augment your existing staff with experienced data scientists and machine learning experts. You will then\n", + "\n", + "need to establish clear training paths, resources and timelines to upskill your talent.\n", + "\n", + "Using the COE construct, it is easier to upskill a mix of data science talent by having the experts mentor the\n", + "\n", + "less experienced staff. The majority of Ph.D.-level talent comes from academia and has a vested interest\n", + "\n", + "in educating others. It’s important to set up the structure and allow time in the schedule for knowledge\n", + "\n", + "transfer, experimentation and a safe environment in which to fail. A key aspect in accelerating the\n", + "\n", + "experience of your talent is to enable data science using production-like data and creating a collaborative\n", + "\n", + "environment for code sharing.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks training, [documentation](https://docs.databricks.com) and\n", + "\n", + "[certification](https://databricks.com/learn/certification) available to customers is industry-\n", + "\n", + "leading, and our [Solution Accelerators](https://databricks.com/solutions/accelerators) provide\n", + "\n", + "\n", + "#### 4. Deploy a modern data stack\n", + "\n", + "The modern data architecture can most easily be described as the evolution of the enterprise data\n", + "\n", + "warehouse (EDW) from the 1980s and the Hadoop-style data lakes from the mid-2000s. 
The capabilities,\n", + "\n", + "limitations and lessons learned from working with these two legacy data architectures inspired the next\n", + "\n", + "generation of data architecture — what the industry now refers to as the lakehouse.\n", + "\n", + "Figure 7 shows how the architectures have evolved as networking, storage, memory and CPU performance\n", + "\n", + "have improved over time.\n", + "\n", + "\n", + "exemplar code for organizations to hit the ground\n", + "\n", + "running with data and AI.\n", + "\n", + "**Figure 7:**\n", + "A brief history of data architectures\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving beyond the enterprise data warehouse and data lake**\n", + "\n", + "The EDW provided organizations with the ability to easily load structured and semi-structured data into\n", + "\n", + "well-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n", + "\n", + "(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n", + "\n", + "the business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n", + "\n", + "catalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n", + "\n", + "users — while still being performant. The EDW served its purpose for decades. However, most of the recent\n", + "\n", + "advances in AI have been in better models to process unstructured data (text, images, video, audio), butSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
170152bbd1e1dfc694f4a48a8e767c4chave improved over time.\n", + "\n", + "\n", + "exemplar code for organizations to hit the ground\n", + "\n", + "running with data and AI.\n", + "\n", + "**Figure 7:**\n", + "A brief history of data architectures\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving beyond the enterprise data warehouse and data lake**\n", + "\n", + "The EDW provided organizations with the ability to easily load structured and semi-structured data into\n", + "\n", + "well-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n", + "\n", + "(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n", + "\n", + "the business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n", + "\n", + "catalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n", + "\n", + "users — while still being performant. The EDW served its purpose for decades. However, most of the recent\n", + "\n", + "advances in AI have been in better models to process unstructured data (text, images, video, audio), but\n", + "\n", + "these are precisely the types of data that an EDW is not optimized for.\n", + "\n", + "Therefore, in the mid-2000s, organizations wanted to take advantage of new data sets — _ones that_\n", + "\n", + "_contained unstructured data_ — and apply new analytics — _ones that leveraged emerging data science_\n", + "\n", + "_algorithms_ . In order to accomplish this, massive investments in on-premises data lakes occurred — most\n", + "\n", + "often leveraging Apache Hadoop and its distributed file system, known as HDFS, running on low-cost,\n", + "\n", + "commodity hardware. The Hadoop-style data lake provided the separation of compute from storage that\n", + "\n", + "organizations were seeking — thus eliminating the risk of vendor lock-in and opening the doors to a wide\n", + "\n", + "range of new analytics. Despite all these benefits, the architecture proved to be difficult to use, with a\n", + "\n", + "complex programming model known as MapReduce, and the performance fell short of the majority of real-\n", + "\n", + "time use cases.\n", + "\n", + "Over time, Hadoop workloads were often migrated to Apache Spark™ workloads, which run 100x faster by\n", + "\n", + "processing data in-memory across a cluster — with the ability to massively scale. The Spark programming\n", + "\n", + "model was also simpler to use and provided a consistent set of application programming interfaces (APIs)\n", + "\n", + "for languages such as Python, SQL, R, Java and Scala. Spark was the first major step in separating compute\n", + "\n", + "from storage and providing the scale needed for distributed workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "A data lakehouse combines the best of data\n", + "\n", + "\n", + "**Cloud-based data lakes**\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud object stores like\n", + "\n", + "Amazon S3 and Azure Data Lake Storage (ADLS) have become some of the largest, most cost-effective\n", + "\n", + "storage systems in the world — which make them an attractive platform to serve as the next generation\n", + "\n", + "of data lakes. 
Object stores excel at massively parallel reads — an essential requirement for modern data\n", + "\n", + "warehouses.\n", + "\n", + "\n", + "lakes and data warehouses, enabling BI and ML\n", + "\n", + "\n", + "However, data lakes lack some critical features: They do not support transactions, they do not enforce\n", + "\n", + "data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "and batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n", + "\n", + "example, efficiently listing the millions of files (objects) that make up most large data lakes.\n", + "\n", + "**Lakehouse — the modern data architecture**\n", + "\n", + "What if it were possible to combine the best of both worlds? The performance, concurrency and data\n", + "\n", + "management of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n", + "\n", + "the target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n", + "\n", + "the complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\n", + "\n", + "of this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n", + "\n", + "architecture possible.\n", + "\n", + "\n", + "on all data on a simple, open and multicloud\n", + "\n", + "modern data stack.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
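To make the data quality gap described above concrete: in a lakehouse, quality rules are enforced at write time instead of being discovered after bad records have already landed in the lake. The sketch below is illustrative only and is not part of the source whitepaper; it assumes a Databricks notebook where `spark` is already defined and a hypothetical Delta table named `main.demo.events`.

```python
# Illustrative sketch of "enforce data quality" in a lakehouse.
# The table name main.demo.events is a hypothetical placeholder.

# Delta enforces the table schema on write: appending a DataFrame whose columns
# do not match the table raises an error instead of silently corrupting the data.
good = spark.createDataFrame(
    [(1, "click", "2024-01-05")],
    "id INT, action STRING, event_date STRING",
)
good.write.format("delta").mode("append").saveAsTable("main.demo.events")

# Declarative data quality: a CHECK constraint that every future write must satisfy.
spark.sql("""
    ALTER TABLE main.demo.events
    ADD CONSTRAINT non_empty_action CHECK (action IS NOT NULL AND action != '')
""")
```

Because writes are transactional, a write that fails schema or constraint checks leaves no partial data behind for readers, which is the behavior a plain object-store data lake cannot guarantee on its own.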
a6c4aa57b347d46b3d74ce86a7176024data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "and batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n", + "\n", + "example, efficiently listing the millions of files (objects) that make up most large data lakes.\n", + "\n", + "**Lakehouse — the modern data architecture**\n", + "\n", + "What if it were possible to combine the best of both worlds? The performance, concurrency and data\n", + "\n", + "management of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n", + "\n", + "the target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n", + "\n", + "the complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\n", + "\n", + "of this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n", + "\n", + "architecture possible.\n", + "\n", + "\n", + "on all data on a simple, open and multicloud\n", + "\n", + "modern data stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Exploratory Data Scientist**\n", + "\n", + "\n", + "**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\n", + "\n", + "\n", + "**Curated Data Lake**\n", + "\n", + "\n", + "**Raw Data Ingest**\n", + "“Bronze”\n", + "\n", + "\n", + "**Filtered/Cleaned/Augmented**\n", + "“Silver”\n", + "\n", + "\n", + "**Business-Level Aggregates**\n", + "“Gold”\n", + "\n", + "\n", + "**D ATA Q U A L I T Y**\n", + "\n", + "**Data Sources (Batch and Real-Time)**\n", + "\n", + "\n", + "**Unstructured**\n", + "\n", + "- Image, Video, Audio\n", + "\n", + "- Free Text, Blob\n", + "\n", + "\n", + "**Semi-Structured**\n", + "\n", + "- Logs, Clickstream\n", + "\n", + "- CSV, JSON, XML\n", + "\n", + "\n", + "**Structured**\n", + "\n", + "- Systems of Record\n", + "\n", + "- Operational DBs\n", + "\n", + "\n", + "**Figure 8:**\n", + "The building blocks for a modern data architecture\n", + "\n", + "The lakehouse architecture provides a flexible, high-performance design for diverse data applications,\n", + "\n", + "including real-time streaming, batch processing, data warehousing, data science and machine learning. This\n", + "\n", + "target-state architecture supports loading all the data types that might be interesting to an organization —\n", + "\n", + "structured, semi-structured and unstructured — and provides a single processing layer, using consistent\n", + "\n", + "APIs across programming languages, to curate data while applying rigorous data management techniques.\n", + "\n", + "The move toward a single, consistent approach to data pipelining and refinement saves organizations\n", + "\n", + "time, money and duplication of effort. 
Data arrives in a landing zone and is then moved through a series of\n", + "\n", + "curation and refinement steps resulting in highly consumable and trusted data for downstream use cases.\n", + "\n", + "The architecture makes possible the efficient creation of “data assets” for the organization by taking a\n", + "\n", + "stepwise approach to improving data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Lakehouse key features**\n", + "\n", + "To effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\n", + "\n", + "available for stakeholders to run business-critical production workloads:\n", + "\n", + "\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\n", + "\n", + "management with declarative pipeline development, automatic data testing and deep visibility for\n", + "\n", + "monitoring and recovery.\n", + "\n", + "\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n", + "\n", + "data concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n", + "\n", + "read or write data, typically using SQL.\n", + "\n", + "\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n", + "\n", + "and evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\n", + "\n", + "be able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n", + "\n", + "lakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n", + "\n", + "to unify data and AI assets by centrally sharing, auditing, securing and managing structured andSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
0ec047404a66ef05632a43b6b58c06efmanagement with declarative pipeline development, automatic data testing and deep visibility for\n", + "\n", + "monitoring and recovery.\n", + "\n", + "\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n", + "\n", + "data concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n", + "\n", + "read or write data, typically using SQL.\n", + "\n", + "\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n", + "\n", + "and evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\n", + "\n", + "be able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n", + "\n", + "lakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n", + "\n", + "to unify data and AI assets by centrally sharing, auditing, securing and managing structured and\n", + "\n", + "unstructured data like tables, files, models and dashboards in concert with existing data, storage and\n", + "\n", + "catalogs.\n", + "\n", + "\u0007 **Storage is decoupled from compute:** In practice this means storage and compute use separate\n", + "\n", + "clusters, thus these systems are able to scale to many more concurrent users and larger data sizes.\n", + "\n", + "Some modern data warehouses also have this property.\n", + "\n", + "\u0007 **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide\n", + "\n", + "an API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently\n", + "\n", + "access the data directly.\n", + "\n", + "\n", + "Databricks released Delta Lake to the open source\n", + "\n", + "community in 2019. Delta Lake provides all the data\n", + "\n", + "lifecycle management functions that are needed\n", + "\n", + "to make cloud-based object stores reliable and\n", + "\n", + "performant. This design allows clients to update\n", + "\n", + "multiple objects at once, replace a subset of\n", + "\n", + "the objects with another, etc., in a serializable\n", + "\n", + "manner that still achieves high parallel read/write\n", + "\n", + "performance from the objects — while offering\n", + "\n", + "advanced capabilities like time travel (e.g., query\n", + "\n", + "point-in-time snapshots or rollback of erroneous\n", + "\n", + "updates), automatic data layout optimization,\n", + "\n", + "upserts, caching and audit logs.\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007 **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be\n", + "\n", + "used to store, refine, analyze and access data types needed for many new data applications, including\n", + "\n", + "images, video, audio, semi-structured data and text.\n", + "\n", + "\u0007 **Support for diverse workloads:** This includes data science, machine learning, SQL and analytics. Multiple\n", + "\n", + "tools might be needed to support all these workloads, but they all rely on the same data repository.\n", + "\n", + "\u0007 **End-to-end streaming:** Real-time reports are the norm in many enterprises. 
Support for streaming\n", + "\n", + "eliminates the need for separate systems dedicated to serving real-time data applications.\n", + "\n", + "\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. This reduces staleness,\n", + "\n", + "improves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n", + "\n", + "data in both a data lake and a warehouse.\n", + "\n", + "\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n", + "\n", + "governance experience across all clouds. You don’t need to invest in reinventing processes for every\n", + "\n", + "cloud platform that you’re using to support your data and AI efforts. Instead, your data teams can simply\n", + "\n", + "focus on putting all your data to work to discover new insights and create business value.\n", + "\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "\n", + "Data\n", + "Warehousing\n", + "\n", + "\n", + "Data\n", + "Engineering\n", + "\n", + "\n", + "Data\n", + "Streaming\n", + "\n", + "\n", + "Data Science\n", + "and ML\n", + "\n", + "\n", + "Unity Catalog\n", + "Fine-grained governance for data and AI\n", + "\n", + "Delta Lake\n", + "Data reliability and performance\n", + "\n", + "Cloud Data Lake\n", + "All structured and unstructured dataSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
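The ACID transaction, upsert and time travel capabilities listed above have a direct SQL surface in Delta Lake. The following sketch is illustrative rather than part of the whitepaper; it assumes hypothetical tables `main.sales.orders` and `main.sales.orders_updates` and a Databricks environment where `spark` is available.

```python
# Minimal sketch of the Delta Lake capabilities described above (ACID upserts,
# time travel, rollback). Table names are hypothetical placeholders.

# ACID upsert: merge late-arriving updates into the target table atomically.
spark.sql("""
    MERGE INTO main.sales.orders AS t
    USING main.sales.orders_updates AS s
    ON t.order_id = s.order_id
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
""")

# Time travel: query the table as of an earlier version, e.g., to inspect
# the state before an erroneous update.
previous = spark.sql("SELECT * FROM main.sales.orders VERSION AS OF 12")
previous.show(5)

# Roll back erroneous changes by restoring a prior version.
spark.sql("RESTORE TABLE main.sales.orders TO VERSION AS OF 12")
```

Because the table is stored in an open format (Parquet data files plus a transaction log), other engines and libraries can read the same data directly, which is the openness property described earlier.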
907a8e9c378b4107ab4c12c53e599cb4\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. This reduces staleness,\n", + "\n", + "improves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n", + "\n", + "data in both a data lake and a warehouse.\n", + "\n", + "\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n", + "\n", + "governance experience across all clouds. You don’t need to invest in reinventing processes for every\n", + "\n", + "cloud platform that you’re using to support your data and AI efforts. Instead, your data teams can simply\n", + "\n", + "focus on putting all your data to work to discover new insights and create business value.\n", + "\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "\n", + "Data\n", + "Warehousing\n", + "\n", + "\n", + "Data\n", + "Engineering\n", + "\n", + "\n", + "Data\n", + "Streaming\n", + "\n", + "\n", + "Data Science\n", + "and ML\n", + "\n", + "\n", + "Unity Catalog\n", + "Fine-grained governance for data and AI\n", + "\n", + "Delta Lake\n", + "Data reliability and performance\n", + "\n", + "Cloud Data Lake\n", + "All structured and unstructured data\n", + "\n", + "\n", + "**Figure 9:**\n", + "Delta Lake is the open data storage layer that delivers reliability,\n", + "security and performance on your data lake — for both\n", + "streaming and batch operations\n", + "\n", + "\n", + "-----\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools\n", + "\n", + "for security and access control are basic requirements. Data governance capabilities, including auditing,\n", + "\n", + "retention and lineage, have become essential, particularly in light of recent privacy regulations. Tools that\n", + "\n", + "enable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse,\n", + "\n", + "such enterprise features only need to be implemented, tested and administered for a single system.\n", + "\n", + "Databricks is the only cloud-native vendor\n", + "\n", + "\n", + "**Databricks — innovation driving performance**\n", + "\n", + "Advanced analytics and machine learning on unstructured and large-scale data are two of the most\n", + "\n", + "strategic priorities for enterprises today — and the growth of unstructured data is going to increase\n", + "\n", + "exponentially — so it makes sense for CIOs and CDOs to think about positioning their data lake as the\n", + "\n", + "center of their data infrastructure. The main challenge is whether or not it can perform reliably and fast\n", + "\n", + "enough to meet the SLAs of the various workloads — especially SQL-based analytics.\n", + "\n", + "Databricks has focused its engineering efforts on incorporating a wide range of industry-leading software\n", + "\n", + "and hardware improvements in order to implement the first lakehouse solution. Our approach capitalizes\n", + "\n", + "on the computing advances of the Apache Spark framework and the latest networking, storage and CPU\n", + "\n", + "technologies to provide the performance customers need to simplify their architecture. 
These innovations\n", + "\n", + "combine to provide a single architecture that can store and process all the data sets within an organization —\n", + "\n", + "supporting the range of analytics outlined above.\n", + "\n", + "**BI and SQL workloads**\n", + "\n", + "Perhaps the most significant challenge for the lakehouse architecture is the ability to support SQL queries\n", + "\n", + "for star/snowflake schemas in support of BI workloads. Part of the reason EDWs have remained a major\n", + "\n", + "part of the data ecosystem is because they provide low-latency, high-concurrency query support. In order\n", + "\n", + "to compete with the EDW, optimizations must be found within the lakehouse architecture that provide\n", + "\n", + "satisfactory query performance for the majority of BI workloads. Fortunately, advances in query plan, query\n", + "\n", + "execution, statistical analysis of files in the object store, and hardware and software improvements make it\n", + "\n", + "possible to deliver on this promise.\n", + "\n", + "\n", + "to be recognized as a Leader in both\n", + "\n", + "[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n", + "\n", + "**Cloud Database Management Systems** and\n", + "\n", + "**Data Science and Machine Learning Platforms**\n", + "\n", + "\n", + "-----\n", + "\n", + "**A word about the data mesh architecture**\n", + "\n", + "In 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n", + "\n", + "what some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lakeSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
6d08d458f9c7010b81932bc03d5ed771part of the data ecosystem is because they provide low-latency, high-concurrency query support. In order\n", + "\n", + "to compete with the EDW, optimizations must be found within the lakehouse architecture that provide\n", + "\n", + "satisfactory query performance for the majority of BI workloads. Fortunately, advances in query plan, query\n", + "\n", + "execution, statistical analysis of files in the object store, and hardware and software improvements make it\n", + "\n", + "possible to deliver on this promise.\n", + "\n", + "\n", + "to be recognized as a Leader in both\n", + "\n", + "[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n", + "\n", + "**Cloud Database Management Systems** and\n", + "\n", + "**Data Science and Machine Learning Platforms**\n", + "\n", + "\n", + "-----\n", + "\n", + "**A word about the data mesh architecture**\n", + "\n", + "In 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n", + "\n", + "what some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lake\n", + "\n", + "using a series of extract, transform, load (ETL) processes — which unnecessarily adds complexity. The data\n", + "\n", + "mesh approach avoids centralizing data in one location and encourages the source systems to create\n", + "\n", + "“data products” or “data assets” that are served up directly to consumers for data and AI workloads. The\n", + "\n", + "designers advocate for a federated approach to data and AI — while using enterprise policies to govern how\n", + "\n", + "source systems make data assets available.\n", + "\n", + "There are several challenges with this approach. First, the data mesh assumes that each source system\n", + "\n", + "can dynamically scale to meet the demands of the consumers — particularly challenging when data assets\n", + "\n", + "become “hot spots” within the ecosystem. Second, centralized policies oftentimes leave the implementation\n", + "\n", + "details to the individual teams. This has the potential of inconsistent implementations, which may lead to\n", + "\n", + "performance degradations and differing cost profiles. Finally, the data mesh approach assumes that each\n", + "\n", + "source system team has the necessary skills, or can acquire them, to build robust data products.\n", + "\n", + "The lakehouse architecture is not at odds with the data mesh philosophy — as ingesting higher-quality data\n", + "\n", + "from the source systems reduces the curation steps needed inside the data lake itself.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 5. Improve data governance and compliance\n", + "\n", + "Data governance is perhaps the most challenging aspect of data transformation initiatives. Every\n", + "\n", + "stakeholder recognizes the importance of making data readily available, of high quality and relevant to help\n", + "\n", + "drive business value. Likewise, organizations understand the risks of failing to get it right — the potential for\n", + "\n", + "undetected data breaches, negative impact on the brand and the potential for significant fines in regulated\n", + "\n", + "environments. However, organizations shouldn’t perceive data governance or a defensive data strategy as\n", + "\n", + "a blocker or deterrent to business value. 
In fact, many organizations have leveraged their strong stance on\n", + "\n", + "data governance as a competitive differentiator to earn and maintain customer trust, ensure sound data\n", + "\n", + "and privacy practices, and protect their data assets\n", + "\n", + "**Why data governance fails**\n", + "\n", + "While most people agree that data governance is a set of principles, practices and tooling that helps\n", + "\n", + "manage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic\n", + "\n", + "approach — one that balances realistic policies with automation and scalability.\n", + "\n", + "Too often the policies developed around data governance define very strict data management principles —\n", + "\n", + "for example, the development of an enterprise-wide ontological model that all data must adhere to.\n", + "\n", + "Organizations can spend months, if not years, trying to define the perfect set of policies. The engineering\n", + "\n", + "effort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the\n", + "\n", + "complexity of the requirements. Meanwhile, data continues to flow through the organization without a\n", + "\n", + "consistent approach to governance, and too much of the effort is done manually and fraught with human error.\n", + "\n", + "\n", + "What are the basic building blocks of a sound data\n", + "\n", + "governance approach?\n", + "\n", + "\n", + "-----\n", + "\n", + "**A pragmatic approach to data governance**\n", + "\n", + "At a high level, organizations should enable the following data management capabilities:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
3bd334ec290957c54f467da186a0ee7dmanage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic\n", + "\n", + "approach — one that balances realistic policies with automation and scalability.\n", + "\n", + "Too often the policies developed around data governance define very strict data management principles —\n", + "\n", + "for example, the development of an enterprise-wide ontological model that all data must adhere to.\n", + "\n", + "Organizations can spend months, if not years, trying to define the perfect set of policies. The engineering\n", + "\n", + "effort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the\n", + "\n", + "complexity of the requirements. Meanwhile, data continues to flow through the organization without a\n", + "\n", + "consistent approach to governance, and too much of the effort is done manually and fraught with human error.\n", + "\n", + "\n", + "What are the basic building blocks of a sound data\n", + "\n", + "governance approach?\n", + "\n", + "\n", + "-----\n", + "\n", + "**A pragmatic approach to data governance**\n", + "\n", + "At a high level, organizations should enable the following data management capabilities:\n", + "\n", + "**\u0007Identify all sources of data**\n", + "\n", + "\u0007Identify all data-producing and data-storing applications\n", + "\n", + "\u0007Identify the systems of record (SOR) for each data set\n", + "\n", + "\u0007Label data sets as internal or external (third party)\n", + "\n", + "\u0007Identify where sensitive data is stored — GDPR/CCPA scope\n", + "\n", + "\u0007Limit which operational data stores (ODSs) can re-store SOR data\n", + "\n", + "**\u0007Catalog data sets**\n", + "\n", + "\u0007Register all data sets in a centralized data catalog\n", + "\n", + "\u0007Create a lightweight, self-service data registration process\n", + "\n", + "\u0007Limit manual entry as much as possible\n", + "\n", + "\u0007Record the schema, if any, for the data set\n", + "\n", + "\u0007Use an inference engine or tool to extract the data set schema\n", + "\n", + "\u0007Add business and technical metadata to make it meaningful\n", + "\n", + "\u0007Use machine learning to classify data sets\n", + "\n", + "\u0007Use crowdsourcing to validate the machine-based results\n", + "\n", + "**Track data lineage**\n", + "\n", + "\u0007Track data set flow and what systems act on data\n", + "\n", + "\u0007Create an enumerated list of action values for specific operations\n", + "\n", + "\u0007Emit lineage events via streaming layer and aggregate in data lake lineage event schema:\n", + "\n", + "\n", + "\n", + "\u0007Optional: Add a source code repository URL for action traceability\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Perform data quality checks**\n", + "\n", + "\u0007Create a rules library that is centrally managed and versioned\n", + "\n", + "\u0007Update the rules library periodically with new rules\n", + "\n", + "\u0007Use a combination of checks — null/not null, regex, valid values\n", + "\n", + "\u0007Perform schema enforcement checks against data set registration\n", + "\n", + "By minimizing the number of copies of your data\n", + "\n", + "\n", + "**\u0007Scan for sensitive data**\n", + "\n", + "\u0007Establish a tokenization strategy for sensitive data — GDPR/CCPA\n", + "\n", + "\u0007Tokenize all sensitive data stored in the data lake — avoid cleartext\n", + "\n", + "\u0007Use fixed-length tokens to preserve analytic value\n", + "\n", + "\u0007Determine the 
approach for token lookup/resolution when needed\n", + "\n", + "\u0007Ensure that any central token stores are secure with rotating keys\n", + "\n", + "\u0007Identify which data elements from GDPR/CCPA to include in scans\n", + "\n", + "\u0007Efficiently scan for sensitive data in cleartext using the rules library\n", + "\n", + "**\u0007Establish approved data flow patterns**\n", + "\n", + "\u0007Determine pathways for data flow (source —> target)\n", + "\n", + "\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n", + "\n", + "\u0007Determine read/write patterns for the data lake\n", + "\n", + "\u0007Strictly enforce data flow pathways to/from data lake\n", + "\n", + "\u0007Detect violations and anomalies using lineage event analysis\n", + "\n", + "\u0007Identify offending systems and shut down or grant exception\n", + "\n", + "\u0007Record data flow exceptions and set a remediation deadline\n", + "\n", + "**\u0007Centralize data access controls**\n", + "\n", + "\u0007Establish a common governance model for all data and AI assets\n", + "\n", + "\u0007Centrally define access policies for all data and AI assets\n", + "\n", + "\u0007Enable fine-grained access controls at row and column levels\n", + "\n", + "\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n", + "\n", + "\n", + "and moving to a single data processing layer where\n", + "\n", + "all your data governance controls can run together,\n", + "\n", + "you improve your chances of staying in complianceSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
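As an illustration of the "centralize data access controls" items above, the sketch below uses Unity Catalog style SQL issued from a notebook. It is not part of the source whitepaper; the catalog, schema, table, group and function names are hypothetical, and the exact privileges available should be confirmed against your platform's documentation.

```python
# Illustrative sketch only: centrally defined access policies for a job-function
# group, plus one sparing use of row-level security. All names are hypothetical.

# Coarse-grained: grant read access on a schema to a group.
spark.sql("GRANT USE CATALOG ON CATALOG main TO `claims-analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA main.claims TO `claims-analysts`")
spark.sql("GRANT SELECT ON SCHEMA main.claims TO `claims-analysts`")

# Fine-grained: a row filter that limits rows by region unless the caller
# belongs to an admin group.
spark.sql("""
    CREATE OR REPLACE FUNCTION main.claims.region_filter(region STRING)
    RETURNS BOOLEAN
    RETURN is_account_group_member('claims-admins') OR region = 'US'
""")
spark.sql("""
    ALTER TABLE main.claims.policies
    SET ROW FILTER main.claims.region_filter ON (region)
""")
```

Keeping grants at the schema or table level for job-function groups, and reserving row filters for genuinely sensitive cases, matches the earlier guidance to use row- and column-level security sparingly.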
1785ddb61be78e30cb44c8b841db719b\u0007Efficiently scan for sensitive data in cleartext using the rules library\n", + "\n", + "**\u0007Establish approved data flow patterns**\n", + "\n", + "\u0007Determine pathways for data flow (source —> target)\n", + "\n", + "\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n", + "\n", + "\u0007Determine read/write patterns for the data lake\n", + "\n", + "\u0007Strictly enforce data flow pathways to/from data lake\n", + "\n", + "\u0007Detect violations and anomalies using lineage event analysis\n", + "\n", + "\u0007Identify offending systems and shut down or grant exception\n", + "\n", + "\u0007Record data flow exceptions and set a remediation deadline\n", + "\n", + "**\u0007Centralize data access controls**\n", + "\n", + "\u0007Establish a common governance model for all data and AI assets\n", + "\n", + "\u0007Centrally define access policies for all data and AI assets\n", + "\n", + "\u0007Enable fine-grained access controls at row and column levels\n", + "\n", + "\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n", + "\n", + "\n", + "and moving to a single data processing layer where\n", + "\n", + "all your data governance controls can run together,\n", + "\n", + "you improve your chances of staying in compliance\n", + "\n", + "and detecting a data breach.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Make data discovery easy**\n", + "\n", + "\u0007Establish a data discovery model\n", + "\n", + "\u0007Use manual or automatic data classification\n", + "\n", + "\u0007Provide a visual interface for data discovery across your data estate\n", + "\n", + "\u0007Simplify data discovery with rich keyword- or business glossary-based search\n", + "\n", + "**\u0007Centralize data access auditing**\n", + "\n", + "\u0007Establish a framework or best practices for access auditing\n", + "\n", + "\u0007Capture audit logs for all CRUD operations performed on data\n", + "\n", + "\u0007Make auditing reports easily accessible to data stewards/admins for ensuring compliance\n", + "\n", + "This is not intended to be an exhaustive list of features and requirements but rather a framework to\n", + "\n", + "evaluate your data governance approach. There will be violations at runtime, so it will be important to have\n", + "\n", + "procedures in place for how to handle these violations. In some cases, you may want to be very strict and\n", + "\n", + "shut down the data flow of the offending system. In other cases, you may want to quarantine the data until\n", + "\n", + "the offending system is fixed. Finally, some SLAs may require the data to flow regardless of a violation. In\n", + "\n", + "these cases, the receiving systems must have their own methodology for dealing with bad data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Hidden cost of data governance**\n", + "\n", + "There are numerous examples of high-profile data breaches and failure to comply with consumer data\n", + "\n", + "protection legislation. You don’t have to look very far to see reports of substantial fines levied against\n", + "\n", + "organizations that were not able to fully protect the data within their data ecosystem. 
As organizations\n", + "\n", + "produce and collect more and more data, it’s important to remember that while storage is cheap, failing\n", + "\n", + "to enforce proper data governance is very, very expensive.\n", + "\n", + "In order to catalog, lineage trace, quality check, and scan your data effectively, you will need a lot of\n", + "\n", + "compute power when you consider the massive amounts of data that exist in your organization. Each\n", + "\n", + "time you copy a piece of data to load it into another tool or platform, you need to determine what data\n", + "\n", + "governance techniques exist there and how you ensure that you truly know where all your data resides.\n", + "\n", + "Imagine the scenario where data flows through your environment and is loaded into multiple platforms\n", + "\n", + "using various ETL processes. How do you handle the situation when you discover that sensitive data is\n", + "\n", + "in cleartext? Without a consistent set of data governance tools, you may not be able to remediate the\n", + "\n", + "problem before it’s flagged for violation.\n", + "\n", + "Having a smaller attack surface and fewer ingress/egress routes helps guard your data and protect your\n", + "\n", + "organization’s brand and balance sheet.\n", + "\n", + "The bottom line is that the more complex your data ecosystem architecture is, the more difficult and costly\n", + "\n", + "it is to get data governance right.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 6. Democratize access to quality data\n", + "\n", + "Effective data and AI solutions rely more on the amount of quality data available than on the sophistication\n", + "\n", + "or complexity of the model or algorithm. Google published a paper titled “The Unreasonable Effectiveness ofSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
Data” demonstrating this point. The takeaway is that organizations should focus their efforts on making sure data scientists have access to the widest selection of relevant and high-quality data to perform their jobs — which is to create new opportunities for revenue growth, cost reduction and risk reduction.\n", + "\n", + "**The 80/20 data science dilemma**\n", + "\n", + "Most existing data environments have their data stored primarily in different operational data stores within a given business unit (BU) — creating several challenges:\n", + "\n", + "\u0007Most business units deploy use cases that are based only on their own data — without taking advantage of cross-BU opportunities\n", + "\n", + "\u0007The schemas are generally not well understood outside of BU or department — with only the database designers and power users being able to make efficient use of the data. This is referred to as the “tribal knowledge” phenomenon.\n", + "\n", + "\u0007The approval process and different system-level security models make it difficult and time-consuming for data scientists to gain the proper access to the data they need\n", + "\n", + "In order to perform analysis, users are forced to log in to multiple systems to collect their data. This is most often done using single-node data science and generates unnecessary copies of data stored on local disk drives, various network shares or user-controlled cloud storage. In some cases, the data is copied to “user spaces” within production platform environments. 
This has the strong potential of degrading the overall\n", + "\n", + "performance for true production workloads.\n", + "\n", + "To make matters worse, these copies of data are generally much smaller than the full-size data sets that would\n", + "\n", + "be needed in order to get the best model performance for your ML and AI workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "Small data sets reduce the effectiveness of exploration, experimentation, model development and model\n", + "\n", + "training — resulting in inaccurate models when deployed into production and used with full-size data sets.\n", + "\n", + "As a result, data science teams are spending 80% of their time wrangling data sets and only 20% of their\n", + "\n", + "time performing analytic work — work that may need to be redone once they have access to the full-size\n", + "\n", + "data sets. This is a serious problem for organizations that want to remain competitive and generate game-\n", + "\n", + "changing results.\n", + "\n", + "Another factor contributing to reduced productivity is the way in which end users are typically granted\n", + "\n", + "access to data. Security policies usually require both coarse-grained and fine-grained data protections.\n", + "\n", + "In other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n", + "\n", + "grained) within the data set.\n", + "\n", + "**Rationalize data access roles**\n", + "\n", + "The most common approach to providing coarse-grained and fine-grained access is to use what’s known\n", + "\n", + "as role-based access control (RBAC). Individual users log on to system-level accounts or via a single sign-on\n", + "\n", + "(SSO) authentication and access control solution.\n", + "\n", + "Users can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n", + "\n", + "There are different strategies for identifying and creating these groups — but typically, they are done on aSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
6cb1f6fe81af31114d2adae75b0401a0data sets. This is a serious problem for organizations that want to remain competitive and generate game-\n", + "\n", + "changing results.\n", + "\n", + "Another factor contributing to reduced productivity is the way in which end users are typically granted\n", + "\n", + "access to data. Security policies usually require both coarse-grained and fine-grained data protections.\n", + "\n", + "In other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n", + "\n", + "grained) within the data set.\n", + "\n", + "**Rationalize data access roles**\n", + "\n", + "The most common approach to providing coarse-grained and fine-grained access is to use what’s known\n", + "\n", + "as role-based access control (RBAC). Individual users log on to system-level accounts or via a single sign-on\n", + "\n", + "(SSO) authentication and access control solution.\n", + "\n", + "Users can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n", + "\n", + "There are different strategies for identifying and creating these groups — but typically, they are done on a\n", + "\n", + "system-by-system basis, with a 1:1 mapping for each coarse- and fine-grained access control combination.\n", + "\n", + "This approach to data access usually produces a proliferation of user groups. It is not unusual to see several\n", + "\n", + "thousand discrete security groups for large organizations — despite having a much smaller number of\n", + "\n", + "defined job functions.\n", + "\n", + "This approach creates one of the biggest security challenges in large organizations. When personnel leave\n", + "\n", + "the company, it is fairly straightforward to remove them from the various security groups. However, when\n", + "\n", + "personnel move around within the organization, their old security group assignments often remain intact\n", + "\n", + "and new ones are assigned based on their new job function. This leads to personnel continuing to have\n", + "\n", + "access to data that they no longer have a “need to know.”\n", + "\n", + "\n", + "The Databricks Lakehouse Platform brings together\n", + "\n", + "all the data and AI personas into one environment\n", + "\n", + "and makes it easy to collaborate, share code and\n", + "\n", + "insights, and operate against the same view of data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data classification**\n", + "\n", + "Having all your data sets stored in a single, well-managed data lake gives you the ability to use partition\n", + "\n", + "strategies to segment your data based on “need to know.” Some organizations create a partition based\n", + "\n", + "on which business unit owns the data and which one owns the data classification. For example, in a\n", + "\n", + "financial services company, credit card customers’ data could be stored separately from that of debit card\n", + "\n", + "customers, and access to GDPR/CCPA-related fields could be handled using classification labels.\n", + "\n", + "The simplest approach to data classification is to use three labels:\n", + "\n", + "\u0007 **Public data:** Data that can be freely disclosed to the public. This would include your annual report, press\n", + "\n", + "releases, etc.\n", + "\n", + "\u0007 **Internal data:** Data that has low security requirements but should not be shared with the public or\n", + "\n", + "competitors. 
This would include strategy briefings and market or customer segmentation research.\n", + "\n", + "\u0007 **Restricted data:** Highly sensitive data regarding customers or internal business operations. Disclosure could negatively affect operations and put the organization at financial or legal risk. Restricted data requires the highest level of security protection.\n", + "\n", + "Some organizations introduce additional labels, but care should be taken to make sure that everyone clearly understands how to apply them.\n", + "\n", + "The data classification requirements should be clearly documented and mapped to any legal or regulatory requirements. For example, CCPA is so sweeping that it includes 11 categories of personal information — and defines “personal information” as “information that identifies, relates to, describes, is capable of being associated with, or could reasonably be linked, directly or indirectly, with a particular consumer or household.”\n", + "\n", + "\n", + "-----\n", + "\n", + "Just examining one CCPA category, _Customer Records Information_ , we see that the following information is to be protected: name, signature, social security number, physical characteristics or description, address, telephone number, passport number, driver’s license or state identification card number, insurance policy number, education, employment, employment history, bank account number, credit or debit card number, other financial information, medical information, and health insurance information.\n", + "\n", + "
There are generally three different approaches in industry to performing data classification:\n", + "\n", + "**1. Content-based:** Scans or inspects and interprets files to find sensitive information. This is generally done using regular expressions and lookup tables to map values to actual entities stored inside the organization (e.g., customer SSN).\n", + "\n", + "**2. Context-based:** Evaluates the source of the data (e.g., application, location or creator) to determine the sensitivity of the data.\n", + "\n", + "**3. User-based:** Relies on a manual, end-user selection of each data set or element and requires expert domain knowledge to ensure accuracy.\n", + "\n", + "Taking all this into account, an organization could implement a streamlined set of roles for RBAC that uses a naming convention combining domain, entity, data set (or data asset) ID and classification, where “domain” might be the business unit within an organization, “entity” is the noun that the role is valid for, “data set” or “data asset” is the ID, and “classification” is one of the three values (public, internal, restricted).\n", + "\n", + "There is a “deny all default” policy that does not allow access to any data unless there is a corresponding role assignment. Wildcards can be used when granting access to eliminate the need to enumerate every combination.\n", + "\n", + "\n", + "-----\n", + "\n", + "For example, one such role gives a user or a system access to all the data fields that describe a credit card transaction for a customer, including the 16-digit credit card number, whereas a more narrowly scoped role would allow the user or system access only to nonsensitive data regarding the transaction.\n", + "\n", + "This gives organizations the chance to rationalize their security groups by using a domain naming convention to provide coarse-grained and fine-grained access without the need for creating tons of LDAP groups.
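The exact role string format is not given above, so the pattern used below, `<domain>.<entity>.<data asset ID>.<classification>`, is only an assumed illustration of the convention just described. This minimal Python sketch shows how a “deny all default” check with wildcard grants might look; the `is_allowed` helper, the example role names and the `fnmatch`-based matching are hypothetical rather than an actual access-control implementation:

```python
# Illustrative only: roles follow an assumed <domain>.<entity>.<data asset ID>.<classification>
# pattern. Access is deny-by-default: a request succeeds only if some granted
# role pattern (wildcards allowed) covers it.
from fnmatch import fnmatch


def is_allowed(requested: str, granted_roles: list[str]) -> bool:
    """Return True only when a granted role pattern matches the requested asset."""
    return any(fnmatch(requested, pattern) for pattern in granted_roles)


# A user granted every internal-classified transaction asset in the cards domain,
# plus one specific restricted data set.
granted = [
    "cards.transactions.*.internal",
    "cards.transactions.tx_2024.restricted",
]

print(is_allowed("cards.transactions.tx_2024.restricted", granted))  # True
print(is_allowed("cards.customers.profiles.restricted", granted))    # False (denied by default)
```

In practice the grants would live in the governance layer rather than in application code; the point is that a small, well-named set of roles plus wildcards can stand in for thousands of discrete LDAP groups.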
Using this convention also dramatically eases the administration of granting access to data for a given user.\n", + "\n", + "**Everyone working from the same view of data**\n", + "\n", + "The modern data stack, when combined with a simplified security group approach and a robust data governance methodology, gives organizations an opportunity to rethink how data is accessed — and greatly improves time to market for their analytic use cases. All analytic workloads can now operate from a single, shared view of your data.\n", + "\n", + "Combining this with a sensitive data tokenization strategy can make it straightforward to empower data scientists to do their job and shift the 80/20 ratio in their favor. It’s now easier to work with full-size data sets that both obfuscate NPI/PII information and preserve analytic value.\n", + "\n", + "Now, data discovery is easier because data sets have been registered in the catalog with full descriptions and business metadata — with some organizations going as far as showing realistic sample data for a particular data set. If a user does not have access to the underlying data files, having data in one physical 
location eases the burden of granting access, and then it’s easier to deploy access-control policies and collect/analyze audit logs to monitor data usage and to look for bad actors.\n", + "\n", + "\n", + "Adopting the Databricks Lakehouse Platform allows you to add data sets into a well-managed data lake using low-cost object stores, and makes it easy to partition data based on domain, entity, data set and classification levels to provide fine-grained (row-level and column-level) security.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data security, validation and curation — in one place**\n", + "\n", + "The modern data architecture using Databricks Lakehouse makes it easy to take a consistent approach to protecting, validating and improving your organization’s data. Data governance policies can be enforced using the built-in features of schema validation, expectations and pipelines — the three main steps to data curation. Databricks enables moving data through well-defined states: Raw —> Refined —> Curated or, as we refer to it at Databricks, Bronze —> Silver —> Gold.\n", + "\n", + "The raw data is known as “Bronze-level” data and serves as the landing zone for all your important analytic data. Bronze data functions as the starting point for a series of curation steps that filter, clean and augment the data for use by downstream systems. The first major refinement results in data being stored in “Silver-level” tables within the data lake. These tables carry all the benefits of the Delta Lake product — for example, ACID transactions and time travel. The final step in the process is to produce business-level aggregates, or “Gold-level” tables, that combine data sets from across the organization. It’s a set of data used to improve customer service across the full line of products, perform GDPR/CCPA reporting or look for opportunities to cross-sell to increase customer retention. 
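As a rough illustration of the Bronze, Silver and Gold stages described above, here is a minimal PySpark sketch. It assumes a Spark session with Delta Lake available (for example, a Databricks cluster); the paths, table names and columns (`/landing/orders/`, `bronze.orders`, `order_id`, `amount`) are purely illustrative:

```python
# Minimal sketch of moving data through Bronze -> Silver -> Gold Delta tables.
# Table names, paths and columns are illustrative only.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Bronze: land the raw data as-is in the data lake.
raw = spark.read.json("/landing/orders/")
raw.write.format("delta").mode("append").saveAsTable("bronze.orders")

# Silver: filter, clean and de-duplicate; the Delta table enforces the schema.
silver = (
    spark.table("bronze.orders")
    .filter(F.col("order_id").isNotNull())
    .dropDuplicates(["order_id"])
)
silver.write.format("delta").mode("overwrite").saveAsTable("silver.orders")

# Gold: business-level aggregates shared across the organization.
gold = (
    spark.table("silver.orders")
    .groupBy("customer_id")
    .agg(F.sum("amount").alias("lifetime_value"), F.count("*").alias("order_count"))
)
gold.write.format("delta").mode("overwrite").saveAsTable("gold.customer_value")
```

Because each layer is a Delta table, downstream consumers pick up ACID transactions and time travel at the Silver and Gold stages without creating additional copies of the data.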
For the first time, organizations can truly optimize data curation\n", + "\n", + "and ETL — eliminating unnecessary copies of data and the duplication of effort that often happens in ETL\n", + "\n", + "jobs with legacy data ecosystems. This “solve once, access many times” approach speeds time to market,\n", + "\n", + "improves the user experience and helps retain talent.\n", + "\n", + "**Extend the impact of your data with secure data sharing**\n", + "\n", + "Data sharing is crucial to drive business value in today’s digital economy. More and more organizations\n", + "\n", + "are now looking to securely share trusted data with their partners/suppliers, internal lines of business or\n", + "\n", + "customers to drive collaboration, improve internal efficiency and generate new revenue streams with data\n", + "\n", + "monetization. Additionally, organizations are interested in leveraging external data to drive new product\n", + "\n", + "innovations and services.\n", + "\n", + "Business executives must establish and promote a data sharing culture in their organizations to build\n", + "\n", + "competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 7. Dramatically increase productivity of your workforce\n", + "\n", + "Now that you have deployed a modern data stack and have landed all your analytical data in a well-\n", + "\n", + "managed data lake with a rationalized approach to access control, the next question is, “What tools should ISUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
4a588fb0050e7497fa9478969ce37ab7jobs with legacy data ecosystems. This “solve once, access many times” approach speeds time to market,\n", + "\n", + "improves the user experience and helps retain talent.\n", + "\n", + "**Extend the impact of your data with secure data sharing**\n", + "\n", + "Data sharing is crucial to drive business value in today’s digital economy. More and more organizations\n", + "\n", + "are now looking to securely share trusted data with their partners/suppliers, internal lines of business or\n", + "\n", + "customers to drive collaboration, improve internal efficiency and generate new revenue streams with data\n", + "\n", + "monetization. Additionally, organizations are interested in leveraging external data to drive new product\n", + "\n", + "innovations and services.\n", + "\n", + "Business executives must establish and promote a data sharing culture in their organizations to build\n", + "\n", + "competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 7. Dramatically increase productivity of your workforce\n", + "\n", + "Now that you have deployed a modern data stack and have landed all your analytical data in a well-\n", + "\n", + "managed data lake with a rationalized approach to access control, the next question is, “What tools should I\n", + "\n", + "provide to the user community so they can be most effective at using the new data ecosystem?”\n", + "\n", + "**Design thinking: working backward from the user experience**\n", + "\n", + "Design thinking is a human-centered approach to innovation — focused on understanding customer needs,\n", + "\n", + "rapid prototyping and generating creative ideas — that will transform the way you develop products, services,\n", + "\n", + "processes and organizations. Design thinking was introduced as a technique to not only improve but also\n", + "\n", + "bring joy to the way people work. The essence of design thinking is to determine what motivates people to\n", + "\n", + "do their job, where their current pain points are and what could be improved to make their jobs enjoyable.\n", + "\n", + "**Moving beyond best of breed**\n", + "\n", + "If you look across a large enterprise, you will find no shortage of database design, ETL, data cleansing, model\n", + "\n", + "training and model deployment tools. Many organizations take a “best of breed” approach in providing\n", + "\n", + "tooling for their end users. This typically occurs because leaders genuinely want to empower business\n", + "\n", + "units, departments and teams to select the tool that best suits their specific needs — so-called federated\n", + "\n", + "tool selection. Data science tooling, in particular, tends not to be procured at the “enterprise” level at first —\n", + "\n", + "given the high cost of rolling it out to the entire user population.\n", + "\n", + "\n", + "-----\n", + "\n", + "When tool selection becomes localized, there are a few things to consider:\n", + "\n", + "\u0007Tools are generally thought of as discrete components within an ecosystem and, therefore,\n", + "\n", + "interchangeable with criteria that are established within a specific tool category. 
The tool with the best\n", + "\n", + "overall score gets selected.\n", + "\n", + "\u0007The selection criteria for a tool usually contains a subjective list of “must-have” features based on\n", + "\n", + "personal preference or adoption within a department, or because a given tool is better suited to support\n", + "\n", + "a current business process\n", + "\n", + "\u0007Discrete tools tend to leapfrog one another and add features based on market demand rather quickly\n", + "\n", + "\u0007Evaluations that are performed over many months likely become outdated by the time the tool has\n", + "\n", + "moved into production\n", + "\n", + "\u0007The “enterprise” requirements are often limited to ensuring that the tool fits into the overall architecture\n", + "\n", + "and security environment but nothing more\n", + "\n", + "\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n", + "\n", + "of tools in play or streamlining the user experience\n", + "\n", + "\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n", + "\n", + "partnership model, the ability to influence the roadmap and professional services support\n", + "\n", + "For these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n", + "\n", + "on selecting a data platform that enables seamless integration with point solutions rather than a suite of\n", + "\n", + "discrete tools that require integration work and may no longer be category leaders over the long haul.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks is a leading data and AI company —\n", + "\n", + "\n", + "Keep in mind that data platforms work well because the vendor took an opinionated point of view of how\n", + "\n", + "data processing, validation and curation should work. It’s the integration between the discrete functions\n", + "\n", + "of the platform that saves time, conserves effort and improves the user experience. Many companies trySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
eaff954d65653182857574e043c105f1and security environment but nothing more\n", + "\n", + "\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n", + "\n", + "of tools in play or streamlining the user experience\n", + "\n", + "\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n", + "\n", + "partnership model, the ability to influence the roadmap and professional services support\n", + "\n", + "For these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n", + "\n", + "on selecting a data platform that enables seamless integration with point solutions rather than a suite of\n", + "\n", + "discrete tools that require integration work and may no longer be category leaders over the long haul.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks is a leading data and AI company —\n", + "\n", + "\n", + "Keep in mind that data platforms work well because the vendor took an opinionated point of view of how\n", + "\n", + "data processing, validation and curation should work. It’s the integration between the discrete functions\n", + "\n", + "of the platform that saves time, conserves effort and improves the user experience. Many companies try\n", + "\n", + "to take on the integration of different technology stacks, which increases risk, cost and complexity. The\n", + "\n", + "consequences of not doing the integration properly can be serious — in terms of security, compliance,\n", + "\n", + "efficiency, cost, etc.\n", + "\n", + "\n", + "partly due to the innovations in the [open source](https://databricks.com/product/open-source)\n", + "\n", + "\n", + "So, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\n", + "\n", + "and incorporate your requirements into their platform product roadmap. This will require some give-and-\n", + "\n", + "take from both parties — sometimes calling for an organization to adjust their processes to better fit how\n", + "\n", + "the platform works. There are many instances where a given business process could be simplified or recast\n", + "\n", + "to work with the platform, as is. Sometimes it will require the vendor to add features that support your\n", + "\n", + "processes. The vendor will always be market driven and will want to build features in such a way that they\n", + "\n", + "apply to the broadest set of customers.\n", + "\n", + "The final point to consider is that it takes a substantial amount of time to become an expert user of a given\n", + "\n", + "tool. Users must make a significant investment to learn how the tool works and the most efficient way of\n", + "\n", + "performing their job. 
The more discrete tools in an environment, the more challenging this becomes.\n", + "\n", + "Minimizing the number of tools and their different interfaces, styles of interaction and approach to security\n", + "\n", + "and collaboration helps improve the user experience and decreases time to market.\n", + "\n", + "\n", + "[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\n", + "\n", + "listening to the needs of thousands of customers\n", + "\n", + "and having our engineers work side by side with\n", + "\n", + "customer teams to deliver real business value using\n", + "\n", + "data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified platform, unified personas**\n", + "\n", + "Deploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\n", + "\n", + "data stack — will provide an integrated suite of tools for the full range of personas in your organization,\n", + "\n", + "including business analysts, SQL developers, data engineers and data scientists. You will immediately\n", + "\n", + "increase productivity and reduce risk because you’ll be better able to share the key aspects of data\n", + "\n", + "pipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n", + "\n", + "and deployment. All the work streams function off a single view of the data, and the handoffs between\n", + "\n", + "subsystems are well managed.\n", + "\n", + "Data processing happens in one auditable environment, and the number of copies of data is kept to an\n", + "\n", + "absolute minimum — with each user benefiting from the data assets created by others. Redundant work\n", + "\n", + "is eliminated.\n", + "\n", + "The 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n", + "\n", + "working with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n", + "\n", + "the 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n", + "\n", + "Another challenge is that enterprise data changes rapidly. New fields are added or existing fields are typedSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
ff33877449afd48d9a7757897e586275including business analysts, SQL developers, data engineers and data scientists. You will immediately\n", + "\n", + "increase productivity and reduce risk because you’ll be better able to share the key aspects of data\n", + "\n", + "pipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n", + "\n", + "and deployment. All the work streams function off a single view of the data, and the handoffs between\n", + "\n", + "subsystems are well managed.\n", + "\n", + "Data processing happens in one auditable environment, and the number of copies of data is kept to an\n", + "\n", + "absolute minimum — with each user benefiting from the data assets created by others. Redundant work\n", + "\n", + "is eliminated.\n", + "\n", + "The 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n", + "\n", + "working with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n", + "\n", + "the 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n", + "\n", + "Another challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\n", + "\n", + "differently — for example, changing a string to an integer. This has a cascading effect, and the downstream\n", + "\n", + "consumers must be able to adjust by monitoring the execution and detecting the changes. The data\n", + "\n", + "scientist, in turn, must update and test new models on the new data. Your data platform should make the\n", + "\n", + "detection and remediation easier, not harder.\n", + "\n", + "For the data engineers, their primary focus is extracting data from source systems and moving it into the\n", + "\n", + "new data ecosystem. The data pipeline function can be simplified with a unified data platform because\n", + "\n", + "the programming model and APIs are consistent across programming languages (e.g., Scala, Python). This\n", + "\n", + "results in improved operations and maintenance (O&M). The runtime environment is easier to troubleshoot\n", + "\n", + "and debug since the compute layer is consistent, and the logging and auditing associated with the data\n", + "\n", + "processing and data management is centralized and of more value.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Maximize the productivity of your workforce**\n", + "\n", + "Once you have a data platform that brings together your full range of personas, you should focus on the\n", + "\n", + "next step for increasing productivity — namely, self-service environments.\n", + "\n", + "In large organizations, there needs to be a strategy for how solutions are promoted up through the runtime\n", + "\n", + "environments for development, testing and production. These environments need to be nearly identical to\n", + "\n", + "one another — using the same version of software while limiting the number, size and horsepower of the\n", + "\n", + "compute nodes. To the extent possible, development and test should be performed with realistic test/\n", + "\n", + "synthetic data. One strategy to support this is to tap into the flow of production data and siphon off a small\n", + "\n", + "percentage that is then changed in randomized fashion — obfuscating the real data but keeping the same\n", + "\n", + "general shape and range of values.\n", + "\n", + "The **DEV** environment should be accessible to everyone without any organizational red tape. 
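A hypothetical sketch of the “siphon off a small percentage and randomize it” strategy mentioned above for producing realistic DEV/TEST data, assuming PySpark over the production tables; the table and column names (`prod.transactions`, `amount`, `customer_id`, `card_number`) are illustrative only:

```python
# Illustrative only: sample a small slice of a production table, then perturb
# or mask sensitive columns so the shape and ranges stay realistic.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

prod = spark.table("prod.transactions")

dev_sample = (
    prod.sample(fraction=0.01, seed=7)                       # siphon off ~1%
    # keep the numeric shape but jitter the values
    .withColumn("amount", F.col("amount") * (F.lit(0.9) + F.rand(seed=7) * 0.2))
    # obfuscate direct identifiers
    .withColumn("customer_id", F.sha2(F.col("customer_id").cast("string"), 256))
    .drop("card_number")
)

dev_sample.write.format("delta").mode("overwrite").saveAsTable("dev.transactions_sample")
```

The sample keeps the general shape and range of values while obfuscating the real records, so DEV and TEST runs behave realistically without exposing production data.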
The DEV\n", + "\n", + "environments should be small and controlled with policies that spin them up and tear them down efficiently.\n", + "\n", + "Every aspect of the DEV infrastructure should be treated as ephemeral. Nothing should exist in the\n", + "\n", + "environment that cannot be destroyed and easily rebuilt.\n", + "\n", + "The **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n", + "\n", + "tools — within obvious cost/budget constraints. The use of the TEST environment can be requested by\n", + "\n", + "the developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n", + "\n", + "management.\n", + "\n", + "Moving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n", + "\n", + "developers cannot randomly promote software to run in production. Again, this process should be\n", + "\n", + "strictly governed using a workflow/sign-off approval approach — signed off by management as well.\n", + "\n", + "Many organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n", + "\n", + "deployments.\n", + "\n", + "\n", + "**DEV** **TEST**\n", + "\n", + "**PROD**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 8. Make informed build vs. buy decisionsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
25f075126fb3e190be163159697de795environment that cannot be destroyed and easily rebuilt.\n", + "\n", + "The **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n", + "\n", + "tools — within obvious cost/budget constraints. The use of the TEST environment can be requested by\n", + "\n", + "the developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n", + "\n", + "management.\n", + "\n", + "Moving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n", + "\n", + "developers cannot randomly promote software to run in production. Again, this process should be\n", + "\n", + "strictly governed using a workflow/sign-off approval approach — signed off by management as well.\n", + "\n", + "Many organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n", + "\n", + "deployments.\n", + "\n", + "\n", + "**DEV** **TEST**\n", + "\n", + "**PROD**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 8. Make informed build vs. buy decisions\n", + "\n", + "A key piece of the strategy will involve the decision around which components of the data ecosystem are\n", + "\n", + "built by the in-house engineering team and which components are purchased through a vendor relationship.\n", + "\n", + "There is increased emphasis within engineering teams on taking a “builder” approach. In other words, the\n", + "\n", + "engineering teams prefer to develop their own solutions in-house rather than rely on vendor products.\n", + "\n", + "**Competitive advantage**\n", + "\n", + "This “roll your own’’ approach has some advantages — including being able to establish the overall product\n", + "\n", + "vision, prioritize features and directly allocate the resources to build the software. However, it is important to\n", + "\n", + "keep in mind which aspects of your development effort give you the most competitive advantage.\n", + "\n", + "Spend some time working with the data transformation steering committee and other stakeholders to\n", + "\n", + "debate the pros and cons of building out various pieces of the data ecosystem. The primary factor should\n", + "\n", + "come down to whether or not a given solution offers true competitive advantage for the organization. Does\n", + "\n", + "building this piece of software make it harder for your competitors to compete with you? If the answer is no,\n", + "\n", + "then it is better to focus your engineering and data science resources on deriving insights from your data.\n", + "\n", + "**Beware: becoming your own software vendor**\n", + "\n", + "As many engineering leaders know, building your own software is an exciting challenge. However, it does\n", + "\n", + "come with added responsibility — namely, managing the overall project timeline and costs, and being\n", + "\n", + "responsible for the design, implementation, testing, documentation, training, and ongoing maintenance and\n", + "\n", + "updates. You basically are becoming your own software vendor for every component of the ecosystem\n", + "\n", + "that you build yourself. When you consider the cost of a standard-sized team, it is not uncommon to spend\n", + "\n", + "several million dollars per year building out individual component parts of the new data system. 
This doesn’t\n", + "\n", + "include the cost to operate and maintain the software once it is in production.\n", + "\n", + "\n", + "-----\n", + "\n", + "To offset the anticipated development costs, engineering teams will oftentimes make the argument that\n", + "\n", + "they are starting with open source software and extending it to meet the “unique requirements” of your\n", + "\n", + "organization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n", + "\n", + "unique and b) the development offers the competitive advantage that you need.\n", + "\n", + "Even software built on top of open source still requires significant investment in integration and testing.\n", + "\n", + "The integration work is particularly challenging because of the large number of open source libraries that\n", + "\n", + "are required in the data science space. The question becomes, “Is this really the area that you want your\n", + "\n", + "engineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n", + "\n", + "**How long will it take? Can the organization afford to wait?**\n", + "\n", + "Even if you decide the software component provides a competitive advantage and is something worth\n", + "\n", + "building in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n", + "\n", + "time-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n", + "\n", + "business due to the anticipated delivery schedule. Keep in mind that software development projects usuallySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
b5f4bd0258226132f89697f6e660b09borganization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n", + "\n", + "unique and b) the development offers the competitive advantage that you need.\n", + "\n", + "Even software built on top of open source still requires significant investment in integration and testing.\n", + "\n", + "The integration work is particularly challenging because of the large number of open source libraries that\n", + "\n", + "are required in the data science space. The question becomes, “Is this really the area that you want your\n", + "\n", + "engineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n", + "\n", + "**How long will it take? Can the organization afford to wait?**\n", + "\n", + "Even if you decide the software component provides a competitive advantage and is something worth\n", + "\n", + "building in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n", + "\n", + "time-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n", + "\n", + "business due to the anticipated delivery schedule. Keep in mind that software development projects usually\n", + "\n", + "take longer and cost more money than initially planned.\n", + "\n", + "The organization should understand the impact to the overall performance and capabilities of the daily\n", + "\n", + "ecosystem for any features tied to the in-house development effort. Your business partners likely do\n", + "\n", + "not care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n", + "\n", + "is reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\n", + "\n", + "features and schedule.\n", + "\n", + "\n", + "Databricks is built on top of popular open source\n", + "\n", + "software that it created. Engineering teams can\n", + "\n", + "improve the underpinnings of the Databricks\n", + "\n", + "platform by submitting code via pull request and\n", + "\n", + "becoming committers to the projects. The benefit\n", + "\n", + "to organizations is that their engineers contribute\n", + "\n", + "to the feature set of the data platform while\n", + "\n", + "Databricks remains responsible for all integration\n", + "\n", + "and performance testing plus all the runtime\n", + "\n", + "support, including failover and disaster recovery.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Don’t forget about the data**\n", + "\n", + "Perhaps the single most important feature of a modern data stack is its ability to help make data sets and\n", + "\n", + "“data assets” consumable to the end users or systems. Data insights, model training and model execution\n", + "\n", + "cannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n", + "\n", + "In large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n", + "\n", + "sets from multiple lines of business or departments. Focusing your data engineering and data science\n", + "\n", + "efforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n", + "\n", + "creating true competitive advantage.\n", + "\n", + "The amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n", + "\n", + "serve up data for analysis should not be underestimated. 
The value of this work is equally important to\n", + "\n", + "the business. The ability to curate data to enable game-changing insights should be the focus of the work\n", + "\n", + "led by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n", + "\n", + "engineers innovate on components that don’t bring true competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 9. Allocate, monitor and optimize costs\n", + "\n", + "Beginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n", + "\n", + "class of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n", + "\n", + "only one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n", + "\n", + "more manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n", + "\n", + "case anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n", + "\n", + "and increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n", + "\n", + "related personas to collaborate and operate from the same point of view. Lessons learned on the platform\n", + "\n", + "could be easily shared and reused by other members of the team. The more the team used the unified\n", + "\n", + "platform, the more they collaborated and their level of expertise increased.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
c7999afe6a711c926c52c21162072b02engineers innovate on components that don’t bring true competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 9. Allocate, monitor and optimize costs\n", + "\n", + "Beginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n", + "\n", + "class of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n", + "\n", + "only one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n", + "\n", + "more manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n", + "\n", + "case anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n", + "\n", + "and increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n", + "\n", + "related personas to collaborate and operate from the same point of view. Lessons learned on the platform\n", + "\n", + "could be easily shared and reused by other members of the team. The more the team used the unified\n", + "\n", + "platform, the more they collaborated and their level of expertise increased.\n", + "\n", + "**Reduce complexity, reduce costs**\n", + "\n", + "The architectures of enterprise data warehouses (EDWs) and data lakes were either more limited or\n", + "\n", + "more complex — resulting in increased time to market and increased costs. This was mainly due to the\n", + "\n", + "requirement to perform ETL to explore data in the EDW or the need to split data using multiple pipelines\n", + "\n", + "for the data lake. The data lakehouse architecture simplifies the cost allocation because all the processing,\n", + "\n", + "serving and analytics are performed in a single compute layer.\n", + "\n", + "Organizations can rightsize the data environments and control costs using policies. The centralized\n", + "\n", + "and consistent approach to security, auditing and monitoring makes it easier to spot inefficiencies and\n", + "\n", + "bottlenecks in the data ecosystem. Performance improvements can be gained quickly as more platform\n", + "\n", + "expertise is developed within the workforce.\n", + "\n", + "\n", + "The Databricks platform optimizes costs for your\n", + "\n", + "data and AI workloads by intelligently provisioning\n", + "\n", + "infrastructure only as you need it. Customers can\n", + "\n", + "establish policies that govern the size of clusters\n", + "\n", + "based on DEV, TEST, PROD environments or\n", + "\n", + "anticipated workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks monitors and records usage and allows\n", + "\n", + "organizations to easily track costs on a data and\n", + "\n", + "\n", + "**Centralized funding model**\n", + "\n", + "As previously mentioned, data transformation initiatives require substantial funding. Centralizing the budget\n", + "\n", + "under the CDO provides consistency and visibility into how funds are allocated and spent — increasing\n", + "\n", + "the likelihood of a positive ROI. Funding at the beginning of the initiative will be significantly higher than\n", + "\n", + "the funding in the out-years. 
It’s not uncommon to see 3- to 5-year project plans for larger organizations.\n", + "\n", + "Funding for years 1 and 2 is often reduced in years 3 and 4 and further reduced in year 5 — until it reaches a\n", + "\n", + "steady state that is more sustainable.\n", + "\n", + "\n", + "AI workload basis. This provides the ability to\n", + "\n", + "\n", + "The budget takes into account the cost of the data engineering function, commercial software licenses and\n", + "\n", + "building out the center of excellence to accelerate the data science capabilities of the organization. Again,\n", + "\n", + "the CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are\n", + "\n", + "focused on the overall implementation plan and to make sound build vs. buy decisions.\n", + "\n", + "It’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources\n", + "\n", + "in the CIO’s organization to perform the data engineering tasks. The data science community reports into\n", + "\n", + "the CDO and is matrixed into the lines of business in order to better understand the business drivers and\n", + "\n", + "the data sets. Finally, investing in data governance cannot wait until the company has suffered from a major\n", + "\n", + "regulatory challenge, a data breach or some other serious defense-related problem. CDOs should spend\n", + "\n", + "the necessary time to educate leaders throughout the organization on the value of data governance.\n", + "\n", + "\n", + "implement an enterprise-wide chargeback mode\n", + "\n", + "and put in place appropriate spending limits.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Chargeback models**\n", + "\n", + "To establish the centralized budget to fund the data transformation initiative, some organizations imposeSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
a44c44fd48c5153138e2f0eaeee9e374the CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are\n", + "\n", + "focused on the overall implementation plan and to make sound build vs. buy decisions.\n", + "\n", + "It’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources\n", + "\n", + "in the CIO’s organization to perform the data engineering tasks. The data science community reports into\n", + "\n", + "the CDO and is matrixed into the lines of business in order to better understand the business drivers and\n", + "\n", + "the data sets. Finally, investing in data governance cannot wait until the company has suffered from a major\n", + "\n", + "regulatory challenge, a data breach or some other serious defense-related problem. CDOs should spend\n", + "\n", + "the necessary time to educate leaders throughout the organization on the value of data governance.\n", + "\n", + "\n", + "implement an enterprise-wide chargeback mode\n", + "\n", + "and put in place appropriate spending limits.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Chargeback models**\n", + "\n", + "To establish the centralized budget to fund the data transformation initiative, some organizations impose\n", + "\n", + "a “tax” on each part of the organization — based on size as well as profit and loss. This base-level funding\n", + "\n", + "should be used to build the data engineering and data science teams needed to deploy the building blocks\n", + "\n", + "of the new data ecosystem. However, as different teams, departments and business units begin using the\n", + "\n", + "new data ecosystem, the infrastructure costs, both compute and storage, will begin to grow. The costs will\n", + "\n", + "not be evenly distributed, due to different levels of usage from the various parts of the organization. The\n", + "\n", + "groups with the heavier usage should obviously cover their pro rata share of the costs. This requires the\n", + "\n", + "ability to monitor and track usage — not only based on compute but also on the amount of data generated\n", + "\n", + "and consumed. This so-called chargeback model is an effective and fair way to cover the cost deltas over\n", + "\n", + "and above the base-level funding.\n", + "\n", + "Plus, not all the departments or lines of business will require the same level of compute power or fault\n", + "\n", + "tolerance. The architecture should support the ability to separate out the runtime portions of the data\n", + "\n", + "ecosystem and isolate the workloads based on the specific SLAs for the use cases in each environment.\n", + "\n", + "Some workloads cannot fail and their SLAs will require full redundancy, thus increasing the number of\n", + "\n", + "nodes in the cluster or even requiring multiple clusters operating in different cloud regions. In contrast, less\n", + "\n", + "critical workloads that can fail and be restarted can run on less costly infrastructure. This makes it easier to\n", + "\n", + "better manage the ecosystem by avoiding a one-size-fits-all approach and allocating costs to where the\n", + "\n", + "performance is needed most.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 10. Move to production and scale adoption\n", + "\n", + "Now that you’ve completed the hard work outlined in the first nine steps, it is time to put the new data\n", + "\n", + "ecosystem to use. 
In order to get truly game-changing results, organizations must be really disciplined at\n", + "\n", + "managing and using data to enable use cases that drive business value. They must also establish a clear\n", + "\n", + "set of metrics to measure adoption and track the net promoter score (NPS) so that the user experience\n", + "\n", + "continues to improve over time.\n", + "\n", + "**If you build it, they will come**\n", + "\n", + "Keep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data\n", + "\n", + "set registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless.\n", + "\n", + "A high level of automation for the registration process is important because it’s not uncommon to see\n", + "\n", + "thousands of data sets in large organizations. The business and technical metadata plus the data quality\n", + "\n", + "rules will help guarantee that the data lake is filled with consumable data. The lineage solution should\n", + "\n", + "provide a visualization that shows the data movement and verifies that the approved data flow paths are\n", + "\n", + "being followed.\n", + "\n", + "Some key metrics to keep an eye on are:\n", + "\n", + "\u0007Percentage of source systems contributing data to the ecosystem\n", + "\n", + "\u0007Percentage of real-time streaming relative to API and batch transfers\n", + "\n", + "\u0007Percentage of registered data sets with full business and technical metadata\n", + "\n", + "\u0007Volume of data written to the data lakeSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
44e15156f76ae559ee78a6146297901bcontinues to improve over time.\n", + "\n", + "**If you build it, they will come**\n", + "\n", + "Keep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data\n", + "\n", + "set registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless.\n", + "\n", + "A high level of automation for the registration process is important because it’s not uncommon to see\n", + "\n", + "thousands of data sets in large organizations. The business and technical metadata plus the data quality\n", + "\n", + "rules will help guarantee that the data lake is filled with consumable data. The lineage solution should\n", + "\n", + "provide a visualization that shows the data movement and verifies that the approved data flow paths are\n", + "\n", + "being followed.\n", + "\n", + "Some key metrics to keep an eye on are:\n", + "\n", + "\u0007Percentage of source systems contributing data to the ecosystem\n", + "\n", + "\u0007Percentage of real-time streaming relative to API and batch transfers\n", + "\n", + "\u0007Percentage of registered data sets with full business and technical metadata\n", + "\n", + "\u0007Volume of data written to the data lake\n", + "\n", + "\u0007Percentage of raw data that enters a data curation pipeline\n", + "\n", + "\u0007Volume of data consumed from the data lake\n", + "\n", + "\u0007Number of tables defined and populated with curated data\n", + "\n", + "\u0007Number of models trained with data from the data lake\n", + "\n", + "\u0007Lineage reports and anomaly detection incidents\n", + "\n", + "\u0007Number of users running Python, SQL, Scala and R workloads\n", + "\n", + "\n", + "In 2018, Databricks released MLflow — an open\n", + "\n", + "source platform to manage the ML lifecycle,\n", + "\n", + "including experimentation, reproducibility,\n", + "\n", + "deployment and a central model registry. MLflow\n", + "\n", + "is included in the Databricks Lakehouse Platform\n", + "\n", + "and accelerates the adoption of machine learning\n", + "\n", + "and AI in organizations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Communication plan**\n", + "\n", + "Communication is critical throughout the data transformation initiative — however, it is particularly\n", + "\n", + "important once you move into production. Time is precious and you want to avoid rework, if at all possible.\n", + "\n", + "Organizations often overlook the emotional and cultural toll that a long transformation process takes on\n", + "\n", + "the workforce. The seam between the legacy environment and the new data ecosystem is an expensive\n", + "\n", + "and exhausting place to be — because your business partners are busy supporting two data worlds. Most\n", + "\n", + "users just want to know when the new environment will be ready. They don’t want to work with partially\n", + "\n", + "completed features, especially while performing double duty.\n", + "\n", + "Establish a solid communication plan and set expectations for when features will come online. Make sure\n", + "\n", + "there is detailed documentation, training and a support/help desk to field users’ questions.\n", + "\n", + "**DevOps — software development + IT operations**\n", + "\n", + "Mature organizations develop a series of processes and standards for how software and data are developed,\n", + "\n", + "managed and delivered. 
The term “DevOps” comes from the software engineering world and refers to\n", + "\n", + "developing and operating large-scale software systems. DevOps defines how an organization, its developers,\n", + "\n", + "operations staff and other stakeholders establish the goal of delivering quality software reliably and\n", + "\n", + "repeatedly. In short, DevOps is a culture that consists of two practices: continuous integration (CI) and\n", + "\n", + "continuous delivery (CD).\n", + "\n", + "The CI portion of the process is the practice of frequently integrating newly written or changed code\n", + "\n", + "with the existing code repository. As software is written, it is continuously saved back to the source code\n", + "\n", + "repository, merged with other changes, built, integrated and tested — and this should occur frequently\n", + "\n", + "enough that the window between commit and build is narrow enough that no errors can occur without\n", + "\n", + "developers noticing them and correcting them immediately.\n", + "\n", + "This is particularly important for large, distributed teams to ensure that the software is always in a working\n", + "\n", + "state — despite the frequent changes from various developers. Only software that passes the CI steps is\n", + "\n", + "deployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n", + "\n", + "dependable releases.\n", + "\n", + "\n", + "Software development IT operations\n", + "\n", + "\n", + "-----\n", + "\n", + "**DataOps — data processing + IT operations**\n", + "\n", + "DataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n", + "\n", + "use the well-established processes from DevOps to consistently and reliably improve the quality of dataSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
09bdc72674d7ff408d67d2046c9637bdThe CI portion of the process is the practice of frequently integrating newly written or changed code\n", + "\n", + "with the existing code repository. As software is written, it is continuously saved back to the source code\n", + "\n", + "repository, merged with other changes, built, integrated and tested — and this should occur frequently\n", + "\n", + "enough that the window between commit and build is narrow enough that no errors can occur without\n", + "\n", + "developers noticing them and correcting them immediately.\n", + "\n", + "This is particularly important for large, distributed teams to ensure that the software is always in a working\n", + "\n", + "state — despite the frequent changes from various developers. Only software that passes the CI steps is\n", + "\n", + "deployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n", + "\n", + "dependable releases.\n", + "\n", + "\n", + "Software development IT operations\n", + "\n", + "\n", + "-----\n", + "\n", + "**DataOps — data processing + IT operations**\n", + "\n", + "DataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n", + "\n", + "use the well-established processes from DevOps to consistently and reliably improve the quality of data\n", + "\n", + "used to power data and AI use cases. DataOps automates and streamlines the lifecycle management tasks\n", + "\n", + "needed for large volumes of data — basically, ensuring that the volume, velocity, variety and veracity of the\n", + "\n", + "data are taken into account as data flows through the environment. DataOps aims to reduce the end-to-\n", + "\n", + "end cycle time of data analytics — from idea, to exploration, to visualizations and to the creation of new\n", + "\n", + "data sets, data assets and models that create value.\n", + "\n", + "For DataOps to be effective, it must encourage collaboration, innovation and reuse among the stakeholders,\n", + "\n", + "and the data tooling should be designed to support the workflow and make all aspects of data curation and\n", + "\n", + "ETL more efficient.\n", + "\n", + "**MLOps — machine learning + IT operations**\n", + "\n", + "Not surprisingly, the term “MLOps” takes the DevOps approach and applies it to the machine learning and\n", + "\n", + "deep learning space — automating or streamlining the core workflow for data scientists. MLOps is a bit\n", + "\n", + "unique when compared with DevOps and DataOps because the approach to deploying effective machine\n", + "\n", + "learning models is far more iterative and requires much more experimentation — data scientists try different\n", + "\n", + "features, parameters and models in a tight iteration cycle. In all these iterations, they must manage the code\n", + "\n", + "base, understand the data used to perform the training and create reproducible results. The logging aspect\n", + "\n", + "of the ML development lifecycle is critical.\n", + "\n", + "MLOps aims to manage deployment of machine learning and deep learning models in large-scale\n", + "\n", + "production environments while also focusing on business and regulatory requirements. 
The ideal MLOps\n", + "\n", + "environment would include data science tools where models are constructed and analytical engines where\n", + "\n", + "computations are performed.\n", + "\n", + "\n", + "Data processing IT operations\n", + "\n", + "#### \n", + "\n", + "Machine learning IT operations\n", + "\n", + "\n", + "-----\n", + "\n", + "The overall workflow for deploying production ML models is shown in Figure 10.\n", + "\n", + "Unlike most software applications that execute a series of discrete operations, ML platforms are not\n", + "\n", + "deterministic and are highly dependent on the statistical profile of the data they use. ML platforms can\n", + "\n", + "suffer performance degradation of the system due to changing data profiles. Therefore, the model has to\n", + "\n", + "be refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform\n", + "\n", + "should natively support this style of iterative data science.\n", + "\n", + "**Ethics in AI**\n", + "\n", + "As more organizations deploy data and AI solutions, there is growing concern around a number of issues\n", + "\n", + "related to ethics — in particular, how do you ensure the data and algorithms used to make decisions are\n", + "\n", + "fair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations\n", + "\n", + "must ensure that the “black box” algorithms that produce results have the transparency, interpretability and\n", + "\n", + "explainability to satisfy legal and regulatory safeguards.\n", + "\n", + "The vast majority of AI work still involves software development by human beings and the use of curated\n", + "\n", + "data sets. There is the obvious potential for bias and the application of AI in domains that are ethicallySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
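The passage above stresses that the logging aspect of the ML development lifecycle is critical, and an earlier chunk notes that MLflow is included in the Databricks Lakehouse Platform. As a rough illustration only, the sketch below shows what tracking one iteration of such a workflow could look like with the open source MLflow API; the experiment path, model choice and metric are invented for the example.

```python
# Minimal sketch: tracking one iteration of an ML workflow with MLflow.
# The experiment path, features and metric values are illustrative only.
import mlflow
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Synthetic data standing in for a curated training set.
X, y = make_classification(n_samples=1_000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

mlflow.set_experiment("/Shared/mlops_demo")  # hypothetical experiment path

with mlflow.start_run():
    params = {"n_estimators": 200, "max_depth": 5}
    model = RandomForestClassifier(**params, random_state=42).fit(X_train, y_train)

    # Log the parameters, metric and model so each iteration is reproducible.
    mlflow.log_params(params)
    mlflow.log_metric("accuracy", accuracy_score(y_test, model.predict(X_test)))
    mlflow.sklearn.log_model(model, "model")
```

Each run records the parameters, metric and model artifact, which is what makes the many iterations the text describes reproducible and comparable.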
c3945817f581aa2399eb0650e53d504fdeterministic and are highly dependent on the statistical profile of the data they use. ML platforms can\n", + "\n", + "suffer performance degradation of the system due to changing data profiles. Therefore, the model has to\n", + "\n", + "be refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform\n", + "\n", + "should natively support this style of iterative data science.\n", + "\n", + "**Ethics in AI**\n", + "\n", + "As more organizations deploy data and AI solutions, there is growing concern around a number of issues\n", + "\n", + "related to ethics — in particular, how do you ensure the data and algorithms used to make decisions are\n", + "\n", + "fair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations\n", + "\n", + "must ensure that the “black box” algorithms that produce results have the transparency, interpretability and\n", + "\n", + "explainability to satisfy legal and regulatory safeguards.\n", + "\n", + "The vast majority of AI work still involves software development by human beings and the use of curated\n", + "\n", + "data sets. There is the obvious potential for bias and the application of AI in domains that are ethically\n", + "\n", + "questionable. CDOs are faced with the added challenge of needing to be able to defend the use of AI,\n", + "\n", + "explain how it works and describe the impact of its existence on the target audience — whether internal\n", + "\n", + "workers or customers.\n", + "\n", + "**Figure 10:** Workflow for deploying production ML models (data extraction, data analysis, data preparation, model training, model evaluation, model serving and execution, model monitoring)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data and AI Maturity Model**\n", + "\n", + "When data and AI become part of the fabric of the company and the stakeholders in the organization adopt\n", + "\n", + "a data asset and AI mindset, the company moves further along a well-defined maturity curve, as shown in\n", + "\n", + "Figure 11.\n", + "\n", + "**Top-Line Categories and Ranking Criteria**\n", + "\n", + "**LOW MATURITY / VALUE**  **HIGH MATURITY / VALUE**\n", + "\n", + "1. Explore 2. Experiment 3. Formalize 4. Optimize 5. 
Transform\n", + "\n", + "\n", + "Organization is beginning\n", + "to explore big data and\n", + "AI, and understand the\n", + "possibilities and potential\n", + "of a few starter projects\n", + "and experiment\n", + "\n", + "**Figure 11:**\n", + "The Data and AI Maturity Model\n", + "\n", + "\n", + "Organization builds\n", + "the basic capabilities\n", + "and foundations to\n", + "begin exploring a more\n", + "expansive data and AI\n", + "strategy, but it lacks vision,\n", + "long-term objectives or\n", + "leadership buy-in\n", + "\n", + "\n", + "Data and AI are budding\n", + "into drivers of value for\n", + "BUs aligned to specific\n", + "projects and initiatives as\n", + "the core tenets of data\n", + "and AI are integrated into\n", + "corporate strategy\n", + "\n", + "\n", + "Data and AI are core\n", + "drivers of value across the\n", + "organization, structured\n", + "and central to corporate\n", + "strategy, with a scalable\n", + "architecture that meets\n", + "business needs and buy-in\n", + "from across the organization\n", + "\n", + "\n", + "Data and AI are at the\n", + "heart of the corporate\n", + "strategy and are\n", + "invaluable differentiators\n", + "and drivers of competitive\n", + "advantage\n", + "\n", + "\n", + "Databricks partners with its customers to enable them to do an internal self-assessment. The output of the\n", + "\n", + "self-assessment allows organizations to:\n", + "\n", + "\u0007Understand the current state of their journey to data and AI maturity\n", + "\n", + "\u0007Identify key gaps in realizing (more) value from data and AI\n", + "\n", + "\u0007Plot a path to increase maturity with specific actions\n", + "\n", + "\u0007Identify Databricks resources who can help support their journey\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 3:**\n", + "## Conclusion\n", + "\n", + "\n", + "After a decade in which most enterprises took a hybrid approach to their data architecture — and struggled\n", + "\n", + "with the complexity, cost and compromise that come with supporting both data warehouses and data lakes\n", + "\n", + "— the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical\n", + "\n", + "to future-proofing your investment and enabling data and AI at scale. The simple, open and multicloud\n", + "\n", + "architecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to\n", + "\n", + "unleash the power of your data teams to collaborate like never before — in real time, with all their data, for\n", + "\n", + "every use case.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
6e76a7c72ed1164f542aaa2f592c4c1aself-assessment allows organizations to:\n", + "\n", + "\u0007Understand the current state of their journey to data and AI maturity\n", + "\n", + "\u0007Identify key gaps in realizing (more) value from data and AI\n", + "\n", + "\u0007Plot a path to increase maturity with specific actions\n", + "\n", + "\u0007Identify Databricks resources who can help support their journey\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 3:**\n", + "## Conclusion\n", + "\n", + "\n", + "After a decade in which most enterprises took a hybrid approach to their data architecture — and struggled\n", + "\n", + "with the complexity, cost and compromise that come with supporting both data warehouses and data lakes\n", + "\n", + "— the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical\n", + "\n", + "to future-proofing your investment and enabling data and AI at scale. The simple, open and multicloud\n", + "\n", + "architecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to\n", + "\n", + "unleash the power of your data teams to collaborate like never before — in real time, with all their data, for\n", + "\n", + "every use case.\n", + "\n", + "For more information, please visit [Databricks](https://databricks.com/solutions/roles/data-leaders) or [contact us](https://databricks.com/company/contact) .\n", + "\n", + "**A B O U T T H E A U T H O R**\n", + "\n", + "Chris D’Agostino is the Global Field CTO at Databricks, having joined the company in January 2020. His role\n", + "\n", + "is to provide thought leadership and serve as a trusted advisor to our top customers, globally.\n", + "\n", + "Prior to Databricks, Chris ran a 1,000-person data engineering function for a top 10 U.S. bank. In that role,\n", + "\n", + "he led a team that was responsible for building out a modern data architecture that emphasized the key\n", + "\n", + "attributes of the lakehouse architecture.\n", + "\n", + "Chris has also held leadership roles at a number of technology companies.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf2024-09-19T16:57:23Z
031815c68bdd885e1f9e3299f1014c9f**eBook**\n", + "\n", + "## The Data Team’s Guide to the Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Contents\n", + "\n", + "\n", + "**C H A P TE R 1**\n", + "\n", + "**C H A P TE R 2**\n", + "\n", + "**C H A P TE R 3**\n", + "\n", + "**C H A P TE R 4**\n", + "\n", + "**C H A P TE R 5**\n", + "\n", + "**C H A P TE R 6**\n", + "\n", + "**C H A P TE R 7**\n", + "\n", + "**C H A P TE R 8**\n", + "\n", + "**C H A P TE R 9**\n", + "\n", + "**C H A P TE R 10**\n", + "\n", + "**C H A P TE R 11**\n", + "\n", + "**C H A P TE R 12**\n", + "\n", + "\n", + "**The data lakehouse** ...................................................................................................................................................................................... **4**\n", + "\n", + "**The Databricks Lakehouse Platform** .......................................................................................................................... **11**\n", + "\n", + "**Data reliability and performance** ................................................................................................................................... **18**\n", + "\n", + "**Unified governance and sharing for data, analytics and AI** ....................................... **28**\n", + "\n", + "**Security** .............................................................................................................................................................................................................................. **41**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
37c8d75a944e9c050eddc4388d8456e2**Security** .............................................................................................................................................................................................................................. **41**\n", + "\n", + "**Instant compute and serverless** ................................................................................................................................... **48**\n", + "\n", + "**Data warehousing** ......................................................................................................................................................................................... **52**\n", + "\n", + "**Data engineering** ............................................................................................................................................................................................. **56**\n", + "\n", + "**Data streaming** .................................................................................................................................................................................................. **68.**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
b7d5cd84a7d2802d8b2797d15d6c10b4**Data streaming** .................................................................................................................................................................................................. **68.**\n", + "\n", + "**Data science and machine learning** ........................................................................................................................ **7** **3.**\n", + "\n", + "**Databricks Technology Partners and the modern data stack** ............................ **7** **9.**\n", + "\n", + "**Get started with the Databricks Lakehouse Platform** ....................................................... **8** **1**\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "#### The Data Team’s Guide to the Databricks Lakehouse Platform\n", + "\n", + "_The Data Team’s Guide to the Databricks Lakehouse Platform_ is\n", + "designed for data practitioners and leaders who are embarking\n", + "on their journey into the data lakehouse architecture.\n", + "\n", + "In this eBook, you will learn the full capabilities of the data lakehouse architecture\n", + "and how the Databricks Lakehouse Platform helps organizations of all sizes — from\n", + "enterprises to startups in every industry — with all their data, analytics, AI and\n", + "machine learning use cases on one platform.\n", + "\n", + "You will see how the platform combines the best elements of data warehouses\n", + "and data lakes to increase the reliability, performance and scalability of your\n", + "data platform. Discover how the lakehouse simplifies complex workloads in data\n", + "engineering, data warehousing, data streaming, data science and machine learning\n", + "— and bolsters collaboration for your data teams, allowing them to maintain new\n", + "levels of governance, flexibility and agility in an open and multicloud environment.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### The data lakehouse\n", + "# 01\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The evolution of data architectures\n", + "\n", + "\n", + "Data has moved front and center within every organization as data-driven insights\n", + "have fueled innovation, competitive advantage and better customer experiences.\n", + "\n", + "However, as companies place mandates on becoming more data-driven,\n", + "their data teams are left in a sprint to deliver the right data for business\n", + "insights and innovation. With the widespread adoption of cloud, data teams\n", + "often invest in large-scale complex data systems that have capabilities for\n", + "streaming, business intelligence, analytics and machine learning to support\n", + "the overall business objectives.\n", + "\n", + "To support these objectives, data teams have deployed cloud data\n", + "\n", + "warehouses and data lakes.\n", + "\n", + "\n", + "Traditional data systems: The data warehouse and data lake\n", + "\n", + "With the advent of big data, companies began collecting large amounts of\n", + "data from many different sources, such as weblogs, sensor data and images.\n", + "Data warehouses — which have a long history as the foundation for decision\n", + "support and business intelligence applications — cannot handle large volumes\n", + "of data.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
c8188921b979381d315e5ec5ae191e05-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### The data lakehouse\n", + "# 01\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The evolution of data architectures\n", + "\n", + "\n", + "Data has moved front and center within every organization as data-driven insights\n", + "have fueled innovation, competitive advantage and better customer experiences.\n", + "\n", + "However, as companies place mandates on becoming more data-driven,\n", + "their data teams are left in a sprint to deliver the right data for business\n", + "insights and innovation. With the widespread adoption of cloud, data teams\n", + "often invest in large-scale complex data systems that have capabilities for\n", + "streaming, business intelligence, analytics and machine learning to support\n", + "the overall business objectives.\n", + "\n", + "To support these objectives, data teams have deployed cloud data\n", + "\n", + "warehouses and data lakes.\n", + "\n", + "\n", + "Traditional data systems: The data warehouse and data lake\n", + "\n", + "With the advent of big data, companies began collecting large amounts of\n", + "data from many different sources, such as weblogs, sensor data and images.\n", + "Data warehouses — which have a long history as the foundation for decision\n", + "support and business intelligence applications — cannot handle large volumes\n", + "of data.\n", + "\n", + "While data warehouses are great for structured data and historical analysis,\n", + "they weren’t designed for unstructured data, semi-structured data, and data\n", + "with high variety, velocity and volume, making them unsuitable for many types\n", + "of data.\n", + "\n", + "This led to the introduction of data lakes, providing a single repository of raw\n", + "data in a variety of formats. While suitable for storing big data, data lakes do\n", + "not support transactions, nor do they enforce data quality, and their lack of\n", + "consistency/isolation makes it almost impossible to read, write or process data.\n", + "\n", + "For these reasons, many of the promises of data lakes never materialized and,\n", + "in many cases, reduced the benefits of data warehouses.\n", + "\n", + "As companies discovered new use cases for data exploration, predictive modeling\n", + "and prescriptive analytics, the need for a single, flexible, high-performance system\n", + "only grew. Data teams require systems for diverse data applications including SQL\n", + "analytics, real-time analytics, data science and machine learning.\n", + "\n", + "\n", + "-----\n", + "\n", + "To solve for new use cases and new users, a common approach is to use multiple\n", + "systems — a data lake, several data warehouses and other specialized systems\n", + "such as streaming, time-series, graph and image databases. But having multiple\n", + "systems introduces complexity and delay, as data teams invariably need to\n", + "move or copy data between different systems, effectively losing oversight and\n", + "governance over data usage.\n", + "\n", + "\n", + "You have now duplicated data in two different systems and the changes you\n", + "make in one system are unlikely to find their way to the other. 
So, you are going\n", + "to have data drift almost immediately, not to mention paying to store the same\n", + "data multiple times.\n", + "\n", + "Then, because governance is happening at two distinct levels across these\n", + "platforms, you are not able to control things consistently.\n", + "\n", + "\n", + "**Challenges with data, analytics and AI**\n", + "\n", + "In a recent [Accenture](https://www.accenture.com/_acnmedia/pdf-108/accenture-closing-data-value-gap-fixed.pdf) study, only 32% of companies reported tangible and\n", + "measurable value from data. The challenge is that most companies continue to\n", + "implement two different platforms: data warehouses for BI and data lakes for AI.\n", + "These platforms are incompatible with each other, but data from both systems\n", + "is generally needed to deliver game-changing outcomes, which makes success\n", + "with AI extremely difficult.\n", + "\n", + "Today, most of the data is landing in the data lake, and a lot of it is unstructured.\n", + "In fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321) , about 80% of the data in any organization will be\n", + "unstructured by 2025. But, this data is where much of the value from AI resides.\n", + "Subsets of the data are then copied to the data warehouse into structured\n", + "tables, and back again in some cases.\n", + "\n", + "You also must secure and govern the data in both warehouses and offer\n", + "fine-grained governance, while lakes tend to be coarser grained at the file level.\n", + "Then, you stand up different stacks of tools on these platforms to do either\n", + "BI or AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, the tool stacks on top of these platforms\n", + "are fundamentally different, which makes it difficult\n", + "to get any kind of collaboration going between the\n", + "teams that support them.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
71d3fdc659abf200ba4c6b379ef23c9eToday, most of the data is landing in the data lake, and a lot of it is unstructured.\n", + "In fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321), about 80% of the data in any organization will be\n", + "unstructured by 2025. But, this data is where much of the value from AI resides.\n", + "Subsets of the data are then copied to the data warehouse into structured\n", + "tables, and back again in some cases.\n", + "\n", + "You also must secure and govern the data in both warehouses and offer\n", + "fine-grained governance, while lakes tend to be coarser grained at the file level.\n", + "Then, you stand up different stacks of tools on these platforms to do either\n", + "BI or AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, the tool stacks on top of these platforms\n", + "are fundamentally different, which makes it difficult\n", + "to get any kind of collaboration going between the\n", + "teams that support them.\n", + "\n", + "This is why AI efforts fail. There is a tremendous\n", + "amount of complexity and rework being introduced\n", + "into the system. Time and resources are being\n", + "wasted trying to get the right data to the right\n", + "people, and everything is happening too slowly\n", + "to get in front of the competition.\n", + "\n", + "\n", + "**Realizing this requires two disparate, incompatible data platforms** (diagram): business intelligence and SQL analytics run on the data warehouse (structured tables, with governance and security via table ACLs), while data science, ML and data streaming run on the data lake (unstructured files such as logs, text, images and video, with governance and security on files and blobs). Subsets of data are copied between the two, resulting in incomplete support for use cases, incompatible security and governance models, and disjointed, duplicative data silos.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Moving forward with a lakehouse architecture**\n", + "\n", + "To satisfy the need to support AI and BI directly on vast amounts of data stored\n", + "in data lakes (on low-cost cloud storage), a new data management architecture\n", + "emerged independently across many organizations and use cases: the\n", + "data lakehouse.\n", + "\n", + "The data lakehouse can store _all_ and _any_ type of data once in a data lake and\n", + "make that data accessible directly for AI and BI. The lakehouse paradigm has\n", + "specific capabilities to efficiently allow both AI and BI on all the enterprise’s data\n", + "at a massive scale. Namely, it has the SQL and performance capabilities such as\n", + "indexing, caching and MPP processing to make BI work fast on data lakes. It also\n", + "has direct file access and direct native support for Python, data science and AI\n", + "frameworks without the need for a separate data warehouse.\n", + "\n", + "In short, a lakehouse is a data architecture that combines the best elements\n", + "of data warehouses and data lakes. 
Lakehouses are enabled by a new system\n", + "design, which implements similar data structures and data management features\n", + "found in a data warehouse directly on the low-cost storage used for data lakes.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Data lakehouse\n", + "\n", + "One platform to unify all your data, analytics and AI workloads\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "All machine learning, SQL,\n", + "BI, and streaming use cases\n", + "\n", + "One security and governance\n", + "approach for all data assets\n", + "on all clouds\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key features for a lakehouse**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
9cabb87127bfa514fa6f498e9f2831e7In short, a lakehouse is a data architecture that combines the best elements\n", + "of data warehouses and data lakes. Lakehouses are enabled by a new system\n", + "design, which implements similar data structures and data management features\n", + "found in a data warehouse directly on the low-cost storage used for data lakes.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Data lakehouse\n", + "\n", + "One platform to unify all your data, analytics and AI workloads\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "All machine learning, SQL,\n", + "BI, and streaming use cases\n", + "\n", + "One security and governance\n", + "approach for all data assets\n", + "on all clouds\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key features for a lakehouse**\n", + "\n", + "Recent innovations with the data lakehouse architecture can help simplify\n", + "your data and AI workloads, ease collaboration for data teams, and maintain\n", + "the kind of flexibility and openness that allows your organization to stay agile\n", + "as you scale. Here are key features to consider when evaluating data lakehouse\n", + "architectures:\n", + "\n", + "Transaction support: In an enterprise lakehouse, many data pipelines will\n", + "often be reading and writing data concurrently. Support for ACID (Atomicity,\n", + "Consistency, Isolation and Durability) transactions ensures consistency as\n", + "multiple parties concurrently read or write data.\n", + "\n", + "Schema enforcement and governance: The lakehouse should have\n", + "a way to support schema enforcement and evolution, supporting data\n", + "warehouse schema paradigms such as star/snowflake. The system should\n", + "be able to reason about data integrity, and it should have robust governance\n", + "and auditing mechanisms.\n", + "\n", + "Data governance: Capabilities including auditing, retention and lineage\n", + "have become essential, particularly considering recent privacy regulations.\n", + "\n", + "Tools that allow data discovery have become popular, such as data catalogs\n", + "and data usage metrics.\n", + "\n", + "BI support: Lakehouses allow the use of BI tools directly on the source\n", + "data. This reduces staleness and latency, improves recency and lowers cost\n", + "by not having to operationalize two copies of the data in both a data lake\n", + "and a warehouse.\n", + "\n", + "\n", + "Storage decoupled from compute: In practice, this means storage and\n", + "compute use separate clusters, thus these systems can scale to many more\n", + "concurrent users and larger data sizes. 
Some modern data warehouses also\n", + "have this property.\n", + "\n", + "Openness: The storage formats, such as Apache Parquet, are open and\n", + "standardized, so a variety of tools and engines, including machine learning\n", + "and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "Support for diverse data types (unstructured and structured):\n", + "The lakehouse can be used to store, refine, analyze and access data types\n", + "needed for many new data applications, including images, video, audio,\n", + "semi-structured data and text.\n", + "\n", + "Support for diverse workloads: Use the same data repository for a range\n", + "of workloads including data science, machine learning and SQL analytics.\n", + "Multiple tools might be needed to support all these workloads.\n", + "\n", + "End-to-end streaming: Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "**Learn more**\n", + "\n", + "**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "\n", + "**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "\n", + "**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### The Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Lakehouse: A new generation of open platforms\n", + "\n", + "\n", + "###### This is the lakehouse paradigmSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
9055c5a181008db8c024cb3f2415f1ed**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "\n", + "**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### The Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Lakehouse: A new generation of open platforms\n", + "\n", + "\n", + "###### This is the lakehouse paradigm\n", + "\n", + "\n", + "Databricks is the inventor and pioneer of the\n", + "data lakehouse architecture. The data lakehouse\n", + "architecture was coined in the research paper,\n", + "[Lakehouse: A New Generation of Open Platforms that](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "[Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) ,\n", + "introduced by Databricks’ founders, UC Berkeley\n", + "and Stanford University at the 11th Conference on\n", + "Innovative Data Systems Research (CIDR) in 2021.\n", + "\n", + "At Databricks, we are continuously innovating on\n", + "the lakehouse architecture to help customers deliver\n", + "on their data, analytics and AI aspirations. The ideal\n", + "data, analytics and AI platform needs to operate\n", + "differently. Rather than copying and transforming\n", + "data in multiple systems, you need one platform\n", + "that accommodates all data types.\n", + "\n", + "\n", + "**Data science** **Data**\n", + "**and ML** **streaming**\n", + "\n", + "\n", + "**All ML, SQL, BI**\n", + "**and streaming use cases**\n", + "\n", + "**One security and governance**\n", + "**approach for all data assets**\n", + "**on all clouds**\n", + "\n", + "**A reliable data platform**\n", + "**to efficiently handle**\n", + "**all data types**\n", + "\n", + "\n", + "**Persona-based**\n", + "**use cases**\n", + "\n", + "**Unity Catalog**\n", + "Fine-grained governance\n", + "for data and AI\n", + "\n", + "**Delta Lake**\n", + "Data reliability and performance\n", + "\n", + "\n", + "**Business**\n", + "**intelligence**\n", + "\n", + "\n", + "**SQL**\n", + "**analytics**\n", + "\n", + "\n", + "Files and blobs and table ACLs\n", + "\n", + "\n", + "Ideally, the platform must be open, so that you\n", + "are not locked into any walled gardens. You would\n", + "also have one security and governance model.\n", + "It would not only manage all data types, but it\n", + "would also be cloud-agnostic to govern data\n", + "wherever it is stored.\n", + "\n", + "Last, it would support all major data, analytics and AI\n", + "workloads, so that your teams can easily collaborate\n", + "and get access to all the data they need to innovate.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is the Databricks Lakehouse Platform?\n", + "\n", + "The Databricks Lakehouse Platform unifies your\n", + "data warehousing and AI uses cases on a single\n", + "platform. 
It combines the best elements of data\n", + "lakes and data warehouses to deliver the reliability,\n", + "strong governance and performance of data\n", + "warehouses with the openness, flexibility and\n", + "machine learning support of data lakes.\n", + "\n", + "This unified approach simplifies your modern data\n", + "stack by eliminating the data silos that traditionally\n", + "separate and complicate data engineering, analytics,\n", + "BI, data science and machine learning. It’s built\n", + "on open source and open standards to maximize\n", + "flexibility. And, its common approach to data\n", + "management, security and governance helps you\n", + "\n", + "operate more efficiently and innovate faster.\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "Data Data Data Data science\n", + "warehousing engineering streaming and ML\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Benefits of the Databricks Lakehouse Platform\n", + "\n", + "\n", + "**Simple**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
887cf23e9a8e1aeb0ddd15dc7f5db80d-----\n", + "\n", + "#### What is the Databricks Lakehouse Platform?\n", + "\n", + "The Databricks Lakehouse Platform unifies your\n", + "data warehousing and AI uses cases on a single\n", + "platform. It combines the best elements of data\n", + "lakes and data warehouses to deliver the reliability,\n", + "strong governance and performance of data\n", + "warehouses with the openness, flexibility and\n", + "machine learning support of data lakes.\n", + "\n", + "This unified approach simplifies your modern data\n", + "stack by eliminating the data silos that traditionally\n", + "separate and complicate data engineering, analytics,\n", + "BI, data science and machine learning. It’s built\n", + "on open source and open standards to maximize\n", + "flexibility. And, its common approach to data\n", + "management, security and governance helps you\n", + "\n", + "operate more efficiently and innovate faster.\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "Data Data Data Data science\n", + "warehousing engineering streaming and ML\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Benefits of the Databricks Lakehouse Platform\n", + "\n", + "\n", + "**Simple**\n", + "\n", + "The unified approach simplifies your data\n", + "architecture by eliminating the data silos that\n", + "traditionally separate analytics, BI, data science\n", + "and machine learning. With a lakehouse, you\n", + "can eliminate the complexity and expense that\n", + "make it hard to achieve the full potential of\n", + "your analytics and AI initiatives.\n", + "\n", + "\n", + "**Open**\n", + "\n", + "Delta Lake forms the open foundation of\n", + "the lakehouse by providing reliability and\n", + "performance directly on data in the data\n", + "lake. You’re able to avoid proprietary walled\n", + "gardens, easily share data and build your\n", + "modern data stack with unrestricted access\n", + "to the ecosystem of open source data projects\n", + "and the broad Databricks partner network.\n", + "\n", + "\n", + "**Multicloud**\n", + "\n", + "The Databricks Lakehouse Platform offers\n", + "you a consistent management, security and\n", + "governance experience across all clouds. You\n", + "do not need to invest in reinventing processes\n", + "for every cloud platform that you are using to\n", + "support your data and AI efforts. Instead, your\n", + "data teams can simply focus on putting all\n", + "your data to work to discover new insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The Databricks Lakehouse Platform architecture\n", + "\n", + "**Data reliability and performance for lakehouse**\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format storage layer built for the lakehouse that integrates\n", + "with all major analytics tools and works with the widest variety of formats to\n", + "store and process data.\n", + "\n", + "\n", + "**Instant compute and serverless**\n", + "\n", + "Serverless compute is a fully managed service where Databricks provisions and\n", + "manages the compute layer on behalf of the customer in the Databricks cloud\n", + "account instead of the customer account. 
As of the current release, serverless\n", + "compute is supported for use with Databricks SQL.\n", + "\n", + "In Chapter 6, we explore the details of instant compute and serverless for lakehouse.\n", + "\n", + "\n", + "[Photon](https://databricks.com/product/photon) is the next-generation query engine built for the lakehouse that leverages\n", + "a state-of-the-art vectorized engine for fast querying and provides the best\n", + "performance for all workloads in the lakehouse.\n", + "\n", + "In Chapter 3, we explore the details of data reliability and performance\n", + "\n", + "for the lakehouse.\n", + "\n", + "**Unified governance and security for lakehouse**\n", + "\n", + "The Databricks Lakehouse Platform provides unified governance with enterprise\n", + "scale, security and compliance. The [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (UC) provides\n", + "governance for your data and AI assets in the lakehouse — files, tables,\n", + "dashboards, and machine learning models — giving you much better control,\n", + "management and security across clouds.\n", + "\n", + "[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\n", + "data across the organization in real time, independent of the platform\n", + "on which the data resides.\n", + "\n", + "In Chapter 4, we go into the details of unified governance for lakehouse\n", + "\n", + "and, in Chapter 5, we dive into the details of security for lakehouse.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The Databricks Lakehouse Platform workloads\n", + "\n", + "The Databricks Lakehouse Platform architecture supports different workloads\n", + "such as data warehousing, data engineering, data streaming, data science and\n", + "machine learning on one simple, open and multicloud data platform.\n", + "\n", + "**Data warehousing**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
95360c98bed3a80e5d35c9f6e1347456[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\n", + "data across the organization in real time, independent of the platform\n", + "on which the data resides.\n", + "\n", + "In Chapter 4, we go into the details of unified governance for lakehouse\n", + "\n", + "and, in Chapter 5, we dive into the details of security for lakehouse.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The Databricks Lakehouse Platform workloads\n", + "\n", + "The Databricks Lakehouse Platform architecture supports different workloads\n", + "such as data warehousing, data engineering, data streaming, data science and\n", + "machine learning on one simple, open and multicloud data platform.\n", + "\n", + "**Data warehousing**\n", + "\n", + "Data warehousing is one of the most business-critical workloads for data teams,\n", + "and the best data warehouse is a lakehouse. The Databricks Lakehouse Platform\n", + "lets you run all your SQL and BI applications at scale with up to 12x better price/\n", + "performance, a unified governance model, open formats and APIs, and your tools\n", + "of choice — no lock-in. Reduce resource management overhead with serverless\n", + "compute, and easily ingest, transform and query all your data in-place to deliver\n", + "real-time business insights faster.\n", + "\n", + "Built on open standards and APIs, the Databricks Lakehouse Platform provides\n", + "the reliability, quality and performance that data lakes natively lack, plus\n", + "integrations with the ecosystem for maximum flexibility.\n", + "\n", + "In Chapter 7, we go into the details of data warehousing on the lakehouse.\n", + "\n", + "**Data engineering**\n", + "\n", + "Data engineering on the lakehouse allows data teams to unify batch and\n", + "streaming operations on a simplified architecture, streamline data pipeline\n", + "development and testing, build reliable data, analytics and AI workflows on\n", + "any cloud platform, and meet regulatory requirements to maintain governance.\n", + "\n", + "\n", + "automates the complexity of building and maintaining pipelines and running ETL\n", + "workloads so data engineers and analysts can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "In Chapter 8, we go into the details of data engineering on the lakehouse.\n", + "\n", + "**Data streaming**\n", + "\n", + "[Data streaming](https://www.databricks.com/product/data-streaming) is one of the fastest growing workloads within the Databricks\n", + "Lakehouse Platform and is the future of all data processing. Real-time processing\n", + "provides the freshest possible data to an organization’s analytics and machine\n", + "learning models enabling them to make better, faster decisions, more accurate\n", + "predictions, offer improved customer experiences and more.\n", + "\n", + "The Databricks Lakehouse Platform Dramatically simplifies data streaming to\n", + "deliver real-time analytics, machine learning and applications on one platform.\n", + "\n", + "In Chapter 9, we go into the details of data streaming on the lakehouse.\n", + "\n", + "**Data science and machine learning**\n", + "\n", + "Data science and machine learning (DSML) on the lakehouse is a powerful\n", + "workload that is unique to many other data offerings. DSML on the lakehouse\n", + "provides a data-native and collaborative solution for the full ML lifecycle. 
It\n", + "can maximize data and ML team productivity, streamline collaboration, empower\n", + "ML teams to prepare, process and manage data in a self-service manner,\n", + "and standardize the ML lifecycle from experimentation to production.\n", + "\n", + "In Chapter 10, we go into the details of DSML on the lakehouse.\n", + "\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform that\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks Lakehouse Platform and your**\n", + "**modern data stack**\n", + "\n", + "The Databricks Lakehouse Platform is open and provides the flexibility to\n", + "continue using existing infrastructure, to easily share data and build your modern\n", + "data stack with unrestricted access to the ecosystem of open source data\n", + "projects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\n", + "\n", + "In Chapter 11, we go into the details of our technology partners and the\n", + "\n", + "modern data stack.\n", + "\n", + "#### Global adoption of the Databricks Lakehouse PlatformSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
257465947dbab17362be1c00ec93dd4cIn Chapter 10, we go into the details of DSML on the lakehouse.\n", + "\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform that\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks Lakehouse Platform and your**\n", + "**modern data stack**\n", + "\n", + "The Databricks Lakehouse Platform is open and provides the flexibility to\n", + "continue using existing infrastructure, to easily share data and build your modern\n", + "data stack with unrestricted access to the ecosystem of open source data\n", + "projects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\n", + "\n", + "In Chapter 11, we go into the details of our technology partners and the\n", + "\n", + "modern data stack.\n", + "\n", + "#### Global adoption of the Databricks Lakehouse Platform\n", + "\n", + "\n", + "Today, Databricks has over 7,000 [customers](https://databricks.com/customers) , from Fortune 500 to unicorns\n", + "across industries doing transformational work. Organizations around the globe\n", + "are driving change and delivering a new generation of data, analytics and AI\n", + "applications. We believe that the unfulfilled promise of data and AI can finally\n", + "be fulfilled with one platform for data analytics, data science and machine\n", + "learning with the Databricks Lakehouse Platform.\n", + "\n", + "\n", + "**Learn more**\n", + "\n", + "[Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)\n", + "\n", + "[Databricks Lakehouse Platform Demo Hub](https://databricks.com/discover/demos)\n", + "\n", + "[Databricks Lakehouse Platform Customer Stories](https://databricks.com/customers)\n", + "\n", + "[Databricks Lakehouse Platform Documentation](https://databricks.com/documentation)\n", + "\n", + "[Databricks Lakehouse Platform Training and Certification](https://databricks.com/learn/training/home)\n", + "\n", + "[Databricks Lakehouse Platform Resources](https://databricks.com/resources)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 03\n", + "\n", + "\n", + "### Data reliability and performance\n", + "\n", + "To bring openness, reliability and lifecycle management to data lakes,\n", + "the Databricks Lakehouse Platform is built on the foundation of Delta\n", + "Lake. 
Delta Lake solves challenges around unstructured/structured data\n", + "ingestion, the application of data quality, difficulties with deleting data for\n", + "compliance or issues with modifying data for data capture.\n", + "\n", + "Although data lakes are great solutions for holding large quantities of raw\n", + "data, they lack important attributes for data reliability and quality and\n", + "often don’t offer good performance when compared to data warehouses.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Problems with today’s data lakes\n", + "\n", + "When it comes to data reliability and quality, examples of these\n", + "missing attributes include:\n", + "\n", + "**•** **Lack of ACID transactions:** Makes it impossible to mix updates,\n", + "appends and reads\n", + "\n", + "**•** **Lack of schema enforcement:** Creates inconsistent and low-quality data.\n", + "For example, rejecting writes that don’t match a table’s schema.\n", + "\n", + "**•** **Lack of integration with data catalog:** Results in dark data and no single\n", + "source of truth\n", + "\n", + "Even just the absence of these three attributes can cause a lot of extra work\n", + "for data engineers as they strive to ensure consistent high-quality data in the\n", + "pipelines they create.\n", + "\n", + "\n", + "These challenges are solved with two key technologies that are at the foundation\n", + "of the lakehouse: Delta Lake and Photon.\n", + "\n", + "**What is Delta Lake?**\n", + "\n", + "Delta Lake is a file-based, open source storage format that provides ACID\n", + "transactions and scalable metadata handling, and unifies streaming and batch\n", + "data processing. It runs on top of existing data lakes and is compatible with\n", + "Apache Spark™ and other processing engines.\n", + "\n", + "Delta Lake uses Delta Tables which are based on Apache Parquet, a commonly\n", + "used format for structured data already utilized by many organizations. Therefore,\n", + "switching existing Parquet tables to Delta Tables is easy and quick. Delta\n", + "Tables can also be used with semi-structured and unstructured data, providing\n", + "versioning, reliability, metadata management, and time travel capabilities that\n", + "make these types of data easily managed as well.\n", + "\n", + "\n", + "As for performance, data lakes use object storage, so data is mostly kept in\n", + "immutable files leading to the following problems:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
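The passage above notes that switching existing Parquet tables to Delta Tables is easy and quick. A minimal sketch of what that switch can look like with PySpark follows; the storage paths and table name are hypothetical.

```python
# Minimal sketch: switching existing Parquet data to Delta Lake.
# Paths and table names are hypothetical.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Option 1: convert a directory of Parquet files to Delta in place.
spark.sql("CONVERT TO DELTA parquet.`/mnt/raw/events`")

# Option 2: read Parquet and rewrite it as a new Delta table.
(
    spark.read.parquet("/mnt/raw/clicks")
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("bronze.clicks")
)
```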
7432d9cdd8951d10673fa8db5f963e39Even just the absence of these three attributes can cause a lot of extra work\n", + "for data engineers as they strive to ensure consistent high-quality data in the\n", + "pipelines they create.\n", + "\n", + "\n", + "These challenges are solved with two key technologies that are at the foundation\n", + "of the lakehouse: Delta Lake and Photon.\n", + "\n", + "**What is Delta Lake?**\n", + "\n", + "Delta Lake is a file-based, open source storage format that provides ACID\n", + "transactions and scalable metadata handling, and unifies streaming and batch\n", + "data processing. It runs on top of existing data lakes and is compatible with\n", + "Apache Spark™ and other processing engines.\n", + "\n", + "Delta Lake uses Delta Tables which are based on Apache Parquet, a commonly\n", + "used format for structured data already utilized by many organizations. Therefore,\n", + "switching existing Parquet tables to Delta Tables is easy and quick. Delta\n", + "Tables can also be used with semi-structured and unstructured data, providing\n", + "versioning, reliability, metadata management, and time travel capabilities that\n", + "make these types of data easily managed as well.\n", + "\n", + "\n", + "As for performance, data lakes use object storage, so data is mostly kept in\n", + "immutable files leading to the following problems:\n", + "\n", + "**•** **Ineffective partitioning:** In many cases, data engineers resort to “poor man’s”\n", + "indexing practices in the form of partitioning that leads to hundreds of dev hours\n", + "spent tuning file sizes to improve read/write performance. Often, partitioning\n", + "proves to be ineffective over time if the wrong field was selected for partitioning\n", + "or due to high cardinality columns.\n", + "\n", + "**•** **Too many small files:** With no support for transactions, appending new data\n", + "takes the form of adding more and more files, leading to “small file problems,”\n", + "a known root cause of query performance degradation.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake features**\n", + "\n", + "\n", + "**ACID guarantees**\n", + "\n", + "Delta Lake ensures that all data changes\n", + "written to storage are committed for durability\n", + "and made visible to readers atomically. In other\n", + "words, no more partial or corrupted files.\n", + "\n", + "**Scalable data and metadata handling**\n", + "\n", + "Since Delta Lake is built on data lakes, all reads\n", + "and writes using Spark or other distributed\n", + "processing engines are inherently scalable to\n", + "petabyte-scale. However, unlike most other\n", + "storage formats and query engines, Delta Lake\n", + "leverages Spark to scale out all the metadata\n", + "processing, thus efficiently handling metadata\n", + "of billions of files for petabyte-scale tables.\n", + "\n", + "\n", + "**Audit history and time travel**\n", + "\n", + "The Delta Lake transaction log records details\n", + "about every change made to data, providing a full\n", + "audit trail of the changes. These data snapshots\n", + "allow developers to access and revert to earlier\n", + "versions of data for audits, rollbacks or to\n", + "reproduce experiments.\n", + "\n", + "**Schema enforcement and schema evolution**\n", + "\n", + "Delta Lake automatically prevents the insertion of\n", + "data with an incorrect schema, i.e., not matching\n", + "the table schema. 
And when needed, it allows the\n", + "table schema to be explicitly and safely evolved to\n", + "accommodate ever-changing data.\n", + "\n", + "\n", + "**Support for deletes, updates and merges**\n", + "\n", + "Most distributed processing frameworks do not\n", + "support atomic data modification operations on\n", + "data lakes. Delta Lake supports merge, update\n", + "and delete operations to enable complex use\n", + "cases including but not limited to change data\n", + "capture (CDC), slowly changing dimension (SCD)\n", + "operations and streaming upserts.\n", + "\n", + "**Streaming and batch unification**\n", + "\n", + "A Delta Lake table can work both in batch\n", + "and as a streaming source and sink. The\n", + "ability to work across a wide variety of latencies,\n", + "ranging from streaming data ingestion to batch\n", + "historic backfill, to interactive queries all work\n", + "out of the box.\n", + "\n", + "\n", + "-----\n", + "\n", + "**The Delta Lake transaction log**\n", + "\n", + "A key to understanding how Delta Lake provides all these capabilities is the\n", + "transaction log. The Delta Lake transaction log is the common thread that runs\n", + "through many of Delta Lake’s most notable features, including ACID transactions,\n", + "scalable metadata handling, time travel and more. The Delta Lake transaction log\n", + "is an ordered record of every transaction that has ever been performed on\n", + "a Delta Lake table since its inception.\n", + "\n", + "Delta Lake is built on top of Spark to allow multiple readers and writers of a\n", + "given table to work on a table at the same time. To always show users correct\n", + "views of the data, the transaction log serves as a single source of truth: the\n", + "central repository that tracks all changes that users make to the table.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
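To ground the features listed above (merge/update/delete, time travel, schema evolution), here is an illustrative sketch using hypothetical table names on a Delta-enabled Spark session; `new_batch_df` is an assumed incoming DataFrame, not something defined in this guide:

```python
# Illustrative sketch of the Delta Lake features described above.
# Table names and new_batch_df are hypothetical.

# Upsert (MERGE) — e.g., applying change data capture records atomically:
spark.sql("""
  MERGE INTO customers AS t
  USING customer_updates AS s
  ON t.customer_id = s.customer_id
  WHEN MATCHED THEN UPDATE SET *
  WHEN NOT MATCHED THEN INSERT *
""")

# Time travel — read an earlier version of the table for audits or rollback:
v0 = spark.read.format("delta").option("versionAsOf", 0).table("customers")

# Schema evolution — allow new columns in an incoming batch to be merged
# into the table schema instead of being rejected:
(new_batch_df.write.format("delta")
    .mode("append")
    .option("mergeSchema", "true")
    .saveAsTable("customers"))
```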
15911b1a07f4772456ab5e2e5b11cee7**Streaming and batch unification**\n", + "\n", + "A Delta Lake table can work both in batch\n", + "and as a streaming source and sink. The\n", + "ability to work across a wide variety of latencies,\n", + "ranging from streaming data ingestion to batch\n", + "historic backfill, to interactive queries all work\n", + "out of the box.\n", + "\n", + "\n", + "-----\n", + "\n", + "**The Delta Lake transaction log**\n", + "\n", + "A key to understanding how Delta Lake provides all these capabilities is the\n", + "transaction log. The Delta Lake transaction log is the common thread that runs\n", + "through many of Delta Lake’s most notable features, including ACID transactions,\n", + "scalable metadata handling, time travel and more. The Delta Lake transaction log\n", + "is an ordered record of every transaction that has ever been performed on\n", + "a Delta Lake table since its inception.\n", + "\n", + "Delta Lake is built on top of Spark to allow multiple readers and writers of a\n", + "given table to work on a table at the same time. To always show users correct\n", + "views of the data, the transaction log serves as a single source of truth: the\n", + "central repository that tracks all changes that users make to the table.\n", + "\n", + "When a user reads a Delta Lake table for the first time or runs a new query on\n", + "an open table that has been modified since the last time it was read, Spark\n", + "checks the transaction log to see what new transactions are posted to the table.\n", + "Then, Spark updates the table with those recent changes. This ensures that a\n", + "user’s version of a table is always synchronized with the master record as of the\n", + "most recent query, and that users cannot make divergent, conflicting changes\n", + "to a table.\n", + "\n", + "\n", + "**Flexibility and broad industry support**\n", + "\n", + "Delta Lake is an open source project, with an engaged community of\n", + "contributors building and growing the Delta Lake ecosystem atop a set of open\n", + "APIs and is part of the Linux Foundation. With the growing adoption of Delta Lake\n", + "as an open storage standard in different environments and use cases, comes a\n", + "broad set of integration with industry-leading tools, technologies and formats.\n", + "\n", + "Organizations leveraging Delta Lake on the Databricks Lakehouse Platform gain\n", + "flexibility in how they ingest, store and query data. They are not limited in storing\n", + "data in a single cloud provider and can implement a true multicloud approach to\n", + "data storage.\n", + "\n", + "Connectors to tools, such as Fivetran, allow you to leverage Databricks’\n", + "ecosystem of partner solutions, so organizations have full control of building the\n", + "right ingestion pipelines for their use cases. Finally, consuming data via queries\n", + "for exploration or business intelligence (BI) is also flexible and open.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake integrates with all major analytics tools**\n", + "\n", + "Eliminates unnecessary data movement and duplication\n", + "\n", + "\n", + "-----\n", + "\n", + "In addition to a wide ecosystem of tools and technologies, Delta Lake supports\n", + "a broad set of data formats for structured, semi-structured and unstructured\n", + "data. 
These formats include image binary data that can be stored in Delta\n", + "Tables, graph data format, geospatial data types and key-value stores.\n", + "\n", + "**Learn more**\n", + "\n", + "[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "[Documentation](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n", + "\n", + "\n", + "**What is Photon?**\n", + "\n", + "As many organizations standardize on the lakehouse paradigm, this new\n", + "architecture poses challenges with the underlying query execution engine\n", + "for accessing and processing structured and unstructured data. The execution\n", + "engine needs to provide the performance of a data warehouse and the scalability\n", + "of data lakes.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
fd80575f4533ded58655e4616d3441e4**Learn more**\n", + "\n", + "[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "[Documentation](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n", + "\n", + "\n", + "**What is Photon?**\n", + "\n", + "As many organizations standardize on the lakehouse paradigm, this new\n", + "architecture poses challenges with the underlying query execution engine\n", + "for accessing and processing structured and unstructured data. The execution\n", + "engine needs to provide the performance of a data warehouse and the scalability\n", + "of data lakes.\n", + "\n", + "Photon is the next-generation query engine on the Databricks Lakehouse\n", + "Platform that provides dramatic infrastructure cost savings and speedups for\n", + "all use cases — from data ingestion, ETL, streaming, data science and interactive\n", + "queries — directly on your data lake. Photon is compatible with Spark APIs and\n", + "implements a more general execution framework that allows efficient processing\n", + "of data with support of the Spark API. This means getting started is as easy as\n", + "turning it on — no code change and no lock-in. With Photon, typical customers are\n", + "seeing up to 80% TCO savings over traditional Databricks Runtime (Spark) and up\n", + "to 85% reduction in VM compute hours.\n", + "\n", + "Spark instructions Photon instructions\n", + "\n", + "\n", + "Photon engine\n", + "\n", + "\n", + "Delta/Parquet\n", + "\n", + "Photon writer\n", + "to Delta/Parquet\n", + "\n", + "\n", + "-----\n", + "\n", + "Why process queries with Photon?\n", + "\n", + "\n", + "Query performance on Databricks has steadily increased over the years,\n", + "powered by Spark and thousands of optimizations packaged as part of the\n", + "Databricks Runtime (DBR). 
Photon provides an additional 2x speedup per the\n", + "TPC-DS 1TB benchmark compared to the latest DBR versions.\n", + "\n", + "**Relative speedup to DBR 2.1 by DBR version**\n", + "Higher is better\n", + "\n", + "\n", + "**Customers have observed significant speedups using**\n", + "**Photon on workloads such as:**\n", + "\n", + "**•** **SQL-based jobs:** Accelerate large-scale production jobs on\n", + "SQL and Spark DataFrames\n", + "\n", + "**•** **IoT use cases:** Faster time-series analysis using Photon\n", + "compared to Spark and traditional Databricks Runtime\n", + "\n", + "**•** **Data privacy and compliance:** Query petabytes-scale data\n", + "sets to identify and delete records without duplicating data\n", + "with Delta Lake, production jobs and Photon\n", + "\n", + "**•** **Loading data into Delta and Parquet:** Vectorized I/O\n", + "speeds up data loads for Delta and Parquet tables, lowering\n", + "overall runtime and costs of data engineering jobs\n", + "\n", + "\n", + "Release date - DBR version (TPC-DS 1TB 10 x i3xl)\n", + "\n", + "\n", + "-----\n", + "\n", + "**100TB TPC-DS price/performance**\n", + "Lower is better\n", + "\n", + "\n", + "Best price/performance for analytics\n", + "in the cloud\n", + "\n", + "Written from the ground up in C++, Photon takes\n", + "advantage of modern hardware for faster queries,\n", + "providing up to 12x better price/performance\n", + "compared to other cloud data warehouses —\n", + "all natively on your data lake.\n", + "\n", + "\n", + "Databricks SQL Databricks SQL Cloud data Cloud data Cloud data\n", + "spot on-demand warehouse 1 warehouse 2 warehouse 3\n", + "\n", + "**System**\n", + "\n", + "\n", + "-----\n", + "\n", + "Works with your existing code\n", + "and avoids vendor lock-in\n", + "\n", + "Photon is designed to be compatible with the\n", + "Apache Spark DataFrame and SQL APIs to ensure\n", + "workloads run seamlessly without code changes.\n", + "All you do is turn it on. Photon will seamlessly\n", + "coordinate work and resources and transparently\n", + "accelerate portions of your SQL and Spark queries.\n", + "No tuning or user intervention required.\n", + "\n", + "\n", + "**Photon in the Databricks Lakehouse Platform**\n", + "\n", + "**Client: submit SQL**\n", + "\n", + "Parsing\n", + "Catalyst: analysis/\n", + "planning/optimization\n", + "scheduling\n", + "\n", + "Execute task Execute task Execute task Execute task\n", + "\n", + "_Lifecycle of a Photon query_\n", + "\n", + "\n", + "Spark\n", + "driver\n", + "JVMSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
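Since the text stresses that enabling Photon requires no code changes, the switch happens at cluster (or SQL warehouse) configuration time rather than in application code. A hedged sketch of an illustrative Clusters API request body follows; the `spark_version` and `node_type_id` values are placeholders, and the field names reflect the public Clusters API as I understand it, so they should be checked against current documentation:

```python
# Hedged sketch: Photon is turned on per cluster, not in code. This is an
# illustrative request body for POST /api/2.1/clusters/create; runtime and
# instance type values are placeholders.
photon_cluster = {
    "cluster_name": "photon-etl",
    "spark_version": "14.3.x-scala2.12",   # any Photon-capable Databricks Runtime
    "node_type_id": "i3.xlarge",           # cloud-specific instance type
    "num_workers": 2,
    "runtime_engine": "PHOTON",            # switch from the default JVM engine
}
```

Existing Spark SQL and DataFrame code then runs unchanged on the Photon-enabled cluster.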
075b546940802feb12ef774f8983d5e5Written from the ground up in C++, Photon takes\n", + "advantage of modern hardware for faster queries,\n", + "providing up to 12x better price/performance\n", + "compared to other cloud data warehouses —\n", + "all natively on your data lake.\n", + "\n", + "\n", + "Databricks SQL Databricks SQL Cloud data Cloud data Cloud data\n", + "spot on-demand warehouse 1 warehouse 2 warehouse 3\n", + "\n", + "**System**\n", + "\n", + "\n", + "-----\n", + "\n", + "Works with your existing code\n", + "and avoids vendor lock-in\n", + "\n", + "Photon is designed to be compatible with the\n", + "Apache Spark DataFrame and SQL APIs to ensure\n", + "workloads run seamlessly without code changes.\n", + "All you do is turn it on. Photon will seamlessly\n", + "coordinate work and resources and transparently\n", + "accelerate portions of your SQL and Spark queries.\n", + "No tuning or user intervention required.\n", + "\n", + "\n", + "**Photon in the Databricks Lakehouse Platform**\n", + "\n", + "**Client: submit SQL**\n", + "\n", + "Parsing\n", + "Catalyst: analysis/\n", + "planning/optimization\n", + "scheduling\n", + "\n", + "Execute task Execute task Execute task Execute task\n", + "\n", + "_Lifecycle of a Photon query_\n", + "\n", + "\n", + "Spark\n", + "driver\n", + "JVM\n", + "\n", + "Spark\n", + "executors mixed\n", + "JVM/Native\n", + "\n", + "\n", + "-----\n", + "\n", + "Optimizing for all data use cases\n", + "and workloads\n", + "\n", + "Photon is the first purpose-built lakehouse engine\n", + "designed to accelerate all data and analytics\n", + "workloads: data ingestion, ETL, streaming, data\n", + "science, and interactive queries. While we started\n", + "Photon primarily focused on SQL to provide\n", + "customers with world-class data warehousing\n", + "performance on their data lakes, we’ve significantly\n", + "increased the scope of ingestion sources, formats,\n", + "APIs and methods supported by Photon since\n", + "then. As a result, customers have seen dramatic\n", + "infrastructure cost savings and speedups on\n", + "Photon across all their modern Spark (e.g., Spark\n", + "SQL and DataFrame) workloads.\n", + "\n", + "\n", + "Query optimizer\n", + "\n", + "Native execution engine\n", + "\n", + "Caching\n", + "\n", + "\n", + "_Accelerating all workloads on the lakehouse_\n", + "\n", + "**Learn more**\n", + "\n", + "[Announcing Photon Public Preview: The Next-Generation](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n", + "[Query Engine on the Databricks Lakehouse Platform](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n", + "\n", + "[Databricks Sets Official Data Warehousing Performance Record](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 04\n", + "\n", + "\n", + "### Unified governance and sharing for data, analytics and AI\n", + "\n", + "Today, more and more organizations recognize the importance of making\n", + "high-quality data readily available to data teams to drive actionable insights\n", + "and business value. At the same time, organizations also understand the risks\n", + "of data breaches which negatively impact brand value and inevitably lead to\n", + "erosion of customer trust. 
Governance is one of the most critical components\n", + "of a lakehouse data platform architecture; it helps ensure that data assets\n", + "are securely managed throughout the enterprise. However, many companies\n", + "are using different incompatible governance models leading to complex and\n", + "expensive solutions.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key challenges with data and AI governance\n", + "\n", + "**Diversity of data and AI assets**\n", + "\n", + "The increased use of data and the added complexity of the data landscape\n", + "have left organizations with a difficult time managing and governing all types\n", + "of their data-related assets. No longer is data stored in files or tables. Data\n", + "assets today take many forms, including dashboards, machine learning models\n", + "and unstructured data like video and images that legacy data governance\n", + "solutions simply are not built to govern and manage.\n", + "\n", + "\n", + "**Rising multicloud adoption**\n", + "\n", + "More and more organizations now leverage a multicloud strategy to optimize\n", + "costs, avoid vendor lock-in, and meet compliance and privacy regulations. With\n", + "nonstandard, cloud-specific governance models, data governance across clouds\n", + "is complex and requires familiarity with cloud-specific security and governance\n", + "concepts, such as identity and access management (IAM).SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
7c7d19c2aca4f65da5a91323d2845774-----\n", + "\n", + "#### Key challenges with data and AI governance\n", + "\n", + "**Diversity of data and AI assets**\n", + "\n", + "The increased use of data and the added complexity of the data landscape\n", + "have left organizations with a difficult time managing and governing all types\n", + "of their data-related assets. No longer is data stored in files or tables. Data\n", + "assets today take many forms, including dashboards, machine learning models\n", + "and unstructured data like video and images that legacy data governance\n", + "solutions simply are not built to govern and manage.\n", + "\n", + "\n", + "**Rising multicloud adoption**\n", + "\n", + "More and more organizations now leverage a multicloud strategy to optimize\n", + "costs, avoid vendor lock-in, and meet compliance and privacy regulations. With\n", + "nonstandard, cloud-specific governance models, data governance across clouds\n", + "is complex and requires familiarity with cloud-specific security and governance\n", + "concepts, such as identity and access management (IAM).\n", + "\n", + "**Disjointed tools for data governance on the lakehouse**\n", + "\n", + "Today, data teams must deal with a myriad of fragmented tools and services for\n", + "their data governance requirements, such as data discovery, cataloging, auditing,\n", + "sharing, access controls, etc. This inevitably leads to operational inefficiencies\n", + "and poor performance due to multiple integration points and network latency\n", + "between the services.\n", + "\n", + "\n", + "**Two disparate and incompatible data platforms**\n", + "\n", + "Organizations today use two different platforms for their data analytics and\n", + "AI efforts — data warehouses for BI and data lakes for AI. This results in data\n", + "replication across two platforms, presenting a major governance challenge.\n", + "With no unified view of the data landscape, it is difficult to see where data is\n", + "stored, who has access to what data, and consistently define and enforce data\n", + "access policies across the two platforms with different governance models.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### One security and governance approach\n", + "\n", + "Lakehouse systems provide a uniform way to manage access control, data\n", + "quality and compliance across all of an organization’s data using standard\n", + "interfaces similar to those in data warehouses by adding a management\n", + "interface on top of data lake storage.\n", + "\n", + "Modern lakehouse systems support fine-grained (row, column and view level)\n", + "access control via SQL, query auditing, attribute-based access control, data\n", + "versioning and data quality constraints and monitoring. These features are\n", + "generally provided using standard interfaces familiar to database administrators\n", + "(for example, SQL GRANT commands) to allow existing personnel to manage\n", + "all the data in an organization in a uniform way. Centralizing all the data in\n", + "a lakehouse system with a single management interface also reduces the\n", + "administrative burden and potential for error that comes with managing\n", + "multiple separate systems.\n", + "\n", + "\n", + "#### What is Unity Catalog?\n", + "\n", + "Unity Catalog is a unified governance solution for all data, analytics and AI\n", + "assets including files, tables, dashboards and machine learning models in your\n", + "lakehouse on any cloud. 
Unity Catalog simplifies governance by empowering\n", + "data teams with a common governance model based on ANSI-SQL to define\n", + "and enforce fine-grained access controls. With attribute-based access controls,\n", + "data administrators can enable fine-grained access controls on rows and\n", + "columns using tags (attributes). Built-in data search and discovery allows\n", + "data teams to quickly find and reference relevant data for any use case. Unity\n", + "Catalog offers automated data lineage for all workloads in SQL, R, Scala and\n", + "Python, to build a better understanding of the data and its flow in the lakehouse.\n", + "Unity Catalog also allows data sharing across or within organizations and\n", + "seamless integrations with your existing data governance tools.\n", + "\n", + "With Unity Catalog, data teams can simplify governance for all data and AI\n", + "assets with one consistent model to discover, access and share data, giving\n", + "you much better native performance, management and security across clouds.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key benefits**\n", + "\n", + "\n", + "The common metadata layer for cross-workspace metadata is at the account\n", + "level and eases collaboration by allowing different workspaces to access Unity\n", + "Catalog metadata through a common interface and break down data silos.\n", + "Further, the data permissions in Unity Catalog are applied to account-level\n", + "identities, rather than identities that are local to a workspace, allowing\n", + "a consistent view of users and groups across all workspaces.\n", + "\n", + "\n", + "Catalog, secure and audit access to all data assets on any cloud\n", + "\n", + "Unity Catalog provides centralized metadata, enabling data teams to create\n", + "a single source of truth for all data assets ranging from files, tables, dashboards\n", + "to machine learning models in one place.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
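Because the governance model described above is ANSI SQL–based, permissions are expressed with familiar GRANT statements. A minimal sketch, run from a notebook attached to a Unity Catalog–enabled cluster, with hypothetical catalog, schema, table and group names:

```python
# Minimal sketch of Unity Catalog's SQL permission model.
# Catalog, schema, table and group names are hypothetical.
spark.sql("GRANT USE CATALOG ON CATALOG main TO `data_analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA main.sales TO `data_analysts`")
spark.sql("GRANT SELECT ON TABLE main.sales.orders TO `data_analysts`")

# Fine-grained, view-based access: expose only non-sensitive columns.
spark.sql("""
  CREATE VIEW main.sales.orders_public AS
  SELECT order_id, order_date, amount FROM main.sales.orders
""")
spark.sql("GRANT SELECT ON TABLE main.sales.orders_public TO `external_partners`")
```

Because permissions attach to account-level identities, the same grants apply consistently across every workspace bound to the metastore.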
8edcc501e83716d9ea824e0caa38cabfWith Unity Catalog, data teams can simplify governance for all data and AI\n", + "assets with one consistent model to discover, access and share data, giving\n", + "you much better native performance, management and security across clouds.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key benefits**\n", + "\n", + "\n", + "The common metadata layer for cross-workspace metadata is at the account\n", + "level and eases collaboration by allowing different workspaces to access Unity\n", + "Catalog metadata through a common interface and break down data silos.\n", + "Further, the data permissions in Unity Catalog are applied to account-level\n", + "identities, rather than identities that are local to a workspace, allowing\n", + "a consistent view of users and groups across all workspaces.\n", + "\n", + "\n", + "Catalog, secure and audit access to all data assets on any cloud\n", + "\n", + "Unity Catalog provides centralized metadata, enabling data teams to create\n", + "a single source of truth for all data assets ranging from files, tables, dashboards\n", + "to machine learning models in one place.\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog offers a unified data access layer that provides a simple and\n", + "streamlined way to define and connect to your data through managed tables,\n", + "external tables, or files, while managing their access controls. Unity Catalog\n", + "centralizes access controls for files, tables and views.\n", + "\n", + "It allows fine-grained access controls for restricting access to certain rows\n", + "and columns to the users and groups who are authorized to query them. With\n", + "Attribute-Based Access Controls (ABAC), you can control access to multiple\n", + "data items at once based on user and data attributes, further simplifying\n", + "governance at scale. For example, you will be able to tag multiple columns\n", + "as personally identifiable information (PII) and manage access to all columns\n", + "tagged as PII in a single rule.\n", + "\n", + "Today, organizations are dealing with an increased burden of regulatory\n", + "compliance, and data access auditing is a critical component to ensure your\n", + "organization is set up for success while meeting compliance requirements.\n", + "Unity Catalog also provides centralized fine-grained auditing by capturing an\n", + "audit log of operations such as create, read, update and delete (CRUD) that have\n", + "been performed against the data. This allows a fine-grained audit trail showing\n", + "who accessed a given data set and helps you meet your compliance and\n", + "business requirements.\n", + "\n", + "\n", + "-----\n", + "\n", + "Built-in data search and discovery\n", + "\n", + "Data discovery is a critical component to break\n", + "down data silos and democratize data across\n", + "your organization to make data-driven decisions.\n", + "Unity Catalog provides a rich user interface for\n", + "data search and discovery, enabling data teams to\n", + "quickly search relevant data assets across the data\n", + "landscape and reference them for all use cases —\n", + "BI, analytics and machine learning — accelerating\n", + "time-to-value and boosting productivity.\n", + "\n", + "\n", + "-----\n", + "\n", + "Automated data lineage for all workloads\n", + "\n", + "Data lineage describes the transformations and\n", + "refinements of data from source to insight. 
Lineage\n", + "includes capturing all the relevant metadata and\n", + "events associated with the data in its lifecycle,\n", + "including the source of the data set, what other\n", + "data sets were used to create it, who created it and\n", + "when, what transformations were performed, which\n", + "other data sets leverage it, and many other events\n", + "and attributes. Unity Catalog offers automated data\n", + "lineage down to table and column level, enabling\n", + "data teams to get an end-to-end view of where\n", + "data is coming from, what transformations were\n", + "performed on the data and how data is consumed\n", + "by end applications such as notebooks, workflows,\n", + "dashboards, machine learning models, etc.\n", + "\n", + "With automated data lineage for all workloads —\n", + "SQL, R, Python and Scala, data teams can quickly\n", + "identify and perform root cause analysis of any\n", + "errors in the data pipelines or end applications.\n", + "Second, data teams can perform impact analysis\n", + "to see dependencies of any data changes\n", + "on downstream consumers and notify them\n", + "about the potential impact. Finally, data lineage\n", + "also empowers data teams with increased\n", + "understanding of their data and reduces tribal\n", + "knowledge. Unity Catalog can also capture lineage\n", + "associated with non-data entities, such as notebooks,\n", + "workflows and dashboards. Lineage can be\n", + "\n", + "\n", + "_Data lineage with Unity Catalog_\n", + "\n", + "retrieved via REST APIs to support integrations\n", + "with other catalogs.\n", + "\n", + "Integrated with your existing tools\n", + "\n", + "\n", + "**Resources**\n", + "\n", + "[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n", + "\n", + "[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
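The passage mentions that lineage can be retrieved via REST APIs to integrate with other catalogs. A hedged sketch follows; the endpoint path and query parameters are my recollection of the public lineage-tracking API and should be verified against current documentation, and the host, token and table name are placeholders:

```python
# Hedged sketch: retrieving table-level lineage over REST. The endpoint path
# and parameters are assumptions to be checked against current Databricks docs;
# host, token and table name are placeholders.
import requests

host = "https://<your-workspace>.cloud.databricks.com"
token = "<personal-access-token>"

resp = requests.get(
    f"{host}/api/2.0/lineage-tracking/table-lineage",
    headers={"Authorization": f"Bearer {token}"},
    params={"table_name": "main.sales.orders", "include_entity_lineage": "true"},
)
resp.raise_for_status()
print(resp.json())  # upstream/downstream tables plus notebooks, jobs, dashboards
```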
6dbd9be1aaa221ebfee5e85daf053e70_Data lineage with Unity Catalog_\n", + "\n", + "retrieved via REST APIs to support integrations\n", + "with other catalogs.\n", + "\n", + "Integrated with your existing tools\n", + "\n", + "\n", + "**Resources**\n", + "\n", + "[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n", + "\n", + "[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)\n", + "\n", + "\n", + "Unity Catalog helps you to future-proof your data\n", + "and AI governance with the flexibility to leverage\n", + "your existing data catalogs and governance\n", + "solutions — Collibra, Alation, Immuta, Privacera,\n", + "Microsoft Purview and AWS Lakeformation.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Open data sharing and collaboration\n", + "\n", + "Data sharing has become important in the digital\n", + "economy as enterprises wish to exchange data\n", + "easily and securely with their customers, partners,\n", + "suppliers and internal lines of business to better\n", + "collaborate and unlock value from that data. But\n", + "to date, a lack of standards-based data sharing\n", + "protocol has resulted in data sharing solutions\n", + "tied to a single vendor or commercial product,\n", + "introducing vendor lock-in risks. What the industry\n", + "deserves is an open approach to data sharing.\n", + "\n", + "**Why data sharing is hard**\n", + "\n", + "Data sharing has evolved from an optional feature\n", + "of a few data platforms to a business necessity\n", + "and success factor for organizations. Our solution\n", + "architects encounter daily the classic scenarios\n", + "of a retailer looking to publish sales data to their\n", + "suppliers in real time or a supplier that wants to\n", + "share real-time inventory.\n", + "\n", + "As a reminder, data sharing recently triggered\n", + "the most impressive scientific development that\n", + "humankind has ever seen. On January 5, 2021, the\n", + "first sample of the genome of the coronavirus was\n", + "\n", + "\n", + "uploaded to the internet. It wasn’t a lung biopsy\n", + "from a patient in Wuhan, but a shared digital\n", + "genomic data set that triggered the development\n", + "of the first batch of COVID vaccines worldwide.\n", + "\n", + "\n", + "treatments, tests and tracking mutations as they\n", + "are passed down through a lineage, a branch of\n", + "the coronavirus family tree. 
The above graphic\n", + "shows such a [publicly shared mutation data set](https://www.ncbi.nlm.nih.gov/genbank/) .\n", + "\n", + "\n", + "Since then, coronavirus experts have daily\n", + "exchanged public data sets, looking for better\n", + "\n", + "\n", + "-----\n", + "\n", + "Sharing data, as well as consuming data from\n", + "external sources, allows you to collaborate with\n", + "partners, establish new partnerships, enable\n", + "research and can generate new revenue streams\n", + "with data monetization.\n", + "\n", + "Despite those promising examples, existing data\n", + "sharing technologies come with several limitations:\n", + "\n", + "**•** Traditional data sharing technologies, such as\n", + "Secure File Transfer Protocol (SFTP), do not\n", + "scale well and only serve files offloaded to a\n", + "server\n", + "\n", + "**•** Cloud object stores operate on an object level\n", + "and are cloud-specific\n", + "\n", + "**•** Commercial data sharing offerings baked into\n", + "vendor products often share tables instead of\n", + "files, but scaling them is expensive and they\n", + "are not open and, therefore, do not permit data\n", + "sharing with a different platform\n", + "\n", + "The following table compares proprietary vendor\n", + "solutions with SFTP, cloud object stores and Delta\n", + "Sharing.\n", + "\n", + "\n", + "\n", + "|Col1|Proprietary vendor solutions|SFTP|Cloud object store|Delta Sharing|\n", + "|---|---|---|---|---|\n", + "|Secure|||||\n", + "|Cheap|||||\n", + "|Vendor agnostic|||||\n", + "|Multicloud|||||\n", + "|Open source|||||\n", + "|Table/DataFrame abstraction|||||\n", + "|Live data|||||\n", + "|Predicate pushdown|||||\n", + "|Object store bandwidth|||||\n", + "|Zero compute cost|||||\n", + "|Scalability|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "**Open source data sharing and Databricks**\n", + "\n", + "To address the limitations of existing data sharing solutions, Databricks developed\n", + "[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\n", + "to the Linux Foundation.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
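To make the open-protocol point concrete, here is a hedged sketch of the recipient side using the open source `delta-sharing` Python connector (one of the pandas/Spark connectors discussed later in this chapter). The profile file and share/schema/table coordinates are placeholders that a provider would supply:

```python
# Hedged sketch of the recipient side of Delta Sharing using the open source
# `delta-sharing` connector (pip install delta-sharing). The profile file and
# share coordinates are placeholders supplied by the data provider.
import delta_sharing

profile = "/path/to/open-datasets.share"           # credential file from the provider
table_url = f"{profile}#retail_share.sales.orders"  # <share>.<schema>.<table>

# Load the shared table directly into pandas — no copy onto the recipient's platform.
df = delta_sharing.load_as_pandas(table_url)
print(df.head())

# Or, on Spark, read the same share as a DataFrame:
# spark_df = delta_sharing.load_as_spark(table_url)
```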
2e857bd69bc4575d85c5094c0d462decThe following table compares proprietary vendor\n", + "solutions with SFTP, cloud object stores and Delta\n", + "Sharing.\n", + "\n", + "\n", + "\n", + "|Col1|Proprietary vendor solutions|SFTP|Cloud object store|Delta Sharing|\n", + "|---|---|---|---|---|\n", + "|Secure|||||\n", + "|Cheap|||||\n", + "|Vendor agnostic|||||\n", + "|Multicloud|||||\n", + "|Open source|||||\n", + "|Table/DataFrame abstraction|||||\n", + "|Live data|||||\n", + "|Predicate pushdown|||||\n", + "|Object store bandwidth|||||\n", + "|Zero compute cost|||||\n", + "|Scalability|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "**Open source data sharing and Databricks**\n", + "\n", + "To address the limitations of existing data sharing solutions, Databricks developed\n", + "[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\n", + "to the Linux Foundation.\n", + "\n", + "An open source–based solution, such as Delta Sharing, eliminates the lock-in\n", + "of commercial solutions and brings a number of additional benefits such as\n", + "community-developed integrations with popular, open source data processing\n", + "frameworks. In addition, open protocols allow the easy integration of commercial\n", + "clients, such as BI tools.\n", + "\n", + "**What is Databricks Delta Sharing?**\n", + "\n", + "Databricks Delta Sharing provides an open solution to securely share live data\n", + "from your lakehouse to any computing platform. Recipients don’t have to be\n", + "on the Databricks platform or on the same cloud or a cloud at all. Data providers\n", + "can share live data, without replicating or moving it to another system. Recipients\n", + "benefit from always having access to the latest version of data and can quickly\n", + "query shared data using tools of their choice for BI, analytics and machine\n", + "learning, reducing time-to-value. Data providers can centrally manage, govern,\n", + "audit and track usage of the shared data on one platform.\n", + "\n", + "Unity Catalog natively supports [Delta Sharing](https://databricks.com/product/delta-sharing) , the world’s first open protocol\n", + "for data sharing, enabling organizations to share live, large-scale data without\n", + "replication and make data easily and quickly accessible from tools of your\n", + "choice, with enterprise-grade security.\n", + "\n", + "\n", + "**Key benefits**\n", + "\n", + "Open cross-platform sharing\n", + "\n", + "Easily share existing data in Delta Lake and Apache Parquet formats between\n", + "different vendors. Consumers don’t have to be on the Databricks platform, same\n", + "cloud or a cloud at all. Native integration with Power BI, Tableau, Spark, pandas\n", + "and Java allow recipients to consume shared data directly from the tools of their\n", + "choice. Delta Sharing eliminates the need to set up a new ingestion process to\n", + "consume data. Data recipients can directly access the fresh data and query it\n", + "using tools of their choice. Recipients can also enrich data with data sets from\n", + "popular data providers.\n", + "\n", + "Sharing live data without copying it\n", + "\n", + "Share live ready-to-query data, without replicating or moving it to another system.\n", + "Most enterprise data today is stored in cloud data lakes. Any of the existing data\n", + "sets on the provider’s data lake can easily be shared across clouds, regions or\n", + "data platforms without any data replication or physical movement of data. 
Data\n", + "providers can update their data sets reliably in real time and provide a fresh and\n", + "consistent view of their data to recipients.\n", + "\n", + "Centralized administration and governance\n", + "\n", + "You can centrally govern, track and audit access to the shared data from a single\n", + "point of enforcement to meet compliance requirements. Detailed user-access\n", + "audit logs are kept to know who is accessing the data and monitor usage of the\n", + "shared data down to table, partition and version level.\n", + "\n", + "\n", + "-----\n", + "\n", + "An open Marketplace for data solutions\n", + "\n", + "The demand for third-party data to make data-driven innovations is greater than ever,\n", + "\n", + "and data marketplaces act as a bridge between data providers and data consumers to\n", + "\n", + "help facilitate the discovery and distribution of data sets.\n", + "\n", + "Databricks Marketplace provides an open marketplace for exchanging data products\n", + "\n", + "such as data sets, notebooks, dashboards and machine learning models. To accelerate\n", + "\n", + "insights, data consumers can discover, evaluate and access more data products from\n", + "\n", + "third-party vendors than ever before. Providers can now commercialize new offerings\n", + "\n", + "and shorten sales cycles by providing value-added services on top of their data.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
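On the provider side, shares and recipients are managed centrally; a hedged sketch of the SQL surface for doing so from a Databricks notebook is below (all object and recipient names are hypothetical):

```python
# Hedged sketch of the provider side: creating a share and a recipient with
# Unity Catalog SQL. All object names are hypothetical.
spark.sql("CREATE SHARE retail_share COMMENT 'Live sales data for partners'")
spark.sql("ALTER SHARE retail_share ADD TABLE main.sales.orders")

# For open (non-Databricks) recipients, creating the recipient produces an
# activation link from which they download their credential file.
spark.sql("CREATE RECIPIENT acme_corp")
spark.sql("GRANT SELECT ON SHARE retail_share TO RECIPIENT acme_corp")
```

Access granted this way is governed and audited in Unity Catalog alongside the rest of the provider's data assets.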
14378d678f1ea82334c55482bb6fd4c0Centralized administration and governance\n", + "\n", + "You can centrally govern, track and audit access to the shared data from a single\n", + "point of enforcement to meet compliance requirements. Detailed user-access\n", + "audit logs are kept to know who is accessing the data and monitor usage of the\n", + "shared data down to table, partition and version level.\n", + "\n", + "\n", + "-----\n", + "\n", + "An open Marketplace for data solutions\n", + "\n", + "The demand for third-party data to make data-driven innovations is greater than ever,\n", + "\n", + "and data marketplaces act as a bridge between data providers and data consumers to\n", + "\n", + "help facilitate the discovery and distribution of data sets.\n", + "\n", + "Databricks Marketplace provides an open marketplace for exchanging data products\n", + "\n", + "such as data sets, notebooks, dashboards and machine learning models. To accelerate\n", + "\n", + "insights, data consumers can discover, evaluate and access more data products from\n", + "\n", + "third-party vendors than ever before. Providers can now commercialize new offerings\n", + "\n", + "and shorten sales cycles by providing value-added services on top of their data.\n", + "\n", + "Databricks Marketplace is powered by Delta Sharing, allowing consumers to access\n", + "\n", + "data products without having to be on the Databricks platform. This open approach\n", + "\n", + "allows data providers to broaden their addressable market without forcing consumers\n", + "\n", + "into vendor lock-in.\n", + "\n", + "_Databricks Marketplace_\n", + "\n", + "\n", + "Privacy-safe data cleanrooms\n", + "\n", + "Powered by open source Delta Sharing, the Databricks Lakehouse Platform provides\n", + "\n", + "a flexible data cleanroom solution allowing businesses to easily collaborate with their\n", + "\n", + "customers and partners on any cloud in a privacy-safe way. Participants in the data\n", + "\n", + "cleanrooms can share and join their existing data, and run complex workloads in any\n", + "\n", + "language — Python, R, SQL, Java and Scala — on the data while maintaining data\n", + "\n", + "privacy. Additionally, data cleanroom participants don’t have to do cost-intensive\n", + "\n", + "data replication across clouds or regions with other participants, which simplifies data\n", + "\n", + "operations and reduces cost.\n", + "\n", + "_Data cleanrooms with Databricks Lakehouse Platform_\n", + "\n", + "\n", + "-----\n", + "\n", + "**How it works**\n", + "\n", + "Delta Sharing is designed to be simple, scalable, non-proprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing\n", + "is natively integrated with Unity Catalog, which allows customers to add fine-grained governance and security controls, making it easy and safe to share data internally\n", + "or externally.\n", + "\n", + "Delta Sharing is a simple REST protocol that securely shares access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3,\n", + "Azure ADLS or Google’s GCS — to reliably transfer large data sets. Here’s how it works for data providers and data recipients.\n", + "\n", + "**Data provider** **Data recipient**\n", + "\n", + "Data science And many more On-premises\n", + "\n", + "The data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. 
The provider\n", + "decides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages access for recipients. To manage\n", + "shares and recipients, you can use SQL commands or the Unity Catalog CLI or the intuitive user interface.\n", + "\n", + "The data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\n", + "Spark, Java and Python, and is working with partners on many more.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Delta Sharing data exchange follows three efficient steps:\n", + "\n", + "1. The recipient’s client authenticates to the sharing server and asks to query\n", + "a specific table. The client can also provide filters on the data (for example,\n", + "“country=US”) as a hint to read just a subset of the data.\n", + "\n", + "2. The server verifies whether the client is allowed to access the data, logs the\n", + "request, and then determines which data to send back. This will be a subset\n", + "of the data objects in cloud storage systems that make up the table.\n", + "\n", + "3. To transfer the data, the server generates short-lived presigned URLs that\n", + "allow the client to read these Parquet files directly from the cloud provider,\n", + "so that the transfer can happen in parallel at massive bandwidth, without\n", + "streaming through the sharing server.\n", + "\n", + "**Learn more**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
a9c08a28601aadaec5c9d2b4059b64a0The data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\n", + "Spark, Java and Python, and is working with partners on many more.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Delta Sharing data exchange follows three efficient steps:\n", + "\n", + "1. The recipient’s client authenticates to the sharing server and asks to query\n", + "a specific table. The client can also provide filters on the data (for example,\n", + "“country=US”) as a hint to read just a subset of the data.\n", + "\n", + "2. The server verifies whether the client is allowed to access the data, logs the\n", + "request, and then determines which data to send back. This will be a subset\n", + "of the data objects in cloud storage systems that make up the table.\n", + "\n", + "3. To transfer the data, the server generates short-lived presigned URLs that\n", + "allow the client to read these Parquet files directly from the cloud provider,\n", + "so that the transfer can happen in parallel at massive bandwidth, without\n", + "streaming through the sharing server.\n", + "\n", + "**Learn more**\n", + "\n", + "[Try Delta Sharing](https://databricks.com/product/delta-sharing)\n", + "\n", + "[Delta Sharing Demo](https://youtu.be/wRT1Vpbyy88)\n", + "\n", + "[Introducing Delta Sharing: An Open Protocol for Secure Data Sharing](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Introducing Data Cleanrooms for the Lakehouse](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Introducing Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Delta Sharing ODSC Webinar](https://www.youtube.com/watch?v=YrNHtaWlkM8)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 05\n", + "\n", + "\n", + "### Security\n", + "\n", + "Organizations that operate in multicloud environments need a unified, reliable\n", + "and consistent approach to secure data. We’ve learned from our customers that\n", + "a simple and unified approach to data security for the lakehouse is one of the\n", + "most critical requirements for modern data solutions. Databricks is trusted by\n", + "the world’s largest organizations to provide a powerful lakehouse platform with\n", + "high security and scalability. In fact, thousands of customers trust Databricks\n", + "with their most sensitive data to analyze and build data products using machine\n", + "learning (ML). With significant investment in building a highly secure and scalable\n", + "platform, Databricks delivers end-to-end platform security for data and users.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Platform architecture reduces risk\n", + "\n", + "The Databricks Lakehouse architecture is split into\n", + "two separate planes to simplify your permissions,\n", + "avoid data duplication and reduce risk. The control\n", + "plane is the management plane where Databricks\n", + "runs the workspace application and manages\n", + "notebooks, configuration and clusters. Unless you\n", + "choose to use [serverless compute](https://docs.databricks.com/serverless-compute/index.html) , the data plane\n", + "runs inside your cloud service provider account,\n", + "processing your data without taking it out of your\n", + "account. 
You can embed Databricks in your data\n", + "exfiltration protection architecture using features\n", + "like customer-managed VPCs/VNets and admin\n", + "console options that disable export.\n", + "\n", + "While certain data, such as your notebooks,\n", + "configurations, logs, and user information, is\n", + "present within the control plane, that information\n", + "is encrypted at rest, and communication to and\n", + "from the control plane is encrypted in transit.\n", + "\n", + "\n", + "\n", + "**Users**\n", + "\n", + "**Interactive**\n", + "**users**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Col1|Control pane|Col3|\n", + "|---|---|---|\n", + "||Web application Configurations Notebooks, repos, DBSQL|Cluster Cluste Your cloud s Your cloud s|\n", + "||Cluster manager||\n", + "\n", + "\n", + "You also have choices for where certain data lives:\n", + "You can host your own store of metadata about\n", + "your data tables (Hive metastore), or store querySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
d3ae991214feea1723ee8744208a2907While certain data, such as your notebooks,\n", + "configurations, logs, and user information, is\n", + "present within the control plane, that information\n", + "is encrypted at rest, and communication to and\n", + "from the control plane is encrypted in transit.\n", + "\n", + "\n", + "\n", + "**Users**\n", + "\n", + "**Interactive**\n", + "**users**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Col1|Control pane|Col3|\n", + "|---|---|---|\n", + "||Web application Configurations Notebooks, repos, DBSQL|Cluster Cluste Your cloud s Your cloud s|\n", + "||Cluster manager||\n", + "\n", + "\n", + "You also have choices for where certain data lives:\n", + "You can host your own store of metadata about\n", + "your data tables (Hive metastore), or store query\n", + "\n", + "\n", + "**Data**\n", + "\n", + "\n", + "**DBFS root**\n", + "\n", + "\n", + "results in your cloud service provider account and\n", + "decide whether to use the [Databricks Secrets API.](https://docs.databricks.com/dev-tools/api/latest/secrets.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Step-by-step example\n", + "\n", + "\n", + "\n", + "**Users**\n", + "\n", + "**Interactive**\n", + "**users**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "**DBFS root**\n", + "\n", + "|Col1|ample|Col3|Col4|Col5|\n", + "|---|---|---|---|---|\n", + "||Control pane 1 4||||\n", + "|||Web application Configurations Notebooks, repos, DBSQL Cluster manager|6|Cluster Cluste YYoouurr cclloouudd s|\n", + "||||||\n", + "||||||\n", + "||||||\n", + "||||||\n", + "||||||\n", + "\n", + "\n", + "-----\n", + "\n", + "Suppose you have a data engineer that signs in to Databricks and\n", + "writes a notebook that transforms raw data in Kafka to a normalized\n", + "data set sent to storage such as Amazon S3 or Azure Data Lake\n", + "Storage. Six steps make that happen:\n", + "\n", + "1. The data engineer seamlessly authenticates, via your single sign-on\n", + "if desired, to the Databricks web UI in the control plane, hosted in\n", + "the Databricks account.\n", + "\n", + "2. As the data engineer writes code, their web browser sends it to\n", + "the control plane. JDBC/ODBC requests also follow the same path,\n", + "authenticating with a token.\n", + "\n", + "3. When ready, the control plane uses Cloud Service Provider APIs to\n", + "create a Databricks cluster, made of new instances in the data plane,\n", + "in your CSP account. Administrators can apply cluster policies to\n", + "enforce security profiles.\n", + "\n", + "4. Once the instances launch, the cluster manager sends the data\n", + "engineer’s code to the cluster.\n", + "\n", + "5. The cluster pulls from Kafka in your account, transforms the data\n", + "in your account and writes it to a storage in your account.\n", + "\n", + "6. The cluster reports status and any outputs back to the cluster manager.\n", + "\n", + "The data engineer does not need to worry about many of the details —\n", + "simply write the code and Databricks runs it.\n", + "\n", + "\n", + "#### Network and server security\n", + "\n", + "Here is how Databricks interacts with your cloud service provider\n", + "account to manage network and server security\n", + "\n", + "**Networking**\n", + "\n", + "Regardless of where you choose to host the data plane, Databricks networking\n", + "is straightforward. 
If you host it yourself, Databricks by default will still configure\n", + "networking for you, but you can also control data plane networking with your\n", + "own managed VPC or VNet.\n", + "\n", + "The serverless data plane network infrastructure is managed by Databricks in\n", + "a Databricks cloud service provider account and shared among customers,\n", + "with additional network boundaries between workspaces and between clusters.\n", + "\n", + "Databricks does not rewrite or change your data structure in your storage, nor\n", + "does it change or modify any of your security and governance policies. Local\n", + "firewalls complement security groups and subnet firewall policies to block\n", + "unexpected inbound connections.\n", + "\n", + "Customers at the enterprise tier can also use the IP access list feature on\n", + "the control plane to limit which IP addresses can connect to the web UI or\n", + "REST API — for example, to allow only VPN or office IPs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Servers**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
8dc7a1ecd072e76cd966c1e96c25cf97Here is how Databricks interacts with your cloud service provider\n", + "account to manage network and server security\n", + "\n", + "**Networking**\n", + "\n", + "Regardless of where you choose to host the data plane, Databricks networking\n", + "is straightforward. If you host it yourself, Databricks by default will still configure\n", + "networking for you, but you can also control data plane networking with your\n", + "own managed VPC or VNet.\n", + "\n", + "The serverless data plane network infrastructure is managed by Databricks in\n", + "a Databricks cloud service provider account and shared among customers,\n", + "with additional network boundaries between workspaces and between clusters.\n", + "\n", + "Databricks does not rewrite or change your data structure in your storage, nor\n", + "does it change or modify any of your security and governance policies. Local\n", + "firewalls complement security groups and subnet firewall policies to block\n", + "unexpected inbound connections.\n", + "\n", + "Customers at the enterprise tier can also use the IP access list feature on\n", + "the control plane to limit which IP addresses can connect to the web UI or\n", + "REST API — for example, to allow only VPN or office IPs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Servers**\n", + "\n", + "In the data plane, Databricks clusters automatically run the latest hardened\n", + "system image. Users cannot choose older (less secure) images or code. For AWS\n", + "and Azure deployments, images are typically updated every two-to-four weeks.\n", + "GCP is responsible for its system image.\n", + "\n", + "Databricks runs scans for every release, including:\n", + "\n", + "**•** System image scanning for vulnerabilities\n", + "\n", + "**•** Container OS and library scanning\n", + "\n", + "\n", + "**Severity** **Remediation time**\n", + "\n", + "**Critical** **< 14 days**\n", + "\n", + "**High** **< 30 days**\n", + "\n", + "**Medium** **< 60 days**\n", + "\n", + "**Low** **When appropriate**\n", + "\n", + "\n", + "\n", + "**•** Static and dynamic code scanning\n", + "\n", + "**Databricks access**\n", + "\n", + "\n", + "Databricks code is peer reviewed by developers who have security training.\n", + "Significant design documents go through comprehensive security reviews.\n", + "Scans run fully authenticated, with all checks enabled, and issues are\n", + "tracked against the timeline shown in this table.\n", + "\n", + "Note that Databricks clusters are typically short-lived (often terminated\n", + "after a job completes) and do not persist data after they terminate. Clusters\n", + "typically share the same permission level (excluding high concurrency or\n", + "Databricks SQL clusters, where more robust security controls are in place).\n", + "Your code is launched in an unprivileged container to maintain system\n", + "stability. This security design provides protection against persistent attackers\n", + "and privilege escalation.\n", + "\n", + "\n", + "Databricks access to your environment is limited to cloud service provider APIs\n", + "for our automation and support access. Automated access allows the Databricks\n", + "control plane to configure resources in your environment using the cloud service\n", + "provider APIs. The specific APIs vary based on the cloud. 
For instance, an AWS\n", + "cross-account IAM role, or Azure-owned automation or GKE automation do not\n", + "grant access to your data sets (see the next section).\n", + "\n", + "Databricks has a custom-built system that allows staff to fix issues or handle\n", + "support requests — for example, when you open a support request and check the\n", + "box authorizing access to your workspace. Access requires either a support ticket\n", + "or engineering ticket tied expressly to your workspace and is limited to a subset of\n", + "employees and for limited time periods. Additionally, if you have configured audit\n", + "log delivery, the audit logs show the initial access event and the staff’s actions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Identity and access**\n", + "\n", + "Databricks supports robust ACLs and SCIM. AWS customers can configure\n", + "SAML 2.0 and block non-SSO logins. Azure Databricks and Databricks on\n", + "GCP automatically integrate with Azure Active Directory or GCP identity.\n", + "\n", + "Databricks supports a variety of ways to enable users to access their data.\n", + "\n", + "**Examples include:**\n", + "\n", + "**•** The Table ACLs feature uses traditional SQL-based statements to\n", + "manage access to data and enable fine-grained view-based access\n", + "\n", + "**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\n", + "users of that cluster automatically access allowed resources without\n", + "explicit credentials\n", + "\n", + "**•** External storage can be mounted or accessed using a securely\n", + "stored access keySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
3effc9d28683f767fe2ff753dd4bcb04-----\n", + "\n", + "**Identity and access**\n", + "\n", + "Databricks supports robust ACLs and SCIM. AWS customers can configure\n", + "SAML 2.0 and block non-SSO logins. Azure Databricks and Databricks on\n", + "GCP automatically integrate with Azure Active Directory or GCP identity.\n", + "\n", + "Databricks supports a variety of ways to enable users to access their data.\n", + "\n", + "**Examples include:**\n", + "\n", + "**•** The Table ACLs feature uses traditional SQL-based statements to\n", + "manage access to data and enable fine-grained view-based access\n", + "\n", + "**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\n", + "users of that cluster automatically access allowed resources without\n", + "explicit credentials\n", + "\n", + "**•** External storage can be mounted or accessed using a securely\n", + "stored access key\n", + "\n", + "**•** The Secrets API separates credentials from code when accessing\n", + "external resources\n", + "\n", + "\n", + "**Data security**\n", + "\n", + "Databricks provides encryption, isolation and auditing.\n", + "\n", + "**Databricks encryption capabilities are**\n", + "**in place both at rest and in motion**\n", + "\n", + "\n", + "\n", + "|For data-at-rest encryption: • Control plane is encrypted • Data plane supports local encryption • Customers can use encrypted storage buckets • Customers at some tiers can confgi ure customer-managed keys for managed services|For data-in-motion encryption: • Control plane <-> data plane is encrypted • Offers optional intra-cluster encryption • Customer code can be written to avoid unencrypted services (e.g., FTP)|\n", + "|---|---|\n", + "\n", + "\n", + "**Customers can isolate users at multiple levels:**\n", + "\n", + "**•** **Workspace level:** Each team or department can use a separate workspace\n", + "\n", + "**•** **Cluster level:** Cluster ACLs can restrict the users who can attach notebooks\n", + "\n", + "to a given cluster\n", + "\n", + "**•** **High concurrency clusters:** Process isolation, JVM whitelisting and limited\n", + "languages (SQL, Python) allow for the safe coexistence of users of different\n", + "privilege levels, and is used with Table ACLs\n", + "\n", + "**•** **Single-user cluster:** Users can create a private dedicated cluster\n", + "\n", + "Activities of Databricks users are logged and can be delivered automatically to\n", + "a cloud storage bucket. Customers can also monitor provisioning activities by\n", + "monitoring cloud audit logs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Compliance**\n", + "\n", + "**Databricks supports the following compliance standards on**\n", + "\n", + "**our multi-tenant platform:**\n", + "\n", + "**•** **SOC 2 Type II**\n", + "\n", + "**•** **ISO 27001**\n", + "\n", + "**•** **ISO 27017**\n", + "\n", + "**•** **ISO 27018**\n", + "\n", + "Certain clouds support Databricks deployment options for FedRAMP\n", + "High, HITRUST, HIPAA and PCI. Databricks Inc. 
and the Databricks platform\n", + "are also GDPR and CCPA ready.\n", + "\n", + "**Learn more**\n", + "\n", + "To learn more about Databricks security,\n", + "visit the [Security and Trust Center](https://databricks.com/trust)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 06\n", + "\n", + "\n", + "### Instant compute and serverless\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Benefits of Databricks Serverless SQL\n", + "\n", + "Serverless SQL is much easier to administer with Databricks taking on the\n", + "responsibility of deploying, configuring and managing your cluster VMs. Databricks\n", + "can transfer compute capacity to user queries typically in about 15 seconds — so\n", + "you no longer need to wait for clusters to start up or scale out to run your queries.\n", + "\n", + "Serverless SQL also has built-in connectors to your favorite tools such as Tableau,\n", + "Power BI, Qlik, etc. These connectors use optimized JDBC/ODBC drivers for easy\n", + "authentication support and high performance. And finally, you save on cost\n", + "because you do not need to overprovision or pay for the idle capacity.\n", + "\n", + "\n", + "#### What is serverless compute?SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
Serverless compute is a fully managed service where Databricks provisions and manages the compute layer on behalf of the customer in the Databricks cloud account instead of the customer account. As of the current release, serverless compute is supported for use with Databricks SQL. This new capability for Databricks SQL provides instant compute to users for their BI and SQL workloads, with minimal management required and capacity optimizations that can lower overall cost by 20%-40% on average. This makes it even easier for organizations to expand adoption of the lakehouse for business analysts who are looking to access the rich, real-time data sets of the lakehouse with a simple and performant solution.

-----

**Inside Serverless SQL**

**Figure:** Databricks Serverless SQL: managed servers providing secure, instant Serverless SQL compute.

At the core of Serverless SQL is a compute platform that operates a pool of servers located in a Databricks account, running Kubernetes containers that can be assigned to a user within seconds.

When many users are running reports or queries at the same time, the compute platform adds more servers to the cluster (again, within seconds) to handle the concurrent load. Databricks manages the entire configuration of the server and automatically performs the patching and upgrades as needed.

Each server is running a secure configuration and all processing is secured by three layers of isolation: the Kubernetes container hosting the runtime; the virtual machine (VM) hosting the container; and the virtual network for the workspace. Each layer is isolated to one workspace with no sharing or cross-network traffic allowed.
The containers use hardened configurations, VMs are shut down and not reused, and network traffic is restricted to nodes in the same cluster.

-----

#### Performance of Serverless SQL

We ran a set of internal tests to compare Databricks Serverless SQL to the current Databricks SQL and several traditional cloud data warehouses. We found Serverless SQL to be the most cost-efficient and performant environment to run SQL workloads when considering cluster startup time, query execution time and overall cost.

**Databricks Serverless SQL is the highest performing and most cost-effective solution**

**Figure:** Cloud SQL solutions compared. Serverless SQL shows the fastest query execution and startup times (~2-3 sec vs. ~5 min) and the lowest cost estimate relative to four traditional cloud data warehouses (CDW1-CDW4).

**Learn more**

The feature is currently in Public Preview. Sign up to [request access to Serverless SQL](https://databricks.com/p/ebook/serverless-sql-preview-sign-up). To learn more about Serverless SQL, visit our [documentation page](https://docs.databricks.com/serverless-compute/index.html).

-----

**CHAPTER**

# 07

### Data warehousing

Data warehouses are not keeping up with today's world. The explosion of languages other than SQL and unstructured data, machine learning, IoT and streaming analytics are forcing organizations to adopt a bifurcated architecture of disjointed systems: data warehouses for BI and data lakes for ML. While SQL is ubiquitous and known by millions of professionals, it has never been treated as a first-class citizen on data lakes, until the lakehouse.

-----

#### What is data warehousing

The Databricks Lakehouse Platform provides a simplified multicloud and serverless architecture for your data warehousing workloads. Data warehousing on the lakehouse allows SQL analytics and BI at scale with a common governance model. Now you can ingest, transform and query all your data in-place — using your SQL and BI tools of choice — to deliver real-time business insights at the best price/performance.
Built on open standards and APIs, the lakehouse\n", + "provides the reliability, quality and performance that data lakes natively lack,\n", + "and integrations with the ecosystem for maximum flexibility — no lock-in.\n", + "\n", + "With data warehousing on the lakehouse, organizations can unify all analytics\n", + "and simplify their architecture to enable their business with real-time business\n", + "insights at the best price/performance.\n", + "\n", + "\n", + "#### Key benefits\n", + "\n", + "**Best price/performance**\n", + "\n", + "Lower costs, get the best price/performance and eliminate\n", + "resource management overhead\n", + "\n", + "On-premises data warehouses have reached their limits — they physically\n", + "cannot scale to handle the growing volumes of data, and don’t provide the\n", + "elasticity customers need to respond to ever-changing business needs.\n", + "Cloud data warehouses are a great alternative to on-premises data\n", + "warehouses, providing greater scale and elasticity, but cloud costs for\n", + "proprietary cloud data warehouses typically yield to an exponential cost\n", + "increase following the growth of data volume.\n", + "\n", + "The Databricks Lakehouse Platform provides instant, elastic SQL serverless\n", + "compute — decoupled from storage on cheap cloud object stores — and\n", + "thousands of performance optimizations that can lower overall infrastructure\n", + "costs by [an average of 40%](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) . Databricks automatically determines instance\n", + "types and configuration for the best price/performance — [up to 12x better](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[than traditional cloud data warehouses](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) — and scale for high concurrency\n", + "use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Built-in governance**\n", + "\n", + "One source of truth and one unified\n", + "governance layer across all data teamsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
Underpinned by Delta Lake, the Databricks Lakehouse Platform simplifies your architecture by allowing you to establish one single copy of all your data for in-place analytics and ETL/ELT on your existing data lakes — no more data movements and copies in disjointed systems. Then, seamless integration with Databricks Unity Catalog lets you easily discover, secure and manage all your data with fine-grained governance, data lineage, and standard SQL.

**Rich ecosystem**

Ingest, transform and query all your data in-place with your favorite tools

Very few tools exist to conduct BI on data lakes. Generally, doing so has required data analysts to submit Spark jobs or use a developer interface. While these tools are common for data scientists, they require knowledge of languages and interfaces that are not traditionally part of a data analyst's tool set. As a result, the learning curve for an analyst to make use of a data lake is too high when well-established tools and methods already exist for data warehouses.

The Databricks Lakehouse Platform works with your preferred tools like dbt, Fivetran, Power BI or Tableau, allowing analysts and analytical engineers to easily ingest, transform and query the most recent and complete data, without having to move it into a separate data warehouse. Additionally, it empowers every analyst across your organization to quickly and collaboratively find and share new insights with a built-in SQL editor, visualizations and dashboards.

**Break down silos**

Accelerate time from raw to actionable data and go effortlessly from BI to ML

It is challenging for data engineering teams to enable analysts at the speed that the business requires. Data warehouses need data to be ingested and processed ahead of time before analysts can access and query it using BI tools. Because traditional data warehouses lack real-time processing and do not scale well for large ETL jobs, they create new data movements and bottlenecks for the data engineering team, and make it slow for analysts to access the latest data. And for advanced analytics (ML) applications, organizations will need to manage an entirely different system than their SQL-only data warehouse, slowing down collaboration and innovation.

The Databricks Lakehouse Platform provides the most complete end-to-end data warehousing solution for all your modern analytics needs, and more. Now you can empower data teams and business users to access the latest data faster for downstream real-time analytics and go effortlessly from BI to ML. Speed up the time from raw to actionable data at any scale — in batch and streaming. And go from descriptive to advanced analytics effortlessly to uncover new insights.

-----

**Data warehousing on Databricks**

**Truly decoupled, serverless, compute layer**

**Figure:** Data sources (on-premises OLTP, OLAP, DWH and Hadoop systems, third-party data, IoT devices, SaaS applications, social) feed continuous and batch ingest via Databricks Partner Connect into an open storage layer (Bronze raw, Silver staging, Gold DW/marts connected by ETL), governed by Unity Catalog, with a decoupled serverless compute layer serving data consumers.
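To make the flow pictured above concrete, the following is a minimal PySpark sketch of the Bronze-to-Silver-to-Gold pattern. It assumes the `spark` session that Databricks notebooks provide, and the source path, table names and column names are hypothetical placeholders rather than anything prescribed by the platform.

```python
from pyspark.sql import functions as F

# Bronze: land raw files as-is, keeping an ingest timestamp.
bronze = (
    spark.read.format("json")
    .load("/Volumes/main/raw/orders/")          # hypothetical source path
    .withColumn("_ingested_at", F.current_timestamp())
)
bronze.write.format("delta").mode("append").saveAsTable("main.staging.orders_bronze")

# Silver: filter and deduplicate the raw records.
silver = (
    spark.read.table("main.staging.orders_bronze")
    .where(F.col("order_id").isNotNull())
    .dropDuplicates(["order_id"])
)
silver.write.format("delta").mode("overwrite").saveAsTable("main.staging.orders_silver")

# Gold: business-level aggregates served to BI tools.
gold = (
    spark.read.table("main.staging.orders_silver")
    .groupBy("order_date")
    .agg(F.sum("amount").alias("daily_revenue"))
)
gold.write.format("delta").mode("overwrite").saveAsTable("main.dw.daily_revenue")
```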
**Learn more**

[Try Databricks SQL for free](https://dbricks.co/dbsql)

[Databricks SQL Demo](https://databricks.com/discover/demos/databricks-sql)

[Databricks SQL Data Warehousing Admin Demo](https://youtu.be/jlEdoVpWwNc)

[On-demand Webinar: Learn Databricks SQL From the Experts](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)

[eBook: Inner Workings of the Lakehouse for Analytics and BI](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)

-----

**CHAPTER**

# 08

### Data engineering

Organizations realize the value data plays as a strategic asset for growing revenues, improving the customer experience, operating efficiently or improving a product or service. Data is really the driver of all these initiatives. Nowadays, data is often streamed and ingested from hundreds of different data sources, sometimes acquired from a data exchange, cleaned in various ways with different orchestrated steps, versioned and shared for analytics and AI. And increasingly, data is being monetized.

Data teams rely on getting the right data at the right time for analytics, data science and machine learning, but often are faced with challenges meeting the needs of their initiatives for data engineering.

-----

#### Why data engineering is hard

One of the biggest challenges is accessing and managing the increasingly complex data that lives across the organization. Most of the complexity arises with the explosion of data volumes and data types, with organizations amassing an estimated [80% of data that is unstructured and semi-structured](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c).

With this volume, managing data pipelines to transform and process data is slow and difficult, and increasingly expensive.
And to top off the complexity,\n", + "most businesses are putting an increased emphasis on multicloud\n", + "environments which can be even more difficult to maintain.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani), a principal technology consultant at Thoughtworks, wrote that data itself has become a product, and the challenging goal of the data engineer is to build and run the machinery that creates this high-fidelity data product all the way from ingestion to monetization.

Despite current technological advances, data engineering remains difficult for several reasons:

**Complex data ingestion methods**

Data ingestion means retrieving batch and streaming data from various sources and in various formats. Ingesting data is hard and complex since you either need to use an always-running streaming platform like Apache Kafka or you need to be able to keep track of which files haven't been ingested yet. Data engineers are required to spend a lot of time hand-coding repetitive and error-prone data ingestion tasks.

**Data engineering principles**

These days, large operations teams are often just a memory of the past. Modern data engineering principles are based on agile software development methodologies. They apply the well-known "you build it, you run it" paradigm, use isolated development and production environments, CI/CD, and version control transformations that are pushed to production after validation. Tooling needs to support these principles.

-----

**Third-party tools**

Data engineers are often required to run additional third-party tools for orchestration to automate tasks such as ELT/ETL or custom code in notebooks. Running third-party tools increases the operational overhead and decreases the reliability of the system.

**Performance tuning**

Finally, with all pipelines and workflows written, data engineers need to constantly focus on performance, tuning pipelines and architectures to meet SLAs. Tuning such architectures requires in-depth knowledge of the underlying architecture and constantly observing throughput parameters.

Most organizations are dealing with a complex landscape of data warehouses and data lakes these days.
Each of those platforms has its own limitations,\n", + "workloads, development languages and governance model.\n", + "\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The lakehouse platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability\n", + "to drive valuable insights.\n", + "\n", + "Data engineering in the lakehouse allows data teams to unify batch and\n", + "streaming operations on a simplified architecture, streamline data pipeline\n", + "development and testing, build reliable data, analytics and AI workflows\n", + "on any cloud platform, and meet regulatory requirements to maintain\n", + "world-class governance.\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform\n", + "that automates the complexity of building and maintaining pipelines and\n", + "running ETL workloads so data engineers and analysts can focus on quality\n", + "and reliability to drive valuable insights.\n", + "\n", + "\n", + "#### Databricks makes modern data engineering simple\n", + "\n", + "There is no industry-wide definition of modern data engineering.\n", + "This should come close:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
_A_ **_unified data platform_** _with_ **_managed data ingestion_**_, schema detection, enforcement, and evolution, paired with_ **_declarative, auto-scaling data flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all kinds of workflows._

-----

#### Benefits of data engineering on the lakehouse

By simplifying and modernizing with the lakehouse architecture, data engineers gain an enterprise-grade and enterprise-ready approach to building data pipelines.
The following are eight key differentiating capabilities that a data engineering solution team can enable with the Databricks Lakehouse Platform:

**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data engineers can enable fast, reliable, scalable and automatic data ingestion for analytics, data science or machine learning.

**•** **Data pipeline observability:** Monitor overall data pipeline estate status from a dataflow graph dashboard and visually track end-to-end pipeline health for performance, quality, status and latency.

**•** **Simplified operations:** Ensure reliable and predictable delivery of data for analytics and machine learning use cases by enabling easy and automatic data pipeline deployments into production or roll back pipelines and minimize downtime.

**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration of data processing tasks for data and machine learning pipelines with the ability to run multiple non-interactive tasks as a directed acyclic graph (DAG) on a Databricks compute cluster.

**•** **Automated ETL pipelines:** Data engineers can reduce development time and effort and focus on implementing business logic and data quality checks within the data pipeline using SQL or Python.

**•** **Data quality checks:** Improve data reliability throughout the data lakehouse so data teams can confidently trust the information for downstream initiatives with the ability to define data quality and automatically address errors.

**•** **Batch and streaming:** Allow data engineers to set tunable data latency with cost controls without having to know complex stream processing and implement recovery logic.

**•** **Automatic recovery:** Handle transient errors and use automatic recovery for most common error conditions that can occur during the operation of a pipeline with fast, scalable fault-tolerance.

-----

**Data engineering is all about data quality**

The goal of modern data engineering is to distill data with a quality that is fit for downstream analytics and AI. Within the Lakehouse, data quality is achieved on three different levels.

1. On a **technical level**, data quality is guaranteed by enforcing and evolving schemas for data storage and ingestion.

2. On an **architectural level**, data quality is often achieved by implementing the medallion architecture. A medallion architecture is a data design pattern used to logically organize data in a [lakehouse](https://databricks.com/glossary/data-lakehouse) with the goal of incrementally and progressively improving the structure and quality of data as it flows through each layer of the architecture, e.g., from Bronze to Silver to Gold layer tables.

3. The **Databricks Unity Catalog** comes with robust data quality management with built-in quality controls, testing, monitoring and enforcement to ensure accurate and useful data is available for downstream BI, analytics and machine learning workloads.

**Figure:** Sources such as Kinesis streams and CSV, JSON or TXT files land in the data lake and flow through Bronze (raw ingestion and history), Silver (filtered, cleaned, augmented) and Gold (business-level aggregates) tables, with quality increasing at each layer, feeding streaming analytics, BI and reporting, and data science and ML.

-----

#### Data ingestion

With the Databricks Lakehouse Platform, data engineers can build robust hyper-scale ingestion pipelines in streaming and batch mode. They can incrementally process new files as they land on cloud storage — with no need to manage state information — in scheduled or continuous jobs.

Data engineers can efficiently track new files (with the ability to scale to billions of files) without having to list them in a directory. Databricks automatically infers the schema from the source data and evolves it as the data loads into the Delta Lake lakehouse. Efforts continue with enhancing and supporting Auto Loader, our powerful data ingestion tool for the Lakehouse.

**What is Auto Loader?**

Have you ever imagined that ingesting data could become as easy as dropping a file into a folder?
Welcome to Databricks Auto Loader.

[Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) is an optimized data ingestion tool, built into the Databricks Lakehouse, that incrementally and efficiently processes new data files as they arrive in cloud storage. Auto Loader can detect and enforce the schema of your data and, therefore, guarantee data quality. New files or files that have been changed since the last time new data was processed are identified automatically and ingested. Noncompliant data sets are quarantined into rescue data columns. You can use the trigger-once option with Auto Loader to turn it into a job that turns itself off.

**Ingestion for data analysts: COPY INTO**

Ingestion also got much easier for data analysts and analytics engineers working with Databricks SQL. [COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) is a simple SQL command that follows the lake-first approach and loads data from a folder location into a Delta Lake table. COPY INTO can be scheduled and called by a job repeatedly. When run, only new files from the source location will be processed.
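To ground the two ingestion options just described, here is a minimal sketch of Auto Loader and COPY INTO from a Databricks notebook. It assumes the `spark` session that notebooks provide; the folder paths, schema/checkpoint locations and table name are hypothetical placeholders.

```python
# Auto Loader: incrementally pick up new JSON files from a landing folder.
(
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/main/raw/_schemas/events")
    .load("/Volumes/main/raw/events/")            # hypothetical landing folder
    .writeStream
    .option("checkpointLocation", "/Volumes/main/raw/_checkpoints/events")
    .trigger(once=True)                           # the "trigger once" option mentioned above
    .toTable("main.staging.events_bronze")
)

# The COPY INTO alternative for SQL-first users: idempotently load only new files.
spark.sql("""
    COPY INTO main.staging.events_bronze
    FROM '/Volumes/main/raw/events/'
    FILEFORMAT = JSON
""")
```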
#### Data transformation

Turning SQL queries into production ETL pipelines typically involves a lot of tedious, complicated operational work. Even at a small scale, the majority of a data practitioner's time is spent on tooling and managing infrastructure.

Although the medallion architecture is an established and reliable pattern for improving data quality, the implementation of this pattern is challenging for many data engineering teams.

While hand-coding the medallion architecture was hard for data engineers, creating data pipelines was outright impossible for data analysts who could not code with Spark Structured Streaming in Scala or Python.

Even at a small scale, most data engineering time is spent on tooling and managing infrastructure rather than transformation. Auto-scaling, observability and governance are difficult to implement and, as a result, often left out of the solution entirely.

-----

#### What is Delta Live Tables?

Delta Live Tables (DLT) is the first ETL framework that uses a simple **declarative approach** to building reliable data pipelines. DLT automatically auto-scales your infrastructure so data analysts and engineers can spend less time on tooling and focus on getting value from data. Engineers are able to **treat their data as code** and apply modern software engineering best practices like testing, error-handling, monitoring and documentation to deploy reliable pipelines at scale. DLT fully supports both Python and SQL and is tailored to work with both streaming and batch workloads.

With DLT you write a Delta Live Table in a SQL notebook, create a pipeline under Workflows and simply click **Start**.

-----

DLT reduces the implementation time by accelerating development and automating complex operational tasks. Since DLT can use plain SQL, it also enables data analysts to create production pipelines and turns them into the often discussed "analytics engineer." At runtime, DLT speeds up pipeline execution by applying Photon.

Software engineering principles are applied for data engineering to foster the idea of treating your data as code. Your data is the sole source of truth for what is going on inside your business.

**Figure:** DLT automates dependency management, full refresh, incremental computation* and checkpointing and retries (*coming soon).

Beyond just the transformations, there are many things that should be included in the code that defines your data. Declaratively express entire data flows in SQL or Python.
Natively enable modern software engineering best practices like separate development and production environments, the ability to easily test before deploying, deploy and manage environments using parameterization, unit testing and documentation.

DLT also automatically scales compute, providing the option to set the minimum and maximum number of instances and let DLT size up the cluster according to cluster utilization. In addition, tasks like orchestration, error handling and recovery, and performance optimization are all handled automatically.

-----

Expectations in the code help prevent bad data from flowing into tables, track data quality over time, and provide tools to troubleshoot bad data with granular pipeline observability. This enables a high-fidelity lineage diagram of your pipeline to track dependencies and aggregate data quality metrics across all your pipelines.

Unlike other products that force you to deal with streaming and batch workloads separately, DLT supports any type of data workload with a single API so data engineers and analysts alike can build cloud-scale data pipelines faster without the need for advanced data engineering skills.
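To make the declarative model concrete, here is a minimal sketch of what a DLT pipeline definition can look like in Python. The dataset names, source path, columns and expectation rule are hypothetical, and the code is meant to run inside a Delta Live Tables pipeline (where `spark` is provided), not as a plain notebook cell.

```python
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw orders ingested incrementally with Auto Loader.")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/Volumes/main/raw/orders/")   # hypothetical landing folder
    )

# An expectation: rows violating the rule are dropped and counted in pipeline metrics.
@dlt.table(comment="Cleaned orders ready for analytics.")
@dlt.expect_or_drop("valid_order", "order_id IS NOT NULL AND amount >= 0")
def orders_silver():
    return (
        dlt.read_stream("orders_bronze")
        .withColumn("order_date", F.to_date("order_ts"))   # hypothetical timestamp column
    )
```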
#### Data orchestration

The lakehouse makes it much easier for businesses to undertake ambitious data and machine learning (ML) initiatives. However, orchestrating and managing end-to-end production workflows remains a bottleneck for most organizations, relying on external tools or cloud-specific solutions that are not part of their lakehouse platform. Tools that decouple task orchestration from the underlying data processing platform reduce the overall reliability of their production workloads, limit observability, and increase complexity for end users.

#### What is Databricks Workflows?

[Databricks Workflows](https://databricks.com/product/workflows) is the first fully managed and integrated lakehouse [orchestration](https://databricks.com/glossary/orchestration) service that allows data teams to build reliable workflows on any cloud.

Workflows lets you orchestrate data flow pipelines (written in DLT or dbt), as well as machine learning pipelines, or any other tasks such as notebooks or Python wheels. Since Databricks Workflows is fully managed, it eliminates operational overhead for data engineers, enabling them to focus on your workflows, not on managing your infrastructure. It provides an easy point-and-click authoring experience for all your data teams, not just those with specialized skills. Deep integration with the underlying lakehouse platform ensures you will create and run reliable production workloads on any cloud while providing deep and centralized monitoring with simplicity for end users.

Sharing job clusters over multiple tasks reduces the time a job takes, reduces costs by eliminating overhead and increases cluster utilization with parallel tasks.

-----

Databricks Workflows' deep integration with the lakehouse can best be seen with its monitoring and observability features. The matrix view in the following graphic shows a history of runs for a job. Failed tasks are marked in red. A failed job can be repaired and rerun with the click of a button.
Rerunning a failed task detects and triggers the execution of all dependent tasks.

You can create workflows with the UI, but also through the Databricks Workflows API, or with external orchestrators such as Apache Airflow. Even if you are using an external orchestrator, Databricks Workflows' monitoring acts as a single pane of glass that includes externally triggered workflows.

-----

#### Orchestrate anything

Remember that DLT is one of many task types for Databricks Workflows. This is where the managed data flow pipelines with DLT tie together with the easy point-and-click authoring experience of Databricks Workflows.

In the following example, you can see an end-to-end workflow built with customers in a workshop: Data is streamed from Twitter according to search terms, then ingested with Auto Loader using automatic schema detection and enforcement. In the next step, the data is cleaned and transformed with Delta Live Tables pipelines written in SQL, and finally run through a pre-trained BERT language model from Hugging Face for sentiment analysis of the tweets. Different task types for ingest, cleanse/transform and ML are combined in a single workflow.

Using Workflows, these tasks can be scheduled to provide a daily overview of social media coverage and customer sentiment for a business. After streaming tweets with filtering for keywords such as "data engineering," "lakehouse" and "Delta Lake," we curated a list of those tweets that were classified as positive with the highest probability score.
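Since the text mentions that workflows can also be created through the Databricks Workflows (Jobs) API, here is a rough sketch of what defining a small multi-task job could look like via the REST API. The job name, notebook paths, task keys and cluster ID are hypothetical placeholders, and the exact payload fields should be checked against the Jobs API documentation.

```python
import os
import requests

host = os.environ["DATABRICKS_HOST"]   # e.g. https://<workspace>.cloud.databricks.com
token = os.environ["DATABRICKS_TOKEN"]

# A two-task job: ingest first, then transform once ingestion succeeds.
job_spec = {
    "name": "daily-social-sentiment",                       # hypothetical job name
    "tasks": [
        {
            "task_key": "ingest",
            "notebook_task": {"notebook_path": "/Pipelines/ingest_tweets"},
            "existing_cluster_id": "1234-567890-abcdefgh",  # placeholder cluster
        },
        {
            "task_key": "transform",
            "depends_on": [{"task_key": "ingest"}],
            "notebook_task": {"notebook_path": "/Pipelines/clean_and_score"},
            "existing_cluster_id": "1234-567890-abcdefgh",
        },
    ],
}

resp = requests.post(
    f"{host}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {token}"},
    json=job_spec,
)
resp.raise_for_status()
print("Created job:", resp.json()["job_id"])
```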
**Learn more**

[Data Engineering on the Lakehouse](https://databricks.com/solutions/data-pipelines)

[Delta Live Tables](https://databricks.com/product/delta-live-tables)

[Databricks Workflows](https://www.databricks.com/product/workflows)

[Big Book of Data Engineering](https://databricks.com/p/ebook/the-big-book-of-data-engineering?itm_data=datapipelines-promo-bigbookofde)

-----

**CHAPTER**

# 09

### Data streaming

There are two types of data processing: batch processing and streaming processing.

Batch processing refers to the discontinuous, periodic processing of data that has been stored for a period of time. For example, an organization may need to run weekly reports on a set of predictable transaction data. There is no need for this data to be streaming — it can be processed on a weekly basis.

Streaming processing, on the other hand, refers to unbounded processing of data as it arrives.

-----

**Data Streaming Challenges**

However, getting value from streaming data can be a tricky practice. While most data today can be considered streaming data, organizations are overwhelmed by the need to access, process and analyze the volume, speed and variety of this data moving through their platforms.
To keep pace with innovation, they must\n", + "quickly make sense of data streams decisively, consistently and in real time.\n", + "\n", + "Three common technical challenges organizations experience\n", + "with implementing real-time data streaming include:\n", + "\n", + "**•** **Specialized APIs and language skills:** Data practitioners encounter\n", + "barriers to adopting streaming skillsets because there are new languages,\n", + "APIs and tools to learn.\n", + "\n", + "**•** **Operational complexity:** To implement data streaming at scale, data\n", + "teams need to integrate and manage streaming-specific tools with\n", + "their other cloud services. They also have to manually build complex\n", + "operational tooling to help these systems recover from failure, restart\n", + "workloads without reprocessing data, optimize performance, scale the\n", + "underlying infrastructure, and so on.\n", + "\n", + "**•** **Incompatible governance models:** Different governance and security\n", + "models across real-time and historical data platforms makes it difficult\n", + "to provide the right access to the right users, see the end-to-end data\n", + "lineage, and/or meet compliance requirements.\n", + "\n", + "\n", + "In a wide variety of cases, an organization might find it useful to\n", + "leverage streaming data. Here are some common examples:\n", + "\n", + "**•** **Retail:** Real-time inventory updates help support business activities, such\n", + "as inventory and pricing optimization and optimization of the supply chain,\n", + "logistics and just-in-time delivery.\n", + "\n", + "**•** **Smart energy:** Smart meter monitoring in real time allows for smart\n", + "electricity pricing models and connection with renewable energy sources\n", + "to optimize power generation and distribution.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
**•** **Preventative maintenance:** By reducing unplanned outages and unnecessary site and maintenance visits, real-time streaming analytics can lower operational and equipment costs.

**•** **Industrial automation:** Manufacturers can use streaming and predictive analytics to improve production processes and product quality, including setting up automated alerts.

**•** **Healthcare:** To optimize care recommendations, real-time data allows for the integration of various smart sensors to monitor patient condition, medication levels and even recovery speed.

**•** **Financial institutions:** Firms can conduct real-time analysis of transactions to detect fraudulent transactions and send alerts. They can use fraud analytics to identify patterns and feed data into machine learning algorithms.

Regardless of specific use cases, the central tenet of streaming data is that it gives organizations the opportunity to leverage the freshest possible insights for better decision-making and more optimized customer experiences.

-----

**Data streaming architecture**

Before addressing these challenges head-on, it may help to take a step back and discuss the ingredients of a streaming data pipeline. Then, we will explain how the Databricks Lakehouse Platform operates within this context to address the aforementioned challenges.

Every application of streaming data requires a pipeline that brings the data from its origin point — whether sensors, IoT devices or database transactions — to its final destination.

In building this pipeline, streaming architectures typically employ two layers. First, streaming capture systems **capture** and temporarily store streaming data for processing. Sometimes these systems are also called messaging systems or messaging buses. These systems are optimized for small payloads and high frequency inputs/outputs. Second, streaming **processing** systems continuously process data from streaming capture systems and other storage systems.

**Figure:** The two layers of a streaming pipeline: capturing and processing.

It may help to think of a simplified streaming pipeline according to the following seven phases:

1. Data is continuously generated at origin points

2.
The generated data is captured from those origin points by\n", + "a capture system like Apache Kafka (with limited retention)\n", + "\n", + "**3. The captured data is extracted and incrementally ingested to**\n", + "**a processing platform like Databricks; data is ingested exactly**\n", + "**once and stored permanently, even if this step is rerun**\n", + "\n", + "**4. The ingested data is converted into a workable format**\n", + "\n", + "**5. The formatted data is cleansed, transformed and joined in**\n", + "**a number of pipeline steps**\n", + "\n", + "**6. The transformed data is processed downstream through**\n", + "**analysis or ML modeling**\n", + "\n", + "7. The resulting analysis or model is used for some sort of practical\n", + "application, which may be anything from basic reporting to an\n", + "event-driven software application\n", + "\n", + "You will notice four of the steps in this list are in boldface. This is because the\n", + "lakehouse architecture is specifically designed to optimize this part of the\n", + "pipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\n", + "analyze and model on streaming data _alongside_ batch-processed data. It can\n", + "accommodate both structured _and_ unstructured data. It is here that the value\n", + "of unifying the best pieces of data lakes and data warehouses really shines for\n", + "complex enterprise use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data Streaming on the Lakehouse**\n", + "\n", + "Now let’s zoom in a bit and see how the Databricks Lakehouse\n", + "Platform addresses each part of the pipeline mentioned above.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
**Streaming data ingestion and transformation** begins with continuously and incrementally collecting raw data from streaming sources through a feature called Auto Loader. Once the data is ingested, it can be transformed from raw, messy data into clean, fresh, reliable data appropriate for downstream analytics, ML or applications. [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) makes it easy to build and manage these data pipelines while automatically taking care of infrastructure management and scaling, data quality, error testing and other administrative tasks. DLT is a high-level abstraction built on Spark Structured Streaming, a scalable and fault-tolerant stream processing engine.

**[Real-time analytics](https://www.databricks.com/product/databricks-sql)** refers to the downstream analytical application of streaming data. With fresher data streaming into SQL analytics or BI reporting, more actionable insights can be achieved, resulting in better business outcomes.

**[Real-time ML](https://www.databricks.com/product/machine-learning)** involves deploying ML models in a streaming mode. This deployment is supported with structured streaming for continuous inference from a live data stream. Like real-time analytics, real-time ML is a downstream impact of streaming data, but for different business use cases (i.e., AI instead of BI). Real-time modeling has many benefits, including more accurate predictions about the future.

**Real-time applications** process data directly from streaming pipelines and trigger programmatic actions, such as displaying a relevant ad, updating the price on a pricing page, stopping a fraudulent transaction, etc.
There typically\n", + "is no human-in-the-loop for such applications.\n", + "\n", + "\n", + "Data in cloud storage and message stores\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks Lakehouse Platform differentiators**\n", + "\n", + "Understanding what the lakehouse architecture provides is one\n", + "\n", + "thing, but it is useful to understand how Databricks uniquely\n", + "\n", + "approaches the common challenges mentioned earlier around\n", + "\n", + "working with streaming data.\n", + "\n", + "**Databricks empowers unified data teams.** Data engineers, data scientists\n", + "and analysts can easily build streaming data workloads with the languages\n", + "and tools they already know and the APIs they already use.\n", + "\n", + "**Databricks simplifies development and operations.** Organizations can\n", + "focus on getting value from data by reducing complexity and automating\n", + "much of the production aspects associated with building and maintaining\n", + "real-time data workloads.\n", + "\n", + "\n", + "See why customers love streaming on the Databricks\n", + "Lakehouse Platform with these resources.\n", + "\n", + "**Learn more**\n", + "\n", + "[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n", + "\n", + "[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n", + "[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
2df0852583ef94bc403945ac9a1e859d**Databricks simplifies development and operations.** Organizations can\n", + "focus on getting value from data by reducing complexity and automating\n", + "much of the production aspects associated with building and maintaining\n", + "real-time data workloads.\n", + "\n", + "\n", + "See why customers love streaming on the Databricks\n", + "Lakehouse Platform with these resources.\n", + "\n", + "**Learn more**\n", + "\n", + "[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n", + "\n", + "[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n", + "[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n", + "\n", + "[Structured Streaming Documentation](https://docs.databricks.com/spark/latest/structured-streaming/index.html)\n", + "\n", + "[Streaming — Getting Started With Apache Spark on Databricks](https://databricks.com/spark/getting-started-with-apache-spark/streaming)\n", + "\n", + "\n", + "**Databricks is one platform for streaming and batch data.** Organizations\n", + "can eliminate data silos, centralize security and governance models, and\n", + "provide complete support for all their real-time use cases under one roof —\n", + "the roof of the lakehouse.\n", + "\n", + "Finally — and perhaps most important — Delta Lake, the core of the [Databricks](https://www.databricks.com/product/data-lakehouse)\n", + "\n", + "[Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , was built for streaming from the ground up. Delta Lake is\n", + "deeply integrated with Spark Structured Streaming and overcomes many of\n", + "the limitations typically associated with streaming systems and files.\n", + "\n", + "In summary, the Databricks Lakehouse Platform dramatically simplifies data\n", + "streaming to deliver real-time analytics, machine learning and applications on\n", + "one platform. And, that platform is built on a foundation with streaming at its\n", + "core. This means organizations of all sizes can use their data in motion and\n", + "make more informed decisions faster than ever.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Data science and machine learning\n", + "# 10\n", + "\n", + "\n", + "**CHAPTER**\n", + "\n", + "\n", + "While most companies are aware of the potential benefits of applying\n", + "machine learning and AI, realizing these potentials can often be quite\n", + "challenging for those brave enough to take the leap. 
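To make the "built for streaming from the ground up" point above concrete, here is a minimal sketch, with placeholder table names, columns and checkpoint path, of a Delta table serving as both a streaming source and a streaming sink, so the same tables back batch queries and continuous pipelines.

```python
# Sketch: Delta tables as a Structured Streaming source and sink.
# Table names, the checkpoint path and the filter column are assumptions.
events = spark.readStream.table("demo.bronze.events")    # stream new rows from a Delta table

cleaned = events.where("event_type IS NOT NULL")          # lightweight transformation

(
    cleaned.writeStream
    .option("checkpointLocation", "/Volumes/demo/_checkpoints/silver_events")
    .outputMode("append")
    .toTable("demo.silver.events")                         # stream into another Delta table
)
```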
Some of the\n", + "largest hurdles come from siloed/disparate data systems, complex\n", + "experimentation environments, and getting models served in a\n", + "production setting.\n", + "\n", + "\n", + "Fortunately, the Databricks Lakehouse Platform provides a helping\n", + "hand and lets you use data to derive innovative insights, build\n", + "powerful predictive models, and enable data scientists, ML engineers,\n", + "and developers of all kinds to create within the space of machine\n", + "learning and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Databricks Machine Learning\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Exploratory data analysis\n", + "\n", + "With all the data in one place, data is easily\n", + "explored and visualized from within the\n", + "notebook-style experience that provides support\n", + "for various languages (R, SQL, Python and Scala)\n", + "as well as built-in visualizations and dashboards.\n", + "Confidently and securely share code with\n", + "co-authoring, commenting, automatic versioning,\n", + "Git integrations and role-based access controls.\n", + "The platform provides laptop-like simplicity at\n", + "production-ready scale.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Model creation and management\n", + "\n", + "From data ingestion to model training and tuning, all the way through to\n", + "production model serving and versioning, the Lakehouse brings the tools\n", + "needed to simplify those tasks.\n", + "\n", + "Get right into experimenting with the Databricks ML runtimes, optimized and\n", + "preconfigured to include most popular libraries like scikit-learn, XGBoost and\n", + "more. Massively scale thanks to built-in support for distributed training and\n", + "hardware acceleration with GPUs.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
3b9e992190e770ecb1aa8269e5d05d96#### Databricks Machine Learning\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Exploratory data analysis\n", + "\n", + "With all the data in one place, data is easily\n", + "explored and visualized from within the\n", + "notebook-style experience that provides support\n", + "for various languages (R, SQL, Python and Scala)\n", + "as well as built-in visualizations and dashboards.\n", + "Confidently and securely share code with\n", + "co-authoring, commenting, automatic versioning,\n", + "Git integrations and role-based access controls.\n", + "The platform provides laptop-like simplicity at\n", + "production-ready scale.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Model creation and management\n", + "\n", + "From data ingestion to model training and tuning, all the way through to\n", + "production model serving and versioning, the Lakehouse brings the tools\n", + "needed to simplify those tasks.\n", + "\n", + "Get right into experimenting with the Databricks ML runtimes, optimized and\n", + "preconfigured to include most popular libraries like scikit-learn, XGBoost and\n", + "more. Massively scale thanks to built-in support for distributed training and\n", + "hardware acceleration with GPUs.\n", + "\n", + "From within the runtimes, you can track model training sessions, package and\n", + "reuse models easily with [MLflow](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) , an open source machine learning platform\n", + "created by Databricks and included as a managed service within the Lakehouse.\n", + "It provides a centralized location from which to manage models and package\n", + "code in an easily reusable way.\n", + "\n", + "Training these models often involves the use of features housed in a centralized\n", + "feature store. Fortunately, Databricks has a built-in feature store that allows you\n", + "to create new features, explore and re-use existing features, select features for\n", + "training and scoring machine learning models, and publish features to low-latency\n", + "online stores for real-time inference.\n", + "\n", + "If you are looking to get a head start, [AutoML](https://databricks.com/blog/2022/04/18/supercharge-your-machine-learning-projects-with-databricks-automl-now-generally-available.html) allows for low to no-code\n", + "experimentation by pointing to your data set and automatically training models\n", + "and tuning hyperparameters to save both novice and advanced users precious\n", + "time in the machine learning process.\n", + "\n", + "\n", + "AutoML will also report back metrics related to the model training results as well\n", + "as the code needed to repeat the training already custom-tailored to your data\n", + "set. 
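The MLflow tracking flow described above might look roughly like the following self-contained sketch; the synthetic data, model choice and metric are placeholders rather than a prescribed workflow.

```python
# Minimal MLflow tracking sketch with synthetic data (all values are illustrative).
import mlflow
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

rng = np.random.default_rng(0)
X = rng.normal(size=(500, 4))
y = X @ np.array([3.0, -2.0, 1.5, 0.5]) + rng.normal(scale=0.1, size=500)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

with mlflow.start_run(run_name="demand_forecast_rf"):
    model = RandomForestRegressor(n_estimators=200, max_depth=8, random_state=0)
    model.fit(X_train, y_train)

    # Parameters, metrics and the fitted model are recorded with the run so the
    # experiment can be compared, reproduced and later promoted to the registry.
    mlflow.log_params({"n_estimators": 200, "max_depth": 8})
    mlflow.log_metric("mae", mean_absolute_error(y_test, model.predict(X_test)))
    mlflow.sklearn.log_model(model, "model")
```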
This glass box approach ensures that you are never trapped or suffer from\n", + "vendor lock-in.\n", + "\n", + "In that regard, the Lakehouse supports the industry’s widest range of data tools,\n", + "development environments, and a thriving ISV ecosystem so you can make your\n", + "workspace your own and put out your best work.\n", + "\n", + "##### Compute platform\n", + "\n", + "**Any ML workload optimized and accelerated**\n", + "\n", + "**Databricks Machine Learning Runtime**\n", + "\n", + "- Optimized and preconfigured ML frameworks\n", + "\n", + "- Turnkey distribution ML\n", + "\n", + "- Built-in AutoML\n", + "\n", + "- GPU support out of the box\n", + "\n", + "\n", + "Built-in **ML frameworks**\n", + "and **model explainability**\n", + "\n", + "Built-in support for **AutoML**\n", + "and **hyperparameter tuning**\n", + "\n", + "\n", + "Built-in support for\n", + "**distributed training**\n", + "\n", + "Built-in support for\n", + "**hardware accelerators**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Deploy your models to production\n", + "\n", + "Exploring and creating your machine learning models\n", + "typically represents only part of the task. Once the\n", + "models exist and perform well, they must become\n", + "part of a pipeline that keeps models updated,\n", + "monitored and available for use by others.\n", + "\n", + "**Webhooks** allow registering of\n", + "\n", + "\n", + "Databricks can help here by providing a world-class\n", + "experience for model versioning, monitoring and\n", + "serving within the same platform that you can use\n", + "to generate the models themselves. This means you\n", + "can make all your ML pipelines in the same place,\n", + "monitor them for drift, retrain them with new data,\n", + "and promote and serve them easily and at scale.\n", + "\n", + "Throughout the ML lifecycle, rest assured knowing\n", + "that lineage and governance are being tracked the\n", + "entire way. This means regulatory compliance and\n", + "security woes are significantly reduced, potentially\n", + "saving costly issues down the road.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
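A minimal sketch of the versioning-and-promotion workflow described above, using the MLflow Model Registry client: the model name and run URI are hypothetical placeholders, and the webhook registration for CI/CD callbacks is omitted here.

```python
# Model Registry lifecycle sketch -- the model name and run ID are placeholders.
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Register the model logged by a tracking run as a new version of a named entry.
version = mlflow.register_model("runs:/<run_id>/model", "retail_demand_forecast")

# Tags attach deployment-specific metadata to the version.
client.set_model_version_tag(
    "retail_demand_forecast", version.version, "validation_status", "passed"
)

# Descriptions carry reviewer notes and collaboration on a version.
client.update_model_version(
    "retail_demand_forecast", version.version,
    description="MAE within target; approved for staging rollout.",
)

# Promote the version through the lifecycle stages (Staging / Production / Archived).
client.transition_model_version_stage(
    "retail_demand_forecast", version.version, stage="Staging"
)
```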
4fd897c16496ffce91caab2405bf7076Built-in support for\n", + "**distributed training**\n", + "\n", + "Built-in support for\n", + "**hardware accelerators**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Deploy your models to production\n", + "\n", + "Exploring and creating your machine learning models\n", + "typically represents only part of the task. Once the\n", + "models exist and perform well, they must become\n", + "part of a pipeline that keeps models updated,\n", + "monitored and available for use by others.\n", + "\n", + "**Webhooks** allow registering of\n", + "\n", + "\n", + "Databricks can help here by providing a world-class\n", + "experience for model versioning, monitoring and\n", + "serving within the same platform that you can use\n", + "to generate the models themselves. This means you\n", + "can make all your ML pipelines in the same place,\n", + "monitor them for drift, retrain them with new data,\n", + "and promote and serve them easily and at scale.\n", + "\n", + "Throughout the ML lifecycle, rest assured knowing\n", + "that lineage and governance are being tracked the\n", + "entire way. This means regulatory compliance and\n", + "security woes are significantly reduced, potentially\n", + "saving costly issues down the road.\n", + "\n", + "\n", + "callbacks on events like stage\n", + "\n", + "transitions to integrate with CI/CD\n", + "\n", + "automation.\n", + "\n", + "**Tags** allow storing deployment\n", + "\n", + "— specific metadata with model\n", + "\n", + "versions, e.g., whether the\n", + "\n", + "deployment was successful.\n", + "\n", + "\n", + "**Model lifecycle management**\n", + "\n", + "Staging Production Archived\n", + "\n", + "\n", + "Logged\n", + "model\n", + "\n", + "**Comments** allow communication\n", + "\n", + "and collaboration between\n", + "\n", + "teammates when reviewing\n", + "\n", + "model versions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Learn more**\n", + "\n", + "[Databricks Machine Learning](https://databricks.com/product/machine-learning)\n", + "\n", + "[Databricks Data Science](https://databricks.com/product/data-science)\n", + "\n", + "[Databricks ML Runtime Documentation](https://docs.databricks.com/runtime/mlruntime.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 11\n", + "\n", + "\n", + "### Databricks Technology Partners and the modern data stack\n", + "\n", + "Databricks Technology Partners integrate their solutions with Databricks to\n", + "provide complementary capabilities for ETL, data ingestion, business intelligence,\n", + "machine learning and governance. These integrations allow customers to leverage\n", + "the Databricks Lakehouse Platform’s reliability and scalability to innovate faster\n", + "while deriving valuable data insights. Use preferred analytical tools with optimized\n", + "connectors for fast performance, low latency and high user concurrency to your\n", + "data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "With [Partner Connect](https://databricks.com/partnerconnect) , you can bring together all your data, analytics and AI tools on one open platform. 
Databricks provides a fast and easy way to connect your existing\n", + "tools to your lakehouse using validated integrations and helps you discover and try new solutions.\n", + "\n", + "**Databricks thrives within your modern data stack**\n", + "\n", + "**BI and dashboards** **Machine learning** **Data science**\n", + "\n", + "\n", + "**Data governance**\n", + "\n", + "**Data pipelines**\n", + "\n", + "**Data ingestion**\n", + "\n", + "\n", + "Data Data Data\n", + "warehousing engineering streaming\n", + "\n", + "**Unity Catalog**\n", + "\n", + "\n", + "Data science\n", + "and ML\n", + "\n", + "\n", + "**Consulting**\n", + "**and SI partners**\n", + "\n", + "\n", + "**Delta Lake**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n", + "\n", + "[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n", + "\n", + "\n", + "[Partner Connect](https://databricks.com/partnerconnect)\n", + "\n", + "[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Get started with the Databricks Lakehouse Platform\n", + "# 12\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
1aae0175847f695b2f674a019d22936eData Data Data\n", + "warehousing engineering streaming\n", + "\n", + "**Unity Catalog**\n", + "\n", + "\n", + "Data science\n", + "and ML\n", + "\n", + "\n", + "**Consulting**\n", + "**and SI partners**\n", + "\n", + "\n", + "**Delta Lake**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n", + "\n", + "[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n", + "\n", + "\n", + "[Partner Connect](https://databricks.com/partnerconnect)\n", + "\n", + "[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Get started with the Databricks Lakehouse Platform\n", + "# 12\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Databricks Trial\n", + "\n", + "Get a collaborative environment for data teams to build solutions together with interactive\n", + "notebooks to use Apache Spark TM , SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras,\n", + "scikit-learn and more.\n", + "\n", + "**•** Available as a 14-day full trial in your own cloud or as a lightweight trial\n", + "hosted by Databricks\n", + "\n", + "**[Try Databricks for free](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n", + "\n", + "\n", + "**[Databricks documentation](https://databricks.com/documentation)**\n", + "\n", + "Get detailed documentation to get started with\n", + "the Databricks Lakehouse Platform on your cloud\n", + "of choice: Databricks on AWS, Azure Databricks\n", + "and [Databricks on Google Cloud](https://docs.gcp.databricks.com/?_gl=1*16ovt38*_gcl_aw*R0NMLjE2NTI1NDYxNjIuQ2owS0NRandwdjJUQmhEb0FSSXNBTEJuVm5saU9ydGpfX21uT1U5NU5iRThSbmI5a3o2OGdDNUY0UTRzYThtTGhVZHZVb0NhTkRBMmlWc2FBcEN6RUFMd193Y0I.&_ga=2.135042808.863708747.1652113196-1440404449.1635787641&_gac=1.225252968.1652546163.Cj0KCQjwpv2TBhDoARIsALBnVnliOrtj__mnOU95NbE8Rnb9kz68gC5F4Q4sa8mLhUdvUoCaNDA2iVsaApCzEALw_wcB) .\n", + "\n", + "**[Databricks Demo Hub](https://databricks.com/discover/demos)**\n", + "\n", + "Get a firsthand look at Databricks from the\n", + "practitioner’s perspective with these simple\n", + "on-demand videos. Each demo is paired with\n", + "related materials — including notebooks, videos\n", + "and eBooks — so that you can try it out for\n", + "yourself on Databricks.\n", + "\n", + "\n", + "**[Databricks Academy](https://databricks.com/learn/training/home)**\n", + "\n", + "Whether you are new to the data lake or building on\n", + "an existing skill set, you can find a curriculum tailored\n", + "to your role or interest. 
With training and certification\n", + "through Databricks Academy, you will learn to master\n", + "the Databricks Lakehouse Platform for all your big\n", + "data analytics projects.\n", + "\n", + "**[Databricks Community](https://community.databricks.com/)**\n", + "\n", + "\n", + "**[Databricks Labs](https://databricks.com/learn/labs)**\n", + "\n", + "Databricks Labs are projects created by the\n", + "field to help customers get their use cases\n", + "into production faster.\n", + "\n", + "**[Databricks customers](https://databricks.com/customers)**\n", + "\n", + "Discover how innovative companies across\n", + "every industry are leveraging the Databricks\n", + "Lakehouse Platform.\n", + "\n", + "\n", + "Get answers, network with peers and solve\n", + "the world’s toughest problems, together.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### About DatabricksSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
c5ae327904492731d7fd76d12d66efb9Whether you are new to the data lake or building on\n", + "an existing skill set, you can find a curriculum tailored\n", + "to your role or interest. With training and certification\n", + "through Databricks Academy, you will learn to master\n", + "the Databricks Lakehouse Platform for all your big\n", + "data analytics projects.\n", + "\n", + "**[Databricks Community](https://community.databricks.com/)**\n", + "\n", + "\n", + "**[Databricks Labs](https://databricks.com/learn/labs)**\n", + "\n", + "Databricks Labs are projects created by the\n", + "field to help customers get their use cases\n", + "into production faster.\n", + "\n", + "**[Databricks customers](https://databricks.com/customers)**\n", + "\n", + "Discover how innovative companies across\n", + "every industry are leveraging the Databricks\n", + "Lakehouse Platform.\n", + "\n", + "\n", + "Get answers, network with peers and solve\n", + "the world’s toughest problems, together.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000\n", + "organizations worldwide — including Comcast, Condé Nast,\n", + "H&M and over 40% of the Fortune 500 — rely on the Databricks\n", + "Lakehouse Platform to unify their data, analytics and AI. Databricks\n", + "is headquartered in San Francisco, with offices around the globe.\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "and MLflow, Databricks is on a mission to help data teams solve the\n", + "world’s toughest problems. To learn more, follow Databricks on\n", + "[Twitter](https://twitter.com/databricks) **,** [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "© Databricks 2022. All rights reserved. Apache, Apache Spark, Spark and the Spark\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
9f9c35c2d6e7c59e06e3fec911a0e217#### eBook\n", + "\n", + "# Big Book of Retail\n", + " & Consumer Goods Use Cases\n", + "\n", + "##### Driving real-time decisions\n", + " with the Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents (1/2) C H A P T E R 1 : \u0007 Introduction 4\n", + "\n", + "**C H A P T E R 2 :** \u0007 **Modern Data Platform for Real-Time Retail** 6\n", + "\n", + "Common challenges 6\n", + "\n", + "The Lakehouse for Retail 8\n", + "\n", + "**C H A P T E R 3 :** **Use Case: Real-Time Supply Chain Data** \u0007 12\n", + "\n", + "Case Study: Gousto 14\n", + "\n", + "Case Study: ButcherBox 14\n", + "\n", + "**C H A P T E R 4 :** \u0007 **Use Case: Truck Monitoring** 15\n", + "\n", + "Case Study: Embark 16\n", + "\n", + "**C H A P T E R 5 :** **Use Case: Inventory Allocation** \u0007 17\n", + "\n", + "Case Study: H&M 19\n", + "\n", + "Case Study: Edmunds 19\n", + "\n", + "**C H A P T E R 6 :** **Use Case: Point of Sale and Clickstream** \u0007 20\n", + "\n", + "**C H A P T E R 7 :** **Use Case: On-Shelf Availability** \u0007 22\n", + "\n", + "Case Study: Reckitt 25\n", + "\n", + "**C H A P T E R 8 :** **�Use Case: Customer and Vehicle Identification** 26\n", + "\n", + "**C H A P T E R 9 :** \u0007 **Use Case: Recommendation Engines** 28\n", + "\n", + "Case Study: Wehkamp 31\n", + "\n", + "Case Study: Columbia 31\n", + "\n", + "Case Study: Pandora 31\n", + "\n", + "**C H A P T E R 1 0 :** \u0007 **Use Case: Perpetual Inventory** 32\n", + "\n", + "**C H A P T E R 1 1 :** \u0007 **Use Case: Automated Replenishments** 34\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents (2/2) C H A P T E R 1 2 : \u0007 Use Case: Fresh Food Forecasting 36\n", + "\n", + "Case Study: ButcherBox 37\n", + "\n", + "Case Study: Sam’s Club 37\n", + "\n", + "**C H A P T E R 1 3 :** \u0007 **Use Case: Propensity-to-Buy** 38\n", + "\n", + "**C H A P T E R 1 4 :** \u0007 **Use Case: Next Best Action** 41\n", + "\n", + "**C H A P T E R 1 5 :** **Customers That Innovate With Databricks Lakehouse for Retail** \u0007 43\n", + "\n", + "**C H A P T E R 1 6 :** \u0007 **Conclusion** 43\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "### Introduction\n", + "\n", + "\n", + "Retailers are increasingly being challenged to make time-sensitive decisions in their operations. Consolidating\n", + "\n", + "e-commerce orders. Optimizing distribution to ensure item availability. Routing delivery vehicles. These\n", + "\n", + "decisions happen thousands of times daily and have a significant financial impact. Retailers need real-time data\n", + "\n", + "to support these decisions, but legacy systems are limited to data that’s hours or days old.\n", + "\n", + "**When seconds matter, only the Lakehouse delivers better decisions**\n", + "\n", + "Retail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n", + "\n", + "The integration of physical and e-commerce customer experiences into an omnichannel journey has been\n", + "\n", + "happening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n", + "\n", + "purchasing patterns.\n", + "\n", + "In reaction to these industry changes, retailers have responded with significant, rapid investments — including\n", + "\n", + "stronger personalization, order fulfillment, and delivery and loyalty systems. 
While these new targeted\n", + "\n", + "capabilities have addressed the immediate need — and created expectations of making decisions in real\n", + "\n", + "time — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n", + "\n", + "Unfortunately, most legacy systems are only able to process information in hours or days.\n", + "\n", + "The delays caused by waiting for data are leading to significant risks and costs for the industry.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
cddfc4c750af70bbe7e43384e73c4ff4to support these decisions, but legacy systems are limited to data that’s hours or days old.\n", + "\n", + "**When seconds matter, only the Lakehouse delivers better decisions**\n", + "\n", + "Retail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n", + "\n", + "The integration of physical and e-commerce customer experiences into an omnichannel journey has been\n", + "\n", + "happening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n", + "\n", + "purchasing patterns.\n", + "\n", + "In reaction to these industry changes, retailers have responded with significant, rapid investments — including\n", + "\n", + "stronger personalization, order fulfillment, and delivery and loyalty systems. While these new targeted\n", + "\n", + "capabilities have addressed the immediate need — and created expectations of making decisions in real\n", + "\n", + "time — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n", + "\n", + "Unfortunately, most legacy systems are only able to process information in hours or days.\n", + "\n", + "The delays caused by waiting for data are leading to significant risks and costs for the industry.\n", + "\n", + "**Grocers** need to consolidate order picking to achieve profitability in e-commerce, but this requires up-to-\n", + "\n", + "the-minute order data. Not having this information causes them to spend more resources on having people\n", + "\n", + "pick orders separately, at a higher operating cost.\n", + "\n", + "**Apparel retailers** must be able to present the correct available inventory on their website. This requires\n", + "\n", + "that in-store sales be immediately reflected in their online systems. Inaccurate information can lead to lost\n", + "\n", + "sales, or worse, the customer becoming unsatisfied and moving to different retailers.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Convenience fuel retailers** must collaborate with distribution centers, direct-to-store delivery distributors\n", + "\n", + "and other partners. Having delayed data can lead to out-of-stocks, costing stores thousands of dollars per\n", + "\n", + "week.\n", + "\n", + "The margin of error in retail has always been razor thin, but with a pandemic and inflationary pressures, it’s at\n", + "\n", + "zero. Reducing the error rate requires better predictions and real-time data.\n", + "\n", + "**Use Case Guide**\n", + "\n", + "In this use case guide, we show how the Databricks Lakehouse for Retail is helping leading organizations take\n", + "\n", + "**all of their data in a single lakehouse architecture, streamline their data engineering and management,**\n", + "\n", + "**make it ready for SQL and ML/AI** , and **do so very fast within their own cloud infrastructure environment**\n", + "\n", + "**based on open source and open standards** . These capabilities are all delivered at world-record-setting\n", + "\n", + "performance, while achieving a market-leading total cost of ownership.\n", + "\n", + "Databricks Lakehouse for Retail has become the industry standard for enabling retailers to drive decisions\n", + "\n", + "in real time. 
This use case guide also highlights common use cases across the industry, and offers additional\n", + "\n", + "resources in the form of Solution Accelerators and reference architectures to help as you embark on your own\n", + "\n", + "journey to drive better customer experiences with data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 2:**\n", + "### Modern Data Platform\n", + " for Real-Time Retail\n", + "\n", + "\n", + "Retailers continue to adapt to rapidly shifting dynamics across the omnichannel. In navigating these\n", + "\n", + "changes, retailers are increasingly focused on improving the real-time availability of data and insights, and\n", + "\n", + "performing advanced analytics delivered within tight business service windows.\n", + "\n", + "**Common challenges**\n", + "\n", + "In response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n", + "\n", + "in modernizing distribution centers, partnering with delivery companies, and investing in customer\n", + "\n", + "engagement systems.\n", + "\n", + "Warehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n", + "\n", + "distribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n", + "\n", + "that became accustomed to having fast, same-day, and sometimes even overnight delivery options\n", + "\n", + "during the pandemic now expect them as the norm. Retailers understand that the shipping and delivery\n", + "\n", + "experience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n", + "\n", + "## $41B Market | Retail Warehouse Automation\n", + "\n", + "Yet while retailers modernize different areas of their operations, they’re constrained by a single point ofSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
9b5141e5ec6f6b347aa972b1c17623d2changes, retailers are increasingly focused on improving the real-time availability of data and insights, and\n", + "\n", + "performing advanced analytics delivered within tight business service windows.\n", + "\n", + "**Common challenges**\n", + "\n", + "In response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n", + "\n", + "in modernizing distribution centers, partnering with delivery companies, and investing in customer\n", + "\n", + "engagement systems.\n", + "\n", + "Warehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n", + "\n", + "distribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n", + "\n", + "that became accustomed to having fast, same-day, and sometimes even overnight delivery options\n", + "\n", + "during the pandemic now expect them as the norm. Retailers understand that the shipping and delivery\n", + "\n", + "experience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n", + "\n", + "## $41B Market | Retail Warehouse Automation\n", + "\n", + "Yet while retailers modernize different areas of their operations, they’re constrained by a single point of\n", + "\n", + "weakness, as they are reliant on legacy data platforms to bring together all of this data.\n", + "\n", + "Powering real-time decisions in modern retail requires real-time ingestion of data, transformation,\n", + "\n", + "governance of information, and powering business intelligence and predictive analytics all within the time\n", + "\n", + "required by retail operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Ingesting large volumes of transactional data in real time.** The biggest blocker to crucial insights\n", + "\n", + "is the ability to ingest data from transaction systems in real time. Transaction logs from point-of-sale\n", + "\n", + "systems, clickstreams, mobile applications, advertising and promotions, as well as inventory, logistics\n", + "\n", + "and other systems, are constantly streaming data. Big data sets need to be ingested, cleansed and\n", + "\n", + "aggregated and integrated with each other before they can be used. The problem? Retailers have used\n", + "\n", + "legacy data warehouses that are built around batch processing. And worse, increasing the frequency\n", + "\n", + "of how often data is processed leads to a “hockey stick” in costs. As a result of these limitations,\n", + "\n", + "merchants resort to ingesting data nightly to deal with the large volumes of data and integration with\n", + "\n", + "other data sets. The result? Accurate data to drive decisions can be delayed by days.\n", + "\n", + "**Performing fine-grained analysis at scale within tight time windows.** Retailers have accepted a\n", + "\n", + "trade-off when performing analysis. Predictions can be detailed and accurate, or they can be fast.\n", + "\n", + "Running forecasts or price models at a day, store and SKU level can improve accuracy by 10% or more,\n", + "\n", + "but doing so requires tens of millions of model calculations that need to be performed in narrow service\n", + "\n", + "windows. This is well beyond the capability of legacy data platforms. 
As a result, companies have been\n", + "\n", + "forced to accept the trade-off and live with less accurate predictions.\n", + "\n", + "**\u0007Powering real-time decisions on the front line.** Data is only useful if it drives decisions, but serving\n", + "\n", + "real-time data to thousands of employees is a daunting task. While data warehouses are capable\n", + "\n", + "of serving reports to large groups of users, they’re still limited to stale data. Most retailers limit the\n", + "\n", + "frequency of reports to daily or weekly updates and depend on the staff to use their best judgment for\n", + "\n", + "decisions that are more frequent.\n", + "\n", + "**\u0007Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is\n", + "\n", + "focused on delivering personalized experiences throughout the omnichannel. Retailers have access to\n", + "\n", + "a trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation\n", + "\n", + "struggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to\n", + "\n", + "deliver personalized experiences at scale to win in retail.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### The Lakehouse for Retail\n", + "\n", + "Databricks Lakehouse for Retail solves these core challenges. The Lakehouse unlocks the ability to unify\n", + "\n", + "all types of data — from images to structured data — in real time, provide enterprise-class management\n", + "\n", + "and governance, and then immediately turn that data into actionable insights with real-time reporting andSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
19724ab32f8993c136caf2d55947906afrequency of reports to daily or weekly updates and depend on the staff to use their best judgment for\n", + "\n", + "decisions that are more frequent.\n", + "\n", + "**\u0007Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is\n", + "\n", + "focused on delivering personalized experiences throughout the omnichannel. Retailers have access to\n", + "\n", + "a trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation\n", + "\n", + "struggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to\n", + "\n", + "deliver personalized experiences at scale to win in retail.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### The Lakehouse for Retail\n", + "\n", + "Databricks Lakehouse for Retail solves these core challenges. The Lakehouse unlocks the ability to unify\n", + "\n", + "all types of data — from images to structured data — in real time, provide enterprise-class management\n", + "\n", + "and governance, and then immediately turn that data into actionable insights with real-time reporting and\n", + "\n", + "predictive analytics. It does this with record-setting speed and industry-leading total cost of ownership\n", + "\n", + "(TCO) in a platform-as-a-service (PaaS) that allows customers to solve these pressing problems.\n", + "\n", + "**Any structure** **Reliable, real-time** **Capabilities for** **Data sharing**\n", + "**or frequency** **processing** **any persona** **& collaboration**\n", + "\n", + "_Semi-structured batch_\n", + "\n", + "\n", + "**All of**\n", + "**your sources**\n", + "\n", + "Competitive activity\n", + "\n", + "E-commerce\n", + "\n", + "Mobile Applications\n", + "\n", + "Video & Images\n", + "\n", + "Point of Sale\n", + "\n", + "Distribution & Logistics\n", + "\n", + "Customer & Loyalty\n", + "\n", + "Delivery & Partners\n", + "\n", + "\n", + "_Structured real-time_\n", + "\n", + "_Semi-structured real-time_\n", + "\n", + "_Unstructured batch_\n", + "\n", + "_Semi-structured real-time_\n", + "\n", + "_Structured real-time_\n", + "\n", + "_Structured batch_\n", + "\n", + "\n", + "Data Lakehouse\n", + "\n", + "Data Management and Governance\n", + "\n", + "Process, manage and query all of your data\n", + "\n", + "\n", + "Ad Hoc Data Science\n", + "\n", + "**Internal Teams**\n", + "\n", + "Production\n", + "Machine Learning\n", + "\n", + "**Customers**\n", + "\n", + "BI Reporting\n", + "& Dashboarding\n", + "\n", + "**Partners**\n", + "\n", + "Real-time Applications\n", + "\n", + "\n", + "Any Cloud\n", + "\n", + "\n", + "_Structured real-time_\n", + "\n", + "\n", + "-----\n", + "\n", + "**Reference Architecture**\n", + "\n", + "At the core of the Databricks Lakehouse for Retail is technology that enables retailers to avoid the trade-\n", + "\n", + "offs between speed and accuracy. Technology such as Delta Lake enables the Lakehouse — a new paradigm\n", + "\n", + "that combines the best elements of data warehouses and data lakes — to directly address these factors by\n", + "\n", + "enabling you to unify all of your data — structured and unstructured, batch and real-time — in one centrally\n", + "\n", + "managed and governed location. 
Once in the Lakehouse, e-commerce systems, reporting users, analysts,\n", + "\n", + "data scientists and data engineers can all leverage this information to serve models for applications and\n", + "\n", + "power real-time reporting, advanced analytics, large-scale forecasting models and more.\n", + "\n", + "**EDGE** **HYBRID** **CLOUD**\n", + "\n", + "\n", + "\n", + "REST Model Serving\n", + "\n", + "|Machine Learning Operations Tracking Registery|RES|\n", + "|---|---|\n", + "||Application|\n", + "\n", + "\n", + "\n", + "Replication\n", + "\n", + "\n", + "Automatic DBs\n", + "\n", + "|Col1|Real-tim|\n", + "|---|---|\n", + "|||\n", + "\n", + "\n", + "Raw Data\n", + "\n", + "(Bronze Table)\n", + "\n", + "\n", + "Clean Data\n", + "\n", + "(Silver Table)\n", + "\n", + "\n", + "Refined Data\n", + "\n", + "(Gold Table)\n", + "\n", + "\n", + "Business\n", + "Applications\n", + "\n", + "Power BI\n", + "\n", + "\n", + "Batch\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How it works\n", + "\n", + "The Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends\n", + "\n", + "simplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is\n", + "\n", + "differentiated capabilities that help retailers win.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
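Picking up the Bronze/Silver/Gold flow in the reference architecture above, the sketch below keeps a BI-facing Gold table continuously fresh with a streaming aggregation. Table names, columns and the window/watermark intervals are illustrative assumptions.

```python
# Silver -> Gold streaming aggregation sketch (names and intervals are assumptions).
from pyspark.sql import functions as F

silver = spark.readStream.table("demo.silver.pos_transactions")

gold = (
    silver
    .withWatermark("event_time", "10 minutes")
    .groupBy(F.window("event_time", "5 minutes"), "store_id")
    .agg(
        F.sum("amount").alias("sales_amount"),
        F.count(F.lit(1)).alias("transactions"),
    )
)

(
    gold.writeStream
    .option("checkpointLocation", "/Volumes/demo/_checkpoints/gold_sales_5min")
    .outputMode("append")          # windows are emitted once the watermark closes them
    .toTable("demo.gold.sales_5min")
)
```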
f7507cf1a2132c8afe0151e2ccb104f9**EDGE** **HYBRID** **CLOUD**\n", + "\n", + "\n", + "\n", + "REST Model Serving\n", + "\n", + "|Machine Learning Operations Tracking Registery|RES|\n", + "|---|---|\n", + "||Application|\n", + "\n", + "\n", + "\n", + "Replication\n", + "\n", + "\n", + "Automatic DBs\n", + "\n", + "|Col1|Real-tim|\n", + "|---|---|\n", + "|||\n", + "\n", + "\n", + "Raw Data\n", + "\n", + "(Bronze Table)\n", + "\n", + "\n", + "Clean Data\n", + "\n", + "(Silver Table)\n", + "\n", + "\n", + "Refined Data\n", + "\n", + "(Gold Table)\n", + "\n", + "\n", + "Business\n", + "Applications\n", + "\n", + "Power BI\n", + "\n", + "\n", + "Batch\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How it works\n", + "\n", + "The Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends\n", + "\n", + "simplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is\n", + "\n", + "differentiated capabilities that help retailers win.\n", + "\n", + "Robust data Time-sensitive machine\n", + "Data in real time Use all of your data Real-time reporting\n", + "management learning\n", + "\n", + "\n", + "**Limited.** EDWs support the\n", + "\n", + "management of structured\n", + "\n", + "data.\n", + "\n", + "**No.** Data lakes lack\n", + "\n", + "enterprise-class data\n", + "\n", + "management tools.\n", + "\n", + "**Yes.** Delta and Unity\n", + "\n", + "Catalog offer native\n", + "\n", + "data management and\n", + "\n", + "governance of all data types.\n", + "\n", + "\n", + "**No.** EDWs offer quick access\n", + "\n", + "to reports on old data.\n", + "\n", + "**No.** Data lakes were not\n", + "\n", + "designed for reporting, let\n", + "\n", + "alone real-time reporting.\n", + "\n", + "**No.** Data lakes are able to\n", + "\n", + "support large analytics,\n", + "\n", + "but lack the ability to meet\n", + "\n", + "business SLAs.\n", + "\n", + "\n", + "**No.** EDWs must extract data\n", + "\n", + "and send it to a third party\n", + "\n", + "for machine learning.\n", + "\n", + "**Yes.** Data views can be\n", + "\n", + "materialized, enabling front-\n", + "\n", + "line employees with real-\n", + "\n", + "time data.\n", + "\n", + "**Yes.** The Lakehouse can\n", + "\n", + "scale to process the most\n", + "\n", + "demanding predictions\n", + "\n", + "within business SLAs.\n", + "\n", + "\n", + "**No.** Data warehouses are\n", + "\n", + "batch oriented, restricting\n", + "\n", + "data updates to hours or days.\n", + "\n", + "**No.** Data lakes are batch\n", + "\n", + "oriented.\n", + "\n", + "**Yes.** Support for real-time\n", + "\n", + "streaming data.\n", + "\n", + "\n", + "**No.** Data warehouses have\n", + "\n", + "very limited support for\n", + "\n", + "unstructured data.\n", + "\n", + "**Yes.** Data lakes offer support\n", + "\n", + "for all types of data.\n", + "\n", + "**Yes.** Supports all types of\n", + "\n", + "data in a centrally managed\n", + "\n", + "platform.\n", + "\n", + "\n", + "**LEGACY DATA**\n", + "\n", + "**WAREHOUSE**\n", + "\n", + "\n", + "**LEGACY DATA**\n", + "\n", + "\n", + "**DATA LAKES**\n", + "\n", + "**(HADOOP)**\n", + "\n", + "\n", + "**DATA LAKES**\n", + "\n", + "\n", + "**ROBUST**\n", + "\n", + "**DATA**\n", + "\n", + "\n", + "**ROBUST**\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Data in real time.** Retail operates in real time and so should your data. 
The Lakehouse offers support\n", + "\n", + "for streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce\n", + "\n", + "and point-of-sale data. And Delta Lake enables this world-record-leading performance while\n", + "\n", + "maintaining support for ACID transactions.\n", + "\n", + "**\u0007Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images\n", + "\n", + "and a growing variety of other data sources. This data is extremely powerful in helping to improve our\n", + "\n", + "understanding of consumer behavior and operations. The Lakehouse for Retail enables companies\n", + "\n", + "to take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse\n", + "\n", + "architecture.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
3fd69219dd9f62357486b696bdc163f5**DATA LAKES**\n", + "\n", + "**(HADOOP)**\n", + "\n", + "\n", + "**DATA LAKES**\n", + "\n", + "\n", + "**ROBUST**\n", + "\n", + "**DATA**\n", + "\n", + "\n", + "**ROBUST**\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Data in real time.** Retail operates in real time and so should your data. The Lakehouse offers support\n", + "\n", + "for streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce\n", + "\n", + "and point-of-sale data. And Delta Lake enables this world-record-leading performance while\n", + "\n", + "maintaining support for ACID transactions.\n", + "\n", + "**\u0007Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images\n", + "\n", + "and a growing variety of other data sources. This data is extremely powerful in helping to improve our\n", + "\n", + "understanding of consumer behavior and operations. The Lakehouse for Retail enables companies\n", + "\n", + "to take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse\n", + "\n", + "architecture.\n", + "\n", + "**\u0007Robust data management and governance** that companies need to protect sensitive data, but\n", + "\n", + "was lacking from earlier big data systems. The Lakehouse offers transactional integrity with ACID\n", + "\n", + "compliance, detailed data security, schema enforcement, time travel, data lineage and more. Moving\n", + "\n", + "to a modern data architecture does not require sacrificing enterprise maturity.\n", + "\n", + "**\u0007High-performance predictive analytics.** Machine learning models, such as demand forecasting\n", + "\n", + "or recommendation engines, can be run in hours without compromising accuracy. The Lakehouse\n", + "\n", + "can scale to support tens of millions of predictions in tight windows, unlocking critical and time-\n", + "\n", + "sensitive analytics such as allocating inventory, optimizing load tenders and logistics, calculating item\n", + "\n", + "availability and out-of-stocks, and delivering highly personalized predictions.\n", + "\n", + "**Value with Databricks**\n", + "\n", + "By using Databricks to build and support your lakehouse, you can empower your business with even more\n", + "\n", + "speed, agility and cost savings. The flexibility of the Databricks Lakehouse Platform means that you can\n", + "\n", + "start with the use case that will have the most impact on your business. As you implement the pattern, you\n", + "\n", + "will find that you’re able to tackle use cases quicker and more easily than before. To get you started, this\n", + "\n", + "guidebook contains the use cases we most commonly see across the Retail and Consumer Goods industry.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 3**\n", + "### Use Case:\n", + " Real-Time Supply\n", + " Chain Data\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "As companies see a surge in demand from e-commerce and delivery services, and seek increasing\n", + "\n", + "efficiencies with plant or distribution centers, real-time data is becoming a key part of the technical\n", + "\n", + "roadmap. 
Real-time supply chain data allows customers to deal with problems as they happen and before\n", + "\n", + "items are sent downstream or shipped to consumers, which is the first step in enabling a supply chain\n", + "\n", + "control tower.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Manufacturers Distributors Logistics Restaurants\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is\n", + "\n", + "happening and when a customer can act on it\n", + "\n", + "**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies\n", + "\n", + "have the added pressure to take immediate action on it\n", + "\n", + "**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with the Databricks Lakehouse**\n", + "\n", + "Databricks has enabled real-time streaming of supply chain data across a variety of customers for specific\n", + "\n", + "plant operations or as part of a supply chain control tower.\n", + "\n", + "**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000%\n", + "\n", + "improvement in speed to data, with greater reliability\n", + "\n", + "**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers reportSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
581545a4d760d03962437f89de737436**Challenges**\n", + "\n", + "**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is\n", + "\n", + "happening and when a customer can act on it\n", + "\n", + "**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies\n", + "\n", + "have the added pressure to take immediate action on it\n", + "\n", + "**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with the Databricks Lakehouse**\n", + "\n", + "Databricks has enabled real-time streaming of supply chain data across a variety of customers for specific\n", + "\n", + "plant operations or as part of a supply chain control tower.\n", + "\n", + "**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000%\n", + "\n", + "improvement in speed to data, with greater reliability\n", + "\n", + "**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers report\n", + "\n", + "that they were able to move from batch to real-time at neutral costs\n", + "\n", + "**�Simplified architecture and maintenance** — leveraging Delta for ingestion streamlines the pattern for\n", + "\n", + "real-time data ingestions. Customers frequently report that the amount of code required to support\n", + "\n", + "streaming ingestion is 50% less than previous solutions.\n", + "\n", + "**\u0007Immediate enablement of additional use cases** — customers can now prevent problems as they’re\n", + "\n", + "happening, predict and prevent issues, and even gain days on major changes such as production\n", + "\n", + "schedules between shifts\n", + "\n", + "**Solution overview**\n", + "\n", + "Databricks allows for both streaming and batch data sets to be ingested and made available to enable\n", + "\n", + "real-time supply chain use cases. Delta Lake simplifies the change data capture process while providing\n", + "\n", + "ACID transactions and scalable metadata handling, and unifying streaming and batch data processing. And\n", + "\n", + "Delta Lake supports versioning and enables rollbacks, full historical audit trails, and reproducible machine\n", + "\n", + "learning experiments.\n", + "\n", + "**Typical use case data sources include:**\n", + "\n", + "Supply planning, procurement, manufacturing execution, warehousing, order fulfillment, shop floor/historian\n", + "\n", + "data, IoT sensor, transportation management\n", + "\n", + "\n", + "-----\n", + "\n", + "**CASE STUDY**\n", + "\n", + "With Databricks, Gousto was able to implement real-time visibility in their supply chain. Gousto moved from\n", + "\n", + "daily batch updates to near real-time streaming data, utilizing Auto Loader and Delta Lake. The platform\n", + "\n", + "provided by Databricks has allowed Gousto to respond to increased demand during the coronavirus\n", + "\n", + "outbreak by providing real-time insight into performance on the factory picking lines.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "As a young e-commerce company, ButcherBox needed to act nimbly to make the most of the data from its\n", + "\n", + "hundreds of thousands of subscribers. 
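The versioning, rollback and audit-trail capabilities mentioned above can be exercised with a few Delta commands; the table name and version numbers below are placeholders.

```python
# Delta versioning sketch -- table name and version numbers are placeholders.
# Every write creates a new table version, giving a full audit trail.
spark.sql("DESCRIBE HISTORY demo.silver.shipments").show(truncate=False)

# Time travel: query the table exactly as it looked at an earlier version,
# e.g., to reproduce a report or an ML training set.
as_of_v12 = spark.sql("SELECT * FROM demo.silver.shipments VERSION AS OF 12")

# Roll back to a known-good version after a bad load.
spark.sql("RESTORE TABLE demo.silver.shipments TO VERSION AS OF 12")
```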
With Databricks Lakehouse, the company could pull 18 billion rows of\n", + "\n", + "data in under three minutes.\n", + "\n", + "Now, ButcherBox has a near real-time understanding of its customers, and can also act proactively to\n", + "\n", + "address any logistical and delivery issues.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 4**\n", + "### Use Case: Truck Monitoring\n", + "\n", + "\n", + "With many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n", + "\n", + "of trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n", + "\n", + "manner. Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n", + "\n", + "delays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n", + "\n", + "\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issuesSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
7fd334d58220f80f674987019149dba4address any logistical and delivery issues.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 4**\n", + "### Use Case: Truck Monitoring\n", + "\n", + "\n", + "With many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n", + "\n", + "of trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n", + "\n", + "manner. Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n", + "\n", + "delays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n", + "\n", + "\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issues\n", + "\n", + "\u0007Not having effective automation and AI increases the risk of human error, which can result in vehicular\n", + "\n", + "accidents or shipment delays\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with the Databricks Lakehouse**\n", + "\n", + "Databricks empowers companies to get real-time insights into their fleet performance, from manufacturing\n", + "\n", + "to delivery.\n", + "\n", + "**Near real-time insights** — the greater speed to data means a quicker response to issues and the\n", + "\n", + "ability to monitor driver safety more immediately\n", + "\n", + "**Ability to scale** — although consumer demands are constantly evolving, Databricks can handle fleet\n", + "\n", + "expansion without sacrificing data quality and speed\n", + "\n", + "**Optimizing with AI/ML** — implementing AI and ML models can lead to more effective route monitoring,\n", + "\n", + "proactive maintenance and reduced risk of accidents\n", + "\n", + "**Solution overview**\n", + "\n", + "Databricks enables better truck monitoring, quickly ingesting data on everything from vehicle manufacturing\n", + "\n", + "to route optimization. This results in a more complete and real-time view of a company’s fleet, and these\n", + "\n", + "analytics provide companies with the tools they need to scale and improve their operations.\n", + "\n", + "**Typical use case data sources include:**\n", + "\n", + "Supply planning, transportation management, manufacturing, predictive maintenance\n", + "\n", + "**CASE STUDY**\n", + "\n", + "With 94% of vehicular accidents attributed to human error, Embark used the Databricks Lakehouse Platform\n", + "\n", + "to unlock thousands of hours of recorded data from its trucks and then collaboratively analyze that data\n", + "\n", + "via dashboards. 
This has resulted in more efficient ML model training as Embark speeds toward fully autonomous trucks.

HOW TO GET STARTED

Contact your Databricks account team to have them perform a free proof-of-concept with your real-time data.

-----

**CHAPTER 5**
### Use Case: Inventory Allocation

**Overview**

Replenishment planning is the process of determining what needs to go where. It is used by replenishment planners, distributors and consumer goods companies performing vendor-managed replenishment (VMR) or vendor-managed inventory (VMI) to make daily decisions on which product needs to be sent to which store and on what day.

Replenishment is challenging for companies because it deals with rapidly changing data and the need to make complex decisions on that data in narrow service windows. Retailers need to stream in real-time sales data to signal how much of a product has been sold. Inaccurate sales data leads to an insufficient number of products being sent to stores. This results in lost sales and low customer satisfaction.

Inventory allocation is a process that might be performed multiple times a day during peak seasons, or daily during slower seasons. Companies need the ability to scale to perform tens of millions of predictions multiple times a day — on demand and dynamically — during peak season without paying a premium for this capability throughout the year.

**R E L E V A N T F O R**

Retail   Consumer Goods   Distributors   Logistics   Restaurants

-----

**Challenges**

- Customers must complete tens of millions of inventory allocation predictions within tight time windows. This information is used to determine which products get put on trucks and go to specific stores.

- Traditional inventory allocation rules cause trade-offs in accuracy in order to calculate all possibilities in the service windows

- Legacy tools have rudimentary capabilities and have limited ability to consider flavors, sizes and other attributes that may be more or less popular by store

**Value with Databricks**

Customers are able to complete inventory allocation models within SLAs with no trade-off for accuracy.

- **Speed** — on average, customers moving to Databricks for demand forecasting report a double-digit improvement in forecast accuracy

- **Ability to scale** and perform fine-grained (day, store, item) level allocations

- **Provide more robust allocations** by incorporating causal factors that may increase demand, or include information on flavors or apparel sizes for specific stores

**Solution overview**

The objective of inventory allocation is to quickly determine when to distribute items and where — from warehouses and distribution centers to stores. Inventory allocation begins by looking at the consumption rate of products, the available inventory and the shipping schedules, and then using this information to create an optimized manifest of what items should be carried on which trucks, at what point, and at what time. This becomes the plan for route accounting systems that arrange deliveries.

Inventory allocation also deals with trade-offs related to scarcity of items.
If an item has not been available\n", + "\n", + "in a store for a long time, that store may receive heightened priority for the item in the allocation.\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "\n", + "**Typical use case data sources include:** point of sale, digital sales, replenishment data, modeled safety\n", + "\n", + "stock, promotions data, weather\n", + "\n", + "**View our webinar covering demand forecasting with Starbucks and then read our blog about**\n", + "\n", + "**demand forecasting.**\n", + "\n", + "**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n", + "\n", + "Our most popular notebook at Databricks. This blog walks you through the business and technical\n", + "\n", + "challenges of performing demand forecasting and explains how we approached solving it.\n", + "\n", + "**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n", + "\n", + "Video and Q&A from our webinar with Starbucks.\n", + "\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "**CASE STUDY**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
32a24850e5f738d2b28c2a2e26336594**demand forecasting.**\n", + "\n", + "**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n", + "\n", + "Our most popular notebook at Databricks. This blog walks you through the business and technical\n", + "\n", + "challenges of performing demand forecasting and explains how we approached solving it.\n", + "\n", + "**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n", + "\n", + "Video and Q&A from our webinar with Starbucks.\n", + "\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "H&M turned to the Databricks Lakehouse Platform to simplify its infrastructure management, enable\n", + "\n", + "performant data pipelines at scale, and simplify the machine learning lifecycle. The result was a more data-\n", + "\n", + "driven organization that could better forecast operations to streamline costs and boost revenue.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Edmunds is on a mission to make car shopping an easy experience for all. With the Databricks Lakehouse\n", + "\n", + "Platform, they are able to simplify access to their disparate data sources and build ML models that make\n", + "\n", + "predictions off data streams. With real-time insights, they can ensure that the inventory of vehicle listings\n", + "\n", + "on their website is accurate and up to date, improving overall customer satisfaction.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 6**\n", + "### Use Case: Point of Sale\n", + " and Clickstream\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Disruptions in the supply chain — from reduced product supply and diminished warehouse capacity —\n", + "\n", + "coupled with rapidly shifting consumer expectations for seamless omnichannel experiences are driving\n", + "\n", + "retailers to rethink how they use data to manage their operations. Historically, point-of-sale (POS) systems\n", + "\n", + "recorded all in-store transactions, but were traditionally kept in a system that was physically in the store.\n", + "\n", + "This would result in a delay in actionable insights. And now with consumers increasingly shopping online, it’s\n", + "\n", + "crucial to not only collect and analyze that clickstream data quickly, but also unify it with POS data to get a\n", + "\n", + "complete and real-time snapshot of each customer’s shopping behavior.\n", + "\n", + "Near real-time availability of information means that retailers can continuously update their estimates of\n", + "\n", + "item availability. 
No longer is the business managing operations based on their knowledge of inventory\n", + "\n", + "states as they were a day prior, but instead is taking actions based on their knowledge of inventory states as\n", + "\n", + "they are now.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "Retail E-commerce\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Retailers with legacy POS systems in their brick-and-mortar stores are working with siloed and\n", + "\n", + "incomplete sales data\n", + "\n", + "\u0007Both POS and clickstream data need to be unified and ingested in real time\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks brings POS and clickstream data together for a unified data source that leads to real-time\n", + "\n", + "insights and a clearer understanding of customer behavior.\n", + "\n", + "\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n", + "\n", + "clickstream data\n", + "\n", + "\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n", + "\n", + "customer purchasing behaviors and trends\n", + "\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 7**\n", + "### Use Case: On-Shelf Availability\n", + "\n", + "\n", + "**Overview**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
a9bb4ef49c9cb97f5bc20541b9536596-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks brings POS and clickstream data together for a unified data source that leads to real-time\n", + "\n", + "insights and a clearer understanding of customer behavior.\n", + "\n", + "\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n", + "\n", + "clickstream data\n", + "\n", + "\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n", + "\n", + "customer purchasing behaviors and trends\n", + "\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 7**\n", + "### Use Case: On-Shelf Availability\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Ensuring the availability of a product on shelf is the single largest problem in retail. Retailers globally are\n", + "\n", + "missing out on nearly $1 trillion in sales because they don’t have on hand what customers want to buy in\n", + "\n", + "their stores. Shoppers encounter out-of-stock scenarios as often as one in three shopping trips. All told,\n", + "\n", + "worldwide, shoppers experience $984 billion worth of out-of-stocks, $144.9 billion in North America alone,\n", + "\n", + "according to industry research firm IHL.\n", + "\n", + "In the past, if a customer faced an out-of-stock, they would most likely select a substitute item. The cost\n", + "\n", + "of going to another store prevented switching. Today, e-commerce loyalty members, such as those who\n", + "\n", + "belong to Walmart+ and Amazon Prime, are 52% more likely than other consumers to purchase out-of-stock\n", + "\n", + "items online. It is believed that a quarter of Amazon’s retail revenue comes from customers who first tried to\n", + "\n", + "buy a product in-store. In all, an estimated $36 billion is lost to brick-and-mortar competition, and another\n", + "\n", + "$34.8 billion is lost to Amazon or another e-retailer, according to IHL.\n", + "\n", + "On-shelf availability takes on a different meaning in pure e-commerce applications. An item can be\n", + "\n", + "considered in stock when it is actually in a current customer’s basket. If another customer places the same\n", + "\n", + "item in their basket, there is the possibility that the first customer will purchase the last available item\n", + "\n", + "before the second customer. This problem is exacerbated by retailers who use stores to keep inventory. In\n", + "\n", + "these situations, customers may order an item that is picked for delivery at a much later time. The window\n", + "\n", + "between ordering and picking creates the probability of out-of-stocks.\n", + "\n", + "On-shelf availability predicts the depletion of inventory by item, factors in safety stock levels and\n", + "\n", + "replenishment points, and generates a signal that suggests an item may be out of stock. This information is\n", + "\n", + "used to generate alerts to retail staff, distributors, brokers and consumer goods companies. 
Every day, tens\n", + "\n", + "of thousands of people around the world do work that is generated by these algorithms.\n", + "\n", + "The sheer volume of data used to calculate on-shelf availability prevents most companies from analyzing\n", + "\n", + "all of their products. Companies have between midnight and 4 AM to collect all of the needed information\n", + "\n", + "and run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n", + "\n", + "the priority categories or products to analyze, which means a significant percentage of their unavailable\n", + "\n", + "products will not be proactively addressed.\n", + "\n", + "\n", + "-----\n", + "\n", + "One of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n", + "\n", + "While some retailers are investing in computer vision and robots, and others employ the use of people to\n", + "\n", + "manually survey item availability, most retailers default to a signal of determining when an item has not been\n", + "\n", + "scanned in an acceptable time.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "The biggest challenge to generating on-shelf availability alerts is time. Companies may receive their final salesSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
891fec08af2e7fc29891e64459f4c5f9all of their products. Companies have between midnight and 4 AM to collect all of the needed information\n", + "\n", + "and run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n", + "\n", + "the priority categories or products to analyze, which means a significant percentage of their unavailable\n", + "\n", + "products will not be proactively addressed.\n", + "\n", + "\n", + "-----\n", + "\n", + "One of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n", + "\n", + "While some retailers are investing in computer vision and robots, and others employ the use of people to\n", + "\n", + "manually survey item availability, most retailers default to a signal of determining when an item has not been\n", + "\n", + "scanned in an acceptable time.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "The biggest challenge to generating on-shelf availability alerts is time. Companies may receive their final sales\n", + "\n", + "data from the preceding day shortly after midnight. They have less than 4 hours from that point to ingest large\n", + "\n", + "volumes of t-log data and calculate probabilities of item availability. Most firms are encumbered by a data\n", + "\n", + "warehouse process that only releases data after it has been ingested and aggregates have been calculated, a\n", + "\n", + "process that can require multiple hours per night.\n", + "\n", + "For this reason, most firms make sacrifices in their analysis. They may alternate categories they analyze by\n", + "\n", + "different days, prioritize only high-impact SKUs, or run analysis at higher-level and less-accurate aggregate\n", + "\n", + "levels. Among the challenges:\n", + "\n", + "\u0007Processing large volumes of highly detailed data and running millions of models in a narrow time window\n", + "\n", + "\u0007Companies are spending hundreds of thousands of dollars annually to generate these daily alerts for a\n", + "\n", + "few categories\n", + "\n", + "\u0007Dealing with false positives and negatives in predictions\n", + "\n", + "Distributing information quickly and efficiently to internal systems and external partners\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables customers to generate on-shelf availability (OSA) predictions at scale with no\n", + "\n", + "compromises.\n", + "\n", + "**\u0007** Delta removes the data processing bottleneck. 
Delta enables retailers to stream in real time or to batch process large volumes of highly detailed and frequently changing point-of-sale transaction data.

- Easily scale to process all OSA predictions within tight service windows using Apache Spark™

- Manage features and localize models with additional causal data to improve accuracy with MLflow

- Easily deploy information via streams, through APIs for mobile applications or partners, or to Delta for reporting

- Enable retailers to monetize their data by directly licensing OSA alerts

**Solution overview**

Databricks enables companies to perform on-shelf availability analysis without making compromises to the breadth or quality of predictions.

It begins with Delta Lake — a nearly perfect platform for ingesting and managing t-log data. One of the biggest challenges with t-log data is the frequent changes to a transaction that can occur within a day. Delta Lake simplifies this with transaction awareness using a transaction log, and creates additional metadata for easier retrieval. Data is made available in a fraction of the time needed in data warehouse-based systems. This is why the largest retailers in the world are using Delta Lake for processing t-log data.

Once data is available, users need to generate predictions about item availability on the shelf. With its extremely performant engine and the ability to distribute computation across countless nodes, Spark provides the perfect platform for calculating out-of-stocks. Customers no longer need to run in aggregate or against a subset of data.

-----

**HOW TO GET STARTED**

[Solution Accelerator: On-Shelf Availability](https://www.databricks.com/solutions/accelerators/on-shelf-availability)

In this solution, we show how the Databricks Lakehouse Platform enables real-time insights to rapidly respond to demand, drive more sales by ensuring stock is available on shelf, and scale out your forecasting models to accommodate any size operation.
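As a rough illustration of the scale-out scoring step described above, the sketch below spreads a simple out-of-stock scoring rule across store/item groups with Spark. It is a minimal sketch, not the Solution Accelerator itself; the table name, column names and the threshold logic are hypothetical placeholders.

```python
# Minimal sketch: flag store/item combinations whose time since the last scan is
# unusually long relative to their own sales cadence. The table, column names and
# the 3x cutoff are hypothetical placeholders, not the Solution Accelerator.
import pandas as pd
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

tlog = spark.table("pos.tlog_clean")  # hypothetical cleansed t-log Delta table

def score_group(pdf: pd.DataFrame) -> pd.DataFrame:
    # Compare the typical gap between sales for this store/item with the time since the last sale.
    pdf = pdf.sort_values("sale_ts")
    gaps = pdf["sale_ts"].diff().dt.total_seconds().dropna()
    typical_gap = float(gaps.median()) if len(gaps) else 0.0
    since_last = (pd.Timestamp.now() - pdf["sale_ts"].max()).total_seconds()
    risk = since_last / typical_gap if typical_gap > 0 else None
    return pd.DataFrame(
        {
            "store_id": [pdf["store_id"].iloc[0]],
            "item_id": [pdf["item_id"].iloc[0]],
            "oos_risk": [risk],  # values well above 1.0 suggest the item may be off the shelf
        }
    )

schema = "store_id string, item_id string, oos_risk double"

alerts = (
    tlog.groupBy("store_id", "item_id")
    .applyInPandas(score_group, schema=schema)
    .where(F.col("oos_risk") > 3.0)  # arbitrary illustrative cutoff
)

alerts.write.mode("overwrite").saveAsTable("osa.alerts")  # hypothetical output table
```

Because the scoring function runs once per store/item group, adding nodes to the cluster increases how many groups can be evaluated in parallel inside the midnight-to-4 AM window the chapter describes.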
And lastly, data is only useful if it drives better outcomes. Databricks can write the resulting data into Delta Lake for further reporting, or to any downstream application via APIs, feeds or other integrations. Users can feed their predictive alerts to downstream retail operations systems or even to external partners within the tightest service windows, and in enough time to drive actions on that day.

**Typical use case data sources include:** point-of-sale data, replenishment data, safety stock calculations, manual inventory data (optional), robotic or computer vision inventory data (optional)

**CASE STUDY**

Reckitt distributes its products to millions of consumers in over 60 countries, which caused the organization to struggle with the complexity of forecasting demand, especially with large volumes of different types of data across many disjointed pipelines. Thanks to the Databricks Lakehouse Platform, Reckitt now uses predictive analytics, product placement and business forecasting to better support neighborhood grocery stores.

-----

**CHAPTER 8**
### Use Case: Customer and Vehicle Identification

**Overview**

COVID-19 led to increased consumer demand for curbside pickup, drive-through and touchless payment options. Retailers that were able to implement these new services have been able to differentiate overall customer experiences and mitigate catastrophic hits on revenue levels.

For retailers to create a seamless contactless experience for customers, they need real-time data to know when a customer has arrived and where they’re located, as well as provide updates throughout the pickup journey.
And through the use of computer vision, they can capture that data by employing optical\n", + "\n", + "recognition on images to read vehicle license plates.\n", + "\n", + "Retailers can also use information captured from license plates to make recommendations on buying\n", + "\n", + "patterns. Looking ahead, facial recognition also has the potential to provide retailers with valuable\n", + "\n", + "information to better serve their customers in real time.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Drive-Through\n", + "Food Retailers\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Ineffective data processing can lead to suboptimal order preparation timing\n", + "\n", + "\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks makes it possible to not only identify customers and vehicles in real time but also provide real-\n", + "\n", + "time communications throughout the entire shopping and curbside or drive-through experience.\n", + "\n", + "\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order\n", + "\n", + "preparation timing\n", + "\n", + "\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensureSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
23407d8db01212b3e918b64e6fe28d48patterns. Looking ahead, facial recognition also has the potential to provide retailers with valuable\n", + "\n", + "information to better serve their customers in real time.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Drive-Through\n", + "Food Retailers\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Ineffective data processing can lead to suboptimal order preparation timing\n", + "\n", + "\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks makes it possible to not only identify customers and vehicles in real time but also provide real-\n", + "\n", + "time communications throughout the entire shopping and curbside or drive-through experience.\n", + "\n", + "\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order\n", + "\n", + "preparation timing\n", + "\n", + "\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensure\n", + "\n", + "each subsequent visit is equally as or more seamless than the last\n", + "\n", + "\u0007 **Optimizing with AI/ML** — implementing AI and ML models can lead to more effective geofencing,\n", + "\n", + "vehicle identification and order prediction\n", + "\n", + "**CASE STUDY**\n", + "\n", + "**CASE STUDY**\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 9**\n", + "### Use Case: Recommendation Engines\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Customers that feel understood by a retailer are more likely to spend more per purchase, purchase more\n", + "\n", + "frequently with that retailer, and deliver higher profitability per customer. The way that retailers achieve this\n", + "\n", + "is by recommending products and services that align with customer needs.\n", + "\n", + "Providing an experience that makes customers feel understood helps retailers stand out from the crowd\n", + "\n", + "of mass merchants and build loyalty. This was true before COVID, but shifting consumer preferences make\n", + "\n", + "this more critical than ever for retail organizations. With research showing the cost of customer acquisition\n", + "\n", + "is as much as five times as retaining existing ones, organizations looking to succeed in the new normal must\n", + "\n", + "continue to build deeper connections with existing customers in order to retain a solid consumer base.\n", + "\n", + "There is no shortage of options and incentives for today’s consumers to rethink long-established patterns\n", + "\n", + "of spending.\n", + "\n", + "Recommendation engines are used to create personalized experiences for users across retail channels.\n", + "\n", + "These recommendations are generated based on the data collected from purchases, items interacted\n", + "\n", + "with, users’ behavior across physical and digital channels, and other data such as from customer service\n", + "\n", + "interactions and reviews. Leveraging a Customer 360 architecture that collects all user clickstream and\n", + "\n", + "behavioral data, marketers are able to create recommendations that are integrated with other business\n", + "\n", + "objectives such as highlighting items that are on promotion or product availability.\n", + "\n", + "Creating recommendations is not a monolithic activity. 
Recommendation engines are used to personalize\n", + "\n", + "the customer experience in every possible area of consumer engagement, from proactive notifications and\n", + "\n", + "offers, to landing page optimization, suggested products, automated shipment recommendations, cross-sell\n", + "\n", + "and upsell, and even suggestions for complementary items after the purchase.\n", + "\n", + "\n", + "-----\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "Media Telecom Financial Services\n", + "(any B2B or B2C\n", + "company)\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Recommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n", + "\n", + "but traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n", + "\n", + "recommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n", + "\n", + "irrelevant.\n", + "\n", + "**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n", + "\n", + "is improved by having recent data, and yet most systems struggle to handle the large volumes of\n", + "\n", + "information involved.\n", + "\n", + "**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n", + "\n", + "touchpoints in one place are critical to enabling this use case. More data, including transaction and\n", + "\n", + "clickstream data, is critical for driving accuracy and precision in messaging.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
5010961aaebba0bff491572fc7f9703c**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "Media Telecom Financial Services\n", + "(any B2B or B2C\n", + "company)\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Recommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n", + "\n", + "but traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n", + "\n", + "recommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n", + "\n", + "irrelevant.\n", + "\n", + "**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n", + "\n", + "is improved by having recent data, and yet most systems struggle to handle the large volumes of\n", + "\n", + "information involved.\n", + "\n", + "**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n", + "\n", + "touchpoints in one place are critical to enabling this use case. More data, including transaction and\n", + "\n", + "clickstream data, is critical for driving accuracy and precision in messaging.\n", + "\n", + "**Processing speed.** Retailers need to be able to frequently refresh models based on constantly\n", + "\n", + "changing dynamics, and deliver real-time recommendations via APIs.\n", + "\n", + "**Automation.** This is an “always-on” use case where automation is essential for scalability and\n", + "\n", + "responsiveness based on frequent model updates.\n", + "\n", + "\n", + "-----\n", + "\n", + "Many firms choose to use recommender systems from Amazon or Google. Using these systems trains\n", + "\n", + "the general recommendation engine in a way that helps competitors improve the accuracy of their own\n", + "\n", + "recommendations.\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Recommendations are one of the most critical capabilities that a retailer maintains. This is a capability that\n", + "\n", + "retailers must own, and Databricks provides a solid platform for enabling this.\n", + "\n", + "Using Databricks as the foundation for their Customer 360 architecture to deliver omnichannel\n", + "\n", + "personalization, sample value metrics from a media agency include:\n", + "\n", + "**200% ROI for 70% of retailers** engaging in advanced personalization\n", + "\n", + "**10% improvement** in conversions\n", + "\n", + "**35% improvement** in purchase frequency\n", + "\n", + "**37% improvement** in customer lifetime value\n", + "\n", + "**Solution overview**\n", + "\n", + "Recommendations are only as good as the data that powers them. Delta Lake provides the best platform for\n", + "\n", + "capturing and managing huge volumes of highly atomic and frequently changing data. 
It allows organizations\n", + "\n", + "to combine various sources of data in a timely and efficient manner, from transactions, demographics and\n", + "\n", + "preference information across products, to clickstream, digital journey and marketing analytics data to bring\n", + "\n", + "a 360 view of customer interactions to enable omnichannel personalization.\n", + "\n", + "By identifying changes in user behavior or engagement, retailers are able to detect early signals that\n", + "\n", + "indicate a propensity to buy or a change in preferences, and recommend products and services that will\n", + "\n", + "keep consumers engaged.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Typical use case data sources include:** Customer 360 data, CRM, loyalty data, transaction data,\n", + "\n", + "clickstream data, mobile data:\n", + "\n", + "**Engagement data** — transaction log data, clickstream data, promotion interaction\n", + "\n", + "**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n", + "\n", + "children, location\n", + "\n", + "**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n", + "\n", + "to churn\n", + "\n", + "**CASE STUDY**\n", + "\n", + "For Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n", + "\n", + "for help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n", + "\n", + "personalized to each of their customers.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Columbia’s legacy ETL was unable to support batch and real-time use cases at scale. After migrating to\n", + "\n", + "Databricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n", + "\n", + "business decisions.\n", + "\n", + "**CASE STUDY**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
222a81bc42783b79a909cdc1fc87a65bclickstream data, mobile data:\n", + "\n", + "**Engagement data** — transaction log data, clickstream data, promotion interaction\n", + "\n", + "**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n", + "\n", + "children, location\n", + "\n", + "**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n", + "\n", + "to churn\n", + "\n", + "**CASE STUDY**\n", + "\n", + "For Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n", + "\n", + "for help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n", + "\n", + "personalized to each of their customers.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Columbia’s legacy ETL was unable to support batch and real-time use cases at scale. After migrating to\n", + "\n", + "Databricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n", + "\n", + "business decisions.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Pandora wanted to drive stronger online engagement with their customers, so they used the Databricks\n", + "\n", + "Lakehouse Platform to create more personalized experiences and boost both click-to-open rates and\n", + "\n", + "quarterly revenue.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Databricks has created [four](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n", + "\n", + "[Recommendation Engine accelerators,](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n", + "\n", + "with content-based and collaborative\n", + "\n", + "filter methods, and both item-\n", + "\n", + "and user-based analysis. These\n", + "\n", + "accelerators have been further refined\n", + "\n", + "to be highly performant to enable\n", + "\n", + "frequent retraining of models.\n", + "\n", + "To begin working on recommendation\n", + "\n", + "engines, contact your Databricks\n", + "\n", + "account team.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 10**\n", + "### Use Case: Perpetual Inventory\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "With the rapid adoption of digital channels for retail, staying on top of your inventory is crucial to meeting\n", + "\n", + "customer demand. As a result, the periodic inventory system is now outdated — instead, using a perpetual\n", + "\n", + "inventory model allows businesses to perform immediate and real-time tracking of sales and inventory\n", + "\n", + "levels. This has the added benefit of reducing labor costs and human error, ensuring that you always have an\n", + "\n", + "accurate overview of your inventory and can better forecast demand to avoid costly stockouts.\n", + "\n", + "The key to building a perpetual inventory system is real-time data. 
By capturing real-time transaction\n", + "\n", + "records related to sold inventory, retailers can make smarter inventory decisions that streamline operations\n", + "\n", + "and lower overall costs.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Supply Chain\n", + "\n", + "\n", + "Inventory\n", + "Management\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Companies need to scale to handle ever-increasing inventory and the data associated with the products\n", + "\n", + "**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n", + "\n", + "view of inventory\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables real-time inventory updates, giving businesses the insights they need to properly\n", + "\n", + "manage inventory and to forecast more accurately.\n", + "\n", + "**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n", + "\n", + "the latest sales data\n", + "\n", + "**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n", + "\n", + "companies know they’re getting the most accurate information at any point\n", + "\n", + "**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n", + "\n", + "management costs\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
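A minimal sketch of the kind of streaming ingestion that keeps a perpetual inventory view current is shown below. The landing path, checkpoint location and table names are hypothetical, and Auto Loader is only one of several possible sources; the point is simply that Structured Streaming plus Delta yields an always-up-to-date table rather than a periodic batch snapshot.

```python
# Minimal sketch: continuously land point-of-sale inventory movements in a Delta table.
# The landing path, checkpoint location and table names are hypothetical placeholders.
import pyspark.sql.functions as F
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

pos_events = (
    spark.readStream.format("cloudFiles")          # Databricks Auto Loader; any streaming source works
    .option("cloudFiles.format", "json")
    .load("/Volumes/retail/landing/pos_events")    # hypothetical landing location
)

movements = pos_events.select(
    "store_id",
    "item_id",
    F.col("quantity").cast("int").alias("quantity_sold"),
    F.col("event_ts").cast("timestamp").alias("event_ts"),
)

(
    movements.writeStream
    .option("checkpointLocation", "/Volumes/retail/checkpoints/pos_events")
    .trigger(availableNow=True)                    # or a processingTime trigger for continuous runs
    .toTable("retail.inventory_movements")         # Delta table that downstream views aggregate
)
```

An aggregate view over the resulting table then reflects on-hand quantities as sales happen, rather than as of the last batch load.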
3ac87782fd6d7538352816e9c421c808**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n", + "\n", + "view of inventory\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables real-time inventory updates, giving businesses the insights they need to properly\n", + "\n", + "manage inventory and to forecast more accurately.\n", + "\n", + "**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n", + "\n", + "the latest sales data\n", + "\n", + "**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n", + "\n", + "companies know they’re getting the most accurate information at any point\n", + "\n", + "**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n", + "\n", + "management costs\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 11**\n", + "### Use Case: Automated\n", + " Replenishments\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Customers favor convenience more than ever when it comes to their goods, and automated replenishments\n", + "\n", + "help meet that need. Whether it’s through a connected device or smartphone app, real-time data plays a\n", + "\n", + "key role in ensuring consumers get a refill automatically delivered at the right time.\n", + "\n", + "On the manufacturing side, this real-time data can also help with vendor-managed replenishment (VMR),\n", + "\n", + "reducing the time needed to forecast, order and receive thousands of items.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Direct to\n", + "Customer\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Being able to ingest large amounts of data quickly is crucial to actually fulfilling the\n", + "\n", + "replenishment orders\n", + "\n", + "With VMR, there may be a disconnect between the vendor and customer, resulting in a forecast\n", + "\n", + "for replenishment even when the customer can’t fulfill that order\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables real-time inventory updates, giving businesses the insights they need to properly\n", + "\n", + "manage inventory and to forecast more accurately.\n", + "\n", + "**\u0007Near real-time insights** — the greater speed to data means businesses can stay on top of\n", + "\n", + "customer needs\n", + "\n", + "**\u0007Scalability** — companies can scale with Databricks to handle thousands of SKUs, each with its own\n", + "\n", + "unique properties and expiry dates\n", + "\n", + "**\u0007Optimizing with AI/ML** — using AI and ML can lead to better forecasting and predictions\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 12**\n", + "### Use Case: Fresh Food Forecasting\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Fresh food typically accounts for up to 40% of revenue for grocers, and plays an important role in driving\n", + "\n", + "store traffic. 
But fresh food is also incredibly complex to manage — prices can be volatile, there is a wide\n", + "\n", + "range of suppliers to work with and the products expire, which creates significant amounts of waste.\n", + "\n", + "In order to avoid losing significant revenue, businesses need to properly forecast when food is nearing its\n", + "\n", + "sell-by date, the current levels of customer demand (also taking into account seasonality), and the proper\n", + "\n", + "timing for replenishing food stock. Being able to tap into real-time data is key to staying on top of the ever-\n", + "\n", + "changing needs around fresh food.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "Retail E-commerce Distributors Logistics Restaurants\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Because of the perishable nature of fresh food, customers need to be able to ingest data quickly\n", + "\n", + "enough to conduct daily forecasting and daily replenishmentSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
- Customers are running aggregate-level forecasts, which are less accurate than fine-grained forecasting

- Customers are forced to compromise on what they can analyze

-----

HOW TO GET STARTED

Contact your Databricks account team to get started with inventory allocation. Databricks does not have a Solution Accelerator.

View our webinar covering demand forecasting with Starbucks and then read our blog about demand forecasting.

[Fine-grained time series forecasting at scale.](https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html)

This blog details the importance of time series forecasting, walks through building a simple model to show the use of Facebook Prophet, and then shows off the combination of Facebook Prophet and Apache Spark to scale to hundreds of models.

[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)

Video and Q&A from our webinar with Starbucks on demand forecasting.

**Value with Databricks**

Customers average a double-digit improvement in forecast accuracy, leading to a reduction in lost sales and in spoiled products, as well as lower inventory and handling costs.

- **Improved accuracy** — on average, customers moving to Databricks for demand forecasting report a double-digit improvement in forecast accuracy

- **Ability to scale and perform fine-grained (day, store, item) level forecasts** — rapidly scale to tens of millions of model iterations in narrow service windows. Companies need accurate demand forecasts in a few hours.

- **Eliminate compromises on what to analyze** — customers do not need to select winners or losers among the products they forecast. They can predict demand for all products as frequently as required.
**Solution overview:**

Databricks is well suited to handling forecasting for fresh food at scale. Forecasting begins with the Databricks Solution Accelerator. It enables companies to rapidly build fine-grained forecasting of items — forecasting that can be efficiently scaled to tens of millions of predictions in tight service windows.

**Typical use case data sources include:** historic point-of-sale data, shipment data, promotions, pricing, expiration dates and weather.

**CASE STUDY**

ButcherBox faced the complex challenges of securing inventory with enough lead time, meeting highly variable customer order preferences and unpredictable customer sign-ups, and managing delivery logistics. With Databricks, the company was able to create a predictive solution to adapt quickly and integrate tightly with the rest of its data estate.
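The fine-grained forecasting pattern referenced in the sidebar above (one small model per store and item, trained in parallel, as in the linked blog on Facebook Prophet and Apache Spark) can be sketched roughly as follows. This is a minimal illustration rather than the Solution Accelerator; the table name, column names and 28-day horizon are hypothetical.

```python
# Minimal sketch: fit one Prophet model per store/item and forecast 28 days ahead.
# The table, column names and horizon are hypothetical placeholders.
import pandas as pd
from prophet import Prophet
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

history = (
    spark.table("retail.daily_sales")              # hypothetical Delta table of daily sales
    .selectExpr("store_id", "item_id", "sale_date as ds", "units_sold as y")
)

def forecast_group(pdf: pd.DataFrame) -> pd.DataFrame:
    # One independent model per store/item group.
    model = Prophet(weekly_seasonality=True, yearly_seasonality=True)
    model.fit(pdf[["ds", "y"]])
    future = model.make_future_dataframe(periods=28)
    fcst = model.predict(future)[["ds", "yhat"]].tail(28)
    fcst["store_id"] = pdf["store_id"].iloc[0]
    fcst["item_id"] = pdf["item_id"].iloc[0]
    return fcst

schema = "ds timestamp, yhat double, store_id string, item_id string"

forecasts = history.groupBy("store_id", "item_id").applyInPandas(forecast_group, schema=schema)
forecasts.write.mode("overwrite").saveAsTable("retail.demand_forecasts")  # hypothetical output
```

Each group is fitted independently, so the same code scales from hundreds to millions of store/item combinations by adding cluster capacity.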
**CASE STUDY**

Sam’s Club needed to build out an enterprise-scale data platform to handle the billions of transactions and trillions of events going through the company. Find out how Databricks became a key component in the shift from on-premises Hadoop clusters to a cloud-based platform.

-----

**CHAPTER 13**
### Use Case: Propensity-to-Buy

**Overview**

Customers often have repeatable purchase patterns that may not be noticed upon initial observation. While we know that commuting office workers are likely to purchase coffee at a coffee shop on weekday mornings, do we understand why they visit on Thursday afternoons? And more importantly, how do we predict these buying moments when customers are not in our stores?

The purpose of a propensity-to-buy model is to predict when a customer is predisposed to make a purchase and subsequently act on that information by engaging customers. Traditional propensity-to-buy models leveraged internal sales and loyalty data to identify patterns of consumption. These models are useful, but are limited in understanding the full behavior of customers. More advanced propensity-to-buy models are now incorporating alternative data sets to identify trips to competing retailers, competitive scan data from receipts, and causal data that helps to explain when and why customers make purchases.

Propensity-to-buy models create a signal that is sent to downstream systems such as those for promotion management, email and mobile alerts, recommendations and others.

**R E L E V A N T F O R**

Retail   E-commerce   Direct to Consumer

-----

**Challenges**

- Customers do not want to be inundated with messages from retailers. Companies need to limit their outreach to customers to avoid angering them.
- Companies need to traverse and process vast sums of customer data and generate probabilities of purchase frequently

- Companies need to look at external data that helps build a propensity-to-buy model that captures the full share of the customer wallet. They need to quickly test and incorporate additional data that improves the accuracy of their models.

**Value with Databricks**

- Databricks allows companies to efficiently traverse huge volumes of customer data over time, and efficiently synthesize this into data for analysis

- Databricks lets companies process vast sums of customer data and refresh purchase probabilities as frequently as needed

- Databricks makes it straightforward to test and incorporate external data, such as competitive scan or receipt data, so propensity models capture the full share of the customer wallet and improve in accuracy

**Solution overview:**

Propensity-to-buy analytics determine the signals that indicate the probability a customer is in a buying moment. Historic propensity models relied on sales data to identify buying patterns, but newer approaches are incorporating behavioral data. Proximity to a coffee shop might push a consumer over the threshold of a buying moment. Traditional, batch-oriented operations are insufficient to solve this problem. If you wait until that night, or even later in the day, you have lost the opportunity to act.

-----

**HOW TO GET STARTED**

To begin working on propensity-to-buy, leverage our [Propensity Scoring Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring).

With the propensity to buy, speed becomes a critical force in determining key inflection points. Databricks enables marketers to ingest data in real time and update probabilities. Lightweight queries can be automated to refresh models, and the resulting data can be fed automatically to downstream promotions, web or mobile systems, where the consumer can be engaged.

As this data is streamed into Delta Lake, data teams can quickly capture the data for broader analysis. Calculating a propensity to buy requires traversing interactions that are episodic in nature and span broad periods of time. Delta Lake helps simplify this with scalable metadata handling, ACID transactions and data skipping. Delta Lake even manages schema evolution to provide users with flexibility as their needs evolve.

**Typical use case data sources include:** point-of-sale data with tokens, loyalty data, e-commerce sales data, mobile application data, competitive scan or receipt data (optional), place of interest data (optional)

-----

**CHAPTER 14**
### Use Case: Next Best Action

**Overview**

The e-commerce boom over the last couple of years has given consumers ample choice for digital shopping options. If your business isn’t engaging customers at every point in their purchasing journey, you risk losing them to a competitor.
By applying AI/ML to automatically determine — in real time — the next\n", + "\n", + "best action for customers, you can greatly increase your conversion rates.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Direct to\n", + "Consumer\n", + "\n", + "\n", + "E-commerce\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Siloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n", + "\n", + "resulting in suboptimal recommendations for the next best action\n", + "\n", + "Companies need to ingest large amounts of data in real time and then take action on it immediately\n", + "\n", + "Many businesses still struggle with training their ML models to properly determine the next best action\n", + "\n", + "(and self-optimize based on the results)\n", + "\n", + "\n", + "-----\n", + "\n", + "**HOW TO GET STARTED**\n", + "\n", + "To begin working on propensity-to-\n", + "\n", + "buy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
7b7343372a08967b7914fed2682394bdbest action for customers, you can greatly increase your conversion rates.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Direct to\n", + "Consumer\n", + "\n", + "\n", + "E-commerce\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Siloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n", + "\n", + "resulting in suboptimal recommendations for the next best action\n", + "\n", + "Companies need to ingest large amounts of data in real time and then take action on it immediately\n", + "\n", + "Many businesses still struggle with training their ML models to properly determine the next best action\n", + "\n", + "(and self-optimize based on the results)\n", + "\n", + "\n", + "-----\n", + "\n", + "**HOW TO GET STARTED**\n", + "\n", + "To begin working on propensity-to-\n", + "\n", + "buy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "\n", + "**Value with Databricks:**\n", + "\n", + "Databricks provides all the tools needed to **process large volumes of data and find the next best**\n", + "\n", + "**action** at any given point in the customer journey\n", + "\n", + "**Near real-time insights** — the greater speed to data means businesses can react immediately to\n", + "\n", + "customer actions\n", + "\n", + "**Single source of truth** — break down data silos by unifying all of a company’s customer data (including\n", + "\n", + "basic information, transactional data, online behavior/purchase history, and more) to get a complete\n", + "\n", + "customer profile\n", + "\n", + "**Optimizing with AI/ML** — use AI to create self-optimizing ML models that are trained to find the best next\n", + "\n", + "step for customers\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 15**\n", + "### Customers That Innovate With Databricks Lakehouse for Retail\n", + "\n", + "\n", + "Some of the top retail and consumer packaged goods companies in the world turn to Databricks Lakehouse\n", + "\n", + "for Retail to deliver real-time experiences to their customers.\n", + "\n", + "Today, data is at the core of every innovation in the retail and consumer packaged goods industry.\n", + "\n", + "Databricks Lakehouse for Retail enables companies across every sector of retail and consumer goods to\n", + "\n", + "harness the power of real-time data and analytics to solve strategic challenges and deliver more engaging\n", + "\n", + "experiences to customers.\n", + "\n", + "Get started with a free trial of Lakehouse for Retail and start building better data applications today.\n", + "\n", + "**[Start your free trial](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo at: [databricks.com/contact](http://databricks.com/contact\r)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. 
Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
f420acb9b35388d3343c892d6c83435d**The**\n", + "**Delta Lake**\n", + "**Series**\n", + "**Lakehouse**\n", + "\n", + "Combining the best elements of\n", + "data lakes and data warehouses\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Here’s what\n", + "#### What’s \n", + "###### you’ll find inside\n", + "#### inside?\n", + "\n", + "\n", + "The Delta Lake Series of eBooks is published\n", + "\n", + "\n", + "by Databricks to help leaders and practitioners\n", + "\n", + "understand the full capabilities of Delta Lake as\n", + "\n", + "\n", + "**Introduction**\n", + "**What is Delta Lake?**\n", + "\n", + "\n", + "well as the landscape it resides in. This eBook,\n", + "\n", + "\n", + "**The Delta Lake Series — Lakehouse** , focuses\n", + "\n", + "on lakehouse.\n", + "\n", + "\n", + "**Chapter** **01**\n", + "\n", + "##### 02 Chapter\n", + " 03 Chapter\n", + "\n", + "\n", + "What Is\n", + "a Lakehouse?\n", + "\n", + "Diving Deep Into the Inner Workings\n", + "of the Lakehouse and Delta Lake\n", + "\n", + "Understanding\n", + "Delta Engine\n", + "\n", + "\n", + "#### What’s next?\n", + "\n", + "After reading this eBook, you’ll not only\n", + "\n", + "\n", + "understand what Delta Lake offers, but you’ll\n", + "\n", + "also understand how its features result in\n", + "\n", + "substantial performance improvements.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is Delta Lake?\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\n", + "\n", + "analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\n", + "\n", + "compatible with Apache Spark™ APIs.\n", + "\n", + "At Databricks, we’ve seen how Delta Lake can bring reliability, performance and\n", + "\n", + "lifecycle management to data lakes. Our customers have found that Delta Lake\n", + "\n", + "solves for challenges around malformed data ingestion, difficulties deleting data for\n", + "\n", + "compliance, or issues modifying data for data capture.\n", + "\n", + "With Delta Lake, you can accelerate the velocity that high-quality data can get into\n", + "\n", + "your data lake and the rate that teams can leverage that data with a secure and\n", + "\n", + "scalable cloud service.\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a Lakehouse?**\n", + "### CHAPTER 01\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a**\n", + "**Lakehouse?**\n", + "# 01\n", + "\n", + "Over the past few years at Databricks, we’ve seen a new data management architecture\n", + "\n", + "that emerged independently across many customers and use cases: the **lakehouse.**\n", + "\n", + "In this chapter, we’ll describe this new architecture and its advantages over previous\n", + "\n", + "approaches.\n", + "\n", + "Data warehouses have a long history of decision support and business intelligence\n", + "\n", + "applications. Since its inception in the late 1980s, data warehouse technology\n", + "\n", + "continued to evolve and MPP architectures led to systems that were able to handle\n", + "\n", + "larger data sizes.\n", + "\n", + "But while warehouses were great for structured data, a lot of modern enterprises\n", + "\n", + "have to deal with unstructured data, semi-structured data, and data with high variety,\n", + "\n", + "velocity and volume. 
Data warehouses are not suited for many of these use cases, and\n", + "\n", + "they are certainly not the most cost-efficient.\n", + "\n", + "As companies began to collect large amounts of data from many different sources,\n", + "\n", + "architects began envisioning a single system to house data for many different\n", + "\n", + "analytic products and workloads.\n", + "\n", + "About a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n", + "\n", + "in a variety of formats. While suitable for storing data, data lakes lack some critical\n", + "\n", + "features: They do not support transactions, they do not enforce data quality, and their\n", + "\n", + "lack of consistency / isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "\n", + "-----\n", + "\n", + "**A lakehouse combines the best elements**\n", + "**of data lakes and data warehouses**\n", + "\n", + "A lakehouse is a new data architecture that combines the best elements of data lakes\n", + "\n", + "and data warehouses.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
27aeb4ec0df5550cb0a51cb193c439bdvelocity and volume. Data warehouses are not suited for many of these use cases, and\n", + "\n", + "they are certainly not the most cost-efficient.\n", + "\n", + "As companies began to collect large amounts of data from many different sources,\n", + "\n", + "architects began envisioning a single system to house data for many different\n", + "\n", + "analytic products and workloads.\n", + "\n", + "About a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n", + "\n", + "in a variety of formats. While suitable for storing data, data lakes lack some critical\n", + "\n", + "features: They do not support transactions, they do not enforce data quality, and their\n", + "\n", + "lack of consistency / isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "\n", + "-----\n", + "\n", + "**A lakehouse combines the best elements**\n", + "**of data lakes and data warehouses**\n", + "\n", + "A lakehouse is a new data architecture that combines the best elements of data lakes\n", + "\n", + "and data warehouses.\n", + "\n", + "\n", + "and batch and streaming jobs. For these reasons, many of the promises of data lakes\n", + "\n", + "have not materialized and, in many cases, lead to a loss of many of the benefits of data\n", + "\n", + "warehouses.\n", + "\n", + "\n", + "The need for a flexible, high-performance system hasn’t abated. Companies\n", + "\n", + "\n", + "require systems for diverse data applications including SQL analytics, real-time\n", + "\n", + "monitoring, data science and machine learning. Most of the recent advances in\n", + "\n", + "AI have been in better models to process unstructured data (text, images, video,\n", + "\n", + "audio), but these are precisely the types of data that a data warehouse is not\n", + "\n", + "optimized for.\n", + "\n", + "A common approach is to use multiple systems — a data lake, several data\n", + "\n", + "warehouses, and other specialized systems such as streaming, time-series, graph\n", + "\n", + "and image databases. Having a multitude of systems introduces complexity and,\n", + "\n", + "more importantly, introduces delay as data professionals invariably need to move\n", + "\n", + "or copy data between different systems.\n", + "\n", + "\n", + "Lakehouses are enabled by a new system design: implementing similar data struc-\n", + "\n", + "tures and data management features to those in a data warehouse, directly on the\n", + "\n", + "kind of low-cost storage used for data lakes. They are what you would get if you had\n", + "\n", + "to redesign data warehouses in the modern world, now that cheap and highly reliable\n", + "\n", + "storage (in the form of object stores) are available.\n", + "\n", + "A lakehouse has the following key features:\n", + "\n", + "- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n", + "\n", + "be reading and writing data concurrently. Support for ACID transactions ensures\n", + "\n", + "consistency as multiple parties concurrently read or write data, typically using\n", + "\n", + "SQL.\n", + "\n", + "\n", + "-----\n", + "\n", + "- **\u0007Schema enforcement and governance:** The lakehouse should have a way to\n", + "\n", + "support schema enforcement and evolution, supporting DW schema paradigms\n", + "\n", + "such as star/snowflake-schemas. 
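The rows in this output repeat the tail of the previous chunk, which is what overlapping text splitting produces. As a minimal, hedged sketch only (LangChain's splitter is used purely for illustration, and the chunk size and overlap values are assumptions, not necessarily what this pipeline uses):

```python
# Sketch: produce overlapping chunks similar to the rows shown above.
# chunk_size / chunk_overlap values are illustrative assumptions.
from langchain.text_splitter import RecursiveCharacterTextSplitter

splitter = RecursiveCharacterTextSplitter(
    chunk_size=2000,     # target characters per chunk
    chunk_overlap=200,   # repeated tail, visible as duplicated text between rows
)

parsed_doc = "…full parsed PDF text…"   # placeholder for one parsed document
chunks = splitter.split_text(parsed_doc)
```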
The system should be able to reason about data\n", + "\n", + "integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n", + "\n", + "reduces staleness and improves recency, reduces latency and lowers the cost of\n", + "\n", + "having to operationalize two copies of the data in both a data lake and a warehouse.\n", + "\n", + "- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n", + "\n", + "compute use separate clusters, thus these systems are able to scale to many more\n", + "\n", + "concurrent users and larger data sizes. Some modern data warehouses also have\n", + "\n", + "this property.\n", + "\n", + "- **\u0007Openness:** The storage formats they use are open and standardized, such as\n", + "\n", + "Parquet, and they provide an API so a variety of tools and engines, including\n", + "\n", + "machine learning and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n", + "\n", + "The lakehouse can be used to store, refine, analyze and access data types needed\n", + "\n", + "for many new data applications, including images, video, audio, semi-structuredSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
d260bbdbcefe5b169f94c612022b7f40reduces staleness and improves recency, reduces latency and lowers the cost of\n", + "\n", + "having to operationalize two copies of the data in both a data lake and a warehouse.\n", + "\n", + "- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n", + "\n", + "compute use separate clusters, thus these systems are able to scale to many more\n", + "\n", + "concurrent users and larger data sizes. Some modern data warehouses also have\n", + "\n", + "this property.\n", + "\n", + "- **\u0007Openness:** The storage formats they use are open and standardized, such as\n", + "\n", + "Parquet, and they provide an API so a variety of tools and engines, including\n", + "\n", + "machine learning and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n", + "\n", + "The lakehouse can be used to store, refine, analyze and access data types needed\n", + "\n", + "for many new data applications, including images, video, audio, semi-structured\n", + "\n", + "data, and text.\n", + "\n", + "- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n", + "\n", + "analytics. Multiple tools might be needed to support all these workloads, but they all\n", + "\n", + "rely on the same data repository.\n", + "\n", + "- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n", + "\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "\n", + "serving real-time data applications.\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional\n", + "\n", + "features. Tools for security and access control are basic requirements. Data governance\n", + "\n", + "capabilities including auditing, retention and lineage have become essential particularly\n", + "\n", + "in light of recent privacy regulations. Tools that enable data discovery such as data\n", + "\n", + "catalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n", + "\n", + "features only need to be implemented, tested and administered for a single system.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Read the research**\n", + "**Delta Lake: High-Performance ACID**\n", + "**Table Storage Over Cloud Object Stores**\n", + "\n", + "**Abstract**\n", + "\n", + "Cloud object stores such as Amazon S3 are some of the largest and most\n", + "\n", + "cost-effective storage systems on the planet, making the main attractive\n", + "\n", + "target to store large data warehouses and data lakes. Unfortunately, their\n", + "\n", + "implementation as key-value stores makes it difficult to achieve ACID\n", + "\n", + "transactions and high performance: Metadata operations, such as listing\n", + "\n", + "objects, are expensive, and consistency guarantees are limited. In this paper,\n", + "\n", + "we present Delta Lake, an open source ACID table storage layer over cloud\n", + "\n", + "object stores initially developed at Databricks. Delta Lake uses a transaction log\n", + "\n", + "that is compacted into Apache Parquet format to provide ACID properties, time\n", + "\n", + "travel, and significantly faster metadata operations for large tabular data sets\n", + "\n", + "(e.g., the ability to quickly search billions of table partitions for those relevant\n", + "\n", + "to a query). 
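The lakehouse feature list extracted above (ACID transactions, schema enforcement and governance, open Parquet-based storage) can be made concrete with a small example. A hedged PySpark sketch, with hypothetical schema, table and column names, showing an ACID append that is rejected when the incoming schema does not match:

```python
# Sketch: schema enforcement on a Delta table (schema/table/column names are hypothetical).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

spark.sql("CREATE SCHEMA IF NOT EXISTS demo")
spark.sql("CREATE TABLE IF NOT EXISTS demo.sales (id BIGINT, amount DOUBLE) USING DELTA")

good = spark.createDataFrame([(1, 19.99)], "id BIGINT, amount DOUBLE")
good.write.format("delta").mode("append").saveAsTable("demo.sales")   # ACID append

bad = spark.createDataFrame([("not-an-id", "oops")], "id STRING, amount STRING")
try:
    bad.write.format("delta").mode("append").saveAsTable("demo.sales")
except Exception as e:
    print("Rejected by schema enforcement:", e)   # the write fails, the table is unchanged
```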
It also leverages this design to provide high-level features such\n", + "\n", + "as automatic data layout optimization, upserts, caching, and audit logs. Delta\n", + "\n", + "Lake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and\n", + "\n", + "other systems. Delta Lake is deployed at thousands of Databricks customers\n", + "\n", + "that process exabytes of data per day, with the largest instances managing\n", + "\n", + "exabyte-scale data sets and billions of objects.\n", + "\n", + "Authors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n", + "\n", + "Zhu, Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja\n", + "\n", + "Łuszczak, Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n", + "\n", + "Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n", + "\n", + "Read the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
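The abstract above mentions time travel and faster metadata operations built on Delta Lake's compacted transaction log. A hedged sketch of querying an earlier version of a table (the table name is a hypothetical placeholder; both the SQL and the DataFrame-option forms are standard Delta Lake time-travel syntax):

```python
# Sketch: Delta Lake time travel (table name is hypothetical).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# SQL form: query the table as it was at an earlier version.
v0 = spark.sql("SELECT * FROM demo.sales VERSION AS OF 0")

# DataFrame form: read as of a timestamp and compare with the current state.
yesterday = spark.read.option("timestampAsOf", "2024-09-18").table("demo.sales")
current = spark.read.table("demo.sales")
print(current.count() - yesterday.count(), "rows appended since 2024-09-18")
```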
cdb0b634a2405d4198c88650a922807dother systems. Delta Lake is deployed at thousands of Databricks customers\n", + "\n", + "that process exabytes of data per day, with the largest instances managing\n", + "\n", + "exabyte-scale data sets and billions of objects.\n", + "\n", + "Authors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n", + "\n", + "Zhu, Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja\n", + "\n", + "Łuszczak, Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n", + "\n", + "Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n", + "\n", + "Read the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Some early examples**\n", + "\n", + "The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n", + "\n", + "Microsoft’s Azure Synapse Analytics service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n", + "\n", + "enables a similar lakehouse pattern. Other managed services such as BigQuery and\n", + "\n", + "Redshift Spectrum have some of the lakehouse features listed above, but they are\n", + "\n", + "examples that focus primarily on BI and other SQL applications.\n", + "\n", + "Companies that want to build and implement their own systems have access to open\n", + "\n", + "source file formats (Delta Lake, Apache Iceberg, Apache Hudi) that are suitable for\n", + "\n", + "building a lakehouse.\n", + "\n", + "Merging data lakes and data warehouses into a single system means that data teams\n", + "\n", + "can move faster as they are able to use data without needing to access multiple systems.\n", + "\n", + "The level of SQL support and integration with BI tools among these early lakehouses\n", + "\n", + "is generally sufficient for most enterprise data warehouses. Materialized views and\n", + "\n", + "\n", + "A note about technical building blocks. While distributed file systems can be\n", + "\n", + "used for the storage layer, object stores are more commonly used in lakehouses.\n", + "\n", + "Object stores provide low-cost, highly available storage that excels at massively\n", + "\n", + "parallel reads — an essential requirement for modern data warehouses.\n", + "\n", + "**From BI to AI**\n", + "\n", + "The lakehouse is a new data management architecture that radically simplifies\n", + "\n", + "enterprise data infrastructure and accelerates innovation in an age when\n", + "\n", + "machine learning is poised to disrupt every industry. In the past, most of the\n", + "\n", + "data that went into a company’s products or decision-making was structured\n", + "\n", + "data from operational systems, whereas today, many products incorporate\n", + "\n", + "AI in the form of computer vision and speech models, text mining and others.\n", + "\n", + "Why use a lakehouse instead of a data lake for AI? 
A lakehouse gives you data\n", + "\n", + "versioning, governance, security and ACID properties that are needed even for\n", + "\n", + "unstructured data.\n", + "\n", + "\n", + "stored procedures are available, but users may need to employ other mechanisms that\n", + "\n", + "\n", + "aren’t equivalent to those found in traditional data warehouses. The latter is particularly\n", + "\n", + "important for “lift and shift scenarios,” which require systems that achieve semantics\n", + "\n", + "that are almost identical to those of older, commercial data warehouses.\n", + "\n", + "What about support for other types of data applications? Users of a lakehouse have\n", + "\n", + "access to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n", + "\n", + "libraries) for non-BI workloads like data science and machine learning. Data\n", + "\n", + "exploration and refinement are standard for many analytic and data science\n", + "\n", + "applications. Delta Lake is designed to let users incrementally improve the quality of\n", + "\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specializedSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
b1f28e2afb30602c0205684eb65002dfversioning, governance, security and ACID properties that are needed even for\n", + "\n", + "unstructured data.\n", + "\n", + "\n", + "stored procedures are available, but users may need to employ other mechanisms that\n", + "\n", + "\n", + "aren’t equivalent to those found in traditional data warehouses. The latter is particularly\n", + "\n", + "important for “lift and shift scenarios,” which require systems that achieve semantics\n", + "\n", + "that are almost identical to those of older, commercial data warehouses.\n", + "\n", + "What about support for other types of data applications? Users of a lakehouse have\n", + "\n", + "access to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n", + "\n", + "libraries) for non-BI workloads like data science and machine learning. Data\n", + "\n", + "exploration and refinement are standard for many analytic and data science\n", + "\n", + "applications. Delta Lake is designed to let users incrementally improve the quality of\n", + "\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specialized\n", + "\n", + "systems (such as data warehouses) that have years of investments and real-\n", + "\n", + "world deployments behind them. Users may favor certain tools (BI tools, IDEs,\n", + "\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "\n", + "and other issues will be addressed as the technology continues to mature and\n", + "\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "\n", + "diverse data applications.\n", + "\n", + "\n", + "data in their lakehouse until it is ready for consumption.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the Inner Workings**\n", + "**of the Lakehouse and Delta Lake**\n", + "\n", + "### CHAPTER 02\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "# 02\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "\n", + "paper that describes some of the core technological challenges and solutions that\n", + "\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. 
You\n", + "\n", + "can read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "\n", + "[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "\n", + "storage, this pattern has been playing out for years. Vendors continue to try to reinvent\n", + "\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "\n", + "object stores like Amazon S3 have become some of the largest and most cost-\n", + "\n", + "effective storage systems in the world, which makes them an attractive platform to\n", + "\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
36545a5c53d7999af33b9e016e0d8188Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "\n", + "storage, this pattern has been playing out for years. Vendors continue to try to reinvent\n", + "\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "\n", + "object stores like Amazon S3 have become some of the largest and most cost-\n", + "\n", + "effective storage systems in the world, which makes them an attractive platform to\n", + "\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,\n", + "\n", + "performance is hampered by expensive metadata operations (e.g., listing objects)\n", + "\n", + "and limited consistency guarantees.\n", + "\n", + "Based on the characteristics of cloud object stores, three approaches have emerged.\n", + "\n", + "**1. Data lakes**\n", + "\n", + "The first is directories of files (i.e., data lakes) that store the table as a collection\n", + "\n", + "of objects, typically in columnar format such as Apache Parquet. It’s an attractive\n", + "\n", + "approach because the table is just a group of objects that can be accessed from\n", + "\n", + "a wide variety of tools without a lot of additional data stores or systems. However,\n", + "\n", + "both performance and consistency problems are common. Hidden data corruption\n", + "\n", + "is common due to failed transactions, eventual consistency leads to inconsistent\n", + "\n", + "queries, latency is high, and basic management capabilities like table versioning and\n", + "\n", + "audit logs are unavailable.\n", + "\n", + "**2. Custom storage engines**\n", + "\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. Finally, and most egregiously, the proprietary metadata service locks\n", + "\n", + "customers into a specific service provider, leaving customers to contend with\n", + "\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "\n", + "adopt a new approach later.\n", + "\n", + "**3. 
Lakehouse**\n", + "\n", + "With Delta Lake, an open source ACID table storage layer atop cloud object stores,\n", + "\n", + "we sought to build a car instead of a faster horse with not just a better data store,\n", + "\n", + "but a fundamental change in how data is stored and used via the lakehouse. A\n", + "\n", + "lakehouse is a new architecture that combines the best elements of data lakes and\n", + "\n", + "data warehouses. Lakehouses are enabled by a new system design: implementing\n", + "\n", + "similar data structures and data management features to those in a data warehouse,\n", + "\n", + "directly on the kind of low-cost storage used for data lakes. They are what you would\n", + "\n", + "get if you had to redesign storage engines in the modern world, now that cheap and\n", + "\n", + "highly reliable storage (in the form of object stores) are available.\n", + "\n", + "Delta Lake maintains information about which objects are part of a Delta table in an\n", + "\n", + "ACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n", + "\n", + "the cloud object store. This design allows clients to update multiple objects at once,\n", + "\n", + "replace a subset of the objects with another, etc., in a serializable manner that stillSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
6f0ad77cb910ed72fc3436f747611387we sought to build a car instead of a faster horse with not just a better data store,\n", + "\n", + "but a fundamental change in how data is stored and used via the lakehouse. A\n", + "\n", + "lakehouse is a new architecture that combines the best elements of data lakes and\n", + "\n", + "data warehouses. Lakehouses are enabled by a new system design: implementing\n", + "\n", + "similar data structures and data management features to those in a data warehouse,\n", + "\n", + "directly on the kind of low-cost storage used for data lakes. They are what you would\n", + "\n", + "get if you had to redesign storage engines in the modern world, now that cheap and\n", + "\n", + "highly reliable storage (in the form of object stores) are available.\n", + "\n", + "Delta Lake maintains information about which objects are part of a Delta table in an\n", + "\n", + "ACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n", + "\n", + "the cloud object store. This design allows clients to update multiple objects at once,\n", + "\n", + "replace a subset of the objects with another, etc., in a serializable manner that still\n", + "\n", + "achieves high parallel read/write performance from the objects. The log also provides\n", + "\n", + "significantly faster metadata operations for large tabular data sets. Additionally, Delta\n", + "\n", + "Lake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n", + "\n", + "snapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n", + "\n", + "caching, and audit logs. Together, these features improve both the manageability and\n", + "\n", + "performance of working with data in cloud object stores, ultimately opening the door\n", + "\n", + "to the lakehouse architecture that combines the key features of data warehouses and\n", + "\n", + "data lakes to create a better, simpler data architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "Today, Delta Lake is used across thousands of Databricks customers, processing\n", + "\n", + "exabytes of structured and unstructured data each day, as well as many organizations\n", + "\n", + "in the open source community. These use cases span a variety of data sources and\n", + "\n", + "applications. The data types stored include Change Data Capture (CDC) logs from\n", + "\n", + "enterprise OLTP systems, application logs, time-series data, graphs, aggregate\n", + "\n", + "tables for reporting, and image or feature data for machine learning. The applications\n", + "\n", + "include SQL workloads (most commonly), business intelligence, streaming, data\n", + "\n", + "science, machine learning and graph analytics. 
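The passage above describes how Delta Lake's Parquet-compacted log enables serializable multi-object updates, upserts and audit logs. A hedged sketch of an upsert using the standard DeltaTable.merge API (table and column names are hypothetical placeholders):

```python
# Sketch: upsert (MERGE) into a Delta table; names are hypothetical placeholders.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

updates = spark.createDataFrame(
    [(1, 24.50), (42, 7.99)], "id BIGINT, amount DOUBLE"
)

target = DeltaTable.forName(spark, "demo.sales")
(
    target.alias("t")
    .merge(updates.alias("u"), "t.id = u.id")
    .whenMatchedUpdateAll()      # update rows that already exist
    .whenNotMatchedInsertAll()   # insert rows that do not
    .execute()                   # committed atomically via the transaction log
)

# The audit trail mentioned above is exposed as table history:
target.history().select("version", "operation", "timestamp").show()
```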
Overall, Delta Lake has proven itself to\n", + "\n", + "be a good fit for most data lake applications that would have used structured storage\n", + "\n", + "formats like Parquet or ORC, and many traditional data warehousing workloads.\n", + "\n", + "Across these use cases, we found that customers often use Delta Lake to significantly\n", + "\n", + "simplify their data architecture by running more workloads directly against cloud\n", + "\n", + "object stores, and increasingly, by creating a lakehouse with both data lake and\n", + "\n", + "transactional features to replace some or all of the functionality provided by message\n", + "\n", + "queues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n", + "\n", + "Amazon Redshift).\n", + "\n", + "**[In the research paper,](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **the authors explain:**\n", + "\n", + "- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performance\n", + "\n", + "Through the paper, you’ll gain a better understanding of Delta Lake and how it\n", + "\n", + "enables a wide range of DBMS-like performance and management features for data\n", + "\n", + "held in low-cost cloud storage. As well as how the Delta Lake storage format and\n", + "\n", + "access protocols make it simple to operate, highly available, and able to deliver high-\n", + "\n", + "bandwidth access to the object store.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding Delta Engine**\n", + "\n", + "### CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding**\n", + "**Delta Engine**\n", + "# 03SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
f179ec9ceb185dca887837532487af04- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performance\n", + "\n", + "Through the paper, you’ll gain a better understanding of Delta Lake and how it\n", + "\n", + "enables a wide range of DBMS-like performance and management features for data\n", + "\n", + "held in low-cost cloud storage. As well as how the Delta Lake storage format and\n", + "\n", + "access protocols make it simple to operate, highly available, and able to deliver high-\n", + "\n", + "bandwidth access to the object store.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding Delta Engine**\n", + "\n", + "### CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding**\n", + "**Delta Engine**\n", + "# 03\n", + "\n", + "The Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n", + "\n", + "engine to take advantage of modern CPU architecture with optimizations to Spark\n", + "\n", + "3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n", + "\n", + "Runtime 7.0. Together, these features significantly accelerate query performance on\n", + "\n", + "data lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n", + "\n", + "adopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n", + "\n", + "**Scaling execution performance**\n", + "\n", + "One of the big hardware trends over the last several years is that CPU clock speeds\n", + "\n", + "have plateaued. The reasons are outside the scope of this chapter, but the takeaway\n", + "\n", + "is that we have to find new ways to process data faster beyond raw compute power.\n", + "\n", + "One of the most impactful methods has been to improve the amount of data that can\n", + "\n", + "be processed in parallel. However, data processing engines need to be specifically\n", + "\n", + "architected to take advantage of this parallelism.\n", + "\n", + "In addition, data teams are being given less and less time to properly model data as\n", + "\n", + "the pace of business increases. Poorer modeling in the interest of better business\n", + "\n", + "agility drives poorer query performance. 
Naturally, this is not a desired state, and\n", + "\n", + "organizations want to find ways to maximize both agility and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Announcing Delta Engine for**\n", + "**high-performance query execution**\n", + "\n", + "Delta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n", + "\n", + "workloads through three components: an improved query optimizer, a caching\n", + "\n", + "layer that sits between the execution layer and the cloud object storage, and a native\n", + "\n", + "vectorized execution engine that’s written in C++.\n", + "\n", + "The improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n", + "\n", + "optimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n", + "\n", + "statistics to deliver up to 18x increased performance in star schema workloads.\n", + "\n", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\n", + "\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "\n", + "modern cloud hardware. It brings performance improvements to all workload types\n", + "\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "\n", + "By linking these three components together, we think it will be easier for customers\n", + "\n", + "to understand how improvements in multiple places within the Databricks codeSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
731dd131fbfa4bd813b743ecdd9eba7dDelta Engine’s caching layer automatically chooses which input data to cache for the\n", + "\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "\n", + "modern cloud hardware. It brings performance improvements to all workload types\n", + "\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "\n", + "By linking these three components together, we think it will be easier for customers\n", + "\n", + "to understand how improvements in multiple places within the Databricks code\n", + "\n", + "aggregate into significantly faster performance for analytics workloads on data lakes.\n", + "\n", + "We’re excited about the value that Delta Engine delivers to our customers. While the\n", + "\n", + "time and cost savings are already valuable, its role in the lakehouse pattern supports\n", + "\n", + "new advances in how data teams design their data architectures for increased\n", + "\n", + "unification and simplicity.\n", + "\n", + "For more information on the Delta Engine, watch this keynote address from\n", + "\n", + "[Spark + AI Summit 2020:](https://www.youtube.com/watch?v=o54YMz8zvCY) [Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n", + "\n", + "\n", + "-----\n", + "\n", + "## What’s next?\n", + "\n", + "\n", + "Now that you understand Delta Lake and how its features can improve\n", + "\n", + "performance, it may be time to take a look at some additional resources.\n", + "\n", + "**Data + AI Summit Europe 2020 >**\n", + "\n", + "- [Photon Technical Deep Dive: How to Think Vectorized](https://databricks.com/session_eu20/photon-technical-deep-dive-how-to-think-vectorized)\n", + "\n", + "\n", + "**Explore subsequent eBooks in the collection >**\n", + "\n", + "- The Delta Lake Series — Fundamentals and Performance\n", + "\n", + "- The Delta Lake Series — Features\n", + "\n", + "- The Delta Lake Series — Streaming\n", + "\n", + "- The Delta Lake Series — Customer Use Cases\n", + "\n", + "\n", + "\n", + "- [MLflow, Delta Lake and Lakehouse Use Cases Meetup and AMA](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup)\n", + "\n", + "- [Common Strategies for Improving Performance on Your Delta Lakehouse](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n", + "\n", + "\n", + "\n", + "- [Achieving Lakehouse Models With Spark 3.0](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0)\n", + "\n", + "- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n", + "\n", + "\n", + "**Do a deep dive into Delta Lake >**\n", + "\n", + "- [Analytics on the Data Lake With Tableau and the Lakehouse 
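Delta Engine's caching layer is described above as transparently caching input data in a CPU-efficient format on local NVMe SSDs. As a hedged illustration (the setting below is the documented Databricks disk-cache flag, the table name is a hypothetical placeholder, and Photon itself is enabled at the cluster level rather than in code):

```python
# Sketch: ensure the Databricks disk cache is on, then run a scan that benefits from it.
spark.conf.set("spark.databricks.io.cache.enabled", "true")  # documented Databricks setting

# The first scan populates the cache from cloud storage; repeated scans read the
# locally cached, transcoded copy (table name is a hypothetical placeholder).
spark.table("demo.sales").groupBy("id").sum("amount").show()
```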
Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n", + "\n", + "- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "\n", + "**Vodcasts and podcasts >**\n", + "\n", + "\n", + "\n", + "- [Welcome to Lakehouse. Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
b39d47a11f4d8f74a085216623bd80f9- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n", + "\n", + "\n", + "**Do a deep dive into Delta Lake >**\n", + "\n", + "- [Analytics on the Data Lake With Tableau and the Lakehouse Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n", + "\n", + "- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "\n", + "**Vodcasts and podcasts >**\n", + "\n", + "\n", + "\n", + "- [Welcome to Lakehouse. Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)\n", + "\n", + "- [Data Brew by Databricks | Season 1: Lakehouses](https://databricks.com/discover/data-brew)\n", + "\n", + "\n", + "**[Try Databricks for free >](https://databricks.com/product/delta-lake-on-databricks)**\n", + "**[Learn more >](https://databricks.com/product/delta-lake-on-databricks)**\n", + "\n", + "\n", + "\n", + "- [Data Alone Is Not Enough: The Evolution of Data Architectures](https://a16z.com/2020/10/22/data-alone-is-not-enough-the-evolution-of-data-architectures/)\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
608c4c66830fb225969000b48507233b### eBook\n", + "\n", + "# The Big Book\n", + " of MLOps\n", + "\n", + "#### A data-centric approach\n", + " to build and scale AI,\n", + " including LLMOps\n", + "\n", + "M o d e l O p s D a t a O p s D e �O p s\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "**A U T H O R S :**\n", + "\n", + "**Joseph Bradley**\n", + "\n", + "Lead Product Specialist\n", + "\n", + "**Rafi Kurlansik**\n", + "\n", + "Lead Product Specialist\n", + "\n", + "**Matt Thomson**\n", + "\n", + "Director, EMEA Product Specialists\n", + "\n", + "**Niall Turbitt**\n", + "\n", + "Lead Data Scientist\n", + "\n", + "\n", + "**C H A P T E R 1 :** \u0007 **Introduction** 3\n", + "\n", + "###### People and process 4\n", + "\n", + " People 5\n", + "\n", + " Process 6\n", + "\n", + " Why should I care about MLOps? 8\n", + "\n", + " Guiding principles 9\n", + "\n", + "**C H A P T E R 2 :** \u0007 **Fundamentals of MLOps** 11\n", + "\n", + "###### Semantics of dev, staging and prod 11\n", + "\n", + " ML deployment patterns 15\n", + "\n", + "**C H A P T E R 3 :** **MLOps Architecture and Process** \u0007 19\n", + "\n", + "###### Architecture components 19\n", + "\n", + " Data Lakehouse 19\n", + "\n", + " MLflow 19\n", + "\n", + " Databricks and MLflow Autologging 20\n", + "\n", + " Feature Store 20\n", + "\n", + " MLflow Model Serving 20\n", + "\n", + " Databricks SQL 20\n", + "\n", + " Databricks Workflows and Jobs 20\n", + "\n", + " Reference architecture 21\n", + "\n", + " Overview 22\n", + "\n", + " Dev 23\n", + "\n", + " Staging 27\n", + "\n", + " Prod 30\n", + "\n", + "**C H A P T E R 4 :** \u0007 **LLMOps – Large Language Model Operations** 36\n", + "\n", + "###### Discussion of key topics for LLMOps 39\n", + "\n", + " Reference architecture 46\n", + "\n", + " Looking ahead 48\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "## Introduction\n", + "\n", + "**Note:** Our prescription for MLOps is general to\n", + "\n", + "any set of tools and applications, though we give\n", + "\n", + "concrete examples using Databricks features\n", + "\n", + "and functionality. We also note that no single\n", + "\n", + "architecture or prescription will work for all\n", + "\n", + "organizations or use cases. Therefore, while we\n", + "\n", + "provide guidelines for building MLOps, we call out\n", + "\n", + "important options and variations. This whitepaper\n", + "\n", + "is written primarily for ML engineers and data\n", + "\n", + "scientists wanting to learn more about MLOps,\n", + "\n", + "with high-level guidance and pointers to more\n", + "\n", + "resources.\n", + "\n", + "\n", + "The past decade has seen rapid growth in the adoption of machine learning (ML). While the early\n", + "\n", + "adopters were a small number of large technology companies that could afford the necessary resources,\n", + "\n", + "in recent times ML-driven business cases have become ubiquitous in all industries. 
Indeed, according to\n", + "\n", + "MIT Sloan Management Review, 83% of CEOs report that [artificial intelligence (AI) is a strategic priority](https://sloanreview.mit.edu/projects/artificial-intelligence-in-business-gets-real/) .\n", + "\n", + "This democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n", + "\n", + "[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n", + "\n", + "However, building and deploying ML models is complex. There are many options available for achieving\n", + "\n", + "this but little in the way of well-defined and accessible standards. As a result, over the past few years weSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf2024-09-19T16:57:22Z
776424d2ba7780c9b9a590ec888d5154This democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n", + "\n", + "[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n", + "\n", + "However, building and deploying ML models is complex. There are many options available for achieving\n", + "\n", + "this but little in the way of well-defined and accessible standards. As a result, over the past few years we\n", + "\n", + "have seen the emergence of the machine learning operations (MLOps) field. **MLOps is a set of processes**\n", + "\n", + "**and automation for managing models, data and code to improve performance stability and long-term**\n", + "\n", + "**efficiency in ML systems.** Put simply, MLOps = [ModelOps](https://en.wikipedia.org/wiki/ModelOps) + [DataOps](https://en.wikipedia.org/wiki/DataOps) + [DevOps](https://en.wikipedia.org/wiki/DevOps) .\n", + "\n", + "The concept of developer operations (DevOps) is nothing new. It has been used for decades to deploy\n", + "\n", + "software applications, and the deployment of ML applications has much to gain from it. However, strong\n", + "\n", + "DevOps practices and tooling alone are insufficient because ML applications rely on a constellation of\n", + "\n", + "artifacts (e.g., models, data, code) that require special treatment. Any MLOps solution must take into\n", + "\n", + "account the various people and processes that interact with these artifacts.\n", + "\n", + "Here at Databricks we have seen firsthand how customers develop their MLOps approaches, some of\n", + "\n", + "which work better than others. We launched the open source [MLflow](https://www.mlflow.org/) project to help make our customers\n", + "\n", + "successful with MLOps, and with over 10 million downloads/month from PyPI as of May 2022, MLflow’s\n", + "\n", + "adoption is a testament to the appetite for operationalizing ML models.\n", + "\n", + "This whitepaper aims to explain how your organization can build robust MLOps practices incrementally.\n", + "\n", + "First, we describe the people and process involved in deploying ML applications and the need for\n", + "\n", + "operational rigor. We also provide general principles to help guide your planning and decision-making. Next,\n", + "\n", + "we go through the fundamentals of MLOps, defining terms and broad strategies for deployment. 
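The chapter above frames MLOps as ModelOps + DataOps + DevOps and points to MLflow for operationalizing models. A hedged, minimal MLflow tracking sketch (the experiment path, parameters and metric are illustrative assumptions, not a prescription from this document):

```python
# Sketch: track a training run with MLflow (names and values are illustrative).
import mlflow
import mlflow.sklearn
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1_000, n_features=20, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

mlflow.set_experiment("/Shared/mlops-demo")   # hypothetical experiment path
with mlflow.start_run():
    model = RandomForestClassifier(n_estimators=100, random_state=0)
    model.fit(X_train, y_train)

    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("accuracy", accuracy_score(y_test, model.predict(X_test)))
    mlflow.sklearn.log_model(model, "model")  # versioned artifact for later deployment
```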
Finally, we\n", + "\n", + "introduce a general MLOps reference architecture, the details of its processes, and best practices.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### People and process\n", + "\n", + "**M L W O R K F L O W A N D P E R S O N A S**\n", + "\n", + "Data Governance Officer\n", + "\n", + "Dat1\n", + "Data Scientist\n", + "Engineer\n", + "\n", + "ML Engineer\n", + "\n", + "Business Stakeholder\n", + "\n", + "\n", + "Dataa\n", + "Preparation\n", + "\n", + "\n", + "Evplorator{a\n", + "Data unal{sis\n", + "\n", + "\n", + "Feature Mode� Modela Deplo{�ent\n", + "Engineering Training Validation\n", + "\n", + "\n", + "Mode� Modela Deplo{�ent Monitoring\n", + "Training Validation\n", + "\n", + "\n", + "Modela\n", + "Validation\n", + "\n", + "\n", + "**Figure 1**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### People\n", + "\n", + "Building ML applications is a team sport, and while in the real world people “wear many hats,” it is still\n", + "\n", + "useful to think in terms of archetypes. They help us understand roles and responsibilities and where\n", + "\n", + "handoffs are required, and they highlight areas of complexity within the system. We distinguish between\n", + "\n", + "the following personas:\n", + "\n", + "**M L P E R S O N A S**\n", + "\n", + "\n", + "Data\n", + "Governance\n", + "Officer\n", + "\n", + "Responsible for ensuring\n", + "\n", + "that data governance,\n", + "\n", + "data privacy and other\n", + "\n", + "compliance measures are\n", + "\n", + "adhered to across the\n", + "\n", + "model development andSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf2024-09-19T16:57:22Z
**Data Governance Officer:** Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.

**Data Engineer:** Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.

**Data Scientist:** Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.

**ML Engineer:** Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment ([CI/CD](https://en.wikipedia.org/wiki/CI/CD)).

**Business Stakeholder:** Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate.

#### Process

Together, these people develop and maintain ML applications. While the development process follows a distinct pattern, it is not entirely monolithic. The way you deploy a model has an impact on the steps you take, and using techniques like reinforcement learning or online learning will change some details. Nevertheless, these steps and the personas involved are variations on a core theme, as illustrated in Figure 1 above.

Let’s walk through the process step by step.
Keep in mind that this is an iterative process, the frequency of which will be determined by the particular business case and data.

**ML PROCESS:** Data Preparation → Exploratory Data Analysis → Feature Engineering → Model Training → Model Validation → Deployment → Monitoring

###### Data preparation

Prior to any data science or ML work lies the data engineering needed to prepare production data and make it available for consumption. This data may be referred to as “raw data,” and in later steps, data scientists will extract features and labels from the raw data.

###### Exploratory data analysis (EDA)

Analysis is conducted by data scientists to assess statistical properties of the data available, and determine if they address the business question. This requires frequent communication and iteration with business stakeholders.
###### Feature engineering

Data scientists clean data and apply business logic and specialized transformations to engineer features for model training. These data, or features, are split into training, testing and validation sets.

###### Model training

Data scientists explore multiple algorithms and hyperparameter configurations using the prepared data, and a best-performing model is determined according to predefined evaluation metric(s).

###### Model validation

Prior to deployment, a selected model is subjected to a validation step to ensure that it exceeds some baseline level of performance, in addition to meeting any other technical, business or regulatory requirements. This necessitates collaboration between data scientists, business stakeholders and ML engineers.

###### Deployment

ML engineers will deploy a validated model via batch, streaming or online serving, depending on the requirements of the use case.

###### Monitoring

ML engineers will monitor deployed models for signs of performance degradation or errors. Data scientists will often be involved in early monitoring phases to ensure that new models perform as expected after deployment. This will inform if and when the deployed model should be updated by returning to earlier stages in the workflow.

The data governance officer is ultimately responsible for making sure this entire process is compliant with company and regulatory policies.
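As a deliberately simplified illustration of the model training and model validation steps above, the sketch below tries a few hyperparameter configurations, selects the best one by a predefined metric, and checks it against a baseline before it would be handed off for deployment. The dataset, model family and threshold are hypothetical placeholders, not a prescribed implementation.

```python
# Minimal sketch of "model training" and "model validation" (hypothetical data and thresholds).
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import GridSearchCV, train_test_split

X, y = make_classification(n_samples=5_000, n_features=20, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Model training: explore several hyperparameter configurations.
search = GridSearchCV(
    RandomForestClassifier(random_state=42),
    param_grid={"n_estimators": [100, 200], "max_depth": [5, 10]},
    scoring="roc_auc",  # the predefined evaluation metric
    cv=3,
)
search.fit(X_train, y_train)

# Model validation: test the selected model on held-out data against a baseline.
test_auc = roc_auc_score(y_test, search.best_estimator_.predict_proba(X_test)[:, 1])
BASELINE_AUC = 0.80  # hypothetical business/regulatory baseline
assert test_auc >= BASELINE_AUC, "Model failed validation; do not deploy."
print(f"Best params: {search.best_params_}, held-out AUC: {test_auc:.3f}")
```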
#### Why should I care about MLOps?

Consider that the typical ML application depends on the aforementioned people and process, as well as regulatory and ethical requirements. These dependencies change over time — and your models, data and code must change as well. The data that were a reliable signal yesterday become noise; open source libraries become outdated; regulatory environments evolve; and teams change. ML systems must be resilient to these changes. Yet this broad scope can be a lot for organizations to manage — there are many moving parts! Addressing these challenges with a defined MLOps strategy can dramatically reduce the iteration cycle of delivering models to production, thereby accelerating time to business value.

There are two main types of risk in ML systems: **technical risk** inherent to the system itself and **risk of noncompliance** with external systems. Both of these risks derive from the dependencies described above.
For example, if data pipeline infrastructure, KPIs, model monitoring and documentation are lacking, then you risk your system becoming destabilized or ineffective. On the other hand, even a well-designed system that fails to comply with corporate, regulatory and ethical requirements runs the risk of losing funding, receiving fines or incurring reputational damage. Recently, one private company’s data collection practices were found to have violated the Children’s Online Privacy Protection Rule (COPPA). The [FTC fined](https://www.protocol.com/policy/ftc-algorithm-destroy-data-privacy) the company $1.5 million and [ordered](https://www.ftc.gov/system/files/ftc_gov/pdf/wwkurbostipulatedorder.pdf) it to destroy or delete the illegally harvested data, and all models or algorithms developed with that data.

With respect to efficiency, the absence of MLOps is typically marked by an overabundance of manual processes. These steps are slower and more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck, capping the ability for a data team to take on new projects.

Seen through these lenses, the aim of MLOps becomes clear: improve the long-term performance stability and success rate of ML systems while maximizing the efficiency of teams who build them. In the introduction, we defined MLOps to address this aim: MLOps is a **set of processes and automation** to manage **models, data and code** to meet the two goals of **stable performance and long-term efficiency in ML systems**. _MLOps = ModelOps + DataOps + DevOps_.

With clear goals we are ready to discuss principles that guide design decisions and planning for MLOps.

#### Guiding principles

Given the complexity of ML processes and the different personas involved, it is helpful to start from simpler, high-level guidance. We propose several broadly applicable principles to guide MLOps decisions. They inform our design choices in later sections, and we hope they can be adapted to support whatever your business use case may be.

###### Always keep your business goals in mind

Just as the core purpose of ML in a business is to enable data-driven decisions and products, the core purpose of MLOps is to ensure that those data-driven applications remain stable, are kept up to date and continue to have positive impacts on the business. When prioritizing technical work on MLOps, consider the business impact: Does it enable new business use cases? Does it improve data teams’ productivity? Does it reduce operational costs or risks?

###### Take a data-centric approach to machine learning
Feature engineering, training, inference and monitoring pipelines are data pipelines. As such, they need to be as robust as other production data engineering processes. Data quality is crucial in any ML application, so ML data pipelines should employ systematic approaches to monitoring and mitigating data quality issues. Avoid tools that make it difficult to join data from ML predictions, model monitoring, etc., with the rest of your data. The simplest way to achieve this is to develop ML applications on the same platform used to manage production data. For example, instead of downloading training data to a laptop, where it is hard to govern and reproduce results, secure the data in cloud storage and make that storage available to your training process.
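To make the last point concrete, here is a minimal sketch of reading training data directly from a governed table on the platform rather than from a local copy. The table name is a hypothetical placeholder, and the snippet assumes it runs where a SparkSession is available (for example, a Databricks notebook).

```python
# Minimal sketch: read governed training data in place rather than copying it to a laptop.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical table name; in practice this would be a production table governed by
# table access controls, so reads stay auditable and results stay reproducible.
training_df = spark.table("prod_catalog.ml.training_examples")

# Keep heavy transformations in Spark; only collect small samples locally if needed.
train_pdf = training_df.sample(fraction=0.1, seed=42).toPandas()
print(train_pdf.shape)
```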
###### Implement MLOps in a modular fashion

As with any software application, code quality is paramount for an ML application. Modularized code enables testing of individual components and mitigates difficulties with future code refactoring. Define clear steps (e.g., training, evaluation or deployment), supersteps (e.g., training-to-deployment pipeline) and responsibilities to clarify the modular structure of your ML application.

###### Process should guide automation

We automate processes to improve productivity and lower risk of human error, but not every step of a process can or should be automated. People still determine the business question, and some models will always need human oversight before deployment. Therefore, the development process is primary and each module in the process should be automated as needed. This allows incremental build-out of automation and customization. Furthermore, when it comes to particular automation tools, choose those that align to your people and process. For example, instead of building a model logging framework around a generic database, you can choose a specialized tool like MLflow, which has been designed with the ML model lifecycle in mind.
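The sketch below shows what a specialized tool like MLflow buys you in practice: parameters, metrics and the model artifact are logged with a few calls instead of a hand-rolled schema in a generic database. The parameter values, metric and run name are illustrative only.

```python
# Minimal MLflow tracking sketch (illustrative parameters, metrics and model).
import mlflow
import mlflow.sklearn
from sklearn.datasets import make_regression
from sklearn.linear_model import Ridge

X, y = make_regression(n_samples=1_000, n_features=10, random_state=0)

with mlflow.start_run(run_name="ridge-baseline"):
    alpha = 0.5
    model = Ridge(alpha=alpha).fit(X, y)

    mlflow.log_param("alpha", alpha)                  # hyperparameters
    mlflow.log_metric("train_r2", model.score(X, y))  # metrics
    mlflow.sklearn.log_model(model, "model")          # the model artifact itself
```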
**CHAPTER 2:**
## Fundamentals of MLOps

**Note:** In our experience with customers, there can be variations in these three stages, such as splitting staging into separate “test” and “QA” substages. However, the principles remain the same and we stick to a dev, staging and prod setup within this paper.

#### Semantics of dev, staging and prod

ML workflows include the following key assets: code, models and data. These assets need to be developed (dev), tested (staging) and deployed (prod). For each stage, we also need to operate within an execution environment. Thus, all the above — execution environments, code, models and data — are divided into dev, staging and prod.

These divisions can best be understood in terms of quality guarantees and access control. On one end, assets in prod are generally business critical, with the highest guarantee of quality and tightest control on who can modify them. Conversely, dev assets are more widely accessible to people but offer no guarantee of quality.

For example, many data scientists will work together in a dev environment, freely producing dev model prototypes. Any flaws in these models are relatively low risk for the business, as they are separate from the live product. In contrast, the staging environment replicates the execution environment of production. Here, code changes made in the dev environment are tested prior to code being deployed to production. The staging environment acts as a gateway for code to reach production, and accordingly, fewer people are given access to staging. Code promoted to production is considered a live product. In the production environment, human error can pose the greatest risk to business continuity, and so the fewest people have permission to modify production models.
One might be tempted to say that code, models and data each share a one-to-one correspondence with the execution environment — e.g., all dev code, models and data are in the dev environment. That is often close to true but is rarely correct. Therefore, we will next discuss the precise semantics of dev, staging and prod for execution environments, code, models and data. We also discuss mechanisms for restricting access to each.

###### Execution environments

An execution environment is the place where models and data are created or consumed by code. Each execution environment consists of compute instances, their runtimes and libraries, and automated jobs.

With Databricks, an “environment” can be defined via dev/staging/prod separation at a few levels. An organization could create distinct environments across multiple cloud accounts, multiple Databricks workspaces in the same cloud account, or within a single Databricks workspace. These separation patterns are illustrated in Figure 2 below.

**Figure 2** Environment separation patterns: dev, staging and prod can be separated across multiple cloud accounts, across multiple Databricks workspaces, or within a single workspace using Databricks workspace access controls.

**Note:** Databricks released Delta Lake to the open source community in 2019. Delta Lake provides all the data lifecycle management functions that are needed to make cloud-based object stores reliable and performant. This design allows clients to update multiple objects at once and to replace a subset of the objects with another, etc., in a serializable manner that still achieves high parallel read/write performance from the objects — while offering advanced capabilities like time travel (e.g., query point-in-time snapshots or rollback of erroneous updates), automatic data layout optimization, upserts, caching and audit logs.
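As a small illustration of the time travel capability mentioned in the note, the snippet below reads an earlier version of a Delta table. The table path and version number are hypothetical placeholders, and a SparkSession with Delta Lake support is assumed.

```python
# Minimal sketch: Delta Lake time travel (hypothetical table path and version).
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Read the table as of an earlier version, e.g., to reproduce a past training run
# or to inspect data before an erroneous update.
snapshot_df = (
    spark.read.format("delta")
    .option("versionAsOf", 3)
    .load("/mnt/lake/features/customer_features")
)
snapshot_df.show(5)
```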
###### Code

ML project code is often stored in a version control repository (such as Git), with most organizations using branches corresponding to the lifecycle phases of development, staging or production. There are a few common patterns. Some use only development branches (dev) and one main branch (staging/prod). Others use main and development branches (dev), branches cut for testing potential releases (staging), and branches cut for final releases (prod). Regardless of which convention you choose, separation is enforced through Git repository branches.

As a best practice, code should only be run in an execution environment that corresponds to it or in one that’s higher. For example, the dev environment can run any code, but the prod environment can only run prod code.
###### Models

While models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to note that model and code lifecycle phases often operate asynchronously**. That is, you may want to push a new model version before you push a code change, and vice versa. Consider the following scenarios:

- To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. Deploying the code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle of being generated, tested and marked as “production” to predict on the most recent transactions. In this case the code lifecycle is slower than the model lifecycle.
- To classify documents using large deep neural networks, training and deploying the model is often a one-time process due to cost. Updates to the serving and monitoring code in the project may be deployed more frequently than a new version of the model. In this case the model lifecycle is slower than the code lifecycle.

Since model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model management to have its own service. [MLflow](https://www.mlflow.org/) and its Model Registry support managing model artifacts directly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update production models without code changes, streamlining the deployment process in many cases. Model artifacts are secured using MLflow access controls or cloud storage permissions.
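To show that loose coupling in practice, the sketch below loads whichever model version currently holds the Production stage in the Model Registry by its registry URI, so inference code does not change when a new version is promoted. The registered model name and input schema are hypothetical placeholders, and a configured MLflow registry is assumed.

```python
# Minimal sketch: load the current Production model by registry URI (hypothetical model name).
import mlflow.pyfunc
import pandas as pd

# "models:/<name>/<stage>" resolves to whichever version currently holds the stage,
# so promoting a new version changes what is served without touching this code.
model = mlflow.pyfunc.load_model("models:/fraud_detection/Production")

sample = pd.DataFrame({"amount": [42.0], "country": ["DE"]})  # illustrative input schema
print(model.predict(sample))
```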
###### Data

Some organizations label data as either dev, staging or prod, depending on which environment it originated in. For example, all prod data is produced in the prod environment, but dev and staging environments may have read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data may be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around reliability and freshness. Access to data in each environment is controlled with table access controls ([AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) | [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html)) or cloud storage permissions.

In summary, when it comes to MLOps, you will always have operational separation between dev, staging and prod. Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod will be the highest quality and tightly controlled.
|ASSET|SEMANTICS|SEPARATED BY|
|---|---|---|
|Execution environments|Labeled according to where development, testing and connections with production systems happen|Cloud provider and Databricks Workspace access controls|
|Models|Labeled according to model lifecycle phase|MLflow access controls or cloud storage permissions|
|Data|Labeled according to its origin in dev, staging or prod execution environments|Table access controls or cloud storage permissions|
|Code|Labeled according to software development lifecycle phase|Git repository branches|

**Table 1**

#### ML deployment patterns

The fact that models and code can be managed separately results in multiple possible patterns for getting ML artifacts through staging and into production. We explain two major patterns below.

**DEPLOY MODELS:** dev → staging → prod

**DEPLOY CODE:** dev → staging → prod

These two patterns differ in terms of whether the model artifact or the training code that produces the model artifact is promoted toward production.

###### Deploy models

In the first pattern, the model artifact is generated by training code in the development environment. This artifact is then tested in staging for compliance and performance before finally being deployed into production. This is a simpler handoff for data scientists, and in cases where model training is prohibitively expensive, training the model once and managing that artifact may be preferable.
However, this simpler architecture comes with limitations. If production data is not accessible from the development environment (e.g., for security reasons), this architecture may not be viable. This architecture does not naturally support automated model retraining. While you could automate retraining in the development environment, you would then be treating “dev” training code as production ready, which many deployment teams would not accept. This option hides the fact that ancillary code for featurization, inference and monitoring needs to be deployed to production, requiring a separate code deployment path.
###### Deploy code

In the second pattern, the code to train models is developed in the dev environment, and this code is moved to staging and then production. Models will be trained in each environment: initially in the dev environment as part of model development, in staging (on a limited subset of data) as part of integration tests, and finally in the production environment (on the full production data) to produce the final model. If an organization restricts data scientists’ access to production data from dev or staging environments, deploying code allows training on production data while respecting access controls. Since training code goes through code review and testing, it is safer to set up automated retraining. Ancillary code follows the same pattern as model training code, and both can go through integration tests in staging. However, the learning curve for handing code off to collaborators can be steep for many data scientists, so opinionated project templates and workflows are helpful. Finally, data scientists need visibility into training results from the production environment, for only they have the knowledge to identify and fix ML-specific issues.

The diagram below contrasts the code lifecycle for the above deployment patterns across the different execution environments.

(Diagram: the stages code development, unit tests, integration tests, model training, continuous deployment and deploy pipelines, mapped to the development, staging and production environments for the deploy models and deploy code patterns.)

**In general we recommend following the “deploy code” approach, and the reference architecture in this document is aligned to it.** Nevertheless, there is no perfect process that covers every scenario, and the options outlined above are not mutually exclusive. Within a single organization, you may find some use cases deploying training code and others deploying model artifacts. Your choice of process will depend on the business use case, resources available and what is most likely to succeed.
| | |DEPLOY MODELS|DEPLOY CODE|
|---|---|---|---|
|Process|Dev|Develop training code. Develop ancillary code.¹ Train model on prod data. Promote model and ancillary code.|Develop training code. Develop ancillary code. Promote code.|
| |Staging|Test model and ancillary code. Promote model and ancillary code.|Train model on data subset. Test ancillary code. Promote code.|
| |Prod|Deploy model. Deploy ancillary pipelines.|Train model on prod data. Test model. Deploy model. Deploy ancillary pipelines.|
|Trade-offs|Automation|Does not support automated retraining in locked-down env.|Supports automated retraining in locked-down env.|
| |Data access control|Dev env needs read access to prod training data.|Only prod env needs read access to prod training data.|
| |Reproducible models|Less eng control over training env, so harder to ensure reproducibility.|Eng control over training env, which helps to simplify reproducibility.|
| |Data science familiarity|DS team builds and can directly test models in their dev env.|DS team must learn to write and hand off modular code to eng.|
| |Support for large projects|This pattern does not force the DS team to use modular code for model training, and it has less iterative testing.|This pattern forces the DS team to use modular code and iterative testing, which helps with coordination and development in larger projects.|
| |Eng setup and maintenance|Has the simplest setup, with less CI/CD infra required.|Requires CI/CD infra for unit and integration tests, even for one-off models.|
|When to use| |Use this pattern when your model is a one-off or when model training is very expensive. Use when dev, staging and prod are not strictly separated envs.|Use this pattern by default. Use when dev, staging and prod are strictly separated envs.|
**Table 2** ¹ “Ancillary code” refers to code for ML pipelines other than the model training pipeline. Ancillary code could be featurization, inference, monitoring or other pipelines.

**CHAPTER 3:**
## MLOps Architecture and Process

#### Architecture components

Before unpacking the reference architecture, take a moment to familiarize yourself with the Databricks features used to facilitate MLOps in the workflow prescribed.

(Diagram: the Lakehouse Platform: Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML, built on Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and a cloud data lake (all structured and unstructured data).)

###### Data Lakehouse

A [Data Lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) unifies the best elements of data lakes and data warehouses — delivering data management and performance typically found in data warehouses with the low-cost, flexible object stores offered by data lakes. Data in the lakehouse are typically organized using a “medallion” architecture of Bronze, Silver and Gold tables of increasing refinement and quality.

###### MLflow

[MLflow](https://www.mlflow.org/) is an open source project for managing the end-to-end machine learning lifecycle. It has the following primary components:

- **Tracking:** Allows you to track experiments to record and compare parameters, metrics and model artifacts. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/tracking.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/tracking.html).
- **Models (“MLflow flavors”):** Allows you to store and deploy models from any ML library to a variety of model serving and inference platforms. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/models.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/models) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/models.html).
- **Model Registry:** Provides a centralized model store for managing models’ full lifecycle stage transitions: from staging to production, with capabilities for versioning and annotating. The registry also provides webhooks for automation and continuous deployment. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-registry.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-registry.html).
Databricks also provides a fully managed and hosted version of MLflow with enterprise security features, high availability, and other Databricks workspace features such as experiment and run management and notebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing machine learning model training runs and running machine learning projects.

###### Databricks and MLflow Autologging

Databricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic experiment tracking for machine learning training sessions on Databricks. Databricks Autologging automatically captures model parameters, metrics, files and lineage information when you train models, with training runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html).
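A minimal sketch of what autologging looks like from the user’s side: a single `mlflow.autolog()` call before training, after which parameters and metrics are captured without explicit logging calls (on Databricks this is typically enabled for you). The model and data are illustrative.

```python
# Minimal sketch: MLflow automatic logging (no explicit log_param/log_metric calls).
import mlflow
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

mlflow.autolog()  # enable automatic logging for supported libraries

X, y = make_classification(n_samples=1_000, n_features=10, random_state=0)
with mlflow.start_run(run_name="autologged-logreg"):
    LogisticRegression(max_iter=1_000).fit(X, y)  # params/metrics captured automatically
```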
###### Feature Store

The Databricks Feature Store is a centralized repository of features. It enables feature sharing and discovery across an organization and also ensures that the same feature computation code is used for model training and inference. See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) | [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html).

###### MLflow Model Serving

MLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints that are updated automatically based on the availability of model versions and their stages. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html).
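As an illustration of calling such a REST endpoint, the sketch below posts a scoring request with the `requests` library. The workspace URL, model name and payload schema are assumptions rather than a definitive recipe (the exact invocation URL and JSON format depend on your serving setup and MLflow version), and the access token is read from an environment variable.

```python
# Illustrative sketch of scoring against a served model's REST endpoint.
# The URL pattern and payload schema are assumptions; check the serving docs for your setup.
import os
import requests

WORKSPACE_URL = "https://<your-workspace>.cloud.databricks.com"  # placeholder
ENDPOINT = f"{WORKSPACE_URL}/model/fraud_detection/Production/invocations"

response = requests.post(
    ENDPOINT,
    headers={"Authorization": f"Bearer {os.environ['DATABRICKS_TOKEN']}"},
    json={"dataframe_records": [{"amount": 42.0, "country": "DE"}]},
)
response.raise_for_status()
print(response.json())
```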
###### Databricks SQL

Databricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their data lake, create multiple visualization types to explore query results from different perspectives, and build and share dashboards. See documentation for [AWS](https://docs.databricks.com/sql/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/sql/) | [GCP](https://docs.gcp.databricks.com/sql/index.html).

###### Databricks Workflows and Jobs

Databricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive ways. For ML, Jobs can be used to define pipelines for computing features, training models, or other ML steps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) | [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html).
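To make the idea of an automated ML pipeline concrete, here is a hedged sketch that creates a two-task job (feature refresh, then model training) through the Jobs API. The notebook paths, cluster ID and workspace URL are hypothetical, and the payload shows only a simplified subset of the Jobs 2.1 request fields.

```python
# Illustrative sketch: define a two-task ML job via the Databricks Jobs API (2.1).
# Paths, cluster ID and workspace URL are placeholders; fields shown are a simplified subset.
import os
import requests

WORKSPACE_URL = "https://<your-workspace>.cloud.databricks.com"  # placeholder

job_spec = {
    "name": "ml-pipeline-example",
    "tasks": [
        {
            "task_key": "feature_table_refresh",
            "notebook_task": {"notebook_path": "/Repos/ml/project/featurization"},
            "existing_cluster_id": "<cluster-id>",
        },
        {
            "task_key": "model_training",
            "depends_on": [{"task_key": "feature_table_refresh"}],
            "notebook_task": {"notebook_path": "/Repos/ml/project/train"},
            "existing_cluster_id": "<cluster-id>",
        },
    ],
}

resp = requests.post(
    f"{WORKSPACE_URL}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {os.environ['DATABRICKS_TOKEN']}"},
    json=job_spec,
)
resp.raise_for_status()
print("Created job:", resp.json()["job_id"])
```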
#### Reference architecture

We are now ready to review a general reference architecture for implementing MLOps on the Databricks Lakehouse platform using the recommended “deploy code” pattern from earlier. This is intended to cover the majority of use cases and ML techniques, but it is by no means comprehensive. When appropriate, we will highlight alternative approaches to implementing different parts of the process.

We begin with an overview of the system end-to-end, followed by more detailed views of the process in development, staging and production environments. These diagrams show the system as it operates in a steady state, with the finer details of iterative development cycles omitted. This structure is summarized below.

**OVERVIEW**

- **dev:** data, exploratory data analysis (EDA), project code, feature table refresh, model training, commit code
- **staging:** merge request, unit tests (CI), integration tests (CI), merge, cut release branch
- **prod:** feature table refresh, model training, continuous deployment (CD), online serving (REST APIs), inference: batch or streaming, monitoring, retraining

###### Overview

**Figure 3** Overview: source control branches (dev, staging/main, release), the development, staging and production environments, their pipelines (feature table refresh, model training, continuous deployment, inference and serving, monitoring), and the Model Registry stages (None, Staging, Production).
Here we see the overall process for deploying code and model artifacts, the inputs and outputs for pipelines, and model lifecycle stages in production. Code source control is the primary conduit for deploying ML pipelines from development to production. Pipelines and models are prototyped on a dev branch in the development environment, and changes to the codebase are committed back to source control. Upon merge request to the staging branch (usually the “main” branch), a continuous integration (CI) process tests the code in the staging environment. If the tests pass, new code can be deployed to production by cutting a code release. In production, a model is trained on the full production data and pushed to the MLflow Model Registry. A continuous deployment (CD) process tests the model and promotes it toward the production stage in the registry. The Model Registry’s production model can be served via batch, streaming or REST API.

###### Dev

In the development environment, data scientists and ML engineers can collaborate on all pipelines in an ML project, committing their changes to source control. While engineers may help to configure this environment, data scientists typically have significant control over the libraries, compute resources and code that they use.

**Figure 4** Development environment: exploratory data analysis; the project code repository (featurization, training, deployment, inference and monitoring modules plus unit and integration tests); the feature table refresh pipeline (data preparation and featurization); the model training pipeline (training and tuning, evaluation); inference (streaming or batch); the MLflow tracking server (metrics, parameters, models); and Lakehouse tables (Bronze/Silver/Gold and feature tables for prod data, feature and temp tables for dev data).
###### Data

Data scientists working in the dev environment possess read-only access to production data. They also require read-write access to a separate dev storage environment to develop and experiment with new features and other data tables.

###### Exploratory data analysis (EDA)

The data scientist explores and analyzes data in an interactive, iterative process. This process is used to assess whether the available data has the potential to address the business problem. EDA is also where the data scientist will begin discerning what data preparation and featurization are required for model training. This ad hoc process is generally not part of a pipeline that will be deployed in other execution environments.

###### Project code

This is a code repository containing all of the pipelines or modules involved in the ML system. Dev branches are used to develop changes to existing pipelines or to create new ones. Even during EDA and initial phases of a project, it is recommended to develop within a repository to help with tracking changes and sharing code.
###### Feature table refresh

This pipeline reads from raw data tables and feature tables and writes to tables in the Feature Store. The pipeline consists of two steps:

- **Data preparation:** This step checks for and corrects any data quality issues prior to featurization.
- **Featurization:** In the dev environment, new features and updated featurization logic can be tested by writing to feature tables in dev storage, and these dev feature tables can be used for model prototyping. Once this featurization code is promoted to production, these changes will affect the production feature tables. Features already available in production feature tables can be read directly for development.

In some organizations, feature engineering pipelines are managed separately from ML projects. In such cases, the featurization pipeline can be omitted from this architecture.

###### Model training

Data scientists develop the model training pipeline in the dev environment with dev or prod feature tables.

- **Training and tuning:** The training process reads features from the feature store and/or Silver- or Gold-level Lakehouse tables, and it logs model parameters, metrics and artifacts to the [MLflow tracking server](https://mlflow.org/docs/latest/tracking.html). After training and hyperparameter tuning, the final model artifact is logged to the tracking server to record a robust link between the model, its input data, and the code used to generate it.
- **Evaluation:** Model quality is evaluated by testing on held-out data. The results of these tests are logged to the MLflow tracking server. If governance requires additional metrics or supplemental documentation about the model, this is the time to add them using MLflow tracking. Model interpretations (e.g., plots produced by [SHAP](https://shap.readthedocs.io/en/latest/index.html) or [LIME](https://arxiv.org/abs/1602.04938)) and plain text descriptions are common, but defining the specifics for such governance requires input from business stakeholders or a data governance officer.
- **Model output:** The output of this pipeline is an ML model artifact stored in the MLflow tracking server. When this training pipeline is run in staging or production, ML engineers (or their CI/CD code) can load the model via the model URI (or path) and then push the model to the Model Registry for management and testing.
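A minimal sketch of that last step, in which a logged model artifact is pushed from a tracking run into the Model Registry by its URI. The model, data and registered model name are illustrative placeholders, and a tracking server with a model registry backend is assumed.

```python
# Minimal sketch: push a logged model artifact from a tracking run into the Model Registry.
import mlflow
import mlflow.sklearn
from mlflow.tracking import MlflowClient
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Train and log a model as the training pipeline would (illustrative model and data).
X, y = make_classification(n_samples=500, n_features=8, random_state=0)
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(LogisticRegression(max_iter=500).fit(X, y), "model")

# Register the logged artifact under a (hypothetical) model name via its run URI.
model_uri = f"runs:/{run.info.run_id}/model"
version = mlflow.register_model(model_uri, "fraud_detection")

# A CD process can later transition the version through stages after testing.
MlflowClient().transition_model_version_stage(
    name="fraud_detection", version=version.version, stage="Staging"
)
```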
###### Commit code

After developing code for featurization, training, inference and other pipelines, the data scientist or ML engineer commits the dev branch changes into source control. This section does not discuss the continuous deployment, inference or monitoring pipelines in detail; see the “Prod” section below for more information on those.

###### Staging

The transition of code from development to production occurs in the staging environment. This code includes model training code and ancillary code for featurization, inference, etc. Both data scientists and ML engineers are responsible for writing tests for code and models, but ML engineers manage the continuous integration pipelines and orchestration.
**Figure 5** (diagram): source control flow from dev branches through a merge request to the staging (“main”) branch and on to a release branch, and the staging-environment CI pipeline it triggers: unit tests, then integration tests covering Feature Store, model training, model deployment, inference and model monitoring, backed by temporary feature tables and staging data in the Lakehouse, the Tracking Server and the Model Registry.

###### Data

The staging environment may have its own storage area for testing feature tables and ML pipelines. This data is generally temporary and only retained long enough to run tests and to investigate test failures. This data can be made readable from the development environment for debugging.

###### Merge code

- **Merge request:** The deployment process begins when a merge (or pull) request is submitted against the staging branch of the project in source control. It is common to use the “main” branch as the staging branch.

- **Unit tests (CI):** This merge request automatically builds source code and triggers unit tests. If tests fail, the merge request is rejected.

###### Integration tests (CI)

The merge request then goes through integration tests, which run all pipelines to confirm that they function correctly together. The staging environment should mimic the production environment as much as is reasonable, running and testing pipelines for featurization, model training, inference and monitoring. Integration tests can trade off fidelity of testing for speed and cost.
For example, when models are expensive to train, it is common to test model training on small data sets or for fewer iterations to reduce cost. When models are deployed behind REST APIs, some high-SLA models may merit full-scale load testing within these integration tests, whereas other models may be tested with small batch jobs or a few queries to temporary REST endpoints.

Once integration tests pass on the staging branch, the code may be promoted toward production.

- **Merge:** If all tests pass, the new code is merged into the staging branch of the project. If tests fail, the CI/CD system should notify users and post results on the merge (pull) request.

Note: It can be useful to schedule periodic integration tests on the staging branch, especially if the branch is updated frequently with concurrent merge requests.

###### Cut release branch

Once CI tests have passed on a commit in the staging branch, ML engineers can cut a release branch from that commit.
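To make the unit-test step above concrete, a CI check on featurization logic might look like the sketch below. The `compute_features` module and the expected column names are hypothetical stand-ins for project code, not part of the reference pipelines.

```python
import pandas as pd

from pipelines.featurization import compute_features  # hypothetical module under test


def test_compute_features_drops_nulls_and_adds_expected_columns():
    raw = pd.DataFrame({"customer_id": [1, 2, None], "amount": [10.0, None, 5.0]})

    features = compute_features(raw)

    # Data-quality expectations enforced by CI before the merge is accepted.
    assert features["customer_id"].notnull().all()
    assert {"customer_id", "amount_7d_avg"}.issubset(features.columns)
```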
###### Prod

The production environment is typically managed by a select set of ML engineers and is where ML pipelines directly serve the business or application. These pipelines compute fresh feature values, train and test new model versions, publish predictions to downstream tables or applications, and monitor the entire process to avoid performance degradation and instability. While we illustrate batch and streaming inference alongside online serving below, most ML applications will use only one of these methods, depending on the business requirements.

**Figure 6** (diagram): the production environment: feature table refresh (data preparation, featurization), model training (training and tuning, evaluation, register and request transition), continuous deployment (compliance checks, compare Staging vs. Production, request model transition to Production), online serving (enable online serving, load model, log requests and predictions), batch or streaming inference (data ingest, model inference, publish predictions) and monitoring (data ingest, check model performance and data drift, publish metrics, trigger model training), with models promoted through Model Registry stages None → Staging → Production and data tables, feature tables and monitoring tables in the Lakehouse.
Though data scientists may not have write or compute access in the production environment, it is important to provide them with visibility to test results, logs, model artifacts and the status of ML pipelines in production. This visibility allows them to identify and diagnose problems in production.

###### Feature table refresh

This pipeline transforms the latest production Lakehouse data into production feature tables. It can use batch or streaming computation, depending on the freshness requirements for downstream training and inference. The pipeline can be defined as a [Databricks Job](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.rxs6npet1ull) which is scheduled, triggered or continuously running.

###### Model training

The model training pipeline runs either when code changes affect upstream featurization or training logic, or when automated retraining is scheduled or triggered. This pipeline runs on the full production data.
- **Training and tuning:** During the training process, logs are recorded to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32). These include model metrics, parameters, tags and the model itself. During development, data scientists may test many algorithms and hyperparameters, but it is common to restrict those choices to the top-performing options in the production training code. Restricting tuning can reduce the variance from tuning in automated retraining, and it can make training and tuning faster.

- **Evaluation:** Model quality is evaluated by testing on held-out production data. The results of these tests are logged to the MLflow tracking server. During development, data scientists will have selected meaningful evaluation metrics for the use case, and those metrics or their custom logic will be used in this step.

- **Register and request transition:** Following model training, the model artifact is registered to the [MLflow Model Registry](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) of the production environment, set initially to ‘stage=None’. The final step of this pipeline is to request a transition of the model to ‘stage=Staging’.
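The registration and transition step can be scripted against the MLflow client, roughly as sketched below. The model name and run ID are placeholders; in this architecture the stage change would typically be a request that the CD pipeline approves (e.g., via Model Registry webhooks) rather than a direct transition.

```python
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()
run_id = "<run id produced by the training pipeline>"  # placeholder

# Register the artifact logged by the training run; the new version starts at stage=None.
mv = mlflow.register_model(f"runs:/{run_id}/model", name="churn_model")

# Request promotion to Staging so the CD pipeline can pick it up.
client.transition_model_version_stage(
    name="churn_model",
    version=mv.version,
    stage="Staging",
)
```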
###### Continuous deployment (CD)

The CD pipeline is executed when the training pipeline finishes and requests to transition the model to ‘stage=Staging’. There are three key tasks in this pipeline:

- **Compliance checks:** These tests load the model from the Model Registry, perform compliance checks (for tags, documentation, etc.), and approve or reject the request based on test results. If compliance checks require human expertise, this automated step can compute statistics or visualizations for people to review in a manual approval step at the end of the CD pipeline. Regardless of the outcome, results for that model version are recorded to the Model Registry through metadata in tags and comments in descriptions.

The MLflow UI can be used to manage stage transition requests manually, but requests and transitions can be automated via MLflow APIs and [webhooks](https://docs.databricks.com/applications/mlflow/model-registry-webhooks.html). If the model passes the compliance checks, then the transition request is approved and the model is promoted to ‘stage=Staging’. If the model fails, the transition request is rejected and the model is moved to ‘stage=Archived’ in the Model Registry.
- **Compare staging vs. production:** To prevent performance degradation, models promoted to ‘stage=Staging’ must be compared to the ‘stage=Production’ models they are meant to replace. The metric(s) for comparison should be defined according to the use case, and the method for comparison can vary from canary deployments to A/B tests. All comparison results are saved to metrics tables in the lakehouse.

If this is the first deployment and there is no ‘stage=Production’ model yet, the ‘stage=Staging’ model should be compared to a business heuristic or other threshold as a baseline. For a new version of an existing ‘stage=Production’ model, the ‘stage=Staging’ model is compared with the current ‘stage=Production’ model.

- **Request model transition to production:** If the candidate model passes the comparison tests, a request is made to transition it to ‘stage=Production’ in the Model Registry. As with other stage transition requests, notifications, approvals and rejections can be managed manually via the MLflow UI or automatically through APIs and webhooks. This is also a good time to consider human oversight, as it is the last step before a model is fully available to downstream applications. A person can manually review the compliance checks and performance comparisons to perform checks which are difficult to automate.
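A highly simplified sketch of this comparison-and-promotion logic follows. The model name, metric name and fallback threshold are illustrative assumptions, and a real CD pipeline would rely on canary traffic or A/B results rather than a single offline metric.

```python
from mlflow.tracking import MlflowClient

client = MlflowClient()
name = "churn_model"  # placeholder model name


def run_metric(version, metric="val_f1"):
    run_id = client.get_model_version(name, version).run_id
    return client.get_run(run_id).data.metrics[metric]


candidate = client.get_latest_versions(name, stages=["Staging"])[0]
current = client.get_latest_versions(name, stages=["Production"])

# If no Production model exists, fall back to a business heuristic baseline.
baseline = run_metric(current[0].version) if current else 0.75

if run_metric(candidate.version) >= baseline:
    client.transition_model_version_stage(name, candidate.version, stage="Production")
else:
    client.transition_model_version_stage(name, candidate.version, stage="Archived")
```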
###### Online serving (REST APIs)

For lower throughput and lower latency use cases, online serving is generally necessary. With MLflow, it is simple to deploy models to [Databricks Model Serving](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.72shqep1kelf), cloud provider serving endpoints, or on-prem or custom serving layers.

In all cases, the serving system loads the production model from the Model Registry upon initialization. On each request, it fetches features from an online Feature Store, scores the data and returns predictions. The serving system, data transport layer or the model itself could log requests and predictions.

###### Inference: batch or streaming

This pipeline is responsible for reading the latest data from the Feature Store, loading the model from ‘stage=Production’ in the Model Registry, performing inference and publishing predictions. For higher throughput, higher latency use cases, batch or streaming inference is generally the most cost-effective option.

A batch job would likely publish predictions to Lakehouse tables, over a JDBC connection, or to flat files. A streaming job would likely publish predictions either to Lakehouse tables or to message queues like Apache Kafka®.
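For the batch case, the job can be only a few lines of PySpark. The sketch below assumes a Databricks notebook or job where `spark` is available; the model and table names are placeholders.

```python
import mlflow.pyfunc
from pyspark.sql import functions as F

# Load the current Production model from the Model Registry as a Spark UDF.
predict_udf = mlflow.pyfunc.spark_udf(spark, "models:/churn_model/Production")

features = spark.table("prod.churn_features")  # placeholder feature table
feature_cols = [c for c in features.columns if c != "customer_id"]

predictions = features.withColumn(
    "prediction", predict_udf(*[F.col(c) for c in feature_cols])
)

# Publish predictions for downstream consumers.
predictions.write.mode("overwrite").saveAsTable("prod.churn_predictions")
```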
###### Monitoring

Input data and model predictions are monitored, both for statistical properties (data drift, model performance, etc.) and for computational performance (errors, throughput, etc.). These metrics are published for dashboards and alerts.

- **Data ingestion:** This pipeline reads in logs from batch, streaming or online inference.

- **Check accuracy and data drift:** The pipeline then computes metrics about the input data, the model’s predictions and the infrastructure performance. Metrics that measure statistical properties are generally chosen by data scientists during development, whereas metrics for infrastructure are generally chosen by ML engineers.

- **Publish metrics:** The pipeline writes to Lakehouse tables for analysis and reporting. Tools such as [Databricks SQL](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.nsthucrt9k77) are used to produce monitoring dashboards, allowing for health checks and diagnostics. The monitoring job or the dashboarding tool issues notifications when health metrics surpass defined thresholds.

- **Trigger model training:** When the model monitoring metrics indicate performance issues, or when a model inevitably becomes out of date, the data scientist may need to return to the development environment and develop a new model version.
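As a minimal sketch of the statistical checks above, the snippet below compares the mean of one input feature between a training-time baseline and recent inference logs, then appends the result to a metrics table. The table names, feature name, date filter and threshold are all illustrative assumptions, and `spark` is assumed available.

```python
from pyspark.sql import functions as F

baseline = spark.table("prod.churn_features_baseline")  # snapshot used at training time
recent = spark.table("prod.inference_logs").filter(F.col("date") >= "2023-07-01")

baseline_mean = baseline.agg(F.mean("amount_7d_avg")).first()[0]
recent_mean = recent.agg(F.mean("amount_7d_avg")).first()[0]
drift = abs(recent_mean - baseline_mean) / abs(baseline_mean)

# Publish the metric to a Lakehouse table for dashboards and alerting.
spark.createDataFrame(
    [("amount_7d_avg_mean_drift", float(drift))], ["metric", "value"]
).write.mode("append").saveAsTable("prod.monitoring_metrics")

if drift > 0.2:  # alerting threshold chosen for illustration only
    print("Feature drift detected; notify on-call or trigger retraining")
```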
**Note:** While automated retraining is supported in this architecture, it isn’t required, and caution must be taken in cases where it is implemented. It is inherently difficult to automate selecting the correct action to take from model monitoring alerts. For example, if data drift is observed, does it indicate that we should automatically retrain, or does it indicate that we should engineer additional features to encode some new signal in the data?

###### Retraining

This architecture supports automatic retraining using the same model training pipeline above. While we recommend beginning with manually triggered retraining, organizations can add scheduled and/or triggered retraining when needed.

- **Scheduled:** If fresh data are regularly made available, rerunning model training on a defined schedule can help models to keep up with changing trends and behavior.

- **Triggered:** If the monitoring pipeline can identify model performance issues and send alerts, it can additionally trigger retraining. For example, if the distribution of incoming data changes significantly or if the model performance degrades, automatic retraining and redeployment can boost model performance with minimal human intervention.

When the featurization or retraining pipelines themselves begin to exhibit performance issues, the data scientist may need to return to the dev environment and resume experimentation to address such issues.

**CHAPTER 4:**
## LLMOps – Large Language Model Operations

#### Large language models

LLMs have splashed into the mainstream of business and news, and there is no doubt that they will disrupt countless industries. In addition to bringing great potential, they present a new set of questions for MLOps:

- Is prompt engineering part of operations, and if so, what is needed?
- Since the “large” in “LLM” is an understatement, how do cost/performance trade-offs change?
- Is it better to use paid APIs or to fine-tune one’s own model?
- …and many more!

The good news is that “LLMOps” (MLOps for LLMs) is not that different from traditional MLOps. However, some parts of your MLOps platform and process may require changes, and your team will need to learn a mental model of how LLMs coexist alongside traditional ML in your operations.

In this section, we will explain what may change for MLOps when introducing LLMs. We will discuss several key topics in detail, from prompt engineering to packaging, to cost/performance trade-offs. We also provide a reference architecture diagram to illustrate what may change in your production environment.

###### What changes with LLMs?

For those not familiar with large language models (LLMs), see [this summary](https://www.databricks.com/product/machine-learning/large-language-models) for a quick introduction. The one-sentence summary is: LLMs are a new class of natural language processing (NLP) models that have significantly surpassed their predecessors in performance across a variety of tasks, such as open-ended question answering, summarization and execution of near-arbitrary instructions.

From the perspective of MLOps, LLMs bring new requirements, with implications for MLOps practices and platforms. We briefly summarize key properties of LLMs and the implications for MLOps here, and we delve into more detail in the next section.
**Table 3**

| KEY PROPERTIES OF LLMS | IMPLICATIONS FOR MLOPS |
|---|---|
| LLMs are available in many forms: very general proprietary models behind paid APIs; open source models that vary from general to specific applications; custom models fine-tuned for specific applications | **Development process:** Projects often develop incrementally, starting from existing, third-party or open source models and ending with custom fine-tuned models. |
| Many LLMs take general natural language queries and instructions as input. Those queries can contain carefully engineered “prompts” to elicit the desired responses. | **Development process:** Designing text templates for querying LLMs is often an important part of developing new LLM pipelines. **Packaging ML artifacts:** Many LLM pipelines will use existing LLMs or LLM serving endpoints; the ML logic developed for those pipelines may focus on prompt templates, agents or “chains” instead of the model itself. The ML artifacts packaged and promoted to production may frequently be these pipelines, rather than models. |
| Many LLMs can be given prompts with examples and context, or additional information to help answer the query. | **Serving infrastructure:** When augmenting LLM queries with context, it is valuable to use previously uncommon tooling such as vector databases to search for relevant context. |
| LLMs are very large deep learning models, often ranging from gigabytes to hundreds of gigabytes. | **Serving infrastructure:** Many LLMs may require GPUs for real-time model serving. **Cost/performance trade-offs:** Since larger models require more computation and are thus more expensive to serve, techniques for reducing model size and computation may be required. |
| LLMs are hard to evaluate via traditional ML metrics since there is often no single “right” answer. | **Human feedback:** Since human feedback is essential for evaluating and testing LLMs, it must be incorporated more directly into the MLOps process, both for testing and monitoring and for future fine-tuning. |

The list above may look long, but as we will see in the next section, many existing tools and processes only require small adjustments in order to adapt to these new requirements.
Moreover, many aspects do not change:

- The separation of development, staging and production remains the same
- Git version control and model registries remain the primary conduits for promoting pipelines and models toward production
- The lakehouse architecture for managing data remains valid and essential for efficiency
- Existing CI/CD infrastructure should not require changes
- The modular structure of MLOps remains the same, with pipelines for data refresh, model tuning, model inference, etc.

#### Discussion of key topics for LLMOps

So far, we have listed top potential changes to MLOps as you introduce LLMs. In this section, we will dive into more details about selected topics.

###### Prompt engineering

Prompt engineering is the practice of adjusting the text prompt given to an LLM, using engineering techniques, in order to elicit better responses. It is a very new practice, but some best practices are emerging. We will cover a few tips and best practices and link to useful resources.

**1** Prompts and prompt engineering are model-specific. A prompt given to two different models will generally _not_ produce the same results. Similarly, prompt engineering tips do not apply to all models. In the extreme case, many LLMs have been fine-tuned for specific NLP tasks and do not even require prompts. On the other hand, very general LLMs benefit greatly from carefully crafted prompts.
**2** When approaching prompt engineering, go from simple to complex: track, templatize and automate.

- Start by tracking queries and responses so that you can compare them and iterate to improve prompts. Existing tools such as MLflow provide tracking capabilities; see [MLflow LLM Tracking](https://mlflow.org/docs/latest/llm-tracking.html) for more details. Checking structured LLM pipeline code into version control also helps with prompt development, for git diffs allow you to review changes to prompts over time. Also see the section below on packaging models and pipelines for more information about tracking prompt versions.
- Then, consider using tools for building prompt templates, especially if your prompts become complex. Newer LLM-specific tools such as [LangChain](https://python.langchain.com/en/latest/index.html) and [LlamaIndex](https://gpt-index.readthedocs.io/en/latest/) provide such templates and more.
- Finally, consider automating prompt engineering by replacing manual engineering with automated tuning. Prompt tuning turns prompt development into a data-driven process akin to hyperparameter tuning for traditional ML. The [Demonstrate-Search-Predict (DSP) Framework](https://github.com/stanfordnlp/dsp) is a good example of a tool for prompt tuning.
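To make the track-and-templatize advice concrete, the sketch below defines a prompt template with LangChain and logs the rendered prompt and response to an MLflow run. The template text, the stubbed `call_llm` helper and the version tag are illustrative assumptions, not prescribed by this guide.

```python
import mlflow
from langchain.prompts import PromptTemplate

template = PromptTemplate(
    input_variables=["context", "question"],
    template=(
        "Answer the question using only the context below. "
        "If the answer is not in the context, say you don't know.\n\n"
        "Context: {context}\n\nQuestion: {question}\nAnswer:"
    ),
)

# Placeholders standing in for retrieval output, user input and an LLM call.
retrieved_context = "Databricks Jobs can be scheduled, triggered or run continuously."
user_question = "How can a feature table refresh pipeline be run?"
call_llm = lambda prompt: "stubbed response"

prompt = template.format(context=retrieved_context, question=user_question)

with mlflow.start_run(run_name="prompt_experiment"):
    mlflow.log_param("prompt_template_version", "v3")
    mlflow.log_text(prompt, "prompt.txt")        # track the exact query sent
    mlflow.log_text(call_llm(prompt), "response.txt")  # track the response for later comparison
```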
###### Resources

There are lots of good resources about prompt engineering, especially for popular models and services:

- DeepLearning.AI course on [ChatGPT Prompt Engineering](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)
- DAIR.AI [Prompt Engineering Guide](https://www.promptingguide.ai/)
- [Best practices for prompt engineering with the OpenAI API](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)

**3** Most prompt engineering tips currently published online are for ChatGPT, due to its immense popularity. Some of these generalize to other models as well. We will provide a few tips here:

- Use clear, specific prompts, which may include an instruction, context (if needed), a user query or input, and a description of the desired output type or format
- Provide examples in your prompt (“few-shot learning”) to help the LLM to understand what you want
- Tell the model how to behave, such as telling it to admit if it cannot answer a question
- Tell the model to think step-by-step or explain its reasoning
- If your prompt includes user input, use techniques to prevent prompt hacking, such as making it very clear which parts of the prompt correspond to your instruction vs. user input
###### Packaging models or pipelines for deployment

In traditional ML, there are generally two types of ML logic to package for deployment: models and pipelines. These artifacts are generally managed toward production via a Model Registry and Git version control, respectively.

With LLMs, it is common to package ML logic in new forms. These may include:

- A lightweight call to an LLM API service (third party or internal)
- A “chain” from LangChain or an analogous pipeline from another tool. The chain may call an LLM API or a local LLM model.
- An LLM or an LLM+tokenizer pipeline, such as a [Hugging Face](https://huggingface.co/) pipeline. This pipeline may use a pretrained model or a custom fine-tuned model.
- An engineered prompt, possibly stored as a template in a tool such as LangChain

Though LLMs add new terminology and tools for composing ML logic, all of the above still constitute models and pipelines. Thus, the same tooling such as [MLflow](https://mlflow.org/) can be used to package LLMs and LLM pipelines for deployment.
[Built-in model flavors](https://mlflow.org/docs/latest/models.html) include:

- PyTorch and TensorFlow
- Hugging Face Transformers (relatedly, see Hugging Face Transformers’s [MLflowCallback](https://huggingface.co/docs/transformers/en/main_classes/callback#transformers.integrations.MLflowCallback))
- LangChain
- OpenAI API
- (See the [documentation](https://mlflow.org/docs/latest/models.html) for a complete list)

For other LLM pipelines, MLflow can package the pipelines via the [MLflow pyfunc flavor](https://mlflow.org/docs/latest/models.html#python-function-python-function), which can store arbitrary Python code.

**Note about prompt versioning:** Just as it is helpful to track model versions, it is helpful to track prompt versions (and LLM pipeline versions, more generally). Packaging prompts and pipelines as MLflow Models simplifies versioning. Just as a newly retrained model can be tracked as a new model version in the MLflow Model Registry, a newly updated prompt can be tracked as a new model version.
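For pipelines that do not match a built-in flavor, a pyfunc wrapper can be as simple as the sketch below, which packages a prompt template plus an LLM call as a single MLflow model. The stubbed endpoint call inside `predict` is a placeholder for whatever hosted API or local model the pipeline actually uses.

```python
import mlflow
import mlflow.pyfunc


def call_llm_endpoint(prompt: str) -> str:
    return "stubbed response"  # placeholder for a real LLM API or local model call


class PromptedLLM(mlflow.pyfunc.PythonModel):
    """Packages a prompt template and an LLM call as one deployable artifact."""

    def __init__(self, template: str):
        self.template = template

    def predict(self, context, model_input):
        prompts = [self.template.format(question=q) for q in model_input["question"]]
        return [call_llm_endpoint(p) for p in prompts]


with mlflow.start_run():
    mlflow.pyfunc.log_model(
        artifact_path="llm_pipeline",
        python_model=PromptedLLM("Answer concisely: {question}"),
    )
```

Because the prompt template travels inside the logged model, a new prompt simply becomes a new model version, which is the versioning pattern described in the note above.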
**Note about deploying models vs. code:** Your decisions around packaging ML logic as version controlled code vs. registered models will help to inform your decision about choosing between the deploy models, deploy code and hybrid architectures. Review the subsection below about human feedback, and make sure that you have a well-defined testing process for whatever artifacts you choose to deploy.

###### Managing cost/performance trade-offs

One of the big Ops topics for LLMs is managing cost/performance trade-offs, especially for inference and serving. With “small” LLMs having hundreds of millions of parameters and large LLMs having hundreds of billions of parameters, computation can become a major expense. Thankfully, there are many ways to manage and reduce costs when needed. We will review some key tips for balancing productivity and costs.

**1** Start simple, but plan for scaling. When developing a new LLM-powered application, speed of development is key, so it is acceptable to use more expensive options, such as paid APIs for existing models. As you go, make sure to collect data such as queries and responses. In the future, you can use that data to fine-tune a smaller, cheaper model which you can own.

**2** Scope out your costs. How many queries per second do you expect? Will requests come in bursts? How much does each query cost? These estimates will inform you about project feasibility and will help you to decide when to consider bringing the model in-house with open source models and fine-tuning.

**3** Reduce costs by tweaking LLMs and queries. There are many LLM-specific techniques for reducing computation and costs. These include shortening queries, tweaking inference configurations and using smaller versions of models.

**4** Get human feedback. It is easy to reduce costs but hard to say how changes impact your results, unless you get human feedback from end users.
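Scoping costs can start as back-of-the-envelope arithmetic like the sketch below. The traffic volume, token counts and per-token price are made-up numbers to illustrate the calculation, not vendor pricing.

```python
# Rough monthly cost estimate for a paid LLM API (all numbers are illustrative).
queries_per_second = 5
avg_prompt_tokens = 600
avg_response_tokens = 200
price_per_1k_tokens = 0.002  # assumed blended $/1K tokens

tokens_per_query = avg_prompt_tokens + avg_response_tokens
queries_per_month = queries_per_second * 60 * 60 * 24 * 30
monthly_cost = queries_per_month * tokens_per_query * price_per_1k_tokens / 1000

print(f"~{queries_per_month:,} queries/month, est. ${monthly_cost:,.0f}/month")
```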
###### Resources

**Fine-tuning**

- [Fine-Tuning Large Language Models with Hugging Face and DeepSpeed](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)
- Webinar: [Build Your Own Large Language Model Like Dolly: How to fine-tune and deploy your custom LLM](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)
**Model distillation, quantization and pruning**

- [Gentle Introduction to 8-bit Matrix Multiplication for transformers at scale using Hugging Face Transformers, Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)
- [Large Transformer Model Inference Optimization](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)
- [Making LLMs even more accessible with bitsandbytes, 4-bit quantization and QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)

###### Methods for reducing costs of inference

**Use a smaller model**

- Pick a different existing model. Try smaller versions of models (such as “t5-small” instead of “t5-base”) or alternate architectures.
- Fine-tune a custom model. With the right training data, a fine-tuned model can often be smaller and/or perform better than a generic model.
- Use model distillation (or knowledge distillation). This technique “distills” the knowledge of the original model into a smaller model.
- Reduce floating point precision (quantization). Models can sometimes use lower precision arithmetic without losing much in quality.

**Reduce computation for a given model**

- Shorten queries and responses. Computation scales with input and output sizes, so using more concise queries and responses reduces costs.
- Tweak inference configurations. Some types of inference, such as beam search, require more computation.
**Other**

- Split traffic. If your return on investment (ROI) for an LLM query is low, then consider splitting traffic so that low ROI queries are handled by simpler, faster models or methods. Save LLM queries for high ROI traffic.
- Use pruning techniques. If you are training your own LLMs, there are pruning techniques that allow models to use sparse computation during inference. This reduces computation for most or all queries.
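One of the quantization options above can be tried with little more than a model-loading flag, roughly as sketched below using Hugging Face Transformers with bitsandbytes. The model name is only an example, and exact arguments depend on your library versions and GPU setup.

```python
from transformers import AutoModelForCausalLM, AutoTokenizer

model_name = "databricks/dolly-v2-3b"  # example model; substitute your own

tokenizer = AutoTokenizer.from_pretrained(model_name)
# load_in_8bit quantizes weights via bitsandbytes, substantially reducing GPU memory.
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,
)

inputs = tokenizer("Summarize MLOps in one sentence.", return_tensors="pt").to(model.device)
print(tokenizer.decode(model.generate(**inputs, max_new_tokens=60)[0]))
```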
###### Human feedback, testing, and monitoring

While human feedback is important in many traditional ML applications, it becomes much more important for LLMs. Since most LLMs output natural language, it is very difficult to evaluate the outputs via traditional metrics. For example, suppose an LLM were used to summarize a news article. Two equally good summaries might have almost completely different words and word orders, so even defining a “ground-truth” label becomes difficult or impossible.

Humans, ideally your end users, become essential for validating LLM output. While you can pay human labelers to compare or rate model outputs, the best practice for user-facing applications is to build human feedback into the applications from the outset. For example, a tech support chatbot may have a “click here to chat with a human” option, which provides implicit feedback indicating whether the chatbot’s responses were helpful.

In terms of operations, not much changes from traditional MLOps:

- **Data:** Human feedback is simply data, and it should be treated like any other data. Store it in your lakehouse, and process it using the same data pipeline tooling as other data.
- **Testing and monitoring:** A/B testing and incremental rollouts of new models and pipelines may become more important, superseding offline quality tests. If you can collect user feedback, then these rollout methods can validate models before they are fully deployed.
- **Fine-tuning:** Human feedback becomes especially important for LLMs when it can be incorporated into fine-tuning models via techniques like Reinforcement Learning from Human Feedback (RLHF). Even if you start with an existing or generic model, you can eventually customize it for your purposes via fine-tuning.
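Treating feedback as ordinary data can be as simple as appending structured records to a Lakehouse table, as in the sketch below. The table name and schema are assumptions for illustration, and `spark` is assumed available in the serving or logging environment.

```python
import datetime

feedback = [{
    "request_id": "9f2c1a",                    # id logged by the serving layer
    "response_helpful": False,                 # e.g., user clicked "chat with a human"
    "comment": "Answer did not address my billing question",
    "ts": datetime.datetime.utcnow().isoformat(),
}]

(spark.createDataFrame(feedback)
      .write.mode("append")
      .saveAsTable("prod.llm_human_feedback"))  # queried later for evaluation and fine-tuning
```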
###### Resources

**Reinforcement Learning from Human Feedback (RLHF)**

- Chip Huyen blog post on [“RLHF: Reinforcement Learning from Human Feedback”](https://huyenchip.com/2023/05/02/rlhf.html)
- Hugging Face blog post on [“Illustrating Reinforcement Learning from Human Feedback (RLHF)”](https://huggingface.co/blog/rlhf)
- [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback)
###### Other topics

- **Scaling out:** Practices around scaling out training, fine-tuning and inference are similar to traditional ML, but some of your tools may change. Tools like [Apache Spark™](https://spark.apache.org/) and [Delta Lake](https://delta.io/) remain general enough for your LLM data pipelines and for batch and streaming inference, and they may be helpful for distributing fine-tuning. To handle LLM fine-tuning and training, you may need to adopt some new tools such as [distributed PyTorch](https://pytorch.org/tutorials/beginner/dist_overview.html), [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training), and [DeepSpeed](https://www.deepspeed.ai/).

- **[Model serving:](https://www.databricks.com/product/model-serving)** If you manage the serving system for your LLMs, then you may need to make adjustments to handle larger models. While serving with CPUs can work for smaller deep learning models, most LLMs will benefit from or require GPUs for serving and inference.

- **Vector databases:** Some but not all LLM applications require vector databases for efficient similarity-based lookups of documents or other data. Vector databases may be an important addition to your serving infrastructure. Operationally, it is analogous to a feature store: it is a specialized tool for storing preprocessed data which can be queried by inference jobs or model serving systems.
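The feature-store analogy shows up even in a minimal similarity lookup like the one below, here using FAISS as a stand-in for whatever vector database your serving stack provides. The embedding dimensions and random vectors are placeholders for real document embeddings.

```python
import faiss
import numpy as np

# Precomputed document embeddings (e.g., from a sentence-transformer); shapes are illustrative.
doc_embeddings = np.random.rand(1000, 384).astype("float32")
index = faiss.IndexFlatL2(384)
index.add(doc_embeddings)  # analogous to refreshing a feature table

query_embedding = np.random.rand(1, 384).astype("float32")
distances, doc_ids = index.search(query_embedding, k=5)  # analogous to a feature lookup at inference time
print(doc_ids[0])  # ids of the most similar documents to add as LLM context
```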
#### Reference architecture

To illustrate potential adjustments to your reference architecture from traditional MLOps, we provide a modified version of the previous production architecture.

**Figure 7** (diagram): the modified production environment: an internal/external model hub feeding a Fine-Tune LLM job, a Vector Database Update job in place of the feature table refresh, Model Serving that can issue LLM API requests, continuous deployment (CD), and Monitoring & Evaluation incorporating human feedback, with models pushed to the Model Registry (Stage: None → Staging → Production) and backed by data tables, a vector database and metrics tables in the Lakehouse.
8aae99498ab67fee9a85931282af919e[Figure 7, continued: production architecture diagram labels: push model to registry, promote to production, Model Serving, LLM API request, Fine-Tune LLM, Vector Database Update, Continuous Deployment (CD), Monitoring & Evaluation, internal/external model hub, data tables, vector database, metrics tables, human feedback]\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Additional resources\n", + "\n", + "With LLMs being such a novel field, we link to\n", + "several LLM resources below, which are not\n", + "\n", + "necessarily “LLMOps” but may prove useful\n", + "to you.\n", + "\n", + "\u0007 [edX: Professional Certificate in Large](https://www.edx.org/professional-certificate/databricks-large-language-models)\n", + "\n", + "[Language Models](https://www.edx.org/professional-certificate/databricks-large-language-models)\n", + "\n", + "\u0007Chip Huyen blog post on [“Building LLM](https://huyenchip.com/2023/04/11/llm-engineering.html)\n", + "\n", + "[applications for production”](https://huyenchip.com/2023/04/11/llm-engineering.html)\n", + "\n", + "LLM lists and leaderboards\n", + "\n", + "\u0007 [LMSYS Leaderboard](https://chat.lmsys.org/?leaderboard)\n", + "\n", + "\u0007 [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)\n", + "\n", + "\u0007 [Stanford Center for Research on](https://crfm.stanford.edu/)\n", + "\n", + "[Foundation Models](https://crfm.stanford.edu/)\n", + "\n", + "\u0007 [Ecosystem graphs](https://crfm.stanford.edu/ecosystem-graphs/index.html)\n", + "\u0007 [\u0007HELM](https://crfm.stanford.edu/helm/latest/?)\n", + "\n", + "\u0007Blog post on [“Open Source ChatGPT](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n", + "\n", + "\n", + "The primary changes to this production architecture are:\n", + "\n", + "\u0007 **Internal/External Model Hub:** Since LLM applications often make use of existing, pretrained models,\n", + "\n", + "an internal or external model hub becomes a valuable part of the infrastructure. It appears here in\n", + "\n", + "production to illustrate using an existing base model that is then fine-tuned in production. Without fine-\n", + "\n", + "tuning, this hub would mainly be used in development.\n", + "\n", + "\u0007 **Fine-Tune LLM:** Instead of de novo Model Training, LLM applications will generally fine-tune an existing\n", + "\n", + "model (or use an existing model without any tuning). Fine-tuning is a lighter-weight process than training,\n", + "\n", + "but it is similar operationally.\n", + "\n", + "\u0007 **Vector Database:** Some (but not all) LLM applications use vector databases for fast similarity searches,\n", + "\n", + "most often to provide context or domain knowledge in LLM queries. We replaced the Feature Store (and\n", + "\n", + "its Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n", + "\n", + "that these data stores and jobs are analogous in terms of operations.\n", + "\n", + "\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n", + "\n", + "API calls, such as to internal or third-party LLM APIs. 
Operationally, this adds complexity in terms of\n", + "\n", + "potential latency or flakiness from third-party APIs, as well as another layer of credential management.\n", + "\n", + "\u0007 **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML\n", + "\n", + "but become essential in most LLM applications. Human feedback should be managed like other data,\n", + "\n", + "ideally incorporated into monitoring based on near real-time streaming.\n", + "\n", + "\n", + "[Alternatives”](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Looking aheadSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf2024-09-19T16:57:22Z
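To make the model serving point concrete, here is a hedged sketch of a serving-side call to a third-party LLM API that addresses the two operational concerns named above: latency/flakiness (timeout plus retries with backoff) and credential management (token read from the environment). The endpoint URL, environment variable names and response shape are illustrative assumptions.

```python
# Hedged sketch of calling an external/third-party LLM API from a serving pipeline.
# Assumptions: an OpenAI-style JSON payload/response and env-var based credentials.
import os
import time
import requests

LLM_API_URL = os.environ.get("LLM_API_URL", "https://example.com/v1/chat/completions")  # assumed endpoint
LLM_API_TOKEN = os.environ.get("LLM_API_TOKEN", "")  # injected by a secret manager, never hard-coded

def call_llm(prompt: str, max_retries: int = 3, timeout_s: float = 30.0) -> str:
    for attempt in range(max_retries):
        try:
            resp = requests.post(
                LLM_API_URL,
                headers={"Authorization": f"Bearer {LLM_API_TOKEN}"},
                json={"messages": [{"role": "user", "content": prompt}]},
                timeout=timeout_s,  # bound the latency added by the external dependency
            )
            resp.raise_for_status()
            return resp.json()["choices"][0]["message"]["content"]  # assumed response shape
        except requests.RequestException:
            if attempt == max_retries - 1:
                raise  # surface the failure to the caller after the last retry
            time.sleep(2 ** attempt)  # exponential backoff to ride out transient flakiness
```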
70c4fc510f690efc8e9e44abb2e20db6its Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n", + "\n", + "that these data stores and jobs are analogous in terms of operations.\n", + "\n", + "\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n", + "\n", + "API calls, such as to internal or third-party LLM APIs. Operationally, this adds complexity in terms of\n", + "\n", + "potential latency or flakiness from third-party APIs, as well as another layer of credential management.\n", + "\n", + "\u0007 **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML\n", + "\n", + "but become essential in most LLM applications. Human feedback should be managed like other data,\n", + "\n", + "ideally incorporated into monitoring based on near real-time streaming.\n", + "\n", + "\n", + "[Alternatives”](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Looking ahead\n", + "\n", + "LLMs only became mainstream in late 2022, and countless libraries and technologies are being built to\n", + "\n", + "support and leverage LLM use cases. You should expect rapid changes. More powerful LLMs will be open-\n", + "\n", + "sourced; tools and techniques for customizing LLMs and LLM pipelines will become more plentiful and\n", + "\n", + "flexible; and an explosion of techniques and ideas will gradually coalesce into more standardized practices.\n", + "\n", + "While this technological leap provides us all with great opportunities, the use of cutting-edge technologies\n", + "\n", + "requires extra care in LLMOps to build and maintain stable, reliable LLM-powered applications. The good\n", + "\n", + "news is that much of your existing MLOps tooling, practices and knowledge will transfer smoothly over to\n", + "\n", + "LLMs. With the additional tips and practices mentioned in this section, you should be well set up to harness\n", + "\n", + "the power of large language models.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "9,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast and over 50% of the Fortune 500 — rely\n", + "\n", + "on the Databricks Lakehouse Platform to unify their\n", + "\n", + "data, analytics and AI. Databricks is headquartered\n", + "\n", + "in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark ™ ,\n", + "\n", + "Delta Lake and MLflow, Databricks is on a mission\n", + "\n", + "to help data teams solve the world’s toughest\n", + "\n", + "problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf2024-09-19T16:57:22Z
83fa8c714cfff14256cf56543c98c4cb**eBook**\n", + "\n", + "# Accelerate Digital Transformation in Insurance With Data, Analytics and AI\n", + "\n", + "### Real-world use cases with Databricks Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction **03**\n", + "\n", + "Three Trends Driving Transformation in Insurance **05**\n", + "\n", + "The Need for Modern Data Infrastructure **06**\n", + "\n", + "Common Challenges Insurers Face Using Legacy Technology **08**\n", + "\n", + "Why Lakehouse for Insurance **10**\n", + "\n", + "Key Use Cases for Insurance:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
5014f5f2c09c55edb470c8b5528eb000Why Lakehouse for Insurance **10**\n", + "\n", + "Key Use Cases for Insurance:\n", + "\n", + "**CLAIMS AUTOMATION AND TRANSFORMATION** **14**\n", + "\n", + "**DYNAMIC PRICING AND UNDERWRITING** **15**\n", + "\n", + "**ANOMALY DETECTION AND FRAUDULENT CLAIMS** **16**\n", + "\n", + "**CUSTOMER 360 AND HYPER-PERSONALIZATION** **17**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
fba83e9ab8b12d3c768f58c396c23616**CUSTOMER 360 AND HYPER-PERSONALIZATION** **17**\n", + "\n", + "Global Regulatory Impact in Insurance **18**\n", + "\n", + "**INDUSTRY SOLUTIONS:** Get Started With Accelerators, Brickbuilders and Enablers **19**\n", + "\n", + "Get Started With Industry Solutions **20**\n", + "\n", + "Conclusion **26**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "With the rapid advancement of technology, rising consumer expectations, and strong competition between insuretechs and incumbents resulting\n", + "from the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. Today, new\n", + "insights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\n", + "data from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\n", + "clickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
a89b2827b1da6463d7f2aa3ebcea8079-----\n", + "\n", + "## Introduction\n", + "\n", + "With the rapid advancement of technology, rising consumer expectations, and strong competition between insuretechs and incumbents resulting\n", + "from the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. Today, new\n", + "insights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\n", + "data from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\n", + "clickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.\n", + "\n", + "Consumers want stronger reassurance for what they value most: financial security and greater peace of mind.\n", + "Insurers have always prided themselves on delivering such protection and security. However, customer needs\n", + "have changed, and insurers that move most swiftly to satisfy them will be in the best position to navigate\n", + "challenging times. The bottom line is that insurers must adapt to these changes and meet the evolving needs of\n", + "their customers to remain competitive.\n", + "\n", + "Data-driven insurers will seek opportunities to improve the customer experience, develop more sophisticated\n", + "pricing models, and increase their operational resilience. More than ever, the total cost of ownership (TCO) of\n", + "digital investments and enterprise data strategy has become a top priority for boards and senior executives\n", + "in the insurance industry. So, what does this mean from a data and analytics perspective? It all comes down\n", + "to having one reliable source of truth for data, which is derived from batch and streaming data, structured and\n", + "unstructured data, from multiple clouds and jurisdictions.\n", + "\n", + "\n", + "In a regulated and risk-averse industry where data sharing was once seen as optional, it has now become\n", + "fundamental. To compete in the digital economy, insurers need an open and secure approach to data sharing.\n", + "Databricks Lakehouse for Insurance plays a critical role in helping insurance providers accelerate innovation and\n", + "transform their businesses, resulting in significant operational efficiencies and improved customer experiences\n", + "at a fraction of the cost of data warehouses. This eBook provides an in-depth exploration of key challenges\n", + "and common use cases in the insurance industry. Most importantly, you will gain insight into how Databricks\n", + "Lakehouse can unlock the true value of your data through practical Solution Accelerators and a wide range of\n", + "partners available to assist you on your journey.\n", + "\n", + "\n", + "**The future of insurance will**\n", + "\n", + "**become increasingly data-driven,**\n", + "\n", + "**and analytics enabled.”**\n", + "\n", + "**[EY’s](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)** [“Five principles for the future of protection”](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)\n", + "\n", + "\n", + "-----\n", + "\n", + "The Lakehouse reference architecture below illustrates a sample framework upon\n", + "which insurers can build. 
Moving from left to right in the diagram, the first layer\n", + "represents various data sources such as on-premises systems, web and mobile\n", + "applications, IoT sensors, enterprise data warehouses, and third-party APIs. Data\n", + "is then ingested through automated data pipelines, and processed within the\n", + "Lakehouse platform across three layers (Bronze, Silver and Gold). These layers\n", + "are responsible for data preparation, including ML model registry, centralized\n", + "\n", + "\n", + "governance, workflow orchestration, and job scheduling. They ensure a compliant\n", + "and secure infrastructure that sits atop the cloud layer (or multiple clouds),\n", + "eliminating the need for data duplication. Finally, the transformed data is delivered\n", + "as actionable insights and supports use cases such as automated reporting,\n", + "business analytics, customer 360, and claims analytics. These use cases not only\n", + "mitigate risk but also drive revenue.\n", + "\n", + "\n", + "[Diagram labels: Data Sources (On-Premises Servers, Web and Mobile Applications); Ingestion; Lakehouse for Financial Services (Bronze Layer: Raw Entity Data; Silver Layer: Curated Feature Sets; Gold Layer: Aggregated Business Views); Serving (Automated Reporting, Business Analytics and Interactive Dashboards)]SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
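A minimal PySpark sketch of the Bronze/Silver/Gold flow described above, under assumed table names, paths and columns; a production pipeline would more likely use Auto Loader or Delta Live Tables for ingestion and orchestration.

```python
# Illustrative medallion flow: land raw data (Bronze), cleanse and conform (Silver),
# then publish business-level aggregates (Gold). Paths, schema and rules are assumptions.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Bronze: land raw claims events as-is.
bronze = spark.read.json("/Volumes/raw/claims/")  # assumed landing path
bronze.write.format("delta").mode("append").saveAsTable("insurance.bronze_claims")

# Silver: cleanse and conform (drop malformed rows, standardize types).
silver = (
    spark.table("insurance.bronze_claims")
    .filter(F.col("claim_id").isNotNull())
    .withColumn("claim_amount", F.col("claim_amount").cast("double"))
)
silver.write.format("delta").mode("overwrite").saveAsTable("insurance.silver_claims")

# Gold: business-level aggregate powering reporting and claims analytics.
gold = silver.groupBy("policy_type").agg(F.sum("claim_amount").alias("total_claims"))
gold.write.format("delta").mode("overwrite").saveAsTable("insurance.gold_claims_by_policy_type")
```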
76dd835dcbfd9e1aaaff490ac84f4e71[Lakehouse reference architecture diagram labels: Data Sources (On-Premises Servers, Web and Mobile Applications, Collaborative Data Source, Internet-of-Things (IoT) Devices, Enterprise Data Warehouses, Third-Party APIs and Services); Ingestion via Automated Data Pipelines (Batch or Streaming); Lakehouse for Financial Services (Bronze, Silver and Gold Layers: Raw Entity Data, Curated Feature Sets, Aggregated Business Views; ML Model Registry, Centralized Data Governance, Workflow Orchestration, Productionized Referenced Data and Models, Job Scheduling); Serving (Automated Reporting, Business Analytics and Interactive Dashboards)]\n", + "\n", + "\n", + "-----\n", + "\n", + "## Three Trends Driving Transformation in Insurance\n", + "\n", + "Over the next decade, technology-enabled insurance companies will bear little resemblance to today’s organizations.\n", + "The following three trends are driving this transformation in the insurance industry:\n", + "\n", + "\n", + "**The rapid emergence of large language**\n", + "**models and generative AI**\n", + "\n", + "In recent years, there has been a significant\n", + "breakthrough in the field of artificial intelligence with\n", + "the emergence of large language models (LLMs)\n", + "and generative AI. These models, such as GPT-4 and\n", + "its predecessors, Databricks Dolly and others are\n", + "built using deep learning techniques and massive\n", + "amounts of training data, enabling them to generate\n", + "human-like text and perform a wide range of natural\n", + "language processing tasks. LLMs and generative AI\n", + "can help insurance companies automate repetitive\n", + "tasks such as underwriting, claims processing,\n", + "\n", + "and customer service, improving efficiency and\n", + "reducing costs. They can also help insurers to better\n", + "understand customer needs and preferences,\n", + "leading to more personalized products and services.\n", + "However, as with any disruptive technology, the\n", + "adoption of LLMs and generative AI will require\n", + "careful consideration of ethical and regulatory\n", + "issues, such as data privacy and algorithmic bias.\n", + "\n", + "\n", + "**Transformed ecosystems**\n", + "**and open insurance**\n", + "\n", + "[According to EY](https://assets.ey.com/content/dam/ey-sites/ey-com/en_gl/topics/insurance/ey-2022-global-insurance-outlook-report.pdf) , leading companies leverage\n", + "insurtechs in their ecosystems to achieve high\n", + "margins in commoditized products. Open insurance,\n", + "which involves sharing and managing insurance-related data through APIs, is more than an item in\n", + "the regulatory agenda. 
It can give consumers access\n", + "to better products and accurate pricing, as well as\n", + "enable them to execute transactions more easily.\n", + "In its [annual Chief Data Officer Survey](https://www.gartner.com/smarterwithgartner/data-sharing-is-a-business-necessity-to-accelerate-digital-business) , Gartner\n", + "found that organizations that promote external data\n", + "sharing have three times the measurable economic\n", + "\n", + "benefit across a variety of performance metrics\n", + "compared to their peers.\n", + "\n", + "\n", + "**Revised target operating model**\n", + "**with a focus on talent**\n", + "\n", + "Demographic shifts and perennial cost pressures\n", + "make it critical for insurers to attract and retain\n", + "talent. Consequently, it’s important for insurers\n", + "to equip their workforces with the right tools\n", + "and technologies to help them identify business\n", + "processes that can be optimized to differentiate\n", + "themselves from their competitors, with an emphasis\n", + "on moments that matter in the customer journey,\n", + "according to EY. Recent research from Deloitte\n", + "highlights the advantages of upskilling and building\n", + "a future-ready workforce. One of the benefitsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
2db840ab2d9248420f54aab2c527ca4bbenefit across a variety of performance metrics\n", + "compared to their peers.\n", + "\n", + "\n", + "**Revised target operating model**\n", + "**with a focus on talent**\n", + "\n", + "Demographic shifts and perennial cost pressures\n", + "make it critical for insurers to attract and retain\n", + "talent. Consequently, it’s important for insurers\n", + "to equip their workforces with the right tools\n", + "and technologies to help them identify business\n", + "processes that can be optimized to differentiate\n", + "themselves from their competitors, with an emphasis\n", + "on moments that matter in the customer journey,\n", + "according to EY. Recent research from Deloitte\n", + "highlights the advantages of upskilling and building\n", + "a future-ready workforce. One of the benefits\n", + "\n", + "of AI adoption in the workforce is that it enables\n", + "organizations to automate a wide range of business\n", + "processes, boosting speed and efficiency. But what’s\n", + "even more important is that it enables employees to\n", + "focus on higher-value work, according to Deloitte.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Need for Modern Data Infrastructure\n", + "\n", + "**Insurers turning to cloud and data analytics**\n", + "\n", + "\n", + "The insurance industry has undergone significant changes over the years, and\n", + "one of the areas that has evolved the most is data management. With the\n", + "growing need for advanced analytics and digital transformation, many insurance\n", + "companies are turning to cloud technology and modern data infrastructures\n", + "to enhance their data management strategies. The benefits of adopting cloud\n", + "technology are numerous, particularly the ability to efficiently store and quickly\n", + "access vast amounts of data, which is crucial in a heavily regulated and data-driven industry like insurance. Additionally, the flexibility of the cloud enables\n", + "insurers to scale costs, adapt to changing work environments, and meet evolving\n", + "customer and business requirements.\n", + "\n", + "Furthermore, insurance providers can leverage the cloud to analyze customer\n", + "data at scale, gaining insights into behaviors that drive hyper-personalization,\n", + "dynamic pricing and underwriting, and form the foundation for claims automation.\n", + "By implementing advanced analytics, insurers can innovate more easily, scale their\n", + "businesses, and bring new products to market more quickly.\n", + "\n", + "To remain competitive, insurance companies must increase their investment in\n", + "cloud technology and data analytics, as this will accelerate insightful decision-making across various functions such as claims management, underwriting,\n", + "policy administration, and customer satisfaction. Overall, the adoption of cloud\n", + "technology and data analytics is imperative for insurance providers to enhance\n", + "operational efficiency, improve business processes, and stay relevant in today’s\n", + "fast-paced business landscape.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Let’s take a closer look at a few examples:**\n", + "\n", + "\n", + "**Auto insurers** need to integrate new data sources, such as weather and traffic,\n", + "to build solutions capable of real-time processing. This enables them to alert\n", + "emergency services promptly and gain a better understanding of drivers’ driving\n", + "patterns. 
It also enables the development of sophisticated machine learningbased risk assessment, underwriting and claims models.\n", + "\n", + "**Commercial insurance** , including property, general liability, cyber insurance and\n", + "business income insurance, utilizes ML-based automation of actuarial models.\n", + "This automation facilitates underwriting, claims forecasting and dynamic pricing\n", + "for their customers. Another notable trend in recent years is the use of IoT-\n", + "\n", + "\n", + "based alerting for sensitive or valuable commodities. For example, in the case of\n", + "vaccines, IoT sensors can monitor the temperature in real time and send alerts to\n", + "the appropriate team or person if the temperature exceeds acceptable thresholds.\n", + "This is crucial as vaccines must be stored within specific temperature ranges.\n", + "\n", + "In **life insurance** , complex ML models can be employed to create a profile of\n", + "the customer’s lifestyle and, importantly, detect any changes to it. This deeper\n", + "understanding and 360-degree view of the customer enable more customized\n", + "underwriting and pricing based on the policyholder’s current health, lifestyle and\n", + "eating habits.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
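The real-time IoT alerting pattern mentioned above (for example, cold-chain monitoring for vaccines) can be sketched with Spark Structured Streaming: read sensor readings as a stream, flag out-of-range values, and persist alerts for downstream notification. The table names, schema and the 2 to 8 °C range are assumptions for illustration.

```python
# Illustrative streaming alert sketch: flag temperature readings outside the allowed
# range and append them to an alerts table that a notification job can consume.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Assumed Delta table of incoming sensor readings with columns device_id, temp_c, event_time.
readings = spark.readStream.table("insurance.iot_temperature_readings")

# Vaccine cold-chain range assumed to be 2-8 degrees Celsius.
alerts = readings.filter((F.col("temp_c") < 2.0) | (F.col("temp_c") > 8.0))

(alerts.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/iot_alerts")  # assumed checkpoint path
    .toTable("insurance.iot_temperature_alerts"))
```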
46e48686f920f1e618b993de52ae72d0**Commercial insurance** , including property, general liability, cyber insurance and\n", + "business income insurance, utilizes ML-based automation of actuarial models.\n", + "This automation facilitates underwriting, claims forecasting and dynamic pricing\n", + "for their customers. Another notable trend in recent years is the use of IoT-\n", + "\n", + "\n", + "based alerting for sensitive or valuable commodities. For example, in the case of\n", + "vaccines, IoT sensors can monitor the temperature in real time and send alerts to\n", + "the appropriate team or person if the temperature exceeds acceptable thresholds.\n", + "This is crucial as vaccines must be stored within specific temperature ranges.\n", + "\n", + "In **life insurance** , complex ML models can be employed to create a profile of\n", + "the customer’s lifestyle and, importantly, detect any changes to it. This deeper\n", + "understanding and 360-degree view of the customer enable more customized\n", + "underwriting and pricing based on the policyholder’s current health, lifestyle and\n", + "eating habits.\n", + "\n", + "\n", + "|Type of Data Source|Typical Vendors|Claims Automation and Transformation|Dynamic Pricing and Underwriting|Anomaly Detection and Fraudulent Claims|Customer 360 and Hyper-Personalization|\n", + "|---|---|---|---|---|---|\n", + "|Policy data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork|||||\n", + "|Claims data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork, TransUnion|||||\n", + "|Real-time ingestions|Cambridge Mobile Telematics, Zendrive, Custom|||||\n", + "|Alternative / Supplemental data|Experian, Equifax, Verisk, IBM Weather|||||\n", + "|Marketing data|Salesforce, HubSpot, Google Analytics|||||\n", + "\n", + "\n", + "**Figure 1.** Innovating with data and analytics — use cases made possible and key data sources from popular insurance vendors\n", + "\n", + "\n", + "-----\n", + "\n", + "## Common Challenges Insurers Face Using Legacy Technology\n", + "\n", + "\n", + "Modernization is not an easy process for insurers, and while transforming IT\n", + "ecosystems is necessary to improve business outcomes, ensuring business\n", + "continuity is absolutely critical. However, the volume of data they collect, along\n", + "with changes in user behavior and legacy systems that can’t handle this amount of\n", + "data, are forcing insurance providers to accelerate their modernization journeys.\n", + "\n", + "Insurance providers face several challenges when using legacy technology, including:\n", + "\n", + "**Legacy on-premises systems:** Legacy on-premises systems are not only\n", + "expensive to maintain, but they also store large amounts of big data in silos across\n", + "the business. This makes it difficult to access the data, hindering data analytics\n", + "efforts and limiting executives’ ability to make informed business decisions.\n", + "\n", + "**Ingesting large volumes of transactional data in real time:** The inability to\n", + "ingest data from transaction systems in real time is a major obstacle to obtaining\n", + "critical insights. Transaction logs from operations such as policy administration,\n", + "enrollment and claims constantly stream data. However, many insurance\n", + "companies still rely on legacy data warehouses built around batch processing,\n", + "which is not suitable for ingesting and integrating large data sets. 
As a result,\n", + "insurers often opt to ingest data nightly, leading to delays in receiving accurate\n", + "data for decision-making.\n", + "\n", + "\n", + "**Performing fine-grained analysis at scale within tight time frames:** Legacy\n", + "technology forces insurers to make a trade-off when analyzing data for user intent.\n", + "They can choose between detailed and accurate predictions or fast predictions.\n", + "Running detailed forecasts can improve accuracy, but it requires performing\n", + "millions of model calculations within narrow service windows, which exceeds the\n", + "capability of legacy data platforms. Consequently, insurance companies have to\n", + "accept less accurate predictions.\n", + "\n", + "**Powering real-time decisions on the front line:** Serving real-time data to\n", + "thousands of workers is a complex task. While data warehouses can serve reports\n", + "to large groups of users, they are limited to providing stale data. As a result, most\n", + "insurers only provide daily or weekly updates to reports and rely on employees’\n", + "judgment for more frequent decisions.\n", + "\n", + "**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim\n", + "to deliver personalized experiences across every channel, both digital and offline.\n", + "While insurance providers have access to vast amounts of customer data, off-theshelf tools for personalization and customer segmentation struggle to handle such\n", + "high volumes, leading to inaccurate analytics. To succeed in the insurance industry,\n", + "companies must deliver personalized experiences at scale.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
2751342200e6294e9cde3b924bddfeda**Powering real-time decisions on the front line:** Serving real-time data to\n", + "thousands of workers is a complex task. While data warehouses can serve reports\n", + "to large groups of users, they are limited to providing stale data. As a result, most\n", + "insurers only provide daily or weekly updates to reports and rely on employees’\n", + "judgment for more frequent decisions.\n", + "\n", + "**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim\n", + "to deliver personalized experiences across every channel, both digital and offline.\n", + "While insurance providers have access to vast amounts of customer data, off-theshelf tools for personalization and customer segmentation struggle to handle such\n", + "high volumes, leading to inaccurate analytics. To succeed in the insurance industry,\n", + "companies must deliver personalized experiences at scale.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks Lakehouse for Insurance addresses the key challenges faced across the\n", + "insurance value chain. The lakehouse enables the integration of various data types,\n", + "including images and structured data, in real time. It offers robust management\n", + "and governance capabilities, and rapidly transforms data into actionable insights\n", + "\n", + "\n", + "through real-time reporting and predictive analytics. This platform-as-a-service\n", + "solution delivers exceptional speed and industry-leading total cost of ownership,\n", + "providing insurers with faster insights to enhance the customer experience and\n", + "gain a competitive edge.\n", + "\n", + "\n", + "**Product**\n", + "**Development &**\n", + "**Feature Selection**\n", + "\n", + "\n", + "**Application**\n", + "**Review &**\n", + "**Submission**\n", + "\n", + "\n", + "**Policy Issue,**\n", + "**Service &**\n", + "**Administration**\n", + "\n", + "\n", + "**Sales & Lead**\n", + "**Management**\n", + "\n", + "**Hyperpersonalization/**\n", + "**life events**\n", + "\n", + "\n", + "**Underwriting**\n", + "**and Pricing**\n", + "\n", + "**UW rules**\n", + "**guidelines &**\n", + "**technical pricing**\n", + "\n", + "\n", + "**Rating Offer &**\n", + "**Endorsements**\n", + "\n", + "**Evaluate**\n", + "**rate options,**\n", + "**pricing and**\n", + "**endorsements**\n", + "\n", + "\n", + "**Claims**\n", + "\n", + "\n", + "**Coverage/** **Review policy**\n", + "**features/riders** **documents**\n", + "**(submission)**\n", + "\n", + "\n", + "**Omnichannel** **Fraud, frequency,**\n", + "**severity and**\n", + "**reserves**\n", + "\n", + "\n", + "**We continuously develop solution accelerators and enablers to accelerate the time to market.**\n", + "\n", + "\n", + "\n", + "**•** Dynamic segmentation\n", + "\n", + "**•** Personas\n", + "\n", + "**•** Hyper-personalization\n", + "\n", + "**•** Intelligent automation\n", + "\n", + "\n", + "\n", + "**•** Product architecture and\n", + "manufacturing\n", + "\n", + "**•** Configurable products\n", + "\n", + "**•** Competitor rates\n", + "\n", + "\n", + "\n", + "**•** Reflexive questionnaire\n", + "\n", + "**•** LLM assistance for\n", + "document summarization\n", + "\n", + "**•** NLP for unstructured data\n", + "\n", + "\n", + "\n", + "**•** Evaluation of risk within\n", + "appetite\n", + "\n", + "**•** Validation of UW\n", + "requirements\n", + "\n", + "**•** Straight-through\n", + "processing optimization\n", + "\n", + "**•** Risk assessment via\n", + "actuarial pricing\n", + "\n", + "**•** Triaging of risk to\n", + 
"underwriter SME for policy/\n", + "exposure changes\n", + "\n", + "\n", + "\n", + "**•** Predict loss cost\n", + "(frequency and severity)\n", + "\n", + "**•** Computer vision on images\n", + "to identify loss\n", + "\n", + "**•** Auto-adjudication and\n", + "triaging of claims to claim\n", + "adjuster\n", + "\n", + "**•** Tailor communication by\n", + "segment (e.g., email, text,\n", + "mail, or omnichannel)\n", + "\n", + "**•** Identify Fraud, Waste and\n", + "Abuse, route to ICU\n", + "\n", + "\n", + "**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs)\n", + "\n", + "\n", + "-----\n", + "\n", + "## Why Lakehouse for InsuranceSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
63622c61194bbab78b3598f578b3dc38**•** Evaluation of risk within\n", + "appetite\n", + "\n", + "**•** Validation of UW\n", + "requirements\n", + "\n", + "**•** Straight-through\n", + "processing optimization\n", + "\n", + "**•** Risk assessment via\n", + "actuarial pricing\n", + "\n", + "**•** Triaging of risk to\n", + "underwriter SME for policy/\n", + "exposure changes\n", + "\n", + "\n", + "\n", + "**•** Predict loss cost\n", + "(frequency and severity)\n", + "\n", + "**•** Computer vision on images\n", + "to identify loss\n", + "\n", + "**•** Auto-adjudication and\n", + "triaging of claims to claim\n", + "adjuster\n", + "\n", + "**•** Tailor communication by\n", + "segment (e.g., email, text,\n", + "mail, or omnichannel)\n", + "\n", + "**•** Identify Fraud, Waste and\n", + "Abuse, route to ICU\n", + "\n", + "\n", + "**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs)\n", + "\n", + "\n", + "-----\n", + "\n", + "## Why Lakehouse for Insurance\n", + "\n", + "Databricks Lakehouse for Insurance combines simplicity, flexibility and reusability, enabling insurers to meet the demands of the market with speed and agility. It offers\n", + "best-in-industry performance and serves as a modern data architecture that provides differentiated capabilities for insurers to thrive in a competitive industry.\n", + "\n", + "\n", + "\n", + "**•** Insurance companies can store any type of\n", + "data using Databricks Lakehouse for Insurance,\n", + "leveraging the low-cost object storage supported\n", + "by cloud providers. This helps break down data\n", + "silos that hinder efforts to aggregate data for\n", + "advanced analytics, such as claim triaging and\n", + "fraud identification, regulatory reporting, or\n", + "compute-intensive risk workloads. Another critical\n", + "feature is the time-travel capabilities of the\n", + "lakehouse architecture, allowing insurers to access\n", + "any historical version of their data.\n", + "\n", + "\n", + "\n", + "**•** Supporting streaming use cases, such as\n", + "monitoring transaction data, is easier with the\n", + "lakehouse. It utilizes Apache Spark ™ as the data\n", + "processing engine and Delta Lake as the storage\n", + "layer. Spark enables seamless switching between\n", + "batch and streaming workloads with just a single\n", + "line of code. Delta Lake’s native support for ACID\n", + "transactions ensures reliable and high-performing\n", + "streaming workloads.\n", + "\n", + "\n", + "\n", + "**•** For both machine learning and non-machine\n", + "learning insurance models, a comprehensive\n", + "governance framework is provided. Data, code,\n", + "libraries and models are linked and independently\n", + "version controlled using technologies like Delta\n", + "Lake and MLflow. Delta Lake ensures stability by\n", + "allowing insurance companies to declare their\n", + "expectations for data quality upfront. MLflow\n", + "enables training models in any language and\n", + "deploying them anywhere, minimizing the need for\n", + "complex handoffs between data science practices,\n", + "independent validation units and operational teams.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Level-up value with Databricks Lakehouse for insurance**\n", + "\n", + "Building your data lakehouse with the Databricks Lakehouse Platform empowers your organization with the speed, agility and flexibility needed to address critical insurance\n", + "use cases that have a significant impact on your customers and your business. 
Additionally, it helps lower the total cost of ownership (TCO).\n", + "\n", + "With a modern and unified data architecture, the Databricks platform enables the implementation of your data, analytics and AI strategy at scale on a unified and modern\n", + "cloud data architecture. The key benefits include:\n", + "\n", + "\n", + "**1. Cost and complexity reduction**\n", + "\n", + "The Databricks Lakehouse provides an open, simple\n", + "and unified cloud data management architecture\n", + "that streamlines operational inefficiencies, reduces\n", + "IT infrastructure costs, and enhances productivity\n", + "across teams.\n", + "\n", + "\n", + "**2. Enhanced risk management and control**\n", + "\n", + "By unlocking the value of enterprise data, the\n", + "platform helps reduce corporate governance and\n", + "security risks. It facilitates data-driven decisionmaking through governed discovery, access and\n", + "data sharing.\n", + "\n", + "\n", + "**3. Accelerated innovation**\n", + "\n", + "The platform enables the acceleration of digital\n", + "transformation, modernization and cloud migration\n", + "initiatives, fostering new growth opportunities\n", + "and driving innovation for improved customer and\n", + "workforce experiences.\n", + "\n", + "\n", + "To help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Reference Architecture for Smart Claims**\n", + "\n", + "\n", + "**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n", + "\n", + "or incrementally through change data capture (CDC). These\n", + "\n", + "include structured and unstructured data sets like images, text,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
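The two Delta Lake capabilities called out in the bullets above can be shown in a few lines: the same Delta table is read in batch or as a stream by swapping `read` for `readStream`, and time travel queries an earlier version of the table. The table name and version number are assumptions for illustration.

```python
# Sketch of the batch/streaming switch and Delta time travel mentioned above.
# "insurance.transactions" is an assumed Delta table name.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Batch read of the transaction table.
batch_df = spark.read.table("insurance.transactions")

# The same table consumed incrementally as a stream: the one-line change.
stream_df = spark.readStream.table("insurance.transactions")

# Delta time travel: query the table as it looked at an earlier version.
v0_df = spark.sql("SELECT * FROM insurance.transactions VERSION AS OF 0")
```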
9e5658fb4d571359c7d2e349d628c5ceThe Databricks Lakehouse provides an open, simple\n", + "and unified cloud data management architecture\n", + "that streamlines operational inefficiencies, reduces\n", + "IT infrastructure costs, and enhances productivity\n", + "across teams.\n", + "\n", + "\n", + "**2. Enhanced risk management and control**\n", + "\n", + "By unlocking the value of enterprise data, the\n", + "platform helps reduce corporate governance and\n", + "security risks. It facilitates data-driven decision-making through governed discovery, access and\n", + "data sharing.\n", + "\n", + "\n", + "**3. Accelerated innovation**\n", + "\n", + "The platform enables the acceleration of digital\n", + "transformation, modernization and cloud migration\n", + "initiatives, fostering new growth opportunities\n", + "and driving innovation for improved customer and\n", + "workforce experiences.\n", + "\n", + "\n", + "To help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Reference Architecture for Smart Claims**\n", + "\n", + "\n", + "**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n", + "\n", + "or incrementally through change data capture (CDC). These\n", + "\n", + "include structured and unstructured data sets like images, text,\n", + "\n", + "and video, such as IoT sensor data, operational data like claims\n", + "\n", + "and policies, and on-prem or third-party data such as from\n", + "\n", + "credit bureaus, weather, and driving records. Partner Connect\n", + "\n", + "offers a range of ingest tools from different vendors that you can\n", + "\n", + "directly use from the Databricks portal.\n", + "\n", + "\n", + "**2.** \u0007Delta Live Tables (DLT) is the preferred ETL\n", + "\n", + "path to transform the data based on business\n", + "\n", + "requirements. All the data resides in cloud storage,\n", + "\n", + "where Delta refines it into Bronze, Silver and Gold\n", + "\n", + "zones of a medallion pipeline blueprint. Databricks\n", + "\n", + "Workflows provide orchestration of the various\n", + "\n", + "dependent tasks, with advanced capabilities like\n", + "\n", + "retry, repair and job status notifications.\n", + "\n", + "\n", + "**3.** \u0007Databricks SQL, with Photon\n", + "\n", + "and serverless options, caters\n", + "\n", + "to BI consumption use cases to\n", + "\n", + "refresh a dashboard monitoring\n", + "\n", + "key metrics and KPIs, with\n", + "\n", + "query history and alerts on\n", + "\n", + "critical events.\n", + "\n", + "\n", + "**4.** \u0007Databricks ML Runtime,\n", + "\n", + "MLflow, along with\n", + "\n", + "Feature Store, Auto ML,\n", + "\n", + "and real-time Model\n", + "\n", + "Serving enable ML\n", + "\n", + "use cases to provide\n", + "\n", + "predictive insights.\n", + "\n", + "\n", + "**5.** \u0007Delta Sharing provides\n", + "\n", + "a secure and governed\n", + "\n", + "way of sharing data\n", + "\n", + "internally and externally\n", + "\n", + "without copying it,\n", + "\n", + "using Unity Catalog.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Secure data sharing with Delta Lake**\n", + "\n", + "At the heart of Databricks Lakehouse for Insurance is a technology that allows insurers to overcome the trade-offs between speed and accuracy. Technologies like Delta\n", + "Lake enable the lakehouse, which combines the strengths of data warehouses and data lakes, to directly address these challenges. 
With Delta Lake, insurance providers can\n", + "unify all their data — structured and unstructured, batch and real-time — in one centrally managed and governed location.\n", + "\n", + "Once the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information.\n", + "They can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases.\n", + "\n", + "[Diagram: Lakehouse Platform with one copy of data serving business intelligence, streaming, data science/ML and data warehouse workloads, under centralized governance and orchestration]\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "\n", + "## Claims automation and transformation\n", + "\n", + "**Overview**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
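As a rough illustration of step 2 of the Smart Claims architecture, the sketch below declares Bronze and Silver tables with the Delta Live Tables Python API, with Auto Loader handling incremental ingestion and an expectation enforcing basic data quality. Table names, the landing path and the expectation rule are assumptions; the code is meant to run inside a DLT pipeline, where `spark` is provided.

```python
# Hedged Delta Live Tables sketch: declarative Bronze/Silver tables with a data
# quality expectation. Names and paths are illustrative assumptions.
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw claims ingested incrementally from cloud storage (Bronze).")
def bronze_claims():
    return (
        spark.readStream.format("cloudFiles")        # Auto Loader incremental ingestion
        .option("cloudFiles.format", "json")
        .load("/Volumes/raw/claims/")                # assumed landing path
    )

@dlt.table(comment="Cleansed claims (Silver).")
@dlt.expect_or_drop("valid_claim_id", "claim_id IS NOT NULL")  # drop rows failing the rule
def silver_claims():
    return dlt.read_stream("bronze_claims").withColumn(
        "claim_amount", F.col("claim_amount").cast("double")
    )
```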
013e9842fece7a621a03d16a704b8220Once the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information.\n", + "They can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases.\n", + "\n", + "[Diagram: Lakehouse Platform with one copy of data serving business intelligence, streaming, data science/ML and data warehouse workloads, under centralized governance and orchestration]\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "\n", + "## Claims automation and transformation\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "Insurers are entering a new era of claims transformation, supported by evolving technological advancements\n", + "and increasing data availability. Leveraging the Databricks Lakehouse, organizations can deal with the massive\n", + "amount of structured and unstructured data coming in from different sources, in different formats, and time\n", + "frames. Every touchpoint in the claims journey — beginning even before an incident occurs — can be supported\n", + "by a combination of technology and human intervention that seamlessly expedites the process.\n", + "\n", + "**Business problem**\n", + "\n", + "Missing data, or data that is “not in good order” and needs to be corrected before processing, leads to claims\n", + "leakage and inefficient processes in triaging claims to the right resource.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Enable triaging of claims and resources by leveraging big data processing and integrated ML and AI capabilities,\n", + "including MLflow model lifecycle management.\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "**•** Decrease in annual claims payout\n", + "\n", + "**•** Increase in claim fraud detection/prevention\n", + "\n", + "**•** Improve efficiencies by 15%\n", + "\n", + "**“Applying AI as broadly, as aggressively**\n", + "\n", + "**and as enthusiastically as possible. No part**\n", + "\n", + "**of our business should be untouched by it.”**\n", + "\n", + "— \u0007Masashi Namatame, Group Chief Digital Officer,\n", + "Managing Executive Officer, Tokio Marine\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**Tokio Marine: Striving to**\n", + "**become AI-driven**\n", + "\n", + "Insurers of all types now routinely use AI\n", + "models to drive underwriting, streamline claims\n", + "processing and accelerate claims adjudication,\n", + "protect against insurance fraud, and improve\n", + "risk forecasting, for example. Tokio Marine —\n", + "Japan’s oldest insurance company, which has\n", + "done business since 1879 — has been applying\n", + "advanced uses of AI, particularly in its auto\n", + "insurance business, says Masashi Namatame,\n", + "Group Chief Digital Officer and Managing\n", + "Executive Officer at Tokio Marine: “To assess\n", + "collision damages, the company uses an AI-based computer vision solution to analyze\n", + "photos from accident scenes.” Comparing these\n", + "with what he describes as “thousands or even\n", + "millions” of photos of past analogous incidents,\n", + "the model produces liability assessments of the\n", + "parties involved and projects anticipated repair\n", + "costs. 
AI has also provided the company with\n", + "tangible benefits in online sales — especially in\n", + "personalized product recommendations and\n", + "contract writing, according to Namatame. Read\n", + "the case study in the [MIT CIO vision 2025 report](https://www.databricks.com/resources/whitepaper/mit-cio-vision-2025) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "## Dynamic pricing and underwriting\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "In modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\n", + "carriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\n", + "This involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\n", + "geolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\n", + "offers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n", + "\n", + "**Business problem**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
808dbdaefdc79410d236eb7fb1e575ae-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "## Dynamic pricing and underwriting\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "In modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\n", + "carriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\n", + "This involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\n", + "geolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\n", + "offers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n", + "\n", + "**Business problem**\n", + "\n", + "Actuaries are spending valuable time on low-value activities, which hampers agility and advanced analytical\n", + "capabilities in pricing and underwriting, hindering improvements in risk and pricing modeling.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "**•** Unified cloud-native platform\n", + "\n", + "**•** Scalability for ingesting IoT data from millions of trips, expanding the customer base\n", + "\n", + "**•** Reduced total cost of ownership compared to legacy Hadoop systems\n", + "\n", + "**•** Usage-based pricing, leading to lower premiums for customers and reduced risk for insurance carriers, thereby\n", + "lowering loss ratios\n", + "\n", + "**•** Enables the creation of a digitally enabled, end-to-end underwriting experience\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**American financial services**\n", + "**mutual organization**\n", + "\n", + "This organization aimed to leverage the vast\n", + "amounts of structured and unstructured data\n", + "it collected to enhance its underwriting and\n", + "decision-making processes, enabling greater\n", + "efficiency and effectiveness. However, the\n", + "company’s legacy infrastructure struggled\n", + "to scale with the increasing data volume and\n", + "processing demands, limiting its ability to\n", + "analyze the data and derive actionable insights.\n", + "\n", + "With Databricks, the insurer centralized\n", + "everything on one unified Lakehouse platform,\n", + "\n", + "supporting all operational and analytical\n", + "use cases. This allowed them to analyze\n", + "broader sets of data for superior underwriting\n", + "performance and create a digitally empowered,\n", + "end-to-end underwriting experience.\n", + "\n", + "\n", + "\n", + "**•** Improve competitive position\n", + "\n", + "**•** Decrease combined ratio\n", + "\n", + "**•** 15% improvement in efficiencies\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "## Anomaly detection and fraudulent claims\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**One of the largest U.S.**\n", + "**insurance companies and a**\n", + "**leading small business insurer**\n", + "\n", + "The increasing availability of data and market\n", + "competition challenge insurance providers to\n", + "offer better pricing to their customers. This\n", + "U.S.-based insurer, with hundreds of millions of\n", + "insurance records to analyze for downstream\n", + "ML, realized that their legacy batch analysis\n", + "process was slow and inaccurate, providing\n", + "limited insight for predicting the frequency\n", + "and severity of claims. 
With Databricks, they\n", + "were able to scale up the use of deep learning\n", + "models, resulting in more accurate pricing\n", + "predictions and increased revenue from\n", + "claims. By leveraging Databricks Lakehouse,\n", + "they harmonized data, analytics and AI at\n", + "scale, enabling accurate pricing predictions\n", + "and supporting various use cases from vehicle\n", + "telematics to actuarial modeling.\n", + "\n", + "\n", + "Fraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\n", + "American consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\n", + "in 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\n", + "change to support new channels and services, offering transactional features and facilitating payments through\n", + "digital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\n", + "consumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\n", + "models. It often involves a complex decision science process that combines a rules engine with a robust and\n", + "scalable machine learning platform.\n", + "\n", + "**Business problem**\n", + "\n", + "Insurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n", + "\n", + "**Solution/value with Databricks**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
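The "rules engine plus machine learning" framing above can be sketched as follows: deterministic business rules catch known fraud patterns and short-circuit the decision, while a model score handles the remaining cases. The rules, threshold and the stand-in scoring function are illustrative assumptions, not a production design.

```python
# Illustrative fraud-triage sketch combining a small rules engine with an ML score.
from dataclasses import dataclass

@dataclass
class Claim:
    amount: float
    days_since_policy_start: int
    prior_claims_last_year: int

def rule_flags(claim: Claim) -> list[str]:
    # Deterministic business rules for known suspicious patterns (assumed thresholds).
    flags = []
    if claim.days_since_policy_start < 30 and claim.amount > 10_000:
        flags.append("large_claim_on_new_policy")
    if claim.prior_claims_last_year >= 3:
        flags.append("high_claim_frequency")
    return flags

def model_score(claim: Claim) -> float:
    # Stand-in for a trained model (e.g., loaded via MLflow) returning P(fraud).
    return min(1.0, 0.05 + 0.00002 * claim.amount + 0.1 * claim.prior_claims_last_year)

def decide(claim: Claim, threshold: float = 0.7) -> str:
    flags = rule_flags(claim)
    if flags:
        return f"route_to_investigation ({', '.join(flags)})"
    return "route_to_investigation (model)" if model_score(claim) >= threshold else "auto_approve"

print(decide(Claim(amount=12_500, days_since_policy_start=10, prior_claims_last_year=0)))
```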
fec16e5ce1d62014b12f9de0cbc3e75cFraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\n", + "American consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\n", + "in 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\n", + "change to support new channels and services, offering transactional features and facilitating payments through\n", + "digital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\n", + "consumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\n", + "models. It often involves a complex decision science process that combines a rules engine with a robust and\n", + "scalable machine learning platform.\n", + "\n", + "**Business problem**\n", + "\n", + "Insurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Modernized approaches in insurance require full digital transformation, including the adoption of usagebased pricing to reduce premiums. Insurance providers now consume data from the largest mobile telematics\n", + "providers (e.g., CMT) to obtain granular sensor and trip summaries for users of online insurance applications.\n", + "This data is crucial not only for pricing but also for underwriting scenarios to mitigate risks for carriers.\n", + "\n", + "**$1 of fraud costs companies 3.36x in chargeback,**\n", + "**replacement and operational costs**\n", + "\n", + "\n", + "[Lexis Nexis](https://risk.lexisnexis.com/insights-resources/research/2020-true-cost-of-fraud-retail)\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "\n", + "## Customer 360 and hyper-personalization\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Winning the hearts and minds of your customers\n", + "starts with personalizing the user experience. The\n", + "ability to offer complementary products to meet\n", + "the needs of your customers lets you build deeper\n", + "relationships with them and engender their loyalty.\n", + "In addition, a better understanding of the finer\n", + "details within accounts allows you to offer them\n", + "more personalized products. To do this, you need\n", + "360-degree customer views, which requires you to\n", + "locate and consolidate all your customers’ contact\n", + "data from every digital tool that you use and house\n", + "it in one central location. 
With Databricks Lakehouse,\n", + "insurers can “hyper-personalize,” increase\n", + "cross-sell/upsell opportunities, enhance customer\n", + "360 and bring new products to market faster.\n", + "\n", + "**Business problem**\n", + "\n", + "The inability to reconcile customer records across\n", + "different lines of business limits real-time customer\n", + "insights necessary for upselling and cross-selling.\n", + "Siloed data makes it challenging to create accurate\n", + "and comprehensive customer profiles, resulting in\n", + "suboptimal recommendations for the next best action.\n", + "\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Databricks provides the tools needed to process\n", + "large volumes of data and determine the next best\n", + "action at any point in the customer journey.\n", + "\n", + "**•** Eliminates data silos by unifying all customer data,\n", + "including basic information, transactional data,\n", + "online behavior/purchase history, etc., to create\n", + "complete customer profiles\n", + "\n", + "**•** Integrated data security ensures that security\n", + "measures are incorporated at every layer of the\n", + "Databricks Lakehouse Platform\n", + "\n", + "**•** Delta improves data quality, providing a single\n", + "source of truth for real-time streams and ensuring\n", + "reliable and high-quality data for data teams\n", + "\n", + "**•** Integrated ML and AI capabilities utilize AI to\n", + "create self-optimizing ML models that determine\n", + "the next best step for each customer\n", + "\n", + "**•** MLflow model lifecycle management helps manage\n", + "the entire machine learning lifecycle reliably,\n", + "securely and at scale\n", + "\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "**•** Use AI, ML, automation and real-time data to\n", + "gain deeper customer insights and understand\n", + "their needs\n", + "\n", + "**•** Improve competitive positioning\n", + "\n", + "**•** Enhance the customer experience\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**160-year-old U.S.**\n", + "**insurance company**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
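One concrete reading of the "eliminate data silos" and "single source of truth" bullets above is an incremental upsert of customer records from each line of business into one Delta table. The sketch below assumes a Databricks/Spark environment with the delta-spark package installed; the table and column names are placeholders, not names from the eBook.

```python
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical incremental feed from one line of business (e.g., auto policies).
updates = spark.table("bronze.auto_policy_customers").select(
    "customer_id", "email", "last_interaction_ts", "lifetime_premium"
)

# Unified customer-360 profile table acting as the single source of truth.
profiles = DeltaTable.forName(spark, "gold.customer_360")

(
    profiles.alias("p")
    .merge(updates.alias("u"), "p.customer_id = u.customer_id")
    .whenMatchedUpdateAll()      # refresh attributes for customers already present
    .whenNotMatchedInsertAll()   # add customers seen for the first time
    .execute()
)
```

Running the same MERGE for each source system keeps one governed profile table current, which is what downstream next-best-action models consume.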
4c4fe41bf1c9f3e6411a258785e8e375**•** Integrated data security ensures that security\n", + "measures are incorporated at every layer of the\n", + "Databricks Lakehouse Platform\n", + "\n", + "**•** Delta improves data quality, providing a single\n", + "source of truth for real-time streams and ensuring\n", + "reliable and high-quality data for data teams\n", + "\n", + "**•** Integrated ML and AI capabilities utilize AI to\n", + "create self-optimizing ML models that determine\n", + "the next best step for each customer\n", + "\n", + "**•** MLflow model lifecycle management helps manage\n", + "the entire machine learning lifecycle reliably,\n", + "securely and at scale\n", + "\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "**•** Use AI, ML, automation and real-time data to\n", + "gain deeper customer insights and understand\n", + "their needs\n", + "\n", + "**•** Improve competitive positioning\n", + "\n", + "**•** Enhance the customer experience\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**160-year-old U.S.**\n", + "**insurance company**\n", + "\n", + "This insurance provider underwent a significant\n", + "digital transformation to provide a more\n", + "personalized financial services experience to\n", + "its 10,000 advisors and millions of customers\n", + "across various touchpoints. Recognizing the\n", + "importance of becoming data-driven, the\n", + "company leveraged Databricks in its client\n", + "360 platform to aggregate transactional and\n", + "behavioral data, along with core attributes,\n", + "providing business users with next-best-action\n", + "recommendations for seamless customer\n", + "engagement.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Global Regulatory Impact in Insurance\n", + "\n", + "\n", + "**Navigating global regulations**\n", + "**with technical implementation**\n", + "\n", + "Digital innovation continues to reshape the insurance sector. The pace and scale\n", + "of technological change are likely to increase due to factors such as artificial\n", + "intelligence (AI), cloud computing, and the entry of new players like insurtechs,\n", + "e-tailers, and manufacturers from outside the insurance industry.\n", + "\n", + "To succeed and thrive in today’s economic environment, insurers should prioritize\n", + "upgrading their infrastructure and technology, rather than solely focusing on\n", + "transforming operations. For example, migrating from on-premises systems to the\n", + "cloud can bring significant benefits, according to global consultancy [Deloitte](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf) [.](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf)\n", + "\n", + "As insurers upgrade their compliance processes to meet new global regulations,\n", + "such as IFRS 17 and LDTI, the impact of regulatory updates becomes more\n", + "complex for organizations operating across multiple jurisdictions. Instead of\n", + "merely responding to regulatory and industry requirements, insurance companies\n", + "should make data-focused investments that help them anticipate and meet the\n", + "expectations of distributors and policyholders.\n", + "\n", + "\n", + "**IFRS-17**\n", + "\n", + "IFRS 17 is an International Finance Reporting Standard (IFRS) for\n", + "insurance contracts. 
IFRS 17 aims to standardize insurance accounting\n", + "by providing consistent principles for all facets of accounting for\n", + "insurance contracts. IFRS 17 removes existing inconsistencies so\n", + "analysts, investors and others can more easily compare companies,\n", + "contracts and industries.\n", + "\n", + "**LDTI for long-duration contracts**\n", + "\n", + "The Financial Accounting Standards Board long-duration targeted\n", + "improvements (LDTI) introduced changes to the U.S. GAAP accounting\n", + "model to simplify and improve the financial reporting of long-duration\n", + "contracts, including providing financial statement users with more\n", + "timely and relevant information about those contracts.\n", + "\n", + "\n", + "It is crucial for insurers to redirect their focus toward developing advanced data\n", + "management and utilization capabilities that offer better insights and improved\n", + "performance. These investments serve as not only a foundation for regulatory\n", + "compliance but also a starting point for more comprehensive and proactive\n", + "transformation initiatives.\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N D U S T R Y S O L U T I O N S**\n", + "\n", + "## Get Started With Accelerators, Brickbuilders and Enablers\n", + "\n", + "Insurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n", + "\n", + "**Adoption challenges**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
99907c1053249536af54c1cc15272bd1**LDTI for long-duration contracts**\n", + "\n", + "The Financial Accounting Standards Board long-duration targeted\n", + "improvements (LDTI) introduced changes to the U.S. GAAP accounting\n", + "model to simplify and improve the financial reporting of long-duration\n", + "contracts, including providing financial statement users with more\n", + "timely and relevant information about those contracts.\n", + "\n", + "\n", + "It is crucial for insurers to redirect their focus toward developing advanced data\n", + "management and utilization capabilities that offer better insights and improved\n", + "performance. These investments serve as not only a foundation for regulatory\n", + "compliance but also a starting point for more comprehensive and proactive\n", + "transformation initiatives.\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N D U S T R Y S O L U T I O N S**\n", + "\n", + "## Get Started With Accelerators, Brickbuilders and Enablers\n", + "\n", + "Insurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n", + "\n", + "**Adoption challenges**\n", + "\n", + "\n", + "Numerous challenges hinder organizations from developing and implementing the\n", + "necessary technical solutions to enhance their operational effectiveness, increase\n", + "revenue, and stay competitive. These challenges include:\n", + "\n", + "**•** Lack of technical skills (data scientists/data engineers): Companies often\n", + "struggle to find employees proficient in Python or Scala, or individuals who\n", + "possess extensive experience in data science.\n", + "\n", + "\n", + "\n", + "**•** Business problems require in-depth data science and industry knowledge:\n", + "Businesses seek solutions tailored to address specific problems, rather than\n", + "generic technical features.\n", + "\n", + "**•** Companies seek actionable insights: Organizations prefer readily applicable\n", + "patterns that can be quickly implemented, rather than custom data science\n", + "solutions that come with potential costs and risks of implementation failure.\n", + "\n", + "\n", + "**What are accelerators/enablers?**\n", + "\n", + "\n", + "**Solution Accelerators**\n", + "\n", + "Save hours on discovery, design, development and\n", + "testing with Databricks Solution Accelerators. Our\n", + "purpose-built guides, including fully functional\n", + "notebooks and best practices, expedite results for\n", + "your most common and high-impact use cases. 
With\n", + "these accelerators, you can go from idea to proof of\n", + "concept (PoC) in as little as two weeks.\n", + "\n", + "\n", + "**Brickbuilders**\n", + "\n", + "Brickbuilder Solutions are data and AI solutions\n", + "designed by leading consulting companies to\n", + "address industry-specific business requirements.\n", + "Built on the Databricks Lakehouse Platform and\n", + "backed by the industry experience of these\n", + "consultancies, businesses can have confidence\n", + "in solutions tailored to their specific use cases.\n", + "Brickbuilder Solutions can be implemented at any\n", + "stage of the customer journey.\n", + "\n", + "\n", + "**Solution Enablers**\n", + "\n", + "Solution enablers consist of targeted collections\n", + "of notebooks and materials, such as webinars and\n", + "blog posts, designed to support larger solutions.\n", + "They aim to solve pain points or address specific\n", + "layers of business capabilities, such as resolving data\n", + "ingestion challenges.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Get Started With Industry Solutions\n", + "\n", + "\n", + "**Claims transformation:**\n", + "**automation and fraud prevention**\n", + "\n", + "Insurers are entering a new era of claims transformation, supported by evolving\n", + "technological advancements and growing data availability. The end-to-end claims\n", + "process, from extracting relevant information from documentation submitted\n", + "when filing a claim to triaging and routing claims and the underwriting process,\n", + "is ripe for digital transformation. By leveraging the Databricks Lakehouse,\n", + "organizations can handle millions of data points coming in different formats and\n", + "time frames, from various sources, at an unprecedented volume. Every touchpoint\n", + "in the claims journey, starting even before an incident occurs, will be supported by\n", + "a combination of technology and human intervention that seamlessly expedites\n", + "the process. Personalizing the claims experience by anticipating needs, providing\n", + "real-time status alerts, and reducing friction in the process increases customer\n", + "loyalty and retention.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**Accelerate underwriting through collaboration and efficient ML**\n", + "\n", + "A leading P&C insurer took full advantage of the MongoDB and Databricks\n", + "integration, leveraging both platforms to foster collaboration between their data\n", + "and developer teams. The integration provides a more natural development\n", + "experience for Spark users and exposes all of Spark’s libraries. This allows\n", + "MongoDB data to be materialized as DataFrames and data sets for analysis\n", + "using machine learning, graph, streaming and SQL APIs. The insurer also benefits\n", + "from automatic schema inference. With this integration, the insurer was able to\n", + "train and observe their ML models (MongoDB Atlas Charts) more efficiently and\n", + "incorporate them into business applications.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
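The MongoDB integration described above boils down to reading collections straight into Spark DataFrames with inferred schemas. A minimal sketch, assuming the MongoDB Spark Connector is installed on the cluster (option names follow the 10.x connector and may differ by version); the URI, database and collection names are placeholders.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Materialize a MongoDB collection as a DataFrame; the schema is inferred automatically.
policies = (
    spark.read.format("mongodb")
    .option("connection.uri", "mongodb+srv://<user>:<password>@<cluster-host>")
    .option("database", "underwriting")            # hypothetical database
    .option("collection", "policy_applications")   # hypothetical collection
    .load()
)

# From here the data is available to Spark ML, SQL, graph and streaming APIs.
policies.createOrReplaceTempView("policy_applications")
spark.sql("SELECT COUNT(*) AS n FROM policy_applications").show()
```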
4e253736b9449112ad11cee7566da0ca**Customer/Partner Successes**\n", + "\n", + "**Accelerate underwriting through collaboration and efficient ML**\n", + "\n", + "A leading P&C insurer took full advantage of the MongoDB and Databricks\n", + "integration, leveraging both platforms to foster collaboration between their data\n", + "and developer teams. The integration provides a more natural development\n", + "experience for Spark users and exposes all of Spark’s libraries. This allows\n", + "MongoDB data to be materialized as DataFrames and data sets for analysis\n", + "using machine learning, graph, streaming and SQL APIs. The insurer also benefits\n", + "from automatic schema inference. With this integration, the insurer was able to\n", + "train and observe their ML models (MongoDB Atlas Charts) more efficiently and\n", + "incorporate them into business applications.\n", + "\n", + "As a result, crucial underwriting processes that previously took days are now executed\n", + "in seconds. In addition to the time and cost savings, the company can provide a more\n", + "immediate response to customers within its digital experience platform.\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[F R A U D D E T E C T I O N](https://notebooks.databricks.com/notebooks/FSI/geospatial_analysis/index.html#geospatial_analysis_1-0.html)**\n", + "\n", + "**Claims processing is the process whereby an insurer receives,**\n", + "\n", + "\n", + "**verifies and processes a claim report submitted by a policyholder.**\n", + "\n", + "**It accounts for** **[70% of a property insurer’s expenses](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)** **and is a**\n", + "\n", + "**criticial component of customer satisfaction with their carrier.”**\n", + "\n", + "\n", + "**[C L A I M S A U T O M AT I O N E N A B L E R](https://www.databricks.com/blog/2023/02/01/design-patterns-batch-processing-financial-services.html)**\n", + "\n", + "\n", + "[Laying the](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n", + "[Foundation for](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n", + "[Claims Automation](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n", + "\n", + "\n", + "**[C A R C L A I M S I M A G E C L A S S I F I C AT I O N](https://github.com/databricks-industry-solutions/car-classification)**\n", + "\n", + "\n", + "**Deloitte,** [”Preserving the human touch in insurance claims transformations”](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)\n", + "\n", + "**[S M A R T C L A I M S : C L A I M S A U T O M AT I O N](https://www.databricks.com/blog/2023/04/03/claims-automation-databricks-lakehouse.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Risk management:**\n", + "**dynamic pricing and underwriting**\n", + "\n", + "Modernized approaches at insurance carriers require a full digital transformation,\n", + "and one aspect of this transformation involves dynamic pricing and underwriting\n", + "to reduce premiums. Insurance providers are now consuming data from the largest\n", + "mobile telematics providers to obtain the most granular sensor and trip summaries\n", + "for users of online insurance applications. Not only is this data critical for pricing,\n", + "but it is also critical for underwriting scenarios to de-risk carriers. 
Dynamic pricing\n", + "and underwriting automate routine tasks and provide teams with alternative\n", + "data sources to empower actuarial and underwriting professionals to become\n", + "“exponential.” This allows teams to focus on key aspects of risk selection and\n", + "analysis that drive competitive advantage and market differentiation. By leveraging\n", + "personalized data points, insurers can deliver near real-time underwriting\n", + "decisions for life insurance applicants, reducing policy abandonment and costs.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**Automated extraction of medical risk factors for life insurance underwriting**\n", + "**(John Snow Labs)**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
ccdf45dc81f51db3b4e319cdd68d324f-----\n", + "\n", + "**Risk management:**\n", + "**dynamic pricing and underwriting**\n", + "\n", + "Modernized approaches at insurance carriers require a full digital transformation,\n", + "and one aspect of this transformation involves dynamic pricing and underwriting\n", + "to reduce premiums. Insurance providers are now consuming data from the largest\n", + "mobile telematics providers to obtain the most granular sensor and trip summaries\n", + "for users of online insurance applications. Not only is this data critical for pricing,\n", + "but it is also critical for underwriting scenarios to de-risk carriers. Dynamic pricing\n", + "and underwriting automate routine tasks and provide teams with alternative\n", + "data sources to empower actuarial and underwriting professionals to become\n", + "“exponential.” This allows teams to focus on key aspects of risk selection and\n", + "analysis that drive competitive advantage and market differentiation. By leveraging\n", + "personalized data points, insurers can deliver near real-time underwriting\n", + "decisions for life insurance applicants, reducing policy abandonment and costs.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**Automated extraction of medical risk factors for life insurance underwriting**\n", + "**(John Snow Labs)**\n", + "\n", + "Life insurance underwriting considers an applicant’s medical risk factors in\n", + "addition to mortality risk characteristics. These risk factors are often found\n", + "in free-text documents. New insurance-specific natural language processing\n", + "(NLP) models can automatically extract relevant medical history and risk factors\n", + "from such documents. Forward-thinking companies are embracing accelerated\n", + "underwriting, which utilizes new data along with algorithmic tools and modeling\n", + "techniques to quickly assess and group applicants without requiring bodily fluids,\n", + "physician’s notes, and so on. This joint Solution Accelerator from Databricks and\n", + "John Snow Labs simplifies the implementation of this approach, creating a faster,\n", + "more consistent, and scalable underwriting experience.\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[R I S K M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/market-risk)**\n", + "\n", + "**Risk is highly influenced by behavior, and 80% of morbidity in**\n", + "\n", + "\n", + "**healthcare risk is driven by factors such as smoking, drinking**\n", + "\n", + "**alcohol, physical activity and diet. In the case of driving,**\n", + "\n", + "**60% of fatal accidents are a result of behavior alone. 
If insurers**\n", + "\n", + "**can change customer behaviors and help them make better**\n", + "\n", + "**choices, then the risk curve shifts.”**\n", + "\n", + "\n", + "**[A C T U A R I A L W O R K B E N C H](https://github.com/koernigo/databricksActuarialWorkbench)**\n", + "\n", + "**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n", + "\n", + "**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
2bc1a24e9f2f35f29d6f23452045b7f7**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n", + "\n", + "**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "\n", + "\n", + "[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "\n", + "\n", + "**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Product distribution:**\n", + "**segmentation and personalization**\n", + "\n", + "The most forward-thinking and data-driven insurers are\n", + "focused on achieving personalization at scale. They are\n", + "exploring new partnerships and business models to create\n", + "integrated, value-added experiences that prioritize the\n", + "overall health and financial wellness of their customers,\n", + "rather than just their insurance needs. These insurers\n", + "are investing in new data sources, analytics platforms,\n", + "and artificial intelligence (AI)-powered decision engines\n", + "that enable them to connect producers with like-minded\n", + "customers or engage customers with enticing offers\n", + "and actionable steps based on their previous choices.\n", + "The outcome is more efficient and effective service\n", + "from producers, trusted and convenient interactions for\n", + "consumers, and increased customer engagement and\n", + "growth for insurers in an increasingly digital-oriented world.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n", + "\n", + "[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\n", + "insurance companies. It enables them to complete, unify and comprehensively capture customer profiles\n", + "using a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n", + "360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\n", + "as call center recordings. 
By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n", + "360° data models and access more than 1,500 attributes to gain a deeper understanding of customer segments.\n", + "\n", + "With Persona 360, you can:\n", + "\n", + "**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n", + "1,695+ attributes and segmentsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
e86d61fb5ece85469f5408d595d3ab26[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\n", + "insurance companies. It enables them to complete, unify and comprehensively capture customer profiles\n", + "using a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n", + "360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\n", + "as call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n", + "360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n", + "\n", + "With Persona 360, you can:\n", + "\n", + "**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n", + "1,695+ attributes and segments\n", + "\n", + "**•** Seamlessly connect the workflows of data scientists (via Databricks) and marketing specialists (via\n", + "Persona 360), making it easy for data experts to incorporate their findings and enabling nontechnical\n", + "users to comprehend and activate the data\n", + "\n", + "**•** Leverage tools that can increase engagement by 37% and conversion rates by 45% through\n", + "personalized campaigns\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[N E X T B E S T O F F E R](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n", + "\n", + "**Demand for hyper-personalized and real-time risk protection**\n", + "\n", + "\n", + "**requires broad adoption of artificial** **intelligence (AI), machine**\n", + "\n", + "**learning and digital platforms.**\n", + "\n", + "**EY,** [”Nine customer types defining the next wave of insurance”](https://www.ey.com/en_us/insurance/nine-customer-types-defining-the-next-wave-of-insurance)\n", + "\n", + "\n", + "**[C U S T O M E R L I F E T I M E VA L U E (C LT V )](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "**[C U S T O M E R S E G M E N TAT I O N](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n", + "\n", + "\n", + "[The Impact of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[Analytics and AI](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[on the Future of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[Insurance](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "\n", + "\n", + "**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n", + "\n", + "**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n", + "**by insurance provider type**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
d598d42d52ff0953c4525c8a65fd365b**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n", + "\n", + "**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n", + "**by insurance provider type**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Product distribution Personalization Given the volume of data required, the complexity of operating AI from experiments (POCs) to enterprise scale data pipelines, combined with strict data and privacy regulations on the use of customer data on cloud infrastructure, the Lakehouse has quickly emerged as the strategic platform to accelerate digital transformation.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Next best offer Customers have different needs at each stage of the buyer journey. Choose the right recommender model for your scenario to find the next best action at any given point in the customer journey.|||||\n", + "|Customer Analyzing customer lifetime value is critical to improving marketing decision-making, campaign ROI and lifetime value customer retention. Learn how to identify your most valuable customers with Databricks’ Customer Lifetime Value Solution Accelerator.|||||\n", + "|Churn prediction Earning loyalty and getting the largest number of customers to stick around is something that is in your best interest as well as your customers’ best interest. Develop an understanding of how a customer lifetime should progress and examine where in that lifetime journey customers are likely to churn so you can effectively manage retention and reduce your churn rate.|||||\n", + "|Customer Personalization is touted as the gold standard of customer engagement. Using sales data, campaigns segmentation and promotions systems, this solution helps you create advanced customer segments to drive better purchasing predictions based on behaviors.|||||\n", + "|Reputation Harness the Databricks Lakehouse Platform to build a risk engine that can analyze customer feedback management securely and in realtime to power an early assessment of reputation risks.|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "|Anomaly detection and fraudulent claims Anomaly Anomaly detection is the technique of identifying rare events or observations which can raise suspicions detection by being statistically different from the rest of the observations.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Fraudulent A large-scale fraud prevention system is usually a complex ecosystem made of various controls (all with claims critical SLAs), a mix of traditional rules and AI and a patchwork of technologies between proprietary on- premises systems and open source cloud technologies.|||||\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. 
|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Underwriting automation Machine learning provides a decision support system for underwriting processes to help you improve your underwriting outcomes.|||||\n", + "|Actuarial workbench You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine Learning (ML) for underwriting, claims forecasting, etc.|||||\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
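The matrix rows above describe anomaly detection as flagging observations that are statistically different from the rest and feeding them into fraud triage. A minimal sketch of that idea on tabular claims features using scikit-learn's IsolationForest; the feature layout, synthetic data and contamination rate are illustrative assumptions, not values from the eBook.

```python
import numpy as np
from sklearn.ensemble import IsolationForest

rng = np.random.default_rng(42)

# Synthetic claim features: [claim_amount, days_to_report, claimant_age].
normal_claims = rng.normal(loc=[5_000, 10, 45], scale=[1_500, 4, 12], size=(500, 3))
odd_claims = np.array([[60_000, 1, 22], [48_000, 90, 30]])  # unusually large / oddly timed
claims = np.vstack([normal_claims, odd_claims])

# IsolationForest isolates points that are easy to separate, i.e., statistical outliers.
detector = IsolationForest(contamination=0.01, random_state=42).fit(claims)
labels = detector.predict(claims)  # -1 = anomaly, 1 = normal

print("Flagged claims:\n", claims[labels == -1])
```

In a production fraud pipeline, flagged claims would be routed through the rules engine and case-management workflow described earlier rather than acted on directly.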
26d91674b02626bd8b4b6427126750b0|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Underwriting Machine learning provides a decision support system for underwriting processes to help you improve your automation underwriting outcomes.|||||\n", + "|Actuarial You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine workbench Learning (ML) for underwriting, claims forecasting, etc.|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "|Claims transformation Anomaly detection Preempt fraud with rule-based patterns and select ML algorithms for reliable fraud detection. Use and claims fraud anomaly detection and fraud prediction to respond to bad actors rapidly.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Car claims image By applying transfer learning on pre-trained neural networks, Databricks helps insurance companies classification kickstart their AI/computer vision journeys toward claim assessment and damage estimation.|||||\n", + "|Claims automation Insurers are entering a new era of claims transformation, supported by evolving technological advancement and growing data availability. You can simplify and scale your claims lifecycle with data and AI.|||||\n", + "|Medical claims Using advanced natural language processing, you can extract text from medical records and enable automation.|||||\n", + "|Guidewire claims Data ingestion enabler for distributed ledger technology that has predefined schemas and mapping to/ center data from Guidewire data format. integration|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "## Conclusion\n", + "\n", + "Today, data and AI are at the center of every innovation in the insurance industry. Databricks Lakehouse for\n", + "Insurance empowers insurance providers to leverage the potential of data and analytics to address strategic\n", + "challenges, make informed decisions, mitigate risks, enhance customer experiences, and accelerate innovation.\n", + "\n", + "**Customers that innovate with Databricks Lakehouse for Insurance**\n", + "\n", + "Some of the top property and casualty, life and health insurance companies and reinsurers in the world turn\n", + "to Databricks Lakehouse to harness the power of data and analytics to solve strategic challenges and make\n", + "smarter decisions that minimize risk, deliver superior customer experiences and fast-track innovation.\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000 organizations worldwide — including\n", + "\n", + "Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco,\n", + "\n", + "with offices around the globe. Founded by the original creators of Apache Spark ™ , Delta\n", + "\n", + "Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest\n", + "\n", + "problems. 
To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , LinkedIn and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "#### Begin your journey with a free trial of Databricks Lakehouse for Insurance and start developing advanced data and AI applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n", + "\n", + "###### Contact us for a personalized demo at:\n", + " dbricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
77fa3ca534959648d7a8e5eebca4d12e**eBook**\n", + "\n", + "# Making Your Digital Twin Come to Life\n", + "\n", + "##### With the Lakehouse for Manufacturing and Tredence\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "Introduction **03**\n", + "\n", + "Digital Twins Bring Broad Benefits to Manufacturing **05**\n", + "\n", + "What Are Digital Twins? **07**\n", + "\n", + "Digital Twin Architectures **08**\n", + "\n", + "How to Build a Digital Twin **09**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
10392cc0d1b6c4e31a30c959626d4c63How to Build a Digital Twin **09**\n", + "\n", + "Why Is Manufacturing Struggling With Data and AI? **12**\n", + "\n", + "Why Databricks for Digital Twins? **13**\n", + "\n", + "Why Tredence for Digital Twins? **14**\n", + "\n", + "Using Digital Twins to Drive Insights **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "### IntroductionSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
ed0b74c51c64e6fd2c535c1bd5dafb1aUsing Digital Twins to Drive Insights .............................................................................................................................................................. **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "The concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\n", + "over 25 years ago, during the early phases of foundation and cofferdam construction for the\n", + "London Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\n", + "the years since this first application, edge computing, AI, data connectivity, 5G connectivity\n", + "and the improvements of the Internet of Things (IoT) have enabled digital twins to become\n", + "cost-effective and are now an imperative in today’s data-driven businesses.\n", + "\n", + "Today’s manufacturing industries are expected to streamline and optimize all the processes in their value\n", + "chain from product development and design, through operations and supply chain optimization to obtaining\n", + "feedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\n", + "and is addressing a multitude of challenges within manufacturing, logistics and transportation.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
33042520bb456fb0730d8ed53528a953-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "The concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\n", + "over 25 years ago, during the early phases of foundation and cofferdam construction for the\n", + "London Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\n", + "the years since this first application, edge computing, AI, data connectivity, 5G connectivity\n", + "and the improvements of the Internet of Things (IoT) have enabled digital twins to become\n", + "cost-effective and are now an imperative in today’s data-driven businesses.\n", + "\n", + "Today’s manufacturing industries are expected to streamline and optimize all the processes in their value\n", + "chain from product development and design, through operations and supply chain optimization to obtaining\n", + "feedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\n", + "and is addressing a multitude of challenges within manufacturing, logistics and transportation.\n", + "\n", + "\n", + "[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[“profit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[approximately 10 hours.”](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
32450e347d08b2ca314b2a9bc96b9a6e**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 10%\n", + "\n", + "\n", + "**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 50%\n", + "\n", + "\n", + "**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "\n", + "**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 25%\n", + "\n", + "\n", + "**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Introduction (continued)**\n", + "\n", + "\n", + "**Digital twin market growth rate accelerates**\n", + "\n", + "Digital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\n", + "is forecasted to reach $48 billion in 2026. 
This figure is up from $3.1 billion in 2020\n", + "at a CAGR of 58%, riding on the wave of Industry 4.0.\n", + "\n", + "\n", + "**But challenges remain**\n", + "\n", + "The most common challenges faced by the manufacturing industry that digital\n", + "twins are addressing include:\n", + "\n", + "**•** Product designs are more complex, resulting in higher cost and increasingly\n", + "longer development times\n", + "\n", + "**•** The supply chain is opaque\n", + "\n", + "**•** Production lines are not optimized – performance variations, unknown defects\n", + "and the projection of operating cost is obscure\n", + "\n", + "**•** Poor quality management – overreliance on theory, managed by\n", + "individual departments\n", + "\n", + "**•** Reactive maintenance costs are too high, resulting in excessive downtime or\n", + "process disruptions\n", + "\n", + "**•** Incongruous collaborations between departments\n", + "\n", + "**•** Invisibility of customer demand for gathering real-time feedback\n", + "\n", + "\n", + "The growth rate for digital twins is staggering, with adoption commonly reported\n", + "at a 25-40% CAGR.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twins Bring Broad Benefits to ManufacturingSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
929aec8a6e41f875b04a8fd58c7e9553**But challenges remain**\n", + "\n", + "The most common challenges faced by the manufacturing industry that digital\n", + "twins are addressing include:\n", + "\n", + "**•** Product designs are more complex, resulting in higher cost and increasingly\n", + "longer development times\n", + "\n", + "**•** The supply chain is opaque\n", + "\n", + "**•** Production lines are not optimized – performance variations, unknown defects\n", + "and the projection of operating cost is obscure\n", + "\n", + "**•** Poor quality management – overreliance on theory, managed by\n", + "individual departments\n", + "\n", + "**•** Reactive maintenance costs are too high, resulting in excessive downtime or\n", + "process disruptions\n", + "\n", + "**•** Incongruous collaborations between departments\n", + "\n", + "**•** Invisibility of customer demand for gathering real-time feedback\n", + "\n", + "\n", + "The growth rate for digital twins is staggering with common adoption reported\n", + "to be in the 25-40% CAGR growth rate.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twins Bring Broad Benefits to Manufacturing\n", + "\n", + "Industry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\n", + "would have come at significant costs without digital twin technology.\n", + "\n", + "**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**\n", + "\n", + "\n", + "\n", + "**•** Product design and development is performed with\n", + "less cost and is completed in less time as iterative\n", + "simulations, using multiple constraints, deliver the\n", + "best or most optimized design. All commercial\n", + "aircraft are designed using digital twins.\n", + "\n", + "**•** Digital twins provide the awareness of how long\n", + "inventory will last, when to replenish and how to\n", + "minimize the supply chain disruptions. The oil and gas\n", + "industry, for example, uses supply chain–oriented\n", + "digital twins to reduce supply chain bottlenecks in\n", + "storage and midstream delivery, schedule tanker\n", + "off-loads and model demand with externalities.\n", + "\n", + "\n", + "\n", + "**•** Continuous quality checks on produced items\n", + "with ML/AI generated feedback pre-emptively\n", + "assuring improved product quality. Final paint\n", + "inspection in the automotive industry, for example,\n", + "is performed with computer vision built on top of\n", + "digital twin technology.\n", + "\n", + "**•** Striking the sweet spot between when to replace\n", + "a part before the process degrades or breaks\n", + "down and utilizing the components to their fullest,\n", + "digital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\n", + "building an asset performance management suite.\n", + "\n", + "\n", + "\n", + "**•** Digital twins create the opportunity to have\n", + "multiple departments in sync by providing\n", + "necessary instructions modularly to attain\n", + "a required throughput. 
Digital twins are the\n", + "backbone of kaizen events that optimize\n", + "manufacturing process flow.\n", + "\n", + "**•** Customer feedback loops can be modeled through\n", + "inputs, from point of sale customer behavior,\n", + "buying preferences, or product performance and\n", + "then integrated into the product development\n", + "process, forming a closed loop providing an\n", + "improved product design.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\n", + "\n", + "The top four use cases are heavily focused on operational processes and are typically the first to be deployed\n", + "in manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\n", + "deployment, but typically offer higher and longer-lasting value.\n", + "\n", + "**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\n", + "\n", + "\n", + "Improve product quality\n", + "\n", + "Reduce manufacturing costs\n", + "\n", + "Reduce unplanned downtime\n", + "\n", + "Increase throughput\n", + "\n", + "Ensure safe manufacturing\n", + "\n", + "Test new design ideas\n", + "\n", + "Develop product enhancements\n", + "\n", + "Digital transformation of enterprise\n", + "\n", + "Speed new product introduction\n", + "\n", + "Reduce planned downtime\n", + "\n", + "Meet new regulatory challenges\n", + "\n", + "Training for new manufacturing processes\n", + "\n", + "Design changes to production line\n", + "\n", + "Provide service to end users customers\n", + "\n", + "Update products in the field\n", + "\n", + "\n", + "**34%**\n", + "\n", + "\n", + "**30%**\n", + "\n", + "**28%**\n", + "**25%**\n", + "\n", + "**24%**\n", + "\n", + "\n", + "**16%**\n", + "\n", + "**14%**\n", + "\n", + "**13%**\n", + "\n", + "**13%**\n", + "\n", + "**11%**\n", + "**10%**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
c743afeca2a4f67e2f6fcc8b2a07bc10Improve product quality\n", + "\n", + "Reduce manufacturing costs\n", + "\n", + "Reduce unplanned downtime\n", + "\n", + "Increase throughput\n", + "\n", + "Ensure safe manufacturing\n", + "\n", + "Test new design ideas\n", + "\n", + "Develop product enhancements\n", + "\n", + "Digital transformation of enterprise\n", + "\n", + "Speed new product introduction\n", + "\n", + "Reduce planned downtime\n", + "\n", + "Meet new regulatory challenges\n", + "\n", + "Training for new manufacturing processes\n", + "\n", + "Design changes to production line\n", + "\n", + "Provide service to end users customers\n", + "\n", + "Update products in the field\n", + "\n", + "\n", + "**34%**\n", + "\n", + "\n", + "**30%**\n", + "\n", + "**28%**\n", + "**25%**\n", + "\n", + "**24%**\n", + "\n", + "\n", + "**16%**\n", + "\n", + "**14%**\n", + "\n", + "**13%**\n", + "\n", + "**13%**\n", + "\n", + "**11%**\n", + "**10%**\n", + "\n", + "**8%**\n", + "**8%**\n", + "\n", + "\n", + "Can you imagine the cost to change\n", + "an oil refinery’s crude distillation\n", + "unit process conditions to improve\n", + "the output of diesel one week\n", + "and gasoline the next to address\n", + "changes in demand and ensure\n", + "maximum economic value? Can you\n", + "imagine how to replicate an even\n", + "simple supply chain to model risk?\n", + "\n", + "\n", + "**5%**\n", + "\n", + "\n", + "**1%**\n", + "\n", + "\n", + "-----\n", + "\n", + "### What Are Digital Twins?\n", + "\n", + "\n", + "Knowing the business challenges and benefits digital twins deliver, let’s turn to\n", + "the basics and explore what digital twins are and how a modern data stack is\n", + "necessary to build effective and timely digital twins. The classic definition of\n", + "digital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .”\n", + "\n", + "\n", + "For a discrete or continuous manufacturing process, a digital twin gathers system\n", + "and processes state data with the help of various IoT sensors [operational\n", + "technology data (OT)] and enterprise data [informational technology (IT)] to form a\n", + "virtual model which is then used to run simulations, study performance issues and\n", + "generate possible insights.\n", + "\n", + "\n", + "**Types of Digital Twins**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twin Architectures\n", + "\n", + "Classic digital twins have been physics-based models of specific systems. 
More recently,\n", + "**data-driven digital twins, which work on the real-time system data, are gaining prominence.**\n", + "\n", + "\n", + "These twins provide the opportunity to not just monitor and simulate system performance under specific\n", + "conditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\n", + "the industrial environment.\n", + "\n", + "Digital twins undergo a series of changes during their lifecycle to become completely autonomous.\n", + "\n", + "**Data-Driven Operational Digital Twins: Maturity Journey**\n", + "\n", + "Simulate & Optimize (AI)\n", + "\n", + "**[Digital twins have reduced automotive product design lifecycle from 6-8 years to 18-24 months](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
2028d1c3a99d0f0ed0da57cd872f75fa# 6-8 18-24\n", + "## years to months\n", + "\n", + "\n", + "**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "\n", + "\n", + "**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "**[using a digital twin, which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "\n", + "# 20% to 25%\n", + "\n", + "\n", + "**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "**[using a digital twin, which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "\n", + "\n", + "Identify next best action and\n", + "integrate with actuation systems\n", + "\n", + "\n", + "**IoT**\n", + "\n", + "**Edge/**\n", + "**Cloud**\n", + "\n", + "\n", + "**Digital Twins**\n", + "\n", + "**ERP**\n", + "\n", + "\n", + "Predict & Diagnose\n", + "\n", + "|Col1|I i|\n", + "|---|---|\n", + "\n", + "\n", + "\n", + "Predictive maintenance, process\n", + "improvements and Root Causing\n", + "\n", + "\n", + "Monitor & Alert\n", + "\n", + "|Col1|P i|\n", + "|---|---|\n", + "\n", + "\n", + "Real-time operations monitoring\n", + "and alerting\n", + "\n", + "\n", + "-----\n", + "\n", + "### How to Build a Digital Twin\n", + "\n", + "\n", + "A data architecture capability is needed to capture\n", + "and collect the ever-expanding volume and variety\n", + "of data streaming in real time from example\n", + "protocols, such as ABB Total Flow, Allen Bradley,\n", + "Emerson, Fanuc, GE, Hitachi and Mitsubishi.\n", + "\n", + "\n", + "Data collection, data analytics, application\n", + "enablement and data integration orchestrate the\n", + "time-series data stream and transfer to the cloud.\n", + "Azure IoT Hub is used to securely ingest data from\n", + "edge to cloud.\n", + "\n", + "\n", + "Cloud infrastructure and analytics capabilities are\n", + "offered within the flexibility of the cloud. Azure\n", + "Digital Twin is used to model and visualize process\n", + "workflows. 
Databricks MLflow and Delta Lake scale to deliver real-time predictive analytics.

-----

**How to Build a Digital Twin (continued)**

**Digital Twins: Technical Architecture** (architecture diagram)

-----

**How to Build a Digital Twin (continued)**

**Building a digital twin doesn't have to be a daunting task. Below are some simple steps:**

**System and use case discovery and blueprinting**

- Identify priority plant processes and systems to model, with focused use cases (e.g., asset maintenance, energy management, process monitoring/optimization, etc.)
- Develop a validated process outline, blueprint and key performance indicators
- Develop a set of process variables, control variables and manipulated variables
- Design control loop
- Validate and document process and asset FMEA for all assets and sub-systems

**Technology infrastructure requirements**

- Technical edge infrastructure onsite — to sense, collect and transmit real-time information
- Clean, reliable data availability in the cloud
- Data processing and analytics platform — to design, develop and implement solutions
- Stream processing and deployment of models for predictions and soft sensing (see the sketch after these steps)

**Visualization delivered**

- Information communication — visual representation of the digital twin along with remote controlling functions (e.g., Power BI dashboards, time series insights, web app-based digital twin portals)
- Closed-loop feedback — to send the insights and actions back to form a closed loop; Azure Event Grid and Event Hubs, with a connection from IoT Hub to Azure IoT Edge devices and control systems, are used
- Edge platform to orchestrate the data, insights and actions between the cloud and site IT systems
- Cloud to edge integration — to enable seamless monitoring, alerting and integration with plant OT/IT systems
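To make the stream processing requirement above concrete, here is a minimal PySpark sketch of landing time-series sensor readings in a Delta table and scoring them with a model tracked in MLflow. The Event Hubs endpoint, topic, schema, checkpoint path, table and model names are hypothetical placeholders, and authentication options are omitted; treat it as an illustration of the pattern, not the reference implementation.

```python
# Minimal sketch: stream OT sensor readings into Delta and score them with an MLflow model.
# Assumes a Databricks/Spark session named `spark`; endpoint, topic, schema, paths and
# model name are illustrative placeholders (auth options omitted for brevity).
from pyspark.sql import functions as F, types as T
import mlflow

sensor_schema = T.StructType([
    T.StructField("asset_id", T.StringType()),
    T.StructField("ts", T.TimestampType()),
    T.StructField("temperature", T.DoubleType()),
    T.StructField("vibration", T.DoubleType()),
])

raw = (
    spark.readStream.format("kafka")  # Azure IoT Hub / Event Hubs expose a Kafka-compatible endpoint
    .option("kafka.bootstrap.servers", "<namespace>.servicebus.windows.net:9093")
    .option("subscribe", "sensor-telemetry")
    .load()
)

# Parse the JSON payload into typed columns.
readings = raw.select(
    F.from_json(F.col("value").cast("string"), sensor_schema).alias("r")
).select("r.*")

# Land the raw stream in a Delta table for downstream analytics and twin updates.
(
    readings.writeStream.format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/sensor_bronze")
    .toTable("digital_twin.sensor_bronze")
)

# Score readings with a registered model (e.g., an anomaly detector); the scored stream
# could be written to another Delta table in the same way as above.
predict = mlflow.pyfunc.spark_udf(spark, "models:/asset_anomaly_detector/Production")
scored = readings.withColumn("anomaly_score", predict(F.struct("temperature", "vibration")))
```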
-----

### Why Is Manufacturing Struggling With Data and AI?

| Challenge | Root Cause | Goal |
|---|---|---|
| Siloed data across the value chain | Siloed data from systems designed for on-premises 30 years ago | Aggregate high volumes and velocities of structured and unstructured data to power predictive analytics (e.g., images, IoT, ERP/SCM) |
| Unable to scale enterprise data sets | Legacy architectures such as data historians that can't handle semi-structured or unstructured data | Data architectures that scale for TBs/PBs of enterprise IT and OT data |
| Lack real-time insights | Batch-oriented data transfer | Address manufacturing issues or track granular supply chain issues in the real world |
| Can't meet intellectual property requirements | Systems that do not establish data lineage | Data lineage established across organizational silos and disjointed workflows |

### Data architecture is the root cause of this struggle.

-----

### Why Databricks for Digital Twins?

Lakehouse for Manufacturing's simple, open and collaborative data platform consolidates and enhances data from across the organization and turns it into accessible, actionable insights. Scalable machine learning powers digital twins with predictive insights across the value chain, from product development to optimizing operations to building agile supply chains to robust customer insights.

"The Databricks open Lakehouse Platform has shown time and again that it is the foundational enabling technology to power digital twins for manufacturing. But the real power is the Databricks partnership with Tredence that speeds implementation for tailored use cases that deliver superior ROI in less time."
**Dr. Bala Amavasai**, Manufacturing CTO, Databricks

**Supports Real-Time Decisions**

Lakehouse for Manufacturing leverages any enterprise data source — from business-critical ERP data to edge sensor data — in one integrated platform, making it easy to automate and secure data with fast, real-time performance.

**Faster and More Accurate Analysis**

The true benefits of digital twins are not the business intelligence dashboards, but machine learning insights generated from incorporating real-time data.
Scalable and shareable\n", + "notebook-based machine learning\n", + "accelerates ROI.\n", + "\n", + "\n", + "**Open Data Sharing**\n", + "**and Collaboration**\n", + "\n", + "Drive stronger customer insights\n", + "and greater service with partners\n", + "leveraging open and secure\n", + "data collaboration between\n", + "departments or your supply chain\n", + "delivering faster ROI.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Why Tredence for Digital Twins?\n", + "\n", + "\n", + "Over the last few years, Tredence’s unique Manufacturing and Supply Chain practice has coupled functional\n", + "expertise with cutting-edge AI-driven solutions to create measurable business impact for their customers.\n", + "Now, Tredence’s partnership with Databricks is all set to unlock the power of real-time analytics and actions, to\n", + "further strengthen their ‘’last mile impact’’ vision.\n", + "\n", + "\n", + "Tredence is excited to\n", + "\n", + "co-innovate with Databricks to\n", + "\n", + "deliver the solutions required for\n", + "\n", + "enterprises to create digital twins\n", + "\n", + "from the ground up and implement\n", + "\n", + "them swiftly to maximize their ROI.\n", + "\n", + "Our partnership enables clients to\n", + "\n", + "get the most out of Tredence’s data\n", + "\n", + "science capabilities to build decision\n", + "\n", + "intelligence around manufacturing\n", + "\n", + "processes and Databricks’\n", + "\n", + "Lakehouse Platform to realize the full\n", + "\n", + "promise of digital twins.”\n", + "\n", + "**Naresh Agarwal** ,\n", + "\n", + "Head of Industrials, Tredence\n", + "\n", + "\n", + "**Global Reach**\n", + "\n", + "Tredence offers a global team with\n", + "the subject matter expertise that\n", + "delivers practitioner and useroriented solutions to identify\n", + "and solve for challenges in\n", + "digital transformation design\n", + "and implementation.\n", + "\n", + "\n", + "**Purpose-Built Solutions**\n", + "\n", + "Adopt contextual edge to cloud,\n", + "purpose-built AIoT solutions\n", + "that unify your ecosystems with\n", + "connected insights and enhance\n", + "productivity, while enabling\n", + "efficient cost structures.\n", + "\n", + "\n", + "**Focused Dedication**\n", + "\n", + "A dedicated centre of excellence\n", + "(CoE) for AIoT and smart\n", + "manufacturing solutions —\n", + "serving the entire manufacturing\n", + "value chain from product\n", + "development to manufacturing and\n", + "downstream operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Using Digital Twins to Drive Insights\n", + "\n", + "\n", + "**Use Case**\n", + "\n", + "**Predictive Maintenance**\n", + "\n", + "- \u0007Rolls-Royce sought to use real-time\n", + "engine data to reduce unplanned\n", + "maintenance and downtimeSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
d56f561c8a16d9a60b6e5861216c425bLakehouse Platform to realize the full\n", + "\n", + "promise of digital twins.”\n", + "\n", + "**Naresh Agarwal** ,\n", + "\n", + "Head of Industrials, Tredence\n", + "\n", + "\n", + "**Global Reach**\n", + "\n", + "Tredence offers a global team with\n", + "the subject matter expertise that\n", + "delivers practitioner and useroriented solutions to identify\n", + "and solve for challenges in\n", + "digital transformation design\n", + "and implementation.\n", + "\n", + "\n", + "**Purpose-Built Solutions**\n", + "\n", + "Adopt contextual edge to cloud,\n", + "purpose-built AIoT solutions\n", + "that unify your ecosystems with\n", + "connected insights and enhance\n", + "productivity, while enabling\n", + "efficient cost structures.\n", + "\n", + "\n", + "**Focused Dedication**\n", + "\n", + "A dedicated centre of excellence\n", + "(CoE) for AIoT and smart\n", + "manufacturing solutions —\n", + "serving the entire manufacturing\n", + "value chain from product\n", + "development to manufacturing and\n", + "downstream operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Using Digital Twins to Drive Insights\n", + "\n", + "\n", + "**Use Case**\n", + "\n", + "**Predictive Maintenance**\n", + "\n", + "- \u0007Rolls-Royce sought to use real-time\n", + "engine data to reduce unplanned\n", + "maintenance and downtime\n", + "\n", + "- \u0007Legacy systems were unable to\n", + "scale data ingestion of engine\n", + "sensor data in real time for ML\n", + "\n", + "**Impact**\n", + "\n", + "\n", + "**Why Databricks?**\n", + "\n", + "- \u0007The Lakehouse Platform on Azure unifies in-flight data\n", + "streams with external environmental conditions data to\n", + "predict engine performance issues\n", + "\n", + "- \u0007Delta Lake underpins ETL pipelines that feed ML workloads\n", + "across use cases\n", + "\n", + "- \u0007MLflow speeds deployment of new models and reduces\n", + "incidents of grounded planes\n", + "\n", + "\n", + "Rolls-Royce uses Databricks\n", + "to drive insights around predictive\n", + "maintenance, improving\n", + "airframe reliability and reducing\n", + "carbon emissions.\n", + "\n", + "\n", + "#### 22 million tons\n", + "of carbon emissions saved\n", + "\n", + "\n", + "#### 5% reduction\n", + "in unplanned airplane groundings\n", + "\n", + "\n", + "#### Millions of pounds\n", + "in inventory cost savings from a 50%\n", + "improvement in maintenance efficiency\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé\n", + "\n", + "Nast, Acosta and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data,\n", + "\n", + "analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the\n", + "\n", + "original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. 
To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "###### Get started with a free trial of Databricks and start building data applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n", + "\n", + "To learn more, visit us at:\n", + "\n", + "**[databricks.com/manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
4da384183bd03d8a10274cfeaf813719### eBook\n", + "\n", + "# A New Approach to Data Sharing\n", + "\n", + "#### Open data sharing and collaboration for data, analytics, and AI\n", + "\n", + "### Second Edition\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents Introduction — Data Sharing in Today’s Digital Economy 4\n", + "\n", + "**Chapter 1: What Is Data Sharing and Why Is It Important?** **5**\n", + "\n", + "Common data sharing use cases 6\n", + "\n", + "Data monetization 6\n", + "\n", + "Data sharing with partners or suppliers (B2B) 6\n", + "\n", + "Internal lines of business (LOBs) sharing 6\n", + "\n", + "Key benefits of data sharing 7\n", + "\n", + "**Chapter 2: Conventional Methods of Data Sharing and Their Challenges** **8**\n", + "\n", + "Legacy and homegrown solutions 9\n", + "\n", + "Proprietary vendor solutions 11\n", + "\n", + "Cloud object storage 13\n", + "\n", + "**Chapter 3: Delta Sharing — An Open Standard for Secure Sharing of Data Assets** **14**\n", + "\n", + "What is Delta Sharing? 14\n", + "\n", + "Key benefits of Delta Sharing 16\n", + "\n", + "Maximizing value of data with Delta Sharing 18\n", + "\n", + "Data monetization with Delta Sharing 19\n", + "\n", + "B2B sharing with Delta Sharing 21\n", + "\n", + "Internal data sharing with Delta Sharing 23\n", + "\n", + "**Chapter 4: How Delta Sharing Works** **26**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Chapter 5: Introducing Databricks Marketplace** **28**\n", + "## Contents\n", + "\n", + "What is Databricks Marketplace? 30\n", + "\n", + "Key benefits of Databricks Marketplace 30\n", + "\n", + "Enable collaboration and accelerate innovation 32\n", + "\n", + "Powered by a fast, growing ecosystem 32\n", + "\n", + "Use cases for an open marketplace 32\n", + "\n", + "New upcoming feature: AI model sharing 33\n", + "\n", + "**Chapter 6: Share securely with Databricks Clean Rooms** **34**\n", + "\n", + "What is a data clean room? 34\n", + "\n", + "Common data clean room use cases 36\n", + "\n", + "Shortcomings of existing data clean rooms 38\n", + "\n", + "Key benefits of Databricks Clean Rooms 39\n", + "\n", + "**Resources: Getting started with Data Sharing and Collaboration** **40**\n", + "\n", + "**About the Authors** **42**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + " Data Sharing in Today’s Digital Economy\n", + "\n", + "\n", + "Today’s economy revolves around data. Everyday, more and more\n", + "\n", + "organizations must exchange data with their customers, suppliers\n", + "\n", + "and partners. Security is critical. And yet, efficiency and immediate\n", + "\n", + "accessibility are equally important.\n", + "\n", + "Where data sharing may have been considered optional, it’s now\n", + "\n", + "required. More organizations are investing in streamlining internal\n", + "\n", + "and external data sharing across the value chain. But they still face\n", + "\n", + "major roadblocks — from human inhibition to legacy solutions to\n", + "\n", + "vendor lock-in.\n", + "\n", + "To be truly data-driven, organizations need a better way to share\n", + "\n", + "data. 
[Gartner predicts that by 2024](https://www.gartner.com/en/documents/3999501), organizations that promote data sharing will outperform their peers on most business value metrics. In addition, Gartner recently found that Chief Data Officers who have successfully executed data sharing initiatives are 1.7x more effective in showing business value and return on investment from their data analytics strategy.

To compete in the digital economy, organizations need an open — and secure — approach to data sharing.

This eBook takes a deep dive into the modern era of data sharing and collaboration, from common use cases and key benefits to conventional approaches and the challenges of those methods. You'll get an overview of our open approach to data sharing and find out how Databricks allows you to share your data across platforms, to share all your data and AI, and to share all your data securely with unified governance in a privacy-safe way.

-----

## Chapter 1: What Is Data Sharing and Why Is It Important?

Data sharing is the ability to make the same data available to one or many stakeholders — both external and internal. Nowadays, the ever-growing amount of data has become a strategic asset for any company. Data sharing — within your organization or externally — is an enabling technology for data commercialization and enhanced analysis. Sharing data as well as consuming data from external sources allows companies to collaborate with partners, establish new partnerships and generate new revenue streams with data monetization. Data sharing can deliver benefits to business groups across the enterprise. For those business groups, data sharing can enable access to data needed to make critical decisions. This includes but is not limited to roles such as the data analyst, data scientist and data engineer.

-----

#### Common data sharing use cases

#### Data monetization

Companies across industries are commercializing data. Large multinational organizations have formed exclusively to monetize data, while other organizations are looking for ways to monetize their data and generate additional revenue streams. Examples of these companies can range from an agency with an identity graph to a telecommunication company with proprietary 5G data or to retailers that have a unique ability to combine online and offline data. Data vendors are growing in importance as companies realize they need external data for better decision-making.

#### Data sharing with partners or suppliers (B2B)

Many companies now strive to share data with partners and suppliers as similarly as they share it across their own organizations.
For example, retailers and their suppliers continue to work more closely together as they seek to keep their products moving in an era of ever-changing consumer tastes. Retailers can keep suppliers posted by sharing sales data by SKU in real time, while suppliers can share real-time inventory data with retailers so they know what to expect. Scientific research organizations can make their data available to pharmaceutical companies engaged in drug discovery. Public safety agencies can provide real-time public data feeds of environmental data, such as climate change statistics or updates on potential volcanic eruptions.

#### Internal lines of business (LOBs) sharing

Within any company, different departments, lines of business and subsidiaries seek to share data so everyone can make decisions based on a complete view of the current business reality. For example, finance and HR departments need to share data as they analyze the true costs of each employee. Marketing and sales teams need a common view of data to determine the effectiveness of recent marketing campaigns. And different subsidiaries of the same company need a unified view of the health of the business. Removing data silos — which are often established for the important purpose of preventing unauthorized access to data — is critical for digital transformation initiatives and maximizing the business value of data.

-----

#### Key benefits of data sharing

As you can see from the use cases described above, there are many benefits of data sharing, including:

**Greater collaboration with existing partners.** In today's hyper-connected digital economy, no single organization can advance its business objectives without partnerships. Data sharing helps solidify existing partnerships and can help organizations establish new ones.

**Ability to generate new revenue streams.** With data sharing, organizations can generate new revenue streams by offering data products or data services to their end consumers.

**Ease of producing new products, services or business models.** Product teams can leverage both first-party data and third-party data to refine their products and services and expand their product/service catalog.

**Greater efficiency of internal operations.** Teams across the organization can meet their business goals far more quickly when they don't have to spend time figuring out how to free data from silos. When teams have access to live data, there's no lag time between the need for data and the connection with the appropriate data source.

-----

## Chapter 2: Conventional Methods of Data Sharing and Their Challenges

Sharing data across different platforms, companies and clouds is no easy task. In the past, organizations have hesitated to share data more freely because of the perceived lack of secure technology, competitive concerns and the cost of implementing data sharing solutions.

Even for companies that have the budget to implement data sharing technology, many of the current approaches can't keep up with today's requirements for open-format, multi-cloud, high-performance solutions.
Most data sharing solutions are tied to a single vendor, which creates friction for data providers and data consumers who use non-compatible platforms.

Over the past 30 years, data sharing solutions have come in three forms: legacy and homegrown solutions, cloud object storage and closed source commercial solutions. Each of these approaches comes with its pros and cons.

-----

#### Legacy and homegrown solutions

Many companies have built homegrown data sharing solutions based on legacy technologies such as email, (S)FTP or APIs.

**Figure 1:** Legacy data sharing solutions (diagram: the provider runs ETL to push batch extracts of its tables to an FTP/SSH/API server; the consumer runs its own ETL to load them into a database before an analyst can run analysis)

**Pros**

- **Vendor agnostic.** FTP, email and APIs are all well-documented protocols. Data consumers can leverage a suite of clients to access data provided to them.
- **Flexibility.** Many homegrown solutions are built on open source technologies and will work both on-prem and on clouds.

**Cons**

- **Data movement.** It takes significant effort to extract data from cloud storage, transform it and host it on an FTP server for different recipients. Additionally, this approach results in creating copies of data sets. Data copying causes duplication and prevents organizations from instantly accessing live data.
- **Complexity of sharing data.** Homegrown solutions are typically built on complex architectures due to replication and provisioning. This can add considerable time to data sharing activities and result in out-of-date data for end consumers.
- **Operational overhead for data recipients.** Data recipients have to extract, transform and load (ETL) the shared data for their end use cases, which further delays the time to insights. For any new data updates from the providers, the consumers have to rerun ETL pipelines again and again.
- **Security and governance.** As modern data requirements become more stringent, homegrown and legacy technologies have become more difficult to secure and govern.
- **Scalability.** Such solutions are costly to manage and maintain and don't scale to accommodate large data sets.

-----

#### Proprietary vendor solutions

Commercial data sharing solutions are a popular option among companies that don't want to devote the time and resources to building an in-house solution yet also want more control than what cloud object storage can offer.

**Figure 2:** Proprietary vendor solutions (diagram: each vendor platform holds shared data sets in its own proprietary format; providers and consumers can share only with recipients on the same platform, with no cross-platform sharing)

**Pros**

- **Simplicity.** Commercial solutions allow users to share data easily with anyone else who uses the same platform.

**Cons**

- **Vendor lock-in.** Commercial solutions don't interoperate well with other platforms. While data sharing is easy among fellow customers, it's usually impossible with those who use competing solutions. This reduces the reach of data, resulting in vendor lock-in. Furthermore, platform differences between data providers and recipients introduce data sharing complexities.
- **Data movement.** Data must be loaded onto the platform, requiring additional ETL and data copies.
- **Scalability.** Commercial data sharing comes with scaling limits from the vendors.
- **Cost.** All the above challenges create additional cost for sharing data with potential consumers, as data providers have to replicate data for different recipients on different cloud platforms.

-----

#### Cloud object storage

Object storage is considered a good fit for the cloud because it is elastic and can more easily scale into multiple petabytes to support unlimited data growth. The big three cloud providers all offer object storage services (AWS S3, Azure Blob, Google Cloud Storage) that are cheap, scalable and extremely reliable.

An interesting feature of cloud object storage is the ability to generate signed URLs, which grant time-limited permission to download objects. Anyone who receives the presigned URL can then access the specified objects, making this a convenient way to share data (a short example follows this section).

**Pros**

- **Sharing data in place.** Object storage can be shared in place, allowing consumers to access the latest available data.
- **Scalability.** Cloud object storage profits from availability and durability guarantees that typically cannot be achieved on-premises. Data consumers retrieve data directly from the cloud providers, saving bandwidth for the providers.

**Cons**

- **Limited to a single cloud provider.** Recipients have to be on the same cloud to access the objects.
- **Cumbersome security and governance.** Assigning permissions and managing access is complex. Custom application logic is needed to generate signed URLs.
- **Complexity.** Personas managing data sharing (DBAs, analysts) find it difficult to understand Identity and Access Management (IAM) policies and how data is mapped to underlying files. For companies with large volumes of data, sharing via cloud storage is time-consuming, cumbersome and nearly impossible to scale.
- **Operational overhead for data recipients.** The data recipients have to run extract, transform and load (ETL) pipelines on the raw files before consuming them for their end use cases.

The lack of a comprehensive solution makes it challenging for data providers and consumers to easily share data. Cumbersome and incomplete data sharing processes also constrain the development of business opportunities from shared data.
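As a concrete illustration of the signed-URL pattern described above, here is a minimal boto3 sketch; the bucket and key names are hypothetical placeholders, and equivalent mechanisms exist on Azure (SAS tokens) and Google Cloud Storage.

```python
# Minimal sketch: a provider mints a time-limited, read-only link to one object.
# Bucket and key names are illustrative placeholders.
import boto3

s3 = boto3.client("s3")
url = s3.generate_presigned_url(
    ClientMethod="get_object",
    Params={"Bucket": "provider-curated-data", "Key": "exports/daily_extract.parquet"},
    ExpiresIn=3600,  # the link is valid for one hour
)
# Hand this URL to the recipient; anyone holding it can download the object until it expires.
print(url)
```

The cons listed above follow directly from this model: access is granted per object and per URL, so scaling it to thousands of tables and recipients requires custom application logic around permissioning, auditing and link renewal.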
-----

## Chapter 3: Delta Sharing — An Open Standard for Secure Sharing of Data Assets

We believe the future of data sharing should be characterized by open technology. Data sharing shouldn't be tied to a proprietary technology that introduces unnecessary limitations and financial burdens to the process. It should be readily available to anyone who wants to share data at scale. This philosophy inspired us to develop and release a new protocol for sharing data: Delta Sharing.

#### What is Delta Sharing?

Delta Sharing provides an open solution to securely share live data from your lakehouse to any computing platform. Recipients don't have to be on the Databricks platform, on the same cloud, or on a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value.

Data providers can centrally manage, govern, audit and track usage of the shared data on one platform. Delta Sharing is natively integrated with [Unity Catalog](https://databricks.com/product/unity-catalog), enabling organizations to centrally manage and audit shared data across organizations and confidently share data assets while meeting security and compliance needs.

With Delta Sharing, organizations can easily share existing large-scale data sets based on the open source formats Apache Parquet and Delta Lake without moving data. Teams gain the flexibility to query, visualize, transform, ingest or enrich shared data with their tools of choice.
-----

**Figure 3:** Delta Sharing (diagram: a data provider's Delta Lake tables are exposed through a Delta Sharing server that enforces access permissions over the Delta Sharing protocol; data recipients on any cloud or on-premises consume them with any tool for analytics, BI and data science use cases, with no replication, easy management and strong security)

Databricks designed Delta Sharing with five goals in mind:

- Provide an open cross-platform sharing solution
- Share live data without copying it to another system
- Support a wide range of clients such as Power BI, Tableau, Apache Spark™, pandas and Java, and provide flexibility to consume data using the tools of choice for BI, machine learning and AI use cases (a minimal connector example follows this list)
- Provide strong security, auditing and governance
- Scale to massive structured data sets and also allow sharing of unstructured data and future data derivatives such as ML models, dashboards and notebooks, in addition to tabular data
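To show what the client side of the protocol looks like, here is a minimal sketch using the open source `delta-sharing` Python connector. The profile file and the share, schema and table names are hypothetical placeholders that a data provider would supply.

```python
# Minimal sketch: a recipient reads a shared table with the open source Python connector.
# The profile file and share/schema/table names are illustrative placeholders.
import delta_sharing

# The provider sends the recipient a small JSON profile file containing an endpoint and bearer token.
profile = "/path/to/open-datasets.share"

# Discover what has been shared with this recipient.
client = delta_sharing.SharingClient(profile)
print(client.list_all_tables())

# Load one shared table directly into pandas (delta_sharing.load_as_spark is also available).
table_url = profile + "#retail_share.sales.daily_orders"
df = delta_sharing.load_as_pandas(table_url)
print(df.head())
```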
-----

#### Key benefits of Delta Sharing

By eliminating the obstacles and shortcomings associated with typical data sharing approaches, Delta Sharing delivers several key benefits, including:

**Open cross-platform sharing.** Delta Sharing establishes a new open standard for secure data sharing and supports open source Delta and Apache Parquet formats. Data recipients don't have to be on the Databricks platform or on the same cloud, as Delta Sharing works across clouds and even from cloud to on-premises setups. To give customers even greater flexibility, Databricks has also released open source connectors for pandas, Apache Spark, Elixir and Python, and is working with partners on many more.

**Securely share live data without replication.** Most enterprise data today is stored in cloud data lakes. Any of these existing data sets on the provider's data lake can easily be shared without any data replication or physical movement of data. Data providers can update their data sets reliably in real time and provide a fresh and consistent view of their data to recipients.

**Centralized governance.** With Databricks Delta Sharing, data providers can grant, track, audit and even revoke access to shared data sets from a single point of enforcement to meet compliance and other regulatory requirements (see the sketch after this section). Databricks Delta Sharing users get:

- Implementation of Delta Sharing as part of Unity Catalog, the governance offering for Databricks Lakehouse
- Simple, more secure setup and management of shares
- The ability to create and manage recipients and data shares
- Audit logging captured automatically as part of Unity Catalog
- Direct integration with the rest of the Databricks ecosystem
- No separate compute for providing and managing shares

**Share data products, including AI models, dashboards and notebooks, with greater flexibility.** Data providers can choose between sharing an entire table or sharing only a version or specific partitions of a table. However, sharing just tabular data is not enough to meet today's consumer demands. Delta Sharing also supports sharing of non-tabular data and data derivatives such as data streams, AI models, SQL views and arbitrary files, enabling increased collaboration and innovation. Data providers can build, package and distribute data products including data sets, AI and notebooks, allowing data recipients to get insights faster. Furthermore, this approach promotes and empowers the exchange of knowledge — not just data — between different organizations.

**Share data at a lower cost.** Delta Sharing lowers the cost of managing and consuming shares for both data providers and recipients. Providers can share data from their cloud object store without replicating it, thereby reducing the cost of storage. In contrast, existing data sharing platforms require data providers to first move their data into their platform or store data in proprietary formats in their managed storage, which often costs more and results in data duplication. With Delta Sharing, data providers don't need to set up separate computing environments to share data. Consumers can access shared data directly using their tools of choice without setting up specific consumption ecosystems, thereby reducing costs.

With Delta Sharing we are able to achieve a truly open marketplace and truly open ecosystem. In contrast, commercial products are mostly limited to sharing raw tabular data and cannot be used to share these higher-valued data derivatives.

**Reduced time-to-value.** Delta Sharing eliminates the need to set up a new ingestion process to consume data. Data recipients can directly access the fresh data and query it using tools of their choice. Recipients can also enrich data with data sets from popular data providers. The Delta Sharing ecosystem of open source and commercial partners is growing every day.
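The centralized governance workflow above maps to a handful of Unity Catalog SQL statements. The sketch below runs them through `spark.sql` from a Databricks notebook; the share, table and recipient names are hypothetical placeholders, and recipient creation details vary between Databricks-to-Databricks and open sharing.

```python
# Minimal sketch: create a share, add a table, entitle a recipient, and (later) revoke access.
# Assumes a Databricks Spark session named `spark`; names are illustrative placeholders.
statements = [
    "CREATE SHARE IF NOT EXISTS retail_share COMMENT 'Curated sales data for partners'",
    "ALTER SHARE retail_share ADD TABLE sales.gold.daily_orders",
    "CREATE RECIPIENT IF NOT EXISTS acme_corp COMMENT 'Partner on a non-Databricks platform'",
    "GRANT SELECT ON SHARE retail_share TO RECIPIENT acme_corp",
    # "REVOKE SELECT ON SHARE retail_share FROM RECIPIENT acme_corp",  # single point of revocation
]
for stmt in statements:
    spark.sql(stmt)

# Audit events for shares and recipients are captured automatically as part of Unity Catalog.
```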
We believe\n", + "\n", + "that Delta Sharing will simplify data pipelines by enabling us to\n", + "\n", + "query fresh data from the place where it lives, and we are not\n", + "\n", + "locked into any platform or data format.”\n", + "\n", + "— **Rayne Gaisford** , Global Head of Data Strategy, Jefferies\n", + "\n", + "\n", + "— **Corey Zwart** , Head of Engineering, Pumpjack Dataworks\n", + "\n", + "“As a data company, giving our customers access to our data sets\n", + "\n", + "is critical. The Databricks Lakehouse Platform with Delta Sharing\n", + "\n", + "really streamlines that process, allowing us to securely reach a\n", + "\n", + "much broader user base regardless of cloud or platform.”\n", + "\n", + "— **Felix Cheung** , VP of Engineering, SafeGraph\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data monetization with Delta Sharing\n", + "\n", + "Delta Sharing enables companies to monetize their data product simply and with necessary governance.\n", + "\n", + "Data /on.2-er $\n", + "\n", + "\n", + "Cloud Storage\n", + "\n", + "\n", + "Fulfllleen\n", + "\n", + "Entitles various data products\n", + "\n", + "Data Vendor\n", + "\n", + "Unity\n", + "Catalog\n", + "\n", + "\n", + "Unity\n", + "Catalog\n", + "\n", + "Cloud Storage\n", + "\n", + "Data /on.2-er �\n", + "\n", + "N o n - D ata b r i c k s C u s t o m e r\n", + "\n", + "O n a n y c lo u d o r o n - p r e m i s e s\n", + "\n", + "Storage\n", + "\n", + "\n", + "R/O\n", + "\n", + "R/OSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
-----

#### B2B sharing with Delta Sharing

**Figure 5:** B2B sharing with Delta Sharing — partners on Databricks and on non-Databricks platforms (on any cloud or on-premises) exchange read-only shares of each other's cloud storage through Unity Catalog and Delta Sharing.

Delta Sharing applies in the case of bidirectional exchange of data. Companies use Delta Sharing to incorporate partners and suppliers seamlessly into their workflows.
Traditionally, this is not an easy task. An organization typically has no control over how their partners are implementing their own data platforms. The complexity increases when we consider that the partners and suppliers can reside in a public cloud, private cloud or an on-premises deployed data platform. The choices of platform and architecture are not imposed on your partners and suppliers. Due to its open protocol, Delta Sharing addresses this requirement foundationally. Through a wide array of existing connectors (and many more being implemented), your data can land anywhere your partners and suppliers need to consume it.

In addition to the location of data consumer residency, the complexity of data arises as a consideration. The traditional approach to sharing data using APIs is inflexible and imposes additional development cycles on both ends of the exchange in order to implement both the provider pipelines and consumer pipelines. With Delta Sharing, this problem can be abstracted.
Data can be shared as soon as it lands in the Delta table and when the shares and grants are defined. There are no implementation costs on the provider side. On the consumer side, data simply needs to be ingested and transformed into an expected schema for the downstream processes.

This means that you can form much more agile data exchange patterns with your partners and suppliers and attain value from your combined data much quicker than ever before.

-----

#### Internal data sharing with Delta Sharing

Internal data sharing is becoming an increasingly important consideration for any modern organization, particularly where data describing the same concepts have been produced in different ways and in different data silos across the organization. In this situation it is important to design systems and platforms that allow governed and intentional federation of data and processes, and at the same time allow easy and seamless integration of said data and processes.

Architectural design patterns such as Data Mesh have emerged to address these specific challenges and considerations. Data Mesh architecture assumes a federated design and dissemination of ownership and responsibility to business units or divisions. This, in fact, has several advantages, chief among them that data is owned by the parts of the organization closest to the source of the data. Data residence is naturally enforced since data sits within the geo-locality where it has been generated. Finally, data volumes and data variety are kept in control due to the localization within a data domain (or data node).
On the other hand, the architecture promotes exchange of data between different data domains when that data is needed to deliver outcomes and better insights.

-----

**Figure 6:** Building a Data Mesh with Delta Sharing — business units in different regions, on Databricks or on non-Databricks platforms (on any cloud or on-premises), exchange read-only data through their own Unity Catalog instances, cloud storage and Delta Sharing.

-----

Unity Catalog enables consolidated data access control across different data domains within an organization using the Lakehouse on Databricks. In addition, Unity Catalog adds a set of simple and easy-to-use declarative APIs to govern and control data exchange patterns between the data domains in the Data Mesh.
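To make the "declarative APIs" point concrete, here is a minimal sketch of what governed exchange between two hypothetical data domains might look like when run from a Databricks notebook. The catalog, schema, group and recipient names are all placeholders; the GRANT, CREATE SHARE and CREATE RECIPIENT statements are standard Unity Catalog / Delta Sharing SQL, but check the current documentation for the exact syntax your workspace supports.

```python
# Hypothetical example: the "sales" domain exposes its gold tables to the
# "marketing" domain. All object names below are placeholders for illustration.

# 1) Within the same Unity Catalog metastore: plain declarative grants.
spark.sql("GRANT USE CATALOG ON CATALOG sales_domain TO `marketing-analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA sales_domain.gold TO `marketing-analysts`")
spark.sql("GRANT SELECT ON TABLE sales_domain.gold.daily_revenue TO `marketing-analysts`")

# 2) For a business unit in another region or on another platform:
#    publish the same table through a Delta Share.
spark.sql("CREATE SHARE IF NOT EXISTS sales_gold_share")
spark.sql("ALTER SHARE sales_gold_share ADD TABLE sales_domain.gold.daily_revenue")
spark.sql("CREATE RECIPIENT IF NOT EXISTS marketing_bu_emea")  # token-based recipient
spark.sql("GRANT SELECT ON SHARE sales_gold_share TO RECIPIENT marketing_bu_emea")
```

The point of the pattern is that each domain keeps ownership of its catalog while exchange between domains is expressed as a handful of auditable grants rather than bespoke pipelines.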
To make matters even more complicated, organizations can grow through mergers and acquisitions. In such cases we cannot assume that organizations being acquired have followed the same set of rules and standards to define their platforms and produce their data. Furthermore, we cannot even assume that they have used the same cloud providers, nor can we assume the complexity of their data models. Delta Sharing can simplify and accelerate the unification and assimilation of newly acquired organizations and their data and processes. Individual organizations can be treated as new data domains in the overarching mesh. Only selected data sources can be exchanged between the different platforms. This enables teams to move freely between the organizations that are merging without losing their data — if anything, they are empowered to drive insights of higher quality by combining the data of both.

With Unity Catalog and Delta Sharing, the Lakehouse architecture seamlessly combines with the Data Mesh architecture to deliver more power than ever before, pushing the boundaries of what's possible and simplifying activities that were deemed daunting not so long ago.

-----

## Chapter 4
 How Delta Sharing Works

Delta Sharing is designed to be simple, scalable, nonproprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing is natively integrated with Unity Catalog, which enables customers to add fine-grained governance and security controls, making it easy and safe to share data internally or externally.

Delta Sharing is a simple REST protocol that securely grants temporary access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3, Azure ADLS or Google's GCS — to reliably grant read-only access to large data sets.
Here's how it works for data providers and data recipients.

**Figure 7:** How Delta Sharing works, connecting data providers and data recipients — the recipient's Delta Sharing client requests a table, the Delta Sharing server checks access permissions against the provider's Delta Lake tables (Parquet files) and returns pre-signed, short-lived URLs that give the client temporary, read-only access to the Parquet files directly in the object store (AWS S3, GCP, ADLS).

-----

#### Data providers

The data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider decides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages recipient access. To manage shares and recipients, you can use SQL commands, the Unity Catalog CLI or the intuitive user interface.

#### Data recipients

The data recipient only needs one of the many Delta Sharing clients that support the protocol. Databricks has released open source connectors for pandas, Apache Spark, Java and Python, and is working with partners on many more.

#### The data exchange

The Delta Sharing data exchange follows three efficient steps:

**1.** The recipient's client authenticates to the sharing server and asks to query a specific table. The client can also provide filters on the data (for example, "country=US") as a hint to read just a subset of the data.

**2.** The server verifies whether the client is allowed to access the data, logs the request, and then determines which data to send back. This will be a subset of the data objects in cloud storage systems that make up the table.

**3.** To allow temporary access to the data, the server generates short-lived presigned URLs that allow the client to read Parquet files directly from the cloud provider so that the read-only access can happen in parallel at massive bandwidth, without streaming through the sharing server.
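As an illustration of the recipient side of this exchange, here is a minimal sketch using the open source `delta-sharing` Python connector. It assumes the provider has sent you a profile file with the endpoint and bearer token, and that a share named `sales_gold_share` containing `gold.daily_revenue` exists; those names and the profile path are placeholders.

```python
# pip install delta-sharing
import delta_sharing

# Profile file supplied by the data provider (endpoint + bearer token).
profile = "/path/to/config.share"  # placeholder path

# Discover what has been shared with you.
client = delta_sharing.SharingClient(profile)
for table in client.list_all_tables():
    print(table)

# Load a shared table directly into pandas (placeholder share/schema/table names).
table_url = f"{profile}#sales_gold_share.gold.daily_revenue"
df = delta_sharing.load_as_pandas(table_url)
print(df.head())

# On a Spark cluster with the connector installed, the same table can be read
# as a Spark DataFrame instead:
# spark_df = delta_sharing.load_as_spark(table_url)
```

The filters and presigned-URL mechanics in steps 1 to 3 happen inside the connector; from the recipient's point of view it is just a pandas or Spark read.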
-----

## Chapter 5
 Introducing Databricks Marketplace

Enterprises need open collaboration for data and AI. Data sharing — within an organization or externally — allows companies to collaborate with partners, establish new partnerships and generate new revenue streams with data monetization.

The demand for generative AI is driving disruption across industries, increasing the urgency for technical teams to build generative AI models and Large Language Models (LLMs) on top of their own data to differentiate their offerings.

Traditional data marketplaces are restricted and offer only data or simple applications, therefore limiting their value to data consumers. They also don't offer tools to evaluate the data assets beyond basic descriptions or examples. Finally, data delivery is limited, often requiring ETL or a proprietary delivery mechanism.

Enterprises need a better way to share data and AI that is flexible, secure and unlocks business value.
An ecosystem makes data sharing and collaboration powerful.

**Today, data marketplaces present many challenges and collaboration can be complex for both data consumers and data providers.** For data consumers, the challenges are a focus on data only or simple applications, lengthy discovery and evaluation, and delayed time-to-insights with vendor lock-in. For data providers, they are limited opportunities to monetize new types of assets, difficulty reaching more users, and a lack of secure technology and unified governance.

-----

#### Challenges in today's data marketplaces

**Data Consumers**

- **Focus on data only or simple applications:** Accessing only data sets means organizations looking to take advantage of AI/ML need to look elsewhere or start from scratch, causing delays in driving business insights.

- **Lengthy discovery and evaluation:** The tools most marketplaces provide for data consumers to evaluate data are simply descriptions and example SQL statements. Minimal evaluation tools mean it takes more time to figure out if a data product is right for you, which might include more time in back-and-forth messages with a provider or searching for a new provider altogether.

- **Delayed time-to-insights with vendor lock-in:** Delivery through proprietary sharing technologies or FTP means either vendor lock-in or lengthy ETL processes to get the data where you need to work with it.

**Data Providers**

- **Limited opportunities to monetize new types of assets:** A data-only approach means organizations cannot monetize anything beyond a data set and will face more friction to create new revenue opportunities with non-compatible platforms.

- **Difficulty reaching more users:** Data providers must choose between forgoing potential business or incurring the expense of replicating data.

- **Lack of secure technology and unified governance:** Without open standards for sharing data securely across platforms and clouds, data providers must use multiple tools to secure access to scattered data, leading to inconsistent governance.

-----

#### What is Databricks Marketplace?

Databricks Marketplace is an open marketplace for all your data, analytics and AI, powered by Delta Sharing.

Since Marketplace is powered by Delta Sharing, you can benefit from open source flexibility and no vendor lock-in, enabling you to collaborate across all platforms, clouds and regions. This open approach allows you to put your data to work more quickly in every cloud with your tools of choice.

Marketplace brings together a vast ecosystem of data consumers and data providers to collaborate across a wide array of data sets without platform dependencies, complicated ETL, expensive replication and vendor lock-in.
-----

#### Key Benefits of Databricks Marketplace

Databricks Marketplace provides key benefits for both data consumers and data providers. Consumers can discover more than just data, evaluate data products faster and avoid vendor lock-in; providers can reach users on any platform, monetize more than just data and share data securely.

-----

#### Databricks Marketplace drives innovation and expands revenue opportunities

##### Data Consumers

For data consumers, the Databricks Marketplace dramatically expands the opportunity to deliver innovation and advance analytics and AI initiatives.

**Discover more than just data:** Access more than just data sets, including AI models, notebooks, applications and solutions.

**Evaluate data products faster:** Pre-built notebooks and sample data help you quickly evaluate and have much greater confidence that a data product is right for your AI or analytics initiatives. Obtain the fastest and simplest time to insight.

**Avoid vendor lock-in:** Substantially reduce the time to deliver insights and avoid lock-in with open and seamless sharing and collaboration across clouds, regions or platforms. Directly integrate with your tools of choice, right where you work.

##### Data Providers

For data providers, the Databricks Marketplace enables them to reach new users and unlock new revenue opportunities.

**Reach users on any platform:** Expand your reach across platforms and access a massive ecosystem beyond walled gardens. Streamline delivery of simple data sharing to any cloud or region, without replication.

**Monetize more than just data:** Monetize the broadest set of data assets, including data sets, notebooks and AI models, to reach more data consumers.

**Share data securely:** Share all your data sets, notebooks, AI models, dashboards and more securely across clouds, regions and data platforms.

-----

#### Enable collaboration and accelerate innovation

#### Powered by a fast, growing ecosystem

Enterprises need open collaboration for data and AI.
In the past few months, we've continued to increase partners across industries, including Retail, Communications and Media & Entertainment, and Financial Services, with 520+ listings you can explore in our open Marketplace from 80+ providers and counting.

#### Use cases for an open marketplace

Organizations across all industries have many use cases for consuming and sharing third-party data, from the simple (dataset joins) to the more advanced (AI notebooks, applications and dashboards).

- **Advertising and Retail:** Incorporate shopper behavior analysis | Ads uplift/performance | Demand forecasting | "Next best SKU" prediction | Inventory analysis | Live weather data

- **Finance:** Incorporate data from stock exchanges to predict economic impact | Market research | Public census and housing data to predict insurance sales

- **Healthcare and Life Sciences:** Genomic target identification | Patient risk scoring | Accelerating drug discovery | Commercial effectiveness | Clinical research

For more on Databricks Marketplace, go to [marketplace.databricks.com](http://marketplace.databricks.com), or refer to the Resources section on page 41.

-----

#### New upcoming feature: AI model sharing

Nowadays, it may seem like every organization wants to become an AI organization. However, most organizations are new to AI. Databricks has heard from customers that they want to discover out-of-the-box AI models on Marketplace to help them kickstart their AI innovation journey.

To meet this demand, Databricks will be adding AI model sharing capabilities on Marketplace to provide users access to both OSS and proprietary AI (both first- and third-party) models. This will enable data consumers and providers to discover and monetize AI models and integrate AI into their data solutions.

Using this feature, data consumers can evaluate AI models with rich previews, including visualizations and pre-built notebooks with sample data. With Databricks Marketplace, there are no difficult data delivery mechanisms — you can get the AI models instantly with the click of a button. All of this works out-of-the-box with the AI capabilities of the Databricks Lakehouse Platform for both real-time and batch inference. For real-time inference, you can use model serving endpoints. For batch inference, you can invoke the models as functions directly from DBSQL or notebooks.
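As a rough illustration of those two inference paths, the sketch below shows a real-time call to a hypothetical model serving endpoint over its REST invocations API, followed by a batch call from a notebook using the `ai_query` SQL function. The workspace URL, endpoint name, token variable and source table are placeholders, the request/response schema depends on the model you install from Marketplace, and `ai_query` availability depends on your SQL warehouse or runtime version.

```python
import os
import requests

# --- Real-time inference against a (hypothetical) model serving endpoint ---
WORKSPACE_URL = "https://<your-workspace>.cloud.databricks.com"  # placeholder
ENDPOINT_NAME = "marketplace_summarizer"                          # placeholder
TOKEN = os.environ["DATABRICKS_TOKEN"]                            # personal access token

response = requests.post(
    f"{WORKSPACE_URL}/serving-endpoints/{ENDPOINT_NAME}/invocations",
    headers={"Authorization": f"Bearer {TOKEN}"},
    # Payload shape varies by model; "inputs" is one commonly accepted format.
    json={"inputs": ["Summarize: Delta Sharing is an open protocol for ..."]},
)
response.raise_for_status()
print(response.json())

# --- Batch inference from a notebook via the ai_query SQL function ---
batch = spark.sql(f"""
    SELECT doc_id,
           ai_query('{ENDPOINT_NAME}', content) AS summary
    FROM   docs.gold.articles   -- placeholder table
""")
batch.show(truncate=False)
```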
With AI model sharing, Databricks customers will have access to best-in-class models from leading providers, as well as OSS models published by Databricks which can be quickly and securely applied on top of their data. Databricks will curate and publish its own open source models across common use cases, such as instruction-following and text summarization, and optimize tuning or deployment of these models.

Using AI models from Databricks Marketplace can help your organization summarize complex information quickly and easily to help accelerate the pace of innovation.

-----

## Chapter 6
 Share securely with Databricks Clean Rooms

While the demand for external data to make data-driven innovations is greater than ever, there is growing concern among organizations around data privacy.
The need for organizations to share data and collaborate with their partners and customers in a secure, governed and privacy-centric way is driving the concept of "data clean rooms."

#### What is a data clean room?

A data clean room provides a secure, governed and privacy-safe environment where participants can bring their sensitive data, which might include personally identifiable information (PII), and perform joint analysis on that private data. Participants have full control of the data and can decide which participants can perform what analysis without exposing any sensitive data.

**Figure 8:** Data clean room diagram example for audience overlap analysis in advertising — Collaborator A (e.g., agencies, publishers, MVPDs, retailers) and Collaborator B (e.g., advertisers) each bring their own sensitive data into a secure and privacy-preserving environment to answer questions such as "What is our audience overlap?", "How did my campaign do in terms of reach and frequency?" and "What is the lift in purchases among those in-segment versus those out-of-segment?"

-----

A data clean room is not a new concept. Google introduced the idea in 2017 when it announced Ads Data Hub, which allows advertisers to gain impression-level insights about cross-device media campaigns in a more secure, privacy-safe environment. In the last few years, the demand for clean rooms has accelerated. IDC predicts that by 2024, 65% of G2000 enterprises will form data sharing partnerships with external stakeholders via data clean rooms to increase interdependence while safeguarding data privacy.

There are various compelling needs driving this demand:

**Privacy-first world.** Stringent data privacy regulations such as GDPR and CCPA, along with sweeping changes in third-party measurement, have transformed how organizations collect, use and share data. For example, Apple's [App Tracking Transparency Framework](https://developer.apple.com/app-store/user-privacy-and-data-use/) (ATT) provides users of Apple devices the freedom and flexibility to easily opt out of app tracking. Google also plans to [phase out support for third-party cookies in Chrome](https://blog.google/products/chrome/updated-timeline-privacy-sandbox-milestones/) by late 2024. As these privacy laws and practices evolve, the demand for data cleanrooms is likely to rise as the industry moves to new

**Collaboration in a fragmented ecosystem.** Today, consumers have more options than ever before when it comes to where, when and how they engage with content. As a result, the digital footprint of consumers is fragmented across different platforms, necessitating that companies collaborate with their partners to create a unified view of their customers' needs and requirements.
To facilitate\n", + "\n", + "collaboration across organizations, cleanrooms provide a secure\n", + "\n", + "and private way to combine their data with other data to unlock new\n", + "\n", + "insights or capabilities.\n", + "\n", + "\n", + "identifiers that are PII based, such as UID 2.0, and organizations\n", + "\n", + "try to find new ways to share and join data with customers and\n", + "\n", + "partners in a privacy-centric way.\n", + "\n", + "**New ways to monetize data.** Most organizations are looking to\n", + "\n", + "monetize their data in one form or another. With today’s privacy\n", + "\n", + "laws, companies will try to find any possible advantages to monetize\n", + "\n", + "their data without the risk of breaking privacy rules. This creates an\n", + "\n", + "opportunity for data vendors or publishers to join data for big data\n", + "\n", + "analytics without having direct access to the data.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Common data clean room uses cases\n", + "\n", + "\n", + "#### Category management for retail and consumer goods\n", + "\n", + "Clean rooms enable real-time collaboration between retailers\n", + "\n", + "and suppliers, ensuring secure information exchange for demand\n", + "\n", + "forecasting, inventory planning and supply chain optimization.\n", + "\n", + "This improves product availability, reduces costs and streamlines\n", + "\n", + "operations for both parties.\n", + "\n", + "#### Real-world evidence (RWE) for healthcare\n", + "\n", + "Clean rooms provide secure access to sensitive healthcare data sets,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
5fedc77ae1a861260a1c4c866e2dbd05partners in a privacy-centric way.\n", + "\n", + "**New ways to monetize data.** Most organizations are looking to\n", + "\n", + "monetize their data in one form or another. With today’s privacy\n", + "\n", + "laws, companies will try to find any possible advantages to monetize\n", + "\n", + "their data without the risk of breaking privacy rules. This creates an\n", + "\n", + "opportunity for data vendors or publishers to join data for big data\n", + "\n", + "analytics without having direct access to the data.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Common data clean room uses cases\n", + "\n", + "\n", + "#### Category management for retail and consumer goods\n", + "\n", + "Clean rooms enable real-time collaboration between retailers\n", + "\n", + "and suppliers, ensuring secure information exchange for demand\n", + "\n", + "forecasting, inventory planning and supply chain optimization.\n", + "\n", + "This improves product availability, reduces costs and streamlines\n", + "\n", + "operations for both parties.\n", + "\n", + "#### Real-world evidence (RWE) for healthcare\n", + "\n", + "Clean rooms provide secure access to sensitive healthcare data sets,\n", + "\n", + "allowing collaborators to connect and query multiple sources of data\n", + "\n", + "without comprising data privacy. This supports RWE use cases such\n", + "\n", + "as regulatory decisions, safety, clinical trial design and observational\n", + "\n", + "research.\n", + "\n", + "\n", + "#### Audience overlap exploration for media and entertainment\n", + "\n", + "By creating a clean room environment, media companies can\n", + "\n", + "securely share their audience data with advertisers or other media\n", + "\n", + "partners. This allows them to perform in-depth analysis and identify\n", + "\n", + "shared audience segments without directly accessing or exposing\n", + "\n", + "individual user information.\n", + "\n", + "#### Know Your Customer (KYC) in banking\n", + "\n", + "KYC standards are designed to combat financial fraud, money\n", + "\n", + "laundering and terrorism financing. Clean rooms can be used within a\n", + "\n", + "given jurisdiction to allow financial services companies to collaborate\n", + "\n", + "and run shared analytics to build a holistic view of a transaction for\n", + "\n", + "investigations.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Personalization with expanded interests for retailers\n", + "\n", + "Retailers want to target consumers based on past purchases, as\n", + "\n", + "well as other purchases with different retailers. Clean rooms enable\n", + "\n", + "retailers to augment their knowledge of consumers to suggest new\n", + "\n", + "products and services that are relevant to the individual but have\n", + "\n", + "\n", + "#### 5G data monetization for telecom\n", + "\n", + "5G data monetization enables telecoms to capitalize on data\n", + "\n", + "from 5G networks. 
Clean rooms provide a secure environment\n", + "\n", + "for collaboration with trusted partners, ensuring privacy while\n", + "\n", + "maximizing data value for optimized services, personalized\n", + "\n", + "experiences and targeted advertising.\n", + "\n", + "\n", + "not yet been purchased.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Shortcomings of existing data clean rooms\n", + "\n", + "\n", + "Organizations exploring clean room options are finding some glaring\n", + "\n", + "shortcomings in the existing solutions that limit the full potential of the\n", + "\n", + "“clean rooms” concept.\n", + "\n", + "First, many existing data clean room vendors require data to be on the\n", + "\n", + "same cloud, same region, and/or same data platform. Participants then\n", + "\n", + "have to move data into proprietary platforms, which results in lock-in\n", + "\n", + "and additional data storage costs.\n", + "\n", + "\n", + "Second, most existing solutions are not scalable to expand\n", + "\n", + "collaboration beyond a few collaborators at a time. For example,\n", + "\n", + "an advertiser might want to get a detailed view of their ad\n", + "\n", + "performance across different platforms, which requires analysis\n", + "\n", + "of the aggregated data from multiple data publishers. With\n", + "\n", + "collaboration limited to just a few participants, organizations get\n", + "\n", + "partial insights on one clean room platform and end up moving\n", + "\n", + "their data to another clean room vendor to aggregate the data,\n", + "\n", + "incurring the operational overhead of collating partial insights.\n", + "\n", + "Finally, existing clean room solutions do not provide the flexibility\n", + "\n", + "to run arbitrary analysis and are mainly restricted to SQL, a\n", + "\n", + "subset of Python, and pre-defined templates. While SQL is\n", + "\n", + "absolutely needed for clean rooms, there are times when you\n", + "\n", + "require complex computations such as machine learning or\n", + "\n", + "integration with APIs where SQL doesn’t satisfy the full depth of\n", + "\n", + "the technical requirements.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
384c8233e32ea9b504cccff0fc400f7eand additional data storage costs.\n", + "\n", + "\n", + "Second, most existing solutions are not scalable to expand\n", + "\n", + "collaboration beyond a few collaborators at a time. For example,\n", + "\n", + "an advertiser might want to get a detailed view of their ad\n", + "\n", + "performance across different platforms, which requires analysis\n", + "\n", + "of the aggregated data from multiple data publishers. With\n", + "\n", + "collaboration limited to just a few participants, organizations get\n", + "\n", + "partial insights on one clean room platform and end up moving\n", + "\n", + "their data to another clean room vendor to aggregate the data,\n", + "\n", + "incurring the operational overhead of collating partial insights.\n", + "\n", + "Finally, existing clean room solutions do not provide the flexibility\n", + "\n", + "to run arbitrary analysis and are mainly restricted to SQL, a\n", + "\n", + "subset of Python, and pre-defined templates. While SQL is\n", + "\n", + "absolutely needed for clean rooms, there are times when you\n", + "\n", + "require complex computations such as machine learning or\n", + "\n", + "integration with APIs where SQL doesn’t satisfy the full depth of\n", + "\n", + "the technical requirements.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key benefits of Databricks Clean Rooms\n", + "\n", + "Databricks Clean Rooms allow businesses to easily collaborate with their customers and partners in a secure environment on\n", + "\n", + "any cloud in a privacy-safe way. Key benefits of Databricks Clean Rooms include:\n", + "\n", + "\n", + "**Flexible - your language and workload of**\n", + "\n", + "**choice.** Databricks Clean Rooms empower\n", + "\n", + "collaborators to share and join their existing\n", + "\n", + "data and run complex workloads in any\n", + "\n", + "language —Python, R, SQL, Java and Scala —\n", + "\n", + "on the data while maintaining data privacy.\n", + "\n", + "Beyond traditional SQL, users can run arbitrary\n", + "\n", + "workloads and languages, allowing them to train\n", + "\n", + "machine learning models, perform inference\n", + "\n", + "and utilize open-source or third-party privacy-\n", + "\n", + "enhancing technologies. This flexibility enables\n", + "\n", + "data scientists and analysts to achieve more\n", + "\n", + "comprehensive and advanced data analysis\n", + "\n", + "within the secure Clean Room environment.\n", + "\n", + "\n", + "**Scalable, multi-party collaboration.**\n", + "\n", + "With Databricks Clean Rooms, you can\n", + "\n", + "launch a clean room and work with multiple\n", + "\n", + "collaborators at a time. This capability\n", + "\n", + "enables real-time collaboration, fostering\n", + "\n", + "efficient and rapid results. Moreover,\n", + "\n", + "Databricks Clean Rooms seamlessly\n", + "\n", + "integrate with identity service providers,\n", + "\n", + "allowing users to leverage offerings from\n", + "\n", + "these providers during collaboration. The\n", + "\n", + "ability to collaborate with multiple parties\n", + "\n", + "and leverage identity services enhances the\n", + "\n", + "overall data collaboration experience within\n", + "\n", + "Databricks Clean Rooms.\n", + "\n", + "\n", + "**Interoperable - any data source**\n", + "\n", + "**with no replication.** Databricks Clean\n", + "\n", + "Rooms excel in interoperability, ensuring\n", + "\n", + "smooth collaboration across diverse\n", + "\n", + "environments. 
With Delta Sharing,\n", + "\n", + "collaborators can seamlessly work\n", + "\n", + "together across different cloud providers,\n", + "\n", + "regions and even data platforms without\n", + "\n", + "the need for extensive data movement.\n", + "\n", + "This eliminates data silos and enables\n", + "\n", + "organizations to leverage existing\n", + "\n", + "infrastructure and data ecosystems while\n", + "\n", + "maintaining the utmost security and\n", + "\n", + "compliance.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Resources\n", + " Getting started with Data Sharing and Collaboration\n", + "\n", + "\n", + "Data sharing plays a key role in business processes across the\n", + "\n", + "enterprise, from product development and internal operations to\n", + "\n", + "customer experience and compliance. However, most businesses\n", + "\n", + "have been slow to move forward because of incompatibility\n", + "\n", + "between systems, complexity and security concerns.\n", + "\n", + "Data-driven organizations need an open — and secure — approach\n", + "\n", + "to data sharing.\n", + "\n", + "\n", + "Databricks offers an open approach to data sharing and\n", + "\n", + "collaboration with a variety of tools to:\n", + "\n", + "\u0007 **Share across platforms:** You can share live data sets, as well\n", + "\n", + "as AI models, dashboards and notebooks across platforms,\n", + "\n", + "clouds and regions. This open approach is powered by\n", + "\n", + "Delta Sharing, the world’s first open protocol for secure data\n", + "\n", + "sharing, which allows organizations to share data for any useSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
6b59600128f0f7c536b3b2f20ba6891borganizations to leverage existing\n", + "\n", + "infrastructure and data ecosystems while\n", + "\n", + "maintaining the utmost security and\n", + "\n", + "compliance.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Resources\n", + " Getting started with Data Sharing and Collaboration\n", + "\n", + "\n", + "Data sharing plays a key role in business processes across the\n", + "\n", + "enterprise, from product development and internal operations to\n", + "\n", + "customer experience and compliance. However, most businesses\n", + "\n", + "have been slow to move forward because of incompatibility\n", + "\n", + "between systems, complexity and security concerns.\n", + "\n", + "Data-driven organizations need an open — and secure — approach\n", + "\n", + "to data sharing.\n", + "\n", + "\n", + "Databricks offers an open approach to data sharing and\n", + "\n", + "collaboration with a variety of tools to:\n", + "\n", + "\u0007 **Share across platforms:** You can share live data sets, as well\n", + "\n", + "as AI models, dashboards and notebooks across platforms,\n", + "\n", + "clouds and regions. This open approach is powered by\n", + "\n", + "Delta Sharing, the world’s first open protocol for secure data\n", + "\n", + "sharing, which allows organizations to share data for any use\n", + "\n", + "case, any tool and on any cloud.\n", + "\n", + "\u0007 **Share all your data and AI: Databricks Marketplace** is an\n", + "\n", + "open marketplace for all your data, analytics and AI, enabling\n", + "\n", + "both data consumers and data providers with the ability to\n", + "\n", + "deliver innovation and advance analytics and AI initiatives.\n", + "\n", + "\u0007 **Share securely: Databricks Clean Rooms** allows businesses\n", + "\n", + "to easily collaborate with customers and partners on any\n", + "\n", + "cloud in a privacy-safe way. With Delta Sharing, clean room\n", + "\n", + "participants can securely share data from their data lakes\n", + "\n", + "without any data replication across clouds or regions. 
Your\n", + "\n", + "data stays with you without vendor lock-in, and you can\n", + "\n", + "centrally audit and monitor the usage of your data.\n", + "\n", + "\n", + "-----\n", + "\n", + "Get started with these products by exploring the resources below.\n", + "\n", + "\n", + "**Delta Sharing**\n", + "\n", + "\u0007 [Data Sharing on Databricks](https://www.databricks.com/product/delta-sharing)\n", + "\n", + "[\u0007Learn about Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog)\n", + "\n", + "[\u0007Blog post: What’s new with Data Sharing and Collaboration on the](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n", + "\n", + "[Lakehouse](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n", + "\n", + "[\u0007Learn about open source Delta Sharing](https://delta.io/sharing/)\n", + "\n", + "[Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "\n", + "**Databricks Marketplace**\n", + "\n", + "[\u0007Learn about Databricks Marketplace](https://www.databricks.com/product/marketplace)\n", + "\n", + "[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n", + "\n", + "[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n", + "\n", + "[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n", + "\n", + "[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n", + "\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n", + "\n", + "\n", + "**Databricks Clean Rooms**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
f330d8664cc5ce1bac8095d4a154a2f8[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n", + "\n", + "[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n", + "\n", + "[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n", + "\n", + "[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n", + "\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n", + "\n", + "\n", + "**Databricks Clean Rooms**\n", + "\n", + "\u0007 [Learn about Databricks Clean Rooms](https://www.databricks.com/product/clean-room)\n", + "\n", + "[\u0007Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[\u0007eBook: The Definitive Guide to Data Clean Rooms](https://www.databricks.com/resources/ebook/market-smarter-data-clean-rooms)\n", + "\n", + "[\u0007Webinar: Unlock the Power of Secure Data Collaboration](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n", + "\n", + "[with Clean Rooms](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n", + "\n", + "\n", + "[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n", + "\n", + "\n", + "-----\n", + "\n", + "## About the Authors\n", + "\n", + "\n", + "**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n", + "\n", + "making analytics and AI simple for customers by leveraging the\n", + "\n", + "power of the Databricks Lakehouse Platform. You can reach Vuong\n", + "\n", + "on [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n", + "\n", + "\n", + "**Sachin Thakur** is a Principal Product Marketing Manager on the\n", + "\n", + "Databricks Data Engineering and Analytics team. His area of focus\n", + "\n", + "is data governance with Unity Catalog, and he is passionate about\n", + "\n", + "helping organizations democratize data and AI with the DatabricksSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
783b4155a9c7a07bf4dcceef9213e9f4[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n", + "\n", + "\n", + "-----\n", + "\n", + "## About the Authors\n", + "\n", + "\n", + "**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n", + "\n", + "making analytics and AI simple for customers by leveraging the\n", + "\n", + "power of the Databricks Lakehouse Platform. You can reach Vuong\n", + "\n", + "on [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n", + "\n", + "\n", + "**Sachin Thakur** is a Principal Product Marketing Manager on the\n", + "\n", + "Databricks Data Engineering and Analytics team. His area of focus\n", + "\n", + "is data governance with Unity Catalog, and he is passionate about\n", + "\n", + "helping organizations democratize data and AI with the Databricks\n", + "\n", + "Lakehouse Platform. You can reach Sachin on [LinkedIn](https://www.linkedin.com/in/sachin10thakur/) .\n", + "\n", + "\n", + "**Milos Colic** is a Senior Solution Architect at Databricks. His\n", + "\n", + "\n", + "passion is to help customers with their data exchange and data\n", + "\n", + "monetization needs. Furthermore, he is passionate about geospatial\n", + "\n", + "data processing and ESG. You can reach Milos on [LinkedIn](https://www.linkedin.com/in/milos-colic/) .\n", + "\n", + "\n", + "**Jay Bhankharia** is a Senior Director on the Databricks Data\n", + "\n", + "Partnerships team. His passion is to help customers gain insights\n", + "\n", + "from data to use the power of the Databricks Lakehouse Platform\n", + "\n", + "for their analytics needs. You can reach Jay on [LinkedIn](https://www.linkedin.com/in/jay-bhankharia-cfa-b9835612/) .\n", + "\n", + "\n", + "**Itai Weiss** is a Lead Delta Sharing Specialist at Databricks and has\n", + "\n", + "\n", + "over 20 years of experience in helping organizations of any size\n", + "\n", + "build data solutions. He focuses on data monetization and loves to\n", + "\n", + "help customers and businesses get more value from the data they\n", + "\n", + "have. You can reach Itai on [LinkedIn](https://www.linkedin.com/in/itai-weiss/) .\n", + "\n", + "**Somasekar Natarajan** (Som) is a Solution Architect at\n", + "\n", + "Databricks specializing in enterprise data management. Som has\n", + "\n", + "worked with Fortune organizations spanning three continents for\n", + "\n", + "close to two decades with one objective — helping customers to\n", + "\n", + "\n", + "**Giselle Goicochea** is a Senior Product Marketing Manager\n", + "\n", + "on the Databricks Data Engineering and Analytics team. Her area\n", + "\n", + "of focus is data sharing and collaboration with Delta Sharing and\n", + "\n", + "Databricks Marketplace. You can reach Giselle on [LinkedIn](https://www.linkedin.com/in/giselle-goicochea/) .\n", + "\n", + "**Kelly Albano** is a Product Marketing Manager on the Databricks\n", + "\n", + "Data Engineering and Analytics team. Her area of focus is security,\n", + "\n", + "compliance and Databricks Clean Rooms. You can reach\n", + "\n", + "Kelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n", + "\n", + "\n", + "harness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. 
More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow DatabricksSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
842ae84382f4816a7c67c6480c3a3d55Kelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n", + "\n", + "\n", + "harness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "© Databricks 2023 All rights reserved\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
76bb60c8fadfe670658fb0e87fc193c4### EBOOK\n", + "\n", + "# A Compact Guide to Large Language Models\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 1\n", + "## Introduction\n", + "\n", + "##### Definition of large language models (LLMs)\n", + "\n", + "Large language models are AI systems that are designed to process and analyze\n", + "vast amounts of natural language data and then use that information to generate\n", + "responses to user prompts. These systems are trained on massive data sets\n", + "using advanced machine learning algorithms to learn the patterns and structures\n", + "of human language, and are capable of generating natural language responses to\n", + "a wide range of written inputs. Large language models are becoming increasingly\n", + "important in a variety of applications such as natural language processing,\n", + "machine translation, code and text generation, and more.\n", + "\n", + "While this guide will focus on language models, it’s important to understand that\n", + "they are only one aspect under a larger generative AI umbrella. Other noteworthy\n", + "generative AI implementations include projects such as art generation from text,\n", + "audio and video generation, and certainly more to come in the near future.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Extremely brief historical background and development of LLMs\n", + "\n", + "\n", + "###### 1950s–1990s\n", + "Initial attempts are made to map hard rules around languages and\n", + "follow logical steps to accomplish tasks like translating a sentence\n", + "from one language to another.\n", + "\n", + "While this works sometimes, strictly defined rules only work for\n", + "concrete, well-defined tasks that the system has knowledge about.\n", + "\n", + "###### 1990s \n", + "Language models begin evolving into statistical models and\n", + "language patterns start being analyzed, but larger-scale projects\n", + "are limited by computing power.\n", + "\n", + "###### 2000s \n", + "Advancements in machine learning increase the complexity of\n", + "language models, and the wide adoption of the internet sees an\n", + "\n", + "enormous increase in available training data.\n", + "\n", + "###### 2012 \n", + "Advancements in deep learning architectures and larger data sets\n", + "lead to the development of GPT (Generative Pre-trained Transformer).\n", + "\n", + "\n", + "###### 2018\n", + "Google introduces BERT (Bidirectional Encoder Representations\n", + "from Transformers), which is a big leap in architecture and paves\n", + "the way for future large language models.\n", + "\n", + "###### 2020\n", + "OpenAI releases GPT-3, which becomes the largest model at\n", + "175B parameters and sets a new performance benchmark for\n", + "language-related tasks.\n", + "\n", + "###### 2022\n", + "ChatGPT is launched, which turns GPT-3 and similar models into\n", + "a service that is widely accessible to users through a web interface\n", + "and kicks off a huge increase in public awareness of LLMs and\n", + "generative AI.\n", + "\n", + "###### 2023\n", + "Open source LLMs begin showing increasingly impressive results\n", + "with releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna.\n", + "GPT-4 is also released, setting a new benchmark for both parameter\n", + "size and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2\n", + "## Understanding Large Language Models\n", + "\n", + "\n", + "##### What are language models and how do they work?\n", + "\n", + "Large language models are advanced artificial 
intelligence systems that take\n", + "some input and generate humanlike text as a response. They work by first\n", + "analyzing vast amounts of data and creating an internal structure that models\n", + "the natural language data sets that they’re trained on. Once this internal\n", + "structure has been developed, the models can then take input in the form of\n", + "natural language and approximate a good response.\n", + "\n", + "##### If they’ve been around for so many years, why are they just now making headlines?\n", + "\n", + "A few recent advancements have really brought the spotlight to generative AI\n", + "and large language models:\n", + "\n", + "**A D VA N C E M E N T S I N T E C H N I Q U E S**\n", + "Over the past few years, there have been significant advancements in the\n", + "techniques used to train these models, resulting in big leaps in performance.\n", + "Notably, one of the largest jumps in performance has come from integrating\n", + "human feedback directly into the training process.\n", + "\n", + "\n", + "**I N C R E A S E D A C C E S S I B I L I T Y**\n", + "The release of ChatGPT opened the door for anyone with internet access\n", + "to interact with one of the most advanced LLMs through a simple web\n", + "interface. This brought the impressive advancements of LLMs into the\n", + "spotlight, since previously these more powerful LLMs were only available\n", + "to researchers with large amounts of resources and those with very deep\n", + "technical knowledge.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf2024-09-19T16:57:20Z
b30db2ad94ab140731be7adc8169387e##### If they’ve been around for so many years, why are they just now making headlines?\n", + "\n", + "A few recent advancements have really brought the spotlight to generative AI\n", + "and large language models:\n", + "\n", + "**A D VA N C E M E N T S I N T E C H N I Q U E S**\n", + "Over the past few years, there have been significant advancements in the\n", + "techniques used to train these models, resulting in big leaps in performance.\n", + "Notably, one of the largest jumps in performance has come from integrating\n", + "human feedback directly into the training process.\n", + "\n", + "\n", + "**I N C R E A S E D A C C E S S I B I L I T Y**\n", + "The release of ChatGPT opened the door for anyone with internet access\n", + "to interact with one of the most advanced LLMs through a simple web\n", + "interface. This brought the impressive advancements of LLMs into the\n", + "spotlight, since previously these more powerful LLMs were only available\n", + "to researchers with large amounts of resources and those with very deep\n", + "technical knowledge.\n", + "\n", + "**G R O W I N G C O M P U TAT I O N A L P O W E R**\n", + "The availability of more powerful computing resources, such as graphics\n", + "processing units (GPUs), and better data processing techniques allowed\n", + "researchers to train much larger models, improving the performance of\n", + "these language models.\n", + "\n", + "**I M P R O V E D T R A I N I N G D ATA**\n", + "As we get better at collecting and analyzing large amounts of data, the\n", + "\n", + "model performance has improved dramatically. In fact, Databricks showed\n", + "that you can get amazing results training a relatively small model with a\n", + "high-quality data set with [Dolly 2.0](https://huggingface.co/databricks/dolly-v2-12b) (and we released the data set as well\n", + "with the databricks-dolly-15k [data set](http://databricks/databricks-dolly-15k) ).\n", + "\n", + "\n", + "-----\n", + "\n", + "##### So what are organizations using large language models for?\n", + "\n", + "Here are just a few examples of common use cases for large language models:\n", + "\n", + "**C H AT B O T S A N D V I R T U A L A S S I S TA N T S**\n", + "One of the most common implementations, LLMs can be used by\n", + "organizations to provide help with things like customer support,\n", + "troubleshooting, or even having open-ended conversations with userprovided prompts.\n", + "\n", + "**C O D E G E N E R AT I O N A N D D E B U G G I N G**\n", + "LLMs can be trained on large amounts of code examples and give\n", + "useful code snippets as a response to a request written in natural language.\n", + "With the proper techniques, LLMs can also be built in a way to reference\n", + "other relevant data that it may not have been trained with, such as a\n", + "company’s documentation, to help provide more accurate responses.\n", + "\n", + "**S E N T I M E N T A N A LY S I S**\n", + "Often a hard task to quantify, LLMs can help take a piece of text and gauge\n", + "emotion and opinions. This can help organizations gather the data and\n", + "\n", + "feedback needed to improve customer satisfaction.\n", + "\n", + "\n", + "**L A N G U A G E T R A N S L AT I O N**\n", + "Globalize all your content without hours of painstaking work by simply\n", + "feeding your web pages through the proper LLMs and translating them to\n", + "different languages. 
As more LLMs are trained in other languages, quality\n", + "and availability will continue to improve.\n", + "\n", + "**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\n", + "Entire customer calls or meetings could be efficiently summarized so that\n", + "others can more easily digest the content. LLMs can take large amounts of\n", + "text and boil it down to just the most important bytes.\n", + "\n", + "**C O N T E N T G E N E R AT I O N**\n", + "Start with a detailed prompt and have an LLM develop an outline for you.\n", + "Then continue on with those prompts and LLMs can generate a good first\n", + "draft for you to build off. Use them to brainstorm ideas, and ask the LLM\n", + "questions to help you draw inspiration from.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf2024-09-19T16:57:20Z
156c59c7ad359ad7302ff542371be820feedback needed to improve customer satisfaction.\n", + "\n", + "\n", + "**L A N G U A G E T R A N S L AT I O N**\n", + "Globalize all your content without hours of painstaking work by simply\n", + "feeding your web pages through the proper LLMs and translating them to\n", + "different languages. As more LLMs are trained in other languages, quality\n", + "and availability will continue to improve.\n", + "\n", + "**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\n", + "Entire customer calls or meetings could be efficiently summarized so that\n", + "others can more easily digest the content. LLMs can take large amounts of\n", + "text and boil it down to just the most important bytes.\n", + "\n", + "**C O N T E N T G E N E R AT I O N**\n", + "Start with a detailed prompt and have an LLM develop an outline for you.\n", + "Then continue on with those prompts and LLMs can generate a good first\n", + "draft for you to build off. Use them to brainstorm ideas, and ask the LLM\n", + "questions to help you draw inspiration from.\n", + "\n", + "**_Note:_** Most LLMs are _not_ trained to be fact machines. They know how to use\n", + "language, but they might not know who won the big sporting event last year.\n", + "It’s always important to fact check and understand the responses before\n", + "\n", + "using them as a reference.\n", + "\n", + "\n", + "**T E X T C L A S S I F I C AT I O N A N D C L U S T E R I N G**\n", + "The ability to categorize and sort large volumes of data enables the\n", + "identification of common themes and trends, supporting informed\n", + "decision-making and more targeted strategies.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 3\n", + "## Applying Large Language Models\n", + "\n", + "\n", + "There are a few paths that one can take when looking to apply large language\n", + "models for their given use case. Generally speaking, you can break them down\n", + "into two categories, but there’s some crossover between each. We’ll briefly cover\n", + "the pros and cons of each and what scenarios fit best for each.\n", + "\n", + "##### Proprietary services\n", + "\n", + "As the first widely available LLM powered service, OpenAI’s ChatGPT was the\n", + "explosive charge that brought LLMs into the mainstream. ChatGPT provides\n", + "a nice user interface (or API) where users can feed prompts to one of many\n", + "models (GPT-3.5, GPT-4, and more) and typically get a fast response. These are\n", + "among the highest-performing models, trained on enormous data sets, and are\n", + "capable of extremely complex tasks both from a technical standpoint, such as\n", + "code generation, as well as from a creative perspective like writing poetry in a\n", + "specific style.\n", + "\n", + "The downside of these services is the absolutely enormous amount of compute\n", + "required not only to train them (OpenAI has said GPT-4 cost them over $100\n", + "million to develop) but also to serve the responses. For this reason, these\n", + "extremely large models will likely always be under the control of organizations,\n", + "\n", + "\n", + "and require you to send your data to their servers in order to interact with their\n", + "language models. This raises privacy and security concerns, and also subjects\n", + "users to “black box” models, whose training and guardrails they have no control\n", + "over. 
Also, due to the compute required, these services are not free beyond a\n", + "very limited use, so cost becomes a factor in applying these at scale.\n", + "\n", + "In summary: Proprietary services are great to use if you have very complex tasks,\n", + "are okay with sharing your data with a third party, and are prepared to incur\n", + "costs if operating at any significant scale.\n", + "\n", + "##### Open source models\n", + "\n", + "The other avenue for language models is to go to the open source community,\n", + "where there has been similarly explosive growth over the past few years.\n", + "Communities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n", + "\n", + "from contributors that can help solve tons of specific use cases such as text\n", + "generation, summarization and classification. The open source community has\n", + "been quickly catching up to the performance of the proprietary models, but\n", + "ultimately still hasn’t matched the performance of something like GPT-4.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf2024-09-19T16:57:20Z
db2ede947d82f8c39104fbd747f97f40In summary: Proprietary services are great to use if you have very complex tasks,\n", + "are okay with sharing your data with a third party, and are prepared to incur\n", + "costs if operating at any significant scale.\n", + "\n", + "##### Open source models\n", + "\n", + "The other avenue for language models is to go to the open source community,\n", + "where there has been similarly explosive growth over the past few years.\n", + "Communities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n", + "\n", + "from contributors that can help solve tons of specific use cases such as text\n", + "generation, summarization and classification. The open source community has\n", + "been quickly catching up to the performance of the proprietary models, but\n", + "ultimately still hasn’t matched the performance of something like GPT-4.\n", + "\n", + "\n", + "-----\n", + "\n", + "It does currently take a little bit more work to grab an open source model and\n", + "start using it, but progress is moving very quickly to make them more accessible\n", + "to users. On Databricks, for example, we’ve made [improvements to open source](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html)\n", + "[frameworks](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html) like MLflow to make it very easy for someone with a bit of Python\n", + "experience to pull any Hugging Face transformer model and use it as a Python\n", + "object. Oftentimes, you can find an open source model that solves your specific\n", + "problem that is **orders of magnitude** smaller than ChatGPT, allowing you to bring\n", + "the model into your environment and host it yourself. This means that you can\n", + "keep the data in your control for privacy and governance concerns as well as\n", + "manage your costs.\n", + "\n", + "\n", + "##### Conclusion and general guidelines\n", + "\n", + "Ultimately, every organization is going to have unique challenges to overcome,\n", + "and there isn’t a one-size-fits-all approach when it comes to LLMs. As the world\n", + "becomes more data driven, everything, including LLMs, will be reliant on having\n", + "a strong foundation of data. LLMs are incredible tools, but they have to be used\n", + "and implemented on top of this strong data foundation. Databricks brings both\n", + "that strong data foundation as well as the integrated tools to let you use and\n", + "fine-tune LLMs in your domain.\n", + "\n", + "\n", + "Another huge upside to using open source models is the ability to fine-tune\n", + "them to your own data. Since you’re not dealing with a black box of a proprietary\n", + "service, there are techniques that let you take open source models and train\n", + "them to your specific data, greatly improving their performance on your\n", + "specific domain. We believe the future of language models is going to move\n", + "in this direction, as more and more organizations will want full control and\n", + "understanding of their LLMs.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4\n", + "## So What Do I Do Next If I Want to Start Using LLMs?\n", + "\n", + "\n", + "That depends where you are on your journey! 
Fortunately, we have a few paths\n", + "for you.\n", + "\n", + "If you want to go a little deeper into LLMs but aren’t quite ready to do it yourself,\n", + "you can watch one of Databricks’ most talented developers and speakers go\n", + "over these concepts in more detail during the on-demand talk “ [How to Build](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "[Your Own Large Language Model Like Dolly.](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly) ”\n", + "\n", + "If you’re ready to dive a little deeper and expand your education and\n", + "understanding of LLM foundations, we’d recommend checking out our\n", + "[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\n", + "and dive into the theory behind foundation models.\n", + "\n", + "If your hands are already shaking with excitement and you already have some\n", + "working knowledge of Python and Databricks, we’ll provide some great examples\n", + "with sample code that can get you up and running with LLMs right away!\n", + "\n", + "\n", + "###### Getting started with NLP using Hugging Face transformers pipelines\n", + "\n", + " Fine-Tuning Large Language Models with Hugging Face and DeepSpeedSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf2024-09-19T16:57:20Z
6164b4b9647cc0b9d8049c2b7312a557If you’re ready to dive a little deeper and expand your education and\n", + "understanding of LLM foundations, we’d recommend checking out our\n", + "[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\n", + "and dive into the theory behind foundation models.\n", + "\n", + "If your hands are already shaking with excitement and you already have some\n", + "working knowledge of Python and Databricks, we’ll provide some great examples\n", + "with sample code that can get you up and running with LLMs right away!\n", + "\n", + "\n", + "###### Getting started with NLP using Hugging Face transformers pipelines\n", + "\n", + " Fine-Tuning Large Language Models with Hugging Face and DeepSpeed\n", + "\n", + " Introducing AI Functions: Integrating Large Language Models with Databricks SQL\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000\n", + "\n", + "organizations worldwide — including Comcast, Condé Nast and\n", + "\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "#### Contact us for a personalized demo: databricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf2024-09-19T16:57:20Z
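The MLflow-plus-transformers workflow mentioned above ("pull any Hugging Face transformer model and use it as a Python object") can be illustrated in a few lines. This is a minimal sketch, assuming transformers and MLflow 2.3+ are installed; the model name, sample text, and artifact path are illustrative choices, not anything prescribed by the guide or this repo.

```python
# Minimal sketch (not from the ebook): load a Hugging Face pipeline as a plain
# Python object and track it with MLflow. Model name and artifact path are
# illustrative assumptions.
import mlflow
from transformers import pipeline

# Any task/model pair works the same way; summarization is used only as an example.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

text = (
    "Large language models are AI systems trained on massive data sets to learn "
    "the patterns and structures of human language and generate natural language "
    "responses to a wide range of written inputs."
)
print(summarizer(text, max_length=40, min_length=10)[0]["summary_text"])

# Log the pipeline so it can be versioned, shared, and reloaded later.
with mlflow.start_run():
    mlflow.transformers.log_model(
        transformers_model=summarizer,
        artifact_path="summarizer",
    )
```

The logged pipeline can later be reloaded with `mlflow.transformers.load_model()` for batch scoring or serving, which is the "use it as a Python object" behavior the guide refers to.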
9f63c6b051354dd466246dab2d3a7ff5##### EBOOK\n", + "\n", + "# 8 Steps to Becoming an AI-Forward Retailer\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "\n", + "Introduction .............................................................................................................................................................................................. **3**\n", + "\n", + "The State of the Retail Industry:\n", + "\n", + "The Diverging Performance of Data Leaders vs. Data Laggards ...................................................................................... **4**\n", + "\n", + "Begin With a Shared Vision of Success ....................................................................................................................................... **6**\n", + "\n", + "Why Companies Struggle With Setting Clear Business Outcomes for AI ................................................................... **7**\n", + "\n", + "Before Diving In: Assess Your Readiness ..................................................................................................................................... **9**\n", + "\n", + "Getting Started: Putting Some Wins on the Board .................................................................................................................. **11**\n", + "\n", + "Going Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n", + "\n", + "Normalizing the Process: Engraining a Data-Driven MindsetSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
f0a2f88cb135664940c16d2d9507c27fGoing Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n", + "\n", + "Normalizing the Process: Engraining a Data-Driven Mindset\n", + "\n", + "Into the Fabric of the Business ...................................................................................................................................................... **14**\n", + "\n", + "From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise .......................................... **16**\n", + "\n", + "The 8 Steps to Building a Data-Forward Retailer ................................................................................................................... **17**\n", + "\n", + "Transform Retail Data Into Actionable Insights ....................................................................................................................... **21**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "\n", + "In a world where data is king, retailers have historically been trailblazers, pioneering data technology\n", + "adoption to supercharge their operations, enhance customer understanding and sharpen\n", + "personalization. The journey began with the simple cash register about 150 years ago, progressed to\n", + "standardized product reporting with the introduction of the UPC and EAN, and has evolved to include\n", + "cutting-edge technologies such as RFID and machine learning.\n", + "\n", + "Today, we stand on the brink of “Generation AI,” defined by sophisticated language models and\n", + "images. Retailers, with their history of embracing data technologies, find themselves in a strong\n", + "position to reap the benefits of this new era. Automation of customer service, supply chain modeling\n", + "with digital twins and delivering hyper-personalized experiences in real time are all in the cards,\n", + "promising to bolster revenue, improve margins and slash costs for early adopters.\n", + "\n", + "According to an internal analysis by Databricks, data pioneers are already outstripping their\n", + "competition. The “Databricks 30” — an index tracking the publicly traded data and AI leaders across\n", + "six major industry sectors, including retail — shows these front-runners outperforming the rest of the\n", + "market by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies\n", + "are setting themselves up for significant gains and a robust competitive advantage.\n", + "\n", + "However, for retailers mired in the landscape of outdated data platforms, the transformation into an\n", + "AI-driven organization can seem a Herculean task. Embracing this wave of innovative technologies may\n", + "feel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly\n", + "evolving retail landscape.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
8af29864e6963d85e9bb0a6142ac12f2Today, we stand on the brink of “Generation AI,” defined by sophisticated language models and\n", + "images. Retailers, with their history of embracing data technologies, find themselves in a strong\n", + "position to reap the benefits of this new era. Automation of customer service, supply chain modeling\n", + "with digital twins and delivering hyper-personalized experiences in real time are all in the cards,\n", + "promising to bolster revenue, improve margins and slash costs for early adopters.\n", + "\n", + "According to an internal analysis by Databricks, data pioneers are already outstripping their\n", + "competition. The “Databricks 30” — an index tracking the publicly traded data and AI leaders across\n", + "six major industry sectors, including retail — shows these front-runners outperforming the rest of the\n", + "market by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies\n", + "are setting themselves up for significant gains and a robust competitive advantage.\n", + "\n", + "However, for retailers mired in the landscape of outdated data platforms, the transformation into an\n", + "AI-driven organization can seem a Herculean task. Embracing this wave of innovative technologies may\n", + "feel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly\n", + "evolving retail landscape.\n", + "\n", + "To help you navigate the rapidly evolving world of retail and consumer goods, this eBook provides a\n", + "road map for organizations embarking on digital transformation journeys — a shift that is as much\n", + "about culture as it is about technology, if not more so. The core advice? Start with a crystal-clear\n", + "vision for transformation, outlining a compelling case for why such change is vital for the company’s\n", + "long-term survival. Then, initiate the process by introducing AI to make gradual enhancements in\n", + "critical business procedures.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The State of the Retail Industry: The Diverging Performance of Data Leaders vs. Data Laggards\n", + "\n", + "\n", + "The pandemic’s fallout has led to a widening chasm between the retail industry’s\n", + "leaders and laggards. McKinsey & Company encapsulated this trend succinctly:\n", + "“Companies with tech-forward business models, who were already pulling ahead\n", + "pre-crisis, left their competitors in the dust.”\n", + "\n", + "But what exactly is a “tech-forward business model”? It isn’t a simple narrative of\n", + "digital natives dethroning traditional retailers. Heavyweights like Walmart, Target\n", + "and Costco held their own against Amazon. Nor was it purely a matter of scale —\n", + "smaller brands like Warby Parker or Everlane managed to carve out substantial\n", + "consumer bases, competing against larger, established players.\n", + "\n", + "**The common denominator among all victors**\n", + "**was their ability to harness data, analytics and AI**\n", + "**to rapidly react to shifts in consumer behavior.**\n", + "\n", + "\n", + "methods, optimizing operations to alleviate the pressure these modes exerted\n", + "on margins. They successfully established tighter partnerships with suppliers\n", + "and logistic entities, collaborating toward shared triumphs.\n", + "\n", + "In all these instances, it was their timely access to information, foresight\n", + "driven by this data, and the exploration of probable outcomes that set these\n", + "organizations apart. 
Infusing data-driven decision-making into core processes\n", + "within the organization, as well as those crossing partner boundaries, unlocked\n", + "this approach’s full potential.\n", + "\n", + "To illustrate the significance of prioritizing data and AI, we developed the\n", + "Databricks 30 Index. Drawing inspiration from Morgan Stanley’s “Data Era”\n", + "stocks research, this index tracks marquee customers across our top five\n", + "verticals and partners. The Databricks 30 is an equal-weight price index,\n", + "\n", + "composed of five marquee customers each across Retail/Consumer Products,\n", + "Financial Services, Healthcare, Media/Entertainment, Manufacturing/Logistics,\n", + "plus five strategic partners.\n", + "\n", + "\n", + "These businesses deftly used consumer demand insights to understand the\n", + "effects of supply chain disruptions and labor shortages and reallocate resources\n", + "to mitigate the most harmful impacts. They adeptly introduced new delivery\n", + "\n", + "\n", + "-----\n", + "\n", + "Our analysis reveals that companies in the Databricks 30 Index outpaced the\n", + "S&P 500 by an impressive +21 percentage points (pp) over the past three years.\n", + "In other words, if the stock market rose by 50% during this period, the Databricks\n", + "30 Index would have soared by 71% (outperforming by 21pp). Even more\n", + "remarkable, excluding tech entirely from the Databricks 30, the Databricks 30\n", + "ex-Tech index outperforms the S&P 500 by an even larger margin over the same\n", + "time frame: +23pp.\n", + "\n", + "\n", + "DB30 DOw30SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
7a3d84f6cfab2fffb4dd147ed10688b3composed of five marquee customers each across Retail/Consumer Products,\n", + "Financial Services, Healthcare, Media/Entertainment, Manufacturing/Logistics,\n", + "plus five strategic partners.\n", + "\n", + "\n", + "These businesses deftly used consumer demand insights to understand the\n", + "effects of supply chain disruptions and labor shortages and reallocate resources\n", + "to mitigate the most harmful impacts. They adeptly introduced new delivery\n", + "\n", + "\n", + "-----\n", + "\n", + "Our analysis reveals that companies in the Databricks 30 Index outpaced the\n", + "S&P 500 by an impressive +21 percentage points (pp) over the past three years.\n", + "In other words, if the stock market rose by 50% during this period, the Databricks\n", + "30 Index would have soared by 71% (outperforming by 21pp). Even more\n", + "remarkable, excluding tech entirely from the Databricks 30, the Databricks 30\n", + "ex-Tech index outperforms the S&P 500 by an even larger margin over the same\n", + "time frame: +23pp.\n", + "\n", + "\n", + "DB30 DOw30\n", + "\n", + "\n", + "Similar to Morgan Stanley’s analysis, we find that non-tech U.S. companies that\n", + "are investing in cloud, data and innovation do, in fact, win.\n", + "\n", + "\n", + "So now that we see the impact, let’s dive into the steps retail organizations can\n", + "take to put themselves on a trajectory of continued growth and success amid an\n", + "ever-changing landscape.\n", + "\n", + "\n", + "01-01-2019 01-01-2020 01-01-2021 01-01-2022 01-01-2023\n", + "\n", + "\n", + "01-01-2019 01-01-2020 01-01-2021\n", + "\n", + "\n", + "DATE\n", + "\n", + "\n", + "-----\n", + "\n", + "## Begin With a Shared Vision of Success\n", + "\n", + "\n", + "The most overlooked activity in becoming an AI-forward retailer is the most\n", + "crucial. In the rush to secure a position on the AI frontier, many companies\n", + "are leaping before they look, embarking on AI initiatives without a clear\n", + "understanding of what they want to achieve. Simply adopting the newest,\n", + "shiniest tech tools isn’t a silver bullet. Many companies set themselves up for\n", + "failure by neglecting to clearly define the expected business outcomes at the\n", + "onset of the initiative, a strategic move that can effectively reduce project risk\n", + "and costs and lead to the ultimate success of the program. In fact, in an attempt\n", + "to accelerate results, this cavalier approach can instead spiral into expensive\n", + "mistakes, wasted resources and a decrease in trust for stakeholders from\n", + "unmet expectations. It’s like setting sail on an open ocean without a destination\n", + "in mind; the journey might provide some interesting detours, but it lacks\n", + "direction and purpose.\n", + "\n", + "However, when organizations take the time to articulate their expected\n", + "business outcomes before deploying AI and data-driven programs, they position\n", + "themselves to reduce project risk and costs. By aligning AI initiatives with\n", + "specific business objectives and creating a shared vision with stakeholders,\n", + "the focus becomes less about the technology itself and more about how it\n", + "can be used to reach these defined goals.\n", + "\n", + "\n", + "Technology decisions, too, are improved by having a known target. Without\n", + "clear business outcomes in mind, companies tend to design, develop and\n", + "implement technologies that _might_ be needed to solve the problem. 
Aligning\n", + "the technical road map and activities with business outcomes mitigates the\n", + "risk of misallocated resources and the potential fallout from the unfulfilled\n", + "promise of AI.\n", + "\n", + "Furthermore, a clear understanding of expected business outcomes allows\n", + "for efficient project management and cost control. Companies can set key\n", + "performance indicators (KPIs) tied directly to these outcomes. This not only\n", + "provides a means to measure progress, but also helps control costs by\n", + "ensuring that resources are targeted toward initiatives that deliver value.\n", + "\n", + "It’s not just about numbers either; having explicit objectives aids in cultivating\n", + "\n", + "stakeholder buy-in. Clear communication about the purpose and potential\n", + "benefits of an AI initiative can foster support from executives, employees,\n", + "investors and customers alike. This collective backing can further mitigate risk\n", + "and cut costs by ensuring that everyone is pulling in the same direction.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Why Companies Struggle With Setting Clear Business Outcomes for AI\n", + "\n", + "\n", + "Getting started with AI at your organization might be daunting, and that’s\n", + "because it is a big undertaking! Struggling to define clear outcomes for AI\n", + "projects is a common issue among many businesses for a variety of reasons.\n", + "Here are some key factors that contribute to this challenge:\n", + "\n", + "**They believe the data strategy is a technology problem.**\n", + "\n", + "Companies often hire a chief data officer, or make the data strategy\n", + "the responsibility of the technology organization.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
5d904afdb1f9a9de4e4f6617833b86c6It’s not just about numbers either; having explicit objectives aids in cultivating\n", + "\n", + "stakeholder buy-in. Clear communication about the purpose and potential\n", + "benefits of an AI initiative can foster support from executives, employees,\n", + "investors and customers alike. This collective backing can further mitigate risk\n", + "and cut costs by ensuring that everyone is pulling in the same direction.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Why Companies Struggle With Setting Clear Business Outcomes for AI\n", + "\n", + "\n", + "Getting started with AI at your organization might be daunting, and that’s\n", + "because it is a big undertaking! Struggling to define clear outcomes for AI\n", + "projects is a common issue among many businesses for a variety of reasons.\n", + "Here are some key factors that contribute to this challenge:\n", + "\n", + "**They believe the data strategy is a technology problem.**\n", + "\n", + "Companies often hire a chief data officer, or make the data strategy\n", + "the responsibility of the technology organization.\n", + "\n", + "**They lack an understanding of their business processes**\n", + "An alarming number of businesses jump onto the AI bandwagon without\n", + "understanding how their business operates. Decisions are made at\n", + "the leadership level, but how they translate to operational decisions is\n", + "muddled. Data and AI are fundamentally business process technologies,\n", + "\n", + "and without fully understanding how the business works, any initiative\n", + "in data and AI is bound to have limited success.\n", + "\n", + "\n", + "**They lack a data culture**\n", + "\n", + "Somewhat related to the previous point, many companies have teams\n", + "that make decisions based on experience and intuition. These should\n", + "not be discounted, but the reason for intuition is often a result of a\n", + "poor definition of processes, which prevents the ability to measure\n", + "and improve processes.\n", + "\n", + "**They struggle to get high-quality data**\n", + "\n", + "AI projects require good-quality, relevant data. Many businesses\n", + "struggle with issues related to data access, quality, privacy and\n", + "security, which can complicate the process of defining clear outcomes.\n", + "\n", + "**They lack the organizational structures required**\n", + "\n", + "Implementing AI often requires significant changes in business\n", + "\n", + "processes, organizational structures and even corporate culture.\n", + "Many companies find it hard to manage these changes, leading to\n", + "difficulties in setting and achieving clear outcomes.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data and AI programs are a business process problem first, and a\n", + "technology problem last. Familiarity with technology is important, but\n", + "irrelevant if companies do not understand it.\n", + "\n", + "Addressing these challenges often requires companies to invest in\n", + "education about AI capabilities, to formulate clear strategies, to manage\n", + "change effectively, and to bring on board the necessary skills either\n", + "by hiring new talent or upskilling existing employees. 
It’s a journey that\n", + "requires commitment, but the potential benefits of successful AI initiatives\n", + "make it a worthwhile venture.\n", + "\n", + "\n", + "**They don’t have the right people in place**\n", + "\n", + "There’s often a gap between the skills available within a company and\n", + "the skills needed to define and achieve AI outcomes. Without team\n", + "members who understand AI, data analysis and project management,\n", + "businesses can struggle to set clear objectives for AI initiatives.\n", + "\n", + "**They struggle to quantify the value of AI projects**\n", + "\n", + "AI’s benefits can sometimes be intangible or long-term, making them\n", + "difficult to quantify. Companies may struggle to define outcomes in\n", + "measurable terms, complicating the process of setting objectives\n", + "and monitoring progress.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Before Diving In: Assess Your Readiness\n", + "\n", + "\n", + "There is a growing sense of urgency for organizations relatively new to data\n", + "and AI-driven enablement to “get in the game.” Profiles of top performers and\n", + "headline-making achievements create a clearer sense of what is possible\n", + "and what can be gained, leaving those entering into the space eager to achieve\n", + "similar results.\n", + "\n", + "But what’s missing in those articles are the sustained investments in\n", + "process, people and technology and the numerous challenges, missteps and\n", + "outright failures that had to occur before success was achieved. Data-driven\n", + "transformation is a journey, and before any successful journey is pursued,\n", + "it’s wise to reflect on the organization’s readiness so that you can anticipate\n", + "challenges and identify areas for remediation and improvement that will\n", + "deliver you to your intended destination.\n", + "\n", + "With this in mind, we encourage organizations new to this space to\n", + "assess their maturity in terms of the use and management of their existing\n", + "information assets:\n", + "\n", + "1. How easily discoverable and accessible are data in\n", + "your environment?\n", + "\n", + "\n", + "3. Is the quality of these data formally verified?\n", + "\n", + "4. Are key entities such as products and customers actively\n", + "managed, and can data related to these items be easily linked\n", + "across various data sources?SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
bc6bec299cd25b6f4221096090e241c7But what’s missing in those articles are the sustained investments in\n", + "process, people and technology and the numerous challenges, missteps and\n", + "outright failures that had to occur before success was achieved. Data-driven\n", + "transformation is a journey, and before any successful journey is pursued,\n", + "it’s wise to reflect on the organization’s readiness so that you can anticipate\n", + "challenges and identify areas for remediation and improvement that will\n", + "deliver you to your intended destination.\n", + "\n", + "With this in mind, we encourage organizations new to this space to\n", + "assess their maturity in terms of the use and management of their existing\n", + "information assets:\n", + "\n", + "1. How easily discoverable and accessible are data in\n", + "your environment?\n", + "\n", + "\n", + "3. Is the quality of these data formally verified?\n", + "\n", + "4. Are key entities such as products and customers actively\n", + "managed, and can data related to these items be easily linked\n", + "across various data sources?\n", + "\n", + "5. How quickly are data made available for analysis following their\n", + "creation or modification? Is this latency aligned with how you\n", + "might use this data?\n", + "\n", + "6. Are processes established for determining appropriate uses of\n", + "data, governing access and providing oversight on consumption?\n", + "\n", + "7. Is there one individual responsible for effective data management\n", + "across the enterprise, and has this person established a\n", + "\n", + "process for receiving and responding to feedback and shifting\n", + "organizational priorities?\n", + "\n", + "This list of questions is by no means exhaustive, but it should help to identify\n", + "blockers that are likely to become impediments down the road.\n", + "\n", + "\n", + "2. How well understood are these information assets?\n", + "\n", + "\n", + "-----\n", + "\n", + "Similarly, we would encourage organizations to assess their maturity in terms of\n", + "analytics capabilities:\n", + "\n", + "1. Is business performance at all levels assessed in terms of\n", + "key metrics?\n", + "\n", + "2. How frequently are data-driven analyses used in making key\n", + "business decisions?\n", + "\n", + "3. To what degree are advanced analytics techniques\n", + "— i.e., data science — used in decision-making processes?\n", + "\n", + "4. Are predictive models regularly leveraged as part of operational\n", + "business processes?\n", + "\n", + "5. How is experimentation used to assess the performance of\n", + "various initiatives?\n", + "\n", + "\n", + "Lastly, and probably most importantly, we’d encourage the organization to\n", + "perform a frank assessment of its readiness to embrace change. Becoming a\n", + "data-driven enterprise is fundamentally about operating differently than before.\n", + "Decision-making authority becomes more diffuse and often more automated.\n", + "Project outcomes become less certain as the organization focuses on innovation\n", + "where learning is emphasized over predictable results. Process silos often\n", + "become more intertwined as new modes of engagement evolve.\n", + "\n", + "When done right, this transition creates a healthy tension between what’s\n", + "needed to be successful today and what’s needed to be successful tomorrow.\n", + "But this can also manifest itself as employee resistance and political infighting\n", + "as processes and organizational structures evolve. 
What’s often needed to\n", + "overcome this is strong leadership, a clear vision and mandate for change as\n", + "well as a reassessment of incentive structures and active organizational change\n", + "management as the organization transitions into this new way of working.\n", + "\n", + "\n", + "6. Are predictive models used to automate key business decisions?\n", + "\n", + "\n", + "7. Has the organization embraced a model of continuous deployment\n", + "for the regular update of model-driven processes?\n", + "\n", + "\n", + "**TRADITIONAL APPROACH**\n", + "\n", + "**Upfront reqs** **Technical implementation** **Production**\n", + "\n", + "\n", + "**ITERATIVE APPROACH**\n", + "\n", + "\n", + "Continuous feedback\n", + "\n", + "\n", + "**Business questions** **Testing** **Production** **Optimization**\n", + "\n", + "Continuous learning and optimization\n", + "\n", + "An iterative approach involves the use of data to continually optimize the performance of data products.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started: Putting Some Wins on the Board\n", + "\n", + "\n", + "With the organization ready to proceed, the next phase is about learning to\n", + "deliver new solutions within your organization. There will be new technologies\n", + "to deploy and new skills to develop, and there will be new patterns for\n", + "integration into business workflows and procedures for incremental updates\n", + "and improvements. But most importantly, there will need to be a new level of\n", + "partnership and trust between the business and the technology sides of the\n", + "organization that needs to be carefully nurtured.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
4759cd08dc657d0089a31804b27e273e7. Has the organization embraced a model of continuous deployment\n", + "for the regular update of model-driven processes?\n", + "\n", + "\n", + "**TRADITIONAL APPROACH**\n", + "\n", + "**Upfront reqs** **Technical implementation** **Production**\n", + "\n", + "\n", + "**ITERATIVE APPROACH**\n", + "\n", + "\n", + "Continuous feedback\n", + "\n", + "\n", + "**Business questions** **Testing** **Production** **Optimization**\n", + "\n", + "Continuous learning and optimization\n", + "\n", + "An iterative approach involves the use of data to continually optimize the performance of data products.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started: Putting Some Wins on the Board\n", + "\n", + "\n", + "With the organization ready to proceed, the next phase is about learning to\n", + "deliver new solutions within your organization. There will be new technologies\n", + "to deploy and new skills to develop, and there will be new patterns for\n", + "integration into business workflows and procedures for incremental updates\n", + "and improvements. But most importantly, there will need to be a new level of\n", + "partnership and trust between the business and the technology sides of the\n", + "organization that needs to be carefully nurtured.\n", + "\n", + "The best way we have found to do this is to start with projects that improve\n", + "on existing operational workflows, i.e., do what you do, but do it smarter.\n", + "The business is often familiar with existing pain points and can more clearly\n", + "envision how a new capability can be folded into its processes. They are also\n", + "familiar with how to assess the impact a new approach may have on their\n", + "business and can help design tests to validate whether the intended results\n", + "\n", + "\n", + "As capabilities demonstrating value over the status quo are developed, they\n", + "are folded into business processes. This is not a one-and-done effort but part\n", + "of an ongoing cycle of deployment to continue so long as the team has a line\n", + "of sight to meaningful gains. The team does not wait for the ideal solution but\n", + "instead focuses on incremental improvements that deliver measurable value\n", + "along the way.\n", + "\n", + "Oversight for this process is provided by another body, one tasked with the\n", + "success of the overall transformative efforts within the business. As success\n", + "is delivered, there will be growing demand for the time and talents of these\n", + "teams, and the organization will need to prioritize resources across an increasing\n", + "number of opportunities. This steering committee will need to be responsible for\n", + "allocating limited resources and advocating for additional ones as well to strike\n", + "the right balance of investments for the organization.\n", + "\n", + "\n", + "are or are not being delivered.\n", + "\n", + "\n", + "**DEMAND FORECASTING**\n", + "\n", + "Demand forecasting is a massive challenge for retail and consumer goods\n", + "\n", + "organizations. 
And one where even an incremental change can have a massive impact,\n", + "\n", + "so it’s often one of the first projects organizations identify to put a win on the board.\n", + "\n", + "According to [McKinsey](https://www.mckinsey.com/featured-insights/artificial-intelligence/notes-from-the-ai-frontier-applications-and-value-of-deep-learning) , a 10% to 20% improvement in supply chain forecasting\n", + "\n", + "accuracy is likely to produce a 5% reduction in inventory costs and a 2% to 3%\n", + "\n", + "increase in revenues. To hit the ground running, check out the [Databricks Solution](https://www.databricks.com/solutions/accelerators/demand-forecasting)\n", + "\n", + "[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n", + "\n", + "key use cases.\n", + "\n", + "\n", + "Work on these projects is a collaborative effort between the business and IT.\n", + "Together, the project team explores a potential solution with a notion of how it\n", + "may be integrated in mind from the outset. As the project unfolds, all members\n", + "are part of the iterative cycles and help to steer the solution in new directions\n", + "until an item of value is derived.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Going Big: Learning to Embrace Transformational Change\n", + "\n", + "\n", + "With some experience under your belt, it’s time to build on the organizational\n", + "muscle developed during initial efforts and flex for more transformative impact.\n", + "Again, the focus is on established functions within the business, but instead of\n", + "pointed, incremental improvements, the team begins to create a vision for the\n", + "part of the organization that would operate if it were to fully embrace data and\n", + "AI enablement.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
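The kind of incremental forecasting win described above can be prototyped quickly before adopting the Solution Accelerators. The sketch below is not the accelerator code; it is a hedged, self-contained toy that fits a single store/item series on synthetic daily sales, assuming the prophet package is available.

```python
# Toy sketch only (not the Solution Accelerator): forecast 4 weeks of demand
# for one store/item series using Prophet on synthetic data.
import numpy as np
import pandas as pd
from prophet import Prophet

rng = np.random.default_rng(42)
dates = pd.date_range("2022-01-01", periods=730, freq="D")
# Synthetic demand: mild trend + weekly seasonality + noise.
units = (
    100
    + 0.05 * np.arange(len(dates))
    + 15 * np.sin(2 * np.pi * dates.dayofweek / 7)
    + rng.normal(0, 5, len(dates))
)
history = pd.DataFrame({"ds": dates, "y": units})

model = Prophet(weekly_seasonality=True, yearly_seasonality=True)
model.fit(history)

future = model.make_future_dataframe(periods=28)  # 4-week horizon
forecast = model.predict(future)
print(forecast[["ds", "yhat", "yhat_lower", "yhat_upper"]].tail())
```

In practice this loop is run per store/item combination and compared against the incumbent forecast, which is where the accuracy gains cited by McKinsey come from.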
0d9476593a17460dc2d5ede6a2f13133[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n", + "\n", + "key use cases.\n", + "\n", + "\n", + "Work on these projects is a collaborative effort between the business and IT.\n", + "Together, the project team explores a potential solution with a notion of how it\n", + "may be integrated in mind from the outset. As the project unfolds, all members\n", + "are part of the iterative cycles and help to steer the solution in new directions\n", + "until an item of value is derived.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Going Big: Learning to Embrace Transformational Change\n", + "\n", + "\n", + "With some experience under your belt, it’s time to build on the organizational\n", + "muscle developed during initial efforts and flex for more transformative impact.\n", + "Again, the focus is on established functions within the business, but instead of\n", + "pointed, incremental improvements, the team begins to create a vision for the\n", + "part of the organization that would operate if it were to fully embrace data and\n", + "AI enablement.\n", + "\n", + "It’s at this phase that many of the concerns about organizational resistance\n", + "mentioned earlier are most likely to manifest themselves. Ideally, initial\n", + "implementation efforts have built champions within the business, but it’s still\n", + "important to be mindful of pushback that can emerge as the organization more\n", + "fully begins to change. Having and maintaining strong business sponsorship\n", + "in this phase is critical, and having that sponsor articulate and regularly\n", + "reinforce a clear vision for the change that’s now underway can help everyone\n", + "\n", + "understand the need to support these efforts.\n", + "\n", + "\n", + "So far in this exploration of the journey to data and AI transformation, we’ve\n", + "minimized the importance of technology in order to focus on the business and\n", + "organizational aspects that often get neglected in this conversation. But it’s\n", + "at this stage that the organization needs to have established its preference\n", + "for data and analytics platforms. Because of the breadth of needs that will\n", + "have to be addressed and the ongoing innovation taking place in the data\n", + "science community, we strongly suggest standardizing on a platform that is\n", + "open and flexible while also providing cost-effective use of both infrastructure\n", + "and people resources and strong data governance and protection. For many\n", + "organizations, the Databricks Lakehouse Platform has proven itself to be the\n", + "ideal platform to meet these needs.\n", + "\n", + "**WHY STANDARDIZE ON DATABRICKS?**\n", + "\n", + "The Databricks Lakehouse is the only enterprise data and AI\n", + "\n", + "platform that allows retailers to leverage all of their data, from any\n", + "\n", + "source, on any workload to always offer more engaging customer\n", + "\n", + "experiences driven by real-time data, at the lowest cost and with\n", + "\n", + "the greatest investment protection.\n", + "\n", + "\n", + "-----\n", + "\n", + "But simply standardizing on a platform is not enough. 
The organization\n", + "needs to work through the roles and responsibilities around the use of this\n", + "platform and processes for moving things from experimentation and formal\n", + "development to testing and operationalization.\n", + "\n", + "The importance of having an MLOps strategy really comes to life at this\n", + "phase. This doesn’t mean your strategy around MLOps can’t change, but this\n", + "phase is when you want to think about and define your answers to some key\n", + "questions such as the following:\n", + "\n", + "1. How do we evaluate new and existing (retrained) models as\n", + "part of their movement from development to production?\n", + "\n", + "2. How do we determine when a model should be retrained?\n", + "\n", + "3. What are the preferred mechanisms for production deployment?\n", + "\n", + "4. How do we fall back should we have a deployment problem?\n", + "\n", + "5. What are the service level expectations for the\n", + "deployment processes?\n", + "\n", + "\n", + "###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n", + "\n", + "**Numan Ali**\n", + "\n", + "Solutions Architect, Data and Analytics Center of Excellence at Pandora\n", + "\n", + "\n", + "-----\n", + "\n", + "## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n", + "\n", + "\n", + "Too often, leadership views innovation as a destination and not a process\n", + "(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\n", + "data-driven organization overnight and then it’s done. Yes, there will be an\n", + "upfront investment, but there will also be ongoing investment in order to\n", + "support sustained innovation.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
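The MLOps questions listed in the chunk above (evaluating retrained models before promotion, retraining triggers, deployment mechanisms and fallback) lend themselves to a small concrete illustration. The following is a minimal, hypothetical sketch of gating a model promotion on an offline metric with MLflow's model registry, assuming MLflow 2.x; the model name, metric name and version numbers are assumptions for illustration and are not part of the source ebook.

```python
from mlflow.tracking import MlflowClient

client = MlflowClient()
model_name = "demand_forecaster"  # hypothetical registered model name

candidate = client.get_model_version(model_name, version="2")  # newly retrained model
champion = client.get_model_version(model_name, version="1")   # currently deployed model

def wmape_of(version):
    # Read an evaluation metric logged by the training run of this model version.
    run = client.get_run(version.run_id)
    return run.data.metrics.get("wmape", float("inf"))  # "wmape" is an assumed metric name

# Promote the retrained model only if it beats the current champion offline.
if wmape_of(candidate) < wmape_of(champion):
    client.set_registered_model_alias(model_name, "champion", candidate.version)
```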
0e3cdc6f5aa88555c164d82f7b02ae3e2. How do we determine when a model should be retrained?\n", + "\n", + "3. What are the preferred mechanisms for production deployment?\n", + "\n", + "4. How do we fall back should we have a deployment problem?\n", + "\n", + "5. What are the service level expectations for the\n", + "deployment processes?\n", + "\n", + "\n", + "###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n", + "\n", + "**Numan Ali**\n", + "\n", + "Solutions Architect, Data and Analytics Center of Excellence at Pandora\n", + "\n", + "\n", + "-----\n", + "\n", + "## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n", + "\n", + "\n", + "Too often, leadership views innovation as a destination and not a process\n", + "(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\n", + "data-driven organization overnight and then it’s done. Yes, there will be an\n", + "upfront investment, but there will also be ongoing investment in order to\n", + "support sustained innovation.\n", + "\n", + "Ironically, one of the major obstacles to this change is viewing the goal as\n", + "simply delivering a project or projects. Think about it — just 12 months ago,\n", + "only a few specialists in academia and industry were talking about generative\n", + "AI and large language models (LLMs). Today, [retailers have to integrate this](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html)\n", + "[new technology](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html) or fall behind others who will find a way to create more\n", + "personalized consumer experiences with it.\n", + "\n", + "\n", + "Technology, especially when it comes to data and AI, moves far too quickly.\n", + "What retailer tech teams need to deliver at the end of the day is applications,\n", + "of course, but also the ability to react quickly to change. What sort of ongoing\n", + "investments in terms of people, process and technology do retailers need to\n", + "foster in order to ingrain an innovation mindset?\n", + "\n", + "This is an ongoing balancing act where organizations need to innovate and look\n", + "for new opportunities but also sustain that innovation in a way that is realistic\n", + "for the business. For this, let’s consider the 70-20-10 rule: the idea that\n", + "companies should allocate 70% of innovation investment to core initiatives,\n", + "20% to adjacent ones and 10% to transformational ones, or “moonshots.” While\n", + "not a hard-and-fast rule, this concept was touted by Google co-founder Larry\n", + "Page in a [Fortune magazine article](https://www.google.com/url?q=https://money.cnn.com/2008/04/29/magazines/fortune/larry_page_change_the_world.fortune/&sa=D&source=editors&ust=1690998645852122&usg=AOvVaw2AHj-fx8XkEeMKP2Ts5gDu) , and was validated by a [study conducted](https://hbr.org/2012/05/managing-your-innovation-portfolio)\n", + "[by Harvard Business Review](https://hbr.org/2012/05/managing-your-innovation-portfolio) , which found that companies following the rule\n", + "\n", + "outperformed their peers, typically realizing a P/E premium of 10% to 20%.\n", + "\n", + "\n", + "-----\n", + "\n", + "The goal of the 70-20-10 rule is to help guide the organization toward\n", + "sustained innovation and spend the bulk of time on the core business. 
This is\n", + "part of why we recommend starting first with fast (just 2- to 3-month total)\n", + "pilot projects to use AI on existing business use cases like demand forecasting\n", + "and call center optimization. By working in these areas with a focus on learning\n", + "and iterating, retailers will soon find where data silos and rigidity exist in the\n", + "system. As these foundational barriers are knocked down, it then makes it\n", + "possible to tackle more transformational use cases and start to build the\n", + "characteristics of a data-forward enterprise. In other words, start to utilize\n", + "data and data-driven insights as a primary driver for decision-making and\n", + "operations, while also prioritizing continuous data analysis and improvement.\n", + "\n", + "\n", + "**TRANSFORMATIVE**\n", + "\n", + "\n", + "**ADJACENT**\n", + "\n", + "\n", + "**CORE**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
9048b352c5ac27632575c975cbfca802outperformed their peers, typically realizing a P/E premium of 10% to 20%.\n", + "\n", + "\n", + "-----\n", + "\n", + "The goal of the 70-20-10 rule is to help guide the organization toward\n", + "sustained innovation and spend the bulk of time on the core business. This is\n", + "part of why we recommend starting first with fast (just 2- to 3-month total)\n", + "pilot projects to use AI on existing business use cases like demand forecasting\n", + "and call center optimization. By working in these areas with a focus on learning\n", + "and iterating, retailers will soon find where data silos and rigidity exist in the\n", + "system. As these foundational barriers are knocked down, it then makes it\n", + "possible to tackle more transformational use cases and start to build the\n", + "characteristics of a data-forward enterprise. In other words, start to utilize\n", + "data and data-driven insights as a primary driver for decision-making and\n", + "operations, while also prioritizing continuous data analysis and improvement.\n", + "\n", + "\n", + "**TRANSFORMATIVE**\n", + "\n", + "\n", + "**ADJACENT**\n", + "\n", + "\n", + "**CORE**\n", + "\n", + "\n", + "###### Companies that allocated about 70% of their innovation activity to core initiatives, \n", + "### 20% to adjacent ones and 10% to\n", + "###### transformational ones outperformed their peers.\n", + "\n", + "**Bansi Nagji & Geoff Tuff**\n", + "_Managing Your Innovation Portfolio_\n", + "Harvard Business Review, May 2012\n", + "\n", + "\n", + "-----\n", + "\n", + "## From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise\n", + "\n", + "\n", + "So what does it take to successfully embark on this\n", + "journey to becoming a data-forward enterprise?\n", + "First and foremost, you need to not only establish\n", + "a baseline understanding of what has occurred by\n", + "examining historical data but leverage advancements\n", + "in technologies (e.g., streaming, computer vision,\n", + "voice recognition) to make predictions of the future.\n", + "\n", + "Through the use of both historical data and\n", + "predictive techniques such as forecasting,\n", + "recommendations, prescriptive care and nextbest-action, organizations can begin to improve\n", + "decisions and, in some cases, automate certain\n", + "decision-making processes. But rather than moving\n", + "\n", + "from historical views to predictive actions in a\n", + "linear fashion, this journey involves addressing both\n", + "approaches simultaneously. 
Once you are able to\n", + "unify historical and predictive analysis, you can then\n", + "take significant steps toward becoming a dataforward enterprise.\n", + "\n", + "\n", + "##### The Data-Forward Enterprise\n", + "\n", + "Data, analytics and AI working in concert\n", + "\n", + "\n", + "**Data Purgatory**\n", + "Things are better, but data isn’t\n", + "driving the business\n", + "\n", + "\n", + "**Data Maturity**\n", + "Every aspect of the\n", + "business is supported\n", + "by insights and AI\n", + "\n", + "\n", + "**Data Siloed**\n", + "Data and teams are segregated\n", + "into different systems\n", + "\n", + "DATA MATURITY\n", + "\n", + "Being data-forward means silos cease to exist, and data, analytics and AI are informing every aspect of the business.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The 8 Steps to Building a Data-Forward Retailer\n", + "\n", + "\n", + "Before you start your data-forward journey, a few critical steps must be\n", + "considered to establish a solid foundation to build upon. Based on our\n", + "work with the largest and most successful retailers in the world, spanning\n", + "startups to global giants, we at Databricks have seen that the most successful\n", + "followed these steps to effectively gain wallet share, whereas those who\n", + "couldn’t would often leave major gaps that competitors could take advantage\n", + "of. These steps are the basics to prepare businesses for where they need\n", + "to be both now and in the near future.\n", + "\n", + "\n", + "**2** **Get grounded: Understand the technology**\n", + "\n", + "To start, business leaders need to ground themselves in technology, especially\n", + "when it comes to AI. AI can do amazing things, but it is not magical and vendors\n", + "are prone to overpromising and underdelivering. Less than getting deep into\n", + "code, the purpose is to understand the limitations and ideal use cases.\n", + "\n", + "Databricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\n", + "starting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\n", + "perspective of how different brands are using data, analytics and AI to drive\n", + "revenue or cut operational costs.\n", + "\n", + "\n", + "**1** **Set the foundation: Define goals and objectives**\n", + "\n", + "\n", + "The best way to avoid shiny object syndrome (where you start out with aSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
f20d3832b09cd451c1938dcafc1883b0**2** **Get grounded: Understand the technology**\n", + "\n", + "To start, business leaders need to ground themselves in technology, especially\n", + "when it comes to AI. AI can do amazing things, but it is not magical and vendors\n", + "are prone to overpromising and underdelivering. Less than getting deep into\n", + "code, the purpose is to understand the limitations and ideal use cases.\n", + "\n", + "Databricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\n", + "starting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\n", + "perspective of how different brands are using data, analytics and AI to drive\n", + "revenue or cut operational costs.\n", + "\n", + "\n", + "**1** **Set the foundation: Define goals and objectives**\n", + "\n", + "\n", + "The best way to avoid shiny object syndrome (where you start out with a\n", + "\n", + "technology and then try to figure out what to do with it) is to first identify the\n", + "problems you want to solve. From there, you can set goals around innovation\n", + "to align incentives, and, most importantly, ensure you are driving specific\n", + "business outcomes such as improving customer engagement, optimizing\n", + "inventory management or increasing sales.\n", + "\n", + "\n", + "**3** **Understand the skills and processes in your business**\n", + "\n", + "As we will get into in step 4, starting with smaller pilot projects enables you\n", + "to not just deliver a quick win and validate the use of AI in the enterprise, but\n", + "also understand the in-house capabilities in terms of people, process and\n", + "technology to deliver technical projects. And if required, be willing and ready\n", + "to hire people with the right skill sets that can help you make the most of your\n", + "data. For example, building a core team of data analysts can help extract deep\n", + "insights that lead to better decision-making and identify opportunities for\n", + "growth. It is critical at this step to define the roles you need, determine how\n", + "you will source for those roles (via external hiring or internal transfer), and\n", + "ensure those roles have opportunities for career progression.\n", + "\n", + "\n", + "-----\n", + "\n", + "For inspiration and a head start, check out our [Solution Accelerators for Retail](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods)\n", + "[& Consumer Goods](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods) . These free resources were created to help our customers\n", + "save hours of discovery, design, development and testing. Our purpose-built\n", + "guides — fully functional notebooks and best practices — speed up results\n", + "across your most common and high-impact use cases and enable you to go\n", + "from idea to proof of concept (PoC) in as little as two weeks. We have over\n", + "20 accelerators built specifically for critical retail and consumer goods use\n", + "cases, from Demand Forecasting and On-Shelf Availability to Recommendation\n", + "Engines and Customer Lifetime Value. 
We also have a set of Solution\n", + "Accelerators specifically for [LLMs in Retail & Consumer Goods.](https://www.databricks.com/solutions/accelerators/large-language-models-retail)\n", + "\n", + "**5** **Implement data management and governance early**\n", + "\n", + "The first step to successfully implementing AI/ML in your business broadly\n", + "is to ensure you have accurate, reliable and current data to train your\n", + "models against. This data can (and should) come from a variety of sources,\n", + "so it’s key to unify all data types and sources (sales transactions, customer\n", + "feedback, social media) in a centralized location that is easily accessible,\n", + "while not losing sight of data security to maintain customer trust. Setting\n", + "up data governance parameters to control who has which kinds of access\n", + "to what data, and being able to audit the history of this access, will actually\n", + "accelerate innovation while ensuring data security and compliance.\n", + "\n", + "\n", + "**Delivering exactly what customers want,**\n", + "**every time, and on time**\n", + "\n", + "Data is at the heart of Gousto’s mission to change the\n", + "way people eat through the delivery of boxes of fresh\n", + "ingredients and easy-to-follow recipes. However, even\n", + "as their business exploded at the start of the pandemic,\n", + "their systems couldn’t ingest data fast enough, couldn’t\n", + "talk to each other and wouldn’t scale — forcing them to\n", + "temporarily stop accepting new customers. Now Gousto is\n", + "set up to achieve exciting ambitions for menu expansion,\n", + "sophisticated personalization and next-day delivery. Learn\n", + "how they did it.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
08e33a0a940c3fc8d481f2dca3f3daa6**Delivering exactly what customers want,**\n", + "**every time, and on time**\n", + "\n", + "Data is at the heart of Gousto’s mission to change the\n", + "way people eat through the delivery of boxes of fresh\n", + "ingredients and easy-to-follow recipes. However, even\n", + "as their business exploded at the start of the pandemic,\n", + "their systems couldn’t ingest data fast enough, couldn’t\n", + "talk to each other and wouldn’t scale — forcing them to\n", + "temporarily stop accepting new customers. Now Gousto is\n", + "set up to achieve exciting ambitions for menu expansion,\n", + "sophisticated personalization and next-day delivery. Learn\n", + "how they did it.\n", + "\n", + "**[READ THE FULL GOUSTO STORY](https://www.databricks.com/customers/gousto)**\n", + "\n", + "**4** **Start small: Pilot a project**\n", + "\n", + "There is no substitute for rolling your sleeves up and running a pilot project to\n", + "evaluate the feasibility and potential impact of a project before implementing\n", + "it on a larger scale. When selecting a pilot project, we recommend starting with\n", + "a project that will deliver clear business value, such as incremental revenue\n", + "or clear cost savings, yet only takes 2-3 months to complete. The more time\n", + "there is between project inception and seeing results, the more likely it will lose\n", + "momentum internally.\n", + "\n", + "\n", + "-----\n", + "\n", + "**6** **Incorporate AI across the business (starting with daily tasks)**\n", + "\n", + "Given the large upfront investment in data scientists and engineers to build\n", + "an AI program, the ROI will come from using it at scale. Constantly look to\n", + "uncover patterns and repeatable processes that can be optimized or fully\n", + "automated with AI.\n", + "\n", + "**Building a global fashion icon with a**\n", + "**customer-first approach**\n", + "\n", + "British luxury brand Burberry was seeking an efficient way to\n", + "annotate its thousands of highly specific marketing assets\n", + "for better targeting. Working with Labelbox within Databricks\n", + "Lakehouse, they are now able to complete image annotation\n", + "projects in hours instead of months. And marketing team\n", + "members now have access to powerful content insights\n", + "without needing to ask data scientists for help.\n", + "\n", + "**[READ THE FULL BURBERRY STORY](https://www.databricks.com/customers/burberry)**\n", + "\n", + "**Customizing interactions that convert clicks**\n", + "**to revenue with Databricks Lakehouse**\n", + "\n", + "Global jewelry manufacturer and retailer Pandora needed a\n", + "unified view of all their data where they could easily segment,\n", + "categorize and analyze to deliver custom messaging to\n", + "consumers. With Databricks Lakehouse, they now have the\n", + "insights they need to deliver highly targeted messaging —\n", + "increasing consumer engagement from the initial opening of\n", + "a marketing email to maximizing shopping bag conversions to\n", + "driving revenue on the website.\n", + "\n", + "**[READ THE FULL PANDORA STORY](https://www.databricks.com/customers/pandora)**\n", + "\n", + "\n", + "**Building an operationally efficient**\n", + "**omnichannel business**\n", + "\n", + "The Hershey Company analyzes the data they need to\n", + "stay in front of changing human behavior and delight their\n", + "customers. 
With Databricks Lakehouse, they can analyze\n", + "data feeds from their largest retail customer — uncovering\n", + "insights that will help extend their industry leadership.\n", + "\n", + "**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n", + "\n", + "\n", + "**Ushering in a new era**\n", + "**of data-driven retailing**\n", + "\n", + "Outdoor apparel brand Columbia Sportswear has enabled\n", + "data and analytics self-service throughout the organization in\n", + "a way that ensures everyone is working from a single source\n", + "of truth. Whichever data team needs access to the data,\n", + "Databricks Lakehouse gives them the confidence that the\n", + "data is reliable and consistent.\n", + "\n", + "**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**7** **Foster a culture of data-driven decision-making**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
fb312ae35c808bf547ea524165081a95**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n", + "\n", + "\n", + "**Ushering in a new era**\n", + "**of data-driven retailing**\n", + "\n", + "Outdoor apparel brand Columbia Sportswear has enabled\n", + "data and analytics self-service throughout the organization in\n", + "a way that ensures everyone is working from a single source\n", + "of truth. Whichever data team needs access to the data,\n", + "Databricks Lakehouse gives them the confidence that the\n", + "data is reliable and consistent.\n", + "\n", + "**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**7** **Foster a culture of data-driven decision-making**\n", + "\n", + "What does it mean to have a culture of data-driven decision-making? In\n", + "practice, it means empowering all employees to use data to inform their\n", + "decisions. Only some strategic decisions will be based on complete and\n", + "accurate information. It’s unwise to assume otherwise. The right approach\n", + "is to leverage as much data as possible, from past tests or current efforts,\n", + "to mitigate risk. Leaders need to not only ask for data but also ensure\n", + "that their employees will be able to find the data they need.\n", + "\n", + "**Unlocking critical trends and insights**\n", + "**needed to serve our 180 million customers**\n", + "\n", + "Reckitt, the maker of Lysol as well as hundreds of other\n", + "household brands, was looking to deliver best-in-class\n", + "customer experiences to their over 180 million customers\n", + "spanning the globe. With Databricks Lakehouse, Reckitt\n", + "has established a data-first culture by surfacing real-time,\n", + "highly accurate, deep customer data insights that have\n", + "led to a better understanding of international market\n", + "trends and demand across the multiple product lines\n", + "they support.\n", + "\n", + "**[READ THE FULL RECKITT STORY](https://www.databricks.com/customers/reckitt)**\n", + "\n", + "\n", + "**Customer 360 to enable faster speed**\n", + "**to market, better results**\n", + "\n", + "The Middle East’s Al-Futtaim serves as a local distributor\n", + "for global brands such as Toyota, IKEA and Ace Hardware.\n", + "With Databricks Lakehouse serving as a unified platform to\n", + "aggregate and analyze various data sources on all customers,\n", + "they have created a “golden customer record” that improves\n", + "all decision-making, from forecasting demand to powering\n", + "their global loyalty program.\n", + "\n", + "**[READ THE FULL AL-FUTTAIM STORY](https://www.google.com/url?q=https://www.databricks.com/customers/al-futtaim&sa=D&source=editors&ust=1690998645853527&usg=AOvVaw3cs-6mM2ANTKDCzTdTvEYH)**\n", + "\n", + "**8** **Continuously evaluate and improve**\n", + "\n", + "Recognize that establishing a data-driven culture is an ongoing journey and\n", + "never a set destination. Constantly evaluate your data collection, analysis and\n", + "decision-making process to identify areas for improvement. Even small and\n", + "constant incremental improvements will deliver large gains in absolute terms\n", + "when applied at scale. 
You can always personalize more, forecast better, or\n", + "better manage your supply chain as you bring in better data sources and refine\n", + "your models.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Transform Retail Data Into Actionable Insights\n", + "\n", + "\n", + "Becoming data forward is not a crazy idea. Too often, leaders or organizations\n", + "allow themselves to be intimidated by focusing on large-scale transformations.\n", + "But it’s the small operational changes that can make your business more efficient\n", + "as well as shift the larger culture forward. Once you’ve set this foundation, it then\n", + "allows you to move toward bigger things. These steps may fail, but it’s actually\n", + "positive to have these setbacks to learn from to try again. The bigger risk is to\n", + "not try and thus fall behind competitors who are embracing the internal changes\n", + "needed to take advantage of AI and machine learning.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
2cd345e72229d795c8e73b85ccdb8516**8** **Continuously evaluate and improve**\n", + "\n", + "Recognize that establishing a data-driven culture is an ongoing journey and\n", + "never a set destination. Constantly evaluate your data collection, analysis and\n", + "decision-making process to identify areas for improvement. Even small and\n", + "constant incremental improvements will deliver large gains in absolute terms\n", + "when applied at scale. You can always personalize more, forecast better, or\n", + "better manage your supply chain as you bring in better data sources and refine\n", + "your models.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Transform Retail Data Into Actionable Insights\n", + "\n", + "\n", + "Becoming data forward is not a crazy idea. Too often, leaders or organizations\n", + "allow themselves to be intimidated by focusing on large-scale transformations.\n", + "But it’s the small operational changes that can make your business more efficient\n", + "as well as shift the larger culture forward. Once you’ve set this foundation, it then\n", + "allows you to move toward bigger things. These steps may fail, but it’s actually\n", + "positive to have these setbacks to learn from to try again. The bigger risk is to\n", + "not try and thus fall behind competitors who are embracing the internal changes\n", + "needed to take advantage of AI and machine learning.\n", + "\n", + "Core to delivering on these steps to become a data-forward retailer is a solid\n", + "data foundation that can unify your data and AI workloads with sharing and\n", + "governance built in, so internal and external teams can get access to the\n", + "data they need when they need it. With the [Databricks Lakehouse for Retail](https://www.databricks.com/solutions/industries/retail-industry-solutions) ,\n", + "companies gain valuable insights into customer behavior, optimize supply chain\n", + "\n", + "operations and make informed business decisions in real time.\n", + "\n", + "\n", + "EXPLORE DATABRICKS LAKEHOUSE FOR RETAIL\n", + "\n", + "Access key resources to understanding how a lakehouse\n", + "for retail can set you on the path toward becoming a\n", + "data-forward organization.\n", + "\n", + "**[LEARN MORE](https://www.databricks.com/explore/retail-resources)**\n", + "\n", + "\n", + "#### Visit our website to learn more about Databricks Lakehouse for Retail.\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000\n", + "\n", + "organizations worldwide — including Comcast, Condé Nast, and\n", + "\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. 
To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks#account)**\n", + "\n", + "Contact us for a personalized demo\n", + "**databricks.com/contact**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf2024-09-19T16:57:19Z
5503b8c1c4a023a953c42915fd5ed36a### Technical Migration Guide\n", + "\n", + "# Strategies to Evolve Your Data Warehouse to the Databricks Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents Lakehouse Architecture 3\n", + "\n", + "The Databricks Lakehouse Platform 4\n", + "\n", + "Business Value 5\n", + "\n", + "Single source of truth 5\n", + "\n", + "Data team 6\n", + "\n", + "Future-proof 6\n", + "\n", + "Migration to Lakehouse 7\n", + "\n", + "Overview 7\n", + "\n", + "Migration strategy 8\n", + "\n", + "Migration planning 9\n", + "\n", + "ELT approach 12\n", + "\n", + "Agile modernization 15\n", + "\n", + "Security and data governance 17\n", + "\n", + "Team involvement 19\n", + "\n", + "Conclusion 19\n", + "\n", + "\n", + "-----\n", + "\n", + "## Lakehouse Architecture\n", + "\n", + "\n", + "Data warehouses were designed to provide a central data repository\n", + "\n", + "with analytic compute capabilities to help business leaders\n", + "\n", + "get analytical insights, support decision-making and business\n", + "\n", + "intelligence (BI). Legacy on-premises data warehouse architectures\n", + "\n", + "are difficult to scale and make it difficult for data teams to keep up\n", + "\n", + "with the exponential growth of data. Oftentimes data teams publish\n", + "\n", + "and use a subset of well-defined data for development and testing.\n", + "\n", + "This slows down both innovation and time to insight.\n", + "\n", + "Cloud data warehouses (CDW) were an attempt to tackle the\n", + "\n", + "on-premises data warehouse challenges. CDWs removed the\n", + "\n", + "administrative burden of tasks such as setup, upgrades and\n", + "\n", + "backups. CDWs also improved scalability and introduced cloud’s\n", + "\n", + "pay-as-you-go model to reduce cost. CDWs leverage a proprietary\n", + "\n", + "data format to achieve cloud-scale and performance; however, this\n", + "\n", + "also leads to customers locked into these formats with difficult\n", + "\n", + "\n", + "But enterprise data teams don’t need a better data warehouse.\n", + "\n", + "They need an innovative, simple solution that provides reliable\n", + "\n", + "performance, elastic scale and allows self-service to unblock\n", + "\n", + "analytics to access all data at a reasonable cost. The answer is\n", + "\n", + "the lakehouse.\n", + "\n", + "The lakehouse pattern represents a paradigm shift from traditional\n", + "\n", + "on-premises data warehouse systems that are expensive and\n", + "\n", + "complex to manage. It uses an open data management architecture\n", + "\n", + "that combines the flexibility, cost-efficiency and scale of data\n", + "\n", + "lakes with the data management and ACID semantics of data\n", + "\n", + "warehouses. A lakehouse pattern enables data transformation,\n", + "\n", + "cleansing and validation to support both business intelligence and\n", + "\n", + "machine learning (ML) users on all data. Lakehouse is cloud-centric\n", + "\n", + "and unifies a complete up-to-date data set for teams, allowing\n", + "\n", + "collaboration across an organization.\n", + "\n", + "\n", + "paths to support use cases outside the data warehouse itself\n", + "\n", + "(i.e., machine learning). 
Customers often find themselves with a\n", + "\n", + "bifurcated architecture, which ultimately leads to a more costly and\n", + "\n", + "complex data platform over time.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Databricks Lakehouse Platform\n", + "\n", + "The Databricks Lakehouse Platform is **simple** ; it unifies your data, governance, analytics\n", + "\n", + "and AI on one platform. It’s **open** — the open source format Delta Lake unifies your data\n", + "\n", + "ecosystem with open standards and data formats. Databricks is **multicloud** — delivering\n", + "\n", + "one **consistent experience across all clouds** so you don’t need to reinvent the wheel for\n", + "\n", + "every cloud platform that you’re using to support your data and AI efforts.\n", + "\n", + "Databricks SQL stores and processes data using Delta Lake to simplify and enhance\n", + "\n", + "data warehousing capabilities. Analysts can use their favorite language, SQL, popular\n", + "\n", + "transformation tools such as dbt, and preferred BI tools like Power BI and Tableau to\n", + "\n", + "analyze data. The built-in query editor reduces contextual switching and improves\n", + "\n", + "productivity. Administrators enjoy simplified workload management via serverless\n", + "\n", + "compute and auto-scaling to meet high-concurrency workload needs. All this at a\n", + "\n", + "fraction of the cost of traditional data warehouses.\n", + "\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "\n", + "Data\n", + "Warehousing\n", + "\n", + "\n", + "Data\n", + "Engineering\n", + "\n", + "\n", + "Data\n", + "Streaming\n", + "\n", + "\n", + "Data Science\n", + "and MLSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
5026816418aa0b55a9a9cb44ff1d0df7ecosystem with open standards and data formats. Databricks is **multicloud** — delivering\n", + "\n", + "one **consistent experience across all clouds** so you don’t need to reinvent the wheel for\n", + "\n", + "every cloud platform that you’re using to support your data and AI efforts.\n", + "\n", + "Databricks SQL stores and processes data using Delta Lake to simplify and enhance\n", + "\n", + "data warehousing capabilities. Analysts can use their favorite language, SQL, popular\n", + "\n", + "transformation tools such as dbt, and preferred BI tools like Power BI and Tableau to\n", + "\n", + "analyze data. The built-in query editor reduces contextual switching and improves\n", + "\n", + "productivity. Administrators enjoy simplified workload management via serverless\n", + "\n", + "compute and auto-scaling to meet high-concurrency workload needs. All this at a\n", + "\n", + "fraction of the cost of traditional data warehouses.\n", + "\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "\n", + "Data\n", + "Warehousing\n", + "\n", + "\n", + "Data\n", + "Engineering\n", + "\n", + "\n", + "Data\n", + "Streaming\n", + "\n", + "\n", + "Data Science\n", + "and ML\n", + "\n", + "\n", + "Unity Catalog\n", + "Fine-grained governance for data and AI\n", + "\n", + "Delta Lake\n", + "Data reliability and performance\n", + "\n", + "Cloud Data Lake\n", + "All structured and unstructured data\n", + "\n", + "Simple Open Multicloud\n", + "\n", + "\n", + "-----\n", + "\n", + "## Business Value\n", + "\n", + "#### Single source of truth\n", + "\n", + "Databricks Delta Lake leverages cloud-based blob storage to provide an infinitely\n", + "\n", + "scalable storage layer where you can store all your data, including raw and historical data,\n", + "\n", + "alongside structured data tables in the data warehouse. The lakehouse pattern avoids\n", + "\n", + "data silos and shares the same elastic scale and governance across all use cases: BI, data\n", + "\n", + "engineering, streaming and AI/ML. This means that data engineering teams don’t have to\n", + "\n", + "move data to a proprietary data warehouse for business analysts or create a separate\n", + "\n", + "data store to support data science.\n", + "\n", + "Instead, data teams can access the open format Delta tables directly and combine data\n", + "\n", + "sets in the lakehouse, as needed. Data scientists can also work collaboratively on common\n", + "\n", + "data with access to versioned history to facilitate repeatable experiments. A single source\n", + "\n", + "of truth facilitates moving from descriptive to predictive analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data team\n", + "\n", + "\n", + "With central data governance and fine-grained access control\n", + "\n", + "capabilities to secure the lakehouse, you can enable self-service\n", + "\n", + "SQL analytics for everyone on the Databricks Lakehouse Platform.\n", + "\n", + "This allows each team to be more agile and innovate faster.\n", + "\n", + "**Data Analysts** — Using the Databricks SQL editor\n", + "\n", + "or their tools of choice (DBT, Power BI, Tableau), SQL\n", + "\n", + "analysts can leverage familiar toolsets.\n", + "\n", + "**Data Engineers** — Utilizing Delta Lake as a unified\n", + "\n", + "storage layer, data engineering teams can eliminate\n", + "\n", + "duplicate data and ETL jobs that move data across\n", + "\n", + "various systems. 
Databricks supports both batch and\n", + "\n", + "streaming workloads to reduce bottlenecks and serve\n", + "\n", + "the most up-to-date data to downstream users and\n", + "\n", + "applications.\n", + "\n", + "**Administrators** — The pay-as-you-go, decentralized\n", + "\n", + "compute resource allows each team to run their\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides a reliable ETL and data\n", + "\n", + "management framework to simplify ETL pipelines. Data teams can\n", + "\n", + "build end-to-end data transformations in a single pipeline instead of\n", + "\n", + "many small ETL tasks. Databricks supports data quality enforcement\n", + "\n", + "to ensure reliability with auto-scalable infrastructure. Your teams\n", + "\n", + "can onboard new data sources quickly to power new use cases with\n", + "\n", + "fresh data. This not only allows your team to efficiently and reliably\n", + "\n", + "deliver high-quality data in a timely manner, it also reduces ETL\n", + "\n", + "workload cost significantly.\n", + "\n", + "#### Future-proof\n", + "\n", + "Unlike CDWs that lock customers in, Databricks offers an open\n", + "\n", + "platform with open standards, open protocols and open data\n", + "\n", + "formats. It supports a full range of popular languages (SQL, Python,SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
56f2df145130a08a1b2bbea4b2265a31applications.\n", + "\n", + "**Administrators** — The pay-as-you-go, decentralized\n", + "\n", + "compute resource allows each team to run their\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides a reliable ETL and data\n", + "\n", + "management framework to simplify ETL pipelines. Data teams can\n", + "\n", + "build end-to-end data transformations in a single pipeline instead of\n", + "\n", + "many small ETL tasks. Databricks supports data quality enforcement\n", + "\n", + "to ensure reliability with auto-scalable infrastructure. Your teams\n", + "\n", + "can onboard new data sources quickly to power new use cases with\n", + "\n", + "fresh data. This not only allows your team to efficiently and reliably\n", + "\n", + "deliver high-quality data in a timely manner, it also reduces ETL\n", + "\n", + "workload cost significantly.\n", + "\n", + "#### Future-proof\n", + "\n", + "Unlike CDWs that lock customers in, Databricks offers an open\n", + "\n", + "platform with open standards, open protocols and open data\n", + "\n", + "formats. It supports a full range of popular languages (SQL, Python,\n", + "\n", + "R, Scala) and popular BI tools. You can leverage the performant\n", + "\n", + "and low-cost distributed compute layer for data processing — or\n", + "\n", + "use a variety of tools and engines to efficiently access the data via\n", + "\n", + "Databricks APIs. Databricks also allows data consumption with a rich\n", + "\n", + "partner ecosystem. Teams can handle all existing BI and AI use cases\n", + "\n", + "with the flexibility to support future use cases as they emerge.\n", + "\n", + "\n", + "workload in isolated environments without worrying\n", + "\n", + "about contention. Serverless SQL endpoint frees your\n", + "\n", + "team from infrastructure management challenges.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Migration to Lakehouse\n", + "\n", + "#### Overview\n", + "\n", + "A lakehouse is the ideal data architecture for data-driven organizations. It combines the\n", + "\n", + "best qualities of data warehouses and data lakes to provide a single solution for all major\n", + "\n", + "data workloads and supports use cases from streaming analytics to BI, data science and\n", + "\n", + "AI. The Databricks Lakehouse Platform leverages low-cost, durable cloud storage and\n", + "\n", + "only consumes (charges for) compute resources when workloads are running. This pay-\n", + "\n", + "\n", + "**C U S T O M E R S T O R Y**\n", + "##### Building the Lakehouse\n", + " at Atlassian\n", + "\n", + "[Watch now](https://www.youtube.com/watch?v=Xo1U617T-mU)\n", + "\n", + "\n", + "as-you-go model means compute resources are automatically shut down if no processing\n", + "\n", + "is needed. Data teams can use small clusters that can power individual workloads\n", + "\n", + "they plan to migrate. They can make the choice to leverage serverless SQL endpoints\n", + "\n", + "and completely free data teams from infrastructure capacity planning and cluster\n", + "\n", + "maintenance. The auto-scaling, elastic nature of Databricks clusters leads to significant\n", + "\n", + "savings on infrastructure cost and maintenance. Organizations typically achieve 50% TCO\n", + "\n", + "savings compared to other cloud data warehouses.\n", + "\n", + "Data warehouse migration is never an easy task. Databricks aims to mitigate the things\n", + "\n", + "that can go wrong in these demanding migration projects. 
The Databricks Lakehouse\n", + "\n", + "Platform provides many out-of-the-box features to mitigate migration risks.\n", + "\n", + "**C U S T O M E R S T O R Y**\n", + "##### Driving Freight Transportation Into the Future\n", + "\n", + "[Read more](https://databricks.com/customers/jbhunt)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Migration strategy\n", + "\n", + "\n", + "Migration is a huge effort and very expensive. Yet, almost every\n", + "\n", + "enterprise has to migrate to new platforms every 3–5 years because\n", + "\n", + "the old platform cannot support new use cases, catch up with\n", + "\n", + "data growth or meet scaling needs. To get better ROI on migration,\n", + "\n", + "implement a migration strategy that can reduce future re-platform\n", + "\n", + "needs and extend to your future data and AI strategy.\n", + "\n", + "Use the opportunity of a data migration to standardize your data\n", + "\n", + "in open Delta format to allow existing and future tools to access\n", + "\n", + "it directly without moving or converting it. Merge your siloed\n", + "\n", + "data warehouses into the unified storage layer in the DatabricksSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
2748250245f5cbd7aa9d6d16ef303f97**C U S T O M E R S T O R Y**\n", + "##### Driving Freight Transportation Into the Future\n", + "\n", + "[Read more](https://databricks.com/customers/jbhunt)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Migration strategy\n", + "\n", + "\n", + "Migration is a huge effort and very expensive. Yet, almost every\n", + "\n", + "enterprise has to migrate to new platforms every 3–5 years because\n", + "\n", + "the old platform cannot support new use cases, catch up with\n", + "\n", + "data growth or meet scaling needs. To get better ROI on migration,\n", + "\n", + "implement a migration strategy that can reduce future re-platform\n", + "\n", + "needs and extend to your future data and AI strategy.\n", + "\n", + "Use the opportunity of a data migration to standardize your data\n", + "\n", + "in open Delta format to allow existing and future tools to access\n", + "\n", + "it directly without moving or converting it. Merge your siloed\n", + "\n", + "data warehouses into the unified storage layer in the Databricks\n", + "\n", + "Lakehouse Platform — without worrying about storage capacity.\n", + "\n", + "The unified storage layer allows your team to deploy a unified data\n", + "\n", + "governance on top to secure all data access consistently. Simplify\n", + "\n", + "your data governance story with Databricks Unity Catalog.\n", + "\n", + "\n", + "Move toward a single, consistent approach to data pipelining\n", + "\n", + "and refinement. Merge batch and streaming into a single end-\n", + "\n", + "to-end pipeline to get fresher data and provide more real-time\n", + "\n", + "decisions. Take a metadata-driven approach to align the dataflow\n", + "\n", + "with business processes and have data validation and quality\n", + "\n", + "check built-in. Through a series of curation and refinement steps,\n", + "\n", + "the output results in highly consumable and trusted data for\n", + "\n", + "downstream use cases.\n", + "\n", + "The lakehouse architecture makes it possible for the organization\n", + "\n", + "to create “data assets” by taking a stepwise approach to improving\n", + "\n", + "data and serving all essential use cases. Encourage your BI/analyst\n", + "\n", + "team to leverage Databricks serverless endpoints for self-serve\n", + "\n", + "and agility. Each team can evaluate their top priority workloads and\n", + "\n", + "migrate them in parallel to speed up migration.\n", + "\n", + "Take advantage of Databricks’ rich partner ecosystem. Your favorite\n", + "\n", + "partners are likely already integrated via Partner Connect and\n", + "\n", + "can be set up with a few clicks. There are also many ISV and SI\n", + "\n", + "consulting partners who can help your migration journey.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Migration planning\n", + "\n", + "Migrating a data warehouse to the cloud can be time consuming and challenging for your\n", + "\n", + "data teams. It’s important to agree on the data architecture, migration strategy and process/\n", + "\n", + "frameworks to be used before undertaking a data migration. Databricks provides Migration\n", + "\n", + "Assessment and Architecture Review sessions to develop a joint migration roadmap. 
This\n", + "\n", + "process is designed to help organizations to successfully migrate to a lakehouse architecture.\n", + "\n", + "Based on information collected and business objectives, the Databricks team will work with\n", + "\n", + "customers to propose a target architecture and provide a tailored migration roadmap.\n", + "\n", + "These assessments help get a full picture of current data systems and the future vision. They\n", + "\n", + "clarify what you are migrating and do proper use case discovery. This includes identifying\n", + "\n", + "workloads and data source dependency, for example:\n", + "\n", + "Sample migration assessment checklist:\n", + "\n", + "Identify upstream data sources and workload dependencies\n", + "\n", + "Identify active/inactive data sets and database objects\n", + "\n", + "Identify downstream application dependencies and data freshness requirements\n", + "\n", + "Define a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n", + "\n", + "Define security requirements and data governance\n", + "\n", + "Clarify access management need, document needed permissions per user/group\n", + "\n", + "Outline current tooling (ingestion, ETL and BI) and what’s needed\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s important to identify key stakeholders and keep them engaged during the migration to\n", + "\n", + "make sure they are aligned with the overall objectives. The workload assessment result will\n", + "\n", + "be reviewed with key stakeholders. Through the review process, data teams can get a better\n", + "\n", + "understanding of which workloads can most benefit from modernization.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
b8f09be63b094f14e5cb405d10b93048clarify what you are migrating and do proper use case discovery. This includes identifying\n", + "\n", + "workloads and data source dependency, for example:\n", + "\n", + "Sample migration assessment checklist:\n", + "\n", + "Identify upstream data sources and workload dependencies\n", + "\n", + "Identify active/inactive data sets and database objects\n", + "\n", + "Identify downstream application dependencies and data freshness requirements\n", + "\n", + "Define a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n", + "\n", + "Define security requirements and data governance\n", + "\n", + "Clarify access management need, document needed permissions per user/group\n", + "\n", + "Outline current tooling (ingestion, ETL and BI) and what’s needed\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s important to identify key stakeholders and keep them engaged during the migration to\n", + "\n", + "make sure they are aligned with the overall objectives. The workload assessment result will\n", + "\n", + "be reviewed with key stakeholders. Through the review process, data teams can get a better\n", + "\n", + "understanding of which workloads can most benefit from modernization.\n", + "\n", + "Databricks often works with partners to provide a workload assessment and help customers\n", + "\n", + "understand their migration complexity and properly plan a budget. Databricks also partners\n", + "\n", + "with third-party vendors that provide migration tools to securely automate major migration\n", + "\n", + "tasks. Databricks Partner Connect makes it easy to connect with this ecosystem of tools to\n", + "\n", + "help with the migration, including:\n", + "\n", + "\u0007Code conversion tooling that can automatically translate 70%–95% of the SQL code in\n", + "\n", + "your current system to Databricks optimized code with Delta and other best practices\n", + "\n", + "\u0007Converters that automate multiple GUI-based ETL/ELT platform conversion to reduce\n", + "\n", + "migration time and cost\n", + "\n", + "\u0007Data migration tools that can migrate data from on-premises storage to cloud storage\n", + "\n", + "2x–3x faster than what was previously possible\n", + "\n", + "\n", + "-----\n", + "\n", + "#### We can use Automated conversion for most workload types\n", + "\n", + "###### EDWs\n", + "\n", + "\n", + "Open Cloud Storage\n", + "ADLS, S3, GCP Storage\n", + "\n", + "Databricks Tables, Views\n", + "\n", + "Spark SQL Databricks Notebooks\n", + "\n", + "Spark SQL + little bit of Python or Scala\n", + "\n", + "Runs on Databricks JDBC/ODBC\n", + "\n", + "Databricks permissions- Table ACLs\n", + "\n", + "Credential Pass-throughs to Files\n", + "\n", + "Big Data ETL tools, Databricks Notebooks\n", + "\n", + "Airflow DAGs, ADF, Databricks Job\n", + "and any other Enterprise Schedulers\n", + "\n", + "\n", + "Data Migration\n", + "\n", + "Metastore Migration\n", + "\n", + "SQL Migration\n", + "\n", + "Security\n", + "\n", + "ETL Tools\n", + "\n", + "\n", + "DB locked formats on Disks\n", + "\n", + "Databases, Tables, Views\n", + "\n", + "Ad-hoc SQL queries\n", + "\n", + "T-SQL, PL/SQL, BTEQ\n", + "\n", + "Reports from PBI, Tableau etc.\n", + "\n", + "GRANTs, Roles\n", + "\n", + "External tables- File permissions\n", + "\n", + "Data Stage, PowerCenter, Ab Initio etc.\n", + "\n", + "\n", + "Orchestration ETL Schedulers\n", + "\n", + "\n", + "-----\n", + "\n", + "#### ELT approach\n", + "\n", + "The separation of storage and compute makes ELT on lakehouse a 
better choice than traditional\n", + "\n", + "ETL. You can ingest all raw data to Delta Lake, leverage low-cost storage and create a Medallion\n", + "\n", + "data implementation from raw/Bronze to curated/Gold depending on what’s needed to support\n", + "\n", + "use cases. During ingestion, basic data validation can occur, but establishing a Bronze data layer is\n", + "\n", + "the foundation of a single-pane-of-glass for the business. Teams can leverage compute resources\n", + "\n", + "as needed without a fixed compute infrastructure. Establishing a Silver layer further enriches data\n", + "\n", + "by exploring and applying transformations. ELT allows data teams to break pipelines into smaller\n", + "\n", + "“migrations,” starting with a simple workload, then improving the pipeline design iteratively.\n", + "\n", + "**IMPROVE DATA QUALITY**\n", + "\n", + "Data Bronze Tables Silver Tables Gold Tables\n", + "\n", + "Streaming AnalyticsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
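To make the Bronze to Silver to Gold flow described above concrete, here is a minimal PySpark sketch. It assumes the ambient `spark` session that Databricks notebooks provide; the catalog, schema, path and column names are hypothetical, so this is an illustration of the Medallion pattern, not code from the guide.

```python
from pyspark.sql import functions as F

# `spark` is the session Databricks notebooks provide; all names below are hypothetical.

# Bronze: land raw files as-is so the business has a single pane of glass over raw data.
raw = spark.read.json("/Volumes/main/landing/sales_orders")
raw.write.mode("append").saveAsTable("main.bronze.sales_orders")

# Silver: basic validation and light enrichment on top of Bronze.
silver = (
    spark.read.table("main.bronze.sales_orders")
    .where(F.col("order_id").isNotNull())
    .withColumn("order_date", F.to_date("order_ts"))
)
silver.write.mode("overwrite").saveAsTable("main.silver.sales_orders")

# Gold: business-level aggregates ready for reporting and BI.
gold = silver.groupBy("order_date").agg(F.sum("amount").alias("daily_revenue"))
gold.write.mode("overwrite").saveAsTable("main.gold.daily_revenue")
```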
2fa9b783df7db3ee2e04a3b7d5ed2ad5We highly recommend leveraging [Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables), a new cloud-native managed service in the Databricks Lakehouse Platform that provides a reliable ETL framework to modernize your data pipeline at scale. Instead of migrating multiple ETL tasks one by one in a traditional data warehouse, you can focus on sources and expected outputs, and create your entire dataflow graph declaratively. Delta Live Tables offers:\n", + "\n", + "\u0007A metadata-driven approach — you just specify what data should be in each table or view rather than the details of how processing should be done\n", + "\n", + "\u0007An end-to-end data pipeline with data quality and freshness checks, end-to-end monitoring/visibility, error recovery, and lineage, which reduces the strain on data engineering teams and improves time-to-value in building data pipelines\n", + "\n", + "\u0007Automatic management of all the dependencies within the pipeline. This ensures all tables are populated correctly, whether continuously or on a regular schedule. For example, updating one table will automatically trigger all downstream table updates to keep data up-to-date.\n", + "\n", + "\u0007All pipelines are built code-first, which makes editing, debugging and testing of data pipelines simpler and easier. DLT can also automatically recover from common error conditions, reducing operational overhead.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Agile modernization\n", + "\n", + "\n", + "Agile development allows teams to move quickly knowing migrated pipelines can be revisited at a later cycle and evolving data models are supported within the architecture. Allowing business impact to drive priorities via an agile approach helps mitigate migration risks.\n", + "\n", + "Prioritizing and selecting use cases where modernization brings business benefits quickly is a good starting point. 
Focus on the 20%\n", + "\n", + "of workloads that consume 80% of budget. By breaking workflows\n", + "\n", + "down into components and managing data stories, teams can adjust\n", + "\n", + "priorities over time. Changes can be made in collaboration with the\n", + "\n", + "user community to fit the business definition of value.\n", + "\n", + "Migrating to a lakehouse architecture leverages separation of storage\n", + "\n", + "and compute to remove resource contention between ETL and BI\n", + "\n", + "workloads. As a result, the migration process can be more agile,\n", + "\n", + "allowing you to evolve your design iteratively without big-bang effort:\n", + "\n", + "\u0007Reduce time during the initial phase on full capacity plan and\n", + "\n", + "\n", + "All of this allows you to take a more iterative and business-focused\n", + "\n", + "approach for migration instead of a full planning, execution, test/\n", + "\n", + "validation approach. Here are more approaches that help facilitate\n", + "\n", + "this phased implementation:\n", + "\n", + "\u0007Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . Auto Loader helps to ingestSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
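As one illustration of the declarative, metadata-driven pipeline style and the Auto Loader ingestion pattern referenced above, here is a minimal Delta Live Tables sketch; the landing path, column names and the expectation rule are assumptions rather than anything from the guide.

```python
# A minimal Delta Live Tables sketch: each table is declared, and DLT manages
# dependencies, quality expectations and incremental processing.
# Paths, columns and the expectation below are illustrative assumptions;
# `spark` is provided by the DLT pipeline runtime.
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw orders ingested incrementally with Auto Loader")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/Volumes/main/landing/orders/")   # assumed landing path
    )

@dlt.table(comment="Cleaned orders")
@dlt.expect_or_drop("valid_order_id", "order_id IS NOT NULL")  # drop rows failing the check
def orders_silver():
    return dlt.read_stream("orders_bronze").withColumn("order_date", F.to_date("order_ts"))
```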
8773044556745d4feb2616c8e75292b3of workloads that consume 80% of budget. By breaking workflows\n", + "\n", + "down into components and managing data stories, teams can adjust\n", + "\n", + "priorities over time. Changes can be made in collaboration with the\n", + "\n", + "user community to fit the business definition of value.\n", + "\n", + "Migrating to a lakehouse architecture leverages separation of storage\n", + "\n", + "and compute to remove resource contention between ETL and BI\n", + "\n", + "workloads. As a result, the migration process can be more agile,\n", + "\n", + "allowing you to evolve your design iteratively without big-bang effort:\n", + "\n", + "\u0007Reduce time during the initial phase on full capacity plan and\n", + "\n", + "\n", + "All of this allows you to take a more iterative and business-focused\n", + "\n", + "approach for migration instead of a full planning, execution, test/\n", + "\n", + "validation approach. Here are more approaches that help facilitate\n", + "\n", + "this phased implementation:\n", + "\n", + "\u0007Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . Auto Loader helps to ingest\n", + "\n", + "new data into pipelines quicker to get data in near real-time.\n", + "\n", + "\u0007Delta Live Tables (DLT) improves data quality during data\n", + "\n", + "transformation and automatically scales to address data volume\n", + "\n", + "change. DLT can also support schema evolution and quarantine\n", + "\n", + "bad data or data that needs to be reprocessed at a later stage.\n", + "\n", + "\u0007Use dedicated clusters to isolate workloads, lower the total cost\n", + "\n", + "of ownership and improve overall performance. By using multiple\n", + "\n", + "clusters, we can shut down resources when not in use and move\n", + "\n", + "away from managing fixed resources in a single large cluster.\n", + "\n", + "\n", + "scoping\n", + "\n", + "\u0007Flexible cloud infrastructure and unlimited, autoscaling storage\n", + "\n", + "\u0007Workload management is much simpler, you can isolate each\n", + "\n", + "workload with a dedicated compute resource, without worrying\n", + "\n", + "about managing workload contention\n", + "\n", + "\u0007Auto-scale and tear down the compute resources after the job\n", + "\n", + "is done to achieve cost efficiency\n", + "\n", + "\n", + "-----\n", + "\n", + "Leverage Databricks’ deep bench of expertise to build reusable assets along the migration:\n", + "\n", + "\u0007Create a migration factory for iterative migration process\n", + "\n", + "\u0007Determine and implement a security and governance framework\n", + "\n", + "\u0007Establish a to-be environment and move use cases/workloads in logical units\n", + "\n", + "\u0007Prove business value and scale over time\n", + "\n", + "\u0007Add new functionality continuously so important business requirements are not left on hold during migration\n", + "\n", + "Take this iterative and templated approach. Migration speed will accelerate. 
Customers can finish migration 15%–20% faster and reduce the amount of tech debt created during the migration.\n", + "\n", + "(figure: “Build Foundations,” then iterate: “Make it work,” “Make it work right,” “Make it work fast”; parallelize the iterations across full-lifecycle lighthouse workloads)\n", + "\n", + "Leverage Databricks’ deep bench of expertise to build out some **templates for the most effective Databricks implementation.**\n", + "\n", + "Take an **iterative, bite-sized approach** to migration, reduce tech debt and rework, and bring forward the value of the solution earlier (each iteration covering Migration, Functionality, and Optimization and Delta).\n", + "\n", + "\n", + "-----\n", + "\n", + "To maximize the value of your lakehouse, you should consider retiring some legacy architecture design patterns. Leverage the migration process to simplify data warehousing tasks. Regardless of how you complete your migration, you could utilize lakehouse strengths to improve architectural patterns:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
ff4cafaa91f98cf81ce85137ca9e88ed\u0007Merge your siloed data warehouses on your unified lakehouse platform and unify data access and data governance via Unity Catalog. The lakehouse architecture provides a unified storage layer for all your data where there is no physical boundary between data. There is no need to keep data copies for each system using the data set. Clean up and remove jobs that are created to keep data in sync across various data systems. Keep a single copy of raw data in your lakehouse as a single source of truth.\n", + "\n", + "\u0007The Databricks Lakehouse Platform allows you to merge batch and streaming into a single system to build a simple continuous data flow model to process data as it arrives. Process data in near real-time and enable data-driven decisions with the most recent updates.\n", + "\n", + "\u0007Simplify your workload isolation and management by running jobs in dedicated clusters. Separating storage and compute allows you to easily isolate each task with isolated compute resources. There is no need to squeeze them into a single large data appliance and spend lots of time managing and coordinating resources. Leverage the elasticity of the Databricks compute layer to automatically handle workload concurrency changes at peak time instead of paying for over-provisioned resources for most of the time. This greatly simplifies the workload management effort the traditional data warehouses require.\n", + "\n", + "\u0007Simplify disaster recovery. Storage and compute separation allows easy disaster recovery. The cloud storage provides very good data redundancy and supports automated replication to another region. Customers can spin up compute resources quickly in another region and maintain service availability in case of an outage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Security and data governance\n", + "\n", + "\n", + "Security is paramount in any data-driven organization. 
Data security\n", + "\n", + "should enforce the business needs for both internal and external\n", + "\n", + "data, so the lakehouse should be set up to meet your organization’s\n", + "\n", + "security requirements. Databricks provides built-in security to\n", + "\n", + "protect your data during and after migration.\n", + "\n", + "\u0007Encrypt data at rest and in-transit, using a cloud-managed key\n", + "\n", + "or your own\n", + "\n", + "\u0007Set up a custom network policy, use IP range to control access\n", + "\n", + "\u0007Leverage Private Link to limit network traffic to not traverse the\n", + "\n", + "public internet\n", + "\n", + "\n", + "The challenge with the traditional data warehouse and data lake\n", + "\n", + "architecture is that data is stored in multiple stores and your data\n", + "\n", + "team also needs to manage data access and data governance\n", + "\n", + "twice. The lakehouse pattern uses unified storage which simplifies\n", + "\n", + "governance. The Databricks Lakehouse Platform provides a unified\n", + "\n", + "governance layer across all your data teams. Migrating to Databricks\n", + "\n", + "Unity Catalog provides data discovery, data lineage, role-based\n", + "\n", + "security policies, table or row/column-level access control, and\n", + "\n", + "central auditing capabilities that make the data platform easy for\n", + "\n", + "data stewards to confidently manage and secure data access toSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
75a1c56b0f4933fba47834345e0c4f4dmeet compliance and privacy needs, directly on the lakehouse.\n", + "\n", + "\u0007Enable SSO, integrate with Active Directory and other IdPs\n", + "\n", + "\u0007Control data access to database objects using RBAC\n", + "\n", + "\u0007Enable audit logs to monitor user activities\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Centralized Governance\n", + "\n", + "(figure: centralized governance with account-level user management, credentials, audit log, ACL store and access control, metastore, lineage explorer and data explorer)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Team involvement\n", + "\n", + "Plan to educate and train your team iteratively throughout the migration process. As new workloads are migrated, new teams will gain exposure to the lakehouse pattern. Plan to ramp up new team members as the migration process progresses, developing a data Center of Excellence within the organization. Databricks provides a cost-effective platform for ad hoc work to be performed. A sandbox environment can be leveraged for teams to get exposure to Databricks technology and get hands-on experience. Databricks also provides [learning path](https://databricks.com/learn/training/home) training for customers. Encourage teams to get hands-on experience relevant to their immediate tasks, gain\n", + "\n", + "\n", + "#### Conclusion\n", + "\n", + "Data warehouse migration touches many business areas and impacts many teams, but the Databricks Lakehouse Platform simplifies this transition, reduces risks and accelerates your ROI. The Databricks Business Value Consulting team can work with you to quantify the impact of your use cases to both data and business teams. 
And the Databricks team of solution architects, professional\n", + "\n", + "services, and partners are ready to help.\n", + "\n", + "Reach out to your Databricks account team or send a message to\n", + "\n", + "[sales@databricks.com](mailto:sales%40databricks.com?subject=) to get started.\n", + "\n", + "\n", + "exposure to new things and try new ideas.\n", + "\n", + "#### Additional resources\n", + "\n", + "[Migrate to Databricks](https://databricks.com/solutions/migration)\n", + "\n", + "[Modernize Your Data Warehouse](https://databricks.com/p/webinar/apj-modernize-your-data-warehouse)\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide —\n", + "\n", + "including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on\n", + "\n", + "the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe. Founded by the original\n", + "\n", + "creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n", + "\n", + "data teams solve the world’s toughest problems. To learn more, follow Databricks onSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
3205b185d56baaf2d9a6be359acdc2bd[Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/).\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
10579387cee3f11fabfa76bda3175578```\n", + "TECHNICAL GUIDE\n", + "\n", + "```\n", + "\n", + "# Solving Common Data Challenges \n", + "\n", + "\n", + "#### Startups and Digital Native Businesses\n", + "\n", + "\n", + "-----\n", + "\n", + "### Table of Contents\n", + "\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Creating a unified data architecture for data quality, governance and efficiency\n", + "\n", + "# 03\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building effective machine learning operations\n", + "\n", + "```\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building a data architecture to support scale and performance\n", + "\n", + "# 04\n", + "SUMMARY:\n", + "\n", + "###### The Databricks Lakehouse Platform addresses these challenges\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "\n", + "This guide shares how the lakehouse architecture can increase\n", + "productivity and cost-efficiently support all your data, analytics\n", + "and AI workloads, and flexibly scale with the pace of growth\n", + "for your company. Read the entire guide or dive straight into a\n", + "specific challenge.\n", + "\n", + "With the advent of cloud infrastructure, a new generation of\n", + "startups has rapidly built and scaled their businesses. The use of\n", + "cloud infrastructure, once seen as innovative, has now become\n", + "table stakes. The differentiator for the fastest-moving startups\n", + "and digital natives now comes from the effective use of data\n", + "at scale, primarily analytics and AI. Digital natives — defined\n", + "as fast-moving, lean, and technically savvy, born-in-the-cloud\n", + "organizations — are beginning to focus on new data-driven use\n", + "cases such as real-time machine learning and personalized\n", + "customer experiences.\n", + "\n", + "To pursue these new data-intensive use cases and initiatives,\n", + "organizations must look beyond the technologies that delivered\n", + "them to this point in time. Over time, these technologies, such\n", + "as transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n", + "\n", + "This guide examines some of the biggest data challenges and\n", + "solutions for startups and for scaling digital native businesses\n", + "that have reached the point where an end-to-end modern data\n", + "platform is a smart investment. Some key considerations include:\n", + "systems that are not cost-efficient and require time-consuming\n", + "administration and engineering toil. In addition to growing\n", + "maintenance needs, data is often stored in disparate locations\n", + "and formats, with little or no governance, making real-time use\n", + "cases, analytics and AI difficult or impossible.\n", + "\n", + "\n", + "**Consolidating on a unified data platform**\n", + "As mentioned above, siloed data storage and management add administrative and\n", + "financial cost. You can benefit significantly when you unify your data in one location\n", + "with a flexible architecture that scales with your needs and delivers performance\n", + "for future success. For this, you will want an open platform that supports all your\n", + "data including batch and streaming workloads, data analytics and machine learning.\n", + "With data unification, you create a more efficient, integrated approach to ingesting,\n", + "cleaning and organizing your data. 
You also need automation to make data analysis\n", + "easier for the nontechnical users in the company. But broader data access also\n", + "means more focus on security, privacy, compliance and access control, which can\n", + "create overhead for a growing.\n", + "\n", + "**Scaling up capacity and increasing performance**\n", + "**and usability of the data solutions**\n", + "Data teams at growing digital native organizations find it time intensive and costly to\n", + "handle the growing volume and velocity of their data being ingested from multiple\n", + "sources, across multiple clouds. You now need a unified and simplified platform that\n", + "can instantly scale up capacity and deliver more computing power on demand to\n", + "free up your data teams to produce outputs more quickly. This lowers the total cost\n", + "for the overall infrastructure by eliminating redundant licensing, infrastructure and\n", + "administration costs.\n", + "\n", + "**Building effective machine learning operations**\n", + "For data teams beginning their machine learning journeys, the challenge of training\n", + "data models can increase in management complexity. Many teams with disparate\n", + "coding needs for the entire model lifecycle suffer inefficiencies from transferring\n", + "data and code across many separate services. To build and manage effective\n", + "ML operations, consider an end-to-end MLOps environment that brings all data\n", + "together in one place and incorporates managed services for experiment tracking,\n", + "model training, feature development and feature and model serving.\n", + "\n", + "\n", + "-----\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 01\n", + "\n", + "### Create a unified data architecture for data quality, governance and efficiencySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
44bc7408f155cba595fa74c04215d6b9**Building effective machine learning operations**\n", + "For data teams beginning their machine learning journeys, the challenge of training\n", + "data models can increase in management complexity. Many teams with disparate\n", + "coding needs for the entire model lifecycle suffer inefficiencies from transferring\n", + "data and code across many separate services. To build and manage effective\n", + "ML operations, consider an end-to-end MLOps environment that brings all data\n", + "together in one place and incorporates managed services for experiment tracking,\n", + "model training, feature development and feature and model serving.\n", + "\n", + "\n", + "-----\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 01\n", + "\n", + "### Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "As cloud-born companies grow, data volumes rapidly increase, leading to new\n", + "challenges and use cases. Among the challenges:\n", + "\n", + "\n", + "Application stacks optimized for transaction\n", + "use cases aren’t able to handle the volume,\n", + "velocity and variety of data that modern data\n", + "teams require. For example, this leads to query\n", + "performance issues as data volume grows.\n", + "\n", + "Data silos develop as each team within an\n", + "organization chooses different ETL/ELT and\n", + "storage solutions for their needs. As the\n", + "organization grows and changes, these pipelines\n", + "and storage solutions become brittle, hard to\n", + "maintain and nearly impossible to integrate.\n", + "\n", + "\n", + "These data silos lead to discoverability,\n", + "integration and access issues, which prevent\n", + "teams from leveraging the full value of the\n", + "organization’s available data.\n", + "\n", + "Data governance is hard. Disparate ETL/ELT\n", + "and storage solutions lead to governance,\n", + "compliance, auditability and access control\n", + "challenges, which expose organizations to\n", + "tremendous risk.\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides\n", + "a unified set of tools for building, deploying,\n", + "sharing and maintaining data solutions at scale.\n", + "It integrates with cloud storage and the security\n", + "in your cloud account, manages and deploys\n", + "cloud infrastructure on your behalf. Your data\n", + "practitioners no longer need separate storage\n", + "systems for their data. And you don’t have to rely\n", + "on your cloud provider for security. The lakehouse\n", + "has its own robust security built into the platform.\n", + "\n", + "\n", + "For all the reasons above, the most\n", + "consistent advice from successful data\n", + "practitioners is to create a “single source\n", + "of truth” by unifying all data on a single\n", + "platform. With the Databricks Lakehouse\n", + "Platform, you can unify all your data on one\n", + "platform, reducing data infrastructure costs\n", + "and compute. 
You don’t need excess data\n", + "copies and you can retire expensive\n", + "legacy infrastructure.\n", + "```\n", + " 01\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: GRAMMARLY\n", + "\n", + "### Helping 30 million people and 50,000 teams communicate more effectively\n", + "\n", + "```\n", + "\n", + "While its business is based on analytics, [Grammarly](http://www.grammarly.com)\n", + "\n", + "for many years relied on a homegrown analytics\n", + "\n", + "platform to drive its AI writing assistant to\n", + "\n", + "help users improve multiple aspects of written\n", + "\n", + "communications. As teams developed their own\n", + "\n", + "requirements, data silos inevitably emerged as\n", + "\n", + "different business areas implemented analytics\n", + "\n", + "tools individually.\n", + "\n", + "“Every team decided to solve their analytics\n", + "\n", + "needs in the best way they saw fit,” said Chris\n", + "\n", + "Locklin, Engineering Manager, Data Platforms,\n", + "\n", + "at Grammarly. “That created challenges in\n", + "\n", + "consistency and knowing which data set\n", + "\n", + "was correct.”\n", + "\n", + "To better scale and improve data storage and\n", + "\n", + "query capabilities, Grammarly brought all its\n", + "\n", + "analytical data into the Databricks Lakehouse\n", + "\n", + "Platform and created a central hub for all data\n", + "\n", + "producers and consumers across the company.\n", + "\n", + "Grammarly had several goals with the lakehouse,\n", + "\n", + "including better access control, security, ingestion\n", + "\n", + "\n", + "flexibility, reducing costs and fueling collaboration. “Access control in a\n", + "\n", + "distributed file system is difficult, and it only gets more complicated as\n", + "\n", + "you ingest more data sources,” said Locklin. To manage access control,\n", + "\n", + "enable end-to-end observability and monitor data quality, Grammarly\n", + "\n", + "relies on the data lineage capabilities within Unity Catalog. “Data lineage\n", + "\n", + "allows us to effectively monitor usage of our data and ensure it upholdsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
112b18913a18db059bbc06be42eae3e3Locklin, Engineering Manager, Data Platforms,\n", + "\n", + "at Grammarly. “That created challenges in\n", + "\n", + "consistency and knowing which data set\n", + "\n", + "was correct.”\n", + "\n", + "To better scale and improve data storage and\n", + "\n", + "query capabilities, Grammarly brought all its\n", + "\n", + "analytical data into the Databricks Lakehouse\n", + "\n", + "Platform and created a central hub for all data\n", + "\n", + "producers and consumers across the company.\n", + "\n", + "Grammarly had several goals with the lakehouse,\n", + "\n", + "including better access control, security, ingestion\n", + "\n", + "\n", + "flexibility, reducing costs and fueling collaboration. “Access control in a\n", + "\n", + "distributed file system is difficult, and it only gets more complicated as\n", + "\n", + "you ingest more data sources,” said Locklin. To manage access control,\n", + "\n", + "enable end-to-end observability and monitor data quality, Grammarly\n", + "\n", + "relies on the data lineage capabilities within Unity Catalog. “Data lineage\n", + "\n", + "allows us to effectively monitor usage of our data and ensure it upholds\n", + "\n", + "the standards we set as a data platform team,” said Locklin. “Lineage is\n", + "\n", + "the last crucial piece for access control.”\n", + "\n", + "Data analysts within Grammarly now have a consolidated interface for\n", + "\n", + "analytics, which leads to a single source of truth and confidence in the\n", + "\n", + "accuracy and availability of all data managed by the data platform team.\n", + "\n", + "Having a consistent data source across the company also resulted in\n", + "\n", + "greater speed and efficiency and reduced costs. Data practitioners\n", + "\n", + "experienced 110% faster querying at 10% of the cost to ingest compared\n", + "\n", + "to a data warehouse. Grammarly can now make its 5 billion daily events\n", + "\n", + "available for analytics in under 15 minutes rather than 4 hours. Migrating\n", + "\n", + "off its rigid legacy infrastructure gave Grammarly the flexibility to do\n", + "\n", + "more and the confidence that the platform will evolve with its needs.\n", + "\n", + "Grammarly is now able to sustain a flexible, scalable and highly secure\n", + "\n", + "analytics platform that helps 30 million people and 50,000 teams\n", + "\n", + "worldwide write more effectively every day.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/grammarly)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to unify the data infrastructure with Databricks\n", + "\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\n", + "is composed of two primary parts:\n", + "\n", + "- The infrastructure to deploy, configure and\n", + "manage the platform and services\n", + "\n", + "\n", + "You can build a Databricks workspace by configuring\n", + "secure integrations between the Databricks platform\n", + "and your cloud account, and then Databricks deploys\n", + "temporary Apache Spark™/Photon clusters using cloud\n", + "resources in your account to process and store data\n", + "in object storage and other integrated services you\n", + "control. Here are three steps to get started with the\n", + "Databricks Lakehouse Platform:\n", + "\n", + "**Understand the architecture**\n", + "The lakehouse provides a unified architecture,\n", + "meaning that all data is stored in the same\n", + "accessible place. 
The diagram shows how data\n", + "comes in from sources like a customer relationship\n", + "management (CRM) system, an enterprise resource\n", + "planning (ERP) system, websites or unstructured\n", + "customer emails.\n", + "\n", + "**Optimize the storage layer**\n", + "All data is stored in cloud storage while Databricks\n", + "provides tooling to assist with ingestion, such as\n", + "Auto Loader, and we recommend [open-source](https://delta.io/)\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\n", + "Delta optimized storage layer that provides the\n", + "foundation for storing data and tables in the\n", + "Databricks Lakehouse Platform. Having all your\n", + "data in the same optimized, open storage keeps\n", + "all your use cases in the same place, thus enabling\n", + "collaboration and removing software tool overhead.\n", + "\n", + "\n", + "\n", + "- the customer-owned infrastructure managed in\n", + "collaboration by Databricks and the customer.\n", + "\n", + "\n", + "The lakehouse handles all varieties of data (structured, semi-structured, unstructured),\n", + "as well as all velocities of data (streaming, batch or somewhere in the middle).SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
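As a small illustration of standardizing on Delta Lake as the storage layer recommended above, this sketch (assumed paths and table names, run from a Databricks notebook) writes new data as a Delta table and converts an existing Parquet directory in place.

```python
# Standardize storage on Delta Lake; paths and table names are assumptions.
# Write new data directly as a Delta table...
events = spark.read.json("/Volumes/main/landing/events/")
events.write.format("delta").saveAsTable("main.bronze.events")

# ...and convert an existing Parquet directory to Delta in place.
spark.sql("CONVERT TO DELTA parquet.`/Volumes/main/legacy/clickstream/`")
```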
6fe95ef77e68154410332294fec104f9**Optimize the storage layer**\n", + "All data is stored in cloud storage while Databricks\n", + "provides tooling to assist with ingestion, such as\n", + "Auto Loader, and we recommend [open-source](https://delta.io/)\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\n", + "Delta optimized storage layer that provides the\n", + "foundation for storing data and tables in the\n", + "Databricks Lakehouse Platform. Having all your\n", + "data in the same optimized, open storage keeps\n", + "all your use cases in the same place, thus enabling\n", + "collaboration and removing software tool overhead.\n", + "\n", + "\n", + "\n", + "- the customer-owned infrastructure managed in\n", + "collaboration by Databricks and the customer.\n", + "\n", + "\n", + "The lakehouse handles all varieties of data (structured, semi-structured, unstructured),\n", + "as well as all velocities of data (streaming, batch or somewhere in the middle).\n", + "\n", + "[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Lakehouse organizes data stored with Delta Lake in cloud object\n", + "storage with familiar concepts like database, tables and views. Delta Lake extends\n", + "Parquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\n", + "scalable metadata handling. Delta Lake is fully compatible with Apache Spark APIs,\n", + "and was developed for tight integration with Structured Streaming, allowing you to\n", + "easily use a single copy of data for both batch and streaming operations to provide\n", + "incremental processing at scale.This model combines many of the benefits of a data\n", + "warehouse with the scalability and flexibility of a data lake.\n", + "\n", + "To learn more about the optimized storage layer that provides the foundation for\n", + "storing data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n", + "[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n", + "\n", + "The first step in unifying your data architecture is setting up how data is to be\n", + "accessed and used across the organization. We’ll discuss this as a series of steps:\n", + "\n", + "**1** Set up governance with Unity Catalog\n", + "\n", + "**2** Grant secure access to the data\n", + "\n", + "\n", + "###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. 
“Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n", + " – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n", + "\n", + "[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "**3** Capture audit logs\n", + "\n", + "**4** View data lineage\n", + "\n", + "**5** Set up data sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Configure unified governance**\n", + "Databricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\n", + "means that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\n", + "is secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\n", + "personas and automatically captures user-level audit logs that record access to your data.\n", + "\n", + "Data stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\n", + "languages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
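A minimal sketch of the single-copy batch-plus-streaming pattern mentioned above: a batch job appends to a Delta table while a Structured Streaming query reads the same table incrementally. Table, path and checkpoint names are assumptions.

```python
# Batch side: append new records to a Delta table.
batch_df = spark.read.parquet("/Volumes/main/staging/events/")   # assumed source
batch_df.write.format("delta").mode("append").saveAsTable("main.silver.events")

# Streaming side: read the same table incrementally and maintain an aggregate.
counts = (
    spark.readStream.table("main.silver.events")
    .groupBy("event_type")
    .count()
)
(
    counts.writeStream.format("delta")
    .outputMode("complete")
    .option("checkpointLocation", "/Volumes/main/checkpoints/event_counts")
    .toTable("main.gold.event_counts")
)
```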
7464e873c78f3ca1d4eb6292685b6fe6**3** Capture audit logs\n", + "\n", + "**4** View data lineage\n", + "\n", + "**5** Set up data sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Configure unified governance**\n", + "Databricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\n", + "means that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\n", + "is secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\n", + "personas and automatically captures user-level audit logs that record access to your data.\n", + "\n", + "Data stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\n", + "languages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n", + "\n", + "To set up Unity Catalog for your organization,\n", + "you do the following:\n", + "\n", + "\n", + "**1** Configure an S3 bucket and IAM role that\n", + "Unity Catalog can use to store and access\n", + "data in your AWS account.\n", + "\n", + "**2** Create a metastore for each region in\n", + "\n", + "which your organization operates, and\n", + "attach workspaces to the metastore. Each\n", + "workspace will have the same view of the\n", + "data you manage in Unity Catalog.\n", + "\n", + "\n", + "**3** If you have a new account, add users,\n", + "groups and service principals to your\n", + "Databricks account.\n", + "\n", + "**4** Next, create and grant access to\n", + "\n", + "catalogs, schemas and tables.\n", + "\n", + "\n", + "For complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How Unity Catalog works\n", + "\n", + "\n", + "You will notice that the hierarchy of primary data\n", + "objects in Unity Catalog flows from metastore to table:\n", + "\n", + "**Metastore** is the top-level container for metadata.\n", + "Each metastore exposes a three-level namespace\n", + "(catalog.schema.table) that organizes your data.\n", + "\n", + "\n", + "**Metastore** **Catalog** **Schemas**\n", + "\n", + "\n", + "**Views**\n", + "\n", + "**Managed**\n", + "**Tables**\n", + "\n", + "\n", + "**Catalog** is the first layer of the object hierarchy, used\n", + "to organize your data assets.\n", + "\n", + "\n", + "**Schemas** , also known as databases, are the second\n", + "layer of the object hierarchy and contain tables and\n", + "views.\n", + "\n", + "**Table** is the lowest level in the object hierarchy, and\n", + "tables can be external (stored in external locations in\n", + "your cloud storage of choice) or managed (stored in a\n", + "storage container in your cloud storage that you create\n", + "\n", + "expressly for Databricks). 
You can also create readonly **Views** from tables.\n", + "\n", + "\n", + "**External**\n", + "**tables**\n", + "\n", + "The diagram below represents the file system\n", + "hierarchy of a single storage bucket:\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog uses the identities in the Databricks\n", + "account to resolve users, service principals, and groups\n", + "and to enforce permissions. To configure identities in\n", + "the account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n", + "[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\n", + "service principals, and groups when you create\n", + "[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
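To make steps like “create and grant access to catalogs, schemas and tables” concrete, here is a minimal sketch using standard Unity Catalog SQL from a notebook; the catalog, schema, table and group names are assumptions.

```python
# Create the catalog -> schema -> table hierarchy (names are assumptions).
spark.sql("CREATE CATALOG IF NOT EXISTS analytics")
spark.sql("CREATE SCHEMA IF NOT EXISTS analytics.sales")
spark.sql("""
    CREATE TABLE IF NOT EXISTS analytics.sales.orders (
        order_id STRING,
        order_date DATE,
        amount DOUBLE
    )
""")

# Grant access: a team group gets USE + CREATE on its schema, while a broader
# analyst group gets read-only access to a single table. Privileges granted on
# a schema are inherited by the objects inside it.
spark.sql("GRANT USE CATALOG ON CATALOG analytics TO `data-science`")
spark.sql("GRANT USE SCHEMA, CREATE TABLE ON SCHEMA analytics.sales TO `data-science`")
spark.sql("GRANT SELECT ON TABLE analytics.sales.orders TO `analysts`")
```

The per-team schema pattern described later (granting a group USE SCHEMA and CREATE on its own schema) uses this same GRANT syntax.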
a0f7f52e108693072a591793dd53fd03expressly for Databricks). You can also create readonly **Views** from tables.\n", + "\n", + "\n", + "**External**\n", + "**tables**\n", + "\n", + "The diagram below represents the file system\n", + "hierarchy of a single storage bucket:\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog uses the identities in the Databricks\n", + "account to resolve users, service principals, and groups\n", + "and to enforce permissions. To configure identities in\n", + "the account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n", + "[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\n", + "service principals, and groups when you create\n", + "[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n", + "\n", + "Unity Catalog users, service principals, and groups\n", + "must also be added to workspaces to access Unity\n", + "Catalog data in a notebook, a Databricks SQL query,\n", + "Data Explorer or a REST API command. The assignment\n", + "of users, service principals, and groups to workspaces\n", + "is called identity federation. All workspaces attached\n", + "to a Unity Catalog metastore are enabled for identity\n", + "federation.\n", + "\n", + "Securable objects in Unity Catalog are hierarchical,\n", + "meaning that granting a privilege on a catalog or schema\n", + "automatically grants the privilege to all current and\n", + "future objects within the catalog or schema. For more\n", + "on granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\n", + "A common scenario is to set up a schema per team\n", + "where only that team has USE SCHEMA and CREATE on\n", + "the schema. This means that any tables produced by\n", + "team members can only be shared within the team.\n", + "Data Explorer uses the privileges configured by Unity\n", + "Catalog administrators to ensure that users are only\n", + "able to see catalogs, databases, tables and views that\n", + "they have permission to query.\n", + "\n", + "\n", + "[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\n", + "many Unity Catalog features. Use Data Explorer to view\n", + "schema details, preview sample data, and see table\n", + "details and properties. Administrators can view and\n", + "change owners. Admins and data object owners can grant\n", + "and revoke permissions through this interface.\n", + "\n", + "**Set up secure access**\n", + "In Unity Catalog, data is secure by default. Initially, users\n", + "have no access to data in a metastore. Access can\n", + "be granted by either a metastore admin, the owner of\n", + "an object, or the owner of the catalog or schema that\n", + "contains the object. Securable objects in Unity Catalog\n", + "are hierarchical and privileges are inherited downward.\n", + "\n", + "Unity Catalog’s security model is based on standard ANSI\n", + "SQL and allows administrators to grant permissions in\n", + "their existing data lake using familiar syntax, at the level of\n", + "catalogs, databases (schema), tables and views. 
Privileges\n", + "and metastores are shared across workspaces, allowing\n", + "administrators to set secure permissions once against\n", + "\n", + "groups synced from identity providers and know that\n", + "end users only have access to the proper data in any\n", + "Databricks workspace they enter.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: BUTCHERBOX\n", + "\n", + "### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n", + "\n", + "```\n", + "\n", + "As a young e-commerce company,\n", + "\n", + "[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n", + "\n", + "customers’ needs change, which means it is\n", + "\n", + "constantly considering behavioral patterns,\n", + "\n", + "distribution center efficiency, a growing list of\n", + "\n", + "marketing and communication channels, and\n", + "\n", + "order processing systems.\n", + "\n", + "The meat and seafood subscription company\n", + "\n", + "collects data on hundreds of thousands\n", + "\n", + "of subscribers. It deployed the Databricks\n", + "\n", + "Lakehouse Platform to gain visibility across\n", + "\n", + "its diverse range of data systems and enable\n", + "\n", + "its analytics team to securely view and\n", + "\n", + "export data in the formats needed.\n", + "\n", + "With so much data feeding in from different\n", + "\n", + "sources — from email systems to its website\n", + "\n", + "— the data team at ButcherBox quickly\n", + "\n", + "discovered that data silos were a significantSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
2cb92e326f83cade2a74789a0196a281### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n", + "\n", + "```\n", + "\n", + "As a young e-commerce company,\n", + "\n", + "[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n", + "\n", + "customers’ needs change, which means it is\n", + "\n", + "constantly considering behavioral patterns,\n", + "\n", + "distribution center efficiency, a growing list of\n", + "\n", + "marketing and communication channels, and\n", + "\n", + "order processing systems.\n", + "\n", + "The meat and seafood subscription company\n", + "\n", + "collects data on hundreds of thousands\n", + "\n", + "of subscribers. It deployed the Databricks\n", + "\n", + "Lakehouse Platform to gain visibility across\n", + "\n", + "its diverse range of data systems and enable\n", + "\n", + "its analytics team to securely view and\n", + "\n", + "export data in the formats needed.\n", + "\n", + "With so much data feeding in from different\n", + "\n", + "sources — from email systems to its website\n", + "\n", + "— the data team at ButcherBox quickly\n", + "\n", + "discovered that data silos were a significant\n", + "\n", + "\n", + "“We knew we needed to migrate from our legacy data warehouse\n", + "\n", + "environment to a data analytics platform that would unify our\n", + "\n", + "data and make it easily accessible for quick analysis to improve\n", + "\n", + "supply chain operations, forecast demand and, most importantly,\n", + "\n", + "keep up with our growing customer base,” explained Jake Stone,\n", + "\n", + "Senior Manager, Business Analytics, at ButcherBox.\n", + "\n", + "The platform allows analysts to share builds and iterate on a\n", + "\n", + "project without getting into the code. Querying a table of 18\n", + "\n", + "billion rows would have been problematic with a traditional\n", + "\n", + "platform. With Databricks, ButcherBox can do it in three minutes.\n", + "\n", + "“Delta Lake provides us with a single source of truth for all of\n", + "\n", + "our data,” said Stone. “Now our data engineers are able to build\n", + "\n", + "reliable data pipelines that thread the needle on key topics such\n", + "\n", + "as inventory management, allowing us to identify in near real-\n", + "\n", + "time what our trends are so we can figure out how to effectively\n", + "\n", + "move inventory.”\n", + "\n", + "[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "problem because they blocked complete\n", + "\n", + "visibility into critical insights needed to make\n", + "\n", + "strategic and marketing decisions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Set up secure data sharing**\n", + "Databricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\n", + "to share data with other entities regardless of their\n", + "computing platforms. Delta Sharing is integrated with\n", + "Unity Catalog. Your data must be registered with Unity\n", + "Catalog to manage, govern, audit and track usage of the\n", + "shared data on the Lakehouse Platform. 
The primary\n", + "concepts of Delta Sharing are shares (read-only\n", + "collections of tables and table partitions to be shared)\n", + "and recipients (objects that associate an organization\n", + "with a credential or secure sharing identifier).\n", + "\n", + "As a data provider, you generate a token and share\n", + "it securely with the recipient. They use the token to\n", + "authenticate and get read access to the tables you’ve\n", + "included in the shares you’ve given them access\n", + "to. Recipients access the shared data in read-only\n", + "format. Whenever the data provider updates data\n", + "tables in their own Databricks account, the updates\n", + "appear in near real-time in the recipient’s system.\n", + "\n", + "\n", + "**Capture audit logs**\n", + "Unity Catalog captures an audit log of actions\n", + "performed against the metastore. To access audit\n", + "logs for Unity Catalog events, you must enable and\n", + "configure audit logs for your account. Audit logs for\n", + "each workspace and account-level activities are\n", + "delivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n", + "[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
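A minimal sketch of the share/recipient flow described above: the provider defines a share and recipient with Databricks SQL, and the recipient reads the shared table with the open-source delta-sharing client. The share, recipient and table names, and the credential-file path, are assumptions.

```python
# Provider side (run on Databricks): create a share, add a table, create a recipient.
spark.sql("CREATE SHARE IF NOT EXISTS sales_share")
spark.sql("ALTER SHARE sales_share ADD TABLE analytics.sales.orders")
spark.sql("CREATE RECIPIENT IF NOT EXISTS partner_co")  # produces an activation link for the credential

# Recipient side (any platform): read the shared table with the open client,
# using the credential file downloaded from the activation link.
import delta_sharing

profile = "/path/to/partner_co.share"                   # assumed credential file location
table_url = f"{profile}#sales_share.sales.orders"       # <profile>#<share>.<schema>.<table>
df = delta_sharing.load_as_pandas(table_url)
print(df.head())
```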
e574ab8b9c03dd45aa08ea51855a9280As a data provider, you generate a token and share\n", + "it securely with the recipient. They use the token to\n", + "authenticate and get read access to the tables you’ve\n", + "included in the shares you’ve given them access\n", + "to. Recipients access the shared data in read-only\n", + "format. Whenever the data provider updates data\n", + "tables in their own Databricks account, the updates\n", + "appear in near real-time in the recipient’s system.\n", + "\n", + "\n", + "**Capture audit logs**\n", + "Unity Catalog captures an audit log of actions\n", + "performed against the metastore. To access audit\n", + "logs for Unity Catalog events, you must enable and\n", + "configure audit logs for your account. Audit logs for\n", + "each workspace and account-level activities are\n", + "delivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n", + "[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n", + "\n", + "**View data lineage**\n", + "You can use Unity Catalog to capture runtime data\n", + "lineage across queries in any language executed on\n", + "a Databricks cluster or SQL warehouse. Lineage can\n", + "be visualized in Data Explorer in near real-time and\n", + "retrieved with the Databricks REST API. Lineage is\n", + "aggregated across all workspaces attached to Unity\n", + "Catalog and captured down to the column level, and\n", + "includes notebooks, workflows and dashboards related\n", + "to the query. To understand the requirements and how\n", + "to capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n", + "[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n", + "\n", + "\n", + "Unity Catalog Metastore\n", + "\n", + "\n", + "Catalog\n", + "\n", + "\n", + "Data providers can use Databricks audit logging to\n", + "monitor the creation and modification of shares,\n", + "and recipients can monitor recipient activity on\n", + "shares. 
Data recipients who use shared data in a\n", + "Databricks account can use Databricks audit logging\n", + "to understand who is accessing which data.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n", + "\n", + "- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n", + "\n", + "- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n", + "\n", + "- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n", + "\n", + "- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "\n", + "\n", + "###### Key Takeaways\n", + "\n", + "- With the Databricks Lakehouse Platform, you can\n", + "unify and simplify all your data on one platform\n", + "to better scale and improve data storage and\n", + "query capabilities\n", + "\n", + "- The lakehouse helps reduce data infrastructure\n", + "and compute costs. You don’t need excess\n", + "data copies and can retire expensive legacy\n", + "infrastructure.\n", + "\n", + "\n", + "Leverage Delta Lake as the open format\n", + "storage layer to deliver reliability, security and\n", + "performance on your data lake — for both\n", + "streaming and batch operations — replacing\n", + "data silos with a single home for structured,\n", + "semi-structured and unstructured data\n", + "\n", + "With Unity Catalog you can centralize\n", + "governance for all data and AI assets including\n", + "files, tables, machine learning models and\n", + "dashboards in your lakehouse on any cloudSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
The Databricks Lakehouse Platform is open source with multicloud flexibility so that you can use your data however and wherever you want — no vendor lock-in.

-----

# 02
### CHALLENGE 02: Build your data architecture to support scale and performance

As modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to the increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come suddenly and unexpectedly; when it does, the existing architecture needs to sustain performance while remaining cost-effective. The relational databases and traditional data warehouses that once met the needs of the business are now creating limitations for new real-time use cases and large-scale data analytics pipelines.

Here are some common challenges around managing data and performance at scale:

**Volume and velocity** — Exponentially increasing data sources, and the speed at which they capture and create data.

**Latency requirements** — The demands of downstream applications and users have evolved (people want data, and the results from the data, faster).

**Governance** — Cataloging, auditing, securing and reporting on data is burdensome at scale when using old systems not built with data access controls and compliance in mind.

**Multicloud** — Operating consistently across more than one cloud is hard.

**Data storage** — Data stored in the wrong format is slow to access and query, and expensive at scale.

**Data format** — Supporting structured, semi-structured and unstructured data formats is now a requirement.
Most data storage\n", + "solutions are designed to handle only one type\n", + "of data, requiring multiple products\n", + "to be stitched together.\n", + "\n", + "```\n", + "02\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Lakehouse solves scale and performance challenges\n", + "\n", + "\n", + "The solution for growing digital companies is a unified\n", + "and simplified platform that can instantly scale up\n", + "capacity to deliver more computing power on demand,\n", + "freeing up teams to go after the much-needed data\n", + "and produce outputs more quickly. With a lakehouse,\n", + "they can replace their data silos with a single home for\n", + "their structured, semi-structured and unstructured\n", + "data. Users and applications throughout the enterprise\n", + "environment can connect to the same single copy of\n", + "the data to drive diverse workloads.\n", + "\n", + "The lakehouse architecture is cost-efficient for\n", + "scaling, lowering the total cost of ownership for the\n", + "overall infrastructure by consolidating all data estate\n", + "and use cases onto a single platform and eliminating\n", + "redundant licensing, infrastructure and administration\n", + "costs. Unlike other warehouse options that can only\n", + "scale horizontally, the Databricks Lakehouse can scale\n", + "horizontally and vertically based on workload demands.\n", + "\n", + "With the Databricks Lakehouse, you can optimize the\n", + "compute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\n", + "research by the Barcelona Supercomputing Center.\n", + "And your data teams are more productive by focusing\n", + "on more strategic initiatives versus managing multiple\n", + "data solutions.\n", + "\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "### Driving into the future of electric transportation\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: RIVIANSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
a8a76379a0975b07547488cacc7a4a80With the Databricks Lakehouse, you can optimize the\n", + "compute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\n", + "research by the Barcelona Supercomputing Center.\n", + "And your data teams are more productive by focusing\n", + "on more strategic initiatives versus managing multiple\n", + "data solutions.\n", + "\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "### Driving into the future of electric transportation\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "```\n", + "\n", + "With more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n", + "\n", + "day, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n", + "\n", + "legacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n", + "\n", + "Before Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n", + "\n", + "decreased output, prevented collaboration and increased operational costs. Rivian chose to modernize its data\n", + "\n", + "infrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n", + "\n", + "downstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n", + "\n", + "actionable insights for different use cases, from predictive maintenance to smarter product development.\n", + "\n", + "“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n", + "\n", + "performant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n", + "\n", + "Wassym Bensaid, Vice President of Software Development at Rivian.\n", + "\n", + "For instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n", + "\n", + "accelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n", + "\n", + "roll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n", + "\n", + "connected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n", + "\n", + "smart features and the control that drivers have over them. 
By leveraging the Databricks Lakehouse Platform, Rivian\n", + "\n", + "has seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/rivian)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to ensure scalability and performance with Databricks\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\n", + "scalability and performance for your data architecture\n", + "based on the following features and capabilities:\n", + "\n", + "- A simplified and cost-efficient architecture that\n", + "increases productivity\n", + "\n", + "- A platform that ensures reliable, high performing\n", + "ETL workloads — for streaming and batch data\n", + "— while Databricks automatically manages your\n", + "infrastructure\n", + "\n", + "- The ability to ingest, transform and query all your\n", + "data in one place, and scale on demand with\n", + "serverless compute\n", + "\n", + "- Enables real-time data access for all data,\n", + "analytics and AI use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "The following section will provide a short series of\n", + "steps for understanding the key components of the\n", + "Databricks Lakehouse Platform.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
7351ac4134a858391fa716f964aaaef2[Read the full story here.](https://www.databricks.com/customers/rivian)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to ensure scalability and performance with Databricks\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\n", + "scalability and performance for your data architecture\n", + "based on the following features and capabilities:\n", + "\n", + "- A simplified and cost-efficient architecture that\n", + "increases productivity\n", + "\n", + "- A platform that ensures reliable, high performing\n", + "ETL workloads — for streaming and batch data\n", + "— while Databricks automatically manages your\n", + "infrastructure\n", + "\n", + "- The ability to ingest, transform and query all your\n", + "data in one place, and scale on demand with\n", + "serverless compute\n", + "\n", + "- Enables real-time data access for all data,\n", + "analytics and AI use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "The following section will provide a short series of\n", + "steps for understanding the key components of the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "\n", + "**Step 2**\n", + "**Understand the common Delta Lake operations**\n", + "The Databricks Lakehouse Platform simplifies the\n", + "entire data lifecycle, from data ingestion to monitoring\n", + "and governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\n", + "open-source storage system based on the Delta\n", + "format providing reliability through ACID transactions\n", + "and scalable metadata handling. Large quantities of\n", + "raw files in blob storage can be converted to Delta to\n", + "organize and store the data cheaply. This allows for\n", + "flexibility of data movement while being performant\n", + "and less expensive.\n", + "\n", + "\n", + "**Step 1**\n", + "**Get a trial Databricks account**\n", + "Start your 14-day free trial with Databricks on\n", + "AWS in a few easy steps.\n", + "[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . During the 14day free trial, all Databricks usage is free, but Databricks\n", + "uses compute and S3 storage resources in your cloud\n", + "provider account.\n", + "\n", + "\n", + "and writing data can occur simultaneously without risk\n", + "of many queries resulting in performance degradation\n", + "or deadlock for business-critical workloads.\n", + "\n", + "This means that users and applications throughout\n", + "the enterprise environment can connect to the same\n", + "single copy of the data to drive diverse workloads, with\n", + "all viewers guaranteed to receive the most current\n", + "version of the data at the time their query executes.\n", + "With performance features like indexing, Delta Lake\n", + "customers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n", + "[up to 48x faster.](https://www.databricks.com/customers/columbia)\n", + "\n", + "\n", + "[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\n", + "and learn how to create, manage and query tables.\n", + "With support for ACID transactions and schema\n", + "enforcement, Delta Lake provides the reliability that\n", + "traditional data lakes lack. 
This enables you to scale\n", + "reliable data insights throughout the organization and\n", + "run analytics and other data projects directly on your\n", + "data lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n", + "\n", + "Delta Lake transactions use log files stored alongside\n", + "data files to provide ACID guarantees at a table level.\n", + "Because the data and log files backing Delta Lake\n", + "tables live together in cloud object storage, reading\n", + "\n", + "\n", + "-----\n", + "\n", + "All data in Delta Lake is stored in open Apache Parquet\n", + "format, allowing data to be read by any compatible\n", + "reader. APIs are open and compatible with Apache\n", + "Spark, so you have access to a vast open-source\n", + "ecosystem to avoid data lock-in from proprietary\n", + "formats and conversions, which have embedded and\n", + "added costs.\n", + "\n", + "###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n", + "\n", + " — Steve Pulec, Chief Technology Officer, YipitData\n", + "\n", + "[Learn more](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
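To make Step 2 concrete, here is a small sketch of the basic Delta Lake operations it references (converting raw files to a Delta table, transactional upserts, and inspecting table history); the paths and table names are placeholders.

```python
# A small sketch of the Delta Lake operations referenced in Step 2:
# convert raw files to Delta, then rely on ACID transactions for updates.
# Paths and table names are placeholders.
from delta.tables import DeltaTable

# Convert raw JSON files in object storage into a managed Delta table.
raw_df = spark.read.json("/Volumes/main/raw/events/")  # raw files in a UC volume
raw_df.write.format("delta").mode("overwrite").saveAsTable("main.bronze.events")

# Query it like any other table.
display(spark.table("main.bronze.events").limit(10))

# Transactional upsert (MERGE) into the table.
updates_df = spark.read.json("/Volumes/main/raw/events_updates/")
target = DeltaTable.forName(spark, "main.bronze.events")
(
    target.alias("t")
    .merge(updates_df.alias("s"), "t.event_id = s.event_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Inspect the transaction log that backs the ACID guarantees.
display(spark.sql("DESCRIBE HISTORY main.bronze.events"))
```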
d9dd58cc603554c740505cf7cba15920Delta Lake transactions use log files stored alongside\n", + "data files to provide ACID guarantees at a table level.\n", + "Because the data and log files backing Delta Lake\n", + "tables live together in cloud object storage, reading\n", + "\n", + "\n", + "-----\n", + "\n", + "All data in Delta Lake is stored in open Apache Parquet\n", + "format, allowing data to be read by any compatible\n", + "reader. APIs are open and compatible with Apache\n", + "Spark, so you have access to a vast open-source\n", + "ecosystem to avoid data lock-in from proprietary\n", + "formats and conversions, which have embedded and\n", + "added costs.\n", + "\n", + "###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n", + "\n", + " — Steve Pulec, Chief Technology Officer, YipitData\n", + "\n", + "[Learn more](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 3**\n", + "**Ingest data efficiently at scale**\n", + "With a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\n", + "from hundreds of data sources for analytics, AI and\n", + "streaming applications into one place.\n", + "\n", + "Databricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\n", + "data ingestion. To ingest any file that can land in a data\n", + "lake, Auto Loader incrementally and automatically\n", + "processes new data files as they arrive in cloud storage\n", + "in scheduled or continuous jobs. Auto Loader scales to\n", + "support near real-time ingestion of millions of files\n", + "per hour.\n", + "\n", + "For pushing data in Delta Lake, the SQL command\n", + "[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\n", + "into Delta Lake. COPY INTO is best used when the input\n", + "directory contains thousands of files or fewer, and the\n", + "user prefers SQL. COPY INTO can be used over JDBC\n", + "to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "**Step 4**\n", + "**Leverage production-ready tools**\n", + "**to automate ETL pipelines**\n", + "Once the raw data is ingested, Databricks provides\n", + "a suite of production-ready tools that allow data\n", + "professionals to quickly develop and deploy extract,\n", + "\n", + "transform and load (ETL) pipelines. 
Databricks SQL\n", + "allows analysts to run SQL queries against the same\n", + "tables used in production ETL workloads, allowing for\n", + "real-time business intelligence at scale.\n", + "\n", + "With your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "[your first extract, transform and load (ETL) pipelines](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "for data orchestration and learn how easy it is to create\n", + "a cluster, create a Databricks notebook, configure\n", + "[Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for ingestion into [Delta Lake](https://docs.databricks.com/delta/index.html) , process and\n", + "interact with the data, and schedule a job.\n", + "\n", + "\n", + "Databricks supports workloads in SQL, Python, Scala\n", + "and R, allowing users with diverse skill sets and\n", + "technical backgrounds to leverage their knowledge\n", + "to derive analytic insights. You can use all languages\n", + "supported by Databricks to define production jobs, and\n", + "notebooks can leverage a combination of languages.\n", + "\n", + "This means that you can promote queries written by\n", + "SQL analysts for last-mile ETL into production data\n", + "engineering code with almost no effort. Queries and\n", + "workloads defined by personas across the organization\n", + "leverage the same data sets, so there’s no need to\n", + "reconcile field names or make sure dashboards are up\n", + "to date before sharing code and results with\n", + "other teams.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
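As a hedged sketch of the two ingestion paths described in Step 3, the snippet below shows Auto Loader for incremental file ingestion and COPY INTO for SQL-first batch loads; the paths, table names and checkpoint locations are placeholders.

```python
# A minimal sketch of the two ingestion paths described in Step 3.
# Paths, table names and the checkpoint/schema locations are placeholders.

# 1) Auto Loader: incrementally pick up new files as they land in cloud storage.
(
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/main/checkpoints/events_schema")
    .load("/Volumes/main/landing/events/")
    .writeStream
    .option("checkpointLocation", "/Volumes/main/checkpoints/events")
    .trigger(availableNow=True)  # run as a scheduled, batch-style job
    .toTable("main.bronze.events")
)

# 2) COPY INTO: SQL-first batch ingestion, idempotent across reruns.
# (The target table must already exist, e.g. created with CREATE TABLE.)
spark.sql("""
  COPY INTO main.bronze.events
  FROM '/Volumes/main/landing/events/'
  FILEFORMAT = JSON
  FORMAT_OPTIONS ('inferSchema' = 'true')
  COPY_OPTIONS ('mergeSchema' = 'true')
""")
```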
57a6fe7bc345cc9f0a87c5e3e8917062Databricks supports workloads in SQL, Python, Scala\n", + "and R, allowing users with diverse skill sets and\n", + "technical backgrounds to leverage their knowledge\n", + "to derive analytic insights. You can use all languages\n", + "supported by Databricks to define production jobs, and\n", + "notebooks can leverage a combination of languages.\n", + "\n", + "This means that you can promote queries written by\n", + "SQL analysts for last-mile ETL into production data\n", + "engineering code with almost no effort. Queries and\n", + "workloads defined by personas across the organization\n", + "leverage the same data sets, so there’s no need to\n", + "reconcile field names or make sure dashboards are up\n", + "to date before sharing code and results with\n", + "other teams.\n", + "\n", + "\n", + "-----\n", + "\n", + "With [Delta Live Tables](https://www.databricks.com/product/delta-live-tables) (DLT), data professionals have\n", + "a framework that uses a simple declarative approach\n", + "to build ETL and ML pipelines on batch or streaming\n", + "data while automating operational complexities such\n", + "as infrastructure management, task orchestration,\n", + "error handling and recovery, retries, and performance\n", + "optimization.\n", + "\n", + "Delta Live Tables extends functionality in Apache Spark\n", + "Structured Streaming and allows you to write just a\n", + "few lines of declarative Python or SQL to deploy a\n", + "production-quality data pipeline with:\n", + "\n", + "- [Autoscaling compute infrastructure](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#auto-scaling) for cost savings\n", + "\n", + "- Data quality checks with [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html)\n", + "\n", + "- Automatic [schema evolution](https://docs.databricks.com/ingestion/auto-loader/schema.html) handling\n", + "\n", + "- Monitoring via metrics in the [event log](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-event-log.html)\n", + "\n", + "With DLT, engineers can also treat their data as code\n", + "and apply software engineering best practices like\n", + "testing, monitoring and documentation to deploy\n", + "reliable pipelines at scale. You can easily define end-toend data pipelines in SQL or Python and automatically\n", + "maintain all data dependencies across the pipeline and\n", + "reuse ETL pipelines with environment-independent\n", + "data management.\n", + "\n", + "```\n", + "CUSTOMER STORY: ABNORMAL SECURITY\n", + "\n", + "### Stopping sophisticated ransomware in its tracks\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: ABNORMAL SECURITY\n", + "\n", + "```\n", + "\n", + "The increase in email phishing and ransomware attacks requires the type of protection that can scale and evolve\n", + "\n", + "to meet the challenges of modern cyberattacks. [Abnormal Security](https://abnormalsecurity.com/) , a cloud-native email security provider, knew\n", + "\n", + "that scalability would become a major focus to stay ahead of attack strategies with frequent product updates.\n", + "\n", + "Abnormal also required a data analytics infrastructure robust enough to meet the scale requirements for its data\n", + "\n", + "pipelines and constantly refined ML models.\n", + "\n", + "“We were spending too much time managing our Spark infrastructure,” said Carlos Gasperi, Software Engineer at\n", + "\n", + "Abnormal Security. 
“What we needed to be doing with that time was building the pipelines that would make the\n", + "\n", + "product better.”\n", + "\n", + "The company implemented the Databricks Lakehouse Platform, which simplified its data architecture and\n", + "\n", + "maximized the performance of data pipelines and analytics. Data practitioners are now able to ingest data\n", + "\n", + "directly from S3 and query it in near real-time with the help of Delta Lake, an open-format storage layer that\n", + "\n", + "delivers reliability, security and performance on the data lake for both streaming and batch operations. With\n", + "\n", + "Databricks SQL, data scientists are then able to create visualizations using rich dashboards to drive product\n", + "\n", + "decisions and improve detection efficacy.\n", + "\n", + "Databricks also provided the collaborative environment that Abnormal’s data teams needed to increase their\n", + "\n", + "productivity and work in the same space without constantly competing for compute resources.\n", + "\n", + "With Databricks, Abnormal has seen a 20% reduction in successful email attacks, a 40% reduction in\n", + "\n", + "infrastructure costs and a 30% increase in productivity. [Read the full story here.](https://www.databricks.com/customers/abnormal)\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
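Returning to the Delta Live Tables approach described above, a minimal declarative pipeline might look like the following sketch; the source path, table names and expectation rules are illustrative, and the file would be attached to a DLT pipeline rather than run interactively.

```python
# A short Delta Live Tables sketch of the declarative approach described above.
# Source path, table names and expectation rules are illustrative.
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw events ingested incrementally with Auto Loader")
def events_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/Volumes/main/landing/events/")
    )

@dlt.table(comment="Cleaned events with basic quality gates")
@dlt.expect_or_drop("valid_event_id", "event_id IS NOT NULL")
@dlt.expect("recent_timestamp", "event_ts >= '2020-01-01'")
def events_silver():
    return (
        dlt.read_stream("events_bronze")
        .withColumn("ingested_at", F.current_timestamp())
    )
```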
Delta Live Tables Enhanced Autoscaling is designed to handle streaming workloads that trigger intermittently and are unpredictable. It optimizes cluster utilization by only scaling up to the necessary number of nodes while maintaining end-to-end SLAs, and gracefully shuts down nodes when utilization is low to avoid unnecessary idle node capacity.

Delta Live Tables helps prevent bad data from flowing into tables through validation, integrity checks and predefined error policies. In addition, you can monitor data quality trends over time to get insight into how your data is evolving and where changes may be necessary.

-----

**Step 5**
**Use Databricks SQL for serverless compute**
[Databricks SQL (DB SQL)](https://www.databricks.com/product/databricks-sql) is a serverless data warehouse on the Lakehouse Platform for running your SQL and BI applications at scale with up to 12x better price/performance. It’s imperative for younger, growing companies to reduce resource contention, and one way to accomplish that is with serverless compute. Running serverless removes the need to manage, configure or scale cloud infrastructure on the lakehouse, freeing up your data team for what they do best.

See for yourself in this tutorial on [how to run and visualize a query in Databricks SQL](https://docs.databricks.com/sql/get-started/user-quickstart.html) and create dashboards on data stored in your data lake.

The Databricks SQL REST API supports services to manage queries and dashboards, query history and SQL warehouses.

Databricks SQL warehouses provide instant, elastic SQL compute — decoupled from storage — and will automatically scale to provide unlimited concurrency without disruption, for high concurrency use cases. DB SQL has data governance and security built in.
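One way to run a query against a SQL warehouse programmatically is the SQL Statement Execution API, sketched here through the Databricks Python SDK; the warehouse ID, table name and query are placeholders, and the exact response fields may vary by SDK version.

```python
# A rough sketch of running a query on a serverless SQL warehouse via the
# SQL Statement Execution API, using the Databricks Python SDK.
# The warehouse id, table name and query are placeholders.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # picks up credentials from the environment / CLI profile

resp = w.statement_execution.execute_statement(
    warehouse_id="1234567890abcdef",
    statement=(
        "SELECT order_date, sum(order_total) AS revenue "
        "FROM main.gold.orders GROUP BY order_date ORDER BY order_date"
    ),
    wait_timeout="30s",
)

for row in resp.result.data_array or []:
    print(row)
```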
Handle\n", + "high concurrency with fully managed load balancing\n", + "and scaling of compute resources.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Faster queries with Photon**\n", + "[Photon](https://www.databricks.com/product/photon) is a new vectorized query engine designed\n", + "to deliver dramatic infrastructure cost savings and\n", + "accelerate all data and analytics workloads: data\n", + "ingestion, ETL, streaming, interactive queries, data\n", + "science and machine learning.\n", + "\n", + "Photon is used by default in Databricks SQL. To\n", + "enable Photon acceleration, select the **Use Photon**\n", + "**Acceleration** checkbox when you create the cluster.\n", + "If you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\n", + "set runtime_engine to PHOTON.\n", + "\n", + "Photon supports a number of instance types on\n", + "the driver and worker nodes. Photon instance types\n", + "consume DBUs at a different rate than the same\n", + "instance type running the non-Photon runtime. For\n", + "more information about Photon instances and DBU\n", + "consumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
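A hedged sketch of the clusters API call mentioned above, using the Databricks Python SDK: the node type, Spark version and autoscale sizing are placeholders, and helper and field names may differ slightly between SDK versions.

```python
# A rough sketch of creating a Photon-enabled cluster through the Clusters API,
# here via the Databricks Python SDK. Node type, Spark version and sizing
# are placeholders.
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import compute

w = WorkspaceClient()

created = w.clusters.create(
    cluster_name="photon-etl-cluster",
    spark_version="14.3.x-scala2.12",
    node_type_id="i3.xlarge",
    autoscale=compute.AutoScale(min_workers=2, max_workers=8),
    runtime_engine=compute.RuntimeEngine.PHOTON,  # the setting called out above
    autotermination_minutes=60,
).result()  # block until the cluster is running

print(created.cluster_id)
```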
5c332356b67f14614107f85b62a638e4Photon is used by default in Databricks SQL. To\n", + "enable Photon acceleration, select the **Use Photon**\n", + "**Acceleration** checkbox when you create the cluster.\n", + "If you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\n", + "set runtime_engine to PHOTON.\n", + "\n", + "Photon supports a number of instance types on\n", + "the driver and worker nodes. Photon instance types\n", + "consume DBUs at a different rate than the same\n", + "instance type running the non-Photon runtime. For\n", + "more information about Photon instances and DBU\n", + "consumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)\n", + "\n", + "Photon will seamlessly coordinate work and resources\n", + "and transparently accelerate portions of your SQL and\n", + "Spark queries. No tuning or user intervention required.\n", + "Photon is compatible with Apache Spark APIs, so\n", + "getting started is as easy as turning it on — no code\n", + "change and no lock- in. Written entirely in C++, Photon\n", + "provides an additional [2x speedup over Apache Spark](https://www.databricks.com/product/photon)\n", + "per the TPC-DS 1TB benchmark, and customers have\n", + "observed 3x–8x speedups on average.\n", + "\n", + "\n", + "With Photon, typical customers are seeing up to [80% TCO savings](https://www.databricks.com/blog/2022/08/03/announcing-photon-engine-general-availability-on-the-databricks-lakehouse-platform.html#:~:text=Up%20to%2080%25%20TCO%20cost%20savings%20%2830%25%20on,Photon%203-8x%20faster%20queries%20on%20interactive%20SQL%20workloads) over traditional\n", + "Databricks Runtime (Apache Spark) and up to 85% reduction in VM compute hours.\n", + "\n", + "Learn how to connect BI tools to Databricks SQL\n", + "compute resources with the following user guides:\n", + "\n", + "\n", + "[Queries](https://docs.databricks.com/sql/user/queries/index.html)\n", + "\n", + "[Visualizations](https://docs.databricks.com/sql/user/visualizations/index.html)\n", + "\n", + "\n", + "[Favorites and tags](https://docs.databricks.com/sql/user/favorites-tags.html)\n", + "\n", + "[Workspace browser](https://docs.databricks.com/sql/user/workspace-browser/index.html)\n", + "\n", + "\n", + "[Dashboards](https://docs.databricks.com/sql/user/dashboards/index.html)\n", + "\n", + "[Alerts](https://docs.databricks.com/sql/user/alerts/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 6**\n", + "**Orchestrate workflows**\n", + "Databricks provides a comprehensive suite of tools and integrations to support your\n", + "data processing workflows.\n", + "\n", + "Databricks [Workflows](https://www.databricks.com/product/workflows) removes operational overhead by offering fully managed\n", + "orchestration service for all your teams, so you can focus on your workflows, not on\n", + "managing your infrastructure. Orchestrate diverse workloads for the full lifecycle\n", + "including Delta Live Tables, [Jobs](https://docs.databricks.com/workflows/index.html) for SQL, [Spark](https://www.databricks.com/product/spark) , notebooks, dbt, ML models and more.\n", + "\n", + "Here’s a tutorial on how to [create your first workflow with a Databricks job](https://docs.databricks.com/workflows/jobs/jobs-quickstart.html) . 
You will learn how to create notebooks, create and run a job, view the run details, and run jobs with different parameters.

-----
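As a rough programmatic counterpart to the Step 6 tutorial, the same kind of job can be defined with the Databricks Python SDK; the notebook path and cluster ID are placeholders for resources that already exist in your workspace.

```python
# A minimal sketch of creating a Workflows job with the Databricks Python SDK.
# The notebook path and cluster id are placeholders.
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import jobs

w = WorkspaceClient()

job = w.jobs.create(
    name="nightly-etl",
    tasks=[
        jobs.Task(
            task_key="ingest_and_transform",
            existing_cluster_id="0123-456789-abcde123",  # placeholder cluster
            notebook_task=jobs.NotebookTask(
                notebook_path="/Workspace/Repos/data/etl/ingest_notebook",
                base_parameters={"run_date": ""},
            ),
        )
    ],
)

# Trigger a run with different parameters, as the tutorial does from the UI.
run = w.jobs.run_now(
    job_id=job.job_id, notebook_params={"run_date": "2024-01-01"}
).result()
print(run.state.result_state)
```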
**Step 7**
**Run an end-to-end analytics pipeline**
This is where you can see how everything works together to run efficiently at scale. First take the quickstart: [Running end-to-end lakehouse analytics pipelines](https://docs.databricks.com/getting-started/lakehouse-e2e.html), where you will write to and read data from an external location managed by Unity Catalog and configure Auto Loader to ingest data to Unity Catalog.

###### Resources:

- [Databricks Lakehouse free trial](https://www.databricks.com/try-databricks?itm_data=DataLakehouse-HeroCTA-Trial#account)
- [The Lakehouse for companies born in the cloud](https://www.databricks.com/solutions/audience/digital-native)
- [How DuPont achieved 11x latency reduction and 4x cost reduction with Photon](https://www.databricks.com/blog/2022/10/04/how-dupont-achieved-11x-latency-reduction-and-4x-cost-reduction-photon.html)
- [Apache Spark on Databricks](https://docs.databricks.com/spark/index.html)
- [Discover Lakehouse solutions](https://www.databricks.com/solutions)
- [Databricks documentation](https://docs.databricks.com/)

###### “Databricks Workflows allows our analysts to easily create, run, monitor and repair data pipelines without managing any infrastructure. This enables them to have full autonomy in designing and improving ETL processes that produce must-have insights for our clients. We are excited to move our Airflow pipelines over to Databricks Workflows.”
— Anup Segu, Senior Software Engineer, YipitData

[Learn more.](https://www.databricks.com/customers/yipitdata)

-----

# 03
### CHALLENGE 03: Building effective machine-learning operations

Growing startups and digital native companies face several challenges when they start building, maintaining and scaling machine learning operations (MLOps) for their data science teams.

MLOps is different from DevOps.
DevOps practices\n", + "and tooling alone are insufficient because ML\n", + "applications rely on an assortment of artifacts (e.g.,\n", + "models, data, code) that can each require different\n", + "methods of experiment tracking, model training,\n", + "feature development, governance, feature and\n", + "model serving.\n", + "\n", + "For data teams beginning their machine learning\n", + "journeys, the challenge of training data models can\n", + "be labor-intensive and not cost-effective because\n", + "the data has to be converted into features and\n", + "\n", + "trained on a separate machine learning platform\n", + "\n", + "\n", + "Data teams often perform development in\n", + "disjointed, siloed stacks spanning DataOps,\n", + "ModelOps and DevOps\n", + "\n", + "Development and training environment\n", + "disconnect. Moving code and data between\n", + "personal development environments and\n", + "machine learning platforms for model training\n", + "at scale is error prone and cumbersome. The\n", + "“it worked on my machine” problem.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
f75cf4ba1fcfd8ce9fced708407dd9ca```\n", + "CHALLENGE 03\n", + "\n", + "### Building effective machine-learning operations\n", + "\n", + "```\n", + "Growing startups and digital native companies face several challenges when they\n", + "start building, maintaining and scaling machine learning operations (MLOps) for their\n", + "data science teams.\n", + "\n", + "\n", + "MLOps is different from DevOps. DevOps practices\n", + "and tooling alone are insufficient because ML\n", + "applications rely on an assortment of artifacts (e.g.,\n", + "models, data, code) that can each require different\n", + "methods of experiment tracking, model training,\n", + "feature development, governance, feature and\n", + "model serving.\n", + "\n", + "For data teams beginning their machine learning\n", + "journeys, the challenge of training data models can\n", + "be labor-intensive and not cost-effective because\n", + "the data has to be converted into features and\n", + "\n", + "trained on a separate machine learning platform\n", + "\n", + "\n", + "Data teams often perform development in\n", + "disjointed, siloed stacks spanning DataOps,\n", + "ModelOps and DevOps\n", + "\n", + "Development and training environment\n", + "disconnect. Moving code and data between\n", + "personal development environments and\n", + "machine learning platforms for model training\n", + "at scale is error prone and cumbersome. The\n", + "“it worked on my machine” problem.\n", + "\n", + "Gathering high-quality data. Data that is siloed\n", + "across the organization is hard to discover,\n", + "collect, clean and use. This leads to stale data\n", + "and delays in development of models.\n", + "\n", + "\n", + "See **Create a unified data architecture.**\n", + "```\n", + " 03\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Siloed stacks spanning DataOps, ModelOps and DevOps\n", + "\n", + "When data engineers help ingest, refine and prep\n", + "data, they do so on their own stack. This data has\n", + "to be converted into features and then trained on\n", + "a separate machine learning platform. This cross-\n", + "platform handoff often results in data staleness,\n", + "difficulty in maintaining versions, and eventually,\n", + "poorly performing models. Even after you have\n", + "trained your model, you have to deal with yet another\n", + "tech stack for model deployment. It’s challenging\n", + "to serve features in real time and difficult to trace\n", + "problems in production back to the data.\n", + "\n", + "The downstream business impact is massive —\n", + "longer and more expensive projects, and lower\n", + "model accuracy in production leading to declining\n", + "business metrics.\n", + "\n", + "If you are looking at launching or scaling your\n", + "MLOps, you should probably focus on an incremental\n", + "strategy. At Databricks, we see firsthand how\n", + "customers develop their MLOps approaches across\n", + "a huge variety of teams and businesses. [Check out](https://www.youtube.com/watch?v=JApPzAnbfPI)\n", + "[this Data +AI Summit session](https://www.youtube.com/watch?v=JApPzAnbfPI) to learn more about\n", + "building robust MLOps practices.\n", + "\n", + "\n", + "###### Databricks solution:\n", + "\n", + "Databricks Machine Learning is an integrated\n", + "end-to-end machine learning environment\n", + "incorporating managed services for experiment\n", + "tracking, model training, feature development and\n", + "management, and model serving. 
The capabilities\n", + "of Databricks map directly to the steps of model\n", + "development and deployment. With Databricks\n", + "Machine Learning, you can:\n", + "\n", + "\n", + "Train models either manually or with AutoML\n", + "\n", + "Track training parameters and models using\n", + "experiments with MLflow tracking\n", + "\n", + "Create feature tables and access them for model\n", + "training and inference\n", + "\n", + "Share, manage and serve models using MLflow\n", + "Model Registry\n", + "\n", + "Deploy models for Serverless Real-time Inference\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Use MLOps on the Databricks Lakehouse Platform\n", + "\n", + "To gain efficiencies and reduce costs, many smaller\n", + "digital companies are employing machine learning\n", + "operations. MLOps is a set of processes and\n", + "automation for managing models, data and code, and\n", + "unique library dependencies to improve performance\n", + "stability and long-term efficiency in ML systems.\n", + "\n", + "To describe it simply, MLOps = ModelOps + DataOps +\n", + "DevOps. The aim of MLOps is to improve the long-term\n", + "performance, stability and success rate of ML systems\n", + "while maximizing the efficiency of the teams who\n", + "build them.\n", + "\n", + "\n", + "Not only does MLOps improve organizational efficiency,\n", + "it also allows the models to iterate faster and react\n", + "to real-life changes in the data. This ability separates\n", + "companies that can grow to meet their customer’s\n", + "challenges in a reactive manner versus those that will\n", + "spend significant time on data updates/processes and\n", + "miss the opportunity to do something with\n", + "their models.\n", + "\n", + "The absence of MLOps is typically marked by an\n", + "overabundance of manual processes which are slowerSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
49385fba85744ca2e26f86c0a4b6ffd8Deploy models for Serverless Real-time Inference\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Use MLOps on the Databricks Lakehouse Platform\n", + "\n", + "To gain efficiencies and reduce costs, many smaller\n", + "digital companies are employing machine learning\n", + "operations. MLOps is a set of processes and\n", + "automation for managing models, data and code, and\n", + "unique library dependencies to improve performance\n", + "stability and long-term efficiency in ML systems.\n", + "\n", + "To describe it simply, MLOps = ModelOps + DataOps +\n", + "DevOps. The aim of MLOps is to improve the long-term\n", + "performance, stability and success rate of ML systems\n", + "while maximizing the efficiency of the teams who\n", + "build them.\n", + "\n", + "\n", + "Not only does MLOps improve organizational efficiency,\n", + "it also allows the models to iterate faster and react\n", + "to real-life changes in the data. This ability separates\n", + "companies that can grow to meet their customer’s\n", + "challenges in a reactive manner versus those that will\n", + "spend significant time on data updates/processes and\n", + "miss the opportunity to do something with\n", + "their models.\n", + "\n", + "The absence of MLOps is typically marked by an\n", + "overabundance of manual processes which are slower\n", + "\n", + "\n", + "and more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck,\n", + "capping the ability for a data team to take on new projects. The process is complex. In larger organizations,\n", + "several specialists and stakeholders can be involved in one ML project. But data practitioners at smaller digital\n", + "natives and high-growth startups may be forced to wear several hats.\n", + "\n", + "\n", + "-----\n", + "\n", + "And once an ML project goes into production, the\n", + "MLOps continues, since the models, data and code\n", + "change over time due to regulatory and business\n", + "requirements. But the ML system must be resilient and\n", + "flexible. Addressing these challenges with a defined\n", + "MLOps strategy can dramatically reduce the iteration\n", + "cycle of delivering models to production.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Steps in machine learning model development and deployment:\n", + "\n", + "\n", + "**Step 1**\n", + "**Data preparation**\n", + "Manually preparing and labeling data is a thankless,\n", + "time-consuming job. With Databricks, teams can\n", + "label data with human effort, machine learning\n", + "models in Databricks, or a combination of both.\n", + "Teams can also employ a [model-assisted labeling](https://labelbox.com/product/automation )\n", + "workflow that allows humans to easily inspect and\n", + "correct a model’s predicted labels. This process can\n", + "drastically reduce the amount of unstructured data\n", + "you need to achieve strong model performance.\n", + "\n", + "The [Databricks Runtime for Machine Learning](https://docs.databricks.com/runtime/mlruntime.html) is a\n", + "ready-to-go environment with many external\n", + "libraries, including TensorFlow, PyTorch, Horovod,\n", + "scikit-learn and XGBoost. 
It provides\n", + "extensions to improve performance, including GPU\n", + "acceleration in XGBoost, distributed deep\n", + "learning using HorovodRunner, and model\n", + "checkpointing.\n", + "\n", + "To use Databricks Runtime ML, select the ML version\n", + "of the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\n", + "access data in Unity Catalog for machine learning\n", + "workflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\n", + "isolation clusters are not compatible with Databricks\n", + "Runtime for Machine Learning.\n", + "\n", + "\n", + "Machine learning applications often\n", + "need to use shared storage for data\n", + "loading and model checkpointing. You\n", + "can load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\n", + "files. A table is a collection of\n", + "structured data stored as a directory\n", + "on cloud object storage.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
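A small sketch of the “load tabular data from tables” step above, assuming a Databricks Runtime ML cluster: read a Unity Catalog table into pandas and prepare a train/test split; the table and column names are placeholders.

```python
# Read a Unity Catalog table on a Databricks Runtime ML cluster and prepare a
# train/test split. Table and column names are placeholders.
from sklearn.model_selection import train_test_split

# Tables are directories of files in cloud object storage under the hood,
# so this works the same for managed and external tables.
features_pdf = spark.table("main.ml.churn_features").toPandas()

X = features_pdf.drop(columns=["customer_id", "churned"])
y = features_pdf["churned"]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42, stratify=y
)
print(f"train rows: {len(X_train)}, test rows: {len(X_test)}")
```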
04ad6067c8620647b09f29ae19400f5aTo use Databricks Runtime ML, select the ML version\n", + "of the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\n", + "access data in Unity Catalog for machine learning\n", + "workflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\n", + "isolation clusters are not compatible with Databricks\n", + "Runtime for Machine Learning.\n", + "\n", + "\n", + "Machine learning applications often\n", + "need to use shared storage for data\n", + "loading and model checkpointing. You\n", + "can load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\n", + "files. A table is a collection of\n", + "structured data stored as a directory\n", + "on cloud object storage.\n", + "\n", + "For [data preprocessing](https://docs.databricks.com/machine-learning/preprocess-data/index.html) , you can\n", + "use [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) to create\n", + "new features, explore and reuse\n", + "existing features, track lineage and\n", + "feature creation code, and publish\n", + "features to low-latency online stores\n", + "for real-time inference. The Feature\n", + "Store is a centralized repository\n", + "that enables data scientists to find\n", + "and share features. It ensures that\n", + "the same code used to compute\n", + "the feature values is used for model\n", + "training and inference. The Feature\n", + "Store library is available only on\n", + "Databricks Runtime for Machine\n", + "Learning and is accessible through\n", + "Databricks notebooks and workflows.\n", + "\n", + "\n", + "###### Resources:\n", + "\n", + "- [The Comprehensive Guide to Feature Stores](https://www.databricks.com/resources/ebook/the-comprehensive-guide-to-feature-stores)\n", + "\n", + "- [Load data for machine learning and deep learning](https://docs.databricks.com/machine-learning/load-data/index.html)\n", + "\n", + "- [Preprocess data for machine learning and](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n", + "[deep learning](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "C `USTOMER STORY: ZIPLINE`\n", + "\n", + "### Data-driven drones deliver lifesaving medical aid around the world\n", + "\n", + "\n", + "Automated logistics and delivery system\n", + "\n", + "provider [Zipline](https://www.flyzipline.com/ ) is redefining logistics by using\n", + "\n", + "cutting-edge drone technology and a global\n", + "\n", + "autonomous logistics network to save lives\n", + "\n", + "\n", + "information they need to accurately measure success, find\n", + "\n", + "the metrics that relate to customer experiences or logistics,\n", + "\n", + "and improve on them exponentially as more data is ingested\n", + "\n", + "and machine learning models are refined.\n", + "\n", + "\n", + "by giving remote communities access to\n", + "\n", + "\n", + "emergency and preparatory medical aid and\n", + "\n", + "resources, regardless of where they are in the\n", + "\n", + "world.\n", + "\n", + "Doing so requires the ability to ingest and\n", + "\n", + "analyze huge chunks of time series data in real\n", + "\n", + "time. 
This data is produced every time a drone\n", + "\n", + "takes flight and includes performance data,\n", + "\n", + "in-flight battery management, regional weather\n", + "\n", + "patterns, geographic obstacles, landing errors\n", + "\n", + "and a litany of other information that must be\n", + "\n", + "processed.\n", + "\n", + "\n", + "“About 30% of the deliveries we do are lifesaving emergency\n", + "\n", + "deliveries, where the product being delivered does not exist\n", + "\n", + "at the hospital. We have to be fast, and we have to be able\n", + "\n", + "to rely on all the different kinds of data to predict failures\n", + "\n", + "before they occur so that we can guarantee a really, really\n", + "\n", + "high service level to the people who are literally depending\n", + "\n", + "on us with their lives,” said Zipline CEO Keller Rinaudo.\n", + "\n", + "“Databricks gives us confidence in our operations, and\n", + "\n", + "enables us to continuously improve our technology, expand\n", + "\n", + "our impact, and provide lifesaving aid where and when it’s\n", + "\n", + "needed, every single day.”SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
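Returning to the Feature Store workflow described before the Zipline story, a rough sketch of creating and refreshing a feature table might look like this; the catalog, schema and column names are placeholders, and newer runtimes expose an equivalent FeatureEngineeringClient in databricks.feature_engineering.

```python
# A rough sketch of the Feature Store workflow described above: create a
# feature table, then refresh it on later runs. Names are placeholders; the
# client is available on Databricks Runtime for ML.
from databricks.feature_store import FeatureStoreClient
from pyspark.sql import functions as F

fs = FeatureStoreClient()

customer_features = (
    spark.table("main.silver.orders")
    .groupBy("customer_id")
    .agg(
        F.count("*").alias("order_count_90d"),
        F.avg("order_total").alias("avg_order_value"),
    )
)

# Creates the feature table the first time (errors if it already exists).
fs.create_table(
    name="main.ml.customer_features",
    primary_keys=["customer_id"],
    df=customer_features,
    description="Aggregated order behaviour used for churn models",
)

# Later runs can refresh the same table.
fs.write_table(name="main.ml.customer_features", df=customer_features, mode="merge")
```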
103fa9f67351003f0db724bd575fe49eanalyze huge chunks of time series data in real\n", + "\n", + "time. This data is produced every time a drone\n", + "\n", + "takes flight and includes performance data,\n", + "\n", + "in-flight battery management, regional weather\n", + "\n", + "patterns, geographic obstacles, landing errors\n", + "\n", + "and a litany of other information that must be\n", + "\n", + "processed.\n", + "\n", + "\n", + "“About 30% of the deliveries we do are lifesaving emergency\n", + "\n", + "deliveries, where the product being delivered does not exist\n", + "\n", + "at the hospital. We have to be fast, and we have to be able\n", + "\n", + "to rely on all the different kinds of data to predict failures\n", + "\n", + "before they occur so that we can guarantee a really, really\n", + "\n", + "high service level to the people who are literally depending\n", + "\n", + "on us with their lives,” said Zipline CEO Keller Rinaudo.\n", + "\n", + "“Databricks gives us confidence in our operations, and\n", + "\n", + "enables us to continuously improve our technology, expand\n", + "\n", + "our impact, and provide lifesaving aid where and when it’s\n", + "\n", + "needed, every single day.”\n", + "\n", + "[Read full story here.](https://www.databricks.com/customers/zipline)\n", + "\n", + "\n", + "Every Zipline flight generates a gigabyte of data\n", + "\n", + "with potential life-or-death consequences,\n", + "\n", + "but accessing and federating the data for both\n", + "\n", + "internal and external decision-making was\n", + "\n", + "challenging. With Databricks as the common\n", + "\n", + "platform, Zipline’s data team can access all the\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 2**\n", + "**Model training**\n", + "For training machine learning and deep learning\n", + "models, you can use [AutoML](https://docs.databricks.com/machine-learning/automl/index.html) , which automatically\n", + "prepares a data set for model training, performs a set\n", + "of trials using open-source libraries such as scikit-learn\n", + "and XGBoost, and creates a Python notebook with\n", + "the source code for each trial run so you can review,\n", + "reproduce and modify the code.\n", + "\n", + "In Databricks, [notebooks](https://docs.databricks.com/notebooks/index.html) are the primary tool for\n", + "creating data science and machine learning workflows\n", + "and collaborating with colleagues. 
Databricks\n", + "notebooks provide real-time coauthoring in multiple\n", + "languages, automatic versioning and built-in data\n", + "visualizations.\n", + "\n", + "\n", + "###### Resources:\n", + "\n", + "- [Model training examples](https://docs.databricks.com/machine-learning/train-model/index.html)\n", + "\n", + "- [Training models with Feature Store](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n", + "\n", + "- [Best practices for deep learning on Databricks](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n", + "\n", + "- [Machine learning quickstart notebook](https://docs.databricks.com/machine-learning/train-model/ml-quickstart.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n", + "\n", + "- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n", + "\n", + "- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n", + "\n", + "- [Track ML Model training data with Delta Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n", + "\n", + "- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
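As a minimal sketch of the AutoML flow in Step 2, you can point AutoML at a training table and let it generate trial notebooks and MLflow runs; the dataset and target column are placeholders.

```python
# Point AutoML at a training table; each trial produces a generated notebook
# and an MLflow run you can review, reproduce and modify.
# Dataset and target column are placeholders.
from databricks import automl

train_df = spark.table("main.ml.churn_features")

summary = automl.classify(
    dataset=train_df,
    target_col="churned",
    timeout_minutes=30,
)

print(summary.best_trial.notebook_path)
print(summary.best_trial.mlflow_run_id)
```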
211ac77249d86b077653e7b60ecf7232-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n", + "\n", + "- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n", + "\n", + "- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n", + "\n", + "- [Track ML Model training data with Delta Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n", + "\n", + "- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)\n", + "\n", + "\n", + "**Step 3**\n", + "**Track model development**\n", + "The model development process is iterative, and can\n", + "be challenging. You can use [MLflow tracking](https://mlflow.org/docs/latest/tracking.html) to help\n", + "you keep track of the model development process,\n", + "including parameter settings or combinations you have\n", + "tried and how they affected the model’s performance.\n", + "\n", + "MLflow tracking uses experiments and runs to log\n", + "and track your model development. A run is a single\n", + "execution of model code. An experiment is a collection\n", + "of related runs. Within an experiment, you can compare\n", + "and filter runs to understand how your model performs\n", + "and how its performance depends on the parameter\n", + "settings, input data, etc.\n", + "\n", + "MLflow can automatically log training code written\n", + "in many ML frameworks. This is the easiest way to\n", + "get started using MLflow tracking. With MLflow’s\n", + "autologging capabilities, a single line of code\n", + "automatically logs the resulting model.\n", + "\n", + "\n", + "A hosted version of MLflow Model Registry can help\n", + "[manage the full lifecycle](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) of MLflow models. You can\n", + "apply webhooks to automatically trigger actions based\n", + "on registry events. For example, you can trigger CI\n", + "builds when a new model version is created or notify\n", + "your team members through Slack each time a model\n", + "transition to production is requested. This promotes\n", + "a traceable version control work process. You can\n", + "leverage this feature for web traffic A/B testing and\n", + "funneled to versions of deployed models for more\n", + "precise population studies.\n", + "\n", + "\n", + "**Step 4**\n", + "**Deploy machine learning models**\n", + "You can use MLflow to deploy models for batch or\n", + "streaming inference or to set up a REST endpoint to\n", + "serve the model. Simplify your model deployment by\n", + "registering models to [the MLflow Model Registry](https://docs.databricks.com/mlflow/model-registry.html) . After\n", + "you have registered your model, you can [automatically](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb)\n", + "[generate a notebook](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb) for batch inference or configure\n", + "the model for online serving with Serverless RealTime Inference or [Classic MLflow Model Serving on](https://docs.databricks.com/archive/classic-model-serving/model-serving.html)\n", + "\n", + "[Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html) . 
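As an illustration of the one-line autologging described above, the following sketch logs a scikit-learn training run with MLflow and registers the resulting model. The model name is a placeholder and the training data is a toy dataset.

```python
import mlflow
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor

# One line enables autologging: parameters, metrics and the fitted model
# are captured automatically for supported frameworks such as scikit-learn.
mlflow.autolog()

X, y = load_diabetes(return_X_y=True)

with mlflow.start_run() as run:
    model = RandomForestRegressor(n_estimators=100, max_depth=6)
    model.fit(X, y)

# Register the autologged model in the Model Registry (placeholder model name).
model_uri = f"runs:/{run.info.run_id}/model"
mlflow.register_model(model_uri, "startup_demo_model")
```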
For model inference for deep learning\n", + "applications, Databricks recommends the following\n", + "workflow.\n", + "\n", + "To debug and tune model inference on Databricks,\n", + "using GPUs (graphics processing units) can efficiently\n", + "optimize the running speed for model inference. As\n", + "GPUs and other accelerators become faster, it is\n", + "important that the data input pipeline keep up with\n", + "demand. The data input pipeline reads the data into\n", + "Spark DataFrames, transforms it and loads it as the\n", + "input for model inference.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: ITERABLE\n", + "\n", + "### Optimizing touch points across the entire customer journey\n", + "\n", + "```\n", + "“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meetSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
5ad7d5b602b0286dc3f06bfc52998475[Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html) . For model inference for deep learning\n", + "applications, Databricks recommends the following\n", + "workflow.\n", + "\n", + "To debug and tune model inference on Databricks,\n", + "using GPUs (graphics processing units) can efficiently\n", + "optimize the running speed for model inference. As\n", + "GPUs and other accelerators become faster, it is\n", + "important that the data input pipeline keep up with\n", + "demand. The data input pipeline reads the data into\n", + "Spark DataFrames, transforms it and loads it as the\n", + "input for model inference.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: ITERABLE\n", + "\n", + "### Optimizing touch points across the entire customer journey\n", + "\n", + "```\n", + "“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meet\n", + "\n", + "rising consumer demands for more personalized experiences that drive revenue and results.” —Sinéad Cheung,\n", + "\n", + "Principal Product Manager, [Iterable](https://iterable.com/)\n", + "\n", + "Captivating an audience and understanding customer journeys are essential to creating deeper brand- customer\n", + "\n", + "connections that drive growth, loyalty and revenue. From helping medical practitioners build trust with new\n", + "\n", + "patients to ensuring that food delivery users feel connected to their culinary community, Iterable helps more\n", + "\n", + "than 1,000 brands optimize and humanize their marketing in today’s competitive landscape.\n", + "\n", + "This need to build personalized and automated customer experiences for its clients drove the company to find a\n", + "\n", + "fully managed platform that would simplify infrastructure management, make collaboration possible, and give it\n", + "\n", + "the ability to scale for analytics and AI.\n", + "\n", + "With Databricks Lakehouse, Iterable can harness diverse, complex data sets — including conversion events,\n", + "\n", + "unique user labels, engagement patterns and business insights — and facilitate rapid prototyping of machine\n", + "\n", + "learning models that deliver top-notch and personalized user experiences for higher-converting marketing\n", + "\n", + "campaigns. [Read the full story here.](https://www.databricks.com/customers/iterable)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### ML Stages\n", + "\n", + "ML workflows include the following key assets: code,\n", + "models and data. These assets need to be developed\n", + "(dev), tested (staging) and deployed (production).\n", + "Each stage needs to operate within an execution\n", + "environment. So the execution environments, code,\n", + "models and data are divided into dev, staging and\n", + "production.\n", + "\n", + "ML project code is often stored in a version control\n", + "repository (such as Git), with most organizations using\n", + "branches corresponding to the lifecycle phases of\n", + "development, staging or production.\n", + "\n", + "Since model lifecycles do not correspond one-toone with code lifecycles, it makes sense for model\n", + "management to have its own service. MLflow and its\n", + "Model Registry support managing model artifacts\n", + "directly via UI and APIs. 
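One common way to implement the batch inference pattern described above is to wrap a registered MLflow model as a Spark UDF and apply it to a DataFrame, letting Spark distribute the scoring. The model name, stage and table names below are placeholders.

```python
import mlflow
from pyspark.sql.functions import struct

# Placeholder names: a registered model and a feature table in your workspace.
# `spark` is the ambient SparkSession in a Databricks notebook.
model_uri = "models:/startup_demo_model/Production"
features_df = spark.table("main.demo.inference_features")

# Wrap the MLflow model as a Spark UDF so inference is distributed across the cluster.
predict_udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri, result_type="double")

# Score every row and publish the predictions to a downstream table.
feature_cols = [c for c in features_df.columns if c != "customer_id"]
predictions = features_df.withColumn("prediction", predict_udf(struct(*feature_cols)))

predictions.write.mode("overwrite").saveAsTable("main.demo.inference_results")
```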
The loose coupling of model\n", + "artifacts and code provides flexibility to update\n", + "production models without code changes, streamlining\n", + "the deployment process in many cases.\n", + "\n", + "Databricks recommends creating separate\n", + "environments for the different stages of ML code and\n", + "model development with clearly defined transitions\n", + "between stages. The recommended MLOps workflow is\n", + "broken into these three stages:\n", + "\n", + "\n", + "[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\n", + "is experimentation. Data scientists develop features\n", + "and models and run experiments to optimize model\n", + "performance. The output of the development process is\n", + "ML pipeline code that can include feature computation,\n", + "model training inference and monitoring\n", + "\n", + "\n", + "-----\n", + "\n", + "[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\n", + "This stage focuses on testing the ML pipeline code\n", + "for production readiness, including code for model\n", + "training as well as feature engineering pipelines and\n", + "inference code. The output of the staging process is a\n", + "release branch that triggers the CI/CD system to start\n", + "the production stage.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
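Because model artifacts are decoupled from code, promoting a new model version is a registry operation rather than a code change. A minimal sketch with the MLflow client API follows; the model name and version are placeholders, and in practice the transition would be gated by the staging tests and CI/CD checks described here.

```python
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Placeholder model name/version; in practice these come from your CI/CD pipeline.
model_name = "startup_demo_model"
new_version = 3

# Promote the validated version to Staging, archiving whatever was there before.
client.transition_model_version_stage(
    name=model_name,
    version=new_version,
    stage="Staging",
    archive_existing_versions=True,
)

# After staging tests pass, the same call with stage="Production" promotes it further.
```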
0929758b2300251054152ee60e8367e6Databricks recommends creating separate\n", + "environments for the different stages of ML code and\n", + "model development with clearly defined transitions\n", + "between stages. The recommended MLOps workflow is\n", + "broken into these three stages:\n", + "\n", + "\n", + "[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\n", + "is experimentation. Data scientists develop features\n", + "and models and run experiments to optimize model\n", + "performance. The output of the development process is\n", + "ML pipeline code that can include feature computation,\n", + "model training inference and monitoring\n", + "\n", + "\n", + "-----\n", + "\n", + "[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\n", + "This stage focuses on testing the ML pipeline code\n", + "for production readiness, including code for model\n", + "training as well as feature engineering pipelines and\n", + "inference code. The output of the staging process is a\n", + "release branch that triggers the CI/CD system to start\n", + "the production stage.\n", + "\n", + "\n", + "-----\n", + "\n", + "[Production](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#production-stage)\n", + "ML engineers own the production environment\n", + "where ML pipelines are deployed. These pipelines\n", + "compute fresh feature values, train and test new model\n", + "versions, publish predictions to downstream tables\n", + "or applications, and monitor the entire process to\n", + "avoid performance degradation and instability. Data\n", + "scientists have visibility to test results, logs, model\n", + "artifacts and production pipeline status to allow them\n", + "to identify and diagnose problems in production.\n", + "\n", + "The Databricks Machine Learning home page provides\n", + "quick access to all the machine learning resources. To\n", + "access this page, move your mouse or pointer over\n", + "the left sidebar in the Databricks workspace. 
From\n", + "the persona switcher at the top of the sidebar, select\n", + "\n", + "Machine Learning.\n", + "\n", + "From the shortcuts menu, you can create\n", + "a [notebook](https://docs.databricks.com/notebooks/index.html) , [start AutoML](https://docs.databricks.com/machine-learning/automl/index.html) or open a [tutorial notebook](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) .\n", + "The center of the screen includes any recently viewed\n", + "items, and the sidebar provides quick access to\n", + "the [Experiments page](https://docs.databricks.com/mlflow/tracking.html#mlflow-experiments) , [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) and\n", + "[Model Registry.](https://docs.databricks.com/mlflow/model-registry.html)\n", + "New users can get started with a series of [tutorials](https://docs.databricks.com/machine-learning/tutorial/index.html)\n", + "that illustrate how to use Databricks throughout the\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [MLOps Virtual Event: Standardizing MLOps at Scale](https://www.databricks.com/p/webinar/mlops-virtual-event)\n", + "\n", + "- [Virtual Event — Automating the ML Lifecycle With](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n", + "[Databricks Machine Learning](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n", + "\n", + "- [MLOps Virtual Event “Operationalizing Machine](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n", + "[Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n", + "\n", + "- [The Big Book of MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
f1fccacf5e51dabf1c04271b9515d627- [MLOps Virtual Event “Operationalizing Machine Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)

- [The Big Book of MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)

- [Machine learning on Databricks](https://www.databricks.com/product/machine-learning)

- [Watch the demos](https://www.databricks.com/discover/demos)

ML lifecycle or access the [in-product quickstart](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) for a model-training tutorial notebook that steps through loading data, training and tuning a model, comparing and analyzing model performance and using the model for inference.

Also be sure to download the [Big Book of MLOps](https://www.databricks.com/p/thank-you/the-big-book-of-mlops) to learn how your organization can build a robust MLOps practice incrementally.

-----

# 04
```
SUMMARY:
## The Databricks Lakehouse Platform addresses these challenges
```

-----

### Summary

We’ve organized the common data challenges for startups and growing digital native businesses into three main buckets: building a **unified data architecture** , ensuring **scalability and performance** , and building effective **machine learning operations** , all with an eye on cost efficiency and increased productivity.

The Lakehouse Platform provides an efficient and scalable architecture that solves these challenges and will support your data, analytics and AI workloads now and as you scale.

With [Databricks](https://www.databricks.com/) you can unify all your data with cost-efficient architecture for highly performant digital native applications and analytic workloads — designed to scale as you grow. Use your data however and wherever you want with open-source flexibility, leverage open formats, APIs and your tools of choice. Ensure reliable, high-performing data workloads while Databricks automatically manages your infrastructure as you scale. Leverage serverless Databricks SQL to increase productivity and scale on demand with up to 12x better price/performance.

Easily access data for ML models and accelerate the full ML lifecycle from experimentation to production.

Discover more about the lakehouse for companies born in the cloud.

-----

### Get started with Databricks Trial

Get a collaborative environment for data teams to build solutions together with interactive notebooks to use Apache Spark™, SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras, scikit-learn and more. Available as a 14-day full trial in your own cloud or as a lightweight trial hosted by Databricks.

### About Databricks

Databricks is the lakehouse company. More than 7,000 organizations worldwide — including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded bySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
20ade079c6a0bfe78a147f5842812b5e-----

### Get started with Databricks Trial

Get a collaborative environment for data teams to build solutions together with interactive notebooks to use Apache Spark™, SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras, scikit-learn and more. Available as a 14-day full trial in your own cloud or as a lightweight trial hosted by Databricks.

### About Databricks

Databricks is the lakehouse company. More than 7,000 organizations worldwide — including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .

**[TRY DATABRICKS FOR FREE](https://www.databricks.com/try-databricks?itm_data=H#account)**

© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark

-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf2024-09-19T16:57:22Z
cd5ada025a0094fbaf75ac5cea9c38f2##### Guide\n", + "\n", + "## 6 Strategies for Building Personalized Customer Experiences\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "**Introduction** ................................................................................................................................................................................................................. **3**\n", + "\n", + "**1.** **Building a Foundation for Personalization**\n", + "Leveraging ML-Based Customer Entity Resolution ............................................................................................................................... **4**\n", + "\n", + "**2.** **Estimating Customer Lifetime Value**\n", + "Building Brand Loyalty With Data ................................................................................................................................................................. **6**\n", + "\n", + "**3.** **Mitigating Customer Churn**\n", + "Balancing Acquisition and Retention .......................................................................................................................................................... **10**\n", + "\n", + "**4.** **Streamlining Customer Analysis and Targeting**\n", + "Creating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
d53c2a5c69cef5febfa62ea961c33d25**4.** **Streamlining Customer Analysis and Targeting**\n", + "Creating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n", + "\n", + "**5.** **Assessing Consumer Interest Data**\n", + "Fine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n", + "\n", + "**6.** **Delivering Personalized Customer Journeys**\n", + "Crafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n", + "\n", + "**Conclusion**\n", + "Building a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "In today’s experience-driven world, the most beloved brands are the ones that\n", + "know their customers. Customers are loyal to brands that recognize their needs\n", + "and preferences — and tailor user journeys and engagements accordingly.\n", + "\n", + "A study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\n", + "buying from a brand that personalizes the shopping and user experience to the\n", + "wants and needs of the customer. And as organizations pursue omnichannel\n", + "excellence, these same high expectations of online experiences also extend to\n", + "brick-and-mortar locations — revealing for many merchants that personalized\n", + "engagement is fundamental to attracting customers and expanding share of wallet.\n", + "\n", + "But achieving a 360-degree view of your customers to serve personalized\n", + "experiences requires integrating various types of data — including demographics,\n", + "behavioral and transactional — to develop robust profiles. This guide focuses on six\n", + "actionable strategic pillars for businesses to leverage automation, real-time data,\n", + "AI-driven analysis and well-tuned ML models to architect and deliver customized\n", + "customer experiences at every touch point.\n", + "\n", + "\n", + "# 76%\n", + "\n", + "of consumers are more\n", + "likely to purchase due to\n", + "personalization\n", + "\n", + "\n", + "# 76%\n", + "\n", + "\n", + "-----\n", + "\n", + "### Building a Foundation for Personalization\n", + "\n", + "Get a 360-degree view of the customer by leveraging ML-based entity resolution\n", + "\n", + "\n", + "To create truly personalized interactions, you need actionable insights\n", + "about your customers. Start by establishing a common customer profile and\n", + "accurately linking together customer records across disparate data sets.\n", + "\n", + "Get a 360-degree view of your target customer by bringing together:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
e46c7c2f5da3f3652f9c2c1ba0dfd2b7But achieving a 360-degree view of your customers to serve personalized experiences requires integrating various types of data — including demographics, behavioral and transactional — to develop robust profiles. This guide focuses on six actionable strategic pillars for businesses to leverage automation, real-time data, AI-driven analysis and well-tuned ML models to architect and deliver customized customer experiences at every touch point.

# 76%

of consumers are more likely to purchase due to personalization

-----

### Building a Foundation for Personalization

Get a 360-degree view of the customer by leveraging ML-based entity resolution

To create truly personalized interactions, you need actionable insights about your customers. Start by establishing a common customer profile and accurately linking together customer records across disparate data sets.

Get a 360-degree view of your target customer by bringing together:

- Sales and traffic-driven first-party data

- Product ratings and surveys

- Customer surveys and support center calls

- Third-party data purchased from data aggregators and online trackers

- Zero-party data provided by customers themselves

**C A S E S T U DY**

**Personalizing experiences with data and ML**

Grab is the largest online-to-offline platform in Southeast Asia and has generated over 6 billion transactions for transport, food and grocery delivery, and digital payments. Grab uses Databricks to create sophisticated customer segmentation and recommendation engines that can now ingest and optimize thousands of user-generated signals and data sources simultaneously, enhancing data integrity and security, and reducing weeks of work to only hours.

[Get the full story](https://www.databricks.com/customers/grab)

“The C360 platform empowered teams to create consumer features at scale, which in turn allows for these features to be extended to other markets and used by other teams. This helps to reduce the engineering overhead and costs exponentially.”

**N I K H I L DWA R A K A N AT H**
Head of Analytics, Grab

Customer 360 diagram: a unified profile brings together location, demographics, orders, network/usage, social, apps/clickstream, service call records, billing and devices.

-----

Given the different data sources and data types, automated matching can still be incredibly challenging due to inconsistent formats, misinterpretation of data, and entry errors across various systems.
And even if inconsistent, all that data\n", + "may be perfectly valid — but to accurately connect the millions of customer\n", + "identities most retailers manage, businesses must lean on automation.\n", + "\n", + "In a machine learning (ML) approach to entity resolution, text attributes like\n", + "name, address and phone number are translated into numerical representations\n", + "that can be used to quantify the degree of similarity between any two attribute\n", + "values. But your ability to train such a model depends on your access to\n", + "accurately labeled training data. It’s a time-consuming exercise, but if done right,\n", + "the model learns to reflect the judgments of the human reviewers.\n", + "\n", + "Many organizations rely on libraries encapsulating this knowledge to build their\n", + "applications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\n", + "bringing together ML-based approaches to intelligent candidate pair generation\n", + "and pair-scoring. Oriented toward the construction of custom workflows, Zingg\n", + "presents these capabilities within the context of commonly employed steps\n", + "such as training data label assignment, model training, data set deduplication,\n", + "and (cross-data set) record matching.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
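To make the idea of translating text attributes into comparable numerical representations concrete, here is a small generic sketch (not Zingg's API) that turns a candidate record pair into similarity features; a trained pair-scoring model such as the one Zingg builds would learn how to weight these signals from labeled pairs.

```python
from difflib import SequenceMatcher

def similarity(a: str, b: str) -> float:
    """Normalized string similarity in [0, 1]."""
    return SequenceMatcher(None, a.lower().strip(), b.lower().strip()).ratio()

def pair_features(rec1: dict, rec2: dict) -> dict:
    """Turn a candidate record pair into numeric comparison features."""
    return {
        "name_sim": similarity(rec1["name"], rec2["name"]),
        "address_sim": similarity(rec1["address"], rec2["address"]),
        "phone_match": float(rec1["phone"] == rec2["phone"]),
    }

# Toy candidate pair; in practice pairs come from a blocking/candidate-generation step.
a = {"name": "Jon Smith", "address": "12 Main St, Springfield", "phone": "555-0101"}
b = {"name": "Jonathan Smith", "address": "12 Main Street, Springfield", "phone": "555-0101"}

features = pair_features(a, b)
# A learned pair-scoring model would consume these features; here we simply average them.
score = sum(features.values()) / len(features)
print(features, round(score, 3))
```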
1588d357d1fefca3d2410a7107be8befIn a machine learning (ML) approach to entity resolution, text attributes like\n", + "name, address and phone number are translated into numerical representations\n", + "that can be used to quantify the degree of similarity between any two attribute\n", + "values. But your ability to train such a model depends on your access to\n", + "accurately labeled training data. It’s a time-consuming exercise, but if done right,\n", + "the model learns to reflect the judgments of the human reviewers.\n", + "\n", + "Many organizations rely on libraries encapsulating this knowledge to build their\n", + "applications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\n", + "bringing together ML-based approaches to intelligent candidate pair generation\n", + "and pair-scoring. Oriented toward the construction of custom workflows, Zingg\n", + "presents these capabilities within the context of commonly employed steps\n", + "such as training data label assignment, model training, data set deduplication,\n", + "and (cross-data set) record matching.\n", + "\n", + "Built as a native Apache Spark TM application, Zingg scales well to apply these\n", + "techniques to enterprise-sized data sets. Organizations can then use Zingg in\n", + "combination with platforms such as Databricks Lakehouse to provide the back\n", + "end to human-in-the-middle workflow applications that automate the bulk of\n", + "the entity resolution work and present data experts with a more manageable\n", + "set of edge case pairs to interpret.\n", + "\n", + "\n", + "As an active-learning solution, models can be retrained to take advantage of\n", + "this additional human input to improve future predictions and further reduce\n", + "the number of cases requiring expert review. Finally, these technologies can be\n", + "assembled to enable their own enterprise-scaled customer entity resolution\n", + "workflow applications.\n", + "\n", + "**Need help building your foundation for a**\n", + "**360-degree view of your customers?**\n", + "\n", + "Get pre-built code sample data and step-by-step instructions\n", + "in a Databricks notebook in the **Customer Entity Resolution**\n", + "**Solution Accelerator.**\n", + "\n", + "**•** Translating text attributes (like name, address, phone number)\n", + "into quantifiable numerical representations\n", + "\n", + "**•** Training ML models to determine if these numerical labels\n", + "form a match\n", + "\n", + "**•** Scoring the confidence of each match\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-entity-resolution)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Estimating Customer Lifetime Value\n", + "\n", + "Building brand loyalty to drive share of wallet with data\n", + "\n", + "\n", + "Once you’ve set up a 360-degree view of the customer, the next challenge\n", + "is how to spend money to profitably grow the brand. The goal is to spend\n", + "marketing dollars on activities that attract loyal customers and avoid spending on\n", + "unprofitable customers or activities that damage the brand. Keep in mind, that\n", + "making decisions solely based on ROI isn’t the answer. 
This one-track approach\n", + "could ultimately weaken your brand equity and make you more dependent on\n", + "lowering your price through promotions as a way to generate sales.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "\n", + "**Identifying and engaging brand loyalists**\n", + "\n", + "Today’s customer has overwhelmingly abundant options in products and\n", + "services to choose from. That’s why personalizing customer experiences is so\n", + "important, as it increases revenue, marketing efficiency and customer retention.\n", + "\n", + "Not every customer carries the same potential for profitability. Different\n", + "customers derive different value from your products and services, which directly\n", + "translates into differences in the overall amount of value a business can expect\n", + "in return. Mutually beneficial relationships carefully align customer acquisition\n", + "cost (CAC) and retention rates with the total revenue or customer lifetime value\n", + "(CLV).\n", + "\n", + "\n", + "**Predicting and increasing customer lifetime value with ML**\n", + "\n", + "\n", + "Kolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\n", + "attracts over 10 million monthly active users. With Databricks, they\n", + "achieved a 30% increase in player LTV, improved data team productivity\n", + "by 3x, and reduced ML model-to-production time by 40x.\n", + "\n", + "[Get the full story](https://databricks.com/customers/kolibri-games)\n", + "\n", + "Within your existing customer base are people ranging from brand loyalists to\n", + "brand transients. Brand loyalists are highly engaged with your brand, are willing\n", + "to share their experience with others, and are the most likely to purchase\n", + "again. Brand transients have no loyalty to your brand and shop based on price.\n", + "Your focus should be on growing the group of brand loyalists while minimizing\n", + "interactions with brand transients.\n", + "\n", + "\n", + "**Calculating customers’ lifetime intent**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
f6482951b29e919393ff642a754723f9**Predicting and increasing customer lifetime value with ML**\n", + "\n", + "\n", + "Kolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\n", + "attracts over 10 million monthly active users. With Databricks, they\n", + "achieved a 30% increase in player LTV, improved data team productivity\n", + "by 3x, and reduced ML model-to-production time by 40x.\n", + "\n", + "[Get the full story](https://databricks.com/customers/kolibri-games)\n", + "\n", + "Within your existing customer base are people ranging from brand loyalists to\n", + "brand transients. Brand loyalists are highly engaged with your brand, are willing\n", + "to share their experience with others, and are the most likely to purchase\n", + "again. Brand transients have no loyalty to your brand and shop based on price.\n", + "Your focus should be on growing the group of brand loyalists while minimizing\n", + "interactions with brand transients.\n", + "\n", + "\n", + "**Calculating customers’ lifetime intent**\n", + "\n", + "To assess the remaining lifetime in a customer relationship, businesses must\n", + "\n", + "carefully examine the transactional signals and other indicators from previous\n", + "customer engagements and transactions.\n", + "\n", + "For example, if a frequent customer slows down their buying habits — or simply\n", + "doesn’t make a purchase for an extended period of time — it may signal the\n", + "upcoming end of the relationship. However, in the case of another customer\n", + "who engages infrequently, the same extended absence may not signal anything\n", + "notable. The infrequent buyer may continue to purchase even after a long pause\n", + "in activity.\n", + "\n", + "\n", + "-----\n", + "\n", + "Customer A\n", + "\n", + "Customer B\n", + "\n", + "Customer C\n", + "\n", + "\n", + "Past Future\n", + "\n", + "Different customers with the same number of transactions, but signaling different lifetime intent. The probability of re-engagement (P_alive) relative to a customer’s history of purchases.\n", + "\n", + "\n", + "Every customer relationship with a business has a lifespan. Understanding what\n", + "point in the lifespan at a given time provides critical insight to inform marketing\n", + "and sales tactics. By proactively discovering shifts in the relationship, you can\n", + "adapt how to respond to each customer at the optimal time. For example, a\n", + "certain signal might prompt a change in how to deliver products and services,\n", + "which could help maximize revenue.\n", + "\n", + "Transactional signals can be used to estimate the probability that a customer\n", + "is active and likely to return in the future. 
Popularized as the Buy ’til You Die\n", + "(BTYD) model, analysts can compare a customer’s frequency and recency of\n", + "\n", + "engagement to similar patterns across their user population to accurately\n", + "predict individual CLV.\n", + "\n", + "\n", + "The mathematics behind these predictive CLV models is complex, but the logic\n", + "behind these critical models is accessible through a popular Python library\n", + "named Lifetimes, which allows the input of simple summary metrics in order to\n", + "derive customer-specific lifetime estimates.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**How personalized experiences keep customers coming**\n", + "**back for more**\n", + "\n", + "Publicis Groupe empowers brands to transform retail experiences with\n", + "digital technologies, but data challenges and team silos stood in the\n", + "way of delivering the personalization that their customers required.\n", + "See how they use Databricks to create a single customer view that\n", + "allows them to drive customer loyalty and retention. As a result, they’ve\n", + "seen a 45%–50% increase in customer campaign revenue.\n", + "\n", + "[Get the full story](https://databricks.com/customers/publicis-groupe)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering customer lifetime estimates to the business**\n", + "\n", + "\n", + "Spark natively distributes this work across a multi-server environment, enabling\n", + "consistent, accurate and efficient analysis. Spark’s flexibility allows models to\n", + "adapt in real time as new information is ingested, eliminating the bottlenecks\n", + "that come with manual data mapping and profile building.\n", + "\n", + "With per customer metrics calculated, the Lifetimes library can be used to train\n", + "multiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\n", + "predict engagements over time using proprietary data can take several months\n", + "and thousands of training runs. [Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\n", + "businesses tap into the infrastructure behind their Spark environments and\n", + "distribute the training outputs across models.\n", + "\n", + "\n", + "Using the Lifetimes library to calculate customer-specific probabilities at speed\n", + "and scale can be challenging — from processing large volumes of transaction\n", + "data to deriving data curves and value distribution patterns and, eventually,\n", + "to integration with business initiatives. But with the proper approach, you can\n", + "resolve all of them.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
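As a sketch of how the Lifetimes library is typically used, the snippet below fits a BG/NBD model on the sample summary data that ships with the library and derives the probability a customer is still active along with their expected purchases; for your own data, the same frequency, recency and age (T) inputs can be computed from a raw transaction log.

```python
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary
# For your own data, lifetimes.utils.summary_data_from_transaction_data() derives the
# same frequency / recency / T (age) metrics from a raw transaction log.

# Sample summary data shipped with the library: one row per customer.
summary = load_cdnow_summary(index_col=[0])

# Fit a BG/NBD model (one of the BTYD family, alongside Pareto/NBD).
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(summary["frequency"], summary["recency"], summary["T"])

# Probability each customer is still "alive" and expected purchases over the next 30 days.
summary["p_alive"] = bgf.conditional_probability_alive(
    summary["frequency"], summary["recency"], summary["T"]
)
summary["expected_30d"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    30, summary["frequency"], summary["recency"], summary["T"]
)
print(summary.sort_values("expected_30d", ascending=False).head())
```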
1ce23e74ee932d8df197e9a45e53e861**Delivering customer lifetime estimates to the business**\n", + "\n", + "\n", + "Spark natively distributes this work across a multi-server environment, enabling\n", + "consistent, accurate and efficient analysis. Spark’s flexibility allows models to\n", + "adapt in real time as new information is ingested, eliminating the bottlenecks\n", + "that come with manual data mapping and profile building.\n", + "\n", + "With per customer metrics calculated, the Lifetimes library can be used to train\n", + "multiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\n", + "predict engagements over time using proprietary data can take several months\n", + "and thousands of training runs. [Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\n", + "businesses tap into the infrastructure behind their Spark environments and\n", + "distribute the training outputs across models.\n", + "\n", + "\n", + "Using the Lifetimes library to calculate customer-specific probabilities at speed\n", + "and scale can be challenging — from processing large volumes of transaction\n", + "data to deriving data curves and value distribution patterns and, eventually,\n", + "to integration with business initiatives. But with the proper approach, you can\n", + "resolve all of them.\n", + "\n", + "These models depend on three key per customer metrics:\n", + "\n", + "**FREQUENCY**\n", + "The number of times within a given time period in which a repeat\n", + "transaction is observed\n", + "\n", + "**AGE**\n", + "The length of time between the occurrence of an initial transaction\n", + "to the end of a given time period\n", + "\n", + "**RECENCY**\n", + "\n", + "The “age” of a customer (how long they’ve engaged with a brand)\n", + "at the time of their latest repeat transaction\n", + "\n", + "\n", + "-----\n", + "\n", + "**Solution deployment**\n", + "\n", + "\n", + "Once properly trained, these models can determine the probability that a\n", + "customer will re-engage, as well as the number of engagements a business\n", + "can expect from that customer over time. But the real challenge is putting\n", + "these predictive capabilities into the hands of those that determine\n", + "customer engagement.\n", + "\n", + "Matrices illustrating the probability a customer is alive (left) and the number of future\n", + "purchases in a 30-day window given a customer’s frequency and recency metrics (right).\n", + "\n", + "\n", + "Businesses need a way to develop and deploy solutions in a highly scalable\n", + "environment with a limited upfront cost. 
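Where the guide mentions Hyperopt for distributing training runs across a Spark environment, a minimal sketch might look like the following; the objective and search space are toy placeholders, and `SparkTrials` is what fans the evaluations out across the cluster on Databricks.

```python
from hyperopt import fmin, tpe, hp, SparkTrials, STATUS_OK

# Toy objective: in practice this would fit a BTYD or ML model with the sampled
# hyperparameters and return a validation loss.
def objective(params):
    loss = (params["penalizer"] - 0.01) ** 2
    return {"loss": loss, "status": STATUS_OK}

search_space = {
    "penalizer": hp.loguniform("penalizer", -8, 0),
}

# SparkTrials distributes trial evaluation across Spark executors on a Databricks cluster.
trials = SparkTrials(parallelism=4)

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=32,
    trials=trials,
)
print(best)
```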
Databricks Solution Accelerators\n", + "leverage real-world sample data sets and pre-built code to show how raw data\n", + "can be transformed into real solutions — including step-by-step instructions\n", + "ready to go in a Databricks notebook.\n", + "\n", + "**Need help determining your customers’**\n", + "**lifetime value?**\n", + "\n", + "Use the **Customer Lifetime Value Accelerator** to\n", + "\n", + "**•** Ingest sample retail data\n", + "\n", + "**•** Use pre-built code to develop visualizations and explore\n", + "past purchase behavior\n", + "\n", + "**•** Apply machine learning to predict the likelihood and\n", + "nature of future purchases\n", + "\n", + "**[GET THE ACCELERATOR](https://databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Mitigating Customer Churn\n", + "\n", + "Balancing acquisition and retention with personalized experiences\n", + "\n", + "\n", + "There are no guarantees of success. With a bevy of options at their disposal,\n", + "customer churn is a reality that companies face and are focused on overcoming\n", + "every day. One [recent analysis](https://info.recurly.com/annual-subscription-billling-metrics-report?submissionGuid=3c21cde7-5f58-4d86-9218-332d697e7b3e) of consumer-oriented subscription services\n", + "estimated a segment average 7.2% monthly rate of churn. When narrowed to\n", + "brands focused on consumer goods, that rate jumped to 10.0%. This figure\n", + "translates to a lifetime of 10 months for the average subscription box service,\n", + "leaving businesses of this kind with little time to recover acquisition costs and\n", + "bring subscribers to net profitability.\n", + "\n", + "**C A S E S T U DY**\n", + "##### Riot Games\n", + "\n", + "**Creating an optimal in-game experience for League of Legends**\n", + "\n", + "Riot Games is one of the top PC game developers in the world, with over\n", + "100 million monthly active users, 500 billion data points, and over 26\n", + "petabytes of data and counting. They turned to Databricks to build a more\n", + "\n", + "efficient and scalable way to leverage data and improve the overall gaming\n", + "experience — ensuring customer engagement and reducing churn.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
9d9266f876aecba5c8df8c7bfd97cf6a**C A S E S T U DY**\n", + "##### Riot Games\n", + "\n", + "**Creating an optimal in-game experience for League of Legends**\n", + "\n", + "Riot Games is one of the top PC game developers in the world, with over\n", + "100 million monthly active users, 500 billion data points, and over 26\n", + "petabytes of data and counting. They turned to Databricks to build a more\n", + "\n", + "efficient and scalable way to leverage data and improve the overall gaming\n", + "experience — ensuring customer engagement and reducing churn.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/riot-games)\n", + "\n", + "Organizations must take an honest look at the cost of acquisition relative to a\n", + "customer’s lifetime value (LTV) earned. These figures need to be brought into a\n", + "\n", + "healthy balance and treated as a “chronic condition” [to be managed.](https://retailtouchpoints.com/features/trend-watch/can-subscription-retail-solve-its-customer-retention-problem)\n", + "\n", + "\n", + "**Understanding attrition predictability through subscriptions:**\n", + "**Examining retention-based acquisition variables**\n", + "\n", + "Public data for subscription services is extremely hard to come by. KKBox, a\n", + "Taiwan-based music streaming service, recently released over two years of\n", + "anonymized [subscription data](https://www.kaggle.com/c/kkbox-churn-prediction-challenge) to examine customer churn. Through analyzing\n", + "the data, we uncover customer dynamics familiar to any subscription provider.\n", + "\n", + "Most subscribers join the KKBox service through a 30-day trial offer. Customers\n", + "then appear to enlist in one-year subscriptions, which provide the service with\n", + "a steady flow of revenue. Subscribers typically churn at the end of the 30-day\n", + "trial and at regular one-year intervals.\n", + "\n", + "The Survival Rate reflects the proportion of the initial (Day 1) subscriber population that is\n", + "retained over time, first at the roll-to-pay milestone, and then at the renewal milestone.\n", + "\n", + "\n", + "-----\n", + "\n", + "By Initial Payment Method\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers registering via different payment methods.\n", + "\n", + "By Initial Payment Plan Days\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers selecting different initial payment methods and terms/days.\n", + "\n", + "\n", + "This pattern of high initial drop-off, followed by a period of slower but continuing\n", + "drop-off cycles makes intuitive sense. Where it gets interesting is when the\n", + "data changes. The patterns of customer churn become vastly different as time\n", + "passes and new or changing elements are introduced (e.g., payment methods\n", + "and options, membership tiers, etc.).\n", + "\n", + "By Registration Channel\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers registering via different channels.\n", + "\n", + "\n", + "-----\n", + "\n", + "These patterns seem to indicate that KKBox _could_ potentially differentiate\n", + "between customers based on their lifetime potential, using only the information\n", + "available at subscriber acquisition. 
In the same way, non-subscription businesses\n", + "could use similar data techniques to get an accurate illustration of the total\n", + "lifetime value of a particular customer, even before collecting historical data.\n", + "\n", + "This information can help businesses target certain shoppers with effective\n", + "discounts or promotions as early as trial registration. Nevertheless, it’s always\n", + "important to consider more than individual data points.\n", + "\n", + "The baseline risk of customer attrition over a subscription lifespan.\n", + "\n", + "\n", + "The channel and payment method multipliers combine to explain a customer’s risk of attrition\n", + "at various points in time. The higher the value, the higher the proportional risk of churn in the\n", + "associated period.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Applying churn analytics to your data**\n", + "\n", + "This analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n", + "**2)** to paint a quantitative picture of the specific factors that explain that risk,\n", + "giving analysts a clearer understanding of what to focus on, what to ignore and\n", + "what to investigate further. The main challenge is organizing the input data.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
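The survival-rate curves discussed above can be reproduced on your own subscription data with a standard survival-analysis library; the sketch below uses `lifelines` (one possible choice, not something the guide prescribes) on a placeholder table of tenures, churn flags and registration channels.

```python
import pandas as pd
from lifelines import KaplanMeierFitter

# Placeholder data: days subscribed, whether the customer has churned (1) or is
# still active (0), and the channel through which they registered.
subs = pd.DataFrame({
    "days_subscribed": [30, 30, 45, 365, 400, 30, 200, 365, 90, 730],
    "churned":         [1,  1,  1,  1,   0,   1,  0,   1,   1,  0],
    "channel":         ["web", "web", "app", "web", "app", "app", "web", "app", "web", "web"],
})

kmf = KaplanMeierFitter()

# Fit one survival (retention) curve per registration channel, mirroring the KKBox-style comparison.
for channel, group in subs.groupby("channel"):
    kmf.fit(group["days_subscribed"], event_observed=group["churned"], label=channel)
    print(channel)
    print(kmf.survival_function_.tail(1))  # retained share at the longest observed tenure
```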
dc5fc49468f8795f185c2a9a69844b3fThis information can help businesses target certain shoppers with effective\n", + "discounts or promotions as early as trial registration. Nevertheless, it’s always\n", + "important to consider more than individual data points.\n", + "\n", + "The baseline risk of customer attrition over a subscription lifespan.\n", + "\n", + "\n", + "The channel and payment method multipliers combine to explain a customer’s risk of attrition\n", + "at various points in time. The higher the value, the higher the proportional risk of churn in the\n", + "associated period.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Applying churn analytics to your data**\n", + "\n", + "This analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n", + "**2)** to paint a quantitative picture of the specific factors that explain that risk,\n", + "giving analysts a clearer understanding of what to focus on, what to ignore and\n", + "what to investigate further. The main challenge is organizing the input data.\n", + "\n", + "The data required to examine customer attrition may be scattered across\n", + "multiple systems, making an integrated analysis difficult. [Data lakes](https://databricks.com/discover/data-lakes/introduction) support\n", + "the creation of transparent, sustainable data processing pipelines that are\n", + "flexible, scalable and highly cost-efficient. Remember that **churn is a chronic**\n", + "**condition to be managed** , and attrition data should be periodically revisited to\n", + "maintain alignment between acquisition and retention efforts.\n", + "\n", + "**Need help predicting customer churn?**\n", + "\n", + "Use the **Subscriber Churn Prediction Accelerator** to analyze\n", + "behavioral data, identify subscribers with an increased risk of\n", + "cancellation, and predict attrition. Machine learning lets you\n", + "quantify a user’s likelihood to churn, identifying factors that\n", + "explain the risk.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Streamlining Customer Analysis and Targeting\n", + "\n", + "Creating efficient and highly targeted customer experiences with behavioral data\n", + "\n", + "\n", + "Effective targeting comes down to one fundamental element: the cost of\n", + "delivering a good or service relative to what a consumer is willing to pay.\n", + "\n", + "In the earliest applications of segmentation, manufacturers recognized that\n", + "specialized product lines targeting specific consumer groups could help\n", + "brands stand out against competitors.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Finding that special something every time**\n", + "\n", + "Pandora is a jewelry company with global reach. They built their master\n", + "consumer view (MCV) dashboard on the Databricks Lakehouse Platform,\n", + "giving them the insights necessary to deliver highly targeted messaging\n", + "and personalization — resulting in 80% growth in email marketing\n", + "success, a 50% increase in click-to-open rate across 65 million emails,\n", + "and 255M DKK (Danish Krone) in quarterly revenue.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/pandora)\n", + "\n", + "This mode of thinking extends beyond product development and into every\n", + "customer-oriented business function, requiring specific means of ideation,\n", + "production and delivery. 
The work put into segmentation doesn’t need to be\n", + "a gamble. Scrutinizing customers and testing responsiveness is an ongoing\n", + "process. Organizations must analyze and adapt to shifting markets, changing\n", + "consumer demand and evolving business objectives.\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Powering insight-driven dashboards to increase customer**\n", + "**acquisition**\n", + "\n", + "Bagelcode is a global game company with more than 50 million global\n", + "users. By using the Databricks Lakehouse Platform, they are now able to\n", + "support more diversified indicators, such as a user’s level of frequency\n", + "and the amount of time they use a specific function for each game,\n", + "enabling more well-informed responses. In addition, the company is\n", + "mitigating customer churn by better predicting gamer behavior and\n", + "providing personalized experiences at scale.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/bagelcode)\n", + "\n", + "“Thanks to Databricks Lakehouse, we can support\n", + "real-time business decision-making based on data\n", + "analysis results that are automatically updated on\n", + "an hourly and daily basis, even as data volumes have\n", + "increased by nearly 1,000 times.”\n", + "\n", + "**J O O H Y U N K I M**\n", + "Vice President, Data and AI, Bagelcode\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
2f01578d9ce1f0632c2c1cb267859283**C A S E S T U DY**\n", + "\n", + "**Powering insight-driven dashboards to increase customer**\n", + "**acquisition**\n", + "\n", + "Bagelcode is a global game company with more than 50 million global\n", + "users. By using the Databricks Lakehouse Platform, they are now able to\n", + "support more diversified indicators, such as a user’s level of frequency\n", + "and the amount of time they use a specific function for each game,\n", + "enabling more well-informed responses. In addition, the company is\n", + "mitigating customer churn by better predicting gamer behavior and\n", + "providing personalized experiences at scale.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/bagelcode)\n", + "\n", + "“Thanks to Databricks Lakehouse, we can support\n", + "real-time business decision-making based on data\n", + "analysis results that are automatically updated on\n", + "an hourly and daily basis, even as data volumes have\n", + "increased by nearly 1,000 times.”\n", + "\n", + "**J O O H Y U N K I M**\n", + "Vice President, Data and AI, Bagelcode\n", + "\n", + "\n", + "-----\n", + "\n", + "A brand’s goal with segmentation should be to define a shared customer\n", + "perspective on customers, allowing the organization to engage users consistently\n", + "and cohesively. But any adjustments to customer engagement require careful\n", + "consideration of [organizational change concerns](https://www.researchgate.net/publication/45348436_Bridging_the_segmentation_theorypractice_divide) .\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Responding to global demand shifts with ease**\n", + "\n", + "Reckitt produces some of the world’s most recognizable and trusted\n", + "consumer brands in hygiene, health and nutrition. With Databricks\n", + "Lakehouse on Azure, they’re able to meet the needs of billions of\n", + "consumers worldwide by surfacing real-time, highly accurate, deep\n", + "customer insights, leading to a better understanding of trends and\n", + "demand, allowing them to provide best-in-class experiences in\n", + "every market.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/reckitt)\n", + "\n", + "\n", + "**A segmentation walk-through: Grocery chain promotions**\n", + "\n", + "A promotions management team for a large grocery chain is responsible for\n", + "running a number of promotional campaigns, each of which is intended to drive\n", + "greater overall sales. Today, these marketing campaigns include leaflets and\n", + "coupons mailed to individual households, manufacturer coupon matching,\n", + "in-store discounts and the stocking of various private-label alternatives to\n", + "popular national brands.\n", + "\n", + "Recognizing uneven response rates between households, the team is eager to\n", + "determine if customers might be segmented based on their responsiveness\n", + "to these promotions. They anticipate that such segmentation may allow the\n", + "promotions management team to better target individual households, driving\n", + "overall higher response rates for each promotional dollar spent.\n", + "\n", + "Using historical data from point-of-sale systems along with campaign\n", + "information from their promotions management systems, the team derives\n", + "a number of features that capture the behavior of various households with\n", + "regard to promotions. 
Applying standard data preparation techniques, the data is organized for analysis and, using a variety of clustering algorithms such as k-means and hierarchical clustering, the team settles on two potentially useful cluster designs.

Figure: Overlapping segment designs separating households based on their responsiveness to various promotional offerings; profiling of clusters identifies differences in behavior across clusters.
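To make the clustering step concrete, here is a minimal, hypothetical sketch of behavior-based segmentation with k-means; the feature names and values are illustrative and not taken from the grocery chain example.

```python
# Hypothetical sketch: k-means segmentation of households on promotion-response features.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

households = pd.DataFrame({
    "coupon_redemption_rate": [0.05, 0.40, 0.02, 0.35, 0.18, 0.44],
    "leaflet_response_rate":  [0.10, 0.55, 0.01, 0.48, 0.20, 0.50],
    "instore_discount_share": [0.30, 0.25, 0.05, 0.22, 0.15, 0.28],
})

scaled = StandardScaler().fit_transform(households)
clusters = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(scaled)

households["cluster"] = clusters
print(households.groupby("cluster").mean())  # profile each cluster's promotion behavior
```

In practice the team would compare several cluster counts and algorithms before settling on a design, exactly as described above.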
**Assessing results**

Comparing households by demographic factors not used in developing the clusters themselves, some interesting patterns separating cluster members by age and other factors are identified. While this information may be useful in predicting cluster membership and designing more effective campaigns targeted to specific groups of households, the team recognizes the need to collect additional demographic data before putting too much emphasis on these results.

With profiling, marketers can discern that the customer households in the highlighted example fall into two groups: those who are responsive to coupons and mailed leaflets, and those who are not. Further divisions show differing degrees of responsiveness to other promotional offers.

**Need help segmenting your customers for more targeted marketing?**

Use the **Customer Segmentation Accelerator** and drive better purchasing predictions based on behaviors. Through sales data, campaigns and promotions systems, you can build useful customer clusters to effectively target various households with different promos and offers.

**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-segmentation)**

Figure: Age-based differences in cluster composition of behavior-based customer segments.

The results of the analysis now drive a dialog between the data scientists and the promotions management team. Based on initial findings, a revised analysis will be performed focused on what appear to be the most critical features differentiating households, as a means to simplify the cluster design and evaluate overall cluster stability. Subsequent analyses will also examine the revenue generated by various households to understand how changes in promotional engagement may impact customer spending.

Using this information, the team believes they will have the ability to make a case for change to upper management. Should a change in promotions targeting be approved, the team makes plans to monitor household spending, promotions spend and campaign responsiveness rates using much of the same data used in this analysis. This will allow the team to assess the impact of these efforts and identify when the segmentation design needs to be revisited.

#### Assessing Consumer Interest Data to Inform Engagement Strategies

Fine-tuning ML recommendations to boost conversions

Personalization is a [journey](https://www.bcg.com/publications/2021/the-fast-track-to-digital-marketing-maturity). To operationalize personalized experiences, it’s important to identify high-value audiences who have the highest likelihood of taking specific actions. Here’s where **propensity scoring** comes in.

Specifically, this process allows companies to estimate customers’ potential receptiveness to an offer or to content related to a subset of products, and determine which messaging to apply.
Calculating propensity scores requires assessment of past interactions and data points (e.g., frequency of purchases, percentage of spend associated with a particular product category, days since last purchase and other historical data).

Databricks provides critical capabilities for propensity scoring (like the Feature Store, AutoML and MLflow) to help businesses answer three key considerations and develop a robust process:

**1.** How to maintain the significant number of features used to train propensity models

**2.** How to rapidly train models aligned with new campaigns

**3.** How to rapidly re-deploy models, retrained as customer patterns drift, into the scoring pipeline
**Boosting model training efficiency**

With the [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html), data scientists can easily reuse features created by others.

The feature store is a centralized repository that enables the persistence, discovery and sharing of features across various model training exercises. As features are captured, so are lineage and other metadata. Standard security models ensure that only permitted users and processes may employ these features, enforcing the organization’s data access policies on data science processes.

**Extracting the complexities of ML**

[Databricks AutoML](https://docs.databricks.com/applications/machine-learning/automl.html) allows you to quickly generate models by leveraging industry best practices. As a glass box solution, AutoML first generates a collection of notebooks representing various aligned model variations. In addition to iteratively training models, AutoML allows you to access the notebooks associated with each model, creating an editable starting point for further exploration.

**Streamlining the overall ML lifecycle**

[MLflow](https://docs.databricks.com/applications/mlflow/index.html) is an open source machine learning model repository, managed within the Databricks Lakehouse. This repository enables tracking and analysis of the various model iterations generated by both AutoML and custom training cycles alike.

When used in combination with the Databricks Feature Store, models persisted with MLflow can retain knowledge of the features used during training. As models are retrieved, this same information allows the model to retrieve relevant features from the Feature Store, greatly simplifying the scoring workflow and enabling rapid deployment.

**How to build a propensity scoring workflow with Databricks**

Using these features in combination, many organizations implement propensity scoring as part of a three-part workflow (a minimal code sketch follows the list):

**1.** Data engineers work with data scientists to define features relevant to the propensity scoring exercise and persist these to the Feature Store. Daily or even real-time feature engineering processes are then defined to calculate up-to-date feature values as new data inputs arrive.

**2.** As part of the inference workflow, customer identifiers are presented to previously trained models in order to generate propensity scores based on the latest features available. Feature Store information captured with the model allows data engineers to retrieve these features and easily generate the desired scores, which can then be used for analysis within Databricks Lakehouse or published to downstream marketing systems.

**3.** In the model-training workflow, data scientists periodically retrain the propensity score models to capture shifts in customer behaviors. As these models are persisted to MLflow, change management processes are used to evaluate and elevate those models that meet organizational criteria to production status. In the next iteration of the inference workflow, the latest production version of each model is retrieved to generate customer scores.
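A minimal sketch of how the training and inference parts of this workflow might look with MLflow and scikit-learn; the feature columns, labels and the registered model name are illustrative assumptions, and a production pipeline would source features from the Feature Store rather than an in-memory DataFrame.

```python
# Hypothetical sketch: train, register and reuse a propensity model with MLflow.
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.linear_model import LogisticRegression

# Engineered features; a real pipeline would read these from the Feature Store.
features = pd.DataFrame({
    "purchase_frequency": [12, 3, 25, 7, 18, 2],
    "category_spend_pct": [0.4, 0.1, 0.6, 0.2, 0.5, 0.05],
    "days_since_last_purchase": [5, 60, 2, 30, 9, 90],
})
responded = [1, 0, 1, 0, 1, 0]  # label: responded to a past campaign

# Model-training workflow: log the fitted model and register it as a candidate.
with mlflow.start_run():
    model = LogisticRegression().fit(features, responded)
    mlflow.sklearn.log_model(
        model,
        artifact_path="model",
        registered_model_name="campaign_propensity",  # illustrative registry name
    )

# Inference workflow: load the latest registered version and score customers.
scorer = mlflow.pyfunc.load_model("models:/campaign_propensity/latest")
print(scorer.predict(features))
```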
Figure: A three-part propensity scoring workflow, in which a feature engineering ETL populates the Feature Store from sales, promotions and customer data; model training and deployment consume those features; and a score generation and publication ETL delivers propensity scores to downstream applications.

**Need help assessing interest from your target audience?**

Use the **Propensity Scoring Accelerator** to estimate customers’ potential receptiveness to an offer or to content related to a subset of products. Using these scores, marketers can determine which of the many messages at their disposal should be presented to a specific customer.

**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**

### Delivering Personalized Customer Journeys

Strategies for crafting a real-time recommendation engine

As the economy continues to weather unpredictable disruptions, shortages and shifts in demand, delivering personalized customer experiences at speed and scale will require adaptability on the ground and within a company’s operational tech stack.
**CASE STUDY**

**Creating a unified view across 200+ brands**

As a driving force for economic growth in the Middle East, Al-Futtaim impacts the lives of millions of people across the region through the distribution and operations of global brands like Toyota, IKEA, Ace Hardware and Marks & Spencer.

Al-Futtaim’s focus is to harness their data to improve all areas of the business, from streamlining the supply chain to optimizing marketing strategies. But with the brands capturing such a wide variety of data, Al-Futtaim’s legacy systems struggled to provide a single view into the customer due to data silos and the inability to scale efficiently to meet analytical needs.

With the Databricks Lakehouse, Al-Futtaim has transformed their data strategy and operations, allowing them to create a “golden customer record” that improves all decision-making, from forecasting demand to powering their global loyalty program.

“Databricks Lakehouse allows every division in our organization — from automotive to retail — to gain a unified view of our customer across businesses. With these insights, we can optimize everything from forecasting and supply chain, to powering our loyalty program through personalized marketing campaigns, cross-sell strategies and offers.”

**DMITRIY DOVGAN**
Head of Data Science, Al-Futtaim Group

[Get the full story](https://www.databricks.com/customers/al-futtaim)

As COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality, safety and community, brands most attuned to changing needs and sentiments saw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained business and many lost, organizations that had already begun the journey toward improved customer experience saw better outcomes, closely mirroring patterns [observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.

The personalization of customer experiences will remain a key focus for B2C and [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale). Increasingly, market analysts are recognizing customer experience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend long-established players.
**Focus on the customer journey**

Personalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys). The [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of how they will engage and provides the organization with the ability to [assess the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis).

**CASE STUDY**

**Personalizing the beauty product shopping experience**

Flaconi wanted to leverage data and AI to become the No. 1 online beauty product destination in Europe. However, they struggled with massive volumes of streaming data and with infrastructure complexity that was resource-intensive and costly to scale. See how they used Databricks to improve time-to-market by 200x, reduce staff costs by 40% and increase net order income.

Get the full story

¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer Experience Performance Index in 2007–09. Source: Forrester Customer Experience Performance Index (2007–09); press search.

Figure: CX leaders outperform laggards, even in a down market, in this visualization of the Forrester Customer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.

Careful consideration of how customers interact with various assets — and how these interactions may be interpreted as expressions of preference — can unlock a wide range of data that enables personalization.

The complexity of these engines requires that they be deployed thoughtfully, using limited pilots and customer response assessments.
And in those assessments, it’s important to keep in mind that there is no expectation of perfection — only incremental improvement over the prior solution.

**CASE STUDY**

**Connecting shoppers to savings with data-driven personalization**

Flipp is an online marketplace that aggregates weekly shopping circulars, so consumers get deals and discounts without clipping coupons. Siloed customer data sources once made getting insights difficult. Now with Databricks, Flipp’s data teams can access and democratize data, helping them do their jobs more effectively while bringing better deals to users, more meaningful insights to partners, and a 10% jump in foot traffic to brick-and-mortar retailers.

Get the full story

**Need help generating personalized recommendations?**

Use the **Recommendation Engines Accelerator** to estimate customers’ potential receptiveness to an offer or to content related to a subset of products. Using these scores, marketers can determine which of the many messages at their disposal should be presented to a specific customer.

**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**

The engines we use to serve content based on customer preferences are known as recommenders. With some recommenders, a heavy focus on the shared preferences of similar customers helps define what recommendations will actually make an impact. With others, it can be more useful to focus on the properties of the content itself (e.g., product descriptions).
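As a concrete illustration of the content-focused approach, here is a minimal, hypothetical sketch that ranks products by the similarity of their descriptions; the catalog is invented for the example.

```python
# Minimal sketch of a content-based recommender: rank products by how similar their
# descriptions are to an item the customer has already engaged with.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

descriptions = [
    "organic whole bean coffee, dark roast",
    "decaf ground coffee, medium roast",
    "sparkling mineral water, lemon flavor",
    "cold brew coffee concentrate",
]

tfidf = TfidfVectorizer().fit_transform(descriptions)
similarity = cosine_similarity(tfidf)

liked_item = 0  # the customer engaged with product 0
ranked = similarity[liked_item].argsort()[::-1][1:]  # most similar first, skipping itself
print("recommended product indices:", ranked.tolist())
```

A collaborative-filtering recommender would instead rank items using the preferences of similar customers rather than the product text.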
### Building a Direct Path to Winning the Minds and Wallets of Your Customers

Providing deep, effective personalized experiences to customers depends on a brand’s ability to intelligently leverage consumer and market data from a wide variety of sources to fuel faster, smarter decisions — without sacrificing accuracy for speed. The Databricks Lakehouse Platform is purpose-built for exactly that, offering a scalable data architecture that unifies all your data, analytics and AI to deliver unforgettable customer experiences.

Created on open source and open standards, Databricks offers a robust and cost-effective platform for brands to collaborate with partners, clients, manufacturers and distributors to unleash more innovation and efficiencies at every touch point. Businesses can rapidly ingest available data in real time, at scale, and create accessible, data-driven insights that enable actionable strategies across the value chain.

Databricks is a multicloud platform, designed for quick enterprise development. Teams using the Lakehouse can more effectively reveal the 360-degree view into their company’s operational health and the evolving needs of their customers — all while empowering teams to easily unify data efforts, perform fine-grained analyses and streamline cross-functional data operations using a single, sophisticated solution.

###### Learn more about Databricks Lakehouse for industries like Retail & Consumer Goods, Media & Entertainment and more at databricks.com/solutions
### About Databricks

Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor), [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/).

**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=563736421186&utm_term=databricks%20free%20trial&gclid=Cj0KCQjwpeaYBhDXARIsAEzItbHzQGCu2K58-lnVCepMI5MYP6jTXkgfvqmzwAMqrlVwVOniebOE43UaAk3OEALw_wcB)**

##### Contact us for a personalized demo: databricks.com/contact
# 2023 State of Data + AI

Powered by the Databricks Lakehouse

We’re in the golden age of data and AI

**INTRO**

In the 6 months since ChatGPT launched, the world has woken up to the vast potential of AI. The unparalleled pace of AI discoveries, model improvements and new products on the market puts data and AI strategy at the top of conversations across every organization around the world. We believe that AI will usher in the next generation of product and software innovation, and we’re already seeing this play out in the market. The next generation of winning companies and executives will be those who understand and leverage AI.

In this report, we examine patterns and trends in data and AI adoption across more than 9,000 global Databricks customers. By unifying business intelligence (BI) and AI applications across companies’ entire data estates, the Databricks Lakehouse provides a unique vantage point into the state of data and AI, including which products and technologies are the fastest growing, the types of data science and machine learning (DS/ML) applications being developed and more.

Here are the major stories we uncovered:

Companies are adopting machine learning and large language models (LLMs) at a rapid pace. Natural language processing (NLP) is dominating use cases, with an accelerated focus on LLMs.
Organizations are investing in data integration products as they prioritize more DS/ML initiatives. 50% of our fastest-growing products represent the data integration category.

Organizations are increasingly using the Lakehouse for data warehousing, as evidenced by the high growth of data integration tools dbt and Fivetran, and the accelerated adoption of Databricks SQL.

We hope that by sharing these trends, data leaders will be able to benchmark their organizations and gain insights that help inform their strategies for an era defined by data and AI.
**Summary of Key Findings**

**1. DATA SCIENCE AND MACHINE LEARNING: NLP AND LLMS ARE IN HIGH DEMAND**

- The number of companies using SaaS LLM APIs (used to access services like ChatGPT) has grown 1310% between the end of November 2022 and the beginning of May 2023
- NLP accounts for 49% of daily Python data science library usage, making it the most popular application
- Organizations are putting substantially more models into production (411% YoY growth) while also increasing their ML experimentation (54% YoY growth)
- Organizations are getting more efficient with ML; for every three experimental models, roughly one is put into production, compared to five experimental models a year prior

**FASTEST-GROWING DATA AND AI PRODUCTS**

- BI is the top data and AI market, but growth trends in other markets show that companies are increasingly looking at more advanced data use cases
- The fastest-growing data and AI product is dbt, which grew 206% YoY by number of customers
- Data integration is the fastest-growing data and AI market on the Databricks Lakehouse, with 117% YoY growth

**ADOPTION AND MIGRATION TRENDS**

- 61% of customers migrating to the Lakehouse are coming from on-prem and cloud data warehouses
- The volume of data in Delta Lake has grown 304% YoY
- The Lakehouse is increasingly being used for data warehousing, including serverless data warehousing with Databricks SQL, which grew 144% YoY

**Methodology: How did Databricks create this report?**

The _2023 State of Data + AI_ is built from fully aggregated, anonymized data collected from our customers based on how they are using the Databricks Lakehouse and its broad ecosystem of integrated tools. This report focuses on machine learning adoption, data architecture (integrations and migrations) and use cases.
The customers in this report represent every major industry and range in size from startups to many of the world’s largest enterprises.

Unless otherwise noted, this report presents and analyzes data from February 1, 2022, to January 31, 2023, and usage is measured by number of customers. When possible, we provide YoY comparisons to showcase growth trends over time.

**Data Science and Machine Learning: Natural language processing and large language models are in high demand**

Across all industries, companies leverage data science and machine learning (DS/ML) to accelerate growth, improve predictability and enhance customer experiences. Recent advancements in large language models (LLMs) are propelling companies to rethink AI within their own data strategies. Given the rapidly evolving DS/ML landscape, we wanted to understand several aspects of the market:

- Which types of DS/ML applications are companies investing in? In particular, given the recent buzz, what does the data around LLMs look like?
- Are companies making headway on operationalizing their machine learning models (MLOps)?

Figure: Specialized Python DS/ML libraries from February 2022 to January 2023, grouped by use case (natural language processing, simulations and optimizations, recommender systems, time series, speech recognition, industry data modeling, graph, geospatial, computer vision, and anomaly detection and segmentation).
Note: This chart reflects the unique number of notebooks using ML libraries per day in each of the categories. It includes libraries used for the particular problem-solving use cases mentioned. It does not include libraries used in tooling for data preparation and modeling.

**Natural language processing dominates machine learning use cases**

To understand how organizations are applying AI and ML within the Lakehouse, we aggregated the usage of specialized Python libraries, which include NLTK, Transformers and FuzzyWuzzy, into popular data science use cases.¹ We look at data from these libraries because Python is on the cutting edge of new developments in ML, advanced analytics and AI, and has consistently ranked as one of the [most popular programming languages](https://www.tiobe.com/tiobe-index/) in recent years.

Our most popular use case is natural language processing (NLP), a rapidly growing field that enables businesses to gain value from unstructured textual data. This opens the door for users to accomplish tasks that were previously too abstract for code, such as summarizing content or extracting sentiment from customer reviews. In our data set, 49% of libraries used are associated with NLP. LLMs also fall within this bucket.
Given the innovations launched in recent months, we expect to see NLP take off even more in coming years as it is applied to use cases like chatbots, research assistance, fraud detection, content generation and more.

**In our data set, 49% of specialized Python libraries used are associated with NLP.**

Our second most popular DS/ML application is simulations and optimization, which accounts for 30% of all use cases. This signals organizations are using data to model prototypes and solve problems cost-effectively.

Many of the DS/ML use cases are predominantly leveraged by specific industries. While they take up a smaller share of the total, they are mission-critical for many organizations. For example, time series includes forecasting, a use case that is especially popular in industries such as Retail and CPG, which rely heavily on the ability to forecast the need for every item in every store.

1. This data does not include general-purpose ML libraries, including scikit-learn or TensorFlow.
Figure: Use of large language models (LLMs), showing the growth in customers using transformer-related libraries, SaaS LLM APIs and LLM tools from February 2022 through May 2023, annotated with the ChatGPT launch (November 2022) and the Dolly launch (March 24, 2023). We have rolled these libraries up into groupings based on the type of functionality they provide.

Note: There are several popular types of Python libraries that are commonly used for LLMs. These libraries provide pretrained models and tools for building, training and deploying LLMs.
**Large language models are the “it” tool**

LLMs are currently one of the hottest and most-watched areas in the field of NLP. LLMs have been instrumental in enabling machines to understand, interpret and generate human language in a way that was previously impossible, powering everything from machine translation to content creation to virtual assistants and chatbots.

Transformer-related libraries have been growing in popularity even before ChatGPT thrust LLMs into the public consciousness. Within the last 6 months, our data shows two accelerating trends: organizations are building their own LLMs, which models like [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) show can be quite accessible and inexpensive, and they are using proprietary models like ChatGPT. Transformer-related libraries, such as Hugging Face, which are used to train LLMs, have the highest adoption within the Lakehouse.

The second most popular type is SaaS LLMs, which are used to access models like OpenAI’s. This category has grown exponentially in parallel with the [launch of ChatGPT](https://openai.com/blog/chatgpt): the number of Lakehouse customers using SaaS LLMs has grown an impressive 1310% between the end of November 2022 and the beginning of May 2023. (In contrast, transformer-related libraries grew 82% in this same period.)

Organizations can leverage LLMs either by using SaaS LLM APIs to call services like ChatGPT from OpenAI or by operating their own LLMs in-house. Thinking of building your own modern LLM application? This approach could entail the use of specialized transformer-related Python libraries to train the model, as well as LLM tools like LangChain to develop prompt interfaces or integrations to other systems.

**LLM DEFINITIONS**

- **Transformer-related libraries:** Python libraries used to train LLMs (example: Hugging Face)
- **SaaS LLM APIs:** Libraries used to access LLMs as a service (example: OpenAI)
- **LLM tools:** Toolchains for working with and building proprietary LLMs (example: LangChain)
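For context on what transformer-related library usage looks like in practice, here is a minimal sketch using the Hugging Face `transformers` pipeline API; it pulls the pipeline's default pretrained sentiment model, and the input text is illustrative.

```python
# Minimal sketch: applying a pretrained transformer model via the Hugging Face pipeline API.
from transformers import pipeline

classifier = pipeline("sentiment-analysis")  # downloads a default pretrained model
print(classifier("The checkout flow was fast and the recommendations were spot on."))
```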
**Machine learning experimentation and production take off across industries**

The increasing demand for ML solutions and the growing availability of technologies have led to a significant increase in experimentation and production, two distinct parts of the ML model lifecycle. We look at the _logging_ and _registering_ of models in MLflow, an open source platform developed by Databricks, to understand how ML is trending and being adopted within organizations.

**LOGGED MODELS AND ML EXPERIMENTATION**

During the experimentation phase of ML, data scientists develop models designed to solve given tasks. After training the models, they test them to evaluate their accuracy, precision, recall (the percentage of correctly predicted positive instances out of all actual positive instances) and more. These metrics are logged (recorded) in order to analyze the various models’ performance and identify which approach works best for the given task.
We have chosen logged models as a proxy to measure ML experimentation because the MLflow Tracking Server is designed to facilitate experiment tracking and reproducibility.

MLflow Model Registry launched in May 2021. Overall, the number of logged models has grown 54% since February 2022, while the number of registered models has grown 411% over the same period. This growth in volume suggests organizations are understanding the value of investing in and allocating more people power to ML.

**REGISTERED MODELS AND ML PRODUCTION**

Production models have undergone the experimentation phase and are then deployed in real-world applications. They are typically used to make predictions or decisions based on new data. Registering a model is the process of recording and storing metadata about a trained model in a centralized location that allows users to easily access and reuse existing models. Registering models prior to production enables organizations to ensure consistency and reliability in model deployment and scale.

We have chosen registered models to represent ML production because the MLflow Model Registry is designed to manage models that have left the experimentation phase through the rest of their lifecycle.

Data scientists typically experiment with several models before committing an ML model to production. We wanted to understand, “How many models do data scientists experiment with before moving to production?” Our data shows the ratio of logged to registered models is 2.9 : 1 as of January 2023. This means that for roughly every three experimental models, one model will get registered as a candidate for production. This ratio has improved significantly from just a year prior, when roughly one in five experimental models was registered. Recent advances in ML, such as improved open source libraries like MLflow and Hugging Face, have radically simplified building and putting models into production. The result is that 34% of logged models are now candidates for production today, an improvement from over 20% just a year ago.

Figure: Ratio of logged to registered models, February 2022 to January 2023 (2.9 : 1 as of January 2023).
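A minimal sketch of the distinction measured here: metrics and models are logged during experimentation, and a chosen model is then registered as a production candidate. The dataset, metrics and model name are illustrative; on Databricks the tracking server and registry are preconfigured.

```python
# Minimal sketch: log experiment metrics and models, then register a production candidate.
import mlflow
import mlflow.sklearn
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import precision_score, recall_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=500, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

# Experimentation: each run logs the trained model and its evaluation metrics.
with mlflow.start_run():
    model = RandomForestClassifier(random_state=0).fit(X_train, y_train)
    preds = model.predict(X_test)
    mlflow.log_metric("precision", precision_score(y_test, preds))
    mlflow.log_metric("recall", recall_score(y_test, preds))
    info = mlflow.sklearn.log_model(model, artifact_path="model")

# Production candidacy: register the logged model in the MLflow Model Registry.
mlflow.register_model(info.model_uri, "churn_classifier")  # illustrative model name
```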
**The Modern Data and AI Stack**

Over the last several years, the trend toward building open, unified data architectures has played out in our own data. We see that data leaders are opting to preserve choice, leverage the best products and deliver innovation across their organizations by democratizing access to data for more people.
**FASTEST-GROWING DATA AND AI PRODUCTS**

| Product | YoY growth by number of customers |
|---|---|
| dbt | 206% |
| Fivetran | 181% |
| Informatica | 174% |
| Qlik Data Integration | 152% |
| Esri | 145% |
| Looker | 141% |
| Hugging Face | 110% |
| Lytics | 101% |
| Great Expectations | 100% |
| Kepler.gl | 95% |

**DBT IS THE FASTEST-GROWING DATA AND AI PRODUCT OF 2023**

As companies move quickly to develop more advanced use cases with their data, they are investing in newer products that produce trusted data sets for reporting, ML modeling and operational workflows. Hence, we see the rapid rise of data integration products. dbt, a data transformation tool, and Fivetran, which automates data pipelines, are our two fastest-growing data and AI products. This suggests a new era of the data integration market, with challenger tools making headway as companies shift to prioritize DS/ML initiatives. With Great Expectations from Superconductive in the ninth spot, a full 50% of our fastest-growing products represent the data integration category.

Figure: Growth of data and AI markets by number of customers, February 2022 to January 2023, across business intelligence, data governance & security, data science & machine learning, and data integration.

Note: In this chart, we count the number of customers deploying one or more data and AI products in each category. These four categories do not encompass all products; Databricks products such as Unity Catalog are not included in this data.
**As business intelligence becomes standard, organizations invest in their machine learning foundation**

To understand how organizations are prioritizing their data initiatives, we aggregated all data and AI products on the Databricks Lakehouse and categorized them into four core markets: BI, data governance and security, DS/ML, and data integration. Our data set confirms that BI tools are more widely adopted across organizations relative to more nascent categories — and they continue to grow, with a 66% YoY increase in adoption. This aligns with the broader trend of more organizations performing data warehousing on a Lakehouse, covered in the next section, Views from the Lakehouse.

While BI is often where organizations start their data journey, companies are increasingly looking at more advanced data and AI use cases.

**DEMAND FOR DATA INTEGRATION PRODUCTS IS GROWING FAST**

We see the fastest growth in the data integration market. These tools enable a company to integrate vast amounts of upstream and downstream data in one consolidated view. Data integration products ensure that all BI and DS/ML initiatives are built on a solid foundation.

While it’s easier for smaller markets to experience faster growth, at 117% YoY increased adoption, the data integration market is growing substantially faster than BI. This trend dovetails with the rapid growth of ML adoption we see across the Lakehouse, covered in the DS/ML section of the report.

Data integration is the fastest-growing market, with 117% YoY growth.

**Views from the Lakehouse: Migration and data format trends**

Data migration is a major undertaking: it can be risky, expensive and delay companies’ timelines. It’s not a task to jump into lightly. As organizations run into the limitations, scalability challenges and the cost burden of legacy data platforms, they are increasingly likely to migrate to a new type of architecture.

**Migration trends: the best data warehouse is a Lakehouse**

The Lakehouse Platform is an attractive alternative to traditional data warehouses because it supports advanced use cases and DS/ML, allowing organizations to boost their overall data strategy. As evidenced by the most popular data and AI products, with BI and data integration tools at the top, organizations are increasingly using the data lakehouse for data warehousing.
To better understand which legacy\n", + "platforms organizations are moving away from,\n", + "\n", + "we look at the migrations of new customers\n", + "to Databricks.\n", + "\n", + "An interesting takeaway is that roughly half of the\n", + "companies moving to the Lakehouse are coming\n", + "from data warehouses. This includes the 22%\n", + "that are moving from cloud data warehouses.\n", + "It also demonstrates a growing focus on running\n", + "data warehousing workloads on a Lakehouse\n", + "and unifying data platforms to reduce cost.\n", + "\n", + "```\n", + " SOURCE OF NEW CUSTOMER \u0003\n", + "\n", + " MIGRATIONS TO DATABRICKS\n", + "\n", + "```\n", + "```\n", + "12%\n", + "\n", + "```\n", + "```\n", + "39%\n", + "\n", + "```\n", + "```\n", + "27%\n", + "\n", + "```\n", + "```\n", + "22%\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "Rising tides: the volume\n", + "\n", + "of data in Delta Lake\n", + "\n", + "has grown 304% YoY\n", + "\n", + "```\n", + "As the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf) , an increasingly\n", + "large proportion is in the form of semi-structured\n", + "and unstructured data. Previously, organizations\n", + "had to manage multiple different platforms for\n", + "their structured, unstructured and semi-structured\n", + "data, which caused unnecessary complexity and\n", + "high costs. The Lakehouse solves this problem by\n", + "providing a unified platform for all data types\n", + "and formats.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf2024-09-19T16:57:20Z
92675ce8cb8f76491cdb21da3fb3d4f7```\n", + "27%  22%\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "Rising tides: the volume\n", + "of data in Delta Lake\n", + "has grown 304% YoY\n", + "```\n", + "As the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf) , an increasingly\n", + "large proportion is in the form of semi-structured\n", + "and unstructured data. Previously, organizations\n", + "had to manage multiple different platforms for\n", + "their structured, unstructured and semi-structured\n", + "data, which caused unnecessary complexity and\n", + "high costs. The Lakehouse solves this problem by\n", + "providing a unified platform for all data types\n", + "and formats.\n", + "\n", + "Delta Lake is the foundation of the Databricks\n", + "Lakehouse. The Delta Lake format encompasses\n", + "structured, unstructured and semi-structured\n", + "data. Use has surged over the past 2 years.\n", + "When compared to the steady, flat or declining\n", + "growth in other storage formats (e.g., text, JSON\n", + "and CSV), our data shows that a growing number\n", + "of organizations are turning to Delta Lake to manage\n", + "their data. In June 2022, Delta Lake surpassed\n", + "Parquet as the most popular data lake source,\n", + "reaching 304% YoY growth.\n", + "\n", + "[Chart: Volume of Data Managed, by Storage Format (Delta, Parquet, Text, CSV, JSON, ORC, Avro), Jan 2019 - Jan 2023]\n", + "\n", + "-----\n", + "\n", + "```\n", + "Data warehousing is growing,\n", + "with emphasis on serverless\n", + "```\n", + "\n", + "Over the past 2 years, companies have vastly increased their usage\n", + "of data warehousing on the Lakehouse Platform. This is especially\n", + "demonstrated by use of Databricks SQL — the serverless data\n", + "warehouse on the Lakehouse — which shows 144% YoY growth.\n", + "This suggests that organizations are increasingly ditching traditional\n", + "data warehouses and are able to perform all their BI and analytics\n", + "on a Lakehouse.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf2024-09-19T16:57:20Z
89431109f90bb45a304efc01edc3afa4-----\n", + "\n", + "```\n", + "Data warehousing is growing,\n", + "with emphasis on serverless\n", + "```\n", + "\n", + "Over the past 2 years, companies have vastly increased their usage\n", + "of data warehousing on the Lakehouse Platform. This is especially\n", + "demonstrated by use of Databricks SQL — the serverless data\n", + "warehouse on the Lakehouse — which shows 144% YoY growth.\n", + "This suggests that organizations are increasingly ditching traditional\n", + "data warehouses and are able to perform all their BI and analytics\n", + "on a Lakehouse.\n", + "\n", + "[Chart: Data Warehousing on Lakehouse with Databricks SQL - number of customers, Jan 2021 - Jan 2023]\n", + "\n", + "Note: There is a spike in October 2021 as a result of the ungated preview\n", + "launch of Databricks SQL, followed by General Availability in December 2021.\n", + "Data consistently dips in the last week of December due to seasonality.\n", + "\n", + "-----\n", + "\n", + "CONCLUSION\n", + "```\n", + "Generation AI\n", + "```\n", + "We’re excited that companies are progressing into more\n", + "advanced ML and AI use cases, and the modern data and\n", + "AI stack is evolving to keep up. Along with the rapid growth\n", + "of data integration tools (including our fastest-growing,\n", + "dbt), we’re seeing the rapid rise of NLP and LLM usage in\n", + "our own data set, and there’s no doubt that the next few\n", + "years will see an explosion in these technologies. It’s never\n", + "been more clear: the companies that harness the power\n", + "of DS/ML will lead the next generation of data.\n", + "\n", + "-----\n", + "\n", + "```\n", + "About Databricks\n", + "```\n", + "Databricks is the data and AI company. More than 9,000\n", + "organizations worldwide — including Comcast, Condé Nast, and\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "headquartered in San Francisco, with offices around the globe.\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "the world’s toughest problems. To learn more, follow Databricks\n", + "on Twitter, LinkedIn and Instagram.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf2024-09-19T16:57:20Z
1f170065560005166ed3bfde9a20232c-----\n", + "\n", + "```\n", + "About Databricks\n", + "\n", + "```\n", + "Databricks is the data and AI company. More than 9,000\n", + "organizations worldwide — including Comcast, Condé Nast, and\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "headquartered in San Francisco, with offices around the globe.\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "the world’s toughest problems. To learn more, follow Databricks\n", + "on Twitter, LinkedIn and Instagram.\n", + "\n", + "[DISCOVER LAKEHOUSE](https://www.databricks.com/product/data-lakehouse)\n", + "\n", + "© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation | Terms of Use\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf2024-09-19T16:57:20Z
e7bcc94606d0aa9fb64905e8016a9a01**EBOOK**\n", + "\n", + "## Why the Data Lakehouse Is Your Next Data Warehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "Preface .......................................................................................................................................................................................................................................... **3**\n", + "\n", + "Introduction ............................................................................................................................................................................................................................. **4**\n", + "\n", + "Our Approach: The Databricks Lakehouse Platform ................................................................................................................................... **5**\n", + "\n", + "Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse ...................................................................................... **6**\n", + "\n", + "Why Databricks SQL? ............................................................................................................................................................................................... 6SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
7fe00d5363b0ac4a160402a365eaf09dWhy Databricks SQL? ............................................................................................................................................................................................... 6\n", + "\n", + "Common use cases .................................................................................................................................................................................................... 7\n", + "\n", + "The Inner Workings of the Lakehouse ................................................................................................................................................................... **8**\n", + "\n", + "**PA R T 1 :** Storage layer .............................................................................................................................................................................................. 8\n", + "\n", + "**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
65670d54f6fb0335688832cdcd3c89e2**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13\n", + "\n", + "**PA R T 3 :** Consumption layer ................................................................................................................................................................................ 19\n", + "\n", + "Conclusion ............................................................................................................................................................................................................................. **24**\n", + "\n", + "Customer Stories ............................................................................................................................................................................................................... **25**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Preface\n", + "\n", + "Historically, data teams have had to resort to a bifurcated architecture to run traditional\n", + "BI and analytics workloads, copying subsets of the data already stored in their data lake\n", + "to a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\n", + "governance inherent in proprietary architectures.\n", + "\n", + "Our customers have asked us to simplify their data architecture. We decided to accelerate\n", + "our investments to do just that.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
59fb84c0af2dbaa94d16d9ab2bdedaeb-----\n", + "\n", + "### Preface\n", + "\n", + "Historically, data teams have had to resort to a bifurcated architecture to run traditional\n", + "BI and analytics workloads, copying subsets of the data already stored in their data lake\n", + "to a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\n", + "governance inherent in proprietary architectures.\n", + "\n", + "Our customers have asked us to simplify their data architecture. We decided to accelerate\n", + "our investments to do just that.\n", + "\n", + "\n", + "We introduced [Databricks SQL](https://databricks.com/product/databricks-sql) to simplify and provide data warehousing capabilities and\n", + "first-class support for SQL on the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) , for all your existing tools.\n", + "We use the term “lakehouse” to reflect our customers’ desire to combine the best of data\n", + "warehouses and data lakes. With the lakehouse, you can now establish one source of truth\n", + "for all data and enable all workloads from AI to BI on one platform. And we want to provide\n", + "you with ease-of-use and state-of-the-art performance at the lowest cost.\n", + "\n", + "\n", + "**Reynold Xin**\n", + "\n", + "Original Creator of Apache Spark, TM\n", + "Co-founder and Chief Architect,\n", + "Databricks\n", + "\n", + "\n", + "This eBook covers how we went back to the drawing board to build Databricks SQL — the\n", + "last mile of enabling data warehousing capabilities for your existing data lakes — as part of\n", + "the Databricks Lakehouse Platform.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "Most organizations operate their business with a complex data architecture that\n", + "combines data warehouses and data lakes. For one thing, data lakes are great\n", + "for machine learning (ML). They support open formats and a large ecosystem.\n", + "But data lakes have poor support for business intelligence (BI) and suffer\n", + "complex data quality problems. Data warehouses, on the other hand, are great\n", + "for BI applications. But they have limited support for ML workloads, can’t handle\n", + "natural language data, large-scale structured data, or raw, video, audio or image\n", + "files, and are proprietary systems with only a SQL interface.\n", + "\n", + "As a result, data is moved around the organization through data pipelines and\n", + "systems that create a multitude of data silos. A large amount of time is spent\n", + "maintaining these pipelines and systems rather than creating new value from\n", + "data, and downstream consumers struggle to get a single source of truth of the\n", + "data due to the inherent siloing of data that takes place. The situation becomes\n", + "very expensive, and decision-making speed and quality are negatively affected.\n", + "\n", + "Unifying these systems can be transformational in how we think about data.\n", + "\n", + "\n", + "##### The need for simplification\n", + "\n", + "It is time for a new data architecture that can meet both today’s and tomorrow’s\n", + "needs. Without any compromise. Advanced analytics and ML are one of the\n", + "most strategic priorities for data-driven organizations today, and the amount\n", + "of unstructured data is growing exponentially. So it makes sense to position\n", + "the data lake as the center of the data infrastructure. 
However, for this to be\n", + "achievable, the data lake needs to adopt the strengths of data warehouses.\n", + "\n", + "The answer is the [lakehouse](https://databricks.com/blog/2021/05/19/evolution-to-the-data-lakehouse.html) , an open data architecture enabled by a new open\n", + "and standardized system design: one that implements data structure and data\n", + "management features similar to those in a data warehouse, directly on the lowcost storage used for data lakes.\n", + "\n", + "**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
8b67ad732685ff90448a1005931ea52e**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**\n", + "\n", + "##### Building the Data Lakehouse\n", + "[Bill Immon, Father of the Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)\n", + "\n", + "\n", + "-----\n", + "\n", + "### Our Approach: The Databricks Lakehouse Platform\n", + "\n", + "Our customers have asked us for simplification. This is why we’ve embarked on\n", + "this journey to deliver one simple, open and collaborative platform for all your\n", + "data, AI and BI workloads on your existing data lakes.\n", + "\n", + "The [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) greatly simplifies data architectures by\n", + "combining the data management and performance typically found in data\n", + "warehouses with the low-cost, flexible object stores offered by data lakes.\n", + "\n", + "It’s built on open source and open standards to maximize flexibility, and lets you\n", + "store all your data — structured, semi-structured and unstructured — in your\n", + "existing data lake while still getting the data quality, performance, security and\n", + "governance you’d expect from a data warehouse. Data only needs to exist once\n", + "to support all of your data, AI and BI workloads on one common platform\n", + "— establishing one source of truth.\n", + "\n", + "Finally, the Lakehouse Platform provides tailored and collaborative\n", + "experiences so data engineers, data scientists and analysts can work together\n", + "on one common platform across the entire data lifecycle — from ingestion to\n", + "consumption and the serving of data products — and innovate faster.\n", + "\n", + "Let’s look at how, with the right data structures and data management\n", + "capabilities in place, we can now deliver data warehouse and analytics\n", + "capabilities on your lakehouse. That’s where Databricks SQL (DB SQL) comes in.\n", + "\n", + "**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n", + "\n", + "\n", + "Databricks SQL is a serverless data warehouse on the Databricks Lakehouse\n", + "Platform that lets you run all your SQL and BI applications at scale with up to 12x\n", + "better price/performance, a unified governance model, open formats and APIs,\n", + "and your tools of choice — no vendor lock-in. Reduce resource management\n", + "overhead with serverless compute, and easily ingest, transform and query\n", + "all your data in place to deliver real-time business insights faster. 
In fact, DB\n", + "SQL now holds the new world record in 100TB TPC-DS, the gold standard\n", + "performance benchmark for data warehousing.\n", + "\n", + "Built on open standards and APIs, the lakehouse provides an open, simplified and\n", + "multicloud architecture that brings the best of data warehousing and data lakes\n", + "together, and integrations with a rich ecosystem for maximum flexibility.\n", + "\n", + "\n", + "##### Why Databricks SQL?\n", + "\n", + "Best Price/Performance\n", + "Lower costs, get world-class performance, and eliminate the need to manage,\n", + "configure or scale cloud infrastructure with serverless.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
724e4a72a8b4f9ddd30bc0de869473b7### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n", + "\n", + "\n", + "Databricks SQL is a serverless data warehouse on the Databricks Lakehouse\n", + "Platform that lets you run all your SQL and BI applications at scale with up to 12x\n", + "better price/performance, a unified governance model, open formats and APIs,\n", + "and your tools of choice — no vendor lock-in. Reduce resource management\n", + "overhead with serverless compute, and easily ingest, transform and query\n", + "all your data in place to deliver real-time business insights faster. In fact, DB\n", + "SQL now holds the new world record in 100TB TPC-DS, the gold standard\n", + "performance benchmark for data warehousing.\n", + "\n", + "Built on open standards and APIs, the lakehouse provides an open, simplified and\n", + "multicloud architecture that brings the best of data warehousing and data lakes\n", + "together, and integrations with a rich ecosystem for maximum flexibility.\n", + "\n", + "\n", + "##### Why Databricks SQL?\n", + "\n", + "Best Price/Performance\n", + "Lower costs, get world-class performance, and eliminate the need to manage,\n", + "configure or scale cloud infrastructure with serverless.\n", + "\n", + "Built-In Governance\n", + "Establish one single copy for all your data using open standards, and one unified\n", + "governance layer across all data teams using standard SQL.\n", + "\n", + "Rich Ecosystem\n", + "Use SQL and any tool like Fivetran, dbt, Power BI or Tableau along with Databricks\n", + "to ingest, transform and query all your data in place.\n", + "\n", + "Break Down Silos\n", + "Empower every analyst to access the latest data faster for downstream real-time\n", + "analytics, and go effortlessly from BI to ML.\n", + "\n", + "**[WATCH A DEMO](https://databricks.com/discover/demos/databricks-sql)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Common use cases\n", + "\n", + "Thousands of customers like [Atlassian](https://www.google.com/search?q=atlassian+databricks+keynote&oq=atlassian+databricks+keynote&aqs=chrome..69i57j69i60j69i65l3j69i60j69i64l2.6409j0j1&sourceid=chrome&ie=UTF-8#:~:text=12%3A26,May%2026%2C%202021) , [SEGA](https://youtu.be/SzeXHcwPDSE) and [Punchh](https://databricks.com/customers-4/punchh) are using Databricks SQL to enable self-served analytics\n", + "for hundreds of analysts across their organizations, and to build custom data applications to better serve their\n", + "customers. Below are some examples of use cases for Databricks SQL.\n", + "\n", + "**At Atlassian, we have proven**\n", + "\n", + "\n", + "**Query data lake data with** **Collaboratively explore** **Build rich and custom**\n", + "**your BI tools of choice** **the freshest data** **data applications**\n", + "\n", + "\n", + "**that there is no longer a need**\n", + "\n", + "**for two separate data things.**\n", + "\n", + "**Technology has advanced**\n", + "\n", + "**far enough for us to consider**\n", + "\n", + "**one single unified lakehouse**\n", + "\n", + "**architecture.**\n", + "\n", + "**Rohan Dhupelia**\n", + "Data Platform Senior Manager,\n", + "Atlassian\n", + "\n", + "\n", + "Enable business analysts to\n", + "directly query data lake data\n", + "using their favorite BI tool and\n", + "avoid data silos. 
Reengineered\n", + "and optimized connectors\n", + "ensure fast performance,\n", + "low latency and high user\n", + "concurrency to your data lake.\n", + "Now analysts can use the best\n", + "tool for the job on one single\n", + "source of truth for your data.\n", + "\n", + "\n", + "Empower every analyst and SQL\n", + "professional in your organization\n", + "to quickly find and share new\n", + "insights by providing them with\n", + "a collaborative and self-served\n", + "analytics experience. Confidently\n", + "manage data permissions with\n", + "fine-grained governance, share and\n", + "reuse queries, and quickly analyze\n", + "and share results using interactive\n", + "visualizations and dashboards.\n", + "\n", + "\n", + "Build more effective and\n", + "tailored data applications\n", + "for your own organization or\n", + "your customers. Benefit from\n", + "the ease of connectivity,\n", + "management and better price/\n", + "performance of DB SQL to\n", + "simplify development of dataenhanced applications at scale,\n", + "all served from your data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "### The Inner Workings of the LakehouseSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
5aaea8ef9ed092a8eb41615a06d863b1**one single unified lakehouse**\n", + "\n", + "**architecture.**\n", + "\n", + "**Rohan Dhupelia**\n", + "Data Platform Senior Manager,\n", + "Atlassian\n", + "\n", + "\n", + "Enable business analysts to\n", + "directly query data lake data\n", + "using their favorite BI tool and\n", + "avoid data silos. Reengineered\n", + "and optimized connectors\n", + "ensure fast performance,\n", + "low latency and high user\n", + "concurrency to your data lake.\n", + "Now analysts can use the best\n", + "tool for the job on one single\n", + "source of truth for your data.\n", + "\n", + "\n", + "Empower every analyst and SQL\n", + "professional in your organization\n", + "to quickly find and share new\n", + "insights by providing them with\n", + "a collaborative and self-served\n", + "analytics experience. Confidently\n", + "manage data permissions with\n", + "fine-grained governance, share and\n", + "reuse queries, and quickly analyze\n", + "and share results using interactive\n", + "visualizations and dashboards.\n", + "\n", + "\n", + "Build more effective and\n", + "tailored data applications\n", + "for your own organization or\n", + "your customers. Benefit from\n", + "the ease of connectivity,\n", + "management and better price/\n", + "performance of DB SQL to\n", + "simplify development of dataenhanced applications at scale,\n", + "all served from your data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "### The Inner Workings of the Lakehouse\n", + "\n", + "\n", + "In the next chapter, we’ll unpack the three foundational layers of the Databricks\n", + "Lakehouse Platform and how we went back to the drawing board to build this\n", + "experience. Specifically, we’ll dive into how we built Databricks SQL to deliver\n", + "analytics and data warehousing workloads on your lakehouse.\n", + "\n", + "\n", + "Those layers are:\n", + "\n", + "**1 .** The storage layer, or how we store and govern data\n", + "\n", + "**2 .** The compute layer, or how we process queries\n", + "\n", + "**3 .** The consumption layer, or the tools you can use to interface with the system\n", + "\n", + "\n", + "###### PART 1: STORAGE LAYER\n", + "\n", + "In order to bring the best of data lakes and data\n", + "warehouses, we needed to support the openness\n", + "and flexibility of data lakes, as well as the quality,\n", + "performance and governance you’d expect from a\n", + "data warehouse.\n", + "\n", + "\n", + "**Storage layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake Open format|Data Warehouse Closed, proprietary format|Data Lakehouse Open format|\n", + "|---|---|---|\n", + "|Low quality, “data swamp”|High-quality, reliable data|High-quality, reliable data|\n", + "|File-level access control|Fine-grained governance (tables row/columnar level)|Fine-grained governance (tables row/columnar level)|\n", + "|All data types|Structured only|All data types|\n", + "|Requires manually specifying how to lay out data|Automatically lays out data to query efficiently|Automatically lays out data to query efficiently|\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Transactional guarantees for your data lake\n", + "\n", + "\n", + "The open source format [Delta Lake](https://delta.io/) — based on Parquet — solves historical data\n", + "lake challenges around data quality and reliability. 
It is the foundation for the\n", + "lakehouse, and Databricks SQL stores and processes data using Delta Lake.\n", + "\n", + "For example, it provides ACID transactions to ensure that every operation either\n", + "fully succeeds or fully aborts for later retries — without requiring new data\n", + "pipelines to be created. It unifies batch and streaming pipelines so you can\n", + "easily merge existing and new data at the speed required for your business. With\n", + "Time Travel, Delta Lake automatically records all past transactions, so it’s easy\n", + "to access and use previous versions of your data for compliance needs or for\n", + "ML applications. Advanced indexing, caching and auto-tuning allow optimization\n", + "of Delta tables for the best query performance. Delta Lake also acts as the\n", + "foundation for fine-grained, role-based access controls on the lakehouse.\n", + "\n", + "As a result, Delta Lake allows you to treat tables in Databricks SQL just like you\n", + "treat tables in a database: updates, inserts and merges can take place with high\n", + "performance at the row level. This is particularly useful if you are inserting new\n", + "\n", + "\n", + "data rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n", + "(e.g., for compliance laws such as GDPR). Furthermore, Delta Lake provides you\n", + "with one open and standard format — not only for SQL but also for Python, Scala\n", + "and other languages — so you can run all analytical and ML use cases on the\n", + "same data.\n", + "\n", + "**Delta Lake provides the key**\n", + "\n", + "An open format storage layer built for lake-first architectureSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
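The Delta Lake behavior described above (row-level updates, merges and Time Travel on one open table format) maps to plain Spark SQL. A minimal sketch follows, assuming a Databricks notebook where `spark` is already defined; the table `main.sales.customers`, the temp view `updates_view` and the version number are hypothetical illustrations, not objects from this repo.

```python
# Minimal sketch of the Delta Lake capabilities described above.
# Assumes a Databricks notebook where `spark` is already defined and a
# hypothetical Unity Catalog table `main.sales.customers` exists.

# Row-level deletes (e.g., GDPR-style redaction) run directly against the
# Delta table, with no separate pipeline or full table rewrite.
spark.sql("""
  DELETE FROM main.sales.customers
  WHERE customer_id = 'c-123'
""")

# MERGE unifies batch and streaming upserts into one statement.
# `updates_view` is a hypothetical temp view holding the incoming records.
spark.sql("""
  MERGE INTO main.sales.customers AS target
  USING updates_view AS source
    ON target.customer_id = source.customer_id
  WHEN MATCHED THEN UPDATE SET *
  WHEN NOT MATCHED THEN INSERT *
""")

# Time Travel: query the table as of an earlier version for audits or
# reproducible ML training sets.
previous = spark.sql("SELECT * FROM main.sales.customers VERSION AS OF 12")
previous.show()
```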
0f0b7a883ea83d964c3f3442178a9514As a result, Delta Lake allows you to treat tables in Databricks SQL just like you\n", + "treat tables in a database: updates, inserts and merges can take place with high\n", + "performance at the row level. This is particularly useful if you are inserting new\n", + "\n", + "\n", + "data rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n", + "(e.g., for compliance laws such as GDPR). Furthermore, Delta Lake provides you\n", + "with one open and standard format — not only for SQL but also for Python, Scala\n", + "and other languages — so you can run all analytical and ML use cases on the\n", + "same data.\n", + "\n", + "**Delta Lake provides the key**\n", + "\n", + "An open format storage layer built for lake-first architecture\n", + "\n", + "ACID transactions, Time Travel, highly available\n", + "\n", + "Advanced indexing, caching, auto-tuning\n", + "\n", + "Fine-grained, role-based access controls\n", + "\n", + "Streaming & batch, analytics & ML\n", + "\n", + "Python, SQL, R, Scala\n", + "\n", + "Delta Lake brings data quality, performance and governance to the lakehouse\n", + "\n", + "**[DOWNLOAD NOW](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)**\n", + "##### Delta Lake: The Definitive Guide\n", + "[by O’Reilly](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)\n", + "\n", + "\n", + "-----\n", + "\n", + "##### A framework for building a curated data lake\n", + "\n", + "\n", + "With the ability to ingest petabytes of data with auto-evolving schemas, Delta\n", + "Lake helps turn raw data into actionable data by incrementally and efficiently\n", + "processing data as it arrives from files or streaming sources like Kafka, Kinesis,\n", + "Event Hubs, DBMS and NoSQL. It can also automatically and efficiently track data\n", + "as it arrives with no manual intervention, as well as infer schema, detect column\n", + "changes for structured and unstructured data formats, and prevent data loss by\n", + "rescuing data columns that don’t meet data quality specifications. And now with\n", + "[Partner Connect](https://www.databricks.com/partnerconnect) , it’s never been easier to bring in critical business data from\n", + "various sources.\n", + "\n", + "As you refine the data, you can add more structure to it. Databricks recommends\n", + "the Bronze, Silver and Gold pattern. It lets you easily merge and transform new\n", + "and existing data — in batch or streaming — while benefiting from the low-cost,\n", + "flexible object storage offered by data lakes. Bronze is the initial landing zone\n", + "for the pipeline. We recommend copying data that’s as close to its raw form as\n", + "possible to easily replay the whole pipeline from the beginning, if needed. Silver\n", + "is where the raw data gets cleansed (think data quality checks), transformed\n", + "and potentially enriched with external data sets. Gold is the production-grade\n", + "data that your entire company can rely on for business intelligence, descriptive\n", + "statistics, and data science/machine learning.\n", + "\n", + "\n", + "By the time you get to Gold, the tables are high-value business-level metrics\n", + "that have all the schema enforcement and constraints applied. 
This way, you can\n", + "retain the flexibility of the data lake at the Bronze and Silver levels, and then use\n", + "the Gold level for high-quality business data.\n", + "\n", + "[Diagram: Auto Loader, Structured Streaming, Batch, COPY INTO and Partners feed the Bronze layer (raw ingestion and history), which flows to Silver (filtered, cleaned and augmented) and then to Gold (business-level aggregates)]\n", + "\n", + "**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**\n", + "\n", + "-----\n", + "\n", + "##### An aside on batch and streaming data pipelines\n", + "\n", + "The best way to set up and run data pipelines in the Bronze/Silver/Gold\n", + "pattern recommended on the previous page is in Delta Live Tables (DLT).\n", + "DLT makes it easy to build and manage reliable batch and streaming\n", + "data pipelines that deliver high-quality data. It helps data engineering\n", + "teams simplify ETL development and management with declarative\n", + "pipeline development, automatic data testing, and deep visibility for\n", + "monitoring and recovery.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
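To make the Bronze/Silver/Gold flow above concrete, here is a minimal Delta Live Tables sketch of the same pattern. It only runs inside a DLT pipeline, and the landing path, table names, columns and data-quality expectation are hypothetical placeholders rather than pipelines from this cookbook.

```python
# Minimal Delta Live Tables sketch of the Bronze/Silver/Gold pattern described above.
# Runs only inside a DLT pipeline; path, table and column names are hypothetical.
import dlt
from pyspark.sql import functions as F


@dlt.table(comment="Bronze: raw files ingested incrementally with Auto Loader")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/Volumes/main/raw/orders/")  # hypothetical landing path
    )


@dlt.table(comment="Silver: cleansed and typed records")
@dlt.expect_or_drop("valid_order_id", "order_id IS NOT NULL")
def orders_silver():
    return (
        dlt.read_stream("orders_bronze")
        .withColumn("order_ts", F.to_timestamp("order_ts"))
        .dropDuplicates(["order_id"])
    )


@dlt.table(comment="Gold: business-level daily aggregates")
def orders_gold():
    return (
        dlt.read("orders_silver")
        .groupBy(F.to_date("order_ts").alias("order_date"))
        .agg(F.count("*").alias("num_orders"), F.sum("amount").alias("revenue"))
    )
```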
dbe58a2ebf2c1f9dbd2e28a0d617b81eAuto Loader\n", + "\n", + "\n", + "BRONZE\n", + "\n", + "\n", + "SILVER GOLD\n", + "\n", + "\n", + "Structured Streaming\n", + "\n", + "Batch\n", + "\n", + "COPY INTO\n", + "\n", + "Partners\n", + "\n", + "\n", + "Raw ingestion Filtered, cleaned Business-level\n", + "and history and augmented aggregates\n", + "\n", + "|Col1|Col2|\n", + "|---|---|\n", + "||R|\n", + "\n", + "\n", + "**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**\n", + "\n", + "\n", + "-----\n", + "\n", + "##### An aside on batch and streaming data pipelines\n", + "\n", + "\n", + "The best way to set up and run data pipelines in the Bronze/Silver/Gold\n", + "pattern recommended on the previous page is in Delta Live Tables (DLT).\n", + "DLT makes it easy to build and manage reliable batch and streaming\n", + "data pipelines that deliver high-quality data. It helps data engineering\n", + "teams simplify ETL development and management with declarative\n", + "pipeline development, automatic data testing, and deep visibility for\n", + "monitoring and recovery.\n", + "\n", + "The fact that you can run all your batch and streaming pipelines together\n", + "in one simple, declarative framework makes data engineering easy on the\n", + "Databricks Lakehouse Platform. We regularly talk to customers who have\n", + "been able to reduce pipeline development time from weeks — or months\n", + "— to mere minutes with Delta Live Tables. And by the way, even data\n", + "\n", + "\n", + "analysts can easily interrogate DLT pipelines for the queries they need\n", + "to run, without knowing any sort of specialized programming language\n", + "or niche skills.\n", + "\n", + "One of the top benefits of DLT, and Delta Lake in general, is that it is built\n", + "with streaming pipelines in mind. Today, the world operates in real time, and\n", + "businesses are increasingly expected to analyze and respond to their data in\n", + "real time. With streaming data pipelines built on DLT, analysts can easily access,\n", + "query and analyze data with greater accuracy and actionability than with\n", + "conventional batch processing. Delta Live Tables makes real-time analytics a\n", + "reality for our customers.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Fine-grained governance on the lakehouse\n", + "\n", + "Delta Lake is the foundation for open and secure [data sharing](https://databricks.com/blog/2021/05/26/introducing-delta-sharing-an-open-protocol-for-secure-data-sharing.html) and governance\n", + "on the lakehouse. It underpins the [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (in preview), which\n", + "provides fine-grained governance across clouds, data and ML assets. Among the\n", + "benefits of the Unity Catalog, it allows you to:\n", + "\n", + "**• Discover, audit and govern data assets in one place:** A user-friendly\n", + "interface, automated data lineage across tables, columns, notebooks,\n", + "workflows and dashboards, role-based security policies, table or\n", + "column-level tags, and central auditing capabilities make it easy for\n", + "data stewards to discover, manage and secure data access to meet\n", + "compliance and privacy needs directly on the lakehouse.\n", + "\n", + "\n", + "\n", + "**• Grant and manage permissions using SQL:** Unity Catalog brings finegrained centralized governance to data assets across clouds through the\n", + "open standard SQL DCL. 
This means database administrators can easily\n", + "grant permission to arbitrary, user-specific views, or set permissions on\n", + "all columns tagged together, using familiar SQL.\n", + "\n", + "**• Centrally manage and audit shared data across organizations:** Every\n", + "organization needs to share data with customers, partners and suppliers\n", + "to better collaborate and to unlock value from their data. Unity Catalog\n", + "builds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\n", + "shared assets within and across organizations.\n", + "\n", + "\n", + "The Unity Catalog makes it easy for data stewards to discover, manage and secure data access\n", + "to meet compliance and privacy needs on the lakehouse.\n", + "\n", + "**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "###### PART 2: COMPUTE LAYER\n", + "\n", + "\n", + "The next layer to look at is the compute layer, or how we process queries.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
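The Unity Catalog passage above describes granting permissions with standard SQL DCL. A minimal sketch of that pattern, run from a notebook where `spark` is defined, might look like the following; the catalog, schema, view and `analysts` group are hypothetical.

```python
# Minimal sketch of Unity Catalog-style grants using standard SQL DCL,
# as described above. Catalog/schema/view names and the `analysts` group
# are hypothetical placeholders.

# Let a group browse a schema and read a curated, redacted view
# (views are granted like tables).
spark.sql("GRANT USE CATALOG ON CATALOG main TO `analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA main.sales TO `analysts`")
spark.sql("GRANT SELECT ON TABLE main.sales.eu_customers_redacted TO `analysts`")

# Review what principals can currently do on the schema.
spark.sql("SHOW GRANTS ON SCHEMA main.sales").show(truncate=False)
```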
e75286e60b2d30c5d866b93f23073de9**• Centrally manage and audit shared data across organizations:** Every\n", + "organization needs to share data with customers, partners and suppliers\n", + "to better collaborate and to unlock value from their data. Unity Catalog\n", + "builds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\n", + "shared assets within and across organizations.\n", + "\n", + "\n", + "The Unity Catalog makes it easy for data stewards to discover, manage and secure data access\n", + "to meet compliance and privacy needs on the lakehouse.\n", + "\n", + "**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "###### PART 2: COMPUTE LAYER\n", + "\n", + "\n", + "The next layer to look at is the compute layer, or how we process queries.\n", + "\n", + "Apache Spark TM has been the de facto standard for data lake compute. It’s great\n", + "for processing terabytes and petabytes of data cheaply, but historically Spark\n", + "SQL uses a nonstandard syntax and can be difficult to configure.\n", + "\n", + "\n", + "Data warehouses, on the other hand, tend to support short running queries\n", + "really well, especially when you have a lot of users issuing queries concurrently.\n", + "They tend to be easier to set up, but don’t necessarily scale or they become\n", + "too costly.\n", + "\n", + "\n", + "**Compute layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake High performance for large jobs (TBs to PBs)|Data Warehouse High concurrency|Data Lakehouse High performance for large jobs (TBs to PBs)|\n", + "|---|---|---|\n", + "|Economical|Scaling is exponentially more expensive|Economical|\n", + "|High operational complexity|Ease of use|Ease of use|\n", + "||||\n", + "\n", + "\n", + "A popular belief is that large workloads require a drastically different system\n", + "than low latency, high concurrency workloads. For example, there’s the classic\n", + "trade-off in computer systems between latency and throughput.\n", + "\n", + "But after spending a lot of time analyzing these systems, we found that it was\n", + "possible to simultaneously improve large query performance and concurrency\n", + "\n", + "\n", + "and latency. Although the classic trade-offs definitely existed, they were only\n", + "explicit when we optimized the system to the very theoretical optimal. It turned\n", + "out the vast majority of software — and this includes all data warehouse systems\n", + "and Databricks — were far away from optimal.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Simplified administration and instant, elastic SQL compute — decoupled from storage\n", + "\n", + "\n", + "To achieve world-class performance for analytics on the lakehouse, we chose to\n", + "completely rebuild the compute layer. But performance isn’t everything. We also\n", + "want it to be simple to administer and cheaper to use. Databricks SQL leverages\n", + "serverless SQL warehouses that let you get started in seconds, and it’s powered\n", + "by a new native MPP vectorized engine: Photon.\n", + "\n", + "Databricks SQL warehouses are optimized and elastic SQL compute resources.\n", + "Just pick the cluster size and Databricks automatically determines the best\n", + "instance types and VMs configuration for the best price/performance. 
This\n", + "means you don’t have to worry about estimating peak demand or paying too\n", + "much by overprovisioning. You just need to click a few buttons to operate.\n", + "To further streamline the experience, simply use [Databrick SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) .\n", + "With the serverless capability, queries start rapidly with zero infrastructure\n", + "management or configuration overhead. This lowers your total cost, as you pay\n", + "only for what you consume without idle time or overprovisioned resources.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
45c780a084216b7bdf44ffa987309653To achieve world-class performance for analytics on the lakehouse, we chose to\n", + "completely rebuild the compute layer. But performance isn’t everything. We also\n", + "want it to be simple to administer and cheaper to use. Databricks SQL leverages\n", + "serverless SQL warehouses that let you get started in seconds, and it’s powered\n", + "by a new native MPP vectorized engine: Photon.\n", + "\n", + "Databricks SQL warehouses are optimized and elastic SQL compute resources.\n", + "Just pick the cluster size and Databricks automatically determines the best\n", + "instance types and VMs configuration for the best price/performance. This\n", + "means you don’t have to worry about estimating peak demand or paying too\n", + "much by overprovisioning. You just need to click a few buttons to operate.\n", + "To further streamline the experience, simply use [Databrick SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) .\n", + "With the serverless capability, queries start rapidly with zero infrastructure\n", + "management or configuration overhead. This lowers your total cost, as you pay\n", + "only for what you consume without idle time or overprovisioned resources.\n", + "\n", + "\n", + "Since CPU clock speeds have plateaued, we also wanted to find new ways to\n", + "process data faster, beyond raw compute power. One of the most impactful\n", + "methods has been to improve the amount of data that can be processed in\n", + "parallel. However, data processing engines need to be specifically architected to\n", + "take advantage of this parallelism. So, from the ground up, we built [Photon](https://databricks.com/product/photon) , a new\n", + "C++ based vectorized query processing engine that dramatically improves query\n", + "performance while remaining fully compatible with open Spark APIs. Databricks\n", + "SQL warehouses are powered by Photon, which seamlessly coordinates work and\n", + "resources and transparently accelerates portions of your SQL queries directly on\n", + "your data lake. No need to move the data to a data warehouse.\n", + "\n", + "**[READ NOW](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)**\n", + "##### Photon: A Fast Query Engine for Lakehouse Systems\n", + "\n", + "[SIGMOD 2022 Best Industry Paper Award](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Did you know?**\n", + "\n", + "Databricks SQL warehouses scale automatically throughout the day to\n", + "better suit your business needs. Administration is simplified by identifying\n", + "how many clusters can scale out with min and max, and Databricks SQL will\n", + "auto-scale as needed. This ensures that you have ample compute to serve\n", + "your needs, without overprovisioning. Administrators appreciate the ability\n", + "to have better control over consumption costs, while users appreciate that\n", + "their queries process as fast and efficiently as possible. 
For most BI and\n", + "analytics use cases, using medium-size warehouses with scaling is a great\n", + "balance of price/performance that fits most business needs.\n", + "\n", + "In the next section, we will discuss examples of Databricks SQL performance results\n", + "on large-scale analytic workloads as well as highly concurrent workloads.\n", + "\n", + "\n", + "Running Scheduled Starting Cluster Scale\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Large query performance: the fastest data warehouse\n", + "\n", + "\n", + "The industry standard benchmark used by data warehouses is TPC-DS. It includes\n", + "100 queries that range from very simple to very sophisticated to simulate decision\n", + "support workloads. This benchmark was created by a committee formed by\n", + "data warehousing vendors. The chart at right shows price/performance results\n", + "running the 100TB version of TPC-DS, since for large workloads the numbers that\n", + "ultimately matter pertain to the performance cost. As you can see, Databricks SQL\n", + "outperforms all cloud data warehouses we have measured.\n", + "\n", + "**[LEARN MORE](https://dbricks.co/benchmark)**\n", + "\n", + "**Did you know?**\n", + "\n", + "\n", + "**$2,000**\n", + "\n", + "**$1,791**\n", + "\n", + "**$1,500**\n", + "\n", + "**$1,000**\n", + "\n", + "**$952**\n", + "\n", + "\n", + "**$500**\n", + "\n", + "\n", + "**$242**\n", + "**$146**\n", + "\n", + "\n", + "**$358**\n", + "\n", + "\n", + "**$0**\n", + "Databricks SQL Databricks SQL Cloud Data Cloud Data Cloud Data\n", + "Spot On-Demand Warehouse 1 Warehouse 2 Warehouse 3SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
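The warehouse sizing and auto-scaling described in the "Did you know?" passage above come down to a handful of settings. Below is a hedged sketch that creates a small auto-scaling serverless warehouse through the SQL Warehouses REST API; the field names are written from memory and should be checked against the current Databricks API reference, and every value is a placeholder.

```python
# Hedged sketch: creating a small auto-scaling SQL warehouse via the
# SQL Warehouses REST API, mirroring the sizing/auto-scaling discussion above.
# Field names should be verified against the current Databricks API docs;
# host, token and all values are placeholders.
import os
import requests

host = os.environ["DATABRICKS_HOST"]    # e.g. "https://<workspace>.cloud.databricks.com"
token = os.environ["DATABRICKS_TOKEN"]  # a personal access token

payload = {
    "name": "analytics-warehouse",       # hypothetical warehouse name
    "cluster_size": "Medium",            # t-shirt size, as discussed above
    "min_num_clusters": 1,               # scale-out lower bound
    "max_num_clusters": 4,               # scale-out upper bound for concurrency
    "auto_stop_mins": 10,                # stop when idle to avoid paying for idle time
    "enable_serverless_compute": True,   # serverless, per the passage above
}

resp = requests.post(
    f"{host}/api/2.0/sql/warehouses",
    headers={"Authorization": f"Bearer {token}"},
    json=payload,
    timeout=30,
)
resp.raise_for_status()
print(resp.json())
```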
63b9d1e5fa8c663ed4247156be83c2aa**[LEARN MORE](https://dbricks.co/benchmark)**\n", + "\n", + "**Did you know?**\n", + "\n", + "[Chart: 100TB TPC-DS price/performance benchmark (lower is better) - Databricks SQL Spot ($146) and On-Demand ($242) vs. Cloud Data Warehouses 1-3 ($358, $952 and $1,791)]\n", + "\n", + "Databricks SQL has set a [new world record in](http://tpc.org/5013)\n", + "[100TB TPC-DS](http://tpc.org/5013) , the gold standard performance\n", + "benchmark for data warehousing. Databricks\n", + "SQL outperformed the previous record by 2.2x.\n", + "And this result has been formally audited and\n", + "reviewed by the TPC council.\n", + "\n", + "-----\n", + "\n", + "##### Highly concurrent analytics workloads\n", + "\n", + "Beyond large queries, it is also common for highly concurrent analytics workloads\n", + "to execute over small data sets. To optimize concurrency, we used the same\n", + "TPC-DS benchmark, but on a much smaller scale (10GB) and with 32 concurrent\n", + "streams. We analyzed the results to identify and remove bottlenecks, and\n", + "built hundreds of optimizations to improve concurrency. Databricks SQL now\n", + "outperforms some of the best cloud data warehouses for both large queries and\n", + "small queries with lots of users.\n", + "\n", + "Real-world workloads, however, are not just about either large or small queries.\n", + "Databricks SQL also provides intelligent workload management with a dual\n", + "queuing system and highly parallel reads.\n", + "\n", + "[Chart: 10GB TPC-DS queries/hr at 32 concurrent streams (higher is better), July 2020 / Jan 2021 / Oct 2022 - Cloud DW X vs. SQL Warehouse X-L size, roughly 3x improvement]\n", + "\n", + "-----\n", + "\n", + "##### Intelligent workload management with smart queuing system\n", + "\n", + "Real-world workloads typically include a mix of small and large queries. Therefore,\n", + "the smart queuing and load balancing capabilities of Databricks SQL need to\n", + "account for that too. Databricks SQL uses a smart dual queuing system (in preview)\n", + "that prioritizes small queries over large, as analysts typically care more about the\n", + "latency of short queries than large ones.\n", + "\n", + "##### Highly parallel reads with improved I/O performance\n", + "\n", + "It is common for some tables in a lakehouse to be composed of many files — for\n", + "example, in streaming scenarios such as IoT ingest when data arrives continuously.\n", + "In legacy systems, the execution engine can spend far more time listing these\n", + "files than actually executing the query. Our customers told us they do not want to\n", + "sacrifice performance for data freshness. With async and highly parallel I/O, when\n", + "executing a query, Databricks SQL now automatically reads the next blocks of data\n", + "from cloud storage while the current block is being processed. 
This considerably\n", + "increases overall query performance on small files (by 12x for 1MB files) and “cold\n", + "data” (data that is not cached) use cases as well.\n", + "\n", + "**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "###### PART 3: CONSUMPTION LAYER\n", + "\n", + "\n", + "The third layer of the Databricks Lakehouse Platform would similarly have to bridge\n", + "the best of both data lakes and data warehouses. In the lakehouse, you would\n", + "have to be able to work seamlessly with your tools of choice — whether you are a\n", + "business analyst, data scientist, or ML or data engineer.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
fc23420138a8b084f33c275ba7f42a96**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "###### PART 3: CONSUMPTION LAYER\n", + "\n", + "\n", + "The third layer of the Databricks Lakehouse Platform would similarly have to bridge\n", + "the best of both data lakes and data warehouses. In the lakehouse, you would\n", + "have to be able to work seamlessly with your tools of choice — whether you are a\n", + "business analyst, data scientist, or ML or data engineer.\n", + "\n", + "\n", + "The lakehouse must treat Python, Scala, R and SQL programming languages\n", + "and ecosystems as first-class citizens to truly unify data engineering, ML and BI\n", + "workloads in one place.\n", + "\n", + "\n", + "**Consumption layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake Notebooks (great for data scientists)|Data Warehouse Lack of support for data science/ML|Data Lakehouse Notebooks (great for data scientists)|\n", + "|---|---|---|\n", + "|Openness with rich ecosystem (Python, R, Scala)|Limited to SQL only|Openness with rich ecosystem (Python, R, Scala)|\n", + "|BI/SQL not 1st-class citizen|BI/SQL 1st-class citizen|BI/SQL 1st-class citizen|\n", + "||||\n", + "\n", + "\n", + "-----\n", + "\n", + "##### A platform for your tools of choice\n", + "\n", + "\n", + "At Databricks we believe strongly in open platforms and meeting our customers where they are. We work very\n", + "closely with a large number of software vendors to make sure you can easily use your tools of choice\n", + "on Databricks, like [Tableau](https://databricks.com/blog/2021/05/07/improved-tableau-databricks-connector-with-azure-ad-authentication-support.html) , [Power BI](https://databricks.com/blog/2021/02/26/announcing-general-availability-ga-of-the-power-bi-connector-for-databricks.html) or [dbt](https://databricks.com/blog/2021/12/06/deploying-dbt-on-databricks-just-got-even-simpler.html) . With [Partner Connect](https://www.databricks.com/partnerconnect) , it’s easier than ever to connect with\n", + "your favorite tools, easier to get data in, easier to authenticate using single sign-on, and of course, with all the\n", + "concurrency and performance improvements, we make sure that the direct and live query experience is great.\n", + "\n", + "\n", + "**Now more than ever, organizations**\n", + "\n", + "**need a data strategy that enables**\n", + "\n", + "**speed and agility to be adaptable.**\n", + "\n", + "**As organizations are rapidly moving**\n", + "\n", + "**their data to the cloud, we’re**\n", + "\n", + "**seeing growing interest in doing**\n", + "\n", + "**analytics on the data lake. The**\n", + "\n", + "**introduction of Databricks SQL**\n", + "\n", + "**delivers an entirely new experience**\n", + "\n", + "**for customers to tap into insights**\n", + "\n", + "**from massive volumes of data with**\n", + "\n", + "**the performance, reliability and**\n", + "\n", + "**scale they need. We’re proud to**\n", + "\n", + "**partner with Databricks to bring**\n", + "\n", + "**that opportunity to life.**\n", + "\n", + "**Francois Ajenstat**\n", + "Chief Product Officer, Tableau\n", + "\n", + "\n", + "+ Any other Apache Spark-compatible client\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Faster BI results retrieval with Cloud Fetch\n", + "\n", + "Once query results are computed, cloud data warehouses often collect and\n", + "stream back results to BI clients on a single thread. 
This can create a bottleneck\n", + "and greatly slows down the experience if you are fetching anything more than a\n", + "few megabytes of results in size. To provide analysts with the best experience\n", + "from their favorite BI tools, we also needed to speed up how the system delivers\n", + "results to BI tools like Power BI or Tableau once computed.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
3cdeda5682e63f2b2ccf6103b25dc299**delivers an entirely new experience**\n", + "\n", + "**for customers to tap into insights**\n", + "\n", + "**from massive volumes of data with**\n", + "\n", + "**the performance, reliability and**\n", + "\n", + "**scale they need. We’re proud to**\n", + "\n", + "**partner with Databricks to bring**\n", + "\n", + "**that opportunity to life.**\n", + "\n", + "**Francois Ajenstat**\n", + "Chief Product Officer, Tableau\n", + "\n", + "\n", + "+ Any other Apache Spark-compatible client\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Faster BI results retrieval with Cloud Fetch\n", + "\n", + "Once query results are computed, cloud data warehouses often collect and\n", + "stream back results to BI clients on a single thread. This can create a bottleneck\n", + "and greatly slows down the experience if you are fetching anything more than a\n", + "few megabytes of results in size. To provide analysts with the best experience\n", + "from their favorite BI tools, we also needed to speed up how the system delivers\n", + "results to BI tools like Power BI or Tableau once computed.\n", + "\n", + "That’s why we’ve reimagined this approach with a new architecture called\n", + "[Cloud Fetch](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html) . For large results, Databricks SQL now writes results in parallel across\n", + "all of the compute nodes to cloud storage, and then sends the list of files using\n", + "pre-signed URLs back to the client. The client then can download in parallel\n", + "all the data from cloud storage. This approach provides up to 10x performance\n", + "improvement in real-world scenarios.\n", + "\n", + "\n", + "parallel\n", + "data\n", + "transfers\n", + "\n", + "\n", + "Cloud Storage\n", + "\n", + "**Cluster**\n", + "\n", + "\n", + "SQL Endpoint\n", + "\n", + "\n", + "CUSTOMER BENCHMARK\n", + "TABLEAU EXTRACT\n", + "\n", + "\n", + "Cloud Fetch enables faster, higher-bandwidth connectivity to and from your BI tools.\n", + "**[LEARN MORE](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "##### A first-class SQL development experience\n", + "\n", + "In addition to supporting your favorite tools, we\n", + "are also focused on providing a native first-class\n", + "SQL development experience. We’ve talked to\n", + "hundreds of analysts using various SQL editors\n", + "like SQL Workbench every day, and worked with\n", + "them to provide the dream set of capabilities\n", + "for SQL development.\n", + "\n", + "For example, Databricks SQL now supports\n", + "[standard ANSI SQL](https://databricks.com/blog/2021/11/16/evolution-of-the-sql-language-at-databricks-ansi-standard-by-default-and-easier-migrations-from-data-warehouses.html) , so you don’t need to learn a\n", + "special SQL dialect. 
Query tabs allow you to work\n", + "on multiple queries at once, autosave gives you\n", + "peace of mind so you never have to worry about\n", + "losing your drafts, integrated history lets you\n", + "easily look at what you have run in the past, and\n", + "intelligent auto-complete understands subqueries\n", + "and aliases for a delightful experience.\n", + "\n", + "\n", + "The built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, with Databricks SQL, analysts can easily\n", + "make sense of query results through a wide variety\n", + "of rich visualizations and quickly build dashboards\n", + "with an intuitive drag-and-drop interface. To keep\n", + "everyone current, dashboards can be shared and\n", + "configured to automatically refresh, as well as to\n", + "alert the team to meaningful changes in the data.\n", + "\n", + "\n", + "Easily combine visualizations to build rich dashboards that can be shared with stakeholders.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Conclusion\n", + "\n", + "Databricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\n", + "into actionable data, combining the flexibility and openness of data lakes\n", + "with the reliability and performance of data warehouses. The Unity Catalog\n", + "provides fine-grained governance on the lakehouse across all clouds using\n", + "one friendly interface and standard SQL.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
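The Cloud Fetch behaviour described above (large result sets written in parallel to cloud storage and downloaded by the client through pre-signed URLs) is transparent to consuming clients; nothing special is required on the client side. A minimal sketch of pulling a large result through the `databricks-sql-connector`, with placeholder hostname, HTTP path and token:

```python
# Minimal sketch: large result retrieval via the Databricks SQL connector.
# Cloud Fetch is negotiated by the server and driver; the connection values
# below are placeholders.
from databricks import sql

with sql.connect(
    server_hostname="<workspace-hostname>",
    http_path="<sql-warehouse-http-path>",
    access_token="<personal-access-token>",
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT * FROM samples.tpch.lineitem LIMIT 1000000")
        rows = cursor.fetchall()
```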
cf0057d81b5e7a1c0fc04dad428e50f3The built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, with Databricks SQL, analysts can easily\n", + "make sense of query results through a wide variety\n", + "of rich visualizations and quickly build dashboards\n", + "with an intuitive drag-and-drop interface. To keep\n", + "everyone current, dashboards can be shared and\n", + "configured to automatically refresh, as well as to\n", + "alert the team to meaningful changes in the data.\n", + "\n", + "\n", + "Easily combine visualizations to build rich dashboards that can be shared with stakeholders.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Conclusion\n", + "\n", + "Databricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\n", + "into actionable data, combining the flexibility and openness of data lakes\n", + "with the reliability and performance of data warehouses. The Unity Catalog\n", + "provides fine-grained governance on the lakehouse across all clouds using\n", + "one friendly interface and standard SQL.\n", + "\n", + "Databricks SQL also holds the [new world record in 100TB TPC-DS](https://dbricks.co/benchmark) , the gold\n", + "standard performance benchmark for data warehousing. It is powered by\n", + "Photon, the new vectorized query engine for the lakehouse, and by SQL\n", + "warehouses for instant, elastic compute decoupled from storage.\n", + "\n", + "Finally, Databricks SQL offers a native first-class SQL development\n", + "experience, with a built-in SQL editor, rich visualizations and dashboards,\n", + "and integrates seamlessly with your favorite BI- and SQL-based tools for\n", + "maximum productivity.\n", + "\n", + "\n", + "Databricks SQL under the hood.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Atlassian\n", + "\n", + "\n", + "Atlassian is a leading provider of collaboration, development and issue-tracking\n", + "\n", + "software for teams. With over 150,000 global customers (including 85 of the Fortune\n", + "\n", + "100), Atlassian is advancing the power of collaboration with products including Jira,\n", + "\n", + "Confluence, Bitbucket, Trello and more.\n", + "\n", + "USE CASE\n", + "\n", + "Atlassian uses the Databricks Lakehouse Platform to democratize data across the enterprise and drive\n", + "down operational costs. Atlassian currently has a number of use cases focused on putting the\n", + "customer experience at the forefront.\n", + "\n", + "**Customer support and service experience**\n", + "With the majority of their customers being server-based (using products like Jira and Confluence),\n", + "Atlassian set out to move those customers into the cloud to leverage deeper insights that enrich the\n", + "customer support experience.\n", + "\n", + "**Marketing personalization**\n", + "The same insights could also be used to deliver personalized marketing emails to drive\n", + "engagement with new features and products.\n", + "\n", + "**Anti-abuse and fraud detection**\n", + "They can predict license abuse and fraudulent behavior through anomaly detection and\n", + "predictive analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "SOLUTION AND BENEFITS\n", + "\n", + "Atlassian is using the Databricks Lakehouse Platform to enable data democratization at scale, both internally\n", + "and externally. 
They have moved from a data warehousing paradigm to standardization on Databricks,\n", + "enabling the company to become more data driven across the organization. Over 3,000 internal users in\n", + "areas ranging from HR and marketing to finance and R&D — more than half the organization — are accessing\n", + "insights from the platform on a monthly basis via open technologies like Databricks SQL. Atlassian is also\n", + "using the platform to drive more personalized support and service experiences to their customers.\n", + "\n", + "**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\n", + "finance, sales, support and R&D\n", + "\n", + "**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n", + "\n", + "**•** MLflow streamlines MLOps for faster delivery\n", + "\n", + "**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n", + "\n", + "With cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\n", + "access all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\n", + "Already the company has:\n", + "\n", + "**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\n", + "jobs from EMR to Databricks with minimal effort and low-code change\n", + "\n", + "**•** Decreased delivery time by 30% with shorter dev cyclesSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
8660338082a59dba0603f43ca7e8a09b**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\n", + "finance, sales, support and R&D\n", + "\n", + "**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n", + "\n", + "**•** MLflow streamlines MLOps for faster delivery\n", + "\n", + "**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n", + "\n", + "With cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\n", + "access all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\n", + "Already the company has:\n", + "\n", + "**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\n", + "jobs from EMR to Databricks with minimal effort and low-code change\n", + "\n", + "**•** Decreased delivery time by 30% with shorter dev cycles\n", + "\n", + "**•** Reduced data team dependencies by 70% with more self-service enabled throughout the organization\n", + "\n", + "**[LEARN MORE](https://www.youtube.com/watch?v=Xo1U617T-mU)**\n", + "\n", + "\n", + "**At Atlassian, we need to ensure**\n", + "**teams can collaborate well**\n", + "**across functions to achieve**\n", + "**constantly evolving goals. A**\n", + "**simplified lakehouse architecture**\n", + "**would empower us to ingest high**\n", + "**volumes of user data and run the**\n", + "**analytics necessary to better**\n", + "**predict customer needs and**\n", + "**improve the experience of our**\n", + "**customers. A single, easy-to-use**\n", + "**cloud analytics platform allows**\n", + "**us to rapidly improve and build**\n", + "**new collaboration tools based on**\n", + "**actionable insights.**\n", + "\n", + "**Rohan Dhupelia**\n", + "Data Platform Senior Manager, Atlassian\n", + "\n", + "\n", + "-----\n", + "\n", + "### ABN AMRO\n", + "\n", + "\n", + "As an established bank, ABN AMRO wanted to modernize their business but were hamstrung\n", + "\n", + "by legacy infrastructure and data warehouses that complicated access to data across various\n", + "\n", + "sources and created inefficient data processes and workflows. Today, Azure Databricks\n", + "\n", + "empowers ABN AMRO to democratize data and AI for a team of 500+ empowered engineers,\n", + "\n", + "scientists and analysts who work collaboratively on improving business operations and\n", + "\n", + "introducing new go-to-market capabilities across the company.\n", + "\n", + "USE CASE\n", + "\n", + "ABN AMRO uses the Databricks Lakehouse Platform to deliver financial services transformation on a global scale,\n", + "providing automation and insight across operations.\n", + "\n", + "**Personalized finance**\n", + "ABN AMRO leverages real-time data and customer insights to provide products and services tailored to\n", + "customers’ needs. For example, they use machine learning to power targeted messaging within their automated\n", + "marketing campaigns to help drive engagement and conversion.\n", + "\n", + "**Risk management**\n", + "Using data-driven decision-making, they are focused on mitigating risk for both the company and their\n", + "customers. 
For example, they generate reports and dashboards that internal decision makers and leaders use to\n", + "better understand risk and keep it from impacting ABN AMRO’s business.\n", + "\n", + "**Fraud detection**\n", + "With the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\n", + "impacts their customers. Among the activities they’re trying to address are money laundering and fake credit\n", + "card applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "SOLUTION AND BENEFITS\n", + "\n", + "Today, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\n", + "scientists and analysts who work collaboratively on improving business operations and introducing new\n", + "go-to-market capabilities across the company.\n", + "\n", + "**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\n", + "downstream analytics\n", + "\n", + "**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\n", + "through reports and dashboards\n", + "\n", + "**•** MLflow speeds deployment of new models that improve the customer experience — with new use\n", + "cases delivered in under two monthsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
7d67332c1f511a82e62065fbbbd13d4f**Fraud detection**\n", + "With the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\n", + "impacts their customers. Among the activities they’re trying to address are money laundering and fake credit\n", + "card applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "SOLUTION AND BENEFITS\n", + "\n", + "Today, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\n", + "scientists and analysts who work collaboratively on improving business operations and introducing new\n", + "go-to-market capabilities across the company.\n", + "\n", + "**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\n", + "downstream analytics\n", + "\n", + "**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\n", + "through reports and dashboards\n", + "\n", + "**•** MLflow speeds deployment of new models that improve the customer experience — with new use\n", + "cases delivered in under two months\n", + "\n", + "\n", + "**Databricks has changed the way**\n", + "**we do business. It has put us in**\n", + "**a better position to succeed in**\n", + "**our data and AI transformation**\n", + "**as a company by enabling data**\n", + "**professionals with advanced data**\n", + "**capabilities in a controlled and**\n", + "**scalable way.**\n", + "\n", + "**Stefan Groot**\n", + "Head of Analytics Engineering,\n", + "ABN AMRO\n", + "\n", + "\n", + "#### 10x faster\n", + "\n", + "time to market — use cases\n", + "deployed in two months\n", + "\n", + "\n", + "#### 100+ \n", + "\n", + "use cases to be delivered\n", + "over the coming year\n", + "\n", + "\n", + "#### 500+\n", + "\n", + "empowered business\n", + "and IT users\n", + "\n", + "\n", + "**[LEARN MORE](https://databricks.com/customers/abn-amro)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### SEGA Europe\n", + "\n", + "**Improving the player experience**\n", + "\n", + "# “ is at the heart of everything\n", + "\n", + "**we do, and we very much**\n", + "**see Databricks as a key**\n", + "**partner, supporting us to drive**\n", + "**forward the next generation of**\n", + "**community gaming.**\n", + "\n", + "**Felix Baker**\n", + "Data Services Manager, SEGA Europe\n", + "\n", + "\n", + "SEGA® Europe, the worldwide leader in interactive entertainment, is using the Databricks\n", + "\n", + "Lakehouse Platform to personalize the player experience and build its own machine\n", + "\n", + "learning algorithm to help target and tailor games for over 30 million of its customers.\n", + "\n", + "As housebound gamers looked to pass the time during the first lockdowns of 2020, some SEGA Europe\n", + "titles, including Football Manager,™ saw over double the number of sales during the first lockdown\n", + "compared to the year before. Furthermore, a number of SEGA titles experienced a more than 50% increase\n", + "in players over the course of the COVID-19 pandemic. With more anonymized data being collected through\n", + "an analytics pipeline than ever before, the team needed a dedicated computing resource to handle the\n", + "sheer volume of data, extract meaningful insights from it and enable the data science team to improve\n", + "general workflow.\n", + "\n", + "**[LEARN MORE](https://www.youtube.com/watch?v=SzeXHcwPDSE)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the lakehouse company. 
More than 7,000 organizations\n", + "\n", + "worldwide — including Comcast, Condé Nast and over 50% of the\n", + "\n", + "Fortune 500 — rely on the Databricks Lakehouse Platform to unify their\n", + "\n", + "data, analytics and AI. Databricks is headquartered in San Francisco,\n", + "\n", + "with offices around the globe. Founded by the original creators of\n", + "\n", + "Apache Spark, TM Delta Lake and MLflow, Databricks is on a mission to help\n", + "\n", + "data teams solve the world’s toughest problems. To learn more, follow\n", + "\n", + "Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo\n", + "**databricks.com/contact**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
d9097f0ec6dc83ab4e34c5641e99f8c6**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
87391e7b07aace450580645213c6e700# Big Book of Data and AI Use Cases for the Public Sector\n", + "\n", + "### Best practices, customer stories and solution templates for government agencies interested in building on the Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "The State of Data and AI in the Government .......................................................................................... 3\n", + "\n", + "The Need for a Modern Data Architecture ............................................................................................. 5\n", + "\n", + "Introducing the Lakehouse for Public Sector ......................................................................................... 6\n", + "\n", + "**U S E C A S E :** Cybersecurity ........................................................................................................................... 9\n", + "\n", + "**U S E C A S E :** Predictive Maintenance .......................................................................................................... 12\n", + "\n", + "**U S E C A S E :** Fraud Detection ....................................................................................................................... 15\n", + "\n", + "**U S E C A S E :** Money Laundering ................................................................................................................. 17SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
304d79573a7bc185c33234b5e7fa334e**U S E C A S E :** Money Laundering ................................................................................................................. 17\n", + "\n", + "**U S E C A S E :** Entity Analytics ...................................................................................................................... 19\n", + "\n", + "**U S E C A S E :** Geospatial Analytics .............................................................................................................. 21\n", + "\n", + "**U S E C A S E :** Public Health Management .................................................................................................. 24\n", + "\n", + "Conclusion ................................................................................................................................................. 26\n", + "\n", + "\n", + "-----\n", + "\n", + "## The State of Data and AI in the Government\n", + "\n", + "###### Over the last decade, data and AI have redefined every industry on the planet. Retailers have improved the shopping experience with personalized recommendations, financial institutions have strengthened risk management through the use of advanced analytics, and the healthcare industry is tapping into the power of machine learning to predict and prevent chronic disease. The public sector is no exception.\n", + "\n", + "\n", + "In 2018, the U.S. Federal Government embarked on one of its most ambitious\n", + "efforts since putting a man on the moon — embedding data into all aspects of\n", + "decision-making. By enacting the Evidence-Based Policymaking Act of 2018,\n", + "Congress set in motion requirements for agencies to modernize their data and\n", + "analytics capabilities, including the appointment of agency-level chief data\n", + "officers. A year later came the Federal Data Strategy, which provided further\n", + "guidance for how agencies should manage and use data by 2030.\n", + "\n", + "\n", + "With all of this guidance, agencies are starting to make meaningful improvements\n", + "to their data strategy, but when it comes to innovating with data, agencies still\n", + "lag behind the private sector. This begs the question: what’s standing in the way?\n", + "The hurdles aren’t due to a lack of effort on the part of agency leaders. In fact,\n", + "they can largely be attributed to a patchwork of legacy technologies that have\n", + "been amassed over the last 30 to 40 years. While these hurdles stand in the\n", + "way, a number of innovative agencies are making significant progress as they\n", + "embrace new data and AI capabilities.\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
4885518b1e47e463d264c834140a6756In 2018, the U.S. Federal Government embarked on one of its most ambitious\n", + "efforts since putting a man on the moon — embedding data into all aspects of\n", + "decision-making. By enacting the Evidence-Based Policymaking Act of 2018,\n", + "Congress set in motion requirements for agencies to modernize their data and\n", + "analytics capabilities, including the appointment of agency-level chief data\n", + "officers. A year later came the Federal Data Strategy, which provided further\n", + "guidance for how agencies should manage and use data by 2030.\n", + "\n", + "\n", + "With all of this guidance, agencies are starting to make meaningful improvements\n", + "to their data strategy, but when it comes to innovating with data, agencies still\n", + "lag behind the private sector. This begs the question: what’s standing in the way?\n", + "The hurdles aren’t due to a lack of effort on the part of agency leaders. In fact,\n", + "they can largely be attributed to a patchwork of legacy technologies that have\n", + "been amassed over the last 30 to 40 years. While these hurdles stand in the\n", + "way, a number of innovative agencies are making significant progress as they\n", + "embrace new data and AI capabilities.\n", + "\n", + "\n", + "-----\n", + "\n", + "Federal spending on artificial intelligence rose to [nearly $1 billion](https://www.federaltimes.com/thought-leadership/2021/09/28/why-the-government-market-for-artificial-intelligence-technology-is-expanding/) in 2020, up\n", + "50% from 2018. There’s a good reason for this level of spend: Deloitte recently\n", + "published a report, “AI-augmented Government,” that estimates the federal\n", + "government could free up as many as 1.2 billion hours of work and save up to\n", + "$41.1 billion annually through the use of AI-driven automation. Early adopters\n", + "of advanced analytics are starting to see the fruits of their labor. For example,\n", + "[USCIS modernized their analytics stack](https://databricks.com/customers/uscis) on Databricks to accelerate insights\n", + "on applicants by 24x, automate the processing of millions of applications,\n", + "and reduce appointment no-show rates with predictive analytics. The [Orange](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/)\n", + "[County Courts](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/) also recently shared how they are automating legacy paperbased workflows with machine learning.\n", + "\n", + "In this eBook, we explore the hurdles of legacy technologies and how a modern\n", + "data lakehouse can help agencies unlock innovative data and analytics use cases\n", + "at all levels of government. Over the following seven example use cases, covering\n", + "everything from cyber threat detection to improving public health,\n", + "\n", + "\n", + "**An increased focus on cloud, analytics and AI = operational efficiency**\n", + "\n", + "1. AI/ML\n", + "2. Data Analytics\n", + "3. Cloud\n", + "\n", + "**$1B** **TOP PRIORITIES** **$41B+**\n", + "\n", + "Data and AI Research and Government CIOs’ top Estimated government\n", + "Development Initiative game-changing technologies savings from data-driven\n", + "automation\n", + "\n", + "**U.S. Government**\n", + "\n", + "we demonstrate how the Databricks Lakehouse for Public Sector is critical to\n", + "improving citizen services and delivering on mission objectives. 
This guide also\n", + "includes resources in the form of Solution Accelerators, reference architectures\n", + "and real-world customer stories to help as you embark on your own journey to\n", + "drive a safer and more prosperous nation through the use of data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Need for a Modern Data Architecture\n", + "\n", + "###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decisionmaking with predictive insights that can better keep pace with the dynamic needs of citizens and global communities. That being said, there are a number of barriers standing in their way.\n", + "\n", + "##### Common challenges\n", + "\n", + "\n", + "Many government agencies are burdened with a legacy IT infrastructure that is\n", + "built with on-premises data warehouses that are complex to maintain, are costly\n", + "to scale as compute is coupled with storage, and lack support for unstructured\n", + "data and advanced analytics. This severely inhibits data-driven innovation.\n", + "Maintaining these systems requires a massive investment of both time and\n", + "money compared to modern cloud-based systems and creates a number of\n", + "avoidable challenges:SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
e75d43566d9248c2dedde1ab3be69747-----\n", + "\n", + "## The Need for a Modern Data Architecture\n", + "\n", + "###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decisionmaking with predictive insights that can better keep pace with the dynamic needs of citizens and global communities. That being said, there are a number of barriers standing in their way.\n", + "\n", + "##### Common challenges\n", + "\n", + "\n", + "Many government agencies are burdened with a legacy IT infrastructure that is\n", + "built with on-premises data warehouses that are complex to maintain, are costly\n", + "to scale as compute is coupled with storage, and lack support for unstructured\n", + "data and advanced analytics. This severely inhibits data-driven innovation.\n", + "Maintaining these systems requires a massive investment of both time and\n", + "money compared to modern cloud-based systems and creates a number of\n", + "avoidable challenges:\n", + "\n", + "\n", + "government is often done in weekly or daily batches, but decision-making\n", + "needs to happen in real time. Critical events like cyber attacks and health\n", + "pandemics can’t wait a week.\n", + "\n", + "**Lack of citizen insights**\n", + "\n", + "When data is siloed, teams get an incomplete view of the citizen,\n", + "resulting in missed opportunities to improve the delivery of services that\n", + "impact the quality of life for their constituents.\n", + "\n", + "\n", + "**Lack of reliability**\n", + "\n", + "\n", + "Siloed systems result in data replication as teams spin up new data marts\n", + "to support their one-off use cases. Without a single source of truth, teams\n", + "struggle with data inconsistencies, which can result in inaccurate analysis\n", + "and model performance that is only compounded over time.\n", + "\n", + "**Lack of agility**\n", + "\n", + "Disjointed analytics tools and legacy infrastructure hinder the ability of\n", + "teams to conduct real-time analytics. Most data processing in the\n", + "\n", + "\n", + "**Lack of productivity**\n", + "\n", + "Data scientists and data analysts alike must have the right tool set to\n", + "collaboratively investigate, extract and report meaningful insights from\n", + "their data. Unfortunately, data silos lead to organizational silos, which make\n", + "collaboration inside an agency as well as between agencies very difficult.\n", + "With different groups of data teams leveraging their own coding and\n", + "analytical tools, communicating insights and working across teams —\n", + "let alone across agencies — is almost impossible. This lack of collaboration\n", + "can drastically limit the capabilities of any data analytics or AI initiative.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introducing the Lakehouse for Public Sector\n", + "\n", + "\n", + "The reason that the Databricks Lakehouse is\n", + "able to deliver the simplicity, flexibility and\n", + "speed that a government agency requires is\n", + "that it fundamentally reimagines the modern\n", + "data architecture. Databricks provides federal,\n", + "state and local agencies with a cloud-native\n", + "Lakehouse Platform that combines the best\n", + "of data warehouses and data lakes — to store\n", + "and manage all your data for all your analytics\n", + "workloads. 
With this modern architecture,\n", + "agencies can federate all their data and\n", + "democratize access for downstream use\n", + "cases, empowering their teams to deliver on\n", + "their mission objectives by unlocking the full\n", + "potential of their data.\n", + "\n", + "\n", + "**Delivering real-time data insight in support of the mission**\n", + "\n", + "- Fraud, Waste & Abuse\n", + "\n", + "- Cybersecurity\n", + "\n", + "- Medicaid Dashboards &\n", + "Reporting\n", + "\n", + "- Process Improvement\n", + "\n", + "- Predictive Maintenance\n", + "\n", + "- SCM & Demand Forecasting\n", + "\n", + "- Smart Military/Censor Data\n", + "\n", + "- Military Heatlh\n", + "\n", + "- COVID Response/Decision\n", + "Support\n", + "\n", + "- Smart Cities/Connected\n", + "Vehicles\n", + "\n", + "- Citizen Engagement\n", + "\n", + "- Data-Driven Decision-Making\n", + "\n", + "\n", + "-----\n", + "\n", + "**Federate all of your agency’s data**\n", + "\n", + "Any type of data can be stored because, like a data lake, the Databricks\n", + "Lakehouse is built using the low-cost object storage supported by cloud\n", + "providers. Leveraging this capability helps break down the data silos that\n", + "hinder efforts to aggregate data for advanced analytics (e.g., predictive\n", + "maintenance) or compute-intensive workloads like detecting cyber\n", + "threats across billions of signals. Probably even more important is the\n", + "ability of the lakehouse architecture to travel back in time, ensuring full\n", + "audit compliance and high governance standards for analytics and AI.\n", + "\n", + "**Power real-time decision-making**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
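The "travel back in time" capability mentioned above refers to Delta Lake table versioning. A minimal sketch of an audit-style read, assuming a Databricks notebook `spark` session and a placeholder Delta table name:

```python
# Minimal sketch: Delta Lake time travel for audit-style reads.
# `catalog.schema.events` is a placeholder table name.
latest     = spark.read.table("catalog.schema.events")
as_of_v5   = spark.sql("SELECT * FROM catalog.schema.events VERSION AS OF 5")
as_of_date = spark.sql("SELECT * FROM catalog.schema.events TIMESTAMP AS OF '2024-09-01'")
```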
33f711ff444a761a534ff12f1fc7e8ed- Medicaid Dashboards &\n", + "Reporting\n", + "\n", + "- Process Improvement\n", + "\n", + "- Predictive Maintenance\n", + "\n", + "- SCM & Demand Forecasting\n", + "\n", + "- Smart Military/Censor Data\n", + "\n", + "- Military Heatlh\n", + "\n", + "- COVID Response/Decision\n", + "Support\n", + "\n", + "- Smart Cities/Connected\n", + "Vehicles\n", + "\n", + "- Citizen Engagement\n", + "\n", + "- Data-Driven Decision-Making\n", + "\n", + "\n", + "-----\n", + "\n", + "**Federate all of your agency’s data**\n", + "\n", + "Any type of data can be stored because, like a data lake, the Databricks\n", + "Lakehouse is built using the low-cost object storage supported by cloud\n", + "providers. Leveraging this capability helps break down the data silos that\n", + "hinder efforts to aggregate data for advanced analytics (e.g., predictive\n", + "maintenance) or compute-intensive workloads like detecting cyber\n", + "threats across billions of signals. Probably even more important is the\n", + "ability of the lakehouse architecture to travel back in time, ensuring full\n", + "audit compliance and high governance standards for analytics and AI.\n", + "\n", + "**Power real-time decision-making**\n", + "\n", + "Streaming use cases such as IoT analytics or disease spread tracking is\n", + "simpler to support because the lakehouse uses Apache Spark TM as the\n", + "data processing engine and Delta Lake as a storage layer. With Spark,\n", + "you can toggle between batch and streaming workloads with just a line\n", + "of code. With Delta Lake, native support for ACID transactions means\n", + "that you can deploy streaming workloads without the overhead of\n", + "common reliability and performance issues. These capabilities make\n", + "real-time analytics possible.\n", + "\n", + "\n", + "**Unlock collaborative analytics for all personas**\n", + "\n", + "The Databricks Lakehouse for Public Sector is your one-stop shop for\n", + "all your analytics and AI. The platform includes a business intelligence\n", + "capability — Databricks SQL — that empowers data analysts to query and run\n", + "reports against all of an agency’s unified data. Databricks SQL integrates with\n", + "BI tools like Tableau and Microsoft Power BI and complements any existing BI\n", + "tools with a SQL-native interface, allowing data analysts and data scientists\n", + "to query data directly within Databricks and build powerful dashboards.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Deliver on your mission with predictive insights**\n", + "In the same environment, data scientists can build, share and collaborate\n", + "on machine learning models for advanced use cases like fraud detection\n", + "or geospatial analytics. Additionally, MLflow, an open source toolkit for\n", + "managing the ML lifecycle, is built into the Lakehouse so data scientists\n", + "can manage everything in one place. Databricks natively supports Python,\n", + "R, SQL and Scala so practitioners can work together with the languages and\n", + "libraries of their choice, reducing the need for separate tools. With these\n", + "capabilities, data teams can turn insights from real-world data into powerful\n", + "visualizations designed for machine learning. 
Visualizations can then be\n", + "turned into interactive dashboards to share insights with peers across\n", + "agencies, policymakers, regulators and decision-makers.\n", + "\n", + "\n", + "##### Customers That Innovate With Databricks Lakehouse for Public Sector\n", + "\n", + "Some of the top government agencies in the world turn to the\n", + "Databricks Lakehouse for Public Sector to bring analytics and AI-driven\n", + "automation and innovation to the communities they serve.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Cybersecurity\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Limited window of data**\n", + "Given the high cost of storage, most agencies retain only a few weeks of threat\n", + "data. This can be a real problem in scenarios where a perpetrator gains access\n", + "to a network but waits months before doing anything malicious. Without a long\n", + "historical record, security teams can’t analyze cyberattacks over long-term\n", + "horizons or conduct deep forensic reviews.\n", + "\n", + "##### Solution overview\n", + "\n", + "For government agencies that are ready to modernize their security data\n", + "infrastructure and analyze data at petabyte-scale more cost-effectively,\n", + "Databricks provides an open lakehouse platform that augments existing SIEMs\n", + "to help democratize access to data for downstream analytics and AI. Built\n", + "on Apache Spark and Delta Lake, Databricks is optimized to process large\n", + "volumes of streaming and historic data for real-time threat analysis and incident\n", + "response. Security teams can query threat data going years into the past in just\n", + "minutes and build ML models to detect new threat patterns and reduce false\n", + "positives. Additionally, Databricks created a Splunk-certified add-on to augment\n", + "Splunk for Enterprise Security (ES) for cost-efficient log and retention expansion.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
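The claim above that Spark lets you "toggle between batch and streaming workloads with just a line of code" comes down to swapping the read API over the same Delta source. A minimal sketch with a placeholder table name:

```python
# Minimal sketch: the batch <-> streaming toggle over one Delta table.
# `catalog.schema.sensor_readings` is a placeholder name.
batch_df  = spark.read.table("catalog.schema.sensor_readings")
stream_df = spark.readStream.table("catalog.schema.sensor_readings")
```

Transformations written against `batch_df` generally apply unchanged to `stream_df`, which is what keeps the two modes a one-line switch.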
6750f72ff74bd02db319754a9aeabef5**Limited window of data**\n", + "Given the high cost of storage, most agencies retain only a few weeks of threat\n", + "data. This can be a real problem in scenarios where a perpetrator gains access\n", + "to a network but waits months before doing anything malicious. Without a long\n", + "historical record, security teams can’t analyze cyberattacks over long-term\n", + "horizons or conduct deep forensic reviews.\n", + "\n", + "##### Solution overview\n", + "\n", + "For government agencies that are ready to modernize their security data\n", + "infrastructure and analyze data at petabyte-scale more cost-effectively,\n", + "Databricks provides an open lakehouse platform that augments existing SIEMs\n", + "to help democratize access to data for downstream analytics and AI. Built\n", + "on Apache Spark and Delta Lake, Databricks is optimized to process large\n", + "volumes of streaming and historic data for real-time threat analysis and incident\n", + "response. Security teams can query threat data going years into the past in just\n", + "minutes and build ML models to detect new threat patterns and reduce false\n", + "positives. Additionally, Databricks created a Splunk-certified add-on to augment\n", + "Splunk for Enterprise Security (ES) for cost-efficient log and retention expansion.\n", + "\n", + "\n", + "Cyberattacks from bad actors and nation states are a huge and growing threat\n", + "to government agencies. Recent large-scale attacks like the ones on SolarWinds,\n", + "log4j, Colonial Pipeline and HAFNIUM highlight the sophistication and increasing\n", + "frequency of broad-reaching cyberattacks. Data breaches cost the federal\n", + "government more than $4 million per incident in 2021 and threaten national\n", + "security. Staying ahead of the next threat requires continuous monitoring of\n", + "security data from an agency’s entire attack surface before, during and after\n", + "an incident.\n", + "\n", + "##### Challenges\n", + "\n", + "**Scaling existing SIEM solutions**\n", + "Agencies looking to expand existing SIEM tools for today’s petabytes of data can\n", + "expect increased licensing, storage, compute and integration resources resulting\n", + "in tens of millions of dollars in additional costs per year.\n", + "\n", + "**Rules-based systems**\n", + "Many legacy SIEM tools lack the critical analytics capabilities — such as\n", + "advanced analytics, graph processing and machine learning — needed to detect\n", + "unknown threat patterns or deliver on a broader set of security use cases like\n", + "behavioral analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Detect Criminal](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n", + "[Threats Using DNS Analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n", + "\n", + "Detecting criminals and nation states through DNS analytics. 
In order to address\n", + "common cybersecurity challenges such as deployment complexity, tech\n", + "limitation and cost, security teams need a real-time data analytics platform that\n", + "can handle cloud scale, analyze data wherever it is, natively support streaming\n", + "and batch analytics, and have collaborative content development capabilities.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=5BRGqxq4iQw)**\n", + "\n", + "**Fighting Cyber Threats in Real Time**\n", + "Since partnering with Databricks, HSBC has reduced costs, accelerated threat\n", + "detection and response, and improved their security posture. Not only can\n", + "they process all of their required data, but they’ve also increased online query\n", + "retention from just days to months at petabyte scale. HSBC is now able to\n", + "execute 2-3x more threat hunts per analyst.\n", + "\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "\n", + "Designed for cloud-scale security operations, the add-on provides Splunk\n", + "analysts with access to all data stored in the Lakehouse. Bidirectional pipelines\n", + "between Splunk and Databricks allow agency analysts to integrate directly into\n", + "Splunk visualizations and security workflows.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Predictive MaintenanceSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
deb498e8b0e4add641a926f3454ddb53[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "\n", + "Designed for cloud-scale security operations, the add-on provides Splunk\n", + "analysts with access to all data stored in the Lakehouse. Bidirectional pipelines\n", + "between Splunk and Databricks allow agency analysts to integrate directly into\n", + "Splunk visualizations and security workflows.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Predictive Maintenance\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Integrating unstructured data**\n", + "Equipment data doesn’t just come in the form of IoT data. Agencies can gather\n", + "rich unstructured signals like audio, visual (e.g., video inspections) and text\n", + "(e.g., maintenance logs). Most legacy data architectures are unable to integrate\n", + "structured and unstructured data sources.\n", + "\n", + "**Operationalizing machine learning**\n", + "Most agencies lack the advanced analytics tools needed to build models that\n", + "can predict potential equipment failures. Those that do typically have their\n", + "data scientists working in a siloed set of tools, resulting in unnecessary data\n", + "replication and inefficient workflows.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse is tailor-made for building IoT applications at scale.\n", + "With Databricks, agencies can easily manage large streaming volumes of small\n", + "files, with ACID transaction guarantees and reduced job fails compared to\n", + "traditional data warehouse architectures. Additionally, the Lakehouse is cloud\n", + "native and built on Apache Spark, so scaling for petabytes of data is not an issue.\n", + "With the Lakehouse, agencies can bring together all of their structured and\n", + "unstructured data with a unified set of tooling for data engineering, model building\n", + "and production rollout. With these capabilities, operations teams can quickly\n", + "detect and act on pending equipment failures before they affect performance.\n", + "\n", + "\n", + "Predictive maintenance is oftentimes associated with the manufacturing sector,\n", + "but in reality it extends far beyond the factory floor. Consider this for a moment:\n", + "the U.S. Government operates a fleet of over [640,000 vehicles](https://www.government-fleet.com/301786/federal-vs-state-local-fleets) including public\n", + "buses, postal delivery trucks, drones, helicopters and jet fighters. Many of these\n", + "vehicles — like multimillion-dollar aircraft — contain sensors that generate\n", + "massive amounts of data on the use and conditions of various components. And\n", + "it’s not just vehicles. Modern public utilities stream data through connected IoT\n", + "devices. All of this data can be analyzed to identify the root cause of a failure\n", + "and predict future maintenance, helping to avoid costly repairs and critical\n", + "assets from being out of service.\n", + "\n", + "##### Challenges\n", + "\n", + "**Managing IoT data at scale**\n", + "With billions of sensors generating information, most data systems are unable to\n", + "handle the sheer volume of data. 
Before agencies can even start analyzing their\n", + "data, legacy data warehouse–based tools require preprocessing of data, making\n", + "real-time analysis impossible.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "\n", + "**Solution Accelerator: Predictive Maintenance**\n", + "Learn how to ingest real-time IoT data from field devices, perform complex\n", + "time series processing in Delta Lake and leverage machine learning to build\n", + "predictive maintenance models.\n", + "\n", + "[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)\n", + "\n", + "[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)\n", + "\n", + "[Part 3: Using ML to predict maintenance.](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
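To make the ingest step in the accelerator linked above concrete, here is a hedged sketch of landing raw IoT readings into a bronze Delta table with Structured Streaming and Auto Loader; the volume path, checkpoint location and table name are placeholders, and this is not the accelerator's actual code:

```python
# Minimal sketch: stream raw IoT files into a bronze Delta table with Auto Loader.
raw = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .load("/Volumes/<catalog>/<schema>/<volume>/iot_raw/")   # placeholder path
)

(
    raw.writeStream
    .option("checkpointLocation", "/Volumes/<catalog>/<schema>/<volume>/_checkpoints/iot_bronze")
    .toTable("catalog.schema.iot_bronze")                    # placeholder table
)
```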
3d6176fa88a4867b90655e52485bbd5e-----\n", + "\n", + "##### How to get started\n", + "\n", + "\n", + "**Solution Accelerator: Predictive Maintenance**\n", + "Learn how to ingest real-time IoT data from field devices, perform complex\n", + "time series processing in Delta Lake and leverage machine learning to build\n", + "predictive maintenance models.\n", + "\n", + "[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)\n", + "\n", + "[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)\n", + "\n", + "[Part 3: Using ML to predict maintenance.](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)\n", + "\n", + "\n", + "[Watch the Demo:](https://vimeo.com/580864758/5a5bc42bb9)\n", + "[Predictive Maintenance on Azure Databricks](https://vimeo.com/580864758/5a5bc42bb9)\n", + "\n", + "##### Customer story\n", + "\n", + "**[LEARN MORE](https://www.tallan.com/blog/client-stories/dc-water/)**\n", + "\n", + "**Protecting the Water Supply for 700,000 Residents**\n", + "Utilizing machine learning for predictive analytics to help stop water main\n", + "breaks before they occur, potentially saving hundreds of thousands of dollars\n", + "in repairs while reducing service interruption.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "Weather Sensor\n", + "Readings\n", + "(semi-structured)\n", + "\n", + "Real-time\n", + "streaming\n", + "\n", + "Wind Turbine\n", + "Telematics\n", + "(semi-structured)\n", + "\n", + "Maintenance Logs\n", + "(unstructured)\n", + "\n", + "\n", + "#### Databricks Lakehouse Platform\n", + "\n", + "Bronze Layer Silver Layer Gold Layer\n", + "\n", + "\n", + "Append Raw\n", + "Merge Data\n", + "Data\n", + "\n", + "\n", + "Join Streams and\n", + "Analyze Data\n", + "\n", + "Enriched\n", + "Readings\n", + "\n", + "\n", + "Output\n", + "\n", + "\n", + "Build Predictive\n", + "Maintenance Model\n", + "\n", + "\n", + "Granular\n", + "Readings\n", + "\n", + "\n", + "Aggregated\n", + "Hourly\n", + "Readings\n", + "\n", + "\n", + "Real-time Dashboards for Real-Time Dashboards for\n", + "Optimizing Performance Optimizing Performance\n", + "\n", + "|Col1|Col2|Col3|\n", + "|---|---|---|\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Fraud Detection\n", + "\n", + "\n", + "##### Overview\n", + "\n", + "According to [McKinsey & Company](https://www.mckinsey.com/~/media/McKinsey/Industries/Public%20Sector/Our%20Insights/Cracking%20down%20on%20government%20fraud%20with%20data%20analytics/Cracking-down-on-government-fraud-with-data-analytics-vF.pdf) , more than half of the federal government’s\n", + "monetary losses to fraud, waste and abuse go undetected and total tens of\n", + "billions of dollars. Financial fraud comes in many forms, from individuals taking\n", + "advantage of relief programs to complex networks of criminal organizations\n", + "working together to falsify medical claims and rebate forms. Investigative teams\n", + "hoping to stay ahead of fraudsters need advanced analytics techniques so they\n", + "can detect anomalous behavior buried in a sea of data.\n", + "\n", + "##### Challenges\n", + "\n", + "**Lack of machine learning**\n", + "A rules-based approach is not enough. 
Bad actors are getting more and more\n", + "sophisticated in how they take advantage of government programs, necessitating\n", + "an AI-driven approach.\n", + "\n", + "**Unreliable data**\n", + "Getting high-quality, clean data and maintaining a rich feature store is critical\n", + "for identifying ever-evolving fraud patterns while maintaining a strict record of\n", + "previous data points.\n", + "\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables teams to develop complex ML models with\n", + "high governance standards and bridge the gap between data science and\n", + "technology to address the challenge of analyzing large volumes of data at scale\n", + "— 40 billion financial transactions a year are made in the United States alone.\n", + "Additionally, Databricks makes it possible to combine modern AI techniques\n", + "with the legacy rules-based methods that underpin current approaches to fraud\n", + "detection all within a common and efficient Spark-based orchestration engine.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
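The point above about combining legacy rules-based methods with ML inside one Spark pipeline can be illustrated with a short, hedged sketch; the table, threshold, feature columns and model URI are all placeholders rather than anything taken from the linked accelerator:

```python
# Minimal sketch: blend a hard business rule with an ML score in one pipeline.
from pyspark.sql import functions as F
import mlflow

txns = spark.read.table("catalog.schema.transactions")            # placeholder table

# Rules-based signal: amounts just under a common reporting threshold.
rule_flag = (F.col("amount") >= 9000) & (F.col("amount") < 10000)

# ML signal: score with a registered model (placeholder URI and feature columns).
score_udf = mlflow.pyfunc.spark_udf(spark, "models:/fraud_model/Production")
scored = (
    txns.withColumn("rule_flag", rule_flag)
        .withColumn("ml_score", score_udf("amount", "merchant_risk", "txn_velocity"))
        .withColumn("needs_review", F.col("rule_flag") | (F.col("ml_score") > 0.9))
)
```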
7eda09eb47f349dbcfccf31d852da8e1##### Challenges\n", + "\n", + "**Lack of machine learning**\n", + "A rules-based approach is not enough. Bad actors are getting more and more\n", + "sophisticated in how they take advantage of government programs, necessitating\n", + "an AI-driven approach.\n", + "\n", + "**Unreliable data**\n", + "Getting high-quality, clean data and maintaining a rich feature store is critical\n", + "for identifying ever-evolving fraud patterns while maintaining a strict record of\n", + "previous data points.\n", + "\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables teams to develop complex ML models with\n", + "high governance standards and bridge the gap between data science and\n", + "technology to address the challenge of analyzing large volumes of data at scale\n", + "— 40 billion financial transactions a year are made in the United States alone.\n", + "Additionally, Databricks makes it possible to combine modern AI techniques\n", + "with the legacy rules-based methods that underpin current approaches to fraud\n", + "detection all within a common and efficient Spark-based orchestration engine.\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Fraud Detection](https://databricks.com/blog/2021/01/19/combining-rules-based-and-ai-models-to-combat-financial-fraud.html)\n", + "\n", + "Due to an ever-changing landscape, building a financial fraud detection\n", + "framework often goes beyond just creating a highly accurate machine learning\n", + "model. Oftentimes it involves a complex-decision science setup that combines\n", + "a rules engine with a need for a robust and scalable machine learning platform.\n", + "In this example, we show how to build a holistic fraud detection solution on\n", + "Databricks using data from a financial institution.\n", + "\n", + "\n", + "**Analytics at scale**\n", + "Training complex ML models with hundreds of features on gigabytes of\n", + "structured, semi-structured and unstructured data can be impossible without a\n", + "highly scalable and distributed infrastructure.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=Ca1MMNpBSHM)**\n", + "\n", + "**Identifying Financial Fraud at Scale**\n", + "Processes hundreds of billions of market events\n", + "per day on the Databricks Lakehouse and uses\n", + "the power of machine learning to identify illicit\n", + "activity in near real-time.\n", + "\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Money Laundering\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "Approximately [$300 billion](https://home.treasury.gov/system/files/136/2018NMLRA_12-18.pdf) is laundered through the United States each year,\n", + "and with criminal organizations — both at home and abroad — implementing\n", + "increasingly sophisticated methods for laundering funds, it’s getting harder to\n", + "stop. While the federal government continues to apply pressure on the financial\n", + "sector through heightened regulation, more is needed to combat laundering.\n", + "Modern AI techniques such as graph analytics and computer vision can be\n", + "used to process different types of structured (e.g., financial transactions) and\n", + "unstructured (e.g., real estate images) data and identify illicit behavior. 
This\n", + "allows investigative teams to automate labor-intensive activities like confirming\n", + "a residential address or reviewing transaction histories, and instead dig into\n", + "priority threats.\n", + "\n", + "##### Challenges\n", + "\n", + "**Complex data science**\n", + "Modern anti-money laundering (AML) practices require multiple ML capabilities\n", + "such as entity resolution, computer vision and graph analytics on entity\n", + "metadata, which is typically not supported by any one data platform.\n", + "\n", + "\n", + "**Time-consuming false positives**\n", + "Any reported suspicious activity must be investigated manually to ensure\n", + "accuracy. Many legacy solutions generate a high number of false positives or fail\n", + "to identify unknown patterns, resulting in wasted effort by investigators.\n", + "\n", + "##### Solution overview\n", + "\n", + "AML solutions face the operational burden of processing billions of transactions\n", + "a day. The Databricks Lakehouse Platform combines the low storage cost\n", + "benefits of cloud data lakes with the robust transaction capabilities of data\n", + "warehouses, making it the ideal foundation for building AML analytics at massive\n", + "scale. At the core of Databricks is Delta Lake, which can store and combine\n", + "both unstructured and structured data to build entity relationships; moreover,\n", + "Databricks Delta Engine provides efficient access using the new Photon compute\n", + "to speed up BI queries on tables spanning billions of transactions. On top of\n", + "these capabilities, ML is a first-class citizen in the Lakehouse, which means\n", + "analysts and data scientists do not waste time subsampling or moving data to\n", + "share dashboards and stay one step ahead of bad actors.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
4c06561b8ae2e83b0c3b0d4f8ce53da4**Time-consuming false positives**\n", + "Any reported suspicious activity must be investigated manually to ensure\n", + "accuracy. Many legacy solutions generate a high number of false positives or fail\n", + "to identify unknown patterns, resulting in wasted effort by investigators.\n", + "\n", + "##### Solution overview\n", + "\n", + "AML solutions face the operational burden of processing billions of transactions\n", + "a day. The Databricks Lakehouse Platform combines the low storage cost\n", + "benefits of cloud data lakes with the robust transaction capabilities of data\n", + "warehouses, making it the ideal foundation for building AML analytics at massive\n", + "scale. At the core of Databricks is Delta Lake, which can store and combine\n", + "both unstructured and structured data to build entity relationships; moreover,\n", + "Databricks Delta Engine provides efficient access using the new Photon compute\n", + "to speed up BI queries on tables spanning billions of transactions. On top of\n", + "these capabilities, ML is a first-class citizen in the Lakehouse, which means\n", + "analysts and data scientists do not waste time subsampling or moving data to\n", + "share dashboards and stay one step ahead of bad actors.\n", + "\n", + "\n", + "**Model transparency**\n", + "Although AI can be used to address many money laundering use cases, the lack\n", + "of transparency in the development of ML models offers little explainability,\n", + "inhibiting broader adoption.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Modern](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n", + "[Anti-Money Laundering Techniques](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n", + "\n", + "\n", + "Lakehouse Platform leveraging a series of next-gen machine learning techniques\n", + "including NLP, computer vision, entity resolution and graph analytics. This\n", + "approach helps teams better adapt to the reality of modern laundering practices.\n", + "\n", + "\n", + "Current anti-money laundering practices bear little resemblance to those of the\n", + "last decade. In today’s digital world, financial institutions are processing billions\n", + "of transactions daily, increasing the surface area of money laundering. With this\n", + "accelerator, we demonstrate how to build a scalable AML solution on the\n", + "\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Entity Analytics\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**No machine learning capabilities**\n", + "Entity resolution typically relies on basic rules-based logic to compare records\n", + "(e.g., matching on name and address), but with messy, large volumes of data,\n", + "advanced analytics is needed to improve accuracy and accelerate efforts.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse is an ideal platform for building entity analytics at\n", + "scale. With support for a wide range of data formats and a rich and extensible\n", + "set of data transformation and ML capabilities, Databricks enables agencies to\n", + "bring together all of their data in a central location and move beyond simple\n", + "rules-based methods for entity resolution. 
Data teams can easily explore\n", + "different machine learning techniques like natural language processing,\n", + "classification and graph analytics to automate entity matching. And one-click\n", + "provisioning and deprovisioning of cloud resources makes it easy for teams to\n", + "cost-effectively allocate the necessary compute resources for any size job so\n", + "they can uncover findings faster.\n", + "\n", + "\n", + "Entity analytics aims to connect disparate data sources to build a full view of\n", + "a person or an organization. This has many applications in the public sector,\n", + "such as fraud detection, national security and population health. For example,\n", + "Medicare fraud teams need to understand which prescriptions are filled, claims\n", + "filed and facilities visited across geographies to uncover suspicious behavior.\n", + "Before teams can even look for suspicious behavior, they must first determine\n", + "which records are associated. In the United States, nearly 50,000 people share\n", + "the name John Smith (and there are thousands of others with similar names).\n", + "Imagine trying to identify the right John Smith for this type of analysis. That’s no\n", + "easy task.\n", + "\n", + "##### Challenges\n", + "\n", + "**Disjointed data**\n", + "Managing complex and brittle ETL pipelines in order to cleanse and join data\n", + "across siloed systems and data stores.\n", + "\n", + "\n", + "**Compute intensive**\n", + "Identifying related entities across population-level data sets requires massive\n", + "compute power that far outstrips legacy on-prem data architectures.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get startedSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
fc164d931337a8fb50a377fa7e37f2bbEntity analytics aims to connect disparate data sources to build a full view of\n", + "a person or an organization. This has many applications in the public sector,\n", + "such as fraud detection, national security and population health. For example,\n", + "Medicare fraud teams need to understand which prescriptions are filled, claims\n", + "filed and facilities visited across geographies to uncover suspicious behavior.\n", + "Before teams can even look for suspicious behavior, they must first determine\n", + "which records are associated. In the United States, nearly 50,000 people share\n", + "the name John Smith (and there are thousands of others with similar names).\n", + "Imagine trying to identify the right John Smith for this type of analysis. That’s no\n", + "easy task.\n", + "\n", + "##### Challenges\n", + "\n", + "**Disjointed data**\n", + "Managing complex and brittle ETL pipelines in order to cleanse and join data\n", + "across siloed systems and data stores.\n", + "\n", + "\n", + "**Compute intensive**\n", + "Identifying related entities across population-level data sets requires massive\n", + "compute power that far outstrips legacy on-prem data architectures.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Virtual Workshop: Entity Analytics](https://drive.google.com/file/d/1wGGT9Fn5EZF5Rgrabuttt1xdua5csrBa/view?usp=sharing)\n", + "\n", + "Learn from Databricks experts on how entity analytics is being deployed\n", + "in the public sector and watch a demo that shows how to use ML to link\n", + "payments and treatments across millions of records in a public CMS data set.\n", + "\n", + "[Solution Accelerator:](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n", + "[Machine Learning-Based Item Matching](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n", + "\n", + "While focused on retail, this accelerator has applications for any organization\n", + "working on entity matching, especially as it relates to items that might be stored\n", + "across locations. 
In this notebook, we demonstrate how to use machine learning\n", + "and the Databricks Lakehouse Platform to resolve differences between product\n", + "definitions and descriptions, and determine which items are likely pairs and\n", + "which are distinct across disparate data sets.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na21/entity-resolution-using-patient-records-at-cmmi)**\n", + "\n", + "In this talk, NewWave shares the specifics on CMS’s entity resolution use case,\n", + "the ML necessary for this data and the unique uses of Databricks in providing\n", + "this capability.\n", + "\n", + "##### Sample workflow\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Geospatial Analytics\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Broad range of analytics capabilities**\n", + "Enterprises require a diverse set of data applications — including SQL-based\n", + "analytics, real-time monitoring, data science and machine learning — to support\n", + "geospatial workloads given the diverse nature of the data and use cases.\n", + "\n", + "##### Solution overview\n", + "\n", + "With Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\n", + "workloads, as it provides a single source of truth for all types of structured,\n", + "unstructured, streaming and batch data, enabling seamless spatio-temporal\n", + "unification and cross-querying with tabular and raster-based data. Built on\n", + "Apache Spark, the Lakehouse easily scales for data sets consisting of billions\n", + "of rows of data with distributed processing in the cloud. To expand on the core\n", + "capabilities of the Lakehouse, Databricks has introduced the Mosaic library,\n", + "an extension to the Apache Spark framework, built for fast and easy processing\n", + "of large geospatial data sets. Popular frameworks such as Apache Sedona or\n", + "GeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\n", + "Lakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\n", + "to support all types of geospatial use cases.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
3906955f310fd2514373213350a23fe3##### Solution overview\n", + "\n", + "With Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\n", + "workloads, as it provides a single source of truth for all types of structured,\n", + "unstructured, streaming and batch data, enabling seamless spatio-temporal\n", + "unification and cross-querying with tabular and raster-based data. Built on\n", + "Apache Spark, the Lakehouse easily scales for data sets consisting of billions\n", + "of rows of data with distributed processing in the cloud. To expand on the core\n", + "capabilities of the Lakehouse, Databricks has introduced the Mosaic library,\n", + "an extension to the Apache Spark framework, built for fast and easy processing\n", + "of large geospatial data sets. Popular frameworks such as Apache Sedona or\n", + "GeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\n", + "Lakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\n", + "to support all types of geospatial use cases.\n", + "\n", + "\n", + "Every day billions of handheld and IoT devices, along with thousands of\n", + "airborne and satellite remote sensing platforms, generate hundreds of exabytes\n", + "of location-aware data. This boom of geospatial big data combined with\n", + "advancements in machine learning is enabling government agencies to develop\n", + "new capabilities. The potential use cases for geospatial analytics and AI touch\n", + "every part of the government, including disaster recovery (e.g., flood/earthquake\n", + "mapping), defense and intel (e.g., detecting threats using drone footage),\n", + "infrastructure (e.g., public transportation planning), civilian safety (e.g., crime\n", + "prediction), public health (e.g., disease spread tracking), and much more. Every\n", + "agency at the state and federal level needs to consider how they can tap into\n", + "geospatial data.\n", + "\n", + "##### Challenges\n", + "\n", + "**Massive volumes of geospatial data**\n", + "With the proliferation of low-cost sensor arrays, GPS technologies and high-resolution imaging, organizations are collecting tens of TBs of geospatial data\n", + "daily, outpacing their ability to store and process this data at scale.\n", + "\n", + "\n", + "**Compute-intensive spatial workloads**\n", + "Geospatial data is complex in structure, with various formats not well suited for\n", + "legacy data warehouses, as well as being compute intensive, with geospatial-specific transformations and queries requiring hours and hours of compute.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "[Mosaic for Geospatial Analytics](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "\n", + "Build a Lakehouse to support all of your geospatial analytics and AI use cases\n", + "with the Mosaic library. 
Mosaic provides a number of capabilities including easy\n", + "conversion between common spatial data encodings, constructors to easily\n", + "generate new geometries from Spark native data types, many of the OGC SQL\n", + "standard ST_ functions implemented as Spark Expressions for transforming,\n", + "aggregating and joining spatial data sets, and optimizations for performing point-in-polygon joins using an approach we co-developed with Ordnance Survey —\n", + "all provided with the flexibility of a Scala, SQL or Python API.\n", + "\n", + "[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "\n", + "Learn how to build powerful geospatial insights and visualizations with a\n", + "Lakehouse for all your geospatial data processing, analytics and AI.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
968928e9f1109e7b2205726eff3e1d66[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "\n", + "Learn how to build powerful geospatial insights and visualizations with a\n", + "Lakehouse for all your geospatial data processing, analytics and AI.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**\n", + "\n", + "**Analyzing Flight Data to Improve Aviation**\n", + "To help airlines better serve their millions of passengers, USDOT built a\n", + "modern analytics architecture on Databricks that incorporates data such as\n", + "weather, flight, aeronautical and surveillance information. With this new\n", + "platform, they reduced compute costs by 90% and can now power use cases\n", + "such as predicting air cargo traffic patterns, flight delays and the financial\n", + "impact of flight cancellations.\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=LP198QMdDbY&t=1070s)**\n", + "\n", + "**Customer Story: Flood Prediction With Machine Learning**\n", + "In an effort to improve the safety of civil projects, Stantec built a machine\n", + "learning model on Databricks leveraging large volumes of weather and geological\n", + "data — oftentimes consisting of trillions of data points — to predict the impact\n", + "of flash floods on various regions and adjust civil planning accordingly.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "Diagram: Mosaic Kepler magics and geometry display functions for map display; built-in indexing system support; ESRI and JTS Java APIs for geometry operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Public Health Management\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "In their lifetime, every human is expected to generate a million gigabytes of\n", + "health data spanning electronic health records, medical images, claims, wearable\n", + "data, genomics and more. This data is critical to understanding the health of\n", + "the individual, but when aggregated and analyzed across large populations,\n", + "government agencies can glean important insights like disease trends, the\n", + "impact of various treatment guidelines and the effectiveness of resources. By\n", + "adding in [Social Determinants of Health (SDOH)](https://databricks.com/blog/2022/04/18/increasing-healthcare-equity-with-data.html) data — such as geographical\n", + "location, income level, education, housing — agencies can better identify\n", + "underserved communities and the critical factors that contribute to positive\n", + "health outcomes.\n", + "\n", + "##### Challenges\n", + "\n", + "**Rapidly growing health data**\n", + "Healthcare data is growing exponentially. 
Unfortunately, legacy on-premises data\n", + "architectures are complex to manage and too costly to scale for populationscale analytics.\n", + "\n", + "\n", + "**Complexities of ML in healthcare**\n", + "The legacy analytics platforms that underpin healthcare lack the robust data\n", + "science capabilities needed for predictive health use cases like disease risk\n", + "scoring. There’s also the challenge of managing reproducibility, which is critical\n", + "when building ML models that can impact patient outcomes.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables public health agencies to bring together all\n", + "their research and patient data in a HIPAA-certified environment and marry it\n", + "with powerful analytics and AI capabilities to deliver real-time and predictive\n", + "insights at population scale. The Lakehouse eliminates the need for legacy\n", + "data architectures, which have historically inhibited innovation in patient care\n", + "by creating data silos and making advanced analytics difficult. Databricks led\n", + "open source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\n", + "that make it easy to ingest and prepare healthcare-specific data modalities for\n", + "downstream analytics.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
88e28b7c1cb02366091a1560b26dfc60##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables public health agencies to bring together all\n", + "their research and patient data in a HIPAA-certified environment and marry it\n", + "with powerful analytics and AI capabilities to deliver real-time and predictive\n", + "insights at population scale. The Lakehouse eliminates the need for legacy\n", + "data architectures, which have historically inhibited innovation in patient care\n", + "by creating data silos and making advanced analytics difficult. Databricks-led\n", + "open source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\n", + "that make it easy to ingest and prepare healthcare-specific data modalities for\n", + "downstream analytics.\n", + "\n", + "\n", + "**Fragmented patient data**\n", + "It is widely accepted that over 80% of medical data is unstructured, yet most\n", + "organizations still focus their attention on data warehouses designed to only\n", + "support structured data and SQL-based analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "[NLP for Healthcare](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "\n", + "Our joint solutions with John Snow Labs bring together the power of Spark NLP\n", + "for Healthcare with the collaborative analytics and AI capabilities of Databricks.\n", + "Informatics teams can ingest raw unstructured medical text files into Databricks,\n", + "extract meaningful insights using natural language processing techniques,\n", + "and make the data available for downstream analytics. We have specific NLP\n", + "solutions for [extracting oncology insights](https://databricks.com/solutions/accelerators/nlp-oncology) from lab reports, automating the de-identification of PHI and [identifying adverse drug events](https://databricks.com/blog/2022/01/17/improving-drug-safety-with-adverse-event-detection-using-nlp.html).\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n", + "[Disease Risk Prediction](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n", + "\n", + "One of the most powerful tools for identifying patients at risk for a chronic\n", + "condition is the analysis of real world data (RWD). 
This Solution Accelerator\n", + "notebook provides a template for building a machine learning model that\n", + "assesses the risk of a patient for a given condition within a given window of time\n", + "based on a patient’s encounter history and demographics information.\n", + "\n", + "\n", + "[Demo: Real-Time](https://www.youtube.com/watch?v=_ltDF2obiSc)\n", + "[COVID-19 Contact Tracing](https://www.youtube.com/watch?v=_ltDF2obiSc)\n", + "\n", + "Databricks COVID-19 surveillance solution takes a data-driven approach to\n", + "adaptive response, applying predictive analytics to COVID-19 data sets to\n", + "help drive more effective shelter-in-place policies.\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n", + "\n", + "**From Vaccine Management to ICU Planning**\n", + "During the pandemic, the Chesapeake Regional Information System for our\n", + "Patients implemented a modern data architecture on Databricks to address\n", + "critical reporting needs. This allowed them to analyze 400 billion data points\n", + "\n", + "for innovative use cases like real-time disease spread tracking, vaccine\n", + "distribution and prioritizing vulnerable populations.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Conclusion\n", + "\n", + "Today, data is at the core of how government agencies operate and AI is at theSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
f9e2a23ff54684fbfcd88171a4a56914Databricks COVID-19 surveillance solution takes a data-driven approach to\n", + "adaptive response, applying predictive analytics to COVID-19 data sets to\n", + "help drive more effective shelter-in-place policies.\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n", + "\n", + "**From Vaccine Management to ICU Planning**\n", + "During the pandemic, the Chesapeake Regional Information System for our\n", + "Patients implemented a modern data architecture on Databricks to address\n", + "critical reporting needs. This allowed them to analyze 400 billion data points\n", + "\n", + "for innovative use cases like real-time disease spread tracking, vaccine\n", + "distribution and prioritizing vulnerable populations.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Conclusion\n", + "\n", + "Today, data is at the core of how government agencies operate and AI is at the\n", + "\n", + "forefront of driving innovation into the future. The Databricks Lakehouse for\n", + "\n", + "Public Sector enables government agencies at the federal, state and local level\n", + "\n", + "to harness the full power of data and analytics to solve strategic challenges and\n", + "\n", + "make smarter decisions that improve the safety and quality of life of all citizens.\n", + "\n", + "Get started with a free trial of Databricks Lakehouse and start building better\n", + "\n", + "data applications today.\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "###### Contact us for a personalized demo databricks.com/contact\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n", + "unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\n", + "mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
a61e36c3eaa7a642dc30c708abe4dc6c**EBOOK**\n", + "\n", + "# Four Forces Driving Intelligent Manufacturing\n", + "\n", + "### A data-driven business built on Lakehouse for Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction .................................................................................................................................................................................................................................................. **03**\n", + "\n", + "The four driving forces of change ..................................................................................................................................................................................................... **04**\n", + "\n", + "Digital transformation is not a destination, it’s a journey .......................................................................................................................................................... **05**\n", + "\n", + "Manufacturing – use case maturity matrix ...................................................................................................................................................................................... **06**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
a957319c3c96e04dfccb0e1e1a1f4ccdManufacturing – use case maturity matrix ...................................................................................................................................................................................... **06**\n", + "\n", + "The foundations for data-driven manufacturing ............................................................................................................................................................................ **07**\n", + "\n", + "DRIVING FORCE NO. 1\n", + "The shift from manufacturing to Intelligent Manufacturing ...................................................................................................................................................... **08**\n", + "\n", + "DRIVING FORCE NO. 2\n", + "Transparency, visibility, data: optimizing the supply chain ........................................................................................................................................................ **10**\n", + "\n", + "DRIVING FORCE NO. 3\n", + "Future opportunities for manufacturing business models ......................................................................................................................................................... **13**SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
46fb1c1ddbad577c69f2632635003ba9DRIVING FORCE NO. 3\n", + "Future opportunities for manufacturing business models ......................................................................................................................................................... **13**\n", + "\n", + "DRIVING FORCE NO. 4\n", + "The focus on sustainability ....................................................................................................................................................................................................................... **15**\n", + "\n", + "Leveraging the Databricks Lakehouse for Manufacturing ........................................................................................................................................................... **17**\n", + "\n", + "The building blocks of Lakehouse for Manufacturing .................................................................................................................................................................... **18**\n", + "\n", + "Manufacturers’ end goals .......................................................................................................................................................................................................................... **19**\n", + "\n", + "2 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## IntroductionSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
fe836d02d98b3b6c8a001bb6836708c8Manufacturers’ end goals .......................................................................................................................................................................................................................... **19**\n", + "\n", + "2 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "##### Manufacturing has always been an evolutionary business, grounded upon infrastructure, business processes, and manufacturing operations built over decades in a continuum of successes, insights and learnings. The methods and processes used to approach the development, release and optimization of products and capital spend are the foundation of the industry’s evolution.\n", + "\n", + "\n", + "But today it’s data- and AI-driven businesses that\n", + "are being rewarded because they’re using process\n", + "and product optimization not previously possible,\n", + "able to forecast and sense supply chain demand,\n", + "and, crucially, introduce new forms of revenue\n", + "based upon service rather than product.\n", + "\n", + "The drivers for this evolution will be the emergence\n", + "of what we refer to as “Intelligent Manufacturing”\n", + "that has been enabled by the rise of computational\n", + "power at the Edge and in the Cloud. As well as\n", + "new levels of connectivity speed enabled by 5G\n", + "and fiber optic, combined with increased use of\n", + "advanced analytics and machine learning (ML).\n", + "\n", + "\n", + "Yet, even with all the technological advances\n", + "enabling these new data-driven businesses,\n", + "challenges exist.\n", + "\n", + "McKinsey’s recent research with the World\n", + "Economic Forum estimates the value creation\n", + "potential of manufacturers and suppliers that\n", + "implement Industry 4.0 in their operations\n", + "at USD$37 trillion by 2025. Truly a huge number.\n", + "But the challenge that most companies still\n", + "struggle with is the move from piloting point\n", + "solutions to delivering sustainable impact at scale.\n", + "[Only 30% of companies are capturing value from](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n", + "[Industry 4.0 solutions in manufacturing today.](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n", + "\n", + "\n", + "##### 80% of manufacturers\n", + "[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n", + "[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
b0de1f22c24ec00611cdfbd12e2b0ef5##### 80% of manufacturers\n", + "[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n", + "[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n", + "\n", + "##### 57% of manufacturing leaders feel their organization\n", + "[lacks skilled workers to support](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n", + "[their smart manufacturing plans](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n", + "\n", + "[A lack of supply chain](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n", + "[integration could stall smart](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n", + "[factory initiatives for](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf) **[3 in 5](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)**\n", + "##### manufacturers by 2025\n", + "\n", + "\n", + "3 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## The four driving forces of change\n", + "\n", + "###### Over the last two years, demand imbalances and supply chain swings have added a sense of urgency for manufacturers to digitally transform. But in truth, the main challenges facing the industry have existed, and will continue to exist, outside these recent exceptional circumstances. Manufacturers will always strive for greater levels of visibility across their supply chain, always seek to optimize and streamline operations to improve margins. In the continuing quest for improved efficiency, productivity, adaptability and resilience, manufacturers are commonly tackling these major challenges:\n", + "\n", + "\n", + "###### Skills and production gaps\n", + "\n", + "The rise of the digital economy is demanding a new set of skills.\n", + "For today’s Intelligent Manufacturing organizations, there’s a fundamental\n", + "need for computer and programming skills for automation, along\n", + "with critical-thinking abilities. Also important is the ability to use\n", + "collaboration systems and new advanced assistance tools, such as\n", + "automation, virtual reality (VR) and augmented reality (AR). The deficit\n", + "of workers with these skills is of critical concern to manufacturers.\n", + "\n", + "In addition, the industry dynamics are pushing companies to increase\n", + "and refine both partner/supplier relationships, optimize internal\n", + "operations and build robust supply chains that do not rely upon\n", + "safety stock to weather supply chain swings. 
Historical focus on\n", + "operational use cases is now extending to building agile supply chains.\n", + "\n", + "###### Supply chain volatility\n", + "\n", + "If the events of the last few years proved anything, it’s that supply\n", + "chains need to be robust and resilient. Historically, supply chain volatility\n", + "was smoothed by holding “safety stock,” which added costs without\n", + "financial value. Then the pendulum swung to “just in time delivery,”\n", + "where efficient use of working capital disregarded demand risks.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
a2db7fe8fbdc33c18355e02c7ec60dc7The rise of the digital economy is demanding a new set of skills.\n", + "For today’s Intelligent Manufacturing organizations, there’s a fundamental\n", + "need for computer and programming skills for automation, along\n", + "with critical-thinking abilities. Also important is the ability to use\n", + "collaboration systems and new advanced assistance tools, such as\n", + "automation, virtual reality (VR) and augmented reality (AR). The deficit\n", + "of workers with these skills is of critical concern to manufacturers.\n", + "\n", + "In addition, the industry dynamics are pushing companies to increase\n", + "and refine both partner/supplier relationships, optimize internal\n", + "operations and build robust supply chains that do not rely upon\n", + "safety stock to weather supply chain swings. Historical focus on\n", + "operational use cases is now extending to building agile supply chains.\n", + "\n", + "###### Supply chain volatility\n", + "\n", + "If the events of the last few years proved anything, it’s that supply\n", + "chains need to be robust and resilient. Historically, supply chain volatility\n", + "was smoothed by holding “safety stock,” which added costs without\n", + "financial value. Then the pendulum swung to “just in time delivery,”\n", + "where efficient use of working capital disregarded demand risks.\n", + "\n", + "Recent experiences have highlighted that demand sensing is needed\n", + "in addition to safety stock for high-risk parts or raw materials. The ability\n", + "to monitor, predict and respond to external factors – including natural\n", + "disasters, shipping and warehouse constraints, and geopolitical disruption\n", + "– is vital to reduce risk and promote agility. Many of these external\n", + "data sources leverage unstructured data (news, social posts, videos\n", + "and images), and being able to manage both structured and unstructured\n", + "data available to measure and analyze this volatility is key.\n", + "\n", + "\n", + "###### Need for new and additional sources of revenue\n", + "\n", + "Manufacturers’ growth historically has been limited\n", + "to new product introduction rate or expansion into\n", + "new geographies. The emergence of “equipment\n", + "as-a-service” is changing that dynamic. It’s pivoting\n", + "the business from product-centric growth to one\n", + "leveraging added services, which are not slaves to the\n", + "product development introduction cycle and can be highly\n", + "differentiated depending on the market segment and types\n", + "of products. Real-time data plays an outsize role, as now\n", + "businesses are in unison with use cases such as predictive\n", + "maintenance, stock replenishment and worker safety.\n", + "\n", + "###### An increased focus on sustainability\n", + "\n", + "Manufacturers have always focused on efficiency,\n", + "but they’re increasingly seeing the value chain as circular.\n", + "It’s no longer enough to consider an organization’s own\n", + "carbon footprint – it needs to also include indirect\n", + "emissions and other environmental impacts from the\n", + "activities it doesn’t own or control. 
This requires a\n", + "360-degree view of sustainability, which includes both\n", + "internal and external factors in measuring compliance\n", + "with ESG programs.\n", + "\n", + "**This eBook will look closer at these four key challenges**\n", + "**and their associated use cases, as well as some**\n", + "**of the most effective technologies and solutions**\n", + "**that can be implemented to respond to them.**\n", + "\n", + "\n", + "4 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Digital transformation is not a destination, it’s a journey\n", + "\n", + "##### Digitalization is reshaping many areas of manufacturing and logistics, product design, production and quality of goods as well as sustainability and energy output.\n", + "\n", + "This transition from manual operations to automated\n", + "solutions is enhancing and optimizing operational\n", + "efficiency and decision-making, while also making\n", + "supply chains more frictionless and reliable, as well\n", + "as enabling organizations to become more responsive\n", + "and adaptable to market and customer needs.\n", + "\n", + "This disruption has been driven by a rush of new\n", + "technologies including artificial intelligence, machine\n", + "learning, advanced analytics, digital twins, Internet\n", + "of Things (IoT), and automation. These, in turn, have\n", + "been enabled by the greater network capabilities of 5G.\n", + "Industry 4.0 is well underway. Intelligent Manufacturing\n", + "isn’t the future, it’s what competitive organizations\n", + "have established today.\n", + "\n", + "\n", + "## The data and AI maturity curve\n", + "### From descriptive to prescriptive\n", + "\n", + "Prescriptive\n", + "Analytics\n", + "\n", + "Predictive\n", + "Modeling\n", + "\n", + "**How** can we make it happen?\n", + "\n", + "Data\n", + "Exploration\n", + "\n", + "\n", + "**What** will happen?\n", + "\n", + "**Why** did it happen?\n", + "\n", + "\n", + "Ad Hoc\n", + "Queries\n", + "\n", + "\n", + "Reports\n", + "\n", + "\n", + "Cleaned\n", + "Data\n", + "\n", + "**What** happened?\n", + "\n", + "Analytics Maturity\n", + "\n", + "\n", + "Raw\n", + "Data\n", + "\n", + "\n", + "5 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Manufacturing – use case maturity matrix\n", + "\n", + "\n", + "No\n", + "\n", + "1SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
fb676139b6f1a49181c1e792ad607ca7This disruption has been driven by a rush of new\n", + "technologies including artificial intelligence, machine\n", + "learning, advanced analytics, digital twins, Internet\n", + "of Things (IoT), and automation. These, in turn, have\n", + "been enabled by the greater network capabilities of 5G.\n", + "Industry 4.0 is well underway. Intelligent Manufacturing\n", + "isn’t the future, it’s what competitive organizations\n", + "have established today.\n", + "\n", + "\n", + "## The data and AI maturity curve\n", + "### From descriptive to prescriptive\n", + "\n", + "Prescriptive\n", + "Analytics\n", + "\n", + "Predictive\n", + "Modeling\n", + "\n", + "**How** can we make it happen?\n", + "\n", + "Data\n", + "Exploration\n", + "\n", + "\n", + "**What** will happen?\n", + "\n", + "**Why** did it happen?\n", + "\n", + "\n", + "Ad Hoc\n", + "Queries\n", + "\n", + "\n", + "Reports\n", + "\n", + "\n", + "Cleaned\n", + "Data\n", + "\n", + "**What** happened?\n", + "\n", + "Analytics Maturity\n", + "\n", + "\n", + "Raw\n", + "Data\n", + "\n", + "\n", + "5 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Manufacturing – use case maturity matrix\n", + "\n", + "\n", + "No\n", + "\n", + "1\n", + "\n", + "2\n", + "\n", + "3\n", + "\n", + "4\n", + "\n", + "5\n", + "\n", + "6\n", + "\n", + "7\n", + "\n", + "8\n", + "\n", + "9\n", + "\n", + "10\n", + "\n", + "11\n", + "\n", + "12\n", + "\n", + "13\n", + "\n", + "14\n", + "\n", + "15\n", + "\n", + "16\n", + "\n", + "17\n", + "\n", + "18\n", + "\n", + "19\n", + "\n", + "20\n", + "\n", + "21\n", + "\n", + "22\n", + "\n", + "23\n", + "\n", + "\n", + "Use case name\n", + "\n", + "EDW offload\n", + "\n", + "Product 360\n", + "\n", + "Voice of customer insights\n", + "\n", + "Testing & simulation optimization\n", + "\n", + "Supplier 360\n", + "\n", + "Spend analytics\n", + "\n", + "Sourcing event optimization\n", + "\n", + "Process & quality monitoring\n", + "\n", + "Process 360\n", + "\n", + "Equipment predictive maintenance\n", + "\n", + "Quality & yield optimization\n", + "\n", + "Supply chain 360\n", + "\n", + "Demand analytics\n", + "\n", + "Inventory visibility & tracking\n", + "\n", + "Inventory optimization\n", + "\n", + "Logistics route optimization\n", + "\n", + "Customer 360\n", + "\n", + "Marketing & sales personalization\n", + "\n", + "Recommendation engine\n", + "\n", + "Asset/Vehicle 360\n", + "\n", + "Connected asset & value-added services\n", + "\n", + "Quality event detection & traceability\n", + "\n", + "Asset predictive maintenance\n", + "\n", + "\n", + "Peer Competitive Scale\n", + "\n", + "Standard among peer group\n", + "\n", + "Common among peer group\n", + "\n", + "Strategic among peer group\n", + "\n", + "\n", + "Design\n", + "\n", + "\n", + "Purchasing\n", + "\n", + "**11**\n", + "\n", + "**10**\n", + "\n", + "**13**\n", + "\n", + "**12**\n", + "\n", + "**17**\n", + "\n", + "\n", + "New innovations\n", + "\n", + "Manufacturing\n", + "\n", + "Supply Chain\n", + "\n", + "\n", + "That is not to say that the digital transformation\n", + "journey is simple. 
Replacing legacy systems, breaking\n", + "down data and organizational silos, bridging the gap\n", + "between operational technology (OT) and informational\n", + "technology (IT), reskilling workforces, and much more\n", + "requires a clear and determined digitalization strategy,\n", + "and to reach new levels of IT and data maturity.\n", + "\n", + "\n", + "**16**\n", + "\n", + "\n", + "Much of the aforementioned transformation requires\n", + "a foundation of effective data management and\n", + "architecture to be in place. Without this ability to\n", + "control the vast amounts of structured data (highly\n", + "organized and easily decipherable) and unstructured\n", + "data (qualitative, no predefined data model),\n", + "manufacturers cannot generate actionable insights\n", + "from their data, derive value from machine learning,\n", + "monitor and analyze supply chains, or coordinate\n", + "decisions across the business.\n", + "\n", + "\n", + "**15**\n", + "\n", + "\n", + "**14**\n", + "\n", + "\n", + "Marketing & Sales\n", + "\n", + "Service\n", + "\n", + "\n", + "**19**\n", + "\n", + "\n", + "**18**\n", + "\n", + "\n", + "**23**\n", + "\n", + "\n", + "**22**\n", + "**21**\n", + "**20**\n", + "\n", + "\n", + "Awareness\n", + "\n", + "\n", + "Exploration Optimization Transformation\n", + "\n", + "Maturity Stages\n", + "\n", + "\n", + "6 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## The foundations for data-driven manufacturing\n", + "\n", + "###### Cloud-native platforms\n", + "\n", + "Improve data management, enhance data analytics\n", + "and expand the use of enterprise data, including streaming\n", + "structured and unstructured data\n", + "\n", + "###### Technology-enabled collaboration\n", + "\n", + "Democratize analytics and ML capabilities – ensure the right\n", + "users have access to the right data driving business valueSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
ba3d3e25b1b1a711acb9cb695f985606**15**\n", + "\n", + "\n", + "**14**\n", + "\n", + "\n", + "Marketing & Sales\n", + "\n", + "Service\n", + "\n", + "\n", + "**19**\n", + "\n", + "\n", + "**18**\n", + "\n", + "\n", + "**23**\n", + "\n", + "\n", + "**22**\n", + "**21**\n", + "**20**\n", + "\n", + "\n", + "Awareness\n", + "\n", + "\n", + "Exploration Optimization Transformation\n", + "\n", + "Maturity Stages\n", + "\n", + "\n", + "6 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## The foundations for data-driven manufacturing\n", + "\n", + "###### Cloud-native platforms\n", + "\n", + "Improve data management, enhance data analytics\n", + "and expand the use of enterprise data, including streaming\n", + "structured and unstructured data\n", + "\n", + "###### Technology-enabled collaboration\n", + "\n", + "Democratize analytics and ML capabilities – ensure the right\n", + "users have access to the right data driving business value\n", + "\n", + "###### The ability to scale machine learning use cases\n", + "\n", + "A central place to store and discover ML models and enabling\n", + "greater collaboration between ML, data and business users\n", + "\n", + "\n", + "##### 95% agree that\n", + "[digital transformation](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[in manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[is essential to their](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[company’s future success](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "\n", + "\n", + "[Global spending on](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n", + "[digital transformation](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n", + "[is forecast to reach](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n", + "##### USD$2.8 trillion by 2025\n", + "\n", + "\n", + "##### 85% have accelerated\n", + "[their digital transformation](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n", + "[strategies since 2020](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n", + "\n", + "\n", + "###### Open standards and open data architectures\n", + "\n", + "Leverage open source standards and open data formats\n", + "to accelerate innovation and enable the integration\n", + "of best-of-breed, third-party tools and services\n", + "\n", + "\n", + "7 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 1\n", + "\n", + "## The shift from manufacturing to Intelligent Manufacturing\n", + "\n", + "##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion . The immediate response would be to point the finger at the pandemic. 
Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n", + "\n", + "\n", + "Yet the reasons for the lack of manufacturing\n", + "talent today are manifold, and COVID-19 has only\n", + "contributed to an existing problem. For instance,\n", + "many highly experienced baby boomers are\n", + "retiring from the workforce, leaving fewer people\n", + "with the in-depth knowledge of custom equipment\n", + "and machines. Meanwhile, younger generations\n", + "have a poor perception of what manufacturing jobs\n", + "are like and are reluctant to step into the industry.\n", + "Meaning not only a problem with retaining skills,\n", + "but also attracting them.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
657a4ccc4a91e02d17152105d50b375e7 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 1\n", + "\n", + "## The shift from manufacturing to Intelligent Manufacturing\n", + "\n", + "##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion . The immediate response would be to point the finger at the pandemic. Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n", + "\n", + "\n", + "Yet the reasons for the lack of manufacturing\n", + "talent today are manifold, and COVID-19 has only\n", + "contributed to an existing problem. For instance,\n", + "many highly experienced baby boomers are\n", + "retiring from the workforce, leaving fewer people\n", + "with the in-depth knowledge of custom equipment\n", + "and machines. Meanwhile, younger generations\n", + "have a poor perception of what manufacturing jobs\n", + "are like and are reluctant to step into the industry.\n", + "Meaning not only a problem with retaining skills,\n", + "but also attracting them.\n", + "\n", + "And, of course, there is a growing gap between\n", + "the current capabilities of industrial workers and\n", + "the skill sets needed for today’s data-driven,\n", + "sensor-filled, 5G-enabled Intelligent Manufacturing.\n", + "\n", + "\n", + "With the drive to optimize operations, stabilize\n", + "supply chains and reinvent business models\n", + "through equipment-as-a-service, the skill sets\n", + "have radically changed from even a decade ago.\n", + "\n", + "Intelligent Manufacturing’s use cases are placing\n", + "a high demand on robotics programmers and\n", + "technicians, cybersecurity experts, digital twin\n", + "architects, supply network analysts, and people\n", + "who can leverage AI and machine learning\n", + "algorithms because deployment of these common\n", + "use cases is producing multiples of returns for\n", + "those embracing Intelligent Manufacturing.\n", + "\n", + "\n", + "8 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Those manufacturers with a strategy for upskilling existing talent, while also changing the perception of the incoming workforce, need to take advantage of the following use cases:\n", + "\n", + "\n", + "##### 44% report difficulty\n", + "[hiring manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[talent with the required](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[digital expertise](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "\n", + "##### 83% of manufacturing workers are interested\n", + "[in learning new digital skills](https://www.mendix.com/press/welcome-news-to-jumpstart-the-post-pandemic-economy-mendix-survey-shows-78-of-u-s-manufacturing-workers-want-to-help-with-digital-transformation/)\n", + "\n", + "##### 56% of Gen Z say\n", + "[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[changed their 
perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[of manufacturing. 77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "\n", + "### Proof through customer success\n", + "\n", + "##### Watch our case study\n", + "\n", + "\n", + "###### Digital twinsSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
b93c12666e373499f2aed5bfd1bbad8e##### 56% of Gen Z say\n", + "[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[changed their perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[of manufacturing. 77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "\n", + "### Proof through customer success\n", + "\n", + "##### Watch our case study\n", + "\n", + "\n", + "###### Digital twins\n", + "\n", + "Ingesting information from sensors and other data sources,\n", + "these virtual replicas of physical assets create models\n", + "to which a layer of visualization can be applied. This enables\n", + "users to predict failures, assess performance and reveal\n", + "opportunities for optimization. Digital twins unlock the ability\n", + "for manufacturers to monitor and manage production remotely,\n", + "as well as explore “what-if” scenarios.\n", + "\n", + "###### Process and quality optimization\n", + "\n", + "Process and quality optimization generally covers the\n", + "optimization of equipment, operating procedures, and control\n", + "loops. It requires access to accurate, up-to-date data about\n", + "conditions, collected through IoT devices to monitor every\n", + "aspect. The introduction of deep learning architectures is\n", + "enabling manufacturing machinery to identify visual clues\n", + "that are indicative of quality issues in manufactured goods,\n", + "while digital twins can be used to spot inefficiencies without\n", + "the need to pause production.\n", + "\n", + "###### Throughput optimization\n", + "\n", + "Increasing throughput is critical for meeting delivery schedules,\n", + "and manufacturers are always looking for ways to identify\n", + "and eliminate bottlenecks, reduce inventory and increase\n", + "the utilization of assets. Throughput optimization makes\n", + "use of data-driven algorithms to identify, rank and resolve\n", + "labor, equipment or inventory bottlenecks.\n", + "\n", + "\n", + "###### Equipment predictive maintenance\n", + "\n", + "Rather than wait for a piece of equipment to fail or\n", + "stick to a fixed schedule, predictive maintenance adopts\n", + "a predictive approach to equipment maintenance.\n", + "By monitoring real-time data collected from hundreds\n", + "of IoT sensors, machine learning techniques can detect\n", + "anomalies in operations and possible defects in equipment\n", + "and processes. Predictive maintenance correlates data across\n", + "many more dimensions than traditional inspection techniques,\n", + "to anticipate failures and prevent costly breakdowns.\n", + "\n", + "###### Quality and yield optimization (with computer vision)\n", + "\n", + "Quality assurance focuses on the use of data analytics,\n", + "AI and machine learning to identify and prevent defects\n", + "during the manufacturing process. 
[This type of edge AI](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n", + "[is an approach that can increase productivity by 50%](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n", + "[and detection rates by up to 90%.](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process) Making use of image\n", + "recognition and machine learning, computer vision\n", + "can automate visual inspections, detecting faults\n", + "and imperfections faster and more cost effectively\n", + "than manual approaches.\n", + "\n", + "\n", + "9 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 2\n", + "\n", + "## Transparency, visibility, data: optimizing the supply chain\n", + "\n", + "##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n", + "\n", + "\n", + "Such resiliency requires a combination\n", + "of technologies and solutions. For example,\n", + "decision support tools with predictive capabilities\n", + "– to monitor the supply chain and analyze\n", + "what-if scenarios. Demand sensing and forecasting\n", + "in combination with enterprise critical systems\n", + "(ERP) needs to combine data from a wide variety\n", + "of sources.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
1e002cdf6f6032ddc2663352f7af14929 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 2\n", + "\n", + "## Transparency, visibility, data: optimizing the supply chain\n", + "\n", + "##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n", + "\n", + "\n", + "Such resiliency requires a combination\n", + "of technologies and solutions. For example,\n", + "decision support tools with predictive capabilities\n", + "– to monitor the supply chain and analyze\n", + "what-if scenarios. Demand sensing and forecasting\n", + "in combination with enterprise critical systems\n", + "(ERP) needs to combine data from a wide variety\n", + "of sources.\n", + "\n", + "10 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "Working together, combining millions of data points\n", + "from across organizations’ operations along with\n", + "other external sources, these technologies can\n", + "be used to optimize supply chains, reduce costs\n", + "and improve customer service and loyalty.\n", + "However, achieving this – embracing the latest\n", + "in AI, machine learning and predictive analytics –\n", + "means being able to manage and maintain\n", + "a flow of accurate, relevant data and to be able\n", + "to translate this data into actionable insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Successful supply chain optimization depends on up-to-the-minute, end-to-end visibility that can be applied across all stages of the supply chain, from design to planning to execution. This will incorporate a range of solutions that can include:\n", + "\n", + "\n", + "###### Demand, inventory, logistics\n", + "\n", + "\n", + "###### Purchasing\n", + "\n", + "**Spend analytics:** Most obviously, transparency and insight into where\n", + "cash is spent is vital for identifying opportunities to reduce external\n", + "spending across supply markets, suppliers and locations. However, spend\n", + "analytics are also hugely important to supply chain agility and resilience.\n", + "This requires a single source of data truth for finance and procurement\n", + "departments. For example, integrating purchase order, invoice,\n", + "accounts payable, and general-ledger account data to create a level of\n", + "transparency, visibility and consistency to inform supplier discussions\n", + "and deploy strategies to manage cash better during times\n", + "of disruption.\n", + "\n", + "###### Cross supply chain collaboration\n", + "\n", + "**Supply chain 360:** With real-time insights and aggregated supply\n", + "chain data in a single business intelligence dashboard, manufacturers\n", + "are empowered with greater levels of visibility, transparency\n", + "and insights for more informed decision-making. 
This dashboard\n", + "can be used to identify risks and take corrective steps,\n", + "assess suppliers, control costs and more.\n", + "\n", + "\n", + "**Demand analytics:** By collecting and analyzing millions –\n", + "if not billions – of data points about market and customer\n", + "behavior and product performance, manufacturers can\n", + "use this understanding to improve operations and support\n", + "strategic decisions that affect the demand of products\n", + "and services. [Around 80% say that using this form of data](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[analysis has improved decision-making, while 26% say](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[having this level of know-how to predict, shape and meet](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[demands has increased their profits.](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "\n", + "**Inventory visibility and tracking:**\n", + "Inventory visibility is the ability to view and track\n", + "inventory in real time, with insights into SKU stock levels\n", + "and which warehouse or fulfillment center it is stored at.\n", + "With complete oversight of inventory across multiple\n", + "channels, this helps improve supply chain efficiency,\n", + "demand forecasting and order accuracy, while ultimately\n", + "enhancing the customer experience.\n", + "\n", + "\n", + "**Inventory optimization:** The practice of having the right\n", + "amount of available inventory to meet demand, both in the\n", + "present and the future, enables manufacturers to address\n", + "demand expectations, and reduce the costs of common\n", + "inventory issues. Inventory optimization incorporates\n", + "data for demand forecasting, inventory strategy and\n", + "stock replenishment. With the addition of AI reinforced\n", + "learning models, this can help improve demand prediction,\n", + "recommend stock levels, and automatically order\n", + "raw materials to fulfill orders, while also detecting\n", + "and responding to shifts in demand.SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
f5fd438387cc6cba8eca4f034cde5db9**Inventory visibility and tracking:**\n", + "Inventory visibility is the ability to view and track\n", + "inventory in real time, with insights into SKU stock levels\n", + "and which warehouse or fulfillment center it is stored at.\n", + "With complete oversight of inventory across multiple\n", + "channels, this helps improve supply chain efficiency,\n", + "demand forecasting and order accuracy, while ultimately\n", + "enhancing the customer experience.\n", + "\n", + "\n", + "**Inventory optimization:** The practice of having the right\n", + "amount of available inventory to meet demand, both in the\n", + "present and the future, enables manufacturers to address\n", + "demand expectations, and reduce the costs of common\n", + "inventory issues. Inventory optimization incorporates\n", + "data for demand forecasting, inventory strategy and\n", + "stock replenishment. With the addition of AI reinforced\n", + "learning models, this can help improve demand prediction,\n", + "recommend stock levels, and automatically order\n", + "raw materials to fulfill orders, while also detecting\n", + "and responding to shifts in demand.\n", + "\n", + "**Logistics route optimization:** Using AI, route optimization\n", + "can help manufacturers go beyond normal route planning\n", + "and include parameters to further drive logistics efficiency.\n", + "What-if scenarios present route options that help cut\n", + "transportation costs, boost productivity and execute\n", + "on-time deliveries.\n", + "\n", + "\n", + "**Supply chain network design:** By building and modeling the supply\n", + "chain, it enables manufacturers to understand the costs and time\n", + "to bring goods and services to market. Supply chain network design\n", + "helps to evaluate delivery at the lowest possible cost, optimal sources\n", + "and inventory deployment, as well as define distribution strategies.\n", + "\n", + "11 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "[Successfully implementing AI-enabled supply](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n", + "[chain management has enabled early adopters to](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n", + "##### improve logistics costs by 15%, inventory levels by 35%, and service levels by 65%\n", + "\n", + " Only 6% of companies believe\n", + "[they’ve achieved full supply chain visibility](https://www.supplychaindive.com/news/supply-chain-visibility-failure-survey-geodis/517751/\r)\n", + "\n", + "##### 57% believe that supply chain management \n", + "[gives them a competitive edge that enables them](https://financesonline.com/supply-chain-statistics/\r)\n", + "[to further develop their business](https://financesonline.com/supply-chain-statistics/\r)\n", + "\n", + "### Supply chain optimization case study\n", + "\n", + "##### Watch our case study\n", + "\n", + "12 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 3\n", + "\n", + "## Future opportunities for manufacturing business models\n", + "\n", + "##### When looking at the rapid evolution and growth of e-commerce, manufacturers have some catching up to do. Particularly when it comes to embracing new and customer-centric business models. 
For example, when shifting from a product to a service mindset, the product lifecycle becomes more holistic and the client relationship is maintained beyond the point of purchase.\n", + "\n", + "\n", + "These new opportunities are forming part\n", + "of a longer-term industry shift from the sale\n", + "of goods (CapEx) to recurring revenue streams,\n", + "such as through Equipment-as-a-Service (EaaS)\n", + "models. While this approach is not new to many\n", + "(Rolls-Royce’s “Power-by-the-Hour” engine\n", + "subscription model has been around since 1962),\n", + "customer demand, advances in industrial IoT\n", + "technology, and a continuing decline in\n", + "sales and margins have seen EaaS emerge\n", + "as an imperative for manufacturers.\n", + "\n", + "\n", + "Opening up some of these new revenue streams,\n", + "of course, demands operational flexibility, but more\n", + "importantly, digital maturity. This means cloud\n", + "technologies that allow employees new levels\n", + "of access to data, the ability to work anywhere,\n", + "and adapt rapidly to new needs. The introduction\n", + "of a microservices architecture, to allow the agile\n", + "development and deployment of new IT services.\n", + "And the democratization of data, so the entire\n", + "organization and its ecosystem of partners\n", + "and suppliers have access to information\n", + "about market demand, operations, production,\n", + "logistics and transportation.\n", + "\n", + "\n", + "13 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
e39b4cfca32bed94bd77fdf60caaae62These new opportunities are forming part\n", + "of a longer-term industry shift from the sale\n", + "of goods (CapEx) to recurring revenue streams,\n", + "such as through Equipment-as-a-Service (EaaS)\n", + "models. While this approach is not new to many\n", + "(Rolls-Royce’s “Power-by-the-Hour” engine\n", + "subscription model has been around since 1962),\n", + "customer demand, advances in industrial IoT\n", + "technology, and a continuing decline in\n", + "sales and margins have seen EaaS emerge\n", + "as an imperative for manufacturers.\n", + "\n", + "\n", + "Opening up some of these new revenue streams,\n", + "of course, demands operational flexibility, but more\n", + "importantly, digital maturity. This means cloud\n", + "technologies that allow employees new levels\n", + "of access to data, the ability to work anywhere,\n", + "and adapt rapidly to new needs. The introduction\n", + "of a microservices architecture, to allow the agile\n", + "development and deployment of new IT services.\n", + "And the democratization of data, so the entire\n", + "organization and its ecosystem of partners\n", + "and suppliers have access to information\n", + "about market demand, operations, production,\n", + "logistics and transportation.\n", + "\n", + "\n", + "13 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "##### By 2023, 20% of industrial equipment manufacturers will\n", + "[support EaaS with remote](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n", + "[Industrial IoT capabilities](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n", + "\n", + "##### In 2025, the global EaaS market is estimated\n", + "[to grow to $131B compared](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n", + "[to $22B in 2019](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n", + "\n", + "##### In the U.S., 34% said\n", + "[pay-per-use models represent](https://relayr.io/pr-forsa-survey/)\n", + "[a big or a very big competitive](https://relayr.io/pr-forsa-survey/)\n", + "[advantage, while 29% consider](https://relayr.io/pr-forsa-survey/)\n", + "[it a slight advantage](https://relayr.io/pr-forsa-survey/)\n", + "\n", + "### Equipment as a service case study\n", + "\n", + "##### Read our case study\n", + "\n", + "\n", + "### This level of visibility and collaboration is not only beneficial to lower maintenance costs, capital expenditure and human capital management, but also in empowering all stakeholders to make smarter and more informed decisions.\n", + "\n", + "\n", + "###### Connected assets\n", + "\n", + "The digital connectivity of high-value\n", + "physical assets is helping to drive a\n", + "more efficient use of assets and cost\n", + "savings. Connected assets can provide\n", + "continuous, real-time data on their\n", + "operating conditions, even if they are on\n", + "the other side of the world. 
Connected\n", + "assets can also be used as the foundation\n", + "of as-a-service business models to\n", + "track the usage of rented machines, and\n", + "for automakers to use with connected\n", + "vehicles and electrification strategies.\n", + "\n", + "\n", + "###### Quality event detection and traceability\n", + "\n", + "Manufacturers are increasingly seeking\n", + "end-to-end supply chain traceability —\n", + "to be able to identify and trace\n", + "the history, distribution, location\n", + "and application of products, parts\n", + "and materials. With event-based\n", + "traceability, typically using blockchain\n", + "ledgers, manufacturers can record\n", + "events along the supply chain.\n", + "This can help aid legal compliance,\n", + "support quality assurance and brand\n", + "trust, and provide full supply chain\n", + "visibility for better risk management.\n", + "\n", + "\n", + "###### Demand-driven manufacturing\n", + "\n", + "**Equipment-as-a-Service:**\n", + "Startup organizations without\n", + "the in-house infrastructure can\n", + "use a third-party to realize their\n", + "concepts, while manufacturers\n", + "with the production capabilities\n", + "can ensure minimal downtime\n", + "for their assets. This involves\n", + "greater risk for the manufacturer,\n", + "but also the potential for higher\n", + "and annuitized revenues.\n", + "\n", + "\n", + "14 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 4\n", + "\n", + "## The focus on sustainabilitySUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
3a08841a626197fbd67f40efeccd13e5###### Quality event detection and traceability\n", + "\n", + "Manufacturers are increasingly seeking\n", + "end-to-end supply chain traceability —\n", + "to be able to identify and trace\n", + "the history, distribution, location\n", + "and application of products, parts\n", + "and materials. With event-based\n", + "traceability, typically using blockchain\n", + "ledgers, manufacturers can record\n", + "events along the supply chain.\n", + "This can help aid legal compliance,\n", + "support quality assurance and brand\n", + "trust, and provide full supply chain\n", + "visibility for better risk management.\n", + "\n", + "\n", + "###### Demand-driven manufacturing\n", + "\n", + "**Equipment-as-a-Service:**\n", + "Startup organizations without\n", + "the in-house infrastructure can\n", + "use a third-party to realize their\n", + "concepts, while manufacturers\n", + "with the production capabilities\n", + "can ensure minimal downtime\n", + "for their assets. This involves\n", + "greater risk for the manufacturer,\n", + "but also the potential for higher\n", + "and annuitized revenues.\n", + "\n", + "\n", + "14 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 4\n", + "\n", + "## The focus on sustainability\n", + "\n", + "##### It’s an inescapable truth that Earth’s resources are finite, and we need to change our present, linear business models for something that minimizes our use of resources and eliminates waste. Manufacturers need to take a more sustainable approach, where they can limit their negative environmental impacts, while also conserving energy and natural resources.\n", + "\n", + "\n", + "When looking at the entire manufacturing\n", + "value chain, there are many areas where\n", + "more sustainable practices can deliver\n", + "measurable change. Products can be\n", + "designed in a way that reduces waste\n", + "and increases their longevity; materials\n", + "can be selected and sourced in a more\n", + "ethical way; operational efficiency and\n", + "green energy can improve production;\n", + "and the introduction of sustainable\n", + "practices for transportation and\n", + "shipping can help reduce carbon\n", + "footprints. [These are part of the move](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[toward more circular business models](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[and establishing what PwC has called the](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[four Rs of the circular economy: Reduce,](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[Refurbish/Reuse, Recycle and Recover.](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "\n", + "\n", + "There are a number of business\n", + "operating models that employ the four\n", + "Rs and support the circular economy.\n", + "Sharing platforms and aaS models help\n", + "optimize manufacturing capacity and\n", + "enable businesses to rent rather than\n", + "buy the machinery and equipment\n", + "they need. 
Product use extension helps\n", + "extend the lifecycle of products through\n", + "repair and refurbishment, while resource\n", + "recovery means recovering raw materials\n", + "from end-of-life products.\n", + "\n", + "Achieving this means establishing\n", + "a redesigned supply chain that\n", + "leverages many use cases, technologies\n", + "and solutions we covered earlier.\n", + "\n", + "\n", + "It will require greater levels of\n", + "collaboration between suppliers\n", + "and vendors. It will require optimizing\n", + "production lines and transportation.\n", + "It will require greater levels of customer\n", + "engagement to extend product lifecycles\n", + "and close the loop of the supply chain.\n", + "\n", + "But most of all, it will require data,\n", + "to provide visibility and intelligence\n", + "across the network, and to be able\n", + "to make the decisions to improve\n", + "efficiency in the present, as well as\n", + "longer-term decisions based on a\n", + "broad view of sustainability impacts.\n", + "\n", + "\n", + "15 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Sustainability Solution Accelerator\n", + "\n", + "##### Read nowSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
b7fdb18fa34a2929db074b7b529f7ebcThere are a number of business\n", + "operating models that employ the four\n", + "Rs and support the circular economy.\n", + "Sharing platforms and aaS models help\n", + "optimize manufacturing capacity and\n", + "enable businesses to rent rather than\n", + "buy the machinery and equipment\n", + "they need. Product use extension helps\n", + "extend the lifecycle of products through\n", + "repair and refurbishment, while resource\n", + "recovery means recovering raw materials\n", + "from end-of-life products.\n", + "\n", + "Achieving this means establishing\n", + "a redesigned supply chain that\n", + "leverages many use cases, technologies\n", + "and solutions we covered earlier.\n", + "\n", + "\n", + "It will require greater levels of\n", + "collaboration between suppliers\n", + "and vendors. It will require optimizing\n", + "production lines and transportation.\n", + "It will require greater levels of customer\n", + "engagement to extend product lifecycles\n", + "and close the loop of the supply chain.\n", + "\n", + "But most of all, it will require data,\n", + "to provide visibility and intelligence\n", + "across the network, and to be able\n", + "to make the decisions to improve\n", + "efficiency in the present, as well as\n", + "longer-term decisions based on a\n", + "broad view of sustainability impacts.\n", + "\n", + "\n", + "15 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Sustainability Solution Accelerator\n", + "\n", + "##### Read now\n", + "\n", + "\n", + "[The manufacturing industry alone](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)\n", + "[is responsible for](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[54% of the](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n", + "##### world’s energy consumption\n", + "[and](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[20% of carbon emissions](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n", + "\n", + "\n", + "##### 80% of the world’s leading companies \n", + "[are now incorporating sustainability](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n", + "[into their operations and goals](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n", + "\n", + "\n", + "##### 78% of industrial, manufacturing and metals organizations now report on sustainability — up from 68% in 2017\n", + "\n", + "\n", + "16 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Leveraging the Databricks Lakehouse for Manufacturing\n", + "\n", + "Our open, simple and collaborative Lakehouse for Manufacturing enables automotive, electronics, industrial,\n", + "and transportation & logistics organizations to unlock more value and transform how they use data and AI.\n", + "\n", + "\n", + "All your sources Any structure or frequency\n", + "\n", + "\n", + "Reliable, real-time processing Analytics capabilities for any use case or persona\n", + "\n", + "\n", + "Competitor News\n", + "& Social\n", + "\n", + "Consumer Devices\n", + "\n", + "Video & Images\n", + "\n", + "IoT & Shop Floor\n", + "\n", + "Enterprise Resource\n", + "Planning\n", + "\n", + "Sales Transaction\n", + "& Syndicated\n", + "\n", + "Inventory & 
Logistics\n", + "\n", + "\n", + "Unstructured batch\n", + "\n", + "\n", + "Ad Hoc Data Science\n", + "\n", + "Low-cost, rapid experimentation\n", + "with new data and models.\n", + "\n", + "Production Machine Learning\n", + "\n", + "High volume, fine-grained analysis at scale\n", + "served in the tightest of service windows.\n", + "\n", + "BI Reporting and Dashboarding\n", + "\n", + "Power real-time dashboarding directly,\n", + "or feed data to a data warehouse for\n", + "high-concurrency reporting.\n", + "\n", + "Real-Time Applications\n", + "\n", + "\n", + "Lakehouse enables a real-time\n", + "data-driven business with the ability\n", + "to ingest structured, semi-structured\n", + "and unstructured data from ERP,\n", + "SCM, IoT, social or other sources\n", + "in your value chain so that predictive\n", + "AI and ML insights can be realized.\n", + "This enables them to operate their\n", + "business in real time, deliver more\n", + "accurate analytics that leverage all\n", + "their data, and drive collaboration\n", + "and innovation across their value\n", + "chain. Most important for capital\n", + "intensive manufacturing business,\n", + "it enables them to move quickly\n", + "from proof-of-concept (PoC)\n", + "ideation to ROI quickly.\n", + "\n", + "\n", + "Semi-structured real-time\n", + "\n", + "Unstructured batch\n", + "\n", + "Semi-structured real-time\n", + "\n", + "Structured real-timeSUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
Showing the first 527 rows." + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "c2c667a9122ae783d9d5bfef443e7c5e", + "**EBOOK**\n\n## The Big Book of Data Engineering 2nd Edition\n\nA collection of technical\nblogs, including code\nsamples and notebooks\n\n##### With all-new content\n\n\n-----\n\n#### Contents\n\n**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n\n**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n\n**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n\n**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n\n**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n\n**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "ba5806f0679d7bbc4a72328d25697ece", + "**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n\n**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n\n**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n\n**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n\n**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n\n**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n\n**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "28de646b6bc9ea25b7bb33e1d80de127", + "**2 . 
8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n\n**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n\n**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n\n**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n\n**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n\n**4 . 1** Akamai .................................................................................................................................................................................................... 77", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "61a3db616e17315f75ec0e473cc055ba", + "**4 . 1** Akamai .................................................................................................................................................................................................... 77\n\n**4 . 2** Grammarly ........................................................................................................................................................................................... 80\n\n**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n\n**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n\n**4 . 5** Rivian .................................................................................................................................................................................................... 90", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "87ad37fc256c9d60868a865a4ea85bf4", + "**4 . 5** Rivian .................................................................................................................................................................................................... 90\n\n**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n\n\n-----\n\n**SECTION**\n\n# 01\n\n\n### Introduction to Data Engineering on Databricks\n\n\n-----\n\nOrganizations realize the value data plays as a strategic asset for various\nbusiness-related initiatives, such as growing revenues, improving the customer\nexperience, operating efficiently or improving a product or service. 
However,\naccessing and managing data for these initiatives has become increasingly\ncomplex. Most of the complexity has arisen with the explosion of data volumes\nand data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\nto increase, 73% of the data goes unused for analytics or decision-making. In\norder to try and decrease this percentage and make more data usable, data\nengineering teams are responsible for building data pipelines to efficiently and\nreliably deliver data. But the process of building these complex data pipelines\ncomes with a number of difficulties:\n\n**•** In order to get data into a data lake, data engineers are required\nto spend immense time hand-coding repetitive data ingestion tasks\n\n**•** Since data platforms continuously change, data engineers\nspend time building and maintaining, and then rebuilding, complex\nscalable infrastructure\n\n**•** As data pipelines become more complex, data engineers are\nrequired to find reliable tools to orchestrate these pipelines\n\n**•** With the increasing importance of real-time data, low latency data\npipelines are required, which are even more difficult to build and maintain\n\n**•** Finally, with all pipelines written, data engineers need to constantly\nfocus on performance, tuning pipelines and architectures to meet SLAs\n\n\n**How can Databricks help?**\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The Lakehouse Platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability to\ndrive valuable insights.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "bf114a736c5b9b473f4e1c81c2bbaa5e", + "**•** Since data platforms continuously change, data engineers\nspend time building and maintaining, and then rebuilding, complex\nscalable infrastructure\n\n**•** As data pipelines become more complex, data engineers are\nrequired to find reliable tools to orchestrate these pipelines\n\n**•** With the increasing importance of real-time data, low latency data\npipelines are required, which are even more difficult to build and maintain\n\n**•** Finally, with all pipelines written, data engineers need to constantly\nfocus on performance, tuning pipelines and architectures to meet SLAs\n\n\n**How can Databricks help?**\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. 
The Lakehouse Platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability to\ndrive valuable insights.\n\nLakehouse Platform\n\n**One platform to support multiple personas**\n\n\n**BI & Data**\n**Warehousing**\n\n\n**Data**\n**Engineering**\n\n\n**Data**\n**Streaming**\n\n\n**Data**\n**Science & ML**\n\n\n©2023 Databricks Inc. — All rights reserved\n\n\n**Unity Catalog**\n**Fine-grained governance for data and AI**\n\n**Delta Lake**\n**Data reliability and performance**\n\n**Cloud Data Lake**\n\nAll Raw Data (Logs, Texts, Audio, Video, Images)\n\n\nFigure 1\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n\n\n-----\n\n**Key differentiators for successful data engineering**\n**with Databricks**\n\nBy simplifying on a lakehouse architecture, data engineers need an\nenterprise-grade and enterprise-ready approach to building data pipelines.\nTo be successful, a data engineering solution team must embrace these eight\nkey differentiating capabilities:\n\n**Data ingestion at scale**\nWith the ability to ingest petabytes of data with auto-evolving schemas,\ndata engineers can deliver fast, reliable, scalable and automatic data for\nanalytics, data science or machine learning. This includes:\n\n**•** Incrementally and efficiently processing data as it arrives\nfrom files or streaming sources like Kafka, DBMS and NoSQL\n\n**•** Automatically inferring schema and detecting column\nchanges for structured and unstructured data formats\n\n**•** Automatically and efficiently tracking data as it arrives with\n\nno manual intervention\n\n**•** Preventing data loss by rescuing data columns\n\n\n**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "d85d526722f3ca9735bc45d98a9ad449", + "**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. 
This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----\n\n**Unified orchestration of data workflows**\nSimple, clear and reliable orchestration of data processing tasks for data,\nanalytics and machine learning pipelines with the ability to run multiple\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\nin a DAG using Databricks Workflows, an orchestration tool included in the\nlakehouse with no need to maintain or pay for an external orchestration service.\n\n**•** Easily create and manage multiple tasks with dependencies via UI,\nAPI or from your IDE\n\n**•** Have full observability to all workflow runs and get alerted when\ntasks fail for fast troubleshooting and efficient repair and rerun\n\n**•** Leverage high reliability of 99.95% uptime\n\n**•** Use performance optimization clusters that parallelize jobs and\nminimize data movement with cluster reuse\n\n**Data quality validation and monitoring**\nImprove data reliability throughout the data lakehouse so data teams can\nconfidently trust the information for downstream initiatives by:\n\n**•** Defining data quality and integrity controls within the pipeline\nwith defined data expectations\n\n**•** Addressing data quality errors with predefined policies\n(fail, drop, alert, quarantine)\n\n**•** Leveraging the data quality metrics that are captured, tracked\nand reported for the entire data pipeline\n\n\nData\nSources\n\nData\nWarehouses\n\nOn-premises\nSystems\n\nSaaS\nApplications\n\nMachine &\nApplication Logs\n\nApplication\nEvents\n\nMobile & IoT\nData\n\n\nCloud\nStorage\n\nMessag\ne Buses\n\n\n**Lakehouse Platform**\n\n**Workflows** for end-to-end orchestration\n\n\nReal-Time BI Apps\n\nReal-Time AI Apps\n\n\nReal-Time Analytics with\n**Databricks SQL**\n\nReal-Time Machine Learning\nwith\n**Databricks ML**\n\n\nStreaming ETL with\n**Delta Live Tables**\n\n\nPredictive\nMaintenance\n\n\nPersonalized\nOffers\n\n\nPatient\nDiagnostics\n\n\nReal-Time Operational\nApps\n\n\nReal-Time Applications with\n**Spark Structured Streaming**\n\n**Photon** for lightning-fast data processing\n\n**Unity Catalog** for data governance and sharing\n\n**Delta Lake** for open and reliable data storage\n\n\nAlerts Detection Fraud\n\n\nDynamic\nPricing\n\n\n©2023 Databricks Inc. 
— All rights reserved\n\nFigure 2\nA unified set of tools for real-time data processing\n\n\n-----\n\n**Fault tolerant and automatic recovery**\nHandle transient errors and recover from most common error conditions\noccurring during the operation of a pipeline with fast, scalable automatic\nrecovery that includes:\n\n**•** Fault tolerant mechanisms to consistently recover the state of data\n\n**•** The ability to automatically track progress from the source with\ncheckpointing\n\n**•** The ability to automatically recover and restore the data pipeline state", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "31bbc89514393b32579d699cbd8173e7", + "Predictive\nMaintenance\n\n\nPersonalized\nOffers\n\n\nPatient\nDiagnostics\n\n\nReal-Time Operational\nApps\n\n\nReal-Time Applications with\n**Spark Structured Streaming**\n\n**Photon** for lightning-fast data processing\n\n**Unity Catalog** for data governance and sharing\n\n**Delta Lake** for open and reliable data storage\n\n\nAlerts Detection Fraud\n\n\nDynamic\nPricing\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 2\nA unified set of tools for real-time data processing\n\n\n-----\n\n**Fault tolerant and automatic recovery**\nHandle transient errors and recover from most common error conditions\noccurring during the operation of a pipeline with fast, scalable automatic\nrecovery that includes:\n\n**•** Fault tolerant mechanisms to consistently recover the state of data\n\n**•** The ability to automatically track progress from the source with\ncheckpointing\n\n**•** The ability to automatically recover and restore the data pipeline state\n\n**Data pipeline observability**\nMonitor overall data pipeline status from a dataflow graph dashboard and\nvisually track end-to-end pipeline health for performance, quality and latency.\nData pipeline observability capabilities include:\n\n**•** A high-quality, high-fidelity lineage diagram that provides visibility\ninto how data flows for impact analysis\n\n**•** Granular logging with performance and status of the data pipeline\nat a row level\n\n**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n\n\n**Automatic deployments and operations**\nEnsure reliable and predictable delivery of data for analytics and machine\nlearning use cases by enabling easy and automatic data pipeline deployments\nand rollbacks to minimize downtime. Benefits include:\n\n**•** Complete, parameterized and automated deployment for the\ncontinuous delivery of data\n\n**•** End-to-end orchestration, testing and monitoring of data pipeline\ndeployment across all major cloud providers\n\n**Migrations**\nAccelerating and de-risking the migration journey to the lakehouse, whether\nfrom legacy on-prem systems or disparate cloud services.\n\nThe migration process starts with a detailed discovery and assessment to\nget insights on legacy platform workloads and estimate migration as well as\nDatabricks platform consumption costs. Get help with the target architecture\nand how the current technology stack maps to Databricks, followed by a\nphased implementation based on priorities and business needs. 
Throughout\nthis journey companies can leverage:\n\n**•** Automation tools from Databricks and its ISV partners\n\n**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n\n**•** Databricks Professional Services and training\n\nThis is the recommended approach for a successful migration, whereby\ncustomers have seen a 25-50% reduction in costs and 2-3x faster time to value\nfor their use cases.\n\n\n-----\n\n**Unified governance**\nWith Unity Catalog, data engineering and governance teams benefit from an\nenterprisewide data catalog with a single interface to manage permissions,\ncentralize auditing, automatically track data lineage down to the column level,\nand share data across platforms, clouds and regions. Benefits:\n\n**•** Discover all your data in one place, no matter where it lives,\nand centrally manage fine-grained access permissions using an\nANSI SQL-based interface\n\n**•** Leverage automated column-level data lineage to perform impact\nanalysis of any data changes across the pipeline and conduct\nroot cause analysis of any errors in the data pipelines\n\n**•** Centrally audit data entitlements and access\n\n**•** Share data across clouds, regions and data platforms,\nwhile maintaining a single copy of your data in your cloud storage\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 3\nThe Databricks Lakehouse Platform integrates with a large collection of technologies\n\n\n**A rich ecosystem of data solutions**\nThe Databricks Lakehouse Platform is built on open source technologies and\nuses open standards so leading data solutions can be leveraged with anything\nyou build on the lakehouse. A large collection of technology partners make it\neasy and simple to integrate the technologies you rely on when migrating to\nDatabricks and to know you are not locked into a closed data technology stack.\n\n\n-----\n\n**Conclusion**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7ef598df8d413a6a97a8acbf6316ff8c", + "**•** Leverage automated column-level data lineage to perform impact\nanalysis of any data changes across the pipeline and conduct\nroot cause analysis of any errors in the data pipelines\n\n**•** Centrally audit data entitlements and access\n\n**•** Share data across clouds, regions and data platforms,\nwhile maintaining a single copy of your data in your cloud storage\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 3\nThe Databricks Lakehouse Platform integrates with a large collection of technologies\n\n\n**A rich ecosystem of data solutions**\nThe Databricks Lakehouse Platform is built on open source technologies and\nuses open standards so leading data solutions can be leveraged with anything\nyou build on the lakehouse. A large collection of technology partners make it\neasy and simple to integrate the technologies you rely on when migrating to\nDatabricks and to know you are not locked into a closed data technology stack.\n\n\n-----\n\n**Conclusion**\n\nAs organizations strive to become data-driven, data engineering is a focal\npoint for success. To deliver reliable, trustworthy data, data engineers shouldn’t\nneed to spend time manually developing and maintaining an end-to-end\nETL lifecycle. 
Data engineering teams need an efficient, scalable way to\nsimplify ETL development, improve data reliability and manage operations.\n\nAs described, the eight key differentiating capabilities simplify the\nmanagement of the ETL lifecycle by automating and maintaining all data\ndependencies, leveraging built-in quality controls with monitoring and by\nproviding deep visibility into pipeline operations with automatic recovery.\nData engineering teams can now focus on easily and rapidly building reliable\nend-to-end production-ready data pipelines using only SQL or Python\nfor batch and streaming that deliver high-value data for analytics, data\nscience or machine learning.\n\n\n**Follow proven best practices**\n\nIn the next section, we describe best practices for data engineering\nend-to end use cases drawn from real-world examples. From data ingestion\nand real-time processing to analytics and machine learning, you’ll learn\nhow to translate raw data into actionable data.\n\nAs you explore the rest of this guide, you can find data sets and code\nsamples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\nget your hands dirty as you explore all aspects of the data lifecycle on the\nDatabricks Lakehouse Platform.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 02\n\n\n### Guidance and Best Practices\n\n**2.1** Top 5 Databricks Performance Tips\n\n**2.2** How to Profile PySpark\n\n**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n\n**2.4** Streaming in Production: Collected Best Practices\n\n**2.5** Streaming in Production: Collected Best Practices, Part 2\n\n**2.6** Building Geospatial Data Products\n\n**2.7** Data Lineage With Unity Catalog\n\n**2.8** Easy Ingestion to Lakehouse With COPY INTO\n\n**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n\n**2.10** Best Practices for Cross-Government Data Sharing\n\n\n-----\n\nSECTION 2.1\n\n**Top 5 Databricks Performance Tips**\n\nby **B R YA N S M I T H** and **R O B S A K E R**\n\nMarch 10, 2022\n\n\nAs solutions architects, we work closely with customers every day to help them\nget the best performance out of their jobs on Databricks — and we often end\nup giving the same advice. It’s not uncommon to have a conversation with a\ncustomer and get double, triple, or even more performance with just a few\ntweaks. So what’s the secret? How are we doing this? Here are the top 5 things\nwe see that can make a huge impact on the performance customers get\nfrom Databricks.\n\nHere’s a TLDR:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "03da766c8ecba0e1ef483a5639ff3aed", + "**2.8** Easy Ingestion to Lakehouse With COPY INTO\n\n**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n\n**2.10** Best Practices for Cross-Government Data Sharing\n\n\n-----\n\nSECTION 2.1\n\n**Top 5 Databricks Performance Tips**\n\nby **B R YA N S M I T H** and **R O B S A K E R**\n\nMarch 10, 2022\n\n\nAs solutions architects, we work closely with customers every day to help them\nget the best performance out of their jobs on Databricks — and we often end\nup giving the same advice. It’s not uncommon to have a conversation with a\ncustomer and get double, triple, or even more performance with just a few\ntweaks. So what’s the secret? 
How are we doing this? Here are the top 5 things\nwe see that can make a huge impact on the performance customers get\nfrom Databricks.\n\nHere’s a TLDR:\n\n**•** **Use larger clusters.** It may sound obvious, but this is the number\none problem we see. It’s actually not any more expensive to use a large\ncluster for a workload than it is to use a smaller one. It’s just faster.\nIf there’s anything you should take away from this article, it’s this.\n\nRead section 1. Really.\n\n**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\nto learn more. You won’t regret it.\n\n\n\n**•** **Clean out your configurations** . Configurations carried from one\nApache Spark™ version to the next can cause massive problems. Clean up!\nRead section 3 to learn more.\n\n**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\ncorrectly, if at all. See Section 4 to learn more.\n\n**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\nyou’re writing Spark code, jump to section 5.\n\n**•** **Bonus tip! Table design is super important** . We’ll go into this in a future\nblog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n\n**1. Give your clusters horsepower!**\n\nThis is the number one mistake customers make. Many customers create tiny\nclusters of two workers with four cores each, and it takes forever to do anything.\nThe concern is always the same: they don’t want to spend too much money on\nlarger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n\n\n-----\n\nThe key is that you’re renting the cluster for the length of the workload. So, if\nyou spin up that two worker cluster and it takes an hour, you’re paying for those\nworkers for the full hour. However, if you spin up a four worker cluster and it takes\nonly half an hour, the cost is actually the same! And that trend continues as long\nas there’s enough work for the cluster to do.\n\nHere’s a hypothetical scenario illustrating the point:\n\n**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n\n1 $1 2 $2\n\n2 $2 1 $2\n\n4 $4 0.5 $2\n\n8 $8 0.25 $2", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "63a5c596165f1ac1b23faf5431a91677", + "-----\n\nThe key is that you’re renting the cluster for the length of the workload. So, if\nyou spin up that two worker cluster and it takes an hour, you’re paying for those\nworkers for the full hour. However, if you spin up a four worker cluster and it takes\nonly half an hour, the cost is actually the same! And that trend continues as long\nas there’s enough work for the cluster to do.\n\nHere’s a hypothetical scenario illustrating the point:\n\n**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n\n1 $1 2 $2\n\n2 $2 1 $2\n\n4 $4 0.5 $2\n\n8 $8 0.25 $2\n\nNotice that the total cost of the workload stays the same while the real-world\ntime it takes for the job to run drops significantly. 
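A quick back-of-the-envelope sketch of the table above, assuming the workload parallelizes cleanly and the hypothetical $1-per-worker-per-hour pricing used in the illustration:

```python
# If doubling the workers halves the runtime, the total cost stays flat.
cost_per_worker_hour = 1.0
baseline_hours = 2.0  # runtime with a single worker (from the table above)

for workers in (1, 2, 4, 8):
    runtime_hours = baseline_hours / workers
    cost = workers * cost_per_worker_hour * runtime_hours
    print(f"{workers} workers -> {runtime_hours:4.2f} h, ${cost:.2f}")
# Every row prints $2.00: the same total cost, the job just finishes sooner.
```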
So, bump up your Databricks\ncluster specs and speed up your workloads without spending any more money. It\n\ncan’t really get any simpler than that.\n\n**2. Use Photon**\n\nOur colleagues in engineering have rewritten the Spark execution engine in C++\nand dubbed it Photon. The results are impressive!\n\n\nBeyond the obvious improvements due to running the engine in native code,\nthey’ve also made use of CPU-level performance features and better memory\n\nmanagement. On top of this, they’ve rewritten the Parquet writer in C++. So this\nmakes writing to Parquet and Delta (based on Parquet) super fast as well!\n\nBut let’s also be clear about what Photon is speeding up. It improves\ncomputation speed for any built-in functions or operations, as well as writes to\nParquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\nspending most of its time reading from an ancient on-prem database? Won’t\nhelp there either, unfortunately.\n\n\n-----\n\nThe good news is that it helps where it can. So even if part of your job can’t be\nsped up, it will speed up the other parts. Also, most jobs are written with the\nnative operations and spend a lot of time writing to Delta, and Photon helps a lot\nthere. So give it a try. You may be amazed by the results!\n\n**3. Clean out old configurations**\n\nYou know those Spark configurations you’ve been carrying along from version to\nversion and no one knows what they do anymore? They may not be harmless.\nWe’ve seen jobs go from running for hours down to minutes simply by cleaning\nout old configurations. There may have been a quirk in a particular version of\nSpark, a performance tweak that has not aged well, or something pulled off\nsome blog somewhere that never really made sense. At the very least, it’s worth\nrevisiting your Spark configurations if you’re in this situation. Often the default\nconfigurations are the best, and they’re only getting better. Your configurations\nmay be holding you back.\n\n**4. The Delta Cache is your friend**\n\nThis may seem obvious, but you’d be surprised how many people are not using\nthe [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\nthe workers’ SSDs for faster access.\n\n\nIf you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\nby default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\nyour “hot” tables when you’re starting an endpoint. This will ensure blazing fast\nspeeds for any queries on those tables.\n\nIf you’re using regular clusters, be sure to use the i3 series on Amazon Web\nServices (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\nall have fast SSDs and caching enabled by default.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1ff1e7336923f2d0af4396b7ae44350e", + "This may seem obvious, but you’d be surprised how many people are not using\nthe [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\nthe workers’ SSDs for faster access.\n\n\nIf you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\nby default. 
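As a minimal sketch of the caching advice above for a regular (non-SQL-endpoint) cluster: in a Databricks notebook, where `spark` is the preconfigured SparkSession, the disk cache can be switched on and a hot table preloaded with the CACHE SELECT statement recommended above. The table name is a placeholder, and the configuration key shown is the Databricks disk cache setting; treat the exact key as an assumption to verify against your runtime's documentation.

```python
# Enable the Databricks disk (Delta) cache if it is not already on
# for your instance family (assumed configuration key).
spark.conf.set("spark.databricks.io.cache.enabled", "true")

# Warm the cache for a frequently queried ("hot") table so subsequent
# reads are served from the workers' local SSDs. Placeholder table name.
spark.sql("CACHE SELECT * FROM sales.transactions")
```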
In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\nyour “hot” tables when you’re starting an endpoint. This will ensure blazing fast\nspeeds for any queries on those tables.\n\nIf you’re using regular clusters, be sure to use the i3 series on Amazon Web\nServices (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\nall have fast SSDs and caching enabled by default.\n\nOf course, your mileage may vary. If you’re doing BI, which involves reading the\nsame tables over and over again, caching gives an amazing boost. However, if\nyou’re simply reading a table once and writing out the results as in some ETL\njobs, you may not get much benefit. You know your jobs better than anyone.\nGo forth and conquer.\n\n\n-----\n\n**5. Be aware of lazy evaluation**\n\n\nHowever, there is a catch here. Every time you try to display or write out\nresults, it runs the execution plan again. Let’s look at the same block of code\nbut extend it and do a few more operations.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n.filter(...)\n)\n\n_# Now run the execution plan to get results_\ndf2.display()\n\n_# Unfortunately this will run the plan again, including filtering, joining,_\n_etc_\ndf2.display()\n\n_# So will this…_\ndf2.count()\n—------\n\n\nIf you’re a data analyst or data scientist only using SQL or doing BI you can skip\nthis section. However, if you’re in data engineering and writing pipelines or doing\nprocessing using Databricks/Spark, read on.\n\nWhen you’re writing Spark code like select, groupBy, filter, etc., you’re really\nbuilding an execution plan. You’ll notice the code returns almost immediately when\nyou run these functions. That’s because it’s not actually doing any computation. So\neven if you have petabytes of data, it will return in less than a second.\n\nHowever, once you go to write your results out you’ll notice it takes longer. This\nis due to lazy evaluation. It’s not until you try to display or write results that your\nexecution plan is actually run.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n\n\n_# Now run the execution plan to get results_\ndf2.display()\n—------\n\n\n-----\n\nThe developer of this code may very well be thinking that they’re just printing\nout results three times, but what they’re really doing is kicking off the same\nprocessing three times. Oops. That’s a lot of extra work. This is a very common\nmistake we run into. So why is there lazy evaluation, and what do we do about it?\n\nIn short, processing with lazy evaluation is way faster than without it.\nDatabricks/Spark looks at the full execution plan and finds opportunities\nfor optimization that can reduce processing time by orders of magnitude.\nSo that’s great, but how do we avoid the extra computation? The answer\nis pretty straightforward: save computed results you will reuse.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "0b935d4987169eb45d9abb94dceb2ad6", + "—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. 
filter (...)\n\n\n_# Now run the execution plan to get results_\ndf2.display()\n—------\n\n\n-----\n\nThe developer of this code may very well be thinking that they’re just printing\nout results three times, but what they’re really doing is kicking off the same\nprocessing three times. Oops. That’s a lot of extra work. This is a very common\nmistake we run into. So why is there lazy evaluation, and what do we do about it?\n\nIn short, processing with lazy evaluation is way faster than without it.\nDatabricks/Spark looks at the full execution plan and finds opportunities\nfor optimization that can reduce processing time by orders of magnitude.\nSo that’s great, but how do we avoid the extra computation? The answer\nis pretty straightforward: save computed results you will reuse.\n\n\nThis works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\nbenefit greatly from lazy evaluation, but it’s something a lot of customers trip\nover. So be aware of its existence and save results you reuse in order to avoid\nunnecessary computation.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nLet’s look at the same block of code again, but this time let’s avoid the\nrecomputation:\n\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n)\n\n_# save it_\ndf2.write.save(path)\n\n_# load it back in_\ndf3 = spark.read.load(path)\n\n_# now use it_\ndf3.display()\n\n_# this is not doing any extra computation anymore. No joins, filtering,_\n_etc. It’s already done and saved._\ndf3.display()\n\n_# nor is this_\ndf3.count()\n\n\n-----\n\nSECTION 2.2 \u0007\n\n**How to Profile PySpark**\n\nby **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n\nOctober 6, 2022\n\n\nIn Apache Spark™, declarative Python APIs are supported for big data workloads.\nThey are powerful enough to handle most common use cases. Furthermore,\nPySpark UDFs offer more flexibility since they enable users to run arbitrary\nPython code on top of the Apache Spark™ engine. Users only have to state\n“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\nPySpark easier to use, but it can be difficult to identify performance bottlenecks\nand apply custom optimizations.\n\nTo address the difficulty mentioned above, PySpark supports various profiling\ntools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n[implementations](https://docs.python.org/3/library/profile.html) . PySpark Profilers provide information such as the number\nof function calls, total time spent in the given function, and filename, as well\nas line number to help navigation. 
That information is essential to exposing\ntight loops in your PySpark programs, and allowing you to make performance\n\nimprovement decisions.\n\n\n**Driver profiling**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "a0691e5c37475a0f10612ea46afc205e", + "To address the difficulty mentioned above, PySpark supports various profiling\ntools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n[implementations](https://docs.python.org/3/library/profile.html) . PySpark Profilers provide information such as the number\nof function calls, total time spent in the given function, and filename, as well\nas line number to help navigation. That information is essential to exposing\ntight loops in your PySpark programs, and allowing you to make performance\n\nimprovement decisions.\n\n\n**Driver profiling**\n\nPySpark applications run as independent sets of processes on a cluster,\ncoordinated by the SparkContext object in the driver program. On the driver\nside, PySpark is a regular Python process; thus, we can profile it as a normal\nPython program using cProfile as illustrated below:\n\nimport cProfile\n\nwith cProfile.Profile() as pr:\n_# Your code_\n\npr.print_stats()\n\n**Workers profiling**\n\nExecutors are distributed on worker nodes in the cluster, which introduces\ncomplexity because we need to aggregate profiles. Furthermore, a Python worker\nprocess is spawned per executor for PySpark UDF execution, which makes the\nprofiling more intricate.\n\n\n-----\n\nThe UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\nand becomes a major tool to profile workers for PySpark applications. We’ll\nillustrate how to use the UDF profiler with a simple Pandas UDF example.\n\nFirstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n```\n sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n by 'id' )\n ).withColumn( 'v' , rand())\n\n```\nLater, we will group by the id column, which results in 8 groups with 1,000 rows\nper group.\n\nThe Pandas UDF plus_one is then created and applied as shown below:\n```\n import pandas as pd\n def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf.apply( lambda x: x + 1 , axis= 1 )\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n res.collect()\n\n```\n\nExecuting the example above and running sc.show_profiles() prints the\nfollowing profile. The profile below can also be dumped to disk by sc.dump_\nprofiles(path).\n\nThe UDF id in the profile (271, highlighted above) matches that in the Spark plan\nfor res. The Spark plan can be shown by calling res.explain() .\n\n\nNote that plus_one takes a pandas DataFrame and returns another pandas\nDataFrame. For each group, all columns are passed together as a pandas\nDataFrame to the plus_one UDF, and the returned pandas DataFrames are\ncombined into a PySpark DataFrame.\n\n\n-----\n\nThe first line in the profile’s body indicates the total number of calls that were\nmonitored. 
The column heading includes\n\n**•** ncalls , for the number of calls.\n\n**•** tottime , for the total time spent in the given function (excluding time\nspent in calls to sub-functions)\n\n**•** percall , the quotient of tottime divided by ncalls\n\n**•** cumtime , the cumulative time spent in this and all subfunctions (from\ninvocation till exit)\n\n**•** percall , the quotient of cumtime divided by primitive calls\n\n**•** filename:lineno(function) , which provides the respective information\nfor each function", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "dd07dacad46874c8f7a92f1c7ad7099d", + "-----\n\nThe first line in the profile’s body indicates the total number of calls that were\nmonitored. The column heading includes\n\n**•** ncalls , for the number of calls.\n\n**•** tottime , for the total time spent in the given function (excluding time\nspent in calls to sub-functions)\n\n**•** percall , the quotient of tottime divided by ncalls\n\n**•** cumtime , the cumulative time spent in this and all subfunctions (from\ninvocation till exit)\n\n**•** percall , the quotient of cumtime divided by primitive calls\n\n**•** filename:lineno(function) , which provides the respective information\nfor each function\n\nDigging into the column details: plus_one is triggered once per group, 8 times\nin total; _arith_method of pandas Series is called once per row, 8,000 times\nin total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\nrow, thus suffering from high invocation overhead.\n\nWe can reduce such overhead by substituting the pandas.DataFrame.apply\nwith pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\nfollows:\n```\n import pandas as pd\n def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf + 1\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n schema)\n res.collect()\n\n```\n\nThe updated profile is as shown below.\n\nWe can summarize the optimizations as follows:\n\n**•** Arithmetic operation from 8,000 calls to 8 calls\n\n**•** Total function calls from 2,898,160 calls to 2,384 calls\n\n**•** Total execution time from 2.300 seconds to 0.004 seconds\n\nThe short example above demonstrates how the UDF profiler helps us deeply\nunderstand the execution, identify the performance bottleneck and enhance\nthe overall performance of the user-defined function.\n\nThe UDF profiler was implemented based on the executor-side profiler,\nwhich is designed for PySpark RDD API. The executor-side profiler is available\nin all active Databricks Runtime versions.\n\n\n-----\n\nBoth the UDF profiler and the executor-side profiler run on Python workers.\nThey are controlled by the spark.python.profile Spark configuration, which\nis false by default. We can enable that Spark configuration on a Databricks\nRuntime cluster as shown below.\n\n\n**Conclusion**\n\nPySpark profilers are implemented based on cProfile; thus, the profile reporting\nrelies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\ncollecting profile reports from Python workers.\n\nPowerful profilers are provided by PySpark in order to identify hot loops and\nsuggest potential improvements. 
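To enable them on a cluster (the spark.python.profile setting mentioned above) and then inspect the collected stats, a minimal sketch, assuming a Databricks notebook where `sc` is the preconfigured SparkContext and the dump path is a placeholder:

```python
# Cluster-level Spark config (must be in place before the SparkContext starts),
# e.g. added to the cluster's "Spark config" field:
#   spark.python.profile true

# After running a query that executes Python UDFs, read the accumulated
# worker profiles from the driver:
sc.show_profiles()                      # print cProfile-style stats to stdout
sc.dump_profiles("/dbfs/tmp/profiles")  # or persist them for later inspection (placeholder path)
```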
They are easy to use and critical to enhance\nthe performance of PySpark programs. The UDF profiler, which is available\nstarting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\nchallenges and brings insights to user-defined functions.\n\nIn addition, there is an ongoing effort in the Apache Spark™ open source\ncommunity to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\nmore information.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.3 \u0007\n\n**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n**and Apache Kafka**\n\nby **F R A N K M U N Z**\n\nAugust 9, 2022", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1bb64e9939be901e7a31554b2a84b4b3", + "Powerful profilers are provided by PySpark in order to identify hot loops and\nsuggest potential improvements. They are easy to use and critical to enhance\nthe performance of PySpark programs. The UDF profiler, which is available\nstarting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\nchallenges and brings insights to user-defined functions.\n\nIn addition, there is an ongoing effort in the Apache Spark™ open source\ncommunity to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\nmore information.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.3 \u0007\n\n**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n**and Apache Kafka**\n\nby **F R A N K M U N Z**\n\nAugust 9, 2022\n\n\n[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\napproach for creating reliable data pipelines and fully manages the underlying\ninfrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\nactionable insights derived from near real-time data. Delta Live Tables enables\nlow-latency streaming data pipelines to support such use cases with low\nlatencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n\nThis article will walk through using DLT with Apache Kafka while providing the\nrequired Python code to ingest streams. The recommended system architecture\nwill be explained, and related DLT settings worth considering will be explored\nalong the way.\n\n**Streaming platforms**\n\nEvent buses or message buses decouple message producers from consumers.\nA popular streaming use case is the collection of click-through data from\nusers navigating a website where every user interaction is stored as an event in\n\n\nApache Kafka. The event stream from Kafka is then used for real-time streaming\ndata analytics. Multiple message consumers can read the same data from Kafka\nand use the data to learn about audience interests, conversion rates, and bounce\nreasons. 
The real-time, streaming event data from the user interactions often\nalso needs to be correlated with actual purchases stored in a billing database.\n\n**Apache Kafka**\n\n[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\ntopic, an append-only distributed log of events where messages are buffered for\na certain amount of time. Although messages in Kafka are not deleted once they\nare consumed, they are also not stored indefinitely. The message retention for\n\nKafka can be configured per topic and defaults to 7 days. Expired messages will\nbe deleted eventually.\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to many other event busses or messaging systems.\n\n\n-----\n\n**Streaming data pipelines**\n\n\nIn a data flow pipeline, Delta Live Tables and their dependencies can be declared\nwith a standard SQL Create Table As Select (CTAS) statement and the DLT\nkeyword “live.”", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "bc24283f816ae54daa94d5fcb0bd9f6e", + "**Apache Kafka**\n\n[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\ntopic, an append-only distributed log of events where messages are buffered for\na certain amount of time. Although messages in Kafka are not deleted once they\nare consumed, they are also not stored indefinitely. The message retention for\n\nKafka can be configured per topic and defaults to 7 days. Expired messages will\nbe deleted eventually.\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to many other event busses or messaging systems.\n\n\n-----\n\n**Streaming data pipelines**\n\n\nIn a data flow pipeline, Delta Live Tables and their dependencies can be declared\nwith a standard SQL Create Table As Select (CTAS) statement and the DLT\nkeyword “live.”\n\nWhen developing DLT with Python, the @dlt.table decorator is used to create a\nDelta Live Table. To ensure the data quality in a pipeline, DLT uses [Expectations](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html)\nwhich are simple SQL constraints clauses that define the pipeline’s behavior with\ninvalid records.\n\nSince streaming workloads often come with unpredictable data volumes,\nDatabricks employs [enhanced autoscaling](https://databricks.com/blog/2022/06/29/delta-live-tables-announces-new-capabilities-and-performance-optimizations.html) for data flow pipelines to minimize the\noverall end-to-end latency while reducing cost by shutting down unnecessary\ninfrastructure.\n\n**Delta Live Tables** are fully recomputed, in the right order, exactly once for each\npipeline run.\n\nIn contrast, **streaming Delta Live Tables** are stateful, incrementally computed\nand only process data that has been added since the last pipeline run. If the\nquery which defines a streaming live tables changes, new data will be processed\nbased on the new query but existing data is not recomputed. Streaming live\ntables always use a streaming source and only work over append-only streams,\nsuch as Kafka, Kinesis, or Auto Loader. 
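To make the distinction concrete, a small Python sketch for a DLT pipeline notebook; the table names are hypothetical, and it assumes `dlt.read` / `dlt.read_stream` as the read APIs, matching the usage shown later in this article:

```python
import dlt

# Streaming live table: processes only records appended since the last update.
@dlt.table(comment="Incrementally updated from an append-only source")
def events_stream():
    return dlt.read_stream("events_bronze")  # hypothetical upstream streaming table

# Live (materialized) table: fully recomputed, in dependency order, each run.
@dlt.table(comment="Fully recomputed on every pipeline update")
def events_by_model():
    return dlt.read("events_stream").groupBy("model").count()
```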
Streaming DLTs are based on top of Spark\nStructured Streaming.\n\n\nYou can chain multiple streaming pipelines, for example, workloads with very\nlarge data volume and low latency requirements.\n\n**Direct ingestion from streaming engines**\n\nDelta Live Tables written in Python can directly ingest data from an event bus like\nKafka using Spark Structured Streaming. You can set a short retention period for\nthe Kafka topic to avoid compliance issues, reduce costs and then benefit from\nthe cheap, elastic and governable storage that Delta provides.\n\nAs a first step in the pipeline, we recommend ingesting the data as is to a Bronze\n(raw) table and avoid complex transformations that could drop important data.\nLike any Delta table the Bronze table will retain the history and allow it to perform\nGDPR and other compliance tasks.\n\nIngest streaming data from Apache Kafka\n\n\n-----\n\nWhen writing DLT pipelines in Python, you use the @dlt.table annotation\nto create a DLT table. There is no special attribute to mark streaming DLTs in\nPython; simply use spark.readStream() to access the stream. Example code\nfor creating a DLT table with the name kafka_bronze that is consuming data\nfrom a Kafka topic looks as follows:\n\nimport dlt\nfrom pyspark.sql.functions import - \nfrom pyspark.sql.types import - \n\nTOPIC = \"tracker-events\"\nKAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n_# subscribe to TOPIC at KAFKA_BROKER_\nraw_kafka_events = (spark.readStream\n. format ( \"kafka\" )\n.option( \"subscribe\" , TOPIC)\n.option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n.option( \"startingOffsets\" , \"earliest\" )\n.load()\n)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9eccab88cb97885330ddb5b2d5e96a79", + "Ingest streaming data from Apache Kafka\n\n\n-----\n\nWhen writing DLT pipelines in Python, you use the @dlt.table annotation\nto create a DLT table. There is no special attribute to mark streaming DLTs in\nPython; simply use spark.readStream() to access the stream. Example code\nfor creating a DLT table with the name kafka_bronze that is consuming data\nfrom a Kafka topic looks as follows:\n\nimport dlt\nfrom pyspark.sql.functions import - \nfrom pyspark.sql.types import - \n\nTOPIC = \"tracker-events\"\nKAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n_# subscribe to TOPIC at KAFKA_BROKER_\nraw_kafka_events = (spark.readStream\n. format ( \"kafka\" )\n.option( \"subscribe\" , TOPIC)\n.option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n.option( \"startingOffsets\" , \"earliest\" )\n.load()\n)\n\n**@dlt.table(table_properties={** **\"pipelines.reset.allowed\"** **:** **\"false\"** **})**\n```\n def kafka_bronze ():\n\n```\nreturn raw_kafka_events\n\npipelines.reset.allowed\n\nNote that event buses typically expire messages after a certain period of time,\nwhereas Delta is designed for infinite retention.\n\nThis might lead to the effect that source data on Kafka has already been deleted\nwhen running a full refresh for a DLT pipeline. In this case, not all historic data\ncould be backfilled from the messaging platform, and data would be missing in\nDLT tables. 
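A cleaned-up sketch of the bronze ingestion pattern shown above, for a DLT Python notebook where `spark` is available; it also sets the table property discussed next:

```python
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

TOPIC = "tracker-events"
KAFKA_BROKER = spark.conf.get("KAFKA_SERVER")

# Subscribe to TOPIC at KAFKA_BROKER and read it as an unbounded stream.
raw_kafka_events = (
    spark.readStream
    .format("kafka")
    .option("subscribe", TOPIC)
    .option("kafka.bootstrap.servers", KAFKA_BROKER)
    .option("startingOffsets", "earliest")
    .load()
)

# Bronze table: land the Kafka payload as-is. The table property prevents a
# full pipeline refresh from dropping history that has already expired on the broker.
@dlt.table(table_properties={"pipelines.reset.allowed": "false"})
def kafka_bronze():
    return raw_kafka_events
```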
To prevent dropping data, use the following DLT table property:\n\n\npipelines.reset.allowed=false\n\nSetting pipelines.reset.allowed to false prevents refreshes to the table but\ndoes not prevent incremental writes to the tables or new data from flowing into\nthe table.\n\n**Checkpointing**\n\nIf you are an experienced Spark Structured Streaming developer, you will notice\nthe absence of checkpointing in the above code. In Spark Structured Streaming\ncheckpointing is required to persist progress information about what data has\nbeen successfully processed and upon failure, this metadata is used to restart a\nfailed query exactly where it left off.\n\nWhereas checkpoints are necessary for failure recovery with exactly-once\nguarantees in Spark Structured Streaming, DLT handles state automatically\nwithout any manual configuration or explicit checkpointing required.\n\n**Mixing SQL and Python for a DLT pipeline**\n\nA DLT pipeline can consist of multiple notebooks but one DLT notebook is\nrequired to be written entirely in either SQL or Python (unlike other Databricks\nnotebooks where you can have cells of different languages in a single notebook).\n\nNow, if your preference is SQL, you can code the data ingestion from Apache\nKafka in one notebook in Python and then implement the transformation logic of\nyour data pipelines in another notebook in SQL.\n\n\n-----\n\n**Schema mapping**\n\nWhen reading data from messaging platform, the data stream is opaque and a\nschema has to be provided.\n\nThe Python example below shows the schema definition of events from a fitness\ntracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n\nevent_schema = StructType([ \\\nStructField( \"time\" , TimestampType(), True ) , \\\nStructField( \"version\" , StringType(), True ), \\\nStructField( \"model\" , StringType(), True ) , \\\nStructField( \"heart_bpm\" , IntegerType(), True ), \\\nStructField( \"kcal\" , IntegerType(), True ) \\\n])", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "420316efc848ce013053d1e9f161a508", + "-----\n\n**Schema mapping**\n\nWhen reading data from messaging platform, the data stream is opaque and a\nschema has to be provided.\n\nThe Python example below shows the schema definition of events from a fitness\ntracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n\nevent_schema = StructType([ \\\nStructField( \"time\" , TimestampType(), True ) , \\\nStructField( \"version\" , StringType(), True ), \\\nStructField( \"model\" , StringType(), True ) , \\\nStructField( \"heart_bpm\" , IntegerType(), True ), \\\nStructField( \"kcal\" , IntegerType(), True ) \\\n])\n\n_# temporary table, visible in pipeline but not in data browser,_\n_# cannot be queried interactively_\n**@dlt.table(comment=** **\"real schema for Kakfa payload\"** **,**\n**temporary=** **True** **)**\n```\n def kafka_silver ():\n\n```\nreturn (\n_# kafka streams are (timestamp,value)_\n_# value contains the kafka payload_\n\ndlt.read_stream( \"kafka_bronze\" )\n.select(col( \"timestamp\" ),from_json(col( \"value\" )\n.cast( \"string\" ), event_schema).alias( \"event\" ))\n.select( \"timestamp\" , \"event.*\" )\n\n\n**Benefits**\n\nReading streaming data in DLT directly from a message broker 
minimizes the\narchitectural complexity and provides lower end-to-end latency since data is\ndirectly streamed from the messaging broker and no intermediary step is involved.\n\n**Streaming ingest with cloud object store intermediary**\n\nFor some specific use cases, you may want to offload data from Apache Kafka,\ne.g., using a Kafka connector, and store your streaming data in a cloud object\nintermediary. In a Databricks workspace, the cloud vendor-specific objectstore can then be mapped via the Databricks Files System (DBFS) as a cloudindependent folder. Once the data is offloaded, [Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) can\ningest the files.\n\nAuto Loader can ingest data with a single line of SQL code. The syntax to ingest\nJSON files into a DLT table is shown below (it is wrapped across two lines for\nreadability).\n\n_-- INGEST with Auto Loader_\ncreate or replace streaming live table raw\nas select `*` FROM cloud_files(\"dbfs:/data/twitter\", \"json\")\n\n\n-----\n\nNote that Auto Loader itself is a streaming data source and all newly arrived files\nwill be processed exactly once, hence the streaming keyword for the raw table\nthat indicates data is ingested incrementally to that table.\n\nSince offloading streaming data to a cloud object store introduces an additional\nstep in your system architecture it will also increase the end-to-end latency\nand create additional storage costs. Keep in mind that the Kafka connector\nwriting event data to the cloud object store needs to be managed, increasing\noperational complexity.\n\nTherefore Databricks recommends as a best practice to directly access event\nbus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n\n**Other event buses or messaging systems**\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to other event buses or messaging systems. DLT supports any data\nsource that Databricks Runtime directly supports.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e6e44af0aaa9b015f23cd88d6bc493f9", + "Since offloading streaming data to a cloud object store introduces an additional\nstep in your system architecture it will also increase the end-to-end latency\nand create additional storage costs. Keep in mind that the Kafka connector\nwriting event data to the cloud object store needs to be managed, increasing\noperational complexity.\n\nTherefore Databricks recommends as a best practice to directly access event\nbus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n\n**Other event buses or messaging systems**\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to other event buses or messaging systems. DLT supports any data\nsource that Databricks Runtime directly supports.\n\n**Amazon Kinesis**\nIn Kinesis, you write messages to a fully managed serverless stream. Same as\nKafka, Kinesis does not permanently store messages. 
The default message\nretention in Kinesis is one day.\n\nWhen using Amazon Kinesis, replace format(“kafka”) with format(“kinesis”) in the\nPython code for streaming ingestion above and add Amazon Kinesis-specific\nsettings with option(). For more information, check the section about Kinesis\nIntegration in the Spark Structured Streaming documentation.\n\n\n**Azure Event Hubs**\n\nFor Azure Event Hubs settings, check the official [documentation at Microsoft](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-kafka-spark-tutorial) and\nthe article [Delta Live Tables recipes: Consuming from Azure Event Hubs](https://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html) .\n\n**Summary**\n\nDLT is much more than just the “T” in ETL. With DLT, you can easily ingest from\nstreaming and batch sources, cleanse and transform data on the Databricks\nLakehouse Platform on any cloud with guaranteed data quality.\n\nData from Apache Kafka can be ingested by directly connecting to a Kafka broker\nfrom a DLT notebook in Python. Data loss can be prevented for a full pipeline\nrefresh even when the source data in the Kafka streaming layer expired.\n\n**Get started**\n\nIf you are a Databricks customer, simply follow the [guide to get started](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) . Read the\nrelease notes to learn more about what’s included in this GA release. If you are\nnot an existing Databricks customer, [sign up for a free trial](https://www.databricks.com/try-databricks) , and you can view our\ndetailed [DLT pricing here](https://www.databricks.com/product/pricing) .\n\nJoin the conversation in the [Databricks Community](https://community.databricks.com/s/topic/0TO8Y000000VJEhWAO/summit22) where data-obsessed peers\nare chatting about Data + AI Summit 2022 announcements and updates. Learn.\nNetwork.\n\nLast but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\nsummit. In that session, I walk you through the code of another streaming data\nexample with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\nHugging Face sentiment analysis.\n\n\n-----\n\nSECTION 2.4 \u0007\n\n**Streaming in Production: Collected Best Practices**\n\nby **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nDecember 12, 2022\n\n\nReleasing any data pipeline or application into a production state requires\nplanning, testing, monitoring, and maintenance. Streaming pipelines are no\ndifferent in this regard; in this blog we present some of the most important\nconsiderations for deploying streaming pipelines and applications to a\nproduction environment.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "80cc83dcbd5eb42991856a090e952ce8", + "Last but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\nsummit. In that session, I walk you through the code of another streaming data\nexample with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\nHugging Face sentiment analysis.\n\n\n-----\n\nSECTION 2.4 \u0007\n\n**Streaming in Production: Collected Best Practices**\n\nby **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nDecember 12, 2022\n\n\nReleasing any data pipeline or application into a production state requires\nplanning, testing, monitoring, and maintenance. 
Streaming pipelines are no\ndifferent in this regard; in this blog we present some of the most important\nconsiderations for deploying streaming pipelines and applications to a\nproduction environment.\n\nAt Databricks, we offer two different ways of building and running streaming\npipelines and applications — [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) and [Databricks Workflows](https://www.databricks.com/product/workflows) .\nDLT is our flagship, fully managed ETL product that supports both batch and\nstreaming pipelines. It offers declarative development, automated operations,\ndata quality, advanced observability capabilities, and more. Workflows enable\ncustomers to run Apache Spark™ workloads in Databricks’ optimized runtime\nenvironment (i.e., Photon) with access to unified governance (Unity Catalog) and\nstorage (Delta Lake). Regarding streaming workloads, both DLT and Workflows\n\nshare the same core streaming engine — Spark Structured Streaming. In the\ncase of DLT, customers program against the DLT API and DLT uses the Structured\nStreaming engine under the hood. In the case of Jobs, customers program\nagainst the Spark API directly.\n\n\nThe recommendations in this blog post are written from the Structured\nStreaming engine perspective, most of which apply to both DLT and Workflows\n(although DLT does take care of some of these automatically, like Triggers and\nCheckpoints). We group the recommendations under the headings “Before\nDeployment” and “After Deployment” to highlight when these concepts will\nneed to be applied and are releasing this blog series with this split between\nthe two. There will be additional deep-dive content for some of the sections\nbeyond as well. We recommend reading all sections before beginning work\nto productionalize a streaming pipeline or application, and revisiting these\nrecommendations as you promote it from dev to QA and eventually production.\n\n**Before deployment**\n\nThere are many things you need to consider when creating your streaming\napplication to improve the production experience. Some of these topics, like\nunit testing, checkpoints, triggers, and state management, will determine how\nyour streaming application performs. Others, like naming conventions and how\nmany streams to run on which clusters, have more to do with managing multiple\nstreaming applications in the same environment.\n\n\n-----\n\n**Unit testing**\n\n\nThe cost associated with finding and fixing a bug goes up exponentially\nthe farther along you get in the SDLC process, and a Structured Streaming\napplication is no different. When you’re turning that prototype into a hardened\nproduction pipeline you need a CI/CD process with built-in tests. So how do you\ncreate those tests?\n\nAt first you might think that unit testing a streaming pipeline requires something\nspecial, but that isn’t the case. The general guidance for streaming pipelines is\nno different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . 
It starts by\norganizing your code so that it can be unit tested effectively:\n\n**•** Divide your code into testable chunks\n\n**•** Organize your business logic into functions calling other functions.\nIf you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\nmultiple functions that can be individually tested.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9c08dab20cbc4981e1fa79ec23a29b7c", + "At first you might think that unit testing a streaming pipeline requires something\nspecial, but that isn’t the case. The general guidance for streaming pipelines is\nno different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . It starts by\norganizing your code so that it can be unit tested effectively:\n\n**•** Divide your code into testable chunks\n\n**•** Organize your business logic into functions calling other functions.\nIf you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\nmultiple functions that can be individually tested.\n\n**•** Do not code in dependencies on the global state or external systems\n\n**•** Any function manipulating a DataFrame or data set should be organized\nto take the DataFrame/data set/configuration as input and output the\nDataFrame/data set\n\nOnce your code is separated out in a logical manner you can implement unit\ntests for each of your functions. Spark-agnostic functions can be tested like any\nother function in that language. For testing UDFs and functions with DataFrames\nand data sets, there are multiple Spark testing frameworks available. These\n\n\nframeworks support all of the DataFrame/data set APIs so that you can easily\ncreate input, and they have specialized assertions that allow you to compare\nDataFrame content and schemas. Some examples are:\n\n**•** The built-in Spark test suite, designed to test all parts of Spark\n\n**•** spark-testing-base, which has support for both Scala and Python\n\n**•** spark-fast-tests, for testing Scala Spark 2 & 3\n\n**•** chispa, a Python version of spark-fast-tests\n\nCode examples for each of these libraries can be found [here](https://github.com/alexott/spark-playground/tree/master/testing) .\n\nBut wait! I’m testing a streaming application here — don’t I need to make\nstreaming DataFrames for my unit tests? The answer is no; you do not! Even\nthough a streaming DataFrame represents a data set with no defined ending,\nwhen functions are executed on it they are executed on a microbatch — a\ndiscrete set of data. You can use the same unit tests that you would use for a\nbatch application, for both stateless and stateful streams. One of the advantages\nof Structured Streaming over other frameworks is the ability to use the same\ntransformation code for both streaming and with other batch operations for\nthe same sink. 
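For example, a transformation written as a pure function over a DataFrame can be exercised in an ordinary pytest-style test against a small static DataFrame, and the very same function can then be applied inside the streaming query; the function and column names below are illustrative:

```python
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql.functions import col

def add_total(df: DataFrame) -> DataFrame:
    """Business logic kept free of any streaming- or sink-specific code."""
    return df.withColumn("total", col("quantity") * col("unit_price"))

def test_add_total():
    spark = SparkSession.builder.master("local[1]").appName("unit-test").getOrCreate()
    input_df = spark.createDataFrame([(2, 5.0), (3, 1.5)], ["quantity", "unit_price"])

    # The function is evaluated on a plain batch DataFrame; in production the
    # same function is applied to each streaming microbatch.
    result = {row["total"] for row in add_total(input_df).collect()}

    assert result == {10.0, 4.5}
```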
This allows you to simplify some operations, like backfilling\ndata, for example, where rather than trying to sync the logic between two\ndifferent applications, you can just modify the input sources and write to the\nsame destination. If the sink is a Delta table, you can even do these operations\nconcurrently if both processes are append-only operations.\n\n\n-----\n\n**Triggers**\n\n\nprocess a microbatch in order to maximize resource utilization, but setting the\ninterval longer would make sense if your stream is running on a shared cluster\nand you don’t want it to constantly take the cluster resources.\n\nIf you do not need your stream to run continuously, either because data doesn’t\ncome that often or your SLA is 10 minutes or greater, then you can use the\nTrigger.Once option. This option will start up the stream, check for anything new\nsince the last time it ran, process it all in one big batch, and then shut down.\nJust like with a continuously running stream when using Trigger.Once, the\ncheckpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "64979eeceeec57cd56c9ae0a8c21ca10", + "-----\n\n**Triggers**\n\n\nprocess a microbatch in order to maximize resource utilization, but setting the\ninterval longer would make sense if your stream is running on a shared cluster\nand you don’t want it to constantly take the cluster resources.\n\nIf you do not need your stream to run continuously, either because data doesn’t\ncome that often or your SLA is 10 minutes or greater, then you can use the\nTrigger.Once option. This option will start up the stream, check for anything new\nsince the last time it ran, process it all in one big batch, and then shut down.\nJust like with a continuously running stream when using Trigger.Once, the\ncheckpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.\n\nSpark has a new version of Trigger.Once called Trigger.AvailableNow. While\nTrigger.Once will process everything in one big batch, which depending on your\ndata size may not be ideal, Trigger.AvailableNow will split up the data based on\nmaxFilesPerTrigger and maxBytesPerTrigger settings. This allows the data to be\nprocessed in multiple batches. Those settings are ignored with Trigger.Once.\nYou can see examples for setting triggers [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) .\n\n**Pop quiz —** how do you turn your streaming process into a batch process\nthat automatically keeps track of where it left off with just one line of code?\n\n**Answer —** change your processing time trigger to Trigger.Once/Trigger.\nAvailableNow! Exact same code, running on a schedule, that will neither miss nor\nreprocess any records.\n\n\nNow that you know your code works, you need to determine how often your\nstream will look for new data. This is where [triggers](https://docs.databricks.com/structured-streaming/triggers.html) come in. Setting a trigger is\none of the options for the writeStream command, and it looks like this:\n\n_// Scala/Java_\n.trigger(Trigger.ProcessingTime( \"30 seconds\" ))\n\n_# Python_\n.trigger(processingTime= '30 seconds' )\n\nIn the above example, if a microbatch completes in less than 30 seconds,\nthen the engine will wait for the rest of the time before kicking off the next\nmicrobatch. 
If a microbatch takes longer than 30 seconds to complete, then the\nengine will start the next microbatch immediately after the previous one finishes.\n\nThe two factors you should consider when setting your trigger interval are how\nlong you expect your stream to process a microbatch and how often you want\nthe system to check for new data. You can lower the overall processing latency\nby using a shorter trigger interval and increasing the resources available for\nthe streaming query by adding more workers or using compute or memory\noptimized instances tailored to your application’s performance. These increased\nresources come with increased costs, so if your goal is to minimize costs, then a\nlonger trigger interval with less compute can work. Normally you would not set a\ntrigger interval longer than what it would typically take for your stream to\n\n\n-----\n\n**Name your stream**\n\n\nYou name your children, you name your pets, now it’s time to name your streams.\nThere’s a writeStream option called .queryName that allows you to provide a\nfriendly name for your stream. Why bother? Well, suppose you don’t name it. In\nthat case, all you’ll have to go on in the Structured Streaming tab in the Spark UI\nis the string and the unintelligible guid that is automatically generated\nas the stream’s unique identifier. If you have more than one stream running on a\ncluster, and all of them have and unintelligible strings as identifiers,\nhow do you find the one you want? If you’re exporting metrics how do you tell\nwhich is which?\n\nMake it easy on yourself, and name your streams. When you’re managing them in\nproduction you’ll be glad you did, and while you’re at it, go and name your batch\nqueries in any foreachBatch() code you have.\n\n**Fault tolerance**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "ea7bdc029f34b7ff3e5937db55b11a2d", + "-----\n\n**Name your stream**\n\n\nYou name your children, you name your pets, now it’s time to name your streams.\nThere’s a writeStream option called .queryName that allows you to provide a\nfriendly name for your stream. Why bother? Well, suppose you don’t name it. In\nthat case, all you’ll have to go on in the Structured Streaming tab in the Spark UI\nis the string and the unintelligible guid that is automatically generated\nas the stream’s unique identifier. If you have more than one stream running on a\ncluster, and all of them have and unintelligible strings as identifiers,\nhow do you find the one you want? If you’re exporting metrics how do you tell\nwhich is which?\n\nMake it easy on yourself, and name your streams. When you’re managing them in\nproduction you’ll be glad you did, and while you’re at it, go and name your batch\nqueries in any foreachBatch() code you have.\n\n**Fault tolerance**\n\nHow does your stream recover from being shut down? There are a few different\ncases where this can come into play, like cluster node failures or intentional\nhalts, but the solution is to set up checkpointing. Checkpoints with write-ahead\nlogs provide a degree of protection from your streaming application being\ninterrupted, ensuring it will be able to pick up again where it last left off.\n\nCheckpoints store the current offsets and state values (e.g., aggregate values) for\nyour stream. Checkpoints are stream specific so each should be set to its own\nlocation. 
Doing this will let you recover more gracefully from shutdowns, failures\nfrom your application code or unexpected cloud provider failures or limitations.\n\n\nTo configure checkpoints, add the checkpointLocation option to your stream\ndefinition:\n\n_// Scala/Java/Python_\nstreamingDataFrame.writeStream\n.format( \"delta\" )\n.option( \"path\" , \"\" )\n.queryName( \"TestStream\" )\n.option( \"checkpointLocation\" , \"\" )\n.start()\n\nTo keep it simple — every time you call .writeStream, you must specify the\ncheckpoint option with a unique checkpoint location. Even if you’re using\nforeachBatch and the writeStream itself doesn’t specify a path or table option,\nyou must still specify that checkpoint. It’s how Spark Structured Streaming gives\nyou hassle-free fault tolerance.\n\nEfforts to manage the checkpointing in your stream should be of little concern\nin general. As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454) , “The simplest way to perform streaming\nanalytics is not having to reason about streaming at all.” That said, one setting\n\ndeserves mention as questions around the maintenance of checkpoint files\ncome up occasionally. Though it is an internal setting that doesn’t require direct\nconfiguration, the setting spark.sql.streaming.minBatchesToRetain (default 100)\ncontrols the number of checkpoint files that get created. Basically, the number\nof files will be roughly this number times two, as there is a file created noting the\noffsets at the beginning of the batch (offsets, a.k.a write ahead logs) and another\non completing the batch (commits). The number of files is checked periodically\nfor cleanup as part of the internal processes. This simplifies at least one aspect\nof long-term streaming application maintenance for you.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "701356b6bf73f801bad3ac82f9c7f2ad", + "Efforts to manage the checkpointing in your stream should be of little concern\nin general. As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454) , “The simplest way to perform streaming\nanalytics is not having to reason about streaming at all.” That said, one setting\n\ndeserves mention as questions around the maintenance of checkpoint files\ncome up occasionally. Though it is an internal setting that doesn’t require direct\nconfiguration, the setting spark.sql.streaming.minBatchesToRetain (default 100)\ncontrols the number of checkpoint files that get created. Basically, the number\nof files will be roughly this number times two, as there is a file created noting the\noffsets at the beginning of the batch (offsets, a.k.a write ahead logs) and another\non completing the batch (commits). The number of files is checked periodically\nfor cleanup as part of the internal processes. This simplifies at least one aspect\nof long-term streaming application maintenance for you.\n\n\n-----\n\nIt is also important to note that some changes to your application code can\ninvalidate the checkpoint. Checking for any of these changes during code\nreviews before deployment is recommended. 
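Pulling the naming, trigger, and checkpointing recommendations above together, a production-oriented writeStream might look like the sketch below; `streaming_df`, the paths, and the query name are placeholders, and the trigger choice depends on your latency SLA:

```python
(streaming_df.writeStream
    .format("delta")
    .queryName("orders_bronze_ingest")          # friendly name shown in the Spark UI
    .trigger(availableNow=True)                 # or processingTime="30 seconds" for a continuous stream
    .option("checkpointLocation", "/Volumes/main/checkpoints/orders_bronze")  # unique per stream (placeholder)
    .option("path", "/Volumes/main/tables/orders_bronze")                     # placeholder output path
    .start())
```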
You can find examples of changes\nwhere this can happen in [Recovery Semantics after Changes in a Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query)\n[Query](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query) . Suppose you want to look at checkpointing in more detail or consider\nwhether asynchronous checkpointing might improve the latency in your\nstreaming application. In that case, these are covered in greater depth in\n[Speed Up Streaming Queries With Asynchronous State Checkpointing](https://www.databricks.com/blog/2022/05/02/speed-up-streaming-queries-with-asynchronous-state-checkpointing.html) .\n\n**State management and RocksDB**\n\nStateful streaming applications are those where current records may depend\non previous events, so Spark has to retain data in between microbatches.\nThe data it retains is called state, and Spark will store it in a state store and\nread, update and delete it during each microbatch. Typical stateful operations\nare streaming aggregations, streaming dropDuplicates, stream-stream joins,\nmapGroupsWithState, or flatMapGroupsWithState. Some common types of\nexamples where you’ll need to think about your application state could be\nsessionization or hourly aggregation using group by methods to calculate\n\nbusiness metrics. Each record in the state store is identified by a key that is used\nas part of the stateful computation, and the more unique keys that are required\nthe larger the amount of state data that will be stored.\n\nWhen the amount of state data needed to enable these stateful operations\ngrows large and complex, it can degrade your workloads’ performance, leading\nto increased latency or even failures. A typical indicator of the state store being\n\n\nthe culprit of added latency is large amounts of time spent in garbage collection\n(GC) pauses in the JVM. If you are monitoring the microbatch processing time,\nthis could look like a continual increase or wildly varying processing time across\nmicrobatches.\n\nThe default configuration for a state store, which is sufficient for most general\nstreaming workloads, is to store the state data in the executors’ JVM memory.\nLarge number of keys (typically millions, see the Monitoring & Instrumentation\nsection in part 2 of this blog) can add excessive memory pressure on the\nmachine memory and increase the frequency of hitting these GC pauses as it\ntries to free up resources.\n\nOn the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\nuse [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\nmemory pressure. RocksDB is an embeddable persistent key-value store for fast\nstorage. 
It features high performance through a log-structured database engine\nwritten entirely in C++ and optimized for fast, low-latency storage.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e29cc632dc23cfaf8a58cd45fbe25eb9", + "The default configuration for a state store, which is sufficient for most general\nstreaming workloads, is to store the state data in the executors’ JVM memory.\nLarge number of keys (typically millions, see the Monitoring & Instrumentation\nsection in part 2 of this blog) can add excessive memory pressure on the\nmachine memory and increase the frequency of hitting these GC pauses as it\ntries to free up resources.\n\nOn the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\nuse [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\nmemory pressure. RocksDB is an embeddable persistent key-value store for fast\nstorage. It features high performance through a log-structured database engine\nwritten entirely in C++ and optimized for fast, low-latency storage.\n\nLeveraging RocksDB as the state store provider still uses machine memory\nbut no longer occupies space in the JVM and makes for a more efficient\nstate management system for large amounts of keys. This doesn’t come for\nfree, however, as it introduces an extra step in processing every microbatch.\nIntroducing RocksDB shouldn’t be expected to reduce latency except when it is\nrelated to memory pressure from state data storage in the JVM. The RocksDBbacked state store still provides the same degree of fault tolerance as the\nregular state storage as it is included in the stream checkpointing.\n\n\n-----\n\nRocksDB configuration, like checkpoint configuration, is minimal by design and so\nyou only need to declare it in your overall Spark configuration:\n\nspark.conf. set (\n\"spark.sql.streaming.stateStore.providerClass\" ,\n\"com.databricks.sql.streaming.state.RocksDBStateStoreProvider\" )\n\nIf you are monitoring your stream using the streamingQueryListener class, then\nyou will also notice that RocksDB metrics will be included in the stateOperators\nfield. For more detailed information on this see the [RocksDB State Store Metrics](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics)\n[section](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics) of “Structured Streaming in Production.”\n\nIt’s worth noting that large numbers of keys can have other adverse impacts in\naddition to raising memory consumption, especially with unbounded or nonexpiring state keys. With or without RocksDB, the state from the application\nalso gets backed up in checkpoints for fault tolerance. So it makes sense that\nif you have state files being created so that they will not expire, you will keep\naccumulating files in the checkpoint, increasing the amount of storage required\nand potentially the time to write it or recover from failures as well. For the data\nin memory (see the Monitoring & Instrumentation section in part 2 of this blog)\n\nthis situation can lead to somewhat vague out-of-memory errors, and for the\ncheckpointed data written to cloud storage you might observe unexpected\nand unreasonable growth. 
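One way to keep an eye on that growth is the StreamingQueryListener mentioned above. The sketch below is a hedged illustration only: it assumes a recent Spark/Databricks runtime where the Python listener API is available, and the class name and logging are placeholders.

```python
from pyspark.sql.streaming import StreamingQueryListener

class ProgressLogger(StreamingQueryListener):
    def onQueryStarted(self, event):
        print(f"stream started: {event.name} ({event.id})")

    def onQueryProgress(self, event):
        # The JSON progress payload includes the stateOperators field, where
        # RocksDB metrics show up once the RocksDB provider is enabled.
        print(event.progress.json)

    def onQueryIdle(self, event):
        pass

    def onQueryTerminated(self, event):
        print(f"stream terminated: {event.id}")

spark.streams.addListener(ProgressLogger())
```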
Unless you have a business need to retain streaming\nstate for all the data that has been processed (and that is rare), read the [Spark](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)\n[Structured Streaming documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) and make sure to implement your stateful\noperations so that the system can drop state records that are no longer needed\n(pay close attention to dropDuplicates and stream-stream joins).\n\n\n**Running multiple streams on a cluster**\n\nOnce your streams are fully tested and configured, it’s time to figure out how to\norganize them in production. It’s a common pattern to stack multiple streams on\nthe same Spark cluster to maximize resource utilization and save cost. This is fine\nto a point, but there are limits to how much you can add to one cluster before\nperformance is affected. The driver has to manage all of the streams running on\nthe cluster, and all streams will compete for the same cores across the workers.\nYou need to understand what your streams are doing and plan your capacity\nappropriately to stack effectively.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3198901ba583ef2045fa27542ef5cd04", + "**Running multiple streams on a cluster**\n\nOnce your streams are fully tested and configured, it’s time to figure out how to\norganize them in production. It’s a common pattern to stack multiple streams on\nthe same Spark cluster to maximize resource utilization and save cost. This is fine\nto a point, but there are limits to how much you can add to one cluster before\nperformance is affected. The driver has to manage all of the streams running on\nthe cluster, and all streams will compete for the same cores across the workers.\nYou need to understand what your streams are doing and plan your capacity\nappropriately to stack effectively.\n\nHere is what you should take into account when you’re planning on stacking\nmultiple streams on the same cluster:\n\n**•** Make sure your driver is big enough to manage all of your streams. Is your\ndriver struggling with a high CPU utilization and garbage collection? That\nmeans it’s struggling to manage all of your streams. Either reduce the\nnumber of streams or increase the size of your driver.\n\n**•** Consider the amount of data each stream is processing. The more data\nyou are ingesting and writing to a sink, the more cores you will need in\norder to maximize your throughput for each stream. You’ll need to reduce\nthe number of streams or increase the number of workers depending on\nhow much data is being processed. For sources like Kafka you will need to\nconfigure how many cores are being used to ingest with the minPartitions\noption if you don’t have enough cores for all of the partitions across all of\nyour streams.\n\n\n-----\n\n**•** Consider the complexity and data volume of your streams. If all of the\nstreams are doing minimal manipulation and just appending to a sink, then\neach stream will need fewer resources per microbatch and you’ll be able to\nstack more. If the streams are doing stateful processing or computation/\nmemory-intensive operations, that will require more resources for good\nperformance and you’ll want to stack fewer streams.\n\n**•** Consider [scheduler pools](https://spark.apache.org/docs/latest/job-scheduling.html#fair-scheduler-pools) . 
When stacking streams they will all be\ncontending for the same workers and cores, and one stream that needs a\nlot of cores will cause the other streams to wait. Scheduler pools enable\nyou to have different streams execute on different parts of the cluster.\nThis will enable streams to execute in parallel with a subset of the available\nresources.\n\n\n**Conclusion**\n\nSome of the ideas we’ve addressed here certainly deserve their own time\nand special treatment with a more in-depth discussion, which you can look\nforward to in later deep dives. However, we hope these recommendations are\nuseful as you begin your journey or seek to enhance your production streaming\nexperience. Be sure to continue with the next post, “Streaming in Production:\nCollected Best Practices, Part 2.”\n\n**[Review Databrick’s Structured Streaming Getting Started Guide](https://www.databricks.com/spark/getting-started-with-apache-spark/streaming)**\n\n\n\n**•** Consider your SLA. If you have mission critical streams, isolate them as a\nbest practice so lower-criticality streams do not affect them.\n\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nOn Databricks we typically see customers stack between 10-30 streams on a\ncluster, but this varies depending on the use case. Consider the factors above so\nthat you can have a good experience with performance, cost and maintainability.\n\n\n-----\n\nSECTION 2.5 \u0007\n\n**Streaming in Production: Collected Best Practices, Part 2**\n\nby **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nJanuary 10, 2023\n\n\nIn our two-part blog series titled “Streaming in Production: Collected Best\nPractices,” this is the second article. Here we discuss the “After Deployment”\nconsiderations for a Structured Streaming Pipeline. The majority of the\nsuggestions in this post are relevant to both Structured Streaming Jobs and\nDelta Live Tables (our flagship and fully managed ETL product that supports\nboth batch and streaming pipelines).\n\n**After deployment**\n\nAfter the deployment of your streaming application, there are typically three\nmain things you’ll want to know:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "ca43835d6b287f050d49c4ff6a5c01ee", + "On Databricks we typically see customers stack between 10-30 streams on a\ncluster, but this varies depending on the use case. Consider the factors above so\nthat you can have a good experience with performance, cost and maintainability.\n\n\n-----\n\nSECTION 2.5 \u0007\n\n**Streaming in Production: Collected Best Practices, Part 2**\n\nby **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nJanuary 10, 2023\n\n\nIn our two-part blog series titled “Streaming in Production: Collected Best\nPractices,” this is the second article. Here we discuss the “After Deployment”\nconsiderations for a Structured Streaming Pipeline. 
The majority of the\nsuggestions in this post are relevant to both Structured Streaming Jobs and\nDelta Live Tables (our flagship and fully managed ETL product that supports\nboth batch and streaming pipelines).\n\n**After deployment**\n\nAfter the deployment of your streaming application, there are typically three\nmain things you’ll want to know:\n\n**•** How is my application running?\n\n**•** Are resources being used efficiently?\n\n**•** How do I manage any problems that come up?\n\nWe’ll start with an introduction to these topics, followed by a deeper dive later in\nthis blog series.\n\n\n**Monitoring and instrumentation (How is my application running?)**\n\nStreaming workloads should be pretty much hands-off once deployed to\nproduction. However, one thing that may sometimes come to mind is: “how is my\napplication running?” Monitoring applications can take on different levels and\nforms depending on:\n\n**•** the metrics collected for your application (batch duration/latency,\nthroughput, …)\n\n**•** where you want to monitor the application from\n\nAt the simplest level, there is a streaming dashboard ( [A Look at the New](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html)\n[Structured Streaming UI](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) ) and built-in logging directly in the Spark UI that can be\nused in a variety of situations.\n\nThis is in addition to setting up failure alerts on jobs running streaming\nworkloads.\n\nIf you want more fine-grained metrics or to create custom actions based on\nthese metrics as part of your code base, then the StreamingQueryListener is\nbetter aligned with what you’re looking for.\n\n\n-----\n\nIf you want the Spark metrics to be reported (including machine level traces for\ndrivers or workers) you should use the platform’s [metrics sink](https://spark.apache.org/docs/latest/monitoring.html#metrics) .\n\nThe Apache Spark Structured Streaming UI\n\n\nAnother point to consider is where you want to surface these metrics for\nobservability. There is a Ganglia dashboard at the cluster level, integrated partner\napplications like [Datadog](https://www.datadoghq.com/blog/databricks-monitoring-datadog/) for monitoring streaming workloads, or even more open\nsource options you can build using tools like Prometheus and Grafana. Each\nhas advantages and disadvantages to consider around cost, performance, and\nmaintenance requirements.\n\nWhether you have low volumes of streaming workloads where interactions in the\nUI are sufficient or have decided to invest in a more robust monitoring platform,\nyou should know how to observe your production streaming workloads. Further\n“Monitoring and Alerting” posts later in this series will contain a more thorough\ndiscussion. 
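In the meantime, the quickest programmatic check of "how is my application running?" is the progress information every query handle already exposes. A minimal sketch, assuming `query` is the hypothetical handle returned by `.start()` earlier:

```python
import json

status = query.status            # is the trigger active / waiting for data?
last = query.lastProgress        # dict describing the most recent microbatch, or None
recent = query.recentProgress    # list of recent microbatch progress dicts

if last is not None:
    print(json.dumps(
        {k: last.get(k) for k in (
            "batchId", "numInputRows",
            "inputRowsPerSecond", "processedRowsPerSecond", "durationMs")},
        indent=2,
    ))
```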
In particular, we’ll see different measures on which to monitor\nstreaming applications and then later take a deeper look at some of the tools\nyou can leverage for observability.\n\n**Application optimization (Are resources being used effectively?**\n\n**Think “cost”)**\n\nThe next concern we have after deploying to production is “is my application", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1d13998a4c717e8522f9b108d682adc2", + "Whether you have low volumes of streaming workloads where interactions in the\nUI are sufficient or have decided to invest in a more robust monitoring platform,\nyou should know how to observe your production streaming workloads. Further\n“Monitoring and Alerting” posts later in this series will contain a more thorough\ndiscussion. In particular, we’ll see different measures on which to monitor\nstreaming applications and then later take a deeper look at some of the tools\nyou can leverage for observability.\n\n**Application optimization (Are resources being used effectively?**\n\n**Think “cost”)**\n\nThe next concern we have after deploying to production is “is my application\n\nusing resources effectively?” As developers, we understand (or quickly learn) the\ndistinction between working code and well-written code. Improving the way your\ncode runs is usually very satisfying, but what ultimately matters is the overall\ncost of running it. Cost considerations for Structured Streaming applications will\nbe largely similar to those for other Spark applications. One notable difference\nis that failing to optimize for production workloads can be extremely costly,\nas these workloads are frequently “always-on” applications, and thus wasted\nexpenditure can quickly compound. Because assistance with cost optimization is\n\n\n-----\n\nfrequently requested, a separate post in this series will address it. The key points\nthat we’ll focus on will be efficiency of usage and sizing.\n\nGetting the cluster sizing right is one of the most significant differences between\nefficiency and wastefulness in streaming applications. This can be particularly\ntricky because in some cases it’s difficult to estimate the full load conditions of\nthe application in production before it’s actually there. In other cases, it may be\ndifficult due to natural variations in volume handled throughout the day, week, or\nyear. When first deploying, it can be beneficial to oversize slightly, incurring the\nextra expense to avoid inducing performance bottlenecks. Utilize the monitoring\ntools you chose to employ after the cluster has been running for a few weeks\nto ensure proper cluster utilization. For example, are CPU and memory levels\nbeing used at a high level during peak load or is the load generally small and the\ncluster may be downsized? Maintain regular monitoring of this and keep an eye\nout for changes in data volume over time; if either occurs, a cluster resize may be\nrequired to maintain cost-effective operation.\n\nAs a general guideline, you should avoid excessive shuffle operations, joins, or an\nexcessive or extreme watermark threshold (don’t exceed your needs), as each\ncan increase the number of resources you need to run your application. A large\nwatermark threshold will cause Structured Streaming to keep more data in the\nstate store between batches, leading to an increase in memory requirements\nacross the cluster. 
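As a point of reference, here is a hedged sketch of a windowed aggregation whose watermark is kept only as large as the lateness the business actually tolerates, which bounds how much state is retained between microbatches. `events_df`, its `event_time` and `region` columns, and the paths are hypothetical.

```python
from pyspark.sql import functions as F

hourly_counts = (
    events_df
    .withWatermark("event_time", "30 minutes")           # allow 30 min of lateness, no more
    .groupBy(F.window("event_time", "1 hour"), "region")
    .count()
)

query = (
    hourly_counts.writeStream
    .queryName("hourly_region_counts")
    .outputMode("append")                                 # emit windows once the watermark passes
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/hourly_region_counts")
    .start("/tmp/tables/hourly_region_counts")
)
```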
Also, pay attention to the type of VM configured — are you\nusing memory-optimized for your memory-intense stream? Compute-optimized\nfor your computationally-intensive stream? If not, look at the utilization levels\nfor each and consider trying a machine type that could be a better fit. Newer\nfamilies of servers from cloud providers with more optimal CPUs often lead to\nfaster execution, meaning you might need fewer of them to meet your SLA.\n\n\n**Troubleshooting (How do I manage any problems that come up?)**\n\nThe last question we ask ourselves after deployment is “how do I manage any\nproblems that come up?” As with cost optimization, troubleshooting streaming\napplications in Spark often looks the same as other applications since most of\nthe mechanics remain the same under the hood. For streaming applications,\nissues usually fall into two categories — failure scenarios and latency scenarios\n\n**Failure scenarios**\n\nFailure scenarios typically manifest with the stream stopping with an error,\nexecutors failing or a driver failure causing the whole cluster to fail. Common\ncauses for this are:\n\n**•** Too many streams running on the same cluster, causing the driver to be\noverwhelmed. On Databricks, this can be seen in Ganglia, where the driver\nnode will show up as overloaded before the cluster fails.\n\n**•** Too few workers in a cluster or a worker size with too small of a core-tomemory ratio, causing executors to fail with an Out Of Memory error.\nThis can also be seen on Databricks in Ganglia before an executor fails,\nor in the Spark UI under the executors tab.\n\n**•** Using a collect to send too much data to the driver, causing it to fail\nwith an Out Of Memory error.\n\n\n-----\n\n**Latency scenarios**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "c78de09600f8dd6cbf1ef88ae8462fb3", + "**Failure scenarios**\n\nFailure scenarios typically manifest with the stream stopping with an error,\nexecutors failing or a driver failure causing the whole cluster to fail. Common\ncauses for this are:\n\n**•** Too many streams running on the same cluster, causing the driver to be\noverwhelmed. On Databricks, this can be seen in Ganglia, where the driver\nnode will show up as overloaded before the cluster fails.\n\n**•** Too few workers in a cluster or a worker size with too small of a core-tomemory ratio, causing executors to fail with an Out Of Memory error.\nThis can also be seen on Databricks in Ganglia before an executor fails,\nor in the Spark UI under the executors tab.\n\n**•** Using a collect to send too much data to the driver, causing it to fail\nwith an Out Of Memory error.\n\n\n-----\n\n**Latency scenarios**\n\nFor latency scenarios, your stream will not execute as fast as you want or expect.\nA latency issue can be intermittent or constant. Too many streams or too small\nof a cluster can be the cause of this as well. Some other common causes are:\n\n**•** Data skew — when a few tasks end up with much more data than the rest\nof the tasks. With skewed data, these tasks take longer to execute than the\nothers, often spilling to disk. Your stream can only run as fast as its slowest\ntask.\n\n**•** Executing a stateful query without defining a watermark or defining a very\nlong one will cause your state to grow very large, slowing down your stream\nover time and potentially leading to failure.\n\n**•** Poorly optimized sink. 
For example, performing a merge into an overpartitioned Delta table as part of your stream.\n\n**•** Stable but high latency (batch execution time). Depending on the cause,\nadding more workers to increase the number of cores concurrently available\nfor Spark tasks can help. Increasing the number of input partitions and/or\ndecreasing the load per core through batch size settings can also reduce\nthe latency.\n\nJust like troubleshooting a batch job, you’ll use Ganglia to check cluster\nutilization and the Spark UI to find performance bottlenecks. There is a\nspecific [Structured Streaming tab](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) in the Spark UI created to help monitor and\ntroubleshoot streaming applications. On that tab each stream that is running will\nbe listed, and you’ll see either your stream name if you named your stream or\n\n\n if you didn’t. You’ll also see a stream ID that will be visible on the Jobs\ntab of the Spark UI so that you can tell which jobs are for a given stream.\n\nYou’ll notice above we said which jobs are for a given stream. It’s a common\nmisconception that if you were to look at a streaming application in the Spark\nUI you would just see one job in the Jobs tab running continuously. Instead,\ndepending on your code, you will see one or more jobs that start and complete\nfor each microbatch. Each job will have the stream ID from the Structured\nStreaming tab and a microbatch number in the description, so you’ll be able to\ntell which jobs go with which stream. You can click into those jobs to find the\nlongest running stages and tasks, check for disk spills, and search by Job ID in\nthe SQL tab to find the slowest queries and check their explain plans.\n\nThe Jobs tab in the Apache Spark UI\n\n\n-----\n\nIf you click on your stream in the Structured Streaming tab you’ll see how much\ntime the different streaming operations are taking for each microbatch, such as\nadding a batch, query planning and committing (see earlier screenshot of the\nApache Spark Structured Streaming UI). You can also see how many rows are\nbeing processed as well as the size of your state store for a stateful stream.\nThis can give insights into where potential latency issues are.\n\nWe will go more in-depth with troubleshooting later in this blog series, where\nwe’ll look at some of the causes and remedies for both failure scenarios and\nlatency scenarios as we outlined above.\n\n**Conclusion**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "d0f45ea2c08b30fbe5cd7aa463e99148", + "The Jobs tab in the Apache Spark UI\n\n\n-----\n\nIf you click on your stream in the Structured Streaming tab you’ll see how much\ntime the different streaming operations are taking for each microbatch, such as\nadding a batch, query planning and committing (see earlier screenshot of the\nApache Spark Structured Streaming UI). 
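(To grab those identifiers programmatically rather than from the UI, a one-liner over the active queries works; this is a minimal sketch assuming an active SparkSession named `spark`.)

```python
for q in spark.streams.active:
    # name/id/runId are the same identifiers shown in the Structured Streaming and Jobs tabs
    print(f"name={q.name} id={q.id} runId={q.runId} active={q.isActive}")
```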
You can also see how many rows are\nbeing processed as well as the size of your state store for a stateful stream.\nThis can give insights into where potential latency issues are.\n\nWe will go more in-depth with troubleshooting later in this blog series, where\nwe’ll look at some of the causes and remedies for both failure scenarios and\nlatency scenarios as we outlined above.\n\n**Conclusion**\n\nYou may have noticed that many of the topics covered here are very similar to\nhow other production Spark applications should be deployed. Whether your\nworkloads are primarily streaming applications or batch processes, the majority\nof the same principles will apply. We focused more on things that become\nespecially important when building out streaming applications, but as we’re\n\n\nsure you’ve noticed by now, the topics we discussed should be included in\nmost production deployments.\n\nAcross the majority of industries in the world today information is needed\nfaster than ever, but that won’t be a problem for you. With Spark Structured\nStreaming you’re set to make it happen at scale in production. Be on the lookout\nfor more in-depth discussions on some of the topics we’ve covered in this blog,\nand in the meantime keep streaming!\n\n**[Review Databricks Structured Streaming in](https://docs.databricks.com/structured-streaming/production.html)**\n**[Production Documentation](https://docs.databricks.com/structured-streaming/production.html)**\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.6 \u0007\n\n**Building Geospatial Data Products**\n\nby **M I L O S C O L I C**\n\nJanuary 6, 2023\n\n\nGeospatial data has been driving innovation for centuries, through use of\nmaps, cartography and more recently through digital content. For example,\nthe oldest map has been found etched in a piece of mammoth tusk and dates\n[approximately 25,000 BC](https://en.wikipedia.org/wiki/History_of_cartography) . This makes geospatial data one of the oldest data\nsources used by society to make decisions. A more recent example, labeled\nas the birth of spatial analysis, is that of Charles Picquet in 1832 who used\ngeospatial data to analyze [Cholera outbreaks in Paris](https://gallica.bnf.fr/ark:/12148/bpt6k842918.image) ; a couple of decades\nlater John Snow in 1854 followed the same approach for [Cholera outbreaks in](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak)\n[London](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak) . These two individuals used geospatial data to solve one of the toughest\nproblems of their times and in effect save countless lives. 
Fast-forwarding to the\n20th century, the concept of [Geographic Information Systems (GIS)](https://education.nationalgeographic.org/resource/geographic-information-system-gis) was [first](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf)\n[introduced](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf) in 1967 in Ottawa, Canada, by the Department of Forestry and\nRural Development.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e010236328d56ff6a8034a16f0a7902a", + "Today we are in the midst of the cloud computing industry revolution —\nsupercomputing scale available to any organization, virtually infinitely scalable\nfor both storage and compute. Concepts like [data mesh](https://www.databricks.com/blog/2022/10/19/building-data-mesh-based-databricks-lakehouse-part-2.html) and [data marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html)\nare emerging within the data community to address questions like platform\nfederation and interoperability. How can we adopt these concepts to geospatial\ndata, spatial analysis and GIS systems? By adopting the concept of data\nproducts and approaching the design of geospatial data as a product.\n\n\nIn this blog we will provide a point of view on how to design scalable geospatial\ndata products that are modern and robust. We will discuss how Databricks\nLakehouse Platform can be used to unlock the full potential of geospatial\nproducts that are one of the most valuable assets in solving the toughest\nproblems of today and the future.\n\n**What is a data product? And how to design one?**\n\nThe most broad and the most concise definition of a “data product” was coined\nby DJ Patil (the first U.S. Chief Data Scientist) in _Data Jujitsu: The Art of Turning_\n_Data into Product:_ “a product that facilitates an end goal through the use of\ndata.” The complexity of this definition (as admitted by Patil himself) is needed to\nencapsulate the breadth of possible products, to include dashboards, reports, Excel\n\nspreadsheets, and even CSV extracts shared via emails. You might notice that the\nexamples provided deteriorate rapidly in quality, robustness and governance.\n\nWhat are the concepts that differentiate a successful product versus an\nunsuccessful one? Is it the packaging? Is it the content? Is it the quality of the\ncontent? Or is it only the product adoption in the market? Forbes defines the\n10 must-haves of a successful product. A good framework to summarize this is\nthrough the value pyramid.\n\n\n-----\n\nFigure 1: Product value pyramid (source)\n\nThe value pyramid provides a priority on each aspect of the product. Not every\nvalue question we ask about the product carries the same amount of weight. If\nthe output is not useful none of the other aspects matter — the output isn’t really\na product but becomes more of a data pollutant to the pool of useful results.\nLikewise, scalability only matters after simplicity and explainability are addressed.\n\nHow does the value pyramid relate to the data products? 
Each data output, in\norder to be a data product:\n\n**•** **Should have clear usefulness.** The amount of the data society is\ngenerating is rivaled only by the amount of data pollutants we are\ngenerating. These are outputs lacking clear value and use, much less a\nstrategy for what to do with them.\n\n\n\n**•** **Should be explainable.** With the emergence of AI/ML, explainability has\nbecome even more important for data driven decision-making. Data\nis as good as the metadata describing it. Think of it in terms of food —\ntaste does matter, but a more important factor is the nutritional value\nof ingredients.\n\n**•** **Should be simple.** An example of product misuse is using a fork to eat\ncereal instead of using a spoon. Furthermore, simplicity is essential but\nnot sufficient — beyond simplicity the products should be intuitive.\nWhenever possible both intended and unintended uses of the data\nshould be obvious.\n\n**•** **Should be scalable.** Data is one of the few resources that grows with\nuse. The more data you process the more data you have. If both inputs\nand outputs of the system are unbounded and ever-growing, then the\nsystem has to be scalable in compute power, storage capacity and\ncompute expressive power. Cloud data platforms like Databricks are in\na unique position to answer for all of the three aspects.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "cade53da742fa49ccab64c5a0370b569", + "**•** **Should be explainable.** With the emergence of AI/ML, explainability has\nbecome even more important for data driven decision-making. Data\nis as good as the metadata describing it. Think of it in terms of food —\ntaste does matter, but a more important factor is the nutritional value\nof ingredients.\n\n**•** **Should be simple.** An example of product misuse is using a fork to eat\ncereal instead of using a spoon. Furthermore, simplicity is essential but\nnot sufficient — beyond simplicity the products should be intuitive.\nWhenever possible both intended and unintended uses of the data\nshould be obvious.\n\n**•** **Should be scalable.** Data is one of the few resources that grows with\nuse. The more data you process the more data you have. If both inputs\nand outputs of the system are unbounded and ever-growing, then the\nsystem has to be scalable in compute power, storage capacity and\ncompute expressive power. Cloud data platforms like Databricks are in\na unique position to answer for all of the three aspects.\n\n**•** **Should generate habits.** In the data domain we are not concerned\nwith customer retention as is the case for the retail products. However,\nthe value of habit generation is obvious if applied to best practices.\nThe systems and data outputs should exhibit the best practices and\npromote them — it should be easier to use the data and the system in\nthe intended way than the opposite.\n\nThe geospatial data should adhere to all the aforementioned aspects — any data\nproducts should. 
On top of this tall order, geospatial data has some specific needs.\n\n\n-----\n\n**Geospatial data standards**\n\n\n\n**•** **“Advocate the understanding and use of geospatial data standards**\n**within other sectors of government.”** — Value pyramid applies to\nthe standards as well — concepts like ease of adherence (usefulness/\nsimplicity), purpose of the standard (explainability/usefulness), adoption\n(habit generation) are critical for the value generation of a standard.\n\nA critical tool for achieving the data standards mission is the [FAIR](https://www.go-fair.org/fair-principles/) data\nprinciples:\n\n**•** **Findable** — The first step in (re)using data is to find them. Metadata\nand data should be easy to find for both humans and computers.\nMachine-readable metadata are essential for automatic discovery of\ndata sets and services.\n\n**•** **Accessible** — Once the user finds the required data, she/he/they\nneed to know how they can be accessed, possibly including\nauthentication and authorization.\n\n**•** **Interoperable** — The data usually needs to be integrated with\nother data. In addition, the data needs to interoperate with\napplications or workflows for analysis, storage, and processing.\n\n**•** **Reusable** — The ultimate goal of FAIR is to optimize the reuse of data.\nTo achieve this, metadata and data should be well-described so that\nthey can be replicated and/or combined in different settings.\n\n\nGeospatial data standards are used to ensure that geographic data is collected,\norganized, and shared in a consistent and reliable way. These standards can\ninclude guidelines for things like data formatting, coordinate systems, map\nprojections, and metadata. Adhering to standards makes it easier to share data\nbetween different organizations, allowing for greater collaboration and broader\naccess to geographic information.\n\nThe Geospatial Commision (UK government) has defined the UK Geospatial\nData Standards Register as a central repository for data standards to be applied\nin the case of geospatial data. Furthermore, the mission of this registry is to:\n\n**•** **“Ensure UK geospatial data is more consistent and coherent and usable**\n**across a wider range of systems.”** — These concepts are a callout for the\nimportance of explainability, usefulness and habit generation (possibly\nother aspects of the value pyramid).\n\n**•** **“Empower the UK geospatial community to become more engaged with**\n**the relevant standards and standards bodies.”** — Habit generation within\nthe community is as important as the robust and critical design on the\nstandard. If not adopted standards are useless.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7b16e03feb511254489054d702a882a4", + "The Geospatial Commision (UK government) has defined the UK Geospatial\nData Standards Register as a central repository for data standards to be applied\nin the case of geospatial data. 
Furthermore, the mission of this registry is to:\n\n**•** **“Ensure UK geospatial data is more consistent and coherent and usable**\n**across a wider range of systems.”** — These concepts are a callout for the\nimportance of explainability, usefulness and habit generation (possibly\nother aspects of the value pyramid).\n\n**•** **“Empower the UK geospatial community to become more engaged with**\n**the relevant standards and standards bodies.”** — Habit generation within\nthe community is as important as the robust and critical design on the\nstandard. If not adopted standards are useless.\n\n\n-----\n\nWe share the belief that the FAIR principles are crucial for the design of scalable\ndata products we can trust. To be fair, FAIR is based on common sense, so why\nis it key to our considerations? _“What I see in FAIR is not new in itself, but what it_\n_does well is to articulate, in an accessible way, the need for a holistic approach_\n_to data improvement. This ease in communication is why FAIR is being used_\n_increasingly widely as an umbrella for data improvement — and not just in the_\n_geospatial community.”_ — [A FAIR wind sets our course for data improvement](https://geospatialcommission.blog.gov.uk/2022/03/02/a-fair-wind-sets-our-course-for-data-improvement/) .\n\nTo further support this approach, the [Federal Geographic Data Committee](https://www.fgdc.gov/standards) has\ndeveloped the [National Spatial Data Infrastructure (NSDI) Strategic Plan](https://www.fgdc.gov/nsdi-plan/nsdi-strategic-plan-2021-2024.pdf) that\ncovers the years 2021-2024 and was approved in November 2020. The goals\nof NSDI are in essence FAIR principles and convey the same message of designing\nsystems that promote the circular economy of data — data products that flow\nbetween organizations following common standards and in each step through the\ndata supply chain unlock new value and new opportunities. The fact that these\nprinciples are permeating different jurisdictions and are adopted across different\nregulators is a testament to the robustness and soundness of the approach.\n\n\nThe FAIR concepts weave really well together with the data product design.\nIn fact FAIR is traversing the whole product value pyramid and forms a value\ncycle. By adopting both the value pyramid and FAIR principles we design data\nproducts with both internal and external outlook. This promotes data reuse\nas opposed to data accumulation.\n\nWhy do FAIR principles matter for geospatial data and geospatial data\n\nproducts? FAIR is transcendent to geospatial data, it is actually transcendent\nto data, it is a simple yet coherent system of guiding principles for good design\n— and that good design can be applied to anything including geospatial data\nand geospatial systems.\n\n\nFigure 2:\nNDSI Strategic Goals\n\n\n-----\n\n**Grid index systems**\n\nIn traditional GIS solutions’ performance of spatial operations are usually\nachieved by building tree structures ( [KD trees](https://en.wikipedia.org/wiki/K-d_tree) , [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces) , [Quad trees](https://en.wikipedia.org/wiki/Quadtree) , etc).\nThe issue with tree approaches is that they eventually break the scalability\nprinciple — when the data is too big to be processed in order to build the tree\nand the computation required to build the tree is too long and defeats the\npurpose. 
This also negatively affects the accessibility of data; if we cannot\nconstruct the tree we cannot access the complete data and in effect we cannot\nreproduce the results. In this case, grid index systems provide a solution.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "999a832f6cc41314899f6ef6c3cb3f06", + "Figure 2:\nNDSI Strategic Goals\n\n\n-----\n\n**Grid index systems**\n\nIn traditional GIS solutions’ performance of spatial operations are usually\nachieved by building tree structures ( [KD trees](https://en.wikipedia.org/wiki/K-d_tree) , [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces) , [Quad trees](https://en.wikipedia.org/wiki/Quadtree) , etc).\nThe issue with tree approaches is that they eventually break the scalability\nprinciple — when the data is too big to be processed in order to build the tree\nand the computation required to build the tree is too long and defeats the\npurpose. This also negatively affects the accessibility of data; if we cannot\nconstruct the tree we cannot access the complete data and in effect we cannot\nreproduce the results. In this case, grid index systems provide a solution.\n\n\nGrid index systems are built from the start with the scalability aspects of the\ngeospatial data in mind. Rather than building the trees, they define a series of\ngrids that cover the area of interest. In the case of [H3](https://h3geo.org/) (pioneered by Uber),\nthe grid covers the area of the Earth; in the case of local grid index systems\n(e.g., [British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) ) they may only cover the specific area of interest.\nThese grids are composed of cells that have unique identifiers. There is a\nmathematical relationship between location and the cell in the grid. This makes\nthe grid index systems very scalable and parallel in nature.\n\n\nFigure 4: Grid Index Systems (H3, British National Grid)\n\n\n-----\n\nAnother important aspect of grid index systems is that they are open source,\nallowing index values to be universally leveraged by data producers and\nconsumers alike. Data can be enriched with the grid index information at any\nstep of its journey through the data supply chain. This makes the grid index\nsystems an example of community driven data standards. Community driven\ndata standards by nature do not require enforcement, which fully adheres\nto the habit generation aspect of value pyramid and meaningfully addresses\ninteroperability and accessibility principles of FAIR.\n\n\nDatabricks has recently announced [native support for the H3 grid index system](https://www.databricks.com/blog/2022/09/14/announcing-built-h3-expressions-geospatial-processing-and-analytics.html)\nfollowing the same value proposition. Adopting common industry standards\ndriven by the community is the only way to properly drive habit generation and\ninteroperability. 
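For a sense of what that looks like in practice, here is a hedged sketch using the built-in H3 expressions announced for recent Databricks Runtimes (per the announcement linked above); the `pickups_df` point data set and its `lon`/`lat` columns are hypothetical.

```python
from pyspark.sql import functions as F

# Assign each point a deterministic H3 cell id at resolution 9.
indexed = pickups_df.selectExpr(
    "*",
    "h3_longlatash3(lon, lat, 9) AS h3_cell"
)

# Because location maps to a cell id mathematically, spatial aggregations and joins
# reduce to ordinary group-bys / equi-joins on the cell column — no tree to build.
cell_counts = indexed.groupBy("h3_cell").count()
```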
To strengthen this statement, organizations like [CARTO](https://carto.com/blog/hexagons-for-location-intelligence/) , [ESRI](https://www.esri.com/arcgis-blog/products/bus-analyst/analytics/using-uber-h3-hexagons-arcgis-business-analyst-pro/)\nand [Google](https://opensource.googleblog.com/2017/12/announcing-s2-library-geometry-on-sphere.html) have been promoting the usage of grid index systems for scalable\nGIS system design. In addition, Databricks Labs project [Mosaic](https://databrickslabs.github.io/mosaic/) supports the\n[British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) as the standard grid index system that is widely used in\nthe UK government. Grid index systems are key for the scalability of geospatial\ndata processing and for properly designing solutions for complex problems\n(e.g., figure 5 — flight holding patterns using H3).\n\n**Geospatial data diversity**\n\nGeospatial data standards spend a solid amount of effort regarding data\nformat standardization, and format for that matter is one of the most\nimportant considerations when it comes to interoperability and reproducibility.\nFurthermore, if the reading of your data is complex — how can we talk about\nsimplicity? Unfortunately geospatial data formats are typically complex, as\ndata can be produced in a number of formats including both open source", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "62d0abb59a0dcbc090a459f7bcd48faf", + "**Geospatial data diversity**\n\nGeospatial data standards spend a solid amount of effort regarding data\nformat standardization, and format for that matter is one of the most\nimportant considerations when it comes to interoperability and reproducibility.\nFurthermore, if the reading of your data is complex — how can we talk about\nsimplicity? Unfortunately geospatial data formats are typically complex, as\ndata can be produced in a number of formats including both open source\n\nand vendor-specific formats. Considering only vector data, we can expect\ndata to arrive in WKT, WKB, GeoJSON, web CSV, CSV, Shape File, GeoPackage,\nand many others. On the other hand, if we are considering raster data we can\nexpect data to arrive in any number of formats such as GeoTiff, netCDF, GRIB, or\nGeoDatabase; for a comprehensive list of formats please consult this [blog](https://gisgeography.com/gis-formats/) .\n\n\nFigure 5: Example of using H3 to express flight holding patterns\n\n\n-----\n\nGeospatial data domain is so diverse and has organically grown over the years\naround the use cases it was addressing. Unification of such a diverse ecosystem\nis a massive challenge. A recent effort by the Open Geospatial Consortium\n(OGC) to standardize to [Apache Parquet](https://parquet.apache.org/) and its geospatial schema specification\n[GeoParquet](https://geoparquet.org/) is a step in the right direction. Simplicity is one of the key aspects\nof designing a good scalable and robust product — unification leads to simplicity\nand addresses one of the main sources of friction in the ecosystem — the data\ningestion. Standardizing to GeoParquet brings a lot of value that addresses all of\nthe aspects of FAIR data and value pyramid.\n\nFigure 6: Geoparquet as a geospatial standard data format\n\n\nWhy introduce another format into an already complex ecosystem? 
GeoParquet\nisn’t a new format — it is a schema specification for Apache Parquet format that\nis already widely adopted and used by the industry and the community. Parquet\nas the base format supports binary columns and allows for storage of arbitrary\ndata payload. At the same time the format supports structured data columns\nthat can store metadata together with the data payload. This makes it a choice\nthat promotes interoperability and reproducibility. Finally, [Delta Lake](https://delta.io/) format\nhas been built on top of parquet and brings [ACID](https://en.wikipedia.org/wiki/ACID) properties to the table. ACID\nproperties of a format are crucial for reproducibility and for trusted outputs. In\naddition, Delta is the format used by scalable data sharing solution [Delta Sharing](https://www.databricks.com/product/delta-sharing) .\n\nDelta Sharing enables enterprise scale data sharing between any public cloud\nusing Databricks (DIY options for private cloud are available using open source\nbuilding blocks). Delta Sharing completely abstracts the need for custom built\nRest APIs for exposing data to other third parties. Any data asset stored in Delta\n(using GeoParquet schema) automatically becomes a data product that can be\nexposed to external parties in a controlled and governed manner. Delta Sharing\nhas been built from the ground up with [security best practices in mind](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html?utm_source=bambu&utm_medium=social&utm_campaign=advocacy&blaid=3352307) .\n\n\n-----\n\nFigure 7: Delta Sharing simplifying data access in the ecosystem\n\n**Circular data economy**\n\n\nBorrowing the concepts from the sustainability domain, we can define a circular\ndata economy as a system in which data is collected, shared, and used in a way\nthat maximizes its value while minimizing waste and negative impacts, such as\nunnecessary compute time, untrustworthy insights, or biased actions based\ndata pollutants. Reusability is the key concept in this consideration — how can\nwe minimize the \"reinvention of the wheel.\" There are countless data assets out\nin the wild that represent the same area, same concepts with just ever slight\nalterations to better match a specific use case. Is this due to the actual", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "d142161c1b5bc29be7c1b51d12681385", + "-----\n\nFigure 7: Delta Sharing simplifying data access in the ecosystem\n\n**Circular data economy**\n\n\nBorrowing the concepts from the sustainability domain, we can define a circular\ndata economy as a system in which data is collected, shared, and used in a way\nthat maximizes its value while minimizing waste and negative impacts, such as\nunnecessary compute time, untrustworthy insights, or biased actions based\ndata pollutants. Reusability is the key concept in this consideration — how can\nwe minimize the \"reinvention of the wheel.\" There are countless data assets out\nin the wild that represent the same area, same concepts with just ever slight\nalterations to better match a specific use case. Is this due to the actual\n\n\noptimizations or due to the fact it was easier to create a new copy of the assets\nthan to reuse the existing ones? 
Or was it too hard to find the existing data\nassets, or maybe it was too complex to define data access patterns.\n\nData asset duplication has many negative aspects in both FAIR considerations\nand data value pyramid considerations — having many disparate similar (but\ndifferent) data assets that represent the same area and same concepts can\ndeteriorate simplicity considerations of the data domain — it becomes hard\nto identify the data asset we actually can trust. It can also have very negative\n\n\n-----\n\nimplications toward habit generation. Many niche communities will emerge\nthat will standardize to themselves ignoring the best practices of the wider\necosystem, or worse yet they will not standardize at all.\n\nIn a circular data economy, data is treated as a valuable resource that can be\nused to create new products and services, as well as improving existing ones.\nThis approach encourages the reuse and recycling of data, rather than treating it\nas a disposable commodity. Once again, we are using the sustainability analogy\nin a literal sense — we argue that this is the correct way of approaching the\nproblem. Data pollutants are a real challenge for organizations both internally and\nexternally. An article by The Guardian states that less than 1% of collected data is\nactually analyzed. There is too much data duplication, the majority of data is hard\nto access and deriving actual value is too cumbersome. Circular data economy\npromotes best practices and reusability of existing data assets allowing for a more\nconsistent interpretation and insights across the wider data ecosystem.\n\n\nFigure 8: Databricks Marketplace\n\n\n-----\n\nInteroperability is a key component of FAIR data principles, and from\ninteroperability a question of circularity comes to mind. How can we design an\necosystem that maximizes data utilization and data reuse? Once again, FAIR\ntogether with the value pyramid holds answers. Findability of the data is key to\nthe data reuse and to solving for data pollution. With data assets that can be\ndiscovered easily we can avoid the recreation of same data assets in multiple\nplaces with just slight alteration. Instead we gain a coherent data ecosystem\nwith data that can be easily combined and reused. Databricks has recently\nannounced the [Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html) . The idea behind the marketplace is in\nline with the original definition of data product by DJ Patel. The marketplace\nwill support sharing of data sets, notebooks, dashboards, and machine learning\nmodels. The critical building block for such a marketplace is the concept of\nDelta Sharing — the scalable, flexible and robust channel for sharing any data —\ngeospatial data included.\n\n\nDesigning scalable data products that will live in the marketplace is crucial.\nIn order to maximize the value add of each data product one should strongly\nconsider FAIR principles and the product value pyramid. Without these guiding\nprinciples we will only increase the issues that are already present in the\ncurrent systems. 
Each data product should solve a unique problem and should\nsolve it in a simple, reproducible and robust way.\n\n**You can read more on how Databricks Lakehouse**\n**Platform can help you accelerate time to value from**\n**your data products in the eBook:** **[A New Approach](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)**\n**[to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)** **.**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7947b116f21540425c77af4e75583f06", + "Designing scalable data products that will live in the marketplace is crucial.\nIn order to maximize the value add of each data product one should strongly\nconsider FAIR principles and the product value pyramid. Without these guiding\nprinciples we will only increase the issues that are already present in the\ncurrent systems. Each data product should solve a unique problem and should\nsolve it in a simple, reproducible and robust way.\n\n**You can read more on how Databricks Lakehouse**\n**Platform can help you accelerate time to value from**\n**your data products in the eBook:** **[A New Approach](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)**\n**[to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)** **.**\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.7 \u0007\n\n**Data Lineage With Unity Catalog**\n\nby **P A U L R O O M E , TA O F E N G A N D S A C H I N T H A K U R**\n\nJune 8, 2022\n\n\nThis blog will discuss the importance of data lineage, some of the common\nuse cases, our vision for better data transparency and data understanding with\ndata lineage.\n\n**What is data lineage and why is it important?**\n\nData lineage describes the transformations and refinements of data from source\nto insight. Lineage includes capturing all the relevant metadata and events\nassociated with the data in its lifecycle, including the source of the data set,\nwhat other data sets were used to create it, who created it and when, what\ntransformations were performed, what other data sets leverage it, and many other\nevents and attributes. With a data lineage solution, data teams get an end-to-end\nview of how data is transformed and how it flows across their data estate.\n\nAs more and more organizations embrace a data-driven culture and set up\nprocesses and tools to democratize and scale data and AI, data lineage is\nbecoming an essential pillar of a pragmatic data management and governance\nstrategy.\n\nTo understand the importance of data lineage, we have highlighted some of the\ncommon use cases we have heard from our customers below.\n\n\n**Impact analysis**\nData goes through multiple updates or revisions over its lifecycle, and\nunderstanding the potential impact of any data changes on downstream\nconsumers becomes important from a risk management standpoint. With data\nlineage, data teams can see all the downstream consumers — applications,\ndashboards, machine learning models or data sets, etc. — impacted by data\nchanges, understand the severity of the impact, and notify the relevant\nstakeholders. 
Lineage also helps IT teams proactively communicate data\nmigrations to the appropriate teams, ensuring business continuity.\n\n**Data understanding and transparency**\nOrganizations deal with an influx of data from multiple sources, and building\na better understanding of the context around data is paramount to ensure\nthe trustworthiness of the data. Data lineage is a powerful tool that enables\ndata leaders to drive better transparency and understanding of data in their\norganizations. Data lineage also empowers data consumers such as data scientists,\ndata engineers and data analysts to be context-aware as they perform analyses,\nresulting in better quality outcomes. Finally, data stewards can see which data sets\nare no longer accessed or have become obsolete to retire unnecessary data and\nensure data quality for end business users .\n\n\n-----\n\n**Debugging and diagnostics**\nYou can have all the checks and balances in place, but something will eventually\nbreak. Data lineage helps data teams perform a root cause analysis of any errors\nin their data pipelines, applications, dashboards, machine learning models, etc.,\nby tracing the error to its source. This significantly reduces the debugging time,\nsaving days, or in many cases, months of manual effort.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "710f6037bd92be3299640636b705d3f3", + "**Data understanding and transparency**\nOrganizations deal with an influx of data from multiple sources, and building\na better understanding of the context around data is paramount to ensure\nthe trustworthiness of the data. Data lineage is a powerful tool that enables\ndata leaders to drive better transparency and understanding of data in their\norganizations. Data lineage also empowers data consumers such as data scientists,\ndata engineers and data analysts to be context-aware as they perform analyses,\nresulting in better quality outcomes. Finally, data stewards can see which data sets\nare no longer accessed or have become obsolete to retire unnecessary data and\nensure data quality for end business users .\n\n\n-----\n\n**Debugging and diagnostics**\nYou can have all the checks and balances in place, but something will eventually\nbreak. Data lineage helps data teams perform a root cause analysis of any errors\nin their data pipelines, applications, dashboards, machine learning models, etc.,\nby tracing the error to its source. This significantly reduces the debugging time,\nsaving days, or in many cases, months of manual effort.\n\n**Compliance and audit readiness**\nMany compliance regulations, such as the General Data Protection Regulation\n(GDPR), California Consumer Privacy Act (CCPA), Health Insurance Portability and\nAccountability Act (HIPPA), Basel Committee on Banking Supervision (BCBS) 239,\nand Sarbanes-Oxley Act (SOX), require organizations to have clear understanding\nand visibility of data flow. As a result, data traceability becomes a key requirement\nin order for their data architecture to meet legal regulations. 
Data lineage helps\norganizations be compliant and audit-ready, thereby alleviating the operational\noverhead of manually creating the trails of data flows for audit reporting purposes.\n\n\n**Effortless transparency and proactive control with**\n**data lineage**\n\nThe [lakehouse](https://www.databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) provides a pragmatic data management architecture that\nsubstantially simplifies enterprise data infrastructure and accelerates innovation\nby unifying your data warehousing and AI use cases on a single platform.\nWe believe data lineage is a key enabler of better data transparency and data\nunderstanding in your lakehouse, surfacing the relationships between data,\njobs, and consumers, and helping organizations move toward proactive data\nmanagement practices. For example:\n\n**•** As the owner of a dashboard, do you want to be notified next time that a\ntable your dashboard depends upon wasn’t loaded correctly?\n\n**•** As a machine learning practitioner developing a model, do you want to be\nalerted that a critical feature in your model will be deprecated soon?\n\n**•** As a governance admin, do you want to automatically control access to\ndata based on its provenance?\n\nAll of these capabilities rely upon the automatic collection of data lineage across\nall use cases and personas — which is why the lakehouse and data lineage are a\npowerful combination.\n\n\n-----\n\nData lineage for tables\n\nData lineage for table columns\n\n\nData Lineage for notebooks, workflows, dashboards\n\n**Built-in security:** Lineage graphs in Unity Catalog are privilege-aware and share\nthe same permission model as Unity Catalog. If users do not have access to\na table, they will not be able to explore the lineage associated with the table,\nadding an additional layer of security for privacy considerations.\n\n**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\nin near real-time, and retrieved via REST API to support integrations with our\ncatalog partners.\n\n**Getting started with data lineage in Unity Catalog**\n\nData lineage is available with Databricks Premium and Enterprise tiers for\nno additional cost. If you already are a Databricks customer, follow the data\nlineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. If you are not an existing Databricks\ncustomer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n\n\n-----\n\nSECTION 2.8\n\n**Easy Ingestion to Lakehouse With COPY INTO**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "c73fcf9efc9ef252aeab38bb4602e795", + "**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\nin near real-time, and retrieved via REST API to support integrations with our\ncatalog partners.\n\n**Getting started with data lineage in Unity Catalog**\n\nData lineage is available with Databricks Premium and Enterprise tiers for\nno additional cost. 
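Before following the guides referenced next, here is a minimal sketch of pulling table lineage through the REST API mentioned above. The endpoint path, query parameters, and response keys are assumptions based on the publicly documented lineage-tracking API and may differ by API version; the workspace host, token, and table name are placeholders.

```python
# Minimal sketch: retrieve table lineage over the REST API described above.
# Endpoint path, parameters, and response keys are assumptions based on the
# documented lineage-tracking API; host, token, and table name are placeholders.
import requests

DATABRICKS_HOST = "https://<your-workspace>.cloud.databricks.com"
TOKEN = "<personal-access-token>"

resp = requests.get(
    f"{DATABRICKS_HOST}/api/2.0/lineage-tracking/table-lineage",
    headers={"Authorization": f"Bearer {TOKEN}"},
    params={
        "table_name": "main.default.my_table",   # three-level Unity Catalog name
        "include_entity_lineage": "true",        # also return notebooks, jobs, dashboards
    },
)
resp.raise_for_status()
lineage = resp.json()

print(lineage.get("upstreams", []))    # entities this table reads from
print(lineage.get("downstreams", []))  # entities that read from this table
```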
If you already are a Databricks customer, follow the data\nlineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. If you are not an existing Databricks\ncustomer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n\n\n-----\n\nSECTION 2.8\n\n**Easy Ingestion to Lakehouse With COPY INTO**\n\nby **A E M R O A M A R E , E M M A L I U , A M I T K A R A** and **J A S R A J D A N G E**\n\nJanuary 17, 2023\n\n\nA new data management architecture known as the data lakehouse emerged\nindependently across many organizations and use cases to support AI and BI\ndirectly on vast amounts of data. One of the key success factors for using the\ndata lakehouse for analytics and machine learning is the ability to quickly and\neasily ingest data of various types, including data from on-premises storage\nplatforms (data warehouses, mainframes), real-time streaming data, and bulk\ndata assets.\n\nAs data ingestion into the lakehouse is an ongoing process that feeds the\nproverbial ETL pipeline, you will need multiple options to ingest various formats,\ntypes and latency of data. For data stored in cloud object stores such as AWS\nS3, Google Cloud Storage and Azure Data Lake Storage, Databricks offers\nAuto Loader, a natively integrated feature, that allows data engineers to ingest\nmillions of files from the cloud storage continuously. In other streaming cases\n\n(e.g., IoT sensor or clickstream data), Databricks provides native connectors\nfor Apache Spark Structured Streaming to quickly ingest data from popular\nmessage queues, such as [Apache Kafka](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html?_ga=2.117268486.126296912.1643033657-734003504.1641217794) , Azure Event Hubs or AWS Kinesis at low\nlatencies. Furthermore, many customers can leverage popular ingestion tools\n\n\nthat integrate with Databricks, such as Fivetran — to easily ingest data from\nenterprise applications, databases, mainframes and more into the lakehouse.\nFinally, analysts can use the simple “COPY INTO” command to pull new data into\nthe lakehouse automatically, without the need to keep track of which files have\nalready been processed.\n\nThis blog focuses on COPY INTO, a simple yet powerful SQL command that allows\nyou to perform batch file ingestion into Delta Lake from cloud object stores.\nIt’s idempotent, which guarantees to ingest files with exactly-once semantics\nwhen executed multiple times, supporting incremental appends and simple\ntransformations. It can be run once, in an ad hoc manner, and can be scheduled\nthrough Databricks Workflows. In recent Databricks [Runtime releases](https://docs.databricks.com/release-notes/runtime/releases.html) , COPY\nINTO introduced new functionalities for data preview, validation, enhanced error\nhandling, and a new way to copy into a schemaless Delta Lake table so that users\n\ncan get started quickly, completing the end-to-end user journey to ingest from\ncloud object stores. Let’s take a look at the popular COPY INTO use cases.\n\n\n-----\n\n**1. 
Ingesting data for the first time**\n\n\nThe default for data validation is to parse all the data in the source directory to\nensure that there aren’t any issues, but the rows returned for preview are limited.\nOptionally, you can provide the number of rows to preview after VALIDATE.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "025b935f36585c781431ea3cee33f9b6", + "can get started quickly, completing the end-to-end user journey to ingest from\ncloud object stores. Let’s take a look at the popular COPY INTO use cases.\n\n\n-----\n\n**1. Ingesting data for the first time**\n\n\nThe default for data validation is to parse all the data in the source directory to\nensure that there aren’t any issues, but the rows returned for preview are limited.\nOptionally, you can provide the number of rows to preview after VALIDATE.\n\nThe COPY_OPTION “mergeSchema” specifies that it is okay to evolve the schema\nof your target Delta table. Schema evolution only allows the addition of new\ncolumns, and does not support data type changes for existing columns. In other\nuse cases, you can omit this option if you intend to manage your table schema\nmore strictly as your data pipeline may have strict schema requirements and\nmay not want to evolve the schema at all times. However, our target Delta table\nin the example above is an empty, columnless table at the moment; therefore,\nwe have to specify the COPY_OPTION “mergeSchema” here.\n\nFigure 1: COPY INTO VALIDATE mode output\n\n\nCOPY INTO requires a table to exist as it ingests the data into a target Delta\ntable. However, you have no idea what your data looks like. You first create an\nempty Delta table.\n```\n CREATE TABLE my_example_data;\n\n```\nBefore you write out your data, you may want to preview it and ensure the\ndata looks correct. The COPY INTO Validate mode is a new feature in\nDatabricks Runtime [10.3](https://docs.databricks.com/release-notes/runtime/10.3.html) and above that allows you to preview and validate\nsource data before ingesting many files from the cloud object stores.\nThese validations include:\n\n**•** if the data can be parsed\n\n**•** the schema matches that of the target table or if the schema\nneeds to be evolved\n\n**•** all nullability and check constraints on the table are met\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nVALIDATE\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\n\n-----\n\n**2. Configuring COPY INTO**\n\n\nFigure 2 shows the validate output that the header is properly parsed.\n\nFigure 2: COPY INTO VALIDATE mode output with enabled header and inferSchema\n\n**3. Appending data to a Delta table**\n\nNow that the preview looks good, we can remove the VALIDATE keyword and\nexecute the COPY INTO command.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n'true' )\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\n\nWhen looking over the results of VALIDATE (see Figure 1), you may notice that\nyour data doesn’t look like what you want. Aren’t you glad you previewed your\ndata set first? The first thing you notice is the column names are not what is\nspecified in the CSV header. What’s worse, the header is shown as a row in your\ndata. 
You can configure the CSV parser by specifying FORMAT_OPTIONS.\nLet’s add those next.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nVALIDATE\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n'true' )\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\nWhen using the FORMAT OPTION, you can tell COPY INTO to infer the data types\nof the CSV file by specifying the inferSchema option; otherwise, all default\ndata types are STRINGs. On the other hand, binary file formats like AVRO and\nPARQUET do not need this option since they define their own schema. Another", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "f0435ebff1911d8a7202adcdcd4eb6cc", + "COPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nVALIDATE\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n'true' )\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\nWhen using the FORMAT OPTION, you can tell COPY INTO to infer the data types\nof the CSV file by specifying the inferSchema option; otherwise, all default\ndata types are STRINGs. On the other hand, binary file formats like AVRO and\nPARQUET do not need this option since they define their own schema. Another\n\noption, “mergeSchema” states that the schema should be inferred over a\ncomprehensive sample of CSV files rather than just one. The comprehensive list\nof format-specific options can be found in the [documentation](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/delta-copy-into#format-options) .\n\n\n-----\n\nCOPY INTO keeps track of the state of files that\nhave been ingested. Unlike commands like INSERT\nINTO, users get idempotency with COPY INTO,\nwhich means users won’t get duplicate data in\nthe target table when running COPY INTO multiple\ntimes from the same source data.\n\nCOPY INTO can be run once, in an ad hoc manner,\nand can be scheduled with Databricks Workflows.\nWhile COPY INTO does not support low latencies\nfor ingesting natively, you can trigger COPY INTO\nthrough orchestrators like Apache Airflow.\n\n\nFigure 3: Databricks workflow UI to schedule a task\n\n\n-----\n\n**4. Secure data access with COPY INTO**\n\nCOPY INTO supports secure access in several ways. In this section, we want to\nhighlight two new options you can use in both [Databricks SQL](https://dbricks.co/dbsql) and notebooks\nfrom recent releases:\n\n**Unity Catalog**\nWith the general availability of Databrick Unity Catalog, you can use COPY INTO\nto ingest data to Unity Catalog managed or external tables from any source and\nfile format supported by COPY INTO. Unity Catalog also adds new options for\nconfiguring secure access to raw data, allowing you to use Unity Catalog external\nlocations or storage credentials to access data in cloud object storage. Learn\nmore about how to use [COPY INTO with Unity Catalog](https://docs.databricks.com/ingestion/copy-into/unity-catalog.html#use-copy-into-to-load-data-with-unity-catalog) .\n\n**Temporary Credentials**\nWhat if you have not configured Unity Catalog or instance profile? How about\ndata from a trusted third party bucket? 
Here is a convenient COPY INTO feature\nthat allows you to [ingest data with inline temporary credentials](https://docs.databricks.com/ingestion/copy-into/temporary-credentials.html) to handle the ad\nhoc bulk ingestion use case.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath' WITH (\nCREDENTIAL (AWS_ACCESS_KEY `=` '...' , AWS_SECRET_KEY `=` '...' , AWS_SESSION_\nTOKEN `=` '...' )\n)\nFILEFORMAT `=` CSV\n\n\n**5. Filtering files for ingestion**\n\nWhat about ingesting a subset of files where the filenames match a pattern? You\ncan apply glob patterns — a glob pattern that identifies the files to load from the\nsource directory. For example, let’s filter and ingest files which contain the word\n`raw_data` in the filename below.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data*.csv'\nFORMAT_OPTIONS ( 'header' `=` 'true' )\n\n**6. Ingest files in a time period**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "375f71f98c2fbe67e42a245d06b4d39b", + "COPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath' WITH (\nCREDENTIAL (AWS_ACCESS_KEY `=` '...' , AWS_SECRET_KEY `=` '...' , AWS_SESSION_\nTOKEN `=` '...' )\n)\nFILEFORMAT `=` CSV\n\n\n**5. Filtering files for ingestion**\n\nWhat about ingesting a subset of files where the filenames match a pattern? You\ncan apply glob patterns — a glob pattern that identifies the files to load from the\nsource directory. For example, let’s filter and ingest files which contain the word\n`raw_data` in the filename below.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data*.csv'\nFORMAT_OPTIONS ( 'header' `=` 'true' )\n\n**6. Ingest files in a time period**\n\nIn data engineering, it is frequently necessary to ingest files that have been\nmodified before or after a specific timestamp. Data between two timestamps\nmay also be of interest. The ‘modifiedAfter’ and ‘modifiedBefore’ format options\noffered by COPY INTO allow users to ingest data from a chosen time window into\na Delta table.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data_*.csv'\nFORMAT_OPTIONS( '2022-0912T10:53:11.000+0000' 'header' ) `=` 'true' , 'modifiedAfter' `=`\n\n\n-----\n\n**7. Correcting data with the force option**\n\nBecause COPY INTO is by default idempotent, running the same query against\nthe same source files more than once has no effect on the destination table\nafter the initial execution. You must propagate changes to the target table\nbecause, in real-world circumstances, source data files in cloud object storage\nmay be altered for correction at a later time. In such a case, it is possible to first\nerase the data from the target table before ingesting the more recent data files\nfrom the source. For this operation you only need to set the copy option ‘force’\nto ‘true’.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data_2022*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' )\nCOPY_OPTIONS ( 'force' `=` 'true' )\n\n\n**8. Applying simple transformations**\n\nWhat if you want to rename columns? Or the source data has changed and a\nprevious column has been renamed to something else? You don’t want to ingest\nthat data as two separate columns, but as a single column. 
We can leverage the\nSELECT statement in COPY INTO perform simple transformations.\n\nCOPY INTO demo.my_example_data\nFROM ( SELECT concat(first_name, \" \", last_name) as full_name,\n`*` EXCEPT (first_name, last_name)\nFROM 's3://my-bucket/exampleDataPath'\n)\nFILEFORMAT `=` CSV\nPATTERN `=` '*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' )\nCOPY_OPTIONS ( 'force' `=` 'true' )\n\n**9. Error handling and observability with COPY INTO**\n\n**Error handling:**\nHow about ingesting data with file corruption issues? Common examples of file\ncorruption are:\n\n**•** Files with an incorrect file format\n\n**•** Failure to decompress\n\n**•** Unreadable files (e.g., invalid Parquet)\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "6dc0f31a19a017a8eebf29308cb0bac0", + "COPY INTO demo.my_example_data\nFROM ( SELECT concat(first_name, \" \", last_name) as full_name,\n`*` EXCEPT (first_name, last_name)\nFROM 's3://my-bucket/exampleDataPath'\n)\nFILEFORMAT `=` CSV\nPATTERN `=` '*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' )\nCOPY_OPTIONS ( 'force' `=` 'true' )\n\n**9. Error handling and observability with COPY INTO**\n\n**Error handling:**\nHow about ingesting data with file corruption issues? Common examples of file\ncorruption are:\n\n**•** Files with an incorrect file format\n\n**•** Failure to decompress\n\n**•** Unreadable files (e.g., invalid Parquet)\n\n\n-----\n\nCOPY INTO’s format option ignoreCorruptFiles helps skip those files while\nprocessing. The result of the COPY INTO command returns the number of files\nskipped in the num_skipped_corrupt_files column. In addition, these corrupt\nfiles aren’t tracked by the ingestion state in COPY INTO, therefore they can be\nreloaded in a subsequent execution once the corruption is fixed. This option is\navailable in Databricks [Runtime 11.0+](https://docs.databricks.com/release-notes/runtime/11.0.html) .\n\nYou can see which files have been detected as corrupt by running COPY INTO in\nVALIDATE mode.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nVALIDATE ALL\nFORMAT_OPTIONS( 'ignoreCorruptFiles' `=` 'true' )\n\n**Observability:**\nIn Databricks Runtime 10.5, [file metadata column](https://docs.databricks.com/ingestion/file-metadata-column.html) was introduced to provide\ninput file metadata information, which allows users to monitor and get key\nproperties of the ingested files like path, name, size and modification time, by\nquerying a hidden STRUCT column called _metadata. To include this information\nin the destination, you must explicitly reference the _metadata column in your\nquery in COPY INTO.\n\nCOPY INTO my_example_data\nFROM (\nSELECT `*` , _metadata source_metadata FROM 's3://my-bucket/\nexampleDataPath'\n)\nFILEFORMAT `=` CSV\n\n\n**How does it compare to Auto Loader?**\n\nCOPY INTO is a simple and powerful command to use when your source\ndirectory contains a small number of files (i.e., thousands of files or less), and if\nyou prefer SQL. In addition, COPY INTO can be used over JDBC to push data into\nDelta Lake at your convenience, a common pattern by many ingestion partners.\nTo ingest a larger number of files both in streaming and batch we recommend\nusing [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) . 
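For comparison with the COPY INTO examples above, a minimal Auto Loader sketch in Python might look like the following. The source path mirrors the earlier examples, while the checkpoint location and target table name are placeholders.

```python
# Minimal Auto Loader sketch for the same CSV ingestion scenario.
# The source path mirrors the COPY INTO examples; the checkpoint location and
# target table name are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

(
    spark.readStream.format("cloudFiles")            # Auto Loader source
    .option("cloudFiles.format", "csv")              # same CSV source format
    .option("cloudFiles.inferColumnTypes", "true")   # infer types instead of all STRINGs
    .option("header", "true")                        # first row is a header
    .load("s3://my-bucket/exampleDataPath")
    .writeStream
    .option("checkpointLocation", "/tmp/checkpoints/my_example_data")
    .trigger(availableNow=True)                      # process available files, then stop
    .toTable("my_example_data")
)
```

Because the checkpoint tracks which files have already been ingested, re-running the stream picks up only new files, giving the same exactly-once behavior that COPY INTO provides through its ingestion state.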
In addition, for a modern data pipeline based on [medallion](https://www.databricks.com/glossary/medallion-architecture)\n[architecture](https://www.databricks.com/glossary/medallion-architecture) , we recommend using Auto Loader in [Delta Live Tables pipelines](https://docs.databricks.com/ingestion/auto-loader/dlt.html) ,\nleveraging advanced capabilities of automatic error handling, quality control,\ndata lineage and setting [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html) in a declarative approach.\n\n**How to get started?**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "f319ce9e48ca97f0ca3c7c1441bc406b", + "**How to get started?**\n\nTo get started, you can go to **[Databricks SQL](https://dbricks.co/dbsql)** query editor, update and run the\nexample SQL commands to ingest from your cloud object stores. Check out\nthe options in No. 4 to establish secure access to your data for querying it in\nDatabricks SQL. To get familiar with COPY INTO in Databricks SQL, you can also\nfollow this [quickstart tutorial.](https://docs.databricks.com/ingestion/copy-into/tutorial-dbsql.html)\n\nAs an alternative, you can use this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/db-385-demo_copy_into.html) in Data Science & Engineering and\nMachine Learning workspaces to learn most of the COPY INTO features in this\nblog, where source data and target Delta tables are generated in DBFS.\n\nMore tutorials for COPY INTO can be found [here](https://docs.databricks.com/ingestion/copy-into/index.html) .\n\n\n-----\n\nSECTION 2.9 \u0007\n\n**Simplifying Change Data Capture With Databricks Delta Live Tables**\n\nby **M O J G A N M A Z O U C H I**\n\nApril 25, 2022\n\n\nThis guide will demonstrate how you can leverage change data capture in Delta\nLive Tables pipelines to identify new records and capture changes made to the\ndata set in your data lake. Delta Live Tables pipelines enable you to develop\nscalable, reliable and low latency data pipelines, while performing change data\ncapturee in your data lake with minimum required computation resources and\nseamless out-of-order data handling.\n\n**Note:** We recommend following [Getting Started with Delta Live Tables](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables)\nwhich explains creating scalable and reliable pipelines using Delta Live Tables\n(DLT) and its declarative ETL definitions.\n\n**Background on change data capture**\n\nChange data capture ( [CDC](https://en.wikipedia.org/wiki/Change_data_capture) ) is a process that identifies and captures incremental\nchanges (data deletes, inserts and updates) in databases, like tracking customer,\norder or product status for near-real-time data applications. 
CDC provides realtime data evolution by processing data in a continuous incremental fashion as\nnew events occur.\n\n\nSince [over 80% of organizations plan on implementing multicloud strategies](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/)\n[by 2025](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/) , choosing the right approach for your business that allows seamless\nreal-time centralization of all data changes in your ETL pipeline across multiple\nenvironments is critical.\n\nBy capturing CDC events, Databricks users can re-materialize the source table\nas Delta Table in Lakehouse and run their analysis on top of it, while being able\nto combine data with external systems. The MERGE INTO command in Delta Lake\non Databricks enables customers to efficiently upsert and delete records in\ntheir data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) .\nThis is a common use case that we observe many of Databricks customers are\nleveraging Delta Lakes to perform, and keeping their data lakes up to date with\nreal-time business data.\n\nWhile Delta Lake provides a complete solution for real-time CDC synchronization\nin a data lake, we are now excited to announce the change data capture feature\nin Delta Live Tables that makes your architecture even simpler, more efficient and\nscalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "00ed295603d6779afac889a6d0524a93", + "By capturing CDC events, Databricks users can re-materialize the source table\nas Delta Table in Lakehouse and run their analysis on top of it, while being able\nto combine data with external systems. The MERGE INTO command in Delta Lake\non Databricks enables customers to efficiently upsert and delete records in\ntheir data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) .\nThis is a common use case that we observe many of Databricks customers are\nleveraging Delta Lakes to perform, and keeping their data lakes up to date with\nreal-time business data.\n\nWhile Delta Lake provides a complete solution for real-time CDC synchronization\nin a data lake, we are now excited to announce the change data capture feature\nin Delta Live Tables that makes your architecture even simpler, more efficient and\nscalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.\n\nEarlier CDC solutions with Delta tables were using MERGE INTO operation, which\nrequires manually ordering the data to avoid failure when multiple rows of the\nsource data set match while attempting to update the same rows of the target\n\n\n-----\n\nDelta table. To handle the out-of-order data, there was an extra step required to\npreprocess the source table using a foreachBatch implementation to eliminate\nthe possibility of multiple matches, retaining only the latest change for each\nkey (see the [change data capture example](https://www.databricks.com/blog/2022/04/25/simplifying-change-data-capture-with-databricks-delta-live-tables.html#) ). 
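To make that extra preprocessing step concrete, a rough sketch of the pre-DLT pattern might look like the following; the table names, key column, and ordering column are illustrative placeholders rather than the exact code from the linked example.

```python
# Rough sketch of the pre-DLT pattern described above: keep only the latest
# change per key before MERGE INTO, so multiple source rows never match the
# same target row. Table names, key, and ordering column are placeholders.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession, Window, functions as F

spark = SparkSession.builder.getOrCreate()

changes = spark.read.table("cdc_raw_changes")

# Retain only the most recent event per key, ordered by the sequencing column.
latest = (
    changes.withColumn(
        "rn",
        F.row_number().over(
            Window.partitionBy("id").orderBy(F.col("operation_date").desc())
        ),
    )
    .filter("rn = 1")
    .drop("rn")
)

(
    DeltaTable.forName(spark, "customer_silver").alias("t")
    .merge(latest.alias("s"), "t.id = s.id")
    .whenMatchedDelete(condition="s.operation = 'DELETE'")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll(condition="s.operation != 'DELETE'")
    .execute()
)
```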
The new APPLY CHANGES INTO\noperation in DLT pipelines automatically and seamlessly handles out-of-order\ndata without any need for data engineering manual intervention.\n\n**CDC with Databricks Delta Live Tables**\n\nIn this blog, we will demonstrate how to use the APPLY CHANGES INTO command\nin Delta Live Tables pipelines for a common CDC use case where the CDC data\nis coming from an external system. A variety of CDC tools are available such\nas Debezium, Fivetran, Qlik Replicate, Talend, and StreamSets. While specific\nimplementations differ, these tools generally capture and record the history\nof data changes in logs; downstream applications consume these CDC logs. In\nour example, data is landed in cloud object storage from a CDC tool such as\nDebezium, Fivetran, etc.\n\nWe have data from various CDC tools landing in a cloud object storage or a\nmessage queue like Apache Kafka. Typically we see CDC used in an ingestion\nto what we refer as the medallion architecture. A medallion architecture is a\ndata design pattern used to logically organize data in a Lakehouse, with the\ngoal of incrementally and progressively improving the structure and quality of\ndata as it flows through each layer of the architecture. Delta Live Tables allows\nyou to seamlessly apply changes from CDC feeds to tables in your Lakehouse;\ncombining this functionality with the medallion architecture allows for\n\n\nincremental changes to easily flow through analytical workloads at scale. Using\nCDC together with the medallion architecture provides multiple benefits to users\nsince only changed or added data needs to be processed. Thus, it enables users\nto cost-effectively keep Gold tables up-to-date with the latest business data.\n\n**NOTE:** The example here applies to both SQL and Python versions of CDC\nand also on a specific way to use the operations; to evaluate variations,\nplease see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n\n**Prerequisites**\n\nTo get the most out of this guide, you should have a basic familiarity with:\n\n**•** SQL or Python\n\n**•** Delta Live Tables\n\n**•** Developing ETL pipelines and/or working with Big Data systems\n\n**•** Databricks interactive notebooks and clusters\n\n**•** You must have access to a Databricks Workspace with permissions\nto create new clusters, run jobs, and save data to a location on\nexternal cloud object storage or [DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "bad7fff4d3f967a085f6f130ed286489", + "**NOTE:** The example here applies to both SQL and Python versions of CDC\nand also on a specific way to use the operations; to evaluate variations,\nplease see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n\n**Prerequisites**\n\nTo get the most out of this guide, you should have a basic familiarity with:\n\n**•** SQL or Python\n\n**•** Delta Live Tables\n\n**•** Developing ETL pipelines and/or working with Big Data systems\n\n**•** Databricks interactive notebooks and clusters\n\n**•** You must have access to a Databricks Workspace with permissions\nto create new clusters, run jobs, and save data to a location on\nexternal cloud object storage or 
[DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)\n\n**•** For the pipeline we are creating in this blog, “Advanced” product\nedition which supports enforcement of data quality constraints,\nneeds to be selected\n\n\n-----\n\n**The data set**\n\nHere we are consuming realistic looking CDC data from an external database. In\nthis pipeline, we will use the [Faker](https://github.com/joke2k/faker) library to generate the data set that a CDC\ntool like Debezium can produce and bring into cloud storage for the initial ingest\nin Databricks. Using [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) we incrementally load the messages from cloud\nobject storage, and store them in the Bronze table as it stores the raw messages.\nThe Bronze tables are intended for data ingestion which enable quick access to a\nsingle source of truth. Next we perform APPLY CHANGES INTO from the cleaned\nBronze layer table to propagate the updates downstream to the Silver table. As\ndata flows to Silver tables, generally it becomes more refined and optimized\n(“just-enough”) to provide an enterprise a view of all its key business entities.\nSee the diagram below.\n\n\nThis blog focuses on a simple example that requires a JSON message with\nfour fields of customer’s name, email, address and id along with the two fields:\noperation (which stores operation code (DELETE, APPEND, UPDATE, CREATE) and\noperation_date (which stores the date and timestamp for the record came for\neach operation action) to describe the changed data.\n\nTo generate a sample data set with the above fields, we are using a Python\npackage that generates fake data, Faker. You can find the notebook related to this\ndata generation section [here](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/1-cdc-data-generator.html) . In this notebook we provide the name and storage\nlocation to write the generated data there. We are using the DBFS functionality of\nDatabricks; see the [DBFS documentation](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) to learn more about how it works. Then,\nwe use a PySpark user-defined function to generate the synthetic data set for\neach field, and write the data back to the defined storage location, which we will\nrefer to in other notebooks for accessing the synthetic data set.\n\n**Ingesting the raw data set using Auto Loader**\n\nAccording to the medallion architecture paradigm, the Bronze layer holds the\nmost raw data quality. At this stage we can incrementally read new data using\nAuto Loader from a location in cloud storage. Here we are adding the path to our\ngenerated data set to the configuration section under pipeline settings, which\nallows us to load the source path as a variable. So now our configuration under\npipeline settings looks like below:\n\n\"configuration\" : {\n\"source\" : \"/tmp/demo/cdc_raw\"\n\n\n-----\n\nThen we load this configuration property in our notebooks.\n\nLet’s take a look at the Bronze table we will ingest, a. In SQL, and b. Using Python\n\n**A . S Q L**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "42209110d6df5aaf79390f53667c369f", + "**Ingesting the raw data set using Auto Loader**\n\nAccording to the medallion architecture paradigm, the Bronze layer holds the\nmost raw data quality. 
At this stage we can incrementally read new data using\nAuto Loader from a location in cloud storage. Here we are adding the path to our\ngenerated data set to the configuration section under pipeline settings, which\nallows us to load the source path as a variable. So now our configuration under\npipeline settings looks like below:\n\n\"configuration\" : {\n\"source\" : \"/tmp/demo/cdc_raw\"\n\n\n-----\n\nThen we load this configuration property in our notebooks.\n\nLet’s take a look at the Bronze table we will ingest, a. In SQL, and b. Using Python\n\n**A . S Q L**\n\nSET spark.source;\nCREATE STREAMING LIVE TABLE customer_bronze\n(\naddress string ,\nemail string ,\nid string ,\nfirstname string ,\nlastname string ,\noperation string ,\noperation_date string ,\n_rescued_data string\n)\nTBLPROPERTIES ( \"quality\" = \"bronze\" )\nCOMMENT \"New customer data incrementally ingested from cloud object\nstorage landing zone\"\nAS\nSELECT *\nFROM cloud_files( \"${source}/customers\" , \"json\" , map( \"cloudFiles.\ninferColumnTypes\" , \"true\" ));\n\n\n**B . P Y T H O N**\n\nimport dlt\nfrom pyspark.sql.functions import - \nfrom pyspark.sql.types import - \n\nsource = spark.conf.get( \"source\" )\n\n**@dlt.table(name=** **\"customer_bronze\"** **,**\n**comment =** **\"New customer data incrementally ingested from**\n**cloud object storage landing zone\"** **,**\n**table_properties={**\n**\"quality\"** **:** **\"bronze\"**\n**}**\n**)**\n```\n def customer_bronze ():\n\n```\nreturn (\nspark.readStream. format ( \"cloudFiles\" ) \\\n.option( \"cloudFiles.format\" , \"json\" ) \\\n.option( \"cloudFiles.inferColumnTypes\" , \"true\" ) \\\n.load( f\" {source} /customers\" )\n)\n\nThe above statements use the Auto Loader to create a streaming live table\ncalled customer_bronze from json files. When using Auto Loader in Delta Live\n\nTables, you do not need to provide any location for schema or checkpoint, as\nthose locations will be managed automatically by your DLT pipeline.\n\nAuto Loader provides a Structured Streaming source called cloud_files in\nSQL and cloudFiles in Python, which takes a cloud storage path and format as\nparameters.\n\nTo reduce compute costs, we recommend running the DLT pipeline in\nTriggered mode as a micro-batch assuming you do not have very low latency\nrequirements.\n\n\n-----\n\n**Expectations and high-quality data**\n\nIn the next step to create a high-quality, diverse, and accessible data set,\nwe impose quality check expectation criteria using Constraints. Currently,\na constraint can be either retain, drop, or fail. For more detail see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html) . All\nconstraints are logged to enable streamlined quality monitoring.\n\n**A . S Q L**\n\nCREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(\nCONSTRAINT valid_id EXPECT (id IS NOT NULL ) ON VIOLATION DROP ROW ,\nCONSTRAINT valid_address EXPECT (address IS NOT NULL ),\nCONSTRAINT valid_operation EXPECT (operation IS NOT NULL ) ON VIOLATION\nDROP ROW\n)\nTBLPROPERTIES (\"quality\" `=` \"silver\")\nCOMMENT \"Cleansed bronze customer view (i.e. what will become Silver)\"\nAS SELECT `*`\nFROM STREAM(LIVE.customer_bronze);\n\n**B . P Y T H O N**\n```\n @dlt.view(name= \"customer_bronze_clean_v\" ,\n comment= \"Cleansed bronze customer view (i.e. 
what will become Silver)\" )\n\n```", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "16725d26cc4892da8c004b7b86e4208e", + "**A . S Q L**\n\nCREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(\nCONSTRAINT valid_id EXPECT (id IS NOT NULL ) ON VIOLATION DROP ROW ,\nCONSTRAINT valid_address EXPECT (address IS NOT NULL ),\nCONSTRAINT valid_operation EXPECT (operation IS NOT NULL ) ON VIOLATION\nDROP ROW\n)\nTBLPROPERTIES (\"quality\" `=` \"silver\")\nCOMMENT \"Cleansed bronze customer view (i.e. what will become Silver)\"\nAS SELECT `*`\nFROM STREAM(LIVE.customer_bronze);\n\n**B . P Y T H O N**\n```\n @dlt.view(name= \"customer_bronze_clean_v\" ,\n comment= \"Cleansed bronze customer view (i.e. what will become Silver)\" )\n\n```\n\n**Using APPLY CHANGES INTO statement to propagate changes to**\n\n**downstream target table**\n\nPrior to executing the Apply Changes Into query, we must ensure that a target\nstreaming table which we want to hold the most up-to-date data exists. If it\ndoes not exist we need to create one. Below cells are examples of creating a\ntarget streaming table. Note that at the time of publishing this blog, the target\nstreaming table creation statement is required along with the Apply Changes\nInto query, and both need to be present in the pipeline — otherwise your table\ncreation query will fail.\n\n**A . S Q L**\n\nCREATE STREAMING LIVE TABLE customer_silver\nTBLPROPERTIES (\"quality\" `=` \"silver\")\nCOMMENT \"Clean, merged customers\";\n\n**B . P Y T H O N**\n\ndlt.create_target_table(name= \"customer_silver\" ,\ncomment= \"Clean, merged customers\" ,\ntable_properties={\n\"quality\" : \"silver\"\n\n```\n@dlt.expect_or_drop( \"valid_id\" , \"id IS NOT NULL\" )\n@dlt.expect( \"valid_address\" , \"address IS NOT NULL\" )\n@dlt.expect_or_drop( \"valid_operation\" , \"operation IS NOT NULL\" )\ndef customer_bronze_clean_v ():\n return dlt.read_stream( \"customer_bronze\" ) \\\n\n```\n`.select(` `\"address\"` `,` `\"email\"` `,` `\"id\"` `,` \"firstname\" `,` `\"lastname\"` `,`\n```\n\"operation\" , \"operation_date\" , \"_rescued_data\" )\n\n```\n\n-----\n\nNow that we have a target streaming table, we can propagate changes to the\ndownstream target table using the Apply Changes Into query. While CDC feed\ncomes with INSERT, UPDATE and DELETE events, DLT default behavior is to apply\nINSERT and UPDATE events from any record in the source data set matching\non primary keys, and sequenced by a field which identifies the order of events.\nMore specifically it updates any row in the existing target table that matches\nthe primary key(s) or inserts a new row when a matching record does not exist\nin the target streaming table. We can use APPLY AS DELETE WHEN in SQL, or its\nequivalent apply_as_deletes argument in Python to handle DELETE events.\n\nIn this example we used \"id\" as my primary key, which uniquely identifies the\ncustomers and allows CDC events to apply to those identified customer records\nin the target streaming table. Since \"operation_date\" keeps the logical order of\nCDC events in the source data set, we use \"SEQUENCE BY operation_date\" in\nSQL, or its equivalent \"sequence_by = col(\"operation_date\")\" in Python to handle\nchange events that arrive out of order. Keep in mind that the field value we use\nwith SEQUENCE BY (or sequence_by) should be unique among all updates to\nthe same key. 
In most cases, the sequence by column will be a column with\ntimestamp information.\n\nFinally we used \"COLUMNS * EXCEPT (operation, operation_date, _rescued_\ndata)\" in SQL, or its equivalent \"except_column_list\"= [\"operation\", \"operation_\ndate\", \"_rescued_data\"] in Python to exclude three columns of \"operation\",\n\"operation_date\", \"_rescued_data\" from the target streaming table. By default all\nthe columns are included in the target streaming table, when we do not specify\nthe \"COLUMNS\" clause.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "23039b84f7a58b633ecbba062199cfb8", + "In this example we used \"id\" as my primary key, which uniquely identifies the\ncustomers and allows CDC events to apply to those identified customer records\nin the target streaming table. Since \"operation_date\" keeps the logical order of\nCDC events in the source data set, we use \"SEQUENCE BY operation_date\" in\nSQL, or its equivalent \"sequence_by = col(\"operation_date\")\" in Python to handle\nchange events that arrive out of order. Keep in mind that the field value we use\nwith SEQUENCE BY (or sequence_by) should be unique among all updates to\nthe same key. In most cases, the sequence by column will be a column with\ntimestamp information.\n\nFinally we used \"COLUMNS * EXCEPT (operation, operation_date, _rescued_\ndata)\" in SQL, or its equivalent \"except_column_list\"= [\"operation\", \"operation_\ndate\", \"_rescued_data\"] in Python to exclude three columns of \"operation\",\n\"operation_date\", \"_rescued_data\" from the target streaming table. By default all\nthe columns are included in the target streaming table, when we do not specify\nthe \"COLUMNS\" clause.\n\n\n**A . S Q L**\n\nAPPLY CHANGES INTO LIVE.customer_silver\nFROM stream(LIVE.customer_bronze_clean_v)\nKEYS (id)\nAPPLY AS DELETE WHEN operation `=` \"DELETE\"\nSEQUENCE BY operation_date\nCOLUMNS `*` EXCEPT (operation, operation_date,\n_rescued_data);\n\n**B . P Y T H O N**\n```\n dlt.apply_changes(\n target = \"customer_silver\",\n source = \"customer_bronze_clean_v\",\n keys = [\"id\"],\n sequence_by = col(\"operation_date\"),\n apply_as_deletes = expr(\"operation = 'DELETE'\"),\n except_column_list = [\"operation\", \"operation_date\", \"_rescued_data\"])\n\n```\nTo check out the full list of available clauses see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#requirements) .\n\nPlease note that, at the time of publishing this blog, a table that reads from the\ntarget of an APPLY CHANGES INTO query or apply_changes function must be a\nlive table, and cannot be a streaming live table.\n\nA [SQL](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-retail-dlt-cdc-sql.html) and [Python](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-Retail_DLT_CDC_Python.html) notebook is available for reference for this section. Now that\nwe have all the cells ready, let’s create a pipeline to ingest data from cloud object\nstorage. 
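If you prefer to create the pipeline programmatically instead of through the UI steps that follow, a minimal sketch against the Delta Live Tables Pipelines REST API might look like this. The workspace host, token, and notebook paths are placeholders, and the payload mirrors a subset of the pipeline settings shown below.

```python
# Minimal sketch: create the same DLT pipeline via the Pipelines REST API.
# Host, token, and notebook paths are placeholders; the payload mirrors a
# subset of the pipeline settings JSON shown below.
import requests

DATABRICKS_HOST = "https://<your-workspace>.cloud.databricks.com"
TOKEN = "<personal-access-token>"

settings = {
    "name": "Retail CDC Pipeline",
    "edition": "advanced",
    "development": True,
    "continuous": False,  # Triggered mode
    "clusters": [{"label": "default", "num_workers": 1}],
    "libraries": [
        {"notebook": {"path": "/Repos/<user>/Delta-Live-Tables/notebooks/1-CDC_DataGenerator"}},
        {"notebook": {"path": "/Repos/<user>/Delta-Live-Tables/notebooks/2-Retail_DLT_CDC_sql"}},
    ],
    "configuration": {"source": "/tmp/demo/cdc_raw"},
    "target": "my_database",
    "storage": "dbfs:/home/mydir/myDB/dlt_storage",
}

resp = requests.post(
    f"{DATABRICKS_HOST}/api/2.0/pipelines",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=settings,
)
resp.raise_for_status()
print(resp.json()["pipeline_id"])
```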
Open Jobs in a new tab or window in your workspace, and select “Delta\nLive Tables.”\n\n\n-----\n\nThe pipeline associated with this blog has the following DLT pipeline settings:\n\n{\n\"clusters\" : [\n{\n\"label\" : \"default\" ,\n\"num_workers\" : 1\n}\n],\n\"development\" : true ,\n\"continuous\" : false ,\n\"edition\" : \"advanced\" ,\n\"photon\" : false ,\n\"libraries\" : [\n{\n\"notebook\" : {\n\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\nnotebooks/1-CDC_DataGenerator\"\n}\n},\n{\n\"notebook\" : {\n\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\nnotebooks/2-Retail_DLT_CDC_sql\"\n}\n}\n],\n\"name\" : \"CDC_blog\" ,\n\"storage\" : \"dbfs:/home/mydir/myDB/dlt_storage\" ,\n\"configuration\" : {\n\"source\" : \"/tmp/demo/cdc_raw\" ,\n\"pipelines.applyChangesPreviewEnabled\" : \"true\"\n},\n\"target\" : \"my_database\"\n\n\n1. Select “Create Pipeline” to create a new pipeline\n\n2. Specify a name such as “Retail CDC Pipeline”", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "24fa9012ddc358cd759a98159e468110", + "1. Select “Create Pipeline” to create a new pipeline\n\n2. Specify a name such as “Retail CDC Pipeline”\n\n3. Specify the Notebook Paths that you already created earlier, one for the\ngenerated data set using Faker package, and another path for the ingestion\nof the generated data in DLT. The second notebook path can refer to the\nnotebook written in SQL, or Python depending on your language of choice.\n\n4. To access the data generated in the first notebook, add the data set path in\nconfiguration. Here we stored data in “/tmp/demo/cdc_raw/customers”, so\nwe set “source” to “/tmp/demo/cdc_raw/” to reference “source/customers” in\nour second notebook.\n\n5. Specify the Target (which is optional and referring to the target database),\nwhere you can query the resulting tables from your pipeline\n\n6. Specify the Storage Location in your object storage (which is optional), to\naccess your DLT produced data sets and metadata logs for your pipeline\n\n7. Set Pipeline Mode to Triggered. In Triggered mode, DLT pipeline will consume\nnew data in the source all at once, and once the processing is done it will\nterminate the compute resource automatically. You can toggle between\nTriggered and Continuous modes when editing your pipeline settings. Setting\n“continuous”: false in the JSON is equivalent to setting the pipeline to\nTriggered mode.\n\n8. For this workload you can disable the autoscaling under Autopilot Options,\nand use only one worker cluster. For production workloads, we recommend\nenabling autoscaling and setting the maximum numbers of workers needed\nfor cluster size.\n\n9. Select “Start”\n\n10. Your pipeline is created and running now!\n\n\n-----\n\nYou can check out our previous deep dive on the topic [here](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) . Try this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/3-retail-dlt-cdc-monitoring.html)\nto see pipeline observability and data quality monitoring on the example DLT\npipeline associated with this blog.\n\n**Conclusion**\n\nIn this blog, we showed how we made it seamless for users to efficiently\nimplement change data capture (CDC) into their lakehouse platform with Delta\nLive Tables (DLT). 
DLT provides built-in quality controls with deep visibility into\npipeline operations, observing pipeline lineage, monitoring schema, and quality\nchecks at each step in the pipeline. DLT supports automatic error handling and\nbest in class auto-scaling capability for streaming workloads, which enables\nusers to have quality data with optimum resources required for their workload.\n\nData engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. This new capability lets\nyour ETL pipelines easily identify changes and apply those changes across tens\nof thousands of tables with low-latency support.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3725239f09e02ff819d5ef17ec0f1c5a", + "**Conclusion**\n\nIn this blog, we showed how we made it seamless for users to efficiently\nimplement change data capture (CDC) into their lakehouse platform with Delta\nLive Tables (DLT). DLT provides built-in quality controls with deep visibility into\npipeline operations, observing pipeline lineage, monitoring schema, and quality\nchecks at each step in the pipeline. DLT supports automatic error handling and\nbest in class auto-scaling capability for streaming workloads, which enables\nusers to have quality data with optimum resources required for their workload.\n\nData engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. 
This new capability lets\nyour ETL pipelines easily identify changes and apply those changes across tens\nof thousands of tables with low-latency support.\n\n**Ready to get started and try out CDC in Delta Live Tables for yourself?**\nPlease watch this [webinar](https://www.databricks.com/p/webinar/tackle-data-transformation) to learn how Delta Live Tables simplifies the\ncomplexity of data transformation and ETL, and see our [Change data capture](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE)\n[with Delta Live Tables](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE) document, official [github](https://github.com/databricks/delta-live-tables-notebooks) and follow the steps in this\n[video](https://vimeo.com/700994477) to create your pipeline!\n\n\n**DLT pipeline lineage observability and data quality**\n**monitoring**\n\nAll DLT pipeline logs are stored in the pipeline’s storage location. You can specify\nyour storage location only when you are creating your pipeline. Note that once\nthe pipeline is created you can no longer modify storage location.\n\n\n-----\n\nSECTION 2.10 \u0007\n\n**Best Practices for Cross-Government Data Sharing**\n\nby **M I L O S C O L I C , P R I T E S H P AT E L , R O B E R T W H I F F I N , R I C H A R D J A M E S W I L S O N ,**\n\n**M A R C E L L F E R E N C Z** and **E D W A R D K E L LY**\n\nFebruary 21, 2023", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "64a7fd4d47f0417252c1425f0f46c67d", + "**DLT pipeline lineage observability and data quality**\n**monitoring**\n\nAll DLT pipeline logs are stored in the pipeline’s storage location. You can specify\nyour storage location only when you are creating your pipeline. Note that once\nthe pipeline is created you can no longer modify storage location.\n\n\n-----\n\nSECTION 2.10 \u0007\n\n**Best Practices for Cross-Government Data Sharing**\n\nby **M I L O S C O L I C , P R I T E S H P AT E L , R O B E R T W H I F F I N , R I C H A R D J A M E S W I L S O N ,**\n\n**M A R C E L L F E R E N C Z** and **E D W A R D K E L LY**\n\nFebruary 21, 2023\n\n\nGovernment data exchange is the practice of sharing data between different\ngovernment agencies and often partners in commercial sectors. Government\ncan share data for various reasons, such as to improve government operations’\nefficiency, provide better services to the public, or support research and policymaking. In addition, data exchange in the public sector can involve sharing with the\nprivate sector or receiving data from the private sector. The considerations span\nmultiple jurisdictions and over almost all industries. 
In this blog, we will address the\nneeds disclosed as part of national data strategies and how modern technologies,\nparticularly Delta Sharing, Unity Catalog, and clean rooms, can help you design,\nimplement and manage a future-proof and sustainable data ecosystem.\n\n**Data sharing and public sector**\n\n“The miracle is this: the more we share the more we have.” — [Leonard Nimoy.](https://en.wikipedia.org/wiki/Leonard_Nimoy)\n\nProbably the quote about sharing that applies the most profoundly to the\ntopic of data sharing. To the extent that the purpose of sharing the data is to\ncreate new information, new insights, and new data. The importance of data\nsharing is even more amplified in the government context, where federation\n\n\nbetween departments allows for increased focus. Still, the very same federation\nintroduces challenges around data completeness, data quality, data access,\nsecurity and control, [FAIR](https://en.wikipedia.org/wiki/FAIR_data) -ness of data, etc. These challenges are far from trivial\nand require a strategic, multifaceted approach to be addressed appropriately.\nTechnology, people, process, legal frameworks, etc., require dedicated\nconsideration when designing a robust data sharing ecosystem.\n\n[The National Data Strategy](https://www.gov.uk/government/publications/uk-national-data-strategy/national-data-strategy) (NDS) by the UK government outlines five actionable\nmissions through which we can materialize the value of data for the citizen and\nsociety-wide benefits.\n\n\n-----\n\nIt comes as no surprise that each and every one of the missions is strongly\nrelated to the concept of data sharing, or more broadly, data access both within\nand outside of government departments:\n\n**1. Unlocking the value of the data across the economy** — Mission 1 of the\nNDS aims to assert government and the regulators as enablers of the value\nextraction from data through the adoption of best practices. The UK data\neconomy was estimated to be near [£125 billion in 2021](https://www.gov.uk/government/publications/uks-digital-strategy/uk-digital-strategy) with an upwards trend.\nIn this context, it is essential to understand that the government-collected\nand provided open data can be crucial for addressing many of the challenges\nacross all industries.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "38268ee8f57ce584fd9a83226a52d486", + "[The National Data Strategy](https://www.gov.uk/government/publications/uk-national-data-strategy/national-data-strategy) (NDS) by the UK government outlines five actionable\nmissions through which we can materialize the value of data for the citizen and\nsociety-wide benefits.\n\n\n-----\n\nIt comes as no surprise that each and every one of the missions is strongly\nrelated to the concept of data sharing, or more broadly, data access both within\nand outside of government departments:\n\n**1. Unlocking the value of the data across the economy** — Mission 1 of the\nNDS aims to assert government and the regulators as enablers of the value\nextraction from data through the adoption of best practices. 
The UK data\neconomy was estimated to be near [£125 billion in 2021](https://www.gov.uk/government/publications/uks-digital-strategy/uk-digital-strategy) with an upwards trend.\nIn this context, it is essential to understand that the government-collected\nand provided open data can be crucial for addressing many of the challenges\nacross all industries.\n\nFor example, insurance providers can better assess the risk of insuring\nproperties by ingesting and integrating [Flood areas](https://environment.data.gov.uk/flood-monitoring/doc/reference#flood-areas) provided by [DEFRA](https://www.gov.uk/government/organisations/department-for-environment-food-rural-affairs) . On\nthe other hand, capital market investors could better understand the risk of\ntheir investments by ingesting and integrating the [Inflation Rate Index](https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l55o/mm23) by [ONS](https://www.ons.gov.uk/) .\nReversely, it is crucial for regulators to have well-defined data access and\ndata sharing patterns for conducting their regulatory activities. This clarity\ntruly enables the economic actors that interact with government data.\n\n\n**2. Securing a pro-growth and trusted data regime** — The key aspect of\nMission 2 is data trust, or more broadly, adherence to data quality norms.\nData quality considerations become further amplified for data sharing and\ndata exchange use cases where we are considering the whole ecosystem\nat once, and quality implications transcend the boundaries of our own\nplatform. This is precisely why we have to adopt “data sustainability.” What\nwe mean by sustainable data products are data products that harness the\nexisting sources over reinvention of the same/similar assets, accumulation of\nunnecessary data (data pollutants) and that anticipate future uses.\n\nUngoverned and unbounded data sharing could negatively impact data\nquality and hinder the growth and value of data. The quality of how the data\nis shared should be a key consideration of data quality frameworks. For\nthis reason, we require a solid set of standards and best practices for data\nsharing with governance and quality assurance built into the process and\ntechnologies. Only this way can we ensure the sustainability of our data and\nsecure a pro-growth trusted data regime.\n\n\n-----\n\n**3. Transforming government’s use of data to drive efficiency and improve**\n**public services** — “By 2025 data assets are organized and supported as\nproducts, regardless of whether they’re used by internal teams or external\ncustomers… Data products continuously evolve in an agile manner to meet\nthe needs of consumers… these products provide data solutions that can\nmore easily and repeatedly be used to meet various business challenges and\nreduce the time and cost of delivering new AI-driven capabilities.” —\n[The data-driven enterprise of 2025](https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-data-driven-enterprise-of-2025) by McKinsey. AI and ML can be powerful\nenablers of digital transformation for both the public and private sectors.\n\nAI, ML, reports, and dashboards are just a few examples of data products\nand services that extract value from data. 
The quality of these solutions is\ndirectly reflected in the quality of data used for building them and our ability\nto access and leverage available data assets both internally and externally.\nWhilst there is a vast amount of data available for us to build new intelligent\nsolutions for driving efficiency for better processes, better decision-making,\nand better policies — there are numerous barriers that can trap the data,\nsuch as legacy systems, data silos, fragmented standards, proprietary\nformats, etc. Modeling data solutions as data products and standardizing\nthem to a unified format allows us to abstract such barriers and truly\nleverage the data ecosystem.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "11226970147e60bdd4099a6d7dbbc43d", + "AI, ML, reports, and dashboards are just a few examples of data products\nand services that extract value from data. The quality of these solutions is\ndirectly reflected in the quality of data used for building them and our ability\nto access and leverage available data assets both internally and externally.\nWhilst there is a vast amount of data available for us to build new intelligent\nsolutions for driving efficiency for better processes, better decision-making,\nand better policies — there are numerous barriers that can trap the data,\nsuch as legacy systems, data silos, fragmented standards, proprietary\nformats, etc. Modeling data solutions as data products and standardizing\nthem to a unified format allows us to abstract such barriers and truly\nleverage the data ecosystem.\n\n\n**4. Ensuring the security and resilience of the infrastructure on which**\n**data relies** — Reflecting on the vision of the year 2025 — this isn’t that far\nfrom now and even in a not so distant future, we will be required to rethink\nour approach to data, more specifically — what is our digital supply chain\ninfrastructure/data sharing infrastructure? Data and data assets are products\nand should be managed as products. If data is a product, we need a coherent\nand unified way of providing those products.\n\nIf data is to be used across industries and across both private and public\nsectors, we need an open protocol that drives adoption and habit generation.\nTo drive adoption, the technologies we use must be resilient, robust, trusted\nand usable by/for all. Vendor lock-in, platform lock-in or cloud lock-in are all\nboundaries to achieving this vision.\n\n**5. Championing the international flow of data** — Data exchange between\njurisdictions and across governments will likely be one of the most\ntransformative applications of data at scale. Some of the world’s toughest\nchallenges depend on the efficient exchange of data between governments\n— prevention of criminal activities, counterterrorism activities, net-zero\nemission goals, international trade, the list goes on and on. Some steps in\nthis direction are already materializing: the U.S. federal government and UK\ngovernment have agreed on data exchange for countering serious crime\nactivities. This is a true example of championing international flow data and\nusing data for good. It is imperative that for these use cases, we approach\ndata sharing from a security-first angle. 
Data sharing standards and protocols\nneed to adhere to security and privacy best practices.\n\n\n-----\n\nWhile originally built with a focus on the UK government and how to better\nintegrate data as a key asset of a modern government, these concepts apply in\na much wider global public sector context. In the same spirit, the U.S. Federal\nGovernment proposed the [Federal Data Strategy](https://strategy.data.gov/overview/) as a collection of principles,\npractices, action steps and timeline through which government can leverage\nthe full value of Federal data for mission, service and the public good.\n\nThe principles are grouped into three primary topics:\n\n**•** **Ethical governance** — Within the domain of ethics, the sharing of data\nis a fundamental tool for promoting transparency, accountability and\nexplainability of decision-making. It is practically impossible to uphold\nethics without some form of audit conducted by an independent party.\nData (and metadata) exchange is a critical enabler for continuous robust\nprocesses that ensure we are using the data for good and we are using data\nwe can trust.\n\n\n\n**•** **Conscious design** — These principles are strongly aligned with the idea of\ndata sustainability. The guidelines promote forward thinking around usability\nand interoperability of the data and user-centric design principles of\nsustainable data products.\n\n**•** **Learning culture** — Data sharing, or alternatively knowledge sharing, has\nan important role in building a scalable learning ecosystem and learning\nculture. Data is front and center of knowledge synthesis, and from a\nscientific angle, data proves factual knowledge. Another critical component\nof knowledge is the “Why?” and data is what we need to address the\n“Why?” component of any decisions we make, which policy to enforce, who\nto sanction, who to support with grants, how to improve the efficiency of\ngovernment services, how to better serve citizens and society.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "0ee51c0c228a4f8a1444765ec2ca8db7", + "**•** **Conscious design** — These principles are strongly aligned with the idea of\ndata sustainability. The guidelines promote forward thinking around usability\nand interoperability of the data and user-centric design principles of\nsustainable data products.\n\n**•** **Learning culture** — Data sharing, or alternatively knowledge sharing, has\nan important role in building a scalable learning ecosystem and learning\nculture. Data is front and center of knowledge synthesis, and from a\nscientific angle, data proves factual knowledge. 
Another critical component\nof knowledge is the “Why?” and data is what we need to address the\n“Why?” component of any decisions we make, which policy to enforce, who\nto sanction, who to support with grants, how to improve the efficiency of\ngovernment services, how to better serve citizens and society.\n\nIn contrast to afore discussed qualitative analysis of the value of data sharing\nacross governments, the European Commission forecasts the economic value\nof the European data economy will [exceed €800 billion by 2027](https://commission.europa.eu/strategy-and-policy/priorities-2019-2024/europe-fit-digital-age/european-data-strategy_en) — roughly the\nsame size as the [Dutch economy in 2021](https://ec.europa.eu/eurostat/databrowser/view/NAMA_10_GDP/default/table?lang=en&category=na10.nama10.nama_10_ma) ! Furthermore, they predict more than 10\nmillion data professionals in Europe alone. The technology and infrastructure to\nsupport the data society have to be accessible to all, interoperable, extensible,\nflexible and open. Imagine a world in which you’d need a different truck to\ntransport products between different warehouses because each road requires a\ndifferent set of tires — the whole supply chain would collapse. When it comes to\ndata, we often experience the “one set of tires for one road” paradox. Rest APIs\nand data exchange protocols have been proposed in the past but have failed\nto address the need for simplicity, ease of use and cost of scaling up with the\nnumber of data products.\n\n\n-----\n\n**Delta Sharing — the new data**\n**highway**\n\nDelta Sharing provides an open protocol for\nsecure data sharing to any computing platform.\nThe protocol is based on Delta data format and is\nagnostic concerning the cloud of choice.\n\nDelta is an open source data format that avoids\nvendor, platform and cloud lock-in, thus fully\nadhering to the principles of data sustainability,\nconscious design of the U.S. Federal Data Strategy\nand mission 4 of the UK National Data Strategy.\nDelta provides a governance layer on top of the\nParquet data format. Furthermore, it provides many\nperformance optimizations not available in Parquet\nout of the box. The openness of the data format\nis a critical consideration. It is the main factor for\ndriving the habit generation and adoption of best\npractices and standards.\n\n\n-----\n\nDelta Sharing is a protocol based on a lean set of REST APIs to manage sharing,\npermissions and access to any data asset stored in Delta or Parquet formats.\nThe protocol defines two main actors, the data provider (data supplier, data\nowner) and the data recipient (data consumer). The recipient, by definition, is\nagnostic to the data format at the source. Delta Sharing provides the necessary\nabstractions for governed data access in many different languages and tools.\n\nDelta Sharing is uniquely positioned to answer many of the challenges of data\nsharing in a scalable manner within the context of highly regulated domains like\nthe public sector:\n\n**• Privacy and security concerns** — Personally identifiable data or otherwise\nsensitive or restricted data is a major part of the data exchange needs of a\ndata-driven and modernized government. Given the sensitive nature of such\ndata, it is paramount that the governance of data sharing is maintained in a\ncoherent and unified manner. Any unnecessary process and technological\ncomplexities increase the risk of over-sharing data. 
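Because the protocol described above is a lean set of REST APIs, a recipient needs nothing Databricks-specific to consume a share. A minimal sketch using the open source `delta-sharing` Python connector, assuming the data provider has issued a `config.share` credential file and that the share, schema and table names below are hypothetical:

```python
import delta_sharing

# Credential file issued by the data provider (endpoint plus a short-lived bearer token).
profile = "config.share"

# Discover what has been shared with this recipient.
client = delta_sharing.SharingClient(profile)
for table in client.list_all_tables():
    print(table)

# Read one shared table directly into pandas; Spark and other connectors follow the same URL scheme.
table_url = f"{profile}#my_share.my_schema.my_table"  # hypothetical share.schema.table
df = delta_sharing.load_as_pandas(table_url)
print(df.head())
```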
With this in mind,\nDelta Sharing has been designed with [security best practices](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html) from the\nvery inception. The protocol provides end-to-end encryption, short-lived\ncredentials, and accessible and intuitive audit and governance features. All\nof these capabilities are available in a centralized way across all your Delta\ntables across all clouds.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "af8513c2a14ec8c66be9f231f6759c24", + "Delta Sharing is uniquely positioned to answer many of the challenges of data\nsharing in a scalable manner within the context of highly regulated domains like\nthe public sector:\n\n**• Privacy and security concerns** — Personally identifiable data or otherwise\nsensitive or restricted data is a major part of the data exchange needs of a\ndata-driven and modernized government. Given the sensitive nature of such\ndata, it is paramount that the governance of data sharing is maintained in a\ncoherent and unified manner. Any unnecessary process and technological\ncomplexities increase the risk of over-sharing data. With this in mind,\nDelta Sharing has been designed with [security best practices](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html) from the\nvery inception. The protocol provides end-to-end encryption, short-lived\ncredentials, and accessible and intuitive audit and governance features. All\nof these capabilities are available in a centralized way across all your Delta\ntables across all clouds.\n\n**• Quality and accuracy** — Another challenge of data sharing is ensuring\nthat the data being shared is of high quality and accuracy. Given that\nthe underlying data is stored as Delta tables, we can guarantee that the\n[transactional nature of data](https://docs.delta.io/latest/concurrency-control.html#concurrency-control) is respected; Delta ensures ACID properties\nof data. Furthermore, Delta supports [data constraints](https://docs.delta.io/latest/delta-constraints.html#constraints) to guarantee data\n\n\nquality requirements at storage. Unfortunately, other formats such as [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) ,\n[CSVW](https://csvw.org/) , [ORC](https://www.google.com/search?q=orc+data+format&rlz=1C5GCEM_enGB931GB932&ei=CzHRY6KqI4S78gL7hoigCw&oq=ORC+da&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQARgAMgUIABCRAjIFCAAQkQIyBQgAEIAEMgUIABCABDIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjoKCAAQRxDWBBCwAzoHCAAQsAMQQzoNCAAQ5AIQ1gQQsAMYAToPCC4Q1AIQyAMQsAMQQxgCOgwILhDIAxCwAxBDGAI6FQguEMcBENEDENQCEMgDELADEEMYAjoECAAQQzoGCAAQChBDOgoIABCxAxCDARBDOgcIABCxAxBDSgQIQRgASgQIRhgBUCxY3AJg3QxoAXABeACAAW6IAbgCkgEDMC4zmAEAoAEByAETwAEB2gEGCAEQARgJ2gEGCAIQARgI&sclient=gws-wiz-serp) , [Avro](https://en.wikipedia.org/wiki/Apache_Avro) , [XML](https://en.wikipedia.org/wiki/XML) , etc., do not have such properties without significant\nadditional effort. The issue becomes even more emphasized by the fact\nthat data quality cannot be ensured in the same way on both the data\nprovider and data recipient side without the exact reimplementation of the\nsource systems. It is critical to embed quality and metadata together with\ndata to ensure quality travels together with data. 
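A minimal sketch of what "quality enforced at storage" looks like with Delta constraints, so the rules travel with the table rather than living only in upstream code. The catalog, table and column names are hypothetical:

```python
# Declare quality rules directly on a Delta table; violating writes fail the transaction.
spark.sql("""
  CREATE TABLE IF NOT EXISTS shared_catalog.energy.readings (
    reading_id   STRING NOT NULL,   -- NOT NULL enforced by Delta on write
    reading_date DATE,
    value_kwh    DOUBLE
  ) USING DELTA
""")

# CHECK constraints are validated on every insert and update.
spark.sql("""
  ALTER TABLE shared_catalog.energy.readings
  ADD CONSTRAINT non_negative_kwh CHECK (value_kwh >= 0)
""")
```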
Any decoupled approach\nto managing data, metadata and quality separately increases the risk of\nsharing and can lead to undesirable outcomes.\n\n**• Lack of standardization** — Another challenge of data sharing is the lack\nof standardization in how data is collected, organized, and stored. This is\nparticularly pronounced in the context of governmental activities. While\ngovernments have proposed standard formats (e.g., Office for National\nStatistics [promotes usage of CSVW](https://www.ons.gov.uk/aboutus/transparencyandgovernance/datastrategy/datastandards#metadata-exchange) ), aligning all private and public\nsector companies to standards proposed by such initiatives is a massive\nchallenge. Other industries may have different requirements for scalability,\ninteroperability, format complexity, lack of structure in data, etc. Most of\nthe currently advocated standards are lacking in multiple such aspects.\nDelta is the most mature candidate for assuming the central role in the\nstandardization of data exchange format. It has been built as a transactional\nand scalable data format, it supports structured, semi-structured and\nunstructured data, it stores data schema and metadata together with data\nand it provides a scalable enterprise-grade sharing protocol through Delta\nSharing. Finally, Delta is one of the most popular open source projects\nin the ecosystem and, since May 2022, has surpassed [7 million monthly](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/)\n[downloads](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/) .\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "efc4fbee742e1cddfcd60a0586c59586", + "-----\n\n**• Cultural and organizational barriers** — These challenges can be\nsummarized by one word: friction. Unfortunately, it’s a common problem\nfor civil servants to struggle to obtain access to both internal and external\ndata due to over-cumbersome processes, policies and outdated standards.\nThe principles we are using to build our data platforms and our data sharing\nplatforms have to be self-promoting, have to drive adoption and have to\ngenerate habits that adhere to best practices.\n\nIf there is friction with standard adoption, the only way to ensure standards\nare respected is by enforcement and that itself is yet another barrier to\nachieving data sustainability. Organizations have already adopted Delta\nSharing both in the private and public sectors. For example, [U.S. Citizenship](https://www.uscis.gov/)\n[and Immigration Services](https://www.uscis.gov/) (USCIS) uses Delta Sharing to satisfy several\n[interagency data-sharing](https://delta.io/blog/2022-12-08-data-sharing-across-government-delta-sharing/) requirements. Similarly, Nasdaq describes Delta\nSharing as the “ [future of financial data sharing,](https://www.nasdaq.com/articles/delta-sharing-protocol%3A-the-evolution-of-financial-data-sharing-2021-05-26) ” and that future is open\nand governed.\n\n\n\n**• Technical challenges** — Federation at the government scale or even\nfurther across multiple industries and geographies poses technical\nchallenges. Each organization within this federation owns its platform\nand drives technological, architectural, platform and tooling choices.\n\nHow can we promote interoperability and data exchange in this vast,\ndiverse technological ecosystem? 
The data is the only viable integration\nvehicle. As long as the data formats we utilize are scalable, open and\ngoverned, we can use them to abstract from individual platforms and\ntheir intrinsic complexities.\n\nDelta format and Delta Sharing solve this wide array of requirements and\nchallenges in a scalable, robust and open way. This positions Delta Sharing\nas the strongest choice for unification and simplification of the protocol and\nmechanism through which we share data across both private and public sectors.\n\n\n-----\n\n**Data Sharing through data clean rooms**\n\n\n[Data clean rooms](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html) address this particular need. With data clean rooms you can\nshare data with third parties in a privacy-safe environment. With Unity Catalog ,\nyou can enable fine-grained access controls on the data and meet your privacy\nrequirements. In this architecture, the data participants never get access to\nthe raw data. The only outputs from the clean rooms are those data assets\ngenerated in a pre-agreed, governed and fully controlled manner that ensures\ncompliance with the requirements of all parties involved.\n\nFinally, data clean rooms and Delta Sharing can address hybrid on-premise-offpremise deployments, where the data with the most restricted access remains\non the premise. In contrast, less restricted data is free to leverage the power\nof the cloud offerings. In said scenario, there may be a need to combine the\npower of the cloud with the restricted data to solve advanced use cases where\ncapabilities are unavailable on the on-premises data platforms. Data clean rooms\ncan ensure that no physical data copies of the raw restricted data are created,\nresults are produced within the clean room’s controlled environment and results\nare shared back to the on-premises environment (if the results maintain the\nrestricted access within the defined policies) or are forwarded to any other\ncompliant and predetermined destination system.\n\n\nTaking the complexities of data sharing within highly regulated space and the\npublic sector one step further — what if we require to share the knowledge\ncontained in the data without ever granting direct access to the source data to\nexternal parties? These requirements may prove achievable and desirable where\nthe data sharing risk appetite is very low.\n\nIn many public sector contexts, there are concerns that combining the data that\ndescribes citizens could lead to a big brother scenario where simply too much\ndata about an individual is concentrated in a single data asset. If it were to fall\ninto the wrong hands, such a hypothetical data asset could lead to immeasurable\nconsequences for individuals and the trust in public sector services could\nerode. On the other hand, the value of a 360 view of the citizen could accelerate\nimportant decision-making. It could immensely improve the quality of policies\nand services provided to the citizens.\n\n\n-----\n\n**Citizen value of data sharing**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "71748a10f27ff3863ddbfa7c97bff3c4", + "Taking the complexities of data sharing within highly regulated space and the\npublic sector one step further — what if we require to share the knowledge\ncontained in the data without ever granting direct access to the source data to\nexternal parties? 
These requirements may prove achievable and desirable where\nthe data sharing risk appetite is very low.\n\nIn many public sector contexts, there are concerns that combining the data that\ndescribes citizens could lead to a big brother scenario where simply too much\ndata about an individual is concentrated in a single data asset. If it were to fall\ninto the wrong hands, such a hypothetical data asset could lead to immeasurable\nconsequences for individuals and the trust in public sector services could\nerode. On the other hand, the value of a 360 view of the citizen could accelerate\nimportant decision-making. It could immensely improve the quality of policies\nand services provided to the citizens.\n\n\n-----\n\n**Citizen value of data sharing**\n\nEvery decision made by the government is a decision that affects its citizens.\nWhether the decision is a change to a policy, granting a benefit or preventing\ncrime, it can significantly influence the quality of our society. Data is a key factor\nin making the right decisions and justifying the decisions made. Simply put,\nwe can’t expect high-quality decisions without the high quality of data and a\ncomplete view of the data (within the permitted context). Without data sharing,\nwe will remain in a highly fragmented position where our ability to make those\ndecisions is severely limited or even completely compromised. In this blog, we\nhave covered several technological solutions available within the lakehouse that\ncan derisk and accelerate how the government is leveraging the data ecosystem\nin a sustainable and scalable way.\n\nFor more details on the industry use cases that Delta Sharing is addressing\nplease consult [A New Approach to Data Sharing](https://www.databricks.com/product/unity-catalog) eBook.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 03\n\n\n### Ready-to-Use Notebooks and Data Sets\n\n\n-----\n\n**Digital Twins**\n\nLeverage digital twins — virtual\nrepresentations of devices and\nobjects — to optimize operations and\ngain insights\n\n\nThis section includes several Solution Accelerators — free, ready-to-use\n\nexamples of data solutions from different industries ranging from retail to\n\nmanufacturing and healthcare. Each of the following scenarios includes\n\nnotebooks with code and step-by-step instructions to help you get\n\nstarted. 
Get hands-on experience with the Databricks Lakehouse Platform\n\n\nby trying the following for yourself: **[Explore the Solution](https://databricks.com/solutions/accelerators/digital-twins)**\n\n\n**Overall Equipment**\n**Effectiveness**\n\nIngest equipment sensor data for\nmetric generation and data driven\ndecision-making\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)**\n\n**Real-time point of**\n**sale analytics**\n\nCalculate current inventories for\nvarious products across multiple store\nlocations with Delta Live Tables\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/real-time-point-of-sale-analytics)**\n\n\n**Recommendation Engines**\n**for Personalization**\n\nImprove customers’ user experience\nand conversion with personalized\nrecommendations\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n**Understanding Price**\n**Transparency Data**\n\nEfficiently ingest large healthcare data\nsets to create price transparency for\nbetter understanding of healthcare costs\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/price-transparency-data)**\n\nAdditional Solution Accelerators with ready-to-use notebooks can be found here:\n\n**[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)**\n\n\n-----\n\n**SECTION**\n\n# 04", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "0f5ece87887f6f5d390380dda2471957", + "**Recommendation Engines**\n**for Personalization**\n\nImprove customers’ user experience\nand conversion with personalized\nrecommendations\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n**Understanding Price**\n**Transparency Data**\n\nEfficiently ingest large healthcare data\nsets to create price transparency for\nbetter understanding of healthcare costs\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/price-transparency-data)**\n\nAdditional Solution Accelerators with ready-to-use notebooks can be found here:\n\n**[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)**\n\n\n-----\n\n**SECTION**\n\n# 04\n\n\n### Case Studies\n\n**4.1** Akamai\n\n**4.2** Grammarly\n\n**4.3** Honeywell\n\n**4.4** Wood Mackenzie\n\n**4.5** Rivian\n\n**4.6** AT&T\n\n\n-----\n\nSECTION 4.1\n**Akamai delivers real-time security**\n**analytics using Delta Lake**\n\n\n###### <1\n\n**Min ingestion time,**\n**reduced from 15 min**\n\n\n###### <85%\n\n**Of queries have a response**\n**time of 7 seconds or less**\n\n\n**I N D U S T R Y**\n[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n\n**S O L U T I O N**\n[Threat Detection](https://databricks.com/solutions/accelerators/threat-detection)\n\n**P L AT F O R M U S E C A S E**\nDelta Lake, Data Streaming, Photon,\n[Databricks SQL](https://databricks.com/product/databricks-sql)\n\n**C LO U D**\n[Azure](https://www.databricks.com/product/azure)\n\n\nAkamai runs a pervasive, highly distributed content delivery network (CDN). Its CDN\n\nuses approximately 345,000 servers in more than 135 countries and over 1,300 networks\n\nworldwide to route internet traffic for some of the largest enterprises in media, commerce,\n\nfinance, retail and many other industries. 
About 30% of the internet’s traffic flows through\n\nAkamai servers. Akamai also provides cloud security solutions.\n\nIn 2018, the company launched a web security analytics tool that offers Akamai customers\n\na single, unified interface for assessing a wide range of streaming security events and\n\nperforming analysis of those events. The web analytics tool helps Akamai customers to\n\ntake informed actions in relation to security events in real time. Akamai is able to stream\n\nmassive amounts of data and meet the strict SLAs it provides to customers by leveraging\n\nDelta Lake and the Databricks Lakehouse Platform for the web analytics tool.\n\n\n-----\n\n**Ingesting and streaming enormous amounts of data**\n\nAkamai’s web security analytics tool ingests approximately 10GB of data related\nto security events per second. Data volume can increase significantly when\nretail customers conduct a large number of sales — or on big shopping days like\nBlack Friday or Cyber Monday. The web security analytics tool stores several\npetabytes of data for analysis purposes. Those analyses are performed to\nprotect Akamai’s customers and provide them with the ability to explore and\nquery security events on their own.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "8cd3f41491cb6ba2467c5ee8216eb665", + "In 2018, the company launched a web security analytics tool that offers Akamai customers\n\na single, unified interface for assessing a wide range of streaming security events and\n\nperforming analysis of those events. The web analytics tool helps Akamai customers to\n\ntake informed actions in relation to security events in real time. Akamai is able to stream\n\nmassive amounts of data and meet the strict SLAs it provides to customers by leveraging\n\nDelta Lake and the Databricks Lakehouse Platform for the web analytics tool.\n\n\n-----\n\n**Ingesting and streaming enormous amounts of data**\n\nAkamai’s web security analytics tool ingests approximately 10GB of data related\nto security events per second. Data volume can increase significantly when\nretail customers conduct a large number of sales — or on big shopping days like\nBlack Friday or Cyber Monday. The web security analytics tool stores several\npetabytes of data for analysis purposes. Those analyses are performed to\nprotect Akamai’s customers and provide them with the ability to explore and\nquery security events on their own.\n\nThe web security analytics tool initially relied on an on-premises architecture\nrunning Apache Spark™ on Hadoop. Akamai offers strict service level agreements\n(SLAs) to its customers of 5 to 7 minutes from when an attack occurs until it is\ndisplayed in the tool. The company sought to improve ingestion and query speed\nto meet those SLAs. “Data needs to be as real-time as possible so customers\ncan see what is attacking them,” says Tomer Patel, Engineering Manager at\nAkamai. “Providing queryable data to customers quickly is critical. We wanted to\nmove away from on-prem to improve performance and our SLAs so the latency\nwould be seconds rather than minutes.”\n\n**Delta Lake allows us to not only query the data better but to**\n**also acquire an increase in the data volume. 
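The ingestion pattern described here (security events streamed continuously into Delta Lake so customers can query them in near real time) boils down to Structured Streaming writing to a Delta table. A minimal sketch of that pattern only; the broker, topic, schema and paths are placeholders, not Akamai's actual implementation:

```python
from pyspark.sql import functions as F

# Read a stream of security events from Kafka (hypothetical broker and topic).
raw = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "broker:9092")
    .option("subscribe", "security-events")
    .load()
)

events = raw.select(
    F.col("timestamp").alias("ingest_time"),
    F.col("value").cast("string").alias("event_json"),  # parsed further downstream
)

# Continuously append into a Delta table that analysts and customers can query.
(
    events.writeStream
    .format("delta")
    .option("checkpointLocation", "/checkpoints/security_events")
    .outputMode("append")
    .toTable("security_events")
)
```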
We’ve seen an**\n**80% increase in traffic and data in the last year, so being able**\n**to scale fast is critical.**\n\n\nAfter conducting proofs of concept with several companies, Akamai chose to\nbase its streaming analytics architecture on Spark and the Databricks Lakehouse\nPlatform. “Because of our scale and the demands of our SLA, we determined that\nDatabricks was the right solution for us,” says Patel. “When we consider storage\noptimization, and data caching, if we went with another solution, we couldn’t\nachieve the same level of performance.”\n\n**Improving speed and reducing costs**\n\nToday, the web security analytics tool ingests and transforms data, stores it\nin cloud storage, and sends the location of the file via Kafka. It then uses a\nDatabricks Job as the ingest application. Delta Lake, the open source storage\nformat at the base of the Databricks Lakehouse Platform, supports real-time\nquerying on the web security analytics data. Delta Lake also enables Akamai to\nscale quickly. “Delta Lake allows us to not only query the data better but to also\nacquire an increase in the data volume,” says Patel. “We’ve seen an 80% increase\nin traffic and data in the last year, so being able to scale fast is critical.”\n\nAkamai also uses Databricks SQL (DBSQL) and Photon, which provide extremely\n\nfast query performance. Patel added that Photon provided a significant boost\nto query performance. Overall, Databricks’ streaming architecture combined\nwith DBSQL and Photon enables Akamai to achieve real-time analytics, which\ntranslates to real-time business benefits.\n\n\n**Tomer Patel**\nEngineering Manager, Akamai\n\n\n-----\n\nPatel says he likes that Delta Lake is open source, as the company has benefitted\nfrom a community of users working to improve the product. “The fact that Delta\nLake is open source and there’s a big community behind it means we don’t need\nto implement everything ourselves,” says Patel. “We benefit from fixed bugs that\nothers have encountered and from optimizations that are contributed to the\nproject.” Akamai worked closely with Databricks to ensure Delta Lake can meet\nthe scale and performance requirements Akamai defined. These improvements\nhave been contributed back to the project (many of which were made available as\npart of Delta Lake 2.0), and so any user running Delta Lake now benefits from the\ntechnology being tested at such a large scale in a real-world production scenario.\n\n\n**Meeting aggressive requirements for scale,**\n**reliability and performance**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "ae9c07b08ea4b4c558823863c844f0ce", + "**Tomer Patel**\nEngineering Manager, Akamai\n\n\n-----\n\nPatel says he likes that Delta Lake is open source, as the company has benefitted\nfrom a community of users working to improve the product. “The fact that Delta\nLake is open source and there’s a big community behind it means we don’t need\nto implement everything ourselves,” says Patel. “We benefit from fixed bugs that\nothers have encountered and from optimizations that are contributed to the\nproject.” Akamai worked closely with Databricks to ensure Delta Lake can meet\nthe scale and performance requirements Akamai defined. 
These improvements\nhave been contributed back to the project (many of which were made available as\npart of Delta Lake 2.0), and so any user running Delta Lake now benefits from the\ntechnology being tested at such a large scale in a real-world production scenario.\n\n\n**Meeting aggressive requirements for scale,**\n**reliability and performance**\n\nUsing Spark Structured Streaming on the Databricks Lakehouse Platform enables\nthe web security analytics tool to stream vast volumes of data and provide\nlow-latency, real-time analytics-as-a-service to Akamai’s customers. That way\nAkamai is able to make available security event data to customers within the\nSLA of 5 to 7 minutes from when an attack occurs. “Our focus is performance,\nperformance, performance,” says Patel. “The platform’s performance and\nscalability are what drives us.”\n\nUsing the Databricks Lakehouse Platform, it now takes under 1 minute to ingest\nthe security event data. “Reducing ingestion time from 15 minutes to under 1\nminute is a huge improvement,” says Patel. “It benefits our customers because\nthey can see the security event data faster and they have a view of what exactly\nis happening as well as the capability to filter all of it.”\n\nAkamai’s biggest priority is to provide customers with a good experience and\nfast response times. To date, Akamai has moved about 70% of security event\ndata from its on-prem architecture to Databricks, and the SLA for customer\nquery and response time has improved significantly as a result. “Now, with the\nmove to Databricks, our customers experience much better response time, with\nover 85% of queries completing under 7 seconds.” Providing that kind of realtime data means Akamai can help its customers stay vigilant and maintain an\noptimal security configuration.\n\n\n-----\n\nSECTION 4.2\n**Grammarly uses Databricks Lakehouse to improve**\n**user experience**\n\n\n###### 110%\n\n**Faster querying, at 10% of the cost**\n**to ingest, than a data warehouse**\n\n\n###### 5 billion\n\n**Daily events available for**\n**analytics in under 15 minutes**\n\n\nGrammarly’s mission is to improve lives by improving communication. The company’s\n\ntrusted AI-powered communication assistance provides real-time suggestions to\n\nhelp individuals and teams write more confidently and achieve better results. Its\n\ncomprehensive offerings — [Grammarly Premium](https://www.grammarly.com/premium) , [Grammarly Business](https://www.grammarly.com/business) , [Grammarly for](https://www.grammarly.com/edu)\n\n[Education](https://www.grammarly.com/edu) and [Grammarly for Developers](https://developer.grammarly.com/) — deliver leading communication support\n\nwherever writing happens. 
As the company grew over the years, its legacy, homegrown\n\nanalytics system made it challenging to evaluate large data sets quickly and cost-\n\neffectively.\n\nBy migrating to the Databricks Lakehouse Platform, Grammarly is now able to sustain a\n\nflexible, scalable and highly secure analytics platform that helps 30 million people and\n\n50,000 teams worldwide write more effectively every day.\n\n\n**I N D U S T R Y**\n[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n\n**S O L U T I O N**\nRecommendation Engines, Advertising\nEffectiveness, Customer Lifetime Value", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9fbdfbb670eb38599226f9156685eaa7", + "[Education](https://www.grammarly.com/edu) and [Grammarly for Developers](https://developer.grammarly.com/) — deliver leading communication support\n\nwherever writing happens. As the company grew over the years, its legacy, homegrown\n\nanalytics system made it challenging to evaluate large data sets quickly and cost-\n\neffectively.\n\nBy migrating to the Databricks Lakehouse Platform, Grammarly is now able to sustain a\n\nflexible, scalable and highly secure analytics platform that helps 30 million people and\n\n50,000 teams worldwide write more effectively every day.\n\n\n**I N D U S T R Y**\n[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n\n**S O L U T I O N**\nRecommendation Engines, Advertising\nEffectiveness, Customer Lifetime Value\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Delta Lake, Unity Catalog,\n[Machine Learning, ETL](https://www.databricks.com/product/machine-learning)\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Harnessing data to improve communications for millions of**\n**users and thousands of teams**\n\nWhen people use Grammarly’s AI communication assistance, they receive\nsuggestions to help them improve multiple dimensions of communication,\nincluding spelling and grammar correctness, clarity and conciseness, word\nchoice, style, and tone. Grammarly receives feedback when users accept, reject\nor ignore its suggestions through app-created events, which total about 5 billion\nevents per day.\n\nHistorically, Grammarly relied on a homegrown legacy analytics platform and\nleveraged an in-house SQL-like language that was time-intensive to learn and\nmade it challenging to onboard new hires. As the company grew, Grammarly\ndata analysts found that the platform did not sufficiently meet the needs of its\nessential business functions, especially marketing, sales and customer success.\nAnalysts found themselves copying and pasting data from spreadsheets\nbecause the existing system couldn’t effectively ingest the external data needed\nto answer questions such as, “Which marketing channel delivers the highest\nROI?” Reporting proved challenging because the existing system didn’t support\nTableau dashboards, and company leaders and analysts needed to ensure they\ncould make decisions quickly and confidently.\n\n\n**Databricks Lakehouse has given us the flexibility to unleash**\n**our data without compromise. 
That flexibility has allowed us**\n**to speed up analytics to a pace we’ve never achieved before.**\n\n**Chris Locklin**\nEngineering Manager, Data Platforms, Grammarly\n\nGrammarly also sought to unify its data warehouses in order to scale and\nimprove data storage and query capabilities. As it stood, large Amazon EMR\nclusters ran 24/7 and drove up costs. With the various data sources, the team\nalso needed to maintain access control. “Access control in a distributed file\nsystem is difficult, and it only gets more complicated as you ingest more data\nsources,” says Chris Locklin, Engineering Manager, Data Platforms at Grammarly.\nMeanwhile, reliance on a single streaming workflow made collaboration among\nteams challenging. Data silos emerged as different business areas implemented\nanalytics tools individually. “Every team decided to solve their analytics needs in\nthe best way they saw fit,” says Locklin. “That created challenges in consistency\nand knowing which data set was correct.”\n\n\n-----\n\nAs its data strategy was evolving, Grammarly’s priority was to get the most out\nof analytical data while keeping it secure. This was crucial because security is\nGrammarly’s number-one priority and most important feature, both in how it\nprotects its users’ data and how it ensures its own company data remains secure.\nTo accomplish that, Grammarly’s data platform team sought to consolidate\ndata and unify the company on a single platform. That meant sustaining a highly\nsecure infrastructure that could scale alongside the company’s growth, improving\ningestion flexibility, reducing costs and fueling collaboration.\n\n**Improving analytics, visualization and decision-making**\n**with the lakehouse**\n\nAfter conducting several proofs of concept to enhance its infrastructure,\nGrammarly migrated to the Databricks Lakehouse Platform. Bringing all the\nanalytical data into the lakehouse created a central hub for all data producers\nand consumers across Grammarly, with Delta Lake at the core.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "35e24e644b33b8558da8f9295b4361fd", + "-----\n\nAs its data strategy was evolving, Grammarly’s priority was to get the most out\nof analytical data while keeping it secure. This was crucial because security is\nGrammarly’s number-one priority and most important feature, both in how it\nprotects its users’ data and how it ensures its own company data remains secure.\nTo accomplish that, Grammarly’s data platform team sought to consolidate\ndata and unify the company on a single platform. That meant sustaining a highly\nsecure infrastructure that could scale alongside the company’s growth, improving\ningestion flexibility, reducing costs and fueling collaboration.\n\n**Improving analytics, visualization and decision-making**\n**with the lakehouse**\n\nAfter conducting several proofs of concept to enhance its infrastructure,\nGrammarly migrated to the Databricks Lakehouse Platform. Bringing all the\nanalytical data into the lakehouse created a central hub for all data producers\nand consumers across Grammarly, with Delta Lake at the core.\n\nUsing the lakehouse architecture, data analysts within Grammarly now have a\nconsolidated interface for analytics, which leads to a single source of truth and\n\nconfidence in the accuracy and availability of all data managed by the data\nplatform team. 
Across the organization, teams are using Databricks SQL to\nconduct queries within the platform on both internally generated product data\nand external data from digital advertising platform partners. Now, they can easily\nconnect to Tableau and create dashboards and visualizations to present to\nexecutives and key stakeholders.\n\n\n“Security is of utmost importance at Grammarly, and our team’s numberone objective is to own and protect our analytical data,” says Locklin. “Other\ncompanies ask for your data, hold it for you, and then let you perform analytics\non it. Just as Grammarly ensures our users’ data always remains theirs, we\nwanted to ensure our company data remained ours. Grammarly’s data stays\ninside of Grammarly.”\n\nWith its data consolidated in the lakehouse, different areas of Grammarly’s\nbusiness can now analyze data more thoroughly and effectively. For example,\nGrammarly’s marketing team uses advertising to attract new business. Using\nDatabricks, the team can consolidate data from various sources to extrapolate\na user’s lifetime value, compare it with customer acquisition costs and get rapid\nfeedback on campaigns. Elsewhere, data captured from user interactions flow\ninto a set of tables used by analysts for ad hoc analysis to inform and improve\nthe user experience.\n\nBy consolidating data onto one unified platform, Grammarly has eliminated data\nsilos. “The ability to bring all these capabilities, data processing and analysis\nunder the same platform using Databricks is extremely valuable,” says Sergey\nBlanket, Head of Business Intelligence at Grammarly. “Doing everything from ETL\nand engineering to analytics and ML under the same umbrella removes barriers\nand makes it easy for everyone to work with the data and each other.”\n\n\n-----\n\nTo manage access control, enable end-to-end observability and monitor data\nquality, Grammarly relies on the data lineage capabilities within Unity Catalog.\n“Data lineage allows us to effectively monitor usage of our data and ensure it\nupholds the standards we set as a data platform team,” says Locklin. “Lineage is\nthe last crucial piece for access control. It allows analysts to leverage data to do\ntheir jobs while adhering to all usage standards and access controls, even when\nrecreating tables and data sets in another environment.”\n\n**Faster time to insight drives more intelligent**\n**business decisions**\n\nUsing the Databricks Lakehouse Platform, Grammarly’s engineering teams now\nhave a tailored, centralized platform and a consistent data source across the\ncompany, resulting in greater speed and efficiency and reduced costs. The\nlakehouse architecture has led to 110% faster querying, at 10% of the cost to\ningest, than a data warehouse. Grammarly can now make its 5 billion daily events\navailable for analytics in under 15 minutes rather than 4 hours, enabling lowlatency data aggregation and query optimization. This allows the team to quickly\n\nreceive feedback about new features being rolled out and understand if they are\nbeing adopted as expected. Ultimately, it helps them understand how groups\nof users engage with the UX, improving the experience and ensuring features\nand product releases bring the most value to users. 
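The access-control side of this story maps to Unity Catalog's SQL GRANT model: analysts get exactly the privileges their role needs, and usage shows up in lineage and audit views. A minimal sketch with hypothetical catalog, schema, table and group names:

```python
# Fine-grained, role-based access with Unity Catalog GRANTs (names are hypothetical).
spark.sql("GRANT USE CATALOG ON CATALOG analytics TO `marketing-analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA analytics.events TO `marketing-analysts`")
spark.sql("GRANT SELECT ON TABLE analytics.events.ad_spend TO `marketing-analysts`")

# Review the privileges currently granted on the table.
spark.sql("SHOW GRANTS ON TABLE analytics.events.ad_spend").show(truncate=False)
```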
“Everything my team does\nis focused on creating a rich, personalized experience that empowers people to\ncommunicate more effectively and achieve their potential,” says Locklin.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "f43621b04d372aa556042b5b86d86dfd", + "**Faster time to insight drives more intelligent**\n**business decisions**\n\nUsing the Databricks Lakehouse Platform, Grammarly’s engineering teams now\nhave a tailored, centralized platform and a consistent data source across the\ncompany, resulting in greater speed and efficiency and reduced costs. The\nlakehouse architecture has led to 110% faster querying, at 10% of the cost to\ningest, than a data warehouse. Grammarly can now make its 5 billion daily events\navailable for analytics in under 15 minutes rather than 4 hours, enabling lowlatency data aggregation and query optimization. This allows the team to quickly\n\nreceive feedback about new features being rolled out and understand if they are\nbeing adopted as expected. Ultimately, it helps them understand how groups\nof users engage with the UX, improving the experience and ensuring features\nand product releases bring the most value to users. “Everything my team does\nis focused on creating a rich, personalized experience that empowers people to\ncommunicate more effectively and achieve their potential,” says Locklin.\n\n\nMoving to the lakehouse architecture also solved the challenge of access control\nover distributed file systems, while Unity Catalog enabled fine-grained, rolebased access controls and real-time data lineage. “Unity Catalog gives us the\nability to manage file permissions with more flexibility than a database would\nallow,” says Locklin. “It solved a problem my team couldn’t solve at scale. While\nusing Databricks allows us to keep analytical data in-house, Unity Catalog helps\nus continue to uphold the highest standards of data protection by controlling\naccess paradigms inside our data. That opens a whole new world of things that\nwe can do.”\n\nUltimately, migrating to the Databricks Lakehouse Platform has helped\nGrammarly to foster a data-driven culture where employees get fast access\nto analytics without having to write complex queries, all while maintaining\nGrammarly’s enterprise-grade security practices. “Our team’s mission is to help\nGrammarly make better, faster business decisions,” adds Blanket. “My team\nwould not be able to effectively execute on that mission if we did not have a\nplatform like Databricks available to us.” Perhaps most critically, migrating off its\nrigid legacy infrastructure gives Grammarly the adaptability to do more while\nknowing the platform will evolve as its needs evolve. “Databricks has given us the\nflexibility to unleash our data without compromise,” says Locklin. “That flexibility\nhas allowed us to speed up analytics to a pace we’ve never achieved before.”\n\n\n-----\n\nSECTION 4.3\n**Honeywell selects Delta Live Tables for streaming data**\n\nCompanies are under growing pressure to reduce energy use, while at the same time\n\nthey are looking to lower costs and improve efficiency. Honeywell delivers industry-\n\nspecific solutions that include aerospace products and services, control technologies\n\nfor buildings and industry, and performance materials globally. 
Honeywell’s Energy\n\nand Environmental Solutions division uses IoT sensors and other technologies to help\n\nbusinesses worldwide manage energy demand, reduce energy consumption and carbon\n\nemissions, optimize indoor air quality, and improve occupant well-being.\n\nAccomplishing this requires Honeywell to collect vast amounts of data. Using Delta Live\n\nTables on the Databricks Lakehouse Platform, Honeywell’s data team can now ingest\n\nbillions of rows of sensor data into Delta Lake and automatically build SQL endpoints for\n\nreal-time queries and multilayer insights into data at scale — helping Honeywell improve\n\nhow it manages data and extract more value from it, both for itself and for its customers.\n\n\n**I N D U S T R Y**\n[Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Delta Lake, Delta Live Tables\n\n\n**C LO U D**\n[Azure](https://databricks.com/product/azure) **Databricks helps us pull together many different data sources, do**\n**aggregations, and bring the significant amount of data we collect**\n**from our buildings under control so we can provide customers value.**\n\n**Dr. Chris Inkpen**\nGlobal Solutions Architect, Honeywell Energy and Environmental Solutions\n\n\n-----\n\n**Processing billions of IoT data points per day**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "2f7ae7affaa93ddaf6c69a7779f21f56", + "real-time queries and multilayer insights into data at scale — helping Honeywell improve\n\nhow it manages data and extract more value from it, both for itself and for its customers.\n\n\n**I N D U S T R Y**\n[Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Delta Lake, Delta Live Tables\n\n\n**C LO U D**\n[Azure](https://databricks.com/product/azure) **Databricks helps us pull together many different data sources, do**\n**aggregations, and bring the significant amount of data we collect**\n**from our buildings under control so we can provide customers value.**\n\n**Dr. Chris Inkpen**\nGlobal Solutions Architect, Honeywell Energy and Environmental Solutions\n\n\n-----\n\n**Processing billions of IoT data points per day**\n\nHoneywell’s solutions and services are used in millions of buildings around the\nworld. Helping its customers create buildings that are safe, more sustainable\nand productive can require thousands of sensors per building. Those sensors\nmonitor key factors such as temperature, pressure, humidity and air quality.\nIn addition to the data collected by sensors inside a building, data is also\ncollected from outside, such as weather and pollution data. Another data set\nconsists of information about the buildings themselves — such as building\ntype, ownership, floor plan, square footage of each floor and square footage\nof each room. That data set is combined with the two disparate data streams,\nadding up to a lot of data across multiple structured and unstructured formats,\nincluding images and video streams, telemetry data, event data, etc. At peaks,\nHoneywell ingests anywhere between 200 to 1,000 events per second for any\nbuilding, which equates to billions of data points per day. Honeywell’s existing\ndata infrastructure was challenged to meet such demand. 
It also made it difficult\nfor Honeywell’s data team to query and visualize its disparate data so it could\nprovide customers with fast, high-quality information and analysis.\n\n**ETL simplified: high-quality, reusable data pipelines**\n\nWith Delta Live Tables (DLT) on the Databricks Lakehouse Platform, Honeywell’s\ndata team can now ingest billions of rows of sensor data into Delta Lake and\nautomatically build SQL endpoints for real-time queries and multilayer insights\ninto data at scale. “We didn’t have to do anything to get DLT to scale,” says Dr.\n\n\nChris Inkpen, Global Solutions Architect at Honeywell Energy and Environmental\nSolutions. “We give the system more data, and it copes. Out of the box, it’s given\nus the confidence that it will handle whatever we throw at it.”\n\nHoneywell credits the Databricks Lakehouse Platform for helping it to unify its\nvast and varied data — batch, streaming, structured and unstructured — into\none platform. “We have many different data types. The Databricks Lakehouse\nPlatform allows us to use things like Apache Kafka and Auto Loader to load and\nprocess multiple types of data and treat everything as a stream of data, which is\nawesome. Once we’ve got structured data from unstructured data, we can write\nstandardized pipelines.”\n\nHoneywell data engineers can now build and leverage their own ETL pipelines\nwith Delta Live Tables and gain insights and analytics quickly. ETL pipelines can\nbe reused regardless of environment, and data can run in batches or streams. It’s\nalso helped Honeywell’s data team transition from a small team to a larger team.\n“When we wrote our first few pipelines before DLT existed, only one person could\nwork in one part of the functionality. Now that we’ve got DLT and the ability to\nhave folders with common functionality, we’ve got a really good platform where\nwe can easily spin off different pipelines.”\n\nDLT also helped Honeywell establish standard log files to monitor and costjustify its product pipelines. “Utilizing DLT, we can analyze which parts of our\npipeline need optimization,” says Inkpen. “With standard pipelines, that was\nmuch more chaotic.”\n\n\n-----\n\n**Enabling ease, simplicity and scalability across the**\n**infrastructure**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "d01a04d6efbdc542df8ca255e712f59a", + "Honeywell data engineers can now build and leverage their own ETL pipelines\nwith Delta Live Tables and gain insights and analytics quickly. ETL pipelines can\nbe reused regardless of environment, and data can run in batches or streams. It’s\nalso helped Honeywell’s data team transition from a small team to a larger team.\n“When we wrote our first few pipelines before DLT existed, only one person could\nwork in one part of the functionality. Now that we’ve got DLT and the ability to\nhave folders with common functionality, we’ve got a really good platform where\nwe can easily spin off different pipelines.”\n\nDLT also helped Honeywell establish standard log files to monitor and costjustify its product pipelines. “Utilizing DLT, we can analyze which parts of our\npipeline need optimization,” says Inkpen. 
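The Auto Loader plus Delta Live Tables combination described here ("treat everything as a stream of data") looks roughly like the following inside a DLT pipeline notebook. A minimal sketch only; the landing path, file format and sensor columns are assumptions, not Honeywell's pipeline:

```python
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw sensor telemetry ingested incrementally with Auto Loader")
def raw_sensor_events():
    return (
        spark.readStream.format("cloudFiles")        # Auto Loader
        .option("cloudFiles.format", "json")         # assumption: JSON telemetry files
        .load("/mnt/landing/building-sensors/")      # hypothetical landing path
    )

@dlt.table(comment="Cleaned readings with a basic data quality expectation")
@dlt.expect_or_drop("valid_temperature", "temperature BETWEEN -50 AND 100")
def clean_sensor_events():
    return (
        dlt.read_stream("raw_sensor_events")
        .withColumn("event_time", F.to_timestamp("event_time"))
        .dropna(subset=["building_id", "event_time"])
    )
```

Because each table is just a decorated function, additional pipelines can reuse the same ingestion and cleaning steps, which is the "folders with common functionality" point Inkpen makes above.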
“With standard pipelines, that was\nmuch more chaotic.”\n\n\n-----\n\n**Enabling ease, simplicity and scalability across the**\n**infrastructure**\n\nDelta Live Tables has helped Honeywell’s data team consistently query\ncomplex data while offering simplicity of scale. It also enables end-to-end data\nvisualization of Honeywell’s data streams as they flow into its infrastructure, are\ntransformed, and then flow out. “Ninety percent of our ETL is now captured in\ndiagrams, so that’s helped considerably and improves data governance. DLT\nencourages — and almost enforces — good design,” says Inkpen.\n\nUsing the lakehouse as a shared workspace has helped promote teamwork and\ncollaboration at Honeywell. “The team collaborates beautifully now, working\ntogether every day to divvy up the pipeline into their own stories and workloads,”\nsays Inkpen.\n\nMeanwhile, the ability to manage streaming data with low latency and better\nthroughput has improved accuracy and reduced costs. “Once we’ve designed\nsomething using DLT, we’re pretty safe from scalability issues — certainly a\nhundred times better than if we hadn’t written it in DLT,” says Inkpen. “We can\nthen go back and look at how we can take a traditional job and make it more\nperformant and less costly. We’re in a much better position to try and do that\nfrom DLT.”\n\n\nUsing Databricks and DLT also helps the Honeywell team perform with greater\nagility, which allows them to innovate faster while empowering developers to\nrespond to user requirements almost immediately. “Our previous architecture\nmade it impossible to know what bottlenecks we had and what we needed to\nscale. Now we can do data science in near real-time.”\n\nUltimately, Honeywell can now more quickly provide its customers with the\ndata and analysis they need to make their buildings more efficient, healthier\nand safer for occupants. “I’m continuously looking for ways to improve our\nlifecycles, time to market, and data quality,” says Inkpen. “Databricks helps\nus pull together many different data sources, do aggregations, and bring the\nsignificant amount of data we collect from our buildings under control so we\ncan provide customers value.”\n\n**Ready to get started? Learn more about** **[Delta Live Tables here](https://www.databricks.com/product/delta-live-tables)** **.**\n\n\n-----\n\nSECTION 4.4\n**Wood Mackenzie helps customers transition to a more**\n**sustainable future**\n\n\n###### 12 Billion\n\n**Data points processed**\n**each week**\n\n\n###### 80-90%\n\n**Reduction in**\n**processing time**\n\n\n###### Cost Savings\n\n**In operations through**\n**workflow automation**\n\n\nWood Mackenzie offers customized consulting and analysis for a wide range of clients\n\nin the energy and natural resources sectors. Founded in Edinburgh, the company first\n\ncultivated deep expertise in upstream oil and gas, then broadened its focus to deliver\n\ndetailed insight for every interconnected sector of the energy, chemicals, metals and\n\nmining industries.\n\nToday it sees itself playing an important role in the transition to a more sustainable\n\nfuture. Using Databricks Workflows to automate ETL pipelines helps Wood Mackenzie\n\ningest and process massive amounts of data. Using a common workflow provided\n\nhigher visibility to engineering team members, encouraging better collaboration. 
With", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "4104d022cbbba479db59c7b52e2125ac", + "###### 12 Billion\n\n**Data points processed**\n**each week**\n\n\n###### 80-90%\n\n**Reduction in**\n**processing time**\n\n\n###### Cost Savings\n\n**In operations through**\n**workflow automation**\n\n\nWood Mackenzie offers customized consulting and analysis for a wide range of clients\n\nin the energy and natural resources sectors. Founded in Edinburgh, the company first\n\ncultivated deep expertise in upstream oil and gas, then broadened its focus to deliver\n\ndetailed insight for every interconnected sector of the energy, chemicals, metals and\n\nmining industries.\n\nToday it sees itself playing an important role in the transition to a more sustainable\n\nfuture. Using Databricks Workflows to automate ETL pipelines helps Wood Mackenzie\n\ningest and process massive amounts of data. Using a common workflow provided\n\nhigher visibility to engineering team members, encouraging better collaboration. With\n\nan automated, transparent workflow in place, the team saw improved productivity and\n\ndata quality and an easier path to fix pipeline issues when they arise.\n\n\n**I N D U S T R Y**\n[Energy and Utilities](https://www.databricks.com/solutions/industries/oil-and-gas)\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Workflows\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Delivering insights to the energy industry**\n\nFulfilling Wood Mackenzie’s mission, the Lens product is a data analytics platform\nbuilt to deliver insights at key decision points for customers in the energy sector.\nFeeding into Lens are vast amounts of data collected from various data sources\nand sensors used to monitor energy creation, oil and gas production, and more.\nThose data sources update about 12 billion data points every week that must\nbe ingested, cleaned and processed as part of the input for the Lens platform.\nYanyan Wu, Vice President of Data at Wood Mackenzie, manages a team of big\ndata professionals that build and maintain the ETL pipeline that provides input\ndata for Lens. The team is leveraging the Databricks Lakehouse Platform and\nuses Apache Spark™ for parallel processing, which provides greater performance\nand scalability benefits compared to an earlier single-node system working\nsequentially. “We saw a reduction of 80-90% in data processing time, which\nresults in us providing our clients with more up-to-date, more complete and\nmore accurate data,” says Wu.\n\n**Our mission is to transform the way we power the planet.**\n**Our clients in the energy sector need data, consulting services**\n**and research to achieve that transformation. Databricks**\n**Workflows gives us the speed and flexibility to deliver the**\n**insights our clients need.**\n\n\n**Improved collaboration and transparency with a common**\n**workflow**\n\nThe data pipeline managed by the team includes several stages for standardizing\nand cleaning raw data, which can be structured or unstructured and may be in\nthe form of PDFs or even handwritten notes.\n\nDifferent members of the data team are responsible for different parts of\nthe pipeline, and there is a dependency between the processing stages each\nteam member owns. Using [Databricks Workflows](https://www.databricks.com/product/workflows) , the team defined a common\nworkstream that the entire team uses. 
Each stage of the pipeline is implemented\nin a Python notebook, which is run as a job in the main workflow.\n\nEach team member can now see exactly what code is running on each stage,\nmaking it easy to find the cause of the issue. Knowing who owns the part of the\npipeline that originated the problem makes fixing issues much faster. “Without\na common workflow, different members of the team would run their notebooks\nindependently, not knowing that failure in their run affected stages downstream,”\nsays Meng Zhang, Principal Data Analyst at Wood Mackenzie. “When trying to\nrerun notebooks, it was hard to tell which notebook version was initially run and\nthe latest version to use.”\n\n\n**Yanyan Wu**\nVice President of Data, Wood Mackenzie\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "99a18819e51138db0281c37f58807378", + "Different members of the data team are responsible for different parts of\nthe pipeline, and there is a dependency between the processing stages each\nteam member owns. Using [Databricks Workflows](https://www.databricks.com/product/workflows) , the team defined a common\nworkstream that the entire team uses. Each stage of the pipeline is implemented\nin a Python notebook, which is run as a job in the main workflow.\n\nEach team member can now see exactly what code is running on each stage,\nmaking it easy to find the cause of the issue. Knowing who owns the part of the\npipeline that originated the problem makes fixing issues much faster. “Without\na common workflow, different members of the team would run their notebooks\nindependently, not knowing that failure in their run affected stages downstream,”\nsays Meng Zhang, Principal Data Analyst at Wood Mackenzie. “When trying to\nrerun notebooks, it was hard to tell which notebook version was initially run and\nthe latest version to use.”\n\n\n**Yanyan Wu**\nVice President of Data, Wood Mackenzie\n\n\n-----\n\nUsing Workflows’ alerting capabilities to notify the team when a workflow task\nfails ensures everyone knows a failure occurred and allows the team to work\ntogether to resolve the issue quickly. The definition of a common workflow\ncreated consistency and transparency that made collaboration easier. “Using\nDatabricks Workflows allowed us to encourage collaboration and break up the\nwalls between different stages of the process,” explains Wu. “It allowed us all to\nspeak the same language.”\n\nCreating transparency and consistency is not the only advantage the team saw.\nUsing Workflows to automate notebook runs also led to cost savings compared\nto running interactive notebooks manually.\n\n**Improved code development productivity**\n\nThe team’s ETL pipeline development process involves iteration on PySpark\nnotebooks. Leveraging [interactive notebooks](https://www.databricks.com/product/collaborative-notebooks) in the Databricks UI makes it easy\nfor data professionals on the team to manually develop and test a notebook.\nBecause Databricks Workflows supports running notebooks as task type\n(along with Python files, JAR files and other types), when the code is ready for\n\n\ndeveloping notebooks with the interactive notebook UI while leveraging the\npower of automation, which reduces potential issues that may happen when\nrunning notebooks manually.\n\nThe team has gone even further in increasing productivity by developing a\nCI/CD process. 
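The stage-per-notebook pattern described above, a single Workflows job whose tasks are the team's notebooks with failure alerting for everyone, can be sketched with the Databricks Python SDK. The job name, notebook paths, cluster ID and e-mail address below are hypothetical placeholders, not Wood Mackenzie's actual configuration:

```python
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import jobs

w = WorkspaceClient()  # picks up credentials from the environment or ~/.databrickscfg

created = w.jobs.create(
    name="lens-etl",  # hypothetical job name
    tasks=[
        jobs.Task(
            task_key="standardize_raw",
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/lens/standardize_raw"),
            existing_cluster_id="<cluster-id>",  # placeholder; job clusters or serverless also work
        ),
        jobs.Task(
            task_key="clean_and_publish",
            depends_on=[jobs.TaskDependency(task_key="standardize_raw")],
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/lens/clean_and_publish"),
            existing_cluster_id="<cluster-id>",
        ),
    ],
    # Job-level alerting, so a failed task notifies the whole team.
    email_notifications=jobs.JobEmailNotifications(on_failure=["data-team@example.com"]),
)
print(f"Created job {created.job_id}")
```

Because every stage is a task in this one definition, pointing the workflow at a development branch or adding a stage is a small edit to the job rather than a change to several independently scheduled notebooks.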
“By connecting our source control code repository, we know\nthe workflow always runs the latest code version we committed to the repo,”\nexplains Zhang. “It’s also easy to switch to a development branch to develop a\nnew feature, fix a bug and run a development workflow. When the code passes\nall tests, it is merged back to the main branch and the production workflow is\nautomatically updated with the latest code.”\n\nGoing forward, Wood Mackenzie plans to optimize its use of Databricks\nWorkflows to automate machine learning processes such as model training,\nmodel monitoring and handling model drift. The firm uses ML to improve its data\nquality and extract insights to provide more value to its clients. “Our mission is to\ntransform how we power the planet,” Wu says. “Our clients in the energy sector\nneed data, consulting services and research to achieve that transformation.\nDatabricks Workflows gives us the speed and flexibility to deliver the insights our\nclients need.”\n\n\nproduction, it’s easy and cost effective to automate it by adding it to a workflow.\nThe workflow can then be easily revised by adding or removing any steps to\nor from the defined flow. This way of working keeps the benefit of manually\n\n\n-----\n\nSECTION 4.5\n**Rivian redefines driving experience with**\n**the Databricks Lakehouse**\n\n###### 250 platform users\n\n**A 50x increase from a year ago**\n\nRivian is preserving the natural world for future generations with revolutionary Electric\n\nAdventure Vehicles (EAVs). With over 25,000 EAVs on the road generating multiple\n\nterabytes of IoT data per day, the company is using data insights and machine\n\nlearning to improve vehicle health and performance. However, with legacy cloud\n\ntooling, it struggled to scale pipelines cost-effectively and spent significant resources\n\non maintenance — slowing its ability to be truly data driven.\n\nSince moving to the Databricks Lakehouse Platform, Rivian can now understand how", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9284ced0d78802d34cf693f2bb49fbb6", + "-----\n\nSECTION 4.5\n**Rivian redefines driving experience with**\n**the Databricks Lakehouse**\n\n###### 250 platform users\n\n**A 50x increase from a year ago**\n\nRivian is preserving the natural world for future generations with revolutionary Electric\n\nAdventure Vehicles (EAVs). With over 25,000 EAVs on the road generating multiple\n\nterabytes of IoT data per day, the company is using data insights and machine\n\nlearning to improve vehicle health and performance. However, with legacy cloud\n\ntooling, it struggled to scale pipelines cost-effectively and spent significant resources\n\non maintenance — slowing its ability to be truly data driven.\n\nSince moving to the Databricks Lakehouse Platform, Rivian can now understand how\n\na vehicle is performing and how this impacts the driver using it. 
Equipped with these\n\ninsights, Rivian is innovating faster, reducing costs, and ultimately, delivering a better\n\ndriving experience to customers.\n\n\n**I N D U S T R Y**\n[Manufacturing](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)\n\n**S O L U T I O N**\nPredictive Maintenance, Scaling ML Models\nfor IoT, Data-Driven ESG\n\n**P L AT F O R M**\n[Lakehouse](https://www.databricks.com/product/data-lakehouse) , [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , [Unity Catalog](https://www.databricks.com/product/unity-catalog)\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Struggling to democratize data on a legacy platform**\n\n\nsharing of data, which further contributed to productivity issues. Required data\nlanguages and specific expertise of toolsets created a barrier to entry that\nlimited developers from making full use of the data available. Jason Shiverick,\nPrincipal Data Scientist at Rivian, said the biggest issue was the data access. “I\nwanted to open our data to a broader audience of less technical users so they\ncould also leverage data more easily.”\n\nRivian knew that once its EAVs hit the market, the amount of data ingested would\nexplode. In order to deliver the reliability and performance it promised, Rivian\nneeded an architecture that would not only democratize data access, but also\nprovide a common platform to build innovative solutions that can help ensure a\nreliable and enjoyable driving experience.\n\n**Databricks Lakehouse empowers us to lower the barrier of**\n**entry for data access across our organization so we can build**\n**the most innovative and reliable electric vehicles in the world.**\n\n**Wassym Bensaid**\nVice President of Software Development, Rivian\n\n\nBuilding a world that will continue to be enjoyed by future generations requires\na shift in the way we operate. At the forefront of this movement is Rivian —\nan electric vehicle manufacturer focused on shifting our planet’s energy and\ntransportation systems entirely away from fossil fuel. Today, Rivian’s fleet\nincludes personal vehicles and involves a partnership with Amazon to deliver\n100,000 commercial vans. Each vehicle uses IoT sensors and cameras to\ncapture petabytes of data ranging from how the vehicle drives to how various\nparts function. With all this data at its fingertips, Rivian is using machine learning\nto improve the overall customer experience with predictive maintenance so that\npotential issues are addressed before they impact the driver.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility\nand tooling limitations that decreased output, prevented collaboration and\nincreased operational costs. It had 30 to 50 large and operationally complicated\ncompute clusters at any given time, which was costly. Not only was the system\ndifficult to manage, but the company experienced frequent cluster outages\nas well, forcing teams to dedicate more time to troubleshooting than to data\nanalysis. 
Additionally, data silos created by disjointed systems slowed the\n\n\n-----\n\n**Predicting maintenance issues with Databricks Lakehouse**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "c63502363fdd19ca89d789804b8a67ae", + "Before Rivian even shipped its first EAV, it was already up against data visibility\nand tooling limitations that decreased output, prevented collaboration and\nincreased operational costs. It had 30 to 50 large and operationally complicated\ncompute clusters at any given time, which was costly. Not only was the system\ndifficult to manage, but the company experienced frequent cluster outages\nas well, forcing teams to dedicate more time to troubleshooting than to data\nanalysis. Additionally, data silos created by disjointed systems slowed the\n\n\n-----\n\n**Predicting maintenance issues with Databricks Lakehouse**\n\nRivian chose to modernize its data infrastructure on the Databricks Lakehouse\nPlatform, giving it the ability to unify all of its data into a common view for\ndownstream analytics and machine learning. Now, unique data teams have\na range of accessible tools to deliver actionable insights for different use\ncases, from predictive maintenance to smarter product development. Venkat\nSivasubramanian, Senior Director of Big Data at Rivian, says, “We were able\nto build a culture around an open data platform that provided a system for\nreally democratizing data and analysis in an efficient way.” Databricks’ flexible\nsupport of all programming languages and seamless integration with a variety of\ntoolsets eliminated access roadblocks and unlocked new opportunities. Wassym\nBensaid, Vice President of Software Development at Rivian, explains, “Today we\nhave various teams, both technical and business, using Databricks Lakehouse\nto explore our data, build performant data pipelines, and extract actionable\nbusiness and product insights via visual dashboards.”\n\n\nmetrics, Rivian can improve the accuracy of smart features and the control\nthat drivers have over them. Designed to take the stress out of long drives and\ndriving in heavy traffic, features like adaptive cruise control, lane change assist,\nautomatic emergency driving, and forward collision warning can be honed over\ntime to continuously optimize the driving experience for customers.\n\nSecure data sharing and collaboration was also facilitated with the Databricks\nUnity Catalog. Shiverick describes how unified governance for the lakehouse\nbenefits Rivian productivity. “Unity Catalog gives us a truly centralized data\ncatalog across all of our different teams,” he said. “Now we have proper access\nmanagement and controls.” Venkat adds, “With Unity Catalog, we are centralizing\ndata catalog and access management across various teams and workspaces,\nwhich has simplified governance.” End-to-end version controlled governance\nand auditability of sensitive data sources, like the ones used for autonomous\ndriving systems, produces a simple but secure solution for feature engineering.\nThis gives Rivian a competitive advantage in the race to capture the autonomous\ndriving grid.\n\n\nRivian’s ADAS (advanced driver-assistance systems) Team can now easily\nprepare telemetric accelerometer data to understand all EAV motions. 
This core\nrecording data includes information about pitch, roll, speed, suspension and\nairbag activity, to help Rivian understand vehicle performance, driving patterns\nand connected car system predictability. Based on these key performance\n\n\n-----\n\n**Accelerating into an electrified and sustainable world**\n\n\nBy scaling its capacity to deliver valuable data insights with speed, efficiency\nand cost-effectiveness, Rivian is primed to leverage more data to improve\noperations and the performance of its vehicles to enhance the customer\nexperience. Venkat says, “The flexibility that lakehouse offers saves us a lot of\nmoney from a cloud perspective, and that’s a huge win for us.” With Databricks\nLakehouse providing a unified and open source approach to data and analytics,\nthe Vehicle Reliability Team is able to better understand how people are using\ntheir vehicles, and that helps to inform the design of future generations of\nvehicles. By leveraging the Databricks Lakehouse Platform, they have seen a\n30%–50% increase in runtime performance, which has led to faster insights and\nmodel performance.\n\nShiverick explains, “From a reliability standpoint, we can make sure that\ncomponents will withstand appropriate lifecycles. It can be as simple as\nmaking sure door handles are beefy enough to endure constant usage, or as\ncomplicated as predictive and preventative maintenance to eliminate the\nchance of failure in the field. Generally speaking, we’re improving software quality\nbased on key vehicle metrics for a better customer experience.”", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "503e0d13ef37677f2a5f0033eb59e6a8", + "By scaling its capacity to deliver valuable data insights with speed, efficiency\nand cost-effectiveness, Rivian is primed to leverage more data to improve\noperations and the performance of its vehicles to enhance the customer\nexperience. Venkat says, “The flexibility that lakehouse offers saves us a lot of\nmoney from a cloud perspective, and that’s a huge win for us.” With Databricks\nLakehouse providing a unified and open source approach to data and analytics,\nthe Vehicle Reliability Team is able to better understand how people are using\ntheir vehicles, and that helps to inform the design of future generations of\nvehicles. By leveraging the Databricks Lakehouse Platform, they have seen a\n30%–50% increase in runtime performance, which has led to faster insights and\nmodel performance.\n\nShiverick explains, “From a reliability standpoint, we can make sure that\ncomponents will withstand appropriate lifecycles. It can be as simple as\nmaking sure door handles are beefy enough to endure constant usage, or as\ncomplicated as predictive and preventative maintenance to eliminate the\nchance of failure in the field. Generally speaking, we’re improving software quality\nbased on key vehicle metrics for a better customer experience.”\n\n\nFrom a design optimization perspective, Rivian’s unobstructed data view is also\nproducing new diagnostic insights that can improve fleet health, safety, stability\nand security. Venkat says, “We can perform remote diagnostics to triage a\nproblem quickly, or have a mobile service come in, or potentially send an OTA\nto fix the problem with the software. 
All of this needs so much visibility into\nthe data, and that’s been possible with our partnership and integration on the\nplatform itself.” With developers actively building vehicle software to improve\nissues along the way.\n\nMoving forward, Rivian is seeing rapid adoption of Databricks Lakehouse across\ndifferent teams — increasing the number of platform users from 5 to 250 in only\none year. This has unlocked new use cases including using machine learning to\noptimize battery efficiency in colder temperatures, increasing the accuracy of\nautonomous driving systems, and serving commercial depots with vehicle health\ndashboards for early and ongoing maintenance. As more EAVs ship, and its fleet\nof commercial vans expands, Rivian will continue to leverage the troves of data\ngenerated by its EAVs to deliver new innovations and driving experiences that\nrevolutionize sustainable transportation.\n\n\n-----\n\nSECTION 4.6\n**Migrating to the cloud to better serve**\n**millions of customers**\n\n\n###### 300%\n\n**ROI from OpEx savings**\n**and cost avoidance**\n\n\n###### 3X\n\n**Faster delivery of ML/data**\n**science use cases**\n\n\nConsistency in innovation is what keeps customers with a telecommunications company\n\nand is why AT&T is ranked among the best. However, AT&T’s massive on-premises legacy\n\nHadoop system proved complex and costly to manage, impeding operational agility\n\nand efficiency and engineering resources. The need to pivot to cloud to better support\n\nhundreds of millions of subscribers was apparent.\n\nMigrating from Hadoop to Databricks on the Azure cloud, AT&T experienced significant\n\nsavings in operating costs. Additionally, the new cloud-based environment has unlocked\n\naccess to petabytes of data for correlative analytics and an AI-as-a-Service offering for\n\n2,500+ users across 60+ business units. AT&T can now leverage all its data — without\n\noverburdening its engineering team or exploding operational costs — to deliver new\n\nfeatures and innovations to its millions of end users.\n\n\n**I N D U S T R Y**\n[Communication Service Providers](https://www.databricks.com/solutions/industries/telco-industry-solutions)\n\n**S O L U T I O N**\nCustomer Retention, Subscriber Churn\nPrediction, Threat Detection\n\n**P L AT F O R M**\nLakehouse, Data Science, Machine Learning,\n[Data Streaming](https://www.databricks.com/product/data-streaming)\n\n**C LO U D**\n[Azure](https://www.databricks.com/product/azure)\n\n\n-----\n\n**Hadoop technology adds operational complexity and**\n**unnecessary costs**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7f2a1d49873576a3d22cb339b23518d7", + "2,500+ users across 60+ business units. 
AT&T can now leverage all its data — without\n\noverburdening its engineering team or exploding operational costs — to deliver new\n\nfeatures and innovations to its millions of end users.\n\n\n**I N D U S T R Y**\n[Communication Service Providers](https://www.databricks.com/solutions/industries/telco-industry-solutions)\n\n**S O L U T I O N**\nCustomer Retention, Subscriber Churn\nPrediction, Threat Detection\n\n**P L AT F O R M**\nLakehouse, Data Science, Machine Learning,\n[Data Streaming](https://www.databricks.com/product/data-streaming)\n\n**C LO U D**\n[Azure](https://www.databricks.com/product/azure)\n\n\n-----\n\n**Hadoop technology adds operational complexity and**\n**unnecessary costs**\n\nAT&T is a technology giant with hundreds of millions of subscribers and ingests\n10+ petabytes[ [a](https://www.databricks.com/blog/2022/04/11/data-att-modernization-lakehouse.html) ] of data across the entire data platform each day. To harness\nthis data, it has a team of 2,500+ data users across 60+ business units to ensure\nthe business is data powered — from building analytics to ensure decisions are\nbased on the best data-driven situation awareness to building ML models that\nbring new innovations to its customers. To support these requirements, AT&T\nneeded to democratize and establish a data single version of truth (SVOT) while\nsimplifying infrastructure management to increase agility and lower overall costs.\n\nHowever, physical infrastructure was too resource intensive. The combination\nof a highly complex hardware setup (12,500 data sources and 1,500+ servers)\ncoupled with an on-premises Hadoop architecture proved complex to\nmaintain and expensive to manage. Not only were the operational costs to\nsupport workloads high, but there were also additional capital costs around\ndata centers, licensing and more. Up to 70% of the on-prem platform had to\n\nbe prioritized to ensure 50K data pipeline jobs succeeded and met SLAs and\n\ndata quality objectives. Engineers’ time was focused on managing updates,\n\n\nWith these deeply rooted technology issues, AT&T was not in the best position\nto achieve its goals of increasing its use of insights for improving its customer\nexperience and operating more efficiently. “To truly democratize data across\nthe business, we needed to pivot to a cloud-native technology environment,”\nsaid Mark Holcomb, Distinguished Solution Architect at AT&T. “This has freed\nup resources that had been focused on managing our infrastructure and move\nthem up the value chain, as well as freeing up capital for investing in growthoriented initiatives.”\n\n**A seamless migration journey to Databricks**\n\nAs part of its due diligence, AT&T ran a comprehensive cost analysis and\nconcluded that Databricks was both the fastest and achieved the best price/\nperformance for data pipelines and machine learning workloads. AT&T knew the\nmigration would be a massive undertaking. As such, the team did a lot of upfront\nplanning — they prioritized migrating their largest workloads first to immediately\nreduce their infrastructure footprint. They also decided to migrate their data\nbefore migrating users to ensure a smooth transition and experience for their\nthousands of data practitioners.\n\n\nfixing performance issues or simply provisioning resources rather than focusing\n\n\non higher-valued tasks. The resource constraints of physical infrastructure\n\nalso drove serialization of data science activities, slowing innovation. 
Another\n\nhurdle faced in operationalizing petabytes of data was the challenge of building\n\nstreaming data pipelines for real-time analytics, an area that was key to\n\nsupporting innovative use cases required to better serve its customers.\n\n\n**The migration from Hadoop to Databricks enables us to bring**\n**more value to our customers and do it more cost-efficiently**\n**and much faster than before.**\n\n**Mark Holcomb**\nDistinguished Solution Architect, AT&T\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "fcc5f70d1ab79d942fe15a5e2eda0cfb", + "fixing performance issues or simply provisioning resources rather than focusing\n\n\non higher-valued tasks. The resource constraints of physical infrastructure\n\nalso drove serialization of data science activities, slowing innovation. Another\n\nhurdle faced in operationalizing petabytes of data was the challenge of building\n\nstreaming data pipelines for real-time analytics, an area that was key to\n\nsupporting innovative use cases required to better serve its customers.\n\n\n**The migration from Hadoop to Databricks enables us to bring**\n**more value to our customers and do it more cost-efficiently**\n**and much faster than before.**\n\n**Mark Holcomb**\nDistinguished Solution Architect, AT&T\n\n\n-----\n\nThey spent a year deduplicating and synchronizing data to the cloud before\nmigrating any users. This was a critical step in ensuring the successful migration\nof such a large, complex multi-tenant environment of 2,500+ users from 60+\nbusiness units and their workloads. The user migration process occurred over\nnine months and enabled AT&T to retire on-premises hardware in parallel with\nmigration to accelerate savings as early as possible. Plus, due to the horizontal,\nscalable nature of Databricks, AT&T didn’t need to have everything in one\ncontiguous environment. Separating data and compute, and across multiple\naccounts and workspaces, ensured analytics worked seamlessly without any API\ncall limits or bandwidth issues and consumption clearly attributed to the 60+\nbusiness units.\n\nAll in all, AT&T migrated over 1,500 servers, more than 50,000 production CPUs,\n12,500 data sources and 300 schemas. The entire process took about two and a\nhalf years. And it was able to manage the entire migration with the equivalent of\n15 full-time internal resources. “Databricks was a valuable collaborator throughout\nthe process,” said Holcomb. “The team worked closely with us to resolve product\nfeatures and security concerns to support our migration timeline.”\n\n**Databricks reduces TCO and opens new paths to**\n**innovation**\n\nOne of the immediate benefits of moving to Databricks was huge cost savings.\nAT&T was able to rationalize about 30% of its data by identifying and not\nmigrating underutilized and duplicate data. And prioritizing the migration of\nthe largest workloads allowed half the on-prem equipment to be rationalized\n\n\nduring the course of the migration. “By prioritizing the migration of our most\ncompute-intensive workloads to Databricks, we were able to significantly drive\ndown costs while putting us in position to scale more efficiently moving forward,”\nexplained Holcomb. 
The result is an anticipated 300% five-year migration ROI\nfrom OpEx savings and cost avoidance (e.g., not needing to refresh data center\nhardware).\n\nWith data readily available and the means to analyze data at any scale, teams\nof citizen data scientists and analysts can now spend more time innovating,\ninstead of serializing analytics efforts or waiting on engineering to provide the\nnecessary resources — or having data scientists spend their valuable time\non less complex or less insightful analyses. Data scientists are now able to\ncollaborate more effectively and speed up machine learning workflows so that\nteams can deliver value more quickly, with a 3x faster time to delivery for new\ndata science use cases.\n\n“Historically you would have had operations in one system and analytics in a\nseparate one,” said Holcomb. “Now we can do more use cases like operational\nanalytics in a platform that fosters cross-team collaboration, reduces cost and\nimproves the consistency of answers.” Since migrating to Databricks, AT&T now\nhas a single version of truth to create new data-driven opportunities, including\na self-serve AI-as-a-Service analytics platform that will enable new revenue\nstreams and help it continue delivering exceptional innovations to its millions\nof customers.\n\n\n-----\n\n#### About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3fd63e084c48ff4b0ebabb60d20f5243", + "-----\n\n#### About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1b74eac4a063d67e5f727e36b040965b", + "##### The Delta Lake Series Complete Collection\n\n\n-----\n\n### What is Delta Lake?\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\nanalytics to cloud data lakes. 
Delta Lake runs on top of existing data lakes and is fully\ncompatible with Apache Spark™ APIs.\n\nAt Databricks, we’ve seen how Delta Lake can bring reliability, performance and\nlifecycle management to data lakes. With Delta Lake, there will be no more\nmalformed data ingestion, difficulties deleting data for compliance, or issues\nmodifying data for data capture.\n\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\nyour data lake and the rate that teams can leverage that data with a secure and\nscalable cloud service.\n\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\n\n\n-----\n\nContents\n\n**Here’s what you’ll find inside**\n\nChapter 01: Fundamentals and Performance\nChapter 02: Features\nChapter 03: Lakehouse\nChapter 04: Streaming\nChapter 05: Customer Use Cases", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "cc06bdd5d0bca0e491757d186e00b991", + "Contents\n\n**Here’s what you’ll find inside**\n\nChapter 01: Fundamentals and Performance\nChapter 02: Features\nChapter 03: Lakehouse\nChapter 04: Streaming\nChapter 05: Customer Use Cases", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "5059253ce068094b5181ff09e66e1503", + "Contents\n\n**Here’s what you’ll find inside**\n\nChapter 01: Fundamentals and Performance\nChapter 02: Features\nChapter 03: Lakehouse\nChapter 04: Streaming\nChapter 05: Customer Use Cases", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "d924d3d8e4e14883aedaebf75db94374", + "Contents\n\n**Here’s what you’ll find inside**\n\nChapter 01: Fundamentals and Performance\nChapter 02: Features\nChapter 03: Lakehouse\nChapter 04: Streaming\nChapter 05: Customer Use Cases\n\n\n-----\n\n**Fundamentals and Performance**\nBoost data reliability for machine learning and\nbusiness intelligence with Delta Lake\n\n## CHAPTER 01\n\n\n-----\n\n**The Fundamentals of Delta**\n**Lake: Why Reliability and**\n**Performance Matter**\n\nWhen it comes to data reliability, performance — the speed at which your programs\nrun — is of utmost importance. Because of the ACID transactional protections that\nDelta Lake provides, you’re able to get the reliability and performance you need.\n\nWith Delta Lake, you can stream and batch concurrently, perform CRUD operations,\nand save money because you’re now using fewer VMs.
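A minimal sketch of that concurrent stream-and-batch pattern, assuming a Spark session already configured for Delta Lake (as on Databricks); the paths and the rate source are placeholders for illustration:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # assumes Delta Lake is on the session's classpath
path = "/tmp/delta/events"                  # hypothetical table path
checkpoint = "/tmp/checkpoints/events"      # hypothetical checkpoint location

# Create the table with an ordinary batch write...
spark.range(0, 10).write.format("delta").mode("overwrite").save(path)

# ...keep appending to it from a stream...
stream = (
    spark.readStream.format("rate").load()   # stand-in streaming source for the sketch
    .selectExpr("value AS id")
    .writeStream.format("delta")
    .option("checkpointLocation", checkpoint)
    .start(path)
)

# ...and run batch queries against the very same table while the stream is writing.
print(spark.read.format("delta").load(path).count())
```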
It’s easier to maintain your data\nengineering pipelines by taking advantage of streaming, even for batch jobs.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "46eacc3b731787c7cd2915fa2047764a", + "Python syntax\n\n\nMERGE: Under the hood\n\n\nSQL syntax\n\n\nMERGE: Performance tuning tips\n\n\nAudit data changes\n\n\nHow Delta Lake Quickly\n\n\n\n- Understanding Delta Engine Reproduce experiments and reports 39\n\n\n-----\n\n**Fundamentals and Performance**\nBoost data reliability for machine learning and\nbusiness intelligence with Delta Lake\n\n## CHAPTER 01\n\n\n-----\n\n**The Fundamentals of Delta**\n**Lake: Why Reliability and**\n**Performance Matter**\n\nWhen it comes to data reliability, performance — the speed at which your programs\nrun — is of utmost importance. Because of the ACID transactional protections that\nDelta Lake provides, you’re able to get the reliability and performance you need.\n\nWith Delta Lake, you can stream and batch concurrently, perform CRUD operations,\nand save money because you’re now using fewer VMs. It’s easier to maintain your data\nengineering pipelines by taking advantage of streaming, even for batch jobs.\n\nDelta Lake is a storage layer that brings reliability to your data lakes built on HDFS and\ncloud object storage by providing ACID transactions through optimistic concurrency\ncontrol between writes and snapshot isolation for consistent reads during writes.\nDelta Lake also provides built-in data versioning for easy rollbacks and reproducing\nreports.\n\nIn this chapter, we’ll share some of the common challenges with data lakes as well as\nthe Delta Lake features that address them.\n\n**Challenges with data lakes**\nData lakes are a common element within modern data architectures. They serve as a\ncentral ingestion point for the plethora of data that organizations seek to gather and\nmine. While a good step forward in getting to grips with the range of data, they run\ninto the following common problems:\n\n\n-----\n\n**1. \u0007Reading and writing into data lakes is not reliable.** Data engineers often run into\nthe problem of unsafe writes into data lakes that cause readers to see garbage\ndata during writes. They have to build workarounds to ensure readers always see\nconsistent data during writes.\n\n**2. \u0007The data quality in data lakes is low.** Dumping unstructured data into a data\nlake is easy, but this comes at the cost of data quality. Without any mechanisms\nfor validating schema and the data, data lakes suffer from poor data quality. As a\nconsequence, analytics projects that strive to mine this data also fail.\n\n**3. Poor performance with increasing amounts of data.** As the amount of data\nthat gets dumped into a data lake increases, the number of files and directories\nalso increases. Big data jobs and query engines that process the data spend a\nsignificant amount of time handling the metadata operations. This problem is more\npronounced in the case of streaming jobs or handling many concurrent batch jobs.\n\n**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\nbuild complicated pipelines to read entire partitions or tables, modify the data and\nwrite them back. Such pipelines are inefficient and hard to maintain.\n\nBecause of these challenges, many big data projects fail to deliver on their vision or\nsometimes just fail altogether. 
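Challenge 4 is the one Delta Lake's DML support addresses most directly, and because Delta Lake stays compatible with the Spark APIs the change is incremental: the DataFrame writer stays the same and only the format changes. A minimal sketch, assuming the delta-spark package is configured for the session and using purely hypothetical paths and column names:

```python
from pyspark.sql import SparkSession
from delta.tables import DeltaTable  # provided by the delta-spark package

spark = SparkSession.builder.getOrCreate()  # assumes Delta Lake is already configured

# Writing uses the ordinary DataFrame writer; only the format string differs from Parquet.
events = spark.read.json("/tmp/raw/events.json")  # hypothetical source
events.write.format("delta").mode("append").save("/tmp/delta/events")

# Row-level deletes, with no hand-rolled rewrite of whole partitions (challenge 4).
DeltaTable.forPath(spark, "/tmp/delta/events").delete("event_date < '2020-01-01'")

# Readers keep using standard Spark APIs as well.
recent = spark.read.format("delta").load("/tmp/delta/events")
```

The functionality list that follows covers the remaining challenges with the same storage-layer approach.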
We need a solution that enables data practitioners to\nmake use of their existing data lakes, while ensuring data quality.\n\n**Delta Lake’s key functionalities**\nDelta Lake addresses the above problems to simplify how you build your data lakes.\nDelta Lake offers the following key functionalities:\n\n**• ACID transactions:** Delta Lake provides ACID transactions between multiple\nwrites. Every write is a transaction, and there is a serial order for writes recorded in\na transaction log. The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "67b604e572d4a8d01ec4adef4baee177", + "**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\nbuild complicated pipelines to read entire partitions or tables, modify the data and\nwrite them back. Such pipelines are inefficient and hard to maintain.\n\nBecause of these challenges, many big data projects fail to deliver on their vision or\nsometimes just fail altogether. We need a solution that enables data practitioners to\nmake use of their existing data lakes, while ensuring data quality.\n\n**Delta Lake’s key functionalities**\nDelta Lake addresses the above problems to simplify how you build your data lakes.\nDelta Lake offers the following key functionalities:\n\n**• ACID transactions:** Delta Lake provides ACID transactions between multiple\nwrites. Every write is a transaction, and there is a serial order for writes recorded in\na transaction log. The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n\n\n-----\n\n[concurrency control](https://en.wikipedia.org/wiki/Optimistic_concurrency_control) , which is ideally suited for data lakes since multiple writes\ntrying to modify the same files don’t happen that often. In scenarios where\nthere is a conflict, Delta Lake throws a concurrent modification exception for\nusers to handle them and retry their jobs. Delta Lake also offers the highest level\nof isolation possible ( [serializable isolation](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable) ) that allows engineers to continuously\nkeep writing to a directory or table and consumers to keep reading from the same\ndirectory or table. Readers will see the latest snapshot that existed at the time the\nreading started.\n\n**• \u0007Schema management:** Delta Lake automatically validates that the schema of the\nDataFrame being written is compatible with the schema of the table. Columns that\nare present in the table but not in the DataFrame are set to null. If there are extra\ncolumns in the DataFrame that are not present in the table, this operation throws\nan exception. Delta Lake has DDL to add new columns explicitly and the ability to\nupdate the schema automatically.\n\n**• \u0007Scalable metadata handling:** Delta Lake stores the metadata information of\na table or directory in the transaction log instead of the metastore. This allows\nDelta Lake to list files in large directories in constant time and be efficient while\nreading data.\n\n**• Data versioning and time travel:** Delta Lake allows users to read a previous\nsnapshot of the table or directory. 
When files are modified during writes, Delta\nLake creates newer versions of the files and preserves the older versions. When\n\n\nusers want to read the older versions of the table or directory, they can provide\na timestamp or a version number to Apache Spark’s read APIs, and Delta Lake\nconstructs the full snapshot as of that timestamp or version based on the\ninformation in the transaction log. This allows users to reproduce experiments and\nreports and also revert a table to its older versions, if needed.\n\n**• Unified batch and streaming sink:** Apart from batch writes, Delta Lake can also\nbe used as an efficient streaming sink with [Apache Spark’s structured streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) .\nCombined with ACID transactions and scalable metadata handling, the efficient\nstreaming sink enables lots of near real-time analytics use cases without having to\nmaintain a complicated streaming and batch pipeline.\n\n**• Record update and deletion:** Delta Lake will support merge, update and delete\nDML commands. This allows engineers to easily upsert and delete records in data\nlakes and simplify their change data capture and GDPR use cases. Since Delta Lake\ntracks and modifies data at file-level granularity, it is much more efficient than\nreading and overwriting entire partitions or tables.\n\n**• Data expectations (coming soon):** Delta Lake will also support a new API to set\ndata expectations on tables or directories. Engineers will be able to specify a\nboolean condition and tune the severity to handle data expectations. When Apache\nSpark jobs write to the table or directory, Delta Lake will automatically validate\nthe records and when there is a violation, it will handle the records based on the\nseverity provided.\n\n\n-----\n\n**Unpacking the**\n**Transaction Log**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "2e8fa5b824c302814cf399c84115dbd0", + "**• Record update and deletion:** Delta Lake will support merge, update and delete\nDML commands. This allows engineers to easily upsert and delete records in data\nlakes and simplify their change data capture and GDPR use cases. Since Delta Lake\ntracks and modifies data at file-level granularity, it is much more efficient than\nreading and overwriting entire partitions or tables.\n\n**• Data expectations (coming soon):** Delta Lake will also support a new API to set\ndata expectations on tables or directories. Engineers will be able to specify a\nboolean condition and tune the severity to handle data expectations. When Apache\nSpark jobs write to the table or directory, Delta Lake will automatically validate\nthe records and when there is a violation, it will handle the records based on the\nseverity provided.\n\n\n-----\n\n**Unpacking the**\n**Transaction Log**\n\nThe transaction log is key to understanding Delta Lake because it is the common thread\nthat runs through many of its most important features, including ACID transactions,\nscalable metadata handling, time travel and more. The Delta Lake transaction log is\nan ordered record of every transaction that has ever been performed on a Delta Lake\ntable since its inception.\n\nDelta Lake is built on top of [Apache Spark](https://databricks.com/spark/about) to allow multiple readers and writers of a\ngiven table to work on the table at the same time. 
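To make the unified batch and streaming sink described above concrete, here is a small sketch in which one query streams into a Delta table while a batch query reads a consistent snapshot of the same table. The paths and schema are hypothetical, the built-in rate source stands in for real events, and `spark` is assumed to be an existing SparkSession with Delta Lake configured.

```python
from pyspark.sql import functions as F

# Hypothetical locations; `spark` is assumed to be a Delta-enabled SparkSession.
table_path = "/tmp/delta/events"
ckpt_path = "/tmp/delta/_checkpoints/events"

# Seed the table with a batch write...
spark.range(0, 5).withColumn("eventType", F.lit("view")) \
    .write.format("delta").mode("overwrite").save(table_path)

# ...then append to the same table from a streaming query (the rate source is
# used purely for illustration), i.e., Delta Lake acting as a streaming sink.
stream = (
    spark.readStream.format("rate").option("rowsPerSecond", 10).load()
    .selectExpr("value AS id", "'click' AS eventType")
    .writeStream.format("delta")
    .outputMode("append")
    .option("checkpointLocation", ckpt_path)
    .start(table_path)
)

# A concurrent batch reader keeps seeing a consistent snapshot of the table.
print(spark.read.format("delta").load(table_path).count())

stream.stop()  # stop the illustration query
```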
To show users correct views of the\ndata at all times, the transaction log serves as a single source of truth: the central\nrepository that tracks all changes that users make to the table.\n\nWhen a user reads a Delta Lake table for the first time or runs a new query on an\nopen table that has been modified since the last time it was read, Spark checks the\ntransaction log to see what new transactions are posted to the table. Then, Spark\nupdates the end user’s table with those new changes. This ensures that a user’s\nversion of a table is always synchronized with the master record as of the most recent\nquery and that users cannot make divergent, conflicting changes to a table.\n\nIn this chapter, we’ll explore how the Delta Lake transaction log offers an elegant\nsolution to the problem of multiple concurrent reads and writes.\n\n\n-----\n\n**Implementing atomicity to ensure**\n**operations complete fully**\nAtomicity is one of the four properties of ACID transactions that guarantees that\noperations (like an INSERT or UPDATE) performed on your [data lake](https://databricks.com/glossary/data-lake) either complete\nfully or don’t complete at all. Without this property, it’s far too easy for a hardware\nfailure or a software bug to cause data to be only partially written to a table, resulting\nin messy or corrupted data.\n\nThe transaction log is the mechanism through which Delta Lake is able to offer\nthe guarantee of atomicity. For all intents and purposes, if it’s not recorded in the\ntransaction log, it never happened. By only recording transactions that execute fully\nand completely, and using that record as the single source of truth, the Delta Lake\ntransaction log allows users to reason about their data and have peace of mind about\nits fundamental trustworthiness, at petabyte scale.\n\n**Dealing with multiple concurrent reads and writes**\nBut how does Delta Lake deal with multiple concurrent reads and writes? Since Delta\nLake is powered by Apache Spark, it’s not only possible for multiple users to modify a\n\n\ntable at once — it’s expected. To handle these situations, Delta Lake employs **optimistic**\n**concurrency control** .\n\nOptimistic concurrency control is a method of dealing with concurrent transactions\nthat assumes the changes made to a table by different users can complete without\nconflicting with one another. It is incredibly fast because when dealing with petabytes\nof data, there’s a high likelihood that users will be working on different parts of the data\naltogether, allowing them to complete non-conflicting transactions simultaneously.\n\nOf course, even with optimistic concurrency control, sometimes users do try to\nmodify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\nfor that. Delta Lake handles these cases by implementing a rule of mutual exclusion,\nthen it attempts to solve any conflict optimistically.\n\nThis protocol allows Delta Lake to deliver on the ACID principle of isolation, which\nensures that the resulting state of the table after multiple, concurrent writes is the\nsame as if those writes had occurred serially, in isolation from one another.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "9afc565db7357152dfe1cfe399ae37e8", + "table at once — it’s expected. 
To handle these situations, Delta Lake employs **optimistic**\n**concurrency control** .\n\nOptimistic concurrency control is a method of dealing with concurrent transactions\nthat assumes the changes made to a table by different users can complete without\nconflicting with one another. It is incredibly fast because when dealing with petabytes\nof data, there’s a high likelihood that users will be working on different parts of the data\naltogether, allowing them to complete non-conflicting transactions simultaneously.\n\nOf course, even with optimistic concurrency control, sometimes users do try to\nmodify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\nfor that. Delta Lake handles these cases by implementing a rule of mutual exclusion,\nthen it attempts to solve any conflict optimistically.\n\nThis protocol allows Delta Lake to deliver on the ACID principle of isolation, which\nensures that the resulting state of the table after multiple, concurrent writes is the\nsame as if those writes had occurred serially, in isolation from one another.\n\n\n-----\n\nAs all the transactions made on Delta Lake tables are stored directly to disk, this\nprocess satisfies the ACID property of durability, meaning it will persist even in the\nevent of system failure.\n\n**Time travel, data lineage and debugging**\nEvery table is the result of the sum total of all the commits recorded in the Delta Lake\ntransaction log — no more and no less. The transaction log provides a step-by-step\ninstruction guide, detailing exactly how to get from the table’s original state to its\ncurrent state.\n\nTherefore, we can recreate the state of a table at any point in time by starting with\nan original table, and processing only commits made after that point. This powerful\nability is known as “time travel,” or data versioning, and can be a lifesaver in any number\n\n\nof situations. For more information, please refer to [Introducing Delta Time Travel for](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n[Large-Scale Data Lakes](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) and [Getting Data Ready for Data Science With Delta Lake and](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n[MLflow.](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n\nAs the definitive record of every change ever made to a table, the Delta Lake\ntransaction log offers users a verifiable data lineage that is useful for governance,\naudit and compliance purposes. It can also be used to trace the origin of an\ninadvertent change or a bug in a pipeline back to the exact action that caused it. Users\ncan run the [DESCRIBE HISTORY](https://docs.delta.io/latest/delta-utility.html#describe-history) command to see metadata around the changes\nthat were made.\n\n**Want to learn more about Delta Lake’s transaction log?**\n\nRead our blog post > Watch our tech talk >\n\n\n-----\n\n**How to Use Schema**\n**Enforcement and**\n**Evolution**\n\nAs business problems and requirements evolve over time, so does the structure of\nyour data. With Delta Lake, incorporating new columns or objects is easy; users have\naccess to simple semantics to control the schema of their tables. 
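Before turning to schema management in more detail, the time travel and table history described in the previous section can be sketched as follows. The table path is a hypothetical placeholder and `spark` is assumed to be a Delta-enabled SparkSession.

```python
from delta.tables import DeltaTable

path = "/tmp/delta/events"  # hypothetical, already-existing Delta table

# Read the latest snapshot, then travel back by version number or timestamp;
# Delta Lake reconstructs the older snapshot from the transaction log.
latest_df = spark.read.format("delta").load(path)
v5_df = spark.read.format("delta").option("versionAsOf", 5).load(path)
jan_df = (spark.read.format("delta")
          .option("timestampAsOf", "2019-01-01")
          .load(path))

# Inspect the audit trail of commits, either via SQL ...
spark.sql(f"DESCRIBE HISTORY delta.`{path}`").show(truncate=False)

# ... or via the Python API, which returns the same history as a DataFrame.
(DeltaTable.forPath(spark, path)
    .history()
    .select("version", "timestamp", "operation", "operationParameters")
    .show(truncate=False))
```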
At the same time,\nit is important to call out the importance of schema enforcement to prevent users\nfrom accidentally polluting their tables with mistakes or garbage data in addition to\nschema evolution, which enables them to automatically add new columns of rich data\nwhen those columns belong.\n\n**Schema enforcement rejects any new columns or other schema changes that**\n**aren’t compatible with your table.** By setting and upholding these high standards,\nanalysts and engineers can trust that their data has the highest levels of integrity and\ncan reason about it with clarity, allowing them to make better business decisions.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "710f6a1f26a207f904d2d61197c5e9c0", + "**Want to learn more about Delta Lake’s transaction log?**\n\nRead our blog post > Watch our tech talk >\n\n\n-----\n\n**How to Use Schema**\n**Enforcement and**\n**Evolution**\n\nAs business problems and requirements evolve over time, so does the structure of\nyour data. With Delta Lake, incorporating new columns or objects is easy; users have\naccess to simple semantics to control the schema of their tables. At the same time,\nit is important to call out the importance of schema enforcement to prevent users\nfrom accidentally polluting their tables with mistakes or garbage data in addition to\nschema evolution, which enables them to automatically add new columns of rich data\nwhen those columns belong.\n\n**Schema enforcement rejects any new columns or other schema changes that**\n**aren’t compatible with your table.** By setting and upholding these high standards,\nanalysts and engineers can trust that their data has the highest levels of integrity and\ncan reason about it with clarity, allowing them to make better business decisions.\n\nOn the flip side of the coin, schema evolution complements enforcement by making it\neasy for intended schema changes to take place automatically. After all, it shouldn’t\nbe hard to add a column.\n\nSchema enforcement is the yin to schema evolution’s yang. When used together,\nthese features make it easier than ever to block out the noise and tune in to the signal.\n\n**Understanding table schemas**\nEvery DataFrame in Apache Spark contains a schema, a blueprint that defines the\nshape of the data, such as data types and columns, and metadata. With Delta Lake,\nthe table’s schema is saved in JSON format inside the transaction log.\n\n\n-----\n\n**What is schema enforcement?**\nSchema enforcement, or schema validation, is a safeguard in Delta Lake that ensures\ndata quality by rejecting writes to a table that don’t match the table’s schema.\n\nLike the front-desk manager at a busy restaurant who only accepts reservations, it\nchecks to see whether each column of data inserted into the table is on its list of\nexpected columns (in other words, whether each one has a “reservation”), and rejects\nany writes with columns that aren’t on the list.\n\n**How does schema enforcement work?**\nDelta Lake uses **schema validation on write,** which means that all new writes to a\ntable are checked for compatibility with the target table’s schema at write time. If the\nschema is not compatible, Delta Lake cancels the transaction altogether (no data is\nwritten), and raises an exception to let the user know about the mismatch.\n\nTo determine whether a write to a table is compatible, Delta Lake uses the following\nrules. 
The DataFrame to be written cannot contain:\n\n**• Any additional columns that are not present in the target table’s schema.**\nConversely, it’s OK if the incoming data doesn’t contain every column in the table —\nthose columns will simply be assigned null values.\n\n**• \u0007Column data types that differ from the column data types in the target table.**\nIf a target table’s column contains StringType data, but the corresponding column\nin the DataFrame contains IntegerType data, schema enforcement will raise an\nexception and prevent the write operation from taking place.\n\n**• Column names that differ only by case.** This means that you cannot have columns\nsuch as “Foo” and “foo” defined in the same table. While Spark can be used in case\nsensitive or insensitive (default) mode, Delta Lake is case-preserving but insensitive\nwhen storing the schema. [Parquet](https://databricks.com/glossary/what-is-parquet) is case sensitive when storing and returning\ncolumn information. To avoid potential mistakes, data corruption or loss issues (which\nwe’ve personally experienced at Databricks), we decided to add this restriction.\n\n\n-----\n\nRather than automatically adding the new columns, Delta Lake enforces the schema,\nand stops the write from occurring. To help identify which column(s) caused the\nmismatch, Spark prints out both schemas in the stack trace for comparison.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "e0bbc5fd6f6b9a9377e648ca4ddfd868", + "**• Column names that differ only by case.** This means that you cannot have columns\nsuch as “Foo” and “foo” defined in the same table. While Spark can be used in case\nsensitive or insensitive (default) mode, Delta Lake is case-preserving but insensitive\nwhen storing the schema. [Parquet](https://databricks.com/glossary/what-is-parquet) is case sensitive when storing and returning\ncolumn information. To avoid potential mistakes, data corruption or loss issues (which\nwe’ve personally experienced at Databricks), we decided to add this restriction.\n\n\n-----\n\nRather than automatically adding the new columns, Delta Lake enforces the schema,\nand stops the write from occurring. To help identify which column(s) caused the\nmismatch, Spark prints out both schemas in the stack trace for comparison.\n\n**How is schema enforcement useful?**\nBecause it’s such a stringent check, schema enforcement is an excellent tool to use\nas a gatekeeper for a clean, fully transformed data set that is ready for production or\nconsumption. It’s typically enforced on tables that directly feed:\n\n- Machine learning algorithms\n\n- BI dashboards\n\n- Data analytics and visualization tools\n\n- Any production system requiring highly structured,\nstrongly typed, semantic schemas\n\nIn order to prepare their data for this final hurdle, many users employ a simple multihop architecture that progressively adds structure to their tables. To learn more, take\na look at [Productionizing Machine Learning With Delta Lake.](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n\n**What is schema evolution?**\nSchema evolution is a feature that allows users to easily change a table’s current\nschema to accommodate data that is changing over time. 
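A short sketch of both behaviors, enforcement rejecting an unexpected column and evolution admitting it once the change is intended, is shown below. The path, schema, and data are hypothetical, and `spark` is assumed to be a Delta-enabled SparkSession.

```python
from pyspark.sql.utils import AnalysisException

path = "/tmp/delta/loans"  # hypothetical example table

# Seed a two-column table.
spark.createDataFrame([(1, 1000.0)], "loan_id INT, amount DOUBLE") \
    .write.format("delta").mode("overwrite").save(path)

# New data arrives with an extra column.
new_df = spark.createDataFrame(
    [(2, 2500.0, "CA")], "loan_id INT, amount DOUBLE, state STRING"
)

try:
    # Schema enforcement: the write is validated against the table schema,
    # the transaction is cancelled, and nothing is written.
    new_df.write.format("delta").mode("append").save(path)
except AnalysisException as e:
    print("Rejected by schema enforcement:", str(e)[:200])

# Schema evolution: the same write succeeds once the change is declared
# intentional, and `state` is appended to the end of the table schema.
new_df.write.format("delta").option("mergeSchema", "true").mode("append").save(path)
```

Changes that are not eligible for evolution (dropping a column, in-place type changes, case-only renames) instead require the `overwriteSchema` option noted later in this section.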
Most commonly, it’s used\nwhen performing an append or overwrite operation, to automatically adapt the\nschema to include one or more new columns.\n\n**How does schema evolution work?**\nFollowing up on the example from the previous section, developers can\neasily use schema evolution to add the new columns that were previously\nrejected due to a schema mismatch. Schema evolution is activated by adding\n.option(‘mergeSchema’, ‘true’) to your .write or .writeStream\nSpark command, as shown in the following example.\n\n\n#Add the mergeSchema option\n\nloans.write.format( “delta” ) \\\n\n.option( “mergeSchema” , “true” ) \\\n\n.mode( “append” ) \\\n\n.save(DELTALAKE_SILVER_PATH)\n\nBy including the mergeSchema option in your query, any columns that are present\n\nin the DataFrame but not in the target table are automatically added to the end of the\n\nschema as part of a write transaction. Nested fields can also be added, and these\n\nfields will get added to the end of their respective struct columns as well.\n\nData engineers and scientists can use this option to add new columns (perhaps a\n\nnewly tracked metric, or a column of this month’s sales figures) to their existing ML\n\nproduction tables without breaking existing models that rely on the old columns.\n\nThe following types of schema changes are eligible for schema evolution during table\n\nappends or overwrites:\n\n- Adding new columns (this is the most common scenario)\n\n- \u0007Changing of data types from NullType → any other type, or upcasts from ByteType\n\n→ ShortType → IntegerType\n\nOther changes, not eligible for schema evolution, require that the schema and data\n\nare overwritten by adding .option(“overwriteSchema”,“true”) . Those\n\nchanges include:\n\n- Dropping a column\n\n- Changing an existing column’s data typeC (in place)\n\n- \u0007Renaming column names that differ onlyC by case (e.g., “Foo” and “foo”)\n\n\n-----\n\nFinally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE ) is fully\nsupported, allowing users to perform the following actions on table schemas:\n\n- Adding columns\n\n- Changing column comments", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "1e592fd72acb0cf9b59313daed7183c1", + "The following types of schema changes are eligible for schema evolution during table\n\nappends or overwrites:\n\n- Adding new columns (this is the most common scenario)\n\n- \u0007Changing of data types from NullType → any other type, or upcasts from ByteType\n\n→ ShortType → IntegerType\n\nOther changes, not eligible for schema evolution, require that the schema and data\n\nare overwritten by adding .option(“overwriteSchema”,“true”) . 
Those\n\nchanges include:\n\n- Dropping a column\n\n- Changing an existing column’s data typeC (in place)\n\n- \u0007Renaming column names that differ onlyC by case (e.g., “Foo” and “foo”)\n\n\n-----\n\nFinally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE ) is fully\nsupported, allowing users to perform the following actions on table schemas:\n\n- Adding columns\n\n- Changing column comments\n\n- Setting table properties that define the behavior of the table, such as setting the\nretention duration of the transaction log\n\n**How is schema evolution useful?**\nSchema evolution can be used anytime you _intend_ to change the schema of your table\n(as opposed to where you accidentally added columns to your DataFrame that shouldn’t\nbe there). It’s the easiest way to migrate your schema because it automatically adds the\ncorrect column names and data types, without having to declare them explicitly.\n\n**Summary**\nSchema enforcement rejects any new columns or other schema changes that\naren’t compatible with your table. By setting and upholding these high standards,\nanalysts and engineers can trust that their data has the highest levels of integrity and\ncan reason about it with clarity, allowing them to make better business decisions.\nOn the flip side of the coin, schema evolution complements enforcement by making it\neasy for intended schema changes to take place automatically. After all, it shouldn’t\nbe hard to add a column.\n\nSchema enforcement is the yin to schema evolution’s yang. When used together, these\nfeatures make it easier than ever to block out the noise and tune in to the signal.\n\n**Want to learn more about schema enforcement and evolution?**\n\nRead our blog post > Watch our tech talk >\n\n\n-----\n\n**Delta Lake**\n**DML Internals**\n\nDelta Lake supports data manipulation language (DML) commands including UPDATE,\nDELETE and MERGE. These commands simplify change data capture (CDC), audit and\ngovernance, and GDPR/CCPA workflows, among others.\n\nIn this chapter, we will demonstrate how to use each of these DML commands,\ndescribe what Delta Lake is doing behind the scenes, and offer some performance\ntuning tips for each one.\n\n**Delta Lake DML: UPDATE**\nYou can use the UPDATE operation to selectively update any rows that match a\nfiltering condition, also known as a predicate. The code below demonstrates how\nto use each type of predicate as part of an UPDATE statement. Note that Delta Lake\noffers APIs for Python, Scala and SQL, but for the purposes of this eBook, we’ll include\nonly the SQL code.\n\n-- Update events\n\nUPDATE events SET eventType= ‘click’ WHERE buttonPress = 1\n\n\n-----\n\n**UPDATE: Under the hood**\nDelta Lake performs an UPDATE on a table in two steps:\n\n1. Find and select the files containing data that match the predicate and, therefore,\nneed to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up\nthis process.\n\n2. \u0007Read each matching file into memory, update the relevant rows, and write out the\nresult into a new data file.\n\nOnce Delta Lake has executed the UPDATE successfully, it adds a commit in the\ntransaction log indicating that the new data file will be used in place of the old one\nfrom now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned”\n— recorded as a data file that applied to an older version of the table, but not the\ncurrent version. 
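Since the text shows only the SQL form, a rough Python-API equivalent of the UPDATE above looks like the sketch below. The table path and the extra date filter are hypothetical; narrowing predicates are the main tuning lever called out in the performance tips that follow.

```python
from delta.tables import DeltaTable
from pyspark.sql.functions import lit

# `spark` is assumed to be a Delta-enabled SparkSession; the path points to a
# hypothetical events table with eventType, buttonPress, and date columns.
events = DeltaTable.forPath(spark, "/tmp/delta/events")

# Python equivalent of: UPDATE events SET eventType = 'click' WHERE buttonPress = 1
# The additional date predicate narrows the set of files Delta Lake must scan and rewrite.
events.update(
    condition="buttonPress = 1 AND date >= '2020-01-01'",
    set={"eventType": lit("click")},
)
```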
Delta Lake is able to use it to provide data versioning and time travel.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "b7dca6105fca69505d8c6fc54e87219d", + "UPDATE events SET eventType= ‘click’ WHERE buttonPress = 1\n\n\n-----\n\n**UPDATE: Under the hood**\nDelta Lake performs an UPDATE on a table in two steps:\n\n1. Find and select the files containing data that match the predicate and, therefore,\nneed to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up\nthis process.\n\n2. \u0007Read each matching file into memory, update the relevant rows, and write out the\nresult into a new data file.\n\nOnce Delta Lake has executed the UPDATE successfully, it adds a commit in the\ntransaction log indicating that the new data file will be used in place of the old one\nfrom now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned”\n— recorded as a data file that applied to an older version of the table, but not the\ncurrent version. Delta Lake is able to use it to provide data versioning and time travel.\n\n**UPDATE + Delta Lake time travel = Easy debugging**\nKeeping the old data files turns out to be very useful for debugging because you can\nuse Delta Lake “time travel” to go back and query previous versions of a table at any\n\n\ntime. In the event that you update your table incorrectly and want to figure out what\nhappened, you can easily compare two versions of a table to one another to see what\nhas changed.\n\nSELECT - FROM events VERSION AS OF 11 EXCEPT ALL SELECT\n\n- FROM mytable VERSION AS OF 12\n\n**UPDATE: Performance tuning tips**\nThe main way to improve the performance of the UPDATE command on Delta Lake\nis to add more predicates to narrow down the search space. The more specific the\nsearch, the fewer files Delta Lake needs to scan and/or modify.\n\n**Delta Lake DML: DELETE**\nYou can use the DELETE command to selectively delete rows based upon a\npredicate (filtering condition).\n\nDELETE FROM events WHERE date < ‘2017-01-01’\n\n\n-----\n\nIn the event that you want to revert an accidental DELETE operation, you can use time\ntravel to roll back your table to the way it was.\n\n**DELETE: Under the hood**\nDELETE works just like UPDATE under the hood. Delta Lake makes two scans of\nthe data: The first scan is to identify any data files that contain rows matching the\npredicate condition. The second scan reads the matching data files into memory,\nat which point Delta Lake deletes the rows in question before writing out the newly\nclean data to disk.\n\nAfter Delta Lake completes a DELETE operation successfully, the old data files are\nnot deleted entirely — they’re still retained on disk, but recorded as “tombstoned” (no\nlonger part of the active table) in the Delta Lake transaction log. Remember, those old\nfiles aren’t deleted immediately because you might still need them to time travel back\nto an earlier version of the table. If you want to delete files older than a certain time\nperiod, you can use the VACUUM command.\n\n**DELETE + VACUUM: Cleaning up old data files**\nRunning the VACUUM command permanently deletes all data files that are:\n\n1. No longer part of the active table and\n2. 
\u0007Older than the retention threshold, which is seven days by default\n\nDelta Lake does not automatically VACUUM old files — you must run the command\nyourself, as shown below. If you want to specify a retention period that is different\nfrom the default of seven days, you can provide it as a parameter.\n\nfrom delta.tables import - deltaTable.\n\n# vacuum files older than 30 days(720 hours)\n\ndeltaTable.vacuum( 720 )\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "fefdd902aa67221084956739775933ef", + "**DELETE + VACUUM: Cleaning up old data files**\nRunning the VACUUM command permanently deletes all data files that are:\n\n1. No longer part of the active table and\n2. \u0007Older than the retention threshold, which is seven days by default\n\nDelta Lake does not automatically VACUUM old files — you must run the command\nyourself, as shown below. If you want to specify a retention period that is different\nfrom the default of seven days, you can provide it as a parameter.\n\nfrom delta.tables import - deltaTable.\n\n# vacuum files older than 30 days(720 hours)\n\ndeltaTable.vacuum( 720 )\n\n\n-----\n\n**DELETE: Performance tuning tips**\nJust like with the UPDATE command, the main way to improve the performance of\na DELETE operation on Delta Lake is to add more predicates to narrow down the\nsearch space. The Databricks managed version of Delta Lake also features other\nperformance enhancements like improved [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) , the use of bloom filters, and\n[Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering) (multi-dimensional clustering). [Read more about Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n[on Databricks.](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n\n**Delta Lake DML: MERGE**\nThe Delta Lake MERGE command allows you to perform upserts, which are a mix of\nan UPDATE and an INSERT. To understand upserts, imagine that you have an existing\ntable (aka a target table), and a source table that contains a mix of new records and\nupdates to existing records.\n\n\n**Here’s how an upsert works:**\n\n- When a record from the source table matches a preexisting record in the target\ntable, Delta Lake updates the record.\n\n- When there is no such match, Delta Lake inserts the new record.\n\nThe Delta Lake MERGE command greatly simplifies workflows that can be complex\nand cumbersome with other traditional data formats like Parquet. 
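For completeness, the DELETE and VACUUM sequence described above can be written with the Python API roughly as follows, assuming a hypothetical table path and a Delta-enabled SparkSession named `spark`.

```python
from delta.tables import DeltaTable

# Hypothetical events table; `spark` is assumed to be a Delta-enabled SparkSession.
deltaTable = DeltaTable.forPath(spark, "/tmp/delta/events")

# Selectively delete rows matching a predicate. The affected files are rewritten
# and the old ones are tombstoned, so time travel to earlier versions still works.
deltaTable.delete("date < '2017-01-01'")

# Permanently remove tombstoned files older than 30 days (720 hours).
# VACUUM never runs automatically, and the default retention threshold is seven days.
deltaTable.vacuum(720)
```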
Common scenarios\nwhere merges/upserts come in handy include change data capture, GDPR/CCPA\ncompliance, sessionization, and deduplication of records.\n\n**For more information about upserts, read:**\n\n[Efficient Upserts Into Data Lakes With Databricks Delta](https://databricks.com/blog/2019/03/19/efficient-upserts-into-data-lakes-databricks-delta.html)\n\n[Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs](https://databricks.com/blog/2019/10/03/simple-reliable-upserts-and-deletes-on-delta-lake-tables-using-python-apis.html)\n\n[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n\n\n-----\n\n**MERGE: Under the hood**\nDelta Lake completes a MERGE in two steps:\n\n1. Perform an inner join between the target table and source table to select all files\nthat have matches.\n2. Perform an outer join between the selected files in the target and source tables\nand write out the updated/deleted/inserted data.\n\nThe main way that this differs from an UPDATE or a DELETE under the hood is that\nDelta Lake uses joins to complete a MERGE. This fact allows us to utilize some unique\nstrategies when seeking to improve performance.\n\n**MERGE: Performance tuning tips**\nTo improve performance of the MERGE command, you need to determine which of the\ntwo joins that make up the merge is limiting your speed.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "e8a30d8cf3f0c5e33c90468fbd5804db", + "[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n\n\n-----\n\n**MERGE: Under the hood**\nDelta Lake completes a MERGE in two steps:\n\n1. Perform an inner join between the target table and source table to select all files\nthat have matches.\n2. Perform an outer join between the selected files in the target and source tables\nand write out the updated/deleted/inserted data.\n\nThe main way that this differs from an UPDATE or a DELETE under the hood is that\nDelta Lake uses joins to complete a MERGE. 
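Because both halves of a MERGE are joins, the usual join tuning knobs apply; the tips that follow can be sketched roughly as below, where the table paths, join keys, and configuration values are purely illustrative assumptions rather than recommendations.

```python
from delta.tables import DeltaTable

# `spark` is assumed to be a Delta-enabled SparkSession; paths and columns are hypothetical.
target = DeltaTable.forPath(spark, "/tmp/delta/events")
updates_df = spark.read.format("delta").load("/tmp/delta/event_updates")

# Join tuning knobs (illustrative values only).
spark.conf.set("spark.sql.shuffle.partitions", "200")
spark.conf.set("spark.sql.autoBroadcastJoinThreshold", str(64 * 1024 * 1024))

# An extra predicate in the merge condition narrows the inner join that finds
# the files Delta Lake has to rewrite.
(target.alias("t")
    .merge(updates_df.alias("s"),
           "t.eventId = s.eventId AND t.date >= '2020-01-01'")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute())
```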
This fact allows us to utilize some unique\nstrategies when seeking to improve performance.\n\n**MERGE: Performance tuning tips**\nTo improve performance of the MERGE command, you need to determine which of the\ntwo joins that make up the merge is limiting your speed.\n\nIf the inner join is the bottleneck (i.e., finding the files that Delta Lake needs to rewrite\ntakes too long), try the following strategies:\n\n- Add more predicates to narrow down the search space.\n\n- Adjust shuffle partitions.\n\n- Adjust broadcast join thresholds.\n\n- Compact the small files in the table if there are lots of them, but don’t compact them\ninto files that are too large, since Delta Lake has to copy the entire file to rewrite it.\n\n\n**On Databricks’ managed Delta Lake, use Z-Order optimize to exploit the**\n**locality of updates.**\n\nOn the other hand, if the outer join is the bottleneck (i.e., rewriting the actual files\nthemselves takes too long), try the strategies below.\n\n- **Adjust shuffle partitions:** Reduce files by enabling automatic repartitioning\nbefore writes (with Optimized Writes in Databricks Delta Lake).\n\n- **\u0007Adjust broadcast thresholds:** If you’re doing a full outer join, Spark cannot do a\nbroadcast join, but if you’re doing a right outer join, Spark can do one, and you can\nadjust the broadcast thresholds as needed.\n\n- **Cache the source table / DataFrame:** Caching the source table can speed up the\nsecond scan, but be sure not to cache the target table, as this can lead to cache\ncoherency issues.\n\nDelta Lake supports DML commands including UPDATE, DELETE and MERGE INTO, which\ngreatly simplify the workflow for many common big data operations. In this chapter, we\ndemonstrated how to use these commands in Delta Lake, shared information about\nhow each one works under the hood, and offered some performance tuning tips.\n\n**Want a deeper dive into DML internals, including snippets of code?**\n\n[Read our blog post >](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html)\n\n\n-----\n\n**How Delta Lake Quickly**\n**Processes Petabytes With**\n**Data Skipping and Z-Ordering**\n\nDelta Lake is capable of sifting through petabytes of data within seconds. Much of this\nspeed is owed to two features: (1) data skipping and (2) Z-Ordering.\n\nCombining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the\namount of data that needs to be scanned to answer selective queries against large\nDelta tables, which typically translates into substantial runtime improvements and\ncost savings.\n\nUsing Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud\ndata lakes can be queried in a matter of seconds by skipping files not relevant to\nthe query. For example, 93.2% of the records in a 504 TB data set were skipped for a\ntypical query in a real-world cybersecurity analysis use case, reducing query times by\nup to two orders of magnitude. In other words, Delta Lake can speed up your queries\nby as much as 100x.\n\n**Want to see data skipping and Z-Ordering in action?**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "45f7948c2a554f1131404199c2ea647a", + "Delta Lake is capable of sifting through petabytes of data within seconds. 
Much of this\nspeed is owed to two features: (1) data skipping and (2) Z-Ordering.\n\nCombining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the\namount of data that needs to be scanned to answer selective queries against large\nDelta tables, which typically translates into substantial runtime improvements and\ncost savings.\n\nUsing Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud\ndata lakes can be queried in a matter of seconds by skipping files not relevant to\nthe query. For example, 93.2% of the records in a 504 TB data set were skipped for a\ntypical query in a real-world cybersecurity analysis use case, reducing query times by\nup to two orders of magnitude. In other words, Delta Lake can speed up your queries\nby as much as 100x.\n\n**Want to see data skipping and Z-Ordering in action?**\n\nApple’s Dominique Brezinski and Databricks’ Michael Armbrust demonstrated how to\n\nuse Delta Lake as a unified solution for data engineering and data science in the context\n\nof cybersecurity monitoring and threat response. Watch their keynote speech, Threat\n\n[Detection and Response at Scale.](https://databricks.com/session/keynote-from-apple)\n\n\n-----\n\nAND / OR / NOT are also supported as well as “literal op column” predicates.\n\nEven though data skipping kicks in when the above conditions are met, it may not\nalways be effective. But, if there are a few columns that you frequently filter by and\nwant to make sure that’s fast, then you can explicitly optimize your data layout with\nrespect to skipping effectiveness by running the following command:\n\nOPTIMIZE [ WHERE ]\nZORDER BY ( [, …])\n\n**Exploring the details**\nApart from partition pruning, another common technique that’s used in the data\nwarehousing world, but which Spark currently lacks, is I/O pruning based on [small](https://dl.acm.org/doi/10.5555/645924.671173)\n[materialized aggregates](https://dl.acm.org/doi/10.5555/645924.671173) . In short, the idea is to keep track of simple statistics such\nas minimum and maximum values at a certain granularity that are correlated with I/O\ngranularity. And we want to leverage those statistics at query planning time in order\nto avoid unnecessary I/O.\n\nThis is exactly what Delta Lake’s [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) feature is about. As new data is\ninserted into a Delta Lake table, file-level min/max statistics are collected for all\ncolumns (including nested ones) of supported types. Then, when there’s a lookup\nquery against the table, Delta Lake first consults these statistics in order to determine\nwhich files can safely be skipped.\n\n**Want to learn more about data skipping and Z-Ordering, including**\n**how to apply it within a cybersecurity analysis?**\n\n[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)\n\n\n**Using data skipping and Z-Order clustering**\nData skipping and Z-Ordering are used to improve the performance of needle-in-thehaystack queries against huge data sets. 
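With the placeholders in the OPTIMIZE command above filled in using hypothetical names (an `events` table partitioned by `date` and frequently filtered on `srcIP`), the command looks like this. ZORDER clustering is available on Databricks and in recent open-source Delta Lake releases; `spark` is assumed to be a session on such a runtime.

```python
# Compact files in the selected partitions and co-locate data on srcIP so that
# file-level min/max statistics let data skipping prune more files.
# Table and column names are hypothetical.
spark.sql("""
    OPTIMIZE events
    WHERE date >= '2021-01-01'
    ZORDER BY (srcIP)
""")
```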
Data skipping is an automatic feature of\nDelta Lake, kicking in whenever your SQL queries or data set operations include filters\nof the form “column op literal,” where:\n\n- column is an attribute of some Delta Lake table, be it top-level or nested, whose\ndata type is string / numeric / date/ timestamp\n\n- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n\n\n- literal is an explicit (list of) value(s) of the same data type as a column\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "862d197d353a337dcbecd6b04a9c76ab", + "[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)\n\n\n**Using data skipping and Z-Order clustering**\nData skipping and Z-Ordering are used to improve the performance of needle-in-thehaystack queries against huge data sets. Data skipping is an automatic feature of\nDelta Lake, kicking in whenever your SQL queries or data set operations include filters\nof the form “column op literal,” where:\n\n- column is an attribute of some Delta Lake table, be it top-level or nested, whose\ndata type is string / numeric / date/ timestamp\n\n- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n\n\n- literal is an explicit (list of) value(s) of the same data type as a column\n\n\n-----\n\n**Features**\nUse Delta Lake’s robust features\nto reliably manage your data\n\n## CHAPTER 02\n\n\n-----\n\n**Why Use MERGE**\n**With Delta Lake?**\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , the next-generation engine built on top of Apache Spark, supports the\nMERGE command, which allows you to efficiently upsert and delete records in your\ndata lakes.\n\nMERGE dramatically simplifies how a number of common data pipelines can be built\n-- all the complicated multi-hop processes that inefficiently rewrote entire partitions\ncan now be replaced by simple MERGE queries.\n\nThis finer-grained update capability simplifies how you build your big data\npipelines for various use cases ranging from change data capture to GDPR. You\nno longer need to write complicated logic to overwrite tables and overcome a lack\nof snapshot isolation.\n\nWith changing data, another critical capability required is the ability to roll back, in\ncase of bad writes. Delta Lake also offers [rollback capabilities with the Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n[feature](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) , so that if you do a bad merge, you can easily roll back to an earlier version.\n\nIn this chapter, we’ll discuss common use cases where existing data might need to be\nupdated or deleted. We’ll also explore the challenges inherent to upserts and explain\nhow MERGE can address them.\n\n\n-----\n\n**When are upserts necessary?**\nThere are a number of common use cases where existing data in a data lake needs to\nbe updated or deleted:\n\n- \u0007 **General Data Protection Regulation (GDPR) compliance:** With the introduction of\nthe right to be forgotten (also known as data erasure) in GDPR, organizations must\nremove a user’s information upon request. 
This data erasure includes deleting user\ninformation in the data lake as well.\n\n- **Change data capture from traditional databases:** In a service-oriented\narchitecture, typically web and mobile applications are served by microservices\nbuilt on traditional SQL/NoSQL databases that are optimized for low latency. One\nof the biggest challenges organizations face is joining data across these various\nsiloed data systems, and hence data engineers build pipelines to consolidate\nall data sources into a central data lake to facilitate analytics. These pipelines\noften have to periodically read changes made on a traditional SQL/NoSQL table\nand apply them to corresponding tables in the data lake. Such changes can take\nvarious forms: Tables with slowly changing dimensions, change data capture of all\ninserted/updated/deleted rows, etc.\n\n- \u0007 **Sessionization:** Grouping multiple events into a single session is a common use\ncase in many areas ranging from product analytics to targeted advertising to\npredictive maintenance. Building continuous applications to track sessions and\nrecording the results that write into data lakes is difficult because data lakes have\nalways been optimized for appending data.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "ce66f1cf55c56b762dad363ec5618ccf", + "- **Change data capture from traditional databases:** In a service-oriented\narchitecture, typically web and mobile applications are served by microservices\nbuilt on traditional SQL/NoSQL databases that are optimized for low latency. One\nof the biggest challenges organizations face is joining data across these various\nsiloed data systems, and hence data engineers build pipelines to consolidate\nall data sources into a central data lake to facilitate analytics. These pipelines\noften have to periodically read changes made on a traditional SQL/NoSQL table\nand apply them to corresponding tables in the data lake. Such changes can take\nvarious forms: Tables with slowly changing dimensions, change data capture of all\ninserted/updated/deleted rows, etc.\n\n- \u0007 **Sessionization:** Grouping multiple events into a single session is a common use\ncase in many areas ranging from product analytics to targeted advertising to\npredictive maintenance. Building continuous applications to track sessions and\nrecording the results that write into data lakes is difficult because data lakes have\nalways been optimized for appending data.\n\n- **\u0007De-duplication:** A common data pipeline use case is to collect system logs into a\nDelta Lake table by appending data to the table. However, often the sources can\ngenerate duplicate records and downstream de-duplication steps are needed to\ntake care of them.\n\n\n-----\n\n**Why upserts into data lakes have**\n**traditionally been challenging**\nSince data lakes are fundamentally based on files, they have always been optimized\nfor appending data rather than for changing existing data. Hence, building the above\nuse case has always been challenging.\n\nUsers typically read the entire table (or a subset of partitions) and then overwrite\nthem. Therefore, every organization tries to reinvent the wheel for their requirement\nby handwriting complicated queries in SQL, Spark, etc. 
This approach is:\n\n- **\u0007Inefficient:** Reading and rewriting entire partitions (or entire tables) to update a few\nrecords causes pipelines to be slow and costly. Hand-tuning the table layout and\nquery optimization is tedious and requires deep domain knowledge.\n\n- **\u0007Possibly incorrect:** Handwritten code modifying data is very prone to logical and\nhuman errors. For example, multiple pipelines concurrently modifying the same table\nwithout any transactional support can lead to unpredictable data inconsistencies\nand in the worst case, data losses. Often, even a single handwritten pipeline can\neasily cause data corruptions due to errors in encoding the business logic.\n\n- **\u0007Hard to maintain:** Fundamentally such handwritten code is hard to understand,\nkeep track of and maintain. In the long term, this alone can significantly increase\nthe organizational and infrastructural costs.\n\n**Introducing MERGE in Delta Lake**\nWith Delta Lake, you can easily address the use cases above without any of the\naforementioned problems using the following MERGE command:\n\nMERGE INTO\n\nUSING\n\nON\n\n[ WHEN MATCHED [ AND ] THEN ]\n\n\n\n[ WHEN NOT MATCHED [ AND ] THEN ]\n\nwhere\n\n=\n\nDELETE |\n\nUPDATE SET - |\n\nUPDATE SET column1 = value1 [, column2 = value2 ...]\n\n=\n\nINSERT - |\n\nINSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])\n\nLet’s understand how to use MERGE with a simple example. Suppose you have a\n[slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses.\nFurthermore, you have a table of new addresses for both existing and new users. To\nmerge all the new addresses to the main user table, you can run the following:\n\nMERGE INTO users\n\nUSING updates\n\nON users.userId = updates.userId\n\nWHEN MATCHED THEN\n\nUPDATE SET address = updates.addresses\n\nWHEN NOT MATCHED THEN\nINSERT (userId, address) VALUES (updates.userId, updates.address)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "b302cede7dc4417b382ced696712982a", + "where\n\n=\n\nDELETE |\n\nUPDATE SET - |\n\nUPDATE SET column1 = value1 [, column2 = value2 ...]\n\n=\n\nINSERT - |\n\nINSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])\n\nLet’s understand how to use MERGE with a simple example. Suppose you have a\n[slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses.\nFurthermore, you have a table of new addresses for both existing and new users. To\nmerge all the new addresses to the main user table, you can run the following:\n\nMERGE INTO users\n\nUSING updates\n\nON users.userId = updates.userId\n\nWHEN MATCHED THEN\n\nUPDATE SET address = updates.addresses\n\nWHEN NOT MATCHED THEN\nINSERT (userId, address) VALUES (updates.userId, updates.address)\n\nThis will perform exactly what the syntax says -- for existing users (i.e., MATCHED\nclause), it will update the address column, and for new users (i.e., NOT MATCHED\nclause) it will insert all the columns. For large tables with TBs of data, this Delta Lake\nMERGE operation can be orders of magnitude faster than overwriting entire partitions\nor tables since Delta Lake reads only relevant files and updates them. 
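The same upsert can be expressed with the Python `DeltaTable` API, sketched below under the assumption that `users` is an existing Delta table at a hypothetical path and `updates` is a DataFrame with `userId` and `address` columns.

```python
from delta.tables import DeltaTable

# `spark` is assumed to be a Delta-enabled SparkSession; paths are hypothetical.
users = DeltaTable.forPath(spark, "/tmp/delta/users")
updates = spark.read.format("delta").load("/tmp/delta/user_address_updates")

# Python equivalent of the MERGE INTO users ... statement above.
(users.alias("users")
    .merge(updates.alias("updates"), "users.userId = updates.userId")
    .whenMatchedUpdate(set={"address": "updates.address"})
    .whenNotMatchedInsert(values={
        "userId": "updates.userId",
        "address": "updates.address",
    })
    .execute())
```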
Specifically,\nDelta Lake's MERGE has the following advantages:\n\n\n\n[ WHEN MATCHED [ AND ] THEN ]\n\n\n-----\n\n**Simplifying use cases with MERGE**\n**Deleting data due to GDPR**\nComplying with the “right to be forgotten” clause of GDPR for data in data lakes cannot\nget any easier. You can set up a simple scheduled job with an example code, like\nbelow, to delete all the users who have opted out of your service.\n\nMERGE INTO users\n\nUSING opted_out_users\n\nON opted_out_users.userId = users.userId\n\nWHEN MATCHED THEN DELETE\n\n**Applying change data from databases**\nYou can easily apply all data changes — updates, deletes, inserts — generated from an\nexternal database into a Delta Lake table with the MERGE syntax as follows:\n\nMERGE INTO users\n\nUSING (\n\nSELECT userId, latest.address AS address, latest.deleted AS deleted FROM\n\n(\n\nSELECT userId, MAX(struct(TIME, address, deleted)) AS latest\n\nFROM changes GROUP BY userId\n\n)\n\n) latestChange\n\nON latestChange.userId = users.userId\n\nWHEN MATCHED AND latestChange.deleted = TRUE THEN\n\nDELETE\n\nWHEN MATCHED THEN\n\nUPDATE SET address = latestChange.address\n\nWHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n\nINSERT (userId, address) VALUES (userId, address)\n\n\n\n- **\u0007Fine-grained:** The operation rewrites data at the granularity of files and not\npartitions. This eliminates all the complications of rewriting partitions, updating\nthe Hive metastore with MSCK and so on.\n\n- **\u0007Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to\nrewrite, thus eliminating the need to hand-optimize your pipeline. Furthermore,\nDelta Lake with all its I/O and processing optimizations makes all the reading and\nwriting data by MERGE significantly faster than similar operations in Apache Spark.\n\n- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\nconcurrent writers update the data correctly with ACID transactions, and concurrent\nreaders always see a consistent snapshot of the data.\n\nHere is a visual explanation of how MERGE compares with handwritten pipelines.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "2a2ccd995214417b940431182cb57108", + "UPDATE SET address = latestChange.address\n\nWHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n\nINSERT (userId, address) VALUES (userId, address)\n\n\n\n- **\u0007Fine-grained:** The operation rewrites data at the granularity of files and not\npartitions. This eliminates all the complications of rewriting partitions, updating\nthe Hive metastore with MSCK and so on.\n\n- **\u0007Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to\nrewrite, thus eliminating the need to hand-optimize your pipeline. 
Furthermore,\nDelta Lake with all its I/O and processing optimizations makes all the reading and\nwriting data by MERGE significantly faster than similar operations in Apache Spark.\n\n- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\nconcurrent writers update the data correctly with ACID transactions, and concurrent\nreaders always see a consistent snapshot of the data.\n\nHere is a visual explanation of how MERGE compares with handwritten pipelines.\n\n\n-----\n\n**Updating session information from streaming**\n**pipelines**\nIf you have streaming event data flowing in and if you want to sessionize the streaming\nevent data and incrementally update and store sessions in a Delta Lake table, you\ncan accomplish this using the foreachBatch in Structured Streaming and MERGE.\nFor example, suppose you have a Structured Streaming DataFrame that computes\nupdated session information for each user. You can start a streaming query that\napplies all the sessions update to a Delta Lake table as follows (Scala).\n\nstreamingSessionUpdatesDF.writeStream\n\n.foreachBatch { (microBatchOutputDF: DataFrame , batchId: Long ) =>\n\nmicroBatchOutputDF.createOrReplaceTempView(“updates”)\n\nmicroBatchOutputDF.sparkSession.sql(s”””\n\nMERGE INTO sessions\n\nUSING updates\n\nON sessions.sessionId = updates.sessionId\n\nWHEN MATCHED THEN UPDATE SET *\n\nWHEN NOT MATCHED THEN INSERT * “”” )\n\n}.start()\n\nFor a complete working example of each Batch and MERGE, see this notebook\n( [Azure](https://docs.azuredatabricks.net/_static/notebooks/merge-in-streaming.html) | [AWS](https://docs.databricks.com/_static/notebooks/merge-in-streaming.html) ).\n\n**Additional resources**\n\n[Tech Talk | Addressing GDPR and CCPA Scenarios With Delta Lake and Apache Spark](https://www.youtube.com/watch?v=tCPslvUjG1w)\n\n[Tech Talk | Using Delta as a Change Data Capture Source](https://www.youtube.com/watch?v=7y0AAQ6qX5w)\n\n[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n\n[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n\n[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n\n\n-----\n\n**Simple, Reliable Upserts and**\n**Deletes on Delta Lake Tables**\n**Using Python APIs**\n\nIn this chapter, we will demonstrate how to use Python and the new Python APIs in Delta\nLake within the context of an on-time flight performance scenario. 
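Since the remainder of the examples work in Python, the Scala foreachBatch pattern shown earlier translates to PySpark roughly as follows. Here `streaming_session_updates_df` stands in for the streaming DataFrame of computed session updates, and the sessions table path is a hypothetical placeholder.

```python
from delta.tables import DeltaTable

SESSIONS_PATH = "/tmp/delta/sessions"  # hypothetical, already-existing sessions table

def upsert_sessions(micro_batch_df, batch_id):
    # MERGE each micro-batch of session updates into the sessions Delta table.
    (DeltaTable.forPath(micro_batch_df.sparkSession, SESSIONS_PATH)
        .alias("sessions")
        .merge(micro_batch_df.alias("updates"),
               "sessions.sessionId = updates.sessionId")
        .whenMatchedUpdateAll()
        .whenNotMatchedInsertAll()
        .execute())

# `streaming_session_updates_df` is assumed to be a Structured Streaming
# DataFrame that computes updated session information for each user.
(streaming_session_updates_df.writeStream
    .foreachBatch(upsert_sessions)
    .outputMode("update")
    .start())
```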
We will show how\nto upsert and delete data, query old versions of data with time travel, and vacuum\nolder versions for cleanup.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "73e577990b7c3a09d1aafcdc124d9ed0", + "[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n\n[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n\n[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n\n\n-----\n\n**Simple, Reliable Upserts and**\n**Deletes on Delta Lake Tables**\n**Using Python APIs**\n\nIn this chapter, we will demonstrate how to use Python and the new Python APIs in Delta\nLake within the context of an on-time flight performance scenario. We will show how\nto upsert and delete data, query old versions of data with time travel, and vacuum\nolder versions for cleanup.\n\n**How to start using Delta Lake**\nThe Delta Lake package is installable through PySpark by using the --packages\noption. In our example, we will also demonstrate the ability to VACUUM files and execute\nDelta Lake SQL commands within Apache Spark. As this is a short demonstration, we\nwill also enable the following configurations:\n\n\u0007spark.databricks.delta.retentionDurationCheck.enabled=false\n\nto allow us to vacuum files shorter than the default retention duration of seven days.\nNote, this is only required for the SQL command VACUUM\n\n\u0007spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension\n\nto enable Delta Lake SQL commands within Apache Spark; this is not required for\nPython or Scala API calls.\n\n# Using Spark Packages\n\n./bin/pyspark --packages io.delta:delta-core_2.11:0.4.0 --conf “spark.\n\ndatabricks.delta.retentionDurationCheck.enabled=false” --conf “spark.\n\nsql.extensions=io.delta.sql.DeltaSparkSessionExtension”\n\n\n-----\n\n**Loading and saving our Delta Lake data**\nThis scenario will be using the On-Time Flight Performance or Departure Delays data\nset generated from the RITA BTS Flight Departure Statistics; some examples of this data\nin action include the and OnTime Flight Performance with GraphFrames for Apache Spark™. 
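As an aside, recent delta-spark releases also allow building the same kind of session programmatically instead of passing `--packages` on the command line. A sketch follows; the app name is illustrative, and the relaxed retention check is only needed because this short demo vacuums files younger than the default seven-day threshold.

```python
import pyspark
from delta import configure_spark_with_delta_pip

builder = (
    pyspark.sql.SparkSession.builder.appName("delta-quickstart")
    .config("spark.sql.extensions", "io.delta.sql.DeltaSparkSessionExtension")
    .config("spark.sql.catalog.spark_catalog",
            "org.apache.spark.sql.delta.catalog.DeltaCatalog")
    # Only needed because this demo vacuums files younger than the default
    # seven-day retention threshold.
    .config("spark.databricks.delta.retentionDurationCheck.enabled", "false")
)
spark = configure_spark_with_delta_pip(builder).getOrCreate()
```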
Within PySpark, start [2014 Flight Departure Performance via d3.js Crossfilter](https://dennyglee.com/2014/06/06/2014-flight-departure-performance-via-d3-js-crossfilter/)\nby reading the data set.\n\n\u0007# Location variables\n\n\n/departureDelays.delta$ ls l\n\n.\n\n..\n\n_delta_log\n\npart- 00000 -df6f69ea-e6aa- 424b -bc0e-f3674c4f1906-c000.snappy.parquet\n\npart- 00001 -711bcce3-fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n\npart- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n\nPart- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n\n\ntripdelaysFilePath = “/root/data/departuredelays.csv”\n\npathToEventsTable = “/root/deltalake/departureDelays.delta”\n\nNow, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n\n# Read flight delay data\n\n\ndepartureDelays = spark.read \\\n\n.option( “header” , “true” ) \\\n\n.option( “inferSchema” , “true” ) \\", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "2e0b5f9b3069618ee6c293d23c466506", + "part- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n\nPart- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n\n\ntripdelaysFilePath = “/root/data/departuredelays.csv”\n\npathToEventsTable = “/root/deltalake/departureDelays.delta”\n\nNow, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n\n# Read flight delay data\n\n\ndepartureDelays = spark.read \\\n\n.option( “header” , “true” ) \\\n\n.option( “inferSchema” , “true” ) \\\n\n.csv(tripdelaysFilePath)\n\nNext, let’s save our departureDelays data set to a Delta Lake table. By saving this table\nto Delta Lake storage, we will be able to take advantage of its features including ACID\ntransactions, unified batch and streaming and time travel.\n\n# Save flight delay data into Delta Lake format\n\ndepartureDelays \\\n\n.write \\\n\n\n# Load flight delay data in Delta Lake format\n\ndelays_delta = spark \\\n\n.read \\\n\n.format( “delta” ) \\\n\n.load( “departureDelays.delta” )\n\n# Create temporary view\n\ndelays_delta.createOrReplaceTempView(“delays_delta”)\n\n# How many flights are between Seattle and San Francisco\n\nspark.sql(“select count(1) from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’”).show()\n\n\n.format( “delta” ) \\\n\n.mode( “overwrite” ) \\\n\n.save( “departureDelays.delta” )\n\nNote, this approach is similar to how you would normally save Parquet data; instead\nof specifying format(“parquet”) , you will now specify format(“delta”) . If\nyou were to take a look at the underlying file system, you will notice four files created\nfor the departureDelays Delta Lake table.\n\n\n-----\n\nFinally, lets determine the number of flights originating from Seattle to San Francisco; in\nthis data set, there are 1698 flights.\n\n**In-place conversion to Delta Lake**\nIf you have existing Parquet tables, you have the ability to convert them to Delta Lake\nformat in place, thus not needing to rewrite your table. To convert the table, you can\nrun the following commands.\n\n\ndeltaTable DeltaTable .forPath(spark, pathToEventsTable\n\n)\n\n# Delete all on-time and early flights\n\ndeltaTable. 
delete ( “delay < 0” )\n\n# How many flights are between Seattle and San Francisco\n\nspark.sql( “select count(1) from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’” ).show()\n\n\nfrom delta.tables import - \n\n# Convert non partitioned parquet table at path ‘/path/to/table’\n\ndeltaTable = DeltaTable .convertToDelta(spark, “parquet.`/path/to/\n\ntable`” )\n\n# Convert partitioned parquet table at path ‘/path/to/table’ and\n\npartitioned by integer column named ‘part’\n\n\nAfter we delete (more on this below) all of the on-time and early flights, as you can\nsee from the preceding query there are 837 late flights originating from Seattle to\nSan Francisco. If you review the file system, you will notice there are more files even\nthough you deleted data.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 -a2a19ba4- 17e 9- 4931 - 9bbf - 3c9d4997780 b-c000.snappy.parquet", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "9f6993ce559f866ae52a5b376d3ad1eb", + "destination = ‘SFO’” ).show()\n\n\nfrom delta.tables import - \n\n# Convert non partitioned parquet table at path ‘/path/to/table’\n\ndeltaTable = DeltaTable .convertToDelta(spark, “parquet.`/path/to/\n\ntable`” )\n\n# Convert partitioned parquet table at path ‘/path/to/table’ and\n\npartitioned by integer column named ‘part’\n\n\nAfter we delete (more on this below) all of the on-time and early flights, as you can\nsee from the preceding query there are 837 late flights originating from Seattle to\nSan Francisco. If you review the file system, you will notice there are more files even\nthough you deleted data.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 -a2a19ba4- 17e 9- 4931 - 9bbf - 3c9d4997780 b-c000.snappy.parquet\n\npart-00000-df6f69ea-e6aa-424b-bc0e-f3674c4f1906-c000.snappy.parquet\n\npart- 00001 - 711bcce3 -fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n\npart- 00001 -a0423a18- 62eb - 46b3 -a82f-ca9aac1f1e93-c000.snappy.parquet\n\npart- 00002 - 778ba97d - 89b8 - 4942 -a495-5f6238830b68-c000.snappy.parquet\n\npart- 00002 -bfaa0a2a- 0a31 - 4abf -aa63- 162402f802cc -c000.snappy.parquet\n\npart- 00003 - 1a791c4a - 6f11 - 49a8 -8837- 8093a3220581 -c000.snappy.parquet\n\npart- 00003 -b0247e1d-f5ce- 4b45 - 91cd - 16413c784a66 -c000.snappy.parquet\n\n\npartitionedDeltaTable = DeltaTable .convertToDelta(spark,\n\n“parquet.`/path/to/table`”, “part int” )\n\n**Delete our flight data**\nTo delete data from a traditional data lake table, you will need to:\n\n1. Select all of the data from your table not including the rows you want to delete\n2. Create a new table based on the previous query\n3. Delete the original table\n4. Rename the new table to the original table name for downstream dependencies\n\nInstead of performing all of these steps, with Delta Lake, we can simplify this process\nby running a DELETE statement. To show this, let’s delete all of the flights that had\narrived early or on-time (i.e., delay < 0).\n\n\nfrom delta.tables import - \n\nfrom pyspark.sql.functions import - \n\n# Access the Delta Lake table\n\n\n-----\n\nIn traditional data lakes, deletes are performed by rewriting the entire table\nexcluding the values to be deleted. With Delta Lake, deletes are instead performed\nby selectively writing new versions of the files containing the data to be deleted and\nonly marks the previous files as deleted. 
This is because Delta Lake uses multiversion\nconcurrency control (MVCC) to do atomic operations on the table: For example, while\none user is deleting data, another user may be querying the previous version of the\ntable. This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\nand query previous versions as we will see later.\n\n**Update our flight data**\nTo update data from your traditional Data Lake table, you will need to:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "ac06a55857e208aef4a47cafe26630cf", + "from delta.tables import - \n\nfrom pyspark.sql.functions import - \n\n# Access the Delta Lake table\n\n\n-----\n\nIn traditional data lakes, deletes are performed by rewriting the entire table\nexcluding the values to be deleted. With Delta Lake, deletes are instead performed\nby selectively writing new versions of the files containing the data to be deleted and\nonly marks the previous files as deleted. This is because Delta Lake uses multiversion\nconcurrency control (MVCC) to do atomic operations on the table: For example, while\none user is deleting data, another user may be querying the previous version of the\ntable. This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\nand query previous versions as we will see later.\n\n**Update our flight data**\nTo update data from your traditional Data Lake table, you will need to:\n\n1. Select all of the data from your table not including the rows you want to modify\n2. Modify the rows that need to be updated/changed\n3. Merge these two tables to create a new table\n4. Delete the original table\n5. Rename the new table to the original table name for downstream dependencies\n\nInstead of performing all of these steps, with Delta Lake, we can simplify this\nprocess by running an UPDATE statement. To show this, let’s update all of the flights\noriginating from Detroit to Seattle.\n\n\nWith the Detroit flights now tagged as Seattle flights, we now have 986 flights\noriginating from Seattle to San Francisco. If you were to list the file system for\nyour departureDelays folder (i.e., $../departureDelays/ls -l ), you will\nnotice there are now 11 files (instead of the 8 right after deleting the files and the four\nfiles after creating the table).\n\n**Merge our flight data**\nA common scenario when working with a data lake is to continuously append data to\nyour table. This often results in duplicate data (rows you do not want to be inserted\ninto your table again), new rows that need to be inserted, and some rows that need to\nbe updated. 
With Delta Lake, all of this can be achieved by using the merge operation\n(similar to the SQL MERGE statement).\n\nLet’s start with a sample data set that you will want to be updated, inserted or\nde-duplicated with the following query.\n\n\n# Update all flights originating from Detroit to now be\n\n\noriginating from Seattle\n\ndeltaTable.update(“origin = ‘DTW’”, { “origin”: “’SEA’” } )\n\n\n# What flights between SEA and SFO for these date periods\n\nspark.sql( “select * from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n\n\n# How many flights are between Seattle and San Francisco\n\n\nThe output of this query looks like the following table. Note, the color-coding has been\nadded to clearly identify which rows are de-duplicated (blue), updated (yellow) and\ninserted (green).\n\n\nspark.sql( “select count(1) from delays_delta where origin = ‘SEA’\n\nand destination = ‘SFO’” ).show()\n\n\n-----\n\nNext, let’s generate our own merge_table that contains data we will insert, update\nor de-duplicate with the following code snippet.\n\nitems = [( 1010710 , 31 , 590 , ‘SEA’, ‘SFO’), ( 1010521 , 10 , 590 ,\n\n‘SEA’ , ‘SFO’ ),\n\n(1010822, 31, 590, ‘SEA’, ‘SFO’)]\n\n\nWith Delta Lake, this can be easily achieved via a merge statement as noted in the\nfollowing code snippet.\n\n# Merge merge_table with flights\n\ndeltaTable. alias( “flights” ) \\\n\n.merge(merge_table. alias ( “updates”),”flights.date =\n\nupdates.date” ) \\", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "fd039f130d7c2e0fb95ead61b4197d3b", + "spark.sql( “select count(1) from delays_delta where origin = ‘SEA’\n\nand destination = ‘SFO’” ).show()\n\n\n-----\n\nNext, let’s generate our own merge_table that contains data we will insert, update\nor de-duplicate with the following code snippet.\n\nitems = [( 1010710 , 31 , 590 , ‘SEA’, ‘SFO’), ( 1010521 , 10 , 590 ,\n\n‘SEA’ , ‘SFO’ ),\n\n(1010822, 31, 590, ‘SEA’, ‘SFO’)]\n\n\nWith Delta Lake, this can be easily achieved via a merge statement as noted in the\nfollowing code snippet.\n\n# Merge merge_table with flights\n\ndeltaTable. alias( “flights” ) \\\n\n.merge(merge_table. alias ( “updates”),”flights.date =\n\nupdates.date” ) \\\n\n.whenMatchedUpdate(set = { “delay” : “updates.delay” } ) \\\n\n.whenNotMatchedInsertAll() \\\n\n.execute()\n\n# What flights between SEA and SFO for these date periods\n\nspark.sql( “select * from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n\n\ncols = [ ‘date’ , ‘delay’ , ‘distance’ , ‘origin’ , ‘destination’ ]\n\n\nmerge_table = spark.createDataFrame(items, cols)\n\nmerge_table.toPandas()\n\nIn the preceding table ( merge_table ), there are three rows with a unique date value:\n\n1. 1010521: This row needs to _update_ the _flights_ table with a new delay value (yellow)\n2. 1010710: This row is a _duplicate_ (blue)\n3. 1010832: This is a new row to be _inserted_ (green)\n\n\nAll three actions of de-duplication, update and insert were efficiently completed with\none statement.\n\n**View table history**\nAs previously noted, after each of our transactions (delete, update), there were more\nfiles created within the file system. 
This is because for each transaction, there are\ndifferent versions of the Delta Lake table.\n\n\n-----\n\nThis can be seen by using the DeltaTable.history() method as noted below\n\nNote: You can also perform the same task with SQL:\n\nspark.sql(“DESCRIBE HISTORY ‘” + pathToEventsTable + “’”).show()\n\nAs you can see, there are three rows representing the different versions of the table\n(below is an abridged version to help make it easier to read) for each of the operations\n(create table, delete and update):\n\n**Travel back in time with table history**\nWith Time Travel, you can review the Delta Lake table as of the version or timestamp.\nTo view historical data, specify the version or timestamp option; in the following code\nsnippet, we will specify the version option.\n\n\n# Load DataFrames for each version\n\ndfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n0 ).load( “departureDelays.delta” )\n\ndfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n\n1 ).load( “departureDelays.delta” )\n\ndfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n2 ).load( “departureDelays.delta” )\n\n# Calculate the SEA to SFO flight counts for each version of history\n\ncnt0 = dfv0. where( “origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\ncnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "230f954a58a1e8798d1b7d8266b0f3ed", + "# Load DataFrames for each version\n\ndfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n0 ).load( “departureDelays.delta” )\n\ndfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n\n1 ).load( “departureDelays.delta” )\n\ndfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n2 ).load( “departureDelays.delta” )\n\n# Calculate the SEA to SFO flight counts for each version of history\n\ncnt0 = dfv0. where( “origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\ncnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\ncnt2 = dfv2. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\n# Print out the value\n\nprint ( “SEA -> SFO Counts: Create Table: %s, Delete: %s, Update: %s” %\n\n(cnt0, cnt1, cnt2))\n\n## Output\n\nSEA -> SFO Counts : Create Table: 1698 , Delete: 837, Update: 986\n\nWhether for governance, risk management and compliance (GRC) or rolling back\nerrors, the Delta Lake table contains both the metadata (e.g., recording the fact that a\ndelete had occurred with these operators) and data (e.g., the actual rows deleted). But\nhow do we remove the data files either for compliance or size reasons?\n\n**Clean up old table versions with vacuum**\nThe [Delta Lake vacuum](https://docs.delta.io/0.7.0/delta-utility.html#vacuum) method will delete all of the rows (and files) by default that are\nolder than seven days’ reference. 
If you were to view the file system, you’ll notice the\n11 files for your table.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 - 5e52736b -0e63- 48f3 - 8d56 - 50f7cfa0494d -c000.snappy.parquet\n\npart- 00000 - 69eb53d5 - 34b4 - 408f -a7e4- 86e000428c37 -c000.snappy.parquet\n\n\n-----\n\npart- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n\npart- 00001 - 20893eed - 9d4f - 4c1f -b619- 3e6ea1fdd05f -c000.snappy.parquet\n\npart- 00001 - 9b68b9f6 - bad3 - 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n\npart- 00001 - d4823d2e - 8f9d - 42e3 - 918d - 4060969e5844 -c000.snappy.parquet\n\npart- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n\npart- 00002 - 3027786c - 20a9 - 4b19 - 868d -dc7586c275d4-c000.snappy.parquet\n\npart- 00002 -f2609f27- 3478 - 4bf9 -aeb7- 2c78a05e6ec1 -c000.snappy.parquet", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "ca33594916a7cf4ce26423a0d3d5f337", + "part- 00001 - 9b68b9f6 - bad3 - 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n\npart- 00001 - d4823d2e - 8f9d - 42e3 - 918d - 4060969e5844 -c000.snappy.parquet\n\npart- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n\npart- 00002 - 3027786c - 20a9 - 4b19 - 868d -dc7586c275d4-c000.snappy.parquet\n\npart- 00002 -f2609f27- 3478 - 4bf9 -aeb7- 2c78a05e6ec1 -c000.snappy.parquet\n\npart- 00003 - 850436a6 -c4dd- 4535 -a1c0- 5dc0f01d3d55 -c000.snappy.parquet\n\nPart- 00003 -b9292122- 99a7 -4223-aaa9- 8646c281f199 -c000.snappy.parquet\n\nTo delete all of the files so that you only keep the current snapshot of data, you will specify a\nsmall value for the vacuum method (instead of the default retention of 7 days).\n\n# Remove all files older than 0 hours old.\n\ndeltaTable.vacuum( 0 )\n\nNote , you perform the same task via SQL syntax:¸\n\n# Remove all files older than 0 hours old\n\nspark.sql(“VACUUM ‘” + pathToEventsTable + “‘ RETAIN 0 HOURS”)\n\nOnce the vacuum has completed, when you review the file system you will notice fewer\nfiles as the historical data has been removed.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n\npart- 00001 - 9b68b9f6 -bad3- 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n\npart- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n\npart- 00003 -b9292122- 99a7 - 4223 -aaa9- 8646c281f199 -c000.snappy.parquet\n\nNote, the ability to time travel back to a version older than the retention period is lost\nafter running vacuum.\n\n\n-----\n\n**Time Travel for**\n**Large-Scale Data Lakes**\n\nTime travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . [Delta Lake](https://delta.io/) is an [open-source storage](https://github.com/delta-io/delta)\n[layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable\nmetadata handling, and unifies streaming and batch data processing. Delta Lake runs on\ntop of your existing data lake and is fully compatible with Apache Spark APIs.\n\nWith this feature, Delta Lake automatically versions the big data that you store in your\ndata lake, and you can access any historical version of that data. 
This temporal data\nmanagement simplifies your data pipeline by making it easy to audit, roll back data\nin case of accidental bad writes or deletes, and reproduce experiments and reports.\n\nYour organization can finally standardize on a clean, centralized, versioned big data\nrepository in your own cloud storage for your analytics.\n\n**Common challenges with changing data**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "88cd3f53e2c6a1283a4224629fc87774", + "Time travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . [Delta Lake](https://delta.io/) is an [open-source storage](https://github.com/delta-io/delta)\n[layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable\nmetadata handling, and unifies streaming and batch data processing. Delta Lake runs on\ntop of your existing data lake and is fully compatible with Apache Spark APIs.\n\nWith this feature, Delta Lake automatically versions the big data that you store in your\ndata lake, and you can access any historical version of that data. This temporal data\nmanagement simplifies your data pipeline by making it easy to audit, roll back data\nin case of accidental bad writes or deletes, and reproduce experiments and reports.\n\nYour organization can finally standardize on a clean, centralized, versioned big data\nrepository in your own cloud storage for your analytics.\n\n**Common challenges with changing data**\n\n- **Audit data changes:** Auditing data changes is critical both in terms of data\ncompliance as well as simple debugging to understand how data has changed over\ntime. Organizations moving from traditional data systems to big data technologies\nand the cloud struggle in such scenarios.\n\n- **Reproduce experiments and reports:** During model training, data scientists\nrun various experiments with different parameters on a given set of data. When\nscientists revisit their experiments after a period of time to reproduce the models,\ntypically the source data has been modified by upstream pipelines. A lot of times,\nthey are caught unaware by such upstream data changes and hence struggle to\nreproduce their experiments. Some scientists and organizations engineer best\n\n\n-----\n\npractices by creating multiple copies of the data, leading to increased storage\ncosts. The same is true for analysts generating reports.\n\n- **Rollbacks:** Data pipelines can sometimes write bad data for downstream consumers.\n\nThis can happen because of issues ranging from infrastructure instabilities to messy\ndata to bugs in the pipeline. For pipelines that do simple appends to directories or a\ntable, rollbacks can easily be addressed by date-based partitioning. With updates\nand deletes, this can become very complicated, and data engineers typically have\nto engineer a complex pipeline to deal with such scenarios.\n\n**Working with Time Travel**\nDelta Lake’s time travel capabilities simplify building data pipelines for the above use\ncases. Time Travel in Delta Lake improves developer productivity tremendously. 
It helps:\n\n- Data scientists manage their experiments better\n\n- Data engineers simplify their pipelines and roll back bad writes\n\n- Data analysts do easy reporting\n\nOrganizations can finally standardize on a clean, centralized, versioned big data\nrepository in their own cloud storage for analytics. We are thrilled to see what you will\nbe able to accomplish with this feature.\n\nAs you write into a Delta Lake table or directory, every operation is automatically\nversioned. You can access the different versions of the data two different ways:\n\n**1. Using a timestamp**\n**Scala syntax**\nYou can provide the timestamp or date string as an option to DataFrame reader:\n\nval df = spark.read\n\n.format( “delta” )\n\n.option( “timestampAsOf” , “2019-01-01” )\n\n.load( “/path/to/my/table” )\n\n\n-----\n\n**Python syntax**\n\ndf = spark.read \\\n\n.format( “delta” ) \\\n\n.option( “timestampAsOf” , “2019-01-01” ) \\\n\n.load( “/path/to/my/table” )\n\n**SQL syntax**\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01”\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01 01:30:00.000”", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "5d0a6e641d2854282bc57f0b4249014a", + "val df = spark.read\n\n.format( “delta” )\n\n.option( “timestampAsOf” , “2019-01-01” )\n\n.load( “/path/to/my/table” )\n\n\n-----\n\n**Python syntax**\n\ndf = spark.read \\\n\n.format( “delta” ) \\\n\n.option( “timestampAsOf” , “2019-01-01” ) \\\n\n.load( “/path/to/my/table” )\n\n**SQL syntax**\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01”\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01 01:30:00.000”\n\nIf the reader code is in a library that you don’t have access to, and if you are passing\ninput parameters to the library to read data, you can still travel back in time for a table\nby passing the timestamp in yyyyMMddHHmmssSSS format to the path:\n\nval inputPath = “/path/to/my/table@20190101000000000”\n\nval df = loadData(inputPath)\n\n// Function in a library that you don’t have access to\n\ndef loadData(inputPath : String ) : DataFrame = {\n\nspark.read\n\n.format(“delta”)\n\n.load(inputPath)\n\n}\n\ninputPath = “/path/to/my/table@20190101000000000”\n\ndf = loadData(inputPath)\n\n# Function in a library that you don’t have access to\n\ndef loadData(inputPath):\n\nreturn spark.read \\\n\n.format( “delta” ) \\\n\n.load(inputPath)\n\n\n-----\n\n**2. 
Using a version number**\nIn Delta Lake, every write has a version number, and you can use the version number\nto travel back in time as well.\n\n**Scala syntax**\n\nval df = spark.read\n\n.format( “delta” )\n\n.option( “versionAsOf” , “5238” )\n\n.load( “/path/to/my/table” )\n\nval df = spark.read\n\n.format( “delta” )\n\n.load( “/path/to/my/table@v5238” )\n\n**Python syntax**\n\ndf = spark.read \\\n\n.format( “delta” ) \\\n\n.option( “versionAsOf” , “5238” ) \\\n\n.load( “/path/to/my/table” )\n\ndf = spark.read \\\n\n.format( “delta” ) \\\n\n.load( “/path/to/my/table@v5238” )\n\n**SQL syntax**\n\nSELECT count(*) FROM my_table VERSION AS OF 5238\n\n\n-----\n\n**Audit data changes**\nYou can look at the history of table changes using the DESCRIBE HISTORY command\nor through the UI.\n\n**Reproduce experiments and reports**\nTime travel also plays an important role in machine learning and data science.\nReproducibility of models and experiments is a key consideration for data scientists\nbecause they often create hundreds of models before they put one into production,\nand in that time-consuming process would like to go back to earlier models. However,\nbecause data management is often separate from data science tools, this is really\nhard to accomplish.\n\nDatabricks solves this reproducibility problem by integrating Delta Lake’s Time\nTravel capabilities with [MLflow](https://mlflow.org/) , an open-source platform for the machine learning\nlifecycle. For reproducible machine learning training, you can simply log a", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "711c60d6bd61a75a8e06d2833cbe0ac5", + ".format( “delta” ) \\\n\n.load( “/path/to/my/table@v5238” )\n\n**SQL syntax**\n\nSELECT count(*) FROM my_table VERSION AS OF 5238\n\n\n-----\n\n**Audit data changes**\nYou can look at the history of table changes using the DESCRIBE HISTORY command\nor through the UI.\n\n**Reproduce experiments and reports**\nTime travel also plays an important role in machine learning and data science.\nReproducibility of models and experiments is a key consideration for data scientists\nbecause they often create hundreds of models before they put one into production,\nand in that time-consuming process would like to go back to earlier models. However,\nbecause data management is often separate from data science tools, this is really\nhard to accomplish.\n\nDatabricks solves this reproducibility problem by integrating Delta Lake’s Time\nTravel capabilities with [MLflow](https://mlflow.org/) , an open-source platform for the machine learning\nlifecycle. For reproducible machine learning training, you can simply log a\n\n\ntimestamped URL to the path as an MLflow parameter to track which version of the\ndata was used for each training job.\n\nThis enables you to go back to earlier settings and data sets to reproduce earlier\nmodels. You neither need to coordinate with upstream teams on the data nor worry\nabout cloning data for different experiments. This is the power of unified analytics,\nwhereby data science is closely married with data engineering.\n\n**Rollbacks**\nTime travel also makes it easy to do rollbacks in case of bad writes. 
For example, if\nyour GDPR pipeline job had a bug that accidentally deleted user information, you can\neasily fix the pipeline:\n\nINSERT INTO my_table\n\nSELECT - FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n\nWHERE userId = 111\n\n\n-----\n\nYou can also fix incorrect updates as follows:\n\n# Will use the latest version of the table for all operations below\n\nMERGE INTO my_table target\n\n\nUSING my_table TIMESTAMP AS OF date_sub(current_date(), 1 ) source\n\nON source.userId = target.userId\n\nWHEN MATCHED THEN UPDATE SET - \n\nIf you simply want to roll back to a previous version of your table, you can do so with\neither of the following commands:\n\nRESTORE TABLE my_table VERSION AS OF [version_number]\n\nRESTORE TABLE my_table TIMESTAMP AS OF [timestamp]\n\n**Pinned view of a continuously updating**\n**Delta Lake table across multiple downstream jobs**\nWith AS OF queries, you can now pin the snapshot of a continuously updating Delta\nLake table for multiple downstream jobs. Consider a situation where a Delta Lake table\nis being continuously updated, say every 15 seconds, and there is a downstream job\nthat periodically reads from this Delta Lake table and updates different destinations.\nIn such scenarios, typically you want a consistent view of the source Delta Lake table\nso that all destination tables reflect the same state.\n\nYou can now easily handle such scenarios as follows:\n\nversion = spark.sql( “SELECT max(version) FROM (DESCRIBE HISTORY\n\nmy_table)” ).collect()\n\n\ndata = spark.table( “my_table@v%s” % version[ 0 ][ 0 ]data.where\n\n( “event_type = e1” ).write.jdbc( “table1” )\n\ndata.where( “event_type = e2” ).write.jdbc( “table2” )\n\n...\n\ndata.where( “event_type = e10” ).write.jdbc( “table10” )\n\n**Queries for time series analytics made simple**\nTime travel also simplifies time series analytics. For example, if you want to find out\nhow many new customers you added over the last week, your query could be a very\nsimple one like this:\n\nSELECT count( distinct userId) - (\n\nSELECT count( distinct userId)\n\nFROM my_table TIMESTAMP AS OF date_sub( current_date (), 7))\n\nFROM my_table\n\n**Additional resources**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "aa0e375a10a06dfdfa31a3d111220cd6", + "my_table)” ).collect()\n\n\ndata = spark.table( “my_table@v%s” % version[ 0 ][ 0 ]data.where\n\n( “event_type = e1” ).write.jdbc( “table1” )\n\ndata.where( “event_type = e2” ).write.jdbc( “table2” )\n\n...\n\ndata.where( “event_type = e10” ).write.jdbc( “table10” )\n\n**Queries for time series analytics made simple**\nTime travel also simplifies time series analytics. 
For example, if you want to find out\nhow many new customers you added over the last week, your query could be a very\nsimple one like this:\n\nSELECT count( distinct userId) - (\n\nSELECT count( distinct userId)\n\nFROM my_table TIMESTAMP AS OF date_sub( current_date (), 7))\n\nFROM my_table\n\n**Additional resources**\n\n[Tech Talk | Diving Into Delta Lake: Unpacking the Transaction Log](https://databricks.com/discover/diving-into-delta-lake-talks/unpacking-transaction-log)\n\n[Tech Talk | Getting Data Ready for Data Science With Delta Lake and MLflow](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks/getting-data-ready-data-science-delta-lake-mlflow)\n\n[Data + AI Summit Europe 2020 | Data Time Travel by Delta Time Machine](https://databricks.com/session_eu20/data-time-travel-by-delta-time-machine-2)\n\n[Spark + AI Summit NA 2020 | Machine Learning Data Lineage With](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n\n[MLflow and Delta Lake](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n\n[Productionizing Machine Learning With Delta Lake](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n\n\n-----\n\n**Easily Clone Your Delta Lake**\n**for Testing, Sharing and ML**\n**Reproducibility**\n\nDelta Lake has a feature called **Table Cloning** , which makes it easy to test, share and\nrecreate tables for ML reproducibility. Creating copies of tables in a data lake or data\nwarehouse has several practical uses. However, given the volume of data in tables\nin a data lake and the rate of its growth, making physical copies of tables is an\nexpensive operation.\n\nDelta Lake now makes the process simpler and cost-effective with the help of\ntable clones.\n\n**What are clones?**\nClones are replicas of a source table at a given point in time. They have the same\nmetadata as the source table: same schema, constraints, column descriptions, statistics\nand partitioning. However, they behave as a separate table with a separate lineage\nor history. Any changes made to clones only affect the clone and not the source. Any\nchanges that happen to the source during or after the cloning process also do not get\nreflected in the clone due to Snapshot Isolation. In Delta Lake we have two types of\nclones: shallow or deep.\n\n**Shallow clones**\nA _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the\ntable being cloned; the data files of the table itself are not copied. This type of cloning\ndoes not create another physical copy of the data resulting in minimal storage costs.\nShallow clones are inexpensive and can be extremely fast to create.\n\n\n-----\n\nThese clones are not self-contained and depend on the source from which they were\ncloned as the source of data. If the files in the source that the clone depends on are removed,\nfor example with VACUUM, a shallow clone may become unusable. Therefore, shallow\nclones are typically used for short-lived use cases such as testing and experimentation.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "82ac667cde1a486cfd40d0b4dcfae632", + "**Shallow clones**\nA _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the\ntable being cloned; the data files of the table itself are not copied. 
This type of cloning\ndoes not create another physical copy of the data resulting in minimal storage costs.\nShallow clones are inexpensive and can be extremely fast to create.\n\n\n-----\n\nThese clones are not self-contained and depend on the source from which they were\ncloned as the source of data. If the files in the source that the clone depends on are removed,\nfor example with VACUUM, a shallow clone may become unusable. Therefore, shallow\nclones are typically used for short-lived use cases such as testing and experimentation.\n\n**Deep clones**\nShallow clones are great for short-lived use cases, but some scenarios require a\nseparate and independent copy of the table’s data. A deep clone makes a full copy of\nthe metadata and the data files of the table being cloned. In that sense, it is similar in\nfunctionality to copying with a CTAS command ( CREATE TABLE.. AS… SELECT… ).\nBut it is simpler to specify since it makes a faithful copy of the original table at the\nspecified version, and you don’t need to re-specify partitioning, constraints and other\ninformation as you have to do with CTAS. In addition, it is much faster, robust and can\nwork in an incremental manner against failures.\n\nWith deep clones, we copy additional metadata, such as your streaming application\ntransactions and COPY INTO transactions, so you can continue your ETL applications\nexactly where it left off on a deep clone.\n\n**Where do clones help?**\nSometimes I wish I had a clone to help with my chores or magic tricks. However, we’re\nnot talking about human clones here. There are many scenarios where you need a\ncopy of your data sets — for exploring, sharing or testing ML models or analytical\nqueries. Below are some examples of customer use cases.\n\n**Testing and experimentation with a production table**\nWhen users need to test a new version of their data pipeline they often have to rely\non sample test data sets that are not representative of all the data in their production\nenvironment. Data teams may also want to experiment with various indexing techniques\nto improve the performance of queries against massive tables. These experiments and\n\n\ntests cannot be carried out in a production environment without risking production\ndata processes and affecting users.\n\nIt can take many hours or even days, to spin up copies of your production tables for a test\nor a development environment. Add to that, the extra storage costs for your development\nenvironment to hold all the duplicated data — there is a large overhead in setting a test\nenvironment reflective of the production data. With a shallow clone, this is trivial:\n\n-- SQL\n\nCREATE TABLE delta.`/some/test/location` SHALLOW CLONE prod.events\n\n# Python\n\nDeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n\nisShallow=True)\n\n// Scala\n\nDeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n\nisShallow=true)\n\nAfter creating a shallow clone of your table in a matter of seconds, you can start\nrunning a copy of your pipeline to test out your new code, or try optimizing your table\nin different dimensions to see how you can improve your query performance, and much\nmuch more. 
These changes will only affect your shallow clone, not your original table.\n\n**Staging major changes to a production table**\nSometimes, you may need to perform some major changes to your production table.\nThese changes may consist of many steps, and you don’t want other users to see the\nchanges that you’re making until you’re done with all of your work. A shallow clone can\nhelp you out here:\n\n\n-----\n\n-- SQL\n\nCREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n\nDELETE FROM temp.staged_changes WHERE event_id is null;\n\nUPDATE temp.staged_changes SET change_date = current_date()\n\nWHERE change_date is null;\n\n...\n\n-- Perform your verifications", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "3b5c7eb5038ea40cfd3fa6b26ef3f196", + "After creating a shallow clone of your table in a matter of seconds, you can start\nrunning a copy of your pipeline to test out your new code, or try optimizing your table\nin different dimensions to see how you can improve your query performance, and much\nmuch more. These changes will only affect your shallow clone, not your original table.\n\n**Staging major changes to a production table**\nSometimes, you may need to perform some major changes to your production table.\nThese changes may consist of many steps, and you don’t want other users to see the\nchanges that you’re making until you’re done with all of your work. A shallow clone can\nhelp you out here:\n\n\n-----\n\n-- SQL\n\nCREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n\nDELETE FROM temp.staged_changes WHERE event_id is null;\n\nUPDATE temp.staged_changes SET change_date = current_date()\n\nWHERE change_date is null;\n\n...\n\n-- Perform your verifications\n\nOnce you’re happy with the results, you have two options. If no other change has\nbeen made to your source table, you can replace your source table with the clone.\nIf changes have been made to your source table, you can merge the changes into\nyour source table.\n\n-- If no changes have been made to the source\n\nREPLACE TABLE prod.events CLONE temp.staged_changes;\n\n-- If the source table has changed\n\nMERGE INTO prod.events USING temp.staged_changes\n\nON events.event_id <=> staged_changes.event_id\n\nWHEN MATCHED THEN UPDATE SET *;\n\n-- Drop the staged table\n\nDROP TABLE temp.staged_changes;\n\n**Machine learning result reproducibility**\nComing up with an effective ML model is an iterative process. Throughout this process\nof tweaking the different parts of the model, data scientists need to assess the\naccuracy of the model against a fixed data set.\n\nThis is hard to do in a system where the data is constantly being loaded or updated. A\nsnapshot of the data used to train and test the model is required. 
This snapshot allows\nthe results of the ML model to be reproducible for testing or model governance purposes.\n\n\n-----\n\nWe recommend leveraging [Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) to run multiple experiments across a snapshot; an\nexample of this in action can be seen in [Machine Learning Data Lineage With MLflow](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n[and Delta Lake.](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n\nOnce you’re happy with the results and would like to archive the data for later retrieval,\nfor example, next Black Friday, you can use deep clones to simplify the archiving process.\nMLflow integrates really well with Delta Lake, and the autologging feature (mlflow.spark.\nautolog() ) will tell you which version of the table was used to run a set of experiments.\n\n# Run your ML workloads using Python and then\n\nDeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n\nstore_bf2020”)\n\n**Data migration**\nA massive table may need to be moved to a new, dedicated bucket or storage system\nfor performance or governance reasons. The original table will not receive new\nupdates going forward and will be deactivated and removed at a future point in time.\nDeep clones make the copying of massive tables more robust and scalable.\n\n-- SQL\n\nCREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n\nALTER TABLE prod.events SET LOCATION ‘zz://my-new-bucket/events’;\n\nWith deep clones, since we copy your streaming application transactions and\nCOPY INTO transactions, you can continue your ETL applications from exactly where\nit left off after this migration!", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "6a1474e01a1c53beb43c0e56be508317", + "# Run your ML workloads using Python and then\n\nDeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n\nstore_bf2020”)\n\n**Data migration**\nA massive table may need to be moved to a new, dedicated bucket or storage system\nfor performance or governance reasons. The original table will not receive new\nupdates going forward and will be deactivated and removed at a future point in time.\nDeep clones make the copying of massive tables more robust and scalable.\n\n-- SQL\n\nCREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n\nALTER TABLE prod.events SET LOCATION ‘zz://my-new-bucket/events’;\n\nWith deep clones, since we copy your streaming application transactions and\nCOPY INTO transactions, you can continue your ETL applications from exactly where\nit left off after this migration!\n\n**Data sharing**\nIn an organization, it is often the case that users from different departments are\nlooking for data sets that they can use to enrich their analysis or models. You may\nwant to share your data with other users across the organization. But rather than\nsetting up elaborate pipelines to move the data to yet another store, it is often easier\nand economical to create a copy of the relevant data set for users to explore and\n\n\n-----\n\n**Looks awesome! Any gotchas?**\nJust to reiterate some of the gotchas mentioned above as a single list, here’s what you\nshould be wary of:\n\n- \u0007 \u0007Clones are executed on a snapshot of your data. 
Any changes that are made to\nthe source table after the cloning process starts will not be reflected in the\nclone.\n\n- \u0007 \u0007Shallow clones are not self-contained tables like deep clones. If the data is\ndeleted in the source table (for example through VACUUM), your shallow clone\nmay not be usable.\n\n- \u0007 \u0007Clones have a separate, independent history from the source table. Time travel\nqueries on your source table and clone may not return the same result.\n\n- \u0007 \u0007Shallow clones do not copy stream transactions or COPY INTO metadata. Use\ndeep clones to migrate your tables and continue your ETL processes from\nwhere it left off.\n\n**How can I use it?**\nShallow and deep clones support new advances in how data teams test and manage\ntheir modern cloud data lakes and warehouses. Table clones can help your team\nimplement production-level testing of their pipelines, fine-tune their indexing for optimal\nquery performance, create table copies for sharing — all with minimal overhead and\nexpense. If this is a need in your organization, we hope you will take table cloning for\na spin and give us your feedback — we look forward to hearing about new use cases and\nextensions you would like to see in the future.\n\n**Additional resource**\n\n[Simplifying Disaster Recovery With Delta Lake](https://databricks.com/session_na20/simplifying-disaster-recovery-with-delta-lake)\n\n\ntest the data to see if it is a fit for their needs without affecting your own production\nsystems. Here deep clones again come to the rescue.\n\n-- The following code can be scheduled to run at your convenience\n\nCREATE OR REPLACE TABLE data_science.events CLONE prod.events;\n\n**Data archiving**\nFor regulatory or archiving purposes, all data in a table needs to be preserved for a\ncertain number of years, while the active table retains data for a few months. If you\nwant your data to be updated as soon as possible, but you have a requirement to keep\ndata for several years, storing this data in a single table and performing time travel\nmay become prohibitively expensive.\n\nIn this case, archiving your data in a daily, weekly or monthly manner is a better\nsolution. The incremental cloning capability of deep clones will really help you here.\n\n-- The following code can be scheduled to run at your convenience\n\nCREATE OR REPLACE TABLE archive.events CLONE prod.events;\n\nNote that this table will have an independent history compared to the source table,\ntherefore, time travel queries on the source table and the clone may return different\nresults based on your frequency of archiving.\n\n\n-----\n\n**Enabling Spark SQL DDL**\n**and DML in Delta Lake on**\n**Apache Spark 3.0**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "85528e52b21fbee2e55b92c64392467a", + "**Data archiving**\nFor regulatory or archiving purposes, all data in a table needs to be preserved for a\ncertain number of years, while the active table retains data for a few months. If you\nwant your data to be updated as soon as possible, but you have a requirement to keep\ndata for several years, storing this data in a single table and performing time travel\nmay become prohibitively expensive.\n\nIn this case, archiving your data in a daily, weekly or monthly manner is a better\nsolution. 
The incremental cloning capability of deep clones will really help you here.\n\n-- The following code can be scheduled to run at your convenience\n\nCREATE OR REPLACE TABLE archive.events CLONE prod.events;\n\nNote that this table will have an independent history compared to the source table,\ntherefore, time travel queries on the source table and the clone may return different\nresults based on your frequency of archiving.\n\n\n-----\n\n**Enabling Spark SQL DDL**\n**and DML in Delta Lake on**\n**Apache Spark 3.0**\n\nThe release of [Delta Lake 0.7.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) coincided with the release of [Apache Spark 3.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) , thus\nenabling a new set of features that were simplified using Delta Lake from SQL. Here\nare some of the key features.\n\n**Support for SQL DDL commands**\n**to define tables in the** **[Hive metastore](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)**\nYou can now define Delta tables in the [Hive](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore) metastore and use the table name in all\nSQL operations when creating (or replacing) tables.\n\n**Create or replace tables**\n\n-- Create table in the metastore\n\nCREATE TABLE events (\n\ndate DATE,\n\neventId STRING,\n\neventType STRING,\n\ndata STRING)\n\nUSING DELTA\n\nPARTITIONED BY (date)\n\nLOCATION ‘/delta/events’\n\n-- If a table with the same name already exists, the table is replaced\n\nwith\n\nthe new configuration, else it i s created\n\nCREATE OR REPLACE TABLE events (\n\n\n-----\n\ndate DATE,\n\neventId STRING,\n\neventType STRING,\n\ndata STRING)\n\n\nINSERT INTO events SELECT * FROM newEvents\n\n-- To atomically replace all of the data in a table, you can use\n\noverwrite mode\n\nINSERT OVERWRITE events SELECT * FROM newEvents\n\n\nUSING DELTA\n\n\nPARTITIONED BY (date)\n\nLOCATION ‘/delta/events’\n\n**Explicitly alter the table schema**\n\n-- Alter table and schema\n\n\n-- Delete events\n\nDELETE FROM events WHERE date < ‘2017-01-01’\n\n-- Update events\n\nUPDATE events SET eventType = ‘click’ WHERE eventType = ‘click’\n\n\nALTER TABLE table_name ADD COLUMNS (\n\n\ncol_name data_type\n\n[COMMENT col_comment]\n\n[FIRST|AFTER colA_name],\n\n...)\n\nYou can also use the Scala/Java/Python APIs:\n\n- DataFrame.saveAsTable(tableName) and DataFrameWriterV2\nAPIs ( [#307](https://github.com/delta-io/delta/issues/307) ).\n\n- \u0007DeltaTable.forName(tableName) API to create instances of\nio.delta.tables .DeltaTable which is useful for executing\nUpdate/Delete/Merge operations in Scala/Java/Python.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "4d04b69d8db8500adb050d5a90426616", + "LOCATION ‘/delta/events’\n\n**Explicitly alter the table schema**\n\n-- Alter table and schema\n\n\n-- Delete events\n\nDELETE FROM events WHERE date < ‘2017-01-01’\n\n-- Update events\n\nUPDATE events SET eventType = ‘click’ WHERE eventType = ‘click’\n\n\nALTER TABLE table_name ADD COLUMNS (\n\n\ncol_name data_type\n\n[COMMENT col_comment]\n\n[FIRST|AFTER colA_name],\n\n...)\n\nYou can also use the Scala/Java/Python APIs:\n\n- DataFrame.saveAsTable(tableName) and DataFrameWriterV2\nAPIs ( [#307](https://github.com/delta-io/delta/issues/307) ).\n\n- 
\u0007DeltaTable.forName(tableName) API to create instances of\nio.delta.tables .DeltaTable which is useful for executing\nUpdate/Delete/Merge operations in Scala/Java/Python.\n\n**Support for SQL Insert, Delete, Update and Merge**\nOne of the most frequent questions through our [Delta Lake Tech Talks](https://databricks.com/discover/diving-into-delta-lake-talks) was when\nwould DML operations such as delete, update and merge be available in Spark SQL?\nWait no more, these operations are now available in SQL! Below are examples of how\nyou can write delete, update and merge (insert, update, delete and de-duplication\noperations using Spark SQL).\n\n-- Using append mode, you can atomically add new data to an existing\n\nDelta table\n\n\n-- Upsert data to a target Delta\n\n-- table using merge\n\nMERGE INTO events\n\nUSING updates\n\nON events.eventId = updates.eventId\n\nWHEN MATCHED THEN UPDATE\n\nSET events.data = updates.data\n\nWHEN NOT MATCHED THEN INSERT (date, eventId, data)\n\nVALUES (date, eventId, data)\n\nIt is worth noting that the merge operation in Delta Lake supports more advanced\nsyntax than standard ANSI SQL syntax. For example, merge supports\n\n- \u0007 \u0007Delete actions -- Delete a target when matched with a source row. For example,\n“... WHEN MATCHED THEN DELETE ...”\n\n- \u0007 \u0007Multiple matched actions with clause conditions -- Greater flexibility when target\nand source rows match. For example:\n\n...\n\nWHEN MATCHED AND events.shouldDelete THEN DELETE\n\nWHEN MATCHED THEN UPDATE SET events.data = updates.data\n\n\n-----\n\n\u0007Star syntax [-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) Shorthand for setting target column value with the similarly named\nsources column. For example:\n\n\n[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n\nsuch as automated manifest generation. For example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY) , you can\nblock deletes and updates in a Delta table using delta.appendOnly=true .\n\n\n[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n\nWHEN MATCHED THEN SET *\n\n\n[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n\nWHEN NOT MATCHED THEN INSERT *\n\n-- equivalent to updating/inserting with event .date = updates.date,\n\nevents.eventId = updates.eventId, event .data = updates.data", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "6abd3284045bf3c58bcecc212ea2a929", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n\nsuch as automated manifest generation. 
For example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY) , you can\nblock deletes and updates in a Delta table using delta.appendOnly=true .\n\n\n[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n\nWHEN MATCHED THEN SET *\n\n\n[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n\nWHEN NOT MATCHED THEN INSERT *\n\n-- equivalent to updating/inserting with event .date = updates.date,\n\nevents.eventId = updates.eventId, event .data = updates.data\n\n**Automatic and incremental Presto/Athena manifest**\n**generation**\nAs noted in [Query Delta Lake Tables From Presto and Athena, Improved Operations](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n[Concurrency, and Merge Performance,](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) Delta Lake supports other processing engines\nto read Delta Lake by using manifest files; the manifest files contain the list of the\nmost current version of files as of manifest generation. As described in the preceding\nchapter, you will need to:\n\n- Generate a Delta Lake manifest file\n\n- Configure Presto or Athena to read the generated manifests\n\n- Manually re-generate (update) the manifest file\n\nNew for Delta Lake 0.7.0 is the capability to update the manifest file automatically\nwith the following command:\n\nALTER TABLE delta.`pathToDeltaTable`\n\nSET TBLPROPERTIES(\n\ndelta.compatibility.symlinkFormatManifest.enabled=true\n\n)\n\n**Configuring your table through table properties**\nWith the ability to set table properties on your table by using ALTER TABLE SET\nTBLPROPERTIES, you can enable, disable or configure many features of Delta Lake\n\n\n[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n\nYou can also easily control the history of your Delta Lake table retention by the\nfollowing [properties](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html) :\n\n- \u0007 delta.logRetentionDuration: Controls how long the history for a table\n(i.e., transaction log history) is kept. By default, 30 days of history is kept, but you may\nwant to alter this value based on your requirements (e.g., GDPR historical context)\n\n- \u0007delta.deletedFileRetentionDuration: Controls how long ago a file\nmust have been deleted before being a candidate for VACUUM. By default, data\nfiles older than seven days are deleted.\n\nAs of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to\nconfigure these properties.\n\nALTER TABLE delta. `pathToDeltaTable`\n\nSET TBLPROPERTIES(\n\ndelta.logRetentionDuration = “interval “\n\ndelta.deletedFileRetentionDuration = “interval “\n\n)\n\n**Support for adding user-defined metadata**\n**in Delta Lake table commits**\nYou can specify user-defined strings as metadata in commits made by Delta\nLake table operations, either using the DataFrameWriter option userMetadata or\nthe SparkSession configuration spark.databricks.delta.commitInfo.\nuserMetadata .", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "0f2e4c9a4079b11e8fa2e29dbb6b7555", + "- \u0007delta.deletedFileRetentionDuration: Controls how long ago a file\nmust have been deleted before being a candidate for VACUUM. 
By default, data\nfiles older than seven days are deleted.\n\nAs of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to\nconfigure these properties.\n\nALTER TABLE delta. `pathToDeltaTable`\n\nSET TBLPROPERTIES(\n\ndelta.logRetentionDuration = “interval “\n\ndelta.deletedFileRetentionDuration = “interval “\n\n)\n\n**Support for adding user-defined metadata**\n**in Delta Lake table commits**\nYou can specify user-defined strings as metadata in commits made by Delta\nLake table operations, either using the DataFrameWriter option userMetadata or\nthe SparkSession configuration spark.databricks.delta.commitInfo.\nuserMetadata .\n\nIn the following example, we are deleting a user (1xsdf1) from our data lake per user\nrequest. To ensure we associate the user’s request with the deletion, we have also\nadded the DELETE request ID into the userMetadata.\n\n\n-----\n\nSET spark.databricks.delta.commitInfo.userMetadata={\n\n“GDPR”:”DELETE Request 1x891jb23”\n\n\nThere were a lot of great questions during the AMA concerning structured streaming\nand using trigger.once .\n\n\n};\n\n\nFor more information, some good resources explaining this concept include:\n\n- [Running Streaming Jobs Once a Day for 10x Cost Savings](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n\n- [Beyond Lambda: Introducing Delta Architecture](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0) : Specifically the cost vs. latency\ntrade-off discussed here .\n\n**Additional resources**\n\n[Tech Talk | Delta Lake 0.7.0 + Spark 3.0 AMA](https://www.youtube.com/watch?v=xzKqjCB8SWU)\n\n[Tech Talks | Apache Spark 3.0 + Delta Lake](https://www.youtube.com/watch?v=x6RqJYqLoPI&list=PLTPXxbhUt-YWnAgh3RE8DOb46qZF57byx)\n\n[Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0](https://databricks.com/blog/2020/08/27/enabling-spark-sql-ddl-and-dml-in-delta-lake-on-apache-spark-3-0.html)\n\n\nDELETE FROM user_table WHERE user_id = ‘1xsdf1’\n\nWhen reviewing the [history](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine) operations of the user table (user_table), you can easily\nidentify the associated deletion request within the transaction log.\n\n**Other highlights**\nOther highlights for the Delta Lake 0.7.0 release include:\n\n- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop\n3.2 libraries which enables support for Azure Data Lake Storage Gen2.\n\n- Improved support for streaming one-time triggers — With Spark 3.0, we now\nensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) ( Trigger.Once ) processes all outstanding data\nin a Delta Lake table in a single micro-batch even if rate limits are set with the\nDataStreamReader option maxFilesPerTrigger.\n\n\n-----\n\n**Lakehouse**\nCombining the best elements of data\nlakes and data warehouses\n\n## CHAPTER 03\n\n\n-----\n\n**What Is a**\n**Lakehouse?**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "b9ecac60d86210f02ec6120208247874", + "**Other highlights**\nOther highlights for the Delta Lake 0.7.0 release include:\n\n- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop\n3.2 libraries which enables support for Azure Data Lake Storage Gen2.\n\n- Improved support for streaming one-time triggers — 
With Spark 3.0, we now\nensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) ( Trigger.Once ) processes all outstanding data\nin a Delta Lake table in a single micro-batch even if rate limits are set with the\nDataStreamReader option maxFilesPerTrigger.\n\n\n-----\n\n**Lakehouse**\nCombining the best elements of data\nlakes and data warehouses\n\n## CHAPTER 03\n\n\n-----\n\n**What Is a**\n**Lakehouse?**\n\nOver the past few years at Databricks, we’ve seen a new data management architecture\nthat emerged independently across many customers and use cases: the **lakehouse.**\nIn this chapter, we’ll describe this new architecture and its advantages over previous\napproaches.\n\nData warehouses have a long history of decision support and business intelligence\napplications. Since its inception in the late 1980s, data warehouse technology\ncontinued to evolve and MPP architectures led to systems that were able to handle\nlarger data sizes.\n\nBut while warehouses were great for structured data, a lot of modern enterprises\nhave to deal with unstructured data, semi-structured data, and data with high variety,\nvelocity and volume. Data warehouses are not suited for many of these use cases, and\nthey are certainly not the most cost-efficient.\n\nAs companies began to collect large amounts of data from many different sources,\narchitects began envisioning a single system to house data for many different\nanalytic products and workloads.\n\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\nin a variety of formats. While suitable for storing data, data lakes lack some critical\nfeatures: They do not support transactions, they do not enforce data quality, and their\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\n\n\n-----\n\n**A lakehouse combines the best elements**\n**of data lakes and data warehouses**\nA lakehouse is a new data architecture that combines the best elements of data lakes\nand data warehouses.\n\nLakehouses are enabled by a new system design: implementing similar data structures and data management features to those in a data warehouse, directly on the\nkind of low-cost storage used for data lakes. They are what you would get if you had\nto redesign data warehouses in the modern world, now that cheap and highly reliable\nstorage (in the form of object stores) are available.\n\nA lakehouse has the following key features:\n\n- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\nbe reading and writing data concurrently. Support for ACID transactions ensures\nconsistency as multiple parties concurrently read or write data, typically using SQL.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\nwarehouses.\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\nrequire systems for diverse data applications including SQL analytics, real-time\nmonitoring, data science and machine learning. 
Most of the recent advances in\nAI have been in better models to process unstructured data (text, images, video,\naudio), but these are precisely the types of data that a data warehouse is not\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\nwarehouses, and other specialized systems such as streaming, time-series, graph\nand image databases. Having a multitude of systems introduces complexity and,\nmore importantly, introduces delay as data professionals invariably need to move\nor copy data between different systems.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "accf6ad13717062292245537ffbd0249", + "- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\nbe reading and writing data concurrently. Support for ACID transactions ensures\nconsistency as multiple parties concurrently read or write data, typically using SQL.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\nwarehouses.\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\nrequire systems for diverse data applications including SQL analytics, real-time\nmonitoring, data science and machine learning. Most of the recent advances in\nAI have been in better models to process unstructured data (text, images, video,\naudio), but these are precisely the types of data that a data warehouse is not\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\nwarehouses, and other specialized systems such as streaming, time-series, graph\nand image databases. Having a multitude of systems introduces complexity and,\nmore importantly, introduces delay as data professionals invariably need to move\nor copy data between different systems.\n\n\n-----\n\n**\u0007Schema enforcement and governance:** The lakehouse should have a way to\nsupport schema enforcement and evolution, supporting DW schema paradigms\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\n\n- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\nreduces staleness and improves recency, reduces latency and lowers the cost of\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **\u0007Storage is decoupled from compute:** In practice, this means storage and compute\nuse separate clusters, thus these systems are able to scale to many more\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\nthis property.\n\n- **\u0007Openness:** The storage formats they use are open and standardized, such as\nParquet, and they provide an API so a variety of tools and engines, including\nmachine learning and Python/R libraries, can efficiently access the data directly.\n\n- **\u0007Support for diverse data types ranging from unstructured to structured data:**\nThe lakehouse can be used to store, refine, analyze and access data types needed\nfor many new data applications, including images, video, audio, semi-structured\ndata, and text.\n\n- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\nanalytics. Multiple tools might be needed to support all these workloads, but they all\nrely on the same data repository.\n\n- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\nfeatures. Tools for security and access control are basic requirements. Data governance\ncapabilities including auditing, retention and lineage have become essential particularly\nin light of recent privacy regulations. Tools that enable data discovery such as data\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\nfeatures only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "9b85a3fa086f1fa4e09197bc46d91dab", + "- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\nanalytics. Multiple tools might be needed to support all these workloads, but they all\nrely on the same data repository.\n\n- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\nfeatures. Tools for security and access control are basic requirements. Data governance\ncapabilities including auditing, retention and lineage have become essential particularly\nin light of recent privacy regulations. Tools that enable data discovery such as data\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\nfeatures only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**\n\n**Abstract**\nCloud object stores such as Amazon S3 are some of the largest and most costeffective storage systems on the planet, making the main attractive target to\nstore large data warehouses and data lakes. Unfortunately, their implementation\nas key-value stores makes it difficult to achieve ACID transactions and high\nperformance: Metadata operations, such as listing objects, are expensive, and\nconsistency guarantees are limited. In this paper, we present Delta Lake, an\nopen source ACID table storage layer over cloud object stores initially developed\nat Databricks. 
Delta Lake uses a transaction log that is compacted into Apache\nParquet format to provide ACID properties, time travel, and significantly faster\nmetadata operations for large tabular data sets (e.g., the ability to quickly search\nbillions of table partitions for those relevant to a query). It also leverages this\ndesign to provide high-level features such as automatic data layout optimization,\nupserts, caching, and audit logs. Delta Lake tables can be accessed from Apache\nSpark, Hive, Presto, Redshift, and other systems. Delta Lake is deployed at\nthousands of Databricks customers that process exabytes of data per day, with\nthe largest instances managing exabyte-scale data sets and billions of objects.\n\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu,\nMukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja Łuszczak,\nMichał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi,\nSameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n\nRead the full research paper on the [inner workings of the lakehouse](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores) [.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n\n\n-----\n\n**Some early examples**\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\nMicrosoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\nenables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\n[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\nexamples that focus primarily on BI and other SQL applications.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "0c1a7a0ab76b4274b45f53089582bed3", + "**Some early examples**\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\nMicrosoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\nenables a similar lakehouse pattern. 
Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\n[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\nexamples that focus primarily on BI and other SQL applications.\n\nCompanies that want to build and implement their own systems have access to open\nsource file formats (Delta Lake, [Apache Iceberg](https://iceberg.apache.org) , [Apache Hudi](https://hudi.apache.org) ) that are suitable for\nbuilding a lakehouse.\n\nMerging data lakes and data warehouses into a single system means that data teams\ncan move faster as they are able to use data without needing to access multiple systems.\nThe level of SQL support and integration with BI tools among these early lakehouses\nis generally sufficient for most enterprise data warehouses. Materialized views and\nstored procedures are available, but users may need to employ other mechanisms that\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\nimportant for “ [lift and shift scenarios](https://whatis.techtarget.com/definition/lift-and-shift) ,” which require systems that achieve semantics\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\nlibraries) for non-BI workloads like data science and machine learning. Data\nexploration and refinement are standard for many analytic and data science\napplications. Delta Lake is designed to let users incrementally improve the quality of\ndata in their lakehouse until it is ready for consumption.\n\n\nA note about technical building blocks. While distributed file systems can be\nused for the storage layer, object stores are more commonly used in lakehouses.\nObject stores provide low-cost, highly available storage that excels at massively\nparallel reads — an essential requirement for modern data warehouses.\n\n**From BI to AI**\nThe lakehouse is a new data management architecture that radically simplifies\nenterprise data infrastructure and accelerates innovation in an age when\nmachine learning is poised to disrupt every industry. In the past, most of the\ndata that went into a company’s products or decision-making was structured\ndata from operational systems, whereas today, many products incorporate\nAI in the form of computer vision and speech models, text mining and others.\nWhy use a lakehouse instead of a data lake for AI? A lakehouse gives you data\nversioning, governance, security and ACID properties that are needed even for\nunstructured data.\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\nnotebooks) over others so lakehouses will also need to improve their UX and their\nconnectors to popular tools so they can appeal to a variety of personas. These\nand other issues will be addressed as the technology continues to mature and\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\nproperties of being simpler, more cost-efficient and more capable of serving\ndiverse data applications.\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8375eac494bff392a37d6dff7c40c1b1", + "Current lakehouses reduce cost, but their performance can still lag specialized\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\nnotebooks) over others so lakehouses will also need to improve their UX and their\nconnectors to popular tools so they can appeal to a variety of personas. These\nand other issues will be addressed as the technology continues to mature and\ndevelop. Over time, lakehouses will close these gaps while retaining the core\nproperties of being simpler, more cost-efficient and more capable of serving\ndiverse data applications.\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\nadopting the lakehouse pattern. The blog created a massive amount of interest\nfrom technology enthusiasts. While lots of people praised it as the next-generation\ndata architecture, some people thought the lakehouse is the same thing as\nthe data lake. Recently, several of our engineers and founders wrote a research\npaper that describes some of the core technological challenges and solutions that\nset the lakehouse architecture apart from the data lake, and it was accepted and\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\ncan read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\nthey would have said faster horses.” The crux of this statement is that people often\nenvision a better solution to a problem as an evolution of what they already know\nrather than rethinking the approach to the problem altogether. In the world of data\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\nstore data warehouses and data lakes. However, their nature as key-value stores\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\nperformance is hampered by expensive metadata operations (e.g., listing objects)\nand limited consistency guarantees.\n\nBased on the characteristics of cloud object stores, three approaches have emerged.\n\n**1. Data lakes**\nThe first is directories of files (i.e., data lakes) that store the table as a collection\nof objects, typically in columnar format such as Apache Parquet. 
It’s an attractive\napproach because the table is just a group of objects that can be accessed from\na wide variety of tools without a lot of additional data stores or systems. However,\nboth performance and consistency problems are common. Hidden data corruption\nis common due to failed transactions, eventual consistency leads to inconsistent\nqueries, latency is high, and basic management capabilities like table versioning and\naudit logs are unavailable.\n\n**2. Custom storage engines**\nThe second approach is custom storage engines, such as proprietary systems built for\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\nservice that’s able to provide a single source of truth. However, all I/O operations need\nto connect to this metadata service, which can increase cloud resource costs and\nreduce performance and availability. Additionally, it takes a lot of engineering work to\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\nand PyTorch, which can be challenging for data teams that use a variety of computing\nengines on their data. Engineering challenges can be exacerbated by unstructured\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. Finally, and most egregiously, the proprietary metadata service locks\ncustomers into a specific service provider, leaving customers to contend with\nconsistently high prices and expensive, time-consuming migrations if they decide to\nadopt a new approach later.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "106451175b1a3fe452158b21f2f224b8", + "**2. Custom storage engines**\nThe second approach is custom storage engines, such as proprietary systems built for\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\nservice that’s able to provide a single source of truth. However, all I/O operations need\nto connect to this metadata service, which can increase cloud resource costs and\nreduce performance and availability. Additionally, it takes a lot of engineering work to\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\nand PyTorch, which can be challenging for data teams that use a variety of computing\nengines on their data. Engineering challenges can be exacerbated by unstructured\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. Finally, and most egregiously, the proprietary metadata service locks\ncustomers into a specific service provider, leaving customers to contend with\nconsistently high prices and expensive, time-consuming migrations if they decide to\nadopt a new approach later.\n\n**3. Lakehouse**\nWith Delta Lake, an open source ACID table storage layer atop cloud object stores,\nwe sought to build a car instead of a faster horse with not just a better data store,\nbut a fundamental change in how data is stored and used via the lakehouse. A\nlakehouse is a new architecture that combines the best elements of data lakes and\ndata warehouses. 
Lakehouses are enabled by a new system design: implementing\nsimilar data structures and data management features to those in a data warehouse,\ndirectly on the kind of low-cost storage used for data lakes. They are what you would\nget if you had to redesign storage engines in the modern world, now that cheap and\nhighly reliable storage (in the form of object stores) are available.\n\nDelta Lake maintains information about which objects are part of a Delta table in an\nACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\nthe cloud object store. This design allows clients to update multiple objects at once,\nreplace a subset of the objects with another, etc., in a serializable manner that still\nachieves high parallel read/write performance from the objects. The log also provides\nsignificantly faster metadata operations for large tabular data sets. Additionally, Delta\nLake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\nsnapshots or roll back erroneous updates), automatic data layout optimization, upserts,\ncaching, and audit logs. Together, these features improve both the manageability and\nperformance of working with data in cloud object stores, ultimately opening the door\nto the lakehouse architecture that combines the key features of data warehouses and\ndata lakes to create a better, simpler data architecture.\n\n\n-----\n\nToday, Delta Lake is used across thousands of Databricks customers, processing\nexabytes of structured and unstructured data each day, as well as many organizations\nin the open source community. These use cases span a variety of data sources and\napplications. The data types stored include Change Data Capture (CDC) logs from\nenterprise OLTP systems, application logs, time-series data, graphs, aggregate\ntables for reporting, and image or feature data for machine learning. The applications\ninclude SQL workloads (most commonly), business intelligence, streaming, data\nscience, machine learning and graph analytics. 
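A minimal sketch of how the time travel and audit-log capabilities described above surface to users; the table path is a placeholder for any existing Delta Lake table.

```python
# Minimal sketch of time travel and the commit history surfaced by the transaction log.
# "/delta/events" is a placeholder path for any existing Delta Lake table.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()
table_path = "/delta/events"

# Query a point-in-time snapshot of the table (time travel).
snapshot_df = spark.read.format("delta").option("versionAsOf", 0).load(table_path)

# Inspect the audit log of operations (writes, merges, optimizes) recorded per commit.
DeltaTable.forPath(spark, table_path).history().show(truncate=False)
```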
Overall, Delta Lake has proven itself to\nbe a good fit for most data lake applications that would have used structured storage\nformats like Parquet or ORC, and many traditional data warehousing workloads.\n\nAcross these use cases, we found that customers often use Delta Lake to significantly\nsimplify their data architecture by running more workloads directly against cloud\nobject stores, and increasingly, by creating a lakehouse with both data lake and\ntransactional features to replace some or all of the functionality provided by message\nqueues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\nAmazon Redshift).\n\n**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n\n- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "58289f2c000adf6c7a0dac805e19949b", + "Across these use cases, we found that customers often use Delta Lake to significantly\nsimplify their data architecture by running more workloads directly against cloud\nobject stores, and increasingly, by creating a lakehouse with both data lake and\ntransactional features to replace some or all of the functionality provided by message\nqueues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\nAmazon Redshift).\n\n**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n\n- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance\n\nThrough the paper, you’ll gain a better understanding of Delta Lake and how it\nenables a wide range of DBMS-like performance and management features for data\nheld in low-cost cloud storage. As well as how the Delta Lake storage format and\naccess protocols make it simple to operate, highly available, and able to deliver highbandwidth access to the object store.\n\n\n-----\n\n**Understanding**\n**Delta Engine**\n\nThe Delta Engine ties together a 100% Apache Spark-compatible vectorized query\nengine to take advantage of modern CPU architecture with optimizations to Spark\n3.0’s query optimizer and caching capabilities that were launched as part of Databricks\nRuntime 7.0. Together, these features significantly accelerate query performance on\ndata lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\nadopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n\n**Scaling execution performance**\nOne of the big hardware trends over the last several years is that CPU clock speeds\nhave plateaued. 
The reasons are outside the scope of this chapter, but the takeaway\nis that we have to find new ways to process data faster beyond raw compute power.\nOne of the most impactful methods has been to improve the amount of data that can\nbe processed in parallel. However, data processing engines need to be specifically\narchitected to take advantage of this parallelism.\n\nIn addition, data teams are being given less and less time to properly model data as\nthe pace of business increases. Poorer modeling in the interest of better business\nagility drives poorer query performance. Naturally, this is not a desired state, and\norganizations want to find ways to maximize both agility and performance.\n\n\n-----\n\n**Announcing Delta Engine for**\n**high-performance query execution**\nDelta Engine accelerates the performance of Delta Lake for SQL and DataFrame\nworkloads through three components: an improved query optimizer, a caching\nlayer that sits between the execution layer and the cloud object storage, and a native\nvectorized execution engine that’s written in C++.\n\nThe improved query optimizer extends the functionality already in Spark 3.0 (cost-based\noptimizer, adaptive query execution, and dynamic runtime filters) with more advanced\nstatistics to deliver up to 18x increased performance in star schema workloads.\n\nDelta Engine’s caching layer automatically chooses which input data to cache for the\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\ndata teams today is the native execution engine, which we call Photon. (We know.\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\n\n-----\n\nDatabricks has been built to maximize the performance from the new changes in\nmodern cloud hardware. It brings performance improvements to all workload types\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\nBy linking these three components together, we think it will be easier for customers\nto understand how improvements in multiple places within the Databricks code\naggregate into significantly faster performance for analytics workloads on data lakes.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "5926bb5ce8f74c7fafe652fb85efc82c", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\ndata teams today is the native execution engine, which we call Photon. (We know.\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\n\n-----\n\nDatabricks has been built to maximize the performance from the new changes in\nmodern cloud hardware. 
It brings performance improvements to all workload types\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\nBy linking these three components together, we think it will be easier for customers\nto understand how improvements in multiple places within the Databricks code\naggregate into significantly faster performance for analytics workloads on data lakes.\n\nWe’re excited about the value that Delta Engine delivers to our customers. While the\ntime and cost savings are already valuable, its role in the lakehouse pattern supports\nnew advances in how data teams design their data architectures for increased\nunification and simplicity.\n\nFor more information on the Delta Engine, watch this keynote address from\n[Spark + AI Summit 2020: Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n\n\n-----\n\n**Streaming**\nUsing Delta Lake to express\ncomputation on streaming data\n\n## CHAPTER 04\n\n\n-----\n\n**How Delta Lake Solves Common**\n**Pain Points in Streaming**\n\nThe pain points of a traditional streaming and data warehousing solution can be\nbroken into two groups: data lake and data warehouse pains.\n\n**Data lake pain points**\nWhile data lakes allow you to flexibly store an immense amount of data in a file system,\nthere are many pain points including (but not limited to):\n\n- Consolidation of streaming data from many disparate systems is difficult.\n\n- Updating data in a data lake is nearly impossible, and much of the streaming\ndata needs to be updated as changes are made. This is especially important in\nscenarios involving financial reconciliation and subsequent adjustments.\n\n- Query speeds for a data lake are typically very slow.\n\n- Optimizing storage and file sizes is very difficult and often requires complicated logic.\n\n**Data warehouse pain points**\nThe power of a data warehouse is that you have a persistent performant store of your\ndata. But the pain points for building modern continuous applications include (but are\nnot limited to):\n\n- Constrained to SQL queries (i.e., no machine learning or advanced analytics).\n\n- Accessing streaming data and stored data together is very difficult, if at all possible.\n\n- Data warehouses do not scale very well.\n\n- Tying compute and storage together makes using a warehouse very expensive.\n\n\n-----\n\n**How Delta Lake on Databricks solves these issues**\n[Delta Lake](https://docs.databricks.com/delta/index.html) is a unified data management system that brings data reliability and\nperformance optimizations to cloud data lakes. 
More succinctly, Delta Lake combines\nthe advantages of data lakes and data warehouses with Apache Spark™ to allow you\nto do incredible things.\n\n- Delta Lake, along with Structured Streaming, makes it possible to analyze\nstreaming and historical data together at high speeds.\n\n- When Delta Lake tables are used as sources and destinations of streaming big\ndata, it is easy to consolidate disparate data sources.\n\n- Upserts are supported on Delta Lake tables.\n\n- Delta Lake is ACID compliant, making it easy to create a compliant data solution.\n\n- Easily include machine learning scoring and advanced analytics into ETL\nand queries.\n\n- Decouples compute and storage for a completely scalable solution.\n\nIn the following use cases, we’ll share what this looks like in practice.\n\n\n-----\n\n**Simplifying Streaming Stock**\n**Data Analysis Using Delta Lake**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "b9e920255e44aed01952f834a693b695", + "- Delta Lake, along with Structured Streaming, makes it possible to analyze\nstreaming and historical data together at high speeds.\n\n- When Delta Lake tables are used as sources and destinations of streaming big\ndata, it is easy to consolidate disparate data sources.\n\n- Upserts are supported on Delta Lake tables.\n\n- Delta Lake is ACID compliant, making it easy to create a compliant data solution.\n\n- Easily include machine learning scoring and advanced analytics into ETL\nand queries.\n\n- Decouples compute and storage for a completely scalable solution.\n\nIn the following use cases, we’ll share what this looks like in practice.\n\n\n-----\n\n**Simplifying Streaming Stock**\n**Data Analysis Using Delta Lake**\n\nReal-time analysis of stock data is a complicated endeavor. After all, there are many\nchallenges in maintaining a streaming system and ensuring transactional consistency\nof legacy and streaming data concurrently.\n\nThankfully, [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) helps solve many of the pain points of building a streaming\nsystem to analyze stock data in real time. In this section, we’ll share how to simplify\nthe streaming of stock data analysis using Delta Lake.\n\nIn the following diagram, you can see a high-level architecture that simplifies this\nproblem. 
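A minimal sketch of the source-and-sink pattern listed earlier, with Delta Lake tables on both ends of a Structured Streaming job; the paths and checkpoint location are placeholders.

```python
# Minimal sketch: a Delta Lake table as both the source and the sink of a
# Structured Streaming job. Paths and the checkpoint location are placeholders.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

source_stream = (
    spark.readStream
    .format("delta")
    .load("/delta/raw_events")          # streaming source: an existing Delta table
)

query = (
    source_stream.writeStream
    .format("delta")
    .outputMode("append")
    .option("checkpointLocation", "/delta/_checkpoints/raw_to_clean")  # needed for exactly-once
    .start("/delta/clean_events")       # streaming sink: another Delta table
)
```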
We start by ingesting two different sets of data into two Delta Lake tables.\nThe two data sets are stock prices and fundamentals.\n\nAfter ingesting the data into their respective tables, we then join the data in an ETL\nprocess and write the data out into a third Delta Lake table for downstream analysis.\n\nDelta Lake helps solve these problems by combining the scalability, streaming and\naccess to the advanced analytics of Apache Spark with the performance and ACID\ncompliance of a data warehouse.\n\n\n-----\n\n# Create Fundamental Data (Databricks Delta table)\n\ndfBaseFund = spark \\\\\n\n.read \\\\\n\n.format( ‘delta’ ) \\\\\n\n.load( ‘/delta/stocksFundamentals’ )\n\n# Create Price Data (Databricks Delta table)\n\ndfBasePrice = spark \\\\\n\n.read \\\\\n\n.format( ‘delta’ ) \\\\\n\n.load( ‘/delta/stocksDailyPrices’ )\n\n\n**Implement your streaming**\n**stock analysis solution with Delta Lake**\nDelta Lake and Apache Spark do most of the work for our solution; you can try out the\nfull [notebook](https://pages.databricks.com/rs/094-YMS-629/images/streaming-stock-data-analysis-setup.html) and follow along with the code samples below.\n\nAs noted in the preceding diagram, we have two data sets to process — one for\nfundamentals and one for price data. To create our two Delta Lake tables, we specify\nthe .format(‘delta’) against our Databricks File System ( [DBFS](https://docs.databricks.com/data/databricks-file-system.html) ) locations.\n\n\n-----\n\nWhile we’re updating the stockFundamentals and stocksDailyPrices ,\nwe will consolidate this data through a series of ETL jobs into a consolidated view\n( stocksDailyPricesWFund ).\n\nWith the following code snippet, we can determine the start and end date of available\ndata and then combine the price and fundamentals data for that date range into DBFS.\n\n# Determine start and end date of available data\n\nrow = dfBasePrice.agg(\n\nfunc.max(dfBasePrice.price_date) .alias ( “maxDate” ),\n\nfunc.min(dfBasePrice.price_date) .alias ( “minDate” )\n\n).collect()[ 0 ]\n\nstartDate = row[ “minDate” ]\n\nendDate = row[ “maxDate” ]\n\n# Define our date range function\n\n\n# Save data to DBFS\n\ndfPriceWFund\n\n.write\n\n.format( ‘delta’ )\n\n.mode( ‘append’ )", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "f6fa995d4c37ab485d6b8180de8b831b", + "With the following code snippet, we can determine the start and end date of available\ndata and then combine the price and fundamentals data for that date range into DBFS.\n\n# Determine start and end date of available data\n\nrow = dfBasePrice.agg(\n\nfunc.max(dfBasePrice.price_date) .alias ( “maxDate” ),\n\nfunc.min(dfBasePrice.price_date) .alias ( “minDate” )\n\n).collect()[ 0 ]\n\nstartDate = row[ “minDate” ]\n\nendDate = row[ “maxDate” ]\n\n# Define our date range function\n\n\n# Save data to DBFS\n\ndfPriceWFund\n\n.write\n\n.format( ‘delta’ )\n\n.mode( ‘append’ )\n\n.save( ‘/delta/stocksDailyPricesWFund’ )\n\n# Loop through dates to complete fundamentals + price ETL process\n\nfor single_date in daterange(\n\nstartDate, (endDate + datetime.timedelta(days= 1 ))\n\n):\n\nprint ‘Starting ’ + single_date.strftime( ‘%Y-%m-%d’ )\n\nstart = datetime.datetime.now()\n\ncombinePriceAndFund(single_date)\n\nend = datetime.datetime.now()\n\nprint ( end - start)\n\n\ndef daterange(start_date, end_date):\n\n\nNow we have a stream of consolidated fundamentals and price data that is 
being\npushed into [DBFS](https://docs.databricks.com/data/databricks-file-system.html) in the /delta/stocksDailyPricesWFund location. We can build a\nDelta Lake table by specifying .format(“delta”) against that DBFS location.\n\n\nfor n in range( int ((end_date - start_date).days)):\n\nyield start_date + datetime.timedelta(n)\n\n\n# Define combinePriceAndFund information by date and\n\n\ndef combinePriceAndFund(theDate):\n\ndfFund = dfBaseFund. where (dfBaseFund.price_date == theDate)\n\ndfPrice = dfBasePrice. where (\n\ndfBasePrice.price_date == theDate\n\n\ndfPriceWithFundamentals = spark\n\n.readStream\n\n.format( “delta” )\n\n.load( “/delta/stocksDailyPricesWFund” )\n\n\n).drop( ‘price_date’ )\n\n\n# Drop the updated column\n\ndfPriceWFund = dfPrice.join(dfFund, [ ‘ticker’ ]).drop( ‘updated’ )\n\n\n// Create temporary view of the data\n\ndfPriceWithFundamentals.createOrReplaceTempView( “priceWithFundamentals” )\n\n\n-----\n\nNow that we have created our initial Delta Lake table, let’s create a view that will\nallow us to calculate the price/earnings ratio in real time (because of the underlying\nstreaming data updating our Delta Lake table).\n\n%sql\n\nCREATE OR REPLACE TEMPORARY VIEW viewPE AS\n\nselect ticker,\n\nprice_date,\n\nfirst(close) as price,\n\n(close/eps_basic_net) as pe\n\nfrom priceWithFundamentals\n\nwhere eps_basic_net > 0\n\ngroup by ticker, price_date, pe\n\n**Analyze streaming stock data in real time**\nWith our view in place, we can quickly analyze our data using Spark SQL.\n\n%sql\n\nselect - \n\nfrom viewPE\n\nwhere ticker == “AAPL”\n\norder by price_date\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "e44bb96fbdeb00e80da1083d9176e45a", + "-----\n\nNow that we have created our initial Delta Lake table, let’s create a view that will\nallow us to calculate the price/earnings ratio in real time (because of the underlying\nstreaming data updating our Delta Lake table).\n\n%sql\n\nCREATE OR REPLACE TEMPORARY VIEW viewPE AS\n\nselect ticker,\n\nprice_date,\n\nfirst(close) as price,\n\n(close/eps_basic_net) as pe\n\nfrom priceWithFundamentals\n\nwhere eps_basic_net > 0\n\ngroup by ticker, price_date, pe\n\n**Analyze streaming stock data in real time**\nWith our view in place, we can quickly analyze our data using Spark SQL.\n\n%sql\n\nselect - \n\nfrom viewPE\n\nwhere ticker == “AAPL”\n\norder by price_date\n\n\n-----\n\nAs the underlying source of this consolidated data set is a Delta Lake table, this view\nisn’t just showing the batch data but also any new streams of data that are coming in\nas per the following streaming dashboard.\n\nUnderneath the covers, Structured Streaming isn’t just writing the data to Delta Lake\ntables but also keeping the state of the distinct number of keys (in this case ticker\nsymbols) that need to be tracked.\n\n\nBecause you are using Spark SQL, you can execute aggregate queries at scale\nand in real time.\n\n%sql\n\nSELECT ticker, AVG(close) as Average_Close\n\nFROM priceWithFundamentals\n\nGROUP BY ticker\n\nORDER BY Average_Close\n\nIn closing, we demonstrated how to simplify streaming stock data analysis using\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . 
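Putting the walkthrough above together, a consolidated sketch of the streaming read and the real-time price/earnings view; the paths and column names follow the excerpt, and anything beyond that is illustrative.

```python
# Consolidated sketch of the streaming P/E analysis above: read the consolidated
# Delta table as a stream, expose it as a view, and compute price/earnings in SQL.
# Paths and column names follow the excerpt; the rest is illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

dfPriceWithFundamentals = (
    spark.readStream
    .format("delta")
    .load("/delta/stocksDailyPricesWFund")
)
dfPriceWithFundamentals.createOrReplaceTempView("priceWithFundamentals")

spark.sql("""
    CREATE OR REPLACE TEMPORARY VIEW viewPE AS
    SELECT ticker,
           price_date,
           first(close) AS price,
           (close / eps_basic_net) AS pe
    FROM priceWithFundamentals
    WHERE eps_basic_net > 0
    GROUP BY ticker, price_date, pe
""")

# In a notebook, display() this query (or attach a writeStream sink) so results
# refresh as new data lands in the underlying Delta table.
pe_for_aapl = spark.sql("SELECT * FROM viewPE WHERE ticker = 'AAPL' ORDER BY price_date")
```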
By combining Spark Structured Streaming and Delta Lake, we can use the\nDatabricks integrated workspace to create a performant, scalable solution that has\nthe advantages of both data lakes and data warehouses.\n\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) removes the data engineering complexities\ncommonly associated with streaming and transactional consistency, enabling\ndata engineering and data science teams to focus on understanding the trends in\ntheir stock data.\n\n\n-----\n\n**How Tilting Point Does Streaming**\n**Ingestion Into Delta Lake**\n\nTilting Point is a new-generation games partner that provides top development\nstudios with expert resources, services and operational support to optimize\nhigh-quality live games for success. Through its user acquisition fund and its\nworld-class technology platform, Tilting Point funds and runs performance\nmarketing management and live games operations to help developers achieve\nprofitable scale.\n\nWith Delta Lake, Tilting Point is able to leverage quality data and make\nit readily available for analytics to improve the business. Diego Link, VP of\nEngineering at Tilting Point, provided insights for this use case.\n\nThe team at Tilting Point was running daily and hourly batch jobs for reporting on\ngame analytics. They wanted to make their reporting near real-time, getting insights\nwithin 5–10 minutes.\n\nThey also wanted to make their in-game LiveOps decisions based on real-time player\nbehavior, feed real-time data to a bundles-and-offer system, provide up-to-the-minute\nalerting on LiveOps changes that might have unforeseen detrimental\neffects, and even alert on service interruptions in game operations. The goal was to\nensure that the game experience was as robust as possible for their players.\n\nAdditionally, they had to store encrypted Personally Identifiable Information (PII) data\nseparately in order to maintain GDPR compliance.\n\n\n-----\n\n**How data flows and associated challenges**\nTilting Point has a proprietary software development kit that developers integrate\nwith to send data from game servers to an ingest server hosted in AWS. 
This service\nremoves all PII data and then sends the raw data to an Amazon Firehose endpoint.\nFirehose then dumps the data in JSON format continuously to S3.\n\nTo clean up the raw data and make it available quickly for analytics, the team\nconsidered pushing the continuous data from Firehose to a message bus (e.g.,\nKafka, Kinesis) and then using [Apache Spark’s Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) to continuously\nprocess data and write to Delta Lake tables.\n\nWhile that architecture sounds ideal for low latency requirements of processing\ndata in seconds, Tilting Point didn’t have such low latency needs for their ingestion\npipeline. They wanted to make the data available for analytics in a few minutes, not\nseconds. Hence they decided to simplify our architecture by eliminating a message\nbus and instead use S3 as a continuous source for their structured streaming job.\n\nBut the key challenge in using S3 as a continuous source is identifying files that\nchanged recently.\n\nListing all files every few minutes has two major issues:\n\n- **Higher latency:** Listing all files in a directory with a large number of files has high\noverhead and increases processing time.\n\n- **Higher cost:** Listing lots of files every few minutes can quickly add to the S3 cost.\n\n**Leveraging Structured Streaming with blob store as**\n**source and Delta Lake tables as sink**\nTo continuously stream data from cloud blob storage like S3, Tilting Point uses\n[Databricks’ S3-SQS source](https://docs.databricks.com/spark/latest/structured-streaming/sqs.html#optimized-s3-file-source-with-sqs) . The S3-SQS source provides an easy way to incrementally\nstream data from S3 without the need to write any state management code on what\nfiles were recently processed.\n\n\n-----\n\nThis is how Tilting Point’s ingestion pipeline looks:\n\n- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information\nto SQS via SNS.\n\n- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3SQS source reads the new file names that arrived in S3 from SQS and uses that\ninformation to read the actual file contents in S3. An example code below:\n\nspark.readStream \\\n\n.format( “s3-sqs” ) \\\n\n. option ( “fileFormat” , “json” ) \\\n\n. option ( “queueUrl” , ...) \\\n\n. schema (...) \\\n\n. load ()\n\n- Tilting Point’s structured streaming job then cleans up and transforms the data.\nBased on the game data, the streaming job uses the foreachBatch API of Spark\nstreaming and writes to 30 different Delta Lake tables.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "4cffec7831b4b93dde76b2fc65f0ac9b", + "- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information\nto SQS via SNS.\n\n- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3SQS source reads the new file names that arrived in S3 from SQS and uses that\ninformation to read the actual file contents in S3. An example code below:\n\nspark.readStream \\\n\n.format( “s3-sqs” ) \\\n\n. option ( “fileFormat” , “json” ) \\\n\n. option ( “queueUrl” , ...) \\\n\n. schema (...) \\\n\n. 
load ()\n\n- Tilting Point’s structured streaming job then cleans up and transforms the data.\nBased on the game data, the streaming job uses the foreachBatch API of Spark\nstreaming and writes to 30 different Delta Lake tables.\n\n- The streaming job produces lots of small files. This affects performance of\ndownstream consumers. So, an optimize job runs daily to compact small files in\nthe table and store them as right file sizes so that consumers of the data have\ngood performance while reading the data from Delta Lake tables. Tilting Point\nalso runs a weekly optimize job for a second round of compaction. Architecture showing continuous data ingest into Delta Lake tables\n\n\n-----\n\nThe above Delta Lake ingestion architecture helps in the following ways:\n\n- **Incremental loading:** The S3-SQS source incrementally loads the new files in S3.\nThis helps quickly process the new files without too much overhead in listing files.\n\n- **No explicit file state management:** There is no explicit file state management\nneeded to look for recent files.\n\n- **Lower operational burden:** Since we use S3 as a checkpoint between Firehose\nand Structured Streaming jobs, the operational burden to stop streams and reprocess data is relatively low.\n\n- **Reliable ingestion:** Delta Lake uses [optimistic concurrency control](https://docs.databricks.com/delta/optimizations/isolation-level.html) to offer ACID\ntransactional guarantees. This helps with reliable data ingestion.\n\n- **File compaction:** One of the major problems with streaming ingestion is tables\nending up with a large number of small files that can affect read performance.\nBefore Delta Lake, we had to set up a different table to write the compacted\ndata. With Delta Lake, thanks to ACID transactions, we can compact the files and\nrewrite the data back to the same table safely.\n\n- **Snapshot isolation:** Delta Lake’s snapshot isolation allows us to expose the\ningestion tables to downstream consumers while data is being appended by a\nstreaming job and modified during compaction.\n\n- **Rollbacks:** In case of bad writes, [Delta Lake’s Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) helps us roll back to a\nprevious version of the table.\n\nIn this section, we walked through Tilting Point’s use cases and how they do\nstreaming ingestion using Databricks’ S3-SQS source into Delta Lake tables\nefficiently without too much operational overhead to make good quality data\nreadily available for analytics.\n\n\n-----\n\n**Building a Quality of Service**\n**Analytics Solution for Streaming**\n**Video Services**\n\nAs traditional pay TV , content owners have embraced directto-consumer (D2C) subscription and ad-supported streaming for monetizing their [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/)\nlibraries of content. 
For companies whose entire business model revolved around\nproducing great content, which they then licensed to distributors, the shift to now\nowning the entire glass-to-glass experience has required new capabilities, such as\nbuilding media supply chains for content delivery to consumers, supporting apps for\na myriad of devices and operating systems, and performing customer relationship\nfunctions like billing and customer service.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "ab4dfbd6fe492203c62a8b3f60e4ad55", + "In this section, we walked through Tilting Point’s use cases and how they do\nstreaming ingestion using Databricks’ S3-SQS source into Delta Lake tables\nefficiently without too much operational overhead to make good quality data\nreadily available for analytics.\n\n\n-----\n\n**Building a Quality of Service**\n**Analytics Solution for Streaming**\n**Video Services**\n\nAs traditional pay TV , content owners have embraced directto-consumer (D2C) subscription and ad-supported streaming for monetizing their [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/)\nlibraries of content. For companies whose entire business model revolved around\nproducing great content, which they then licensed to distributors, the shift to now\nowning the entire glass-to-glass experience has required new capabilities, such as\nbuilding media supply chains for content delivery to consumers, supporting apps for\na myriad of devices and operating systems, and performing customer relationship\nfunctions like billing and customer service.\n\nWith most services renewing on a monthly basis, subscription service operators need\nto prove value to their subscribers at all times. General quality of streaming video\nissues (encompassing buffering, latency, pixelation, jitter, packet loss and the blank\nscreen) have significant business impacts, whether it’s increased [subscriber churn](https://www.streamingmedia.com/Articles/ReadArticle.aspx?ArticleID=112209) or\n[decreased video engagement](https://www.tvtechnology.com/opinions/why-buffering-remains-every-video-providers-worst-nightmare) .\n\nWhen you start streaming, you realize there are so many places where breaks can\nhappen and the viewer experience can suffer. There may be an issue at the source in\nthe servers on-premises or in the cloud; in transit at either the CDN level or ISP level\nor the viewer’s home network; or at the playout level with player/client issues. What\nbreaks at n x 104 concurrent streamers is different from what breaks at n x 105 or n\nx 106. There is no pre-release testing that can quite replicate real-world users and\ntheir ability to push even the most redundant systems to their breaking point as they\n\n\n-----\n\nchannel surf, click in and out of the app, sign on from different devices simultaneously\nand so on. And because of the nature of TV, things will go wrong during the most\nimportant, high-profile events drawing the largest audiences. If you start [receiving](https://downdetector.com/)\n[complaints on social media](https://downdetector.com/) , how can you tell if they are unique to that one user or\nrather regional or a national issue? 
If national, is it across all devices or only certain\ntypes (e.g., possibly the OEM updated the OS on an older device type, which ended up\ncausing compatibility issues with the client)?\n\nIdentifying, remediating and preventing viewer quality of experience issues becomes\na big data problem when you consider the number of users, the number of actions\nthey are taking and the number of handoffs in the experience (servers to CDN to ISP to\nhome network to client). Quality of Service (QoS) helps make sense of these streams\nof data so you can understand what is going wrong, where and why. Eventually you\ncan get into predictive analytics around what could go wrong and how to remediate\nit before anything breaks.\n\n**Databricks Quality of Service solution overview**\nThe aim of this solution is to provide the core for any streaming video platform that\nwants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\na Unified Data Analytics Platform for both the real-time insights and the advanced\nanalytics capabilities.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "bbbd4003f5c1346b6d1798b187bd59de", + "Identifying, remediating and preventing viewer quality of experience issues becomes\na big data problem when you consider the number of users, the number of actions\nthey are taking and the number of handoffs in the experience (servers to CDN to ISP to\nhome network to client). Quality of Service (QoS) helps make sense of these streams\nof data so you can understand what is going wrong, where and why. Eventually you\ncan get into predictive analytics around what could go wrong and how to remediate\nit before anything breaks.\n\n**Databricks Quality of Service solution overview**\nThe aim of this solution is to provide the core for any streaming video platform that\nwants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\na Unified Data Analytics Platform for both the real-time insights and the advanced\nanalytics capabilities.\n\n[By using Databricks](https://databricks.com/customers) , streaming platforms can get faster insights by always\nleveraging the most complete and recent data sets powered by robust and reliable\ndata pipelines. This decreases time to market for new features by accelerating\ndata science using a collaborative environment. 
It provides support for managing\nthe end-to-end machine learning lifecycle and reduces operational costs across\nall cycles of software development by having a unified platform for both data\nengineering and data science.\n\n\n-----\n\n**Video QoS solution architecture**\nWith complexities like low-latency monitoring alerts and highly scalable infrastructure\nrequired for peak video traffic hours, the straightforward architectural choice was\nthe Delta Architecture — both standard big data architectures like Lambda and Kappa\nArchitectures have disadvantages around the operational effort required to maintain\nmultiple types of pipelines (streaming and batch) and lack support for a unified data\nengineering and data science approach.\n\nThe Delta Architecture is the next-generation paradigm that enables all the data\npersonas in your organization to be more productive:\n\n- Data engineers can develop data pipelines in a cost-efficient manner\ncontinuously without having to choose between batch and streaming\n\n- Data analysts can get near real-time insights and faster answers to their BI queries\n\n- Data scientists can develop better machine learning models using more reliable data\nsets with support for time travel that facilitates reproducible experiments and reports Delta Architecture using the “multi-hop” approach for data pipelines\n\n\n-----\n\nWriting data pipelines using the Delta Architecture follows the best practices of\nhaving a multi-layer “multi-hop” approach where we progressively add structure to\ndata: “Bronze” tables or Ingestion tables are usually raw data sets in the native format\n(JSON, CSV or txt), “Silver” tables represent cleaned/transformed data sets ready for\nreporting or data science, and “Gold” tables are the final presentation layer.\n\nFor the pure streaming use cases, the option of materializing the DataFrames in\nintermediate Delta Lake tables is basically just a trade-off between latency/SLAs and\ncost (an example being real-time monitoring alerts vs. 
updates of the recommender\nsystem based on new content).\n\nA streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n\nThe number of “hops” in this approach is directly impacted by the number of consumers\ndownstream, complexity of the aggregations (e.g., Structured Streaming enforces\ncertain limitations around chaining multiple aggregations) and the maximization of\noperational efficiency.\n\nThe QoS solution architecture is focused around best practices for data processing\nand is not a full video-on-demand (VoD) solution — with some standard components\nlike the “front door” service Amazon API Gateway being avoided from the high-level\narchitecture in order to keep the focus on data and analytics.\n\n\n-----\n\nHigh-level architecture for the QoS platform\n\n\n**Making your data ready for analytics**\nBoth sources of data included in the QoS solution (application events and CDN logs)\nare using the JSON format, great for data exchange — allowing you to represent\ncomplex nested structures, but not scalable and difficult to maintain as a storage\nformat for your data lake / analytics system.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "ad49d73d4d4b66a958ad88f05e980ce7", + "A streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n\nThe number of “hops” in this approach is directly impacted by the number of consumers\ndownstream, complexity of the aggregations (e.g., Structured Streaming enforces\ncertain limitations around chaining multiple aggregations) and the maximization of\noperational efficiency.\n\nThe QoS solution architecture is focused around best practices for data processing\nand is not a full video-on-demand (VoD) solution — with some standard components\nlike the “front door” service Amazon API Gateway being avoided from the high-level\narchitecture in order to keep the focus on data and analytics.\n\n\n-----\n\nHigh-level architecture for the QoS platform\n\n\n**Making your data ready for analytics**\nBoth sources of data included in the QoS solution (application events and CDN logs)\nare using the JSON format, great for data exchange — allowing you to represent\ncomplex nested structures, but not scalable and difficult to maintain as a storage\nformat for your data lake / analytics system.\n\n\nIn order to make the data directly queryable across the entire organization, the\nBronze to Silver pipeline (the “make your data available to everyone” pipeline) should\ntransform any raw formats into Delta Lake and include all the quality checks or data\nmasking required by any regulatory agencies.\n\n\n-----\n\nRaw format of the app events\n\n**Video applications events**\nBased on the architecture, the video application events are pushed directly to\nKinesis Streams and then just ingested to a Delta Lake append-only table without\nany changes to the schema.\n\nUsing this pattern allows a high number of consumers downstream to process the\ndata in a streaming paradigm without having to scale the throughput of the Kinesis\nstream. 
As a side effect of using a Delta Lake table as a sink (which supports [optimize](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-optimize.html) !),\nwe don’t have to worry about the way the size of the processing window will impact the\nnumber of files in your target table — known as the “small files” issue in the big data world.\n\nBoth the timestamp and the type of message are being extracted from the JSON\nevent in order to be able to partition the data and allow consumers to choose the\ntype of events they want to process. Again combining a single Kinesis stream for\nthe events with a Delta Lake “Events” table reduces the operational complexity while\nmaking things easier for scaling during peak hours.\n\n\nAll the details are extracted from JSON for the Silver table\n\n\n-----\n\n**CDN logs**\nThe CDN logs are delivered to S3, so the easiest way to process them is the Databricks\nAuto Loader, which incrementally and efficiently processes new data files as they\narrive in S3 without any additional setup.\n\nauto_loader_df = spark.readStream.format( “cloudFiles” ) \\\n\n.option( “cloudFiles.format” , “json” ) \\\n\n.option( “cloudFiles.region” , region) \\\n\n.load(input_location)\n\nanonymized_df = auto_loader_df. select ( ‘*’ , ip_\n\nanonymizer( ‘requestip’ ). alias ( ‘ip’ ))\\\n\n.drop( ‘requestip’ )\\\n\n.withColumn( “origin” , map_ip_to_location(col( ‘ip’ )))\n\nanonymized_df.writeStream \\\n\n.option( ‘checkpointLocation’ , checkpoint_location)\\\n\n.format( ‘delta’ ) \\\n\n.table(silver_database + ‘.cdn_logs’ )\n\nAs the logs contain IPs — considered personal data under the GDPR regulations — the\n“make your data available to everyone” pipeline has to include an anonymization step.\nDifferent techniques can be used, but we decided to just strip the last octet from IPv4\nand the last 80 bits from IPv6. On top, the data set is also enriched with information\naround the origin country and the ISP provider, which will be used later in the Network\nOperation Centers for localization.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "79c3bd3bc2ddef2dc23c74442ac386f7", + "anonymizer( ‘requestip’ ). alias ( ‘ip’ ))\\\n\n.drop( ‘requestip’ )\\\n\n.withColumn( “origin” , map_ip_to_location(col( ‘ip’ )))\n\nanonymized_df.writeStream \\\n\n.option( ‘checkpointLocation’ , checkpoint_location)\\\n\n.format( ‘delta’ ) \\\n\n.table(silver_database + ‘.cdn_logs’ )\n\nAs the logs contain IPs — considered personal data under the GDPR regulations — the\n“make your data available to everyone” pipeline has to include an anonymization step.\nDifferent techniques can be used, but we decided to just strip the last octet from IPv4\nand the last 80 bits from IPv6. 
On top, the data set is also enriched with information\naround the origin country and the ISP provider, which will be used later in the Network\nOperation Centers for localization.\n\n\n-----\n\n**Creating the Dashboard /**\n**Virtual Network Operation Centers**\nStreaming companies need to monitor network performance and the user experience\nas near real-time as possible, tracking down to the individual level with the ability to\nabstract at the segment level, easily defining new segments such as those defined by\ngeos, devices, networks and/or current and historical viewing behavior.\n\nFor streaming companies that has meant adopting the concept of Network Operation\nCenters (NOC) from telco networks for monitoring the health of the streaming\nexperience for their users at a macro level, flagging and responding to any issues\nearly on. At their most basic, NOCs should have dashboards that compare the current\nexperience for users against a performance baseline so that the product teams can\nquickly and easily identify and attend to any service anomalies.\n\nIn the QoS solution we have incorporated a [Databricks dashboard](https://docs.databricks.com/notebooks/dashboards.html) . BI tools can also\nbe effortlessly connected in order to build more complex visualizations, but based\non customer feedback, built-in dashboards are, most of the time, the fastest way to\npresent the insights to business users.\n\nThe aggregated tables for the NOC will basically be the Gold layer of our Delta\nArchitecture — a combination of CDN logs and the application events. Example of Network Operations Center dashboard\n\n\n-----\n\nThe dashboard is just a way to visually package the results of SQL queries or Python\n/ R transformation — each notebook supports multiple dashboards so in case of\nmultiple end users with different requirements we don’t have to duplicate the code —\nas a bonus the refresh can also be scheduled as a Databricks job.\n\nVisualization of the results of a SQL query\n\nLoading time for videos (time to first frame) allows better understanding of the\nperformance for individual locations of your CDN — in this case the AWS CloudFront\nEdge nodes — which has a direct impact in your strategy for improving this KPI —\neither by spreading the user traffic over multi-CDNs or maybe just implementing a\ndynamic origin selection in case of AWS CloudFront using Lambda@Edge.\n\n\n-----\n\nFailure to understand the reasons for high levels of buffering — and the poor video\nquality experience that it brings — has a significant impact on subscriber churn rate.\nOn top of that, advertisers are not willing to spend money on ads responsible for\nreducing the viewer engagement — as they add extra buffering on top, so the profits\non the advertising business usually are impacted too. In this context, collecting as\nmuch information as possible from the application side is crucial to allow the analysis\nto be done not only at video level but also browser or even type / version of application.\n\nOn the content side, events for the application can provide useful information about\nuser behavior and overall quality of experience. How many people that paused a video\nhave actually finished watching that episode / video? What caused the stoppage: The\nquality of the content or delivery issues? 
Of course, further analyses can be done by\nlinking all the sources together (user behavior, performance of CDNs /ISPs) to not only\ncreate a user profile but also to forecast churn.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "ae2829e8bde14a57789f9914fb41bddd", + "-----\n\nFailure to understand the reasons for high levels of buffering — and the poor video\nquality experience that it brings — has a significant impact on subscriber churn rate.\nOn top of that, advertisers are not willing to spend money on ads responsible for\nreducing the viewer engagement — as they add extra buffering on top, so the profits\non the advertising business usually are impacted too. In this context, collecting as\nmuch information as possible from the application side is crucial to allow the analysis\nto be done not only at video level but also browser or even type / version of application.\n\nOn the content side, events for the application can provide useful information about\nuser behavior and overall quality of experience. How many people that paused a video\nhave actually finished watching that episode / video? What caused the stoppage: The\nquality of the content or delivery issues? Of course, further analyses can be done by\nlinking all the sources together (user behavior, performance of CDNs /ISPs) to not only\ncreate a user profile but also to forecast churn.\n\n\n-----\n\n**Creating (near) real-time alerts**\nWhen dealing with the velocity, volume and variety of data generated in video\nstreaming from millions of concurrent users, dashboard complexity can make it\nharder for human operators in the NOC to focus on the most important data at the\nmoment and zero-in on root cause issues. With this solution, you can easily set up\nautomated alerts when performance crosses certain thresholds that can help the\nhuman operators of the network as well as set off automatic remediation protocols\nvia a Lambda function. For example:\n\n- If a CDN is having latency much higher than baseline (e.g., if it’s more than 10%\nlatency vs. baseline average), initiate automatic CDN traffic shifts.\n\n- If more than [some threshold, e.g., 5%] of clients report playback errors, alert the\nproduct team that there is likely a client issue for a specific device.\n\n- If viewers on a certain ISP are having higher-than-average buffering and\npixelation issues, alert frontline customer representatives on responses and ways\nto decrease issues (e.g., set stream quality lower).\n\nFrom a technical perspective, generating real-time alerts requires a streaming\nengine capable of processing data real time and publish-subscribe service to push\nnotifications.\n\n\nupdates of web applications) or Amazon SQS for other consumers. 
The [custom for](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html)\n[each writer](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html) option makes the writing of a pipeline to send email notifications based\non a rule-based engine (e.g., validating the percentage of errors for each individual\ntype of app over a period of time) really straightforward.\n\ndef send_error_notification(row):\n\nsns_client = boto3.client( ‘sns’ , region)\n\nerror_message = ‘Number of errors for the App has exceeded the\n\nthreshold {}’ .format(row[ ‘percentage’ ])\n\nresponse = sns_client.publish(\n\nTopicArn =,\n\nMessage = error_message,\n\nSubject =,\n\nMessageStructure = ‘string’ )\n\n# Structured Streaming Job\n\ngetKinesisStream( “player_events” )\\\n\n.selectExpr( “type” , “app_type” )\\\n\n.groupBy( “app_type” )\\\n\n.apply(calculate_error_percentage)\\\n\n. where ( “percentage > {}” .format(threshold)) \\\n\n.writeStream\\\n\n. foreach (send_error_notification)\\\n\n.start()\n\n\nIntegrating microservices using Amazon SNS and Amazon SQS\n\nSending email notifications using AWS SNS\n\nThe QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\nby using Amazon SNS and its integrations with Amazon Lambda (see below for the\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "32aae84b9afcab02dd16df7cda65615f", + "Subject =,\n\nMessageStructure = ‘string’ )\n\n# Structured Streaming Job\n\ngetKinesisStream( “player_events” )\\\n\n.selectExpr( “type” , “app_type” )\\\n\n.groupBy( “app_type” )\\\n\n.apply(calculate_error_percentage)\\\n\n. where ( “percentage > {}” .format(threshold)) \\\n\n.writeStream\\\n\n. foreach (send_error_notification)\\\n\n.start()\n\n\nIntegrating microservices using Amazon SNS and Amazon SQS\n\nSending email notifications using AWS SNS\n\nThe QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\nby using Amazon SNS and its integrations with Amazon Lambda (see below for the\n\n\n-----\n\nOn top of the basic email use case, the Demo Player includes three widgets updated\nin real time using AWS AppSync: the number of active users, the most popular videos\nand the number of users concurrently watching a video.\n\nUpdating the application with the results of real-time aggregations\n\nThe QoS solution is applying a similar approach — Structured Streaming and Amazon\nSNS — to update all the values allowing for extra consumers to be plugged in using AWS\nSQS. This is a common pattern when huge volumes of events have to be enhanced and\nanalyzed; pre-aggregate data once and allow each service (consumer) to make their\nown decision downstream.\n\n**Next steps: machine learning**\nManually making sense of the historical data is important but is also very slow. If\nwe want to be able to make automated decisions in the future, we have to integrate\nmachine learning algorithms.\n\nAs a Unified Data Platform, Databricks empowers data scientists to build better data\nscience products using features like Runtime for Machine Learning with built-in\nor the integration with MLflow, the end-toend machine learning lifecycle management tool. 
support for [Hyperopt](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/index.html#hyperopt-overview) / [Horvod](https://docs.databricks.com/applications/machine-learning/train-model/distributed-training/horovod-runner.html) / [AutoML](https://databricks.com/product/automl-on-databricks)\n\n\n-----\n\nWe have already explored a few important use cases across our customer base while\nfocusing on the possible extensions to the QoS solution.\n\n**Point-of-failure prediction and remediation**\nAs D2C streamers reach more users, the costs of even momentary loss of service\nincreases. ML can help operators move from reporting to prevention by forecasting\nwhere issues could come up and remediating before anything goes wrong (e.g.,\na spike in concurrent viewers leads to switching CDNs to one with more capacity\nautomatically).\n\n**Customer churn**\nCritical to growing subscription services is keeping the subscribers you have. By\nunderstanding the quality of service at the individual level, you can add QoS as a\nvariable in churn and customer lifetime value models. Additionally, you can create\ncustomer cohorts for those who have had video quality issues in order to test\nproactive messaging and save offers.\n\n\n**Getting started with the Databricks streaming video**\n**QoS solution**\nProviding consistent quality in the streaming video experience is table stakes at this\npoint to keep fickle audiences with ample entertainment options on your platform.\nWith this solution we have sought to create a quick start for most streaming video\nplatform environments to embed this QoS real-time streaming analytics solution in\na way that:\n1. Scales to any audience size\n2. Quickly flags quality performance issues at key parts of the distribution workflow\n3. Is flexible and modular enough to easily customize for your audience and your\nneeds, such as creating new automated alerts or enabling data scientists to test\nand roll out predictive analytics and machine learning", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "bcc6aa2a7284ca86601537158789cd67", + "**Customer churn**\nCritical to growing subscription services is keeping the subscribers you have. By\nunderstanding the quality of service at the individual level, you can add QoS as a\nvariable in churn and customer lifetime value models. Additionally, you can create\ncustomer cohorts for those who have had video quality issues in order to test\nproactive messaging and save offers.\n\n\n**Getting started with the Databricks streaming video**\n**QoS solution**\nProviding consistent quality in the streaming video experience is table stakes at this\npoint to keep fickle audiences with ample entertainment options on your platform.\nWith this solution we have sought to create a quick start for most streaming video\nplatform environments to embed this QoS real-time streaming analytics solution in\na way that:\n1. Scales to any audience size\n2. Quickly flags quality performance issues at key parts of the distribution workflow\n3. 
Is flexible and modular enough to easily customize for your audience and your\nneeds, such as creating new automated alerts or enabling data scientists to test\nand roll out predictive analytics and machine learning\n\nTo get started, download the notebooks for the [Databricks streaming video QoS](https://databricks.com/notebooks/QoS/index.html#00.config.html)\n[solution](https://databricks.com/notebooks/QoS/index.html#00.config.html) . For more guidance on how to unify batch and streaming data into a single\nsystem, view the [Delta Architecture webinar](https://pages.databricks.com/201908-WB-Delta-Architecture-A-Step-Beyond-Lambda-Architecture_Reg.html) .\n\n\n-----\n\n**Customer Use Cases**\nSee how customers are using\nDelta Lake to rapidly innovate\n\n## CHAPTER 05\n\n\n-----\n\n**Healthdirect Australia**\nProvides Personalized and Secure Online\nPatient Care With Databricks\n\nAs the shepherds of the National Health Services Directory (NHSD), Healthdirect\nis focused on leveraging terabytes of data covering time-driven, activity-based\nhealthcare transactions to improve health care services and support. With\ngovernance requirements, siloed teams and a legacy system that was difficult\nto scale, they moved to Databricks. This boosted data processing for downstream\nmachine learning while improving data security to meet HIPAA requirements.\n\n**Spotlight on Healthdirect**\n**Industry:** Healthcare and life sciences\n6x\nImprovement in data processing\n20M\nRecords ingested in minutes\n\n**Data quality and governance issues, silos, and the**\n**inability to scale**\nDue to regulatory pressures, Healthdirect Australia set forth to improve overall data\nquality and ensure a level of governance on top of that, but they ran into challenges\nwhen it came to data storage and access. On top of that, data silos were blocking the\nteam from efficiently preparing data for downstream analytics. These disjointed data\n\n\n-----\n\nsources impacted the consistency of data reads, as data was oftentimes out-of-sync\nbetween the various systems in their stack. The low-quality data also led to higher\nerror rates and processing inefficiencies. This fragmented architecture created\nsignificant operational overhead and limited their ability to have a comprehensive\nview of the patient.\n\nFurther, they needed to ingest over 1 billion data points due to a changing landscape\nof customer demand such as bookings, appointments, pricing, eHealth transaction\nactivity, etc. — estimated at over 1TB of data.\n\n“We had a lot of data challenges. We just couldn’t process efficiently enough. We\nwere starting to get batch overruns. We were starting to see that a 24-hour window\nisn’t the most optimum time in which we want to be able to deliver healthcare data\nand services,” explained Peter James, Chief Architect at Healthdirect Australia.\n\nUltimately, Healthdirect realized they needed to modernize their end-to-end process\nand tech stack to properly support the business.\n\n**Modernizing analytics with Databricks and Delta Lake**\nDatabricks provides Healthdirect Australia with a Unified Data Platform that simplifies\ndata engineering and accelerates data science innovation. 
The notebook environment\nenables them to make content changes in a controlled fashion rather than having to\nrun bespoke jobs each time.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "5f58acfe64720103ee19410297467f1a", + "Further, they needed to ingest over 1 billion data points due to a changing landscape\nof customer demand such as bookings, appointments, pricing, eHealth transaction\nactivity, etc. — estimated at over 1TB of data.\n\n“We had a lot of data challenges. We just couldn’t process efficiently enough. We\nwere starting to get batch overruns. We were starting to see that a 24-hour window\nisn’t the most optimum time in which we want to be able to deliver healthcare data\nand services,” explained Peter James, Chief Architect at Healthdirect Australia.\n\nUltimately, Healthdirect realized they needed to modernize their end-to-end process\nand tech stack to properly support the business.\n\n**Modernizing analytics with Databricks and Delta Lake**\nDatabricks provides Healthdirect Australia with a Unified Data Platform that simplifies\ndata engineering and accelerates data science innovation. The notebook environment\nenables them to make content changes in a controlled fashion rather than having to\nrun bespoke jobs each time.\n\n“Databricks has provided a big uplift for our teams and our data operations,” said\nJames. “The analysts were working directly with the data operations teams. They are\nable to achieve the same pieces of work together within the same time frames that\nused to take twice as long. They’re working together, and we’re seeing just a massive\nacceleration in the speed at which we can deliver service.”\n\n\n-----\n\nWith Delta Lake, they’ve created logical data zones: Landing, Raw, Staging and Gold.\nWithin these zones, they store their data “as is,” in their structured or unstructured\nstate, in Delta Lake tables. From there, they use a metadata-driven schema and hold\nthe data within a nested structure within that table. What this allows them to do is\nhandle data consistently from every source and simplifies the mapping of data to the\nvarious applications pulling the data.\n\nMeanwhile, through Structured Streaming, they were able to convert all of their\nETL batch jobs into streaming ETL jobs that could serve multiple applications\nconsistently. Overall, the advent of Spark Structured Streaming, Delta Lake and the\nDatabricks Unified Data Platform provides significant architectural improvements\nthat have boosted performance, reduced operational overheads and increased\nprocess efficiencies.\n\n\n**Faster data pipelines result in better patient-driven**\n**healthcare**\nAs a result of the performance gains delivered by Databricks and the improved data\nreliability through Delta Lake, Healthdirect Australia realized improved accuracy of\ntheir fuzzy name match algorithm from less than 80% with manual verification to 95%\nand no manual intervention.\n\nThe processing improvements with Delta Lake and Structured Streaming allowed\nthem to process more than 30,000 automated updates per month. 
Prior to Databricks,\nthey had to use unreliable batch jobs that were highly manual to process the same\nnumber of updates over a span of 6 months — a 6x improvement in data processing.\n\n“Databricks delivered the time to market as well as the analytics and operational\nuplift that we needed in order to be able to meet the new demands of the\nhealthcare sector.”\n\n– Peter James, Chief Architect, Healthdirect Australia\n\n\n-----\n\nThey were also able to increase their data load rate to 1 million records per minute,\nloading their entire 20 million record data set in 20 minutes. Before the adoption\nof Databricks, this used to take more than 24 hours to process the same 1 million\ntransactions, blocking analysts from making swift decisions to drive results.\n\nLast, data security, which was critical to meet compliance requirements, was greatly\nimproved. Databricks provides standard security accreditations like HIPAA, and\nHealthdirect was able to use Databricks to meet Australia’s security requirements.\nThis yielded significant cost reductions and gave them continuous data assurance\nby monitoring changes to access privileges like changes in roles, metadata-level\nsecurity changes, data leakage, etc.\n\n“Databricks delivered the time to market as well as the analytics and operational\nuplift that we needed in order to be able to meet the new demands of the healthcare\nsector,” said James.\n\nWith the help of Databricks, they have proven the value of data and analytics and how\nit can impact their business vision. With transparent access to data that boasts\nwell-documented lineage and quality, participation across various business and\nanalyst groups has increased — empowering teams to collaborate and more\neasily and quickly extract value from their data with the goal of improving\nhealthcare for everyone.\n\n\n-----\n\n**Comcast**\nUses Delta Lake and MLflow to\nTransform the Viewer Experience", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "59a7715022c21c9e2cf313c9bc094c0b", + "Last, data security, which was critical to meet compliance requirements, was greatly\nimproved. Databricks provides standard security accreditations like HIPAA, and\nHealthdirect was able to use Databricks to meet Australia’s security requirements.\nThis yielded significant cost reductions and gave them continuous data assurance\nby monitoring changes to access privileges like changes in roles, metadata-level\nsecurity changes, data leakage, etc.\n\n“Databricks delivered the time to market as well as the analytics and operational\nuplift that we needed in order to be able to meet the new demands of the healthcare\nsector,” said James.\n\nWith the help of Databricks, they have proven the value of data and analytics and how\nit can impact their business vision. 
With transparent access to data that boasts\nwell-documented lineage and quality, participation across various business and\nanalyst groups has increased — empowering teams to collaborate and more\neasily and quickly extract value from their data with the goal of improving\nhealthcare for everyone.\n\n\n-----\n\n**Comcast**\nUses Delta Lake and MLflow to\nTransform the Viewer Experience\n\n**Spotlight on Comcast**\n**Industry:** Media and entertainment\n10x\nReduction in overall compute costs to process data\n90%\nReduction in required DevOps resources to manage infrastructure\nReduced\nDeployment times from weeks to minutes\n\nAs a global technology and media company connecting millions of customers to\npersonalized experiences, Comcast struggled with massive data, fragile data pipelines\n\nand poor data science collaboration. With Databricks — leveraging Delta Lake and MLflow\n— they can build performant data pipelines for petabytes of data and easily manage the\nlifecycle of hundreds of models to create a highly innovative, unique and award-winning\nviewer experience using voice recognition and machine learning.\n\n\n-----\n\n**Infrastructure unable to support data and ML needs**\nInstantly answering a customer’s voice request for a particular program while turning\nbillions of individual interactions into actionable insights, strained Comcast’s IT\ninfrastructure and data analytics and data science teams. To make matters more\ncomplicated, Comcast needed to deploy models to a disjointed and disparate range\nof environments: cloud, on-premises and even directly to devices in some instances.\n\n- **Massive data:** Billions of events generated by the entertainment system and 20+\nmillion voice remotes, resulting in petabytes of data that need to be sessionized\nfor analysis.\n\n- **Fragile pipelines:** Complicated data pipelines that frequently failed and were\nhard to recover. Small files were difficult to manage, slowing data ingestion for\ndownstream machine learning.\n\n- **Poor collaboration:** Globally dispersed data scientists working in different\nscripting languages struggled to share and reuse code.\n\n- **Manage management of ML models:** Developing, training and deploying hundreds\nof models was highly manual, slow and hard to replicate, making it difficult to scale.\n\n- **Friction between dev and deployment:** Dev teams wanted to use the latest tools\nand models while ops wanted to deploy on proven infrastructure.\n\n\n-----\n\n**Automated infrastructure, faster data**\n**pipelines with Delta Lake**\nComcast realized they needed to modernize their entire approach to analytics from\ndata ingest to the deployment of machine learning models to delivering new features\nthat delight their customers. 
Today, the Databricks Unified Data Platform enables\nComcast to build rich data sets and optimize machine learning at scale, streamline\nworkflows across teams, foster collaboration, reduce infrastructure complexity, and\ndeliver superior customer experiences.\n\n- **Simplified infrastructure management:** Reduced operational costs through\nautomated cluster management and cost management features such as\nautoscaling and spot instances.\n\n\n\n- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\ninitial processing of the raw telemetry from video and voice applications and devices.\n\n- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\nand reliable ingestion at scale.\n\n- **Collaborative workspaces:** Interactive notebooks improve cross-team\ncollaboration and data science creativity, allowing Comcast to greatly accelerate\nmodel prototyping for faster iteration.\n\n- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\nand model serving via the Kubeflow environment, allowing them to track and\nmanage hundreds of models with ease.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "97c9548f51ef028ef3850848b3a03fe8", + "- **Simplified infrastructure management:** Reduced operational costs through\nautomated cluster management and cost management features such as\nautoscaling and spot instances.\n\n\n\n- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\ninitial processing of the raw telemetry from video and voice applications and devices.\n\n- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\nand reliable ingestion at scale.\n\n- **Collaborative workspaces:** Interactive notebooks improve cross-team\ncollaboration and data science creativity, allowing Comcast to greatly accelerate\nmodel prototyping for faster iteration.\n\n- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\nand model serving via the Kubeflow environment, allowing them to track and\nmanage hundreds of models with ease.\n\n- **Reliable ETL at scale:** Delta Lake provides efficient analytics pipelines at scale\nthat can reliably join historic and streaming data for richer insights.\n\n\n-----\n\n**Delivering personalized experiences with ML**\nIn the intensely competitive entertainment industry, there is no time to press the\nPause button. Armed with a unified approach to analytics, Comcast can now fastforward into the future of AI-powered entertainment — keeping viewers engaged and\ndelighted with competition-beating customer experiences.\n\n- **Emmy-winning viewer experience:** Databricks helps enable Comcast to create\na highly innovative and award-winning viewer experience with intelligent voice\ncommands that boosts engagement.\n\n- **Reduced compute costs by 10x:** Delta Lake has enabled Comcast to optimize data\ningestion, replacing 640 machines with 64 while improving performance. Teams\ncan spend more time on analytics and less time on infrastructure management.\n\n- **Less DevOps:** Reduced the number of DevOps full-time employees required for\nonboarding 200 users from 5 to 0.5.\n\n- **Higher data science productivity:** Fostered collaboration between global data\nscientists by enabling different programming languages through a single\ninteractive workspace. 
Also, Delta Lake has enabled the data team to use data at\nany point within the data pipeline, allowing them to act more quickly in building\nand training new models.\n\n- **Faster model deployment:** Reduced deployment times from weeks to minutes as\noperations teams deployed models on disparate platforms.\n\n\n-----\n\n**Banco Hipotecario**\nPersonalizes the Banking\nExperience With Data and ML\n\nBanco Hipotecario — a leading Argentinian commercial bank — is on a mission\nto leverage machine learning to deliver new insights and services that will delight\ncustomers and create upsell opportunities. With a legacy analytics and data\nwarehousing system that was rigid and complex to scale, they turned to Databricks\nto unify data science, engineering and analytics.\n\nAs a result of this partnership, they were able to significantly increase customer\nacquisition and cross-sells while lowering the cost for acquisition, greatly impacting\noverall customer retention and profitability.\n\n**Spotlight on Banco Hipotecario**\n**Industry:** Financial services\n35%\n\nReduction in cost of acquisition\n**Technical use cases:** Ingest and ETL, machine learning and SQL Analytics\n\n\n-----\n\n**Legacy analytics tools are slow, rigid and**\n**impossible to scale**\nBanco Hipotecario set forth to increase customer acquisition by reducing risk and\nimproving the customer experience. With data analytics and machine learning\nanchoring their strategy, they hoped to influence a range of use cases from fraud\ndetection and risk analysis to serving product recommendations to drive upsell and\ncross-sell opportunities and forecast sales.\n\nBanco Hipotecario faced a number of the challenges that often come along with\noutdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n— the list goes on.\n\n“In order to execute on our data analytics strategy, new technologies were needed\nin order to improve data engineering and boost data science productivity,” said\nDaniel Sanchez, Enterprise Data Architect at Banco Hipotecario. “The first steps we\ntook were to move to a cloud-based data lake, which led us to Azure Databricks\nand Delta Lake.”\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "570d73760acc1c981d74f3474639d57e", + "-----\n\n**Legacy analytics tools are slow, rigid and**\n**impossible to scale**\nBanco Hipotecario set forth to increase customer acquisition by reducing risk and\nimproving the customer experience. With data analytics and machine learning\nanchoring their strategy, they hoped to influence a range of use cases from fraud\ndetection and risk analysis to serving product recommendations to drive upsell and\ncross-sell opportunities and forecast sales.\n\nBanco Hipotecario faced a number of the challenges that often come along with\noutdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n— the list goes on.\n\n“In order to execute on our data analytics strategy, new technologies were needed\nin order to improve data engineering and boost data science productivity,” said\nDaniel Sanchez, Enterprise Data Architect at Banco Hipotecario. 
“The first steps we\ntook were to move to a cloud-based data lake, which led us to Azure Databricks\nand Delta Lake.”\n\n\n-----\n\n**A unified platform powers the data lake**\n**and easy collaboration**\nBanco Hipotecario turned to Databricks to modernize their data warehouse\nenvironment, improve cross-team collaboration, and drive data science innovation.\nFully managed in Microsoft Azure, they were able to easily and reliably ingest massive\nvolumes of data, spinning up their whole infrastructure in 90 days. With Databricks’\nautomated cluster management capabilities, they are able to scale clusters ondemand to support large workloads.\n\nDelta Lake has been especially useful in bringing reliability and performance to Banco\nHipotecario’s data lake environment. With Delta Lake, they are now able to build\nreliable and performant ETL pipelines like never before.\n\n\nMeanwhile, performing SQL Analytics on Databricks has helped them do data\nexploration, cleansing and generate data sets in order to create models, enabling the\nteam to deploy their first model within the first three months, and the second model\ngenerated was rolled out in just two weeks.\n\nAt the same time, data scientists were finally able to collaborate, thanks to interactive\nnotebooks; this meant faster builds, training and deployment. And MLflow streamlined\nthe ML lifecycle and removed the overreliance on data engineering.\n\n“Databricks gives our data scientists the means to easily create our own experiments\nand deploy them to production in weeks, rather than months,” said Miguel Villalba,\nHead of Data Engineering and Data Science.\n\n\n-----\n\n**An efficient team maximizes customer**\n**acquisition and retention**\nSince moving to Databricks, the data team at Banco Hipotecario could not be happier,\nas Databricks has unified them across functions in an integrated fashion.\n\nThe results of data unification and markedly improved collaboration and autonomy\ncannot be overstated. Since deploying Databricks, Banco Hipotecario has increased\ntheir cross-sell into new products by a whopping 90%, while machine learning has\nreduced the cost of customer acquisition by 35%.\n\n\n-----\n\n**Viacom18**\nMigrates From Hadoop to Databricks to\nDeliver More Engaging Experiences\n\nViacom18 Media Pvt. Ltd. is one of India’s fastest-growing entertainment networks\nwith 40x growth over the past decade. They offer multi-platform, multigenerational\nand multicultural brand experiences to 600+ million monthly viewers.\n\nIn order to deliver more engaging experiences for their millions of viewers, Viacom18\nmigrated from their Hadoop environment due to its inability to process data at scale\nefficiently. With Databricks, they have streamlined their infrastructure management,\nincreased data pipeline speeds and increased productivity among their data teams.\n\nToday, Viacom18 is able to deliver more relevant viewing experiences to their\nsubscribers, while identifying opportunities to optimize the business and drive\ngreater ROI.\n\n**Spotlight on Viacom18**\n**Industry:** Media and entertainment\n26%\nIncrease in operational efficiency lowers overall costs\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8165c97cf16d36d736f0cfa3da080901", + "-----\n\n**Viacom18**\nMigrates From Hadoop to Databricks to\nDeliver More Engaging Experiences\n\nViacom18 Media Pvt. Ltd. 
is one of India’s fastest-growing entertainment networks\nwith 40x growth over the past decade. They offer multi-platform, multigenerational\nand multicultural brand experiences to 600+ million monthly viewers.\n\nIn order to deliver more engaging experiences for their millions of viewers, Viacom18\nmigrated from their Hadoop environment due to its inability to process data at scale\nefficiently. With Databricks, they have streamlined their infrastructure management,\nincreased data pipeline speeds and increased productivity among their data teams.\n\nToday, Viacom18 is able to deliver more relevant viewing experiences to their\nsubscribers, while identifying opportunities to optimize the business and drive\ngreater ROI.\n\n**Spotlight on Viacom18**\n**Industry:** Media and entertainment\n26%\nIncrease in operational efficiency lowers overall costs\n\n\n-----\n\n**Growth in subscribers and terabytes of viewing data**\n**push Hadoop to its limits**\nViacom18, a joint venture between Network18 and ViacomCBS, is focused on\nproviding its audiences with highly personalized viewing experiences. The core\nof this strategy requires implementing an enterprise data architecture that enables\nthe building of powerful customer analytics on daily viewer data. But with millions of\nconsumers across India, the sheer amount of data was tough to wrangle: They were\ntasked with ingesting and processing over 45,000 hours of daily content on VOOT\n(Viacom18’s on-demand video subscription platform), which easily generated 700GB\nto 1TB of data per day.\n\n“Content is at the heart of what we do,” explained Parijat Dey, Viacom18’s Assistant\nVice President of Digital Transformation and Technology. “We deliver personalized\ncontent recommendations across our audiences around the world based on\nindividual viewing history and preferences in order to increase viewership and\ncustomer loyalty.”\n\nViacom18’s data lake, which was leveraging on-premises Hadoop for operations,\nwasn’t able to optimally process 90 days of rolling data within their management’s\ndefined SLAs, limiting their ability to deliver on their analytics needs, which impacted\nnot only the customer experience but also overall costs.\n\nTo meet this challenge head-on, Viacom18 needed a modern data warehouse with the\nability to analyze data trends for a longer period of time instead of daily snapshots. They\nalso needed a platform that simplified infrastructure by allowing their team to easily\nprovision clusters with features like auto-scaling to help reduce compute costs.\n\n\n-----\n\n**Rapid data processing for analytics**\n**and ML with Databricks**\nTo enable the processing power and data science capabilities they required, Viacom18\npartnered with Celebal Technologies, a premier Salesforce, data analytics and big data\nconsulting organization based in India. The team at Celebal leveraged Azure Databricks\nto provide Viacom18 with a unified data platform that modernizes its data warehousing\ncapabilities and accelerates data processing at scale.\n\nThe ability to cache data within Delta Lake resulted in the much-needed acceleration\nof queries, while cluster management with auto-scaling and the decoupling of\n\n\nstorage and compute simplified Viacom18’s infrastructure management and\noptimized operational costs. “Delta Lake has created a streamlined approach to\nthe management of data pipelines,” explained Dey. 
“This has led to a decrease in\noperational costs while speeding up time-to-insight for downstream analytics and\ndata science.”\n\nThe notebooks feature was an unexpected bonus for Viacom18, as a common workspace\ngave data teams a way to collaborate and increase productivity on everything from\nmodel training to ad hoc analysis, dashboarding and reporting via PowerBI.\n\n\n-----\n\n**Leveraging viewer data to power personalized**\n**viewing experiences**\nCelebal Technologies and Databricks have enabled Viacom18 to deliver innovative\ncustomer solutions and insights with increased cross-team collaboration and\nproductivity. With Databricks, Viacom18’s data team is now able to seamlessly\nnavigate their data while better serving their customers.\n\n“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data\nand deliver customer behavioral and engagement insights to the analysts and data\nscientists,” said Dey.\n\nIn addition to performance gains, the faster query times have also lowered the overall\ncost of ownership, even with daily increases in data volumes. “Azure Databricks has\ngreatly streamlined processes and improved productivity by an estimated 26%,”\nconcluded Dey.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "d8efcb2139fbb03f80ddfc58b319efc1", + "The notebooks feature was an unexpected bonus for Viacom18, as a common workspace\ngave data teams a way to collaborate and increase productivity on everything from\nmodel training to ad hoc analysis, dashboarding and reporting via PowerBI.\n\n\n-----\n\n**Leveraging viewer data to power personalized**\n**viewing experiences**\nCelebal Technologies and Databricks have enabled Viacom18 to deliver innovative\ncustomer solutions and insights with increased cross-team collaboration and\nproductivity. With Databricks, Viacom18’s data team is now able to seamlessly\nnavigate their data while better serving their customers.\n\n“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data\nand deliver customer behavioral and engagement insights to the analysts and data\nscientists,” said Dey.\n\nIn addition to performance gains, the faster query times have also lowered the overall\ncost of ownership, even with daily increases in data volumes. 
“Azure Databricks has\ngreatly streamlined processes and improved productivity by an estimated 26%,”\nconcluded Dey.\n\nOverall, Dey cites the migration from Hadoop to Databricks has delivered significant\nbusiness value — reducing the cost of failure, accelerating processing speeds at\nscale, and simplifying ad hoc analysis for easier data exploration and innovations that\ndeliver highly engaging customer experiences.\n\n\n-----\n\n# What’s next?\n\nNow that you understand Delta Lake, it may be time to take a look\nat some additional resources.\n\n**Do a deep dive into Delta Lake >**\n\n- [Getting Started With Delta Lake Tech Talk Series](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks)\n\n- [Diving Into Delta Lake Tech Talk Series](https://databricks.com/discover/diving-into-delta-lake-talks)\n\n- [Visit the site](https://databricks.com/product/delta-lake-on-databricks) for additional resources\n\n**[Try Databricks for free >](https://databricks.com/try-databricks)**\n**[Learn more >](https://pages.databricks.com/delta-lake-open-source-reliability-for-data-lakes-reg.html)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "b6cb59222f689e17c785b0d37ad018bb", + "**EBOOK**\n\n# All Roads Lead to the Lakehouse\n\n#### A deep dive into data ingestion with the lakehouse\n\n\n-----\n\n## Contents\n\nIntroduction...................................................................................................................................................................................................................... **03**\n\nLife of a Data Engineer ............................................................................................................................................................................................... **04**\n\nIngesting From Cloud Object Stores...................................................................................................................................................................... **05**\n\nCOPY INTO ......................................................................................................................................................................................................... **06**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "e49a7d2e3bd1f6a60e1306c0186dcdd5", + "COPY INTO ......................................................................................................................................................................................................... **06**\n\nAuto Loader ....................................................................................................................................................................................................... **09**\n\nIngesting Data From External Applications .......................................................................................................................................................... **13**\n\nPartner Connect ............................................................................................................................................................................................... 
**13**\n\n\n-----\n\n### Introduction\n\nOrganizations today are inundated with data siloed across various on-premises\napplication systems, databases, data warehouses and SaaS applications. This\nfragmentation makes it difficult to support new use cases for analytics or machine\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\narchitecture built on top of Delta Lake, an open format storage layer.\n\nThe first thing data engineers need to do to support the lakehouse architecture is to\nefficiently move data from various systems into their lakehouse. Ingesting data is a\ncritical first step in the data engineering and management lifecycle.\n\n\n-----\n\n### Life of a Data Engineer\n\nThe primary focus of data engineers is to provide timely and reliable data to downstream\n\ndata teams at an organization. Requests for data can come from a variety of teams, and for\n\n\na variety of data types. For example:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "292dc167706156bfcc1bfad9b793a6e7", + "-----\n\n### Introduction\n\nOrganizations today are inundated with data siloed across various on-premises\napplication systems, databases, data warehouses and SaaS applications. This\nfragmentation makes it difficult to support new use cases for analytics or machine\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\narchitecture built on top of Delta Lake, an open format storage layer.\n\nThe first thing data engineers need to do to support the lakehouse architecture is to\nefficiently move data from various systems into their lakehouse. Ingesting data is a\ncritical first step in the data engineering and management lifecycle.\n\n\n-----\n\n### Life of a Data Engineer\n\nThe primary focus of data engineers is to provide timely and reliable data to downstream\n\ndata teams at an organization. Requests for data can come from a variety of teams, and for\n\n\na variety of data types. For example:\n\n**•** Marketing team requests for Facebook and Google ad data in order to analyze spend and\n\nbetter allocate their budget for ads\n\n**•** Security team looking to get access to a table with low latency security data from Kafka,\n\nin order to run rules to detect intrusions into the network\n\n**•** Sales operations requesting customer data from Salesforce to enrich existing tables\n\n**•** Finance team hoping to find a way to automatically ingest critical data from Google\n\nSheets or transaction data from AWS Kinesis\n\nIn each of these common scenarios, data engineers must create usable and easily\n\nqueryable tables from semi-structured and unstructured data. Beyond writing queries to\n\nretrieve and transform all this data, the data engineering team must also be concerned\n\nwith performance, because running these queries on an ongoing basis can be a big load on\n\nthe system.\n\nData engineers face the challenge of constant requests and ongoing business\n\n\n###### W H AT I S \n D E LTA L A K E ?\n\nBefore thinking about ingestion into Delta Lake, it’s important to\n\nunderstand why ingesting into Delta Lake is the right solution in\n\nthe first place. [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format data management\n\nlayer that brings data warehouse capabilities to your open data\n\nlake. 
Across industries, enterprises have enabled true collaboration\n\namong their data teams with a reliable single source of truth\n\nenabled by Delta Lake. By delivering quality, reliability, security and\n\nperformance on your data lake — for both streaming and batch\n\noperations — Delta Lake eliminates data silos and makes analytics\n\naccessible across the enterprise. With Delta Lake, customers can\n\nbuild a cost-efficient, highly scalable lakehouse that eliminates\n\ndata silos and provides self-serving analytics to end users.\n\n\nrequirements, as well as an ever-changing ecosystem. As business requirements change,\n\nso do the requirements around schemas, necessitating custom code to handle the\n\nchanges. With all of these challenges, the work of a data engineer is extremely critical, and\n\nincreasingly complex, with many steps involved before getting data to a state where it can\n\nactually be queried by the business stakeholders. So how do data engineers get the data\n\nthat each of these teams need at the frequency, with the freshness, and in the format\n\nrequired?\n\n\n-----\n\n### Ingesting From Cloud Object Stores\n\nThere are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n\ncloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. Typically, customers are looking to migrate\n\nexisting tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8b3547e698ff3b7bcbffb21d2cc6b55a", + "changes. With all of these challenges, the work of a data engineer is extremely critical, and\n\nincreasingly complex, with many steps involved before getting data to a state where it can\n\nactually be queried by the business stakeholders. So how do data engineers get the data\n\nthat each of these teams need at the frequency, with the freshness, and in the format\n\nrequired?\n\n\n-----\n\n### Ingesting From Cloud Object Stores\n\nThere are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n\ncloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. Typically, customers are looking to migrate\n\nexisting tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,\n\n[COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) , and [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . We will focus on Auto Loader and COPY INTO here.\n\n\n**Auto Loader**\n\nAuto Loader is an optimized data ingestion tool that incrementally and efficiently\n\nprocesses new data files as they arrive in cloud storage with minimal DevOps effort. You\n\njust need to provide a source directory path and start a streaming job. The new structured\n\nstreaming source, called “cloudFiles”, will automatically set up file notification services that\n\n\n**COPY INTO**\n\nCOPY INTO is a SQL command that allows you to perform batch file ingestion into Delta\n\nLake. 
COPY INTO is a command that ingests files with exactly-once semantics, best used\n\nwhen the input directory contains thousands of files or fewer, and the user prefers SQL.\n\nCOPY INTO can be used over JDBC to push data into Delta Lake at your convenience.\n\n\nsubscribe file events from the input directory and process new files as they arrive, with the\n\noption of also processing existing files in that directory. Auto Loader has interfaces through\n\nPython and Scala, and can be used with SQL through Delta Live Tables.\n\n\n-----\n\n##### COPY INTO\n\n\nCOPY INTO is a powerful yet simple SQL command that allows you to perform batch file\n\ningestion into Delta Lake and perform many of the use cases outlined in this section. COPY\n\nINTO can be run once, in an ad hoc manner, and can be scheduled through Databricks jobs.\n\n```\nFILEFORMAT = CSV\nFORMAT_OPTIONS (‘header’ = ‘true’)\n\n```\n\nWhile COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n\n\nevents by using cloud functions such as AWS Lambda or through orchestrators like Apache\n\nAirflow. COPY INTO supports incremental appends and simple transformations.\n\nCOPY INTO is a great command to use when your source directory contains a small number\n\nof files (i.e., thousands of files or less). To ingest a larger number of files, we recommend\n\nAuto Loader, which we will cover later in this eBook.\n\n**Common Use Cases for COPY INTO**\n\n**Ingesting data to a new Delta table**\n\nA common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n\ntable. To copy data into a new Delta table, users can use CREATE TABLE command first,\n\nfollowed by COPY INTO.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "86f91de7df1d57274e8dc263ee0837a3", + "```\nFILEFORMAT = CSV\nFORMAT_OPTIONS (‘header’ = ‘true’)\n\n```\n\nWhile COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n\n\nevents by using cloud functions such as AWS Lambda or through orchestrators like Apache\n\nAirflow. COPY INTO supports incremental appends and simple transformations.\n\nCOPY INTO is a great command to use when your source directory contains a small number\n\nof files (i.e., thousands of files or less). To ingest a larger number of files, we recommend\n\nAuto Loader, which we will cover later in this eBook.\n\n**Common Use Cases for COPY INTO**\n\n**Ingesting data to a new Delta table**\n\nA common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n\ntable. To copy data into a new Delta table, users can use CREATE TABLE command first,\n\nfollowed by COPY INTO.\n\nStep 1: `CREATE TABLE` `my_table (id` `INT` `, name STRING, age` `INT` `);`\nStep 2 1 : `COPY INTO` `my_table`\n```\n FROM ‘s3://my_bucket/my_path’ WITH (\n CREDENTIAL (\n AWS_ACCESS_KEY = ‘*****’,\n AWS_SECRET_KEY = ‘*****’,\n AWS_SESSION_TOKEN = ‘*****’\n )\n ENCRYPTION (\n TYPE = ‘AWS_SSE_C’,\n MASTER_KEY = ‘*****’\n\n```\n\nThe code block above covers the AWS temporary in-line credential format. 
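To make the two-step flow concrete, here is a minimal sketch of the same pattern run from a Python notebook with `spark.sql()`, assuming a Databricks notebook where `spark` is predefined; the table name, column list and S3 path are hypothetical placeholders, and the cluster's instance profile is assumed to provide storage access instead of in-line credentials.

```
# Minimal sketch: CREATE TABLE followed by COPY INTO, run from Python.
# Table name, columns and the S3 path below are hypothetical placeholders.

# Step 1: create the target Delta table.
spark.sql("""
  CREATE TABLE IF NOT EXISTS my_table (id INT, name STRING, age INT)
""")

# Step 2: batch-ingest the CSV files. COPY INTO tracks which files it has
# already loaded, so re-running this exact statement is an idempotent no-op.
spark.sql("""
  COPY INTO my_table
  FROM 's3://my_bucket/my_path'
  FILEFORMAT = CSV
  FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true')
""")

# If the data already exists as a Parquet directory, CONVERT TO DELTA is an
# alternative to re-ingesting it:
# spark.sql("CONVERT TO DELTA parquet.`s3://my_bucket/existing_parquet_dir`")
```

Because COPY INTO is a batch command, the same cell can be scheduled as a Databricks job or triggered from an event-driven function, as described above.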
When you use\n\nin-line credentials in Azure and AWS, the following parameters are required for each type of\n\ncredential and encryption:\n\n\n|Credential Name|Required Parameters|\n|---|---|\n|AWS temporary credentials|AWS_ACCESS_KEY AWS_SECRET_KEY|\n||AWS_SESSION_TOKEN|\n|Azure SAS token|AZURE_SAS_TOKEN|\n\n\n\n\n\n|Encryption Name|Required Parameters|\n|---|---|\n|AWS server-side encryption with customer-provided encryption key|TYPE = ‘AWS_SSE_C’ MASTER_KEY|\n|Azure client-provided encryption key|ATYPE = ‘AZURE_CSE’ MASTER_KEY|\n\n\n**Appending data to your Delta table**\n\nTo append data to a Delta table, users can leverage the COPY INTO command. COPY INTO\n\nis a powerful SQL command that is idempotent and incremental. When using COPY INTO,\n\nusers point to a location of files, and once those files are ingested, Delta Lake will keep\n\n1 If you only have temporary access to a cloud object store, you can use temporary in-line credentials to ingest data from\nthe cloud object store. When you are an admin or with ANY FILE access, and the instance profile has been set for the\ncloud object store, you do not need to specify credentials in-line for COPY INTO.\n\n\n-----\n\ntrack of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n\nget idempotency with COPY INTO, which means users are prevented from ingesting the\n\nsame data twice to the same table.\n```\n COPY INTO table_identifier\n FROM [ file_location | ( SELECT expression_list FROM file_location)]\n FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n [ FILES = [file_name [,...] | PATTERN = ‘regex_pattern’ ]\n [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n\n```\nOne of the main benefits of COPY INTO is that users don’t have to worry about providing a\n\nschema, because the schema is automatically inferred from your data files. Here is a very\n\nsimple example of how you would ingest data from CSV files that have headers, where you", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "279384a45f18d48f7cfb7d752294e9fd", + "track of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n\nget idempotency with COPY INTO, which means users are prevented from ingesting the\n\nsame data twice to the same table.\n```\n COPY INTO table_identifier\n FROM [ file_location | ( SELECT expression_list FROM file_location)]\n FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n [ FILES = [file_name [,...] | PATTERN = ‘regex_pattern’ ]\n [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n\n```\nOne of the main benefits of COPY INTO is that users don’t have to worry about providing a\n\nschema, because the schema is automatically inferred from your data files. Here is a very\n\nsimple example of how you would ingest data from CSV files that have headers, where you\n\nleave the tool to infer the schema and the proper data types. 
It’s as simple as that.\n```\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n\n```\n**Using COPY INTO without an existing table** 2\n\n```\n CREATE TABLE my_delta_table (dummy string);\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n FORMAT_OPTIONS (\n ‘header’ = ‘true’ ,\n ‘inferSchema’ = ‘true’ ,\n ‘mergeSchema’ = ‘true’\n )\n COPY_OPTIONS ( ‘overwrite’ = ‘true’ , ‘overwriteSchema’ = ‘true’ )\n\n```\n**Ingesting a CSV file without headers**\n\nIf you are looking to ingest a CSV file that doesn’t have headers, columns will be named as\n\n_c0 or _c1, with the index of the column. You can use the double colon syntax to cast the\n\ndata type that you want and then alias these columns to whatever you want to call them.\n```\n COPY INTO my_delta_table\n FROM ( SELECT\n _c0::int as key,\n _c1::double value,\n _c2::timestamp event_time\n FROM ‘s3://my-bucket/path/to/csv_files’ )\n FILEFORMAT = CSV\n\n```\n\nIn the most common case, in order to use COPY INTO, a table definition is required.\n\nHowever, if you would like to get started quickly and don’t have an existing table or require\n\na specific schema, you can create your table with a dummy schema. Then, once you run\n\nCOPY INTO, you can overwrite the table and overwrite the schema. COPY INTO will actually\n\ninfer the data types, and then change your Delta table to have the required schema.\n\n2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n\n\n-----\n\n**Evolving schema over time for CSV files** 3\n\nWhen ingesting CSV files that have a different number of columns than your existing table,\n\nyou can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n\nas FORMAT_OPTIONS and COPY_OPTIONS. FORMAT_OPTIONS applies to the source data.\n\nOnce “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n\nfiles and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n\nwhen you’re running the COPY INTO command. When “mergeSchema” is provided as a\n\ncopy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n\nevolution only allows the addition of new columns. Data type changes for existing columns", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "4e8ea1d153355dc7b0defb3cc23841bd", + "2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n\n\n-----\n\n**Evolving schema over time for CSV files** 3\n\nWhen ingesting CSV files that have a different number of columns than your existing table,\n\nyou can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n\nas FORMAT_OPTIONS and COPY_OPTIONS. FORMAT_OPTIONS applies to the source data.\n\nOnce “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n\nfiles and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n\nwhen you’re running the COPY INTO command. When “mergeSchema” is provided as a\n\ncopy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n\nevolution only allows the addition of new columns. 
Data type changes for existing columns\n\nare not supported.\n```\n COPY INTO my_delta_table\n FROM (SELECT\n _C0::int as key,\n _C1::double value,\n _C2::timestamp event_time,\n ...\n FROM ‘s3://my-bucket/path/to/csv_files’ )\n FILEFORMAT = CSV\n FORMAT_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n COPY_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n\n```\n\n**Fixing bad data**\n\nIf you find that there is a mistake in the source data file and some of the data you ingested\n\nis bad, you can use RESTORE on your Delta table and set it to the timestamp or version of\n\nthe Delta table that you want to roll back to (e.g., to restore to yesterday’s data). Then you\n\ncan rerun your COPY INTO command.\n\nAlternatively, if running a RESTORE is not possible, COPY INTO supports reloading files by\n\nthe use of the “force” copy option. You can manually remove the old data from your Delta\n\nLake table by running a DELETE operation and then using COPY INTO with “force” = “true”.\n\nYou can use the PATTERN keyword to provide a file name pattern, or you can specify the file\n\nnames with the FILES keyword to reload a subset of files in conjunction with “force”.\n```\n RESTORE my_delta_table TO TIMESTAMP AS OF date_sub(current_date(),\n 1);\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n PATTERN = ‘2021-09-08*.csv’\n FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n COPY_OPTIONS ( ‘force’ = ‘true’ )\n\n```\n3 Limitation: schema evolution with “mergeSchema” in COPY_OPTIONS does not work in Databricks SQL workspace or\nclusters enabled with table ACLs.\n\n\n-----\n\n##### Auto Loader\n\n\nWhile COPY INTO can solve a lot of the key use cases our customers face, due to its\n\nlimitations (scalability), there are many scenarios where we recommend Auto Loader\n\nfor data ingestion. Auto Loader is a data source on Databricks that incrementally and\n\nefficiently processes new data files as they arrive in cloud storage with minimal DevOps\n\neffort. Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n\nAuto Loader is an incremental streaming source that provides exactly-once ingestion\n\nguarantees. It keeps track of which files have been ingested using a durable key-value store.\n\nIt can discover new files very efficiently and is extremely scalable. Auto Loader has been\n\nbattle tested. We have seen customers running Auto Loader on millions of files an hour, and\n\npetabytes of data per day.\n\nTo use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "468a232d6502fe03815a58da27ad126f", + "limitations (scalability), there are many scenarios where we recommend Auto Loader\n\nfor data ingestion. Auto Loader is a data source on Databricks that incrementally and\n\nefficiently processes new data files as they arrive in cloud storage with minimal DevOps\n\neffort. Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n\nAuto Loader is an incremental streaming source that provides exactly-once ingestion\n\nguarantees. It keeps track of which files have been ingested using a durable key-value store.\n\nIt can discover new files very efficiently and is extremely scalable. Auto Loader has been\n\nbattle tested. 
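Because Auto Loader records which files it has ingested in the stream's checkpoint, stopping and restarting the same query only picks up files that have not been processed yet. Below is a minimal sketch of that behavior, assuming a Databricks notebook where `spark` is predefined; all paths are hypothetical placeholders.

```
# Hypothetical paths; any cloud storage location reachable from the workspace works.
raw_data_location = "s3://my_bucket/raw_json/"
schema_location = "s3://my_bucket/_schemas/raw_json/"
checkpoint_location = "s3://my_bucket/_checkpoints/raw_json/"
target_location = "s3://my_bucket/bronze/raw_json/"

# Exactly-once, incremental ingestion: file-discovery state lives under the
# checkpoint, so re-running this cell later only processes newly arrived files.
(
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", schema_location)
    .load(raw_data_location)
    .writeStream
    .option("checkpointLocation", checkpoint_location)
    .trigger(availableNow=True)  # drain the current backlog, then stop
    .start(target_location)
)
```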
We have seen customers running Auto Loader on millions of files an hour, and\n\npetabytes of data per day.\n\nTo use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating\n\nthat you will use Auto Loader to load files from the cloud object stores. Next, you specify\n\nthe format of the file — for example, JSON — as an option to Auto Loader, and you specify\n\nwhere to load it from.\n```\n df = spark.readStream.format( “cloudFiles” )\n .option( “cloudfiles.format” , “json” )\n .load( “/path/to/table” )\n\n```\nUnder the hood, when data lands in your cloud storage, Auto Loader discovers files either\n\nthrough directory listing or file notifications. Given permissions to the underlying storage\n\nbucket or container, Auto Loader can list the directory that you want to load data from\n\nin an efficient and scalable manner and load data immediately. Alternatively, Auto Loader\n\ncan also automatically set up file notifications on your storage account, which allows it\n\n\nfrom queues, deduplicate these notifications using its key-value store and then process\n\nthe underlying files. If there are any failures, Auto Loader will replay what hasn’t been\n\nprocessed, giving you exactly-once semantics.\n\nDirectory listing mode is very easy to get started with. If your files are uploaded to your\n\ncloud storage system in a lexicographical order, Auto Loader will optimize the discovery of\n\nfiles by starting directory listing from the latest uploaded files, saving you both time and\n\nmoney. If files cannot be uploaded in a lexicographical order and you need Auto Loader\n\nto scale to high volumes, Databricks recommends using the file notification mode. Cloud\n\nservices such as AWS Kinesis Firehose, AWS DMS and Azure Data Factory can be configured\n\nto upload files in a lexical order, typically by providing the upload time of records in the file\n\npath, such as /base/path/yyyy/MM/dd/HH/file.format.\n\n**Common Use Cases for Auto Loader**\n\n**New to Auto Loader**\n\nAs a new user to the Databricks Lakehouse, you’ll want to ingest data from cloud object\n\nstores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n\nexample using Python to demonstrate the ease and flexibility of Auto Loader with a few\n\ndefined options. You can run the code in a notebook.\n```\n stream = spark.readStream \\\n .format( “cloudFiles” ) \\\n .option( “cloudFiles.format” , “csv” ) \\\n .option( “cloudFiles.schemaLocation” , schema_location) \\\n .load(raw_data_location)\n\n```\n\nto efficiently discover newly arriving files. When a file lands in file notification mode, the\n\ncloud storage system sends a notification to a queuing system. For example, in AWS, S3\n\nwill send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n\nOn Google, it’ll be sent to Pub/Sub. Auto Loader can then fetch these event notifications\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "03ec31bf5ade7d7b5e7d464b53e164b3", + "stores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n\nexample using Python to demonstrate the ease and flexibility of Auto Loader with a few\n\ndefined options. 
You can run the code in a notebook.\n```\n stream = spark.readStream \\\n .format( “cloudFiles” ) \\\n .option( “cloudFiles.format” , “csv” ) \\\n .option( “cloudFiles.schemaLocation” , schema_location) \\\n .load(raw_data_location)\n\n```\n\nto efficiently discover newly arriving files. When a file lands in file notification mode, the\n\ncloud storage system sends a notification to a queuing system. For example, in AWS, S3\n\nwill send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n\nOn Google, it’ll be sent to Pub/Sub. Auto Loader can then fetch these event notifications\n\n\n-----\n\nIn order to write to a Delta table from the stream, follow the example below:\n```\n stream.writeStream \\\n .option( “mergeSchema” , “true” ) \\\n .option( “checkpointLocation” , checkpoint_location) \\\n .start(target_delta_table_location)\n\n```\n**Migrating to Auto Loader**\n\nAs a Spark user, you may be using an existing Spark structured streaming to process data.\n\nTo migrate to Auto Loader, all a user needs to do is take existing streaming code and turn\n\ntwo lines of it into ‘cloudFiles’, specifying the file format within an option.\n\n\n**Migrating a livestreaming pipeline**\n\nMigrating a livestreaming pipeline can be challenging, but with Auto Loader, as with COPY\n\nINTO, you can specify a timestamp when the source files are updated or created and Auto\n\nLoader will ingest all modified data after that point.\n```\n df = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.format” , “json” )\n .option( “modifiedAfter” , “2021-09-09 00:00:00” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n**Schema inference and evolution**\n\nAuto Loader provides schema inference and management capabilities. With a schema\n\nlocation specified, Auto Loader can store the changes to the inferred schema over time. For\n\nfile formats like JSON and CSV, where the schemas can get fuzzy, schema inference on Auto\n\nLoader can automatically infer data types or treat everything as a string.\n\nWhen data does not match your schema (e.g., an unknown column or format), Auto Loader\n\nhas a data rescue capability that will “rescue” all data in a separate column, stored as a\n\nJSON string, to investigate later. See [rescued data column](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader-schema.html#rescued-data-column) for more details.\n\nAuto Loader supports three schema evolution modes: add new columns as they are\n\ndiscovered, fail if an unexpected column is seen, or rescue new columns.\n\n```\ndf = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.\nformat” , “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n```\ndf = spark.readStream\n .format( “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n\nOnce it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n\nLoader can scale to trillions of files, unlike the open-source file streaming source. One of\n\nthe ways that Auto Loader does this is with asynchronous backfills. 
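The three evolution modes map onto a single Auto Loader option. The sketch below shows one way to wire them up, assuming a Databricks notebook where `spark` is predefined; the paths are hypothetical placeholders, and with the rescue mode unexpected fields are routed into the rescued data column (named `_rescued_data` by default).

```
# Illustrative configuration of schema inference and evolution for Auto Loader.
# "addNewColumns" evolves the schema, "failOnNewColumns" stops the stream when
# an unexpected column appears, and "rescue" keeps the schema fixed and rescues
# unexpected fields instead.
df = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "s3://my_bucket/_schemas/events/")  # hypothetical path
    .option("cloudFiles.inferColumnTypes", "true")       # infer types rather than all-strings
    .option("cloudFiles.schemaEvolutionMode", "rescue")  # or "addNewColumns" / "failOnNewColumns"
    .load("s3://my_bucket/raw_events/")                  # hypothetical path
)
```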
Instead of needing", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "a7ce16b7acddcb62a9d5d9081ed85761", + "Auto Loader supports three schema evolution modes: add new columns as they are\n\ndiscovered, fail if an unexpected column is seen, or rescue new columns.\n\n```\ndf = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.\nformat” , “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n```\ndf = spark.readStream\n .format( “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n\nOnce it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n\nLoader can scale to trillions of files, unlike the open-source file streaming source. One of\n\nthe ways that Auto Loader does this is with asynchronous backfills. Instead of needing\n\nto discover files first, then plan, Auto Loader discovers and processes files concurrently,\n\nmaking it much more efficient and leading to cost reductions in compute resources.\n\n\n-----\n\n**Fixing a file that was processed with Auto Loader**\n\nTo fix a file that was already processed, Auto Loader supports an option called\n\n‘allowOverwrites’. With this option, Auto Loader can re-ingest and reprocess a file with a\n\nnew timestamp. If you want to enable this option in an existing Auto Loader stream, you\n\nneed to stop and restart the Auto Loader stream with the enabled option.\n```\n df = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.format” , “json” )\n .schema(schema)\n .option( “cloudFiles.allowOverwrites” , “true” )\n .options(format_options)\n .load( “/path/to/table” )\n\n```\n**Discover missing data**\n\nWhile event notification is a very scalable method to collect all data, it relies on cloud\n\nservices, which are distributed systems and are not always reliable. With Auto Loader, you\n\ncan additionally specify a backfill interval, where Auto Loader will perform asynchronous\n\nbackfills at whatever interval you set up. This can be enabled with a once trigger,\n\n```\n df = spark.readStream\n .format(“cloudFiles”)\n .option(“cloudFiles.format”, “json”)\n .schema(schema)\n .option( “cloudFiles.backfillInterval” , “1 week” )\n .options(format_options)\n .load(“/path/to/table”)\n .writeStream\n .trigger(Trigger.AvailableNow())\n .option(“checkpointLocation”, checkpointDir)\n .start()\n\n```\nThe trigger tells Auto Loader how frequently to process incoming data. A processing time\n\ntrigger will have Auto Loader run continuously and schedule micro-batches at the trigger\n\ninterval which you have set. The “Once” and “AvailableNow” triggers instruct Auto Loader to\n\nprocess all new data that has been added until the start of your application. Once the data\n\nis processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n\nprocess all the new data in a single micro-batch, which requires it to first discover all the\n\nnew files. With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n\nand perform rate limiting, which makes it a preferable alternative to Trigger Once.\n\n\nprocessing time trigger and available now trigger. 
The following example shows how to use\n\nbackfill interval and trigger availableNow together:\n\n\n-----\n\n**Using Auto Loader in SQL with Delta Live Tables**\n\nDelta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n\nframework to develop, test, monitor, manage and operationalize data pipelines at scale to\n\ndrive insights for data science, machine learning and analytics. Auto Loader is available in\n\nDelta Live Tables.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "416b7ff99129f37d49da101b799d54a1", + "process all new data that has been added until the start of your application. Once the data\n\nis processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n\nprocess all the new data in a single micro-batch, which requires it to first discover all the\n\nnew files. With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n\nand perform rate limiting, which makes it a preferable alternative to Trigger Once.\n\n\nprocessing time trigger and available now trigger. The following example shows how to use\n\nbackfill interval and trigger availableNow together:\n\n\n-----\n\n**Using Auto Loader in SQL with Delta Live Tables**\n\nDelta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n\nframework to develop, test, monitor, manage and operationalize data pipelines at scale to\n\ndrive insights for data science, machine learning and analytics. Auto Loader is available in\n\nDelta Live Tables.\n\n```\nCREATE INCREMENTAL LIVE TABLE\n autoloader_test\nAS\nSELECT\n *,\n id + id2 AS new_id\nFROM\n CLOUD_FILES (\n "some/cloud/path" , -- the path to the data\n "json" -- the file format\n );\n\n```\n\n**Live Tables understands**\n\n**and coordinates data flow**\n\n**between your queries**\n\n\n-----\n\n### Ingesting Data From External Applications\n\nWhile Auto Loader and COPY INTO are powerful tools, not all data is available as files\n\nin cloud object stores. In order to enable a lakehouse, it is critical to incorporate all of\n\nyour data and break down the silos between sources and downstream teams. To do this,\n\ncustomers need to discover and connect a broad set of data, BI and AI tools, and systems\n\nto the data within their lakehouse.\n\n##### Partner Connect\n\nHistorically, stitching multiple enterprise tools and data sources together has been a burden\n\non the end user, making it very complicated and expensive to execute at any scale. Partner\n\nConnect solves this challenge by making it easy for you to integrate data, analytics and AI\n\ntools directly within their Databricks Lakehouse. It also allows you to discover new, pre-\n\nvalidated solutions from Databricks partners that support your expanding analytics needs.\n\nTo ingest into the lakehouse, select the partner tile in Partner Connect via the left\n\nnavigation bar in Databricks. Partner Connect will automatically configure resources such\n\nas clusters, tokens and connection files for you to connect with your data ingestion tools\n\nof choice. You can finish signing up for a trial account on the partner’s website or directly\n\nlog in if you already used Partner Connect to create a trial account. 
Once you log in, you will\n\nsee that Databricks is already configured as a destination in the partner portal and ready\n\nto be used.\n\n\n-----\n\n**Common Use Case for Partner Connect**\n\n**Ingest Salesforce data via Fivetran into Delta Lake**\n\nClicking on the Fivetran tile in Partner Connect starts an automated workflow between\n\nthe two products. Databricks automatically provisions a SQL endpoint and associated\n\ncredentials for Fivetran to interact with, and passes the user’s identity and the SQL\n\n\nendpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n\nDatabricks destination is automatically created. This destination is configured to ingest into\n\nDelta via the SQL endpoint that was auto-configured by Partner Connect.\n\nThe customer now selects their choice of data source in Fivetran from hundreds of pre-\n\nbuilt connectors — for example, Salesforce. The user authenticates to the Salesforce\n\nsource, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n\n\n-----\n\n(in this case the Account & Contact objects) and starts the initial sync. This automation\n\nhas saved users dozens of manual steps and copying/pasting of configuration if they", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "aef326197a1a461981dba1e157a1928f", + "Clicking on the Fivetran tile in Partner Connect starts an automated workflow between\n\nthe two products. Databricks automatically provisions a SQL endpoint and associated\n\ncredentials for Fivetran to interact with, and passes the user’s identity and the SQL\n\n\nendpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n\nDatabricks destination is automatically created. This destination is configured to ingest into\n\nDelta via the SQL endpoint that was auto-configured by Partner Connect.\n\nThe customer now selects their choice of data source in Fivetran from hundreds of pre-\n\nbuilt connectors — for example, Salesforce. The user authenticates to the Salesforce\n\nsource, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n\n\n-----\n\n(in this case the Account & Contact objects) and starts the initial sync. This automation\n\nhas saved users dozens of manual steps and copying/pasting of configuration if they\n\nmanually set up the connection. It also protects the user from making any unintentional\n\nconfiguration errors and spending time debugging those errors. The Salesforce tables\n\nare now available to query, join and analyze in Databricks SQL. Watch the [demo](https://databricks.com/partnerconnect#partner-demos) for more\n\ndetails or check out the [Partner Connect guide](https://docs.databricks.com/integrations/partner-connect/index.html?_gl=1*1mz2ts6*_gcl_aw*R0NMLjE2MzY2NzU1NDcuQ2p3S0NBaUFtN09NQmhBUUVpd0FydkdpM0ZHS3ptZTR5Z2YzR3E4ajVrYTNaUExOUEFnaTZIMnNRU05EMC1RYzl0dGxXQjl6ajRuNU14b0N0OGdRQXZEX0J3RQ..&_ga=2.83627156.328510291.1641248936-1825366797.1612985070) to learn more.\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 5,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n\nunify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the\n\nglobe. 
Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a\n\nmission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , LinkedIn and Facebook .\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "43842dc949a714a1b1213f165eb853a8", + "-----\n\n# TABLE OF CONTENTS\n\n\n##### Welcome to Data, Analytics and AI ....... 02\n\n**Do you know what you’re getting into?** ............................................ **02**\n\n**How to use this book** \b��������������������������������������������������������������������������������������� **02**\n\n##### Business Value .......................................................................... 03\n\n**Talking to the business (feels like combat)** \b����������������������������� **03**\n\n**Creating Value Alignment** \b������������������������������������������������������������������ **03**\n\n**Goals and Outcomes** \b���������������������������������������������������������������������������� **04**\n\n##### Ultimate Class Build Guide .................................. 04\n\n**Creating a character** \b����������������������������������������������������������������������������� **04**\n\n- Data Engineers \b������������������������������������������������������������������������������������� **04**\n\n- Data Scientists \b������������������������������������������������������������������������������������� **05**\n\n- Data Analysts \b���������������������������������������������������������������������������������������� **05**\n\n##### Diving In ............................................................................................... 05\n\n**Producing game data** \b���������������������������������������������������������������������������� **05**\n\n**And receiving it in cloud** \b��������������������������������������������������������������������� **08**\n\n**Getting data from your game to the cloud** \b������������������������������ **08**\n\n##### The Value of Data Throughout the Game Developer Lifecycle ................................... 09\n\n**Lifecycle overview** \b���������������������������������������������������������������������������������� **09**\n\n**Use data to develop a next-generation**\n\n**customer experience** \b��������������������������������������������������������������������������� **09**\n\n##### Getting Started with Gaming Use Cases .............................................................. 10\n\n**Where do I start? 
Start with Game Analytics** \b������������������������� **10**\n\n**Understand your audience** \b���������������������������������������������������������������������������� **11**\n\n- Player Segmentation \b���������������������������������������������������������������������������� **11**\n\n- Player Lifetime Value \b��������������������������������������������������������������������������� **12**\n\n- Social Media Monitoring \b�������������������������������������������������������������������� **12**\n\n- Player Feedback Analysis \b����������������������������������������������������������������� **13**\n\n- Toxicity Detection \b��������������������������������������������������������������������������������� **13**\n\n**Find your audience** \b���������������������������������������������������������������������������������� **14**\n\n\n**Activating Your Playerbase** \b������������������������������������������������������������������������� **15**\n\n- Player Recommendations \b����������������������������������������������������������������� **15**\n\n- Next Best Offer/Action \b����������������������������������������������������������������������� **15**\n\n- Churn Prediction & Prevention \b������������������������������������������������������� **16**\n\n- Real-time Ad Targeting \b����������������������������������������������������������������������� **16**\n\n**Operational Use Cases** \b�������������������������������������������������������������������������� **17**\n\n- Anomaly Detection \b������������������������������������������������������������������������������ **17**\n\n- Build Pipeline \b������������������������������������������������������������������������������������������� **17**\n\n- Crash Analytics \b�������������������������������������������������������������������������������������� **18**\n\n##### Things to Look Forward To ..................................... 
19", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "814fb87a426da33b8f56cd59da2087ba", + "- Player Lifetime Value \b��������������������������������������������������������������������������� **12**\n\n- Social Media Monitoring \b�������������������������������������������������������������������� **12**\n\n- Player Feedback Analysis \b����������������������������������������������������������������� **13**\n\n- Toxicity Detection \b��������������������������������������������������������������������������������� **13**\n\n**Find your audience** \b���������������������������������������������������������������������������������� **14**\n\n\n**Activating Your Playerbase** \b������������������������������������������������������������������������� **15**\n\n- Player Recommendations \b����������������������������������������������������������������� **15**\n\n- Next Best Offer/Action \b����������������������������������������������������������������������� **15**\n\n- Churn Prediction & Prevention \b������������������������������������������������������� **16**\n\n- Real-time Ad Targeting \b����������������������������������������������������������������������� **16**\n\n**Operational Use Cases** \b�������������������������������������������������������������������������� **17**\n\n- Anomaly Detection \b������������������������������������������������������������������������������ **17**\n\n- Build Pipeline \b������������������������������������������������������������������������������������������� **17**\n\n- Crash Analytics \b�������������������������������������������������������������������������������������� **18**\n\n##### Things to Look Forward To ..................................... 19\n\n Appendix .............................................................................................. 21\n\n**Ultimate Class Build Guide** \b������������������������������������������������������������������ **21**\n\n- Creating a Character \b��������������������������������������������������������������������������� **21**\n\n- Data Engineers \b���������������������������������������������������������������������������� **21**\n\n- Data Scientists \b���������������������������������������������������������������������������� **21**\n\n- Data Analysts \b������������������������������������������������������������������������������ **22**\n\n**Data Access and the Major Cloud Providers** ................................ 
**23**\n\n- Cloud Rosetta Stone \b�������������������������������������������������������������������������� **23**\n\n- Jargon Glossary \b������������������������������������������������������������������������������������ **23**\n\n- Getting started with the major cloud providers \b������������������� **23**\n\n**Getting Started with Detailed Use Cases** \b���������������������������������� **25**\n\n- Game analytics \b������������������������������������������������������������������������������������� **25**\n\n- Player Segmentation \b�������������������������������������������������������������������������� **25**\n\n- Player Lifetime Value \b�������������������������������������������������������������������������� **26**\n\n- Social Media Monitoring \b������������������������������������������������������������������� **28**\n\n- Player Feedback Analysis \b���������������������������������������������������������������� **29**\n\n- Toxicity Detection \b������������������������������������������������������������������������������� **30**\n\n- Multi-Touch Attribution and Media Mix Modeling \b���������������� **31**\n\n- Player Recommendations \b���������������������������������������������������������������� **32**\n\n- Next Best Offer/Action \b���������������������������������������������������������������������� **33**\n\n- Churn Prediction & Prevention \b����������������������������������������������������� **34**\n\n- Real-time Ad Targeting \b���������������������������������������������������������������������� **35**\n\n**Getting Started with Operational Use Cases** \b�������������������������� **36**\n\n- Anomaly Detection \b����������������������������������������������������������������������������� **36**\n\n- Build Pipeline \b����������������������������������������������������������������������������������������������������� **37**\n\n- Crash Analytics \b������������������������������������������������������������������������������������� **39**\n\n\nMulti-Touch Attribution \b��������������������������������������������������������������������� **14**\n\n\n-----\n\n# Welcome to Data, Analytics, and AI\n\n\n### Do you know what you’re getting into?\n\nYou may have heard the stories of game studios spending\n\ncountless hours trying to more effectively acquire, engage,\n\nand retain players. Well, did you know that data, analytics,\n\nand AI plays a central role in the development and operation\n\nof today’s top-grossing video games? Studios globally\n\nstruggle with fragmented views of their audience, with data\n\noften outpacing legacy technologies. Today, the need for real-\n\ntime capabilities and the leap from descriptive to predictive\n\nanalytics has made it so that data, analytics, and AI are no\n\nlonger a “nice-to-have”, but table stakes for success.\n\nThe objective of this handbook is to guide you on the\n\nrole data, analytics, and AI plays in the development\n\nand operations of video games. We’ll cover who the key\n\nstakeholders are and how to align people across business", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "4aabe7bca7a25a31b9106e788373642b", + "# Welcome to Data, Analytics, and AI\n\n\n### Do you know what you’re getting into?\n\nYou may have heard the stories of game studios spending\n\ncountless hours trying to more effectively acquire, engage,\n\nand retain players. 
Well, did you know that data, analytics,\n\nand AI plays a central role in the development and operation\n\nof today’s top-grossing video games? Studios globally\n\nstruggle with fragmented views of their audience, with data\n\noften outpacing legacy technologies. Today, the need for real-\n\ntime capabilities and the leap from descriptive to predictive\n\nanalytics has made it so that data, analytics, and AI are no\n\nlonger a “nice-to-have”, but table stakes for success.\n\nThe objective of this handbook is to guide you on the\n\nrole data, analytics, and AI plays in the development\n\nand operations of video games. We’ll cover who the key\n\nstakeholders are and how to align people across business\n\nunits. Then we’ll talk through strategies to help you\n\nsuccessfully advocate for data, analytics, and AI projects\n\ninternally. Finally, we dive deep through the most common\n\nuse cases. We want to give you enough information to feel\n\n\nwell as helpful tips when operating as or working with one of\n\nthese classes.\n\nWe follow this with the fundamentals for building a Proof\n\nof Concept (POC) or Minimum Viable Product (MVP). That\n\nis, connecting to the cloud; accessing your data; and\n\nmost importantly, being able to represent the value you’re\n\nseeking to unlock as you sell your project into your team and\n\nbroader organization.\n\nFinally, we’ll dive into the most common use cases for data,\n\nanalytics, and AI within game development. Similar to a tech-\n\ntree in a video game, we begin with the most basic use cases\n\n- setting up your game analytics. Then we progress through\n\nmore advanced data use cases such as player segmentation,\n\nassessing lifetime value, detecting and mitigating toxicity,\n\nmulti-touch attribution, recommendation engines, player\n\nchurn prediction and prevention, and more.\n\nDon’t forget to review the Appendix. You’ll find a handy\n\n“ Jargon Glossary ”, “ Cloud Rosetta Stone ”, and “ get started\n\nguide for the three major cloud providers ”. All incredibly\n\nhelpful assets to keep as hotkeys.\n\n\nempowered to make a demonstrable impact. Just by reading\n\nthis you are adding incredible insight and value to yourself as\n\n\nan industry professional. Quest on!\n\n### How to use this book\n\nThis book is primarily intended for technical professionals\n\nwho are engaging with data within game studios. 
No\n\nmatter your role in the gaming industry, you will be able to\n\nglean key takeaways that will make you more effective in\n\nyour individual role and within the larger team — be that\n\nproduction, art, engineering, marketing, or otherwise.\n\nBegin your journey by reviewing the “ **Data, Analytics, and AI**\n\n**Ground Rules** ” section to the right, which presents some This\n\nsection presents some rules and guidelines for interpreting\n\nthe role that data plays in the game development lifecycle.\n\nNext, it’s time to learn about the key professions (aka\n\ncharacter classes) that interact and engage with data,\n\nanalytics, and AI on a consistent basis within a game studio.\n\nThis section breaks down each of the classes, providing an\n\n\n**Data, Analytics, and AI Ground Rules**\n\nThis guide assumes you understand the following:\n\n- You understand the basics of data, analytics, and AI:\n\nHow and why data is stored in a system, why data\n\nis transformed, the different types of output that\n\ndata can feed into — such as a report, an analysis\n\nanswering a question, or a machine learning model.\n\nIf this is the first time you’re creating a character,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "63873e31dde7e8f550e5f42350b7b441", + "Begin your journey by reviewing the “ **Data, Analytics, and AI**\n\n**Ground Rules** ” section to the right, which presents some This\n\nsection presents some rules and guidelines for interpreting\n\nthe role that data plays in the game development lifecycle.\n\nNext, it’s time to learn about the key professions (aka\n\ncharacter classes) that interact and engage with data,\n\nanalytics, and AI on a consistent basis within a game studio.\n\nThis section breaks down each of the classes, providing an\n\n\n**Data, Analytics, and AI Ground Rules**\n\nThis guide assumes you understand the following:\n\n- You understand the basics of data, analytics, and AI:\n\nHow and why data is stored in a system, why data\n\nis transformed, the different types of output that\n\ndata can feed into — such as a report, an analysis\n\nanswering a question, or a machine learning model.\n\nIf this is the first time you’re creating a character,\n\nwe highly recommend reviewing our data, analytics,\n\nand AI tutorial — aka getting started training and\n\ndocumentation, available at [dbricks.co/training](https://www.databricks.com/learn/training/home)\n\n- You have a basic understanding of cloud\n\ninfrastructure. Specifically what it is, who are the\n\nkey players, and associated terms (e.g., virtual\n\nmachines, APIs, applications)\n\n- You are generally aware of the game development\n\nlifecycle; pre-production, production, testing/QA,\n\nlaunch, operation\n\n\noverview of each character’s strengths and weaknesses as\n\n\n-----\n\n# Business Value\n\n\nDemonstrating business value is important when working\n\non data, analytics, and AI projects because it helps ensure\n\nthat the efforts of the project are aligned with the goals\n\nand objectives of the business. 
By showing how the project\n\ncan positively impact a game’s key performance indicators\n\n(KPIs) and bottom-line metrics, such as game revenue, player\n\nsatisfaction, and operational efficiency, studio stakeholders\n\nare more likely to support and invest in the project.\n\nAdditionally, demonstrating business value can help justify\n\nthe resources, time, and money that are required to execute\n\nthe project, and can also help prioritize which projects should\n\nbe pursued. By focusing on business value, data, analytics,\n\nand AI projects can become strategic initiatives that\n\ncontribute to the long-term success of your game studio.\n\n### Talking to the business (feels like combat)\n\nWhile we highly encourage everyone to read this section,\n\nyou may already feel confident understanding the needs and\n\nconcerns of your internal stakeholders, and how to sell-in a\n\nproject successfully. If so, feel free to skip this section.\n\nWe would love to dive into the data to explore and discover\n\nas much as possible, unfortunately in most environments,\n\nwe are limited by resources and time. Understanding both\n\nthe businesses pain points and strategic goals is crucial to\n\nchoosing projects that will benefit the business, create value\n\nand make your message much easier to sell.\n\nWhenever we embark on a proof-of-concept (PoC) or\n\nminimum viable product (MVP) — to prove out a new\n\n**Questions to ask:**\n\n- What other strategic goals and pain points can\n\nyou list out and how would you prioritize them as\n\na business leader?\n\n- Does your prioritization match how your team,\n\nmanager and/or leadership would prioritize?\n\nTypically the closer the match, the easier initial\n\nprojects will be to “sell”.\n\n\nmethodology or technology — we will need to pitch it back\n\nfor adoption. The technology could be revolutionary and\n\nabsolutely amazing, but without the value proposition and tie\n\nback to goals, it is likely to land flat or fail to be adopted.\n\nIt is key to talk to your stakeholders to understand their", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "772c9eca2e71c36590bdc92e8d2d54ff", + "the businesses pain points and strategic goals is crucial to\n\nchoosing projects that will benefit the business, create value\n\nand make your message much easier to sell.\n\nWhenever we embark on a proof-of-concept (PoC) or\n\nminimum viable product (MVP) — to prove out a new\n\n**Questions to ask:**\n\n- What other strategic goals and pain points can\n\nyou list out and how would you prioritize them as\n\na business leader?\n\n- Does your prioritization match how your team,\n\nmanager and/or leadership would prioritize?\n\nTypically the closer the match, the easier initial\n\nprojects will be to “sell”.\n\n\nmethodology or technology — we will need to pitch it back\n\nfor adoption. The technology could be revolutionary and\n\nabsolutely amazing, but without the value proposition and tie\n\nback to goals, it is likely to land flat or fail to be adopted.\n\nIt is key to talk to your stakeholders to understand their\n\nperception of pain points and positions on potential projects\n\nto add value. 
Much like stopping at the Tavern when the\n\nadventuring party gets to town, these can be informal\n\nconversations where you socialize potential solutions while\n\ngathering information about what matters.\n\n### Creating value alignment\n\nSo what are your strategic goals and pain points and how\n\nmight they be addressed through a use case from a PoC or\n\nMVP leveraging your data?\n\nA few examples of strategic goals that are top of mind for our\n\ncustomers at the beginning of any fiscal or calendar year:\n\n- Reduce costs\n\n- Simplify your infrastructure\n\n- Acquire more players\n\n- Monetize your playerbase\n\n- Retain your players (aka prevent churn)\n\nHere are four ways the Databricks Lakehouse can provide\n\nvalue that aligns with your strategic goals and pain points:\n\n`1.` **\u0007Improved collaboration:** Databricks platform allows\n\neveryone to share and collaborate on data, notebooks and\n\nmodels between data scientists, engineers and business\n\nusers. This enables for a more efficient and streamlined\n\nprocess for data analysis and decision making.\n\n`2.` **Find and explore your data:** The data in the Lakehouse is\n\ncataloged and accessible, which enables business users\n\nto explore and query the data easily and discover insights\n\nby themselves.\n\n`3.` **\u0007Uncover actionable business insights:** By putting\n\nyour game’s data into a Lakehouse architecture, it\n\ncan be better analyzed using various tools provided\n\nby Databricks such as SQL, dashboards, notebooks,\n\nvisualization and machine learning to better understand\n\nyour playerbase, providing valuable insights into player\n\nbehavior and performance. These insights can help the\n\n\n-----\n\nand retention, and use that information to improve the\n\ngame and grow monetization.\n\n`4.` **\u0007Lead with data-driven decisions:** A Lakehouse\n\narchitecture provides a single source of truth for your\n\norganization’s data. Data engineers write once, data\n\nanalysts interpret the data, and data scientists can run\n\nmachine machine learning models on the same data.\n\n_This cannot be understated in the value this provides an_\n\n_organization from a total cost of ownership perspective._\n\nWith the ability to access and analyze all the data in one\n\nplace, the business can make unified data-driven decisions,\n\nrather than relying on intuition or fragmented data.\n\n### Goals and outcomes\n\nLike many projects, starting with a strong foundation of ‘what\n\nsuccess looks like’ will significantly improve your likelihood\n\nof achieving your objectives. Here are a few best-practices\n\nwe recommend:\n\n`1.` **Set goals:** Define your hypothesis, then use your data\n\nand process to prove or disprove your hypothesis. You", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "3f363efc9a09828918cfc25136a598b3", + "`4.` **\u0007Lead with data-driven decisions:** A Lakehouse\n\narchitecture provides a single source of truth for your\n\norganization’s data. 
Data engineers write once, data\n\nanalysts interpret the data, and data scientists can run\n\nmachine machine learning models on the same data.\n\n_This cannot be understated in the value this provides an_\n\n_organization from a total cost of ownership perspective._\n\nWith the ability to access and analyze all the data in one\n\nplace, the business can make unified data-driven decisions,\n\nrather than relying on intuition or fragmented data.\n\n### Goals and outcomes\n\nLike many projects, starting with a strong foundation of ‘what\n\nsuccess looks like’ will significantly improve your likelihood\n\nof achieving your objectives. Here are a few best-practices\n\nwe recommend:\n\n`1.` **Set goals:** Define your hypothesis, then use your data\n\nand process to prove or disprove your hypothesis. You\n\nhave a goal in mind, make it part of the experiment. If\n\nthe outcome differs from the expectation, that is part of\n\nexperiments and we can learn from it to improve the next\n\nexperiment. This is all about shortening the feedback loop\n\n\nproject appropriately. For example, are you doing this as\n\na side project? Do you have 2 sprints to show progress?\n\nIt’s important to scope your project based on the time,\n\nresources, and quality needed for the said project to be a\n\nsuccess.\n\n`3.` **Scope down:** Ruthlessly control scope for any PoC or\n\nMVP. Prioritization is your best friend. Stakeholders and\n\nyour own internal team will naturally want to increase\n\nscope because there’s no shortage of good ideas. But by\n\ncontrolling scope, you improve your chances of shipping\n\non time and on budget. Don’t let perfection be the enemy\n\nof good. There are always exceptions to this, but that is\n\nwhat the next sprint is for.\n\n`4.` **Deliver on time:** Recovering lost goodwill is incredibly\n\ndifficult - strive to always deliver on time. Make sure your\n\ngoals, constraints and scope creep will not explode your\n\ntimeline as creating tight feedback loops and iteration\n\ncycles is what will make you more agile than the competition.\n\n`5.` **Socialize early, and often:** Show quantifiable value as\n\nquickly as possible, both to your immediate team and\n\nbusiness stakeholders. Measure the value as frequently\n\nas makes sense, and socialize early and often to promote\n\nvisibility of the project and ensure tight alignment across\n\nteams. This will empower you to create tighter feedback\n\nloops that will help improve any future iterations of your\n\nproduct, platform, or technology.\n\n\nbetween insight and action.\n\n# Ultimate Class Build Guide\n\n\n### Creating a character\n\nHave you rolled your character already? Data engineers, data\n\nscientists, and data analysts form the heart of mature game\n\ndata teams. Though, depending on studio size and resources,\n\n\nmaking sense of large amounts of data. Depending on the size\n\nof the organization, individuals may be required to multiclass\n\nin order to address needs of the team. In smaller studios, it’s\n\noften developers who wear multiple hats, including those in data\n\nengineering, analytics and data science. Key characters include:\n\n\ngame developers may also be pulled in from time to time to\n\n\nperform data engineering and or data science tasks. Though for\n\nthe sake of this guide, we’ll keep focus on roles of data engineers,\n\ndata scientists, and data analysts. 
There are many aspects to\n\nthese roles, but they can be summarized in that Data Engineers\n\ncreate and maintain critical data workflows, Data Analysts\n\ninterpret data and create reports that keep the business teams\n\nrunning seamlessly, and Data Scientists are responsible for\n\n\n**Data Engineers**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "e22b1fb61a6fa9c2003b7a8ef3f86a82", + "scientists, and data analysts form the heart of mature game\n\ndata teams. Though, depending on studio size and resources,\n\n\nmaking sense of large amounts of data. Depending on the size\n\nof the organization, individuals may be required to multiclass\n\nin order to address needs of the team. In smaller studios, it’s\n\noften developers who wear multiple hats, including those in data\n\nengineering, analytics and data science. Key characters include:\n\n\ngame developers may also be pulled in from time to time to\n\n\nperform data engineering and or data science tasks. Though for\n\nthe sake of this guide, we’ll keep focus on roles of data engineers,\n\ndata scientists, and data analysts. There are many aspects to\n\nthese roles, but they can be summarized in that Data Engineers\n\ncreate and maintain critical data workflows, Data Analysts\n\ninterpret data and create reports that keep the business teams\n\nrunning seamlessly, and Data Scientists are responsible for\n\n\n**Data Engineers**\n\nData engineers build systems that collect, manage, and\n\nconvert source data into usable information for data\n\nscientists and business analysts to interpret. Their ultimate\n\ngoal is to make data accessible so that teams can use it to\n\nevaluate and optimize a goal or objective.\n\n\n-----\n\nData scientists determine the questions their team should\n\nbe asking and figure out how to answer those questions\n\nusing data. They often develop predictive models for\n\ntheorizing and forecasting.\n\n**Data Analysts**\n\n\nto report on the health of a title or building a recommendation\n\nengine for your players, this guide will help you better\n\nunderstand the unique classes required to develop and\n\nmaintain an effective data, analytics, and AI platform.\n\n**Learn more about these character classes**\n\n\nA data analyst reviews data to identify key insights into a\n\ngame studio’s customers and ways the data can be used to\n\nsolve problems.\n\n# Diving In\n\n\nBefore we get to the primary use cases of game data,\n\nanalytics, and AI, we need to cover some basics. That is, the\n\ndifferent types of game data and how they are produced.\n\nAnd the subsequent receiving of that data in the cloud to\n\n\n### Producing game data…\n\nSpeaking in generalities, there are four buckets of data as it\n\nrelates to your video game.\n\n\ncollect, clean, and prepare for analysis.\n\n**1. Game Telemetry**\n\nGame telemetry refers to the data collected about player\n\nbehavior and interactions within a video game. The primary\n\ndata source is the game engine. 
And the goal of game\n\ntelemetry is to gather information that can help game\n\ndevelopers understand player behavior and improve the\n\noverall game experience.\n\nSome of the primary metrics that are typically tracked in\n\ngame telemetry include:\n\n- **Player engagement:** Track the amount of time players\n\nspend playing the game, and their level of engagement\n\nwith different parts of the game.\n\n- **Game progress:** Monitor player progress through\n\ndifferent levels and milestones in the game.\n\n- **In-game purchases:** Track the number and value of\n\nin-game purchases made by players.\n\n- **Player demographics:** Collect demographic information\n\nabout players, such as age, gender, location, and device type.\n\n- **Session length:** Monitor the length of each player session,\n\nand how often players return to the game.\n\n- **Retention:** Track the percentage of players who return to\n\nthe game after their first session.\n\n\n-----\n\nsuch as the types of actions taken, the number of deaths,\n\nand the use of power-ups.\n\n- **User Acquisition:** Track the number of new players\n\nacquired through different marketing channels.\n\n**2. Business KPIs**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "19649d3d466f3b25c73f87e08cb38e3b", + "with different parts of the game.\n\n- **Game progress:** Monitor player progress through\n\ndifferent levels and milestones in the game.\n\n- **In-game purchases:** Track the number and value of\n\nin-game purchases made by players.\n\n- **Player demographics:** Collect demographic information\n\nabout players, such as age, gender, location, and device type.\n\n- **Session length:** Monitor the length of each player session,\n\nand how often players return to the game.\n\n- **Retention:** Track the percentage of players who return to\n\nthe game after their first session.\n\n\n-----\n\nsuch as the types of actions taken, the number of deaths,\n\nand the use of power-ups.\n\n- **User Acquisition:** Track the number of new players\n\nacquired through different marketing channels.\n\n**2. Business KPIs**\n\nThe second bucket of data is business key performance\n\nindicators (or KPIs). Business KPIs are metrics that measure\n\nthe performance and success of a video game from a\n\nbusiness perspective. 
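To make the telemetry metrics above concrete, here is a minimal Python sketch that derives session length and day-1 retention from raw events. The event shape and field names (`player_id`, `session_id`, `session_start`/`session_end`) are illustrative assumptions, not a schema from this guide.

```python
from datetime import datetime, timedelta

# Illustrative telemetry events; field names are assumptions, not a real schema.
events = [
    {"player_id": "p1", "session_id": "s1", "event": "session_start", "ts": datetime(2024, 1, 1, 10, 0)},
    {"player_id": "p1", "session_id": "s1", "event": "session_end",   "ts": datetime(2024, 1, 1, 10, 42)},
    {"player_id": "p1", "session_id": "s2", "event": "session_start", "ts": datetime(2024, 1, 2, 9, 0)},
    {"player_id": "p2", "session_id": "s3", "event": "session_start", "ts": datetime(2024, 1, 1, 11, 0)},
]

# Session length: pair start/end events per session.
starts = {e["session_id"]: e["ts"] for e in events if e["event"] == "session_start"}
ends = {e["session_id"]: e["ts"] for e in events if e["event"] == "session_end"}
session_minutes = {sid: (ends[sid] - ts).total_seconds() / 60 for sid, ts in starts.items() if sid in ends}

# Day-1 retention: share of players whose first day is followed by activity the next day.
first_seen = {}
for e in sorted(events, key=lambda e: e["ts"]):
    first_seen.setdefault(e["player_id"], e["ts"].date())
returned = {
    p for p in first_seen
    if any(e["player_id"] == p and e["ts"].date() == first_seen[p] + timedelta(days=1) for e in events)
}
print(session_minutes, len(returned) / len(first_seen))
```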
The primary data source for business\n\nKPIs include game telemetry, stores, and marketplaces.\n\nThese KPIs help game studios understand the financial and\n\noperational performance of their games and make informed\n\ndecisions about future development and growth.\n\nSome of the primary business metrics that are typically\n\ntracked include:\n\n- **Revenue:** Track the total revenue generated by the game,\n\nincluding sales of the game itself, in-game purchases,\n\nand advertising.\n\n- **Player Acquisition Cost (CAC):** Calculate the cost\n\nof acquiring a new player, including marketing and\n\nadvertising expenses.\n\n- **Lifetime Value (LTV):** Estimate the amount of revenue a\n\nplayer will generate over the course of their time playing\n\nthe game.\n\n- **Player Retention:** Track the percentage of players who\n\ncontinue to play the game over time, and how long they\n\nplay for.\n\n- **Engagement:** Measure the level of engagement of players\n\nwith the game, such as the number of sessions played,\n\ntime spent playing, and in-game actions taken.\n\n- **User Acquisition:** Track the number of new players\n\nacquired through different marketing channels and the\n\ncost of acquiring each player.\n\n- **Conversion Rate:** Measure the percentage of players who\n\nmake an in-game purchase or complete a specific action.\n\n- **Gross Margin:** Calculate the profit generated by the game\n\nafter subtracting the cost of goods sold, such as the cost\n\nof game development and server hosting.\n\n**3. Game Services**\n\nSimilar to game telemetry, game services provide critical\n\ninfrastructure that requires careful monitoring and management.\n\nThese services include things like game server hosting,\n\n\nand more. Here the source of data is the game services used.\n\nSome of the common metrics game teams typically track for\n\nthese services include:\n\n- **Concurrent Players:** Track the number of players who are\n\nsimultaneously connected to the game servers to ensure\n\nthat the servers have enough capacity to handle the\n\nplayer demand.\n\n- **Server Availability:** Monitor the uptime and downtime of\n\nthe game servers to ensure that players have access to\n\nthe game when they want to play, particularly important\n\nfor global live service games where demand fluctuates\n\nthrought the day.\n\n- **Latency:** Measure the time it takes for data to travel\n\nfrom the player’s device to the game server and back,\n\nto ensure that players have a smooth and responsive\n\ngaming experience.\n\n- **Network Bandwidth:** Monitor the amount of data being\n\ntransmitted between the player’s device and the game", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "7f3f865b56025cd59d2e65a95d3fbe29", + "and more. 
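A small worked example of the business KPIs listed above, using made-up figures; the formulas simply follow the metric descriptions (CAC = spend / players acquired, LTV = ARPU x average lifespan, and so on).

```python
# Illustrative KPI arithmetic for the business metrics listed above; all figures are made up.
marketing_spend = 50_000.0      # total spend on a campaign
new_players = 10_000            # players acquired by that campaign
revenue = 120_000.0             # total revenue over the period
cost_of_goods = 35_000.0        # dev amortization, server hosting, store fees, ...
paying_players = 800
arpu_per_year = 50.0
avg_lifespan_years = 2.0

cac = marketing_spend / new_players               # Player Acquisition Cost
ltv = arpu_per_year * avg_lifespan_years          # Lifetime Value (ARPU x lifespan)
conversion_rate = paying_players / new_players    # share of players who purchase
gross_margin = (revenue - cost_of_goods) / revenue

print(f"CAC=${cac:.2f}  LTV=${ltv:.2f}  conversion={conversion_rate:.1%}  gross margin={gross_margin:.1%}")
```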
Here the source of data is the game services used.\n\nSome of the common metrics game teams typically track for\n\nthese services include:\n\n- **Concurrent Players:** Track the number of players who are\n\nsimultaneously connected to the game servers to ensure\n\nthat the servers have enough capacity to handle the\n\nplayer demand.\n\n- **Server Availability:** Monitor the uptime and downtime of\n\nthe game servers to ensure that players have access to\n\nthe game when they want to play, particularly important\n\nfor global live service games where demand fluctuates\n\nthrought the day.\n\n- **Latency:** Measure the time it takes for data to travel\n\nfrom the player’s device to the game server and back,\n\nto ensure that players have a smooth and responsive\n\ngaming experience.\n\n- **Network Bandwidth:** Monitor the amount of data being\n\ntransmitted between the player’s device and the game\n\nserver to ensure that players have a high-quality gaming\n\nexperience, even on slow internet connections.\n\n- **Live Operations:** Monitor the success of in-game events,\n\npromotions, and other live operations to understand what\n\nresonates with players and what doesn’t.\n\n- **Player Feedback:** Monitor player feedback and reviews,\n\nincluding ratings and comments on social media, forums,\n\nand app stores, to understand what players like and dislike\n\nabout the game.\n\n- **Chat Activity:** Track the number of messages and\n\ninteractions between players in the game’s chat channels\n\nto understand the level of social engagement and\n\ncommunity building in the game.\n\n**4. Data beyond the game**\n\nThe last bucket comes from data sources beyond the video\n\ngame. These typically include the following:\n\n- **Social Media Data:** Social media platforms, such as\n\nFacebook, Twitter, TikTok and Instagram, can provide\n\nvaluable insights into player behavior, feedback and\n\npreferences, as well as help game teams understand\n\nhow players are talking about their games online with\n\ndifferent communities.\n\n- **Forum Data:** Online forums and discussion boards, such\n\nas Reddit and Discord, can be rich sources of player\n\nfeedback and opinions about the game.\n\n\n-----\n\n#### The secret to success is bringing all of the disparate data sources\n together, so you have as complete a 360-degree view as possible of\n what’s happening in and around your game.\n\n\n\n- **Player Reviews:** Ratings and reviews on app stores, such\n\nas Steam, Epic, Google Play and the Apple App Store, can\n\nprovide valuable feedback on player experiences and help\n\ngame teams identify areas for improvement.\n\n- **Third-Party Data:** Third-party data sources, such as\n\nmarket research firms and industry data providers, can\n\nprovide valuable insights into broader gaming trends and\n\nhelp game teams make informed decisions about their\n\ngames and marketing strategies.\n\nThis is a lot of data. And it’s no wonder that studios globally\n\nstruggle with fragmented views of their audience, with data\n\noften outpacing legacy technologies. Today, the need for real-\n\ntime capabilities and the leap from descriptive to predictive\n\nanalytics has made it so that data, analytics, and AI are now\n\ntable stakes for a game to be successful. 
Tapping into these\n\nfour buckets of data sources, you’ll find actionable insights that\n\ndrive better understanding of your playerbase, more efficient\n\nacquisition, stronger and longer lasting engagement, and\n\nmonetization that deepens the relationship with your players.\n\nThat’s what we’re going to dig into throughout the rest of\n\nthis book.\n\n**Let’s begin with how to get data out of your game!**\n\nThere are a variety of ways to get data out of the game and", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "75c893ec37e406d416e6c57926fac08f", + "help game teams make informed decisions about their\n\ngames and marketing strategies.\n\nThis is a lot of data. And it’s no wonder that studios globally\n\nstruggle with fragmented views of their audience, with data\n\noften outpacing legacy technologies. Today, the need for real-\n\ntime capabilities and the leap from descriptive to predictive\n\nanalytics has made it so that data, analytics, and AI are now\n\ntable stakes for a game to be successful. Tapping into these\n\nfour buckets of data sources, you’ll find actionable insights that\n\ndrive better understanding of your playerbase, more efficient\n\nacquisition, stronger and longer lasting engagement, and\n\nmonetization that deepens the relationship with your players.\n\nThat’s what we’re going to dig into throughout the rest of\n\nthis book.\n\n**Let’s begin with how to get data out of your game!**\n\nThere are a variety of ways to get data out of the game and\n\ninto cloud resources. In this section, we will provide resources\n\nfor producing data streams in Unity and Unreal. In addition,\n\nwe will also provide a generic approach that will work for any\n\ngame engine, as long as you are able to send HTTP requests.\n\n**Unity**\n\nSince Unity supports C#, you would use a .NET SDK from the\n\ncloud provider of your choice. All three major cloud providers\n\n\n[using AWS](https://www.youtube.com/watch?v=yv4ynyCytdU) is provided here.\n\n- **AWS:** [AWS .NET SDK - Unity considerations](https://docs.aws.amazon.com/sdk-for-net/v3/developer-guide/unity-special.html)\n\n- **GCP:** [GCP .NET SDK Documentation](https://cloud.google.com/dotnet/docs/reference)\n\n- **Azure:** [Azure .NET SDK Overview](https://learn.microsoft.com/en-us/dotnet/azure/sdk/azure-sdk-for-dotnet)\n\n- **Kafka (Open-source alternative):** [Kafka .NET connector](https://github.com/confluentinc/confluent-kafka-dotnet)\n\nFrom here, the SDK is used to send data to a messaging\n\nservice. 
These messaging services will be covered in more\n\ndetail in the next section.\n\n**Unreal Engine**\n\nUnreal supports development with C++, so you could use\n\nC++ SDKs or Blueprint interfaces to those SDKs.\n\nThe resources for each SDK are provided here\n\n- **AWS:** [How to integrate AWS C++ SDK with Unreal Engine](https://aws.amazon.com/blogs/gametech/how-to-integrate-the-aws-c-sdk-with-unreal-engine/)\n\n- **Azure:** [Azure C++ SDK with PlayFab](https://learn.microsoft.com/en-us/gaming/playfab/sdks/unreal/)\n\n- **Kafka (Open-source alternative):** [Getting started with](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)\n\n[Kafka and C++](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)\n\nJust like with the Unity example above, from here the data is\n\nsent to a messaging streaming service.\n\nOther engines may not support C++ or C#, but there is still a\n\nway to get your data into the cloud, no matter the language!\n\nBy hitting an API Gateway with a HTTP POST request, you are", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "a82eedb5cafbf946f8ccdaaae71556a8", + "- **Azure:** [Azure C++ SDK with PlayFab](https://learn.microsoft.com/en-us/gaming/playfab/sdks/unreal/)\n\n- **Kafka (Open-source alternative):** [Getting started with](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)\n\n[Kafka and C++](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)\n\nJust like with the Unity example above, from here the data is\n\nsent to a messaging streaming service.\n\nOther engines may not support C++ or C#, but there is still a\n\nway to get your data into the cloud, no matter the language!\n\nBy hitting an API Gateway with a HTTP POST request, you are\n\nable to send data to cloud services from many more types of\n\napplications. A sample high level architecture of this solution\n\nin AWS and Azure can be seen below:\n\n**AWS:**\n\n\nhave .NET SDKs to use and I have linked the documentation\n\n\n**Azure:**\n\n\nfor each below.\n\nNo matter the cloud provider, if you want to use a SDK you\n\ninstall it through the NuGet package manager into your Unity\n\nproject. [A walkthrough of how to implement the .NET SDK](https://www.youtube.com/watch?v=yv4ynyCytdU)\n\n\n-----\n\nOnce the data has been sent from the game into an event-\n\nstreaming service, how do we get that data to a more\n\npermanent home? Here we will start by outlining what these\n\nmessaging services do and how we can use them to point\n\nour data to a desired location.\n\nMessaging services ingest real-time event data, being\n\nstreamed to them from a number of different sources,\n\nand then send them to their appropriate target locations.\n\nThese target locations can be databases, compute clusters\n\nor cloud object stores. 
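As a sketch of the engine-agnostic approach described above, the snippet below POSTs a JSON telemetry event to an API Gateway endpoint using only the Python standard library. The endpoint URL and payload fields are placeholders, so calling it as-is will not reach a real deployment.

```python
import json
import urllib.request

# Hypothetical API Gateway endpoint; replace with your own deployment's URL.
ENDPOINT = "https://example.execute-api.us-east-1.amazonaws.com/prod/telemetry"

def send_event(payload: dict) -> int:
    """POST a single telemetry event as JSON and return the HTTP status code."""
    req = urllib.request.Request(
        ENDPOINT,
        data=json.dumps(payload).encode("utf-8"),
        headers={"Content-Type": "application/json"},
        method="POST",
    )
    with urllib.request.urlopen(req, timeout=5) as resp:
        return resp.status

# Example event; field names are illustrative.
status = send_event({"player_id": "p1", "event": "level_complete", "level": 3})
print(status)
```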
A key property of the messaging\n\nservices is to preserve the time in which the events arrive, so\n\nthat it is always known the order that events occurred.\n\n\n\n- Data is stored in object storage such as S3, Azure Storage\n\nor GCP Buckets using Delta Lake.\n\n- Delta Lake is an open-source storage framework that makes\n\nit easy to maintain data consistency and track changes.\n\n**Data Governance & Cataloging:**\n\n- Unity Catalog in Databricks provides tools for data\n\ngovernance that helps with compliance and controlling\n\naccess to data in the lake.\n\n- Unity Catalog also allows to track data lineage, auditing and\n\ndata discovery with the use of data catalogs and governance.\n\n- Metadata about the data including the structure, format,\n\nand location of the data can be stored in a data catalog.\n\n\nExamples of cloud messaging services include AWS Kinesis\n\n\nFirehose, Google PubSub, and Azure Event Hubs Messaging.\n\nIf you prefer to use open-source products, Apache Kafka is a\n\nvery popular open-source alternative.\n\n### Getting data from your game to the cloud\n\nMoving to the cloud platform part of the journey involves\n\nbuilding a gaming Lakehouse. The gaming Lakehouse allows\n\ngaming companies to store, manage, and analyze large volumes\n\nof gaming data, such as player behavior, performance metrics,\n\nand financial transactions, to gain valuable insights and make\n\ndata-driven decisions to improve their business outcomes.\n\n**Next here are the basics of the Databricks**\n\n**platform simplified.**\n\n**Data Ingestion:**\n\n- Data can be ingested into the Gaming Lakehouse using", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "9d5c34638836cb6cdd89bf8cecacbac5", + "- Metadata about the data including the structure, format,\n\nand location of the data can be stored in a data catalog.\n\n\nExamples of cloud messaging services include AWS Kinesis\n\n\nFirehose, Google PubSub, and Azure Event Hubs Messaging.\n\nIf you prefer to use open-source products, Apache Kafka is a\n\nvery popular open-source alternative.\n\n### Getting data from your game to the cloud\n\nMoving to the cloud platform part of the journey involves\n\nbuilding a gaming Lakehouse. 
The gaming Lakehouse allows\n\ngaming companies to store, manage, and analyze large volumes\n\nof gaming data, such as player behavior, performance metrics,\n\nand financial transactions, to gain valuable insights and make\n\ndata-driven decisions to improve their business outcomes.\n\n**Next here are the basics of the Databricks**\n\n**platform simplified.**\n\n**Data Ingestion:**\n\n- Data can be ingested into the Gaming Lakehouse using\n\nvarious built-in data ingestion capabilities provided by\n\nDatabricks such as Structured Streaming and Delta Live\n\nTables for a single simple API that handles streaming or\n\nbatch pipelines.\n\n- Data can be ingested in real-time or batch mode from\n\n\n**Data Quality:**\n\n- Databricks platform enables you to validate, clean\n\nand enrich data using built-in libraries and rule-based\n\nvalidation using Delta Live Tables.\n\n- It also allows tracking data quality issues and missing\n\nvalues by using Databricks Delta Live Tables tables.\n\n**Data Security:**\n\n- Databricks provides a comprehensive security model to\n\nsecure data stored in the lake.\n\n- Access to data can be controlled through robust access\n\ncontrols on objects such as catalogs, schemas, tables,\n\nrows, columns, models, experiments, and clusters.\n\n**Analytics:**\n\n- The processed data can be analyzed using various\n\ntools provided by Databricks such as SQL Dashboards,\n\nNotebooks, visualizations and ML.\n\n- Game studios can gain insights into player performance and\n\nbehaviorto better engageplayers and improve their games.\n\n**Get started with your preferred cloud**\n\n\nvarious sources such as game clients, servers or APIs.\n\nData can be cleaned, transformed and enriched with\n\nadditional data sources, making it ready for analysis.\n\n\n-----\n\n# The Value of Data Throughout the Game Development Lifecycle\n\n\n### Lifecycle overview\n\nOver the last decade, the way games have been developed\n\nand monetized has changed dramatically. Most if not all\n\ntop grossing games are now built using a games-as-service\n\nstrategy, meaning titles shipped in cycles of constant\n\niteration to increase engagement and monetization of\n\nplayers over time. Games-as-a-Service models have the\n\nability to create sticky, high-margin games, but they also\n\nheavily depend on cloud-based services such as game\n\nplay analytics, multiplayer servers and matchmaking, player\n\nrelationship management, performance marketing and more.\n\nData plays an integral role in the development and operation\n\nof video games. Teams need tools and services to optimize\n\nplayer lifetime value (LTV) with databases that can process\n\nterabytes-petabytes of evolving data, analytics solutions\n\nthat can access that data with near real-time latency, and\n\nmachine learning (ML) models that can translate insights into\n\nactionable and innovative gameplay features.\n\nA game’s development lifecycle is unique to each studio. With\n\ndifferent skillsets, resources, and genres of games, there is no\n\n\none model. 
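A minimal sketch of the ingestion step described above, assuming a Databricks cluster (where `spark` is predefined), a Kafka-compatible broker, and placeholder topic, checkpoint, and table names; Delta Live Tables would be an equivalent managed alternative.

```python
# Read raw telemetry from a Kafka-compatible broker and append it to a Delta table.
# Broker, topic, checkpoint and table names are placeholders; on Databricks the
# `spark` session is already provided.
raw_events = (
    spark.readStream.format("kafka")
    .option("kafka.bootstrap.servers", "broker:9092")
    .option("subscribe", "game-telemetry")
    .option("startingOffsets", "latest")
    .load()
    .selectExpr("CAST(value AS STRING) AS json_payload", "timestamp AS ingest_ts")
)

(
    raw_events.writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/main/bronze/checkpoints/game_telemetry")
    .trigger(availableNow=True)  # or processingTime="1 minute" for continuous ingestion
    .toTable("main.bronze.game_telemetry_raw")
)
```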
Below is a simplified view of a game development\n\nlifecycle for a studio running a games-as-a-service model.\n\nWhat’s important to remember is that throughout your title’s\n\ndevelopment lifecycle, there is data that can help you better", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "a7c2b5dd3629ef4793294057fff00039", + "heavily depend on cloud-based services such as game\n\nplay analytics, multiplayer servers and matchmaking, player\n\nrelationship management, performance marketing and more.\n\nData plays an integral role in the development and operation\n\nof video games. Teams need tools and services to optimize\n\nplayer lifetime value (LTV) with databases that can process\n\nterabytes-petabytes of evolving data, analytics solutions\n\nthat can access that data with near real-time latency, and\n\nmachine learning (ML) models that can translate insights into\n\nactionable and innovative gameplay features.\n\nA game’s development lifecycle is unique to each studio. With\n\ndifferent skillsets, resources, and genres of games, there is no\n\n\none model. Below is a simplified view of a game development\n\nlifecycle for a studio running a games-as-a-service model.\n\nWhat’s important to remember is that throughout your title’s\n\ndevelopment lifecycle, there is data that can help you better\n\nunderstand your audience, more effectively find and acquire\n\nplayers, and more easily activate and engage them. Whether\n\nusing game play data to optimize creative decision making\n\nduring pre-production, tapping machine learning models to\n\npredict and prevent churn, or identifying the next best offer\n\nor action for your players in real-time, **data is your friend** .\n\n### Use data to develop a next-generation customer experience\n\nIn the game industry, customer experience (CX) is an\n\nimportant factor that can impact a player’s enjoyment of a\n\ngame and the length they choose to play that game over time.\n\nIn today’s highly competitive and fast-paced games industry,\n\na game studio’s ability to deliver exceptional and seamless\n\ncustomer experiences can be a strategic differentiator when\n\nit comes to cutting through the noise and winning a gamer’s\n\n\n## Game Development Lifecycle\n\n**Game Development Lifecycle**\n\n#### Games-as-a-Service (GaaS) / Games-as-a-Community (GaaC) Game-as-a-service (GaaS) / Game-as-a-Community (GaaC)\n\n\n**Game Development Lifecycle**\n\n\n_Game-as-a-service (GaaS) / Game-as-a-Community (GaaC)_\n\n\n**1. Pre-Production**\n\nBrainstorm how to give life to the many\n\nideas laid out in the planning phase\n\n\n**3. Testing**\n\nEvery feature and mechanic in the game needs\n\nto be tested for game loop and quality control\n\n\n**5. Operation**\n\nAs studios increasingly adopt games-as-a-service models, the\n\nongoing operation of a video game is as critical as the launch itself\n\n**OPERATE** **MEASURE** **ENGAGE** **MONETIZE**\n\n\n\n|DISCOVERY & COMPATIBILITY INTEGRATION RELEASE PUBLISH AWARENESS|Col2|Col3|Col4|Col5|Col6|Col7|Col8|\n|---|---|---|---|---|---|---|---|\n|||||||||\n|||||||||\n\n\n**ONBOARDING** **BUILD & TEST** **FLIGHTING AND**\n**EXPERIMENTATION**\n\n\n**2. Production**\n\nMost of the time, effort, and resources\n\nspent on developing video games are\n\nspent in production stage\n\n\n**4. 
Launch**\n\nWhether developing alongside the community with\n\nalpha and beta releases, or launching into general\n\navailability, a game launch is a critical milestone\n\n\n-----\n\ncan help drive value through customer experience:\n\n`1.` **Personalization:** Game studios can use data analytics\n\nand machine learning to personalize the game experience\n\nfor each player based on their preferences and behavior.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "e0222cd32cd29efca3084c4ba130b142", + "**ONBOARDING** **BUILD & TEST** **FLIGHTING AND**\n**EXPERIMENTATION**\n\n\n**2. Production**\n\nMost of the time, effort, and resources\n\nspent on developing video games are\n\nspent in production stage\n\n\n**4. Launch**\n\nWhether developing alongside the community with\n\nalpha and beta releases, or launching into general\n\navailability, a game launch is a critical milestone\n\n\n-----\n\ncan help drive value through customer experience:\n\n`1.` **Personalization:** Game studios can use data analytics\n\nand machine learning to personalize the game experience\n\nfor each player based on their preferences and behavior.\n\nThis can include personalized recommendations for\n\ncontent, in-game events, and other features that are\n\ntailored to the player’s interests.\n\n`2.` **Omnichannel support:** Players often use multiple\n\nchannels, such as social media, forums, and in-game\n\nsupport, to communicate with game studios. Next\n\ngeneration customer experience involves providing a\n\nseamless and integrated support experience across all\n\nthese channels in near-real time.\n\n`3.` **Continuous improvement:** Game studios can use data\n\nand feedback from players to continuously improve\n\n\ngathering feedback on new features and using it to refine\n\nand optimize the game over time.\n\nIn summary, defining what a next generation customer\n\nexperience looks like for your game is important because it can\n\nhelp you create a more personalized, seamless, and enjoyable\n\nexperience for your players, which can lead to increased\n\nengagement, monetization, and loyalty. There are many\n\nways teams can use data throughout a game’s development\n\nlifecycle, but far and away the most valuable focus area will be\n\nin building and refining the customer experience.\n\nThroughout the rest of this guide, we will dig into the most\n\ncommon use cases for data, analytics, and AI in game\n\ndevelopment, starting with where we recommend everyone\n\nbegins: game analytics.\n\n\n# Getting Started with Gaming Use Cases\n\n\n### Where do I start? Start with game analytics\n\n**Overview**\n\nBig question: Where’s the best place to start when it comes\n\nto game data, analytics, and AI? For most game studios,\n\nthe best place to start is with game analytics. Setting up a\n\ndashboard for your game analytics that helps you correlate\n\ndata across disparate sources is infinitely valuable in a world\n\n\nwhere there is no one gaming data source to rule them all.\n\nAn effective dashboard should include your game telemetry\n\ndata, data from any game services you’re running, and data\n\nsources outside of your game such as stores, marketplaces,\n\nand social media. See below.\n\n**What we’re trying to solve/achieve**\n\nGetting a strong foundation in game analytics unlocks more\n\nadvanced data, analytics, and AI use cases. 
For example,\n\nconcurrent player count plus store and marketplace data\n\n\n**GAME TELEMETRY**\n\n\n**Data Sources**\n\n**GAME SERVICES** **OTHER SOURCES**\n\n\n-----\n\nand lifetime value. Usage telemetry combined with crash\n\nreporting and social media listening helps you more quickly\n\nuncover where players might be getting frustrated. And\n\ncorrelating chat logs, voice transcriptions, and or discord\n\n\nthat are relevant and engaging to your players, giving you\n\ntools to effectively market and monetize with your audience.\n\n**Let’s start with Player Segmentation.**\n\n\nand reddit forums can help you identify disruptive behavior\n\n\nbefore it gets out of hand, giving you the tools to take\n\nactionable steps to mitigate toxicity within your community.\n\n**Get started and set up your Analytics Dashboard**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "9e966502682ec7b0fdb300a5e74e2770", + "advanced data, analytics, and AI use cases. For example,\n\nconcurrent player count plus store and marketplace data\n\n\n**GAME TELEMETRY**\n\n\n**Data Sources**\n\n**GAME SERVICES** **OTHER SOURCES**\n\n\n-----\n\nand lifetime value. Usage telemetry combined with crash\n\nreporting and social media listening helps you more quickly\n\nuncover where players might be getting frustrated. And\n\ncorrelating chat logs, voice transcriptions, and or discord\n\n\nthat are relevant and engaging to your players, giving you\n\ntools to effectively market and monetize with your audience.\n\n**Let’s start with Player Segmentation.**\n\n\nand reddit forums can help you identify disruptive behavior\n\n\nbefore it gets out of hand, giving you the tools to take\n\nactionable steps to mitigate toxicity within your community.\n\n**Get started and set up your Analytics Dashboard**\n\n### Understand your audience\n\nWith your analytics pipelines set up, the first area of focus is to\n\nbetter understand your audience. This can help you inform a\n\nvariety of key business decisions, from the highest macro order\n\nof “what game(s) to develop”, to how to market and monetize\n\nthose games, and how to optimize the player experience.\n\nBy understanding the demographics, preferences, and\n\nbehaviors of their audience, a game studio can create games\n\nthat are more likely to appeal to their target market and be\n\nsuccessful. You can also use this understanding to tailor your\n\nmarketing and monetization strategies to the needs and\n\npreferences of your players.\n\nAdditionally, understanding your audience can help you\n\n\n##### Player Segmentation\n\n**Overview**\n\nPlayer segmentation is the practice of dividing players\n\ninto groups based on shared characteristics or behaviors.\n\nSegmentation has a number of benefits. You can better\n\nunderstand your players, create more personalized content,\n\nimprove player retention, and optimize monetization, all of\n\nwhich contributes to an improved player experience.\n\n**What we’re trying to solve/achieve**\n\nThe primary objective of segmentation is to ensure you’re\n\nnot treating your entire playerbase the exact same. Humans\n\nare different, and your players have different motivations,\n\npreferences and behaviors. Recognizing this and engaging\n\nwith them in a way that meets them where they’re at\n\nis one of the most impactful ways you can cultivate\n\nengagement with your game. 
As we mentioned above,\n\nthe benefits of segmentation are broad reaching. Through\n\nbetter understanding of your playerbase, you can better\n\npersonalize experiences, tailoring content and customer\n\nexperience to specific groups of players that increases\n\nengagement and satisfaction. Better understanding of\n\nyour players also helps in improving player retention. By\n\nidentifying common characteristics of players who are at\n\nrisk of churning (i.e., stopping play), you can develop targeted\n\nstrategies that only reach specific audiences.\n\nCreate advanced customer segments to build out more\n\neffective user stories, and identify potential purchasing\n\npredictions based on behaviors. Leverage existing sales\n\ndata, campaigns and promotions systems to create robust\n\nsegments with actionable behavior insights to inform your\n\nproduct roadmap. You can then use this information to build\n\nuseful customer clusters that are targetable with different\n\npromos and offers to drive more efficient acquisition and\n\ndeeper engagement with existing players.\n\n\nidentify potential pain points or areas for improvement\n\n\nwithin your games, allowing you to proactively make changes\n\n\n**Get started with Player Segmentation**\n\n\nto address these issues and improve the player experience\n\nbefore a player potentially churns.\n\n\n-----\n\n**Overview**\n\nPlayer lifetime value (LTV) is a measure of the value that a\n\nplayer brings to a game over the lifetime they play that game.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "eadfe1f8b7a0e36411630a8ece16bbeb", + "strategies that only reach specific audiences.\n\nCreate advanced customer segments to build out more\n\neffective user stories, and identify potential purchasing\n\npredictions based on behaviors. Leverage existing sales\n\ndata, campaigns and promotions systems to create robust\n\nsegments with actionable behavior insights to inform your\n\nproduct roadmap. You can then use this information to build\n\nuseful customer clusters that are targetable with different\n\npromos and offers to drive more efficient acquisition and\n\ndeeper engagement with existing players.\n\n\nidentify potential pain points or areas for improvement\n\n\nwithin your games, allowing you to proactively make changes\n\n\n**Get started with Player Segmentation**\n\n\nto address these issues and improve the player experience\n\nbefore a player potentially churns.\n\n\n-----\n\n**Overview**\n\nPlayer lifetime value (LTV) is a measure of the value that a\n\nplayer brings to a game over the lifetime they play that game.\n\nIt is typically calculated by multiplying the average revenue\n\nper user (ARPU) by the average player lifespan. For example,\n\nif the average player spends $50 per year and plays the\n\ngame for 2 years, their LTV would be $50 * 2 = $100.\n\n**What we’re trying to solve/achieve**\n\nGame studios care about LTV because it helps them\n\nunderstand the long-term value of their players and make\n\ninformed decisions about how to invest in player acquisition\n\nand retention. For example, if the LTV of a player is higher\n\nthan the cost of acquiring them (e.g., through advertising),\n\nit may be worth investing more in player acquisition. 
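One hedged way to build the customer clusters mentioned above is k-means over behavioral features; the sketch below assumes scikit-learn is available, and the feature names, values, and cluster count are illustrative.

```python
import numpy as np
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler

# Made-up behavioral features per player: [sessions_per_week, avg_session_minutes, spend_last_30d]
features = np.array([
    [14, 55, 40.0],
    [12, 60, 35.0],
    [2, 10, 0.0],
    [3, 12, 0.0],
    [7, 30, 5.0],
    [6, 25, 4.0],
])

# Standardize so no single feature dominates the distance metric, then cluster.
scaled = StandardScaler().fit_transform(features)
kmeans = KMeans(n_clusters=3, n_init=10, random_state=0).fit(scaled)

# Each label is a segment you can target with different content, offers, or messaging.
print(kmeans.labels_)
```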
On the\n\nother hand, if the LTV of a player is lower than the cost of\n\nacquiring them, it may be more cost-effective to focus on\n\nretaining existing players rather than acquiring new ones.\n\nLTV is one of the more important metrics that game studios,\n\nparticularly those building live service games, can use to\n\nunderstand the value of their players. It is important to\n\nconsider other metrics as well, such as player retention,\n\nmonetization, and engagement.\n\n**Get started with Player Lifetime Value**\n\n##### Social Media Monitoring\n\n**Overview**\n\nAs the great Warren Buffet once said, “It takes 20 years to\n\nbuild a reputation and five minutes to ruin it. If you think\n\nabout that, you’ll do things differently.” Now more than ever,\n\npeople are able to use social media and instantly amplify\n\ntheir voices to thousands of people who share similar\n\ninterests and hobbies. Take Reddit as an example. r/gaming,\n\nthe largest video game community (also called a subreddit)\n\nhas over 35 million members with nearly 500 new posts\n\nand 10,000 new comments per day, while over 120 game-\n\nspecific subreddits have more than 10,000 members each,\n\nthe largest being League of Legends with over 700,000\n\nmembers. The discourse that takes place on online social\n\nplatforms generates massive amounts of raw and organic\n\n\nbe used to understand how customers think and discover\n\nexactly what they want.\n\nThe act and process of monitoring content online across the\n\ninternet and social media for keyword mentions and trends\n\nfor downstream processing and analytics is called media\n\nmonitoring. By applying media monitoring to social media\n\nplatforms, game developers are able to gain new advantages\n\nthat previously might not have been possible, including:\n\n- Programmatically aggregate product ideas for new\n\nfeature prioritization\n\n- Promote a better user experience by automatically\n\nresponding to positive or negative comments\n\n- Understand the top influencers in the industry who can\n\nsway public opinion\n\n- Monitor broader industry trends and emerging segments\n\nsuch as free-to-play games\n\n- Detect and react to controversies or crises as they begin", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "3469495b7a915327321a284743718afe", + "the largest being League of Legends with over 700,000\n\nmembers. The discourse that takes place on online social\n\nplatforms generates massive amounts of raw and organic\n\n\nbe used to understand how customers think and discover\n\nexactly what they want.\n\nThe act and process of monitoring content online across the\n\ninternet and social media for keyword mentions and trends\n\nfor downstream processing and analytics is called media\n\nmonitoring. 
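A tiny illustration of the LTV-versus-CAC comparison described above, reusing the guide's $50 ARPU x 2-year lifespan example; the CAC figure is made up.

```python
def lifetime_value(arpu: float, avg_lifespan: float) -> float:
    """LTV as described above: average revenue per user times average player lifespan."""
    return arpu * avg_lifespan

ltv = lifetime_value(arpu=50.0, avg_lifespan=2.0)   # the guide's example: $50 * 2 = $100
cac = 60.0                                          # made-up acquisition cost per player

if ltv > cac:
    print(f"LTV ${ltv:.0f} > CAC ${cac:.0f}: acquisition spend pays back; consider scaling it.")
else:
    print(f"LTV ${ltv:.0f} <= CAC ${cac:.0f}: focus on retention and monetization before buying more players.")
```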
By applying media monitoring to social media\n\nplatforms, game developers are able to gain new advantages\n\nthat previously might not have been possible, including:\n\n- Programmatically aggregate product ideas for new\n\nfeature prioritization\n\n- Promote a better user experience by automatically\n\nresponding to positive or negative comments\n\n- Understand the top influencers in the industry who can\n\nsway public opinion\n\n- Monitor broader industry trends and emerging segments\n\nsuch as free-to-play games\n\n- Detect and react to controversies or crises as they begin\n\n- Get organic and unfiltered feedback of games and features\n\n- Understand customer sentiment at scale\n\n- Make changes faster to keep customer satisfaction high\n\nand prevent churn\n\nBy failing to monitor, understand, and act on what customers\n\nare saying about the games and content you release as\n\nwell as broader industry trends, you risk those customers\n\nleaving for a better experience that meets the demands and\n\nrequirements of what customers want.\n\n**What we’re trying to solve/achieve**\n\nBy monitoring and listening to what existing and potential\n\ncustomers are saying on social media, game developers\n\nare able to get a natural and organic understanding of how\n\ncustomers actually feel about the games and products they\n\nrelease, or gauge consumer interest before investing time\n\nand money in a new idea. The main process for social media\n\nmonitoring is to gather data from different social media\n\nplatforms, such as Twitter or YouTube, process those comments\n\nor tweets, then take action on the processed data. While\n\ncustomer feedback can be manually discovered and processed\n\nin search of certain keyword mentions or feedback, it is a much\n\nbetter idea to automate it and do it programmatically.\n\n**Get started with Social Media Monitoring**\n\n\n-----\n\n**Overview**\n\nPlayer feedback analysis is the process of collecting,\n\nanalyzing, and acting on player feedback to inform game\n\ndevelopment. It involves collecting player feedback from\n\nmultiple sources, such as in-game surveys, customer\n\nsupport tickets, social media, marketplace reviews, and\n\nforums, and using data analytics tools to identify patterns,\n\ntrends, and insights. The goal of player feedback analysis is\n\nto better understand player needs, preferences, and pain\n\npoints, and use this information to inform game development\n\ndecisions and improve the overall player experience.\n\nPlayer feedback analysis is an important part of game\n\ndevelopment as it helps ensure that the game continues to\n\nmeet player needs and expectations. By regularly collecting and\n\nanalyzing player feedback, game studios can make data-driven\n\ndecisions to improve the game, increase player engagement\n\nand retention, and ultimately drive success and growth.\n\nFor this use case, we’re going to focus on taking online\n\nreviews for your video game and categorizing the different\n\ntopics players are talking about (bucketing topics) in order\n\nto better understand the themes (via positive or negative\n\nsentiment) affecting your community.\n\n**What we’re trying to solve/achieve**\n\nThis is incredibly helpful, providing data-driven customer\n\ninsight into your development process. 
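As a rough illustration of media monitoring, the sketch below tallies keyword mentions and a naive sentiment count over a few hard-coded comments; a real pipeline would ingest from each platform's API and use a trained sentiment model rather than word lists.

```python
from collections import Counter

# Hard-coded stand-ins for comments pulled from social platforms.
comments = [
    "The new matchmaking update is great, queues are so fast now",
    "Matchmaking is broken again, I keep getting disconnected",
    "Love the new skins but the store prices are too high",
]

KEYWORDS = ["matchmaking", "store", "disconnect"]
POSITIVE, NEGATIVE = {"great", "love", "fast"}, {"broken", "disconnected", "high"}

mentions = Counter()
sentiment = Counter()
for comment in comments:
    lowered = comment.lower()
    words = set(lowered.replace(",", "").split())
    for kw in KEYWORDS:
        if kw in lowered:
            mentions[kw] += 1
    sentiment["positive"] += len(words & POSITIVE)
    sentiment["negative"] += len(words & NEGATIVE)

print(mentions, sentiment)
```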
Whether used in\n\n\n**Overview**\n\nAcross massively multiplayer online video games (MMOs),\n\nmultiplayer online battle arena games (MOBAs) and other\n\nforms of online gaming, players continuously interact in real\n\ntime to either coordinate or compete as they move toward a\n\ncommon goal — winning. This interactivity is integral to game", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "9b831c107aaf2a3678bf3a75ed18d52c", + "analyzing player feedback, game studios can make data-driven\n\ndecisions to improve the game, increase player engagement\n\nand retention, and ultimately drive success and growth.\n\nFor this use case, we’re going to focus on taking online\n\nreviews for your video game and categorizing the different\n\ntopics players are talking about (bucketing topics) in order\n\nto better understand the themes (via positive or negative\n\nsentiment) affecting your community.\n\n**What we’re trying to solve/achieve**\n\nThis is incredibly helpful, providing data-driven customer\n\ninsight into your development process. Whether used in\n\n\n**Overview**\n\nAcross massively multiplayer online video games (MMOs),\n\nmultiplayer online battle arena games (MOBAs) and other\n\nforms of online gaming, players continuously interact in real\n\ntime to either coordinate or compete as they move toward a\n\ncommon goal — winning. This interactivity is integral to game\n\nplay dynamics, but at the same time, it’s a prime opening for\n\ntoxic behavior — an issue pervasive throughout the online\n\nvideo gaming sphere.\n\nToxic behavior manifests in many forms, such as the varying\n\ndegrees of griefing, cyberbullying and sexual harassment\n\nthat are illustrated in the matrix below from [Behaviour](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant)\n\n[Interactive](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant) , which lists the types of interactions seen within\n\nthe multiplayer game, _Dead by Daylight_ .\n\n\npre-production, such as looking at games that are similar\n\n\n**Survivors**\n\n\nwith reviews to learn where those games have strengths and\n\nweaknesses; or using player feedback analysis with a live\n\nservice title to identify themes that can apply to your product\n\nroadmap, player feedback analysis helps teams better\n\nsupport and cultivate engagement with the player community.\n\n\n**GEN**\n\n**RUSHING**\n\n\n**GEN**\n\n\n**HIDING** **ACTIVATING** **LOOPING**\n**EMOTES**\n\n\n**RUSH** **BLINDING** **SANDBAGGING**\n**UNHOOKING**\n\n**TEABAGGING**\n\n\n**REPORTING** **REPORTING**\n\n\n**REPORTING** **REPORTING**\n\n\n**TEXT**\n**CHATTING**\n\n\nUltimately, player feedback analysis does two things. 1) It\n\n\n**Less**\n\n**toxic**\n\n\n**Most**\n**toxic**\n\n\ncan help you stack rank themes according to positive and\n\nnegative sentiment, and 2) you can weight those themes\n\naccording to impact on player engagement, toxicity,\n\nmonetization, churn, and more. We’ve all read reviews that\n\nare overly positive, or overly negative. 
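A hedged sketch of the topic-bucketing idea above: made-up marketplace reviews are assigned to themes by keyword and stack-ranked by volume and average sentiment. The themes, keywords, and pre-labeled sentiment scores are assumptions; a production version would use a topic model and a sentiment classifier.

```python
from collections import defaultdict

# Made-up marketplace reviews with pre-labeled sentiment: +1 positive, -1 negative.
reviews = [
    ("Great gunplay but the servers lag every evening", -1),
    ("Matchmaking puts me against players way above my skill", -1),
    ("The new season's progression feels rewarding", 1),
    ("Lag spikes ruined my ranked matches", -1),
]

THEMES = {
    "performance": ["lag", "fps", "crash", "server"],
    "matchmaking": ["matchmaking", "skill", "queue"],
    "progression": ["progression", "season", "battle pass"],
}

theme_sentiment = defaultdict(list)
for text, sentiment in reviews:
    lowered = text.lower()
    for theme, keywords in THEMES.items():
        if any(kw in lowered for kw in keywords):
            theme_sentiment[theme].append(sentiment)

# Stack-rank themes by review volume, reporting average sentiment for each.
for theme, scores in sorted(theme_sentiment.items(), key=lambda kv: len(kv[1]), reverse=True):
    print(theme, len(scores), sum(scores) / len(scores))
```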
The process of player\n\nfeedback analysis helps to normalize feedback across the\n\ncommunity (keeping in mind, only for those who have written\n\na review), so you’re not over indexing on one review, or a\n\n\n**HATCH** **HATCH**\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\n\n\n**HATCH** **HATCH**\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\n\n**CAMPING** **CAMPING**\n\n\n**FARMING** **FARMING**\n\n\n**CAMPING** **CAMPING**\n\n\n**BEING AWAY**\n**FROM**\n**KEYBOARD**\n**(AFK)**\n\n\n**CAMPING**\n\n**DRIBBLING** **TUNNELING**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "e3f8982a30fe6e2560236fad0446093a", + "are overly positive, or overly negative. The process of player\n\nfeedback analysis helps to normalize feedback across the\n\ncommunity (keeping in mind, only for those who have written\n\na review), so you’re not over indexing on one review, or a\n\n\n**HATCH** **HATCH**\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\n\n\n**HATCH** **HATCH**\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\n\n**CAMPING** **CAMPING**\n\n\n**FARMING** **FARMING**\n\n\n**CAMPING** **CAMPING**\n\n\n**BEING AWAY**\n**FROM**\n**KEYBOARD**\n**(AFK)**\n\n\n**CAMPING**\n\n**DRIBBLING** **TUNNELING**\n\n\n**LOBBY**\n**DODGING**\n\n**BODY**\n**BLOCKING**\n\n**FACE**\n**SLUGGING** **CAMPING**\n\n\n**Killers**\n\n\nsingle theme that may seem in the moment very pressing.\n\nIn addition to the [personal toll](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) that toxic behavior can have\n\n\n**Get started with Player Feedback Analysis**\n\n\non gamers and the community -- an issue that cannot be\n\n\n-----\n\ngame studios. For example, a study from [Michigan State](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity)\n\n\n[University](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) revealed that 80% of players recently experienced\n\ntoxicity, and of those, 20% reported leaving the game due to\n\nthese interactions. Similarly, a study from [Tilburg University](https://arno.uvt.nl/show.cgi?fid=145375)\n\nshowed that having a disruptive or toxic encounter in the first\n\nsession of the game led to players being over three times\n\nmore likely to leave the game without returning. Given that\n\nplayer retention is a top priority for many studios, particularly\n\nas game delivery transitions from physical media releases to\n\nlong-lived services, it’s clear that toxicity must be curbed.\n\nCompounding this issue related to churn, some companies\n\nface challenges related to toxicity early in development,\n\neven before launch. For example, [Amazon’s Crucible](https://www.wired.com/story/amazon-crucible-release-first-big-videogame/) was\n\nreleased into testing without text or voice chat due in part\n\nto not having a system in place to monitor or manage toxic\n\n\nIn this section, we’re going to talk about how to use your data\n\nto more effectively find your target audience across the web.\n\nWhether you’re engaging in paid advertising, influencer or\n\nreferral marketing, PR, cross promotion, community building,\n\netc - use data to separate activity from impact. 
You want\n\nto focus on the channels and strategies that leverage your\n\nresources most effectively, be that time or money.\n\nSay you have a cohort of highly engaged players who are\n\nspending money on your title, and you want to find more\n\ngamers just like that. Doing an analysis on the demographic\n\nand behavioral data of this cohort will give you the\n\ninformation needed to use an ad platform (such as Meta,\n\nGoogle, or Unity) to do lookalike modeling and target those\n\npotential gamers for acquisition.\n\n\ngamers and interactions. This illustrates that the scale of\n\n\nthe gaming space has far surpassed most teams’ ability to\n\nmanage such behavior through reports or by intervening in\n\ndisruptive interactions. Given this, it’s essential for studios", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "7db6c2bd164244acf22349256c2cf9c3", + "to more effectively find your target audience across the web.\n\nWhether you’re engaging in paid advertising, influencer or\n\nreferral marketing, PR, cross promotion, community building,\n\netc - use data to separate activity from impact. You want\n\nto focus on the channels and strategies that leverage your\n\nresources most effectively, be that time or money.\n\nSay you have a cohort of highly engaged players who are\n\nspending money on your title, and you want to find more\n\ngamers just like that. Doing an analysis on the demographic\n\nand behavioral data of this cohort will give you the\n\ninformation needed to use an ad platform (such as Meta,\n\nGoogle, or Unity) to do lookalike modeling and target those\n\npotential gamers for acquisition.\n\n\ngamers and interactions. This illustrates that the scale of\n\n\nthe gaming space has far surpassed most teams’ ability to\n\nmanage such behavior through reports or by intervening in\n\ndisruptive interactions. Given this, it’s essential for studios\n\nto integrate analytics into games early in the development\n\nlifecycle and then design for the ongoing management of\n\ntoxic interactions.\n\n**What we’re trying to solve/achieve**\n\nToxicity in gaming is clearly a multifaceted issue that\n\nhas become a part of video game culture and cannot be\n\naddressed universally in a single way. That said, addressing\n\ntoxicity within in-game chat can have a huge impact given\n\nthe frequency of toxic behavior and the ability to automate\n\nthe detection of it using natural language processing (NLP). In\n\nsummary, by leveraging machine learning to better identify\n\ndisruptive behavior so that better-informed decisions\n\naround handling actions can be made.\n\n**Get started with Toxicity Detection**\n\n\n##### Multi-Touch Attribution\n\n**Overview**\n\nMulti-touch attribution is a method of attributing credit to\n\ndifferent marketing channels or touchpoints that contribute to\n\na sale or conversion. In other words, it is a way of understanding\n\nhow different marketing efforts influence a customer’s decision\n\nto make a purchase or take a desired action.\n\nThere are a variety of different attribution models that can\n\nbe used to assign credit to different touchpoints, each with\n\nits own strengths and limitations. For example, the last-\n\nclick model attributes all credit to the last touchpoint that\n\nthe customer interacted with before making a purchase,\n\nwhile the first-click model attributes all credit to the first\n\ntouchpoint. 
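Relating back to the toxicity-detection passage above (before the multi-touch attribution overview begins), here is a toy NLP classifier: TF-IDF features plus logistic regression over a handful of made-up chat lines, assuming scikit-learn is available. Real moderation systems need large labeled corpora and human review of what gets flagged.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline

# Tiny made-up training set: 1 = disruptive/toxic, 0 = ok.
chat_lines = [
    "you are garbage uninstall the game", "worst teammate ever, total idiot",
    "reported, go back to the tutorial", "nice shot, well played",
    "good game everyone", "anyone want to queue again?",
]
labels = [1, 1, 1, 0, 0, 0]

model = make_pipeline(TfidfVectorizer(ngram_range=(1, 2)), LogisticRegression())
model.fit(chat_lines, labels)

# Probability that each new message is disruptive.
print(model.predict_proba(["gg well played", "you are such an idiot"])[:, 1])
```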
Other models, such as the linear model or\n\nthe time decay model, distribute credit across multiple\n\ntouchpoints based on different algorithms.\n\n**What we’re trying to solve/achieve**\n\nMulti-touch attribution can be useful for game studios because\n\nit can help them understand which marketing channels or\n\nefforts are most effective at driving conversions and inform their\n\nmarketing strategy. However, it is important to choose the right\n\nattribution model for your title based on your business model\n\n(one-time purchase, subscription, free-to-play, freemium,\n\nin-game advertising, etc.) and regularly review and optimize your\n\nattribution efforts to ensure they are accurate and effective.\n\n**Get started with Multi-Touch Attribution**\n\n\n-----\n\n### Activating Your Playerbase\n\nSo far, we’ve discussed how to better understand your\n\nplayers, and how to acquire more of your target audience.\n\nNext, we’re going to dig into how to better activate your\n\nplayers to create a more engaged and loyal playerbase that\n\nstays with your game for the long-term. Here, we’re going to\n\nfocus on strategies that differentiate your gamer experience.\n\n##### Player Recommendations\n\n\nand make in-game purchases. Additionally, personalized\n\nrecommendations can help improve the overall player\n\nexperience and increase satisfaction.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "aabb362105ded42be4f95f2d29ae3027", + "attribution model for your title based on your business model\n\n(one-time purchase, subscription, free-to-play, freemium,\n\nin-game advertising, etc.) and regularly review and optimize your\n\nattribution efforts to ensure they are accurate and effective.\n\n**Get started with Multi-Touch Attribution**\n\n\n-----\n\n### Activating Your Playerbase\n\nSo far, we’ve discussed how to better understand your\n\nplayers, and how to acquire more of your target audience.\n\nNext, we’re going to dig into how to better activate your\n\nplayers to create a more engaged and loyal playerbase that\n\nstays with your game for the long-term. Here, we’re going to\n\nfocus on strategies that differentiate your gamer experience.\n\n##### Player Recommendations\n\n\nand make in-game purchases. Additionally, personalized\n\nrecommendations can help improve the overall player\n\nexperience and increase satisfaction.\n\nGame studios can use a variety of techniques to create player\n\nrecommendations, such as machine learning algorithms,\n\ncollaborative filtering, and manual curation. It is important\n\nto regularly review and optimize these recommendations to\n\nensure that they are effective and relevant to players.\n\n**Get started with Player Recommendations**\n\n\n**Overview**\n\nPlayer recommendations are suggestions for content or actions\n\n\nthat a game studio makes to individual players based on their\n\ninterests and behaviors. These recommendations can be used\n\nto promote specific in-game items, encourage players to try\n\nnew features, or simply provide a personalized experience.\n\n**What we’re trying to solve/achieve**\n\nPlayer recommendations matter to game studios because\n\nthey can help improve player retention, engagement, and\n\nmonetization. 
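Minimal implementations of two of the attribution models just described, linear and time decay, over an illustrative touchpoint journey; the seven-day half-life is an assumption, and the sketch assumes one touchpoint per channel for brevity.

```python
from datetime import datetime

# Illustrative journey: (channel, timestamp) touchpoints before a conversion.
journey = [
    ("paid_social", datetime(2024, 1, 1)),
    ("influencer", datetime(2024, 1, 5)),
    ("email", datetime(2024, 1, 9)),
]
conversion_time = datetime(2024, 1, 10)

def linear_attribution(touchpoints):
    """Every touchpoint gets an equal share of the credit."""
    share = 1.0 / len(touchpoints)
    return {channel: share for channel, _ in touchpoints}

def time_decay_attribution(touchpoints, converted_at, half_life_days=7.0):
    """Touchpoints closer to the conversion get exponentially more credit."""
    weights = {
        channel: 0.5 ** ((converted_at - ts).days / half_life_days)
        for channel, ts in touchpoints
    }
    total = sum(weights.values())
    return {channel: w / total for channel, w in weights.items()}

print(linear_attribution(journey))
print(time_decay_attribution(journey, conversion_time))
```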
By providing players with recommendations\n\nthat are relevant and engaging, studios can increase the\n\nlikelihood that players will continue to play their games\n\n\n##### Next Best Offer/Action\n\n**Overview**\n\nNext best offer (NBO) and next best action (NBA) are\n\ntechniques that businesses use to make personalized\n\nrecommendations to their customers. NBO refers to the\n\npractice of recommending the most relevant product or\n\nservice to a customer based on their past purchases and\n\nbehaviors. NBA refers to the practice of recommending the\n\nmost relevant action or interaction to a customer based on\n\nthe same information.\n\n\n-----\n\nin-game purchase to a player based on their past spending\n\nhabits and the items they have shown an interest in. They\n\nmight use NBA to recommend a specific level or event to a\n\nplayer based on their progress and interests.\n\n**What we’re trying to solve/achieve**\n\nIt’s important to remember that next best offer is a specific\n\nuse case within personalization that involves making\n\nrecommendations to players on the most valuable in-game\n\nitem or action they should take next. For example, a next\n\nbest offer recommendation in a mobile game might suggest\n\nthat a player purchase a specific in-game currency or unlock\n\na new character.\n\nBoth NBO and NBA can be used to improve customer\n\nretention, engagement, and monetization by providing\n\npersonalized recommendations that are more likely to be\n\nrelevant and appealing to individual customers. They can be\n\nimplemented using a variety of techniques, such as machine\n\nlearning algorithms or manual curation.\n\n**Get started with Next Best Offer/Action**\n\n##### Churn Prediction & Prevention\n\n**Overview**\n\nVideo games live and die by their player base. For Games-\n\n\nmay overwhelm the ability of these players to consume,\n\nreinforcing the overall problem of player churn.\n\nAt some point, it becomes critical for teams to take a cold,\n\nhard look at the cost of acquisition relative to the subscriber", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "228e006f7573d58e282d8049ca8f2351", + "item or action they should take next. For example, a next\n\nbest offer recommendation in a mobile game might suggest\n\nthat a player purchase a specific in-game currency or unlock\n\na new character.\n\nBoth NBO and NBA can be used to improve customer\n\nretention, engagement, and monetization by providing\n\npersonalized recommendations that are more likely to be\n\nrelevant and appealing to individual customers. They can be\n\nimplemented using a variety of techniques, such as machine\n\nlearning algorithms or manual curation.\n\n**Get started with Next Best Offer/Action**\n\n##### Churn Prediction & Prevention\n\n**Overview**\n\nVideo games live and die by their player base. For Games-\n\n\nmay overwhelm the ability of these players to consume,\n\nreinforcing the overall problem of player churn.\n\nAt some point, it becomes critical for teams to take a cold,\n\nhard look at the cost of acquisition relative to the subscriber\n\nlifetime value (LTV) earned. 
These figures need to be brought\n\ninto a healthy balance, and retention needs to be actively\n\nmanaged, not as a point-in-time problem to be solved, but\n\nas a “chronic condition” which needs to be managed for the\n\nongoing health of the title.\n\nHeadroom for continued acquisition-driven growth can\n\nbe created by carefully examining why some players leave\n\nand some players stay. When centered on factors known\n\nat the time of acquisition, gaming studios may have the\n\nopportunity to rethink key aspects of their acquisition\n\nstrategy that promote higher average retention rates, which\n\ncan lead to higher average revenue per user.\n\n**Prerequisites for use case**\n\nThis use case assumes a certain level of existing data\n\ncollection infrastructure in the studio. Notably, a studio ready\n\nto implement a churn prediction and prevention model\n\nshould have\n\n- A cloud environment where player data is stored\n\n- This source data should contain player behavior and\n\nsession telemetry events from within the game. This is\n\nthe foundation that insights can be built on top of.\n\n\nas-a-Service (GaaS) titles, engagement is the most\n\n\nimportant metric a team can measure. Naturally, proactively\n\npreventing churn is critical to sustained engagement and\n\n\n**Get started with Churn Prediction & Prevention**\n\n\ngrowth. Through churn prediction and prevention, you will\n\n\nbe able to analyze behavioral data to identify subscribers\n\nwith an increased risk of churn. Next, you will use machine\n\nlearning to quantify the likelihood of a subscriber to churn, as\n\nwell as indicate which factors create that risk.\n\n**What we’re trying to solve/achieve**\n\nBalancing customer acquisition and retention is critical.\n\nThis is the central challenge to the long-term success of\n\nany live service game. This is particularly challenging in that\n\nsuccessful customer acquisition strategies needed to get\n\ngames to scale tend to be followed by service disruptions or\n\ndeclines in quality and customer experience, accelerating\n\nplayer abandonment. To replenish lost subscribers, the\n\nacquisition engine continues to grind and expenses mount.\n\nAs games reach for customers beyond the core playerbase\n\nthey may have initially targeted, the title may not resonate\n\n\n##### Real-time Ad Targeting\n\n**Overview**\n\nReal-time ad targeting in the context of game development\n\nfocuses on using data to deliver personalized and relevant\n\nadvertisements to players in near real-time, while they are\n\nplaying a game. Real-time targeting is performanced based,\n\nusing highly personalized messagings which are achieved\n\nby using data to precisely determine the most opportune\n\nmoments to display ads, based on factors such as player\n\nbehavior, game state, and other contextual information.\n\nKnowing when to send those ads is based on data. This", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "da4fee979a43f9f958bc567f6e1debe3", + "successful customer acquisition strategies needed to get\n\ngames to scale tend to be followed by service disruptions or\n\ndeclines in quality and customer experience, accelerating\n\nplayer abandonment. 
To replenish lost subscribers, the\n\nacquisition engine continues to grind and expenses mount.\n\nAs games reach for customers beyond the core playerbase\n\nthey may have initially targeted, the title may not resonate\n\n\n##### Real-time Ad Targeting\n\n**Overview**\n\nReal-time ad targeting in the context of game development\n\nfocuses on using data to deliver personalized and relevant\n\nadvertisements to players in near real-time, while they are\n\nplaying a game. Real-time targeting is performanced based,\n\nusing highly personalized messagings which are achieved\n\nby using data to precisely determine the most opportune\n\nmoments to display ads, based on factors such as player\n\nbehavior, game state, and other contextual information.\n\nKnowing when to send those ads is based on data. This\n\nuse case is specific to titles using in-game advertising as a\n\nbusiness model. It’s important to note that in-game real-\n\ntime ad targeting requires a sophisticated tech stack, with\n\n\n-----\n\nwith bigger ad ecosystem, ad networks and partners. The\n\nDatabricks Lakehouse platform is an optimal foundation as it\n\nalready contains many of the connectors required to enable\n\nthis use case.\n\n**What we’re trying to solve/achieve**\n\nThe goal of in-game real-time ad targeting is to provide a\n\nmore immersive and relevant advertising experience for\n\nplayers, while also increasing the effectiveness of the ads\n\nfor advertisers. By delivering targeted ads that are relevant\n\nto each player’s interests, game developers can create a\n\nmore enjoyable and personalized gaming experience, which\n\ncan help to reduce churn and increase the lifetime value of\n\neach player. Additionally, real-time ad targeting can also help\n\ngame developers monetize their games more effectively, as\n\nadvertisers are willing to pay a premium for hyper-targeted\n\nand engaged audiences.\n\n**Get started with Real-time Ad Targeting**\n\n### Operational use cases\n\nIn the game development industry, operational analytics\n\n\n**Overview**\n\nAnomaly detection plays an important role in the operation\n\nof a live service video game by helping to identify and\n\ndiagnose unexpected behaviors in real-time. By identifying\n\npatterns and anomalies in player behavior, system\n\nperformance, and network traffic, this information can then\n\nbe used to detect and diagnose server crashes, performance\n\nbottlenecks, and hacking attempts. The ability to understand\n\nif there will be an issue before it becomes widespread is\n\nimmensely valuable. Without anomaly detection, which is\n\na form of advanced analytics, you’re always in a reactive\n\n(rather than proactive) state. Anomaly detection is a type of\n\nquality of service solution.\n\n**What we’re trying to solve/achieve**\n\nThe goal of anomaly detection is to ensure that players\n\nhave a stable and enjoyable gaming experience. This has\n\nan impact across your game, from reducing downtime,\n\nto minimizing player churn, and improving your game’s\n\nreputation and revenue. Additionally, the insights gained from\n\nanomaly detection can also be used to mitigate cheating and\n\ndisruptive behavior.\n\n**Get started with Anomaly Detection**\n\n\nare essential for ensuring a smooth and efficient production\n\n\nprocess. One common use case is anomaly detection, where\n\ndata analytics is utilized to identify any unusual patterns\n\nor behaviors in the game, such as crashes or performance\n\nissues. 
This helps developers quickly identify and fix\n\nproblems, improving the overall quality of the game. Another\n\nexample is build pipelines, where data analytics can be used\n\nto monitor and optimize the process of creating new builds", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "b594c19e3f84c444c8a005a86a439733", + "quality of service solution.\n\n**What we’re trying to solve/achieve**\n\nThe goal of anomaly detection is to ensure that players\n\nhave a stable and enjoyable gaming experience. This has\n\nan impact across your game, from reducing downtime,\n\nto minimizing player churn, and improving your game’s\n\nreputation and revenue. Additionally, the insights gained from\n\nanomaly detection can also be used to mitigate cheating and\n\ndisruptive behavior.\n\n**Get started with Anomaly Detection**\n\n\nare essential for ensuring a smooth and efficient production\n\n\nprocess. One common use case is anomaly detection, where\n\ndata analytics is utilized to identify any unusual patterns\n\nor behaviors in the game, such as crashes or performance\n\nissues. This helps developers quickly identify and fix\n\nproblems, improving the overall quality of the game. Another\n\nexample is build pipelines, where data analytics can be used\n\nto monitor and optimize the process of creating new builds\n\nof the game. By tracking key metrics such as build time,\n\nerror rates, and resource utilization, developers can make\n\ninformed decisions about how to optimize the build process\n\nfor maximum efficiency. Other operational use cases in game\n\ndevelopment include tracking player behavior, measuring\n\nserver performance, and analyzing sales and marketing data.\n\nLets explore a few of these below.\n\n\n##### Build Pipeline\n\n**Overview**\n\nA build pipeline is a set of automated processes that\n\nare used to compile and assemble the code, assets, and\n\nresources that make up a game project. The build pipeline\n\ntypically includes several stages, such as code compilation,\n\noptimization, testing, and release. The purpose of a build\n\npipeline is to streamline the game development process\n\nand ensure that each stage of development is completed\n\nefficiently and effectively. A build pipeline can be configured\n\nto run automatically, so that new builds are generated\n\nwhenever changes are made to the code or assets. This\n\nhelps to ensure that the game is always up-to-date and\n\nready for testing and release. The logs are collected are in\n\nnear-real time from build servers. A simplified example:Dev\n\nX is committing code on title Y, submitted on day Z,\n\nalong with the log files from the pipeline and build server.\n\nBuilds typically take multiple hours to complete, requiring\n\nsignificant amounts of compute via build farms. Being able to\n\n\n-----\n\nare wasting compute, and being able to predict which builds\n\nwill fail as they goes through the pipeline are ways to curb\n\noperational expenses.\n\n**What we’re trying to solve/achieve**\n\nWith this use case, we’re seeking to reduce wasted compute\n\nand build a foundational view of what was developed, by\n\nwho, when and how testing performed. In an ideal state, our\n\nautomated build pipeline could send a notification to the\n\ndeveloper with a confidence metric on the build making it\n\nthrough, allowing them to decide whether to continue or\n\nmove another build through the pipeline. 
Often, developers\n\ndo not have clear visibility until the build has completed\n\nor failed. By providing more insight to devs into the build\n\npipeline process, we can increase the rate at which builds\n\nare completed efficiently and effectively.\n\n**Get started with Build Pipeline**\n\n##### Crash Analytics\n\n\nresources were being used. How long crash testing takes\n\ncan vary, depending on the game’s business model, amount\n\nof content, and scope. For a title with a one-time release,\n\nwhere there is a large amount of content and a complex\n\nstoryline, the chances of hidden crashes causing errors while\n\nin development are high, making it require more time to\n\nperform testing before the game can be published. For titles\n\nbuilt in a game-as-a-service model, i.e. a game shipped in\n\ncycles of constant iteration, crash detection should be done", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "0d75377e5be40a4a0c15fdaea8622460", + "through, allowing them to decide whether to continue or\n\nmove another build through the pipeline. Often, developers\n\ndo not have clear visibility until the build has completed\n\nor failed. By providing more insight to devs into the build\n\npipeline process, we can increase the rate at which builds\n\nare completed efficiently and effectively.\n\n**Get started with Build Pipeline**\n\n##### Crash Analytics\n\n\nresources were being used. How long crash testing takes\n\ncan vary, depending on the game’s business model, amount\n\nof content, and scope. For a title with a one-time release,\n\nwhere there is a large amount of content and a complex\n\nstoryline, the chances of hidden crashes causing errors while\n\nin development are high, making it require more time to\n\nperform testing before the game can be published. For titles\n\nbuilt in a game-as-a-service model, i.e. a game shipped in\n\ncycles of constant iteration, crash detection should be done\n\ncontinuously, since errors in newly released content might\n\naffect the base game and lead to crashes.\n\nIncreasingly, titles are being released in alpha (where\n\ndevelopers do the testing), closed beta (which includes a\n\nlimited group of testers/sample-users who do the gameplay\n\ntesting) and open betas (where anyone interested can register\n\nto try the game). All of which happens before the game is\n\n“officially” released. Regardless of alpha, beta, or GA, players\n\nmay stumble over game crashes, which triggers crash reports\n\nthat are sent to the developers for fixing. But sometimes, it\n\ncan be challenging to understand the issue that caused the\n\ncrash from crash reports provided by your game’s platform.\n\n**What we’re trying to solve/achieve**\n\nUltimately, the purpose of crash analytics is to identify the\n\nroot cause of a crash, and help you take steps to prevent\n\nsimilar crashes from happening in the future. This feedback\n\nloop can be tightened through automation in the data\n\npipeline. For example, by tracking crashes caused on builds\n\nfrom committers, the data can provide build suggestions\n\nto improve crash rate. Furthermore, teams can automate\n\ndeduplication when multiple players experience the same\n\nerrors, helping to reduce noise in the alerts received.\n\n**Get started with Crash Analytics**\n\n\n**Overview**\n\nGames crash, it is a fact of game development. 
The\n\ncombination of drivers, hardware, software, and\n\nconfigurations create unique challenges in tracking, resolving\n\nand managing the user experience.\n\nCrash analytics and reporting is the process of collecting\n\ninformation about crashes or unexpected failures in a\n\nsoftware application, in this case, a video game. A crash\n\nreport typically includes information about the state of the\n\ngame at the time of the crash, such as what the player was\n\n\n-----\n\n# Things to look forward to\n\n\nThis eBook was created to help game developers better\n\nwrap their heads around the general concepts in which data,\n\nanalytics, and AI can be used to support the development\n\nand growth of video games. **If you only have 5 minutes,**\n\n**these takeaways are critical to your success** .\n\nFor more information on advanced data, analytics, and AI use\n\ncases, as well as education resources, we highly recommend\n\nDatabricks training portal [dbricks.co/training](http://dbricks.co/training) .\n\n**Top takeaways:**\n\nIf you take nothing else from this guide, here are the most\n\nimportant takeaways we want to leave with you on your journey.\n\n`1.` **Data is fundamental. Data, analytics, and AI play a role**\n\nthroughout the entire game development lifecycle - from\n\ndiscovery to pre-production, development to operating\n\na game as a live service. Build better games, cultivate", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "9036a5784356944e4eb45e43a0f91ec0", + "wrap their heads around the general concepts in which data,\n\nanalytics, and AI can be used to support the development\n\nand growth of video games. **If you only have 5 minutes,**\n\n**these takeaways are critical to your success** .\n\nFor more information on advanced data, analytics, and AI use\n\ncases, as well as education resources, we highly recommend\n\nDatabricks training portal [dbricks.co/training](http://dbricks.co/training) .\n\n**Top takeaways:**\n\nIf you take nothing else from this guide, here are the most\n\nimportant takeaways we want to leave with you on your journey.\n\n`1.` **Data is fundamental. Data, analytics, and AI play a role**\n\nthroughout the entire game development lifecycle - from\n\ndiscovery to pre-production, development to operating\n\na game as a live service. Build better games, cultivate\n\ndeeper player engagements, and operate more effectively\n\n\nby utilizing the full potential of your data.\n\n`2.` **Define your goals.** Start by establishing the goals of what\n\nyou’re hoping to learn and or understand around your\n\ngame. Clear goals make it easier to identify key metrics\n\nto track, example goals include; developing high-quality\n\ngames that provide engaging and satisfying player\n\nexperiences, increasing player engagement and retention\n\nby analyzing and improving gameplay and mechanics, and\n\nbuilding a strong and positive brand reputation through\n\neffective marketing and community outreach.\n\n`3.` **Identify and understand your data sources.** Spend time\n\nto identify and understand the breadth of data sources\n\nyou are already collecting, be that game telemetry,\n\nmarketplace, game services, or sources beyond the game\n\nlike social media. 
It is critical to collect the right data, and\n\ntrack the right metrics based on the goals and objectives\n\nyou have set for your game.\n\n`4.` **Start small, and iterate quickly.** Recognize that goals and\n\nobjectives evolve as you learn more about the interaction\n\n\n-----\n\nare most effective when scoped small with tight feedback\n\nloops, allowing you to quickly adapt with your community\n\nand alongside shifting market conditions.\n\n`5.` **Game analytics forms the foundation.** Start by getting a\n\ngame analytics dashboard up and running. The process of\n\nbuilding out a dashboard will naturally require connecting\n\nand transforming your data in a way to unlock more\n\nadvanced use cases down the road.\n\n`6.` **Plan and revisit your data strategy frequently.** Once\n\ndashboarding is set up, you’ll have a better picture of what\n\ndownstream data use cases make the most sense for\n\nyour game and business objectives. As you move to use\n\ncases such as player segmentation, churn analysis, and\n\nplayer lifetime value, revisit your data strategy frequently\n\nto ensure you’re spending time on use cases that drive\n\nactionable insights for you and your team.\n\n`7.` **Show value broad and wide.** Whether your data strategy\n\nis new or well established on the team, build the habit\n\nof communicating broadly to stakeholders across the\n\ncompany. Early in the process, it is important to gather\n\ncritical feedback on what data is helpful and where there\n\nare opportunities for improvement. The worst thing that\n\ncan happen is you create something that no one uses.\n\nThat is a waste of everyone’s time and money.\n\n`8.` **Ask for help.** Engage with your technical partners. There\n\nare humans who can help ensure you’re developing your\n\ndata and analytics platform in a way that is efficient and\n\neffective. There are numerous partners with domain\n\nexpertise in data science and data engineering that can\n\naccelerate your data journey - here is our recommended", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "c9711ea58e3c015192f3db3f146bafab", + "player lifetime value, revisit your data strategy frequently\n\nto ensure you’re spending time on use cases that drive\n\nactionable insights for you and your team.\n\n`7.` **Show value broad and wide.** Whether your data strategy\n\nis new or well established on the team, build the habit\n\nof communicating broadly to stakeholders across the\n\ncompany. Early in the process, it is important to gather\n\ncritical feedback on what data is helpful and where there\n\nare opportunities for improvement. The worst thing that\n\ncan happen is you create something that no one uses.\n\nThat is a waste of everyone’s time and money.\n\n`8.` **Ask for help.** Engage with your technical partners. There\n\nare humans who can help ensure you’re developing your\n\ndata and analytics platform in a way that is efficient and\n\neffective. There are numerous partners with domain\n\nexpertise in data science and data engineering that can\n\naccelerate your data journey - here is our recommended\n\npartner list for [data, analytics, and AI workloads](https://www.databricks.com/company/partners/consulting-and-si) .\n\n`9.` **Participate in the community.** The community for game\n\nanalytics is large and growing. It is important to research and\n\n\nyour needs and interests. 
Here are a few of our favorites:\n\n`a.` [IGDA Game Analytics](https://igda.org/sigs/analytics/) : The IGDA has a number of\n\nSpecial Interest Groups that bring together user\n\nresearchers, designers, data engineers and data\n\nscientists focused on understanding player behavior\n\nand experiences. They offer resources and events\n\nfor those working in games user research, including a\n\nyearly Games User Research Summit.\n\n`b.` [Data Science Society](https://www.datasciencesociety.net/) : The Data Science Society is a\n\nglobal community of data scientists and engineers.\n\nWhile not specifically focused on game development,\n\nthey offer a wealth of resources and opportunities for\n\nlearning, networking, and collaboration in the field of\n\ndata science.\n\n`c.` [Hugging Face](https://huggingface.co/) : is hub of open source models for Natural\n\nLanguage Processing, computer vision, and other fields\n\nwhere AI plays its role. They also provide an online\n\nplatform where users can access pre-trained models\n\nand tools, share their own models and datasets, and\n\ncollaborate with other developers in the community.\n\n`d.` [Data Engineering subreddit](https://www.reddit.com/r/dataengineering/) : The Data Engineering\n\nsubreddit is a forum for data engineers to discuss\n\ntopics related to building and managing data pipelines,\n\ndata warehousing, and related technologies. While\n\nnot specifically focused on game development, it\n\ncan be a valuable resource for those working on data\n\nengineering in the gaming industry.\n\n`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n\nfirst step in your data journey. Imagine how the output of\n\nyour data can be presented in a way to help stakeholders\n\nacross your company achieve more. For example, dropping\n\ndata into an application that can help game designers\n\nmake balancing decisions based on player events.\n\n\n-----\n\n# APPENDIX Ultimate class build guide\n\n\n### Creating a character\n\nThe heart and soul of mature data teams are formed by this\n\ntrio of classes. There are many aspects to these roles, but\n\nthey can be summarized in that Data Engineers create and\n\nmaintain critical data workflows, Data Analysts interpret data\n\nand create reports that keep the business teams running\n\nseamlessly, and Data Scientists are responsible for making\n\nsense of large amounts of data. Depending on the size of\n\nthe organization, individuals may be required to multiclass", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "1ce1d861d15136fd48438be91479e567", + "engineering in the gaming industry.\n\n`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n\nfirst step in your data journey. Imagine how the output of\n\nyour data can be presented in a way to help stakeholders\n\nacross your company achieve more. For example, dropping\n\ndata into an application that can help game designers\n\nmake balancing decisions based on player events.\n\n\n-----\n\n# APPENDIX Ultimate class build guide\n\n\n### Creating a character\n\nThe heart and soul of mature data teams are formed by this\n\ntrio of classes. 
There are many aspects to these roles, but\n\nthey can be summarized in that Data Engineers create and\n\nmaintain critical data workflows, Data Analysts interpret data\n\nand create reports that keep the business teams running\n\nseamlessly, and Data Scientists are responsible for making\n\nsense of large amounts of data. Depending on the size of\n\nthe organization, individuals may be required to multiclass\n\nin order to address needs of the team. In smaller studios, it’s\n\noften developers who wear multiple hats, including those in\n\ndata engineering, analytics and data science.\n\nWhether you’re looking to stand-up an analytics dashboard\n\nto report on the health of a title or building a recommendation\n\nengine for your players, this guide will help you better\n\nunderstand the unique classes required to develop and\n\nmaintain an effective data, analytics, and AI platform.\n\n##### Data Engineers\n\n\n**Goals and Priorities of Data Engineers**\n\n- Enable access to usable data for real-time insights — data\n\nthat both enables timely decision-making and is accurate\n\nand reproducible\n\n- Increase user confidence and trust in data. This involves\n\nensuring high consistency and reliability in ETL processes\n\n- Limit the issues and failures experienced by other\n\nengineers and data scientists, allowing those roles to\n\nfocus less on troubleshooting and more on drawing\n\nmeaningful conclusions from data and building new\n\nproducts / features\n\n**What Data Engineers care about:**\n\n- Enabling access to data for real-time insights — data that\n\nboth enables timely decision-making and is accurate and\n\nreproducible\n\n- Building high-performance, reliable and scalable pipelines\n\nfor data processing\n\n- Delivering data for consumption from a variety of sources\n\nby Data Analysts and Data Scientists against tight SLAs\n\n- A Data Engineer’s biggest challenge? Collaboration\n\nacross teams\n\n\nData engineers build systems that collect, manage, and\n\n\nconvert source data into usable information for data\n\nscientists and business analysts to interpret. Their ultimate\n\ngoal is to make data accessible so that teams can use it to\n\nevaluate and optimize a goal or objective.\n\n**Responsibilities:**\n\n- Data Engineers are responsible for data migration,\n\nmanipulation, and integration of data (joining dissimilar\n\ndata systems)\n\n- Setup and maintenance of ETL pipelines to convert\n\nsource data into actionable data for insights. It is the\n\nresponsibility of the data engineer to make sure these\n\npipelines run efficiently and are well orchestrated.\n\n- The Data Engineer sets up the workflow process\n\nto orchestrate pipelines for the studio’s data and\n\ncontinuously validates it\n\n- Managing workflows to enable data scientists and data\n\nanalysts, and ensuring workflows are well-integrated with\n\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\n\n\n##### Data Scientists\n\nData scientists determine the questions their team should\n\nbe asking and figure out how to answer those questions\n\nusing data. 
They often develop predictive models for\n\ntheorizing and forecasting.\n\n**Responsibilities:**\n\n- Responsible for making sense of the large amounts of data\n\ncollected for a given game title, such as game telemetry,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "daee446a9e3d5402fc1e2ae7ee387d8d", + "- Setup and maintenance of ETL pipelines to convert\n\nsource data into actionable data for insights. It is the\n\nresponsibility of the data engineer to make sure these\n\npipelines run efficiently and are well orchestrated.\n\n- The Data Engineer sets up the workflow process\n\nto orchestrate pipelines for the studio’s data and\n\ncontinuously validates it\n\n- Managing workflows to enable data scientists and data\n\nanalysts, and ensuring workflows are well-integrated with\n\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\n\n\n##### Data Scientists\n\nData scientists determine the questions their team should\n\nbe asking and figure out how to answer those questions\n\nusing data. They often develop predictive models for\n\ntheorizing and forecasting.\n\n**Responsibilities:**\n\n- Responsible for making sense of the large amounts of data\n\ncollected for a given game title, such as game telemetry,\n\nbusiness KPIs, game health and quality, and sources\n\nbeyond the game such as social media listening\n\n- The analytics portion of a Data Scientist’s job means\n\nlooking at new and existing data to try and discover new\n\nthings within it\n\n- The engineering component may include writing out\n\npipeline code and deploying it to a repository\n\n- Data Scientists are responding for building, maintaining, and\n\nmonitoring models used for analytics and/or data products\n\n\n-----\n\n**Goals and Priorities:**\n\n- Developing new business capabilities (such as behavioral\n\nsegmentation, churn prediction, recommendations) and\n\noptimizing processes around those capabilities\n\n- Increase ROI by building algorithms and tools that are\n\nmaintainable and reusable\n\n- Exploring (or further expanding) the use of machine\n\nlearning models for specific use cases\n\n- Bridges the gap between engineering and analytics,\n\nbetween the technology teams and business teams\n\n- Provides business side of studio with data that is crucial\n\nin decision-making, for example a churn model that helps\n\npredict the impact of a new feature set\n\n**What Data Scientists care about:**\n\n- Creating exploratory analysis or models to accurately\n\npredict business metrics, e.g., customer spend, churn,\n\netc., and provide data-driven recommendations\n\n- Enable team with actionable insights that are easy to\n\nunderstand and well curated\n\n- Create and move models from experimentation to\n\nproduction\n\n- A Data Scientist’s biggest challenge? 
Keeping up with\n\nadvancements and innovation in data science, and\n\nknowing which tools and libraries to use\n\n##### Data Analysts\n\nA data analyst reviews data to identify key insights into a\n\ngame studio’s customers and ways the data can be used to\n\nsolve problems.\n\n**Responsibilities:**\n\n- Often serves as the go-to point of contact for non-\n\n\n\n- Analysts often interpret data and create reports or other\n\ndocumentation for studio leadership\n\n- Analysts typically are responsible for mining and\n\ncompiling data\n\n- Streamline and or simplify processes when possible\n\n**Goals and Priorities:**\n\n- Empower stakeholder and business teams with\n\nactionable data\n\n- “Catch things before they break”. Proactively mitigate\n\npotential data issues before they occur (for internal and\n\nexternal customers)\n\n- Analysts are often recruited to assist other teams (i.e., BI\n\nteams) with their domain knowledge\n\n- Driving business impact through documentation and\n\nreliable data\n\n**What Data Analysts care about:**\n\n- Easy access to high quality data.\n\n- Quickly find insights from data with SQL queries and\n\ninteractive visualizations.\n\n- The ability to easily share insights and while creating", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "7c881f4c6e03e3d222ec7082d373b2ab", + "- Often serves as the go-to point of contact for non-\n\n\n\n- Analysts often interpret data and create reports or other\n\ndocumentation for studio leadership\n\n- Analysts typically are responsible for mining and\n\ncompiling data\n\n- Streamline and or simplify processes when possible\n\n**Goals and Priorities:**\n\n- Empower stakeholder and business teams with\n\nactionable data\n\n- “Catch things before they break”. Proactively mitigate\n\npotential data issues before they occur (for internal and\n\nexternal customers)\n\n- Analysts are often recruited to assist other teams (i.e., BI\n\nteams) with their domain knowledge\n\n- Driving business impact through documentation and\n\nreliable data\n\n**What Data Analysts care about:**\n\n- Easy access to high quality data.\n\n- Quickly find insights from data with SQL queries and\n\ninteractive visualizations.\n\n- The ability to easily share insights and while creating\n\nimpactful assets for others to consume (dashboards, reports).\n\n- A Data Analyst’s biggest challenge? Working with complex\n\nprocesses and complicated technologies that are filled\n\nwith messy data. While fighting these challenges, Analysts\n\nare often left alone or forced through paths that prevent\n\ncollaboration with others across team/organization.\n\n- Untrustworthy data: often Analysts get asked to provide\n\nanswers to leadership that will leverage the data to\n\ndetermine the direction of the company. 
When the data is\n\nuntrustworthy or incorrect due to previously mentioned\n\nchallenges this can eventually lead to lack of trust in the\n\ndata teams from leadership or the business.\n\n\ntechnical business / operations colleagues for data\n\naccess / analysis questions\n\n\n-----\n\n# Data access and the major cloud providers\n\n\n### Cloud Rosetta Stone\n\n[AWS / Azure / GCP Service Comparison - Click Here](https://cloud.google.com/free/docs/aws-azure-gcp-service-comparison)\n\nIf you are newer to the cloud computing space, it is easy to\n\nget lost between the hundreds of different services between\n\nthe three major cloud providers. The table below is meant to\n\nhighlight the important data, analytics, and AI services used\n\nby the various hyperscale service providers Amazon,\n\nMicrosoft, and Google. In addition, it aims to pair up services\n\nfrom different cloud providers that serve the same purpose.\n\n### Getting started with the major cloud providers\n\nHere are some quick ways to get started with the three major\n\ncloud providers: AWS, Azure, and GCP:\n\n**AWS:**\n\n`1.` **[Create an AWS account](https://portal.aws.amazon.com/billing/signup)** **:** The first step is to create an\n\naccount on the AWS website. This will give you access to\n\nthe AWS Management Console, which is the web-based\n\ninterface for managing your AWS resources.\n\n\n`2.` **Use the AWS free tier:** AWS offers a free tier of service\n\nthat provides a limited amount of free resources each\n\nmonth. This is a great way to get started and try out\n\nvarious AWS services without incurring any charges.\n\n`3.` **Explore the AWS Management Console:** Once you have\n\nan account and are logged in, take some time to explore\n\nthe AWS Management Console and familiarize yourself\n\nwith the various services that are available.\n\n`4.` **Next you can search for Databricks:** In the AWS\n\nManagement Console, use the search bar in the top-left\n\ncorner of the page and search for “Databricks”.\n\n`5.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "2adea317df8b15dd30fcfedb786f6474", + "interface for managing your AWS resources.\n\n\n`2.` **Use the AWS free tier:** AWS offers a free tier of service\n\nthat provides a limited amount of free resources each\n\nmonth. This is a great way to get started and try out\n\nvarious AWS services without incurring any charges.\n\n`3.` **Explore the AWS Management Console:** Once you have\n\nan account and are logged in, take some time to explore\n\nthe AWS Management Console and familiarize yourself\n\nwith the various services that are available.\n\n`4.` **Next you can search for Databricks:** In the AWS\n\nManagement Console, use the search bar in the top-left\n\ncorner of the page and search for “Databricks”.\n\n`5.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`6.` **Launch Databricks Workspace:** To launch the Databricks\n\nWorkspace on AWS, you can use the CloudFormation\n\ntemplate provided by Databricks. 
Databricks\n\nCloudFormation template creates an IAM role, security\n\ngroup, and Databricks Workspace in your AWS account.\n\n**Azure:**\n\n`1.` **[Create an Azure account](https://azure.microsoft.com/en-us/free/gaming/)** **:** The first step is to create\n\nan account on Azure portal. This will give you access to\n\nthe Azure portal, which is the web-based interface for\n\nmanaging your Azure resources.\n\n\n\n\n\n\n\n\n\n\n\n|Service Type|Service Description|AWS Service|Azure Service|GCP Service|\n|---|---|---|---|---|\n|Storage|Object storage for various file types and artifacts (CSV, JSON, Delta, JAR). Objects can be retrieved by other services|Amazon Simple Storage Service (S3)|Azure Blob Storage|Google Cloud Storage|\n|Compute|High-performance VMs to run applications. Platform where data transformations are run in Big Data apps.|Amazon Elastic Compute (EC2)|Azure Virtual Machines|Google Compute Engine|\n|Messaging|Real-time event streaming services to write data to object stores or data warehouses. One OSS version is Kafka|Amazon Kinesis|Azure Service Bus Messaging|Google Pub/Sub|\n|Data Warehouse|Traditional data storage layer for structured data, to then be used by data analysts. Often used to read from a Data Lake, which acts as a single source of truth|Redshift or Databricks|Synapse or Databricks|BigQuery or Databricks|\n\n\n-----\n\n**Jargon Glossary**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "727106f5081e38bb55fa6b831992c7c7", + "|Service Type|Service Description|AWS Service|Azure Service|GCP Service|\n|---|---|---|---|---|\n|Storage|Object storage for various file types and artifacts (CSV, JSON, Delta, JAR). Objects can be retrieved by other services|Amazon Simple Storage Service (S3)|Azure Blob Storage|Google Cloud Storage|\n|Compute|High-performance VMs to run applications. Platform where data transformations are run in Big Data apps.|Amazon Elastic Compute (EC2)|Azure Virtual Machines|Google Compute Engine|\n|Messaging|Real-time event streaming services to write data to object stores or data warehouses. One OSS version is Kafka|Amazon Kinesis|Azure Service Bus Messaging|Google Pub/Sub|\n|Data Warehouse|Traditional data storage layer for structured data, to then be used by data analysts. Often used to read from a Data Lake, which acts as a single source of truth|Redshift or Databricks|Synapse or Databricks|BigQuery or Databricks|\n\n\n-----\n\n**Jargon Glossary**\n\n|CDP|Customer Data Platform (CDP). A CDP is a piece of software that combines data from multiple tools to create a single centralized customer database containing data on all touch points and interactions with your product or service.|\n|---|---|\n|ETL|Extract, Transform, Load. In computing, extract, transform, load is a three-phase process where data is extracted, transformed and loaded into an output data container. The data can be collated from one or more sources and it can also be outputted to one or more destinations|\n|KPI|Key Performance Indicator, a quantifiable measure of performance over time for a specifci objective. KPIs provide targets for teams to shoot for, milestones to gauge progress, and insights that help people across the organization make better decisions.|\n|POC|Proof of Concept (PoC). A proof of concept is a prototype or initial implementation of a solution that is developed to demonstrate the feasibility of a concept or idea. 
It is often used to test the effectiveness of a new tool or approach to data analysis or machine learning before investing in a full-scale implementation.|\n|MVP|Minimum Viable Product (MVP). An MVP refers to the smallest possible solution that can be delivered to meet a specific business need. The goal of an MVP is to quickly validate assumptions and prove the potential value of a larger project. By delivering a smaller solution first, stakeholders can gain confidence in the project and see a return on investment sooner, while also providing feedback to improve the larger project.|\n|ROI|Return on investment (ROI), which is calculated by dividing the profit earned on an investment by the cost of that investment.|\n|Serverless computing|Using compute platforms that are completely managed by service providers. When using serverless computing, you simply execute queries or deploy applications and the service provider (AWS, Databricks, etc.) handles necessary server maintenance.|\n|VPC|Virtual Private Cloud. A VPC is a virtual cloud networking environment, which helps organize and give you control of your resources. You also define how resources within your VPC can communicate with other regions, VPCs, and the public internet with traffic rules and security groups.|\n\n\n`2.` **Take Azure tutorials:** Azure provides tutorials,\n\ndocumentation, and sample templates to help you get\n\nstarted. These resources can help you understand the\n\nbasics of Azure and how to use its services.\n\n`3.` **You can search for Databricks:** In the Azure portal, use the\n\nsearch bar at the top of the page and search for “Databricks”.\n\n`4.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`5.` **Create a new Databricks workspace:** To create a new\n\nDatabricks workspace, you can use the Azure portal, Azure\n\nCLI or Azure Powershell. Once created, you’ll be able to\n\naccess your Databricks Workspace through the Azure portal.\n\n`6.` **Other Azure Services:** Once you have a Databricks\n\nworkspace setup, you can easily connect it to other Azure\n\nServices such as Azure Storage, Event Hubs, Azure Data", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "64813058f8cb44c5e7915fedef13435a", + "basics of Azure and how to use its services.\n\n`3.` **You can search for Databricks:** In the Azure portal, use the\n\nsearch bar at the top of the page and search for “Databricks”.\n\n`4.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`5.` **Create a new Databricks workspace:** To create a new\n\nDatabricks workspace, you can use the Azure portal, Azure\n\nCLI or Azure Powershell. Once created, you’ll be able to\n\naccess your Databricks Workspace through the Azure portal.\n\n`6.` **Other Azure Services:** Once you have a Databricks\n\nworkspace setup, you can easily connect it to other Azure\n\nServices such as Azure Storage, Event Hubs, Azure Data\n\nLake Storage, Azure SQL and Cosmos DB for example.\n\n\n**GCP:**\n\n`1.` **[Create a GCP account](https://console.cloud.google.com/freetrial)** **:** the first step is to create an\n\naccount on GCP portal. 
This will give you access to the\n\nGCP Console, which is the web-based interface for\n\nmanaging your GCP resources.\n\n`2.` **Explore the GCP Console:** Once you have an account\n\nand are logged in, take some time to explore the GCP\n\nConsole and familiarize yourself with the various services\n\nthat are available.\n\n`3.` **Search for Databricks:** In the GCP Console, use the search bar\n\nin the top-left corner of the page and search for “Databricks”.\n\n`4.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`5.` **Create a new Databricks workspace:** To create a new\n\nDatabricks workspace, you can use the GCP Console or\n\nthe gcloud command-line tool. Once created, you’ll be\n\nable to access your Databricks Workspace through the\n\nGCP Console.\n\n\n-----\n\n# Detailed Use Cases\n\n\n### Getting started with game analytics\n\nFortunately, standing up an effective analytics dashboard\n\nis getting easier. It all starts with getting your data into an\n\narchitecture that sets your team up for success. Selecting\n\nany of the major cloud providers — [AWS](https://portal.aws.amazon.com/billing/signup) [,](https://portal.aws.amazon.com/billing/signup) [Azure](https://azure.microsoft.com/en-us/free/gaming/) [,](https://azure.microsoft.com/en-us/free/gaming/) [GCP](https://console.cloud.google.com/freetrial) —\n\nyou can land all your data into a cloud data lake, then use\n\nDatabricks Lakehouse architecture to run real-time and\n\nreliable processing. Databricks can then help you visualize\n\nthat data in a dashboard, or send to a visual analytics\n\nplatform, such as Tableau.\n\n`1.` **Sign up for a Databricks account:** You’ll need to create\n\nan account on the Databricks website in order to use the\n\nplatform.\n\n`2.` **Access the Databricks portal:** Interact with the\n\nDatabricks platform and run tasks such as creating\n\nclusters, running jobs, and accessing data.\n\n`3.` **Set up a development environment:** You’ll need a\n\ndevelopment environment where you can write and", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "107a6e85f62ef7ce272fcfa69f254f7b", + "you can land all your data into a cloud data lake, then use\n\nDatabricks Lakehouse architecture to run real-time and\n\nreliable processing. Databricks can then help you visualize\n\nthat data in a dashboard, or send to a visual analytics\n\nplatform, such as Tableau.\n\n`1.` **Sign up for a Databricks account:** You’ll need to create\n\nan account on the Databricks website in order to use the\n\nplatform.\n\n`2.` **Access the Databricks portal:** Interact with the\n\nDatabricks platform and run tasks such as creating\n\nclusters, running jobs, and accessing data.\n\n`3.` **Set up a development environment:** You’ll need a\n\ndevelopment environment where you can write and\n\ntest your code, whether you’re using a local IDE or the\n\nDatabricks Workspace.\n\n`4.` **Collect data:** Once you have your development environment\n\nset up, you can start collecting data from your game. This\n\ncan involve integrating or building a SDK into your game\n\ncode, or using another tool to send data to cloud storage.\n\n`5.` **Process and analyze the data:** Once you have collected\n\nyour data, you can use Databricks to process and analyze\n\nit. 
This can involve cleaning and transforming the data,\n\nrunning queries or machine learning algorithms, or\n\ncreating visualizations.\n\n`6.` **Monitor and optimize:** Regularly monitor your analytics\n\nto ensure that they are accurate and relevant, and use the\n\ninsights you gain to optimize your game.\n\nKeep in mind that these are just general steps to get started\n\nwith Databricks for game analytics. The specific steps you’ll\n\nneed to take will depend on your specific use case and needs.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n\n[out](https://databricks.com/company/contact) to us.\n\n\n**Tips / Best Practices**\n\n- **Define your goals:** What do you want to learn from your\n\nanalytics data? Having clear goals will help you focus on\n\ncollecting the right data and making meaningful use of it.\n\n- **Plan your data collection:** Determine what data you need\n\nto collect, how you will collect it, and how you will store it.\n\n- **Consider privacy:** Make sure you are transparent with your\n\nplayers about what data you are collecting and how you\n\nwill use it, and give them the option to opt out if they wish.\n\n- **Use analytics to inform design:** Leverage your analytics data\n\nto inform decisions around game design, such as any balance\n\nchanges or new content targeting a specific audience.\n\n- **Monitor and test your analytics implementation:** Regularly\n\ncheck your analytics to ensure that data is being collected\n\ncorrectly, and conduct tests to validate the accuracy of\n\nyour data.\n\n- **Visualize your data:** Dashboarding your data is one of the\n\nmost effective ways to quickly and effectively make sense\n\nof what’s happening at a given moment in time.\n\n- **Use data to improve player retention:** Analyze player\n\nbehavior and use the insights you gain to improve player\n\nretention, such as by identifying and addressing pain\n\npoints or by providing personalized content.\n\n- **Collaborate with your team:** Share your analytics\n\nfindings with your team and encourage them to use the\n\ndata to inform their work.\n\n- **Keep it simple:** Don’t try to collect too much data or\n\ncreate overly complex analytics systems. Keep it simple\n\nand focused on your goals.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "c48db58d6a08214c8ed3d4cd460c6180", + "- **Monitor and test your analytics implementation:** Regularly\n\ncheck your analytics to ensure that data is being collected\n\ncorrectly, and conduct tests to validate the accuracy of\n\nyour data.\n\n- **Visualize your data:** Dashboarding your data is one of the\n\nmost effective ways to quickly and effectively make sense\n\nof what’s happening at a given moment in time.\n\n- **Use data to improve player retention:** Analyze player\n\nbehavior and use the insights you gain to improve player\n\nretention, such as by identifying and addressing pain\n\npoints or by providing personalized content.\n\n- **Collaborate with your team:** Share your analytics\n\nfindings with your team and encourage them to use the\n\ndata to inform their work.\n\n- **Keep it simple:** Don’t try to collect too much data or\n\ncreate overly complex analytics systems. 
Keep it simple\n\nand focused on your goals.\n\n- **Start where you are:** If you’ve yet to gather all of your\n\ndata, don’t go build some fancy model. Start with the data\n\nyou have available to you and build from there.\n\n### Getting started with Player Segmentation\n\nPlayer segmentation is crucial to studios as it allows them\n\nto better understand their audience and tailor their game\n\nexperience to meet their specific needs and preferences.\n\nBy dividing players into different segments based on factors\n\nsuch as demographics, playing styles, and in-game behavior,\n\n\n-----\n\nstudios can gain valuable insights into what motivates and\n\nengages their players. This information can then be used\n\nto design games that not only provide a more enjoyable\n\nexperience for players, but also drive player retention\n\nand increase revenue for the studio. In a competitive\n\nindustry where player satisfaction is key to success, player\n\nsegmentation is an essential tool for studios to stay ahead of\n\nthe game.\n\nStart by evaluating the segmentation goals such as:\n\n- **Personalize the experience:** Changing or creating\n\nexperience specific designs to the player.\n\n- **Create relevant content:** Surface the best content to\n\nplayers based on features and behaviors that will matter\n\nthe most depending on the player’s place in the games\n\nlife cycle.\n\n- **Monetization:** Create tailored monetization strategies\n\nthat effectively reach and convert each player group. For\n\nexample, you may have a group of highly engaged players\n\nwho are more likely to make in-app purchases, while\n\nanother group is less likely to spend money but may be\n\nmore receptive to advertisements.\n\nThe next steps would be to identify, collect and analyze\n\nplayer data. By gathering information on player behavior,\n\npreferences, and demographics, you can gain insights\n\ninto their motivations, pain points, and what drives their\n\nengagement with your game.\n\nThere are multiple types of player data to collect, including:\n\n- **Player Behavior:** Track player behavior and actions\n\nwithin your game to gain insights into their play style,\n\npreferences, and patterns.\n\n- **Surveys:** Ask players directly about their preferences,\n\nmotivations, and feedback through in-game surveys, email\n\nquestionnaires, or other forms of direct communication.\n\n- **Focus groups:** Gather a small group of players to discuss\n\nand provide feedback on specific aspects of your game\n\nand player experience.\n\n- **Social media listening:** Monitor social media platforms\n\nto gather insights into how players are engaging with and\n\ntalking about your game.\n\n**[Customer Segmentation solution accelerator](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\n\n**Tips / Best Practices**\n\nDefine your segmentation goals: Determine what you want\n\nto learn about your players and why. 
This will help you focus", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "5c29708c331107e5cc6e18f0a765e54e", + "- **Player Behavior:** Track player behavior and actions\n\nwithin your game to gain insights into their play style,\n\npreferences, and patterns.\n\n- **Surveys:** Ask players directly about their preferences,\n\nmotivations, and feedback through in-game surveys, email\n\nquestionnaires, or other forms of direct communication.\n\n- **Focus groups:** Gather a small group of players to discuss\n\nand provide feedback on specific aspects of your game\n\nand player experience.\n\n- **Social media listening:** Monitor social media platforms\n\nto gather insights into how players are engaging with and\n\ntalking about your game.\n\n**[Customer Segmentation solution accelerator](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\n\n**Tips / Best Practices**\n\nDefine your segmentation goals: Determine what you want\n\nto learn about your players and why. This will help you focus\n\nyour analysis and ensure that your segments are meaningful\n\nand actionable.\n\n- **Use meaningful criteria:** Choose criteria that are relevant\n\nto your goals and that differentiate players in meaningful\n\nways. This could include demographic information, in-game\n\nbehavior, spending habits, or a combination of factors.\n\n- **Analyze player data:** Use data from your players to inform\n\nyour segmentation strategy. This could include data\n\non in-game behavior, spending habits, or demographic\n\ninformation.\n\n- **Use multiple methods:** We recommend using a\n\ncombination of methods, such as clustering to create\n\nsegments that are statistically meaningful and actionable\n\nto your game.\n\n- **Validate your segments:** Test your segments to ensure\n\nthat they accurately reflect the differences you observed\n\nin your player data. This could involve comparing the\n\nsegments to each other, or validating the segments\n\nagainst external data sources.\n\n- **Consider ethical and privacy concerns:** Ensure that\n\nyour segmentation strategy is ethical and complies\n\nwith privacy laws and regulations. This could involve\n\nanonymizing your player data, obtaining consent from\n\nplayers, or other measures to protect player privacy.\n\n- **Monitor and refine your segments:** Regularly review\n\nyour segments to ensure that they remain relevant and\n\nmeaningful. Refine your segments as necessary to reflect\n\nchanges in your player data or your goals.\n\n### Getting Started with Player Lifetime Value\n\nAssuming you’ve followed the steps to collecting, storing, and\n\npreparing your player data for analysis; To calculate player\n\nlifetime value (LTV), the quick and dirty way of assessing\n\noverall player LTV is to divide the total revenue by the total\n\nnumber of registered players. Note, LTV is a critical calculation\n\nfor return on investment, which is player lifetime spend versus\n\nthe amount spent on player acquisition. Ideally, you want\n\nlifetime spend to be equal to or more than cost of acquisition.\n\n\n-----\n\nAs long as your game and its community are currently active,\n\nany player lifetime value calculations should be considered\n\nmodels, not exact numbers. 
This is because many of the players\n\nyou’re considering are likely actively registered and actively\n\nplaying, so the exact player LTV number is a moving target.\n\nAdvanced\npredictive\nmodels\n\nSimple\npredictive\nmodels\n\n\nHistorical\naverage and\nbenchmarks\n\n\nBut these models are not entirely accurate since it doesn’t\n\ntake into account the players who are registered but have\n\nyet to generate any revenue. Instead, a data-driven approach\n\npivoted around player segmentation or cohorts will generally\n\nyield more actionable insight, far more than calculating a\n\nsingle LTV for the entire player base.\n\nYou can define your game’s cohorts in multiple ways. Perhaps\n\nthe most obvious in terms of calculating LTV is going by daily", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "bf4bfcfdf8244e27d15c44645d42b670", + "lifetime spend to be equal to or more than cost of acquisition.\n\n\n-----\n\nAs long as your game and its community are currently active,\n\nany player lifetime value calculations should be considered\n\nmodels, not exact numbers. This is because many of the players\n\nyou’re considering are likely actively registered and actively\n\nplaying, so the exact player LTV number is a moving target.\n\nAdvanced\npredictive\nmodels\n\nSimple\npredictive\nmodels\n\n\nHistorical\naverage and\nbenchmarks\n\n\nBut these models are not entirely accurate since it doesn’t\n\ntake into account the players who are registered but have\n\nyet to generate any revenue. Instead, a data-driven approach\n\npivoted around player segmentation or cohorts will generally\n\nyield more actionable insight, far more than calculating a\n\nsingle LTV for the entire player base.\n\nYou can define your game’s cohorts in multiple ways. Perhaps\n\nthe most obvious in terms of calculating LTV is going by daily\n\nactive cohorts, or users who joined your game on the same\n\nday. You could also organize cohorts by users who joined\n\nyour game through a certain ad campaign or promotional\n\neffort, by country or geographic location, or by the type of\n\ndevice used.\n\n**[Lifetime Value solution accelerator](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n\n\n**ACCURACY**\n\n**Tips / Best Practices**\n\n\n**Use multiple data sources:** To get a complete picture of\n\na player’s value, be sure to consider data from a variety\n\nof sources, including in-game purchases, ad revenue, and\n\nother monetization strategies.\n\n**Consider player retention:** Player retention is a key factor\n\nin LTV, so be sure to consider how long players are likely to\n\nplay your game when calculating LTV.\n\n**Use accurate data:** Make sure you are using accurate\n\ndata when calculating LTV. This might involve cleaning and\n\nprocessing your data, or using trusted sources such as in-\n\ngame analytics tools.\n\n**Regularly review and update your LTV estimates:** Player\n\nLTV can change over time, so be sure to regularly review\n\nand update your estimates to ensure they are accurate.\n\n**Test and optimize:** Use experimentation methods such\n\nas A/B testing to see how different variables, such as\n\nin-game events or pricing strategies, affect LTV. 
Use the\n\ninsights you gain to optimize your LTV calculations.\n\n**Be aware of outside factors:** Your calculations should\n\nconsider the many outside factors that can affect your\n\nLTV, such as the virality of your game, any spikes or surge\n\nin visitors due to unexpected promotions (influencers,\n\nreviewers talking about your game), any significant changes\n\nto your game that users respond well to, and other organic\n\nlifts that are difficult to predict with existing data.\n\n\nThe first calculation is relatively simple. We suggest using\n\naverage revenue per user (ARPU), which is a game’s daily\n\nrevenue divided by the number of active users, to help you\n\ncalculate lifetime value. First, you’ll need to define what is\n\nan active player using retention values; which can be set to\n\na week, multi-day, or multi-week period of time depending\n\non how your game has performed to date. You can then look\n\nat the number of users who churn on a given day, averaging\n\nwith the number of days from the player’s first visit to the\n\ncurrent date (or the specific date you’ve considered the end\n\nfor said exercise). This is your playerbase lifetime value (note\n\nnot Player Lifetime Value). To get Lifetime Value, divide daily\n\nrevenue by the number of daily active users, and multiply\n\nthat by the Lifetime Value to get your player LTV.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "d3bae8db08e6aad1fcd5f5954ef6c4f9", + "lifts that are difficult to predict with existing data.\n\n\nThe first calculation is relatively simple. We suggest using\n\naverage revenue per user (ARPU), which is a game’s daily\n\nrevenue divided by the number of active users, to help you\n\ncalculate lifetime value. First, you’ll need to define what is\n\nan active player using retention values; which can be set to\n\na week, multi-day, or multi-week period of time depending\n\non how your game has performed to date. You can then look\n\nat the number of users who churn on a given day, averaging\n\nwith the number of days from the player’s first visit to the\n\ncurrent date (or the specific date you’ve considered the end\n\nfor said exercise). This is your playerbase lifetime value (note\n\nnot Player Lifetime Value). To get Lifetime Value, divide daily\n\nrevenue by the number of daily active users, and multiply\n\nthat by the Lifetime Value to get your player LTV.\n\nIt’s important to note that while calculating player lifetime\n\nvalue, the term is not entirely accurate since most player\n\nlifetimes are not over (particularly true for live service\n\ngames). But for the purpose of modeling, we recommend\n\nkeeping the amount of time that you consider a lifetime\n\nrelatively short, allowing you to extrapolate. Keeping the time\n\nperiod shorter helps mitigate inaccuracies, specifically, the\n\nlonger you stretch out what you consider a lifetime the more\n\nlikely you are to collect inactive users in your count.\n\n\n-----\n\n### Getting Started with Social Media Monitoring\n\nSocial media monitoring has three primary components:\n\ncollecting the data, processing the results, and taking action\n\non the findings. 
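Before moving into social media monitoring, here is a minimal sketch of the ARPU-based estimate described above: daily revenue divided by daily active users, multiplied by an average player lifetime kept deliberately short. It assumes a hypothetical daily activity table with `activity_date`, `player_id`, and `revenue` columns.

```python
from pyspark.sql import functions as F

# Hypothetical daily activity fact table: one row per player per active day.
daily = spark.table("game_analytics.daily_activity")  # assumed columns: activity_date, player_id, revenue

# Daily ARPU = daily revenue / daily active users.
arpu = (
    daily.groupBy("activity_date")
    .agg((F.sum("revenue") / F.countDistinct("player_id")).alias("arpu"))
)

# Average player lifetime in days (first visit -> last visit), the "playerbase lifetime" above.
lifetime_days = (
    daily.groupBy("player_id")
    .agg(F.datediff(F.max("activity_date"), F.min("activity_date")).alias("days_active"))
    .agg(F.avg("days_active").alias("avg_lifetime_days"))
)

# Player LTV estimate = average ARPU x average lifetime in days.
ltv = (
    arpu.agg(F.avg("arpu").alias("avg_arpu"))
    .crossJoin(lifetime_days)
    .withColumn("player_ltv", F.col("avg_arpu") * F.col("avg_lifetime_days"))
)
ltv.show()
```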
When it comes to collecting the data, whether\n\nyou’re looking for tweets, YouTube comments, or Reddit\n\nposts, it can be very easy to get started since many social\n\nmedia platforms such as Twitter, YouTube, and Reddit all\n\nprovide their own detailed and comprehensive APIs making it\n\neasy to start gathering data from those platforms with proper\n\ndocumentation and code examples to help along the way.\n\nOnce the data has been collected, the next step is to process\n\nit and prepare it to be used in the next step. Processing your\n\ndata can range in complexity from a simple keywords filter\n\nor more complicated approach such as filtering by location,\n\nremoving emojis, and censoring and substituting words. With\n\nthe data collected and processed, it can move to the final\n\nstage and be analyzed for downstream use and actionable\n\ninsights by applying sentiment analysis or text mining.\n\nIf a game studio is looking to save time and have the above\n\nsteps performed for them, it may be appealing to buy a\n\npre-built tool. The primary benefits of buying an off the shelf\n\nsolution is that it is often faster and easier to get started\n\nwith, and the development of the tool is handled by a third\n\nparty who will have experience in building media monitoring\n\n\nsolutions. On the other hand, building your own custom\n\nsolution will provide more flexibility and control. Many pre-\n\nbuilt media monitoring tools might not have the capabilities\n\nrequired to effectively process video, audio, and image\n\ndata, and may not be able to control the frequency in which\n\ndata is processed, whether it be near real-time or batch.\n\nAdditionally, pre-built solutions tend to take a generalist\n\napproach for NLP, whether it be keyword extraction, topic\n\nfiltering, or sentiment analysis, which often leads to poor\n\nresults and feedback, especially for an industry as unique as\n\nthe gaming industry where certain industry-specific slang\n\nor terminology is frequently used. Overall, building your\n\nown media monitoring tool will provide greater control and\n\nflexibility leading to a better tailored return on investment,\n\nand luckily Databricks makes it even easier to get started.\n\nWith the Databricks Lakehouse platform, all data engineering,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "0e5110779c7dcf40e6eb72b0f3e5c5e4", + "party who will have experience in building media monitoring\n\n\nsolutions. On the other hand, building your own custom\n\nsolution will provide more flexibility and control. Many pre-\n\nbuilt media monitoring tools might not have the capabilities\n\nrequired to effectively process video, audio, and image\n\ndata, and may not be able to control the frequency in which\n\ndata is processed, whether it be near real-time or batch.\n\nAdditionally, pre-built solutions tend to take a generalist\n\napproach for NLP, whether it be keyword extraction, topic\n\nfiltering, or sentiment analysis, which often leads to poor\n\nresults and feedback, especially for an industry as unique as\n\nthe gaming industry where certain industry-specific slang\n\nor terminology is frequently used. 
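As a small illustration of the processing step just described (keyword filtering plus basic cleanup such as removing emojis), the sketch below cleans a hypothetical table of collected posts before sentiment analysis or text mining. The table, columns, and keyword list are placeholders.

```python
import re
from pyspark.sql import functions as F, types as T

# Hypothetical table of posts collected from platform APIs (Twitter, YouTube, Reddit, ...).
posts = spark.table("social.raw_posts")  # assumed columns: platform, author, text, created_at

GAME_KEYWORDS = ["mygame", "my game"]  # placeholder keywords for your title

def strip_non_ascii(text: str) -> str:
    """Drop emojis and other non-ASCII characters as a simple cleanup step."""
    return re.sub(r"[^\x00-\x7F]+", " ", text or "")

strip_non_ascii_udf = F.udf(strip_non_ascii, T.StringType())

keyword_filter = F.lower(F.col("text")).rlike("|".join(GAME_KEYWORDS))

cleaned = (
    posts
    .filter(keyword_filter)                               # keep only posts mentioning the game
    .withColumn("clean_text", strip_non_ascii_udf("text"))
)

# Persist for downstream sentiment analysis / text mining.
cleaned.write.mode("overwrite").saveAsTable("social.cleaned_posts")
```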
Overall, building your\n\nown media monitoring tool will provide greater control and\n\nflexibility leading to a better tailored return on investment,\n\nand luckily Databricks makes it even easier to get started.\n\nWith the Databricks Lakehouse platform, all data engineering,\n\ndata science, machine learning, and data analytics can\n\nbe done in a single place without having to stitch multiple\n\nsystems and tools together.\n\nData engineers can use Workflows and Jobs to call social\n\nmedia platform APIs on a scheduled basis and use Delta Live\n\nTables to create declarative data pipelines for cleaning and\n\nprocessing the data that comes in. Data scientists can use\n\ntools such as ML-specific Databricks runtimes (DBRs) that\n\ncome with many of the most popular and common libraries\n\nalready installed, MLflow which makes model development,\n\n\n-----\n\ntracking, and serving easy and efficient, and various other\n\ntools such as AutoML and Bamboolib. Data analysts are able\n\nto create real-time alerts, dashboards, and visualizations\n\nusing Databricks SQL. Each of the three personas will be able\n\nto effectively collaborate with each other and integrate each\n\npiece of their work into the broader data architecture.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n\n[out](https://databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\nWhile social media monitoring can be easy to get started\n\nwith, there are a few key points to keep in mind.\n\n- Remember the Pareto principle (roughly 80% of impact\n\ncomes from 20% of activity) and diminishing returns. While\n\nit’s important to monitor large platforms such as Reddit,\n\nTwitter, and YouTube, it might not be worthwhile to monitor\n\nsmaller platforms (in terms of engagement) as the bulk of\n\ncustomer feedback will be on those major platforms.\n\n- Monitor other sources of information. It is also useful to\n\nmonitor mentions of key company personnel such as\n\nexecutives or public facing employees.\n\n- While follower count does matter on platforms such as\n\nTwitter, don’t ignore users with low-follower counts. It only\n\ntakes one or two re-tweets from other users to become a\n\nlarge issue.\n\n- On social media, customers can see through generic\n\ncorporate responses to complaints, so it is important\n\nto get a clear understanding of the issue and provide a\n\nclear response.\n\n### Getting Started with Player Feedback Analysis\n\nThe easiest place to start is gathering your data. With\n\naccounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n\nNintendo (or whatever platform you’re using), identify the ID\n\nfor your game(s), and pull the reviews corresponding to that\n\ngame into Databricks through an API call.\n\n\nFrom here, you clean the data using some of the pre-\n\nprocessing available in Python that removes any emojis and\n\nASCII characters. Once complete, run through Spark NLP", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "8a4c09960b51191088f561ef1575b018", + "- While follower count does matter on platforms such as\n\nTwitter, don’t ignore users with low-follower counts. 
It only\n\ntakes one or two re-tweets from other users to become a\n\nlarge issue.\n\n- On social media, customers can see through generic\n\ncorporate responses to complaints, so it is important\n\nto get a clear understanding of the issue and provide a\n\nclear response.\n\n### Getting Started with Player Feedback Analysis\n\nThe easiest place to start is gathering your data. With\n\naccounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n\nNintendo (or whatever platform you’re using), identify the ID\n\nfor your game(s), and pull the reviews corresponding to that\n\ngame into Databricks through an API call.\n\n\nFrom here, you clean the data using some of the pre-\n\nprocessing available in Python that removes any emojis and\n\nASCII characters. Once complete, run through Spark NLP\n\npipeline which does the basic natural language processing\n\nsteps such as normalization, stemming, lemmatization. We\n\nrecommend running through pre-trained models, such as Word\n\nEmbeddings and Named Entity Recognition models from John\n\nSnow Labs. This should complete the pipeline and generates\n\nthe aspects for the reviews provided by the community.\n\nThis data is then loaded into a Delta table for further analysis,\n\nsuch as using a visual dashboard (built on SQL queries inside\n\nDatabricks) to analyze and understand the aspects the\n\ncommunity is talking about, which can then be shared back\n\nwith the development team for analysis and action. This is a\n\ngreat exercise to run once per month.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Check for word groupings:** Make sure your word groupings\n\nare accurate to improve the analysis. For example, if your\n\ngame is called Football Manager, and the shorthand is FM,\n\nmake sure both of those are grouped appropriately.\n\n- **Leverage domain knowledge:** Clean the reviews based\n\non your domain knowledge. There are generic steps one\n\ncould take, but that will not be as effective as someone\n\nwith domain, and specific game knowledge of your title.\n\n- **Experiment with models:** Feel free to try multiple pre-\n\ntrained models, and or tweak the pre-trained models\n\nbased on your understanding of the domain to improve\n\nthe accuracy of your results.\n\n- **Work one title at a time:** This process works best when\n\npulling reviews for a single title, specifically one version of\n\none title at a time.\n\n- **Let the model to the heavy lift, but use humans to double-**\n\n**check:** The sentiment corresponding to the aspects in the\n\nmodel will be labeled as Positive or Negative. In the case\n\nof a neutral review, the model will do its best to determine\n\nwhether that is more positive or negative. A best practice\n\nis to spend time going back through the aspects early to\n\ndetermine model accuracy and make updates accordingly.\n\n\n-----\n\n### Getting Started with Toxicity Detection\n\nOur recommendation on tackling the toxicity issue is\n\nto leverage cloud-agnostic and flexible tooling that can\n\nconsume chat data from a variety of sources, such as chat\n\nlogs, voice transcriptions, or sources like discord and reddit\n\nforums. 
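Before turning to toxicity detection, here is a minimal sketch of the review-analysis flow described above, using the open-source spark-nlp library's pretrained `explain_document_dl` pipeline as a stand-in for the John Snow Labs models mentioned. It assumes spark-nlp is installed on the cluster and that reviews have already been pulled into a table with a `text` column; all names are placeholders.

```python
from sparknlp.pretrained import PretrainedPipeline

# Hypothetical Delta table of store reviews pulled via the platform APIs.
reviews = spark.table("feedback.raw_reviews")  # assumed columns: review_id, text, rating, platform

# Pretrained pipeline covering tokenization, normalization, lemmatization, embeddings, and NER.
pipeline = PretrainedPipeline("explain_document_dl", lang="en")

annotated = pipeline.transform(reviews)  # expects a column named "text"

# Land the annotations in Delta for dashboarding and review by the dev team.
# (Exact output columns depend on the pipeline version you load.)
annotated.write.format("delta").mode("overwrite").saveAsTable("feedback.review_aspects")
```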
No matter if the data is in log form from game\n\nservers or events from a message system, Databricks can\n\nprovide quick and easy ways to ingest the data.\n\nLeveraging a simplified architecture like the diagram", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "66941af18e24e0318ab56ffb753bbeab", + "**check:** The sentiment corresponding to the aspects in the\n\nmodel will be labeled as Positive or Negative. In the case\n\nof a neutral review, the model will do its best to determine\n\nwhether that is more positive or negative. A best practice\n\nis to spend time going back through the aspects early to\n\ndetermine model accuracy and make updates accordingly.\n\n\n-----\n\n### Getting Started with Toxicity Detection\n\nOur recommendation on tackling the toxicity issue is\n\nto leverage cloud-agnostic and flexible tooling that can\n\nconsume chat data from a variety of sources, such as chat\n\nlogs, voice transcriptions, or sources like discord and reddit\n\nforums. No matter if the data is in log form from game\n\nservers or events from a message system, Databricks can\n\nprovide quick and easy ways to ingest the data.\n\nLeveraging a simplified architecture like the diagram\n\nabove shows no matter the source, getting chat data for\n\ninferencing and model development can be as simple. While\n\nwe leveraged a pre-built model from John Snow Labs to\n\naccelerate development, you can bring the ML framework of\n\nyour choice to the platform.\n\n**[Gaming Toxicity solution accelerator](https://notebooks.databricks.com/notebooks/CME/Toxicity_Detection_in_Gaming/index.html)**\n\n\n**Tips / Best Practices - things to consider**\n\n- **Define what toxic and disruptive behavior looks**\n\n**like within your community:** Clearly define what you\n\nconsider to be toxic behavior, as this will determine how\n\nyou measure and detect it. This might include things like\n\nhateful language, harassment, or cheating.\n\n- **Collect relevant data:** Make sure you are collecting the\n\nright data to help you detect toxicity. This might include\n\ndata on in-game chat, player reports, and other sources.\n\n- **Use machine learning:** Use machine learning algorithms\n\nto analyze your data and identify patterns of toxic\n\nbehavior. This will allow you to more accurately detect\n\ntoxicity and prioritize cases for review.\n\n- **Test and optimize:** Regularly review and test your toxicity\n\ndetection systems to ensure they are accurate and\n\neffective. Use experimentation methods such as A/B\n\ntesting to see how different strategies impact toxicity rates.\n\n- **Be transparent:** Make sure you are transparent with your\n\nplayers about how you are detecting toxicity, and give\n\nthem the option to opt out if they wish.\n\n- **Take action:** When toxic behavior is detected, take\n\nappropriate action to address it. The health and wellness\n\nof your community depends on it. This might involve\n\nbanning players, issuing warnings, or taking other\n\ndisciplinary measures.\n\n\n-----\n\n### Getting Started with Multi-Touch Attribution and Media Mix Modeling\n\nTo get started with multi-touch attribution, you need to first\n\nselect an attribution model. 
There are a variety of different\n\nattribution models to choose from, each with its own\n\n\nattribution credit according to your chosen model (above).\n\nWe highly recommend you regularly review and test your\n\nattribution efforts to ensure they are accurate and effective.\n\nUse experimentation methods such as A/B testing to see\n\nhow different strategies impact conversion rates.\n\n**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**\n\n\nstrengths and limitations.\n\n\n`1.` **Last-click model:** This model attributes all credit to the\n\nlast touchpoint that the customer interacted with before\n\nmaking a purchase or taking a desired action.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "5cc40222e7110f4588869738b6e75f20", + "To get started with multi-touch attribution, you need to first\n\nselect an attribution model. There are a variety of different\n\nattribution models to choose from, each with its own\n\n\nattribution credit according to your chosen model (above).\n\nWe highly recommend you regularly review and test your\n\nattribution efforts to ensure they are accurate and effective.\n\nUse experimentation methods such as A/B testing to see\n\nhow different strategies impact conversion rates.\n\n**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**\n\n\nstrengths and limitations.\n\n\n`1.` **Last-click model:** This model attributes all credit to the\n\nlast touchpoint that the customer interacted with before\n\nmaking a purchase or taking a desired action.\n\n`2.` **First-click model:** This model attributes all credit to the\n\nfirst touchpoint that the customer interacted with.\n\n`3.` **Linear model:** This model attributes equal credit to each\n\ntouchpoint that the customer interacted with.\n\n`4.` **Time decay model:** This model attributes more credit to\n\ntouchpoints that are closer in time to the purchase\n\nor desired action.\n\n`5.` **Position-based model:** This model attributes a portion of\n\nthe credit to the first and last touchpoints, and the remainder\n\nis distributed evenly among the other touchpoints.\n\n`6.` **Custom model:** Some businesses create their own\n\nattribution model based on specific business needs or goals.\n\nEach attribution model has its own strengths and limitations,\n\nand the right model for a particular video game will depend\n\non a variety of factors, including the goals of your title, the\n\ncustomer journey, and the types of marketing channels being\n\nused. It is important to carefully consider the pros and cons\n\nof each model and choose the one that best aligns with the\n\nneeds of your game.\n\nNext, you’re going to want to set up tracking. In order to\n\nattribute credit to different touchpoints, you’ll need to set up\n\ntracking to capture data on customer interactions. 
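To make the model choices above concrete, the sketch below computes last-click, first-click, and linear attribution over a hypothetical touchpoint table; time-decay and position-based variants follow the same pattern with different credit weights. Table and column names are illustrative only.

```python
from pyspark.sql import functions as F, Window

# Hypothetical touchpoint log, one row per ad interaction, for converting players.
touches = spark.table("marketing.touchpoints")  # assumed: user_id, channel, touch_time, converted (bool)

w = Window.partitionBy("user_id").orderBy("touch_time")

ranked = (
    touches.filter("converted")
    .withColumn("first_touch", F.row_number().over(w) == 1)
    .withColumn("last_touch", F.row_number().over(w.orderBy(F.desc("touch_time"))) == 1)
    .withColumn("n_touches", F.count("*").over(Window.partitionBy("user_id")))
)

# Credit per attribution model.
last_click = ranked.filter("last_touch").groupBy("channel").agg(F.count("*").alias("last_click_credit"))
first_click = ranked.filter("first_touch").groupBy("channel").agg(F.count("*").alias("first_click_credit"))
linear = (
    ranked.withColumn("credit", F.lit(1.0) / F.col("n_touches"))
    .groupBy("channel").agg(F.sum("credit").alias("linear_credit"))
)

report = last_click.join(first_click, "channel", "outer").join(linear, "channel", "outer")
report.show()
```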
This might\n\ninvolve integrating tracking code into the game, or using a\n\nthird-party tracking tool.\n\nWith tracking set up, you’ll start collecting data on player\n\ninteractions and be able to use that information to calculate\n\n\n**Tips / Best Practices - things to consider**\n\n- **Define clear goals:** Sounds simple, but by clearly defining\n\nthe goals of your acquisition campaign and what success\n\nlooks like, you will be able to guide your decision-making\n\nand ensure that you are measuring the right metrics -\n\nsuch as cost per install, return on ad spend, conversion\n\nrate, lifetime value, retention rate, and more.\n\n- **Use a data-driven approach:** Use data to inform your\n\ndecision-making. Collect data on all touchpoints in the\n\nplayer journey, including ad impressions, clicks, installs,\n\nand in-game actions.\n\n- **Choose the right attribution model:** Select the right\n\nattribution model that accurately reflects the player\n\njourney for your specific genre of game. This can be a\n\ncomplex process. A couple of things to keep in mind\n\n- Consider the touchpoints that are most important for\n\nyour player journey, such as first ad impression, first\n\nclick, or first in-game action\n\n- Consider the business goals you’re trying to achieve.\n\nFor example, if you are focused on maximizing return\n\non investment, a last-click attribution model may be\n\nmost appropriate. On the other hand, if you are looking\n\nto understand the impact of each touchpoint, a multi-", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "c1ff59b253186b6ec6769a2984f8461a", + "- **Use a data-driven approach:** Use data to inform your\n\ndecision-making. Collect data on all touchpoints in the\n\nplayer journey, including ad impressions, clicks, installs,\n\nand in-game actions.\n\n- **Choose the right attribution model:** Select the right\n\nattribution model that accurately reflects the player\n\njourney for your specific genre of game. This can be a\n\ncomplex process. A couple of things to keep in mind\n\n- Consider the touchpoints that are most important for\n\nyour player journey, such as first ad impression, first\n\nclick, or first in-game action\n\n- Consider the business goals you’re trying to achieve.\n\nFor example, if you are focused on maximizing return\n\non investment, a last-click attribution model may be\n\nmost appropriate. On the other hand, if you are looking\n\nto understand the impact of each touchpoint, a multi-\n\ntouch attribution model may be more appropriate.\n\n- Consider the data you have available, including ad\n\nimpressions, clicks, installs, and in-game actions.\n\n- **Continuously monitor and optimize:** Continuously\n\nmonitor and optimize your acquisition campaigns based on\n\nthe data. Test different approaches, make adjustments as\n\nneeded, and use A/B testing to determine what works best.\n\n\n-----\n\n### Getting Started with Player Recommendations\n\nRecommendations is an advanced use case. 
We don’t\n\nrecommend (hehe) that you start here, instead, we’re\n\nassuming that you’ve done the work to set up your game\n\nanalytics (collecting, cleaning, and preparing data for analysis)\n\nand that you’ve done basic segmentation to place your\n\nplayers in cohorts based on their interests and behaviors.\n\nRecommendations can come in many forms for video games.\n\nFor this context, we’re going to focus on the wide-and-deep\n\nlearning for recommender systems, which has the ability\n\nto both memorize and generalize recommendations based\n\non player behavior and interactions. First [introduced by](https://arxiv.org/abs/1606.07792)\n\n[Google](https://arxiv.org/abs/1606.07792) for use in its Google Play app store, the wide-and-\n\ndeep machine learning (ML) model has become popular in a\n\nvariety of online scenarios for its ability to personalize user\n\nengagements, even in ‘cold start problem’ scenarios with\n\nsparse data inputs.\n\nThe goal with wide-and-deep recommenders is to provide\n\n\n**Understanding the model design**\n\nTo understand the concept of wide-and-deep recommend­\n\nations, it’s best to think of it as two separate, but collaborating,\n\nengines. The wide model, often referred to in the literature as\n\nthe linear model, memorizes users and their past choices. Its\n\ninputs may consist simply of a user identifier and a product\n\nidentifier, though other attributes relevant to the pattern (such\n\nas time of day) may also be incorporated.\n\nThe deep portion of the model, so named as it is a deep\n\nneural network, examines the generalizable attributes of a\n\nuser and their choices. From these, the model learns the\n\nbroader characteristics that tend to favor user selections.\n\nTogether, the wide-and-deep submodels are trained\n\non historical product selections by individual users to\n\npredict future selections. The end result is a single model\n\ncapable of calculating the probability with which a user will\n\npurchase a given item, given both memorized past choices\n\nand generalizations about a user’s preferences. These\n\nprobabilities form the basis for user-specific rankings, which\n\ncan be used for making recommendations.\n\n\nan intimate level of player understanding. This model uses\n\n\nexplicit and implicit feedback to expand the considerations", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "84f2e08f39518551850d737ef0a7ef5b", + "inputs may consist simply of a user identifier and a product\n\nidentifier, though other attributes relevant to the pattern (such\n\nas time of day) may also be incorporated.\n\nThe deep portion of the model, so named as it is a deep\n\nneural network, examines the generalizable attributes of a\n\nuser and their choices. From these, the model learns the\n\nbroader characteristics that tend to favor user selections.\n\nTogether, the wide-and-deep submodels are trained\n\non historical product selections by individual users to\n\npredict future selections. The end result is a single model\n\ncapable of calculating the probability with which a user will\n\npurchase a given item, given both memorized past choices\n\nand generalizations about a user’s preferences. These\n\nprobabilities form the basis for user-specific rankings, which\n\ncan be used for making recommendations.\n\n\nan intimate level of player understanding. 
This model uses\n\n\nexplicit and implicit feedback to expand the considerations\n\nset for players. Wide-and-deep recommenders go beyond\n\nsimple weighted averaging of player feedback found in some\n\ncollaborative filters to balance what is understood about\n\nthe individual with what is known about similar gamers. If\n\ndone properly, the recommendations make the gamer feel\n\nunderstood (by your title) and this should translate into\n\ngreater value for both the player and you as the business.\n\n\n**Building the model**\n\nThe intuitive logic of the wide-and-deep recommender\n\nbelies the complexity of its actual construction. Inputs\n\nmust be defined separately for each of the wide-and-\n\ndeep portions of the model and each must be trained in a\n\ncoordinated manner to arrive at a single output, but tuned\n\nusing optimizers specific to the nature of each submodel.\n\nThankfully, the [Tensorflow DNNLinearCombinedClassifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier)\n\n[estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier) provides a pre-packaged architecture, greatly\n\nsimplifying the assembly of an overall model.\n\n\n**User A**\n\n- user identity\n\n- user attributes\n\n**Product B**\n\n\n**Wide**\n**Sub-Model**\n\n\n**Probability of**\n\n**User A + Product B**\n\n**Wide & Deep**\n**Model**\n\n\n**Deep**\n**Sub-Model**\n\n\n\n- product identity\n\n- product attributes\n\n\n-----\n\n**Training**\n\nThe challenge for most teams is then training the\n\nrecommender on the large number of user-product\n\ncombinations found within their data. Using [Petastorm](https://petastorm.readthedocs.io/en/latest/) , an\n\nopen-source library for serving large datasets assembled in\n\nApache Spark™ to Tensorflow (and other ML libraries), one can\n\ncache the data on high-speed, temporary storage and then\n\nread that data in manageable increments to the model during\n\ntraining. In doing so, we limit the memory overhead associated\n\nwith the training exercise while preserving performance.\n\n**Tuning**\n\nTuning the model becomes the next challenge. Various model\n\nparameters control its ability to arrive at an optimal solution.\n\nThe most efficient way to work through the potential parameter\n\ncombinations is simply to iterate through some number of\n\ntraining cycles, comparing the models’ evaluation metrics with\n\neach run to identify the ideal parameter combinations. By\n\ntrials, we can parallelize this work across many compute nodes,\n\nallowing the optimizations to be performed in a timely manner.\n\n**Deploying**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "048fc590af3f4c45c67e3dff40b8ef9a", + "open-source library for serving large datasets assembled in\n\nApache Spark™ to Tensorflow (and other ML libraries), one can\n\ncache the data on high-speed, temporary storage and then\n\nread that data in manageable increments to the model during\n\ntraining. In doing so, we limit the memory overhead associated\n\nwith the training exercise while preserving performance.\n\n**Tuning**\n\nTuning the model becomes the next challenge. 
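Here is a minimal sketch of wiring up the wide and deep inputs with the `DNNLinearCombinedClassifier` estimator referenced above, assuming a TensorFlow release that still ships the `tf.estimator` API. Feature names, bucket sizes, and hidden-unit sizes are illustrative placeholders, and the `input_fn` that streams training batches (for example via Petastorm) is left out.

```python
import tensorflow as tf

# Wide side: memorize specific user x item co-occurrences.
user_id = tf.feature_column.categorical_column_with_hash_bucket("user_id", hash_bucket_size=100_000)
item_id = tf.feature_column.categorical_column_with_hash_bucket("item_id", hash_bucket_size=10_000)
user_x_item = tf.feature_column.crossed_column([user_id, item_id], hash_bucket_size=1_000_000)
wide_columns = [user_x_item]

# Deep side: generalize from learned embeddings of user and item attributes.
deep_columns = [
    tf.feature_column.embedding_column(user_id, dimension=32),
    tf.feature_column.embedding_column(item_id, dimension=32),
    tf.feature_column.numeric_column("sessions_last_7d"),  # placeholder player attribute
]

model = tf.estimator.DNNLinearCombinedClassifier(
    linear_feature_columns=wide_columns,
    linear_optimizer="Ftrl",          # optimizer suited to the sparse wide inputs
    dnn_feature_columns=deep_columns,
    dnn_hidden_units=[128, 64, 32],
    dnn_optimizer="Adagrad",          # optimizer for the dense deep network
)

# model.train(input_fn=make_train_input_fn())  # hypothetical input_fn feeding Petastorm batches
```

The two submodels are trained jointly but tuned with optimizers suited to their respective sparse and dense inputs, which is exactly the coordination the estimator packages up for you.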
Various model\n\nparameters control its ability to arrive at an optimal solution.\n\nThe most efficient way to work through the potential parameter\n\ncombinations is simply to iterate through some number of\n\ntraining cycles, comparing the models’ evaluation metrics with\n\neach run to identify the ideal parameter combinations. By\n\ntrials, we can parallelize this work across many compute nodes,\n\nallowing the optimizations to be performed in a timely manner.\n\n**Deploying**\n\nFinally, we need to deploy the model for integration with\n\nvarious retail applications. Leveraging [MLflow](https://www.mlflow.org/) allows us\n\nto both persist our model and package it for deployment\n\nacross a wide variety of microservices layers, including\n\nAzure Machine Learning, AWS Sagemaker, Kubernetes and\n\nDatabricks Model Serving.\n\nWhile this seems like a large number of technologies to bring\n\ntogether just to build a single model, Databricks integrates all\n\nof these technologies within a single platform, providing data\n\nscientists, data engineers & [MLOps](https://www.databricks.com/glossary/mlops) Engineers a unified exper­\n\nience. The pre-integration of these technologies means various\n\nper­sonas can work faster and leverage additional capabilities,\n\nsuch as the [automated tracking](https://docs.databricks.com/machine-learning/automl-hyperparam-tuning/index.html#automated-mlflow-tracking) of models, to enhance the\n\ntransparency of the organization’s model building efforts.\n\nTo see an end-to-end example of how a wide and deep\n\nrecommender model may be built on Databricks, please\n\ncheck out the following notebooks: [Get the notebook](https://d1r5llqwmkrl74.cloudfront.net/notebooks/RCG/Wide_and_Deep/index.html#Wide_and_Deep_1.html)\n\n**[Recommendation Engines solution accelerator](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n\n**Tips / Best Practices - things to consider**\n\n- **Use data to inform recommendations:** Use data from\n\nyour analytics, player feedback, and other sources to\n\nunderstand what players like and dislike. This will help\n\nyou create recommendations that are more likely to be\n\nrelevant and engaging for individual players.\n\n- **Segment your players:** Consider segmenting your players\n\nbased on characteristics such as playstyle, spending\n\nhabits, and demographic information. This will allow you\n\nto create more targeted recommendations for different\n\ngroups of players.\n\n- **Consider the player’s current context:** When creating\n\nrecommendations, consider the player’s current context,\n\nsuch as what they are doing in the game and what\n\ncontent they have already consumed. This will help\n\nyou create recommendations that are more likely to be\n\nrelevant and timely.\n\n- **Test and optimize your recommendations:** Use\n\nexperimentation methods such as A/B testing to see\n\nhow different recommendations perform with different\n\nplayer segments. 
Use the insights you gain to optimize\n\nyour recommendations.\n\n- **Be transparent:** Make sure you are transparent with\n\nplayers about how you are creating recommendations and\n\ngive them the option to opt out if they wish.\n\n- **Use recommendations to improve the player experience:**\n\nUse personalized recommendations to improve the player", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "bbbd83b865bab5295d7bbc440084f3a2", + "habits, and demographic information. This will allow you\n\nto create more targeted recommendations for different\n\ngroups of players.\n\n- **Consider the player’s current context:** When creating\n\nrecommendations, consider the player’s current context,\n\nsuch as what they are doing in the game and what\n\ncontent they have already consumed. This will help\n\nyou create recommendations that are more likely to be\n\nrelevant and timely.\n\n- **Test and optimize your recommendations:** Use\n\nexperimentation methods such as A/B testing to see\n\nhow different recommendations perform with different\n\nplayer segments. Use the insights you gain to optimize\n\nyour recommendations.\n\n- **Be transparent:** Make sure you are transparent with\n\nplayers about how you are creating recommendations and\n\ngive them the option to opt out if they wish.\n\n- **Use recommendations to improve the player experience:**\n\nUse personalized recommendations to improve the player\n\nexperience and increase engagement and satisfaction.\n\n### Getting Started with Next Best Offer/Action\n\nSince NBO/NBA is a specific use case of personalization, how a\n\nteam might get started implementing this will look very similar\n\nto how they would with broader personalization activities.\n\nBegin with ensuring you are appropriately collecting player\n\ndata (behavior, preferences, in-game purchases, etc), storing\n\nit in your cloud data lake using a service such as Delta Lake\n\nfrom Databricks. From here, you’ll prepare the data using\n\nDatabricks to clean, transform, and prepare for analysis.\n\nThis may include aggregating data from multiple sources,\n\nremoving duplicates and outliers, and transforming the data\n\ninto a format suitable for analysis. As you analyze the player\n\ndata, seek to identify patterns and trends in player behavior\n\n\n-----\n\nand preferences that will give you signal on which actions are\n\nmore likely to be successful.\n\nFrom here, you can build a recommendation model based\n\non the player data analysis, and incorporate information\n\non in-game items and player preferences to make\n\npersonalized recommendations.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Define your goals:** Like every use case, starting with\n\nclearly defined goals helps to ensure your implementation\n\nof NBO and NBA will be as effective and efficient as\n\npossible. Your goals will also help you determine what data\n\nto collect and how it will be used.\n\n- **Collect relevant data:** Based on your goals, make sure\n\nyou are collecting the right data to inform your NBO and\n\nNBA recommendations. 
This might include data on player\n\nbehavior, engagement, and spending habits.\n\n- **Leverage machine learning to scale your**\n\n**recommendations:** Use machine learning algorithms to\n\nanalyze your data and make personalized recommendations\n\nto your players. This will allow you to identify trends and\n\npatterns that might not be immediately apparent.\n\n- **Test and optimize:** THIS IS CRITICAL. Use experimentation\n\nmethods such as A/B testing to see how different\n\nrecommendations perform with different player segments.\n\nPast performance is not a perfect indicator of future\n\nsuccess. Consistent testing allows you to tune your NBO and\n\nNBA recommendations so they evolve with your playerbase.\n\n- **Consider the player’s context:** When making recommend­\n\nations, consider the player’s current context, such as what\n\nthey are doing in the game and what content they have\n\nalready consumed. This will help you create recommend­\n\nations that are more likely to be relevant and timely.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "003f9e95e81a28915c19991b9d5cdb90", + "behavior, engagement, and spending habits.\n\n- **Leverage machine learning to scale your**\n\n**recommendations:** Use machine learning algorithms to\n\nanalyze your data and make personalized recommendations\n\nto your players. This will allow you to identify trends and\n\npatterns that might not be immediately apparent.\n\n- **Test and optimize:** THIS IS CRITICAL. Use experimentation\n\nmethods such as A/B testing to see how different\n\nrecommendations perform with different player segments.\n\nPast performance is not a perfect indicator of future\n\nsuccess. Consistent testing allows you to tune your NBO and\n\nNBA recommendations so they evolve with your playerbase.\n\n- **Consider the player’s context:** When making recommend­\n\nations, consider the player’s current context, such as what\n\nthey are doing in the game and what content they have\n\nalready consumed. This will help you create recommend­\n\nations that are more likely to be relevant and timely.\n\n- **Be transparent:** Make sure you are transparent with\n\nyour players about how you are using their data to make\n\nrecommendations, and give them the option to opt out if\n\nthey wish.\n\n- **Collaborate with your team:** Share your NBO and NBA\n\n\n### Getting Started with Churn Prediction & Prevention\n\nThe exciting part of this analysis is that not only does it\n\nhelp to quantify the risk of customer churn but it paints a\n\nquantitative picture of exactly which factors explain that risk.\n\nIt’s important that we not draw too rash of a conclusion with\n\nregards to the causal linkage between a particular attribute\n\nand its associated hazard, but it’s an excellent starting point\n\nfor identifying where an organization needs to focus its\n\nattention for further investigation.\n\nThe hard part in this analysis is not the analytic techniques.\n\nThe Kaplan-Meier curves and Cox Proportional Hazard\n\nmodels used to perform the analysis above are well\n\nestablished and widely supported across analytics platforms.\n\nThe principal challenge is organizing the input data.\n\nThe vast majority of subscription services are fairly new as\n\nbusinesses. As such, the data required to examine customer\n\nattrition may be scattered across multiple systems,\n\nmaking an integrated analysis more difficult. 
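As a minimal sketch of the Kaplan-Meier and Cox Proportional Hazards analysis described above, the example below uses the open-source lifelines package on a hypothetical per-subscriber table with a tenure duration, a churn flag, and a couple of numeric covariates; column and table names are assumptions.

```python
from lifelines import KaplanMeierFitter, CoxPHFitter

# Hypothetical per-subscriber table pulled from Delta into pandas for modeling.
df = spark.table("churn.subscribers").toPandas()
# assumed columns: tenure_days (duration), churned (1/0), auto_renew (1/0), payment_method_code (int)

# Kaplan-Meier: overall retention (survival) curve.
kmf = KaplanMeierFitter()
kmf.fit(durations=df["tenure_days"], event_observed=df["churned"])
print(kmf.median_survival_time_)

# Cox Proportional Hazards: which attributes are associated with churn risk.
covariates = df[["tenure_days", "churned", "auto_renew", "payment_method_code"]]
cph = CoxPHFitter()
cph.fit(covariates, duration_col="tenure_days", event_col="churned")
cph.print_summary()  # hazard ratios per covariate
```

As noted above, treat the hazard ratios as a starting point for investigation rather than proof of a causal link.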
Data Lakes\n\nare a starting point for solving this problem, but complex\n\ntransformations required to cleanse and restructure data\n\nthat has evolved as the business itself has (often rapidly)\n\nevolved requires considerable processing power. This is\n\ncertainly the case with the KKBox information assets and is a\n\npoint noted by the data provider in their public challenge.\n\nThe key to successfully completing this work is the\n\nestablishment of transparent, maintainable data processing\n\npipelines executed on an elastically scalable (and therefore\n\ncost-efficient) infrastructure, a key driver behind the [Delta](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n\n[Lake pattern](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html) . While most organizations may not be overly\n\ncost-conscious in their initial approach, it’s important to\n\nremember the point made above that churn is a chronic\n\ncondition to be managed. As such, this is an analysis that\n\nshould be periodically revisited to ensure acquisition and\n\nretention practices are aligned.\n\nTo support this, we are making the code behind our\n\nanalysis available for download and review. If you have any\n\nquestions about how this solution can be deployed in your\n\nenvironment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n\nefforts with your team and encourage them to use the\n\n\ndata to inform their work.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "66c0f25b63f07da139c017833db0174b", + "[Lake pattern](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html) . While most organizations may not be overly\n\ncost-conscious in their initial approach, it’s important to\n\nremember the point made above that churn is a chronic\n\ncondition to be managed. As such, this is an analysis that\n\nshould be periodically revisited to ensure acquisition and\n\nretention practices are aligned.\n\nTo support this, we are making the code behind our\n\nanalysis available for download and review. If you have any\n\nquestions about how this solution can be deployed in your\n\nenvironment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n\nefforts with your team and encourage them to use the\n\n\ndata to inform their work.\n\n\n**[Churn Prediction solution accelerator](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n\n\n-----\n\n**Tips / Best Practices**\n\n- **Define churn:** Clearly define what you consider to be\n\nplayer churn, as this will determine how you measure\n\nand predict it. For example, you might consider churn to\n\nbe when a player stops playing your game for a certain\n\nnumber of days, or when they uninstall it.\n\n- **Collect relevant data:** Make sure you are collecting the\n\nright data to help you predict and prevent churn. This\n\nmight include data on player behavior, engagement, and\n\nspending habits.\n\n- **Use machine learning:** Use machine learning algorithms\n\nto analyze your data and predict which players are at\n\nrisk of churning. 
This will allow you to identify trends and\n\npatterns that might not be immediately apparent.\n\n- **Test and optimize:** Use experimentation methods such as\n\nA/B testing to see how different strategies impact churn\n\nrates. Use the insights you gain to optimize your churn\n\nprevention efforts.\n\n- **Focus on retention:** Implement retention strategies that are\n\ntailored to the needs and preferences of your players. This\n\nmight involve providing personalized content, addressing\n\npain points, or offering incentives to continue playing.\n\n- **Be transparent:** Make sure you are transparent with your\n\nplayers about how you are using their data to predict and\n\nprevent churn, and give them the option to opt out if\n\nthey wish.\n\n- **Collaborate with your team:** Share your churn prediction\n\nand prevention efforts with your team and encourage\n\nthem to use the data to inform their work.\n\n### Getting Started with Read-time Ad Targeting\n\nTypically, implementing a real-time ad targeting strategy begins\n\noutside of your game (in services such as Google Ads, Unity\n\nAdvertising), where your game becomes the delivery point\n\nfor the advertisement. Here, you will need to integrate with\n\nAd networks that provide real-time ad targeting capabilities.\n\nThat will allow you to access a range of available ad assets\n\nto dynamically select and display the most relevant ads to\n\nplayers. Both Google AdMob and Unity Ads are great for banner\n\nads, native ads, and rewarded video ads. Your role is to ensure\n\nthat the data you’re collecting is fed back into the advertising\n\nplatform to better serve targeted ads to your playerbase.\n\n\nTo use a service like Databricks to manage the data needed\n\nto provide real-time ad targeting in your application, you can\n\nfollow the below steps:\n\n`1.` **Collect and store player data:** Collect data on player\n\nbehavior, preferences, and demographics, and store it in\n\na data lake using Databricks. Popular analytics tools such\n\nas Google Analytics or Mixpanel can be integrated into", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "f326bb843ecfb32627472ccddee6a6da", + "Advertising), where your game becomes the delivery point\n\nfor the advertisement. Here, you will need to integrate with\n\nAd networks that provide real-time ad targeting capabilities.\n\nThat will allow you to access a range of available ad assets\n\nto dynamically select and display the most relevant ads to\n\nplayers. Both Google AdMob and Unity Ads are great for banner\n\nads, native ads, and rewarded video ads. Your role is to ensure\n\nthat the data you’re collecting is fed back into the advertising\n\nplatform to better serve targeted ads to your playerbase.\n\n\nTo use a service like Databricks to manage the data needed\n\nto provide real-time ad targeting in your application, you can\n\nfollow the below steps:\n\n`1.` **Collect and store player data:** Collect data on player\n\nbehavior, preferences, and demographics, and store it in\n\na data lake using Databricks. Popular analytics tools such\n\nas Google Analytics or Mixpanel can be integrated into\n\nthe game to collect data on player behavior. These tools,\n\njust like tracking website traffic, can track in-game events,\n\nprovide insights on player behavior and demographics..\n\nand they give you access to detailed reports and\n\ndashboards. 
Another option is to build in-house tracking\n\nsystems to collect data on player behavior - logging\n\nevents, e.g in-game purchases or player actions, activities\n\nsuch as “at which level does a player quit playing” and\n\nstoring this in a database for analysis. The downside of\n\nbuilding in-house tracking systems is you will need to host\n\nand maintain your own logging servers.\n\n`2.` **Prepare the data:** Use Databricks to clean, transform,\n\nand prepare the player data for analysis. This may\n\ninclude aggregating data from multiple sources, removing\n\nduplicates and outliers, and transforming the data into a\n\nformat suitable for analysis.\n\n`3.` **Analyze the data:** Use Databricks’ built-in machine\n\nlearning and data analytics capabilities to analyze the\n\nplayer data and identify patterns and trends.\n\n`4.` **Create audience segments:** Based on the analysis,\n\nuse Databricks to create audience segments based on\n\ncommon characteristics such as interests, behaviors,\n\nand preferences.\n\n`5.` **Integrate with the ad server:** When an ad opportunity\n\npresents itself within the game, a call is made to the ad\n\nserver. This call includes information about the player,\n\nsuch as the audience segment that they belong to. The\n\nad server then uses this information to decide what ad to\n\ndeliver to the player.\n\n`6.` **Monitor and optimize:** Use Databricks to monitor the\n\nperformance of the ad targeting and make optimizations\n\nas needed, such as adjusting the audience segments or\n\nadjusting the targeting algorithms.\n\nBy using a service like Databricks to manage the data needed\n\nfor real-time ad targeting, game developers can effectively\n\nleverage their player data to create more personalized and\n\nengaging experiences, increase revenue, and reduce churn.\n\n\n-----\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Focus on player data:** Make player data the center of your\n\ntargeting strategy by collecting and storing comprehensive\n\ninformation on player behavior, preferences, and\n\ndemographics. Here, it’s critical to ensure the game code\n\ndata trackers are properly implemented in order to collect\n\nthis data (see Game Analytics section for detail).\n\n- **Segment your audience:** Create audience segments", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "0166095a24155148336e7b77c0f50cc4", + "for real-time ad targeting, game developers can effectively\n\nleverage their player data to create more personalized and\n\nengaging experiences, increase revenue, and reduce churn.\n\n\n-----\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Focus on player data:** Make player data the center of your\n\ntargeting strategy by collecting and storing comprehensive\n\ninformation on player behavior, preferences, and\n\ndemographics. 
Here, it’s critical to ensure the game code\n\ndata trackers are properly implemented in order to collect\n\nthis data (see Game Analytics section for detail).\n\n- **Segment your audience:** Create audience segments\n\nbased on common characteristics such as interests,\n\nbehaviors, and preferences, and use these segments to\n\n\n**Test and iterate:** Continuously test and iterate your\n\ntargeting strategy to refine your audience segments and\n\nimprove targeting accuracy.\n\n**Balance relevance and privacy:** Balance the need for\n\nrelevant, personalized ads with players’ privacy by only\n\ncollecting and using data that is necessary for targeting\n\nand obtaining player consent.\n\n**Monitor performance:** Regularly monitor the performance\n\nof your targeting strategy to ensure that it is delivering the\n\ndesired results and make optimizations as needed.\n\n**Partner with the right ad platform:** Choose an ad\n\nplatform that is well-suited to your needs and aligns with\n\nyour goals, and work closely with them to ensure that your\n\ntargeting strategy is delivering the best results.\n\n\ndeliver targeted ads.\n\n# Operational use cases\n\n\n### Anomaly Detection\n\nFirst thing is to begin collecting the data, game server / client\n\nlogs out of your project. Then consume this into Databricks\n\nDelta, to have a continuous anomaly detection model\n\nrunning. Focus this on key pieces of information you want to\n\nmonitor, for example - for live service games, this is going to\n\nbe infrastructure and network-related metrics such as Ping\n\nand Server Health (# of clients connected, server uptime,\n\nserver usage, CPU/RAM, # of sessions, time of sessions).\n\nOnce the model is ingesting and tuned specifically for the\n\nmetrics based on the information you have above. You would\n\nbuild out alerts or notifications based on these specific\n\nmetrics hitting a threshold that you define as needing\n\nattention. From here, you can build out automated systems\n\nto mitigate those effects - such as migrating players to a\n\ndifferent server, canceling matches, scaling infrastructure,\n\ncreating tickets for admins to review.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n\n**Tips / Best Practices**\n\n- **Define the problem and objectives clearly:** Before\n\nimplementing an anomaly detection solution, it is\n\nimportant to define the problem you are trying to solve\n\nand your specific objectives. 
This will help ensure that\n\nyou have the right data sources and use the appropriate\n\nalgorithms to achieve your goals.\n\n- **Choose the right data sources:** To effectively detect\n\nanomalies, you need to have the right data sources.\n\nConsider data from player behavior, system performance,\n\nand network traffic, as well as any other data sources that\n\nare relevant to your problem and objectives.\n\n- **Clean and preprocess the data:** To ensure that the", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "7457b3772e1b7fe52c1b29d823c92836", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n\n**Tips / Best Practices**\n\n- **Define the problem and objectives clearly:** Before\n\nimplementing an anomaly detection solution, it is\n\nimportant to define the problem you are trying to solve\n\nand your specific objectives. This will help ensure that\n\nyou have the right data sources and use the appropriate\n\nalgorithms to achieve your goals.\n\n- **Choose the right data sources:** To effectively detect\n\nanomalies, you need to have the right data sources.\n\nConsider data from player behavior, system performance,\n\nand network traffic, as well as any other data sources that\n\nare relevant to your problem and objectives.\n\n- **Clean and preprocess the data:** To ensure that the\n\ndata you use for anomaly detection is accurate and\n\nmeaningful, it is important to clean and preprocess the\n\ndata. This includes removing any irrelevant or invalid data,\n\nhandling missing values, and normalizing the data\n\nif necessary.\n\n- **Choose the right algorithms:** There are many algorithms\n\nthat can be used for anomaly detection, including\n\nstatistical methods, machine learning algorithms, and\n\nrule-based systems. Choose the algorithms that are best\n\n\n-----\n\nsuited to your data and problem, and that provide the\n\nright level of accuracy, speed, and scalability.\n\n- **Validate the results:** Before deploying the anomaly\n\ndetection solution in production, it is important to validate\n\nthe results by testing the solution on a small subset of\n\ndata and comparing the results to expected outcomes.\n\n- **Monitor and update the solution:** Once the anomaly\n\ndetection solution is deployed, it is important to monitor\n\nits performance and accuracy, and update the solution as\n\nneeded. This may include retraining the algorithms, adding\n\nor removing data sources, and updating the parameters\n\nand thresholds used by the algorithms.\n\nAdditionally, there are some key gotchas to look out for when\n\nimplementing an anomaly detection solution.\n\n- **Avoid overfitting:** Overfitting occurs when the anomaly\n\ndetection solution is too complex and learns the noise\n\nin the data rather than the underlying patterns. To avoid\n\noverfitting, it is important to choose algorithms that are\n\nappropriate for the size and complexity of the data, and to\n\nvalidate the results using a separate test dataset.\n\n- **False positive and false negative results:** False positive\n\nand false negative results can occur when the anomaly\n\ndetection solution is not properly calibrated, or when\n\nthe solution is applied to data that is significantly\n\ndifferent from the training data. 
To minimize the risk of\n\nfalse positive and false negative results, it is important\n\nto validate the results using a separate test dataset, and\n\nto fine-tune the parameters and thresholds used by the\n\nalgorithms as needed.\n\n- **Scalability:** Scalability can be a concern when\n\nimplementing an anomaly detection solution, especially\n\nwhen dealing with large amounts of data. To ensure that\n\nthe solution can scale to meet the demands of a growing\n\nplayer base, it is important to choose algorithms that\n\nare fast and scalable, and to deploy the solution using a\n\nscalable infrastructure.\n\n### Getting Started with Build Pipeline\n\nAn operational goal game projects have is to make sure\n\ngame project builds are generated, delivered quickly and\n\nefficiently to internal testing & external users.\n\n\nA few of the key metrics and capabilities with analyzing your\n\nbuild pipelines are the below:\n\n- **Build time and speed:** This includes metrics such as", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "ae5562e80c8b28bec6bb2ee3f01b5f5f", + "false positive and false negative results, it is important\n\nto validate the results using a separate test dataset, and\n\nto fine-tune the parameters and thresholds used by the\n\nalgorithms as needed.\n\n- **Scalability:** Scalability can be a concern when\n\nimplementing an anomaly detection solution, especially\n\nwhen dealing with large amounts of data. To ensure that\n\nthe solution can scale to meet the demands of a growing\n\nplayer base, it is important to choose algorithms that\n\nare fast and scalable, and to deploy the solution using a\n\nscalable infrastructure.\n\n### Getting Started with Build Pipeline\n\nAn operational goal game projects have is to make sure\n\ngame project builds are generated, delivered quickly and\n\nefficiently to internal testing & external users.\n\n\nA few of the key metrics and capabilities with analyzing your\n\nbuild pipelines are the below:\n\n- **Build time and speed:** This includes metrics such as\n\nthe time it takes to create a build, number of builds, and\n\ncompute spent.\n\n- **Build size and storage:** size of the builds, amount of\n\nstorage, and network costs.\n\n- **Bug tracking and resolution:** This includes metrics such\n\nas the number of bugs reported, the time it takes to\n\nresolve them, and the number of bugs that are resolved in\n\neach build.\n\n- **Code quality and efficiency:** This includes metrics such\n\nas code complexity, code duplication, and the number of\n\ncode lines written.\n\n- **Collaboration and communication:** Such as the number\n\nof code reviews, the number of team meetings, and the\n\nnumber of code commits.\n\n- **Advanced capabilities:** Such as Predicting real time build\n\nfailure to reduce spend and combining build data with\n\nCrash Analytics (see below) to have “commit to build”\n\nvisibility for accelerated bug fixing.\n\nBefore you start implementing your build pipeline, it’s\n\nimportant to define your requirements. What are the key\n\ngoals of your build pipeline? Choosing the right CI/CD tools is\n\ncritical to the success of your build pipeline. There are many\n\ndifferent tools available, including Jenkins, Azure Devops,\n\nPerforce, gitlab and more. When choosing a CI/CD tool,\n\nconsider factors such as ease of use, scalability, and cost. 
In\n\naddition, consider the specific needs of your game project,\n\nand choose a tool that can meet those needs.\n\nThe general recommendation is to look at automating your\n\nbuild process early. Once you’ve chosen your CI/CD tools, you\n\ncan automate your build process by setting up a build server,\n\nconfiguring your CI/CD tool, and creating a script to build your\n\ngame project. The build process should be automated as much\n\nas possible, and it should include steps to compile your code,\n\nrun automated tests, and generate a build of your project.\n\nOnce you have automated your build process, often the\n\nnext step is to implement CD (Continuous Delivery). This\n\ninvolves automating the deployment of your game builds\n\ndelivery to stakeholders, such as QA testers, beta testers, or\n\nend-users via publishing platforms. CD can help ensure that\n\nstakeholders have access to the latest version of your game\n\n\n-----\n\nas soon as possible, allowing them to provide feedback and\n\nhelp drive the development process forward.\n\nFinally, it’s important to monitor and measure your build\n\npipeline to ensure that it’s working as expected. This can\n\ninvolve using tools such as Databricks Dashboards to\n\nvisualize the status of your pipeline, or using metrics such\n\nas build times, test results, and deployment success rates\n\nto evaluate the performance of your pipeline. By monitoring\n\nand measuring your build pipeline, you can identify areas for", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "859ef65fc5f6ffd1ab8dfee683f35d36", + "run automated tests, and generate a build of your project.\n\nOnce you have automated your build process, often the\n\nnext step is to implement CD (Continuous Delivery). This\n\ninvolves automating the deployment of your game builds\n\ndelivery to stakeholders, such as QA testers, beta testers, or\n\nend-users via publishing platforms. CD can help ensure that\n\nstakeholders have access to the latest version of your game\n\n\n-----\n\nas soon as possible, allowing them to provide feedback and\n\nhelp drive the development process forward.\n\nFinally, it’s important to monitor and measure your build\n\npipeline to ensure that it’s working as expected. This can\n\ninvolve using tools such as Databricks Dashboards to\n\nvisualize the status of your pipeline, or using metrics such\n\nas build times, test results, and deployment success rates\n\nto evaluate the performance of your pipeline. By monitoring\n\nand measuring your build pipeline, you can identify areas for\n\nimprovement and make changes as needed to ensure that\n\nyour pipeline continues to meet your needs.\n\nIf you have any questions about how databricks can\n\nintegrate into your devops solution, please don’t hesitate to\n\n[reach out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Seek to automate early and often:** Automate as much\n\nof the build process as possible, from checking code into\n\nversion control to generating builds and distributing them\n\nto stakeholders. This can help reduce errors and save time,\n\nallowing game teams to focus on more high value tasks.\n\n\n**Version control, version control, version control:** Use a\n\nversion control system to manage the source code and\n\nother assets. 
This ensures that changes to the codebase\n\nare tracked and can be easily undone if needed.\n\n**Implement continuous integration and delivery:**\n\nContinuous integration (CI) involves automatically building\n\nand testing after code changes are checked into version\n\ncontrol. With CI, new changes to the codebase do not\n\nbreak existing functionality. By automating the build\n\nprocess, CI helps to reduce errors and save time. CD, on\n\nthe other hand, involves automatically delivering builds to\n\nstakeholders, such as QA testers, beta testers, or end-\n\nusers, after they have passed the automated tests. By\n\ncombining CI and CD, a video game project can ensure\n\nthat builds are generated and delivered quickly and\n\nefficiently, without the need for manual intervention.\n\n**Build for scalability:** As your game project grows, you\n\nwill need a build pipeline solution that is scalable and can\n\nhandle the needs of your game team.\n\n**Integration with other tools:** Integrate the build pipeline\n\nsolution with other tools and systems, such as issue\n\ntracking, testing, and deployment tools, to ensure a\n\nsmooth and efficient workflow.\n\n\n**Reference Architecture**\n\n**Databricks**\n**SQL**\n\n**Power BI**\n\n\n|GAME INFRASTRUCTURE|Col2|\n|---|---|\n|||\n|||\n\n\n**AWS**\n\n**Quicksight**\n\n\n-----\n\n### Getting Started with Crash Analytics\n\nBuilding a pipeline to build a holistic view to support crash\n\nanalytics means data coming from multiple different\n\nsources, different velocities and joining the data together.\n\nThe amount of data sources depends on your game projects\n\npublishing platforms, some may come from console based\n\nproviders such as sony, xbox, and nintendo or pc platforms\n\nlike Steam, Epic Games Marketplace, GoG and many others.\n\n**High level steps**\n\n- Determine what platforms your game is running on and\n\nhow to interface to collect data.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "ba68940e19b295f296eeb8a3fbe78784", + "smooth and efficient workflow.\n\n\n**Reference Architecture**\n\n**Databricks**\n**SQL**\n\n**Power BI**\n\n\n|GAME INFRASTRUCTURE|Col2|\n|---|---|\n|||\n|||\n\n\n**AWS**\n\n**Quicksight**\n\n\n-----\n\n### Getting Started with Crash Analytics\n\nBuilding a pipeline to build a holistic view to support crash\n\nanalytics means data coming from multiple different\n\nsources, different velocities and joining the data together.\n\nThe amount of data sources depends on your game projects\n\npublishing platforms, some may come from console based\n\nproviders such as sony, xbox, and nintendo or pc platforms\n\nlike Steam, Epic Games Marketplace, GoG and many others.\n\n**High level steps**\n\n- Determine what platforms your game is running on and\n\nhow to interface to collect data.\n\n- **Collect crash data:** Implement crash reporting tools in\n\nyour game to collect data on crashes. The source data\n\nmay be delivered in varying formats such as JSON or CSV.\n\n- **Load crash data into Databricks:** Use Databricks’ data\n\ningestion tools to load the crash data into your workspace.\n\nThis could involve using Databricks’ built-in data source\n\nconnectors, or programmatically ingest files to load the data.\n\n\n\n- **Transform and clean the crash data:** Use Databricks’\n\ndata processing and transformation tools to clean and\n\nprepare the crash data for analysis. 
This could involve\n\nusing Databricks’ capabilities like DLT, or using SQL to\n\nperform custom transformations.\n\n- **Visualize crash data:** Use Databricks’ dashboarding tools\n\nto create visualizations that help you understand the\n\npatterns and trends in your crash data. This could involve\n\nusing Databricks’ built-in visualization tools, or integrating\n\nwith external visualization tools like Tableau or PowerBI.\n\n- **Analyze crash data:** Use Databricks’ machine learning\n\nand statistical analysis tools to identify the root causes\n\nof crashes. This could involve using Spark MLlib or many\n\nof the popular tools to build machine learning models, or\n\nusing SQL to perform custom analyses.\n\n- **Monitor and refine your pipeline:** Regularly review your\n\npipeline to ensure that it remains relevant and useful.\n\nRefine your pipeline as necessary to reflect changes in\n\nyour crash data or your goals.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n\n-----\n\n**Tips / Best Practices**\n\n- **Automated collection and aggregation of crash reports:**\n\nCollecting crash reports should be an automated process\n\nthat is integrated into the output of the build pipeline\n\nfor the game. The crash reports should be automatically\n\naggregated and made available for analysis in near real-time.\n\n- **Clear reporting and prioritization of issues:** The solution\n\nshould provide clear reporting on the most common\n\nissues and allow game developers to prioritize fixing the\n\nmost impactful problems first.\n\n- **Integration with other analytics tools:** The crash analytics\n\nsolution should integrate with other analytics tools, such\n\nas player behavior tracking, to provide a more complete\n\npicture of how crashes are impacting the player experience.\n\n- **Flexibility and scalability:** As the game grows, the\n\n\nAdditionally, there are some key gotchas to look out for when\n\nimplementing an anomaly detection solution.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "8896c6ca446a54c83f1d8a25b55f77af", + "**Tips / Best Practices**\n\n- **Automated collection and aggregation of crash reports:**\n\nCollecting crash reports should be an automated process\n\nthat is integrated into the output of the build pipeline\n\nfor the game. 
The crash reports should be automatically\n\naggregated and made available for analysis in near real-time.\n\n- **Clear reporting and prioritization of issues:** The solution\n\nshould provide clear reporting on the most common\n\nissues and allow game developers to prioritize fixing the\n\nmost impactful problems first.\n\n- **Integration with other analytics tools:** The crash analytics\n\nsolution should integrate with other analytics tools, such\n\nas player behavior tracking, to provide a more complete\n\npicture of how crashes are impacting the player experience.\n\n- **Flexibility and scalability:** As the game grows, the\n\n\nAdditionally, there are some key gotchas to look out for when\n\nimplementing an anomaly detection solution.\n\n- **Data privacy and security:** Ensure that crash reports do\n\nnot contain sensitive information that could be used to\n\nidentify individual players.\n\n- **Scalability:** As the number of players and crashes\n\nincreases, it may become difficult to manage and analyze\n\nthe growing volume of data.\n\n- **Integration with other tools:** Be aware when integrating\n\ncrash analytics with other tools and systems, especially if\n\nthe tools use different data formats or data structures.\n\n- **Prioritization of issues:** Determine which crashes are\n\nthe most impactful and prioritize fixes accordingly. This\n\ncan be a complex process, especially if there are a large\n\nnumber of different crash types and causes.\n\n\nsolution should be able to scale to accommodate an\n\nincreasing number of players and crashes.\n\n**Data privacy and security:** It’s important to consider data\n\nprivacy and security when implementing a crash analytics\n\nsolution. This may involve implementing measures to\n\nanonymize crash reports, or taking steps to ensure that\n\nsensitive information is not included in the reports.\n\n**Reference Architecture**\n\n**Databricks**\n**SQL**\n\n**Power BI**\n\n**AWS**\n\n**Quicksight**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "fb2a03466ddbcb4463e9464f2f3db3bd", + "### Executive Guide\n\n# Transform and Scale Your Organization With Data and AI\n\n#### A guide for CIOs, CDOs, and\n data and AI executives\n\n\n-----\n\n## Contents\n\n**A U T H O R :**\n\n**Chris D’Agostino**\n\nGlobal Field CTO\n\nDatabricks\n\n**E D I T O R S :**\n\nManveer Sahota\n\n\n**C H A P T E R 1 :** \u0007 **Executive Summary** 3\n\n**C H A P T E R 2 :** \u0007 **Define the Strategy** 6\n\n**1.** Establish the goals and business value 8\n\n**2.** Identify and prioritize use cases 19\n\n**3.** Build successful data teams 22\n\n**4.** Deploy a modern data stack 28\n\n**5.** Improve data governance and compliance 36\n\n**6.** Democratize access to quality data 41\n\n**7.** Dramatically increase productivity of your workforce 47\n\n**8.** Make informed build vs. buy decisions 52\n\n**9.** Allocate, monitor and optimize costs 55\n\n**10.** Move to production and scale adoption 58\n\n\nJessica Barbieri\n\n\nToby Balfre\n\n\n**C H A P T E R 3 :** **Conclusion** \u0007 63\n\n\n-----\n\n**CHAPTER 1:**\n## Executive Summary\n\nData and AI leaders are faced with the challenge\n\nof future-proofing their architecture and platform\n\ninvestments. 
The Lakehouse implementation from\n\nDatabricks combines the best features of EDWs\n\nand data lakes by enabling all their workloads using\n\nopen source and open standards — avoiding the\n\nvendor lock-in, black box design and proprietary\n\ndata formats of other cloud vendors.\n\n\nIt’s not surprising that many industry experts say data is the most valuable resource in the modern\n\neconomy — some even go so far as to describe it as the “new oil.” But at Databricks, we think of data as\n\nwater. Its core compound never changes, and it can be transformed to whatever use case is desired,\n\nwith the ability to get it back to its original form. Furthermore, just as water is essential to life, data is now\n\nessential to survival, competitive differentiation and innovation for every business. Clearly, the impact and\n\nimportance of data are growing exponentially in both our professional and personal lives, while artificial\n\nintelligence (AI) is being infused in more of our daily digital interactions. The explosion in data availability\n\nover the last decade and the forecast for growth at a compounded [annual growth rate (CAGR) of 23%](https://www.google.com/url?q=https://www.idc.com/getdoc.jsp?containerId%3DprUS47560321&sa=D&source=docs&ust=1651117260200496&usg=AOvVaw3jdZ_6YHlXGQlUMJK8ULux) over\n\n2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n\n(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n\nEvery organization is working to improve business outcomes while effectively managing a variety of risks —\n\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n\nYour organization’s data and the systems that process it play a critical role in not only enabling your financial\n\ngoals but also in minimizing these seven key business risks.\n\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\n\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "f545eff42d3b9ae2b565475f4390ed44", + "2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n\n(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n\nEvery organization is working to improve business outcomes while effectively managing a variety of risks —\n\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n\nYour organization’s data and the systems that process it play a critical role in not only enabling your financial\n\ngoals but also in minimizing these seven key business risks.\n\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\n\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\n\norganizations use and process data. 
Successful data transformation initiatives for data, analytics and AI\n\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\n\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\n\nsignificant return on investment (ROI) — one that starts in months, not years.\n\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\n\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\n\nto deliver on their data strategy — including how to deploy a modern data architecture, leverage data\n\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\n\nidentify and execute on AI opportunities.\n\n\n-----\n\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\n\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\n\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\n\norganizations have the option of moving away from closed, proprietary systems offered by a variety\n\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\n\nindustry standards.\n\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\n\nwe’ve hired industry experts and thought leaders to help organizations better understand the steps involved\n\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\n\narchitecture, which decouples data storage from compute while providing the best price/performance\n\nmetrics for all your data workloads — including data warehousing. We have captured the lessons learned\n\nand summarized them in this series of Executive Guides — which are designed to serve as blueprints for\n\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\n\ninitiatives for data, analytics and AI using a _modern data stack_ . 
Databricks is the first company to deliver a\n\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\n\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\n\nshown in Figure 1.\n\n\n###### Lakehouse Platform\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData Science\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData reliability and performance\n\nCloud Data Lake\nAll structured and unstructured data\n\n**Figure 1:**\nThe Databricks Lakehouse Platform\n\n\n-----\n\n**The lakehouse architecture benefits organizations in several ways:**\n\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n\n**2.** \u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n\n**3.** \u0007It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "4df144a0314dfaf639ae04e7ebb499d8", + "organization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\n\nshown in Figure 1.\n\n\n###### Lakehouse Platform\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData Science\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData reliability and performance\n\nCloud Data Lake\nAll structured and unstructured data\n\n**Figure 1:**\nThe Databricks Lakehouse Platform\n\n\n-----\n\n**The lakehouse architecture benefits organizations in several ways:**\n\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n\n**2.** \u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n\n**3.** \u0007It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.\n\nOur intention is to present key considerations and equip you with the knowledge to ask informed questions,\n\nmake the most critical decisions early in the process, and develop the comprehensive strategy that most\n\norganizations lack.\n\nIn addition, we have created an easy-to-follow Data and AI Maturity Model and provided a comprehensive\n\nprofessional services offering that organizations can leverage to measure their readiness, reskill their staff\n\nand track progress as they embark on their data transformation initiative.\n\n\n-----\n\n**CHAPTER 2:**\n## Define the Strategy\n\n\nThe most critical step to enable data, analytics and AI at scale is to develop a comprehensive and executable\n\nstrategy for how your organization will leverage people, processes and platforms to drive measurable\n\nbusiness results against your corporate priorities. The strategy serves as a set of principles that every\n\nmember of your organization can refer to when making decisions. 
The strategy should cover the roles and\n\nresponsibilities of teams within your organization for how you capture, store, curate and process data to run\n\nyour business — including the internal and external resources (labor and budget) needed to be successful.\n\n\nEstablish the\ngoals and\nbusiness value\n\n\nBuild\nsuccessful\ndata teams\n\n\nEase data\ngovernance and\ncompliance\n\n\nSimplify\nthe user\nexperience\n\n\nAllocate,\nmonitor and\noptimize costs\n\n\nIdentify and\nprioritize\nuse cases\n\n\nDeploy a modern\ndata architecture\n\n\nDemocratize\naccess to\nquality data\n\n\nMake informed\nbuild vs. buy\ndecisions\n\n\nMove to\nproduction and\ndrive adoption\n\n\n**Figure 2:**\nThe 10 steps to a winning data and AI strategy\n\n\n-----\n\n#### Here are 10 key considerations\n\n**1.** \u0007Secure buy-in and alignment on the overall business goals, timeline and appetite for the initiative.\n\n**2.** \u0007Identify, evaluate and prioritize use cases that actually provide a significant ROI.\n\n**3.** \u0007Create high-performing teams and empower your business analyst, data scientist, machine learning\n\nand data engineering talent.\n\n**4.** \u0007Future-proof your technology investment with a modern data architecture.\n\n**5.** \u0007Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California\n\nConsumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n\n**6.** \u0007Implement needed policies, procedures and technology to guarantee data quality and enable secure\n\ndata access and the sharing of all your data across the organization.\n\n**7.** \u0007Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n\n**8.** \u0007Make informed build vs. buy decisions and ensure you are focusing your limited resources on the most\n\nimportant problems.\n\n**9.** \u0007Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n\n**10.** \u0007Codify best practices for moving into production and how to measure progress, rate of adoption and\n\nuser satisfaction.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "e47cfe4574f92d0f241f9535db8cbac4", + "and data engineering talent.\n\n**4.** \u0007Future-proof your technology investment with a modern data architecture.\n\n**5.** \u0007Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California\n\nConsumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n\n**6.** \u0007Implement needed policies, procedures and technology to guarantee data quality and enable secure\n\ndata access and the sharing of all your data across the organization.\n\n**7.** \u0007Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n\n**8.** \u0007Make informed build vs. 
buy decisions and ensure you are focusing your limited resources on the most\n\nimportant problems.\n\n**9.** \u0007Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n\n**10.** \u0007Codify best practices for moving into production and how to measure progress, rate of adoption and\n\nuser satisfaction.\n\nThe strategy should clearly answer these 10 topics and more, and should be captured in a living document,\n\nowned and governed by the CDO and made available for everyone in the organization to review and provide\n\nfeedback on. The strategy will evolve based on the changing market/political conditions, evolving business,\n\nthe technology landscape or a combination of any of these — but it should serve as the North Star for\n\nhow you will navigate the many decisions and trade-offs that you will need to make over the course of the\n\ntransformation.\n\n\nThis guide takes a stepwise approach to\n\naddressing each of these 10 topics.\n\n\n-----\n\nStudies have shown that data scientists spend 80%\n\nof their time collecting and compiling data sets\n\n\n#### 1. Establish the goals and business value\n\nMost organizations on a data, analytics and AI journey establish a set of goals for the resulting investment.\n\nThe goals generally fall into one of three categories:\n\n**1.** **Business outcomes**\n\n**2.** **People**\n\n**3.** **Technology**\n\n\nand only 20% of their time developing insights and\n\n\nIn terms of business outcomes, organizations need to adapt more quickly to market opportunities and\n\nemerging risks, and their legacy-based information systems make that difficult to achieve. As a result,\n\nbusiness leaders see the digital transformation as an opportunity to build a new technology foundation\n\nfrom which to run their business and increase business value. One that is more agile, scalable, secure and\n\neasier to use — making the organization better positioned to adapt, innovate and thrive in the modern and\n\ndynamic economy.\n\nFor organizations today, people are one of their most valuable assets — you cannot succeed in data,\n\nanalytics and AI without them. The battle for top talent is as fierce as ever, and the way that people work\n\nimpacts your ability to hire and retain the skills you need to succeed. It is important to make sure that\n\nemployees work in a frictionless data environment, to the extent possible, so they feel productive each day\n\nand can do their best work.\n\nFinally, from a technology perspective, organizations have grown tired of the high costs associated with\n\ncomplex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The\n\nindustry trend is to move away from large capital expenditures (capex) to pay for network and server\n\ncapacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex)\n\napproach. Your data analytics environment should support this trend as well — using open standards, low-\n\ncost storage and on-demand compute that efficiently spins up to perform data workloads and spins down\n\nonce they are complete.\n\n\nalgorithms. Organizations that are able to invert\n\nthese numbers benefit in two ways — happier\n\nemployees and improved time to market for use\n\ncases. 
These employers create more favorable\n\nworking environments and lower the risk of burnout\n\nand the resulting regrettable attrition.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "a3c69e2621c59adc16c4f9b279b5bc3c", + "and can do their best work.\n\nFinally, from a technology perspective, organizations have grown tired of the high costs associated with\n\ncomplex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The\n\nindustry trend is to move away from large capital expenditures (capex) to pay for network and server\n\ncapacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex)\n\napproach. Your data analytics environment should support this trend as well — using open standards, low-\n\ncost storage and on-demand compute that efficiently spins up to perform data workloads and spins down\n\nonce they are complete.\n\n\nalgorithms. Organizations that are able to invert\n\nthese numbers benefit in two ways — happier\n\nemployees and improved time to market for use\n\ncases. These employers create more favorable\n\nworking environments and lower the risk of burnout\n\nand the resulting regrettable attrition.\n\n\n-----\n\n**Executive buy-in and support**\n\nLarge organizations are difficult to change — but it’s not impossible. In order to be successful, you need\n\nto have unwavering buy-in and support from the highest levels of management — including the CEO and\n\nboard of directors. With this support, you have the leverage you need to develop the strategy, decide on\n\nan architecture and implement a solution that can truly change the way your business is run. Without it,\n\nyou have a very expensive science project that has little hope of succeeding. Why? Because the majority\n\nof people in your organization are busy doing their day jobs. The added work to support the initiative must\n\nbe offset by a clear articulation of the resulting benefits — not only for the business but for the personnel\n\nwithin it. The transformation should result in a positive change to how people do their jobs on a daily basis.\n\nTransformation for data, analytics and AI needs to be a company-wide initiative that has the support from\n\nall the leaders. Even if the approach is to enable data and AI one business unit (BU) at a time, the plan needs\n\nto be something that is fully embraced in order to succeed. Ideally, the senior-most executives serve as\n\nvocal proponents.\n\n\n-----\n\n**Evolve to an AI-first company — not just a data-first company**\n\nData and AI transformations should truly transform the way organizations use data, not just evolve it. For\n\ndecades, businesses have operated using traditional business processes and leveraged Structured Query\n\nLanguage (SQL) and business intelligence (BI) tools to query, manipulate and report on a subset of their\n\ndata. There are five major challenges with this approach:\n\n**1.** \u0007A true self-assessment of where your organization is on the AI maturity curve. 
Most organizations will\n\nuse pockets of success with analytics and AI to move higher up the maturity curve, but in reality the\n\nability to replicate and scale the results is nearly impossible.\n\nAutomated Decision-Making\n\n#### Tech leaders are to the right of the Data Maturity Curve\n\n\nPrescriptive Analytics\n\nPredictive Modeling\n\nData Exploration\n\n\nFrom hindsight to foresight\n\n\nHow should\nwe respond?\n\n\nAutomatically make\nthe best decision\n\n\nAd Hoc Queries\n\nReports\nClean Data\n\nWHAT HAPPENED? WHAT WILL HAPPEN?\n\nData and AI Maturity\n\n\n**Figure 3:**\nThe Data Maturity Curve\n\n\n-----\n\n**2.** \u0007Data volumes and types have outgrown even the most modern approaches to SQL-based data\n\nprocessing.\n\n**3.** \u0007These large data volumes also make it nearly impossible for your workforce to continue to\n\nprogrammatically state, in a priority manner, how data insights can be achieved or how the business\n\nshould react to changing data.\n\n**4.** \u0007Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the\n\nnumber of people needed to respond to every piece of data flowing into your environment. Machines", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "d7d8ac7c2c7123ddb5a774d91fdb8b1f", + "Prescriptive Analytics\n\nPredictive Modeling\n\nData Exploration\n\n\nFrom hindsight to foresight\n\n\nHow should\nwe respond?\n\n\nAutomatically make\nthe best decision\n\n\nAd Hoc Queries\n\nReports\nClean Data\n\nWHAT HAPPENED? WHAT WILL HAPPEN?\n\nData and AI Maturity\n\n\n**Figure 3:**\nThe Data Maturity Curve\n\n\n-----\n\n**2.** \u0007Data volumes and types have outgrown even the most modern approaches to SQL-based data\n\nprocessing.\n\n**3.** \u0007These large data volumes also make it nearly impossible for your workforce to continue to\n\nprogrammatically state, in a priority manner, how data insights can be achieved or how the business\n\nshould react to changing data.\n\n**4.** \u0007Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the\n\nnumber of people needed to respond to every piece of data flowing into your environment. Machines\n\nscale, people do not.\n\n**5.** \u0007Advances in machine learning and AI have simplified the steps and reduced the expertise needed to\n\ngain game-changing insights. For these reasons, plus many others, the organizations that thrive in the\n\n21st century will do so based on their ability to leverage all the data at their disposal. Traditional ways\n\nof processing and managing data will not work. Using ML and AI will empower your workforce to\n\nleverage data to make better decisions for managing risk, helping your organization succeed in the\n\nmodern economy.\n\n**Go “all in” on the cloud**\n\nThe COVID-19 pandemic has caused rapid adoption of cloud-based solutions for collaboration and\n\nvideoconferencing — and organizations are now using this time to reevaluate their use of on-premises and\n\ncloud-based services. The cloud vendors provide many benefits to organizations, including Infrastructure\n\nas a Service (IaaS), Platform as a Service (PaaS) and Software as a Service (SaaS) solutions. 
These benefits,\n\nespecially when combined with the use of open source software (OSS), increase the speed at which\n\norganizations can use the latest technologies while also reducing their capex in these budget-conscious times.\n\nFor AWS, Microsoft, Google and other cloud providers, the game is about data acquisition. The more\n\ncorporate data that resides in a specific cloud, the more sticky the customer is to the vendor. At the same\n\ntime, multicloud support is both a selling point and an aspirational goal for many organizations. Companies\n\nare well aware of vendor lock-in and want to abstract their applications so they can be moved across\n\nclouds if there is a compelling business reason.\n\n\n-----\n\nApproaching your technology choices with a multicloud point of view gives the organization more sovereignty\n\nover the data — flexibility to run workloads anywhere, ease of integration when acquiring businesses that\n\nrun on different cloud providers and simplified compliance with emerging regulations that may require\n\ncompanies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\n\nincreasingly important.\n\n**Modernize business applications**\n\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\n\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\n\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\n\nservices and APIs to easily provide access to an application’s functionality.\n\nCloud-based architectures, commodity databases and software application development frameworks make", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "25ef18d715b47231f6594d1da80303e9", + "companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\n\nincreasingly important.\n\n**Modernize business applications**\n\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\n\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\n\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n\ntherefore, tightly coupled. 
In contrast, modern cloud applications are modular in design and use RESTful web\n\nservices and APIs to easily provide access to an application’s functionality.\n\nCloud-based architectures, commodity databases and software application development frameworks make\n\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\n\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n\na backing database) has become straightforward with the latest tooling available to your application\n\ndevelopment teams.\n\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\n\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n\napplications that generate and store a significant amount of the data consumed within an organization. Using\n\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n\n\n“We are on an amazing journey. Being among\n\nthe fastest-growing enterprise software cloud\n\ncompanies on record was unimaginable when\n\nwe started Databricks. To get here, we’ve stayed\n\nfocused on the three big bets we made when\n\nfounding the company — cloud, open source\n\nand machine learning. Fast-forward seven years,\n\nthousands of data teams around the globe are\n\nworking better together on Databricks.”\n\n**Ali Ghodsi**\n\nCo-founder and CEO\n\nDatabricks\n\n\n-----\n\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n\nother applications within your environment to store copies of the data — unless absolutely necessary for\n\nperformance reasons. In this case, it is best to “cache” the data for use in the non-SOR application and sync\n\nthe data from the actual SOR.\n\nData from these SORs should be made available in three ways:\n\n**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n\n**2.** \u0007Ensure that copies of the data land in the data lake.\n\n**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n\nconsumption by downstream applications.\n\n**Move toward real-time decisioning**\n\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n\nand the second is to view data as an individual event. This so-called “time value of data” is an important\n\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n\nthe same data platform.\n\nOn the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\n\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\n\nnewly created or arriving data event gives you the opportunity to make decisions — in the moment — that", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "a20cadbb79e7462225d18454eb8193d4", + "**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n\nconsumption by downstream applications.\n\n**Move toward real-time decisioning**\n\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n\nand the second is to view data as an individual event. This so-called “time value of data” is an important\n\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n\nthe same data platform.\n\nOn the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\n\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\n\nnewly created or arriving data event gives you the opportunity to make decisions — in the moment — that\n\ncan positively affect your ability to reduce risk, better service your customers or lower your operating costs.\n\nThe goal is to act immediately — with reliability and accuracy — upon the arrival of a new streaming event.\n\nThis “time value of data” is shown in Figure 4 on the next page.\n\n\n-----\n\nFor example, real-time processing of clickstream data from your customer-facing mobile application can\n\nindicate when the customer is having trouble and may need to call into your call center. This insight gives\n\nyou the opportunity to interject with a digital assistant or to pass on “just-in-time” information to your call\n\ncenter agents — improving the customer experience and lowering customer churn.\n\nData, analytics and AI rely on the ”time value of data” — a powerful concept that allows you to train your\n\nmachine learning models using historical data and provides you with the ability to make real-time decisions\n\nas new events take place. For example, credit card fraud models can use deep historical data about a given\n\ncustomer’s buying patterns (location, day of week, time of day, retailer, average purchase amount, etc.) to\n\nbuild rich models that are then executed for each new credit card transaction. This real-time execution,\n\ncombined with historical data, enables the best possible customer experience.\n\n#### Time Value of Data\n\n\nThe Databricks Lakehouse Platform allows you to\n\ncombine real-time streaming and batch processing\n\nusing one architecture and a consistent set of\n\nprogramming APIs.\n\n**Figure 4:**\nTime Value of Data\n\n\nValue of an individual data\n\nrecord is very high once created\nbut decreases over time\n\n\nValue of data records\n\nin aggregate increases\nover time\n\n\nReal-Time Decisioning Real-Time Analysis Trend Analysis Model Training\n\n\n-----\n\n**Land** **_all_** **data in a data lake**\n\nIn order to effectively drive data, analytics and AI adoption, relevant data needs to be made available to the\n\nuser as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to\n\naccess. 
Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data\n\nwarehouse, with predefined schemas that are designed to allow you to ask very specific questions about\n\nthat data only. What do you do when you want to ask a different question? To further complicate matters,\n\nhow do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores?\n\nHow do you find new insights as quickly as possible?\n\nThe overall goal is to gain insights from the data as quickly as possible — which can happen at any step\n\nalong the data pipeline — including raw, refined and curated data states.\n\nThis phenomenon has led to the concept known as the four Vs of data — specifically, _volume_ , _velocity_ ,\n\n_variety_ and _veracity_ . Data-, analytics- and AI-driven organizations need to be able to store and process", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "5decf6a290526bf6b3497f886667a551", + "user as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to\n\naccess. Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data\n\nwarehouse, with predefined schemas that are designed to allow you to ask very specific questions about\n\nthat data only. What do you do when you want to ask a different question? To further complicate matters,\n\nhow do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores?\n\nHow do you find new insights as quickly as possible?\n\nThe overall goal is to gain insights from the data as quickly as possible — which can happen at any step\n\nalong the data pipeline — including raw, refined and curated data states.\n\nThis phenomenon has led to the concept known as the four Vs of data — specifically, _volume_ , _velocity_ ,\n\n_variety_ and _veracity_ . Data-, analytics- and AI-driven organizations need to be able to store and process\n\nall their data, regardless of size, shape or speed. In addition, data lineage and provenance are critical to\n\nknowing whether or not you can trust the data.\n\n**Change the way people work**\n\nWhen done correctly, organizations get value from data, analytics and AI in three ways — infrastructure\n\nsavings, productivity gains and business-impacting use cases. Productivity gains require a true focus on\n\nminimizing the number of steps needed to produce results with data. This can be accomplished by:\n\n**1.** \u0007 Making data more accessible and ensuring it can be trusted\n\n**2.** Minimizing the number of tools/systems needed to perform work\n\n**3.** Creating a flywheel effect by leveraging the work of others\n\n\n“We believe that the data lakehouse architecture\n\npresents an opportunity comparable to the one\n\nwe saw during early years of the data warehouse\n\nmarket. The unique ability of the lakehouse to\n\nmanage data in an open environment, blend all\n\nvarieties of data from all parts of the enterprise and\n\ncombine the data science focus of the data lake\n\nwith the end-user analytics of the data warehouse\n\nwill unlock incredible value for organizations.”\n\n**Bill Inmon**\n\nThe father of the data warehouse\n\n\n-----\n\nIn large organizations, it’s understandable why application and data silos are prevalent. 
Each business unit\n\nis laser-focused on achieving their goals, and the use of information technology is viewed as an enabler.\n\nSystems and applications get built over time to satisfy specific needs within a line of business. As a result,\n\nit’s not surprising to learn that employees must jump through a large number of hoops to get access to the\n\ndata they need to do their jobs. It should be as simple as getting your identity and PC.\n\nWith Databricks, users can collaborate and perform\n\n\nA primary goal of your data and AI transformation should be to focus on improving the user experience —\n\nin other words, improving how your entire organization interacts with data. Data must be easily discoverable\n\nwith default access to users based on their role(s) — with a simple process to compliantly request access to\n\ndata sets that are currently restricted. The tooling you make available should satisfy the principal needs of\n\nthe various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n\nFinally, the results of the work performed by a user or system upstream should be made available to users\n\nand systems downstream as “data assets” that can drive business value.\n\nOrganizations that maximize the productivity of their workforce and enable employees to do their best work\n\nunder optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n\n**Minimize time in the “seam”**\n\nAs you begin your data transformation, it is important to know that the longer it takes, the more risk and\n\ncost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n\nto a modern data stack will require you to operate in two environments simultaneously, the old and the new,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "e98460a27fdbfdd72025b6718dc50b06", + "with default access to users based on their role(s) — with a simple process to compliantly request access to\n\ndata sets that are currently restricted. The tooling you make available should satisfy the principal needs of\n\nthe various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n\nFinally, the results of the work performed by a user or system upstream should be made available to users\n\nand systems downstream as “data assets” that can drive business value.\n\nOrganizations that maximize the productivity of their workforce and enable employees to do their best work\n\nunder optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n\n**Minimize time in the “seam”**\n\nAs you begin your data transformation, it is important to know that the longer it takes, the more risk and\n\ncost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n\nto a modern data stack will require you to operate in two environments simultaneously, the old and the new,\n\nfor some period of time. 
This will have a series of momentary adverse effects on your business:\n\n\u0007It will increase your operational costs substantially, as you will run two sets of infrastructure\n\n\u0007It will increase your data governance risk, since you will have multiple copies of your data sitting in two\n\nvery different ecosystems\n\n\ntheir work more efficiently, regardless of their\n\npersona or role. The user experience is designed\n\nto support the workloads of data analysts, SQL\n\ndevelopers, data engineers, data scientists and\n\nmachine learning professionals.\n\n\n-----\n\n\u0007It increases the cyberattack footprint and vectors, as the platforms will likely have very different security\n\nmodels and cyber defenses\n\n\u0007It will cause strain on your IT workforce due to the challenges of running multiple environments\n\n\u0007It will require precise communications to ensure that your business partners know which environment to\n\nuse and for what data workloads\n\nTo mitigate some of the strain on the IT workforce, some organizations hire staff augmentation firms to\n\n“keep the lights on” for the legacy systems while the new systems are being implemented and rolled out.\n\nIt’s important to remember this is a critical but short-lived experience for business continuity.\n\n**Shut down legacy platforms**\n\nIn keeping with the goal of minimizing time in the seam, the project plan and timeline must include the\n\nsteps and sequencing for shutting down legacy platforms. For example, many companies migrate their on-\n\npremises Apache Hadoop data lake to a cloud-based object store. The approach for shutting down the on-\n\npremises Hadoop system is generally as follows:\n\n**1.** \u0007Identify the stakeholders (business and IT) who own the jobs that run in the Hadoop environment.\n\n**2.** \u0007Declare that no changes can be made to the Hadoop environment — with the exception of emergency\n\nfixes or absolutely critical new business use cases.\n\n**3.** \u0007Inventory the data flow paths that feed data into the Hadoop environment.\n\n**4.** \u0007Identify the source systems that feed the data.\n\n**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n\n**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n\n**7.** \u0007Determine the downstream consumers of the output from the jobs.\n\n\n-----\n\n**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n\n**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n\narchitecture.\n\n**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n\nworking smoothly.\n\n**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. 
Look for smoke.\n\n**12.** \u0007Rinse and repeat — until all jobs are migrated.\n\n**13.** \u0007Shut down the Hadoop cluster.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "62110994a64bb010e27d3ddcc1b3a3d6", + "**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n\n**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n\n**7.** \u0007Determine the downstream consumers of the output from the jobs.\n\n\n-----\n\n**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n\n**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n\narchitecture.\n\n**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n\nworking smoothly.\n\n**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. Look for smoke.\n\n**12.** \u0007Rinse and repeat — until all jobs are migrated.\n\n**13.** \u0007Shut down the Hadoop cluster.\n\nA similar model can also be applied to legacy on-premises enterprise data warehouses.\n\nYou can follow the same process for other legacy systems in your environment. Some of these systems\n\nmay be more complex and require the participation of more stakeholders to identify the fastest way to\n\nrationalize the data and processes. It is important, however, to make sure that the organization has the\n\nfortitude to hold the line when there is pressure to make changes to the legacy environments or extend\n\ntheir lifespan. Setting firm dates for when these legacy systems will be retired will serve as a forcing function\n\nfor teams when they onboard to the new modern data architecture. Having the executive buy-in from page\n\n9 plays a crucial role in seeing the shutdown of legacy platforms through.\n\n\n-----\n\n#### 2. Identify and prioritize use cases\n\nAn important next step in enabling data, analytics and AI to transform your business is to identify use cases\n\nthat drive business value — while prioritizing the ones that are achievable under the current conditions\n\n(people, processes, data and infrastructure). There are typically hundreds of use cases within an organization\n\nthat could benefit from better data and AI — but not all use cases are of equal importance or feasibility.\n\nLeaders require a systematic approach for identifying, evaluating, prioritizing and implementing use cases.\n\n**Establish the list of potential use cases**\n\nThe first step is to ideate by bringing together various stakeholders from across the organization and\n\nunderstand the overall business drivers — especially those that are monitored by the CEO and board of\n\ndirectors. The second step is to identify use case opportunities in collaboration with business stakeholders,\n\nand understand the business processes and the data required to implement the use case. After steps one and\n\ntwo, the next step is to prioritize these cases by calculating the expected ROI. To avoid this becoming a pet\n\nproject within the data/IT teams, it’s important to have a line of business champion at the executive level.\n\nThere needs to be a balance between use cases that are complex and ones that are considered low-\n\nhanging fruit. 
For example, determining if a web visitor is an existing or net new customer requires a fairly\n\nstraightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n\ngiven individual or household. However, developing a sophisticated credit card fraud model that takes into\n\naccount geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n\nto perform the analytics.\n\nIn terms of performance, thought should be given to the speed at which the use case must execute. In\n\ngeneral, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n\ncases into three categories:\n\n**1.** Sub-second response\n\n**2.** Multi-second response\n\n**3.** Multi-minute response\n\n\n-----\n\nBeing pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n\nengineering the design and infrastructure.\n\n**Thinking in terms of “data assets”**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "6e20bb6a8fb31697144f9de8e058686d", + "straightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n\ngiven individual or household. However, developing a sophisticated credit card fraud model that takes into\n\naccount geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n\nto perform the analytics.\n\nIn terms of performance, thought should be given to the speed at which the use case must execute. In\n\ngeneral, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n\ncases into three categories:\n\n**1.** Sub-second response\n\n**2.** Multi-second response\n\n**3.** Multi-minute response\n\n\n-----\n\nBeing pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n\nengineering the design and infrastructure.\n\n**Thinking in terms of “data assets”**\n\nMachine learning algorithms require data — data that is readily available, of high quality and relevant — to\n\nperform the experiments, train the models, and then execute the model when it is deployed to production.\n\nThe quality and veracity of the data used to perform these machine learning steps are key to deploying\n\nmodels into production that produce a tangible ROI.\n\nIt is critical to understand what steps are needed in order to make the data available for a given use case.\n\nOne point to consider is to prioritize use cases that make use of similar or adjacent data. If your engineering\n\nteams need to perform work to make data available for one use case, then look for opportunities to have the\n\nengineers do incremental work in order to surface data for adjacent use cases.\n\nMature data and AI companies embrace the concept of “data assets” or “data products” to indicate\n\nthe importance of adopting a design strategy and data asset roadmap for the organization. Taking this\n\napproach helps stakeholders avoid fit-for-purpose data sets that drive only a single use case — and raise\n\nthe level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n\nroadmap helps data source owners understand the priority and complexity of the data assets that need to\n\nbe created. 
Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n\ninfluences the design of business applications and other systems within the organization.\n\n**Determine the highest impact/priority**\n\nAs shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n\naccount three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n\nwhether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n\nreduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n\ndata science talent readily available, to implement the use case. The ROI score indicates whether or not the\n\norganization can easily measure the impact to the P/L.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "0227c134834456af92ed318a9952e270", + "the level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n\nroadmap helps data source owners understand the priority and complexity of the data assets that need to\n\nbe created. Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n\ninfluences the design of business applications and other systems within the organization.\n\n**Determine the highest impact/priority**\n\nAs shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n\naccount three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n\nwhether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n\nreduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n\ndata science talent readily available, to implement the use case. 
The ROI score indicates whether or not the\n\norganization can easily measure the impact to the P/L.\n\n\n-----\n\nScoring guidelines (relative scoring), scored by business and technology stakeholders:\n\n|Score|Criteria|1 = LOW SCORE, DO LATER|5 = AVERAGE, NICE TO HAVE|10 = HIGH, MUST HAVE|\n|---|---|---|---|---|\n|Strategic Importance Score: How important is it to business success?|Business Alignment|Not required for any corporate goals|Not required for immediate corporate goals|Required for immediate corporate goals|\n||Business Driver|Does not drive growth/profitability (P&L) or competitiveness|Could drive some growth/profitability (P&L)|Significantly drives growth/profitability (P&L) and competitiveness|\n||IT Foundation|No BI/IT dependencies|BI/IT best practice|BI/IT foundational element|\n|Feasibility Score: What is the current data and AI readiness?|Data Access and Trust (adjusting based on availability)|Low awareness of available data (internal and external) or the problems it can solve|Some ingestion and exploration of large-scale data is possible|Large-scale data is available for exploration in the cloud|\n||Delivery (Data Engineers, Data Scientists, Data Analysts)|Limited in-house resources|Hiring plan for data science and engineering resources, few available in-house|Scaled data science, engineering, cloud and deployment organization|\n||Architecture|Current thinking on architecture resembles an on-prem traditional data warehousing solution with batch processes rather than a data lakehouse approach|Architecture has been built and tested, some use cases are underway with multiple data sources now available in the cloud|The platform is utilized at scale across the business and is able to evolve to meet the demands of new business lines and services driven by data|\n|ROI Score: How tangible and large is the ROI?|ROI Potential|Mostly productivity gains, “soft intangible benefits”|Some P&L impact, not easily tangible|Significant P&L impact, “hard measured benefits”|\n\n\n**Figure 5:**\nMethodology for scoring use cases\n\n**Ensure business and technology leadership alignment**\n\nPrioritizing use cases requires striking a balance between offensive- and defensive-oriented use cases.\n\nIt is important for executives to evaluate use cases in terms of opportunity growth (offensive) and risk\n\nreduction (defensive). For example, data governance and compliance use cases should take priority\n\nover offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n\nacquisition of a new customer.\n\n\n-----\n\nThe Databricks Professional Services team can\n\nhelp customers identify revenue-generating and\n\ncost-saving opportunities for data and AI use cases\n\nthat provide a significant ROI when adopting the\n\n\n#### 3. Build successful data teams\n\nIn order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n\nperforming teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n\ntraining and leadership. Digital transformations require executive-level support and are likely to fail without\n\nit — especially in large organizations.\n\nHowever, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n\nan enterprise level. 
In other words, they must also evolve their company culture into one that embraces data,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "69c4fe9bf7ab670ec15a30cf31ea26f5", + "over offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n\nacquisition of a new customer.\n\n\n-----\n\nThe Databricks Professional Services team can\n\nhelp customers identify revenue-generating and\n\ncost-saving opportunities for data and AI use cases\n\nthat provide a significant ROI when adopting the\n\n\n#### 3. Build successful data teams\n\nIn order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n\nperforming teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n\ntraining and leadership. Digital transformations require executive-level support and are likely to fail without\n\nit — especially in large organizations.\n\nHowever, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n\nan enterprise level. In other words, they must also evolve their company culture into one that embraces data,\n\ndata literacy, collaboration, experimentation and agile principles. We define these companies as “data native.”\n\n\nlakehouse architecture.\n\n**Chief information officers and chief data officers — two sides of the data coin**\n\nData native companies generally have a single, accountable executive who is responsible for areas such\n\nas data science, business analytics, data strategy, data governance and data management. The data\n\nmanagement aspects include registering data sets in a data catalog, tracing data lineage as data sets flow\n\nthrough the environment, performing data quality checks and scanning for sensitive data in the clear.\n\nMany organizations are rapidly adding the chief data officer (CDO) role to their executive ranks in order\n\nto oversee and manage these responsibilities. The CDO works closely with CIOs and other business\n\nstakeholders to establish the overall project plan, design and implementation — and to align project\n\nmanagement, product management, business analysis, data engineering, data scientist and machine\n\nlearning talent.\n\nThe CDO and CIO will need to build a broad coalition of support from stakeholders who are incentivized to\n\nmake the transformation a success and help drive organization-wide adoption. To do this, the stakeholders\n\nmust understand the benefits of — and their role and responsibilities in — supporting the initiative.\n\n\n-----\n\nThere are two organizational constructs that are found in most successful data native companies. The first is\n\nthe creation of an _AI/ML center of excellence_ (COE) that is designed to establish in-house expertise around\n\nML and AI, and which is then used to educate the rest of the organization on best practices. 
The second is\n\nthe formation of a _data and AI transformation steering committee_ that will oversee and guide decisions and\n\npriorities for the transformative data, analytics and AI initiatives, plus help remove obstacles.\n\nFurthermore, CDOs need to bring their CIOs along early in the journey.\n\n**Creating an AI/ML COE**\n\nData science is a fast-evolving discipline with an ever-growing set of frameworks and algorithms to enable\n\neverything from statistical analysis to supervised learning to deep learning using neural networks. While it is\n\ndifficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n\ndocument, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n\nHowever, the general distinction is that data science is used to produce insights, machine learning is used to\n\nproduce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n\nis expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n\nvarious data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n\nset of questions.\n\nOrganizations wanting to build a data science competency should consider hiring talent into a centralized\n\norganization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n\ndata science. The COE works with the rest of the organization to educate and promote the appropriate use\n\nof data science for various use cases.\n\n\n-----\n\nA common approach is to have the COE report into the CDO, but still have data scientists dotted line into", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "2cd6d562ff9fe9da4014a21f3f129fd5", + "difficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n\ndocument, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n\nHowever, the general distinction is that data science is used to produce insights, machine learning is used to\n\nproduce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n\nis expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n\nvarious data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n\nset of questions.\n\nOrganizations wanting to build a data science competency should consider hiring talent into a centralized\n\norganization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n\ndata science. The COE works with the rest of the organization to educate and promote the appropriate use\n\nof data science for various use cases.\n\n\n-----\n\nA common approach is to have the COE report into the CDO, but still have data scientists dotted line into\n\nthe business units or department. 
Using this approach, you achieve two goals:\n\n\u0007The data scientists are closer to the business stakeholders, have a better understanding of the data\n\nwithin a business unit and can help identify use cases that drive value\n\n\u0007Having the data scientists reporting into the CDO provides a structure that encourages collaboration\n\nand consistency in how work is performed among the cohort and brings that to the entire organization\n\n**Data and AI transformation steering committee**\n\nThe purpose of the steering committee is to provide governance and guidance to the data transformation\n\ninitiative. The CDO and CIO should co-chair the committee along with one business executive who can be\n\na vocal advocate and help drive adoption. The level of executive engagement is critical to success of the\n\ninitiative.\n\nThe steering committee should meet regularly with leaders from across the organization to hear status\n\nreports and resolve any conflicts and remove obstacles, if possible. The leaders should represent a broad\n\ngroup of stakeholders, including:\n\n\u0007\n**Program/project management:** To report the status of progress for deploying the new data\n\necosystem and driving adoption through use cases\n\n\u0007\n**Business partners:** To provide insight and feedback on how easy or difficult it is to drive adoption\n\nof the platform\n\n\u0007\n**Engineering:** To report the status of the implementation and what technology trade-offs need\n\nto be made\n\n\u0007\n**Data science:** To report on the progress made by the COE on educating the organization about\n\nuse cases for ML and to report the status of various implementations\n\n\n-----\n\n\u0007\n**InfoSec:** To review the overall security, including network, storage, application and data\n\nencryption and tokenization\n\n\u0007\n**Architecture:** To oversee that the implementation adheres to architectural standards\n\nand guardrails\n\n\u0007\n**Risk, compliance and legal:** To oversee the approach to data governance\n\nand ethics in ML\n\n\u0007\n**User experience:** To serve as the voice of the end users who will perform their jobs using\n\nthe new data ecosystem\n\n\u0007\n**Communication:** To provide up-to-date communications to the organization about next\n\nsteps and how to drive adoption\n\n**Partnering with architecture and InfoSec**\n\nEarly on, the CDO and CIO should engage the engineering and architecture community within the\n\norganization to ensure that everyone understands the technical implications of the overall strategy. This\n\nminimizes the chances that the engineering teams will build separate and competing data platforms. In\n\nregulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n\nThe EA is responsible for validating that the overall technology design and data management features\n\nsupport the performance and regulatory compliance requirements — specifically, whether the proposed\n\ndesign can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n\nvariety and veracity (four Vs) of the data environment.\n\n\nIt is important to fully understand which\n\nenvironments and accounts your data is stored\n\nin. 
The goal is to minimize the number of copies of", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "8a601498740072e4936b24cddac2e38f", + "\u0007\n**Communication:** To provide up-to-date communications to the organization about next\n\nsteps and how to drive adoption\n\n**Partnering with architecture and InfoSec**\n\nEarly on, the CDO and CIO should engage the engineering and architecture community within the\n\norganization to ensure that everyone understands the technical implications of the overall strategy. This\n\nminimizes the chances that the engineering teams will build separate and competing data platforms. In\n\nregulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n\nThe EA is responsible for validating that the overall technology design and data management features\n\nsupport the performance and regulatory compliance requirements — specifically, whether the proposed\n\ndesign can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n\nvariety and veracity (four Vs) of the data environment.\n\n\nIt is important to fully understand which\n\nenvironments and accounts your data is stored\n\nin. The goal is to minimize the number of copies of\n\nyour data and to keep the data within your cloud\n\naccount — and not the vendor’s.\n\nMake sure the architecture and security model for\n\nprotecting data is well understood.\n\n\n-----\n\nFrom an InfoSec perspective, the CDO must work to ensure that the proper controls and security are\n\napplied to the new data ecosystem and that the authentication, authorization and access control methods\n\nmeet all the data governance requirements. An industry best practice is to enable self-service registration\n\nof data sets, by the data owner, and support the assignment of security groups or roles to help automate\n\nthe access control process. This allows data sets to be accessible only to the personnel that belong to a\n\ngiven group. The group membership could be based primarily on job function or role within the organization.\n\nThis approach provides fast onboarding of new employees, but caution should be taken not to proliferate\n\ntoo many access control groups — in other words, do not get too fine grained with group permissions, as\n\nthey will become increasingly difficult to manage. A better strategy is to be more coarse-grained and use\n\nrow- and column-level security sparingly.\n\n**Centralized vs. federated labor strategy**\n\nIn most organizations today, managers work in silos, making decisions with the best intentions but focused\n\non their own functional areas. The primary risk to the status quo is that there will be multiple competing and\n\nconflicting approaches to creating enterprise data and AI platforms. This duplication of effort will waste time\n\nand money and potentially erode the confidence and motivation of the various teams. 
While it certainly is\n\nbeneficial to compare and contrast different approaches to implementing an architecture, the approaches\n\nshould be strictly managed, with everyone designing for the same goals and requirements — as described in\n\nthis strategy document and adhering to the architectural principles and best practices.\n\nEven still, the roles of the CDO and CIO together should deliver a data analytics and AI platform with the\n\nleast amount of complexity as possible, and one that can easily scale across the organization. It is very\n\nchallenging to merge disparate data platform efforts into a single, cohesive design. It is best to get out\n\nin front of this wave of innovation and take input from the various teams to create a single, centralized\n\nplatform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n\nmodern data stack — while ensuring that there is no duplication of effort when implementing the platform\n\ncomponents. Figure 6 shows one possible structure.\n\n\n-----\n\n**Figure 6:**\nCentralized teams with matrixed responsibilities\n\n\n**Data Scientist**\nModel and predict with data\n\n**Data Analyst**\nVisualize and describe data\n\n\n**Team A ($1.1M)** **Team B ($1.3M)** **Team C ($1.5M)**\n\n**Data Engineer**\nStore, process, maintain data\n\n**Business Partners**\n**and Domain Experts**\n\n\nCentralize data scientists under CDO — embed in lines of business for day-to-day tasking", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "46976d7e483261a09e448b88ed2dab97", + "in front of this wave of innovation and take input from the various teams to create a single, centralized\n\nplatform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n\nmodern data stack — while ensuring that there is no duplication of effort when implementing the platform\n\ncomponents. Figure 6 shows one possible structure.\n\n\n-----\n\n**Figure 6:**\nCentralized teams with matrixed responsibilities\n\n\n**Data Scientist**\nModel and predict with data\n\n**Data Analyst**\nVisualize and describe data\n\n\n**Team A ($1.1M)** **Team B ($1.3M)** **Team C ($1.5M)**\n\n**Data Engineer**\nStore, process, maintain data\n\n**Business Partners**\n**and Domain Experts**\n\n\nCentralize data scientists under CDO — embed in lines of business for day-to-day tasking\n\nCentralize data engineers under CIO/CTO — initially as an enterprise function\n\n**Hiring, training and upskilling your talent**\n\nWhile this guide does not cover recruiting strategies, it is important to note that data engineering and data\n\nscience talent is very difficult to find in this competitive market. As a result, every organization should\n\nconsider what training and upskilling opportunities exist for their current staff. A large number of online\n\ncourses, at relatively low cost, teach the fundamentals of data science and AI. It will still be important to\n\naugment your existing staff with experienced data scientists and machine learning experts. You will then\n\nneed to establish clear training paths, resources and timelines to upskill your talent.\n\nUsing the COE construct, it is easier to upskill a mix of data science talent by having the experts mentor the\n\nless experienced staff. The majority of Ph.D.-level talent comes from academia and has a vested interest\n\nin educating others. 
It’s important to set up the structure and allow time in the schedule for knowledge\n\ntransfer, experimentation and a safe environment in which to fail. A key aspect in accelerating the\n\nexperience of your talent is to enable data science using production-like data and creating a collaborative\n\nenvironment for code sharing.\n\n\n-----\n\nThe Databricks training, [documentation](https://docs.databricks.com) and\n\n[certification](https://databricks.com/learn/certification) available to customers is industry-\n\nleading, and our [Solution Accelerators](https://databricks.com/solutions/accelerators) provide\n\n\n#### 4. Deploy a modern data stack\n\nThe modern data architecture can most easily be described as the evolution of the enterprise data\n\nwarehouse (EDW) from the 1980s and the Hadoop-style data lakes from the mid-2000s. The capabilities,\n\nlimitations and lessons learned from working with these two legacy data architectures inspired the next\n\ngeneration of data architecture — what the industry now refers to as the lakehouse.\n\nFigure 7 shows how the architectures have evolved as networking, storage, memory and CPU performance\n\nhave improved over time.\n\n\nexemplar code for organizations to hit the ground\n\nrunning with data and AI.\n\n**Figure 7:**\nA brief history of data architectures\n\n\n-----\n\n**Evolving beyond the enterprise data warehouse and data lake**\n\nThe EDW provided organizations with the ability to easily load structured and semi-structured data into\n\nwell-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n\n(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n\nthe business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n\ncatalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n\nusers — while still being performant. The EDW served its purpose for decades. However, most of the recent\n\nadvances in AI have been in better models to process unstructured data (text, images, video, audio), but", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "170152bbd1e1dfc694f4a48a8e767c4c", + "have improved over time.\n\n\nexemplar code for organizations to hit the ground\n\nrunning with data and AI.\n\n**Figure 7:**\nA brief history of data architectures\n\n\n-----\n\n**Evolving beyond the enterprise data warehouse and data lake**\n\nThe EDW provided organizations with the ability to easily load structured and semi-structured data into\n\nwell-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n\n(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n\nthe business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n\ncatalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n\nusers — while still being performant. The EDW served its purpose for decades. 
However, most of the recent\n\nadvances in AI have been in better models to process unstructured data (text, images, video, audio), but\n\nthese are precisely the types of data that an EDW is not optimized for.\n\nTherefore, in the mid-2000s, organizations wanted to take advantage of new data sets — _ones that_\n\n_contained unstructured data_ — and apply new analytics — _ones that leveraged emerging data science_\n\n_algorithms_ . In order to accomplish this, massive investments in on-premises data lakes occurred — most\n\noften leveraging Apache Hadoop and its distributed file system, known as HDFS, running on low-cost,\n\ncommodity hardware. The Hadoop-style data lake provided the separation of compute from storage that\n\norganizations were seeking — thus eliminating the risk of vendor lock-in and opening the doors to a wide\n\nrange of new analytics. Despite all these benefits, the architecture proved to be difficult to use, with a\n\ncomplex programming model known as MapReduce, and the performance fell short of the majority of real-\n\ntime use cases.\n\nOver time, Hadoop workloads were often migrated to Apache Spark™ workloads, which run 100x faster by\n\nprocessing data in-memory across a cluster — with the ability to massively scale. The Spark programming\n\nmodel was also simpler to use and provided a consistent set of application programming interfaces (APIs)\n\nfor languages such as Python, SQL, R, Java and Scala. Spark was the first major step in separating compute\n\nfrom storage and providing the scale needed for distributed workloads.\n\n\n-----\n\nA data lakehouse combines the best of data\n\n\n**Cloud-based data lakes**\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud object stores like\n\nAmazon S3 and Azure Data Lake Storage (ADLS) have become some of the largest, most cost-effective\n\nstorage systems in the world — which make them an attractive platform to serve as the next generation\n\nof data lakes. Object stores excel at massively parallel reads — an essential requirement for modern data\n\nwarehouses.\n\n\nlakes and data warehouses, enabling BI and ML\n\n\nHowever, data lakes lack some critical features: They do not support transactions, they do not enforce\n\ndata quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\n\n**Lakehouse — the modern data architecture**\n\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\n\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\n\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n\narchitecture possible.\n\n\non all data on a simple, open and multicloud\n\nmodern data stack.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "a6c4aa57b347d46b3d74ce86a7176024", + "data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\n\n**Lakehouse — the modern data architecture**\n\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\n\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\n\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n\narchitecture possible.\n\n\non all data on a simple, open and multicloud\n\nmodern data stack.\n\n\n-----\n\n**Exploratory Data Scientist**\n\n\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\n\n\n**Curated Data Lake**\n\n\n**Raw Data Ingest**\n“Bronze”\n\n\n**Filtered/Cleaned/Augmented**\n“Silver”\n\n\n**Business-Level Aggregates**\n“Gold”\n\n\n**D ATA Q U A L I T Y**\n\n**Data Sources (Batch and Real-Time)**\n\n\n**Unstructured**\n\n- Image, Video, Audio\n\n- Free Text, Blob\n\n\n**Semi-Structured**\n\n- Logs, Clickstream\n\n- CSV, JSON, XML\n\n\n**Structured**\n\n- Systems of Record\n\n- Operational DBs\n\n\n**Figure 8:**\nThe building blocks for a modern data architecture\n\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\n\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\n\ntarget-state architecture supports loading all the data types that might be interesting to an organization —\n\nstructured, semi-structured and unstructured — and provides a single processing layer, using consistent\n\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\n\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\n\ntime, money and duplication of effort. 
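To make the Bronze/Silver/Gold refinement described above concrete, the following is a minimal, illustrative PySpark sketch of one curation step. It is not code from this cookbook: the table names (`bronze.raw_events`, `silver.cleaned_events`) and the cleansing rules are assumptions, and it presumes a Spark session with Delta Lake available, as on Databricks.

```python
from pyspark.sql import SparkSession, functions as F

# Illustrative sketch only: table names and rules are hypothetical, and a
# Delta Lake-enabled Spark environment (e.g., Databricks) is assumed.
spark = SparkSession.builder.getOrCreate()

# Bronze: raw data as ingested into the landing zone.
bronze_df = spark.read.table("bronze.raw_events")

# Silver: filtered, cleaned and augmented data, curated with the same
# DataFrame API regardless of the original source format.
silver_df = (
    bronze_df.dropDuplicates(["event_id"])              # de-duplicate raw records
    .filter(F.col("event_ts").isNotNull())               # simple data quality rule
    .withColumn("ingest_date", F.to_date("event_ts"))    # augmentation for downstream use
)

# Publish the curated "data asset" back to the lakehouse as a Delta table.
silver_df.write.format("delta").mode("overwrite").saveAsTable("silver.cleaned_events")
```

The same DataFrame API applies whether the source was structured, semi-structured or unstructured metadata, which is the consistency the single processing layer is meant to provide.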
Data arrives in a landing zone and is then moved through a series of\n\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\n\nThe architecture makes possible the efficient creation of “data assets” for the organization by taking a\n\nstepwise approach to improving data.\n\n\n-----\n\n**Lakehouse key features**\n\nTo effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\n\navailable for stakeholders to run business-critical production workloads:\n\n\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\n\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\n\nmonitoring and recovery.\n\n\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n\nread or write data, typically using SQL.\n\n\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\n\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n\n\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n\nlakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "0ec047404a66ef05632a43b6b58c06ef", + "management with declarative pipeline development, automatic data testing and deep visibility for\n\nmonitoring and recovery.\n\n\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n\nread or write data, typically using SQL.\n\n\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\n\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n\n\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n\nlakes across clouds — based on the ANSI SQL open standards. 
The lakehouse enables organizations\n\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\n\nunstructured data like tables, files, models and dashboards in concert with existing data, storage and\n\ncatalogs.\n\n\u0007 **Storage is decoupled from compute:** In practice this means storage and compute use separate\n\nclusters, thus these systems are able to scale to many more concurrent users and larger data sizes.\n\nSome modern data warehouses also have this property.\n\n\u0007 **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide\n\nan API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently\n\naccess the data directly.\n\n\nDatabricks released Delta Lake to the open source\n\ncommunity in 2019. Delta Lake provides all the data\n\nlifecycle management functions that are needed\n\nto make cloud-based object stores reliable and\n\nperformant. This design allows clients to update\n\nmultiple objects at once, replace a subset of\n\nthe objects with another, etc., in a serializable\n\nmanner that still achieves high parallel read/write\n\nperformance from the objects — while offering\n\nadvanced capabilities like time travel (e.g., query\n\npoint-in-time snapshots or rollback of erroneous\n\nupdates), automatic data layout optimization,\n\nupserts, caching and audit logs.\n\n\n-----\n\n\u0007 **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be\n\nused to store, refine, analyze and access data types needed for many new data applications, including\n\nimages, video, audio, semi-structured data and text.\n\n\u0007 **Support for diverse workloads:** This includes data science, machine learning, SQL and analytics. Multiple\n\ntools might be needed to support all these workloads, but they all rely on the same data repository.\n\n\u0007 **End-to-end streaming:** Real-time reports are the norm in many enterprises. Support for streaming\n\neliminates the need for separate systems dedicated to serving real-time data applications.\n\n\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. This reduces staleness,\n\nimproves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n\ndata in both a data lake and a warehouse.\n\n\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n\ngovernance experience across all clouds. You don’t need to invest in reinventing processes for every\n\ncloud platform that you’re using to support your data and AI efforts. Instead, your data teams can simply\n\nfocus on putting all your data to work to discover new insights and create business value.\n\n\n###### Lakehouse Platform\n\nData Warehousing | Data Engineering | Data Streaming | Data Science and ML\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData reliability and performance\n\nCloud Data Lake\nAll structured and unstructured data", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "907a8e9c378b4107ab4c12c53e599cb4", + "\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. 
This reduces staleness,\n\nimproves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n\ndata in both a data lake and a warehouse.\n\n\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n\ngovernance experience across all clouds. You don’t need to invest in reinventing processes for every\n\ncloud platform that you’re using to support your data and AI efforts. Instead, your data teams can simply\n\nfocus on putting all your data to work to discover new insights and create business value.\n\n\n###### Lakehouse Platform\n\nData Warehousing | Data Engineering | Data Streaming | Data Science and ML\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData reliability and performance\n\nCloud Data Lake\nAll structured and unstructured data\n\n\n**Figure 9:**\nDelta Lake is the open data storage layer that delivers reliability, security and performance on your data lake — for both streaming and batch operations\n\n\n-----\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools\n\nfor security and access control are basic requirements. Data governance capabilities, including auditing,\n\nretention and lineage, have become essential, particularly in light of recent privacy regulations. Tools that\n\nenable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse,\n\nsuch enterprise features only need to be implemented, tested and administered for a single system.\n\nDatabricks is the only cloud-native vendor\n\n\n**Databricks — innovation driving performance**\n\nAdvanced analytics and machine learning on unstructured and large-scale data are two of the most\n\nstrategic priorities for enterprises today — and the growth of unstructured data is going to increase\n\nexponentially — so it makes sense for CIOs and CDOs to think about positioning their data lake as the\n\ncenter of their data infrastructure. The main challenge is whether or not it can perform reliably and fast\n\nenough to meet the SLAs of the various workloads — especially SQL-based analytics.\n\nDatabricks has focused its engineering efforts on incorporating a wide range of industry-leading software\n\nand hardware improvements in order to implement the first lakehouse solution. Our approach capitalizes\n\non the computing advances of the Apache Spark framework and the latest networking, storage and CPU\n\ntechnologies to provide the performance customers need to simplify their architecture. These innovations\n\ncombine to provide a single architecture that can store and process all the data sets within an organization —\n\nsupporting the range of analytics outlined above.\n\n**BI and SQL workloads**\n\nPerhaps the most significant challenge for the lakehouse architecture is the ability to support SQL queries\n\nfor star/snowflake schemas in support of BI workloads. Part of the reason EDWs have remained a major\n\npart of the data ecosystem is because they provide low-latency, high-concurrency query support. In order\n\nto compete with the EDW, optimizations must be found within the lakehouse architecture that provide\n\nsatisfactory query performance for the majority of BI workloads. 
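The transaction support, schema enforcement and time travel capabilities summarized above can be sketched with a few Delta Lake statements. This is an illustrative example only, with hypothetical names (`gold.daily_sales`, a staged `updates` view) and an arbitrary version number; it assumes a Delta-enabled Spark environment such as Databricks.

```python
from pyspark.sql import SparkSession

# Illustrative sketch only: table names, the staged `updates` view and the
# version number are hypothetical; a Delta-enabled environment is assumed.
spark = SparkSession.builder.getOrCreate()

# Schema enforcement: writes that do not match this schema are rejected
# rather than silently corrupting the table.
spark.sql("""
    CREATE TABLE IF NOT EXISTS gold.daily_sales (
        sale_date DATE,
        region    STRING,
        revenue   DOUBLE
    ) USING DELTA
""")

# ACID upsert: MERGE keeps concurrent readers consistent while data changes.
spark.sql("""
    MERGE INTO gold.daily_sales AS t
    USING updates AS u
    ON t.sale_date = u.sale_date AND t.region = u.region
    WHEN MATCHED THEN UPDATE SET t.revenue = u.revenue
    WHEN NOT MATCHED THEN INSERT *
""")

# Time travel: query a point-in-time snapshot, e.g., to audit or roll back
# an erroneous update.
previous_snapshot = spark.sql("SELECT * FROM gold.daily_sales VERSION AS OF 12")
```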
Fortunately, advances in query plan, query\n\nexecution, statistical analysis of files in the object store, and hardware and software improvements make it\n\npossible to deliver on this promise.\n\n\nto be recognized as a Leader in both\n\n[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n\n**Cloud Database Management Systems** and\n\n**Data Science and Machine Learning Platforms**\n\n\n-----\n\n**A word about the data mesh architecture**\n\nIn 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n\nwhat some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lake", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "6d08d458f9c7010b81932bc03d5ed771", + "part of the data ecosystem is because they provide low-latency, high-concurrency query support. In order\n\nto compete with the EDW, optimizations must be found within the lakehouse architecture that provide\n\nsatisfactory query performance for the majority of BI workloads. Fortunately, advances in query plan, query\n\nexecution, statistical analysis of files in the object store, and hardware and software improvements make it\n\npossible to deliver on this promise.\n\n\nto be recognized as a Leader in both\n\n[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n\n**Cloud Database Management Systems** and\n\n**Data Science and Machine Learning Platforms**\n\n\n-----\n\n**A word about the data mesh architecture**\n\nIn 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n\nwhat some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lake\n\nusing a series of extract, transform, load (ETL) processes — which unnecessarily adds complexity. The data\n\nmesh approach avoids centralizing data in one location and encourages the source systems to create\n\n“data products” or “data assets” that are served up directly to consumers for data and AI workloads. The\n\ndesigners advocate for a federated approach to data and AI — while using enterprise policies to govern how\n\nsource systems make data assets available.\n\nThere are several challenges with this approach. First, the data mesh assumes that each source system\n\ncan dynamically scale to meet the demands of the consumers — particularly challenging when data assets\n\nbecome “hot spots” within the ecosystem. Second, centralized policies oftentimes leave the implementation\n\ndetails to the individual teams. This has the potential of inconsistent implementations, which may lead to\n\nperformance degradations and differing cost profiles. Finally, the data mesh approach assumes that each\n\nsource system team has the necessary skills, or can acquire them, to build robust data products.\n\nThe lakehouse architecture is not at odds with the data mesh philosophy — as ingesting higher-quality data\n\nfrom the source systems reduces the curation steps needed inside the data lake itself.\n\n\n-----\n\n#### 5. Improve data governance and compliance\n\nData governance is perhaps the most challenging aspect of data transformation initiatives. Every\n\nstakeholder recognizes the importance of making data readily available, of high quality and relevant to help\n\ndrive business value. 
Likewise, organizations understand the risks of failing to get it right — the potential for\n\nundetected data breaches, negative impact on the brand and the potential for significant fines in regulated\n\nenvironments. However, organizations shouldn’t perceive data governance or a defensive data strategy as\n\na blocker or deterrent to business value. In fact, many organizations have leveraged their strong stance on\n\ndata governance as a competitive differentiator to earn and maintain customer trust, ensure sound data\n\nand privacy practices, and protect their data assets\n\n**Why data governance fails**\n\nWhile most people agree that data governance is a set of principles, practices and tooling that helps\n\nmanage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic\n\napproach — one that balances realistic policies with automation and scalability.\n\nToo often the policies developed around data governance define very strict data management principles —\n\nfor example, the development of an enterprise-wide ontological model that all data must adhere to.\n\nOrganizations can spend months, if not years, trying to define the perfect set of policies. The engineering\n\neffort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the\n\ncomplexity of the requirements. Meanwhile, data continues to flow through the organization without a\n\nconsistent approach to governance, and too much of the effort is done manually and fraught with human error.\n\n\nWhat are the basic building blocks of a sound data\n\ngovernance approach?\n\n\n-----\n\n**A pragmatic approach to data governance**\n\nAt a high level, organizations should enable the following data management capabilities:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "3bd334ec290957c54f467da186a0ee7d", + "manage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic\n\napproach — one that balances realistic policies with automation and scalability.\n\nToo often the policies developed around data governance define very strict data management principles —\n\nfor example, the development of an enterprise-wide ontological model that all data must adhere to.\n\nOrganizations can spend months, if not years, trying to define the perfect set of policies. The engineering\n\neffort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the\n\ncomplexity of the requirements. 
Meanwhile, data continues to flow through the organization without a\n\nconsistent approach to governance, and too much of the effort is done manually and fraught with human error.\n\n\nWhat are the basic building blocks of a sound data\n\ngovernance approach?\n\n\n-----\n\n**A pragmatic approach to data governance**\n\nAt a high level, organizations should enable the following data management capabilities:\n\n**\u0007Identify all sources of data**\n\n\u0007Identify all data-producing and data-storing applications\n\n\u0007Identify the systems of record (SOR) for each data set\n\n\u0007Label data sets as internal or external (third party)\n\n\u0007Identify where sensitive data is stored — GDPR/CCPA scope\n\n\u0007Limit which operational data stores (ODSs) can re-store SOR data\n\n**\u0007Catalog data sets**\n\n\u0007Register all data sets in a centralized data catalog\n\n\u0007Create a lightweight, self-service data registration process\n\n\u0007Limit manual entry as much as possible\n\n\u0007Record the schema, if any, for the data set\n\n\u0007Use an inference engine or tool to extract the data set schema\n\n\u0007Add business and technical metadata to make it meaningful\n\n\u0007Use machine learning to classify data sets\n\n\u0007Use crowdsourcing to validate the machine-based results\n\n**Track data lineage**\n\n\u0007Track data set flow and what systems act on data\n\n\u0007Create an enumerated list of action values for specific operations\n\n\u0007Emit lineage events via streaming layer and aggregate in data lake lineage event schema:\n\n\n\n\u0007Optional: Add a source code repository URL for action traceability\n\n\n-----\n\n**\u0007Perform data quality checks**\n\n\u0007Create a rules library that is centrally managed and versioned\n\n\u0007Update the rules library periodically with new rules\n\n\u0007Use a combination of checks — null/not null, regex, valid values\n\n\u0007Perform schema enforcement checks against data set registration\n\nBy minimizing the number of copies of your data\n\n\n**\u0007Scan for sensitive data**\n\n\u0007Establish a tokenization strategy for sensitive data — GDPR/CCPA\n\n\u0007Tokenize all sensitive data stored in the data lake — avoid cleartext\n\n\u0007Use fixed-length tokens to preserve analytic value\n\n\u0007Determine the approach for token lookup/resolution when needed\n\n\u0007Ensure that any central token stores are secure with rotating keys\n\n\u0007Identify which data elements from GDPR/CCPA to include in scans\n\n\u0007Efficiently scan for sensitive data in cleartext using the rules library\n\n**\u0007Establish approved data flow patterns**\n\n\u0007Determine pathways for data flow (source —> target)\n\n\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n\n\u0007Determine read/write patterns for the data lake\n\n\u0007Strictly enforce data flow pathways to/from data lake\n\n\u0007Detect violations and anomalies using lineage event analysis\n\n\u0007Identify offending systems and shut down or grant exception\n\n\u0007Record data flow exceptions and set a remediation deadline\n\n**\u0007Centralize data access controls**\n\n\u0007Establish a common governance model for all data and AI assets\n\n\u0007Centrally define access policies for all data and AI assets\n\n\u0007Enable fine-grained access controls at row and column levels\n\n\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n\n\nand moving to a single data processing layer where\n\nall your data governance controls can run 
together,\n\nyou improve your chances of staying in compliance", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "1785ddb61be78e30cb44c8b841db719b", + "\u0007Efficiently scan for sensitive data in cleartext using the rules library\n\n**\u0007Establish approved data flow patterns**\n\n\u0007Determine pathways for data flow (source —> target)\n\n\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n\n\u0007Determine read/write patterns for the data lake\n\n\u0007Strictly enforce data flow pathways to/from data lake\n\n\u0007Detect violations and anomalies using lineage event analysis\n\n\u0007Identify offending systems and shut down or grant exception\n\n\u0007Record data flow exceptions and set a remediation deadline\n\n**\u0007Centralize data access controls**\n\n\u0007Establish a common governance model for all data and AI assets\n\n\u0007Centrally define access policies for all data and AI assets\n\n\u0007Enable fine-grained access controls at row and column levels\n\n\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n\n\nand moving to a single data processing layer where\n\nall your data governance controls can run together,\n\nyou improve your chances of staying in compliance\n\nand detecting a data breach.\n\n\n-----\n\n**\u0007Make data discovery easy**\n\n\u0007Establish a data discovery model\n\n\u0007Use manual or automatic data classification\n\n\u0007Provide a visual interface for data discovery across your data estate\n\n\u0007Simplify data discovery with rich keyword- or business glossary-based search\n\n**\u0007Centralize data access auditing**\n\n\u0007Establish a framework or best practices for access auditing\n\n\u0007Capture audit logs for all CRUD operations performed on data\n\n\u0007Make auditing reports easily accessible to data stewards/admins for ensuring compliance\n\nThis is not intended to be an exhaustive list of features and requirements but rather a framework to\n\nevaluate your data governance approach. There will be violations at runtime, so it will be important to have\n\nprocedures in place for how to handle these violations. In some cases, you may want to be very strict and\n\nshut down the data flow of the offending system. In other cases, you may want to quarantine the data until\n\nthe offending system is fixed. Finally, some SLAs may require the data to flow regardless of a violation. In\n\nthese cases, the receiving systems must have their own methodology for dealing with bad data.\n\n\n-----\n\n**Hidden cost of data governance**\n\nThere are numerous examples of high-profile data breaches and failure to comply with consumer data\n\nprotection legislation. You don’t have to look very far to see reports of substantial fines levied against\n\norganizations that were not able to fully protect the data within their data ecosystem. As organizations\n\nproduce and collect more and more data, it’s important to remember that while storage is cheap, failing\n\nto enforce proper data governance is very, very expensive.\n\nIn order to catalog, lineage trace, quality check, and scan your data effectively, you will need a lot of\n\ncompute power when you consider the massive amounts of data that exist in your organization. 
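As a rough illustration of the "rules library" approach to data quality checks listed earlier (null/not null, regex and valid-values rules), the sketch below applies a few centrally defined rules to one registered data set. The table, columns and rules are hypothetical; a real rules library would be centrally managed, versioned and run as part of the governed pipeline.

```python
from pyspark.sql import SparkSession, functions as F

# Illustrative sketch only: the data set, columns and rules are hypothetical.
spark = SparkSession.builder.getOrCreate()
df = spark.read.table("silver.customers")

rules = {
    "customer_id_not_null": F.col("customer_id").isNotNull(),
    "email_matches_regex": F.col("email").rlike(r"^[^@\s]+@[^@\s]+\.[^@\s]+$"),
    "country_valid_value": F.col("country").isin("US", "CA", "MX"),
}

# Count violations per rule so stewards can decide whether to quarantine the
# data, stop the flow or grant an exception, as described above.
violations = df.agg(
    *[F.sum(F.when(~cond, 1).otherwise(0)).alias(name) for name, cond in rules.items()]
)
violations.show()
```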
Each\n\ntime you copy a piece of data to load it into another tool or platform, you need to determine what data\n\ngovernance techniques exist there and how you ensure that you truly know where all your data resides.\n\nImagine the scenario where data flows through your environment and is loaded into multiple platforms\n\nusing various ETL processes. How do you handle the situation when you discover that sensitive data is\n\nin cleartext? Without a consistent set of data governance tools, you may not be able to remediate the\n\nproblem before it’s flagged for violation.\n\nHaving a smaller attack surface and fewer ingress/egress routes helps guard your data and protect your\n\norganization’s brand and balance sheet.\n\nThe bottom line is that the more complex your data ecosystem architecture is, the more difficult and costly\n\nit is to get data governance right.\n\n\n-----\n\n#### 6. Democratize access to quality data\n\nEffective data and AI solutions rely more on the amount of quality data available than on the sophistication\n\nor complexity of the model or algorithm. Google published a paper titled “The Unreasonable Effectiveness of", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "a475a44648b52500087a321fd8f2521e", + "governance techniques exist there and how you ensure that you truly know where all your data resides.\n\nImagine the scenario where data flows through your environment and is loaded into multiple platforms\n\nusing various ETL processes. How do you handle the situation when you discover that sensitive data is\n\nin cleartext? Without a consistent set of data governance tools, you may not be able to remediate the\n\nproblem before it’s flagged for violation.\n\nHaving a smaller attack surface and fewer ingress/egress routes helps guard your data and protect your\n\norganization’s brand and balance sheet.\n\nThe bottom line is that the more complex your data ecosystem architecture is, the more difficult and costly\n\nit is to get data governance right.\n\n\n-----\n\n#### 6. Democratize access to quality data\n\nEffective data and AI solutions rely more on the amount of quality data available than on the sophistication\n\nor complexity of the model or algorithm. Google published a paper titled “The Unreasonable Effectiveness of\n\nData” demonstrating this point. The takeaway is that organizations should focus their efforts on making sure\n\ndata scientists have access to the widest selection of relevant and high-quality data to perform their jobs —\n\nwhich is to create new opportunities for revenue growth, cost reduction and risk reduction.\n\n**The 80/20 data science dilemma**\n\nMost existing data environments have their data stored primarily in different operational data stores within a\n\ngiven business unit (BU) — creating several challenges:\n\n\u0007Most business units deploy use cases that are based only on their own data — without taking advantage\n\nof cross-BU opportunities\n\n\u0007The schemas are generally not well understood outside of BU or department — with only the database\n\ndesigners and power users being able to make efficient use of the data. 
This is referred to as the “tribal\n\nknowledge” phenomenon.\n\n\u0007The approval process and different system-level security models make it difficult and time-consuming\n\nfor data scientists to gain the proper access to the data they need\n\nIn order to perform analysis, users are forced to log in to multiple systems to collect their data. This is most\n\noften done using single-node data science and generates unnecessary copies of data stored on local disk\n\ndrives, various network shares or user-controlled cloud storage. In some cases, the data is copied to “user\n\nspaces” within production platform environments. This has the strong potential of degrading the overall\n\nperformance for true production workloads.\n\nTo make matters worse, these copies of data are generally much smaller than the full-size data sets that would\n\nbe needed in order to get the best model performance for your ML and AI workloads.\n\n\n-----\n\nSmall data sets reduce the effectiveness of exploration, experimentation, model development and model\n\ntraining — resulting in inaccurate models when deployed into production and used with full-size data sets.\n\nAs a result, data science teams are spending 80% of their time wrangling data sets and only 20% of their\n\ntime performing analytic work — work that may need to be redone once they have access to the full-size\n\ndata sets. This is a serious problem for organizations that want to remain competitive and generate game-\n\nchanging results.\n\nAnother factor contributing to reduced productivity is the way in which end users are typically granted\n\naccess to data. Security policies usually require both coarse-grained and fine-grained data protections.\n\nIn other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n\ngrained) within the data set.\n\n**Rationalize data access roles**\n\nThe most common approach to providing coarse-grained and fine-grained access is to use what’s known\n\nas role-based access control (RBAC). Individual users log on to system-level accounts or via a single sign-on\n\n(SSO) authentication and access control solution.\n\nUsers can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n\nThere are different strategies for identifying and creating these groups — but typically, they are done on a", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "6cb1f6fe81af31114d2adae75b0401a0", + "data sets. This is a serious problem for organizations that want to remain competitive and generate game-\n\nchanging results.\n\nAnother factor contributing to reduced productivity is the way in which end users are typically granted\n\naccess to data. Security policies usually require both coarse-grained and fine-grained data protections.\n\nIn other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n\ngrained) within the data set.\n\n**Rationalize data access roles**\n\nThe most common approach to providing coarse-grained and fine-grained access is to use what’s known\n\nas role-based access control (RBAC). 
Individual users log on to system-level accounts or via a single sign-on\n\n(SSO) authentication and access control solution.\n\nUsers can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n\nThere are different strategies for identifying and creating these groups — but typically, they are done on a\n\nsystem-by-system basis, with a 1:1 mapping for each coarse- and fine-grained access control combination.\n\nThis approach to data access usually produces a proliferation of user groups. It is not unusual to see several\n\nthousand discrete security groups for large organizations — despite having a much smaller number of\n\ndefined job functions.\n\nThis approach creates one of the biggest security challenges in large organizations. When personnel leave\n\nthe company, it is fairly straightforward to remove them from the various security groups. However, when\n\npersonnel move around within the organization, their old security group assignments often remain intact\n\nand new ones are assigned based on their new job function. This leads to personnel continuing to have\n\naccess to data that they no longer have a “need to know.”\n\n\nThe Databricks Lakehouse Platform brings together\n\nall the data and AI personas into one environment\n\nand makes it easy to collaborate, share code and\n\ninsights, and operate against the same view of data.\n\n\n-----\n\n**Data classification**\n\nHaving all your data sets stored in a single, well-managed data lake gives you the ability to use partition\n\nstrategies to segment your data based on “need to know.” Some organizations create a partition based\n\non which business unit owns the data and which one owns the data classification. For example, in a\n\nfinancial services company, credit card customers’ data could be stored separately from that of debit card\n\ncustomers, and access to GDPR/CCPA-related fields could be handled using classification labels.\n\nThe simplest approach to data classification is to use three labels:\n\n\u0007 **Public data:** Data that can be freely disclosed to the public. This would include your annual report, press\n\nreleases, etc.\n\n\u0007 **Internal data:** Data that has low security requirements but should not be shared with the public or\n\ncompetitors. This would include strategy briefings and market or customer segmentation research.\n\n\u0007 **Restricted data:** Highly sensitive data regarding customers or internal business operations. Disclosure\n\ncould negatively affect operations and put the organization at financial or legal risk. Restricted data\n\nrequires the highest level of security protection.\n\nSome organizations introduce additional labels, but care should be taken to make sure that everyone clearly\n\nunderstands how to apply them.\n\nThe data classification requirements should be clearly documented and mapped to any legal or regulatory\n\nrequirements. 
For example, CCPA is so sweeping that it includes 11 categories of personal information —\n\nand defines “personal information” as “information that identifies, relates to, describes, is capable of\n\nbeing associated with, or could reasonably be linked, directly or indirectly, with a particular consumer or\n\nhousehold.”\n\n\n-----\n\nJust examining one CCPA category, _Customer Records Information_ , we see that the following information is\n\nto be protected: name, signature, social security number, physical characteristics or description, address,\n\ntelephone number, passport number, driver’s license or state identification card number, insurance policy\n\nnumber, education, employment, employment history, bank account number, credit or debit card number,\n\nother financial information, medical information, and health insurance information.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "79eb4b4e5211fef4892cfd40c5e3441b", + "requires the highest level of security protection.\n\nSome organizations introduce additional labels, but care should be taken to make sure that everyone clearly\n\nunderstands how to apply them.\n\nThe data classification requirements should be clearly documented and mapped to any legal or regulatory\n\nrequirements. For example, CCPA is so sweeping that it includes 11 categories of personal information —\n\nand defines “personal information” as “information that identifies, relates to, describes, is capable of\n\nbeing associated with, or could reasonably be linked, directly or indirectly, with a particular consumer or\n\nhousehold.”\n\n\n-----\n\nJust examining one CCPA category, _Customer Records Information_ , we see that the following information is\n\nto be protected: name, signature, social security number, physical characteristics or description, address,\n\ntelephone number, passport number, driver’s license or state identification card number, insurance policy\n\nnumber, education, employment, employment history, bank account number, credit or debit card number,\n\nother financial information, medical information, and health insurance information.\n\nThere are generally three different approaches in industry to performing data classification:\n\n**1. \u0007Content-based:** Scans or inspects and interprets files to find sensitive information. This is generally\n\ndone using regular expressions and lookup tables to map values to actual entities stored inside the\n\norganization (e.g., customer SSN).\n\n**2. \u0007Context-based:** Evaluates the source of the data (e.g., application, location or creator) to determine\n\nthe sensitivity of the data.\n\n**3. \u0007User-based:** Relies on a manual, end-user selection of each data set or element and requires expert\n\ndomain knowledge to ensure accuracy.\n\nTaking all this into account, an organization could implement a streamlined set of roles for RBAC that\n\nuses the convention where “domain” might be the\n\nbusiness unit within an organization, “entity” is the noun that the role is valid for, “data set” or “data asset” is\n\nthe ID, and “classification” is one of the three values (public, internal, restricted).\n\nThere is a “deny all default” policy that does not allow access to any data unless there is a corresponding\n\nrole assignment. 
Wild cards can be used to grant access to eliminate the need to enumerate every\n\ncombination.\n\n\n-----\n\nFor example, gives a user or a system access to all the\n\ndata fields that describe a credit card transaction for a customer, including the 16-digit credit card number.\n\nWhereas would allow the user or system\n\naccess only to nonsensitive data regarding the transaction.\n\nThis gives organizations the chance to rationalize their security groups by using a domain naming\n\nconvention to provide coarse-grained and fine-grained access without the need for creating tons of LDAP\n\ngroups. It also dramatically eases the administration of granting access to data for a given user.\n\n**Everyone working from the same view of data**\n\nThe modern data stack, when combined with a simplified security group approach and a robust data\n\ngovernance methodology, gives organizations an opportunity to rethink how data is accessed — and greatly\n\nimproves time to market for their analytic use cases. All analytic workloads can now operate from a single,\n\nshared view of your data.\n\nCombining this with a sensitive data tokenization strategy can make it straightforward to empower data\n\nscientists to do their job and shift the 80/20 ratio in their favor. It’s now easier to work with full-size data\n\nsets that both obfuscate NPI/PII information and preserve analytic value.\n\nNow, data discovery is easier because data sets have been registered in the catalog with full descriptions\n\nand business metadata — with some organizations going as far as showing realistic sample data for a\n\nparticular data set. If a user does not have access to the underlying data files, having data in one physical", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "cf19174cbf629dea04e1166d2b6966cc", + "**Everyone working from the same view of data**\n\nThe modern data stack, when combined with a simplified security group approach and a robust data\n\ngovernance methodology, gives organizations an opportunity to rethink how data is accessed — and greatly\n\nimproves time to market for their analytic use cases. All analytic workloads can now operate from a single,\n\nshared view of your data.\n\nCombining this with a sensitive data tokenization strategy can make it straightforward to empower data\n\nscientists to do their job and shift the 80/20 ratio in their favor. It’s now easier to work with full-size data\n\nsets that both obfuscate NPI/PII information and preserve analytic value.\n\nNow, data discovery is easier because data sets have been registered in the catalog with full descriptions\n\nand business metadata — with some organizations going as far as showing realistic sample data for a\n\nparticular data set. 
If a user does not have access to the underlying data files, having data in one physical\n\nlocation eases the burden of granting access, and then it’s easier to deploy access-control policies and\n\ncollect/analyze audit logs to monitor data usage and to look for bad actors.\n\n\nAdopting the Databricks Lakehouse Platform allows\n\nyou to add data sets into a well-managed data lake\n\nusing low-cost object stores, and makes it easy to\n\npartition data based on domain, entity, data set and\n\nclassification levels to provide fine-grained (row-\n\nlevel and column-level) security.\n\n\n-----\n\n**Data security, validation and curation — in one place**\n\nThe modern data architecture using Databricks Lakehouse makes it easy to take a consistent approach to\n\nprotecting, validating and improving your organization’s data. Data governance policies can be enforced\n\nusing the built-in features of schema validation, expectations and pipelines — the three main steps to data\n\ncuration. Databricks enables moving data through well-defined states: Raw —> Refined —> Curated or, as we\n\nrefer to it at Databricks, Bronze —> Silver —> Gold.\n\nThe raw data is known as “Bronze-level” data and serves as the landing zone for all your important analytic\n\ndata. Bronze data functions as the starting point for a series of curation steps that filter, clean and augment\n\nthe data for use by downstream systems. The first major refinement results in data being stored in “Silver-\n\nlevel” tables within the data lake. These tables carry all the benefits of the Delta Lake product — for example,\n\nACID transactions and time travel. The final step in the process is to produce business-level aggregates, or\n\n“Gold-level” tables, that combine data sets from across the organization. It’s a set of data used to improve\n\ncustomer service across the full line of products, perform GDPR/CCPA reporting or look for opportunities to\n\ncross-sell to increase customer retention. For the first time, organizations can truly optimize data curation\n\nand ETL — eliminating unnecessary copies of data and the duplication of effort that often happens in ETL\n\njobs with legacy data ecosystems. This “solve once, access many times” approach speeds time to market,\n\nimproves the user experience and helps retain talent.\n\n**Extend the impact of your data with secure data sharing**\n\nData sharing is crucial to drive business value in today’s digital economy. More and more organizations\n\nare now looking to securely share trusted data with their partners/suppliers, internal lines of business or\n\ncustomers to drive collaboration, improve internal efficiency and generate new revenue streams with data\n\nmonetization. Additionally, organizations are interested in leveraging external data to drive new product\n\ninnovations and services.\n\nBusiness executives must establish and promote a data sharing culture in their organizations to build\n\ncompetitive advantage.\n\n\n-----\n\n#### 7. Dramatically increase productivity of your workforce\n\nNow that you have deployed a modern data stack and have landed all your analytical data in a well-\n\nmanaged data lake with a rationalized approach to access control, the next question is, “What tools should I", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "4a588fb0050e7497fa9478969ce37ab7", + "jobs with legacy data ecosystems. 
This “solve once, access many times” approach speeds time to market,\n\nimproves the user experience and helps retain talent.\n\n**Extend the impact of your data with secure data sharing**\n\nData sharing is crucial to drive business value in today’s digital economy. More and more organizations\n\nare now looking to securely share trusted data with their partners/suppliers, internal lines of business or\n\ncustomers to drive collaboration, improve internal efficiency and generate new revenue streams with data\n\nmonetization. Additionally, organizations are interested in leveraging external data to drive new product\n\ninnovations and services.\n\nBusiness executives must establish and promote a data sharing culture in their organizations to build\n\ncompetitive advantage.\n\n\n-----\n\n#### 7. Dramatically increase productivity of your workforce\n\nNow that you have deployed a modern data stack and have landed all your analytical data in a well-\n\nmanaged data lake with a rationalized approach to access control, the next question is, “What tools should I\n\nprovide to the user community so they can be most effective at using the new data ecosystem?”\n\n**Design thinking: working backward from the user experience**\n\nDesign thinking is a human-centered approach to innovation — focused on understanding customer needs,\n\nrapid prototyping and generating creative ideas — that will transform the way you develop products, services,\n\nprocesses and organizations. Design thinking was introduced as a technique to not only improve but also\n\nbring joy to the way people work. The essence of design thinking is to determine what motivates people to\n\ndo their job, where their current pain points are and what could be improved to make their jobs enjoyable.\n\n**Moving beyond best of breed**\n\nIf you look across a large enterprise, you will find no shortage of database design, ETL, data cleansing, model\n\ntraining and model deployment tools. Many organizations take a “best of breed” approach in providing\n\ntooling for their end users. This typically occurs because leaders genuinely want to empower business\n\nunits, departments and teams to select the tool that best suits their specific needs — so-called federated\n\ntool selection. Data science tooling, in particular, tends not to be procured at the “enterprise” level at first —\n\ngiven the high cost of rolling it out to the entire user population.\n\n\n-----\n\nWhen tool selection becomes localized, there are a few things to consider:\n\n\u0007Tools are generally thought of as discrete components within an ecosystem and, therefore,\n\ninterchangeable with criteria that are established within a specific tool category. 
The tool with the best\n\noverall score gets selected.\n\n\u0007The selection criteria for a tool usually contains a subjective list of “must-have” features based on\n\npersonal preference or adoption within a department, or because a given tool is better suited to support\n\na current business process\n\n\u0007Discrete tools tend to leapfrog one another and add features based on market demand rather quickly\n\n\u0007Evaluations that are performed over many months likely become outdated by the time the tool has\n\nmoved into production\n\n\u0007The “enterprise” requirements are often limited to ensuring that the tool fits into the overall architecture\n\nand security environment but nothing more\n\n\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n\nof tools in play or streamlining the user experience\n\n\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n\npartnership model, the ability to influence the roadmap and professional services support\n\nFor these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\n\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\n\n\n-----\n\nDatabricks is a leading data and AI company —\n\n\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\n\ndata processing, validation and curation should work. It’s the integration between the discrete functions\n\nof the platform that saves time, conserves effort and improves the user experience. Many companies try", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "eaff954d65653182857574e043c105f1", + "and security environment but nothing more\n\n\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n\nof tools in play or streamlining the user experience\n\n\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n\npartnership model, the ability to influence the roadmap and professional services support\n\nFor these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\n\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\n\n\n-----\n\nDatabricks is a leading data and AI company —\n\n\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\n\ndata processing, validation and curation should work. It’s the integration between the discrete functions\n\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\n\nto take on the integration of different technology stacks, which increases risk, cost and complexity. 
The\n\nconsequences of not doing the integration properly can be serious — in terms of security, compliance,\n\nefficiency, cost, etc.\n\n\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\n\n\nSo, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\n\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\n\ntake from both parties — sometimes calling for an organization to adjust their processes to better fit how\n\nthe platform works. There are many instances where a given business process could be simplified or recast\n\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\n\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\n\napply to the broadest set of customers.\n\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\n\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\n\nperforming their job. The more discrete tools in an environment, the more challenging this becomes.\n\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\n\nand collaboration helps improve the user experience and decreases time to market.\n\n\n[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\n\nlistening to the needs of thousands of customers\n\nand having our engineers work side by side with\n\ncustomer teams to deliver real business value using\n\ndata and AI.\n\n\n-----\n\n**Unified platform, unified personas**\n\nDeploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\n\ndata stack — will provide an integrated suite of tools for the full range of personas in your organization,\n\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\n\nincrease productivity and reduce risk because you’ll be better able to share the key aspects of data\n\npipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n\nand deployment. All the work streams function off a single view of the data, and the handoffs between\n\nsubsystems are well managed.\n\nData processing happens in one auditable environment, and the number of copies of data is kept to an\n\nabsolute minimum — with each user benefiting from the data assets created by others. Redundant work\n\nis eliminated.\n\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n\nworking with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "ff33877449afd48d9a7757897e586275", + "including business analysts, SQL developers, data engineers and data scientists. 
You will immediately\n\nincrease productivity and reduce risk because you’ll be better able to share the key aspects of data\n\npipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n\nand deployment. All the work streams function off a single view of the data, and the handoffs between\n\nsubsystems are well managed.\n\nData processing happens in one auditable environment, and the number of copies of data is kept to an\n\nabsolute minimum — with each user benefiting from the data assets created by others. Redundant work\n\nis eliminated.\n\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n\nworking with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\n\ndifferently — for example, changing a string to an integer. This has a cascading effect, and the downstream\n\nconsumers must be able to adjust by monitoring the execution and detecting the changes. The data\n\nscientist, in turn, must update and test new models on the new data. Your data platform should make the\n\ndetection and remediation easier, not harder.\n\nFor the data engineers, their primary focus is extracting data from source systems and moving it into the\n\nnew data ecosystem. The data pipeline function can be simplified with a unified data platform because\n\nthe programming model and APIs are consistent across programming languages (e.g., Scala, Python). This\n\nresults in improved operations and maintenance (O&M). The runtime environment is easier to troubleshoot\n\nand debug since the compute layer is consistent, and the logging and auditing associated with the data\n\nprocessing and data management is centralized and of more value.\n\n\n-----\n\n**Maximize the productivity of your workforce**\n\nOnce you have a data platform that brings together your full range of personas, you should focus on the\n\nnext step for increasing productivity — namely, self-service environments.\n\nIn large organizations, there needs to be a strategy for how solutions are promoted up through the runtime\n\nenvironments for development, testing and production. These environments need to be nearly identical to\n\none another — using the same version of software while limiting the number, size and horsepower of the\n\ncompute nodes. To the extent possible, development and test should be performed with realistic test/\n\nsynthetic data. One strategy to support this is to tap into the flow of production data and siphon off a small\n\npercentage that is then changed in randomized fashion — obfuscating the real data but keeping the same\n\ngeneral shape and range of values.\n\nThe **DEV** environment should be accessible to everyone without any organizational red tape. The DEV\n\nenvironments should be small and controlled with policies that spin them up and tear them down efficiently.\n\nEvery aspect of the DEV infrastructure should be treated as ephemeral. Nothing should exist in the\n\nenvironment that cannot be destroyed and easily rebuilt.\n\nThe **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n\ntools — within obvious cost/budget constraints. 
The use of the TEST environment can be requested by\n\nthe developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n\nmanagement.\n\nMoving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n\ndevelopers cannot randomly promote software to run in production. Again, this process should be\n\nstrictly governed using a workflow/sign-off approval approach — signed off by management as well.\n\nMany organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n\ndeployments.\n\n\n**DEV** **TEST**\n\n**PROD**\n\n\n-----\n\n#### 8. Make informed build vs. buy decisions", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "25f075126fb3e190be163159697de795", + "environment that cannot be destroyed and easily rebuilt.\n\nThe **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n\ntools — within obvious cost/budget constraints. The use of the TEST environment can be requested by\n\nthe developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n\nmanagement.\n\nMoving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n\ndevelopers cannot randomly promote software to run in production. Again, this process should be\n\nstrictly governed using a workflow/sign-off approval approach — signed off by management as well.\n\nMany organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n\ndeployments.\n\n\n**DEV** **TEST**\n\n**PROD**\n\n\n-----\n\n#### 8. Make informed build vs. buy decisions\n\nA key piece of the strategy will involve the decision around which components of the data ecosystem are\n\nbuilt by the in-house engineering team and which components are purchased through a vendor relationship.\n\nThere is increased emphasis within engineering teams on taking a “builder” approach. In other words, the\n\nengineering teams prefer to develop their own solutions in-house rather than rely on vendor products.\n\n**Competitive advantage**\n\nThis “roll your own’’ approach has some advantages — including being able to establish the overall product\n\nvision, prioritize features and directly allocate the resources to build the software. However, it is important to\n\nkeep in mind which aspects of your development effort give you the most competitive advantage.\n\nSpend some time working with the data transformation steering committee and other stakeholders to\n\ndebate the pros and cons of building out various pieces of the data ecosystem. The primary factor should\n\ncome down to whether or not a given solution offers true competitive advantage for the organization. Does\n\nbuilding this piece of software make it harder for your competitors to compete with you? If the answer is no,\n\nthen it is better to focus your engineering and data science resources on deriving insights from your data.\n\n**Beware: becoming your own software vendor**\n\nAs many engineering leaders know, building your own software is an exciting challenge. 
However, it does\n\ncome with added responsibility — namely, managing the overall project timeline and costs, and being\n\nresponsible for the design, implementation, testing, documentation, training, and ongoing maintenance and\n\nupdates. You basically are becoming your own software vendor for every component of the ecosystem\n\nthat you build yourself. When you consider the cost of a standard-sized team, it is not uncommon to spend\n\nseveral million dollars per year building out individual component parts of the new data system. This doesn’t\n\ninclude the cost to operate and maintain the software once it is in production.\n\n\n-----\n\nTo offset the anticipated development costs, engineering teams will oftentimes make the argument that\n\nthey are starting with open source software and extending it to meet the “unique requirements” of your\n\norganization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n\nunique and b) the development offers the competitive advantage that you need.\n\nEven software built on top of open source still requires significant investment in integration and testing.\n\nThe integration work is particularly challenging because of the large number of open source libraries that\n\nare required in the data science space. The question becomes, “Is this really the area that you want your\n\nengineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n\n**How long will it take? Can the organization afford to wait?**\n\nEven if you decide the software component provides a competitive advantage and is something worth\n\nbuilding in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "b5f4bd0258226132f89697f6e660b09b", + "organization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n\nunique and b) the development offers the competitive advantage that you need.\n\nEven software built on top of open source still requires significant investment in integration and testing.\n\nThe integration work is particularly challenging because of the large number of open source libraries that\n\nare required in the data science space. The question becomes, “Is this really the area that you want your\n\nengineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n\n**How long will it take? Can the organization afford to wait?**\n\nEven if you decide the software component provides a competitive advantage and is something worth\n\nbuilding in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n\nbusiness due to the anticipated delivery schedule. 
Keep in mind that software development projects usually\n\ntake longer and cost more money than initially planned.\n\nThe organization should understand the impact to the overall performance and capabilities of the daily\n\necosystem for any features tied to the in-house development effort. Your business partners likely do\n\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\n\nfeatures and schedule.\n\n\nDatabricks is built on top of popular open source\n\nsoftware that it created. Engineering teams can\n\nimprove the underpinnings of the Databricks\n\nplatform by submitting code via pull request and\n\nbecoming committers to the projects. The benefit\n\nto organizations is that their engineers contribute\n\nto the feature set of the data platform while\n\nDatabricks remains responsible for all integration\n\nand performance testing plus all the runtime\n\nsupport, including failover and disaster recovery.\n\n\n-----\n\n**Don’t forget about the data**\n\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\n\n“data assets” consumable to the end users or systems. Data insights, model training and model execution\n\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n\nsets from multiple lines of business or departments. Focusing your data engineering and data science\n\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n\ncreating true competitive advantage.\n\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n\nserve up data for analysis should not be underestimated. The value of this work is equally important to\n\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\n\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n\nengineers innovate on components that don’t bring true competitive advantage.\n\n\n-----\n\n#### 9. Allocate, monitor and optimize costs\n\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n\nand increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\n\ncould be easily shared and reused by other members of the team. 
The more the team used the unified\n\nplatform, the more they collaborated and their level of expertise increased.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "c7999afe6a711c926c52c21162072b02", + "engineers innovate on components that don’t bring true competitive advantage.\n\n\n-----\n\n#### 9. Allocate, monitor and optimize costs\n\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n\nand increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\n\ncould be easily shared and reused by other members of the team. The more the team used the unified\n\nplatform, the more they collaborated and their level of expertise increased.\n\n**Reduce complexity, reduce costs**\n\nThe architectures of enterprise data warehouses (EDWs) and data lakes were either more limited or\n\nmore complex — resulting in increased time to market and increased costs. This was mainly due to the\n\nrequirement to perform ETL to explore data in the EDW or the need to split data using multiple pipelines\n\nfor the data lake. The data lakehouse architecture simplifies the cost allocation because all the processing,\n\nserving and analytics are performed in a single compute layer.\n\nOrganizations can rightsize the data environments and control costs using policies. The centralized\n\nand consistent approach to security, auditing and monitoring makes it easier to spot inefficiencies and\n\nbottlenecks in the data ecosystem. Performance improvements can be gained quickly as more platform\n\nexpertise is developed within the workforce.\n\n\nThe Databricks platform optimizes costs for your\n\ndata and AI workloads by intelligently provisioning\n\ninfrastructure only as you need it. Customers can\n\nestablish policies that govern the size of clusters\n\nbased on DEV, TEST, PROD environments or\n\nanticipated workloads.\n\n\n-----\n\nDatabricks monitors and records usage and allows\n\norganizations to easily track costs on a data and\n\n\n**Centralized funding model**\n\nAs previously mentioned, data transformation initiatives require substantial funding. Centralizing the budget\n\nunder the CDO provides consistency and visibility into how funds are allocated and spent — increasing\n\nthe likelihood of a positive ROI. Funding at the beginning of the initiative will be significantly higher than\n\nthe funding in the out-years. It’s not uncommon to see 3- to 5-year project plans for larger organizations.\n\nFunding for years 1 and 2 is often reduced in years 3 and 4 and further reduced in year 5 — until it reaches a\n\nsteady state that is more sustainable.\n\n\nAI workload basis. 
This provides the ability to\n\n\nThe budget takes into account the cost of the data engineering function, commercial software licenses and\n\nbuilding out the center of excellence to accelerate the data science capabilities of the organization. Again,\n\nthe CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are\n\nfocused on the overall implementation plan and to make sound build vs. buy decisions.\n\nIt’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources\n\nin the CIO’s organization to perform the data engineering tasks. The data science community reports into\n\nthe CDO and is matrixed into the lines of business in order to better understand the business drivers and\n\nthe data sets. Finally, investing in data governance cannot wait until the company has suffered from a major\n\nregulatory challenge, a data breach or some other serious defense-related problem. CDOs should spend\n\nthe necessary time to educate leaders throughout the organization on the value of data governance.\n\n\nimplement an enterprise-wide chargeback mode\n\nand put in place appropriate spending limits.\n\n\n-----\n\n**Chargeback models**\n\nTo establish the centralized budget to fund the data transformation initiative, some organizations impose", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "a44c44fd48c5153138e2f0eaeee9e374", + "the CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are\n\nfocused on the overall implementation plan and to make sound build vs. buy decisions.\n\nIt’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources\n\nin the CIO’s organization to perform the data engineering tasks. The data science community reports into\n\nthe CDO and is matrixed into the lines of business in order to better understand the business drivers and\n\nthe data sets. Finally, investing in data governance cannot wait until the company has suffered from a major\n\nregulatory challenge, a data breach or some other serious defense-related problem. CDOs should spend\n\nthe necessary time to educate leaders throughout the organization on the value of data governance.\n\n\nimplement an enterprise-wide chargeback mode\n\nand put in place appropriate spending limits.\n\n\n-----\n\n**Chargeback models**\n\nTo establish the centralized budget to fund the data transformation initiative, some organizations impose\n\na “tax” on each part of the organization — based on size as well as profit and loss. This base-level funding\n\nshould be used to build the data engineering and data science teams needed to deploy the building blocks\n\nof the new data ecosystem. However, as different teams, departments and business units begin using the\n\nnew data ecosystem, the infrastructure costs, both compute and storage, will begin to grow. The costs will\n\nnot be evenly distributed, due to different levels of usage from the various parts of the organization. The\n\ngroups with the heavier usage should obviously cover their pro rata share of the costs. This requires the\n\nability to monitor and track usage — not only based on compute but also on the amount of data generated\n\nand consumed. 
This so-called chargeback model is an effective and fair way to cover the cost deltas over\n\nand above the base-level funding.\n\nPlus, not all the departments or lines of business will require the same level of compute power or fault\n\ntolerance. The architecture should support the ability to separate out the runtime portions of the data\n\necosystem and isolate the workloads based on the specific SLAs for the use cases in each environment.\n\nSome workloads cannot fail and their SLAs will require full redundancy, thus increasing the number of\n\nnodes in the cluster or even requiring multiple clusters operating in different cloud regions. In contrast, less\n\ncritical workloads that can fail and be restarted can run on less costly infrastructure. This makes it easier to\n\nbetter manage the ecosystem by avoiding a one-size-fits-all approach and allocating costs to where the\n\nperformance is needed most.\n\n\n-----\n\n#### 10. Move to production and scale adoption\n\nNow that you’ve completed the hard work outlined in the first nine steps, it is time to put the new data\n\necosystem to use. In order to get truly game-changing results, organizations must be really disciplined at\n\nmanaging and using data to enable use cases that drive business value. They must also establish a clear\n\nset of metrics to measure adoption and track the net promoter score (NPS) so that the user experience\n\ncontinues to improve over time.\n\n**If you build it, they will come**\n\nKeep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data\n\nset registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless.\n\nA high level of automation for the registration process is important because it’s not uncommon to see\n\nthousands of data sets in large organizations. The business and technical metadata plus the data quality\n\nrules will help guarantee that the data lake is filled with consumable data. The lineage solution should\n\nprovide a visualization that shows the data movement and verifies that the approved data flow paths are\n\nbeing followed.\n\nSome key metrics to keep an eye on are:\n\n\u0007Percentage of source systems contributing data to the ecosystem\n\n\u0007Percentage of real-time streaming relative to API and batch transfers\n\n\u0007Percentage of registered data sets with full business and technical metadata\n\n\u0007Volume of data written to the data lake", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "44e15156f76ae559ee78a6146297901b", + "continues to improve over time.\n\n**If you build it, they will come**\n\nKeep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data\n\nset registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless.\n\nA high level of automation for the registration process is important because it’s not uncommon to see\n\nthousands of data sets in large organizations. The business and technical metadata plus the data quality\n\nrules will help guarantee that the data lake is filled with consumable data. 
The lineage solution should\n\nprovide a visualization that shows the data movement and verifies that the approved data flow paths are\n\nbeing followed.\n\nSome key metrics to keep an eye on are:\n\n\u0007Percentage of source systems contributing data to the ecosystem\n\n\u0007Percentage of real-time streaming relative to API and batch transfers\n\n\u0007Percentage of registered data sets with full business and technical metadata\n\n\u0007Volume of data written to the data lake\n\n\u0007Percentage of raw data that enters a data curation pipeline\n\n\u0007Volume of data consumed from the data lake\n\n\u0007Number of tables defined and populated with curated data\n\n\u0007Number of models trained with data from the data lake\n\n\u0007Lineage reports and anomaly detection incidents\n\n\u0007Number of users running Python, SQL, Scala and R workloads\n\n\nIn 2018, Databricks released MLflow — an open\n\nsource platform to manage the ML lifecycle,\n\nincluding experimentation, reproducibility,\n\ndeployment and a central model registry. MLflow\n\nis included in the Databricks Lakehouse Platform\n\nand accelerates the adoption of machine learning\n\nand AI in organizations.\n\n\n-----\n\n**Communication plan**\n\nCommunication is critical throughout the data transformation initiative — however, it is particularly\n\nimportant once you move into production. Time is precious and you want to avoid rework, if at all possible.\n\nOrganizations often overlook the emotional and cultural toll that a long transformation process takes on\n\nthe workforce. The seam between the legacy environment and the new data ecosystem is an expensive\n\nand exhausting place to be — because your business partners are busy supporting two data worlds. Most\n\nusers just want to know when the new environment will be ready. They don’t want to work with partially\n\ncompleted features, especially while performing double duty.\n\nEstablish a solid communication plan and set expectations for when features will come online. Make sure\n\nthere is detailed documentation, training and a support/help desk to field users’ questions.\n\n**DevOps — software development + IT operations**\n\nMature organizations develop a series of processes and standards for how software and data are developed,\n\nmanaged and delivered. The term “DevOps” comes from the software engineering world and refers to\n\ndeveloping and operating large-scale software systems. DevOps defines how an organization, its developers,\n\noperations staff and other stakeholders establish the goal of delivering quality software reliably and\n\nrepeatedly. In short, DevOps is a culture that consists of two practices: continuous integration (CI) and\n\ncontinuous delivery (CD).\n\nThe CI portion of the process is the practice of frequently integrating newly written or changed code\n\nwith the existing code repository. As software is written, it is continuously saved back to the source code\n\nrepository, merged with other changes, built, integrated and tested — and this should occur frequently\n\nenough that the window between commit and build is narrow enough that no errors can occur without\n\ndevelopers noticing them and correcting them immediately.\n\nThis is particularly important for large, distributed teams to ensure that the software is always in a working\n\nstate — despite the frequent changes from various developers. 
Only software that passes the CI steps is\n\ndeployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n\ndependable releases.\n\n\nSoftware development IT operations\n\n\n-----\n\n**DataOps — data processing + IT operations**\n\nDataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n\nuse the well-established processes from DevOps to consistently and reliably improve the quality of data", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "09bdc72674d7ff408d67d2046c9637bd", + "The CI portion of the process is the practice of frequently integrating newly written or changed code\n\nwith the existing code repository. As software is written, it is continuously saved back to the source code\n\nrepository, merged with other changes, built, integrated and tested — and this should occur frequently\n\nenough that the window between commit and build is narrow enough that no errors can occur without\n\ndevelopers noticing them and correcting them immediately.\n\nThis is particularly important for large, distributed teams to ensure that the software is always in a working\n\nstate — despite the frequent changes from various developers. Only software that passes the CI steps is\n\ndeployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n\ndependable releases.\n\n\nSoftware development IT operations\n\n\n-----\n\n**DataOps — data processing + IT operations**\n\nDataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n\nuse the well-established processes from DevOps to consistently and reliably improve the quality of data\n\nused to power data and AI use cases. DataOps automates and streamlines the lifecycle management tasks\n\nneeded for large volumes of data — basically, ensuring that the volume, velocity, variety and veracity of the\n\ndata are taken into account as data flows through the environment. DataOps aims to reduce the end-to-\n\nend cycle time of data analytics — from idea, to exploration, to visualizations and to the creation of new\n\ndata sets, data assets and models that create value.\n\nFor DataOps to be effective, it must encourage collaboration, innovation and reuse among the stakeholders,\n\nand the data tooling should be designed to support the workflow and make all aspects of data curation and\n\nETL more efficient.\n\n**MLOps — machine learning + IT operations**\n\nNot surprisingly, the term “MLOps” takes the DevOps approach and applies it to the machine learning and\n\ndeep learning space — automating or streamlining the core workflow for data scientists. MLOps is a bit\n\nunique when compared with DevOps and DataOps because the approach to deploying effective machine\n\nlearning models is far more iterative and requires much more experimentation — data scientists try different\n\nfeatures, parameters and models in a tight iteration cycle. In all these iterations, they must manage the code\n\nbase, understand the data used to perform the training and create reproducible results. The logging aspect\n\nof the ML development lifecycle is critical.\n\nMLOps aims to manage deployment of machine learning and deep learning models in large-scale\n\nproduction environments while also focusing on business and regulatory requirements. 
The ideal MLOps\n\nenvironment would include data science tools where models are constructed and analytical engines where\n\ncomputations are performed.\n\n\nData processing IT operations\n\n#### \n\nMachine learning IT operations\n\n\n-----\n\nThe overall workflow for deploying production ML models is shown in Figure 10.\n\nUnlike most software applications that execute a series of discrete operations, ML platforms are not\n\ndeterministic and are highly dependent on the statistical profile of the data they use. ML platforms can\n\nsuffer performance degradation of the system due to changing data profiles. Therefore, the model has to\n\nbe refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform\n\nshould natively support this style of iterative data science.\n\n**Ethics in AI**\n\nAs more organizations deploy data and AI solutions, there is growing concern around a number of issues\n\nrelated to ethics — in particular, how do you ensure the data and algorithms used to make decisions are\n\nfair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations\n\nmust ensure that the “black box” algorithms that produce results have the transparency, interpretability and\n\nexplainability to satisfy legal and regulatory safeguards.\n\nThe vast majority of AI work still involves software development by human beings and the use of curated\n\ndata sets. There is the obvious potential for bias and the application of AI in domains that are ethically", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "c3945817f581aa2399eb0650e53d504f", + "deterministic and are highly dependent on the statistical profile of the data they use. ML platforms can\n\nsuffer performance degradation of the system due to changing data profiles. Therefore, the model has to\n\nbe refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform\n\nshould natively support this style of iterative data science.\n\n**Ethics in AI**\n\nAs more organizations deploy data and AI solutions, there is growing concern around a number of issues\n\nrelated to ethics — in particular, how do you ensure the data and algorithms used to make decisions are\n\nfair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations\n\nmust ensure that the “black box” algorithms that produce results have the transparency, interpretability and\n\nexplainability to satisfy legal and regulatory safeguards.\n\nThe vast majority of AI work still involves software development by human beings and the use of curated\n\ndata sets. There is the obvious potential for bias and the application of AI in domains that are ethically\n\nquestionable. 
CDOs are faced with the added challenge of needing to be able to defend the use of AI,\n\nexplain how it works and describe the impact of its existence on the target audience — whether internal\n\nworkers or customers.\n\n\n_Figure 10 workflow steps: Data extraction, Data preparation, Data analysis, Model training, Model evaluation, Model serving and execution, Model monitoring_\n\n**Figure 10:**\nWorkflow for deploying production ML models\n\n\n-----\n\n**Data and AI Maturity Model**\n\nWhen data and AI become part of the fabric of the company and the stakeholders in the organization adopt\n\na data asset and AI mindset, the company moves further along a well-defined maturity curve, as shown in\n\nFigure 11.\n\n**Top-Line Categories and Ranking Criteria**\n\n**L O W M AT U R I T Y / V A L U E** **H I G H M AT U R I T Y / V A L U E**\n\n1. Explore: Organization is beginning to explore big data and AI, and understand the possibilities and potential of a few starter projects and experiment\n\n2. Experiment: Organization builds the basic capabilities and foundations to begin exploring a more expansive data and AI strategy, but it lacks vision, long-term objectives or leadership buy-in\n\n3. Formalize: Data and AI are budding into drivers of value for BUs aligned to specific projects and initiatives as the core tenets of data and AI are integrated into corporate strategy\n\n4. Optimize: Data and AI are core drivers of value across the organization, structured and central to corporate strategy, with a scalable architecture that meets business needs and buy-in from across the organization\n\n5. Transform: Data and AI are at the heart of the corporate strategy and are invaluable differentiators and drivers of competitive advantage\n\n**Figure 11:**\nThe Data and AI Maturity Model\n\n\nDatabricks partners with its customers to enable them to do an internal self-assessment. The output of the\n\nself-assessment allows organizations to:\n\n• Understand the current state of their journey to data and AI maturity\n\n• Identify key gaps in realizing (more) value from data and AI\n\n• Plot a path to increase maturity with specific actions\n\n• Identify Databricks resources who can help support their journey\n\n\n-----\n\n**CHAPTER 3:**\n## Conclusion\n\n\nAfter a decade in which most enterprises took a hybrid approach to their data architecture — and struggled\n\nwith the complexity, cost and compromise that come with supporting both data warehouses and data lakes\n\n— the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical\n\nto future-proofing your investment and enabling data and AI at scale. 
The simple, open and multicloud\n\narchitecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to\n\nunleash the power of your data teams to collaborate like never before — in real time, with all their data, for\n\nevery use case.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "6e76a7c72ed1164f542aaa2f592c4c1a", + "self-assessment allows organizations to:\n\n\u0007Understand the current state of their journey to data and AI maturity\n\n\u0007Identify key gaps in realizing (more) value from data and AI\n\n\u0007Plot a path to increase maturity with specific actions\n\n\u0007Identify Databricks resources who can help support their journey\n\n\n-----\n\n**CHAPTER 3:**\n## Conclusion\n\n\nAfter a decade in which most enterprises took a hybrid approach to their data architecture — and struggled\n\nwith the complexity, cost and compromise that come with supporting both data warehouses and data lakes\n\n— the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical\n\nto future-proofing your investment and enabling data and AI at scale. The simple, open and multicloud\n\narchitecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to\n\nunleash the power of your data teams to collaborate like never before — in real time, with all their data, for\n\nevery use case.\n\nFor more information, please visit [Databricks](https://databricks.com/solutions/roles/data-leaders) or [contact us](https://databricks.com/company/contact) .\n\n**A B O U T T H E A U T H O R**\n\nChris D’Agostino is the Global Field CTO at Databricks, having joined the company in January 2020. His role\n\nis to provide thought leadership and serve as a trusted advisor to our top customers, globally.\n\nPrior to Databricks, Chris ran a 1,000-person data engineering function for a top 10 U.S. bank. In that role,\n\nhe led a team that was responsible for building out a modern data architecture that emphasized the key\n\nattributes of the lakehouse architecture.\n\nChris has also held leadership roles at a number of technology companies.\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. 
To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "031815c68bdd885e1f9e3299f1014c9f", + "**eBook**\n\n## The Data Team’s Guide to the Databricks Lakehouse Platform\n\n\n-----\n\n#### Contents\n\n\n**C H A P TE R 1**\n\n**C H A P TE R 2**\n\n**C H A P TE R 3**\n\n**C H A P TE R 4**\n\n**C H A P TE R 5**\n\n**C H A P TE R 6**\n\n**C H A P TE R 7**\n\n**C H A P TE R 8**\n\n**C H A P TE R 9**\n\n**C H A P TE R 10**\n\n**C H A P TE R 11**\n\n**C H A P TE R 12**\n\n\n**The data lakehouse** ...................................................................................................................................................................................... **4**\n\n**The Databricks Lakehouse Platform** .......................................................................................................................... **11**\n\n**Data reliability and performance** ................................................................................................................................... **18**\n\n**Unified governance and sharing for data, analytics and AI** ....................................... **28**\n\n**Security** .............................................................................................................................................................................................................................. **41**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "37c8d75a944e9c050eddc4388d8456e2", + "**Security** .............................................................................................................................................................................................................................. **41**\n\n**Instant compute and serverless** ................................................................................................................................... **48**\n\n**Data warehousing** ......................................................................................................................................................................................... **52**\n\n**Data engineering** ............................................................................................................................................................................................. **56**\n\n**Data streaming** .................................................................................................................................................................................................. **68.**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "b7d5cd84a7d2802d8b2797d15d6c10b4", + "**Data streaming** .................................................................................................................................................................................................. 
**68.**\n\n**Data science and machine learning** ........................................................................................................................ **7** **3.**\n\n**Databricks Technology Partners and the modern data stack** ............................ **7** **9.**\n\n**Get started with the Databricks Lakehouse Platform** ....................................................... **8** **1**\n\n\n-----\n\n**I N T R O D U C T I O N**\n\n#### The Data Team’s Guide to the Databricks Lakehouse Platform\n\n_The Data Team’s Guide to the Databricks Lakehouse Platform_ is\ndesigned for data practitioners and leaders who are embarking\non their journey into the data lakehouse architecture.\n\nIn this eBook, you will learn the full capabilities of the data lakehouse architecture\nand how the Databricks Lakehouse Platform helps organizations of all sizes — from\nenterprises to startups in every industry — with all their data, analytics, AI and\nmachine learning use cases on one platform.\n\nYou will see how the platform combines the best elements of data warehouses\nand data lakes to increase the reliability, performance and scalability of your\ndata platform. Discover how the lakehouse simplifies complex workloads in data\nengineering, data warehousing, data streaming, data science and machine learning\n— and bolsters collaboration for your data teams, allowing them to maintain new\nlevels of governance, flexibility and agility in an open and multicloud environment.\n\n\n-----\n\n**CHAPTER**\n\n### The data lakehouse\n# 01\n\n\n-----\n\n#### The evolution of data architectures\n\n\nData has moved front and center within every organization as data-driven insights\nhave fueled innovation, competitive advantage and better customer experiences.\n\nHowever, as companies place mandates on becoming more data-driven,\ntheir data teams are left in a sprint to deliver the right data for business\ninsights and innovation. With the widespread adoption of cloud, data teams\noften invest in large-scale complex data systems that have capabilities for\nstreaming, business intelligence, analytics and machine learning to support\nthe overall business objectives.\n\nTo support these objectives, data teams have deployed cloud data\n\nwarehouses and data lakes.\n\n\nTraditional data systems: The data warehouse and data lake\n\nWith the advent of big data, companies began collecting large amounts of\ndata from many different sources, such as weblogs, sensor data and images.\nData warehouses — which have a long history as the foundation for decision\nsupport and business intelligence applications — cannot handle large volumes\nof data.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "c8188921b979381d315e5ec5ae191e05", + "-----\n\n**CHAPTER**\n\n### The data lakehouse\n# 01\n\n\n-----\n\n#### The evolution of data architectures\n\n\nData has moved front and center within every organization as data-driven insights\nhave fueled innovation, competitive advantage and better customer experiences.\n\nHowever, as companies place mandates on becoming more data-driven,\ntheir data teams are left in a sprint to deliver the right data for business\ninsights and innovation. 
With the widespread adoption of cloud, data teams\noften invest in large-scale complex data systems that have capabilities for\nstreaming, business intelligence, analytics and machine learning to support\nthe overall business objectives.\n\nTo support these objectives, data teams have deployed cloud data\n\nwarehouses and data lakes.\n\n\nTraditional data systems: The data warehouse and data lake\n\nWith the advent of big data, companies began collecting large amounts of\ndata from many different sources, such as weblogs, sensor data and images.\nData warehouses — which have a long history as the foundation for decision\nsupport and business intelligence applications — cannot handle large volumes\nof data.\n\nWhile data warehouses are great for structured data and historical analysis,\nthey weren’t designed for unstructured data, semi-structured data, and data\nwith high variety, velocity and volume, making them unsuitable for many types\nof data.\n\nThis led to the introduction of data lakes, providing a single repository of raw\ndata in a variety of formats. While suitable for storing big data, data lakes do\nnot support transactions, nor do they enforce data quality, and their lack of\nconsistency/isolation makes it almost impossible to read, write or process data.\n\nFor these reasons, many of the promises of data lakes never materialized and,\nin many cases, reduced the benefits of data warehouses.\n\nAs companies discovered new use cases for data exploration, predictive modeling\nand prescriptive analytics, the need for a single, flexible, high-performance system\nonly grew. Data teams require systems for diverse data applications including SQL\nanalytics, real-time analytics, data science and machine learning.\n\n\n-----\n\nTo solve for new use cases and new users, a common approach is to use multiple\nsystems — a data lake, several data warehouses and other specialized systems\nsuch as streaming, time-series, graph and image databases. But having multiple\nsystems introduces complexity and delay, as data teams invariably need to\nmove or copy data between different systems, effectively losing oversight and\ngovernance over data usage.\n\n\nYou have now duplicated data in two different systems and the changes you\nmake in one system are unlikely to find their way to the other. So, you are going\nto have data drift almost immediately, not to mention paying to store the same\ndata multiple times.\n\nThen, because governance is happening at two distinct levels across these\nplatforms, you are not able to control things consistently.\n\n\n**Challenges with data, analytics and AI**\n\nIn a recent [Accenture](https://www.accenture.com/_acnmedia/pdf-108/accenture-closing-data-value-gap-fixed.pdf) study, only 32% of companies reported tangible and\nmeasurable value from data. The challenge is that most companies continue to\nimplement two different platforms: data warehouses for BI and data lakes for AI.\nThese platforms are incompatible with each other, but data from both systems\nis generally needed to deliver game-changing outcomes, which makes success\nwith AI extremely difficult.\n\nToday, most of the data is landing in the data lake, and a lot of it is unstructured.\nIn fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321) , about 80% of the data in any organization will be\nunstructured by 2025. 
But, this data is where much of the value from AI resides.\nSubsets of the data are then copied to the data warehouse into structured\ntables, and back again in some cases.\n\nYou also must secure and govern the data in both warehouses and offer\nfine-grained governance, while lakes tend to be coarser grained at the file level.\nThen, you stand up different stacks of tools on these platforms to do either\nBI or AI.\n\n\n-----\n\nFinally, the tool stacks on top of these platforms\nare fundamentally different, which makes it difficult\nto get any kind of collaboration going between the\nteams that support them.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "71d3fdc659abf200ba4c6b379ef23c9e", + "Today, most of the data is landing in the data lake, and a lot of it is unstructured.\nIn fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321) , about 80% of the data in any organization will be\nunstructured by 2025. But, this data is where much of the value from AI resides.\nSubsets of the data are then copied to the data warehouse into structured\ntables, and back again in some cases.\n\nYou also must secure and govern the data in both warehouses and offer\nfine-grained governance, while lakes tend to be coarser grained at the file level.\nThen, you stand up different stacks of tools on these platforms to do either\nBI or AI.\n\n\n-----\n\nFinally, the tool stacks on top of these platforms\nare fundamentally different, which makes it difficult\nto get any kind of collaboration going between the\nteams that support them.\n\nThis is why AI efforts fail. There is a tremendous\namount of complexity and rework being introduced\ninto the system. Time and resources are being\nwasted trying to get the right data to the right\npeople, and everything is happening too slowly\nto get in front of the competition.\n\n\n**Realizing this requires two disparate,**\n**incompatible data platforms**\n\n\n**Business** **SQL** **Incomplete** **Data science** **Data**\n\n**support for**\n\n**intelligence** **analytics** **and ML** **streaming**\n\n\n**SQL**\n**analytics**\n\n\n**Incomplete**\n**support for**\n**use cases**\n\n\n**Incompatible**\n**security and**\n**governance models**\n\n**Copy subsets of data**\n\n\n\n|Col1|Col2|Col3|Col4|\n|---|---|---|---|\n|Governa T|n a|c b|e and security le ACLs|\n|||||\n\n|Col1|Col2|Col3|Col4|\n|---|---|---|---|\n|Governa File|n s|c a|e and security nd blobs|\n|||||\n\n\n**Disjointed**\n**and duplicative**\n\n**Data warehouse** **data silos** **Data lake**\nStructured tables Unstructured files:\nlogs, text, images, video\n\n\n-----\n\n**Moving forward with a lakehouse architecture**\n\nTo satisfy the need to support AI and BI directly on vast amounts of data stored\nin data lakes (on low-cost cloud storage), a new data management architecture\nemerged independently across many organizations and use cases: the\ndata lakehouse.\n\nThe data lakehouse can store _all_ and _any_ type of data once in a data lake and\nmake that data accessible directly for AI and BI. The lakehouse paradigm has\nspecific capabilities to efficiently allow both AI and BI on all the enterprise’s data\nat a massive scale. Namely, it has the SQL and performance capabilities such as\nindexing, caching and MPP processing to make BI work fast on data lakes. 
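A minimal sketch of the "store once, use everywhere" idea described above, assuming a Spark session with Delta Lake available (as on Databricks); the table name and sample rows are hypothetical.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Land the data once as an open Delta table (placeholder name and rows).
events = spark.createDataFrame(
    [(1, "view"), (2, "purchase")], ["user_id", "action"]
)
events.write.format("delta").mode("overwrite").saveAsTable("events_demo")

# BI-style SQL and Python/ML code read the same physical copy of the data.
spark.sql("SELECT action, COUNT(*) AS n FROM events_demo GROUP BY action").show()
pandas_df = spark.table("events_demo").toPandas()
```

The SQL aggregate and the pandas conversion are backed by the same table, which is the point the text makes about not needing a second, warehouse-only copy for BI.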
It also\nhas direct file access and direct native support for Python, data science and AI\nframeworks without the need for a separate data warehouse.\n\nIn short, a lakehouse is a data architecture that combines the best elements\nof data warehouses and data lakes. Lakehouses are enabled by a new system\ndesign, which implements similar data structures and data management features\nfound in a data warehouse directly on the low-cost storage used for data lakes.\n\n\n-----\n\n##### Data lakehouse\n\nOne platform to unify all your data, analytics and AI workloads\n\n###### Lakehouse Platform\n\nAll machine learning, SQL,\nBI, and streaming use cases\n\nOne security and governance\napproach for all data assets\non all clouds\n\n\n-----\n\n**Key features for a lakehouse**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9cabb87127bfa514fa6f498e9f2831e7", + "In short, a lakehouse is a data architecture that combines the best elements\nof data warehouses and data lakes. Lakehouses are enabled by a new system\ndesign, which implements similar data structures and data management features\nfound in a data warehouse directly on the low-cost storage used for data lakes.\n\n\n-----\n\n##### Data lakehouse\n\nOne platform to unify all your data, analytics and AI workloads\n\n###### Lakehouse Platform\n\nAll machine learning, SQL,\nBI, and streaming use cases\n\nOne security and governance\napproach for all data assets\non all clouds\n\n\n-----\n\n**Key features for a lakehouse**\n\nRecent innovations with the data lakehouse architecture can help simplify\nyour data and AI workloads, ease collaboration for data teams, and maintain\nthe kind of flexibility and openness that allows your organization to stay agile\nas you scale. Here are key features to consider when evaluating data lakehouse\narchitectures:\n\nTransaction support: In an enterprise lakehouse, many data pipelines will\noften be reading and writing data concurrently. Support for ACID (Atomicity,\nConsistency, Isolation and Durability) transactions ensures consistency as\nmultiple parties concurrently read or write data.\n\nSchema enforcement and governance: The lakehouse should have\na way to support schema enforcement and evolution, supporting data\nwarehouse schema paradigms such as star/snowflake. The system should\nbe able to reason about data integrity, and it should have robust governance\nand auditing mechanisms.\n\nData governance: Capabilities including auditing, retention and lineage\nhave become essential, particularly considering recent privacy regulations.\n\nTools that allow data discovery have become popular, such as data catalogs\nand data usage metrics.\n\nBI support: Lakehouses allow the use of BI tools directly on the source\ndata. This reduces staleness and latency, improves recency and lowers cost\nby not having to operationalize two copies of the data in both a data lake\nand a warehouse.\n\n\nStorage decoupled from compute: In practice, this means storage and\ncompute use separate clusters, thus these systems can scale to many more\nconcurrent users and larger data sizes. 
Some modern data warehouses also\nhave this property.\n\nOpenness: The storage formats, such as Apache Parquet, are open and\nstandardized, so a variety of tools and engines, including machine learning\nand Python/R libraries, can efficiently access the data directly.\n\nSupport for diverse data types (unstructured and structured):\nThe lakehouse can be used to store, refine, analyze and access data types\nneeded for many new data applications, including images, video, audio,\nsemi-structured data and text.\n\nSupport for diverse workloads: Use the same data repository for a range\nof workloads including data science, machine learning and SQL analytics.\nMultiple tools might be needed to support all these workloads.\n\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\n**Learn more**\n\n**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n\n**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n\n**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n\n\n-----\n\n**CHAPTER**\n\n# 02\n\n\n### The Databricks Lakehouse Platform\n\n\n-----\n\n#### Lakehouse: A new generation of open platforms\n\n\n###### This is the lakehouse paradigm", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9055c5a181008db8c024cb3f2415f1ed", + "**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n\n**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n\n\n-----\n\n**CHAPTER**\n\n# 02\n\n\n### The Databricks Lakehouse Platform\n\n\n-----\n\n#### Lakehouse: A new generation of open platforms\n\n\n###### This is the lakehouse paradigm\n\n\nDatabricks is the inventor and pioneer of the\ndata lakehouse architecture. The data lakehouse\narchitecture was coined in the research paper,\n[Lakehouse: A New Generation of Open Platforms that](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n[Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) ,\nintroduced by Databricks’ founders, UC Berkeley\nand Stanford University at the 11th Conference on\nInnovative Data Systems Research (CIDR) in 2021.\n\nAt Databricks, we are continuously innovating on\nthe lakehouse architecture to help customers deliver\non their data, analytics and AI aspirations. The ideal\ndata, analytics and AI platform needs to operate\ndifferently. 
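To illustrate the openness point above (open, standardized formats that tools outside any single engine can read), here is a sketch using the `deltalake` (delta-rs) Python package, which is not mentioned in the text and is only one example of such a reader; the storage path is a placeholder.

```python
# Read open Delta/Parquet files directly, without a Spark cluster.
# Assumes `pip install deltalake pandas`; the path below is a placeholder.
import pandas as pd
from deltalake import DeltaTable

dt = DeltaTable("/mnt/lake/events_demo")
df: pd.DataFrame = dt.to_pandas()
print(df.head())
```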
Rather than copying and transforming\ndata in multiple systems, you need one platform\nthat accommodates all data types.\n\n\n**Data science** **Data**\n**and ML** **streaming**\n\n\n**All ML, SQL, BI**\n**and streaming use cases**\n\n**One security and governance**\n**approach for all data assets**\n**on all clouds**\n\n**A reliable data platform**\n**to efficiently handle**\n**all data types**\n\n\n**Persona-based**\n**use cases**\n\n**Unity Catalog**\nFine-grained governance\nfor data and AI\n\n**Delta Lake**\nData reliability and performance\n\n\n**Business**\n**intelligence**\n\n\n**SQL**\n**analytics**\n\n\nFiles and blobs and table ACLs\n\n\nIdeally, the platform must be open, so that you\nare not locked into any walled gardens. You would\nalso have one security and governance model.\nIt would not only manage all data types, but it\nwould also be cloud-agnostic to govern data\nwherever it is stored.\n\nLast, it would support all major data, analytics and AI\nworkloads, so that your teams can easily collaborate\nand get access to all the data they need to innovate.\n\n\n-----\n\n#### What is the Databricks Lakehouse Platform?\n\nThe Databricks Lakehouse Platform unifies your\ndata warehousing and AI uses cases on a single\nplatform. It combines the best elements of data\nlakes and data warehouses to deliver the reliability,\nstrong governance and performance of data\nwarehouses with the openness, flexibility and\nmachine learning support of data lakes.\n\nThis unified approach simplifies your modern data\nstack by eliminating the data silos that traditionally\nseparate and complicate data engineering, analytics,\nBI, data science and machine learning. It’s built\non open source and open standards to maximize\nflexibility. And, its common approach to data\nmanagement, security and governance helps you\n\noperate more efficiently and innovate faster.\n\n\n**Lakehouse Platform**\n\nData Data Data Data science\nwarehousing engineering streaming and ML\n\n\n-----\n\n#### Benefits of the Databricks Lakehouse Platform\n\n\n**Simple**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "887cf23e9a8e1aeb0ddd15dc7f5db80d", + "-----\n\n#### What is the Databricks Lakehouse Platform?\n\nThe Databricks Lakehouse Platform unifies your\ndata warehousing and AI uses cases on a single\nplatform. It combines the best elements of data\nlakes and data warehouses to deliver the reliability,\nstrong governance and performance of data\nwarehouses with the openness, flexibility and\nmachine learning support of data lakes.\n\nThis unified approach simplifies your modern data\nstack by eliminating the data silos that traditionally\nseparate and complicate data engineering, analytics,\nBI, data science and machine learning. It’s built\non open source and open standards to maximize\nflexibility. And, its common approach to data\nmanagement, security and governance helps you\n\noperate more efficiently and innovate faster.\n\n\n**Lakehouse Platform**\n\nData Data Data Data science\nwarehousing engineering streaming and ML\n\n\n-----\n\n#### Benefits of the Databricks Lakehouse Platform\n\n\n**Simple**\n\nThe unified approach simplifies your data\narchitecture by eliminating the data silos that\ntraditionally separate analytics, BI, data science\nand machine learning. 
With a lakehouse, you\ncan eliminate the complexity and expense that\nmake it hard to achieve the full potential of\nyour analytics and AI initiatives.\n\n\n**Open**\n\nDelta Lake forms the open foundation of\nthe lakehouse by providing reliability and\nperformance directly on data in the data\nlake. You’re able to avoid proprietary walled\ngardens, easily share data and build your\nmodern data stack with unrestricted access\nto the ecosystem of open source data projects\nand the broad Databricks partner network.\n\n\n**Multicloud**\n\nThe Databricks Lakehouse Platform offers\nyou a consistent management, security and\ngovernance experience across all clouds. You\ndo not need to invest in reinventing processes\nfor every cloud platform that you are using to\nsupport your data and AI efforts. Instead, your\ndata teams can simply focus on putting all\nyour data to work to discover new insights.\n\n\n-----\n\n#### The Databricks Lakehouse Platform architecture\n\n**Data reliability and performance for lakehouse**\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format storage layer built for the lakehouse that integrates\nwith all major analytics tools and works with the widest variety of formats to\nstore and process data.\n\n\n**Instant compute and serverless**\n\nServerless compute is a fully managed service where Databricks provisions and\nmanages the compute layer on behalf of the customer in the Databricks cloud\naccount instead of the customer account. As of the current release, serverless\ncompute is supported for use with Databricks SQL.\n\nIn Chapter 6, we explore the details of instant compute and serverless for lakehouse.\n\n\n[Photon](https://databricks.com/product/photon) is the next-generation query engine built for the lakehouse that leverages\na state-of-the-art vectorized engine for fast querying and provides the best\nperformance for all workloads in the lakehouse.\n\nIn Chapter 3, we explore the details of data reliability and performance\n\nfor the lakehouse.\n\n**Unified governance and security for lakehouse**\n\nThe Databricks Lakehouse Platform provides unified governance with enterprise\nscale, security and compliance. 
The [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (UC) provides\ngovernance for your data and AI assets in the lakehouse — files, tables,\ndashboards, and machine learning models — giving you much better control,\nmanagement and security across clouds.\n\n[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\ndata across the organization in real time, independent of the platform\non which the data resides.\n\nIn Chapter 4, we go into the details of unified governance for lakehouse\n\nand, in Chapter 5, we dive into the details of security for lakehouse.\n\n\n-----\n\n#### The Databricks Lakehouse Platform workloads\n\nThe Databricks Lakehouse Platform architecture supports different workloads\nsuch as data warehousing, data engineering, data streaming, data science and\nmachine learning on one simple, open and multicloud data platform.\n\n**Data warehousing**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "95360c98bed3a80e5d35c9f6e1347456", + "[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\ndata across the organization in real time, independent of the platform\non which the data resides.\n\nIn Chapter 4, we go into the details of unified governance for lakehouse\n\nand, in Chapter 5, we dive into the details of security for lakehouse.\n\n\n-----\n\n#### The Databricks Lakehouse Platform workloads\n\nThe Databricks Lakehouse Platform architecture supports different workloads\nsuch as data warehousing, data engineering, data streaming, data science and\nmachine learning on one simple, open and multicloud data platform.\n\n**Data warehousing**\n\nData warehousing is one of the most business-critical workloads for data teams,\nand the best data warehouse is a lakehouse. The Databricks Lakehouse Platform\nlets you run all your SQL and BI applications at scale with up to 12x better price/\nperformance, a unified governance model, open formats and APIs, and your tools\nof choice — no lock-in. Reduce resource management overhead with serverless\ncompute, and easily ingest, transform and query all your data in-place to deliver\nreal-time business insights faster.\n\nBuilt on open standards and APIs, the Databricks Lakehouse Platform provides\nthe reliability, quality and performance that data lakes natively lack, plus\nintegrations with the ecosystem for maximum flexibility.\n\nIn Chapter 7, we go into the details of data warehousing on the lakehouse.\n\n**Data engineering**\n\nData engineering on the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows on\nany cloud platform, and meet regulatory requirements to maintain governance.\n\n\nautomates the complexity of building and maintaining pipelines and running ETL\nworkloads so data engineers and analysts can focus on quality and reliability to\ndrive valuable insights.\n\nIn Chapter 8, we go into the details of data engineering on the lakehouse.\n\n**Data streaming**\n\n[Data streaming](https://www.databricks.com/product/data-streaming) is one of the fastest growing workloads within the Databricks\nLakehouse Platform and is the future of all data processing. 
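A minimal consumer-side sketch of the Delta Sharing protocol described above, assuming the `delta-sharing` Python connector is installed; the profile file and share coordinates are placeholders that a data provider would supply.

```python
import delta_sharing

# Credential file issued by the provider and the shared table's coordinates (placeholders).
profile = "/path/to/provider.share"
table_url = f"{profile}#<share>.<schema>.<table>"

# Load the shared table without copying it into your own platform first.
df = delta_sharing.load_as_pandas(table_url)
print(df.head())
```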
Real-time processing\nprovides the freshest possible data to an organization’s analytics and machine\nlearning models enabling them to make better, faster decisions, more accurate\npredictions, offer improved customer experiences and more.\n\nThe Databricks Lakehouse Platform Dramatically simplifies data streaming to\ndeliver real-time analytics, machine learning and applications on one platform.\n\nIn Chapter 9, we go into the details of data streaming on the lakehouse.\n\n**Data science and machine learning**\n\nData science and machine learning (DSML) on the lakehouse is a powerful\nworkload that is unique to many other data offerings. DSML on the lakehouse\nprovides a data-native and collaborative solution for the full ML lifecycle. It\ncan maximize data and ML team productivity, streamline collaboration, empower\nML teams to prepare, process and manage data in a self-service manner,\nand standardize the ML lifecycle from experimentation to production.\n\nIn Chapter 10, we go into the details of DSML on the lakehouse.\n\n\nThe lakehouse provides an end-to-end data engineering and ETL platform that\n\n\n-----\n\n**Databricks Lakehouse Platform and your**\n**modern data stack**\n\nThe Databricks Lakehouse Platform is open and provides the flexibility to\ncontinue using existing infrastructure, to easily share data and build your modern\ndata stack with unrestricted access to the ecosystem of open source data\nprojects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\n\nIn Chapter 11, we go into the details of our technology partners and the\n\nmodern data stack.\n\n#### Global adoption of the Databricks Lakehouse Platform", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "257465947dbab17362be1c00ec93dd4c", + "In Chapter 10, we go into the details of DSML on the lakehouse.\n\n\nThe lakehouse provides an end-to-end data engineering and ETL platform that\n\n\n-----\n\n**Databricks Lakehouse Platform and your**\n**modern data stack**\n\nThe Databricks Lakehouse Platform is open and provides the flexibility to\ncontinue using existing infrastructure, to easily share data and build your modern\ndata stack with unrestricted access to the ecosystem of open source data\nprojects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\n\nIn Chapter 11, we go into the details of our technology partners and the\n\nmodern data stack.\n\n#### Global adoption of the Databricks Lakehouse Platform\n\n\nToday, Databricks has over 7,000 [customers](https://databricks.com/customers) , from Fortune 500 to unicorns\nacross industries doing transformational work. Organizations around the globe\nare driving change and delivering a new generation of data, analytics and AI\napplications. 
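To ground the data streaming workload described above, here is a small Structured Streaming sketch over Delta tables, assuming a Spark session with Delta Lake; the table names and checkpoint path are placeholders.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Incrementally aggregate new rows as they land in the source table (placeholder names).
counts = (
    spark.readStream.table("events_demo")
    .groupBy("action")
    .count()
)

query = (
    counts.writeStream.format("delta")
    .outputMode("complete")
    .option("checkpointLocation", "/tmp/checkpoints/action_counts")  # placeholder path
    .toTable("action_counts")  # continuously updated result table
)
# query.awaitTermination() would block here while the stream runs.
```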
We believe that the unfulfilled promise of data and AI can finally\nbe fulfilled with one platform for data analytics, data science and machine\nlearning with the Databricks Lakehouse Platform.\n\n\n**Learn more**\n\n[Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)\n\n[Databricks Lakehouse Platform Demo Hub](https://databricks.com/discover/demos)\n\n[Databricks Lakehouse Platform Customer Stories](https://databricks.com/customers)\n\n[Databricks Lakehouse Platform Documentation](https://databricks.com/documentation)\n\n[Databricks Lakehouse Platform Training and Certification](https://databricks.com/learn/training/home)\n\n[Databricks Lakehouse Platform Resources](https://databricks.com/resources)\n\n\n-----\n\n**CHAPTER**\n\n# 03\n\n\n### Data reliability and performance\n\nTo bring openness, reliability and lifecycle management to data lakes,\nthe Databricks Lakehouse Platform is built on the foundation of Delta\nLake. Delta Lake solves challenges around unstructured/structured data\ningestion, the application of data quality, difficulties with deleting data for\ncompliance or issues with modifying data for data capture.\n\nAlthough data lakes are great solutions for holding large quantities of raw\ndata, they lack important attributes for data reliability and quality and\noften don’t offer good performance when compared to data warehouses.\n\n\n-----\n\n#### Problems with today’s data lakes\n\nWhen it comes to data reliability and quality, examples of these\nmissing attributes include:\n\n**•** **Lack of ACID transactions:** Makes it impossible to mix updates,\nappends and reads\n\n**•** **Lack of schema enforcement:** Creates inconsistent and low-quality data.\nFor example, rejecting writes that don’t match a table’s schema.\n\n**•** **Lack of integration with data catalog:** Results in dark data and no single\nsource of truth\n\nEven just the absence of these three attributes can cause a lot of extra work\nfor data engineers as they strive to ensure consistent high-quality data in the\npipelines they create.\n\n\nThese challenges are solved with two key technologies that are at the foundation\nof the lakehouse: Delta Lake and Photon.\n\n**What is Delta Lake?**\n\nDelta Lake is a file-based, open source storage format that provides ACID\ntransactions and scalable metadata handling, and unifies streaming and batch\ndata processing. It runs on top of existing data lakes and is compatible with\nApache Spark™ and other processing engines.\n\nDelta Lake uses Delta Tables which are based on Apache Parquet, a commonly\nused format for structured data already utilized by many organizations. Therefore,\nswitching existing Parquet tables to Delta Tables is easy and quick. 
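A sketch of the Parquet-to-Delta switch mentioned above, assuming the `delta-spark` package on a Spark session; the path is a placeholder and the conversion happens in place.

```python
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Convert an existing Parquet directory into a Delta table without rewriting the data files.
DeltaTable.convertToDelta(spark, "parquet.`/mnt/lake/raw/clickstream`")
```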
Delta\nTables can also be used with semi-structured and unstructured data, providing\nversioning, reliability, metadata management, and time travel capabilities that\nmake these types of data easily managed as well.\n\n\nAs for performance, data lakes use object storage, so data is mostly kept in\nimmutable files leading to the following problems:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7432d9cdd8951d10673fa8db5f963e39", + "Even just the absence of these three attributes can cause a lot of extra work\nfor data engineers as they strive to ensure consistent high-quality data in the\npipelines they create.\n\n\nThese challenges are solved with two key technologies that are at the foundation\nof the lakehouse: Delta Lake and Photon.\n\n**What is Delta Lake?**\n\nDelta Lake is a file-based, open source storage format that provides ACID\ntransactions and scalable metadata handling, and unifies streaming and batch\ndata processing. It runs on top of existing data lakes and is compatible with\nApache Spark™ and other processing engines.\n\nDelta Lake uses Delta Tables which are based on Apache Parquet, a commonly\nused format for structured data already utilized by many organizations. Therefore,\nswitching existing Parquet tables to Delta Tables is easy and quick. Delta\nTables can also be used with semi-structured and unstructured data, providing\nversioning, reliability, metadata management, and time travel capabilities that\nmake these types of data easily managed as well.\n\n\nAs for performance, data lakes use object storage, so data is mostly kept in\nimmutable files leading to the following problems:\n\n**•** **Ineffective partitioning:** In many cases, data engineers resort to “poor man’s”\nindexing practices in the form of partitioning that leads to hundreds of dev hours\nspent tuning file sizes to improve read/write performance. Often, partitioning\nproves to be ineffective over time if the wrong field was selected for partitioning\nor due to high cardinality columns.\n\n**•** **Too many small files:** With no support for transactions, appending new data\ntakes the form of adding more and more files, leading to “small file problems,”\na known root cause of query performance degradation.\n\n\n-----\n\n**Delta Lake features**\n\n\n**ACID guarantees**\n\nDelta Lake ensures that all data changes\nwritten to storage are committed for durability\nand made visible to readers atomically. In other\nwords, no more partial or corrupted files.\n\n**Scalable data and metadata handling**\n\nSince Delta Lake is built on data lakes, all reads\nand writes using Spark or other distributed\nprocessing engines are inherently scalable to\npetabyte-scale. However, unlike most other\nstorage formats and query engines, Delta Lake\nleverages Spark to scale out all the metadata\nprocessing, thus efficiently handling metadata\nof billions of files for petabyte-scale tables.\n\n\n**Audit history and time travel**\n\nThe Delta Lake transaction log records details\nabout every change made to data, providing a full\naudit trail of the changes. These data snapshots\nallow developers to access and revert to earlier\nversions of data for audits, rollbacks or to\nreproduce experiments.\n\n**Schema enforcement and schema evolution**\n\nDelta Lake automatically prevents the insertion of\ndata with an incorrect schema, i.e., not matching\nthe table schema. 
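To make the audit history and time travel features concrete, a short sketch assuming Delta Lake on Spark; the path and version number are placeholders.

```python
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Time travel: read the table as of an earlier version recorded in the transaction log.
df_v0 = (
    spark.read.format("delta")
    .option("versionAsOf", 0)  # or .option("timestampAsOf", "2024-01-01")
    .load("/mnt/lake/events_demo")
)

# Full audit trail of every change, straight from the transaction log.
DeltaTable.forPath(spark, "/mnt/lake/events_demo").history().show(truncate=False)
```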
And when needed, it allows the\ntable schema to be explicitly and safely evolved to\naccommodate ever-changing data.\n\n\n**Support for deletes, updates and merges**\n\nMost distributed processing frameworks do not\nsupport atomic data modification operations on\ndata lakes. Delta Lake supports merge, update\nand delete operations to enable complex use\ncases including but not limited to change data\ncapture (CDC), slowly changing dimension (SCD)\noperations and streaming upserts.\n\n**Streaming and batch unification**\n\nA Delta Lake table can work both in batch\nand as a streaming source and sink. The\nability to work across a wide variety of latencies,\nranging from streaming data ingestion to batch\nhistoric backfill, to interactive queries all work\nout of the box.\n\n\n-----\n\n**The Delta Lake transaction log**\n\nA key to understanding how Delta Lake provides all these capabilities is the\ntransaction log. The Delta Lake transaction log is the common thread that runs\nthrough many of Delta Lake’s most notable features, including ACID transactions,\nscalable metadata handling, time travel and more. The Delta Lake transaction log\nis an ordered record of every transaction that has ever been performed on\na Delta Lake table since its inception.\n\nDelta Lake is built on top of Spark to allow multiple readers and writers of a\ngiven table to work on a table at the same time. To always show users correct\nviews of the data, the transaction log serves as a single source of truth: the\ncentral repository that tracks all changes that users make to the table.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "15911b1a07f4772456ab5e2e5b11cee7", + "**Streaming and batch unification**\n\nA Delta Lake table can work both in batch\nand as a streaming source and sink. The\nability to work across a wide variety of latencies,\nranging from streaming data ingestion to batch\nhistoric backfill, to interactive queries all work\nout of the box.\n\n\n-----\n\n**The Delta Lake transaction log**\n\nA key to understanding how Delta Lake provides all these capabilities is the\ntransaction log. The Delta Lake transaction log is the common thread that runs\nthrough many of Delta Lake’s most notable features, including ACID transactions,\nscalable metadata handling, time travel and more. The Delta Lake transaction log\nis an ordered record of every transaction that has ever been performed on\na Delta Lake table since its inception.\n\nDelta Lake is built on top of Spark to allow multiple readers and writers of a\ngiven table to work on a table at the same time. To always show users correct\nviews of the data, the transaction log serves as a single source of truth: the\ncentral repository that tracks all changes that users make to the table.\n\nWhen a user reads a Delta Lake table for the first time or runs a new query on\nan open table that has been modified since the last time it was read, Spark\nchecks the transaction log to see what new transactions are posted to the table.\nThen, Spark updates the table with those recent changes. 
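A sketch of the merge/upsert support described above (for example, applying change data capture records), assuming Delta Lake on Spark; the table, columns and sample values are placeholders.

```python
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# A batch of incoming changes (placeholder rows).
updates = spark.createDataFrame(
    [(1, "view"), (3, "purchase")], ["user_id", "action"]
)

# Atomically update matching rows and insert new ones in a single transaction.
target = DeltaTable.forName(spark, "events_demo")
(
    target.alias("t")
    .merge(updates.alias("s"), "t.user_id = s.user_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)
```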
This ensures that a\nuser’s version of a table is always synchronized with the master record as of the\nmost recent query, and that users cannot make divergent, conflicting changes\nto a table.\n\n\n**Flexibility and broad industry support**\n\nDelta Lake is an open source project, with an engaged community of\ncontributors building and growing the Delta Lake ecosystem atop a set of open\nAPIs and is part of the Linux Foundation. With the growing adoption of Delta Lake\nas an open storage standard in different environments and use cases, comes a\nbroad set of integration with industry-leading tools, technologies and formats.\n\nOrganizations leveraging Delta Lake on the Databricks Lakehouse Platform gain\nflexibility in how they ingest, store and query data. They are not limited in storing\ndata in a single cloud provider and can implement a true multicloud approach to\ndata storage.\n\nConnectors to tools, such as Fivetran, allow you to leverage Databricks’\necosystem of partner solutions, so organizations have full control of building the\nright ingestion pipelines for their use cases. Finally, consuming data via queries\nfor exploration or business intelligence (BI) is also flexible and open.\n\n\n-----\n\n**Delta Lake integrates with all major analytics tools**\n\nEliminates unnecessary data movement and duplication\n\n\n-----\n\nIn addition to a wide ecosystem of tools and technologies, Delta Lake supports\na broad set of data formats for structured, semi-structured and unstructured\ndata. These formats include image binary data that can be stored in Delta\nTables, graph data format, geospatial data types and key-value stores.\n\n**Learn more**\n\n[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n\n[Documentation](https://docs.databricks.com/delta/index.html)\n\n[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n\n[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n\n\n**What is Photon?**\n\nAs many organizations standardize on the lakehouse paradigm, this new\narchitecture poses challenges with the underlying query execution engine\nfor accessing and processing structured and unstructured data. The execution\nengine needs to provide the performance of a data warehouse and the scalability\nof data lakes.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "fd80575f4533ded58655e4616d3441e4", + "**Learn more**\n\n[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n\n[Documentation](https://docs.databricks.com/delta/index.html)\n\n[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n\n[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n\n\n**What is Photon?**\n\nAs many organizations standardize on the lakehouse paradigm, this new\narchitecture poses challenges with the underlying query execution engine\nfor accessing and processing structured and unstructured data. 
The execution\nengine needs to provide the performance of a data warehouse and the scalability\nof data lakes.\n\nPhoton is the next-generation query engine on the Databricks Lakehouse\nPlatform that provides dramatic infrastructure cost savings and speedups for\nall use cases — from data ingestion, ETL, streaming, data science and interactive\nqueries — directly on your data lake. Photon is compatible with Spark APIs and\nimplements a more general execution framework that allows efficient processing\nof data with support of the Spark API. This means getting started is as easy as\nturning it on — no code change and no lock-in. With Photon, typical customers are\nseeing up to 80% TCO savings over traditional Databricks Runtime (Spark) and up\nto 85% reduction in VM compute hours.\n\nSpark instructions Photon instructions\n\n\nPhoton engine\n\n\nDelta/Parquet\n\nPhoton writer\nto Delta/Parquet\n\n\n-----\n\nWhy process queries with Photon?\n\n\nQuery performance on Databricks has steadily increased over the years,\npowered by Spark and thousands of optimizations packaged as part of the\nDatabricks Runtime (DBR). Photon provides an additional 2x speedup per the\nTPC-DS 1TB benchmark compared to the latest DBR versions.\n\n**Relative speedup to DBR 2.1 by DBR version**\nHigher is better\n\n\n**Customers have observed significant speedups using**\n**Photon on workloads such as:**\n\n**•** **SQL-based jobs:** Accelerate large-scale production jobs on\nSQL and Spark DataFrames\n\n**•** **IoT use cases:** Faster time-series analysis using Photon\ncompared to Spark and traditional Databricks Runtime\n\n**•** **Data privacy and compliance:** Query petabytes-scale data\nsets to identify and delete records without duplicating data\nwith Delta Lake, production jobs and Photon\n\n**•** **Loading data into Delta and Parquet:** Vectorized I/O\nspeeds up data loads for Delta and Parquet tables, lowering\noverall runtime and costs of data engineering jobs\n\n\nRelease date - DBR version (TPC-DS 1TB 10 x i3xl)\n\n\n-----\n\n**100TB TPC-DS price/performance**\nLower is better\n\n\nBest price/performance for analytics\nin the cloud\n\nWritten from the ground up in C++, Photon takes\nadvantage of modern hardware for faster queries,\nproviding up to 12x better price/performance\ncompared to other cloud data warehouses —\nall natively on your data lake.\n\n\nDatabricks SQL Databricks SQL Cloud data Cloud data Cloud data\nspot on-demand warehouse 1 warehouse 2 warehouse 3\n\n**System**\n\n\n-----\n\nWorks with your existing code\nand avoids vendor lock-in\n\nPhoton is designed to be compatible with the\nApache Spark DataFrame and SQL APIs to ensure\nworkloads run seamlessly without code changes.\nAll you do is turn it on. 
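Since Photon is enabled at the cluster level rather than in code, the sketch below shows what "turning it on" might look like in a cluster-creation request body; the field names follow my understanding of the public Clusters API (`POST /api/2.1/clusters/create`) and should be verified against current documentation, and every value is a placeholder.

```python
# Sketch of a Clusters API payload; all values are placeholders.
cluster_spec = {
    "cluster_name": "photon-demo",
    "spark_version": "14.3.x-scala2.12",   # placeholder Databricks Runtime version
    "node_type_id": "i3.xlarge",           # placeholder instance type
    "num_workers": 2,
    "runtime_engine": "PHOTON",            # the only change needed to enable Photon
}
```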
Photon will seamlessly\ncoordinate work and resources and transparently\naccelerate portions of your SQL and Spark queries.\nNo tuning or user intervention required.\n\n\n**Photon in the Databricks Lakehouse Platform**\n\n**Client: submit SQL**\n\nParsing\nCatalyst: analysis/\nplanning/optimization\nscheduling\n\nExecute task Execute task Execute task Execute task\n\n_Lifecycle of a Photon query_\n\n\nSpark\ndriver\nJVM", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "075b546940802feb12ef774f8983d5e5", + "Written from the ground up in C++, Photon takes\nadvantage of modern hardware for faster queries,\nproviding up to 12x better price/performance\ncompared to other cloud data warehouses —\nall natively on your data lake.\n\n\nDatabricks SQL Databricks SQL Cloud data Cloud data Cloud data\nspot on-demand warehouse 1 warehouse 2 warehouse 3\n\n**System**\n\n\n-----\n\nWorks with your existing code\nand avoids vendor lock-in\n\nPhoton is designed to be compatible with the\nApache Spark DataFrame and SQL APIs to ensure\nworkloads run seamlessly without code changes.\nAll you do is turn it on. Photon will seamlessly\ncoordinate work and resources and transparently\naccelerate portions of your SQL and Spark queries.\nNo tuning or user intervention required.\n\n\n**Photon in the Databricks Lakehouse Platform**\n\n**Client: submit SQL**\n\nParsing\nCatalyst: analysis/\nplanning/optimization\nscheduling\n\nExecute task Execute task Execute task Execute task\n\n_Lifecycle of a Photon query_\n\n\nSpark\ndriver\nJVM\n\nSpark\nexecutors mixed\nJVM/Native\n\n\n-----\n\nOptimizing for all data use cases\nand workloads\n\nPhoton is the first purpose-built lakehouse engine\ndesigned to accelerate all data and analytics\nworkloads: data ingestion, ETL, streaming, data\nscience, and interactive queries. While we started\nPhoton primarily focused on SQL to provide\ncustomers with world-class data warehousing\nperformance on their data lakes, we’ve significantly\nincreased the scope of ingestion sources, formats,\nAPIs and methods supported by Photon since\nthen. As a result, customers have seen dramatic\ninfrastructure cost savings and speedups on\nPhoton across all their modern Spark (e.g., Spark\nSQL and DataFrame) workloads.\n\n\nQuery optimizer\n\nNative execution engine\n\nCaching\n\n\n_Accelerating all workloads on the lakehouse_\n\n**Learn more**\n\n[Announcing Photon Public Preview: The Next-Generation](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n[Query Engine on the Databricks Lakehouse Platform](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n\n[Databricks Sets Official Data Warehousing Performance Record](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n\n\n-----\n\n**CHAPTER**\n\n# 04\n\n\n### Unified governance and sharing for data, analytics and AI\n\nToday, more and more organizations recognize the importance of making\nhigh-quality data readily available to data teams to drive actionable insights\nand business value. At the same time, organizations also understand the risks\nof data breaches which negatively impact brand value and inevitably lead to\nerosion of customer trust. 
Governance is one of the most critical components\nof a lakehouse data platform architecture; it helps ensure that data assets\nare securely managed throughout the enterprise. However, many companies\nare using different incompatible governance models leading to complex and\nexpensive solutions.\n\n\n-----\n\n#### Key challenges with data and AI governance\n\n**Diversity of data and AI assets**\n\nThe increased use of data and the added complexity of the data landscape\nhave left organizations with a difficult time managing and governing all types\nof their data-related assets. No longer is data stored in files or tables. Data\nassets today take many forms, including dashboards, machine learning models\nand unstructured data like video and images that legacy data governance\nsolutions simply are not built to govern and manage.\n\n\n**Rising multicloud adoption**\n\nMore and more organizations now leverage a multicloud strategy to optimize\ncosts, avoid vendor lock-in, and meet compliance and privacy regulations. With\nnonstandard, cloud-specific governance models, data governance across clouds\nis complex and requires familiarity with cloud-specific security and governance\nconcepts, such as identity and access management (IAM).", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7c7d19c2aca4f65da5a91323d2845774", + "-----\n\n#### Key challenges with data and AI governance\n\n**Diversity of data and AI assets**\n\nThe increased use of data and the added complexity of the data landscape\nhave left organizations with a difficult time managing and governing all types\nof their data-related assets. No longer is data stored in files or tables. Data\nassets today take many forms, including dashboards, machine learning models\nand unstructured data like video and images that legacy data governance\nsolutions simply are not built to govern and manage.\n\n\n**Rising multicloud adoption**\n\nMore and more organizations now leverage a multicloud strategy to optimize\ncosts, avoid vendor lock-in, and meet compliance and privacy regulations. With\nnonstandard, cloud-specific governance models, data governance across clouds\nis complex and requires familiarity with cloud-specific security and governance\nconcepts, such as identity and access management (IAM).\n\n**Disjointed tools for data governance on the lakehouse**\n\nToday, data teams must deal with a myriad of fragmented tools and services for\ntheir data governance requirements, such as data discovery, cataloging, auditing,\nsharing, access controls, etc. This inevitably leads to operational inefficiencies\nand poor performance due to multiple integration points and network latency\nbetween the services.\n\n\n**Two disparate and incompatible data platforms**\n\nOrganizations today use two different platforms for their data analytics and\nAI efforts — data warehouses for BI and data lakes for AI. 
This results in data\nreplication across two platforms, presenting a major governance challenge.\nWith no unified view of the data landscape, it is difficult to see where data is\nstored, who has access to what data, and consistently define and enforce data\naccess policies across the two platforms with different governance models.\n\n\n-----\n\n#### One security and governance approach\n\nLakehouse systems provide a uniform way to manage access control, data\nquality and compliance across all of an organization’s data using standard\ninterfaces similar to those in data warehouses by adding a management\ninterface on top of data lake storage.\n\nModern lakehouse systems support fine-grained (row, column and view level)\naccess control via SQL, query auditing, attribute-based access control, data\nversioning and data quality constraints and monitoring. These features are\ngenerally provided using standard interfaces familiar to database administrators\n(for example, SQL GRANT commands) to allow existing personnel to manage\nall the data in an organization in a uniform way. Centralizing all the data in\na lakehouse system with a single management interface also reduces the\nadministrative burden and potential for error that comes with managing\nmultiple separate systems.\n\n\n#### What is Unity Catalog?\n\nUnity Catalog is a unified governance solution for all data, analytics and AI\nassets including files, tables, dashboards and machine learning models in your\nlakehouse on any cloud. Unity Catalog simplifies governance by empowering\ndata teams with a common governance model based on ANSI-SQL to define\nand enforce fine-grained access controls. With attribute-based access controls,\ndata administrators can enable fine-grained access controls on rows and\ncolumns using tags (attributes). Built-in data search and discovery allows\ndata teams to quickly find and reference relevant data for any use case. 
Unity\nCatalog offers automated data lineage for all workloads in SQL, R, Scala and\nPython, to build a better understanding of the data and its flow in the lakehouse.\nUnity Catalog also allows data sharing across or within organizations and\nseamless integrations with your existing data governance tools.\n\nWith Unity Catalog, data teams can simplify governance for all data and AI\nassets with one consistent model to discover, access and share data, giving\nyou much better native performance, management and security across clouds.\n\n\n-----\n\n**Key benefits**\n\n\nThe common metadata layer for cross-workspace metadata is at the account\nlevel and eases collaboration by allowing different workspaces to access Unity\nCatalog metadata through a common interface and break down data silos.\nFurther, the data permissions in Unity Catalog are applied to account-level\nidentities, rather than identities that are local to a workspace, allowing\na consistent view of users and groups across all workspaces.\n\n\nCatalog, secure and audit access to all data assets on any cloud\n\nUnity Catalog provides centralized metadata, enabling data teams to create\na single source of truth for all data assets ranging from files, tables, dashboards\nto machine learning models in one place.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "8edcc501e83716d9ea824e0caa38cabf", + "With Unity Catalog, data teams can simplify governance for all data and AI\nassets with one consistent model to discover, access and share data, giving\nyou much better native performance, management and security across clouds.\n\n\n-----\n\n**Key benefits**\n\n\nThe common metadata layer for cross-workspace metadata is at the account\nlevel and eases collaboration by allowing different workspaces to access Unity\nCatalog metadata through a common interface and break down data silos.\nFurther, the data permissions in Unity Catalog are applied to account-level\nidentities, rather than identities that are local to a workspace, allowing\na consistent view of users and groups across all workspaces.\n\n\nCatalog, secure and audit access to all data assets on any cloud\n\nUnity Catalog provides centralized metadata, enabling data teams to create\na single source of truth for all data assets ranging from files, tables, dashboards\nto machine learning models in one place.\n\n\n-----\n\nUnity Catalog offers a unified data access layer that provides a simple and\nstreamlined way to define and connect to your data through managed tables,\nexternal tables, or files, while managing their access controls. Unity Catalog\ncentralizes access controls for files, tables and views.\n\nIt allows fine-grained access controls for restricting access to certain rows\nand columns to the users and groups who are authorized to query them. With\nAttribute-Based Access Controls (ABAC), you can control access to multiple\ndata items at once based on user and data attributes, further simplifying\ngovernance at scale. 
For example, you will be able to tag multiple columns\nas personally identifiable information (PII) and manage access to all columns\ntagged as PII in a single rule.\n\nToday, organizations are dealing with an increased burden of regulatory\ncompliance, and data access auditing is a critical component to ensure your\norganization is set up for success while meeting compliance requirements.\nUnity Catalog also provides centralized fine-grained auditing by capturing an\naudit log of operations such as create, read, update and delete (CRUD) that have\nbeen performed against the data. This allows a fine-grained audit trail showing\nwho accessed a given data set and helps you meet your compliance and\nbusiness requirements.\n\n\n-----\n\nBuilt-in data search and discovery\n\nData discovery is a critical component to break\ndown data silos and democratize data across\nyour organization to make data-driven decisions.\nUnity Catalog provides a rich user interface for\ndata search and discovery, enabling data teams to\nquickly search relevant data assets across the data\nlandscape and reference them for all use cases —\nBI, analytics and machine learning — accelerating\ntime-to-value and boosting productivity.\n\n\n-----\n\nAutomated data lineage for all workloads\n\nData lineage describes the transformations and\nrefinements of data from source to insight. Lineage\nincludes capturing all the relevant metadata and\nevents associated with the data in its lifecycle,\nincluding the source of the data set, what other\ndata sets were used to create it, who created it and\nwhen, what transformations were performed, which\nother data sets leverage it, and many other events\nand attributes. Unity Catalog offers automated data\nlineage down to table and column level, enabling\ndata teams to get an end-to-end view of where\ndata is coming from, what transformations were\nperformed on the data and how data is consumed\nby end applications such as notebooks, workflows,\ndashboards, machine learning models, etc.\n\nWith automated data lineage for all workloads —\nSQL, R, Python and Scala, data teams can quickly\nidentify and perform root cause analysis of any\nerrors in the data pipelines or end applications.\nSecond, data teams can perform impact analysis\nto see dependencies of any data changes\non downstream consumers and notify them\nabout the potential impact. Finally, data lineage\nalso empowers data teams with increased\nunderstanding of their data and reduces tribal\nknowledge. Unity Catalog can also capture lineage\nassociated with non-data entities, such as notebooks,\nworkflows and dashboards. 
Lineage can be\n\n\n_Data lineage with Unity Catalog_\n\nretrieved via REST APIs to support integrations\nwith other catalogs.\n\nIntegrated with your existing tools\n\n\n**Resources**\n\n[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n\n[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n\n[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "6dbd9be1aaa221ebfee5e85daf053e70", + "_Data lineage with Unity Catalog_\n\nretrieved via REST APIs to support integrations\nwith other catalogs.\n\nIntegrated with your existing tools\n\n\n**Resources**\n\n[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n\n[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n\n[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)\n\n\nUnity Catalog helps you to future-proof your data\nand AI governance with the flexibility to leverage\nyour existing data catalogs and governance\nsolutions — Collibra, Alation, Immuta, Privacera,\nMicrosoft Purview and AWS Lakeformation.\n\n\n-----\n\n#### Open data sharing and collaboration\n\nData sharing has become important in the digital\neconomy as enterprises wish to exchange data\neasily and securely with their customers, partners,\nsuppliers and internal lines of business to better\ncollaborate and unlock value from that data. But\nto date, a lack of standards-based data sharing\nprotocol has resulted in data sharing solutions\ntied to a single vendor or commercial product,\nintroducing vendor lock-in risks. What the industry\ndeserves is an open approach to data sharing.\n\n**Why data sharing is hard**\n\nData sharing has evolved from an optional feature\nof a few data platforms to a business necessity\nand success factor for organizations. Our solution\narchitects encounter daily the classic scenarios\nof a retailer looking to publish sales data to their\nsuppliers in real time or a supplier that wants to\nshare real-time inventory.\n\nAs a reminder, data sharing recently triggered\nthe most impressive scientific development that\nhumankind has ever seen. On January 5, 2021, the\nfirst sample of the genome of the coronavirus was\n\n\nuploaded to the internet. It wasn’t a lung biopsy\nfrom a patient in Wuhan, but a shared digital\ngenomic data set that triggered the development\nof the first batch of COVID vaccines worldwide.\n\n\ntreatments, tests and tracking mutations as they\nare passed down through a lineage, a branch of\nthe coronavirus family tree. 
The above graphic\nshows such a [publicly shared mutation data set](https://www.ncbi.nlm.nih.gov/genbank/) .\n\n\nSince then, coronavirus experts have daily\nexchanged public data sets, looking for better\n\n\n-----\n\nSharing data, as well as consuming data from\nexternal sources, allows you to collaborate with\npartners, establish new partnerships, enable\nresearch and can generate new revenue streams\nwith data monetization.\n\nDespite those promising examples, existing data\nsharing technologies come with several limitations:\n\n**•** Traditional data sharing technologies, such as\nSecure File Transfer Protocol (SFTP), do not\nscale well and only serve files offloaded to a\nserver\n\n**•** Cloud object stores operate on an object level\nand are cloud-specific\n\n**•** Commercial data sharing offerings baked into\nvendor products often share tables instead of\nfiles, but scaling them is expensive and they\nare not open and, therefore, do not permit data\nsharing with a different platform\n\nThe following table compares proprietary vendor\nsolutions with SFTP, cloud object stores and Delta\nSharing.\n\n\n\n|Col1|Proprietary vendor solutions|SFTP|Cloud object store|Delta Sharing|\n|---|---|---|---|---|\n|Secure|||||\n|Cheap|||||\n|Vendor agnostic|||||\n|Multicloud|||||\n|Open source|||||\n|Table/DataFrame abstraction|||||\n|Live data|||||\n|Predicate pushdown|||||\n|Object store bandwidth|||||\n|Zero compute cost|||||\n|Scalability|||||\n\n\n-----\n\n**Open source data sharing and Databricks**\n\nTo address the limitations of existing data sharing solutions, Databricks developed\n[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\nto the Linux Foundation.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "2e857bd69bc4575d85c5094c0d462dec", + "The following table compares proprietary vendor\nsolutions with SFTP, cloud object stores and Delta\nSharing.\n\n\n\n|Col1|Proprietary vendor solutions|SFTP|Cloud object store|Delta Sharing|\n|---|---|---|---|---|\n|Secure|||||\n|Cheap|||||\n|Vendor agnostic|||||\n|Multicloud|||||\n|Open source|||||\n|Table/DataFrame abstraction|||||\n|Live data|||||\n|Predicate pushdown|||||\n|Object store bandwidth|||||\n|Zero compute cost|||||\n|Scalability|||||\n\n\n-----\n\n**Open source data sharing and Databricks**\n\nTo address the limitations of existing data sharing solutions, Databricks developed\n[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\nto the Linux Foundation.\n\nAn open source–based solution, such as Delta Sharing, eliminates the lock-in\nof commercial solutions and brings a number of additional benefits such as\ncommunity-developed integrations with popular, open source data processing\nframeworks. In addition, open protocols allow the easy integration of commercial\nclients, such as BI tools.\n\n**What is Databricks Delta Sharing?**\n\nDatabricks Delta Sharing provides an open solution to securely share live data\nfrom your lakehouse to any computing platform. Recipients don’t have to be\non the Databricks platform or on the same cloud or a cloud at all. Data providers\ncan share live data, without replicating or moving it to another system. 
Recipients\nbenefit from always having access to the latest version of data and can quickly\nquery shared data using tools of their choice for BI, analytics and machine\nlearning, reducing time-to-value. Data providers can centrally manage, govern,\naudit and track usage of the shared data on one platform.\n\nUnity Catalog natively supports [Delta Sharing](https://databricks.com/product/delta-sharing) , the world’s first open protocol\nfor data sharing, enabling organizations to share live, large-scale data without\nreplication and make data easily and quickly accessible from tools of your\nchoice, with enterprise-grade security.\n\n\n**Key benefits**\n\nOpen cross-platform sharing\n\nEasily share existing data in Delta Lake and Apache Parquet formats between\ndifferent vendors. Consumers don’t have to be on the Databricks platform, same\ncloud or a cloud at all. Native integration with Power BI, Tableau, Spark, pandas\nand Java allow recipients to consume shared data directly from the tools of their\nchoice. Delta Sharing eliminates the need to set up a new ingestion process to\nconsume data. Data recipients can directly access the fresh data and query it\nusing tools of their choice. Recipients can also enrich data with data sets from\npopular data providers.\n\nSharing live data without copying it\n\nShare live ready-to-query data, without replicating or moving it to another system.\nMost enterprise data today is stored in cloud data lakes. Any of the existing data\nsets on the provider’s data lake can easily be shared across clouds, regions or\ndata platforms without any data replication or physical movement of data. Data\nproviders can update their data sets reliably in real time and provide a fresh and\nconsistent view of their data to recipients.\n\nCentralized administration and governance\n\nYou can centrally govern, track and audit access to the shared data from a single\npoint of enforcement to meet compliance requirements. Detailed user-access\naudit logs are kept to know who is accessing the data and monitor usage of the\nshared data down to table, partition and version level.\n\n\n-----\n\nAn open Marketplace for data solutions\n\nThe demand for third-party data to make data-driven innovations is greater than ever,\n\nand data marketplaces act as a bridge between data providers and data consumers to\n\nhelp facilitate the discovery and distribution of data sets.\n\nDatabricks Marketplace provides an open marketplace for exchanging data products\n\nsuch as data sets, notebooks, dashboards and machine learning models. To accelerate\n\ninsights, data consumers can discover, evaluate and access more data products from\n\nthird-party vendors than ever before. Providers can now commercialize new offerings\n\nand shorten sales cycles by providing value-added services on top of their data.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "14378d678f1ea82334c55482bb6fd4c0", + "Centralized administration and governance\n\nYou can centrally govern, track and audit access to the shared data from a single\npoint of enforcement to meet compliance requirements. 
Detailed user-access\naudit logs are kept to know who is accessing the data and monitor usage of the\nshared data down to table, partition and version level.\n\n\n-----\n\nAn open Marketplace for data solutions\n\nThe demand for third-party data to make data-driven innovations is greater than ever,\n\nand data marketplaces act as a bridge between data providers and data consumers to\n\nhelp facilitate the discovery and distribution of data sets.\n\nDatabricks Marketplace provides an open marketplace for exchanging data products\n\nsuch as data sets, notebooks, dashboards and machine learning models. To accelerate\n\ninsights, data consumers can discover, evaluate and access more data products from\n\nthird-party vendors than ever before. Providers can now commercialize new offerings\n\nand shorten sales cycles by providing value-added services on top of their data.\n\nDatabricks Marketplace is powered by Delta Sharing, allowing consumers to access\n\ndata products without having to be on the Databricks platform. This open approach\n\nallows data providers to broaden their addressable market without forcing consumers\n\ninto vendor lock-in.\n\n_Databricks Marketplace_\n\n\nPrivacy-safe data cleanrooms\n\nPowered by open source Delta Sharing, the Databricks Lakehouse Platform provides\n\na flexible data cleanroom solution allowing businesses to easily collaborate with their\n\ncustomers and partners on any cloud in a privacy-safe way. Participants in the data\n\ncleanrooms can share and join their existing data, and run complex workloads in any\n\nlanguage — Python, R, SQL, Java and Scala — on the data while maintaining data\n\nprivacy. Additionally, data cleanroom participants don’t have to do cost-intensive\n\ndata replication across clouds or regions with other participants, which simplifies data\n\noperations and reduces cost.\n\n_Data cleanrooms with Databricks Lakehouse Platform_\n\n\n-----\n\n**How it works**\n\nDelta Sharing is designed to be simple, scalable, non-proprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing\nis natively integrated with Unity Catalog, which allows customers to add fine-grained governance and security controls, making it easy and safe to share data internally\nor externally.\n\nDelta Sharing is a simple REST protocol that securely shares access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3,\nAzure ADLS or Google’s GCS — to reliably transfer large data sets. Here’s how it works for data providers and data recipients.\n\n**Data provider** **Data recipient**\n\nData science And many more On-premises\n\nThe data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider\ndecides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages access for recipients. To manage\nshares and recipients, you can use SQL commands or the Unity Catalog CLI or the intuitive user interface.\n\nThe data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\nSpark, Java and Python, and is working with partners on many more.\n\n\n-----\n\nThe Delta Sharing data exchange follows three efficient steps:\n\n1. The recipient’s client authenticates to the sharing server and asks to query\na specific table. 
The client can also provide filters on the data (for example,\n“country=US”) as a hint to read just a subset of the data.\n\n2. The server verifies whether the client is allowed to access the data, logs the\nrequest, and then determines which data to send back. This will be a subset\nof the data objects in cloud storage systems that make up the table.\n\n3. To transfer the data, the server generates short-lived presigned URLs that\nallow the client to read these Parquet files directly from the cloud provider,\nso that the transfer can happen in parallel at massive bandwidth, without\nstreaming through the sharing server.\n\n**Learn more**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "a9c08a28601aadaec5c9d2b4059b64a0", + "The data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\nSpark, Java and Python, and is working with partners on many more.\n\n\n-----\n\nThe Delta Sharing data exchange follows three efficient steps:\n\n1. The recipient’s client authenticates to the sharing server and asks to query\na specific table. The client can also provide filters on the data (for example,\n“country=US”) as a hint to read just a subset of the data.\n\n2. The server verifies whether the client is allowed to access the data, logs the\nrequest, and then determines which data to send back. This will be a subset\nof the data objects in cloud storage systems that make up the table.\n\n3. To transfer the data, the server generates short-lived presigned URLs that\nallow the client to read these Parquet files directly from the cloud provider,\nso that the transfer can happen in parallel at massive bandwidth, without\nstreaming through the sharing server.\n\n**Learn more**\n\n[Try Delta Sharing](https://databricks.com/product/delta-sharing)\n\n[Delta Sharing Demo](https://youtu.be/wRT1Vpbyy88)\n\n[Introducing Delta Sharing: An Open Protocol for Secure Data Sharing](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Introducing Data Cleanrooms for the Lakehouse](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Introducing Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Delta Sharing ODSC Webinar](https://www.youtube.com/watch?v=YrNHtaWlkM8)\n\n\n-----\n\n**CHAPTER**\n\n# 05\n\n\n### Security\n\nOrganizations that operate in multicloud environments need a unified, reliable\nand consistent approach to secure data. We’ve learned from our customers that\na simple and unified approach to data security for the lakehouse is one of the\nmost critical requirements for modern data solutions. Databricks is trusted by\nthe world’s largest organizations to provide a powerful lakehouse platform with\nhigh security and scalability. In fact, thousands of customers trust Databricks\nwith their most sensitive data to analyze and build data products using machine\nlearning (ML). 
With significant investment in building a highly secure and scalable\nplatform, Databricks delivers end-to-end platform security for data and users.\n\n\n-----\n\n#### Platform architecture reduces risk\n\nThe Databricks Lakehouse architecture is split into\ntwo separate planes to simplify your permissions,\navoid data duplication and reduce risk. The control\nplane is the management plane where Databricks\nruns the workspace application and manages\nnotebooks, configuration and clusters. Unless you\nchoose to use [serverless compute](https://docs.databricks.com/serverless-compute/index.html) , the data plane\nruns inside your cloud service provider account,\nprocessing your data without taking it out of your\naccount. You can embed Databricks in your data\nexfiltration protection architecture using features\nlike customer-managed VPCs/VNets and admin\nconsole options that disable export.\n\nWhile certain data, such as your notebooks,\nconfigurations, logs, and user information, is\npresent within the control plane, that information\nis encrypted at rest, and communication to and\nfrom the control plane is encrypted in transit.\n\n\n\n**Users**\n\n**Interactive**\n**users**\n\n\n\n\n\n\n|Col1|Control pane|Col3|\n|---|---|---|\n||Web application Configurations Notebooks, repos, DBSQL|Cluster Cluste Your cloud s Your cloud s|\n||Cluster manager||\n\n\nYou also have choices for where certain data lives:\nYou can host your own store of metadata about\nyour data tables (Hive metastore), or store query", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "d3ae991214feea1723ee8744208a2907", + "While certain data, such as your notebooks,\nconfigurations, logs, and user information, is\npresent within the control plane, that information\nis encrypted at rest, and communication to and\nfrom the control plane is encrypted in transit.\n\n\n\n**Users**\n\n**Interactive**\n**users**\n\n\n\n\n\n\n|Col1|Control pane|Col3|\n|---|---|---|\n||Web application Configurations Notebooks, repos, DBSQL|Cluster Cluste Your cloud s Your cloud s|\n||Cluster manager||\n\n\nYou also have choices for where certain data lives:\nYou can host your own store of metadata about\nyour data tables (Hive metastore), or store query\n\n\n**Data**\n\n\n**DBFS root**\n\n\nresults in your cloud service provider account and\ndecide whether to use the [Databricks Secrets API.](https://docs.databricks.com/dev-tools/api/latest/secrets.html)\n\n\n-----\n\n#### Step-by-step example\n\n\n\n**Users**\n\n**Interactive**\n**users**\n\n\n\n\n\n\n\n\n**DBFS root**\n\n|Col1|ample|Col3|Col4|Col5|\n|---|---|---|---|---|\n||Control pane 1 4||||\n|||Web application Configurations Notebooks, repos, DBSQL Cluster manager|6|Cluster Cluste YYoouurr cclloouudd s|\n||||||\n||||||\n||||||\n||||||\n||||||\n\n\n-----\n\nSuppose you have a data engineer that signs in to Databricks and\nwrites a notebook that transforms raw data in Kafka to a normalized\ndata set sent to storage such as Amazon S3 or Azure Data Lake\nStorage. Six steps make that happen:\n\n1. The data engineer seamlessly authenticates, via your single sign-on\nif desired, to the Databricks web UI in the control plane, hosted in\nthe Databricks account.\n\n2. As the data engineer writes code, their web browser sends it to\nthe control plane. JDBC/ODBC requests also follow the same path,\nauthenticating with a token.\n\n3. 
When ready, the control plane uses Cloud Service Provider APIs to\ncreate a Databricks cluster, made of new instances in the data plane,\nin your CSP account. Administrators can apply cluster policies to\nenforce security profiles.\n\n4. Once the instances launch, the cluster manager sends the data\nengineer’s code to the cluster.\n\n5. The cluster pulls from Kafka in your account, transforms the data\nin your account and writes it to a storage in your account.\n\n6. The cluster reports status and any outputs back to the cluster manager.\n\nThe data engineer does not need to worry about many of the details —\nsimply write the code and Databricks runs it.\n\n\n#### Network and server security\n\nHere is how Databricks interacts with your cloud service provider\naccount to manage network and server security\n\n**Networking**\n\nRegardless of where you choose to host the data plane, Databricks networking\nis straightforward. If you host it yourself, Databricks by default will still configure\nnetworking for you, but you can also control data plane networking with your\nown managed VPC or VNet.\n\nThe serverless data plane network infrastructure is managed by Databricks in\na Databricks cloud service provider account and shared among customers,\nwith additional network boundaries between workspaces and between clusters.\n\nDatabricks does not rewrite or change your data structure in your storage, nor\ndoes it change or modify any of your security and governance policies. Local\nfirewalls complement security groups and subnet firewall policies to block\nunexpected inbound connections.\n\nCustomers at the enterprise tier can also use the IP access list feature on\nthe control plane to limit which IP addresses can connect to the web UI or\nREST API — for example, to allow only VPN or office IPs.\n\n\n-----\n\n**Servers**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "8dc7a1ecd072e76cd966c1e96c25cf97", + "Here is how Databricks interacts with your cloud service provider\naccount to manage network and server security\n\n**Networking**\n\nRegardless of where you choose to host the data plane, Databricks networking\nis straightforward. If you host it yourself, Databricks by default will still configure\nnetworking for you, but you can also control data plane networking with your\nown managed VPC or VNet.\n\nThe serverless data plane network infrastructure is managed by Databricks in\na Databricks cloud service provider account and shared among customers,\nwith additional network boundaries between workspaces and between clusters.\n\nDatabricks does not rewrite or change your data structure in your storage, nor\ndoes it change or modify any of your security and governance policies. Local\nfirewalls complement security groups and subnet firewall policies to block\nunexpected inbound connections.\n\nCustomers at the enterprise tier can also use the IP access list feature on\nthe control plane to limit which IP addresses can connect to the web UI or\nREST API — for example, to allow only VPN or office IPs.\n\n\n-----\n\n**Servers**\n\nIn the data plane, Databricks clusters automatically run the latest hardened\nsystem image. Users cannot choose older (less secure) images or code. 
For AWS\nand Azure deployments, images are typically updated every two-to-four weeks.\nGCP is responsible for its system image.\n\nDatabricks runs scans for every release, including:\n\n**•** System image scanning for vulnerabilities\n\n**•** Container OS and library scanning\n\n\n**Severity** **Remediation time**\n\n**Critical** **< 14 days**\n\n**High** **< 30 days**\n\n**Medium** **< 60 days**\n\n**Low** **When appropriate**\n\n\n\n**•** Static and dynamic code scanning\n\n**Databricks access**\n\n\nDatabricks code is peer reviewed by developers who have security training.\nSignificant design documents go through comprehensive security reviews.\nScans run fully authenticated, with all checks enabled, and issues are\ntracked against the timeline shown in this table.\n\nNote that Databricks clusters are typically short-lived (often terminated\nafter a job completes) and do not persist data after they terminate. Clusters\ntypically share the same permission level (excluding high concurrency or\nDatabricks SQL clusters, where more robust security controls are in place).\nYour code is launched in an unprivileged container to maintain system\nstability. This security design provides protection against persistent attackers\nand privilege escalation.\n\n\nDatabricks access to your environment is limited to cloud service provider APIs\nfor our automation and support access. Automated access allows the Databricks\ncontrol plane to configure resources in your environment using the cloud service\nprovider APIs. The specific APIs vary based on the cloud. For instance, an AWS\ncross-account IAM role, or Azure-owned automation or GKE automation do not\ngrant access to your data sets (see the next section).\n\nDatabricks has a custom-built system that allows staff to fix issues or handle\nsupport requests — for example, when you open a support request and check the\nbox authorizing access to your workspace. Access requires either a support ticket\nor engineering ticket tied expressly to your workspace and is limited to a subset of\nemployees and for limited time periods. Additionally, if you have configured audit\nlog delivery, the audit logs show the initial access event and the staff’s actions.\n\n\n-----\n\n**Identity and access**\n\nDatabricks supports robust ACLs and SCIM. AWS customers can configure\nSAML 2.0 and block non-SSO logins. Azure Databricks and Databricks on\nGCP automatically integrate with Azure Active Directory or GCP identity.\n\nDatabricks supports a variety of ways to enable users to access their data.\n\n**Examples include:**\n\n**•** The Table ACLs feature uses traditional SQL-based statements to\nmanage access to data and enable fine-grained view-based access\n\n**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\nusers of that cluster automatically access allowed resources without\nexplicit credentials\n\n**•** External storage can be mounted or accessed using a securely\nstored access key", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3effc9d28683f767fe2ff753dd4bcb04", + "-----\n\n**Identity and access**\n\nDatabricks supports robust ACLs and SCIM. AWS customers can configure\nSAML 2.0 and block non-SSO logins. 
Azure Databricks and Databricks on\nGCP automatically integrate with Azure Active Directory or GCP identity.\n\nDatabricks supports a variety of ways to enable users to access their data.\n\n**Examples include:**\n\n**•** The Table ACLs feature uses traditional SQL-based statements to\nmanage access to data and enable fine-grained view-based access\n\n**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\nusers of that cluster automatically access allowed resources without\nexplicit credentials\n\n**•** External storage can be mounted or accessed using a securely\nstored access key\n\n**•** The Secrets API separates credentials from code when accessing\nexternal resources\n\n\n**Data security**\n\nDatabricks provides encryption, isolation and auditing.\n\n**Databricks encryption capabilities are**\n**in place both at rest and in motion**\n\n\n\n|For data-at-rest encryption: • Control plane is encrypted • Data plane supports local encryption • Customers can use encrypted storage buckets • Customers at some tiers can confgi ure customer-managed keys for managed services|For data-in-motion encryption: • Control plane <-> data plane is encrypted • Offers optional intra-cluster encryption • Customer code can be written to avoid unencrypted services (e.g., FTP)|\n|---|---|\n\n\n**Customers can isolate users at multiple levels:**\n\n**•** **Workspace level:** Each team or department can use a separate workspace\n\n**•** **Cluster level:** Cluster ACLs can restrict the users who can attach notebooks\n\nto a given cluster\n\n**•** **High concurrency clusters:** Process isolation, JVM whitelisting and limited\nlanguages (SQL, Python) allow for the safe coexistence of users of different\nprivilege levels, and is used with Table ACLs\n\n**•** **Single-user cluster:** Users can create a private dedicated cluster\n\nActivities of Databricks users are logged and can be delivered automatically to\na cloud storage bucket. Customers can also monitor provisioning activities by\nmonitoring cloud audit logs.\n\n\n-----\n\n**Compliance**\n\n**Databricks supports the following compliance standards on**\n\n**our multi-tenant platform:**\n\n**•** **SOC 2 Type II**\n\n**•** **ISO 27001**\n\n**•** **ISO 27017**\n\n**•** **ISO 27018**\n\nCertain clouds support Databricks deployment options for FedRAMP\nHigh, HITRUST, HIPAA and PCI. Databricks Inc. and the Databricks platform\nare also GDPR and CCPA ready.\n\n**Learn more**\n\nTo learn more about Databricks security,\nvisit the [Security and Trust Center](https://databricks.com/trust)\n\n\n-----\n\n**CHAPTER**\n\n# 06\n\n\n### Instant compute and serverless\n\n\n-----\n\n#### Benefits of Databricks Serverless SQL\n\nServerless SQL is much easier to administer with Databricks taking on the\nresponsibility of deploying, configuring and managing your cluster VMs. Databricks\ncan transfer compute capacity to user queries typically in about 15 seconds — so\nyou no longer need to wait for clusters to start up or scale out to run your queries.\n\nServerless SQL also has built-in connectors to your favorite tools such as Tableau,\nPower BI, Qlik, etc. These connectors use optimized JDBC/ODBC drivers for easy\nauthentication support and high performance. 
And finally, you save on cost\nbecause you do not need to overprovision or pay for the idle capacity.\n\n\n#### What is serverless compute?", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "bd59e24ba90fb53ddd7c055746f246e9", + "To learn more about Databricks security,\nvisit the [Security and Trust Center](https://databricks.com/trust)\n\n\n-----\n\n**CHAPTER**\n\n# 06\n\n\n### Instant compute and serverless\n\n\n-----\n\n#### Benefits of Databricks Serverless SQL\n\nServerless SQL is much easier to administer with Databricks taking on the\nresponsibility of deploying, configuring and managing your cluster VMs. Databricks\ncan transfer compute capacity to user queries typically in about 15 seconds — so\nyou no longer need to wait for clusters to start up or scale out to run your queries.\n\nServerless SQL also has built-in connectors to your favorite tools such as Tableau,\nPower BI, Qlik, etc. These connectors use optimized JDBC/ODBC drivers for easy\nauthentication support and high performance. And finally, you save on cost\nbecause you do not need to overprovision or pay for the idle capacity.\n\n\n#### What is serverless compute?\n\nServerless compute is a fully managed service where Databricks provisions\nand manages the compute layer on behalf of the customer in the Databricks\ncloud account instead of the customer account. As of the current release,\nserverless compute is supported for use with Databricks SQL. This new\ncapability for Databricks SQL provides instant compute to users for their\nBI and SQL workloads, with minimal management required and capacity\noptimizations that can lower overall cost by 20%-40% on average. This\nmakes it even easier for organizations to expand adoption of the lakehouse\nfor business analysts who are looking to access the rich, real-time data sets\nof the lakehouse with a simple and performant solution.\n\n\n-----\n\n**Inside Serverless SQL**\n\n\n**Databricks Serverless SQL**\n\n**Managed servers**\n\n**Serverless SQL**\n**compute**\n\n**Secure**\n**Instant compute**\n\n\nAt the core of Serverless SQL is a compute\nplatform that operates a pool of servers located\nin a Databricks’ account, running Kubernetes\ncontainers that can be assigned to a user\nwithin seconds.\n\nWhen many users are running reports or queries\nat the same time, the compute platform adds more\nservers to the cluster (again, within seconds) to\nhandle the concurrent load. Databricks manages\nthe entire configuration of the server and\nautomatically performs the patching and upgrades\nas needed.\n\nEach server is running a secure configuration and\nall processing is secured by three layers of isolation:\nThe Kubernetes container hosting the runtime; the\nvirtual machine (VM) hosting the container; and\nthe virtual network for the workspace. Each layer\nis isolated to one workspace with no sharing or\ncross-network traffic allowed. The containers use\nhardened configurations, VMs are shut down and\nnot reused, and network traffic is restricted\nto nodes in the same cluster.\n\n\n-----\n\n#### Performance of Serverless SQL\n\nWe ran a set of internal tests to compare\nDatabricks Serverless SQL to the current\nDatabricks SQL and several traditional cloud\ndata warehouses. 
We found Serverless SQL\nto be the most cost-efficient and performant\nenvironment to run SQL workloads when\nconsidering cluster startup time, query\nexecution time and overall cost.\n\n\n**Databricks Serverless SQL is the highest**\n**performing and most cost-effective solution**\n\n**Cloud SQL solutions compared**\n\n\n**Faster**\n\n**Query**\n**execution**\n**time**\n\n**Slower**\n\n\n**Serverless**\n**SQL**\n\n**CDW1**\n\n**CDW3**\n\n\n**Cost Estimate**\n\n**High**\n\n**Medium**\n\n**Low**\n\n\n**CDW2**\n\n\n**CDW4**\n\n\n**Slower** **Faster**\n**(~5min)** **Startup time** **(~2-3sec)**\n\n**Learn more**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "11c8b386a7d3091ed8341cf2d56b6a97", + "**Databricks Serverless SQL is the highest**\n**performing and most cost-effective solution**\n\n**Cloud SQL solutions compared**\n\n\n**Faster**\n\n**Query**\n**execution**\n**time**\n\n**Slower**\n\n\n**Serverless**\n**SQL**\n\n**CDW1**\n\n**CDW3**\n\n\n**Cost Estimate**\n\n**High**\n\n**Medium**\n\n**Low**\n\n\n**CDW2**\n\n\n**CDW4**\n\n\n**Slower** **Faster**\n**(~5min)** **Startup time** **(~2-3sec)**\n\n**Learn more**\n\nThe feature is currently in Public Preview. Sign up to\n[request access to Serverless SQL](https://databricks.com/p/ebook/serverless-sql-preview-sign-up) . To learn more about\nServerless SQL, visit our [documentation page.](https://docs.databricks.com/serverless-compute/index.html)\n\n\n-----\n\n**CHAPTER**\n\n# 07\n\n\n### Data warehousing\n\nData warehouses are not keeping up with today’s world. The explosion of\nlanguages other than SQL and unstructured data, machine learning, IoT and\nstreaming analytics are forcing organizations to adopt a bifurcated architecture\nof disjointed systems: Data warehouses for BI and data lakes for ML. While SQL\nis ubiquitous and known by millions of professionals, it has never been treated\nas a first-class citizen on data lakes, until the lakehouse.\n\n\n-----\n\n#### What is data warehousing\n\nThe Databricks Lakehouse Platform provides a simplified multicloud and\nserverless architecture for your data warehousing workloads. Data warehousing on\nthe lakehouse allows SQL analytics and BI at scale with a common governance\nmodel. Now you can ingest, transform and query all your data in-place — using\nyour SQL and BI tools of choice — to deliver real-time business insights at the\nbest price/performance. 
Built on open standards and APIs, the lakehouse\nprovides the reliability, quality and performance that data lakes natively lack,\nand integrations with the ecosystem for maximum flexibility — no lock-in.\n\nWith data warehousing on the lakehouse, organizations can unify all analytics\nand simplify their architecture to enable their business with real-time business\ninsights at the best price/performance.\n\n\n#### Key benefits\n\n**Best price/performance**\n\nLower costs, get the best price/performance and eliminate\nresource management overhead\n\nOn-premises data warehouses have reached their limits — they physically\ncannot scale to handle the growing volumes of data, and don’t provide the\nelasticity customers need to respond to ever-changing business needs.\nCloud data warehouses are a great alternative to on-premises data\nwarehouses, providing greater scale and elasticity, but cloud costs for\nproprietary cloud data warehouses typically yield to an exponential cost\nincrease following the growth of data volume.\n\nThe Databricks Lakehouse Platform provides instant, elastic SQL serverless\ncompute — decoupled from storage on cheap cloud object stores — and\nthousands of performance optimizations that can lower overall infrastructure\ncosts by [an average of 40%](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) . Databricks automatically determines instance\ntypes and configuration for the best price/performance — [up to 12x better](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[than traditional cloud data warehouses](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) — and scale for high concurrency\nuse cases.\n\n\n-----\n\n**Built-in governance**\n\nOne source of truth and one unified\ngovernance layer across all data teams", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "429e8a17397c072459bbacaa5419cec3", + "The Databricks Lakehouse Platform provides instant, elastic SQL serverless\ncompute — decoupled from storage on cheap cloud object stores — and\nthousands of performance optimizations that can lower overall infrastructure\ncosts by [an average of 40%](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) . Databricks automatically determines instance\ntypes and configuration for the best price/performance — [up to 12x better](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[than traditional cloud data warehouses](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) — and scale for high concurrency\nuse cases.\n\n\n-----\n\n**Built-in governance**\n\nOne source of truth and one unified\ngovernance layer across all data teams\n\nUnderpinned by Delta Lake, the Databricks\nLakehouse Platform simplifies your architecture by\nallowing you to establish one single copy of all your\ndata for in-place analytics and ETL/ELT on your\nexisting data lakes — no more data movements\nand copies in disjointed systems. 
Then, seamless\nintegration with Databricks Unity Catalog lets you\neasily discover, secure and manage all your data\nwith fine-grained governance, data lineage, and\nstandard SQL.\n\n**Rich ecosystem**\n\nIngest, transform and query all your\ndata in-place with your favorite tools\n\nVery few tools exist to conduct BI on data lakes.\nGenerally, doing so has required data analysts to\n\nsubmit Spark jobs or use a developer interface.\nWhile these tools are common for data scientists,\nthey require knowledge of languages and\ninterfaces that are not traditionally part of a data\nanalyst’s tool set. As a result, the learning curve for\nan analyst to make use of a data lake is too high\nwhen well-established tools and methods already\nexist for data warehouses.\n\n\nThe Databricks Lakehouse Platform works with\nyour preferred tools like dbt, Fivetran, Power BI or\nTableau, allowing analysts and analytical engineers\nto easily ingest, transform and query the most\nrecent and complete data, without having to move\nit into a separate data warehouse. Additionally, it\nempowers every analyst across your organization\nto quickly and collaboratively find and share new\ninsights with a built-in SQL editor, visualizations\nand dashboards.\n\n**Break down silos**\n\nAccelerate time from raw to actionable\ndata and go effortlessly from BI to ML\n\n\napplications, organizations will need to manage\nan entirely different system than their SQL-only\ndata warehouse, slowing down collaboration and\ninnovation.\n\nThe Databricks Lakehouse Platform provides the\nmost complete end-to-end data warehousing\nsolution for all your modern analytics needs,\nand more. Now you can empower data teams\nand business users to access the latest data\nfaster for downstream real-time analytics and go\neffortlessly from BI to ML. Speed up the time from\nraw to actionable data at any scale — in batch and\nstreaming. And go from descriptive to advanced\nanalytics effortlessly to uncover new insights.\n\n\nIt is challenging for data engineering teams to\nenable analysts at the speed that the business\nrequires. Data warehouses need data to be\ningested and processed ahead of time before\nanalysts can access and query it using BI tools.\nBecause traditional data warehouses lack\nreal-time processing and do not scale well for\nlarge ETL jobs, they create new data movements\nand bottlenecks for the data engineering team,\nand make it slow for analysts to access the\nlatest data. 
And for advanced analytics (ML)\n\n\n-----\n\n**Data warehousing on Databricks**\n\n**Truly decoupled, serverless, compute layer**\n\n\n**Data consumers**\n\n\n**Data processing**\n\n**Unity Catalog**\n\n\n**ETL** **ETL**\n\n**Bronze raw** **Silver staging** **Gold DW/marts**\n\n\n**Open storage layer**\n\n**Data ingest**\n\n**Data sources**\n\n\n**Databricks**\n**Partner Connect**\n\n\n**Continuous**\n**ingest**\n\n\n**Batch**\n**ingest**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "22fe9fe6a29c6511b002810677a23bc5", + "-----\n\n**Data warehousing on Databricks**\n\n**Truly decoupled, serverless, compute layer**\n\n\n**Data consumers**\n\n\n**Data processing**\n\n**Unity Catalog**\n\n\n**ETL** **ETL**\n\n**Bronze raw** **Silver staging** **Gold DW/marts**\n\n\n**Open storage layer**\n\n**Data ingest**\n\n**Data sources**\n\n\n**Databricks**\n**Partner Connect**\n\n\n**Continuous**\n**ingest**\n\n\n**Batch**\n**ingest**\n\n\n**On-premises**\n\n**OLTP** **OLAP** **Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n\n**DWH**\n\n\n**On-premises**\n\n**Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n\n**DWH**\n\n\n**Learn more**\n\n\n[Try Databricks SQL for free](https://dbricks.co/dbsql)\n\n[Databricks SQL Demo](https://databricks.com/discover/demos/databricks-sql)\n\n[Databricks SQL Data](https://youtu.be/jlEdoVpWwNc)\n[Warehousing Admin Demo](https://youtu.be/jlEdoVpWwNc)\n\n\n[On-demand Webinar: Learn](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n[Databricks SQL From the Experts](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n\n[eBook: Inner Workings of the](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n[Lakehouse for Analytics and BI](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n\n\n-----\n\n**CHAPTER**\n\n# 08\n\n\n### Data engineering\n\nOrganizations realize the value data plays as a strategic asset for growing\nrevenues, improving the customer experience, operating efficiently or improving\na product or service. Data is really the driver of all these initiatives. Nowadays,\ndata is often streamed and ingested from hundreds of different data sources,\nsometimes acquired from a data exchange, cleaned in various ways with\ndifferent orchestrated steps, versioned and shared for analytics and AI.\nAnd increasingly, data is being monetized.\n\nData teams rely on getting the right data at the right time for analytics, data\nscience and machine learning, but often are faced with challenges meeting\nthe needs of their initiatives for data engineering.\n\n\n-----\n\n#### Why data engineering is hard\n\nOne of the biggest challenges is accessing and managing the increasingly\ncomplex data that lives across the organization. Most of the complexity\narises with the explosion of data volumes and data types, with organizations\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n\nWith this volume, managing data pipelines to transform and process data\nis slow and difficult, and increasingly expensive. 
And to top off the complexity,\nmost businesses are putting an increased emphasis on multicloud\nenvironments which can be even more difficult to maintain.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e577e0ac294ad34249c7d000936d7c72", + "Data teams rely on getting the right data at the right time for analytics, data\nscience and machine learning, but often are faced with challenges meeting\nthe needs of their initiatives for data engineering.\n\n\n-----\n\n#### Why data engineering is hard\n\nOne of the biggest challenges is accessing and managing the increasingly\ncomplex data that lives across the organization. Most of the complexity\narises with the explosion of data volumes and data types, with organizations\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n\nWith this volume, managing data pipelines to transform and process data\nis slow and difficult, and increasingly expensive. And to top off the complexity,\nmost businesses are putting an increased emphasis on multicloud\nenvironments which can be even more difficult to maintain.\n\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\nthat data itself has become a product, and the challenging goal of the data\nengineer is to build and run the machinery that creates this high-fidelity\ndata product all the way from ingestion to monetization.\n\n\nDespite current technological advances data engineering remains\ndifficult for several reasons:\n\n**Complex data ingestion methods**\n\nData ingestion means retrieving batch and streaming data from various\nsources and in various formats. Ingesting data is hard and complex since you\neither need to use an always-running streaming platform like Apache Kafka\nor you need to be able to keep track of which files haven’t been ingested yet.\nData engineers are required to spend a lot of time hand-coding repetitive\nand error-prone data ingestion tasks.\n\n**Data engineering principles**\n\nThese days, large operations teams are often just a memory of the past.\nModern data engineering principles are based on agile software development\nmethodologies. They apply the well-known “you build it, you run it” paradigm,\nuse isolated development and production environments, CI/CD, and version\ncontrol transformations that are pushed to production after validation. Tooling\nneeds to support these principles.\n\n\n-----\n\n**Third-party tools**\n\nData engineers are often required to run additional third-party tools for\norchestration to automate tasks such as ELT/ETL or customer code in\nnotebooks. Running third-party tools increases the operational overhead\nand decreases the reliability of the system.\n\n**Performance tuning**\n\nFinally, with all pipelines and workflows written, data engineers need to\nconstantly focus on performance, tuning pipelines and architectures to meet\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\narchitecture and constantly observing throughput parameters.\n\nMost organizations are dealing with a complex landscape of data warehouses\nand data lakes these days. 
Each of those platforms has its own limitations,\nworkloads, development languages and governance model.\n\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The lakehouse platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability\nto drive valuable insights.\n\nData engineering in the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows\non any cloud platform, and meet regulatory requirements to maintain\nworld-class governance.\n\nThe lakehouse provides an end-to-end data engineering and ETL platform\nthat automates the complexity of building and maintaining pipelines and\nrunning ETL workloads so data engineers and analysts can focus on quality\nand reliability to drive valuable insights.\n\n\n#### Databricks makes modern data engineering simple\n\nThere is no industry-wide definition of modern data engineering.\nThis should come close:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9f81ac0b52802c7152247bfd5289b744", + "With the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The lakehouse platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability\nto drive valuable insights.\n\nData engineering in the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows\non any cloud platform, and meet regulatory requirements to maintain\nworld-class governance.\n\nThe lakehouse provides an end-to-end data engineering and ETL platform\nthat automates the complexity of building and maintaining pipelines and\nrunning ETL workloads so data engineers and analysts can focus on quality\nand reliability to drive valuable insights.\n\n\n#### Databricks makes modern data engineering simple\n\nThere is no industry-wide definition of modern data engineering.\nThis should come close:\n\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\n_kinds of workflows._\n\n\n-----\n\n-----\n\n#### Benefits of data engineering on the lakehouse\n\nBy simplifying and modernizing with the lakehouse architecture, data engineers\ngain an enterprise-grade and enterprise-ready approach to building data\npipelines. 
The following are eight key differentiating capabilities that a data\nengineering solution team can enable with the Databricks Lakehouse Platform:\n\n**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\nengineers can enable fast, reliable, scalable and automatic data ingestion\nfor analytics, data science or machine learning.\n\n\n\n**•** **Data pipeline observability:** Monitor overall data pipeline estate status\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\nhealth for performance, quality, status and latency.\n\n**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\nanalytics and machine learning use cases by enabling easy and automatic\ndata pipeline deployments into production or roll back pipelines and\nminimize downtime.\n\n**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\nof data processing tasks for data and machine learning pipelines with the\nability to run multiple non-interactive tasks as a directed acyclic graph\n(DAG) on a Databricks compute cluster.\n\n\n\n**•** **Automated ETL pipelines:** Data engineers can reduce development\ntime and effort and focus on implementing business logic and data\nquality checks within the data pipeline using SQL or Python.\n\n**•** **Data quality checks:** Improve data reliability throughout the data\nlakehouse so data teams can confidently trust the information for\ndownstream initiatives with the ability to define data quality and\nautomatically address errors.\n\n**•** **Batch and streaming:** Allow data engineers to set tunable data latency\nwith cost controls without having to know complex stream processing\nand implement recovery logic.\n\n**•** **Automatic recovery:** Handle transient errors and use automatic recovery\nfor most common error conditions that can occur during the operation of\na pipeline with fast, scalable fault-tolerance.\n\n\n-----\n\n**Data engineering is all about data quality**\n\nThe goal of modern data engineering is to distill data with a quality that is fit for\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\nthree different levels.\n\n\n1. On a **technical level** , data quality is\nguaranteed by enforcing and evolving\nschemas for data storage and ingestion.\n\n**Kenesis**\n\n**CSV,**\n**JSON, TXT...**\n\n**Data Lake**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1cf679a6f5d9f2a7337862946918109e", + "**•** **Batch and streaming:** Allow data engineers to set tunable data latency\nwith cost controls without having to know complex stream processing\nand implement recovery logic.\n\n**•** **Automatic recovery:** Handle transient errors and use automatic recovery\nfor most common error conditions that can occur during the operation of\na pipeline with fast, scalable fault-tolerance.\n\n\n-----\n\n**Data engineering is all about data quality**\n\nThe goal of modern data engineering is to distill data with a quality that is fit for\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\nthree different levels.\n\n\n1. On a **technical level** , data quality is\nguaranteed by enforcing and evolving\nschemas for data storage and ingestion.\n\n**Kenesis**\n\n**CSV,**\n**JSON, TXT...**\n\n**Data Lake**\n\n\n2. 
On an **architectural level** , data quality is\noften achieved by implementing the medallion\narchitecture. A medallion architecture is a data\ndesign pattern used to logically organize data in\na [lakehouse](https://databricks.com/glossary/data-lakehouse) with the goal of incrementally and\nprogressively improving the structure and quality\nof data as it flows through each layer of the\narchitecture, e.g., from Bronze to Silver to Gold\nlayer tables.\n\n\n3. The **Databricks Unity Catalog** comes\nwith robust data quality management with\nbuilt-in quality controls, testing, monitoring\nand enforcement to ensure accurate and\nuseful data is available for downstream BI,\nanalytics and machine learning workloads.\n\n**Streaming**\n**analytics**\n\n\n**Bronze**\n\n\n**Silver**\n\n\n**Gold**\n\n\n**BI and**\n\n**reporting**\n\n\nRaw ingestion Filtered, cleaned, Business-level\nand history augmented aggregates\n\n**Quality**\n\n\n**Data science**\n\n**and ML**\n\n\n-----\n\n#### Data ingestion\n\nWith the Databricks Lakehouse Platform, data engineers can build robust\nhyper-scale ingestion pipelines in streaming and batch mode. They can\nincrementally process new files as they land on cloud storage — with no\nneed to manage state information — in scheduled or continuous jobs.\n\nData engineers can efficiently track new files (with the ability to scale\nto billions of files) without having to list them in a directory. Databricks\nautomatically infers the schema from the source data and evolves it as\nthe data loads into the Delta Lake lakehouse. Efforts continue with\nenhancing and supporting Auto Loader, our powerful data ingestion\ntool for the Lakehouse.\n\n**What is Auto Loader?**\n\nHave you ever imagined that ingesting data could become as easy\nas dropping a file into a folder? Welcome to Databricks Auto Loader.\n\n[Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) is an optimized data ingestion tool that incrementally and\nefficiently processes new data files as they arrive in the cloud storage built\ninto the Databricks Lakehouse. Auto Loader can detect and enforce the\nschema of your data and, therefore, guarantee data quality. New files or\nfiles that have been changed since the last time new data was processed\nare identified automatically and ingested. Noncompliant data sets are\nquarantined into rescue data columns. You can use the [trigger once]\noption with Auto Loader to turn it into a job that turns itself off.\n\n\n**Ingestion for data analysts: COPY INTO**\n\nIngestion also got much easier for data analysts and analytics engineers working\nwith Databricks SQL. [COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) is a simple SQL command that follows the\nlake-first approach and loads data from a folder location into a Delta Lake table.\nCOPY INTO can be scheduled and called by a job repeatedly. When run, only new\nfiles from the source location will be processed.\n\n#### Data transformation", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "770d96f1c053793d9736812b3605af5f", + "**Ingestion for data analysts: COPY INTO**\n\nIngestion also got much easier for data analysts and analytics engineers working\nwith Databricks SQL. 
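A minimal sketch of the Auto Loader pattern described above, before turning to the SQL path; the volume path, checkpoint location and target table are illustrative placeholders, and the snippet assumes it runs somewhere `spark` is already defined (e.g., a Databricks notebook).

```python
# Minimal Auto Loader sketch: incrementally ingest new files from cloud storage
# into a Delta table, with schema inference and "trigger once"-style scheduling.
raw_path = "/Volumes/my_catalog/my_schema/raw_docs"            # illustrative source location
checkpoint = "/Volumes/my_catalog/my_schema/_checkpoints/raw"  # tracks files already ingested

(
    spark.readStream.format("cloudFiles")                # "cloudFiles" enables Auto Loader
    .option("cloudFiles.format", "json")                 # format of the incoming files
    .option("cloudFiles.schemaLocation", checkpoint)     # where the inferred schema is stored and evolved
    .load(raw_path)
    .writeStream
    .option("checkpointLocation", checkpoint)
    .trigger(availableNow=True)                          # process all new files, then stop
    .toTable("my_catalog.my_schema.bronze_docs")         # illustrative Delta target
)
```

Rerunning the same cell only picks up files that arrived since the last run, which is the behavior the passage above describes.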
[COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) is a simple SQL command that follows the\nlake-first approach and loads data from a folder location into a Delta Lake table.\nCOPY INTO can be scheduled and called by a job repeatedly. When run, only new\nfiles from the source location will be processed.\n\n#### Data transformation\n\nTurning SQL queries into production ETL pipelines typically involves a lot\nof tedious, complicated operational work. Even at a small scale, the majority\nof a data practitioner’s time is spent on tooling and managing infrastructure.\n\nAlthough the medallion architecture is an established and reliable pattern\nfor improving data quality, the implementation of this pattern is challenging\nfor many data engineering teams.\n\nWhile hand-coding the medallion architecture was hard for data engineers,\ncreating data pipelines was outright impossible for data analysts not being\nable to code with Spark Structured Streaming in Scala or Python.\n\nEven at a small scale, most data engineering time is spent on tooling and\nmanaging infrastructure rather than transformation. Auto-scaling, observability\nand governance are difficult to implement and, as a result, often left out of the\nsolution entirely.\n\n\n-----\n\n#### What is Delta Live Tables?\n\nDelta Live Tables (DLT) is the first ETL framework that uses a simple **declarative approach** to building reliable data pipelines. DLT automatically auto-scales your\ninfrastructure so data analysts and engineers can spend less time on tooling and focus on getting value from data. Engineers are able to **treat their data as code**\nand apply modern software engineering best practices like testing, error-handling, monitoring and documentation to deploy reliable pipelines at scale. DLT fully supports\nboth Python and SQL and is tailored to work with both streaming and batch workloads.\n\nWith DLT you write a Delta Live Table in a SQL notebook, create a pipeline under Workflows and simply click [Start].\n\n\n**Write** **create live table**\n\n\n**Create** **a pipeline** **Click** **Start**\n\nStart\n\n\n-----\n\nDLT reduces the implementation time by accelerating development and\nautomating complex operational tasks. Since DLT can use plain SQL, it also\nenables data analysts to create production pipelines and turns them into\nthe often discussed “analytics engineer.” At runtime, DLT speeds up pipeline\nexecution applied with Photon.\n\nSoftware engineering principles are applied for data engineering to foster the\nidea of treating your data as code. Your data is the sole source of truth for what\nis going on inside your business.\n\nBeyond just the transformations, there are many things that should be included\n\nDependency\nFull refresh\nmanagement\n\n*Coming soon\n\n\nin the code that define your data. Declaratively express entire data flows in SQL\nor Python. Natively enable modern software engineering best practices like\nseparate development and production environments, the ability to easily test\nbefore deploying, deploy and manage environments using parameterization, unit\ntesting and documentation.\n\nDLT also automatically scales compute, providing the option to set the minimum\nand maximum number of instances and let DLT size up the cluster according\nto cluster utilization. 
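As a rough sketch of the declarative approach and expectations described above, a DLT pipeline defined in Python might look like the following; the code only executes inside a DLT pipeline, and the source path, table names and quality rule are illustrative.

```python
import dlt
from pyspark.sql.functions import col

@dlt.table(comment="Raw events ingested incrementally with Auto Loader")
def bronze_events():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/Volumes/my_catalog/my_schema/raw_events")   # illustrative path
    )

@dlt.table(comment="Cleaned events ready for analytics")
@dlt.expect_or_drop("valid_user", "user_id IS NOT NULL")    # expectation: drop rows that fail the rule
def silver_events():
    return dlt.read_stream("bronze_events").withColumn("amount", col("amount").cast("double"))
```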
In addition, tasks like orchestration, error handling and\nrecovery, and performance optimization are all handled automatically.\n\n\nIncremental\ncomputation*\n\n\nCheckpointing\nand retries\n\n\n-----\n\nExpectations in the code help prevent bad data from flowing into tables, track\ndata quality over time, and provide tools to troubleshoot bad data with granular\npipeline observability. This enables a high-fidelity lineage diagram of your\npipeline to track dependencies and aggregate data quality metrics across all\nyour pipelines.\n\nUnlike other products that force you to deal with streaming and batch workloads\nseparately, DLT supports any type of data workload with a single API so data\nengineers and analysts alike can build cloud-scale data pipelines faster without\nthe need for advanced data engineering skills.\n\n#### Data orchestration", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "4018bedbe9433bb9032e8c093e83a934", + "DLT also automatically scales compute, providing the option to set the minimum\nand maximum number of instances and let DLT size up the cluster according\nto cluster utilization. In addition, tasks like orchestration, error handling and\nrecovery, and performance optimization are all handled automatically.\n\n\nIncremental\ncomputation*\n\n\nCheckpointing\nand retries\n\n\n-----\n\nExpectations in the code help prevent bad data from flowing into tables, track\ndata quality over time, and provide tools to troubleshoot bad data with granular\npipeline observability. This enables a high-fidelity lineage diagram of your\npipeline to track dependencies and aggregate data quality metrics across all\nyour pipelines.\n\nUnlike other products that force you to deal with streaming and batch workloads\nseparately, DLT supports any type of data workload with a single API so data\nengineers and analysts alike can build cloud-scale data pipelines faster without\nthe need for advanced data engineering skills.\n\n#### Data orchestration\n\nThe lakehouse makes it much easier for businesses to undertake ambitious data\nand machine learning (ML) initiatives. However, orchestrating and managing\nend-to-end production workflows remains a bottleneck for most organizations,\nrelying on external tools or cloud-specific solutions that are not part of their\nlakehouse platform. Tools that decouple task orchestration from the underlying\ndata processing platform reduce the overall reliability of their production\nworkloads, limit observability, and increase complexity for end users.\n\n#### What is Databricks Workflows?\n\n[Databricks Workflows](https://databricks.com/product/workflows) is the first fully managed and integrated lakehouse\n[orchestration](https://databricks.com/glossary/orchestration) service that allows data teams to build reliable workflows on\nany cloud.\n\n\nWorkflows lets you orchestrate data flow pipelines (written in DLT or dbt),\nas well as machine learning pipelines, or any other tasks such as notebooks\nor Python wheels. Since Databricks Workflows is fully managed, it eliminates\noperational overhead for data engineers, enabling them to focus on your\nworkflows not on managing your infrastructure. 
It provides an easy point-and-click\nauthoring experience for all your data teams, not just those with specialized skills.\nDeep integration with the underlying lakehouse platform ensures you will create\nand run reliable production workloads on any cloud while providing deep and\ncentralized monitoring with simplicity for end users.\n\nSharing job clusters over multiple tasks reduces the time a job takes, reduces\ncosts by eliminating overhead and increases cluster utilization with parallel tasks.\n\n\n-----\n\nDatabricks Workflows’ deep integration with the lakehouse can best be seen with its monitoring and observability features. The matrix view in the following graphic\nshows a history of runs for a job. Failed tasks are marked in red. A failed job can be repaired and rerun with the click of a button. Rerunning a failed task detects and\ntriggers the execution of all dependent tasks.\n\nYou can create workflows with the UI, but also through the Databricks Workflows API, or with external orchestrators such as Apache Airflow. Even if you are using an\n\nexternal orchestrator, Databricks Workflows’ monitoring acts as a single pane of glass that includes externally triggered workflows.\n\n\n-----\n\n#### Orchestrate anything\n\nRemember that DLT is one of many task types for Databricks Workflows.\nThis is where the managed data flow pipelines with DLT tie together with\nthe easy point-and-click authoring experience of Databricks Workflows.\n\nIn the following example, you can see an end-to-end workflow built with\ncustomers in a workshop: Data is streamed from Twitter according to search\nterms, then ingested with Auto Loader using automatic schema detection and\nenforcement. In the next step, the data is cleaned and transformed with Delta\nLive table pipelines written in SQL, and finally run through a pre-trained BERT\nlanguage model from Hugging Face for sentiment analysis of the tweets.\nDifferent task types for ingest, cleanse/transform and ML are combined\nin a single workflow.\n\nUsing Workflows, these tasks can be scheduled to provide a daily overview of\nsocial media coverage and customer sentiment for a business. After streaming\ntweets with filtering for keywords such as “data engineering,” “lakehouse” and\n“Delta Lake,” we curated a list of those tweets that were classified as positive\nwith the highest probability score.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e516a51c629ef5e3497646d513213e89", + "Remember that DLT is one of many task types for Databricks Workflows.\nThis is where the managed data flow pipelines with DLT tie together with\nthe easy point-and-click authoring experience of Databricks Workflows.\n\nIn the following example, you can see an end-to-end workflow built with\ncustomers in a workshop: Data is streamed from Twitter according to search\nterms, then ingested with Auto Loader using automatic schema detection and\nenforcement. In the next step, the data is cleaned and transformed with Delta\nLive table pipelines written in SQL, and finally run through a pre-trained BERT\nlanguage model from Hugging Face for sentiment analysis of the tweets.\nDifferent task types for ingest, cleanse/transform and ML are combined\nin a single workflow.\n\nUsing Workflows, these tasks can be scheduled to provide a daily overview of\nsocial media coverage and customer sentiment for a business. 
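The API route mentioned above can be sketched against the Jobs 2.1 REST endpoint; the workspace URL, token, notebook paths and cluster settings below are placeholders, not values from this repository.

```python
import requests

host = "https://<your-workspace>.cloud.databricks.com"   # placeholder workspace URL
token = "<personal-access-token>"                        # placeholder credential

job_spec = {
    "name": "ingest-and-transform",
    "tasks": [
        {
            "task_key": "ingest",
            "notebook_task": {"notebook_path": "/Repos/me/ingest_notebook"},
            "job_cluster_key": "shared_cluster",
        },
        {
            "task_key": "transform",
            "depends_on": [{"task_key": "ingest"}],        # simple DAG: transform runs after ingest
            "notebook_task": {"notebook_path": "/Repos/me/transform_notebook"},
            "job_cluster_key": "shared_cluster",           # one job cluster shared across tasks
        },
    ],
    "job_clusters": [
        {
            "job_cluster_key": "shared_cluster",
            "new_cluster": {
                "spark_version": "14.3.x-scala2.12",       # placeholder runtime
                "node_type_id": "i3.xlarge",               # placeholder node type
                "num_workers": 2,
            },
        }
    ],
}

resp = requests.post(
    f"{host}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {token}"},
    json=job_spec,
)
print(resp.json())  # returns the new job_id on success
```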
After streaming\ntweets with filtering for keywords such as “data engineering,” “lakehouse” and\n“Delta Lake,” we curated a list of those tweets that were classified as positive\nwith the highest probability score.\n\n**Learn more**\n\n\n[Data Engineering on the](https://databricks.com/solutions/data-pipelines)\n[Lakehouse](https://databricks.com/solutions/data-pipelines)\n\n\n[Delta Live Tables](https://databricks.com/product/delta-live-tables)\n\n[Databricks Workflows](https://www.databricks.com/product/workflows)\n\n\n[Big Book of Data Engineering](https://databricks.com/p/ebook/the-big-book-of-data-engineering?itm_data=datapipelines-promo-bigbookofde)\n\n\n-----\n\n**CHAPTER**\n\n### Data streaming\n# 09\n\n\n**CHAPTER**\n\n\nThere are two types of data processing: batch processing\nand streaming processing.\n\n\nBatch processing refers to the discontinuous, periodic processing\nof data that has been stored for a period of time. For example,\nan organization may need to run weekly reports on a set of\npredictable transaction data. There is no need for this data\nto be streaming — it can be processed on a weekly basis.\n\nStreaming processing, on the other hand, refers to unbounded\nprocessing of data as it arrives.\n\n\n-----\n\n**Data Streaming Challenges**\n\nHowever, getting value from streaming data can be a tricky practice. While most\ndata today can be considered streaming data, organizations are overwhelmed by\nthe need to access, process and analyze the volume, speed and variety of this\ndata moving through their platforms. To keep pace with innovation, they must\nquickly make sense of data streams decisively, consistently and in real time.\n\nThree common technical challenges organizations experience\nwith implementing real-time data streaming include:\n\n**•** **Specialized APIs and language skills:** Data practitioners encounter\nbarriers to adopting streaming skillsets because there are new languages,\nAPIs and tools to learn.\n\n**•** **Operational complexity:** To implement data streaming at scale, data\nteams need to integrate and manage streaming-specific tools with\ntheir other cloud services. They also have to manually build complex\noperational tooling to help these systems recover from failure, restart\nworkloads without reprocessing data, optimize performance, scale the\nunderlying infrastructure, and so on.\n\n**•** **Incompatible governance models:** Different governance and security\nmodels across real-time and historical data platforms makes it difficult\nto provide the right access to the right users, see the end-to-end data\nlineage, and/or meet compliance requirements.\n\n\nIn a wide variety of cases, an organization might find it useful to\nleverage streaming data. 
Here are some common examples:\n\n**•** **Retail:** Real-time inventory updates help support business activities, such\nas inventory and pricing optimization and optimization of the supply chain,\nlogistics and just-in-time delivery.\n\n**•** **Smart energy:** Smart meter monitoring in real time allows for smart\nelectricity pricing models and connection with renewable energy sources\nto optimize power generation and distribution.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e45a2f68c84e05c7b3e9cef52f4fbd67", + "**•** **Incompatible governance models:** Different governance and security\nmodels across real-time and historical data platforms makes it difficult\nto provide the right access to the right users, see the end-to-end data\nlineage, and/or meet compliance requirements.\n\n\nIn a wide variety of cases, an organization might find it useful to\nleverage streaming data. Here are some common examples:\n\n**•** **Retail:** Real-time inventory updates help support business activities, such\nas inventory and pricing optimization and optimization of the supply chain,\nlogistics and just-in-time delivery.\n\n**•** **Smart energy:** Smart meter monitoring in real time allows for smart\nelectricity pricing models and connection with renewable energy sources\nto optimize power generation and distribution.\n\n**•** **Preventative maintenance:** By reducing unplanned outages and\nunnecessary site and maintenance visits, real-time streaming analytics can\nlower operational and equipment costs.\n\n**•** **Industrial automation:** Manufacturers can use streaming and predictive\nanalytics to improve production processes and product quality, including\nsetting up automated alerts.\n\n**•** **Healthcare:** To optimize care recommendations, real-time data allows\nfor the integration of various smart sensors to monitor patient condition,\nmedication levels and even recovery speed.\n\n**•** **Financial institutions:** Firms can conduct real-time analysis of\n\ntransactions to detect fraudulent transactions and send alerts. They\ncan use fraud analytics to identify patterns and feed data into machine\nlearning algorithms.\n\n\nRegardless of specific use cases, the central tenet of streaming data is that it\ngives organizations the opportunity to leverage the freshest possible insights for\nbetter decision-making and more optimized customer experiences.\n\n\n-----\n\n**Data streaming architecture**\n\nBefore addressing these challenges head-on, it may help to take a step back and\ndiscuss the ingredients of a streaming data pipeline. Then, we will explain how\nthe Databricks Lakehouse Platform operates within this context to address the\naforementioned challenges.\n\nEvery application of streaming data requires a pipeline that brings the data from\nits origin point — whether sensors, IoT devices or database transactions — to its\nfinal destination.\n\nIn building this pipeline, streaming architectures typically employ two layers.\nFirst, streaming capture systems **capture** and temporarily store streaming data\nfor processing. Sometimes these systems are also called messaging systems\nor messaging buses. These systems are optimized for small payloads and high\nfrequency inputs/outputs. 
Second, streaming **processing** systems continuously\nprocess data from streaming capture systems and other storage systems.\n\n**Capturing** **Processing**\n\n\nIt may help to think of a simplified streaming pipeline\naccording to the following seven phases:\n\n1. Data is continuously generated at origin points\n\n2. The generated data is captured from those origin points by\na capture system like Apache Kafka (with limited retention)\n\n**3. The captured data is extracted and incrementally ingested to**\n**a processing platform like Databricks; data is ingested exactly**\n**once and stored permanently, even if this step is rerun**\n\n**4. The ingested data is converted into a workable format**\n\n**5. The formatted data is cleansed, transformed and joined in**\n**a number of pipeline steps**\n\n**6. The transformed data is processed downstream through**\n**analysis or ML modeling**\n\n7. The resulting analysis or model is used for some sort of practical\napplication, which may be anything from basic reporting to an\nevent-driven software application\n\nYou will notice four of the steps in this list are in boldface. This is because the\nlakehouse architecture is specifically designed to optimize this part of the\npipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\nanalyze and model on streaming data _alongside_ batch-processed data. It can\naccommodate both structured _and_ unstructured data. It is here that the value\nof unifying the best pieces of data lakes and data warehouses really shines for\ncomplex enterprise use cases.\n\n\n-----\n\n**Data Streaming on the Lakehouse**\n\nNow let’s zoom in a bit and see how the Databricks Lakehouse\nPlatform addresses each part of the pipeline mentioned above.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "31aaed4d9f4d9e1c44aaeefd580abfa6", + "**5. The formatted data is cleansed, transformed and joined in**\n**a number of pipeline steps**\n\n**6. The transformed data is processed downstream through**\n**analysis or ML modeling**\n\n7. The resulting analysis or model is used for some sort of practical\napplication, which may be anything from basic reporting to an\nevent-driven software application\n\nYou will notice four of the steps in this list are in boldface. This is because the\nlakehouse architecture is specifically designed to optimize this part of the\npipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\nanalyze and model on streaming data _alongside_ batch-processed data. It can\naccommodate both structured _and_ unstructured data. It is here that the value\nof unifying the best pieces of data lakes and data warehouses really shines for\ncomplex enterprise use cases.\n\n\n-----\n\n**Data Streaming on the Lakehouse**\n\nNow let’s zoom in a bit and see how the Databricks Lakehouse\nPlatform addresses each part of the pipeline mentioned above.\n\n**Streaming data ingestion and transformation** begins with continuously\nand incrementally collecting raw data from streaming sources through a\nfeature called Auto Loader. Once the data is ingested, it can be transformed\nfrom raw, messy data into clean, fresh, reliable data appropriate for downstream\nanalytics, ML or applications. 
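A minimal sketch of such an incremental bronze-to-silver transformation with plain Structured Streaming follows; the table names, columns and checkpoint path are illustrative, and the checkpoint is what provides the exactly-once behavior called out in the pipeline phases above.

```python
from pyspark.sql.functions import col, to_timestamp

# Incrementally read new records from a bronze Delta table and append them to silver.
# The checkpoint records progress, so each record is processed exactly once even if
# the job is retried or rerun on a schedule.
(
    spark.readStream.table("my_catalog.my_schema.bronze_docs")        # illustrative source table
    .where(col("status") == "SUCCESS")
    .withColumn("ingested_at", to_timestamp(col("last_modified")))    # illustrative columns
    .writeStream
    .option("checkpointLocation", "/Volumes/my_catalog/my_schema/_checkpoints/silver")
    .trigger(availableNow=True)        # batch-style scheduling of a streaming pipeline
    .toTable("my_catalog.my_schema.silver_docs")
)
```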
[Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) makes it easy to build and\nmanage these data pipelines while automatically taking care of infrastructure\nmanagement and scaling, data quality, error testing and other administrative\ntasks. DLT is a high-level abstraction built on Spark Structured Streaming,\na scalable and fault-tolerant stream processing engine.\n\n**[Real-time analytics](https://www.databricks.com/product/databricks-sql)** refers to the downstream analytical application\nof streaming data. With fresher data streaming into SQL analytics or BI\nreporting, more actionable insights can be achieved, resulting in better\nbusiness outcomes.\n\n**[Real-time ML](https://www.databricks.com/product/machine-learning)** involves deploying ML models in a streaming mode. This\ndeployment is supported with structured streaming for continuous inference\nfrom a live data stream. Like real-time analytics, real-time ML is a downstream\nimpact of streaming data, but for different business use cases (i.e., AI instead\nof BI). Real-time modeling has many benefits, including more accurate\npredictions about the future.\n\n\n**Real-time applications** process data directly from streaming pipelines and\ntrigger programmatic actions, such as displaying a relevant ad, updating the\nprice on a pricing page, stopping a fraudulent transaction, etc. There typically\nis no human-in-the-loop for such applications.\n\n\nData in cloud storage and message stores\n\n\n-----\n\n**Databricks Lakehouse Platform differentiators**\n\nUnderstanding what the lakehouse architecture provides is one\n\nthing, but it is useful to understand how Databricks uniquely\n\napproaches the common challenges mentioned earlier around\n\nworking with streaming data.\n\n**Databricks empowers unified data teams.** Data engineers, data scientists\nand analysts can easily build streaming data workloads with the languages\nand tools they already know and the APIs they already use.\n\n**Databricks simplifies development and operations.** Organizations can\nfocus on getting value from data by reducing complexity and automating\nmuch of the production aspects associated with building and maintaining\nreal-time data workloads.\n\n\nSee why customers love streaming on the Databricks\nLakehouse Platform with these resources.\n\n**Learn more**\n\n[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n\n[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "2df0852583ef94bc403945ac9a1e859d", + "**Databricks simplifies development and operations.** Organizations can\nfocus on getting value from data by reducing complexity and automating\nmuch of the production aspects associated with building and maintaining\nreal-time data workloads.\n\n\nSee why customers love streaming on the Databricks\nLakehouse Platform with these resources.\n\n**Learn more**\n\n[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n\n[Project Lightspeed: Faster and Simpler Stream 
Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n\n[Structured Streaming Documentation](https://docs.databricks.com/spark/latest/structured-streaming/index.html)\n\n[Streaming — Getting Started With Apache Spark on Databricks](https://databricks.com/spark/getting-started-with-apache-spark/streaming)\n\n\n**Databricks is one platform for streaming and batch data.** Organizations\ncan eliminate data silos, centralize security and governance models, and\nprovide complete support for all their real-time use cases under one roof —\nthe roof of the lakehouse.\n\nFinally — and perhaps most important — Delta Lake, the core of the [Databricks](https://www.databricks.com/product/data-lakehouse)\n\n[Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , was built for streaming from the ground up. Delta Lake is\ndeeply integrated with Spark Structured Streaming and overcomes many of\nthe limitations typically associated with streaming systems and files.\n\nIn summary, the Databricks Lakehouse Platform dramatically simplifies data\nstreaming to deliver real-time analytics, machine learning and applications on\none platform. And, that platform is built on a foundation with streaming at its\ncore. This means organizations of all sizes can use their data in motion and\nmake more informed decisions faster than ever.\n\n\n-----\n\n**CHAPTER**\n\n### Data science and machine learning\n# 10\n\n\n**CHAPTER**\n\n\nWhile most companies are aware of the potential benefits of applying\nmachine learning and AI, realizing these potentials can often be quite\nchallenging for those brave enough to take the leap. Some of the\nlargest hurdles come from siloed/disparate data systems, complex\nexperimentation environments, and getting models served in a\nproduction setting.\n\n\nFortunately, the Databricks Lakehouse Platform provides a helping\nhand and lets you use data to derive innovative insights, build\npowerful predictive models, and enable data scientists, ML engineers,\nand developers of all kinds to create within the space of machine\nlearning and AI.\n\n\n-----\n\n#### Databricks Machine Learning\n\n\n-----\n\n#### Exploratory data analysis\n\nWith all the data in one place, data is easily\nexplored and visualized from within the\nnotebook-style experience that provides support\nfor various languages (R, SQL, Python and Scala)\nas well as built-in visualizations and dashboards.\nConfidently and securely share code with\nco-authoring, commenting, automatic versioning,\nGit integrations and role-based access controls.\nThe platform provides laptop-like simplicity at\nproduction-ready scale.\n\n\n-----\n\n#### Model creation and management\n\nFrom data ingestion to model training and tuning, all the way through to\nproduction model serving and versioning, the Lakehouse brings the tools\nneeded to simplify those tasks.\n\nGet right into experimenting with the Databricks ML runtimes, optimized and\npreconfigured to include most popular libraries like scikit-learn, XGBoost and\nmore. 
Massively scale thanks to built-in support for distributed training and\nhardware acceleration with GPUs.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3b9e992190e770ecb1aa8269e5d05d96", + "#### Databricks Machine Learning\n\n\n-----\n\n#### Exploratory data analysis\n\nWith all the data in one place, data is easily\nexplored and visualized from within the\nnotebook-style experience that provides support\nfor various languages (R, SQL, Python and Scala)\nas well as built-in visualizations and dashboards.\nConfidently and securely share code with\nco-authoring, commenting, automatic versioning,\nGit integrations and role-based access controls.\nThe platform provides laptop-like simplicity at\nproduction-ready scale.\n\n\n-----\n\n#### Model creation and management\n\nFrom data ingestion to model training and tuning, all the way through to\nproduction model serving and versioning, the Lakehouse brings the tools\nneeded to simplify those tasks.\n\nGet right into experimenting with the Databricks ML runtimes, optimized and\npreconfigured to include most popular libraries like scikit-learn, XGBoost and\nmore. Massively scale thanks to built-in support for distributed training and\nhardware acceleration with GPUs.\n\nFrom within the runtimes, you can track model training sessions, package and\nreuse models easily with [MLflow](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) , an open source machine learning platform\ncreated by Databricks and included as a managed service within the Lakehouse.\nIt provides a centralized location from which to manage models and package\ncode in an easily reusable way.\n\nTraining these models often involves the use of features housed in a centralized\nfeature store. Fortunately, Databricks has a built-in feature store that allows you\nto create new features, explore and re-use existing features, select features for\ntraining and scoring machine learning models, and publish features to low-latency\nonline stores for real-time inference.\n\nIf you are looking to get a head start, [AutoML](https://databricks.com/blog/2022/04/18/supercharge-your-machine-learning-projects-with-databricks-automl-now-generally-available.html) allows for low to no-code\nexperimentation by pointing to your data set and automatically training models\nand tuning hyperparameters to save both novice and advanced users precious\ntime in the machine learning process.\n\n\nAutoML will also report back metrics related to the model training results as well\nas the code needed to repeat the training already custom-tailored to your data\nset. 
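Both hand-written training code and the notebooks AutoML generates log to MLflow; a minimal tracking sketch is shown below, where the dataset and model are illustrative and MLflow is assumed to be available as it is in the Databricks ML runtimes.

```python
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = load_iris(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run(run_name="rf-baseline"):
    model = RandomForestClassifier(n_estimators=100, random_state=42)
    model.fit(X_train, y_train)
    acc = accuracy_score(y_test, model.predict(X_test))

    mlflow.log_param("n_estimators", 100)      # hyperparameter for this run
    mlflow.log_metric("accuracy", acc)         # evaluation metric tracked per run
    mlflow.sklearn.log_model(model, "model")   # package the model for later registration or serving
```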
This glass box approach ensures that you are never trapped or suffer from\nvendor lock-in.\n\nIn that regard, the Lakehouse supports the industry’s widest range of data tools,\ndevelopment environments, and a thriving ISV ecosystem so you can make your\nworkspace your own and put out your best work.\n\n##### Compute platform\n\n**Any ML workload optimized and accelerated**\n\n**Databricks Machine Learning Runtime**\n\n- Optimized and preconfigured ML frameworks\n\n- Turnkey distribution ML\n\n- Built-in AutoML\n\n- GPU support out of the box\n\n\nBuilt-in **ML frameworks**\nand **model explainability**\n\nBuilt-in support for **AutoML**\nand **hyperparameter tuning**\n\n\nBuilt-in support for\n**distributed training**\n\nBuilt-in support for\n**hardware accelerators**\n\n\n-----\n\n#### Deploy your models to production\n\nExploring and creating your machine learning models\ntypically represents only part of the task. Once the\nmodels exist and perform well, they must become\npart of a pipeline that keeps models updated,\nmonitored and available for use by others.\n\n**Webhooks** allow registering of\n\n\nDatabricks can help here by providing a world-class\nexperience for model versioning, monitoring and\nserving within the same platform that you can use\nto generate the models themselves. This means you\ncan make all your ML pipelines in the same place,\nmonitor them for drift, retrain them with new data,\nand promote and serve them easily and at scale.\n\nThroughout the ML lifecycle, rest assured knowing\nthat lineage and governance are being tracked the\nentire way. This means regulatory compliance and\nsecurity woes are significantly reduced, potentially\nsaving costly issues down the road.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "4fd897c16496ffce91caab2405bf7076", + "Built-in support for\n**distributed training**\n\nBuilt-in support for\n**hardware accelerators**\n\n\n-----\n\n#### Deploy your models to production\n\nExploring and creating your machine learning models\ntypically represents only part of the task. Once the\nmodels exist and perform well, they must become\npart of a pipeline that keeps models updated,\nmonitored and available for use by others.\n\n**Webhooks** allow registering of\n\n\nDatabricks can help here by providing a world-class\nexperience for model versioning, monitoring and\nserving within the same platform that you can use\nto generate the models themselves. This means you\ncan make all your ML pipelines in the same place,\nmonitor them for drift, retrain them with new data,\nand promote and serve them easily and at scale.\n\nThroughout the ML lifecycle, rest assured knowing\nthat lineage and governance are being tracked the\nentire way. 
This means regulatory compliance and\nsecurity woes are significantly reduced, potentially\nsaving costly issues down the road.\n\n\ncallbacks on events like stage\n\ntransitions to integrate with CI/CD\n\nautomation.\n\n**Tags** allow storing deployment\n\n— specific metadata with model\n\nversions, e.g., whether the\n\ndeployment was successful.\n\n\n**Model lifecycle management**\n\nStaging Production Archived\n\n\nLogged\nmodel\n\n**Comments** allow communication\n\nand collaboration between\n\nteammates when reviewing\n\nmodel versions.\n\n\n-----\n\n**Learn more**\n\n[Databricks Machine Learning](https://databricks.com/product/machine-learning)\n\n[Databricks Data Science](https://databricks.com/product/data-science)\n\n[Databricks ML Runtime Documentation](https://docs.databricks.com/runtime/mlruntime.html)\n\n\n-----\n\n**CHAPTER**\n\n# 11\n\n\n### Databricks Technology Partners and the modern data stack\n\nDatabricks Technology Partners integrate their solutions with Databricks to\nprovide complementary capabilities for ETL, data ingestion, business intelligence,\nmachine learning and governance. These integrations allow customers to leverage\nthe Databricks Lakehouse Platform’s reliability and scalability to innovate faster\nwhile deriving valuable data insights. Use preferred analytical tools with optimized\nconnectors for fast performance, low latency and high user concurrency to your\ndata lake.\n\n\n-----\n\nWith [Partner Connect](https://databricks.com/partnerconnect) , you can bring together all your data, analytics and AI tools on one open platform. Databricks provides a fast and easy way to connect your existing\ntools to your lakehouse using validated integrations and helps you discover and try new solutions.\n\n**Databricks thrives within your modern data stack**\n\n**BI and dashboards** **Machine learning** **Data science**\n\n\n**Data governance**\n\n**Data pipelines**\n\n**Data ingestion**\n\n\nData Data Data\nwarehousing engineering streaming\n\n**Unity Catalog**\n\n\nData science\nand ML\n\n\n**Consulting**\n**and SI partners**\n\n\n**Delta Lake**\n\n**Cloud Data Lake**\n\n**Learn more**\n\n\n[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n\n[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n\n\n[Partner Connect](https://databricks.com/partnerconnect)\n\n[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n\n\n-----\n\n**CHAPTER**\n\n### Get started with the Databricks Lakehouse Platform\n# 12\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1aae0175847f695b2f674a019d22936e", + "Data Data Data\nwarehousing engineering streaming\n\n**Unity Catalog**\n\n\nData science\nand ML\n\n\n**Consulting**\n**and SI partners**\n\n\n**Delta Lake**\n\n**Cloud Data Lake**\n\n**Learn more**\n\n\n[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n\n[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n\n\n[Partner Connect](https://databricks.com/partnerconnect)\n\n[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n\n\n-----\n\n**CHAPTER**\n\n### Get started with the Databricks Lakehouse Platform\n# 12\n\n\n-----\n\n#### Databricks Trial\n\nGet a collaborative environment for data teams 
to build solutions together with interactive\nnotebooks to use Apache Spark TM , SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras,\nscikit-learn and more.\n\n**•** Available as a 14-day full trial in your own cloud or as a lightweight trial\nhosted by Databricks\n\n**[Try Databricks for free](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n\n\n**[Databricks documentation](https://databricks.com/documentation)**\n\nGet detailed documentation to get started with\nthe Databricks Lakehouse Platform on your cloud\nof choice: Databricks on AWS, Azure Databricks\nand [Databricks on Google Cloud](https://docs.gcp.databricks.com/?_gl=1*16ovt38*_gcl_aw*R0NMLjE2NTI1NDYxNjIuQ2owS0NRandwdjJUQmhEb0FSSXNBTEJuVm5saU9ydGpfX21uT1U5NU5iRThSbmI5a3o2OGdDNUY0UTRzYThtTGhVZHZVb0NhTkRBMmlWc2FBcEN6RUFMd193Y0I.&_ga=2.135042808.863708747.1652113196-1440404449.1635787641&_gac=1.225252968.1652546163.Cj0KCQjwpv2TBhDoARIsALBnVnliOrtj__mnOU95NbE8Rnb9kz68gC5F4Q4sa8mLhUdvUoCaNDA2iVsaApCzEALw_wcB) .\n\n**[Databricks Demo Hub](https://databricks.com/discover/demos)**\n\nGet a firsthand look at Databricks from the\npractitioner’s perspective with these simple\non-demand videos. Each demo is paired with\nrelated materials — including notebooks, videos\nand eBooks — so that you can try it out for\nyourself on Databricks.\n\n\n**[Databricks Academy](https://databricks.com/learn/training/home)**\n\nWhether you are new to the data lake or building on\nan existing skill set, you can find a curriculum tailored\nto your role or interest. With training and certification\nthrough Databricks Academy, you will learn to master\nthe Databricks Lakehouse Platform for all your big\ndata analytics projects.\n\n**[Databricks Community](https://community.databricks.com/)**\n\n\n**[Databricks Labs](https://databricks.com/learn/labs)**\n\nDatabricks Labs are projects created by the\nfield to help customers get their use cases\ninto production faster.\n\n**[Databricks customers](https://databricks.com/customers)**\n\nDiscover how innovative companies across\nevery industry are leveraging the Databricks\nLakehouse Platform.\n\n\nGet answers, network with peers and solve\nthe world’s toughest problems, together.\n\n\n-----\n\n#### About Databricks", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "c5ae327904492731d7fd76d12d66efb9", + "Whether you are new to the data lake or building on\nan existing skill set, you can find a curriculum tailored\nto your role or interest. With training and certification\nthrough Databricks Academy, you will learn to master\nthe Databricks Lakehouse Platform for all your big\ndata analytics projects.\n\n**[Databricks Community](https://community.databricks.com/)**\n\n\n**[Databricks Labs](https://databricks.com/learn/labs)**\n\nDatabricks Labs are projects created by the\nfield to help customers get their use cases\ninto production faster.\n\n**[Databricks customers](https://databricks.com/customers)**\n\nDiscover how innovative companies across\nevery industry are leveraging the Databricks\nLakehouse Platform.\n\n\nGet answers, network with peers and solve\nthe world’s toughest problems, together.\n\n\n-----\n\n#### About Databricks\n\nDatabricks is the data and AI company. 
More than 7,000\norganizations worldwide — including Comcast, Condé Nast,\nH&M and over 40% of the Fortune 500 — rely on the Databricks\nLakehouse Platform to unify their data, analytics and AI. Databricks\nis headquartered in San Francisco, with offices around the globe.\nFounded by the original creators of Apache Spark™, Delta Lake\nand MLflow, Databricks is on a mission to help data teams solve the\nworld’s toughest problems. To learn more, follow Databricks on\n[Twitter](https://twitter.com/databricks) **,** [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n© Databricks 2022. All rights reserved. Apache, Apache Spark, Spark and the Spark\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9f9c35c2d6e7c59e06e3fec911a0e217", + "#### eBook\n\n# Big Book of Retail\n & Consumer Goods Use Cases\n\n##### Driving real-time decisions\n with the Lakehouse\n\n\n-----\n\n### Contents (1/2) C H A P T E R 1 : \u0007 Introduction 4\n\n**C H A P T E R 2 :** \u0007 **Modern Data Platform for Real-Time Retail** 6\n\nCommon challenges 6\n\nThe Lakehouse for Retail 8\n\n**C H A P T E R 3 :** **Use Case: Real-Time Supply Chain Data** \u0007 12\n\nCase Study: Gousto 14\n\nCase Study: ButcherBox 14\n\n**C H A P T E R 4 :** \u0007 **Use Case: Truck Monitoring** 15\n\nCase Study: Embark 16\n\n**C H A P T E R 5 :** **Use Case: Inventory Allocation** \u0007 17\n\nCase Study: H&M 19\n\nCase Study: Edmunds 19\n\n**C H A P T E R 6 :** **Use Case: Point of Sale and Clickstream** \u0007 20\n\n**C H A P T E R 7 :** **Use Case: On-Shelf Availability** \u0007 22\n\nCase Study: Reckitt 25\n\n**C H A P T E R 8 :** **�Use Case: Customer and Vehicle Identification** 26\n\n**C H A P T E R 9 :** \u0007 **Use Case: Recommendation Engines** 28\n\nCase Study: Wehkamp 31\n\nCase Study: Columbia 31\n\nCase Study: Pandora 31\n\n**C H A P T E R 1 0 :** \u0007 **Use Case: Perpetual Inventory** 32\n\n**C H A P T E R 1 1 :** \u0007 **Use Case: Automated Replenishments** 34\n\n\n-----\n\n### Contents (2/2) C H A P T E R 1 2 : \u0007 Use Case: Fresh Food Forecasting 36\n\nCase Study: ButcherBox 37\n\nCase Study: Sam’s Club 37\n\n**C H A P T E R 1 3 :** \u0007 **Use Case: Propensity-to-Buy** 38\n\n**C H A P T E R 1 4 :** \u0007 **Use Case: Next Best Action** 41\n\n**C H A P T E R 1 5 :** **Customers That Innovate With Databricks Lakehouse for Retail** \u0007 43\n\n**C H A P T E R 1 6 :** \u0007 **Conclusion** 43\n\n\n-----\n\n**CHAPTER 1:**\n### Introduction\n\n\nRetailers are increasingly being challenged to make time-sensitive decisions in their operations. Consolidating\n\ne-commerce orders. Optimizing distribution to ensure item availability. Routing delivery vehicles. These\n\ndecisions happen thousands of times daily and have a significant financial impact. 
Retailers need real-time data\n\nto support these decisions, but legacy systems are limited to data that’s hours or days old.\n\n**When seconds matter, only the Lakehouse delivers better decisions**\n\nRetail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n\nThe integration of physical and e-commerce customer experiences into an omnichannel journey has been\n\nhappening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n\npurchasing patterns.\n\nIn reaction to these industry changes, retailers have responded with significant, rapid investments — including\n\nstronger personalization, order fulfillment, and delivery and loyalty systems. While these new targeted\n\ncapabilities have addressed the immediate need — and created expectations of making decisions in real\n\ntime — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n\nUnfortunately, most legacy systems are only able to process information in hours or days.\n\nThe delays caused by waiting for data are leading to significant risks and costs for the industry.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "cddfc4c750af70bbe7e43384e73c4ff4", + "to support these decisions, but legacy systems are limited to data that’s hours or days old.\n\n**When seconds matter, only the Lakehouse delivers better decisions**\n\nRetail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n\nThe integration of physical and e-commerce customer experiences into an omnichannel journey has been\n\nhappening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n\npurchasing patterns.\n\nIn reaction to these industry changes, retailers have responded with significant, rapid investments — including\n\nstronger personalization, order fulfillment, and delivery and loyalty systems. While these new targeted\n\ncapabilities have addressed the immediate need — and created expectations of making decisions in real\n\ntime — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n\nUnfortunately, most legacy systems are only able to process information in hours or days.\n\nThe delays caused by waiting for data are leading to significant risks and costs for the industry.\n\n**Grocers** need to consolidate order picking to achieve profitability in e-commerce, but this requires up-to-\n\nthe-minute order data. Not having this information causes them to spend more resources on having people\n\npick orders separately, at a higher operating cost.\n\n**Apparel retailers** must be able to present the correct available inventory on their website. This requires\n\nthat in-store sales be immediately reflected in their online systems. Inaccurate information can lead to lost\n\nsales, or worse, the customer becoming unsatisfied and moving to different retailers.\n\n\n-----\n\n**Convenience fuel retailers** must collaborate with distribution centers, direct-to-store delivery distributors\n\nand other partners. Having delayed data can lead to out-of-stocks, costing stores thousands of dollars per\n\nweek.\n\nThe margin of error in retail has always been razor thin, but with a pandemic and inflationary pressures, it’s at\n\nzero. 
Reducing the error rate requires better predictions and real-time data.\n\n**Use Case Guide**\n\nIn this use case guide, we show how the Databricks Lakehouse for Retail is helping leading organizations take\n\n**all of their data in a single lakehouse architecture, streamline their data engineering and management,**\n\n**make it ready for SQL and ML/AI** , and **do so very fast within their own cloud infrastructure environment**\n\n**based on open source and open standards** . These capabilities are all delivered at world-record-setting\n\nperformance, while achieving a market-leading total cost of ownership.\n\nDatabricks Lakehouse for Retail has become the industry standard for enabling retailers to drive decisions\n\nin real time. This use case guide also highlights common use cases across the industry, and offers additional\n\nresources in the form of Solution Accelerators and reference architectures to help as you embark on your own\n\njourney to drive better customer experiences with data and AI.\n\n\n-----\n\n**CHAPTER 2:**\n### Modern Data Platform\n for Real-Time Retail\n\n\nRetailers continue to adapt to rapidly shifting dynamics across the omnichannel. In navigating these\n\nchanges, retailers are increasingly focused on improving the real-time availability of data and insights, and\n\nperforming advanced analytics delivered within tight business service windows.\n\n**Common challenges**\n\nIn response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n\nin modernizing distribution centers, partnering with delivery companies, and investing in customer\n\nengagement systems.\n\nWarehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n\ndistribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n\nthat became accustomed to having fast, same-day, and sometimes even overnight delivery options\n\nduring the pandemic now expect them as the norm. Retailers understand that the shipping and delivery\n\nexperience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n\n## $41B Market | Retail Warehouse Automation\n\nYet while retailers modernize different areas of their operations, they’re constrained by a single point of", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "9b5141e5ec6f6b347aa972b1c17623d2", + "changes, retailers are increasingly focused on improving the real-time availability of data and insights, and\n\nperforming advanced analytics delivered within tight business service windows.\n\n**Common challenges**\n\nIn response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n\nin modernizing distribution centers, partnering with delivery companies, and investing in customer\n\nengagement systems.\n\nWarehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n\ndistribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n\nthat became accustomed to having fast, same-day, and sometimes even overnight delivery options\n\nduring the pandemic now expect them as the norm. 
Retailers understand that the shipping and delivery\n\nexperience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n\n## $41B Market | Retail Warehouse Automation\n\nYet while retailers modernize different areas of their operations, they’re constrained by a single point of\n\nweakness, as they are reliant on legacy data platforms to bring together all of this data.\n\nPowering real-time decisions in modern retail requires real-time ingestion of data, transformation,\n\ngovernance of information, and powering business intelligence and predictive analytics all within the time\n\nrequired by retail operations.\n\n\n-----\n\n**Ingesting large volumes of transactional data in real time.** The biggest blocker to crucial insights\n\nis the ability to ingest data from transaction systems in real time. Transaction logs from point-of-sale\n\nsystems, clickstreams, mobile applications, advertising and promotions, as well as inventory, logistics\n\nand other systems, are constantly streaming data. Big data sets need to be ingested, cleansed and\n\naggregated and integrated with each other before they can be used. The problem? Retailers have used\n\nlegacy data warehouses that are built around batch processing. And worse, increasing the frequency\n\nof how often data is processed leads to a “hockey stick” in costs. As a result of these limitations,\n\nmerchants resort to ingesting data nightly to deal with the large volumes of data and integration with\n\nother data sets. The result? Accurate data to drive decisions can be delayed by days.\n\n**Performing fine-grained analysis at scale within tight time windows.** Retailers have accepted a\n\ntrade-off when performing analysis. Predictions can be detailed and accurate, or they can be fast.\n\nRunning forecasts or price models at a day, store and SKU level can improve accuracy by 10% or more,\n\nbut doing so requires tens of millions of model calculations that need to be performed in narrow service\n\nwindows. This is well beyond the capability of legacy data platforms. As a result, companies have been\n\nforced to accept the trade-off and live with less accurate predictions.\n\n**\u0007Powering real-time decisions on the front line.** Data is only useful if it drives decisions, but serving\n\nreal-time data to thousands of employees is a daunting task. While data warehouses are capable\n\nof serving reports to large groups of users, they’re still limited to stale data. Most retailers limit the\n\nfrequency of reports to daily or weekly updates and depend on the staff to use their best judgment for\n\ndecisions that are more frequent.\n\n**\u0007Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is\n\nfocused on delivering personalized experiences throughout the omnichannel. Retailers have access to\n\na trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation\n\nstruggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to\n\ndeliver personalized experiences at scale to win in retail.\n\n\n-----\n\n###### The Lakehouse for Retail\n\nDatabricks Lakehouse for Retail solves these core challenges. 
The Lakehouse unlocks the ability to unify\n\nall types of data — from images to structured data — in real time, provide enterprise-class management\n\nand governance, and then immediately turn that data into actionable insights with real-time reporting and", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "19724ab32f8993c136caf2d55947906a", + "frequency of reports to daily or weekly updates and depend on the staff to use their best judgment for\n\ndecisions that are more frequent.\n\n**\u0007Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is\n\nfocused on delivering personalized experiences throughout the omnichannel. Retailers have access to\n\na trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation\n\nstruggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to\n\ndeliver personalized experiences at scale to win in retail.\n\n\n-----\n\n###### The Lakehouse for Retail\n\nDatabricks Lakehouse for Retail solves these core challenges. The Lakehouse unlocks the ability to unify\n\nall types of data — from images to structured data — in real time, provide enterprise-class management\n\nand governance, and then immediately turn that data into actionable insights with real-time reporting and\n\npredictive analytics. It does this with record-setting speed and industry-leading total cost of ownership\n\n(TCO) in a platform-as-a-service (PaaS) that allows customers to solve these pressing problems.\n\n**Any structure** **Reliable, real-time** **Capabilities for** **Data sharing**\n**or frequency** **processing** **any persona** **& collaboration**\n\n_Semi-structured batch_\n\n\n**All of**\n**your sources**\n\nCompetitive activity\n\nE-commerce\n\nMobile Applications\n\nVideo & Images\n\nPoint of Sale\n\nDistribution & Logistics\n\nCustomer & Loyalty\n\nDelivery & Partners\n\n\n_Structured real-time_\n\n_Semi-structured real-time_\n\n_Unstructured batch_\n\n_Semi-structured real-time_\n\n_Structured real-time_\n\n_Structured batch_\n\n\nData Lakehouse\n\nData Management and Governance\n\nProcess, manage and query all of your data\n\n\nAd Hoc Data Science\n\n**Internal Teams**\n\nProduction\nMachine Learning\n\n**Customers**\n\nBI Reporting\n& Dashboarding\n\n**Partners**\n\nReal-time Applications\n\n\nAny Cloud\n\n\n_Structured real-time_\n\n\n-----\n\n**Reference Architecture**\n\nAt the core of the Databricks Lakehouse for Retail is technology that enables retailers to avoid the trade-\n\noffs between speed and accuracy. Technology such as Delta Lake enables the Lakehouse — a new paradigm\n\nthat combines the best elements of data warehouses and data lakes — to directly address these factors by\n\nenabling you to unify all of your data — structured and unstructured, batch and real-time — in one centrally\n\nmanaged and governed location. 
Once in the Lakehouse, e-commerce systems, reporting users, analysts,\n\ndata scientists and data engineers can all leverage this information to serve models for applications and\n\npower real-time reporting, advanced analytics, large-scale forecasting models and more.\n\n**EDGE** **HYBRID** **CLOUD**\n\n\n\nREST Model Serving\n\n|Machine Learning Operations Tracking Registery|RES|\n|---|---|\n||Application|\n\n\n\nReplication\n\n\nAutomatic DBs\n\n|Col1|Real-tim|\n|---|---|\n|||\n\n\nRaw Data\n\n(Bronze Table)\n\n\nClean Data\n\n(Silver Table)\n\n\nRefined Data\n\n(Gold Table)\n\n\nBusiness\nApplications\n\nPower BI\n\n\nBatch\n\n\n-----\n\n###### How it works\n\nThe Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends\n\nsimplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is\n\ndifferentiated capabilities that help retailers win.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "f7507cf1a2132c8afe0151e2ccb104f9", + "**EDGE** **HYBRID** **CLOUD**\n\n\n\nREST Model Serving\n\n|Machine Learning Operations Tracking Registery|RES|\n|---|---|\n||Application|\n\n\n\nReplication\n\n\nAutomatic DBs\n\n|Col1|Real-tim|\n|---|---|\n|||\n\n\nRaw Data\n\n(Bronze Table)\n\n\nClean Data\n\n(Silver Table)\n\n\nRefined Data\n\n(Gold Table)\n\n\nBusiness\nApplications\n\nPower BI\n\n\nBatch\n\n\n-----\n\n###### How it works\n\nThe Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends\n\nsimplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is\n\ndifferentiated capabilities that help retailers win.\n\nRobust data Time-sensitive machine\nData in real time Use all of your data Real-time reporting\nmanagement learning\n\n\n**Limited.** EDWs support the\n\nmanagement of structured\n\ndata.\n\n**No.** Data lakes lack\n\nenterprise-class data\n\nmanagement tools.\n\n**Yes.** Delta and Unity\n\nCatalog offer native\n\ndata management and\n\ngovernance of all data types.\n\n\n**No.** EDWs offer quick access\n\nto reports on old data.\n\n**No.** Data lakes were not\n\ndesigned for reporting, let\n\nalone real-time reporting.\n\n**No.** Data lakes are able to\n\nsupport large analytics,\n\nbut lack the ability to meet\n\nbusiness SLAs.\n\n\n**No.** EDWs must extract data\n\nand send it to a third party\n\nfor machine learning.\n\n**Yes.** Data views can be\n\nmaterialized, enabling front-\n\nline employees with real-\n\ntime data.\n\n**Yes.** The Lakehouse can\n\nscale to process the most\n\ndemanding predictions\n\nwithin business SLAs.\n\n\n**No.** Data warehouses are\n\nbatch oriented, restricting\n\ndata updates to hours or days.\n\n**No.** Data lakes are batch\n\noriented.\n\n**Yes.** Support for real-time\n\nstreaming data.\n\n\n**No.** Data warehouses have\n\nvery limited support for\n\nunstructured data.\n\n**Yes.** Data lakes offer support\n\nfor all types of data.\n\n**Yes.** Supports all types of\n\ndata in a centrally managed\n\nplatform.\n\n\n**LEGACY DATA**\n\n**WAREHOUSE**\n\n\n**LEGACY DATA**\n\n\n**DATA LAKES**\n\n**(HADOOP)**\n\n\n**DATA LAKES**\n\n\n**ROBUST**\n\n**DATA**\n\n\n**ROBUST**\n\n\n-----\n\n**\u0007Data in real time.** Retail operates in real time and so should your data. 
The Lakehouse offers support\n\nfor streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce\n\nand point-of-sale data. And Delta Lake enables this world-record-leading performance while\n\nmaintaining support for ACID transactions.\n\n**\u0007Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images\n\nand a growing variety of other data sources. This data is extremely powerful in helping to improve our\n\nunderstanding of consumer behavior and operations. The Lakehouse for Retail enables companies\n\nto take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse\n\narchitecture.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "3fd69219dd9f62357486b696bdc163f5", + "**DATA LAKES**\n\n**(HADOOP)**\n\n\n**DATA LAKES**\n\n\n**ROBUST**\n\n**DATA**\n\n\n**ROBUST**\n\n\n-----\n\n**\u0007Data in real time.** Retail operates in real time and so should your data. The Lakehouse offers support\n\nfor streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce\n\nand point-of-sale data. And Delta Lake enables this world-record-leading performance while\n\nmaintaining support for ACID transactions.\n\n**\u0007Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images\n\nand a growing variety of other data sources. This data is extremely powerful in helping to improve our\n\nunderstanding of consumer behavior and operations. The Lakehouse for Retail enables companies\n\nto take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse\n\narchitecture.\n\n**\u0007Robust data management and governance** that companies need to protect sensitive data, but\n\nwas lacking from earlier big data systems. The Lakehouse offers transactional integrity with ACID\n\ncompliance, detailed data security, schema enforcement, time travel, data lineage and more. Moving\n\nto a modern data architecture does not require sacrificing enterprise maturity.\n\n**\u0007High-performance predictive analytics.** Machine learning models, such as demand forecasting\n\nor recommendation engines, can be run in hours without compromising accuracy. The Lakehouse\n\ncan scale to support tens of millions of predictions in tight windows, unlocking critical and time-\n\nsensitive analytics such as allocating inventory, optimizing load tenders and logistics, calculating item\n\navailability and out-of-stocks, and delivering highly personalized predictions.\n\n**Value with Databricks**\n\nBy using Databricks to build and support your lakehouse, you can empower your business with even more\n\nspeed, agility and cost savings. The flexibility of the Databricks Lakehouse Platform means that you can\n\nstart with the use case that will have the most impact on your business. As you implement the pattern, you\n\nwill find that you’re able to tackle use cases quicker and more easily than before. 
To get you started, this\n\nguidebook contains the use cases we most commonly see across the Retail and Consumer Goods industry.\n\n\n-----\n\n**CHAPTER 3**\n### Use Case:\n Real-Time Supply\n Chain Data\n\n\n**Overview**\n\nAs companies see a surge in demand from e-commerce and delivery services, and seek increasing\n\nefficiencies with plant or distribution centers, real-time data is becoming a key part of the technical\n\nroadmap. Real-time supply chain data allows customers to deal with problems as they happen and before\n\nitems are sent downstream or shipped to consumers, which is the first step in enabling a supply chain\n\ncontrol tower.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nManufacturers Distributors Logistics Restaurants\n\n\n**Challenges**\n\n**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is\n\nhappening and when a customer can act on it\n\n**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies\n\nhave the added pressure to take immediate action on it\n\n**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain\n\n\n-----\n\n**Value with the Databricks Lakehouse**\n\nDatabricks has enabled real-time streaming of supply chain data across a variety of customers for specific\n\nplant operations or as part of a supply chain control tower.\n\n**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000%\n\nimprovement in speed to data, with greater reliability\n\n**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers report", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "581545a4d760d03962437f89de737436", + "**Challenges**\n\n**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is\n\nhappening and when a customer can act on it\n\n**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies\n\nhave the added pressure to take immediate action on it\n\n**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain\n\n\n-----\n\n**Value with the Databricks Lakehouse**\n\nDatabricks has enabled real-time streaming of supply chain data across a variety of customers for specific\n\nplant operations or as part of a supply chain control tower.\n\n**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000%\n\nimprovement in speed to data, with greater reliability\n\n**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers report\n\nthat they were able to move from batch to real-time at neutral costs\n\n**�Simplified architecture and maintenance** — leveraging Delta for ingestion streamlines the pattern for\n\nreal-time data ingestions. 
Customers frequently report that the amount of code required to support\n\nstreaming ingestion is 50% less than previous solutions.\n\n**\u0007Immediate enablement of additional use cases** — customers can now prevent problems as they’re\n\nhappening, predict and prevent issues, and even gain days on major changes such as production\n\nschedules between shifts\n\n**Solution overview**\n\nDatabricks allows for both streaming and batch data sets to be ingested and made available to enable\n\nreal-time supply chain use cases. Delta Lake simplifies the change data capture process while providing\n\nACID transactions and scalable metadata handling, and unifying streaming and batch data processing. And\n\nDelta Lake supports versioning and enables rollbacks, full historical audit trails, and reproducible machine\n\nlearning experiments.\n\n**Typical use case data sources include:**\n\nSupply planning, procurement, manufacturing execution, warehousing, order fulfillment, shop floor/historian\n\ndata, IoT sensor, transportation management\n\n\n-----\n\n**CASE STUDY**\n\nWith Databricks, Gousto was able to implement real-time visibility in their supply chain. Gousto moved from\n\ndaily batch updates to near real-time streaming data, utilizing Auto Loader and Delta Lake. The platform\n\nprovided by Databricks has allowed Gousto to respond to increased demand during the coronavirus\n\noutbreak by providing real-time insight into performance on the factory picking lines.\n\n**CASE STUDY**\n\nAs a young e-commerce company, ButcherBox needed to act nimbly to make the most of the data from its\n\nhundreds of thousands of subscribers. With Databricks Lakehouse, the company could pull 18 billion rows of\n\ndata in under three minutes.\n\nNow, ButcherBox has a near real-time understanding of its customers, and can also act proactively to\n\naddress any logistical and delivery issues.\n\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n-----\n\n**CHAPTER 4**\n### Use Case: Truck Monitoring\n\n\nWith many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n\nof trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n\nmanner. Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n\ndelays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics\n\n\n**Challenges**\n\n**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n\n\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issues", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "7fd334d58220f80f674987019149dba4", + "address any logistical and delivery issues.\n\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n-----\n\n**CHAPTER 4**\n### Use Case: Truck Monitoring\n\n\nWith many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n\nof trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n\nmanner. 
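The streaming ingestion pattern described in the supply chain solution overview above, and used by Gousto with Auto Loader and Delta Lake, can be sketched in a few lines of PySpark. This is an illustrative example only, assuming the ambient Databricks `spark` session; the landing path, file format, checkpoint location and target table are placeholders.

```python
# Hypothetical sketch: incrementally ingest supply chain feed files from cloud storage
# into a Bronze Delta table with Databricks Auto Loader (the cloudFiles source).
raw_shipments = (
    spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/supply_chain/landing/_schemas/shipments")
    .load("/Volumes/supply_chain/landing/shipments/")
)

(
    raw_shipments.writeStream
    .option("checkpointLocation", "/Volumes/supply_chain/landing/_checkpoints/shipments")
    .trigger(availableNow=True)   # or processingTime="1 minute" for always-on micro-batches
    .toTable("supply_chain.shipments_bronze")
)
```

The same pattern applies to the truck telemetry and maintenance feeds discussed in this chapter; only the source path and schema change.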
Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n\ndelays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics\n\n\n**Challenges**\n\n**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n\n\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issues\n\n\u0007Not having effective automation and AI increases the risk of human error, which can result in vehicular\n\naccidents or shipment delays\n\n\n-----\n\n**Value with the Databricks Lakehouse**\n\nDatabricks empowers companies to get real-time insights into their fleet performance, from manufacturing\n\nto delivery.\n\n**Near real-time insights** — the greater speed to data means a quicker response to issues and the\n\nability to monitor driver safety more immediately\n\n**Ability to scale** — although consumer demands are constantly evolving, Databricks can handle fleet\n\nexpansion without sacrificing data quality and speed\n\n**Optimizing with AI/ML** — implementing AI and ML models can lead to more effective route monitoring,\n\nproactive maintenance and reduced risk of accidents\n\n**Solution overview**\n\nDatabricks enables better truck monitoring, quickly ingesting data on everything from vehicle manufacturing\n\nto route optimization. This results in a more complete and real-time view of a company’s fleet, and these\n\nanalytics provide companies with the tools they need to scale and improve their operations.\n\n**Typical use case data sources include:**\n\nSupply planning, transportation management, manufacturing, predictive maintenance\n\n**CASE STUDY**\n\nWith 94% of vehicular accidents attributed to human error, Embark used the Databricks Lakehouse Platform\n\nto unlock thousands of hours of recorded data from its trucks and then collaboratively analyze that data\n\nvia dashboards. This has resulted in more efficient ML model training as Embark speeds toward fully\n\nautonomous trucks.\n\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n-----\n\n**CHAPTER 5**\n### Use Case: Inventory Allocation\n\n\n**Overview**\n\nReplenishment planning is the process of determining what needs to go where. It is used by replenishment\n\nplanning, distributors and consumer goods companies performing vendor-managed replenishment (VMR) or\n\nvendor-managed inventory (VMI) to make daily decisions on which product needs to be sent to which store\n\nand on what day.\n\nReplenishment is challenging for companies because it deals with rapidly changing data and the need to\n\nmake complex decisions on that data in narrow service windows. Retailers need to stream in real-time sales\n\ndata to signal how much of a product has been sold in order. Inaccurate sales data leads to an insufficient\n\nnumber of products being sent to stores. This results in lost sales and low customer satisfaction.\n\nInventory allocation is a process that might be performed multiple times a day during peak seasons, or\n\ndaily during slower seasons. 
Companies need the ability to scale to perform tens of millions of predictions\n\nmultiple times a day — on demand and dynamically — during peak season without paying a premium for\n\nthis capability throughout the year.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "fa319e02f812af76ebeda1e70704dec6", + "planning, distributors and consumer goods companies performing vendor-managed replenishment (VMR) or\n\nvendor-managed inventory (VMI) to make daily decisions on which product needs to be sent to which store\n\nand on what day.\n\nReplenishment is challenging for companies because it deals with rapidly changing data and the need to\n\nmake complex decisions on that data in narrow service windows. Retailers need to stream in real-time sales\n\ndata to signal how much of a product has been sold in order. Inaccurate sales data leads to an insufficient\n\nnumber of products being sent to stores. This results in lost sales and low customer satisfaction.\n\nInventory allocation is a process that might be performed multiple times a day during peak seasons, or\n\ndaily during slower seasons. Companies need the ability to scale to perform tens of millions of predictions\n\nmultiple times a day — on demand and dynamically — during peak season without paying a premium for\n\nthis capability throughout the year.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Restaurants\n\n\n-----\n\n**Challenges**\n\n\u0007Customers must complete tens of millions of inventory allocation predictions within tight time windows.\n\nThis information is used to determine which products get put on trucks and go to specific stores.\n\n\u0007Traditional inventory allocation rules cause trade-offs in accuracy in order to calculate all possibilities in\n\nthe service windows\n\n\u0007Legacy tools have rudimentary capabilities and have limited ability to consider flavors, sizes and other\n\nattributes that may be more or less popular by store\n\n**Value with Databricks**\n\nCustomers are able to complete inventory allocation models within SLAs with no trade-off for accuracy.\n\n\u0007 **Speed —** on average, customers moving to Databricks for demand forecasting report a double-digit\n\nimprovement in forecast accuracy\n\n\u0007 **Ability to scale** and perform fine-grained (day, store, item) level allocations\n\n\u0007 **Provide more robust allocations** by incorporating causal factors that may increase demand, or include\n\ninformation on flavors or apparel sizes for specific stores\n\n**Solution overview**\n\nThe objective of inventory allocation is to quickly determine when to distribute items and where — from\n\nwarehouses and distribution centers to stores. Inventory allocation begins by looking at the consumption\n\nrate of products, the available inventory and the shipping schedules, and then using this information to\n\ncreate an optimized manifest of what items should be carried on which trucks, at what point, and at what\n\ntime. This becomes the plan for route accounting systems that arrange deliveries.\n\nInventory allocation also deals with trade-offs related to scarcity of items. 
If an item has not been available\n\nin a store for a long time, that store may receive heightened priority for the item in the allocation.\n\n\n-----\n\nHOW TO GET STARTED\n\n\n**Typical use case data sources include:** point of sale, digital sales, replenishment data, modeled safety\n\nstock, promotions data, weather\n\n**View our webinar covering demand forecasting with Starbucks and then read our blog about**\n\n**demand forecasting.**\n\n**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n\nOur most popular notebook at Databricks. This blog walks you through the business and technical\n\nchallenges of performing demand forecasting and explains how we approached solving it.\n\n**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n\nVideo and Q&A from our webinar with Starbucks.\n\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n**CASE STUDY**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "32a24850e5f738d2b28c2a2e26336594", + "**demand forecasting.**\n\n**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n\nOur most popular notebook at Databricks. This blog walks you through the business and technical\n\nchallenges of performing demand forecasting and explains how we approached solving it.\n\n**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n\nVideo and Q&A from our webinar with Starbucks.\n\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n**CASE STUDY**\n\nH&M turned to the Databricks Lakehouse Platform to simplify its infrastructure management, enable\n\nperformant data pipelines at scale, and simplify the machine learning lifecycle. The result was a more data-\n\ndriven organization that could better forecast operations to streamline costs and boost revenue.\n\n**CASE STUDY**\n\nEdmunds is on a mission to make car shopping an easy experience for all. With the Databricks Lakehouse\n\nPlatform, they are able to simplify access to their disparate data sources and build ML models that make\n\npredictions off data streams. With real-time insights, they can ensure that the inventory of vehicle listings\n\non their website is accurate and up to date, improving overall customer satisfaction.\n\n\n-----\n\n**CHAPTER 6**\n### Use Case: Point of Sale\n and Clickstream\n\n\n**Overview**\n\nDisruptions in the supply chain — from reduced product supply and diminished warehouse capacity —\n\ncoupled with rapidly shifting consumer expectations for seamless omnichannel experiences are driving\n\nretailers to rethink how they use data to manage their operations. Historically, point-of-sale (POS) systems\n\nrecorded all in-store transactions, but were traditionally kept in a system that was physically in the store.\n\nThis would result in a delay in actionable insights. 
And now with consumers increasingly shopping online, it’s\n\ncrucial to not only collect and analyze that clickstream data quickly, but also unify it with POS data to get a\n\ncomplete and real-time snapshot of each customer’s shopping behavior.\n\nNear real-time availability of information means that retailers can continuously update their estimates of\n\nitem availability. No longer is the business managing operations based on their knowledge of inventory\n\nstates as they were a day prior, but instead is taking actions based on their knowledge of inventory states as\n\nthey are now.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce\n\n**Challenges**\n\n\u0007Retailers with legacy POS systems in their brick-and-mortar stores are working with siloed and\n\nincomplete sales data\n\n\u0007Both POS and clickstream data need to be unified and ingested in real time\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\n\n**Value with Databricks**\n\nDatabricks brings POS and clickstream data together for a unified data source that leads to real-time\n\ninsights and a clearer understanding of customer behavior.\n\n\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n\nclickstream data\n\n\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n\ncustomer purchasing behaviors and trends\n\n\nto have them perform a free proof-of-\n\n\nconcept with your real-time data.\n\n\n\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n\n\n-----\n\n**CHAPTER 7**\n### Use Case: On-Shelf Availability\n\n\n**Overview**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "a9bb4ef49c9cb97f5bc20541b9536596", + "-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\n\n**Value with Databricks**\n\nDatabricks brings POS and clickstream data together for a unified data source that leads to real-time\n\ninsights and a clearer understanding of customer behavior.\n\n\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n\nclickstream data\n\n\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n\ncustomer purchasing behaviors and trends\n\n\nto have them perform a free proof-of-\n\n\nconcept with your real-time data.\n\n\n\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n\n\n-----\n\n**CHAPTER 7**\n### Use Case: On-Shelf Availability\n\n\n**Overview**\n\nEnsuring the availability of a product on shelf is the single largest problem in retail. Retailers globally are\n\nmissing out on nearly $1 trillion in sales because they don’t have on hand what customers want to buy in\n\ntheir stores. Shoppers encounter out-of-stock scenarios as often as one in three shopping trips. All told,\n\nworldwide, shoppers experience $984 billion worth of out-of-stocks, $144.9 billion in North America alone,\n\naccording to industry research firm IHL.\n\nIn the past, if a customer faced an out-of-stock, they would most likely select a substitute item. The cost\n\nof going to another store prevented switching. Today, e-commerce loyalty members, such as those who\n\nbelong to Walmart+ and Amazon Prime, are 52% more likely than other consumers to purchase out-of-stock\n\nitems online. 
It is believed that a quarter of Amazon’s retail revenue comes from customers who first tried to\n\nbuy a product in-store. In all, an estimated $36 billion is lost to brick-and-mortar competition, and another\n\n$34.8 billion is lost to Amazon or another e-retailer, according to IHL.\n\nOn-shelf availability takes on a different meaning in pure e-commerce applications. An item can be\n\nconsidered in stock when it is actually in a current customer’s basket. If another customer places the same\n\nitem in their basket, there is the possibility that the first customer will purchase the last available item\n\nbefore the second customer. This problem is exacerbated by retailers who use stores to keep inventory. In\n\nthese situations, customers may order an item that is picked for delivery at a much later time. The window\n\nbetween ordering and picking creates the probability of out-of-stocks.\n\nOn-shelf availability predicts the depletion of inventory by item, factors in safety stock levels and\n\nreplenishment points, and generates a signal that suggests an item may be out of stock. This information is\n\nused to generate alerts to retail staff, distributors, brokers and consumer goods companies. Every day, tens\n\nof thousands of people around the world do work that is generated by these algorithms.\n\nThe sheer volume of data used to calculate on-shelf availability prevents most companies from analyzing\n\nall of their products. Companies have between midnight and 4 AM to collect all of the needed information\n\nand run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n\nthe priority categories or products to analyze, which means a significant percentage of their unavailable\n\nproducts will not be proactively addressed.\n\n\n-----\n\nOne of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n\nWhile some retailers are investing in computer vision and robots, and others employ the use of people to\n\nmanually survey item availability, most retailers default to a signal of determining when an item has not been\n\nscanned in an acceptable time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nE-commerce Direct to\nConsumer\n\n\n**Challenges**\n\nThe biggest challenge to generating on-shelf availability alerts is time. Companies may receive their final sales", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "891fec08af2e7fc29891e64459f4c5f9", + "all of their products. Companies have between midnight and 4 AM to collect all of the needed information\n\nand run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n\nthe priority categories or products to analyze, which means a significant percentage of their unavailable\n\nproducts will not be proactively addressed.\n\n\n-----\n\nOne of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n\nWhile some retailers are investing in computer vision and robots, and others employ the use of people to\n\nmanually survey item availability, most retailers default to a signal of determining when an item has not been\n\nscanned in an acceptable time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nE-commerce Direct to\nConsumer\n\n\n**Challenges**\n\nThe biggest challenge to generating on-shelf availability alerts is time. 
Companies may receive their final sales\n\ndata from the preceding day shortly after midnight. They have less than 4 hours from that point to ingest large\n\nvolumes of t-log data and calculate probabilities of item availability. Most firms are encumbered by a data\n\nwarehouse process that only releases data after it has been ingested and aggregates have been calculated, a\n\nprocess that can require multiple hours per night.\n\nFor this reason, most firms make sacrifices in their analysis. They may alternate categories they analyze by\n\ndifferent days, prioritize only high-impact SKUs, or run analysis at higher-level and less-accurate aggregate\n\nlevels. Among the challenges:\n\n\u0007Processing large volumes of highly detailed data and running millions of models in a narrow time window\n\n\u0007Companies are spending hundreds of thousands of dollars annually to generate these daily alerts for a\n\nfew categories\n\n\u0007Dealing with false positives and negatives in predictions\n\nDistributing information quickly and efficiently to internal systems and external partners\n\n\n-----\n\n**Value with Databricks**\n\nDatabricks enables customers to generate on-shelf availability (OSA) predictions at scale with no\n\ncompromises.\n\n**\u0007** Delta removes the data processing bottleneck. Delta enables retailers to stream in real time or to batch\n\nprocess large volumes of highly detailed and frequently changing point-of-sale transaction data.\n\n**\u0007** Easily scale to process all OSA predictions within tight service windows using Apache Spark TM\n\n**\u0007** Manage features and localize models with additional causal data to improve accuracy with MLflow\n\n**\u0007** Easily deploy information via streams, through API for mobile applications or partners, or to Delta for\n\nreporting\n\n**\u0007** Enable retailers to monetize their data by directly licensing OSA alerts\n\n**Solution overview**\n\nDatabricks enables companies to perform on-shelf availability analysis without making compromises to the\n\nbreadth or quality of predictions.\n\nIt begins with Delta Lake — a nearly perfect platform for ingesting and managing t-log data. One of the\n\nbiggest challenges in t-log data is the frequent number of changes to a transaction that can occur within\n\na data. Delta Lake simplifies this with transaction awareness using a transaction log, and creates additional\n\nmetadata for easier retrieval. Data is made available in a fraction of the time needed in data warehouse-\n\nbased systems. This is why the largest retailers in the world are using Delta Lake for processing t-log data.\n\nOnce data is available, users need to generate predictions about item availability on the shelf. With its\n\nextremely performant engine and the ability to distribute computation across countless nodes, Spark\n\nprovides the perfect platform for calculating out-of-stocks. Customers no longer need to run in aggregate\n\nor against a subset of data.\n\n\n-----\n\n**HOW TO GET STARTED**\n\n[Solution Accelerator:](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\n[On-Shelf Availability](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\nIn this solution, we show how the", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "a53340337e61400ee84bf40d4bb5a86c", + "a data. 
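Because individual transaction log records are revised frequently, a common way to keep the t-log table current for these on-shelf availability models is a Delta Lake MERGE. The sketch below is illustrative only; the table and column names are hypothetical and the ambient Databricks `spark` session is assumed.

```python
# Hypothetical sketch: upsert late-arriving corrections into a point-of-sale transaction
# log table so downstream on-shelf availability scoring always sees the latest records.
from delta.tables import DeltaTable

tlog = DeltaTable.forName(spark, "retail.tlog_silver")
updates = spark.table("retail.tlog_updates_staging")

(
    tlog.alias("t")
    .merge(
        updates.alias("u"),
        "t.transaction_id = u.transaction_id AND t.line_item_id = u.line_item_id",
    )
    .whenMatchedUpdateAll()      # apply corrections to existing line items
    .whenNotMatchedInsertAll()   # add newly observed line items
    .execute()
)
```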
Delta Lake simplifies this with transaction awareness using a transaction log, and creates additional\n\nmetadata for easier retrieval. Data is made available in a fraction of the time needed in data warehouse-\n\nbased systems. This is why the largest retailers in the world are using Delta Lake for processing t-log data.\n\nOnce data is available, users need to generate predictions about item availability on the shelf. With its\n\nextremely performant engine and the ability to distribute computation across countless nodes, Spark\n\nprovides the perfect platform for calculating out-of-stocks. Customers no longer need to run in aggregate\n\nor against a subset of data.\n\n\n-----\n\n**HOW TO GET STARTED**\n\n[Solution Accelerator:](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\n[On-Shelf Availability](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\nIn this solution, we show how the\n\nDatabricks Lakehouse Platform enables\n\nreal-time insights to rapidly respond\n\n\nAnd lastly, data is only useful if it drives better outcomes. Databricks can write the resulting data into Delta\n\nLake for further reporting, or to any downstream application via APIs, feeds or other integrations. Users can\n\nfeed their predictive alerts to downstream retail operations systems or even to external partners within the\n\ntightest service windows, and in enough time to drive actions on that day.\n\n**Typical use case data sources include:** point-of-sale data, replenishment data, safety stock calculations,\n\nmanual inventory data (optional), robotic or computer vision inventory data (optional)\n\n**CASE STUDY**\n\nReckitt distributes its products to millions of consumers in over 60 countries, which was causing the\n\norganization to struggle with the complexity of forecast demand, especially with large volumes of different\n\ntypes of data across many disjointed pipelines. Thanks to the Databricks Lakehouse Platform, Reckitt now\n\nuses predictive analytics, product placement and business forecasting to better support neighborhood\n\ngrocery stores.\n\n\nto demand, drive more sales by\n\nensuring stock is available on shelf, and\n\nscale out your forecasting models to\n\naccommodate any size operation.\n\n\n-----\n\n**CHAPTER 8**\n### Use Case: Customer and Vehicle Identification\n\n\n**Overview**\n\nCOVID-19 led to increased consumer demand for curbside pickup, drive-through and touchless payment\n\noptions. Retailers that were able to implement these new services have been able to differentiate overall\n\ncustomer experiences and mitigate catastrophic hits on revenue levels.\n\nFor retailers to create a seamless contactless experience for customers, they need real-time data to\n\nknow when a customer has arrived and where they’re located, as well as provide updates throughout the\n\npickup journey. And through the use of computer vision, they can capture that data by employing optical\n\nrecognition on images to read vehicle license plates.\n\nRetailers can also use information captured from license plates to make recommendations on buying\n\npatterns. 
Looking ahead, facial recognition also has the potential to provide retailers with valuable\n\ninformation to better serve their customers in real time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDrive-Through\nFood Retailers\n\n\n**Challenges**\n\n\u0007Ineffective data processing can lead to suboptimal order preparation timing\n\n\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status\n\n\n-----\n\n**Value with Databricks**\n\nDatabricks makes it possible to not only identify customers and vehicles in real time but also provide real-\n\ntime communications throughout the entire shopping and curbside or drive-through experience.\n\n\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order\n\npreparation timing\n\n\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensure", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "23407d8db01212b3e918b64e6fe28d48", + "patterns. Looking ahead, facial recognition also has the potential to provide retailers with valuable\n\ninformation to better serve their customers in real time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDrive-Through\nFood Retailers\n\n\n**Challenges**\n\n\u0007Ineffective data processing can lead to suboptimal order preparation timing\n\n\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status\n\n\n-----\n\n**Value with Databricks**\n\nDatabricks makes it possible to not only identify customers and vehicles in real time but also provide real-\n\ntime communications throughout the entire shopping and curbside or drive-through experience.\n\n\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order\n\npreparation timing\n\n\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensure\n\neach subsequent visit is equally as or more seamless than the last\n\n\u0007 **Optimizing with AI/ML** — implementing AI and ML models can lead to more effective geofencing,\n\nvehicle identification and order prediction\n\n**CASE STUDY**\n\n**CASE STUDY**\n\n\n-----\n\n**CHAPTER 9**\n### Use Case: Recommendation Engines\n\n\n**Overview**\n\nCustomers that feel understood by a retailer are more likely to spend more per purchase, purchase more\n\nfrequently with that retailer, and deliver higher profitability per customer. The way that retailers achieve this\n\nis by recommending products and services that align with customer needs.\n\nProviding an experience that makes customers feel understood helps retailers stand out from the crowd\n\nof mass merchants and build loyalty. This was true before COVID, but shifting consumer preferences make\n\nthis more critical than ever for retail organizations. 
With research showing the cost of customer acquisition\n\nis as much as five times as retaining existing ones, organizations looking to succeed in the new normal must\n\ncontinue to build deeper connections with existing customers in order to retain a solid consumer base.\n\nThere is no shortage of options and incentives for today’s consumers to rethink long-established patterns\n\nof spending.\n\nRecommendation engines are used to create personalized experiences for users across retail channels.\n\nThese recommendations are generated based on the data collected from purchases, items interacted\n\nwith, users’ behavior across physical and digital channels, and other data such as from customer service\n\ninteractions and reviews. Leveraging a Customer 360 architecture that collects all user clickstream and\n\nbehavioral data, marketers are able to create recommendations that are integrated with other business\n\nobjectives such as highlighting items that are on promotion or product availability.\n\nCreating recommendations is not a monolithic activity. Recommendation engines are used to personalize\n\nthe customer experience in every possible area of consumer engagement, from proactive notifications and\n\noffers, to landing page optimization, suggested products, automated shipment recommendations, cross-sell\n\nand upsell, and even suggestions for complementary items after the purchase.\n\n\n-----\n\n**R E L E V A N T F O R**\n\n\nRetail E-commerce Direct to\nConsumer\n\n\nMedia Telecom Financial Services\n(any B2B or B2C\ncompany)\n\n\n**Challenges**\n\nRecommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n\nbut traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n\nrecommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n\nirrelevant.\n\n**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n\nis improved by having recent data, and yet most systems struggle to handle the large volumes of\n\ninformation involved.\n\n**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n\ntouchpoints in one place are critical to enabling this use case. More data, including transaction and\n\nclickstream data, is critical for driving accuracy and precision in messaging.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "5010961aaebba0bff491572fc7f9703c", + "**R E L E V A N T F O R**\n\n\nRetail E-commerce Direct to\nConsumer\n\n\nMedia Telecom Financial Services\n(any B2B or B2C\ncompany)\n\n\n**Challenges**\n\nRecommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n\nbut traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n\nrecommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n\nirrelevant.\n\n**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n\nis improved by having recent data, and yet most systems struggle to handle the large volumes of\n\ninformation involved.\n\n**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n\ntouchpoints in one place are critical to enabling this use case. 
More data, including transaction and\n\nclickstream data, is critical for driving accuracy and precision in messaging.\n\n**Processing speed.** Retailers need to be able to frequently refresh models based on constantly\n\nchanging dynamics, and deliver real-time recommendations via APIs.\n\n**Automation.** This is an “always-on” use case where automation is essential for scalability and\n\nresponsiveness based on frequent model updates.\n\n\n-----\n\nMany firms choose to use recommender systems from Amazon or Google. Using these systems trains\n\nthe general recommendation engine in a way that helps competitors improve the accuracy of their own\n\nrecommendations.\n\n**Value with Databricks**\n\nRecommendations are one of the most critical capabilities that a retailer maintains. This is a capability that\n\nretailers must own, and Databricks provides a solid platform for enabling this.\n\nUsing Databricks as the foundation for their Customer 360 architecture to deliver omnichannel\n\npersonalization, sample value metrics from a media agency include:\n\n**200% ROI for 70% of retailers** engaging in advanced personalization\n\n**10% improvement** in conversions\n\n**35% improvement** in purchase frequency\n\n**37% improvement** in customer lifetime value\n\n**Solution overview**\n\nRecommendations are only as good as the data that powers them. Delta Lake provides the best platform for\n\ncapturing and managing huge volumes of highly atomic and frequently changing data. It allows organizations\n\nto combine various sources of data in a timely and efficient manner, from transactions, demographics and\n\npreference information across products, to clickstream, digital journey and marketing analytics data to bring\n\na 360 view of customer interactions to enable omnichannel personalization.\n\nBy identifying changes in user behavior or engagement, retailers are able to detect early signals that\n\nindicate a propensity to buy or a change in preferences, and recommend products and services that will\n\nkeep consumers engaged.\n\n\n-----\n\n**Typical use case data sources include:** Customer 360 data, CRM, loyalty data, transaction data,\n\nclickstream data, mobile data:\n\n**Engagement data** — transaction log data, clickstream data, promotion interaction\n\n**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n\nchildren, location\n\n**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n\nto churn\n\n**CASE STUDY**\n\nFor Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n\nfor help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n\npersonalized to each of their customers.\n\n**CASE STUDY**\n\nColumbia’s legacy ETL was unable to support batch and real-time use cases at scale. 
After migrating to\n\nDatabricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n\nbusiness decisions.\n\n**CASE STUDY**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "222a81bc42783b79a909cdc1fc87a65b", + "clickstream data, mobile data:\n\n**Engagement data** — transaction log data, clickstream data, promotion interaction\n\n**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n\nchildren, location\n\n**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n\nto churn\n\n**CASE STUDY**\n\nFor Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n\nfor help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n\npersonalized to each of their customers.\n\n**CASE STUDY**\n\nColumbia’s legacy ETL was unable to support batch and real-time use cases at scale. After migrating to\n\nDatabricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n\nbusiness decisions.\n\n**CASE STUDY**\n\nPandora wanted to drive stronger online engagement with their customers, so they used the Databricks\n\nLakehouse Platform to create more personalized experiences and boost both click-to-open rates and\n\nquarterly revenue.\n\n\nHOW TO GET STARTED\n\nDatabricks has created [four](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n\n[Recommendation Engine accelerators,](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n\nwith content-based and collaborative\n\nfilter methods, and both item-\n\nand user-based analysis. These\n\naccelerators have been further refined\n\nto be highly performant to enable\n\nfrequent retraining of models.\n\nTo begin working on recommendation\n\nengines, contact your Databricks\n\naccount team.\n\n\n-----\n\n**CHAPTER 10**\n### Use Case: Perpetual Inventory\n\n\n**Overview**\n\nWith the rapid adoption of digital channels for retail, staying on top of your inventory is crucial to meeting\n\ncustomer demand. As a result, the periodic inventory system is now outdated — instead, using a perpetual\n\ninventory model allows businesses to perform immediate and real-time tracking of sales and inventory\n\nlevels. This has the added benefit of reducing labor costs and human error, ensuring that you always have an\n\naccurate overview of your inventory and can better forecast demand to avoid costly stockouts.\n\nThe key to building a perpetual inventory system is real-time data. 
By capturing real-time transaction\n\nrecords related to sold inventory, retailers can make smarter inventory decisions that streamline operations\n\nand lower overall costs.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Supply Chain\n\n\nInventory\nManagement\n\n\n**Challenges**\n\n**\u0007** Companies need to scale to handle ever-increasing inventory and the data associated with the products\n\n**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n\nview of inventory\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n**Value with Databricks**\n\nDatabricks enables real-time inventory updates, giving businesses the insights they need to properly\n\nmanage inventory and to forecast more accurately.\n\n**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n\nthe latest sales data\n\n**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n\ncompanies know they’re getting the most accurate information at any point\n\n**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n\nmanagement costs\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "3ac87782fd6d7538352816e9c421c808", + "**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n\nview of inventory\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n**Value with Databricks**\n\nDatabricks enables real-time inventory updates, giving businesses the insights they need to properly\n\nmanage inventory and to forecast more accurately.\n\n**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n\nthe latest sales data\n\n**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n\ncompanies know they’re getting the most accurate information at any point\n\n**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n\nmanagement costs\n\n\n-----\n\n**CHAPTER 11**\n### Use Case: Automated\n Replenishments\n\n\n**Overview**\n\nCustomers favor convenience more than ever when it comes to their goods, and automated replenishments\n\nhelp meet that need. 
Whether it’s through a connected device or smartphone app, real-time data plays a\n\nkey role in ensuring consumers get a refill automatically delivered at the right time.\n\nOn the manufacturing side, this real-time data can also help with vendor-managed replenishment (VMR),\n\nreducing the time needed to forecast, order and receive thousands of items.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Direct to\nCustomer\n\n\n**Challenges**\n\n**\u0007** Being able to ingest large amounts of data quickly is crucial to actually fulfilling the\n\nreplenishment orders\n\nWith VMR, there may be a disconnect between the vendor and customer, resulting in a forecast\n\nfor replenishment even when the customer can’t fulfill that order\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n**Value with Databricks**\n\nDatabricks enables real-time inventory updates, giving businesses the insights they need to properly\n\nmanage inventory and to forecast more accurately.\n\n**\u0007Near real-time insights** — the greater speed to data means businesses can stay on top of\n\ncustomer needs\n\n**\u0007Scalability** — companies can scale with Databricks to handle thousands of SKUs, each with its own\n\nunique properties and expiry dates\n\n**\u0007Optimizing with AI/ML** — using AI and ML can lead to better forecasting and predictions\n\n\n-----\n\n**CHAPTER 12**\n### Use Case: Fresh Food Forecasting\n\n\n**Overview**\n\nFresh food typically accounts for up to 40% of revenue for grocers, and plays an important role in driving\n\nstore traffic. But fresh food is also incredibly complex to manage — prices can be volatile, there is a wide\n\nrange of suppliers to work with and the products expire, which creates significant amounts of waste.\n\nIn order to avoid losing significant revenue, businesses need to properly forecast when food is nearing its\n\nsell-by date, the current levels of customer demand (also taking into account seasonality), and the proper\n\ntiming for replenishing food stock. Being able to tap into real-time data is key to staying on top of the ever-\n\nchanging needs around fresh food.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce Distributors Logistics Restaurants\n\n**Challenges**\n\n**\u0007** Because of the perishable nature of fresh food, customers need to be able to ingest data quickly\n\nenough to conduct daily forecasting and daily replenishment", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "6e9d15b1e8da5b3d9bc282ac1e7664b5", + "**Overview**\n\nFresh food typically accounts for up to 40% of revenue for grocers, and plays an important role in driving\n\nstore traffic. But fresh food is also incredibly complex to manage — prices can be volatile, there is a wide\n\nrange of suppliers to work with and the products expire, which creates significant amounts of waste.\n\nIn order to avoid losing significant revenue, businesses need to properly forecast when food is nearing its\n\nsell-by date, the current levels of customer demand (also taking into account seasonality), and the proper\n\ntiming for replenishing food stock. 
Being able to tap into real-time data is key to staying on top of the ever-\n\nchanging needs around fresh food.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce Distributors Logistics Restaurants\n\n**Challenges**\n\n**\u0007** Because of the perishable nature of fresh food, customers need to be able to ingest data quickly\n\nenough to conduct daily forecasting and daily replenishment\n\n**\u0007** Customers are running aggregate-level forecasts, which are less accurate than fine-grained forecasting\n\n**\u0007** Customers are forced to compromise on what they can analyze\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team to get\n\nstarted with inventory allocation. Databricks\n\ndoes not have a Solution Accelerator.\n\nView our webinar covering demand forecasting\n\nwith Starbucks and then read our blog about\n\ndemand forecasting.\n\n[Fine-grained time series forecasting at scale.](https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html)\n\nThis blog details the importance of time series\n\nforecasting, walks through building a simple\n\nmodel to show the use of Facebook Prophet, and\n\nthen shows off the combination of Facebook\n\nProphet and Apache Spark to scale to hundreds\n\nof models.\n\n[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)\n\nVideo and Q&A from our webinar with Starbucks\n\n\n**Value with Databricks**\n\nCustomers average double-digit improvement in forecast accuracy, leading to a reduction in lost sales and in\nspoiled products, as well as lower inventory and handling costs.\n\n**\u0007Improved accuracy** — on average, customers moving to Databricks for demand forecasting report a\n\ndouble-digit improvement in forecast accuracy\n\n**\u0007Ability to scale and perform fine-grained (day, store, item) level forecasts** — rapidly scale to tens of\n\nmillions of model iterations in narrow service windows. Companies need accurate demand forecasts in a\nfew hours.\n\n**\u0007Eliminate compromises on what to analyze** — customers do not need to select winners or losers among\n\nthe products they forecast. They can predict demand for all products as frequently as required.\n\n**Solution overview:**\n\nDatabricks is well suited to handling forecasting for fresh food at scale. Forecasting begins with the Databricks\nSolution Accelerator. It enables companies to rapidly build fine-grained forecasting of items — forecasting that\ncan be efficiently scaled to tens of millions of predictions in tight service windows.\n\n**Typical use case data sources include:** historic point-of-sale data, shipment data, promotions, pricing,\n\nexpiration dates and weather.\n\n**CASE STUDY**\n\nButcherBox faced the complex challenges of securing inventory with enough lead time, meeting highly variable\ncustomer order preferences and unpredictable customer sign-ups, and managing delivery logistics. 
With\nDatabricks, the company was able to create a predictive solution to adapt quickly and integrate tightly with the\nrest of its data estate.\n\n\non demand forecasting.\n\n**CASE STUDY**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "537ac6a9ae1f1f02fabe20123c0b4e89", + "**\u0007Eliminate compromises on what to analyze** — customers do not need to select winners or losers among\n\nthe products they forecast. They can predict demand for all products as frequently as required.\n\n**Solution overview:**\n\nDatabricks is well suited to handling forecasting for fresh food at scale. Forecasting begins with the Databricks\nSolution Accelerator. It enables companies to rapidly build fine-grained forecasting of items — forecasting that\ncan be efficiently scaled to tens of millions of predictions in tight service windows.\n\n**Typical use case data sources include:** historic point-of-sale data, shipment data, promotions, pricing,\n\nexpiration dates and weather.\n\n**CASE STUDY**\n\nButcherBox faced the complex challenges of securing inventory with enough lead time, meeting highly variable\ncustomer order preferences and unpredictable customer sign-ups, and managing delivery logistics. With\nDatabricks, the company was able to create a predictive solution to adapt quickly and integrate tightly with the\nrest of its data estate.\n\n\non demand forecasting.\n\n**CASE STUDY**\n\nSam’s Club needed to build out an enterprise-scale data platform to handle the billions of transactions and\ntrillions of events going through the company. Find out how Databricks became a key component in the shift\nfrom on premises Hadoop clusters to a cloud based platform\n\n\n-----\n\n**CHAPTER 13**\n### Use Case: Propensity-to-Buy\n\n\n**Overview**\n\nCustomers often have repeatable purchase patterns that may not be noticed upon initial observation.\n\nWhile we know that commuting office workers are likely to purchase coffee at a coffee shop on weekday\n\nmornings, do we understand why they visit on Thursday afternoons? And more importantly, how do we\n\npredict these buying moments when customers are not in our stores?\n\nThe purpose of a propensity-to-buy model is to predict when a customer is predisposed to make a\n\npurchase and subsequently act on that information by engaging customers. Traditional propensity-to-buy\n\nmodels leveraged internal sales and loyalty data to identify patterns of consumption. These models are\n\nuseful, but are limited in understanding the full behavior of customers. More advanced propensity-to-buy\n\nmodels are now incorporating alternative data sets to identify trips to competing retailers, competitive scan\n\ndata from receipts, and causal data that helps to explain when and why customers make purchases.\n\nPropensity-to-buy models create a signal that is sent to downstream systems such as those for promotion\n\nmanagement, email and mobile alerts, recommendations and others.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce Direct to\nConsumer\n\n\n-----\n\n**Challenges**\n\n**\u0007** Customers do not want to be inundated with messages from retailers. 
Companies need to limit their\n\noutreach to customers to avoid angering them.\n\nCompanies need to traverse and process vast sums of customer data and generate probabilities of\n\npurchase frequently\n\nCompanies need to look at external data that helps build a propensity-to-buy model that captures the full\n\nshare of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Value with Databricks**\n\n**\u0007** Databricks allows companies to efficiently traverse huge volumes of customer data over time, and\n\nefficiently synthesize this into data for analysis\n\n**\u0007** Companies need to traverse and process vast sums of customer data and generate probabilities of\n\npurchase frequency\n\n**\u0007** Companies need to look at external data that helps build a propensity-to-buy model that captures the full\n\nshare of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Solution overview:**\n\nPropensity-to-buy analytics determine the signals that indicate the probability a customer is in a buying\n\nmoment. Historic propensity models relied on sales data to identify buying patterns, but newer approaches are\n\nincorporating behavioral data. Proximity to a coffee shop might push a consumer over the threshold of a buying", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "e7a1b743421b089b6a8eb62094043f88", + "share of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Value with Databricks**\n\n**\u0007** Databricks allows companies to efficiently traverse huge volumes of customer data over time, and\n\nefficiently synthesize this into data for analysis\n\n**\u0007** Companies need to traverse and process vast sums of customer data and generate probabilities of\n\npurchase frequency\n\n**\u0007** Companies need to look at external data that helps build a propensity-to-buy model that captures the full\n\nshare of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Solution overview:**\n\nPropensity-to-buy analytics determine the signals that indicate the probability a customer is in a buying\n\nmoment. Historic propensity models relied on sales data to identify buying patterns, but newer approaches are\n\nincorporating behavioral data. Proximity to a coffee shop might push a consumer over the threshold of a buying\n\nmoment. Traditional, batch-oriented operations are insufficient to solve this problem. If you wait until that night,\n\nor even later in the day you have lost the opportunity to act\n\n\n-----\n\n**HOW TO GET STARTED**\n\nTo begin working on propensity-to-\n\nbuy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n\nWith the propensity to buy, speed becomes a critical force in determining key inflection points. Databricks\n\nenables marketers to ingest data in real time and update probabilities. 
Lightweight queries can be automated\n\nto refresh models, and the resulting data can be fed automatically to downstream promotions, web or mobile\n\nsystems, where the consumer can be engaged.\n\nAs this data is streamed into Delta Lake, data teams can quickly capture the data for broader analysis.\n\nCalculating a propensity to buy requires traversing interactions that are episodic in nature, and span broad\n\nperiods of time. Delta Lake helps simplify this with scalable metadata handling, ACID transactions and data\n\nskipping. Delta Lake even manages schema evolution to provide users with flexibility as their needs evolve.\n\n**Typical use case data sources include:** point-of-sale data with tokens, loyalty data, e-commerce sales data,\n\nmobile application data, competitive scan or receipt data (optional), place of interest data (optional)\n\n\n-----\n\n**CHAPTER 14**\n### Use Case: Next Best Action\n\n\n**Overview**\n\nThe e-commerce boom over the last couple of years has given consumers ample choice for digital\n\nshopping options. If your business isn’t engaging customers at every point in their purchasing journey, you\n\nrisk losing them to a competitor. By applying AI/ML to automatically determine — in real time — the next\n\nbest action for customers, you can greatly increase your conversion rates.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDirect to\nConsumer\n\n\nE-commerce\n\n\n**Challenges**\n\nSiloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n\nresulting in suboptimal recommendations for the next best action\n\nCompanies need to ingest large amounts of data in real time and then take action on it immediately\n\nMany businesses still struggle with training their ML models to properly determine the next best action\n\n(and self-optimize based on the results)\n\n\n-----\n\n**HOW TO GET STARTED**\n\nTo begin working on propensity-to-\n\nbuy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "7b7343372a08967b7914fed2682394bd", + "best action for customers, you can greatly increase your conversion rates.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDirect to\nConsumer\n\n\nE-commerce\n\n\n**Challenges**\n\nSiloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n\nresulting in suboptimal recommendations for the next best action\n\nCompanies need to ingest large amounts of data in real time and then take action on it immediately\n\nMany businesses still struggle with training their ML models to properly determine the next best action\n\n(and self-optimize based on the results)\n\n\n-----\n\n**HOW TO GET STARTED**\n\nTo begin working on propensity-to-\n\nbuy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n\n**Value with Databricks:**\n\nDatabricks provides all the tools needed to **process large volumes of data and find the next best**\n\n**action** at any given point in the customer journey\n\n**Near real-time insights** — the greater speed to data means businesses can react immediately to\n\ncustomer actions\n\n**Single source of truth** — break down data silos by unifying all of a company’s 
customer data (including\n\nbasic information, transactional data, online behavior/purchase history, and more) to get a complete\n\ncustomer profile\n\n**Optimizing with AI/ML** — use AI to create self-optimizing ML models that are trained to find the best next\n\nstep for customers\n\n\n-----\n\n**CHAPTER 15**\n### Customers That Innovate With Databricks Lakehouse for Retail\n\n\nSome of the top retail and consumer packaged goods companies in the world turn to Databricks Lakehouse\n\nfor Retail to deliver real-time experiences to their customers.\n\nToday, data is at the core of every innovation in the retail and consumer packaged goods industry.\n\nDatabricks Lakehouse for Retail enables companies across every sector of retail and consumer goods to\n\nharness the power of real-time data and analytics to solve strategic challenges and deliver more engaging\n\nexperiences to customers.\n\nGet started with a free trial of Lakehouse for Retail and start building better data applications today.\n\n**[Start your free trial](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo at: [databricks.com/contact](http://databricks.com/contact\r)\n\n\n-----\n\n###### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "f420acb9b35388d3343c892d6c83435d", + "**The**\n**Delta Lake**\n**Series**\n**Lakehouse**\n\nCombining the best elements of\ndata lakes and data warehouses\n\n\n-----\n\n###### Here’s what\n#### What’s \n###### you’ll find inside\n#### inside?\n\n\nThe Delta Lake Series of eBooks is published\n\n\nby Databricks to help leaders and practitioners\n\nunderstand the full capabilities of Delta Lake as\n\n\n**Introduction**\n**What is Delta Lake?**\n\n\nwell as the landscape it resides in. This eBook,\n\n\n**The Delta Lake Series — Lakehouse** , focuses\n\non lakehouse.\n\n\n**Chapter** **01**\n\n##### 02 Chapter\n 03 Chapter\n\n\nWhat Is\na Lakehouse?\n\nDiving Deep Into the Inner Workings\nof the Lakehouse and Delta Lake\n\nUnderstanding\nDelta Engine\n\n\n#### What’s next?\n\nAfter reading this eBook, you’ll not only\n\n\nunderstand what Delta Lake offers, but you’ll\n\nalso understand how its features result in\n\nsubstantial performance improvements.\n\n\n-----\n\n#### What is Delta Lake?\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\n\nanalytics to cloud data lakes. 
Delta Lake runs on top of existing data lakes and is fully\n\ncompatible with Apache Spark™ APIs.\n\nAt Databricks, we’ve seen how Delta Lake can bring reliability, performance and\n\nlifecycle management to data lakes. Our customers have found that Delta Lake\n\nsolves for challenges around malformed data ingestion, difficulties deleting data for\n\ncompliance, or issues modifying data for data capture.\n\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\n\nyour data lake and the rate that teams can leverage that data with a secure and\n\nscalable cloud service.\n\n\n-----\n\n**What Is a Lakehouse?**\n### CHAPTER 01\n\n\n-----\n\n**What Is a**\n**Lakehouse?**\n# 01\n\nOver the past few years at Databricks, we’ve seen a new data management architecture\n\nthat emerged independently across many customers and use cases: the **lakehouse.**\n\nIn this chapter, we’ll describe this new architecture and its advantages over previous\n\napproaches.\n\nData warehouses have a long history of decision support and business intelligence\n\napplications. Since its inception in the late 1980s, data warehouse technology\n\ncontinued to evolve and MPP architectures led to systems that were able to handle\n\nlarger data sizes.\n\nBut while warehouses were great for structured data, a lot of modern enterprises\n\nhave to deal with unstructured data, semi-structured data, and data with high variety,\n\nvelocity and volume. Data warehouses are not suited for many of these use cases, and\n\nthey are certainly not the most cost-efficient.\n\nAs companies began to collect large amounts of data from many different sources,\n\narchitects began envisioning a single system to house data for many different\n\nanalytic products and workloads.\n\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n\nin a variety of formats. While suitable for storing data, data lakes lack some critical\n\nfeatures: They do not support transactions, they do not enforce data quality, and their\n\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\n\n\n-----\n\n**A lakehouse combines the best elements**\n**of data lakes and data warehouses**\n\nA lakehouse is a new data architecture that combines the best elements of data lakes\n\nand data warehouses.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "27aeb4ec0df5550cb0a51cb193c439bd", + "velocity and volume. Data warehouses are not suited for many of these use cases, and\n\nthey are certainly not the most cost-efficient.\n\nAs companies began to collect large amounts of data from many different sources,\n\narchitects began envisioning a single system to house data for many different\n\nanalytic products and workloads.\n\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n\nin a variety of formats. 
While suitable for storing data, data lakes lack some critical\n\nfeatures: They do not support transactions, they do not enforce data quality, and their\n\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\n\n\n-----\n\n**A lakehouse combines the best elements**\n**of data lakes and data warehouses**\n\nA lakehouse is a new data architecture that combines the best elements of data lakes\n\nand data warehouses.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\n\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\n\nwarehouses.\n\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\n\n\nrequire systems for diverse data applications including SQL analytics, real-time\n\nmonitoring, data science and machine learning. Most of the recent advances in\n\nAI have been in better models to process unstructured data (text, images, video,\n\naudio), but these are precisely the types of data that a data warehouse is not\n\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\n\nwarehouses, and other specialized systems such as streaming, time-series, graph\n\nand image databases. Having a multitude of systems introduces complexity and,\n\nmore importantly, introduces delay as data professionals invariably need to move\n\nor copy data between different systems.\n\n\nLakehouses are enabled by a new system design: implementing similar data struc-\n\ntures and data management features to those in a data warehouse, directly on the\n\nkind of low-cost storage used for data lakes. They are what you would get if you had\n\nto redesign data warehouses in the modern world, now that cheap and highly reliable\n\nstorage (in the form of object stores) are available.\n\nA lakehouse has the following key features:\n\n- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n\nbe reading and writing data concurrently. Support for ACID transactions ensures\n\nconsistency as multiple parties concurrently read or write data, typically using\n\nSQL.\n\n\n-----\n\n- **\u0007Schema enforcement and governance:** The lakehouse should have a way to\n\nsupport schema enforcement and evolution, supporting DW schema paradigms\n\nsuch as star/snowflake-schemas. The system should be able to reason about data\n\nintegrity, and it should have robust governance and auditing mechanisms.\n\n- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n\nreduces staleness and improves recency, reduces latency and lowers the cost of\n\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n\ncompute use separate clusters, thus these systems are able to scale to many more\n\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\n\nthis property.\n\n- **\u0007Openness:** The storage formats they use are open and standardized, such as\n\nParquet, and they provide an API so a variety of tools and engines, including\n\nmachine learning and Python/R libraries, can efficiently access the data directly.\n\n- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n\nThe lakehouse can be used to store, refine, analyze and access data types needed\n\nfor many new data applications, including images, video, audio, semi-structured", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "d260bbdbcefe5b169f94c612022b7f40", + "reduces staleness and improves recency, reduces latency and lowers the cost of\n\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n\ncompute use separate clusters, thus these systems are able to scale to many more\n\nconcurrent users and larger data sizes. Some modern data warehouses also have\n\nthis property.\n\n- **\u0007Openness:** The storage formats they use are open and standardized, such as\n\nParquet, and they provide an API so a variety of tools and engines, including\n\nmachine learning and Python/R libraries, can efficiently access the data directly.\n\n- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n\nThe lakehouse can be used to store, refine, analyze and access data types needed\n\nfor many new data applications, including images, video, audio, semi-structured\n\ndata, and text.\n\n- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n\nanalytics. Multiple tools might be needed to support all these workloads, but they all\n\nrely on the same data repository.\n\n- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n\nSupport for streaming eliminates the need for separate systems dedicated to\n\nserving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\n\nfeatures. Tools for security and access control are basic requirements. Data governance\n\ncapabilities including auditing, retention and lineage have become essential particularly\n\nin light of recent privacy regulations. Tools that enable data discovery such as data\n\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n\nfeatures only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**\n\n**Abstract**\n\nCloud object stores such as Amazon S3 are some of the largest and most\n\ncost-effective storage systems on the planet, making the main attractive\n\ntarget to store large data warehouses and data lakes. Unfortunately, their\n\nimplementation as key-value stores makes it difficult to achieve ACID\n\ntransactions and high performance: Metadata operations, such as listing\n\nobjects, are expensive, and consistency guarantees are limited. In this paper,\n\nwe present Delta Lake, an open source ACID table storage layer over cloud\n\nobject stores initially developed at Databricks. 
Delta Lake uses a transaction log\n\nthat is compacted into Apache Parquet format to provide ACID properties, time\n\ntravel, and significantly faster metadata operations for large tabular data sets\n\n(e.g., the ability to quickly search billions of table partitions for those relevant\n\nto a query). It also leverages this design to provide high-level features such\n\nas automatic data layout optimization, upserts, caching, and audit logs. Delta\n\nLake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and\n\nother systems. Delta Lake is deployed at thousands of Databricks customers\n\nthat process exabytes of data per day, with the largest instances managing\n\nexabyte-scale data sets and billions of objects.\n\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n\nZhu, Mukul Murthy, Joseph Torres, Herman van Hövell, Adrian Ionescu, Alicja\n\nŁuszczak, Michał Szafrański, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n\nBoncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n\nRead the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "cdb0b634a2405d4198c88650a922807d", + "other systems. Delta Lake is deployed at thousands of Databricks customers\n\nthat process exabytes of data per day, with the largest instances managing\n\nexabyte-scale data sets and billions of objects.\n\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n\nZhu, Mukul Murthy, Joseph Torres, Herman van Hövell, Adrian Ionescu, Alicja\n\nŁuszczak, Michał Szafrański, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n\nBoncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n\nRead the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n\n\n-----\n\n**Some early examples**\n\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n\nMicrosoft’s Azure Synapse Analytics service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n\nenables a similar lakehouse pattern. Other managed services such as BigQuery and\n\nRedshift Spectrum have some of the lakehouse features listed above, but they are\n\nexamples that focus primarily on BI and other SQL applications.\n\nCompanies that want to build and implement their own systems have access to open\n\nsource file formats (Delta Lake, Apache Iceberg, Apache Hudi) that are suitable for\n\nbuilding a lakehouse.\n\nMerging data lakes and data warehouses into a single system means that data teams\n\ncan move faster as they are able to use data without needing to access multiple systems.\n\nThe level of SQL support and integration with BI tools among these early lakehouses\n\nis generally sufficient for most enterprise data warehouses. Materialized views and\n\n\nA note about technical building blocks. 
While distributed file systems can be\n\nused for the storage layer, object stores are more commonly used in lakehouses.\n\nObject stores provide low-cost, highly available storage that excels at massively\n\nparallel reads — an essential requirement for modern data warehouses.\n\n**From BI to AI**\n\nThe lakehouse is a new data management architecture that radically simplifies\n\nenterprise data infrastructure and accelerates innovation in an age when\n\nmachine learning is poised to disrupt every industry. In the past, most of the\n\ndata that went into a company’s products or decision-making was structured\n\ndata from operational systems, whereas today, many products incorporate\n\nAI in the form of computer vision and speech models, text mining and others.\n\nWhy use a lakehouse instead of a data lake for AI? A lakehouse gives you data\n\nversioning, governance, security and ACID properties that are needed even for\n\nunstructured data.\n\n\nstored procedures are available, but users may need to employ other mechanisms that\n\n\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\n\nimportant for “lift and shift scenarios,” which require systems that achieve semantics\n\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\n\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n\nlibraries) for non-BI workloads like data science and machine learning. Data\n\nexploration and refinement are standard for many analytic and data science\n\napplications. Delta Lake is designed to let users incrementally improve the quality of\n\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "b1f28e2afb30602c0205684eb65002df", + "versioning, governance, security and ACID properties that are needed even for\n\nunstructured data.\n\n\nstored procedures are available, but users may need to employ other mechanisms that\n\n\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\n\nimportant for “lift and shift scenarios,” which require systems that achieve semantics\n\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\n\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n\nlibraries) for non-BI workloads like data science and machine learning. Data\n\nexploration and refinement are standard for many analytic and data science\n\napplications. Delta Lake is designed to let users incrementally improve the quality of\n\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized\n\nsystems (such as data warehouses) that have years of investments and real-\n\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n\nnotebooks) over others so lakehouses will also need to improve their UX and their\n\nconnectors to popular tools so they can appeal to a variety of personas. These\n\nand other issues will be addressed as the technology continues to mature and\n\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\n\nproperties of being simpler, more cost-efficient and more capable of serving\n\ndiverse data applications.\n\n\ndata in their lakehouse until it is ready for consumption.\n\n\n-----\n\n**Diving Deep Into the Inner Workings**\n**of the Lakehouse and Delta Lake**\n\n### CHAPTER 02\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n# 02\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n\nadopting the lakehouse pattern. The blog created a massive amount of interest\n\nfrom technology enthusiasts. While lots of people praised it as the next-generation\n\ndata architecture, some people thought the lakehouse is the same thing as\n\nthe data lake. Recently, several of our engineers and founders wrote a research\n\npaper that describes some of the core technological challenges and solutions that\n\nset the lakehouse architecture apart from the data lake, and it was accepted and\n\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\n\ncan read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\n\nthey would have said faster horses.” The crux of this statement is that people often\n\nenvision a better solution to a problem as an evolution of what they already know\n\nrather than rethinking the approach to the problem altogether. In the world of data\n\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\n\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\n\nobject stores like Amazon S3 have become some of the largest and most cost-\n\neffective storage systems in the world, which makes them an attractive platform to\n\nstore data warehouses and data lakes. However, their nature as key-value stores\n\nmakes it difficult to achieve ACID transactions that many organizations require. Also,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "36545a5c53d7999af33b9e016e0d8188", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n\nthey would have said faster horses.” The crux of this statement is that people often\n\nenvision a better solution to a problem as an evolution of what they already know\n\nrather than rethinking the approach to the problem altogether. In the world of data\n\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\n\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\n\nobject stores like Amazon S3 have become some of the largest and most cost-\n\neffective storage systems in the world, which makes them an attractive platform to\n\nstore data warehouses and data lakes. 
However, their nature as key-value stores\n\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\n\nperformance is hampered by expensive metadata operations (e.g., listing objects)\n\nand limited consistency guarantees.\n\nBased on the characteristics of cloud object stores, three approaches have emerged.\n\n**1. Data lakes**\n\nThe first is directories of files (i.e., data lakes) that store the table as a collection\n\nof objects, typically in columnar format such as Apache Parquet. It’s an attractive\n\napproach because the table is just a group of objects that can be accessed from\n\na wide variety of tools without a lot of additional data stores or systems. However,\n\nboth performance and consistency problems are common. Hidden data corruption\n\nis common due to failed transactions, eventual consistency leads to inconsistent\n\nqueries, latency is high, and basic management capabilities like table versioning and\n\naudit logs are unavailable.\n\n**2. Custom storage engines**\n\nThe second approach is custom storage engines, such as proprietary systems built for\n\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\n\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\n\nservice that’s able to provide a single source of truth. However, all I/O operations need\n\nto connect to this metadata service, which can increase cloud resource costs and\n\nreduce performance and availability. Additionally, it takes a lot of engineering work to\n\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\n\nand PyTorch, which can be challenging for data teams that use a variety of computing\n\nengines on their data. Engineering challenges can be exacerbated by unstructured\n\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. Finally, and most egregiously, the proprietary metadata service locks\n\ncustomers into a specific service provider, leaving customers to contend with\n\nconsistently high prices and expensive, time-consuming migrations if they decide to\n\nadopt a new approach later.\n\n**3. Lakehouse**\n\nWith Delta Lake, an open source ACID table storage layer atop cloud object stores,\n\nwe sought to build a car instead of a faster horse with not just a better data store,\n\nbut a fundamental change in how data is stored and used via the lakehouse. A\n\nlakehouse is a new architecture that combines the best elements of data lakes and\n\ndata warehouses. Lakehouses are enabled by a new system design: implementing\n\nsimilar data structures and data management features to those in a data warehouse,\n\ndirectly on the kind of low-cost storage used for data lakes. They are what you would\n\nget if you had to redesign storage engines in the modern world, now that cheap and\n\nhighly reliable storage (in the form of object stores) are available.\n\nDelta Lake maintains information about which objects are part of a Delta table in an\n\nACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n\nthe cloud object store. 
This design allows clients to update multiple objects at once,\n\nreplace a subset of the objects with another, etc., in a serializable manner that still", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "6f0ad77cb910ed72fc3436f747611387", + "we sought to build a car instead of a faster horse with not just a better data store,\n\nbut a fundamental change in how data is stored and used via the lakehouse. A\n\nlakehouse is a new architecture that combines the best elements of data lakes and\n\ndata warehouses. Lakehouses are enabled by a new system design: implementing\n\nsimilar data structures and data management features to those in a data warehouse,\n\ndirectly on the kind of low-cost storage used for data lakes. They are what you would\n\nget if you had to redesign storage engines in the modern world, now that cheap and\n\nhighly reliable storage (in the form of object stores) are available.\n\nDelta Lake maintains information about which objects are part of a Delta table in an\n\nACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n\nthe cloud object store. This design allows clients to update multiple objects at once,\n\nreplace a subset of the objects with another, etc., in a serializable manner that still\n\nachieves high parallel read/write performance from the objects. The log also provides\n\nsignificantly faster metadata operations for large tabular data sets. Additionally, Delta\n\nLake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n\nsnapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n\ncaching, and audit logs. Together, these features improve both the manageability and\n\nperformance of working with data in cloud object stores, ultimately opening the door\n\nto the lakehouse architecture that combines the key features of data warehouses and\n\ndata lakes to create a better, simpler data architecture.\n\n\n-----\n\nToday, Delta Lake is used across thousands of Databricks customers, processing\n\nexabytes of structured and unstructured data each day, as well as many organizations\n\nin the open source community. These use cases span a variety of data sources and\n\napplications. The data types stored include Change Data Capture (CDC) logs from\n\nenterprise OLTP systems, application logs, time-series data, graphs, aggregate\n\ntables for reporting, and image or feature data for machine learning. The applications\n\ninclude SQL workloads (most commonly), business intelligence, streaming, data\n\nscience, machine learning and graph analytics. 
Overall, Delta Lake has proven itself to\n\nbe a good fit for most data lake applications that would have used structured storage\n\nformats like Parquet or ORC, and many traditional data warehousing workloads.\n\nAcross these use cases, we found that customers often use Delta Lake to significantly\n\nsimplify their data architecture by running more workloads directly against cloud\n\nobject stores, and increasingly, by creating a lakehouse with both data lake and\n\ntransactional features to replace some or all of the functionality provided by message\n\nqueues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n\nAmazon Redshift).\n\n**[In the research paper,](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **the authors explain:**\n\n- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance\n\nThrough the paper, you’ll gain a better understanding of Delta Lake and how it\n\nenables a wide range of DBMS-like performance and management features for data\n\nheld in low-cost cloud storage. As well as how the Delta Lake storage format and\n\naccess protocols make it simple to operate, highly available, and able to deliver high-\n\nbandwidth access to the object store.\n\n\n-----\n\n**Understanding Delta Engine**\n\n### CHAPTER 03\n\n\n-----\n\n**Understanding**\n**Delta Engine**\n# 03", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "f179ec9ceb185dca887837532487af04", + "- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance\n\nThrough the paper, you’ll gain a better understanding of Delta Lake and how it\n\nenables a wide range of DBMS-like performance and management features for data\n\nheld in low-cost cloud storage. As well as how the Delta Lake storage format and\n\naccess protocols make it simple to operate, highly available, and able to deliver high-\n\nbandwidth access to the object store.\n\n\n-----\n\n**Understanding Delta Engine**\n\n### CHAPTER 03\n\n\n-----\n\n**Understanding**\n**Delta Engine**\n# 03\n\nThe Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n\nengine to take advantage of modern CPU architecture with optimizations to Spark\n\n3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n\nRuntime 7.0. Together, these features significantly accelerate query performance on\n\ndata lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n\nadopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n\n**Scaling execution performance**\n\nOne of the big hardware trends over the last several years is that CPU clock speeds\n\nhave plateaued. 
The reasons are outside the scope of this chapter, but the takeaway\n\nis that we have to find new ways to process data faster beyond raw compute power.\n\nOne of the most impactful methods has been to improve the amount of data that can\n\nbe processed in parallel. However, data processing engines need to be specifically\n\narchitected to take advantage of this parallelism.\n\nIn addition, data teams are being given less and less time to properly model data as\n\nthe pace of business increases. Poorer modeling in the interest of better business\n\nagility drives poorer query performance. Naturally, this is not a desired state, and\n\norganizations want to find ways to maximize both agility and performance.\n\n\n-----\n\n**Announcing Delta Engine for**\n**high-performance query execution**\n\nDelta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n\nworkloads through three components: an improved query optimizer, a caching\n\nlayer that sits between the execution layer and the cloud object storage, and a native\n\nvectorized execution engine that’s written in C++.\n\nThe improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n\noptimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n\nstatistics to deliver up to 18x increased performance in star schema workloads.\n\nDelta Engine’s caching layer automatically chooses which input data to cache for the\n\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\n\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\n\ndata teams today is the native execution engine, which we call Photon. (We know.\n\n\n-----\n\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\nDatabricks has been built to maximize the performance from the new changes in\n\nmodern cloud hardware. It brings performance improvements to all workload types\n\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\n\nBy linking these three components together, we think it will be easier for customers\n\nto understand how improvements in multiple places within the Databricks code", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "731dd131fbfa4bd813b743ecdd9eba7d", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\n\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\n\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\n\ndata teams today is the native execution engine, which we call Photon. (We know.\n\n\n-----\n\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\nDatabricks has been built to maximize the performance from the new changes in\n\nmodern cloud hardware. 
It brings performance improvements to all workload types\n\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\n\nBy linking these three components together, we think it will be easier for customers\n\nto understand how improvements in multiple places within the Databricks code\n\naggregate into significantly faster performance for analytics workloads on data lakes.\n\nWe’re excited about the value that Delta Engine delivers to our customers. While the\n\ntime and cost savings are already valuable, its role in the lakehouse pattern supports\n\nnew advances in how data teams design their data architectures for increased\n\nunification and simplicity.\n\nFor more information on the Delta Engine, watch this keynote address from\n\n[Spark + AI Summit 2020:](https://www.youtube.com/watch?v=o54YMz8zvCY) [Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n\n\n-----\n\n## What’s next?\n\n\nNow that you understand Delta Lake and how its features can improve\n\nperformance, it may be time to take a look at some additional resources.\n\n**Data + AI Summit Europe 2020 >**\n\n- [Photon Technical Deep Dive: How to Think Vectorized](https://databricks.com/session_eu20/photon-technical-deep-dive-how-to-think-vectorized)\n\n\n**Explore subsequent eBooks in the collection >**\n\n- The Delta Lake Series — Fundamentals and Performance\n\n- The Delta Lake Series — Features\n\n- The Delta Lake Series — Streaming\n\n- The Delta Lake Series — Customer Use Cases\n\n\n\n- [MLflow, Delta Lake and Lakehouse Use Cases Meetup and AMA](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup)\n\n- [Common Strategies for Improving Performance on Your Delta Lakehouse](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n\n\n\n- [Achieving Lakehouse Models With Spark 3.0](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0)\n\n- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n\n\n**Do a deep dive into Delta Lake >**\n\n- [Analytics on the Data Lake With Tableau and the Lakehouse Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n\n- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n\n\n**Vodcasts and podcasts >**\n\n\n\n- [Welcome to Lakehouse. Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "b39d47a11f4d8f74a085216623bd80f9", + "- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n\n\n**Do a deep dive into Delta Lake >**\n\n- [Analytics on the Data Lake With Tableau and the Lakehouse Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n\n- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n\n\n**Vodcasts and podcasts >**\n\n\n\n- [Welcome to Lakehouse. 
Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)\n\n- [Data Brew by Databricks | Season 1: Lakehouses](https://databricks.com/discover/data-brew)\n\n\n**[Try Databricks for free >](https://databricks.com/product/delta-lake-on-databricks)**\n**[Learn more >](https://databricks.com/product/delta-lake-on-databricks)**\n\n\n\n- [Data Alone Is Not Enough: The Evolution of Data Architectures](https://a16z.com/2020/10/22/data-alone-is-not-enough-the-evolution-of-data-architectures/)\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "608c4c66830fb225969000b48507233b", + "### eBook\n\n# The Big Book\n of MLOps\n\n#### A data-centric approach\n to build and scale AI,\n including LLMOps\n\nM o d e l O p s D a t a O p s D e �O p s\n\n\n-----\n\n## Contents\n\n**A U T H O R S :**\n\n**Joseph Bradley**\n\nLead Product Specialist\n\n**Rafi Kurlansik**\n\nLead Product Specialist\n\n**Matt Thomson**\n\nDirector, EMEA Product Specialists\n\n**Niall Turbitt**\n\nLead Data Scientist\n\n\n**C H A P T E R 1 :** \u0007 **Introduction** 3\n\n###### People and process 4\n\n People 5\n\n Process 6\n\n Why should I care about MLOps? 8\n\n Guiding principles 9\n\n**C H A P T E R 2 :** \u0007 **Fundamentals of MLOps** 11\n\n###### Semantics of dev, staging and prod 11\n\n ML deployment patterns 15\n\n**C H A P T E R 3 :** **MLOps Architecture and Process** \u0007 19\n\n###### Architecture components 19\n\n Data Lakehouse 19\n\n MLflow 19\n\n Databricks and MLflow Autologging 20\n\n Feature Store 20\n\n MLflow Model Serving 20\n\n Databricks SQL 20\n\n Databricks Workflows and Jobs 20\n\n Reference architecture 21\n\n Overview 22\n\n Dev 23\n\n Staging 27\n\n Prod 30\n\n**C H A P T E R 4 :** \u0007 **LLMOps – Large Language Model Operations** 36\n\n###### Discussion of key topics for LLMOps 39\n\n Reference architecture 46\n\n Looking ahead 48\n\n\n-----\n\n**CHAPTER 1:**\n## Introduction\n\n**Note:** Our prescription for MLOps is general to\n\nany set of tools and applications, though we give\n\nconcrete examples using Databricks features\n\nand functionality. We also note that no single\n\narchitecture or prescription will work for all\n\norganizations or use cases. Therefore, while we\n\nprovide guidelines for building MLOps, we call out\n\nimportant options and variations. This whitepaper\n\nis written primarily for ML engineers and data\n\nscientists wanting to learn more about MLOps,\n\nwith high-level guidance and pointers to more\n\nresources.\n\n\nThe past decade has seen rapid growth in the adoption of machine learning (ML). While the early\n\nadopters were a small number of large technology companies that could afford the necessary resources,\n\nin recent times ML-driven business cases have become ubiquitous in all industries. 
Indeed, according to\n\nMIT Sloan Management Review, 83% of CEOs report that [artificial intelligence (AI) is a strategic priority](https://sloanreview.mit.edu/projects/artificial-intelligence-in-business-gets-real/) .\n\nThis democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n\n[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n\nHowever, building and deploying ML models is complex. There are many options available for achieving\n\nthis but little in the way of well-defined and accessible standards. As a result, over the past few years we", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "776424d2ba7780c9b9a590ec888d5154", + "This democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n\n[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n\nHowever, building and deploying ML models is complex. There are many options available for achieving\n\nthis but little in the way of well-defined and accessible standards. As a result, over the past few years we\n\nhave seen the emergence of the machine learning operations (MLOps) field. **MLOps is a set of processes**\n\n**and automation for managing models, data and code to improve performance stability and long-term**\n\n**efficiency in ML systems.** Put simply, MLOps = [ModelOps](https://en.wikipedia.org/wiki/ModelOps) + [DataOps](https://en.wikipedia.org/wiki/DataOps) + [DevOps](https://en.wikipedia.org/wiki/DevOps) .\n\nThe concept of developer operations (DevOps) is nothing new. It has been used for decades to deploy\n\nsoftware applications, and the deployment of ML applications has much to gain from it. However, strong\n\nDevOps practices and tooling alone are insufficient because ML applications rely on a constellation of\n\nartifacts (e.g., models, data, code) that require special treatment. Any MLOps solution must take into\n\naccount the various people and processes that interact with these artifacts.\n\nHere at Databricks we have seen firsthand how customers develop their MLOps approaches, some of\n\nwhich work better than others. We launched the open source [MLflow](https://www.mlflow.org/) project to help make our customers\n\nsuccessful with MLOps, and with over 10 million downloads/month from PyPI as of May 2022, MLflow’s\n\nadoption is a testament to the appetite for operationalizing ML models.\n\nThis whitepaper aims to explain how your organization can build robust MLOps practices incrementally.\n\nFirst, we describe the people and process involved in deploying ML applications and the need for\n\noperational rigor. We also provide general principles to help guide your planning and decision-making. 
Next,\n\nwe go through the fundamentals of MLOps, defining terms and broad strategies for deployment. Finally, we\n\nintroduce a general MLOps reference architecture, the details of its processes, and best practices.\n\n\n-----\n\n#### People and process\n\n**M L W O R K F L O W A N D P E R S O N A S**\n\nData Governance Officer\n\nDat1\nData Scientist\nEngineer\n\nML Engineer\n\nBusiness Stakeholder\n\n\nDataa\nPreparation\n\n\nEvplorator{a\nData unal{sis\n\n\nFeature Mode� Modela Deplo{�ent\nEngineering Training Validation\n\n\nMode� Modela Deplo{�ent Monitoring\nTraining Validation\n\n\nModela\nValidation\n\n\n**Figure 1**\n\n\n-----\n\n#### People\n\nBuilding ML applications is a team sport, and while in the real world people “wear many hats,” it is still\n\nuseful to think in terms of archetypes. They help us understand roles and responsibilities and where\n\nhandoffs are required, and they highlight areas of complexity within the system. We distinguish between\n\nthe following personas:\n\n**M L P E R S O N A S**\n\n\nData\nGovernance\nOfficer\n\nResponsible for ensuring\n\nthat data governance,\n\ndata privacy and other\n\ncompliance measures are\n\nadhered to across the\n\nmodel development and", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "d8cb9d1ab8a5b591f0afe52543484c63", + "ML Engineer\n\nBusiness Stakeholder\n\n\nDataa\nPreparation\n\n\nEvplorator{a\nData unal{sis\n\n\nFeature Mode� Modela Deplo{�ent\nEngineering Training Validation\n\n\nMode� Modela Deplo{�ent Monitoring\nTraining Validation\n\n\nModela\nValidation\n\n\n**Figure 1**\n\n\n-----\n\n#### People\n\nBuilding ML applications is a team sport, and while in the real world people “wear many hats,” it is still\n\nuseful to think in terms of archetypes. They help us understand roles and responsibilities and where\n\nhandoffs are required, and they highlight areas of complexity within the system. We distinguish between\n\nthe following personas:\n\n**M L P E R S O N A S**\n\n\nData\nGovernance\nOfficer\n\nResponsible for ensuring\n\nthat data governance,\n\ndata privacy and other\n\ncompliance measures are\n\nadhered to across the\n\nmodel development and\n\ndeployment process. Not\n\ntypically involved in day-to-\n\nday operations.\n\n\nData\nEngineer\n\nResponsible for building\n\ndata pipelines to process,\n\norganize and persist data\n\nsets for machine learning\n\nand other downstream\n\napplications.\n\n\nData\nScientist\n\nResponsible for\n\nunderstanding the business\n\nproblem, exploring available\n\ndata to understand\n\nif machine learning is\n\napplicable, and then training,\n\ntuning and evaluating a\n\nmodel to be deployed.\n\n\nML\nEngineer\n\nResponsible for deploying\n\nmachine learning models to\n\nproduction with appropriate\n\ngovernance, monitoring and\n\nsoftware development best\n\npractices such as continuous\n\nintegration and continuous\n\ndeployment ( [CI/CD](https://en.wikipedia.org/wiki/CI/CD) ).\n\n\nBusiness\nStakeholder\n\nResponsible for using the\n\nmodel to make decisions for\n\nthe business or product, and\n\nresponsible for the business\n\nvalue that the model is\n\nexpected to generate.\n\n\n-----\n\n#### Process\n\nTogether, these people develop and maintain ML applications. While the development process follows\n\na distinct pattern, it is not entirely monolithic. 
The way you deploy a model has an impact on the steps\n\nyou take, and using techniques like reinforcement learning or online learning will change some details.\n\nNevertheless, these steps and personas involved are variations on a core theme, as illustrated in Figure 1\n\nabove.\n\nLet’s walk through the process step by step. Keep in mind that this is an iterative process, the frequency of\n\nwhich will be determined by the particular business case and data.\n\n**M L P R O C E S S**\n\n\nData\nPreparation\n\n\nExploratory\nData Analysis\n\n\nFeature\nEngineering\n\n\nModel\nTraining\n\n\nModel\nValidation\n\n\nDeployment Monitoring\n\n\n###### Data preparation\n\nPrior to any data science or ML work lies the data engineering needed to prepare production data and make\n\nit available for consumption. This data may be referred to as “raw data,” and in later steps, data scientists\n\nwill extract features and labels from the raw data.\n\n###### Exploratory data analysis (EDA)\n\nAnalysis is conducted by data scientists to assess statistical properties of the data available, and determine\n\nif they address the business question. This requires frequent communication and iteration with business\n\nstakeholders.\n\n\n-----\n\n###### Feature engineering\n\nData scientists clean data and apply business logic and specialized transformations to engineer features for\n\nmodel training. These data, or features, are split into training, testing and validation sets.\n\n###### Model training\n\nData scientists explore multiple algorithms and hyperparameter configurations using the prepared data, and", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "708e5d98e42b111c0df5b6d124aaf98a", + "Model\nTraining\n\n\nModel\nValidation\n\n\nDeployment Monitoring\n\n\n###### Data preparation\n\nPrior to any data science or ML work lies the data engineering needed to prepare production data and make\n\nit available for consumption. This data may be referred to as “raw data,” and in later steps, data scientists\n\nwill extract features and labels from the raw data.\n\n###### Exploratory data analysis (EDA)\n\nAnalysis is conducted by data scientists to assess statistical properties of the data available, and determine\n\nif they address the business question. This requires frequent communication and iteration with business\n\nstakeholders.\n\n\n-----\n\n###### Feature engineering\n\nData scientists clean data and apply business logic and specialized transformations to engineer features for\n\nmodel training. These data, or features, are split into training, testing and validation sets.\n\n###### Model training\n\nData scientists explore multiple algorithms and hyperparameter configurations using the prepared data, and\n\na best-performing model is determined according to predefined evaluation metric(s).\n\n###### Model validation\n\nPrior to deployment a selected model is subjected to a validation step to ensure that it exceeds\n\nsome baseline level of performance, in addition to meeting any other technical, business or regulatory\n\nrequirements. 
This necessitates collaboration between data scientists, business stakeholders and ML\n\nengineers.\n\n###### Deployment\n\nML engineers will deploy a validated model via batch, streaming or online serving, depending on the\n\nrequirements of the use case.\n\n###### Monitoring\n\nML engineers will monitor deployed models for signs of performance degradation or errors. Data scientists\n\nwill often be involved in early monitoring phases to ensure that new models perform as expected after\n\ndeployment. This will inform if and when the deployed model should be updated by returning to earlier\n\nstages in the workflow.\n\nThe data governance officer is ultimately responsible for making sure this entire process is compliant with\n\ncompany and regulatory policies.\n\n\n-----\n\n#### Why should I care about MLOps?\n\nConsider that the typical ML application depends on the aforementioned people and process, as well\n\nas regulatory and ethical requirements. These dependencies change over time — and your models, data\n\nand code must change as well. The data that were a reliable signal yesterday become noise; open source\n\nlibraries become outdated; regulatory environments evolve; and teams change. ML systems must be\n\nresilient to these changes. Yet this broad scope can be a lot for organizations to manage — there are many\n\nmoving parts! Addressing these challenges with a defined MLOps strategy can dramatically reduce the\n\niteration cycle of delivering models to production, thereby accelerating time to business value.\n\nThere are two main types of risk in ML systems: **technical risk** inherent to the system itself and **risk of**\n\n**noncompliance** with external systems. Both of these risks derive from the dependencies described above.\n\nFor example, if data pipeline infrastructure, KPIs, model monitoring and documentation are lacking, then you\n\nrisk your system becoming destabilized or ineffective. On the other hand, even a well-designed system that\n\nfails to comply with corporate, regulatory and ethical requirements runs the risk of losing funding, receiving\n\nfines or incurring reputational damage. Recently, one private company’s data collection practices were\n\nfound to have violated the Children’s Online Privacy Protection Rule (COPPA). The [FTC fined](https://www.protocol.com/policy/ftc-algorithm-destroy-data-privacy) the company\n\n$1.5 million and [ordered](https://www.ftc.gov/system/files/ftc_gov/pdf/wwkurbostipulatedorder.pdf) it to destroy or delete the illegally harvested data, and all models or algorithms\n\ndeveloped with that data.\n\nWith respect to efficiency, the absence of MLOps is typically marked by an overabundance of manual", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "da9afac987d642e401fde2894c10390d", + "For example, if data pipeline infrastructure, KPIs, model monitoring and documentation are lacking, then you\n\nrisk your system becoming destabilized or ineffective. On the other hand, even a well-designed system that\n\nfails to comply with corporate, regulatory and ethical requirements runs the risk of losing funding, receiving\n\nfines or incurring reputational damage. Recently, one private company’s data collection practices were\n\nfound to have violated the Children’s Online Privacy Protection Rule (COPPA). 
The [FTC fined](https://www.protocol.com/policy/ftc-algorithm-destroy-data-privacy) the company\n\n$1.5 million and [ordered](https://www.ftc.gov/system/files/ftc_gov/pdf/wwkurbostipulatedorder.pdf) it to destroy or delete the illegally harvested data, and all models or algorithms\n\ndeveloped with that data.\n\nWith respect to efficiency, the absence of MLOps is typically marked by an overabundance of manual\n\nprocesses. These steps are slower and more prone to error, affecting the quality of models, data and code.\n\nEventually they form a bottleneck, capping the ability for a data team to take on new projects.\n\nSeen through these lenses, the aim of MLOps becomes clear: improve the long-term performance\n\nstability and success rate of ML systems while maximizing the efficiency of teams who build them. In the\n\nintroduction, we defined MLOps to address this aim: MLOps is a **set of processes and automation** to\n\nmanage **models, data and code** to meet the two goals of **stable performance and long-term efficiency in**\n\n**ML systems** . _MLOps = ModelOps + DataOps + DevOps_ .\n\nWith clear goals we are ready to discuss principles that guide design decisions and planning for MLOps\n\n\nM o d e l O p s D a t a O p s D e �O p s\n\n\n-----\n\nGiven the complexity of ML\n\nprocesses and the different personas\n\ninvolved, it is helpful to start from\n\nsimpler, high-level guidance. We\n\npropose several broadly applicable\n\nprinciples to guide MLOps decisions.\n\nThey inform our design choices in\n\nlater sections, and we hope they can\n\nbe adapted to support whatever your\n\n\n#### Guiding principles\n\n###### Always keep your business goals in mind\n\nJust as the core purpose of ML in a business is to enable data-driven decisions and products, the core\n\npurpose of MLOps is to ensure that those data-driven applications remain stable, are kept up to date and\n\ncontinue to have positive impacts on the business. When prioritizing technical work on MLOps, consider the\n\nbusiness impact: Does it enable new business use cases? Does it improve data teams’ productivity? Does it\n\nreduce operational costs or risks?\n\n###### Take a data-centric approach to machine learning\n\nFeature engineering, training, inference and monitoring pipelines are data pipelines. As such, they need to be\n\nas robust as other production data engineering processes. Data quality is crucial in any ML application, so\n\nML data pipelines should employ systematic approaches to monitoring and mitigating data quality issues.\n\nAvoid tools that make it difficult to join data from ML predictions, model monitoring, etc., with the rest of\n\nyour data. The simplest way to achieve this is to develop ML applications on the same platform used to\n\nmanage production data. For example, instead of downloading training data to a laptop, where it is hard\n\nto govern and reproduce results, secure the data in cloud storage and make that storage available to your\n\ntraining process.\n\n\nbusiness use case may be.\n\n\n-----\n\n###### \u0007Implement MLOps in a modular fashion\n\nAs with any software application, code quality is paramount for an ML application. Modularized code\n\nenables testing of individual components and mitigates difficulties with future code refactoring. 
Define", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "62b4254e22c0941dfae77299d22de1f7", + "Feature engineering, training, inference and monitoring pipelines are data pipelines. As such, they need to be\n\nas robust as other production data engineering processes. Data quality is crucial in any ML application, so\n\nML data pipelines should employ systematic approaches to monitoring and mitigating data quality issues.\n\nAvoid tools that make it difficult to join data from ML predictions, model monitoring, etc., with the rest of\n\nyour data. The simplest way to achieve this is to develop ML applications on the same platform used to\n\nmanage production data. For example, instead of downloading training data to a laptop, where it is hard\n\nto govern and reproduce results, secure the data in cloud storage and make that storage available to your\n\ntraining process.\n\n\nbusiness use case may be.\n\n\n-----\n\n###### \u0007Implement MLOps in a modular fashion\n\nAs with any software application, code quality is paramount for an ML application. Modularized code\n\nenables testing of individual components and mitigates difficulties with future code refactoring. Define\n\nclear steps (e.g., training, evaluation or deployment), supersteps (e.g., training-to-deployment pipeline) and\n\nresponsibilities to clarify the modular structure of your ML application.\n\n###### Process should guide automation\n\nWe automate processes to improve productivity and lower risk of human error, but not every step of a\n\nprocess can or should be automated. People still determine the business question, and some models will\n\nalways need human oversight before deployment. Therefore, the development process is primary and each\n\nmodule in the process should be automated as needed. This allows incremental build-out of automation\n\nand customization. Furthermore, when it comes to particular automation tools, choose those that align to\n\nyour people and process. For example, instead of building a model logging framework around a generic\n\ndatabase, you can choose a specialized tool like MLflow, which has been designed with the ML model\n\nlifecycle in mind.\n\n\n-----\n\n**CHAPTER 2:**\n## Fundamentals of MLOps\n\n**Note:** In our experience with customers, there\n\ncan be variations in these three stages, such as\n\nsplitting staging into separate “test” and “QA”\n\nsubstages. However, the principles remain the\n\nsame and we stick to a dev, staging and prod\n\nsetup within this paper.\n\n\n#### Semantics of dev, staging and prod\n\nML workflows include the following key assets: code, models and data. These assets need to be developed\n\n(dev), tested (staging) and deployed (prod). For each stage, we also need to operate within an execution\n\nenvironment. Thus, all the above — execution environments, code, models and data — are divided into dev,\n\nstaging and prod.\n\nThese divisions can best be understood in terms of quality guarantees and access control. On one end,\n\nassets in prod are generally business critical, with the highest guarantee of quality and tightest control on\n\nwho can modify them. Conversely, dev assets are more widely accessible to people but offer no guarantee\n\nof quality.\n\nFor example, many data scientists will work together in a dev environment, freely producing dev model\n\nprototypes. 
Any flaws in these models are relatively low risk for the business, as they are separate from\n\nthe live product. In contrast, the staging environment replicates the execution environment of production.\n\nHere, code changes made in the dev environment are tested prior to code being deployed to production.\n\nThe staging environment acts as a gateway for code to reach production, and accordingly, fewer people\n\nare given access to staging. Code promoted to production is considered a live product. In the production\n\nenvironment, human error can pose the greatest risk to business continuity, and so the least number of\n\npeople have permission to modify production models.\n\nOne might be tempted to say that code, models and data each share a one-to-one correspondence with\n\nthe execution environment — e.g., all dev code, models and data are in the dev environment. That is often", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "85197469b7851eb47251975aad626a49", + "who can modify them. Conversely, dev assets are more widely accessible to people but offer no guarantee\n\nof quality.\n\nFor example, many data scientists will work together in a dev environment, freely producing dev model\n\nprototypes. Any flaws in these models are relatively low risk for the business, as they are separate from\n\nthe live product. In contrast, the staging environment replicates the execution environment of production.\n\nHere, code changes made in the dev environment are tested prior to code being deployed to production.\n\nThe staging environment acts as a gateway for code to reach production, and accordingly, fewer people\n\nare given access to staging. Code promoted to production is considered a live product. In the production\n\nenvironment, human error can pose the greatest risk to business continuity, and so the least number of\n\npeople have permission to modify production models.\n\nOne might be tempted to say that code, models and data each share a one-to-one correspondence with\n\nthe execution environment — e.g., all dev code, models and data are in the dev environment. That is often\n\nclose to true but is rarely correct. Therefore, we will next discuss the precise semantics of dev, staging\n\nand prod for execution environments, code, models and data. We also discuss mechanisms for restricting\n\naccess to each.\n\n\n-----\n\n###### Execution environments\n\nAn execution environment is the place where models and data are created or consumed by code. Each\n\nexecution environment consists of compute instances, their runtimes and libraries, and automated jobs.\n\nWith Databricks, an “environment” can be defined via dev/staging/prod separation at a few levels. An\n\norganization could create distinct environments across multiple cloud accounts, multiple Databricks\n\nworkspaces in the same cloud account, or within a single Databricks workspace. These separation patterns\n\nare illustrated in Figure 2 below.\n\n**E N V I R O N M E N T S E P A R AT I O N P AT T E R N S**\n\n\nMultiple clou$\naccounts\n\nstaging\n\nprod\n\n\nMultiple Databricks\nworkspaces\n\nstaging\n\nprod\n\n\nDatabricks workspace\naccess controls\n\n\ndev\n\nstaging\n\nprod\n\n\ndev\n\n\ndev\n\n\n**Figure 2**\n\n\n-----\n\nDatabricks released Delta Lake to the open source\n\ncommunity in 2019. 
Delta Lake provides all the data\n\n\n###### Code\n\nML project code is often stored in a version control repository (such as Git), with most organizations\n\nusing branches corresponding to the lifecycle phases of development, staging or production. There are a\n\nfew common patterns. Some use only development branches (dev) and one main branch (staging/prod).\n\nOthers use main and development branches (dev), branches cut for testing potential releases (staging), and\n\nbranches cut for final releases (prod). Regardless of which convention you choose, separation is enforced\n\nthrough Git repository branches.\n\n\nlifecycle management functions that are needed\n\n\nto make cloud-based object stores reliable and\n\nperformant. This design allows clients to update\n\nmultiple objects at once and to replace a subset\n\nof the objects with another, etc., in a serializable\n\nmanner that still achieves high parallel read/write\n\nperformance from the objects — while offering\n\nadvanced capabilities like time travel (e.g., query\n\n\nAs a best practice, code should only be run in an execution environment that corresponds to it or in one\n\nthat’s higher. For example, the dev environment can run any code, but the prod environment can only run\n\nprod code.\n\n###### Models\n\nWhile models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to**\n\n**note that model and code lifecycle phases often operate asynchronously** . That is, you may want to push", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "da9dc2dae18bd9c1aa982e446461522c", + "through Git repository branches.\n\n\nlifecycle management functions that are needed\n\n\nto make cloud-based object stores reliable and\n\nperformant. This design allows clients to update\n\nmultiple objects at once and to replace a subset\n\nof the objects with another, etc., in a serializable\n\nmanner that still achieves high parallel read/write\n\nperformance from the objects — while offering\n\nadvanced capabilities like time travel (e.g., query\n\n\nAs a best practice, code should only be run in an execution environment that corresponds to it or in one\n\nthat’s higher. For example, the dev environment can run any code, but the prod environment can only run\n\nprod code.\n\n###### Models\n\nWhile models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to**\n\n**note that model and code lifecycle phases often operate asynchronously** . That is, you may want to push\n\na new model version before you push a code change, and vice versa. Consider the following scenarios:\n\n\npoint-in-time snapshots or rollback of erroneous\n\n\n\u0007To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. Deploying\n\nthe code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle\n\nof being generated, tested and marked as “production” to predict on the most recent transactions. In\n\nthis case the code lifecycle is slower than the model lifecycle.\n\n\u0007To classify documents using large deep neural networks, training and deploying the model is often a one-\n\ntime process due to cost. Updates to the serving and monitoring code in the project may be deployed\n\nmore frequently than a new version of the model. 
In this case the model lifecycle is slower than the code.\n\nSince model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model\n\nmanagement to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts\n\ndirectly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update\n\nproduction models without code changes, streamlining the deployment process in many cases. Model\n\nartifacts are secured using MLflow access controls or cloud storage permissions\n\n\nupdates), automatic data layout optimization,\n\nupserts, caching and audit logs.\n\n\n-----\n\n###### Data\n\nSome organizations label data as either dev, staging or prod, depending on which environment it originated\n\nin. For example, all prod data is produced in the prod environment, but dev and staging environments may\n\nhave read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data\n\nmay be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around\n\n\nreliability and freshness. Access to data in each environment is controlled with table access controls\n\n( [AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html) ) or cloud storage permissions.\n| |\n\nIn summary, when it comes to MLOps, you will always have operational separation between dev, staging and\n\nprod. Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod\n\nwill be the highest quality and tightly controlled.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "ebfebb0d787bbc4bd1401beb141a6e30", + "may be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around\n\n\nreliability and freshness. Access to data in each environment is controlled with table access controls\n\n( [AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html) ) or cloud storage permissions.\n| |\n\nIn summary, when it comes to MLOps, you will always have operational separation between dev, staging and\n\nprod. 
Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod\n\nwill be the highest quality and tightly controlled.\n\n\n\n\n\n\n\n\n|ASSET|SEMANTICS|SEPARATED BY|\n|---|---|---|\n|Execution environments|Labeled according to where development, testing and connections with production systems happen|Cloud provider and Databricks Workspace access controls|\n|Models|Labeled according to model lifecycle phase|MLflow access controls or cloud storage permissions|\n|Data|Labeled according to its origin in dev, staging or prod execution environments|Table access controls or cloud storage permissions|\n|Code|Labeled according to software development lifecycle phase|Git repository branches|\n\n\n**Table 1**\n\n\n-----\n\n#### ML deployment patterns\n\nThe fact that models and code can be managed separately results in multiple possible patterns for getting\n\nML artifacts through staging and into production. We explain two major patterns below.\n\n**D E P L O Y M O D E L S**\n\ndev staging prod\n\n**D E P L O Y C O D E**\n\ndev staging prod\n\nThese two patterns differ in terms of whether the model artifact or the training code that produces the\n\nmodel artifact is promoted toward production.\n\n\n-----\n\n###### Deploy models\n\nIn the first pattern, the model artifact is generated by training code in the development environment.\n\nThis artifact is then tested in staging for compliance and performance before finally being deployed into\n\nproduction. This is a simpler handoff for data scientists, and in cases where model training is prohibitively\n\nexpensive, training the model once and managing that artifact may be preferable. However, this simpler\n\narchitecture comes with limitations. If production data is not accessible from the development environment\n\n(e.g., for security reasons), this architecture may not be viable. This architecture does not naturally support\n\nautomated model retraining. While you could automate retraining in the development environment, you\n\nwould then be treating “dev” training code as production ready, which many deployment teams would not\n\naccept. This option hides the fact that ancillary code for featurization, inference and monitoring needs to be\n\ndeployed to production, requiring a separate code deployment path.\n\n###### Deploy code\n\nIn the second pattern, the code to train models is developed in the dev environment, and this code is\n\nmoved to staging and then production. Models will be trained in each environment: initially in the dev\n\nenvironment as part of model development, in staging (on a limited subset of data) as part of integration\n\ntests, and finally in the production environment (on the full production data) to produce the final model.\n\nIf an organization restricts data scientists’ access to production data from dev or staging environments,\n\ndeploying code allows training on production data while respecting access controls. Since training code\n\ngoes through code review and testing, it is safer to set up automated retraining. Ancillary code follows the\n\nsame pattern as model training code, and both can go through integration tests in staging. However, the\n\nlearning curve for handing code off to collaborators can be steep for many data scientists, so opinionated\n\nproject templates and workflows are helpful. 
Finally, data scientists need visibility into training results from", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "f583634e6cf80e00bbe691fdbfe7ca94", + "###### Deploy code\n\nIn the second pattern, the code to train models is developed in the dev environment, and this code is\n\nmoved to staging and then production. Models will be trained in each environment: initially in the dev\n\nenvironment as part of model development, in staging (on a limited subset of data) as part of integration\n\ntests, and finally in the production environment (on the full production data) to produce the final model.\n\nIf an organization restricts data scientists’ access to production data from dev or staging environments,\n\ndeploying code allows training on production data while respecting access controls. Since training code\n\ngoes through code review and testing, it is safer to set up automated retraining. Ancillary code follows the\n\nsame pattern as model training code, and both can go through integration tests in staging. However, the\n\nlearning curve for handing code off to collaborators can be steep for many data scientists, so opinionated\n\nproject templates and workflows are helpful. Finally, data scientists need visibility into training results from\n\nthe production environment, for only they have the knowledge to identify and fix ML-specific issues.\n\n\n-----\n\nThe diagram below contrasts the code lifecycle for the above deployment patterns across the different\n\nexecution environments.\n\n\nCode\ndevelopment\n\nDevelopment\nenvironment\n\n\nUnit\ntests\n\n\nIntegration\ntests\n\nDevelopment\nenvironment\n\nStaging\nenvironment\n\n\nModel\ntraining\n\n\nContinuous\ndeployment\n\nStaging\nenvironment\n\nProduction\nenvironment\n\n\nDeploy\npipelines\n\nProduction\nenvironment\n\n\n#### Deploy models\n\n Deploy code\n\n\n**In general we recommend following the “deploy code” approach, and the reference architecture in**\n\n**this document is aligned to it.** Nevertheless, there is no perfect process that covers every scenario, and\n\nthe options outlined above are not mutually exclusive. Within a single organization, you may find some use\n\ncases deploying training code and others deploying model artifacts. Your choice of process will depend on\n\nthe business use case, resources available and what is most likely to succeed.\n\n\n-----\n\n|Col1|Col2|DEPLOY MODELS|DEPLOY CODE|\n|---|---|---|---|\n|Process|Dev|Develop training code. Develop ancillary code.1 Train model on prod data.  Promote model and ancillary code.|Develop training code. Develop ancillary code.  Promote code.|\n||Staging|Test model and ancillary code.  Promote model and ancillary code.|Train model on data subset. Test ancillary code.  Promote code.|\n||Prod|Deploy model. Deploy ancillary pipelines.|Train model on prod data. Test model. Deploy model. 
Deploy ancillary pipelines.|\n|Trade-offs|Automation| Does not support automated retraining in locked-down env.| Supports automated retraining in locked-down env.|\n||Data access control| Dev env needs read access to prod training data.| Only prod env needs read access to prod training data.|\n||Reproducible models| Less eng control over training env, so harder to ensure reproducibility.| Eng control over training env, which helps to simplify reproducibility.|\n||Data science familiarity| DS team builds and can directly test models in their dev env.| DS team must learn to write and hand off modular code to eng.|\n||Support for large projects| T\u0007his pattern does not force the DS team to use modular code for model training, and it has less iterative testing.| \u0007This pattern forces the DS team to use modular code and iterative testing, which helps with coordination and development in larger projects.|\n||Eng setup and maintenance| Has the simplest setup, with less CI/CD infra required.| \u0007Requires CI/CD infra for unit and integration tests, even for one-off models.|\n|When to use||Use this pattern when your model is a one-off or when model training is very expensive. Use when dev, staging and prod are not strictly separated envs.|Use this pattern by default. Use when dev, staging and prod are strictly separated envs.|", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "645ba7a46c6551d1fc243055ca207265", + "**Table 2** **1** “\u0007Ancillary code” refers to code for ML pipelines other than the model training pipeline. Ancillary code could be featurization, inference, monitoring or other pipelines.\n\n\n-----\n\n**CHAPTER 3:**\n## MLOps Architecture\n and Process\n\n###### Lakehouse Platform\n\n\n#### Architecture components\n\nBefore unpacking the reference architecture, take a moment to familiarize yourself with the Databricks\n\nfeatures used to facilitate MLOps in the workflow prescribed.\n\n###### Data Lakehouse\n\nA [Data Lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) unifies the best elements of data lakes and data warehouses — delivering\n\ndata management and performance typically found in data warehouses with the low-cost, flexible object\n\nstores offered by data lakes. Data in the lakehouse are typically organized using a “medallion” architecture\n\nof Bronze, Silver and Gold tables of increasing refinement and quality.\n\n###### MLflow\n\n[MLflow](https://www.mlflow.org/) is an open source project for managing the end-to-end machine learning lifecycle. It has the\n\nfollowing primary components:\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData S�ien��\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData relia)ility and .erfor2ance\n\nCloud Data Lake\nAll structured and unstructured data\n\n\n\u0007 **Tracking:** Allows you to track experiments to record and compare parameters, metrics and model\n\nartifacts. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/tracking.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking) [GCP](https://docs.gcp.databricks.com/applications/mlflow/tracking.html) .\n| |\n\n\n\u0007 **Models (“MLflow flavors”):** Allows you to store and deploy models from any ML library to a variety of\n\nmodel serving and inference platforms. 
See documentation for [AWS](https://docs.databricks.com/applications/mlflow/models.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/models) [GCP](https://docs.gcp.databricks.com/applications/mlflow/models.html) .\n| |\n\n\u0007 **Model Registry:** Provides a centralized model store for managing models’ full lifecycle stage transitions:\n\n\nfrom staging to production, with capabilities for versioning and annotating. The registry also provides\n\nwebhooks for automation and continuous deployment. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-registry.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-registry.html) .\n| |\n\nDatabricks also provides a fully managed and hosted version of MLflow with enterprise security features,\n\nhigh availability, and other Databricks workspace features such as experiment and run management and\n\nnotebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\n\nmachine learning model training runs and running machine learning projects.\n\n\n-----\n\n###### Databricks and MLflow Autologging\n\nDatabricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\n\nexperiment tracking for machine learning training sessions on Databricks. Databricks Autologging", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "bfb585054ac3d95182da37f6cea4c11b", + "Databricks also provides a fully managed and hosted version of MLflow with enterprise security features,\n\nhigh availability, and other Databricks workspace features such as experiment and run management and\n\nnotebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\n\nmachine learning model training runs and running machine learning projects.\n\n\n-----\n\n###### Databricks and MLflow Autologging\n\nDatabricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\n\nexperiment tracking for machine learning training sessions on Databricks. Databricks Autologging\n\n\nautomatically captures model parameters, metrics, files and lineage information when you train models with\n\ntraining runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html) .\n| |\n\n###### Feature Store\n\nThe Databricks Feature Store is a centralized repository of features. It enables feature sharing and discovery\n\n\nacross an organization and also ensures that the same feature computation code is used for model training\n\nand inference. 
See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html) .\n| |\n\n###### MLflow Model Serving\n\nMLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints\n\n\nthat are updated automatically based on the availability of model versions and their stages. See\n\ndocumentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html) .\n| |\n\n###### Databricks SQL\n\nDatabricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their\n\n\ndata lake, create multiple visualization types to explore query results from different perspectives, and build\n\nand share dashboards. See documentation for [AWS](https://docs.databricks.com/sql/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/sql/) [GCP](https://docs.gcp.databricks.com/sql/index.html) .\n| |\n\n###### Databricks Workflows and Jobs\n\nDatabricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive\n\n\nways. For ML, Jobs can be used to define pipelines for computing features, training models, or other ML\n\nsteps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html) .\n| |\n\n\n-----\n\n#### Reference architecture", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "a1fd060419f5e0d621ce925e95993752", + "###### Databricks Workflows and Jobs\n\nDatabricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive\n\n\nways. For ML, Jobs can be used to define pipelines for computing features, training models, or other ML\n\nsteps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html) .\n| |\n\n\n-----\n\n#### Reference architecture\n\nWe are now ready to review a general reference architecture for implementing MLOps on the Databricks\n\nLakehouse platform using the recommended “deploy code” pattern from earlier. This is intended to cover\n\nthe majority of use cases and ML techniques, but it is by no means comprehensive. When appropriate,\n\nwe will highlight alternative approaches to implementing different parts of the process.\n\nWe begin with an overview of the system end-to-end, followed by more detailed views of the process\n\nin development, staging and production environments. These diagrams show the system as it operates\n\nin a steady state, with the finer details of iterative development cycles omitted. 
This structure is\n\nsummarized below.\n\n**O V E R V I E W**\n```\n dev staging prod\n\n```\n\n\u0007Data\n\n\u0007Exploratory data analysis (EDA)\n\n\u0007Project code\n\n\u0007Feature table refresh\n\n\u0007Model training\n\n\u0007Commit code\n\n\n\u0007Merge request\n\n\u0007Unit tests (CI)\n\n\u0007Integration tests (CI)\n\n\u0007Merge\n\n\u0007Cut release branch\n\n\n\u0007Feature table refresh\n\n\u0007Model training\n\n\u0007Continuous deployment (CD)\n\n\u0007Online serving (REST APIs)\n\n\u0007Inference: batch or streaming\n\n\u0007Monitoring\n\n\u0007Retraining\n\n\n-----\n\n###### Overview\n\nSource control\n\ndev staging (main) release\n\nMerge reIuest to staging Cut release branch Pull from release branch to production\n\n\n**Figure 3**\n\n\nDevelopment\nenvironment\n\nExploratory\ndata analysis\n\n\nStaging\nenvironment\n\nCreate dev branch Commit code C} trigger Merge\n\n\nProduction\nenvironment\n\nModel Registry\n\nSt�ge{ �one St�ge{ St�ging St�ge{ Production\n\n\n. . .\n\n\nInference & serving dev\n\nFeature table refresh dev\n\n\nUnit tests\n(CI)\n\n\nPush model to registr� Load model for testing Load model for inference\n\nIntegration\ntests (CI)\n\n\ndev\n\n\ndev\n\n\nPromote to production\n\n\nInference & serving\n\n\nModel training dev\n\nrelease\n\ndev\n\n\nFeature\ntable refresh\n\nrelease\n\n\nMode�\ntraining\n\nrelease\n\n\nContinuous\nDeployment (CD)\n\nrelease\n\n\nMonitoring\n\nrelease\n\n\nData tables Feature tables Feature tables Data tables Feature tables Metrics tables\n\nHere we see the overall process for deploying code and model artifacts, the inputs and outputs for pipelines,\n\nand model lifecycle stages in production. Code source control is the primary conduit for deploying ML\n\npipelines from development to production. Pipelines and models are prototyped on a dev branch in the\n\ndevelopment environment, and changes to the codebase are committed back to source control. Upon merge\n\nrequest to the staging branch (usually the “main” branch), a continuous integration (CI) process tests the\n\ncode in the staging environment. If the tests pass, new code can be deployed to production by cutting a\n\ncode release. In production, a model is trained on the full production data and pushed to the MLflow Model\n\nRegistry. A continuous deployment (CD) process tests the model and promotes it toward the production", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "94ff21f4dd51692d8c4759c20116d3a3", + "release\n\n\nMode�\ntraining\n\nrelease\n\n\nContinuous\nDeployment (CD)\n\nrelease\n\n\nMonitoring\n\nrelease\n\n\nData tables Feature tables Feature tables Data tables Feature tables Metrics tables\n\nHere we see the overall process for deploying code and model artifacts, the inputs and outputs for pipelines,\n\nand model lifecycle stages in production. Code source control is the primary conduit for deploying ML\n\npipelines from development to production. Pipelines and models are prototyped on a dev branch in the\n\ndevelopment environment, and changes to the codebase are committed back to source control. Upon merge\n\nrequest to the staging branch (usually the “main” branch), a continuous integration (CI) process tests the\n\ncode in the staging environment. If the tests pass, new code can be deployed to production by cutting a\n\ncode release. 
In production, a model is trained on the full production data and pushed to the MLflow Model\n\nRegistry. A continuous deployment (CD) process tests the model and promotes it toward the production\n\nstage in the registry. The Model Registry’s production model can be served via batch, streaming or REST API.\n\n\n-----\n\n###### Dev\n\nIn the development environment, data scientists and ML engineers can collaborate on all pipelines in\n\nan ML project, committing their changes to source control. While engineers may help to configure this\n\nenvironment, data scientists typically have significant control over the libraries, compute resources and\n\ncode that they use.\n\n\n**Figure 4** Development environment\n\n0�\n\nE�ploratory\ndata analysis\n\n0�\n\n\ndev\n\n\nSource control\n\nTracking Server\n\nMetrics Parameters Models\n\ndev\n\n\n. . .\n\nmodels\n\n\ntrain.py\n\ndeploy.py\n\nin(erence.py\n\nmonitoring.py\n\ndat<\n\n(eaturization.py\n\ntests\n\nunit.py\n\nintegration.py\n\n\nInference: Streaming or batch\n\n\nFeature table refresh\n\nData\nFeaturization\npreparation\n\n\nModel training\n\nTraining and\nEvaluation\ntuning\n\n\nCreate dev mrancg\n\n0u\n\nCommit code\n\n\n04\n\n\n\ndev\n\n\ndev\n\n\n0�\n\n\nLakehouse\n\n\nFeature tamles Bronze / Silver / Gold\n\nprod data\n\n\nFeature tamles Temp tamles\n\ndev data\n\n\n-----\n\n###### Data\n\nData scientists working in the dev environment possess read-only access to production data. They also\n\nrequire read-write access to a separate dev storage environment to develop and experiment with new\n\nfeatures and other data tables.\n\n###### Exploratory data analysis (EDA)\n\nThe data scientist explores and analyzes data in an interactive, iterative process. This process is used to\n\nassess whether the available data has the potential to address the business problem. EDA is also where the\n\ndata scientist will begin discerning what data preparation and featurization are required for model training.\n\nThis ad hoc process is generally not part of a pipeline that will be deployed in other execution environments.\n\n###### Project code\n\nThis is a code repository containing all of the pipelines or modules involved in the ML system. Dev branches\n\nare used to develop changes to existing pipelines or to create new ones. Even during EDA and initial phases of\n\na project, it is recommended to develop within a repository to help with tracking changes and sharing code.\n\n\n-----\n\n###### Feature table refresh\n\nThis pipeline reads from raw data tables and feature tables and writes to tables in the Feature Store. The\n\npipeline consists of two steps:\n\n\u0007 **Data preparation**\n\nThis step checks for and corrects any data quality issues prior to featurization.\n\n**\u0007Featurization**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "ca6e7ba0b61410c074994bf41992e225", + "data scientist will begin discerning what data preparation and featurization are required for model training.\n\nThis ad hoc process is generally not part of a pipeline that will be deployed in other execution environments.\n\n###### Project code\n\nThis is a code repository containing all of the pipelines or modules involved in the ML system. Dev branches\n\nare used to develop changes to existing pipelines or to create new ones. 
Even during EDA and initial phases of\n\na project, it is recommended to develop within a repository to help with tracking changes and sharing code.\n\n\n-----\n\n###### Feature table refresh\n\nThis pipeline reads from raw data tables and feature tables and writes to tables in the Feature Store. The\n\npipeline consists of two steps:\n\n\u0007 **Data preparation**\n\nThis step checks for and corrects any data quality issues prior to featurization.\n\n**\u0007Featurization**\n\nIn the dev environment, new features and updated featurization logic can be tested by writing to feature\n\ntables in dev storage, and these dev feature tables can be used for model prototyping. Once this\n\nfeaturization code is promoted to production, these changes will affect the production feature tables.\n\nFeatures already available in production feature tables can be read directly for development.\n\nIn some organizations, feature engineering pipelines are managed separately from ML projects. In such\n\ncases, the featurization pipeline can be omitted from this architecture.\n\n\n-----\n\n###### Model training\n\nData scientists develop the model training pipeline in the dev environment with dev or prod feature tables.\n\n\u0007 **Training and tuning**\n\nThe training process reads features from the feature store and/or Silver- or Gold-level Lakehouse tables,\n\nand it logs model parameters, metrics and artifacts to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . After training and\n\nhyperparameter tuning, the final model artifact is logged to the tracking server to record a robust link\n\nbetween the model, its input data, and the code used to generate it.\n\n**\u0007Evaluation**\n\nModel quality is evaluated by testing on held-out data. The results of these tests are logged to the\n\nMLflow tracking server.\n\nIf governance requires additional metrics or supplemental documentation about the model, this is the\n\ntime to add them using MLflow tracking. Model interpretations (e.g., plots produced by [SHAP](https://shap.readthedocs.io/en/latest/index.html) or [LIME](https://arxiv.org/abs/1602.04938) )\n\nand plain text descriptions are common, but defining the specifics for such governance requires input\n\nfrom business stakeholders or a data governance officer.\n\n**\u0007Model output**\n\nThe output of this pipeline is an ML model artifact stored in the MLflow tracking server. When this\n\ntraining pipeline is run in staging or production, ML engineers (or their CI/CD code) can load the model\n\nvia the model URI (or path) and then push the model to the Model Registry for management and testing.\n\n###### Commit code\n\nAfter developing code for featurization, training, inference and other pipelines, the data scientist or\n\nML engineer commits the dev branch changes into source control. This section does not discuss the\n\ncontinuous deployment, inference or monitoring pipelines in detail; see the “Prod” section below for more\n\ninformation on those.\n\n\n-----\n\n###### Staging\n\nThe transition of code from development to production occurs in the staging environment. This code\n\nincludes model training code and ancillary code for featurization, inference, etc. 
Both data scientists and ML\n\nengineers are responsible for writing tests for code and models, but ML engineers manage the continuous", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "6279c87720dd1cacba7445a01ead0c8e", + "The output of this pipeline is an ML model artifact stored in the MLflow tracking server. When this\n\ntraining pipeline is run in staging or production, ML engineers (or their CI/CD code) can load the model\n\nvia the model URI (or path) and then push the model to the Model Registry for management and testing.\n\n###### Commit code\n\nAfter developing code for featurization, training, inference and other pipelines, the data scientist or\n\nML engineer commits the dev branch changes into source control. This section does not discuss the\n\ncontinuous deployment, inference or monitoring pipelines in detail; see the “Prod” section below for more\n\ninformation on those.\n\n\n-----\n\n###### Staging\n\nThe transition of code from development to production occurs in the staging environment. This code\n\nincludes model training code and ancillary code for featurization, inference, etc. Both data scientists and ML\n\nengineers are responsible for writing tests for code and models, but ML engineers manage the continuous\n\nintegration pipelines and orchestration.\n\nSource control\n\n0] 0_\n\ndev staging >main< release\n\nMerge reHuest to staging Cut release branch\n\nStaging environment\n\nCI trigger Merge\n\n0�\n\n\n**Figure 5**\n\n\nUnit tests\n(CI)\n\n\nTracking Server\n\n0�\n\nModel Registry\n\ndev\n\n\n03\n\nIntegration tests (CI)\n\n\nFeature\nStore tests\n\n\nModel\ntraining tests\n\n\nModel\ndeployment\ntests\n\n\nInference\ntests\n\n\nModel\nmonitoring\ntests\n\n\nLakehouse\n\n\ndev\n\nFeature tables Temp tables\n\nstaging data\n\n\n-----\n\n###### Data\n\nThe staging environment may have its own storage area for testing feature tables and ML pipelines. This\n\ndata is generally temporary and only retained long enough to run tests and to investigate test failures. This\n\ndata can be made readable from the development environment for debugging.\n\n###### Merge code\n\n\u0007 **Merge request**\n\nThe deployment process begins when a merge (or pull) request is submitted against the staging branch\n\nof the project in source control. It is common to use the “main” branch as the staging branch.\n\n**\u0007Unit tests (CI)**\n\nThis merge request automatically builds source code and triggers unit tests. If tests fail, the merge\n\nrequest is rejected.\n\n\n-----\n\n###### Integration tests (CI)\n\nThe merge request then goes through integration tests, which run all pipelines to confirm that they function\n\ncorrectly together. The staging environment should mimic the production environment as much as is\n\nreasonable, running and testing pipelines for featurization, model training, inference and monitoring.\n\nIntegration tests can trade off fidelity of testing for speed and cost. For example, when models are\n\nexpensive to train, it is common to test model training on small data sets or for fewer iterations to reduce\n\ncost. 
When models are deployed behind REST APIs, some high-SLA models may merit full-scale load\n\ntesting within these integration tests, whereas other models may be tested with small batch jobs or a few\n\nqueries to temporary REST endpoints.\n\nOnce integration tests pass on the staging branch, the code may be promoted toward production.\n\n\u0007 **Merge**\n\nIf all tests pass, the new code is merged into the staging branch of the project. If tests fail, the CI/CD\n\nsystem should notify users and post results on the merge (pull) request.\n\nNote: It can be useful to schedule periodic integration tests on the staging branch, especially if the branch is\n\nupdated frequently with concurrent merge requests.\n\n###### Cut release branch\n\nOnce CI tests have passed on a commit in the staging branch, ML engineers can cut a release branch from\n\nthat commit.\n\n\n-----\n\n**Figure 6**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "274b8ac4ce7f9bd380391183c9e04742", + "cost. When models are deployed behind REST APIs, some high-SLA models may merit full-scale load\n\ntesting within these integration tests, whereas other models may be tested with small batch jobs or a few\n\nqueries to temporary REST endpoints.\n\nOnce integration tests pass on the staging branch, the code may be promoted toward production.\n\n\u0007 **Merge**\n\nIf all tests pass, the new code is merged into the staging branch of the project. If tests fail, the CI/CD\n\nsystem should notify users and post results on the merge (pull) request.\n\nNote: It can be useful to schedule periodic integration tests on the staging branch, especially if the branch is\n\nupdated frequently with concurrent merge requests.\n\n###### Cut release branch\n\nOnce CI tests have passed on a commit in the staging branch, ML engineers can cut a release branch from\n\nthat commit.\n\n\n-----\n\n**Figure 6**\n\n\n###### Prod\n\nThe production environment is typically managed by a select set of ML engineers and is where ML pipelines\n\ndirectly serve the business or application. These pipelines compute fresh feature values, train and test new\n\nmodel versions, publish predictions to downstream tables or applications, and monitor the entire process to\n\navoid performance degradation and instability. 
While we illustrate batch and streaming inference alongside\n\nonline serving below, most ML applications will use only one of these methods, depending on the business\n\nrequirements.\n\nProduction environment\n\n\n0b\n\n0�\n\n0�\n\n\nModel Registry\n\n\nOnline serving\n\n\nStage: None Stage: Staging Stage: Production\n\n\nLog\nrequests and\npredictions\n\nrelease\n\n\nLoad model for\nonline serving\n\n\nEna�le online\nserving\n\n\nFeature table refresh\n\nData\nFeaturization\npreparation\n\nrelease\n\n0B\n\n\n0~\n\n\nLoad model for testing\n\n\nLoad model for testing Load model for inference\n\n\nInference: Batch or streaming\n\n\nRegister and request transition\n\nModel training\n\nTraining\nEvaluation\nand tuning\n\nrelease\n\n\nPromote to staging Promote to production\n\n\nModel\nData ingest\ninference\n\n\nPu�lish\npredictions\n\n\n03\n\n\nContinuous Deployment (CD)\n\n\nrelease\n\nMonitoring\n\n\nData ingest\n\n\nCheck model\nperformance\nand data drift\n\n\nPu�lish\nmetrics\n\n\nCompare\nStaging vs\nProduction\n\n\nRequest model\ntransition to\nProduction\n\nrelease\n\n\nCompliance\nchecks\n\n\n0�\n\n\nTrigger model training\n\n\nrelease\n\n\nData ta�les Feature ta�les Feature ta�les Monitoring ta�les\nLakehouse\n\n\n-----\n\nThough data scientists may not have write or compute access in the production environment, it is\n\nimportant to provide them with visibility to test results, logs, model artifacts and the status of ML pipelines\n\nin production. This visibility allows them to identify and diagnose problems in production.\n\n###### Feature table refresh\n\nThis pipeline transforms the latest production Lakehouse data into production feature tables. It can use batch\n\nor streaming computation, depending on the freshness requirements for downstream training and inference.\n\nThe pipeline can be defined as a [Databricks Job](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.rxs6npet1ull) which is scheduled, triggered or continuously running.\n\n###### Model training\n\nThe model training pipeline runs either when code changes affect upstream featurization or training logic, or\n\nwhen automated retraining is scheduled or triggered. This pipeline runs on the full production data.\n\n\u0007 **Training and tuning**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "8ad624e8a2bccafc9e62872cd484b446", + "in production. This visibility allows them to identify and diagnose problems in production.\n\n###### Feature table refresh\n\nThis pipeline transforms the latest production Lakehouse data into production feature tables. It can use batch\n\nor streaming computation, depending on the freshness requirements for downstream training and inference.\n\nThe pipeline can be defined as a [Databricks Job](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.rxs6npet1ull) which is scheduled, triggered or continuously running.\n\n###### Model training\n\nThe model training pipeline runs either when code changes affect upstream featurization or training logic, or\n\nwhen automated retraining is scheduled or triggered. This pipeline runs on the full production data.\n\n\u0007 **Training and tuning**\n\nDuring the training process, logs are recorded to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . 
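The register-and-request-transition step described in this section can be sketched with the MLflow client. This is only an illustration: the model name is hypothetical, `run_id` is a placeholder produced by the production training run, and the sketch performs the stage transition directly, whereas the architecture above would request it and let the CD pipeline approve or reject it (for example via registry webhooks).

```python
import mlflow
from mlflow.tracking import MlflowClient

model_name = "churn_model"            # hypothetical registered model name
model_uri = f"runs:/{run_id}/model"   # run_id is a placeholder from the training run

# Register the artifact; the new version starts in 'stage=None'.
version = mlflow.register_model(model_uri=model_uri, name=model_name)

client = MlflowClient()
client.set_model_version_tag(model_name, version.version, "requested_by", "training_pipeline")

# Simplified: transition directly instead of going through an approval/webhook flow.
client.transition_model_version_stage(name=model_name, version=version.version, stage="Staging")
```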
These include model\n\nmetrics, parameters, tags and the model itself.\n\nDuring development, data scientists may test many algorithms and hyperparameters, but it is common\n\nto restrict those choices to the top-performing options in the production training code. Restricting tuning\n\ncan reduce the variance from tuning in automated retraining, and it can make training and tuning faster.\n\n**\u0007Evaluation**\n\nModel quality is evaluated by testing on held-out production data. The results of these tests are\n\nlogged to the MLflow tracking server. During development, data scientists will have selected meaningful\n\nevaluation metrics for the use case, and those metrics or their custom logic will be used in this step.\n\n**\u0007Register and request transition**\n\nFollowing model training, the model artifact is registered to the [MLflow Model Registry](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) of the production\n\nenvironment, set initially to ’stage=None’. The final step of this pipeline is to request a transition of the\n\n\n-----\n\n###### Continuous deployment (CD)\n\nThe CD pipeline is executed when the training pipeline finishes and requests to transition the model to\n\n‘stage=Staging’. There are three key tasks in this pipeline:\n\n\u0007 **Compliance checks**\n\nThese tests load the model from the Model Registry, perform compliance checks (for tags, documentation,\n\netc.), and approve or reject the request based on test results. If compliance checks require human\n\nexpertise, this automated step can compute statistics or visualizations for people to review in a manual\n\napproval step at the end of the CD pipeline. Regardless of the outcome, results for that model version\n\nare recorded to the Model Registry through metadata in tags and comments in descriptions.\n\nThe MLflow UI can be used to manage stage transition requests manually, but requests and transitions\n\ncan be automated via MLflow APIs and [webhooks](https://docs.databricks.com/applications/mlflow/model-registry-webhooks.html) . If the model passes the compliance checks, then\n\nthe transition request is approved and the model is promoted to ‘stage=Staging’. If the model fails, the\n\ntransition request is rejected and the model is moved to ‘stage=Archived’ in the Model Registry.\n\n**\u0007Compare staging vs. production**\n\nTo prevent performance degradation, models promoted to ‘stage=Staging’ must be compared to the\n\n‘stage=Production’ models they are meant to replace. The metric(s) for comparison should be defined\n\naccording to the use case, and the method for comparison can vary from canary deployments to A/B", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "f104800f30d27377e2cd3379dfe313ee", + "are recorded to the Model Registry through metadata in tags and comments in descriptions.\n\nThe MLflow UI can be used to manage stage transition requests manually, but requests and transitions\n\ncan be automated via MLflow APIs and [webhooks](https://docs.databricks.com/applications/mlflow/model-registry-webhooks.html) . If the model passes the compliance checks, then\n\nthe transition request is approved and the model is promoted to ‘stage=Staging’. If the model fails, the\n\ntransition request is rejected and the model is moved to ‘stage=Archived’ in the Model Registry.\n\n**\u0007Compare staging vs. 
production**\n\nTo prevent performance degradation, models promoted to ‘stage=Staging’ must be compared to the\n\n‘stage=Production’ models they are meant to replace. The metric(s) for comparison should be defined\n\naccording to the use case, and the method for comparison can vary from canary deployments to A/B\n\ntests. All comparison results are saved to metrics tables in the lakehouse.\n\nIf this is the first deployment and there is no ‘stage=Production’ model yet, the ‘stage=Staging’ model\n\nshould be compared to a business heuristic or other threshold as a baseline. For a new version\n\nof an existing ‘stage=Production’ model, the ‘stage=Staging’ model is compared with the current\n\n‘stage=Production’ model.\n\n\n-----\n\n**\u0007Request model transition to production**\n\nIf the candidate model passes the comparison tests, a request is made to transition it to\n\n‘stage=Production’ in the Model Registry. As with other stage transition requests, notifications,\n\napprovals and rejections can be managed manually via the MLflow UI or automatically through APIs and\n\nwebhooks. This is also a good time to consider human oversight, as it is the last step before a model is\n\nfully available to downstream applications. A person can manually review the compliance checks and\n\nperformance comparisons to perform checks which are difficult to automate.\n\n###### Online serving (REST APIs)\n\nFor lower throughput and lower latency use cases, online serving is generally necessary. With MLflow, it is\n\nsimple to deploy models to [Databricks Model Serving](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.72shqep1kelf) , cloud provider serving endpoints, or on-prem or\n\ncustom serving layers.\n\nIn all cases, the serving system loads the production model from the Model Registry upon initialization. On\n\neach request, it fetches features from an online Feature Store, scores the data and returns predictions. The\n\nserving system, data transport layer or the model itself could log requests and predictions.\n\n###### Inference: batch or streaming\n\nThis pipeline is responsible for reading the latest data from the Feature Store, loading the model from\n\n‘stage=Production’ in the Model Registry, performing inference and publishing predictions. For higher\n\nthroughput, higher latency use cases, batch or streaming inference is generally the most cost-effective\n\noption.\n\nA batch job would likely publish predictions to Lakehouse tables, over a JDBC connection, or to flat files.\n\nA streaming job would likely publish predictions either to Lakehouse tables or to message queues like\n\nApache Kafka.®\n\n\n-----\n\n###### Monitoring\n\nInput data and model predictions are monitored, both for statistical properties (data drift, model\n\nperformance, etc.) and for computational performance (errors, throughput, etc.). These metrics are\n\npublished for dashboards and alerts.\n\n\u0007 **Data ingestion**\n\nThis pipeline reads in logs from batch, streaming or online inference.\n\n**\u0007Check accuracy and data drift**\n\nThe pipeline then computes metrics about the input data, the model’s predictions and the infrastructure", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "b8305a116d1c3a3f477c13a54827b19f", + "‘stage=Production’ in the Model Registry, performing inference and publishing predictions. 
For higher\n\nthroughput, higher latency use cases, batch or streaming inference is generally the most cost-effective\n\noption.\n\nA batch job would likely publish predictions to Lakehouse tables, over a JDBC connection, or to flat files.\n\nA streaming job would likely publish predictions either to Lakehouse tables or to message queues like\n\nApache Kafka.®\n\n\n-----\n\n###### Monitoring\n\nInput data and model predictions are monitored, both for statistical properties (data drift, model\n\nperformance, etc.) and for computational performance (errors, throughput, etc.). These metrics are\n\npublished for dashboards and alerts.\n\n\u0007 **Data ingestion**\n\nThis pipeline reads in logs from batch, streaming or online inference.\n\n**\u0007Check accuracy and data drift**\n\nThe pipeline then computes metrics about the input data, the model’s predictions and the infrastructure\n\nperformance. Metrics that measure statistical properties are generally chosen by data scientists during\n\ndevelopment, whereas metrics for infrastructure are generally chosen by ML engineers.\n\n\u0007 **Publish metrics**\n\nThe pipeline writes to Lakehouse tables for analysis and reporting. Tools such as [Databricks SQL](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.nsthucrt9k77) are used\n\nto produce monitoring dashboards, allowing for health checks and diagnostics. The monitoring job or the\n\ndashboarding tool issues notifications when health metrics surpass defined thresholds.\n\n**\u0007Trigger model training**\n\nWhen the model monitoring metrics indicate performance issues, or when a model inevitably becomes\n\nout of date, the data scientist may need to return to the development environment and develop a new\n\nmodel version.\n\n\n-----\n\n**Note:** While automated retraining is supported\n\nin this architecture, it isn’t required, and caution\n\n\n###### Retraining\n\nThis architecture supports automatic retraining using the same model training pipeline above. While we\n\nrecommend beginning with manually triggered retraining, organizations can add scheduled and/or triggered\n\nretraining when needed.\n\n\u0007 **Scheduled**\n\nIf fresh data are regularly made available, rerunning model training on a defined schedule can help models\n\nto keep up with changing trends and behavior.\n\n**\u0007Triggered**\n\nIf the monitoring pipeline can identify model performance issues and send alerts, it can additionally\n\ntrigger retraining. For example, if the distribution of incoming data changes significantly or if the model\n\nperformance degrades, automatic retraining and redeployment can boost model performance with\n\nminimal human intervention.\n\n\nmust be taken in cases where it is implemented.\n\n\nIt is inherently difficult to automate selecting the\n\ncorrect action to take from model monitoring\n\n\nWhen the featurization or retraining pipelines themselves begin to exhibit performance issues, the data\n\nscientist may need to return to the dev environment and resume experimentation to address such issues.\n\n\nalerts. 
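The monitoring pipeline described above computes statistical metrics over inference logs and publishes them to Lakehouse tables that feed dashboards and alerts. A minimal sketch of one such metric, a population stability index between a training baseline and recent inputs, written to a monitoring table; the table name, threshold and synthetic data are illustrative.

```python
import numpy as np
import pandas as pd
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()


def population_stability_index(expected: np.ndarray, actual: np.ndarray, bins: int = 10) -> float:
    """PSI between a baseline distribution and recent inference inputs (equal-width bins)."""
    edges = np.histogram_bin_edges(expected, bins=bins)
    e_pct = np.histogram(expected, bins=edges)[0] / max(len(expected), 1) + 1e-6
    a_pct = np.histogram(actual, bins=edges)[0] / max(len(actual), 1) + 1e-6
    return float(np.sum((a_pct - e_pct) * np.log(a_pct / e_pct)))


baseline = np.random.normal(0.0, 1.0, 10_000)  # stand-in for training-time feature values
recent = np.random.normal(0.3, 1.0, 10_000)    # stand-in for logged inference inputs

psi = population_stability_index(baseline, recent)
metrics = pd.DataFrame([{"metric": "psi_feature_x", "value": psi, "alert": psi > 0.2}])

# Publish to a Lakehouse monitoring table that dashboards and alerting jobs read from.
spark.createDataFrame(metrics).write.mode("append").saveAsTable("ml_monitoring.metrics")
```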
For example, if data drift is observed, does\n\nit indicate that we should automatically retrain, or\n\ndoes it indicate that we should engineer additional\n\nfeatures to encode some new signal in the data?\n\n\n-----\n\n**CHAPTER 4:**\n## LLMOps – Large Language Model Operations\n\n\n#### Large language models\n\nLLMs have splashed into the mainstream of business and news, and there is no doubt that they will disrupt\n\ncountless industries. In addition to bringing great potential, they present a new set of questions for MLOps:\n\n\u0007Is prompt engineering part of operations, and if so, what is needed?\n\n\u0007Since the “large” in “LLM” is an understatement, how do cost/performance trade-offs change?", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "4e015774702bd6742d3bee6eec59b8dd", + "correct action to take from model monitoring\n\n\nWhen the featurization or retraining pipelines themselves begin to exhibit performance issues, the data\n\nscientist may need to return to the dev environment and resume experimentation to address such issues.\n\n\nalerts. For example, if data drift is observed, does\n\nit indicate that we should automatically retrain, or\n\ndoes it indicate that we should engineer additional\n\nfeatures to encode some new signal in the data?\n\n\n-----\n\n**CHAPTER 4:**\n## LLMOps – Large Language Model Operations\n\n\n#### Large language models\n\nLLMs have splashed into the mainstream of business and news, and there is no doubt that they will disrupt\n\ncountless industries. In addition to bringing great potential, they present a new set of questions for MLOps:\n\n\u0007Is prompt engineering part of operations, and if so, what is needed?\n\n\u0007Since the “large” in “LLM” is an understatement, how do cost/performance trade-offs change?\n\n\u0007Is it better to use paid APIs or to fine-tune one’s own model?\n\n…and many more!\n\nThe good news is that “LLMOps” (MLOps for LLMs) is not that different from traditional MLOps. However,\n\nsome parts of your MLOps platform and process may require changes, and your team will need to learn a\n\nmental model of how LLMs coexist alongside traditional ML in your operations.\n\nIn this section, we will explain what may change for MLOps when introducing LLMs. We will discuss several\n\nkey topics in detail, from prompt engineering to packaging, to cost/performance trade-offs. We also provide\n\na reference architecture diagram to illustrate what may change in your production environment.\n\n###### What changes with LLMs?\n\nFor those not familiar with large language models (LLMs), see [this summary](https://www.databricks.com/product/machine-learning/large-language-models) for a quick introduction. The\n\none-sentence summary is: LLMs are a new class of natural language processing (NLP) models that have\n\nsignificantly surpassed their predecessors in performance across a variety of tasks, such as open-ended\n\nquestion answering, summarization and execution of near-arbitrary instructions.\n\nFrom the perspective of MLOps, LLMs bring new requirements, with implications for MLOps practices and\n\nplatforms. 
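As one concrete way to implement the triggered retraining described above, a monitoring or alerting job can start the existing model training job through the Databricks Jobs API. A hedged sketch using the REST endpoint directly; the workspace host, token handling and job ID are placeholders.

```python
import requests

DATABRICKS_HOST = "https://<workspace-host>"  # placeholder
TOKEN = "<api-token>"                         # placeholder; prefer a secret scope in practice
TRAINING_JOB_ID = 12345                       # placeholder job ID of the model training pipeline


def trigger_retraining(reason: str) -> int:
    """Start one run of the training job and return its run_id."""
    resp = requests.post(
        f"{DATABRICKS_HOST}/api/2.1/jobs/run-now",
        headers={"Authorization": f"Bearer {TOKEN}"},
        json={"job_id": TRAINING_JOB_ID, "notebook_params": {"trigger_reason": reason}},
        timeout=30,
    )
    resp.raise_for_status()
    return resp.json()["run_id"]


# Example: called when a drift alert fires.
# run_id = trigger_retraining("psi_feature_x above threshold")
```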
We briefly summarize key properties of LLMs and the implications for MLOps here, and we delve\n\ninto more detail in the next section.\n\n\n-----\n\n**Table 3**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "b13bc8ea85264360bae8db33185fbadb", + "a reference architecture diagram to illustrate what may change in your production environment.\n\n###### What changes with LLMs?\n\nFor those not familiar with large language models (LLMs), see [this summary](https://www.databricks.com/product/machine-learning/large-language-models) for a quick introduction. The\n\none-sentence summary is: LLMs are a new class of natural language processing (NLP) models that have\n\nsignificantly surpassed their predecessors in performance across a variety of tasks, such as open-ended\n\nquestion answering, summarization and execution of near-arbitrary instructions.\n\nFrom the perspective of MLOps, LLMs bring new requirements, with implications for MLOps practices and\n\nplatforms. We briefly summarize key properties of LLMs and the implications for MLOps here, and we delve\n\ninto more detail in the next section.\n\n\n-----\n\n**Table 3**\n\n\n\n|KEY PROPERTIES OF LLMS|IMPLICATIONS FOR MLOPS|\n|---|---|\n|LLMs are available in many forms: \u0007Very general proprietary models behind paid APIs \u0007Open source models that vary from general to specific applications \u0007Custom models fine-tuned for specific applications|Development process: Projects often develop incrementally, starting from existing, third-party or open source models and ending with custom fine-tuned models.|\n|Many LLMs take general natural language queries and instructions as input. Those queries can contain carefully engineered “prompts” to elicit the desired responses.|Development process: Designing text templates for querying LLMs is often an important part of developing new LLM pipelines. Packaging ML artifacts: Many LLM pipelines will use existing LLMs or LLM serving endpoints; the ML logic developed for those pipelines may focus on prompt templates, agents or “chains” instead of the model itself. The ML artifacts packaged and promoted to production may frequently be these pipelines, rather than models.|\n|Many LLMs can be given prompts with examples and context, or additional information to help answer the query.|Serving infrastructure: When augmenting LLM queries with context, it is valuable to use previously uncommon tooling such as vector databases to search for relevant context.|\n|LLMs are very large deep learning models, often ranging from gigabytes to hundreds of gigabytes.|Serving infrastructure: Many LLMs may require GPUs for real-time model serving. Cost/performance trade-offs: Since larger models require more computation and are thus more expensive to serve, techniques for reducing model size and computation may be required.|\n|LLMs are hard to evaluate via traditional ML metrics since there is often no single “right” answer.|Human feedback: Since human feedback is essential for evaluating and testing LLMs, it must be incorporated more directly into the MLOps process, both for testing and monitoring and for future fine-tuning.|\n\n\n-----\n\nThe list above may look long, but as we will see in the next section, many existing tools and processes\n\nonly require small adjustments in order to adapt to these new requirements. 
Moreover, many aspects\n\ndo not change:\n\n\u0007The separation of development, staging and production remains the same\n\n\u0007Git version control and model registries remain the primary conduits for promoting pipelines and\n\nmodels toward production\n\n\u0007The lakehouse architecture for managing data remains valid and essential for efficiency\n\n\u0007Existing CI/CD infrastructure should not require changes\n\n\u0007The modular structure of MLOps remains the same, with pipelines for data refresh, model tuning,\n\nmodel inference, etc.\n\n\n-----\n\n#### Discussion of key topics for LLMOps\n\nSo far, we have listed top potential changes to MLOps as you introduce LLMs. In this section, we will dive into\n\nmore details about selected topics.\n\n###### Prompt engineering\n\nPrompt engineering is the practice of adjusting the text prompt given to an LLM in order to elicit better\n\nresponses — using engineering techniques. It is a very new practice, but some best practices are emerging.\n\nWe will cover a few tips and best practices and link to useful resources.\n\n**1** \u0007Prompts and prompt engineering are model-specific. A prompt given to two different models will\n\ngenerally _not_ produce the same results. Similarly, prompt engineering tips do not apply to all models.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "403f5d8c7d9cbd8decd7c0e286cb08c8", + "\u0007Existing CI/CD infrastructure should not require changes\n\n\u0007The modular structure of MLOps remains the same, with pipelines for data refresh, model tuning,\n\nmodel inference, etc.\n\n\n-----\n\n#### Discussion of key topics for LLMOps\n\nSo far, we have listed top potential changes to MLOps as you introduce LLMs. In this section, we will dive into\n\nmore details about selected topics.\n\n###### Prompt engineering\n\nPrompt engineering is the practice of adjusting the text prompt given to an LLM in order to elicit better\n\nresponses — using engineering techniques. It is a very new practice, but some best practices are emerging.\n\nWe will cover a few tips and best practices and link to useful resources.\n\n**1** \u0007Prompts and prompt engineering are model-specific. A prompt given to two different models will\n\ngenerally _not_ produce the same results. Similarly, prompt engineering tips do not apply to all models.\n\nIn the extreme case, many LLMs have been fine-tuned for specific NLP tasks and do not even require\n\nprompts. On the other hand, very general LLMs benefit greatly from carefully crafted prompts.\n\n**2** \u0007When approaching prompt engineering, go from simple to complex: track, templatize and automate.\n\n\u0007Start by tracking queries and responses so that you can compare them and iterate to improve\n\nprompts. Existing tools such as MLflow provide tracking capabilities; see [MLflow LLM Tracking](https://mlflow.org/docs/latest/llm-tracking.html) for\n\nmore details. Checking structured LLM pipeline code into version control also helps with prompt\n\ndevelopment, for git diffs allow you to review changes to prompts over time. 
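To make the "track, templatize" advice above concrete, here is a minimal sketch that renders a simple prompt template and records the template, rendered prompt and response as an MLflow artifact so iterations can be compared across runs. The `call_llm` function is a hypothetical stand-in for whichever API or local pipeline is actually used.

```python
import mlflow

PROMPT_TEMPLATE = (
    "You are a support assistant. Answer the question using only the context.\n"
    "Context: {context}\n"
    "Question: {question}\n"
    "Answer:"
)


def call_llm(prompt: str) -> str:
    """Hypothetical stand-in for an LLM API or local model call."""
    return "stub response"


with mlflow.start_run(run_name="prompt_iteration"):
    prompt = PROMPT_TEMPLATE.format(
        context="Resets are done from the account settings page.",
        question="How do I reset my password?",
    )
    response = call_llm(prompt)

    # Store template, rendered prompt and response so prompt changes can be diffed later.
    mlflow.log_dict(
        {"template": PROMPT_TEMPLATE, "prompt": prompt, "response": response},
        artifact_file="prompt_log.json",
    )
```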
Also see the section\n\nbelow on packaging model and pipelines for more information about tracking prompt versions.\n\n\u0007Then, consider using tools for building prompt templates, especially if your prompts become complex.\n\nNewer LLM-specific tools such as [LangChain](https://python.langchain.com/en/latest/index.html) and [LlamaIndex](https://gpt-index.readthedocs.io/en/latest/) provide such templates and more.\n\n\u0007Finally, consider automating prompt engineering by replacing manual engineering with automated\n\ntuning. Prompt tuning turns prompt development into a data-driven process akin to hyperparameter\n\ntuning for traditional ML. The [Demonstrate-Search-Predict (DSP) Framework](https://github.com/stanfordnlp/dsp) is a good example of a\n\ntool for prompt tuning.\n\n\n-----\n\n###### Resources\n\nThere are lots of good resources about\nprompt engineering, especially for popular\n\nmodels and services:\n\n\u0007DeepLearning.AI course on [ChatGPT](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n\n[Prompt Engineering](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n\n\u0007DAIR.AI [Prompt Engineering Guide](https://www.promptingguide.ai/)\n\n\u0007 [Best practices for prompt engineering](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n[with the OpenAI API](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n\n**3** \u0007Most prompt engineering tips currently published online are for ChatGPT, due to its immense\n\npopularity. Some of these generalize to other models as well. We will provide a few tips here:\n\n\u0007Use clear, specific prompts, which may include an instruction, context (if needed), a user query or\n\ninput, and a description of the desired output type or format", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "fb3241773f0b268b7253f34772d98d1a", + "[Prompt Engineering](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n\n\u0007DAIR.AI [Prompt Engineering Guide](https://www.promptingguide.ai/)\n\n\u0007 [Best practices for prompt engineering](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n[with the OpenAI API](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n\n**3** \u0007Most prompt engineering tips currently published online are for ChatGPT, due to its immense\n\npopularity. Some of these generalize to other models as well. We will provide a few tips here:\n\n\u0007Use clear, specific prompts, which may include an instruction, context (if needed), a user query or\n\ninput, and a description of the desired output type or format\n\n\u0007Provide examples in your prompt (“few-shot learning”) to help the LLM to understand what you want\n\n\u0007Tell the model how to behave, such as telling it to admit if it cannot answer a question\n\n\u0007Tell the model to think step-by-step or explain its reasoning\n\n\u0007If your prompt includes user input, use techniques to prevent prompt hacking, such as making it very\n\nclear which parts of the prompt correspond to your instruction vs. 
user input\n\n\n-----\n\n###### Packaging models or pipelines for deployment\n\nIn traditional ML, there are generally two types of ML logic to package for deployment: models and\n\npipelines. These artifacts are generally managed toward production via a Model Registry and Git version\n\ncontrol, respectively.\n\nWith LLMs, it is common to package ML logic in new forms. These may include:\n\n\u0007A lightweight call to an LLM API service (third party or internal)\n\n\u0007A “chain” from LangChain or an analogous pipeline from another tool. The chain may call an LLM API or a\n\nlocal LLM model.\n\n\u0007An LLM or an LLM+tokenizer pipeline, such as a [Hugging Face](https://huggingface.co/) pipeline. This pipeline may use a\n\npretrained model or a custom fine-tuned model.\n\n\u0007An engineered prompt, possibly stored as a template in a tool such as LangChain\n\nThough LLMs add new terminology and tools for composing ML logic, all of the above still constitute models\n\nand pipelines. Thus, the same tooling such as [MLflow](https://mlflow.org/) can be used to package LLMs and LLM pipelines for\n\ndeployment. [Built-in model flavors](https://mlflow.org/docs/latest/models.html) include:\n\n\u0007PyTorch and TensorFlow\n\n\u0007Hugging Face Transformers (relatedly, see Hugging Face Transformers’s [MLflowCallback](https://huggingface.co/docs/transformers/en/main_classes/callback#transformers.integrations.MLflowCallback) )\n\n\u0007LangChain\n\n\u0007OpenAI API\n\n\u0007(See the [documentation](https://mlflow.org/docs/latest/models.html) for a complete list)\n\nFor other LLM pipelines, MLflow can package the pipelines via the [MLflow pyfunc flavor](https://mlflow.org/docs/latest/models.html#python-function-python-function) , which can store\n\narbitrary Python code.\n\n\n**Note about prompt versioning:** Just as it is helpful\n\nto track model versions, it is helpful to track prompt\n\nversions (and LLM pipeline versions, more generally).\n\nPackaging prompts and pipelines as MLflow Models\n\nsimplifies versioning. Just as a newly retrained\n\nmodel can be tracked as a new model version in the\n\nMLflow Model Registry, a newly updated prompt can\n\nbe tracked as a new model version.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "b7a9be595932e694392d780538b29682", + "\u0007LangChain\n\n\u0007OpenAI API\n\n\u0007(See the [documentation](https://mlflow.org/docs/latest/models.html) for a complete list)\n\nFor other LLM pipelines, MLflow can package the pipelines via the [MLflow pyfunc flavor](https://mlflow.org/docs/latest/models.html#python-function-python-function) , which can store\n\narbitrary Python code.\n\n\n**Note about prompt versioning:** Just as it is helpful\n\nto track model versions, it is helpful to track prompt\n\nversions (and LLM pipeline versions, more generally).\n\nPackaging prompts and pipelines as MLflow Models\n\nsimplifies versioning. Just as a newly retrained\n\nmodel can be tracked as a new model version in the\n\nMLflow Model Registry, a newly updated prompt can\n\nbe tracked as a new model version.\n\n**Note about deploying models vs. code:** Your\n\ndecisions around packaging ML logic as version\n\ncontrolled code vs. registered models will help\n\nto inform your decision about choosing between\n\nthe deploy models, deploy code and hybrid\n\narchitectures. 
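As a sketch of the MLflow pyfunc packaging described above, a prompt template plus a call to an LLM endpoint can be wrapped in a `PythonModel` and logged like any other model. The `_query_endpoint` call is a hypothetical placeholder for a real API client; registering a newly updated pipeline this way also gives the prompt a new model version, as noted in the prompt-versioning sidebar.

```python
import mlflow
import mlflow.pyfunc
import pandas as pd


class PromptedLLMPipeline(mlflow.pyfunc.PythonModel):
    TEMPLATE = "Summarize the following text in one sentence:\n{text}"

    def _query_endpoint(self, prompt: str) -> str:
        # Placeholder for a real LLM API or local model call.
        return "stub summary"

    def predict(self, context, model_input: pd.DataFrame) -> list:
        # Render the prompt per input row and return the endpoint's responses.
        return [
            self._query_endpoint(self.TEMPLATE.format(text=t))
            for t in model_input["text"]
        ]


with mlflow.start_run():
    mlflow.pyfunc.log_model(
        artifact_path="llm_pipeline",
        python_model=PromptedLLMPipeline(),
        pip_requirements=["pandas"],
    )
```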
Review the subsection below about\n\nhuman feedback, and make sure that you have a\n\nwell-defined testing process for whatever artifacts\n\nyou choose to deploy.\n\n\n-----\n\n###### Managing cost/performance trade-offs\n\nOne of the big Ops topics for LLMs is managing cost/performance trade-offs, especially for inference\n\nand serving. With “small” LLMs having hundreds of millions of parameters and large LLMs having hundreds\n\nof billions of parameters, computation can become a major expense. Thankfully, there are many ways to\n\nmanage and reduce costs when needed. We will review some key tips for balancing productivity and costs.\n\n**1** \u0007Start simple, but plan for scaling. When developing a new LLM-powered application, speed of\n\ndevelopment is key, so it is acceptable to use more expensive options, such as paid APIs for existing\n\nmodels. As you go, make sure to collect data such as queries and responses. In the future, you can use\n\nthat data to fine-tune a smaller, cheaper model which you can own.\n\n**2** \u0007Scope out your costs. How many queries per second do you expect? Will requests come in bursts?\n\nHow much does each query cost? These estimates will inform you about project feasibility and will help\n\nyou to decide when to consider bringing the model in-house with open source models and fine-tuning.\n\n**3** \u0007Reduce costs by tweaking LLMs and queries. There are many LLM-specific techniques for reducing\n\ncomputation and costs. These include shortening queries, tweaking inference configurations and using\n\nsmaller versions of models.\n\n**4** \u0007Get human feedback. It is easy to reduce costs but hard to say how changes impact your results,\n\nunless you get human feedback from end users.\n\n\n-----\n\n###### Resources\n\n**Fine-tuning**\n\n\u0007 [Fine-Tuning Large Language Models with](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n[Hugging Face and DeepSpeed](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n\u0007Webinar: [Build Your Own Large Language](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "58baf953efcaa5513cfbb2367b889214", + "smaller versions of models.\n\n**4** \u0007Get human feedback. 
It is easy to reduce costs but hard to say how changes impact your results,\n\nunless you get human feedback from end users.\n\n\n-----\n\n###### Resources\n\n**Fine-tuning**\n\n\u0007 [Fine-Tuning Large Language Models with](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n[Hugging Face and DeepSpeed](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n\u0007Webinar: [Build Your Own Large Language](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n\n[Model Like Dolly: How to fine-tune and](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n[deploy your custom LLM](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n\n**Model distillation,**\n**quantization and pruning**\n\n\n###### Methods for reducing costs of inference\n\n**Use a smaller model**\n\n\u0007Pick a different existing model. Try smaller versions of models (such as “t5-small” instead of “t5-base”)\n\nor alternate architectures.\n\n\u0007Fine-tune a custom model. With the right training data, a fine-tuned model can often be smaller and/or\n\nperform better than a generic model.\n\n\u0007Use model distillation (or knowledge distillation). This technique “distills” the knowledge of the original\n\nmodel into a smaller model.\n\n\u0007Reduce floating point precision (quantization). Models can sometimes use lower precision arithmetic\n\nwithout losing much in quality.\n\n\n\u0007 [Gentle Introduction to 8-bit Matrix](https://huggingface.co/blog/hf-bitsandbytes-integration)\n\n\n**\u0007Reduce computation for a given model**\n\n\u0007Shorten queries and responses. Computation scales with input and output sizes, so using more concise\n\nqueries and responses reduces costs.\n\n\u0007Tweak inference configurations. Some types of inference, such as beam search, require more computation.\n\n**Other**\n\n\u0007Split traffic. If your return on investment (ROI) for an LLM query is low, then consider splitting traffic so that\n\nlow ROI queries are handled by simpler, faster models or methods. Save LLM queries for high ROI traffic.\n\n\u0007Use pruning techniques. If you are training your own LLMs, there are pruning techniques that allow\n\nmodels to use sparse computation during inference. This reduces computation for most or all queries.\n\n\n[Multiplication for transformers at scale](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[using Hugging Face Transformers,](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)\n\n\u0007 [Large Transformer Model Inference](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n[Optimization](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n\u0007 [Making LLMs even more accessible with](https://huggingface.co/blog/4bit-transformers-bitsandbytes)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "97a9d43b25175ae2b3b51d9ac813bda2", + "\u0007Use pruning techniques. If you are training your own LLMs, there are pruning techniques that allow\n\nmodels to use sparse computation during inference. 
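A hedged sketch of two of the cost levers listed here: picking a smaller checkpoint loaded with 8-bit weights, and tightening the generation configuration (short output cap, no beam search). It assumes the Hugging Face `transformers`, `sentencepiece` and `bitsandbytes` packages and a GPU; the model choice and settings are illustrative only.

```python
from transformers import AutoModelForSeq2SeqLM, AutoTokenizer

model_name = "t5-small"  # smaller checkpoint instead of a larger variant
tokenizer = AutoTokenizer.from_pretrained(model_name)
model = AutoModelForSeq2SeqLM.from_pretrained(
    model_name,
    device_map="auto",
    load_in_8bit=True,  # 8-bit weights via bitsandbytes; newer versions prefer BitsAndBytesConfig
)

inputs = tokenizer(
    "summarize: The quick brown fox jumps over the lazy dog.",
    return_tensors="pt",
).to(model.device)

# Greedy decoding with a short output cap instead of wide beam search.
outputs = model.generate(**inputs, max_new_tokens=32, num_beams=1)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))
```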
This reduces computation for most or all queries.\n\n\n[Multiplication for transformers at scale](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[using Hugging Face Transformers,](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)\n\n\u0007 [Large Transformer Model Inference](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n[Optimization](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n\u0007 [Making LLMs even more accessible with](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n\n[bitsandbytes, 4-bit quantization and](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n[QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n\n\n-----\n\n###### Human feedback, testing, and monitoring\n\nWhile human feedback is important in many traditional ML applications, it becomes much more important\n\nfor LLMs. Since most LLMs output natural language, it is very difficult to evaluate the outputs via traditional\n\nmetrics. For example, suppose an LLM were used to summarize a news article. Two equally good summaries\n\nmight have almost completely different words and word orders, so even defining a “ground-truth” label\n\nbecomes difficult or impossible.\n\nHumans — ideally your end users — become essential for validating LLM output. While you can pay human\n\nlabelers to compare or rate model outputs, the best practice for user-facing applications is to build human\n\nfeedback into the applications from the outset. For example, a tech support chatbot may have a “click here\n\nto chat with a human” option, which provides implicit feedback indicating whether the chatbot’s responses\n\nwere helpful.\n\nIn terms of operations, not much changes from traditional MLOps:\n\n\u0007 **Data:** Human feedback is simply data, and it should be treated like any other data. Store it in your\n\nlakehouse, and process it using the same data pipeline tooling as other data.\n\n\u0007 **Testing and monitoring:** A/B testing and incremental rollouts of new models and pipelines may become\n\nmore important, superceding offline quality tests. If you can collect user feedback, then these rollout\n\nmethods can validate models before they are fully deployed.\n\n\u0007 **Fine-tuning:** Human feedback becomes especially important for LLMs when it can be incorporated into\n\nfine-tuning models via techniques like Reinforcement Learning from Human Feedback (RLHF). 
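The guidance in this section treats human feedback as ordinary data to land in the lakehouse with the same pipeline tooling as everything else. A minimal sketch of capturing implicit feedback events (such as "clicked through to a human agent") into a table; the schema and table name are illustrative.

```python
from datetime import datetime, timezone

from pyspark.sql import Row, SparkSession

spark = SparkSession.builder.getOrCreate()

feedback_events = [
    Row(request_id="req-123", model_version="7",
        signal="escalated_to_human", value=1.0, ts=datetime.now(timezone.utc)),
    Row(request_id="req-124", model_version="7",
        signal="thumbs_up", value=1.0, ts=datetime.now(timezone.utc)),
]

# Append feedback alongside other inference logs; downstream jobs can join on request_id
# for A/B analysis, monitoring, or building future fine-tuning datasets.
(
    spark.createDataFrame(feedback_events)
    .write.mode("append")
    .saveAsTable("llm_app.human_feedback")
)
```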
Even if you\n\nstart with an existing or generic model, you can eventually customize it for your purposes via fine-tuning.\n\n\n###### Resources\n\n**Reinforcement Learning from**\n**Human Feedback (RLHF)**\n\n\u0007Chip Huyen blog post on\n\n[“RLHF: Reinforcement Learning from](https://huyenchip.com/2023/05/02/rlhf.html)\n\n[Human Feedback”](https://huyenchip.com/2023/05/02/rlhf.html)\n\n\u0007Hugging Face blog post on\n\n[“Illustrating Reinforcement Learning from](https://huggingface.co/blog/rlhf)\n\n[Human Feedback (RLHF)”](https://huggingface.co/blog/rlhf)\n\n\u0007 [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback)\n\n\n-----\n\n###### Other topics", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "51901892d98beec26cc5fa8ab0b1f58b", + "###### Resources\n\n**Reinforcement Learning from**\n**Human Feedback (RLHF)**\n\n\u0007Chip Huyen blog post on\n\n[“RLHF: Reinforcement Learning from](https://huyenchip.com/2023/05/02/rlhf.html)\n\n[Human Feedback”](https://huyenchip.com/2023/05/02/rlhf.html)\n\n\u0007Hugging Face blog post on\n\n[“Illustrating Reinforcement Learning from](https://huggingface.co/blog/rlhf)\n\n[Human Feedback (RLHF)”](https://huggingface.co/blog/rlhf)\n\n\u0007 [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback)\n\n\n-----\n\n###### Other topics\n\n\u0007 **Scaling out:** Practices around scaling out training, fine-tuning and inference are similar to traditional ML,\n\nbut some of your tools may change. Tools like [Apache Spark](https://spark.apache.org/) [™](https://spark.apache.org/) and [Delta Lake](https://delta.io/) remain general enough for\n\nyour LLM data pipelines and for batch and streaming inference, and they may be helpful for distributing\n\nfine-tuning. To handle LLM fine-tuning and training, you may need to adopt some new tools such as\n\n[distributed PyTorch](https://pytorch.org/tutorials/beginner/dist_overview.html) , [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training) , and [DeepSpeed](https://www.deepspeed.ai/) .\n\n\u0007 **[Model serving:](https://www.databricks.com/product/model-serving)** If you manage the serving system for your LLMs, then you may need to make\n\nadjustments to handle larger models. While serving with CPUs can work for smaller deep learning\n\nmodels, most LLMs will benefit from or require GPUs for serving and inference.\n\n\u0007 **Vector databases:** Some but not all LLM applications require vector databases for efficient similarity-\n\nbased lookups of documents or other data. Vector databases may be an important addition to your\n\nserving infrastructure. 
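To illustrate the similarity-based lookup role a vector database plays in this architecture, here is a small self-contained sketch using cosine similarity over in-memory vectors. The toy `embed` function is not a real embedding model and the results are not semantically meaningful; a production system would swap in a real embedding model and a vector database or vector search index.

```python
import numpy as np


def embed(text: str, dim: int = 64) -> np.ndarray:
    """Toy stand-in 'embedding' for illustration only (not semantically meaningful)."""
    rng = np.random.default_rng(abs(hash(text)) % (2**32))
    v = rng.normal(size=dim)
    return v / np.linalg.norm(v)


documents = [
    "How to reset a password",
    "Billing and invoice questions",
    "Troubleshooting cluster startup failures",
]
index = np.stack([embed(d) for d in documents])  # rows are unit vectors


def top_k(query: str, k: int = 2) -> list:
    scores = index @ embed(query)  # cosine similarity, since rows are unit vectors
    return [documents[i] for i in np.argsort(scores)[::-1][:k]]


# Retrieved documents would be inserted into the LLM prompt as context.
print(top_k("my cluster will not start"))
```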
Operationally, it is analogous to a feature store: it is a specialized tool for storing\n\npreprocessed data which can be queried by inference jobs or model serving systems.\n\n\n-----\n\n#### Reference architecture\n\nTo illustrate potential adjustments to your reference architecture from traditional MLOps, we provide a\n\nmodified version of the previous production architecture.\n\nProduction environment\n\nModel Registry\n\nStage: �one Stage: Staging Stage: Production\n\nLoad model for testing Load model for inference\n\n\nPush model to registry Promote to production\n\n\nModel serving\n\n\nLLM API request\n\nrelease\n\n\nFine-Tine LLM\n\nrelease\n\n\nVector Database\nUpdate\n\nrelease\n\n\nContinuous\nDeployment (CD)\n\nrelease\n\n\nMonitoring &\nEvaluation\n\nrelease\n\n\nInternal/External Data tables Vector database Metrics tables Human feedback\nmodel hub\n\n**Figure 7**\n\n\n-----\n\n###### Additional resources\n\nWith LLMs being such a novel field, we link to\nseveral LLM resources below, which are not\n\nnecessarily “LLMOps” but may prove useful\nto you.\n\n\u0007 [edX: Professional Certificate in Large](https://www.edx.org/professional-certificate/databricks-large-language-models)\n\n[Language Models](https://www.edx.org/professional-certificate/databricks-large-language-models)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "8aae99498ab67fee9a85931282af919e", + "Push model to registry Promote to production\n\n\nModel serving\n\n\nLLM API request\n\nrelease\n\n\nFine-Tine LLM\n\nrelease\n\n\nVector Database\nUpdate\n\nrelease\n\n\nContinuous\nDeployment (CD)\n\nrelease\n\n\nMonitoring &\nEvaluation\n\nrelease\n\n\nInternal/External Data tables Vector database Metrics tables Human feedback\nmodel hub\n\n**Figure 7**\n\n\n-----\n\n###### Additional resources\n\nWith LLMs being such a novel field, we link to\nseveral LLM resources below, which are not\n\nnecessarily “LLMOps” but may prove useful\nto you.\n\n\u0007 [edX: Professional Certificate in Large](https://www.edx.org/professional-certificate/databricks-large-language-models)\n\n[Language Models](https://www.edx.org/professional-certificate/databricks-large-language-models)\n\n\u0007Chip Huyen blog post on [“Building LLM](https://huyenchip.com/2023/04/11/llm-engineering.html)\n\n[applications for production”](https://huyenchip.com/2023/04/11/llm-engineering.html)\n\nLLM lists and leaderboards\n\n\u0007 [LMSYS Leaderboard](https://chat.lmsys.org/?leaderboard)\n\n\u0007 [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)\n\n\u0007 [Stanford Center for Research on](https://crfm.stanford.edu/)\n\n[Foundation Models](https://crfm.stanford.edu/)\n\n\u0007 [Ecosystem graphs](https://crfm.stanford.edu/ecosystem-graphs/index.html)\n\u0007 [\u0007HELM](https://crfm.stanford.edu/helm/latest/?)\n\n\u0007Blog post on [“Open Source ChatGPT](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n\n\nThe primary changes to this production architecture are:\n\n\u0007 **Internal/External Model Hub:** Since LLM applications often make use of existing, pretrained models,\n\nan internal or external model hub becomes a valuable part of the infrastructure. It appears here in\n\nproduction to illustrate using an existing base model that is then fine-tuned in production. 
Without fine-\n\ntuning, this hub would mainly be used in development.\n\n\u0007 **Fine-Tune LLM:** Instead of de novo Model Training, LLM applications will generally fine-tune an existing\n\nmodel (or use an existing model without any tuning). Fine-tuning is a lighter-weight process than training,\n\nbut it is similar operationally.\n\n\u0007 **Vector Database:** Some (but not all) LLM applications use vector databases for fast similarity searches,\n\nmost often to provide context or domain knowledge in LLM queries. We replaced the Feature Store (and\n\nits Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n\nthat these data stores and jobs are analogous in terms of operations.\n\n\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n\nAPI calls, such as to internal or third-party LLM APIs. Operationally, this adds complexity in terms of\n\npotential latency or flakiness from third-party APIs, as well as another layer of credential management.\n\n\u0007 **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML\n\nbut become essential in most LLM applications. Human feedback should be managed like other data,\n\nideally incorporated into monitoring based on near real-time streaming.\n\n\n[Alternatives”](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n\n\n-----\n\n#### Looking ahead", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "70c4fc510f690efc8e9e44abb2e20db6", + "its Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n\nthat these data stores and jobs are analogous in terms of operations.\n\n\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n\nAPI calls, such as to internal or third-party LLM APIs. Operationally, this adds complexity in terms of\n\npotential latency or flakiness from third-party APIs, as well as another layer of credential management.\n\n\u0007 **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML\n\nbut become essential in most LLM applications. Human feedback should be managed like other data,\n\nideally incorporated into monitoring based on near real-time streaming.\n\n\n[Alternatives”](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n\n\n-----\n\n#### Looking ahead\n\nLLMs only became mainstream in late 2022, and countless libraries and technologies are being built to\n\nsupport and leverage LLM use cases. You should expect rapid changes. More powerful LLMs will be open-\n\nsourced; tools and techniques for customizing LLMs and LLM pipelines will become more plentiful and\n\nflexible; and an explosion of techniques and ideas will gradually coalesce into more standardized practices.\n\nWhile this technological leap provides us all with great opportunities, the use of cutting-edge technologies\n\nrequires extra care in LLMOps to build and maintain stable, reliable LLM-powered applications. The good\n\nnews is that much of your existing MLOps tooling, practices and knowledge will transfer smoothly over to\n\nLLMs. 
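The Model Serving note earlier in this section flags latency and flakiness of external LLM APIs, plus credential management, as new operational concerns. A generic sketch of a guarded external call with a timeout, bounded retries and a credential read from the environment; the endpoint URL and payload shape are placeholders rather than any specific provider's API.

```python
import os
import time

import requests

LLM_ENDPOINT = "https://llm.example.com/v1/generate"  # placeholder endpoint
API_KEY = os.environ.get("LLM_API_KEY", "")           # prefer a secrets manager in production


def call_external_llm(prompt: str, max_retries: int = 3, timeout_s: float = 10.0) -> str:
    """Call a third-party LLM API with a timeout and simple exponential backoff."""
    for attempt in range(max_retries):
        try:
            resp = requests.post(
                LLM_ENDPOINT,
                headers={"Authorization": f"Bearer {API_KEY}"},
                json={"prompt": prompt},
                timeout=timeout_s,
            )
            resp.raise_for_status()
            return resp.json().get("text", "")
        except (requests.Timeout, requests.ConnectionError, requests.HTTPError):
            if attempt == max_retries - 1:
                raise
            time.sleep(2**attempt)  # back off before retrying a flaky call
    return ""
```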
With the additional tips and practices mentioned in this section, you should be well set up to harness\n\nthe power of large language models.\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n9,000 organizations worldwide — including Comcast,\n\nCondé Nast and over 50% of the Fortune 500 — rely\n\non the Databricks Lakehouse Platform to unify their\n\ndata, analytics and AI. Databricks is headquartered\n\nin San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark ™ ,\n\nDelta Lake and MLflow, Databricks is on a mission\n\nto help data teams solve the world’s toughest\n\nproblems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "83fa8c714cfff14256cf56543c98c4cb", + "**eBook**\n\n# Accelerate Digital Transformation in Insurance With Data, Analytics and AI\n\n### Real-world use cases with Databricks Lakehouse\n\n\n-----\n\n## Contents\n\nIntroduction ................................................................................................................................................................................................................ **03**\n\nThree Trends Driving Transformation in Insurance .............................................................................................................................. **05**\n\nThe Need for Modern Data Infrastructure ................................................................................................................................................. **06**\n\nCommon Challenges Insurers Face Using Legacy Technology ...................................................................................................... **08**\n\nWhy Lakehouse for Insurance ............................................................................................................................................................................ **10**\n\nKey Use Cases for Insurance:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "5014f5f2c09c55edb470c8b5528eb000", + "Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\n\nKey Use Cases for Insurance:\n\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\n\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\n\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... 
**16**\n\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "fba83e9ab8b12d3c768f58c396c23616", + "**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\n\nGlobal Regulatory Impact in Insurance ......................................................................................................................................................... **18**\n\n**I N D U S T R Y S O L U T I O N S :** Get Started With Accelerators, Brickbuilders and Enablers ............................................................ **19**\n\nGet Started With Industry Solutions ............................................................................................................................................................. **20**\n\nConclusion ................................................................................................................................................................................................................... **26**\n\n\n-----\n\n## Introduction\n\nWith the rapid advancement of technology, rising consumer expectations, and strong competition between insuretechs and incumbents resulting\nfrom the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. Today, new\ninsights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\ndata from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\nclickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "a89b2827b1da6463d7f2aa3ebcea8079", + "-----\n\n## Introduction\n\nWith the rapid advancement of technology, rising consumer expectations, and strong competition between insuretechs and incumbents resulting\nfrom the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. Today, new\ninsights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\ndata from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\nclickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.\n\nConsumers want stronger reassurance for what they value most: financial security and greater peace of mind.\nInsurers have always prided themselves on delivering such protection and security. However, customer needs\nhave changed, and insurers that move most swiftly to satisfy them will be in the best position to navigate\nchallenging times. 
The bottom line is that insurers must adapt to these changes and meet the evolving needs of\ntheir customers to remain competitive.\n\nData-driven insurers will seek opportunities to improve the customer experience, develop more sophisticated\npricing models, and increase their operational resilience. More than ever, the total cost of ownership (TCO) of\ndigital investments and enterprise data strategy has become a top priority for boards and senior executives\nin the insurance industry. So, what does this mean from a data and analytics perspective? It all comes down\nto having one reliable source of truth for data, which is derived from batch and streaming data, structured and\nunstructured data, from multiple clouds and jurisdictions.\n\n\nIn a regulated and risk-averse industry where data sharing was once seen as optional, it has now become\nfundamental. To compete in the digital economy, insurers need an open and secure approach to data sharing.\nDatabricks Lakehouse for Insurance plays a critical role in helping insurance providers accelerate innovation and\ntransform their businesses, resulting in significant operational efficiencies and improved customer experiences\nat a fraction of the cost of data warehouses. This eBook provides an in-depth exploration of key challenges\nand common use cases in the insurance industry. Most importantly, you will gain insight into how Databricks\nLakehouse can unlock the true value of your data through practical Solution Accelerators and a wide range of\npartners available to assist you on your journey.\n\n\n**The future of insurance will**\n\n**become increasingly data-driven,**\n\n**and analytics enabled.”**\n\n**[EY’s](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)** [“Five principles for the future of protection”](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)\n\n\n-----\n\nThe Lakehouse reference architecture below illustrates a sample framework upon\nwhich insurers can build. Moving from left to right in the diagram, the first layer\nrepresents various data sources such as on-premises systems, web and mobile\napplications, IoT sensors, enterprise data warehouses, and third-party APIs. Data\nis then ingested through automated data pipelines, and processed within the\nLakehouse platform across three layers (Bronze, Silver and Gold). These layers\nare responsible for data preparation, including ML model registry, centralized\n\n\ngovernance, workflow orchestration, and job scheduling. They ensure a compliant\nand secure infrastructure that sits atop the cloud layer (or multiple clouds),\neliminating the need for data duplication. Finally, the transformed data is delivered\nas actionable insights and supports use cases such as automated reporting,\nbusiness analytics, customer 360, and claims analytics. 
These use cases not only\nmitigate risk but also drive revenue.\n\n\n**Data Sources**\n\n**On-Premises**\n**Servers**\n\n\n**Ingestion**\n\n\n**Lakehouse for Financial Services**\n\n**Bronze Layer** **Silver Layer** **Gold Layer**\n\n\n**Serving**\n\n**Automated**\n**Reporting**\n\n\n**Web and Mobile**\n**Applications**\n\n\n**Business Analytics**\n**and Interactive**\n**Dashboards**\n\n\n**Raw Entity Data**\n\n\n**Curated Feature**\n**Sets**\n\n\n**Aggregated**\n**Business Views**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "76dd835dcbfd9e1aaaff490ac84f4e71", + "**Data Sources**\n\n**On-Premises**\n**Servers**\n\n\n**Ingestion**\n\n\n**Lakehouse for Financial Services**\n\n**Bronze Layer** **Silver Layer** **Gold Layer**\n\n\n**Serving**\n\n**Automated**\n**Reporting**\n\n\n**Web and Mobile**\n**Applications**\n\n\n**Business Analytics**\n**and Interactive**\n**Dashboards**\n\n\n**Raw Entity Data**\n\n\n**Curated Feature**\n**Sets**\n\n\n**Aggregated**\n**Business Views**\n\n\n**Automated Data Pipelines**\n**(Batch or Streaming)**\n\n**Collaborative**\n**Data Source**\n\n\n**Internet-of-Things**\n**(IoT) Devices**\n\n\n**Enterprise Data**\n**Warehouses**\n\n\n**Third-Party APIs**\n**and Services**\n\n\n**ML Model**\n**Registry**\n\n\n**Centralized Data**\n**Governance**\n\n\n**Workflow**\n**Orchestration**\n\n\n**Productionized**\n**Referenced Data**\n**and Models**\n\n**Job Scheduling**\n\n\n-----\n\n## Three Trends Driving Transformation in Insurance\n\nOver the next decade, technology-enabled insurance companies will bear little resemblance to today’s organizations.\nThe following three trends are driving this transformation in the insurance industry:\n\n\n**The rapid emergence of large language**\n**models and generative AI**\n\nIn recent years, there has been a significant\nbreakthrough in the field of artificial intelligence with\nthe emergence of large language models (LLMs)\nand generative AI. These models, such as GPT-4 and\nits predecessors, Databricks Dolly and others are\nbuilt using deep learning techniques and massive\namounts of training data, enabling them to generate\nhuman-like text and perform a wide range of natural\nlanguage processing tasks. LLMs and generative AI\ncan help insurance companies automate repetitive\ntasks such as underwriting, claims processing,\n\nand customer service, improving efficiency and\nreducing costs. They can also help insurers to better\nunderstand customer needs and preferences,\nleading to more personalized products and services.\nHowever, as with any disruptive technology, the\nadoption of LLMs and generative AI will require\ncareful consideration of ethical and regulatory\nissues, such as data privacy and algorithmic bias.\n\n\n**Transformed ecosystems**\n**and open insurance**\n\n[According to EY](https://assets.ey.com/content/dam/ey-sites/ey-com/en_gl/topics/insurance/ey-2022-global-insurance-outlook-report.pdf) , leading companies leverage\ninsurtechs in their ecosystems to achieve high\nmargins in commoditized products. Open insurance,\nwhich involves sharing and managing insurancerelated data through APIs, is more than an item in\nthe regulatory agenda. 
It can give consumers access\nto better products and accurate pricing, as well as\nenable them to execute transactions more easily.\nIn its [annual Chief Data Officer Survey](https://www.gartner.com/smarterwithgartner/data-sharing-is-a-business-necessity-to-accelerate-digital-business) , Gartner\nfound that organizations that promote external data\nsharing have three times the measurable economic\n\nbenefit across a variety of performance metrics\ncompared to their peers.\n\n\n**Revised target operating model**\n**with a focus on talent**\n\nDemographic shifts and perennial cost pressures\nmake it critical for insurers to attract and retain\ntalent. Consequently, it’s important for insurers\nto equip their workforces with the right tools\nand technologies to help them identify business\nprocesses that can be optimized to differentiate\nthemselves from their competitors, with an emphasis\non moments that matter in the customer journey,\naccording to EY. Recent research from Deloitte\nhighlights the advantages of upskilling and building\na future-ready workforce. One of the benefits", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "2db840ab2d9248420f54aab2c527ca4b", + "benefit across a variety of performance metrics\ncompared to their peers.\n\n\n**Revised target operating model**\n**with a focus on talent**\n\nDemographic shifts and perennial cost pressures\nmake it critical for insurers to attract and retain\ntalent. Consequently, it’s important for insurers\nto equip their workforces with the right tools\nand technologies to help them identify business\nprocesses that can be optimized to differentiate\nthemselves from their competitors, with an emphasis\non moments that matter in the customer journey,\naccording to EY. Recent research from Deloitte\nhighlights the advantages of upskilling and building\na future-ready workforce. One of the benefits\n\nof AI adoption in the workforce is that it enables\norganizations to automate a wide range of business\nprocesses, boosting speed and efficiency. But what’s\neven more important is that it enables employees to\nfocus on higher-value work, according to Deloitte.\n\n\n-----\n\n## The Need for Modern Data Infrastructure\n\n**Insurers turning to cloud and data analytics**\n\n\nThe insurance industry has undergone significant changes over the years, and\none of the areas that has evolved the most is data management. With the\ngrowing need for advanced analytics and digital transformation, many insurance\ncompanies are turning to cloud technology and modern data infrastructures\nto enhance their data management strategies. The benefits of adopting cloud\ntechnology are numerous, particularly the ability to efficiently store and quickly\naccess vast amounts of data, which is crucial in a heavily regulated and datadriven industry like insurance. 
Additionally, the flexibility of the cloud enables\ninsurers to scale costs, adapt to changing work environments, and meet evolving\ncustomer and business requirements.\n\n\ndynamic pricing and underwriting, and form the foundation for claims automation.\nBy implementing advanced analytics, insurers can innovate more easily, scale their\nbusinesses, and bring new products to market more quickly.\n\nTo remain competitive, insurance companies must increase their investment in\ncloud technology and data analytics, as this will accelerate insightful decisionmaking across various functions such as claims management, underwriting,\npolicy administration, and customer satisfaction. Overall, the adoption of cloud\ntechnology and data analytics is imperative for insurance providers to enhance\noperational efficiency, improve business processes, and stay relevant in today’s\nfast-paced business landscape.\n\n\nFurthermore, insurance providers can leverage the cloud to analyze customer\ndata at scale, gaining insights into behaviors that drive hyper-personalization,\n\n\n-----\n\n**Let’s take a closer look look at a few examples:**\n\n\n**Auto insurers** need to integrate new data sources, such as weather and traffic,\nto build solutions capable of real-time processing. This enables them to alert\nemergency services promptly and gain a better understanding of drivers’ driving\npatterns. It also enables the development of sophisticated machine learningbased risk assessment, underwriting and claims models.\n\n**Commercial insurance** , including property, general liability, cyber insurance and\nbusiness income insurance, utilizes ML-based automation of actuarial models.\nThis automation facilitates underwriting, claims forecasting and dynamic pricing\nfor their customers. Another notable trend in recent years is the use of IoT-\n\n\nbased alerting for sensitive or valuable commodities. For example, in the case of\nvaccines, IoT sensors can monitor the temperature in real time and send alerts to\nthe appropriate team or person if the temperature exceeds acceptable thresholds.\nThis is crucial as vaccines must be stored within specific temperature ranges.\n\nIn **life insurance** , complex ML models can be employed to create a profile of\nthe customer’s lifestyle and, importantly, detect any changes to it. This deeper\nunderstanding and 360-degree view of the customer enable more customized\nunderwriting and pricing based on the policyholder’s current health, lifestyle and\neating habits.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "46e48686f920f1e618b993de52ae72d0", + "**Commercial insurance** , including property, general liability, cyber insurance and\nbusiness income insurance, utilizes ML-based automation of actuarial models.\nThis automation facilitates underwriting, claims forecasting and dynamic pricing\nfor their customers. Another notable trend in recent years is the use of IoT-\n\n\nbased alerting for sensitive or valuable commodities. For example, in the case of\nvaccines, IoT sensors can monitor the temperature in real time and send alerts to\nthe appropriate team or person if the temperature exceeds acceptable thresholds.\nThis is crucial as vaccines must be stored within specific temperature ranges.\n\nIn **life insurance** , complex ML models can be employed to create a profile of\nthe customer’s lifestyle and, importantly, detect any changes to it. 
This deeper\nunderstanding and 360-degree view of the customer enable more customized\nunderwriting and pricing based on the policyholder’s current health, lifestyle and\neating habits.\n\n\n|Type of Data Source|Typical Vendors|High-priority business use caes Claims Automation Dynamic Pricing Anomoly Detection Customer 360 and and Transformation and Underwriting and Fraudulent Claims Hyper-Personalization|Col4|Col5|Col6|\n|---|---|---|---|---|---|\n|Policy data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork|||||\n|Claims data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork, TransUnion|||||\n|Real-time ingestions|Cambridge Mobile Telematics, Zendrive, Custom|||||\n|Alternative / Supplemental data|Experian, Equifax, Verisk, IBM Weather|||||\n|Marketing data|Salesforce, HubSpot, Google Analytics|||||\n\n\n**Figure 1.** Innovating with data and analytics — use cases made possible and key data sources from popular insurance vendors\n\n\n-----\n\n## Common Challenges Insurers Face Using Legacy Technology\n\n\nModernization is not an easy process for insurers, and while transforming IT\necosystems is necessary to improve business outcomes, ensuring business\ncontinuity is absolutely critical. However, the volume of data they collect, along\nwith changes in user behavior and legacy systems that can’t handle this amount of\ndata, are forcing insurance providers to accelerate their modernization journeys.\n\nInsurance providers face several challenges when using legacy technology, including:\n\n**Legacy on-premises systems:** Legacy on-premises systems are not only\nexpensive to maintain, but they also store large amounts of big data in silos across\nthe business. This makes it difficult to access the data, hindering data analytics\nefforts and limiting executives’ ability to make informed business decisions.\n\n**Ingesting large volumes of transactional data in real time:** The inability to\ningest data from transaction systems in real time is a major obstacle to obtaining\ncritical insights. Transaction logs from operations such as policy administration,\nenrollment and claims constantly stream data. However, many insurance\ncompanies still rely on legacy data warehouses built around batch processing,\nwhich is not suitable for ingesting and integrating large data sets. As a result,\ninsurers often opt to ingest data nightly, leading to delays in receiving accurate\ndata for decision-making.\n\n\n**Performing fine-grained analysis at scale within tight time frames:** Legacy\ntechnology forces insurers to make a trade-off when analyzing data for user intent.\nThey can choose between detailed and accurate predictions or fast predictions.\nRunning detailed forecasts can improve accuracy, but it requires performing\nmillions of model calculations within narrow service windows, which exceeds the\ncapability of legacy data platforms. Consequently, insurance companies have to\naccept less accurate predictions.\n\n**Powering real-time decisions on the front line:** Serving real-time data to\nthousands of workers is a complex task. While data warehouses can serve reports\nto large groups of users, they are limited to providing stale data. 
As a result, most\ninsurers only provide daily or weekly updates to reports and rely on employees’\njudgment for more frequent decisions.\n\n**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim\nto deliver personalized experiences across every channel, both digital and offline.\nWhile insurance providers have access to vast amounts of customer data, off-theshelf tools for personalization and customer segmentation struggle to handle such\nhigh volumes, leading to inaccurate analytics. To succeed in the insurance industry,\ncompanies must deliver personalized experiences at scale.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "2751342200e6294e9cde3b924bddfeda", + "**Powering real-time decisions on the front line:** Serving real-time data to\nthousands of workers is a complex task. While data warehouses can serve reports\nto large groups of users, they are limited to providing stale data. As a result, most\ninsurers only provide daily or weekly updates to reports and rely on employees’\njudgment for more frequent decisions.\n\n**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim\nto deliver personalized experiences across every channel, both digital and offline.\nWhile insurance providers have access to vast amounts of customer data, off-theshelf tools for personalization and customer segmentation struggle to handle such\nhigh volumes, leading to inaccurate analytics. To succeed in the insurance industry,\ncompanies must deliver personalized experiences at scale.\n\n\n-----\n\nDatabricks Lakehouse for Insurance addresses the key challenges faced across the\ninsurance value chain. The lakehouse enables the integration of various data types,\nincluding images and structured data, in real time. It offers robust management\nand governance capabilities, and rapidly transforms data into actionable insights\n\n\nthrough real-time reporting and predictive analytics. 
This platform-as-a-service\nsolution delivers exceptional speed and industry-leading total cost of ownership,\nproviding insurers with faster insights to enhance the customer experience and\ngain a competitive edge.\n\n\n**Product**\n**Development &**\n**Feature Selection**\n\n\n**Application**\n**Review &**\n**Submission**\n\n\n**Policy Issue,**\n**Service &**\n**Administration**\n\n\n**Sales & Lead**\n**Management**\n\n**Hyperpersonalization/**\n**life events**\n\n\n**Underwriting**\n**and Pricing**\n\n**UW rules**\n**guidelines &**\n**technical pricing**\n\n\n**Rating Offer &**\n**Endorsements**\n\n**Evaluate**\n**rate options,**\n**pricing and**\n**endorsements**\n\n\n**Claims**\n\n\n**Coverage/** **Review policy**\n**features/riders** **documents**\n**(submission)**\n\n\n**Omnichannel** **Fraud, frequency,**\n**severity and**\n**reserves**\n\n\n**We continuously develop solution accelerators and enablers to accelerate the time to market.**\n\n\n\n**•** Dynamic segmentation\n\n**•** Personas\n\n**•** Hyper-personalization\n\n**•** Intelligent automation\n\n\n\n**•** Product architecture and\nmanufacturing\n\n**•** Configurable products\n\n**•** Competitor rates\n\n\n\n**•** Reflexive questionnaire\n\n**•** LLM assistance for\ndocument summarization\n\n**•** NLP for unstructured data\n\n\n\n**•** Evaluation of risk within\nappetite\n\n**•** Validation of UW\nrequirements\n\n**•** Straight-through\nprocessing optimization\n\n**•** Risk assessment via\nactuarial pricing\n\n**•** Triaging of risk to\nunderwriter SME for policy/\nexposure changes\n\n\n\n**•** Predict loss cost\n(frequency and severity)\n\n**•** Computer vision on images\nto identify loss\n\n**•** Auto-adjudication and\ntriaging of claims to claim\nadjuster\n\n**•** Tailor communication by\nsegment (e.g., email, text,\nmail, or omnichannel)\n\n**•** Identify Fraud, Waste and\nAbuse, route to ICU\n\n\n**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs)\n\n\n-----\n\n## Why Lakehouse for Insurance", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "63622c61194bbab78b3598f578b3dc38", + "**•** Evaluation of risk within\nappetite\n\n**•** Validation of UW\nrequirements\n\n**•** Straight-through\nprocessing optimization\n\n**•** Risk assessment via\nactuarial pricing\n\n**•** Triaging of risk to\nunderwriter SME for policy/\nexposure changes\n\n\n\n**•** Predict loss cost\n(frequency and severity)\n\n**•** Computer vision on images\nto identify loss\n\n**•** Auto-adjudication and\ntriaging of claims to claim\nadjuster\n\n**•** Tailor communication by\nsegment (e.g., email, text,\nmail, or omnichannel)\n\n**•** Identify Fraud, Waste and\nAbuse, route to ICU\n\n\n**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs)\n\n\n-----\n\n## Why Lakehouse for Insurance\n\nDatabricks Lakehouse for Insurance combines simplicity, flexibility and reusability, enabling insurers to meet the demands of the market with speed and agility. It offers\nbest-in-industry performance and serves as a modern data architecture that provides differentiated capabilities for insurers to thrive in a competitive industry.\n\n\n\n**•** Insurance companies can store any type of\ndata using Databricks Lakehouse for Insurance,\nleveraging the low-cost object storage supported\nby cloud providers. 
This helps break down data\nsilos that hinder efforts to aggregate data for\nadvanced analytics, such as claim triaging and\nfraud identification, regulatory reporting, or\ncompute-intensive risk workloads. Another critical\nfeature is the time-travel capabilities of the\nlakehouse architecture, allowing insurers to access\nany historical version of their data.\n\n\n\n**•** Supporting streaming use cases, such as\nmonitoring transaction data, is easier with the\nlakehouse. It utilizes Apache Spark ™ as the data\nprocessing engine and Delta Lake as the storage\nlayer. Spark enables seamless switching between\nbatch and streaming workloads with just a single\nline of code. Delta Lake’s native support for ACID\ntransactions ensures reliable and high-performing\nstreaming workloads.\n\n\n\n**•** For both machine learning and non-machine\nlearning insurance models, a comprehensive\ngovernance framework is provided. Data, code,\nlibraries and models are linked and independently\nversion controlled using technologies like Delta\nLake and MLflow. Delta Lake ensures stability by\nallowing insurance companies to declare their\nexpectations for data quality upfront. MLflow\nenables training models in any language and\ndeploying them anywhere, minimizing the need for\ncomplex handoffs between data science practices,\nindependent validation units and operational teams.\n\n\n-----\n\n**Level-up value with Databricks Lakehouse for insurance**\n\nBuilding your data lakehouse with the Databricks Lakehouse Platform empowers your organization with the speed, agility and flexibility needed to address critical insurance\nuse cases that have a significant impact on your customers and your business. Additionally, it helps lower the total cost of ownership (TCO).\n\nWith a modern and unified data architecture, the Databricks platform enables the implementation of your data, analytics and AI strategy at scale on a unified and modern\ncloud data architecture. The key benefits include:\n\n\n**1. Cost and complexity reduction**\n\nThe Databricks Lakehouse provides an open, simple\nand unified cloud data management architecture\nthat streamlines operational inefficiencies, reduces\nIT infrastructure costs, and enhances productivity\nacross teams.\n\n\n**2. Enhanced risk management and control**\n\nBy unlocking the value of enterprise data, the\nplatform helps reduce corporate governance and\nsecurity risks. It facilitates data-driven decisionmaking through governed discovery, access and\ndata sharing.\n\n\n**3. Accelerated innovation**\n\nThe platform enables the acceleration of digital\ntransformation, modernization and cloud migration\ninitiatives, fostering new growth opportunities\nand driving innovation for improved customer and\nworkforce experiences.\n\n\nTo help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n\n\n-----\n\n**Reference Architecture for Smart Claims**\n\n\n**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n\nor incrementally through change data capture (CDC). 
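The chunk above makes two concrete technical claims worth grounding: Spark can switch the same Delta table between batch and streaming reads by swapping a single call, and Delta Lake's time travel exposes earlier versions of a table. A minimal PySpark sketch of both ideas, assuming a hypothetical Delta path and version number used purely for illustration:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Hypothetical Delta table of claims events, for illustration only.
claims_path = "/mnt/lakehouse/silver/claims"

# Batch read of the table's current state.
batch_df = spark.read.format("delta").load(claims_path)

# The streaming equivalent differs only in read -> readStream.
stream_df = spark.readStream.format("delta").load(claims_path)

# Time travel: read the table as of an earlier version
# (option("timestampAsOf", "...") works the same way for a point in time).
v10_df = spark.read.format("delta").option("versionAsOf", 10).load(claims_path)
```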
These\n\ninclude structured and unstructured data sets like images, text,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "9e5658fb4d571359c7d2e349d628c5ce", + "The Databricks Lakehouse provides an open, simple\nand unified cloud data management architecture\nthat streamlines operational inefficiencies, reduces\nIT infrastructure costs, and enhances productivity\nacross teams.\n\n\n**2. Enhanced risk management and control**\n\nBy unlocking the value of enterprise data, the\nplatform helps reduce corporate governance and\nsecurity risks. It facilitates data-driven decisionmaking through governed discovery, access and\ndata sharing.\n\n\n**3. Accelerated innovation**\n\nThe platform enables the acceleration of digital\ntransformation, modernization and cloud migration\ninitiatives, fostering new growth opportunities\nand driving innovation for improved customer and\nworkforce experiences.\n\n\nTo help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n\n\n-----\n\n**Reference Architecture for Smart Claims**\n\n\n**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n\nor incrementally through change data capture (CDC). These\n\ninclude structured and unstructured data sets like images, text,\n\nand video, such as IoT sensor data, operational data like claims\n\nand policies, and on-prem or third-party data such as from\n\ncredit bureaus, weather, and driving records. Partner Connect\n\noffers a range of ingest tools from different vendors that you can\n\ndirectly use from the Databricks portal.\n\n\n**2.** \u0007Delta Live Tables (DLT) is the preferred ETL\n\npath to transform the data based on business\n\nrequirements. All the data resides in cloud storage,\n\nwhere Delta refines it into Bronze, Silver and Gold\n\nzones of a medallion pipeline blueprint. Databricks\n\nWorkflows provide orchestration of the various\n\ndependent tasks, with advanced capabilities like\n\n\n**3.** \u0007Databricks SQL, with Photon\n\nand serverless options, caters\n\nto BI consumption use cases to\n\nrefresh a dashboard monitoring\n\nkey metrics and KPIs, with\n\nquery history and alerts on\n\ncritical events.\n\n\n**4.** \u0007Databricks ML Runtime,\n\nMLFlow, along with\n\nFeature Store, Auto ML,\n\nand real-time Model\n\nServing enable ML\n\nuse cases to provide\n\n\n**5.** \u0007Delta Sharing provides\n\na secure and governed\n\nway of sharing data\n\ninternally and externally\n\nwithout copying it,\n\nusing Unity Catalog.\n\n\npredictive insights.\n\n\nretry, repair and job status notifications.\n\n\n-----\n\n**Secure data sharing with Delta Lake**\n\nAt the heart of Databricks Lakehouse for Insurance is a technology that allows insurers to overcome the trade-offs between speed and accuracy. Technologies like Delta\nLake enable the lakehouse, which combines the strengths of data warehouses and data lakes, to directly address these challenges. 
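Step 2 above describes Delta Live Tables refining data through Bronze, Silver and Gold zones of a medallion pipeline. Below is a minimal DLT sketch of the first two hops, not the cookbook's own pipeline: the source path, table names and the expectation rule are hypothetical, and `spark` is injected by the DLT runtime.

```python
import dlt
from pyspark.sql import functions as F

# Hypothetical raw-claims landing zone; adjust to your environment.
RAW_CLAIMS_PATH = "/Volumes/main/insurance/raw_claims"

@dlt.table(comment="Bronze: claims ingested as-is with Auto Loader.")
def claims_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load(RAW_CLAIMS_PATH)
    )

@dlt.table(comment="Silver: cleaned claims with a basic quality check.")
@dlt.expect_or_drop("valid_claim_id", "claim_id IS NOT NULL")
def claims_silver():
    return (
        dlt.read_stream("claims_bronze")
        .withColumn("ingested_at", F.current_timestamp())
    )
```

A Gold aggregate would follow the same pattern, reading from `claims_silver` and producing the business-level views consumed by Databricks SQL dashboards.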
With Delta Lake, insurance providers can\nunify all their data — structured and unstructured, batch and real-time — in one centrally managed and governed location.\n\nOnce the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information.\nThey can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases.\n\n**Business intelligence**\n\n**Streaming**\n\n**Centralized**\n**governance**\n\n\n##### Lakehouse Platform\n\n\n**Data science / ML**\n\n**One copy**\n**of data**\n\n**Data warehouse**\n\n**Orchestration**\n\n\n-----\n\n**K E Y U S E C A S E**\n\n## Claims automation and transformation\n\n**Overview**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "013e9842fece7a621a03d16a704b8220", + "Once the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information.\nThey can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases.\n\n**Business intelligence**\n\n**Streaming**\n\n**Centralized**\n**governance**\n\n\n##### Lakehouse Platform\n\n\n**Data science / ML**\n\n**One copy**\n**of data**\n\n**Data warehouse**\n\n**Orchestration**\n\n\n-----\n\n**K E Y U S E C A S E**\n\n## Claims automation and transformation\n\n**Overview**\n\n\nInsurers are entering a new era of claims transformation, supported by evolving technological advancements\nand increasing data availability. Leveraging the Databricks Lakehouse, organizations can deal with the massive\namount of structured and unstructured data coming in from different sources, in different formats, and time\nframes. Every touchpoint in the claims journey — beginning even before an incident occurs — can be supported\nby a combination of technology and human intervention that seamlessly expedites the process.\n\n**Business problem**\n\nMissing data, or data that is “not in good order” and needs to be corrected before processing, leads to claims\nleakage and inefficient processes in triaging claims to the right resource.\n\n**Solution/value with Databricks**\n\nEnable triaging of claims and resources by leveraging big data processing and integrated ML and AI capabilities,\nincluding MLflow model lifecycle management.\n\n**Business outcomes and benefits**\n\n**•** Decrease in annual claims payout\n\n**•** Increase in claim fraud detection/prevention\n\n**•** Improve efficiencies by 15%\n\n**“Applying AI as broadly, as aggressively**\n\n**and as enthusiastically as possible. No part**\n\n**of our business should be untouched by it.”**\n\n— \u0007Masashi Namatame, Group Chief Digital Officer,\nManaging Executive Officer, Tokio Marine\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**Tokio Marine: Striving to**\n**become Al-driven**\n\nInsurers of all types now routinely use AI\nmodels to drive underwriting, streamline claims\nprocessing and accelerate claims adjudication,\nprotect against insurance fraud, and improve\nrisk forecasting, for example. 
Tokio Marine —\nJapan’s oldest insurance company, which has\ndone business since 1879 — has been applying\nadvanced uses of AI, particularly in its auto\ninsurance business, says Masashi Namatame,\nGroup Chief Digital Officer and Managing\nExecutive Officer at Tokio Marine: “To assess\ncollision damages, the company uses an AIbased computer vision solution to analyze\nphotos from accident scenes.” Comparing these\nwith what he describes as “thousands or even\nmillions” of photos of past analogous incidents,\nthe model produces liability assessments of the\nparties involved and projects anticipated repair\ncosts. AI has also provided the company with\ntangible benefits in online sales — especially in\npersonalized product recommendations and\ncontract writing, according to Namatame. Read\nthe case study in the [MIT CIO vision 2025 report](https://www.databricks.com/resources/whitepaper/mit-cio-vision-2025) .\n\n\n-----\n\n**K E Y U S E C A S E**\n## Dynamic pricing and underwriting\n\n**Overview**\n\n\nIn modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\ncarriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\nThis involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\ngeolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\noffers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n\n**Business problem**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "808dbdaefdc79410d236eb7fb1e575ae", + "-----\n\n**K E Y U S E C A S E**\n## Dynamic pricing and underwriting\n\n**Overview**\n\n\nIn modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\ncarriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\nThis involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\ngeolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\noffers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n\n**Business problem**\n\nActuaries are spending valuable time on low-value activities, which hampers agility and advanced analytical\ncapabilities in pricing and underwriting, hindering improvements in risk and pricing modeling.\n\n**Solution/value with Databricks**\n\n**•** Unified cloud-native platform\n\n**•** Scalability for ingesting IoT data from millions of trips, expanding the customer base\n\n**•** Reduced total cost of ownership compared to legacy Hadoop systems\n\n**•** Usage-based pricing, leading to lower premiums for customers and reduced risk for insurance carriers, thereby\nlowering loss ratios\n\n**•** Enables the creation of a digitally enabled, end-to-end underwriting experience\n\n**Business outcomes and benefits**\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**American financial services**\n**mutual organization**\n\nThis organization aimed to leverage the vast\namounts of structured and unstructured data\nit collected to enhance its underwriting and\ndecision-making processes, enabling greater\nefficiency and effectiveness. 
However, the\ncompany’s legacy infrastructure struggled\nto scale with the increasing data volume and\nprocessing demands, limiting its ability to\nanalyze the data and derive actionable insights.\n\nWith Databricks, the insurer centralized\neverything on one unified Lakehouse platform,\n\nsupporting all operational and analytical\nuse cases. This allowed them to analyze\nbroader sets of data for superior underwriting\nperformance and create a digitally empowered,\nend-to-end underwriting experience.\n\n\n\n**•** Improve competitive position\n\n**•** Decrease combined ratio\n\n**•** 15% improvement in efficiencies\n\n\n-----\n\n**K E Y U S E C A S E**\n## Anomaly detection and fraudulent claims\n\n**Overview**\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**One of the largest U.S.**\n**insurance companies and a**\n**leading small business insurer**\n\nThe increasing availability of data and market\ncompetition challenge insurance providers to\noffer better pricing to their customers. This\nU.S.-based insurer, with hundreds of millions of\ninsurance records to analyze for downstream\nML, realized that their legacy batch analysis\nprocess was slow and inaccurate, providing\nlimited insight for predicting the frequency\nand severity of claims. With Databricks, they\nwere able to scale up the use of deep learning\nmodels, resulting in more accurate pricing\npredictions and increased revenue from\nclaims. By leveraging Databricks Lakehouse,\nthey harmonized data, analytics and AI at\nscale, enabling accurate pricing predictions\nand supporting various use cases from vehicle\ntelematics to actuarial modeling.\n\n\nFraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\nAmerican consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\nin 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\nchange to support new channels and services, offering transactional features and facilitating payments through\ndigital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\nconsumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\nmodels. It often involves a complex decision science process that combines a rules engine with a robust and\nscalable machine learning platform.\n\n**Business problem**\n\nInsurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n\n**Solution/value with Databricks**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "fec16e5ce1d62014b12f9de0cbc3e75c", + "Fraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\nAmerican consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\nin 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\nchange to support new channels and services, offering transactional features and facilitating payments through\ndigital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\nconsumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\nmodels. 
It often involves a complex decision science process that combines a rules engine with a robust and\nscalable machine learning platform.\n\n**Business problem**\n\nInsurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n\n**Solution/value with Databricks**\n\nModernized approaches in insurance require full digital transformation, including the adoption of usagebased pricing to reduce premiums. Insurance providers now consume data from the largest mobile telematics\nproviders (e.g., CMT) to obtain granular sensor and trip summaries for users of online insurance applications.\nThis data is crucial not only for pricing but also for underwriting scenarios to mitigate risks for carriers.\n\n**$1 of fraud costs companies 3.36x in chargeback,**\n**replacement and operational costs**\n\n\n[Lexis Nexis](https://risk.lexisnexis.com/insights-resources/research/2020-true-cost-of-fraud-retail)\n\n\n-----\n\n**K E Y U S E C A S E**\n\n## Customer 360 and hyper-personalization\n\n\n**Overview**\n\nWinning the hearts and minds of your customers\nstarts with personalizing the user experience. The\nability to offer complementary products to meet\nthe needs of your customers lets you build deeper\nrelationships with them and engender their loyalty.\nIn addition, a better understanding of the finer\ndetails within accounts allows you to offer them\nmore personalized products. To do this, you need\n360-degree customer views, which requires you to\nlocate and consolidate all your customers’ contact\ndata from every digital tool that you use and house\nit in one central location. With Databricks Lakehouse,\ninsurers can “hyper-personalize,” increase\ncross-sell/upsell opportunities, enhance customer\n360 and bring new products to market faster.\n\n**Business problem**\n\nThe inability to reconcile customer records across\ndifferent lines of business limits real-time customer\ninsights necessary for upselling and cross-selling.\nSiloed data makes it challenging to create accurate\nand comprehensive customer profiles, resulting in\nsuboptimal recommendations for the next best action.\n\n\n**Solution/value with Databricks**\n\nDatabricks provides the tools needed to process\nlarge volumes of data and determine the next best\naction at any point in the customer journey.\n\n**•** Eliminates data silos by unifying all customer data,\nincluding basic information, transactional data,\nonline behavior/purchase history, etc., to create\ncomplete customer profiles\n\n**•** Integrated data security ensures that security\nmeasures are incorporated at every layer of the\nDatabricks Lakehouse Platform\n\n**•** Delta improves data quality, providing a single\nsource of truth for real-time streams and ensuring\nreliable and high-quality data for data teams\n\n**•** Integrated ML and AI capabilities utilize AI to\ncreate self-optimizing ML models that determine\nthe next best step for each customer\n\n**•** MLflow model lifecycle management helps manage\nthe entire machine learning lifecycle reliably,\nsecurely and at scale\n\n\n**Business outcomes and benefits**\n\n**•** Use AI, ML, automation and real-time data to\ngain deeper customer insights and understand\ntheir needs\n\n**•** Improve competitive positioning\n\n**•** Enhance the customer experience\n\n**C U S T O M E R C A S E S T U D Y**\n\n**160-year-old U.S.**\n**insurance company**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + 
"2024-09-19T16:57:21Z" + ], + [ + "4c4fe41bf1c9f3e6411a258785e8e375", + "**•** Integrated data security ensures that security\nmeasures are incorporated at every layer of the\nDatabricks Lakehouse Platform\n\n**•** Delta improves data quality, providing a single\nsource of truth for real-time streams and ensuring\nreliable and high-quality data for data teams\n\n**•** Integrated ML and AI capabilities utilize AI to\ncreate self-optimizing ML models that determine\nthe next best step for each customer\n\n**•** MLflow model lifecycle management helps manage\nthe entire machine learning lifecycle reliably,\nsecurely and at scale\n\n\n**Business outcomes and benefits**\n\n**•** Use AI, ML, automation and real-time data to\ngain deeper customer insights and understand\ntheir needs\n\n**•** Improve competitive positioning\n\n**•** Enhance the customer experience\n\n**C U S T O M E R C A S E S T U D Y**\n\n**160-year-old U.S.**\n**insurance company**\n\nThis insurance provider underwent a significant\ndigital transformation to provide a more\npersonalized financial services experience to\nits 10,000 advisors and millions of customers\nacross various touchpoints. Recognizing the\nimportance of becoming data-driven, the\ncompany leveraged Databricks in its client\n360 platform to aggregate transactional and\nbehavioral data, along with core attributes,\nproviding business users with next-best-action\nrecommendations for seamless customer\nengagement.\n\n\n-----\n\n## Global Regulatory Impact in Insurance\n\n\n**Navigating global regulations**\n**with technical implementation**\n\nDigital innovation continues to reshape the insurance sector. The pace and scale\nof technological change are likely to increase due to factors such as artificial\nintelligence (AI), cloud computing, and the entry of new players like insurtechs,\ne-tailers, and manufacturers from outside the insurance industry.\n\nTo succeed and thrive in today’s economic environment, insurers should prioritize\nupgrading their infrastructure and technology, rather than solely focusing on\ntransforming operations. For example, migrating from on-premises systems to the\ncloud can bring significant benefits, according to global consultancy [Deloitte](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf) [.](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf)\n\nAs insurers upgrade their compliance processes to meet new global regulations,\nsuch as IFRS 17 and LDTI, the impact of regulatory updates becomes more\ncomplex for organizations operating across multiple jurisdictions. Instead of\nmerely responding to regulatory and industry requirements, insurance companies\nshould make data-focused investments that help them anticipate and meet the\nexpectations of distributors and policyholders.\n\n\n**IFRS-17**\n\nIFRS 17 is an International Finance Reporting Standard (IFRS) for\ninsurance contracts. IFRS 17 aims to standardize insurance accounting\nby providing consistent principles for all facets of accounting for\ninsurance contracts. IFRS 17 removes existing inconsistencies so\nanalysts, investors and others can more easily compare companies,\ncontracts and industries.\n\n**LDTI for long-duration contracts**\n\nThe Financial Accounting Standards Board long-duration targeted\nimprovements (LDTI) introduced changes to the U.S. 
GAAP accounting\nmodel to simplify and improve the financial reporting of long-duration\ncontracts, including providing financial statement users with more\ntimely and relevant information about those contracts.\n\n\nIt is crucial for insurers to redirect their focus toward developing advanced data\nmanagement and utilization capabilities that offer better insights and improved\nperformance. These investments serve as not only a foundation for regulatory\ncompliance but also a starting point for more comprehensive and proactive\ntransformation initiatives.\n\n\n-----\n\n**I N D U S T R Y S O L U T I O N S**\n\n## Get Started With Accelerators, Brickbuilders and Enablers\n\nInsurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n\n**Adoption challenges**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "99907c1053249536af54c1cc15272bd1", + "**LDTI for long-duration contracts**\n\nThe Financial Accounting Standards Board long-duration targeted\nimprovements (LDTI) introduced changes to the U.S. GAAP accounting\nmodel to simplify and improve the financial reporting of long-duration\ncontracts, including providing financial statement users with more\ntimely and relevant information about those contracts.\n\n\nIt is crucial for insurers to redirect their focus toward developing advanced data\nmanagement and utilization capabilities that offer better insights and improved\nperformance. These investments serve as not only a foundation for regulatory\ncompliance but also a starting point for more comprehensive and proactive\ntransformation initiatives.\n\n\n-----\n\n**I N D U S T R Y S O L U T I O N S**\n\n## Get Started With Accelerators, Brickbuilders and Enablers\n\nInsurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n\n**Adoption challenges**\n\n\nNumerous challenges hinder organizations from developing and implementing the\nnecessary technical solutions to enhance their operational effectiveness, increase\nrevenue, and stay competitive. These challenges include:\n\n**•** Lack of technical skills (data scientists/data engineers): Companies often\nstruggle to find employees proficient in Python or Scala, or individuals who\npossess extensive experience in data science.\n\n\n\n**•** Business problems require in-depth data science and industry knowledge:\nBusinesses seek solutions tailored to address specific problems, rather than\ngeneric technical features.\n\n**•** Companies seek actionable insights: Organizations prefer readily applicable\npatterns that can be quickly implemented, rather than custom data science\nsolutions that come with potential costs and risks of implementation failure.\n\n\n**What are accelerators/enablers?**\n\n\n**Solution Accelerators**\n\nSave hours on discovery, design, development and\ntesting with Databricks Solution Accelerators. Our\npurpose-built guides, including fully functional\nnotebooks and best practices, expedite results for\nyour most common and high-impact use cases. 
With\nthese accelerators, you can go from idea to proof of\nconcept (PoC) in as little as two weeks.\n\n\n**Brickbuilders**\n\nBrickbuilder Solutions are data and AI solutions\ndesigned by leading consulting companies to\naddress industry-specific business requirements.\nBuilt on the Databricks Lakehouse Platform and\nbacked by the industry experience of these\nconsultancies, businesses can have confidence\nin solutions tailored to their specific use cases.\nBrickbuilder Solutions can be implemented at any\nstage of the customer journey.\n\n\n**Solution Enablers**\n\nSolution enablers consist of targeted collections\nof notebooks and materials, such as webinars and\nblog posts, designed to support larger solutions.\nThey aim to solve pain points or address specific\nlayers of business capabilities, such as resolving data\ningestion challenges.\n\n\n-----\n\n## Get Started With Industry Solutions\n\n\n**Claims transformation:**\n**automation and fraud prevention**\n\nInsurers are entering a new era of claims transformation, supported by evolving\ntechnological advancements and growing data availability. The end-to-end claims\nprocess, from extracting relevant information from documentation submitted\nwhen filing a claim to triaging and routing claims and the underwriting process,\nis ripe for digital transformation. By leveraging the Databricks Lakehouse,\norganizations can handle millions of data points coming in different formats and\ntime frames, from various sources, at an unprecedented volume. Every touchpoint\nin the claims journey, starting even before an incident occurs, will be supported by\na combination of technology and human intervention that seamlessly expedites\nthe process. Personalizing the claims experience by anticipating needs, providing\nreal-time status alerts, and reducing friction in the process increases customer\nloyalty and retention.\n\n\n**Customer/Partner Successes**\n\n**Accelerate underwriting through collaboration and efficient ML**\n\nA leading P&C insurer took full advantage of the MongoDB and Databricks\nintegration, leveraging both platforms to foster collaboration between their data\nand developer teams. The integration provides a more natural development\nexperience for Spark users and exposes all of Spark’s libraries. This allows\nMongoDB data to be materialized as DataFrames and data sets for analysis\nusing machine learning, graph, streaming and SQL APIs. The insurer also benefits\nfrom automatic schema inference. With this integration, the insurer was able to\ntrain and observe their ML models (MongoDB Atlas Charts) more efficiently and\nincorporate them into business applications.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "4e253736b9449112ad11cee7566da0ca", + "**Customer/Partner Successes**\n\n**Accelerate underwriting through collaboration and efficient ML**\n\nA leading P&C insurer took full advantage of the MongoDB and Databricks\nintegration, leveraging both platforms to foster collaboration between their data\nand developer teams. The integration provides a more natural development\nexperience for Spark users and exposes all of Spark’s libraries. This allows\nMongoDB data to be materialized as DataFrames and data sets for analysis\nusing machine learning, graph, streaming and SQL APIs. The insurer also benefits\nfrom automatic schema inference. 
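As a rough sketch of the integration described above, in which MongoDB collections are materialized as Spark DataFrames with automatic schema inference, the snippet below uses the MongoDB Spark connector; the connection URI, database and collection names are placeholders, and exact option names can vary by connector version:

```python
from pyspark.sql import SparkSession

# Placeholder connection details; supply your own Atlas URI, database and collection.
MONGO_URI = "mongodb+srv://<user>:<password>@<cluster>/"

spark = (
    SparkSession.builder.appName("mongo-underwriting-demo")
    .config("spark.mongodb.read.connection.uri", MONGO_URI)
    .getOrCreate()
)

# The connector infers the DataFrame schema from the collection.
policies_df = (
    spark.read.format("mongodb")
    .option("database", "insurance")
    .option("collection", "policies")
    .load()
)

policies_df.printSchema()
```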
With this integration, the insurer was able to\ntrain and observe their ML models (MongoDB Atlas Charts) more efficiently and\nincorporate them into business applications.\n\nAs a result, crucial underwriting processes that previously took days are now executed\nin seconds. In addition to the time and cost savings, the company can provide a more\nimmediate response to customers within its digital experience platform.\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[F R A U D D E T E C T I O N](https://notebooks.databricks.com/notebooks/FSI/geospatial_analysis/index.html#geospatial_analysis_1-0.html)**\n\n**Claims processing is the process whereby an insurer receives,**\n\n\n**verifies and processes a claim report submitted by a policyholder.**\n\n**It accounts for** **[70% of a property insurer’s expenses](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)** **and is a**\n\n**criticial component of customer satisfaction with their carrier.”**\n\n\n**[C L A I M S A U T O M AT I O N E N A B L E R](https://www.databricks.com/blog/2023/02/01/design-patterns-batch-processing-financial-services.html)**\n\n\n[Laying the](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n[Foundation for](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n[Claims Automation](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n\n\n**[C A R C L A I M S I M A G E C L A S S I F I C AT I O N](https://github.com/databricks-industry-solutions/car-classification)**\n\n\n**Deloitte,** [”Preserving the human touch in insurance claims transformations”](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)\n\n**[S M A R T C L A I M S : C L A I M S A U T O M AT I O N](https://www.databricks.com/blog/2023/04/03/claims-automation-databricks-lakehouse.html)**\n\n\n-----\n\n**Risk management:**\n**dynamic pricing and underwriting**\n\nModernized approaches at insurance carriers require a full digital transformation,\nand one aspect of this transformation involves dynamic pricing and underwriting\nto reduce premiums. Insurance providers are now consuming data from the largest\nmobile telematics providers to obtain the most granular sensor and trip summaries\nfor users of online insurance applications. Not only is this data critical for pricing,\nbut it is also critical for underwriting scenarios to de-risk carriers. Dynamic pricing\nand underwriting automate routine tasks and provide teams with alternative\ndata sources to empower actuarial and underwriting professionals to become\n“exponential.” This allows teams to focus on key aspects of risk selection and\nanalysis that drive competitive advantage and market differentiation. By leveraging\npersonalized data points, insurers can deliver near real-time underwriting\ndecisions for life insurance applicants, reducing policy abandonment and costs.\n\n\n**Customer/Partner Successes**\n\n**Automated extraction of medical risk factors for life insurance underwriting**\n**(John Snow Labs)**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "ccdf45dc81f51db3b4e319cdd68d324f", + "-----\n\n**Risk management:**\n**dynamic pricing and underwriting**\n\nModernized approaches at insurance carriers require a full digital transformation,\nand one aspect of this transformation involves dynamic pricing and underwriting\nto reduce premiums. 
Insurance providers are now consuming data from the largest\nmobile telematics providers to obtain the most granular sensor and trip summaries\nfor users of online insurance applications. Not only is this data critical for pricing,\nbut it is also critical for underwriting scenarios to de-risk carriers. Dynamic pricing\nand underwriting automate routine tasks and provide teams with alternative\ndata sources to empower actuarial and underwriting professionals to become\n“exponential.” This allows teams to focus on key aspects of risk selection and\nanalysis that drive competitive advantage and market differentiation. By leveraging\npersonalized data points, insurers can deliver near real-time underwriting\ndecisions for life insurance applicants, reducing policy abandonment and costs.\n\n\n**Customer/Partner Successes**\n\n**Automated extraction of medical risk factors for life insurance underwriting**\n**(John Snow Labs)**\n\nLife insurance underwriting considers an applicant’s medical risk factors in\naddition to mortality risk characteristics. These risk factors are often found\nin free-text documents. New insurance-specific natural language processing\n(NLP) models can automatically extract relevant medical history and risk factors\nfrom such documents. Forward-thinking companies are embracing accelerated\nunderwriting, which utilizes new data along with algorithmic tools and modeling\ntechniques to quickly assess and group applicants without requiring bodily fluids,\nphysician’s notes, and so on. This joint Solution Accelerator from Databricks and\nJohn Snow Labs simplifies the implementation of this approach, creating a faster,\nmore consistent, and scalable underwriting experience.\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[R I S K M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/market-risk)**\n\n**Risk is highly influenced by behavior, and 80% of morbidity in**\n\n\n**healthcare risk is driven by factors such as smoking, drinking**\n\n**alcohol, physical activity and diet. In the case of driving,**\n\n**60% of fatal accidents are a result of behavior alone. 
If insurers**\n\n**can change customer behaviors and help them make better**\n\n**choices, then the risk curve shifts.”**\n\n\n**[A C T U A R I A L W O R K B E N C H](https://github.com/koernigo/databricksActuarialWorkbench)**\n\n**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "2bc1a24e9f2f35f29d6f23452045b7f7", + "**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n\n\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n\n\n**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n\n\n-----\n\n**Product distribution:**\n**segmentation and personalization**\n\nThe most forward-thinking and data-driven insurers are\nfocused on achieving personalization at scale. They are\nexploring new partnerships and business models to create\nintegrated, value-added experiences that prioritize the\noverall health and financial wellness of their customers,\nrather than just their insurance needs. These insurers\nare investing in new data sources, analytics platforms,\nand artificial intelligence (AI)-powered decision engines\nthat enable them to connect producers with like-minded\ncustomers or engage customers with enticing offers\nand actionable steps based on their previous choices.\nThe outcome is more efficient and effective service\nfrom producers, trusted and convenient interactions for\nconsumers, and increased customer engagement and\ngrowth for insurers in an increasingly digital-oriented world.\n\n\n**Customer/Partner Successes**\n\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\ninsurance companies. 
It enables them to complete, unify and comprehensively capture customer profiles\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n\nWith Persona 360, you can:\n\n**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n1,695+ attributes and segments", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "e86d61fb5ece85469f5408d595d3ab26", + "[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n\nWith Persona 360, you can:\n\n**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n1,695+ attributes and segments\n\n**•** Seamlessly connect the workflows of data scientists (via Databricks) and marketing specialists (via\nPersona 360), making it easy for data experts to incorporate their findings and enabling nontechnical\nusers to comprehend and activate the data\n\n**•** Leverage tools that can increase engagement by 37% and conversion rates by 45% through\npersonalized campaigns\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[N E X T B E S T O F F E R](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n**Demand for hyper-personalized and real-time risk protection**\n\n\n**requires broad adoption of artificial** **intelligence (AI), machine**\n\n**learning and digital platforms.**\n\n**EY,** [”Nine customer types defining the next wave of insurance”](https://www.ey.com/en_us/insurance/nine-customer-types-defining-the-next-wave-of-insurance)\n\n\n**[C U S T O M E R L I F E T I M E VA L U E (C LT V )](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n\n**[C U S T O M E R S E G M E N TAT I O N](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\n\n[The Impact of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[Analytics and AI](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[on the Future of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[Insurance](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n\n\n**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n\n**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n\n\n-----\n\n**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n**by insurance 
provider type**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "d598d42d52ff0953c4525c8a65fd365b", + "**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n\n**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n\n\n-----\n\n**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n**by insurance provider type**\n\n\n\n\n\n\n\n\n\n|Product distribution Personalization Given the volume of data required, the complexity of operating AI from experiments (POCs) to enterprise scale data pipelines, combined with strict data and privacy regulations on the use of customer data on cloud infrastructure, the Lakehouse has quickly emerged as the strategic platform to accelerate digital transformation.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Next best offer Customers have different needs at each stage of the buyer journey. Choose the right recommender model for your scenario to find the next best action at any given point in the customer journey.|||||\n|Customer Analyzing customer lifetime value is critical to improving marketing decision-making, campaign ROI and lifetime value customer retention. Learn how to identify your most valuable customers with Databricks’ Customer Lifetime Value Solution Accelerator.|||||\n|Churn prediction Earning loyalty and getting the largest number of customers to stick around is something that is in your best interest as well as your customers’ best interest. Develop an understanding of how a customer lifetime should progress and examine where in that lifetime journey customers are likely to churn so you can effectively manage retention and reduce your churn rate.|||||\n|Customer Personalization is touted as the gold standard of customer engagement. Using sales data, campaigns segmentation and promotions systems, this solution helps you create advanced customer segments to drive better purchasing predictions based on behaviors.|||||\n|Reputation Harness the Databricks Lakehouse Platform to build a risk engine that can analyze customer feedback management securely and in realtime to power an early assessment of reputation risks.|||||\n\n\n-----\n\n|Anomaly detection and fraudulent claims Anomaly Anomaly detection is the technique of identifying rare events or observations which can raise suspicions detection by being statistically different from the rest of the observations.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Fraudulent A large-scale fraud prevention system is usually a complex ecosystem made of various controls (all with claims critical SLAs), a mix of traditional rules and AI and a patchwork of technologies between proprietary on- premises systems and open source cloud technologies.|||||\n\n\n\n\n\n\n|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. 
Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Underwriting Machine learning provides a decision support system for underwriting processes to help you improve your automation underwriting outcomes.|||||\n|Actuarial You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine workbench Learning (ML) for underwriting, claims forecasting, etc.|||||\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "26d91674b02626bd8b4b6427126750b0", + "|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Underwriting Machine learning provides a decision support system for underwriting processes to help you improve your automation underwriting outcomes.|||||\n|Actuarial You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine workbench Learning (ML) for underwriting, claims forecasting, etc.|||||\n\n\n-----\n\n|Claims transformation Anomaly detection Preempt fraud with rule-based patterns and select ML algorithms for reliable fraud detection. Use and claims fraud anomaly detection and fraud prediction to respond to bad actors rapidly.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Car claims image By applying transfer learning on pre-trained neural networks, Databricks helps insurance companies classification kickstart their AI/computer vision journeys toward claim assessment and damage estimation.|||||\n|Claims automation Insurers are entering a new era of claims transformation, supported by evolving technological advancement and growing data availability. You can simplify and scale your claims lifecycle with data and AI.|||||\n|Medical claims Using advanced natural language processing, you can extract text from medical records and enable automation.|||||\n|Guidewire claims Data ingestion enabler for distributed ledger technology that has predefined schemas and mapping to/ center data from Guidewire data format. integration|||||\n\n\n-----\n\n## Conclusion\n\nToday, data and AI are at the center of every innovation in the insurance industry. Databricks Lakehouse for\nInsurance empowers insurance providers to leverage the potential of data and analytics to address strategic\nchallenges, make informed decisions, mitigate risks, enhance customer experiences, and accelerate innovation.\n\n**Customers that innovate with Databricks Lakehouse for Insurance**\n\nSome of the top property and casualty, life and health insurance companies and reinsurers in the world turn\nto Databricks Lakehouse to harness the power of data and analytics to solve strategic challenges and make\nsmarter decisions that minimize risk, deliver superior customer experiences and fast-track innovation.\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. 
More than 9,000 organizations worldwide — including\n\nComcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is headquartered in San Francisco,\n\nwith offices around the globe. Founded by the original creators of Apache Spark ™ , Delta\n\nLake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest\n\nproblems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , LinkedIn and [Facebook](https://www.facebook.com/databricksinc) .\n\n#### Begin your journey with a free trial of Databricks Lakehouse for Insurance and start developing advanced data and AI applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n\n###### Contact us for a personalized demo at:\n dbricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "77fa3ca534959648d7a8e5eebca4d12e", + "**eBook**\n\n# Making Your Digital Twin Come to Life\n\n##### With the Lakehouse for Manufacturing and Tredence\n\n\n-----\n\n### Contents\n\nIntroduction ................................................................................................................................................................................................................ **03**\n\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\n\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\n\nDigital Twin Architectures .................................................................................................................................................................................. **08**\n\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "10392cc0d1b6c4e31a30c959626d4c63", + "How to Build a Digital Twin ................................................................................................................................................................................ **09**\n\nWhy Is Manufacturing Struggling With Data and AI? ............................................................................................................................ **12**\n\nWhy Databricks for Digital Twins? ................................................................................................................................................................... **13**\n\nWhy Tredence for Digital Twins? ...................................................................................................................................................................... **14**\n\nUsing Digital Twins to Drive Insights .............................................................................................................................................................. 
**15**\n\n\n-----\n\n### Introduction", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "ed0b74c51c64e6fd2c535c1bd5dafb1a", + "Using Digital Twins to Drive Insights .............................................................................................................................................................. **15**\n\n\n-----\n\n### Introduction\n\n\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\ncost-effective and are now an imperative in today’s data-driven businesses.\n\nToday’s manufacturing industries are expected to streamline and optimize all the processes in their value\nchain from product development and design, through operations and supply chain optimization to obtaining\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "33042520bb456fb0730d8ed53528a953", + "-----\n\n### Introduction\n\n\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\ncost-effective and are now an imperative in today’s data-driven businesses.\n\nToday’s manufacturing industries are expected to streamline and optimize all the processes in their value\nchain from product development and design, through operations and supply chain optimization to obtaining\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\n\n\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[“profit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[approximately 10 hours.”](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "32450e347d08b2ca314b2a9bc96b9a6e", + "**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 10%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 50%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 25%\n\n\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n-----\n\n**Introduction (continued)**\n\n\n**Digital twin market growth rate accelerates**\n\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\nat a CAGR of 58%, riding on the wave of Industry 4.0.\n\n\n**But challenges remain**\n\nThe most common challenges faced by the manufacturing industry that digital\ntwins are addressing include:\n\n**•** Product designs are more complex, resulting in higher cost and increasingly\nlonger development times\n\n**•** The supply chain is opaque\n\n**•** Production lines are not optimized – performance variations, unknown defects\nand the projection of operating cost is obscure\n\n**•** Poor quality management – overreliance on theory, managed by\nindividual departments\n\n**•** Reactive maintenance costs are too high, resulting in excessive downtime or\nprocess disruptions\n\n**•** Incongruous collaborations between departments\n\n**•** Invisibility of customer demand for gathering real-time feedback\n\n\nThe growth rate for digital twins is staggering with common adoption reported\nto be in the 25-40% CAGR growth rate.\n\n\n-----\n\n### Digital Twins Bring Broad Benefits to Manufacturing", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "929aec8a6e41f875b04a8fd58c7e9553", + "**But challenges remain**\n\nThe most common challenges faced by the manufacturing industry that digital\ntwins are addressing include:\n\n**•** Product designs are more complex, resulting in higher cost and increasingly\nlonger development times\n\n**•** The supply chain is opaque\n\n**•** Production lines are not optimized – performance variations, unknown defects\nand the projection of operating cost is obscure\n\n**•** Poor quality management – overreliance on theory, managed by\nindividual departments\n\n**•** Reactive maintenance costs are too high, resulting in excessive downtime or\nprocess disruptions\n\n**•** Incongruous collaborations between departments\n\n**•** Invisibility of customer demand for gathering real-time feedback\n\n\nThe growth rate for digital twins is staggering with common adoption reported\nto be in the 25-40% CAGR growth rate.\n\n\n-----\n\n### Digital Twins Bring Broad Benefits to Manufacturing\n\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\nwould have come at significant costs without digital twin technology.\n\n**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**\n\n\n\n**•** Product design and development is performed with\nless cost and is completed in less time as iterative\nsimulations, using multiple constraints, deliver the\nbest or most optimized design. 
All commercial\naircraft are designed using digital twins.\n\n**•** Digital twins provide the awareness of how long\ninventory will last, when to replenish and how to\nminimize the supply chain disruptions. The oil and gas\nindustry, for example, uses supply chain–oriented\ndigital twins to reduce supply chain bottlenecks in\nstorage and midstream delivery, schedule tanker\noff-loads and model demand with externalities.\n\n\n\n**•** Continuous quality checks on produced items\nwith ML/AI generated feedback pre-emptively\nassuring improved product quality. Final paint\ninspection in the automotive industry, for example,\nis performed with computer vision built on top of\ndigital twin technology.\n\n**•** Striking the sweet spot between when to replace\na part before the process degrades or breaks\ndown and utilizing the components to their fullest,\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\nbuilding an asset performance management suite.\n\n\n\n**•** Digital twins create the opportunity to have\nmultiple departments in sync by providing\nnecessary instructions modularly to attain\na required throughput. Digital twins are the\nbackbone of kaizen events that optimize\nmanufacturing process flow.\n\n**•** Customer feedback loops can be modeled through\ninputs, from point of sale customer behavior,\nbuying preferences, or product performance and\nthen integrated into the product development\nprocess, forming a closed loop providing an\nimproved product design.\n\n\n-----\n\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\n\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\ndeployment, but typically offer higher and longer-lasting value.\n\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\n\n\nImprove product quality\n\nReduce manufacturing costs\n\nReduce unplanned downtime\n\nIncrease throughput\n\nEnsure safe manufacturing\n\nTest new design ideas\n\nDevelop product enhancements\n\nDigital transformation of enterprise\n\nSpeed new product introduction\n\nReduce planned downtime\n\nMeet new regulatory challenges\n\nTraining for new manufacturing processes\n\nDesign changes to production line\n\nProvide service to end users customers\n\nUpdate products in the field\n\n\n**34%**\n\n\n**30%**\n\n**28%**\n**25%**\n\n**24%**\n\n\n**16%**\n\n**14%**\n\n**13%**\n\n**13%**\n\n**11%**\n**10%**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "c743afeca2a4f67e2f6fcc8b2a07bc10", + "Improve product quality\n\nReduce manufacturing costs\n\nReduce unplanned downtime\n\nIncrease throughput\n\nEnsure safe manufacturing\n\nTest new design ideas\n\nDevelop product enhancements\n\nDigital transformation of enterprise\n\nSpeed new product introduction\n\nReduce planned downtime\n\nMeet new regulatory challenges\n\nTraining for new manufacturing processes\n\nDesign changes to production line\n\nProvide service to end users customers\n\nUpdate products in the field\n\n\n**34%**\n\n\n**30%**\n\n**28%**\n**25%**\n\n**24%**\n\n\n**16%**\n\n**14%**\n\n**13%**\n\n**13%**\n\n**11%**\n**10%**\n\n**8%**\n**8%**\n\n\nCan you imagine the cost to change\nan oil refinery’s crude 
distillation\nunit process conditions to improve\nthe output of diesel one week\nand gasoline the next to address\nchanges in demand and ensure\nmaximum economic value? Can you\nimagine how to replicate an even\nsimple supply chain to model risk?\n\n\n**5%**\n\n\n**1%**\n\n\n-----\n\n### What Are Digital Twins?\n\n\nKnowing the business challenges and benefits digital twins deliver, let’s turn to\nthe basics and explore what digital twins are and how a modern data stack is\nnecessary to build effective and timely digital twins. The classic definition of\ndigital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .”\n\n\nFor a discrete or continuous manufacturing process, a digital twin gathers system\nand processes state data with the help of various IoT sensors [operational\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\nvirtual model which is then used to run simulations, study performance issues and\ngenerate possible insights.\n\n\n**Types of Digital Twins**\n\n\n-----\n\n### Digital Twin Architectures\n\nClassic digital twins have been physics-based models of specific systems. More recently,\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\n\n\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\nthe industrial environment.\n\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\n\n**Data-Driven Operational Digital Twins: Maturity Journey**\n\n**AI**\n\nSimulate & Optimize\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n\n# 6-8 18-24\n## years to months\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "2028d1c3a99d0f0ed0da57cd872f75fa", + "# 6-8 18-24\n## years to months\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n\n\n**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[using a digital twin, 
which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n\n# 20% to 25%\n\n\n**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[using a digital twin, which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n\n\nIdentify next best action and\nintegrate with actuation systems\n\n\n**IoT**\n\n**Edge/**\n**Cloud**\n\n\n**Digital Twins**\n\n**ERP**\n\n\nPredict & Diagnose\n\n|Col1|I i|\n|---|---|\n\n\n\nPredictive maintenance, process\nimprovements and Root Causing\n\n\nMonitor & Alert\n\n|Col1|P i|\n|---|---|\n\n\nReal-time operations monitoring\nand alerting\n\n\n-----\n\n### How to Build a Digital Twin\n\n\nA data architecture capability is needed to capture\nand collect the ever-expanding volume and variety\nof data streaming in real time from example\nprotocols, such as ABB Total Flow, Allen Bradley,\nEmerson, Fanuc, GE, Hitachi and Mitsubishi.\n\n\nData collection, data analytics, application\nenablement and data integration orchestrate the\ntime-series data stream and transfer to the cloud.\nAzure IoT Hub is used to securely ingest data from\nedge to cloud.\n\n\nCloud infrastructure and analytics capabilities are\noffered within the flexibility of the cloud. Azure\nDigital Twin is used to model and visualize process\nworkflows. Databricks MLflow and Delta Lake scale to\ndeliver real-time predictive analytics.\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Digital Twins: Technical Architecture**\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Building a digital twin doesn’t have to be a daunting task. Below are some simplistic steps:**\n\n\n**System and use case discovery**\n**and blueprinting**\n\n**•** Identify priority plant processes and systems\nto model, with focused use cases (e.g., asset\nmaintenance, energy management, process\nmonitoring/optimization, etc.)\n\n**•** Develop a validated process outline, blueprint and\nkey performance indicators", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "8b1bc3a24399aaf12f8b7d348990a68f", + "Cloud infrastructure and analytics capabilities are\noffered within the flexibility of the cloud. Azure\nDigital Twin is used to model and visualize process\nworkflows. Databricks MLflow and Delta Lake scale to\ndeliver real-time predictive analytics.\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Digital Twins: Technical Architecture**\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Building a digital twin doesn’t have to be a daunting task. 
Below are some simplistic steps:**\n\n\n**System and use case discovery**\n**and blueprinting**\n\n**•** Identify priority plant processes and systems\nto model, with focused use cases (e.g., asset\nmaintenance, energy management, process\nmonitoring/optimization, etc.)\n\n**•** Develop a validated process outline, blueprint and\nkey performance indicators\n\n**•** Develop a set of process variables, control\nvariables and manipulated variables\n\n**•** Design control loop\n\n**•** Validate and document process and asset FMEA\nfor all assets and sub-systems\n\n\n**Technology infrastructure requirements**\n\n**•** Technical edge infrastructure onsite — to sense,\ncollect and transmit real-time information\n\n**•** Clean, reliable data availability in the cloud\n\n**•** Data processing and analytics platform — to\ndesign, develop and implement solutions\n\n**•** Stream processing and deployment of models for\npredictions and soft sensing\n\n\n**Visualization delivered**\n\n**•** Information communication — visual\nrepresentation of digital twin along with remote\ncontrolling functions (e.g., Power BI dashboards,\ntime series insights, web app-based digital\ntwin portals)\n\n**•** Closed-loop feedback — to send the insights and\nactions back to form a closed loop — Azure – Event\nGrid and Event Hub with connection from IoT Hub to\nAzure IoT edge devices and control systems is used\n\n\n\n**•** Edge platform to orchestrate the data, insights and\nactions between the cloud and site IT systems\n\n**•** Cloud to edge integration — to enable seamless\nmonitoring, alerting and integration with plant\nOT/IT systems\n\n\n-----\n\n### Why Is Manufacturing Struggling With Data and AI?\n\n**Challenge** **Root Cause** **Goal**\n\n\nAggregate high volumes and velocities of\n\nstructured and unstructured data to power\n\npredictive analytics (e.g., images, IoT, ERP/SCM)\n\nData architectures that scale for TBs /PBs of\n\nenterprise IT and OT data\n\n\nSiloed data from systems designed\n**Siloed data across the value chain**\n\nfor on-premises 30 years ago\n\n\nSiloed data from systems designed\n**Siloed data across the value chain**\n\n\nLegacy architectures such as data\n\nhistorians that can’t handle semi-structured\n\nor unstructured data\n\n\n**Unable to scale enterprise data sets**\n\n\nAddress manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\ngranular supply chain issues in the real world\n\n\nAddress manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\n\n**Can’t meet intellectual property**\n\n\n**Can’t meet intellectual property** Data lineage established across organizational\n\nSystems that do not establish data lineage\n**requirements** silos and disjointed workflows\n\n\nsilos and disjointed workflows\n\n\n### Data architecture is the root cause of this struggle.\n\n\n-----\n\n### Why Databricks for Digital Twins?\n\n\nLakehouse for Manufacturing’s simple, open and collaborative data platform consolidates and enhances data\nfrom across the organization and turns it into accessible, actionable insights. 
Scalable machine learning powers\ndigital twins with predictive insights across the value chain from product development to optimizing operations\nto building agile supply chains to robust customer insights.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "106e6e5a55cb20e8948c9f9d7f848eab", + "Address manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\ngranular supply chain issues in the real world\n\n\nAddress manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\n\n**Can’t meet intellectual property**\n\n\n**Can’t meet intellectual property** Data lineage established across organizational\n\nSystems that do not establish data lineage\n**requirements** silos and disjointed workflows\n\n\nsilos and disjointed workflows\n\n\n### Data architecture is the root cause of this struggle.\n\n\n-----\n\n### Why Databricks for Digital Twins?\n\n\nLakehouse for Manufacturing’s simple, open and collaborative data platform consolidates and enhances data\nfrom across the organization and turns it into accessible, actionable insights. Scalable machine learning powers\ndigital twins with predictive insights across the value chain from product development to optimizing operations\nto building agile supply chains to robust customer insights.\n\n\nDatabricks open Lakehouse\n\nPlatform has shown time and\n\nagain that it is the foundational\n\nenabling technology to power\n\ndigital twins for manufacturing. But\n\nthe real power is the Databricks\n\npartnership with Tredence that\n\nspeeds implementation for\n\ntailored use cases that deliver\n\nsuperior ROI in less time.”\n\n**Dr. Bala Amavasai** ,\n\nManufacturing CTO, Databricks\n\n\n**Supports Real-Time**\n**Decisions**\n\nLakehouse for Manufacturing\nleverages any enterprise data\nsource — from business critical\nERP data to edge sensor data in\none integrated platform, making it\neasy to automate and secure data\nwith fast, real-time performance.\n\n\n**Faster and More**\n**Accurate Analysis**\n\nThe true benefits of digital twins\nare not the business intelligence\ndashboards, but machine\nlearning insights generated\nfrom incorporating real-time\ndata. 
Scalable and shareable\nnotebook-based machine learning\naccelerates ROI.\n\n\n**Open Data Sharing**\n**and Collaboration**\n\nDrive stronger customer insights\nand greater service with partners\nleveraging open and secure\ndata collaboration between\ndepartments or your supply chain\ndelivering faster ROI.\n\n\n-----\n\n### Why Tredence for Digital Twins?\n\n\nOver the last few years, Tredence’s unique Manufacturing and Supply Chain practice has coupled functional\nexpertise with cutting-edge AI-driven solutions to create measurable business impact for their customers.\nNow, Tredence’s partnership with Databricks is all set to unlock the power of real-time analytics and actions, to\nfurther strengthen their ‘’last mile impact’’ vision.\n\n\nTredence is excited to\n\nco-innovate with Databricks to\n\ndeliver the solutions required for\n\nenterprises to create digital twins\n\nfrom the ground up and implement\n\nthem swiftly to maximize their ROI.\n\nOur partnership enables clients to\n\nget the most out of Tredence’s data\n\nscience capabilities to build decision\n\nintelligence around manufacturing\n\nprocesses and Databricks’\n\nLakehouse Platform to realize the full\n\npromise of digital twins.”\n\n**Naresh Agarwal** ,\n\nHead of Industrials, Tredence\n\n\n**Global Reach**\n\nTredence offers a global team with\nthe subject matter expertise that\ndelivers practitioner and useroriented solutions to identify\nand solve for challenges in\ndigital transformation design\nand implementation.\n\n\n**Purpose-Built Solutions**\n\nAdopt contextual edge to cloud,\npurpose-built AIoT solutions\nthat unify your ecosystems with\nconnected insights and enhance\nproductivity, while enabling\nefficient cost structures.\n\n\n**Focused Dedication**\n\nA dedicated centre of excellence\n(CoE) for AIoT and smart\nmanufacturing solutions —\nserving the entire manufacturing\nvalue chain from product\ndevelopment to manufacturing and\ndownstream operations.\n\n\n-----\n\n### Using Digital Twins to Drive Insights\n\n\n**Use Case**\n\n**Predictive Maintenance**\n\n- \u0007Rolls-Royce sought to use real-time\nengine data to reduce unplanned\nmaintenance and downtime", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "d56f561c8a16d9a60b6e5861216c425b", + "Lakehouse Platform to realize the full\n\npromise of digital twins.”\n\n**Naresh Agarwal** ,\n\nHead of Industrials, Tredence\n\n\n**Global Reach**\n\nTredence offers a global team with\nthe subject matter expertise that\ndelivers practitioner and useroriented solutions to identify\nand solve for challenges in\ndigital transformation design\nand implementation.\n\n\n**Purpose-Built Solutions**\n\nAdopt contextual edge to cloud,\npurpose-built AIoT solutions\nthat unify your ecosystems with\nconnected insights and enhance\nproductivity, while enabling\nefficient cost structures.\n\n\n**Focused Dedication**\n\nA dedicated centre of excellence\n(CoE) for AIoT and smart\nmanufacturing solutions —\nserving the entire manufacturing\nvalue chain from product\ndevelopment to manufacturing and\ndownstream operations.\n\n\n-----\n\n### Using Digital Twins to Drive Insights\n\n\n**Use Case**\n\n**Predictive Maintenance**\n\n- \u0007Rolls-Royce sought to use real-time\nengine data to reduce unplanned\nmaintenance and downtime\n\n- \u0007Legacy systems were unable to\nscale data ingestion of engine\nsensor data in real time for ML\n\n**Impact**\n\n\n**Why 
Databricks?**\n\n- \u0007The Lakehouse Platform on Azure unifies in-flight data\nstreams with external environmental conditions data to\npredict engine performance issues\n\n- \u0007Delta Lake underpins ETL pipelines that feed ML workloads\nacross use cases\n\n- \u0007MLflow speeds deployment of new models and reduces\nincidents of grounded planes\n\n\nRolls-Royce uses Databricks\nto drive insights around predictive\nmaintenance, improving\nairframe reliability and reducing\ncarbon emissions.\n\n\n#### 22 million tons\nof carbon emissions saved\n\n\n#### 5% reduction\nin unplanned airplane groundings\n\n\n#### Millions of pounds\nin inventory cost savings from a 50%\nimprovement in maintenance efficiency\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé\n\nNast, Acosta and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data,\n\nanalytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the\n\noriginal creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n###### Get started with a free trial of Databricks and start building data applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n\nTo learn more, visit us at:\n\n**[databricks.com/manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "4da384183bd03d8a10274cfeaf813719", + "### eBook\n\n# A New Approach to Data Sharing\n\n#### Open data sharing and collaboration for data, analytics, and AI\n\n### Second Edition\n\n\n-----\n\n## Contents Introduction — Data Sharing in Today’s Digital Economy 4\n\n**Chapter 1: What Is Data Sharing and Why Is It Important?** **5**\n\nCommon data sharing use cases 6\n\nData monetization 6\n\nData sharing with partners or suppliers (B2B) 6\n\nInternal lines of business (LOBs) sharing 6\n\nKey benefits of data sharing 7\n\n**Chapter 2: Conventional Methods of Data Sharing and Their Challenges** **8**\n\nLegacy and homegrown solutions 9\n\nProprietary vendor solutions 11\n\nCloud object storage 13\n\n**Chapter 3: Delta Sharing — An Open Standard for Secure Sharing of Data Assets** **14**\n\nWhat is Delta Sharing? 14\n\nKey benefits of Delta Sharing 16\n\nMaximizing value of data with Delta Sharing 18\n\nData monetization with Delta Sharing 19\n\nB2B sharing with Delta Sharing 21\n\nInternal data sharing with Delta Sharing 23\n\n**Chapter 4: How Delta Sharing Works** **26**\n\n\n-----\n\n**Chapter 5: Introducing Databricks Marketplace** **28**\n## Contents\n\nWhat is Databricks Marketplace? 30\n\nKey benefits of Databricks Marketplace 30\n\nEnable collaboration and accelerate innovation 32\n\nPowered by a fast, growing ecosystem 32\n\nUse cases for an open marketplace 32\n\nNew upcoming feature: AI model sharing 33\n\n**Chapter 6: Share securely with Databricks Clean Rooms** **34**\n\nWhat is a data clean room? 
34\n\nCommon data clean room use cases 36\n\nShortcomings of existing data clean rooms 38\n\nKey benefits of Databricks Clean Rooms 39\n\n**Resources: Getting started with Data Sharing and Collaboration** **40**\n\n**About the Authors** **42**\n\n\n-----\n\n## Introduction\n Data Sharing in Today’s Digital Economy\n\n\nToday’s economy revolves around data. Everyday, more and more\n\norganizations must exchange data with their customers, suppliers\n\nand partners. Security is critical. And yet, efficiency and immediate\n\naccessibility are equally important.\n\nWhere data sharing may have been considered optional, it’s now\n\nrequired. More organizations are investing in streamlining internal\n\nand external data sharing across the value chain. But they still face\n\nmajor roadblocks — from human inhibition to legacy solutions to\n\nvendor lock-in.\n\nTo be truly data-driven, organizations need a better way to share\n\ndata. [Gartner predicts that by 2024](https://www.gartner.com/en/documents/3999501) , organizations that promote\n\ndata sharing will outperform their peers on most business value\n\n\nwho have successfully executed data sharing initiatives are 1.7x\n\nmore effective in showing business value and return on investment\n\nfrom their data analytics strategy.\n\nTo compete in the digital economy, organizations need an open —\n\nand secure — approach to data sharing.\n\nThis eBook takes a deep dive into the modern era of data sharing\n\nand collaboration, from common use cases and key benefits to\n\nconventional approaches and the challenges of those methods.\n\nYou’ll get an overview of our open approach to data sharing and find\n\nout how Databricks allows you to share your data across platforms,\n\nto share all your data and AI, and to share all your data securely with\n\nunified governance in a privacy-safe way.\n\n\nmetrics. In addition, Gartner recently found that Chief Data Officers\n\n\n-----\n\n## Chapter 1\n What Is Data Sharing and Why Is It Important?\n\nData sharing is the ability to make the same data available to one or many stakeholders — both external", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "6e916da2e05c4e43549d9fcbf3e506d5", + "who have successfully executed data sharing initiatives are 1.7x\n\nmore effective in showing business value and return on investment\n\nfrom their data analytics strategy.\n\nTo compete in the digital economy, organizations need an open —\n\nand secure — approach to data sharing.\n\nThis eBook takes a deep dive into the modern era of data sharing\n\nand collaboration, from common use cases and key benefits to\n\nconventional approaches and the challenges of those methods.\n\nYou’ll get an overview of our open approach to data sharing and find\n\nout how Databricks allows you to share your data across platforms,\n\nto share all your data and AI, and to share all your data securely with\n\nunified governance in a privacy-safe way.\n\n\nmetrics. In addition, Gartner recently found that Chief Data Officers\n\n\n-----\n\n## Chapter 1\n What Is Data Sharing and Why Is It Important?\n\nData sharing is the ability to make the same data available to one or many stakeholders — both external\n\nand internal. 
Nowadays, the ever-growing amount of data has become a strategic asset for any company.\n\nData sharing — within your organization or externally — is an enabling technology for data commercialization\n\nand enhanced analysis. Sharing data as well as consuming data from external sources allows companies\n\nto collaborate with partners, establish new partnerships and generate new revenue streams with data\n\nmonetization. Data sharing can deliver benefits to business groups across the enterprise. For those business\n\ngroups, data sharing can enable access to data needed to make critical decisions. This includes but is not\n\nlimited to roles such as the data analyst, data scientist and data engineer.\n\n\n-----\n\n#### Common data sharing use cases\n\n\n#### Data\n monetization\n\nCompanies across industries are commercializing\n\ndata. Large multinational organizations have\n\nformed exclusively to monetize data, while other\n\norganizations are looking for ways to monetize\n\ntheir data and generate additional revenue\n\nstreams. Examples of these companies can\n\nrange from an agency with an identity graph to a\n\ntelecommunication company with proprietary 5G\n\ndata or to retailers that have a unique ability to\n\ncombine online and offline data. Data vendors are\n\ngrowing in importance as companies realize they\n\nneed external data for better decision-making.\n\n\n#### Data sharing with partners\n or suppliers (B2B)\n\nMany companies now strive to share data with\n\npartners and suppliers as similarly as they share\n\nit across their own organizations. For example,\n\nretailers and their suppliers continue to work more\n\nclosely together as they seek to keep their products\n\nmoving in an era of ever-changing consumer tastes.\n\nRetailers can keep suppliers posted by sharing sales\n\ndata by SKU in real time, while suppliers can share\n\nreal-time inventory data with retailers so they know\n\nwhat to expect. Scientific research organizations\n\ncan make their data available to pharmaceutical\n\ncompanies engaged in drug discovery. Public safety\n\nagencies can provide real-time public data feeds\n\nof environmental data, such as climate change\n\nstatistics or updates on potential volcanic eruptions.\n\n\n#### Internal lines of business\n (LOBs) sharing\n\nWithin any company, different departments, lines\n\nof business and subsidiaries seek to share data so\n\neveryone can make decisions based on a complete\n\nview of the current business reality. For example,\n\nfinance and HR departments need to share data\n\nas they analyze the true costs of each employee.\n\nMarketing and sales teams need a common view\n\nof data to determine the effectiveness of recent\n\nmarketing campaigns. And different subsidiaries\n\nof the same company need a unified view of the\n\nhealth of the business. 
Removing data silos — which\n\nare often established for the important purpose of\n\npreventing unauthorized access to data — is critical\n\nfor digital transformation initiatives and maximizing\n\nthe business value of data.\n\n\n-----\n\n#### Key benefits of data sharing", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "8539bb76a5be7cad0989786bdef40c4b", + "of environmental data, such as climate change\n\nstatistics or updates on potential volcanic eruptions.\n\n\n#### Internal lines of business\n (LOBs) sharing\n\nWithin any company, different departments, lines\n\nof business and subsidiaries seek to share data so\n\neveryone can make decisions based on a complete\n\nview of the current business reality. For example,\n\nfinance and HR departments need to share data\n\nas they analyze the true costs of each employee.\n\nMarketing and sales teams need a common view\n\nof data to determine the effectiveness of recent\n\nmarketing campaigns. And different subsidiaries\n\nof the same company need a unified view of the\n\nhealth of the business. Removing data silos — which\n\nare often established for the important purpose of\n\npreventing unauthorized access to data — is critical\n\nfor digital transformation initiatives and maximizing\n\nthe business value of data.\n\n\n-----\n\n#### Key benefits of data sharing\n\nAs you can see from the use cases described above, there are many benefits of data sharing, including:\n\n\n**Greater collaboration with existing partners.** In today’s hyper-\n\nconnected digital economy, no single organization can advance its\n\nbusiness objectives without partnerships. Data sharing helps solidify\n\nexisting partnerships and can help organizations establish new ones.\n\n\u0007 **Ability to generate new revenue streams.** With data sharing,\n\norganizations can generate new revenue streams by offering data\n\nproducts or data services to their end consumers.\n\n\n**Ease of producing new products, services or business models.**\n\nProduct teams can leverage both first-party data and third-party\n\ndata to refine their products and services and expand their product/\n\nservice catalog.\n\n**Greater efficiency of internal operations.** Teams across the\n\norganization can meet their business goals far more quickly when\n\nthey don’t have to spend time figuring out how to free data from\n\nsilos. When teams have access to live data, there’s no lag time\n\nbetween the need for data and the connection with the appropriate\n\ndata source.\n\n\n-----\n\n## Chapter 2\n Conventional Methods of Data Sharing and Their Challenges\n\nSharing data across different platforms, companies and clouds is no easy task. In the past,\n\norganizations have hesitated to share data more freely because of the perceived lack\n\nof secure technology, competitive concerns and the cost of implementing data sharing\n\nsolutions.\n\nEven for companies that have the budget to implement data sharing technology, many of\n\nthe current approaches can’t keep up with today’s requirements for open-format, multi-\n\ncloud, high-performance solutions. 
Most data sharing solutions are tied to a single vendor,\n\nwhich creates friction for data providers and data consumers who use non-compatible\n\nplatforms.\n\nOver the past 30 years, data sharing solutions have come in three forms: legacy and\n\nhomegrown solutions, cloud object storage and closed source commercial solutions.\n\nEach of these approaches comes with its pros and cons.\n\n\n-----\n\n#### Legacy and homegrown solutions\n\nMany companies have built homegrown data sharing solutions based on legacy\n\ntechnologies such as email, (S)FTP or APIs.\n\n\nProvider\n\nETL\n\n\nConsumer\n\n\nBatch data\nfrom provider\n\n\nTable �\n\nTable 2\n\n\nFTP/SSH/API\nServer\n\n\nFTP/SSH/API ETL Database Analyst Run Analysis\nServer\n\n\n**Figure 1:**\nLegacy data\nsharing solutions\n\n\n**Pros**\n\n\u0007 **Vendor agnostic.** FTP, email and APIs are all well-documented protocols. Data\n\nconsumers can leverage a suite of clients to access data provided to them.\n\n\u0007 **Flexibility.** Many homegrown solutions are built on open source technologies\n\nand will work both on-prem and on clouds.\n\n\n-----\n\n**Cons**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "731baf9c1075c89f113cda106e49039e", + "-----\n\n#### Legacy and homegrown solutions\n\nMany companies have built homegrown data sharing solutions based on legacy\n\ntechnologies such as email, (S)FTP or APIs.\n\n\nProvider\n\nETL\n\n\nConsumer\n\n\nBatch data\nfrom provider\n\n\nTable �\n\nTable 2\n\n\nFTP/SSH/API\nServer\n\n\nFTP/SSH/API ETL Database Analyst Run Analysis\nServer\n\n\n**Figure 1:**\nLegacy data\nsharing solutions\n\n\n**Pros**\n\n\u0007 **Vendor agnostic.** FTP, email and APIs are all well-documented protocols. Data\n\nconsumers can leverage a suite of clients to access data provided to them.\n\n\u0007 **Flexibility.** Many homegrown solutions are built on open source technologies\n\nand will work both on-prem and on clouds.\n\n\n-----\n\n**Cons**\n\n\u0007 **Data movement.** It takes significant effort to extract data from cloud storage, transform\n\nit and host it on an FTP server for different recipients. Additionally, this approach\n\nresults in creating copies of data sets. Data copying causes duplication and prevents\n\norganizations from instantly accessing live data.\n\n\u0007 **Complexity of sharing data.** Homegrown solutions are typically built on complex\n\narchitectures due to replication and provisioning. This can add considerable time to\n\ndata sharing activities and result in out-of-date data for end consumers.\n\n\u0007 **Operational overhead for data recipients.** Data recipients have to extract, transform\n\nand load (ETL) the shared data for their end use cases, which further delays the time to\n\ninsights. 
For any new data updates from the providers, the consumers have to rerun ETL\n\npipelines again and again.\n\n\u0007 **Security and governance.** As modern data requirements become more stringent,\n\nhomegrown and legacy technologies have become more difficult to secure and govern.\n\n\u0007 **Scalability.** Such solutions are costly to manage and maintain and don’t scale to\n\naccommodate large data sets.\n\n\n-----\n\n#### Proprietary vendor solutions\n\nCommercial data sharing solutions are a popular option among companies that don’t want\n\nto devote the time and resources to building an in-house solution yet also want more\n\ncontrol than what cloud object storage can offer.\n\n\nVendor 1 Platform\n\nProprietary\ndata format\n\n\nVendor V Platform\n\nProprietary\ndata format\n\n\nData Provider 1\n\nData;\nProvider\n\n\nData Provider 1\n\n\nData;\nConsumer\n\nShared data set\n\n\nData;\nProvider\n\nShared dataset\n\n\nData;\nConsumer\n\n\nNo cross-platform\nsharing\n\n\n**Figure 2:**\nProprietary\nvendor solutions\n\n\nShared dataset\n\nShared data set\n\n\nShared data set\n\n\nShared data set\n\n\nSharing limited to recipients\non the same platform\n\nData;\nConsumer\n\n\nData;\nConsumere\n\n\n**Pros**\n\n\u0007 **Simplicity.** Commercial solutions allow users to share data easily with anyone else who uses\n\nthe same platform.\n\n\n-----\n\n**Cons**\n\n\u0007 **Vendor lock-in.** Commercial solutions don’t interop with other platforms well. While\n\ndata sharing is easy among fellow customers, it’s usually impossible with those who\n\nuse competing solutions. This reduces the reach of data, resulting in vendor lock-in.\n\nFurthermore, platform differences between data providers and recipients introduce\n\ndata sharing complexities.\n\n\u0007 **Data movement.** Data must be loaded onto the platform, requiring additional ETL and\n\ndata copies.\n\n\u0007 **Scalability.** Commercial data sharing comes with scaling limits from the vendors.\n\n\u0007 **Cost.** All the above challenges create additional cost for sharing data with potential\n\nconsumers, as data providers have to replicate data for different recipients on different\n\ncloud platforms.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "01af964dec80339a4c762a6a74d2f97b", + "**Pros**\n\n\u0007 **Simplicity.** Commercial solutions allow users to share data easily with anyone else who uses\n\nthe same platform.\n\n\n-----\n\n**Cons**\n\n\u0007 **Vendor lock-in.** Commercial solutions don’t interop with other platforms well. While\n\ndata sharing is easy among fellow customers, it’s usually impossible with those who\n\nuse competing solutions. 
This reduces the reach of data, resulting in vendor lock-in.\n\nFurthermore, platform differences between data providers and recipients introduce\n\ndata sharing complexities.\n\n\u0007 **Data movement.** Data must be loaded onto the platform, requiring additional ETL and\n\ndata copies.\n\n\u0007 **Scalability.** Commercial data sharing comes with scaling limits from the vendors.\n\n\u0007 **Cost.** All the above challenges create additional cost for sharing data with potential\n\nconsumers, as data providers have to replicate data for different recipients on different\n\ncloud platforms.\n\n\n-----\n\n#### Cloud object storage\n\n\n**Cons**\n\n\u0007 **Limited to a single cloud provider.** Recipients have to be on the\n\nsame cloud to access the objects.\n\n\u0007 **Cumbersome security and governance.** Assigning permissions\n\nand managing access is complex. Custom application logic is\n\nneeded to generate signed URLs.\n\n\u0007 **Complexity.** Personas managing data sharing (DBAs, analysts)\n\nfind it difficult to understand Identity Access Management\n\n(IAM) policies and how data is mapped to underlying files. For\n\ncompanies with large volumes of data, sharing via cloud storage\n\nis time-consuming, cumbersome and nearly impossible to scale.\n\n\u0007 **Operational overhead for data recipients.** The data recipients\n\nhave to run extract, transform and load (ETL) pipelines on the\n\nraw files before consuming them for their end use cases.\n\nThe lack of a comprehensive solution makes it challenging for data\n\nproviders and consumers to easily share data. Cumbersome and\n\nincomplete data sharing processes also constrain the development\n\nof business opportunities from shared data.\n\n\nObject storage is considered a good fit for the cloud because it is\n\nelastic and can more easily scale into multiple petabytes to support\n\nunlimited data growth. The big three cloud providers all offer object\n\nstorage services (AWS S3, Azure Blob, Google Cloud Storage) that\n\nare cheap, scalable and extremely reliable.\n\nAn interesting feature of cloud object storage is the ability to\n\ngenerate signed URLs, which grant time-limited permission to\n\ndownload objects. Anyone who receives the presigned URL can\n\nthen access the specified objects, making this a convenient\n\nway to share data.\n\n**Pros**\n\n\u0007 **Sharing data in place.** Object storage can be shared in place,\n\nallowing consumers to access the latest available data.\n\n\u0007 **Scalability.** Cloud object storage profits from availability and\n\ndurability guarantees that typically cannot be achieved\n\non-premises. Data consumers retrieve data directly from the\n\ncloud providers, saving bandwidth for the providers.\n\n\n-----\n\n## Chapter 3\n Delta Sharing — An Open Standard for Secure Sharing of Data Assets\n\n\nWe believe the future of data sharing should be characterized by\n\nopen technology. Data sharing shouldn’t be tied to a proprietary\n\ntechnology that introduces unnecessary limitations and financial\n\nburdens to the process. It should be readily available to anyone who\n\nwants to share data at scale. This philosophy inspired us to develop\n\nand release a new protocol for sharing data: Delta Sharing.\n\n#### What is Delta Sharing?\n\nDelta Sharing provides an open solution to securely share live data\n\nfrom your lakehouse to any computing platform. Recipients don’t\n\n\nData providers can centrally manage, govern, audit and track\n\nusage of the shared data on one platform. 
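To make the signed-URL mechanism described in the cloud object storage section above concrete, the sketch below shows how a provider might mint a time-limited download link with the AWS SDK for Python (boto3). It is illustrative only; the bucket and object names are hypothetical, and Azure Blob Storage and Google Cloud Storage offer equivalent APIs.

```python
import boto3

# Hypothetical bucket and object names, for illustration only.
BUCKET = "shared-datasets"
KEY = "gold/sales/part-00000.parquet"

s3 = boto3.client("s3")

# Generate a read-only link that expires after one hour. Anyone holding
# the URL can download this single object until it expires, which is the
# sharing mechanism (and the governance burden) described above.
url = s3.generate_presigned_url(
    "get_object",
    Params={"Bucket": BUCKET, "Key": KEY},
    ExpiresIn=3600,
)
print(url)
```

Every recipient-specific link like this has to be generated, distributed and rotated by custom application logic, which is exactly the operational overhead called out in the cons above.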
Delta Sharing is natively", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "8d31bd1738c89d49b2778f7bacf93c94", + "durability guarantees that typically cannot be achieved\n\non-premises. Data consumers retrieve data directly from the\n\ncloud providers, saving bandwidth for the providers.\n\n\n-----\n\n## Chapter 3\n Delta Sharing — An Open Standard for Secure Sharing of Data Assets\n\n\nWe believe the future of data sharing should be characterized by\n\nopen technology. Data sharing shouldn’t be tied to a proprietary\n\ntechnology that introduces unnecessary limitations and financial\n\nburdens to the process. It should be readily available to anyone who\n\nwants to share data at scale. This philosophy inspired us to develop\n\nand release a new protocol for sharing data: Delta Sharing.\n\n#### What is Delta Sharing?\n\nDelta Sharing provides an open solution to securely share live data\n\nfrom your lakehouse to any computing platform. Recipients don’t\n\n\nData providers can centrally manage, govern, audit and track\n\nusage of the shared data on one platform. Delta Sharing is natively\n\nintegrated with [Unity Catalog](https://databricks.com/product/unity-catalog) , enabling organizations to centrally\n\nmanage and audit shared data across organizations and confidently\n\nshare data assets while meeting security and compliance needs.\n\nWith Delta Sharing, organizations can easily share existing large-\n\nscale data sets based on the open source formats Apache Parquet\n\nand Delta Lake without moving data. Teams gain the flexibility to\n\nquery, visualize, transform, ingest or enrich shared data with their\n\ntools of choice.\n\n\nhave to be on the Databricks platform or on the same cloud or a\n\ncloud at all. Data providers can share live data without replicating\n\nit or moving it to another system. Recipients benefit from always\n\nhaving access to the latest version of data and can quickly query\n\nshared data using tools of their choice for BI, analytics and machine\n\nlearning, reducing time-to-value.\n\n\n-----\n\n**Figure 3:**\nDelta Sharing: a data provider’s Delta Lake table is served through a Delta Sharing Server with access permissions; over the Delta Sharing Protocol, data recipients using any tool, for any use case (analytics, BI, data science), on any cloud or on-premises, read the data with no replication, easy management and security.\n\n\nDatabricks designed Delta Sharing with five goals in mind:\n\n\u0007Provide an open cross-platform sharing solution\n\n\u0007Share live data without copying it to another system\n\n\u0007Support a wide range of clients such as Power BI, Tableau, Apache Spark™, pandas and Java, and\n\nprovide flexibility to consume data using the tools of choice for BI, machine learning and AI use cases\n\n\u0007Provide strong security, auditing and governance\n\n\u0007Scale to massive structured data sets and also allow sharing of unstructured data and future data\n\nderivatives such as ML models, dashboards and notebooks, in addition to tabular data\n\n\n-----\n\n#### Key benefits of Delta Sharing\n\nBy eliminating the obstacles and shortcomings associated with typical data sharing\n\napproaches, Delta Sharing delivers several key benefits, including:\n\n\n**Open cross-platform sharing.** Delta Sharing establishes a new\n\nopen standard for secure data sharing and supports open source\n\nDelta and Apache Parquet formats. 
Data recipients don’t have to be\n\non the Databricks platform or on the same cloud, as Delta Sharing\n\nworks across clouds and even from cloud to on-premises setups. To\n\ngive customers even greater flexibility, Databricks has also released\n\nopen source connectors for pandas, Apache Spark, Elixir and\n\nPython, and is working with partners on many more.\n\n\u0007 **Securely share live data without replication.** Most enterprise\n\n\n**Centralized governance.** With Databricks Delta Sharing, data\n\nproviders can grant, track, audit and even revoke access to shared\n\ndata sets from a single point of enforcement to meet compliance and", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "976ae92ee9091e152cd7f09d1f089fd2", + "#### Key benefits of Delta Sharing\n\nBy eliminating the obstacles and shortcomings associated with typical data sharing\n\napproaches, Delta Sharing delivers several key benefits, including:\n\n\n**Open cross-platform sharing.** Delta Sharing establishes a new\n\nopen standard for secure data sharing and supports open source\n\nDelta and Apache Parquet formats. Data recipients don’t have to be\n\non the Databricks platform or on the same cloud, as Delta Sharing\n\nworks across clouds and even from cloud to on-premises setups. To\n\ngive customers even greater flexibility, Databricks has also released\n\nopen source connectors for pandas, Apache Spark, Elixir and\n\nPython, and is working with partners on many more.\n\n\u0007 **Securely share live data without replication.** Most enterprise\n\n\n**Centralized governance.** With Databricks Delta Sharing, data\n\nproviders can grant, track, audit and even revoke access to shared\n\ndata sets from a single point of enforcement to meet compliance and\n\nother regulatory requirements. Databricks Delta Sharing users get:\n\n\u0007Implementation of Delta Sharing as part of Unity Catalog, the\n\ngovernance offering for Databricks Lakehouse\n\n\u0007Simple, more secure setup and management of shares\n\n\u0007The ability to create and manage recipients and data shares\n\n\u0007Audit logging captured automatically as part of Unity Catalog\n\n\u0007Direct integration with the rest of the Databricks ecosystem\n\n\u0007No separate compute for providing and managing shares\n\n\ndata today is stored in cloud data lakes. Any of these existing data\n\nsets on the provider’s data lake can easily be shared without any\n\ndata replication or physical movement of data. Data providers can\n\nupdate their data sets reliably in real time and provide a fresh and\n\nconsistent view of their data to recipients.\n\n\n-----\n\n**Share data products, including AI models, dashboards and**\n\n**notebooks, with greater flexibility.** Data providers can choose\n\nbetween sharing anentire table or sharing only a version or\n\nspecific partitions of a table. However, sharing just tabular data\n\nis not enough to meet today’s consumer demands. Delta Sharing\n\nalso supports sharing of non-tabular data and data derivatives\n\nsuch as data streams, AI models, SQL views and arbitrary files,\n\nenablingincreased collaboration and innovation. 
Data providers can\n\nbuild, package and distribute data products including data sets,\n\nAI and notebooks, allowingdata recipients to get insights faster.\n\nFurthermore, this approach promotes and empowers the exchange\n\nof knowledge — not just data — between different organizations.\n\n\n**Share data at a lower cost.** Delta Sharing lowers the cost of\n\nmanaging and consuming shares for both data providers and\n\nrecipients. Providers can share data from their cloud object store\n\nwithout replicating, thereby reducing the cost of storage. Incontrast,\n\nexisting data sharing platforms require data providers to first move\n\ntheir data into their platform or store data in proprietary formats in\n\ntheir managed storage, which often costs more and results in data\n\nduplication. With Delta Sharing, data providers don’t need to set\n\nup separate computing environments to share data. Consumers\n\ncan access shared data directly using their tools of choice without\n\nsetting up specific consumption ecosystems, thereby reducing\n\ncosts.\n\n\nWith Delta Sharing we are able to achieve a truly open marketplace\n\nand truly open ecosystem. In contrast, commercial products are\n\nmostly limited to sharing raw tabular data and cannot be used to\n\n\nshare these higher-valued data derivatives.\n\n\n\u0007 **Reduced time-to-value.** Delta Sharing eliminates the need to\n\nset up a new ingestion process to consume data. Data recipients\n\ncan directly access the fresh data and query it using tools of their\n\nchoice. Recipients can also enrich data with data sets from popular\n\ndata providers. The Delta Sharing ecosystem of open source and\n\ncommercial partners is growing every day.\n\n\n-----\n\n#### Maximizing value of data with Delta Sharing", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "21fed49f6ac8d9e5d38d22f0703f33a6", + "duplication. With Delta Sharing, data providers don’t need to set\n\nup separate computing environments to share data. Consumers\n\ncan access shared data directly using their tools of choice without\n\nsetting up specific consumption ecosystems, thereby reducing\n\ncosts.\n\n\nWith Delta Sharing we are able to achieve a truly open marketplace\n\nand truly open ecosystem. In contrast, commercial products are\n\nmostly limited to sharing raw tabular data and cannot be used to\n\n\nshare these higher-valued data derivatives.\n\n\n\u0007 **Reduced time-to-value.** Delta Sharing eliminates the need to\n\nset up a new ingestion process to consume data. Data recipients\n\ncan directly access the fresh data and query it using tools of their\n\nchoice. Recipients can also enrich data with data sets from popular\n\ndata providers. The Delta Sharing ecosystem of open source and\n\ncommercial partners is growing every day.\n\n\n-----\n\n#### Maximizing value of data with Delta Sharing\n\nDelta Sharing is already transforming data sharing activities for companies in a wide range of industries. Given the sheer\n\nvariety of data available and the technologies that are emerging, it is hard to anticipate all the possible use cases Delta\n\nSharing can address. 
The Delta Sharing approach is to share any data anytime with anyone easily and securely.\n\nIn this section we will explore the building blocks of such an approach and the use cases emerging from these.\n\n\n“Delta Sharing helped us streamline our data delivery process\n\nfor large data sets. This enables our clients to bring their own\n\ncompute environment to read fresh curated data with little-to-\n\nno integration work, and enables us to continue expanding our\n\ncatalog of unique, high-quality data products.”\n\n— **William Dague** , Head of Alternative Data, Nasdaq\n\n\n“We recognize that openness of data will play a key role in\n\nachieving Shell’s Carbon Net Zero ambitions. Delta Sharing\n\nprovides Shell with a standard, controlled and secure protocol\n\nfor sharing vast amounts of data easily with our partners to work\n\ntoward these goals without requiring our partners be on the same\n\ndata sharing platform.”\n\n— **Bryce Bartmann** , Chief Digital Technology Advisor, Shell\n\n\n“Leveraging the powerful capabilities of Delta Sharing from\n\n\nDatabricks enables Pumpjack Dataworks to have a faster\n\nonboarding experience, removing the need for exporting,\n\nimporting and remodeling of data, which brings immediate\n\nvalue to our clients. Faster results yield greater commercial\n\nopportunity for our clients and their partners.”\n\n\n“Data accessibility is a massive consideration for us. We believe\n\nthat Delta Sharing will simplify data pipelines by enabling us to\n\nquery fresh data from the place where it lives, and we are not\n\nlocked into any platform or data format.”\n\n— **Rayne Gaisford** , Global Head of Data Strategy, Jefferies\n\n\n— **Corey Zwart** , Head of Engineering, Pumpjack Dataworks\n\n“As a data company, giving our customers access to our data sets\n\nis critical. The Databricks Lakehouse Platform with Delta Sharing\n\nreally streamlines that process, allowing us to securely reach a\n\nmuch broader user base regardless of cloud or platform.”\n\n— **Felix Cheung** , VP of Engineering, SafeGraph\n\n\n-----\n\n#### Data monetization with Delta Sharing\n\nDelta Sharing enables companies to monetize their data product simply and with necessary governance.\n\nData /on.2-er $\n\n\nCloud Storage\n\n\nFulfllleen\n\nEntitles various data products\n\nData Vendor\n\nUnity\nCatalog\n\n\nUnity\nCatalog\n\nCloud Storage\n\nData /on.2-er �\n\nN o n - D ata b r i c k s C u s t o m e r\n\nO n a n y c lo u d o r o n - p r e m i s e s\n\nStorage\n\n\nR/O\n\nR/O", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "8c21a4f7b812c6f92740eeb06e59f417", + "is critical. 
The Databricks Lakehouse Platform with Delta Sharing\n\nreally streamlines that process, allowing us to securely reach a\n\nmuch broader user base regardless of cloud or platform.”\n\n— **Felix Cheung** , VP of Engineering, SafeGraph\n\n\n-----\n\n#### Data monetization with Delta Sharing\n\nDelta Sharing enables companies to monetize their data product simply and with necessary governance.\n\n**Figure 4:**\nData monetization with Delta Sharing: a data vendor, with its data governed by Unity Catalog over cloud storage, entitles various data products through Delta Sharing to multiple data consumers with read-only access, including non-Databricks customers on any cloud or on-premises, with billing and audit logging.\n\n\n-----\n\nWith Delta Sharing, a data provider can seamlessly share large data sets and overcome\n\nthe scalability issues associated with SFTP servers. Data providers can easily expand their\n\ndata product lines since Delta Sharing doesn’t require you to build a dedicated service\n\nfor each of your data products like API services would. The company simply grants and\n\nmanages access to the data recipients instead of replicating the data — thereby reducing\n\ncomplexity and latency. Any data that exits your ELT/ETL pipelines becomes a candidate\n\nfor a data product. Any data that exists on your platform can be securely shared with your\n\nconsumers. This grants a wider addressable market — your products have appeal to a\n\nbroader range of consumers, from those who say “we need access to your raw data only”\n\nto those who say “we want only small subsets of your Gold layer data.”\n\nTo mitigate cost concerns, Delta Sharing maintains an audit log that tracks any permitted\n\naccess to the data. Data providers can use this information to determine the costs\n\nassociated with any of the data products and evaluate if such products are commercially\n\nviable and sensible.\n\n\n-----\n\n#### B2B sharing with Delta Sharing\n\n**Figure 5:**\nB2B sharing with Delta Sharing: partners exchange read-only access to data in their cloud storage, governed by Unity Catalog, through Delta Sharing, including partners that are non-Databricks customers on any cloud or on-premises.\n\n\n-----\n\nDelta Sharing applies in the case of bidirectional exchange of data.\n\nCompanies use Delta Sharing to incorporate partners and suppliers\n\nseamlessly into their workflows. Traditionally, this is not an easy task.\n\nAn organization typically has no control over how their partners are\n\nimplementing their own data platforms. The complexity increases\n\nwhen we consider that the partners and suppliers can reside in\n\na public cloud, private cloud or an on-premises deployed data\n\nplatform. The choices of platform and architecture are not imposed\n\non your partners and suppliers. Due to its open protocol, Delta\n\nSharing addresses this requirement foundationally. Through a wide\n\narray of existing connectors (and many more being implemented),\n\nyour data can land anywhere your partners and suppliers need to\n\nconsume it.\n\n\nIn addition to the location of data consumer residency, the\n\ncomplexity of data arises as a consideration. The traditional\n\napproach to sharing data using APIs is inflexible and imposes\n\nadditional development cycles on both ends of the exchange in\n\norder to implement both the provider pipelines and consumer\n\npipelines. 
With Delta Sharing, this problem can be abstracted. Data", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "bdd311ea74ae748ff1891b3c527d43fe", + "An organization typically has no control over how their partners are\n\nimplementing their own data platforms. The complexity increases\n\nwhen we consider that the partners and suppliers can reside in\n\na public cloud, private cloud or an on-premises deployed data\n\nplatform. The choices of platform and architecture are not imposed\n\non your partners and suppliers. Due to its open protocol, Delta\n\nSharing addresses this requirement foundationally. Through a wide\n\narray of existing connectors (and many more being implemented),\n\nyour data can land anywhere your partners and suppliers need to\n\nconsume it.\n\n\nIn addition to the location of data consumer residency, the\n\ncomplexity of data arises as a consideration. The traditional\n\napproach to sharing data using APIs is inflexible and imposes\n\nadditional development cycles on both ends of the exchange in\n\norder to implement both the provider pipelines and consumer\n\npipelines. With Delta Sharing, this problem can be abstracted. Data\n\ncan be shared as soon as it lands in the Delta table and when the\n\nshares and grants are defined. There are no implementation costs\n\non the provider side. On the consumer side, data simply needs\n\nto be ingested and transformed into an expected schema for the\n\ndownstream processes.\n\nThis means that you can form much more agile data exchange\n\npatterns with your partners and suppliers and attain value from your\n\ncombined data much quicker than ever before.\n\n\n-----\n\n#### Internal data sharing with Delta Sharing\n\nInternal data sharing is becoming an increasingly important consideration for any modern\n\norganization, particularly where data describing the same concepts have been produced in\n\ndifferent ways and in different data silos across the organization. In this situation it is important\n\nto design systems and platforms that allow governed and intentional federation of data and\n\nprocesses, and at the same time allow easy and seamless integration of said data and processes.\n\nArchitectural design patterns such as Data Mesh have emerged to address these specific\n\nchallenges and considerations. Data Mesh architecture assumes a federated design and\n\ndissemination of ownership and responsibility to business units or divisions. This, in fact, has\n\nseveral advantages, chief among them that data is owned by the parts of the organization closest\n\nto the source of the data. Data residence is naturally enforced since data sits within the geo-\n\nlocality where it has been generated. Finally, data volumes and data variety are kept in control\n\ndue to the localization within a data domain (or data node). 
On the other hand, the architecture\n\npromotes exchange of data between different data domains when that data is needed to deliver\n\noutcomes and better insights.\n\n\n-----\n\nBusiness Unit 1 Business Unit ,\ni n R e g i o n A i n R e g i o n -\n\nCloud Storage\n\nUnity\nCatalog\n\nR/O R/O\n\n\nUnity\nCatalog\n\nCloud Storage\n\n\nDelta\nSharing\n\n\nBusiness Unit B\n\ni n R e g i o n A\n\n\nDelta\nSharing\n\nR/O R/O\n\n\nCloud Storage\n\nBusiness Unit �\nN o n - D ata b r i c k s C u s t o m e r\n\nO n a n y c lo u d o r o n - p r e m i s e s\n\nStorage\n\n\n**Figure 6:**\nBuilding a Data Mesh\nwith Delta Sharing\n\n\n-----\n\nUnity Catalog enables consolidated data access control across\n\ndifferent data domains within an organization using the Lakehouse\n\non Databricks. In addition, Unity Catalog adds a set of simple and\n\neasy-to-use declarative APIsto govern and control data exchange\n\npatterns between the data domains in the Data Mesh.\n\nTo make matters even more complicated, organizations can grow\n\nthrough mergers and acquisitions. In such cases we cannot assume\n\nthat organizations being acquired have followed the same set of\n\nrules and standards to define their platforms and produce their\n\ndata. Furthermore, we cannot even assume that they have used\n\nthe same cloud providers, nor can we assume the complexity of", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "82417c072537377ab26a850fbc316698", + "Cloud Storage\n\nBusiness Unit �\nN o n - D ata b r i c k s C u s t o m e r\n\nO n a n y c lo u d o r o n - p r e m i s e s\n\nStorage\n\n\n**Figure 6:**\nBuilding a Data Mesh\nwith Delta Sharing\n\n\n-----\n\nUnity Catalog enables consolidated data access control across\n\ndifferent data domains within an organization using the Lakehouse\n\non Databricks. In addition, Unity Catalog adds a set of simple and\n\neasy-to-use declarative APIsto govern and control data exchange\n\npatterns between the data domains in the Data Mesh.\n\nTo make matters even more complicated, organizations can grow\n\nthrough mergers and acquisitions. In such cases we cannot assume\n\nthat organizations being acquired have followed the same set of\n\nrules and standards to define their platforms and produce their\n\ndata. Furthermore, we cannot even assume that they have used\n\nthe same cloud providers, nor can we assume the complexity of\n\ntheir data models. Delta Sharing can simplify and accelerate the\n\n\nunification and assimilation of newly acquired organizations and\n\ntheir data and processes.. Individual organizations can be treated\n\nas new data domains in the overarching mesh. Only selected data\n\nsources can be exchanged between the different platforms. 
This\n\nenables teams to move freely between the organizations that are\n\nmerging without losing their data — if anything, they are empowered\n\nto drive insights of higher quality by combining the data of both.\n\nWith Unity Catalog and Delta Sharing, the Lakehouse architecture\n\nseamlessly combines with the Data Mesh architecture to deliver\n\nmore power than ever before, pushing the boundaries of what’s\n\npossible and simplifying activities that were deemed daunting not\n\nso long ago.\n\n\n-----\n\n## Chapter 4\n How Delta Sharing Works\n\n\nDelta Sharing is designed to be simple, scalable, nonproprietary\n\nand cost-effective for organizations that are serious about getting\n\nmore from their data. Delta Sharing is natively integrated with Unity\n\nCatalog, which enables customers to add fine-grained governance\n\nand security controls, making it easy and safe to share data\n\n\nDelta Sharing is a simple REST protocol that securely grants\n\ntemporary access to part of a cloud data set. It leverages modern\n\ncloud storage systems — such as AWS S3, Azure ADLS or Google’s\n\nGCS — to reliably grant read-only access to large data sets. Here’s\n\nhow it works for data providers and data recipients.\n\n\ninternally or externally.\n\n**Figure 7:**\nHow Delta Sharing works connecting data providers and data recipients: the data recipient’s Delta Sharing client requests a table, the data provider’s Delta Sharing Server enforces access permissions on the Delta Lake table (Parquet files) and returns pre-signed short-lived URLs, and the client gets temporary direct access to the files in Parquet format in the object store (AWS S3, GCP, ADLS).\n\n\n-----\n\n#### Data providers\n\nThe data provider shares existing tables or parts thereof (such as\n\nspecific table versions or partitions) stored on the cloud data lake\n\nin Delta Lake format. The provider decides what data they want to\n\nshare and runs a sharing server in front of it that implements the\n\nDelta Sharing protocol and manages recipient access. To manage\n\nshares and recipients, you can use SQL commands, the Unity\n\nCatalog CLI or the intuitive user interface.\n\n#### Data recipients\n\nThe data recipient only needs one of the many Delta Sharing clients\n\nthat support the protocol. Databricks has released open source\n\nconnectors for pandas, Apache Spark, Java and Python, and is\n\nworking with partners on many more.\n\n\n#### The data exchange\n\nThe Delta Sharing data exchange follows three efficient steps:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "84f8fc3ba756aa770968b11bce51feb6", + "-----\n\n#### Data providers\n\nThe data provider shares existing tables or parts thereof (such as\n\nspecific table versions or partitions) stored on the cloud data lake\n\nin Delta Lake format. The provider decides what data they want to\n\nshare and runs a sharing server in front of it that implements the\n\nDelta Sharing protocol and manages recipient access. To manage\n\nshares and recipients, you can use SQL commands, the Unity\n\nCatalog CLI or the intuitive user interface.\n\n#### Data recipients\n\nThe data recipient only needs one of the many Delta Sharing clients\n\nthat support the protocol. 
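As a minimal sketch of the two roles described in this chapter (the share, recipient, catalog and table names and the profile file path are all hypothetical), a provider on Databricks might manage a share with Delta Sharing SQL, issued here through spark.sql in a notebook where spark is predefined, while a recipient on any platform reads it with the open source delta-sharing Python connector:

```python
# Provider side (Databricks): create a share, add a Unity Catalog table to it,
# and grant a recipient read access. All object names here are hypothetical.
spark.sql("CREATE SHARE IF NOT EXISTS sales_share")
spark.sql("ALTER SHARE sales_share ADD TABLE main.sales.transactions")
spark.sql("CREATE RECIPIENT IF NOT EXISTS partner_co")
spark.sql("GRANT SELECT ON SHARE sales_share TO RECIPIENT partner_co")

# Recipient side (any platform): the open source `delta-sharing` connector only
# needs the profile file issued by the provider.
import delta_sharing

profile = "/path/to/partner_co.share"   # hypothetical credential (profile) file
client = delta_sharing.SharingClient(profile)
print(client.list_all_tables())         # discover the tables shared with us

# Read a shared table straight into pandas (load_as_spark works the same way).
table_url = f"{profile}#sales_share.sales.transactions"
df = delta_sharing.load_as_pandas(table_url)
```

The recipient never copies the underlying data into another system; load_as_pandas (or load_as_spark) reads the Parquet files directly through the short-lived URLs handed out by the sharing server, as described in the data exchange steps below.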
Databricks has released open source\n\nconnectors for pandas, Apache Spark, Java and Python, and is\n\nworking with partners on many more.\n\n\n#### The data exchange\n\nThe Delta Sharing data exchange follows three efficient steps:\n\n**1.** \u0007The recipient’s client authenticates to the sharing server and\n\nasks to query a specific table. The client can also provide filters\n\non the data (for example, “country=US”) as a hint to read just a\n\nsubset of the data.\n\n**2.** \u0007The server verifies whether the client is allowed to access the\n\ndata, logs the request, and then determines which data to send\n\nback. This will be a subset of the data objects in cloud storage\n\nsystems that make up the table.\n\n**3.** \u0007To allow temporary access to the data, the server generates\n\nshort-lived presigned URLs that allow the client to read Parquet\n\nfiles directly from the cloud provider so that the read-only\n\naccess can happen in parallel at massive bandwidth, without\n\nstreaming through the sharing server.\n\n\n-----\n\n## Chapter 5\n Introducing Databricks Marketplace\n\n\nEnterprises need open collaboration for data and AI. Data sharing\n\n— within an organization or externally — allows companies to\n\ncollaborate with partners, establish new partnerships and generate\n\nnew revenue streams with data monetization.\n\nThe demand for generative AI is driving disruption across industries,\n\nincreasing the urgency for technical teams to build generative AI\n\nmodels and Large Language Models (LLMs) on top of their own data\n\nto differentiate their offerings.\n\n\nTraditional data marketplaces are restricted and offer only data or\n\nsimple applications, therefore limiting their value to data consumers.\n\nThey also don’t offer tools to evaluate the data assets beyond basic\n\ndescriptions or examples. Finally, data delivery is limited, often\n\nrequiring ETL or a proprietary delivery mechanism.\n\nEnterprises need a better way to share data and AI that is flexible,\n\nsecure and unlocks business value. An ecosystem makes data\n\nsharing and collaboration powerful.\n\n\n**Today, data marketplaces present many challenges and collaboration can be complex for both data consumers and data providers.**\n\n**Data Consumers** **Data Providers**\n\n\nFocus on data only\nor simple applications\n\nLengthy discovery and\nevaluation\n\nDelayed time-to-insights\nwith vendor lock-in\n\n\nLimited opportunities to\n\nmonetize new types of assets\n\n\nLimited opportunities to\n\n\nDifficulty reaching\n\nmore users\n\n\nDifficulty reaching\n\n\nLack of secure technology\n\nand unified governance\n\n\nLack of secure technology\n\n\n-----\n\n#### Challenges in today's data marketplaces\n\n**Data Consumers** **Data Providers**\n\n\n\u0007 **Focus on data only or simple applications:** Accessing only\n\ndata sets means organizations looking to take advantage of\n\nAI/ML need to look elsewhere or start from scratch, causing\n\ndelays in driving business insights.\n\n\u0007 **Lengthy discovery and evaluation:** The tools most\n\nmarketplaces provide for data consumers to evaluate data\n\nare simply descriptions and example SQL statements. 
Minimal\n\n\n\u0007 **Limited opportunities to monetize new types of assets:**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3981674a529e30218b182bb885b310c5", + "Lengthy discovery and\nevaluation\n\nDelayed time-to-insights\nwith vendor lock-in\n\n\nLimited opportunities to\n\nmonetize new types of assets\n\n\nLimited opportunities to\n\n\nDifficulty reaching\n\nmore users\n\n\nDifficulty reaching\n\n\nLack of secure technology\n\nand unified governance\n\n\nLack of secure technology\n\n\n-----\n\n#### Challenges in today's data marketplaces\n\n**Data Consumers** **Data Providers**\n\n\n\u0007 **Focus on data only or simple applications:** Accessing only\n\ndata sets means organizations looking to take advantage of\n\nAI/ML need to look elsewhere or start from scratch, causing\n\ndelays in driving business insights.\n\n\u0007 **Lengthy discovery and evaluation:** The tools most\n\nmarketplaces provide for data consumers to evaluate data\n\nare simply descriptions and example SQL statements. Minimal\n\n\n\u0007 **Limited opportunities to monetize new types of assets:**\n\nA data-only approach means organizations are limited to\n\nmonetizing anything beyond a data set and will face more\n\nfriction to create new revenue opportunities with non-\n\ncompatible platforms.\n\n**Difficulty reaching more users:** Data providers must choose\n\nbetween forgoing potential business or incurring the expense\n\nof replicating data.\n\n\nevaluation tools mean it takes more time to figure out if a data\n\nproduct is right for you, which might include more time in\n\nback-and-forth messages with a provider or searching for a\n\nnew provider altogether.\n\n\n**Delayed time-to-insights with vendor lock-in:** Delivery\n\nthrough proprietary sharing technologies or FTP means either\n\nvendor lock-in or lengthy ETL processes to get the data where\n\nyou need to work with it.\n\n\n**Lack of secure technology and unified governance:** Without\n\nopen standards for sharing data securely across platforms\n\nand clouds, data providers must use multiple tools to secure\n\naccess to scattered data, leading to inconsistent governance.\n\n\n-----\n\n#### What is Databricks Marketplace?\n\n\napproach allows you to put your data to work more quickly in\n\nevery cloud with your tools of choice.\n\nMarketplace brings together a vast ecosystem of data\n\nconsumers and data providers to collaborate across a wide\n\narray of data sets without platform dependencies, complicated\n\nETL, expensive replication and vendor lock-in.\n\n\nDatabricks Marketplace is an open marketplace for all your data,\n\nanalytics and AI, powered by Delta Sharing.\n\nSince Marketplace is powered by Delta Sharing, you can benefit\n\nfrom open source flexibility and no vendor lock-in, enabling you\n\nto collaborate across all platforms, clouds and regions. 
This open\n\n\n#### Key Benefits of Databricks Marketplace\n\n**Consumers** **Providers**\n\n\nDatabricks\nMarketplace\nprovides key benefits\nfor both data\nconsumers and data\nproviders.\n\n\nDiscover more\n\nthan just data\n\n\nReach users\n\non any platform\n\n\nReach users\n\n\nEvaluate data\n\nproducts faster\n\nAvoid vendor lock-in\n\n\nMonetize more\n\nthan just data\n\n\nMonetize more\n\n\nShare data securely\n\n\n-----\n\n#### Databricks Marketplace drives innovation and expands revenue opportunities\n\n\n##### Data Consumers\n\n For data consumers, the Databricks Marketplace dramatically expands the opportunity to deliver innovation and advance analytics and AI initiatives.\n\n**Discover more than just data:** Access more than just data sets,\n\nincluding AI models, notebooks, applications and solutions.\n\n**Evaluate data products faster:** Pre-built notebooks and sample\n\ndata help you quickly evaluate and have much greater confidence\n\nthat a data product is right for your AI or analytics initiatives.\n\nObtain the fastest and simplest time to insight.\n\n**Avoid vendor lock-in:** Substantially reduce the time to deliver", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "fdc1e9ec7cda82d04d9014acb796d6e7", + "Reach users\n\non any platform\n\n\nReach users\n\n\nEvaluate data\n\nproducts faster\n\nAvoid vendor lock-in\n\n\nMonetize more\n\nthan just data\n\n\nMonetize more\n\n\nShare data securely\n\n\n-----\n\n#### Databricks Marketplace drives innovation and expands revenue opportunities\n\n\n##### Data Consumers\n\n For data consumers, the Databricks Marketplace dramatically expands the opportunity to deliver innovation and advance analytics and AI initiatives.\n\n**Discover more than just data:** Access more than just data sets,\n\nincluding AI models, notebooks, applications and solutions.\n\n**Evaluate data products faster:** Pre-built notebooks and sample\n\ndata help you quickly evaluate and have much greater confidence\n\nthat a data product is right for your AI or analytics initiatives.\n\nObtain the fastest and simplest time to insight.\n\n**Avoid vendor lock-in:** Substantially reduce the time to deliver\n\ninsights and avoid lock-in with open and seamless sharing and\n\ncollaboration across clouds, regions, or platforms. Directly\n\nintegrate with your tools of choice and right where you work.\n\n\n##### Data Providers\n\n For data providers, the Databricks Marketplace enables them the ability to reach new users and unlock new revenue opportunities.\n\n**Reach users on any platform:** Expand your reach across\n\nplatforms and access a massive ecosystem beyond walled\n\ngardens. Streamline delivery of simple data sharing to any cloud\n\nor region, without replication.\n\n**Monetize more than just data:** Monetize the broadest set of\n\ndata assets including data sets, notebooks, AI models to reach\n\nmore data consumers.\n\n**Share data securely:** Share all your data sets, notebooks, AI\n\nmodels, dashboards and more securely across clouds, regions\n\nand data platforms.\n\n\n-----\n\n#### Enable collaboration and accelerate innovation\n\n\n#### Powered by a fast, growing ecosystem\n\nEnterprises need open collaboration for data and AI. 
In the past few\n\nmonths, we've continued to increase partners across industries,\n\nincluding Retail, Communications and Media & Entertainment,\n\n\n\u0007 **Advertising and Retail**\n\nIncorporate shopper behavior analysis | Ads uplift/\n\nperformance | Demand forecasting | “Next best SKU”\n\nprediction | Inventory analysis | Live weather data\n\n\nFinancial Services, with 520+ listings you can explore in our open\n\n\n\u0007 **Finance**\n\nIncorporate data from stock exchange to predict\n\neconomic impact | Market research | Public census and\n\nhousing data to predict insurance sales\n\n\u0007 **Healthcare and Life Sciences**\n\nGenomic target identification | Patient risk scoring\n\nAccelerating drug discovery | Commercial effectiveness |\n\nClinical research\n\nFor more on Databricks Marketplace,\n\ngo to [marketplace.databricks.com](http://marketplace.databricks.com) , or refer to the\n\nResources section on page 41 .\n\n\nMarketplace from 80+ providers and counting.\n\n#### Use cases for an open marketplace\n\nOrganizations across all industries have many use cases for\n\nconsuming and sharing third-party data from the simple (dataset\n\njoins) to the more advanced (AI notebooks, applications and\n\ndashboards).\n\n\n-----\n\n#### New upcoming feature: AI model sharing\n\n\nNowadays, it may seem like every organization wants to become\n\nan AI organization. However, most organizations are new to AI.\n\nDatabricks has heard from customers that they want to discover\n\nout-of-the-box AI models on Marketplace to help them kickstart\n\ntheir AI innovation journey.\n\nTo meet this demand, Databricks will be adding AI model sharing\n\ncapabilities on Marketplace to provide users access to both OSS\n\nand proprietary AI (both first-and third-party) models. This will", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e3a2c1df39ef549700795ff328f19b66", + "Resources section on page 41 .\n\n\nMarketplace from 80+ providers and counting.\n\n#### Use cases for an open marketplace\n\nOrganizations across all industries have many use cases for\n\nconsuming and sharing third-party data from the simple (dataset\n\njoins) to the more advanced (AI notebooks, applications and\n\ndashboards).\n\n\n-----\n\n#### New upcoming feature: AI model sharing\n\n\nNowadays, it may seem like every organization wants to become\n\nan AI organization. However, most organizations are new to AI.\n\nDatabricks has heard from customers that they want to discover\n\nout-of-the-box AI models on Marketplace to help them kickstart\n\ntheir AI innovation journey.\n\nTo meet this demand, Databricks will be adding AI model sharing\n\ncapabilities on Marketplace to provide users access to both OSS\n\nand proprietary AI (both first-and third-party) models. This will\n\nenable data consumers and providers to discover and monetize AI\n\nmodels and integrate AI into their data solutions.\n\n\nUsing this feature, data consumers can evaluate AI models with\n\nrich previews, including visualizations and pre-built notebooks with\n\nsample data. With Databricks Marketplace, there are no difficult\n\ndata delivery mechanisms — you can get the AI models instantly\n\nwith the click of a button. All of this works out-of-the-box with the AI\n\ncapabilities of the Databricks Lakehouse Platform for both real-time\n\nand batch inference. For real-time inference, you can use model\n\nserving endpoints. 
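As a rough illustration of the real-time path (batch invocation from DBSQL or notebooks is covered next), the sketch below calls a Databricks Model Serving endpoint over REST; the workspace URL, endpoint name and request payload are placeholders, and the exact payload shape depends on the served model’s signature.

```python
import os
import requests

# Placeholder workspace URL and endpoint name; the token would normally come
# from a secret scope or an environment variable rather than being hardcoded.
WORKSPACE_URL = "https://<your-workspace>.cloud.databricks.com"
ENDPOINT_NAME = "marketplace-summarization-model"
TOKEN = os.environ["DATABRICKS_TOKEN"]

# Real-time inference: POST records to the endpoint's invocations URL.
response = requests.post(
    f"{WORKSPACE_URL}/serving-endpoints/{ENDPOINT_NAME}/invocations",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json={"dataframe_records": [{"text": "Summarize this quarterly report ..."}]},
    timeout=60,
)
response.raise_for_status()
print(response.json())
```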
For batch inference, you can invoke the models\n\nas functions directly from DBSQL or notebooks.\n\nWith AI model sharing, Databricks customers will have access\n\nto best-in-class models from leading providers, as well as OSS\n\nmodels published by Databricks which can be quickly and securely\n\napplied on top of their data. Databricks will curate and publish\n\nits own open source models across common use cases, such as\n\ninstruction-following and text summarization, and optimize tuning or\n\ndeployment of these models.\n\nUsing AI models from Databricks Marketplace can help your\n\norganization summarize complex information quickly and easily to\n\nhelp accelerate the pace of innovation.\n\n\n-----\n\n## Chapter 6\n Share securely with Databricks Clean Rooms\n\n\nWhile the demand for external data to make data-driven\n\ninnovations is greater than ever, there is growing concern among\n\norganizations around data privacy. The need for organizations to\n\nshare data and collaborate with their partners and customers in a\n\nsecure, governed and privacy-centric way is driving the concept\n\nof “data clean rooms.”\n\n\n#### What is a data clean room?\n\nA data clean room provides a secure, governed and privacy-safe\n\nenvironment where participants can bring their sensitive data, which\n\nmight include personally identifiable information (PII), and perform\n\njoint analysis on that private data. Participants have full control\n\nof the data and can decide which participants can perform what\n\nanalysis without exposing any sensitive data.\n\n\n###### Collaborator A\n Data Cleanroom\nE.G., AGENCIES, PUBLISHERS, MVPDS, RETAILERS\n\n\u0007What is our audience overlap?\n\n\n###### Collaborator B\n\nE.G., ADVERTISERTS\n\n\n**Figure 8:**\nData clean room\ndiagram example\nfor audience\noverlap analysis in\nadvertising\n\n\nHow did my campaign do in\n\nterms of reach and frequency?\n\n\n\u0007What is the lift in purchases\n\namong those in-segment versus\nthose out-of-segment?\n\n**Collaborator A-owned sensitive data** **Secure and privacy-preserving environment** **Collaborator B-owned sensitive data**\n\n\n-----\n\nA data clean room is not a new concept. Google introduced the idea in 2017 when it announced Ads Data Hub, which allows", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "bfc080f09852618aa1bd62f142c2e4c6", + "joint analysis on that private data. Participants have full control\n\nof the data and can decide which participants can perform what\n\nanalysis without exposing any sensitive data.\n\n\n###### Collaborator A\n Data Cleanroom\nE.G., AGENCIES, PUBLISHERS, MVPDS, RETAILERS\n\n\u0007What is our audience overlap?\n\n\n###### Collaborator B\n\nE.G., ADVERTISERTS\n\n\n**Figure 8:**\nData clean room\ndiagram example\nfor audience\noverlap analysis in\nadvertising\n\n\nHow did my campaign do in\n\nterms of reach and frequency?\n\n\n\u0007What is the lift in purchases\n\namong those in-segment versus\nthose out-of-segment?\n\n**Collaborator A-owned sensitive data** **Secure and privacy-preserving environment** **Collaborator B-owned sensitive data**\n\n\n-----\n\nA data clean room is not a new concept. Google introduced the idea in 2017 when it announced Ads Data Hub, which allows\n\nadvertisers to gain impression-level insights about cross-device media campaigns in a more secure, privacy-safe environment. 
In\n\nthe last few years, the demand for clean rooms has accelerated. IDC predicts that by 2024, 65% of G2000 enterprises will form data\n\nsharing partnerships with external stakeholders via data clean rooms to increase interdependence while safeguarding data privacy.\n\nThere are various compelling needs driving this demand:\n\n\n**Privacy-first world.** Stringent data privacy regulations such as\n\nGDPR and CCPA, along with sweeping changes in third-party\n\nmeasurement, have transformed how organizations collect, use\n\nand share data. For example, Apple’s [App Tracking Transparency](https://developer.apple.com/app-store/user-privacy-and-data-use/)\n\n[Framework](https://developer.apple.com/app-store/user-privacy-and-data-use/) (ATT) provides users of Apple devices the freedom\n\nand flexibility to easily opt out of app tracking. Google also plans\n\nto [phase out support for third-party cookies in Chrome](https://blog.google/products/chrome/updated-timeline-privacy-sandbox-milestones/) by late\n\n2024. As these privacy laws and practices evolve, the demand\n\nfor data cleanrooms is likely to rise as the industry moves to new\n\n\n**Collaboration in a fragmented ecosystem.** Today, consumers have\n\nmore options than ever before when it comes to where, when and\n\nhow they engage with content. As a result, the digital footprint of\n\nconsumers is fragmented across different platforms, necessitating\n\nthat companies collaborate with their partners to create a unified\n\nview of their customers’ needs and requirements. To facilitate\n\ncollaboration across organizations, cleanrooms provide a secure\n\nand private way to combine their data with other data to unlock new\n\ninsights or capabilities.\n\n\nidentifiers that are PII based, such as UID 2.0, and organizations\n\ntry to find new ways to share and join data with customers and\n\npartners in a privacy-centric way.\n\n**New ways to monetize data.** Most organizations are looking to\n\nmonetize their data in one form or another. With today’s privacy\n\nlaws, companies will try to find any possible advantages to monetize\n\ntheir data without the risk of breaking privacy rules. This creates an\n\nopportunity for data vendors or publishers to join data for big data\n\nanalytics without having direct access to the data.\n\n\n-----\n\n#### Common data clean room uses cases\n\n\n#### Category management for retail and consumer goods\n\nClean rooms enable real-time collaboration between retailers\n\nand suppliers, ensuring secure information exchange for demand\n\nforecasting, inventory planning and supply chain optimization.\n\nThis improves product availability, reduces costs and streamlines\n\noperations for both parties.\n\n#### Real-world evidence (RWE) for healthcare\n\nClean rooms provide secure access to sensitive healthcare data sets,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "5fedc77ae1a861260a1c4c866e2dbd05", + "partners in a privacy-centric way.\n\n**New ways to monetize data.** Most organizations are looking to\n\nmonetize their data in one form or another. With today’s privacy\n\nlaws, companies will try to find any possible advantages to monetize\n\ntheir data without the risk of breaking privacy rules. 
This creates an\n\nopportunity for data vendors or publishers to join data for big data\n\nanalytics without having direct access to the data.\n\n\n-----\n\n#### Common data clean room uses cases\n\n\n#### Category management for retail and consumer goods\n\nClean rooms enable real-time collaboration between retailers\n\nand suppliers, ensuring secure information exchange for demand\n\nforecasting, inventory planning and supply chain optimization.\n\nThis improves product availability, reduces costs and streamlines\n\noperations for both parties.\n\n#### Real-world evidence (RWE) for healthcare\n\nClean rooms provide secure access to sensitive healthcare data sets,\n\nallowing collaborators to connect and query multiple sources of data\n\nwithout comprising data privacy. This supports RWE use cases such\n\nas regulatory decisions, safety, clinical trial design and observational\n\nresearch.\n\n\n#### Audience overlap exploration for media and entertainment\n\nBy creating a clean room environment, media companies can\n\nsecurely share their audience data with advertisers or other media\n\npartners. This allows them to perform in-depth analysis and identify\n\nshared audience segments without directly accessing or exposing\n\nindividual user information.\n\n#### Know Your Customer (KYC) in banking\n\nKYC standards are designed to combat financial fraud, money\n\nlaundering and terrorism financing. Clean rooms can be used within a\n\ngiven jurisdiction to allow financial services companies to collaborate\n\nand run shared analytics to build a holistic view of a transaction for\n\ninvestigations.\n\n\n-----\n\n#### Personalization with expanded interests for retailers\n\nRetailers want to target consumers based on past purchases, as\n\nwell as other purchases with different retailers. Clean rooms enable\n\nretailers to augment their knowledge of consumers to suggest new\n\nproducts and services that are relevant to the individual but have\n\n\n#### 5G data monetization for telecom\n\n5G data monetization enables telecoms to capitalize on data\n\nfrom 5G networks. Clean rooms provide a secure environment\n\nfor collaboration with trusted partners, ensuring privacy while\n\nmaximizing data value for optimized services, personalized\n\nexperiences and targeted advertising.\n\n\nnot yet been purchased.\n\n\n-----\n\n#### Shortcomings of existing data clean rooms\n\n\nOrganizations exploring clean room options are finding some glaring\n\nshortcomings in the existing solutions that limit the full potential of the\n\n“clean rooms” concept.\n\nFirst, many existing data clean room vendors require data to be on the\n\nsame cloud, same region, and/or same data platform. Participants then\n\nhave to move data into proprietary platforms, which results in lock-in\n\nand additional data storage costs.\n\n\nSecond, most existing solutions are not scalable to expand\n\ncollaboration beyond a few collaborators at a time. For example,\n\nan advertiser might want to get a detailed view of their ad\n\nperformance across different platforms, which requires analysis\n\nof the aggregated data from multiple data publishers. 
With\n\ncollaboration limited to just a few participants, organizations get\n\npartial insights on one clean room platform and end up moving\n\ntheir data to another clean room vendor to aggregate the data,\n\nincurring the operational overhead of collating partial insights.\n\nFinally, existing clean room solutions do not provide the flexibility\n\nto run arbitrary analysis and are mainly restricted to SQL, a\n\nsubset of Python, and pre-defined templates. While SQL is\n\nabsolutely needed for clean rooms, there are times when you\n\nrequire complex computations such as machine learning or\n\nintegration with APIs where SQL doesn’t satisfy the full depth of\n\nthe technical requirements.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "384c8233e32ea9b504cccff0fc400f7e", + "and additional data storage costs.\n\n\nSecond, most existing solutions are not scalable to expand\n\ncollaboration beyond a few collaborators at a time. For example,\n\nan advertiser might want to get a detailed view of their ad\n\nperformance across different platforms, which requires analysis\n\nof the aggregated data from multiple data publishers. With\n\ncollaboration limited to just a few participants, organizations get\n\npartial insights on one clean room platform and end up moving\n\ntheir data to another clean room vendor to aggregate the data,\n\nincurring the operational overhead of collating partial insights.\n\nFinally, existing clean room solutions do not provide the flexibility\n\nto run arbitrary analysis and are mainly restricted to SQL, a\n\nsubset of Python, and pre-defined templates. While SQL is\n\nabsolutely needed for clean rooms, there are times when you\n\nrequire complex computations such as machine learning or\n\nintegration with APIs where SQL doesn’t satisfy the full depth of\n\nthe technical requirements.\n\n\n-----\n\n#### Key benefits of Databricks Clean Rooms\n\nDatabricks Clean Rooms allow businesses to easily collaborate with their customers and partners in a secure environment on\n\nany cloud in a privacy-safe way. Key benefits of Databricks Clean Rooms include:\n\n\n**Flexible - your language and workload of**\n\n**choice.** Databricks Clean Rooms empower\n\ncollaborators to share and join their existing\n\ndata and run complex workloads in any\n\nlanguage —Python, R, SQL, Java and Scala —\n\non the data while maintaining data privacy.\n\nBeyond traditional SQL, users can run arbitrary\n\nworkloads and languages, allowing them to train\n\nmachine learning models, perform inference\n\nand utilize open-source or third-party privacy-\n\nenhancing technologies. This flexibility enables\n\ndata scientists and analysts to achieve more\n\ncomprehensive and advanced data analysis\n\nwithin the secure Clean Room environment.\n\n\n**Scalable, multi-party collaboration.**\n\nWith Databricks Clean Rooms, you can\n\nlaunch a clean room and work with multiple\n\ncollaborators at a time. This capability\n\nenables real-time collaboration, fostering\n\nefficient and rapid results. Moreover,\n\nDatabricks Clean Rooms seamlessly\n\nintegrate with identity service providers,\n\nallowing users to leverage offerings from\n\nthese providers during collaboration. 
The\n\nability to collaborate with multiple parties\n\nand leverage identity services enhances the\n\noverall data collaboration experience within\n\nDatabricks Clean Rooms.\n\n\n**Interoperable - any data source**\n\n**with no replication.** Databricks Clean\n\nRooms excel in interoperability, ensuring\n\nsmooth collaboration across diverse\n\nenvironments. With Delta Sharing,\n\ncollaborators can seamlessly work\n\ntogether across different cloud providers,\n\nregions and even data platforms without\n\nthe need for extensive data movement.\n\nThis eliminates data silos and enables\n\norganizations to leverage existing\n\ninfrastructure and data ecosystems while\n\nmaintaining the utmost security and\n\ncompliance.\n\n\n-----\n\n## Resources\n Getting started with Data Sharing and Collaboration\n\n\nData sharing plays a key role in business processes across the\n\nenterprise, from product development and internal operations to\n\ncustomer experience and compliance. However, most businesses\n\nhave been slow to move forward because of incompatibility\n\nbetween systems, complexity and security concerns.\n\nData-driven organizations need an open — and secure — approach\n\nto data sharing.\n\n\nDatabricks offers an open approach to data sharing and\n\ncollaboration with a variety of tools to:\n\n\u0007 **Share across platforms:** You can share live data sets, as well\n\nas AI models, dashboards and notebooks across platforms,\n\nclouds and regions. This open approach is powered by\n\nDelta Sharing, the world’s first open protocol for secure data\n\nsharing, which allows organizations to share data for any use", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "6b59600128f0f7c536b3b2f20ba6891b", + "organizations to leverage existing\n\ninfrastructure and data ecosystems while\n\nmaintaining the utmost security and\n\ncompliance.\n\n\n-----\n\n## Resources\n Getting started with Data Sharing and Collaboration\n\n\nData sharing plays a key role in business processes across the\n\nenterprise, from product development and internal operations to\n\ncustomer experience and compliance. However, most businesses\n\nhave been slow to move forward because of incompatibility\n\nbetween systems, complexity and security concerns.\n\nData-driven organizations need an open — and secure — approach\n\nto data sharing.\n\n\nDatabricks offers an open approach to data sharing and\n\ncollaboration with a variety of tools to:\n\n\u0007 **Share across platforms:** You can share live data sets, as well\n\nas AI models, dashboards and notebooks across platforms,\n\nclouds and regions. This open approach is powered by\n\nDelta Sharing, the world’s first open protocol for secure data\n\nsharing, which allows organizations to share data for any use\n\ncase, any tool and on any cloud.\n\n\u0007 **Share all your data and AI: Databricks Marketplace** is an\n\nopen marketplace for all your data, analytics and AI, enabling\n\nboth data consumers and data providers with the ability to\n\ndeliver innovation and advance analytics and AI initiatives.\n\n\u0007 **Share securely: Databricks Clean Rooms** allows businesses\n\nto easily collaborate with customers and partners on any\n\ncloud in a privacy-safe way. With Delta Sharing, clean room\n\nparticipants can securely share data from their data lakes\n\nwithout any data replication across clouds or regions. 
Your\n\ndata stays with you without vendor lock-in, and you can\n\ncentrally audit and monitor the usage of your data.\n\n\n-----\n\nGet started with these products by exploring the resources below.\n\n\n**Delta Sharing**\n\n\u0007 [Data Sharing on Databricks](https://www.databricks.com/product/delta-sharing)\n\n[\u0007Learn about Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog)\n\n[\u0007Blog post: What’s new with Data Sharing and Collaboration on the](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n\n[Lakehouse](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n\n[\u0007Learn about open source Delta Sharing](https://delta.io/sharing/)\n\n[Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n\n**Databricks Marketplace**\n\n[\u0007Learn about Databricks Marketplace](https://www.databricks.com/product/marketplace)\n\n[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n\n[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n\n[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n\n[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n\n\n[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n\n\n**Databricks Clean Rooms**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "f330d8664cc5ce1bac8095d4a154a2f8", + "[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n\n[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n\n[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n\n[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n\n\n[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n\n\n**Databricks Clean Rooms**\n\n\u0007 [Learn about Databricks Clean Rooms](https://www.databricks.com/product/clean-room)\n\n[\u0007Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[\u0007eBook: The Definitive Guide to Data Clean Rooms](https://www.databricks.com/resources/ebook/market-smarter-data-clean-rooms)\n\n[\u0007Webinar: Unlock the Power of Secure Data Collaboration](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n\n[with Clean 
Rooms](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n\n\n[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n\n\n-----\n\n## About the Authors\n\n\n**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n\nmaking analytics and AI simple for customers by leveraging the\n\npower of the Databricks Lakehouse Platform. You can reach Vuong\n\non [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n\n\n**Sachin Thakur** is a Principal Product Marketing Manager on the\n\nDatabricks Data Engineering and Analytics team. His area of focus\n\nis data governance with Unity Catalog, and he is passionate about\n\nhelping organizations democratize data and AI with the Databricks", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "783b4155a9c7a07bf4dcceef9213e9f4", + "[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n\n\n-----\n\n## About the Authors\n\n\n**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n\nmaking analytics and AI simple for customers by leveraging the\n\npower of the Databricks Lakehouse Platform. You can reach Vuong\n\non [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n\n\n**Sachin Thakur** is a Principal Product Marketing Manager on the\n\nDatabricks Data Engineering and Analytics team. His area of focus\n\nis data governance with Unity Catalog, and he is passionate about\n\nhelping organizations democratize data and AI with the Databricks\n\nLakehouse Platform. You can reach Sachin on [LinkedIn](https://www.linkedin.com/in/sachin10thakur/) .\n\n\n**Milos Colic** is a Senior Solution Architect at Databricks. His\n\n\npassion is to help customers with their data exchange and data\n\nmonetization needs. Furthermore, he is passionate about geospatial\n\ndata processing and ESG. You can reach Milos on [LinkedIn](https://www.linkedin.com/in/milos-colic/) .\n\n\n**Jay Bhankharia** is a Senior Director on the Databricks Data\n\nPartnerships team. His passion is to help customers gain insights\n\nfrom data to use the power of the Databricks Lakehouse Platform\n\nfor their analytics needs. You can reach Jay on [LinkedIn](https://www.linkedin.com/in/jay-bhankharia-cfa-b9835612/) .\n\n\n**Itai Weiss** is a Lead Delta Sharing Specialist at Databricks and has\n\n\nover 20 years of experience in helping organizations of any size\n\nbuild data solutions. He focuses on data monetization and loves to\n\nhelp customers and businesses get more value from the data they\n\nhave. You can reach Itai on [LinkedIn](https://www.linkedin.com/in/itai-weiss/) .\n\n**Somasekar Natarajan** (Som) is a Solution Architect at\n\nDatabricks specializing in enterprise data management. Som has\n\nworked with Fortune organizations spanning three continents for\n\nclose to two decades with one objective — helping customers to\n\n\n**Giselle Goicochea** is a Senior Product Marketing Manager\n\non the Databricks Data Engineering and Analytics team. Her area\n\nof focus is data sharing and collaboration with Delta Sharing and\n\nDatabricks Marketplace. 
You can reach Giselle on [LinkedIn](https://www.linkedin.com/in/giselle-goicochea/) .\n\n**Kelly Albano** is a Product Marketing Manager on the Databricks\n\nData Engineering and Analytics team. Her area of focus is security,\n\ncompliance and Databricks Clean Rooms. You can reach\n\nKelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n\n\nharness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "842ae84382f4816a7c67c6480c3a3d55", + "Kelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n\n\nharness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n© Databricks 2023 All rights reserved\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "76bb60c8fadfe670658fb0e87fc193c4", + "### EBOOK\n\n# A Compact Guide to Large Language Models\n\n\n-----\n\nSECTION 1\n## Introduction\n\n##### Definition of large language models (LLMs)\n\nLarge language models are AI systems that are designed to process and analyze\nvast amounts of natural language data and then use that information to generate\nresponses to user prompts. These systems are trained on massive data sets\nusing advanced machine learning algorithms to learn the patterns and structures\nof human language, and are capable of generating natural language responses to\na wide range of written inputs. Large language models are becoming increasingly\nimportant in a variety of applications such as natural language processing,\nmachine translation, code and text generation, and more.\n\nWhile this guide will focus on language models, it’s important to understand that\nthey are only one aspect under a larger generative AI umbrella. 
Other noteworthy\ngenerative AI implementations include projects such as art generation from text,\naudio and video generation, and certainly more to come in the near future.\n\n\n-----\n\n##### Extremely brief historical background and development of LLMs\n\n\n###### 1950s–1990s\nInitial attempts are made to map hard rules around languages and\nfollow logical steps to accomplish tasks like translating a sentence\nfrom one language to another.\n\nWhile this works sometimes, strictly defined rules only work for\nconcrete, well-defined tasks that the system has knowledge about.\n\n###### 1990s \nLanguage models begin evolving into statistical models and\nlanguage patterns start being analyzed, but larger-scale projects\nare limited by computing power.\n\n###### 2000s \nAdvancements in machine learning increase the complexity of\nlanguage models, and the wide adoption of the internet sees an\n\nenormous increase in available training data.\n\n###### 2012 \nAdvancements in deep learning architectures and larger data sets\nlead to the development of GPT (Generative Pre-trained Transformer).\n\n\n###### 2018\nGoogle introduces BERT (Bidirectional Encoder Representations\nfrom Transformers), which is a big leap in architecture and paves\nthe way for future large language models.\n\n###### 2020\nOpenAI releases GPT-3, which becomes the largest model at\n175B parameters and sets a new performance benchmark for\nlanguage-related tasks.\n\n###### 2022\nChatGPT is launched, which turns GPT-3 and similar models into\na service that is widely accessible to users through a web interface\nand kicks off a huge increase in public awareness of LLMs and\ngenerative AI.\n\n###### 2023\nOpen source LLMs begin showing increasingly impressive results\nwith releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna.\nGPT-4 is also released, setting a new benchmark for both parameter\nsize and performance.\n\n\n-----\n\nSECTION 2\n## Understanding Large Language Models\n\n\n##### What are language models and how do they work?\n\nLarge language models are advanced artificial intelligence systems that take\nsome input and generate humanlike text as a response. They work by first\nanalyzing vast amounts of data and creating an internal structure that models\nthe natural language data sets that they’re trained on. Once this internal\nstructure has been developed, the models can then take input in the form of\nnatural language and approximate a good response.\n\n##### If they’ve been around for so many years, why are they just now making headlines?\n\nA few recent advancements have really brought the spotlight to generative AI\nand large language models:\n\n**A D VA N C E M E N T S I N T E C H N I Q U E S**\nOver the past few years, there have been significant advancements in the\ntechniques used to train these models, resulting in big leaps in performance.\nNotably, one of the largest jumps in performance has come from integrating\nhuman feedback directly into the training process.\n\n\n**I N C R E A S E D A C C E S S I B I L I T Y**\nThe release of ChatGPT opened the door for anyone with internet access\nto interact with one of the most advanced LLMs through a simple web\ninterface. 
This brought the impressive advancements of LLMs into the\nspotlight, since previously these more powerful LLMs were only available\nto researchers with large amounts of resources and those with very deep\ntechnical knowledge.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "b30db2ad94ab140731be7adc8169387e", + "##### If they’ve been around for so many years, why are they just now making headlines?\n\nA few recent advancements have really brought the spotlight to generative AI\nand large language models:\n\n**A D VA N C E M E N T S I N T E C H N I Q U E S**\nOver the past few years, there have been significant advancements in the\ntechniques used to train these models, resulting in big leaps in performance.\nNotably, one of the largest jumps in performance has come from integrating\nhuman feedback directly into the training process.\n\n\n**I N C R E A S E D A C C E S S I B I L I T Y**\nThe release of ChatGPT opened the door for anyone with internet access\nto interact with one of the most advanced LLMs through a simple web\ninterface. This brought the impressive advancements of LLMs into the\nspotlight, since previously these more powerful LLMs were only available\nto researchers with large amounts of resources and those with very deep\ntechnical knowledge.\n\n**G R O W I N G C O M P U TAT I O N A L P O W E R**\nThe availability of more powerful computing resources, such as graphics\nprocessing units (GPUs), and better data processing techniques allowed\nresearchers to train much larger models, improving the performance of\nthese language models.\n\n**I M P R O V E D T R A I N I N G D ATA**\nAs we get better at collecting and analyzing large amounts of data, the\n\nmodel performance has improved dramatically. In fact, Databricks showed\nthat you can get amazing results training a relatively small model with a\nhigh-quality data set with [Dolly 2.0](https://huggingface.co/databricks/dolly-v2-12b) (and we released the data set as well\nwith the databricks-dolly-15k [data set](http://databricks/databricks-dolly-15k) ).\n\n\n-----\n\n##### So what are organizations using large language models for?\n\nHere are just a few examples of common use cases for large language models:\n\n**C H AT B O T S A N D V I R T U A L A S S I S TA N T S**\nOne of the most common implementations, LLMs can be used by\norganizations to provide help with things like customer support,\ntroubleshooting, or even having open-ended conversations with userprovided prompts.\n\n**C O D E G E N E R AT I O N A N D D E B U G G I N G**\nLLMs can be trained on large amounts of code examples and give\nuseful code snippets as a response to a request written in natural language.\nWith the proper techniques, LLMs can also be built in a way to reference\nother relevant data that it may not have been trained with, such as a\ncompany’s documentation, to help provide more accurate responses.\n\n**S E N T I M E N T A N A LY S I S**\nOften a hard task to quantify, LLMs can help take a piece of text and gauge\nemotion and opinions. This can help organizations gather the data and\n\nfeedback needed to improve customer satisfaction.\n\n\n**L A N G U A G E T R A N S L AT I O N**\nGlobalize all your content without hours of painstaking work by simply\nfeeding your web pages through the proper LLMs and translating them to\ndifferent languages. 
As more LLMs are trained in other languages, quality\nand availability will continue to improve.\n\n**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\nEntire customer calls or meetings could be efficiently summarized so that\nothers can more easily digest the content. LLMs can take large amounts of\ntext and boil it down to just the most important bytes.\n\n**C O N T E N T G E N E R AT I O N**\nStart with a detailed prompt and have an LLM develop an outline for you.\nThen continue on with those prompts and LLMs can generate a good first\ndraft for you to build off. Use them to brainstorm ideas, and ask the LLM\nquestions to help you draw inspiration from.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "156c59c7ad359ad7302ff542371be820", + "feedback needed to improve customer satisfaction.\n\n\n**L A N G U A G E T R A N S L AT I O N**\nGlobalize all your content without hours of painstaking work by simply\nfeeding your web pages through the proper LLMs and translating them to\ndifferent languages. As more LLMs are trained in other languages, quality\nand availability will continue to improve.\n\n**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\nEntire customer calls or meetings could be efficiently summarized so that\nothers can more easily digest the content. LLMs can take large amounts of\ntext and boil it down to just the most important bytes.\n\n**C O N T E N T G E N E R AT I O N**\nStart with a detailed prompt and have an LLM develop an outline for you.\nThen continue on with those prompts and LLMs can generate a good first\ndraft for you to build off. Use them to brainstorm ideas, and ask the LLM\nquestions to help you draw inspiration from.\n\n**_Note:_** Most LLMs are _not_ trained to be fact machines. They know how to use\nlanguage, but they might not know who won the big sporting event last year.\nIt’s always important to fact check and understand the responses before\n\nusing them as a reference.\n\n\n**T E X T C L A S S I F I C AT I O N A N D C L U S T E R I N G**\nThe ability to categorize and sort large volumes of data enables the\nidentification of common themes and trends, supporting informed\ndecision-making and more targeted strategies.\n\n\n-----\n\nSECTION 3\n## Applying Large Language Models\n\n\nThere are a few paths that one can take when looking to apply large language\nmodels for their given use case. Generally speaking, you can break them down\ninto two categories, but there’s some crossover between each. We’ll briefly cover\nthe pros and cons of each and what scenarios fit best for each.\n\n##### Proprietary services\n\nAs the first widely available LLM powered service, OpenAI’s ChatGPT was the\nexplosive charge that brought LLMs into the mainstream. ChatGPT provides\na nice user interface (or API) where users can feed prompts to one of many\nmodels (GPT-3.5, GPT-4, and more) and typically get a fast response. These are\namong the highest-performing models, trained on enormous data sets, and are\ncapable of extremely complex tasks both from a technical standpoint, such as\ncode generation, as well as from a creative perspective like writing poetry in a\nspecific style.\n\nThe downside of these services is the absolutely enormous amount of compute\nrequired not only to train them (OpenAI has said GPT-4 cost them over $100\nmillion to develop) but also to serve the responses. 
For this reason, these\nextremely large models will likely always be under the control of organizations,\n\n\nand require you to send your data to their servers in order to interact with their\nlanguage models. This raises privacy and security concerns, and also subjects\nusers to “black box” models, whose training and guardrails they have no control\nover. Also, due to the compute required, these services are not free beyond a\nvery limited use, so cost becomes a factor in applying these at scale.\n\nIn summary: Proprietary services are great to use if you have very complex tasks,\nare okay with sharing your data with a third party, and are prepared to incur\ncosts if operating at any significant scale.\n\n##### Open source models\n\nThe other avenue for language models is to go to the open source community,\nwhere there has been similarly explosive growth over the past few years.\nCommunities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n\nfrom contributors that can help solve tons of specific use cases such as text\ngeneration, summarization and classification. The open source community has\nbeen quickly catching up to the performance of the proprietary models, but\nultimately still hasn’t matched the performance of something like GPT-4.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "db2ede947d82f8c39104fbd747f97f40", + "In summary: Proprietary services are great to use if you have very complex tasks,\nare okay with sharing your data with a third party, and are prepared to incur\ncosts if operating at any significant scale.\n\n##### Open source models\n\nThe other avenue for language models is to go to the open source community,\nwhere there has been similarly explosive growth over the past few years.\nCommunities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n\nfrom contributors that can help solve tons of specific use cases such as text\ngeneration, summarization and classification. The open source community has\nbeen quickly catching up to the performance of the proprietary models, but\nultimately still hasn’t matched the performance of something like GPT-4.\n\n\n-----\n\nIt does currently take a little bit more work to grab an open source model and\nstart using it, but progress is moving very quickly to make them more accessible\nto users. On Databricks, for example, we’ve made [improvements to open source](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html)\n[frameworks](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html) like MLflow to make it very easy for someone with a bit of Python\nexperience to pull any Hugging Face transformer model and use it as a Python\nobject. Oftentimes, you can find an open source model that solves your specific\nproblem that is **orders of magnitude** smaller than ChatGPT, allowing you to bring\nthe model into your environment and host it yourself. This means that you can\nkeep the data in your control for privacy and governance concerns as well as\nmanage your costs.\n\n\n##### Conclusion and general guidelines\n\nUltimately, every organization is going to have unique challenges to overcome,\nand there isn’t a one-size-fits-all approach when it comes to LLMs. 
As the world\nbecomes more data driven, everything, including LLMs, will be reliant on having\na strong foundation of data. LLMs are incredible tools, but they have to be used\nand implemented on top of this strong data foundation. Databricks brings both\nthat strong data foundation as well as the integrated tools to let you use and\nfine-tune LLMs in your domain.\n\n\nAnother huge upside to using open source models is the ability to fine-tune\nthem to your own data. Since you’re not dealing with a black box of a proprietary\nservice, there are techniques that let you take open source models and train\nthem to your specific data, greatly improving their performance on your\nspecific domain. We believe the future of language models is going to move\nin this direction, as more and more organizations will want full control and\nunderstanding of their LLMs.\n\n\n-----\n\nSECTION 4\n## So What Do I Do Next If I Want to Start Using LLMs?\n\n\nThat depends where you are on your journey! Fortunately, we have a few paths\nfor you.\n\nIf you want to go a little deeper into LLMs but aren’t quite ready to do it yourself,\nyou can watch one of Databricks’ most talented developers and speakers go\nover these concepts in more detail during the on-demand talk “ [How to Build](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n[Your Own Large Language Model Like Dolly.](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly) ”\n\nIf you’re ready to dive a little deeper and expand your education and\nunderstanding of LLM foundations, we’d recommend checking out our\n[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\nand dive into the theory behind foundation models.\n\nIf your hands are already shaking with excitement and you already have some\nworking knowledge of Python and Databricks, we’ll provide some great examples\nwith sample code that can get you up and running with LLMs right away!\n\n\n###### Getting started with NLP using Hugging Face transformers pipelines\n\n Fine-Tuning Large Language Models with Hugging Face and DeepSpeed", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "6164b4b9647cc0b9d8049c2b7312a557", + "If you’re ready to dive a little deeper and expand your education and\nunderstanding of LLM foundations, we’d recommend checking out our\n[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\nand dive into the theory behind foundation models.\n\nIf your hands are already shaking with excitement and you already have some\nworking knowledge of Python and Databricks, we’ll provide some great examples\nwith sample code that can get you up and running with LLMs right away!\n\n\n###### Getting started with NLP using Hugging Face transformers pipelines\n\n Fine-Tuning Large Language Models with Hugging Face and DeepSpeed\n\n Introducing AI Functions: Integrating Large Language Models with Databricks SQL\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. 
Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\n#### Contact us for a personalized demo: databricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "9f63c6b051354dd466246dab2d3a7ff5", + "##### EBOOK\n\n# 8 Steps to Becoming an AI-Forward Retailer\n\n\n-----\n\n## Contents\n\n\nIntroduction .............................................................................................................................................................................................. **3**\n\nThe State of the Retail Industry:\n\nThe Diverging Performance of Data Leaders vs. Data Laggards ...................................................................................... **4**\n\nBegin With a Shared Vision of Success ....................................................................................................................................... **6**\n\nWhy Companies Struggle With Setting Clear Business Outcomes for AI ................................................................... **7**\n\nBefore Diving In: Assess Your Readiness ..................................................................................................................................... **9**\n\nGetting Started: Putting Some Wins on the Board .................................................................................................................. **11**\n\nGoing Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n\nNormalizing the Process: Engraining a Data-Driven Mindset", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "f0a2f88cb135664940c16d2d9507c27f", + "Going Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n\nNormalizing the Process: Engraining a Data-Driven Mindset\n\nInto the Fabric of the Business ...................................................................................................................................................... **14**\n\nFrom Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise .......................................... **16**\n\nThe 8 Steps to Building a Data-Forward Retailer ................................................................................................................... **17**\n\nTransform Retail Data Into Actionable Insights ....................................................................................................................... 
**21**\n\n\n-----\n\n## Introduction\n\n\nIn a world where data is king, retailers have historically been trailblazers, pioneering data technology\nadoption to supercharge their operations, enhance customer understanding and sharpen\npersonalization. The journey began with the simple cash register about 150 years ago, progressed to\nstandardized product reporting with the introduction of the UPC and EAN, and has evolved to include\ncutting-edge technologies such as RFID and machine learning.\n\nToday, we stand on the brink of “Generation AI,” defined by sophisticated language models and\nimages. Retailers, with their history of embracing data technologies, find themselves in a strong\nposition to reap the benefits of this new era. Automation of customer service, supply chain modeling\nwith digital twins and delivering hyper-personalized experiences in real time are all in the cards,\npromising to bolster revenue, improve margins and slash costs for early adopters.\n\nAccording to an internal analysis by Databricks, data pioneers are already outstripping their\ncompetition. The “Databricks 30” — an index tracking the publicly traded data and AI leaders across\nsix major industry sectors, including retail — shows these front-runners outperforming the rest of the\nmarket by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies\nare setting themselves up for significant gains and a robust competitive advantage.\n\nHowever, for retailers mired in the landscape of outdated data platforms, the transformation into an\nAI-driven organization can seem a Herculean task. Embracing this wave of innovative technologies may\nfeel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly\nevolving retail landscape.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8af29864e6963d85e9bb0a6142ac12f2", + "Today, we stand on the brink of “Generation AI,” defined by sophisticated language models and\nimages. Retailers, with their history of embracing data technologies, find themselves in a strong\nposition to reap the benefits of this new era. Automation of customer service, supply chain modeling\nwith digital twins and delivering hyper-personalized experiences in real time are all in the cards,\npromising to bolster revenue, improve margins and slash costs for early adopters.\n\nAccording to an internal analysis by Databricks, data pioneers are already outstripping their\ncompetition. The “Databricks 30” — an index tracking the publicly traded data and AI leaders across\nsix major industry sectors, including retail — shows these front-runners outperforming the rest of the\nmarket by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies\nare setting themselves up for significant gains and a robust competitive advantage.\n\nHowever, for retailers mired in the landscape of outdated data platforms, the transformation into an\nAI-driven organization can seem a Herculean task. 
Embracing this wave of innovative technologies may\nfeel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly\nevolving retail landscape.\n\nTo help you navigate the rapidly evolving world of retail and consumer goods, this eBook provides a\nroad map for organizations embarking on digital transformation journeys — a shift that is as much\nabout culture as it is about technology, if not more so. The core advice? Start with a crystal-clear\nvision for transformation, outlining a compelling case for why such change is vital for the company’s\nlong-term survival. Then, initiate the process by introducing AI to make gradual enhancements in\ncritical business procedures.\n\n\n-----\n\n## The State of the Retail Industry: The Diverging Performance of Data Leaders vs. Data Laggards\n\n\nThe pandemic’s fallout has led to a widening chasm between the retail industry’s\nleaders and laggards. McKinsey & Company encapsulated this trend succinctly:\n“Companies with tech-forward business models, who were already pulling ahead\npre-crisis, left their competitors in the dust.”\n\nBut what exactly is a “tech-forward business model”? It isn’t a simple narrative of\ndigital natives dethroning traditional retailers. Heavyweights like Walmart, Target\nand Costco held their own against Amazon. Nor was it purely a matter of scale —\nsmaller brands like Warby Parker or Everlane managed to carve out substantial\nconsumer bases, competing against larger, established players.\n\n**The common denominator among all victors**\n**was their ability to harness data, analytics and AI**\n**to rapidly react to shifts in consumer behavior.**\n\n\nmethods, optimizing operations to alleviate the pressure these modes exerted\non margins. They successfully established tighter partnerships with suppliers\nand logistic entities, collaborating toward shared triumphs.\n\nIn all these instances, it was their timely access to information, foresight\ndriven by this data, and the exploration of probable outcomes that set these\norganizations apart. Infusing data-driven decision-making into core processes\nwithin the organization, as well as those crossing partner boundaries, unlocked\nthis approach’s full potential.\n\nTo illustrate the significance of prioritizing data and AI, we developed the\nDatabricks 30 Index. Drawing inspiration from Morgan Stanley’s “Data Era”\nstocks research, this index tracks marquee customers across our top five\nverticals and partners. The Databricks 30 is an equal-weight price index,\n\ncomposed of five marquee customers each across Retail/Consumer Products,\nFinancial Services, Healthcare, Media/Entertainment, Manufacturing/Logistics,\nplus five strategic partners.\n\n\nThese businesses deftly used consumer demand insights to understand the\neffects of supply chain disruptions and labor shortages and reallocate resources\nto mitigate the most harmful impacts. They adeptly introduced new delivery\n\n\n-----\n\nOur analysis reveals that companies in the Databricks 30 Index outpaced the\nS&P 500 by an impressive +21 percentage points (pp) over the past three years.\nIn other words, if the stock market rose by 50% during this period, the Databricks\n30 Index would have soared by 71% (outperforming by 21pp). 
Even more\nremarkable, excluding tech entirely from the Databricks 30, the Databricks 30\nex-Tech index outperforms the S&P 500 by an even larger margin over the same\ntime frame: +23pp.\n\n\nDB30 DOw30", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "7a3d84f6cfab2fffb4dd147ed10688b3", + "composed of five marquee customers each across Retail/Consumer Products,\nFinancial Services, Healthcare, Media/Entertainment, Manufacturing/Logistics,\nplus five strategic partners.\n\n\nThese businesses deftly used consumer demand insights to understand the\neffects of supply chain disruptions and labor shortages and reallocate resources\nto mitigate the most harmful impacts. They adeptly introduced new delivery\n\n\n-----\n\nOur analysis reveals that companies in the Databricks 30 Index outpaced the\nS&P 500 by an impressive +21 percentage points (pp) over the past three years.\nIn other words, if the stock market rose by 50% during this period, the Databricks\n30 Index would have soared by 71% (outperforming by 21pp). Even more\nremarkable, excluding tech entirely from the Databricks 30, the Databricks 30\nex-Tech index outperforms the S&P 500 by an even larger margin over the same\ntime frame: +23pp.\n\n\nDB30 DOw30\n\n\nSimilar to Morgan Stanley’s analysis, we find that non-tech U.S. companies that\nare investing in cloud, data and innovation do, in fact, win.\n\n\nSo now that we see the impact, let’s dive into the steps retail organizations can\ntake to put themselves on a trajectory of continued growth and success amid an\never-changing landscape.\n\n\n01-01-2019 01-01-2020 01-01-2021 01-01-2022 01-01-2023\n\n\n01-01-2019 01-01-2020 01-01-2021\n\n\nDATE\n\n\n-----\n\n## Begin With a Shared Vision of Success\n\n\nThe most overlooked activity in becoming an AI-forward retailer is the most\ncrucial. In the rush to secure a position on the AI frontier, many companies\nare leaping before they look, embarking on AI initiatives without a clear\nunderstanding of what they want to achieve. Simply adopting the newest,\nshiniest tech tools isn’t a silver bullet. Many companies set themselves up for\nfailure by neglecting to clearly define the expected business outcomes at the\nonset of the initiative, a strategic move that can effectively reduce project risk\nand costs and lead to the ultimate success of the program. In fact, in an attempt\nto accelerate results, this cavalier approach can instead spiral into expensive\nmistakes, wasted resources and a decrease in trust for stakeholders from\nunmet expectations. It’s like setting sail on an open ocean without a destination\nin mind; the journey might provide some interesting detours, but it lacks\ndirection and purpose.\n\nHowever, when organizations take the time to articulate their expected\nbusiness outcomes before deploying AI and data-driven programs, they position\nthemselves to reduce project risk and costs. By aligning AI initiatives with\nspecific business objectives and creating a shared vision with stakeholders,\nthe focus becomes less about the technology itself and more about how it\ncan be used to reach these defined goals.\n\n\nTechnology decisions, too, are improved by having a known target. Without\nclear business outcomes in mind, companies tend to design, develop and\nimplement technologies that _might_ be needed to solve the problem. 
Aligning\nthe technical road map and activities with business outcomes mitigates the\nrisk of misallocated resources and the potential fallout from the unfulfilled\npromise of AI.\n\nFurthermore, a clear understanding of expected business outcomes allows\nfor efficient project management and cost control. Companies can set key\nperformance indicators (KPIs) tied directly to these outcomes. This not only\nprovides a means to measure progress, but also helps control costs by\nensuring that resources are targeted toward initiatives that deliver value.\n\nIt’s not just about numbers either; having explicit objectives aids in cultivating\n\nstakeholder buy-in. Clear communication about the purpose and potential\nbenefits of an AI initiative can foster support from executives, employees,\ninvestors and customers alike. This collective backing can further mitigate risk\nand cut costs by ensuring that everyone is pulling in the same direction.\n\n\n-----\n\n## Why Companies Struggle With Setting Clear Business Outcomes for AI\n\n\nGetting started with AI at your organization might be daunting, and that’s\nbecause it is a big undertaking! Struggling to define clear outcomes for AI\nprojects is a common issue among many businesses for a variety of reasons.\nHere are some key factors that contribute to this challenge:\n\n**They believe the data strategy is a technology problem.**\n\nCompanies often hire a chief data officer, or make the data strategy\nthe responsibility of the technology organization.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "5d904afdb1f9a9de4e4f6617833b86c6", + "It’s not just about numbers either; having explicit objectives aids in cultivating\n\nstakeholder buy-in. Clear communication about the purpose and potential\nbenefits of an AI initiative can foster support from executives, employees,\ninvestors and customers alike. This collective backing can further mitigate risk\nand cut costs by ensuring that everyone is pulling in the same direction.\n\n\n-----\n\n## Why Companies Struggle With Setting Clear Business Outcomes for AI\n\n\nGetting started with AI at your organization might be daunting, and that’s\nbecause it is a big undertaking! Struggling to define clear outcomes for AI\nprojects is a common issue among many businesses for a variety of reasons.\nHere are some key factors that contribute to this challenge:\n\n**They believe the data strategy is a technology problem.**\n\nCompanies often hire a chief data officer, or make the data strategy\nthe responsibility of the technology organization.\n\n**They lack an understanding of their business processes**\nAn alarming number of businesses jump onto the AI bandwagon without\nunderstanding how their business operates. Decisions are made at\nthe leadership level, but how they translate to operational decisions is\nmuddled. Data and AI are fundamentally business process technologies,\n\nand without fully understanding how the business works, any initiative\nin data and AI is bound to have limited success.\n\n\n**They lack a data culture**\n\nSomewhat related to the previous point, many companies have teams\nthat make decisions based on experience and intuition. 
These should\nnot be discounted, but the reason for intuition is often a result of a\npoor definition of processes, which prevents the ability to measure\nand improve processes.\n\n**They struggle to get high-quality data**\n\nAI projects require good-quality, relevant data. Many businesses\nstruggle with issues related to data access, quality, privacy and\nsecurity, which can complicate the process of defining clear outcomes.\n\n**They lack the organizational structures required**\n\nImplementing AI often requires significant changes in business\n\nprocesses, organizational structures and even corporate culture.\nMany companies find it hard to manage these changes, leading to\ndifficulties in setting and achieving clear outcomes.\n\n\n-----\n\nData and AI programs are a business process problem first, and a\ntechnology problem last. Familiarity with technology is important, but\nirrelevant if companies do not understand it.\n\nAddressing these challenges often requires companies to invest in\neducation about AI capabilities, to formulate clear strategies, to manage\nchange effectively, and to bring on board the necessary skills either\nby hiring new talent or upskilling existing employees. It’s a journey that\nrequires commitment, but the potential benefits of successful AI initiatives\nmake it a worthwhile venture.\n\n\n**They don’t have the right people in place**\n\nThere’s often a gap between the skills available within a company and\nthe skills needed to define and achieve AI outcomes. Without team\nmembers who understand AI, data analysis and project management,\nbusinesses can struggle to set clear objectives for AI initiatives.\n\n**They struggle to quantify the value of AI projects**\n\nAI’s benefits can sometimes be intangible or long-term, making them\ndifficult to quantify. Companies may struggle to define outcomes in\nmeasurable terms, complicating the process of setting objectives\nand monitoring progress.\n\n\n-----\n\n## Before Diving In: Assess Your Readiness\n\n\nThere is a growing sense of urgency for organizations relatively new to data\nand AI-driven enablement to “get in the game.” Profiles of top performers and\nheadline-making achievements create a clearer sense of what is possible\nand what can be gained, leaving those entering into the space eager to achieve\nsimilar results.\n\nBut what’s missing in those articles are the sustained investments in\nprocess, people and technology and the numerous challenges, missteps and\noutright failures that had to occur before success was achieved. Data-driven\ntransformation is a journey, and before any successful journey is pursued,\nit’s wise to reflect on the organization’s readiness so that you can anticipate\nchallenges and identify areas for remediation and improvement that will\ndeliver you to your intended destination.\n\nWith this in mind, we encourage organizations new to this space to\nassess their maturity in terms of the use and management of their existing\ninformation assets:\n\n1. How easily discoverable and accessible are data in\nyour environment?\n\n\n3. Is the quality of these data formally verified?\n\n4. 
Are key entities such as products and customers actively\nmanaged, and can data related to these items be easily linked\nacross various data sources?", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "bc6bec299cd25b6f4221096090e241c7", + "But what’s missing in those articles are the sustained investments in\nprocess, people and technology and the numerous challenges, missteps and\noutright failures that had to occur before success was achieved. Data-driven\ntransformation is a journey, and before any successful journey is pursued,\nit’s wise to reflect on the organization’s readiness so that you can anticipate\nchallenges and identify areas for remediation and improvement that will\ndeliver you to your intended destination.\n\nWith this in mind, we encourage organizations new to this space to\nassess their maturity in terms of the use and management of their existing\ninformation assets:\n\n1. How easily discoverable and accessible are data in\nyour environment?\n\n\n3. Is the quality of these data formally verified?\n\n4. Are key entities such as products and customers actively\nmanaged, and can data related to these items be easily linked\nacross various data sources?\n\n5. How quickly are data made available for analysis following their\ncreation or modification? Is this latency aligned with how you\nmight use this data?\n\n6. Are processes established for determining appropriate uses of\ndata, governing access and providing oversight on consumption?\n\n7. Is there one individual responsible for effective data management\nacross the enterprise, and has this person established a\n\nprocess for receiving and responding to feedback and shifting\norganizational priorities?\n\nThis list of questions is by no means exhaustive, but it should help to identify\nblockers that are likely to become impediments down the road.\n\n\n2. How well understood are these information assets?\n\n\n-----\n\nSimilarly, we would encourage organizations to assess their maturity in terms of\nanalytics capabilities:\n\n1. Is business performance at all levels assessed in terms of\nkey metrics?\n\n2. How frequently are data-driven analyses used in making key\nbusiness decisions?\n\n3. To what degree are advanced analytics techniques\n— i.e., data science — used in decision-making processes?\n\n4. Are predictive models regularly leveraged as part of operational\nbusiness processes?\n\n5. How is experimentation used to assess the performance of\nvarious initiatives?\n\n\nLastly, and probably most importantly, we’d encourage the organization to\nperform a frank assessment of its readiness to embrace change. Becoming a\ndata-driven enterprise is fundamentally about operating differently than before.\nDecision-making authority becomes more diffuse and often more automated.\nProject outcomes become less certain as the organization focuses on innovation\nwhere learning is emphasized over predictable results. Process silos often\nbecome more intertwined as new modes of engagement evolve.\n\nWhen done right, this transition creates a healthy tension between what’s\nneeded to be successful today and what’s needed to be successful tomorrow.\nBut this can also manifest itself as employee resistance and political infighting\nas processes and organizational structures evolve. 
What’s often needed to\novercome this is strong leadership, a clear vision and mandate for change as\nwell as a reassessment of incentive structures and active organizational change\nmanagement as the organization transitions into this new way of working.\n\n\n6. Are predictive models used to automate key business decisions?\n\n\n7. Has the organization embraced a model of continuous deployment\nfor the regular update of model-driven processes?\n\n\n**TRADITIONAL APPROACH**\n\n**Upfront reqs** **Technical implementation** **Production**\n\n\n**ITERATIVE APPROACH**\n\n\nContinuous feedback\n\n\n**Business questions** **Testing** **Production** **Optimization**\n\nContinuous learning and optimization\n\nAn iterative approach involves the use of data to continually optimize the performance of data products.\n\n\n-----\n\n## Getting Started: Putting Some Wins on the Board\n\n\nWith the organization ready to proceed, the next phase is about learning to\ndeliver new solutions within your organization. There will be new technologies\nto deploy and new skills to develop, and there will be new patterns for\nintegration into business workflows and procedures for incremental updates\nand improvements. But most importantly, there will need to be a new level of\npartnership and trust between the business and the technology sides of the\norganization that needs to be carefully nurtured.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "4759cd08dc657d0089a31804b27e273e", + "7. Has the organization embraced a model of continuous deployment\nfor the regular update of model-driven processes?\n\n\n**TRADITIONAL APPROACH**\n\n**Upfront reqs** **Technical implementation** **Production**\n\n\n**ITERATIVE APPROACH**\n\n\nContinuous feedback\n\n\n**Business questions** **Testing** **Production** **Optimization**\n\nContinuous learning and optimization\n\nAn iterative approach involves the use of data to continually optimize the performance of data products.\n\n\n-----\n\n## Getting Started: Putting Some Wins on the Board\n\n\nWith the organization ready to proceed, the next phase is about learning to\ndeliver new solutions within your organization. There will be new technologies\nto deploy and new skills to develop, and there will be new patterns for\nintegration into business workflows and procedures for incremental updates\nand improvements. But most importantly, there will need to be a new level of\npartnership and trust between the business and the technology sides of the\norganization that needs to be carefully nurtured.\n\nThe best way we have found to do this is to start with projects that improve\non existing operational workflows, i.e., do what you do, but do it smarter.\nThe business is often familiar with existing pain points and can more clearly\nenvision how a new capability can be folded into its processes. They are also\nfamiliar with how to assess the impact a new approach may have on their\nbusiness and can help design tests to validate whether the intended results\n\n\nAs capabilities demonstrating value over the status quo are developed, they\nare folded into business processes. This is not a one-and-done effort but part\nof an ongoing cycle of deployment to continue so long as the team has a line\nof sight to meaningful gains. 
The team does not wait for the ideal solution but\ninstead focuses on incremental improvements that deliver measurable value\nalong the way.\n\nOversight for this process is provided by another body, one tasked with the\nsuccess of the overall transformative efforts within the business. As success\nis delivered, there will be growing demand for the time and talents of these\nteams, and the organization will need to prioritize resources across an increasing\nnumber of opportunities. This steering committee will need to be responsible for\nallocating limited resources and advocating for additional ones as well to strike\nthe right balance of investments for the organization.\n\n\nare or are not being delivered.\n\n\n**DEMAND FORECASTING**\n\nDemand forecasting is a massive challenge for retail and consumer goods\n\norganizations. And one where even an incremental change can have a massive impact,\n\nso it’s often one of the first projects organizations identify to put a win on the board.\n\nAccording to [McKinsey](https://www.mckinsey.com/featured-insights/artificial-intelligence/notes-from-the-ai-frontier-applications-and-value-of-deep-learning) , a 10% to 20% improvement in supply chain forecasting\n\naccuracy is likely to produce a 5% reduction in inventory costs and a 2% to 3%\n\nincrease in revenues. To hit the ground running, check out the [Databricks Solution](https://www.databricks.com/solutions/accelerators/demand-forecasting)\n\n[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n\nkey use cases.\n\n\nWork on these projects is a collaborative effort between the business and IT.\nTogether, the project team explores a potential solution with a notion of how it\nmay be integrated in mind from the outset. As the project unfolds, all members\nare part of the iterative cycles and help to steer the solution in new directions\nuntil an item of value is derived.\n\n\n-----\n\n## Going Big: Learning to Embrace Transformational Change\n\n\nWith some experience under your belt, it’s time to build on the organizational\nmuscle developed during initial efforts and flex for more transformative impact.\nAgain, the focus is on established functions within the business, but instead of\npointed, incremental improvements, the team begins to create a vision for the\npart of the organization that would operate if it were to fully embrace data and\nAI enablement.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "0d9476593a17460dc2d5ede6a2f13133", + "[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n\nkey use cases.\n\n\nWork on these projects is a collaborative effort between the business and IT.\nTogether, the project team explores a potential solution with a notion of how it\nmay be integrated in mind from the outset. 
As the project unfolds, all members\nare part of the iterative cycles and help to steer the solution in new directions\nuntil an item of value is derived.\n\n\n-----\n\n## Going Big: Learning to Embrace Transformational Change\n\n\nWith some experience under your belt, it’s time to build on the organizational\nmuscle developed during initial efforts and flex for more transformative impact.\nAgain, the focus is on established functions within the business, but instead of\npointed, incremental improvements, the team begins to create a vision for the\npart of the organization that would operate if it were to fully embrace data and\nAI enablement.\n\nIt’s at this phase that many of the concerns about organizational resistance\nmentioned earlier are most likely to manifest themselves. Ideally, initial\nimplementation efforts have built champions within the business, but it’s still\nimportant to be mindful of pushback that can emerge as the organization more\nfully begins to change. Having and maintaining strong business sponsorship\nin this phase is critical, and having that sponsor articulate and regularly\nreinforce a clear vision for the change that’s now underway can help everyone\n\nunderstand the need to support these efforts.\n\n\nSo far in this exploration of the journey to data and AI transformation, we’ve\nminimized the importance of technology in order to focus on the business and\norganizational aspects that often get neglected in this conversation. But it’s\nat this stage that the organization needs to have established its preference\nfor data and analytics platforms. Because of the breadth of needs that will\nhave to be addressed and the ongoing innovation taking place in the data\nscience community, we strongly suggest standardizing on a platform that is\nopen and flexible while also providing cost-effective use of both infrastructure\nand people resources and strong data governance and protection. For many\norganizations, the Databricks Lakehouse Platform has proven itself to be the\nideal platform to meet these needs.\n\n**WHY STANDARDIZE ON DATABRICKS?**\n\nThe Databricks Lakehouse is the only enterprise data and AI\n\nplatform that allows retailers to leverage all of their data, from any\n\nsource, on any workload to always offer more engaging customer\n\nexperiences driven by real-time data, at the lowest cost and with\n\nthe greatest investment protection.\n\n\n-----\n\nBut simply standardizing on a platform is not enough. The organization\nneeds to work through the roles and responsibilities around the use of this\nplatform and processes for moving things from experimentation and formal\ndevelopment to testing and operationalization.\n\nThe importance of having an MLOps strategy really comes to life at this\nphase. This doesn’t mean your strategy around MLOps can’t change, but this\nphase is when you want to think about and define your answers to some key\nquestions such as the following:\n\n1. How do we evaluate new and existing (retrained) models as\npart of their movement from development to production?\n\n2. How do we determine when a model should be retrained?\n\n3. What are the preferred mechanisms for production deployment?\n\n4. How do we fall back should we have a deployment problem?\n\n5. 
What are the service level expectations for the\ndeployment processes?\n\n\n###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n\n**Numan Ali**\n\nSolutions Architect, Data and Analytics Center of Excellence at Pandora\n\n\n-----\n\n## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n\n\nToo often, leadership views innovation as a destination and not a process\n(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\ndata-driven organization overnight and then it’s done. Yes, there will be an\nupfront investment, but there will also be ongoing investment in order to\nsupport sustained innovation.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "0e3cdc6f5aa88555c164d82f7b02ae3e", + "2. How do we determine when a model should be retrained?\n\n3. What are the preferred mechanisms for production deployment?\n\n4. How do we fall back should we have a deployment problem?\n\n5. What are the service level expectations for the\ndeployment processes?\n\n\n###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n\n**Numan Ali**\n\nSolutions Architect, Data and Analytics Center of Excellence at Pandora\n\n\n-----\n\n## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n\n\nToo often, leadership views innovation as a destination and not a process\n(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\ndata-driven organization overnight and then it’s done. Yes, there will be an\nupfront investment, but there will also be ongoing investment in order to\nsupport sustained innovation.\n\nIronically, one of the major obstacles to this change is viewing the goal as\nsimply delivering a project or projects. Think about it — just 12 months ago,\nonly a few specialists in academia and industry were talking about generative\nAI and large language models (LLMs). Today, [retailers have to integrate this](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html)\n[new technology](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html) or fall behind others who will find a way to create more\npersonalized consumer experiences with it.\n\n\nTechnology, especially when it comes to data and AI, moves far too quickly.\nWhat retailer tech teams need to deliver at the end of the day is applications,\nof course, but also the ability to react quickly to change. What sort of ongoing\ninvestments in terms of people, process and technology do retailers need to\nfoster in order to ingrain an innovation mindset?\n\nThis is an ongoing balancing act where organizations need to innovate and look\nfor new opportunities but also sustain that innovation in a way that is realistic\nfor the business. 
For this, let’s consider the 70-20-10 rule: the idea that\ncompanies should allocate 70% of innovation investment to core initiatives,\n20% to adjacent ones and 10% to transformational ones, or “moonshots.” While\nnot a hard-and-fast rule, this concept was touted by Google co-founder Larry\nPage in a [Fortune magazine article](https://www.google.com/url?q=https://money.cnn.com/2008/04/29/magazines/fortune/larry_page_change_the_world.fortune/&sa=D&source=editors&ust=1690998645852122&usg=AOvVaw2AHj-fx8XkEeMKP2Ts5gDu) , and was validated by a [study conducted](https://hbr.org/2012/05/managing-your-innovation-portfolio)\n[by Harvard Business Review](https://hbr.org/2012/05/managing-your-innovation-portfolio) , which found that companies following the rule\n\noutperformed their peers, typically realizing a P/E premium of 10% to 20%.\n\n\n-----\n\nThe goal of the 70-20-10 rule is to help guide the organization toward\nsustained innovation and spend the bulk of time on the core business. This is\npart of why we recommend starting first with fast (just 2- to 3-month total)\npilot projects to use AI on existing business use cases like demand forecasting\nand call center optimization. By working in these areas with a focus on learning\nand iterating, retailers will soon find where data silos and rigidity exist in the\nsystem. As these foundational barriers are knocked down, it then makes it\npossible to tackle more transformational use cases and start to build the\ncharacteristics of a data-forward enterprise. In other words, start to utilize\ndata and data-driven insights as a primary driver for decision-making and\noperations, while also prioritizing continuous data analysis and improvement.\n\n\n**TRANSFORMATIVE**\n\n\n**ADJACENT**\n\n\n**CORE**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "9048b352c5ac27632575c975cbfca802", + "outperformed their peers, typically realizing a P/E premium of 10% to 20%.\n\n\n-----\n\nThe goal of the 70-20-10 rule is to help guide the organization toward\nsustained innovation and spend the bulk of time on the core business. This is\npart of why we recommend starting first with fast (just 2- to 3-month total)\npilot projects to use AI on existing business use cases like demand forecasting\nand call center optimization. By working in these areas with a focus on learning\nand iterating, retailers will soon find where data silos and rigidity exist in the\nsystem. As these foundational barriers are knocked down, it then makes it\npossible to tackle more transformational use cases and start to build the\ncharacteristics of a data-forward enterprise. 
In other words, start to utilize\ndata and data-driven insights as a primary driver for decision-making and\noperations, while also prioritizing continuous data analysis and improvement.\n\n\n**TRANSFORMATIVE**\n\n\n**ADJACENT**\n\n\n**CORE**\n\n\n###### Companies that allocated about 70% of their innovation activity to core initiatives, \n### 20% to adjacent ones and 10% to\n###### transformational ones outperformed their peers.\n\n**Bansi Nagji & Geoff Tuff**\n_Managing Your Innovation Portfolio_\nHarvard Business Review, May 2012\n\n\n-----\n\n## From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise\n\n\nSo what does it take to successfully embark on this\njourney to becoming a data-forward enterprise?\nFirst and foremost, you need to not only establish\na baseline understanding of what has occurred by\nexamining historical data but leverage advancements\nin technologies (e.g., streaming, computer vision,\nvoice recognition) to make predictions of the future.\n\nThrough the use of both historical data and\npredictive techniques such as forecasting,\nrecommendations, prescriptive care and nextbest-action, organizations can begin to improve\ndecisions and, in some cases, automate certain\ndecision-making processes. But rather than moving\n\nfrom historical views to predictive actions in a\nlinear fashion, this journey involves addressing both\napproaches simultaneously. Once you are able to\nunify historical and predictive analysis, you can then\ntake significant steps toward becoming a dataforward enterprise.\n\n\n##### The Data-Forward Enterprise\n\nData, analytics and AI working in concert\n\n\n**Data Purgatory**\nThings are better, but data isn’t\ndriving the business\n\n\n**Data Maturity**\nEvery aspect of the\nbusiness is supported\nby insights and AI\n\n\n**Data Siloed**\nData and teams are segregated\ninto different systems\n\nDATA MATURITY\n\nBeing data-forward means silos cease to exist, and data, analytics and AI are informing every aspect of the business.\n\n\n-----\n\n## The 8 Steps to Building a Data-Forward Retailer\n\n\nBefore you start your data-forward journey, a few critical steps must be\nconsidered to establish a solid foundation to build upon. Based on our\nwork with the largest and most successful retailers in the world, spanning\nstartups to global giants, we at Databricks have seen that the most successful\nfollowed these steps to effectively gain wallet share, whereas those who\ncouldn’t would often leave major gaps that competitors could take advantage\nof. These steps are the basics to prepare businesses for where they need\nto be both now and in the near future.\n\n\n**2** **Get grounded: Understand the technology**\n\nTo start, business leaders need to ground themselves in technology, especially\nwhen it comes to AI. AI can do amazing things, but it is not magical and vendors\nare prone to overpromising and underdelivering. 
Less than getting deep into\ncode, the purpose is to understand the limitations and ideal use cases.\n\nDatabricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\nstarting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\nperspective of how different brands are using data, analytics and AI to drive\nrevenue or cut operational costs.\n\n\n**1** **Set the foundation: Define goals and objectives**\n\n\nThe best way to avoid shiny object syndrome (where you start out with a", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "f20d3832b09cd451c1938dcafc1883b0", + "**2** **Get grounded: Understand the technology**\n\nTo start, business leaders need to ground themselves in technology, especially\nwhen it comes to AI. AI can do amazing things, but it is not magical and vendors\nare prone to overpromising and underdelivering. Less than getting deep into\ncode, the purpose is to understand the limitations and ideal use cases.\n\nDatabricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\nstarting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\nperspective of how different brands are using data, analytics and AI to drive\nrevenue or cut operational costs.\n\n\n**1** **Set the foundation: Define goals and objectives**\n\n\nThe best way to avoid shiny object syndrome (where you start out with a\n\ntechnology and then try to figure out what to do with it) is to first identify the\nproblems you want to solve. From there, you can set goals around innovation\nto align incentives, and, most importantly, ensure you are driving specific\nbusiness outcomes such as improving customer engagement, optimizing\ninventory management or increasing sales.\n\n\n**3** **Understand the skills and processes in your business**\n\nAs we will get into in step 4, starting with smaller pilot projects enables you\nto not just deliver a quick win and validate the use of AI in the enterprise, but\nalso understand the in-house capabilities in terms of people, process and\ntechnology to deliver technical projects. And if required, be willing and ready\nto hire people with the right skill sets that can help you make the most of your\ndata. For example, building a core team of data analysts can help extract deep\ninsights that lead to better decision-making and identify opportunities for\ngrowth. It is critical at this step to define the roles you need, determine how\nyou will source for those roles (via external hiring or internal transfer), and\nensure those roles have opportunities for career progression.\n\n\n-----\n\nFor inspiration and a head start, check out our [Solution Accelerators for Retail](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods)\n[& Consumer Goods](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods) . These free resources were created to help our customers\nsave hours of discovery, design, development and testing. 
Our purpose-built\nguides — fully functional notebooks and best practices — speed up results\nacross your most common and high-impact use cases and enable you to go\nfrom idea to proof of concept (PoC) in as little as two weeks. We have over\n20 accelerators built specifically for critical retail and consumer goods use\ncases, from Demand Forecasting and On-Shelf Availability to Recommendation\nEngines and Customer Lifetime Value. We also have a set of Solution\nAccelerators specifically for [LLMs in Retail & Consumer Goods.](https://www.databricks.com/solutions/accelerators/large-language-models-retail)\n\n**5** **Implement data management and governance early**\n\nThe first step to successfully implementing AI/ML in your business broadly\nis to ensure you have accurate, reliable and current data to train your\nmodels against. This data can (and should) come from a variety of sources,\nso it’s key to unify all data types and sources (sales transactions, customer\nfeedback, social media) in a centralized location that is easily accessible,\nwhile not losing sight of data security to maintain customer trust. Setting\nup data governance parameters to control who has which kinds of access\nto what data, and being able to audit the history of this access, will actually\naccelerate innovation while ensuring data security and compliance.\n\n\n**Delivering exactly what customers want,**\n**every time, and on time**\n\nData is at the heart of Gousto’s mission to change the\nway people eat through the delivery of boxes of fresh\ningredients and easy-to-follow recipes. However, even\nas their business exploded at the start of the pandemic,\ntheir systems couldn’t ingest data fast enough, couldn’t\ntalk to each other and wouldn’t scale — forcing them to\ntemporarily stop accepting new customers. Now Gousto is\nset up to achieve exciting ambitions for menu expansion,\nsophisticated personalization and next-day delivery. Learn\nhow they did it.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "08e33a0a940c3fc8d481f2dca3f3daa6", + "**Delivering exactly what customers want,**\n**every time, and on time**\n\nData is at the heart of Gousto’s mission to change the\nway people eat through the delivery of boxes of fresh\ningredients and easy-to-follow recipes. However, even\nas their business exploded at the start of the pandemic,\ntheir systems couldn’t ingest data fast enough, couldn’t\ntalk to each other and wouldn’t scale — forcing them to\ntemporarily stop accepting new customers. Now Gousto is\nset up to achieve exciting ambitions for menu expansion,\nsophisticated personalization and next-day delivery. Learn\nhow they did it.\n\n**[READ THE FULL GOUSTO STORY](https://www.databricks.com/customers/gousto)**\n\n**4** **Start small: Pilot a project**\n\nThere is no substitute for rolling your sleeves up and running a pilot project to\nevaluate the feasibility and potential impact of a project before implementing\nit on a larger scale. When selecting a pilot project, we recommend starting with\na project that will deliver clear business value, such as incremental revenue\nor clear cost savings, yet only takes 2-3 months to complete. 
The more time\nthere is between project inception and seeing results, the more likely it will lose\nmomentum internally.\n\n\n-----\n\n**6** **Incorporate AI across the business (starting with daily tasks)**\n\nGiven the large upfront investment in data scientists and engineers to build\nan AI program, the ROI will come from using it at scale. Constantly look to\nuncover patterns and repeatable processes that can be optimized or fully\nautomated with AI.\n\n**Building a global fashion icon with a**\n**customer-first approach**\n\nBritish luxury brand Burberry was seeking an efficient way to\nannotate its thousands of highly specific marketing assets\nfor better targeting. Working with Labelbox within Databricks\nLakehouse, they are now able to complete image annotation\nprojects in hours instead of months. And marketing team\nmembers now have access to powerful content insights\nwithout needing to ask data scientists for help.\n\n**[READ THE FULL BURBERRY STORY](https://www.databricks.com/customers/burberry)**\n\n**Customizing interactions that convert clicks**\n**to revenue with Databricks Lakehouse**\n\nGlobal jewelry manufacturer and retailer Pandora needed a\nunified view of all their data where they could easily segment,\ncategorize and analyze to deliver custom messaging to\nconsumers. With Databricks Lakehouse, they now have the\ninsights they need to deliver highly targeted messaging —\nincreasing consumer engagement from the initial opening of\na marketing email to maximizing shopping bag conversions to\ndriving revenue on the website.\n\n**[READ THE FULL PANDORA STORY](https://www.databricks.com/customers/pandora)**\n\n\n**Building an operationally efficient**\n**omnichannel business**\n\nThe Hershey Company analyzes the data they need to\nstay in front of changing human behavior and delight their\ncustomers. With Databricks Lakehouse, they can analyze\ndata feeds from their largest retail customer — uncovering\ninsights that will help extend their industry leadership.\n\n**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n\n\n**Ushering in a new era**\n**of data-driven retailing**\n\nOutdoor apparel brand Columbia Sportswear has enabled\ndata and analytics self-service throughout the organization in\na way that ensures everyone is working from a single source\nof truth. Whichever data team needs access to the data,\nDatabricks Lakehouse gives them the confidence that the\ndata is reliable and consistent.\n\n**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n\n\n-----\n\n**7** **Foster a culture of data-driven decision-making**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "fb312ae35c808bf547ea524165081a95", + "**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n\n\n**Ushering in a new era**\n**of data-driven retailing**\n\nOutdoor apparel brand Columbia Sportswear has enabled\ndata and analytics self-service throughout the organization in\na way that ensures everyone is working from a single source\nof truth. 
Whichever data team needs access to the data,\nDatabricks Lakehouse gives them the confidence that the\ndata is reliable and consistent.\n\n**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n\n\n-----\n\n**7** **Foster a culture of data-driven decision-making**\n\nWhat does it mean to have a culture of data-driven decision-making? In\npractice, it means empowering all employees to use data to inform their\ndecisions. Only some strategic decisions will be based on complete and\naccurate information. It’s unwise to assume otherwise. The right approach\nis to leverage as much data as possible, from past tests or current efforts,\nto mitigate risk. Leaders need to not only ask for data but also ensure\nthat their employees will be able to find the data they need.\n\n**Unlocking critical trends and insights**\n**needed to serve our 180 million customers**\n\nReckitt, the maker of Lysol as well as hundreds of other\nhousehold brands, was looking to deliver best-in-class\ncustomer experiences to their over 180 million customers\nspanning the globe. With Databricks Lakehouse, Reckitt\nhas established a data-first culture by surfacing real-time,\nhighly accurate, deep customer data insights that have\nled to a better understanding of international market\ntrends and demand across the multiple product lines\nthey support.\n\n**[READ THE FULL RECKITT STORY](https://www.databricks.com/customers/reckitt)**\n\n\n**Customer 360 to enable faster speed**\n**to market, better results**\n\nThe Middle East’s Al-Futtaim serves as a local distributor\nfor global brands such as Toyota, IKEA and Ace Hardware.\nWith Databricks Lakehouse serving as a unified platform to\naggregate and analyze various data sources on all customers,\nthey have created a “golden customer record” that improves\nall decision-making, from forecasting demand to powering\ntheir global loyalty program.\n\n**[READ THE FULL AL-FUTTAIM STORY](https://www.google.com/url?q=https://www.databricks.com/customers/al-futtaim&sa=D&source=editors&ust=1690998645853527&usg=AOvVaw3cs-6mM2ANTKDCzTdTvEYH)**\n\n**8** **Continuously evaluate and improve**\n\nRecognize that establishing a data-driven culture is an ongoing journey and\nnever a set destination. Constantly evaluate your data collection, analysis and\ndecision-making process to identify areas for improvement. Even small and\nconstant incremental improvements will deliver large gains in absolute terms\nwhen applied at scale. You can always personalize more, forecast better, or\nbetter manage your supply chain as you bring in better data sources and refine\nyour models.\n\n\n-----\n\n## Transform Retail Data Into Actionable Insights\n\n\nBecoming data forward is not a crazy idea. Too often, leaders or organizations\nallow themselves to be intimidated by focusing on large-scale transformations.\nBut it’s the small operational changes that can make your business more efficient\nas well as shift the larger culture forward. Once you’ve set this foundation, it then\nallows you to move toward bigger things. These steps may fail, but it’s actually\npositive to have these setbacks to learn from to try again. 
The bigger risk is to\nnot try and thus fall behind competitors who are embracing the internal changes\nneeded to take advantage of AI and machine learning.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "2cd345e72229d795c8e73b85ccdb8516", + "**8** **Continuously evaluate and improve**\n\nRecognize that establishing a data-driven culture is an ongoing journey and\nnever a set destination. Constantly evaluate your data collection, analysis and\ndecision-making process to identify areas for improvement. Even small and\nconstant incremental improvements will deliver large gains in absolute terms\nwhen applied at scale. You can always personalize more, forecast better, or\nbetter manage your supply chain as you bring in better data sources and refine\nyour models.\n\n\n-----\n\n## Transform Retail Data Into Actionable Insights\n\n\nBecoming data forward is not a crazy idea. Too often, leaders or organizations\nallow themselves to be intimidated by focusing on large-scale transformations.\nBut it’s the small operational changes that can make your business more efficient\nas well as shift the larger culture forward. Once you’ve set this foundation, it then\nallows you to move toward bigger things. These steps may fail, but it’s actually\npositive to have these setbacks to learn from to try again. The bigger risk is to\nnot try and thus fall behind competitors who are embracing the internal changes\nneeded to take advantage of AI and machine learning.\n\nCore to delivering on these steps to become a data-forward retailer is a solid\ndata foundation that can unify your data and AI workloads with sharing and\ngovernance built in, so internal and external teams can get access to the\ndata they need when they need it. With the [Databricks Lakehouse for Retail](https://www.databricks.com/solutions/industries/retail-industry-solutions) ,\ncompanies gain valuable insights into customer behavior, optimize supply chain\n\noperations and make informed business decisions in real time.\n\n\nEXPLORE DATABRICKS LAKEHOUSE FOR RETAIL\n\nAccess key resources to understanding how a lakehouse\nfor retail can set you on the path toward becoming a\ndata-forward organization.\n\n**[LEARN MORE](https://www.databricks.com/explore/retail-resources)**\n\n\n#### Visit our website to learn more about Databricks Lakehouse for Retail.\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast, and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. 
To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks#account)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "5503b8c1c4a023a953c42915fd5ed36a", + "### Technical Migration Guide\n\n# Strategies to Evolve Your Data Warehouse to the Databricks Lakehouse\n\n\n-----\n\n## Contents Lakehouse Architecture 3\n\nThe Databricks Lakehouse Platform 4\n\nBusiness Value 5\n\nSingle source of truth 5\n\nData team 6\n\nFuture-proof 6\n\nMigration to Lakehouse 7\n\nOverview 7\n\nMigration strategy 8\n\nMigration planning 9\n\nELT approach 12\n\nAgile modernization 15\n\nSecurity and data governance 17\n\nTeam involvement 19\n\nConclusion 19\n\n\n-----\n\n## Lakehouse Architecture\n\n\nData warehouses were designed to provide a central data repository\n\nwith analytic compute capabilities to help business leaders\n\nget analytical insights, support decision-making and business\n\nintelligence (BI). Legacy on-premises data warehouse architectures\n\nare difficult to scale and make it difficult for data teams to keep up\n\nwith the exponential growth of data. Oftentimes data teams publish\n\nand use a subset of well-defined data for development and testing.\n\nThis slows down both innovation and time to insight.\n\nCloud data warehouses (CDW) were an attempt to tackle the\n\non-premises data warehouse challenges. CDWs removed the\n\nadministrative burden of tasks such as setup, upgrades and\n\nbackups. CDWs also improved scalability and introduced cloud’s\n\npay-as-you-go model to reduce cost. CDWs leverage a proprietary\n\ndata format to achieve cloud-scale and performance; however, this\n\nalso leads to customers locked into these formats with difficult\n\n\nBut enterprise data teams don’t need a better data warehouse.\n\nThey need an innovative, simple solution that provides reliable\n\nperformance, elastic scale and allows self-service to unblock\n\nanalytics to access all data at a reasonable cost. The answer is\n\nthe lakehouse.\n\nThe lakehouse pattern represents a paradigm shift from traditional\n\non-premises data warehouse systems that are expensive and\n\ncomplex to manage. It uses an open data management architecture\n\nthat combines the flexibility, cost-efficiency and scale of data\n\nlakes with the data management and ACID semantics of data\n\nwarehouses. A lakehouse pattern enables data transformation,\n\ncleansing and validation to support both business intelligence and\n\nmachine learning (ML) users on all data. Lakehouse is cloud-centric\n\nand unifies a complete up-to-date data set for teams, allowing\n\ncollaboration across an organization.\n\n\npaths to support use cases outside the data warehouse itself\n\n(i.e., machine learning). Customers often find themselves with a\n\nbifurcated architecture, which ultimately leads to a more costly and\n\ncomplex data platform over time.\n\n\n-----\n\n## The Databricks Lakehouse Platform\n\nThe Databricks Lakehouse Platform is **simple** ; it unifies your data, governance, analytics\n\nand AI on one platform. It’s **open** — the open source format Delta Lake unifies your data\n\necosystem with open standards and data formats. 
Databricks is **multicloud** — delivering\n\none **consistent experience across all clouds** so you don’t need to reinvent the wheel for\n\nevery cloud platform that you’re using to support your data and AI efforts.\n\nDatabricks SQL stores and processes data using Delta Lake to simplify and enhance\n\ndata warehousing capabilities. Analysts can use their favorite language, SQL, popular\n\ntransformation tools such as dbt, and preferred BI tools like Power BI and Tableau to\n\nanalyze data. The built-in query editor reduces contextual switching and improves\n\nproductivity. Administrators enjoy simplified workload management via serverless\n\ncompute and auto-scaling to meet high-concurrency workload needs. All this at a\n\nfraction of the cost of traditional data warehouses.\n\n\n###### Lakehouse Platform\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData Science\nand ML", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "5026816418aa0b55a9a9cb44ff1d0df7", + "ecosystem with open standards and data formats. Databricks is **multicloud** — delivering\n\none **consistent experience across all clouds** so you don’t need to reinvent the wheel for\n\nevery cloud platform that you’re using to support your data and AI efforts.\n\nDatabricks SQL stores and processes data using Delta Lake to simplify and enhance\n\ndata warehousing capabilities. Analysts can use their favorite language, SQL, popular\n\ntransformation tools such as dbt, and preferred BI tools like Power BI and Tableau to\n\nanalyze data. The built-in query editor reduces contextual switching and improves\n\nproductivity. Administrators enjoy simplified workload management via serverless\n\ncompute and auto-scaling to meet high-concurrency workload needs. All this at a\n\nfraction of the cost of traditional data warehouses.\n\n\n###### Lakehouse Platform\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData Science\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData reliability and performance\n\nCloud Data Lake\nAll structured and unstructured data\n\nSimple Open Multicloud\n\n\n-----\n\n## Business Value\n\n#### Single source of truth\n\nDatabricks Delta Lake leverages cloud-based blob storage to provide an infinitely\n\nscalable storage layer where you can store all your data, including raw and historical data,\n\nalongside structured data tables in the data warehouse. The lakehouse pattern avoids\n\ndata silos and shares the same elastic scale and governance across all use cases: BI, data\n\nengineering, streaming and AI/ML. This means that data engineering teams don’t have to\n\nmove data to a proprietary data warehouse for business analysts or create a separate\n\ndata store to support data science.\n\nInstead, data teams can access the open format Delta tables directly and combine data\n\nsets in the lakehouse, as needed. Data scientists can also work collaboratively on common\n\ndata with access to versioned history to facilitate repeatable experiments. 
A single source\n\nof truth facilitates moving from descriptive to predictive analytics.\n\n\n-----\n\n#### Data team\n\n\nWith central data governance and fine-grained access control\n\ncapabilities to secure the lakehouse, you can enable self-service\n\nSQL analytics for everyone on the Databricks Lakehouse Platform.\n\nThis allows each team to be more agile and innovate faster.\n\n**Data Analysts** — Using the Databricks SQL editor\n\nor their tools of choice (DBT, Power BI, Tableau), SQL\n\nanalysts can leverage familiar toolsets.\n\n**Data Engineers** — Utilizing Delta Lake as a unified\n\nstorage layer, data engineering teams can eliminate\n\nduplicate data and ETL jobs that move data across\n\nvarious systems. Databricks supports both batch and\n\nstreaming workloads to reduce bottlenecks and serve\n\nthe most up-to-date data to downstream users and\n\napplications.\n\n**Administrators** — The pay-as-you-go, decentralized\n\ncompute resource allows each team to run their\n\n\nThe Databricks Lakehouse Platform provides a reliable ETL and data\n\nmanagement framework to simplify ETL pipelines. Data teams can\n\nbuild end-to-end data transformations in a single pipeline instead of\n\nmany small ETL tasks. Databricks supports data quality enforcement\n\nto ensure reliability with auto-scalable infrastructure. Your teams\n\ncan onboard new data sources quickly to power new use cases with\n\nfresh data. This not only allows your team to efficiently and reliably\n\ndeliver high-quality data in a timely manner, it also reduces ETL\n\nworkload cost significantly.\n\n#### Future-proof\n\nUnlike CDWs that lock customers in, Databricks offers an open\n\nplatform with open standards, open protocols and open data\n\nformats. It supports a full range of popular languages (SQL, Python,", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "56f2df145130a08a1b2bbea4b2265a31", + "applications.\n\n**Administrators** — The pay-as-you-go, decentralized\n\ncompute resource allows each team to run their\n\n\nThe Databricks Lakehouse Platform provides a reliable ETL and data\n\nmanagement framework to simplify ETL pipelines. Data teams can\n\nbuild end-to-end data transformations in a single pipeline instead of\n\nmany small ETL tasks. Databricks supports data quality enforcement\n\nto ensure reliability with auto-scalable infrastructure. Your teams\n\ncan onboard new data sources quickly to power new use cases with\n\nfresh data. This not only allows your team to efficiently and reliably\n\ndeliver high-quality data in a timely manner, it also reduces ETL\n\nworkload cost significantly.\n\n#### Future-proof\n\nUnlike CDWs that lock customers in, Databricks offers an open\n\nplatform with open standards, open protocols and open data\n\nformats. It supports a full range of popular languages (SQL, Python,\n\nR, Scala) and popular BI tools. You can leverage the performant\n\nand low-cost distributed compute layer for data processing — or\n\nuse a variety of tools and engines to efficiently access the data via\n\nDatabricks APIs. Databricks also allows data consumption with a rich\n\npartner ecosystem. Teams can handle all existing BI and AI use cases\n\nwith the flexibility to support future use cases as they emerge.\n\n\nworkload in isolated environments without worrying\n\nabout contention. 
Serverless SQL endpoint frees your\n\nteam from infrastructure management challenges.\n\n\n-----\n\n## Migration to Lakehouse\n\n#### Overview\n\nA lakehouse is the ideal data architecture for data-driven organizations. It combines the\n\nbest qualities of data warehouses and data lakes to provide a single solution for all major\n\ndata workloads and supports use cases from streaming analytics to BI, data science and\n\nAI. The Databricks Lakehouse Platform leverages low-cost, durable cloud storage and\n\nonly consumes (charges for) compute resources when workloads are running. This pay-\n\n\n**C U S T O M E R S T O R Y**\n##### Building the Lakehouse\n at Atlassian\n\n[Watch now](https://www.youtube.com/watch?v=Xo1U617T-mU)\n\n\nas-you-go model means compute resources are automatically shut down if no processing\n\nis needed. Data teams can use small clusters that can power individual workloads\n\nthey plan to migrate. They can make the choice to leverage serverless SQL endpoints\n\nand completely free data teams from infrastructure capacity planning and cluster\n\nmaintenance. The auto-scaling, elastic nature of Databricks clusters leads to significant\n\nsavings on infrastructure cost and maintenance. Organizations typically achieve 50% TCO\n\nsavings compared to other cloud data warehouses.\n\nData warehouse migration is never an easy task. Databricks aims to mitigate the things\n\nthat can go wrong in these demanding migration projects. The Databricks Lakehouse\n\nPlatform provides many out-of-the-box features to mitigate migration risks.\n\n**C U S T O M E R S T O R Y**\n##### Driving Freight Transportation Into the Future\n\n[Read more](https://databricks.com/customers/jbhunt)\n\n\n-----\n\n#### Migration strategy\n\n\nMigration is a huge effort and very expensive. Yet, almost every\n\nenterprise has to migrate to new platforms every 3–5 years because\n\nthe old platform cannot support new use cases, catch up with\n\ndata growth or meet scaling needs. To get better ROI on migration,\n\nimplement a migration strategy that can reduce future re-platform\n\nneeds and extend to your future data and AI strategy.\n\nUse the opportunity of a data migration to standardize your data\n\nin open Delta format to allow existing and future tools to access\n\nit directly without moving or converting it. Merge your siloed\n\ndata warehouses into the unified storage layer in the Databricks", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "2748250245f5cbd7aa9d6d16ef303f97", + "**C U S T O M E R S T O R Y**\n##### Driving Freight Transportation Into the Future\n\n[Read more](https://databricks.com/customers/jbhunt)\n\n\n-----\n\n#### Migration strategy\n\n\nMigration is a huge effort and very expensive. Yet, almost every\n\nenterprise has to migrate to new platforms every 3–5 years because\n\nthe old platform cannot support new use cases, catch up with\n\ndata growth or meet scaling needs. To get better ROI on migration,\n\nimplement a migration strategy that can reduce future re-platform\n\nneeds and extend to your future data and AI strategy.\n\nUse the opportunity of a data migration to standardize your data\n\nin open Delta format to allow existing and future tools to access\n\nit directly without moving or converting it. 
Merge your siloed\n\ndata warehouses into the unified storage layer in the Databricks\n\nLakehouse Platform — without worrying about storage capacity.\n\nThe unified storage layer allows your team to deploy a unified data\n\ngovernance on top to secure all data access consistently. Simplify\n\nyour data governance story with Databricks Unity Catalog.\n\n\nMove toward a single, consistent approach to data pipelining\n\nand refinement. Merge batch and streaming into a single end-\n\nto-end pipeline to get fresher data and provide more real-time\n\ndecisions. Take a metadata-driven approach to align the dataflow\n\nwith business processes and have data validation and quality\n\ncheck built-in. Through a series of curation and refinement steps,\n\nthe output results in highly consumable and trusted data for\n\ndownstream use cases.\n\nThe lakehouse architecture makes it possible for the organization\n\nto create “data assets” by taking a stepwise approach to improving\n\ndata and serving all essential use cases. Encourage your BI/analyst\n\nteam to leverage Databricks serverless endpoints for self-serve\n\nand agility. Each team can evaluate their top priority workloads and\n\nmigrate them in parallel to speed up migration.\n\nTake advantage of Databricks’ rich partner ecosystem. Your favorite\n\npartners are likely already integrated via Partner Connect and\n\ncan be set up with a few clicks. There are also many ISV and SI\n\nconsulting partners who can help your migration journey.\n\n\n-----\n\n#### Migration planning\n\nMigrating a data warehouse to the cloud can be time consuming and challenging for your\n\ndata teams. It’s important to agree on the data architecture, migration strategy and process/\n\nframeworks to be used before undertaking a data migration. Databricks provides Migration\n\nAssessment and Architecture Review sessions to develop a joint migration roadmap. This\n\nprocess is designed to help organizations to successfully migrate to a lakehouse architecture.\n\nBased on information collected and business objectives, the Databricks team will work with\n\ncustomers to propose a target architecture and provide a tailored migration roadmap.\n\nThese assessments help get a full picture of current data systems and the future vision. They\n\nclarify what you are migrating and do proper use case discovery. This includes identifying\n\nworkloads and data source dependency, for example:\n\nSample migration assessment checklist:\n\nIdentify upstream data sources and workload dependencies\n\nIdentify active/inactive data sets and database objects\n\nIdentify downstream application dependencies and data freshness requirements\n\nDefine a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n\nDefine security requirements and data governance\n\nClarify access management need, document needed permissions per user/group\n\nOutline current tooling (ingestion, ETL and BI) and what’s needed\n\n\n-----\n\nIt’s important to identify key stakeholders and keep them engaged during the migration to\n\nmake sure they are aligned with the overall objectives. The workload assessment result will\n\nbe reviewed with key stakeholders. 
Through the review process, data teams can get a better\n\nunderstanding of which workloads can most benefit from modernization.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "b8f09be63b094f14e5cb405d10b93048", + "clarify what you are migrating and do proper use case discovery. This includes identifying\n\nworkloads and data source dependency, for example:\n\nSample migration assessment checklist:\n\nIdentify upstream data sources and workload dependencies\n\nIdentify active/inactive data sets and database objects\n\nIdentify downstream application dependencies and data freshness requirements\n\nDefine a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n\nDefine security requirements and data governance\n\nClarify access management need, document needed permissions per user/group\n\nOutline current tooling (ingestion, ETL and BI) and what’s needed\n\n\n-----\n\nIt’s important to identify key stakeholders and keep them engaged during the migration to\n\nmake sure they are aligned with the overall objectives. The workload assessment result will\n\nbe reviewed with key stakeholders. Through the review process, data teams can get a better\n\nunderstanding of which workloads can most benefit from modernization.\n\nDatabricks often works with partners to provide a workload assessment and help customers\n\nunderstand their migration complexity and properly plan a budget. Databricks also partners\n\nwith third-party vendors that provide migration tools to securely automate major migration\n\ntasks. Databricks Partner Connect makes it easy to connect with this ecosystem of tools to\n\nhelp with the migration, including:\n\n\u0007Code conversion tooling that can automatically translate 70%–95% of the SQL code in\n\nyour current system to Databricks optimized code with Delta and other best practices\n\n\u0007Converters that automate multiple GUI-based ETL/ELT platform conversion to reduce\n\nmigration time and cost\n\n\u0007Data migration tools that can migrate data from on-premises storage to cloud storage\n\n2x–3x faster than what was previously possible\n\n\n-----\n\n#### We can use Automated conversion for most workload types\n\n###### EDWs\n\n\nOpen Cloud Storage\nADLS, S3, GCP Storage\n\nDatabricks Tables, �ie�s\n\nSpark SQL Databricks Notebooks\n\nSpark SQL � little bit o� Python or Scal�\n\nRuns on Databricks JDBC/ODBC\n\nDatabricks permissions- Table ACLs\n\nCredential Pass-throughs to Files\n\nBig Data ETL tools, Databricks Notebooks\n\nAir5o� DAGs, ADF, Databricks Job\nand any other Enterprise Schedulers\n\n\nData Migration\n\nMetastore Migration\n\nSQL Migration\n\nSecurity\n\nETL Tools\n\n\nDB locked �ormats on Disks\n\nDatabases, Tables, �ie�s\n\nAd-hoc SQL �ueries\n\nT-SQL, PL/SQL, BTEQ\n\nReports �rom PB`, Tableau etc^\n\nGRANTs, Roles\n\nExternal tables- File permissions\n\nData Stage, Po�erCenter, Ab `nitio etc^\n\n\nOrchestration ETL Schedulers\n\n\n-----\n\n#### ELT approach\n\nThe separation of storage and compute makes ELT on lakehouse a better choice than traditional\n\nETL. You can ingest all raw data to Delta Lake, leverage low-cost storage and create a Medallion\n\ndata implementation from raw/Bronze to curated/Gold depending on what’s needed to support\n\nuse cases. 
During ingestion, basic data validation can occur, but establishing a Bronze data layer is\n\nthe foundation of a single-pane-of-glass for the business. Teams can leverage compute resources\n\nas needed without a fixed compute infrastructure. Establishing a Silver layer further enriches data\n\nby exploring and applying transformations. ELT allows data teams to break pipelines into smaller\n\n“migrations,” starting with a simple workload, then improving the pipeline design iteratively.\n\n**IMPROVE DATA QUALITY**\n\nData Bronze Tables Silver Tables Gold Tables\n\nStreaming Analytics", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "2fa9b783df7db3ee2e04a3b7d5ed2ad5", + "The separation of storage and compute makes ELT on lakehouse a better choice than traditional\n\nETL. You can ingest all raw data to Delta Lake, leverage low-cost storage and create a Medallion\n\ndata implementation from raw/Bronze to curated/Gold depending on what’s needed to support\n\nuse cases. During ingestion, basic data validation can occur, but establishing a Bronze data layer is\n\nthe foundation of a single-pane-of-glass for the business. Teams can leverage compute resources\n\nas needed without a fixed compute infrastructure. Establishing a Silver layer further enriches data\n\nby exploring and applying transformations. ELT allows data teams to break pipelines into smaller\n\n“migrations,” starting with a simple workload, then improving the pipeline design iteratively.\n\n**IMPROVE DATA QUALITY**\n\nData Bronze Tables Silver Tables Gold Tables\n\nStreaming Analytics\n\nCSV TXT JSON\n\n\nData Lake\n\n\nRaw\nintegration\n\n\nFiltered, Cleaned,\nAugmented\n\n\nBusiness-level\nAggregates\n\n\nReporting\n\n\n-----\n\nWe highly recommend leveraging [Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) , a new cloud-native managed\n\nservice in the Databricks Lakehouse Platform that provides a reliable ETL framework to\n\nmodernize your data pipeline at scale. Instead of migrating multiple ETL tasks one by one in\n\na traditional data warehouse, you can focus on source and expected output, and create your\n\nentire dataflow graph declaratively. Delta Live Tables offers:\n\n\u0007A metadata-driven approach — You just specify what data should be in each table or view\n\nrather than the details of how processing should be done\n\n\u0007An end-to-end data pipeline with data quality and freshness checks, end-to-end\n\nmonitoring/visibility, error recovery, and lineage, which reduces the strain on data\n\nengineering teams and improves time-to-value in building data pipelines\n\n\u0007Automatic management of all the dependencies within the pipeline. This ensures all tables\n\nare populated correctly, whether continuously or on a regular schedule. For example,\n\nupdating one table will automatically trigger all downstream table updates to keep data\n\nup-to-date.\n\n\u0007All pipelines are built code-first, which makes editing, debugging and testing of data\n\npipelines simpler and easier. 
DLT can also automatically recover from common error\n\nconditions, reducing operational overhead.\n\n\n-----\n\n#### Agile modernization\n\n\nAgile development allows teams to move quickly knowing migrated\n\npipelines can be revisited at a later cycle and evolving data models\n\nare supported within the architecture. Allowing business impact to\n\ndrive priorities via an agile approach helps mitigate migration risks.\n\nPrioritizing and selecting use cases where modernization brings\n\nbusiness benefits quickly is a good starting point. Focus on the 20%\n\nof workloads that consume 80% of budget. By breaking workflows\n\ndown into components and managing data stories, teams can adjust\n\npriorities over time. Changes can be made in collaboration with the\n\nuser community to fit the business definition of value.\n\nMigrating to a lakehouse architecture leverages separation of storage\n\nand compute to remove resource contention between ETL and BI\n\nworkloads. As a result, the migration process can be more agile,\n\nallowing you to evolve your design iteratively without big-bang effort:\n\n\u0007Reduce time during the initial phase on full capacity plan and\n\n\nAll of this allows you to take a more iterative and business-focused\n\napproach for migration instead of a full planning, execution, test/\n\nvalidation approach. Here are more approaches that help facilitate\n\nthis phased implementation:\n\n\u0007Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . Auto Loader helps to ingest", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "8773044556745d4feb2616c8e75292b3", + "of workloads that consume 80% of budget. By breaking workflows\n\ndown into components and managing data stories, teams can adjust\n\npriorities over time. Changes can be made in collaboration with the\n\nuser community to fit the business definition of value.\n\nMigrating to a lakehouse architecture leverages separation of storage\n\nand compute to remove resource contention between ETL and BI\n\nworkloads. As a result, the migration process can be more agile,\n\nallowing you to evolve your design iteratively without big-bang effort:\n\n\u0007Reduce time during the initial phase on full capacity plan and\n\n\nAll of this allows you to take a more iterative and business-focused\n\napproach for migration instead of a full planning, execution, test/\n\nvalidation approach. Here are more approaches that help facilitate\n\nthis phased implementation:\n\n\u0007Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . Auto Loader helps to ingest\n\nnew data into pipelines quicker to get data in near real-time.\n\n\u0007Delta Live Tables (DLT) improves data quality during data\n\ntransformation and automatically scales to address data volume\n\nchange. DLT can also support schema evolution and quarantine\n\nbad data or data that needs to be reprocessed at a later stage.\n\n\u0007Use dedicated clusters to isolate workloads, lower the total cost\n\nof ownership and improve overall performance. 
By using multiple\n\nclusters, we can shut down resources when not in use and move\n\naway from managing fixed resources in a single large cluster.\n\n\nscoping\n\n\u0007Flexible cloud infrastructure and unlimited, autoscaling storage\n\n\u0007Workload management is much simpler, you can isolate each\n\nworkload with a dedicated compute resource, without worrying\n\nabout managing workload contention\n\n\u0007Auto-scale and tear down the compute resources after the job\n\nis done to achieve cost efficiency\n\n\n-----\n\nLeverage Databricks’ deep bench of expertise to build reusable assets along the migration:\n\n\u0007Create a migration factory for iterative migration process\n\n\u0007Determine and implement a security and governance framework\n\n\u0007Establish a to-be environment and move use cases/workloads in logical units\n\n\u0007Prove business value and scale over time\n\n\u0007Add new functionality continuously so important business requirements are not left on hold during migration\n\nTake this iterative and templated approach. Migration speed will accelerate. Customers can\n\nfinish migration 15%–20% faster and reduce the amount of tech debt created during the migration.\n\n\n“ M a k e i t w o r k ”\n\nPa r e l l e l i z e t h e\nB u i l d F o u n d at i o n s “ M a k e i t w o r k r i @ h t ”\ni t e r at i o n s\n\n“ M a k e i t w o r k >a s t 2\n\n\nFull %i\"ecycle %ig�t�ou�e /or�load�\n\nLeverage Databricks’ deep\n\nbench of expertise to build\n\nout some **templates for the**\n\n**most effective Databricks**\n\n**implementation.**\n\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nTake an **iterative, bite-sized**\n\n**approach** to migration, reduce tech\n\ndebt and rework, and bring forward\n\nthe value of the solution earlier.\n\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\n\n-----\n\nTo maximize the value of your lakehouse, you should consider retiring\n\nsome legacy architecture design patterns. Leverage the migration\n\nprocess to simplify data warehousing tasks. Regardless of how you\n\ncomplete your migration, you could utilize lakehouse strengths to\n\nimprove architectural patterns:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "ff4cafaa91f98cf81ce85137ca9e88ed", + "Leverage Databricks’ deep\n\nbench of expertise to build\n\nout some **templates for the**\n\n**most effective Databricks**\n\n**implementation.**\n\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nTake an **iterative, bite-sized**\n\n**approach** to migration, reduce tech\n\ndebt and rework, and bring forward\n\nthe value of the solution earlier.\n\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\nMigration\n\nFunctionality\n\nOptimization and Delta\n\n\n-----\n\nTo maximize the value of your lakehouse, you should consider retiring\n\nsome legacy architecture design patterns. Leverage the migration\n\nprocess to simplify data warehousing tasks. Regardless of how you\n\ncomplete your migration, you could utilize lakehouse strengths to\n\nimprove architectural patterns:\n\n\u0007Merge your siloed data warehouses on your unified lakehouse\n\nplatform and unify data access and data governance via Unity\n\nCatalog. 
The lakehouse architecture provides a unified storage\n\nlayer for all your data where there is no physical boundary\n\nbetween data. There is no need to keep data copies for each\n\nsystem using the data set. Clean up and remove jobs that are\n\ncreated to keep data in sync across various data systems.\n\nKeep a single copy of raw data in your lakehouse as a single\n\nsource of truth.\n\n\u0007The Databricks Lakehouse Platform allows you to merge batch\n\nand streaming into a single system to build a simple continuous\n\n\n\u0007Simplify your workload isolation and management by running jobs\n\nin dedicated clusters. Separating storage and compute allows you\n\nto easily isolate each task with isolated compute resources. There\n\nis no need to squeeze them into a single large data appliance\n\nand spend lots of time managing and coordinating resources.\n\nLeverage the elasticity of the Databricks compute layer to\n\nautomatically handle workload concurrency changes at peak time\n\ninstead of paying for over-provisioned resources for most of the\n\ntime. This greatly simplifies the workload management effort the\n\ntraditional data warehouses require.\n\n\u0007Simplify disaster recovery. Storage and compute separation\n\nallows easy disaster recovery. The cloud storage provides very\n\ngood data redundancy and supports automated replication\n\nto another region. Customers can spin up compute resources\n\nquickly in another region and maintain service availability in case\n\nof an outage.\n\n\ndata flow model to process data as it arrives. Process data in\n\nnear real-time and enable data-driven decisions with the most\n\nrecent updates.\n\n\n-----\n\n#### Security and data governance\n\n\nSecurity is paramount in any data-driven organization. Data security\n\nshould enforce the business needs for both internal and external\n\ndata, so the lakehouse should be set up to meet your organization’s\n\nsecurity requirements. Databricks provides built-in security to\n\nprotect your data during and after migration.\n\n\u0007Encrypt data at rest and in-transit, using a cloud-managed key\n\nor your own\n\n\u0007Set up a custom network policy, use IP range to control access\n\n\u0007Leverage Private Link to limit network traffic to not traverse the\n\npublic internet\n\n\nThe challenge with the traditional data warehouse and data lake\n\narchitecture is that data is stored in multiple stores and your data\n\nteam also needs to manage data access and data governance\n\ntwice. The lakehouse pattern uses unified storage which simplifies\n\ngovernance. The Databricks Lakehouse Platform provides a unified\n\ngovernance layer across all your data teams. Migrating to Databricks\n\nUnity Catalog provides data discovery, data lineage, role-based\n\nsecurity policies, table or row/column-level access control, and\n\ncentral auditing capabilities that make the data platform easy for\n\ndata stewards to confidently manage and secure data access to", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "75a1c56b0f4933fba47834345e0c4f4d", + "security requirements. 
Databricks provides built-in security to\n\nprotect your data during and after migration.\n\n\u0007Encrypt data at rest and in-transit, using a cloud-managed key\n\nor your own\n\n\u0007Set up a custom network policy, use IP range to control access\n\n\u0007Leverage Private Link to limit network traffic to not traverse the\n\npublic internet\n\n\nThe challenge with the traditional data warehouse and data lake\n\narchitecture is that data is stored in multiple stores and your data\n\nteam also needs to manage data access and data governance\n\ntwice. The lakehouse pattern uses unified storage which simplifies\n\ngovernance. The Databricks Lakehouse Platform provides a unified\n\ngovernance layer across all your data teams. Migrating to Databricks\n\nUnity Catalog provides data discovery, data lineage, role-based\n\nsecurity policies, table or row/column-level access control, and\n\ncentral auditing capabilities that make the data platform easy for\n\ndata stewards to confidently manage and secure data access to\n\nmeet compliance and privacy needs, directly on the lakehouse.\n\n\n\u0007Enable SSO, integrate with active directory and other IdPs\n\n\u0007Control data access to database objects using RBAC\n\n\u0007Enable audit logs to monitor user activities\n\n\n-----\n\nA-�it Log\n\nAcco-nt Level$\nUser Management\n\nCre�entials\n\n##### Centralized Governance\n\nACL Store\n\nAccess Control\n\n\nMetastore\n\nLineage Explorer\n\nData Explorer\n\n\n-----\n\n#### Team involvement\n\nPlan to educate and train your team iteratively throughout the\n\nmigration process. As new workloads are migrated, new teams will\n\ngain exposure to the lakehouse pattern. Plan to ramp up new team\n\nmembers as the migration process progresses, developing a data\n\nCenter of Excellence within the organization. Databricks provides\n\na cost effective platform for ad hoc work to be performed. A\n\nsandbox environment can be leveraged for teams to get exposure\n\nto Databricks technology and get hands-on experience. Databricks\n\nalso provides [learning path](https://databricks.com/learn/training/home) training for customers. Encourage teams\n\nto get hands-on experience relevant to their immediate tasks, gain\n\n\n#### Conclusion\n\nData warehouse migration touches many business areas and\n\nimpacts many teams, but the Databricks Lakehouse Platform\n\nsimplifies this transition, reduces risks and accelerates your ROI.\n\nThe Databricks Business Value Consulting team can work with you\n\nto quantify the impact of your use cases to both data and business\n\nteams. And the Databricks team of solution architects, professional\n\nservices, and partners are ready to help.\n\nReach out to your Databricks account team or send a message to\n\n[sales@databricks.com](mailto:sales%40databricks.com?subject=) to get started.\n\n\nexposure to new things and try new ideas.\n\n#### Additional resources\n\n[Migrate to Databricks](https://databricks.com/solutions/migration)\n\n[Modernize Your Data Warehouse](https://databricks.com/p/webinar/apj-modernize-your-data-warehouse)\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe. 
Founded by the original\n\ncreators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. To learn more, follow Databricks on", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "3205b185d56baaf2d9a6be359acdc2bd", + "exposure to new things and try new ideas.\n\n#### Additional resources\n\n[Migrate to Databricks](https://databricks.com/solutions/migration)\n\n[Modernize Your Data Warehouse](https://databricks.com/p/webinar/apj-modernize-your-data-warehouse)\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe. Founded by the original\n\ncreators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "10579387cee3f11fabfa76bda3175578", + "```\nTECHNICAL GUIDE\n\n```\n\n# Solving Common Data Challenges \n\n\n#### Startups and Digital Native Businesses\n\n\n-----\n\n### Table of Contents\n\n\n# 01\n```\nCHALLENGE:\n \u0003\n\n###### Creating a unified data architecture for data quality, governance and efficiency\n\n# 03\nCHALLENGE:\n \u0003\n\n###### Building effective machine learning operations\n\n```\n\n# 02\n```\nCHALLENGE:\n \u0003\n\n###### Building a data architecture to support scale and performance\n\n# 04\nSUMMARY:\n\n###### The Databricks Lakehouse Platform addresses these challenges\n\n```\n\n-----\n\n**I N T R O D U C T I O N**\n\n\nThis guide shares how the lakehouse architecture can increase\nproductivity and cost-efficiently support all your data, analytics\nand AI workloads, and flexibly scale with the pace of growth\nfor your company. Read the entire guide or dive straight into a\nspecific challenge.\n\nWith the advent of cloud infrastructure, a new generation of\nstartups has rapidly built and scaled their businesses. The use of\ncloud infrastructure, once seen as innovative, has now become\ntable stakes. The differentiator for the fastest-moving startups\nand digital natives now comes from the effective use of data\nat scale, primarily analytics and AI. Digital natives — defined\nas fast-moving, lean, and technically savvy, born-in-the-cloud\norganizations — are beginning to focus on new data-driven use\ncases such as real-time machine learning and personalized\ncustomer experiences.\n\nTo pursue these new data-intensive use cases and initiatives,\norganizations must look beyond the technologies that delivered\nthem to this point in time. 
Over time, these technologies, such\nas transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n\nThis guide examines some of the biggest data challenges and\nsolutions for startups and for scaling digital native businesses\nthat have reached the point where an end-to-end modern data\nplatform is a smart investment. Some key considerations include:\nsystems that are not cost-efficient and require time-consuming\nadministration and engineering toil. In addition to growing\nmaintenance needs, data is often stored in disparate locations\nand formats, with little or no governance, making real-time use\ncases, analytics and AI difficult or impossible.\n\n\n**Consolidating on a unified data platform**\nAs mentioned above, siloed data storage and management add administrative and\nfinancial cost. You can benefit significantly when you unify your data in one location\nwith a flexible architecture that scales with your needs and delivers performance\nfor future success. For this, you will want an open platform that supports all your\ndata including batch and streaming workloads, data analytics and machine learning.\nWith data unification, you create a more efficient, integrated approach to ingesting,\ncleaning and organizing your data. You also need automation to make data analysis\neasier for the nontechnical users in the company. But broader data access also\nmeans more focus on security, privacy, compliance and access control, which can\ncreate overhead for a growing.\n\n**Scaling up capacity and increasing performance**\n**and usability of the data solutions**\nData teams at growing digital native organizations find it time intensive and costly to\nhandle the growing volume and velocity of their data being ingested from multiple\nsources, across multiple clouds. You now need a unified and simplified platform that\ncan instantly scale up capacity and deliver more computing power on demand to\nfree up your data teams to produce outputs more quickly. This lowers the total cost\nfor the overall infrastructure by eliminating redundant licensing, infrastructure and\nadministration costs.\n\n**Building effective machine learning operations**\nFor data teams beginning their machine learning journeys, the challenge of training\ndata models can increase in management complexity. Many teams with disparate\ncoding needs for the entire model lifecycle suffer inefficiencies from transferring\ndata and code across many separate services. To build and manage effective\nML operations, consider an end-to-end MLOps environment that brings all data\ntogether in one place and incorporates managed services for experiment tracking,\nmodel training, feature development and feature and model serving.\n\n\n-----\n\n# 01\n```\nCHALLENGE: \u0003\n\n## Create a unified data architecture for data quality, governance and efficiency\n\n```\n\n-----\n\n```\nCHALLENGE 01\n\n### Create a unified data architecture for data quality, governance and efficiency", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "44bc7408f155cba595fa74c04215d6b9", + "**Building effective machine learning operations**\nFor data teams beginning their machine learning journeys, the challenge of training\ndata models can increase in management complexity. 
Many teams with disparate\ncoding needs for the entire model lifecycle suffer inefficiencies from transferring\ndata and code across many separate services. To build and manage effective\nML operations, consider an end-to-end MLOps environment that brings all data\ntogether in one place and incorporates managed services for experiment tracking,\nmodel training, feature development and feature and model serving.\n\n\n-----\n\n# 01\n```\nCHALLENGE: \u0003\n\n## Create a unified data architecture for data quality, governance and efficiency\n\n```\n\n-----\n\n```\nCHALLENGE 01\n\n### Create a unified data architecture for data quality, governance and efficiency\n\n```\nAs cloud-born companies grow, data volumes rapidly increase, leading to new\nchallenges and use cases. Among the challenges:\n\n\nApplication stacks optimized for transaction\nuse cases aren’t able to handle the volume,\nvelocity and variety of data that modern data\nteams require. For example, this leads to query\nperformance issues as data volume grows.\n\nData silos develop as each team within an\norganization chooses different ETL/ELT and\nstorage solutions for their needs. As the\norganization grows and changes, these pipelines\nand storage solutions become brittle, hard to\nmaintain and nearly impossible to integrate.\n\n\nThese data silos lead to discoverability,\nintegration and access issues, which prevent\nteams from leveraging the full value of the\norganization’s available data.\n\nData governance is hard. Disparate ETL/ELT\nand storage solutions lead to governance,\ncompliance, auditability and access control\nchallenges, which expose organizations to\ntremendous risk.\n\n\nThe Databricks Lakehouse Platform provides\na unified set of tools for building, deploying,\nsharing and maintaining data solutions at scale.\nIt integrates with cloud storage and the security\nin your cloud account, manages and deploys\ncloud infrastructure on your behalf. Your data\npractitioners no longer need separate storage\nsystems for their data. And you don’t have to rely\non your cloud provider for security. The lakehouse\nhas its own robust security built into the platform.\n\n\nFor all the reasons above, the most\nconsistent advice from successful data\npractitioners is to create a “single source\nof truth” by unifying all data on a single\nplatform. With the Databricks Lakehouse\nPlatform, you can unify all your data on one\nplatform, reducing data infrastructure costs\nand compute. You don’t need excess data\ncopies and you can retire expensive\nlegacy infrastructure.\n```\n 01\n\n```\n\n-----\n\n```\nCUSTOMER STORY: GRAMMARLY\n\n### Helping 30 million people and 50,000 teams communicate more effectively\n\n```\n\nWhile its business is based on analytics, [Grammarly](http://www.grammarly.com)\n\nfor many years relied on a homegrown analytics\n\nplatform to drive its AI writing assistant to\n\nhelp users improve multiple aspects of written\n\ncommunications. As teams developed their own\n\nrequirements, data silos inevitably emerged as\n\ndifferent business areas implemented analytics\n\ntools individually.\n\n“Every team decided to solve their analytics\n\nneeds in the best way they saw fit,” said Chris\n\nLocklin, Engineering Manager, Data Platforms,\n\nat Grammarly. 
“That created challenges in\n\nconsistency and knowing which data set\n\nwas correct.”\n\nTo better scale and improve data storage and\n\nquery capabilities, Grammarly brought all its\n\nanalytical data into the Databricks Lakehouse\n\nPlatform and created a central hub for all data\n\nproducers and consumers across the company.\n\nGrammarly had several goals with the lakehouse,\n\nincluding better access control, security, ingestion\n\n\nflexibility, reducing costs and fueling collaboration. “Access control in a\n\ndistributed file system is difficult, and it only gets more complicated as\n\nyou ingest more data sources,” said Locklin. To manage access control,\n\nenable end-to-end observability and monitor data quality, Grammarly\n\nrelies on the data lineage capabilities within Unity Catalog. “Data lineage\n\nallows us to effectively monitor usage of our data and ensure it upholds", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "112b18913a18db059bbc06be42eae3e3", + "Locklin, Engineering Manager, Data Platforms,\n\nat Grammarly. “That created challenges in\n\nconsistency and knowing which data set\n\nwas correct.”\n\nTo better scale and improve data storage and\n\nquery capabilities, Grammarly brought all its\n\nanalytical data into the Databricks Lakehouse\n\nPlatform and created a central hub for all data\n\nproducers and consumers across the company.\n\nGrammarly had several goals with the lakehouse,\n\nincluding better access control, security, ingestion\n\n\nflexibility, reducing costs and fueling collaboration. “Access control in a\n\ndistributed file system is difficult, and it only gets more complicated as\n\nyou ingest more data sources,” said Locklin. To manage access control,\n\nenable end-to-end observability and monitor data quality, Grammarly\n\nrelies on the data lineage capabilities within Unity Catalog. “Data lineage\n\nallows us to effectively monitor usage of our data and ensure it upholds\n\nthe standards we set as a data platform team,” said Locklin. “Lineage is\n\nthe last crucial piece for access control.”\n\nData analysts within Grammarly now have a consolidated interface for\n\nanalytics, which leads to a single source of truth and confidence in the\n\naccuracy and availability of all data managed by the data platform team.\n\nHaving a consistent data source across the company also resulted in\n\ngreater speed and efficiency and reduced costs. Data practitioners\n\nexperienced 110% faster querying at 10% of the cost to ingest compared\n\nto a data warehouse. Grammarly can now make its 5 billion daily events\n\navailable for analytics in under 15 minutes rather than 4 hours. 
Migrating\n\noff its rigid legacy infrastructure gave Grammarly the flexibility to do\n\nmore and the confidence that the platform will evolve with its needs.\n\nGrammarly is now able to sustain a flexible, scalable and highly secure\n\nanalytics platform that helps 30 million people and 50,000 teams\n\nworldwide write more effectively every day.\n\n[Read the full story here.](https://www.databricks.com/customers/grammarly)\n\n\n-----\n\n###### How to unify the data infrastructure with Databricks\n\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\nis composed of two primary parts:\n\n- The infrastructure to deploy, configure and\nmanage the platform and services\n\n\nYou can build a Databricks workspace by configuring\nsecure integrations between the Databricks platform\nand your cloud account, and then Databricks deploys\ntemporary Apache Spark™/Photon clusters using cloud\nresources in your account to process and store data\nin object storage and other integrated services you\ncontrol. Here are three steps to get started with the\nDatabricks Lakehouse Platform:\n\n**Understand the architecture**\nThe lakehouse provides a unified architecture,\nmeaning that all data is stored in the same\naccessible place. The diagram shows how data\ncomes in from sources like a customer relationship\nmanagement (CRM) system, an enterprise resource\nplanning (ERP) system, websites or unstructured\ncustomer emails.\n\n**Optimize the storage layer**\nAll data is stored in cloud storage while Databricks\nprovides tooling to assist with ingestion, such as\nAuto Loader, and we recommend [open-source](https://delta.io/)\n[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\nDelta optimized storage layer that provides the\nfoundation for storing data and tables in the\nDatabricks Lakehouse Platform. Having all your\ndata in the same optimized, open storage keeps\nall your use cases in the same place, thus enabling\ncollaboration and removing software tool overhead.\n\n\n\n- the customer-owned infrastructure managed in\ncollaboration by Databricks and the customer.\n\n\nThe lakehouse handles all varieties of data (structured, semi-structured, unstructured),\nas well as all velocities of data (streaming, batch or somewhere in the middle).", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "6fe95ef77e68154410332294fec104f9", + "**Optimize the storage layer**\nAll data is stored in cloud storage while Databricks\nprovides tooling to assist with ingestion, such as\nAuto Loader, and we recommend [open-source](https://delta.io/)\n[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\nDelta optimized storage layer that provides the\nfoundation for storing data and tables in the\nDatabricks Lakehouse Platform. 
Having all your\ndata in the same optimized, open storage keeps\nall your use cases in the same place, thus enabling\ncollaboration and removing software tool overhead.\n\n\n\n- the customer-owned infrastructure managed in\ncollaboration by Databricks and the customer.\n\n\nThe lakehouse handles all varieties of data (structured, semi-structured, unstructured),\nas well as all velocities of data (streaming, batch or somewhere in the middle).\n\n[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n\n\n-----\n\nThe Databricks Lakehouse organizes data stored with Delta Lake in cloud object\nstorage with familiar concepts like database, tables and views. Delta Lake extends\nParquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\nscalable metadata handling. Delta Lake is fully compatible with Apache Spark APIs,\nand was developed for tight integration with Structured Streaming, allowing you to\neasily use a single copy of data for both batch and streaming operations to provide\nincremental processing at scale.This model combines many of the benefits of a data\nwarehouse with the scalability and flexibility of a data lake.\n\nTo learn more about the optimized storage layer that provides the foundation for\nstoring data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n\nThe first step in unifying your data architecture is setting up how data is to be\naccessed and used across the organization. We’ll discuss this as a series of steps:\n\n**1** Set up governance with Unity Catalog\n\n**2** Grant secure access to the data\n\n\n###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n\n[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\n**3** Capture audit logs\n\n**4** View data lineage\n\n**5** Set up data sharing\n\n\n-----\n\n**Configure unified governance**\nDatabricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\nmeans that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\nis secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\npersonas and automatically captures user-level audit logs that record access to your data.\n\nData stewards can securely grant access to a broad set of users to discover and analyze data at scale. 
These users can use a variety of\nlanguages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "7464e873c78f3ca1d4eb6292685b6fe6", + "**3** Capture audit logs\n\n**4** View data lineage\n\n**5** Set up data sharing\n\n\n-----\n\n**Configure unified governance**\nDatabricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\nmeans that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\nis secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\npersonas and automatically captures user-level audit logs that record access to your data.\n\nData stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\nlanguages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n\nTo set up Unity Catalog for your organization,\nyou do the following:\n\n\n**1** Configure an S3 bucket and IAM role that\nUnity Catalog can use to store and access\ndata in your AWS account.\n\n**2** Create a metastore for each region in\n\nwhich your organization operates, and\nattach workspaces to the metastore. Each\nworkspace will have the same view of the\ndata you manage in Unity Catalog.\n\n\n**3** If you have a new account, add users,\ngroups and service principals to your\nDatabricks account.\n\n**4** Next, create and grant access to\n\ncatalogs, schemas and tables.\n\n\nFor complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n\n\n-----\n\n###### How Unity Catalog works\n\n\nYou will notice that the hierarchy of primary data\nobjects in Unity Catalog flows from metastore to table:\n\n**Metastore** is the top-level container for metadata.\nEach metastore exposes a three-level namespace\n(catalog.schema.table) that organizes your data.\n\n\n**Metastore** **Catalog** **Schemas**\n\n\n**Views**\n\n**Managed**\n**Tables**\n\n\n**Catalog** is the first layer of the object hierarchy, used\nto organize your data assets.\n\n\n**Schemas** , also known as databases, are the second\nlayer of the object hierarchy and contain tables and\nviews.\n\n**Table** is the lowest level in the object hierarchy, and\ntables can be external (stored in external locations in\nyour cloud storage of choice) or managed (stored in a\nstorage container in your cloud storage that you create\n\nexpressly for Databricks). You can also create readonly **Views** from tables.\n\n\n**External**\n**tables**\n\nThe diagram below represents the file system\nhierarchy of a single storage bucket:\n\n\n-----\n\nUnity Catalog uses the identities in the Databricks\naccount to resolve users, service principals, and groups\nand to enforce permissions. 
To configure identities in\nthe account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\nservice principals, and groups when you create\n[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "a0f7f52e108693072a591793dd53fd03", + "expressly for Databricks). You can also create readonly **Views** from tables.\n\n\n**External**\n**tables**\n\nThe diagram below represents the file system\nhierarchy of a single storage bucket:\n\n\n-----\n\nUnity Catalog uses the identities in the Databricks\naccount to resolve users, service principals, and groups\nand to enforce permissions. To configure identities in\nthe account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\nservice principals, and groups when you create\n[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n\nUnity Catalog users, service principals, and groups\nmust also be added to workspaces to access Unity\nCatalog data in a notebook, a Databricks SQL query,\nData Explorer or a REST API command. The assignment\nof users, service principals, and groups to workspaces\nis called identity federation. All workspaces attached\nto a Unity Catalog metastore are enabled for identity\nfederation.\n\nSecurable objects in Unity Catalog are hierarchical,\nmeaning that granting a privilege on a catalog or schema\nautomatically grants the privilege to all current and\nfuture objects within the catalog or schema. For more\non granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\nA common scenario is to set up a schema per team\nwhere only that team has USE SCHEMA and CREATE on\nthe schema. This means that any tables produced by\nteam members can only be shared within the team.\nData Explorer uses the privileges configured by Unity\nCatalog administrators to ensure that users are only\nable to see catalogs, databases, tables and views that\nthey have permission to query.\n\n\n[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\nmany Unity Catalog features. Use Data Explorer to view\nschema details, preview sample data, and see table\ndetails and properties. Administrators can view and\nchange owners. Admins and data object owners can grant\nand revoke permissions through this interface.\n\n**Set up secure access**\nIn Unity Catalog, data is secure by default. Initially, users\nhave no access to data in a metastore. Access can\nbe granted by either a metastore admin, the owner of\nan object, or the owner of the catalog or schema that\ncontains the object. 
Securable objects in Unity Catalog\nare hierarchical and privileges are inherited downward.\n\nUnity Catalog’s security model is based on standard ANSI\nSQL and allows administrators to grant permissions in\ntheir existing data lake using familiar syntax, at the level of\ncatalogs, databases (schema), tables and views. Privileges\nand metastores are shared across workspaces, allowing\nadministrators to set secure permissions once against\n\ngroups synced from identity providers and know that\nend users only have access to the proper data in any\nDatabricks workspace they enter.\n\n\n-----\n\n```\nCUSTOMER STORY: BUTCHERBOX\n\n### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n\n```\n\nAs a young e-commerce company,\n\n[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n\ncustomers’ needs change, which means it is\n\nconstantly considering behavioral patterns,\n\ndistribution center efficiency, a growing list of\n\nmarketing and communication channels, and\n\norder processing systems.\n\nThe meat and seafood subscription company\n\ncollects data on hundreds of thousands\n\nof subscribers. It deployed the Databricks\n\nLakehouse Platform to gain visibility across\n\nits diverse range of data systems and enable\n\nits analytics team to securely view and\n\nexport data in the formats needed.\n\nWith so much data feeding in from different\n\nsources — from email systems to its website\n\n— the data team at ButcherBox quickly\n\ndiscovered that data silos were a significant", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "2cb92e326f83cade2a74789a0196a281", + "### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n\n```\n\nAs a young e-commerce company,\n\n[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n\ncustomers’ needs change, which means it is\n\nconstantly considering behavioral patterns,\n\ndistribution center efficiency, a growing list of\n\nmarketing and communication channels, and\n\norder processing systems.\n\nThe meat and seafood subscription company\n\ncollects data on hundreds of thousands\n\nof subscribers. It deployed the Databricks\n\nLakehouse Platform to gain visibility across\n\nits diverse range of data systems and enable\n\nits analytics team to securely view and\n\nexport data in the formats needed.\n\nWith so much data feeding in from different\n\nsources — from email systems to its website\n\n— the data team at ButcherBox quickly\n\ndiscovered that data silos were a significant\n\n\n“We knew we needed to migrate from our legacy data warehouse\n\nenvironment to a data analytics platform that would unify our\n\ndata and make it easily accessible for quick analysis to improve\n\nsupply chain operations, forecast demand and, most importantly,\n\nkeep up with our growing customer base,” explained Jake Stone,\n\nSenior Manager, Business Analytics, at ButcherBox.\n\nThe platform allows analysts to share builds and iterate on a\n\nproject without getting into the code. Querying a table of 18\n\nbillion rows would have been problematic with a traditional\n\nplatform. With Databricks, ButcherBox can do it in three minutes.\n\n“Delta Lake provides us with a single source of truth for all of\n\nour data,” said Stone. 
“Now our data engineers are able to build\n\nreliable data pipelines that thread the needle on key topics such\n\nas inventory management, allowing us to identify in near real-\n\ntime what our trends are so we can figure out how to effectively\n\nmove inventory.”\n\n[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\nproblem because they blocked complete\n\nvisibility into critical insights needed to make\n\nstrategic and marketing decisions.\n\n\n-----\n\n**Set up secure data sharing**\nDatabricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\nto share data with other entities regardless of their\ncomputing platforms. Delta Sharing is integrated with\nUnity Catalog. Your data must be registered with Unity\nCatalog to manage, govern, audit and track usage of the\nshared data on the Lakehouse Platform. The primary\nconcepts of Delta Sharing are shares (read-only\ncollections of tables and table partitions to be shared)\nand recipients (objects that associate an organization\nwith a credential or secure sharing identifier).\n\nAs a data provider, you generate a token and share\nit securely with the recipient. They use the token to\nauthenticate and get read access to the tables you’ve\nincluded in the shares you’ve given them access\nto. Recipients access the shared data in read-only\nformat. Whenever the data provider updates data\ntables in their own Databricks account, the updates\nappear in near real-time in the recipient’s system.\n\n\n**Capture audit logs**\nUnity Catalog captures an audit log of actions\nperformed against the metastore. To access audit\nlogs for Unity Catalog events, you must enable and\nconfigure audit logs for your account. Audit logs for\neach workspace and account-level activities are\ndelivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "e574ab8b9c03dd45aa08ea51855a9280", + "As a data provider, you generate a token and share\nit securely with the recipient. They use the token to\nauthenticate and get read access to the tables you’ve\nincluded in the shares you’ve given them access\nto. Recipients access the shared data in read-only\nformat. Whenever the data provider updates data\ntables in their own Databricks account, the updates\nappear in near real-time in the recipient’s system.\n\n\n**Capture audit logs**\nUnity Catalog captures an audit log of actions\nperformed against the metastore. To access audit\nlogs for Unity Catalog events, you must enable and\nconfigure audit logs for your account. Audit logs for\neach workspace and account-level activities are\ndelivered to your account. 
See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n\n**View data lineage**\nYou can use Unity Catalog to capture runtime data\nlineage across queries in any language executed on\na Databricks cluster or SQL warehouse. Lineage can\nbe visualized in Data Explorer in near real-time and\nretrieved with the Databricks REST API. Lineage is\naggregated across all workspaces attached to Unity\nCatalog and captured down to the column level, and\nincludes notebooks, workflows and dashboards related\nto the query. To understand the requirements and how\nto capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n\n\nUnity Catalog Metastore\n\n\nCatalog\n\n\nData providers can use Databricks audit logging to\nmonitor the creation and modification of shares,\nand recipients can monitor recipient activity on\nshares. Data recipients who use shared data in a\nDatabricks account can use Databricks audit logging\nto understand who is accessing which data.\n\n\n-----\n\n###### Resources:\n\n- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n\n- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n\n- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n\n- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n\n- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n\n\n###### Key Takeaways\n\n- With the Databricks Lakehouse Platform, you can\nunify and simplify all your data on one platform\nto better scale and improve data storage and\nquery capabilities\n\n- The lakehouse helps reduce data infrastructure\nand compute costs. You don’t need excess\ndata copies and can retire expensive legacy\ninfrastructure.\n\n\nLeverage Delta Lake as the open format\nstorage layer to deliver reliability, security and\nperformance on your data lake — for both\nstreaming and batch operations — replacing\ndata silos with a single home for structured,\nsemi-structured and unstructured data\n\nWith Unity Catalog you can centralize\ngovernance for all data and AI assets including\nfiles, tables, machine learning models and\ndashboards in your lakehouse on any cloud", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "f1108b13cd3a9bfb92c543c7a09639ad", + "###### Key Takeaways\n\n- With the Databricks Lakehouse Platform, you can\nunify and simplify all your data on one platform\nto better scale and improve data storage and\nquery capabilities\n\n- The lakehouse helps reduce data infrastructure\nand compute costs. 
You don’t need excess\ndata copies and can retire expensive legacy\ninfrastructure.\n\n\nLeverage Delta Lake as the open format\nstorage layer to deliver reliability, security and\nperformance on your data lake — for both\nstreaming and batch operations — replacing\ndata silos with a single home for structured,\nsemi-structured and unstructured data\n\nWith Unity Catalog you can centralize\ngovernance for all data and AI assets including\nfiles, tables, machine learning models and\ndashboards in your lakehouse on any cloud\n\nThe Databricks Lakehouse Platform is open\nsource with multicloud flexibility so that you can\nuse your data however and wherever you want —\nno vendor lock-in\n\n\n-----\n\n# 02\n```\nCHALLENGE: \u0003\n\n## Build your data architecture to support scale and performance\n\n```\n\n-----\n\n```\nCHALLENGE 02\n\n### Build your data architecture to support scale and performance\n\n```\nAs modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\nthe increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\nsuddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\ncost-effective. The relational databases and traditional data warehouses that met the needs of the businesses once\nupon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n\nHere are some common challenges around managing data and performance at scale:\n\n\n**Volume and velocity** — Exponentially\nincreasing data sources, and the speed at\nwhich they capture and create data.\n\n**Latency requirements** — The demands of\ndownstream applications and users have\nevolved (people want data and the results\nfrom the data faster).\n\n\n**Governance** — Cataloging, auditing, securing and\nreporting on data is burdensome at scale when\nusing old systems not built with data access\ncontrols and compliance in mind.\n\n**Multicloud** is really hard.\n\n\n**Data storage** — Storing data in the wrong\nformat is slow to access, query and is\nexpensive at scale.\n\n\n**Data format** — Supporting structured, semistructured and unstructured data formats\nis now a requirement. Most data storage\nsolutions are designed to handle only one type\nof data, requiring multiple products\nto be stitched together.\n\n```\n02\n\n```\n\n-----\n\n###### Lakehouse solves scale and performance challenges\n\n\nThe solution for growing digital companies is a unified\nand simplified platform that can instantly scale up\ncapacity to deliver more computing power on demand,\nfreeing up teams to go after the much-needed data\nand produce outputs more quickly. With a lakehouse,\nthey can replace their data silos with a single home for\ntheir structured, semi-structured and unstructured\ndata. Users and applications throughout the enterprise\nenvironment can connect to the same single copy of\nthe data to drive diverse workloads.\n\nThe lakehouse architecture is cost-efficient for\nscaling, lowering the total cost of ownership for the\noverall infrastructure by consolidating all data estate\nand use cases onto a single platform and eliminating\nredundant licensing, infrastructure and administration\ncosts. 
Unlike other warehouse options that can only\nscale horizontally, the Databricks Lakehouse can scale\nhorizontally and vertically based on workload demands.\n\nWith the Databricks Lakehouse, you can optimize the\ncompute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\nresearch by the Barcelona Supercomputing Center.\nAnd your data teams are more productive by focusing\non more strategic initiatives versus managing multiple\ndata solutions.\n\n```\nCUSTOMER STORY: RIVIAN\n\n### Driving into the future of electric transportation\n\n```\n```\nCUSTOMER STORY: RIVIAN", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "a8a76379a0975b07547488cacc7a4a80", + "With the Databricks Lakehouse, you can optimize the\ncompute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\nresearch by the Barcelona Supercomputing Center.\nAnd your data teams are more productive by focusing\non more strategic initiatives versus managing multiple\ndata solutions.\n\n```\nCUSTOMER STORY: RIVIAN\n\n### Driving into the future of electric transportation\n\n```\n```\nCUSTOMER STORY: RIVIAN\n\n```\n\nWith more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n\nday, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n\nlegacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n\ndecreased output, prevented collaboration and increased operational costs. Rivian chose to modernize its data\n\ninfrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n\ndownstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n\nactionable insights for different use cases, from predictive maintenance to smarter product development.\n\n“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n\nperformant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n\nWassym Bensaid, Vice President of Software Development at Rivian.\n\nFor instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n\naccelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n\nroll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n\nconnected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n\nsmart features and the control that drivers have over them. 
By leveraging the Databricks Lakehouse Platform, Rivian\n\nhas seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n\n[Read the full story here.](https://www.databricks.com/customers/rivian)\n\n\n-----\n\n###### How to ensure scalability and performance with Databricks\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\nscalability and performance for your data architecture\nbased on the following features and capabilities:\n\n- A simplified and cost-efficient architecture that\nincreases productivity\n\n- A platform that ensures reliable, high performing\nETL workloads — for streaming and batch data\n— while Databricks automatically manages your\ninfrastructure\n\n- The ability to ingest, transform and query all your\ndata in one place, and scale on demand with\nserverless compute\n\n- Enables real-time data access for all data,\nanalytics and AI use cases\n\n\n-----\n\nThe following section will provide a short series of\nsteps for understanding the key components of the\nDatabricks Lakehouse Platform.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "7351ac4134a858391fa716f964aaaef2", + "[Read the full story here.](https://www.databricks.com/customers/rivian)\n\n\n-----\n\n###### How to ensure scalability and performance with Databricks\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\nscalability and performance for your data architecture\nbased on the following features and capabilities:\n\n- A simplified and cost-efficient architecture that\nincreases productivity\n\n- A platform that ensures reliable, high performing\nETL workloads — for streaming and batch data\n— while Databricks automatically manages your\ninfrastructure\n\n- The ability to ingest, transform and query all your\ndata in one place, and scale on demand with\nserverless compute\n\n- Enables real-time data access for all data,\nanalytics and AI use cases\n\n\n-----\n\nThe following section will provide a short series of\nsteps for understanding the key components of the\nDatabricks Lakehouse Platform.\n\n\n**Step 2**\n**Understand the common Delta Lake operations**\nThe Databricks Lakehouse Platform simplifies the\nentire data lifecycle, from data ingestion to monitoring\nand governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\nopen-source storage system based on the Delta\nformat providing reliability through ACID transactions\nand scalable metadata handling. Large quantities of\nraw files in blob storage can be converted to Delta to\norganize and store the data cheaply. This allows for\nflexibility of data movement while being performant\nand less expensive.\n\n\n**Step 1**\n**Get a trial Databricks account**\nStart your 14-day free trial with Databricks on\nAWS in a few easy steps.\n[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . 
During the 14day free trial, all Databricks usage is free, but Databricks\nuses compute and S3 storage resources in your cloud\nprovider account.\n\n\nand writing data can occur simultaneously without risk\nof many queries resulting in performance degradation\nor deadlock for business-critical workloads.\n\nThis means that users and applications throughout\nthe enterprise environment can connect to the same\nsingle copy of the data to drive diverse workloads, with\nall viewers guaranteed to receive the most current\nversion of the data at the time their query executes.\nWith performance features like indexing, Delta Lake\ncustomers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n[up to 48x faster.](https://www.databricks.com/customers/columbia)\n\n\n[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\nand learn how to create, manage and query tables.\nWith support for ACID transactions and schema\nenforcement, Delta Lake provides the reliability that\ntraditional data lakes lack. This enables you to scale\nreliable data insights throughout the organization and\nrun analytics and other data projects directly on your\ndata lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n\nDelta Lake transactions use log files stored alongside\ndata files to provide ACID guarantees at a table level.\nBecause the data and log files backing Delta Lake\ntables live together in cloud object storage, reading\n\n\n-----\n\nAll data in Delta Lake is stored in open Apache Parquet\nformat, allowing data to be read by any compatible\nreader. APIs are open and compatible with Apache\nSpark, so you have access to a vast open-source\necosystem to avoid data lock-in from proprietary\nformats and conversions, which have embedded and\nadded costs.\n\n###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n\n — Steve Pulec, Chief Technology Officer, YipitData\n\n[Learn more](https://www.databricks.com/customers/yipitdata)\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "d9dd58cc603554c740505cf7cba15920", + "Delta Lake transactions use log files stored alongside\ndata files to provide ACID guarantees at a table level.\nBecause the data and log files backing Delta Lake\ntables live together in cloud object storage, reading\n\n\n-----\n\nAll data in Delta Lake is stored in open Apache Parquet\nformat, allowing data to be read by any compatible\nreader. 
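The parsed guide excerpt above describes converting raw Parquet files in object storage to Delta and reading and writing the same tables concurrently under ACID guarantees. As a minimal sketch of that workflow (not from the guide itself): the paths and table locations below are placeholders, and a Delta-enabled Spark session such as a Databricks cluster is assumed.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Write a small DataFrame out as a Delta table; the path is a placeholder.
events = spark.range(0, 1000).withColumnRenamed("id", "event_id")
events.write.format("delta").mode("overwrite").save("/tmp/delta/events")

# Convert an existing directory of raw Parquet files in place.
# The directory is assumed to already contain Parquet data.
spark.sql("CONVERT TO DELTA parquet.`/tmp/raw_parquet_events`")

# Readers always see a consistent snapshot, even while writers append.
print(spark.read.format("delta").load("/tmp/delta/events").count())
```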
APIs are open and compatible with Apache\nSpark, so you have access to a vast open-source\necosystem to avoid data lock-in from proprietary\nformats and conversions, which have embedded and\nadded costs.\n\n###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n\n — Steve Pulec, Chief Technology Officer, YipitData\n\n[Learn more](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n**Step 3**\n**Ingest data efficiently at scale**\nWith a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\nfrom hundreds of data sources for analytics, AI and\nstreaming applications into one place.\n\nDatabricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\ndata ingestion. To ingest any file that can land in a data\nlake, Auto Loader incrementally and automatically\nprocesses new data files as they arrive in cloud storage\nin scheduled or continuous jobs. Auto Loader scales to\nsupport near real-time ingestion of millions of files\nper hour.\n\nFor pushing data in Delta Lake, the SQL command\n[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\ninto Delta Lake. COPY INTO is best used when the input\ndirectory contains thousands of files or fewer, and the\nuser prefers SQL. COPY INTO can be used over JDBC\nto push data into Delta Lake at your convenience.\n\n\n**Step 4**\n**Leverage production-ready tools**\n**to automate ETL pipelines**\nOnce the raw data is ingested, Databricks provides\na suite of production-ready tools that allow data\nprofessionals to quickly develop and deploy extract,\n\ntransform and load (ETL) pipelines. Databricks SQL\nallows analysts to run SQL queries against the same\ntables used in production ETL workloads, allowing for\nreal-time business intelligence at scale.\n\nWith your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n[your first extract, transform and load (ETL) pipelines](https://docs.databricks.com/getting-started/etl-quick-start.html)\nfor data orchestration and learn how easy it is to create\na cluster, create a Databricks notebook, configure\n[Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for ingestion into [Delta Lake](https://docs.databricks.com/delta/index.html) , process and\ninteract with the data, and schedule a job.\n\n\nDatabricks supports workloads in SQL, Python, Scala\nand R, allowing users with diverse skill sets and\ntechnical backgrounds to leverage their knowledge\nto derive analytic insights. You can use all languages\nsupported by Databricks to define production jobs, and\nnotebooks can leverage a combination of languages.\n\nThis means that you can promote queries written by\nSQL analysts for last-mile ETL into production data\nengineering code with almost no effort. 
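The ingestion step above covers Auto Loader for incremental loads and COPY INTO for batch loads of smaller directories. A minimal sketch of both, assuming the `spark` session a Databricks notebook provides; the landing-zone path, schema location, checkpoint path and target table name are all placeholders.

```python
# Incremental ingestion with Auto Loader (the cloudFiles streaming source).
bronze = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/tmp/schemas/bronze_events")  # placeholder
    .load("/tmp/landing_zone/events")                                   # placeholder
)

(
    bronze.writeStream
    .option("checkpointLocation", "/tmp/checkpoints/bronze_events")     # placeholder
    .trigger(availableNow=True)   # process whatever has landed, then stop
    .toTable("bronze_events")     # placeholder target table
)

# Batch alternative: COPY INTO is idempotent and suited to directories
# holding up to a few thousand files.
spark.sql("""
  COPY INTO bronze_events
  FROM '/tmp/landing_zone/events'
  FILEFORMAT = JSON
""")
```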
Queries and\nworkloads defined by personas across the organization\nleverage the same data sets, so there’s no need to\nreconcile field names or make sure dashboards are up\nto date before sharing code and results with\nother teams.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "57a6fe7bc345cc9f0a87c5e3e8917062", + "Databricks supports workloads in SQL, Python, Scala\nand R, allowing users with diverse skill sets and\ntechnical backgrounds to leverage their knowledge\nto derive analytic insights. You can use all languages\nsupported by Databricks to define production jobs, and\nnotebooks can leverage a combination of languages.\n\nThis means that you can promote queries written by\nSQL analysts for last-mile ETL into production data\nengineering code with almost no effort. Queries and\nworkloads defined by personas across the organization\nleverage the same data sets, so there’s no need to\nreconcile field names or make sure dashboards are up\nto date before sharing code and results with\nother teams.\n\n\n-----\n\nWith [Delta Live Tables](https://www.databricks.com/product/delta-live-tables) (DLT), data professionals have\na framework that uses a simple declarative approach\nto build ETL and ML pipelines on batch or streaming\ndata while automating operational complexities such\nas infrastructure management, task orchestration,\nerror handling and recovery, retries, and performance\noptimization.\n\nDelta Live Tables extends functionality in Apache Spark\nStructured Streaming and allows you to write just a\nfew lines of declarative Python or SQL to deploy a\nproduction-quality data pipeline with:\n\n- [Autoscaling compute infrastructure](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#auto-scaling) for cost savings\n\n- Data quality checks with [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html)\n\n- Automatic [schema evolution](https://docs.databricks.com/ingestion/auto-loader/schema.html) handling\n\n- Monitoring via metrics in the [event log](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-event-log.html)\n\nWith DLT, engineers can also treat their data as code\nand apply software engineering best practices like\ntesting, monitoring and documentation to deploy\nreliable pipelines at scale. You can easily define end-toend data pipelines in SQL or Python and automatically\nmaintain all data dependencies across the pipeline and\nreuse ETL pipelines with environment-independent\ndata management.\n\n```\nCUSTOMER STORY: ABNORMAL SECURITY\n\n### Stopping sophisticated ransomware in its tracks\n\n```\n```\nCUSTOMER STORY: ABNORMAL SECURITY\n\n```\n\nThe increase in email phishing and ransomware attacks requires the type of protection that can scale and evolve\n\nto meet the challenges of modern cyberattacks. 
[Abnormal Security](https://abnormalsecurity.com/) , a cloud-native email security provider, knew\n\nthat scalability would become a major focus to stay ahead of attack strategies with frequent product updates.\n\nAbnormal also required a data analytics infrastructure robust enough to meet the scale requirements for its data\n\npipelines and constantly refined ML models.\n\n“We were spending too much time managing our Spark infrastructure,” said Carlos Gasperi, Software Engineer at\n\nAbnormal Security. “What we needed to be doing with that time was building the pipelines that would make the\n\nproduct better.”\n\nThe company implemented the Databricks Lakehouse Platform, which simplified its data architecture and\n\nmaximized the performance of data pipelines and analytics. Data practitioners are now able to ingest data\n\ndirectly from S3 and query it in near real-time with the help of Delta Lake, an open-format storage layer that\n\ndelivers reliability, security and performance on the data lake for both streaming and batch operations. With\n\nDatabricks SQL, data scientists are then able to create visualizations using rich dashboards to drive product\n\ndecisions and improve detection efficacy.\n\nDatabricks also provided the collaborative environment that Abnormal’s data teams needed to increase their\n\nproductivity and work in the same space without constantly competing for compute resources.\n\nWith Databricks, Abnormal has seen a 20% reduction in successful email attacks, a 40% reduction in\n\ninfrastructure costs and a 30% increase in productivity. [Read the full story here.](https://www.databricks.com/customers/abnormal)\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "1d885a2ec6fd20eca29296519748aef4", + "product better.”\n\nThe company implemented the Databricks Lakehouse Platform, which simplified its data architecture and\n\nmaximized the performance of data pipelines and analytics. Data practitioners are now able to ingest data\n\ndirectly from S3 and query it in near real-time with the help of Delta Lake, an open-format storage layer that\n\ndelivers reliability, security and performance on the data lake for both streaming and batch operations. With\n\nDatabricks SQL, data scientists are then able to create visualizations using rich dashboards to drive product\n\ndecisions and improve detection efficacy.\n\nDatabricks also provided the collaborative environment that Abnormal’s data teams needed to increase their\n\nproductivity and work in the same space without constantly competing for compute resources.\n\nWith Databricks, Abnormal has seen a 20% reduction in successful email attacks, a 40% reduction in\n\ninfrastructure costs and a 30% increase in productivity. [Read the full story here.](https://www.databricks.com/customers/abnormal)\n\n\n-----\n\nDelta Live Tables Enhanced Autoscaling is designed to handle streaming workloads\nthat trigger intermittently and are unpredictable. It optimizes cluster utilization\nby only scaling up to the necessary number of nodes while maintaining endto-end SLAs, and gracefully shuts down nodes when utilization is low to avoid\nunnecessary idle node capacity.\n\n\nDelta Live Tables helps prevent bad data from flowing into tables through validation,\nintegrity checks and predefined error policies. 
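The surrounding chunks describe Delta Live Tables' declarative pipelines and its expectations for data quality. Below is a short, hypothetical DLT pipeline in Python showing both; it only runs as part of a DLT pipeline, and the source path plus the `event_id`/`event_ts` columns are invented for illustration.

```python
import dlt
from pyspark.sql.functions import col


@dlt.table(comment="Raw events ingested incrementally with Auto Loader")
def raw_events():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("/tmp/landing_zone/events")  # placeholder source directory
    )


# Expectation: rows with a NULL event_id are dropped and counted in the event log.
@dlt.table(comment="Validated events")
@dlt.expect_or_drop("valid_event_id", "event_id IS NOT NULL")
def clean_events():
    return dlt.read_stream("raw_events").select(col("event_id"), col("event_ts"))
```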
In addition, you can monitor data\n\nquality trends over time to get insight into how your data is evolving and where\nchanges may be necessary.\n\n\n-----\n\n**Step 5**\n**Use Databricks SQL for serverless compute**\n[Databricks SQL (DB SQL)](https://www.databricks.com/product/databricks-sql) is a serverless data\nwarehouse on the Lakehouse Platform for running your\nSQL and BI applications at scale with up to 12x better\nprice/performance. It’s imperative for younger, growing\ncompanies to reduce resource contention, and one way\nto accomplish that is with serverless compute. Running\nserverless removes the need to manage, configure or\nscale cloud infrastructure on the lakehouse, freeing up\nyour data team for what they do best.\n\n\nSee for yourself in this tutorial on [how to run and visualize](https://docs.databricks.com/sql/get-started/user-quickstart.html)\n[a query in Databrick SQL](https://docs.databricks.com/sql/get-started/user-quickstart.html) and create dashboards on data\nstored in your data lake.\n\nThe Databricks SQL REST API supports services to\nmanage queries and dashboards, query history and SQL\nwarehouses.\n\n\nDatabricks SQL warehouses provide instant, elastic\nSQL compute — decoupled from storage — and will\nautomatically scale to provide unlimited concurrency\nwithout disruption, for high concurrency use cases. DB\nSQL has data governance and security built in. Handle\nhigh concurrency with fully managed load balancing\nand scaling of compute resources.\n\n\n-----\n\n**Faster queries with Photon**\n[Photon](https://www.databricks.com/product/photon) is a new vectorized query engine designed\nto deliver dramatic infrastructure cost savings and\naccelerate all data and analytics workloads: data\ningestion, ETL, streaming, interactive queries, data\nscience and machine learning.\n\nPhoton is used by default in Databricks SQL. To\nenable Photon acceleration, select the **Use Photon**\n**Acceleration** checkbox when you create the cluster.\nIf you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\nset runtime_engine to PHOTON.\n\nPhoton supports a number of instance types on\nthe driver and worker nodes. Photon instance types\nconsume DBUs at a different rate than the same\ninstance type running the non-Photon runtime. For\nmore information about Photon instances and DBU\nconsumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "5c332356b67f14614107f85b62a638e4", + "Photon is used by default in Databricks SQL. To\nenable Photon acceleration, select the **Use Photon**\n**Acceleration** checkbox when you create the cluster.\nIf you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\nset runtime_engine to PHOTON.\n\nPhoton supports a number of instance types on\nthe driver and worker nodes. Photon instance types\nconsume DBUs at a different rate than the same\ninstance type running the non-Photon runtime. 
For\nmore information about Photon instances and DBU\nconsumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)\n\nPhoton will seamlessly coordinate work and resources\nand transparently accelerate portions of your SQL and\nSpark queries. No tuning or user intervention required.\nPhoton is compatible with Apache Spark APIs, so\ngetting started is as easy as turning it on — no code\nchange and no lock- in. Written entirely in C++, Photon\nprovides an additional [2x speedup over Apache Spark](https://www.databricks.com/product/photon)\nper the TPC-DS 1TB benchmark, and customers have\nobserved 3x–8x speedups on average.\n\n\nWith Photon, typical customers are seeing up to [80% TCO savings](https://www.databricks.com/blog/2022/08/03/announcing-photon-engine-general-availability-on-the-databricks-lakehouse-platform.html#:~:text=Up%20to%2080%25%20TCO%20cost%20savings%20%2830%25%20on,Photon%203-8x%20faster%20queries%20on%20interactive%20SQL%20workloads) over traditional\nDatabricks Runtime (Apache Spark) and up to 85% reduction in VM compute hours.\n\nLearn how to connect BI tools to Databricks SQL\ncompute resources with the following user guides:\n\n\n[Queries](https://docs.databricks.com/sql/user/queries/index.html)\n\n[Visualizations](https://docs.databricks.com/sql/user/visualizations/index.html)\n\n\n[Favorites and tags](https://docs.databricks.com/sql/user/favorites-tags.html)\n\n[Workspace browser](https://docs.databricks.com/sql/user/workspace-browser/index.html)\n\n\n[Dashboards](https://docs.databricks.com/sql/user/dashboards/index.html)\n\n[Alerts](https://docs.databricks.com/sql/user/alerts/index.html)\n\n\n-----\n\n**Step 6**\n**Orchestrate workflows**\nDatabricks provides a comprehensive suite of tools and integrations to support your\ndata processing workflows.\n\nDatabricks [Workflows](https://www.databricks.com/product/workflows) removes operational overhead by offering fully managed\norchestration service for all your teams, so you can focus on your workflows, not on\nmanaging your infrastructure. Orchestrate diverse workloads for the full lifecycle\nincluding Delta Live Tables, [Jobs](https://docs.databricks.com/workflows/index.html) for SQL, [Spark](https://www.databricks.com/product/spark) , notebooks, dbt, ML models and more.\n\nHere’s a tutorial on how to [create your first workflow with a Databricks job](https://docs.databricks.com/workflows/jobs/jobs-quickstart.html) . You will\nlearn how to create notebooks, create and run a job, view the run details, and run jobs\nwith different parameters.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "ec1f30eb42cf277550e929aecf858722", + "Databricks [Workflows](https://www.databricks.com/product/workflows) removes operational overhead by offering fully managed\norchestration service for all your teams, so you can focus on your workflows, not on\nmanaging your infrastructure. Orchestrate diverse workloads for the full lifecycle\nincluding Delta Live Tables, [Jobs](https://docs.databricks.com/workflows/index.html) for SQL, [Spark](https://www.databricks.com/product/spark) , notebooks, dbt, ML models and more.\n\nHere’s a tutorial on how to [create your first workflow with a Databricks job](https://docs.databricks.com/workflows/jobs/jobs-quickstart.html) . 
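Step 6 above points to creating a first workflow as a Databricks job. A minimal single-task job definition submitted to the Jobs API is sketched below; the notebook path, cluster settings, host and token are placeholders.

```python
import requests

host = "https://<workspace-host>"   # placeholder
token = "<personal-access-token>"   # placeholder

job_spec = {
    "name": "nightly-etl",  # placeholder job name
    "tasks": [
        {
            "task_key": "ingest_and_transform",
            "notebook_task": {
                "notebook_path": "/Workspace/Users/me@example.com/etl"  # placeholder
            },
            "new_cluster": {
                "spark_version": "14.3.x-scala2.12",
                "node_type_id": "i3.xlarge",
                "num_workers": 1,
            },
        }
    ],
}

resp = requests.post(
    f"{host}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {token}"},
    json=job_spec,
    timeout=30,
)
resp.raise_for_status()
print("Created job:", resp.json()["job_id"])
```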
You will\nlearn how to create notebooks, create and run a job, view the run details, and run jobs\nwith different parameters.\n\n\n-----\n\n**Step 7**\n**Run an end-to-end analytics pipeline**\nThis where you can see how everything works together to run efficiently at scale. First\ntake the quickstart: [Running end-to-end lakehouse analytics pipelines](https://docs.databricks.com/getting-started/lakehouse-e2e.html) , where you\nwill write to and read data from an external location managed by Unity Catalog and\nconfigure Auto Loader to ingest data to Unity Catalog.\n\n###### Resources:\n\n- [Databricks Lakehouse free trial](https://www.databricks.com/try-databricks?itm_data=DataLakehouse-HeroCTA-Trial#account)\n\n- [The Lakehouse for companies born in the cloud](https://www.databricks.com/solutions/audience/digital-native)\n\n- [How DuPont achieved 11x latency reduction and 4x cost reduction with Photon](https://www.databricks.com/blog/2022/10/04/how-dupont-achieved-11x-latency-reduction-and-4x-cost-reduction-photon.html)\n\n- [Apache Spark on Databricks](https://docs.databricks.com/spark/index.html)\n\n- [Discover Lakehouse solutions](https://www.databricks.com/solutions)\n\n- [Databricks documentation](https://docs.databricks.com/)\n\n\n###### “Databricks Workflows allows our analysts to easily create, run, monitor and repair data pipelines without managing any infrastructure. This enables them to have full autonomy in designing and improving ETL processes that produce must-have insights for our clients. We are excited to move our Airflow pipelines over to Databricks Workflows.”\n —Anup Segu, Senior Software Engineer, YipitData\n\n[Learn more.](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n# 03\n```\nCHALLENGE: \u0003\n\n## Building effective machine-learning operations\n\n```\n\n-----\n\n```\nCHALLENGE 03\n\n### Building effective machine-learning operations\n\n```\nGrowing startups and digital native companies face several challenges when they\nstart building, maintaining and scaling machine learning operations (MLOps) for their\ndata science teams.\n\n\nMLOps is different from DevOps. DevOps practices\nand tooling alone are insufficient because ML\napplications rely on an assortment of artifacts (e.g.,\nmodels, data, code) that can each require different\nmethods of experiment tracking, model training,\nfeature development, governance, feature and\nmodel serving.\n\nFor data teams beginning their machine learning\njourneys, the challenge of training data models can\nbe labor-intensive and not cost-effective because\nthe data has to be converted into features and\n\ntrained on a separate machine learning platform\n\n\nData teams often perform development in\ndisjointed, siloed stacks spanning DataOps,\nModelOps and DevOps\n\nDevelopment and training environment\ndisconnect. Moving code and data between\npersonal development environments and\nmachine learning platforms for model training\nat scale is error prone and cumbersome. 
The\n“it worked on my machine” problem.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "f75cf4ba1fcfd8ce9fced708407dd9ca", + "```\nCHALLENGE 03\n\n### Building effective machine-learning operations\n\n```\nGrowing startups and digital native companies face several challenges when they\nstart building, maintaining and scaling machine learning operations (MLOps) for their\ndata science teams.\n\n\nMLOps is different from DevOps. DevOps practices\nand tooling alone are insufficient because ML\napplications rely on an assortment of artifacts (e.g.,\nmodels, data, code) that can each require different\nmethods of experiment tracking, model training,\nfeature development, governance, feature and\nmodel serving.\n\nFor data teams beginning their machine learning\njourneys, the challenge of training data models can\nbe labor-intensive and not cost-effective because\nthe data has to be converted into features and\n\ntrained on a separate machine learning platform\n\n\nData teams often perform development in\ndisjointed, siloed stacks spanning DataOps,\nModelOps and DevOps\n\nDevelopment and training environment\ndisconnect. Moving code and data between\npersonal development environments and\nmachine learning platforms for model training\nat scale is error prone and cumbersome. The\n“it worked on my machine” problem.\n\nGathering high-quality data. Data that is siloed\nacross the organization is hard to discover,\ncollect, clean and use. This leads to stale data\nand delays in development of models.\n\n\nSee **Create a unified data architecture.**\n```\n 03\n\n```\n\n-----\n\n###### Siloed stacks spanning DataOps, ModelOps and DevOps\n\nWhen data engineers help ingest, refine and prep\ndata, they do so on their own stack. This data has\nto be converted into features and then trained on\na separate machine learning platform. This cross-\nplatform handoff often results in data staleness,\ndifficulty in maintaining versions, and eventually,\npoorly performing models. Even after you have\ntrained your model, you have to deal with yet another\ntech stack for model deployment. It’s challenging\nto serve features in real time and difficult to trace\nproblems in production back to the data.\n\nThe downstream business impact is massive —\nlonger and more expensive projects, and lower\nmodel accuracy in production leading to declining\nbusiness metrics.\n\nIf you are looking at launching or scaling your\nMLOps, you should probably focus on an incremental\nstrategy. At Databricks, we see firsthand how\ncustomers develop their MLOps approaches across\na huge variety of teams and businesses. [Check out](https://www.youtube.com/watch?v=JApPzAnbfPI)\n[this Data +AI Summit session](https://www.youtube.com/watch?v=JApPzAnbfPI) to learn more about\nbuilding robust MLOps practices.\n\n\n###### Databricks solution:\n\nDatabricks Machine Learning is an integrated\nend-to-end machine learning environment\nincorporating managed services for experiment\ntracking, model training, feature development and\nmanagement, and model serving. The capabilities\nof Databricks map directly to the steps of model\ndevelopment and deployment. 
With Databricks\nMachine Learning, you can:\n\n\nTrain models either manually or with AutoML\n\nTrack training parameters and models using\nexperiments with MLflow tracking\n\nCreate feature tables and access them for model\ntraining and inference\n\nShare, manage and serve models using MLflow\nModel Registry\n\nDeploy models for Serverless Real-time Inference\n\n\n-----\n\n###### Use MLOps on the Databricks Lakehouse Platform\n\nTo gain efficiencies and reduce costs, many smaller\ndigital companies are employing machine learning\noperations. MLOps is a set of processes and\nautomation for managing models, data and code, and\nunique library dependencies to improve performance\nstability and long-term efficiency in ML systems.\n\nTo describe it simply, MLOps = ModelOps + DataOps +\nDevOps. The aim of MLOps is to improve the long-term\nperformance, stability and success rate of ML systems\nwhile maximizing the efficiency of the teams who\nbuild them.\n\n\nNot only does MLOps improve organizational efficiency,\nit also allows the models to iterate faster and react\nto real-life changes in the data. This ability separates\ncompanies that can grow to meet their customer’s\nchallenges in a reactive manner versus those that will\nspend significant time on data updates/processes and\nmiss the opportunity to do something with\ntheir models.\n\nThe absence of MLOps is typically marked by an\noverabundance of manual processes which are slower", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "49385fba85744ca2e26f86c0a4b6ffd8", + "Deploy models for Serverless Real-time Inference\n\n\n-----\n\n###### Use MLOps on the Databricks Lakehouse Platform\n\nTo gain efficiencies and reduce costs, many smaller\ndigital companies are employing machine learning\noperations. MLOps is a set of processes and\nautomation for managing models, data and code, and\nunique library dependencies to improve performance\nstability and long-term efficiency in ML systems.\n\nTo describe it simply, MLOps = ModelOps + DataOps +\nDevOps. The aim of MLOps is to improve the long-term\nperformance, stability and success rate of ML systems\nwhile maximizing the efficiency of the teams who\nbuild them.\n\n\nNot only does MLOps improve organizational efficiency,\nit also allows the models to iterate faster and react\nto real-life changes in the data. This ability separates\ncompanies that can grow to meet their customer’s\nchallenges in a reactive manner versus those that will\nspend significant time on data updates/processes and\nmiss the opportunity to do something with\ntheir models.\n\nThe absence of MLOps is typically marked by an\noverabundance of manual processes which are slower\n\n\nand more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck,\ncapping the ability for a data team to take on new projects. The process is complex. In larger organizations,\nseveral specialists and stakeholders can be involved in one ML project. But data practitioners at smaller digital\nnatives and high-growth startups may be forced to wear several hats.\n\n\n-----\n\nAnd once an ML project goes into production, the\nMLOps continues, since the models, data and code\nchange over time due to regulatory and business\nrequirements. But the ML system must be resilient and\nflexible. 
Addressing these challenges with a defined\nMLOps strategy can dramatically reduce the iteration\ncycle of delivering models to production.\n\n\n-----\n\n###### Steps in machine learning model development and deployment:\n\n\n**Step 1**\n**Data preparation**\nManually preparing and labeling data is a thankless,\ntime-consuming job. With Databricks, teams can\nlabel data with human effort, machine learning\nmodels in Databricks, or a combination of both.\nTeams can also employ a [model-assisted labeling](https://labelbox.com/product/automation )\nworkflow that allows humans to easily inspect and\ncorrect a model’s predicted labels. This process can\ndrastically reduce the amount of unstructured data\nyou need to achieve strong model performance.\n\nThe [Databricks Runtime for Machine Learning](https://docs.databricks.com/runtime/mlruntime.html) is a\nready-to-go environment with many external\nlibraries, including TensorFlow, PyTorch, Horovod,\nscikit-learn and XGBoost. It provides\nextensions to improve performance, including GPU\nacceleration in XGBoost, distributed deep\nlearning using HorovodRunner, and model\ncheckpointing.\n\nTo use Databricks Runtime ML, select the ML version\nof the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\naccess data in Unity Catalog for machine learning\nworkflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\nisolation clusters are not compatible with Databricks\nRuntime for Machine Learning.\n\n\nMachine learning applications often\nneed to use shared storage for data\nloading and model checkpointing. You\ncan load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\nfiles. A table is a collection of\nstructured data stored as a directory\non cloud object storage.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "04ad6067c8620647b09f29ae19400f5a", + "To use Databricks Runtime ML, select the ML version\nof the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\naccess data in Unity Catalog for machine learning\nworkflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\nisolation clusters are not compatible with Databricks\nRuntime for Machine Learning.\n\n\nMachine learning applications often\nneed to use shared storage for data\nloading and model checkpointing. You\ncan load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\nfiles. A table is a collection of\nstructured data stored as a directory\non cloud object storage.\n\nFor [data preprocessing](https://docs.databricks.com/machine-learning/preprocess-data/index.html) , you can\nuse [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) to create\nnew features, explore and reuse\nexisting features, track lineage and\nfeature creation code, and publish\nfeatures to low-latency online stores\nfor real-time inference. The Feature\nStore is a centralized repository\nthat enables data scientists to find\nand share features. It ensures that\nthe same code used to compute\nthe feature values is used for model\ntraining and inference. 
The Feature\nStore library is available only on\nDatabricks Runtime for Machine\nLearning and is accessible through\nDatabricks notebooks and workflows.\n\n\n###### Resources:\n\n- [The Comprehensive Guide to Feature Stores](https://www.databricks.com/resources/ebook/the-comprehensive-guide-to-feature-stores)\n\n- [Load data for machine learning and deep learning](https://docs.databricks.com/machine-learning/load-data/index.html)\n\n- [Preprocess data for machine learning and](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n[deep learning](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n\n\n-----\n\nC `USTOMER STORY: ZIPLINE`\n\n### Data-driven drones deliver lifesaving medical aid around the world\n\n\nAutomated logistics and delivery system\n\nprovider [Zipline](https://www.flyzipline.com/ ) is redefining logistics by using\n\ncutting-edge drone technology and a global\n\nautonomous logistics network to save lives\n\n\ninformation they need to accurately measure success, find\n\nthe metrics that relate to customer experiences or logistics,\n\nand improve on them exponentially as more data is ingested\n\nand machine learning models are refined.\n\n\nby giving remote communities access to\n\n\nemergency and preparatory medical aid and\n\nresources, regardless of where they are in the\n\nworld.\n\nDoing so requires the ability to ingest and\n\nanalyze huge chunks of time series data in real\n\ntime. This data is produced every time a drone\n\ntakes flight and includes performance data,\n\nin-flight battery management, regional weather\n\npatterns, geographic obstacles, landing errors\n\nand a litany of other information that must be\n\nprocessed.\n\n\n“About 30% of the deliveries we do are lifesaving emergency\n\ndeliveries, where the product being delivered does not exist\n\nat the hospital. We have to be fast, and we have to be able\n\nto rely on all the different kinds of data to predict failures\n\nbefore they occur so that we can guarantee a really, really\n\nhigh service level to the people who are literally depending\n\non us with their lives,” said Zipline CEO Keller Rinaudo.\n\n“Databricks gives us confidence in our operations, and\n\nenables us to continuously improve our technology, expand\n\nour impact, and provide lifesaving aid where and when it’s\n\nneeded, every single day.”", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "103fa9f67351003f0db724bd575fe49e", + "analyze huge chunks of time series data in real\n\ntime. This data is produced every time a drone\n\ntakes flight and includes performance data,\n\nin-flight battery management, regional weather\n\npatterns, geographic obstacles, landing errors\n\nand a litany of other information that must be\n\nprocessed.\n\n\n“About 30% of the deliveries we do are lifesaving emergency\n\ndeliveries, where the product being delivered does not exist\n\nat the hospital. 
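The preprocessing chunk above introduces Databricks Feature Store as a centralized repository so the same feature code serves training and inference. A small, hypothetical sketch of registering a feature table (requires the ML runtime); the table name, keys and toy data are invented for illustration, and `spark` is assumed from the notebook environment.

```python
from databricks.feature_store import FeatureStoreClient

# Toy feature DataFrame; in practice this would be computed from raw tables.
customer_features_df = spark.createDataFrame(
    [(1, 3, 120.0), (2, 7, 310.5)],
    "customer_id INT, orders_30d INT, spend_30d DOUBLE",
)

fs = FeatureStoreClient()
fs.create_table(
    name="main.demo.customer_features",   # placeholder catalog.schema.table
    primary_keys=["customer_id"],
    df=customer_features_df,
    description="Rolling 30-day order counts and spend per customer",
)
```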
We have to be fast, and we have to be able\n\nto rely on all the different kinds of data to predict failures\n\nbefore they occur so that we can guarantee a really, really\n\nhigh service level to the people who are literally depending\n\non us with their lives,” said Zipline CEO Keller Rinaudo.\n\n“Databricks gives us confidence in our operations, and\n\nenables us to continuously improve our technology, expand\n\nour impact, and provide lifesaving aid where and when it’s\n\nneeded, every single day.”\n\n[Read full story here.](https://www.databricks.com/customers/zipline)\n\n\nEvery Zipline flight generates a gigabyte of data\n\nwith potential life-or-death consequences,\n\nbut accessing and federating the data for both\n\ninternal and external decision-making was\n\nchallenging. With Databricks as the common\n\nplatform, Zipline’s data team can access all the\n\n\n-----\n\n**Step 2**\n**Model training**\nFor training machine learning and deep learning\nmodels, you can use [AutoML](https://docs.databricks.com/machine-learning/automl/index.html) , which automatically\nprepares a data set for model training, performs a set\nof trials using open-source libraries such as scikit-learn\nand XGBoost, and creates a Python notebook with\nthe source code for each trial run so you can review,\nreproduce and modify the code.\n\nIn Databricks, [notebooks](https://docs.databricks.com/notebooks/index.html) are the primary tool for\ncreating data science and machine learning workflows\nand collaborating with colleagues. Databricks\nnotebooks provide real-time coauthoring in multiple\nlanguages, automatic versioning and built-in data\nvisualizations.\n\n\n###### Resources:\n\n- [Model training examples](https://docs.databricks.com/machine-learning/train-model/index.html)\n\n- [Training models with Feature Store](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n\n- [Best practices for deep learning on Databricks](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n\n- [Machine learning quickstart notebook](https://docs.databricks.com/machine-learning/train-model/ml-quickstart.html)\n\n\n-----\n\n###### Resources:\n\n- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n\n- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n\n- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n\n- [Track ML Model training data with Delta Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n\n- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "211ac77249d86b077653e7b60ecf7232", + "-----\n\n###### Resources:\n\n- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n\n- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n\n- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n\n- [Track ML Model training data with Delta 
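Step 2 above describes AutoML preparing a data set, running trials with open-source libraries and generating an editable notebook per trial. A hedged sketch of kicking off an AutoML classification run from a notebook; the training table, label column and the `model_path` attribute access are assumptions to verify against the AutoML docs.

```python
from databricks import automl

# Assumed training data already registered as a table; replace with a real name.
train_df = spark.table("main.demo.churn_training")

summary = automl.classify(
    dataset=train_df,
    target_col="churned",    # placeholder label column
    timeout_minutes=30,
)

# Each trial produces a generated notebook plus an MLflow run; inspect the best one.
print(summary.best_trial.model_path)
```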
Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n\n- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)\n\n\n**Step 3**\n**Track model development**\nThe model development process is iterative, and can\nbe challenging. You can use [MLflow tracking](https://mlflow.org/docs/latest/tracking.html) to help\nyou keep track of the model development process,\nincluding parameter settings or combinations you have\ntried and how they affected the model’s performance.\n\nMLflow tracking uses experiments and runs to log\nand track your model development. A run is a single\nexecution of model code. An experiment is a collection\nof related runs. Within an experiment, you can compare\nand filter runs to understand how your model performs\nand how its performance depends on the parameter\nsettings, input data, etc.\n\nMLflow can automatically log training code written\nin many ML frameworks. This is the easiest way to\nget started using MLflow tracking. With MLflow’s\nautologging capabilities, a single line of code\nautomatically logs the resulting model.\n\n\nA hosted version of MLflow Model Registry can help\n[manage the full lifecycle](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) of MLflow models. You can\napply webhooks to automatically trigger actions based\non registry events. For example, you can trigger CI\nbuilds when a new model version is created or notify\nyour team members through Slack each time a model\ntransition to production is requested. This promotes\na traceable version control work process. You can\nleverage this feature for web traffic A/B testing and\nfunneled to versions of deployed models for more\nprecise population studies.\n\n\n**Step 4**\n**Deploy machine learning models**\nYou can use MLflow to deploy models for batch or\nstreaming inference or to set up a REST endpoint to\nserve the model. Simplify your model deployment by\nregistering models to [the MLflow Model Registry](https://docs.databricks.com/mlflow/model-registry.html) . After\nyou have registered your model, you can [automatically](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb)\n[generate a notebook](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb) for batch inference or configure\nthe model for online serving with Serverless RealTime Inference or [Classic MLflow Model Serving on](https://docs.databricks.com/archive/classic-model-serving/model-serving.html)\n\n[Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html) . For model inference for deep learning\napplications, Databricks recommends the following\nworkflow.\n\nTo debug and tune model inference on Databricks,\nusing GPUs (graphics processing units) can efficiently\noptimize the running speed for model inference. As\nGPUs and other accelerators become faster, it is\nimportant that the data input pipeline keep up with\ndemand. 
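The tracking passage above notes that a single autologging line captures parameters, metrics and the resulting model. A self-contained sketch with scikit-learn; the data set and model choice are arbitrary examples, not anything prescribed by the guide.

```python
import mlflow
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split

mlflow.autolog()  # the single line: params, metrics and the fitted model are logged

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run(run_name="rf-baseline"):
    model = RandomForestRegressor(n_estimators=200, max_depth=6)
    model.fit(X_train, y_train)
    print("Held-out R^2:", model.score(X_test, y_test))
```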
The data input pipeline reads the data into\nSpark DataFrames, transforms it and loads it as the\ninput for model inference.\n\n\n-----\n\n```\nCUSTOMER STORY: ITERABLE\n\n### Optimizing touch points across the entire customer journey\n\n```\n“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meet", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "5ad7d5b602b0286dc3f06bfc52998475", + "[Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html) . For model inference for deep learning\napplications, Databricks recommends the following\nworkflow.\n\nTo debug and tune model inference on Databricks,\nusing GPUs (graphics processing units) can efficiently\noptimize the running speed for model inference. As\nGPUs and other accelerators become faster, it is\nimportant that the data input pipeline keep up with\ndemand. The data input pipeline reads the data into\nSpark DataFrames, transforms it and loads it as the\ninput for model inference.\n\n\n-----\n\n```\nCUSTOMER STORY: ITERABLE\n\n### Optimizing touch points across the entire customer journey\n\n```\n“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meet\n\nrising consumer demands for more personalized experiences that drive revenue and results.” —Sinéad Cheung,\n\nPrincipal Product Manager, [Iterable](https://iterable.com/)\n\nCaptivating an audience and understanding customer journeys are essential to creating deeper brand- customer\n\nconnections that drive growth, loyalty and revenue. From helping medical practitioners build trust with new\n\npatients to ensuring that food delivery users feel connected to their culinary community, Iterable helps more\n\nthan 1,000 brands optimize and humanize their marketing in today’s competitive landscape.\n\nThis need to build personalized and automated customer experiences for its clients drove the company to find a\n\nfully managed platform that would simplify infrastructure management, make collaboration possible, and give it\n\nthe ability to scale for analytics and AI.\n\nWith Databricks Lakehouse, Iterable can harness diverse, complex data sets — including conversion events,\n\nunique user labels, engagement patterns and business insights — and facilitate rapid prototyping of machine\n\nlearning models that deliver top-notch and personalized user experiences for higher-converting marketing\n\ncampaigns. [Read the full story here.](https://www.databricks.com/customers/iterable)\n\n\n-----\n\n###### ML Stages\n\nML workflows include the following key assets: code,\nmodels and data. These assets need to be developed\n(dev), tested (staging) and deployed (production).\nEach stage needs to operate within an execution\nenvironment. So the execution environments, code,\nmodels and data are divided into dev, staging and\nproduction.\n\nML project code is often stored in a version control\nrepository (such as Git), with most organizations using\nbranches corresponding to the lifecycle phases of\ndevelopment, staging or production.\n\nSince model lifecycles do not correspond one-toone with code lifecycles, it makes sense for model\nmanagement to have its own service. MLflow and its\nModel Registry support managing model artifacts\ndirectly via UI and APIs. 
The loose coupling of model\nartifacts and code provides flexibility to update\nproduction models without code changes, streamlining\nthe deployment process in many cases.\n\nDatabricks recommends creating separate\nenvironments for the different stages of ML code and\nmodel development with clearly defined transitions\nbetween stages. The recommended MLOps workflow is\nbroken into these three stages:\n\n\n[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\nis experimentation. Data scientists develop features\nand models and run experiments to optimize model\nperformance. The output of the development process is\nML pipeline code that can include feature computation,\nmodel training inference and monitoring\n\n\n-----\n\n[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\nThis stage focuses on testing the ML pipeline code\nfor production readiness, including code for model\ntraining as well as feature engineering pipelines and\ninference code. The output of the staging process is a\nrelease branch that triggers the CI/CD system to start\nthe production stage.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "0929758b2300251054152ee60e8367e6", + "Databricks recommends creating separate\nenvironments for the different stages of ML code and\nmodel development with clearly defined transitions\nbetween stages. The recommended MLOps workflow is\nbroken into these three stages:\n\n\n[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\nis experimentation. Data scientists develop features\nand models and run experiments to optimize model\nperformance. The output of the development process is\nML pipeline code that can include feature computation,\nmodel training inference and monitoring\n\n\n-----\n\n[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\nThis stage focuses on testing the ML pipeline code\nfor production readiness, including code for model\ntraining as well as feature engineering pipelines and\ninference code. The output of the staging process is a\nrelease branch that triggers the CI/CD system to start\nthe production stage.\n\n\n-----\n\n[Production](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#production-stage)\nML engineers own the production environment\nwhere ML pipelines are deployed. These pipelines\ncompute fresh feature values, train and test new model\nversions, publish predictions to downstream tables\nor applications, and monitor the entire process to\navoid performance degradation and instability. Data\nscientists have visibility to test results, logs, model\nartifacts and production pipeline status to allow them\nto identify and diagnose problems in production.\n\nThe Databricks Machine Learning home page provides\nquick access to all the machine learning resources. To\naccess this page, move your mouse or pointer over\nthe left sidebar in the Databricks workspace. 
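The ML Stages chunk above highlights MLflow Model Registry managing model artifacts through APIs, decoupled from the code lifecycle. A short sketch of registering a logged model and requesting a stage transition; the run ID and registry name are placeholders, and stage transitions reflect the classic workspace registry workflow rather than a required approach.

```python
import mlflow
from mlflow.tracking import MlflowClient

run_id = "<run-id-from-a-finished-training-run>"  # placeholder
model_uri = f"runs:/{run_id}/model"               # assumes the artifact path "model"

result = mlflow.register_model(model_uri, "churn_model")  # placeholder registry name

client = MlflowClient()
client.transition_model_version_stage(
    name="churn_model",
    version=result.version,
    stage="Staging",
)
```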
From\nthe persona switcher at the top of the sidebar, select\n\nMachine Learning.\n\nFrom the shortcuts menu, you can create\na [notebook](https://docs.databricks.com/notebooks/index.html) , [start AutoML](https://docs.databricks.com/machine-learning/automl/index.html) or open a [tutorial notebook](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) .\nThe center of the screen includes any recently viewed\nitems, and the sidebar provides quick access to\nthe [Experiments page](https://docs.databricks.com/mlflow/tracking.html#mlflow-experiments) , [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) and\n[Model Registry.](https://docs.databricks.com/mlflow/model-registry.html)\nNew users can get started with a series of [tutorials](https://docs.databricks.com/machine-learning/tutorial/index.html)\nthat illustrate how to use Databricks throughout the\n\n\n-----\n\n###### Resources:\n\n- [MLOps Virtual Event: Standardizing MLOps at Scale](https://www.databricks.com/p/webinar/mlops-virtual-event)\n\n- [Virtual Event — Automating the ML Lifecycle With](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n[Databricks Machine Learning](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n\n- [MLOps Virtual Event “Operationalizing Machine](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n[Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n\n- [The Big Book of MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "f1fccacf5e51dabf1c04271b9515d627", + "- [MLOps Virtual Event “Operationalizing Machine](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n[Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n\n- [The Big Book of MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)\n\n- [Machine learning on Databricks](https://www.databricks.com/product/machine-learning)\n\n- [Watch the demos](https://www.databricks.com/discover/demos)\n\n\nML lifecycle or access the [in-product quickstart](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html)\nfor a model-training tutorial notebook that steps\nthrough loading data, training and tuning a model,\ncomparing and analyzing model performance and\nusing the model for inference.\n\nAlso be sure to download the [Big Book of MLOps](https://www.databricks.com/p/thank-you/the-big-book-of-mlops) to\nlearn how your organization can build a robust MLOPs\npractice incrementally.\n\n\n-----\n\n# 04\n```\nSUMMARY: \u0003\n\n## The Databricks Lakehouse Platform addresses these challenges\n 04\n\n```\n\n-----\n\n### Summary\n\nWe’ve organized the common data challenges for startups and growing digital native\n\nbusinesses into three main buckets: Building a **unified data architecture** — one that\n\nsupports **scalability and performance** ; and building effective **machine learning**\n\n**operations** , all with an eye on cost efficiency and increased productivity.\n\nThe Lakehouse Platform 
provides an efficient and scalable architecture that solves\nthese challenges and will support your data, analytics and AI workloads now and as\nyou scale.\n\nWith [Databricks](https://www.databricks.com/) you can unify all your data with cost-efficient architecture for highly\nperformant digital native applications and analytic workloads — designed to scale as\nyou grow. Use your data however and wherever you want with open-source flexibility,\nleverage open formats, APIs and your tools of choice. Ensure reliable, high-performing\ndata workloads while Databricks automatically manages your infrastructure as you\nscale. Leverage serverless Databricks SQL to increase productivity and scale on\ndemand with up to 12x better price/performance.\n\nEasily access data for ML models and accelerate the full ML lifecycle from\nexperimentation to production.\n\nDiscover more about the lakehouse for companies born in the cloud **.**\n\n\n-----\n\n### Get started with Databricks Trial\n\nGet a collaborative environment for data teams to build\nsolutions together with interactive notebooks to use\nApache Spark™, SQL, Python, Scala, Delta Lake, MLflow,\nTensorFlow, Keras, scikit-learn and more.\n\n\n### Get started with About Databricks Trial Databricks\n\nGet a collaborative environment for data teams to build Databricks is the lakehouse company. More than 7,000\nsolutions together with interactive notebooks to use organizations worldwide — including Comcast, Condé\nApache Spark™, SQL, Python, Scala, Delta Lake, MLflow, Nast and over 50% of the Fortune 500 — rely on the\nTensorFlow, Keras, scikit-learn and more. Databricks Lakehouse Platform to unify their data,\n\nanalytics and AI. Databricks is headquartered in San\n\nAvailable as a 14-day full trial in your own cloud or as\n\nFrancisco, with offices around the globe. Founded by\n\na lightweight trial hosted by Databricks.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "20ade079c6a0bfe78a147f5842812b5e", + "-----\n\n### Get started with Databricks Trial\n\nGet a collaborative environment for data teams to build\nsolutions together with interactive notebooks to use\nApache Spark™, SQL, Python, Scala, Delta Lake, MLflow,\nTensorFlow, Keras, scikit-learn and more.\n\n\n### Get started with About Databricks Trial Databricks\n\nGet a collaborative environment for data teams to build Databricks is the lakehouse company. More than 7,000\nsolutions together with interactive notebooks to use organizations worldwide — including Comcast, Condé\nApache Spark™, SQL, Python, Scala, Delta Lake, MLflow, Nast and over 50% of the Fortune 500 — rely on the\nTensorFlow, Keras, scikit-learn and more. Databricks Lakehouse Platform to unify their data,\n\nanalytics and AI. Databricks is headquartered in San\n\nAvailable as a 14-day full trial in your own cloud or as\n\nFrancisco, with offices around the globe. Founded by\n\na lightweight trial hosted by Databricks.\n\nthe original creators of Apache Spark™, Delta Lake and\nMLflow, Databricks is on a mission to help data teams\nsolve the world’s toughest problems. 
To learn more,\nfollow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n**[TRY DATABRICKS FOR FREE](https://www.databricks.com/try-databricks?itm_data=H#account)**\n\n\n\n- Available as a 14-day full trial in your own cloud or as\na lightweight trial hosted by Databricks.\n\n\n© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "cd5ada025a0094fbaf75ac5cea9c38f2", + "##### Guide\n\n## 6 Strategies for Building Personalized Customer Experiences\n\n\n-----\n\n### Contents\n\n**Introduction** ................................................................................................................................................................................................................. **3**\n\n**1.** **Building a Foundation for Personalization**\nLeveraging ML-Based Customer Entity Resolution ............................................................................................................................... **4**\n\n**2.** **Estimating Customer Lifetime Value**\nBuilding Brand Loyalty With Data ................................................................................................................................................................. **6**\n\n**3.** **Mitigating Customer Churn**\nBalancing Acquisition and Retention .......................................................................................................................................................... **10**\n\n**4.** **Streamlining Customer Analysis and Targeting**\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "d53c2a5c69cef5febfa62ea961c33d25", + "**4.** **Streamlining Customer Analysis and Targeting**\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n\n**5.** **Assessing Consumer Interest Data**\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n\n**6.** **Delivering Personalized Customer Journeys**\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n\n**Conclusion**\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n\n\n-----\n\n### Introduction\n\nIn today’s experience-driven world, the most beloved brands are the ones that\nknow their customers. 
Customers are loyal to brands that recognize their needs\nand preferences — and tailor user journeys and engagements accordingly.\n\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\nbuying from a brand that personalizes the shopping and user experience to the\nwants and needs of the customer. And as organizations pursue omnichannel\nexcellence, these same high expectations of online experiences also extend to\nbrick-and-mortar locations — revealing for many merchants that personalized\nengagement is fundamental to attracting customers and expanding share of wallet.\n\nBut achieving a 360-degree view of your customers to serve personalized\nexperiences requires integrating various types of data — including demographics,\nbehavioral and transactional — to develop robust profiles. This guide focuses on six\nactionable strategic pillars for businesses to leverage automation, real-time data,\nAI-driven analysis and well-tuned ML models to architect and deliver customized\ncustomer experiences at every touch point.\n\n\n# 76%\n\nof consumers are more\nlikely to purchase due to\npersonalization\n\n\n# 76%\n\n\n-----\n\n### Building a Foundation for Personalization\n\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\n\n\nTo create truly personalized interactions, you need actionable insights\nabout your customers. Start by establishing a common customer profile and\naccurately linking together customer records across disparate data sets.\n\nGet a 360-degree view of your target customer by bringing together:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "e46c7c2f5da3f3652f9c2c1ba0dfd2b7", + "But achieving a 360-degree view of your customers to serve personalized\nexperiences requires integrating various types of data — including demographics,\nbehavioral and transactional — to develop robust profiles. This guide focuses on six\nactionable strategic pillars for businesses to leverage automation, real-time data,\nAI-driven analysis and well-tuned ML models to architect and deliver customized\ncustomer experiences at every touch point.\n\n\n# 76%\n\nof consumers are more\nlikely to purchase due to\npersonalization\n\n\n# 76%\n\n\n-----\n\n### Building a Foundation for Personalization\n\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\n\n\nTo create truly personalized interactions, you need actionable insights\nabout your customers. Start by establishing a common customer profile and\naccurately linking together customer records across disparate data sets.\n\nGet a 360-degree view of your target customer by bringing together:\n\n- Sales and traffic-driven first-party data\n\n- Product ratings and surveys\n\n- Customer surveys and support center calls\n\n- Third-party data purchased from data aggregators and online trackers\n\n- Zero-party data provided by customers themselves\n\nLocation\n\n\n**C A S E S T U DY**\n\n**Personalizing‌ ‌experiences‌ with‌ ‌data‌ ‌and‌ ‌ML‌**\n\nGrab is the largest online-to-offline platform in Southeast Asia and\nhas generated over 6 billion transactions for transport, food and\ngrocery delivery, and digital payments. 
Grab uses Databricks to create\nsophisticated customer segmentation and recommendation engines\nthat can now ingest and optimize thousands of user-generated signals\nand data sources simultaneously, enhancing data integrity and security,\nand reducing weeks of work to only hours.\n\n[Get the full story](https://www.databricks.com/customers/grab)\n\n\n\nDemographics\n\n\nOrders\n\nNetwork/\nUsage\n\n\n“The C360 platform empowered teams to create\nconsumer features at scale, which in turn allows\nfor these features to be extended to other markets\nand used by other teams. This helps to reduce the\nengineering overhead and costs exponentially.”\n\n**N I K H I L DWA R A K A N AT H**\nHead of Analytics, Grab\n\n\nSocial\n\nApps/\nClickstream\n\n|Col1|Col2|Col3|Col4|Col5|Col6|\n|---|---|---|---|---|---|\n|||||||\n||Cus 3|t 6|o|mer 0||\n|||||||\n|||||||\n\n\n\nService Call/\nRecords\n\n\nCustomer\n360\n\n\nBilling\n\nDevices\n\n\n-----\n\nGiven the different data sources and data types, automated matching can still\nbe incredibly challenging due to inconsistent formats, misinterpretation of data,\nand entry errors across various systems. And even if inconsistent, all that data\nmay be perfectly valid — but to accurately connect the millions of customer\nidentities most retailers manage, businesses must lean on automation.\n\nIn a machine learning (ML) approach to entity resolution, text attributes like\nname, address and phone number are translated into numerical representations\nthat can be used to quantify the degree of similarity between any two attribute\nvalues. But your ability to train such a model depends on your access to\naccurately labeled training data. It’s a time-consuming exercise, but if done right,\nthe model learns to reflect the judgments of the human reviewers.\n\nMany organizations rely on libraries encapsulating this knowledge to build their\napplications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\nbringing together ML-based approaches to intelligent candidate pair generation\nand pair-scoring. Oriented toward the construction of custom workflows, Zingg\npresents these capabilities within the context of commonly employed steps\nsuch as training data label assignment, model training, data set deduplication,\nand (cross-data set) record matching.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "1588d357d1fefca3d2410a7107be8bef", + "In a machine learning (ML) approach to entity resolution, text attributes like\nname, address and phone number are translated into numerical representations\nthat can be used to quantify the degree of similarity between any two attribute\nvalues. But your ability to train such a model depends on your access to\naccurately labeled training data. It’s a time-consuming exercise, but if done right,\nthe model learns to reflect the judgments of the human reviewers.\n\nMany organizations rely on libraries encapsulating this knowledge to build their\napplications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\nbringing together ML-based approaches to intelligent candidate pair generation\nand pair-scoring. 
Oriented toward the construction of custom workflows, Zingg\npresents these capabilities within the context of commonly employed steps\nsuch as training data label assignment, model training, data set deduplication,\nand (cross-data set) record matching.\n\nBuilt as a native Apache Spark TM application, Zingg scales well to apply these\ntechniques to enterprise-sized data sets. Organizations can then use Zingg in\ncombination with platforms such as Databricks Lakehouse to provide the back\nend to human-in-the-middle workflow applications that automate the bulk of\nthe entity resolution work and present data experts with a more manageable\nset of edge case pairs to interpret.\n\n\nAs an active-learning solution, models can be retrained to take advantage of\nthis additional human input to improve future predictions and further reduce\nthe number of cases requiring expert review. Finally, these technologies can be\nassembled to enable their own enterprise-scaled customer entity resolution\nworkflow applications.\n\n**Need help building your foundation for a**\n**360-degree view of your customers?**\n\nGet pre-built code sample data and step-by-step instructions\nin a Databricks notebook in the **Customer Entity Resolution**\n**Solution Accelerator.**\n\n**•** Translating text attributes (like name, address, phone number)\ninto quantifiable numerical representations\n\n**•** Training ML models to determine if these numerical labels\nform a match\n\n**•** Scoring the confidence of each match\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-entity-resolution)**\n\n\n-----\n\n### Estimating Customer Lifetime Value\n\nBuilding brand loyalty to drive share of wallet with data\n\n\nOnce you’ve set up a 360-degree view of the customer, the next challenge\nis how to spend money to profitably grow the brand. The goal is to spend\nmarketing dollars on activities that attract loyal customers and avoid spending on\nunprofitable customers or activities that damage the brand. Keep in mind, that\nmaking decisions solely based on ROI isn’t the answer. This one-track approach\ncould ultimately weaken your brand equity and make you more dependent on\nlowering your price through promotions as a way to generate sales.\n\n**C A S E S T U DY**\n\n\n**Identifying and engaging brand loyalists**\n\nToday’s customer has overwhelmingly abundant options in products and\nservices to choose from. That’s why personalizing customer experiences is so\nimportant, as it increases revenue, marketing efficiency and customer retention.\n\nNot every customer carries the same potential for profitability. Different\ncustomers derive different value from your products and services, which directly\ntranslates into differences in the overall amount of value a business can expect\nin return. Mutually beneficial relationships carefully align customer acquisition\ncost (CAC) and retention rates with the total revenue or customer lifetime value\n(CLV).\n\n\n**Predicting and increasing customer lifetime value with ML**\n\n\nKolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\nattracts over 10 million monthly active users. With Databricks, they\nachieved a 30% increase in player LTV, improved data team productivity\nby 3x, and reduced ML model-to-production time by 40x.\n\n[Get the full story](https://databricks.com/customers/kolibri-games)\n\nWithin your existing customer base are people ranging from brand loyalists to\nbrand transients. 
Brand loyalists are highly engaged with your brand, are willing\nto share their experience with others, and are the most likely to purchase\nagain. Brand transients have no loyalty to your brand and shop based on price.\nYour focus should be on growing the group of brand loyalists while minimizing\ninteractions with brand transients.\n\n\n**Calculating customers’ lifetime intent**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "f6482951b29e919393ff642a754723f9", + "**Predicting and increasing customer lifetime value with ML**\n\n\nKolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\nattracts over 10 million monthly active users. With Databricks, they\nachieved a 30% increase in player LTV, improved data team productivity\nby 3x, and reduced ML model-to-production time by 40x.\n\n[Get the full story](https://databricks.com/customers/kolibri-games)\n\nWithin your existing customer base are people ranging from brand loyalists to\nbrand transients. Brand loyalists are highly engaged with your brand, are willing\nto share their experience with others, and are the most likely to purchase\nagain. Brand transients have no loyalty to your brand and shop based on price.\nYour focus should be on growing the group of brand loyalists while minimizing\ninteractions with brand transients.\n\n\n**Calculating customers’ lifetime intent**\n\nTo assess the remaining lifetime in a customer relationship, businesses must\n\ncarefully examine the transactional signals and other indicators from previous\ncustomer engagements and transactions.\n\nFor example, if a frequent customer slows down their buying habits — or simply\ndoesn’t make a purchase for an extended period of time — it may signal the\nupcoming end of the relationship. However, in the case of another customer\nwho engages infrequently, the same extended absence may not signal anything\nnotable. The infrequent buyer may continue to purchase even after a long pause\nin activity.\n\n\n-----\n\nCustomer A\n\nCustomer B\n\nCustomer C\n\n\nPast Future\n\nDifferent customers with the same number of transactions, but signaling different lifetime intent. The probability of re-engagement (P_alive) relative to a customer’s history of purchases.\n\n\nEvery customer relationship with a business has a lifespan. Understanding what\npoint in the lifespan at a given time provides critical insight to inform marketing\nand sales tactics. By proactively discovering shifts in the relationship, you can\nadapt how to respond to each customer at the optimal time. For example, a\ncertain signal might prompt a change in how to deliver products and services,\nwhich could help maximize revenue.\n\nTransactional signals can be used to estimate the probability that a customer\nis active and likely to return in the future. 
Popularized as the Buy ’til You Die\n(BTYD) model, analysts can compare a customer’s frequency and recency of\n\nengagement to similar patterns across their user population to accurately\npredict individual CLV.\n\n\nThe mathematics behind these predictive CLV models is complex, but the logic\nbehind these critical models is accessible through a popular Python library\nnamed Lifetimes, which allows the input of simple summary metrics in order to\nderive customer-specific lifetime estimates.\n\n**C A S E S T U DY**\n\n**How personalized experiences keep customers coming**\n**back for more**\n\nPublicis Groupe empowers brands to transform retail experiences with\ndigital technologies, but data challenges and team silos stood in the\nway of delivering the personalization that their customers required.\nSee how they use Databricks to create a single customer view that\nallows them to drive customer loyalty and retention. As a result, they’ve\nseen a 45%–50% increase in customer campaign revenue.\n\n[Get the full story](https://databricks.com/customers/publicis-groupe)\n\n\n-----\n\n**Delivering customer lifetime estimates to the business**\n\n\nSpark natively distributes this work across a multi-server environment, enabling\nconsistent, accurate and efficient analysis. Spark’s flexibility allows models to\nadapt in real time as new information is ingested, eliminating the bottlenecks\nthat come with manual data mapping and profile building.\n\nWith per customer metrics calculated, the Lifetimes library can be used to train\nmultiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\npredict engagements over time using proprietary data can take several months\nand thousands of training runs. [Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\nbusinesses tap into the infrastructure behind their Spark environments and\ndistribute the training outputs across models.\n\n\nUsing the Lifetimes library to calculate customer-specific probabilities at speed\nand scale can be challenging — from processing large volumes of transaction\ndata to deriving data curves and value distribution patterns and, eventually,\nto integration with business initiatives. But with the proper approach, you can\nresolve all of them.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "1ce23e74ee932d8df197e9a45e53e861", + "**Delivering customer lifetime estimates to the business**\n\n\nSpark natively distributes this work across a multi-server environment, enabling\nconsistent, accurate and efficient analysis. Spark’s flexibility allows models to\nadapt in real time as new information is ingested, eliminating the bottlenecks\nthat come with manual data mapping and profile building.\n\nWith per customer metrics calculated, the Lifetimes library can be used to train\nmultiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\npredict engagements over time using proprietary data can take several months\nand thousands of training runs. 
[Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\nbusinesses tap into the infrastructure behind their Spark environments and\ndistribute the training outputs across models.\n\n\nUsing the Lifetimes library to calculate customer-specific probabilities at speed\nand scale can be challenging — from processing large volumes of transaction\ndata to deriving data curves and value distribution patterns and, eventually,\nto integration with business initiatives. But with the proper approach, you can\nresolve all of them.\n\nThese models depend on three key per customer metrics:\n\n**FREQUENCY**\nThe number of times within a given time period in which a repeat\ntransaction is observed\n\n**AGE**\nThe length of time between the occurrence of an initial transaction\nto the end of a given time period\n\n**RECENCY**\n\nThe “age” of a customer (how long they’ve engaged with a brand)\nat the time of their latest repeat transaction\n\n\n-----\n\n**Solution deployment**\n\n\nOnce properly trained, these models can determine the probability that a\ncustomer will re-engage, as well as the number of engagements a business\ncan expect from that customer over time. But the real challenge is putting\nthese predictive capabilities into the hands of those that determine\ncustomer engagement.\n\nMatrices illustrating the probability a customer is alive (left) and the number of future\npurchases in a 30-day window given a customer’s frequency and recency metrics (right).\n\n\nBusinesses need a way to develop and deploy solutions in a highly scalable\nenvironment with a limited upfront cost. Databricks Solution Accelerators\nleverage real-world sample data sets and pre-built code to show how raw data\ncan be transformed into real solutions — including step-by-step instructions\nready to go in a Databricks notebook.\n\n**Need help determining your customers’**\n**lifetime value?**\n\nUse the **Customer Lifetime Value Accelerator** to\n\n**•** Ingest sample retail data\n\n**•** Use pre-built code to develop visualizations and explore\npast purchase behavior\n\n**•** Apply machine learning to predict the likelihood and\nnature of future purchases\n\n**[GET THE ACCELERATOR](https://databricks.com/solutions/accelerators/customer-lifetime-value)**\n\n\n-----\n\n### Mitigating Customer Churn\n\nBalancing acquisition and retention with personalized experiences\n\n\nThere are no guarantees of success. With a bevy of options at their disposal,\ncustomer churn is a reality that companies face and are focused on overcoming\nevery day. One [recent analysis](https://info.recurly.com/annual-subscription-billling-metrics-report?submissionGuid=3c21cde7-5f58-4d86-9218-332d697e7b3e) of consumer-oriented subscription services\nestimated a segment average 7.2% monthly rate of churn. When narrowed to\nbrands focused on consumer goods, that rate jumped to 10.0%. This figure\ntranslates to a lifetime of 10 months for the average subscription box service,\nleaving businesses of this kind with little time to recover acquisition costs and\nbring subscribers to net profitability.\n\n**C A S E S T U DY**\n##### Riot Games\n\n**Creating an optimal in-game experience for League of Legends**\n\nRiot Games is one of the top PC game developers in the world, with over\n100 million monthly active users, 500 billion data points, and over 26\npetabytes of data and counting. 
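The excerpt above describes fitting Buy 'til You Die models (Pareto/NBD, BG/NBD) on per-customer frequency, recency and age metrics with the Lifetimes library, then deriving P_alive and expected future purchases. A minimal sketch of that flow is below; it uses the sample CDNOW summary data bundled with `lifetimes`, so the dataset and the small penalizer value are illustrative choices, not part of the guide.

```python
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary

# The bundled CDNOW sample already contains the three per-customer metrics described
# above: frequency, recency and customer age (T). For your own data,
# lifetimes.utils.summary_data_from_transaction_data derives them from a transaction log.
summary = load_cdnow_summary(index_col=[0])

# Fit a BG/NBD ("Buy 'til You Die") model; a small penalizer stabilizes the fit.
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(summary["frequency"], summary["recency"], summary["T"])

# P_alive: the probability each customer is still active.
summary["p_alive"] = bgf.conditional_probability_alive(
    summary["frequency"], summary["recency"], summary["T"]
)

# Expected number of purchases from each customer over the next 30 days.
summary["expected_purchases_30d"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    30, summary["frequency"], summary["recency"], summary["T"]
)
print(summary.sort_values("expected_purchases_30d", ascending=False).head())
```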
They turned to Databricks to build a more\n\nefficient and scalable way to leverage data and improve the overall gaming\nexperience — ensuring customer engagement and reducing churn.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "9d9266f876aecba5c8df8c7bfd97cf6a", + "**C A S E S T U DY**\n##### Riot Games\n\n**Creating an optimal in-game experience for League of Legends**\n\nRiot Games is one of the top PC game developers in the world, with over\n100 million monthly active users, 500 billion data points, and over 26\npetabytes of data and counting. They turned to Databricks to build a more\n\nefficient and scalable way to leverage data and improve the overall gaming\nexperience — ensuring customer engagement and reducing churn.\n\n[Get the full story](https://www.databricks.com/customers/riot-games)\n\nOrganizations must take an honest look at the cost of acquisition relative to a\ncustomer’s lifetime value (LTV) earned. These figures need to be brought into a\n\nhealthy balance and treated as a “chronic condition” [to be managed.](https://retailtouchpoints.com/features/trend-watch/can-subscription-retail-solve-its-customer-retention-problem)\n\n\n**Understanding attrition predictability through subscriptions:**\n**Examining retention-based acquisition variables**\n\nPublic data for subscription services is extremely hard to come by. KKBox, a\nTaiwan-based music streaming service, recently released over two years of\nanonymized [subscription data](https://www.kaggle.com/c/kkbox-churn-prediction-challenge) to examine customer churn. Through analyzing\nthe data, we uncover customer dynamics familiar to any subscription provider.\n\nMost subscribers join the KKBox service through a 30-day trial offer. Customers\nthen appear to enlist in one-year subscriptions, which provide the service with\na steady flow of revenue. Subscribers typically churn at the end of the 30-day\ntrial and at regular one-year intervals.\n\nThe Survival Rate reflects the proportion of the initial (Day 1) subscriber population that is\nretained over time, first at the roll-to-pay milestone, and then at the renewal milestone.\n\n\n-----\n\nBy Initial Payment Method\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers registering via different payment methods.\n\nBy Initial Payment Plan Days\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers selecting different initial payment methods and terms/days.\n\n\nThis pattern of high initial drop-off, followed by a period of slower but continuing\ndrop-off cycles makes intuitive sense. Where it gets interesting is when the\ndata changes. The patterns of customer churn become vastly different as time\npasses and new or changing elements are introduced (e.g., payment methods\nand options, membership tiers, etc.).\n\nBy Registration Channel\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers registering via different channels.\n\n\n-----\n\nThese patterns seem to indicate that KKBox _could_ potentially differentiate\nbetween customers based on their lifetime potential, using only the information\navailable at subscriber acquisition. 
In the same way, non-subscription businesses\ncould use similar data techniques to get an accurate illustration of the total\nlifetime value of a particular customer, even before collecting historical data.\n\nThis information can help businesses target certain shoppers with effective\ndiscounts or promotions as early as trial registration. Nevertheless, it’s always\nimportant to consider more than individual data points.\n\nThe baseline risk of customer attrition over a subscription lifespan.\n\n\nThe channel and payment method multipliers combine to explain a customer’s risk of attrition\nat various points in time. The higher the value, the higher the proportional risk of churn in the\nassociated period.\n\n\n-----\n\n**Applying churn analytics to your data**\n\nThis analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n**2)** to paint a quantitative picture of the specific factors that explain that risk,\ngiving analysts a clearer understanding of what to focus on, what to ignore and\nwhat to investigate further. The main challenge is organizing the input data.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "dc5fc49468f8795f185c2a9a69844b3f", + "This information can help businesses target certain shoppers with effective\ndiscounts or promotions as early as trial registration. Nevertheless, it’s always\nimportant to consider more than individual data points.\n\nThe baseline risk of customer attrition over a subscription lifespan.\n\n\nThe channel and payment method multipliers combine to explain a customer’s risk of attrition\nat various points in time. The higher the value, the higher the proportional risk of churn in the\nassociated period.\n\n\n-----\n\n**Applying churn analytics to your data**\n\nThis analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n**2)** to paint a quantitative picture of the specific factors that explain that risk,\ngiving analysts a clearer understanding of what to focus on, what to ignore and\nwhat to investigate further. The main challenge is organizing the input data.\n\nThe data required to examine customer attrition may be scattered across\nmultiple systems, making an integrated analysis difficult. [Data lakes](https://databricks.com/discover/data-lakes/introduction) support\nthe creation of transparent, sustainable data processing pipelines that are\nflexible, scalable and highly cost-efficient. Remember that **churn is a chronic**\n**condition to be managed** , and attrition data should be periodically revisited to\nmaintain alignment between acquisition and retention efforts.\n\n**Need help predicting customer churn?**\n\nUse the **Subscriber Churn Prediction Accelerator** to analyze\nbehavioral data, identify subscribers with an increased risk of\ncancellation, and predict attrition. 
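The churn discussion above is framed around survival rates: the share of day-1 subscribers still active after N days, and attribute-based multipliers on that baseline risk. One way to reproduce a survival-rate curve of that kind is a Kaplan-Meier estimate; the sketch below uses the `lifelines` library (not named in the excerpt) with invented column names and toy tenure data.

```python
import pandas as pd
from lifetimes import BetaGeoFitter  # unused here; kept imports minimal below
```

```python
import pandas as pd
from lifelines import KaplanMeierFitter

# Toy subscription records (column names are assumptions): tenure in days and a churn flag.
subs = pd.DataFrame({
    "tenure_days": [30, 30, 31, 60, 180, 250, 365, 365, 395, 400],
    "churned":     [1,  1,  1,  1,  0,   0,   1,   1,   1,   0],
})

# Kaplan-Meier estimate of the survival curve: the proportion of the initial subscriber
# population still active at each tenure, mirroring the survival-rate charts above.
kmf = KaplanMeierFitter()
kmf.fit(durations=subs["tenure_days"], event_observed=subs["churned"], label="all subscribers")

print(kmf.survival_function_)     # survival probability at each observed tenure
print(kmf.median_survival_time_)  # tenure by which half of the subscribers have churned
```

Fitting separate curves per registration channel or payment method (by subsetting the frame before `fit`) gives the segment-level comparisons described in the excerpt.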
Machine learning lets you\nquantify a user’s likelihood to churn, identifying factors that\nexplain the risk.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n\n\n-----\n\n### Streamlining Customer Analysis and Targeting\n\nCreating efficient and highly targeted customer experiences with behavioral data\n\n\nEffective targeting comes down to one fundamental element: the cost of\ndelivering a good or service relative to what a consumer is willing to pay.\n\nIn the earliest applications of segmentation, manufacturers recognized that\nspecialized product lines targeting specific consumer groups could help\nbrands stand out against competitors.\n\n**C A S E S T U DY**\n\n**Finding that special something every time**\n\nPandora is a jewelry company with global reach. They built their master\nconsumer view (MCV) dashboard on the Databricks Lakehouse Platform,\ngiving them the insights necessary to deliver highly targeted messaging\nand personalization — resulting in 80% growth in email marketing\nsuccess, a 50% increase in click-to-open rate across 65 million emails,\nand 255M DKK (Danish Krone) in quarterly revenue.\n\n[Get the full story](https://www.databricks.com/customers/pandora)\n\nThis mode of thinking extends beyond product development and into every\ncustomer-oriented business function, requiring specific means of ideation,\nproduction and delivery. The work put into segmentation doesn’t need to be\na gamble. Scrutinizing customers and testing responsiveness is an ongoing\nprocess. Organizations must analyze and adapt to shifting markets, changing\nconsumer demand and evolving business objectives.\n\n\n**C A S E S T U DY**\n\n**Powering insight-driven dashboards to increase customer**\n**acquisition**\n\nBagelcode is a global game company with more than 50 million global\nusers. By using the Databricks Lakehouse Platform, they are now able to\nsupport more diversified indicators, such as a user’s level of frequency\nand the amount of time they use a specific function for each game,\nenabling more well-informed responses. In addition, the company is\nmitigating customer churn by better predicting gamer behavior and\nproviding personalized experiences at scale.\n\n[Get the full story](https://www.databricks.com/customers/bagelcode)\n\n“Thanks to Databricks Lakehouse, we can support\nreal-time business decision-making based on data\nanalysis results that are automatically updated on\nan hourly and daily basis, even as data volumes have\nincreased by nearly 1,000 times.”\n\n**J O O H Y U N K I M**\nVice President, Data and AI, Bagelcode\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "2f01578d9ce1f0632c2c1cb267859283", + "**C A S E S T U DY**\n\n**Powering insight-driven dashboards to increase customer**\n**acquisition**\n\nBagelcode is a global game company with more than 50 million global\nusers. By using the Databricks Lakehouse Platform, they are now able to\nsupport more diversified indicators, such as a user’s level of frequency\nand the amount of time they use a specific function for each game,\nenabling more well-informed responses. 
In addition, the company is\nmitigating customer churn by better predicting gamer behavior and\nproviding personalized experiences at scale.\n\n[Get the full story](https://www.databricks.com/customers/bagelcode)\n\n“Thanks to Databricks Lakehouse, we can support\nreal-time business decision-making based on data\nanalysis results that are automatically updated on\nan hourly and daily basis, even as data volumes have\nincreased by nearly 1,000 times.”\n\n**J O O H Y U N K I M**\nVice President, Data and AI, Bagelcode\n\n\n-----\n\nA brand’s goal with segmentation should be to define a shared customer\nperspective on customers, allowing the organization to engage users consistently\nand cohesively. But any adjustments to customer engagement require careful\nconsideration of [organizational change concerns](https://www.researchgate.net/publication/45348436_Bridging_the_segmentation_theorypractice_divide) .\n\n**C A S E S T U DY**\n\n**Responding to global demand shifts with ease**\n\nReckitt produces some of the world’s most recognizable and trusted\nconsumer brands in hygiene, health and nutrition. With Databricks\nLakehouse on Azure, they’re able to meet the needs of billions of\nconsumers worldwide by surfacing real-time, highly accurate, deep\ncustomer insights, leading to a better understanding of trends and\ndemand, allowing them to provide best-in-class experiences in\nevery market.\n\n[Get the full story](https://www.databricks.com/customers/reckitt)\n\n\n**A segmentation walk-through: Grocery chain promotions**\n\nA promotions management team for a large grocery chain is responsible for\nrunning a number of promotional campaigns, each of which is intended to drive\ngreater overall sales. Today, these marketing campaigns include leaflets and\ncoupons mailed to individual households, manufacturer coupon matching,\nin-store discounts and the stocking of various private-label alternatives to\npopular national brands.\n\nRecognizing uneven response rates between households, the team is eager to\ndetermine if customers might be segmented based on their responsiveness\nto these promotions. They anticipate that such segmentation may allow the\npromotions management team to better target individual households, driving\noverall higher response rates for each promotional dollar spent.\n\nUsing historical data from point-of-sale systems along with campaign\ninformation from their promotions management systems, the team derives\na number of features that capture the behavior of various households with\nregard to promotions. Applying standard data preparation techniques, the data\nis organized for analysis and using a variety of clustering algorithms, such as\nk-means and hierarchical clustering, the team settles on two potentially useful\ncluster designs.\n\n\n-----\n\nOverlapping segment designs separating households based on their responsiveness to\nvarious promotional offerings. Profiling of clusters to identify differences in behavior across clusters.\n\n**Assessing results**\n\n\nComparing households by demographic factors not used in developing the\nclusters themselves, some interesting patterns separating cluster members\nby age and other factors are identified. 
While this information may be useful\n\nin not only predicting cluster membership and designing more effective\ncampaigns targeted to specific groups of households, the team recognizes\nthe need to collect additional demographic data before putting too much\nemphasis on these results.\n\n\nWith profiling, marketers can discern those customer households in the\nhighlighted example fall into two groups: those who are responsive to coupons\nand mailed leaflets, and those who are not. Further divisions show differing\ndegrees of responsiveness to other promotional offers.\n\n\n-----\n\n**Need help segmenting your customers for**\n**more targeted marketing?**\n\nUse the **Customer Segmentation Accelerator** and drive\nbetter purchasing predictions based on behaviors. Through\nsales data, campaigns and promotions systems, you can\nbuild useful customer clusters to effectively target various\nhouseholds with different promos and offers.\n\nAge-based differences in cluster composition of behavior-based customer segments.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "6cf6e85e0d863c008a1a11095b2d83fc", + "**Assessing results**\n\n\nComparing households by demographic factors not used in developing the\nclusters themselves, some interesting patterns separating cluster members\nby age and other factors are identified. While this information may be useful\n\nin not only predicting cluster membership and designing more effective\ncampaigns targeted to specific groups of households, the team recognizes\nthe need to collect additional demographic data before putting too much\nemphasis on these results.\n\n\nWith profiling, marketers can discern those customer households in the\nhighlighted example fall into two groups: those who are responsive to coupons\nand mailed leaflets, and those who are not. Further divisions show differing\ndegrees of responsiveness to other promotional offers.\n\n\n-----\n\n**Need help segmenting your customers for**\n**more targeted marketing?**\n\nUse the **Customer Segmentation Accelerator** and drive\nbetter purchasing predictions based on behaviors. Through\nsales data, campaigns and promotions systems, you can\nbuild useful customer clusters to effectively target various\nhouseholds with different promos and offers.\n\nAge-based differences in cluster composition of behavior-based customer segments.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\nThe results of the analysis now drive a dialog between the data scientists and\nthe promotions management team. Based on initial findings, a revised analysis\nwill be performed focused on what appear to be the most critical features\ndifferentiating households as a means to simplify the cluster design and evaluate\noverall cluster stability. Subsequent analyses will also examine the revenue\n\ngenerated by various households to understand how changes in promotional\nengagement may impact customer spending.\n\nUsing this information, the team believes they will have the ability to make a case\nfor change to upper management. Should a change in promotions targeting be\napproved, the team makes plans to monitor household spending, promotions\nspend and campaign responsiveness rates using much of the same data used in\nthis analysis. 
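The grocery walk-through above clusters households on promotion-response features with k-means or hierarchical clustering and then profiles the clusters. A minimal sketch of that approach with scikit-learn follows; the feature names, values and choice of two clusters are assumptions made for illustration.

```python
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

# Toy household-level promotion-response features (names and values are assumptions).
households = pd.DataFrame({
    "coupon_redemption_rate": [0.40, 0.35, 0.02, 0.05, 0.50, 0.01, 0.30, 0.03],
    "leaflet_response_rate":  [0.30, 0.25, 0.01, 0.04, 0.45, 0.02, 0.20, 0.05],
    "private_label_share":    [0.10, 0.15, 0.60, 0.55, 0.05, 0.70, 0.20, 0.50],
})

# Standardize so no single feature scale dominates, then cluster with k-means.
scaled = StandardScaler().fit_transform(households)
households["cluster"] = KMeans(n_clusters=2, n_init=10, random_state=42).fit_predict(scaled)

# Profile the clusters: mean behavior per segment, as in the walk-through above.
print(households.groupby("cluster").mean())
```

In practice the cluster count and feature set would be revisited against stability and business interpretability, as the excerpt notes.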
This will allow the team to assess the impact of these efforts and\nidentify when the segmentation design needs to be revisited.\n\n\n-----\n\n#### Assessing Consumer Interest Data to Inform Engagement Strategies\n\nFine-tuning ML recommendations to boost conversions\n\n\nPersonalization is a [journey](https://www.bcg.com/publications/2021/the-fast-track-to-digital-marketing-maturity) . To operationalize personalized experiences, it’s\nimportant to identify high-value audiences who have the highest likelihood of\nspecific actions. Here’s where **propensity scoring** comes in.\n\nSpecifically, this process allows companies to estimate customers’ potential\nreceptiveness to an offer or to content related to a subset of products, and\ndetermine which messaging to apply. Calculating propensity scores requires\nassessment of past interactions and data points (e.g., frequency of purchases,\npercentage of spend associated with a particular product category, days since\nlast purchase and other historical data).\n\nDatabricks provides critical capabilities for propensity scoring (like the Feature\nStore, AutoML and MLflow) to help businesses answer three key considerations\nand develop a robust process:\n\n**1.** How to maintain the significant number of features used\nto train propensity models\n\n**2.** How to rapidly train models aligned with new campaigns\n\n**3.** How to rapidly re-deploy models, retrained as customer\npatterns drift, into the scoring pipeline\n\n**Boosting model training efficiency**\n\nWith the [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) , data scientists can easily reuse features\ncreated by others.\n\n\nThe feature store is a centralized repository that enables the persistence,\ndiscovery and sharing of features across various model training exercises.\nAs features are captured, lineage and other metadata are captured. Standard\nsecurity models ensure that only permitted users and processes may\nemploy these features, enforcing the organization’s data access policies on\ndata science processes.\n\n**Extracting the complexities of ML**\n\n[Databricks AutoML](https://docs.databricks.com/applications/machine-learning/automl.html) allows you to quickly generate models by leveraging industry\nbest practices. As a glass box solution, AutoML first generates a collection of\nnotebooks representing various aligned model variations. In addition to iteratively\ntraining models, AutoML allows you to access the notebooks associated with each\nmodel, creating an editable starting point for further exploration.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "63e004873af9b33355e959df2444c676", + "**Boosting model training efficiency**\n\nWith the [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) , data scientists can easily reuse features\ncreated by others.\n\n\nThe feature store is a centralized repository that enables the persistence,\ndiscovery and sharing of features across various model training exercises.\nAs features are captured, lineage and other metadata are captured. 
Standard\nsecurity models ensure that only permitted users and processes may\nemploy these features, enforcing the organization’s data access policies on\ndata science processes.\n\n**Extracting the complexities of ML**\n\n[Databricks AutoML](https://docs.databricks.com/applications/machine-learning/automl.html) allows you to quickly generate models by leveraging industry\nbest practices. As a glass box solution, AutoML first generates a collection of\nnotebooks representing various aligned model variations. In addition to iteratively\ntraining models, AutoML allows you to access the notebooks associated with each\nmodel, creating an editable starting point for further exploration.\n\n**Streamlining the overall ML lifecycle**\n\n[MLflow](https://docs.databricks.com/applications/mlflow/index.html) is an open source machine learning model repository, managed within the\nDatabricks Lakehouse. This repository enables tracking and analysis of the various\nmodel iterations generated by both AutoML and custom training cycles alike.\n\nWhen used in combination with the Databricks Feature Store, models persisted\nwith MLflow can retain knowledge of the features used during training. As models\nare retrieved, this same information allows the model to retrieve relevant features\nfrom the Feature Store, greatly simplifying the scoring workflow and enabling\nrapid deployment.\n\n\n-----\n\n**How to build a propensity scoring workflow with Databricks**\n\nUsing these features in combination, many organizations implement propensity\nscoring as part of a three-part workflow:\n\n**1.** Data engineers work with data scientists to define features relevant\nto the propensity scoring exercise and persist these to the Feature Store.\nDaily or even real-time feature engineering processes are then defined\nto calculate up-to-date feature values as new data inputs arrive.\n\nModel Training\nand Deployment\n\n\n**2.** As part of the inference workflow, customer identifiers are presented to\npreviously trained models in order to generate propensity scores based on\nthe latest features available. Feature Store information captured with the\nmodel allows data engineers to retrieve these features and easily generate\nthe desired scores, which can then be used for analysis within Databricks\nLakehouse or published to downstream marketing systems.\n\n**3.** In the model-training workflow, data scientists periodically retrain the\npropensity score models to capture shifts in customer behaviors. As these\nmodels are persisted to MLfLow, change management processes are used\nto evaluate and elevate those models that meet organizational criteria-toproduction status. In the next iteration of the inference workflow, the latest\nproduction version of each model is retrieved to generate customer scores.\n\n\nScore Generation\nand Publication ETL\n\n**Need help assessing interest from your**\n**target audience?**\n\n\nFeature\nEngineering ETL\n\nFeature Store Profiles\n\n\nSales\n\nPromotions\n\nCustomer\n\n\nUse the **Propensity Scoring Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. 
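The propensity-scoring workflow above relies on Databricks Feature Store, AutoML and MLflow. The sketch below stands in for those pieces with plain scikit-learn plus core MLflow tracking calls, so it can run anywhere; the feature names, label and toy data are assumptions, and a production setup would retrieve features from the Feature Store and registered model versions instead.

```python
import mlflow
import mlflow.sklearn
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.metrics import roc_auc_score
from sklearn.model_selection import train_test_split

# Toy propensity features (names are assumptions) with a binary "responded" label.
df = pd.DataFrame({
    "category_spend_share":     [0.1, 0.5, 0.7, 0.0, 0.9, 0.2, 0.6, 0.05, 0.8, 0.3],
    "days_since_last_purchase": [40,  5,   3,   120, 2,   60,  10,  200,  4,   30],
    "purchase_frequency":       [1,   8,   12,  0,   15,  2,   9,   0,    11,  3],
    "responded":                [0,   1,   1,   0,   1,   0,   1,   0,    1,   0],
})
X, y = df.drop(columns="responded"), df["responded"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.3, random_state=0, stratify=y
)

# Model-training step: fit a propensity model and log it to MLflow so the scoring
# pipeline can later reload the approved version.
with mlflow.start_run() as run:
    model = GradientBoostingClassifier(random_state=0).fit(X_train, y_train)
    mlflow.log_metric("auc", roc_auc_score(y_test, model.predict_proba(X_test)[:, 1]))
    mlflow.sklearn.log_model(model, "propensity_model")

# Inference step: reload the logged model and generate propensity scores.
loaded = mlflow.sklearn.load_model(f"runs:/{run.info.run_id}/propensity_model")
df["propensity_score"] = loaded.predict_proba(X)[:, 1]
print(df[["propensity_score"]])
```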
Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nDownstream\nApplications\n\n\nA three-part propensity scoring workflow.\n\n\n-----\n\n### Delivering Personalized Customer Journeys\n\nStrategies for crafting a real-time recommendation engine\n\n\nAs the economy continues to weather unpredictable disruptions, shortages and\ndemand, delivering personalized customer experiences at speed and scale will\nrequire adaptability on the ground and within a company’s operational tech stack.\n\n\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\nstrategy and operations, allowing them to create a “golden customer\nrecord” that improves all decision-making from forecasting demand to\npowering their global loyalty program.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8f4f8bec235a7c063f9b4a7b7ec6ef4b", + "Customer\n\n\nUse the **Propensity Scoring Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nDownstream\nApplications\n\n\nA three-part propensity scoring workflow.\n\n\n-----\n\n### Delivering Personalized Customer Journeys\n\nStrategies for crafting a real-time recommendation engine\n\n\nAs the economy continues to weather unpredictable disruptions, shortages and\ndemand, delivering personalized customer experiences at speed and scale will\nrequire adaptability on the ground and within a company’s operational tech stack.\n\n\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\nstrategy and operations, allowing them to create a “golden customer\nrecord” that improves all decision-making from forecasting demand to\npowering their global loyalty program.\n\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\n\n\n**C A S E S T U DY**\n\n\n“Databricks Lakehouse allows every division in our\norganization — from automotive to retail — to gain\na unified view of our customer across businesses.\nWith these insights, we can optimize everything from\nforecasting and supply chain, to powering our loyalty\nprogram through personalized marketing campaigns,\ncross-sell strategies and offers.”\n\n**D M I T R I Y D O V G A N**\nHead of Data Science, Al-Futtaim Group\n\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\nsafety and community, brands most attuned to changing needs and sentiments\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. 
While some segments gained\nbusiness and many lost, organizations that had already begun the journey toward\nimproved customer experience saw better outcomes, closely mirroring patterns\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\n\n\n**Creating a unified view across 200+ brands**\n\nAs a driving force for economic growth in the Middle East, Al-Futtaim\nimpacts the lives of millions of people across the region through the\ndistribution and operations of global brands like Toyota, IKEA, Ace\nHardware and Marks & Spencer.\n\nAl-Futtaim’s focus is to harness their data to improve all areas of the\nbusiness, from streamlining the supply chain to optimizing marketing\nstrategies. But with the brands capturing such a wide variety of data,\nAl-Futtaim’s legacy systems struggled to provide a single view into\nthe customer due to data silos and the inability to scale efficiently to\nmeet analytical needs.\n\n\n-----\n\nThe personalization of customer experiences will remain a key focus for B2C\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\nlong-established players.\n\n**Focus on the customer journey**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "0473e2deba8639930389964be7b25bc7", + "Al-Futtaim’s focus is to harness their data to improve all areas of the\nbusiness, from streamlining the supply chain to optimizing marketing\nstrategies. But with the brands capturing such a wide variety of data,\nAl-Futtaim’s legacy systems struggled to provide a single view into\nthe customer due to data silos and the inability to scale efficiently to\nmeet analytical needs.\n\n\n-----\n\nThe personalization of customer experiences will remain a key focus for B2C\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\nlong-established players.\n\n**Focus on the customer journey**\n\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\n\n**C A S E S T U DY**\n\n**Personalizing the beauty product shopping experience**\n\nFlaconi wanted to leverage data and AI to become the No. 1 online\nbeauty product destination in Europe. 
However, they struggled with\nmassive volumes of streaming data and with infrastructure complexity\nthat was resource-intensive and costly to scale. See how they used\nDatabricks to increase time-to-market by 200x, reduce staff costs by\n40% and increase net order income.\n\nGet the full story\n\n\n¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\nExperience Performance Index in 2007-09.\n\nSource: Forrester Customer Experience Performance Index (2007-09); press search\n\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\n\n\n-----\n\nCareful consideration of how customers interact with various assets — and how\nthese interactions may be interpreted as expressions of preference — can unlock\na wide range of data that enables personalization.\n\n\nThe complexity of these engines requires that they be deployed thoughtfully, using\nlimited pilots and customer response assessments. And in those assessments,\nit’s important to keep in mind that there is no expectation of perfection — only\nincremental improvement over the prior solution.\n\n\n**C A S E S T U DY**\n\n**Need help generating personalized**\n**recommendations?**\n\n\n**Connecting shoppers to savings with data-driven**\n**personalization‌**\n\n\nUse the **Recommendation Engines Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8e054539e38c8a49888991a85b178399", + "-----\n\nCareful consideration of how customers interact with various assets — and how\nthese interactions may be interpreted as expressions of preference — can unlock\na wide range of data that enables personalization.\n\n\nThe complexity of these engines requires that they be deployed thoughtfully, using\nlimited pilots and customer response assessments. And in those assessments,\nit’s important to keep in mind that there is no expectation of perfection — only\nincremental improvement over the prior solution.\n\n\n**C A S E S T U DY**\n\n**Need help generating personalized**\n**recommendations?**\n\n\n**Connecting shoppers to savings with data-driven**\n**personalization‌**\n\n\nUse the **Recommendation Engines Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nFlipp is an online marketplace that aggregates weekly shopping circulars,\nso consumers get deals and discounts without clipping coupons. Siloed\ncustomer data sources once made getting insights difficult. 
Now with\nDatabricks, Flipp’s data teams can access and democratize data, helping\nthem do their jobs more effectively while bringing better deals to users,\nmore meaningful insights to partners, and a 10% jump in foot traffic to\nbrick-and-mortar retailers.\n\nGet the full story\n\nThe engines we use to serve content based on customer preferences are known\nas recommenders. With some recommenders, a heavy focus on the shared\npreferences of similar customers helps define what recommendations will actually\nmake an impact. With others, it can be more useful to focus on the properties of\nthe content itself (e.g., product descriptions).\n\n\n-----\n\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\n\n\nProviding deep, effective personalized experiences to customers depends\non a brand’s ability to intelligently leverage consumer and market data from a\nwide variety of sources to fuel faster, smarter decisions — without sacrificing\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\nexactly that, offering a scalable data architecture that unifies all your data,\nanalytics and AI to deliver unforgettable customer experiences.\n\nCreated on open source and open standards, Databricks offers a robust\nand cost-effective platform for brands to collaborate with partners, clients,\nmanufacturers and distributors to unleash more innovation and efficiencies\nat every touch point. Businesses can rapidly ingest available data in real time,\n\n\nat scale, and create accessible, data-driven insights that enable actionable\nstrategies across the value chain.\n\nDatabricks is a multicloud platform, designed for quick enterprise development.\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\ntheir company’s operational health and the evolving needs of their customers\n— all while empowering teams to easily unify data efforts, perform fine-grained\nanalyses and streamline cross-functional data operations using a single,\nsophisticated solution.\n\n\n###### Learn more about Databricks Lakehouse for industries\n like Retail & Consumer Goods, Media & Entertainment\n and more at databricks.com/solutions\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n\nis headquartered in San Francisco, with offices around the globe. Founded by\n\nthe original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n\na mission to help data teams solve the world’s toughest problems. To learn more,\n\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8f16d1342cbe32bc8c1aaad18ddcb487", + "-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n\nis headquartered in San Francisco, with offices around the globe. 
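The recommender discussion above contrasts engines driven by the shared preferences of similar customers with engines driven by properties of the content itself, such as product descriptions. A minimal sketch of the content-based variant follows, using TF-IDF over descriptions and cosine similarity; the catalog and item names are invented for illustration.

```python
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity

# Toy catalog (descriptions are assumptions): a content-based recommender ranks items
# by how similar their descriptions are to items the customer already engaged with.
products = {
    "running shoes":  "lightweight running shoes with cushioned sole",
    "trail sneakers": "rugged trail running sneakers with grippy sole",
    "espresso maker": "stainless steel espresso maker for home baristas",
    "yoga mat":       "non-slip yoga mat with extra cushioning",
}
names = list(products)

tfidf = TfidfVectorizer(stop_words="english").fit_transform(products.values())
similarity = cosine_similarity(tfidf)

# Recommend the items most similar to something the customer liked.
liked = names.index("running shoes")
ranked = sorted(
    ((names[i], score) for i, score in enumerate(similarity[liked]) if i != liked),
    key=lambda pair: pair[1],
    reverse=True,
)
print(ranked)  # the trail sneakers should rank first for this toy catalog
```

A collaborative-filtering variant would instead factor a customer-by-item interaction matrix, which is the "shared preferences of similar customers" approach mentioned above.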
Founded by\n\nthe original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n\na mission to help data teams solve the world’s toughest problems. To learn more,\n\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=563736421186&utm_term=databricks%20free%20trial&gclid=Cj0KCQjwpeaYBhDXARIsAEzItbHzQGCu2K58-lnVCepMI5MYP6jTXkgfvqmzwAMqrlVwVOniebOE43UaAk3OEALw_wcB)**\n\n##### Contact us for a personalized demo databricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "48fa587f07ddc7a71dd40f2d00f547a1", + "# 2023 State\n of Data + AI\n```\nPowered by the Databricks Lakehouse\n\n```\n2023 STATE OF DATA + AI\n\n\n-----\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||W|e’|r|e|in||th|e|||||||\n|||||||go|l|de|n|a|ge||of|||||||\n|||||||||||||||||||||\n|||||||d|a|ta|a|n|d|A|I|||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n\n\n-----\n\nINTRO\n\nIn the 6 months since ChatGPT launched, the world has woken up to the vast potential\nof AI. The unparalleled pace of AI discoveries, model improvements and new products\non the market puts data and AI strategy at the top of conversations across every\norganization around the world. We believe that AI will usher in the next generation of\nproduct and software innovation, and we’re already seeing this play out in the market.\nThe next generation of winning companies and executives will be those who understand\nand leverage AI.\n\nIn this report, we examine patterns and trends in data and AI adoption across more\nthan 9,000 global Databricks customers. By unifying business intelligence (BI) and AI\napplications across companies’ entire data estates, the Databricks Lakehouse provides\na unique vantage point into the state of data and AI, including which products and\ntechnologies are the fastest growing, the types of data science and machine learning\n(DS/ML) applications being developed and more.\n\n\n-----\n\n```\nHere are the major stories we uncovered:\n\n```\n\nCompanies are adopting\nmachine learning and large\nlanguage models (LLMs)\nat a rapid pace. Natural\nlanguage processing (NLP)\nis dominating use cases,\nwith an accelerated focus\non LLMs.\n\n\nOrganizations are investing in\ndata integration products as\nthey prioritize more DS/ML\ninitiatives. 
50% of our fastestgrowing products represent\nthe data integration category.\n\n\nOrganizations are increasingly\nusing the Lakehouse for data\nwarehousing, as evidenced\nby the high growth of data\nintegration tools dbt and\nFivetran, and the accelerated\nadoption of Databricks SQL.\n\n\nWe hope that by sharing these trends, data leaders will be able to benchmark\ntheir organizations and gain insights that help inform their strategies for an\nera defined by data and AI.\n\n\n-----\n\n```\nSummary of\n\nKey Findings\n DATA SCIENCE AND MACHINE LEARNING:\n\n NLP AND LLMS ARE IN HIGH DEMAND\n 1\n\n```\n**•** The number of companies using SaaS LLM APIs (used to access\nservices like ChatGPT) has grown 1310% between the end of\nNovember 2022 and the beginning of May 2023\n\n**•** NLP accounts for 49% of daily Python data science library usage,\nmaking it the most popular application", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "68b41e9fd77245e8e25f461a4ea62d51", + "Organizations are investing in\ndata integration products as\nthey prioritize more DS/ML\ninitiatives. 50% of our fastestgrowing products represent\nthe data integration category.\n\n\nOrganizations are increasingly\nusing the Lakehouse for data\nwarehousing, as evidenced\nby the high growth of data\nintegration tools dbt and\nFivetran, and the accelerated\nadoption of Databricks SQL.\n\n\nWe hope that by sharing these trends, data leaders will be able to benchmark\ntheir organizations and gain insights that help inform their strategies for an\nera defined by data and AI.\n\n\n-----\n\n```\nSummary of\n\nKey Findings\n DATA SCIENCE AND MACHINE LEARNING:\n\n NLP AND LLMS ARE IN HIGH DEMAND\n 1\n\n```\n**•** The number of companies using SaaS LLM APIs (used to access\nservices like ChatGPT) has grown 1310% between the end of\nNovember 2022 and the beginning of May 2023\n\n**•** NLP accounts for 49% of daily Python data science library usage,\nmaking it the most popular application\n\n**•** Organizations are putting substantially more models into production\n(411% YoY growth) while also increasing their ML experimentation\n(54% YoY growth)\n\n**•** Organizations are getting more efficient with ML; for every three\n\nexperimental models, roughly one is put into production, compared\nto five experimental models a year prior\n\n\n-----\n\n```\nFASTEST-GROWING DATA\nAND AI PRODUCTS\n\n```\n```\nADOPTION AND\nMIGRATION TRENDS\n\n```\n61% of customers migrating to the\nLakehouse are coming from onprem and cloud data warehouses\n\nThe volume of data in Delta Lake\nhas grown 304% YoY\n\nThe Lakehouse is increasingly\nbeing used for data warehousing,\nincluding serverless data\nwarehousing with Databricks\nSQL, which grew 144% YoY\n\n\nBI is the top data and AI market, but\ngrowth trends in other markets show that\ncompanies are increasingly looking at\nmore advanced data use cases\n\nThe fastest-growing data and AI product\nis dbt, which grew 206% YoY by number\nof customers\n\nData integration is the fastest-growing\ndata and AI market on the Databricks\nLakehouse with 117% YoY growth\n\n\n-----\n\n```\nMethodology: How did Databricks\n\ncreate this report?\n\n```\nThe _2023 State of Data + AI_ is built from fully-aggregated, anonymized data\ncollected from our customers based on how they are using the Databricks\nLakehouse and its broad ecosystem of integrated tools. 
This report focuses\non machine learning adoption, data architecture (integrations and migrations)\nand use cases. The customers in this report represent every major industry\nand range in size from startups to many of the world’s largest enterprises.\n\nUnless otherwise noted, this report presents and analyzes data from February 1,\n2022, to January 31, 2023, and usage is measured by number of customers.\nWhen possible, we provide YoY comparisons to showcase growth trends over time.\n\n\n-----\n\n```\nData Science and\n\nMachine Learning\nNATURAL LANGUAGE PROCESSING AND LARGE\nLANGUAGE MODELS ARE IN HIGH DEMAND\n\n```\nAcross all industries, companies leverage data science and\nmachine learning (DS/ML) to accelerate growth, improve\npredictability and enhance customer experiences. Recent\nadvancements in large language models (LLMs) are propelling\ncompanies to rethink AI within their own data strategies.\nGiven the rapidly evolving DS/ML landscape, we wanted to\nunderstand several aspects of the market:\n\n- Which types of DS/ML applications are companies investing\nin? In particular, given the recent buzz, what does the data\naround LLMs look like?\n\n- Are companies making headway on operationalizing\n\ntheir machine learning models (MLOps)?\n\n\n-----\n\n```\nTime Series Time Series\nSpeech Recognition\nSimulations &\u0003\n\nOptimizations\nRecommender Systems\nNatural\n\n\u0003Language \u0003\n\nProcessing\nIndustry Data Modeling\nGraph\nGeospatial\nComputer Vision\nAnomaly Detection\n\u0003& Segmentation\n\n```\n```\n SPECIALIZED PYTHON \u0003DS/ML\n\n LIBRARIES FROM \u0003FEBRUARY 2022 \n\n TO JANUARY 2023\n\n```", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7a9d69ed0cda68e0093366519c9fd19d", + "Machine Learning\nNATURAL LANGUAGE PROCESSING AND LARGE\nLANGUAGE MODELS ARE IN HIGH DEMAND\n\n```\nAcross all industries, companies leverage data science and\nmachine learning (DS/ML) to accelerate growth, improve\npredictability and enhance customer experiences. Recent\nadvancements in large language models (LLMs) are propelling\ncompanies to rethink AI within their own data strategies.\nGiven the rapidly evolving DS/ML landscape, we wanted to\nunderstand several aspects of the market:\n\n- Which types of DS/ML applications are companies investing\nin? In particular, given the recent buzz, what does the data\naround LLMs look like?\n\n- Are companies making headway on operationalizing\n\ntheir machine learning models (MLOps)?\n\n\n-----\n\n```\nTime Series Time Series\nSpeech Recognition\nSimulations &\u0003\n\nOptimizations\nRecommender Systems\nNatural\n\n\u0003Language \u0003\n\nProcessing\nIndustry Data Modeling\nGraph\nGeospatial\nComputer Vision\nAnomaly Detection\n\u0003& Segmentation\n\n```\n```\n SPECIALIZED PYTHON \u0003DS/ML\n\n LIBRARIES FROM \u0003FEBRUARY 2022 \n\n TO JANUARY 2023\n\n```\n\nNote: This chart reflects the unique\nnumber of notebooks using ML\nlibraries per day in each of the\ncategories. It includes libraries used\nfor the particular problem-solving use\ncases mentioned. It does not include\nlibraries used in tooling for data\npreparations and modeling.\n\n\n-----\n\n```\nNatural language processing dominates\n\nmachine learning use cases\n\n```\n\nOur second most popular DS/ML application is\nsimulations and optimization, which accounts for 30% of\nall use cases. 
This signals organizations are using data to\nmodel prototypes and solve problems cost-effectively.\n\n\nTo understand how organizations are applying AI and\nML within the Lakehouse, we aggregated the usage\nof specialized Python libraries, which include NLTK,\nTransformers and FuzzyWuzzy, into popular data science\nuse cases. 1 We look at data from these libraries because\nPython is on the cutting edge of new developments in ML,\nadvanced analytics and AI, and has consistently ranked\nas one of the [most popular programming languages](https://www.tiobe.com/tiobe-index/) in\nrecent years.\n\nOur most popular use case is natural language processing\n(NLP), a rapidly growing field that enables businesses to\ngain value from unstructured textual data. This opens the\ndoor for users to accomplish tasks that were previously\ntoo abstract for code, such as summarizing content or\nextracting sentiment from customer reviews. In our data\nset, 49% of libraries used are associated with NLP. LLMs\nalso fall within this bucket. Given the innovations launched\nin recent months, we expect to see NLP take off even\nmore in coming years as it is applied to use cases like\nchatbots, research assistance, fraud detection, content\ngeneration and more.\n\n```\n In our data set, 49% of\n specialized Python libraries\n used are associated with NLP\n\n```\nMany of the DS/ML use cases are predominantly\nleveraged by specific industries. While they take up a\nsmaller share of the total, they are mission-critical for\nmany organizations. For example, time series includes\nforecasting, a use case that is especially popular in\nindustries such as Retail and CPG, which rely heavily\non the ability to forecast the need for every item in\nevery store.\n\n\n1. This data does not include general-purpose ML libraries, including\nscikit-learn or TensorFlow.\n\n\n-----\n\n```\n USE OF LARGE LANGUAGE MODELS (LLMS)\n\n```\n\n\n\n\n\n\n\nWe have rolled these libraries up into groupings based on the type of functionality they provide.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "5a3b9743641bc87bc79b7c983e4b493e", + "1. 
This data does not include general-purpose ML libraries, including\nscikit-learn or TensorFlow.\n\n\n-----\n\n```\n USE OF LARGE LANGUAGE MODELS (LLMS)\n\n```\n\n\n\n\n\n\n\nWe have rolled these libraries up into groupings based on the type of functionality they provide.\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|Col26|Col27|Col28|Col29|Col30|Col31|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||||||||||||||||||||||||||||||||\n|||||||||||||||||||||||Ma||rch 24, Dolly La||2023 unch|||||\n||||sformer|-Related|||||||||||\u0003C||||||||||||||||\n|||Tran||||||||||||||||, 2022 Launch|||||||||||||\n|||\u0003Libr|aries LLM AP|Is|||||||||||||||||||||||||||\n|||SaaS|||||||||||||||||||||||||||||\n|||LLM|Tools||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n|Feb|Mar|A|pr|May|June||July||Au||g S|ept||Oct||Nov||De||c J|an|Feb||Mar||Apr||M|ay||\n|2022||||||||||||||||||||20|23||||||||||\n||||||||||||||||||||||||||||||||\n||Note: T These|here ar libraries|e several provide|popular pretrain|types o ed mod||f Python els and||librarie tools for||s that a buildin|re comm g, trainin||only us g and d||ed for L eploying||LMs. LLMs.|||||||||||||\n\n\n\nD t i t tl di i th l t k f D b d t lit\n\n\n-----\n\n```\nLarge language models are", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "75195bf43f4a607909cb075140351ad9", + "D t i t tl di i th l t k f D b d t lit\n\n\n-----\n\n```\nLarge language models are\n\nthe “it” tool\n\n```\nLLMs are currently one of the hottest and most-watched areas\nin the field of NLP. LLMs have been instrumental in enabling\nmachines to understand, interpret and generate human language\nin a way that was previously impossible, powering everything\nfrom machine translation to content creation to virtual assistants\nand chatbots.\n\nTransformer-related libraries have been growing in popularity\neven before ChatGPT thrust LLMs into the public consciousness.\nWithin the last 6 months, our data shows two accelerating\ntrends: organizations are building their own LLMs, which models\nlike [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) show can be quite accessible and inexpensive. And,\nthey are using proprietary models like ChatGPT. Transformerrelated libraries, such as Hugging Face, which are used to train\nLLMs, have the highest adoption within the Lakehouse.\n\nThe second most popular type is SaaS LLMs, which are used\nto access models like OpenAI. This category has grown\nexponentially in parallel with the [launch of ChatGPT](https://openai.com/blog/chatgpt) : the\nnumber of Lakehouse customers using SaaS LLMs has grown\n\n\nOrganizations can leverage LLMs either by\nusing SaaS LLM APIs to call services like\nChatGPT from OpenAI or by operating their\nown LLMs in-house.\n\nThinking of building your own modern LLM\napplication? 
This approach could entail\nthe use of specialized transformer-related\nPython libraries to train the model, as well as\nLLM tools like LangChain to develop prompt\ninterfaces or integrations to other systems.\n```\nLLM DEFINITIONS\n\n```\n**◊** **Transformer-related libraries:**\nPython libraries used to train LLMs\n(example: Hugging Face)\n\n**◊** **SaaS LLM APIs:** Libraries used to access\nLLMs as a service (example: OpenAI)\n\n**◊** **LLM tools:** Toolchains for working\nwith and building proprietary LLMs\n(example: LangChain)\n\n\nan impressive 1310% between the end of November 2022 and\nthe beginning of May 2023. (In contrast, transformer-related\nlibraries grew 82% in this same period.)\n\n\n-----\n\n```\n ac e ea g e pe e a o a d p oduc o\ntake off across industries\n\n```\n\nThe increasing demand for ML solutions and the growing\navailability of technologies have led to a significant\nincrease in experimentation and production, two distinct\nparts of the ML model lifecycle. We look at the _logging_ and\n_registering_ of models in MLflow, an open source platform\ndeveloped by Databricks, to understand how ML is\ntrending and being adopted within organizations.\n```\n LOGGED MODELS AND\n\n ML EXPERIMENTATION\n\n```\nDuring the experimentation phase of ML, data scientists\ndevelop models designed to solve given tasks. After training\nthe models, they test them to evaluate their accuracy,\nprecision, recall (the percentage of correctly predicted\npositive instances out of all actual positive instances), and\nmore. These metrics are logged (recorded) in order to analyze\nthe various models’ performance and identify which approach\nworks best for the given task.\n\nWe have chosen logged models as a proxy to measure ML\nexperimentation because the MLflow Tracking Server is\n\ndesigned to facilitate experiment tracking and reproducibility.\n\n\nMLflow Model Registry launched in May 2021. Overall, the\nnumber of logged models has grown 54% since February\n2022, while the number of registered models has grown\n411% over the same period. This growth in volume suggests\norganizations are understanding the value of investing in\nand allocating more people power to ML.\n```\nREGISTERED MODELS AND ML PRODUCTION\n\n```\nProduction models have undergone the experimentation\nphase and are then deployed in real-world applications. They\nare typically used to make predictions or decisions based on\nnew data. Registering a model is the process of recording and\nstoring metadata about a trained model in a centralized location\nthat allows users to easily access and reuse existing models.\nRegistering models prior to production enables organizations to\nensure consistency and reliability in model deployment and scale.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "06e3abc9f9fe5405d9f8cd88b81b4ec1", + "We have chosen logged models as a proxy to measure ML\nexperimentation because the MLflow Tracking Server is\n\ndesigned to facilitate experiment tracking and reproducibility.\n\n\nMLflow Model Registry launched in May 2021. Overall, the\nnumber of logged models has grown 54% since February\n2022, while the number of registered models has grown\n411% over the same period. 
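In MLflow terms, the "logging" and "registering" steps described in the report map onto a couple of API calls. The following is only a minimal sketch for orientation, not anything taken from the report: the model, the metric, and the registry name `churn_classifier` are all invented for illustration.

```python
import mlflow
import mlflow.sklearn
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

# Hypothetical experiment: train a small model and track it with MLflow.
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run() as run:
    model = RandomForestClassifier(n_estimators=50, random_state=42)
    model.fit(X_train, y_train)

    # "Logging" a model: record metrics and the model artifact for this experiment run.
    acc = accuracy_score(y_test, model.predict(X_test))
    mlflow.log_metric("accuracy", acc)
    mlflow.sklearn.log_model(model, "model")

    # "Registering" a model: promote the logged artifact into the Model Registry
    # so it becomes a reusable, deployable production candidate.
    model_uri = f"runs:/{run.info.run_id}/model"
    mlflow.register_model(model_uri, "churn_classifier")  # hypothetical registry name
```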
This growth in volume suggests\norganizations are understanding the value of investing in\nand allocating more people power to ML.\n```\nREGISTERED MODELS AND ML PRODUCTION\n\n```\nProduction models have undergone the experimentation\nphase and are then deployed in real-world applications. They\nare typically used to make predictions or decisions based on\nnew data. Registering a model is the process of recording and\nstoring metadata about a trained model in a centralized location\nthat allows users to easily access and reuse existing models.\nRegistering models prior to production enables organizations to\nensure consistency and reliability in model deployment and scale.\n\nWe have chosen registered models to represent ML production\nbecause the MLflow Model Registry is designed to manage\nmodels that have left the experimentation phase through the\n\nrest of their lifecycle.\n\n\n-----\n\ng y yi p\n\nwas registered. Recent advances in ML, such as improved\nopen source libraries like MLflow and Hugging Face, have\n\nradically simplified building and putting models into\nproduction. The result is that 34% of logged models are\nnow candidates for production today, an improvement\nfrom over 20% just a year ago.\n\n\nbefore committing an ML model to production. We wanted\nto understand, “How many models do data scientists\n\nexperiment with before moving to production?”\n\nOur data shows the ratio of logged to registered models\nis 2.9 : 1 as of January 2023. This means that for roughly\nevery three experimental models, one model will get\nregistered as a candidate for production. This ratio has\nimproved significantly from just a year prior, when we\n\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|Col26|Col27|Col28|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||||||VS. S|||||||||||||||||||||||\n|RA RE|TIO GIST|OF ERE|LOGG D MO|ED DEL||||||||||||||||||||||||\n|||||||||||||||||||||||||||||\n||||||Models|||||||||||||||||||||||\n||||||ber of|||||||||||||||||||||||\n||||||Num|||||||||||||||||||||||\n|||||||||||||||||||||||||||||\n|2.|9 :|1||||||||||||||||||||||||||\n\n```\nRatio of Logged to Registered\n\n Feb Mar Apr May June July Aug Sept Oct Nov Dec Jan\nModels in Jan 2023 2023\n\n```\n\n-----\n\n```\nThe Modern Data\nand AI Stack\n\n```\nOver the last several years, the trend toward building\nopen, unified data architectures has played out in our\nown data. We see that data leaders are opting to preserve\nchoice, leverage the best products and deliver innovation\nacross their organizations by democratizing access to\ndata for more people.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1f876772fcb577c675a44952dbf0e41c", + "```\nRatio of Logged to Registered\n\n Feb Mar Apr May June July Aug Sept Oct Nov Dec Jan\nModels in Jan 2023 2023\n\n```\n\n-----\n\n```\nThe Modern Data\nand AI Stack\n\n```\nOver the last several years, the trend toward building\nopen, unified data architectures has played out in our\nown data. 
We see that data leaders are opting to preserve\nchoice, leverage the best products and deliver innovation\nacross their organizations by democratizing access to\ndata for more people.\n\n\n-----\n\n```\n FASTEST-GROWING DATA AND AI PRODUCTS\n dbt 206%\n\n```\n```\nFivetran\nInformatica\nQlik Data Integration\nEsri\nLooker\nHugging Face\n\n```\n```\n 181%\n 174%\n 152%\n 145%\n 141%\n110%\n\n```\n```\nLytics\nGreat Expectations\nKepler.gl\n\n```\n```\n 101%\n 100%\n95%\n\n```\n```\n0% 50% 100% 150% 200%\n Year-Over-Year Growth by Number of Customers\n\n```\n\n-----\n\n```\nDBT IS THE FASTEST-GROWING DATA\n\nAND AI PRODUCT OF 2023\n\n```\nAs companies move quickly to develop more advanced\nuse cases with their data, they are investing in newer\nproducts that produce trusted data sets for reporting,\nML modeling and operational workflows. Hence, we see\nthe rapid rise of data integration products. dbt, a data\ntransformation tool, and Fivetran, which automates\ndata pipelines, are our two fastest-growing data and AI\nproducts. This suggests a new era of the data integration\nmarket with challenger tools making headway as\ncompanies shift to prioritize DS/ML initiatives. With Great\nExpectations from Superconductive in the ninth spot,\na full 50% of our fastest-growing products represent\nthe data integration category.\n\n\n-----\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n|GR|OWTH|OF|DAT|A A|ND A|I M|ARKE|TS||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||Busi|ness I|ntelli|gence|\n|||||||||||||||||Data & Se Data|Gover curity Scien|nance ce &||\n|ers||||||||||||||||Mach Data|ine Le Integ|arning ration||\n|Custom||||||||||||||||||||\n|ber of||||||||||||||||||||\n|Num||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n||Feb 2022|Mar|Apr|M|ay|June|July|Aug|Se|pt|Oct|Nov|Dec|Ja 20|n 23|||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n\n\nNote: In this chart, we count the number of customers deploying one or more data and AI products in each category. These four\ncategories do not encompass all products Databricks products such as Unity Catalog are not included in this data\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "34d212b9c59581c1086822b4a76ab6f3", + "Note: In this chart, we count the number of customers deploying one or more data and AI products in each category. These four\ncategories do not encompass all products Databricks products such as Unity Catalog are not included in this data\n\n\n-----\n\n```\n a a a d a e s bus ess e ge ce s\nstandard, organizations invest in their machine\nlearning foundation\n\n```\n\nTo understand how organizations are prioritizing their data\ninitiatives, we aggregated all data and AI products on the\nDatabricks Lakehouse and categorized them into four\ncore markets: BI, data governance and security, DS/ML,\nand data integration. Our data set confirms that BI tools\nare more widely adopted across organizations relative to\nmore nascent categories — and they continue to grow,\nwith a 66% YoY increase in adoption. 
This aligns with the\nbroader trend of more organizations performing data\nwarehousing on a Lakehouse, covered in the next section,\nViews from the Lakehouse.\n\n\nWhile BI is often where organizations start their data\njourney, companies are increasingly looking at more\nadvanced data and AI use cases.\n```\nDEMAND FOR DATA INTEGRATION PRODUCTS\n\nIS GROWING FAST\n\n```\nWe see the fastest growth in the data integration market.\nThese tools enable a company to integrate vast amounts\nof upstream and downstream data in one consolidated\nview. Data integration products ensure that all BI and DS/\nML initiatives are built on solid foundation.\n\nWhile it’s easier for smaller markets to experience\nfaster growth, at 117% YoY increased adoption, the data\nintegration market is growing substantially faster than BI.\nThis trend dovetails with the rapid growth of ML adoption\nwe see across the Lakehouse, covered in the DS/ML\nsection of the report.\n\n```\nData integration is the\nfastest-growing market,\n\n with 117% YoY growth\n\n```\n\n-----\n\n```\nViews from\nthe Lakehouse\nMIGRATION AND DATA\n\nFORMAT TRENDS\n\n```\nData migration is a major undertaking: it can be risky,\nexpensive and delay companies’ timelines. It’s not a\ntask to jump into lightly. As organizations run into the\nlimitations, scalability challenges and the cost burden\nof legacy data platforms, they are increasingly likely\nto migrate to a new type of architecture.\n\n\n-----\n\n```\nMigration trends:\n\nthe best data warehouse\n\nis a Lakehouse\n\n```\nThe Lakehouse Platform is an attractive\nalternative to traditional data warehouses\nbecause it supports advanced use cases and\nDS/ML, allowing organizations to boost their\noverall data strategy. As evidenced by the most\npopular data and AI products, with BI and data\nintegration tools at the top, organizations are\nincreasingly using the data lakehouse for data\nwarehousing. To better understand which legacy\nplatforms organizations are moving away from,\n\nwe look at the migrations of new customers\nto Databricks.\n\nAn interesting takeaway is that roughly half of the\ncompanies moving to the Lakehouse are coming\nfrom data warehouses. This includes the 22%\nthat are moving from cloud data warehouses.\nIt also demonstrates a growing focus on running\ndata warehousing workloads on a Lakehouse\nand unifying data platforms to reduce cost.\n\n```\n SOURCE OF NEW CUSTOMER \u0003\n\n MIGRATIONS TO DATABRICKS\n\n```\n```\n12%\n\n```\n```\n39%\n\n```\n```\n27%\n\n```\n```\n22%\n\n```\n\n-----\n\n```\nRising tides: the volume\n\nof data in Delta Lake\n\nhas grown 304% YoY\n\n```\nAs the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf) , an increasingly\nlarge proportion is in the form of semi-structured\nand unstructured data. Previously, organizations\nhad to manage multiple different platforms for\ntheir structured, unstructured and semi-structured\ndata, which caused unnecessary complexity and\nhigh costs. 
The Lakehouse solves this problem by\nproviding a unified platform for all data types\nand formats.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "92675ce8cb8f76491cdb21da3fb3d4f7", + "```\n```\n27%\n\n```\n```\n22%\n\n```\n\n-----\n\n```\nRising tides: the volume\n\nof data in Delta Lake\n\nhas grown 304% YoY\n\n```\nAs the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf) , an increasingly\nlarge proportion is in the form of semi-structured\nand unstructured data. Previously, organizations\nhad to manage multiple different platforms for\ntheir structured, unstructured and semi-structured\ndata, which caused unnecessary complexity and\nhigh costs. The Lakehouse solves this problem by\nproviding a unified platform for all data types\nand formats.\n\nDelta Lake is the foundation of the Databricks\nLakehouse. The Delta Lake format encompasses\nstructured, unstructured and semi-structured\ndata. Use has surged over the past 2 years.\nWhen compared to the steady, flat or declining\ngrowth in other storage formats (e.g., text, JSON\nand CSV), our data shows that a growing number\nof organizations are turning to Delta Lake to manage\ntheir data. In June 2022, Delta Lake surpassed\nParquet as the most popular data lake source,\nreaching 304% YoY growth.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|Col1|VO|LUME|Col4|OF|Col6|DAT|Col8|A M|ANAG|ED,|Col12|Col13|Col14|Col15|Col16|Col17|Col18|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||BY|STO||RAG||E FO||RMA|T|||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|ata||||||||||||||||||\n|e of D||||||||||||||||||\n|Volum||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n||Jan|||||||J|an|||Jan||||Ja||\n|||||Jan||||||||||||||\n|2|019|||2020||||20|21|||2022||||202||\n|||||||||Delta|Te|xt||CSV||Av||ro||\n|||||||||Parquet|OR|C||JSON||||||\n|||||||||||||||||||\n\n\n-----\n\n```\n g g ,\nwith emphasis on serverless\n\n```\n\nOver the past 2 years, companies have vastly increased their usage\nof data warehousing on the Lakehouse Platform. This is especially\ndemonstrated by use of Databricks SQL ­— the serverless data\nwarehouse on the Lakehouse — which shows 144% YoY growth.\nThis suggests that organizations are increasingly ditching traditional\ndata warehouses and are able to perform all their BI and analytics\non a Lakehouse.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "89431109f90bb45a304efc01edc3afa4", + "-----\n\n```\n g g ,\nwith emphasis on serverless\n\n```\n\nOver the past 2 years, companies have vastly increased their usage\nof data warehousing on the Lakehouse Platform. 
This is especially\ndemonstrated by use of Databricks SQL ­— the serverless data\nwarehouse on the Lakehouse — which shows 144% YoY growth.\nThis suggests that organizations are increasingly ditching traditional\ndata warehouses and are able to perform all their BI and analytics\non a Lakehouse.\n\n```\n Data \nWarehouse\n\n```\n```\nData \n\n```\n```\nLakehouse\nPlatform\n\n```\n```\nLakehouse\n\n```\n\n\n\n\n\n\n\n\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||||||||||||||||||||||||||\n||DA|TA W|ARE|HOUS|ING||||||||||||||||||||\n||ON|LAK|EHO|USE|WIT L|H|rs||||||||||||||||||\n||DA|TABR|ICK|S SQ|||||||||||||||||||||\n||||||||ustome||||||||||||||||||\n||||||||r of C||||||||||||||||||\n||Note: T as a re|here is a sult of th|spike in e ungat|Octobe ed previ|r 2021 ew||Numbe||||||||||||||||||\n||launch Genera|of Datab l Availab|ricks SQ ility in D|L, follow ecembe|ed by r 2021.||||||||||||||||||||\n||Data c of Dec|onsisten ember d|tly dips i ue to se|n the las asonalit|t week y.||J 2|an 021||Jul 202||y 1||Jan 2022||||July 2022||||Jan 2023|||\n\n\n-----\n\nCONCLUSION\n```\nGeneration AI\n\n```\nWe’re excited that companies are progressing into more\nadvanced ML and AI use cases, and the modern data and\nAI stack is evolving to keep up. Along with the rapid growth\nof data integration tools (including our fastest growing,\ndbt), we’re seeing the rapid rise of NLP and LLM usage in\nour own data set, and there’s no doubt that the next few\nyears will see an explosion in these technologies. It’s never\nbeen more clear: the companies that harness the power\nof DS/ML will lead the next generation of data.\n\n\n-----\n\n```\nAbout Databricks\n\n```\nDatabricks is the data and AI company. More than 9,000\norganizations worldwide — including Comcast, Condé Nast, and\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\nPlatform to unify their data, analytics and AI. Databricks is\nheadquartered in San Francisco, with offices around the globe.\nFounded by the original creators of Apache Spark™, Delta Lake\nand MLflow, Databricks is on a mission to help data teams solve\nthe world’s toughest problems. To learn more, follow Databricks\non Twitter, LinkedIn and Instagram.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "1f170065560005166ed3bfde9a20232c", + "-----\n\n```\nAbout Databricks\n\n```\nDatabricks is the data and AI company. More than 9,000\norganizations worldwide — including Comcast, Condé Nast, and\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\nPlatform to unify their data, analytics and AI. Databricks is\nheadquartered in San Francisco, with offices around the globe.\nFounded by the original creators of Apache Spark™, Delta Lake\nand MLflow, Databricks is on a mission to help data teams solve\nthe world’s toughest problems. To learn more, follow Databricks\non Twitter, LinkedIn and Instagram.\n\n[DISCOVER LAKEHOUSE](https://www.databricks.com/product/data-lakehouse)\n\n© Databricks 2023. All rights reserved. 
Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation | Terms of Use\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e7bcc94606d0aa9fb64905e8016a9a01", + "**EBOOK**\n\n## Why the Data Lakehouse Is Your Next Data Warehouse\n\n\n-----\n\n### Contents\n\nPreface .......................................................................................................................................................................................................................................... **3**\n\nIntroduction ............................................................................................................................................................................................................................. **4**\n\nOur Approach: The Databricks Lakehouse Platform ................................................................................................................................... **5**\n\nIntroducing Databricks SQL: The Best Data Warehouse Is a Lakehouse ...................................................................................... **6**\n\nWhy Databricks SQL? ............................................................................................................................................................................................... 6", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "7fe00d5363b0ac4a160402a365eaf09d", + "Why Databricks SQL? ............................................................................................................................................................................................... 6\n\nCommon use cases .................................................................................................................................................................................................... 7\n\nThe Inner Workings of the Lakehouse ................................................................................................................................................................... **8**\n\n**PA R T 1 :** Storage layer .............................................................................................................................................................................................. 8\n\n**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "65670d54f6fb0335688832cdcd3c89e2", + "**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13\n\n**PA R T 3 :** Consumption layer ................................................................................................................................................................................ 
19\n\nConclusion ............................................................................................................................................................................................................................. **24**\n\nCustomer Stories ............................................................................................................................................................................................................... **25**\n\n\n-----\n\n### Preface\n\nHistorically, data teams have had to resort to a bifurcated architecture to run traditional\nBI and analytics workloads, copying subsets of the data already stored in their data lake\nto a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\ngovernance inherent in proprietary architectures.\n\nOur customers have asked us to simplify their data architecture. We decided to accelerate\nour investments to do just that.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "59fb84c0af2dbaa94d16d9ab2bdedaeb", + "-----\n\n### Preface\n\nHistorically, data teams have had to resort to a bifurcated architecture to run traditional\nBI and analytics workloads, copying subsets of the data already stored in their data lake\nto a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\ngovernance inherent in proprietary architectures.\n\nOur customers have asked us to simplify their data architecture. We decided to accelerate\nour investments to do just that.\n\n\nWe introduced [Databricks SQL](https://databricks.com/product/databricks-sql) to simplify and provide data warehousing capabilities and\nfirst-class support for SQL on the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) , for all your existing tools.\nWe use the term “lakehouse” to reflect our customers’ desire to combine the best of data\nwarehouses and data lakes. With the lakehouse, you can now establish one source of truth\nfor all data and enable all workloads from AI to BI on one platform. And we want to provide\nyou with ease-of-use and state-of-the-art performance at the lowest cost.\n\n\n**Reynold Xin**\n\nOriginal Creator of Apache Spark, TM\nCo-founder and Chief Architect,\nDatabricks\n\n\nThis eBook covers how we went back to the drawing board to build Databricks SQL — the\nlast mile of enabling data warehousing capabilities for your existing data lakes — as part of\nthe Databricks Lakehouse Platform.\n\n\n-----\n\n### Introduction\n\n\nMost organizations operate their business with a complex data architecture that\ncombines data warehouses and data lakes. For one thing, data lakes are great\nfor machine learning (ML). They support open formats and a large ecosystem.\nBut data lakes have poor support for business intelligence (BI) and suffer\ncomplex data quality problems. Data warehouses, on the other hand, are great\nfor BI applications. But they have limited support for ML workloads, can’t handle\nnatural language data, large-scale structured data, or raw, video, audio or image\nfiles, and are proprietary systems with only a SQL interface.\n\nAs a result, data is moved around the organization through data pipelines and\nsystems that create a multitude of data silos. 
A large amount of time is spent\nmaintaining these pipelines and systems rather than creating new value from\ndata, and downstream consumers struggle to get a single source of truth of the\ndata due to the inherent siloing of data that takes place. The situation becomes\nvery expensive, and decision-making speed and quality are negatively affected.\n\nUnifying these systems can be transformational in how we think about data.\n\n\n##### The need for simplification\n\nIt is time for a new data architecture that can meet both today’s and tomorrow’s\nneeds. Without any compromise. Advanced analytics and ML are one of the\nmost strategic priorities for data-driven organizations today, and the amount\nof unstructured data is growing exponentially. So it makes sense to position\nthe data lake as the center of the data infrastructure. However, for this to be\nachievable, the data lake needs to adopt the strengths of data warehouses.\n\nThe answer is the [lakehouse](https://databricks.com/blog/2021/05/19/evolution-to-the-data-lakehouse.html) , an open data architecture enabled by a new open\nand standardized system design: one that implements data structure and data\nmanagement features similar to those in a data warehouse, directly on the lowcost storage used for data lakes.\n\n**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8b67ad732685ff90448a1005931ea52e", + "**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**\n\n##### Building the Data Lakehouse\n[Bill Immon, Father of the Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)\n\n\n-----\n\n### Our Approach: The Databricks Lakehouse Platform\n\nOur customers have asked us for simplification. 
This is why we’ve embarked on\nthis journey to deliver one simple, open and collaborative platform for all your\ndata, AI and BI workloads on your existing data lakes.\n\nThe [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) greatly simplifies data architectures by\ncombining the data management and performance typically found in data\nwarehouses with the low-cost, flexible object stores offered by data lakes.\n\nIt’s built on open source and open standards to maximize flexibility, and lets you\nstore all your data — structured, semi-structured and unstructured — in your\nexisting data lake while still getting the data quality, performance, security and\ngovernance you’d expect from a data warehouse. Data only needs to exist once\nto support all of your data, AI and BI workloads on one common platform\n— establishing one source of truth.\n\nFinally, the Lakehouse Platform provides tailored and collaborative\nexperiences so data engineers, data scientists and analysts can work together\non one common platform across the entire data lifecycle — from ingestion to\nconsumption and the serving of data products — and innovate faster.\n\nLet’s look at how, with the right data structures and data management\ncapabilities in place, we can now deliver data warehouse and analytics\ncapabilities on your lakehouse. That’s where Databricks SQL (DB SQL) comes in.\n\n**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n\n\n-----\n\n### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n\n\nDatabricks SQL is a serverless data warehouse on the Databricks Lakehouse\nPlatform that lets you run all your SQL and BI applications at scale with up to 12x\nbetter price/performance, a unified governance model, open formats and APIs,\nand your tools of choice — no vendor lock-in. Reduce resource management\noverhead with serverless compute, and easily ingest, transform and query\nall your data in place to deliver real-time business insights faster. In fact, DB\nSQL now holds the new world record in 100TB TPC-DS, the gold standard\nperformance benchmark for data warehousing.\n\nBuilt on open standards and APIs, the lakehouse provides an open, simplified and\nmulticloud architecture that brings the best of data warehousing and data lakes\ntogether, and integrations with a rich ecosystem for maximum flexibility.\n\n\n##### Why Databricks SQL?\n\nBest Price/Performance\nLower costs, get world-class performance, and eliminate the need to manage,\nconfigure or scale cloud infrastructure with serverless.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "724e4a72a8b4f9ddd30bc0de869473b7", + "### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n\n\nDatabricks SQL is a serverless data warehouse on the Databricks Lakehouse\nPlatform that lets you run all your SQL and BI applications at scale with up to 12x\nbetter price/performance, a unified governance model, open formats and APIs,\nand your tools of choice — no vendor lock-in. Reduce resource management\noverhead with serverless compute, and easily ingest, transform and query\nall your data in place to deliver real-time business insights faster. 
In fact, DB\nSQL now holds the new world record in 100TB TPC-DS, the gold standard\nperformance benchmark for data warehousing.\n\nBuilt on open standards and APIs, the lakehouse provides an open, simplified and\nmulticloud architecture that brings the best of data warehousing and data lakes\ntogether, and integrations with a rich ecosystem for maximum flexibility.\n\n\n##### Why Databricks SQL?\n\nBest Price/Performance\nLower costs, get world-class performance, and eliminate the need to manage,\nconfigure or scale cloud infrastructure with serverless.\n\nBuilt-In Governance\nEstablish one single copy for all your data using open standards, and one unified\ngovernance layer across all data teams using standard SQL.\n\nRich Ecosystem\nUse SQL and any tool like Fivetran, dbt, Power BI or Tableau along with Databricks\nto ingest, transform and query all your data in place.\n\nBreak Down Silos\nEmpower every analyst to access the latest data faster for downstream real-time\nanalytics, and go effortlessly from BI to ML.\n\n**[WATCH A DEMO](https://databricks.com/discover/demos/databricks-sql)**\n\n\n-----\n\n### Common use cases\n\nThousands of customers like [Atlassian](https://www.google.com/search?q=atlassian+databricks+keynote&oq=atlassian+databricks+keynote&aqs=chrome..69i57j69i60j69i65l3j69i60j69i64l2.6409j0j1&sourceid=chrome&ie=UTF-8#:~:text=12%3A26,May%2026%2C%202021) , [SEGA](https://youtu.be/SzeXHcwPDSE) and [Punchh](https://databricks.com/customers-4/punchh) are using Databricks SQL to enable self-served analytics\nfor hundreds of analysts across their organizations, and to build custom data applications to better serve their\ncustomers. Below are some examples of use cases for Databricks SQL.\n\n**At Atlassian, we have proven**\n\n\n**Query data lake data with** **Collaboratively explore** **Build rich and custom**\n**your BI tools of choice** **the freshest data** **data applications**\n\n\n**that there is no longer a need**\n\n**for two separate data things.**\n\n**Technology has advanced**\n\n**far enough for us to consider**\n\n**one single unified lakehouse**\n\n**architecture.**\n\n**Rohan Dhupelia**\nData Platform Senior Manager,\nAtlassian\n\n\nEnable business analysts to\ndirectly query data lake data\nusing their favorite BI tool and\navoid data silos. Reengineered\nand optimized connectors\nensure fast performance,\nlow latency and high user\nconcurrency to your data lake.\nNow analysts can use the best\ntool for the job on one single\nsource of truth for your data.\n\n\nEmpower every analyst and SQL\nprofessional in your organization\nto quickly find and share new\ninsights by providing them with\na collaborative and self-served\nanalytics experience. Confidently\nmanage data permissions with\nfine-grained governance, share and\nreuse queries, and quickly analyze\nand share results using interactive\nvisualizations and dashboards.\n\n\nBuild more effective and\ntailored data applications\nfor your own organization or\nyour customers. 
Benefit from\nthe ease of connectivity,\nmanagement and better price/\nperformance of DB SQL to\nsimplify development of dataenhanced applications at scale,\nall served from your data lake.\n\n\n-----\n\n### The Inner Workings of the Lakehouse", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "5aaea8ef9ed092a8eb41615a06d863b1", + "**one single unified lakehouse**\n\n**architecture.**\n\n**Rohan Dhupelia**\nData Platform Senior Manager,\nAtlassian\n\n\nEnable business analysts to\ndirectly query data lake data\nusing their favorite BI tool and\navoid data silos. Reengineered\nand optimized connectors\nensure fast performance,\nlow latency and high user\nconcurrency to your data lake.\nNow analysts can use the best\ntool for the job on one single\nsource of truth for your data.\n\n\nEmpower every analyst and SQL\nprofessional in your organization\nto quickly find and share new\ninsights by providing them with\na collaborative and self-served\nanalytics experience. Confidently\nmanage data permissions with\nfine-grained governance, share and\nreuse queries, and quickly analyze\nand share results using interactive\nvisualizations and dashboards.\n\n\nBuild more effective and\ntailored data applications\nfor your own organization or\nyour customers. Benefit from\nthe ease of connectivity,\nmanagement and better price/\nperformance of DB SQL to\nsimplify development of dataenhanced applications at scale,\nall served from your data lake.\n\n\n-----\n\n### The Inner Workings of the Lakehouse\n\n\nIn the next chapter, we’ll unpack the three foundational layers of the Databricks\nLakehouse Platform and how we went back to the drawing board to build this\nexperience. Specifically, we’ll dive into how we built Databricks SQL to deliver\nanalytics and data warehousing workloads on your lakehouse.\n\n\nThose layers are:\n\n**1 .** The storage layer, or how we store and govern data\n\n**2 .** The compute layer, or how we process queries\n\n**3 .** The consumption layer, or the tools you can use to interface with the system\n\n\n###### PART 1: STORAGE LAYER\n\nIn order to bring the best of data lakes and data\nwarehouses, we needed to support the openness\nand flexibility of data lakes, as well as the quality,\nperformance and governance you’d expect from a\ndata warehouse.\n\n\n**Storage layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake Open format|Data Warehouse Closed, proprietary format|Data Lakehouse Open format|\n|---|---|---|\n|Low quality, “data swamp”|High-quality, reliable data|High-quality, reliable data|\n|File-level access control|Fine-grained governance (tables row/columnar level)|Fine-grained governance (tables row/columnar level)|\n|All data types|Structured only|All data types|\n|Requires manually specifying how to lay out data|Automatically lays out data to query efficiently|Automatically lays out data to query efficiently|\n\n\n-----\n\n##### Transactional guarantees for your data lake\n\n\nThe open source format [Delta Lake](https://delta.io/) — based on Parquet — solves historical data\nlake challenges around data quality and reliability. 
It is the foundation for the\nlakehouse, and Databricks SQL stores and processes data using Delta Lake.\n\nFor example, it provides ACID transactions to ensure that every operation either\nfully succeeds or fully aborts for later retries — without requiring new data\npipelines to be created. It unifies batch and streaming pipelines so you can\neasily merge existing and new data at the speed required for your business. With\nTime Travel, Delta Lake automatically records all past transactions, so it’s easy\nto access and use previous versions of your data for compliance needs or for\nML applications. Advanced indexing, caching and auto-tuning allow optimization\nof Delta tables for the best query performance. Delta Lake also acts as the\nfoundation for fine-grained, role-based access controls on the lakehouse.\n\nAs a result, Delta Lake allows you to treat tables in Databricks SQL just like you\ntreat tables in a database: updates, inserts and merges can take place with high\nperformance at the row level. This is particularly useful if you are inserting new\n\n\ndata rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n(e.g., for compliance laws such as GDPR). Furthermore, Delta Lake provides you\nwith one open and standard format — not only for SQL but also for Python, Scala\nand other languages — so you can run all analytical and ML use cases on the\nsame data.\n\n**Delta Lake provides the key**\n\nAn open format storage layer built for lake-first architecture", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "0f0b7a883ea83d964c3f3442178a9514", + "As a result, Delta Lake allows you to treat tables in Databricks SQL just like you\ntreat tables in a database: updates, inserts and merges can take place with high\nperformance at the row level. This is particularly useful if you are inserting new\n\n\ndata rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n(e.g., for compliance laws such as GDPR). Furthermore, Delta Lake provides you\nwith one open and standard format — not only for SQL but also for Python, Scala\nand other languages — so you can run all analytical and ML use cases on the\nsame data.\n\n**Delta Lake provides the key**\n\nAn open format storage layer built for lake-first architecture\n\nACID transactions, Time Travel, highly available\n\nAdvanced indexing, caching, auto-tuning\n\nFine-grained, role-based access controls\n\nStreaming & batch, analytics & ML\n\nPython, SQL, R, Scala\n\nDelta Lake brings data quality, performance and governance to the lakehouse\n\n**[DOWNLOAD NOW](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)**\n##### Delta Lake: The Definitive Guide\n[by O’Reilly](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)\n\n\n-----\n\n##### A framework for building a curated data lake\n\n\nWith the ability to ingest petabytes of data with auto-evolving schemas, Delta\nLake helps turn raw data into actionable data by incrementally and efficiently\nprocessing data as it arrives from files or streaming sources like Kafka, Kinesis,\nEvent Hubs, DBMS and NoSQL. 
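To make the row-level operations and Time Travel described above more concrete, here is a rough Spark SQL sketch against a hypothetical Delta table; the table name, staging view, keys, and version number are placeholders and assume a Delta-enabled Spark session such as a Databricks cluster.

```python
from pyspark.sql import SparkSession

# Assumes a Delta-enabled Spark session (e.g., a Databricks cluster).
spark = SparkSession.builder.getOrCreate()

# Upsert new records into an existing Delta table at the row level (MERGE).
spark.sql("""
    MERGE INTO main.sales.orders AS target      -- hypothetical Delta table
    USING updates_view AS source                -- hypothetical staging view
    ON target.order_id = source.order_id
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
""")

# Redact rows for a single customer (e.g., a GDPR erasure request).
spark.sql("DELETE FROM main.sales.orders WHERE customer_id = 'C-1042'")

# Time Travel: query the table as it looked at an earlier version.
previous = spark.sql("SELECT * FROM main.sales.orders VERSION AS OF 12")
previous.show()
```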
It can also automatically and efficiently track data\nas it arrives with no manual intervention, as well as infer schema, detect column\nchanges for structured and unstructured data formats, and prevent data loss by\nrescuing data columns that don’t meet data quality specifications. And now with\n[Partner Connect](https://www.databricks.com/partnerconnect) , it’s never been easier to bring in critical business data from\nvarious sources.\n\nAs you refine the data, you can add more structure to it. Databricks recommends\nthe Bronze, Silver and Gold pattern. It lets you easily merge and transform new\nand existing data — in batch or streaming — while benefiting from the low-cost,\nflexible object storage offered by data lakes. Bronze is the initial landing zone\nfor the pipeline. We recommend copying data that’s as close to its raw form as\npossible to easily replay the whole pipeline from the beginning, if needed. Silver\nis where the raw data gets cleansed (think data quality checks), transformed\nand potentially enriched with external data sets. Gold is the production-grade\ndata that your entire company can rely on for business intelligence, descriptive\nstatistics, and data science/machine learning.\n\n\nBy the time you get to Gold, the tables are high-value business-level metrics\nthat have all the schema enforcement and constraints applied. This way, you can\nretain the flexibility of the data lake at the Bronze and Silver levels, and then use\nthe Gold level for high-quality business data.\n\nAuto Loader\n\n\nBRONZE\n\n\nSILVER GOLD\n\n\nStructured Streaming\n\nBatch\n\nCOPY INTO\n\nPartners\n\n\nRaw ingestion Filtered, cleaned Business-level\nand history and augmented aggregates\n\n|Col1|Col2|\n|---|---|\n||R|\n\n\n**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**\n\n\n-----\n\n##### An aside on batch and streaming data pipelines\n\n\nThe best way to set up and run data pipelines in the Bronze/Silver/Gold\npattern recommended on the previous page is in Delta Live Tables (DLT).\nDLT makes it easy to build and manage reliable batch and streaming\ndata pipelines that deliver high-quality data. It helps data engineering\nteams simplify ETL development and management with declarative\npipeline development, automatic data testing, and deep visibility for\nmonitoring and recovery.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "dbe58a2ebf2c1f9dbd2e28a0d617b81e", + "Auto Loader\n\n\nBRONZE\n\n\nSILVER GOLD\n\n\nStructured Streaming\n\nBatch\n\nCOPY INTO\n\nPartners\n\n\nRaw ingestion Filtered, cleaned Business-level\nand history and augmented aggregates\n\n|Col1|Col2|\n|---|---|\n||R|\n\n\n**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**\n\n\n-----\n\n##### An aside on batch and streaming data pipelines\n\n\nThe best way to set up and run data pipelines in the Bronze/Silver/Gold\npattern recommended on the previous page is in Delta Live Tables (DLT).\nDLT makes it easy to build and manage reliable batch and streaming\ndata pipelines that deliver high-quality data. It helps data engineering\nteams simplify ETL development and management with declarative\npipeline development, automatic data testing, and deep visibility for\nmonitoring and recovery.\n\nThe fact that you can run all your batch and streaming pipelines together\nin one simple, declarative framework makes data engineering easy on the\nDatabricks Lakehouse Platform. 
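As a rough illustration of that declarative Bronze/Silver/Gold style, the sketch below shows what a minimal Delta Live Tables pipeline can look like. The landing path, table names, columns, and expectation rule are hypothetical, and the code only runs inside a DLT pipeline on Databricks, where the `spark` session is provided by the runtime.

```python
import dlt
from pyspark.sql import functions as F

RAW_PATH = "/Volumes/main/demo/raw_events"  # hypothetical landing location


@dlt.table(comment="Bronze: raw events ingested incrementally with Auto Loader.")
def events_bronze():
    # `spark` is injected by the DLT runtime; cloudFiles is the Auto Loader source.
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load(RAW_PATH)
    )


@dlt.table(comment="Silver: cleansed events with a basic quality check applied.")
@dlt.expect_or_drop("valid_event_id", "event_id IS NOT NULL")
def events_silver():
    # event_ts is a hypothetical column in the raw JSON.
    return dlt.read_stream("events_bronze").withColumn(
        "event_date", F.to_date("event_ts")
    )


@dlt.table(comment="Gold: business-level daily aggregates for BI.")
def events_gold():
    return (
        dlt.read("events_silver")
        .groupBy("event_date")
        .agg(F.count("*").alias("event_count"))
    )
```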
We regularly talk to customers who have\nbeen able to reduce pipeline development time from weeks — or months\n— to mere minutes with Delta Live Tables. And by the way, even data\n\n\nanalysts can easily interrogate DLT pipelines for the queries they need\nto run, without knowing any sort of specialized programming language\nor niche skills.\n\nOne of the top benefits of DLT, and Delta Lake in general, is that it is built\nwith streaming pipelines in mind. Today, the world operates in real time, and\nbusinesses are increasingly expected to analyze and respond to their data in\nreal time. With streaming data pipelines built on DLT, analysts can easily access,\nquery and analyze data with greater accuracy and actionability than with\nconventional batch processing. Delta Live Tables makes real-time analytics a\nreality for our customers.\n\n\n-----\n\n##### Fine-grained governance on the lakehouse\n\nDelta Lake is the foundation for open and secure [data sharing](https://databricks.com/blog/2021/05/26/introducing-delta-sharing-an-open-protocol-for-secure-data-sharing.html) and governance\non the lakehouse. It underpins the [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (in preview), which\nprovides fine-grained governance across clouds, data and ML assets. Among the\nbenefits of the Unity Catalog, it allows you to:\n\n**• Discover, audit and govern data assets in one place:** A user-friendly\ninterface, automated data lineage across tables, columns, notebooks,\nworkflows and dashboards, role-based security policies, table or\ncolumn-level tags, and central auditing capabilities make it easy for\ndata stewards to discover, manage and secure data access to meet\ncompliance and privacy needs directly on the lakehouse.\n\n\n\n**• Grant and manage permissions using SQL:** Unity Catalog brings finegrained centralized governance to data assets across clouds through the\nopen standard SQL DCL. This means database administrators can easily\ngrant permission to arbitrary, user-specific views, or set permissions on\nall columns tagged together, using familiar SQL.\n\n**• Centrally manage and audit shared data across organizations:** Every\norganization needs to share data with customers, partners and suppliers\nto better collaborate and to unlock value from their data. Unity Catalog\nbuilds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\nshared assets within and across organizations.\n\n\nThe Unity Catalog makes it easy for data stewards to discover, manage and secure data access\nto meet compliance and privacy needs on the lakehouse.\n\n**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n\n\n-----\n\n###### PART 2: COMPUTE LAYER\n\n\nThe next layer to look at is the compute layer, or how we process queries.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "e75286e60b2d30c5d866b93f23073de9", + "**• Centrally manage and audit shared data across organizations:** Every\norganization needs to share data with customers, partners and suppliers\nto better collaborate and to unlock value from their data. 
Unity Catalog\nbuilds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\nshared assets within and across organizations.\n\n\nThe Unity Catalog makes it easy for data stewards to discover, manage and secure data access\nto meet compliance and privacy needs on the lakehouse.\n\n**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n\n\n-----\n\n###### PART 2: COMPUTE LAYER\n\n\nThe next layer to look at is the compute layer, or how we process queries.\n\nApache Spark TM has been the de facto standard for data lake compute. It’s great\nfor processing terabytes and petabytes of data cheaply, but historically Spark\nSQL uses a nonstandard syntax and can be difficult to configure.\n\n\nData warehouses, on the other hand, tend to support short running queries\nreally well, especially when you have a lot of users issuing queries concurrently.\nThey tend to be easier to set up, but don’t necessarily scale or they become\ntoo costly.\n\n\n**Compute layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake High performance for large jobs (TBs to PBs)|Data Warehouse High concurrency|Data Lakehouse High performance for large jobs (TBs to PBs)|\n|---|---|---|\n|Economical|Scaling is exponentially more expensive|Economical|\n|High operational complexity|Ease of use|Ease of use|\n||||\n\n\nA popular belief is that large workloads require a drastically different system\nthan low latency, high concurrency workloads. For example, there’s the classic\ntrade-off in computer systems between latency and throughput.\n\nBut after spending a lot of time analyzing these systems, we found that it was\npossible to simultaneously improve large query performance and concurrency\n\n\nand latency. Although the classic trade-offs definitely existed, they were only\nexplicit when we optimized the system to the very theoretical optimal. It turned\nout the vast majority of software — and this includes all data warehouse systems\nand Databricks — were far away from optimal.\n\n\n-----\n\n##### Simplified administration and instant, elastic SQL compute — decoupled from storage\n\n\nTo achieve world-class performance for analytics on the lakehouse, we chose to\ncompletely rebuild the compute layer. But performance isn’t everything. We also\nwant it to be simple to administer and cheaper to use. Databricks SQL leverages\nserverless SQL warehouses that let you get started in seconds, and it’s powered\nby a new native MPP vectorized engine: Photon.\n\nDatabricks SQL warehouses are optimized and elastic SQL compute resources.\nJust pick the cluster size and Databricks automatically determines the best\ninstance types and VMs configuration for the best price/performance. This\nmeans you don’t have to worry about estimating peak demand or paying too\nmuch by overprovisioning. You just need to click a few buttons to operate.\nTo further streamline the experience, simply use [Databrick SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) .\nWith the serverless capability, queries start rapidly with zero infrastructure\nmanagement or configuration overhead. 
This lowers your total cost, as you pay\nonly for what you consume without idle time or overprovisioned resources.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "45c780a084216b7bdf44ffa987309653", + "To achieve world-class performance for analytics on the lakehouse, we chose to\ncompletely rebuild the compute layer. But performance isn’t everything. We also\nwant it to be simple to administer and cheaper to use. Databricks SQL leverages\nserverless SQL warehouses that let you get started in seconds, and it’s powered\nby a new native MPP vectorized engine: Photon.\n\nDatabricks SQL warehouses are optimized and elastic SQL compute resources.\nJust pick the cluster size and Databricks automatically determines the best\ninstance types and VMs configuration for the best price/performance. This\nmeans you don’t have to worry about estimating peak demand or paying too\nmuch by overprovisioning. You just need to click a few buttons to operate.\nTo further streamline the experience, simply use [Databrick SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) .\nWith the serverless capability, queries start rapidly with zero infrastructure\nmanagement or configuration overhead. This lowers your total cost, as you pay\nonly for what you consume without idle time or overprovisioned resources.\n\n\nSince CPU clock speeds have plateaued, we also wanted to find new ways to\nprocess data faster, beyond raw compute power. One of the most impactful\nmethods has been to improve the amount of data that can be processed in\nparallel. However, data processing engines need to be specifically architected to\ntake advantage of this parallelism. So, from the ground up, we built [Photon](https://databricks.com/product/photon) , a new\nC++ based vectorized query processing engine that dramatically improves query\nperformance while remaining fully compatible with open Spark APIs. Databricks\nSQL warehouses are powered by Photon, which seamlessly coordinates work and\nresources and transparently accelerates portions of your SQL queries directly on\nyour data lake. No need to move the data to a data warehouse.\n\n**[READ NOW](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)**\n##### Photon: A Fast Query Engine for Lakehouse Systems\n\n[SIGMOD 2022 Best Industry Paper Award](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)\n\n\n-----\n\n**Did you know?**\n\nDatabricks SQL warehouses scale automatically throughout the day to\nbetter suit your business needs. Administration is simplified by identifying\nhow many clusters can scale out with min and max, and Databricks SQL will\nauto-scale as needed. This ensures that you have ample compute to serve\nyour needs, without overprovisioning. Administrators appreciate the ability\nto have better control over consumption costs, while users appreciate that\ntheir queries process as fast and efficiently as possible. 
For most BI and\nanalytics use cases, using medium-size warehouses with scaling is a great\nbalance of price/performance that fits most business needs.\n\nIn the next section, we will discuss examples of Databricks SQL performance results\non large-scale analytic workloads as well as highly concurrent workloads.\n\n\nRunning Scheduled Starting Cluster Scale\n\n\n-----\n\n##### Large query performance: the fastest data warehouse\n\n\nThe industry standard benchmark used by data warehouses is TPC-DS. It includes\n100 queries that range from very simple to very sophisticated to simulate decision\nsupport workloads. This benchmark was created by a committee formed by\ndata warehousing vendors. The chart at right shows price/performance results\nrunning the 100TB version of TPC-DS, since for large workloads the numbers that\nultimately matter pertain to the performance cost. As you can see, Databricks SQL\noutperforms all cloud data warehouses we have measured.\n\n**[LEARN MORE](https://dbricks.co/benchmark)**\n\n**Did you know?**\n\n\n**$2,000**\n\n**$1,791**\n\n**$1,500**\n\n**$1,000**\n\n**$952**\n\n\n**$500**\n\n\n**$242**\n**$146**\n\n\n**$358**\n\n\n**$0**\nDatabricks SQL Databricks SQL Cloud Data Cloud Data Cloud Data\nSpot On-Demand Warehouse 1 Warehouse 2 Warehouse 3", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "63b9d1e5fa8c663ed4247156be83c2aa", + "**[LEARN MORE](https://dbricks.co/benchmark)**\n\n**Did you know?**\n\n\n**$2,000**\n\n**$1,791**\n\n**$1,500**\n\n**$1,000**\n\n**$952**\n\n\n**$500**\n\n\n**$242**\n**$146**\n\n\n**$358**\n\n\n**$0**\nDatabricks SQL Databricks SQL Cloud Data Cloud Data Cloud Data\nSpot On-Demand Warehouse 1 Warehouse 2 Warehouse 3\n\nSystem\n\n100TB TPC-DS price/performance benchmark (lower is better).\n\n\nDatabricks SQL has set a [new world record in](http://tpc.org/5013)\n[100TB TPC-DS](http://tpc.org/5013) , the gold standard performance\nbenchmark for data warehousing. Databricks\nSQL outperformed the previous record by 2.2x.\nAnd this result has been formally audited and\nreviewed by the TPC council.\n\n\n-----\n\n##### Highly concurrent analytics workloads\n\nBeyond large queries, it is also common for highly concurrent analytics workloads\nto execute over small data sets. To optimize concurrency, we used the same\nTPC-DS benchmark, but on a much smaller scale (10GB) and with 32 concurrent\nstreams. We analyzed the results to identify and remove bottlenecks, and\nbuilt hundreds of optimizations to improve concurrency. Databricks SQL now\noutperforms some of the best cloud data warehouses for both large queries and\nsmall queries with lots of users.\n\nReal-world workloads, however, are not just about either large or small queries.\nDatabricks SQL also provides intelligent workload management with a dual\nqueuing system and highly parallel reads.\n\n\n16,523\n\n12,248\n\n###### ~3X\n\n4,672\n\n\n11,690\n\n\nJuly 2020\n\n\nJan 2021 Oct 2022\n\n\nCLOUD DW X SQL WAREHOUSE X - L SIZE\n\n10GB TPC-DS queries/hr at 32 concurrent streams (higher is better).\n\n\n-----\n\n##### Intelligent workload management with smart queuing system\n\nReal-world workloads typically include a mix of small and large queries. Therefore\nthe smart queuing and load balancing capabilities of Databricks SQL need to\naccount for that too. 
Databrick SQL uses a smart dual queuing system (in preview)\nthat prioritizes small queries over large, as analysts typically care more about the\nlatency of short queries than large ones.\n\n\n##### Highly parallel reads with improved I/O performance\n\nIt is common for some tables in a lakehouse to be composed of many files — for\nexample, in streaming scenarios such as IoT ingest when data arrives continuously.\nIn legacy systems, the execution engine can spend far more time listing these\nfiles than actually executing the query. Our customers told us they do not want to\nsacrifice performance for data freshness. With async and highly parallel I/O, when\nexecuting a query, Databricks SQL now automatically reads the next blocks of data\nfrom cloud storage while the current block is being processed. This considerably\nincreases overall query performance on small files (by 12x for 1MB files) and “cold\ndata” (data that is not cached) use cases as well.\n\n**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**\n\n\n-----\n\n###### PART 3: CONSUMPTION LAYER\n\n\nThe third layer of the Databricks Lakehouse Platform would similarly have to bridge\nthe best of both data lakes and data warehouses. In the lakehouse, you would\nhave to be able to work seamlessly with your tools of choice — whether you are a\nbusiness analyst, data scientist, or ML or data engineer.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "fc23420138a8b084f33c275ba7f42a96", + "**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**\n\n\n-----\n\n###### PART 3: CONSUMPTION LAYER\n\n\nThe third layer of the Databricks Lakehouse Platform would similarly have to bridge\nthe best of both data lakes and data warehouses. In the lakehouse, you would\nhave to be able to work seamlessly with your tools of choice — whether you are a\nbusiness analyst, data scientist, or ML or data engineer.\n\n\nThe lakehouse must treat Python, Scala, R and SQL programming languages\nand ecosystems as first-class citizens to truly unify data engineering, ML and BI\nworkloads in one place.\n\n\n**Consumption layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake Notebooks (great for data scientists)|Data Warehouse Lack of support for data science/ML|Data Lakehouse Notebooks (great for data scientists)|\n|---|---|---|\n|Openness with rich ecosystem (Python, R, Scala)|Limited to SQL only|Openness with rich ecosystem (Python, R, Scala)|\n|BI/SQL not 1st-class citizen|BI/SQL 1st-class citizen|BI/SQL 1st-class citizen|\n||||\n\n\n-----\n\n##### A platform for your tools of choice\n\n\nAt Databricks we believe strongly in open platforms and meeting our customers where they are. We work very\nclosely with a large number of software vendors to make sure you can easily use your tools of choice\non Databricks, like [Tableau](https://databricks.com/blog/2021/05/07/improved-tableau-databricks-connector-with-azure-ad-authentication-support.html) , [Power BI](https://databricks.com/blog/2021/02/26/announcing-general-availability-ga-of-the-power-bi-connector-for-databricks.html) or [dbt](https://databricks.com/blog/2021/12/06/deploying-dbt-on-databricks-just-got-even-simpler.html) . 
With [Partner Connect](https://www.databricks.com/partnerconnect) , it’s easier than ever to connect with\nyour favorite tools, easier to get data in, easier to authenticate using single sign-on, and of course, with all the\nconcurrency and performance improvements, we make sure that the direct and live query experience is great.\n\n\n**Now more than ever, organizations**\n\n**need a data strategy that enables**\n\n**speed and agility to be adaptable.**\n\n**As organizations are rapidly moving**\n\n**their data to the cloud, we’re**\n\n**seeing growing interest in doing**\n\n**analytics on the data lake. The**\n\n**introduction of Databricks SQL**\n\n**delivers an entirely new experience**\n\n**for customers to tap into insights**\n\n**from massive volumes of data with**\n\n**the performance, reliability and**\n\n**scale they need. We’re proud to**\n\n**partner with Databricks to bring**\n\n**that opportunity to life.**\n\n**Francois Ajenstat**\nChief Product Officer, Tableau\n\n\n+ Any other Apache Spark-compatible client\n\n\n-----\n\n##### Faster BI results retrieval with Cloud Fetch\n\nOnce query results are computed, cloud data warehouses often collect and\nstream back results to BI clients on a single thread. This can create a bottleneck\nand greatly slows down the experience if you are fetching anything more than a\nfew megabytes of results in size. To provide analysts with the best experience\nfrom their favorite BI tools, we also needed to speed up how the system delivers\nresults to BI tools like Power BI or Tableau once computed.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "3cdeda5682e63f2b2ccf6103b25dc299", + "**delivers an entirely new experience**\n\n**for customers to tap into insights**\n\n**from massive volumes of data with**\n\n**the performance, reliability and**\n\n**scale they need. We’re proud to**\n\n**partner with Databricks to bring**\n\n**that opportunity to life.**\n\n**Francois Ajenstat**\nChief Product Officer, Tableau\n\n\n+ Any other Apache Spark-compatible client\n\n\n-----\n\n##### Faster BI results retrieval with Cloud Fetch\n\nOnce query results are computed, cloud data warehouses often collect and\nstream back results to BI clients on a single thread. This can create a bottleneck\nand greatly slows down the experience if you are fetching anything more than a\nfew megabytes of results in size. To provide analysts with the best experience\nfrom their favorite BI tools, we also needed to speed up how the system delivers\nresults to BI tools like Power BI or Tableau once computed.\n\nThat’s why we’ve reimagined this approach with a new architecture called\n[Cloud Fetch](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html) . For large results, Databricks SQL now writes results in parallel across\nall of the compute nodes to cloud storage, and then sends the list of files using\npre-signed URLs back to the client. The client then can download in parallel\nall the data from cloud storage. 
This approach provides up to 10x performance\nimprovement in real-world scenarios.\n\n\nparallel\ndata\ntransfers\n\n\nCloud Storage\n\n**Cluster**\n\n\nSQL Endpoint\n\n\nCUSTOMER BENCHMARK\nTABLEAU EXTRACT\n\n\nCloud Fetch enables faster, higher-bandwidth connectivity to and from your BI tools.\n**[LEARN MORE](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html)**\n\n\n-----\n\n##### A first-class SQL development experience\n\nIn addition to supporting your favorite tools, we\nare also focused on providing a native first-class\nSQL development experience. We’ve talked to\nhundreds of analysts using various SQL editors\nlike SQL Workbench every day, and worked with\nthem to provide the dream set of capabilities\nfor SQL development.\n\nFor example, Databricks SQL now supports\n[standard ANSI SQL](https://databricks.com/blog/2021/11/16/evolution-of-the-sql-language-at-databricks-ansi-standard-by-default-and-easier-migrations-from-data-warehouses.html) , so you don’t need to learn a\nspecial SQL dialect. Query tabs allow you to work\non multiple queries at once, autosave gives you\npeace of mind so you never have to worry about\nlosing your drafts, integrated history lets you\neasily look at what you have run in the past, and\nintelligent auto-complete understands subqueries\nand aliases for a delightful experience.\n\n\nThe built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n\n\n-----\n\nFinally, with Databricks SQL, analysts can easily\nmake sense of query results through a wide variety\nof rich visualizations and quickly build dashboards\nwith an intuitive drag-and-drop interface. To keep\neveryone current, dashboards can be shared and\nconfigured to automatically refresh, as well as to\nalert the team to meaningful changes in the data.\n\n\nEasily combine visualizations to build rich dashboards that can be shared with stakeholders.\n\n\n-----\n\n### Conclusion\n\nDatabricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\ninto actionable data, combining the flexibility and openness of data lakes\nwith the reliability and performance of data warehouses. The Unity Catalog\nprovides fine-grained governance on the lakehouse across all clouds using\none friendly interface and standard SQL.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "cf0057d81b5e7a1c0fc04dad428e50f3", + "The built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n\n\n-----\n\nFinally, with Databricks SQL, analysts can easily\nmake sense of query results through a wide variety\nof rich visualizations and quickly build dashboards\nwith an intuitive drag-and-drop interface. To keep\neveryone current, dashboards can be shared and\nconfigured to automatically refresh, as well as to\nalert the team to meaningful changes in the data.\n\n\nEasily combine visualizations to build rich dashboards that can be shared with stakeholders.\n\n\n-----\n\n### Conclusion\n\nDatabricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\ninto actionable data, combining the flexibility and openness of data lakes\nwith the reliability and performance of data warehouses. 
The Unity Catalog\nprovides fine-grained governance on the lakehouse across all clouds using\none friendly interface and standard SQL.\n\nDatabricks SQL also holds the [new world record in 100TB TPC-DS](https://dbricks.co/benchmark) , the gold\nstandard performance benchmark for data warehousing. It is powered by\nPhoton, the new vectorized query engine for the lakehouse, and by SQL\nwarehouses for instant, elastic compute decoupled from storage.\n\nFinally, Databricks SQL offers a native first-class SQL development\nexperience, with a built-in SQL editor, rich visualizations and dashboards,\nand integrates seamlessly with your favorite BI- and SQL-based tools for\nmaximum productivity.\n\n\nDatabricks SQL under the hood.\n\n\n-----\n\n### Atlassian\n\n\nAtlassian is a leading provider of collaboration, development and issue-tracking\n\nsoftware for teams. With over 150,000 global customers (including 85 of the Fortune\n\n100), Atlassian is advancing the power of collaboration with products including Jira,\n\nConfluence, Bitbucket, Trello and more.\n\nUSE CASE\n\nAtlassian uses the Databricks Lakehouse Platform to democratize data across the enterprise and drive\ndown operational costs. Atlassian currently has a number of use cases focused on putting the\ncustomer experience at the forefront.\n\n**Customer support and service experience**\nWith the majority of their customers being server-based (using products like Jira and Confluence),\nAtlassian set out to move those customers into the cloud to leverage deeper insights that enrich the\ncustomer support experience.\n\n**Marketing personalization**\nThe same insights could also be used to deliver personalized marketing emails to drive\nengagement with new features and products.\n\n**Anti-abuse and fraud detection**\nThey can predict license abuse and fraudulent behavior through anomaly detection and\npredictive analytics.\n\n\n-----\n\nSOLUTION AND BENEFITS\n\nAtlassian is using the Databricks Lakehouse Platform to enable data democratization at scale, both internally\nand externally. They have moved from a data warehousing paradigm to standardization on Databricks,\nenabling the company to become more data driven across the organization. Over 3,000 internal users in\nareas ranging from HR and marketing to finance and R&D — more than half the organization — are accessing\ninsights from the platform on a monthly basis via open technologies like Databricks SQL. 
Atlassian is also\nusing the platform to drive more personalized support and service experiences to their customers.\n\n**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\nfinance, sales, support and R&D\n\n**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n\n**•** MLflow streamlines MLOps for faster delivery\n\n**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n\nWith cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\naccess all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\nAlready the company has:\n\n**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\njobs from EMR to Databricks with minimal effort and low-code change\n\n**•** Decreased delivery time by 30% with shorter dev cycles", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "8660338082a59dba0603f43ca7e8a09b", + "**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\nfinance, sales, support and R&D\n\n**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n\n**•** MLflow streamlines MLOps for faster delivery\n\n**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n\nWith cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\naccess all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\nAlready the company has:\n\n**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\njobs from EMR to Databricks with minimal effort and low-code change\n\n**•** Decreased delivery time by 30% with shorter dev cycles\n\n**•** Reduced data team dependencies by 70% with more self-service enabled throughout the organization\n\n**[LEARN MORE](https://www.youtube.com/watch?v=Xo1U617T-mU)**\n\n\n**At Atlassian, we need to ensure**\n**teams can collaborate well**\n**across functions to achieve**\n**constantly evolving goals. A**\n**simplified lakehouse architecture**\n**would empower us to ingest high**\n**volumes of user data and run the**\n**analytics necessary to better**\n**predict customer needs and**\n**improve the experience of our**\n**customers. A single, easy-to-use**\n**cloud analytics platform allows**\n**us to rapidly improve and build**\n**new collaboration tools based on**\n**actionable insights.**\n\n**Rohan Dhupelia**\nData Platform Senior Manager, Atlassian\n\n\n-----\n\n### ABN AMRO\n\n\nAs an established bank, ABN AMRO wanted to modernize their business but were hamstrung\n\nby legacy infrastructure and data warehouses that complicated access to data across various\n\nsources and created inefficient data processes and workflows. 
Today, Azure Databricks\n\nempowers ABN AMRO to democratize data and AI for a team of 500+ empowered engineers,\n\nscientists and analysts who work collaboratively on improving business operations and\n\nintroducing new go-to-market capabilities across the company.\n\nUSE CASE\n\nABN AMRO uses the Databricks Lakehouse Platform to deliver financial services transformation on a global scale,\nproviding automation and insight across operations.\n\n**Personalized finance**\nABN AMRO leverages real-time data and customer insights to provide products and services tailored to\ncustomers’ needs. For example, they use machine learning to power targeted messaging within their automated\nmarketing campaigns to help drive engagement and conversion.\n\n**Risk management**\nUsing data-driven decision-making, they are focused on mitigating risk for both the company and their\ncustomers. For example, they generate reports and dashboards that internal decision makers and leaders use to\nbetter understand risk and keep it from impacting ABN AMRO’s business.\n\n**Fraud detection**\nWith the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\nimpacts their customers. Among the activities they’re trying to address are money laundering and fake credit\ncard applications.\n\n\n-----\n\nSOLUTION AND BENEFITS\n\nToday, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\nscientists and analysts who work collaboratively on improving business operations and introducing new\ngo-to-market capabilities across the company.\n\n**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\ndownstream analytics\n\n**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\nthrough reports and dashboards\n\n**•** MLflow speeds deployment of new models that improve the customer experience — with new use\ncases delivered in under two months", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "7d67332c1f511a82e62065fbbbd13d4f", + "**Fraud detection**\nWith the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\nimpacts their customers. Among the activities they’re trying to address are money laundering and fake credit\ncard applications.\n\n\n-----\n\nSOLUTION AND BENEFITS\n\nToday, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\nscientists and analysts who work collaboratively on improving business operations and introducing new\ngo-to-market capabilities across the company.\n\n**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\ndownstream analytics\n\n**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\nthrough reports and dashboards\n\n**•** MLflow speeds deployment of new models that improve the customer experience — with new use\ncases delivered in under two months\n\n\n**Databricks has changed the way**\n**we do business. 
It has put us in**\n**a better position to succeed in**\n**our data and AI transformation**\n**as a company by enabling data**\n**professionals with advanced data**\n**capabilities in a controlled and**\n**scalable way.**\n\n**Stefan Groot**\nHead of Analytics Engineering,\nABN AMRO\n\n\n#### 10x faster\n\ntime to market — use cases\ndeployed in two months\n\n\n#### 100+ \n\nuse cases to be delivered\nover the coming year\n\n\n#### 500+\n\nempowered business\nand IT users\n\n\n**[LEARN MORE](https://databricks.com/customers/abn-amro)**\n\n\n-----\n\n### SEGA Europe\n\n**Improving the player experience**\n\n# “ is at the heart of everything\n\n**we do, and we very much**\n**see Databricks as a key**\n**partner, supporting us to drive**\n**forward the next generation of**\n**community gaming.**\n\n**Felix Baker**\nData Services Manager, SEGA Europe\n\n\nSEGA® Europe, the worldwide leader in interactive entertainment, is using the Databricks\n\nLakehouse Platform to personalize the player experience and build its own machine\n\nlearning algorithm to help target and tailor games for over 30 million of its customers.\n\nAs housebound gamers looked to pass the time during the first lockdowns of 2020, some SEGA Europe\ntitles, including Football Manager,™ saw over double the number of sales during the first lockdown\ncompared to the year before. Furthermore, a number of SEGA titles experienced a more than 50% increase\nin players over the course of the COVID-19 pandemic. With more anonymized data being collected through\nan analytics pipeline than ever before, the team needed a dedicated computing resource to handle the\nsheer volume of data, extract meaningful insights from it and enable the data science team to improve\ngeneral workflow.\n\n**[LEARN MORE](https://www.youtube.com/watch?v=SzeXHcwPDSE)**\n\n\n-----\n\n### About Databricks\n\nDatabricks is the lakehouse company. More than 7,000 organizations\n\nworldwide — including Comcast, Condé Nast and over 50% of the\n\nFortune 500 — rely on the Databricks Lakehouse Platform to unify their\n\ndata, analytics and AI. Databricks is headquartered in San Francisco,\n\nwith offices around the globe. Founded by the original creators of\n\nApache Spark, TM Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. To learn more, follow\n\nDatabricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo\n**databricks.com/contact**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "d9097f0ec6dc83ab4e34c5641e99f8c6", + "worldwide — including Comcast, Condé Nast and over 50% of the\n\nFortune 500 — rely on the Databricks Lakehouse Platform to unify their\n\ndata, analytics and AI. Databricks is headquartered in San Francisco,\n\nwith offices around the globe. Founded by the original creators of\n\nApache Spark, TM Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. 
To learn more, follow\n\nDatabricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "87391e7b07aace450580645213c6e700", + "# Big Book of Data and AI Use Cases for the Public Sector\n\n### Best practices, customer stories and solution templates for government agencies interested in building on the Lakehouse\n\n\n-----\n\n## Contents\n\nThe State of Data and AI in the Government .......................................................................................... 3\n\nThe Need for a Modern Data Architecture ............................................................................................. 5\n\nIntroducing the Lakehouse for Public Sector ......................................................................................... 6\n\n**U S E C A S E :** Cybersecurity ........................................................................................................................... 9\n\n**U S E C A S E :** Predictive Maintenance .......................................................................................................... 12\n\n**U S E C A S E :** Fraud Detection ....................................................................................................................... 15\n\n**U S E C A S E :** Money Laundering ................................................................................................................. 17", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "304d79573a7bc185c33234b5e7fa334e", + "**U S E C A S E :** Money Laundering ................................................................................................................. 17\n\n**U S E C A S E :** Entity Analytics ...................................................................................................................... 19\n\n**U S E C A S E :** Geospatial Analytics .............................................................................................................. 21\n\n**U S E C A S E :** Public Health Management .................................................................................................. 24\n\nConclusion ................................................................................................................................................. 26\n\n\n-----\n\n## The State of Data and AI in the Government\n\n###### Over the last decade, data and AI have redefined every industry on the planet. Retailers have improved the shopping experience with personalized recommendations, financial institutions have strengthened risk management through the use of advanced analytics, and the healthcare industry is tapping into the power of machine learning to predict and prevent chronic disease. The public sector is no exception.\n\n\nIn 2018, the U.S. 
Federal Government embarked on one of its most ambitious\nefforts since putting a man on the moon — embedding data into all aspects of\ndecision-making. By enacting the Evidence-Based Policymaking Act of 2018,\nCongress set in motion requirements for agencies to modernize their data and\nanalytics capabilities, including the appointment of agency-level chief data\nofficers. A year later came the Federal Data Strategy, which provided further\nguidance for how agencies should manage and use data by 2030.\n\n\nWith all of this guidance, agencies are starting to make meaningful improvements\nto their data strategy, but when it comes to innovating with data, agencies still\nlag behind the private sector. This begs the question: what’s standing in the way?\nThe hurdles aren’t due to a lack of effort on the part of agency leaders. In fact,\nthey can largely be attributed to a patchwork of legacy technologies that have\nbeen amassed over the last 30 to 40 years. While these hurdles stand in the\nway, a number of innovative agencies are making significant progress as they\nembrace new data and AI capabilities.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "4885518b1e47e463d264c834140a6756", + "In 2018, the U.S. Federal Government embarked on one of its most ambitious\nefforts since putting a man on the moon — embedding data into all aspects of\ndecision-making. By enacting the Evidence-Based Policymaking Act of 2018,\nCongress set in motion requirements for agencies to modernize their data and\nanalytics capabilities, including the appointment of agency-level chief data\nofficers. A year later came the Federal Data Strategy, which provided further\nguidance for how agencies should manage and use data by 2030.\n\n\nWith all of this guidance, agencies are starting to make meaningful improvements\nto their data strategy, but when it comes to innovating with data, agencies still\nlag behind the private sector. This begs the question: what’s standing in the way?\nThe hurdles aren’t due to a lack of effort on the part of agency leaders. In fact,\nthey can largely be attributed to a patchwork of legacy technologies that have\nbeen amassed over the last 30 to 40 years. While these hurdles stand in the\nway, a number of innovative agencies are making significant progress as they\nembrace new data and AI capabilities.\n\n\n-----\n\nFederal spending on artificial intelligence rose to [nearly $1 billion](https://www.federaltimes.com/thought-leadership/2021/09/28/why-the-government-market-for-artificial-intelligence-technology-is-expanding/) in 2020, up\n50% from 2018. There’s a good reason for this level of spend: Deloitte recently\npublished a report, “AI-augmented Government,” that estimates the federal\ngovernment could free up as many as 1.2 billion hours of work and save up to\n$41.1 billion annually through the use of AI-driven automation. Early adopters\nof advanced analytics are starting to see the fruits of their labor. For example,\n[USCIS modernized their analytics stack](https://databricks.com/customers/uscis) on Databricks to accelerate insights\non applicants by 24x, automate the processing of millions of applications,\nand reduce appointment no-show rates with predictive analytics. 
The [Orange](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/)\n[County Courts](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/) also recently shared how they are automating legacy paperbased workflows with machine learning.\n\nIn this eBook, we explore the hurdles of legacy technologies and how a modern\ndata lakehouse can help agencies unlock innovative data and analytics use cases\nat all levels of government. Over the following seven example use cases, covering\neverything from cyber threat detection to improving public health,\n\n\n**An increased focus on cloud, analytics and AI = operational efficiency**\n\n1. AI/ML\n2. Data Analytics\n3. Cloud\n\n**$1B** **TOP PRIORITIES** **$41B+**\n\nData and AI Research and Government CIOs’ top Estimated government\nDevelopment Initiative game-changing technologies savings from data-driven\nautomation\n\n**U.S. Government**\n\nwe demonstrate how the Databricks Lakehouse for Public Sector is critical to\nimproving citizen services and delivering on mission objectives. This guide also\nincludes resources in the form of Solution Accelerators, reference architectures\nand real-world customer stories to help as you embark on your own journey to\ndrive a safer and more prosperous nation through the use of data and AI.\n\n\n-----\n\n## The Need for a Modern Data Architecture\n\n###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decisionmaking with predictive insights that can better keep pace with the dynamic needs of citizens and global communities. That being said, there are a number of barriers standing in their way.\n\n##### Common challenges\n\n\nMany government agencies are burdened with a legacy IT infrastructure that is\nbuilt with on-premises data warehouses that are complex to maintain, are costly\nto scale as compute is coupled with storage, and lack support for unstructured\ndata and advanced analytics. This severely inhibits data-driven innovation.\nMaintaining these systems requires a massive investment of both time and\nmoney compared to modern cloud-based systems and creates a number of\navoidable challenges:", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "e75d43566d9248c2dedde1ab3be69747", + "-----\n\n## The Need for a Modern Data Architecture\n\n###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decisionmaking with predictive insights that can better keep pace with the dynamic needs of citizens and global communities. 
That being said, there are a number of barriers standing in their way.\n\n##### Common challenges\n\n\nMany government agencies are burdened with a legacy IT infrastructure that is\nbuilt with on-premises data warehouses that are complex to maintain, are costly\nto scale as compute is coupled with storage, and lack support for unstructured\ndata and advanced analytics. This severely inhibits data-driven innovation.\nMaintaining these systems requires a massive investment of both time and\nmoney compared to modern cloud-based systems and creates a number of\navoidable challenges:\n\n\ngovernment is often done in weekly or daily batches, but decision-making\nneeds to happen in real time. Critical events like cyber attacks and health\npandemics can’t wait a week.\n\n**Lack of citizen insights**\n\nWhen data is siloed, teams get an incomplete view of the citizen,\nresulting in missed opportunities to improve the delivery of services that\nimpact the quality of life for their constituents.\n\n\n**Lack of reliability**\n\n\nSiloed systems result in data replication as teams spin up new data marts\nto support their one-off use cases. Without a single source of truth, teams\nstruggle with data inconsistencies, which can result in inaccurate analysis\nand model performance that is only compounded over time.\n\n**Lack of agility**\n\nDisjointed analytics tools and legacy infrastructure hinder the ability of\nteams to conduct real-time analytics. Most data processing in the\n\n\n**Lack of productivity**\n\nData scientists and data analysts alike must have the right tool set to\ncollaboratively investigate, extract and report meaningful insights from\ntheir data. Unfortunately, data silos lead to organizational silos, which make\ncollaboration inside an agency as well as between agencies very difficult.\nWith different groups of data teams leveraging their own coding and\nanalytical tools, communicating insights and working across teams —\nlet alone across agencies — is almost impossible. This lack of collaboration\ncan drastically limit the capabilities of any data analytics or AI initiative.\n\n\n-----\n\n## Introducing the Lakehouse for Public Sector\n\n\nThe reason that the Databricks Lakehouse is\nable to deliver the simplicity, flexibility and\nspeed that a government agency requires is\nthat it fundamentally reimagines the modern\ndata architecture. Databricks provides federal,\nstate and local agencies with a cloud-native\nLakehouse Platform that combines the best\nof data warehouses and data lakes — to store\nand manage all your data for all your analytics\nworkloads. With this modern architecture,\nagencies can federate all their data and\ndemocratize access for downstream use\ncases, empowering their teams to deliver on\ntheir mission objectives by unlocking the full\npotential of their data.\n\n\n**Delivering real-time data insight in support of the mission**\n\n- Fraud, Waste & Abuse\n\n- Cybersecurity\n\n- Medicaid Dashboards &\nReporting\n\n- Process Improvement\n\n- Predictive Maintenance\n\n- SCM & Demand Forecasting\n\n- Smart Military/Censor Data\n\n- Military Heatlh\n\n- COVID Response/Decision\nSupport\n\n- Smart Cities/Connected\nVehicles\n\n- Citizen Engagement\n\n- Data-Driven Decision-Making\n\n\n-----\n\n**Federate all of your agency’s data**\n\nAny type of data can be stored because, like a data lake, the Databricks\nLakehouse is built using the low-cost object storage supported by cloud\nproviders. 
Leveraging this capability helps break down the data silos that\nhinder efforts to aggregate data for advanced analytics (e.g., predictive\nmaintenance) or compute-intensive workloads like detecting cyber\nthreats across billions of signals. Probably even more important is the\nability of the lakehouse architecture to travel back in time, ensuring full\naudit compliance and high governance standards for analytics and AI.\n\n**Power real-time decision-making**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "33f711ff444a761a534ff12f1fc7e8ed", + "- Medicaid Dashboards &\nReporting\n\n- Process Improvement\n\n- Predictive Maintenance\n\n- SCM & Demand Forecasting\n\n- Smart Military/Censor Data\n\n- Military Heatlh\n\n- COVID Response/Decision\nSupport\n\n- Smart Cities/Connected\nVehicles\n\n- Citizen Engagement\n\n- Data-Driven Decision-Making\n\n\n-----\n\n**Federate all of your agency’s data**\n\nAny type of data can be stored because, like a data lake, the Databricks\nLakehouse is built using the low-cost object storage supported by cloud\nproviders. Leveraging this capability helps break down the data silos that\nhinder efforts to aggregate data for advanced analytics (e.g., predictive\nmaintenance) or compute-intensive workloads like detecting cyber\nthreats across billions of signals. Probably even more important is the\nability of the lakehouse architecture to travel back in time, ensuring full\naudit compliance and high governance standards for analytics and AI.\n\n**Power real-time decision-making**\n\nStreaming use cases such as IoT analytics or disease spread tracking is\nsimpler to support because the lakehouse uses Apache Spark TM as the\ndata processing engine and Delta Lake as a storage layer. With Spark,\nyou can toggle between batch and streaming workloads with just a line\nof code. With Delta Lake, native support for ACID transactions means\nthat you can deploy streaming workloads without the overhead of\ncommon reliability and performance issues. These capabilities make\nreal-time analytics possible.\n\n\n**Unlock collaborative analytics for all personas**\n\nThe Databricks Lakehouse for Public Sector is your one-stop shop for\nall your analytics and AI. The platform includes a business intelligence\ncapability — Databricks SQL — that empowers data analysts to query and run\nreports against all of an agency’s unified data. Databricks SQL integrates with\nBI tools like Tableau and Microsoft Power BI and complements any existing BI\ntools with a SQL-native interface, allowing data analysts and data scientists\nto query data directly within Databricks and build powerful dashboards.\n\n\n-----\n\n**Deliver on your mission with predictive insights**\nIn the same environment, data scientists can build, share and collaborate\non machine learning models for advanced use cases like fraud detection\nor geospatial analytics. Additionally, MLflow, an open source toolkit for\nmanaging the ML lifecycle, is built into the Lakehouse so data scientists\ncan manage everything in one place. Databricks natively supports Python,\nR, SQL and Scala so practitioners can work together with the languages and\nlibraries of their choice, reducing the need for separate tools. With these\ncapabilities, data teams can turn insights from real-world data into powerful\nvisualizations designed for machine learning. 
Visualizations can then be\nturned into interactive dashboards to share insights with peers across\nagencies, policymakers, regulators and decision-makers.\n\n\n##### Customers That Innovate With Databricks Lakehouse for Public Sector\n\nSome of the top government agencies in the world turn to the\nDatabricks Lakehouse for Public Sector to bring analytics and AI-driven\nautomation and innovation to the communities they serve.\n\n\n-----\n\n###### USE CASE:\n## Cybersecurity\n\n##### Overview\n\n\n**Limited window of data**\nGiven the high cost of storage, most agencies retain only a few weeks of threat\ndata. This can be a real problem in scenarios where a perpetrator gains access\nto a network but waits months before doing anything malicious. Without a long\nhistorical record, security teams can’t analyze cyberattacks over long-term\nhorizons or conduct deep forensic reviews.\n\n##### Solution overview\n\nFor government agencies that are ready to modernize their security data\ninfrastructure and analyze data at petabyte-scale more cost-effectively,\nDatabricks provides an open lakehouse platform that augments existing SIEMs\nto help democratize access to data for downstream analytics and AI. Built\non Apache Spark and Delta Lake, Databricks is optimized to process large\nvolumes of streaming and historic data for real-time threat analysis and incident\nresponse. Security teams can query threat data going years into the past in just\nminutes and build ML models to detect new threat patterns and reduce false\npositives. Additionally, Databricks created a Splunk-certified add-on to augment\nSplunk for Enterprise Security (ES) for cost-efficient log and retention expansion.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "6750f72ff74bd02db319754a9aeabef5", + "**Limited window of data**\nGiven the high cost of storage, most agencies retain only a few weeks of threat\ndata. This can be a real problem in scenarios where a perpetrator gains access\nto a network but waits months before doing anything malicious. Without a long\nhistorical record, security teams can’t analyze cyberattacks over long-term\nhorizons or conduct deep forensic reviews.\n\n##### Solution overview\n\nFor government agencies that are ready to modernize their security data\ninfrastructure and analyze data at petabyte-scale more cost-effectively,\nDatabricks provides an open lakehouse platform that augments existing SIEMs\nto help democratize access to data for downstream analytics and AI. Built\non Apache Spark and Delta Lake, Databricks is optimized to process large\nvolumes of streaming and historic data for real-time threat analysis and incident\nresponse. Security teams can query threat data going years into the past in just\nminutes and build ML models to detect new threat patterns and reduce false\npositives. Additionally, Databricks created a Splunk-certified add-on to augment\nSplunk for Enterprise Security (ES) for cost-efficient log and retention expansion.\n\n\nCyberattacks from bad actors and nation states are a huge and growing threat\nto government agencies. Recent large-scale attacks like the ones on SolarWinds,\nlog4j, Colonial Pipeline and HAFNIUM highlight the sophistication and increasing\nfrequency of broad-reaching cyberattacks. Data breaches cost the federal\ngovernment more than $4 million per incident in 2021 and threaten national\nsecurity. 
Staying ahead of the next threat requires continuous monitoring of\nsecurity data from an agency’s entire attack surface before, during and after\nan incident.\n\n##### Challenges\n\n**Scaling existing SIEM solutions**\nAgencies looking to expand existing SIEM tools for today’s petabytes of data can\nexpect increased licensing, storage, compute and integration resources resulting\nin tens of millions of dollars in additional costs per year.\n\n**Rules-based systems**\nMany legacy SIEM tools lack the critical analytics capabilities — such as\nadvanced analytics, graph processing and machine learning — needed to detect\nunknown threat patterns or deliver on a broader set of security use cases like\nbehavioral analytics.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator: Detect Criminal](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n[Threats Using DNS Analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n\nDetecting criminals and nation states through DNS analytics. In order to address\ncommon cybersecurity challenges such as deployment complexity, tech\nlimitation and cost, security teams need a real-time data analytics platform that\ncan handle cloud scale, analyze data wherever it is, natively support streaming\nand batch analytics, and have collaborative content development capabilities.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=5BRGqxq4iQw)**\n\n**Fighting Cyber Threats in Real Time**\nSince partnering with Databricks, HSBC has reduced costs, accelerated threat\ndetection and response, and improved their security posture. Not only can\nthey process all of their required data, but they’ve also increased online query\nretention from just days to months at petabyte scale. HSBC is now able to\nexecute 2-3x more threat hunts per analyst.\n\n\n[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n\nDesigned for cloud-scale security operations, the add-on provides Splunk\nanalysts with access to all data stored in the Lakehouse. Bidirectional pipelines\nbetween Splunk and Databricks allow agency analysts to integrate directly into\nSplunk visualizations and security workflows.\n\n\n-----\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Predictive Maintenance", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "deb498e8b0e4add641a926f3454ddb53", + "[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n\nDesigned for cloud-scale security operations, the add-on provides Splunk\nanalysts with access to all data stored in the Lakehouse. 
Bidirectional pipelines\nbetween Splunk and Databricks allow agency analysts to integrate directly into\nSplunk visualizations and security workflows.\n\n\n-----\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Predictive Maintenance\n\n##### Overview\n\n\n**Integrating unstructured data**\nEquipment data doesn’t just come in the form of IoT data. Agencies can gather\nrich unstructured signals like audio, visual (e.g., video inspections) and text\n(e.g., maintenance logs). Most legacy data architectures are unable to integrate\nstructured and unstructured data sources.\n\n**Operationalizing machine learning**\nMost agencies lack the advanced analytics tools needed to build models that\ncan predict potential equipment failures. Those that do typically have their\ndata scientists working in a siloed set of tools, resulting in unnecessary data\nreplication and inefficient workflows.\n\n##### Solution overview\n\nThe Databricks Lakehouse is tailor-made for building IoT applications at scale.\nWith Databricks, agencies can easily manage large streaming volumes of small\nfiles, with ACID transaction guarantees and reduced job fails compared to\ntraditional data warehouse architectures. Additionally, the Lakehouse is cloud\nnative and built on Apache Spark, so scaling for petabytes of data is not an issue.\nWith the Lakehouse, agencies can bring together all of their structured and\nunstructured data with a unified set of tooling for data engineering, model building\nand production rollout. With these capabilities, operations teams can quickly\ndetect and act on pending equipment failures before they affect performance.\n\n\nPredictive maintenance is oftentimes associated with the manufacturing sector,\nbut in reality it extends far beyond the factory floor. Consider this for a moment:\nthe U.S. Government operates a fleet of over [640,000 vehicles](https://www.government-fleet.com/301786/federal-vs-state-local-fleets) including public\nbuses, postal delivery trucks, drones, helicopters and jet fighters. Many of these\nvehicles — like multimillion-dollar aircraft — contain sensors that generate\nmassive amounts of data on the use and conditions of various components. And\nit’s not just vehicles. Modern public utilities stream data through connected IoT\ndevices. All of this data can be analyzed to identify the root cause of a failure\nand predict future maintenance, helping to avoid costly repairs and critical\nassets from being out of service.\n\n##### Challenges\n\n**Managing IoT data at scale**\nWith billions of sensors generating information, most data systems are unable to\nhandle the sheer volume of data. 
Before agencies can even start analyzing their\ndata, legacy data warehouse–based tools require preprocessing of data, making\nreal-time analysis impossible.\n\n\n-----\n\n##### How to get started\n\n\n**Solution Accelerator: Predictive Maintenance**\nLearn how to ingest real-time IoT data from field devices, perform complex\ntime series processing in Delta Lake and leverage machine learning to build\npredictive maintenance models.\n\n[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)\n\n[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)\n\n[Part 3: Using ML to predict maintenance.](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3d6176fa88a4867b90655e52485bbd5e", + "-----\n\n##### How to get started\n\n\n**Solution Accelerator: Predictive Maintenance**\nLearn how to ingest real-time IoT data from field devices, perform complex\ntime series processing in Delta Lake and leverage machine learning to build\npredictive maintenance models.\n\n[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)\n\n[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)\n\n[Part 3: Using ML to predict maintenance.](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)\n\n\n[Watch the Demo:](https://vimeo.com/580864758/5a5bc42bb9)\n[Predictive Maintenance on Azure Databricks](https://vimeo.com/580864758/5a5bc42bb9)\n\n##### Customer story\n\n**[LEARN MORE](https://www.tallan.com/blog/client-stories/dc-water/)**\n\n**Protecting the Water Supply for 700,000 Residents**\nUtilizing machine learning for predictive analytics to help stop water main\nbreaks before they occur, potentially saving hundreds of thousands of dollars\nin repairs while reducing service interruption.\n\n\n-----\n\n##### Reference architecture\n\n[Reference architecture diagram: weather sensor readings (semi-structured), wind turbine telematics (semi-structured) and maintenance logs (unstructured) are streamed in real time into the Bronze, Silver and Gold layers of the Databricks Lakehouse Platform, where raw data is appended and merged, streams are joined and analyzed into enriched, granular and aggregated hourly readings, and the output feeds a predictive maintenance model and real-time dashboards for optimizing performance.]\n\n\n-----\n\n###### USE CASE:\n## Fraud Detection\n\n\n##### Overview\n\nAccording to [McKinsey & Company](https://www.mckinsey.com/~/media/McKinsey/Industries/Public%20Sector/Our%20Insights/Cracking%20down%20on%20government%20fraud%20with%20data%20analytics/Cracking-down-on-government-fraud-with-data-analytics-vF.pdf) , more than half of the federal government’s\nmonetary losses to fraud, waste and abuse go undetected and total tens of\nbillions of dollars. 
Financial fraud comes in many forms, from individuals taking\nadvantage of relief programs to complex networks of criminal organizations\nworking together to falsify medical claims and rebate forms. Investigative teams\nhoping to stay ahead of fraudsters need advanced analytics techniques so they\ncan detect anomalous behavior buried in a sea of data.\n\n##### Challenges\n\n**Lack of machine learning**\nA rules-based approach is not enough. Bad actors are getting more and more\nsophisticated in how they take advantage of government programs, necessitating\nan AI-driven approach.\n\n**Unreliable data**\nGetting high-quality, clean data and maintaining a rich feature store is critical\nfor identifying ever-evolving fraud patterns while maintaining a strict record of\nprevious data points.\n\n\n##### Solution overview\n\nThe Databricks Lakehouse enables teams to develop complex ML models with\nhigh governance standards and bridge the gap between data science and\ntechnology to address the challenge of analyzing large volumes of data at scale\n— 40 billion financial transactions a year are made in the United States alone.\nAdditionally, Databricks makes it possible to combine modern AI techniques\nwith the legacy rules-based methods that underpin current approaches to fraud\ndetection all within a common and efficient Spark-based orchestration engine.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "7eda09eb47f349dbcfccf31d852da8e1", + "##### Challenges\n\n**Lack of machine learning**\nA rules-based approach is not enough. Bad actors are getting more and more\nsophisticated in how they take advantage of government programs, necessitating\nan AI-driven approach.\n\n**Unreliable data**\nGetting high-quality, clean data and maintaining a rich feature store is critical\nfor identifying ever-evolving fraud patterns while maintaining a strict record of\nprevious data points.\n\n\n##### Solution overview\n\nThe Databricks Lakehouse enables teams to develop complex ML models with\nhigh governance standards and bridge the gap between data science and\ntechnology to address the challenge of analyzing large volumes of data at scale\n— 40 billion financial transactions a year are made in the United States alone.\nAdditionally, Databricks makes it possible to combine modern AI techniques\nwith the legacy rules-based methods that underpin current approaches to fraud\ndetection all within a common and efficient Spark-based orchestration engine.\n\n##### How to get started\n\n[Solution Accelerator: Fraud Detection](https://databricks.com/blog/2021/01/19/combining-rules-based-and-ai-models-to-combat-financial-fraud.html)\n\nDue to an ever-changing landscape, building a financial fraud detection\nframework often goes beyond just creating a highly accurate machine learning\nmodel. 
Oftentimes it involves a complex-decision science setup that combines\na rules engine with a need for a robust and scalable machine learning platform.\nIn this example, we show how to build a holistic fraud detection solution on\nDatabricks using data from a financial institution.\n\n\n**Analytics at scale**\nTraining complex ML models with hundreds of features on gigabytes of\nstructured, semi-structured and unstructured data can be impossible without a\nhighly scalable and distributed infrastructure.\n\n\n-----\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=Ca1MMNpBSHM)**\n\n**Identifying Financial Fraud at Scale**\nProcesses hundreds of billions of market events\nper day on the Databricks Lakehouse and uses\nthe power of machine learning to identify illicit\nactivity in near real-time.\n\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Money Laundering\n\n##### Overview\n\n\nApproximately [$300 billion](https://home.treasury.gov/system/files/136/2018NMLRA_12-18.pdf) is laundered through the United States each year,\nand with criminal organizations — both at home and abroad — implementing\nincreasingly sophisticated methods for laundering funds, it’s getting harder to\nstop. While the federal government continues to apply pressure on the financial\nsector through heightened regulation, more is needed to combat laundering.\nModern AI techniques such as graph analytics and computer vision can be\nused to process different types of structured (e.g., financial transactions) and\nunstructured (e.g., real estate images) data and identify illicit behavior. This\nallows investigative teams to automate labor-intensive activities like confirming\na residential address or reviewing transaction histories, and instead dig into\npriority threats.\n\n##### Challenges\n\n**Complex data science**\nModern anti-money laundering (AML) practices require multiple ML capabilities\nsuch as entity resolution, computer vision and graph analytics on entity\nmetadata, which is typically not supported by any one data platform.\n\n\n**Time-consuming false positives**\nAny reported suspicious activity must be investigated manually to ensure\naccuracy. Many legacy solutions generate a high number of false positives or fail\nto identify unknown patterns, resulting in wasted effort by investigators.\n\n##### Solution overview\n\nAML solutions face the operational burden of processing billions of transactions\na day. The Databricks Lakehouse Platform combines the low storage cost\nbenefits of cloud data lakes with the robust transaction capabilities of data\nwarehouses, making it the ideal foundation for building AML analytics at massive\nscale. At the core of Databricks is Delta Lake, which can store and combine\nboth unstructured and structured data to build entity relationships; moreover,\nDatabricks Delta Engine provides efficient access using the new Photon compute\nto speed up BI queries on tables spanning billions of transactions. 
On top of\nthese capabilities, ML is a first-class citizen in the Lakehouse, which means\nanalysts and data scientists do not waste time subsampling or moving data to\nshare dashboards and stay one step ahead of bad actors.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "4c06561b8ae2e83b0c3b0d4f8ce53da4", + "**Time-consuming false positives**\nAny reported suspicious activity must be investigated manually to ensure\naccuracy. Many legacy solutions generate a high number of false positives or fail\nto identify unknown patterns, resulting in wasted effort by investigators.\n\n##### Solution overview\n\nAML solutions face the operational burden of processing billions of transactions\na day. The Databricks Lakehouse Platform combines the low storage cost\nbenefits of cloud data lakes with the robust transaction capabilities of data\nwarehouses, making it the ideal foundation for building AML analytics at massive\nscale. At the core of Databricks is Delta Lake, which can store and combine\nboth unstructured and structured data to build entity relationships; moreover,\nDatabricks Delta Engine provides efficient access using the new Photon compute\nto speed up BI queries on tables spanning billions of transactions. On top of\nthese capabilities, ML is a first-class citizen in the Lakehouse, which means\nanalysts and data scientists do not waste time subsampling or moving data to\nshare dashboards and stay one step ahead of bad actors.\n\n\n**Model transparency**\nAlthough AI can be used to address many money laundering use cases, the lack\nof transparency in the development of ML models offers little explainability,\ninhibiting broader adoption.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator: Modern](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n[Anti-Money Laundering Techniques](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n\n\nLakehouse Platform leveraging a series of next-gen machine learning techniques\nincluding NLP, computer vision, entity resolution and graph analytics. This\napproach helps teams better adapt to the reality of modern laundering practices.\n\n\nCurrent anti-money laundering practices bear little resemblance to those of the\nlast decade. In today’s digital world, financial institutions are processing billions\nof transactions daily, increasing the surface area of money laundering. With this\naccelerator, we demonstrate how to build a scalable AML solution on the\n\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Entity Analytics\n\n##### Overview\n\n\n**No machine learning capabilities**\nEntity resolution typically relies on basic rules-based logic to compare records\n(e.g., matching on name and address), but with messy, large volumes of data,\nadvanced analytics is needed to improve accuracy and accelerate efforts.\n\n##### Solution overview\n\nThe Databricks Lakehouse is an ideal platform for building entity analytics at\nscale. With support for a wide range of data formats and a rich and extensible\nset of data transformation and ML capabilities, Databricks enables agencies to\nbring together all of their data in a central location and move beyond simple\nrules-based methods for entity resolution. 
Data teams can easily explore\ndifferent machine learning techniques like natural language processing,\nclassification and graph analytics to automate entity matching. And one-click\nprovisioning and deprovisioning of cloud resources makes it easy for teams to\ncost-effectively allocate the necessary compute resources for any size job so\nthey can uncover findings faster.\n\n\nEntity analytics aims to connect disparate data sources to build a full view of\na person or an organization. This has many applications in the public sector,\nsuch as fraud detection, national security and population health. For example,\nMedicare fraud teams need to understand which prescriptions are filled, claims\nfiled and facilities visited across geographies to uncover suspicious behavior.\nBefore teams can even look for suspicious behavior, they must first determine\nwhich records are associated. In the United States, nearly 50,000 people share\nthe name John Smith (and there are thousands of others with similar names).\nImagine trying to identify the right John Smith for this type of analysis. That’s no\neasy task.\n\n##### Challenges\n\n**Disjointed data**\nManaging complex and brittle ETL pipelines in order to cleanse and join data\nacross siloed systems and data stores.\n\n\n**Compute intensive**\nIdentifying related entities across population-level data sets requires massive\ncompute power that far outstrips legacy on-prem data architectures.\n\n\n-----\n\n##### How to get started", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "fc164d931337a8fb50a377fa7e37f2bb", + "Entity analytics aims to connect disparate data sources to build a full view of\na person or an organization. This has many applications in the public sector,\nsuch as fraud detection, national security and population health. For example,\nMedicare fraud teams need to understand which prescriptions are filled, claims\nfiled and facilities visited across geographies to uncover suspicious behavior.\nBefore teams can even look for suspicious behavior, they must first determine\nwhich records are associated. In the United States, nearly 50,000 people share\nthe name John Smith (and there are thousands of others with similar names).\nImagine trying to identify the right John Smith for this type of analysis. 
That’s no\neasy task.\n\n##### Challenges\n\n**Disjointed data**\nManaging complex and brittle ETL pipelines in order to cleanse and join data\nacross siloed systems and data stores.\n\n\n**Compute intensive**\nIdentifying related entities across population-level data sets requires massive\ncompute power that far outstrips legacy on-prem data architectures.\n\n\n-----\n\n##### How to get started\n\n[Virtual Workshop: Entity Analytics](https://drive.google.com/file/d/1wGGT9Fn5EZF5Rgrabuttt1xdua5csrBa/view?usp=sharing)\n\nLearn from Databricks experts on how entity analytics is being deployed\nin the public sector and watch a demo that shows how to use ML to link\npayments and treatments across millions of records in a public CMS data set.\n\n[Solution Accelerator:](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n[Machine Learning-Based Item Matching](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n\nWhile focused on retail, this accelerator has applications for any organization\nworking on entity matching, especially as it relates to items that might be stored\nacross locations. In this notebook, we demonstrate how to use machine learning\nand the Databricks Lakehouse Platform to resolve differences between product\ndefinitions and descriptions, and determine which items are likely pairs and\nwhich are distinct across disparate data sets.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na21/entity-resolution-using-patient-records-at-cmmi)**\n\nIn this talk, NewWave shares the specifics on CMS’s entity resolution use case,\nthe ML necessary for this data and the unique uses of Databricks in providing\nthis capability.\n\n##### Sample workflow\n\n\n-----\n\n###### USE CASE:\n## Geospatial Analytics\n\n##### Overview\n\n\n**Broad range of analytics capabilities**\nEnterprises require a diverse set of data applications — including SQL-based\nanalytics, real-time monitoring, data science and machine learning — to support\ngeospatial workloads given the diverse nature of the data and use cases.\n\n##### Solution overview\n\nWith Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\nworkloads, as it provides a single source of truth for all types of structured,\nunstructured, streaming and batch data, enabling seamless spatio-temporal\nunification and cross-querying with tabular and raster-based data. Built on\nApache Spark, the Lakehouse easily scales for data sets consisting of billions\nof rows of data with distributed processing in the cloud. To expand on the core\ncapabilities of the Lakehouse, Databricks has introduced the Mosaic library,\nan extension to the Apache Spark framework, built for fast and easy processing\nof large geospatial data sets. 
Popular frameworks such as Apache Sedona or\nGeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\nLakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\nto support all types of geospatial use cases.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "3906955f310fd2514373213350a23fe3", + "##### Solution overview\n\nWith Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\nworkloads, as it provides a single source of truth for all types of structured,\nunstructured, streaming and batch data, enabling seamless spatio-temporal\nunification and cross-querying with tabular and raster-based data. Built on\nApache Spark, the Lakehouse easily scales for data sets consisting of billions\nof rows of data with distributed processing in the cloud. To expand on the core\ncapabilities of the Lakehouse, Databricks has introduced the Mosaic library,\nan extension to the Apache Spark framework, built for fast and easy processing\nof large geospatial data sets. Popular frameworks such as Apache Sedona or\nGeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\nLakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\nto support all types of geospatial use cases.\n\n\nEvery day billions of handheld and IoT devices, along with thousands of\nairborne and satellite remote sensing platforms, generate hundreds of exabytes\nof location-aware data. This boom of geospatial big data combined with\nadvancements in machine learning is enabling government agencies to develop\nnew capabilities. The potential use cases for geospatial analytics and AI touch\nevery part of the government, including disaster recovery (e.g., flood/earthquake\nmapping), defense and intel (e.g., detecting threats using drone footage),\ninfrastructure (e.g., public transportation planning), civilian safety (e.g., crime\nprediction), public health (e.g., disease spread tracking), and much more. Every\nagency at the state and federal level needs to consider how they can tap into\ngeospatial data.\n\n##### Challenges\n\n**Massive volumes of geospatial data**\nWith the proliferation of low-cost sensor arrays, GPS technologies and highresolution imaging organizations are collecting tens of TBs of geospatial data\ndaily, outpacing their ability to store and process this data at scale.\n\n\n**Compute-intensive spatial workloads**\nGeospatial data is complex in structure, with various formats not well suited for\nlegacy data warehouses, as well as being compute intensive, with geospatialspecific transformations and queries requiring hours and hours of compute.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n[Mosaic for Geospatial Analytics](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n\nBuild a Lakehouse to support all of your geospatial analytics and AI use cases\nwith the Mosaic library. 
Mosaic provides a number of capabilities including easy\nconversion between common spatial data encodings, constructors to easily\ngenerate new geometries from Spark native data types, many of the OGC SQL\nstandard ST_ functions implemented as Spark Expressions for transforming,\naggregating and joining spatial data sets, and optimizations for performing pointin-polygon joins using an approach we codeveloped with Ordnance Survey —\nall provided with the flexibility of a Scala, SQL or Python API.\n\n[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n\nLearn how to build powerful geospatial insights and visualizations with a\nLakehouse for all your geospatial data processing, analytics and AI.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "968928e9f1109e7b2205726eff3e1d66", + "[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n\nLearn how to build powerful geospatial insights and visualizations with a\nLakehouse for all your geospatial data processing, analytics and AI.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**\n\n**Analyzing Flight Data to Improve Aviation**\nTo help airlines better serve their millions of passengers, USDOT built a\nmodern analytics architecture on Databricks that incorporates data such as\nweather, flight, aeronautical and surveillance information. With this new\nplatform, they reduced compute costs by 90% and can now power use cases\nsuch as predicting air cargo traffic patterns, flight delays and the financial\nimpact of flight cancellations.\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=LP198QMdDbY&t=1070s)**\n\n**Customer Story: Flood Prediction With Machine Learning**\nIn an effort to improve the safety of civil projects, Stantec built a machine\nlearning model on Databricks leveraging large volumes of weather and geological\ndata — oftentimes consisting of trillions of data points — to predict the impact\nof flash floods on various regions and adjust civil planning accordingly.\n\n\n-----\n\n##### Reference architecture\n\nMosaic Kepler Magics\nGeometry Display Functions\nfor Map Display\n\nESRI Java API for\nGeometry Operations\n\n\nBuilt-In Indexing\nSystem Support\n\n\nJTS Java API for\nGeometry Operations\n\n\n-----\n\n###### USE CASE:\n## Public Health Management\n\n##### Overview\n\n\nIn their lifetime, every human is expected to generate a million gigabytes of\nhealth data spanning electronic health records, medical images, claims, wearable\ndata, genomics and more. 
This data is critical to understanding the health of\nthe individual, but when aggregated and analyzed across large populations,\ngovernment agencies can glean important insights like disease trends, the\nimpact of various treatment guidelines and the effectiveness of resources. By\nadding in [Social Determinants of Health (SDOH)](https://databricks.com/blog/2022/04/18/increasing-healthcare-equity-with-data.html) data — such as geographical\nlocation, income level, education, housing — agencies can better identify\nunderserved communities and the critical factors that contribute to positive\nhealth outcomes.\n\n##### Challenges\n\n**Rapidly growing health data**\nHealthcare data is growing exponentially. Unfortunately, legacy on-premises data\narchitectures are complex to manage and too costly to scale for populationscale analytics.\n\n\n**Complexities of ML in healthcare**\nThe legacy analytics platforms that underpin healthcare lack the robust data\nscience capabilities needed for predictive health use cases like disease risk\nscoring. There’s also the challenge of managing reproducibility, which is critical\nwhen building ML models that can impact patient outcomes.\n\n##### Solution overview\n\nThe Databricks Lakehouse enables public health agencies to bring together all\ntheir research and patient data in a HIPAA-certified environment and marry it\nwith powerful analytics and AI capabilities to deliver real-time and predictive\ninsights at population scale. The Lakehouse eliminates the need for legacy\ndata architectures, which have historically inhibited innovation in patient care\nby creating data silos and making advanced analytics difficult. Databricks led\nopen source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\nthat make it easy to ingest and prepare healthcare-specific data modalities for\ndownstream analytics.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "88e28b7c1cb02366091a1560b26dfc60", + "##### Solution overview\n\nThe Databricks Lakehouse enables public health agencies to bring together all\ntheir research and patient data in a HIPAA-certified environment and marry it\nwith powerful analytics and AI capabilities to deliver real-time and predictive\ninsights at population scale. The Lakehouse eliminates the need for legacy\ndata architectures, which have historically inhibited innovation in patient care\nby creating data silos and making advanced analytics difficult. 
Databricks led\nopen source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\nthat make it easy to ingest and prepare healthcare-specific data modalities for\ndownstream analytics.\n\n\n**Fragmented patient data**\nIt is widely accepted that over 80% of medical data is unstructured, yet most\norganizations still focus their attention on data warehouses designed to only\nsupport structured data and SQL-based analytics.\n\n\n-----\n\n##### How to get started\n\n\n[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n[NLP for Healthcare](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n\nOur joint solutions with John Snow Labs bring together the power of Spark NLP\nfor Healthcare with the collaborative analytics and AI capabilities of Databricks.\nInformatics teams can ingest raw unstructured medical text files into Databricks,\nextract meaningful insights using natural language processing techniques,\nand make the data available for downstream analytics. We have specific NLP\nsolutions for [extracting oncology insights](https://databricks.com/solutions/accelerators/nlp-oncology) from lab reports, automating the deidentification of PHI and [identifying adverse drug events](https://databricks.com/blog/2022/01/17/improving-drug-safety-with-adverse-event-detection-using-nlp.html).\n\n[Solution Accelerator:](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n[Disease Risk Prediction](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n\nOne of the most powerful tools for identifying patients at risk for a chronic\ncondition is the analysis of real world data (RWD). This Solution Accelerator\nnotebook provides a template for building a machine learning model that\nassesses the risk of a patient for a given condition within a given window of time\nbased on a patient’s encounter history and demographics information.\n\n\n[Demo: Real-Time](https://www.youtube.com/watch?v=_ltDF2obiSc)\n[COVID-19 Contact Tracing](https://www.youtube.com/watch?v=_ltDF2obiSc)\n\nDatabricks COVID-19 surveillance solution takes a data-driven approach to\nadaptive response, applying predictive analytics to COVID-19 data sets to\nhelp drive more effective shelter-in-place policies.\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n\n**From Vaccine Management to ICU Planning**\nDuring the pandemic, the Chesapeake Regional Information System for our\nPatients implemented a modern data architecture on Databricks to address\ncritical reporting needs. 
This allowed them to analyze 400 billion data points\n\nfor innovative use cases like real-time disease spread tracking, vaccine\ndistribution and prioritizing vulnerable populations.\n\n\n-----\n\n## Conclusion\n\nToday, data is at the core of how government agencies operate and AI is at the", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "f9e2a23ff54684fbfcd88171a4a56914", + "Databricks COVID-19 surveillance solution takes a data-driven approach to\nadaptive response, applying predictive analytics to COVID-19 data sets to\nhelp drive more effective shelter-in-place policies.\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n\n**From Vaccine Management to ICU Planning**\nDuring the pandemic, the Chesapeake Regional Information System for our\nPatients implemented a modern data architecture on Databricks to address\ncritical reporting needs. This allowed them to analyze 400 billion data points\n\nfor innovative use cases like real-time disease spread tracking, vaccine\ndistribution and prioritizing vulnerable populations.\n\n\n-----\n\n## Conclusion\n\nToday, data is at the core of how government agencies operate and AI is at the\n\nforefront of driving innovation into the future. The Databricks Lakehouse for\n\nPublic Sector enables government agencies at the federal, state and local level\n\nto harness the full power of data and analytics to solve strategic challenges and\n\nmake smarter decisions that improve the safety and quality of life of all citizens.\n\nGet started with a free trial of Databricks Lakehouse and start building better\n\ndata applications today.\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\n###### Contact us for a personalized demo databricks.com/contact\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\nunify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\nmission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "a61e36c3eaa7a642dc30c708abe4dc6c", + "**EBOOK**\n\n# Four Forces Driving Intelligent Manufacturing\n\n### A data-driven business built on Lakehouse for Manufacturing\n\n\n-----\n\n## Contents\n\nIntroduction .................................................................................................................................................................................................................................................. 
**03**\n\nThe four driving forces of change ..................................................................................................................................................................................................... **04**\n\nDigital transformation is not a destination, it’s a journey .......................................................................................................................................................... **05**\n\nManufacturing – use case maturity matrix ...................................................................................................................................................................................... **06**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "a957319c3c96e04dfccb0e1e1a1f4ccd", + "Manufacturing – use case maturity matrix ...................................................................................................................................................................................... **06**\n\nThe foundations for data-driven manufacturing ............................................................................................................................................................................ **07**\n\nDRIVING FORCE NO. 1\nThe shift from manufacturing to Intelligent Manufacturing ...................................................................................................................................................... **08**\n\nDRIVING FORCE NO. 2\nTransparency, visibility, data: optimizing the supply chain ........................................................................................................................................................ **10**\n\nDRIVING FORCE NO. 3\nFuture opportunities for manufacturing business models ......................................................................................................................................................... **13**", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "46fb1c1ddbad577c69f2632635003ba9", + "DRIVING FORCE NO. 3\nFuture opportunities for manufacturing business models ......................................................................................................................................................... **13**\n\nDRIVING FORCE NO. 4\nThe focus on sustainability ....................................................................................................................................................................................................................... **15**\n\nLeveraging the Databricks Lakehouse for Manufacturing ........................................................................................................................................................... **17**\n\nThe building blocks of Lakehouse for Manufacturing .................................................................................................................................................................... **18**\n\nManufacturers’ end goals .......................................................................................................................................................................................................................... 
**19**\n\n2 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Introduction", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "fe836d02d98b3b6c8a001bb6836708c8", + "Manufacturers’ end goals .......................................................................................................................................................................................................................... **19**\n\n2 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Introduction\n\n##### Manufacturing has always been an evolutionary business, grounded upon infrastructure, business processes, and manufacturing operations built over decades in a continuum of successes, insights and learnings. The methods and processes used to approach the development, release and optimization of products and capital spend are the foundation of the industry’s evolution.\n\n\nBut today it’s data- and AI-driven businesses that\nare being rewarded because they’re using process\nand product optimization not previously possible,\nable to forecast and sense supply chain demand,\nand, crucially, introduce new forms of revenue\nbased upon service rather than product.\n\nThe drivers for this evolution will be the emergence\nof what we refer to as “Intelligent Manufacturing”\nthat has been enabled by the rise of computational\npower at the Edge and in the Cloud. As well as\nnew levels of connectivity speed enabled by 5G\nand fiber optic, combined with increased use of\nadvanced analytics and machine learning (ML).\n\n\nYet, even with all the technological advances\nenabling these new data-driven businesses,\nchallenges exist.\n\nMcKinsey’s recent research with the World\nEconomic Forum estimates the value creation\npotential of manufacturers and suppliers that\nimplement Industry 4.0 in their operations\nat USD$37 trillion by 2025. 
Truly a huge number.\nBut the challenge that most companies still\nstruggle with is the move from piloting point\nsolutions to delivering sustainable impact at scale.\n[Only 30% of companies are capturing value from](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n[Industry 4.0 solutions in manufacturing today.](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n\n\n##### 80% of manufacturers\n[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "b0de1f22c24ec00611cdfbd12e2b0ef5", + "##### 80% of manufacturers\n[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n\n##### 57% of manufacturing leaders feel their organization\n[lacks skilled workers to support](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n[their smart manufacturing plans](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n\n[A lack of supply chain](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n[integration could stall smart](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n[factory initiatives for](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf) **[3 in 5](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)**\n##### manufacturers by 2025\n\n\n3 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The four driving forces of change\n\n###### Over the last two years, demand imbalances and supply chain swings have added a sense of urgency for manufacturers to digitally transform. But in truth, the main challenges facing the industry have existed, and will continue to exist, outside these recent exceptional circumstances. Manufacturers will always strive for greater levels of visibility across their supply chain, always seek to optimize and streamline operations to improve margins. 
In the continuing quest for improved efficiency, productivity, adaptability and resilience, manufacturers are commonly tackling these major challenges:\n\n\n###### Skills and production gaps\n\nThe rise of the digital economy is demanding a new set of skills.\nFor today’s Intelligent Manufacturing organizations, there’s a fundamental\nneed for computer and programming skills for automation, along\nwith critical-thinking abilities. Also important is the ability to use\ncollaboration systems and new advanced assistance tools, such as\nautomation, virtual reality (VR) and augmented reality (AR). The deficit\nof workers with these skills is of critical concern to manufacturers.\n\nIn addition, the industry dynamics are pushing companies to increase\nand refine both partner/supplier relationships, optimize internal\noperations and build robust supply chains that do not rely upon\nsafety stock to weather supply chain swings. Historical focus on\noperational use cases is now extending to building agile supply chains.\n\n###### Supply chain volatility\n\nIf the events of the last few years proved anything, it’s that supply\nchains need to be robust and resilient. Historically, supply chain volatility\nwas smoothed by holding “safety stock,” which added costs without\nfinancial value. Then the pendulum swung to “just in time delivery,”\nwhere efficient use of working capital disregarded demand risks.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "a2db7fe8fbdc33c18355e02c7ec60dc7", + "The rise of the digital economy is demanding a new set of skills.\nFor today’s Intelligent Manufacturing organizations, there’s a fundamental\nneed for computer and programming skills for automation, along\nwith critical-thinking abilities. Also important is the ability to use\ncollaboration systems and new advanced assistance tools, such as\nautomation, virtual reality (VR) and augmented reality (AR). The deficit\nof workers with these skills is of critical concern to manufacturers.\n\nIn addition, the industry dynamics are pushing companies to increase\nand refine both partner/supplier relationships, optimize internal\noperations and build robust supply chains that do not rely upon\nsafety stock to weather supply chain swings. Historical focus on\noperational use cases is now extending to building agile supply chains.\n\n###### Supply chain volatility\n\nIf the events of the last few years proved anything, it’s that supply\nchains need to be robust and resilient. Historically, supply chain volatility\nwas smoothed by holding “safety stock,” which added costs without\nfinancial value. Then the pendulum swung to “just in time delivery,”\nwhere efficient use of working capital disregarded demand risks.\n\nRecent experiences have highlighted that demand sensing is needed\nin addition to safety stock for high-risk parts or raw materials. The ability\nto monitor, predict and respond to external factors – including natural\ndisasters, shipping and warehouse constraints, and geopolitical disruption\n– is vital to reduce risk and promote agility. 
Many of these external\ndata sources leverage unstructured data (news, social posts, videos\nand images), and being able to manage both structured and unstructured\ndata available to measure and analyze this volatility is key.\n\n\n###### Need for new and additional sources of revenue\n\nManufacturers’ growth historically has been limited\nto new product introduction rate or expansion into\nnew geographies. The emergence of “equipment\nas-a-service” is changing that dynamic. It’s pivoting\nthe business from product-centric growth to one\nleveraging added services, which are not slaves to the\nproduct development introduction cycle and can be highly\ndifferentiated depending on the market segment and types\nof products. Real-time data plays an outsize role, as now\nbusinesses are in unison with use cases such as predictive\nmaintenance, stock replenishment and worker safety.\n\n###### An increased focus on sustainability\n\nManufacturers have always focused on efficiency,\nbut they’re increasingly seeing the value chain as circular.\nIt’s no longer enough to consider an organization’s own\ncarbon footprint – it needs to also include indirect\nemissions and other environmental impacts from the\nactivities it doesn’t own or control. This requires a\n360-degree view of sustainability, which includes both\ninternal and external factors in measuring compliance\nwith ESG programs.\n\n**This eBook will look closer at these four key challenges**\n**and their associated use cases, as well as some**\n**of the most effective technologies and solutions**\n**that can be implemented to respond to them.**\n\n\n4 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Digital transformation is not a destination, it’s a journey\n\n##### Digitalization is reshaping many areas of manufacturing and logistics, product design, production and quality of goods as well as sustainability and energy output.\n\nThis transition from manual operations to automated\nsolutions is enhancing and optimizing operational\nefficiency and decision-making, while also making\nsupply chains more frictionless and reliable, as well\nas enabling organizations to become more responsive\nand adaptable to market and customer needs.\n\nThis disruption has been driven by a rush of new\ntechnologies including artificial intelligence, machine\nlearning, advanced analytics, digital twins, Internet\nof Things (IoT), and automation. These, in turn, have\nbeen enabled by the greater network capabilities of 5G.\nIndustry 4.0 is well underway. Intelligent Manufacturing\nisn’t the future, it’s what competitive organizations\nhave established today.\n\n\n## The data and AI maturity curve\n### From descriptive to prescriptive\n\nPrescriptive\nAnalytics\n\nPredictive\nModeling\n\n**How** can we make it happen?\n\nData\nExploration\n\n\n**What** will happen?\n\n**Why** did it happen?\n\n\nAd Hoc\nQueries\n\n\nReports\n\n\nCleaned\nData\n\n**What** happened?\n\nAnalytics Maturity\n\n\nRaw\nData\n\n\n5 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Manufacturing – use case maturity matrix\n\n\nNo\n\n1", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "fb676139b6f1a49181c1e792ad607ca7", + "This disruption has been driven by a rush of new\ntechnologies including artificial intelligence, machine\nlearning, advanced analytics, digital twins, Internet\nof Things (IoT), and automation. 
These, in turn, have\nbeen enabled by the greater network capabilities of 5G.\nIndustry 4.0 is well underway. Intelligent Manufacturing\nisn’t the future, it’s what competitive organizations\nhave established today.\n\n\n## The data and AI maturity curve\n### From descriptive to prescriptive\n\nPrescriptive\nAnalytics\n\nPredictive\nModeling\n\n**How** can we make it happen?\n\nData\nExploration\n\n\n**What** will happen?\n\n**Why** did it happen?\n\n\nAd Hoc\nQueries\n\n\nReports\n\n\nCleaned\nData\n\n**What** happened?\n\nAnalytics Maturity\n\n\nRaw\nData\n\n\n5 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Manufacturing – use case maturity matrix\n\n\nNo\n\n1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n7\n\n8\n\n9\n\n10\n\n11\n\n12\n\n13\n\n14\n\n15\n\n16\n\n17\n\n18\n\n19\n\n20\n\n21\n\n22\n\n23\n\n\nUse case name\n\nEDW offload\n\nProduct 360\n\nVoice of customer insights\n\nTesting & simulation optimization\n\nSupplier 360\n\nSpend analytics\n\nSourcing event optimization\n\nProcess & quality monitoring\n\nProcess 360\n\nEquipment predictive maintenance\n\nQuality & yield optimization\n\nSupply chain 360\n\nDemand analytics\n\nInventory visibility & tracking\n\nInventory optimization\n\nLogistics route optimization\n\nCustomer 360\n\nMarketing & sales personalization\n\nRecommendation engine\n\nAsset/Vehicle 360\n\nConnected asset & value-added services\n\nQuality event detection & traceability\n\nAsset predictive maintenance\n\n\nPeer Competitive Scale\n\nStandard among peer group\n\nCommon among peer group\n\nStrategic among peer group\n\n\nDesign\n\n\nPurchasing\n\n**11**\n\n**10**\n\n**13**\n\n**12**\n\n**17**\n\n\nNew innovations\n\nManufacturing\n\nSupply Chain\n\n\nThat is not to say that the digital transformation\njourney is simple. Replacing legacy systems, breaking\ndown data and organizational silos, bridging the gap\nbetween operational technology (OT) and informational\ntechnology (IT), reskilling workforces, and much more\nrequires a clear and determined digitalization strategy,\nand to reach new levels of IT and data maturity.\n\n\n**16**\n\n\nMuch of the aforementioned transformation requires\na foundation of effective data management and\narchitecture to be in place. 
Without this ability to\ncontrol the vast amounts of structured data (highly\norganized and easily decipherable) and unstructured\ndata (qualitative, no predefined data model),\nmanufacturers cannot generate actionable insights\nfrom their data, derive value from machine learning,\nmonitor and analyze supply chains, or coordinate\ndecisions across the business.\n\n\n**15**\n\n\n**14**\n\n\nMarketing & Sales\n\nService\n\n\n**19**\n\n\n**18**\n\n\n**23**\n\n\n**22**\n**21**\n**20**\n\n\nAwareness\n\n\nExploration Optimization Transformation\n\nMaturity Stages\n\n\n6 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The foundations for data-driven manufacturing\n\n###### Cloud-native platforms\n\nImprove data management, enhance data analytics\nand expand the use of enterprise data, including streaming\nstructured and unstructured data\n\n###### Technology-enabled collaboration\n\nDemocratize analytics and ML capabilities – ensure the right\nusers have access to the right data driving business value", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "ba3d3e25b1b1a711acb9cb695f985606", + "**15**\n\n\n**14**\n\n\nMarketing & Sales\n\nService\n\n\n**19**\n\n\n**18**\n\n\n**23**\n\n\n**22**\n**21**\n**20**\n\n\nAwareness\n\n\nExploration Optimization Transformation\n\nMaturity Stages\n\n\n6 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The foundations for data-driven manufacturing\n\n###### Cloud-native platforms\n\nImprove data management, enhance data analytics\nand expand the use of enterprise data, including streaming\nstructured and unstructured data\n\n###### Technology-enabled collaboration\n\nDemocratize analytics and ML capabilities – ensure the right\nusers have access to the right data driving business value\n\n###### The ability to scale machine learning use cases\n\nA central place to store and discover ML models and enabling\ngreater collaboration between ML, data and business users\n\n\n##### 95% agree that\n[digital transformation](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[in manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[is essential to their](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[company’s future success](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n\n\n[Global spending on](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n[digital transformation](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n[is forecast to reach](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n##### USD$2.8 trillion by 2025\n\n\n##### 85% have accelerated\n[their digital transformation](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n[strategies since 2020](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n\n\n###### Open standards and open data architectures\n\nLeverage open source standards and open data formats\nto accelerate innovation and enable the integration\nof best-of-breed, third-party tools and 
services\n\n\n7 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 1\n\n## The shift from manufacturing to Intelligent Manufacturing\n\n##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion . The immediate response would be to point the finger at the pandemic. Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n\n\nYet the reasons for the lack of manufacturing\ntalent today are manifold, and COVID-19 has only\ncontributed to an existing problem. For instance,\nmany highly experienced baby boomers are\nretiring from the workforce, leaving fewer people\nwith the in-depth knowledge of custom equipment\nand machines. Meanwhile, younger generations\nhave a poor perception of what manufacturing jobs\nare like and are reluctant to step into the industry.\nMeaning not only a problem with retaining skills,\nbut also attracting them.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "657a4ccc4a91e02d17152105d50b375e", + "7 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 1\n\n## The shift from manufacturing to Intelligent Manufacturing\n\n##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion . The immediate response would be to point the finger at the pandemic. Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n\n\nYet the reasons for the lack of manufacturing\ntalent today are manifold, and COVID-19 has only\ncontributed to an existing problem. For instance,\nmany highly experienced baby boomers are\nretiring from the workforce, leaving fewer people\nwith the in-depth knowledge of custom equipment\nand machines. 
Meanwhile, younger generations\nhave a poor perception of what manufacturing jobs\nare like and are reluctant to step into the industry.\nMeaning not only a problem with retaining skills,\nbut also attracting them.\n\nAnd, of course, there is a growing gap between\nthe current capabilities of industrial workers and\nthe skill sets needed for today’s data-driven,\nsensor-filled, 5G-enabled Intelligent Manufacturing.\n\n\nWith the drive to optimize operations, stabilize\nsupply chains and reinvent business models\nthrough equipment-as-a-service, the skill sets\nhave radically changed from even a decade ago.\n\nIntelligent Manufacturing’s use cases are placing\na high demand on robotics programmers and\ntechnicians, cybersecurity experts, digital twin\narchitects, supply network analysts, and people\nwho can leverage AI and machine learning\nalgorithms because deployment of these common\nuse cases is producing multiples of returns for\nthose embracing Intelligent Manufacturing.\n\n\n8 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Those manufacturers with a strategy for upskilling existing talent, while also changing the perception of the incoming workforce, need to take advantage of the following use cases:\n\n\n##### 44% report difficulty\n[hiring manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[talent with the required](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[digital expertise](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n\n##### 83% of manufacturing workers are interested\n[in learning new digital skills](https://www.mendix.com/press/welcome-news-to-jumpstart-the-post-pandemic-economy-mendix-survey-shows-78-of-u-s-manufacturing-workers-want-to-help-with-digital-transformation/)\n\n##### 56% of Gen Z say\n[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[changed their perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[of manufacturing. 77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n\n### Proof through customer success\n\n##### Watch our case study\n\n\n###### Digital twins", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "b93c12666e373499f2aed5bfd1bbad8e", + "##### 56% of Gen Z say\n[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[changed their perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[of manufacturing. 
77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n\n### Proof through customer success\n\n##### Watch our case study\n\n\n###### Digital twins\n\nIngesting information from sensors and other data sources,\nthese virtual replicas of physical assets create models\nto which a layer of visualization can be applied. This enables\nusers to predict failures, assess performance and reveal\nopportunities for optimization. Digital twins unlock the ability\nfor manufacturers to monitor and manage production remotely,\nas well as explore “what-if” scenarios.\n\n###### Process and quality optimization\n\nProcess and quality optimization generally covers the\noptimization of equipment, operating procedures, and control\nloops. It requires access to accurate, up-to-date data about\nconditions, collected through IoT devices to monitor every\naspect. The introduction of deep learning architectures is\nenabling manufacturing machinery to identify visual clues\nthat are indicative of quality issues in manufactured goods,\nwhile digital twins can be used to spot inefficiencies without\nthe need to pause production.\n\n###### Throughput optimization\n\nIncreasing throughput is critical for meeting delivery schedules,\nand manufacturers are always looking for ways to identify\nand eliminate bottlenecks, reduce inventory and increase\nthe utilization of assets. Throughput optimization makes\nuse of data-driven algorithms to identify, rank and resolve\nlabor, equipment or inventory bottlenecks.\n\n\n###### Equipment predictive maintenance\n\nRather than wait for a piece of equipment to fail or\nstick to a fixed schedule, predictive maintenance adopts\na predictive approach to equipment maintenance.\nBy monitoring real-time data collected from hundreds\nof IoT sensors, machine learning techniques can detect\nanomalies in operations and possible defects in equipment\nand processes. Predictive maintenance correlates data across\nmany more dimensions than traditional inspection techniques,\nto anticipate failures and prevent costly breakdowns.\n\n###### Quality and yield optimization (with computer vision)\n\nQuality assurance focuses on the use of data analytics,\nAI and machine learning to identify and prevent defects\nduring the manufacturing process. [This type of edge AI](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n[is an approach that can increase productivity by 50%](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n[and detection rates by up to 90%.](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process) Making use of image\nrecognition and machine learning, computer vision\ncan automate visual inspections, detecting faults\nand imperfections faster and more cost effectively\nthan manual approaches.\n\n\n9 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 2\n\n## Transparency, visibility, data: optimizing the supply chain\n\n##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. 
Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n\n\nSuch resiliency requires a combination\nof technologies and solutions. For example,\ndecision support tools with predictive capabilities\n– to monitor the supply chain and analyze\nwhat-if scenarios. Demand sensing and forecasting\nin combination with enterprise critical systems\n(ERP) needs to combine data from a wide variety\nof sources.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "1e002cdf6f6032ddc2663352f7af1492", + "9 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 2\n\n## Transparency, visibility, data: optimizing the supply chain\n\n##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n\n\nSuch resiliency requires a combination\nof technologies and solutions. For example,\ndecision support tools with predictive capabilities\n– to monitor the supply chain and analyze\nwhat-if scenarios. Demand sensing and forecasting\nin combination with enterprise critical systems\n(ERP) needs to combine data from a wide variety\nof sources.\n\n10 Four Forces Driving Intelligent Manufacturing\n\n\nWorking together, combining millions of data points\nfrom across organizations’ operations along with\nother external sources, these technologies can\nbe used to optimize supply chains, reduce costs\nand improve customer service and loyalty.\nHowever, achieving this – embracing the latest\nin AI, machine learning and predictive analytics –\nmeans being able to manage and maintain\na flow of accurate, relevant data and to be able\nto translate this data into actionable insights.\n\n\n-----\n\n#### Successful supply chain optimization depends on up-to-the-minute, end-to-end visibility that can be applied across all stages of the supply chain, from design to planning to execution. This will incorporate a range of solutions that can include:\n\n\n###### Demand, inventory, logistics\n\n\n###### Purchasing\n\n**Spend analytics:** Most obviously, transparency and insight into where\ncash is spent is vital for identifying opportunities to reduce external\nspending across supply markets, suppliers and locations. However, spend\nanalytics are also hugely important to supply chain agility and resilience.\nThis requires a single source of data truth for finance and procurement\ndepartments. 
For example, integrating purchase order, invoice,\naccounts payable, and general-ledger account data to create a level of\ntransparency, visibility and consistency to inform supplier discussions\nand deploy strategies to manage cash better during times\nof disruption.\n\n###### Cross supply chain collaboration\n\n**Supply chain 360:** With real-time insights and aggregated supply\nchain data in a single business intelligence dashboard, manufacturers\nare empowered with greater levels of visibility, transparency\nand insights for more informed decision-making. This dashboard\ncan be used to identify risks and take corrective steps,\nassess suppliers, control costs and more.\n\n\n**Demand analytics:** By collecting and analyzing millions –\nif not billions – of data points about market and customer\nbehavior and product performance, manufacturers can\nuse this understanding to improve operations and support\nstrategic decisions that affect the demand of products\nand services. [Around 80% say that using this form of data](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[analysis has improved decision-making, while 26% say](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[having this level of know-how to predict, shape and meet](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[demands has increased their profits.](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n\n**Inventory visibility and tracking:**\nInventory visibility is the ability to view and track\ninventory in real time, with insights into SKU stock levels\nand which warehouse or fulfillment center it is stored at.\nWith complete oversight of inventory across multiple\nchannels, this helps improve supply chain efficiency,\ndemand forecasting and order accuracy, while ultimately\nenhancing the customer experience.\n\n\n**Inventory optimization:** The practice of having the right\namount of available inventory to meet demand, both in the\npresent and the future, enables manufacturers to address\ndemand expectations, and reduce the costs of common\ninventory issues. Inventory optimization incorporates\ndata for demand forecasting, inventory strategy and\nstock replenishment. With the addition of AI reinforced\nlearning models, this can help improve demand prediction,\nrecommend stock levels, and automatically order\nraw materials to fulfill orders, while also detecting\nand responding to shifts in demand.", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "f5fd438387cc6cba8eca4f034cde5db9", + "**Inventory visibility and tracking:**\nInventory visibility is the ability to view and track\ninventory in real time, with insights into SKU stock levels\nand which warehouse or fulfillment center it is stored at.\nWith complete oversight of inventory across multiple\nchannels, this helps improve supply chain efficiency,\ndemand forecasting and order accuracy, while ultimately\nenhancing the customer experience.\n\n\n**Inventory optimization:** The practice of having the right\namount of available inventory to meet demand, both in the\npresent and the future, enables manufacturers to address\ndemand expectations, and reduce the costs of common\ninventory issues. Inventory optimization incorporates\ndata for demand forecasting, inventory strategy and\nstock replenishment. 
With the addition of AI reinforced\nlearning models, this can help improve demand prediction,\nrecommend stock levels, and automatically order\nraw materials to fulfill orders, while also detecting\nand responding to shifts in demand.\n\n**Logistics route optimization:** Using AI, route optimization\ncan help manufacturers go beyond normal route planning\nand include parameters to further drive logistics efficiency.\nWhat-if scenarios present route options that help cut\ntransportation costs, boost productivity and execute\non-time deliveries.\n\n\n**Supply chain network design:** By building and modeling the supply\nchain, it enables manufacturers to understand the costs and time\nto bring goods and services to market. Supply chain network design\nhelps to evaluate delivery at the lowest possible cost, optimal sources\nand inventory deployment, as well as define distribution strategies.\n\n11 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n[Successfully implementing AI-enabled supply](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n[chain management has enabled early adopters to](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n##### improve logistics costs by 15%, inventory levels by 35%, and service levels by 65%\n\n Only 6% of companies believe\n[they’ve achieved full supply chain visibility](https://www.supplychaindive.com/news/supply-chain-visibility-failure-survey-geodis/517751/\r)\n\n##### 57% believe that supply chain management \n[gives them a competitive edge that enables them](https://financesonline.com/supply-chain-statistics/\r)\n[to further develop their business](https://financesonline.com/supply-chain-statistics/\r)\n\n### Supply chain optimization case study\n\n##### Watch our case study\n\n12 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 3\n\n## Future opportunities for manufacturing business models\n\n##### When looking at the rapid evolution and growth of e-commerce, manufacturers have some catching up to do. Particularly when it comes to embracing new and customer-centric business models. For example, when shifting from a product to a service mindset, the product lifecycle becomes more holistic and the client relationship is maintained beyond the point of purchase.\n\n\nThese new opportunities are forming part\nof a longer-term industry shift from the sale\nof goods (CapEx) to recurring revenue streams,\nsuch as through Equipment-as-a-Service (EaaS)\nmodels. While this approach is not new to many\n(Rolls-Royce’s “Power-by-the-Hour” engine\nsubscription model has been around since 1962),\ncustomer demand, advances in industrial IoT\ntechnology, and a continuing decline in\nsales and margins have seen EaaS emerge\nas an imperative for manufacturers.\n\n\nOpening up some of these new revenue streams,\nof course, demands operational flexibility, but more\nimportantly, digital maturity. This means cloud\ntechnologies that allow employees new levels\nof access to data, the ability to work anywhere,\nand adapt rapidly to new needs. 
The introduction\nof a microservices architecture, to allow the agile\ndevelopment and deployment of new IT services.\nAnd the democratization of data, so the entire\norganization and its ecosystem of partners\nand suppliers have access to information\nabout market demand, operations, production,\nlogistics and transportation.\n\n\n13 Four Forces Driving Intelligent Manufacturing\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "e39b4cfca32bed94bd77fdf60caaae62", + "These new opportunities are forming part\nof a longer-term industry shift from the sale\nof goods (CapEx) to recurring revenue streams,\nsuch as through Equipment-as-a-Service (EaaS)\nmodels. While this approach is not new to many\n(Rolls-Royce’s “Power-by-the-Hour” engine\nsubscription model has been around since 1962),\ncustomer demand, advances in industrial IoT\ntechnology, and a continuing decline in\nsales and margins have seen EaaS emerge\nas an imperative for manufacturers.\n\n\nOpening up some of these new revenue streams,\nof course, demands operational flexibility, but more\nimportantly, digital maturity. This means cloud\ntechnologies that allow employees new levels\nof access to data, the ability to work anywhere,\nand adapt rapidly to new needs. The introduction\nof a microservices architecture, to allow the agile\ndevelopment and deployment of new IT services.\nAnd the democratization of data, so the entire\norganization and its ecosystem of partners\nand suppliers have access to information\nabout market demand, operations, production,\nlogistics and transportation.\n\n\n13 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n##### By 2023, 20% of industrial equipment manufacturers will\n[support EaaS with remote](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n[Industrial IoT capabilities](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n\n##### In 2025, the global EaaS market is estimated\n[to grow to $131B compared](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n[to $22B in 2019](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n\n##### In the U.S., 34% said\n[pay-per-use models represent](https://relayr.io/pr-forsa-survey/)\n[a big or a very big competitive](https://relayr.io/pr-forsa-survey/)\n[advantage, while 29% consider](https://relayr.io/pr-forsa-survey/)\n[it a slight advantage](https://relayr.io/pr-forsa-survey/)\n\n### Equipment as a service case study\n\n##### Read our case study\n\n\n### This level of visibility and collaboration is not only beneficial to lower maintenance costs, capital expenditure and human capital management, but also in empowering all stakeholders to make smarter and more informed decisions.\n\n\n###### Connected assets\n\nThe digital connectivity of high-value\nphysical assets is helping to drive a\nmore efficient use of assets and cost\nsavings. Connected assets can provide\ncontinuous, real-time data on their\noperating conditions, even if they are on\nthe other side of the world. 
Connected\nassets can also be used as the foundation\nof as-a-service business models to\ntrack the usage of rented machines, and\nfor automakers to use with connected\nvehicles and electrification strategies.\n\n\n###### Quality event detection and traceability\n\nManufacturers are increasingly seeking\nend-to-end supply chain traceability —\nto be able to identify and trace\nthe history, distribution, location\nand application of products, parts\nand materials. With event-based\ntraceability, typically using blockchain\nledgers, manufacturers can record\nevents along the supply chain.\nThis can help aid legal compliance,\nsupport quality assurance and brand\ntrust, and provide full supply chain\nvisibility for better risk management.\n\n\n###### Demand-driven manufacturing\n\n**Equipment-as-a-Service:**\nStartup organizations without\nthe in-house infrastructure can\nuse a third-party to realize their\nconcepts, while manufacturers\nwith the production capabilities\ncan ensure minimal downtime\nfor their assets. This involves\ngreater risk for the manufacturer,\nbut also the potential for higher\nand annuitized revenues.\n\n\n14 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 4\n\n## The focus on sustainability", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "3a08841a626197fbd67f40efeccd13e5", + "###### Quality event detection and traceability\n\nManufacturers are increasingly seeking\nend-to-end supply chain traceability —\nto be able to identify and trace\nthe history, distribution, location\nand application of products, parts\nand materials. With event-based\ntraceability, typically using blockchain\nledgers, manufacturers can record\nevents along the supply chain.\nThis can help aid legal compliance,\nsupport quality assurance and brand\ntrust, and provide full supply chain\nvisibility for better risk management.\n\n\n###### Demand-driven manufacturing\n\n**Equipment-as-a-Service:**\nStartup organizations without\nthe in-house infrastructure can\nuse a third-party to realize their\nconcepts, while manufacturers\nwith the production capabilities\ncan ensure minimal downtime\nfor their assets. This involves\ngreater risk for the manufacturer,\nbut also the potential for higher\nand annuitized revenues.\n\n\n14 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 4\n\n## The focus on sustainability\n\n##### It’s an inescapable truth that Earth’s resources are finite, and we need to change our present, linear business models for something that minimizes our use of resources and eliminates waste. Manufacturers need to take a more sustainable approach, where they can limit their negative environmental impacts, while also conserving energy and natural resources.\n\n\nWhen looking at the entire manufacturing\nvalue chain, there are many areas where\nmore sustainable practices can deliver\nmeasurable change. Products can be\ndesigned in a way that reduces waste\nand increases their longevity; materials\ncan be selected and sourced in a more\nethical way; operational efficiency and\ngreen energy can improve production;\nand the introduction of sustainable\npractices for transportation and\nshipping can help reduce carbon\nfootprints. 
[These are part of the move](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[toward more circular business models](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[and establishing what PwC has called the](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[four Rs of the circular economy: Reduce,](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[Refurbish/Reuse, Recycle and Recover.](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n\n\nThere are a number of business\noperating models that employ the four\nRs and support the circular economy.\nSharing platforms and aaS models help\noptimize manufacturing capacity and\nenable businesses to rent rather than\nbuy the machinery and equipment\nthey need. Product use extension helps\nextend the lifecycle of products through\nrepair and refurbishment, while resource\nrecovery means recovering raw materials\nfrom end-of-life products.\n\nAchieving this means establishing\na redesigned supply chain that\nleverages many use cases, technologies\nand solutions we covered earlier.\n\n\nIt will require greater levels of\ncollaboration between suppliers\nand vendors. It will require optimizing\nproduction lines and transportation.\nIt will require greater levels of customer\nengagement to extend product lifecycles\nand close the loop of the supply chain.\n\nBut most of all, it will require data,\nto provide visibility and intelligence\nacross the network, and to be able\nto make the decisions to improve\nefficiency in the present, as well as\nlonger-term decisions based on a\nbroad view of sustainability impacts.\n\n\n15 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Sustainability Solution Accelerator\n\n##### Read now", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "b7fdb18fa34a2929db074b7b529f7ebc", + "There are a number of business\noperating models that employ the four\nRs and support the circular economy.\nSharing platforms and aaS models help\noptimize manufacturing capacity and\nenable businesses to rent rather than\nbuy the machinery and equipment\nthey need. Product use extension helps\nextend the lifecycle of products through\nrepair and refurbishment, while resource\nrecovery means recovering raw materials\nfrom end-of-life products.\n\nAchieving this means establishing\na redesigned supply chain that\nleverages many use cases, technologies\nand solutions we covered earlier.\n\n\nIt will require greater levels of\ncollaboration between suppliers\nand vendors. 
It will require optimizing\nproduction lines and transportation.\nIt will require greater levels of customer\nengagement to extend product lifecycles\nand close the loop of the supply chain.\n\nBut most of all, it will require data,\nto provide visibility and intelligence\nacross the network, and to be able\nto make the decisions to improve\nefficiency in the present, as well as\nlonger-term decisions based on a\nbroad view of sustainability impacts.\n\n\n15 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Sustainability Solution Accelerator\n\n##### Read now\n\n\n[The manufacturing industry alone](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)\n[is responsible for](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[54% of the](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n##### world’s energy consumption\n[and](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[20% of carbon emissions](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n\n\n##### 80% of the world’s leading companies \n[are now incorporating sustainability](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n[into their operations and goals](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n\n\n##### 78% of industrial, manufacturing and metals organizations now report on sustainability — up from 68% in 2017\n\n\n16 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Leveraging the Databricks Lakehouse for Manufacturing\n\nOur open, simple and collaborative Lakehouse for Manufacturing enables automotive, electronics, industrial,\nand transportation & logistics organizations to unlock more value and transform how they use data and AI.\n\n\nAll your sources Any structure or frequency\n\n\nReliable, real-time processing Analytics capabilities for any use case or persona\n\n\nCompetitor News\n& Social\n\nConsumer Devices\n\nVideo & Images\n\nIoT & Shop Floor\n\nEnterprise Resource\nPlanning\n\nSales Transaction\n& Syndicated\n\nInventory & Logistics\n\n\nUnstructured batch\n\n\nAd Hoc Data Science\n\nLow-cost, rapid experimentation\nwith new data and models.\n\nProduction Machine Learning\n\nHigh volume, fine-grained analysis at scale\nserved in the tightest of service windows.\n\nBI Reporting and Dashboarding\n\nPower real-time dashboarding directly,\nor feed data to a data warehouse for\nhigh-concurrency reporting.\n\nReal-Time Applications\n\n\nLakehouse enables a real-time\ndata-driven business with the ability\nto ingest structured, semi-structured\nand unstructured data from ERP,\nSCM, IoT, social or other sources\nin your value chain so that predictive\nAI and ML insights can be realized.\nThis enables them to operate their\nbusiness in real time, deliver more\naccurate analytics that leverage all\ntheir data, and drive collaboration\nand innovation across their value\nchain. 
Most important for capital\nintensive manufacturing business,\nit enables them to move quickly\nfrom proof-of-concept (PoC)\nideation to ROI quickly.\n\n\nSemi-structured real-time\n\nUnstructured batch\n\nSemi-structured real-time\n\nStructured real-time", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": true, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "chunk_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "content_chunked", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "parser_status", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "doc_uri", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "last_modified", + "type": "\"timestamp\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "from cookbook.data_pipeline.chunk_docs import apply_chunking_fn\n", + "from cookbook.databricks_utils import get_table_url\n", + "\n", + "# Tune this parameter to optimize performance. More partitions will improve performance, but may cause out of memory errors if your cluster is too small.\n", + "NUM_PARTITIONS = 50\n", + "\n", + "# Load parsed docs\n", + "parsed_files_df = spark.table(output_config.parsed_docs_table).repartition(NUM_PARTITIONS)\n", + "\n", + "chunked_docs_df = chunked_docs_table = apply_chunking_fn(\n", + " # The source documents table.\n", + " parsed_docs_df=parsed_files_df,\n", + " # The chunking function that takes a string (document) and returns a list of strings (chunks).\n", + " chunking_fn=recursive_character_text_splitter_fn,\n", + " # Choose which columns to propagate from the docs table to chunks table. `doc_uri` column is required we can propagate the original document URL to the Agent's web app.\n", + " propagate_columns=propagate_columns,\n", + ")\n", + "\n", + "# Write to Delta Table\n", + "chunked_docs_df.write.mode(\"overwrite\").option(\n", + " \"overwriteSchema\", \"true\"\n", + ").saveAsTable(output_config.chunked_docs_table)\n", + "\n", + "# Get resulting table\n", + "chunked_docs_df = spark.table(output_config.chunked_docs_table)\n", + "\n", + "# Show number of chunks created\n", + "print(f\"Created {chunked_docs_df.count()} chunks. 
Inspect `chunked_docs_df` or visit {get_table_url(output_config.chunked_docs_table)} to see the results.\")\n", + "\n", + "# enable CDC feed for VS index sync\n", + "cdc_results = spark.sql(f\"ALTER TABLE {output_config.chunked_docs_table} SET TBLPROPERTIES (delta.enableChangeDataFeed = true)\")\n", + "\n", + "# Show chunks\n", + "display(chunked_docs_df.toPandas())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9fe923a8-89c2-4852-9cea-98074b3ce404", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### 🚫✏️ Pipeline step 3: Create the vector index\n", + "\n", + "In this step, we'll embed the document chunks to compute the vector index and create our retriever index, which will be used to retrieve the documents most relevant to the user's question. The embedding pipeline is handled within Databricks Vector Search using [Delta Sync](https://docs.databricks.com/en/generative-ai/create-query-vector-search.html#create-a-vector-search-index)." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d53faa42-2a65-40b0-8fc1-6c27e88df6d0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Found existing index https://adb-984752964297111.11.azuredatabricks.net/explore/data/casaman_ssa/demos/test_product_docs_docs_chunked_index__v2...\nStarting the sync of index casaman_ssa.demos.test_product_docs_docs_chunked_index__v2, this can take 15 minutes or much longer if you have a larger number of documents.\nNOTE: This cell will complete before the vector index has finished syncing/embedding your chunks & is ready for queries!\nView sync status here: https://adb-984752964297111.11.azuredatabricks.net/explore/data/casaman_ssa/demos/test_product_docs_docs_chunked_index__v2\n" + ] + } + ], + "source": [ + "from cookbook.data_pipeline.build_retriever_index import build_retriever_index\n", + "from cookbook.databricks_utils import get_table_url\n", + "\n", + "is_error, msg = retriever_index_result = build_retriever_index(\n", + " # Spark requires `` to escape names with special chars, VS client does not.\n", + " chunked_docs_table_name=output_config.chunked_docs_table.replace(\"`\", \"\"),\n", + " vector_search_endpoint=output_config.vector_search_endpoint,\n", + " vector_search_index_name=output_config.vector_index,\n", + "\n", + " # Must match the embedding endpoint you used to chunk your documents\n", + " embedding_endpoint_name=chunking_config.embedding_model_endpoint,\n", + "\n", + " # Set to true to re-create the vector search index when re-running the data pipeline. If set to True, syncing will not work if you re-run the pipeline and change the schema of chunked_docs_table_name. 
Keeping this as False will allow Vector Search to avoid recomputing embeddings for any row that has a chunk_id that was previously computed.\n", + " force_delete_index_before_create=False,\n", + ")\n", + "if is_error:\n", + " raise Exception(msg)\n", + "else:\n", + " print(\"NOTE: This cell will complete before the vector index has finished syncing/embedding your chunks & is ready for queries!\")\n", + " print(f\"View sync status here: {get_table_url(output_config.vector_index)}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1a1ad14b-2573-4485-8369-d417f7a548f6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### 🚫✏️ Print links to view the resulting tables/index" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0cd40431-4cd3-4cc9-b38d-5ab817c40043", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\nParsed docs table: https://adb-984752964297111.11.azuredatabricks.net/explore/data/casaman_ssa/demos/test_product_docs_docs__v2\n\nChunked docs table: https://adb-984752964297111.11.azuredatabricks.net/explore/data/casaman_ssa/demos/test_product_docs_docs_chunked__v2\n\nVector search index: https://adb-984752964297111.11.azuredatabricks.net/explore/data/casaman_ssa/demos/test_product_docs_docs_chunked_index__v2\n\n" + ] + } + ], + "source": [ + "from cookbook.databricks_utils import get_table_url\n", + "\n", + "print()\n", + "print(f\"Parsed docs table: {get_table_url(output_config.parsed_docs_table)}\\n\")\n", + "print(f\"Chunked docs table: {get_table_url(output_config.chunked_docs_table)}\\n\")\n", + "print(f\"Vector search index: {get_table_url(output_config.vector_index)}\\n\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "01_data_pipeline", + "widgets": {} + }, + "kernelspec": { + "display_name": "genai-cookbook-T2SdtsNM-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/autogen_agent_app_sample_code/02_agent_setup.ipynb b/autogen_agent_app_sample_code/02_agent_setup.ipynb index ad0a5f2..429049a 100644 --- a/autogen_agent_app_sample_code/02_agent_setup.ipynb +++ b/autogen_agent_app_sample_code/02_agent_setup.ipynb @@ -1,113 +1,368 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC ## 👉 START HERE: How to use this notebook -# MAGIC -# MAGIC ### Step 1: Agent storage configuration -# MAGIC -# MAGIC This notebook initializes a `AgentStorageConfig` Pydantic class to define the locations where the Agent's code/config and its supporting data & metadata is stored in the Unity Catalog: -# MAGIC - **Unity Catalog Model:** Stores staging/production versions of the Agent's code/config -# MAGIC - **MLflow Experiment:** Stores every development version of the Agent's code/config, each version's associated quality/cost/latency evaluation results, 
and any MLflow Traces from your development & evaluation processes -# MAGIC - **Evaluation Set Delta Table:** Stores the Agent's evaluation set -# MAGIC -# MAGIC This notebook does the following: -# MAGIC 1. Validates the provided locations exist. -# MAGIC 2. Serializes this configuration to `config/agent_storage_config.yaml` so other notebooks can use it - -# COMMAND ---------- - -# MAGIC %md -# MAGIC **Important note:** Throughout this notebook, we indicate which cells you: -# MAGIC - ✅✏️ *should* customize - these cells contain config settings to change -# MAGIC - 🚫✏️ *typically will not* customize - these cells contain boilerplate code required to validate / save the configuration -# MAGIC -# MAGIC *Cells that don't require customization still need to be run!* - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Install Python libraries - -# COMMAND ---------- - -# MAGIC %pip install -qqqq -U -r requirements.txt -# MAGIC %restart_python - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Connect to Databricks -# MAGIC -# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set. - -# COMMAND ---------- - -from mlflow.utils import databricks_utils as du -import os -if not du.is_in_databricks_notebook(): - from databricks.connect import DatabricksSession - - spark = DatabricksSession.builder.getOrCreate() - os.environ["MLFLOW_TRACKING_URI"] = "databricks" - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Get current user info to set default values - -# COMMAND ---------- - -from cookbook.databricks_utils import get_current_user_info - -user_email, user_name, default_catalog = get_current_user_info(spark) - -print(f"User email: {user_email}") -print(f"User name: {user_name}") -print(f"Default UC catalog: {default_catalog}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### ✅✏️ Configure your Agent's storage locations -# MAGIC -# MAGIC Either review & accept the default values or enter your preferred location. 
- -# COMMAND ---------- - -from cookbook.config.shared.agent_storage_location import AgentStorageConfig -from cookbook.databricks_utils import get_mlflow_experiment_url -import mlflow - -# Default values below for `AgentStorageConfig` -agent_name = "my_agent_autogen" -uc_catalog_name = "casaman_ssa" -uc_schema_name = "demos" - -# Agent storage configuration -agent_storage_config = AgentStorageConfig( - uc_model_name=f"{uc_catalog_name}.{uc_schema_name}.{agent_name}", # UC model to store staging/production versions of the Agent's code/config - evaluation_set_uc_table=f"{uc_catalog_name}.{uc_schema_name}.{agent_name}_eval_set", # UC table to store the evaluation set - mlflow_experiment_name=f"/Users/{user_email}/{agent_name}_mlflow_experiment", # MLflow Experiment to store development versions of the Agent and their associated quality/cost/latency evaluation results + MLflow Traces -) - -# Validate the UC catalog and schema for the Agent'smodel & evaluation table -is_valid, msg = agent_storage_config.validate_catalog_and_schema() -if not is_valid: - raise Exception(msg) - -# Set the MLflow experiment, validating the path is valid -experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name) -# If running in a local IDE, set the MLflow experiment name as an environment variable -os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name - -print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Save the configuration for use by other notebooks - -# COMMAND ---------- - -from cookbook.config import serializable_config_to_yaml_file - -serializable_config_to_yaml_file(agent_storage_config, "./configs/agent_storage_config.yaml") \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d0640741-6d84-482a-aa79-f87b04d04023", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## 👉 START HERE: How to use this notebook\n", + "\n", + "### Step 1: Agent storage configuration\n", + "\n", + "This notebook initializes a `AgentStorageConfig` Pydantic class to define the locations where the Agent's code/config and its supporting data & metadata is stored in the Unity Catalog:\n", + "- **Unity Catalog Model:** Stores staging/production versions of the Agent's code/config\n", + "- **MLflow Experiment:** Stores every development version of the Agent's code/config, each version's associated quality/cost/latency evaluation results, and any MLflow Traces from your development & evaluation processes\n", + "- **Evaluation Set Delta Table:** Stores the Agent's evaluation set\n", + "\n", + "This notebook does the following:\n", + "1. Validates the provided locations exist.\n", + "2. 
Serializes this configuration to `config/agent_storage_config.yaml` so other notebooks can use it" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7702011a-84dd-4281-bba1-ea9e2b5e551d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "**Important note:** Throughout this notebook, we indicate which cells you:\n", + "- ✅✏️ *should* customize - these cells contain config settings to change\n", + "- 🚫✏️ *typically will not* customize - these cells contain boilerplate code required to validate / save the configuration\n", + "\n", + "*Cells that don't require customization still need to be run!*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f8963d6e-3123-4095-bb92-9d508c52ed41", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Install Python libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0a145c3b-d3d9-4b95-b7f6-22e1d8e991c6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\nlangchain 0.1.20 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain 0.1.20 requires langsmith<0.2.0,>=0.1.17, but you have langsmith 0.2.2 which is incompatible.\nlangchain-community 0.0.38 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain-community 0.0.38 requires langsmith<0.2.0,>=0.1.0, but you have langsmith 0.2.2 which is incompatible.\nlangchain-text-splitters 0.0.2 requires langchain-core<0.3,>=0.1.28, but you have langchain-core 0.3.24 which is incompatible.\nydata-profiling 4.5.1 requires pandas!=1.4.0,<2.1,>1.1, but you have pandas 2.2.3 which is incompatible.\nydata-profiling 4.5.1 requires pydantic<2,>=1.8.1, but you have pydantic 2.10.3 which is incompatible.\u001B[0m\u001B[31m\n\u001B[0m\u001B[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "%pip install -qqqq -U -r requirements.txt\n", + "%restart_python" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fff62a18-ac56-497b-82c2-f32bd7d88061", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Connect to Databricks\n", + "\n", + "If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "88f31cbb-504f-4ca1-b96c-d8ecc37d5f73", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.utils import databricks_utils as du\n", + "import os\n", + "if not du.is_in_databricks_notebook():\n", + " from databricks.connect import DatabricksSession\n", + "\n", + " spark = DatabricksSession.builder.getOrCreate()\n", + " os.environ[\"MLFLOW_TRACKING_URI\"] = \"databricks\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a9feb28c-c72b-49b2-bbc4-a9bd4721a7cd", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Get current user info to set default values" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7824cc0a-1b29-4cf9-a974-2c5ef885979f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "User email: manffred.calvosanchez@databricks.com\nUser name: manffred_calvosanchez\nDefault UC catalog: sunny_uc\n" + ] + } + ], + "source": [ + "from cookbook.databricks_utils import get_current_user_info\n", + "\n", + "user_email, user_name, default_catalog = get_current_user_info(spark)\n", + "\n", + "print(f\"User email: {user_email}\")\n", + "print(f\"User name: {user_name}\")\n", + "print(f\"Default UC catalog: {default_catalog}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4b684188-d4eb-4944-86ae-9942a68308c2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### ✅✏️ Configure your Agent's storage locations\n", + "\n", + "Either review & accept the default values or enter your preferred location." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "64682c1f-7e61-430e-84c9-4fb9cad8152b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "All catalogs and schemas exist for both model `casaman_ssa.demos.my_agent_autogen` and evaluation table `casaman_ssa.demos.my_agent_autogen_eval_set`.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/12/11 18:01:36 INFO mlflow.tracking.fluent: Experiment with name '/Users/manffred.calvosanchez@databricks.com/my_agent_autogen_mlflow_experiment' does not exist. 
Creating a new experiment.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "View the MLflow Experiment `/Users/manffred.calvosanchez@databricks.com/my_agent_autogen_mlflow_experiment` at https://adb-984752964297111.11.azuredatabricks.net/ml/experiments/2822477370659093\n" + ] + } + ], + "source": [ + "from cookbook.config.shared.agent_storage_location import AgentStorageConfig\n", + "from cookbook.databricks_utils import get_mlflow_experiment_url\n", + "import mlflow\n", + "\n", + "# Default values below for `AgentStorageConfig` \n", + "agent_name = \"my_agent_autogen\"\n", + "uc_catalog_name = \"casaman_ssa\"\n", + "uc_schema_name = \"demos\"\n", + "\n", + "# Agent storage configuration\n", + "agent_storage_config = AgentStorageConfig(\n", + " uc_model_name=f\"{uc_catalog_name}.{uc_schema_name}.{agent_name}\", # UC model to store staging/production versions of the Agent's code/config\n", + " evaluation_set_uc_table=f\"{uc_catalog_name}.{uc_schema_name}.{agent_name}_eval_set\", # UC table to store the evaluation set\n", + " mlflow_experiment_name=f\"/Users/{user_email}/{agent_name}_mlflow_experiment\", # MLflow Experiment to store development versions of the Agent and their associated quality/cost/latency evaluation results + MLflow Traces\n", + ")\n", + "\n", + "# Validate the UC catalog and schema for the Agent'smodel & evaluation table\n", + "is_valid, msg = agent_storage_config.validate_catalog_and_schema()\n", + "if not is_valid:\n", + " raise Exception(msg)\n", + "\n", + "# Set the MLflow experiment, validating the path is valid\n", + "experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name)\n", + "# If running in a local IDE, set the MLflow experiment name as an environment variable\n", + "os.environ[\"MLFLOW_EXPERIMENT_NAME\"] = agent_storage_config.mlflow_experiment_name\n", + "\n", + "print(f\"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7a49117d-f136-41fa-807d-8be60b863fa9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Save the configuration for use by other notebooks" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6dd99015-5b0d-420b-8a3e-067d84b84dc7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.config import serializable_config_to_yaml_file\n", + "\n", + "serializable_config_to_yaml_file(agent_storage_config, \"./configs/agent_storage_config.yaml\")" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "02_agent_setup", + "widgets": {} + }, + "kernelspec": { + "display_name": "genai-cookbook-T2SdtsNM-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git 
a/autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb b/autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb index 18c451e..0b51a3f 100644 --- a/autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb +++ b/autogen_agent_app_sample_code/03_create_synthetic_eval.ipynb @@ -1,144 +1,49145 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC ## 👉 START HERE: How to use this notebook -# MAGIC -# MAGIC ### Step 1: Create synthetic evaluation data -# MAGIC -# MAGIC To measure your Agent's quality, you need a diverse, representative evaluation set. This notebook turns your unstructured documents into a high-quality synthetic evaluation set so that you can start to evaluate and improve your Agent's quality before subject matter experts are available to label data. -# MAGIC -# MAGIC This notebook does the following: -# MAGIC 1. -# MAGIC -# MAGIC THIS DOES NOT WORK FROM LOCAL IDE YET. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC **Important note:** Throughout this notebook, we indicate which cells you: -# MAGIC - ✅✏️ *should* customize - these cells contain config settings to change -# MAGIC - 🚫✏️ *typically will not* customize - these cells contain code that is parameterized by your configuration. -# MAGIC -# MAGIC *Cells that don't require customization still need to be run!* - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Install Python libraries - -# COMMAND ---------- - -# MAGIC %pip install -qqqq -U -r requirements.txt -# MAGIC dbutils.library.restartPython() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Connect to Databricks -# MAGIC -# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set. - -# COMMAND ---------- - -from mlflow.utils import databricks_utils as du -import os - -if not du.is_in_databricks_notebook(): - from databricks.connect import DatabricksSession - - spark = DatabricksSession.builder.getOrCreate() - os.environ["MLFLOW_TRACKING_URI"] = "databricks" - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Load the Agent's storage locations -# MAGIC -# MAGIC This notebook writes to the evaluation set table that you specified in the [Agent setup](02_agent_setup.ipynb) notebook. - -# COMMAND ---------- - -from cookbook.config.shared.agent_storage_location import AgentStorageConfig -from cookbook.databricks_utils import get_table_url -from cookbook.config import load_serializable_config_from_yaml_file - -# Load the Agent's storage configuration -agent_storage_config: AgentStorageConfig = load_serializable_config_from_yaml_file('./configs/agent_storage_config.yaml') - -# Check if the evaluation set already exists -try: - eval_dataset = spark.table(agent_storage_config.evaluation_set_uc_table) - if eval_dataset.count() > 0: - print(f"Evaluation set {get_table_url(agent_storage_config.evaluation_set_uc_table)} already exists! By default, this notebook will append to the evaluation dataset. If you would like to overwrite the existing evaluation set, please delete the table before running this notebook.") - else: - print(f"Evaluation set {get_table_url(agent_storage_config.evaluation_set_uc_table)} exists, but is empty! 
By default, this notebook will NOT change the schema of this table - if you experience schema related errors, drop this table before running this notebook so it can be recreated with the correct schema.") -except Exception: - print(f"Evaluation set `{agent_storage_config.evaluation_set_uc_table}` does not exist. This notebook will create a new Delta Table at this location.") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ Load the source documents for synthetic evaluation data generation -# MAGIC -# MAGIC Most often, this will be the same as the document output table from the [data pipeline](01_data_pipeline.ipynb). -# MAGIC -# MAGIC Here, we provide code to load the documents table that was created in the [data pipeline](01_data_pipeline.ipynb). -# MAGIC -# MAGIC Alternatively, this can be a Spark DataFrame, Pandas DataFrame, or list of dictionaries with the following keys/columns: -# MAGIC - `doc_uri`: A URI pointing to the document. -# MAGIC - `content`: The content of the document. - -# COMMAND ---------- - -from cookbook.config.data_pipeline import DataPipelineConfig -from cookbook.config import load_serializable_config_from_yaml_file - -datapipeline_config: DataPipelineConfig= load_serializable_config_from_yaml_file('./configs/data_pipeline_config.yaml') - -source_documents = spark.table(datapipeline_config.output.parsed_docs_table) - -display(source_documents.toPandas()) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ Run the synthetic evaluation data generation -# MAGIC -# MAGIC Optionally, you can customize the guidelines to guide the synthetic data generation. By default, guidelines are not applied - to apply the guidelines, uncomment `guidelines=guidelines` in the `generate_evals_df(...)` call. See our [documentation](https://docs.databricks.com/en/generative-ai/agent-evaluation/synthesize-evaluation-set.html) for more details. - -# COMMAND ---------- - -from databricks.agents.evals import generate_evals_df - -# NOTE: The guidelines you provide are a free-form string. The markdown string below is the suggested formatting for the set of guidelines, however you are free -# to add your sections here. Note that this will be prompt-engineering an LLM that generates the synthetic data, so you may have to iterate on these guidelines before -# you get the results you desire. -guidelines = """ -# Task Description -The Agent is a RAG chatbot that answers questions about using Spark on Databricks. The Agent has access to a corpus of Databricks documents, and its task is to answer the user's questions by retrieving the relevant docs from the corpus and synthesizing a helpful, accurate response. The corpus covers a lot of info, but the Agent is specifically designed to interact with Databricks users who have questions about Spark. So questions outside of this scope are considered irrelevant. - -# User personas -- A developer who is new to the Databricks platform -- An experienced, highly technical Data Scientist or Data Engineer - -# Example questions -- what API lets me parallelize operations over rows of a delta table? -- Which cluster settings will give me the best performance when using Spark? - -# Additional Guidelines -- Questions should be succinct, and human-like -""" - -synthesized_evals_df = generate_evals_df( - docs=source_documents, - # The number of evaluations to generate for each doc. - num_evals=10, - # A optional set of guidelines that help guide the synthetic generation. This is a free-form string that will be used to prompt the generation. 
- # guidelines=guidelines -) - -# Write the synthetic evaluation data to the evaluation set table -spark.createDataFrame(synthesized_evals_df).write.format("delta").mode("append").saveAsTable(agent_storage_config.evaluation_set_uc_table) - -# Display the synthetic evaluation data -eval_set_df = spark.table(agent_storage_config.evaluation_set_uc_table) -display(eval_set_df.toPandas()) \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fb6c2851-d04e-4c40-8f67-6f7bd77057db", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## 👉 START HERE: How to use this notebook\n", + "\n", + "### Step 1: Create synthetic evaluation data\n", + "\n", + "To measure your Agent's quality, you need a diverse, representative evaluation set. This notebook turns your unstructured documents into a high-quality synthetic evaluation set so that you can start to evaluate and improve your Agent's quality before subject matter experts are available to label data.\n", + "\n", + "This notebook does the following:\n", + "1. \n", + "\n", + "THIS DOES NOT WORK FROM LOCAL IDE YET." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0988eef5-d1bf-4e28-956e-5b895306996c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "**Important note:** Throughout this notebook, we indicate which cells you:\n", + "- ✅✏️ *should* customize - these cells contain config settings to change\n", + "- 🚫✏️ *typically will not* customize - these cells contain code that is parameterized by your configuration.\n", + "\n", + "*Cells that don't require customization still need to be run!*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c1bb8aeb-7397-4f57-ba94-3eeb5d85d59c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Install Python libraries" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "cbcdef70-657e-4f12-b564-90d0f5b74e42", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\nlangchain 0.1.20 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain 0.1.20 requires langsmith<0.2.0,>=0.1.17, but you have langsmith 0.2.2 which is incompatible.\nlangchain-community 0.0.38 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain-community 0.0.38 requires langsmith<0.2.0,>=0.1.0, but you have langsmith 0.2.2 which is incompatible.\nlangchain-text-splitters 0.0.2 requires langchain-core<0.3,>=0.1.28, but you have langchain-core 0.3.24 which is incompatible.\nydata-profiling 4.5.1 requires pandas!=1.4.0,<2.1,>1.1, but you have pandas 2.2.3 which is incompatible.\nydata-profiling 4.5.1 requires pydantic<2,>=1.8.1, but you have pydantic 2.10.3 which is incompatible.\u001B[0m\u001B[31m\n\u001B[0m\u001B[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "%pip install -qqqq -U -r requirements.txt\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "99bc4527-3968-4fc0-9d41-7fbdb8ccea90", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Connect to Databricks\n", + "\n", + "If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this is running in a Databricks Notebook, these values are already set." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "987f7958-8d58-4f3d-826d-b719e57a845b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.utils import databricks_utils as du\n", + "import os\n", + "\n", + "if not du.is_in_databricks_notebook():\n", + " from databricks.connect import DatabricksSession\n", + "\n", + " spark = DatabricksSession.builder.getOrCreate()\n", + " os.environ[\"MLFLOW_TRACKING_URI\"] = \"databricks\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "70f6d803-1117-4067-9997-2bc53ac00822", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Load the Agent's storage locations\n", + "\n", + "This notebook writes to the evaluation set table that you specified in the [Agent setup](02_agent_setup.ipynb) notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5bd1a3ff-bd90-49a6-944e-6d9afe138d59", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Evaluation set `casaman_ssa.demos.my_agent_autogen_eval_set` does not exist. 
This notebook will create a new Delta Table at this location.\n" + ] + } + ], + "source": [ + "from cookbook.config.shared.agent_storage_location import AgentStorageConfig\n", + "from cookbook.databricks_utils import get_table_url\n", + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "\n", + "# Load the Agent's storage configuration\n", + "agent_storage_config: AgentStorageConfig = load_serializable_config_from_yaml_file('./configs/agent_storage_config.yaml')\n", + "\n", + "# Check if the evaluation set already exists\n", + "try:\n", + " eval_dataset = spark.table(agent_storage_config.evaluation_set_uc_table)\n", + " if eval_dataset.count() > 0:\n", + " print(f\"Evaluation set {get_table_url(agent_storage_config.evaluation_set_uc_table)} already exists! By default, this notebook will append to the evaluation dataset. If you would like to overwrite the existing evaluation set, please delete the table before running this notebook.\")\n", + " else:\n", + " print(f\"Evaluation set {get_table_url(agent_storage_config.evaluation_set_uc_table)} exists, but is empty! By default, this notebook will NOT change the schema of this table - if you experience schema related errors, drop this table before running this notebook so it can be recreated with the correct schema.\")\n", + "except Exception:\n", + " print(f\"Evaluation set `{agent_storage_config.evaluation_set_uc_table}` does not exist. This notebook will create a new Delta Table at this location.\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4af68bf5-7a1d-4d84-b363-45de317ab05e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ Load the source documents for synthetic evaluation data generation\n", + "\n", + "Most often, this will be the same as the document output table from the [data pipeline](01_data_pipeline.ipynb).\n", + "\n", + "Here, we provide code to load the documents table that was created in the [data pipeline](01_data_pipeline.ipynb).\n", + "\n", + "Alternatively, this can be a Spark DataFrame, Pandas DataFrame, or list of dictionaries with the following keys/columns:\n", + "- `doc_uri`: A URI pointing to the document.\n", + "- `content`: The content of the document." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "89f7035b-9957-4300-97e2-bb21afc47039", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/html": [ + "
content | parser_status | doc_uri | last_modified
**EBOOK**\n", + "\n", + "## The Big Book of Data Engineering 2nd Edition\n", + "\n", + "A collection of technical\n", + "blogs, including code\n", + "samples and notebooks\n", + "\n", + "##### With all-new content\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Contents\n", + "\n", + "**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n", + "\n", + "**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n", + "\n", + "**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n", + "\n", + "**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n", + "\n", + "**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n", + "\n", + "**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n", + "\n", + "**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n", + "\n", + "**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n", + "\n", + "**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n", + "\n", + "**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n", + "\n", + "**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n", + "\n", + "**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n", + "\n", + "**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n", + "\n", + "**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n", + "\n", + "**4 . 1** Akamai .................................................................................................................................................................................................... 77\n", + "\n", + "**4 . 
2** Grammarly ........................................................................................................................................................................................... 80\n", + "\n", + "**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n", + "\n", + "**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n", + "\n", + "**4 . 5** Rivian .................................................................................................................................................................................................... 90\n", + "\n", + "**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 01\n", + "\n", + "\n", + "### Introduction to Data Engineering on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "Organizations realize the value data plays as a strategic asset for various\n", + "business-related initiatives, such as growing revenues, improving the customer\n", + "experience, operating efficiently or improving a product or service. However,\n", + "accessing and managing data for these initiatives has become increasingly\n", + "complex. Most of the complexity has arisen with the explosion of data volumes\n", + "and data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\n", + "to increase, 73% of the data goes unused for analytics or decision-making. In\n", + "order to try and decrease this percentage and make more data usable, data\n", + "engineering teams are responsible for building data pipelines to efficiently and\n", + "reliably deliver data. 
But the process of building these complex data pipelines\n", + "comes with a number of difficulties:\n", + "\n", + "**•** In order to get data into a data lake, data engineers are required\n", + "to spend immense time hand-coding repetitive data ingestion tasks\n", + "\n", + "**•** Since data platforms continuously change, data engineers\n", + "spend time building and maintaining, and then rebuilding, complex\n", + "scalable infrastructure\n", + "\n", + "**•** As data pipelines become more complex, data engineers are\n", + "required to find reliable tools to orchestrate these pipelines\n", + "\n", + "**•** With the increasing importance of real-time data, low latency data\n", + "pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "**•** Finally, with all pipelines written, data engineers need to constantly\n", + "focus on performance, tuning pipelines and architectures to meet SLAs\n", + "\n", + "\n", + "**How can Databricks help?**\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The Lakehouse Platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "Lakehouse Platform\n", + "\n", + "**One platform to support multiple personas**\n", + "\n", + "\n", + "**BI & Data**\n", + "**Warehousing**\n", + "\n", + "\n", + "**Data**\n", + "**Engineering**\n", + "\n", + "\n", + "**Data**\n", + "**Streaming**\n", + "\n", + "\n", + "**Data**\n", + "**Science & ML**\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "\n", + "**Unity Catalog**\n", + "**Fine-grained governance for data and AI**\n", + "\n", + "**Delta Lake**\n", + "**Data reliability and performance**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "All Raw Data (Logs, Texts, Audio, Video, Images)\n", + "\n", + "\n", + "Figure 1\n", + "The Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key differentiators for successful data engineering**\n", + "**with Databricks**\n", + "\n", + "By simplifying on a lakehouse architecture, data engineers need an\n", + "enterprise-grade and enterprise-ready approach to building data pipelines.\n", + "To be successful, a data engineering solution team must embrace these eight\n", + "key differentiating capabilities:\n", + "\n", + "**Data ingestion at scale**\n", + "With the ability to ingest petabytes of data with auto-evolving schemas,\n", + "data engineers can deliver fast, reliable, scalable and automatic data for\n", + "analytics, data science or machine learning. 
This includes:\n", + "\n", + "**•** Incrementally and efficiently processing data as it arrives\n", + "from files or streaming sources like Kafka, DBMS and NoSQL\n", + "\n", + "**•** Automatically inferring schema and detecting column\n", + "changes for structured and unstructured data formats\n", + "\n", + "**•** Automatically and efficiently tracking data as it arrives with\n", + "\n", + "no manual intervention\n", + "\n", + "**•** Preventing data loss by rescuing data columns\n", + "\n", + "\n", + "**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified orchestration of data workflows**\n", + "Simple, clear and reliable orchestration of data processing tasks for data,\n", + "analytics and machine learning pipelines with the ability to run multiple\n", + "non-interactive tasks as a directed acyclic graph (DAG) on a Databricks\n", + "compute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\n", + "in a DAG using Databricks Workflows, an orchestration tool included in the\n", + "lakehouse with no need to maintain or pay for an external orchestration service.\n", + "\n", + "**•** Easily create and manage multiple tasks with dependencies via UI,\n", + "API or from your IDE\n", + "\n", + "**•** Have full observability to all workflow runs and get alerted when\n", + "tasks fail for fast troubleshooting and efficient repair and rerun\n", + "\n", + "**•** Leverage high reliability of 99.95% uptime\n", + "\n", + "**•** Use performance optimization clusters that parallelize jobs and\n", + "minimize data movement with cluster reuse\n", + "\n", + "**Data quality validation and monitoring**\n", + "Improve data reliability throughout the data lakehouse so data teams can\n", + "confidently trust the information for downstream initiatives by:\n", + "\n", + "**•** Defining data quality and integrity controls within the pipeline\n", + "with defined data expectations\n", + "\n", + "**•** Addressing data quality errors with predefined policies\n", + "(fail, drop, alert, quarantine)\n", + "\n", + "**•** Leveraging the data quality metrics that are captured, tracked\n", + "and reported for the entire data pipeline\n", + "\n", + "\n", + "Data\n", + "Sources\n", + "\n", + "Data\n", + "Warehouses\n", + "\n", + "On-premises\n", + "Systems\n", + "\n", + "SaaS\n", + "Applications\n", + "\n", + "Machine &\n", + "Application Logs\n", + "\n", + "Application\n", + "Events\n", + "\n", + "Mobile & IoT\n", + "Data\n", + "\n", + "\n", + "Cloud\n", + "Storage\n", + "\n", + "Messag\n", + "e Buses\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "**Workflows** for end-to-end orchestration\n", + "\n", + "\n", + "Real-Time BI Apps\n", + "\n", + "Real-Time AI Apps\n", + "\n", + "\n", + "Real-Time Analytics with\n", + "**Databricks SQL**\n", + "\n", + "Real-Time Machine Learning\n", + "with\n", + "**Databricks ML**\n", + "\n", + "\n", + "Streaming ETL with\n", + "**Delta Live Tables**\n", + "\n", + "\n", + "Predictive\n", + "Maintenance\n", + "\n", + "\n", + "Personalized\n", + "Offers\n", + "\n", + "\n", + "Patient\n", + "Diagnostics\n", + "\n", + "\n", + "Real-Time Operational\n", + "Apps\n", + "\n", + "\n", + "Real-Time Applications with\n", + "**Spark Structured Streaming**\n", + "\n", + "**Photon** for lightning-fast data processing\n", + "\n", + "**Unity Catalog** for data governance and sharing\n", + "\n", + "**Delta Lake** for open and reliable data storage\n", + "\n", + "\n", + "Alerts Detection Fraud\n", + "\n", + "\n", + "Dynamic\n", + "Pricing\n", + "\n", + "\n", + "©2023 Databricks Inc. 
— All rights reserved\n", + "\n", + "Figure 2\n", + "A unified set of tools for real-time data processing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fault tolerant and automatic recovery**\n", + "Handle transient errors and recover from most common error conditions\n", + "occurring during the operation of a pipeline with fast, scalable automatic\n", + "recovery that includes:\n", + "\n", + "**•** Fault tolerant mechanisms to consistently recover the state of data\n", + "\n", + "**•** The ability to automatically track progress from the source with\n", + "checkpointing\n", + "\n", + "**•** The ability to automatically recover and restore the data pipeline state\n", + "\n", + "**Data pipeline observability**\n", + "Monitor overall data pipeline status from a dataflow graph dashboard and\n", + "visually track end-to-end pipeline health for performance, quality and latency.\n", + "Data pipeline observability capabilities include:\n", + "\n", + "**•** A high-quality, high-fidelity lineage diagram that provides visibility\n", + "into how data flows for impact analysis\n", + "\n", + "**•** Granular logging with performance and status of the data pipeline\n", + "at a row level\n", + "\n", + "**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n", + "\n", + "\n", + "**Automatic deployments and operations**\n", + "Ensure reliable and predictable delivery of data for analytics and machine\n", + "learning use cases by enabling easy and automatic data pipeline deployments\n", + "and rollbacks to minimize downtime. Benefits include:\n", + "\n", + "**•** Complete, parameterized and automated deployment for the\n", + "continuous delivery of data\n", + "\n", + "**•** End-to-end orchestration, testing and monitoring of data pipeline\n", + "deployment across all major cloud providers\n", + "\n", + "**Migrations**\n", + "Accelerating and de-risking the migration journey to the lakehouse, whether\n", + "from legacy on-prem systems or disparate cloud services.\n", + "\n", + "The migration process starts with a detailed discovery and assessment to\n", + "get insights on legacy platform workloads and estimate migration as well as\n", + "Databricks platform consumption costs. Get help with the target architecture\n", + "and how the current technology stack maps to Databricks, followed by a\n", + "phased implementation based on priorities and business needs. Throughout\n", + "this journey companies can leverage:\n", + "\n", + "**•** Automation tools from Databricks and its ISV partners\n", + "\n", + "**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n", + "\n", + "**•** Databricks Professional Services and training\n", + "\n", + "This is the recommended approach for a successful migration, whereby\n", + "customers have seen a 25-50% reduction in costs and 2-3x faster time to value\n", + "for their use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified governance**\n", + "With Unity Catalog, data engineering and governance teams benefit from an\n", + "enterprisewide data catalog with a single interface to manage permissions,\n", + "centralize auditing, automatically track data lineage down to the column level,\n", + "and share data across platforms, clouds and regions. 
Benefits:\n", + "\n", + "**•** Discover all your data in one place, no matter where it lives,\n", + "and centrally manage fine-grained access permissions using an\n", + "ANSI SQL-based interface\n", + "\n", + "**•** Leverage automated column-level data lineage to perform impact\n", + "analysis of any data changes across the pipeline and conduct\n", + "root cause analysis of any errors in the data pipelines\n", + "\n", + "**•** Centrally audit data entitlements and access\n", + "\n", + "**•** Share data across clouds, regions and data platforms,\n", + "while maintaining a single copy of your data in your cloud storage\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 3\n", + "The Databricks Lakehouse Platform integrates with a large collection of technologies\n", + "\n", + "\n", + "**A rich ecosystem of data solutions**\n", + "The Databricks Lakehouse Platform is built on open source technologies and\n", + "uses open standards so leading data solutions can be leveraged with anything\n", + "you build on the lakehouse. A large collection of technology partners make it\n", + "easy and simple to integrate the technologies you rely on when migrating to\n", + "Databricks and to know you are not locked into a closed data technology stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Conclusion**\n", + "\n", + "As organizations strive to become data-driven, data engineering is a focal\n", + "point for success. To deliver reliable, trustworthy data, data engineers shouldn’t\n", + "need to spend time manually developing and maintaining an end-to-end\n", + "ETL lifecycle. Data engineering teams need an efficient, scalable way to\n", + "simplify ETL development, improve data reliability and manage operations.\n", + "\n", + "As described, the eight key differentiating capabilities simplify the\n", + "management of the ETL lifecycle by automating and maintaining all data\n", + "dependencies, leveraging built-in quality controls with monitoring and by\n", + "providing deep visibility into pipeline operations with automatic recovery.\n", + "Data engineering teams can now focus on easily and rapidly building reliable\n", + "end-to-end production-ready data pipelines using only SQL or Python\n", + "for batch and streaming that deliver high-value data for analytics, data\n", + "science or machine learning.\n", + "\n", + "\n", + "**Follow proven best practices**\n", + "\n", + "In the next section, we describe best practices for data engineering\n", + "end-to end use cases drawn from real-world examples. 
From data ingestion\n", + "and real-time processing to analytics and machine learning, you’ll learn\n", + "how to translate raw data into actionable data.\n", + "\n", + "As you explore the rest of this guide, you can find data sets and code\n", + "samples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\n", + "get your hands dirty as you explore all aspects of the data lifecycle on the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### Guidance and Best Practices\n", + "\n", + "**2.1** Top 5 Databricks Performance Tips\n", + "\n", + "**2.2** How to Profile PySpark\n", + "\n", + "**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n", + "\n", + "**2.4** Streaming in Production: Collected Best Practices\n", + "\n", + "**2.5** Streaming in Production: Collected Best Practices, Part 2\n", + "\n", + "**2.6** Building Geospatial Data Products\n", + "\n", + "**2.7** Data Lineage With Unity Catalog\n", + "\n", + "**2.8** Easy Ingestion to Lakehouse With COPY INTO\n", + "\n", + "**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n", + "\n", + "**2.10** Best Practices for Cross-Government Data Sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.1\n", + "\n", + "**Top 5 Databricks Performance Tips**\n", + "\n", + "by **B R YA N S M I T H** and **R O B S A K E R**\n", + "\n", + "March 10, 2022\n", + "\n", + "\n", + "As solutions architects, we work closely with customers every day to help them\n", + "get the best performance out of their jobs on Databricks — and we often end\n", + "up giving the same advice. It’s not uncommon to have a conversation with a\n", + "customer and get double, triple, or even more performance with just a few\n", + "tweaks. So what’s the secret? How are we doing this? Here are the top 5 things\n", + "we see that can make a huge impact on the performance customers get\n", + "from Databricks.\n", + "\n", + "Here’s a TLDR:\n", + "\n", + "**•** **Use larger clusters.** It may sound obvious, but this is the number\n", + "one problem we see. It’s actually not any more expensive to use a large\n", + "cluster for a workload than it is to use a smaller one. It’s just faster.\n", + "If there’s anything you should take away from this article, it’s this.\n", + "\n", + "Read section 1. Really.\n", + "\n", + "**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\n", + "to learn more. You won’t regret it.\n", + "\n", + "\n", + "\n", + "**•** **Clean out your configurations** . Configurations carried from one\n", + "Apache Spark™ version to the next can cause massive problems. Clean up!\n", + "Read section 3 to learn more.\n", + "\n", + "**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\n", + "correctly, if at all. See Section 4 to learn more.\n", + "\n", + "**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\n", + "you’re writing Spark code, jump to section 5.\n", + "\n", + "**•** **Bonus tip! 
Table design is super important** . We’ll go into this in a future\n", + "blog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n", + "\n", + "**1. Give your clusters horsepower!**\n", + "\n", + "This is the number one mistake customers make. Many customers create tiny\n", + "clusters of two workers with four cores each, and it takes forever to do anything.\n", + "The concern is always the same: they don’t want to spend too much money on\n", + "larger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n", + "**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n", + "\n", + "\n", + "-----\n", + "\n", + "The key is that you’re renting the cluster for the length of the workload. So, if\n", + "you spin up that two worker cluster and it takes an hour, you’re paying for those\n", + "workers for the full hour. However, if you spin up a four worker cluster and it takes\n", + "only half an hour, the cost is actually the same! And that trend continues as long\n", + "as there’s enough work for the cluster to do.\n", + "\n", + "Here’s a hypothetical scenario illustrating the point:\n", + "\n", + "**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n", + "\n", + "1 $1 2 $2\n", + "\n", + "2 $2 1 $2\n", + "\n", + "4 $4 0.5 $2\n", + "\n", + "8 $8 0.25 $2\n", + "\n", + "Notice that the total cost of the workload stays the same while the real-world\n", + "time it takes for the job to run drops significantly. So, bump up your Databricks\n", + "cluster specs and speed up your workloads without spending any more money. It\n", + "\n", + "can’t really get any simpler than that.\n", + "\n", + "**2. Use Photon**\n", + "\n", + "Our colleagues in engineering have rewritten the Spark execution engine in C++\n", + "and dubbed it Photon. The results are impressive!\n", + "\n", + "\n", + "Beyond the obvious improvements due to running the engine in native code,\n", + "they’ve also made use of CPU-level performance features and better memory\n", + "\n", + "management. On top of this, they’ve rewritten the Parquet writer in C++. So this\n", + "makes writing to Parquet and Delta (based on Parquet) super fast as well!\n", + "\n", + "But let’s also be clear about what Photon is speeding up. It improves\n", + "computation speed for any built-in functions or operations, as well as writes to\n", + "Parquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n", + "(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\n", + "spending most of its time reading from an ancient on-prem database? Won’t\n", + "help there either, unfortunately.\n", + "\n", + "\n", + "-----\n", + "\n", + "The good news is that it helps where it can. So even if part of your job can’t be\n", + "sped up, it will speed up the other parts. Also, most jobs are written with the\n", + "native operations and spend a lot of time writing to Delta, and Photon helps a lot\n", + "there. So give it a try. You may be amazed by the results!\n", + "\n", + "**3. Clean out old configurations**\n", + "\n", + "You know those Spark configurations you’ve been carrying along from version to\n", + "version and no one knows what they do anymore? They may not be harmless.\n", + "We’ve seen jobs go from running for hours down to minutes simply by cleaning\n", + "out old configurations. 
There may have been a quirk in a particular version of\n", + "Spark, a performance tweak that has not aged well, or something pulled off\n", + "some blog somewhere that never really made sense. At the very least, it’s worth\n", + "revisiting your Spark configurations if you’re in this situation. Often the default\n", + "configurations are the best, and they’re only getting better. Your configurations\n", + "may be holding you back.\n", + "\n", + "**4. The Delta Cache is your friend**\n", + "\n", + "This may seem obvious, but you’d be surprised how many people are not using\n", + "the [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\n", + "the workers’ SSDs for faster access.\n", + "\n", + "\n", + "If you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\n", + "by default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\n", + "your “hot” tables when you’re starting an endpoint. This will ensure blazing fast\n", + "speeds for any queries on those tables.\n", + "\n", + "If you’re using regular clusters, be sure to use the i3 series on Amazon Web\n", + "Services (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\n", + "all have fast SSDs and caching enabled by default.\n", + "\n", + "Of course, your mileage may vary. If you’re doing BI, which involves reading the\n", + "same tables over and over again, caching gives an amazing boost. However, if\n", + "you’re simply reading a table once and writing out the results as in some ETL\n", + "jobs, you may not get much benefit. You know your jobs better than anyone.\n", + "Go forth and conquer.\n", + "\n", + "\n", + "-----\n", + "\n", + "**5. Be aware of lazy evaluation**\n", + "\n", + "\n", + "However, there is a catch here. Every time you try to display or write out\n", + "results, it runs the execution plan again. Let’s look at the same block of code\n", + "but extend it and do a few more operations.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ".filter(...)\n", + ")\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "\n", + "_# Unfortunately this will run the plan again, including filtering, joining,_\n", + "_etc_\n", + "df2.display()\n", + "\n", + "_# So will this…_\n", + "df2.count()\n", + "—------\n", + "\n", + "\n", + "If you’re a data analyst or data scientist only using SQL or doing BI you can skip\n", + "this section. However, if you’re in data engineering and writing pipelines or doing\n", + "processing using Databricks/Spark, read on.\n", + "\n", + "When you’re writing Spark code like select, groupBy, filter, etc., you’re really\n", + "building an execution plan. You’ll notice the code returns almost immediately when\n", + "you run these functions. That’s because it’s not actually doing any computation. So\n", + "even if you have petabytes of data, it will return in less than a second.\n", + "\n", + "However, once you go to write your results out you’ll notice it takes longer. This\n", + "is due to lazy evaluation. 
It’s not until you try to display or write results that your\n", + "execution plan is actually run.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + "\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "—------\n", + "\n", + "\n", + "-----\n", + "\n", + "The developer of this code may very well be thinking that they’re just printing\n", + "out results three times, but what they’re really doing is kicking off the same\n", + "processing three times. Oops. That’s a lot of extra work. This is a very common\n", + "mistake we run into. So why is there lazy evaluation, and what do we do about it?\n", + "\n", + "In short, processing with lazy evaluation is way faster than without it.\n", + "Databricks/Spark looks at the full execution plan and finds opportunities\n", + "for optimization that can reduce processing time by orders of magnitude.\n", + "So that’s great, but how do we avoid the extra computation? The answer\n", + "is pretty straightforward: save computed results you will reuse.\n", + "\n", + "\n", + "This works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\n", + "benefit greatly from lazy evaluation, but it’s something a lot of customers trip\n", + "over. So be aware of its existence and save results you reuse in order to avoid\n", + "unnecessary computation.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "Let’s look at the same block of code again, but this time let’s avoid the\n", + "recomputation:\n", + "\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + ")\n", + "\n", + "_# save it_\n", + "df2.write.save(path)\n", + "\n", + "_# load it back in_\n", + "df3 = spark.read.load(path)\n", + "\n", + "_# now use it_\n", + "df3.display()\n", + "\n", + "_# this is not doing any extra computation anymore. No joins, filtering,_\n", + "_etc. It’s already done and saved._\n", + "df3.display()\n", + "\n", + "_# nor is this_\n", + "df3.count()\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.2 \u0007\n", + "\n", + "**How to Profile PySpark**\n", + "\n", + "by **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n", + "\n", + "October 6, 2022\n", + "\n", + "\n", + "In Apache Spark™, declarative Python APIs are supported for big data workloads.\n", + "They are powerful enough to handle most common use cases. Furthermore,\n", + "PySpark UDFs offer more flexibility since they enable users to run arbitrary\n", + "Python code on top of the Apache Spark™ engine. Users only have to state\n", + "“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\n", + "PySpark easier to use, but it can be difficult to identify performance bottlenecks\n", + "and apply custom optimizations.\n", + "\n", + "To address the difficulty mentioned above, PySpark supports various profiling\n", + "tools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n", + "[implementations](https://docs.python.org/3/library/profile.html) . 
PySpark Profilers provide information such as the number\n", + "of function calls, total time spent in the given function, and filename, as well\n", + "as line number to help navigation. That information is essential to exposing\n", + "tight loops in your PySpark programs, and allowing you to make performance\n", + "\n", + "improvement decisions.\n", + "\n", + "\n", + "**Driver profiling**\n", + "\n", + "PySpark applications run as independent sets of processes on a cluster,\n", + "coordinated by the SparkContext object in the driver program. On the driver\n", + "side, PySpark is a regular Python process; thus, we can profile it as a normal\n", + "Python program using cProfile as illustrated below:\n", + "\n", + "import cProfile\n", + "\n", + "with cProfile.Profile() as pr:\n", + "_# Your code_\n", + "\n", + "pr.print_stats()\n", + "\n", + "**Workers profiling**\n", + "\n", + "Executors are distributed on worker nodes in the cluster, which introduces\n", + "complexity because we need to aggregate profiles. Furthermore, a Python worker\n", + "process is spawned per executor for PySpark UDF execution, which makes the\n", + "profiling more intricate.\n", + "\n", + "\n", + "-----\n", + "\n", + "The UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\n", + "and becomes a major tool to profile workers for PySpark applications. We’ll\n", + "illustrate how to use the UDF profiler with a simple Pandas UDF example.\n", + "\n", + "Firstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n", + "```\n", + " sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n", + " 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n", + " by 'id' )\n", + " ).withColumn( 'v' , rand())\n", + "\n", + "```\n", + "Later, we will group by the id column, which results in 8 groups with 1,000 rows\n", + "per group.\n", + "\n", + "The Pandas UDF plus_one is then created and applied as shown below:\n", + "```\n", + " import pandas as pd\n", + " def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf.apply( lambda x: x + 1 , axis= 1 )\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "Executing the example above and running sc.show_profiles() prints the\n", + "following profile. The profile below can also be dumped to disk by sc.dump_\n", + "profiles(path).\n", + "\n", + "The UDF id in the profile (271, highlighted above) matches that in the Spark plan\n", + "for res. The Spark plan can be shown by calling res.explain() .\n", + "\n", + "\n", + "Note that plus_one takes a pandas DataFrame and returns another pandas\n", + "DataFrame. For each group, all columns are passed together as a pandas\n", + "DataFrame to the plus_one UDF, and the returned pandas DataFrames are\n", + "combined into a PySpark DataFrame.\n", + "\n", + "\n", + "-----\n", + "\n", + "The first line in the profile’s body indicates the total number of calls that were\n", + "monitored. 
The column heading includes\n", + "\n", + "**•** ncalls , for the number of calls.\n", + "\n", + "**•** tottime , for the total time spent in the given function (excluding time\n", + "spent in calls to sub-functions)\n", + "\n", + "**•** percall , the quotient of tottime divided by ncalls\n", + "\n", + "**•** cumtime , the cumulative time spent in this and all subfunctions (from\n", + "invocation till exit)\n", + "\n", + "**•** percall , the quotient of cumtime divided by primitive calls\n", + "\n", + "**•** filename:lineno(function) , which provides the respective information\n", + "for each function\n", + "\n", + "Digging into the column details: plus_one is triggered once per group, 8 times\n", + "in total; _arith_method of pandas Series is called once per row, 8,000 times\n", + "in total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\n", + "row, thus suffering from high invocation overhead.\n", + "\n", + "We can reduce such overhead by substituting the pandas.DataFrame.apply\n", + "with pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\n", + "follows:\n", + "```\n", + " import pandas as pd\n", + " def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf + 1\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n", + " schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "The updated profile is as shown below.\n", + "\n", + "We can summarize the optimizations as follows:\n", + "\n", + "**•** Arithmetic operation from 8,000 calls to 8 calls\n", + "\n", + "**•** Total function calls from 2,898,160 calls to 2,384 calls\n", + "\n", + "**•** Total execution time from 2.300 seconds to 0.004 seconds\n", + "\n", + "The short example above demonstrates how the UDF profiler helps us deeply\n", + "understand the execution, identify the performance bottleneck and enhance\n", + "the overall performance of the user-defined function.\n", + "\n", + "The UDF profiler was implemented based on the executor-side profiler,\n", + "which is designed for PySpark RDD API. The executor-side profiler is available\n", + "in all active Databricks Runtime versions.\n", + "\n", + "\n", + "-----\n", + "\n", + "Both the UDF profiler and the executor-side profiler run on Python workers.\n", + "They are controlled by the spark.python.profile Spark configuration, which\n", + "is false by default. We can enable that Spark configuration on a Databricks\n", + "Runtime cluster as shown below.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "PySpark profilers are implemented based on cProfile; thus, the profile reporting\n", + "relies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\n", + "collecting profile reports from Python workers.\n", + "\n", + "Powerful profilers are provided by PySpark in order to identify hot loops and\n", + "suggest potential improvements. They are easy to use and critical to enhance\n", + "the performance of PySpark programs. 
The UDF profiler, which is available\n", + "starting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\n", + "challenges and brings insights to user-defined functions.\n", + "\n", + "In addition, there is an ongoing effort in the Apache Spark™ open source\n", + "community to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\n", + "more information.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.3 \u0007\n", + "\n", + "**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n", + "**and Apache Kafka**\n", + "\n", + "by **F R A N K M U N Z**\n", + "\n", + "August 9, 2022\n", + "\n", + "\n", + "[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\n", + "approach for creating reliable data pipelines and fully manages the underlying\n", + "infrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\n", + "actionable insights derived from near real-time data. Delta Live Tables enables\n", + "low-latency streaming data pipelines to support such use cases with low\n", + "latencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n", + "[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n", + "\n", + "This article will walk through using DLT with Apache Kafka while providing the\n", + "required Python code to ingest streams. The recommended system architecture\n", + "will be explained, and related DLT settings worth considering will be explored\n", + "along the way.\n", + "\n", + "**Streaming platforms**\n", + "\n", + "Event buses or message buses decouple message producers from consumers.\n", + "A popular streaming use case is the collection of click-through data from\n", + "users navigating a website where every user interaction is stored as an event in\n", + "\n", + "\n", + "Apache Kafka. The event stream from Kafka is then used for real-time streaming\n", + "data analytics. Multiple message consumers can read the same data from Kafka\n", + "and use the data to learn about audience interests, conversion rates, and bounce\n", + "reasons. The real-time, streaming event data from the user interactions often\n", + "also needs to be correlated with actual purchases stored in a billing database.\n", + "\n", + "**Apache Kafka**\n", + "\n", + "[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\n", + "topic, an append-only distributed log of events where messages are buffered for\n", + "a certain amount of time. Although messages in Kafka are not deleted once they\n", + "are consumed, they are also not stored indefinitely. The message retention for\n", + "\n", + "Kafka can be configured per topic and defaults to 7 days. 
Expired messages will\n", + "be deleted eventually.\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to many other event busses or messaging systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming data pipelines**\n", + "\n", + "\n", + "In a data flow pipeline, Delta Live Tables and their dependencies can be declared\n", + "with a standard SQL Create Table As Select (CTAS) statement and the DLT\n", + "keyword “live.”\n", + "\n", + "When developing DLT with Python, the @dlt.table decorator is used to create a\n", + "Delta Live Table. To ensure the data quality in a pipeline, DLT uses [Expectations](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html)\n", + "which are simple SQL constraints clauses that define the pipeline’s behavior with\n", + "invalid records.\n", + "\n", + "Since streaming workloads often come with unpredictable data volumes,\n", + "Databricks employs [enhanced autoscaling](https://databricks.com/blog/2022/06/29/delta-live-tables-announces-new-capabilities-and-performance-optimizations.html) for data flow pipelines to minimize the\n", + "overall end-to-end latency while reducing cost by shutting down unnecessary\n", + "infrastructure.\n", + "\n", + "**Delta Live Tables** are fully recomputed, in the right order, exactly once for each\n", + "pipeline run.\n", + "\n", + "In contrast, **streaming Delta Live Tables** are stateful, incrementally computed\n", + "and only process data that has been added since the last pipeline run. If the\n", + "query which defines a streaming live tables changes, new data will be processed\n", + "based on the new query but existing data is not recomputed. Streaming live\n", + "tables always use a streaming source and only work over append-only streams,\n", + "such as Kafka, Kinesis, or Auto Loader. Streaming DLTs are based on top of Spark\n", + "Structured Streaming.\n", + "\n", + "\n", + "You can chain multiple streaming pipelines, for example, workloads with very\n", + "large data volume and low latency requirements.\n", + "\n", + "**Direct ingestion from streaming engines**\n", + "\n", + "Delta Live Tables written in Python can directly ingest data from an event bus like\n", + "Kafka using Spark Structured Streaming. You can set a short retention period for\n", + "the Kafka topic to avoid compliance issues, reduce costs and then benefit from\n", + "the cheap, elastic and governable storage that Delta provides.\n", + "\n", + "As a first step in the pipeline, we recommend ingesting the data as is to a Bronze\n", + "(raw) table and avoid complex transformations that could drop important data.\n", + "Like any Delta table the Bronze table will retain the history and allow it to perform\n", + "GDPR and other compliance tasks.\n", + "\n", + "Ingest streaming data from Apache Kafka\n", + "\n", + "\n", + "-----\n", + "\n", + "When writing DLT pipelines in Python, you use the @dlt.table annotation\n", + "to create a DLT table. There is no special attribute to mark streaming DLTs in\n", + "Python; simply use spark.readStream() to access the stream. 
Example code\n", + "for creating a DLT table with the name kafka_bronze that is consuming data\n", + "from a Kafka topic looks as follows:\n", + "\n", + "import dlt\n", + "from pyspark.sql.functions import - \n", + "from pyspark.sql.types import - \n", + "\n", + "TOPIC = \"tracker-events\"\n", + "KAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n", + "_# subscribe to TOPIC at KAFKA_BROKER_\n", + "raw_kafka_events = (spark.readStream\n", + ". format ( \"kafka\" )\n", + ".option( \"subscribe\" , TOPIC)\n", + ".option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n", + ".option( \"startingOffsets\" , \"earliest\" )\n", + ".load()\n", + ")\n", + "\n", + "**@dlt.table(table_properties={** **\"pipelines.reset.allowed\"** **:** **\"false\"** **})**\n", + "```\n", + " def kafka_bronze ():\n", + "\n", + "```\n", + "return raw_kafka_events\n", + "\n", + "pipelines.reset.allowed\n", + "\n", + "Note that event buses typically expire messages after a certain period of time,\n", + "whereas Delta is designed for infinite retention.\n", + "\n", + "This might lead to the effect that source data on Kafka has already been deleted\n", + "when running a full refresh for a DLT pipeline. In this case, not all historic data\n", + "could be backfilled from the messaging platform, and data would be missing in\n", + "DLT tables. To prevent dropping data, use the following DLT table property:\n", + "\n", + "\n", + "pipelines.reset.allowed=false\n", + "\n", + "Setting pipelines.reset.allowed to false prevents refreshes to the table but\n", + "does not prevent incremental writes to the tables or new data from flowing into\n", + "the table.\n", + "\n", + "**Checkpointing**\n", + "\n", + "If you are an experienced Spark Structured Streaming developer, you will notice\n", + "the absence of checkpointing in the above code. 
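For reference, here is the ingestion pattern above as a single, runnable sketch. The topic name and the KAFKA_SERVER configuration key are the placeholders from the example; the expectation decorator is an optional addition illustrating the data quality constraints mentioned earlier and is not part of the original snippet.

```
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

TOPIC = "tracker-events"                       # placeholder topic name
KAFKA_BROKER = spark.conf.get("KAFKA_SERVER")  # broker read from the pipeline configuration

# Subscribe to TOPIC at KAFKA_BROKER, starting from the earliest offset.
raw_kafka_events = (
    spark.readStream
        .format("kafka")
        .option("subscribe", TOPIC)
        .option("kafka.bootstrap.servers", KAFKA_BROKER)
        .option("startingOffsets", "earliest")
        .load()
)

# pipelines.reset.allowed=false protects the table from full refreshes, so data
# that has already expired on the Kafka side is not dropped from the Bronze table.
@dlt.table(table_properties={"pipelines.reset.allowed": "false"})
@dlt.expect_or_drop("value_not_null", "value IS NOT NULL")  # illustrative Expectation
def kafka_bronze():
    return raw_kafka_events
```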
In Spark Structured Streaming\n", + "checkpointing is required to persist progress information about what data has\n", + "been successfully processed and upon failure, this metadata is used to restart a\n", + "failed query exactly where it left off.\n", + "\n", + "Whereas checkpoints are necessary for failure recovery with exactly-once\n", + "guarantees in Spark Structured Streaming, DLT handles state automatically\n", + "without any manual configuration or explicit checkpointing required.\n", + "\n", + "**Mixing SQL and Python for a DLT pipeline**\n", + "\n", + "A DLT pipeline can consist of multiple notebooks but one DLT notebook is\n", + "required to be written entirely in either SQL or Python (unlike other Databricks\n", + "notebooks where you can have cells of different languages in a single notebook).\n", + "\n", + "Now, if your preference is SQL, you can code the data ingestion from Apache\n", + "Kafka in one notebook in Python and then implement the transformation logic of\n", + "your data pipelines in another notebook in SQL.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Schema mapping**\n", + "\n", + "When reading data from messaging platform, the data stream is opaque and a\n", + "schema has to be provided.\n", + "\n", + "The Python example below shows the schema definition of events from a fitness\n", + "tracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n", + "\n", + "event_schema = StructType([ \\\n", + "StructField( \"time\" , TimestampType(), True ) , \\\n", + "StructField( \"version\" , StringType(), True ), \\\n", + "StructField( \"model\" , StringType(), True ) , \\\n", + "StructField( \"heart_bpm\" , IntegerType(), True ), \\\n", + "StructField( \"kcal\" , IntegerType(), True ) \\\n", + "])\n", + "\n", + "_# temporary table, visible in pipeline but not in data browser,_\n", + "_# cannot be queried interactively_\n", + "**@dlt.table(comment=** **\"real schema for Kakfa payload\"** **,**\n", + "**temporary=** **True** **)**\n", + "```\n", + " def kafka_silver ():\n", + "\n", + "```\n", + "return (\n", + "_# kafka streams are (timestamp,value)_\n", + "_# value contains the kafka payload_\n", + "\n", + "dlt.read_stream( \"kafka_bronze\" )\n", + ".select(col( \"timestamp\" ),from_json(col( \"value\" )\n", + ".cast( \"string\" ), event_schema).alias( \"event\" ))\n", + ".select( \"timestamp\" , \"event.*\" )\n", + "\n", + "\n", + "**Benefits**\n", + "\n", + "Reading streaming data in DLT directly from a message broker minimizes the\n", + "architectural complexity and provides lower end-to-end latency since data is\n", + "directly streamed from the messaging broker and no intermediary step is involved.\n", + "\n", + "**Streaming ingest with cloud object store intermediary**\n", + "\n", + "For some specific use cases, you may want to offload data from Apache Kafka,\n", + "e.g., using a Kafka connector, and store your streaming data in a cloud object\n", + "intermediary. In a Databricks workspace, the cloud vendor-specific objectstore can then be mapped via the Databricks Files System (DBFS) as a cloudindependent folder. Once the data is offloaded, [Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) can\n", + "ingest the files.\n", + "\n", + "Auto Loader can ingest data with a single line of SQL code. 
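Before turning to Auto Loader, here is the schema-mapping example above as a complete sketch, with the imports it relies on spelled out (the fitness tracker fields and table names come from the example itself):

```
import dlt
from pyspark.sql.functions import col, from_json
from pyspark.sql.types import (StructType, StructField, TimestampType,
                               StringType, IntegerType)

# Schema of the fitness tracker events carried in the Kafka message value.
event_schema = StructType([
    StructField("time", TimestampType(), True),
    StructField("version", StringType(), True),
    StructField("model", StringType(), True),
    StructField("heart_bpm", IntegerType(), True),
    StructField("kcal", IntegerType(), True),
])

# Temporary table: visible in the pipeline but not in the data browser,
# and it cannot be queried interactively.
@dlt.table(comment="real schema for Kafka payload", temporary=True)
def kafka_silver():
    return (
        # Kafka streams are (timestamp, value); value contains the Kafka payload.
        dlt.read_stream("kafka_bronze")
            .select(col("timestamp"),
                    from_json(col("value").cast("string"), event_schema).alias("event"))
            .select("timestamp", "event.*")
    )
```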
The syntax to ingest\n", + "JSON files into a DLT table is shown below (it is wrapped across two lines for\n", + "readability).\n", + "\n", + "_-- INGEST with Auto Loader_\n", + "create or replace streaming live table raw\n", + "as select `*` FROM cloud_files(\"dbfs:/data/twitter\", \"json\")\n", + "\n", + "\n", + "-----\n", + "\n", + "Note that Auto Loader itself is a streaming data source and all newly arrived files\n", + "will be processed exactly once, hence the streaming keyword for the raw table\n", + "that indicates data is ingested incrementally to that table.\n", + "\n", + "Since offloading streaming data to a cloud object store introduces an additional\n", + "step in your system architecture it will also increase the end-to-end latency\n", + "and create additional storage costs. Keep in mind that the Kafka connector\n", + "writing event data to the cloud object store needs to be managed, increasing\n", + "operational complexity.\n", + "\n", + "Therefore Databricks recommends as a best practice to directly access event\n", + "bus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n", + "\n", + "**Other event buses or messaging systems**\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to other event buses or messaging systems. DLT supports any data\n", + "source that Databricks Runtime directly supports.\n", + "\n", + "**Amazon Kinesis**\n", + "In Kinesis, you write messages to a fully managed serverless stream. Same as\n", + "Kafka, Kinesis does not permanently store messages. The default message\n", + "retention in Kinesis is one day.\n", + "\n", + "When using Amazon Kinesis, replace format(“kafka”) with format(“kinesis”) in the\n", + "Python code for streaming ingestion above and add Amazon Kinesis-specific\n", + "settings with option(). For more information, check the section about Kinesis\n", + "Integration in the Spark Structured Streaming documentation.\n", + "\n", + "\n", + "**Azure Event Hubs**\n", + "\n", + "For Azure Event Hubs settings, check the official [documentation at Microsoft](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-kafka-spark-tutorial) and\n", + "the article [Delta Live Tables recipes: Consuming from Azure Event Hubs](https://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html) .\n", + "\n", + "**Summary**\n", + "\n", + "DLT is much more than just the “T” in ETL. With DLT, you can easily ingest from\n", + "streaming and batch sources, cleanse and transform data on the Databricks\n", + "Lakehouse Platform on any cloud with guaranteed data quality.\n", + "\n", + "Data from Apache Kafka can be ingested by directly connecting to a Kafka broker\n", + "from a DLT notebook in Python. Data loss can be prevented for a full pipeline\n", + "refresh even when the source data in the Kafka streaming layer expired.\n", + "\n", + "**Get started**\n", + "\n", + "If you are a Databricks customer, simply follow the [guide to get started](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) . Read the\n", + "release notes to learn more about what’s included in this GA release. 
If you are\n", + "not an existing Databricks customer, [sign up for a free trial](https://www.databricks.com/try-databricks) , and you can view our\n", + "detailed [DLT pricing here](https://www.databricks.com/product/pricing) .\n", + "\n", + "Join the conversation in the [Databricks Community](https://community.databricks.com/s/topic/0TO8Y000000VJEhWAO/summit22) where data-obsessed peers\n", + "are chatting about Data + AI Summit 2022 announcements and updates. Learn.\n", + "Network.\n", + "\n", + "Last but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\n", + "summit. In that session, I walk you through the code of another streaming data\n", + "example with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\n", + "Hugging Face sentiment analysis.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.4 \u0007\n", + "\n", + "**Streaming in Production: Collected Best Practices**\n", + "\n", + "by **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "December 12, 2022\n", + "\n", + "\n", + "Releasing any data pipeline or application into a production state requires\n", + "planning, testing, monitoring, and maintenance. Streaming pipelines are no\n", + "different in this regard; in this blog we present some of the most important\n", + "considerations for deploying streaming pipelines and applications to a\n", + "production environment.\n", + "\n", + "At Databricks, we offer two different ways of building and running streaming\n", + "pipelines and applications — [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) and [Databricks Workflows](https://www.databricks.com/product/workflows) .\n", + "DLT is our flagship, fully managed ETL product that supports both batch and\n", + "streaming pipelines. It offers declarative development, automated operations,\n", + "data quality, advanced observability capabilities, and more. Workflows enable\n", + "customers to run Apache Spark™ workloads in Databricks’ optimized runtime\n", + "environment (i.e., Photon) with access to unified governance (Unity Catalog) and\n", + "storage (Delta Lake). Regarding streaming workloads, both DLT and Workflows\n", + "\n", + "share the same core streaming engine — Spark Structured Streaming. In the\n", + "case of DLT, customers program against the DLT API and DLT uses the Structured\n", + "Streaming engine under the hood. In the case of Jobs, customers program\n", + "against the Spark API directly.\n", + "\n", + "\n", + "The recommendations in this blog post are written from the Structured\n", + "Streaming engine perspective, most of which apply to both DLT and Workflows\n", + "(although DLT does take care of some of these automatically, like Triggers and\n", + "Checkpoints). We group the recommendations under the headings “Before\n", + "Deployment” and “After Deployment” to highlight when these concepts will\n", + "need to be applied and are releasing this blog series with this split between\n", + "the two. There will be additional deep-dive content for some of the sections\n", + "beyond as well. We recommend reading all sections before beginning work\n", + "to productionalize a streaming pipeline or application, and revisiting these\n", + "recommendations as you promote it from dev to QA and eventually production.\n", + "\n", + "**Before deployment**\n", + "\n", + "There are many things you need to consider when creating your streaming\n", + "application to improve the production experience. 
Some of these topics, like\n", + "unit testing, checkpoints, triggers, and state management, will determine how\n", + "your streaming application performs. Others, like naming conventions and how\n", + "many streams to run on which clusters, have more to do with managing multiple\n", + "streaming applications in the same environment.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unit testing**\n", + "\n", + "\n", + "The cost associated with finding and fixing a bug goes up exponentially\n", + "the farther along you get in the SDLC process, and a Structured Streaming\n", + "application is no different. When you’re turning that prototype into a hardened\n", + "production pipeline you need a CI/CD process with built-in tests. So how do you\n", + "create those tests?\n", + "\n", + "At first you might think that unit testing a streaming pipeline requires something\n", + "special, but that isn’t the case. The general guidance for streaming pipelines is\n", + "no different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . It starts by\n", + "organizing your code so that it can be unit tested effectively:\n", + "\n", + "**•** Divide your code into testable chunks\n", + "\n", + "**•** Organize your business logic into functions calling other functions.\n", + "If you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n", + "[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\n", + "multiple functions that can be individually tested.\n", + "\n", + "**•** Do not code in dependencies on the global state or external systems\n", + "\n", + "**•** Any function manipulating a DataFrame or data set should be organized\n", + "to take the DataFrame/data set/configuration as input and output the\n", + "DataFrame/data set\n", + "\n", + "Once your code is separated out in a logical manner you can implement unit\n", + "tests for each of your functions. Spark-agnostic functions can be tested like any\n", + "other function in that language. For testing UDFs and functions with DataFrames\n", + "and data sets, there are multiple Spark testing frameworks available. These\n", + "\n", + "\n", + "frameworks support all of the DataFrame/data set APIs so that you can easily\n", + "create input, and they have specialized assertions that allow you to compare\n", + "DataFrame content and schemas. Some examples are:\n", + "\n", + "**•** The built-in Spark test suite, designed to test all parts of Spark\n", + "\n", + "**•** spark-testing-base, which has support for both Scala and Python\n", + "\n", + "**•** spark-fast-tests, for testing Scala Spark 2 & 3\n", + "\n", + "**•** chispa, a Python version of spark-fast-tests\n", + "\n", + "Code examples for each of these libraries can be found [here](https://github.com/alexott/spark-playground/tree/master/testing) .\n", + "\n", + "But wait! I’m testing a streaming application here — don’t I need to make\n", + "streaming DataFrames for my unit tests? The answer is no; you do not! Even\n", + "though a streaming DataFrame represents a data set with no defined ending,\n", + "when functions are executed on it they are executed on a microbatch — a\n", + "discrete set of data. You can use the same unit tests that you would use for a\n", + "batch application, for both stateless and stateful streams. 
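To make this concrete, a minimal sketch of such a test using pytest and chispa might look like the following. The transformation function, column names, and fixture are hypothetical; the point is that the function takes a DataFrame in and returns a DataFrame out, so the same test covers it whether that DataFrame comes from a batch read or from a streaming microbatch.

```
import pytest
from pyspark.sql import SparkSession, DataFrame
from pyspark.sql import functions as F
from chispa.dataframe_comparer import assert_df_equality


def add_event_day(df: DataFrame) -> DataFrame:
    """Business logic under test: derive the event day from the event timestamp."""
    return df.withColumn("event_day", F.to_date(F.col("event_time")))


@pytest.fixture(scope="session")
def spark():
    return SparkSession.builder.master("local[1]").appName("unit-tests").getOrCreate()


def test_add_event_day(spark):
    input_df = (spark.createDataFrame([("2022-12-12 10:15:00",)], ["event_time"])
                     .withColumn("event_time", F.to_timestamp("event_time")))

    expected_df = (spark.createDataFrame([("2022-12-12 10:15:00", "2022-12-12")],
                                         ["event_time", "event_day"])
                        .withColumn("event_time", F.to_timestamp("event_time"))
                        .withColumn("event_day", F.to_date("event_day")))

    assert_df_equality(add_event_day(input_df), expected_df)
```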
One of the advantages\n", + "of Structured Streaming over other frameworks is the ability to use the same\n", + "transformation code for both streaming and with other batch operations for\n", + "the same sink. This allows you to simplify some operations, like backfilling\n", + "data, for example, where rather than trying to sync the logic between two\n", + "different applications, you can just modify the input sources and write to the\n", + "same destination. If the sink is a Delta table, you can even do these operations\n", + "concurrently if both processes are append-only operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Triggers**\n", + "\n", + "\n", + "process a microbatch in order to maximize resource utilization, but setting the\n", + "interval longer would make sense if your stream is running on a shared cluster\n", + "and you don’t want it to constantly take the cluster resources.\n", + "\n", + "If you do not need your stream to run continuously, either because data doesn’t\n", + "come that often or your SLA is 10 minutes or greater, then you can use the\n", + "Trigger.Once option. This option will start up the stream, check for anything new\n", + "since the last time it ran, process it all in one big batch, and then shut down.\n", + "Just like with a continuously running stream when using Trigger.Once, the\n", + "checkpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.\n", + "\n", + "Spark has a new version of Trigger.Once called Trigger.AvailableNow. While\n", + "Trigger.Once will process everything in one big batch, which depending on your\n", + "data size may not be ideal, Trigger.AvailableNow will split up the data based on\n", + "maxFilesPerTrigger and maxBytesPerTrigger settings. This allows the data to be\n", + "processed in multiple batches. Those settings are ignored with Trigger.Once.\n", + "You can see examples for setting triggers [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) .\n", + "\n", + "**Pop quiz —** how do you turn your streaming process into a batch process\n", + "that automatically keeps track of where it left off with just one line of code?\n", + "\n", + "**Answer —** change your processing time trigger to Trigger.Once/Trigger.\n", + "AvailableNow! Exact same code, running on a schedule, that will neither miss nor\n", + "reprocess any records.\n", + "\n", + "\n", + "Now that you know your code works, you need to determine how often your\n", + "stream will look for new data. This is where [triggers](https://docs.databricks.com/structured-streaming/triggers.html) come in. Setting a trigger is\n", + "one of the options for the writeStream command, and it looks like this:\n", + "\n", + "_// Scala/Java_\n", + ".trigger(Trigger.ProcessingTime( \"30 seconds\" ))\n", + "\n", + "_# Python_\n", + ".trigger(processingTime= '30 seconds' )\n", + "\n", + "In the above example, if a microbatch completes in less than 30 seconds,\n", + "then the engine will wait for the rest of the time before kicking off the next\n", + "microbatch. If a microbatch takes longer than 30 seconds to complete, then the\n", + "engine will start the next microbatch immediately after the previous one finishes.\n", + "\n", + "The two factors you should consider when setting your trigger interval are how\n", + "long you expect your stream to process a microbatch and how often you want\n", + "the system to check for new data. 
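To make these trigger options concrete, a writeStream definition would pick exactly one of the variants below (streaming_df, the sink path, and the checkpoint location are placeholders; availableNow requires Spark 3.3 or a recent Databricks Runtime):

```
# Continuously running stream: kick off a new microbatch at most every 30 seconds.
(streaming_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/my_stream")  # placeholder path
    .trigger(processingTime="30 seconds")
    .start("/tmp/tables/my_table"))                              # placeholder path

# Scheduled "batch" run: process everything new since the last run, split into
# multiple microbatches via maxFilesPerTrigger/maxBytesPerTrigger, then shut down.
(streaming_df.writeStream
    .format("delta")
    .option("checkpointLocation", "/tmp/checkpoints/my_stream")
    .trigger(availableNow=True)
    .start("/tmp/tables/my_table"))

# Trigger.Once behaves the same way but processes everything in one big batch:
# .trigger(once=True)
```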
You can lower the overall processing latency\n", + "by using a shorter trigger interval and increasing the resources available for\n", + "the streaming query by adding more workers or using compute or memory\n", + "optimized instances tailored to your application’s performance. These increased\n", + "resources come with increased costs, so if your goal is to minimize costs, then a\n", + "longer trigger interval with less compute can work. Normally you would not set a\n", + "trigger interval longer than what it would typically take for your stream to\n", + "\n", + "\n", + "-----\n", + "\n", + "**Name your stream**\n", + "\n", + "\n", + "You name your children, you name your pets, now it’s time to name your streams.\n", + "There’s a writeStream option called .queryName that allows you to provide a\n", + "friendly name for your stream. Why bother? Well, suppose you don’t name it. In\n", + "that case, all you’ll have to go on in the Structured Streaming tab in the Spark UI\n", + "is the string and the unintelligible guid that is automatically generated\n", + "as the stream’s unique identifier. If you have more than one stream running on a\n", + "cluster, and all of them have and unintelligible strings as identifiers,\n", + "how do you find the one you want? If you’re exporting metrics how do you tell\n", + "which is which?\n", + "\n", + "Make it easy on yourself, and name your streams. When you’re managing them in\n", + "production you’ll be glad you did, and while you’re at it, go and name your batch\n", + "queries in any foreachBatch() code you have.\n", + "\n", + "**Fault tolerance**\n", + "\n", + "How does your stream recover from being shut down? There are a few different\n", + "cases where this can come into play, like cluster node failures or intentional\n", + "halts, but the solution is to set up checkpointing. Checkpoints with write-ahead\n", + "logs provide a degree of protection from your streaming application being\n", + "interrupted, ensuring it will be able to pick up again where it last left off.\n", + "\n", + "Checkpoints store the current offsets and state values (e.g., aggregate values) for\n", + "your stream. Checkpoints are stream specific so each should be set to its own\n", + "location. Doing this will let you recover more gracefully from shutdowns, failures\n", + "from your application code or unexpected cloud provider failures or limitations.\n", + "\n", + "\n", + "To configure checkpoints, add the checkpointLocation option to your stream\n", + "definition:\n", + "\n", + "_// Scala/Java/Python_\n", + "streamingDataFrame.writeStream\n", + ".format( \"delta\" )\n", + ".option( \"path\" , \"\" )\n", + ".queryName( \"TestStream\" )\n", + ".option( \"checkpointLocation\" , \"\" )\n", + ".start()\n", + "\n", + "To keep it simple — every time you call .writeStream, you must specify the\n", + "checkpoint option with a unique checkpoint location. Even if you’re using\n", + "foreachBatch and the writeStream itself doesn’t specify a path or table option,\n", + "you must still specify that checkpoint. It’s how Spark Structured Streaming gives\n", + "you hassle-free fault tolerance.\n", + "\n", + "Efforts to manage the checkpointing in your stream should be of little concern\n", + "in general. 
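Putting the naming and checkpointing advice together: even a foreachBatch sink that writes no path of its own still gets a friendly query name and its own checkpoint location. A minimal sketch, where streaming_df, the batch function, and all paths are placeholders:

```
def upsert_to_silver(microbatch_df, batch_id):
    # Placeholder batch logic; in practice this is often a MERGE into a Delta table.
    (microbatch_df.write
        .format("delta")
        .mode("append")
        .save("/tmp/tables/silver"))                                 # placeholder path

(streaming_df.writeStream
    .queryName("SilverUpsertStream")                                 # friendly name in the Spark UI
    .option("checkpointLocation", "/tmp/checkpoints/silver_upsert")  # unique location per stream
    .foreachBatch(upsert_to_silver)
    .start())
```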
As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454) , “The simplest way to perform streaming\n", + "analytics is not having to reason about streaming at all.” That said, one setting\n", + "\n", + "deserves mention as questions around the maintenance of checkpoint files\n", + "come up occasionally. Though it is an internal setting that doesn’t require direct\n", + "configuration, the setting spark.sql.streaming.minBatchesToRetain (default 100)\n", + "controls the number of checkpoint files that get created. Basically, the number\n", + "of files will be roughly this number times two, as there is a file created noting the\n", + "offsets at the beginning of the batch (offsets, a.k.a write ahead logs) and another\n", + "on completing the batch (commits). The number of files is checked periodically\n", + "for cleanup as part of the internal processes. This simplifies at least one aspect\n", + "of long-term streaming application maintenance for you.\n", + "\n", + "\n", + "-----\n", + "\n", + "It is also important to note that some changes to your application code can\n", + "invalidate the checkpoint. Checking for any of these changes during code\n", + "reviews before deployment is recommended. You can find examples of changes\n", + "where this can happen in [Recovery Semantics after Changes in a Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query)\n", + "[Query](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query) . Suppose you want to look at checkpointing in more detail or consider\n", + "whether asynchronous checkpointing might improve the latency in your\n", + "streaming application. In that case, these are covered in greater depth in\n", + "[Speed Up Streaming Queries With Asynchronous State Checkpointing](https://www.databricks.com/blog/2022/05/02/speed-up-streaming-queries-with-asynchronous-state-checkpointing.html) .\n", + "\n", + "**State management and RocksDB**\n", + "\n", + "Stateful streaming applications are those where current records may depend\n", + "on previous events, so Spark has to retain data in between microbatches.\n", + "The data it retains is called state, and Spark will store it in a state store and\n", + "read, update and delete it during each microbatch. Typical stateful operations\n", + "are streaming aggregations, streaming dropDuplicates, stream-stream joins,\n", + "mapGroupsWithState, or flatMapGroupsWithState. Some common types of\n", + "examples where you’ll need to think about your application state could be\n", + "sessionization or hourly aggregation using group by methods to calculate\n", + "\n", + "business metrics. Each record in the state store is identified by a key that is used\n", + "as part of the stateful computation, and the more unique keys that are required\n", + "the larger the amount of state data that will be stored.\n", + "\n", + "When the amount of state data needed to enable these stateful operations\n", + "grows large and complex, it can degrade your workloads’ performance, leading\n", + "to increased latency or even failures. A typical indicator of the state store being\n", + "\n", + "\n", + "the culprit of added latency is large amounts of time spent in garbage collection\n", + "(GC) pauses in the JVM. 
If you are monitoring the microbatch processing time,\n", + "this could look like a continual increase or wildly varying processing time across\n", + "microbatches.\n", + "\n", + "The default configuration for a state store, which is sufficient for most general\n", + "streaming workloads, is to store the state data in the executors’ JVM memory.\n", + "Large number of keys (typically millions, see the Monitoring & Instrumentation\n", + "section in part 2 of this blog) can add excessive memory pressure on the\n", + "machine memory and increase the frequency of hitting these GC pauses as it\n", + "tries to free up resources.\n", + "\n", + "On the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\n", + "use [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\n", + "memory pressure. RocksDB is an embeddable persistent key-value store for fast\n", + "storage. It features high performance through a log-structured database engine\n", + "written entirely in C++ and optimized for fast, low-latency storage.\n", + "\n", + "Leveraging RocksDB as the state store provider still uses machine memory\n", + "but no longer occupies space in the JVM and makes for a more efficient\n", + "state management system for large amounts of keys. This doesn’t come for\n", + "free, however, as it introduces an extra step in processing every microbatch.\n", + "Introducing RocksDB shouldn’t be expected to reduce latency except when it is\n", + "related to memory pressure from state data storage in the JVM. The RocksDBbacked state store still provides the same degree of fault tolerance as the\n", + "regular state storage as it is included in the stream checkpointing.\n", + "\n", + "\n", + "-----\n", + "\n", + "RocksDB configuration, like checkpoint configuration, is minimal by design and so\n", + "you only need to declare it in your overall Spark configuration:\n", + "\n", + "spark.conf. set (\n", + "\"spark.sql.streaming.stateStore.providerClass\" ,\n", + "\"com.databricks.sql.streaming.state.RocksDBStateStoreProvider\" )\n", + "\n", + "If you are monitoring your stream using the streamingQueryListener class, then\n", + "you will also notice that RocksDB metrics will be included in the stateOperators\n", + "field. For more detailed information on this see the [RocksDB State Store Metrics](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics)\n", + "[section](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics) of “Structured Streaming in Production.”\n", + "\n", + "It’s worth noting that large numbers of keys can have other adverse impacts in\n", + "addition to raising memory consumption, especially with unbounded or nonexpiring state keys. With or without RocksDB, the state from the application\n", + "also gets backed up in checkpoints for fault tolerance. So it makes sense that\n", + "if you have state files being created so that they will not expire, you will keep\n", + "accumulating files in the checkpoint, increasing the amount of storage required\n", + "and potentially the time to write it or recover from failures as well. 
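As a sketch of the streamingQueryListener hook mentioned above: a listener receives a progress event for every microbatch, and with the RocksDB provider enabled the state store metrics appear under the stateOperators field of that report. The Python listener API shown here exists in recent PySpark releases (on older runtimes the listener is registered from Scala), so treat the class and method names as version-dependent.

```
from pyspark.sql.streaming import StreamingQueryListener


class ProgressLogger(StreamingQueryListener):
    """Minimal listener that prints each microbatch's progress report."""

    def onQueryStarted(self, event):
        print(f"Query started: {event.name} ({event.id})")

    def onQueryProgress(self, event):
        # The progress report includes stateOperators, where RocksDB metrics
        # show up once the RocksDB state store provider is enabled.
        print(event.progress.prettyJson)

    def onQueryTerminated(self, event):
        print(f"Query terminated: {event.id}")


spark.streams.addListener(ProgressLogger())
```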
For the data\n", + "in memory (see the Monitoring & Instrumentation section in part 2 of this blog)\n", + "\n", + "this situation can lead to somewhat vague out-of-memory errors, and for the\n", + "checkpointed data written to cloud storage you might observe unexpected\n", + "and unreasonable growth. Unless you have a business need to retain streaming\n", + "state for all the data that has been processed (and that is rare), read the [Spark](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)\n", + "[Structured Streaming documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) and make sure to implement your stateful\n", + "operations so that the system can drop state records that are no longer needed\n", + "(pay close attention to dropDuplicates and stream-stream joins).\n", + "\n", + "\n", + "**Running multiple streams on a cluster**\n", + "\n", + "Once your streams are fully tested and configured, it’s time to figure out how to\n", + "organize them in production. It’s a common pattern to stack multiple streams on\n", + "the same Spark cluster to maximize resource utilization and save cost. This is fine\n", + "to a point, but there are limits to how much you can add to one cluster before\n", + "performance is affected. The driver has to manage all of the streams running on\n", + "the cluster, and all streams will compete for the same cores across the workers.\n", + "You need to understand what your streams are doing and plan your capacity\n", + "appropriately to stack effectively.\n", + "\n", + "Here is what you should take into account when you’re planning on stacking\n", + "multiple streams on the same cluster:\n", + "\n", + "**•** Make sure your driver is big enough to manage all of your streams. Is your\n", + "driver struggling with a high CPU utilization and garbage collection? That\n", + "means it’s struggling to manage all of your streams. Either reduce the\n", + "number of streams or increase the size of your driver.\n", + "\n", + "**•** Consider the amount of data each stream is processing. The more data\n", + "you are ingesting and writing to a sink, the more cores you will need in\n", + "order to maximize your throughput for each stream. You’ll need to reduce\n", + "the number of streams or increase the number of workers depending on\n", + "how much data is being processed. For sources like Kafka you will need to\n", + "configure how many cores are being used to ingest with the minPartitions\n", + "option if you don’t have enough cores for all of the partitions across all of\n", + "your streams.\n", + "\n", + "\n", + "-----\n", + "\n", + "**•** Consider the complexity and data volume of your streams. If all of the\n", + "streams are doing minimal manipulation and just appending to a sink, then\n", + "each stream will need fewer resources per microbatch and you’ll be able to\n", + "stack more. If the streams are doing stateful processing or computation/\n", + "memory-intensive operations, that will require more resources for good\n", + "performance and you’ll want to stack fewer streams.\n", + "\n", + "**•** Consider [scheduler pools](https://spark.apache.org/docs/latest/job-scheduling.html#fair-scheduler-pools) . When stacking streams they will all be\n", + "contending for the same workers and cores, and one stream that needs a\n", + "lot of cores will cause the other streams to wait. 
Scheduler pools enable\n", + "you to have different streams execute on different parts of the cluster.\n", + "This will enable streams to execute in parallel with a subset of the available\n", + "resources.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "Some of the ideas we’ve addressed here certainly deserve their own time\n", + "and special treatment with a more in-depth discussion, which you can look\n", + "forward to in later deep dives. However, we hope these recommendations are\n", + "useful as you begin your journey or seek to enhance your production streaming\n", + "experience. Be sure to continue with the next post, “Streaming in Production:\n", + "Collected Best Practices, Part 2.”\n", + "\n", + "**[Review Databrick’s Structured Streaming Getting Started Guide](https://www.databricks.com/spark/getting-started-with-apache-spark/streaming)**\n", + "\n", + "\n", + "\n", + "**•** Consider your SLA. If you have mission critical streams, isolate them as a\n", + "best practice so lower-criticality streams do not affect them.\n", + "\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "On Databricks we typically see customers stack between 10-30 streams on a\n", + "cluster, but this varies depending on the use case. Consider the factors above so\n", + "that you can have a good experience with performance, cost and maintainability.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.5 \u0007\n", + "\n", + "**Streaming in Production: Collected Best Practices, Part 2**\n", + "\n", + "by **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n", + "\n", + "January 10, 2023\n", + "\n", + "\n", + "In our two-part blog series titled “Streaming in Production: Collected Best\n", + "Practices,” this is the second article. Here we discuss the “After Deployment”\n", + "considerations for a Structured Streaming Pipeline. The majority of the\n", + "suggestions in this post are relevant to both Structured Streaming Jobs and\n", + "Delta Live Tables (our flagship and fully managed ETL product that supports\n", + "both batch and streaming pipelines).\n", + "\n", + "**After deployment**\n", + "\n", + "After the deployment of your streaming application, there are typically three\n", + "main things you’ll want to know:\n", + "\n", + "**•** How is my application running?\n", + "\n", + "**•** Are resources being used efficiently?\n", + "\n", + "**•** How do I manage any problems that come up?\n", + "\n", + "We’ll start with an introduction to these topics, followed by a deeper dive later in\n", + "this blog series.\n", + "\n", + "\n", + "**Monitoring and instrumentation (How is my application running?)**\n", + "\n", + "Streaming workloads should be pretty much hands-off once deployed to\n", + "production. 
However, one thing that may sometimes come to mind is: “how is my\n", + "application running?” Monitoring applications can take on different levels and\n", + "forms depending on:\n", + "\n", + "**•** the metrics collected for your application (batch duration/latency,\n", + "throughput, …)\n", + "\n", + "**•** where you want to monitor the application from\n", + "\n", + "At the simplest level, there is a streaming dashboard ( [A Look at the New](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html)\n", + "[Structured Streaming UI](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) ) and built-in logging directly in the Spark UI that can be\n", + "used in a variety of situations.\n", + "\n", + "This is in addition to setting up failure alerts on jobs running streaming\n", + "workloads.\n", + "\n", + "If you want more fine-grained metrics or to create custom actions based on\n", + "these metrics as part of your code base, then the StreamingQueryListener is\n", + "better aligned with what you’re looking for.\n", + "\n", + "\n", + "-----\n", + "\n", + "If you want the Spark metrics to be reported (including machine level traces for\n", + "drivers or workers) you should use the platform’s [metrics sink](https://spark.apache.org/docs/latest/monitoring.html#metrics) .\n", + "\n", + "The Apache Spark Structured Streaming UI\n", + "\n", + "\n", + "Another point to consider is where you want to surface these metrics for\n", + "observability. There is a Ganglia dashboard at the cluster level, integrated partner\n", + "applications like [Datadog](https://www.datadoghq.com/blog/databricks-monitoring-datadog/) for monitoring streaming workloads, or even more open\n", + "source options you can build using tools like Prometheus and Grafana. Each\n", + "has advantages and disadvantages to consider around cost, performance, and\n", + "maintenance requirements.\n", + "\n", + "Whether you have low volumes of streaming workloads where interactions in the\n", + "UI are sufficient or have decided to invest in a more robust monitoring platform,\n", + "you should know how to observe your production streaming workloads. Further\n", + "“Monitoring and Alerting” posts later in this series will contain a more thorough\n", + "discussion. In particular, we’ll see different measures on which to monitor\n", + "streaming applications and then later take a deeper look at some of the tools\n", + "you can leverage for observability.\n", + "\n", + "**Application optimization (Are resources being used effectively?**\n", + "\n", + "**Think “cost”)**\n", + "\n", + "The next concern we have after deploying to production is “is my application\n", + "\n", + "using resources effectively?” As developers, we understand (or quickly learn) the\n", + "distinction between working code and well-written code. Improving the way your\n", + "code runs is usually very satisfying, but what ultimately matters is the overall\n", + "cost of running it. Cost considerations for Structured Streaming applications will\n", + "be largely similar to those for other Spark applications. One notable difference\n", + "is that failing to optimize for production workloads can be extremely costly,\n", + "as these workloads are frequently “always-on” applications, and thus wasted\n", + "expenditure can quickly compound. 
Because assistance with cost optimization is\n", + "\n", + "\n", + "-----\n", + "\n", + "frequently requested, a separate post in this series will address it. The key points\n", + "that we’ll focus on will be efficiency of usage and sizing.\n", + "\n", + "Getting the cluster sizing right is one of the most significant differences between\n", + "efficiency and wastefulness in streaming applications. This can be particularly\n", + "tricky because in some cases it’s difficult to estimate the full load conditions of\n", + "the application in production before it’s actually there. In other cases, it may be\n", + "difficult due to natural variations in volume handled throughout the day, week, or\n", + "year. When first deploying, it can be beneficial to oversize slightly, incurring the\n", + "extra expense to avoid inducing performance bottlenecks. Utilize the monitoring\n", + "tools you chose to employ after the cluster has been running for a few weeks\n", + "to ensure proper cluster utilization. For example, are CPU and memory levels\n", + "being used at a high level during peak load or is the load generally small and the\n", + "cluster may be downsized? Maintain regular monitoring of this and keep an eye\n", + "out for changes in data volume over time; if either occurs, a cluster resize may be\n", + "required to maintain cost-effective operation.\n", + "\n", + "As a general guideline, you should avoid excessive shuffle operations, joins, or an\n", + "excessive or extreme watermark threshold (don’t exceed your needs), as each\n", + "can increase the number of resources you need to run your application. A large\n", + "watermark threshold will cause Structured Streaming to keep more data in the\n", + "state store between batches, leading to an increase in memory requirements\n", + "across the cluster. Also, pay attention to the type of VM configured — are you\n", + "using memory-optimized for your memory-intense stream? Compute-optimized\n", + "for your computationally-intensive stream? If not, look at the utilization levels\n", + "for each and consider trying a machine type that could be a better fit. Newer\n", + "families of servers from cloud providers with more optimal CPUs often lead to\n", + "faster execution, meaning you might need fewer of them to meet your SLA.\n", + "\n", + "\n", + "**Troubleshooting (How do I manage any problems that come up?)**\n", + "\n", + "The last question we ask ourselves after deployment is “how do I manage any\n", + "problems that come up?” As with cost optimization, troubleshooting streaming\n", + "applications in Spark often looks the same as other applications since most of\n", + "the mechanics remain the same under the hood. For streaming applications,\n", + "issues usually fall into two categories — failure scenarios and latency scenarios\n", + "\n", + "**Failure scenarios**\n", + "\n", + "Failure scenarios typically manifest with the stream stopping with an error,\n", + "executors failing or a driver failure causing the whole cluster to fail. Common\n", + "causes for this are:\n", + "\n", + "**•** Too many streams running on the same cluster, causing the driver to be\n", + "overwhelmed. 
On Databricks, this can be seen in Ganglia, where the driver\n", + "node will show up as overloaded before the cluster fails.\n", + "\n", + "**•** Too few workers in a cluster or a worker size with too small of a core-tomemory ratio, causing executors to fail with an Out Of Memory error.\n", + "This can also be seen on Databricks in Ganglia before an executor fails,\n", + "or in the Spark UI under the executors tab.\n", + "\n", + "**•** Using a collect to send too much data to the driver, causing it to fail\n", + "with an Out Of Memory error.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Latency scenarios**\n", + "\n", + "For latency scenarios, your stream will not execute as fast as you want or expect.\n", + "A latency issue can be intermittent or constant. Too many streams or too small\n", + "of a cluster can be the cause of this as well. Some other common causes are:\n", + "\n", + "**•** Data skew — when a few tasks end up with much more data than the rest\n", + "of the tasks. With skewed data, these tasks take longer to execute than the\n", + "others, often spilling to disk. Your stream can only run as fast as its slowest\n", + "task.\n", + "\n", + "**•** Executing a stateful query without defining a watermark or defining a very\n", + "long one will cause your state to grow very large, slowing down your stream\n", + "over time and potentially leading to failure.\n", + "\n", + "**•** Poorly optimized sink. For example, performing a merge into an overpartitioned Delta table as part of your stream.\n", + "\n", + "**•** Stable but high latency (batch execution time). Depending on the cause,\n", + "adding more workers to increase the number of cores concurrently available\n", + "for Spark tasks can help. Increasing the number of input partitions and/or\n", + "decreasing the load per core through batch size settings can also reduce\n", + "the latency.\n", + "\n", + "Just like troubleshooting a batch job, you’ll use Ganglia to check cluster\n", + "utilization and the Spark UI to find performance bottlenecks. There is a\n", + "specific [Structured Streaming tab](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) in the Spark UI created to help monitor and\n", + "troubleshoot streaming applications. On that tab each stream that is running will\n", + "be listed, and you’ll see either your stream name if you named your stream or\n", + "\n", + "\n", + " if you didn’t. You’ll also see a stream ID that will be visible on the Jobs\n", + "tab of the Spark UI so that you can tell which jobs are for a given stream.\n", + "\n", + "You’ll notice above we said which jobs are for a given stream. It’s a common\n", + "misconception that if you were to look at a streaming application in the Spark\n", + "UI you would just see one job in the Jobs tab running continuously. Instead,\n", + "depending on your code, you will see one or more jobs that start and complete\n", + "for each microbatch. Each job will have the stream ID from the Structured\n", + "Streaming tab and a microbatch number in the description, so you’ll be able to\n", + "tell which jobs go with which stream. 
You can click into those jobs to find the\n", + "longest running stages and tasks, check for disk spills, and search by Job ID in\n", + "the SQL tab to find the slowest queries and check their explain plans.\n", + "\n", + "The Jobs tab in the Apache Spark UI\n", + "\n", + "\n", + "-----\n", + "\n", + "If you click on your stream in the Structured Streaming tab you’ll see how much\n", + "time the different streaming operations are taking for each microbatch, such as\n", + "adding a batch, query planning and committing (see earlier screenshot of the\n", + "Apache Spark Structured Streaming UI). You can also see how many rows are\n", + "being processed as well as the size of your state store for a stateful stream.\n", + "This can give insights into where potential latency issues are.\n", + "\n", + "We will go more in-depth with troubleshooting later in this blog series, where\n", + "we’ll look at some of the causes and remedies for both failure scenarios and\n", + "latency scenarios as we outlined above.\n", + "\n", + "**Conclusion**\n", + "\n", + "You may have noticed that many of the topics covered here are very similar to\n", + "how other production Spark applications should be deployed. Whether your\n", + "workloads are primarily streaming applications or batch processes, the majority\n", + "of the same principles will apply. We focused more on things that become\n", + "especially important when building out streaming applications, but as we’re\n", + "\n", + "\n", + "sure you’ve noticed by now, the topics we discussed should be included in\n", + "most production deployments.\n", + "\n", + "Across the majority of industries in the world today information is needed\n", + "faster than ever, but that won’t be a problem for you. With Spark Structured\n", + "Streaming you’re set to make it happen at scale in production. Be on the lookout\n", + "for more in-depth discussions on some of the topics we’ve covered in this blog,\n", + "and in the meantime keep streaming!\n", + "\n", + "**[Review Databricks Structured Streaming in](https://docs.databricks.com/structured-streaming/production.html)**\n", + "**[Production Documentation](https://docs.databricks.com/structured-streaming/production.html)**\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.6 \u0007\n", + "\n", + "**Building Geospatial Data Products**\n", + "\n", + "by **M I L O S C O L I C**\n", + "\n", + "January 6, 2023\n", + "\n", + "\n", + "Geospatial data has been driving innovation for centuries, through use of\n", + "maps, cartography and more recently through digital content. For example,\n", + "the oldest map has been found etched in a piece of mammoth tusk and dates\n", + "[approximately 25,000 BC](https://en.wikipedia.org/wiki/History_of_cartography) . This makes geospatial data one of the oldest data\n", + "sources used by society to make decisions. A more recent example, labeled\n", + "as the birth of spatial analysis, is that of Charles Picquet in 1832 who used\n", + "geospatial data to analyze [Cholera outbreaks in Paris](https://gallica.bnf.fr/ark:/12148/bpt6k842918.image) ; a couple of decades\n", + "later John Snow in 1854 followed the same approach for [Cholera outbreaks in](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak)\n", + "[London](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak) . 
These two individuals used geospatial data to solve one of the toughest\n", + "problems of their times and in effect save countless lives. Fast-forwarding to the\n", + "20th century, the concept of [Geographic Information Systems (GIS)](https://education.nationalgeographic.org/resource/geographic-information-system-gis) was [first](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf)\n", + "[introduced](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf) in 1967 in Ottawa, Canada, by the Department of Forestry and\n", + "Rural Development.\n", + "\n", + "Today we are in the midst of the cloud computing industry revolution —\n", + "supercomputing scale available to any organization, virtually infinitely scalable\n", + "for both storage and compute. Concepts like [data mesh](https://www.databricks.com/blog/2022/10/19/building-data-mesh-based-databricks-lakehouse-part-2.html) and [data marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html)\n", + "are emerging within the data community to address questions like platform\n", + "federation and interoperability. How can we adopt these concepts to geospatial\n", + "data, spatial analysis and GIS systems? By adopting the concept of data\n", + "products and approaching the design of geospatial data as a product.\n", + "\n", + "\n", + "In this blog we will provide a point of view on how to design scalable geospatial\n", + "data products that are modern and robust. We will discuss how Databricks\n", + "Lakehouse Platform can be used to unlock the full potential of geospatial\n", + "products that are one of the most valuable assets in solving the toughest\n", + "problems of today and the future.\n", + "\n", + "**What is a data product? And how to design one?**\n", + "\n", + "The most broad and the most concise definition of a “data product” was coined\n", + "by DJ Patil (the first U.S. Chief Data Scientist) in _Data Jujitsu: The Art of Turning_\n", + "_Data into Product:_ “a product that facilitates an end goal through the use of\n", + "data.” The complexity of this definition (as admitted by Patil himself) is needed to\n", + "encapsulate the breadth of possible products, to include dashboards, reports, Excel\n", + "\n", + "spreadsheets, and even CSV extracts shared via emails. You might notice that the\n", + "examples provided deteriorate rapidly in quality, robustness and governance.\n", + "\n", + "What are the concepts that differentiate a successful product versus an\n", + "unsuccessful one? Is it the packaging? Is it the content? Is it the quality of the\n", + "content? Or is it only the product adoption in the market? Forbes defines the\n", + "10 must-haves of a successful product. A good framework to summarize this is\n", + "through the value pyramid.\n", + "\n", + "\n", + "-----\n", + "\n", + "Figure 1: Product value pyramid (source)\n", + "\n", + "The value pyramid provides a priority on each aspect of the product. Not every\n", + "value question we ask about the product carries the same amount of weight. 
If\n", + "the output is not useful none of the other aspects matter — the output isn’t really\n", + "a product but becomes more of a data pollutant to the pool of useful results.\n", + "Likewise, scalability only matters after simplicity and explainability are addressed.\n", + "\n", + "How does the value pyramid relate to the data products? Each data output, in\n", + "order to be a data product:\n", + "\n", + "**•** **Should have clear usefulness.** The amount of the data society is\n", + "generating is rivaled only by the amount of data pollutants we are\n", + "generating. These are outputs lacking clear value and use, much less a\n", + "strategy for what to do with them.\n", + "\n", + "\n", + "\n", + "**•** **Should be explainable.** With the emergence of AI/ML, explainability has\n", + "become even more important for data driven decision-making. Data\n", + "is as good as the metadata describing it. Think of it in terms of food —\n", + "taste does matter, but a more important factor is the nutritional value\n", + "of ingredients.\n", + "\n", + "**•** **Should be simple.** An example of product misuse is using a fork to eat\n", + "cereal instead of using a spoon. Furthermore, simplicity is essential but\n", + "not sufficient — beyond simplicity the products should be intuitive.\n", + "Whenever possible both intended and unintended uses of the data\n", + "should be obvious.\n", + "\n", + "**•** **Should be scalable.** Data is one of the few resources that grows with\n", + "use. The more data you process the more data you have. If both inputs\n", + "and outputs of the system are unbounded and ever-growing, then the\n", + "system has to be scalable in compute power, storage capacity and\n", + "compute expressive power. Cloud data platforms like Databricks are in\n", + "a unique position to answer for all of the three aspects.\n", + "\n", + "**•** **Should generate habits.** In the data domain we are not concerned\n", + "with customer retention as is the case for the retail products. However,\n", + "the value of habit generation is obvious if applied to best practices.\n", + "The systems and data outputs should exhibit the best practices and\n", + "promote them — it should be easier to use the data and the system in\n", + "the intended way than the opposite.\n", + "\n", + "The geospatial data should adhere to all the aforementioned aspects — any data\n", + "products should. On top of this tall order, geospatial data has some specific needs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Geospatial data standards**\n", + "\n", + "\n", + "\n", + "**•** **“Advocate the understanding and use of geospatial data standards**\n", + "**within other sectors of government.”** — Value pyramid applies to\n", + "the standards as well — concepts like ease of adherence (usefulness/\n", + "simplicity), purpose of the standard (explainability/usefulness), adoption\n", + "(habit generation) are critical for the value generation of a standard.\n", + "\n", + "A critical tool for achieving the data standards mission is the [FAIR](https://www.go-fair.org/fair-principles/) data\n", + "principles:\n", + "\n", + "**•** **Findable** — The first step in (re)using data is to find them. 
Metadata\n", + "and data should be easy to find for both humans and computers.\n", + "Machine-readable metadata are essential for automatic discovery of\n", + "data sets and services.\n", + "\n", + "**•** **Accessible** — Once the user finds the required data, she/he/they\n", + "need to know how they can be accessed, possibly including\n", + "authentication and authorization.\n", + "\n", + "**•** **Interoperable** — The data usually needs to be integrated with\n", + "other data. In addition, the data needs to interoperate with\n", + "applications or workflows for analysis, storage, and processing.\n", + "\n", + "**•** **Reusable** — The ultimate goal of FAIR is to optimize the reuse of data.\n", + "To achieve this, metadata and data should be well-described so that\n", + "they can be replicated and/or combined in different settings.\n", + "\n", + "\n", + "Geospatial data standards are used to ensure that geographic data is collected,\n", + "organized, and shared in a consistent and reliable way. These standards can\n", + "include guidelines for things like data formatting, coordinate systems, map\n", + "projections, and metadata. Adhering to standards makes it easier to share data\n", + "between different organizations, allowing for greater collaboration and broader\n", + "access to geographic information.\n", + "\n", + "The Geospatial Commision (UK government) has defined the UK Geospatial\n", + "Data Standards Register as a central repository for data standards to be applied\n", + "in the case of geospatial data. Furthermore, the mission of this registry is to:\n", + "\n", + "**•** **“Ensure UK geospatial data is more consistent and coherent and usable**\n", + "**across a wider range of systems.”** — These concepts are a callout for the\n", + "importance of explainability, usefulness and habit generation (possibly\n", + "other aspects of the value pyramid).\n", + "\n", + "**•** **“Empower the UK geospatial community to become more engaged with**\n", + "**the relevant standards and standards bodies.”** — Habit generation within\n", + "the community is as important as the robust and critical design on the\n", + "standard. If not adopted standards are useless.\n", + "\n", + "\n", + "-----\n", + "\n", + "We share the belief that the FAIR principles are crucial for the design of scalable\n", + "data products we can trust. To be fair, FAIR is based on common sense, so why\n", + "is it key to our considerations? _“What I see in FAIR is not new in itself, but what it_\n", + "_does well is to articulate, in an accessible way, the need for a holistic approach_\n", + "_to data improvement. This ease in communication is why FAIR is being used_\n", + "_increasingly widely as an umbrella for data improvement — and not just in the_\n", + "_geospatial community.”_ — [A FAIR wind sets our course for data improvement](https://geospatialcommission.blog.gov.uk/2022/03/02/a-fair-wind-sets-our-course-for-data-improvement/) .\n", + "\n", + "To further support this approach, the [Federal Geographic Data Committee](https://www.fgdc.gov/standards) has\n", + "developed the [National Spatial Data Infrastructure (NSDI) Strategic Plan](https://www.fgdc.gov/nsdi-plan/nsdi-strategic-plan-2021-2024.pdf) that\n", + "covers the years 2021-2024 and was approved in November 2020. 
The goals\n", + "of NSDI are in essence FAIR principles and convey the same message of designing\n", + "systems that promote the circular economy of data — data products that flow\n", + "between organizations following common standards and in each step through the\n", + "data supply chain unlock new value and new opportunities. The fact that these\n", + "principles are permeating different jurisdictions and are adopted across different\n", + "regulators is a testament to the robustness and soundness of the approach.\n", + "\n", + "\n", + "The FAIR concepts weave really well together with the data product design.\n", + "In fact FAIR is traversing the whole product value pyramid and forms a value\n", + "cycle. By adopting both the value pyramid and FAIR principles we design data\n", + "products with both internal and external outlook. This promotes data reuse\n", + "as opposed to data accumulation.\n", + "\n", + "Why do FAIR principles matter for geospatial data and geospatial data\n", + "\n", + "products? FAIR is transcendent to geospatial data, it is actually transcendent\n", + "to data, it is a simple yet coherent system of guiding principles for good design\n", + "— and that good design can be applied to anything including geospatial data\n", + "and geospatial systems.\n", + "\n", + "\n", + "Figure 2:\n", + "NDSI Strategic Goals\n", + "\n", + "\n", + "-----\n", + "\n", + "**Grid index systems**\n", + "\n", + "In traditional GIS solutions’ performance of spatial operations are usually\n", + "achieved by building tree structures ( [KD trees](https://en.wikipedia.org/wiki/K-d_tree) , [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces) , [Quad trees](https://en.wikipedia.org/wiki/Quadtree) , etc).\n", + "The issue with tree approaches is that they eventually break the scalability\n", + "principle — when the data is too big to be processed in order to build the tree\n", + "and the computation required to build the tree is too long and defeats the\n", + "purpose. This also negatively affects the accessibility of data; if we cannot\n", + "construct the tree we cannot access the complete data and in effect we cannot\n", + "reproduce the results. In this case, grid index systems provide a solution.\n", + "\n", + "\n", + "Grid index systems are built from the start with the scalability aspects of the\n", + "geospatial data in mind. Rather than building the trees, they define a series of\n", + "grids that cover the area of interest. In the case of [H3](https://h3geo.org/) (pioneered by Uber),\n", + "the grid covers the area of the Earth; in the case of local grid index systems\n", + "(e.g., [British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) ) they may only cover the specific area of interest.\n", + "These grids are composed of cells that have unique identifiers. There is a\n", + "mathematical relationship between location and the cell in the grid. This makes\n", + "the grid index systems very scalable and parallel in nature.\n", + "\n", + "\n", + "Figure 4: Grid Index Systems (H3, British National Grid)\n", + "\n", + "\n", + "-----\n", + "\n", + "Another important aspect of grid index systems is that they are open source,\n", + "allowing index values to be universally leveraged by data producers and\n", + "consumers alike. Data can be enriched with the grid index information at any\n", + "step of its journey through the data supply chain. 
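To make the deterministic location-to-cell relationship concrete, here is a minimal sketch assuming the open source h3-py package (v3 API); the coordinates, resolution and variable names are illustrative only, and the native Databricks H3 expressions mentioned below expose equivalent functionality in SQL.

```
# Minimal sketch of grid indexing with the open source h3-py (v3) package.
# Coordinates and resolution are illustrative placeholders.
import h3

# Index a point (latitude, longitude) into the H3 grid at resolution 9.
cell = h3.geo_to_h3(51.5074, -0.1278, 9)

# The cell id is a pure function of location and resolution, so independent
# producers indexing the same point always derive the same identifier.
assert cell == h3.geo_to_h3(51.5074, -0.1278, 9)

# Parent cells and neighborhoods are cheap, local computations, which is why
# grid-indexed joins and aggregations parallelize so well.
parent = h3.h3_to_parent(cell, 7)
neighbors = h3.k_ring(cell, 1)
```

Because the identifiers are deterministic and the libraries are freely available, the same enrichment can be reproduced at any point in the data supply chain.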
This makes the grid index\n", + "systems an example of community driven data standards. Community driven\n", + "data standards by nature do not require enforcement, which fully adheres\n", + "to the habit generation aspect of value pyramid and meaningfully addresses\n", + "interoperability and accessibility principles of FAIR.\n", + "\n", + "\n", + "Databricks has recently announced [native support for the H3 grid index system](https://www.databricks.com/blog/2022/09/14/announcing-built-h3-expressions-geospatial-processing-and-analytics.html)\n", + "following the same value proposition. Adopting common industry standards\n", + "driven by the community is the only way to properly drive habit generation and\n", + "interoperability. To strengthen this statement, organizations like [CARTO](https://carto.com/blog/hexagons-for-location-intelligence/) , [ESRI](https://www.esri.com/arcgis-blog/products/bus-analyst/analytics/using-uber-h3-hexagons-arcgis-business-analyst-pro/)\n", + "and [Google](https://opensource.googleblog.com/2017/12/announcing-s2-library-geometry-on-sphere.html) have been promoting the usage of grid index systems for scalable\n", + "GIS system design. In addition, Databricks Labs project [Mosaic](https://databrickslabs.github.io/mosaic/) supports the\n", + "[British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) as the standard grid index system that is widely used in\n", + "the UK government. Grid index systems are key for the scalability of geospatial\n", + "data processing and for properly designing solutions for complex problems\n", + "(e.g., figure 5 — flight holding patterns using H3).\n", + "\n", + "**Geospatial data diversity**\n", + "\n", + "Geospatial data standards spend a solid amount of effort regarding data\n", + "format standardization, and format for that matter is one of the most\n", + "important considerations when it comes to interoperability and reproducibility.\n", + "Furthermore, if the reading of your data is complex — how can we talk about\n", + "simplicity? Unfortunately geospatial data formats are typically complex, as\n", + "data can be produced in a number of formats including both open source\n", + "\n", + "and vendor-specific formats. Considering only vector data, we can expect\n", + "data to arrive in WKT, WKB, GeoJSON, web CSV, CSV, Shape File, GeoPackage,\n", + "and many others. On the other hand, if we are considering raster data we can\n", + "expect data to arrive in any number of formats such as GeoTiff, netCDF, GRIB, or\n", + "GeoDatabase; for a comprehensive list of formats please consult this [blog](https://gisgeography.com/gis-formats/) .\n", + "\n", + "\n", + "Figure 5: Example of using H3 to express flight holding patterns\n", + "\n", + "\n", + "-----\n", + "\n", + "Geospatial data domain is so diverse and has organically grown over the years\n", + "around the use cases it was addressing. Unification of such a diverse ecosystem\n", + "is a massive challenge. A recent effort by the Open Geospatial Consortium\n", + "(OGC) to standardize to [Apache Parquet](https://parquet.apache.org/) and its geospatial schema specification\n", + "[GeoParquet](https://geoparquet.org/) is a step in the right direction. Simplicity is one of the key aspects\n", + "of designing a good scalable and robust product — unification leads to simplicity\n", + "and addresses one of the main sources of friction in the ecosystem — the data\n", + "ingestion. 
Standardizing to GeoParquet brings a lot of value that addresses all of\n", + "the aspects of FAIR data and value pyramid.\n", + "\n", + "Figure 6: Geoparquet as a geospatial standard data format\n", + "\n", + "\n", + "Why introduce another format into an already complex ecosystem? GeoParquet\n", + "isn’t a new format — it is a schema specification for Apache Parquet format that\n", + "is already widely adopted and used by the industry and the community. Parquet\n", + "as the base format supports binary columns and allows for storage of arbitrary\n", + "data payload. At the same time the format supports structured data columns\n", + "that can store metadata together with the data payload. This makes it a choice\n", + "that promotes interoperability and reproducibility. Finally, [Delta Lake](https://delta.io/) format\n", + "has been built on top of parquet and brings [ACID](https://en.wikipedia.org/wiki/ACID) properties to the table. ACID\n", + "properties of a format are crucial for reproducibility and for trusted outputs. In\n", + "addition, Delta is the format used by scalable data sharing solution [Delta Sharing](https://www.databricks.com/product/delta-sharing) .\n", + "\n", + "Delta Sharing enables enterprise scale data sharing between any public cloud\n", + "using Databricks (DIY options for private cloud are available using open source\n", + "building blocks). Delta Sharing completely abstracts the need for custom built\n", + "Rest APIs for exposing data to other third parties. Any data asset stored in Delta\n", + "(using GeoParquet schema) automatically becomes a data product that can be\n", + "exposed to external parties in a controlled and governed manner. Delta Sharing\n", + "has been built from the ground up with [security best practices in mind](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html?utm_source=bambu&utm_medium=social&utm_campaign=advocacy&blaid=3352307) .\n", + "\n", + "\n", + "-----\n", + "\n", + "Figure 7: Delta Sharing simplifying data access in the ecosystem\n", + "\n", + "**Circular data economy**\n", + "\n", + "\n", + "Borrowing the concepts from the sustainability domain, we can define a circular\n", + "data economy as a system in which data is collected, shared, and used in a way\n", + "that maximizes its value while minimizing waste and negative impacts, such as\n", + "unnecessary compute time, untrustworthy insights, or biased actions based\n", + "data pollutants. Reusability is the key concept in this consideration — how can\n", + "we minimize the \"reinvention of the wheel.\" There are countless data assets out\n", + "in the wild that represent the same area, same concepts with just ever slight\n", + "alterations to better match a specific use case. Is this due to the actual\n", + "\n", + "\n", + "optimizations or due to the fact it was easier to create a new copy of the assets\n", + "than to reuse the existing ones? Or was it too hard to find the existing data\n", + "assets, or maybe it was too complex to define data access patterns.\n", + "\n", + "Data asset duplication has many negative aspects in both FAIR considerations\n", + "and data value pyramid considerations — having many disparate similar (but\n", + "different) data assets that represent the same area and same concepts can\n", + "deteriorate simplicity considerations of the data domain — it becomes hard\n", + "to identify the data asset we actually can trust. 
It can also have very negative\n", + "\n", + "\n", + "-----\n", + "\n", + "implications toward habit generation. Many niche communities will emerge\n", + "that will standardize to themselves ignoring the best practices of the wider\n", + "ecosystem, or worse yet they will not standardize at all.\n", + "\n", + "In a circular data economy, data is treated as a valuable resource that can be\n", + "used to create new products and services, as well as improving existing ones.\n", + "This approach encourages the reuse and recycling of data, rather than treating it\n", + "as a disposable commodity. Once again, we are using the sustainability analogy\n", + "in a literal sense — we argue that this is the correct way of approaching the\n", + "problem. Data pollutants are a real challenge for organizations both internally and\n", + "externally. An article by The Guardian states that less than 1% of collected data is\n", + "actually analyzed. There is too much data duplication, the majority of data is hard\n", + "to access and deriving actual value is too cumbersome. Circular data economy\n", + "promotes best practices and reusability of existing data assets allowing for a more\n", + "consistent interpretation and insights across the wider data ecosystem.\n", + "\n", + "\n", + "Figure 8: Databricks Marketplace\n", + "\n", + "\n", + "-----\n", + "\n", + "Interoperability is a key component of FAIR data principles, and from\n", + "interoperability a question of circularity comes to mind. How can we design an\n", + "ecosystem that maximizes data utilization and data reuse? Once again, FAIR\n", + "together with the value pyramid holds answers. Findability of the data is key to\n", + "the data reuse and to solving for data pollution. With data assets that can be\n", + "discovered easily we can avoid the recreation of same data assets in multiple\n", + "places with just slight alteration. Instead we gain a coherent data ecosystem\n", + "with data that can be easily combined and reused. Databricks has recently\n", + "announced the [Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html) . The idea behind the marketplace is in\n", + "line with the original definition of data product by DJ Patel. The marketplace\n", + "will support sharing of data sets, notebooks, dashboards, and machine learning\n", + "models. The critical building block for such a marketplace is the concept of\n", + "Delta Sharing — the scalable, flexible and robust channel for sharing any data —\n", + "geospatial data included.\n", + "\n", + "\n", + "Designing scalable data products that will live in the marketplace is crucial.\n", + "In order to maximize the value add of each data product one should strongly\n", + "consider FAIR principles and the product value pyramid. Without these guiding\n", + "principles we will only increase the issues that are already present in the\n", + "current systems. 
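As a rough illustration of how a governed Delta table becomes a shareable data product, the hedged sketch below uses Databricks SQL Delta Sharing commands; the share, recipient and table names are placeholders rather than part of the original example.

```
-- Hedged sketch: exposing a Delta table as a shared data product.
-- Share, recipient and table names are illustrative placeholders.
CREATE SHARE IF NOT EXISTS geospatial_products
  COMMENT 'Geospatial data products exposed to external consumers';

-- Any Delta table (for example, one following the GeoParquet schema) can be added.
ALTER SHARE geospatial_products
  ADD TABLE prod_catalog.geo.h3_indexed_assets;

-- The recipient represents the external party consuming the share.
CREATE RECIPIENT IF NOT EXISTS partner_org
  COMMENT 'External partner consuming the geospatial products';

GRANT SELECT ON SHARE geospatial_products TO RECIPIENT partner_org;
```

Access remains centrally governed and revocable, so the same asset can serve many consumers without being copied.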
Each data product should solve a unique problem and should\n", + "solve it in a simple, reproducible and robust way.\n", + "\n", + "**You can read more on how Databricks Lakehouse**\n", + "**Platform can help you accelerate time to value from**\n", + "**your data products in the eBook:** **[A New Approach](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)**\n", + "**[to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)** **.**\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.7 \u0007\n", + "\n", + "**Data Lineage With Unity Catalog**\n", + "\n", + "by **P A U L R O O M E , TA O F E N G A N D S A C H I N T H A K U R**\n", + "\n", + "June 8, 2022\n", + "\n", + "\n", + "This blog will discuss the importance of data lineage, some of the common\n", + "use cases, our vision for better data transparency and data understanding with\n", + "data lineage.\n", + "\n", + "**What is data lineage and why is it important?**\n", + "\n", + "Data lineage describes the transformations and refinements of data from source\n", + "to insight. Lineage includes capturing all the relevant metadata and events\n", + "associated with the data in its lifecycle, including the source of the data set,\n", + "what other data sets were used to create it, who created it and when, what\n", + "transformations were performed, what other data sets leverage it, and many other\n", + "events and attributes. With a data lineage solution, data teams get an end-to-end\n", + "view of how data is transformed and how it flows across their data estate.\n", + "\n", + "As more and more organizations embrace a data-driven culture and set up\n", + "processes and tools to democratize and scale data and AI, data lineage is\n", + "becoming an essential pillar of a pragmatic data management and governance\n", + "strategy.\n", + "\n", + "To understand the importance of data lineage, we have highlighted some of the\n", + "common use cases we have heard from our customers below.\n", + "\n", + "\n", + "**Impact analysis**\n", + "Data goes through multiple updates or revisions over its lifecycle, and\n", + "understanding the potential impact of any data changes on downstream\n", + "consumers becomes important from a risk management standpoint. With data\n", + "lineage, data teams can see all the downstream consumers — applications,\n", + "dashboards, machine learning models or data sets, etc. — impacted by data\n", + "changes, understand the severity of the impact, and notify the relevant\n", + "stakeholders. Lineage also helps IT teams proactively communicate data\n", + "migrations to the appropriate teams, ensuring business continuity.\n", + "\n", + "**Data understanding and transparency**\n", + "Organizations deal with an influx of data from multiple sources, and building\n", + "a better understanding of the context around data is paramount to ensure\n", + "the trustworthiness of the data. Data lineage is a powerful tool that enables\n", + "data leaders to drive better transparency and understanding of data in their\n", + "organizations. Data lineage also empowers data consumers such as data scientists,\n", + "data engineers and data analysts to be context-aware as they perform analyses,\n", + "resulting in better quality outcomes. 
Finally, data stewards can see which data sets\n", + "are no longer accessed or have become obsolete to retire unnecessary data and\n", + "ensure data quality for end business users .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Debugging and diagnostics**\n", + "You can have all the checks and balances in place, but something will eventually\n", + "break. Data lineage helps data teams perform a root cause analysis of any errors\n", + "in their data pipelines, applications, dashboards, machine learning models, etc.,\n", + "by tracing the error to its source. This significantly reduces the debugging time,\n", + "saving days, or in many cases, months of manual effort.\n", + "\n", + "**Compliance and audit readiness**\n", + "Many compliance regulations, such as the General Data Protection Regulation\n", + "(GDPR), California Consumer Privacy Act (CCPA), Health Insurance Portability and\n", + "Accountability Act (HIPPA), Basel Committee on Banking Supervision (BCBS) 239,\n", + "and Sarbanes-Oxley Act (SOX), require organizations to have clear understanding\n", + "and visibility of data flow. As a result, data traceability becomes a key requirement\n", + "in order for their data architecture to meet legal regulations. Data lineage helps\n", + "organizations be compliant and audit-ready, thereby alleviating the operational\n", + "overhead of manually creating the trails of data flows for audit reporting purposes.\n", + "\n", + "\n", + "**Effortless transparency and proactive control with**\n", + "**data lineage**\n", + "\n", + "The [lakehouse](https://www.databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) provides a pragmatic data management architecture that\n", + "substantially simplifies enterprise data infrastructure and accelerates innovation\n", + "by unifying your data warehousing and AI use cases on a single platform.\n", + "We believe data lineage is a key enabler of better data transparency and data\n", + "understanding in your lakehouse, surfacing the relationships between data,\n", + "jobs, and consumers, and helping organizations move toward proactive data\n", + "management practices. For example:\n", + "\n", + "**•** As the owner of a dashboard, do you want to be notified next time that a\n", + "table your dashboard depends upon wasn’t loaded correctly?\n", + "\n", + "**•** As a machine learning practitioner developing a model, do you want to be\n", + "alerted that a critical feature in your model will be deprecated soon?\n", + "\n", + "**•** As a governance admin, do you want to automatically control access to\n", + "data based on its provenance?\n", + "\n", + "All of these capabilities rely upon the automatic collection of data lineage across\n", + "all use cases and personas — which is why the lakehouse and data lineage are a\n", + "powerful combination.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data lineage for tables\n", + "\n", + "Data lineage for table columns\n", + "\n", + "\n", + "Data Lineage for notebooks, workflows, dashboards\n", + "\n", + "**Built-in security:** Lineage graphs in Unity Catalog are privilege-aware and share\n", + "the same permission model as Unity Catalog. 
If users do not have access to\n", + "a table, they will not be able to explore the lineage associated with the table,\n", + "adding an additional layer of security for privacy considerations.\n", + "\n", + "**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\n", + "in near real-time, and retrieved via REST API to support integrations with our\n", + "catalog partners.\n", + "\n", + "**Getting started with data lineage in Unity Catalog**\n", + "\n", + "Data lineage is available with Databricks Premium and Enterprise tiers for\n", + "no additional cost. If you already are a Databricks customer, follow the data\n", + "lineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. If you are not an existing Databricks\n", + "customer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.8\n", + "\n", + "**Easy Ingestion to Lakehouse With COPY INTO**\n", + "\n", + "by **A E M R O A M A R E , E M M A L I U , A M I T K A R A** and **J A S R A J D A N G E**\n", + "\n", + "January 17, 2023\n", + "\n", + "\n", + "A new data management architecture known as the data lakehouse emerged\n", + "independently across many organizations and use cases to support AI and BI\n", + "directly on vast amounts of data. One of the key success factors for using the\n", + "data lakehouse for analytics and machine learning is the ability to quickly and\n", + "easily ingest data of various types, including data from on-premises storage\n", + "platforms (data warehouses, mainframes), real-time streaming data, and bulk\n", + "data assets.\n", + "\n", + "As data ingestion into the lakehouse is an ongoing process that feeds the\n", + "proverbial ETL pipeline, you will need multiple options to ingest various formats,\n", + "types and latency of data. For data stored in cloud object stores such as AWS\n", + "S3, Google Cloud Storage and Azure Data Lake Storage, Databricks offers\n", + "Auto Loader, a natively integrated feature, that allows data engineers to ingest\n", + "millions of files from the cloud storage continuously. In other streaming cases\n", + "\n", + "(e.g., IoT sensor or clickstream data), Databricks provides native connectors\n", + "for Apache Spark Structured Streaming to quickly ingest data from popular\n", + "message queues, such as [Apache Kafka](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html?_ga=2.117268486.126296912.1643033657-734003504.1641217794) , Azure Event Hubs or AWS Kinesis at low\n", + "latencies. 
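As a quick illustration of that streaming path, the sketch below reads a Kafka topic with the Structured Streaming connector and lands it in a Delta table; it assumes a Databricks notebook where `spark` is available, and the broker addresses, topic, checkpoint path and table name are illustrative placeholders.

```
# Hedged sketch: streaming ingestion from Kafka into a Delta table.
# Broker addresses, topic, checkpoint path and table name are placeholders.
events = (
    spark.readStream
        .format("kafka")
        .option("kafka.bootstrap.servers", "broker1:9092,broker2:9092")
        .option("subscribe", "clickstream")
        .option("startingOffsets", "latest")
        .load()
)

# Kafka delivers binary key/value payloads; cast them before landing the
# stream in a Delta table for downstream processing.
(
    events.selectExpr("CAST(key AS STRING) AS key",
                      "CAST(value AS STRING) AS value",
                      "timestamp")
        .writeStream
        .format("delta")
        .option("checkpointLocation", "/tmp/checkpoints/clickstream_bronze")
        .toTable("bronze.clickstream_raw")
)
```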
Furthermore, many customers can leverage popular ingestion tools\n", + "\n", + "\n", + "that integrate with Databricks, such as Fivetran — to easily ingest data from\n", + "enterprise applications, databases, mainframes and more into the lakehouse.\n", + "Finally, analysts can use the simple “COPY INTO” command to pull new data into\n", + "the lakehouse automatically, without the need to keep track of which files have\n", + "already been processed.\n", + "\n", + "This blog focuses on COPY INTO, a simple yet powerful SQL command that allows\n", + "you to perform batch file ingestion into Delta Lake from cloud object stores.\n", + "It’s idempotent, which guarantees to ingest files with exactly-once semantics\n", + "when executed multiple times, supporting incremental appends and simple\n", + "transformations. It can be run once, in an ad hoc manner, and can be scheduled\n", + "through Databricks Workflows. In recent Databricks [Runtime releases](https://docs.databricks.com/release-notes/runtime/releases.html) , COPY\n", + "INTO introduced new functionalities for data preview, validation, enhanced error\n", + "handling, and a new way to copy into a schemaless Delta Lake table so that users\n", + "\n", + "can get started quickly, completing the end-to-end user journey to ingest from\n", + "cloud object stores. Let’s take a look at the popular COPY INTO use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**1. Ingesting data for the first time**\n", + "\n", + "\n", + "The default for data validation is to parse all the data in the source directory to\n", + "ensure that there aren’t any issues, but the rows returned for preview are limited.\n", + "Optionally, you can provide the number of rows to preview after VALIDATE.\n", + "\n", + "The COPY_OPTION “mergeSchema” specifies that it is okay to evolve the schema\n", + "of your target Delta table. Schema evolution only allows the addition of new\n", + "columns, and does not support data type changes for existing columns. In other\n", + "use cases, you can omit this option if you intend to manage your table schema\n", + "more strictly as your data pipeline may have strict schema requirements and\n", + "may not want to evolve the schema at all times. However, our target Delta table\n", + "in the example above is an empty, columnless table at the moment; therefore,\n", + "we have to specify the COPY_OPTION “mergeSchema” here.\n", + "\n", + "Figure 1: COPY INTO VALIDATE mode output\n", + "\n", + "\n", + "COPY INTO requires a table to exist as it ingests the data into a target Delta\n", + "table. However, you have no idea what your data looks like. You first create an\n", + "empty Delta table.\n", + "```\n", + " CREATE TABLE my_example_data;\n", + "\n", + "```\n", + "Before you write out your data, you may want to preview it and ensure the\n", + "data looks correct. 
The COPY INTO Validate mode is a new feature in\n", + "Databricks Runtime [10.3](https://docs.databricks.com/release-notes/runtime/10.3.html) and above that allows you to preview and validate\n", + "source data before ingesting many files from the cloud object stores.\n", + "These validations include:\n", + "\n", + "**•** if the data can be parsed\n", + "\n", + "**•** the schema matches that of the target table or if the schema\n", + "needs to be evolved\n", + "\n", + "**•** all nullability and check constraints on the table are met\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "\n", + "-----\n", + "\n", + "**2. Configuring COPY INTO**\n", + "\n", + "\n", + "Figure 2 shows the validate output that the header is properly parsed.\n", + "\n", + "Figure 2: COPY INTO VALIDATE mode output with enabled header and inferSchema\n", + "\n", + "**3. Appending data to a Delta table**\n", + "\n", + "Now that the preview looks good, we can remove the VALIDATE keyword and\n", + "execute the COPY INTO command.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n", + "'true' )\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "\n", + "When looking over the results of VALIDATE (see Figure 1), you may notice that\n", + "your data doesn’t look like what you want. Aren’t you glad you previewed your\n", + "data set first? The first thing you notice is the column names are not what is\n", + "specified in the CSV header. What’s worse, the header is shown as a row in your\n", + "data. You can configure the CSV parser by specifying FORMAT_OPTIONS.\n", + "Let’s add those next.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleData'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE\n", + "FORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n", + "'true' )\n", + "COPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n", + "\n", + "When using the FORMAT OPTION, you can tell COPY INTO to infer the data types\n", + "of the CSV file by specifying the inferSchema option; otherwise, all default\n", + "data types are STRINGs. On the other hand, binary file formats like AVRO and\n", + "PARQUET do not need this option since they define their own schema. Another\n", + "\n", + "option, “mergeSchema” states that the schema should be inferred over a\n", + "comprehensive sample of CSV files rather than just one. The comprehensive list\n", + "of format-specific options can be found in the [documentation](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/delta-copy-into#format-options) .\n", + "\n", + "\n", + "-----\n", + "\n", + "COPY INTO keeps track of the state of files that\n", + "have been ingested. 
Unlike commands like INSERT INTO, users get idempotency with COPY INTO, which means users won't get duplicate data in the target table when running COPY INTO multiple times from the same source data.

COPY INTO can be run once, in an ad hoc manner, and can be scheduled with Databricks Workflows. While COPY INTO does not natively support low-latency ingestion, you can trigger COPY INTO through orchestrators like Apache Airflow.

Figure 3: Databricks workflow UI to schedule a task

-----

**4. Secure data access with COPY INTO**

COPY INTO supports secure access in several ways. In this section, we want to highlight two new options you can use in both [Databricks SQL](https://dbricks.co/dbsql) and notebooks from recent releases:

**Unity Catalog**
With the general availability of Databricks Unity Catalog, you can use COPY INTO to ingest data to Unity Catalog managed or external tables from any source and file format supported by COPY INTO. Unity Catalog also adds new options for configuring secure access to raw data, allowing you to use Unity Catalog external locations or storage credentials to access data in cloud object storage. Learn more about how to use [COPY INTO with Unity Catalog](https://docs.databricks.com/ingestion/copy-into/unity-catalog.html#use-copy-into-to-load-data-with-unity-catalog) .

**Temporary Credentials**
What if you have not configured Unity Catalog or an instance profile? How about data from a trusted third-party bucket? Here is a convenient COPY INTO feature that allows you to [ingest data with inline temporary credentials](https://docs.databricks.com/ingestion/copy-into/temporary-credentials.html) to handle the ad hoc bulk ingestion use case.

COPY INTO my_example_data
FROM 's3://my-bucket/exampleDataPath' WITH (
CREDENTIAL (AWS_ACCESS_KEY = '...' , AWS_SECRET_KEY = '...' , AWS_SESSION_TOKEN = '...' )
)
FILEFORMAT = CSV

**5. Filtering files for ingestion**

What about ingesting a subset of files where the filenames match a pattern? You can apply glob patterns — a glob pattern identifies the files to load from the source directory. For example, let's filter and ingest files which contain the word `raw_data` in the filename below.

COPY INTO my_example_data
FROM 's3://my-bucket/exampleDataPath'
FILEFORMAT = CSV
PATTERN = '*raw_data*.csv'
FORMAT_OPTIONS ( 'header' = 'true' )

**6. Ingest files in a time period**

In data engineering, it is frequently necessary to ingest files that have been modified before or after a specific timestamp. Data between two timestamps may also be of interest. The 'modifiedAfter' and 'modifiedBefore' format options offered by COPY INTO allow users to ingest data from a chosen time window into a Delta table.

COPY INTO my_example_data
FROM 's3://my-bucket/exampleDataPath'
FILEFORMAT = CSV
PATTERN = '*raw_data_*.csv'
FORMAT_OPTIONS ( 'header' = 'true' , 'modifiedAfter' = '2022-09-12T10:53:11.000+0000' )

-----

**7. 
Correcting data with the force option**\n", + "\n", + "Because COPY INTO is by default idempotent, running the same query against\n", + "the same source files more than once has no effect on the destination table\n", + "after the initial execution. You must propagate changes to the target table\n", + "because, in real-world circumstances, source data files in cloud object storage\n", + "may be altered for correction at a later time. In such a case, it is possible to first\n", + "erase the data from the target table before ingesting the more recent data files\n", + "from the source. For this operation you only need to set the copy option ‘force’\n", + "to ‘true’.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*raw_data_2022*.csv'\n", + "FORMAT_OPTIONS( 'header' `=` 'true' )\n", + "COPY_OPTIONS ( 'force' `=` 'true' )\n", + "\n", + "\n", + "**8. Applying simple transformations**\n", + "\n", + "What if you want to rename columns? Or the source data has changed and a\n", + "previous column has been renamed to something else? You don’t want to ingest\n", + "that data as two separate columns, but as a single column. We can leverage the\n", + "SELECT statement in COPY INTO perform simple transformations.\n", + "\n", + "COPY INTO demo.my_example_data\n", + "FROM ( SELECT concat(first_name, \" \", last_name) as full_name,\n", + "`*` EXCEPT (first_name, last_name)\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "PATTERN `=` '*.csv'\n", + "FORMAT_OPTIONS( 'header' `=` 'true' )\n", + "COPY_OPTIONS ( 'force' `=` 'true' )\n", + "\n", + "**9. Error handling and observability with COPY INTO**\n", + "\n", + "**Error handling:**\n", + "How about ingesting data with file corruption issues? Common examples of file\n", + "corruption are:\n", + "\n", + "**•** Files with an incorrect file format\n", + "\n", + "**•** Failure to decompress\n", + "\n", + "**•** Unreadable files (e.g., invalid Parquet)\n", + "\n", + "\n", + "-----\n", + "\n", + "COPY INTO’s format option ignoreCorruptFiles helps skip those files while\n", + "processing. The result of the COPY INTO command returns the number of files\n", + "skipped in the num_skipped_corrupt_files column. In addition, these corrupt\n", + "files aren’t tracked by the ingestion state in COPY INTO, therefore they can be\n", + "reloaded in a subsequent execution once the corruption is fixed. This option is\n", + "available in Databricks [Runtime 11.0+](https://docs.databricks.com/release-notes/runtime/11.0.html) .\n", + "\n", + "You can see which files have been detected as corrupt by running COPY INTO in\n", + "VALIDATE mode.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM 's3://my-bucket/exampleDataPath'\n", + "FILEFORMAT `=` CSV\n", + "VALIDATE ALL\n", + "FORMAT_OPTIONS( 'ignoreCorruptFiles' `=` 'true' )\n", + "\n", + "**Observability:**\n", + "In Databricks Runtime 10.5, [file metadata column](https://docs.databricks.com/ingestion/file-metadata-column.html) was introduced to provide\n", + "input file metadata information, which allows users to monitor and get key\n", + "properties of the ingested files like path, name, size and modification time, by\n", + "querying a hidden STRUCT column called _metadata. 
To include this information\n", + "in the destination, you must explicitly reference the _metadata column in your\n", + "query in COPY INTO.\n", + "\n", + "COPY INTO my_example_data\n", + "FROM (\n", + "SELECT `*` , _metadata source_metadata FROM 's3://my-bucket/\n", + "exampleDataPath'\n", + ")\n", + "FILEFORMAT `=` CSV\n", + "\n", + "\n", + "**How does it compare to Auto Loader?**\n", + "\n", + "COPY INTO is a simple and powerful command to use when your source\n", + "directory contains a small number of files (i.e., thousands of files or less), and if\n", + "you prefer SQL. In addition, COPY INTO can be used over JDBC to push data into\n", + "Delta Lake at your convenience, a common pattern by many ingestion partners.\n", + "To ingest a larger number of files both in streaming and batch we recommend\n", + "using [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) . In addition, for a modern data pipeline based on [medallion](https://www.databricks.com/glossary/medallion-architecture)\n", + "[architecture](https://www.databricks.com/glossary/medallion-architecture) , we recommend using Auto Loader in [Delta Live Tables pipelines](https://docs.databricks.com/ingestion/auto-loader/dlt.html) ,\n", + "leveraging advanced capabilities of automatic error handling, quality control,\n", + "data lineage and setting [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html) in a declarative approach.\n", + "\n", + "**How to get started?**\n", + "\n", + "To get started, you can go to **[Databricks SQL](https://dbricks.co/dbsql)** query editor, update and run the\n", + "example SQL commands to ingest from your cloud object stores. Check out\n", + "the options in No. 4 to establish secure access to your data for querying it in\n", + "Databricks SQL. To get familiar with COPY INTO in Databricks SQL, you can also\n", + "follow this [quickstart tutorial.](https://docs.databricks.com/ingestion/copy-into/tutorial-dbsql.html)\n", + "\n", + "As an alternative, you can use this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/db-385-demo_copy_into.html) in Data Science & Engineering and\n", + "Machine Learning workspaces to learn most of the COPY INTO features in this\n", + "blog, where source data and target Delta tables are generated in DBFS.\n", + "\n", + "More tutorials for COPY INTO can be found [here](https://docs.databricks.com/ingestion/copy-into/index.html) .\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.9 \u0007\n", + "\n", + "**Simplifying Change Data Capture With Databricks Delta Live Tables**\n", + "\n", + "by **M O J G A N M A Z O U C H I**\n", + "\n", + "April 25, 2022\n", + "\n", + "\n", + "This guide will demonstrate how you can leverage change data capture in Delta\n", + "Live Tables pipelines to identify new records and capture changes made to the\n", + "data set in your data lake. 
Delta Live Tables pipelines enable you to develop scalable, reliable and low-latency data pipelines, while performing change data capture in your data lake with the minimum required compute resources and seamless out-of-order data handling.

**Note:** We recommend following [Getting Started with Delta Live Tables](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) which explains creating scalable and reliable pipelines using Delta Live Tables (DLT) and its declarative ETL definitions.

**Background on change data capture**

Change data capture ( [CDC](https://en.wikipedia.org/wiki/Change_data_capture) ) is a process that identifies and captures incremental changes (data deletes, inserts and updates) in databases, like tracking customer, order or product status for near-real-time data applications. CDC provides real-time data evolution by processing data in a continuous incremental fashion as new events occur.

Since [over 80% of organizations plan on implementing multicloud strategies by 2025](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/) , choosing the right approach for your business that allows seamless real-time centralization of all data changes in your ETL pipeline across multiple environments is critical.

By capturing CDC events, Databricks users can re-materialize the source table as a Delta table in the Lakehouse and run their analysis on top of it, while being able to combine data with external systems. The MERGE INTO command in Delta Lake on Databricks enables customers to efficiently upsert and delete records in their data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) . This is a common use case: many Databricks customers leverage Delta Lake this way to keep their data lakes up to date with real-time business data.

While Delta Lake provides a complete solution for real-time CDC synchronization in a data lake, we are now excited to announce the change data capture feature in Delta Live Tables that makes your architecture even simpler, more efficient and scalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.

-----

Earlier CDC solutions with Delta tables used the MERGE INTO operation, which requires manually ordering the data to avoid failure when multiple rows of the source data set match while attempting to update the same rows of the target Delta table. To handle the out-of-order data, there was an extra step required to preprocess the source table using a foreachBatch implementation to eliminate the possibility of multiple matches, retaining only the latest change for each key (see the [change data capture example](https://www.databricks.com/blog/2022/04/25/simplifying-change-data-capture-with-databricks-delta-live-tables.html#) ). A minimal sketch of this earlier pattern is shown below. 
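The sketch assumes a PySpark streaming source and the Delta Lake Python API; the table names, key column and sequencing column loosely mirror the customer example used later in this blog and are illustrative only.

```
# Hedged sketch of the pre-DLT pattern: deduplicate each micro-batch to the
# latest change per key, then MERGE it into the target Delta table.
# Table names, key and sequencing columns are illustrative.
from delta.tables import DeltaTable
from pyspark.sql import functions as F
from pyspark.sql.window import Window

def upsert_latest(micro_batch_df, batch_id):
    # Keep only the most recent event per primary key so that no two source
    # rows attempt to update the same target row in the MERGE.
    latest = (
        micro_batch_df
        .withColumn("rn", F.row_number().over(
            Window.partitionBy("id").orderBy(F.col("operation_date").desc())))
        .filter("rn = 1")
        .drop("rn")
    )
    target = DeltaTable.forName(spark, "customers")
    (
        target.alias("t")
        .merge(latest.alias("s"), "t.id = s.id")
        .whenMatchedDelete(condition="s.operation = 'DELETE'")
        .whenMatchedUpdateAll(condition="s.operation != 'DELETE'")
        .whenNotMatchedInsertAll(condition="s.operation != 'DELETE'")
        .execute()
    )

(
    spark.readStream.table("cdc_events_bronze")
        .writeStream
        .foreachBatch(upsert_latest)
        .option("checkpointLocation", "/tmp/checkpoints/customers_cdc")
        .start()
)
```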
The new APPLY CHANGES INTO\n", + "operation in DLT pipelines automatically and seamlessly handles out-of-order\n", + "data without any need for data engineering manual intervention.\n", + "\n", + "**CDC with Databricks Delta Live Tables**\n", + "\n", + "In this blog, we will demonstrate how to use the APPLY CHANGES INTO command\n", + "in Delta Live Tables pipelines for a common CDC use case where the CDC data\n", + "is coming from an external system. A variety of CDC tools are available such\n", + "as Debezium, Fivetran, Qlik Replicate, Talend, and StreamSets. While specific\n", + "implementations differ, these tools generally capture and record the history\n", + "of data changes in logs; downstream applications consume these CDC logs. In\n", + "our example, data is landed in cloud object storage from a CDC tool such as\n", + "Debezium, Fivetran, etc.\n", + "\n", + "We have data from various CDC tools landing in a cloud object storage or a\n", + "message queue like Apache Kafka. Typically we see CDC used in an ingestion\n", + "to what we refer as the medallion architecture. A medallion architecture is a\n", + "data design pattern used to logically organize data in a Lakehouse, with the\n", + "goal of incrementally and progressively improving the structure and quality of\n", + "data as it flows through each layer of the architecture. Delta Live Tables allows\n", + "you to seamlessly apply changes from CDC feeds to tables in your Lakehouse;\n", + "combining this functionality with the medallion architecture allows for\n", + "\n", + "\n", + "incremental changes to easily flow through analytical workloads at scale. Using\n", + "CDC together with the medallion architecture provides multiple benefits to users\n", + "since only changed or added data needs to be processed. Thus, it enables users\n", + "to cost-effectively keep Gold tables up-to-date with the latest business data.\n", + "\n", + "**NOTE:** The example here applies to both SQL and Python versions of CDC\n", + "and also on a specific way to use the operations; to evaluate variations,\n", + "please see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n", + "\n", + "**Prerequisites**\n", + "\n", + "To get the most out of this guide, you should have a basic familiarity with:\n", + "\n", + "**•** SQL or Python\n", + "\n", + "**•** Delta Live Tables\n", + "\n", + "**•** Developing ETL pipelines and/or working with Big Data systems\n", + "\n", + "**•** Databricks interactive notebooks and clusters\n", + "\n", + "**•** You must have access to a Databricks Workspace with permissions\n", + "to create new clusters, run jobs, and save data to a location on\n", + "external cloud object storage or [DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)\n", + "\n", + "**•** For the pipeline we are creating in this blog, “Advanced” product\n", + "edition which supports enforcement of data quality constraints,\n", + "needs to be selected\n", + "\n", + "\n", + "-----\n", + "\n", + "**The data set**\n", + "\n", + "Here we are consuming realistic looking CDC data from an external database. In\n", + "this pipeline, we will use the [Faker](https://github.com/joke2k/faker) library to generate the data set that a CDC\n", + "tool like Debezium can produce and bring into cloud storage for the initial ingest\n", + "in Databricks. 
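To make the synthetic feed more tangible, here is a rough sketch of what a single generated CDC message could look like with Faker; the field names follow the schema described below, while the helper function and values are illustrative rather than taken from the linked notebook.

```
# Hedged sketch of one synthetic CDC message produced with Faker.
# The helper function and values are illustrative; the real generator lives
# in the linked data-generation notebook.
import json
import uuid
from datetime import datetime, timezone

from faker import Faker

fake = Faker()

def fake_cdc_event(operation):
    return {
        "id": str(uuid.uuid4()),
        "firstname": fake.first_name(),
        "lastname": fake.last_name(),
        "email": fake.email(),
        "address": fake.address().replace("\n", ", "),
        "operation": operation,  # e.g. APPEND, UPDATE or DELETE
        "operation_date": datetime.now(timezone.utc).isoformat(),
    }

print(json.dumps(fake_cdc_event("APPEND"), indent=2))
```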
Using [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) we incrementally load the messages from cloud\n", + "object storage, and store them in the Bronze table as it stores the raw messages.\n", + "The Bronze tables are intended for data ingestion which enable quick access to a\n", + "single source of truth. Next we perform APPLY CHANGES INTO from the cleaned\n", + "Bronze layer table to propagate the updates downstream to the Silver table. As\n", + "data flows to Silver tables, generally it becomes more refined and optimized\n", + "(“just-enough”) to provide an enterprise a view of all its key business entities.\n", + "See the diagram below.\n", + "\n", + "\n", + "This blog focuses on a simple example that requires a JSON message with\n", + "four fields of customer’s name, email, address and id along with the two fields:\n", + "operation (which stores operation code (DELETE, APPEND, UPDATE, CREATE) and\n", + "operation_date (which stores the date and timestamp for the record came for\n", + "each operation action) to describe the changed data.\n", + "\n", + "To generate a sample data set with the above fields, we are using a Python\n", + "package that generates fake data, Faker. You can find the notebook related to this\n", + "data generation section [here](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/1-cdc-data-generator.html) . In this notebook we provide the name and storage\n", + "location to write the generated data there. We are using the DBFS functionality of\n", + "Databricks; see the [DBFS documentation](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) to learn more about how it works. Then,\n", + "we use a PySpark user-defined function to generate the synthetic data set for\n", + "each field, and write the data back to the defined storage location, which we will\n", + "refer to in other notebooks for accessing the synthetic data set.\n", + "\n", + "**Ingesting the raw data set using Auto Loader**\n", + "\n", + "According to the medallion architecture paradigm, the Bronze layer holds the\n", + "most raw data quality. At this stage we can incrementally read new data using\n", + "Auto Loader from a location in cloud storage. Here we are adding the path to our\n", + "generated data set to the configuration section under pipeline settings, which\n", + "allows us to load the source path as a variable. So now our configuration under\n", + "pipeline settings looks like below:\n", + "\n", + "\"configuration\" : {\n", + "\"source\" : \"/tmp/demo/cdc_raw\"\n", + "\n", + "\n", + "-----\n", + "\n", + "Then we load this configuration property in our notebooks.\n", + "\n", + "Let’s take a look at the Bronze table we will ingest, a. In SQL, and b. Using Python\n", + "\n", + "**A . S Q L**\n", + "\n", + "SET spark.source;\n", + "CREATE STREAMING LIVE TABLE customer_bronze\n", + "(\n", + "address string ,\n", + "email string ,\n", + "id string ,\n", + "firstname string ,\n", + "lastname string ,\n", + "operation string ,\n", + "operation_date string ,\n", + "_rescued_data string\n", + ")\n", + "TBLPROPERTIES ( \"quality\" = \"bronze\" )\n", + "COMMENT \"New customer data incrementally ingested from cloud object\n", + "storage landing zone\"\n", + "AS\n", + "SELECT *\n", + "FROM cloud_files( \"${source}/customers\" , \"json\" , map( \"cloudFiles.\n", + "inferColumnTypes\" , \"true\" ));\n", + "\n", + "\n", + "**B . 
P Y T H O N**

```
import dlt
from pyspark.sql.functions import *
from pyspark.sql.types import *

source = spark.conf.get("source")

@dlt.table(name="customer_bronze",
           comment="New customer data incrementally ingested from cloud object storage landing zone",
           table_properties={
             "quality": "bronze"
           }
)
def customer_bronze():
  return (
    spark.readStream.format("cloudFiles")
      .option("cloudFiles.format", "json")
      .option("cloudFiles.inferColumnTypes", "true")
      .load(f"{source}/customers")
  )
```

The above statements use Auto Loader to create a streaming live table called customer_bronze from JSON files. When using Auto Loader in Delta Live Tables, you do not need to provide any location for schema or checkpoint, as those locations will be managed automatically by your DLT pipeline.

Auto Loader provides a Structured Streaming source called cloud_files in SQL and cloudFiles in Python, which takes a cloud storage path and format as parameters.

To reduce compute costs, we recommend running the DLT pipeline in Triggered mode as a micro-batch, assuming you do not have very low latency requirements.

-----

**Expectations and high-quality data**

In the next step to create a high-quality, diverse, and accessible data set, we impose quality check expectation criteria using Constraints. Currently, a constraint can be either retain, drop, or fail. For more detail see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html) . All constraints are logged to enable streamlined quality monitoring.

**A . S Q L**

CREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(
CONSTRAINT valid_id EXPECT (id IS NOT NULL) ON VIOLATION DROP ROW,
CONSTRAINT valid_address EXPECT (address IS NOT NULL),
CONSTRAINT valid_operation EXPECT (operation IS NOT NULL) ON VIOLATION DROP ROW
)
TBLPROPERTIES ("quality" = "silver")
COMMENT "Cleansed bronze customer view (i.e. what will become Silver)"
AS SELECT *
FROM STREAM(LIVE.customer_bronze);

**B . P Y T H O N**
```
@dlt.view(name="customer_bronze_clean_v",
          comment="Cleansed bronze customer view (i.e. what will become Silver)")
```

**Using APPLY CHANGES INTO statement to propagate changes to downstream target table**

Prior to executing the Apply Changes Into query, we must ensure that a target streaming table which will hold the most up-to-date data exists. If it does not exist, we need to create one. The cells below are examples of creating a target streaming table. Note that, at the time of publishing this blog, the target streaming table creation statement is required along with the Apply Changes Into query, and both need to be present in the pipeline — otherwise your table creation query will fail.

**A . 
S Q L**\n", + "\n", + "CREATE STREAMING LIVE TABLE customer_silver\n", + "TBLPROPERTIES (\"quality\" `=` \"silver\")\n", + "COMMENT \"Clean, merged customers\";\n", + "\n", + "**B . P Y T H O N**\n", + "\n", + "dlt.create_target_table(name= \"customer_silver\" ,\n", + "comment= \"Clean, merged customers\" ,\n", + "table_properties={\n", + "\"quality\" : \"silver\"\n", + "\n", + "```\n", + "@dlt.expect_or_drop( \"valid_id\" , \"id IS NOT NULL\" )\n", + "@dlt.expect( \"valid_address\" , \"address IS NOT NULL\" )\n", + "@dlt.expect_or_drop( \"valid_operation\" , \"operation IS NOT NULL\" )\n", + "def customer_bronze_clean_v ():\n", + " return dlt.read_stream( \"customer_bronze\" ) \\\n", + "\n", + "```\n", + "`.select(` `\"address\"` `,` `\"email\"` `,` `\"id\"` `,` \"firstname\" `,` `\"lastname\"` `,`\n", + "```\n", + "\"operation\" , \"operation_date\" , \"_rescued_data\" )\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "Now that we have a target streaming table, we can propagate changes to the\n", + "downstream target table using the Apply Changes Into query. While CDC feed\n", + "comes with INSERT, UPDATE and DELETE events, DLT default behavior is to apply\n", + "INSERT and UPDATE events from any record in the source data set matching\n", + "on primary keys, and sequenced by a field which identifies the order of events.\n", + "More specifically it updates any row in the existing target table that matches\n", + "the primary key(s) or inserts a new row when a matching record does not exist\n", + "in the target streaming table. We can use APPLY AS DELETE WHEN in SQL, or its\n", + "equivalent apply_as_deletes argument in Python to handle DELETE events.\n", + "\n", + "In this example we used \"id\" as my primary key, which uniquely identifies the\n", + "customers and allows CDC events to apply to those identified customer records\n", + "in the target streaming table. Since \"operation_date\" keeps the logical order of\n", + "CDC events in the source data set, we use \"SEQUENCE BY operation_date\" in\n", + "SQL, or its equivalent \"sequence_by = col(\"operation_date\")\" in Python to handle\n", + "change events that arrive out of order. Keep in mind that the field value we use\n", + "with SEQUENCE BY (or sequence_by) should be unique among all updates to\n", + "the same key. In most cases, the sequence by column will be a column with\n", + "timestamp information.\n", + "\n", + "Finally we used \"COLUMNS * EXCEPT (operation, operation_date, _rescued_\n", + "data)\" in SQL, or its equivalent \"except_column_list\"= [\"operation\", \"operation_\n", + "date\", \"_rescued_data\"] in Python to exclude three columns of \"operation\",\n", + "\"operation_date\", \"_rescued_data\" from the target streaming table. By default all\n", + "the columns are included in the target streaming table, when we do not specify\n", + "the \"COLUMNS\" clause.\n", + "\n", + "\n", + "**A . S Q L**\n", + "\n", + "APPLY CHANGES INTO LIVE.customer_silver\n", + "FROM stream(LIVE.customer_bronze_clean_v)\n", + "KEYS (id)\n", + "APPLY AS DELETE WHEN operation `=` \"DELETE\"\n", + "SEQUENCE BY operation_date\n", + "COLUMNS `*` EXCEPT (operation, operation_date,\n", + "_rescued_data);\n", + "\n", + "**B . 
P Y T H O N**\n", + "```\n", + " dlt.apply_changes(\n", + " target = \"customer_silver\",\n", + " source = \"customer_bronze_clean_v\",\n", + " keys = [\"id\"],\n", + " sequence_by = col(\"operation_date\"),\n", + " apply_as_deletes = expr(\"operation = 'DELETE'\"),\n", + " except_column_list = [\"operation\", \"operation_date\", \"_rescued_data\"])\n", + "\n", + "```\n", + "To check out the full list of available clauses see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#requirements) .\n", + "\n", + "Please note that, at the time of publishing this blog, a table that reads from the\n", + "target of an APPLY CHANGES INTO query or apply_changes function must be a\n", + "live table, and cannot be a streaming live table.\n", + "\n", + "A [SQL](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-retail-dlt-cdc-sql.html) and [Python](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-Retail_DLT_CDC_Python.html) notebook is available for reference for this section. Now that\n", + "we have all the cells ready, let’s create a pipeline to ingest data from cloud object\n", + "storage. Open Jobs in a new tab or window in your workspace, and select “Delta\n", + "Live Tables.”\n", + "\n", + "\n", + "-----\n", + "\n", + "The pipeline associated with this blog has the following DLT pipeline settings:\n", + "\n", + "{\n", + "\"clusters\" : [\n", + "{\n", + "\"label\" : \"default\" ,\n", + "\"num_workers\" : 1\n", + "}\n", + "],\n", + "\"development\" : true ,\n", + "\"continuous\" : false ,\n", + "\"edition\" : \"advanced\" ,\n", + "\"photon\" : false ,\n", + "\"libraries\" : [\n", + "{\n", + "\"notebook\" : {\n", + "\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\n", + "notebooks/1-CDC_DataGenerator\"\n", + "}\n", + "},\n", + "{\n", + "\"notebook\" : {\n", + "\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\n", + "notebooks/2-Retail_DLT_CDC_sql\"\n", + "}\n", + "}\n", + "],\n", + "\"name\" : \"CDC_blog\" ,\n", + "\"storage\" : \"dbfs:/home/mydir/myDB/dlt_storage\" ,\n", + "\"configuration\" : {\n", + "\"source\" : \"/tmp/demo/cdc_raw\" ,\n", + "\"pipelines.applyChangesPreviewEnabled\" : \"true\"\n", + "},\n", + "\"target\" : \"my_database\"\n", + "\n", + "\n", + "1. Select “Create Pipeline” to create a new pipeline\n", + "\n", + "2. Specify a name such as “Retail CDC Pipeline”\n", + "\n", + "3. Specify the Notebook Paths that you already created earlier, one for the\n", + "generated data set using Faker package, and another path for the ingestion\n", + "of the generated data in DLT. The second notebook path can refer to the\n", + "notebook written in SQL, or Python depending on your language of choice.\n", + "\n", + "4. To access the data generated in the first notebook, add the data set path in\n", + "configuration. Here we stored data in “/tmp/demo/cdc_raw/customers”, so\n", + "we set “source” to “/tmp/demo/cdc_raw/” to reference “source/customers” in\n", + "our second notebook.\n", + "\n", + "5. Specify the Target (which is optional and referring to the target database),\n", + "where you can query the resulting tables from your pipeline\n", + "\n", + "6. Specify the Storage Location in your object storage (which is optional), to\n", + "access your DLT produced data sets and metadata logs for your pipeline\n", + "\n", + "7. Set Pipeline Mode to Triggered. 
In Triggered mode, DLT pipeline will consume\n", + "new data in the source all at once, and once the processing is done it will\n", + "terminate the compute resource automatically. You can toggle between\n", + "Triggered and Continuous modes when editing your pipeline settings. Setting\n", + "“continuous”: false in the JSON is equivalent to setting the pipeline to\n", + "Triggered mode.\n", + "\n", + "8. For this workload you can disable the autoscaling under Autopilot Options,\n", + "and use only one worker cluster. For production workloads, we recommend\n", + "enabling autoscaling and setting the maximum numbers of workers needed\n", + "for cluster size.\n", + "\n", + "9. Select “Start”\n", + "\n", + "10. Your pipeline is created and running now!\n", + "\n", + "\n", + "-----\n", + "\n", + "You can check out our previous deep dive on the topic [here](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) . Try this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/3-retail-dlt-cdc-monitoring.html)\n", + "to see pipeline observability and data quality monitoring on the example DLT\n", + "pipeline associated with this blog.\n", + "\n", + "**Conclusion**\n", + "\n", + "In this blog, we showed how we made it seamless for users to efficiently\n", + "implement change data capture (CDC) into their lakehouse platform with Delta\n", + "Live Tables (DLT). DLT provides built-in quality controls with deep visibility into\n", + "pipeline operations, observing pipeline lineage, monitoring schema, and quality\n", + "checks at each step in the pipeline. DLT supports automatic error handling and\n", + "best in class auto-scaling capability for streaming workloads, which enables\n", + "users to have quality data with optimum resources required for their workload.\n", + "\n", + "Data engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n", + "[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. 
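\n", + "\n", + "To recap, the Python side of this pattern fits in a handful of lines. The sketch\n", + "below simply condenses the code shown earlier in this section (the table and\n", + "column names follow that example) and is illustrative rather than a drop-in\n", + "implementation:\n", + "\n", + "```\n", + "import dlt\n", + "from pyspark.sql.functions import col, expr\n", + "\n", + "# Target streaming table for the cleaned, merged customers.\n", + "dlt.create_target_table(\n", + "    name=\"customer_silver\",\n", + "    comment=\"Clean, merged customers\",\n", + "    table_properties={\"quality\": \"silver\"})\n", + "\n", + "# Apply the CDC feed: upsert on \"id\", order events by \"operation_date\",\n", + "# honor DELETE events, and drop the CDC bookkeeping columns.\n", + "dlt.apply_changes(\n", + "    target=\"customer_silver\",\n", + "    source=\"customer_bronze_clean_v\",\n", + "    keys=[\"id\"],\n", + "    sequence_by=col(\"operation_date\"),\n", + "    apply_as_deletes=expr(\"operation = 'DELETE'\"),\n", + "    except_column_list=[\"operation\", \"operation_date\", \"_rescued_data\"])\n", + "```\n", + "\n", + "Run inside a DLT pipeline, this is equivalent to the SQL version shown above.\n", + "\n", + "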
This new capability lets\n", + "your ETL pipelines easily identify changes and apply those changes across tens\n", + "of thousands of tables with low-latency support.\n", + "\n", + "**Ready to get started and try out CDC in Delta Live Tables for yourself?**\n", + "Please watch this [webinar](https://www.databricks.com/p/webinar/tackle-data-transformation) to learn how Delta Live Tables simplifies the\n", + "complexity of data transformation and ETL, and see our [Change data capture](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE)\n", + "[with Delta Live Tables](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE) document, official [github](https://github.com/databricks/delta-live-tables-notebooks) and follow the steps in this\n", + "[video](https://vimeo.com/700994477) to create your pipeline!\n", + "\n", + "\n", + "**DLT pipeline lineage observability and data quality**\n", + "**monitoring**\n", + "\n", + "All DLT pipeline logs are stored in the pipeline’s storage location. You can specify\n", + "your storage location only when you are creating your pipeline. Note that once\n", + "the pipeline is created you can no longer modify storage location.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.10 \u0007\n", + "\n", + "**Best Practices for Cross-Government Data Sharing**\n", + "\n", + "by **M I L O S C O L I C , P R I T E S H P AT E L , R O B E R T W H I F F I N , R I C H A R D J A M E S W I L S O N ,**\n", + "\n", + "**M A R C E L L F E R E N C Z** and **E D W A R D K E L LY**\n", + "\n", + "February 21, 2023\n", + "\n", + "\n", + "Government data exchange is the practice of sharing data between different\n", + "government agencies and often partners in commercial sectors. Government\n", + "can share data for various reasons, such as to improve government operations’\n", + "efficiency, provide better services to the public, or support research and policymaking. In addition, data exchange in the public sector can involve sharing with the\n", + "private sector or receiving data from the private sector. The considerations span\n", + "multiple jurisdictions and over almost all industries. In this blog, we will address the\n", + "needs disclosed as part of national data strategies and how modern technologies,\n", + "particularly Delta Sharing, Unity Catalog, and clean rooms, can help you design,\n", + "implement and manage a future-proof and sustainable data ecosystem.\n", + "\n", + "**Data sharing and public sector**\n", + "\n", + "“The miracle is this: the more we share the more we have.” — [Leonard Nimoy.](https://en.wikipedia.org/wiki/Leonard_Nimoy)\n", + "\n", + "Probably the quote about sharing that applies the most profoundly to the\n", + "topic of data sharing. 
To the extent that the purpose of sharing the data is to\n", + "create new information, new insights, and new data. The importance of data\n", + "sharing is even more amplified in the government context, where federation\n", + "\n", + "\n", + "between departments allows for increased focus. Still, the very same federation\n", + "introduces challenges around data completeness, data quality, data access,\n", + "security and control, [FAIR](https://en.wikipedia.org/wiki/FAIR_data) -ness of data, etc. These challenges are far from trivial\n", + "and require a strategic, multifaceted approach to be addressed appropriately.\n", + "Technology, people, process, legal frameworks, etc., require dedicated\n", + "consideration when designing a robust data sharing ecosystem.\n", + "\n", + "[The National Data Strategy](https://www.gov.uk/government/publications/uk-national-data-strategy/national-data-strategy) (NDS) by the UK government outlines five actionable\n", + "missions through which we can materialize the value of data for the citizen and\n", + "society-wide benefits.\n", + "\n", + "\n", + "-----\n", + "\n", + "It comes as no surprise that each and every one of the missions is strongly\n", + "related to the concept of data sharing, or more broadly, data access both within\n", + "and outside of government departments:\n", + "\n", + "**1. Unlocking the value of the data across the economy** — Mission 1 of the\n", + "NDS aims to assert government and the regulators as enablers of the value\n", + "extraction from data through the adoption of best practices. The UK data\n", + "economy was estimated to be near [£125 billion in 2021](https://www.gov.uk/government/publications/uks-digital-strategy/uk-digital-strategy) with an upwards trend.\n", + "In this context, it is essential to understand that the government-collected\n", + "and provided open data can be crucial for addressing many of the challenges\n", + "across all industries.\n", + "\n", + "For example, insurance providers can better assess the risk of insuring\n", + "properties by ingesting and integrating [Flood areas](https://environment.data.gov.uk/flood-monitoring/doc/reference#flood-areas) provided by [DEFRA](https://www.gov.uk/government/organisations/department-for-environment-food-rural-affairs) . On\n", + "the other hand, capital market investors could better understand the risk of\n", + "their investments by ingesting and integrating the [Inflation Rate Index](https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l55o/mm23) by [ONS](https://www.ons.gov.uk/) .\n", + "Reversely, it is crucial for regulators to have well-defined data access and\n", + "data sharing patterns for conducting their regulatory activities. This clarity\n", + "truly enables the economic actors that interact with government data.\n", + "\n", + "\n", + "**2. Securing a pro-growth and trusted data regime** — The key aspect of\n", + "Mission 2 is data trust, or more broadly, adherence to data quality norms.\n", + "Data quality considerations become further amplified for data sharing and\n", + "data exchange use cases where we are considering the whole ecosystem\n", + "at once, and quality implications transcend the boundaries of our own\n", + "platform. 
This is precisely why we have to adopt “data sustainability.” What\n", + "we mean by sustainable data products are data products that harness the\n", + "existing sources over reinvention of the same/similar assets, accumulation of\n", + "unnecessary data (data pollutants) and that anticipate future uses.\n", + "\n", + "Ungoverned and unbounded data sharing could negatively impact data\n", + "quality and hinder the growth and value of data. The quality of how the data\n", + "is shared should be a key consideration of data quality frameworks. For\n", + "this reason, we require a solid set of standards and best practices for data\n", + "sharing with governance and quality assurance built into the process and\n", + "technologies. Only this way can we ensure the sustainability of our data and\n", + "secure a pro-growth trusted data regime.\n", + "\n", + "\n", + "-----\n", + "\n", + "**3. Transforming government’s use of data to drive efficiency and improve**\n", + "**public services** — “By 2025 data assets are organized and supported as\n", + "products, regardless of whether they’re used by internal teams or external\n", + "customers… Data products continuously evolve in an agile manner to meet\n", + "the needs of consumers… these products provide data solutions that can\n", + "more easily and repeatedly be used to meet various business challenges and\n", + "reduce the time and cost of delivering new AI-driven capabilities.” —\n", + "[The data-driven enterprise of 2025](https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-data-driven-enterprise-of-2025) by McKinsey. AI and ML can be powerful\n", + "enablers of digital transformation for both the public and private sectors.\n", + "\n", + "AI, ML, reports, and dashboards are just a few examples of data products\n", + "and services that extract value from data. The quality of these solutions is\n", + "directly reflected in the quality of data used for building them and our ability\n", + "to access and leverage available data assets both internally and externally.\n", + "Whilst there is a vast amount of data available for us to build new intelligent\n", + "solutions for driving efficiency for better processes, better decision-making,\n", + "and better policies — there are numerous barriers that can trap the data,\n", + "such as legacy systems, data silos, fragmented standards, proprietary\n", + "formats, etc. Modeling data solutions as data products and standardizing\n", + "them to a unified format allows us to abstract such barriers and truly\n", + "leverage the data ecosystem.\n", + "\n", + "\n", + "**4. Ensuring the security and resilience of the infrastructure on which**\n", + "**data relies** — Reflecting on the vision of the year 2025 — this isn’t that far\n", + "from now and even in a not so distant future, we will be required to rethink\n", + "our approach to data, more specifically — what is our digital supply chain\n", + "infrastructure/data sharing infrastructure? Data and data assets are products\n", + "and should be managed as products. If data is a product, we need a coherent\n", + "and unified way of providing those products.\n", + "\n", + "If data is to be used across industries and across both private and public\n", + "sectors, we need an open protocol that drives adoption and habit generation.\n", + "To drive adoption, the technologies we use must be resilient, robust, trusted\n", + "and usable by/for all. 
Vendor lock-in, platform lock-in or cloud lock-in are all\n", + "boundaries to achieving this vision.\n", + "\n", + "**5. Championing the international flow of data** — Data exchange between\n", + "jurisdictions and across governments will likely be one of the most\n", + "transformative applications of data at scale. Some of the world’s toughest\n", + "challenges depend on the efficient exchange of data between governments\n", + "— prevention of criminal activities, counterterrorism activities, net-zero\n", + "emission goals, international trade, the list goes on and on. Some steps in\n", + "this direction are already materializing: the U.S. federal government and UK\n", + "government have agreed on data exchange for countering serious crime\n", + "activities. This is a true example of championing international flow data and\n", + "using data for good. It is imperative that for these use cases, we approach\n", + "data sharing from a security-first angle. Data sharing standards and protocols\n", + "need to adhere to security and privacy best practices.\n", + "\n", + "\n", + "-----\n", + "\n", + "While originally built with a focus on the UK government and how to better\n", + "integrate data as a key asset of a modern government, these concepts apply in\n", + "a much wider global public sector context. In the same spirit, the U.S. Federal\n", + "Government proposed the [Federal Data Strategy](https://strategy.data.gov/overview/) as a collection of principles,\n", + "practices, action steps and timeline through which government can leverage\n", + "the full value of Federal data for mission, service and the public good.\n", + "\n", + "The principles are grouped into three primary topics:\n", + "\n", + "**•** **Ethical governance** — Within the domain of ethics, the sharing of data\n", + "is a fundamental tool for promoting transparency, accountability and\n", + "explainability of decision-making. It is practically impossible to uphold\n", + "ethics without some form of audit conducted by an independent party.\n", + "Data (and metadata) exchange is a critical enabler for continuous robust\n", + "processes that ensure we are using the data for good and we are using data\n", + "we can trust.\n", + "\n", + "\n", + "\n", + "**•** **Conscious design** — These principles are strongly aligned with the idea of\n", + "data sustainability. The guidelines promote forward thinking around usability\n", + "and interoperability of the data and user-centric design principles of\n", + "sustainable data products.\n", + "\n", + "**•** **Learning culture** — Data sharing, or alternatively knowledge sharing, has\n", + "an important role in building a scalable learning ecosystem and learning\n", + "culture. Data is front and center of knowledge synthesis, and from a\n", + "scientific angle, data proves factual knowledge. 
Another critical component\n", + "of knowledge is the “Why?” and data is what we need to address the\n", + "“Why?” component of any decisions we make, which policy to enforce, who\n", + "to sanction, who to support with grants, how to improve the efficiency of\n", + "government services, how to better serve citizens and society.\n", + "\n", + "In contrast to afore discussed qualitative analysis of the value of data sharing\n", + "across governments, the European Commission forecasts the economic value\n", + "of the European data economy will [exceed €800 billion by 2027](https://commission.europa.eu/strategy-and-policy/priorities-2019-2024/europe-fit-digital-age/european-data-strategy_en) — roughly the\n", + "same size as the [Dutch economy in 2021](https://ec.europa.eu/eurostat/databrowser/view/NAMA_10_GDP/default/table?lang=en&category=na10.nama10.nama_10_ma) ! Furthermore, they predict more than 10\n", + "million data professionals in Europe alone. The technology and infrastructure to\n", + "support the data society have to be accessible to all, interoperable, extensible,\n", + "flexible and open. Imagine a world in which you’d need a different truck to\n", + "transport products between different warehouses because each road requires a\n", + "different set of tires — the whole supply chain would collapse. When it comes to\n", + "data, we often experience the “one set of tires for one road” paradox. Rest APIs\n", + "and data exchange protocols have been proposed in the past but have failed\n", + "to address the need for simplicity, ease of use and cost of scaling up with the\n", + "number of data products.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Sharing — the new data**\n", + "**highway**\n", + "\n", + "Delta Sharing provides an open protocol for\n", + "secure data sharing to any computing platform.\n", + "The protocol is based on Delta data format and is\n", + "agnostic concerning the cloud of choice.\n", + "\n", + "Delta is an open source data format that avoids\n", + "vendor, platform and cloud lock-in, thus fully\n", + "adhering to the principles of data sustainability,\n", + "conscious design of the U.S. Federal Data Strategy\n", + "and mission 4 of the UK National Data Strategy.\n", + "Delta provides a governance layer on top of the\n", + "Parquet data format. Furthermore, it provides many\n", + "performance optimizations not available in Parquet\n", + "out of the box. The openness of the data format\n", + "is a critical consideration. It is the main factor for\n", + "driving the habit generation and adoption of best\n", + "practices and standards.\n", + "\n", + "\n", + "-----\n", + "\n", + "Delta Sharing is a protocol based on a lean set of REST APIs to manage sharing,\n", + "permissions and access to any data asset stored in Delta or Parquet formats.\n", + "The protocol defines two main actors, the data provider (data supplier, data\n", + "owner) and the data recipient (data consumer). The recipient, by definition, is\n", + "agnostic to the data format at the source. 
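\n", + "\n", + "As a quick illustration of how lightweight the recipient side is, the open source\n", + "delta-sharing Python connector can read a shared table with nothing more than a\n", + "credential (profile) file. The snippet below is a minimal sketch; the profile path\n", + "and the share/schema/table coordinates are placeholders, not names defined\n", + "elsewhere in this blog:\n", + "\n", + "```\n", + "import delta_sharing\n", + "\n", + "# Placeholder profile file downloaded from the data provider, plus\n", + "# placeholder share/schema/table coordinates.\n", + "profile_file = \"config.share\"\n", + "table_url = profile_file + \"#retail_share.public.customers\"\n", + "\n", + "# List everything the provider has shared with this recipient ...\n", + "client = delta_sharing.SharingClient(profile_file)\n", + "print(client.list_all_tables())\n", + "\n", + "# ... and load one shared table directly into a pandas DataFrame.\n", + "df = delta_sharing.load_as_pandas(table_url)\n", + "print(df.head())\n", + "```\n", + "\n", + "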
Delta Sharing provides the necessary\n", + "abstractions for governed data access in many different languages and tools.\n", + "\n", + "Delta Sharing is uniquely positioned to answer many of the challenges of data\n", + "sharing in a scalable manner within the context of highly regulated domains like\n", + "the public sector:\n", + "\n", + "**• Privacy and security concerns** — Personally identifiable data or otherwise\n", + "sensitive or restricted data is a major part of the data exchange needs of a\n", + "data-driven and modernized government. Given the sensitive nature of such\n", + "data, it is paramount that the governance of data sharing is maintained in a\n", + "coherent and unified manner. Any unnecessary process and technological\n", + "complexities increase the risk of over-sharing data. With this in mind,\n", + "Delta Sharing has been designed with [security best practices](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html) from the\n", + "very inception. The protocol provides end-to-end encryption, short-lived\n", + "credentials, and accessible and intuitive audit and governance features. All\n", + "of these capabilities are available in a centralized way across all your Delta\n", + "tables across all clouds.\n", + "\n", + "**• Quality and accuracy** — Another challenge of data sharing is ensuring\n", + "that the data being shared is of high quality and accuracy. Given that\n", + "the underlying data is stored as Delta tables, we can guarantee that the\n", + "[transactional nature of data](https://docs.delta.io/latest/concurrency-control.html#concurrency-control) is respected; Delta ensures ACID properties\n", + "of data. Furthermore, Delta supports [data constraints](https://docs.delta.io/latest/delta-constraints.html#constraints) to guarantee data\n", + "\n", + "\n", + "quality requirements at storage. Unfortunately, other formats such as [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) ,\n", + "[CSVW](https://csvw.org/) , [ORC](https://www.google.com/search?q=orc+data+format&rlz=1C5GCEM_enGB931GB932&ei=CzHRY6KqI4S78gL7hoigCw&oq=ORC+da&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQARgAMgUIABCRAjIFCAAQkQIyBQgAEIAEMgUIABCABDIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjoKCAAQRxDWBBCwAzoHCAAQsAMQQzoNCAAQ5AIQ1gQQsAMYAToPCC4Q1AIQyAMQsAMQQxgCOgwILhDIAxCwAxBDGAI6FQguEMcBENEDENQCEMgDELADEEMYAjoECAAQQzoGCAAQChBDOgoIABCxAxCDARBDOgcIABCxAxBDSgQIQRgASgQIRhgBUCxY3AJg3QxoAXABeACAAW6IAbgCkgEDMC4zmAEAoAEByAETwAEB2gEGCAEQARgJ2gEGCAIQARgI&sclient=gws-wiz-serp) , [Avro](https://en.wikipedia.org/wiki/Apache_Avro) , [XML](https://en.wikipedia.org/wiki/XML) , etc., do not have such properties without significant\n", + "additional effort. The issue becomes even more emphasized by the fact\n", + "that data quality cannot be ensured in the same way on both the data\n", + "provider and data recipient side without the exact reimplementation of the\n", + "source systems. It is critical to embed quality and metadata together with\n", + "data to ensure quality travels together with data. Any decoupled approach\n", + "to managing data, metadata and quality separately increases the risk of\n", + "sharing and can lead to undesirable outcomes.\n", + "\n", + "**• Lack of standardization** — Another challenge of data sharing is the lack\n", + "of standardization in how data is collected, organized, and stored. This is\n", + "particularly pronounced in the context of governmental activities. 
While\n", + "governments have proposed standard formats (e.g., Office for National\n", + "Statistics [promotes usage of CSVW](https://www.ons.gov.uk/aboutus/transparencyandgovernance/datastrategy/datastandards#metadata-exchange) ), aligning all private and public\n", + "sector companies to standards proposed by such initiatives is a massive\n", + "challenge. Other industries may have different requirements for scalability,\n", + "interoperability, format complexity, lack of structure in data, etc. Most of\n", + "the currently advocated standards are lacking in multiple such aspects.\n", + "Delta is the most mature candidate for assuming the central role in the\n", + "standardization of data exchange format. It has been built as a transactional\n", + "and scalable data format, it supports structured, semi-structured and\n", + "unstructured data, it stores data schema and metadata together with data\n", + "and it provides a scalable enterprise-grade sharing protocol through Delta\n", + "Sharing. Finally, Delta is one of the most popular open source projects\n", + "in the ecosystem and, since May 2022, has surpassed [7 million monthly](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/)\n", + "[downloads](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**• Cultural and organizational barriers** — These challenges can be\n", + "summarized by one word: friction. Unfortunately, it’s a common problem\n", + "for civil servants to struggle to obtain access to both internal and external\n", + "data due to over-cumbersome processes, policies and outdated standards.\n", + "The principles we are using to build our data platforms and our data sharing\n", + "platforms have to be self-promoting, have to drive adoption and have to\n", + "generate habits that adhere to best practices.\n", + "\n", + "If there is friction with standard adoption, the only way to ensure standards\n", + "are respected is by enforcement and that itself is yet another barrier to\n", + "achieving data sustainability. Organizations have already adopted Delta\n", + "Sharing both in the private and public sectors. For example, [U.S. Citizenship](https://www.uscis.gov/)\n", + "[and Immigration Services](https://www.uscis.gov/) (USCIS) uses Delta Sharing to satisfy several\n", + "[interagency data-sharing](https://delta.io/blog/2022-12-08-data-sharing-across-government-delta-sharing/) requirements. Similarly, Nasdaq describes Delta\n", + "Sharing as the “ [future of financial data sharing,](https://www.nasdaq.com/articles/delta-sharing-protocol%3A-the-evolution-of-financial-data-sharing-2021-05-26) ” and that future is open\n", + "and governed.\n", + "\n", + "\n", + "\n", + "**• Technical challenges** — Federation at the government scale or even\n", + "further across multiple industries and geographies poses technical\n", + "challenges. Each organization within this federation owns its platform\n", + "and drives technological, architectural, platform and tooling choices.\n", + "\n", + "How can we promote interoperability and data exchange in this vast,\n", + "diverse technological ecosystem? The data is the only viable integration\n", + "vehicle. 
As long as the data formats we utilize are scalable, open and\n", + "governed, we can use them to abstract from individual platforms and\n", + "their intrinsic complexities.\n", + "\n", + "Delta format and Delta Sharing solve this wide array of requirements and\n", + "challenges in a scalable, robust and open way. This positions Delta Sharing\n", + "as the strongest choice for unification and simplification of the protocol and\n", + "mechanism through which we share data across both private and public sectors.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data Sharing through data clean rooms**\n", + "\n", + "\n", + "[Data clean rooms](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html) address this particular need. With data clean rooms you can\n", + "share data with third parties in a privacy-safe environment. With Unity Catalog ,\n", + "you can enable fine-grained access controls on the data and meet your privacy\n", + "requirements. In this architecture, the data participants never get access to\n", + "the raw data. The only outputs from the clean rooms are those data assets\n", + "generated in a pre-agreed, governed and fully controlled manner that ensures\n", + "compliance with the requirements of all parties involved.\n", + "\n", + "Finally, data clean rooms and Delta Sharing can address hybrid on-premise-offpremise deployments, where the data with the most restricted access remains\n", + "on the premise. In contrast, less restricted data is free to leverage the power\n", + "of the cloud offerings. In said scenario, there may be a need to combine the\n", + "power of the cloud with the restricted data to solve advanced use cases where\n", + "capabilities are unavailable on the on-premises data platforms. Data clean rooms\n", + "can ensure that no physical data copies of the raw restricted data are created,\n", + "results are produced within the clean room’s controlled environment and results\n", + "are shared back to the on-premises environment (if the results maintain the\n", + "restricted access within the defined policies) or are forwarded to any other\n", + "compliant and predetermined destination system.\n", + "\n", + "\n", + "Taking the complexities of data sharing within highly regulated space and the\n", + "public sector one step further — what if we require to share the knowledge\n", + "contained in the data without ever granting direct access to the source data to\n", + "external parties? These requirements may prove achievable and desirable where\n", + "the data sharing risk appetite is very low.\n", + "\n", + "In many public sector contexts, there are concerns that combining the data that\n", + "describes citizens could lead to a big brother scenario where simply too much\n", + "data about an individual is concentrated in a single data asset. If it were to fall\n", + "into the wrong hands, such a hypothetical data asset could lead to immeasurable\n", + "consequences for individuals and the trust in public sector services could\n", + "erode. On the other hand, the value of a 360 view of the citizen could accelerate\n", + "important decision-making. 
It could immensely improve the quality of policies\n", + "and services provided to the citizens.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Citizen value of data sharing**\n", + "\n", + "Every decision made by the government is a decision that affects its citizens.\n", + "Whether the decision is a change to a policy, granting a benefit or preventing\n", + "crime, it can significantly influence the quality of our society. Data is a key factor\n", + "in making the right decisions and justifying the decisions made. Simply put,\n", + "we can’t expect high-quality decisions without the high quality of data and a\n", + "complete view of the data (within the permitted context). Without data sharing,\n", + "we will remain in a highly fragmented position where our ability to make those\n", + "decisions is severely limited or even completely compromised. In this blog, we\n", + "have covered several technological solutions available within the lakehouse that\n", + "can derisk and accelerate how the government is leveraging the data ecosystem\n", + "in a sustainable and scalable way.\n", + "\n", + "For more details on the industry use cases that Delta Sharing is addressing\n", + "please consult [A New Approach to Data Sharing](https://www.databricks.com/product/unity-catalog) eBook.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 03\n", + "\n", + "\n", + "### Ready-to-Use Notebooks and Data Sets\n", + "\n", + "\n", + "-----\n", + "\n", + "**Digital Twins**\n", + "\n", + "Leverage digital twins — virtual\n", + "representations of devices and\n", + "objects — to optimize operations and\n", + "gain insights\n", + "\n", + "\n", + "This section includes several Solution Accelerators — free, ready-to-use\n", + "\n", + "examples of data solutions from different industries ranging from retail to\n", + "\n", + "manufacturing and healthcare. Each of the following scenarios includes\n", + "\n", + "notebooks with code and step-by-step instructions to help you get\n", + "\n", + "started. 
Get hands-on experience with the Databricks Lakehouse Platform\n", + "\n", + "\n", + "by trying the following for yourself: **[Explore the Solution](https://databricks.com/solutions/accelerators/digital-twins)**\n", + "\n", + "\n", + "**Overall Equipment**\n", + "**Effectiveness**\n", + "\n", + "Ingest equipment sensor data for\n", + "metric generation and data driven\n", + "decision-making\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)**\n", + "\n", + "**Real-time point of**\n", + "**sale analytics**\n", + "\n", + "Calculate current inventories for\n", + "various products across multiple store\n", + "locations with Delta Live Tables\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/real-time-point-of-sale-analytics)**\n", + "\n", + "\n", + "**Recommendation Engines**\n", + "**for Personalization**\n", + "\n", + "Improve customers’ user experience\n", + "and conversion with personalized\n", + "recommendations\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n", + "\n", + "**Understanding Price**\n", + "**Transparency Data**\n", + "\n", + "Efficiently ingest large healthcare data\n", + "sets to create price transparency for\n", + "better understanding of healthcare costs\n", + "\n", + "**[Explore the Solution](https://www.databricks.com/solutions/accelerators/price-transparency-data)**\n", + "\n", + "Additional Solution Accelerators with ready-to-use notebooks can be found here:\n", + "\n", + "**[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 04\n", + "\n", + "\n", + "### Case Studies\n", + "\n", + "**4.1** Akamai\n", + "\n", + "**4.2** Grammarly\n", + "\n", + "**4.3** Honeywell\n", + "\n", + "**4.4** Wood Mackenzie\n", + "\n", + "**4.5** Rivian\n", + "\n", + "**4.6** AT&T\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.1\n", + "**Akamai delivers real-time security**\n", + "**analytics using Delta Lake**\n", + "\n", + "\n", + "###### <1\n", + "\n", + "**Min ingestion time,**\n", + "**reduced from 15 min**\n", + "\n", + "\n", + "###### <85%\n", + "\n", + "**Of queries have a response**\n", + "**time of 7 seconds or less**\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n", + "\n", + "**S O L U T I O N**\n", + "[Threat Detection](https://databricks.com/solutions/accelerators/threat-detection)\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Delta Lake, Data Streaming, Photon,\n", + "[Databricks SQL](https://databricks.com/product/databricks-sql)\n", + "\n", + "**C LO U D**\n", + "[Azure](https://www.databricks.com/product/azure)\n", + "\n", + "\n", + "Akamai runs a pervasive, highly distributed content delivery network (CDN). Its CDN\n", + "\n", + "uses approximately 345,000 servers in more than 135 countries and over 1,300 networks\n", + "\n", + "worldwide to route internet traffic for some of the largest enterprises in media, commerce,\n", + "\n", + "finance, retail and many other industries. About 30% of the internet’s traffic flows through\n", + "\n", + "Akamai servers. 
Akamai also provides cloud security solutions.\n", + "\n", + "In 2018, the company launched a web security analytics tool that offers Akamai customers\n", + "\n", + "a single, unified interface for assessing a wide range of streaming security events and\n", + "\n", + "performing analysis of those events. The web analytics tool helps Akamai customers to\n", + "\n", + "take informed actions in relation to security events in real time. Akamai is able to stream\n", + "\n", + "massive amounts of data and meet the strict SLAs it provides to customers by leveraging\n", + "\n", + "Delta Lake and the Databricks Lakehouse Platform for the web analytics tool.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Ingesting and streaming enormous amounts of data**\n", + "\n", + "Akamai’s web security analytics tool ingests approximately 10GB of data related\n", + "to security events per second. Data volume can increase significantly when\n", + "retail customers conduct a large number of sales — or on big shopping days like\n", + "Black Friday or Cyber Monday. The web security analytics tool stores several\n", + "petabytes of data for analysis purposes. Those analyses are performed to\n", + "protect Akamai’s customers and provide them with the ability to explore and\n", + "query security events on their own.\n", + "\n", + "The web security analytics tool initially relied on an on-premises architecture\n", + "running Apache Spark™ on Hadoop. Akamai offers strict service level agreements\n", + "(SLAs) to its customers of 5 to 7 minutes from when an attack occurs until it is\n", + "displayed in the tool. The company sought to improve ingestion and query speed\n", + "to meet those SLAs. “Data needs to be as real-time as possible so customers\n", + "can see what is attacking them,” says Tomer Patel, Engineering Manager at\n", + "Akamai. “Providing queryable data to customers quickly is critical. We wanted to\n", + "move away from on-prem to improve performance and our SLAs so the latency\n", + "would be seconds rather than minutes.”\n", + "\n", + "**Delta Lake allows us to not only query the data better but to**\n", + "**also acquire an increase in the data volume. We’ve seen an**\n", + "**80% increase in traffic and data in the last year, so being able**\n", + "**to scale fast is critical.**\n", + "\n", + "\n", + "After conducting proofs of concept with several companies, Akamai chose to\n", + "base its streaming analytics architecture on Spark and the Databricks Lakehouse\n", + "Platform. “Because of our scale and the demands of our SLA, we determined that\n", + "Databricks was the right solution for us,” says Patel. “When we consider storage\n", + "optimization, and data caching, if we went with another solution, we couldn’t\n", + "achieve the same level of performance.”\n", + "\n", + "**Improving speed and reducing costs**\n", + "\n", + "Today, the web security analytics tool ingests and transforms data, stores it\n", + "in cloud storage, and sends the location of the file via Kafka. It then uses a\n", + "Databricks Job as the ingest application. Delta Lake, the open source storage\n", + "format at the base of the Databricks Lakehouse Platform, supports real-time\n", + "querying on the web security analytics data. Delta Lake also enables Akamai to\n", + "scale quickly. “Delta Lake allows us to not only query the data better but to also\n", + "acquire an increase in the data volume,” says Patel. 
“We’ve seen an 80% increase\n", + "in traffic and data in the last year, so being able to scale fast is critical.”\n", + "\n", + "Akamai also uses Databricks SQL (DBSQL) and Photon, which provide extremely\n", + "\n", + "fast query performance. Patel added that Photon provided a significant boost\n", + "to query performance. Overall, Databricks’ streaming architecture combined\n", + "with DBSQL and Photon enables Akamai to achieve real-time analytics, which\n", + "translates to real-time business benefits.\n", + "\n", + "\n", + "**Tomer Patel**\n", + "Engineering Manager, Akamai\n", + "\n", + "\n", + "-----\n", + "\n", + "Patel says he likes that Delta Lake is open source, as the company has benefitted\n", + "from a community of users working to improve the product. “The fact that Delta\n", + "Lake is open source and there’s a big community behind it means we don’t need\n", + "to implement everything ourselves,” says Patel. “We benefit from fixed bugs that\n", + "others have encountered and from optimizations that are contributed to the\n", + "project.” Akamai worked closely with Databricks to ensure Delta Lake can meet\n", + "the scale and performance requirements Akamai defined. These improvements\n", + "have been contributed back to the project (many of which were made available as\n", + "part of Delta Lake 2.0), and so any user running Delta Lake now benefits from the\n", + "technology being tested at such a large scale in a real-world production scenario.\n", + "\n", + "\n", + "**Meeting aggressive requirements for scale,**\n", + "**reliability and performance**\n", + "\n", + "Using Spark Structured Streaming on the Databricks Lakehouse Platform enables\n", + "the web security analytics tool to stream vast volumes of data and provide\n", + "low-latency, real-time analytics-as-a-service to Akamai’s customers. That way\n", + "Akamai is able to make available security event data to customers within the\n", + "SLA of 5 to 7 minutes from when an attack occurs. “Our focus is performance,\n", + "performance, performance,” says Patel. “The platform’s performance and\n", + "scalability are what drives us.”\n", + "\n", + "Using the Databricks Lakehouse Platform, it now takes under 1 minute to ingest\n", + "the security event data. “Reducing ingestion time from 15 minutes to under 1\n", + "minute is a huge improvement,” says Patel. “It benefits our customers because\n", + "they can see the security event data faster and they have a view of what exactly\n", + "is happening as well as the capability to filter all of it.”\n", + "\n", + "Akamai’s biggest priority is to provide customers with a good experience and\n", + "fast response times. To date, Akamai has moved about 70% of security event\n", + "data from its on-prem architecture to Databricks, and the SLA for customer\n", + "query and response time has improved significantly as a result. 
“Now, with the\n", + "move to Databricks, our customers experience much better response time, with\n", + "over 85% of queries completing under 7 seconds.” Providing that kind of realtime data means Akamai can help its customers stay vigilant and maintain an\n", + "optimal security configuration.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.2\n", + "**Grammarly uses Databricks Lakehouse to improve**\n", + "**user experience**\n", + "\n", + "\n", + "###### 110%\n", + "\n", + "**Faster querying, at 10% of the cost**\n", + "**to ingest, than a data warehouse**\n", + "\n", + "\n", + "###### 5 billion\n", + "\n", + "**Daily events available for**\n", + "**analytics in under 15 minutes**\n", + "\n", + "\n", + "Grammarly’s mission is to improve lives by improving communication. The company’s\n", + "\n", + "trusted AI-powered communication assistance provides real-time suggestions to\n", + "\n", + "help individuals and teams write more confidently and achieve better results. Its\n", + "\n", + "comprehensive offerings — [Grammarly Premium](https://www.grammarly.com/premium) , [Grammarly Business](https://www.grammarly.com/business) , [Grammarly for](https://www.grammarly.com/edu)\n", + "\n", + "[Education](https://www.grammarly.com/edu) and [Grammarly for Developers](https://developer.grammarly.com/) — deliver leading communication support\n", + "\n", + "wherever writing happens. As the company grew over the years, its legacy, homegrown\n", + "\n", + "analytics system made it challenging to evaluate large data sets quickly and cost-\n", + "\n", + "effectively.\n", + "\n", + "By migrating to the Databricks Lakehouse Platform, Grammarly is now able to sustain a\n", + "\n", + "flexible, scalable and highly secure analytics platform that helps 30 million people and\n", + "\n", + "50,000 teams worldwide write more effectively every day.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n", + "\n", + "**S O L U T I O N**\n", + "Recommendation Engines, Advertising\n", + "Effectiveness, Customer Lifetime Value\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Lakehouse, Delta Lake, Unity Catalog,\n", + "[Machine Learning, ETL](https://www.databricks.com/product/machine-learning)\n", + "\n", + "**C LO U D**\n", + "[AWS](https://www.databricks.com/product/aws)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Harnessing data to improve communications for millions of**\n", + "**users and thousands of teams**\n", + "\n", + "When people use Grammarly’s AI communication assistance, they receive\n", + "suggestions to help them improve multiple dimensions of communication,\n", + "including spelling and grammar correctness, clarity and conciseness, word\n", + "choice, style, and tone. Grammarly receives feedback when users accept, reject\n", + "or ignore its suggestions through app-created events, which total about 5 billion\n", + "events per day.\n", + "\n", + "Historically, Grammarly relied on a homegrown legacy analytics platform and\n", + "leveraged an in-house SQL-like language that was time-intensive to learn and\n", + "made it challenging to onboard new hires. 
As the company grew, Grammarly\n", + "data analysts found that the platform did not sufficiently meet the needs of its\n", + "essential business functions, especially marketing, sales and customer success.\n", + "Analysts found themselves copying and pasting data from spreadsheets\n", + "because the existing system couldn’t effectively ingest the external data needed\n", + "to answer questions such as, “Which marketing channel delivers the highest\n", + "ROI?” Reporting proved challenging because the existing system didn’t support\n", + "Tableau dashboards, and company leaders and analysts needed to ensure they\n", + "could make decisions quickly and confidently.\n", + "\n", + "\n", + "**Databricks Lakehouse has given us the flexibility to unleash**\n", + "**our data without compromise. That flexibility has allowed us**\n", + "**to speed up analytics to a pace we’ve never achieved before.**\n", + "\n", + "**Chris Locklin**\n", + "Engineering Manager, Data Platforms, Grammarly\n", + "\n", + "Grammarly also sought to unify its data warehouses in order to scale and\n", + "improve data storage and query capabilities. As it stood, large Amazon EMR\n", + "clusters ran 24/7 and drove up costs. With the various data sources, the team\n", + "also needed to maintain access control. “Access control in a distributed file\n", + "system is difficult, and it only gets more complicated as you ingest more data\n", + "sources,” says Chris Locklin, Engineering Manager, Data Platforms at Grammarly.\n", + "Meanwhile, reliance on a single streaming workflow made collaboration among\n", + "teams challenging. Data silos emerged as different business areas implemented\n", + "analytics tools individually. “Every team decided to solve their analytics needs in\n", + "the best way they saw fit,” says Locklin. “That created challenges in consistency\n", + "and knowing which data set was correct.”\n", + "\n", + "\n", + "-----\n", + "\n", + "As its data strategy was evolving, Grammarly’s priority was to get the most out\n", + "of analytical data while keeping it secure. This was crucial because security is\n", + "Grammarly’s number-one priority and most important feature, both in how it\n", + "protects its users’ data and how it ensures its own company data remains secure.\n", + "To accomplish that, Grammarly’s data platform team sought to consolidate\n", + "data and unify the company on a single platform. That meant sustaining a highly\n", + "secure infrastructure that could scale alongside the company’s growth, improving\n", + "ingestion flexibility, reducing costs and fueling collaboration.\n", + "\n", + "**Improving analytics, visualization and decision-making**\n", + "**with the lakehouse**\n", + "\n", + "After conducting several proofs of concept to enhance its infrastructure,\n", + "Grammarly migrated to the Databricks Lakehouse Platform. Bringing all the\n", + "analytical data into the lakehouse created a central hub for all data producers\n", + "and consumers across Grammarly, with Delta Lake at the core.\n", + "\n", + "Using the lakehouse architecture, data analysts within Grammarly now have a\n", + "consolidated interface for analytics, which leads to a single source of truth and\n", + "\n", + "confidence in the accuracy and availability of all data managed by the data\n", + "platform team. Across the organization, teams are using Databricks SQL to\n", + "conduct queries within the platform on both internally generated product data\n", + "and external data from digital advertising platform partners. 
Now, they can easily\n", + "connect to Tableau and create dashboards and visualizations to present to\n", + "executives and key stakeholders.\n", + "\n", + "\n", + "“Security is of utmost importance at Grammarly, and our team’s numberone objective is to own and protect our analytical data,” says Locklin. “Other\n", + "companies ask for your data, hold it for you, and then let you perform analytics\n", + "on it. Just as Grammarly ensures our users’ data always remains theirs, we\n", + "wanted to ensure our company data remained ours. Grammarly’s data stays\n", + "inside of Grammarly.”\n", + "\n", + "With its data consolidated in the lakehouse, different areas of Grammarly’s\n", + "business can now analyze data more thoroughly and effectively. For example,\n", + "Grammarly’s marketing team uses advertising to attract new business. Using\n", + "Databricks, the team can consolidate data from various sources to extrapolate\n", + "a user’s lifetime value, compare it with customer acquisition costs and get rapid\n", + "feedback on campaigns. Elsewhere, data captured from user interactions flow\n", + "into a set of tables used by analysts for ad hoc analysis to inform and improve\n", + "the user experience.\n", + "\n", + "By consolidating data onto one unified platform, Grammarly has eliminated data\n", + "silos. “The ability to bring all these capabilities, data processing and analysis\n", + "under the same platform using Databricks is extremely valuable,” says Sergey\n", + "Blanket, Head of Business Intelligence at Grammarly. “Doing everything from ETL\n", + "and engineering to analytics and ML under the same umbrella removes barriers\n", + "and makes it easy for everyone to work with the data and each other.”\n", + "\n", + "\n", + "-----\n", + "\n", + "To manage access control, enable end-to-end observability and monitor data\n", + "quality, Grammarly relies on the data lineage capabilities within Unity Catalog.\n", + "“Data lineage allows us to effectively monitor usage of our data and ensure it\n", + "upholds the standards we set as a data platform team,” says Locklin. “Lineage is\n", + "the last crucial piece for access control. It allows analysts to leverage data to do\n", + "their jobs while adhering to all usage standards and access controls, even when\n", + "recreating tables and data sets in another environment.”\n", + "\n", + "**Faster time to insight drives more intelligent**\n", + "**business decisions**\n", + "\n", + "Using the Databricks Lakehouse Platform, Grammarly’s engineering teams now\n", + "have a tailored, centralized platform and a consistent data source across the\n", + "company, resulting in greater speed and efficiency and reduced costs. The\n", + "lakehouse architecture has led to 110% faster querying, at 10% of the cost to\n", + "ingest, than a data warehouse. Grammarly can now make its 5 billion daily events\n", + "available for analytics in under 15 minutes rather than 4 hours, enabling lowlatency data aggregation and query optimization. This allows the team to quickly\n", + "\n", + "receive feedback about new features being rolled out and understand if they are\n", + "being adopted as expected. Ultimately, it helps them understand how groups\n", + "of users engage with the UX, improving the experience and ensuring features\n", + "and product releases bring the most value to users. 
“Everything my team does\n", + "is focused on creating a rich, personalized experience that empowers people to\n", + "communicate more effectively and achieve their potential,” says Locklin.\n", + "\n", + "\n", + "Moving to the lakehouse architecture also solved the challenge of access control\n", + "over distributed file systems, while Unity Catalog enabled fine-grained, rolebased access controls and real-time data lineage. “Unity Catalog gives us the\n", + "ability to manage file permissions with more flexibility than a database would\n", + "allow,” says Locklin. “It solved a problem my team couldn’t solve at scale. While\n", + "using Databricks allows us to keep analytical data in-house, Unity Catalog helps\n", + "us continue to uphold the highest standards of data protection by controlling\n", + "access paradigms inside our data. That opens a whole new world of things that\n", + "we can do.”\n", + "\n", + "Ultimately, migrating to the Databricks Lakehouse Platform has helped\n", + "Grammarly to foster a data-driven culture where employees get fast access\n", + "to analytics without having to write complex queries, all while maintaining\n", + "Grammarly’s enterprise-grade security practices. “Our team’s mission is to help\n", + "Grammarly make better, faster business decisions,” adds Blanket. “My team\n", + "would not be able to effectively execute on that mission if we did not have a\n", + "platform like Databricks available to us.” Perhaps most critically, migrating off its\n", + "rigid legacy infrastructure gives Grammarly the adaptability to do more while\n", + "knowing the platform will evolve as its needs evolve. “Databricks has given us the\n", + "flexibility to unleash our data without compromise,” says Locklin. “That flexibility\n", + "has allowed us to speed up analytics to a pace we’ve never achieved before.”\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.3\n", + "**Honeywell selects Delta Live Tables for streaming data**\n", + "\n", + "Companies are under growing pressure to reduce energy use, while at the same time\n", + "\n", + "they are looking to lower costs and improve efficiency. Honeywell delivers industry-\n", + "\n", + "specific solutions that include aerospace products and services, control technologies\n", + "\n", + "for buildings and industry, and performance materials globally. Honeywell’s Energy\n", + "\n", + "and Environmental Solutions division uses IoT sensors and other technologies to help\n", + "\n", + "businesses worldwide manage energy demand, reduce energy consumption and carbon\n", + "\n", + "emissions, optimize indoor air quality, and improve occupant well-being.\n", + "\n", + "Accomplishing this requires Honeywell to collect vast amounts of data. 
Using Delta Live\n", + "\n", + "Tables on the Databricks Lakehouse Platform, Honeywell’s data team can now ingest\n", + "\n", + "billions of rows of sensor data into Delta Lake and automatically build SQL endpoints for\n", + "\n", + "real-time queries and multilayer insights into data at scale — helping Honeywell improve\n", + "\n", + "how it manages data and extract more value from it, both for itself and for its customers.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Lakehouse, Delta Lake, Delta Live Tables\n", + "\n", + "\n", + "**C LO U D**\n", + "[Azure](https://databricks.com/product/azure) **Databricks helps us pull together many different data sources, do**\n", + "**aggregations, and bring the significant amount of data we collect**\n", + "**from our buildings under control so we can provide customers value.**\n", + "\n", + "**Dr. Chris Inkpen**\n", + "Global Solutions Architect, Honeywell Energy and Environmental Solutions\n", + "\n", + "\n", + "-----\n", + "\n", + "**Processing billions of IoT data points per day**\n", + "\n", + "Honeywell’s solutions and services are used in millions of buildings around the\n", + "world. Helping its customers create buildings that are safe, more sustainable\n", + "and productive can require thousands of sensors per building. Those sensors\n", + "monitor key factors such as temperature, pressure, humidity and air quality.\n", + "In addition to the data collected by sensors inside a building, data is also\n", + "collected from outside, such as weather and pollution data. Another data set\n", + "consists of information about the buildings themselves — such as building\n", + "type, ownership, floor plan, square footage of each floor and square footage\n", + "of each room. That data set is combined with the two disparate data streams,\n", + "adding up to a lot of data across multiple structured and unstructured formats,\n", + "including images and video streams, telemetry data, event data, etc. At peaks,\n", + "Honeywell ingests anywhere between 200 to 1,000 events per second for any\n", + "building, which equates to billions of data points per day. Honeywell’s existing\n", + "data infrastructure was challenged to meet such demand. It also made it difficult\n", + "for Honeywell’s data team to query and visualize its disparate data so it could\n", + "provide customers with fast, high-quality information and analysis.\n", + "\n", + "**ETL simplified: high-quality, reusable data pipelines**\n", + "\n", + "With Delta Live Tables (DLT) on the Databricks Lakehouse Platform, Honeywell’s\n", + "data team can now ingest billions of rows of sensor data into Delta Lake and\n", + "automatically build SQL endpoints for real-time queries and multilayer insights\n", + "into data at scale. “We didn’t have to do anything to get DLT to scale,” says Dr.\n", + "\n", + "\n", + "Chris Inkpen, Global Solutions Architect at Honeywell Energy and Environmental\n", + "Solutions. “We give the system more data, and it copes. Out of the box, it’s given\n", + "us the confidence that it will handle whatever we throw at it.”\n", + "\n", + "Honeywell credits the Databricks Lakehouse Platform for helping it to unify its\n", + "vast and varied data — batch, streaming, structured and unstructured — into\n", + "one platform. “We have many different data types. 
The Databricks Lakehouse\n", + "Platform allows us to use things like Apache Kafka and Auto Loader to load and\n", + "process multiple types of data and treat everything as a stream of data, which is\n", + "awesome. Once we’ve got structured data from unstructured data, we can write\n", + "standardized pipelines.”\n", + "\n", + "Honeywell data engineers can now build and leverage their own ETL pipelines\n", + "with Delta Live Tables and gain insights and analytics quickly. ETL pipelines can\n", + "be reused regardless of environment, and data can run in batches or streams. It’s\n", + "also helped Honeywell’s data team transition from a small team to a larger team.\n", + "“When we wrote our first few pipelines before DLT existed, only one person could\n", + "work in one part of the functionality. Now that we’ve got DLT and the ability to\n", + "have folders with common functionality, we’ve got a really good platform where\n", + "we can easily spin off different pipelines.”\n", + "\n", + "DLT also helped Honeywell establish standard log files to monitor and costjustify its product pipelines. “Utilizing DLT, we can analyze which parts of our\n", + "pipeline need optimization,” says Inkpen. “With standard pipelines, that was\n", + "much more chaotic.”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Enabling ease, simplicity and scalability across the**\n", + "**infrastructure**\n", + "\n", + "Delta Live Tables has helped Honeywell’s data team consistently query\n", + "complex data while offering simplicity of scale. It also enables end-to-end data\n", + "visualization of Honeywell’s data streams as they flow into its infrastructure, are\n", + "transformed, and then flow out. “Ninety percent of our ETL is now captured in\n", + "diagrams, so that’s helped considerably and improves data governance. DLT\n", + "encourages — and almost enforces — good design,” says Inkpen.\n", + "\n", + "Using the lakehouse as a shared workspace has helped promote teamwork and\n", + "collaboration at Honeywell. “The team collaborates beautifully now, working\n", + "together every day to divvy up the pipeline into their own stories and workloads,”\n", + "says Inkpen.\n", + "\n", + "Meanwhile, the ability to manage streaming data with low latency and better\n", + "throughput has improved accuracy and reduced costs. “Once we’ve designed\n", + "something using DLT, we’re pretty safe from scalability issues — certainly a\n", + "hundred times better than if we hadn’t written it in DLT,” says Inkpen. “We can\n", + "then go back and look at how we can take a traditional job and make it more\n", + "performant and less costly. We’re in a much better position to try and do that\n", + "from DLT.”\n", + "\n", + "\n", + "Using Databricks and DLT also helps the Honeywell team perform with greater\n", + "agility, which allows them to innovate faster while empowering developers to\n", + "respond to user requirements almost immediately. “Our previous architecture\n", + "made it impossible to know what bottlenecks we had and what we needed to\n", + "scale. Now we can do data science in near real-time.”\n", + "\n", + "Ultimately, Honeywell can now more quickly provide its customers with the\n", + "data and analysis they need to make their buildings more efficient, healthier\n", + "and safer for occupants. “I’m continuously looking for ways to improve our\n", + "lifecycles, time to market, and data quality,” says Inkpen. 
“Databricks helps\n", + "us pull together many different data sources, do aggregations, and bring the\n", + "significant amount of data we collect from our buildings under control so we\n", + "can provide customers value.”\n", + "\n", + "**Ready to get started? Learn more about** **[Delta Live Tables here](https://www.databricks.com/product/delta-live-tables)** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.4\n", + "**Wood Mackenzie helps customers transition to a more**\n", + "**sustainable future**\n", + "\n", + "\n", + "###### 12 Billion\n", + "\n", + "**Data points processed**\n", + "**each week**\n", + "\n", + "\n", + "###### 80-90%\n", + "\n", + "**Reduction in**\n", + "**processing time**\n", + "\n", + "\n", + "###### Cost Savings\n", + "\n", + "**In operations through**\n", + "**workflow automation**\n", + "\n", + "\n", + "Wood Mackenzie offers customized consulting and analysis for a wide range of clients\n", + "\n", + "in the energy and natural resources sectors. Founded in Edinburgh, the company first\n", + "\n", + "cultivated deep expertise in upstream oil and gas, then broadened its focus to deliver\n", + "\n", + "detailed insight for every interconnected sector of the energy, chemicals, metals and\n", + "\n", + "mining industries.\n", + "\n", + "Today it sees itself playing an important role in the transition to a more sustainable\n", + "\n", + "future. Using Databricks Workflows to automate ETL pipelines helps Wood Mackenzie\n", + "\n", + "ingest and process massive amounts of data. Using a common workflow provided\n", + "\n", + "higher visibility to engineering team members, encouraging better collaboration. With\n", + "\n", + "an automated, transparent workflow in place, the team saw improved productivity and\n", + "\n", + "data quality and an easier path to fix pipeline issues when they arise.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Energy and Utilities](https://www.databricks.com/solutions/industries/oil-and-gas)\n", + "\n", + "**P L AT F O R M U S E C A S E**\n", + "Lakehouse, Workflows\n", + "\n", + "**C LO U D**\n", + "[AWS](https://www.databricks.com/product/aws)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering insights to the energy industry**\n", + "\n", + "Fulfilling Wood Mackenzie’s mission, the Lens product is a data analytics platform\n", + "built to deliver insights at key decision points for customers in the energy sector.\n", + "Feeding into Lens are vast amounts of data collected from various data sources\n", + "and sensors used to monitor energy creation, oil and gas production, and more.\n", + "Those data sources update about 12 billion data points every week that must\n", + "be ingested, cleaned and processed as part of the input for the Lens platform.\n", + "Yanyan Wu, Vice President of Data at Wood Mackenzie, manages a team of big\n", + "data professionals that build and maintain the ETL pipeline that provides input\n", + "data for Lens. The team is leveraging the Databricks Lakehouse Platform and\n", + "uses Apache Spark™ for parallel processing, which provides greater performance\n", + "and scalability benefits compared to an earlier single-node system working\n", + "sequentially. 
“We saw a reduction of 80-90% in data processing time, which\n", + "results in us providing our clients with more up-to-date, more complete and\n", + "more accurate data,” says Wu.\n", + "\n", + "**Our mission is to transform the way we power the planet.**\n", + "**Our clients in the energy sector need data, consulting services**\n", + "**and research to achieve that transformation. Databricks**\n", + "**Workflows gives us the speed and flexibility to deliver the**\n", + "**insights our clients need.**\n", + "\n", + "\n", + "**Improved collaboration and transparency with a common**\n", + "**workflow**\n", + "\n", + "The data pipeline managed by the team includes several stages for standardizing\n", + "and cleaning raw data, which can be structured or unstructured and may be in\n", + "the form of PDFs or even handwritten notes.\n", + "\n", + "Different members of the data team are responsible for different parts of\n", + "the pipeline, and there is a dependency between the processing stages each\n", + "team member owns. Using [Databricks Workflows](https://www.databricks.com/product/workflows) , the team defined a common\n", + "workstream that the entire team uses. Each stage of the pipeline is implemented\n", + "in a Python notebook, which is run as a job in the main workflow.\n", + "\n", + "Each team member can now see exactly what code is running on each stage,\n", + "making it easy to find the cause of the issue. Knowing who owns the part of the\n", + "pipeline that originated the problem makes fixing issues much faster. “Without\n", + "a common workflow, different members of the team would run their notebooks\n", + "independently, not knowing that failure in their run affected stages downstream,”\n", + "says Meng Zhang, Principal Data Analyst at Wood Mackenzie. “When trying to\n", + "rerun notebooks, it was hard to tell which notebook version was initially run and\n", + "the latest version to use.”\n", + "\n", + "\n", + "**Yanyan Wu**\n", + "Vice President of Data, Wood Mackenzie\n", + "\n", + "\n", + "-----\n", + "\n", + "Using Workflows’ alerting capabilities to notify the team when a workflow task\n", + "fails ensures everyone knows a failure occurred and allows the team to work\n", + "together to resolve the issue quickly. The definition of a common workflow\n", + "created consistency and transparency that made collaboration easier. “Using\n", + "Databricks Workflows allowed us to encourage collaboration and break up the\n", + "walls between different stages of the process,” explains Wu. “It allowed us all to\n", + "speak the same language.”\n", + "\n", + "Creating transparency and consistency is not the only advantage the team saw.\n", + "Using Workflows to automate notebook runs also led to cost savings compared\n", + "to running interactive notebooks manually.\n", + "\n", + "**Improved code development productivity**\n", + "\n", + "The team’s ETL pipeline development process involves iteration on PySpark\n", + "notebooks. 
Leveraging [interactive notebooks](https://www.databricks.com/product/collaborative-notebooks) in the Databricks UI makes it easy for data professionals on the team to manually develop and test a notebook.\n", + "Because Databricks Workflows supports running notebooks as a task type (along with Python files, JAR files and other types), when the code is ready for production, it’s easy and cost-effective to automate it by adding it to a workflow.\n", + "The workflow can then be easily revised by adding or removing any steps to or from the defined flow. This way of working keeps the benefit of manually developing notebooks with the interactive notebook UI while leveraging the power of automation, which reduces potential issues that may happen when running notebooks manually.\n", + "\n", + "The team has gone even further in increasing productivity by developing a CI/CD process. “By connecting our source control code repository, we know the workflow always runs the latest code version we committed to the repo,” explains Zhang. “It’s also easy to switch to a development branch to develop a new feature, fix a bug and run a development workflow. When the code passes all tests, it is merged back to the main branch and the production workflow is automatically updated with the latest code.”\n", + "\n", + "Going forward, Wood Mackenzie plans to optimize its use of Databricks Workflows to automate machine learning processes such as model training, model monitoring and handling model drift. The firm uses ML to improve its data quality and extract insights to provide more value to its clients. “Our mission is to transform how we power the planet,” Wu says. “Our clients in the energy sector need data, consulting services and research to achieve that transformation. Databricks Workflows gives us the speed and flexibility to deliver the insights our clients need.”\n", + "
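To make the task-and-dependency model Zhang describes concrete, here is a minimal sketch of defining a two-stage notebook workflow programmatically, assuming the `databricks-sdk` Python package. It is illustrative only, not Wood Mackenzie's actual pipeline; the job name, notebook paths, task keys, cluster ID and notification address are hypothetical placeholders.

```python
# Illustrative sketch only (hypothetical names and paths), assuming the databricks-sdk package.
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import jobs

w = WorkspaceClient()  # reads workspace host/token from the environment or a config profile

created = w.jobs.create(
    name="lens-etl-pipeline",  # hypothetical job name
    email_notifications=jobs.JobEmailNotifications(
        on_failure=["data-team@example.com"]  # mirrors the failure alerting described above
    ),
    tasks=[
        jobs.Task(
            task_key="standardize_raw_data",
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/lens/01_standardize"),
            existing_cluster_id="1234-567890-abcde123",  # placeholder cluster ID
        ),
        jobs.Task(
            task_key="clean_and_enrich",
            depends_on=[jobs.TaskDependency(task_key="standardize_raw_data")],
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/lens/02_clean"),
            existing_cluster_id="1234-567890-abcde123",
        ),
    ],
)
print(f"Created workflow {created.job_id}")
```

Because each stage is an explicit task with a named dependency, every team member can see what runs, in what order, and which task failed, which is the transparency and alerting behavior the case study credits.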
-----\n", + "\n", + "SECTION 4.5\n", + "**Rivian redefines driving experience with**\n", + "**the Databricks Lakehouse**\n", + "\n", + "###### 250 platform users\n", + "\n", + "**A 50x increase from a year ago**\n", + "\n", + "Rivian is preserving the natural world for future generations with revolutionary Electric\n", + "\n", + "Adventure Vehicles (EAVs). With over 25,000 EAVs on the road generating multiple\n", + "\n", + "terabytes of IoT data per day, the company is using data insights and machine\n", + "\n", + "learning to improve vehicle health and performance. However, with legacy cloud\n", + "\n", + "tooling, it struggled to scale pipelines cost-effectively and spent significant resources\n", + "\n", + "on maintenance — slowing its ability to be truly data driven.\n", + "\n", + "Since moving to the Databricks Lakehouse Platform, Rivian can now understand how\n", + "\n", + "a vehicle is performing and how this impacts the driver using it. Equipped with these\n", + "\n", + "insights, Rivian is innovating faster, reducing costs, and ultimately, delivering a better\n", + "\n", + "driving experience to customers.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Manufacturing](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)\n", + "\n", + "**S O L U T I O N**\n", + "Predictive Maintenance, Scaling ML Models\n", + "for IoT, Data-Driven ESG\n", + "\n", + "**P L AT F O R M**\n", + "[Lakehouse](https://www.databricks.com/product/data-lakehouse) , [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , [Unity Catalog](https://www.databricks.com/product/unity-catalog)\n", + "\n", + "**C LO U D**\n", + "[AWS](https://www.databricks.com/product/aws)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Struggling to democratize data on a legacy platform**\n", + "\n", + "Building a world that will continue to be enjoyed by future generations requires a shift in the way we operate. At the forefront of this movement is Rivian — an electric vehicle manufacturer focused on shifting our planet’s energy and transportation systems entirely away from fossil fuel. Today, Rivian’s fleet includes personal vehicles and involves a partnership with Amazon to deliver 100,000 commercial vans. Each vehicle uses IoT sensors and cameras to capture petabytes of data ranging from how the vehicle drives to how various parts function. With all this data at its fingertips, Rivian is using machine learning to improve the overall customer experience with predictive maintenance so that potential issues are addressed before they impact the driver.\n", + "\n", + "Before Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that decreased output, prevented collaboration and increased operational costs. It had 30 to 50 large and operationally complicated compute clusters at any given time, which was costly. Not only was the system difficult to manage, but the company experienced frequent cluster outages as well, forcing teams to dedicate more time to troubleshooting than to data analysis. Additionally, data silos created by disjointed systems slowed the sharing of data, which further contributed to productivity issues. Required data languages and specific expertise of toolsets created a barrier to entry that limited developers from making full use of the data available. Jason Shiverick, Principal Data Scientist at Rivian, said the biggest issue was the data access. “I wanted to open our data to a broader audience of less technical users so they could also leverage data more easily.”\n", + "\n", + "Rivian knew that once its EAVs hit the market, the amount of data ingested would explode. In order to deliver the reliability and performance it promised, Rivian needed an architecture that would not only democratize data access, but also provide a common platform to build innovative solutions that can help ensure a reliable and enjoyable driving experience.\n", + "\n", + "**Databricks Lakehouse empowers us to lower the barrier of**\n", + "**entry for data access across our organization so we can build**\n", + "**the most innovative and reliable electric vehicles in the world.**\n", + "\n", + "**Wassym Bensaid**\n", + "Vice President of Software Development, Rivian\n", + "\n", + "\n", + "-----\n", + "\n", + "**Predicting maintenance issues with Databricks Lakehouse**\n", + "\n", + "Rivian chose to modernize its data infrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all of its data into a common view for downstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver actionable insights for different use cases, from predictive maintenance to smarter product development. Venkat Sivasubramanian, Senior Director of Big Data at Rivian, says, “We were able to build a culture around an open data platform that provided a system for really democratizing data and analysis in an efficient way.” Databricks’ flexible support of all programming languages and seamless integration with a variety of toolsets eliminated access roadblocks and unlocked new opportunities. Wassym Bensaid, Vice President of Software Development at Rivian, explains, “Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build performant data pipelines, and extract actionable business and product insights via visual dashboards.”\n", + "\n", + "Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric accelerometer data to understand all EAV motions. This core recording data includes information about pitch, roll, speed, suspension and airbag activity, to help Rivian understand vehicle performance, driving patterns and connected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of smart features and the control that drivers have over them. Designed to take the stress out of long drives and driving in heavy traffic, features like adaptive cruise control, lane change assist, automatic emergency driving, and forward collision warning can be honed over time to continuously optimize the driving experience for customers.\n", + "\n", + "Secure data sharing and collaboration was also facilitated with the Databricks Unity Catalog. Shiverick describes how unified governance for the lakehouse benefits Rivian productivity. “Unity Catalog gives us a truly centralized data catalog across all of our different teams,” he said. “Now we have proper access management and controls.” Venkat adds, “With Unity Catalog, we are centralizing data catalog and access management across various teams and workspaces, which has simplified governance.” End-to-end version controlled governance and auditability of sensitive data sources, like the ones used for autonomous driving systems, produces a simple but secure solution for feature engineering. This gives Rivian a competitive advantage in the race to capture the autonomous driving grid.\n", + "
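As a purely hypothetical illustration of the kind of telemetry preparation described in this section, the PySpark sketch below aggregates per-vehicle accelerometer readings into a daily summary table. It is not Rivian's code; the table and column names (vehicle_id, pitch, roll, speed, suspension_travel, airbag_event) are invented for the example.

```python
# Hypothetical sketch: per-vehicle daily telemetry summary for downstream
# predictive-maintenance models. Table and column names are placeholders.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

telemetry = spark.read.table("fleet.raw_accelerometer_telemetry")  # placeholder source table

vehicle_motion_summary = (
    telemetry
    .withColumn("event_date", F.to_date("event_timestamp"))
    .groupBy("vehicle_id", "event_date")
    .agg(
        F.avg("speed").alias("avg_speed"),
        F.stddev("pitch").alias("pitch_variability"),
        F.stddev("roll").alias("roll_variability"),
        F.max("suspension_travel").alias("max_suspension_travel"),
        F.sum(F.col("airbag_event").cast("int")).alias("airbag_events"),
    )
)

# Persist the summary so reliability and ADAS teams can query the same view.
vehicle_motion_summary.write.mode("overwrite").saveAsTable("fleet.vehicle_motion_summary")
```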
-----\n", + "\n", + "**Accelerating into an electrified and sustainable world**\n", + "\n", + "By scaling its capacity to deliver valuable data insights with speed, efficiency and cost-effectiveness, Rivian is primed to leverage more data to improve operations and the performance of its vehicles to enhance the customer experience. Venkat says, “The flexibility that lakehouse offers saves us a lot of money from a cloud perspective, and that’s a huge win for us.” With Databricks Lakehouse providing a unified and open source approach to data and analytics, the Vehicle Reliability Team is able to better understand how people are using their vehicles, and that helps to inform the design of future generations of vehicles. By leveraging the Databricks Lakehouse Platform, they have seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n", + "\n", + "Shiverick explains, “From a reliability standpoint, we can make sure that components will withstand appropriate lifecycles. It can be as simple as making sure door handles are beefy enough to endure constant usage, or as complicated as predictive and preventative maintenance to eliminate the chance of failure in the field. Generally speaking, we’re improving software quality based on key vehicle metrics for a better customer experience.”\n", + "\n", + "From a design optimization perspective, Rivian’s unobstructed data view is also producing new diagnostic insights that can improve fleet health, safety, stability and security. Venkat says, “We can perform remote diagnostics to triage a problem quickly, or have a mobile service come in, or potentially send an OTA to fix the problem with the software. All of this needs so much visibility into the data, and that’s been possible with our partnership and integration on the platform itself.” Meanwhile, developers are actively building vehicle software to address issues along the way.\n", + "\n", + "Moving forward, Rivian is seeing rapid adoption of Databricks Lakehouse across different teams — increasing the number of platform users from 5 to 250 in only one year. This has unlocked new use cases including using machine learning to optimize battery efficiency in colder temperatures, increasing the accuracy of autonomous driving systems, and serving commercial depots with vehicle health dashboards for early and ongoing maintenance. As more EAVs ship, and its fleet of commercial vans expands, Rivian will continue to leverage the troves of data generated by its EAVs to deliver new innovations and driving experiences that revolutionize sustainable transportation.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4.6\n", + "**Migrating to the cloud to better serve**\n", + "**millions of customers**\n", + "\n", + "\n", + "###### 300%\n", + "\n", + "**ROI from OpEx savings**\n", + "**and cost avoidance**\n", + "\n", + "\n", + "###### 3X\n", + "\n", + "**Faster delivery of ML/data**\n", + "**science use cases**\n", + "\n", + "\n", + "Consistency in innovation is what keeps customers with a telecommunications company\n", + "\n", + "and is why AT&T is ranked among the best. However, AT&T’s massive on-premises legacy\n", + "\n", + "Hadoop system proved complex and costly to manage, impeding operational agility\n", + "\n", + "and efficiency and straining engineering resources. The need to pivot to cloud to better support\n", + "\n", + "hundreds of millions of subscribers was apparent.\n", + "\n", + "Migrating from Hadoop to Databricks on the Azure cloud, AT&T experienced significant\n", + "\n", + "savings in operating costs. 
Additionally, the new cloud-based environment has unlocked\n", + "\n", + "access to petabytes of data for correlative analytics and an AI-as-a-Service offering for\n", + "\n", + "2,500+ users across 60+ business units. AT&T can now leverage all its data — without\n", + "\n", + "overburdening its engineering team or exploding operational costs — to deliver new\n", + "\n", + "features and innovations to its millions of end users.\n", + "\n", + "\n", + "**I N D U S T R Y**\n", + "[Communication Service Providers](https://www.databricks.com/solutions/industries/telco-industry-solutions)\n", + "\n", + "**S O L U T I O N**\n", + "Customer Retention, Subscriber Churn\n", + "Prediction, Threat Detection\n", + "\n", + "**P L AT F O R M**\n", + "Lakehouse, Data Science, Machine Learning,\n", + "[Data Streaming](https://www.databricks.com/product/data-streaming)\n", + "\n", + "**C LO U D**\n", + "[Azure](https://www.databricks.com/product/azure)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Hadoop technology adds operational complexity and**\n", + "**unnecessary costs**\n", + "\n", + "AT&T is a technology giant with hundreds of millions of subscribers and ingests 10+ petabytes[ [a](https://www.databricks.com/blog/2022/04/11/data-att-modernization-lakehouse.html) ] of data across the entire data platform each day. To harness this data, it has a team of 2,500+ data users across 60+ business units to ensure the business is data powered — from building analytics to ensure decisions are based on the best data-driven situation awareness to building ML models that bring new innovations to its customers. To support these requirements, AT&T needed to democratize and establish a data single version of truth (SVOT) while simplifying infrastructure management to increase agility and lower overall costs.\n", + "\n", + "However, physical infrastructure was too resource intensive. The combination of a highly complex hardware setup (12,500 data sources and 1,500+ servers) coupled with an on-premises Hadoop architecture proved complex to maintain and expensive to manage. Not only were the operational costs to support workloads high, but there were also additional capital costs around data centers, licensing and more. Up to 70% of the on-prem platform had to be prioritized to ensure 50K data pipeline jobs succeeded and met SLAs and data quality objectives. Engineers’ time was focused on managing updates, fixing performance issues or simply provisioning resources rather than focusing on higher-valued tasks. The resource constraints of physical infrastructure also drove serialization of data science activities, slowing innovation. Another hurdle faced in operationalizing petabytes of data was the challenge of building streaming data pipelines for real-time analytics, an area that was key to supporting innovative use cases required to better serve its customers.\n", + "\n", + "With these deeply rooted technology issues, AT&T was not in the best position to achieve its goals of increasing its use of insights for improving its customer experience and operating more efficiently. “To truly democratize data across the business, we needed to pivot to a cloud-native technology environment,” said Mark Holcomb, Distinguished Solution Architect at AT&T. “This has freed up resources that had been focused on managing our infrastructure and move them up the value chain, as well as freeing up capital for investing in growth-oriented initiatives.”\n", + "\n", + "**A seamless migration journey to Databricks**\n", + "\n", + "As part of its due diligence, AT&T ran a comprehensive cost analysis and concluded that Databricks was both the fastest and achieved the best price/performance for data pipelines and machine learning workloads. AT&T knew the migration would be a massive undertaking. As such, the team did a lot of upfront planning — they prioritized migrating their largest workloads first to immediately reduce their infrastructure footprint. They also decided to migrate their data before migrating users to ensure a smooth transition and experience for their thousands of data practitioners.\n", + "\n", + "**The migration from Hadoop to Databricks enables us to bring**\n", + "**more value to our customers and do it more cost-efficiently**\n", + "**and much faster than before.**\n", + "\n", + "**Mark Holcomb**\n", + "Distinguished Solution Architect, AT&T\n", + "\n", + "\n", + "-----\n", + "\n", + "They spent a year deduplicating and synchronizing data to the cloud before migrating any users. This was a critical step in ensuring the successful migration of such a large, complex multi-tenant environment of 2,500+ users from 60+ business units and their workloads. The user migration process occurred over nine months and enabled AT&T to retire on-premises hardware in parallel with migration to accelerate savings as early as possible. Plus, due to the horizontal, scalable nature of Databricks, AT&T didn’t need to have everything in one contiguous environment. Separating data and compute, and across multiple accounts and workspaces, ensured analytics worked seamlessly without any API call limits or bandwidth issues and consumption clearly attributed to the 60+ business units.\n", + "\n", + "All in all, AT&T migrated over 1,500 servers, more than 50,000 production CPUs, 12,500 data sources and 300 schemas. The entire process took about two and a half years. And it was able to manage the entire migration with the equivalent of 15 full-time internal resources. “Databricks was a valuable collaborator throughout the process,” said Holcomb. “The team worked closely with us to resolve product features and security concerns to support our migration timeline.”\n", + "\n", + "**Databricks reduces TCO and opens new paths to**\n", + "**innovation**\n", + "\n", + "One of the immediate benefits of moving to Databricks was huge cost savings. AT&T was able to rationalize about 30% of its data by identifying and not migrating underutilized and duplicate data. And prioritizing the migration of the largest workloads allowed half the on-prem equipment to be rationalized during the course of the migration. “By prioritizing the migration of our most compute-intensive workloads to Databricks, we were able to significantly drive down costs while putting us in position to scale more efficiently moving forward,” explained Holcomb. 
The result is an anticipated 300% five-year migration ROI\n", + "from OpEx savings and cost avoidance (e.g., not needing to refresh data center\n", + "hardware).\n", + "\n", + "With data readily available and the means to analyze data at any scale, teams\n", + "of citizen data scientists and analysts can now spend more time innovating,\n", + "instead of serializing analytics efforts or waiting on engineering to provide the\n", + "necessary resources — or having data scientists spend their valuable time\n", + "on less complex or less insightful analyses. Data scientists are now able to\n", + "collaborate more effectively and speed up machine learning workflows so that\n", + "teams can deliver value more quickly, with a 3x faster time to delivery for new\n", + "data science use cases.\n", + "\n", + "“Historically you would have had operations in one system and analytics in a\n", + "separate one,” said Holcomb. “Now we can do more use cases like operational\n", + "analytics in a platform that fosters cross-team collaboration, reduces cost and\n", + "improves the consistency of answers.” Since migrating to Databricks, AT&T now\n", + "has a single version of truth to create new data-driven opportunities, including\n", + "a self-serve AI-as-a-Service analytics platform that will enable new revenue\n", + "streams and help it continue delivering exceptional innovations to its millions\n", + "of customers.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000\n", + "\n", + "organizations worldwide — including Comcast, Condé Nast and\n", + "\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo\n", + "**databricks.com/contact**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf2024-09-19T16:57:20Z
##### EBOOK\n", + "\n", + "# 8 Steps to Becoming an AI-Forward Retailer\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "\n", + "Introduction .............................................................................................................................................................................................. **3**\n", + "\n", + "The State of the Retail Industry:\n", + "\n", + "The Diverging Performance of Data Leaders vs. Data Laggards ...................................................................................... **4**\n", + "\n", + "Begin With a Shared Vision of Success ....................................................................................................................................... **6**\n", + "\n", + "Why Companies Struggle With Setting Clear Business Outcomes for AI ................................................................... **7**\n", + "\n", + "Before Diving In: Assess Your Readiness ..................................................................................................................................... **9**\n", + "\n", + "Getting Started: Putting Some Wins on the Board .................................................................................................................. **11**\n", + "\n", + "Going Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n", + "\n", + "Normalizing the Process: Engraining a Data-Driven Mindset\n", + "\n", + "Into the Fabric of the Business ...................................................................................................................................................... **14**\n", + "\n", + "From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise .......................................... **16**\n", + "\n", + "The 8 Steps to Building a Data-Forward Retailer ................................................................................................................... **17**\n", + "\n", + "Transform Retail Data Into Actionable Insights ....................................................................................................................... **21**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "\n", + "In a world where data is king, retailers have historically been trailblazers, pioneering data technology\n", + "adoption to supercharge their operations, enhance customer understanding and sharpen\n", + "personalization. The journey began with the simple cash register about 150 years ago, progressed to\n", + "standardized product reporting with the introduction of the UPC and EAN, and has evolved to include\n", + "cutting-edge technologies such as RFID and machine learning.\n", + "\n", + "Today, we stand on the brink of “Generation AI,” defined by sophisticated language models and\n", + "images. Retailers, with their history of embracing data technologies, find themselves in a strong\n", + "position to reap the benefits of this new era. Automation of customer service, supply chain modeling\n", + "with digital twins and delivering hyper-personalized experiences in real time are all in the cards,\n", + "promising to bolster revenue, improve margins and slash costs for early adopters.\n", + "\n", + "According to an internal analysis by Databricks, data pioneers are already outstripping their\n", + "competition. 
The “Databricks 30” — an index tracking the publicly traded data and AI leaders across six major industry sectors, including retail — shows these front-runners outperforming the rest of the market by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies are setting themselves up for significant gains and a robust competitive advantage.\n", + "\n", + "However, for retailers mired in the landscape of outdated data platforms, the transformation into an AI-driven organization can seem a Herculean task. Embracing this wave of innovative technologies may feel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly evolving retail landscape.\n", + "\n", + "To help you navigate the rapidly evolving world of retail and consumer goods, this eBook provides a road map for organizations embarking on digital transformation journeys — a shift that is as much about culture as it is about technology, if not more so. The core advice? Start with a crystal-clear vision for transformation, outlining a compelling case for why such change is vital for the company’s long-term survival. Then, initiate the process by introducing AI to make gradual enhancements in critical business procedures.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The State of the Retail Industry: The Diverging Performance of Data Leaders vs. Data Laggards\n", + "\n", + "\n", + "The pandemic’s fallout has led to a widening chasm between the retail industry’s leaders and laggards. McKinsey & Company encapsulated this trend succinctly: “Companies with tech-forward business models, who were already pulling ahead pre-crisis, left their competitors in the dust.”\n", + "\n", + "But what exactly is a “tech-forward business model”? It isn’t a simple narrative of digital natives dethroning traditional retailers. Heavyweights like Walmart, Target and Costco held their own against Amazon. Nor was it purely a matter of scale — smaller brands like Warby Parker or Everlane managed to carve out substantial consumer bases, competing against larger, established players.\n", + "\n", + "**The common denominator among all victors**\n", + "**was their ability to harness data, analytics and AI**\n", + "**to rapidly react to shifts in consumer behavior.**\n", + "\n", + "These businesses deftly used consumer demand insights to understand the effects of supply chain disruptions and labor shortages and reallocate resources to mitigate the most harmful impacts. They adeptly introduced new delivery methods, optimizing operations to alleviate the pressure these modes exerted on margins. They successfully established tighter partnerships with suppliers and logistic entities, collaborating toward shared triumphs.\n", + "\n", + "In all these instances, it was their timely access to information, foresight driven by this data, and the exploration of probable outcomes that set these organizations apart. Infusing data-driven decision-making into core processes within the organization, as well as those crossing partner boundaries, unlocked this approach’s full potential.\n", + "\n", + "To illustrate the significance of prioritizing data and AI, we developed the Databricks 30 Index. Drawing inspiration from Morgan Stanley’s “Data Era” stocks research, this index tracks marquee customers across our top five verticals and partners. The Databricks 30 is an equal-weight price index, composed of five marquee customers each across Retail/Consumer Products, Financial Services, Healthcare, Media/Entertainment, Manufacturing/Logistics, plus five strategic partners.\n", + "\n", + "\n", + "-----\n", + "\n", + "Our analysis reveals that companies in the Databricks 30 Index outpaced the S&P 500 by an impressive +21 percentage points (pp) over the past three years. In other words, if the stock market rose by 50% during this period, the Databricks 30 Index would have soared by 71% (outperforming by 21pp). Even more remarkable, excluding tech entirely from the Databricks 30, the Databricks 30 ex-Tech index outperforms the S&P 500 by an even larger margin over the same time frame: +23pp.\n", + "\n", + "[Chart: Databricks 30 (DB30) vs. DOW30 price performance, by date, 01-01-2019 through 01-01-2023]\n", + "\n", + "Similar to Morgan Stanley’s analysis, we find that non-tech U.S. companies that are investing in cloud, data and innovation do, in fact, win.\n", + "\n", + "So now that we see the impact, let’s dive into the steps retail organizations can take to put themselves on a trajectory of continued growth and success amid an ever-changing landscape.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Begin With a Shared Vision of Success\n", + "\n", + "\n", + "The most overlooked activity in becoming an AI-forward retailer is the most crucial. In the rush to secure a position on the AI frontier, many companies are leaping before they look, embarking on AI initiatives without a clear understanding of what they want to achieve. Simply adopting the newest, shiniest tech tools isn’t a silver bullet. Many companies set themselves up for failure by neglecting to clearly define the expected business outcomes at the onset of the initiative, a strategic move that can effectively reduce project risk and costs and lead to the ultimate success of the program. In fact, in an attempt to accelerate results, this cavalier approach can instead spiral into expensive mistakes, wasted resources and a decrease in trust for stakeholders from unmet expectations. It’s like setting sail on an open ocean without a destination in mind; the journey might provide some interesting detours, but it lacks direction and purpose.\n", + "\n", + "However, when organizations take the time to articulate their expected business outcomes before deploying AI and data-driven programs, they position themselves to reduce project risk and costs. By aligning AI initiatives with specific business objectives and creating a shared vision with stakeholders, the focus becomes less about the technology itself and more about how it can be used to reach these defined goals.\n", + "\n", + "Technology decisions, too, are improved by having a known target. Without clear business outcomes in mind, companies tend to design, develop and implement technologies that _might_ be needed to solve the problem. 
Aligning\n", + "the technical road map and activities with business outcomes mitigates the\n", + "risk of misallocated resources and the potential fallout from the unfulfilled\n", + "promise of AI.\n", + "\n", + "Furthermore, a clear understanding of expected business outcomes allows\n", + "for efficient project management and cost control. Companies can set key\n", + "performance indicators (KPIs) tied directly to these outcomes. This not only\n", + "provides a means to measure progress, but also helps control costs by\n", + "ensuring that resources are targeted toward initiatives that deliver value.\n", + "\n", + "It’s not just about numbers either; having explicit objectives aids in cultivating\n", + "\n", + "stakeholder buy-in. Clear communication about the purpose and potential\n", + "benefits of an AI initiative can foster support from executives, employees,\n", + "investors and customers alike. This collective backing can further mitigate risk\n", + "and cut costs by ensuring that everyone is pulling in the same direction.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Why Companies Struggle With Setting Clear Business Outcomes for AI\n", + "\n", + "\n", + "Getting started with AI at your organization might be daunting, and that’s\n", + "because it is a big undertaking! Struggling to define clear outcomes for AI\n", + "projects is a common issue among many businesses for a variety of reasons.\n", + "Here are some key factors that contribute to this challenge:\n", + "\n", + "**They believe the data strategy is a technology problem.**\n", + "\n", + "Companies often hire a chief data officer, or make the data strategy\n", + "the responsibility of the technology organization.\n", + "\n", + "**They lack an understanding of their business processes**\n", + "An alarming number of businesses jump onto the AI bandwagon without\n", + "understanding how their business operates. Decisions are made at\n", + "the leadership level, but how they translate to operational decisions is\n", + "muddled. Data and AI are fundamentally business process technologies,\n", + "\n", + "and without fully understanding how the business works, any initiative\n", + "in data and AI is bound to have limited success.\n", + "\n", + "\n", + "**They lack a data culture**\n", + "\n", + "Somewhat related to the previous point, many companies have teams\n", + "that make decisions based on experience and intuition. These should\n", + "not be discounted, but the reason for intuition is often a result of a\n", + "poor definition of processes, which prevents the ability to measure\n", + "and improve processes.\n", + "\n", + "**They struggle to get high-quality data**\n", + "\n", + "AI projects require good-quality, relevant data. Many businesses\n", + "struggle with issues related to data access, quality, privacy and\n", + "security, which can complicate the process of defining clear outcomes.\n", + "\n", + "**They lack the organizational structures required**\n", + "\n", + "Implementing AI often requires significant changes in business\n", + "\n", + "processes, organizational structures and even corporate culture.\n", + "Many companies find it hard to manage these changes, leading to\n", + "difficulties in setting and achieving clear outcomes.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data and AI programs are a business process problem first, and a\n", + "technology problem last. 
Familiarity with technology is important, but\n", + "irrelevant if companies do not understand it.\n", + "\n", + "Addressing these challenges often requires companies to invest in\n", + "education about AI capabilities, to formulate clear strategies, to manage\n", + "change effectively, and to bring on board the necessary skills either\n", + "by hiring new talent or upskilling existing employees. It’s a journey that\n", + "requires commitment, but the potential benefits of successful AI initiatives\n", + "make it a worthwhile venture.\n", + "\n", + "\n", + "**They don’t have the right people in place**\n", + "\n", + "There’s often a gap between the skills available within a company and\n", + "the skills needed to define and achieve AI outcomes. Without team\n", + "members who understand AI, data analysis and project management,\n", + "businesses can struggle to set clear objectives for AI initiatives.\n", + "\n", + "**They struggle to quantify the value of AI projects**\n", + "\n", + "AI’s benefits can sometimes be intangible or long-term, making them\n", + "difficult to quantify. Companies may struggle to define outcomes in\n", + "measurable terms, complicating the process of setting objectives\n", + "and monitoring progress.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Before Diving In: Assess Your Readiness\n", + "\n", + "\n", + "There is a growing sense of urgency for organizations relatively new to data\n", + "and AI-driven enablement to “get in the game.” Profiles of top performers and\n", + "headline-making achievements create a clearer sense of what is possible\n", + "and what can be gained, leaving those entering into the space eager to achieve\n", + "similar results.\n", + "\n", + "But what’s missing in those articles are the sustained investments in\n", + "process, people and technology and the numerous challenges, missteps and\n", + "outright failures that had to occur before success was achieved. Data-driven\n", + "transformation is a journey, and before any successful journey is pursued,\n", + "it’s wise to reflect on the organization’s readiness so that you can anticipate\n", + "challenges and identify areas for remediation and improvement that will\n", + "deliver you to your intended destination.\n", + "\n", + "With this in mind, we encourage organizations new to this space to\n", + "assess their maturity in terms of the use and management of their existing\n", + "information assets:\n", + "\n", + "1. How easily discoverable and accessible are data in\n", + "your environment?\n", + "\n", + "\n", + "3. Is the quality of these data formally verified?\n", + "\n", + "4. Are key entities such as products and customers actively\n", + "managed, and can data related to these items be easily linked\n", + "across various data sources?\n", + "\n", + "5. How quickly are data made available for analysis following their\n", + "creation or modification? Is this latency aligned with how you\n", + "might use this data?\n", + "\n", + "6. Are processes established for determining appropriate uses of\n", + "data, governing access and providing oversight on consumption?\n", + "\n", + "7. 
Is there one individual responsible for effective data management\n", + "across the enterprise, and has this person established a\n", + "\n", + "process for receiving and responding to feedback and shifting\n", + "organizational priorities?\n", + "\n", + "This list of questions is by no means exhaustive, but it should help to identify\n", + "blockers that are likely to become impediments down the road.\n", + "\n", + "\n", + "2. How well understood are these information assets?\n", + "\n", + "\n", + "-----\n", + "\n", + "Similarly, we would encourage organizations to assess their maturity in terms of\n", + "analytics capabilities:\n", + "\n", + "1. Is business performance at all levels assessed in terms of\n", + "key metrics?\n", + "\n", + "2. How frequently are data-driven analyses used in making key\n", + "business decisions?\n", + "\n", + "3. To what degree are advanced analytics techniques\n", + "— i.e., data science — used in decision-making processes?\n", + "\n", + "4. Are predictive models regularly leveraged as part of operational\n", + "business processes?\n", + "\n", + "5. How is experimentation used to assess the performance of\n", + "various initiatives?\n", + "\n", + "\n", + "Lastly, and probably most importantly, we’d encourage the organization to\n", + "perform a frank assessment of its readiness to embrace change. Becoming a\n", + "data-driven enterprise is fundamentally about operating differently than before.\n", + "Decision-making authority becomes more diffuse and often more automated.\n", + "Project outcomes become less certain as the organization focuses on innovation\n", + "where learning is emphasized over predictable results. Process silos often\n", + "become more intertwined as new modes of engagement evolve.\n", + "\n", + "When done right, this transition creates a healthy tension between what’s\n", + "needed to be successful today and what’s needed to be successful tomorrow.\n", + "But this can also manifest itself as employee resistance and political infighting\n", + "as processes and organizational structures evolve. What’s often needed to\n", + "overcome this is strong leadership, a clear vision and mandate for change as\n", + "well as a reassessment of incentive structures and active organizational change\n", + "management as the organization transitions into this new way of working.\n", + "\n", + "\n", + "6. Are predictive models used to automate key business decisions?\n", + "\n", + "\n", + "7. Has the organization embraced a model of continuous deployment\n", + "for the regular update of model-driven processes?\n", + "\n", + "\n", + "**TRADITIONAL APPROACH**\n", + "\n", + "**Upfront reqs** **Technical implementation** **Production**\n", + "\n", + "\n", + "**ITERATIVE APPROACH**\n", + "\n", + "\n", + "Continuous feedback\n", + "\n", + "\n", + "**Business questions** **Testing** **Production** **Optimization**\n", + "\n", + "Continuous learning and optimization\n", + "\n", + "An iterative approach involves the use of data to continually optimize the performance of data products.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started: Putting Some Wins on the Board\n", + "\n", + "\n", + "With the organization ready to proceed, the next phase is about learning to\n", + "deliver new solutions within your organization. There will be new technologies\n", + "to deploy and new skills to develop, and there will be new patterns for\n", + "integration into business workflows and procedures for incremental updates\n", + "and improvements. 
But most importantly, there will need to be a new level of\n", + "partnership and trust between the business and the technology sides of the\n", + "organization that needs to be carefully nurtured.\n", + "\n", + "The best way we have found to do this is to start with projects that improve\n", + "on existing operational workflows, i.e., do what you do, but do it smarter.\n", + "The business is often familiar with existing pain points and can more clearly\n", + "envision how a new capability can be folded into its processes. They are also\n", + "familiar with how to assess the impact a new approach may have on their\n", + "business and can help design tests to validate whether the intended results\n", + "\n", + "\n", + "As capabilities demonstrating value over the status quo are developed, they\n", + "are folded into business processes. This is not a one-and-done effort but part\n", + "of an ongoing cycle of deployment to continue so long as the team has a line\n", + "of sight to meaningful gains. The team does not wait for the ideal solution but\n", + "instead focuses on incremental improvements that deliver measurable value\n", + "along the way.\n", + "\n", + "Oversight for this process is provided by another body, one tasked with the\n", + "success of the overall transformative efforts within the business. As success\n", + "is delivered, there will be growing demand for the time and talents of these\n", + "teams, and the organization will need to prioritize resources across an increasing\n", + "number of opportunities. This steering committee will need to be responsible for\n", + "allocating limited resources and advocating for additional ones as well to strike\n", + "the right balance of investments for the organization.\n", + "\n", + "\n", + "are or are not being delivered.\n", + "\n", + "\n", + "**DEMAND FORECASTING**\n", + "\n", + "Demand forecasting is a massive challenge for retail and consumer goods\n", + "\n", + "organizations. And one where even an incremental change can have a massive impact,\n", + "\n", + "so it’s often one of the first projects organizations identify to put a win on the board.\n", + "\n", + "According to [McKinsey](https://www.mckinsey.com/featured-insights/artificial-intelligence/notes-from-the-ai-frontier-applications-and-value-of-deep-learning) , a 10% to 20% improvement in supply chain forecasting\n", + "\n", + "accuracy is likely to produce a 5% reduction in inventory costs and a 2% to 3%\n", + "\n", + "increase in revenues. To hit the ground running, check out the [Databricks Solution](https://www.databricks.com/solutions/accelerators/demand-forecasting)\n", + "\n", + "[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n", + "\n", + "key use cases.\n", + "\n", + "\n", + "Work on these projects is a collaborative effort between the business and IT.\n", + "Together, the project team explores a potential solution with a notion of how it\n", + "may be integrated in mind from the outset. 
As the project unfolds, all members\n", + "are part of the iterative cycles and help to steer the solution in new directions\n", + "until an item of value is derived.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Going Big: Learning to Embrace Transformational Change\n", + "\n", + "\n", + "With some experience under your belt, it’s time to build on the organizational\n", + "muscle developed during initial efforts and flex for more transformative impact.\n", + "Again, the focus is on established functions within the business, but instead of\n", + "pointed, incremental improvements, the team begins to create a vision for the\n", + "part of the organization that would operate if it were to fully embrace data and\n", + "AI enablement.\n", + "\n", + "It’s at this phase that many of the concerns about organizational resistance\n", + "mentioned earlier are most likely to manifest themselves. Ideally, initial\n", + "implementation efforts have built champions within the business, but it’s still\n", + "important to be mindful of pushback that can emerge as the organization more\n", + "fully begins to change. Having and maintaining strong business sponsorship\n", + "in this phase is critical, and having that sponsor articulate and regularly\n", + "reinforce a clear vision for the change that’s now underway can help everyone\n", + "\n", + "understand the need to support these efforts.\n", + "\n", + "\n", + "So far in this exploration of the journey to data and AI transformation, we’ve\n", + "minimized the importance of technology in order to focus on the business and\n", + "organizational aspects that often get neglected in this conversation. But it’s\n", + "at this stage that the organization needs to have established its preference\n", + "for data and analytics platforms. Because of the breadth of needs that will\n", + "have to be addressed and the ongoing innovation taking place in the data\n", + "science community, we strongly suggest standardizing on a platform that is\n", + "open and flexible while also providing cost-effective use of both infrastructure\n", + "and people resources and strong data governance and protection. For many\n", + "organizations, the Databricks Lakehouse Platform has proven itself to be the\n", + "ideal platform to meet these needs.\n", + "\n", + "**WHY STANDARDIZE ON DATABRICKS?**\n", + "\n", + "The Databricks Lakehouse is the only enterprise data and AI\n", + "\n", + "platform that allows retailers to leverage all of their data, from any\n", + "\n", + "source, on any workload to always offer more engaging customer\n", + "\n", + "experiences driven by real-time data, at the lowest cost and with\n", + "\n", + "the greatest investment protection.\n", + "\n", + "\n", + "-----\n", + "\n", + "But simply standardizing on a platform is not enough. The organization\n", + "needs to work through the roles and responsibilities around the use of this\n", + "platform and processes for moving things from experimentation and formal\n", + "development to testing and operationalization.\n", + "\n", + "The importance of having an MLOps strategy really comes to life at this\n", + "phase. This doesn’t mean your strategy around MLOps can’t change, but this\n", + "phase is when you want to think about and define your answers to some key\n", + "questions such as the following:\n", + "\n", + "1. How do we evaluate new and existing (retrained) models as\n", + "part of their movement from development to production?\n", + "\n", + "2. 
How do we determine when a model should be retrained?\n", + "\n", + "3. What are the preferred mechanisms for production deployment?\n", + "\n", + "4. How do we fall back should we have a deployment problem?\n", + "\n", + "5. What are the service level expectations for the\n", + "deployment processes?\n", + "\n", + "\n", + "###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n", + "\n", + "**Numan Ali**\n", + "\n", + "Solutions Architect, Data and Analytics Center of Excellence at Pandora\n", + "\n", + "\n", + "-----\n", + "\n", + "## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n", + "\n", + "\n", + "Too often, leadership views innovation as a destination and not a process\n", + "(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\n", + "data-driven organization overnight and then it’s done. Yes, there will be an\n", + "upfront investment, but there will also be ongoing investment in order to\n", + "support sustained innovation.\n", + "\n", + "Ironically, one of the major obstacles to this change is viewing the goal as\n", + "simply delivering a project or projects. Think about it — just 12 months ago,\n", + "only a few specialists in academia and industry were talking about generative\n", + "AI and large language models (LLMs). Today, [retailers have to integrate this](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html)\n", + "[new technology](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html) or fall behind others who will find a way to create more\n", + "personalized consumer experiences with it.\n", + "\n", + "\n", + "Technology, especially when it comes to data and AI, moves far too quickly.\n", + "What retailer tech teams need to deliver at the end of the day is applications,\n", + "of course, but also the ability to react quickly to change. What sort of ongoing\n", + "investments in terms of people, process and technology do retailers need to\n", + "foster in order to ingrain an innovation mindset?\n", + "\n", + "This is an ongoing balancing act where organizations need to innovate and look\n", + "for new opportunities but also sustain that innovation in a way that is realistic\n", + "for the business. For this, let’s consider the 70-20-10 rule: the idea that\n", + "companies should allocate 70% of innovation investment to core initiatives,\n", + "20% to adjacent ones and 10% to transformational ones, or “moonshots.” While\n", + "not a hard-and-fast rule, this concept was touted by Google co-founder Larry\n", + "Page in a [Fortune magazine article](https://www.google.com/url?q=https://money.cnn.com/2008/04/29/magazines/fortune/larry_page_change_the_world.fortune/&sa=D&source=editors&ust=1690998645852122&usg=AOvVaw2AHj-fx8XkEeMKP2Ts5gDu) , and was validated by a [study conducted](https://hbr.org/2012/05/managing-your-innovation-portfolio)\n", + "[by Harvard Business Review](https://hbr.org/2012/05/managing-your-innovation-portfolio) , which found that companies following the rule\n", + "\n", + "outperformed their peers, typically realizing a P/E premium of 10% to 20%.\n", + "\n", + "\n", + "-----\n", + "\n", + "The goal of the 70-20-10 rule is to help guide the organization toward\n", + "sustained innovation and spend the bulk of time on the core business. 
This is\n", + "part of why we recommend starting first with fast (just 2- to 3-month total)\n", + "pilot projects to use AI on existing business use cases like demand forecasting\n", + "and call center optimization. By working in these areas with a focus on learning\n", + "and iterating, retailers will soon find where data silos and rigidity exist in the\n", + "system. As these foundational barriers are knocked down, it then makes it\n", + "possible to tackle more transformational use cases and start to build the\n", + "characteristics of a data-forward enterprise. In other words, start to utilize\n", + "data and data-driven insights as a primary driver for decision-making and\n", + "operations, while also prioritizing continuous data analysis and improvement.\n", + "\n", + "\n", + "**TRANSFORMATIVE**\n", + "\n", + "\n", + "**ADJACENT**\n", + "\n", + "\n", + "**CORE**\n", + "\n", + "\n", + "###### Companies that allocated about 70% of their innovation activity to core initiatives, \n", + "### 20% to adjacent ones and 10% to\n", + "###### transformational ones outperformed their peers.\n", + "\n", + "**Bansi Nagji & Geoff Tuff**\n", + "_Managing Your Innovation Portfolio_\n", + "Harvard Business Review, May 2012\n", + "\n", + "\n", + "-----\n", + "\n", + "## From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise\n", + "\n", + "\n", + "So what does it take to successfully embark on this\n", + "journey to becoming a data-forward enterprise?\n", + "First and foremost, you need to not only establish\n", + "a baseline understanding of what has occurred by\n", + "examining historical data but leverage advancements\n", + "in technologies (e.g., streaming, computer vision,\n", + "voice recognition) to make predictions of the future.\n", + "\n", + "Through the use of both historical data and\n", + "predictive techniques such as forecasting,\n", + "recommendations, prescriptive care and nextbest-action, organizations can begin to improve\n", + "decisions and, in some cases, automate certain\n", + "decision-making processes. But rather than moving\n", + "\n", + "from historical views to predictive actions in a\n", + "linear fashion, this journey involves addressing both\n", + "approaches simultaneously. Once you are able to\n", + "unify historical and predictive analysis, you can then\n", + "take significant steps toward becoming a dataforward enterprise.\n", + "\n", + "\n", + "##### The Data-Forward Enterprise\n", + "\n", + "Data, analytics and AI working in concert\n", + "\n", + "\n", + "**Data Purgatory**\n", + "Things are better, but data isn’t\n", + "driving the business\n", + "\n", + "\n", + "**Data Maturity**\n", + "Every aspect of the\n", + "business is supported\n", + "by insights and AI\n", + "\n", + "\n", + "**Data Siloed**\n", + "Data and teams are segregated\n", + "into different systems\n", + "\n", + "DATA MATURITY\n", + "\n", + "Being data-forward means silos cease to exist, and data, analytics and AI are informing every aspect of the business.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The 8 Steps to Building a Data-Forward Retailer\n", + "\n", + "\n", + "Before you start your data-forward journey, a few critical steps must be\n", + "considered to establish a solid foundation to build upon. 
Based on our\n", + "work with the largest and most successful retailers in the world, spanning\n", + "startups to global giants, we at Databricks have seen that the most successful\n", + "followed these steps to effectively gain wallet share, whereas those who\n", + "couldn’t would often leave major gaps that competitors could take advantage\n", + "of. These steps are the basics to prepare businesses for where they need\n", + "to be both now and in the near future.\n", + "\n", + "\n", + "**2** **Get grounded: Understand the technology**\n", + "\n", + "To start, business leaders need to ground themselves in technology, especially\n", + "when it comes to AI. AI can do amazing things, but it is not magical and vendors\n", + "are prone to overpromising and underdelivering. Less than getting deep into\n", + "code, the purpose is to understand the limitations and ideal use cases.\n", + "\n", + "Databricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\n", + "starting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\n", + "perspective of how different brands are using data, analytics and AI to drive\n", + "revenue or cut operational costs.\n", + "\n", + "\n", + "**1** **Set the foundation: Define goals and objectives**\n", + "\n", + "\n", + "The best way to avoid shiny object syndrome (where you start out with a\n", + "\n", + "technology and then try to figure out what to do with it) is to first identify the\n", + "problems you want to solve. From there, you can set goals around innovation\n", + "to align incentives, and, most importantly, ensure you are driving specific\n", + "business outcomes such as improving customer engagement, optimizing\n", + "inventory management or increasing sales.\n", + "\n", + "\n", + "**3** **Understand the skills and processes in your business**\n", + "\n", + "As we will get into in step 4, starting with smaller pilot projects enables you\n", + "to not just deliver a quick win and validate the use of AI in the enterprise, but\n", + "also understand the in-house capabilities in terms of people, process and\n", + "technology to deliver technical projects. And if required, be willing and ready\n", + "to hire people with the right skill sets that can help you make the most of your\n", + "data. For example, building a core team of data analysts can help extract deep\n", + "insights that lead to better decision-making and identify opportunities for\n", + "growth. It is critical at this step to define the roles you need, determine how\n", + "you will source for those roles (via external hiring or internal transfer), and\n", + "ensure those roles have opportunities for career progression.\n", + "\n", + "\n", + "-----\n", + "\n", + "For inspiration and a head start, check out our [Solution Accelerators for Retail](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods)\n", + "[& Consumer Goods](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods) . These free resources were created to help our customers\n", + "save hours of discovery, design, development and testing. Our purpose-built\n", + "guides — fully functional notebooks and best practices — speed up results\n", + "across your most common and high-impact use cases and enable you to go\n", + "from idea to proof of concept (PoC) in as little as two weeks. 
We have over\n", + "20 accelerators built specifically for critical retail and consumer goods use\n", + "cases, from Demand Forecasting and On-Shelf Availability to Recommendation\n", + "Engines and Customer Lifetime Value. We also have a set of Solution\n", + "Accelerators specifically for [LLMs in Retail & Consumer Goods.](https://www.databricks.com/solutions/accelerators/large-language-models-retail)\n", + "\n", + "**5** **Implement data management and governance early**\n", + "\n", + "The first step to successfully implementing AI/ML in your business broadly\n", + "is to ensure you have accurate, reliable and current data to train your\n", + "models against. This data can (and should) come from a variety of sources,\n", + "so it’s key to unify all data types and sources (sales transactions, customer\n", + "feedback, social media) in a centralized location that is easily accessible,\n", + "while not losing sight of data security to maintain customer trust. Setting\n", + "up data governance parameters to control who has which kinds of access\n", + "to what data, and being able to audit the history of this access, will actually\n", + "accelerate innovation while ensuring data security and compliance.\n", + "\n", + "\n", + "**Delivering exactly what customers want,**\n", + "**every time, and on time**\n", + "\n", + "Data is at the heart of Gousto’s mission to change the\n", + "way people eat through the delivery of boxes of fresh\n", + "ingredients and easy-to-follow recipes. However, even\n", + "as their business exploded at the start of the pandemic,\n", + "their systems couldn’t ingest data fast enough, couldn’t\n", + "talk to each other and wouldn’t scale — forcing them to\n", + "temporarily stop accepting new customers. Now Gousto is\n", + "set up to achieve exciting ambitions for menu expansion,\n", + "sophisticated personalization and next-day delivery. Learn\n", + "how they did it.\n", + "\n", + "**[READ THE FULL GOUSTO STORY](https://www.databricks.com/customers/gousto)**\n", + "\n", + "**4** **Start small: Pilot a project**\n", + "\n", + "There is no substitute for rolling your sleeves up and running a pilot project to\n", + "evaluate the feasibility and potential impact of a project before implementing\n", + "it on a larger scale. When selecting a pilot project, we recommend starting with\n", + "a project that will deliver clear business value, such as incremental revenue\n", + "or clear cost savings, yet only takes 2-3 months to complete. The more time\n", + "there is between project inception and seeing results, the more likely it will lose\n", + "momentum internally.\n", + "\n", + "\n", + "-----\n", + "\n", + "**6** **Incorporate AI across the business (starting with daily tasks)**\n", + "\n", + "Given the large upfront investment in data scientists and engineers to build\n", + "an AI program, the ROI will come from using it at scale. Constantly look to\n", + "uncover patterns and repeatable processes that can be optimized or fully\n", + "automated with AI.\n", + "\n", + "**Building a global fashion icon with a**\n", + "**customer-first approach**\n", + "\n", + "British luxury brand Burberry was seeking an efficient way to\n", + "annotate its thousands of highly specific marketing assets\n", + "for better targeting. Working with Labelbox within Databricks\n", + "Lakehouse, they are now able to complete image annotation\n", + "projects in hours instead of months. 
And marketing team\n", + "members now have access to powerful content insights\n", + "without needing to ask data scientists for help.\n", + "\n", + "**[READ THE FULL BURBERRY STORY](https://www.databricks.com/customers/burberry)**\n", + "\n", + "**Customizing interactions that convert clicks**\n", + "**to revenue with Databricks Lakehouse**\n", + "\n", + "Global jewelry manufacturer and retailer Pandora needed a\n", + "unified view of all their data where they could easily segment,\n", + "categorize and analyze to deliver custom messaging to\n", + "consumers. With Databricks Lakehouse, they now have the\n", + "insights they need to deliver highly targeted messaging —\n", + "increasing consumer engagement from the initial opening of\n", + "a marketing email to maximizing shopping bag conversions to\n", + "driving revenue on the website.\n", + "\n", + "**[READ THE FULL PANDORA STORY](https://www.databricks.com/customers/pandora)**\n", + "\n", + "\n", + "**Building an operationally efficient**\n", + "**omnichannel business**\n", + "\n", + "The Hershey Company analyzes the data they need to\n", + "stay in front of changing human behavior and delight their\n", + "customers. With Databricks Lakehouse, they can analyze\n", + "data feeds from their largest retail customer — uncovering\n", + "insights that will help extend their industry leadership.\n", + "\n", + "**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n", + "\n", + "\n", + "**Ushering in a new era**\n", + "**of data-driven retailing**\n", + "\n", + "Outdoor apparel brand Columbia Sportswear has enabled\n", + "data and analytics self-service throughout the organization in\n", + "a way that ensures everyone is working from a single source\n", + "of truth. Whichever data team needs access to the data,\n", + "Databricks Lakehouse gives them the confidence that the\n", + "data is reliable and consistent.\n", + "\n", + "**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**7** **Foster a culture of data-driven decision-making**\n", + "\n", + "What does it mean to have a culture of data-driven decision-making? In\n", + "practice, it means empowering all employees to use data to inform their\n", + "decisions. Only some strategic decisions will be based on complete and\n", + "accurate information. It’s unwise to assume otherwise. The right approach\n", + "is to leverage as much data as possible, from past tests or current efforts,\n", + "to mitigate risk. Leaders need to not only ask for data but also ensure\n", + "that their employees will be able to find the data they need.\n", + "\n", + "**Unlocking critical trends and insights**\n", + "**needed to serve our 180 million customers**\n", + "\n", + "Reckitt, the maker of Lysol as well as hundreds of other\n", + "household brands, was looking to deliver best-in-class\n", + "customer experiences to their over 180 million customers\n", + "spanning the globe. 
With Databricks Lakehouse, Reckitt\n", + "has established a data-first culture by surfacing real-time,\n", + "highly accurate, deep customer data insights that have\n", + "led to a better understanding of international market\n", + "trends and demand across the multiple product lines\n", + "they support.\n", + "\n", + "**[READ THE FULL RECKITT STORY](https://www.databricks.com/customers/reckitt)**\n", + "\n", + "\n", + "**Customer 360 to enable faster speed**\n", + "**to market, better results**\n", + "\n", + "The Middle East’s Al-Futtaim serves as a local distributor\n", + "for global brands such as Toyota, IKEA and Ace Hardware.\n", + "With Databricks Lakehouse serving as a unified platform to\n", + "aggregate and analyze various data sources on all customers,\n", + "they have created a “golden customer record” that improves\n", + "all decision-making, from forecasting demand to powering\n", + "their global loyalty program.\n", + "\n", + "**[READ THE FULL AL-FUTTAIM STORY](https://www.google.com/url?q=https://www.databricks.com/customers/al-futtaim&sa=D&source=editors&ust=1690998645853527&usg=AOvVaw3cs-6mM2ANTKDCzTdTvEYH)**\n", + "\n", + "**8** **Continuously evaluate and improve**\n", + "\n", + "Recognize that establishing a data-driven culture is an ongoing journey and\n", + "never a set destination. Constantly evaluate your data collection, analysis and\n", + "decision-making process to identify areas for improvement. Even small and\n", + "constant incremental improvements will deliver large gains in absolute terms\n", + "when applied at scale. You can always personalize more, forecast better, or\n", + "better manage your supply chain as you bring in better data sources and refine\n", + "your models.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Transform Retail Data Into Actionable Insights\n", + "\n", + "\n", + "Becoming data forward is not a crazy idea. Too often, leaders or organizations\n", + "allow themselves to be intimidated by focusing on large-scale transformations.\n", + "But it’s the small operational changes that can make your business more efficient\n", + "as well as shift the larger culture forward. Once you’ve set this foundation, it then\n", + "allows you to move toward bigger things. These steps may fail, but it’s actually\n", + "positive to have these setbacks to learn from to try again. The bigger risk is to\n", + "not try and thus fall behind competitors who are embracing the internal changes\n", + "needed to take advantage of AI and machine learning.\n", + "\n", + "Core to delivering on these steps to become a data-forward retailer is a solid\n", + "data foundation that can unify your data and AI workloads with sharing and\n", + "governance built in, so internal and external teams can get access to the\n", + "data they need when they need it. 
With the [Databricks Lakehouse for Retail](https://www.databricks.com/solutions/industries/retail-industry-solutions) , companies gain valuable insights into customer behavior, optimize supply chain operations and make informed business decisions in real time.\n", + "\n", + "
EXPLORE DATABRICKS LAKEHOUSE FOR RETAIL\n", + "\n", + "
Access key resources to understand how a lakehouse for retail can set you on the path toward becoming a data-forward organization.\n", + "\n", + "
**[LEARN MORE](https://www.databricks.com/explore/retail-resources)**\n", + "\n", + "
#### Visit our website to learn more about Databricks Lakehouse for Retail.\n", + "\n", + "
-----\n", + "\n", + "
## About Databricks\n", + "\n", + "
Databricks is the data and AI company. More than 9,000 organizations worldwide — including Comcast, Condé Nast, and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "
**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks#account)**\n", + "\n", + "
Contact us for a personalized demo\n", + "**databricks.com/contact**\n", + "\n", + "
-----\n", + "\n", + "
### eBook\n", + "\n", + "# The Big Book\n", + " of MLOps\n", + "\n", + "#### A data-centric approach\n", + " to build and scale AI,\n", + " including LLMOps\n", + "\n", + "M o d e l O p s D a t a O p s D e �O p s\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "**A U T H O R S :**\n", + "\n", + "**Joseph Bradley**\n", + "\n", + "Lead Product Specialist\n", + "\n", + "**Rafi Kurlansik**\n", + "\n", + "Lead Product Specialist\n", + "\n", + "**Matt Thomson**\n", + "\n", + "Director, EMEA Product Specialists\n", + "\n", + "**Niall Turbitt**\n", + "\n", + "Lead Data Scientist\n", + "\n", + "\n", + "**C H A P T E R 1 :** \u0007 **Introduction** 3\n", + "\n", + "###### People and process 4\n", + "\n", + " People 5\n", + "\n", + " Process 6\n", + "\n", + " Why should I care about MLOps? 8\n", + "\n", + " Guiding principles 9\n", + "\n", + "**C H A P T E R 2 :** \u0007 **Fundamentals of MLOps** 11\n", + "\n", + "###### Semantics of dev, staging and prod 11\n", + "\n", + " ML deployment patterns 15\n", + "\n", + "**C H A P T E R 3 :** **MLOps Architecture and Process** \u0007 19\n", + "\n", + "###### Architecture components 19\n", + "\n", + " Data Lakehouse 19\n", + "\n", + " MLflow 19\n", + "\n", + " Databricks and MLflow Autologging 20\n", + "\n", + " Feature Store 20\n", + "\n", + " MLflow Model Serving 20\n", + "\n", + " Databricks SQL 20\n", + "\n", + " Databricks Workflows and Jobs 20\n", + "\n", + " Reference architecture 21\n", + "\n", + " Overview 22\n", + "\n", + " Dev 23\n", + "\n", + " Staging 27\n", + "\n", + " Prod 30\n", + "\n", + "**C H A P T E R 4 :** \u0007 **LLMOps – Large Language Model Operations** 36\n", + "\n", + "###### Discussion of key topics for LLMOps 39\n", + "\n", + " Reference architecture 46\n", + "\n", + " Looking ahead 48\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "## Introduction\n", + "\n", + "**Note:** Our prescription for MLOps is general to\n", + "\n", + "any set of tools and applications, though we give\n", + "\n", + "concrete examples using Databricks features\n", + "\n", + "and functionality. We also note that no single\n", + "\n", + "architecture or prescription will work for all\n", + "\n", + "organizations or use cases. Therefore, while we\n", + "\n", + "provide guidelines for building MLOps, we call out\n", + "\n", + "important options and variations. This whitepaper\n", + "\n", + "is written primarily for ML engineers and data\n", + "\n", + "scientists wanting to learn more about MLOps,\n", + "\n", + "with high-level guidance and pointers to more\n", + "\n", + "resources.\n", + "\n", + "\n", + "The past decade has seen rapid growth in the adoption of machine learning (ML). While the early\n", + "\n", + "adopters were a small number of large technology companies that could afford the necessary resources,\n", + "\n", + "in recent times ML-driven business cases have become ubiquitous in all industries. 
Indeed, according to\n", + "\n", + "MIT Sloan Management Review, 83% of CEOs report that [artificial intelligence (AI) is a strategic priority](https://sloanreview.mit.edu/projects/artificial-intelligence-in-business-gets-real/) .\n", + "\n", + "This democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n", + "\n", + "[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n", + "\n", + "However, building and deploying ML models is complex. There are many options available for achieving\n", + "\n", + "this but little in the way of well-defined and accessible standards. As a result, over the past few years we\n", + "\n", + "have seen the emergence of the machine learning operations (MLOps) field. **MLOps is a set of processes**\n", + "\n", + "**and automation for managing models, data and code to improve performance stability and long-term**\n", + "\n", + "**efficiency in ML systems.** Put simply, MLOps = [ModelOps](https://en.wikipedia.org/wiki/ModelOps) + [DataOps](https://en.wikipedia.org/wiki/DataOps) + [DevOps](https://en.wikipedia.org/wiki/DevOps) .\n", + "\n", + "The concept of developer operations (DevOps) is nothing new. It has been used for decades to deploy\n", + "\n", + "software applications, and the deployment of ML applications has much to gain from it. However, strong\n", + "\n", + "DevOps practices and tooling alone are insufficient because ML applications rely on a constellation of\n", + "\n", + "artifacts (e.g., models, data, code) that require special treatment. Any MLOps solution must take into\n", + "\n", + "account the various people and processes that interact with these artifacts.\n", + "\n", + "Here at Databricks we have seen firsthand how customers develop their MLOps approaches, some of\n", + "\n", + "which work better than others. We launched the open source [MLflow](https://www.mlflow.org/) project to help make our customers\n", + "\n", + "successful with MLOps, and with over 10 million downloads/month from PyPI as of May 2022, MLflow’s\n", + "\n", + "adoption is a testament to the appetite for operationalizing ML models.\n", + "\n", + "This whitepaper aims to explain how your organization can build robust MLOps practices incrementally.\n", + "\n", + "First, we describe the people and process involved in deploying ML applications and the need for\n", + "\n", + "operational rigor. We also provide general principles to help guide your planning and decision-making. Next,\n", + "\n", + "we go through the fundamentals of MLOps, defining terms and broad strategies for deployment. 
Finally, we introduce a general MLOps reference architecture, the details of its processes, and best practices.\n", + "\n", + "
-----\n", + "\n", + "
#### People and process\n", + "\n", + "
**M L W O R K F L O W A N D P E R S O N A S**\n", + "\n", + "
**Figure 1** The ML workflow (data preparation, exploratory data analysis, feature engineering, model training, model validation, deployment and monitoring) and the personas involved: data governance officer, data engineer, data scientist, ML engineer and business stakeholder.\n", + "\n", + "
-----\n", + "\n", + "
#### People\n", + "\n", + "
Building ML applications is a team sport, and while in the real world people “wear many hats,” it is still useful to think in terms of archetypes. They help us understand roles and responsibilities and where handoffs are required, and they highlight areas of complexity within the system. We distinguish between the following personas:\n", + "\n", + "
**M L P E R S O N A S**\n", + "\n", + "
**Data Governance Officer:** Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.\n", + "\n", + "
**Data Engineer:** Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.\n", + "\n", + "
**Data Scientist:** Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.\n", + "\n", + "
**ML Engineer:** Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment ( [CI/CD](https://en.wikipedia.org/wiki/CI/CD) ).\n", + "\n", + "
**Business Stakeholder:** Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate.\n", + "\n", + "
-----\n", + "\n", + "
#### Process\n", + "\n", + "
Together, these people develop and maintain ML applications. While the development process follows a distinct pattern, it is not entirely monolithic. 
The way you deploy a model has an impact on the steps\n", + "\n", + "you take, and using techniques like reinforcement learning or online learning will change some details.\n", + "\n", + "Nevertheless, these steps and personas involved are variations on a core theme, as illustrated in Figure 1\n", + "\n", + "above.\n", + "\n", + "Let’s walk through the process step by step. Keep in mind that this is an iterative process, the frequency of\n", + "\n", + "which will be determined by the particular business case and data.\n", + "\n", + "**M L P R O C E S S**\n", + "\n", + "\n", + "Data\n", + "Preparation\n", + "\n", + "\n", + "Exploratory\n", + "Data Analysis\n", + "\n", + "\n", + "Feature\n", + "Engineering\n", + "\n", + "\n", + "Model\n", + "Training\n", + "\n", + "\n", + "Model\n", + "Validation\n", + "\n", + "\n", + "Deployment Monitoring\n", + "\n", + "\n", + "###### Data preparation\n", + "\n", + "Prior to any data science or ML work lies the data engineering needed to prepare production data and make\n", + "\n", + "it available for consumption. This data may be referred to as “raw data,” and in later steps, data scientists\n", + "\n", + "will extract features and labels from the raw data.\n", + "\n", + "###### Exploratory data analysis (EDA)\n", + "\n", + "Analysis is conducted by data scientists to assess statistical properties of the data available, and determine\n", + "\n", + "if they address the business question. This requires frequent communication and iteration with business\n", + "\n", + "stakeholders.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Feature engineering\n", + "\n", + "Data scientists clean data and apply business logic and specialized transformations to engineer features for\n", + "\n", + "model training. These data, or features, are split into training, testing and validation sets.\n", + "\n", + "###### Model training\n", + "\n", + "Data scientists explore multiple algorithms and hyperparameter configurations using the prepared data, and\n", + "\n", + "a best-performing model is determined according to predefined evaluation metric(s).\n", + "\n", + "###### Model validation\n", + "\n", + "Prior to deployment a selected model is subjected to a validation step to ensure that it exceeds\n", + "\n", + "some baseline level of performance, in addition to meeting any other technical, business or regulatory\n", + "\n", + "requirements. This necessitates collaboration between data scientists, business stakeholders and ML\n", + "\n", + "engineers.\n", + "\n", + "###### Deployment\n", + "\n", + "ML engineers will deploy a validated model via batch, streaming or online serving, depending on the\n", + "\n", + "requirements of the use case.\n", + "\n", + "###### Monitoring\n", + "\n", + "ML engineers will monitor deployed models for signs of performance degradation or errors. Data scientists\n", + "\n", + "will often be involved in early monitoring phases to ensure that new models perform as expected after\n", + "\n", + "deployment. This will inform if and when the deployed model should be updated by returning to earlier\n", + "\n", + "stages in the workflow.\n", + "\n", + "The data governance officer is ultimately responsible for making sure this entire process is compliant with\n", + "\n", + "company and regulatory policies.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Why should I care about MLOps?\n", + "\n", + "Consider that the typical ML application depends on the aforementioned people and process, as well\n", + "\n", + "as regulatory and ethical requirements. 
These dependencies change over time — and your models, data\n", + "\n", + "and code must change as well. The data that were a reliable signal yesterday become noise; open source\n", + "\n", + "libraries become outdated; regulatory environments evolve; and teams change. ML systems must be\n", + "\n", + "resilient to these changes. Yet this broad scope can be a lot for organizations to manage — there are many\n", + "\n", + "moving parts! Addressing these challenges with a defined MLOps strategy can dramatically reduce the\n", + "\n", + "iteration cycle of delivering models to production, thereby accelerating time to business value.\n", + "\n", + "There are two main types of risk in ML systems: **technical risk** inherent to the system itself and **risk of**\n", + "\n", + "**noncompliance** with external systems. Both of these risks derive from the dependencies described above.\n", + "\n", + "For example, if data pipeline infrastructure, KPIs, model monitoring and documentation are lacking, then you\n", + "\n", + "risk your system becoming destabilized or ineffective. On the other hand, even a well-designed system that\n", + "\n", + "fails to comply with corporate, regulatory and ethical requirements runs the risk of losing funding, receiving\n", + "\n", + "fines or incurring reputational damage. Recently, one private company’s data collection practices were\n", + "\n", + "found to have violated the Children’s Online Privacy Protection Rule (COPPA). The [FTC fined](https://www.protocol.com/policy/ftc-algorithm-destroy-data-privacy) the company\n", + "\n", + "$1.5 million and [ordered](https://www.ftc.gov/system/files/ftc_gov/pdf/wwkurbostipulatedorder.pdf) it to destroy or delete the illegally harvested data, and all models or algorithms\n", + "\n", + "developed with that data.\n", + "\n", + "With respect to efficiency, the absence of MLOps is typically marked by an overabundance of manual\n", + "\n", + "processes. These steps are slower and more prone to error, affecting the quality of models, data and code.\n", + "\n", + "Eventually they form a bottleneck, capping the ability for a data team to take on new projects.\n", + "\n", + "Seen through these lenses, the aim of MLOps becomes clear: improve the long-term performance\n", + "\n", + "stability and success rate of ML systems while maximizing the efficiency of teams who build them. In the\n", + "\n", + "introduction, we defined MLOps to address this aim: MLOps is a **set of processes and automation** to\n", + "\n", + "manage **models, data and code** to meet the two goals of **stable performance and long-term efficiency in**\n", + "\n", + "**ML systems** . _MLOps = ModelOps + DataOps + DevOps_ .\n", + "\n", + "With clear goals we are ready to discuss principles that guide design decisions and planning for MLOps\n", + "\n", + "\n", + "M o d e l O p s D a t a O p s D e �O p s\n", + "\n", + "\n", + "-----\n", + "\n", + "Given the complexity of ML\n", + "\n", + "processes and the different personas\n", + "\n", + "involved, it is helpful to start from\n", + "\n", + "simpler, high-level guidance. 
We\n", + "\n", + "propose several broadly applicable\n", + "\n", + "principles to guide MLOps decisions.\n", + "\n", + "They inform our design choices in\n", + "\n", + "later sections, and we hope they can\n", + "\n", + "be adapted to support whatever your\n", + "\n", + "\n", + "#### Guiding principles\n", + "\n", + "###### Always keep your business goals in mind\n", + "\n", + "Just as the core purpose of ML in a business is to enable data-driven decisions and products, the core\n", + "\n", + "purpose of MLOps is to ensure that those data-driven applications remain stable, are kept up to date and\n", + "\n", + "continue to have positive impacts on the business. When prioritizing technical work on MLOps, consider the\n", + "\n", + "business impact: Does it enable new business use cases? Does it improve data teams’ productivity? Does it\n", + "\n", + "reduce operational costs or risks?\n", + "\n", + "###### Take a data-centric approach to machine learning\n", + "\n", + "Feature engineering, training, inference and monitoring pipelines are data pipelines. As such, they need to be\n", + "\n", + "as robust as other production data engineering processes. Data quality is crucial in any ML application, so\n", + "\n", + "ML data pipelines should employ systematic approaches to monitoring and mitigating data quality issues.\n", + "\n", + "Avoid tools that make it difficult to join data from ML predictions, model monitoring, etc., with the rest of\n", + "\n", + "your data. The simplest way to achieve this is to develop ML applications on the same platform used to\n", + "\n", + "manage production data. For example, instead of downloading training data to a laptop, where it is hard\n", + "\n", + "to govern and reproduce results, secure the data in cloud storage and make that storage available to your\n", + "\n", + "training process.\n", + "\n", + "\n", + "business use case may be.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### \u0007Implement MLOps in a modular fashion\n", + "\n", + "As with any software application, code quality is paramount for an ML application. Modularized code\n", + "\n", + "enables testing of individual components and mitigates difficulties with future code refactoring. Define\n", + "\n", + "clear steps (e.g., training, evaluation or deployment), supersteps (e.g., training-to-deployment pipeline) and\n", + "\n", + "responsibilities to clarify the modular structure of your ML application.\n", + "\n", + "###### Process should guide automation\n", + "\n", + "We automate processes to improve productivity and lower risk of human error, but not every step of a\n", + "\n", + "process can or should be automated. People still determine the business question, and some models will\n", + "\n", + "always need human oversight before deployment. Therefore, the development process is primary and each\n", + "\n", + "module in the process should be automated as needed. This allows incremental build-out of automation\n", + "\n", + "and customization. Furthermore, when it comes to particular automation tools, choose those that align to\n", + "\n", + "your people and process. 
For example, instead of building a model logging framework around a generic\n", + "\n", + "database, you can choose a specialized tool like MLflow, which has been designed with the ML model\n", + "\n", + "lifecycle in mind.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 2:**\n", + "## Fundamentals of MLOps\n", + "\n", + "**Note:** In our experience with customers, there\n", + "\n", + "can be variations in these three stages, such as\n", + "\n", + "splitting staging into separate “test” and “QA”\n", + "\n", + "substages. However, the principles remain the\n", + "\n", + "same and we stick to a dev, staging and prod\n", + "\n", + "setup within this paper.\n", + "\n", + "\n", + "#### Semantics of dev, staging and prod\n", + "\n", + "ML workflows include the following key assets: code, models and data. These assets need to be developed\n", + "\n", + "(dev), tested (staging) and deployed (prod). For each stage, we also need to operate within an execution\n", + "\n", + "environment. Thus, all the above — execution environments, code, models and data — are divided into dev,\n", + "\n", + "staging and prod.\n", + "\n", + "These divisions can best be understood in terms of quality guarantees and access control. On one end,\n", + "\n", + "assets in prod are generally business critical, with the highest guarantee of quality and tightest control on\n", + "\n", + "who can modify them. Conversely, dev assets are more widely accessible to people but offer no guarantee\n", + "\n", + "of quality.\n", + "\n", + "For example, many data scientists will work together in a dev environment, freely producing dev model\n", + "\n", + "prototypes. Any flaws in these models are relatively low risk for the business, as they are separate from\n", + "\n", + "the live product. In contrast, the staging environment replicates the execution environment of production.\n", + "\n", + "Here, code changes made in the dev environment are tested prior to code being deployed to production.\n", + "\n", + "The staging environment acts as a gateway for code to reach production, and accordingly, fewer people\n", + "\n", + "are given access to staging. Code promoted to production is considered a live product. In the production\n", + "\n", + "environment, human error can pose the greatest risk to business continuity, and so the least number of\n", + "\n", + "people have permission to modify production models.\n", + "\n", + "One might be tempted to say that code, models and data each share a one-to-one correspondence with\n", + "\n", + "the execution environment — e.g., all dev code, models and data are in the dev environment. That is often\n", + "\n", + "close to true but is rarely correct. Therefore, we will next discuss the precise semantics of dev, staging\n", + "\n", + "and prod for execution environments, code, models and data. We also discuss mechanisms for restricting\n", + "\n", + "access to each.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Execution environments\n", + "\n", + "An execution environment is the place where models and data are created or consumed by code. Each\n", + "\n", + "execution environment consists of compute instances, their runtimes and libraries, and automated jobs.\n", + "\n", + "With Databricks, an “environment” can be defined via dev/staging/prod separation at a few levels. An\n", + "\n", + "organization could create distinct environments across multiple cloud accounts, multiple Databricks\n", + "\n", + "workspaces in the same cloud account, or within a single Databricks workspace. 
These separation patterns are illustrated in Figure 2 below.\n", + "\n", + "
**E N V I R O N M E N T S E P A R A T I O N P A T T E R N S**\n", + "\n", + "
**Figure 2** Environment separation patterns: dev, staging and prod may be isolated across multiple cloud accounts, across multiple Databricks workspaces within one cloud account, or within a single Databricks workspace using workspace access controls.\n", + "\n", + "
-----\n", + "\n", + "
**Note:** Databricks released Delta Lake to the open source community in 2019. Delta Lake provides all the data lifecycle management functions that are needed to make cloud-based object stores reliable and performant. This design allows clients to update multiple objects at once and to replace a subset of the objects with another, etc., in a serializable manner that still achieves high parallel read/write performance from the objects — while offering advanced capabilities like time travel (e.g., query point-in-time snapshots or rollback of erroneous updates), automatic data layout optimization, upserts, caching and audit logs.\n", + "\n", + "
###### Code\n", + "\n", + "
ML project code is often stored in a version control repository (such as Git), with most organizations using branches corresponding to the lifecycle phases of development, staging or production. There are a few common patterns. Some use only development branches (dev) and one main branch (staging/prod). Others use main and development branches (dev), branches cut for testing potential releases (staging), and branches cut for final releases (prod). Regardless of which convention you choose, separation is enforced through Git repository branches.\n", + "\n", + "
As a best practice, code should only be run in an execution environment that corresponds to it or in one that’s higher. For example, the dev environment can run any code, but the prod environment can only run prod code.\n", + "\n", + "
###### Models\n", + "\n", + "
While models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to note that model and code lifecycle phases often operate asynchronously** . That is, you may want to push a new model version before you push a code change, and vice versa. Consider the following scenarios:\n", + "\n", + "
\u0007To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. Deploying the code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle of being generated, tested and marked as “production” to predict on the most recent transactions. In this case the code lifecycle is slower than the model lifecycle.\n", + "\n", + "
\u0007To classify documents using large deep neural networks, training and deploying the model is often a one-time process due to cost. Updates to the serving and monitoring code in the project may be deployed more frequently than a new version of the model. In this case the model lifecycle is slower than the code.\n", + "\n", + "
Since model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model management to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts directly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update production models without code changes, streamlining the deployment process in many cases. Model artifacts are secured using MLflow access controls or cloud storage permissions.\n", + "\n", + "
-----\n", + "\n", + "
###### Data\n", + "\n", + "
Some organizations label data as either dev, staging or prod, depending on which environment it originated in. For example, all prod data is produced in the prod environment, but dev and staging environments may have read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data may be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around reliability and freshness. Access to data in each environment is controlled with table access controls ( [AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) | [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html) ) or cloud storage permissions.\n", + "\n", + "
In summary, when it comes to MLOps, you will always have operational separation between dev, staging and prod. Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod will be the highest quality and tightly controlled.\n", + "\n", + "
|ASSET|SEMANTICS|SEPARATED BY|\n", + "
|---|---|---|\n", + "
|Execution environments|Labeled according to where development, testing and connections with production systems happen|Cloud provider and Databricks Workspace access controls|\n", + "
|Models|Labeled according to model lifecycle phase|MLflow access controls or cloud storage permissions|\n", + "
|Data|Labeled according to its origin in dev, staging or prod execution environments|Table access controls or cloud storage permissions|\n", + "
|Code|Labeled according to software development lifecycle phase|Git repository branches|\n", + "\n", + "
**Table 1**\n", + "\n", + "
-----\n", + "\n", + "
#### ML deployment patterns\n", + "\n", + "
The fact that models and code can be managed separately results in multiple possible patterns for getting ML artifacts through staging and into production. 
We explain two major patterns below.\n", + "\n", + "**D E P L O Y M O D E L S**\n", + "\n", + "dev staging prod\n", + "\n", + "**D E P L O Y C O D E**\n", + "\n", + "dev staging prod\n", + "\n", + "These two patterns differ in terms of whether the model artifact or the training code that produces the\n", + "\n", + "model artifact is promoted toward production.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Deploy models\n", + "\n", + "In the first pattern, the model artifact is generated by training code in the development environment.\n", + "\n", + "This artifact is then tested in staging for compliance and performance before finally being deployed into\n", + "\n", + "production. This is a simpler handoff for data scientists, and in cases where model training is prohibitively\n", + "\n", + "expensive, training the model once and managing that artifact may be preferable. However, this simpler\n", + "\n", + "architecture comes with limitations. If production data is not accessible from the development environment\n", + "\n", + "(e.g., for security reasons), this architecture may not be viable. This architecture does not naturally support\n", + "\n", + "automated model retraining. While you could automate retraining in the development environment, you\n", + "\n", + "would then be treating “dev” training code as production ready, which many deployment teams would not\n", + "\n", + "accept. This option hides the fact that ancillary code for featurization, inference and monitoring needs to be\n", + "\n", + "deployed to production, requiring a separate code deployment path.\n", + "\n", + "###### Deploy code\n", + "\n", + "In the second pattern, the code to train models is developed in the dev environment, and this code is\n", + "\n", + "moved to staging and then production. Models will be trained in each environment: initially in the dev\n", + "\n", + "environment as part of model development, in staging (on a limited subset of data) as part of integration\n", + "\n", + "tests, and finally in the production environment (on the full production data) to produce the final model.\n", + "\n", + "If an organization restricts data scientists’ access to production data from dev or staging environments,\n", + "\n", + "deploying code allows training on production data while respecting access controls. Since training code\n", + "\n", + "goes through code review and testing, it is safer to set up automated retraining. Ancillary code follows the\n", + "\n", + "same pattern as model training code, and both can go through integration tests in staging. However, the\n", + "\n", + "learning curve for handing code off to collaborators can be steep for many data scientists, so opinionated\n", + "\n", + "project templates and workflows are helpful. 
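\n", + "\n", + "
To make the “deploy code” pattern more concrete, the sketch below shows one way a single training entry point could be promoted unchanged from dev to staging to prod, with only its target environment switched at run time. The experiment paths, registered model name and data location are illustrative assumptions rather than part of this reference architecture.\n", + "\n", + "
```python\n", + "
# Minimal sketch of a \"deploy code\" training entry point (illustrative names only).\n", + "
# The same script is promoted through dev -> staging -> prod; only --env changes.\n", + "
import argparse\n", + "
\n", + "
import mlflow\n", + "
import pandas as pd\n", + "
from sklearn.ensemble import RandomForestClassifier\n", + "
from sklearn.metrics import accuracy_score\n", + "
from sklearn.model_selection import train_test_split\n", + "
\n", + "
# Hypothetical setting: only the prod run registers a model version in the Model Registry.\n", + "
REGISTERED_MODEL = {\"dev\": None, \"staging\": None, \"prod\": \"fraud_classifier\"}\n", + "
\n", + "
def main(env: str) -> None:\n", + "
    # Placeholder data source; on Databricks this would read the environment's feature table.\n", + "
    df = pd.read_parquet(f\"/tmp/{env}_training_data.parquet\")\n", + "
    X_train, X_test, y_train, y_test = train_test_split(\n", + "
        df.drop(columns=[\"label\"]), df[\"label\"], test_size=0.2, random_state=42)\n", + "
\n", + "
    mlflow.set_experiment(f\"/Shared/fraud_classifier_{env}\")  # hypothetical experiment path\n", + "
    with mlflow.start_run():\n", + "
        model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)\n", + "
        mlflow.log_metric(\"test_accuracy\", accuracy_score(y_test, model.predict(X_test)))\n", + "
        mlflow.sklearn.log_model(model, \"model\", registered_model_name=REGISTERED_MODEL[env])\n", + "
\n", + "
if __name__ == \"__main__\":\n", + "
    parser = argparse.ArgumentParser()\n", + "
    parser.add_argument(\"--env\", choices=[\"dev\", \"staging\", \"prod\"], default=\"dev\")\n", + "
    main(parser.parse_args().env)\n", + "
```\n", + "\n", + "
Because only the prod run registers a model version, a downstream CD process can pick that version up for validation and promotion, as described later in this chapter.\n", + "\n", + "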
Finally, data scientists need visibility into training results from\n", + "\n", + "the production environment, for only they have the knowledge to identify and fix ML-specific issues.\n", + "\n", + "\n", + "-----\n", + "\n", + "The diagram below contrasts the code lifecycle for the above deployment patterns across the different\n", + "\n", + "execution environments.\n", + "\n", + "\n", + "Code\n", + "development\n", + "\n", + "Development\n", + "environment\n", + "\n", + "\n", + "Unit\n", + "tests\n", + "\n", + "\n", + "Integration\n", + "tests\n", + "\n", + "Development\n", + "environment\n", + "\n", + "Staging\n", + "environment\n", + "\n", + "\n", + "Model\n", + "training\n", + "\n", + "\n", + "Continuous\n", + "deployment\n", + "\n", + "Staging\n", + "environment\n", + "\n", + "Production\n", + "environment\n", + "\n", + "\n", + "Deploy\n", + "pipelines\n", + "\n", + "Production\n", + "environment\n", + "\n", + "\n", + "#### Deploy models\n", + "\n", + " Deploy code\n", + "\n", + "\n", + "**In general we recommend following the “deploy code” approach, and the reference architecture in**\n", + "\n", + "**this document is aligned to it.** Nevertheless, there is no perfect process that covers every scenario, and\n", + "\n", + "the options outlined above are not mutually exclusive. Within a single organization, you may find some use\n", + "\n", + "cases deploying training code and others deploying model artifacts. Your choice of process will depend on\n", + "\n", + "the business use case, resources available and what is most likely to succeed.\n", + "\n", + "\n", + "-----\n", + "\n", + "|Col1|Col2|DEPLOY MODELS|DEPLOY CODE|\n", + "|---|---|---|---|\n", + "|Process|Dev|Develop training code. Develop ancillary code.1 Train model on prod data.  Promote model and ancillary code.|Develop training code. Develop ancillary code.  Promote code.|\n", + "||Staging|Test model and ancillary code.  Promote model and ancillary code.|Train model on data subset. Test ancillary code.  Promote code.|\n", + "||Prod|Deploy model. Deploy ancillary pipelines.|Train model on prod data. Test model. Deploy model. Deploy ancillary pipelines.|\n", + "|Trade-offs|Automation| Does not support automated retraining in locked-down env.| Supports automated retraining in locked-down env.|\n", + "||Data access control| Dev env needs read access to prod training data.| Only prod env needs read access to prod training data.|\n", + "||Reproducible models| Less eng control over training env, so harder to ensure reproducibility.| Eng control over training env, which helps to simplify reproducibility.|\n", + "||Data science familiarity| DS team builds and can directly test models in their dev env.| DS team must learn to write and hand off modular code to eng.|\n", + "||Support for large projects| T\u0007his pattern does not force the DS team to use modular code for model training, and it has less iterative testing.| \u0007This pattern forces the DS team to use modular code and iterative testing, which helps with coordination and development in larger projects.|\n", + "||Eng setup and maintenance| Has the simplest setup, with less CI/CD infra required.| \u0007Requires CI/CD infra for unit and integration tests, even for one-off models.|\n", + "|When to use||Use this pattern when your model is a one-off or when model training is very expensive. Use when dev, staging and prod are not strictly separated envs.|Use this pattern by default. 
Use when dev, staging and prod are strictly separated envs.|\n", + "\n", + "
**Table 2** **1** “Ancillary code” refers to code for ML pipelines other than the model training pipeline. Ancillary code could be featurization, inference, monitoring or other pipelines.\n", + "\n", + "
-----\n", + "\n", + "
**CHAPTER 3:**\n", + "
## MLOps Architecture and Process\n", + "\n", + "
###### Lakehouse Platform\n", + "\n", + "
(Lakehouse Platform diagram: Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML workloads running on Unity Catalog for fine-grained governance for data and AI, Delta Lake for data reliability and performance, and a Cloud Data Lake holding all structured and unstructured data.)\n", + "\n", + "
#### Architecture components\n", + "\n", + "
Before unpacking the reference architecture, take a moment to familiarize yourself with the Databricks features used to facilitate MLOps in the workflow prescribed.\n", + "\n", + "
###### Data Lakehouse\n", + "\n", + "
A [Data Lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) unifies the best elements of data lakes and data warehouses — delivering data management and performance typically found in data warehouses with the low-cost, flexible object stores offered by data lakes. Data in the lakehouse are typically organized using a “medallion” architecture of Bronze, Silver and Gold tables of increasing refinement and quality.\n", + "\n", + "
###### MLflow\n", + "\n", + "
[MLflow](https://www.mlflow.org/) is an open source project for managing the end-to-end machine learning lifecycle. It has the following primary components:\n", + "\n", + "
\u0007 **Tracking:** Allows you to track experiments to record and compare parameters, metrics and model artifacts. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/tracking.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/tracking.html) .\n", + "\n", + "
\u0007 **Models (“MLflow flavors”):** Allows you to store and deploy models from any ML library to a variety of model serving and inference platforms. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/models.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/models) | [GCP](https://docs.gcp.databricks.com/applications/mlflow/models.html) .\n", + "\n", + "
\u0007 **Model Registry:** Provides a centralized model store for managing models’ full lifecycle stage transitions: from staging to production, with capabilities for versioning and annotating. The registry also provides webhooks for automation and continuous deployment. 
See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-registry.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-registry.html) .\n", + "| |\n", + "\n", + "Databricks also provides a fully managed and hosted version of MLflow with enterprise security features,\n", + "\n", + "high availability, and other Databricks workspace features such as experiment and run management and\n", + "\n", + "notebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\n", + "\n", + "machine learning model training runs and running machine learning projects.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Databricks and MLflow Autologging\n", + "\n", + "Databricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\n", + "\n", + "experiment tracking for machine learning training sessions on Databricks. Databricks Autologging\n", + "\n", + "\n", + "automatically captures model parameters, metrics, files and lineage information when you train models with\n", + "\n", + "training runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html) .\n", + "| |\n", + "\n", + "###### Feature Store\n", + "\n", + "The Databricks Feature Store is a centralized repository of features. It enables feature sharing and discovery\n", + "\n", + "\n", + "across an organization and also ensures that the same feature computation code is used for model training\n", + "\n", + "and inference. See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html) .\n", + "| |\n", + "\n", + "###### MLflow Model Serving\n", + "\n", + "MLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints\n", + "\n", + "\n", + "that are updated automatically based on the availability of model versions and their stages. See\n", + "\n", + "documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html) .\n", + "| |\n", + "\n", + "###### Databricks SQL\n", + "\n", + "Databricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their\n", + "\n", + "\n", + "data lake, create multiple visualization types to explore query results from different perspectives, and build\n", + "\n", + "and share dashboards. 
+ "\n",
+ "###### MLflow Model Serving\n",
+ "\n",
+ "MLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints that are updated automatically based on the availability of model versions and their stages. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html) .\n",
+ "\n",
+ "###### Databricks SQL\n",
+ "\n",
+ "Databricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their data lake, create multiple visualization types to explore query results from different perspectives, and build and share dashboards. See documentation for [AWS](https://docs.databricks.com/sql/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/sql/) [GCP](https://docs.gcp.databricks.com/sql/index.html) .\n",
+ "\n",
+ "###### Databricks Workflows and Jobs\n",
+ "\n",
+ "Databricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive ways. For ML, Jobs can be used to define pipelines for computing features, training models, or other ML steps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html) .\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "#### Reference architecture\n",
+ "\n",
+ "We are now ready to review a general reference architecture for implementing MLOps on the Databricks Lakehouse platform using the recommended “deploy code” pattern from earlier. This is intended to cover the majority of use cases and ML techniques, but it is by no means comprehensive. When appropriate, we will highlight alternative approaches to implementing different parts of the process.\n",
+ "\n",
+ "We begin with an overview of the system end-to-end, followed by more detailed views of the process in development, staging and production environments. These diagrams show the system as it operates in a steady state, with the finer details of iterative development cycles omitted. This structure is summarized below.\n",
+ "\n",
+ "**O V E R V I E W**\n",
+ "\n",
+ "**dev**\n",
+ "\n",
+ "\u0007Data\n",
+ "\u0007Exploratory data analysis (EDA)\n",
+ "\u0007Project code\n",
+ "\u0007Feature table refresh\n",
+ "\u0007Model training\n",
+ "\u0007Commit code\n",
+ "\n",
+ "**staging**\n",
+ "\n",
+ "\u0007Merge request\n",
+ "\u0007Unit tests (CI)\n",
+ "\u0007Integration tests (CI)\n",
+ "\u0007Merge\n",
+ "\u0007Cut release branch\n",
+ "\n",
+ "**prod**\n",
+ "\n",
+ "\u0007Feature table refresh\n",
+ "\u0007Model training\n",
+ "\u0007Continuous deployment (CD)\n",
+ "\u0007Online serving (REST APIs)\n",
+ "\u0007Inference: batch or streaming\n",
+ "\u0007Monitoring\n",
+ "\u0007Retraining\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Overview\n",
+ "\n",
+ "**Figure 3** [Overview diagram: source control flows from a dev branch to the staging (main) branch via merge request, and from there to a release branch that is pulled into production. The development environment runs exploratory data analysis, feature table refresh, model training and inference & serving against dev data; the staging environment runs unit and integration tests (CI); the production environment runs feature table refresh, model training, continuous deployment (CD), inference & serving and monitoring from the release branch, with models pushed to the Model Registry and promoted through stages None, Staging and Production.]\n",
+ "\n",
+ "Here we see the overall process for deploying code and model artifacts, the inputs and outputs for pipelines, and model lifecycle stages in production. Code source control is the primary conduit for deploying ML pipelines from development to production. Pipelines and models are prototyped on a dev branch in the development environment, and changes to the codebase are committed back to source control. Upon merge request to the staging branch (usually the “main” branch), a continuous integration (CI) process tests the code in the staging environment. If the tests pass, new code can be deployed to production by cutting a code release. In production, a model is trained on the full production data and pushed to the MLflow Model Registry. A continuous deployment (CD) process tests the model and promotes it toward the production stage in the registry. The Model Registry’s production model can be served via batch, streaming or REST API.\n",
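+ "\n",
+ "To illustrate the registry hand-off described above, here is a minimal sketch (not from the original text) of registering a newly trained model and requesting its promotion with the MLflow client; the model name is hypothetical, and it assumes a training run was just logged in the current session.\n",
+ "\n",
+ "```python\n",
+ "# Minimal sketch: register a logged model and move it through registry stages.\n",
+ "# The model name is an illustrative assumption.\n",
+ "import mlflow\n",
+ "from mlflow.tracking import MlflowClient\n",
+ "\n",
+ "run_id = mlflow.last_active_run().info.run_id  # assumes a run was logged in this session\n",
+ "\n",
+ "model_version = mlflow.register_model(\n",
+ "    model_uri=f\"runs:/{run_id}/model\",  # artifact path used when logging the model\n",
+ "    name=\"mlops_example_model\",\n",
+ ")\n",
+ "\n",
+ "# In this architecture the CD pipeline runs checks before each transition;\n",
+ "# here we simply move the new version to Staging.\n",
+ "MlflowClient().transition_model_version_stage(\n",
+ "    name=\"mlops_example_model\",\n",
+ "    version=model_version.version,\n",
+ "    stage=\"Staging\",\n",
+ ")\n",
+ "```\n",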
+ "\n",
+ "###### Dev\n",
+ "\n",
+ "In the development environment, data scientists and ML engineers can collaborate on all pipelines in an ML project, committing their changes to source control. While engineers may help to configure this environment, data scientists typically have significant control over the libraries, compute resources and code that they use.\n",
+ "\n",
+ "**Figure 4** [Development environment diagram: exploratory data analysis; a project repo with train.py, deploy.py, inference.py, monitoring.py, data/featurization.py and tests (unit.py, integration.py); feature table refresh (data preparation, featurization), model training (training and tuning, evaluation) and inference (streaming or batch) pipelines reading prod data (Bronze/Silver/Gold and feature tables) and writing dev feature tables and temp tables in the Lakehouse; runs logged to the tracking server (metrics, parameters, models); work happens on a dev branch created from, and committed back to, source control.]\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Data\n",
+ "\n",
+ "Data scientists working in the dev environment possess read-only access to production data. They also require read-write access to a separate dev storage environment to develop and experiment with new features and other data tables.\n",
+ "\n",
+ "###### Exploratory data analysis (EDA)\n",
+ "\n",
+ "The data scientist explores and analyzes data in an interactive, iterative process. This process is used to assess whether the available data has the potential to address the business problem. EDA is also where the data scientist will begin discerning what data preparation and featurization are required for model training. This ad hoc process is generally not part of a pipeline that will be deployed in other execution environments.\n",
+ "\n",
+ "###### Project code\n",
+ "\n",
+ "This is a code repository containing all of the pipelines or modules involved in the ML system. Dev branches are used to develop changes to existing pipelines or to create new ones. Even during EDA and initial phases of a project, it is recommended to develop within a repository to help with tracking changes and sharing code.\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Feature table refresh\n",
+ "\n",
+ "This pipeline reads from raw data tables and feature tables and writes to tables in the Feature Store. The pipeline consists of two steps:\n",
+ "\n",
+ "\u0007 **Data preparation**\n",
+ "This step checks for and corrects any data quality issues prior to featurization.\n",
+ "\n",
+ "**\u0007Featurization**\n",
+ "In the dev environment, new features and updated featurization logic can be tested by writing to feature tables in dev storage, and these dev feature tables can be used for model prototyping. Once this featurization code is promoted to production, these changes will affect the production feature tables. Features already available in production feature tables can be read directly for development.\n",
+ "\n",
+ "In some organizations, feature engineering pipelines are managed separately from ML projects. 
In such\n", + "\n", + "cases, the featurization pipeline can be omitted from this architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Model training\n", + "\n", + "Data scientists develop the model training pipeline in the dev environment with dev or prod feature tables.\n", + "\n", + "\u0007 **Training and tuning**\n", + "\n", + "The training process reads features from the feature store and/or Silver- or Gold-level Lakehouse tables,\n", + "\n", + "and it logs model parameters, metrics and artifacts to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . After training and\n", + "\n", + "hyperparameter tuning, the final model artifact is logged to the tracking server to record a robust link\n", + "\n", + "between the model, its input data, and the code used to generate it.\n", + "\n", + "**\u0007Evaluation**\n", + "\n", + "Model quality is evaluated by testing on held-out data. The results of these tests are logged to the\n", + "\n", + "MLflow tracking server.\n", + "\n", + "If governance requires additional metrics or supplemental documentation about the model, this is the\n", + "\n", + "time to add them using MLflow tracking. Model interpretations (e.g., plots produced by [SHAP](https://shap.readthedocs.io/en/latest/index.html) or [LIME](https://arxiv.org/abs/1602.04938) )\n", + "\n", + "and plain text descriptions are common, but defining the specifics for such governance requires input\n", + "\n", + "from business stakeholders or a data governance officer.\n", + "\n", + "**\u0007Model output**\n", + "\n", + "The output of this pipeline is an ML model artifact stored in the MLflow tracking server. When this\n", + "\n", + "training pipeline is run in staging or production, ML engineers (or their CI/CD code) can load the model\n", + "\n", + "via the model URI (or path) and then push the model to the Model Registry for management and testing.\n", + "\n", + "###### Commit code\n", + "\n", + "After developing code for featurization, training, inference and other pipelines, the data scientist or\n", + "\n", + "ML engineer commits the dev branch changes into source control. This section does not discuss the\n", + "\n", + "continuous deployment, inference or monitoring pipelines in detail; see the “Prod” section below for more\n", + "\n", + "information on those.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Staging\n", + "\n", + "The transition of code from development to production occurs in the staging environment. This code\n", + "\n", + "includes model training code and ancillary code for featurization, inference, etc. 
Both data scientists and ML engineers are responsible for writing tests for code and models, but ML engineers manage the continuous integration pipelines and orchestration.\n",
+ "\n",
+ "**Figure 5** [Staging environment diagram: a merge request from the dev branch to the staging (main) branch triggers CI; unit tests run first, followed by integration tests covering the Feature Store, model training, model deployment, inference and model monitoring pipelines against temporary staging data in the Lakehouse, with runs logged to the tracking server and Model Registry; once CI passes, a release branch is cut.]\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Data\n",
+ "\n",
+ "The staging environment may have its own storage area for testing feature tables and ML pipelines. This data is generally temporary and only retained long enough to run tests and to investigate test failures. This data can be made readable from the development environment for debugging.\n",
+ "\n",
+ "###### Merge code\n",
+ "\n",
+ "\u0007 **Merge request**\n",
+ "The deployment process begins when a merge (or pull) request is submitted against the staging branch of the project in source control. It is common to use the “main” branch as the staging branch.\n",
+ "\n",
+ "**\u0007Unit tests (CI)**\n",
+ "This merge request automatically builds source code and triggers unit tests. If tests fail, the merge request is rejected.\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Integration tests (CI)\n",
+ "\n",
+ "The merge request then goes through integration tests, which run all pipelines to confirm that they function correctly together. The staging environment should mimic the production environment as much as is reasonable, running and testing pipelines for featurization, model training, inference and monitoring.\n",
+ "\n",
+ "Integration tests can trade off fidelity of testing for speed and cost. For example, when models are expensive to train, it is common to test model training on small data sets or for fewer iterations to reduce cost. When models are deployed behind REST APIs, some high-SLA models may merit full-scale load testing within these integration tests, whereas other models may be tested with small batch jobs or a few queries to temporary REST endpoints.\n",
+ "\n",
+ "Once integration tests pass on the staging branch, the code may be promoted toward production.\n",
+ "\n",
+ "\u0007 **Merge**\n",
+ "If all tests pass, the new code is merged into the staging branch of the project. If tests fail, the CI/CD system should notify users and post results on the merge (pull) request.\n",
+ "\n",
+ "Note: It can be useful to schedule periodic integration tests on the staging branch, especially if the branch is updated frequently with concurrent merge requests.\n",
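+ "\n",
+ "To make the CI steps above concrete, here is a minimal pytest-style sketch (not from the original text) of a fast check that exercises the training step on a tiny dataset, in the spirit of testing model training on small data to reduce cost; the `pipelines.train.train_model` helper and its return values are hypothetical stand-ins for project code.\n",
+ "\n",
+ "```python\n",
+ "# Minimal CI test sketch: run the training step on a tiny, in-memory dataset.\n",
+ "# `pipelines.train.train_model` is a hypothetical project module, not a real API.\n",
+ "import pandas as pd\n",
+ "import pytest\n",
+ "\n",
+ "from pipelines.train import train_model  # hypothetical helper in the project repo\n",
+ "\n",
+ "\n",
+ "@pytest.fixture\n",
+ "def tiny_training_data() -> pd.DataFrame:\n",
+ "    return pd.DataFrame(\n",
+ "        {\"feature_a\": [0.1, 0.4, 0.35, 0.8], \"label\": [0, 0, 1, 1]}\n",
+ "    )\n",
+ "\n",
+ "\n",
+ "def test_train_model_on_tiny_dataset(tiny_training_data):\n",
+ "    # Smoke-level check: the pipeline runs end to end and reports a metric.\n",
+ "    model, metrics = train_model(tiny_training_data, n_estimators=2)\n",
+ "    assert model is not None\n",
+ "    assert 0.0 <= metrics[\"accuracy\"] <= 1.0\n",
+ "```\n",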
+ "\n",
+ "###### Cut release branch\n",
+ "\n",
+ "Once CI tests have passed on a commit in the staging branch, ML engineers can cut a release branch from that commit.\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Prod\n",
+ "\n",
+ "The production environment is typically managed by a select set of ML engineers and is where ML pipelines directly serve the business or application. These pipelines compute fresh feature values, train and test new model versions, publish predictions to downstream tables or applications, and monitor the entire process to avoid performance degradation and instability. While we illustrate batch and streaming inference alongside online serving below, most ML applications will use only one of these methods, depending on the business requirements.\n",
+ "\n",
+ "**Figure 6** [Production environment diagram: feature table refresh (data preparation, featurization), model training (training and tuning, evaluation), continuous deployment (CD), inference (batch or streaming), online serving and monitoring pipelines all run from the release branch against Lakehouse data, feature and monitoring tables. Newly trained models are registered and a stage transition is requested; CD loads each candidate for testing, runs compliance checks, compares Staging vs. Production and promotes the model through Model Registry stages None, Staging and Production. Serving and inference jobs load the Production model, log requests and predictions, and publish predictions; monitoring ingests those logs, checks model performance and data drift, publishes metrics and can trigger model training.]\n",
+ "\n",
+ "Though data scientists may not have write or compute access in the production environment, it is important to provide them with visibility to test results, logs, model artifacts and the status of ML pipelines in production. 
This visibility allows them to identify and diagnose problems in production.\n", + "\n", + "###### Feature table refresh\n", + "\n", + "This pipeline transforms the latest production Lakehouse data into production feature tables. It can use batch\n", + "\n", + "or streaming computation, depending on the freshness requirements for downstream training and inference.\n", + "\n", + "The pipeline can be defined as a [Databricks Job](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.rxs6npet1ull) which is scheduled, triggered or continuously running.\n", + "\n", + "###### Model training\n", + "\n", + "The model training pipeline runs either when code changes affect upstream featurization or training logic, or\n", + "\n", + "when automated retraining is scheduled or triggered. This pipeline runs on the full production data.\n", + "\n", + "\u0007 **Training and tuning**\n", + "\n", + "During the training process, logs are recorded to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . These include model\n", + "\n", + "metrics, parameters, tags and the model itself.\n", + "\n", + "During development, data scientists may test many algorithms and hyperparameters, but it is common\n", + "\n", + "to restrict those choices to the top-performing options in the production training code. Restricting tuning\n", + "\n", + "can reduce the variance from tuning in automated retraining, and it can make training and tuning faster.\n", + "\n", + "**\u0007Evaluation**\n", + "\n", + "Model quality is evaluated by testing on held-out production data. The results of these tests are\n", + "\n", + "logged to the MLflow tracking server. During development, data scientists will have selected meaningful\n", + "\n", + "evaluation metrics for the use case, and those metrics or their custom logic will be used in this step.\n", + "\n", + "**\u0007Register and request transition**\n", + "\n", + "Following model training, the model artifact is registered to the [MLflow Model Registry](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) of the production\n", + "\n", + "environment, set initially to ’stage=None’. The final step of this pipeline is to request a transition of the\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Continuous deployment (CD)\n", + "\n", + "The CD pipeline is executed when the training pipeline finishes and requests to transition the model to\n", + "\n", + "‘stage=Staging’. There are three key tasks in this pipeline:\n", + "\n", + "\u0007 **Compliance checks**\n", + "\n", + "These tests load the model from the Model Registry, perform compliance checks (for tags, documentation,\n", + "\n", + "etc.), and approve or reject the request based on test results. If compliance checks require human\n", + "\n", + "expertise, this automated step can compute statistics or visualizations for people to review in a manual\n", + "\n", + "approval step at the end of the CD pipeline. Regardless of the outcome, results for that model version\n", + "\n", + "are recorded to the Model Registry through metadata in tags and comments in descriptions.\n", + "\n", + "The MLflow UI can be used to manage stage transition requests manually, but requests and transitions\n", + "\n", + "can be automated via MLflow APIs and [webhooks](https://docs.databricks.com/applications/mlflow/model-registry-webhooks.html) . 
If the model passes the compliance checks, then\n", + "\n", + "the transition request is approved and the model is promoted to ‘stage=Staging’. If the model fails, the\n", + "\n", + "transition request is rejected and the model is moved to ‘stage=Archived’ in the Model Registry.\n", + "\n", + "**\u0007Compare staging vs. production**\n", + "\n", + "To prevent performance degradation, models promoted to ‘stage=Staging’ must be compared to the\n", + "\n", + "‘stage=Production’ models they are meant to replace. The metric(s) for comparison should be defined\n", + "\n", + "according to the use case, and the method for comparison can vary from canary deployments to A/B\n", + "\n", + "tests. All comparison results are saved to metrics tables in the lakehouse.\n", + "\n", + "If this is the first deployment and there is no ‘stage=Production’ model yet, the ‘stage=Staging’ model\n", + "\n", + "should be compared to a business heuristic or other threshold as a baseline. For a new version\n", + "\n", + "of an existing ‘stage=Production’ model, the ‘stage=Staging’ model is compared with the current\n", + "\n", + "‘stage=Production’ model.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Request model transition to production**\n", + "\n", + "If the candidate model passes the comparison tests, a request is made to transition it to\n", + "\n", + "‘stage=Production’ in the Model Registry. As with other stage transition requests, notifications,\n", + "\n", + "approvals and rejections can be managed manually via the MLflow UI or automatically through APIs and\n", + "\n", + "webhooks. This is also a good time to consider human oversight, as it is the last step before a model is\n", + "\n", + "fully available to downstream applications. A person can manually review the compliance checks and\n", + "\n", + "performance comparisons to perform checks which are difficult to automate.\n", + "\n", + "###### Online serving (REST APIs)\n", + "\n", + "For lower throughput and lower latency use cases, online serving is generally necessary. With MLflow, it is\n", + "\n", + "simple to deploy models to [Databricks Model Serving](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.72shqep1kelf) , cloud provider serving endpoints, or on-prem or\n", + "\n", + "custom serving layers.\n", + "\n", + "In all cases, the serving system loads the production model from the Model Registry upon initialization. On\n", + "\n", + "each request, it fetches features from an online Feature Store, scores the data and returns predictions. The\n", + "\n", + "serving system, data transport layer or the model itself could log requests and predictions.\n", + "\n", + "###### Inference: batch or streaming\n", + "\n", + "This pipeline is responsible for reading the latest data from the Feature Store, loading the model from\n", + "\n", + "‘stage=Production’ in the Model Registry, performing inference and publishing predictions. 
For higher throughput, higher latency use cases, batch or streaming inference is generally the most cost-effective option.\n",
+ "\n",
+ "A batch job would likely publish predictions to Lakehouse tables, over a JDBC connection, or to flat files. A streaming job would likely publish predictions either to Lakehouse tables or to message queues like Apache Kafka®.\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Monitoring\n",
+ "\n",
+ "Input data and model predictions are monitored, both for statistical properties (data drift, model performance, etc.) and for computational performance (errors, throughput, etc.). These metrics are published for dashboards and alerts.\n",
+ "\n",
+ "\u0007 **Data ingestion**\n",
+ "This pipeline reads in logs from batch, streaming or online inference.\n",
+ "\n",
+ "**\u0007Check accuracy and data drift**\n",
+ "The pipeline then computes metrics about the input data, the model’s predictions and the infrastructure performance. Metrics that measure statistical properties are generally chosen by data scientists during development, whereas metrics for infrastructure are generally chosen by ML engineers.\n",
+ "\n",
+ "\u0007 **Publish metrics**\n",
+ "The pipeline writes to Lakehouse tables for analysis and reporting. Tools such as [Databricks SQL](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.nsthucrt9k77) are used to produce monitoring dashboards, allowing for health checks and diagnostics. The monitoring job or the dashboarding tool issues notifications when health metrics surpass defined thresholds.\n",
+ "\n",
+ "**\u0007Trigger model training**\n",
+ "When the model monitoring metrics indicate performance issues, or when a model inevitably becomes out of date, the data scientist may need to return to the development environment and develop a new model version.\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Retraining\n",
+ "\n",
+ "This architecture supports automatic retraining using the same model training pipeline above. While we recommend beginning with manually triggered retraining, organizations can add scheduled and/or triggered retraining when needed.\n",
+ "\n",
+ "\u0007 **Scheduled**\n",
+ "If fresh data are regularly made available, rerunning model training on a defined schedule can help models to keep up with changing trends and behavior.\n",
+ "\n",
+ "**\u0007Triggered**\n",
+ "If the monitoring pipeline can identify model performance issues and send alerts, it can additionally trigger retraining. For example, if the distribution of incoming data changes significantly or if the model performance degrades, automatic retraining and redeployment can boost model performance with minimal human intervention.\n",
+ "\n",
+ "When the featurization or retraining pipelines themselves begin to exhibit performance issues, the data scientist may need to return to the dev environment and resume experimentation to address such issues.\n",
+ "\n",
+ "**Note:** While automated retraining is supported in this architecture, it isn’t required, and caution must be taken in cases where it is implemented. It is inherently difficult to automate selecting the correct action to take from model monitoring alerts. For example, if data drift is observed, does it indicate that we should automatically retrain, or does it indicate that we should engineer additional features to encode some new signal in the data?\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "**CHAPTER 4:**\n",
+ "## LLMOps – Large Language Model Operations\n",
+ "\n",
+ "#### Large language models\n",
+ "\n",
+ "LLMs have splashed into the mainstream of business and news, and there is no doubt that they will disrupt countless industries. In addition to bringing great potential, they present a new set of questions for MLOps:\n",
+ "\n",
+ "\u0007Is prompt engineering part of operations, and if so, what is needed?\n",
+ "\n",
+ "\u0007Since the “large” in “LLM” is an understatement, how do cost/performance trade-offs change?\n",
+ "\n",
+ "\u0007Is it better to use paid APIs or to fine-tune one’s own model?\n",
+ "\n",
+ "…and many more!\n",
+ "\n",
+ "The good news is that “LLMOps” (MLOps for LLMs) is not that different from traditional MLOps. However, some parts of your MLOps platform and process may require changes, and your team will need to learn a mental model of how LLMs coexist alongside traditional ML in your operations.\n",
+ "\n",
+ "In this section, we will explain what may change for MLOps when introducing LLMs. We will discuss several key topics in detail, from prompt engineering to packaging, to cost/performance trade-offs. We also provide a reference architecture diagram to illustrate what may change in your production environment.\n",
+ "\n",
+ "###### What changes with LLMs?\n",
+ "\n",
+ "For those not familiar with large language models (LLMs), see [this summary](https://www.databricks.com/product/machine-learning/large-language-models) for a quick introduction. The one-sentence summary is: LLMs are a new class of natural language processing (NLP) models that have significantly surpassed their predecessors in performance across a variety of tasks, such as open-ended question answering, summarization and execution of near-arbitrary instructions.\n",
+ "\n",
+ "From the perspective of MLOps, LLMs bring new requirements, with implications for MLOps practices and platforms. 
We briefly summarize key properties of LLMs and the implications for MLOps here, and we delve\n", + "\n", + "into more detail in the next section.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Table 3**\n", + "\n", + "\n", + "\n", + "|KEY PROPERTIES OF LLMS|IMPLICATIONS FOR MLOPS|\n", + "|---|---|\n", + "|LLMs are available in many forms: \u0007Very general proprietary models behind paid APIs \u0007Open source models that vary from general to specific applications \u0007Custom models fine-tuned for specific applications|Development process: Projects often develop incrementally, starting from existing, third-party or open source models and ending with custom fine-tuned models.|\n", + "|Many LLMs take general natural language queries and instructions as input. Those queries can contain carefully engineered “prompts” to elicit the desired responses.|Development process: Designing text templates for querying LLMs is often an important part of developing new LLM pipelines. Packaging ML artifacts: Many LLM pipelines will use existing LLMs or LLM serving endpoints; the ML logic developed for those pipelines may focus on prompt templates, agents or “chains” instead of the model itself. The ML artifacts packaged and promoted to production may frequently be these pipelines, rather than models.|\n", + "|Many LLMs can be given prompts with examples and context, or additional information to help answer the query.|Serving infrastructure: When augmenting LLM queries with context, it is valuable to use previously uncommon tooling such as vector databases to search for relevant context.|\n", + "|LLMs are very large deep learning models, often ranging from gigabytes to hundreds of gigabytes.|Serving infrastructure: Many LLMs may require GPUs for real-time model serving. Cost/performance trade-offs: Since larger models require more computation and are thus more expensive to serve, techniques for reducing model size and computation may be required.|\n", + "|LLMs are hard to evaluate via traditional ML metrics since there is often no single “right” answer.|Human feedback: Since human feedback is essential for evaluating and testing LLMs, it must be incorporated more directly into the MLOps process, both for testing and monitoring and for future fine-tuning.|\n", + "\n", + "\n", + "-----\n", + "\n", + "The list above may look long, but as we will see in the next section, many existing tools and processes\n", + "\n", + "only require small adjustments in order to adapt to these new requirements. Moreover, many aspects\n", + "\n", + "do not change:\n", + "\n", + "\u0007The separation of development, staging and production remains the same\n", + "\n", + "\u0007Git version control and model registries remain the primary conduits for promoting pipelines and\n", + "\n", + "models toward production\n", + "\n", + "\u0007The lakehouse architecture for managing data remains valid and essential for efficiency\n", + "\n", + "\u0007Existing CI/CD infrastructure should not require changes\n", + "\n", + "\u0007The modular structure of MLOps remains the same, with pipelines for data refresh, model tuning,\n", + "\n", + "model inference, etc.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Discussion of key topics for LLMOps\n", + "\n", + "So far, we have listed top potential changes to MLOps as you introduce LLMs. 
In this section, we will dive into\n", + "\n", + "more details about selected topics.\n", + "\n", + "###### Prompt engineering\n", + "\n", + "Prompt engineering is the practice of adjusting the text prompt given to an LLM in order to elicit better\n", + "\n", + "responses — using engineering techniques. It is a very new practice, but some best practices are emerging.\n", + "\n", + "We will cover a few tips and best practices and link to useful resources.\n", + "\n", + "**1** \u0007Prompts and prompt engineering are model-specific. A prompt given to two different models will\n", + "\n", + "generally _not_ produce the same results. Similarly, prompt engineering tips do not apply to all models.\n", + "\n", + "In the extreme case, many LLMs have been fine-tuned for specific NLP tasks and do not even require\n", + "\n", + "prompts. On the other hand, very general LLMs benefit greatly from carefully crafted prompts.\n", + "\n", + "**2** \u0007When approaching prompt engineering, go from simple to complex: track, templatize and automate.\n", + "\n", + "\u0007Start by tracking queries and responses so that you can compare them and iterate to improve\n", + "\n", + "prompts. Existing tools such as MLflow provide tracking capabilities; see [MLflow LLM Tracking](https://mlflow.org/docs/latest/llm-tracking.html) for\n", + "\n", + "more details. Checking structured LLM pipeline code into version control also helps with prompt\n", + "\n", + "development, for git diffs allow you to review changes to prompts over time. Also see the section\n", + "\n", + "below on packaging model and pipelines for more information about tracking prompt versions.\n", + "\n", + "\u0007Then, consider using tools for building prompt templates, especially if your prompts become complex.\n", + "\n", + "Newer LLM-specific tools such as [LangChain](https://python.langchain.com/en/latest/index.html) and [LlamaIndex](https://gpt-index.readthedocs.io/en/latest/) provide such templates and more.\n", + "\n", + "\u0007Finally, consider automating prompt engineering by replacing manual engineering with automated\n", + "\n", + "tuning. Prompt tuning turns prompt development into a data-driven process akin to hyperparameter\n", + "\n", + "tuning for traditional ML. The [Demonstrate-Search-Predict (DSP) Framework](https://github.com/stanfordnlp/dsp) is a good example of a\n", + "\n", + "tool for prompt tuning.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources\n", + "\n", + "There are lots of good resources about\n", + "prompt engineering, especially for popular\n", + "\n", + "models and services:\n", + "\n", + "\u0007DeepLearning.AI course on [ChatGPT](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n", + "\n", + "[Prompt Engineering](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n", + "\n", + "\u0007DAIR.AI [Prompt Engineering Guide](https://www.promptingguide.ai/)\n", + "\n", + "\u0007 [Best practices for prompt engineering](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n", + "\n", + "[with the OpenAI API](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n", + "\n", + "\n", + "**3** \u0007Most prompt engineering tips currently published online are for ChatGPT, due to its immense\n", + "\n", + "popularity. Some of these generalize to other models as well. 
We will provide a few tips here:\n", + "\n", + "\u0007Use clear, specific prompts, which may include an instruction, context (if needed), a user query or\n", + "\n", + "input, and a description of the desired output type or format\n", + "\n", + "\u0007Provide examples in your prompt (“few-shot learning”) to help the LLM to understand what you want\n", + "\n", + "\u0007Tell the model how to behave, such as telling it to admit if it cannot answer a question\n", + "\n", + "\u0007Tell the model to think step-by-step or explain its reasoning\n", + "\n", + "\u0007If your prompt includes user input, use techniques to prevent prompt hacking, such as making it very\n", + "\n", + "clear which parts of the prompt correspond to your instruction vs. user input\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Packaging models or pipelines for deployment\n", + "\n", + "In traditional ML, there are generally two types of ML logic to package for deployment: models and\n", + "\n", + "pipelines. These artifacts are generally managed toward production via a Model Registry and Git version\n", + "\n", + "control, respectively.\n", + "\n", + "With LLMs, it is common to package ML logic in new forms. These may include:\n", + "\n", + "\u0007A lightweight call to an LLM API service (third party or internal)\n", + "\n", + "\u0007A “chain” from LangChain or an analogous pipeline from another tool. The chain may call an LLM API or a\n", + "\n", + "local LLM model.\n", + "\n", + "\u0007An LLM or an LLM+tokenizer pipeline, such as a [Hugging Face](https://huggingface.co/) pipeline. This pipeline may use a\n", + "\n", + "pretrained model or a custom fine-tuned model.\n", + "\n", + "\u0007An engineered prompt, possibly stored as a template in a tool such as LangChain\n", + "\n", + "Though LLMs add new terminology and tools for composing ML logic, all of the above still constitute models\n", + "\n", + "and pipelines. Thus, the same tooling such as [MLflow](https://mlflow.org/) can be used to package LLMs and LLM pipelines for\n", + "\n", + "deployment. [Built-in model flavors](https://mlflow.org/docs/latest/models.html) include:\n", + "\n", + "\u0007PyTorch and TensorFlow\n", + "\n", + "\u0007Hugging Face Transformers (relatedly, see Hugging Face Transformers’s [MLflowCallback](https://huggingface.co/docs/transformers/en/main_classes/callback#transformers.integrations.MLflowCallback) )\n", + "\n", + "\u0007LangChain\n", + "\n", + "\u0007OpenAI API\n", + "\n", + "\u0007(See the [documentation](https://mlflow.org/docs/latest/models.html) for a complete list)\n", + "\n", + "For other LLM pipelines, MLflow can package the pipelines via the [MLflow pyfunc flavor](https://mlflow.org/docs/latest/models.html#python-function-python-function) , which can store\n", + "\n", + "arbitrary Python code.\n", + "\n", + "\n", + "**Note about prompt versioning:** Just as it is helpful\n", + "\n", + "to track model versions, it is helpful to track prompt\n", + "\n", + "versions (and LLM pipeline versions, more generally).\n", + "\n", + "Packaging prompts and pipelines as MLflow Models\n", + "\n", + "simplifies versioning. Just as a newly retrained\n", + "\n", + "model can be tracked as a new model version in the\n", + "\n", + "MLflow Model Registry, a newly updated prompt can\n", + "\n", + "be tracked as a new model version.\n", + "\n", + "**Note about deploying models vs. code:** Your\n", + "\n", + "decisions around packaging ML logic as version\n", + "\n", + "controlled code vs. 
registered models will help\n", + "\n", + "to inform your decision about choosing between\n", + "\n", + "the deploy models, deploy code and hybrid\n", + "\n", + "architectures. Review the subsection below about\n", + "\n", + "human feedback, and make sure that you have a\n", + "\n", + "well-defined testing process for whatever artifacts\n", + "\n", + "you choose to deploy.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Managing cost/performance trade-offs\n", + "\n", + "One of the big Ops topics for LLMs is managing cost/performance trade-offs, especially for inference\n", + "\n", + "and serving. With “small” LLMs having hundreds of millions of parameters and large LLMs having hundreds\n", + "\n", + "of billions of parameters, computation can become a major expense. Thankfully, there are many ways to\n", + "\n", + "manage and reduce costs when needed. We will review some key tips for balancing productivity and costs.\n", + "\n", + "**1** \u0007Start simple, but plan for scaling. When developing a new LLM-powered application, speed of\n", + "\n", + "development is key, so it is acceptable to use more expensive options, such as paid APIs for existing\n", + "\n", + "models. As you go, make sure to collect data such as queries and responses. In the future, you can use\n", + "\n", + "that data to fine-tune a smaller, cheaper model which you can own.\n", + "\n", + "**2** \u0007Scope out your costs. How many queries per second do you expect? Will requests come in bursts?\n", + "\n", + "How much does each query cost? These estimates will inform you about project feasibility and will help\n", + "\n", + "you to decide when to consider bringing the model in-house with open source models and fine-tuning.\n", + "\n", + "**3** \u0007Reduce costs by tweaking LLMs and queries. There are many LLM-specific techniques for reducing\n", + "\n", + "computation and costs. These include shortening queries, tweaking inference configurations and using\n", + "\n", + "smaller versions of models.\n", + "\n", + "**4** \u0007Get human feedback. It is easy to reduce costs but hard to say how changes impact your results,\n", + "\n", + "unless you get human feedback from end users.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources\n", + "\n", + "**Fine-tuning**\n", + "\n", + "\u0007 [Fine-Tuning Large Language Models with](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n", + "\n", + "[Hugging Face and DeepSpeed](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n", + "\n", + "\u0007Webinar: [Build Your Own Large Language](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "\n", + "[Model Like Dolly: How to fine-tune and](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "[deploy your custom LLM](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "\n", + "**Model distillation,**\n", + "**quantization and pruning**\n", + "\n", + "\n", + "###### Methods for reducing costs of inference\n", + "\n", + "**Use a smaller model**\n", + "\n", + "\u0007Pick a different existing model. Try smaller versions of models (such as “t5-small” instead of “t5-base”)\n", + "\n", + "or alternate architectures.\n", + "\n", + "\u0007Fine-tune a custom model. 
With the right training data, a fine-tuned model can often be smaller and/or\n", + "\n", + "perform better than a generic model.\n", + "\n", + "\u0007Use model distillation (or knowledge distillation). This technique “distills” the knowledge of the original\n", + "\n", + "model into a smaller model.\n", + "\n", + "\u0007Reduce floating point precision (quantization). Models can sometimes use lower precision arithmetic\n", + "\n", + "without losing much in quality.\n", + "\n", + "\n", + "\u0007 [Gentle Introduction to 8-bit Matrix](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "\n", + "\n", + "**\u0007Reduce computation for a given model**\n", + "\n", + "\u0007Shorten queries and responses. Computation scales with input and output sizes, so using more concise\n", + "\n", + "queries and responses reduces costs.\n", + "\n", + "\u0007Tweak inference configurations. Some types of inference, such as beam search, require more computation.\n", + "\n", + "**Other**\n", + "\n", + "\u0007Split traffic. If your return on investment (ROI) for an LLM query is low, then consider splitting traffic so that\n", + "\n", + "low ROI queries are handled by simpler, faster models or methods. Save LLM queries for high ROI traffic.\n", + "\n", + "\u0007Use pruning techniques. If you are training your own LLMs, there are pruning techniques that allow\n", + "\n", + "models to use sparse computation during inference. This reduces computation for most or all queries.\n", + "\n", + "\n", + "[Multiplication for transformers at scale](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "[using Hugging Face Transformers,](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "[Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)\n", + "\n", + "\u0007 [Large Transformer Model Inference](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n", + "\n", + "[Optimization](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n", + "\n", + "\u0007 [Making LLMs even more accessible with](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n", + "\n", + "[bitsandbytes, 4-bit quantization and](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n", + "[QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Human feedback, testing, and monitoring\n", + "\n", + "While human feedback is important in many traditional ML applications, it becomes much more important\n", + "\n", + "for LLMs. Since most LLMs output natural language, it is very difficult to evaluate the outputs via traditional\n", + "\n", + "metrics. For example, suppose an LLM were used to summarize a news article. Two equally good summaries\n", + "\n", + "might have almost completely different words and word orders, so even defining a “ground-truth” label\n", + "\n", + "becomes difficult or impossible.\n", + "\n", + "Humans — ideally your end users — become essential for validating LLM output. While you can pay human\n", + "\n", + "labelers to compare or rate model outputs, the best practice for user-facing applications is to build human\n", + "\n", + "feedback into the applications from the outset. 
For example, a tech support chatbot may have a “click here\n", + "\n", + "to chat with a human” option, which provides implicit feedback indicating whether the chatbot’s responses\n", + "\n", + "were helpful.\n", + "\n", + "In terms of operations, not much changes from traditional MLOps:\n", + "\n", + "\u0007 **Data:** Human feedback is simply data, and it should be treated like any other data. Store it in your\n", + "\n", + "lakehouse, and process it using the same data pipeline tooling as other data.\n", + "\n", + "\u0007 **Testing and monitoring:** A/B testing and incremental rollouts of new models and pipelines may become\n", + "\n", + "more important, superceding offline quality tests. If you can collect user feedback, then these rollout\n", + "\n", + "methods can validate models before they are fully deployed.\n", + "\n", + "\u0007 **Fine-tuning:** Human feedback becomes especially important for LLMs when it can be incorporated into\n", + "\n", + "fine-tuning models via techniques like Reinforcement Learning from Human Feedback (RLHF). Even if you\n", + "\n", + "start with an existing or generic model, you can eventually customize it for your purposes via fine-tuning.\n", + "\n", + "\n", + "###### Resources\n", + "\n", + "**Reinforcement Learning from**\n", + "**Human Feedback (RLHF)**\n", + "\n", + "\u0007Chip Huyen blog post on\n", + "\n", + "[“RLHF: Reinforcement Learning from](https://huyenchip.com/2023/05/02/rlhf.html)\n", + "\n", + "[Human Feedback”](https://huyenchip.com/2023/05/02/rlhf.html)\n", + "\n", + "\u0007Hugging Face blog post on\n", + "\n", + "[“Illustrating Reinforcement Learning from](https://huggingface.co/blog/rlhf)\n", + "\n", + "[Human Feedback (RLHF)”](https://huggingface.co/blog/rlhf)\n", + "\n", + "\u0007 [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Other topics\n", + "\n", + "\u0007 **Scaling out:** Practices around scaling out training, fine-tuning and inference are similar to traditional ML,\n", + "\n", + "but some of your tools may change. Tools like [Apache Spark](https://spark.apache.org/) [™](https://spark.apache.org/) and [Delta Lake](https://delta.io/) remain general enough for\n", + "\n", + "your LLM data pipelines and for batch and streaming inference, and they may be helpful for distributing\n", + "\n", + "fine-tuning. To handle LLM fine-tuning and training, you may need to adopt some new tools such as\n", + "\n", + "[distributed PyTorch](https://pytorch.org/tutorials/beginner/dist_overview.html) , [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training) , and [DeepSpeed](https://www.deepspeed.ai/) .\n", + "\n", + "\u0007 **[Model serving:](https://www.databricks.com/product/model-serving)** If you manage the serving system for your LLMs, then you may need to make\n", + "\n", + "adjustments to handle larger models. While serving with CPUs can work for smaller deep learning\n", + "\n", + "models, most LLMs will benefit from or require GPUs for serving and inference.\n", + "\n", + "\u0007 **Vector databases:** Some but not all LLM applications require vector databases for efficient similarity-\n", + "\n", + "based lookups of documents or other data. Vector databases may be an important addition to your\n", + "\n", + "serving infrastructure. 
Operationally, it is analogous to a feature store: it is a specialized tool for storing preprocessed data which can be queried by inference jobs or model serving systems.\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "#### Reference architecture\n",
+ "\n",
+ "To illustrate potential adjustments to your reference architecture from traditional MLOps, we provide a modified version of the previous production architecture.\n",
+ "\n",
+ "**Figure 7** [Modified production environment for LLMs: an internal/external model hub supplies pretrained models; a Fine-Tune LLM job replaces de novo model training and pushes models to the Model Registry (stages None, Staging and Production); a Vector Database Update job replaces the feature table refresh; Model Serving loads the Production model and may also make LLM API requests to internal or third-party services; Continuous Deployment (CD) loads candidate models for testing and promotes them to Production; Monitoring & Evaluation publishes metrics to Lakehouse tables and incorporates human feedback.]\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "###### Additional resources\n",
+ "\n",
+ "With LLMs being such a novel field, we link to several LLM resources below, which are not necessarily “LLMOps” but may prove useful to you.\n",
+ "\n",
+ "\u0007 [edX: Professional Certificate in Large Language Models](https://www.edx.org/professional-certificate/databricks-large-language-models)\n",
+ "\n",
+ "\u0007Chip Huyen blog post on [“Building LLM applications for production”](https://huyenchip.com/2023/04/11/llm-engineering.html)\n",
+ "\n",
+ "LLM lists and leaderboards\n",
+ "\n",
+ "\u0007 [LMSYS Leaderboard](https://chat.lmsys.org/?leaderboard)\n",
+ "\n",
+ "\u0007 [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)\n",
+ "\n",
+ "\u0007 [Stanford Center for Research on Foundation Models](https://crfm.stanford.edu/)\n",
+ "\n",
+ "\u0007 [Ecosystem graphs](https://crfm.stanford.edu/ecosystem-graphs/index.html)\n",
+ "\n",
+ "\u0007 [HELM](https://crfm.stanford.edu/helm/latest/?)\n",
+ "\n",
+ "\u0007Blog post on [“Open Source ChatGPT](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n",
+ "\n",
+ "The primary changes to this production architecture are:\n",
+ "\n",
+ "\u0007 **Internal/External Model Hub:** Since LLM applications often make use of existing, pretrained models, an internal or external model hub becomes a valuable part of the infrastructure. It appears here in production to illustrate using an existing base model that is then fine-tuned in production. Without fine-tuning, this hub would mainly be used in development.\n",
+ "\n",
+ "\u0007 **Fine-Tune LLM:** Instead of de novo Model Training, LLM applications will generally fine-tune an existing model (or use an existing model without any tuning). 
Fine-tuning is a lighter-weight process than training,\n", + "\n", + "but it is similar operationally.\n", + "\n", + "\u0007 **Vector Database:** Some (but not all) LLM applications use vector databases for fast similarity searches,\n", + "\n", + "most often to provide context or domain knowledge in LLM queries. We replaced the Feature Store (and\n", + "\n", + "its Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n", + "\n", + "that these data stores and jobs are analogous in terms of operations.\n", + "\n", + "\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n", + "\n", + "API calls, such as to internal or third-party LLM APIs. Operationally, this adds complexity in terms of\n", + "\n", + "potential latency or flakiness from third-party APIs, as well as another layer of credential management.\n", + "\n", + "\u0007 **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML\n", + "\n", + "but become essential in most LLM applications. Human feedback should be managed like other data,\n", + "\n", + "ideally incorporated into monitoring based on near real-time streaming.\n", + "\n", + "\n", + "[Alternatives”](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Looking ahead\n", + "\n", + "LLMs only became mainstream in late 2022, and countless libraries and technologies are being built to\n", + "\n", + "support and leverage LLM use cases. You should expect rapid changes. More powerful LLMs will be open-\n", + "\n", + "sourced; tools and techniques for customizing LLMs and LLM pipelines will become more plentiful and\n", + "\n", + "flexible; and an explosion of techniques and ideas will gradually coalesce into more standardized practices.\n", + "\n", + "While this technological leap provides us all with great opportunities, the use of cutting-edge technologies\n", + "\n", + "requires extra care in LLMOps to build and maintain stable, reliable LLM-powered applications. The good\n", + "\n", + "news is that much of your existing MLOps tooling, practices and knowledge will transfer smoothly over to\n", + "\n", + "LLMs. With the additional tips and practices mentioned in this section, you should be well set up to harness\n", + "\n", + "the power of large language models.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "9,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast and over 50% of the Fortune 500 — rely\n", + "\n", + "on the Databricks Lakehouse Platform to unify their\n", + "\n", + "data, analytics and AI. Databricks is headquartered\n", + "\n", + "in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark ™ ,\n", + "\n", + "Delta Lake and MLflow, Databricks is on a mission\n", + "\n", + "to help data teams solve the world’s toughest\n", + "\n", + "problems. 
To learn more, follow Databricks on [Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/).

**[Sign up for a free trial](https://databricks.com/try-databricks)**
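To ground the **Model Serving** change described above (external LLM API calls introduce latency, flakiness, and an extra layer of credential management), here is a minimal, illustrative sketch. The endpoint URL, environment variable name, and payload/response shape are assumptions for illustration only, not any specific vendor's API:

```python
import os
import time

import requests

# Assumed values for illustration only; substitute your provider's endpoint and auth scheme.
LLM_ENDPOINT = "https://llm-provider.example.com/v1/completions"
API_TOKEN = os.environ["LLM_API_TOKEN"]  # credential management: never hard-code keys


def call_llm(prompt: str, retries: int = 3, timeout_s: float = 30.0) -> str:
    """Call a third-party LLM API, absorbing latency and transient failures with retries."""
    for attempt in range(1, retries + 1):
        try:
            resp = requests.post(
                LLM_ENDPOINT,
                headers={"Authorization": f"Bearer {API_TOKEN}"},
                json={"prompt": prompt, "max_tokens": 256},
                timeout=timeout_s,
            )
            resp.raise_for_status()
            return resp.json()["text"]  # response shape is provider-specific
        except requests.RequestException:
            if attempt == retries:
                raise
            time.sleep(2**attempt)  # exponential backoff before retrying
    raise RuntimeError("unreachable")
```

In practice a serving pipeline would also log request latencies and failure rates so the Monitoring & Evaluation component can track the health of the external API.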
-----

# TABLE OF CONTENTS

##### Welcome to Data, Analytics and AI

**Do you know what you’re getting into?**

**How to use this book**

##### Business Value

**Talking to the business (feels like combat)**

**Creating Value Alignment**

**Goals and Outcomes**

##### Ultimate Class Build Guide

**Creating a character**

- Data Engineers
- Data Scientists
- Data Analysts

##### Diving In

**Producing game data**

**And receiving it in cloud**

**Getting data from your game to the cloud**

##### The Value of Data Throughout the Game Developer Lifecycle

**Lifecycle overview**

**Use data to develop a next-generation customer experience**

##### Getting Started with Gaming Use Cases

**Where do I start? Start with Game Analytics**

**Understand your audience**

- Player Segmentation
- Player Lifetime Value
- Social Media Monitoring
- Player Feedback Analysis
- Toxicity Detection

**Find your audience**

- Multi-Touch Attribution

**Activating Your Playerbase**

- Player Recommendations
- Next Best Offer/Action
- Churn Prediction & Prevention
- Real-time Ad Targeting

**Operational Use Cases**

- Anomaly Detection
- Build Pipeline
- Crash Analytics

##### Things to Look Forward To

##### Appendix

**Ultimate Class Build Guide**

- Creating a Character
- Data Engineers
- Data Scientists
- Data Analysts

**Data Access and the Major Cloud Providers**

- Cloud Rosetta Stone
- Jargon Glossary
- Getting started with the major cloud providers

**Getting Started with Detailed Use Cases**

- Game analytics
- Player Segmentation
- Player Lifetime Value
- Social Media Monitoring
- Player Feedback Analysis
- Toxicity Detection
- Multi-Touch Attribution and Media Mix Modeling
- Player Recommendations
- Next Best Offer/Action
- Churn Prediction & Prevention
- Real-time Ad Targeting

**Getting Started with Operational Use Cases**

- Anomaly Detection
- Build Pipeline
- Crash Analytics

-----

# Welcome to Data, Analytics, and AI

### Do you know what you’re getting into?

You may have heard the stories of game studios spending countless hours trying to more effectively acquire, engage, and retain players. Well, did you know that data, analytics, and AI play a central role in the development and operation of today’s top-grossing video games? Studios globally struggle with fragmented views of their audience, with data often outpacing legacy technologies. Today, the need for real-time capabilities and the leap from descriptive to predictive analytics has made it so that data, analytics, and AI are no longer a “nice-to-have”, but table stakes for success.

The objective of this handbook is to guide you on the role data, analytics, and AI play in the development and operations of video games. We’ll cover who the key stakeholders are and how to align people across business units.
Then we’ll talk through strategies to help you\n", + "\n", + "successfully advocate for data, analytics, and AI projects\n", + "\n", + "internally. Finally, we dive deep through the most common\n", + "\n", + "use cases. We want to give you enough information to feel\n", + "\n", + "\n", + "well as helpful tips when operating as or working with one of\n", + "\n", + "these classes.\n", + "\n", + "We follow this with the fundamentals for building a Proof\n", + "\n", + "of Concept (POC) or Minimum Viable Product (MVP). That\n", + "\n", + "is, connecting to the cloud; accessing your data; and\n", + "\n", + "most importantly, being able to represent the value you’re\n", + "\n", + "seeking to unlock as you sell your project into your team and\n", + "\n", + "broader organization.\n", + "\n", + "Finally, we’ll dive into the most common use cases for data,\n", + "\n", + "analytics, and AI within game development. Similar to a tech-\n", + "\n", + "tree in a video game, we begin with the most basic use cases\n", + "\n", + "- setting up your game analytics. Then we progress through\n", + "\n", + "more advanced data use cases such as player segmentation,\n", + "\n", + "assessing lifetime value, detecting and mitigating toxicity,\n", + "\n", + "multi-touch attribution, recommendation engines, player\n", + "\n", + "churn prediction and prevention, and more.\n", + "\n", + "Don’t forget to review the Appendix. You’ll find a handy\n", + "\n", + "“ Jargon Glossary ”, “ Cloud Rosetta Stone ”, and “ get started\n", + "\n", + "guide for the three major cloud providers ”. All incredibly\n", + "\n", + "helpful assets to keep as hotkeys.\n", + "\n", + "\n", + "empowered to make a demonstrable impact. Just by reading\n", + "\n", + "this you are adding incredible insight and value to yourself as\n", + "\n", + "\n", + "an industry professional. Quest on!\n", + "\n", + "### How to use this book\n", + "\n", + "This book is primarily intended for technical professionals\n", + "\n", + "who are engaging with data within game studios. 
No\n", + "\n", + "matter your role in the gaming industry, you will be able to\n", + "\n", + "glean key takeaways that will make you more effective in\n", + "\n", + "your individual role and within the larger team — be that\n", + "\n", + "production, art, engineering, marketing, or otherwise.\n", + "\n", + "Begin your journey by reviewing the “ **Data, Analytics, and AI**\n", + "\n", + "**Ground Rules** ” section to the right, which presents some This\n", + "\n", + "section presents some rules and guidelines for interpreting\n", + "\n", + "the role that data plays in the game development lifecycle.\n", + "\n", + "Next, it’s time to learn about the key professions (aka\n", + "\n", + "character classes) that interact and engage with data,\n", + "\n", + "analytics, and AI on a consistent basis within a game studio.\n", + "\n", + "This section breaks down each of the classes, providing an\n", + "\n", + "\n", + "**Data, Analytics, and AI Ground Rules**\n", + "\n", + "This guide assumes you understand the following:\n", + "\n", + "- You understand the basics of data, analytics, and AI:\n", + "\n", + "How and why data is stored in a system, why data\n", + "\n", + "is transformed, the different types of output that\n", + "\n", + "data can feed into — such as a report, an analysis\n", + "\n", + "answering a question, or a machine learning model.\n", + "\n", + "If this is the first time you’re creating a character,\n", + "\n", + "we highly recommend reviewing our data, analytics,\n", + "\n", + "and AI tutorial — aka getting started training and\n", + "\n", + "documentation, available at [dbricks.co/training](https://www.databricks.com/learn/training/home)\n", + "\n", + "- You have a basic understanding of cloud\n", + "\n", + "infrastructure. Specifically what it is, who are the\n", + "\n", + "key players, and associated terms (e.g., virtual\n", + "\n", + "machines, APIs, applications)\n", + "\n", + "- You are generally aware of the game development\n", + "\n", + "lifecycle; pre-production, production, testing/QA,\n", + "\n", + "launch, operation\n", + "\n", + "\n", + "overview of each character’s strengths and weaknesses as\n", + "\n", + "\n", + "-----\n", + "\n", + "# Business Value\n", + "\n", + "\n", + "Demonstrating business value is important when working\n", + "\n", + "on data, analytics, and AI projects because it helps ensure\n", + "\n", + "that the efforts of the project are aligned with the goals\n", + "\n", + "and objectives of the business. By showing how the project\n", + "\n", + "can positively impact a game’s key performance indicators\n", + "\n", + "(KPIs) and bottom-line metrics, such as game revenue, player\n", + "\n", + "satisfaction, and operational efficiency, studio stakeholders\n", + "\n", + "are more likely to support and invest in the project.\n", + "\n", + "Additionally, demonstrating business value can help justify\n", + "\n", + "the resources, time, and money that are required to execute\n", + "\n", + "the project, and can also help prioritize which projects should\n", + "\n", + "be pursued. 
By focusing on business value, data, analytics,\n", + "\n", + "and AI projects can become strategic initiatives that\n", + "\n", + "contribute to the long-term success of your game studio.\n", + "\n", + "### Talking to the business (feels like combat)\n", + "\n", + "While we highly encourage everyone to read this section,\n", + "\n", + "you may already feel confident understanding the needs and\n", + "\n", + "concerns of your internal stakeholders, and how to sell-in a\n", + "\n", + "project successfully. If so, feel free to skip this section.\n", + "\n", + "We would love to dive into the data to explore and discover\n", + "\n", + "as much as possible, unfortunately in most environments,\n", + "\n", + "we are limited by resources and time. Understanding both\n", + "\n", + "the businesses pain points and strategic goals is crucial to\n", + "\n", + "choosing projects that will benefit the business, create value\n", + "\n", + "and make your message much easier to sell.\n", + "\n", + "Whenever we embark on a proof-of-concept (PoC) or\n", + "\n", + "minimum viable product (MVP) — to prove out a new\n", + "\n", + "**Questions to ask:**\n", + "\n", + "- What other strategic goals and pain points can\n", + "\n", + "you list out and how would you prioritize them as\n", + "\n", + "a business leader?\n", + "\n", + "- Does your prioritization match how your team,\n", + "\n", + "manager and/or leadership would prioritize?\n", + "\n", + "Typically the closer the match, the easier initial\n", + "\n", + "projects will be to “sell”.\n", + "\n", + "\n", + "methodology or technology — we will need to pitch it back\n", + "\n", + "for adoption. The technology could be revolutionary and\n", + "\n", + "absolutely amazing, but without the value proposition and tie\n", + "\n", + "back to goals, it is likely to land flat or fail to be adopted.\n", + "\n", + "It is key to talk to your stakeholders to understand their\n", + "\n", + "perception of pain points and positions on potential projects\n", + "\n", + "to add value. Much like stopping at the Tavern when the\n", + "\n", + "adventuring party gets to town, these can be informal\n", + "\n", + "conversations where you socialize potential solutions while\n", + "\n", + "gathering information about what matters.\n", + "\n", + "### Creating value alignment\n", + "\n", + "So what are your strategic goals and pain points and how\n", + "\n", + "might they be addressed through a use case from a PoC or\n", + "\n", + "MVP leveraging your data?\n", + "\n", + "A few examples of strategic goals that are top of mind for our\n", + "\n", + "customers at the beginning of any fiscal or calendar year:\n", + "\n", + "- Reduce costs\n", + "\n", + "- Simplify your infrastructure\n", + "\n", + "- Acquire more players\n", + "\n", + "- Monetize your playerbase\n", + "\n", + "- Retain your players (aka prevent churn)\n", + "\n", + "Here are four ways the Databricks Lakehouse can provide\n", + "\n", + "value that aligns with your strategic goals and pain points:\n", + "\n", + "`1.` **\u0007Improved collaboration:** Databricks platform allows\n", + "\n", + "everyone to share and collaborate on data, notebooks and\n", + "\n", + "models between data scientists, engineers and business\n", + "\n", + "users. 
This enables for a more efficient and streamlined\n", + "\n", + "process for data analysis and decision making.\n", + "\n", + "`2.` **Find and explore your data:** The data in the Lakehouse is\n", + "\n", + "cataloged and accessible, which enables business users\n", + "\n", + "to explore and query the data easily and discover insights\n", + "\n", + "by themselves.\n", + "\n", + "`3.` **\u0007Uncover actionable business insights:** By putting\n", + "\n", + "your game’s data into a Lakehouse architecture, it\n", + "\n", + "can be better analyzed using various tools provided\n", + "\n", + "by Databricks such as SQL, dashboards, notebooks,\n", + "\n", + "visualization and machine learning to better understand\n", + "\n", + "your playerbase, providing valuable insights into player\n", + "\n", + "behavior and performance. These insights can help the\n", + "\n", + "\n", + "-----\n", + "\n", + "and retention, and use that information to improve the\n", + "\n", + "game and grow monetization.\n", + "\n", + "`4.` **\u0007Lead with data-driven decisions:** A Lakehouse\n", + "\n", + "architecture provides a single source of truth for your\n", + "\n", + "organization’s data. Data engineers write once, data\n", + "\n", + "analysts interpret the data, and data scientists can run\n", + "\n", + "machine machine learning models on the same data.\n", + "\n", + "_This cannot be understated in the value this provides an_\n", + "\n", + "_organization from a total cost of ownership perspective._\n", + "\n", + "With the ability to access and analyze all the data in one\n", + "\n", + "place, the business can make unified data-driven decisions,\n", + "\n", + "rather than relying on intuition or fragmented data.\n", + "\n", + "### Goals and outcomes\n", + "\n", + "Like many projects, starting with a strong foundation of ‘what\n", + "\n", + "success looks like’ will significantly improve your likelihood\n", + "\n", + "of achieving your objectives. Here are a few best-practices\n", + "\n", + "we recommend:\n", + "\n", + "`1.` **Set goals:** Define your hypothesis, then use your data\n", + "\n", + "and process to prove or disprove your hypothesis. You\n", + "\n", + "have a goal in mind, make it part of the experiment. If\n", + "\n", + "the outcome differs from the expectation, that is part of\n", + "\n", + "experiments and we can learn from it to improve the next\n", + "\n", + "experiment. This is all about shortening the feedback loop\n", + "\n", + "\n", + "project appropriately. For example, are you doing this as\n", + "\n", + "a side project? Do you have 2 sprints to show progress?\n", + "\n", + "It’s important to scope your project based on the time,\n", + "\n", + "resources, and quality needed for the said project to be a\n", + "\n", + "success.\n", + "\n", + "`3.` **Scope down:** Ruthlessly control scope for any PoC or\n", + "\n", + "MVP. Prioritization is your best friend. Stakeholders and\n", + "\n", + "your own internal team will naturally want to increase\n", + "\n", + "scope because there’s no shortage of good ideas. But by\n", + "\n", + "controlling scope, you improve your chances of shipping\n", + "\n", + "on time and on budget. Don’t let perfection be the enemy\n", + "\n", + "of good. There are always exceptions to this, but that is\n", + "\n", + "what the next sprint is for.\n", + "\n", + "`4.` **Deliver on time:** Recovering lost goodwill is incredibly\n", + "\n", + "difficult - strive to always deliver on time. 
Make sure your\n", + "\n", + "goals, constraints and scope creep will not explode your\n", + "\n", + "timeline as creating tight feedback loops and iteration\n", + "\n", + "cycles is what will make you more agile than the competition.\n", + "\n", + "`5.` **Socialize early, and often:** Show quantifiable value as\n", + "\n", + "quickly as possible, both to your immediate team and\n", + "\n", + "business stakeholders. Measure the value as frequently\n", + "\n", + "as makes sense, and socialize early and often to promote\n", + "\n", + "visibility of the project and ensure tight alignment across\n", + "\n", + "teams. This will empower you to create tighter feedback\n", + "\n", + "loops that will help improve any future iterations of your\n", + "\n", + "product, platform, or technology.\n", + "\n", + "\n", + "between insight and action.\n", + "\n", + "# Ultimate Class Build Guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "Have you rolled your character already? Data engineers, data\n", + "\n", + "scientists, and data analysts form the heart of mature game\n", + "\n", + "data teams. Though, depending on studio size and resources,\n", + "\n", + "\n", + "making sense of large amounts of data. Depending on the size\n", + "\n", + "of the organization, individuals may be required to multiclass\n", + "\n", + "in order to address needs of the team. In smaller studios, it’s\n", + "\n", + "often developers who wear multiple hats, including those in data\n", + "\n", + "engineering, analytics and data science. Key characters include:\n", + "\n", + "\n", + "game developers may also be pulled in from time to time to\n", + "\n", + "\n", + "perform data engineering and or data science tasks. Though for\n", + "\n", + "the sake of this guide, we’ll keep focus on roles of data engineers,\n", + "\n", + "data scientists, and data analysts. There are many aspects to\n", + "\n", + "these roles, but they can be summarized in that Data Engineers\n", + "\n", + "create and maintain critical data workflows, Data Analysts\n", + "\n", + "interpret data and create reports that keep the business teams\n", + "\n", + "running seamlessly, and Data Scientists are responsible for\n", + "\n", + "\n", + "**Data Engineers**\n", + "\n", + "Data engineers build systems that collect, manage, and\n", + "\n", + "convert source data into usable information for data\n", + "\n", + "scientists and business analysts to interpret. Their ultimate\n", + "\n", + "goal is to make data accessible so that teams can use it to\n", + "\n", + "evaluate and optimize a goal or objective.\n", + "\n", + "\n", + "-----\n", + "\n", + "Data scientists determine the questions their team should\n", + "\n", + "be asking and figure out how to answer those questions\n", + "\n", + "using data. 
They often develop predictive models for\n", + "\n", + "theorizing and forecasting.\n", + "\n", + "**Data Analysts**\n", + "\n", + "\n", + "to report on the health of a title or building a recommendation\n", + "\n", + "engine for your players, this guide will help you better\n", + "\n", + "understand the unique classes required to develop and\n", + "\n", + "maintain an effective data, analytics, and AI platform.\n", + "\n", + "**Learn more about these character classes**\n", + "\n", + "\n", + "A data analyst reviews data to identify key insights into a\n", + "\n", + "game studio’s customers and ways the data can be used to\n", + "\n", + "solve problems.\n", + "\n", + "# Diving In\n", + "\n", + "\n", + "Before we get to the primary use cases of game data,\n", + "\n", + "analytics, and AI, we need to cover some basics. That is, the\n", + "\n", + "different types of game data and how they are produced.\n", + "\n", + "And the subsequent receiving of that data in the cloud to\n", + "\n", + "\n", + "### Producing game data…\n", + "\n", + "Speaking in generalities, there are four buckets of data as it\n", + "\n", + "relates to your video game.\n", + "\n", + "\n", + "collect, clean, and prepare for analysis.\n", + "\n", + "**1. Game Telemetry**\n", + "\n", + "Game telemetry refers to the data collected about player\n", + "\n", + "behavior and interactions within a video game. The primary\n", + "\n", + "data source is the game engine. And the goal of game\n", + "\n", + "telemetry is to gather information that can help game\n", + "\n", + "developers understand player behavior and improve the\n", + "\n", + "overall game experience.\n", + "\n", + "Some of the primary metrics that are typically tracked in\n", + "\n", + "game telemetry include:\n", + "\n", + "- **Player engagement:** Track the amount of time players\n", + "\n", + "spend playing the game, and their level of engagement\n", + "\n", + "with different parts of the game.\n", + "\n", + "- **Game progress:** Monitor player progress through\n", + "\n", + "different levels and milestones in the game.\n", + "\n", + "- **In-game purchases:** Track the number and value of\n", + "\n", + "in-game purchases made by players.\n", + "\n", + "- **Player demographics:** Collect demographic information\n", + "\n", + "about players, such as age, gender, location, and device type.\n", + "\n", + "- **Session length:** Monitor the length of each player session,\n", + "\n", + "and how often players return to the game.\n", + "\n", + "- **Retention:** Track the percentage of players who return to\n", + "\n", + "the game after their first session.\n", + "\n", + "\n", + "-----\n", + "\n", + "such as the types of actions taken, the number of deaths,\n", + "\n", + "and the use of power-ups.\n", + "\n", + "- **User Acquisition:** Track the number of new players\n", + "\n", + "acquired through different marketing channels.\n", + "\n", + "**2. Business KPIs**\n", + "\n", + "The second bucket of data is business key performance\n", + "\n", + "indicators (or KPIs). Business KPIs are metrics that measure\n", + "\n", + "the performance and success of a video game from a\n", + "\n", + "business perspective. 
The primary data source for business\n", + "\n", + "KPIs include game telemetry, stores, and marketplaces.\n", + "\n", + "These KPIs help game studios understand the financial and\n", + "\n", + "operational performance of their games and make informed\n", + "\n", + "decisions about future development and growth.\n", + "\n", + "Some of the primary business metrics that are typically\n", + "\n", + "tracked include:\n", + "\n", + "- **Revenue:** Track the total revenue generated by the game,\n", + "\n", + "including sales of the game itself, in-game purchases,\n", + "\n", + "and advertising.\n", + "\n", + "- **Player Acquisition Cost (CAC):** Calculate the cost\n", + "\n", + "of acquiring a new player, including marketing and\n", + "\n", + "advertising expenses.\n", + "\n", + "- **Lifetime Value (LTV):** Estimate the amount of revenue a\n", + "\n", + "player will generate over the course of their time playing\n", + "\n", + "the game.\n", + "\n", + "- **Player Retention:** Track the percentage of players who\n", + "\n", + "continue to play the game over time, and how long they\n", + "\n", + "play for.\n", + "\n", + "- **Engagement:** Measure the level of engagement of players\n", + "\n", + "with the game, such as the number of sessions played,\n", + "\n", + "time spent playing, and in-game actions taken.\n", + "\n", + "- **User Acquisition:** Track the number of new players\n", + "\n", + "acquired through different marketing channels and the\n", + "\n", + "cost of acquiring each player.\n", + "\n", + "- **Conversion Rate:** Measure the percentage of players who\n", + "\n", + "make an in-game purchase or complete a specific action.\n", + "\n", + "- **Gross Margin:** Calculate the profit generated by the game\n", + "\n", + "after subtracting the cost of goods sold, such as the cost\n", + "\n", + "of game development and server hosting.\n", + "\n", + "**3. Game Services**\n", + "\n", + "Similar to game telemetry, game services provide critical\n", + "\n", + "infrastructure that requires careful monitoring and management.\n", + "\n", + "These services include things like game server hosting,\n", + "\n", + "\n", + "and more. 
Here the source of data is the game services used.\n", + "\n", + "Some of the common metrics game teams typically track for\n", + "\n", + "these services include:\n", + "\n", + "- **Concurrent Players:** Track the number of players who are\n", + "\n", + "simultaneously connected to the game servers to ensure\n", + "\n", + "that the servers have enough capacity to handle the\n", + "\n", + "player demand.\n", + "\n", + "- **Server Availability:** Monitor the uptime and downtime of\n", + "\n", + "the game servers to ensure that players have access to\n", + "\n", + "the game when they want to play, particularly important\n", + "\n", + "for global live service games where demand fluctuates\n", + "\n", + "throught the day.\n", + "\n", + "- **Latency:** Measure the time it takes for data to travel\n", + "\n", + "from the player’s device to the game server and back,\n", + "\n", + "to ensure that players have a smooth and responsive\n", + "\n", + "gaming experience.\n", + "\n", + "- **Network Bandwidth:** Monitor the amount of data being\n", + "\n", + "transmitted between the player’s device and the game\n", + "\n", + "server to ensure that players have a high-quality gaming\n", + "\n", + "experience, even on slow internet connections.\n", + "\n", + "- **Live Operations:** Monitor the success of in-game events,\n", + "\n", + "promotions, and other live operations to understand what\n", + "\n", + "resonates with players and what doesn’t.\n", + "\n", + "- **Player Feedback:** Monitor player feedback and reviews,\n", + "\n", + "including ratings and comments on social media, forums,\n", + "\n", + "and app stores, to understand what players like and dislike\n", + "\n", + "about the game.\n", + "\n", + "- **Chat Activity:** Track the number of messages and\n", + "\n", + "interactions between players in the game’s chat channels\n", + "\n", + "to understand the level of social engagement and\n", + "\n", + "community building in the game.\n", + "\n", + "**4. Data beyond the game**\n", + "\n", + "The last bucket comes from data sources beyond the video\n", + "\n", + "game. 
These typically include the following:\n", + "\n", + "- **Social Media Data:** Social media platforms, such as\n", + "\n", + "Facebook, Twitter, TikTok and Instagram, can provide\n", + "\n", + "valuable insights into player behavior, feedback and\n", + "\n", + "preferences, as well as help game teams understand\n", + "\n", + "how players are talking about their games online with\n", + "\n", + "different communities.\n", + "\n", + "- **Forum Data:** Online forums and discussion boards, such\n", + "\n", + "as Reddit and Discord, can be rich sources of player\n", + "\n", + "feedback and opinions about the game.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The secret to success is bringing all of the disparate data sources\n", + " together, so you have as complete a 360-degree view as possible of\n", + " what’s happening in and around your game.\n", + "\n", + "\n", + "\n", + "- **Player Reviews:** Ratings and reviews on app stores, such\n", + "\n", + "as Steam, Epic, Google Play and the Apple App Store, can\n", + "\n", + "provide valuable feedback on player experiences and help\n", + "\n", + "game teams identify areas for improvement.\n", + "\n", + "- **Third-Party Data:** Third-party data sources, such as\n", + "\n", + "market research firms and industry data providers, can\n", + "\n", + "provide valuable insights into broader gaming trends and\n", + "\n", + "help game teams make informed decisions about their\n", + "\n", + "games and marketing strategies.\n", + "\n", + "This is a lot of data. And it’s no wonder that studios globally\n", + "\n", + "struggle with fragmented views of their audience, with data\n", + "\n", + "often outpacing legacy technologies. Today, the need for real-\n", + "\n", + "time capabilities and the leap from descriptive to predictive\n", + "\n", + "analytics has made it so that data, analytics, and AI are now\n", + "\n", + "table stakes for a game to be successful. Tapping into these\n", + "\n", + "four buckets of data sources, you’ll find actionable insights that\n", + "\n", + "drive better understanding of your playerbase, more efficient\n", + "\n", + "acquisition, stronger and longer lasting engagement, and\n", + "\n", + "monetization that deepens the relationship with your players.\n", + "\n", + "That’s what we’re going to dig into throughout the rest of\n", + "\n", + "this book.\n", + "\n", + "**Let’s begin with how to get data out of your game!**\n", + "\n", + "There are a variety of ways to get data out of the game and\n", + "\n", + "into cloud resources. In this section, we will provide resources\n", + "\n", + "for producing data streams in Unity and Unreal. In addition,\n", + "\n", + "we will also provide a generic approach that will work for any\n", + "\n", + "game engine, as long as you are able to send HTTP requests.\n", + "\n", + "**Unity**\n", + "\n", + "Since Unity supports C#, you would use a .NET SDK from the\n", + "\n", + "cloud provider of your choice. 
All three major cloud providers\n", + "\n", + "\n", + "[using AWS](https://www.youtube.com/watch?v=yv4ynyCytdU) is provided here.\n", + "\n", + "- **AWS:** [AWS .NET SDK - Unity considerations](https://docs.aws.amazon.com/sdk-for-net/v3/developer-guide/unity-special.html)\n", + "\n", + "- **GCP:** [GCP .NET SDK Documentation](https://cloud.google.com/dotnet/docs/reference)\n", + "\n", + "- **Azure:** [Azure .NET SDK Overview](https://learn.microsoft.com/en-us/dotnet/azure/sdk/azure-sdk-for-dotnet)\n", + "\n", + "- **Kafka (Open-source alternative):** [Kafka .NET connector](https://github.com/confluentinc/confluent-kafka-dotnet)\n", + "\n", + "From here, the SDK is used to send data to a messaging\n", + "\n", + "service. These messaging services will be covered in more\n", + "\n", + "detail in the next section.\n", + "\n", + "**Unreal Engine**\n", + "\n", + "Unreal supports development with C++, so you could use\n", + "\n", + "C++ SDKs or Blueprint interfaces to those SDKs.\n", + "\n", + "The resources for each SDK are provided here\n", + "\n", + "- **AWS:** [How to integrate AWS C++ SDK with Unreal Engine](https://aws.amazon.com/blogs/gametech/how-to-integrate-the-aws-c-sdk-with-unreal-engine/)\n", + "\n", + "- **Azure:** [Azure C++ SDK with PlayFab](https://learn.microsoft.com/en-us/gaming/playfab/sdks/unreal/)\n", + "\n", + "- **Kafka (Open-source alternative):** [Getting started with](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)\n", + "\n", + "[Kafka and C++](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)\n", + "\n", + "Just like with the Unity example above, from here the data is\n", + "\n", + "sent to a messaging streaming service.\n", + "\n", + "Other engines may not support C++ or C#, but there is still a\n", + "\n", + "way to get your data into the cloud, no matter the language!\n", + "\n", + "By hitting an API Gateway with a HTTP POST request, you are\n", + "\n", + "able to send data to cloud services from many more types of\n", + "\n", + "applications. A sample high level architecture of this solution\n", + "\n", + "in AWS and Azure can be seen below:\n", + "\n", + "**AWS:**\n", + "\n", + "\n", + "have .NET SDKs to use and I have linked the documentation\n", + "\n", + "\n", + "**Azure:**\n", + "\n", + "\n", + "for each below.\n", + "\n", + "No matter the cloud provider, if you want to use a SDK you\n", + "\n", + "install it through the NuGet package manager into your Unity\n", + "\n", + "project. [A walkthrough of how to implement the .NET SDK](https://www.youtube.com/watch?v=yv4ynyCytdU)\n", + "\n", + "\n", + "-----\n", + "\n", + "Once the data has been sent from the game into an event-\n", + "\n", + "streaming service, how do we get that data to a more\n", + "\n", + "permanent home? Here we will start by outlining what these\n", + "\n", + "messaging services do and how we can use them to point\n", + "\n", + "our data to a desired location.\n", + "\n", + "Messaging services ingest real-time event data, being\n", + "\n", + "streamed to them from a number of different sources,\n", + "\n", + "and then send them to their appropriate target locations.\n", + "\n", + "These target locations can be databases, compute clusters\n", + "\n", + "or cloud object stores. 
A key property of the messaging\n", + "\n", + "services is to preserve the time in which the events arrive, so\n", + "\n", + "that it is always known the order that events occurred.\n", + "\n", + "\n", + "\n", + "- Data is stored in object storage such as S3, Azure Storage\n", + "\n", + "or GCP Buckets using Delta Lake.\n", + "\n", + "- Delta Lake is an open-source storage framework that makes\n", + "\n", + "it easy to maintain data consistency and track changes.\n", + "\n", + "**Data Governance & Cataloging:**\n", + "\n", + "- Unity Catalog in Databricks provides tools for data\n", + "\n", + "governance that helps with compliance and controlling\n", + "\n", + "access to data in the lake.\n", + "\n", + "- Unity Catalog also allows to track data lineage, auditing and\n", + "\n", + "data discovery with the use of data catalogs and governance.\n", + "\n", + "- Metadata about the data including the structure, format,\n", + "\n", + "and location of the data can be stored in a data catalog.\n", + "\n", + "\n", + "Examples of cloud messaging services include AWS Kinesis\n", + "\n", + "\n", + "Firehose, Google PubSub, and Azure Event Hubs Messaging.\n", + "\n", + "If you prefer to use open-source products, Apache Kafka is a\n", + "\n", + "very popular open-source alternative.\n", + "\n", + "### Getting data from your game to the cloud\n", + "\n", + "Moving to the cloud platform part of the journey involves\n", + "\n", + "building a gaming Lakehouse. The gaming Lakehouse allows\n", + "\n", + "gaming companies to store, manage, and analyze large volumes\n", + "\n", + "of gaming data, such as player behavior, performance metrics,\n", + "\n", + "and financial transactions, to gain valuable insights and make\n", + "\n", + "data-driven decisions to improve their business outcomes.\n", + "\n", + "**Next here are the basics of the Databricks**\n", + "\n", + "**platform simplified.**\n", + "\n", + "**Data Ingestion:**\n", + "\n", + "- Data can be ingested into the Gaming Lakehouse using\n", + "\n", + "various built-in data ingestion capabilities provided by\n", + "\n", + "Databricks such as Structured Streaming and Delta Live\n", + "\n", + "Tables for a single simple API that handles streaming or\n", + "\n", + "batch pipelines.\n", + "\n", + "- Data can be ingested in real-time or batch mode from\n", + "\n", + "\n", + "**Data Quality:**\n", + "\n", + "- Databricks platform enables you to validate, clean\n", + "\n", + "and enrich data using built-in libraries and rule-based\n", + "\n", + "validation using Delta Live Tables.\n", + "\n", + "- It also allows tracking data quality issues and missing\n", + "\n", + "values by using Databricks Delta Live Tables tables.\n", + "\n", + "**Data Security:**\n", + "\n", + "- Databricks provides a comprehensive security model to\n", + "\n", + "secure data stored in the lake.\n", + "\n", + "- Access to data can be controlled through robust access\n", + "\n", + "controls on objects such as catalogs, schemas, tables,\n", + "\n", + "rows, columns, models, experiments, and clusters.\n", + "\n", + "**Analytics:**\n", + "\n", + "- The processed data can be analyzed using various\n", + "\n", + "tools provided by Databricks such as SQL Dashboards,\n", + "\n", + "Notebooks, visualizations and ML.\n", + "\n", + "- Game studios can gain insights into player performance and\n", + "\n", + "behaviorto better engageplayers and improve their games.\n", + "\n", + "**Get started with your preferred cloud**\n", + "\n", + "\n", + "various sources such as game clients, servers or APIs.\n", + 
"\n", + "Data can be cleaned, transformed and enriched with\n", + "\n", + "additional data sources, making it ready for analysis.\n", + "\n", + "\n", + "-----\n", + "\n", + "# The Value of Data Throughout the Game Development Lifecycle\n", + "\n", + "\n", + "### Lifecycle overview\n", + "\n", + "Over the last decade, the way games have been developed\n", + "\n", + "and monetized has changed dramatically. Most if not all\n", + "\n", + "top grossing games are now built using a games-as-service\n", + "\n", + "strategy, meaning titles shipped in cycles of constant\n", + "\n", + "iteration to increase engagement and monetization of\n", + "\n", + "players over time. Games-as-a-Service models have the\n", + "\n", + "ability to create sticky, high-margin games, but they also\n", + "\n", + "heavily depend on cloud-based services such as game\n", + "\n", + "play analytics, multiplayer servers and matchmaking, player\n", + "\n", + "relationship management, performance marketing and more.\n", + "\n", + "Data plays an integral role in the development and operation\n", + "\n", + "of video games. Teams need tools and services to optimize\n", + "\n", + "player lifetime value (LTV) with databases that can process\n", + "\n", + "terabytes-petabytes of evolving data, analytics solutions\n", + "\n", + "that can access that data with near real-time latency, and\n", + "\n", + "machine learning (ML) models that can translate insights into\n", + "\n", + "actionable and innovative gameplay features.\n", + "\n", + "A game’s development lifecycle is unique to each studio. With\n", + "\n", + "different skillsets, resources, and genres of games, there is no\n", + "\n", + "\n", + "one model. Below is a simplified view of a game development\n", + "\n", + "lifecycle for a studio running a games-as-a-service model.\n", + "\n", + "What’s important to remember is that throughout your title’s\n", + "\n", + "development lifecycle, there is data that can help you better\n", + "\n", + "understand your audience, more effectively find and acquire\n", + "\n", + "players, and more easily activate and engage them. Whether\n", + "\n", + "using game play data to optimize creative decision making\n", + "\n", + "during pre-production, tapping machine learning models to\n", + "\n", + "predict and prevent churn, or identifying the next best offer\n", + "\n", + "or action for your players in real-time, **data is your friend** .\n", + "\n", + "### Use data to develop a next-generation customer experience\n", + "\n", + "In the game industry, customer experience (CX) is an\n", + "\n", + "important factor that can impact a player’s enjoyment of a\n", + "\n", + "game and the length they choose to play that game over time.\n", + "\n", + "In today’s highly competitive and fast-paced games industry,\n", + "\n", + "a game studio’s ability to deliver exceptional and seamless\n", + "\n", + "customer experiences can be a strategic differentiator when\n", + "\n", + "it comes to cutting through the noise and winning a gamer’s\n", + "\n", + "\n", + "## Game Development Lifecycle\n", + "\n", + "**Game Development Lifecycle**\n", + "\n", + "#### Games-as-a-Service (GaaS) / Games-as-a-Community (GaaC) Game-as-a-service (GaaS) / Game-as-a-Community (GaaC)\n", + "\n", + "\n", + "**Game Development Lifecycle**\n", + "\n", + "\n", + "_Game-as-a-service (GaaS) / Game-as-a-Community (GaaC)_\n", + "\n", + "\n", + "**1. 
Pre-Production**

Brainstorm how to give life to the many ideas laid out in the planning phase

**2. Production**

Most of the time, effort, and resources spent on developing video games are spent in the production stage

**3. Testing**

Every feature and mechanic in the game needs to be tested for game loop and quality control

**4. Launch**

Whether developing alongside the community with alpha and beta releases, or launching into general availability, a game launch is a critical milestone

**5. Operation**

As studios increasingly adopt games-as-a-service models, the ongoing operation of a video game is as critical as the launch itself

(Lifecycle diagram: phases run from discovery & compatibility, integration, release, publish and awareness, through onboarding, build & test, and flighting and experimentation, to operate, measure, engage and monetize.)

-----

Data, analytics, and AI can help drive value through customer experience in several ways:

`1.` **Personalization:** Game studios can use data analytics and machine learning to personalize the game experience for each player based on their preferences and behavior. This can include personalized recommendations for content, in-game events, and other features that are tailored to the player’s interests.

`2.` **Omnichannel support:** Players often use multiple channels, such as social media, forums, and in-game support, to communicate with game studios. Next-generation customer experience involves providing a seamless and integrated support experience across all these channels in near-real time.

`3.` **Continuous improvement:** Game studios can use data and feedback from players to continuously improve the game, gathering feedback on new features and using it to refine and optimize the game over time.

In summary, defining what a next-generation customer experience looks like for your game is important because it can help you create a more personalized, seamless, and enjoyable experience for your players, which can lead to increased engagement, monetization, and loyalty. There are many ways teams can use data throughout a game’s development lifecycle, but far and away the most valuable focus area will be in building and refining the customer experience.

Throughout the rest of this guide, we will dig into the most common use cases for data, analytics, and AI in game development, starting with where we recommend everyone begins: game analytics.

-----

# Getting Started with Gaming Use Cases

### Where do I start? Start with game analytics

**Overview**

Big question: Where’s the best place to start when it comes to game data, analytics, and AI? 
For most game studios,\n", + "\n", + "the best place to start is with game analytics. Setting up a\n", + "\n", + "dashboard for your game analytics that helps you correlate\n", + "\n", + "data across disparate sources is infinitely valuable in a world\n", + "\n", + "\n", + "where there is no one gaming data source to rule them all.\n", + "\n", + "An effective dashboard should include your game telemetry\n", + "\n", + "data, data from any game services you’re running, and data\n", + "\n", + "sources outside of your game such as stores, marketplaces,\n", + "\n", + "and social media. See below.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Getting a strong foundation in game analytics unlocks more\n", + "\n", + "advanced data, analytics, and AI use cases. For example,\n", + "\n", + "concurrent player count plus store and marketplace data\n", + "\n", + "\n", + "**GAME TELEMETRY**\n", + "\n", + "\n", + "**Data Sources**\n", + "\n", + "**GAME SERVICES** **OTHER SOURCES**\n", + "\n", + "\n", + "-----\n", + "\n", + "and lifetime value. Usage telemetry combined with crash\n", + "\n", + "reporting and social media listening helps you more quickly\n", + "\n", + "uncover where players might be getting frustrated. And\n", + "\n", + "correlating chat logs, voice transcriptions, and or discord\n", + "\n", + "\n", + "that are relevant and engaging to your players, giving you\n", + "\n", + "tools to effectively market and monetize with your audience.\n", + "\n", + "**Let’s start with Player Segmentation.**\n", + "\n", + "\n", + "and reddit forums can help you identify disruptive behavior\n", + "\n", + "\n", + "before it gets out of hand, giving you the tools to take\n", + "\n", + "actionable steps to mitigate toxicity within your community.\n", + "\n", + "**Get started and set up your Analytics Dashboard**\n", + "\n", + "### Understand your audience\n", + "\n", + "With your analytics pipelines set up, the first area of focus is to\n", + "\n", + "better understand your audience. This can help you inform a\n", + "\n", + "variety of key business decisions, from the highest macro order\n", + "\n", + "of “what game(s) to develop”, to how to market and monetize\n", + "\n", + "those games, and how to optimize the player experience.\n", + "\n", + "By understanding the demographics, preferences, and\n", + "\n", + "behaviors of their audience, a game studio can create games\n", + "\n", + "that are more likely to appeal to their target market and be\n", + "\n", + "successful. You can also use this understanding to tailor your\n", + "\n", + "marketing and monetization strategies to the needs and\n", + "\n", + "preferences of your players.\n", + "\n", + "Additionally, understanding your audience can help you\n", + "\n", + "\n", + "##### Player Segmentation\n", + "\n", + "**Overview**\n", + "\n", + "Player segmentation is the practice of dividing players\n", + "\n", + "into groups based on shared characteristics or behaviors.\n", + "\n", + "Segmentation has a number of benefits. You can better\n", + "\n", + "understand your players, create more personalized content,\n", + "\n", + "improve player retention, and optimize monetization, all of\n", + "\n", + "which contributes to an improved player experience.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "The primary objective of segmentation is to ensure you’re\n", + "\n", + "not treating your entire playerbase the exact same. 
Humans\n", + "\n", + "are different, and your players have different motivations,\n", + "\n", + "preferences and behaviors. Recognizing this and engaging\n", + "\n", + "with them in a way that meets them where they’re at\n", + "\n", + "is one of the most impactful ways you can cultivate\n", + "\n", + "engagement with your game. As we mentioned above,\n", + "\n", + "the benefits of segmentation are broad reaching. Through\n", + "\n", + "better understanding of your playerbase, you can better\n", + "\n", + "personalize experiences, tailoring content and customer\n", + "\n", + "experience to specific groups of players that increases\n", + "\n", + "engagement and satisfaction. Better understanding of\n", + "\n", + "your players also helps in improving player retention. By\n", + "\n", + "identifying common characteristics of players who are at\n", + "\n", + "risk of churning (i.e., stopping play), you can develop targeted\n", + "\n", + "strategies that only reach specific audiences.\n", + "\n", + "Create advanced customer segments to build out more\n", + "\n", + "effective user stories, and identify potential purchasing\n", + "\n", + "predictions based on behaviors. Leverage existing sales\n", + "\n", + "data, campaigns and promotions systems to create robust\n", + "\n", + "segments with actionable behavior insights to inform your\n", + "\n", + "product roadmap. You can then use this information to build\n", + "\n", + "useful customer clusters that are targetable with different\n", + "\n", + "promos and offers to drive more efficient acquisition and\n", + "\n", + "deeper engagement with existing players.\n", + "\n", + "\n", + "identify potential pain points or areas for improvement\n", + "\n", + "\n", + "within your games, allowing you to proactively make changes\n", + "\n", + "\n", + "**Get started with Player Segmentation**\n", + "\n", + "\n", + "to address these issues and improve the player experience\n", + "\n", + "before a player potentially churns.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Overview**\n", + "\n", + "Player lifetime value (LTV) is a measure of the value that a\n", + "\n", + "player brings to a game over the lifetime they play that game.\n", + "\n", + "It is typically calculated by multiplying the average revenue\n", + "\n", + "per user (ARPU) by the average player lifespan. For example,\n", + "\n", + "if the average player spends $50 per year and plays the\n", + "\n", + "game for 2 years, their LTV would be $50 * 2 = $100.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Game studios care about LTV because it helps them\n", + "\n", + "understand the long-term value of their players and make\n", + "\n", + "informed decisions about how to invest in player acquisition\n", + "\n", + "and retention. For example, if the LTV of a player is higher\n", + "\n", + "than the cost of acquiring them (e.g., through advertising),\n", + "\n", + "it may be worth investing more in player acquisition. On the\n", + "\n", + "other hand, if the LTV of a player is lower than the cost of\n", + "\n", + "acquiring them, it may be more cost-effective to focus on\n", + "\n", + "retaining existing players rather than acquiring new ones.\n", + "\n", + "LTV is one of the more important metrics that game studios,\n", + "\n", + "particularly those building live service games, can use to\n", + "\n", + "understand the value of their players. 
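The ARPU-times-lifespan arithmetic described above is simple enough to sanity-check in a few lines. This is only a sketch of that formulation; the segment names, revenue figures, and acquisition costs are made up for illustration:

```python
def lifetime_value(arpu_per_year: float, avg_lifespan_years: float) -> float:
    """LTV as described above: average revenue per user multiplied by average player lifespan."""
    return arpu_per_year * avg_lifespan_years


# Made-up segments for illustration; compare LTV to acquisition cost (CAC) per channel or segment.
segments = {
    "casual": {"arpu": 12.0, "lifespan": 0.8, "cac": 4.0},
    "core": {"arpu": 50.0, "lifespan": 2.0, "cac": 35.0},   # matches the $50 x 2 years = $100 example
    "whale": {"arpu": 400.0, "lifespan": 3.5, "cac": 120.0},
}

for name, s in segments.items():
    ltv = lifetime_value(s["arpu"], s["lifespan"])
    print(f"{name}: LTV=${ltv:,.2f}, CAC=${s['cac']:,.2f}, LTV/CAC={ltv / s['cac']:.1f}x")
```

Segments where LTV comfortably exceeds CAC are candidates for more acquisition spend; segments where it does not point toward retention work instead.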
It is important to\n", + "\n", + "consider other metrics as well, such as player retention,\n", + "\n", + "monetization, and engagement.\n", + "\n", + "**Get started with Player Lifetime Value**\n", + "\n", + "##### Social Media Monitoring\n", + "\n", + "**Overview**\n", + "\n", + "As the great Warren Buffet once said, “It takes 20 years to\n", + "\n", + "build a reputation and five minutes to ruin it. If you think\n", + "\n", + "about that, you’ll do things differently.” Now more than ever,\n", + "\n", + "people are able to use social media and instantly amplify\n", + "\n", + "their voices to thousands of people who share similar\n", + "\n", + "interests and hobbies. Take Reddit as an example. r/gaming,\n", + "\n", + "the largest video game community (also called a subreddit)\n", + "\n", + "has over 35 million members with nearly 500 new posts\n", + "\n", + "and 10,000 new comments per day, while over 120 game-\n", + "\n", + "specific subreddits have more than 10,000 members each,\n", + "\n", + "the largest being League of Legends with over 700,000\n", + "\n", + "members. The discourse that takes place on online social\n", + "\n", + "platforms generates massive amounts of raw and organic\n", + "\n", + "\n", + "be used to understand how customers think and discover\n", + "\n", + "exactly what they want.\n", + "\n", + "The act and process of monitoring content online across the\n", + "\n", + "internet and social media for keyword mentions and trends\n", + "\n", + "for downstream processing and analytics is called media\n", + "\n", + "monitoring. By applying media monitoring to social media\n", + "\n", + "platforms, game developers are able to gain new advantages\n", + "\n", + "that previously might not have been possible, including:\n", + "\n", + "- Programmatically aggregate product ideas for new\n", + "\n", + "feature prioritization\n", + "\n", + "- Promote a better user experience by automatically\n", + "\n", + "responding to positive or negative comments\n", + "\n", + "- Understand the top influencers in the industry who can\n", + "\n", + "sway public opinion\n", + "\n", + "- Monitor broader industry trends and emerging segments\n", + "\n", + "such as free-to-play games\n", + "\n", + "- Detect and react to controversies or crises as they begin\n", + "\n", + "- Get organic and unfiltered feedback of games and features\n", + "\n", + "- Understand customer sentiment at scale\n", + "\n", + "- Make changes faster to keep customer satisfaction high\n", + "\n", + "and prevent churn\n", + "\n", + "By failing to monitor, understand, and act on what customers\n", + "\n", + "are saying about the games and content you release as\n", + "\n", + "well as broader industry trends, you risk those customers\n", + "\n", + "leaving for a better experience that meets the demands and\n", + "\n", + "requirements of what customers want.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "By monitoring and listening to what existing and potential\n", + "\n", + "customers are saying on social media, game developers\n", + "\n", + "are able to get a natural and organic understanding of how\n", + "\n", + "customers actually feel about the games and products they\n", + "\n", + "release, or gauge consumer interest before investing time\n", + "\n", + "and money in a new idea. 
The main process for social media\n", + "\n", + "monitoring is to gather data from different social media\n", + "\n", + "platforms, such as Twitter or YouTube, process those comments\n", + "\n", + "or tweets, then take action on the processed data. While\n", + "\n", + "customer feedback can be manually discovered and processed\n", + "\n", + "in search of certain keyword mentions or feedback, it is a much\n", + "\n", + "better idea to automate it and do it programmatically.\n", + "\n", + "**Get started with Social Media Monitoring**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Overview**\n", + "\n", + "Player feedback analysis is the process of collecting,\n", + "\n", + "analyzing, and acting on player feedback to inform game\n", + "\n", + "development. It involves collecting player feedback from\n", + "\n", + "multiple sources, such as in-game surveys, customer\n", + "\n", + "support tickets, social media, marketplace reviews, and\n", + "\n", + "forums, and using data analytics tools to identify patterns,\n", + "\n", + "trends, and insights. The goal of player feedback analysis is\n", + "\n", + "to better understand player needs, preferences, and pain\n", + "\n", + "points, and use this information to inform game development\n", + "\n", + "decisions and improve the overall player experience.\n", + "\n", + "Player feedback analysis is an important part of game\n", + "\n", + "development as it helps ensure that the game continues to\n", + "\n", + "meet player needs and expectations. By regularly collecting and\n", + "\n", + "analyzing player feedback, game studios can make data-driven\n", + "\n", + "decisions to improve the game, increase player engagement\n", + "\n", + "and retention, and ultimately drive success and growth.\n", + "\n", + "For this use case, we’re going to focus on taking online\n", + "\n", + "reviews for your video game and categorizing the different\n", + "\n", + "topics players are talking about (bucketing topics) in order\n", + "\n", + "to better understand the themes (via positive or negative\n", + "\n", + "sentiment) affecting your community.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "This is incredibly helpful, providing data-driven customer\n", + "\n", + "insight into your development process. Whether used in\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Across massively multiplayer online video games (MMOs),\n", + "\n", + "multiplayer online battle arena games (MOBAs) and other\n", + "\n", + "forms of online gaming, players continuously interact in real\n", + "\n", + "time to either coordinate or compete as they move toward a\n", + "\n", + "common goal — winning. 
This interactivity is integral to game play dynamics, but at the same time, it's a prime opening for toxic behavior — an issue pervasive throughout the online video gaming sphere.

Toxic behavior manifests in many forms, such as the varying degrees of griefing, cyberbullying and sexual harassment that are illustrated in the matrix below from [Behaviour Interactive](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant), which lists the types of interactions seen within the multiplayer game, _Dead by Daylight_.

[Figure: Behaviour Interactive's toxicity matrix for _Dead by Daylight_, ranking behaviors from less toxic to most toxic. Survivor behaviors include gen rushing, hiding, activating emotes, looping, rush unhooking, blinding, sandbagging, teabagging, text chatting and reporting; Killer behaviors include hatch farming, hatch disconnecting, camping, being away from keyboard (AFK), dribbling, tunneling, lobby dodging, body blocking, slugging, face camping and reporting.]

pre-production, such as looking at games that are similar with reviews to learn where those games have strengths and weaknesses; or using player feedback analysis with a live service title to identify themes that can apply to your product roadmap, player feedback analysis helps teams better support and cultivate engagement with the player community.

Ultimately, player feedback analysis does two things. 1) It can help you stack rank themes according to positive and negative sentiment, and 2) you can weight those themes according to impact on player engagement, toxicity, monetization, churn, and more. We've all read reviews that are overly positive, or overly negative. The process of player feedback analysis helps to normalize feedback across the community (keeping in mind, only for those who have written a review), so you're not over indexing on one review, or a single theme that may seem in the moment very pressing.

**Get started with Player Feedback Analysis**

In addition to the [personal toll](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) that toxic behavior can have on gamers and the community -- an issue that cannot be overstated -- toxicity also carries real costs for game studios. For example, a study from [Michigan State University](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) revealed that 80% of players recently experienced toxicity, and of those, 20% reported leaving the game due to these interactions. Similarly, a study from [Tilburg University](https://arno.uvt.nl/show.cgi?fid=145375) showed that having a disruptive or toxic encounter in the first session of the game led to players being over three times more likely to leave the game without returning. Given that player retention is a top priority for many studios, particularly as game delivery transitions from physical media releases to long-lived services, it's clear that toxicity must be curbed.

Compounding this issue related to churn, some companies face challenges related to toxicity early in development, even before launch. For example, [Amazon's Crucible](https://www.wired.com/story/amazon-crucible-release-first-big-videogame/) was released into testing without text or voice chat due in part to not having a system in place to monitor or manage toxic

In this section, we're going to talk about how to use your data to more effectively find your target audience across the web. Whether you're engaging in paid advertising, influencer or referral marketing, PR, cross promotion, community building, etc - use data to separate activity from impact. You want to focus on the channels and strategies that leverage your resources most effectively, be that time or money.

Say you have a cohort of highly engaged players who are spending money on your title, and you want to find more gamers just like that.
Doing an analysis on the demographic\n", + "\n", + "and behavioral data of this cohort will give you the\n", + "\n", + "information needed to use an ad platform (such as Meta,\n", + "\n", + "Google, or Unity) to do lookalike modeling and target those\n", + "\n", + "potential gamers for acquisition.\n", + "\n", + "\n", + "gamers and interactions. This illustrates that the scale of\n", + "\n", + "\n", + "the gaming space has far surpassed most teams’ ability to\n", + "\n", + "manage such behavior through reports or by intervening in\n", + "\n", + "disruptive interactions. Given this, it’s essential for studios\n", + "\n", + "to integrate analytics into games early in the development\n", + "\n", + "lifecycle and then design for the ongoing management of\n", + "\n", + "toxic interactions.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Toxicity in gaming is clearly a multifaceted issue that\n", + "\n", + "has become a part of video game culture and cannot be\n", + "\n", + "addressed universally in a single way. That said, addressing\n", + "\n", + "toxicity within in-game chat can have a huge impact given\n", + "\n", + "the frequency of toxic behavior and the ability to automate\n", + "\n", + "the detection of it using natural language processing (NLP). In\n", + "\n", + "summary, by leveraging machine learning to better identify\n", + "\n", + "disruptive behavior so that better-informed decisions\n", + "\n", + "around handling actions can be made.\n", + "\n", + "**Get started with Toxicity Detection**\n", + "\n", + "\n", + "##### Multi-Touch Attribution\n", + "\n", + "**Overview**\n", + "\n", + "Multi-touch attribution is a method of attributing credit to\n", + "\n", + "different marketing channels or touchpoints that contribute to\n", + "\n", + "a sale or conversion. In other words, it is a way of understanding\n", + "\n", + "how different marketing efforts influence a customer’s decision\n", + "\n", + "to make a purchase or take a desired action.\n", + "\n", + "There are a variety of different attribution models that can\n", + "\n", + "be used to assign credit to different touchpoints, each with\n", + "\n", + "its own strengths and limitations. For example, the last-\n", + "\n", + "click model attributes all credit to the last touchpoint that\n", + "\n", + "the customer interacted with before making a purchase,\n", + "\n", + "while the first-click model attributes all credit to the first\n", + "\n", + "touchpoint. Other models, such as the linear model or\n", + "\n", + "the time decay model, distribute credit across multiple\n", + "\n", + "touchpoints based on different algorithms.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Multi-touch attribution can be useful for game studios because\n", + "\n", + "it can help them understand which marketing channels or\n", + "\n", + "efforts are most effective at driving conversions and inform their\n", + "\n", + "marketing strategy. However, it is important to choose the right\n", + "\n", + "attribution model for your title based on your business model\n", + "\n", + "(one-time purchase, subscription, free-to-play, freemium,\n", + "\n", + "in-game advertising, etc.) 
and regularly review and optimize your\n", + "\n", + "attribution efforts to ensure they are accurate and effective.\n", + "\n", + "**Get started with Multi-Touch Attribution**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Activating Your Playerbase\n", + "\n", + "So far, we’ve discussed how to better understand your\n", + "\n", + "players, and how to acquire more of your target audience.\n", + "\n", + "Next, we’re going to dig into how to better activate your\n", + "\n", + "players to create a more engaged and loyal playerbase that\n", + "\n", + "stays with your game for the long-term. Here, we’re going to\n", + "\n", + "focus on strategies that differentiate your gamer experience.\n", + "\n", + "##### Player Recommendations\n", + "\n", + "\n", + "and make in-game purchases. Additionally, personalized\n", + "\n", + "recommendations can help improve the overall player\n", + "\n", + "experience and increase satisfaction.\n", + "\n", + "Game studios can use a variety of techniques to create player\n", + "\n", + "recommendations, such as machine learning algorithms,\n", + "\n", + "collaborative filtering, and manual curation. It is important\n", + "\n", + "to regularly review and optimize these recommendations to\n", + "\n", + "ensure that they are effective and relevant to players.\n", + "\n", + "**Get started with Player Recommendations**\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Player recommendations are suggestions for content or actions\n", + "\n", + "\n", + "that a game studio makes to individual players based on their\n", + "\n", + "interests and behaviors. These recommendations can be used\n", + "\n", + "to promote specific in-game items, encourage players to try\n", + "\n", + "new features, or simply provide a personalized experience.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Player recommendations matter to game studios because\n", + "\n", + "they can help improve player retention, engagement, and\n", + "\n", + "monetization. By providing players with recommendations\n", + "\n", + "that are relevant and engaging, studios can increase the\n", + "\n", + "likelihood that players will continue to play their games\n", + "\n", + "\n", + "##### Next Best Offer/Action\n", + "\n", + "**Overview**\n", + "\n", + "Next best offer (NBO) and next best action (NBA) are\n", + "\n", + "techniques that businesses use to make personalized\n", + "\n", + "recommendations to their customers. NBO refers to the\n", + "\n", + "practice of recommending the most relevant product or\n", + "\n", + "service to a customer based on their past purchases and\n", + "\n", + "behaviors. NBA refers to the practice of recommending the\n", + "\n", + "most relevant action or interaction to a customer based on\n", + "\n", + "the same information.\n", + "\n", + "\n", + "-----\n", + "\n", + "in-game purchase to a player based on their past spending\n", + "\n", + "habits and the items they have shown an interest in. They\n", + "\n", + "might use NBA to recommend a specific level or event to a\n", + "\n", + "player based on their progress and interests.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "It’s important to remember that next best offer is a specific\n", + "\n", + "use case within personalization that involves making\n", + "\n", + "recommendations to players on the most valuable in-game\n", + "\n", + "item or action they should take next. 
For example, a next\n", + "\n", + "best offer recommendation in a mobile game might suggest\n", + "\n", + "that a player purchase a specific in-game currency or unlock\n", + "\n", + "a new character.\n", + "\n", + "Both NBO and NBA can be used to improve customer\n", + "\n", + "retention, engagement, and monetization by providing\n", + "\n", + "personalized recommendations that are more likely to be\n", + "\n", + "relevant and appealing to individual customers. They can be\n", + "\n", + "implemented using a variety of techniques, such as machine\n", + "\n", + "learning algorithms or manual curation.\n", + "\n", + "**Get started with Next Best Offer/Action**\n", + "\n", + "##### Churn Prediction & Prevention\n", + "\n", + "**Overview**\n", + "\n", + "Video games live and die by their player base. For Games-\n", + "\n", + "\n", + "may overwhelm the ability of these players to consume,\n", + "\n", + "reinforcing the overall problem of player churn.\n", + "\n", + "At some point, it becomes critical for teams to take a cold,\n", + "\n", + "hard look at the cost of acquisition relative to the subscriber\n", + "\n", + "lifetime value (LTV) earned. These figures need to be brought\n", + "\n", + "into a healthy balance, and retention needs to be actively\n", + "\n", + "managed, not as a point-in-time problem to be solved, but\n", + "\n", + "as a “chronic condition” which needs to be managed for the\n", + "\n", + "ongoing health of the title.\n", + "\n", + "Headroom for continued acquisition-driven growth can\n", + "\n", + "be created by carefully examining why some players leave\n", + "\n", + "and some players stay. When centered on factors known\n", + "\n", + "at the time of acquisition, gaming studios may have the\n", + "\n", + "opportunity to rethink key aspects of their acquisition\n", + "\n", + "strategy that promote higher average retention rates, which\n", + "\n", + "can lead to higher average revenue per user.\n", + "\n", + "**Prerequisites for use case**\n", + "\n", + "This use case assumes a certain level of existing data\n", + "\n", + "collection infrastructure in the studio. Notably, a studio ready\n", + "\n", + "to implement a churn prediction and prevention model\n", + "\n", + "should have\n", + "\n", + "- A cloud environment where player data is stored\n", + "\n", + "- This source data should contain player behavior and\n", + "\n", + "session telemetry events from within the game. This is\n", + "\n", + "the foundation that insights can be built on top of.\n", + "\n", + "\n", + "as-a-Service (GaaS) titles, engagement is the most\n", + "\n", + "\n", + "important metric a team can measure. Naturally, proactively\n", + "\n", + "preventing churn is critical to sustained engagement and\n", + "\n", + "\n", + "**Get started with Churn Prediction & Prevention**\n", + "\n", + "\n", + "growth. Through churn prediction and prevention, you will\n", + "\n", + "\n", + "be able to analyze behavioral data to identify subscribers\n", + "\n", + "with an increased risk of churn. Next, you will use machine\n", + "\n", + "learning to quantify the likelihood of a subscriber to churn, as\n", + "\n", + "well as indicate which factors create that risk.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Balancing customer acquisition and retention is critical.\n", + "\n", + "This is the central challenge to the long-term success of\n", + "\n", + "any live service game. 
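As a small illustration of the modeling step described above, quantifying the likelihood of churn and surfacing the factors behind it, here is a minimal sketch using scikit-learn. The feature names and toy data are assumptions, not a reference telemetry schema.

```python
# Minimal sketch: scoring churn risk from behavioral features with scikit-learn.
# The feature names and the toy DataFrame are hypothetical placeholders.
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression

# Hypothetical per-player aggregates built from session telemetry.
telemetry = pd.DataFrame({
    "sessions_last_30d": [42, 3, 18, 1, 25, 0, 30, 2],
    "avg_session_minutes": [55, 12, 34, 5, 48, 0, 40, 9],
    "days_since_last_session": [1, 20, 4, 27, 2, 30, 1, 25],
    "churned": [0, 1, 0, 1, 0, 1, 0, 1],  # label from historical data
})

X = telemetry.drop(columns=["churned"])
y = telemetry["churned"]
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, stratify=y, random_state=42
)

model = LogisticRegression(max_iter=1000).fit(X_train, y_train)

# Probability of churn per held-out player, plus which features push risk up or down.
print(model.predict_proba(X_test)[:, 1])
print(dict(zip(X.columns, model.coef_[0])))
```

In practice you would train on far more players and validate against a holdout period, but the shape of the workflow is the same.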
This is particularly challenging in that successful customer acquisition strategies needed to get games to scale tend to be followed by service disruptions or declines in quality and customer experience, accelerating player abandonment. To replenish lost subscribers, the acquisition engine continues to grind and expenses mount. As games reach for customers beyond the core playerbase they may have initially targeted, the title may not resonate with this broader audience.

##### Real-time Ad Targeting

**Overview**

Real-time ad targeting in the context of game development focuses on using data to deliver personalized and relevant advertisements to players in near real-time, while they are playing a game. Real-time targeting is performance based, using highly personalized messaging, which is achieved by using data to precisely determine the most opportune moments to display ads, based on factors such as player behavior, game state, and other contextual information. Knowing when to send those ads is based on data. This use case is specific to titles using in-game advertising as a business model. It's important to note that in-game real-time ad targeting requires a sophisticated tech stack that integrates with the bigger ad ecosystem, ad networks and partners. The Databricks Lakehouse platform is an optimal foundation as it already contains many of the connectors required to enable this use case.

**What we're trying to solve/achieve**

The goal of in-game real-time ad targeting is to provide a more immersive and relevant advertising experience for players, while also increasing the effectiveness of the ads for advertisers. By delivering targeted ads that are relevant to each player's interests, game developers can create a more enjoyable and personalized gaming experience, which can help to reduce churn and increase the lifetime value of each player. Additionally, real-time ad targeting can also help game developers monetize their games more effectively, as advertisers are willing to pay a premium for hyper-targeted and engaged audiences.

**Get started with Real-time Ad Targeting**

### Operational use cases

In the game development industry, operational analytics are essential for ensuring a smooth and efficient production process. One common use case is anomaly detection, where data analytics is utilized to identify any unusual patterns or behaviors in the game, such as crashes or performance issues. This helps developers quickly identify and fix problems, improving the overall quality of the game. Another example is build pipelines, where data analytics can be used to monitor and optimize the process of creating new builds of the game. By tracking key metrics such as build time, error rates, and resource utilization, developers can make informed decisions about how to optimize the build process for maximum efficiency. Other operational use cases in game development include tracking player behavior, measuring server performance, and analyzing sales and marketing data. Let's explore a few of these below.

##### Anomaly Detection

**Overview**

Anomaly detection plays an important role in the operation of a live service video game by helping to identify and diagnose unexpected behaviors in real-time. By identifying patterns and anomalies in player behavior, system performance, and network traffic, this information can then be used to detect and diagnose server crashes, performance bottlenecks, and hacking attempts. The ability to understand if there will be an issue before it becomes widespread is immensely valuable.
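As a simple illustration of that idea, the sketch below flags spikes in a server latency series with a rolling z-score; the metric values and the three-sigma threshold are illustrative assumptions rather than a production recipe.

```python
# Minimal sketch: flagging anomalies in a server metric with a trailing rolling z-score.
# The metric values and the threshold are illustrative assumptions.
import pandas as pd

latency_ms = pd.Series([102, 99, 101, 103, 98, 100, 250, 101, 97, 240, 102, 100])

window = 5
# Use the *previous* window (shift by one) so a spike doesn't inflate its own baseline.
rolling_mean = latency_ms.rolling(window).mean().shift(1)
rolling_std = latency_ms.rolling(window).std().shift(1)
z_score = (latency_ms - rolling_mean) / rolling_std

# Mark points that sit more than 3 standard deviations from the trailing mean.
anomalies = latency_ms[z_score.abs() > 3]
print(anomalies)
```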
Without anomaly detection, which is a form of advanced analytics, you're always in a reactive (rather than proactive) state. Anomaly detection is a type of quality of service solution.

**What we're trying to solve/achieve**

The goal of anomaly detection is to ensure that players have a stable and enjoyable gaming experience. This has an impact across your game, from reducing downtime, to minimizing player churn, and improving your game's reputation and revenue. Additionally, the insights gained from anomaly detection can also be used to mitigate cheating and disruptive behavior.

**Get started with Anomaly Detection**

##### Build Pipeline

**Overview**

A build pipeline is a set of automated processes that are used to compile and assemble the code, assets, and resources that make up a game project. The build pipeline typically includes several stages, such as code compilation, optimization, testing, and release. The purpose of a build pipeline is to streamline the game development process and ensure that each stage of development is completed efficiently and effectively. A build pipeline can be configured to run automatically, so that new builds are generated whenever changes are made to the code or assets. This helps to ensure that the game is always up-to-date and ready for testing and release. The logs are collected in near-real time from build servers. A simplified example: Dev X is committing code on title Y, submitted on day Z, along with the log files from the pipeline and build server. Builds typically take multiple hours to complete, requiring significant amounts of compute via build farms.
Being able to\n", + "\n", + "\n", + "-----\n", + "\n", + "are wasting compute, and being able to predict which builds\n", + "\n", + "will fail as they goes through the pipeline are ways to curb\n", + "\n", + "operational expenses.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "With this use case, we’re seeking to reduce wasted compute\n", + "\n", + "and build a foundational view of what was developed, by\n", + "\n", + "who, when and how testing performed. In an ideal state, our\n", + "\n", + "automated build pipeline could send a notification to the\n", + "\n", + "developer with a confidence metric on the build making it\n", + "\n", + "through, allowing them to decide whether to continue or\n", + "\n", + "move another build through the pipeline. Often, developers\n", + "\n", + "do not have clear visibility until the build has completed\n", + "\n", + "or failed. By providing more insight to devs into the build\n", + "\n", + "pipeline process, we can increase the rate at which builds\n", + "\n", + "are completed efficiently and effectively.\n", + "\n", + "**Get started with Build Pipeline**\n", + "\n", + "##### Crash Analytics\n", + "\n", + "\n", + "resources were being used. How long crash testing takes\n", + "\n", + "can vary, depending on the game’s business model, amount\n", + "\n", + "of content, and scope. For a title with a one-time release,\n", + "\n", + "where there is a large amount of content and a complex\n", + "\n", + "storyline, the chances of hidden crashes causing errors while\n", + "\n", + "in development are high, making it require more time to\n", + "\n", + "perform testing before the game can be published. For titles\n", + "\n", + "built in a game-as-a-service model, i.e. a game shipped in\n", + "\n", + "cycles of constant iteration, crash detection should be done\n", + "\n", + "continuously, since errors in newly released content might\n", + "\n", + "affect the base game and lead to crashes.\n", + "\n", + "Increasingly, titles are being released in alpha (where\n", + "\n", + "developers do the testing), closed beta (which includes a\n", + "\n", + "limited group of testers/sample-users who do the gameplay\n", + "\n", + "testing) and open betas (where anyone interested can register\n", + "\n", + "to try the game). All of which happens before the game is\n", + "\n", + "“officially” released. Regardless of alpha, beta, or GA, players\n", + "\n", + "may stumble over game crashes, which triggers crash reports\n", + "\n", + "that are sent to the developers for fixing. But sometimes, it\n", + "\n", + "can be challenging to understand the issue that caused the\n", + "\n", + "crash from crash reports provided by your game’s platform.\n", + "\n", + "**What we’re trying to solve/achieve**\n", + "\n", + "Ultimately, the purpose of crash analytics is to identify the\n", + "\n", + "root cause of a crash, and help you take steps to prevent\n", + "\n", + "similar crashes from happening in the future. This feedback\n", + "\n", + "loop can be tightened through automation in the data\n", + "\n", + "pipeline. For example, by tracking crashes caused on builds\n", + "\n", + "from committers, the data can provide build suggestions\n", + "\n", + "to improve crash rate. 
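A minimal sketch of that kind of roll-up, assuming hypothetical crash-report fields, might group reports by build and by a stack-trace fingerprint so repeated occurrences collapse into a single issue:

```python
# Minimal sketch: rolling up raw crash reports by build and by stack-trace fingerprint.
# The report fields are hypothetical; a real pipeline would read these from crash-report storage.
import hashlib
import pandas as pd

crash_reports = pd.DataFrame({
    "build_id": ["1.4.2+5121", "1.4.2+5121", "1.4.2+5121", "1.4.3+5130"],
    "stack_top": ["RenderQueue::flush", "RenderQueue::flush", "AudioMixer::tick", "RenderQueue::flush"],
    "player_id": ["p1", "p2", "p3", "p4"],
})

# Fingerprint each crash by its top stack frame so duplicate reports collapse into one issue.
crash_reports["fingerprint"] = crash_reports["stack_top"].map(
    lambda frame: hashlib.sha1(frame.encode()).hexdigest()[:10]
)

# Unique issues and affected players per build: a rough signal for which build regressed.
summary = (
    crash_reports.groupby("build_id")
    .agg(unique_issues=("fingerprint", "nunique"), affected_players=("player_id", "nunique"))
    .reset_index()
)
print(summary)
```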
Furthermore, teams can automate\n", + "\n", + "deduplication when multiple players experience the same\n", + "\n", + "errors, helping to reduce noise in the alerts received.\n", + "\n", + "**Get started with Crash Analytics**\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Games crash, it is a fact of game development. The\n", + "\n", + "combination of drivers, hardware, software, and\n", + "\n", + "configurations create unique challenges in tracking, resolving\n", + "\n", + "and managing the user experience.\n", + "\n", + "Crash analytics and reporting is the process of collecting\n", + "\n", + "information about crashes or unexpected failures in a\n", + "\n", + "software application, in this case, a video game. A crash\n", + "\n", + "report typically includes information about the state of the\n", + "\n", + "game at the time of the crash, such as what the player was\n", + "\n", + "\n", + "-----\n", + "\n", + "# Things to look forward to\n", + "\n", + "\n", + "This eBook was created to help game developers better\n", + "\n", + "wrap their heads around the general concepts in which data,\n", + "\n", + "analytics, and AI can be used to support the development\n", + "\n", + "and growth of video games. **If you only have 5 minutes,**\n", + "\n", + "**these takeaways are critical to your success** .\n", + "\n", + "For more information on advanced data, analytics, and AI use\n", + "\n", + "cases, as well as education resources, we highly recommend\n", + "\n", + "Databricks training portal [dbricks.co/training](http://dbricks.co/training) .\n", + "\n", + "**Top takeaways:**\n", + "\n", + "If you take nothing else from this guide, here are the most\n", + "\n", + "important takeaways we want to leave with you on your journey.\n", + "\n", + "`1.` **Data is fundamental. Data, analytics, and AI play a role**\n", + "\n", + "throughout the entire game development lifecycle - from\n", + "\n", + "discovery to pre-production, development to operating\n", + "\n", + "a game as a live service. Build better games, cultivate\n", + "\n", + "deeper player engagements, and operate more effectively\n", + "\n", + "\n", + "by utilizing the full potential of your data.\n", + "\n", + "`2.` **Define your goals.** Start by establishing the goals of what\n", + "\n", + "you’re hoping to learn and or understand around your\n", + "\n", + "game. Clear goals make it easier to identify key metrics\n", + "\n", + "to track, example goals include; developing high-quality\n", + "\n", + "games that provide engaging and satisfying player\n", + "\n", + "experiences, increasing player engagement and retention\n", + "\n", + "by analyzing and improving gameplay and mechanics, and\n", + "\n", + "building a strong and positive brand reputation through\n", + "\n", + "effective marketing and community outreach.\n", + "\n", + "`3.` **Identify and understand your data sources.** Spend time\n", + "\n", + "to identify and understand the breadth of data sources\n", + "\n", + "you are already collecting, be that game telemetry,\n", + "\n", + "marketplace, game services, or sources beyond the game\n", + "\n", + "like social media. 
It is critical to collect the right data, and\n", + "\n", + "track the right metrics based on the goals and objectives\n", + "\n", + "you have set for your game.\n", + "\n", + "`4.` **Start small, and iterate quickly.** Recognize that goals and\n", + "\n", + "objectives evolve as you learn more about the interaction\n", + "\n", + "\n", + "-----\n", + "\n", + "are most effective when scoped small with tight feedback\n", + "\n", + "loops, allowing you to quickly adapt with your community\n", + "\n", + "and alongside shifting market conditions.\n", + "\n", + "`5.` **Game analytics forms the foundation.** Start by getting a\n", + "\n", + "game analytics dashboard up and running. The process of\n", + "\n", + "building out a dashboard will naturally require connecting\n", + "\n", + "and transforming your data in a way to unlock more\n", + "\n", + "advanced use cases down the road.\n", + "\n", + "`6.` **Plan and revisit your data strategy frequently.** Once\n", + "\n", + "dashboarding is set up, you’ll have a better picture of what\n", + "\n", + "downstream data use cases make the most sense for\n", + "\n", + "your game and business objectives. As you move to use\n", + "\n", + "cases such as player segmentation, churn analysis, and\n", + "\n", + "player lifetime value, revisit your data strategy frequently\n", + "\n", + "to ensure you’re spending time on use cases that drive\n", + "\n", + "actionable insights for you and your team.\n", + "\n", + "`7.` **Show value broad and wide.** Whether your data strategy\n", + "\n", + "is new or well established on the team, build the habit\n", + "\n", + "of communicating broadly to stakeholders across the\n", + "\n", + "company. Early in the process, it is important to gather\n", + "\n", + "critical feedback on what data is helpful and where there\n", + "\n", + "are opportunities for improvement. The worst thing that\n", + "\n", + "can happen is you create something that no one uses.\n", + "\n", + "That is a waste of everyone’s time and money.\n", + "\n", + "`8.` **Ask for help.** Engage with your technical partners. There\n", + "\n", + "are humans who can help ensure you’re developing your\n", + "\n", + "data and analytics platform in a way that is efficient and\n", + "\n", + "effective. There are numerous partners with domain\n", + "\n", + "expertise in data science and data engineering that can\n", + "\n", + "accelerate your data journey - here is our recommended\n", + "\n", + "partner list for [data, analytics, and AI workloads](https://www.databricks.com/company/partners/consulting-and-si) .\n", + "\n", + "`9.` **Participate in the community.** The community for game\n", + "\n", + "analytics is large and growing. It is important to research and\n", + "\n", + "\n", + "your needs and interests. Here are a few of our favorites:\n", + "\n", + "`a.` [IGDA Game Analytics](https://igda.org/sigs/analytics/) : The IGDA has a number of\n", + "\n", + "Special Interest Groups that bring together user\n", + "\n", + "researchers, designers, data engineers and data\n", + "\n", + "scientists focused on understanding player behavior\n", + "\n", + "and experiences. 
They offer resources and events\n", + "\n", + "for those working in games user research, including a\n", + "\n", + "yearly Games User Research Summit.\n", + "\n", + "`b.` [Data Science Society](https://www.datasciencesociety.net/) : The Data Science Society is a\n", + "\n", + "global community of data scientists and engineers.\n", + "\n", + "While not specifically focused on game development,\n", + "\n", + "they offer a wealth of resources and opportunities for\n", + "\n", + "learning, networking, and collaboration in the field of\n", + "\n", + "data science.\n", + "\n", + "`c.` [Hugging Face](https://huggingface.co/) : is hub of open source models for Natural\n", + "\n", + "Language Processing, computer vision, and other fields\n", + "\n", + "where AI plays its role. They also provide an online\n", + "\n", + "platform where users can access pre-trained models\n", + "\n", + "and tools, share their own models and datasets, and\n", + "\n", + "collaborate with other developers in the community.\n", + "\n", + "`d.` [Data Engineering subreddit](https://www.reddit.com/r/dataengineering/) : The Data Engineering\n", + "\n", + "subreddit is a forum for data engineers to discuss\n", + "\n", + "topics related to building and managing data pipelines,\n", + "\n", + "data warehousing, and related technologies. While\n", + "\n", + "not specifically focused on game development, it\n", + "\n", + "can be a valuable resource for those working on data\n", + "\n", + "engineering in the gaming industry.\n", + "\n", + "`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n", + "\n", + "first step in your data journey. Imagine how the output of\n", + "\n", + "your data can be presented in a way to help stakeholders\n", + "\n", + "across your company achieve more. For example, dropping\n", + "\n", + "data into an application that can help game designers\n", + "\n", + "make balancing decisions based on player events.\n", + "\n", + "\n", + "-----\n", + "\n", + "# APPENDIX Ultimate class build guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "The heart and soul of mature data teams are formed by this\n", + "\n", + "trio of classes. There are many aspects to these roles, but\n", + "\n", + "they can be summarized in that Data Engineers create and\n", + "\n", + "maintain critical data workflows, Data Analysts interpret data\n", + "\n", + "and create reports that keep the business teams running\n", + "\n", + "seamlessly, and Data Scientists are responsible for making\n", + "\n", + "sense of large amounts of data. Depending on the size of\n", + "\n", + "the organization, individuals may be required to multiclass\n", + "\n", + "in order to address needs of the team. 
In smaller studios, it’s\n", + "\n", + "often developers who wear multiple hats, including those in\n", + "\n", + "data engineering, analytics and data science.\n", + "\n", + "Whether you’re looking to stand-up an analytics dashboard\n", + "\n", + "to report on the health of a title or building a recommendation\n", + "\n", + "engine for your players, this guide will help you better\n", + "\n", + "understand the unique classes required to develop and\n", + "\n", + "maintain an effective data, analytics, and AI platform.\n", + "\n", + "##### Data Engineers\n", + "\n", + "\n", + "**Goals and Priorities of Data Engineers**\n", + "\n", + "- Enable access to usable data for real-time insights — data\n", + "\n", + "that both enables timely decision-making and is accurate\n", + "\n", + "and reproducible\n", + "\n", + "- Increase user confidence and trust in data. This involves\n", + "\n", + "ensuring high consistency and reliability in ETL processes\n", + "\n", + "- Limit the issues and failures experienced by other\n", + "\n", + "engineers and data scientists, allowing those roles to\n", + "\n", + "focus less on troubleshooting and more on drawing\n", + "\n", + "meaningful conclusions from data and building new\n", + "\n", + "products / features\n", + "\n", + "**What Data Engineers care about:**\n", + "\n", + "- Enabling access to data for real-time insights — data that\n", + "\n", + "both enables timely decision-making and is accurate and\n", + "\n", + "reproducible\n", + "\n", + "- Building high-performance, reliable and scalable pipelines\n", + "\n", + "for data processing\n", + "\n", + "- Delivering data for consumption from a variety of sources\n", + "\n", + "by Data Analysts and Data Scientists against tight SLAs\n", + "\n", + "- A Data Engineer’s biggest challenge? Collaboration\n", + "\n", + "across teams\n", + "\n", + "\n", + "Data engineers build systems that collect, manage, and\n", + "\n", + "\n", + "convert source data into usable information for data\n", + "\n", + "scientists and business analysts to interpret. Their ultimate\n", + "\n", + "goal is to make data accessible so that teams can use it to\n", + "\n", + "evaluate and optimize a goal or objective.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Data Engineers are responsible for data migration,\n", + "\n", + "manipulation, and integration of data (joining dissimilar\n", + "\n", + "data systems)\n", + "\n", + "- Setup and maintenance of ETL pipelines to convert\n", + "\n", + "source data into actionable data for insights. It is the\n", + "\n", + "responsibility of the data engineer to make sure these\n", + "\n", + "pipelines run efficiently and are well orchestrated.\n", + "\n", + "- The Data Engineer sets up the workflow process\n", + "\n", + "to orchestrate pipelines for the studio’s data and\n", + "\n", + "continuously validates it\n", + "\n", + "- Managing workflows to enable data scientists and data\n", + "\n", + "analysts, and ensuring workflows are well-integrated with\n", + "\n", + "different parts of the studio (e.g., marketing, test/QA, etc)\n", + "\n", + "\n", + "##### Data Scientists\n", + "\n", + "Data scientists determine the questions their team should\n", + "\n", + "be asking and figure out how to answer those questions\n", + "\n", + "using data. 
They often develop predictive models for\n", + "\n", + "theorizing and forecasting.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Responsible for making sense of the large amounts of data\n", + "\n", + "collected for a given game title, such as game telemetry,\n", + "\n", + "business KPIs, game health and quality, and sources\n", + "\n", + "beyond the game such as social media listening\n", + "\n", + "- The analytics portion of a Data Scientist’s job means\n", + "\n", + "looking at new and existing data to try and discover new\n", + "\n", + "things within it\n", + "\n", + "- The engineering component may include writing out\n", + "\n", + "pipeline code and deploying it to a repository\n", + "\n", + "- Data Scientists are responding for building, maintaining, and\n", + "\n", + "monitoring models used for analytics and/or data products\n", + "\n", + "\n", + "-----\n", + "\n", + "**Goals and Priorities:**\n", + "\n", + "- Developing new business capabilities (such as behavioral\n", + "\n", + "segmentation, churn prediction, recommendations) and\n", + "\n", + "optimizing processes around those capabilities\n", + "\n", + "- Increase ROI by building algorithms and tools that are\n", + "\n", + "maintainable and reusable\n", + "\n", + "- Exploring (or further expanding) the use of machine\n", + "\n", + "learning models for specific use cases\n", + "\n", + "- Bridges the gap between engineering and analytics,\n", + "\n", + "between the technology teams and business teams\n", + "\n", + "- Provides business side of studio with data that is crucial\n", + "\n", + "in decision-making, for example a churn model that helps\n", + "\n", + "predict the impact of a new feature set\n", + "\n", + "**What Data Scientists care about:**\n", + "\n", + "- Creating exploratory analysis or models to accurately\n", + "\n", + "predict business metrics, e.g., customer spend, churn,\n", + "\n", + "etc., and provide data-driven recommendations\n", + "\n", + "- Enable team with actionable insights that are easy to\n", + "\n", + "understand and well curated\n", + "\n", + "- Create and move models from experimentation to\n", + "\n", + "production\n", + "\n", + "- A Data Scientist’s biggest challenge? Keeping up with\n", + "\n", + "advancements and innovation in data science, and\n", + "\n", + "knowing which tools and libraries to use\n", + "\n", + "##### Data Analysts\n", + "\n", + "A data analyst reviews data to identify key insights into a\n", + "\n", + "game studio’s customers and ways the data can be used to\n", + "\n", + "solve problems.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Often serves as the go-to point of contact for non-\n", + "\n", + "\n", + "\n", + "- Analysts often interpret data and create reports or other\n", + "\n", + "documentation for studio leadership\n", + "\n", + "- Analysts typically are responsible for mining and\n", + "\n", + "compiling data\n", + "\n", + "- Streamline and or simplify processes when possible\n", + "\n", + "**Goals and Priorities:**\n", + "\n", + "- Empower stakeholder and business teams with\n", + "\n", + "actionable data\n", + "\n", + "- “Catch things before they break”. 
Proactively mitigate\n", + "\n", + "potential data issues before they occur (for internal and\n", + "\n", + "external customers)\n", + "\n", + "- Analysts are often recruited to assist other teams (i.e., BI\n", + "\n", + "teams) with their domain knowledge\n", + "\n", + "- Driving business impact through documentation and\n", + "\n", + "reliable data\n", + "\n", + "**What Data Analysts care about:**\n", + "\n", + "- Easy access to high quality data.\n", + "\n", + "- Quickly find insights from data with SQL queries and\n", + "\n", + "interactive visualizations.\n", + "\n", + "- The ability to easily share insights and while creating\n", + "\n", + "impactful assets for others to consume (dashboards, reports).\n", + "\n", + "- A Data Analyst’s biggest challenge? Working with complex\n", + "\n", + "processes and complicated technologies that are filled\n", + "\n", + "with messy data. While fighting these challenges, Analysts\n", + "\n", + "are often left alone or forced through paths that prevent\n", + "\n", + "collaboration with others across team/organization.\n", + "\n", + "- Untrustworthy data: often Analysts get asked to provide\n", + "\n", + "answers to leadership that will leverage the data to\n", + "\n", + "determine the direction of the company. When the data is\n", + "\n", + "untrustworthy or incorrect due to previously mentioned\n", + "\n", + "challenges this can eventually lead to lack of trust in the\n", + "\n", + "data teams from leadership or the business.\n", + "\n", + "\n", + "technical business / operations colleagues for data\n", + "\n", + "access / analysis questions\n", + "\n", + "\n", + "-----\n", + "\n", + "# Data access and the major cloud providers\n", + "\n", + "\n", + "### Cloud Rosetta Stone\n", + "\n", + "[AWS / Azure / GCP Service Comparison - Click Here](https://cloud.google.com/free/docs/aws-azure-gcp-service-comparison)\n", + "\n", + "If you are newer to the cloud computing space, it is easy to\n", + "\n", + "get lost between the hundreds of different services between\n", + "\n", + "the three major cloud providers. The table below is meant to\n", + "\n", + "highlight the important data, analytics, and AI services used\n", + "\n", + "by the various hyperscale service providers Amazon,\n", + "\n", + "Microsoft, and Google. In addition, it aims to pair up services\n", + "\n", + "from different cloud providers that serve the same purpose.\n", + "\n", + "### Getting started with the major cloud providers\n", + "\n", + "Here are some quick ways to get started with the three major\n", + "\n", + "cloud providers: AWS, Azure, and GCP:\n", + "\n", + "**AWS:**\n", + "\n", + "`1.` **[Create an AWS account](https://portal.aws.amazon.com/billing/signup)** **:** The first step is to create an\n", + "\n", + "account on the AWS website. This will give you access to\n", + "\n", + "the AWS Management Console, which is the web-based\n", + "\n", + "interface for managing your AWS resources.\n", + "\n", + "\n", + "`2.` **Use the AWS free tier:** AWS offers a free tier of service\n", + "\n", + "that provides a limited amount of free resources each\n", + "\n", + "month. 
This is a great way to get started and try out\n", + "\n", + "various AWS services without incurring any charges.\n", + "\n", + "`3.` **Explore the AWS Management Console:** Once you have\n", + "\n", + "an account and are logged in, take some time to explore\n", + "\n", + "the AWS Management Console and familiarize yourself\n", + "\n", + "with the various services that are available.\n", + "\n", + "`4.` **Next you can search for Databricks:** In the AWS\n", + "\n", + "Management Console, use the search bar in the top-left\n", + "\n", + "corner of the page and search for “Databricks”.\n", + "\n", + "`5.` **Navigate to the Databricks page:** Once you have found\n", + "\n", + "the Databricks page, you can access it to get started with\n", + "\n", + "the Databricks service.\n", + "\n", + "`6.` **Launch Databricks Workspace:** To launch the Databricks\n", + "\n", + "Workspace on AWS, you can use the CloudFormation\n", + "\n", + "template provided by Databricks. Databricks\n", + "\n", + "CloudFormation template creates an IAM role, security\n", + "\n", + "group, and Databricks Workspace in your AWS account.\n", + "\n", + "**Azure:**\n", + "\n", + "`1.` **[Create an Azure account](https://azure.microsoft.com/en-us/free/gaming/)** **:** The first step is to create\n", + "\n", + "an account on Azure portal. This will give you access to\n", + "\n", + "the Azure portal, which is the web-based interface for\n", + "\n", + "managing your Azure resources.\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Service Type|Service Description|AWS Service|Azure Service|GCP Service|\n", + "|---|---|---|---|---|\n", + "|Storage|Object storage for various file types and artifacts (CSV, JSON, Delta, JAR). Objects can be retrieved by other services|Amazon Simple Storage Service (S3)|Azure Blob Storage|Google Cloud Storage|\n", + "|Compute|High-performance VMs to run applications. Platform where data transformations are run in Big Data apps.|Amazon Elastic Compute (EC2)|Azure Virtual Machines|Google Compute Engine|\n", + "|Messaging|Real-time event streaming services to write data to object stores or data warehouses. One OSS version is Kafka|Amazon Kinesis|Azure Service Bus Messaging|Google Pub/Sub|\n", + "|Data Warehouse|Traditional data storage layer for structured data, to then be used by data analysts. Often used to read from a Data Lake, which acts as a single source of truth|Redshift or Databricks|Synapse or Databricks|BigQuery or Databricks|\n", + "\n", + "\n", + "-----\n", + "\n", + "**Jargon Glossary**\n", + "\n", + "|CDP|Customer Data Platform (CDP). A CDP is a piece of software that combines data from multiple tools to create a single centralized customer database containing data on all touch points and interactions with your product or service.|\n", + "|---|---|\n", + "|ETL|Extract, Transform, Load. In computing, extract, transform, load is a three-phase process where data is extracted, transformed and loaded into an output data container. The data can be collated from one or more sources and it can also be outputted to one or more destinations|\n", + "|KPI|Key Performance Indicator, a quantifiable measure of performance over time for a specifci objective. KPIs provide targets for teams to shoot for, milestones to gauge progress, and insights that help people across the organization make better decisions.|\n", + "|POC|Proof of Concept (PoC). 
A proof of concept is a prototype or initial implementation of a solution that is developed to demonstrate the feasibility of a concept or idea. It is often used to test the effectiveness of a new tool or approach to data analysis or machine learning before investing in a full-scale implementation.|\n", + "|MVP|Minimum Viable Product (MVP). An MVP refers to the smallest possible solution that can be delivered to meet a specific business need. The goal of an MVP is to quickly validate assumptions and prove the potential value of a larger project. By delivering a smaller solution first, stakeholders can gain confidence in the project and see a return on investment sooner, while also providing feedback to improve the larger project.|\n", + "|ROI|Return on investment (ROI), which is calculated by dividing the profit earned on an investment by the cost of that investment.|\n", + "|Serverless computing|Using compute platforms that are completely managed by service providers. When using serverless computing, you simply execute queries or deploy applications and the service provider (AWS, Databricks, etc.) handles necessary server maintenance.|\n", + "|VPC|Virtual Private Cloud. A VPC is a virtual cloud networking environment, which helps organize and give you control of your resources. You also define how resources within your VPC can communicate with other regions, VPCs, and the public internet with traffic rules and security groups.|\n", + "\n", + "\n", + "`2.` **Take Azure tutorials:** Azure provides tutorials,\n", + "\n", + "documentation, and sample templates to help you get\n", + "\n", + "started. These resources can help you understand the\n", + "\n", + "basics of Azure and how to use its services.\n", + "\n", + "`3.` **You can search for Databricks:** In the Azure portal, use the\n", + "\n", + "search bar at the top of the page and search for “Databricks”.\n", + "\n", + "`4.` **Navigate to the Databricks page:** Once you have found\n", + "\n", + "the Databricks page, you can access it to get started with\n", + "\n", + "the Databricks service.\n", + "\n", + "`5.` **Create a new Databricks workspace:** To create a new\n", + "\n", + "Databricks workspace, you can use the Azure portal, Azure\n", + "\n", + "CLI or Azure Powershell. Once created, you’ll be able to\n", + "\n", + "access your Databricks Workspace through the Azure portal.\n", + "\n", + "`6.` **Other Azure Services:** Once you have a Databricks\n", + "\n", + "workspace setup, you can easily connect it to other Azure\n", + "\n", + "Services such as Azure Storage, Event Hubs, Azure Data\n", + "\n", + "Lake Storage, Azure SQL and Cosmos DB for example.\n", + "\n", + "\n", + "**GCP:**\n", + "\n", + "`1.` **[Create a GCP account](https://console.cloud.google.com/freetrial)** **:** the first step is to create an\n", + "\n", + "account on GCP portal. 
This will give you access to the\n", + "\n", + "GCP Console, which is the web-based interface for\n", + "\n", + "managing your GCP resources.\n", + "\n", + "`2.` **Explore the GCP Console:** Once you have an account\n", + "\n", + "and are logged in, take some time to explore the GCP\n", + "\n", + "Console and familiarize yourself with the various services\n", + "\n", + "that are available.\n", + "\n", + "`3.` **Search for Databricks:** In the GCP Console, use the search bar\n", + "\n", + "in the top-left corner of the page and search for “Databricks”.\n", + "\n", + "`4.` **Navigate to the Databricks page:** Once you have found\n", + "\n", + "the Databricks page, you can access it to get started with\n", + "\n", + "the Databricks service.\n", + "\n", + "`5.` **Create a new Databricks workspace:** To create a new\n", + "\n", + "Databricks workspace, you can use the GCP Console or\n", + "\n", + "the gcloud command-line tool. Once created, you’ll be\n", + "\n", + "able to access your Databricks Workspace through the\n", + "\n", + "GCP Console.\n", + "\n", + "\n", + "-----\n", + "\n", + "# Detailed Use Cases\n", + "\n", + "\n", + "### Getting started with game analytics\n", + "\n", + "Fortunately, standing up an effective analytics dashboard\n", + "\n", + "is getting easier. It all starts with getting your data into an\n", + "\n", + "architecture that sets your team up for success. Selecting\n", + "\n", + "any of the major cloud providers — [AWS](https://portal.aws.amazon.com/billing/signup) [,](https://portal.aws.amazon.com/billing/signup) [Azure](https://azure.microsoft.com/en-us/free/gaming/) [,](https://azure.microsoft.com/en-us/free/gaming/) [GCP](https://console.cloud.google.com/freetrial) —\n", + "\n", + "you can land all your data into a cloud data lake, then use\n", + "\n", + "Databricks Lakehouse architecture to run real-time and\n", + "\n", + "reliable processing. Databricks can then help you visualize\n", + "\n", + "that data in a dashboard, or send to a visual analytics\n", + "\n", + "platform, such as Tableau.\n", + "\n", + "`1.` **Sign up for a Databricks account:** You’ll need to create\n", + "\n", + "an account on the Databricks website in order to use the\n", + "\n", + "platform.\n", + "\n", + "`2.` **Access the Databricks portal:** Interact with the\n", + "\n", + "Databricks platform and run tasks such as creating\n", + "\n", + "clusters, running jobs, and accessing data.\n", + "\n", + "`3.` **Set up a development environment:** You’ll need a\n", + "\n", + "development environment where you can write and\n", + "\n", + "test your code, whether you’re using a local IDE or the\n", + "\n", + "Databricks Workspace.\n", + "\n", + "`4.` **Collect data:** Once you have your development environment\n", + "\n", + "set up, you can start collecting data from your game. This\n", + "\n", + "can involve integrating or building a SDK into your game\n", + "\n", + "code, or using another tool to send data to cloud storage.\n", + "\n", + "`5.` **Process and analyze the data:** Once you have collected\n", + "\n", + "your data, you can use Databricks to process and analyze\n", + "\n", + "it. 
This can involve cleaning and transforming the data,\n", + "\n", + "running queries or machine learning algorithms, or\n", + "\n", + "creating visualizations.\n", + "\n", + "`6.` **Monitor and optimize:** Regularly monitor your analytics\n", + "\n", + "to ensure that they are accurate and relevant, and use the\n", + "\n", + "insights you gain to optimize your game.\n", + "\n", + "Keep in mind that these are just general steps to get started\n", + "\n", + "with Databricks for game analytics. The specific steps you’ll\n", + "\n", + "need to take will depend on your specific use case and needs.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n", + "\n", + "[out](https://databricks.com/company/contact) to us.\n", + "\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Define your goals:** What do you want to learn from your\n", + "\n", + "analytics data? Having clear goals will help you focus on\n", + "\n", + "collecting the right data and making meaningful use of it.\n", + "\n", + "- **Plan your data collection:** Determine what data you need\n", + "\n", + "to collect, how you will collect it, and how you will store it.\n", + "\n", + "- **Consider privacy:** Make sure you are transparent with your\n", + "\n", + "players about what data you are collecting and how you\n", + "\n", + "will use it, and give them the option to opt out if they wish.\n", + "\n", + "- **Use analytics to inform design:** Leverage your analytics data\n", + "\n", + "to inform decisions around game design, such as any balance\n", + "\n", + "changes or new content targeting a specific audience.\n", + "\n", + "- **Monitor and test your analytics implementation:** Regularly\n", + "\n", + "check your analytics to ensure that data is being collected\n", + "\n", + "correctly, and conduct tests to validate the accuracy of\n", + "\n", + "your data.\n", + "\n", + "- **Visualize your data:** Dashboarding your data is one of the\n", + "\n", + "most effective ways to quickly and effectively make sense\n", + "\n", + "of what’s happening at a given moment in time.\n", + "\n", + "- **Use data to improve player retention:** Analyze player\n", + "\n", + "behavior and use the insights you gain to improve player\n", + "\n", + "retention, such as by identifying and addressing pain\n", + "\n", + "points or by providing personalized content.\n", + "\n", + "- **Collaborate with your team:** Share your analytics\n", + "\n", + "findings with your team and encourage them to use the\n", + "\n", + "data to inform their work.\n", + "\n", + "- **Keep it simple:** Don’t try to collect too much data or\n", + "\n", + "create overly complex analytics systems. Keep it simple\n", + "\n", + "and focused on your goals.\n", + "\n", + "- **Start where you are:** If you’ve yet to gather all of your\n", + "\n", + "data, don’t go build some fancy model. 
Start with the data\n", + "\n", + "you have available to you and build from there.\n", + "\n", + "### Getting started with Player Segmentation\n", + "\n", + "Player segmentation is crucial to studios as it allows them\n", + "\n", + "to better understand their audience and tailor their game\n", + "\n", + "experience to meet their specific needs and preferences.\n", + "\n", + "By dividing players into different segments based on factors\n", + "\n", + "such as demographics, playing styles, and in-game behavior,\n", + "\n", + "\n", + "-----\n", + "\n", + "studios can gain valuable insights into what motivates and\n", + "\n", + "engages their players. This information can then be used\n", + "\n", + "to design games that not only provide a more enjoyable\n", + "\n", + "experience for players, but also drive player retention\n", + "\n", + "and increase revenue for the studio. In a competitive\n", + "\n", + "industry where player satisfaction is key to success, player\n", + "\n", + "segmentation is an essential tool for studios to stay ahead of\n", + "\n", + "the game.\n", + "\n", + "Start by evaluating the segmentation goals such as:\n", + "\n", + "- **Personalize the experience:** Changing or creating\n", + "\n", + "experience specific designs to the player.\n", + "\n", + "- **Create relevant content:** Surface the best content to\n", + "\n", + "players based on features and behaviors that will matter\n", + "\n", + "the most depending on the player’s place in the games\n", + "\n", + "life cycle.\n", + "\n", + "- **Monetization:** Create tailored monetization strategies\n", + "\n", + "that effectively reach and convert each player group. For\n", + "\n", + "example, you may have a group of highly engaged players\n", + "\n", + "who are more likely to make in-app purchases, while\n", + "\n", + "another group is less likely to spend money but may be\n", + "\n", + "more receptive to advertisements.\n", + "\n", + "The next steps would be to identify, collect and analyze\n", + "\n", + "player data. By gathering information on player behavior,\n", + "\n", + "preferences, and demographics, you can gain insights\n", + "\n", + "into their motivations, pain points, and what drives their\n", + "\n", + "engagement with your game.\n", + "\n", + "There are multiple types of player data to collect, including:\n", + "\n", + "- **Player Behavior:** Track player behavior and actions\n", + "\n", + "within your game to gain insights into their play style,\n", + "\n", + "preferences, and patterns.\n", + "\n", + "- **Surveys:** Ask players directly about their preferences,\n", + "\n", + "motivations, and feedback through in-game surveys, email\n", + "\n", + "questionnaires, or other forms of direct communication.\n", + "\n", + "- **Focus groups:** Gather a small group of players to discuss\n", + "\n", + "and provide feedback on specific aspects of your game\n", + "\n", + "and player experience.\n", + "\n", + "- **Social media listening:** Monitor social media platforms\n", + "\n", + "to gather insights into how players are engaging with and\n", + "\n", + "talking about your game.\n", + "\n", + "**[Customer Segmentation solution accelerator](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n", + "\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "Define your segmentation goals: Determine what you want\n", + "\n", + "to learn about your players and why. 
This will help you focus\n", + "\n", + "your analysis and ensure that your segments are meaningful\n", + "\n", + "and actionable.\n", + "\n", + "- **Use meaningful criteria:** Choose criteria that are relevant\n", + "\n", + "to your goals and that differentiate players in meaningful\n", + "\n", + "ways. This could include demographic information, in-game\n", + "\n", + "behavior, spending habits, or a combination of factors.\n", + "\n", + "- **Analyze player data:** Use data from your players to inform\n", + "\n", + "your segmentation strategy. This could include data\n", + "\n", + "on in-game behavior, spending habits, or demographic\n", + "\n", + "information.\n", + "\n", + "- **Use multiple methods:** We recommend using a\n", + "\n", + "combination of methods, such as clustering to create\n", + "\n", + "segments that are statistically meaningful and actionable\n", + "\n", + "to your game.\n", + "\n", + "- **Validate your segments:** Test your segments to ensure\n", + "\n", + "that they accurately reflect the differences you observed\n", + "\n", + "in your player data. This could involve comparing the\n", + "\n", + "segments to each other, or validating the segments\n", + "\n", + "against external data sources.\n", + "\n", + "- **Consider ethical and privacy concerns:** Ensure that\n", + "\n", + "your segmentation strategy is ethical and complies\n", + "\n", + "with privacy laws and regulations. This could involve\n", + "\n", + "anonymizing your player data, obtaining consent from\n", + "\n", + "players, or other measures to protect player privacy.\n", + "\n", + "- **Monitor and refine your segments:** Regularly review\n", + "\n", + "your segments to ensure that they remain relevant and\n", + "\n", + "meaningful. Refine your segments as necessary to reflect\n", + "\n", + "changes in your player data or your goals.\n", + "\n", + "### Getting Started with Player Lifetime Value\n", + "\n", + "Assuming you’ve followed the steps to collecting, storing, and\n", + "\n", + "preparing your player data for analysis; To calculate player\n", + "\n", + "lifetime value (LTV), the quick and dirty way of assessing\n", + "\n", + "overall player LTV is to divide the total revenue by the total\n", + "\n", + "number of registered players. Note, LTV is a critical calculation\n", + "\n", + "for return on investment, which is player lifetime spend versus\n", + "\n", + "the amount spent on player acquisition. Ideally, you want\n", + "\n", + "lifetime spend to be equal to or more than cost of acquisition.\n", + "\n", + "\n", + "-----\n", + "\n", + "As long as your game and its community are currently active,\n", + "\n", + "any player lifetime value calculations should be considered\n", + "\n", + "models, not exact numbers. This is because many of the players\n", + "\n", + "you’re considering are likely actively registered and actively\n", + "\n", + "playing, so the exact player LTV number is a moving target.\n", + "\n", + "Advanced\n", + "predictive\n", + "models\n", + "\n", + "Simple\n", + "predictive\n", + "models\n", + "\n", + "\n", + "Historical\n", + "average and\n", + "benchmarks\n", + "\n", + "\n", + "But these models are not entirely accurate since it doesn’t\n", + "\n", + "take into account the players who are registered but have\n", + "\n", + "yet to generate any revenue. 
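\n", + "\n", + "For reference, the quick-and-dirty calculation described above is only a couple of lines once revenue and registration data land in tables (table and column names are illustrative):\n", + "\n", + "```python\n", + "# Naive overall LTV: total revenue divided by total registered players (illustrative tables)\n", + "total_revenue = spark.table('game_analytics.purchases').agg({'revenue_usd': 'sum'}).first()[0]\n", + "total_players = spark.table('game_analytics.players').count()\n", + "\n", + "naive_ltv = total_revenue / total_players\n", + "print(f'Naive player LTV: {naive_ltv:.2f} USD')\n", + "```\n", + "\n", + "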
Instead, a data-driven approach\n", + "\n", + "pivoted around player segmentation or cohorts will generally\n", + "\n", + "yield more actionable insight, far more than calculating a\n", + "\n", + "single LTV for the entire player base.\n", + "\n", + "You can define your game’s cohorts in multiple ways. Perhaps\n", + "\n", + "the most obvious in terms of calculating LTV is going by daily\n", + "\n", + "active cohorts, or users who joined your game on the same\n", + "\n", + "day. You could also organize cohorts by users who joined\n", + "\n", + "your game through a certain ad campaign or promotional\n", + "\n", + "effort, by country or geographic location, or by the type of\n", + "\n", + "device used.\n", + "\n", + "**[Lifetime Value solution accelerator](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "\n", + "**ACCURACY**\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "\n", + "**Use multiple data sources:** To get a complete picture of\n", + "\n", + "a player’s value, be sure to consider data from a variety\n", + "\n", + "of sources, including in-game purchases, ad revenue, and\n", + "\n", + "other monetization strategies.\n", + "\n", + "**Consider player retention:** Player retention is a key factor\n", + "\n", + "in LTV, so be sure to consider how long players are likely to\n", + "\n", + "play your game when calculating LTV.\n", + "\n", + "**Use accurate data:** Make sure you are using accurate\n", + "\n", + "data when calculating LTV. This might involve cleaning and\n", + "\n", + "processing your data, or using trusted sources such as in-\n", + "\n", + "game analytics tools.\n", + "\n", + "**Regularly review and update your LTV estimates:** Player\n", + "\n", + "LTV can change over time, so be sure to regularly review\n", + "\n", + "and update your estimates to ensure they are accurate.\n", + "\n", + "**Test and optimize:** Use experimentation methods such\n", + "\n", + "as A/B testing to see how different variables, such as\n", + "\n", + "in-game events or pricing strategies, affect LTV. Use the\n", + "\n", + "insights you gain to optimize your LTV calculations.\n", + "\n", + "**Be aware of outside factors:** Your calculations should\n", + "\n", + "consider the many outside factors that can affect your\n", + "\n", + "LTV, such as the virality of your game, any spikes or surge\n", + "\n", + "in visitors due to unexpected promotions (influencers,\n", + "\n", + "reviewers talking about your game), any significant changes\n", + "\n", + "to your game that users respond well to, and other organic\n", + "\n", + "lifts that are difficult to predict with existing data.\n", + "\n", + "\n", + "The first calculation is relatively simple. We suggest using\n", + "\n", + "average revenue per user (ARPU), which is a game’s daily\n", + "\n", + "revenue divided by the number of active users, to help you\n", + "\n", + "calculate lifetime value. First, you’ll need to define what is\n", + "\n", + "an active player using retention values; which can be set to\n", + "\n", + "a week, multi-day, or multi-week period of time depending\n", + "\n", + "on how your game has performed to date. You can then look\n", + "\n", + "at the number of users who churn on a given day, averaging\n", + "\n", + "with the number of days from the player’s first visit to the\n", + "\n", + "current date (or the specific date you’ve considered the end\n", + "\n", + "for said exercise). This is your playerbase lifetime value (note\n", + "\n", + "not Player Lifetime Value). 
To get Lifetime Value, divide daily\n", + "\n", + "revenue by the number of daily active users, and multiply\n", + "\n", + "that by the Lifetime Value to get your player LTV.\n", + "\n", + "It’s important to note that while calculating player lifetime\n", + "\n", + "value, the term is not entirely accurate since most player\n", + "\n", + "lifetimes are not over (particularly true for live service\n", + "\n", + "games). But for the purpose of modeling, we recommend\n", + "\n", + "keeping the amount of time that you consider a lifetime\n", + "\n", + "relatively short, allowing you to extrapolate. Keeping the time\n", + "\n", + "period shorter helps mitigate inaccuracies, specifically, the\n", + "\n", + "longer you stretch out what you consider a lifetime the more\n", + "\n", + "likely you are to collect inactive users in your count.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Social Media Monitoring\n", + "\n", + "Social media monitoring has three primary components:\n", + "\n", + "collecting the data, processing the results, and taking action\n", + "\n", + "on the findings. When it comes to collecting the data, whether\n", + "\n", + "you’re looking for tweets, YouTube comments, or Reddit\n", + "\n", + "posts, it can be very easy to get started since many social\n", + "\n", + "media platforms such as Twitter, YouTube, and Reddit all\n", + "\n", + "provide their own detailed and comprehensive APIs making it\n", + "\n", + "easy to start gathering data from those platforms with proper\n", + "\n", + "documentation and code examples to help along the way.\n", + "\n", + "Once the data has been collected, the next step is to process\n", + "\n", + "it and prepare it to be used in the next step. Processing your\n", + "\n", + "data can range in complexity from a simple keywords filter\n", + "\n", + "or more complicated approach such as filtering by location,\n", + "\n", + "removing emojis, and censoring and substituting words. With\n", + "\n", + "the data collected and processed, it can move to the final\n", + "\n", + "stage and be analyzed for downstream use and actionable\n", + "\n", + "insights by applying sentiment analysis or text mining.\n", + "\n", + "If a game studio is looking to save time and have the above\n", + "\n", + "steps performed for them, it may be appealing to buy a\n", + "\n", + "pre-built tool. The primary benefits of buying an off the shelf\n", + "\n", + "solution is that it is often faster and easier to get started\n", + "\n", + "with, and the development of the tool is handled by a third\n", + "\n", + "party who will have experience in building media monitoring\n", + "\n", + "\n", + "solutions. On the other hand, building your own custom\n", + "\n", + "solution will provide more flexibility and control. Many pre-\n", + "\n", + "built media monitoring tools might not have the capabilities\n", + "\n", + "required to effectively process video, audio, and image\n", + "\n", + "data, and may not be able to control the frequency in which\n", + "\n", + "data is processed, whether it be near real-time or batch.\n", + "\n", + "Additionally, pre-built solutions tend to take a generalist\n", + "\n", + "approach for NLP, whether it be keyword extraction, topic\n", + "\n", + "filtering, or sentiment analysis, which often leads to poor\n", + "\n", + "results and feedback, especially for an industry as unique as\n", + "\n", + "the gaming industry where certain industry-specific slang\n", + "\n", + "or terminology is frequently used. 
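\n", + "\n", + "By contrast, even a simple custom processing step, such as keyword filtering and stripping emojis from collected posts before sentiment analysis, is only a few lines on Databricks (the table, columns, and keyword list are illustrative):\n", + "\n", + "```python\n", + "from pyspark.sql import functions as F\n", + "\n", + "posts = spark.table('social.raw_posts')  # collected on a schedule via the platform APIs\n", + "keywords = ['servers down', 'matchmaking', 'patch notes']  # game-specific terms you care about\n", + "\n", + "filtered = (posts\n", + "    .withColumn('text_clean', F.regexp_replace('text', '[^ -~]', ''))  # keep printable ASCII, drop emojis\n", + "    .filter(F.lower(F.col('text_clean')).rlike('|'.join(keywords))))\n", + "\n", + "filtered.write.format('delta').mode('overwrite').saveAsTable('social.posts_filtered')\n", + "```\n", + "\n", + "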
Overall, building your\n", + "\n", + "own media monitoring tool will provide greater control and\n", + "\n", + "flexibility leading to a better tailored return on investment,\n", + "\n", + "and luckily Databricks makes it even easier to get started.\n", + "\n", + "With the Databricks Lakehouse platform, all data engineering,\n", + "\n", + "data science, machine learning, and data analytics can\n", + "\n", + "be done in a single place without having to stitch multiple\n", + "\n", + "systems and tools together.\n", + "\n", + "Data engineers can use Workflows and Jobs to call social\n", + "\n", + "media platform APIs on a scheduled basis and use Delta Live\n", + "\n", + "Tables to create declarative data pipelines for cleaning and\n", + "\n", + "processing the data that comes in. Data scientists can use\n", + "\n", + "tools such as ML-specific Databricks runtimes (DBRs) that\n", + "\n", + "come with many of the most popular and common libraries\n", + "\n", + "already installed, MLflow which makes model development,\n", + "\n", + "\n", + "-----\n", + "\n", + "tracking, and serving easy and efficient, and various other\n", + "\n", + "tools such as AutoML and Bamboolib. Data analysts are able\n", + "\n", + "to create real-time alerts, dashboards, and visualizations\n", + "\n", + "using Databricks SQL. Each of the three personas will be able\n", + "\n", + "to effectively collaborate with each other and integrate each\n", + "\n", + "piece of their work into the broader data architecture.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n", + "\n", + "[out](https://databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "While social media monitoring can be easy to get started\n", + "\n", + "with, there are a few key points to keep in mind.\n", + "\n", + "- Remember the Pareto principle (roughly 80% of impact\n", + "\n", + "comes from 20% of activity) and diminishing returns. While\n", + "\n", + "it’s important to monitor large platforms such as Reddit,\n", + "\n", + "Twitter, and YouTube, it might not be worthwhile to monitor\n", + "\n", + "smaller platforms (in terms of engagement) as the bulk of\n", + "\n", + "customer feedback will be on those major platforms.\n", + "\n", + "- Monitor other sources of information. It is also useful to\n", + "\n", + "monitor mentions of key company personnel such as\n", + "\n", + "executives or public facing employees.\n", + "\n", + "- While follower count does matter on platforms such as\n", + "\n", + "Twitter, don’t ignore users with low-follower counts. It only\n", + "\n", + "takes one or two re-tweets from other users to become a\n", + "\n", + "large issue.\n", + "\n", + "- On social media, customers can see through generic\n", + "\n", + "corporate responses to complaints, so it is important\n", + "\n", + "to get a clear understanding of the issue and provide a\n", + "\n", + "clear response.\n", + "\n", + "### Getting Started with Player Feedback Analysis\n", + "\n", + "The easiest place to start is gathering your data. 
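\n", + "\n", + "As a preview of the API pull described next, here is a minimal sketch of fetching reviews for a single title and landing them for analysis (the endpoint, game ID, and response fields are placeholders; each platform has its own API):\n", + "\n", + "```python\n", + "import requests\n", + "\n", + "GAME_ID = '123456'  # illustrative store ID for your title\n", + "url = f'https://example-store-api.com/games/{GAME_ID}/reviews'  # placeholder; substitute your platform API\n", + "\n", + "payload = requests.get(url, params={'count': 100}).json()\n", + "\n", + "# Land the raw reviews in a Delta table for the downstream NLP pipeline\n", + "reviews_df = spark.createDataFrame(payload.get('reviews', []))\n", + "reviews_df.write.format('delta').mode('append').saveAsTable('feedback.raw_reviews')\n", + "```\n", + "\n", + "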
With\n", + "\n", + "accounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n", + "\n", + "Nintendo (or whatever platform you’re using), identify the ID\n", + "\n", + "for your game(s), and pull the reviews corresponding to that\n", + "\n", + "game into Databricks through an API call.\n", + "\n", + "\n", + "From here, you clean the data using some of the pre-\n", + "\n", + "processing available in Python that removes any emojis and\n", + "\n", + "ASCII characters. Once complete, run through Spark NLP\n", + "\n", + "pipeline which does the basic natural language processing\n", + "\n", + "steps such as normalization, stemming, lemmatization. We\n", + "\n", + "recommend running through pre-trained models, such as Word\n", + "\n", + "Embeddings and Named Entity Recognition models from John\n", + "\n", + "Snow Labs. This should complete the pipeline and generates\n", + "\n", + "the aspects for the reviews provided by the community.\n", + "\n", + "This data is then loaded into a Delta table for further analysis,\n", + "\n", + "such as using a visual dashboard (built on SQL queries inside\n", + "\n", + "Databricks) to analyze and understand the aspects the\n", + "\n", + "community is talking about, which can then be shared back\n", + "\n", + "with the development team for analysis and action. This is a\n", + "\n", + "great exercise to run once per month.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Check for word groupings:** Make sure your word groupings\n", + "\n", + "are accurate to improve the analysis. For example, if your\n", + "\n", + "game is called Football Manager, and the shorthand is FM,\n", + "\n", + "make sure both of those are grouped appropriately.\n", + "\n", + "- **Leverage domain knowledge:** Clean the reviews based\n", + "\n", + "on your domain knowledge. There are generic steps one\n", + "\n", + "could take, but that will not be as effective as someone\n", + "\n", + "with domain, and specific game knowledge of your title.\n", + "\n", + "- **Experiment with models:** Feel free to try multiple pre-\n", + "\n", + "trained models, and or tweak the pre-trained models\n", + "\n", + "based on your understanding of the domain to improve\n", + "\n", + "the accuracy of your results.\n", + "\n", + "- **Work one title at a time:** This process works best when\n", + "\n", + "pulling reviews for a single title, specifically one version of\n", + "\n", + "one title at a time.\n", + "\n", + "- **Let the model to the heavy lift, but use humans to double-**\n", + "\n", + "**check:** The sentiment corresponding to the aspects in the\n", + "\n", + "model will be labeled as Positive or Negative. In the case\n", + "\n", + "of a neutral review, the model will do its best to determine\n", + "\n", + "whether that is more positive or negative. 
A best practice\n", + "\n", + "is to spend time going back through the aspects early to\n", + "\n", + "determine model accuracy and make updates accordingly.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Toxicity Detection\n", + "\n", + "Our recommendation on tackling the toxicity issue is\n", + "\n", + "to leverage cloud-agnostic and flexible tooling that can\n", + "\n", + "consume chat data from a variety of sources, such as chat\n", + "\n", + "logs, voice transcriptions, or sources like discord and reddit\n", + "\n", + "forums. No matter if the data is in log form from game\n", + "\n", + "servers or events from a message system, Databricks can\n", + "\n", + "provide quick and easy ways to ingest the data.\n", + "\n", + "Leveraging a simplified architecture like the diagram\n", + "\n", + "above shows no matter the source, getting chat data for\n", + "\n", + "inferencing and model development can be as simple. While\n", + "\n", + "we leveraged a pre-built model from John Snow Labs to\n", + "\n", + "accelerate development, you can bring the ML framework of\n", + "\n", + "your choice to the platform.\n", + "\n", + "**[Gaming Toxicity solution accelerator](https://notebooks.databricks.com/notebooks/CME/Toxicity_Detection_in_Gaming/index.html)**\n", + "\n", + "\n", + "**Tips / Best Practices - things to consider**\n", + "\n", + "- **Define what toxic and disruptive behavior looks**\n", + "\n", + "**like within your community:** Clearly define what you\n", + "\n", + "consider to be toxic behavior, as this will determine how\n", + "\n", + "you measure and detect it. This might include things like\n", + "\n", + "hateful language, harassment, or cheating.\n", + "\n", + "- **Collect relevant data:** Make sure you are collecting the\n", + "\n", + "right data to help you detect toxicity. This might include\n", + "\n", + "data on in-game chat, player reports, and other sources.\n", + "\n", + "- **Use machine learning:** Use machine learning algorithms\n", + "\n", + "to analyze your data and identify patterns of toxic\n", + "\n", + "behavior. This will allow you to more accurately detect\n", + "\n", + "toxicity and prioritize cases for review.\n", + "\n", + "- **Test and optimize:** Regularly review and test your toxicity\n", + "\n", + "detection systems to ensure they are accurate and\n", + "\n", + "effective. Use experimentation methods such as A/B\n", + "\n", + "testing to see how different strategies impact toxicity rates.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with your\n", + "\n", + "players about how you are detecting toxicity, and give\n", + "\n", + "them the option to opt out if they wish.\n", + "\n", + "- **Take action:** When toxic behavior is detected, take\n", + "\n", + "appropriate action to address it. The health and wellness\n", + "\n", + "of your community depends on it. This might involve\n", + "\n", + "banning players, issuing warnings, or taking other\n", + "\n", + "disciplinary measures.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Multi-Touch Attribution and Media Mix Modeling\n", + "\n", + "To get started with multi-touch attribution, you need to first\n", + "\n", + "select an attribution model. 
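\n", + "\n", + "To make that concrete before walking through the options below, here is a sketch of the simplest rule, last-click attribution, computed over a table of tracked touchpoints (table and column names are illustrative):\n", + "\n", + "```python\n", + "from pyspark.sql import Window, functions as F\n", + "\n", + "touchpoints = spark.table('marketing.touchpoints')  # one row per player and channel interaction\n", + "\n", + "# Rank each player's touchpoints by recency and credit the last one before install\n", + "w = Window.partitionBy('player_id').orderBy(F.col('event_time').desc())\n", + "\n", + "last_click = (touchpoints\n", + "    .withColumn('rank', F.row_number().over(w))\n", + "    .filter('rank = 1')\n", + "    .groupBy('channel')\n", + "    .agg(F.count('*').alias('attributed_installs')))\n", + "```\n", + "\n", + "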
There are a variety of different\n", + "\n", + "attribution models to choose from, each with its own\n", + "\n", + "\n", + "attribution credit according to your chosen model (above).\n", + "\n", + "We highly recommend you regularly review and test your\n", + "\n", + "attribution efforts to ensure they are accurate and effective.\n", + "\n", + "Use experimentation methods such as A/B testing to see\n", + "\n", + "how different strategies impact conversion rates.\n", + "\n", + "**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**\n", + "\n", + "\n", + "strengths and limitations.\n", + "\n", + "\n", + "`1.` **Last-click model:** This model attributes all credit to the\n", + "\n", + "last touchpoint that the customer interacted with before\n", + "\n", + "making a purchase or taking a desired action.\n", + "\n", + "`2.` **First-click model:** This model attributes all credit to the\n", + "\n", + "first touchpoint that the customer interacted with.\n", + "\n", + "`3.` **Linear model:** This model attributes equal credit to each\n", + "\n", + "touchpoint that the customer interacted with.\n", + "\n", + "`4.` **Time decay model:** This model attributes more credit to\n", + "\n", + "touchpoints that are closer in time to the purchase\n", + "\n", + "or desired action.\n", + "\n", + "`5.` **Position-based model:** This model attributes a portion of\n", + "\n", + "the credit to the first and last touchpoints, and the remainder\n", + "\n", + "is distributed evenly among the other touchpoints.\n", + "\n", + "`6.` **Custom model:** Some businesses create their own\n", + "\n", + "attribution model based on specific business needs or goals.\n", + "\n", + "Each attribution model has its own strengths and limitations,\n", + "\n", + "and the right model for a particular video game will depend\n", + "\n", + "on a variety of factors, including the goals of your title, the\n", + "\n", + "customer journey, and the types of marketing channels being\n", + "\n", + "used. It is important to carefully consider the pros and cons\n", + "\n", + "of each model and choose the one that best aligns with the\n", + "\n", + "needs of your game.\n", + "\n", + "Next, you’re going to want to set up tracking. In order to\n", + "\n", + "attribute credit to different touchpoints, you’ll need to set up\n", + "\n", + "tracking to capture data on customer interactions. This might\n", + "\n", + "involve integrating tracking code into the game, or using a\n", + "\n", + "third-party tracking tool.\n", + "\n", + "With tracking set up, you’ll start collecting data on player\n", + "\n", + "interactions and be able to use that information to calculate\n", + "\n", + "\n", + "**Tips / Best Practices - things to consider**\n", + "\n", + "- **Define clear goals:** Sounds simple, but by clearly defining\n", + "\n", + "the goals of your acquisition campaign and what success\n", + "\n", + "looks like, you will be able to guide your decision-making\n", + "\n", + "and ensure that you are measuring the right metrics -\n", + "\n", + "such as cost per install, return on ad spend, conversion\n", + "\n", + "rate, lifetime value, retention rate, and more.\n", + "\n", + "- **Use a data-driven approach:** Use data to inform your\n", + "\n", + "decision-making. 
Collect data on all touchpoints in the\n", + "\n", + "player journey, including ad impressions, clicks, installs,\n", + "\n", + "and in-game actions.\n", + "\n", + "- **Choose the right attribution model:** Select the right\n", + "\n", + "attribution model that accurately reflects the player\n", + "\n", + "journey for your specific genre of game. This can be a\n", + "\n", + "complex process. A couple of things to keep in mind\n", + "\n", + "- Consider the touchpoints that are most important for\n", + "\n", + "your player journey, such as first ad impression, first\n", + "\n", + "click, or first in-game action\n", + "\n", + "- Consider the business goals you’re trying to achieve.\n", + "\n", + "For example, if you are focused on maximizing return\n", + "\n", + "on investment, a last-click attribution model may be\n", + "\n", + "most appropriate. On the other hand, if you are looking\n", + "\n", + "to understand the impact of each touchpoint, a multi-\n", + "\n", + "touch attribution model may be more appropriate.\n", + "\n", + "- Consider the data you have available, including ad\n", + "\n", + "impressions, clicks, installs, and in-game actions.\n", + "\n", + "- **Continuously monitor and optimize:** Continuously\n", + "\n", + "monitor and optimize your acquisition campaigns based on\n", + "\n", + "the data. Test different approaches, make adjustments as\n", + "\n", + "needed, and use A/B testing to determine what works best.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Player Recommendations\n", + "\n", + "Recommendations is an advanced use case. We don’t\n", + "\n", + "recommend (hehe) that you start here, instead, we’re\n", + "\n", + "assuming that you’ve done the work to set up your game\n", + "\n", + "analytics (collecting, cleaning, and preparing data for analysis)\n", + "\n", + "and that you’ve done basic segmentation to place your\n", + "\n", + "players in cohorts based on their interests and behaviors.\n", + "\n", + "Recommendations can come in many forms for video games.\n", + "\n", + "For this context, we’re going to focus on the wide-and-deep\n", + "\n", + "learning for recommender systems, which has the ability\n", + "\n", + "to both memorize and generalize recommendations based\n", + "\n", + "on player behavior and interactions. First [introduced by](https://arxiv.org/abs/1606.07792)\n", + "\n", + "[Google](https://arxiv.org/abs/1606.07792) for use in its Google Play app store, the wide-and-\n", + "\n", + "deep machine learning (ML) model has become popular in a\n", + "\n", + "variety of online scenarios for its ability to personalize user\n", + "\n", + "engagements, even in ‘cold start problem’ scenarios with\n", + "\n", + "sparse data inputs.\n", + "\n", + "The goal with wide-and-deep recommenders is to provide\n", + "\n", + "\n", + "**Understanding the model design**\n", + "\n", + "To understand the concept of wide-and-deep recommend­\n", + "\n", + "ations, it’s best to think of it as two separate, but collaborating,\n", + "\n", + "engines. The wide model, often referred to in the literature as\n", + "\n", + "the linear model, memorizes users and their past choices. 
Its\n", + "\n", + "inputs may consist simply of a user identifier and a product\n", + "\n", + "identifier, though other attributes relevant to the pattern (such\n", + "\n", + "as time of day) may also be incorporated.\n", + "\n", + "The deep portion of the model, so named as it is a deep\n", + "\n", + "neural network, examines the generalizable attributes of a\n", + "\n", + "user and their choices. From these, the model learns the\n", + "\n", + "broader characteristics that tend to favor user selections.\n", + "\n", + "Together, the wide-and-deep submodels are trained\n", + "\n", + "on historical product selections by individual users to\n", + "\n", + "predict future selections. The end result is a single model\n", + "\n", + "capable of calculating the probability with which a user will\n", + "\n", + "purchase a given item, given both memorized past choices\n", + "\n", + "and generalizations about a user’s preferences. These\n", + "\n", + "probabilities form the basis for user-specific rankings, which\n", + "\n", + "can be used for making recommendations.\n", + "\n", + "\n", + "an intimate level of player understanding. This model uses\n", + "\n", + "\n", + "explicit and implicit feedback to expand the considerations\n", + "\n", + "set for players. Wide-and-deep recommenders go beyond\n", + "\n", + "simple weighted averaging of player feedback found in some\n", + "\n", + "collaborative filters to balance what is understood about\n", + "\n", + "the individual with what is known about similar gamers. If\n", + "\n", + "done properly, the recommendations make the gamer feel\n", + "\n", + "understood (by your title) and this should translate into\n", + "\n", + "greater value for both the player and you as the business.\n", + "\n", + "\n", + "**Building the model**\n", + "\n", + "The intuitive logic of the wide-and-deep recommender\n", + "\n", + "belies the complexity of its actual construction. Inputs\n", + "\n", + "must be defined separately for each of the wide-and-\n", + "\n", + "deep portions of the model and each must be trained in a\n", + "\n", + "coordinated manner to arrive at a single output, but tuned\n", + "\n", + "using optimizers specific to the nature of each submodel.\n", + "\n", + "Thankfully, the [Tensorflow DNNLinearCombinedClassifier](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier)\n", + "\n", + "[estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier) provides a pre-packaged architecture, greatly\n", + "\n", + "simplifying the assembly of an overall model.\n", + "\n", + "\n", + "**User A**\n", + "\n", + "- user identity\n", + "\n", + "- user attributes\n", + "\n", + "**Product B**\n", + "\n", + "\n", + "**Wide**\n", + "**Sub-Model**\n", + "\n", + "\n", + "**Probability of**\n", + "\n", + "**User A + Product B**\n", + "\n", + "**Wide & Deep**\n", + "**Model**\n", + "\n", + "\n", + "**Deep**\n", + "**Sub-Model**\n", + "\n", + "\n", + "\n", + "- product identity\n", + "\n", + "- product attributes\n", + "\n", + "\n", + "-----\n", + "\n", + "**Training**\n", + "\n", + "The challenge for most teams is then training the\n", + "\n", + "recommender on the large number of user-product\n", + "\n", + "combinations found within their data. 
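\n", + "\n", + "Before turning to that data-feeding challenge, here is a minimal sketch of how the wide and deep halves come together with the estimator mentioned above (the feature columns shown are illustrative):\n", + "\n", + "```python\n", + "import tensorflow as tf\n", + "\n", + "# Wide side: memorization over sparse ids and their crosses\n", + "user_id = tf.feature_column.categorical_column_with_hash_bucket('user_id', 10000)\n", + "item_id = tf.feature_column.categorical_column_with_hash_bucket('item_id', 10000)\n", + "wide_columns = [user_id, item_id, tf.feature_column.crossed_column([user_id, item_id], 100000)]\n", + "\n", + "# Deep side: generalization over embeddings and dense attributes\n", + "deep_columns = [\n", + "    tf.feature_column.embedding_column(user_id, 16),\n", + "    tf.feature_column.embedding_column(item_id, 16),\n", + "    tf.feature_column.numeric_column('sessions_last_7d'),\n", + "]\n", + "\n", + "model = tf.estimator.DNNLinearCombinedClassifier(\n", + "    linear_feature_columns=wide_columns,\n", + "    dnn_feature_columns=deep_columns,\n", + "    dnn_hidden_units=[128, 64, 32])\n", + "```\n", + "\n", + "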
Using [Petastorm](https://petastorm.readthedocs.io/en/latest/) , an\n", + "\n", + "open-source library for serving large datasets assembled in\n", + "\n", + "Apache Spark™ to Tensorflow (and other ML libraries), one can\n", + "\n", + "cache the data on high-speed, temporary storage and then\n", + "\n", + "read that data in manageable increments to the model during\n", + "\n", + "training. In doing so, we limit the memory overhead associated\n", + "\n", + "with the training exercise while preserving performance.\n", + "\n", + "**Tuning**\n", + "\n", + "Tuning the model becomes the next challenge. Various model\n", + "\n", + "parameters control its ability to arrive at an optimal solution.\n", + "\n", + "The most efficient way to work through the potential parameter\n", + "\n", + "combinations is simply to iterate through some number of\n", + "\n", + "training cycles, comparing the models’ evaluation metrics with\n", + "\n", + "each run to identify the ideal parameter combinations. By\n", + "\n", + "trials, we can parallelize this work across many compute nodes,\n", + "\n", + "allowing the optimizations to be performed in a timely manner.\n", + "\n", + "**Deploying**\n", + "\n", + "Finally, we need to deploy the model for integration with\n", + "\n", + "various retail applications. Leveraging [MLflow](https://www.mlflow.org/) allows us\n", + "\n", + "to both persist our model and package it for deployment\n", + "\n", + "across a wide variety of microservices layers, including\n", + "\n", + "Azure Machine Learning, AWS Sagemaker, Kubernetes and\n", + "\n", + "Databricks Model Serving.\n", + "\n", + "While this seems like a large number of technologies to bring\n", + "\n", + "together just to build a single model, Databricks integrates all\n", + "\n", + "of these technologies within a single platform, providing data\n", + "\n", + "scientists, data engineers & [MLOps](https://www.databricks.com/glossary/mlops) Engineers a unified exper­\n", + "\n", + "ience. The pre-integration of these technologies means various\n", + "\n", + "per­sonas can work faster and leverage additional capabilities,\n", + "\n", + "such as the [automated tracking](https://docs.databricks.com/machine-learning/automl-hyperparam-tuning/index.html#automated-mlflow-tracking) of models, to enhance the\n", + "\n", + "transparency of the organization’s model building efforts.\n", + "\n", + "To see an end-to-end example of how a wide and deep\n", + "\n", + "recommender model may be built on Databricks, please\n", + "\n", + "check out the following notebooks: [Get the notebook](https://d1r5llqwmkrl74.cloudfront.net/notebooks/RCG/Wide_and_Deep/index.html#Wide_and_Deep_1.html)\n", + "\n", + "**[Recommendation Engines solution accelerator](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n", + "\n", + "\n", + "**Tips / Best Practices - things to consider**\n", + "\n", + "- **Use data to inform recommendations:** Use data from\n", + "\n", + "your analytics, player feedback, and other sources to\n", + "\n", + "understand what players like and dislike. This will help\n", + "\n", + "you create recommendations that are more likely to be\n", + "\n", + "relevant and engaging for individual players.\n", + "\n", + "- **Segment your players:** Consider segmenting your players\n", + "\n", + "based on characteristics such as playstyle, spending\n", + "\n", + "habits, and demographic information. 
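\n", + "\n", + "A minimal sketch of the kind of clustering this refers to (the feature and table names are illustrative):\n", + "\n", + "```python\n", + "from pyspark.ml.feature import VectorAssembler\n", + "from pyspark.ml.clustering import KMeans\n", + "\n", + "players = spark.table('game_analytics.player_features')\n", + "features = ['sessions_per_week', 'avg_session_minutes', 'total_spend_usd']\n", + "\n", + "assembled = VectorAssembler(inputCols=features, outputCol='features').transform(players)\n", + "segments = KMeans(k=4, seed=42).fit(assembled).transform(assembled)  # adds a cluster id per player\n", + "```\n", + "\n", + "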
This will allow you\n", + "\n", + "to create more targeted recommendations for different\n", + "\n", + "groups of players.\n", + "\n", + "- **Consider the player’s current context:** When creating\n", + "\n", + "recommendations, consider the player’s current context,\n", + "\n", + "such as what they are doing in the game and what\n", + "\n", + "content they have already consumed. This will help\n", + "\n", + "you create recommendations that are more likely to be\n", + "\n", + "relevant and timely.\n", + "\n", + "- **Test and optimize your recommendations:** Use\n", + "\n", + "experimentation methods such as A/B testing to see\n", + "\n", + "how different recommendations perform with different\n", + "\n", + "player segments. Use the insights you gain to optimize\n", + "\n", + "your recommendations.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with\n", + "\n", + "players about how you are creating recommendations and\n", + "\n", + "give them the option to opt out if they wish.\n", + "\n", + "- **Use recommendations to improve the player experience:**\n", + "\n", + "Use personalized recommendations to improve the player\n", + "\n", + "experience and increase engagement and satisfaction.\n", + "\n", + "### Getting Started with Next Best Offer/Action\n", + "\n", + "Since NBO/NBA is a specific use case of personalization, how a\n", + "\n", + "team might get started implementing this will look very similar\n", + "\n", + "to how they would with broader personalization activities.\n", + "\n", + "Begin with ensuring you are appropriately collecting player\n", + "\n", + "data (behavior, preferences, in-game purchases, etc), storing\n", + "\n", + "it in your cloud data lake using a service such as Delta Lake\n", + "\n", + "from Databricks. From here, you’ll prepare the data using\n", + "\n", + "Databricks to clean, transform, and prepare for analysis.\n", + "\n", + "This may include aggregating data from multiple sources,\n", + "\n", + "removing duplicates and outliers, and transforming the data\n", + "\n", + "into a format suitable for analysis. As you analyze the player\n", + "\n", + "data, seek to identify patterns and trends in player behavior\n", + "\n", + "\n", + "-----\n", + "\n", + "and preferences that will give you signal on which actions are\n", + "\n", + "more likely to be successful.\n", + "\n", + "From here, you can build a recommendation model based\n", + "\n", + "on the player data analysis, and incorporate information\n", + "\n", + "on in-game items and player preferences to make\n", + "\n", + "personalized recommendations.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Define your goals:** Like every use case, starting with\n", + "\n", + "clearly defined goals helps to ensure your implementation\n", + "\n", + "of NBO and NBA will be as effective and efficient as\n", + "\n", + "possible. Your goals will also help you determine what data\n", + "\n", + "to collect and how it will be used.\n", + "\n", + "- **Collect relevant data:** Based on your goals, make sure\n", + "\n", + "you are collecting the right data to inform your NBO and\n", + "\n", + "NBA recommendations. 
This might include data on player\n", + "\n", + "behavior, engagement, and spending habits.\n", + "\n", + "- **Leverage machine learning to scale your**\n", + "\n", + "**recommendations:** Use machine learning algorithms to\n", + "\n", + "analyze your data and make personalized recommendations\n", + "\n", + "to your players. This will allow you to identify trends and\n", + "\n", + "patterns that might not be immediately apparent.\n", + "\n", + "- **Test and optimize:** THIS IS CRITICAL. Use experimentation\n", + "\n", + "methods such as A/B testing to see how different\n", + "\n", + "recommendations perform with different player segments.\n", + "\n", + "Past performance is not a perfect indicator of future\n", + "\n", + "success. Consistent testing allows you to tune your NBO and\n", + "\n", + "NBA recommendations so they evolve with your playerbase.\n", + "\n", + "- **Consider the player’s context:** When making recommend­\n", + "\n", + "ations, consider the player’s current context, such as what\n", + "\n", + "they are doing in the game and what content they have\n", + "\n", + "already consumed. This will help you create recommend­\n", + "\n", + "ations that are more likely to be relevant and timely.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with\n", + "\n", + "your players about how you are using their data to make\n", + "\n", + "recommendations, and give them the option to opt out if\n", + "\n", + "they wish.\n", + "\n", + "- **Collaborate with your team:** Share your NBO and NBA\n", + "\n", + "\n", + "### Getting Started with Churn Prediction & Prevention\n", + "\n", + "The exciting part of this analysis is that not only does it\n", + "\n", + "help to quantify the risk of customer churn but it paints a\n", + "\n", + "quantitative picture of exactly which factors explain that risk.\n", + "\n", + "It’s important that we not draw too rash of a conclusion with\n", + "\n", + "regards to the causal linkage between a particular attribute\n", + "\n", + "and its associated hazard, but it’s an excellent starting point\n", + "\n", + "for identifying where an organization needs to focus its\n", + "\n", + "attention for further investigation.\n", + "\n", + "The hard part in this analysis is not the analytic techniques.\n", + "\n", + "The Kaplan-Meier curves and Cox Proportional Hazard\n", + "\n", + "models used to perform the analysis above are well\n", + "\n", + "established and widely supported across analytics platforms.\n", + "\n", + "The principal challenge is organizing the input data.\n", + "\n", + "The vast majority of subscription services are fairly new as\n", + "\n", + "businesses. As such, the data required to examine customer\n", + "\n", + "attrition may be scattered across multiple systems,\n", + "\n", + "making an integrated analysis more difficult. Data Lakes\n", + "\n", + "are a starting point for solving this problem, but complex\n", + "\n", + "transformations required to cleanse and restructure data\n", + "\n", + "that has evolved as the business itself has (often rapidly)\n", + "\n", + "evolved requires considerable processing power. 
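\n", + "\n", + "For reference, once the input data is organized the modeling step itself is short; a sketch using the open-source lifelines library (column names are illustrative):\n", + "\n", + "```python\n", + "from lifelines import KaplanMeierFitter, CoxPHFitter\n", + "\n", + "# One row per subscriber: observed tenure in days, churn flag, and covariates\n", + "df = spark.table('analytics.subscriber_survival').toPandas()\n", + "\n", + "kmf = KaplanMeierFitter().fit(df['tenure_days'], event_observed=df['churned'])\n", + "kmf.plot_survival_function()\n", + "\n", + "cph = CoxPHFitter().fit(df[['tenure_days', 'churned', 'payment_plan_days', 'is_auto_renew']],\n", + "                        duration_col='tenure_days', event_col='churned')\n", + "cph.print_summary()  # hazard ratio per covariate\n", + "```\n", + "\n", + "The heavier lift, as noted above, is the data preparation that feeds these models.\n", + "\n", + "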
This is\n", + "\n", + "certainly the case with the KKBox information assets and is a\n", + "\n", + "point noted by the data provider in their public challenge.\n", + "\n", + "The key to successfully completing this work is the\n", + "\n", + "establishment of transparent, maintainable data processing\n", + "\n", + "pipelines executed on an elastically scalable (and therefore\n", + "\n", + "cost-efficient) infrastructure, a key driver behind the [Delta](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n", + "\n", + "[Lake pattern](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html) . While most organizations may not be overly\n", + "\n", + "cost-conscious in their initial approach, it’s important to\n", + "\n", + "remember the point made above that churn is a chronic\n", + "\n", + "condition to be managed. As such, this is an analysis that\n", + "\n", + "should be periodically revisited to ensure acquisition and\n", + "\n", + "retention practices are aligned.\n", + "\n", + "To support this, we are making the code behind our\n", + "\n", + "analysis available for download and review. If you have any\n", + "\n", + "questions about how this solution can be deployed in your\n", + "\n", + "environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "\n", + "efforts with your team and encourage them to use the\n", + "\n", + "\n", + "data to inform their work.\n", + "\n", + "\n", + "**[Churn Prediction solution accelerator](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Define churn:** Clearly define what you consider to be\n", + "\n", + "player churn, as this will determine how you measure\n", + "\n", + "and predict it. For example, you might consider churn to\n", + "\n", + "be when a player stops playing your game for a certain\n", + "\n", + "number of days, or when they uninstall it.\n", + "\n", + "- **Collect relevant data:** Make sure you are collecting the\n", + "\n", + "right data to help you predict and prevent churn. This\n", + "\n", + "might include data on player behavior, engagement, and\n", + "\n", + "spending habits.\n", + "\n", + "- **Use machine learning:** Use machine learning algorithms\n", + "\n", + "to analyze your data and predict which players are at\n", + "\n", + "risk of churning. This will allow you to identify trends and\n", + "\n", + "patterns that might not be immediately apparent.\n", + "\n", + "- **Test and optimize:** Use experimentation methods such as\n", + "\n", + "A/B testing to see how different strategies impact churn\n", + "\n", + "rates. Use the insights you gain to optimize your churn\n", + "\n", + "prevention efforts.\n", + "\n", + "- **Focus on retention:** Implement retention strategies that are\n", + "\n", + "tailored to the needs and preferences of your players. 
This\n", + "\n", + "might involve providing personalized content, addressing\n", + "\n", + "pain points, or offering incentives to continue playing.\n", + "\n", + "- **Be transparent:** Make sure you are transparent with your\n", + "\n", + "players about how you are using their data to predict and\n", + "\n", + "prevent churn, and give them the option to opt out if\n", + "\n", + "they wish.\n", + "\n", + "- **Collaborate with your team:** Share your churn prediction\n", + "\n", + "and prevention efforts with your team and encourage\n", + "\n", + "them to use the data to inform their work.\n", + "\n", + "### Getting Started with Read-time Ad Targeting\n", + "\n", + "Typically, implementing a real-time ad targeting strategy begins\n", + "\n", + "outside of your game (in services such as Google Ads, Unity\n", + "\n", + "Advertising), where your game becomes the delivery point\n", + "\n", + "for the advertisement. Here, you will need to integrate with\n", + "\n", + "Ad networks that provide real-time ad targeting capabilities.\n", + "\n", + "That will allow you to access a range of available ad assets\n", + "\n", + "to dynamically select and display the most relevant ads to\n", + "\n", + "players. Both Google AdMob and Unity Ads are great for banner\n", + "\n", + "ads, native ads, and rewarded video ads. Your role is to ensure\n", + "\n", + "that the data you’re collecting is fed back into the advertising\n", + "\n", + "platform to better serve targeted ads to your playerbase.\n", + "\n", + "\n", + "To use a service like Databricks to manage the data needed\n", + "\n", + "to provide real-time ad targeting in your application, you can\n", + "\n", + "follow the below steps:\n", + "\n", + "`1.` **Collect and store player data:** Collect data on player\n", + "\n", + "behavior, preferences, and demographics, and store it in\n", + "\n", + "a data lake using Databricks. Popular analytics tools such\n", + "\n", + "as Google Analytics or Mixpanel can be integrated into\n", + "\n", + "the game to collect data on player behavior. These tools,\n", + "\n", + "just like tracking website traffic, can track in-game events,\n", + "\n", + "provide insights on player behavior and demographics..\n", + "\n", + "and they give you access to detailed reports and\n", + "\n", + "dashboards. Another option is to build in-house tracking\n", + "\n", + "systems to collect data on player behavior - logging\n", + "\n", + "events, e.g in-game purchases or player actions, activities\n", + "\n", + "such as “at which level does a player quit playing” and\n", + "\n", + "storing this in a database for analysis. The downside of\n", + "\n", + "building in-house tracking systems is you will need to host\n", + "\n", + "and maintain your own logging servers.\n", + "\n", + "`2.` **Prepare the data:** Use Databricks to clean, transform,\n", + "\n", + "and prepare the player data for analysis. 
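\n", + "\n", + "A minimal sketch of the kind of clean-up described next (tables, columns, and thresholds are illustrative):\n", + "\n", + "```python\n", + "from pyspark.sql import functions as F\n", + "\n", + "events = spark.table('ads.player_events')\n", + "\n", + "prepared = (events\n", + "    .dropDuplicates(['player_id', 'event_id'])   # remove duplicate events\n", + "    .filter(F.col('session_minutes') < 600)      # drop implausible outlier sessions\n", + "    .groupBy('player_id')\n", + "    .agg(F.count('*').alias('events_30d'),\n", + "         F.sum('purchase_usd').alias('spend_30d')))\n", + "\n", + "prepared.write.format('delta').mode('overwrite').saveAsTable('ads.player_features')\n", + "```\n", + "\n", + "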
This may\n", + "\n", + "include aggregating data from multiple sources, removing\n", + "\n", + "duplicates and outliers, and transforming the data into a\n", + "\n", + "format suitable for analysis.\n", + "\n", + "`3.` **Analyze the data:** Use Databricks’ built-in machine\n", + "\n", + "learning and data analytics capabilities to analyze the\n", + "\n", + "player data and identify patterns and trends.\n", + "\n", + "`4.` **Create audience segments:** Based on the analysis,\n", + "\n", + "use Databricks to create audience segments based on\n", + "\n", + "common characteristics such as interests, behaviors,\n", + "\n", + "and preferences.\n", + "\n", + "`5.` **Integrate with the ad server:** When an ad opportunity\n", + "\n", + "presents itself within the game, a call is made to the ad\n", + "\n", + "server. This call includes information about the player,\n", + "\n", + "such as the audience segment that they belong to. The\n", + "\n", + "ad server then uses this information to decide what ad to\n", + "\n", + "deliver to the player.\n", + "\n", + "`6.` **Monitor and optimize:** Use Databricks to monitor the\n", + "\n", + "performance of the ad targeting and make optimizations\n", + "\n", + "as needed, such as adjusting the audience segments or\n", + "\n", + "adjusting the targeting algorithms.\n", + "\n", + "By using a service like Databricks to manage the data needed\n", + "\n", + "for real-time ad targeting, game developers can effectively\n", + "\n", + "leverage their player data to create more personalized and\n", + "\n", + "engaging experiences, increase revenue, and reduce churn.\n", + "\n", + "\n", + "-----\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Focus on player data:** Make player data the center of your\n", + "\n", + "targeting strategy by collecting and storing comprehensive\n", + "\n", + "information on player behavior, preferences, and\n", + "\n", + "demographics. 
Here, it’s critical to ensure the game code\n", + "\n", + "data trackers are properly implemented in order to collect\n", + "\n", + "this data (see Game Analytics section for detail).\n", + "\n", + "- **Segment your audience:** Create audience segments\n", + "\n", + "based on common characteristics such as interests,\n", + "\n", + "behaviors, and preferences, and use these segments to\n", + "\n", + "\n", + "**Test and iterate:** Continuously test and iterate your\n", + "\n", + "targeting strategy to refine your audience segments and\n", + "\n", + "improve targeting accuracy.\n", + "\n", + "**Balance relevance and privacy:** Balance the need for\n", + "\n", + "relevant, personalized ads with players’ privacy by only\n", + "\n", + "collecting and using data that is necessary for targeting\n", + "\n", + "and obtaining player consent.\n", + "\n", + "**Monitor performance:** Regularly monitor the performance\n", + "\n", + "of your targeting strategy to ensure that it is delivering the\n", + "\n", + "desired results and make optimizations as needed.\n", + "\n", + "**Partner with the right ad platform:** Choose an ad\n", + "\n", + "platform that is well-suited to your needs and aligns with\n", + "\n", + "your goals, and work closely with them to ensure that your\n", + "\n", + "targeting strategy is delivering the best results.\n", + "\n", + "\n", + "deliver targeted ads.\n", + "\n", + "# Operational use cases\n", + "\n", + "\n", + "### Anomaly Detection\n", + "\n", + "First thing is to begin collecting the data, game server / client\n", + "\n", + "logs out of your project. Then consume this into Databricks\n", + "\n", + "Delta, to have a continuous anomaly detection model\n", + "\n", + "running. Focus this on key pieces of information you want to\n", + "\n", + "monitor, for example - for live service games, this is going to\n", + "\n", + "be infrastructure and network-related metrics such as Ping\n", + "\n", + "and Server Health (# of clients connected, server uptime,\n", + "\n", + "server usage, CPU/RAM, # of sessions, time of sessions).\n", + "\n", + "Once the model is ingesting and tuned specifically for the\n", + "\n", + "metrics based on the information you have above. You would\n", + "\n", + "build out alerts or notifications based on these specific\n", + "\n", + "metrics hitting a threshold that you define as needing\n", + "\n", + "attention. From here, you can build out automated systems\n", + "\n", + "to mitigate those effects - such as migrating players to a\n", + "\n", + "different server, canceling matches, scaling infrastructure,\n", + "\n", + "creating tickets for admins to review.\n", + "\n", + "If you have any questions about how this solution can be\n", + "\n", + "deployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n", + "\n", + "[out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Define the problem and objectives clearly:** Before\n", + "\n", + "implementing an anomaly detection solution, it is\n", + "\n", + "important to define the problem you are trying to solve\n", + "\n", + "and your specific objectives. 
This will help ensure that\n", + "\n", + "you have the right data sources and use the appropriate\n", + "\n", + "algorithms to achieve your goals.\n", + "\n", + "- **Choose the right data sources:** To effectively detect\n", + "\n", + "anomalies, you need to have the right data sources.\n", + "\n", + "Consider data from player behavior, system performance,\n", + "\n", + "and network traffic, as well as any other data sources that\n", + "\n", + "are relevant to your problem and objectives.\n", + "\n", + "- **Clean and preprocess the data:** To ensure that the\n", + "\n", + "data you use for anomaly detection is accurate and\n", + "\n", + "meaningful, it is important to clean and preprocess the\n", + "\n", + "data. This includes removing any irrelevant or invalid data,\n", + "\n", + "handling missing values, and normalizing the data\n", + "\n", + "if necessary.\n", + "\n", + "- **Choose the right algorithms:** There are many algorithms\n", + "\n", + "that can be used for anomaly detection, including\n", + "\n", + "statistical methods, machine learning algorithms, and\n", + "\n", + "rule-based systems. Choose the algorithms that are best\n", + "\n", + "\n", + "-----\n", + "\n", + "suited to your data and problem, and that provide the\n", + "\n", + "right level of accuracy, speed, and scalability.\n", + "\n", + "- **Validate the results:** Before deploying the anomaly\n", + "\n", + "detection solution in production, it is important to validate\n", + "\n", + "the results by testing the solution on a small subset of\n", + "\n", + "data and comparing the results to expected outcomes.\n", + "\n", + "- **Monitor and update the solution:** Once the anomaly\n", + "\n", + "detection solution is deployed, it is important to monitor\n", + "\n", + "its performance and accuracy, and update the solution as\n", + "\n", + "needed. This may include retraining the algorithms, adding\n", + "\n", + "or removing data sources, and updating the parameters\n", + "\n", + "and thresholds used by the algorithms.\n", + "\n", + "Additionally, there are some key gotchas to look out for when\n", + "\n", + "implementing an anomaly detection solution.\n", + "\n", + "- **Avoid overfitting:** Overfitting occurs when the anomaly\n", + "\n", + "detection solution is too complex and learns the noise\n", + "\n", + "in the data rather than the underlying patterns. To avoid\n", + "\n", + "overfitting, it is important to choose algorithms that are\n", + "\n", + "appropriate for the size and complexity of the data, and to\n", + "\n", + "validate the results using a separate test dataset.\n", + "\n", + "- **False positive and false negative results:** False positive\n", + "\n", + "and false negative results can occur when the anomaly\n", + "\n", + "detection solution is not properly calibrated, or when\n", + "\n", + "the solution is applied to data that is significantly\n", + "\n", + "different from the training data. To minimize the risk of\n", + "\n", + "false positive and false negative results, it is important\n", + "\n", + "to validate the results using a separate test dataset, and\n", + "\n", + "to fine-tune the parameters and thresholds used by the\n", + "\n", + "algorithms as needed.\n", + "\n", + "- **Scalability:** Scalability can be a concern when\n", + "\n", + "implementing an anomaly detection solution, especially\n", + "\n", + "when dealing with large amounts of data. 
To ensure that\n", + "\n", + "the solution can scale to meet the demands of a growing\n", + "\n", + "player base, it is important to choose algorithms that\n", + "\n", + "are fast and scalable, and to deploy the solution using a\n", + "\n", + "scalable infrastructure.\n", + "\n", + "### Getting Started with Build Pipeline\n", + "\n", + "An operational goal game projects have is to make sure\n", + "\n", + "game project builds are generated, delivered quickly and\n", + "\n", + "efficiently to internal testing & external users.\n", + "\n", + "\n", + "A few of the key metrics and capabilities with analyzing your\n", + "\n", + "build pipelines are the below:\n", + "\n", + "- **Build time and speed:** This includes metrics such as\n", + "\n", + "the time it takes to create a build, number of builds, and\n", + "\n", + "compute spent.\n", + "\n", + "- **Build size and storage:** size of the builds, amount of\n", + "\n", + "storage, and network costs.\n", + "\n", + "- **Bug tracking and resolution:** This includes metrics such\n", + "\n", + "as the number of bugs reported, the time it takes to\n", + "\n", + "resolve them, and the number of bugs that are resolved in\n", + "\n", + "each build.\n", + "\n", + "- **Code quality and efficiency:** This includes metrics such\n", + "\n", + "as code complexity, code duplication, and the number of\n", + "\n", + "code lines written.\n", + "\n", + "- **Collaboration and communication:** Such as the number\n", + "\n", + "of code reviews, the number of team meetings, and the\n", + "\n", + "number of code commits.\n", + "\n", + "- **Advanced capabilities:** Such as Predicting real time build\n", + "\n", + "failure to reduce spend and combining build data with\n", + "\n", + "Crash Analytics (see below) to have “commit to build”\n", + "\n", + "visibility for accelerated bug fixing.\n", + "\n", + "Before you start implementing your build pipeline, it’s\n", + "\n", + "important to define your requirements. What are the key\n", + "\n", + "goals of your build pipeline? Choosing the right CI/CD tools is\n", + "\n", + "critical to the success of your build pipeline. There are many\n", + "\n", + "different tools available, including Jenkins, Azure Devops,\n", + "\n", + "Perforce, gitlab and more. When choosing a CI/CD tool,\n", + "\n", + "consider factors such as ease of use, scalability, and cost. In\n", + "\n", + "addition, consider the specific needs of your game project,\n", + "\n", + "and choose a tool that can meet those needs.\n", + "\n", + "The general recommendation is to look at automating your\n", + "\n", + "build process early. Once you’ve chosen your CI/CD tools, you\n", + "\n", + "can automate your build process by setting up a build server,\n", + "\n", + "configuring your CI/CD tool, and creating a script to build your\n", + "\n", + "game project. The build process should be automated as much\n", + "\n", + "as possible, and it should include steps to compile your code,\n", + "\n", + "run automated tests, and generate a build of your project.\n", + "\n", + "Once you have automated your build process, often the\n", + "\n", + "next step is to implement CD (Continuous Delivery). This\n", + "\n", + "involves automating the deployment of your game builds\n", + "\n", + "delivery to stakeholders, such as QA testers, beta testers, or\n", + "\n", + "end-users via publishing platforms. 
CD can help ensure that\n", + "\n", + "stakeholders have access to the latest version of your game\n", + "\n", + "\n", + "-----\n", + "\n", + "as soon as possible, allowing them to provide feedback and\n", + "\n", + "help drive the development process forward.\n", + "\n", + "Finally, it’s important to monitor and measure your build\n", + "\n", + "pipeline to ensure that it’s working as expected. This can\n", + "\n", + "involve using tools such as Databricks Dashboards to\n", + "\n", + "visualize the status of your pipeline, or using metrics such\n", + "\n", + "as build times, test results, and deployment success rates\n", + "\n", + "to evaluate the performance of your pipeline. By monitoring\n", + "\n", + "and measuring your build pipeline, you can identify areas for\n", + "\n", + "improvement and make changes as needed to ensure that\n", + "\n", + "your pipeline continues to meet your needs.\n", + "\n", + "If you have any questions about how databricks can\n", + "\n", + "integrate into your devops solution, please don’t hesitate to\n", + "\n", + "[reach out](https://www.databricks.com/company/contact) to us.\n", + "\n", + "**Tips / Best Practices**\n", + "\n", + "- **Seek to automate early and often:** Automate as much\n", + "\n", + "of the build process as possible, from checking code into\n", + "\n", + "version control to generating builds and distributing them\n", + "\n", + "to stakeholders. This can help reduce errors and save time,\n", + "\n", + "allowing game teams to focus on more high value tasks.\n", + "\n", + "\n", + "**Version control, version control, version control:** Use a\n", + "\n", + "version control system to manage the source code and\n", + "\n", + "other assets. This ensures that changes to the codebase\n", + "\n", + "are tracked and can be easily undone if needed.\n", + "\n", + "**Implement continuous integration and delivery:**\n", + "\n", + "Continuous integration (CI) involves automatically building\n", + "\n", + "and testing after code changes are checked into version\n", + "\n", + "control. With CI, new changes to the codebase do not\n", + "\n", + "break existing functionality. By automating the build\n", + "\n", + "process, CI helps to reduce errors and save time. CD, on\n", + "\n", + "the other hand, involves automatically delivering builds to\n", + "\n", + "stakeholders, such as QA testers, beta testers, or end-\n", + "\n", + "users, after they have passed the automated tests. 
By\n", + "\n", + "combining CI and CD, a video game project can ensure\n", + "\n", + "that builds are generated and delivered quickly and\n", + "\n", + "efficiently, without the need for manual intervention.\n", + "\n", + "**Build for scalability:** As your game project grows, you\n", + "\n", + "will need a build pipeline solution that is scalable and can\n", + "\n", + "handle the needs of your game team.\n", + "\n", + "**Integration with other tools:** Integrate the build pipeline\n", + "\n", + "solution with other tools and systems, such as issue\n", + "\n", + "tracking, testing, and deployment tools, to ensure a\n", + "\n", + "smooth and efficient workflow.\n", + "\n", + "\n", + "**Reference Architecture**\n", + "\n", + "**Databricks**\n", + "**SQL**\n", + "\n", + "**Power BI**\n", + "\n", + "\n", + "|GAME INFRASTRUCTURE|Col2|\n", + "|---|---|\n", + "|||\n", + "|||\n", + "\n", + "\n", + "**AWS**\n", + "\n", + "**Quicksight**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Getting Started with Crash Analytics\n", + "\n", + "Building a pipeline to build a holistic view to support crash\n", + "\n", + "analytics means data coming from multiple different\n", + "\n", + "sources, different velocities and joining the data together.\n", + "\n", + "The amount of data sources depends on your game projects\n", + "\n", + "publishing platforms, some may come from console based\n", + "\n", + "providers such as sony, xbox, and nintendo or pc platforms\n", + "\n", + "like Steam, Epic Games Marketplace, GoG and many others.\n", + "\n", + "**High level steps**\n", + "\n", + "- Determine what platforms your game is running on and\n", + "\n", + "how to interface to collect data.\n", + "\n", + "- **Collect crash data:** Implement crash reporting tools in\n", + "\n", + "your game to collect data on crashes. The source data\n", + "\n", + "may be delivered in varying formats such as JSON or CSV.\n", + "\n", + "- **Load crash data into Databricks:** Use Databricks’ data\n", + "\n", + "ingestion tools to load the crash data into your workspace.\n", + "\n", + "This could involve using Databricks’ built-in data source\n", + "\n", + "connectors, or programmatically ingest files to load the data.\n", + "\n", + "\n", + "\n", + "- **Transform and clean the crash data:** Use Databricks’\n", + "\n", + "data processing and transformation tools to clean and\n", + "\n", + "prepare the crash data for analysis. This could involve\n", + "\n", + "using Databricks’ capabilities like DLT, or using SQL to\n", + "\n", + "perform custom transformations.\n", + "\n", + "- **Visualize crash data:** Use Databricks’ dashboarding tools\n", + "\n", + "to create visualizations that help you understand the\n", + "\n", + "patterns and trends in your crash data. This could involve\n", + "\n", + "using Databricks’ built-in visualization tools, or integrating\n", + "\n", + "with external visualization tools like Tableau or PowerBI.\n", + "\n", + "- **Analyze crash data:** Use Databricks’ machine learning\n", + "\n", + "and statistical analysis tools to identify the root causes\n", + "\n", + "of crashes. 
This could involve using Spark MLlib or many of the popular tools to build machine learning models, or using SQL to perform custom analyses.

- **Monitor and refine your pipeline:** Regularly review your pipeline to ensure that it remains relevant and useful. Refine your pipeline as necessary to reflect changes in your crash data or your goals.

If you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.

-----

**Tips / Best Practices**

- **Automated collection and aggregation of crash reports:** Collecting crash reports should be an automated process that is integrated into the output of the build pipeline for the game. The crash reports should be automatically aggregated and made available for analysis in near real-time.

- **Clear reporting and prioritization of issues:** The solution should provide clear reporting on the most common issues and allow game developers to prioritize fixing the most impactful problems first.

- **Integration with other analytics tools:** The crash analytics solution should integrate with other analytics tools, such as player behavior tracking, to provide a more complete picture of how crashes are impacting the player experience.

- **Flexibility and scalability:** As the game grows, the solution should be able to scale to accommodate an increasing number of players and crashes.

- **Data privacy and security:** It’s important to consider data privacy and security when implementing a crash analytics solution. This may involve implementing measures to anonymize crash reports, or taking steps to ensure that sensitive information is not included in the reports.

Additionally, there are some key gotchas to look out for when implementing a crash analytics solution.

- **Data privacy and security:** Ensure that crash reports do not contain sensitive information that could be used to identify individual players.

- **Scalability:** As the number of players and crashes increases, it may become difficult to manage and analyze the growing volume of data.

- **Integration with other tools:** Be aware when integrating crash analytics with other tools and systems, especially if the tools use different data formats or data structures.

- **Prioritization of issues:** Determine which crashes are the most impactful and prioritize fixes accordingly. This can be a complex process, especially if there are a large number of different crash types and causes.

**Reference Architecture**

Databricks SQL, Power BI, AWS QuickSight (diagram).
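To make the “collect and load crash data” steps above concrete, here is a minimal PySpark sketch of the ingestion and prioritization pattern, assuming the crash reporting tool drops JSON files into a Unity Catalog volume. The volume path, table name and column names (`error_signature`, `build_version`, `player_id`) are illustrative assumptions for this example, not names defined by the guide, and the snippet assumes it runs in a Databricks notebook where `spark` and `display` are available.

```python
from pyspark.sql import functions as F

# Hypothetical locations -- substitute the paths/tables your project actually uses.
CRASH_LANDING_PATH = "/Volumes/main/game_telemetry/crash_reports"
CRASH_TABLE = "main.game_telemetry.crash_reports_bronze"

# 1) Incrementally ingest raw JSON crash reports into a Delta table with Auto Loader.
query = (
    spark.readStream.format("cloudFiles")                      # Databricks Auto Loader
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", f"{CRASH_LANDING_PATH}/_schema")
    .load(CRASH_LANDING_PATH)
    .withColumn("ingested_at", F.current_timestamp())          # track ingestion time
    .writeStream.format("delta")
    .option("checkpointLocation", f"{CRASH_LANDING_PATH}/_checkpoint")
    .trigger(availableNow=True)                                # drain the backlog, then stop
    .toTable(CRASH_TABLE)
)
query.awaitTermination()

# 2) Rank crash signatures by player impact to support prioritization of fixes.
crash_priority = (
    spark.read.table(CRASH_TABLE)
    .groupBy("error_signature", "build_version")               # assumed columns in the reports
    .agg(
        F.count("*").alias("crash_count"),
        F.countDistinct("player_id").alias("players_affected"),
    )
    .orderBy(F.desc("players_affected"), F.desc("crash_count"))
)
display(crash_priority)                                         # Databricks notebook display
```

The same Auto Loader pattern can be reused for the game server and client logs that feed the anomaly detection model described earlier in this section; only the source path, schema and downstream aggregations change.

-----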
### Executive Guide\n", + "\n", + "# Transform and Scale Your Organization With Data and AI\n", + "\n", + "#### A guide for CIOs, CDOs, and\n", + " data and AI executives\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "**A U T H O R :**\n", + "\n", + "**Chris D’Agostino**\n", + "\n", + "Global Field CTO\n", + "\n", + "Databricks\n", + "\n", + "**E D I T O R S :**\n", + "\n", + "Manveer Sahota\n", + "\n", + "\n", + "**C H A P T E R 1 :** \u0007 **Executive Summary** 3\n", + "\n", + "**C H A P T E R 2 :** \u0007 **Define the Strategy** 6\n", + "\n", + "**1.** Establish the goals and business value 8\n", + "\n", + "**2.** Identify and prioritize use cases 19\n", + "\n", + "**3.** Build successful data teams 22\n", + "\n", + "**4.** Deploy a modern data stack 28\n", + "\n", + "**5.** Improve data governance and compliance 36\n", + "\n", + "**6.** Democratize access to quality data 41\n", + "\n", + "**7.** Dramatically increase productivity of your workforce 47\n", + "\n", + "**8.** Make informed build vs. buy decisions 52\n", + "\n", + "**9.** Allocate, monitor and optimize costs 55\n", + "\n", + "**10.** Move to production and scale adoption 58\n", + "\n", + "\n", + "Jessica Barbieri\n", + "\n", + "\n", + "Toby Balfre\n", + "\n", + "\n", + "**C H A P T E R 3 :** **Conclusion** \u0007 63\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "## Executive Summary\n", + "\n", + "Data and AI leaders are faced with the challenge\n", + "\n", + "of future-proofing their architecture and platform\n", + "\n", + "investments. The Lakehouse implementation from\n", + "\n", + "Databricks combines the best features of EDWs\n", + "\n", + "and data lakes by enabling all their workloads using\n", + "\n", + "open source and open standards — avoiding the\n", + "\n", + "vendor lock-in, black box design and proprietary\n", + "\n", + "data formats of other cloud vendors.\n", + "\n", + "\n", + "It’s not surprising that many industry experts say data is the most valuable resource in the modern\n", + "\n", + "economy — some even go so far as to describe it as the “new oil.” But at Databricks, we think of data as\n", + "\n", + "water. Its core compound never changes, and it can be transformed to whatever use case is desired,\n", + "\n", + "with the ability to get it back to its original form. Furthermore, just as water is essential to life, data is now\n", + "\n", + "essential to survival, competitive differentiation and innovation for every business. Clearly, the impact and\n", + "\n", + "importance of data are growing exponentially in both our professional and personal lives, while artificial\n", + "\n", + "intelligence (AI) is being infused in more of our daily digital interactions. 
The explosion in data availability\n", + "\n", + "over the last decade and the forecast for growth at a compounded [annual growth rate (CAGR) of 23%](https://www.google.com/url?q=https://www.idc.com/getdoc.jsp?containerId%3DprUS47560321&sa=D&source=docs&ust=1651117260200496&usg=AOvVaw3jdZ_6YHlXGQlUMJK8ULux) over\n", + "\n", + "2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n", + "\n", + "(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n", + "\n", + "governance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n", + "\n", + "Every organization is working to improve business outcomes while effectively managing a variety of risks —\n", + "\n", + "including economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n", + "\n", + "Your organization’s data and the systems that process it play a critical role in not only enabling your financial\n", + "\n", + "goals but also in minimizing these seven key business risks.\n", + "\n", + "Businesses have realized that their legacy information technology (IT) platforms are not able to scale and\n", + "\n", + "meet the increasing demands for better data analytics. As a result, they are looking to transform how their\n", + "\n", + "organizations use and process data. Successful data transformation initiatives for data, analytics and AI\n", + "\n", + "involve not only the design of hardware and software systems but also the alignment of people, processes\n", + "\n", + "and platforms. These initiatives always require a major financial investment and, therefore, need to yield a\n", + "\n", + "significant return on investment (ROI) — one that starts in months, not years.\n", + "\n", + "To guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\n", + "\n", + "Despite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\n", + "\n", + "to deliver on their data strategy — including how to deploy a modern data architecture, leverage data\n", + "\n", + "efficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\n", + "\n", + "identify and execute on AI opportunities.\n", + "\n", + "\n", + "-----\n", + "\n", + "To successfully lead data and AI transformation initiatives, organizations need to develop and execute\n", + "\n", + "a comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\n", + "\n", + "full potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\n", + "\n", + "organizations have the option of moving away from closed, proprietary systems offered by a variety\n", + "\n", + "of cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\n", + "\n", + "industry standards.\n", + "\n", + "At Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\n", + "\n", + "we’ve hired industry experts and thought leaders to help organizations better understand the steps involved\n", + "\n", + "in successful digital transformation initiatives. 
We are the first vendor to propose the data lakehouse architecture, which decouples data storage from compute while providing the best price/performance metrics for all your data workloads — including data warehousing. We have captured the lessons learned and summarized them in this series of Executive Guides — which are designed to serve as blueprints for CIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation initiatives for data, analytics and AI using a _modern data stack_. Databricks is the first company to deliver a unified data platform that realizes the data lakehouse architecture and enables the data personas in your organization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as shown in Figure 1.

###### Lakehouse Platform

Data Warehousing | Data Engineering | Data Streaming | Data Science and ML

Unity Catalog: Fine-grained governance for data and AI

Delta Lake: Data reliability and performance

Cloud Data Lake: All structured and unstructured data

**Figure 1:**
The Databricks Lakehouse Platform

-----

**The lakehouse architecture benefits organizations in several ways:**

**1.** It leverages low-cost cloud object stores to store ALL enterprise data.

**2.** It provides the ability to run different data workloads efficiently and in a cost-effective manner.

**3.** It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.

Our intention is to present key considerations and equip you with the knowledge to ask informed questions, make the most critical decisions early in the process, and develop the comprehensive strategy that most organizations lack.

In addition, we have created an easy-to-follow Data and AI Maturity Model and provided a comprehensive professional services offering that organizations can leverage to measure their readiness, reskill their staff and track progress as they embark on their data transformation initiative.

-----

**CHAPTER 2:**
## Define the Strategy

The most critical step to enable data, analytics and AI at scale is to develop a comprehensive and executable strategy for how your organization will leverage people, processes and platforms to drive measurable business results against your corporate priorities. The strategy serves as a set of principles that every member of your organization can refer to when making decisions.
The strategy should cover the roles and\n", + "\n", + "responsibilities of teams within your organization for how you capture, store, curate and process data to run\n", + "\n", + "your business — including the internal and external resources (labor and budget) needed to be successful.\n", + "\n", + "\n", + "Establish the\n", + "goals and\n", + "business value\n", + "\n", + "\n", + "Build\n", + "successful\n", + "data teams\n", + "\n", + "\n", + "Ease data\n", + "governance and\n", + "compliance\n", + "\n", + "\n", + "Simplify\n", + "the user\n", + "experience\n", + "\n", + "\n", + "Allocate,\n", + "monitor and\n", + "optimize costs\n", + "\n", + "\n", + "Identify and\n", + "prioritize\n", + "use cases\n", + "\n", + "\n", + "Deploy a modern\n", + "data architecture\n", + "\n", + "\n", + "Democratize\n", + "access to\n", + "quality data\n", + "\n", + "\n", + "Make informed\n", + "build vs. buy\n", + "decisions\n", + "\n", + "\n", + "Move to\n", + "production and\n", + "drive adoption\n", + "\n", + "\n", + "**Figure 2:**\n", + "The 10 steps to a winning data and AI strategy\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Here are 10 key considerations\n", + "\n", + "**1.** \u0007Secure buy-in and alignment on the overall business goals, timeline and appetite for the initiative.\n", + "\n", + "**2.** \u0007Identify, evaluate and prioritize use cases that actually provide a significant ROI.\n", + "\n", + "**3.** \u0007Create high-performing teams and empower your business analyst, data scientist, machine learning\n", + "\n", + "and data engineering talent.\n", + "\n", + "**4.** \u0007Future-proof your technology investment with a modern data architecture.\n", + "\n", + "**5.** \u0007Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California\n", + "\n", + "Consumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n", + "\n", + "**6.** \u0007Implement needed policies, procedures and technology to guarantee data quality and enable secure\n", + "\n", + "data access and the sharing of all your data across the organization.\n", + "\n", + "**7.** \u0007Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n", + "\n", + "**8.** \u0007Make informed build vs. buy decisions and ensure you are focusing your limited resources on the most\n", + "\n", + "important problems.\n", + "\n", + "**9.** \u0007Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n", + "\n", + "**10.** \u0007Codify best practices for moving into production and how to measure progress, rate of adoption and\n", + "\n", + "user satisfaction.\n", + "\n", + "The strategy should clearly answer these 10 topics and more, and should be captured in a living document,\n", + "\n", + "owned and governed by the CDO and made available for everyone in the organization to review and provide\n", + "\n", + "feedback on. 
The strategy will evolve based on the changing market/political conditions, evolving business,\n", + "\n", + "the technology landscape or a combination of any of these — but it should serve as the North Star for\n", + "\n", + "how you will navigate the many decisions and trade-offs that you will need to make over the course of the\n", + "\n", + "transformation.\n", + "\n", + "\n", + "This guide takes a stepwise approach to\n", + "\n", + "addressing each of these 10 topics.\n", + "\n", + "\n", + "-----\n", + "\n", + "Studies have shown that data scientists spend 80%\n", + "\n", + "of their time collecting and compiling data sets\n", + "\n", + "\n", + "#### 1. Establish the goals and business value\n", + "\n", + "Most organizations on a data, analytics and AI journey establish a set of goals for the resulting investment.\n", + "\n", + "The goals generally fall into one of three categories:\n", + "\n", + "**1.** **Business outcomes**\n", + "\n", + "**2.** **People**\n", + "\n", + "**3.** **Technology**\n", + "\n", + "\n", + "and only 20% of their time developing insights and\n", + "\n", + "\n", + "In terms of business outcomes, organizations need to adapt more quickly to market opportunities and\n", + "\n", + "emerging risks, and their legacy-based information systems make that difficult to achieve. As a result,\n", + "\n", + "business leaders see the digital transformation as an opportunity to build a new technology foundation\n", + "\n", + "from which to run their business and increase business value. One that is more agile, scalable, secure and\n", + "\n", + "easier to use — making the organization better positioned to adapt, innovate and thrive in the modern and\n", + "\n", + "dynamic economy.\n", + "\n", + "For organizations today, people are one of their most valuable assets — you cannot succeed in data,\n", + "\n", + "analytics and AI without them. The battle for top talent is as fierce as ever, and the way that people work\n", + "\n", + "impacts your ability to hire and retain the skills you need to succeed. It is important to make sure that\n", + "\n", + "employees work in a frictionless data environment, to the extent possible, so they feel productive each day\n", + "\n", + "and can do their best work.\n", + "\n", + "Finally, from a technology perspective, organizations have grown tired of the high costs associated with\n", + "\n", + "complex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The\n", + "\n", + "industry trend is to move away from large capital expenditures (capex) to pay for network and server\n", + "\n", + "capacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex)\n", + "\n", + "approach. Your data analytics environment should support this trend as well — using open standards, low-\n", + "\n", + "cost storage and on-demand compute that efficiently spins up to perform data workloads and spins down\n", + "\n", + "once they are complete.\n", + "\n", + "\n", + "algorithms. Organizations that are able to invert\n", + "\n", + "these numbers benefit in two ways — happier\n", + "\n", + "employees and improved time to market for use\n", + "\n", + "cases. These employers create more favorable\n", + "\n", + "working environments and lower the risk of burnout\n", + "\n", + "and the resulting regrettable attrition.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Executive buy-in and support**\n", + "\n", + "Large organizations are difficult to change — but it’s not impossible. 
In order to be successful, you need to have unwavering buy-in and support from the highest levels of management — including the CEO and board of directors. With this support, you have the leverage you need to develop the strategy, decide on an architecture and implement a solution that can truly change the way your business is run. Without it, you have a very expensive science project that has little hope of succeeding. Why? Because the majority of people in your organization are busy doing their day jobs. The added work to support the initiative must be offset by a clear articulation of the resulting benefits — not only for the business but for the personnel within it. The transformation should result in a positive change to how people do their jobs on a daily basis.

Transformation for data, analytics and AI needs to be a company-wide initiative that has the support from all the leaders. Even if the approach is to enable data and AI one business unit (BU) at a time, the plan needs to be something that is fully embraced in order to succeed. Ideally, the senior-most executives serve as vocal proponents.

-----

**Evolve to an AI-first company — not just a data-first company**

Data and AI transformations should truly transform the way organizations use data, not just evolve it. For decades, businesses have operated using traditional business processes and leveraged Structured Query Language (SQL) and business intelligence (BI) tools to query, manipulate and report on a subset of their data. There are five major challenges with this approach:

**1.** A true self-assessment of where your organization is on the AI maturity curve. Most organizations will use pockets of success with analytics and AI to move higher up the maturity curve, but in reality the ability to replicate and scale the results is nearly impossible.

#### Tech leaders are to the right of the Data Maturity Curve

Stages along the Data and AI Maturity axis, moving from hindsight (“What happened?”) to foresight (“What will happen?”, “How should we respond?”, “Automatically make the best decision”): Clean Data, Reports, Ad Hoc Queries, Data Exploration, Predictive Modeling, Prescriptive Analytics and Automated Decision-Making.

**Figure 3:**
The Data Maturity Curve

-----

**2.** Data volumes and types have outgrown even the most modern approaches to SQL-based data processing.

**3.** These large data volumes also make it nearly impossible for your workforce to continue to programmatically state, in a priority manner, how data insights can be achieved or how the business should react to changing data.

**4.** Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the number of people needed to respond to every piece of data flowing into your environment.
Machines\n", + "\n", + "scale, people do not.\n", + "\n", + "**5.** \u0007Advances in machine learning and AI have simplified the steps and reduced the expertise needed to\n", + "\n", + "gain game-changing insights. For these reasons, plus many others, the organizations that thrive in the\n", + "\n", + "21st century will do so based on their ability to leverage all the data at their disposal. Traditional ways\n", + "\n", + "of processing and managing data will not work. Using ML and AI will empower your workforce to\n", + "\n", + "leverage data to make better decisions for managing risk, helping your organization succeed in the\n", + "\n", + "modern economy.\n", + "\n", + "**Go “all in” on the cloud**\n", + "\n", + "The COVID-19 pandemic has caused rapid adoption of cloud-based solutions for collaboration and\n", + "\n", + "videoconferencing — and organizations are now using this time to reevaluate their use of on-premises and\n", + "\n", + "cloud-based services. The cloud vendors provide many benefits to organizations, including Infrastructure\n", + "\n", + "as a Service (IaaS), Platform as a Service (PaaS) and Software as a Service (SaaS) solutions. These benefits,\n", + "\n", + "especially when combined with the use of open source software (OSS), increase the speed at which\n", + "\n", + "organizations can use the latest technologies while also reducing their capex in these budget-conscious times.\n", + "\n", + "For AWS, Microsoft, Google and other cloud providers, the game is about data acquisition. The more\n", + "\n", + "corporate data that resides in a specific cloud, the more sticky the customer is to the vendor. At the same\n", + "\n", + "time, multicloud support is both a selling point and an aspirational goal for many organizations. Companies\n", + "\n", + "are well aware of vendor lock-in and want to abstract their applications so they can be moved across\n", + "\n", + "clouds if there is a compelling business reason.\n", + "\n", + "\n", + "-----\n", + "\n", + "Approaching your technology choices with a multicloud point of view gives the organization more sovereignty\n", + "\n", + "over the data — flexibility to run workloads anywhere, ease of integration when acquiring businesses that\n", + "\n", + "run on different cloud providers and simplified compliance with emerging regulations that may require\n", + "\n", + "companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n", + "\n", + "As a result, data portability and the ability to run workloads on different cloud providers are becoming\n", + "\n", + "increasingly important.\n", + "\n", + "**Modernize business applications**\n", + "\n", + "As organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n", + "\n", + "approach. The majority of on-premises applications are not built with the cloud in mind. They usually\n", + "\n", + "differ in the way that they handle security, resiliency, scalability and failover. Their application designs\n", + "\n", + "often store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n", + "\n", + "CCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n", + "\n", + "therefore, tightly coupled. 
In contrast, modern cloud applications are modular in design and use RESTful web\n", + "\n", + "services and APIs to easily provide access to an application’s functionality.\n", + "\n", + "Cloud-based architectures, commodity databases and software application development frameworks make\n", + "\n", + "it easier for developers to build scalable, secure end-to-end applications to run all your internal business\n", + "\n", + "processes. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n", + "\n", + "a backing database) has become straightforward with the latest tooling available to your application\n", + "\n", + "development teams.\n", + "\n", + "As a first step, organizations should inventory their business-critical applications, prioritize them based\n", + "\n", + "on business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n", + "\n", + "applications that generate and store a significant amount of the data consumed within an organization. Using\n", + "\n", + "a consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n", + "\n", + "\n", + "“We are on an amazing journey. Being among\n", + "\n", + "the fastest-growing enterprise software cloud\n", + "\n", + "companies on record was unimaginable when\n", + "\n", + "we started Databricks. To get here, we’ve stayed\n", + "\n", + "focused on the three big bets we made when\n", + "\n", + "founding the company — cloud, open source\n", + "\n", + "and machine learning. Fast-forward seven years,\n", + "\n", + "thousands of data teams around the globe are\n", + "\n", + "working better together on Databricks.”\n", + "\n", + "**Ali Ghodsi**\n", + "\n", + "Co-founder and CEO\n", + "\n", + "Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "The next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n", + "\n", + "A good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n", + "\n", + "other applications within your environment to store copies of the data — unless absolutely necessary for\n", + "\n", + "performance reasons. In this case, it is best to “cache” the data for use in the non-SOR application and sync\n", + "\n", + "the data from the actual SOR.\n", + "\n", + "Data from these SORs should be made available in three ways:\n", + "\n", + "**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n", + "\n", + "**2.** \u0007Ensure that copies of the data land in the data lake.\n", + "\n", + "**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n", + "\n", + "consumption by downstream applications.\n", + "\n", + "**Move toward real-time decisioning**\n", + "\n", + "The value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n", + "\n", + "and the second is to view data as an individual event. This so-called “time value of data” is an important\n", + "\n", + "concept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n", + "\n", + "the same data platform.\n", + "\n", + "On the one hand, data in aggregate becomes more valuable over time — as you collect more of it. 
The\n", + "\n", + "aggregate data provides the ability to look back in time and see the complete history of an aspect of your\n", + "\n", + "business and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\n", + "\n", + "newly created or arriving data event gives you the opportunity to make decisions — in the moment — that\n", + "\n", + "can positively affect your ability to reduce risk, better service your customers or lower your operating costs.\n", + "\n", + "The goal is to act immediately — with reliability and accuracy — upon the arrival of a new streaming event.\n", + "\n", + "This “time value of data” is shown in Figure 4 on the next page.\n", + "\n", + "\n", + "-----\n", + "\n", + "For example, real-time processing of clickstream data from your customer-facing mobile application can\n", + "\n", + "indicate when the customer is having trouble and may need to call into your call center. This insight gives\n", + "\n", + "you the opportunity to interject with a digital assistant or to pass on “just-in-time” information to your call\n", + "\n", + "center agents — improving the customer experience and lowering customer churn.\n", + "\n", + "Data, analytics and AI rely on the ”time value of data” — a powerful concept that allows you to train your\n", + "\n", + "machine learning models using historical data and provides you with the ability to make real-time decisions\n", + "\n", + "as new events take place. For example, credit card fraud models can use deep historical data about a given\n", + "\n", + "customer’s buying patterns (location, day of week, time of day, retailer, average purchase amount, etc.) to\n", + "\n", + "build rich models that are then executed for each new credit card transaction. This real-time execution,\n", + "\n", + "combined with historical data, enables the best possible customer experience.\n", + "\n", + "#### Time Value of Data\n", + "\n", + "\n", + "The Databricks Lakehouse Platform allows you to\n", + "\n", + "combine real-time streaming and batch processing\n", + "\n", + "using one architecture and a consistent set of\n", + "\n", + "programming APIs.\n", + "\n", + "**Figure 4:**\n", + "Time Value of Data\n", + "\n", + "\n", + "Value of an individual data\n", + "\n", + "record is very high once created\n", + "but decreases over time\n", + "\n", + "\n", + "Value of data records\n", + "\n", + "in aggregate increases\n", + "over time\n", + "\n", + "\n", + "Real-Time Decisioning Real-Time Analysis Trend Analysis Model Training\n", + "\n", + "\n", + "-----\n", + "\n", + "**Land** **_all_** **data in a data lake**\n", + "\n", + "In order to effectively drive data, analytics and AI adoption, relevant data needs to be made available to the\n", + "\n", + "user as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to\n", + "\n", + "access. Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data\n", + "\n", + "warehouse, with predefined schemas that are designed to allow you to ask very specific questions about\n", + "\n", + "that data only. What do you do when you want to ask a different question? 
To further complicate matters,\n", + "\n", + "how do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores?\n", + "\n", + "How do you find new insights as quickly as possible?\n", + "\n", + "The overall goal is to gain insights from the data as quickly as possible — which can happen at any step\n", + "\n", + "along the data pipeline — including raw, refined and curated data states.\n", + "\n", + "This phenomenon has led to the concept known as the four Vs of data — specifically, _volume_ , _velocity_ ,\n", + "\n", + "_variety_ and _veracity_ . Data-, analytics- and AI-driven organizations need to be able to store and process\n", + "\n", + "all their data, regardless of size, shape or speed. In addition, data lineage and provenance are critical to\n", + "\n", + "knowing whether or not you can trust the data.\n", + "\n", + "**Change the way people work**\n", + "\n", + "When done correctly, organizations get value from data, analytics and AI in three ways — infrastructure\n", + "\n", + "savings, productivity gains and business-impacting use cases. Productivity gains require a true focus on\n", + "\n", + "minimizing the number of steps needed to produce results with data. This can be accomplished by:\n", + "\n", + "**1.** \u0007 Making data more accessible and ensuring it can be trusted\n", + "\n", + "**2.** Minimizing the number of tools/systems needed to perform work\n", + "\n", + "**3.** Creating a flywheel effect by leveraging the work of others\n", + "\n", + "\n", + "“We believe that the data lakehouse architecture\n", + "\n", + "presents an opportunity comparable to the one\n", + "\n", + "we saw during early years of the data warehouse\n", + "\n", + "market. The unique ability of the lakehouse to\n", + "\n", + "manage data in an open environment, blend all\n", + "\n", + "varieties of data from all parts of the enterprise and\n", + "\n", + "combine the data science focus of the data lake\n", + "\n", + "with the end-user analytics of the data warehouse\n", + "\n", + "will unlock incredible value for organizations.”\n", + "\n", + "**Bill Inmon**\n", + "\n", + "The father of the data warehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "In large organizations, it’s understandable why application and data silos are prevalent. Each business unit\n", + "\n", + "is laser-focused on achieving their goals, and the use of information technology is viewed as an enabler.\n", + "\n", + "Systems and applications get built over time to satisfy specific needs within a line of business. As a result,\n", + "\n", + "it’s not surprising to learn that employees must jump through a large number of hoops to get access to the\n", + "\n", + "data they need to do their jobs. It should be as simple as getting your identity and PC.\n", + "\n", + "With Databricks, users can collaborate and perform\n", + "\n", + "\n", + "A primary goal of your data and AI transformation should be to focus on improving the user experience —\n", + "\n", + "in other words, improving how your entire organization interacts with data. Data must be easily discoverable\n", + "\n", + "with default access to users based on their role(s) — with a simple process to compliantly request access to\n", + "\n", + "data sets that are currently restricted. 
The tooling you make available should satisfy the principal needs of\n", + "\n", + "the various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n", + "\n", + "Finally, the results of the work performed by a user or system upstream should be made available to users\n", + "\n", + "and systems downstream as “data assets” that can drive business value.\n", + "\n", + "Organizations that maximize the productivity of their workforce and enable employees to do their best work\n", + "\n", + "under optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n", + "\n", + "**Minimize time in the “seam”**\n", + "\n", + "As you begin your data transformation, it is important to know that the longer it takes, the more risk and\n", + "\n", + "cost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n", + "\n", + "to a modern data stack will require you to operate in two environments simultaneously, the old and the new,\n", + "\n", + "for some period of time. This will have a series of momentary adverse effects on your business:\n", + "\n", + "\u0007It will increase your operational costs substantially, as you will run two sets of infrastructure\n", + "\n", + "\u0007It will increase your data governance risk, since you will have multiple copies of your data sitting in two\n", + "\n", + "very different ecosystems\n", + "\n", + "\n", + "their work more efficiently, regardless of their\n", + "\n", + "persona or role. The user experience is designed\n", + "\n", + "to support the workloads of data analysts, SQL\n", + "\n", + "developers, data engineers, data scientists and\n", + "\n", + "machine learning professionals.\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007It increases the cyberattack footprint and vectors, as the platforms will likely have very different security\n", + "\n", + "models and cyber defenses\n", + "\n", + "\u0007It will cause strain on your IT workforce due to the challenges of running multiple environments\n", + "\n", + "\u0007It will require precise communications to ensure that your business partners know which environment to\n", + "\n", + "use and for what data workloads\n", + "\n", + "To mitigate some of the strain on the IT workforce, some organizations hire staff augmentation firms to\n", + "\n", + "“keep the lights on” for the legacy systems while the new systems are being implemented and rolled out.\n", + "\n", + "It’s important to remember this is a critical but short-lived experience for business continuity.\n", + "\n", + "**Shut down legacy platforms**\n", + "\n", + "In keeping with the goal of minimizing time in the seam, the project plan and timeline must include the\n", + "\n", + "steps and sequencing for shutting down legacy platforms. For example, many companies migrate their on-\n", + "\n", + "premises Apache Hadoop data lake to a cloud-based object store. 
The approach for shutting down the on-\n", + "\n", + "premises Hadoop system is generally as follows:\n", + "\n", + "**1.** \u0007Identify the stakeholders (business and IT) who own the jobs that run in the Hadoop environment.\n", + "\n", + "**2.** \u0007Declare that no changes can be made to the Hadoop environment — with the exception of emergency\n", + "\n", + "fixes or absolutely critical new business use cases.\n", + "\n", + "**3.** \u0007Inventory the data flow paths that feed data into the Hadoop environment.\n", + "\n", + "**4.** \u0007Identify the source systems that feed the data.\n", + "\n", + "**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n", + "\n", + "**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n", + "\n", + "**7.** \u0007Determine the downstream consumers of the output from the jobs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n", + "\n", + "**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n", + "\n", + "architecture.\n", + "\n", + "**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n", + "\n", + "working smoothly.\n", + "\n", + "**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. Look for smoke.\n", + "\n", + "**12.** \u0007Rinse and repeat — until all jobs are migrated.\n", + "\n", + "**13.** \u0007Shut down the Hadoop cluster.\n", + "\n", + "A similar model can also be applied to legacy on-premises enterprise data warehouses.\n", + "\n", + "You can follow the same process for other legacy systems in your environment. Some of these systems\n", + "\n", + "may be more complex and require the participation of more stakeholders to identify the fastest way to\n", + "\n", + "rationalize the data and processes. It is important, however, to make sure that the organization has the\n", + "\n", + "fortitude to hold the line when there is pressure to make changes to the legacy environments or extend\n", + "\n", + "their lifespan. Setting firm dates for when these legacy systems will be retired will serve as a forcing function\n", + "\n", + "for teams when they onboard to the new modern data architecture. Having the executive buy-in from page\n", + "\n", + "9 plays a crucial role in seeing the shutdown of legacy platforms through.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 2. Identify and prioritize use cases\n", + "\n", + "An important next step in enabling data, analytics and AI to transform your business is to identify use cases\n", + "\n", + "that drive business value — while prioritizing the ones that are achievable under the current conditions\n", + "\n", + "(people, processes, data and infrastructure). 
There are typically hundreds of use cases within an organization\n", + "\n", + "that could benefit from better data and AI — but not all use cases are of equal importance or feasibility.\n", + "\n", + "Leaders require a systematic approach for identifying, evaluating, prioritizing and implementing use cases.\n", + "\n", + "**Establish the list of potential use cases**\n", + "\n", + "The first step is to ideate by bringing together various stakeholders from across the organization and\n", + "\n", + "understand the overall business drivers — especially those that are monitored by the CEO and board of\n", + "\n", + "directors. The second step is to identify use case opportunities in collaboration with business stakeholders,\n", + "\n", + "and understand the business processes and the data required to implement the use case. After steps one and\n", + "\n", + "two, the next step is to prioritize these cases by calculating the expected ROI. To avoid this becoming a pet\n", + "\n", + "project within the data/IT teams, it’s important to have a line of business champion at the executive level.\n", + "\n", + "There needs to be a balance between use cases that are complex and ones that are considered low-\n", + "\n", + "hanging fruit. For example, determining if a web visitor is an existing or net new customer requires a fairly\n", + "\n", + "straightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n", + "\n", + "given individual or household. However, developing a sophisticated credit card fraud model that takes into\n", + "\n", + "account geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n", + "\n", + "to perform the analytics.\n", + "\n", + "In terms of performance, thought should be given to the speed at which the use case must execute. In\n", + "\n", + "general, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n", + "\n", + "cases into three categories:\n", + "\n", + "**1.** Sub-second response\n", + "\n", + "**2.** Multi-second response\n", + "\n", + "**3.** Multi-minute response\n", + "\n", + "\n", + "-----\n", + "\n", + "Being pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n", + "\n", + "engineering the design and infrastructure.\n", + "\n", + "**Thinking in terms of “data assets”**\n", + "\n", + "Machine learning algorithms require data — data that is readily available, of high quality and relevant — to\n", + "\n", + "perform the experiments, train the models, and then execute the model when it is deployed to production.\n", + "\n", + "The quality and veracity of the data used to perform these machine learning steps are key to deploying\n", + "\n", + "models into production that produce a tangible ROI.\n", + "\n", + "It is critical to understand what steps are needed in order to make the data available for a given use case.\n", + "\n", + "One point to consider is to prioritize use cases that make use of similar or adjacent data. If your engineering\n", + "\n", + "teams need to perform work to make data available for one use case, then look for opportunities to have the\n", + "\n", + "engineers do incremental work in order to surface data for adjacent use cases.\n", + "\n", + "Mature data and AI companies embrace the concept of “data assets” or “data products” to indicate\n", + "\n", + "the importance of adopting a design strategy and data asset roadmap for the organization. 
Taking this\n", + "\n", + "approach helps stakeholders avoid fit-for-purpose data sets that drive only a single use case — and raise\n", + "\n", + "the level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n", + "\n", + "roadmap helps data source owners understand the priority and complexity of the data assets that need to\n", + "\n", + "be created. Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n", + "\n", + "influences the design of business applications and other systems within the organization.\n", + "\n", + "**Determine the highest impact/priority**\n", + "\n", + "As shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n", + "\n", + "account three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n", + "\n", + "whether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n", + "\n", + "reduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n", + "\n", + "data science talent readily available, to implement the use case. The ROI score indicates whether or not the\n", + "\n", + "organization can easily measure the impact to the P/L.\n", + "\n", + "\n", + "-----\n", + "\n", + "|= Scored by business stakeholders = Scored by technology stakeholders|Col2|SCORING GUIDELINES (RELATIVE SCORING)|Col4|Col5|\n", + "|---|---|---|---|---|\n", + "|||1 = LOW SCORE, DO LATER|5 = AVERAGE, NICE TO HAVE|10 = HIGH, MUST HAVE|\n", + "|Strategic Importance Score How important is it to business success?|Business Alignment|Not required for any corporate goals|Not required for immediate corporate goals|Required for immediate corporate goals|\n", + "||Business Driver|Does not drive growth/profitability (P&L) or competitiveness|Could drive some growth/profitability (P&L)|Significantly drives growth/profitability (P&L) and competitiveness|\n", + "||IT Foundation|No BI/IT dependencies|BI/IT best practice|BI/IT foundational element|\n", + "|Feasibility Score What is the current data and AI readiness?|Data Access and Trust Adjusting Based on Availability|Low awareness of available data (internal and external) or the problems it can solve|Some ingestion and exploration of large-scale data is possible|Large-scale data is available for exploration in the cloud|\n", + "||Delivery (Data Engineers, Data Scientists, Data Analysts)|Limited in-house resources|Hiring plan for data science and engineering resources, few available in-house|Scaled data science, engineering, cloud and deployment organization|\n", + "||Architecture|Current thinking on architecture resembles on-prem traditional data warehousing solution with batch processes rather than a data lakehouse approach|Architecture has been built and tested, some use cases are underway with multiple data sources now available in the cloud|The platform is utilized at scale across the business and is able to evolve to meet the demands of new business lines and services driven by data|\n", + "|ROI Score How tangible and large is the ROI?|ROI Potential|Mostly productivity gains, “soft intangible benefits”|Some P&L impact, not easily tangible|Significant P&L impact, “hard measured benefits”|\n", + "\n", + "\n", + "**Figure 5:**\n", + "Methodology for scoring use cases\n", + "**Ensure business and technology leadership alignment**\n", + "\n", + "Prioritizing use cases requires striking a 
balance between offensive- and defensive-oriented use cases.\n", + "\n", + "It is important for executives to evaluate use cases in terms of opportunity growth (offensive) and risk\n", + "\n", + "reduction (defensive). For example, data governance and compliance use cases should take priority\n", + "\n", + "over offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n", + "\n", + "acquisition of a new customer.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Professional Services team can\n", + "\n", + "help customers identify revenue-generating and\n", + "\n", + "cost-saving opportunities for data and AI use cases\n", + "\n", + "that provide a significant ROI when adopting the\n", + "\n", + "\n", + "#### 3. Build successful data teams\n", + "\n", + "In order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n", + "\n", + "performing teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n", + "\n", + "training and leadership. Digital transformations require executive-level support and are likely to fail without\n", + "\n", + "it — especially in large organizations.\n", + "\n", + "However, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n", + "\n", + "an enterprise level. In other words, they must also evolve their company culture into one that embraces data,\n", + "\n", + "data literacy, collaboration, experimentation and agile principles. We define these companies as “data native.”\n", + "\n", + "\n", + "lakehouse architecture.\n", + "\n", + "**Chief information officers and chief data officers — two sides of the data coin**\n", + "\n", + "Data native companies generally have a single, accountable executive who is responsible for areas such\n", + "\n", + "as data science, business analytics, data strategy, data governance and data management. The data\n", + "\n", + "management aspects include registering data sets in a data catalog, tracing data lineage as data sets flow\n", + "\n", + "through the environment, performing data quality checks and scanning for sensitive data in the clear.\n", + "\n", + "Many organizations are rapidly adding the chief data officer (CDO) role to their executive ranks in order\n", + "\n", + "to oversee and manage these responsibilities. The CDO works closely with CIOs and other business\n", + "\n", + "stakeholders to establish the overall project plan, design and implementation — and to align project\n", + "\n", + "management, product management, business analysis, data engineering, data scientist and machine\n", + "\n", + "learning talent.\n", + "\n", + "The CDO and CIO will need to build a broad coalition of support from stakeholders who are incentivized to\n", + "\n", + "make the transformation a success and help drive organization-wide adoption. To do this, the stakeholders\n", + "\n", + "must understand the benefits of — and their role and responsibilities in — supporting the initiative.\n", + "\n", + "\n", + "-----\n", + "\n", + "There are two organizational constructs that are found in most successful data native companies. The first is\n", + "\n", + "the creation of an _AI/ML center of excellence_ (COE) that is designed to establish in-house expertise around\n", + "\n", + "ML and AI, and which is then used to educate the rest of the organization on best practices. 
The second is\n", + "\n", + "the formation of a _data and AI transformation steering committee_ that will oversee and guide decisions and\n", + "\n", + "priorities for the transformative data, analytics and AI initiatives, plus help remove obstacles.\n", + "\n", + "Furthermore, CDOs need to bring their CIOs along early in the journey.\n", + "\n", + "**Creating an AI/ML COE**\n", + "\n", + "Data science is a fast-evolving discipline with an ever-growing set of frameworks and algorithms to enable\n", + "\n", + "everything from statistical analysis to supervised learning to deep learning using neural networks. While it is\n", + "\n", + "difficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n", + "\n", + "document, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n", + "\n", + "However, the general distinction is that data science is used to produce insights, machine learning is used to\n", + "\n", + "produce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n", + "\n", + "is expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n", + "\n", + "various data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n", + "\n", + "set of questions.\n", + "\n", + "Organizations wanting to build a data science competency should consider hiring talent into a centralized\n", + "\n", + "organization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n", + "\n", + "data science. The COE works with the rest of the organization to educate and promote the appropriate use\n", + "\n", + "of data science for various use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "A common approach is to have the COE report into the CDO, but still have data scientists dotted line into\n", + "\n", + "the business units or department. Using this approach, you achieve two goals:\n", + "\n", + "\u0007The data scientists are closer to the business stakeholders, have a better understanding of the data\n", + "\n", + "within a business unit and can help identify use cases that drive value\n", + "\n", + "\u0007Having the data scientists reporting into the CDO provides a structure that encourages collaboration\n", + "\n", + "and consistency in how work is performed among the cohort and brings that to the entire organization\n", + "\n", + "**Data and AI transformation steering committee**\n", + "\n", + "The purpose of the steering committee is to provide governance and guidance to the data transformation\n", + "\n", + "initiative. The CDO and CIO should co-chair the committee along with one business executive who can be\n", + "\n", + "a vocal advocate and help drive adoption. The level of executive engagement is critical to success of the\n", + "\n", + "initiative.\n", + "\n", + "The steering committee should meet regularly with leaders from across the organization to hear status\n", + "\n", + "reports and resolve any conflicts and remove obstacles, if possible. 
The leaders should represent a broad\n", + "\n", + "group of stakeholders, including:\n", + "\n", + "\u0007\n", + "**Program/project management:** To report the status of progress for deploying the new data\n", + "\n", + "ecosystem and driving adoption through use cases\n", + "\n", + "\u0007\n", + "**Business partners:** To provide insight and feedback on how easy or difficult it is to drive adoption\n", + "\n", + "of the platform\n", + "\n", + "\u0007\n", + "**Engineering:** To report the status of the implementation and what technology trade-offs need\n", + "\n", + "to be made\n", + "\n", + "\u0007\n", + "**Data science:** To report on the progress made by the COE on educating the organization about\n", + "\n", + "use cases for ML and to report the status of various implementations\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007\n", + "**InfoSec:** To review the overall security, including network, storage, application and data\n", + "\n", + "encryption and tokenization\n", + "\n", + "\u0007\n", + "**Architecture:** To oversee that the implementation adheres to architectural standards\n", + "\n", + "and guardrails\n", + "\n", + "\u0007\n", + "**Risk, compliance and legal:** To oversee the approach to data governance\n", + "\n", + "and ethics in ML\n", + "\n", + "\u0007\n", + "**User experience:** To serve as the voice of the end users who will perform their jobs using\n", + "\n", + "the new data ecosystem\n", + "\n", + "\u0007\n", + "**Communication:** To provide up-to-date communications to the organization about next\n", + "\n", + "steps and how to drive adoption\n", + "\n", + "**Partnering with architecture and InfoSec**\n", + "\n", + "Early on, the CDO and CIO should engage the engineering and architecture community within the\n", + "\n", + "organization to ensure that everyone understands the technical implications of the overall strategy. This\n", + "\n", + "minimizes the chances that the engineering teams will build separate and competing data platforms. In\n", + "\n", + "regulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n", + "\n", + "The EA is responsible for validating that the overall technology design and data management features\n", + "\n", + "support the performance and regulatory compliance requirements — specifically, whether the proposed\n", + "\n", + "design can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n", + "\n", + "variety and veracity (four Vs) of the data environment.\n", + "\n", + "\n", + "It is important to fully understand which\n", + "\n", + "environments and accounts your data is stored\n", + "\n", + "in. The goal is to minimize the number of copies of\n", + "\n", + "your data and to keep the data within your cloud\n", + "\n", + "account — and not the vendor’s.\n", + "\n", + "Make sure the architecture and security model for\n", + "\n", + "protecting data is well understood.\n", + "\n", + "\n", + "-----\n", + "\n", + "From an InfoSec perspective, the CDO must work to ensure that the proper controls and security are\n", + "\n", + "applied to the new data ecosystem and that the authentication, authorization and access control methods\n", + "\n", + "meet all the data governance requirements. An industry best practice is to enable self-service registration\n", + "\n", + "of data sets, by the data owner, and support the assignment of security groups or roles to help automate\n", + "\n", + "the access control process. 
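\n", + "\n", + "As a minimal illustration of assigning access through a security group (assuming a Unity Catalog-style SQL interface; the catalog, table and group names here are hypothetical):\n", + "\n", + "```python\n", + "# Grant a security group read access to a registered data set and review the grants.\n", + "# Assumes an active SparkSession named spark on the data platform.\n", + "spark.sql('GRANT SELECT ON TABLE retail.sales.transactions TO `emea_analysts`')\n", + "spark.sql('SHOW GRANTS ON TABLE retail.sales.transactions').show(truncate=False)\n", + "```\n", + "\n", + "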
This allows data sets to be accessible only to the personnel that belong to a\n", + "\n", + "given group. The group membership could be based primarily on job function or role within the organization.\n", + "\n", + "This approach provides fast onboarding of new employees, but caution should be taken not to proliferate\n", + "\n", + "too many access control groups — in other words, do not get too fine grained with group permissions, as\n", + "\n", + "they will become increasingly difficult to manage. A better strategy is to be more coarse-grained and use\n", + "\n", + "row- and column-level security sparingly.\n", + "\n", + "**Centralized vs. federated labor strategy**\n", + "\n", + "In most organizations today, managers work in silos, making decisions with the best intentions but focused\n", + "\n", + "on their own functional areas. The primary risk to the status quo is that there will be multiple competing and\n", + "\n", + "conflicting approaches to creating enterprise data and AI platforms. This duplication of effort will waste time\n", + "\n", + "and money and potentially erode the confidence and motivation of the various teams. While it certainly is\n", + "\n", + "beneficial to compare and contrast different approaches to implementing an architecture, the approaches\n", + "\n", + "should be strictly managed, with everyone designing for the same goals and requirements — as described in\n", + "\n", + "this strategy document and adhering to the architectural principles and best practices.\n", + "\n", + "Even still, the roles of the CDO and CIO together should deliver a data analytics and AI platform with the\n", + "\n", + "least amount of complexity as possible, and one that can easily scale across the organization. It is very\n", + "\n", + "challenging to merge disparate data platform efforts into a single, cohesive design. It is best to get out\n", + "\n", + "in front of this wave of innovation and take input from the various teams to create a single, centralized\n", + "\n", + "platform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n", + "\n", + "modern data stack — while ensuring that there is no duplication of effort when implementing the platform\n", + "\n", + "components. Figure 6 shows one possible structure.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 6:**\n", + "Centralized teams with matrixed responsibilities\n", + "\n", + "\n", + "**Data Scientist**\n", + "Model and predict with data\n", + "\n", + "**Data Analyst**\n", + "Visualize and describe data\n", + "\n", + "\n", + "**Team A ($1.1M)** **Team B ($1.3M)** **Team C ($1.5M)**\n", + "\n", + "**Data Engineer**\n", + "Store, process, maintain data\n", + "\n", + "**Business Partners**\n", + "**and Domain Experts**\n", + "\n", + "\n", + "Centralize data scientists under CDO — embed in lines of business for day-to-day tasking\n", + "\n", + "Centralize data engineers under CIO/CTO — initially as an enterprise function\n", + "\n", + "**Hiring, training and upskilling your talent**\n", + "\n", + "While this guide does not cover recruiting strategies, it is important to note that data engineering and data\n", + "\n", + "science talent is very difficult to find in this competitive market. As a result, every organization should\n", + "\n", + "consider what training and upskilling opportunities exist for their current staff. A large number of online\n", + "\n", + "courses, at relatively low cost, teach the fundamentals of data science and AI. 
It will still be important to\n", + "\n", + "augment your existing staff with experienced data scientists and machine learning experts. You will then\n", + "\n", + "need to establish clear training paths, resources and timelines to upskill your talent.\n", + "\n", + "Using the COE construct, it is easier to upskill a mix of data science talent by having the experts mentor the\n", + "\n", + "less experienced staff. The majority of Ph.D.-level talent comes from academia and has a vested interest\n", + "\n", + "in educating others. It’s important to set up the structure and allow time in the schedule for knowledge\n", + "\n", + "transfer, experimentation and a safe environment in which to fail. A key aspect in accelerating the\n", + "\n", + "experience of your talent is to enable data science using production-like data and creating a collaborative\n", + "\n", + "environment for code sharing.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks training, [documentation](https://docs.databricks.com) and\n", + "\n", + "[certification](https://databricks.com/learn/certification) available to customers is industry-\n", + "\n", + "leading, and our [Solution Accelerators](https://databricks.com/solutions/accelerators) provide\n", + "\n", + "\n", + "#### 4. Deploy a modern data stack\n", + "\n", + "The modern data architecture can most easily be described as the evolution of the enterprise data\n", + "\n", + "warehouse (EDW) from the 1980s and the Hadoop-style data lakes from the mid-2000s. The capabilities,\n", + "\n", + "limitations and lessons learned from working with these two legacy data architectures inspired the next\n", + "\n", + "generation of data architecture — what the industry now refers to as the lakehouse.\n", + "\n", + "Figure 7 shows how the architectures have evolved as networking, storage, memory and CPU performance\n", + "\n", + "have improved over time.\n", + "\n", + "\n", + "exemplar code for organizations to hit the ground\n", + "\n", + "running with data and AI.\n", + "\n", + "**Figure 7:**\n", + "A brief history of data architectures\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving beyond the enterprise data warehouse and data lake**\n", + "\n", + "The EDW provided organizations with the ability to easily load structured and semi-structured data into\n", + "\n", + "well-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n", + "\n", + "(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n", + "\n", + "the business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n", + "\n", + "catalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n", + "\n", + "users — while still being performant. The EDW served its purpose for decades. However, most of the recent\n", + "\n", + "advances in AI have been in better models to process unstructured data (text, images, video, audio), but\n", + "\n", + "these are precisely the types of data that an EDW is not optimized for.\n", + "\n", + "Therefore, in the mid-2000s, organizations wanted to take advantage of new data sets — _ones that_\n", + "\n", + "_contained unstructured data_ — and apply new analytics — _ones that leveraged emerging data science_\n", + "\n", + "_algorithms_ . 
In order to accomplish this, massive investments in on-premises data lakes occurred — most\n", + "\n", + "often leveraging Apache Hadoop and its distributed file system, known as HDFS, running on low-cost,\n", + "\n", + "commodity hardware. The Hadoop-style data lake provided the separation of compute from storage that\n", + "\n", + "organizations were seeking — thus eliminating the risk of vendor lock-in and opening the doors to a wide\n", + "\n", + "range of new analytics. Despite all these benefits, the architecture proved to be difficult to use, with a\n", + "\n", + "complex programming model known as MapReduce, and the performance fell short of the majority of real-\n", + "\n", + "time use cases.\n", + "\n", + "Over time, Hadoop workloads were often migrated to Apache Spark™ workloads, which run 100x faster by\n", + "\n", + "processing data in-memory across a cluster — with the ability to massively scale. The Spark programming\n", + "\n", + "model was also simpler to use and provided a consistent set of application programming interfaces (APIs)\n", + "\n", + "for languages such as Python, SQL, R, Java and Scala. Spark was the first major step in separating compute\n", + "\n", + "from storage and providing the scale needed for distributed workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "A data lakehouse combines the best of data\n", + "\n", + "\n", + "**Cloud-based data lakes**\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud object stores like\n", + "\n", + "Amazon S3 and Azure Data Lake Storage (ADLS) have become some of the largest, most cost-effective\n", + "\n", + "storage systems in the world — which make them an attractive platform to serve as the next generation\n", + "\n", + "of data lakes. Object stores excel at massively parallel reads — an essential requirement for modern data\n", + "\n", + "warehouses.\n", + "\n", + "\n", + "lakes and data warehouses, enabling BI and ML\n", + "\n", + "\n", + "However, data lakes lack some critical features: They do not support transactions, they do not enforce\n", + "\n", + "data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "and batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n", + "\n", + "example, efficiently listing the millions of files (objects) that make up most large data lakes.\n", + "\n", + "**Lakehouse — the modern data architecture**\n", + "\n", + "What if it were possible to combine the best of both worlds? The performance, concurrency and data\n", + "\n", + "management of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n", + "\n", + "the target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n", + "\n", + "the complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\n", + "\n", + "of this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n", + "\n", + "architecture possible.\n", + "\n", + "\n", + "on all data on a simple, open and multicloud\n", + "\n", + "modern data stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 8:**\n", + "The building blocks for a modern data architecture: batch and real-time data sources (structured, semi-structured and unstructured, such as operational databases, logs, clickstream, images and free text) feed a curated data lake that refines data from “Bronze” (raw data ingest) to “Silver” (filtered/cleaned/augmented) to “Gold” (business-level aggregates), with data quality enforced along the way, serving exploratory data science, production machine learning and BI/ad hoc SQL analytics\n", + "\n", + "The lakehouse architecture provides a flexible, high-performance design for diverse data applications,\n", + "\n", + "including real-time streaming, batch processing, data warehousing, data science and machine learning. This\n", + "\n", + "target-state architecture supports loading all the data types that might be interesting to an organization —\n", + "\n", + "structured, semi-structured and unstructured — and provides a single processing layer, using consistent\n", + "\n", + "APIs across programming languages, to curate data while applying rigorous data management techniques.\n", + "\n", + "The move toward a single, consistent approach to data pipelining and refinement saves organizations\n", + "\n", + "time, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\n", + "\n", + "curation and refinement steps resulting in highly consumable and trusted data for downstream use cases.\n", + "\n", + "The architecture makes possible the efficient creation of “data assets” for the organization by taking a\n", + "\n", + "stepwise approach to improving data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Lakehouse key features**\n", + "\n", + "To effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\n", + "\n", + "available for stakeholders to run business-critical production workloads:\n", + "\n", + "\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\n", + "\n", + "management with declarative pipeline development, automatic data testing and deep visibility for\n", + "\n", + "monitoring and recovery.\n", + "\n", + "\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n", + "\n", + "data concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n", + "\n", + "read or write data, typically using SQL.\n", + "\n", + "\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n", + "\n", + "and evolution, supporting DW schema paradigms such as star/snowflake schemas. 
The system should\n", + "\n", + "be able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n", + "\n", + "lakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n", + "\n", + "to unify data and AI assets by centrally sharing, auditing, securing and managing structured and\n", + "\n", + "unstructured data like tables, files, models and dashboards in concert with existing data, storage and\n", + "\n", + "catalogs.\n", + "\n", + "\u0007 **Storage is decoupled from compute:** In practice this means storage and compute use separate\n", + "\n", + "clusters, thus these systems are able to scale to many more concurrent users and larger data sizes.\n", + "\n", + "Some modern data warehouses also have this property.\n", + "\n", + "\u0007 **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide\n", + "\n", + "an API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently\n", + "\n", + "access the data directly.\n", + "\n", + "\n", + "Databricks released Delta Lake to the open source\n", + "\n", + "community in 2019. Delta Lake provides all the data\n", + "\n", + "lifecycle management functions that are needed\n", + "\n", + "to make cloud-based object stores reliable and\n", + "\n", + "performant. This design allows clients to update\n", + "\n", + "multiple objects at once, replace a subset of\n", + "\n", + "the objects with another, etc., in a serializable\n", + "\n", + "manner that still achieves high parallel read/write\n", + "\n", + "performance from the objects — while offering\n", + "\n", + "advanced capabilities like time travel (e.g., query\n", + "\n", + "point-in-time snapshots or rollback of erroneous\n", + "\n", + "updates), automatic data layout optimization,\n", + "\n", + "upserts, caching and audit logs.\n", + "\n", + "\n", + "-----\n", + "\n", + "\u0007 **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be\n", + "\n", + "used to store, refine, analyze and access data types needed for many new data applications, including\n", + "\n", + "images, video, audio, semi-structured data and text.\n", + "\n", + "\u0007 **Support for diverse workloads:** This includes data science, machine learning, SQL and analytics. Multiple\n", + "\n", + "tools might be needed to support all these workloads, but they all rely on the same data repository.\n", + "\n", + "\u0007 **End-to-end streaming:** Real-time reports are the norm in many enterprises. Support for streaming\n", + "\n", + "eliminates the need for separate systems dedicated to serving real-time data applications.\n", + "\n", + "\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. This reduces staleness,\n", + "\n", + "improves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n", + "\n", + "data in both a data lake and a warehouse.\n", + "\n", + "\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n", + "\n", + "governance experience across all clouds. You don’t need to invest in reinventing processes for every\n", + "\n", + "cloud platform that you’re using to support your data and AI efforts. 
Instead, your data teams can simply\n", + "\n", + "focus on putting all your data to work to discover new insights and create business value.\n", + "\n", + "\n", + "**Figure 9:**\n", + "The Lakehouse Platform: Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML workloads running on Unity Catalog (fine-grained governance for data and AI) and Delta Lake (data reliability and performance) over the cloud data lake (all structured and unstructured data). Delta Lake is the open data storage layer that delivers reliability,\n", + "security and performance on your data lake — for both\n", + "streaming and batch operations\n", + "\n", + "\n", + "-----\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools\n", + "\n", + "for security and access control are basic requirements. Data governance capabilities, including auditing,\n", + "\n", + "retention and lineage, have become essential, particularly in light of recent privacy regulations. Tools that\n", + "\n", + "enable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse,\n", + "\n", + "such enterprise features only need to be implemented, tested and administered for a single system.\n", + "\n", + "Databricks is the only cloud-native vendor\n", + "\n", + "\n", + "**Databricks — innovation driving performance**\n", + "\n", + "Advanced analytics and machine learning on unstructured and large-scale data are two of the most\n", + "\n", + "strategic priorities for enterprises today — and the growth of unstructured data is going to increase\n", + "\n", + "exponentially — so it makes sense for CIOs and CDOs to think about positioning their data lake as the\n", + "\n", + "center of their data infrastructure. The main challenge is whether or not it can perform reliably and fast\n", + "\n", + "enough to meet the SLAs of the various workloads — especially SQL-based analytics.\n", + "\n", + "Databricks has focused its engineering efforts on incorporating a wide range of industry-leading software\n", + "\n", + "and hardware improvements in order to implement the first lakehouse solution. Our approach capitalizes\n", + "\n", + "on the computing advances of the Apache Spark framework and the latest networking, storage and CPU\n", + "\n", + "technologies to provide the performance customers need to simplify their architecture. These innovations\n", + "\n", + "combine to provide a single architecture that can store and process all the data sets within an organization —\n", + "\n", + "supporting the range of analytics outlined above.\n", + "\n", + "**BI and SQL workloads**\n", + "\n", + "Perhaps the most significant challenge for the lakehouse architecture is the ability to support SQL queries\n", + "\n", + "for star/snowflake schemas in support of BI workloads. Part of the reason EDWs have remained a major\n", + "\n", + "part of the data ecosystem is because they provide low-latency, high-concurrency query support. In order\n", + "\n", + "to compete with the EDW, optimizations must be found within the lakehouse architecture that provide\n", + "\n", + "satisfactory query performance for the majority of BI workloads. 
Fortunately, advances in query plan, query\n", + "\n", + "execution, statistical analysis of files in the object store, and hardware and software improvements make it\n", + "\n", + "possible to deliver on this promise.\n", + "\n", + "\n", + "to be recognized as a Leader in both\n", + "\n", + "[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n", + "\n", + "**Cloud Database Management Systems** and\n", + "\n", + "**Data Science and Machine Learning Platforms**\n", + "\n", + "\n", + "-----\n", + "\n", + "**A word about the data mesh architecture**\n", + "\n", + "In 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n", + "\n", + "what some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lake\n", + "\n", + "using a series of extract, transform, load (ETL) processes — which unnecessarily adds complexity. The data\n", + "\n", + "mesh approach avoids centralizing data in one location and encourages the source systems to create\n", + "\n", + "“data products” or “data assets” that are served up directly to consumers for data and AI workloads. The\n", + "\n", + "designers advocate for a federated approach to data and AI — while using enterprise policies to govern how\n", + "\n", + "source systems make data assets available.\n", + "\n", + "There are several challenges with this approach. First, the data mesh assumes that each source system\n", + "\n", + "can dynamically scale to meet the demands of the consumers — particularly challenging when data assets\n", + "\n", + "become “hot spots” within the ecosystem. Second, centralized policies oftentimes leave the implementation\n", + "\n", + "details to the individual teams. This has the potential of inconsistent implementations, which may lead to\n", + "\n", + "performance degradations and differing cost profiles. Finally, the data mesh approach assumes that each\n", + "\n", + "source system team has the necessary skills, or can acquire them, to build robust data products.\n", + "\n", + "The lakehouse architecture is not at odds with the data mesh philosophy — as ingesting higher-quality data\n", + "\n", + "from the source systems reduces the curation steps needed inside the data lake itself.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 5. Improve data governance and compliance\n", + "\n", + "Data governance is perhaps the most challenging aspect of data transformation initiatives. Every\n", + "\n", + "stakeholder recognizes the importance of making data readily available, of high quality and relevant to help\n", + "\n", + "drive business value. Likewise, organizations understand the risks of failing to get it right — the potential for\n", + "\n", + "undetected data breaches, negative impact on the brand and the potential for significant fines in regulated\n", + "\n", + "environments. However, organizations shouldn’t perceive data governance or a defensive data strategy as\n", + "\n", + "a blocker or deterrent to business value. 
In fact, many organizations have leveraged their strong stance on\n", + "\n", + "data governance as a competitive differentiator to earn and maintain customer trust, ensure sound data\n", + "\n", + "and privacy practices, and protect their data assets.\n", + "\n", + "**Why data governance fails**\n", + "\n", + "While most people agree that data governance is a set of principles, practices and tooling that helps\n", + "\n", + "manage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic\n", + "\n", + "approach — one that balances realistic policies with automation and scalability.\n", + "\n", + "Too often the policies developed around data governance define very strict data management principles —\n", + "\n", + "for example, the development of an enterprise-wide ontological model that all data must adhere to.\n", + "\n", + "Organizations can spend months, if not years, trying to define the perfect set of policies. The engineering\n", + "\n", + "effort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the\n", + "\n", + "complexity of the requirements. Meanwhile, data continues to flow through the organization without a\n", + "\n", + "consistent approach to governance, and too much of the effort is done manually and fraught with human error.\n", + "\n", + "\n", + "What are the basic building blocks of a sound data\n", + "\n", + "governance approach?\n", + "\n", + "\n", + "-----\n", + "\n", + "**A pragmatic approach to data governance**\n", + "\n", + "At a high level, organizations should enable the following data management capabilities:\n", + "\n", + "**\u0007Identify all sources of data**\n", + "\n", + "\u0007Identify all data-producing and data-storing applications\n", + "\n", + "\u0007Identify the systems of record (SOR) for each data set\n", + "\n", + "\u0007Label data sets as internal or external (third party)\n", + "\n", + "\u0007Identify where sensitive data is stored — GDPR/CCPA scope\n", + "\n", + "\u0007Limit which operational data stores (ODSs) can re-store SOR data\n", + "\n", + "**\u0007Catalog data sets**\n", + "\n", + "\u0007Register all data sets in a centralized data catalog\n", + "\n", + "\u0007Create a lightweight, self-service data registration process\n", + "\n", + "\u0007Limit manual entry as much as possible\n", + "\n", + "\u0007Record the schema, if any, for the data set\n", + "\n", + "\u0007Use an inference engine or tool to extract the data set schema\n", + "\n", + "\u0007Add business and technical metadata to make it meaningful\n", + "\n", + "\u0007Use machine learning to classify data sets\n", + "\n", + "\u0007Use crowdsourcing to validate the machine-based results\n", + "\n", + "**Track data lineage**\n", + "\n", + "\u0007Track data set flow and what systems act on data\n", + "\n", + "\u0007Create an enumerated list of action values for specific operations\n", + "\n", + "\u0007Emit lineage events via the streaming layer and aggregate them in the data lake using a consistent lineage event schema\n", + "\n", + "\u0007Optional: Add a source code repository URL for action traceability\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Perform data quality checks**\n", + "\n", + "\u0007Create a rules library that is centrally managed and versioned\n", + "\n", + "\u0007Update the rules library periodically with new rules\n", + "\n", + "\u0007Use a combination of checks — null/not null, regex, valid values\n", + "\n", + "\u0007Perform schema enforcement checks against data set registration\n", +
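 "\n", + "To make the checks listed above concrete, here is a minimal, illustrative sketch of a rule-based quality check pass (the table, column and rule names are hypothetical, assuming a PySpark environment with an active spark session):\n", + "\n", + "```python\n", + "# Apply a small, versionable rules library to a Spark DataFrame and count failures per rule.\n", + "from pyspark.sql import functions as F\n", + "\n", + "rules = {\n", + "    'customer_id_not_null': F.col('customer_id').isNotNull(),\n", + "    'email_format': F.col('email').rlike('^[^@]+@[^@]+[.][^@]+$'),\n", + "    'country_valid_value': F.col('country').isin('US', 'CA', 'MX'),\n", + "}\n", + "\n", + "df = spark.table('retail.silver.customers')\n", + "failures = df.select([F.count(F.when(~expr, True)).alias(name) for name, expr in rules.items()])\n", + "failures.show()\n", + "```\n", +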
"\n", + "By minimizing the number of copies of your data\n", + "\n", + "\n", + "**\u0007Scan for sensitive data**\n", + "\n", + "\u0007Establish a tokenization strategy for sensitive data — GDPR/CCPA\n", + "\n", + "\u0007Tokenize all sensitive data stored in the data lake — avoid cleartext\n", + "\n", + "\u0007Use fixed-length tokens to preserve analytic value\n", + "\n", + "\u0007Determine the approach for token lookup/resolution when needed\n", + "\n", + "\u0007Ensure that any central token stores are secure with rotating keys\n", + "\n", + "\u0007Identify which data elements from GDPR/CCPA to include in scans\n", + "\n", + "\u0007Efficiently scan for sensitive data in cleartext using the rules library\n", + "\n", + "**\u0007Establish approved data flow patterns**\n", + "\n", + "\u0007Determine pathways for data flow (source —> target)\n", + "\n", + "\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n", + "\n", + "\u0007Determine read/write patterns for the data lake\n", + "\n", + "\u0007Strictly enforce data flow pathways to/from data lake\n", + "\n", + "\u0007Detect violations and anomalies using lineage event analysis\n", + "\n", + "\u0007Identify offending systems and shut down or grant exception\n", + "\n", + "\u0007Record data flow exceptions and set a remediation deadline\n", + "\n", + "**\u0007Centralize data access controls**\n", + "\n", + "\u0007Establish a common governance model for all data and AI assets\n", + "\n", + "\u0007Centrally define access policies for all data and AI assets\n", + "\n", + "\u0007Enable fine-grained access controls at row and column levels\n", + "\n", + "\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n", + "\n", + "\n", + "and moving to a single data processing layer where\n", + "\n", + "all your data governance controls can run together,\n", + "\n", + "you improve your chances of staying in compliance\n", + "\n", + "and detecting a data breach.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Make data discovery easy**\n", + "\n", + "\u0007Establish a data discovery model\n", + "\n", + "\u0007Use manual or automatic data classification\n", + "\n", + "\u0007Provide a visual interface for data discovery across your data estate\n", + "\n", + "\u0007Simplify data discovery with rich keyword- or business glossary-based search\n", + "\n", + "**\u0007Centralize data access auditing**\n", + "\n", + "\u0007Establish a framework or best practices for access auditing\n", + "\n", + "\u0007Capture audit logs for all CRUD operations performed on data\n", + "\n", + "\u0007Make auditing reports easily accessible to data stewards/admins for ensuring compliance\n", + "\n", + "This is not intended to be an exhaustive list of features and requirements but rather a framework to\n", + "\n", + "evaluate your data governance approach. There will be violations at runtime, so it will be important to have\n", + "\n", + "procedures in place for how to handle these violations. In some cases, you may want to be very strict and\n", + "\n", + "shut down the data flow of the offending system. In other cases, you may want to quarantine the data until\n", + "\n", + "the offending system is fixed. Finally, some SLAs may require the data to flow regardless of a violation. 
In\n", + "\n", + "these cases, the receiving systems must have their own methodology for dealing with bad data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Hidden cost of data governance**\n", + "\n", + "There are numerous examples of high-profile data breaches and failure to comply with consumer data\n", + "\n", + "protection legislation. You don’t have to look very far to see reports of substantial fines levied against\n", + "\n", + "organizations that were not able to fully protect the data within their data ecosystem. As organizations\n", + "\n", + "produce and collect more and more data, it’s important to remember that while storage is cheap, failing\n", + "\n", + "to enforce proper data governance is very, very expensive.\n", + "\n", + "In order to catalog, lineage trace, quality check, and scan your data effectively, you will need a lot of\n", + "\n", + "compute power when you consider the massive amounts of data that exist in your organization. Each\n", + "\n", + "time you copy a piece of data to load it into another tool or platform, you need to determine what data\n", + "\n", + "governance techniques exist there and how you ensure that you truly know where all your data resides.\n", + "\n", + "Imagine the scenario where data flows through your environment and is loaded into multiple platforms\n", + "\n", + "using various ETL processes. How do you handle the situation when you discover that sensitive data is\n", + "\n", + "in cleartext? Without a consistent set of data governance tools, you may not be able to remediate the\n", + "\n", + "problem before it’s flagged for violation.\n", + "\n", + "Having a smaller attack surface and fewer ingress/egress routes helps guard your data and protect your\n", + "\n", + "organization’s brand and balance sheet.\n", + "\n", + "The bottom line is that the more complex your data ecosystem architecture is, the more difficult and costly\n", + "\n", + "it is to get data governance right.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 6. Democratize access to quality data\n", + "\n", + "Effective data and AI solutions rely more on the amount of quality data available than on the sophistication\n", + "\n", + "or complexity of the model or algorithm. Google published a paper titled “The Unreasonable Effectiveness of\n", + "\n", + "Data” demonstrating this point. The takeaway is that organizations should focus their efforts on making sure\n", + "\n", + "data scientists have access to the widest selection of relevant and high-quality data to perform their jobs —\n", + "\n", + "which is to create new opportunities for revenue growth, cost reduction and risk reduction.\n", + "\n", + "**The 80/20 data science dilemma**\n", + "\n", + "Most existing data environments have their data stored primarily in different operational data stores within a\n", + "\n", + "given business unit (BU) — creating several challenges:\n", + "\n", + "\u0007Most business units deploy use cases that are based only on their own data — without taking advantage\n", + "\n", + "of cross-BU opportunities\n", + "\n", + "\u0007The schemas are generally not well understood outside of BU or department — with only the database\n", + "\n", + "designers and power users being able to make efficient use of the data. 
This is referred to as the “tribal\n", + "\n", + "knowledge” phenomenon.\n", + "\n", + "\u0007The approval process and different system-level security models make it difficult and time-consuming\n", + "\n", + "for data scientists to gain the proper access to the data they need\n", + "\n", + "In order to perform analysis, users are forced to log in to multiple systems to collect their data. This is most\n", + "\n", + "often done using single-node data science and generates unnecessary copies of data stored on local disk\n", + "\n", + "drives, various network shares or user-controlled cloud storage. In some cases, the data is copied to “user\n", + "\n", + "spaces” within production platform environments. This has the strong potential of degrading the overall\n", + "\n", + "performance for true production workloads.\n", + "\n", + "To make matters worse, these copies of data are generally much smaller than the full-size data sets that would\n", + "\n", + "be needed in order to get the best model performance for your ML and AI workloads.\n", + "\n", + "\n", + "-----\n", + "\n", + "Small data sets reduce the effectiveness of exploration, experimentation, model development and model\n", + "\n", + "training — resulting in inaccurate models when deployed into production and used with full-size data sets.\n", + "\n", + "As a result, data science teams are spending 80% of their time wrangling data sets and only 20% of their\n", + "\n", + "time performing analytic work — work that may need to be redone once they have access to the full-size\n", + "\n", + "data sets. This is a serious problem for organizations that want to remain competitive and generate game-\n", + "\n", + "changing results.\n", + "\n", + "Another factor contributing to reduced productivity is the way in which end users are typically granted\n", + "\n", + "access to data. Security policies usually require both coarse-grained and fine-grained data protections.\n", + "\n", + "In other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n", + "\n", + "grained) within the data set.\n", + "\n", + "**Rationalize data access roles**\n", + "\n", + "The most common approach to providing coarse-grained and fine-grained access is to use what’s known\n", + "\n", + "as role-based access control (RBAC). Individual users log on to system-level accounts or via a single sign-on\n", + "\n", + "(SSO) authentication and access control solution.\n", + "\n", + "Users can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n", + "\n", + "There are different strategies for identifying and creating these groups — but typically, they are done on a\n", + "\n", + "system-by-system basis, with a 1:1 mapping for each coarse- and fine-grained access control combination.\n", + "\n", + "This approach to data access usually produces a proliferation of user groups. It is not unusual to see several\n", + "\n", + "thousand discrete security groups for large organizations — despite having a much smaller number of\n", + "\n", + "defined job functions.\n", + "\n", + "This approach creates one of the biggest security challenges in large organizations. When personnel leave\n", + "\n", + "the company, it is fairly straightforward to remove them from the various security groups. However, when\n", + "\n", + "personnel move around within the organization, their old security group assignments often remain intact\n", + "\n", + "and new ones are assigned based on their new job function. 
This leads to personnel continuing to have\n", + "\n", + "access to data that they no longer have a “need to know.”\n", + "\n", + "\n", + "The Databricks Lakehouse Platform brings together\n", + "\n", + "all the data and AI personas into one environment\n", + "\n", + "and makes it easy to collaborate, share code and\n", + "\n", + "insights, and operate against the same view of data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data classification**\n", + "\n", + "Having all your data sets stored in a single, well-managed data lake gives you the ability to use partition\n", + "\n", + "strategies to segment your data based on “need to know.” Some organizations create a partition based\n", + "\n", + "on which business unit owns the data and which one owns the data classification. For example, in a\n", + "\n", + "financial services company, credit card customers’ data could be stored separately from that of debit card\n", + "\n", + "customers, and access to GDPR/CCPA-related fields could be handled using classification labels.\n", + "\n", + "The simplest approach to data classification is to use three labels:\n", + "\n", + "\u0007 **Public data:** Data that can be freely disclosed to the public. This would include your annual report, press\n", + "\n", + "releases, etc.\n", + "\n", + "\u0007 **Internal data:** Data that has low security requirements but should not be shared with the public or\n", + "\n", + "competitors. This would include strategy briefings and market or customer segmentation research.\n", + "\n", + "\u0007 **Restricted data:** Highly sensitive data regarding customers or internal business operations. Disclosure\n", + "\n", + "could negatively affect operations and put the organization at financial or legal risk. Restricted data\n", + "\n", + "requires the highest level of security protection.\n", + "\n", + "Some organizations introduce additional labels, but care should be taken to make sure that everyone clearly\n", + "\n", + "understands how to apply them.\n", + "\n", + "The data classification requirements should be clearly documented and mapped to any legal or regulatory\n", + "\n", + "requirements. For example, CCPA is so sweeping that it includes 11 categories of personal information —\n", + "\n", + "and defines “personal information” as “information that identifies, relates to, describes, is capable of\n", + "\n", + "being associated with, or could reasonably be linked, directly or indirectly, with a particular consumer or\n", + "\n", + "household.”\n", + "\n", + "\n", + "-----\n", + "\n", + "Just examining one CCPA category, _Customer Records Information_ , we see that the following information is\n", + "\n", + "to be protected: name, signature, social security number, physical characteristics or description, address,\n", + "\n", + "telephone number, passport number, driver’s license or state identification card number, insurance policy\n", + "\n", + "number, education, employment, employment history, bank account number, credit or debit card number,\n", + "\n", + "other financial information, medical information, and health insurance information.\n", + "\n", + "There are generally three different approaches in industry to performing data classification:\n", + "\n", + "**1. \u0007Content-based:** Scans or inspects and interprets files to find sensitive information. This is generally\n", + "\n", + "done using regular expressions and lookup tables to map values to actual entities stored inside the\n", + "\n", + "organization (e.g., customer SSN).\n", + "\n", + "**2. 
\u0007Context-based:** Evaluates the source of the data (e.g., application, location or creator) to determine\n", + "\n", + "the sensitivity of the data.\n", + "\n", + "**3. \u0007User-based:** Relies on a manual, end-user selection of each data set or element and requires expert\n", + "\n", + "domain knowledge to ensure accuracy.\n", + "\n", + "Taking all this into account, an organization could implement a streamlined set of roles for RBAC that\n", + "\n", + "uses a four-part naming convention of domain, entity, data set (or data asset) and classification, where “domain” might be the\n", + "\n", + "business unit within an organization, “entity” is the noun that the role is valid for, “data set” or “data asset” is\n", + "\n", + "the ID, and “classification” is one of the three values (public, internal, restricted). A short illustrative sketch of this convention appears later in this section.\n", + "\n", + "There is a “deny all default” policy that does not allow access to any data unless there is a corresponding\n", + "\n", + "role assignment. Wild cards can be used in role assignments to eliminate the need to enumerate every\n", + "\n", + "combination.\n", + "\n", + "\n", + "-----\n", + "\n", + "For example, a role carrying the restricted classification for a credit card transaction data set gives a user or a system access to all the\n", + "\n", + "data fields that describe a credit card transaction for a customer, including the 16-digit credit card number,\n", + "\n", + "whereas the same role with the internal classification would allow the user or system\n", + "\n", + "access only to nonsensitive data regarding the transaction.\n", + "\n", + "This gives organizations the chance to rationalize their security groups by using a domain naming\n", + "\n", + "convention to provide coarse-grained and fine-grained access without the need for creating tons of LDAP\n", + "\n", + "groups. It also dramatically eases the administration of granting access to data for a given user.\n", + "\n", + "**Everyone working from the same view of data**\n", + "\n", + "The modern data stack, when combined with a simplified security group approach and a robust data\n", + "\n", + "governance methodology, gives organizations an opportunity to rethink how data is accessed — and greatly\n", + "\n", + "improves time to market for their analytic use cases. All analytic workloads can now operate from a single,\n", + "\n", + "shared view of your data.\n", + "\n", + "Combining this with a sensitive data tokenization strategy can make it straightforward to empower data\n", + "\n", + "scientists to do their job and shift the 80/20 ratio in their favor. It’s now easier to work with full-size data\n", + "\n", + "sets that both obfuscate NPI/PII information and preserve analytic value.\n", + "\n", + "Now, data discovery is easier because data sets have been registered in the catalog with full descriptions\n", + "\n", + "and business metadata — with some organizations going as far as showing realistic sample data for a\n", + "\n", + "particular data set. 
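\n", + "\n", + "To illustrate the four-part role naming convention described earlier, here is a minimal, hypothetical sketch (the role names, wildcard rule and helper functions are illustrative only, not a product API):\n", + "\n", + "```python\n", + "# Compose role names as domain.entity.dataset.classification and apply a deny-all default.\n", + "def make_role(domain, entity, dataset, classification):\n", + "    assert classification in {'public', 'internal', 'restricted'}\n", + "    return f'{domain}.{entity}.{dataset}.{classification}'\n", + "\n", + "def can_access(assigned_roles, required_role):\n", + "    # Deny-all default: access requires an explicit or wildcarded role assignment.\n", + "    for role in assigned_roles:\n", + "        if role == required_role:\n", + "            return True\n", + "        if role.endswith('.*') and required_role.startswith(role[:-1]):\n", + "            return True\n", + "    return False\n", + "\n", + "analyst_roles = {'cards.credit_card.transactions.internal'}\n", + "print(can_access(analyst_roles, make_role('cards', 'credit_card', 'transactions', 'internal')))   # True\n", + "print(can_access(analyst_roles, make_role('cards', 'credit_card', 'transactions', 'restricted')))  # False\n", + "```\n", + "\n", + "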
If a user does not have access to the underlying data files, having data in one physical\n", + "\n", + "location eases the burden of granting access, and then it’s easier to deploy access-control policies and\n", + "\n", + "collect/analyze audit logs to monitor data usage and to look for bad actors.\n", + "\n", + "\n", + "Adopting the Databricks Lakehouse Platform allows\n", + "\n", + "you to add data sets into a well-managed data lake\n", + "\n", + "using low-cost object stores, and makes it easy to\n", + "\n", + "partition data based on domain, entity, data set and\n", + "\n", + "classification levels to provide fine-grained (row-\n", + "\n", + "level and column-level) security.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data security, validation and curation — in one place**\n", + "\n", + "The modern data architecture using Databricks Lakehouse makes it easy to take a consistent approach to\n", + "\n", + "protecting, validating and improving your organization’s data. Data governance policies can be enforced\n", + "\n", + "using the built-in features of schema validation, expectations and pipelines — the three main steps to data\n", + "\n", + "curation. Databricks enables moving data through well-defined states: Raw —> Refined —> Curated or, as we\n", + "\n", + "refer to it at Databricks, Bronze —> Silver —> Gold.\n", + "\n", + "The raw data is known as “Bronze-level” data and serves as the landing zone for all your important analytic\n", + "\n", + "data. Bronze data functions as the starting point for a series of curation steps that filter, clean and augment\n", + "\n", + "the data for use by downstream systems. The first major refinement results in data being stored in “Silver-\n", + "\n", + "level” tables within the data lake. These tables carry all the benefits of the Delta Lake product — for example,\n", + "\n", + "ACID transactions and time travel. The final step in the process is to produce business-level aggregates, or\n", + "\n", + "“Gold-level” tables, that combine data sets from across the organization. It’s a set of data used to improve\n", + "\n", + "customer service across the full line of products, perform GDPR/CCPA reporting or look for opportunities to\n", + "\n", + "cross-sell to increase customer retention. For the first time, organizations can truly optimize data curation\n", + "\n", + "and ETL — eliminating unnecessary copies of data and the duplication of effort that often happens in ETL\n", + "\n", + "jobs with legacy data ecosystems. This “solve once, access many times” approach speeds time to market,\n", + "\n", + "improves the user experience and helps retain talent.\n", + "\n", + "**Extend the impact of your data with secure data sharing**\n", + "\n", + "Data sharing is crucial to drive business value in today’s digital economy. More and more organizations\n", + "\n", + "are now looking to securely share trusted data with their partners/suppliers, internal lines of business or\n", + "\n", + "customers to drive collaboration, improve internal efficiency and generate new revenue streams with data\n", + "\n", + "monetization. Additionally, organizations are interested in leveraging external data to drive new product\n", + "\n", + "innovations and services.\n", + "\n", + "Business executives must establish and promote a data sharing culture in their organizations to build\n", + "\n", + "competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 7. 
Dramatically increase productivity of your workforce\n", + "\n", + "Now that you have deployed a modern data stack and have landed all your analytical data in a well-\n", + "\n", + "managed data lake with a rationalized approach to access control, the next question is, “What tools should I\n", + "\n", + "provide to the user community so they can be most effective at using the new data ecosystem?”\n", + "\n", + "**Design thinking: working backward from the user experience**\n", + "\n", + "Design thinking is a human-centered approach to innovation — focused on understanding customer needs,\n", + "\n", + "rapid prototyping and generating creative ideas — that will transform the way you develop products, services,\n", + "\n", + "processes and organizations. Design thinking was introduced as a technique to not only improve but also\n", + "\n", + "bring joy to the way people work. The essence of design thinking is to determine what motivates people to\n", + "\n", + "do their job, where their current pain points are and what could be improved to make their jobs enjoyable.\n", + "\n", + "**Moving beyond best of breed**\n", + "\n", + "If you look across a large enterprise, you will find no shortage of database design, ETL, data cleansing, model\n", + "\n", + "training and model deployment tools. Many organizations take a “best of breed” approach in providing\n", + "\n", + "tooling for their end users. This typically occurs because leaders genuinely want to empower business\n", + "\n", + "units, departments and teams to select the tool that best suits their specific needs — so-called federated\n", + "\n", + "tool selection. Data science tooling, in particular, tends not to be procured at the “enterprise” level at first —\n", + "\n", + "given the high cost of rolling it out to the entire user population.\n", + "\n", + "\n", + "-----\n", + "\n", + "When tool selection becomes localized, there are a few things to consider:\n", + "\n", + "\u0007Tools are generally thought of as discrete components within an ecosystem and, therefore,\n", + "\n", + "interchangeable with criteria that are established within a specific tool category. 
The tool with the best\n", + "\n", + "overall score gets selected.\n", + "\n", + "\u0007The selection criteria for a tool usually contains a subjective list of “must-have” features based on\n", + "\n", + "personal preference or adoption within a department, or because a given tool is better suited to support\n", + "\n", + "a current business process\n", + "\n", + "\u0007Discrete tools tend to leapfrog one another and add features based on market demand rather quickly\n", + "\n", + "\u0007Evaluations that are performed over many months likely become outdated by the time the tool has\n", + "\n", + "moved into production\n", + "\n", + "\u0007The “enterprise” requirements are often limited to ensuring that the tool fits into the overall architecture\n", + "\n", + "and security environment but nothing more\n", + "\n", + "\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n", + "\n", + "of tools in play or streamlining the user experience\n", + "\n", + "\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n", + "\n", + "partnership model, the ability to influence the roadmap and professional services support\n", + "\n", + "For these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n", + "\n", + "on selecting a data platform that enables seamless integration with point solutions rather than a suite of\n", + "\n", + "discrete tools that require integration work and may no longer be category leaders over the long haul.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks is a leading data and AI company —\n", + "\n", + "\n", + "Keep in mind that data platforms work well because the vendor took an opinionated point of view of how\n", + "\n", + "data processing, validation and curation should work. It’s the integration between the discrete functions\n", + "\n", + "of the platform that saves time, conserves effort and improves the user experience. Many companies try\n", + "\n", + "to take on the integration of different technology stacks, which increases risk, cost and complexity. The\n", + "\n", + "consequences of not doing the integration properly can be serious — in terms of security, compliance,\n", + "\n", + "efficiency, cost, etc.\n", + "\n", + "\n", + "partly due to the innovations in the [open source](https://databricks.com/product/open-source)\n", + "\n", + "\n", + "So, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\n", + "\n", + "and incorporate your requirements into their platform product roadmap. This will require some give-and-\n", + "\n", + "take from both parties — sometimes calling for an organization to adjust their processes to better fit how\n", + "\n", + "the platform works. There are many instances where a given business process could be simplified or recast\n", + "\n", + "to work with the platform, as is. Sometimes it will require the vendor to add features that support your\n", + "\n", + "processes. The vendor will always be market driven and will want to build features in such a way that they\n", + "\n", + "apply to the broadest set of customers.\n", + "\n", + "The final point to consider is that it takes a substantial amount of time to become an expert user of a given\n", + "\n", + "tool. Users must make a significant investment to learn how the tool works and the most efficient way of\n", + "\n", + "performing their job. 
The more discrete tools in an environment, the more challenging this becomes.\n", + "\n", + "Minimizing the number of tools and their different interfaces, styles of interaction and approach to security\n", + "\n", + "and collaboration helps improve the user experience and decreases time to market.\n", + "\n", + "\n", + "[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\n", + "\n", + "listening to the needs of thousands of customers\n", + "\n", + "and having our engineers work side by side with\n", + "\n", + "customer teams to deliver real business value using\n", + "\n", + "data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified platform, unified personas**\n", + "\n", + "Deploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\n", + "\n", + "data stack — will provide an integrated suite of tools for the full range of personas in your organization,\n", + "\n", + "including business analysts, SQL developers, data engineers and data scientists. You will immediately\n", + "\n", + "increase productivity and reduce risk because you’ll be better able to share the key aspects of data\n", + "\n", + "pipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n", + "\n", + "and deployment. All the work streams function off a single view of the data, and the handoffs between\n", + "\n", + "subsystems are well managed.\n", + "\n", + "Data processing happens in one auditable environment, and the number of copies of data is kept to an\n", + "\n", + "absolute minimum — with each user benefiting from the data assets created by others. Redundant work\n", + "\n", + "is eliminated.\n", + "\n", + "The 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n", + "\n", + "working with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n", + "\n", + "the 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n", + "\n", + "Another challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\n", + "\n", + "differently — for example, changing a string to an integer. This has a cascading effect, and the downstream\n", + "\n", + "consumers must be able to adjust by monitoring the execution and detecting the changes. The data\n", + "\n", + "scientist, in turn, must update and test new models on the new data. Your data platform should make the\n", + "\n", + "detection and remediation easier, not harder.\n", + "\n", + "For the data engineers, their primary focus is extracting data from source systems and moving it into the\n", + "\n", + "new data ecosystem. The data pipeline function can be simplified with a unified data platform because\n", + "\n", + "the programming model and APIs are consistent across programming languages (e.g., Scala, Python). This\n", + "\n", + "results in improved operations and maintenance (O&M). 
The runtime environment is easier to troubleshoot\n", + "\n", + "and debug since the compute layer is consistent, and the logging and auditing associated with the data\n", + "\n", + "processing and data management is centralized and of more value.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Maximize the productivity of your workforce**\n", + "\n", + "Once you have a data platform that brings together your full range of personas, you should focus on the\n", + "\n", + "next step for increasing productivity — namely, self-service environments.\n", + "\n", + "In large organizations, there needs to be a strategy for how solutions are promoted up through the runtime\n", + "\n", + "environments for development, testing and production. These environments need to be nearly identical to\n", + "\n", + "one another — using the same version of software while limiting the number, size and horsepower of the\n", + "\n", + "compute nodes. To the extent possible, development and test should be performed with realistic test/\n", + "\n", + "synthetic data. One strategy to support this is to tap into the flow of production data and siphon off a small\n", + "\n", + "percentage that is then changed in randomized fashion — obfuscating the real data but keeping the same\n", + "\n", + "general shape and range of values.\n", + "\n", + "The **DEV** environment should be accessible to everyone without any organizational red tape. The DEV\n", + "\n", + "environments should be small and controlled with policies that spin them up and tear them down efficiently.\n", + "\n", + "Every aspect of the DEV infrastructure should be treated as ephemeral. Nothing should exist in the\n", + "\n", + "environment that cannot be destroyed and easily rebuilt.\n", + "\n", + "The **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n", + "\n", + "tools — within obvious cost/budget constraints. The use of the TEST environment can be requested by\n", + "\n", + "the developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n", + "\n", + "management.\n", + "\n", + "Moving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n", + "\n", + "developers cannot randomly promote software to run in production. Again, this process should be\n", + "\n", + "strictly governed using a workflow/sign-off approval approach — signed off by management as well.\n", + "\n", + "Many organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n", + "\n", + "deployments.\n", + "\n", + "\n", + "**DEV** **TEST**\n", + "\n", + "**PROD**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 8. Make informed build vs. buy decisions\n", + "\n", + "A key piece of the strategy will involve the decision around which components of the data ecosystem are\n", + "\n", + "built by the in-house engineering team and which components are purchased through a vendor relationship.\n", + "\n", + "There is increased emphasis within engineering teams on taking a “builder” approach. In other words, the\n", + "\n", + "engineering teams prefer to develop their own solutions in-house rather than rely on vendor products.\n", + "\n", + "**Competitive advantage**\n", + "\n", + "This “roll your own’’ approach has some advantages — including being able to establish the overall product\n", + "\n", + "vision, prioritize features and directly allocate the resources to build the software. 
However, it is important to\n", + "\n", + "keep in mind which aspects of your development effort give you the most competitive advantage.\n", + "\n", + "Spend some time working with the data transformation steering committee and other stakeholders to\n", + "\n", + "debate the pros and cons of building out various pieces of the data ecosystem. The primary factor should\n", + "\n", + "come down to whether or not a given solution offers true competitive advantage for the organization. Does\n", + "\n", + "building this piece of software make it harder for your competitors to compete with you? If the answer is no,\n", + "\n", + "then it is better to focus your engineering and data science resources on deriving insights from your data.\n", + "\n", + "**Beware: becoming your own software vendor**\n", + "\n", + "As many engineering leaders know, building your own software is an exciting challenge. However, it does\n", + "\n", + "come with added responsibility — namely, managing the overall project timeline and costs, and being\n", + "\n", + "responsible for the design, implementation, testing, documentation, training, and ongoing maintenance and\n", + "\n", + "updates. You basically are becoming your own software vendor for every component of the ecosystem\n", + "\n", + "that you build yourself. When you consider the cost of a standard-sized team, it is not uncommon to spend\n", + "\n", + "several million dollars per year building out individual component parts of the new data system. This doesn’t\n", + "\n", + "include the cost to operate and maintain the software once it is in production.\n", + "\n", + "\n", + "-----\n", + "\n", + "To offset the anticipated development costs, engineering teams will oftentimes make the argument that\n", + "\n", + "they are starting with open source software and extending it to meet the “unique requirements” of your\n", + "\n", + "organization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n", + "\n", + "unique and b) the development offers the competitive advantage that you need.\n", + "\n", + "Even software built on top of open source still requires significant investment in integration and testing.\n", + "\n", + "The integration work is particularly challenging because of the large number of open source libraries that\n", + "\n", + "are required in the data science space. The question becomes, “Is this really the area that you want your\n", + "\n", + "engineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n", + "\n", + "**How long will it take? Can the organization afford to wait?**\n", + "\n", + "Even if you decide the software component provides a competitive advantage and is something worth\n", + "\n", + "building in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n", + "\n", + "time-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n", + "\n", + "business due to the anticipated delivery schedule. Keep in mind that software development projects usually\n", + "\n", + "take longer and cost more money than initially planned.\n", + "\n", + "The organization should understand the impact to the overall performance and capabilities of the daily\n", + "\n", + "ecosystem for any features tied to the in-house development effort. 
Your business partners likely do\n", + "\n", + "not care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n", + "\n", + "is reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\n", + "\n", + "features and schedule.\n", + "\n", + "\n", + "Databricks is built on top of popular open source\n", + "\n", + "software that it created. Engineering teams can\n", + "\n", + "improve the underpinnings of the Databricks\n", + "\n", + "platform by submitting code via pull request and\n", + "\n", + "becoming committers to the projects. The benefit\n", + "\n", + "to organizations is that their engineers contribute\n", + "\n", + "to the feature set of the data platform while\n", + "\n", + "Databricks remains responsible for all integration\n", + "\n", + "and performance testing plus all the runtime\n", + "\n", + "support, including failover and disaster recovery.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Don’t forget about the data**\n", + "\n", + "Perhaps the single most important feature of a modern data stack is its ability to help make data sets and\n", + "\n", + "“data assets” consumable to the end users or systems. Data insights, model training and model execution\n", + "\n", + "cannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n", + "\n", + "In large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n", + "\n", + "sets from multiple lines of business or departments. Focusing your data engineering and data science\n", + "\n", + "efforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n", + "\n", + "creating true competitive advantage.\n", + "\n", + "The amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n", + "\n", + "serve up data for analysis should not be underestimated. The value of this work is equally important to\n", + "\n", + "the business. The ability to curate data to enable game-changing insights should be the focus of the work\n", + "\n", + "led by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n", + "\n", + "engineers innovate on components that don’t bring true competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 9. Allocate, monitor and optimize costs\n", + "\n", + "Beginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n", + "\n", + "class of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n", + "\n", + "only one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n", + "\n", + "more manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n", + "\n", + "case anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n", + "\n", + "and increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n", + "\n", + "related personas to collaborate and operate from the same point of view. Lessons learned on the platform\n", + "\n", + "could be easily shared and reused by other members of the team. 
The more the team used the unified platform, the more they collaborated and their level of expertise increased.\n", + "\n", + "**Reduce complexity, reduce costs**\n", + "\n", + "The architectures of enterprise data warehouses (EDWs) and data lakes were either more limited or more complex — resulting in increased time to market and increased costs. This was mainly due to the requirement to perform ETL to explore data in the EDW or the need to split data using multiple pipelines for the data lake. The data lakehouse architecture simplifies the cost allocation because all the processing, serving and analytics are performed in a single compute layer.\n", + "\n", + "Organizations can rightsize the data environments and control costs using policies. The centralized and consistent approach to security, auditing and monitoring makes it easier to spot inefficiencies and bottlenecks in the data ecosystem. Performance improvements can be gained quickly as more platform expertise is developed within the workforce.\n", + "\n", + "The Databricks platform optimizes costs for your data and AI workloads by intelligently provisioning infrastructure only as you need it. Customers can establish policies that govern the size of clusters based on DEV, TEST, PROD environments or anticipated workloads.\n", + "\n", + "Databricks monitors and records usage and allows organizations to easily track costs on a data and AI workload basis. This provides the ability to implement an enterprise-wide chargeback model and put in place appropriate spending limits.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Centralized funding model**\n", + "\n", + "As previously mentioned, data transformation initiatives require substantial funding. Centralizing the budget under the CDO provides consistency and visibility into how funds are allocated and spent — increasing the likelihood of a positive ROI. Funding at the beginning of the initiative will be significantly higher than the funding in the out-years. It’s not uncommon to see 3- to 5-year project plans for larger organizations. Funding for years 1 and 2 is often reduced in years 3 and 4 and further reduced in year 5 — until it reaches a steady state that is more sustainable.\n", + "\n", + "The budget takes into account the cost of the data engineering function, commercial software licenses and building out the center of excellence to accelerate the data science capabilities of the organization. Again, the CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are focused on the overall implementation plan and to make sound build vs. buy decisions.\n", + "\n", + "It’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources in the CIO’s organization to perform the data engineering tasks. The data science community reports into the CDO and is matrixed into the lines of business in order to better understand the business drivers and the data sets. Finally, investing in data governance cannot wait until the company has suffered from a major regulatory challenge, a data breach or some other serious defense-related problem. CDOs should spend the necessary time to educate leaders throughout the organization on the value of data governance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Chargeback models**\n", + "\n", + "To establish the centralized budget to fund the data transformation initiative, some organizations impose a “tax” on each part of the organization — based on size as well as profit and loss. This base-level funding should be used to build the data engineering and data science teams needed to deploy the building blocks of the new data ecosystem. However, as different teams, departments and business units begin using the new data ecosystem, the infrastructure costs, both compute and storage, will begin to grow. The costs will not be evenly distributed, due to different levels of usage from the various parts of the organization. The groups with the heavier usage should obviously cover their pro rata share of the costs. This requires the ability to monitor and track usage — not only based on compute but also on the amount of data generated and consumed. This so-called chargeback model is an effective and fair way to cover the cost deltas over and above the base-level funding.\n", + "\n", + "Plus, not all the departments or lines of business will require the same level of compute power or fault tolerance. The architecture should support the ability to separate out the runtime portions of the data ecosystem and isolate the workloads based on the specific SLAs for the use cases in each environment. Some workloads cannot fail and their SLAs will require full redundancy, thus increasing the number of nodes in the cluster or even requiring multiple clusters operating in different cloud regions. In contrast, less critical workloads that can fail and be restarted can run on less costly infrastructure. This makes it easier to better manage the ecosystem by avoiding a one-size-fits-all approach and allocating costs to where the performance is needed most.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 10. Move to production and scale adoption\n", + "\n", + "Now that you’ve completed the hard work outlined in the first nine steps, it is time to put the new data ecosystem to use. In order to get truly game-changing results, organizations must be really disciplined at managing and using data to enable use cases that drive business value. They must also establish a clear set of metrics to measure adoption and track the net promoter score (NPS) so that the user experience continues to improve over time.\n", + "\n", + "**If you build it, they will come**\n", + "\n", + "Keep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data set registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless. A high level of automation for the registration process is important because it’s not uncommon to see thousands of data sets in large organizations. 
The business and technical metadata plus the data quality\n", + "\n", + "rules will help guarantee that the data lake is filled with consumable data. The lineage solution should\n", + "\n", + "provide a visualization that shows the data movement and verifies that the approved data flow paths are\n", + "\n", + "being followed.\n", + "\n", + "Some key metrics to keep an eye on are:\n", + "\n", + "\u0007Percentage of source systems contributing data to the ecosystem\n", + "\n", + "\u0007Percentage of real-time streaming relative to API and batch transfers\n", + "\n", + "\u0007Percentage of registered data sets with full business and technical metadata\n", + "\n", + "\u0007Volume of data written to the data lake\n", + "\n", + "\u0007Percentage of raw data that enters a data curation pipeline\n", + "\n", + "\u0007Volume of data consumed from the data lake\n", + "\n", + "\u0007Number of tables defined and populated with curated data\n", + "\n", + "\u0007Number of models trained with data from the data lake\n", + "\n", + "\u0007Lineage reports and anomaly detection incidents\n", + "\n", + "\u0007Number of users running Python, SQL, Scala and R workloads\n", + "\n", + "\n", + "In 2018, Databricks released MLflow — an open\n", + "\n", + "source platform to manage the ML lifecycle,\n", + "\n", + "including experimentation, reproducibility,\n", + "\n", + "deployment and a central model registry. MLflow\n", + "\n", + "is included in the Databricks Lakehouse Platform\n", + "\n", + "and accelerates the adoption of machine learning\n", + "\n", + "and AI in organizations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Communication plan**\n", + "\n", + "Communication is critical throughout the data transformation initiative — however, it is particularly\n", + "\n", + "important once you move into production. Time is precious and you want to avoid rework, if at all possible.\n", + "\n", + "Organizations often overlook the emotional and cultural toll that a long transformation process takes on\n", + "\n", + "the workforce. The seam between the legacy environment and the new data ecosystem is an expensive\n", + "\n", + "and exhausting place to be — because your business partners are busy supporting two data worlds. Most\n", + "\n", + "users just want to know when the new environment will be ready. They don’t want to work with partially\n", + "\n", + "completed features, especially while performing double duty.\n", + "\n", + "Establish a solid communication plan and set expectations for when features will come online. Make sure\n", + "\n", + "there is detailed documentation, training and a support/help desk to field users’ questions.\n", + "\n", + "**DevOps — software development + IT operations**\n", + "\n", + "Mature organizations develop a series of processes and standards for how software and data are developed,\n", + "\n", + "managed and delivered. The term “DevOps” comes from the software engineering world and refers to\n", + "\n", + "developing and operating large-scale software systems. DevOps defines how an organization, its developers,\n", + "\n", + "operations staff and other stakeholders establish the goal of delivering quality software reliably and\n", + "\n", + "repeatedly. In short, DevOps is a culture that consists of two practices: continuous integration (CI) and\n", + "\n", + "continuous delivery (CD).\n", + "\n", + "The CI portion of the process is the practice of frequently integrating newly written or changed code\n", + "\n", + "with the existing code repository. 
As software is written, it is continuously saved back to the source code\n", + "\n", + "repository, merged with other changes, built, integrated and tested — and this should occur frequently\n", + "\n", + "enough that the window between commit and build is narrow enough that no errors can occur without\n", + "\n", + "developers noticing them and correcting them immediately.\n", + "\n", + "This is particularly important for large, distributed teams to ensure that the software is always in a working\n", + "\n", + "state — despite the frequent changes from various developers. Only software that passes the CI steps is\n", + "\n", + "deployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n", + "\n", + "dependable releases.\n", + "\n", + "\n", + "Software development IT operations\n", + "\n", + "\n", + "-----\n", + "\n", + "**DataOps — data processing + IT operations**\n", + "\n", + "DataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n", + "\n", + "use the well-established processes from DevOps to consistently and reliably improve the quality of data\n", + "\n", + "used to power data and AI use cases. DataOps automates and streamlines the lifecycle management tasks\n", + "\n", + "needed for large volumes of data — basically, ensuring that the volume, velocity, variety and veracity of the\n", + "\n", + "data are taken into account as data flows through the environment. DataOps aims to reduce the end-to-\n", + "\n", + "end cycle time of data analytics — from idea, to exploration, to visualizations and to the creation of new\n", + "\n", + "data sets, data assets and models that create value.\n", + "\n", + "For DataOps to be effective, it must encourage collaboration, innovation and reuse among the stakeholders,\n", + "\n", + "and the data tooling should be designed to support the workflow and make all aspects of data curation and\n", + "\n", + "ETL more efficient.\n", + "\n", + "**MLOps — machine learning + IT operations**\n", + "\n", + "Not surprisingly, the term “MLOps” takes the DevOps approach and applies it to the machine learning and\n", + "\n", + "deep learning space — automating or streamlining the core workflow for data scientists. MLOps is a bit\n", + "\n", + "unique when compared with DevOps and DataOps because the approach to deploying effective machine\n", + "\n", + "learning models is far more iterative and requires much more experimentation — data scientists try different\n", + "\n", + "features, parameters and models in a tight iteration cycle. In all these iterations, they must manage the code\n", + "\n", + "base, understand the data used to perform the training and create reproducible results. The logging aspect\n", + "\n", + "of the ML development lifecycle is critical.\n", + "\n", + "MLOps aims to manage deployment of machine learning and deep learning models in large-scale\n", + "\n", + "production environments while also focusing on business and regulatory requirements. 
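Experiment tracking is what makes this iterative loop reproducible. As a minimal sketch only — the experiment name, hyperparameters and toy data below are illustrative and not part of the original text — logging one MLOps iteration with MLflow might look like this:

```python
import mlflow
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import roc_auc_score

# Illustrative placeholders: experiment path, hyperparameters and synthetic data.
mlflow.set_experiment("/Shared/churn-model")

X, y = make_classification(n_samples=500, n_features=10, random_state=42)

with mlflow.start_run():
    model = LogisticRegression(C=0.5, max_iter=200).fit(X, y)
    mlflow.log_param("C", 0.5)                       # record the hyperparameter tried
    auc = roc_auc_score(y, model.predict_proba(X)[:, 1])
    mlflow.log_metric("train_auc", auc)              # record the resulting metric
    mlflow.sklearn.log_model(model, "model")         # store the model artifact for later serving
```

Each run captures the parameters, metrics and model artifact, which is what allows the many iterations described here to be compared, audited and reproduced.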
The ideal MLOps environment would include data science tools where models are constructed and analytical engines where computations are performed.\n", + "\n", + "\n", + "Data processing + IT operations\n", + "\n", + "Machine learning + IT operations\n", + "\n", + "\n", + "-----\n", + "\n", + "The overall workflow for deploying production ML models is shown in Figure 10.\n", + "\n", + "Unlike most software applications that execute a series of discrete operations, ML platforms are not deterministic and are highly dependent on the statistical profile of the data they use. ML platforms can suffer performance degradation of the system due to changing data profiles. Therefore, the model has to be refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform should natively support this style of iterative data science.\n", + "\n", + "**Ethics in AI**\n", + "\n", + "As more organizations deploy data and AI solutions, there is growing concern around a number of issues related to ethics — in particular, how do you ensure the data and algorithms used to make decisions are fair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations must ensure that the “black box” algorithms that produce results have the transparency, interpretability and explainability to satisfy legal and regulatory safeguards.\n", + "\n", + "The vast majority of AI work still involves software development by human beings and the use of curated data sets. There is the obvious potential for bias and the application of AI in domains that are ethically questionable. CDOs are faced with the added challenge of needing to be able to defend the use of AI, explain how it works and describe the impact of its existence on the target audience — whether internal workers or customers.\n", + "\n", + "**Figure 10:** Workflow for deploying production ML models — data extraction, data preparation, data analysis, model training, model evaluation, model serving and execution, and model monitoring\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data and AI Maturity Model**\n", + "\n", + "When data and AI become part of the fabric of the company and the stakeholders in the organization adopt a data asset and AI mindset, the company moves further along a well-defined maturity curve, as shown in Figure 11.\n", + "\n", + "**Top-Line Categories and Ranking Criteria**\n", + "\n", + "**LOW MATURITY/VALUE** —> **HIGH MATURITY/VALUE**\n", + "\n", + "
**1. Explore** — Organization is beginning to explore big data and AI, and understand the possibilities and potential of a few starter projects and experiment\n", + "\n", + "**2. Experiment** — Organization builds the basic capabilities and foundations to begin exploring a more expansive data and AI strategy, but it lacks vision, long-term objectives or leadership buy-in\n", + "\n", + "**3. Formalize** — Data and AI are budding into drivers of value for BUs aligned to specific projects and initiatives as the core tenets of data and AI are integrated into corporate strategy\n", + "\n", + "**4. Optimize** — Data and AI are core drivers of value across the organization, structured and central to corporate strategy, with a scalable architecture that meets business needs and buy-in from across the organization\n", + "\n", + "**5. Transform** — Data and AI are at the heart of the corporate strategy and are invaluable differentiators and drivers of competitive advantage\n", + "\n", + "**Figure 11:**\n", + "The Data and AI Maturity Model\n", + "\n", + "Databricks partners with its customers to enable them to do an internal self-assessment. The output of the self-assessment allows organizations to:\n", + "\n", + "\u0007Understand the current state of their journey to data and AI maturity\n", + "\n", + "\u0007Identify key gaps in realizing (more) value from data and AI\n", + "\n", + "\u0007Plot a path to increase maturity with specific actions\n", + "\n", + "\u0007Identify Databricks resources who can help support their journey\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 3:**\n", + "## Conclusion\n", + "\n", + "\n", + "After a decade in which most enterprises took a hybrid approach to their data architecture — and struggled with the complexity, cost and compromise that come with supporting both data warehouses and data lakes — the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical to future-proofing your investment and enabling data and AI at scale. The simple, open and multicloud architecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to unleash the power of your data teams to collaborate like never before — in real time, with all their data, for every use case.\n", + "\n", + "For more information, please visit [Databricks](https://databricks.com/solutions/roles/data-leaders) or [contact us](https://databricks.com/company/contact).\n", + "\n", + "**ABOUT THE AUTHOR**\n", + "\n", + "Chris D’Agostino is the Global Field CTO at Databricks, having joined the company in January 2020. His role is to provide thought leadership and serve as a trusted advisor to our top customers, globally.\n", + "\n", + "Prior to Databricks, Chris ran a 1,000-person data engineering function for a top 10 U.S. bank. In that role, he led a team that was responsible for building out a modern data architecture that emphasized the key attributes of the lakehouse architecture.\n", + "\n", + "Chris has also held leadership roles at a number of technology companies.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. 
More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----
### eBook\n", + "\n", + "# A New Approach to Data Sharing\n", + "\n", + "#### Open data sharing and collaboration for data, analytics, and AI\n", + "\n", + "### Second Edition\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction — Data Sharing in Today’s Digital Economy\n", + "\n", + "**Chapter 1: What Is Data Sharing and Why Is It Important?**\n", + "\n", + "Common data sharing use cases\n", + "\n", + "Data monetization\n", + "\n", + "Data sharing with partners or suppliers (B2B)\n", + "\n", + "Internal lines of business (LOBs) sharing\n", + "\n", + "Key benefits of data sharing\n", + "\n", + "**Chapter 2: Conventional Methods of Data Sharing and Their Challenges**\n", + "\n", + "Legacy and homegrown solutions\n", + "\n", + "Proprietary vendor solutions\n", + "\n", + "Cloud object storage\n", + "\n", + "**Chapter 3: Delta Sharing — An Open Standard for Secure Sharing of Data Assets**\n", + "\n", + "What is Delta Sharing?\n", + "\n", + "Key benefits of Delta Sharing\n", + "\n", + "Maximizing value of data with Delta Sharing\n", + "\n", + "Data monetization with Delta Sharing\n", + "\n", + "B2B sharing with Delta Sharing\n", + "\n", + "Internal data sharing with Delta Sharing\n", + "\n", + "**Chapter 4: How Delta Sharing Works**\n", + "\n", + "**Chapter 5: Introducing Databricks Marketplace**\n", + "\n", + "What is Databricks Marketplace?\n", + "\n", + "Key benefits of Databricks Marketplace\n", + "\n", + "Enable collaboration and accelerate innovation\n", + "\n", + "Powered by a fast, growing ecosystem\n", + "\n", + "Use cases for an open marketplace\n", + "\n", + "New upcoming feature: AI model sharing\n", + "\n", + "**Chapter 6: Share securely with Databricks Clean Rooms**\n", + "\n", + "What is a data clean room?\n", + "\n", + "Common data clean room use cases\n", + "\n", + "Shortcomings of existing data clean rooms\n", + "\n", + "Key benefits of Databricks Clean Rooms\n", + "\n", + "**Resources: Getting started with Data Sharing and Collaboration**\n", + "\n", + "**About the Authors**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + " Data Sharing in Today’s Digital Economy\n", + "\n", + "\n", + "Today’s economy revolves around data. Every day, more and more organizations must exchange data with their customers, suppliers and partners. Security is critical. And yet, efficiency and immediate accessibility are equally important.\n", + "\n", + "Where data sharing may have been considered optional, it’s now required. More organizations are investing in streamlining internal and external data sharing across the value chain. But they still face major roadblocks — from human inhibition to legacy solutions to vendor lock-in.\n", + "\n", + "To be truly data-driven, organizations need a better way to share data. 
[Gartner predicts that by 2024](https://www.gartner.com/en/documents/3999501), organizations that promote data sharing will outperform their peers on most business value metrics. In addition, Gartner recently found that Chief Data Officers who have successfully executed data sharing initiatives are 1.7x more effective in showing business value and return on investment from their data analytics strategy.\n", + "\n", + "To compete in the digital economy, organizations need an open — and secure — approach to data sharing.\n", + "\n", + "This eBook takes a deep dive into the modern era of data sharing and collaboration, from common use cases and key benefits to conventional approaches and the challenges of those methods. You’ll get an overview of our open approach to data sharing and find out how Databricks allows you to share your data across platforms, to share all your data and AI, and to share all your data securely with unified governance in a privacy-safe way.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 1\n", + " What Is Data Sharing and Why Is It Important?\n", + "\n", + "Data sharing is the ability to make the same data available to one or many stakeholders — both external and internal. Nowadays, the ever-growing amount of data has become a strategic asset for any company. Data sharing — within your organization or externally — is an enabling technology for data commercialization and enhanced analysis. Sharing data as well as consuming data from external sources allows companies to collaborate with partners, establish new partnerships and generate new revenue streams with data monetization. Data sharing can deliver benefits to business groups across the enterprise. For those business groups, data sharing can enable access to data needed to make critical decisions. This includes but is not limited to roles such as the data analyst, data scientist and data engineer.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Common data sharing use cases\n", + "\n", + "\n", + "#### Data monetization\n", + "\n", + "Companies across industries are commercializing data. Large multinational organizations have formed exclusively to monetize data, while other organizations are looking for ways to monetize their data and generate additional revenue streams. Examples of these companies can range from an agency with an identity graph to a telecommunication company with proprietary 5G data or to retailers that have a unique ability to combine online and offline data. Data vendors are growing in importance as companies realize they need external data for better decision-making.\n", + "\n", + "\n", + "#### Data sharing with partners or suppliers (B2B)\n", + "\n", + "Many companies now strive to share data with partners and suppliers as similarly as they share it across their own organizations. 
For example,\n", + "\n", + "retailers and their suppliers continue to work more\n", + "\n", + "closely together as they seek to keep their products\n", + "\n", + "moving in an era of ever-changing consumer tastes.\n", + "\n", + "Retailers can keep suppliers posted by sharing sales\n", + "\n", + "data by SKU in real time, while suppliers can share\n", + "\n", + "real-time inventory data with retailers so they know\n", + "\n", + "what to expect. Scientific research organizations\n", + "\n", + "can make their data available to pharmaceutical\n", + "\n", + "companies engaged in drug discovery. Public safety\n", + "\n", + "agencies can provide real-time public data feeds\n", + "\n", + "of environmental data, such as climate change\n", + "\n", + "statistics or updates on potential volcanic eruptions.\n", + "\n", + "\n", + "#### Internal lines of business\n", + " (LOBs) sharing\n", + "\n", + "Within any company, different departments, lines\n", + "\n", + "of business and subsidiaries seek to share data so\n", + "\n", + "everyone can make decisions based on a complete\n", + "\n", + "view of the current business reality. For example,\n", + "\n", + "finance and HR departments need to share data\n", + "\n", + "as they analyze the true costs of each employee.\n", + "\n", + "Marketing and sales teams need a common view\n", + "\n", + "of data to determine the effectiveness of recent\n", + "\n", + "marketing campaigns. And different subsidiaries\n", + "\n", + "of the same company need a unified view of the\n", + "\n", + "health of the business. Removing data silos — which\n", + "\n", + "are often established for the important purpose of\n", + "\n", + "preventing unauthorized access to data — is critical\n", + "\n", + "for digital transformation initiatives and maximizing\n", + "\n", + "the business value of data.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key benefits of data sharing\n", + "\n", + "As you can see from the use cases described above, there are many benefits of data sharing, including:\n", + "\n", + "\n", + "**Greater collaboration with existing partners.** In today’s hyper-\n", + "\n", + "connected digital economy, no single organization can advance its\n", + "\n", + "business objectives without partnerships. Data sharing helps solidify\n", + "\n", + "existing partnerships and can help organizations establish new ones.\n", + "\n", + "\u0007 **Ability to generate new revenue streams.** With data sharing,\n", + "\n", + "organizations can generate new revenue streams by offering data\n", + "\n", + "products or data services to their end consumers.\n", + "\n", + "\n", + "**Ease of producing new products, services or business models.**\n", + "\n", + "Product teams can leverage both first-party data and third-party\n", + "\n", + "data to refine their products and services and expand their product/\n", + "\n", + "service catalog.\n", + "\n", + "**Greater efficiency of internal operations.** Teams across the\n", + "\n", + "organization can meet their business goals far more quickly when\n", + "\n", + "they don’t have to spend time figuring out how to free data from\n", + "\n", + "silos. When teams have access to live data, there’s no lag time\n", + "\n", + "between the need for data and the connection with the appropriate\n", + "\n", + "data source.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 2\n", + " Conventional Methods of Data Sharing and Their Challenges\n", + "\n", + "Sharing data across different platforms, companies and clouds is no easy task. 
In the past, organizations have hesitated to share data more freely because of the perceived lack of secure technology, competitive concerns and the cost of implementing data sharing solutions.\n", + "\n", + "Even for companies that have the budget to implement data sharing technology, many of the current approaches can’t keep up with today’s requirements for open-format, multicloud, high-performance solutions. Most data sharing solutions are tied to a single vendor, which creates friction for data providers and data consumers who use non-compatible platforms.\n", + "\n", + "Over the past 30 years, data sharing solutions have come in three forms: legacy and homegrown solutions, cloud object storage and closed source commercial solutions. Each of these approaches comes with its pros and cons.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Legacy and homegrown solutions\n", + "\n", + "Many companies have built homegrown data sharing solutions based on legacy technologies such as email, (S)FTP or APIs.\n", + "\n", + "**Figure 1:** Legacy data sharing solutions — the provider extracts batch data from its tables (Table 1, Table 2) via ETL onto an FTP/SSH/API server; the consumer then pulls the files from that server, runs its own ETL into a database, and only then can an analyst run analysis.\n", + "\n", + "**Pros**\n", + "\n", + "\u0007 **Vendor agnostic.** FTP, email and APIs are all well-documented protocols. Data consumers can leverage a suite of clients to access data provided to them.\n", + "\n", + "\u0007 **Flexibility.** Many homegrown solutions are built on open source technologies and will work both on-prem and on clouds.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Cons**\n", + "\n", + "\u0007 **Data movement.** It takes significant effort to extract data from cloud storage, transform it and host it on an FTP server for different recipients. Additionally, this approach results in creating copies of data sets. Data copying causes duplication and prevents organizations from instantly accessing live data.\n", + "\n", + "\u0007 **Complexity of sharing data.** Homegrown solutions are typically built on complex architectures due to replication and provisioning. This can add considerable time to data sharing activities and result in out-of-date data for end consumers.\n", + "\n", + "\u0007 **Operational overhead for data recipients.** Data recipients have to extract, transform and load (ETL) the shared data for their end use cases, which further delays the time to insights. 
For any new data updates from the providers, the consumers have to rerun ETL pipelines again and again.\n", + "\n", + "\u0007 **Security and governance.** As modern data requirements become more stringent, homegrown and legacy technologies have become more difficult to secure and govern.\n", + "\n", + "\u0007 **Scalability.** Such solutions are costly to manage and maintain and don’t scale to accommodate large data sets.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Proprietary vendor solutions\n", + "\n", + "Commercial data sharing solutions are a popular option among companies that don’t want to devote the time and resources to building an in-house solution yet also want more control than what cloud object storage can offer.\n", + "\n", + "**Figure 2:** Proprietary vendor solutions — data providers and data consumers exchange shared data sets in a proprietary data format on each vendor’s platform; sharing is limited to recipients on the same platform, with no cross-platform sharing between vendor platforms.\n", + "\n", + "**Pros**\n", + "\n", + "\u0007 **Simplicity.** Commercial solutions allow users to share data easily with anyone else who uses the same platform.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Cons**\n", + "\n", + "\u0007 **Vendor lock-in.** Commercial solutions don’t interop with other platforms well. While data sharing is easy among fellow customers, it’s usually impossible with those who use competing solutions. This reduces the reach of data, resulting in vendor lock-in. Furthermore, platform differences between data providers and recipients introduce data sharing complexities.\n", + "\n", + "\u0007 **Data movement.** Data must be loaded onto the platform, requiring additional ETL and data copies.\n", + "\n", + "\u0007 **Scalability.** Commercial data sharing comes with scaling limits from the vendors.\n", + "\n", + "\u0007 **Cost.** All the above challenges create additional cost for sharing data with potential consumers, as data providers have to replicate data for different recipients on different cloud platforms.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Cloud object storage\n", + "\n", + "
Object storage is considered a good fit for the cloud because it is elastic and can more easily scale into multiple petabytes to support unlimited data growth. The big three cloud providers all offer object storage services (AWS S3, Azure Blob, Google Cloud Storage) that are cheap, scalable and extremely reliable.\n", + "\n", + "An interesting feature of cloud object storage is the ability to generate signed URLs, which grant time-limited permission to download objects. Anyone who receives the presigned URL can then access the specified objects, making this a convenient way to share data.\n", + "\n", + "**Pros**\n", + "\n", + "\u0007 **Sharing data in place.** Object storage can be shared in place, allowing consumers to access the latest available data.\n", + "\n", + "\u0007 **Scalability.** Cloud object storage profits from availability and durability guarantees that typically cannot be achieved on-premises. Data consumers retrieve data directly from the cloud providers, saving bandwidth for the providers.\n", + "\n", + "**Cons**\n", + "\n", + "\u0007 **Limited to a single cloud provider.** Recipients have to be on the same cloud to access the objects.\n", + "\n", + "\u0007 **Cumbersome security and governance.** Assigning permissions and managing access is complex. Custom application logic is needed to generate signed URLs.\n", + "\n", + "\u0007 **Complexity.** Personas managing data sharing (DBAs, analysts) find it difficult to understand Identity Access Management (IAM) policies and how data is mapped to underlying files. For companies with large volumes of data, sharing via cloud storage is time-consuming, cumbersome and nearly impossible to scale.\n", + "\n", + "\u0007 **Operational overhead for data recipients.** The data recipients have to run extract, transform and load (ETL) pipelines on the raw files before consuming them for their end use cases.\n", + "\n", + "The lack of a comprehensive solution makes it challenging for data providers and consumers to easily share data. Cumbersome and incomplete data sharing processes also constrain the development of business opportunities from shared data.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 3\n", + " Delta Sharing — An Open Standard for Secure Sharing of Data Assets\n", + "\n", + "\n", + "We believe the future of data sharing should be characterized by open technology. Data sharing shouldn’t be tied to a proprietary technology that introduces unnecessary limitations and financial burdens to the process. It should be readily available to anyone who wants to share data at scale. This philosophy inspired us to develop and release a new protocol for sharing data: Delta Sharing.\n", + "\n", + "#### What is Delta Sharing?\n", + "\n", + "Delta Sharing provides an open solution to securely share live data from your lakehouse to any computing platform. Recipients don’t have to be on the Databricks platform or on the same cloud or a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value.\n", + "\n", + "Data providers can centrally manage, govern, audit and track usage of the shared data on one platform. 
Delta Sharing is natively integrated with [Unity Catalog](https://databricks.com/product/unity-catalog) , enabling organizations to centrally manage and audit shared data across organizations and confidently share data assets while meeting security and compliance needs.\n", + "\n", + "With Delta Sharing, organizations can easily share existing large-scale data sets based on the open source formats Apache Parquet and Delta Lake without moving data. Teams gain the flexibility to query, visualize, transform, ingest or enrich shared data with their tools of choice.\n", + "\n", + "\n", + "have to be on the Databricks platform or on the same cloud or a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 3:**\n", + "Delta Sharing. A data provider’s Delta Lake tables are exposed through a Delta Sharing server governed by access permissions; data recipients on any cloud or on-premises consume them with any tool for analytics, BI and data science use cases, with no replication, easy management and security.\n", + "\n", + "\n", + "Databricks designed Delta Sharing with five goals in mind:\n", + "\n", + "\u0007Provide an open cross-platform sharing solution\n", + "\n", + "\u0007Share live data without copying it to another system\n", + "\n", + "\u0007Support a wide range of clients such as Power BI, Tableau, Apache Spark™, pandas and Java, and provide flexibility to consume data using the tools of choice for BI, machine learning and AI use cases\n", + "\n", + "\u0007Provide strong security, auditing and governance\n", + "\n", + "\u0007Scale to massive structured data sets and also allow sharing of unstructured data and future data derivatives such as ML models, dashboards and notebooks, in addition to tabular data\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key benefits of Delta Sharing\n", + "\n", + "By eliminating the obstacles and shortcomings associated with typical data sharing approaches, Delta Sharing delivers several key benefits, including:\n", + "\n", + "\n", + "**Open cross-platform sharing.** Delta Sharing establishes a new open standard for secure data sharing and supports open source Delta and Apache Parquet formats. Data recipients don’t have to be on the Databricks platform or on the same cloud, as Delta Sharing works across clouds and even from cloud to on-premises setups. 
To\n", + "\n", + "give customers even greater flexibility, Databricks has also released\n", + "\n", + "open source connectors for pandas, Apache Spark, Elixir and\n", + "\n", + "Python, and is working with partners on many more.\n", + "\n", + "\u0007 **Securely share live data without replication.** Most enterprise\n", + "\n", + "\n", + "**Centralized governance.** With Databricks Delta Sharing, data\n", + "\n", + "providers can grant, track, audit and even revoke access to shared\n", + "\n", + "data sets from a single point of enforcement to meet compliance and\n", + "\n", + "other regulatory requirements. Databricks Delta Sharing users get:\n", + "\n", + "\u0007Implementation of Delta Sharing as part of Unity Catalog, the\n", + "\n", + "governance offering for Databricks Lakehouse\n", + "\n", + "\u0007Simple, more secure setup and management of shares\n", + "\n", + "\u0007The ability to create and manage recipients and data shares\n", + "\n", + "\u0007Audit logging captured automatically as part of Unity Catalog\n", + "\n", + "\u0007Direct integration with the rest of the Databricks ecosystem\n", + "\n", + "\u0007No separate compute for providing and managing shares\n", + "\n", + "\n", + "data today is stored in cloud data lakes. Any of these existing data\n", + "\n", + "sets on the provider’s data lake can easily be shared without any\n", + "\n", + "data replication or physical movement of data. Data providers can\n", + "\n", + "update their data sets reliably in real time and provide a fresh and\n", + "\n", + "consistent view of their data to recipients.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Share data products, including AI models, dashboards and**\n", + "\n", + "**notebooks, with greater flexibility.** Data providers can choose\n", + "\n", + "between sharing anentire table or sharing only a version or\n", + "\n", + "specific partitions of a table. However, sharing just tabular data\n", + "\n", + "is not enough to meet today’s consumer demands. Delta Sharing\n", + "\n", + "also supports sharing of non-tabular data and data derivatives\n", + "\n", + "such as data streams, AI models, SQL views and arbitrary files,\n", + "\n", + "enablingincreased collaboration and innovation. Data providers can\n", + "\n", + "build, package and distribute data products including data sets,\n", + "\n", + "AI and notebooks, allowingdata recipients to get insights faster.\n", + "\n", + "Furthermore, this approach promotes and empowers the exchange\n", + "\n", + "of knowledge — not just data — between different organizations.\n", + "\n", + "\n", + "**Share data at a lower cost.** Delta Sharing lowers the cost of\n", + "\n", + "managing and consuming shares for both data providers and\n", + "\n", + "recipients. Providers can share data from their cloud object store\n", + "\n", + "without replicating, thereby reducing the cost of storage. Incontrast,\n", + "\n", + "existing data sharing platforms require data providers to first move\n", + "\n", + "their data into their platform or store data in proprietary formats in\n", + "\n", + "their managed storage, which often costs more and results in data\n", + "\n", + "duplication. With Delta Sharing, data providers don’t need to set\n", + "\n", + "up separate computing environments to share data. 
Consumers\n", + "\n", + "can access shared data directly using their tools of choice without\n", + "\n", + "setting up specific consumption ecosystems, thereby reducing\n", + "\n", + "costs.\n", + "\n", + "\n", + "With Delta Sharing we are able to achieve a truly open marketplace\n", + "\n", + "and truly open ecosystem. In contrast, commercial products are\n", + "\n", + "mostly limited to sharing raw tabular data and cannot be used to\n", + "\n", + "\n", + "share these higher-valued data derivatives.\n", + "\n", + "\n", + "\u0007 **Reduced time-to-value.** Delta Sharing eliminates the need to\n", + "\n", + "set up a new ingestion process to consume data. Data recipients\n", + "\n", + "can directly access the fresh data and query it using tools of their\n", + "\n", + "choice. Recipients can also enrich data with data sets from popular\n", + "\n", + "data providers. The Delta Sharing ecosystem of open source and\n", + "\n", + "commercial partners is growing every day.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Maximizing value of data with Delta Sharing\n", + "\n", + "Delta Sharing is already transforming data sharing activities for companies in a wide range of industries. Given the sheer\n", + "\n", + "variety of data available and the technologies that are emerging, it is hard to anticipate all the possible use cases Delta\n", + "\n", + "Sharing can address. The Delta Sharing approach is to share any data anytime with anyone easily and securely.\n", + "\n", + "In this section we will explore the building blocks of such an approach and the use cases emerging from these.\n", + "\n", + "\n", + "“Delta Sharing helped us streamline our data delivery process\n", + "\n", + "for large data sets. This enables our clients to bring their own\n", + "\n", + "compute environment to read fresh curated data with little-to-\n", + "\n", + "no integration work, and enables us to continue expanding our\n", + "\n", + "catalog of unique, high-quality data products.”\n", + "\n", + "— **William Dague** , Head of Alternative Data, Nasdaq\n", + "\n", + "\n", + "“We recognize that openness of data will play a key role in\n", + "\n", + "achieving Shell’s Carbon Net Zero ambitions. Delta Sharing\n", + "\n", + "provides Shell with a standard, controlled and secure protocol\n", + "\n", + "for sharing vast amounts of data easily with our partners to work\n", + "\n", + "toward these goals without requiring our partners be on the same\n", + "\n", + "data sharing platform.”\n", + "\n", + "— **Bryce Bartmann** , Chief Digital Technology Advisor, Shell\n", + "\n", + "\n", + "“Leveraging the powerful capabilities of Delta Sharing from\n", + "\n", + "\n", + "Databricks enables Pumpjack Dataworks to have a faster\n", + "\n", + "onboarding experience, removing the need for exporting,\n", + "\n", + "importing and remodeling of data, which brings immediate\n", + "\n", + "value to our clients. Faster results yield greater commercial\n", + "\n", + "opportunity for our clients and their partners.”\n", + "\n", + "\n", + "“Data accessibility is a massive consideration for us. 
We believe that Delta Sharing will simplify data pipelines by enabling us to query fresh data from the place where it lives, and we are not locked into any platform or data format.”\n", + "\n", + "— **Rayne Gaisford** , Global Head of Data Strategy, Jefferies\n", + "\n", + "\n", + "— **Corey Zwart** , Head of Engineering, Pumpjack Dataworks\n", + "\n", + "“As a data company, giving our customers access to our data sets is critical. The Databricks Lakehouse Platform with Delta Sharing really streamlines that process, allowing us to securely reach a much broader user base regardless of cloud or platform.”\n", + "\n", + "— **Felix Cheung** , VP of Engineering, SafeGraph\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data monetization with Delta Sharing\n", + "\n", + "Delta Sharing enables companies to monetize their data products simply and with the necessary governance.\n", + "\n", + "\n", + "**Figure 4:**\n", + "Data monetization with Delta Sharing. A data vendor governs its data products with Unity Catalog and cloud storage; fulfillment entitles data consumers (including non-Databricks customers on any cloud or on-premises storage) to read-only access over Delta Sharing, with billing and audit logs tracking usage.\n", + "\n", + "\n", + "-----\n", + "\n", + "With Delta Sharing, a data provider can seamlessly share large data sets and overcome the scalability issues associated with SFTP servers. Data providers can easily expand their data product lines since Delta Sharing doesn’t require you to build a dedicated service for each of your data products like API services would. The company simply grants and manages access to the data recipients instead of replicating the data — thereby reducing complexity and latency. Any data that exits your ELT/ETL pipelines becomes a candidate for a data product. Any data that exists on your platform can be securely shared with your consumers. This grants a wider addressable market — your products have appeal to a broader range of consumers, from those who say “we need access to your raw data only” to those who say “we want only small subsets of your Gold layer data.”\n", + "\n", + "To mitigate cost concerns, Delta Sharing maintains an audit log that tracks any permitted access to the data. 
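As a rough provider-side sketch of the grant-and-manage-access model described above, the snippet below uses Unity Catalog SQL from a Databricks notebook (via the built-in `spark` session) to create a share, add an existing Delta table and grant a recipient read access. The catalog, table, share and recipient names are hypothetical placeholders, and the exact workflow depends on your workspace setup.

```python
# Hedged sketch of provider-side Delta Sharing management with Unity Catalog SQL.
# All object names (sales_catalog.gold.daily_trades, acme_share, acme_corp) are hypothetical.
share_name = "acme_share"
recipient_name = "acme_corp"

spark.sql(f"CREATE SHARE IF NOT EXISTS {share_name}")

# Expose an existing Delta table in place: no copies, no dedicated per-product service.
spark.sql(f"ALTER SHARE {share_name} ADD TABLE sales_catalog.gold.daily_trades")

# Create a recipient and grant read-only access to the share.
spark.sql(f"CREATE RECIPIENT IF NOT EXISTS {recipient_name}")
spark.sql(f"GRANT SELECT ON SHARE {share_name} TO RECIPIENT {recipient_name}")
```

Because the table is shared in place, there is no replication step, and removing a data product from the market is simply a matter of revoking the grant.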
Data providers can use this information to determine the costs\n", + "\n", + "associated with any of the data products and evaluate if such products are commercially\n", + "\n", + "viable and sensible.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### B2B sharing with Delta Sharing\n", + "\n", + "Cloud Storage\n", + "\n", + "Partner A\n", + "\n", + "Unity\n", + "Catalog\n", + "\n", + "\n", + "Partner U\n", + "\n", + "\n", + "Unity\n", + "Catalog\n", + "\n", + "Cloud Storage\n", + "\n", + "Partner B\n", + "\n", + "N o n - D ata b r i c k s C u s t o m e r\n", + "O n a n y c lo u d o r o n - p r e m i s e s\n", + "\n", + "Storage\n", + "\n", + "\n", + "R/O R/O\n", + "\n", + "R/O\n", + "\n", + "\n", + "**Figure 5:**\n", + "B2B sharing with\n", + "Delta Sharing\n", + "\n", + "\n", + "Delta\n", + "Sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "Delta Sharing applies in the case of bidirectional exchange of data.\n", + "\n", + "Companies use Delta Sharing to incorporate partners and suppliers\n", + "\n", + "seamlessly into their workflows. Traditionally, this is not an easy task.\n", + "\n", + "An organization typically has no control over how their partners are\n", + "\n", + "implementing their own data platforms. The complexity increases\n", + "\n", + "when we consider that the partners and suppliers can reside in\n", + "\n", + "a public cloud, private cloud or an on-premises deployed data\n", + "\n", + "platform. The choices of platform and architecture are not imposed\n", + "\n", + "on your partners and suppliers. Due to its open protocol, Delta\n", + "\n", + "Sharing addresses this requirement foundationally. Through a wide\n", + "\n", + "array of existing connectors (and many more being implemented),\n", + "\n", + "your data can land anywhere your partners and suppliers need to\n", + "\n", + "consume it.\n", + "\n", + "\n", + "In addition to the location of data consumer residency, the\n", + "\n", + "complexity of data arises as a consideration. The traditional\n", + "\n", + "approach to sharing data using APIs is inflexible and imposes\n", + "\n", + "additional development cycles on both ends of the exchange in\n", + "\n", + "order to implement both the provider pipelines and consumer\n", + "\n", + "pipelines. With Delta Sharing, this problem can be abstracted. Data\n", + "\n", + "can be shared as soon as it lands in the Delta table and when the\n", + "\n", + "shares and grants are defined. There are no implementation costs\n", + "\n", + "on the provider side. On the consumer side, data simply needs\n", + "\n", + "to be ingested and transformed into an expected schema for the\n", + "\n", + "downstream processes.\n", + "\n", + "This means that you can form much more agile data exchange\n", + "\n", + "patterns with your partners and suppliers and attain value from your\n", + "\n", + "combined data much quicker than ever before.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Internal data sharing with Delta Sharing\n", + "\n", + "Internal data sharing is becoming an increasingly important consideration for any modern\n", + "\n", + "organization, particularly where data describing the same concepts have been produced in\n", + "\n", + "different ways and in different data silos across the organization. 
In this situation it is important to design systems and platforms that allow governed and intentional federation of data and processes, and at the same time allow easy and seamless integration of said data and processes.\n", + "\n", + "Architectural design patterns such as Data Mesh have emerged to address these specific challenges and considerations. Data Mesh architecture assumes a federated design and dissemination of ownership and responsibility to business units or divisions. This, in fact, has several advantages, chief among them that data is owned by the parts of the organization closest to the source of the data. Data residence is naturally enforced since data sits within the geo-locality where it has been generated. Finally, data volumes and data variety are kept in control due to the localization within a data domain (or data node). On the other hand, the architecture promotes exchange of data between different data domains when that data is needed to deliver outcomes and better insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Figure 6:**\n", + "Building a Data Mesh with Delta Sharing. Business units in different regions, each with its own Unity Catalog and cloud storage, exchange read-only data with one another through Delta Sharing, including business units on non-Databricks platforms on any cloud or on-premises storage.\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog enables consolidated data access control across different data domains within an organization using the Lakehouse on Databricks. In addition, Unity Catalog adds a set of simple and easy-to-use declarative APIs to govern and control data exchange patterns between the data domains in the Data Mesh.\n", + "\n", + "To make matters even more complicated, organizations can grow through mergers and acquisitions. In such cases we cannot assume that organizations being acquired have followed the same set of rules and standards to define their platforms and produce their data. Furthermore, we cannot even assume that they have used the same cloud providers, nor can we assume the complexity of their data models. Delta Sharing can simplify and accelerate the unification and assimilation of newly acquired organizations and their data and processes. Individual organizations can be treated as new data domains in the overarching mesh. Only selected data sources can be exchanged between the different platforms. 
This enables teams to move freely between the organizations that are merging without losing their data — if anything, they are empowered to drive insights of higher quality by combining the data of both.\n", + "\n", + "With Unity Catalog and Delta Sharing, the Lakehouse architecture seamlessly combines with the Data Mesh architecture to deliver more power than ever before, pushing the boundaries of what’s possible and simplifying activities that were deemed daunting not so long ago.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 4\n", + " How Delta Sharing Works\n", + "\n", + "\n", + "Delta Sharing is designed to be simple, scalable, nonproprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing is natively integrated with Unity Catalog, which enables customers to add fine-grained governance and security controls, making it easy and safe to share data internally or externally.\n", + "\n", + "Delta Sharing is a simple REST protocol that securely grants temporary access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3, Azure ADLS or Google’s GCS — to reliably grant read-only access to large data sets. Here’s how it works for data providers and data recipients.\n", + "\n", + "\n", + "**Figure 7:**\n", + "How Delta Sharing works connecting data providers and data recipients. The provider’s Delta Lake Parquet files sit behind a Delta Sharing server that enforces access permissions; the recipient’s Delta Sharing client requests a table and receives pre-signed short-lived URLs, giving temporary direct access to the Parquet files in the object store (AWS S3, GCP, ADLS).\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data providers\n", + "\n", + "The data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider decides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages recipient access. To manage shares and recipients, you can use SQL commands, the Unity Catalog CLI or the intuitive user interface.\n", + "\n", + "#### Data recipients\n", + "\n", + "The data recipient only needs one of the many Delta Sharing clients that support the protocol. Databricks has released open source connectors for pandas, Apache Spark, Java and Python, and is working with partners on many more (see the consumption sketch below).\n", + "\n", + "\n", + "#### The data exchange\n", + "\n", + "The Delta Sharing data exchange follows three efficient steps:\n", + "\n", + "**1.** \u0007The recipient’s client authenticates to the sharing server and asks to query a specific table. 
The client can also provide filters\n", + "\n", + "on the data (for example, “country=US”) as a hint to read just a\n", + "\n", + "subset of the data.\n", + "\n", + "**2.** \u0007The server verifies whether the client is allowed to access the\n", + "\n", + "data, logs the request, and then determines which data to send\n", + "\n", + "back. This will be a subset of the data objects in cloud storage\n", + "\n", + "systems that make up the table.\n", + "\n", + "**3.** \u0007To allow temporary access to the data, the server generates\n", + "\n", + "short-lived presigned URLs that allow the client to read Parquet\n", + "\n", + "files directly from the cloud provider so that the read-only\n", + "\n", + "access can happen in parallel at massive bandwidth, without\n", + "\n", + "streaming through the sharing server.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 5\n", + " Introducing Databricks Marketplace\n", + "\n", + "\n", + "Enterprises need open collaboration for data and AI. Data sharing\n", + "\n", + "— within an organization or externally — allows companies to\n", + "\n", + "collaborate with partners, establish new partnerships and generate\n", + "\n", + "new revenue streams with data monetization.\n", + "\n", + "The demand for generative AI is driving disruption across industries,\n", + "\n", + "increasing the urgency for technical teams to build generative AI\n", + "\n", + "models and Large Language Models (LLMs) on top of their own data\n", + "\n", + "to differentiate their offerings.\n", + "\n", + "\n", + "Traditional data marketplaces are restricted and offer only data or\n", + "\n", + "simple applications, therefore limiting their value to data consumers.\n", + "\n", + "They also don’t offer tools to evaluate the data assets beyond basic\n", + "\n", + "descriptions or examples. Finally, data delivery is limited, often\n", + "\n", + "requiring ETL or a proprietary delivery mechanism.\n", + "\n", + "Enterprises need a better way to share data and AI that is flexible,\n", + "\n", + "secure and unlocks business value. 
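To ground the three-step exchange described in Chapter 4, here is a minimal recipient-side sketch using the open source `delta-sharing` Python connector. The profile file path and the share/schema/table coordinates are hypothetical; the profile is the small credential file a provider issues to a recipient.

```python
# Hedged sketch: reading a shared table with the open source delta-sharing connector.
# `pip install delta-sharing`; the profile file and table coordinates below are hypothetical.
import delta_sharing

profile_file = "/path/to/config.share"  # credential file issued by the provider
table_url = f"{profile_file}#acme_share.gold.daily_trades"  # <profile>#<share>.<schema>.<table>

# List what the provider has shared with this recipient.
client = delta_sharing.SharingClient(profile_file)
print(client.list_all_tables())

# Load the shared table into pandas; under the hood the client authenticates,
# receives short-lived presigned URLs and reads the Parquet files directly.
df = delta_sharing.load_as_pandas(table_url)
print(df.head())
```

For larger tables, the same connector also exposes a Spark reader (`delta_sharing.load_as_spark`), so shared data can be processed in parallel without streaming through the sharing server.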
An ecosystem makes data sharing and collaboration powerful.\n", + "\n", + "\n", + "**Today, data marketplaces present many challenges and collaboration can be complex for both data consumers and data providers.**\n", + "\n", + "**Data Consumers:** Focus on data only or simple applications | Lengthy discovery and evaluation | Delayed time-to-insights with vendor lock-in\n", + "\n", + "**Data Providers:** Limited opportunities to monetize new types of assets | Difficulty reaching more users | Lack of secure technology and unified governance\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Challenges in today's data marketplaces\n", + "\n", + "**Data Consumers**\n", + "\n", + "\u0007 **Focus on data only or simple applications:** Accessing only data sets means organizations looking to take advantage of AI/ML need to look elsewhere or start from scratch, causing delays in driving business insights.\n", + "\n", + "\u0007 **Lengthy discovery and evaluation:** The tools most marketplaces provide for data consumers to evaluate data are simply descriptions and example SQL statements. Minimal evaluation tools mean it takes more time to figure out if a data product is right for you, which might include more time in back-and-forth messages with a provider or searching for a new provider altogether.\n", + "\n", + "\u0007 **Delayed time-to-insights with vendor lock-in:** Delivery through proprietary sharing technologies or FTP means either vendor lock-in or lengthy ETL processes to get the data where you need to work with it.\n", + "\n", + "**Data Providers**\n", + "\n", + "\u0007 **Limited opportunities to monetize new types of assets:** A data-only approach means organizations are unable to monetize anything beyond a data set and will face more friction to create new revenue opportunities with non-compatible platforms.\n", + "\n", + "\u0007 **Difficulty reaching more users:** Data providers must choose between forgoing potential business or incurring the expense of replicating data.\n", + "\n", + "\u0007 **Lack of secure technology and unified governance:** Without open standards for sharing data securely across platforms and clouds, data providers must use multiple tools to secure access to scattered data, leading to inconsistent governance.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is Databricks Marketplace?\n", + "\n", + "\n", + "approach allows you to put your data to work more quickly in every cloud with your tools of choice.\n", + "\n", + "Marketplace brings together a vast ecosystem of data consumers and data providers to collaborate across a wide array of data sets without platform dependencies, complicated ETL, expensive replication and vendor lock-in.\n", + "\n", + "\n", + "Databricks Marketplace is an open marketplace for all your data, analytics and AI, powered by Delta 
Sharing.\n", + "\n", + "Since Marketplace is powered by Delta Sharing, you can benefit\n", + "\n", + "from open source flexibility and no vendor lock-in, enabling you\n", + "\n", + "to collaborate across all platforms, clouds and regions. This open\n", + "\n", + "\n", + "#### Key Benefits of Databricks Marketplace\n", + "\n", + "**Consumers** **Providers**\n", + "\n", + "\n", + "Databricks\n", + "Marketplace\n", + "provides key benefits\n", + "for both data\n", + "consumers and data\n", + "providers.\n", + "\n", + "\n", + "Discover more\n", + "\n", + "than just data\n", + "\n", + "\n", + "Reach users\n", + "\n", + "on any platform\n", + "\n", + "\n", + "Reach users\n", + "\n", + "\n", + "Evaluate data\n", + "\n", + "products faster\n", + "\n", + "Avoid vendor lock-in\n", + "\n", + "\n", + "Monetize more\n", + "\n", + "than just data\n", + "\n", + "\n", + "Monetize more\n", + "\n", + "\n", + "Share data securely\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Databricks Marketplace drives innovation and expands revenue opportunities\n", + "\n", + "\n", + "##### Data Consumers\n", + "\n", + " For data consumers, the Databricks Marketplace dramatically expands the opportunity to deliver innovation and advance analytics and AI initiatives.\n", + "\n", + "**Discover more than just data:** Access more than just data sets,\n", + "\n", + "including AI models, notebooks, applications and solutions.\n", + "\n", + "**Evaluate data products faster:** Pre-built notebooks and sample\n", + "\n", + "data help you quickly evaluate and have much greater confidence\n", + "\n", + "that a data product is right for your AI or analytics initiatives.\n", + "\n", + "Obtain the fastest and simplest time to insight.\n", + "\n", + "**Avoid vendor lock-in:** Substantially reduce the time to deliver\n", + "\n", + "insights and avoid lock-in with open and seamless sharing and\n", + "\n", + "collaboration across clouds, regions, or platforms. Directly\n", + "\n", + "integrate with your tools of choice and right where you work.\n", + "\n", + "\n", + "##### Data Providers\n", + "\n", + " For data providers, the Databricks Marketplace enables them the ability to reach new users and unlock new revenue opportunities.\n", + "\n", + "**Reach users on any platform:** Expand your reach across\n", + "\n", + "platforms and access a massive ecosystem beyond walled\n", + "\n", + "gardens. Streamline delivery of simple data sharing to any cloud\n", + "\n", + "or region, without replication.\n", + "\n", + "**Monetize more than just data:** Monetize the broadest set of\n", + "\n", + "data assets including data sets, notebooks, AI models to reach\n", + "\n", + "more data consumers.\n", + "\n", + "**Share data securely:** Share all your data sets, notebooks, AI\n", + "\n", + "models, dashboards and more securely across clouds, regions\n", + "\n", + "and data platforms.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Enable collaboration and accelerate innovation\n", + "\n", + "\n", + "#### Powered by a fast, growing ecosystem\n", + "\n", + "Enterprises need open collaboration for data and AI. 
In the past few\n", + "\n", + "months, we've continued to increase partners across industries,\n", + "\n", + "including Retail, Communications and Media & Entertainment,\n", + "\n", + "\n", + "\u0007 **Advertising and Retail**\n", + "\n", + "Incorporate shopper behavior analysis | Ads uplift/\n", + "\n", + "performance | Demand forecasting | “Next best SKU”\n", + "\n", + "prediction | Inventory analysis | Live weather data\n", + "\n", + "\n", + "Financial Services, with 520+ listings you can explore in our open\n", + "\n", + "\n", + "\u0007 **Finance**\n", + "\n", + "Incorporate data from stock exchange to predict\n", + "\n", + "economic impact | Market research | Public census and\n", + "\n", + "housing data to predict insurance sales\n", + "\n", + "\u0007 **Healthcare and Life Sciences**\n", + "\n", + "Genomic target identification | Patient risk scoring\n", + "\n", + "Accelerating drug discovery | Commercial effectiveness |\n", + "\n", + "Clinical research\n", + "\n", + "For more on Databricks Marketplace,\n", + "\n", + "go to [marketplace.databricks.com](http://marketplace.databricks.com) , or refer to the\n", + "\n", + "Resources section on page 41 .\n", + "\n", + "\n", + "Marketplace from 80+ providers and counting.\n", + "\n", + "#### Use cases for an open marketplace\n", + "\n", + "Organizations across all industries have many use cases for\n", + "\n", + "consuming and sharing third-party data from the simple (dataset\n", + "\n", + "joins) to the more advanced (AI notebooks, applications and\n", + "\n", + "dashboards).\n", + "\n", + "\n", + "-----\n", + "\n", + "#### New upcoming feature: AI model sharing\n", + "\n", + "\n", + "Nowadays, it may seem like every organization wants to become\n", + "\n", + "an AI organization. However, most organizations are new to AI.\n", + "\n", + "Databricks has heard from customers that they want to discover\n", + "\n", + "out-of-the-box AI models on Marketplace to help them kickstart\n", + "\n", + "their AI innovation journey.\n", + "\n", + "To meet this demand, Databricks will be adding AI model sharing\n", + "\n", + "capabilities on Marketplace to provide users access to both OSS\n", + "\n", + "and proprietary AI (both first-and third-party) models. This will\n", + "\n", + "enable data consumers and providers to discover and monetize AI\n", + "\n", + "models and integrate AI into their data solutions.\n", + "\n", + "\n", + "Using this feature, data consumers can evaluate AI models with\n", + "\n", + "rich previews, including visualizations and pre-built notebooks with\n", + "\n", + "sample data. With Databricks Marketplace, there are no difficult\n", + "\n", + "data delivery mechanisms — you can get the AI models instantly\n", + "\n", + "with the click of a button. All of this works out-of-the-box with the AI\n", + "\n", + "capabilities of the Databricks Lakehouse Platform for both real-time\n", + "\n", + "and batch inference. For real-time inference, you can use model\n", + "\n", + "serving endpoints. For batch inference, you can invoke the models\n", + "\n", + "as functions directly from DBSQL or notebooks.\n", + "\n", + "With AI model sharing, Databricks customers will have access\n", + "\n", + "to best-in-class models from leading providers, as well as OSS\n", + "\n", + "models published by Databricks which can be quickly and securely\n", + "\n", + "applied on top of their data. 
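As a rough sketch of the real-time inference path mentioned above, the snippet below calls a Databricks model serving endpoint through the MLflow deployments client. The endpoint name and input payload are hypothetical and depend on the model installed from Marketplace; treat this as an illustration rather than the Marketplace workflow itself.

```python
# Hedged sketch: real-time inference against a Databricks model serving endpoint.
# The endpoint name ("marketplace-summarizer") and payload shape are hypothetical.
from mlflow.deployments import get_deploy_client

client = get_deploy_client("databricks")

response = client.predict(
    endpoint="marketplace-summarizer",
    inputs={"dataframe_records": [{"text": "Quarterly revenue grew 12% driven by ..."}]},
)
print(response)
```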
Databricks will curate and publish\n", + "\n", + "its own open source models across common use cases, such as\n", + "\n", + "instruction-following and text summarization, and optimize tuning or\n", + "\n", + "deployment of these models.\n", + "\n", + "Using AI models from Databricks Marketplace can help your\n", + "\n", + "organization summarize complex information quickly and easily to\n", + "\n", + "help accelerate the pace of innovation.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Chapter 6\n", + " Share securely with Databricks Clean Rooms\n", + "\n", + "\n", + "While the demand for external data to make data-driven\n", + "\n", + "innovations is greater than ever, there is growing concern among\n", + "\n", + "organizations around data privacy. The need for organizations to\n", + "\n", + "share data and collaborate with their partners and customers in a\n", + "\n", + "secure, governed and privacy-centric way is driving the concept\n", + "\n", + "of “data clean rooms.”\n", + "\n", + "\n", + "#### What is a data clean room?\n", + "\n", + "A data clean room provides a secure, governed and privacy-safe\n", + "\n", + "environment where participants can bring their sensitive data, which\n", + "\n", + "might include personally identifiable information (PII), and perform\n", + "\n", + "joint analysis on that private data. Participants have full control\n", + "\n", + "of the data and can decide which participants can perform what\n", + "\n", + "analysis without exposing any sensitive data.\n", + "\n", + "\n", + "###### Collaborator A\n", + " Data Cleanroom\n", + "E.G., AGENCIES, PUBLISHERS, MVPDS, RETAILERS\n", + "\n", + "\u0007What is our audience overlap?\n", + "\n", + "\n", + "###### Collaborator B\n", + "\n", + "E.G., ADVERTISERTS\n", + "\n", + "\n", + "**Figure 8:**\n", + "Data clean room\n", + "diagram example\n", + "for audience\n", + "overlap analysis in\n", + "advertising\n", + "\n", + "\n", + "How did my campaign do in\n", + "\n", + "terms of reach and frequency?\n", + "\n", + "\n", + "\u0007What is the lift in purchases\n", + "\n", + "among those in-segment versus\n", + "those out-of-segment?\n", + "\n", + "**Collaborator A-owned sensitive data** **Secure and privacy-preserving environment** **Collaborator B-owned sensitive data**\n", + "\n", + "\n", + "-----\n", + "\n", + "A data clean room is not a new concept. Google introduced the idea in 2017 when it announced Ads Data Hub, which allows\n", + "\n", + "advertisers to gain impression-level insights about cross-device media campaigns in a more secure, privacy-safe environment. In\n", + "\n", + "the last few years, the demand for clean rooms has accelerated. IDC predicts that by 2024, 65% of G2000 enterprises will form data\n", + "\n", + "sharing partnerships with external stakeholders via data clean rooms to increase interdependence while safeguarding data privacy.\n", + "\n", + "There are various compelling needs driving this demand:\n", + "\n", + "\n", + "**Privacy-first world.** Stringent data privacy regulations such as\n", + "\n", + "GDPR and CCPA, along with sweeping changes in third-party\n", + "\n", + "measurement, have transformed how organizations collect, use\n", + "\n", + "and share data. For example, Apple’s [App Tracking Transparency](https://developer.apple.com/app-store/user-privacy-and-data-use/)\n", + "\n", + "[Framework](https://developer.apple.com/app-store/user-privacy-and-data-use/) (ATT) provides users of Apple devices the freedom\n", + "\n", + "and flexibility to easily opt out of app tracking. 
Google also plans\n", + "\n", + "to [phase out support for third-party cookies in Chrome](https://blog.google/products/chrome/updated-timeline-privacy-sandbox-milestones/) by late\n", + "\n", + "2024. As these privacy laws and practices evolve, the demand\n", + "\n", + "for data cleanrooms is likely to rise as the industry moves to new\n", + "\n", + "\n", + "**Collaboration in a fragmented ecosystem.** Today, consumers have\n", + "\n", + "more options than ever before when it comes to where, when and\n", + "\n", + "how they engage with content. As a result, the digital footprint of\n", + "\n", + "consumers is fragmented across different platforms, necessitating\n", + "\n", + "that companies collaborate with their partners to create a unified\n", + "\n", + "view of their customers’ needs and requirements. To facilitate\n", + "\n", + "collaboration across organizations, cleanrooms provide a secure\n", + "\n", + "and private way to combine their data with other data to unlock new\n", + "\n", + "insights or capabilities.\n", + "\n", + "\n", + "identifiers that are PII based, such as UID 2.0, and organizations\n", + "\n", + "try to find new ways to share and join data with customers and\n", + "\n", + "partners in a privacy-centric way.\n", + "\n", + "**New ways to monetize data.** Most organizations are looking to\n", + "\n", + "monetize their data in one form or another. With today’s privacy\n", + "\n", + "laws, companies will try to find any possible advantages to monetize\n", + "\n", + "their data without the risk of breaking privacy rules. This creates an\n", + "\n", + "opportunity for data vendors or publishers to join data for big data\n", + "\n", + "analytics without having direct access to the data.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Common data clean room uses cases\n", + "\n", + "\n", + "#### Category management for retail and consumer goods\n", + "\n", + "Clean rooms enable real-time collaboration between retailers\n", + "\n", + "and suppliers, ensuring secure information exchange for demand\n", + "\n", + "forecasting, inventory planning and supply chain optimization.\n", + "\n", + "This improves product availability, reduces costs and streamlines\n", + "\n", + "operations for both parties.\n", + "\n", + "#### Real-world evidence (RWE) for healthcare\n", + "\n", + "Clean rooms provide secure access to sensitive healthcare data sets,\n", + "\n", + "allowing collaborators to connect and query multiple sources of data\n", + "\n", + "without comprising data privacy. This supports RWE use cases such\n", + "\n", + "as regulatory decisions, safety, clinical trial design and observational\n", + "\n", + "research.\n", + "\n", + "\n", + "#### Audience overlap exploration for media and entertainment\n", + "\n", + "By creating a clean room environment, media companies can\n", + "\n", + "securely share their audience data with advertisers or other media\n", + "\n", + "partners. This allows them to perform in-depth analysis and identify\n", + "\n", + "shared audience segments without directly accessing or exposing\n", + "\n", + "individual user information.\n", + "\n", + "#### Know Your Customer (KYC) in banking\n", + "\n", + "KYC standards are designed to combat financial fraud, money\n", + "\n", + "laundering and terrorism financing. 
Clean rooms can be used within a\n", + "\n", + "given jurisdiction to allow financial services companies to collaborate\n", + "\n", + "and run shared analytics to build a holistic view of a transaction for\n", + "\n", + "investigations.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Personalization with expanded interests for retailers\n", + "\n", + "Retailers want to target consumers based on past purchases, as\n", + "\n", + "well as other purchases with different retailers. Clean rooms enable\n", + "\n", + "retailers to augment their knowledge of consumers to suggest new\n", + "\n", + "products and services that are relevant to the individual but have\n", + "\n", + "\n", + "#### 5G data monetization for telecom\n", + "\n", + "5G data monetization enables telecoms to capitalize on data\n", + "\n", + "from 5G networks. Clean rooms provide a secure environment\n", + "\n", + "for collaboration with trusted partners, ensuring privacy while\n", + "\n", + "maximizing data value for optimized services, personalized\n", + "\n", + "experiences and targeted advertising.\n", + "\n", + "\n", + "not yet been purchased.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Shortcomings of existing data clean rooms\n", + "\n", + "\n", + "Organizations exploring clean room options are finding some glaring\n", + "\n", + "shortcomings in the existing solutions that limit the full potential of the\n", + "\n", + "“clean rooms” concept.\n", + "\n", + "First, many existing data clean room vendors require data to be on the\n", + "\n", + "same cloud, same region, and/or same data platform. Participants then\n", + "\n", + "have to move data into proprietary platforms, which results in lock-in\n", + "\n", + "and additional data storage costs.\n", + "\n", + "\n", + "Second, most existing solutions are not scalable to expand\n", + "\n", + "collaboration beyond a few collaborators at a time. For example,\n", + "\n", + "an advertiser might want to get a detailed view of their ad\n", + "\n", + "performance across different platforms, which requires analysis\n", + "\n", + "of the aggregated data from multiple data publishers. With\n", + "\n", + "collaboration limited to just a few participants, organizations get\n", + "\n", + "partial insights on one clean room platform and end up moving\n", + "\n", + "their data to another clean room vendor to aggregate the data,\n", + "\n", + "incurring the operational overhead of collating partial insights.\n", + "\n", + "Finally, existing clean room solutions do not provide the flexibility\n", + "\n", + "to run arbitrary analysis and are mainly restricted to SQL, a\n", + "\n", + "subset of Python, and pre-defined templates. While SQL is\n", + "\n", + "absolutely needed for clean rooms, there are times when you\n", + "\n", + "require complex computations such as machine learning or\n", + "\n", + "integration with APIs where SQL doesn’t satisfy the full depth of\n", + "\n", + "the technical requirements.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key benefits of Databricks Clean Rooms\n", + "\n", + "Databricks Clean Rooms allow businesses to easily collaborate with their customers and partners in a secure environment on\n", + "\n", + "any cloud in a privacy-safe way. 
Key benefits of Databricks Clean Rooms include:\n", + "\n", + "\n", + "**Flexible - your language and workload of**\n", + "\n", + "**choice.** Databricks Clean Rooms empower\n", + "\n", + "collaborators to share and join their existing\n", + "\n", + "data and run complex workloads in any\n", + "\n", + "language —Python, R, SQL, Java and Scala —\n", + "\n", + "on the data while maintaining data privacy.\n", + "\n", + "Beyond traditional SQL, users can run arbitrary\n", + "\n", + "workloads and languages, allowing them to train\n", + "\n", + "machine learning models, perform inference\n", + "\n", + "and utilize open-source or third-party privacy-\n", + "\n", + "enhancing technologies. This flexibility enables\n", + "\n", + "data scientists and analysts to achieve more\n", + "\n", + "comprehensive and advanced data analysis\n", + "\n", + "within the secure Clean Room environment.\n", + "\n", + "\n", + "**Scalable, multi-party collaboration.**\n", + "\n", + "With Databricks Clean Rooms, you can\n", + "\n", + "launch a clean room and work with multiple\n", + "\n", + "collaborators at a time. This capability\n", + "\n", + "enables real-time collaboration, fostering\n", + "\n", + "efficient and rapid results. Moreover,\n", + "\n", + "Databricks Clean Rooms seamlessly\n", + "\n", + "integrate with identity service providers,\n", + "\n", + "allowing users to leverage offerings from\n", + "\n", + "these providers during collaboration. The\n", + "\n", + "ability to collaborate with multiple parties\n", + "\n", + "and leverage identity services enhances the\n", + "\n", + "overall data collaboration experience within\n", + "\n", + "Databricks Clean Rooms.\n", + "\n", + "\n", + "**Interoperable - any data source**\n", + "\n", + "**with no replication.** Databricks Clean\n", + "\n", + "Rooms excel in interoperability, ensuring\n", + "\n", + "smooth collaboration across diverse\n", + "\n", + "environments. With Delta Sharing,\n", + "\n", + "collaborators can seamlessly work\n", + "\n", + "together across different cloud providers,\n", + "\n", + "regions and even data platforms without\n", + "\n", + "the need for extensive data movement.\n", + "\n", + "This eliminates data silos and enables\n", + "\n", + "organizations to leverage existing\n", + "\n", + "infrastructure and data ecosystems while\n", + "\n", + "maintaining the utmost security and\n", + "\n", + "compliance.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Resources\n", + " Getting started with Data Sharing and Collaboration\n", + "\n", + "\n", + "Data sharing plays a key role in business processes across the\n", + "\n", + "enterprise, from product development and internal operations to\n", + "\n", + "customer experience and compliance. However, most businesses\n", + "\n", + "have been slow to move forward because of incompatibility\n", + "\n", + "between systems, complexity and security concerns.\n", + "\n", + "Data-driven organizations need an open — and secure — approach\n", + "\n", + "to data sharing.\n", + "\n", + "\n", + "Databricks offers an open approach to data sharing and\n", + "\n", + "collaboration with a variety of tools to:\n", + "\n", + "\u0007 **Share across platforms:** You can share live data sets, as well\n", + "\n", + "as AI models, dashboards and notebooks across platforms,\n", + "\n", + "clouds and regions. 
This open approach is powered by\n", + "\n", + "Delta Sharing, the world’s first open protocol for secure data\n", + "\n", + "sharing, which allows organizations to share data for any use\n", + "\n", + "case, any tool and on any cloud.\n", + "\n", + "\u0007 **Share all your data and AI: Databricks Marketplace** is an\n", + "\n", + "open marketplace for all your data, analytics and AI, enabling\n", + "\n", + "both data consumers and data providers with the ability to\n", + "\n", + "deliver innovation and advance analytics and AI initiatives.\n", + "\n", + "\u0007 **Share securely: Databricks Clean Rooms** allows businesses\n", + "\n", + "to easily collaborate with customers and partners on any\n", + "\n", + "cloud in a privacy-safe way. With Delta Sharing, clean room\n", + "\n", + "participants can securely share data from their data lakes\n", + "\n", + "without any data replication across clouds or regions. Your\n", + "\n", + "data stays with you without vendor lock-in, and you can\n", + "\n", + "centrally audit and monitor the usage of your data.\n", + "\n", + "\n", + "-----\n", + "\n", + "Get started with these products by exploring the resources below.\n", + "\n", + "\n", + "**Delta Sharing**\n", + "\n", + "\u0007 [Data Sharing on Databricks](https://www.databricks.com/product/delta-sharing)\n", + "\n", + "[\u0007Learn about Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog)\n", + "\n", + "[\u0007Blog post: What’s new with Data Sharing and Collaboration on the](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n", + "\n", + "[Lakehouse](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n", + "\n", + "[\u0007Learn about open source Delta Sharing](https://delta.io/sharing/)\n", + "\n", + "[Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "\n", + "**Databricks Marketplace**\n", + "\n", + "[\u0007Learn about Databricks Marketplace](https://www.databricks.com/product/marketplace)\n", + "\n", + "[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n", + "\n", + "[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n", + "\n", + "[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n", + "\n", + "[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n", + "\n", + "[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n", + "\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n", + "\n", + "\n", + "**Databricks Clean Rooms**\n", + "\n", + "\u0007 [Learn about Databricks Clean Rooms](https://www.databricks.com/product/clean-room)\n", + "\n", + "[\u0007Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n", + "\n", + "[\u0007eBook: The Definitive Guide to Data Clean Rooms](https://www.databricks.com/resources/ebook/market-smarter-data-clean-rooms)\n", + "\n", + "[\u0007Webinar: Unlock the Power of Secure Data 
Collaboration](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n", + "\n", + "[with Clean Rooms](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n", + "\n", + "\n", + "[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n", + "\n", + "\n", + "-----\n", + "\n", + "## About the Authors\n", + "\n", + "\n", + "**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n", + "\n", + "making analytics and AI simple for customers by leveraging the\n", + "\n", + "power of the Databricks Lakehouse Platform. You can reach Vuong\n", + "\n", + "on [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n", + "\n", + "\n", + "**Sachin Thakur** is a Principal Product Marketing Manager on the\n", + "\n", + "Databricks Data Engineering and Analytics team. His area of focus\n", + "\n", + "is data governance with Unity Catalog, and he is passionate about\n", + "\n", + "helping organizations democratize data and AI with the Databricks\n", + "\n", + "Lakehouse Platform. You can reach Sachin on [LinkedIn](https://www.linkedin.com/in/sachin10thakur/) .\n", + "\n", + "\n", + "**Milos Colic** is a Senior Solution Architect at Databricks. His\n", + "\n", + "\n", + "passion is to help customers with their data exchange and data\n", + "\n", + "monetization needs. Furthermore, he is passionate about geospatial\n", + "\n", + "data processing and ESG. You can reach Milos on [LinkedIn](https://www.linkedin.com/in/milos-colic/) .\n", + "\n", + "\n", + "**Jay Bhankharia** is a Senior Director on the Databricks Data\n", + "\n", + "Partnerships team. His passion is to help customers gain insights\n", + "\n", + "from data to use the power of the Databricks Lakehouse Platform\n", + "\n", + "for their analytics needs. You can reach Jay on [LinkedIn](https://www.linkedin.com/in/jay-bhankharia-cfa-b9835612/) .\n", + "\n", + "\n", + "**Itai Weiss** is a Lead Delta Sharing Specialist at Databricks and has\n", + "\n", + "\n", + "over 20 years of experience in helping organizations of any size\n", + "\n", + "build data solutions. He focuses on data monetization and loves to\n", + "\n", + "help customers and businesses get more value from the data they\n", + "\n", + "have. You can reach Itai on [LinkedIn](https://www.linkedin.com/in/itai-weiss/) .\n", + "\n", + "**Somasekar Natarajan** (Som) is a Solution Architect at\n", + "\n", + "Databricks specializing in enterprise data management. Som has\n", + "\n", + "worked with Fortune organizations spanning three continents for\n", + "\n", + "close to two decades with one objective — helping customers to\n", + "\n", + "\n", + "**Giselle Goicochea** is a Senior Product Marketing Manager\n", + "\n", + "on the Databricks Data Engineering and Analytics team. Her area\n", + "\n", + "of focus is data sharing and collaboration with Delta Sharing and\n", + "\n", + "Databricks Marketplace. 
You can reach Giselle on [LinkedIn](https://www.linkedin.com/in/giselle-goicochea/) .\n", + "\n", + "**Kelly Albano** is a Product Marketing Manager on the Databricks\n", + "\n", + "Data Engineering and Analytics team. Her area of focus is security,\n", + "\n", + "compliance and Databricks Clean Rooms. You can reach\n", + "\n", + "Kelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n", + "\n", + "\n", + "harness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "##### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "© Databricks 2023 All rights reserved\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf2024-09-19T16:57:20Z
##### The Delta Lake Series Complete Collection

-----

### What is Delta Lake?

[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully compatible with Apache Spark™ APIs.

At Databricks, we’ve seen how Delta Lake can bring reliability, performance and lifecycle management to data lakes. With Delta Lake, there will be no more malformed data ingestion, difficulties deleting data for compliance, or issues modifying data for data capture.

With Delta Lake, you can accelerate the velocity at which high-quality data gets into your data lake and the rate at which teams can leverage that data with a secure and scalable cloud service.

In this eBook, the Databricks team has compiled all of their insights into a comprehensive format so that you can gain a full understanding of Delta Lake and its capabilities.

-----

**Contents: here’s what you’ll find inside**

**Chapter 01: Fundamentals and Performance**
- The Fundamentals of Delta Lake: Why Reliability and Performance Matter
- Unpacking the Transaction Log
- How to Use Schema Enforcement and Evolution
- Delta Lake DML Internals
- How Delta Lake Quickly Processes Petabytes With Data Skipping and Z-Ordering

**Chapter 02: Features**
- Why Use MERGE With Delta Lake?
- Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs
- Time Travel for Large-Scale Data Lakes
- Easily Clone Your Delta Lake for Testing, Sharing and ML Reproducibility
- Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0

**Chapter 03: Lakehouse**
- What Is a Lakehouse?
- Diving Deep Into the Inner Workings of the Lakehouse and Delta Lake
- Understanding Delta Engine

**Chapter 04: Streaming**
- How Delta Lake Solves Common Pain Points in Streaming
- USE CASE #1: Simplifying Streaming Stock Data Analysis Using Delta Lake
- USE CASE #2: How Tilting Point Does Streaming Ingestion Into Delta Lake
- USE CASE #3: Building a Quality of Service Analytics Solution for Streaming Video Services

**Chapter 05: Customer Use Cases**
- Healthdirect Australia: Personalized and Secure Online Patient Care With Databricks
- Comcast: Using Delta Lake and MLflow to Transform the Viewer Experience
- Banco Hipotecario: Personalizing the Banking Experience With Data and ML
- Viacom18: Migrating From Hadoop to Databricks to Deliver More Engaging Experiences

-----

**Fundamentals and Performance**
Boost data reliability for machine learning and business intelligence with Delta Lake

## CHAPTER 01

-----

**The Fundamentals of Delta Lake: Why Reliability and Performance Matter**

When it comes to data reliability, performance — the speed at which your programs run — is of utmost importance.
Because of the ACID transactional protections that\n", + "Delta Lake provides, you’re able to get the reliability and performance you need.\n", + "\n", + "With Delta Lake, you can stream and batch concurrently, perform CRUD operations,\n", + "and save money because you’re now using fewer VMs. It’s easier to maintain your data\n", + "engineering pipelines by taking advantage of streaming, even for batch jobs.\n", + "\n", + "Delta Lake is a storage layer that brings reliability to your data lakes built on HDFS and\n", + "cloud object storage by providing ACID transactions through optimistic concurrency\n", + "control between writes and snapshot isolation for consistent reads during writes.\n", + "Delta Lake also provides built-in data versioning for easy rollbacks and reproducing\n", + "reports.\n", + "\n", + "In this chapter, we’ll share some of the common challenges with data lakes as well as\n", + "the Delta Lake features that address them.\n", + "\n", + "**Challenges with data lakes**\n", + "Data lakes are a common element within modern data architectures. They serve as a\n", + "central ingestion point for the plethora of data that organizations seek to gather and\n", + "mine. While a good step forward in getting to grips with the range of data, they run\n", + "into the following common problems:\n", + "\n", + "\n", + "-----\n", + "\n", + "**1. \u0007Reading and writing into data lakes is not reliable.** Data engineers often run into\n", + "the problem of unsafe writes into data lakes that cause readers to see garbage\n", + "data during writes. They have to build workarounds to ensure readers always see\n", + "consistent data during writes.\n", + "\n", + "**2. \u0007The data quality in data lakes is low.** Dumping unstructured data into a data\n", + "lake is easy, but this comes at the cost of data quality. Without any mechanisms\n", + "for validating schema and the data, data lakes suffer from poor data quality. As a\n", + "consequence, analytics projects that strive to mine this data also fail.\n", + "\n", + "**3. Poor performance with increasing amounts of data.** As the amount of data\n", + "that gets dumped into a data lake increases, the number of files and directories\n", + "also increases. Big data jobs and query engines that process the data spend a\n", + "significant amount of time handling the metadata operations. This problem is more\n", + "pronounced in the case of streaming jobs or handling many concurrent batch jobs.\n", + "\n", + "**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\n", + "build complicated pipelines to read entire partitions or tables, modify the data and\n", + "write them back. Such pipelines are inefficient and hard to maintain.\n", + "\n", + "Because of these challenges, many big data projects fail to deliver on their vision or\n", + "sometimes just fail altogether. We need a solution that enables data practitioners to\n", + "make use of their existing data lakes, while ensuring data quality.\n", + "\n", + "**Delta Lake’s key functionalities**\n", + "Delta Lake addresses the above problems to simplify how you build your data lakes.\n", + "Delta Lake offers the following key functionalities:\n", + "\n", + "**• ACID transactions:** Delta Lake provides ACID transactions between multiple\n", + "writes. Every write is a transaction, and there is a serial order for writes recorded in\n", + "a transaction log. 
The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n", + "\n", + "\n", + "-----\n", + "\n", + "[concurrency control](https://en.wikipedia.org/wiki/Optimistic_concurrency_control) , which is ideally suited for data lakes since multiple writes\n", + "trying to modify the same files don’t happen that often. In scenarios where\n", + "there is a conflict, Delta Lake throws a concurrent modification exception for\n", + "users to handle them and retry their jobs. Delta Lake also offers the highest level\n", + "of isolation possible ( [serializable isolation](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable) ) that allows engineers to continuously\n", + "keep writing to a directory or table and consumers to keep reading from the same\n", + "directory or table. Readers will see the latest snapshot that existed at the time the\n", + "reading started.\n", + "\n", + "**• \u0007Schema management:** Delta Lake automatically validates that the schema of the\n", + "DataFrame being written is compatible with the schema of the table. Columns that\n", + "are present in the table but not in the DataFrame are set to null. If there are extra\n", + "columns in the DataFrame that are not present in the table, this operation throws\n", + "an exception. Delta Lake has DDL to add new columns explicitly and the ability to\n", + "update the schema automatically.\n", + "\n", + "**• \u0007Scalable metadata handling:** Delta Lake stores the metadata information of\n", + "a table or directory in the transaction log instead of the metastore. This allows\n", + "Delta Lake to list files in large directories in constant time and be efficient while\n", + "reading data.\n", + "\n", + "**• Data versioning and time travel:** Delta Lake allows users to read a previous\n", + "snapshot of the table or directory. When files are modified during writes, Delta\n", + "Lake creates newer versions of the files and preserves the older versions. When\n", + "\n", + "\n", + "users want to read the older versions of the table or directory, they can provide\n", + "a timestamp or a version number to Apache Spark’s read APIs, and Delta Lake\n", + "constructs the full snapshot as of that timestamp or version based on the\n", + "information in the transaction log. This allows users to reproduce experiments and\n", + "reports and also revert a table to its older versions, if needed.\n", + "\n", + "**• Unified batch and streaming sink:** Apart from batch writes, Delta Lake can also\n", + "be used as an efficient streaming sink with [Apache Spark’s structured streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) .\n", + "Combined with ACID transactions and scalable metadata handling, the efficient\n", + "streaming sink enables lots of near real-time analytics use cases without having to\n", + "maintain a complicated streaming and batch pipeline.\n", + "\n", + "**• Record update and deletion:** Delta Lake will support merge, update and delete\n", + "DML commands. This allows engineers to easily upsert and delete records in data\n", + "lakes and simplify their change data capture and GDPR use cases. Since Delta Lake\n", + "tracks and modifies data at file-level granularity, it is much more efficient than\n", + "reading and overwriting entire partitions or tables.\n", + "\n", + "**• Data expectations (coming soon):** Delta Lake will also support a new API to set\n", + "data expectations on tables or directories. 
Engineers will be able to specify a\n", + "boolean condition and tune the severity to handle data expectations. When Apache\n", + "Spark jobs write to the table or directory, Delta Lake will automatically validate\n", + "the records and when there is a violation, it will handle the records based on the\n", + "severity provided.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unpacking the**\n", + "**Transaction Log**\n", + "\n", + "The transaction log is key to understanding Delta Lake because it is the common thread\n", + "that runs through many of its most important features, including ACID transactions,\n", + "scalable metadata handling, time travel and more. The Delta Lake transaction log is\n", + "an ordered record of every transaction that has ever been performed on a Delta Lake\n", + "table since its inception.\n", + "\n", + "Delta Lake is built on top of [Apache Spark](https://databricks.com/spark/about) to allow multiple readers and writers of a\n", + "given table to work on the table at the same time. To show users correct views of the\n", + "data at all times, the transaction log serves as a single source of truth: the central\n", + "repository that tracks all changes that users make to the table.\n", + "\n", + "When a user reads a Delta Lake table for the first time or runs a new query on an\n", + "open table that has been modified since the last time it was read, Spark checks the\n", + "transaction log to see what new transactions are posted to the table. Then, Spark\n", + "updates the end user’s table with those new changes. This ensures that a user’s\n", + "version of a table is always synchronized with the master record as of the most recent\n", + "query and that users cannot make divergent, conflicting changes to a table.\n", + "\n", + "In this chapter, we’ll explore how the Delta Lake transaction log offers an elegant\n", + "solution to the problem of multiple concurrent reads and writes.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Implementing atomicity to ensure**\n", + "**operations complete fully**\n", + "Atomicity is one of the four properties of ACID transactions that guarantees that\n", + "operations (like an INSERT or UPDATE) performed on your [data lake](https://databricks.com/glossary/data-lake) either complete\n", + "fully or don’t complete at all. Without this property, it’s far too easy for a hardware\n", + "failure or a software bug to cause data to be only partially written to a table, resulting\n", + "in messy or corrupted data.\n", + "\n", + "The transaction log is the mechanism through which Delta Lake is able to offer\n", + "the guarantee of atomicity. For all intents and purposes, if it’s not recorded in the\n", + "transaction log, it never happened. By only recording transactions that execute fully\n", + "and completely, and using that record as the single source of truth, the Delta Lake\n", + "transaction log allows users to reason about their data and have peace of mind about\n", + "its fundamental trustworthiness, at petabyte scale.\n", + "\n", + "**Dealing with multiple concurrent reads and writes**\n", + "But how does Delta Lake deal with multiple concurrent reads and writes? Since Delta\n", + "Lake is powered by Apache Spark, it’s not only possible for multiple users to modify a\n", + "\n", + "\n", + "table at once — it’s expected. 
To handle these situations, Delta Lake employs **optimistic**\n", + "**concurrency control** .\n", + "\n", + "Optimistic concurrency control is a method of dealing with concurrent transactions\n", + "that assumes the changes made to a table by different users can complete without\n", + "conflicting with one another. It is incredibly fast because when dealing with petabytes\n", + "of data, there’s a high likelihood that users will be working on different parts of the data\n", + "altogether, allowing them to complete non-conflicting transactions simultaneously.\n", + "\n", + "Of course, even with optimistic concurrency control, sometimes users do try to\n", + "modify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\n", + "for that. Delta Lake handles these cases by implementing a rule of mutual exclusion,\n", + "then it attempts to solve any conflict optimistically.\n", + "\n", + "This protocol allows Delta Lake to deliver on the ACID principle of isolation, which\n", + "ensures that the resulting state of the table after multiple, concurrent writes is the\n", + "same as if those writes had occurred serially, in isolation from one another.\n", + "\n", + "\n", + "-----\n", + "\n", + "As all the transactions made on Delta Lake tables are stored directly to disk, this\n", + "process satisfies the ACID property of durability, meaning it will persist even in the\n", + "event of system failure.\n", + "\n", + "**Time travel, data lineage and debugging**\n", + "Every table is the result of the sum total of all the commits recorded in the Delta Lake\n", + "transaction log — no more and no less. The transaction log provides a step-by-step\n", + "instruction guide, detailing exactly how to get from the table’s original state to its\n", + "current state.\n", + "\n", + "Therefore, we can recreate the state of a table at any point in time by starting with\n", + "an original table, and processing only commits made after that point. This powerful\n", + "ability is known as “time travel,” or data versioning, and can be a lifesaver in any number\n", + "\n", + "\n", + "of situations. For more information, please refer to [Introducing Delta Time Travel for](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n", + "[Large-Scale Data Lakes](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) and [Getting Data Ready for Data Science With Delta Lake and](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n", + "[MLflow.](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n", + "\n", + "As the definitive record of every change ever made to a table, the Delta Lake\n", + "transaction log offers users a verifiable data lineage that is useful for governance,\n", + "audit and compliance purposes. It can also be used to trace the origin of an\n", + "inadvertent change or a bug in a pipeline back to the exact action that caused it. 
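For example, reading an older snapshot from PySpark takes a single option on the read; here is a minimal sketch (the table path, version number and timestamp are hypothetical, and a Delta-enabled Spark session is assumed):

```python
from pyspark.sql import SparkSession

# Assumes the Delta Lake package is available to this session.
spark = SparkSession.builder.appName("time-travel-sketch").getOrCreate()

# Read the table as of a specific version number ...
df_v5 = spark.read.format("delta").option("versionAsOf", 5).load("/tmp/delta/events")

# ... or as of a point in time.
df_then = (spark.read.format("delta")
           .option("timestampAsOf", "2019-01-01")
           .load("/tmp/delta/events"))
```
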
Users\n", + "can run the [DESCRIBE HISTORY](https://docs.delta.io/latest/delta-utility.html#describe-history) command to see metadata around the changes\n", + "that were made.\n", + "\n", + "**Want to learn more about Delta Lake’s transaction log?**\n", + "\n", + "Read our blog post > Watch our tech talk >\n", + "\n", + "\n", + "-----\n", + "\n", + "**How to Use Schema**\n", + "**Enforcement and**\n", + "**Evolution**\n", + "\n", + "As business problems and requirements evolve over time, so does the structure of\n", + "your data. With Delta Lake, incorporating new columns or objects is easy; users have\n", + "access to simple semantics to control the schema of their tables. At the same time,\n", + "it is important to call out the importance of schema enforcement to prevent users\n", + "from accidentally polluting their tables with mistakes or garbage data in addition to\n", + "schema evolution, which enables them to automatically add new columns of rich data\n", + "when those columns belong.\n", + "\n", + "**Schema enforcement rejects any new columns or other schema changes that**\n", + "**aren’t compatible with your table.** By setting and upholding these high standards,\n", + "analysts and engineers can trust that their data has the highest levels of integrity and\n", + "can reason about it with clarity, allowing them to make better business decisions.\n", + "\n", + "On the flip side of the coin, schema evolution complements enforcement by making it\n", + "easy for intended schema changes to take place automatically. After all, it shouldn’t\n", + "be hard to add a column.\n", + "\n", + "Schema enforcement is the yin to schema evolution’s yang. When used together,\n", + "these features make it easier than ever to block out the noise and tune in to the signal.\n", + "\n", + "**Understanding table schemas**\n", + "Every DataFrame in Apache Spark contains a schema, a blueprint that defines the\n", + "shape of the data, such as data types and columns, and metadata. With Delta Lake,\n", + "the table’s schema is saved in JSON format inside the transaction log.\n", + "\n", + "\n", + "-----\n", + "\n", + "**What is schema enforcement?**\n", + "Schema enforcement, or schema validation, is a safeguard in Delta Lake that ensures\n", + "data quality by rejecting writes to a table that don’t match the table’s schema.\n", + "\n", + "Like the front-desk manager at a busy restaurant who only accepts reservations, it\n", + "checks to see whether each column of data inserted into the table is on its list of\n", + "expected columns (in other words, whether each one has a “reservation”), and rejects\n", + "any writes with columns that aren’t on the list.\n", + "\n", + "**How does schema enforcement work?**\n", + "Delta Lake uses **schema validation on write,** which means that all new writes to a\n", + "table are checked for compatibility with the target table’s schema at write time. If the\n", + "schema is not compatible, Delta Lake cancels the transaction altogether (no data is\n", + "written), and raises an exception to let the user know about the mismatch.\n", + "\n", + "To determine whether a write to a table is compatible, Delta Lake uses the following\n", + "rules. 
The DataFrame to be written cannot contain:\n", + "\n", + "**• Any additional columns that are not present in the target table’s schema.**\n", + "Conversely, it’s OK if the incoming data doesn’t contain every column in the table —\n", + "those columns will simply be assigned null values.\n", + "\n", + "**• \u0007Column data types that differ from the column data types in the target table.**\n", + "If a target table’s column contains StringType data, but the corresponding column\n", + "in the DataFrame contains IntegerType data, schema enforcement will raise an\n", + "exception and prevent the write operation from taking place.\n", + "\n", + "**• Column names that differ only by case.** This means that you cannot have columns\n", + "such as “Foo” and “foo” defined in the same table. While Spark can be used in case\n", + "sensitive or insensitive (default) mode, Delta Lake is case-preserving but insensitive\n", + "when storing the schema. [Parquet](https://databricks.com/glossary/what-is-parquet) is case sensitive when storing and returning\n", + "column information. To avoid potential mistakes, data corruption or loss issues (which\n", + "we’ve personally experienced at Databricks), we decided to add this restriction.\n", + "\n", + "\n", + "-----\n", + "\n", + "Rather than automatically adding the new columns, Delta Lake enforces the schema,\n", + "and stops the write from occurring. To help identify which column(s) caused the\n", + "mismatch, Spark prints out both schemas in the stack trace for comparison.\n", + "\n", + "**How is schema enforcement useful?**\n", + "Because it’s such a stringent check, schema enforcement is an excellent tool to use\n", + "as a gatekeeper for a clean, fully transformed data set that is ready for production or\n", + "consumption. It’s typically enforced on tables that directly feed:\n", + "\n", + "- Machine learning algorithms\n", + "\n", + "- BI dashboards\n", + "\n", + "- Data analytics and visualization tools\n", + "\n", + "- Any production system requiring highly structured,\n", + "strongly typed, semantic schemas\n", + "\n", + "In order to prepare their data for this final hurdle, many users employ a simple multihop architecture that progressively adds structure to their tables. To learn more, take\n", + "a look at [Productionizing Machine Learning With Delta Lake.](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n", + "\n", + "**What is schema evolution?**\n", + "Schema evolution is a feature that allows users to easily change a table’s current\n", + "schema to accommodate data that is changing over time. Most commonly, it’s used\n", + "when performing an append or overwrite operation, to automatically adapt the\n", + "schema to include one or more new columns.\n", + "\n", + "**How does schema evolution work?**\n", + "Following up on the example from the previous section, developers can\n", + "easily use schema evolution to add the new columns that were previously\n", + "rejected due to a schema mismatch. 
Schema evolution is activated by adding\n", + ".option(‘mergeSchema’, ‘true’) to your .write or .writeStream\n", + "Spark command, as shown in the following example.\n", + "\n", + "\n", + "#Add the mergeSchema option\n", + "\n", + "loans.write.format( “delta” ) \\\n", + "\n", + ".option( “mergeSchema” , “true” ) \\\n", + "\n", + ".mode( “append” ) \\\n", + "\n", + ".save(DELTALAKE_SILVER_PATH)\n", + "\n", + "By including the mergeSchema option in your query, any columns that are present\n", + "\n", + "in the DataFrame but not in the target table are automatically added to the end of the\n", + "\n", + "schema as part of a write transaction. Nested fields can also be added, and these\n", + "\n", + "fields will get added to the end of their respective struct columns as well.\n", + "\n", + "Data engineers and scientists can use this option to add new columns (perhaps a\n", + "\n", + "newly tracked metric, or a column of this month’s sales figures) to their existing ML\n", + "\n", + "production tables without breaking existing models that rely on the old columns.\n", + "\n", + "The following types of schema changes are eligible for schema evolution during table\n", + "\n", + "appends or overwrites:\n", + "\n", + "- Adding new columns (this is the most common scenario)\n", + "\n", + "- \u0007Changing of data types from NullType → any other type, or upcasts from ByteType\n", + "\n", + "→ ShortType → IntegerType\n", + "\n", + "Other changes, not eligible for schema evolution, require that the schema and data\n", + "\n", + "are overwritten by adding .option(“overwriteSchema”,“true”) . Those\n", + "\n", + "changes include:\n", + "\n", + "- Dropping a column\n", + "\n", + "- Changing an existing column’s data typeC (in place)\n", + "\n", + "- \u0007Renaming column names that differ onlyC by case (e.g., “Foo” and “foo”)\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE ) is fully\n", + "supported, allowing users to perform the following actions on table schemas:\n", + "\n", + "- Adding columns\n", + "\n", + "- Changing column comments\n", + "\n", + "- Setting table properties that define the behavior of the table, such as setting the\n", + "retention duration of the transaction log\n", + "\n", + "**How is schema evolution useful?**\n", + "Schema evolution can be used anytime you _intend_ to change the schema of your table\n", + "(as opposed to where you accidentally added columns to your DataFrame that shouldn’t\n", + "be there). It’s the easiest way to migrate your schema because it automatically adds the\n", + "correct column names and data types, without having to declare them explicitly.\n", + "\n", + "**Summary**\n", + "Schema enforcement rejects any new columns or other schema changes that\n", + "aren’t compatible with your table. By setting and upholding these high standards,\n", + "analysts and engineers can trust that their data has the highest levels of integrity and\n", + "can reason about it with clarity, allowing them to make better business decisions.\n", + "On the flip side of the coin, schema evolution complements enforcement by making it\n", + "easy for intended schema changes to take place automatically. After all, it shouldn’t\n", + "be hard to add a column.\n", + "\n", + "Schema enforcement is the yin to schema evolution’s yang. 
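To make the contrast concrete, here is a small PySpark sketch of both behaviors (the path is hypothetical and a Delta-enabled SparkSession named `spark` is assumed):

```python
# Schema enforcement rejects an unexpected column; mergeSchema evolves the schema instead.
path = "/tmp/delta/loans"

base = spark.createDataFrame([(1, 1000.0)], ["loan_id", "amount"])
base.write.format("delta").mode("overwrite").save(path)

extra = spark.createDataFrame([(2, 2500.0, "CA")], ["loan_id", "amount", "state"])

try:
    # Enforcement: the new 'state' column does not match the table's schema.
    extra.write.format("delta").mode("append").save(path)
except Exception as err:
    print(f"Write rejected by schema enforcement: {err}")

# Evolution: mergeSchema adds 'state' to the schema and the append succeeds.
extra.write.format("delta").mode("append").option("mergeSchema", "true").save(path)
```
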
When used together, these features make it easier than ever to block out the noise and tune in to the signal.

**Want to learn more about schema enforcement and evolution?**

Read our blog post > Watch our tech talk >

-----

**Delta Lake**
**DML Internals**

Delta Lake supports data manipulation language (DML) commands including UPDATE, DELETE and MERGE. These commands simplify change data capture (CDC), audit and governance, and GDPR/CCPA workflows, among others.

In this chapter, we will demonstrate how to use each of these DML commands, describe what Delta Lake is doing behind the scenes, and offer some performance tuning tips for each one.

**Delta Lake DML: UPDATE**
You can use the UPDATE operation to selectively update any rows that match a filtering condition, also known as a predicate. The code below demonstrates how to use each type of predicate as part of an UPDATE statement. Note that Delta Lake offers APIs for Python, Scala and SQL, but for the purposes of this eBook, we’ll include only the SQL code.

-- Update events
UPDATE events SET eventType = 'click' WHERE buttonPress = 1

-----

**UPDATE: Under the hood**
Delta Lake performs an UPDATE on a table in two steps:

1. Find and select the files containing data that match the predicate and, therefore, need to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up this process.

2. Read each matching file into memory, update the relevant rows, and write out the result into a new data file.

Once Delta Lake has executed the UPDATE successfully, it adds a commit in the transaction log indicating that the new data file will be used in place of the old one from now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned” — recorded as a data file that applied to an older version of the table, but not the current version. Delta Lake is able to use it to provide data versioning and time travel.

**UPDATE + Delta Lake time travel = Easy debugging**
Keeping the old data files turns out to be very useful for debugging because you can use Delta Lake “time travel” to go back and query previous versions of a table at any time. In the event that you update your table incorrectly and want to figure out what happened, you can easily compare two versions of a table to one another to see what has changed.

SELECT * FROM events VERSION AS OF 11 EXCEPT ALL SELECT * FROM events VERSION AS OF 12

**UPDATE: Performance tuning tips**
The main way to improve the performance of the UPDATE command on Delta Lake is to add more predicates to narrow down the search space.
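For readers following along with the Python API instead of SQL, the same update can be expressed through the DeltaTable class; here is a minimal sketch (the table path is hypothetical and a Delta-enabled SparkSession named `spark` is assumed):

```python
from delta.tables import DeltaTable

# Hypothetical path to the events table.
events = DeltaTable.forPath(spark, "/tmp/delta/events")

# Equivalent of: UPDATE events SET eventType = 'click' WHERE buttonPress = 1
events.update(
    condition="buttonPress = 1",
    set={"eventType": "'click'"},  # values are SQL expression strings, hence the inner quotes
)
```

The predicate-narrowing advice applies to this API just as it does to the SQL form.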
The more specific the search, the fewer files Delta Lake needs to scan and/or modify.

**Delta Lake DML: DELETE**
You can use the DELETE command to selectively delete rows based upon a predicate (filtering condition).

DELETE FROM events WHERE date < '2017-01-01'

-----

In the event that you want to revert an accidental DELETE operation, you can use time travel to roll back your table to the way it was.

**DELETE: Under the hood**
DELETE works just like UPDATE under the hood. Delta Lake makes two scans of the data: The first scan is to identify any data files that contain rows matching the predicate condition. The second scan reads the matching data files into memory, at which point Delta Lake deletes the rows in question before writing out the newly clean data to disk.

After Delta Lake completes a DELETE operation successfully, the old data files are not deleted entirely — they’re still retained on disk, but recorded as “tombstoned” (no longer part of the active table) in the Delta Lake transaction log. Remember, those old files aren’t deleted immediately because you might still need them to time travel back to an earlier version of the table. If you want to delete files older than a certain time period, you can use the VACUUM command.

**DELETE + VACUUM: Cleaning up old data files**
Running the VACUUM command permanently deletes all data files that are:

1. No longer part of the active table and
2. Older than the retention threshold, which is seven days by default

Delta Lake does not automatically VACUUM old files — you must run the command yourself, as shown below. If you want to specify a retention period that is different from the default of seven days, you can provide it as a parameter.

from delta.tables import *

# deltaTable is an existing DeltaTable handle, e.g., DeltaTable.forPath(spark, pathToTable)
# vacuum files older than 30 days (720 hours)
deltaTable.vacuum(720)

-----

**DELETE: Performance tuning tips**
Just like with the UPDATE command, the main way to improve the performance of a DELETE operation on Delta Lake is to add more predicates to narrow down the search space. The Databricks managed version of Delta Lake also features other performance enhancements like improved [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping), the use of bloom filters, and [Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering) (multi-dimensional clustering). [Read more about Z-Order Optimize on Databricks.](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)

**Delta Lake DML: MERGE**
The Delta Lake MERGE command allows you to perform upserts, which are a mix of an UPDATE and an INSERT.
To understand upserts, imagine that you have an existing\n", + "table (aka a target table), and a source table that contains a mix of new records and\n", + "updates to existing records.\n", + "\n", + "\n", + "**Here’s how an upsert works:**\n", + "\n", + "- When a record from the source table matches a preexisting record in the target\n", + "table, Delta Lake updates the record.\n", + "\n", + "- When there is no such match, Delta Lake inserts the new record.\n", + "\n", + "The Delta Lake MERGE command greatly simplifies workflows that can be complex\n", + "and cumbersome with other traditional data formats like Parquet. Common scenarios\n", + "where merges/upserts come in handy include change data capture, GDPR/CCPA\n", + "compliance, sessionization, and deduplication of records.\n", + "\n", + "**For more information about upserts, read:**\n", + "\n", + "[Efficient Upserts Into Data Lakes With Databricks Delta](https://databricks.com/blog/2019/03/19/efficient-upserts-into-data-lakes-databricks-delta.html)\n", + "\n", + "[Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs](https://databricks.com/blog/2019/10/03/simple-reliable-upserts-and-deletes-on-delta-lake-tables-using-python-apis.html)\n", + "\n", + "[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**MERGE: Under the hood**\n", + "Delta Lake completes a MERGE in two steps:\n", + "\n", + "1. Perform an inner join between the target table and source table to select all files\n", + "that have matches.\n", + "2. Perform an outer join between the selected files in the target and source tables\n", + "and write out the updated/deleted/inserted data.\n", + "\n", + "The main way that this differs from an UPDATE or a DELETE under the hood is that\n", + "Delta Lake uses joins to complete a MERGE. 
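As a companion to the SQL examples in this chapter, here is a minimal sketch of the same kind of upsert through the Python DeltaTable API (the table path, source DataFrame and column names are hypothetical, and a Delta-enabled SparkSession named `spark` is assumed):

```python
from delta.tables import DeltaTable

# Hypothetical target table and source of updates.
users = DeltaTable.forPath(spark, "/tmp/delta/users")
updates_df = spark.read.format("delta").load("/tmp/delta/user_updates")

(users.alias("users")
      .merge(updates_df.alias("updates"), "users.userId = updates.userId")
      .whenMatchedUpdate(set={"address": "updates.address"})
      .whenNotMatchedInsert(values={"userId": "updates.userId",
                                    "address": "updates.address"})
      .execute())
```

However the merge is expressed, it still executes as the inner join followed by the outer join described above.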
This fact allows us to utilize some unique\n", + "strategies when seeking to improve performance.\n", + "\n", + "**MERGE: Performance tuning tips**\n", + "To improve performance of the MERGE command, you need to determine which of the\n", + "two joins that make up the merge is limiting your speed.\n", + "\n", + "If the inner join is the bottleneck (i.e., finding the files that Delta Lake needs to rewrite\n", + "takes too long), try the following strategies:\n", + "\n", + "- Add more predicates to narrow down the search space.\n", + "\n", + "- Adjust shuffle partitions.\n", + "\n", + "- Adjust broadcast join thresholds.\n", + "\n", + "- Compact the small files in the table if there are lots of them, but don’t compact them\n", + "into files that are too large, since Delta Lake has to copy the entire file to rewrite it.\n", + "\n", + "\n", + "**On Databricks’ managed Delta Lake, use Z-Order optimize to exploit the**\n", + "**locality of updates.**\n", + "\n", + "On the other hand, if the outer join is the bottleneck (i.e., rewriting the actual files\n", + "themselves takes too long), try the strategies below.\n", + "\n", + "- **Adjust shuffle partitions:** Reduce files by enabling automatic repartitioning\n", + "before writes (with Optimized Writes in Databricks Delta Lake).\n", + "\n", + "- **\u0007Adjust broadcast thresholds:** If you’re doing a full outer join, Spark cannot do a\n", + "broadcast join, but if you’re doing a right outer join, Spark can do one, and you can\n", + "adjust the broadcast thresholds as needed.\n", + "\n", + "- **Cache the source table / DataFrame:** Caching the source table can speed up the\n", + "second scan, but be sure not to cache the target table, as this can lead to cache\n", + "coherency issues.\n", + "\n", + "Delta Lake supports DML commands including UPDATE, DELETE and MERGE INTO, which\n", + "greatly simplify the workflow for many common big data operations. In this chapter, we\n", + "demonstrated how to use these commands in Delta Lake, shared information about\n", + "how each one works under the hood, and offered some performance tuning tips.\n", + "\n", + "**Want a deeper dive into DML internals, including snippets of code?**\n", + "\n", + "[Read our blog post >](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake Quickly**\n", + "**Processes Petabytes With**\n", + "**Data Skipping and Z-Ordering**\n", + "\n", + "Delta Lake is capable of sifting through petabytes of data within seconds. Much of this\n", + "speed is owed to two features: (1) data skipping and (2) Z-Ordering.\n", + "\n", + "Combining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the\n", + "amount of data that needs to be scanned to answer selective queries against large\n", + "Delta tables, which typically translates into substantial runtime improvements and\n", + "cost savings.\n", + "\n", + "Using Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud\n", + "data lakes can be queried in a matter of seconds by skipping files not relevant to\n", + "the query. For example, 93.2% of the records in a 504 TB data set were skipped for a\n", + "typical query in a real-world cybersecurity analysis use case, reducing query times by\n", + "up to two orders of magnitude. 
In other words, Delta Lake can speed up your queries by as much as 100x.

**Want to see data skipping and Z-Ordering in action?**

Apple’s Dominique Brezinski and Databricks’ Michael Armbrust demonstrated how to use Delta Lake as a unified solution for data engineering and data science in the context of cybersecurity monitoring and threat response. Watch their keynote speech, [Threat Detection and Response at Scale.](https://databricks.com/session/keynote-from-apple)

-----

AND / OR / NOT are also supported, as well as “literal op column” predicates.

Even though data skipping kicks in when the above conditions are met, it may not always be effective. But, if there are a few columns that you frequently filter by and want to make sure that’s fast, then you can explicitly optimize your data layout with respect to skipping effectiveness by running the following command:

OPTIMIZE <table> [WHERE <partition filter>] ZORDER BY (<column> [, ...])

**Exploring the details**
Apart from partition pruning, another common technique that’s used in the data warehousing world, but which Spark currently lacks, is I/O pruning based on [small materialized aggregates](https://dl.acm.org/doi/10.5555/645924.671173). In short, the idea is to keep track of simple statistics such as minimum and maximum values at a certain granularity that are correlated with I/O granularity. And we want to leverage those statistics at query planning time in order to avoid unnecessary I/O.

This is exactly what Delta Lake’s [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) feature is about. As new data is inserted into a Delta Lake table, file-level min/max statistics are collected for all columns (including nested ones) of supported types. Then, when there’s a lookup query against the table, Delta Lake first consults these statistics in order to determine which files can safely be skipped.

**Want to learn more about data skipping and Z-Ordering, including how to apply it within a cybersecurity analysis?**

[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)

**Using data skipping and Z-Order clustering**
Data skipping and Z-Ordering are used to improve the performance of needle-in-the-haystack queries against huge data sets.
Data skipping is an automatic feature of\n", + "Delta Lake, kicking in whenever your SQL queries or data set operations include filters\n", + "of the form “column op literal,” where:\n", + "\n", + "- column is an attribute of some Delta Lake table, be it top-level or nested, whose\n", + "data type is string / numeric / date/ timestamp\n", + "\n", + "- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n", + "\n", + "\n", + "- literal is an explicit (list of) value(s) of the same data type as a column\n", + "\n", + "\n", + "-----\n", + "\n", + "**Features**\n", + "Use Delta Lake’s robust features\n", + "to reliably manage your data\n", + "\n", + "## CHAPTER 02\n", + "\n", + "\n", + "-----\n", + "\n", + "**Why Use MERGE**\n", + "**With Delta Lake?**\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , the next-generation engine built on top of Apache Spark, supports the\n", + "MERGE command, which allows you to efficiently upsert and delete records in your\n", + "data lakes.\n", + "\n", + "MERGE dramatically simplifies how a number of common data pipelines can be built\n", + "-- all the complicated multi-hop processes that inefficiently rewrote entire partitions\n", + "can now be replaced by simple MERGE queries.\n", + "\n", + "This finer-grained update capability simplifies how you build your big data\n", + "pipelines for various use cases ranging from change data capture to GDPR. You\n", + "no longer need to write complicated logic to overwrite tables and overcome a lack\n", + "of snapshot isolation.\n", + "\n", + "With changing data, another critical capability required is the ability to roll back, in\n", + "case of bad writes. Delta Lake also offers [rollback capabilities with the Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n", + "[feature](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) , so that if you do a bad merge, you can easily roll back to an earlier version.\n", + "\n", + "In this chapter, we’ll discuss common use cases where existing data might need to be\n", + "updated or deleted. We’ll also explore the challenges inherent to upserts and explain\n", + "how MERGE can address them.\n", + "\n", + "\n", + "-----\n", + "\n", + "**When are upserts necessary?**\n", + "There are a number of common use cases where existing data in a data lake needs to\n", + "be updated or deleted:\n", + "\n", + "- \u0007 **General Data Protection Regulation (GDPR) compliance:** With the introduction of\n", + "the right to be forgotten (also known as data erasure) in GDPR, organizations must\n", + "remove a user’s information upon request. This data erasure includes deleting user\n", + "information in the data lake as well.\n", + "\n", + "- **Change data capture from traditional databases:** In a service-oriented\n", + "architecture, typically web and mobile applications are served by microservices\n", + "built on traditional SQL/NoSQL databases that are optimized for low latency. One\n", + "of the biggest challenges organizations face is joining data across these various\n", + "siloed data systems, and hence data engineers build pipelines to consolidate\n", + "all data sources into a central data lake to facilitate analytics. These pipelines\n", + "often have to periodically read changes made on a traditional SQL/NoSQL table\n", + "and apply them to corresponding tables in the data lake. 
Such changes can take various forms: tables with slowly changing dimensions, change data capture of all inserted/updated/deleted rows, etc.

- **Sessionization:** Grouping multiple events into a single session is a common use case in many areas ranging from product analytics to targeted advertising to predictive maintenance. Building continuous applications to track sessions and recording the results that write into data lakes is difficult because data lakes have always been optimized for appending data.

- **De-duplication:** A common data pipeline use case is to collect system logs into a Delta Lake table by appending data to the table. However, often the sources can generate duplicate records and downstream de-duplication steps are needed to take care of them.

-----

**Why upserts into data lakes have**
**traditionally been challenging**
Since data lakes are fundamentally based on files, they have always been optimized for appending data rather than for changing existing data. Hence, building the above use cases has always been challenging.

Users typically read the entire table (or a subset of partitions) and then overwrite them. Therefore, every organization tries to reinvent the wheel for their requirement by handwriting complicated queries in SQL, Spark, etc. This approach is:

- **Inefficient:** Reading and rewriting entire partitions (or entire tables) to update a few records causes pipelines to be slow and costly. Hand-tuning the table layout and query optimization is tedious and requires deep domain knowledge.

- **Possibly incorrect:** Handwritten code modifying data is very prone to logical and human errors. For example, multiple pipelines concurrently modifying the same table without any transactional support can lead to unpredictable data inconsistencies and, in the worst case, data losses. Often, even a single handwritten pipeline can easily cause data corruptions due to errors in encoding the business logic.

- **Hard to maintain:** Fundamentally such handwritten code is hard to understand, keep track of and maintain. In the long term, this alone can significantly increase the organizational and infrastructural costs.

**Introducing MERGE in Delta Lake**
With Delta Lake, you can easily address the use cases above without any of the aforementioned problems using the following MERGE command:

MERGE INTO <target table>
USING <source table or view>
ON <merge condition>
[ WHEN MATCHED [ AND <condition> ] THEN <matched action> ]
[ WHEN NOT MATCHED [ AND <condition> ] THEN <not matched action> ]

where

<matched action> =
  DELETE |
  UPDATE SET * |
  UPDATE SET column1 = value1 [, column2 = value2 ...]

<not matched action> =
  INSERT * |
  INSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])

Let’s understand how to use MERGE with a simple example. Suppose you have a [slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses. Furthermore, you have a table of new addresses for both existing and new users.
To\n", + "merge all the new addresses to the main user table, you can run the following:\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING updates\n", + "\n", + "ON users.userId = updates.userId\n", + "\n", + "WHEN MATCHED THEN\n", + "\n", + "UPDATE SET address = updates.addresses\n", + "\n", + "WHEN NOT MATCHED THEN\n", + "INSERT (userId, address) VALUES (updates.userId, updates.address)\n", + "\n", + "This will perform exactly what the syntax says -- for existing users (i.e., MATCHED\n", + "clause), it will update the address column, and for new users (i.e., NOT MATCHED\n", + "clause) it will insert all the columns. For large tables with TBs of data, this Delta Lake\n", + "MERGE operation can be orders of magnitude faster than overwriting entire partitions\n", + "or tables since Delta Lake reads only relevant files and updates them. Specifically,\n", + "Delta Lake's MERGE has the following advantages:\n", + "\n", + "\n", + "\n", + "[ WHEN MATCHED [ AND ] THEN ]\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simplifying use cases with MERGE**\n", + "**Deleting data due to GDPR**\n", + "Complying with the “right to be forgotten” clause of GDPR for data in data lakes cannot\n", + "get any easier. You can set up a simple scheduled job with an example code, like\n", + "below, to delete all the users who have opted out of your service.\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING opted_out_users\n", + "\n", + "ON opted_out_users.userId = users.userId\n", + "\n", + "WHEN MATCHED THEN DELETE\n", + "\n", + "**Applying change data from databases**\n", + "You can easily apply all data changes — updates, deletes, inserts — generated from an\n", + "external database into a Delta Lake table with the MERGE syntax as follows:\n", + "\n", + "MERGE INTO users\n", + "\n", + "USING (\n", + "\n", + "SELECT userId, latest.address AS address, latest.deleted AS deleted FROM\n", + "\n", + "(\n", + "\n", + "SELECT userId, MAX(struct(TIME, address, deleted)) AS latest\n", + "\n", + "FROM changes GROUP BY userId\n", + "\n", + ")\n", + "\n", + ") latestChange\n", + "\n", + "ON latestChange.userId = users.userId\n", + "\n", + "WHEN MATCHED AND latestChange.deleted = TRUE THEN\n", + "\n", + "DELETE\n", + "\n", + "WHEN MATCHED THEN\n", + "\n", + "UPDATE SET address = latestChange.address\n", + "\n", + "WHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n", + "\n", + "INSERT (userId, address) VALUES (userId, address)\n", + "\n", + "\n", + "\n", + "- **\u0007Fine-grained:** The operation rewrites data at the granularity of files and not\n", + "partitions. This eliminates all the complications of rewriting partitions, updating\n", + "the Hive metastore with MSCK and so on.\n", + "\n", + "- **\u0007Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to\n", + "rewrite, thus eliminating the need to hand-optimize your pipeline. 
Furthermore,\n", + "Delta Lake with all its I/O and processing optimizations makes all the reading and\n", + "writing data by MERGE significantly faster than similar operations in Apache Spark.\n", + "\n", + "- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\n", + "concurrent writers update the data correctly with ACID transactions, and concurrent\n", + "readers always see a consistent snapshot of the data.\n", + "\n", + "Here is a visual explanation of how MERGE compares with handwritten pipelines.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Updating session information from streaming**\n", + "**pipelines**\n", + "If you have streaming event data flowing in and if you want to sessionize the streaming\n", + "event data and incrementally update and store sessions in a Delta Lake table, you\n", + "can accomplish this using the foreachBatch in Structured Streaming and MERGE.\n", + "For example, suppose you have a Structured Streaming DataFrame that computes\n", + "updated session information for each user. You can start a streaming query that\n", + "applies all the sessions update to a Delta Lake table as follows (Scala).\n", + "\n", + "streamingSessionUpdatesDF.writeStream\n", + "\n", + ".foreachBatch { (microBatchOutputDF: DataFrame , batchId: Long ) =>\n", + "\n", + "microBatchOutputDF.createOrReplaceTempView(“updates”)\n", + "\n", + "microBatchOutputDF.sparkSession.sql(s”””\n", + "\n", + "MERGE INTO sessions\n", + "\n", + "USING updates\n", + "\n", + "ON sessions.sessionId = updates.sessionId\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *\n", + "\n", + "WHEN NOT MATCHED THEN INSERT * “”” )\n", + "\n", + "}.start()\n", + "\n", + "For a complete working example of each Batch and MERGE, see this notebook\n", + "( [Azure](https://docs.azuredatabricks.net/_static/notebooks/merge-in-streaming.html) | [AWS](https://docs.databricks.com/_static/notebooks/merge-in-streaming.html) ).\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Addressing GDPR and CCPA Scenarios With Delta Lake and Apache Spark](https://www.youtube.com/watch?v=tCPslvUjG1w)\n", + "\n", + "[Tech Talk | Using Delta as a Change Data Capture Source](https://www.youtube.com/watch?v=7y0AAQ6qX5w)\n", + "\n", + "[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n", + "\n", + "[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n", + "\n", + "[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Simple, Reliable Upserts and**\n", + "**Deletes on Delta Lake Tables**\n", + "**Using Python APIs**\n", + "\n", + "In this chapter, we will demonstrate how to use Python and the new Python APIs in Delta\n", + "Lake within the context of an on-time flight performance scenario. We will show how\n", + "to upsert and delete data, query old versions of data with time travel, and vacuum\n", + "older versions for cleanup.\n", + "\n", + "**How to start using Delta Lake**\n", + "The Delta Lake package is installable through PySpark by using the --packages\n", + "option. In our example, we will also demonstrate the ability to VACUUM files and execute\n", + "Delta Lake SQL commands within Apache Spark. 
As this is a short demonstration, we\n", + "will also enable the following configurations:\n", + "\n", + "\u0007spark.databricks.delta.retentionDurationCheck.enabled=false\n", + "\n", + "to allow us to vacuum files shorter than the default retention duration of seven days.\n", + "Note, this is only required for the SQL command VACUUM\n", + "\n", + "\u0007spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension\n", + "\n", + "to enable Delta Lake SQL commands within Apache Spark; this is not required for\n", + "Python or Scala API calls.\n", + "\n", + "# Using Spark Packages\n", + "\n", + "./bin/pyspark --packages io.delta:delta-core_2.11:0.4.0 --conf “spark.\n", + "\n", + "databricks.delta.retentionDurationCheck.enabled=false” --conf “spark.\n", + "\n", + "sql.extensions=io.delta.sql.DeltaSparkSessionExtension”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Loading and saving our Delta Lake data**\n", + "This scenario will be using the On-Time Flight Performance or Departure Delays data\n", + "set generated from the RITA BTS Flight Departure Statistics; some examples of this data\n", + "in action include the and OnTime Flight Performance with GraphFrames for Apache Spark™. Within PySpark, start [2014 Flight Departure Performance via d3.js Crossfilter](https://dennyglee.com/2014/06/06/2014-flight-departure-performance-via-d3-js-crossfilter/)\n", + "by reading the data set.\n", + "\n", + "\u0007# Location variables\n", + "\n", + "\n", + "/departureDelays.delta$ ls l\n", + "\n", + ".\n", + "\n", + "..\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -df6f69ea-e6aa- 424b -bc0e-f3674c4f1906-c000.snappy.parquet\n", + "\n", + "part- 00001 -711bcce3-fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n", + "\n", + "part- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n", + "\n", + "Part- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n", + "\n", + "\n", + "tripdelaysFilePath = “/root/data/departuredelays.csv”\n", + "\n", + "pathToEventsTable = “/root/deltalake/departureDelays.delta”\n", + "\n", + "Now, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n", + "\n", + "# Read flight delay data\n", + "\n", + "\n", + "departureDelays = spark.read \\\n", + "\n", + ".option( “header” , “true” ) \\\n", + "\n", + ".option( “inferSchema” , “true” ) \\\n", + "\n", + ".csv(tripdelaysFilePath)\n", + "\n", + "Next, let’s save our departureDelays data set to a Delta Lake table. 
By saving this table\n", + "to Delta Lake storage, we will be able to take advantage of its features including ACID\n", + "transactions, unified batch and streaming and time travel.\n", + "\n", + "# Save flight delay data into Delta Lake format\n", + "\n", + "departureDelays \\\n", + "\n", + ".write \\\n", + "\n", + ".format(\"delta\") \\\n", + "\n", + ".mode(\"overwrite\") \\\n", + "\n", + ".save(\"departureDelays.delta\")\n", + "\n", + "Note, this approach is similar to how you would normally save Parquet data; instead\n", + "of specifying format(\"parquet\"), you will now specify format(\"delta\"). If\n", + "you were to take a look at the underlying file system, you will notice four files created\n", + "for the departureDelays Delta Lake table.\n", + "\n", + "# Load flight delay data in Delta Lake format\n", + "\n", + "delays_delta = spark \\\n", + "\n", + ".read \\\n", + "\n", + ".format(\"delta\") \\\n", + "\n", + ".load(\"departureDelays.delta\")\n", + "\n", + "# Create temporary view\n", + "\n", + "delays_delta.createOrReplaceTempView(\"delays_delta\")\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "spark.sql(\"select count(1) from delays_delta where origin = 'SEA' and\n", + "\n", + "destination = 'SFO'\").show()\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, let’s determine the number of flights originating from Seattle to San Francisco; in\n", + "this data set, there are 1698 flights.\n", + "\n", + "**In-place conversion to Delta Lake**\n", + "If you have existing Parquet tables, you have the ability to convert them to Delta Lake\n", + "format in place, thus not needing to rewrite your table. To convert the table, you can\n", + "run the following commands.\n", + "\n", + "from delta.tables import *\n", + "\n", + "# Convert non-partitioned parquet table at path '/path/to/table'\n", + "\n", + "deltaTable = DeltaTable.convertToDelta(spark, \"parquet.`/path/to/table`\")\n", + "\n", + "# Convert partitioned parquet table at path '/path/to/table' and\n", + "\n", + "# partitioned by integer column named 'part'\n", + "\n", + "partitionedDeltaTable = DeltaTable.convertToDelta(spark,\n", + "\n", + "\"parquet.`/path/to/table`\", \"part int\")\n", + "\n", + "**Delete our flight data**\n", + "To delete data from a traditional data lake table, you will need to:\n", + "\n", + "1. Select all of the data from your table not including the rows you want to delete\n", + "2. Create a new table based on the previous query\n", + "3. Delete the original table\n", + "4. Rename the new table to the original table name for downstream dependencies\n", + "\n", + "Instead of performing all of these steps, with Delta Lake, we can simplify this process\n", + "by running a DELETE statement. To show this, let’s delete all of the flights that had\n", + "arrived early or on-time (i.e., delay < 0).\n", + "\n", + "from delta.tables import *\n", + "\n", + "from pyspark.sql.functions import *\n", + "\n", + "# Access the Delta Lake table\n", + "\n", + "deltaTable = DeltaTable.forPath(spark, pathToEventsTable)\n", + "\n", + "# Delete all on-time and early flights\n", + "\n", + "deltaTable.delete(\"delay < 0\")\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "spark.sql(\"select count(1) from delays_delta where origin = 'SEA' and\n", + "\n", + "destination = 'SFO'\").show()\n", + "\n", + "After we delete (more on this below) all of the on-time and early flights, as you can\n", + "see from the preceding query there are 837 late flights originating from Seattle to\n", + "San Francisco. If you review the file system, you will notice there are more files even\n", + "though you deleted data.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part-00000-a2a19ba4-17e9-4931-9bbf-3c9d4997780b-c000.snappy.parquet\n", + "\n", + "part-00000-df6f69ea-e6aa-424b-bc0e-f3674c4f1906-c000.snappy.parquet\n", + "\n", + "part-00001-711bcce3-fe9e-466e-a22c-8256f8b54930-c000.snappy.parquet\n", + "\n", + "part-00001-a0423a18-62eb-46b3-a82f-ca9aac1f1e93-c000.snappy.parquet\n", + "\n", + "part-00002-778ba97d-89b8-4942-a495-5f6238830b68-c000.snappy.parquet\n", + "\n", + "part-00002-bfaa0a2a-0a31-4abf-aa63-162402f802cc-c000.snappy.parquet\n", + "\n", + "part-00003-1a791c4a-6f11-49a8-8837-8093a3220581-c000.snappy.parquet\n", + "\n", + "part-00003-b0247e1d-f5ce-4b45-91cd-16413c784a66-c000.snappy.parquet\n", + "\n", + "\n", + "-----\n", + "\n", + "In traditional data lakes, deletes are performed by rewriting the entire table\n", + "excluding the values to be deleted. With Delta Lake, deletes are instead performed\n", + "by selectively writing new versions of the files containing the data to be deleted, and\n", + "only marking the previous files as deleted. This is because Delta Lake uses multiversion\n", + "concurrency control (MVCC) to do atomic operations on the table: For example, while\n", + "one user is deleting data, another user may be querying the previous version of the\n", + "table. This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\n", + "and query previous versions as we will see later.\n", + "\n", + "**Update our flight data**\n", + "To update data from your traditional Data Lake table, you will need to:\n", + "\n", + "1. Select all of the data from your table not including the rows you want to modify\n", + "2. Modify the rows that need to be updated/changed\n", + "3. Merge these two tables to create a new table\n", + "4. Delete the original table\n", + "5. Rename the new table to the original table name for downstream dependencies\n", + "\n", + "Instead of performing all of these steps, with Delta Lake, we can simplify this\n", + "process by running an UPDATE statement. To show this, let’s update all of the flights\n", + "originating from Detroit to Seattle.\n", + "\n", + "# Update all flights originating from Detroit to now be\n", + "\n", + "# originating from Seattle\n", + "\n", + "deltaTable.update(\"origin = 'DTW'\", { \"origin\": \"'SEA'\" })\n", + "\n", + "# How many flights are between Seattle and San Francisco\n", + "\n", + "spark.sql(\"select count(1) from delays_delta where origin = 'SEA'\n", + "\n", + "and destination = 'SFO'\").show()\n", + "\n", + "With the Detroit flights now tagged as Seattle flights, we now have 986 flights\n", + "originating from Seattle to San Francisco. If you were to list the file system for\n", + "your departureDelays folder (i.e., $../departureDelays/ls -l ), you will\n", + "notice there are now 11 files (instead of the 8 right after deleting the files and the four\n", + "files after creating the table).\n", + "\n", + "**Merge our flight data**\n", + "A common scenario when working with a data lake is to continuously append data to\n", + "your table. This often results in duplicate data (rows you do not want to be inserted\n", + "into your table again), new rows that need to be inserted, and some rows that need to\n", + "be updated. With Delta Lake, all of this can be achieved by using the merge operation\n", + "(similar to the SQL MERGE statement).\n", + "\n", + "Let’s start with a sample data set that you will want to be updated, inserted or\n", + "de-duplicated with the following query.\n", + "\n", + "# What flights between SEA and SFO for these date periods\n", + "\n", + "spark.sql(\"select * from delays_delta where origin = 'SEA' and\n", + "\n", + "destination = 'SFO' and date like '1010%' limit 10\").show()\n", + "\n", + "The output of this query looks like the following table. Note, the color-coding has been\n", + "added to clearly identify which rows are de-duplicated (blue), updated (yellow) and\n", + "inserted (green).\n", + "\n", + "\n", + "-----\n", + "\n", + "Next, let’s generate our own merge_table that contains data we will insert, update\n", + "or de-duplicate with the following code snippet.\n", + "\n", + "items = [(1010710, 31, 590, 'SEA', 'SFO'), (1010521, 10, 590, 'SEA', 'SFO'),\n", + "\n", + "(1010832, 31, 590, 'SEA', 'SFO')]\n", + "\n", + "cols = ['date', 'delay', 'distance', 'origin', 'destination']\n", + "\n", + "merge_table = spark.createDataFrame(items, cols)\n", + "\n", + "merge_table.toPandas()\n", + "\n", + "With Delta Lake, this can be easily achieved via a merge statement as noted in the\n", + "following code snippet.\n", + "\n", + "# Merge merge_table with flights\n", + "\n", + "deltaTable.alias(\"flights\") \\\n", + "\n", + ".merge(merge_table.alias(\"updates\"), \"flights.date = updates.date\") \\\n", + "\n", + ".whenMatchedUpdate(set = { \"delay\" : \"updates.delay\" }) \\\n", + "\n", + ".whenNotMatchedInsertAll() \\\n", + "\n", + ".execute()\n", + "\n", + "# What flights between SEA and SFO for these date periods\n", + "\n", + "spark.sql(\"select * from delays_delta where origin = 'SEA' and\n", + "\n", + "destination = 'SFO' and date like '1010%' limit 10\").show()\n", + "\n", + "In the preceding table ( merge_table ), there are three rows with a unique date value:\n", + "\n", + "1. 1010521: This row needs to _update_ the _flights_ table with a new delay value (yellow)\n", + "2. 1010710: This row is a _duplicate_ (blue)\n", + "3. 
1010832: This is a new row to be _inserted_ (green)\n", + "\n", + "\n", + "All three actions of de-duplication, update and insert were efficiently completed with\n", + "one statement.\n", + "\n", + "**View table history**\n", + "As previously noted, after each of our transactions (delete, update), there were more\n", + "files created within the file system. This is because for each transaction, there are\n", + "different versions of the Delta Lake table.\n", + "\n", + "\n", + "-----\n", + "\n", + "This can be seen by using the DeltaTable.history() method as noted below\n", + "\n", + "Note: You can also perform the same task with SQL:\n", + "\n", + "spark.sql(“DESCRIBE HISTORY ‘” + pathToEventsTable + “’”).show()\n", + "\n", + "As you can see, there are three rows representing the different versions of the table\n", + "(below is an abridged version to help make it easier to read) for each of the operations\n", + "(create table, delete and update):\n", + "\n", + "**Travel back in time with table history**\n", + "With Time Travel, you can review the Delta Lake table as of the version or timestamp.\n", + "To view historical data, specify the version or timestamp option; in the following code\n", + "snippet, we will specify the version option.\n", + "\n", + "\n", + "# Load DataFrames for each version\n", + "\n", + "dfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "0 ).load( “departureDelays.delta” )\n", + "\n", + "dfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n", + "\n", + "1 ).load( “departureDelays.delta” )\n", + "\n", + "dfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n", + "\n", + "2 ).load( “departureDelays.delta” )\n", + "\n", + "# Calculate the SEA to SFO flight counts for each version of history\n", + "\n", + "cnt0 = dfv0. where( “origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "cnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "cnt2 = dfv2. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n", + "\n", + "# Print out the value\n", + "\n", + "print ( “SEA -> SFO Counts: Create Table: %s, Delete: %s, Update: %s” %\n", + "\n", + "(cnt0, cnt1, cnt2))\n", + "\n", + "## Output\n", + "\n", + "SEA -> SFO Counts : Create Table: 1698 , Delete: 837, Update: 986\n", + "\n", + "Whether for governance, risk management and compliance (GRC) or rolling back\n", + "errors, the Delta Lake table contains both the metadata (e.g., recording the fact that a\n", + "delete had occurred with these operators) and data (e.g., the actual rows deleted). But\n", + "how do we remove the data files either for compliance or size reasons?\n", + "\n", + "**Clean up old table versions with vacuum**\n", + "The [Delta Lake vacuum](https://docs.delta.io/0.7.0/delta-utility.html#vacuum) method will delete all of the rows (and files) by default that are\n", + "older than seven days’ reference. 
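\n", + "\n", + "For example, calling vacuum with no argument keeps everything inside that retention window. The following is a minimal sketch that reuses the deltaTable handle created earlier; at this point in the walk-through it would remove nothing, because the data files are only minutes old.\n", + "\n", + "# Default vacuum: only removes files that are no longer referenced by the table\n", + "\n", + "# AND are older than the default retention period of 7 days (168 hours)\n", + "\n", + "deltaTable.vacuum()\n", + "\n", + "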
If you were to view the file system, you’ll notice the\n", + "11 files for your table.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 - 5e52736b -0e63- 48f3 - 8d56 - 50f7cfa0494d -c000.snappy.parquet\n", + "\n", + "part- 00000 - 69eb53d5 - 34b4 - 408f -a7e4- 86e000428c37 -c000.snappy.parquet\n", + "\n", + "\n", + "-----\n", + "\n", + "part- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n", + "\n", + "part- 00001 - 20893eed - 9d4f - 4c1f -b619- 3e6ea1fdd05f -c000.snappy.parquet\n", + "\n", + "part- 00001 - 9b68b9f6 - bad3 - 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n", + "\n", + "part- 00001 - d4823d2e - 8f9d - 42e3 - 918d - 4060969e5844 -c000.snappy.parquet\n", + "\n", + "part- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n", + "\n", + "part- 00002 - 3027786c - 20a9 - 4b19 - 868d -dc7586c275d4-c000.snappy.parquet\n", + "\n", + "part- 00002 -f2609f27- 3478 - 4bf9 -aeb7- 2c78a05e6ec1 -c000.snappy.parquet\n", + "\n", + "part- 00003 - 850436a6 -c4dd- 4535 -a1c0- 5dc0f01d3d55 -c000.snappy.parquet\n", + "\n", + "Part- 00003 -b9292122- 99a7 -4223-aaa9- 8646c281f199 -c000.snappy.parquet\n", + "\n", + "To delete all of the files so that you only keep the current snapshot of data, you will specify a\n", + "small value for the vacuum method (instead of the default retention of 7 days).\n", + "\n", + "# Remove all files older than 0 hours old.\n", + "\n", + "deltaTable.vacuum( 0 )\n", + "\n", + "Note , you perform the same task via SQL syntax:¸\n", + "\n", + "# Remove all files older than 0 hours old\n", + "\n", + "spark.sql(“VACUUM ‘” + pathToEventsTable + “‘ RETAIN 0 HOURS”)\n", + "\n", + "Once the vacuum has completed, when you review the file system you will notice fewer\n", + "files as the historical data has been removed.\n", + "\n", + "/departureDelays.delta$ ls -l\n", + "\n", + "_delta_log\n", + "\n", + "part- 00000 -f8edaf04- 712e - 4ac4 - 8b42 - 368d0bbdb95b -c000.snappy.parquet\n", + "\n", + "part- 00001 - 9b68b9f6 -bad3- 434f - 9498 -f92dc4f503e3-c000.snappy.parquet\n", + "\n", + "part- 00002 - 24da7f4e - 7e8d - 40d1 -b664- 95bf93ffeadb -c000.snappy.parquet\n", + "\n", + "part- 00003 -b9292122- 99a7 - 4223 -aaa9- 8646c281f199 -c000.snappy.parquet\n", + "\n", + "Note, the ability to time travel back to a version older than the retention period is lost\n", + "after running vacuum.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Time Travel for**\n", + "**Large-Scale Data Lakes**\n", + "\n", + "Time travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . [Delta Lake](https://delta.io/) is an [open-source storage](https://github.com/delta-io/delta)\n", + "[layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable\n", + "metadata handling, and unifies streaming and batch data processing. Delta Lake runs on\n", + "top of your existing data lake and is fully compatible with Apache Spark APIs.\n", + "\n", + "With this feature, Delta Lake automatically versions the big data that you store in your\n", + "data lake, and you can access any historical version of that data. 
This temporal data\n", + "management simplifies your data pipeline by making it easy to audit, roll back data\n", + "in case of accidental bad writes or deletes, and reproduce experiments and reports.\n", + "\n", + "Your organization can finally standardize on a clean, centralized, versioned big data\n", + "repository in your own cloud storage for your analytics.\n", + "\n", + "**Common challenges with changing data**\n", + "\n", + "- **Audit data changes:** Auditing data changes is critical both in terms of data\n", + "compliance as well as simple debugging to understand how data has changed over\n", + "time. Organizations moving from traditional data systems to big data technologies\n", + "and the cloud struggle in such scenarios.\n", + "\n", + "- **Reproduce experiments and reports:** During model training, data scientists\n", + "run various experiments with different parameters on a given set of data. When\n", + "scientists revisit their experiments after a period of time to reproduce the models,\n", + "typically the source data has been modified by upstream pipelines. A lot of times,\n", + "they are caught unaware by such upstream data changes and hence struggle to\n", + "reproduce their experiments. Some scientists and organizations engineer best\n", + "\n", + "\n", + "-----\n", + "\n", + "practices by creating multiple copies of the data, leading to increased storage\n", + "costs. The same is true for analysts generating reports.\n", + "\n", + "- **Rollbacks:** Data pipelines can sometimes write bad data for downstream consumers.\n", + "\n", + "This can happen because of issues ranging from infrastructure instabilities to messy\n", + "data to bugs in the pipeline. For pipelines that do simple appends to directories or a\n", + "table, rollbacks can easily be addressed by date-based partitioning. With updates\n", + "and deletes, this can become very complicated, and data engineers typically have\n", + "to engineer a complex pipeline to deal with such scenarios.\n", + "\n", + "**Working with Time Travel**\n", + "Delta Lake’s time travel capabilities simplify building data pipelines for the above use\n", + "cases. Time Travel in Delta Lake improves developer productivity tremendously. It helps:\n", + "\n", + "- Data scientists manage their experiments better\n", + "\n", + "- Data engineers simplify their pipelines and roll back bad writes\n", + "\n", + "- Data analysts do easy reporting\n", + "\n", + "Organizations can finally standardize on a clean, centralized, versioned big data\n", + "repository in their own cloud storage for analytics. We are thrilled to see what you will\n", + "be able to accomplish with this feature.\n", + "\n", + "As you write into a Delta Lake table or directory, every operation is automatically\n", + "versioned. You can access the different versions of the data two different ways:\n", + "\n", + "**1. 
Using a timestamp**\n", + "**Scala syntax**\n", + "You can provide the timestamp or date string as an option to DataFrame reader:\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” )\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "\n", + "-----\n", + "\n", + "**Python syntax**\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".option( “timestampAsOf” , “2019-01-01” ) \\\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01”\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1 )\n", + "\n", + "SELECT count(*) FROM my_table TIMESTAMP AS OF “2019-01-01 01:30:00.000”\n", + "\n", + "If the reader code is in a library that you don’t have access to, and if you are passing\n", + "input parameters to the library to read data, you can still travel back in time for a table\n", + "by passing the timestamp in yyyyMMddHHmmssSSS format to the path:\n", + "\n", + "val inputPath = “/path/to/my/table@20190101000000000”\n", + "\n", + "val df = loadData(inputPath)\n", + "\n", + "// Function in a library that you don’t have access to\n", + "\n", + "def loadData(inputPath : String ) : DataFrame = {\n", + "\n", + "spark.read\n", + "\n", + ".format(“delta”)\n", + "\n", + ".load(inputPath)\n", + "\n", + "}\n", + "\n", + "inputPath = “/path/to/my/table@20190101000000000”\n", + "\n", + "df = loadData(inputPath)\n", + "\n", + "# Function in a library that you don’t have access to\n", + "\n", + "def loadData(inputPath):\n", + "\n", + "return spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load(inputPath)\n", + "\n", + "\n", + "-----\n", + "\n", + "**2. Using a version number**\n", + "In Delta Lake, every write has a version number, and you can use the version number\n", + "to travel back in time as well.\n", + "\n", + "**Scala syntax**\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".option( “versionAsOf” , “5238” )\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "val df = spark.read\n", + "\n", + ".format( “delta” )\n", + "\n", + ".load( “/path/to/my/table@v5238” )\n", + "\n", + "**Python syntax**\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".option( “versionAsOf” , “5238” ) \\\n", + "\n", + ".load( “/path/to/my/table” )\n", + "\n", + "df = spark.read \\\n", + "\n", + ".format( “delta” ) \\\n", + "\n", + ".load( “/path/to/my/table@v5238” )\n", + "\n", + "**SQL syntax**\n", + "\n", + "SELECT count(*) FROM my_table VERSION AS OF 5238\n", + "\n", + "\n", + "-----\n", + "\n", + "**Audit data changes**\n", + "You can look at the history of table changes using the DESCRIBE HISTORY command\n", + "or through the UI.\n", + "\n", + "**Reproduce experiments and reports**\n", + "Time travel also plays an important role in machine learning and data science.\n", + "Reproducibility of models and experiments is a key consideration for data scientists\n", + "because they often create hundreds of models before they put one into production,\n", + "and in that time-consuming process would like to go back to earlier models. 
However,\n", + "because data management is often separate from data science tools, this is really\n", + "hard to accomplish.\n", + "\n", + "Databricks solves this reproducibility problem by integrating Delta Lake’s Time\n", + "Travel capabilities with [MLflow](https://mlflow.org/) , an open-source platform for the machine learning\n", + "lifecycle. For reproducible machine learning training, you can simply log a\n", + "timestamped URL to the path as an MLflow parameter to track which version of the\n", + "data was used for each training job.\n", + "\n", + "This enables you to go back to earlier settings and data sets to reproduce earlier\n", + "models. You neither need to coordinate with upstream teams on the data nor worry\n", + "about cloning data for different experiments. This is the power of unified analytics,\n", + "whereby data science is closely married with data engineering.\n", + "\n", + "**Rollbacks**\n", + "Time travel also makes it easy to do rollbacks in case of bad writes. For example, if\n", + "your GDPR pipeline job had a bug that accidentally deleted user information, you can\n", + "easily fix the pipeline:\n", + "\n", + "INSERT INTO my_table\n", + "\n", + "SELECT * FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1)\n", + "\n", + "WHERE userId = 111\n", + "\n", + "\n", + "-----\n", + "\n", + "You can also fix incorrect updates as follows:\n", + "\n", + "-- Will use the latest version of the table for all operations below\n", + "\n", + "MERGE INTO my_table target\n", + "\n", + "USING my_table TIMESTAMP AS OF date_sub(current_date(), 1) source\n", + "\n", + "ON source.userId = target.userId\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *\n", + "\n", + "If you simply want to roll back to a previous version of your table, you can do so with\n", + "either of the following commands:\n", + "\n", + "RESTORE TABLE my_table VERSION AS OF [version_number]\n", + "\n", + "RESTORE TABLE my_table TIMESTAMP AS OF [timestamp]\n", + "\n", + "**Pinned view of a continuously updating**\n", + "**Delta Lake table across multiple downstream jobs**\n", + "With AS OF queries, you can now pin the snapshot of a continuously updating Delta\n", + "Lake table for multiple downstream jobs. Consider a situation where a Delta Lake table\n", + "is being continuously updated, say every 15 seconds, and there is a downstream job\n", + "that periodically reads from this Delta Lake table and updates different destinations.\n", + "In such scenarios, typically you want a consistent view of the source Delta Lake table\n", + "so that all destination tables reflect the same state.\n", + "\n", + "You can now easily handle such scenarios as follows:\n", + "\n", + "version = spark.sql(\"SELECT max(version) FROM (DESCRIBE HISTORY\n", + "\n", + "my_table)\").collect()\n", + "\n", + "data = spark.table(\"my_table@v%s\" % version[0][0])\n", + "\n", + "data.where(\"event_type = e1\").write.jdbc(\"table1\")\n", + "\n", + "data.where(\"event_type = e2\").write.jdbc(\"table2\")\n", + "\n", + "...\n", + "\n", + "data.where(\"event_type = e10\").write.jdbc(\"table10\")\n", + "\n", + "**Queries for time series analytics made simple**\n", + "Time travel also simplifies time series analytics. 
For example, if you want to find out\n", + "how many new customers you added over the last week, your query could be a very\n", + "simple one like this:\n", + "\n", + "SELECT count( distinct userId) - (\n", + "\n", + "SELECT count( distinct userId)\n", + "\n", + "FROM my_table TIMESTAMP AS OF date_sub( current_date (), 7))\n", + "\n", + "FROM my_table\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Diving Into Delta Lake: Unpacking the Transaction Log](https://databricks.com/discover/diving-into-delta-lake-talks/unpacking-transaction-log)\n", + "\n", + "[Tech Talk | Getting Data Ready for Data Science With Delta Lake and MLflow](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks/getting-data-ready-data-science-delta-lake-mlflow)\n", + "\n", + "[Data + AI Summit Europe 2020 | Data Time Travel by Delta Time Machine](https://databricks.com/session_eu20/data-time-travel-by-delta-time-machine-2)\n", + "\n", + "[Spark + AI Summit NA 2020 | Machine Learning Data Lineage With](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "[MLflow and Delta Lake](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "[Productionizing Machine Learning With Delta Lake](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Easily Clone Your Delta Lake**\n", + "**for Testing, Sharing and ML**\n", + "**Reproducibility**\n", + "\n", + "Delta Lake has a feature called **Table Cloning** , which makes it easy to test, share and\n", + "recreate tables for ML reproducibility. Creating copies of tables in a data lake or data\n", + "warehouse has several practical uses. However, given the volume of data in tables\n", + "in a data lake and the rate of its growth, making physical copies of tables is an\n", + "expensive operation.\n", + "\n", + "Delta Lake now makes the process simpler and cost-effective with the help of\n", + "table clones.\n", + "\n", + "**What are clones?**\n", + "Clones are replicas of a source table at a given point in time. They have the same\n", + "metadata as the source table: same schema, constraints, column descriptions, statistics\n", + "and partitioning. However, they behave as a separate table with a separate lineage\n", + "or history. Any changes made to clones only affect the clone and not the source. Any\n", + "changes that happen to the source during or after the cloning process also do not get\n", + "reflected in the clone due to Snapshot Isolation. In Delta Lake we have two types of\n", + "clones: shallow or deep.\n", + "\n", + "**Shallow clones**\n", + "A _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the\n", + "table being cloned; the data files of the table itself are not copied. This type of cloning\n", + "does not create another physical copy of the data resulting in minimal storage costs.\n", + "Shallow clones are inexpensive and can be extremely fast to create.\n", + "\n", + "\n", + "-----\n", + "\n", + "These clones are not self-contained and depend on the source from which they were\n", + "cloned as the source of data. If the files in the source that the clone depends on are removed,\n", + "for example with VACUUM, a shallow clone may become unusable. 
Therefore, shallow\n", + "clones are typically used for short-lived use cases such as testing and experimentation.\n", + "\n", + "**Deep clones**\n", + "Shallow clones are great for short-lived use cases, but some scenarios require a\n", + "separate and independent copy of the table’s data. A deep clone makes a full copy of\n", + "the metadata and the data files of the table being cloned. In that sense, it is similar in\n", + "functionality to copying with a CTAS command ( CREATE TABLE.. AS… SELECT… ).\n", + "But it is simpler to specify since it makes a faithful copy of the original table at the\n", + "specified version, and you don’t need to re-specify partitioning, constraints and other\n", + "information as you have to do with CTAS. In addition, it is much faster, robust and can\n", + "work in an incremental manner against failures.\n", + "\n", + "With deep clones, we copy additional metadata, such as your streaming application\n", + "transactions and COPY INTO transactions, so you can continue your ETL applications\n", + "exactly where it left off on a deep clone.\n", + "\n", + "**Where do clones help?**\n", + "Sometimes I wish I had a clone to help with my chores or magic tricks. However, we’re\n", + "not talking about human clones here. There are many scenarios where you need a\n", + "copy of your data sets — for exploring, sharing or testing ML models or analytical\n", + "queries. Below are some examples of customer use cases.\n", + "\n", + "**Testing and experimentation with a production table**\n", + "When users need to test a new version of their data pipeline they often have to rely\n", + "on sample test data sets that are not representative of all the data in their production\n", + "environment. Data teams may also want to experiment with various indexing techniques\n", + "to improve the performance of queries against massive tables. These experiments and\n", + "\n", + "\n", + "tests cannot be carried out in a production environment without risking production\n", + "data processes and affecting users.\n", + "\n", + "It can take many hours or even days, to spin up copies of your production tables for a test\n", + "or a development environment. Add to that, the extra storage costs for your development\n", + "environment to hold all the duplicated data — there is a large overhead in setting a test\n", + "environment reflective of the production data. With a shallow clone, this is trivial:\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE delta.`/some/test/location` SHALLOW CLONE prod.events\n", + "\n", + "# Python\n", + "\n", + "DeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n", + "\n", + "isShallow=True)\n", + "\n", + "// Scala\n", + "\n", + "DeltaTable.forName(“spark”, “prod.events”).clone(“/some/test/location”,\n", + "\n", + "isShallow=true)\n", + "\n", + "After creating a shallow clone of your table in a matter of seconds, you can start\n", + "running a copy of your pipeline to test out your new code, or try optimizing your table\n", + "in different dimensions to see how you can improve your query performance, and much\n", + "much more. These changes will only affect your shallow clone, not your original table.\n", + "\n", + "**Staging major changes to a production table**\n", + "Sometimes, you may need to perform some major changes to your production table.\n", + "These changes may consist of many steps, and you don’t want other users to see the\n", + "changes that you’re making until you’re done with all of your work. 
A shallow clone can\n", + "help you out here:\n", + "\n", + "\n", + "-----\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n", + "\n", + "DELETE FROM temp.staged_changes WHERE event_id is null;\n", + "\n", + "UPDATE temp.staged_changes SET change_date = current_date()\n", + "\n", + "WHERE change_date is null;\n", + "\n", + "...\n", + "\n", + "-- Perform your verifications\n", + "\n", + "Once you’re happy with the results, you have two options. If no other change has\n", + "been made to your source table, you can replace your source table with the clone.\n", + "If changes have been made to your source table, you can merge the changes into\n", + "your source table.\n", + "\n", + "-- If no changes have been made to the source\n", + "\n", + "REPLACE TABLE prod.events CLONE temp.staged_changes;\n", + "\n", + "-- If the source table has changed\n", + "\n", + "MERGE INTO prod.events USING temp.staged_changes\n", + "\n", + "ON events.event_id <=> staged_changes.event_id\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *;\n", + "\n", + "-- Drop the staged table\n", + "\n", + "DROP TABLE temp.staged_changes;\n", + "\n", + "**Machine learning result reproducibility**\n", + "Coming up with an effective ML model is an iterative process. Throughout this process\n", + "of tweaking the different parts of the model, data scientists need to assess the\n", + "accuracy of the model against a fixed data set.\n", + "\n", + "This is hard to do in a system where the data is constantly being loaded or updated. A\n", + "snapshot of the data used to train and test the model is required. This snapshot allows\n", + "the results of the ML model to be reproducible for testing or model governance purposes.\n", + "\n", + "\n", + "-----\n", + "\n", + "We recommend leveraging [Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) to run multiple experiments across a snapshot; an\n", + "example of this in action can be seen in [Machine Learning Data Lineage With MLflow](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "[and Delta Lake.](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n", + "\n", + "Once you’re happy with the results and would like to archive the data for later retrieval,\n", + "for example, next Black Friday, you can use deep clones to simplify the archiving process.\n", + "MLflow integrates really well with Delta Lake, and the autologging feature (mlflow.spark.\n", + "autolog() ) will tell you which version of the table was used to run a set of experiments.\n", + "\n", + "# Run your ML workloads using Python and then\n", + "\n", + "DeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n", + "\n", + "store_bf2020”)\n", + "\n", + "**Data migration**\n", + "A massive table may need to be moved to a new, dedicated bucket or storage system\n", + "for performance or governance reasons. 
The original table will not receive new\n", + "updates going forward and will be deactivated and removed at a future point in time.\n", + "Deep clones make the copying of massive tables more robust and scalable.\n", + "\n", + "-- SQL\n", + "\n", + "CREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n", + "\n", + "ALTER TABLE prod.events SET LOCATION ‘zz://my-new-bucket/events’;\n", + "\n", + "With deep clones, since we copy your streaming application transactions and\n", + "COPY INTO transactions, you can continue your ETL applications from exactly where\n", + "it left off after this migration!\n", + "\n", + "**Data sharing**\n", + "In an organization, it is often the case that users from different departments are\n", + "looking for data sets that they can use to enrich their analysis or models. You may\n", + "want to share your data with other users across the organization. But rather than\n", + "setting up elaborate pipelines to move the data to yet another store, it is often easier\n", + "and economical to create a copy of the relevant data set for users to explore and\n", + "\n", + "\n", + "-----\n", + "\n", + "**Looks awesome! Any gotchas?**\n", + "Just to reiterate some of the gotchas mentioned above as a single list, here’s what you\n", + "should be wary of:\n", + "\n", + "- \u0007 \u0007Clones are executed on a snapshot of your data. Any changes that are made to\n", + "the source table after the cloning process starts will not be reflected in the\n", + "clone.\n", + "\n", + "- \u0007 \u0007Shallow clones are not self-contained tables like deep clones. If the data is\n", + "deleted in the source table (for example through VACUUM), your shallow clone\n", + "may not be usable.\n", + "\n", + "- \u0007 \u0007Clones have a separate, independent history from the source table. Time travel\n", + "queries on your source table and clone may not return the same result.\n", + "\n", + "- \u0007 \u0007Shallow clones do not copy stream transactions or COPY INTO metadata. Use\n", + "deep clones to migrate your tables and continue your ETL processes from\n", + "where it left off.\n", + "\n", + "**How can I use it?**\n", + "Shallow and deep clones support new advances in how data teams test and manage\n", + "their modern cloud data lakes and warehouses. Table clones can help your team\n", + "implement production-level testing of their pipelines, fine-tune their indexing for optimal\n", + "query performance, create table copies for sharing — all with minimal overhead and\n", + "expense. If this is a need in your organization, we hope you will take table cloning for\n", + "a spin and give us your feedback — we look forward to hearing about new use cases and\n", + "extensions you would like to see in the future.\n", + "\n", + "**Additional resource**\n", + "\n", + "[Simplifying Disaster Recovery With Delta Lake](https://databricks.com/session_na20/simplifying-disaster-recovery-with-delta-lake)\n", + "\n", + "\n", + "test the data to see if it is a fit for their needs without affecting your own production\n", + "systems. Here deep clones again come to the rescue.\n", + "\n", + "-- The following code can be scheduled to run at your convenience\n", + "\n", + "CREATE OR REPLACE TABLE data_science.events CLONE prod.events;\n", + "\n", + "**Data archiving**\n", + "For regulatory or archiving purposes, all data in a table needs to be preserved for a\n", + "certain number of years, while the active table retains data for a few months. 
If you\n", + "want your data to be updated as soon as possible, but you have a requirement to keep\n", + "data for several years, storing this data in a single table and performing time travel\n", + "may become prohibitively expensive.\n", + "\n", + "In this case, archiving your data in a daily, weekly or monthly manner is a better\n", + "solution. The incremental cloning capability of deep clones will really help you here.\n", + "\n", + "-- The following code can be scheduled to run at your convenience\n", + "\n", + "CREATE OR REPLACE TABLE archive.events CLONE prod.events;\n", + "\n", + "Note that this table will have an independent history compared to the source table,\n", + "therefore, time travel queries on the source table and the clone may return different\n", + "results based on your frequency of archiving.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Enabling Spark SQL DDL**\n", + "**and DML in Delta Lake on**\n", + "**Apache Spark 3.0**\n", + "\n", + "The release of [Delta Lake 0.7.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) coincided with the release of [Apache Spark 3.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) , thus\n", + "enabling a new set of features that were simplified using Delta Lake from SQL. Here\n", + "are some of the key features.\n", + "\n", + "**Support for SQL DDL commands**\n", + "**to define tables in the** **[Hive metastore](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)**\n", + "You can now define Delta tables in the [Hive](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore) metastore and use the table name in all\n", + "SQL operations when creating (or replacing) tables.\n", + "\n", + "**Create or replace tables**\n", + "\n", + "-- Create table in the metastore\n", + "\n", + "CREATE TABLE events (\n", + "\n", + "date DATE,\n", + "\n", + "eventId STRING,\n", + "\n", + "eventType STRING,\n", + "\n", + "data STRING)\n", + "\n", + "USING DELTA\n", + "\n", + "PARTITIONED BY (date)\n", + "\n", + "LOCATION ‘/delta/events’\n", + "\n", + "-- If a table with the same name already exists, the table is replaced\n", + "\n", + "with\n", + "\n", + "the new configuration, else it i s created\n", + "\n", + "CREATE OR REPLACE TABLE events (\n", + "\n", + "\n", + "-----\n", + "\n", + "date DATE,\n", + "\n", + "eventId STRING,\n", + "\n", + "eventType STRING,\n", + "\n", + "data STRING)\n", + "\n", + "\n", + "INSERT INTO events SELECT * FROM newEvents\n", + "\n", + "-- To atomically replace all of the data in a table, you can use\n", + "\n", + "overwrite mode\n", + "\n", + "INSERT OVERWRITE events SELECT * FROM newEvents\n", + "\n", + "\n", + "USING DELTA\n", + "\n", + "\n", + "PARTITIONED BY (date)\n", + "\n", + "LOCATION ‘/delta/events’\n", + "\n", + "**Explicitly alter the table schema**\n", + "\n", + "-- Alter table and schema\n", + "\n", + "\n", + "-- Delete events\n", + "\n", + "DELETE FROM events WHERE date < ‘2017-01-01’\n", + "\n", + "-- Update events\n", + "\n", + "UPDATE events SET eventType = ‘click’ WHERE eventType = ‘click’\n", + "\n", + "\n", + "ALTER TABLE table_name ADD COLUMNS (\n", + "\n", + "\n", + "col_name data_type\n", + "\n", + "[COMMENT col_comment]\n", + "\n", + "[FIRST|AFTER colA_name],\n", + "\n", + "...)\n", + "\n", + "You can also use the Scala/Java/Python APIs:\n", + "\n", + "- DataFrame.saveAsTable(tableName) and DataFrameWriterV2\n", + "APIs ( 
[#307](https://github.com/delta-io/delta/issues/307) ).\n", + "\n", + "- DeltaTable.forName(tableName) API to create instances of\n", + "io.delta.tables.DeltaTable, which is useful for executing\n", + "Update/Delete/Merge operations in Scala/Java/Python.\n", + "\n", + "**Support for SQL Insert, Delete, Update and Merge**\n", + "One of the most frequent questions through our [Delta Lake Tech Talks](https://databricks.com/discover/diving-into-delta-lake-talks) was when\n", + "would DML operations such as delete, update and merge be available in Spark SQL?\n", + "Wait no more, these operations are now available in SQL! Below are examples of how\n", + "you can write delete, update and merge (insert, update, delete and de-duplication\n", + "operations) using Spark SQL.\n", + "\n", + "(The INSERT INTO statement shown above uses append mode, which atomically adds\n", + "new data to an existing Delta table.)\n", + "\n", + "-- Upsert data to a target Delta\n", + "\n", + "-- table using merge\n", + "\n", + "MERGE INTO events\n", + "\n", + "USING updates\n", + "\n", + "ON events.eventId = updates.eventId\n", + "\n", + "WHEN MATCHED THEN UPDATE\n", + "\n", + "SET events.data = updates.data\n", + "\n", + "WHEN NOT MATCHED THEN INSERT (date, eventId, data)\n", + "\n", + "VALUES (date, eventId, data)\n", + "\n", + "It is worth noting that the merge operation in Delta Lake supports more advanced\n", + "syntax than standard ANSI SQL syntax. For example, merge supports:\n", + "\n", + "- Delete actions -- Delete a target when matched with a source row. For example,\n", + "“... WHEN MATCHED THEN DELETE ...”\n", + "\n", + "- Multiple matched actions with clause conditions -- Greater flexibility when target\n", + "and source rows match. For example:\n", + "\n", + "...\n", + "\n", + "WHEN MATCHED AND events.shouldDelete THEN DELETE\n", + "\n", + "WHEN MATCHED THEN UPDATE SET events.data = updates.data\n", + "\n", + "\n", + "-----\n", + "\n", + "- Star syntax -- Shorthand for setting the target column value with the similarly named\n", + "source column. For example:\n", + "\n", + "WHEN MATCHED THEN UPDATE SET *\n", + "\n", + "WHEN NOT MATCHED THEN INSERT *\n", + "\n", + "-- equivalent to updating/inserting with events.date = updates.date,\n", + "\n", + "-- events.eventId = updates.eventId, events.data = updates.data\n", + "\n", + "(Table properties, covered below, can also configure behaviors such as automated\n", + "manifest generation; for example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY) , you can\n", + "block deletes and updates in a Delta table using delta.appendOnly=true .)\n", + "\n", + "**Automatic and incremental Presto/Athena manifest**\n", + "**generation**\n", + "As noted in [Query Delta Lake Tables From Presto and Athena, Improved Operations](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "[Concurrency, and Merge Performance,](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) Delta Lake supports other processing engines\n", + "to read Delta Lake by using manifest files; the manifest files contain the list of the\n", + "most current version of files as of manifest generation. 
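\n", + "\n", + "For reference, a manifest can also be generated on demand from Python with the DeltaTable.generate method. This is a minimal sketch in which the table path is a placeholder; the automatic, incremental alternative is described in the steps and the ALTER TABLE command below.\n", + "\n", + "from delta.tables import *\n", + "\n", + "# Generate a symlink-format manifest so engines such as Presto or Athena can read the table\n", + "\n", + "deltaTable = DeltaTable.forPath(spark, \"/path/to/delta-table\")\n", + "\n", + "deltaTable.generate(\"symlink_format_manifest\")\n", + "\n", + "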
As described in the preceding\n", + "chapter, you will need to:\n", + "\n", + "- Generate a Delta Lake manifest file\n", + "\n", + "- Configure Presto or Athena to read the generated manifests\n", + "\n", + "- Manually re-generate (update) the manifest file\n", + "\n", + "New for Delta Lake 0.7.0 is the capability to update the manifest file automatically\n", + "with the following command:\n", + "\n", + "ALTER TABLE delta.`pathToDeltaTable`\n", + "\n", + "SET TBLPROPERTIES(\n", + "\n", + "delta.compatibility.symlinkFormatManifest.enabled=true\n", + "\n", + ")\n", + "\n", + "**Configuring your table through table properties**\n", + "With the ability to set table properties on your table by using ALTER TABLE SET\n", + "TBLPROPERTIES, you can enable, disable or configure many features of Delta Lake\n", + "\n", + "\n", + "[-](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html)\n", + "\n", + "You can also easily control the history of your Delta Lake table retention by the\n", + "following [properties](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html) :\n", + "\n", + "- \u0007 delta.logRetentionDuration: Controls how long the history for a table\n", + "(i.e., transaction log history) is kept. By default, 30 days of history is kept, but you may\n", + "want to alter this value based on your requirements (e.g., GDPR historical context)\n", + "\n", + "- \u0007delta.deletedFileRetentionDuration: Controls how long ago a file\n", + "must have been deleted before being a candidate for VACUUM. By default, data\n", + "files older than seven days are deleted.\n", + "\n", + "As of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to\n", + "configure these properties.\n", + "\n", + "ALTER TABLE delta. `pathToDeltaTable`\n", + "\n", + "SET TBLPROPERTIES(\n", + "\n", + "delta.logRetentionDuration = “interval “\n", + "\n", + "delta.deletedFileRetentionDuration = “interval “\n", + "\n", + ")\n", + "\n", + "**Support for adding user-defined metadata**\n", + "**in Delta Lake table commits**\n", + "You can specify user-defined strings as metadata in commits made by Delta\n", + "Lake table operations, either using the DataFrameWriter option userMetadata or\n", + "the SparkSession configuration spark.databricks.delta.commitInfo.\n", + "userMetadata .\n", + "\n", + "In the following example, we are deleting a user (1xsdf1) from our data lake per user\n", + "request. To ensure we associate the user’s request with the deletion, we have also\n", + "added the DELETE request ID into the userMetadata.\n", + "\n", + "\n", + "-----\n", + "\n", + "SET spark.databricks.delta.commitInfo.userMetadata={\n", + "\n", + "“GDPR”:”DELETE Request 1x891jb23”\n", + "\n", + "\n", + "There were a lot of great questions during the AMA concerning structured streaming\n", + "and using trigger.once .\n", + "\n", + "\n", + "};\n", + "\n", + "\n", + "For more information, some good resources explaining this concept include:\n", + "\n", + "- [Running Streaming Jobs Once a Day for 10x Cost Savings](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n", + "\n", + "- [Beyond Lambda: Introducing Delta Architecture](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0) : Specifically the cost vs. 
latency\n", + "trade-off discussed here .\n", + "\n", + "**Additional resources**\n", + "\n", + "[Tech Talk | Delta Lake 0.7.0 + Spark 3.0 AMA](https://www.youtube.com/watch?v=xzKqjCB8SWU)\n", + "\n", + "[Tech Talks | Apache Spark 3.0 + Delta Lake](https://www.youtube.com/watch?v=x6RqJYqLoPI&list=PLTPXxbhUt-YWnAgh3RE8DOb46qZF57byx)\n", + "\n", + "[Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0](https://databricks.com/blog/2020/08/27/enabling-spark-sql-ddl-and-dml-in-delta-lake-on-apache-spark-3-0.html)\n", + "\n", + "\n", + "DELETE FROM user_table WHERE user_id = ‘1xsdf1’\n", + "\n", + "When reviewing the [history](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine) operations of the user table (user_table), you can easily\n", + "identify the associated deletion request within the transaction log.\n", + "\n", + "**Other highlights**\n", + "Other highlights for the Delta Lake 0.7.0 release include:\n", + "\n", + "- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop\n", + "3.2 libraries which enables support for Azure Data Lake Storage Gen2.\n", + "\n", + "- Improved support for streaming one-time triggers — With Spark 3.0, we now\n", + "ensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) ( Trigger.Once ) processes all outstanding data\n", + "in a Delta Lake table in a single micro-batch even if rate limits are set with the\n", + "DataStreamReader option maxFilesPerTrigger.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Lakehouse**\n", + "Combining the best elements of data\n", + "lakes and data warehouses\n", + "\n", + "## CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**What Is a**\n", + "**Lakehouse?**\n", + "\n", + "Over the past few years at Databricks, we’ve seen a new data management architecture\n", + "that emerged independently across many customers and use cases: the **lakehouse.**\n", + "In this chapter, we’ll describe this new architecture and its advantages over previous\n", + "approaches.\n", + "\n", + "Data warehouses have a long history of decision support and business intelligence\n", + "applications. Since its inception in the late 1980s, data warehouse technology\n", + "continued to evolve and MPP architectures led to systems that were able to handle\n", + "larger data sizes.\n", + "\n", + "But while warehouses were great for structured data, a lot of modern enterprises\n", + "have to deal with unstructured data, semi-structured data, and data with high variety,\n", + "velocity and volume. Data warehouses are not suited for many of these use cases, and\n", + "they are certainly not the most cost-efficient.\n", + "\n", + "As companies began to collect large amounts of data from many different sources,\n", + "architects began envisioning a single system to house data for many different\n", + "analytic products and workloads.\n", + "\n", + "About a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n", + "in a variety of formats. 
While suitable for storing data, data lakes lack some critical features: They do not support transactions, they do not enforce data quality, and their lack of consistency / isolation makes it almost impossible to mix appends and reads, and batch and streaming jobs. For these reasons, many of the promises of data lakes have not materialized and, in many cases, lead to a loss of many of the benefits of data warehouses.

The need for a flexible, high-performance system hasn't abated. Companies require systems for diverse data applications including SQL analytics, real-time monitoring, data science and machine learning. Most of the recent advances in AI have been in better models to process unstructured data (text, images, video, audio), but these are precisely the types of data that a data warehouse is not optimized for.

A common approach is to use multiple systems — a data lake, several data warehouses, and other specialized systems such as streaming, time-series, graph and image databases. Having a multitude of systems introduces complexity and, more importantly, introduces delay as data professionals invariably need to move or copy data between different systems.


-----

**A lakehouse combines the best elements**
**of data lakes and data warehouses**
A lakehouse is a new data architecture that combines the best elements of data lakes and data warehouses.

Lakehouses are enabled by a new system design: implementing similar data structures and data management features to those in a data warehouse, directly on the kind of low-cost storage used for data lakes. They are what you would get if you had to redesign data warehouses in the modern world, now that cheap and highly reliable storage (in the form of object stores) is available.

A lakehouse has the following key features:

- **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing data concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently read or write data, typically using SQL.


-----

- **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement and evolution, supporting DW schema paradigms such as star/snowflake-schemas. The system should be able to [reason about data integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html), and it should have robust governance and auditing mechanisms.

- **BI support:** Lakehouses enable using BI tools directly on the source data. This reduces staleness and improves recency, reduces latency and lowers the cost of having to operationalize two copies of the data in both a data lake and a warehouse.

- **Storage is decoupled from compute:** In practice, this means storage and compute use separate clusters, thus these systems are able to scale to many more concurrent users and larger data sizes. Some modern data warehouses also have this property.

- **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide an API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently access the data directly.

- **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be used to store, refine, analyze and access data types needed for many new data applications, including images, video, audio, semi-structured data, and text.

- **Support for diverse workloads:** Including data science, machine learning and SQL analytics. Multiple tools might be needed to support all these workloads, but they all rely on the same data repository.

- **End-to-end streaming:** Real-time reports are the norm in many enterprises. Support for streaming eliminates the need for separate systems dedicated to serving real-time data applications.

These are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools for security and access control are basic requirements. Data governance capabilities including auditing, retention and lineage have become essential, particularly in light of recent privacy regulations. Tools that enable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse, such enterprise features only need to be implemented, tested and administered for a single system.


-----

**Read the research**
**Delta Lake: High-Performance ACID**
**Table Storage Over Cloud Object Stores**

**Abstract**
Cloud object stores such as Amazon S3 are some of the largest and most cost-effective storage systems on the planet, making them an attractive target to store large data warehouses and data lakes. Unfortunately, their implementation as key-value stores makes it difficult to achieve ACID transactions and high performance: Metadata operations, such as listing objects, are expensive, and consistency guarantees are limited. In this paper, we present Delta Lake, an open source ACID table storage layer over cloud object stores initially developed at Databricks. Delta Lake uses a transaction log that is compacted into Apache Parquet format to provide ACID properties, time travel, and significantly faster metadata operations for large tabular data sets (e.g., the ability to quickly search billions of table partitions for those relevant to a query). It also leverages this design to provide high-level features such as automatic data layout optimization, upserts, caching, and audit logs. Delta Lake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and other systems. Delta Lake is deployed at thousands of Databricks customers that process exabytes of data per day, with the largest instances managing exabyte-scale data sets and billions of objects.

Authors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu, Mukul Murthy, Joseph Torres, Herman van Hövell, Adrian Ionescu, Alicja Łuszczak, Michał Szafrański, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia

Read the full research paper on the [inner workings of the lakehouse](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores).


-----

**Some early examples**
The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse. Microsoft's [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html), enables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and [Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are examples that focus primarily on BI and other SQL applications.

Companies that want to build and implement their own systems have access to open source file formats (Delta Lake, [Apache Iceberg](https://iceberg.apache.org), [Apache Hudi](https://hudi.apache.org)) that are suitable for building a lakehouse.

Merging data lakes and data warehouses into a single system means that data teams can move faster as they are able to use data without needing to access multiple systems. The level of SQL support and integration with BI tools among these early lakehouses is generally sufficient for most enterprise data warehouses. Materialized views and stored procedures are available, but users may need to employ other mechanisms that aren't equivalent to those found in traditional data warehouses. The latter is particularly important for "[lift and shift scenarios](https://whatis.techtarget.com/definition/lift-and-shift)," which require systems that achieve semantics that are almost identical to those of older, commercial data warehouses.

What about support for other types of data applications? Users of a lakehouse have access to a variety of standard tools ([Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service), Python, R, machine learning libraries) for non-BI workloads like data science and machine learning. Data exploration and refinement are standard for many analytic and data science applications. Delta Lake is designed to let users incrementally improve the quality of data in their lakehouse until it is ready for consumption.

A note about technical building blocks.
While distributed file systems can be\n", + "used for the storage layer, object stores are more commonly used in lakehouses.\n", + "Object stores provide low-cost, highly available storage that excels at massively\n", + "parallel reads — an essential requirement for modern data warehouses.\n", + "\n", + "**From BI to AI**\n", + "The lakehouse is a new data management architecture that radically simplifies\n", + "enterprise data infrastructure and accelerates innovation in an age when\n", + "machine learning is poised to disrupt every industry. In the past, most of the\n", + "data that went into a company’s products or decision-making was structured\n", + "data from operational systems, whereas today, many products incorporate\n", + "AI in the form of computer vision and speech models, text mining and others.\n", + "Why use a lakehouse instead of a data lake for AI? A lakehouse gives you data\n", + "versioning, governance, security and ACID properties that are needed even for\n", + "unstructured data.\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specialized\n", + "systems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "and other issues will be addressed as the technology continues to mature and\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "diverse data applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "paper that describes some of the core technological challenges and solutions that\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. You\n", + "can read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "storage, this pattern has been playing out for years. 
Vendors continue to try to reinvent\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "object stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,\n", + "performance is hampered by expensive metadata operations (e.g., listing objects)\n", + "and limited consistency guarantees.\n", + "\n", + "Based on the characteristics of cloud object stores, three approaches have emerged.\n", + "\n", + "**1. Data lakes**\n", + "The first is directories of files (i.e., data lakes) that store the table as a collection\n", + "of objects, typically in columnar format such as Apache Parquet. It’s an attractive\n", + "approach because the table is just a group of objects that can be accessed from\n", + "a wide variety of tools without a lot of additional data stores or systems. However,\n", + "both performance and consistency problems are common. Hidden data corruption\n", + "is common due to failed transactions, eventual consistency leads to inconsistent\n", + "queries, latency is high, and basic management capabilities like table versioning and\n", + "audit logs are unavailable.\n", + "\n", + "**2. Custom storage engines**\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. Finally, and most egregiously, the proprietary metadata service locks\n", + "customers into a specific service provider, leaving customers to contend with\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "adopt a new approach later.\n", + "\n", + "**3. Lakehouse**\n", + "With Delta Lake, an open source ACID table storage layer atop cloud object stores,\n", + "we sought to build a car instead of a faster horse with not just a better data store,\n", + "but a fundamental change in how data is stored and used via the lakehouse. A\n", + "lakehouse is a new architecture that combines the best elements of data lakes and\n", + "data warehouses. Lakehouses are enabled by a new system design: implementing\n", + "similar data structures and data management features to those in a data warehouse,\n", + "directly on the kind of low-cost storage used for data lakes. 
They are what you would\n", + "get if you had to redesign storage engines in the modern world, now that cheap and\n", + "highly reliable storage (in the form of object stores) are available.\n", + "\n", + "Delta Lake maintains information about which objects are part of a Delta table in an\n", + "ACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n", + "the cloud object store. This design allows clients to update multiple objects at once,\n", + "replace a subset of the objects with another, etc., in a serializable manner that still\n", + "achieves high parallel read/write performance from the objects. The log also provides\n", + "significantly faster metadata operations for large tabular data sets. Additionally, Delta\n", + "Lake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n", + "snapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n", + "caching, and audit logs. Together, these features improve both the manageability and\n", + "performance of working with data in cloud object stores, ultimately opening the door\n", + "to the lakehouse architecture that combines the key features of data warehouses and\n", + "data lakes to create a better, simpler data architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "Today, Delta Lake is used across thousands of Databricks customers, processing\n", + "exabytes of structured and unstructured data each day, as well as many organizations\n", + "in the open source community. These use cases span a variety of data sources and\n", + "applications. The data types stored include Change Data Capture (CDC) logs from\n", + "enterprise OLTP systems, application logs, time-series data, graphs, aggregate\n", + "tables for reporting, and image or feature data for machine learning. The applications\n", + "include SQL workloads (most commonly), business intelligence, streaming, data\n", + "science, machine learning and graph analytics. Overall, Delta Lake has proven itself to\n", + "be a good fit for most data lake applications that would have used structured storage\n", + "formats like Parquet or ORC, and many traditional data warehousing workloads.\n", + "\n", + "Across these use cases, we found that customers often use Delta Lake to significantly\n", + "simplify their data architecture by running more workloads directly against cloud\n", + "object stores, and increasingly, by creating a lakehouse with both data lake and\n", + "transactional features to replace some or all of the functionality provided by message\n", + "queues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n", + "Amazon Redshift).\n", + "\n", + "**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n", + "\n", + "- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performance\n", + "\n", + "Through the paper, you’ll gain a better understanding of Delta Lake and how it\n", + "enables a wide range of DBMS-like performance and management features for data\n", + "held in low-cost cloud storage. 
As well as how the Delta Lake storage format and\n", + "access protocols make it simple to operate, highly available, and able to deliver highbandwidth access to the object store.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding**\n", + "**Delta Engine**\n", + "\n", + "The Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n", + "engine to take advantage of modern CPU architecture with optimizations to Spark\n", + "3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n", + "Runtime 7.0. Together, these features significantly accelerate query performance on\n", + "data lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n", + "adopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n", + "\n", + "**Scaling execution performance**\n", + "One of the big hardware trends over the last several years is that CPU clock speeds\n", + "have plateaued. The reasons are outside the scope of this chapter, but the takeaway\n", + "is that we have to find new ways to process data faster beyond raw compute power.\n", + "One of the most impactful methods has been to improve the amount of data that can\n", + "be processed in parallel. However, data processing engines need to be specifically\n", + "architected to take advantage of this parallelism.\n", + "\n", + "In addition, data teams are being given less and less time to properly model data as\n", + "the pace of business increases. Poorer modeling in the interest of better business\n", + "agility drives poorer query performance. Naturally, this is not a desired state, and\n", + "organizations want to find ways to maximize both agility and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Announcing Delta Engine for**\n", + "**high-performance query execution**\n", + "Delta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n", + "workloads through three components: an improved query optimizer, a caching\n", + "layer that sits between the execution layer and the cloud object storage, and a native\n", + "vectorized execution engine that’s written in C++.\n", + "\n", + "The improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n", + "optimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n", + "statistics to deliver up to 18x increased performance in star schema workloads.\n", + "\n", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "modern cloud hardware. 
It brings performance improvements to all workload types\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "By linking these three components together, we think it will be easier for customers\n", + "to understand how improvements in multiple places within the Databricks code\n", + "aggregate into significantly faster performance for analytics workloads on data lakes.\n", + "\n", + "We’re excited about the value that Delta Engine delivers to our customers. While the\n", + "time and cost savings are already valuable, its role in the lakehouse pattern supports\n", + "new advances in how data teams design their data architectures for increased\n", + "unification and simplicity.\n", + "\n", + "For more information on the Delta Engine, watch this keynote address from\n", + "[Spark + AI Summit 2020: Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming**\n", + "Using Delta Lake to express\n", + "computation on streaming data\n", + "\n", + "## CHAPTER 04\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake Solves Common**\n", + "**Pain Points in Streaming**\n", + "\n", + "The pain points of a traditional streaming and data warehousing solution can be\n", + "broken into two groups: data lake and data warehouse pains.\n", + "\n", + "**Data lake pain points**\n", + "While data lakes allow you to flexibly store an immense amount of data in a file system,\n", + "there are many pain points including (but not limited to):\n", + "\n", + "- Consolidation of streaming data from many disparate systems is difficult.\n", + "\n", + "- Updating data in a data lake is nearly impossible, and much of the streaming\n", + "data needs to be updated as changes are made. This is especially important in\n", + "scenarios involving financial reconciliation and subsequent adjustments.\n", + "\n", + "- Query speeds for a data lake are typically very slow.\n", + "\n", + "- Optimizing storage and file sizes is very difficult and often requires complicated logic.\n", + "\n", + "**Data warehouse pain points**\n", + "The power of a data warehouse is that you have a persistent performant store of your\n", + "data. But the pain points for building modern continuous applications include (but are\n", + "not limited to):\n", + "\n", + "- Constrained to SQL queries (i.e., no machine learning or advanced analytics).\n", + "\n", + "- Accessing streaming data and stored data together is very difficult, if at all possible.\n", + "\n", + "- Data warehouses do not scale very well.\n", + "\n", + "- Tying compute and storage together makes using a warehouse very expensive.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How Delta Lake on Databricks solves these issues**\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) is a unified data management system that brings data reliability and\n", + "performance optimizations to cloud data lakes. 
More succinctly, Delta Lake combines the advantages of data lakes and data warehouses with Apache Spark™ to allow you to do incredible things.

- Delta Lake, along with Structured Streaming, makes it possible to analyze streaming and historical data together at high speeds.

- When Delta Lake tables are used as sources and destinations of streaming big data, it is easy to consolidate disparate data sources.

- Upserts are supported on Delta Lake tables.

- Delta Lake is ACID compliant, making it easy to create a compliant data solution.

- Easily include machine learning scoring and advanced analytics into ETL and queries.

- Decouples compute and storage for a completely scalable solution.

In the following use cases, we'll share what this looks like in practice.


-----

**Simplifying Streaming Stock**
**Data Analysis Using Delta Lake**

Real-time analysis of stock data is a complicated endeavor. After all, there are many challenges in maintaining a streaming system and ensuring transactional consistency of legacy and streaming data concurrently.

Thankfully, [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) helps solve many of the pain points of building a streaming system to analyze stock data in real time. In this section, we'll share how to simplify the streaming of stock data analysis using Delta Lake.

In the following diagram, you can see a high-level architecture that simplifies this problem. We start by ingesting two different sets of data into two Delta Lake tables. The two data sets are stock prices and fundamentals.

After ingesting the data into their respective tables, we then join the data in an ETL process and write the data out into a third Delta Lake table for downstream analysis.

Delta Lake helps solve these problems by combining the scalability, streaming and access to the advanced analytics of Apache Spark with the performance and ACID compliance of a data warehouse.


-----

**Implement your streaming**
**stock analysis solution with Delta Lake**
Delta Lake and Apache Spark do most of the work for our solution; you can try out the full [notebook](https://pages.databricks.com/rs/094-YMS-629/images/streaming-stock-data-analysis-setup.html) and follow along with the code samples below.

As noted in the preceding diagram, we have two data sets to process — one for fundamentals and one for price data. To create our two Delta Lake tables, we specify the .format('delta') against our Databricks File System ([DBFS](https://docs.databricks.com/data/databricks-file-system.html)) locations.

# Create Fundamental Data (Databricks Delta table)
dfBaseFund = spark \
  .read \
  .format('delta') \
  .load('/delta/stocksFundamentals')

# Create Price Data (Databricks Delta table)
dfBasePrice = spark \
  .read \
  .format('delta') \
  .load('/delta/stocksDailyPrices')


-----

While we're updating the stockFundamentals and stocksDailyPrices, we will consolidate this data through a series of ETL jobs into a consolidated view (stocksDailyPricesWFund).

With the following code snippet, we can determine the start and end date of available data and then combine the price and fundamentals data for that date range into DBFS.

import datetime
from pyspark.sql import functions as func

# Determine start and end date of available data
row = dfBasePrice.agg(
  func.max(dfBasePrice.price_date).alias("maxDate"),
  func.min(dfBasePrice.price_date).alias("minDate")
).collect()[0]
startDate = row["minDate"]
endDate = row["maxDate"]

# Define our date range function
def daterange(start_date, end_date):
  for n in range(int((end_date - start_date).days)):
    yield start_date + datetime.timedelta(n)

# Define combinePriceAndFund information by date
def combinePriceAndFund(theDate):
  dfFund = dfBaseFund.where(dfBaseFund.price_date == theDate)
  dfPrice = dfBasePrice.where(
    dfBasePrice.price_date == theDate
  ).drop('price_date')
  # Drop the updated column
  dfPriceWFund = dfPrice.join(dfFund, ['ticker']).drop('updated')
  # Save data to DBFS
  dfPriceWFund \
    .write \
    .format('delta') \
    .mode('append') \
    .save('/delta/stocksDailyPricesWFund')

# Loop through dates to complete fundamentals + price ETL process
for single_date in daterange(
  startDate, (endDate + datetime.timedelta(days=1))
):
  print('Starting ' + single_date.strftime('%Y-%m-%d'))
  start = datetime.datetime.now()
  combinePriceAndFund(single_date)
  end = datetime.datetime.now()
  print(end - start)

Now we have a stream of consolidated fundamentals and price data that is being pushed into [DBFS](https://docs.databricks.com/data/databricks-file-system.html) in the /delta/stocksDailyPricesWFund location. We can build a Delta Lake table by specifying .format("delta") against that DBFS location.

dfPriceWithFundamentals = spark \
  .readStream \
  .format("delta") \
  .load("/delta/stocksDailyPricesWFund")

# Create temporary view of the data
dfPriceWithFundamentals.createOrReplaceTempView("priceWithFundamentals")


-----

Now that we have created our initial Delta Lake table, let's create a view that will allow us to calculate the price/earnings ratio in real time (because of the underlying streaming data updating our Delta Lake table).

%sql
CREATE OR REPLACE TEMPORARY VIEW viewPE AS
select ticker,
  price_date,
  first(close) as price,
  (close/eps_basic_net) as pe
from priceWithFundamentals
where eps_basic_net > 0
group by ticker, price_date, pe

**Analyze streaming stock data in real time**
With our view in place, we can quickly analyze our data using Spark SQL.

%sql
select *
from viewPE
where ticker == "AAPL"
order by price_date


-----

As the underlying source of this consolidated data set is a Delta Lake table, this view isn't just showing the batch data but also any new streams of data that are coming in as per the following streaming dashboard.

Underneath the covers, Structured Streaming isn't just writing the data to Delta Lake tables but also keeping the state of the distinct number of keys (in this case ticker symbols) that need to be tracked.

Because you are using Spark SQL, you can execute aggregate queries at scale and in real time.

%sql
SELECT ticker, AVG(close) as Average_Close
FROM priceWithFundamentals
GROUP BY ticker
ORDER BY Average_Close

In closing, we demonstrated how to simplify streaming stock data analysis using [Delta Lake](https://databricks.com/product/delta-lake-on-databricks). By combining Spark Structured Streaming and Delta Lake, we can use the Databricks integrated workspace to create a performant, scalable solution that has the advantages of both data lakes and data warehouses.

The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) removes the data engineering complexities commonly associated with streaming and transactional consistency, enabling data engineering and data science teams to focus on understanding the trends in their stock data.


-----

**How Tilting Point Does Streaming**
**Ingestion Into Delta Lake**

Tilting Point is a new-generation games partner that provides top development studios with expert resources, services and operational support to optimize high-quality live games for success. Through its user acquisition fund and its
Through its user acquisition fund and its\n", + "world-class technology platform, Tilting Point funds and runs performance\n", + "marketing management and live games operations to help developers achieve\n", + "profitable scale.\n", + "\n", + "By leveraging Delta Lake, Tilting Point is able to leverage quality data and make\n", + "it readily available for analytics to improve the business. Diego Link, VP of\n", + "Engineering at Tilting Point, provided insights for this use case.\n", + "\n", + "The team at Tilting Point was running daily and hourly batch jobs for reporting on\n", + "game analytics. They wanted to make their reporting near real-time, getting insights\n", + "within 5–10 minutes.\n", + "\n", + "They also wanted to make their in-game LiveOps decisions based on real-time player\n", + "behavior for giving real-time data to a bundles-and-offer system, provide up-to-theminute alerting on LiveOPs changes that actually might have unforeseen detrimental\n", + "effects and even alert on service interruptions in game operations. The goal was to\n", + "ensure that the game experience was as robust as possible for their players.\n", + "\n", + "Additionally, they had to store encrypted Personally Identifiable Information (PII) data\n", + "separately in order to maintain GDPR compliance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How data flows and associated challenges**\n", + "Tilting Point has a proprietary software development kit that developers integrate\n", + "with to send data from game servers to an ingest server hosted in AWS. This service\n", + "removes all PII data and then sends the raw data to an Amazon Firehose endpoint.\n", + "Firehose then dumps the data in JSON format continuously to S3.\n", + "\n", + "To clean up the raw data and make it available quickly for analytics, the team\n", + "considered pushing the continuous data from Firehose to a message bus (e.g.,\n", + "Kafka, Kinesis) and then using [Apache Spark’s Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) to continuously\n", + "process data and write to Delta Lake tables.\n", + "\n", + "While that architecture sounds ideal for low latency requirements of processing\n", + "data in seconds, Tilting Point didn’t have such low latency needs for their ingestion\n", + "pipeline. They wanted to make the data available for analytics in a few minutes, not\n", + "seconds. Hence they decided to simplify our architecture by eliminating a message\n", + "bus and instead use S3 as a continuous source for their structured streaming job.\n", + "\n", + "But the key challenge in using S3 as a continuous source is identifying files that\n", + "changed recently.\n", + "\n", + "Listing all files every few minutes has two major issues:\n", + "\n", + "- **Higher latency:** Listing all files in a directory with a large number of files has high\n", + "overhead and increases processing time.\n", + "\n", + "- **Higher cost:** Listing lots of files every few minutes can quickly add to the S3 cost.\n", + "\n", + "**Leveraging Structured Streaming with blob store as**\n", + "**source and Delta Lake tables as sink**\n", + "To continuously stream data from cloud blob storage like S3, Tilting Point uses\n", + "[Databricks’ S3-SQS source](https://docs.databricks.com/spark/latest/structured-streaming/sqs.html#optimized-s3-file-source-with-sqs) . 
The S3-SQS source provides an easy way to incrementally stream data from S3 without the need to write any state management code on what files were recently processed.


-----

This is how Tilting Point's ingestion pipeline looks:

- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information to SQS via SNS.

- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3-SQS source reads the new file names that arrived in S3 from SQS and uses that information to read the actual file contents in S3. Example code is shown below:

spark.readStream \
  .format("s3-sqs") \
  .option("fileFormat", "json") \
  .option("queueUrl", ...) \
  .schema(...) \
  .load()

- Tilting Point's structured streaming job then cleans up and transforms the data. Based on the game data, the streaming job uses the foreachBatch API of Spark streaming and writes to 30 different Delta Lake tables.

- The streaming job produces lots of small files. This affects performance of downstream consumers. So, an optimize job runs daily to compact small files in the table and store them at the right file sizes so that consumers of the data have good performance while reading the data from Delta Lake tables. Tilting Point also runs a weekly optimize job for a second round of compaction.

Architecture showing continuous data ingest into Delta Lake tables
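The chapter does not show the routing logic itself, so here is a minimal sketch of what a foreachBatch function fanning one micro-batch out to several Delta Lake tables could look like; the stream, event types and table names are hypothetical placeholders, not Tilting Point's actual code.

# Hypothetical sketch: route one micro-batch to several Delta Lake tables.
from pyspark.sql.functions import col

EVENT_TYPES = ["session_start", "level_complete", "purchase"]  # placeholder event types

def write_to_delta_tables(microbatch_df, batch_id):
    # Each filtered slice of the micro-batch is appended to its own Delta table
    for event_type in EVENT_TYPES:
        (microbatch_df
            .filter(col("event_type") == event_type)
            .write
            .format("delta")
            .mode("append")
            .saveAsTable("game_events_" + event_type))

# cleaned_stream_df stands in for the cleaned-up streaming DataFrame
(cleaned_stream_df.writeStream
    .foreachBatch(write_to_delta_tables)
    .option("checkpointLocation", "/checkpoints/game_events")
    .start())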

-----

The above Delta Lake ingestion architecture helps in the following ways:

- **Incremental loading:** The S3-SQS source incrementally loads the new files in S3. This helps quickly process the new files without too much overhead in listing files.

- **No explicit file state management:** There is no explicit file state management needed to look for recent files.

- **Lower operational burden:** Since we use S3 as a checkpoint between Firehose and Structured Streaming jobs, the operational burden to stop streams and reprocess data is relatively low.

- **Reliable ingestion:** Delta Lake uses [optimistic concurrency control](https://docs.databricks.com/delta/optimizations/isolation-level.html) to offer ACID transactional guarantees. This helps with reliable data ingestion.

- **File compaction:** One of the major problems with streaming ingestion is tables ending up with a large number of small files that can affect read performance. Before Delta Lake, we had to set up a different table to write the compacted data. With Delta Lake, thanks to ACID transactions, we can compact the files and rewrite the data back to the same table safely.

- **Snapshot isolation:** Delta Lake's snapshot isolation allows us to expose the ingestion tables to downstream consumers while data is being appended by a streaming job and modified during compaction.

- **Rollbacks:** In case of bad writes, [Delta Lake's Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) helps us roll back to a previous version of the table.

In this section, we walked through Tilting Point's use cases and how they do streaming ingestion using Databricks' S3-SQS source into Delta Lake tables efficiently, without too much operational overhead, to make good quality data readily available for analytics.


-----

**Building a Quality of Service**
**Analytics Solution for Streaming**
**Video Services**

As traditional pay TV [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/), content owners have embraced direct-to-consumer (D2C) subscription and ad-supported streaming for monetizing their libraries of content. For companies whose entire business model revolved around producing great content, which they then licensed to distributors, the shift to now owning the entire glass-to-glass experience has required new capabilities, such as building media supply chains for content delivery to consumers, supporting apps for a myriad of devices and operating systems, and performing customer relationship functions like billing and customer service.

With most services renewing on a monthly basis, subscription service operators need to prove value to their subscribers at all times. General quality of streaming video issues (encompassing buffering, latency, pixelation, jitter, packet loss and the blank screen) have significant business impacts, whether it's increased [subscriber churn](https://www.streamingmedia.com/Articles/ReadArticle.aspx?ArticleID=112209) or [decreased video engagement](https://www.tvtechnology.com/opinions/why-buffering-remains-every-video-providers-worst-nightmare).

When you start streaming, you realize there are so many places where breaks can happen and the viewer experience can suffer. There may be an issue at the source in the servers on-premises or in the cloud; in transit at either the CDN level or ISP level or the viewer's home network; or at the playout level with player/client issues. What breaks at n x 10⁴ concurrent streamers is different from what breaks at n x 10⁵ or n x 10⁶. There is no pre-release testing that can quite replicate real-world users and their ability to push even the most redundant systems to their breaking point as they channel surf, click in and out of the app, sign on from different devices simultaneously and so on. And because of the nature of TV, things will go wrong during the most important, high-profile events drawing the largest audiences. If you start [receiving complaints on social media](https://downdetector.com/), how can you tell if they are unique to that one user or rather regional or a national issue?
If national, is it across all devices or only certain\n", + "types (e.g., possibly the OEM updated the OS on an older device type, which ended up\n", + "causing compatibility issues with the client)?\n", + "\n", + "Identifying, remediating and preventing viewer quality of experience issues becomes\n", + "a big data problem when you consider the number of users, the number of actions\n", + "they are taking and the number of handoffs in the experience (servers to CDN to ISP to\n", + "home network to client). Quality of Service (QoS) helps make sense of these streams\n", + "of data so you can understand what is going wrong, where and why. Eventually you\n", + "can get into predictive analytics around what could go wrong and how to remediate\n", + "it before anything breaks.\n", + "\n", + "**Databricks Quality of Service solution overview**\n", + "The aim of this solution is to provide the core for any streaming video platform that\n", + "wants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n", + "[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\n", + "a Unified Data Analytics Platform for both the real-time insights and the advanced\n", + "analytics capabilities.\n", + "\n", + "[By using Databricks](https://databricks.com/customers) , streaming platforms can get faster insights by always\n", + "leveraging the most complete and recent data sets powered by robust and reliable\n", + "data pipelines. This decreases time to market for new features by accelerating\n", + "data science using a collaborative environment. It provides support for managing\n", + "the end-to-end machine learning lifecycle and reduces operational costs across\n", + "all cycles of software development by having a unified platform for both data\n", + "engineering and data science.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Video QoS solution architecture**\n", + "With complexities like low-latency monitoring alerts and highly scalable infrastructure\n", + "required for peak video traffic hours, the straightforward architectural choice was\n", + "the Delta Architecture — both standard big data architectures like Lambda and Kappa\n", + "Architectures have disadvantages around the operational effort required to maintain\n", + "multiple types of pipelines (streaming and batch) and lack support for a unified data\n", + "engineering and data science approach.\n", + "\n", + "The Delta Architecture is the next-generation paradigm that enables all the data\n", + "personas in your organization to be more productive:\n", + "\n", + "- Data engineers can develop data pipelines in a cost-efficient manner\n", + "continuously without having to choose between batch and streaming\n", + "\n", + "- Data analysts can get near real-time insights and faster answers to their BI queries\n", + "\n", + "- Data scientists can develop better machine learning models using more reliable data\n", + "sets with support for time travel that facilitates reproducible experiments and reports Delta Architecture using the “multi-hop” approach for data pipelines\n", + "\n", + "\n", + "-----\n", + "\n", + "Writing data pipelines using the Delta Architecture follows the best practices of\n", + "having a multi-layer “multi-hop” approach where we progressively add structure to\n", + "data: “Bronze” tables or Ingestion tables are usually raw data sets in the native format\n", + "(JSON, 
CSV or txt), “Silver” tables represent cleaned/transformed data sets ready for\n", + "reporting or data science, and “Gold” tables are the final presentation layer.\n", + "\n", + "For the pure streaming use cases, the option of materializing the DataFrames in\n", + "intermediate Delta Lake tables is basically just a trade-off between latency/SLAs and\n", + "cost (an example being real-time monitoring alerts vs. updates of the recommender\n", + "system based on new content).\n", + "\n", + "A streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n", + "\n", + "The number of “hops” in this approach is directly impacted by the number of consumers\n", + "downstream, complexity of the aggregations (e.g., Structured Streaming enforces\n", + "certain limitations around chaining multiple aggregations) and the maximization of\n", + "operational efficiency.\n", + "\n", + "The QoS solution architecture is focused around best practices for data processing\n", + "and is not a full video-on-demand (VoD) solution — with some standard components\n", + "like the “front door” service Amazon API Gateway being avoided from the high-level\n", + "architecture in order to keep the focus on data and analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "High-level architecture for the QoS platform\n", + "\n", + "\n", + "**Making your data ready for analytics**\n", + "Both sources of data included in the QoS solution (application events and CDN logs)\n", + "are using the JSON format, great for data exchange — allowing you to represent\n", + "complex nested structures, but not scalable and difficult to maintain as a storage\n", + "format for your data lake / analytics system.\n", + "\n", + "\n", + "In order to make the data directly queryable across the entire organization, the\n", + "Bronze to Silver pipeline (the “make your data available to everyone” pipeline) should\n", + "transform any raw formats into Delta Lake and include all the quality checks or data\n", + "masking required by any regulatory agencies.\n", + "\n", + "\n", + "-----\n", + "\n", + "Raw format of the app events\n", + "\n", + "**Video applications events**\n", + "Based on the architecture, the video application events are pushed directly to\n", + "Kinesis Streams and then just ingested to a Delta Lake append-only table without\n", + "any changes to the schema.\n", + "\n", + "Using this pattern allows a high number of consumers downstream to process the\n", + "data in a streaming paradigm without having to scale the throughput of the Kinesis\n", + "stream. As a side effect of using a Delta Lake table as a sink (which supports [optimize](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-optimize.html) !),\n", + "we don’t have to worry about the way the size of the processing window will impact the\n", + "number of files in your target table — known as the “small files” issue in the big data world.\n", + "\n", + "Both the timestamp and the type of message are being extracted from the JSON\n", + "event in order to be able to partition the data and allow consumers to choose the\n", + "type of events they want to process. 
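To make the ingestion pattern concrete, below is a minimal sketch of an append-only Bronze ingestion from Kinesis, assuming the Databricks Kinesis connector; the stream name, JSON fields and paths are placeholders rather than the solution's exact code.

# Hypothetical sketch: append raw player events from Kinesis to a Bronze Delta table,
# partitioned by the event type and date extracted from the JSON payload.
from pyspark.sql.functions import col, get_json_object, to_date

raw_events = (spark.readStream
    .format("kinesis")                           # Databricks Kinesis connector
    .option("streamName", "qos-player-events")   # placeholder stream name
    .option("region", "us-east-1")
    .option("initialPosition", "latest")
    .load())

bronze_events = (raw_events
    .selectExpr("CAST(data AS STRING) AS json")  # Kinesis payload arrives as binary
    .withColumn("eventType", get_json_object(col("json"), "$.type"))
    .withColumn("eventDate", to_date(get_json_object(col("json"), "$.timestamp"))))

(bronze_events.writeStream
    .format("delta")
    .partitionBy("eventType", "eventDate")
    .option("checkpointLocation", "/checkpoints/bronze_player_events")
    .start("/delta/bronze_player_events"))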
Again combining a single Kinesis stream for the events with a Delta Lake "Events" table reduces the operational complexity while making things easier for scaling during peak hours.

All the details are extracted from JSON for the Silver table


-----

**CDN logs**
The CDN logs are delivered to S3, so the easiest way to process them is the Databricks Auto Loader, which incrementally and efficiently processes new data files as they arrive in S3 without any additional setup.

from pyspark.sql.functions import col

# ip_anonymizer and map_ip_to_location are UDFs defined elsewhere in the solution
auto_loader_df = spark.readStream.format("cloudFiles") \
  .option("cloudFiles.format", "json") \
  .option("cloudFiles.region", region) \
  .load(input_location)

anonymized_df = auto_loader_df.select('*', ip_anonymizer('requestip').alias('ip')) \
  .drop('requestip') \
  .withColumn("origin", map_ip_to_location(col('ip')))

anonymized_df.writeStream \
  .option('checkpointLocation', checkpoint_location) \
  .format('delta') \
  .table(silver_database + '.cdn_logs')

As the logs contain IPs — considered personal data under the GDPR regulations — the "make your data available to everyone" pipeline has to include an anonymization step. Different techniques can be used, but we decided to just strip the last octet from IPv4 and the last 80 bits from IPv6. On top, the data set is also enriched with information around the origin country and the ISP provider, which will be used later in the Network Operation Centers for localization.


-----

**Creating the Dashboard /**
**Virtual Network Operation Centers**
Streaming companies need to monitor network performance and the user experience as near real-time as possible, tracking down to the individual level with the ability to abstract at the segment level, easily defining new segments such as those defined by geos, devices, networks and/or current and historical viewing behavior.

For streaming companies, that has meant adopting the concept of Network Operation Centers (NOC) from telco networks for monitoring the health of the streaming experience for their users at a macro level, flagging and responding to any issues early on. At their most basic, NOCs should have dashboards that compare the current experience for users against a performance baseline so that the product teams can quickly and easily identify and attend to any service anomalies.

In the QoS solution we have incorporated a [Databricks dashboard](https://docs.databricks.com/notebooks/dashboards.html). BI tools can also be effortlessly connected in order to build more complex visualizations, but based on customer feedback, built-in dashboards are, most of the time, the fastest way to present the insights to business users.

The aggregated tables for the NOC will basically be the Gold layer of our Delta Architecture — a combination of CDN logs and the application events.
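As an illustration only, the sketch below shows what a scheduled Gold-layer aggregation over hypothetical Silver tables could look like; the table names, columns and metrics are assumptions, not the solution's actual schema.

# Hypothetical sketch of a scheduled Gold-layer aggregation for the NOC dashboard.
from pyspark.sql import functions as F

app_events = spark.table("silver.player_events")   # placeholder Silver tables
cdn_logs = spark.table("silver.cdn_logs")

quality_by_origin = (app_events
    .where(F.col("eventType").isin("buffering", "playback_error"))
    .groupBy(F.window("eventTime", "5 minutes").alias("window"), "origin", "eventType")
    .count())

latency_by_origin = (cdn_logs
    .groupBy(F.window("timestamp", "5 minutes").alias("window"), "origin")
    .agg(F.avg("timetofirstbyte").alias("avg_ttfb")))

# Combine quality and latency metrics into one Gold table for the dashboard
(quality_by_origin.join(latency_by_origin, ["window", "origin"], "outer")
    .write
    .format("delta")
    .mode("overwrite")
    .saveAsTable("gold.noc_dashboard"))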
Example of Network Operations Center dashboard\n", + "\n", + "\n", + "-----\n", + "\n", + "The dashboard is just a way to visually package the results of SQL queries or Python\n", + "/ R transformation — each notebook supports multiple dashboards so in case of\n", + "multiple end users with different requirements we don’t have to duplicate the code —\n", + "as a bonus the refresh can also be scheduled as a Databricks job.\n", + "\n", + "Visualization of the results of a SQL query\n", + "\n", + "Loading time for videos (time to first frame) allows better understanding of the\n", + "performance for individual locations of your CDN — in this case the AWS CloudFront\n", + "Edge nodes — which has a direct impact in your strategy for improving this KPI —\n", + "either by spreading the user traffic over multi-CDNs or maybe just implementing a\n", + "dynamic origin selection in case of AWS CloudFront using Lambda@Edge.\n", + "\n", + "\n", + "-----\n", + "\n", + "Failure to understand the reasons for high levels of buffering — and the poor video\n", + "quality experience that it brings — has a significant impact on subscriber churn rate.\n", + "On top of that, advertisers are not willing to spend money on ads responsible for\n", + "reducing the viewer engagement — as they add extra buffering on top, so the profits\n", + "on the advertising business usually are impacted too. In this context, collecting as\n", + "much information as possible from the application side is crucial to allow the analysis\n", + "to be done not only at video level but also browser or even type / version of application.\n", + "\n", + "On the content side, events for the application can provide useful information about\n", + "user behavior and overall quality of experience. How many people that paused a video\n", + "have actually finished watching that episode / video? What caused the stoppage: The\n", + "quality of the content or delivery issues? Of course, further analyses can be done by\n", + "linking all the sources together (user behavior, performance of CDNs /ISPs) to not only\n", + "create a user profile but also to forecast churn.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Creating (near) real-time alerts**\n", + "When dealing with the velocity, volume and variety of data generated in video\n", + "streaming from millions of concurrent users, dashboard complexity can make it\n", + "harder for human operators in the NOC to focus on the most important data at the\n", + "moment and zero-in on root cause issues. With this solution, you can easily set up\n", + "automated alerts when performance crosses certain thresholds that can help the\n", + "human operators of the network as well as set off automatic remediation protocols\n", + "via a Lambda function. For example:\n", + "\n", + "- If a CDN is having latency much higher than baseline (e.g., if it’s more than 10%\n", + "latency vs. 
baseline average), initiate automatic CDN traffic shifts.\n",
+ "\n",
+ "- If more than [some threshold, e.g., 5%] of clients report playback errors, alert the\n",
+ "product team that there is likely a client issue for a specific device.\n",
+ "\n",
+ "- If viewers on a certain ISP are having higher-than-average buffering and\n",
+ "pixelation issues, alert frontline customer representatives on responses and ways\n",
+ "to decrease issues (e.g., set stream quality lower).\n",
+ "\n",
+ "From a technical perspective, generating real-time alerts requires a streaming\n",
+ "engine capable of processing data in real time and a publish-subscribe service to push\n",
+ "notifications.\n",
+ "\n",
+ "Integrating microservices using Amazon SNS and Amazon SQS\n",
+ "\n",
+ "The QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\n",
+ "by using Amazon SNS and its integrations with Amazon Lambda (see below for the\n",
+ "updates of web applications) or Amazon SQS for other consumers. The [custom foreach writer](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html)\n",
+ "option makes the writing of a pipeline to send email notifications based\n",
+ "on a rule-based engine (e.g., validating the percentage of errors for each individual\n",
+ "type of app over a period of time) really straightforward.\n",
+ "\n",
+ "import boto3\n",
+ "\n",
+ "def send_error_notification(row):\n",
+ "    sns_client = boto3.client('sns', region)\n",
+ "    error_message = 'Number of errors for the App has exceeded the threshold {}'.format(row['percentage'])\n",
+ "    response = sns_client.publish(\n",
+ "        TopicArn =,\n",
+ "        Message = error_message,\n",
+ "        Subject =,\n",
+ "        MessageStructure = 'string')\n",
+ "\n",
+ "# Structured Streaming Job\n",
+ "getKinesisStream('player_events') \\\n",
+ "    .selectExpr('type', 'app_type') \\\n",
+ "    .groupBy('app_type') \\\n",
+ "    .apply(calculate_error_percentage) \\\n",
+ "    .where('percentage > {}'.format(threshold)) \\\n",
+ "    .writeStream \\\n",
+ "    .foreach(send_error_notification) \\\n",
+ "    .start()\n",
+ "\n",
+ "Sending email notifications using AWS SNS\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "On top of the basic email use case, the Demo Player includes three widgets updated\n",
+ "in real time using AWS AppSync: the number of active users, the most popular videos\n",
+ "and the number of users concurrently watching a video.\n",
+ "\n",
+ "Updating the application with the results of real-time aggregations\n",
+ "\n",
+ "The QoS solution applies a similar approach — Structured Streaming and Amazon\n",
+ "SNS — to update all the values, allowing for extra consumers to be plugged in using AWS\n",
+ "SQS. This is a common pattern when huge volumes of events have to be enhanced and\n",
+ "analyzed; pre-aggregate data once and allow each service (consumer) to make its\n",
+ "own decisions downstream.\n",
+ "\n",
+ "**Next steps: machine learning**\n",
+ "Manually making sense of the historical data is important but is also very slow. 
If\n",
+ "we want to be able to make automated decisions in the future, we have to integrate\n",
+ "machine learning algorithms.\n",
+ "\n",
+ "As a Unified Data Platform, Databricks empowers data scientists to build better data\n",
+ "science products using features like Runtime for Machine Learning with built-in\n",
+ "support for [Hyperopt](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/index.html#hyperopt-overview) / [Horovod](https://docs.databricks.com/applications/machine-learning/train-model/distributed-training/horovod-runner.html) / [AutoML](https://databricks.com/product/automl-on-databricks) , or the integration with MLflow, the end-to-end machine learning lifecycle management tool.\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "We have already explored a few important use cases across our customer base while\n",
+ "focusing on the possible extensions to the QoS solution.\n",
+ "\n",
+ "**Point-of-failure prediction and remediation**\n",
+ "As D2C streamers reach more users, the costs of even momentary loss of service\n",
+ "increase. ML can help operators move from reporting to prevention by forecasting\n",
+ "where issues could come up and remediating before anything goes wrong (e.g.,\n",
+ "a spike in concurrent viewers leads to switching CDNs to one with more capacity\n",
+ "automatically).\n",
+ "\n",
+ "**Customer churn**\n",
+ "Critical to growing subscription services is keeping the subscribers you have. By\n",
+ "understanding the quality of service at the individual level, you can add QoS as a\n",
+ "variable in churn and customer lifetime value models. Additionally, you can create\n",
+ "customer cohorts for those who have had video quality issues in order to test\n",
+ "proactive messaging and save offers.\n",
+ "\n",
+ "\n",
+ "**Getting started with the Databricks streaming video**\n",
+ "**QoS solution**\n",
+ "Providing consistent quality in the streaming video experience is table stakes at this\n",
+ "point to keep fickle audiences with ample entertainment options on your platform.\n",
+ "With this solution we have sought to create a quick start for most streaming video\n",
+ "platform environments to embed this QoS real-time streaming analytics solution in\n",
+ "a way that:\n",
+ "1. Scales to any audience size\n",
+ "2. Quickly flags quality performance issues at key parts of the distribution workflow\n",
+ "3. Is flexible and modular enough to easily customize for your audience and your\n",
+ "needs, such as creating new automated alerts or enabling data scientists to test\n",
+ "and roll out predictive analytics and machine learning\n",
+ "\n",
+ "To get started, download the notebooks for the [Databricks streaming video QoS](https://databricks.com/notebooks/QoS/index.html#00.config.html)\n",
+ "[solution](https://databricks.com/notebooks/QoS/index.html#00.config.html) . 
For more guidance on how to unify batch and streaming data into a single\n", + "system, view the [Delta Architecture webinar](https://pages.databricks.com/201908-WB-Delta-Architecture-A-Step-Beyond-Lambda-Architecture_Reg.html) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Customer Use Cases**\n", + "See how customers are using\n", + "Delta Lake to rapidly innovate\n", + "\n", + "## CHAPTER 05\n", + "\n", + "\n", + "-----\n", + "\n", + "**Healthdirect Australia**\n", + "Provides Personalized and Secure Online\n", + "Patient Care With Databricks\n", + "\n", + "As the shepherds of the National Health Services Directory (NHSD), Healthdirect\n", + "is focused on leveraging terabytes of data covering time-driven, activity-based\n", + "healthcare transactions to improve health care services and support. With\n", + "governance requirements, siloed teams and a legacy system that was difficult\n", + "to scale, they moved to Databricks. This boosted data processing for downstream\n", + "machine learning while improving data security to meet HIPAA requirements.\n", + "\n", + "**Spotlight on Healthdirect**\n", + "**Industry:** Healthcare and life sciences\n", + "6x\n", + "Improvement in data processing\n", + "20M\n", + "Records ingested in minutes\n", + "\n", + "**Data quality and governance issues, silos, and the**\n", + "**inability to scale**\n", + "Due to regulatory pressures, Healthdirect Australia set forth to improve overall data\n", + "quality and ensure a level of governance on top of that, but they ran into challenges\n", + "when it came to data storage and access. On top of that, data silos were blocking the\n", + "team from efficiently preparing data for downstream analytics. These disjointed data\n", + "\n", + "\n", + "-----\n", + "\n", + "sources impacted the consistency of data reads, as data was oftentimes out-of-sync\n", + "between the various systems in their stack. The low-quality data also led to higher\n", + "error rates and processing inefficiencies. This fragmented architecture created\n", + "significant operational overhead and limited their ability to have a comprehensive\n", + "view of the patient.\n", + "\n", + "Further, they needed to ingest over 1 billion data points due to a changing landscape\n", + "of customer demand such as bookings, appointments, pricing, eHealth transaction\n", + "activity, etc. — estimated at over 1TB of data.\n", + "\n", + "“We had a lot of data challenges. We just couldn’t process efficiently enough. We\n", + "were starting to get batch overruns. We were starting to see that a 24-hour window\n", + "isn’t the most optimum time in which we want to be able to deliver healthcare data\n", + "and services,” explained Peter James, Chief Architect at Healthdirect Australia.\n", + "\n", + "Ultimately, Healthdirect realized they needed to modernize their end-to-end process\n", + "and tech stack to properly support the business.\n", + "\n", + "**Modernizing analytics with Databricks and Delta Lake**\n", + "Databricks provides Healthdirect Australia with a Unified Data Platform that simplifies\n", + "data engineering and accelerates data science innovation. The notebook environment\n", + "enables them to make content changes in a controlled fashion rather than having to\n", + "run bespoke jobs each time.\n", + "\n", + "“Databricks has provided a big uplift for our teams and our data operations,” said\n", + "James. “The analysts were working directly with the data operations teams. 
They are\n", + "able to achieve the same pieces of work together within the same time frames that\n", + "used to take twice as long. They’re working together, and we’re seeing just a massive\n", + "acceleration in the speed at which we can deliver service.”\n", + "\n", + "\n", + "-----\n", + "\n", + "With Delta Lake, they’ve created logical data zones: Landing, Raw, Staging and Gold.\n", + "Within these zones, they store their data “as is,” in their structured or unstructured\n", + "state, in Delta Lake tables. From there, they use a metadata-driven schema and hold\n", + "the data within a nested structure within that table. What this allows them to do is\n", + "handle data consistently from every source and simplifies the mapping of data to the\n", + "various applications pulling the data.\n", + "\n", + "Meanwhile, through Structured Streaming, they were able to convert all of their\n", + "ETL batch jobs into streaming ETL jobs that could serve multiple applications\n", + "consistently. Overall, the advent of Spark Structured Streaming, Delta Lake and the\n", + "Databricks Unified Data Platform provides significant architectural improvements\n", + "that have boosted performance, reduced operational overheads and increased\n", + "process efficiencies.\n", + "\n", + "\n", + "**Faster data pipelines result in better patient-driven**\n", + "**healthcare**\n", + "As a result of the performance gains delivered by Databricks and the improved data\n", + "reliability through Delta Lake, Healthdirect Australia realized improved accuracy of\n", + "their fuzzy name match algorithm from less than 80% with manual verification to 95%\n", + "and no manual intervention.\n", + "\n", + "The processing improvements with Delta Lake and Structured Streaming allowed\n", + "them to process more than 30,000 automated updates per month. Prior to Databricks,\n", + "they had to use unreliable batch jobs that were highly manual to process the same\n", + "number of updates over a span of 6 months — a 6x improvement in data processing.\n", + "\n", + "“Databricks delivered the time to market as well as the analytics and operational\n", + "uplift that we needed in order to be able to meet the new demands of the\n", + "healthcare sector.”\n", + "\n", + "– Peter James, Chief Architect, Healthdirect Australia\n", + "\n", + "\n", + "-----\n", + "\n", + "They were also able to increase their data load rate to 1 million records per minute,\n", + "loading their entire 20 million record data set in 20 minutes. Before the adoption\n", + "of Databricks, this used to take more than 24 hours to process the same 1 million\n", + "transactions, blocking analysts from making swift decisions to drive results.\n", + "\n", + "Last, data security, which was critical to meet compliance requirements, was greatly\n", + "improved. 
Databricks provides standard security accreditations like HIPAA, and\n", + "Healthdirect was able to use Databricks to meet Australia’s security requirements.\n", + "This yielded significant cost reductions and gave them continuous data assurance\n", + "by monitoring changes to access privileges like changes in roles, metadata-level\n", + "security changes, data leakage, etc.\n", + "\n", + "“Databricks delivered the time to market as well as the analytics and operational\n", + "uplift that we needed in order to be able to meet the new demands of the healthcare\n", + "sector,” said James.\n", + "\n", + "With the help of Databricks, they have proven the value of data and analytics and how\n", + "it can impact their business vision. With transparent access to data that boasts\n", + "well-documented lineage and quality, participation across various business and\n", + "analyst groups has increased — empowering teams to collaborate and more\n", + "easily and quickly extract value from their data with the goal of improving\n", + "healthcare for everyone.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Comcast**\n", + "Uses Delta Lake and MLflow to\n", + "Transform the Viewer Experience\n", + "\n", + "**Spotlight on Comcast**\n", + "**Industry:** Media and entertainment\n", + "10x\n", + "Reduction in overall compute costs to process data\n", + "90%\n", + "Reduction in required DevOps resources to manage infrastructure\n", + "Reduced\n", + "Deployment times from weeks to minutes\n", + "\n", + "As a global technology and media company connecting millions of customers to\n", + "personalized experiences, Comcast struggled with massive data, fragile data pipelines\n", + "\n", + "and poor data science collaboration. With Databricks — leveraging Delta Lake and MLflow\n", + "— they can build performant data pipelines for petabytes of data and easily manage the\n", + "lifecycle of hundreds of models to create a highly innovative, unique and award-winning\n", + "viewer experience using voice recognition and machine learning.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Infrastructure unable to support data and ML needs**\n", + "Instantly answering a customer’s voice request for a particular program while turning\n", + "billions of individual interactions into actionable insights, strained Comcast’s IT\n", + "infrastructure and data analytics and data science teams. To make matters more\n", + "complicated, Comcast needed to deploy models to a disjointed and disparate range\n", + "of environments: cloud, on-premises and even directly to devices in some instances.\n", + "\n", + "- **Massive data:** Billions of events generated by the entertainment system and 20+\n", + "million voice remotes, resulting in petabytes of data that need to be sessionized\n", + "for analysis.\n", + "\n", + "- **Fragile pipelines:** Complicated data pipelines that frequently failed and were\n", + "hard to recover. 
Small files were difficult to manage, slowing data ingestion for\n", + "downstream machine learning.\n", + "\n", + "- **Poor collaboration:** Globally dispersed data scientists working in different\n", + "scripting languages struggled to share and reuse code.\n", + "\n", + "- **Manage management of ML models:** Developing, training and deploying hundreds\n", + "of models was highly manual, slow and hard to replicate, making it difficult to scale.\n", + "\n", + "- **Friction between dev and deployment:** Dev teams wanted to use the latest tools\n", + "and models while ops wanted to deploy on proven infrastructure.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Automated infrastructure, faster data**\n", + "**pipelines with Delta Lake**\n", + "Comcast realized they needed to modernize their entire approach to analytics from\n", + "data ingest to the deployment of machine learning models to delivering new features\n", + "that delight their customers. Today, the Databricks Unified Data Platform enables\n", + "Comcast to build rich data sets and optimize machine learning at scale, streamline\n", + "workflows across teams, foster collaboration, reduce infrastructure complexity, and\n", + "deliver superior customer experiences.\n", + "\n", + "- **Simplified infrastructure management:** Reduced operational costs through\n", + "automated cluster management and cost management features such as\n", + "autoscaling and spot instances.\n", + "\n", + "\n", + "\n", + "- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\n", + "initial processing of the raw telemetry from video and voice applications and devices.\n", + "\n", + "- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\n", + "and reliable ingestion at scale.\n", + "\n", + "- **Collaborative workspaces:** Interactive notebooks improve cross-team\n", + "collaboration and data science creativity, allowing Comcast to greatly accelerate\n", + "model prototyping for faster iteration.\n", + "\n", + "- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\n", + "and model serving via the Kubeflow environment, allowing them to track and\n", + "manage hundreds of models with ease.\n", + "\n", + "- **Reliable ETL at scale:** Delta Lake provides efficient analytics pipelines at scale\n", + "that can reliably join historic and streaming data for richer insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering personalized experiences with ML**\n", + "In the intensely competitive entertainment industry, there is no time to press the\n", + "Pause button. Armed with a unified approach to analytics, Comcast can now fastforward into the future of AI-powered entertainment — keeping viewers engaged and\n", + "delighted with competition-beating customer experiences.\n", + "\n", + "- **Emmy-winning viewer experience:** Databricks helps enable Comcast to create\n", + "a highly innovative and award-winning viewer experience with intelligent voice\n", + "commands that boosts engagement.\n", + "\n", + "- **Reduced compute costs by 10x:** Delta Lake has enabled Comcast to optimize data\n", + "ingestion, replacing 640 machines with 64 while improving performance. 
Teams\n", + "can spend more time on analytics and less time on infrastructure management.\n", + "\n", + "- **Less DevOps:** Reduced the number of DevOps full-time employees required for\n", + "onboarding 200 users from 5 to 0.5.\n", + "\n", + "- **Higher data science productivity:** Fostered collaboration between global data\n", + "scientists by enabling different programming languages through a single\n", + "interactive workspace. Also, Delta Lake has enabled the data team to use data at\n", + "any point within the data pipeline, allowing them to act more quickly in building\n", + "and training new models.\n", + "\n", + "- **Faster model deployment:** Reduced deployment times from weeks to minutes as\n", + "operations teams deployed models on disparate platforms.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Banco Hipotecario**\n", + "Personalizes the Banking\n", + "Experience With Data and ML\n", + "\n", + "Banco Hipotecario — a leading Argentinian commercial bank — is on a mission\n", + "to leverage machine learning to deliver new insights and services that will delight\n", + "customers and create upsell opportunities. With a legacy analytics and data\n", + "warehousing system that was rigid and complex to scale, they turned to Databricks\n", + "to unify data science, engineering and analytics.\n", + "\n", + "As a result of this partnership, they were able to significantly increase customer\n", + "acquisition and cross-sells while lowering the cost for acquisition, greatly impacting\n", + "overall customer retention and profitability.\n", + "\n", + "**Spotlight on Banco Hipotecario**\n", + "**Industry:** Financial services\n", + "35%\n", + "\n", + "Reduction in cost of acquisition\n", + "**Technical use cases:** Ingest and ETL, machine learning and SQL Analytics\n", + "\n", + "\n", + "-----\n", + "\n", + "**Legacy analytics tools are slow, rigid and**\n", + "**impossible to scale**\n", + "Banco Hipotecario set forth to increase customer acquisition by reducing risk and\n", + "improving the customer experience. With data analytics and machine learning\n", + "anchoring their strategy, they hoped to influence a range of use cases from fraud\n", + "detection and risk analysis to serving product recommendations to drive upsell and\n", + "cross-sell opportunities and forecast sales.\n", + "\n", + "Banco Hipotecario faced a number of the challenges that often come along with\n", + "outdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n", + "— the list goes on.\n", + "\n", + "“In order to execute on our data analytics strategy, new technologies were needed\n", + "in order to improve data engineering and boost data science productivity,” said\n", + "Daniel Sanchez, Enterprise Data Architect at Banco Hipotecario. “The first steps we\n", + "took were to move to a cloud-based data lake, which led us to Azure Databricks\n", + "and Delta Lake.”\n", + "\n", + "\n", + "-----\n", + "\n", + "**A unified platform powers the data lake**\n", + "**and easy collaboration**\n", + "Banco Hipotecario turned to Databricks to modernize their data warehouse\n", + "environment, improve cross-team collaboration, and drive data science innovation.\n", + "Fully managed in Microsoft Azure, they were able to easily and reliably ingest massive\n", + "volumes of data, spinning up their whole infrastructure in 90 days. 
With Databricks’\n", + "automated cluster management capabilities, they are able to scale clusters ondemand to support large workloads.\n", + "\n", + "Delta Lake has been especially useful in bringing reliability and performance to Banco\n", + "Hipotecario’s data lake environment. With Delta Lake, they are now able to build\n", + "reliable and performant ETL pipelines like never before.\n", + "\n", + "\n", + "Meanwhile, performing SQL Analytics on Databricks has helped them do data\n", + "exploration, cleansing and generate data sets in order to create models, enabling the\n", + "team to deploy their first model within the first three months, and the second model\n", + "generated was rolled out in just two weeks.\n", + "\n", + "At the same time, data scientists were finally able to collaborate, thanks to interactive\n", + "notebooks; this meant faster builds, training and deployment. And MLflow streamlined\n", + "the ML lifecycle and removed the overreliance on data engineering.\n", + "\n", + "“Databricks gives our data scientists the means to easily create our own experiments\n", + "and deploy them to production in weeks, rather than months,” said Miguel Villalba,\n", + "Head of Data Engineering and Data Science.\n", + "\n", + "\n", + "-----\n", + "\n", + "**An efficient team maximizes customer**\n", + "**acquisition and retention**\n", + "Since moving to Databricks, the data team at Banco Hipotecario could not be happier,\n", + "as Databricks has unified them across functions in an integrated fashion.\n", + "\n", + "The results of data unification and markedly improved collaboration and autonomy\n", + "cannot be overstated. Since deploying Databricks, Banco Hipotecario has increased\n", + "their cross-sell into new products by a whopping 90%, while machine learning has\n", + "reduced the cost of customer acquisition by 35%.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Viacom18**\n", + "Migrates From Hadoop to Databricks to\n", + "Deliver More Engaging Experiences\n", + "\n", + "Viacom18 Media Pvt. Ltd. is one of India’s fastest-growing entertainment networks\n", + "with 40x growth over the past decade. They offer multi-platform, multigenerational\n", + "and multicultural brand experiences to 600+ million monthly viewers.\n", + "\n", + "In order to deliver more engaging experiences for their millions of viewers, Viacom18\n", + "migrated from their Hadoop environment due to its inability to process data at scale\n", + "efficiently. With Databricks, they have streamlined their infrastructure management,\n", + "increased data pipeline speeds and increased productivity among their data teams.\n", + "\n", + "Today, Viacom18 is able to deliver more relevant viewing experiences to their\n", + "subscribers, while identifying opportunities to optimize the business and drive\n", + "greater ROI.\n", + "\n", + "**Spotlight on Viacom18**\n", + "**Industry:** Media and entertainment\n", + "26%\n", + "Increase in operational efficiency lowers overall costs\n", + "\n", + "\n", + "-----\n", + "\n", + "**Growth in subscribers and terabytes of viewing data**\n", + "**push Hadoop to its limits**\n", + "Viacom18, a joint venture between Network18 and ViacomCBS, is focused on\n", + "providing its audiences with highly personalized viewing experiences. The core\n", + "of this strategy requires implementing an enterprise data architecture that enables\n", + "the building of powerful customer analytics on daily viewer data. 
But with millions of\n", + "consumers across India, the sheer amount of data was tough to wrangle: They were\n", + "tasked with ingesting and processing over 45,000 hours of daily content on VOOT\n", + "(Viacom18’s on-demand video subscription platform), which easily generated 700GB\n", + "to 1TB of data per day.\n", + "\n", + "“Content is at the heart of what we do,” explained Parijat Dey, Viacom18’s Assistant\n", + "Vice President of Digital Transformation and Technology. “We deliver personalized\n", + "content recommendations across our audiences around the world based on\n", + "individual viewing history and preferences in order to increase viewership and\n", + "customer loyalty.”\n", + "\n", + "Viacom18’s data lake, which was leveraging on-premises Hadoop for operations,\n", + "wasn’t able to optimally process 90 days of rolling data within their management’s\n", + "defined SLAs, limiting their ability to deliver on their analytics needs, which impacted\n", + "not only the customer experience but also overall costs.\n", + "\n", + "To meet this challenge head-on, Viacom18 needed a modern data warehouse with the\n", + "ability to analyze data trends for a longer period of time instead of daily snapshots. They\n", + "also needed a platform that simplified infrastructure by allowing their team to easily\n", + "provision clusters with features like auto-scaling to help reduce compute costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Rapid data processing for analytics**\n", + "**and ML with Databricks**\n", + "To enable the processing power and data science capabilities they required, Viacom18\n", + "partnered with Celebal Technologies, a premier Salesforce, data analytics and big data\n", + "consulting organization based in India. The team at Celebal leveraged Azure Databricks\n", + "to provide Viacom18 with a unified data platform that modernizes its data warehousing\n", + "capabilities and accelerates data processing at scale.\n", + "\n", + "The ability to cache data within Delta Lake resulted in the much-needed acceleration\n", + "of queries, while cluster management with auto-scaling and the decoupling of\n", + "\n", + "\n", + "storage and compute simplified Viacom18’s infrastructure management and\n", + "optimized operational costs. “Delta Lake has created a streamlined approach to\n", + "the management of data pipelines,” explained Dey. “This has led to a decrease in\n", + "operational costs while speeding up time-to-insight for downstream analytics and\n", + "data science.”\n", + "\n", + "The notebooks feature was an unexpected bonus for Viacom18, as a common workspace\n", + "gave data teams a way to collaborate and increase productivity on everything from\n", + "model training to ad hoc analysis, dashboarding and reporting via PowerBI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Leveraging viewer data to power personalized**\n", + "**viewing experiences**\n", + "Celebal Technologies and Databricks have enabled Viacom18 to deliver innovative\n", + "customer solutions and insights with increased cross-team collaboration and\n", + "productivity. 
With Databricks, Viacom18’s data team is now able to seamlessly\n", + "navigate their data while better serving their customers.\n", + "\n", + "“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data\n", + "and deliver customer behavioral and engagement insights to the analysts and data\n", + "scientists,” said Dey.\n", + "\n", + "In addition to performance gains, the faster query times have also lowered the overall\n", + "cost of ownership, even with daily increases in data volumes. “Azure Databricks has\n", + "greatly streamlined processes and improved productivity by an estimated 26%,”\n", + "concluded Dey.\n", + "\n", + "Overall, Dey cites the migration from Hadoop to Databricks has delivered significant\n", + "business value — reducing the cost of failure, accelerating processing speeds at\n", + "scale, and simplifying ad hoc analysis for easier data exploration and innovations that\n", + "deliver highly engaging customer experiences.\n", + "\n", + "\n", + "-----\n", + "\n", + "# What’s next?\n", + "\n", + "Now that you understand Delta Lake, it may be time to take a look\n", + "at some additional resources.\n", + "\n", + "**Do a deep dive into Delta Lake >**\n", + "\n", + "- [Getting Started With Delta Lake Tech Talk Series](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks)\n", + "\n", + "- [Diving Into Delta Lake Tech Talk Series](https://databricks.com/discover/diving-into-delta-lake-talks)\n", + "\n", + "- [Visit the site](https://databricks.com/product/delta-lake-on-databricks) for additional resources\n", + "\n", + "**[Try Databricks for free >](https://databricks.com/try-databricks)**\n", + "**[Learn more >](https://pages.databricks.com/delta-lake-open-source-reliability-for-data-lakes-reg.html)**\n", + "\n", + "\n", + "-----
**eBook**\n", + "\n", + "## The Data Team’s Guide to the Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Contents\n", + "\n", + "\n", + "**C H A P TE R 1**\n", + "\n", + "**C H A P TE R 2**\n", + "\n", + "**C H A P TE R 3**\n", + "\n", + "**C H A P TE R 4**\n", + "\n", + "**C H A P TE R 5**\n", + "\n", + "**C H A P TE R 6**\n", + "\n", + "**C H A P TE R 7**\n", + "\n", + "**C H A P TE R 8**\n", + "\n", + "**C H A P TE R 9**\n", + "\n", + "**C H A P TE R 10**\n", + "\n", + "**C H A P TE R 11**\n", + "\n", + "**C H A P TE R 12**\n", + "\n", + "\n", + "**The data lakehouse** ...................................................................................................................................................................................... **4**\n", + "\n", + "**The Databricks Lakehouse Platform** .......................................................................................................................... **11**\n", + "\n", + "**Data reliability and performance** ................................................................................................................................... **18**\n", + "\n", + "**Unified governance and sharing for data, analytics and AI** ....................................... **28**\n", + "\n", + "**Security** .............................................................................................................................................................................................................................. **41**\n", + "\n", + "**Instant compute and serverless** ................................................................................................................................... **48**\n", + "\n", + "**Data warehousing** ......................................................................................................................................................................................... **52**\n", + "\n", + "**Data engineering** ............................................................................................................................................................................................. **56**\n", + "\n", + "**Data streaming** .................................................................................................................................................................................................. **68.**\n", + "\n", + "**Data science and machine learning** ........................................................................................................................ **7** **3.**\n", + "\n", + "**Databricks Technology Partners and the modern data stack** ............................ **7** **9.**\n", + "\n", + "**Get started with the Databricks Lakehouse Platform** ....................................................... 
**8** **1**\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "#### The Data Team’s Guide to the Databricks Lakehouse Platform\n", + "\n", + "_The Data Team’s Guide to the Databricks Lakehouse Platform_ is\n", + "designed for data practitioners and leaders who are embarking\n", + "on their journey into the data lakehouse architecture.\n", + "\n", + "In this eBook, you will learn the full capabilities of the data lakehouse architecture\n", + "and how the Databricks Lakehouse Platform helps organizations of all sizes — from\n", + "enterprises to startups in every industry — with all their data, analytics, AI and\n", + "machine learning use cases on one platform.\n", + "\n", + "You will see how the platform combines the best elements of data warehouses\n", + "and data lakes to increase the reliability, performance and scalability of your\n", + "data platform. Discover how the lakehouse simplifies complex workloads in data\n", + "engineering, data warehousing, data streaming, data science and machine learning\n", + "— and bolsters collaboration for your data teams, allowing them to maintain new\n", + "levels of governance, flexibility and agility in an open and multicloud environment.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### The data lakehouse\n", + "# 01\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The evolution of data architectures\n", + "\n", + "\n", + "Data has moved front and center within every organization as data-driven insights\n", + "have fueled innovation, competitive advantage and better customer experiences.\n", + "\n", + "However, as companies place mandates on becoming more data-driven,\n", + "their data teams are left in a sprint to deliver the right data for business\n", + "insights and innovation. With the widespread adoption of cloud, data teams\n", + "often invest in large-scale complex data systems that have capabilities for\n", + "streaming, business intelligence, analytics and machine learning to support\n", + "the overall business objectives.\n", + "\n", + "To support these objectives, data teams have deployed cloud data\n", + "\n", + "warehouses and data lakes.\n", + "\n", + "\n", + "Traditional data systems: The data warehouse and data lake\n", + "\n", + "With the advent of big data, companies began collecting large amounts of\n", + "data from many different sources, such as weblogs, sensor data and images.\n", + "Data warehouses — which have a long history as the foundation for decision\n", + "support and business intelligence applications — cannot handle large volumes\n", + "of data.\n", + "\n", + "While data warehouses are great for structured data and historical analysis,\n", + "they weren’t designed for unstructured data, semi-structured data, and data\n", + "with high variety, velocity and volume, making them unsuitable for many types\n", + "of data.\n", + "\n", + "This led to the introduction of data lakes, providing a single repository of raw\n", + "data in a variety of formats. 
While suitable for storing big data, data lakes do\n", + "not support transactions, nor do they enforce data quality, and their lack of\n", + "consistency/isolation makes it almost impossible to read, write or process data.\n", + "\n", + "For these reasons, many of the promises of data lakes never materialized and,\n", + "in many cases, reduced the benefits of data warehouses.\n", + "\n", + "As companies discovered new use cases for data exploration, predictive modeling\n", + "and prescriptive analytics, the need for a single, flexible, high-performance system\n", + "only grew. Data teams require systems for diverse data applications including SQL\n", + "analytics, real-time analytics, data science and machine learning.\n", + "\n", + "\n", + "-----\n", + "\n", + "To solve for new use cases and new users, a common approach is to use multiple\n", + "systems — a data lake, several data warehouses and other specialized systems\n", + "such as streaming, time-series, graph and image databases. But having multiple\n", + "systems introduces complexity and delay, as data teams invariably need to\n", + "move or copy data between different systems, effectively losing oversight and\n", + "governance over data usage.\n", + "\n", + "\n", + "You have now duplicated data in two different systems and the changes you\n", + "make in one system are unlikely to find their way to the other. So, you are going\n", + "to have data drift almost immediately, not to mention paying to store the same\n", + "data multiple times.\n", + "\n", + "Then, because governance is happening at two distinct levels across these\n", + "platforms, you are not able to control things consistently.\n", + "\n", + "\n", + "**Challenges with data, analytics and AI**\n", + "\n", + "In a recent [Accenture](https://www.accenture.com/_acnmedia/pdf-108/accenture-closing-data-value-gap-fixed.pdf) study, only 32% of companies reported tangible and\n", + "measurable value from data. The challenge is that most companies continue to\n", + "implement two different platforms: data warehouses for BI and data lakes for AI.\n", + "These platforms are incompatible with each other, but data from both systems\n", + "is generally needed to deliver game-changing outcomes, which makes success\n", + "with AI extremely difficult.\n", + "\n", + "Today, most of the data is landing in the data lake, and a lot of it is unstructured.\n", + "In fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321) , about 80% of the data in any organization will be\n", + "unstructured by 2025. But, this data is where much of the value from AI resides.\n", + "Subsets of the data are then copied to the data warehouse into structured\n", + "tables, and back again in some cases.\n", + "\n", + "You also must secure and govern the data in both warehouses and offer\n", + "fine-grained governance, while lakes tend to be coarser grained at the file level.\n", + "Then, you stand up different stacks of tools on these platforms to do either\n", + "BI or AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, the tool stacks on top of these platforms\n", + "are fundamentally different, which makes it difficult\n", + "to get any kind of collaboration going between the\n", + "teams that support them.\n", + "\n", + "This is why AI efforts fail. There is a tremendous\n", + "amount of complexity and rework being introduced\n", + "into the system. 
Time and resources are being\n",
+ "wasted trying to get the right data to the right\n",
+ "people, and everything is happening too slowly\n",
+ "to get in front of the competition.\n",
+ "\n",
+ "\n",
+ "**Realizing this requires two disparate, incompatible data platforms**\n",
+ "\n",
+ "Figure: A data warehouse (structured tables, with governance and security on tables and ACLs) serves business intelligence and SQL analytics, while a separate data lake (unstructured files such as logs, text, images and video, with governance and security on files and blobs) serves data science, ML and data streaming. The result is incomplete support for use cases, incompatible security and governance models, copies of subsets of data, and disjointed, duplicative data silos.\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "**Moving forward with a lakehouse architecture**\n",
+ "\n",
+ "To satisfy the need to support AI and BI directly on vast amounts of data stored\n",
+ "in data lakes (on low-cost cloud storage), a new data management architecture\n",
+ "emerged independently across many organizations and use cases: the\n",
+ "data lakehouse.\n",
+ "\n",
+ "The data lakehouse can store _all_ and _any_ type of data once in a data lake and\n",
+ "make that data accessible directly for AI and BI. The lakehouse paradigm has\n",
+ "specific capabilities to efficiently allow both AI and BI on all the enterprise’s data\n",
+ "at a massive scale. Namely, it has the SQL and performance capabilities such as\n",
+ "indexing, caching and MPP processing to make BI work fast on data lakes. It also\n",
+ "has direct file access and direct native support for Python, data science and AI\n",
+ "frameworks without the need for a separate data warehouse.\n",
+ "\n",
+ "In short, a lakehouse is a data architecture that combines the best elements\n",
+ "of data warehouses and data lakes. Lakehouses are enabled by a new system\n",
+ "design, which implements similar data structures and data management features\n",
+ "found in a data warehouse directly on the low-cost storage used for data lakes.\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "##### Data lakehouse\n",
+ "\n",
+ "One platform to unify all your data, analytics and AI workloads\n",
+ "\n",
+ "###### Lakehouse Platform\n",
+ "\n",
+ "All machine learning, SQL,\n",
+ "BI, and streaming use cases\n",
+ "\n",
+ "One security and governance\n",
+ "approach for all data assets\n",
+ "on all clouds\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "**Key features for a lakehouse**\n",
+ "\n",
+ "Recent innovations with the data lakehouse architecture can help simplify\n",
+ "your data and AI workloads, ease collaboration for data teams, and maintain\n",
+ "the kind of flexibility and openness that allows your organization to stay agile\n",
+ "as you scale. Here are key features to consider when evaluating data lakehouse\n",
+ "architectures:\n",
+ "\n",
+ "Transaction support: In an enterprise lakehouse, many data pipelines will\n",
+ "often be reading and writing data concurrently. 
Support for ACID (Atomicity,\n", + "Consistency, Isolation and Durability) transactions ensures consistency as\n", + "multiple parties concurrently read or write data.\n", + "\n", + "Schema enforcement and governance: The lakehouse should have\n", + "a way to support schema enforcement and evolution, supporting data\n", + "warehouse schema paradigms such as star/snowflake. The system should\n", + "be able to reason about data integrity, and it should have robust governance\n", + "and auditing mechanisms.\n", + "\n", + "Data governance: Capabilities including auditing, retention and lineage\n", + "have become essential, particularly considering recent privacy regulations.\n", + "\n", + "Tools that allow data discovery have become popular, such as data catalogs\n", + "and data usage metrics.\n", + "\n", + "BI support: Lakehouses allow the use of BI tools directly on the source\n", + "data. This reduces staleness and latency, improves recency and lowers cost\n", + "by not having to operationalize two copies of the data in both a data lake\n", + "and a warehouse.\n", + "\n", + "\n", + "Storage decoupled from compute: In practice, this means storage and\n", + "compute use separate clusters, thus these systems can scale to many more\n", + "concurrent users and larger data sizes. Some modern data warehouses also\n", + "have this property.\n", + "\n", + "Openness: The storage formats, such as Apache Parquet, are open and\n", + "standardized, so a variety of tools and engines, including machine learning\n", + "and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "Support for diverse data types (unstructured and structured):\n", + "The lakehouse can be used to store, refine, analyze and access data types\n", + "needed for many new data applications, including images, video, audio,\n", + "semi-structured data and text.\n", + "\n", + "Support for diverse workloads: Use the same data repository for a range\n", + "of workloads including data science, machine learning and SQL analytics.\n", + "Multiple tools might be needed to support all these workloads.\n", + "\n", + "End-to-end streaming: Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "**Learn more**\n", + "\n", + "**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "\n", + "**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "\n", + "**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### The Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Lakehouse: A new generation of open platforms\n", + "\n", + "\n", + "###### This is the lakehouse paradigm\n", + "\n", + "\n", + "Databricks is the inventor and pioneer of the\n", + "data lakehouse architecture. 
The data lakehouse\n",
+ "architecture was coined in the research paper,\n",
+ "[Lakehouse: A New Generation of Open Platforms that](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n",
+ "[Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) ,\n",
+ "introduced by Databricks’ founders, UC Berkeley\n",
+ "and Stanford University at the 11th Conference on\n",
+ "Innovative Data Systems Research (CIDR) in 2021.\n",
+ "\n",
+ "At Databricks, we are continuously innovating on\n",
+ "the lakehouse architecture to help customers deliver\n",
+ "on their data, analytics and AI aspirations. The ideal\n",
+ "data, analytics and AI platform needs to operate\n",
+ "differently. Rather than copying and transforming\n",
+ "data in multiple systems, you need one platform\n",
+ "that accommodates all data types.\n",
+ "\n",
+ "Figure: The lakehouse paradigm. Persona-based use cases (business intelligence, SQL analytics, data science and ML, data streaming) run on one platform that supports all ML, SQL, BI and streaming use cases; one security and governance approach for all data assets on all clouds (Unity Catalog: fine-grained governance for data and AI, spanning files, blobs and table ACLs); and a reliable data platform to efficiently handle all data types (Delta Lake: data reliability and performance).\n",
+ "\n",
+ "Ideally, the platform must be open, so that you\n",
+ "are not locked into any walled gardens. You would\n",
+ "also have one security and governance model.\n",
+ "It would not only manage all data types, but it\n",
+ "would also be cloud-agnostic to govern data\n",
+ "wherever it is stored.\n",
+ "\n",
+ "Last, it would support all major data, analytics and AI\n",
+ "workloads, so that your teams can easily collaborate\n",
+ "and get access to all the data they need to innovate.\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "#### What is the Databricks Lakehouse Platform?\n",
+ "\n",
+ "The Databricks Lakehouse Platform unifies your\n",
+ "data warehousing and AI use cases on a single\n",
+ "platform. It combines the best elements of data\n",
+ "lakes and data warehouses to deliver the reliability,\n",
+ "strong governance and performance of data\n",
+ "warehouses with the openness, flexibility and\n",
+ "machine learning support of data lakes.\n",
+ "\n",
+ "This unified approach simplifies your modern data\n",
+ "stack by eliminating the data silos that traditionally\n",
+ "separate and complicate data engineering, analytics,\n",
+ "BI, data science and machine learning. It’s built\n",
+ "on open source and open standards to maximize\n",
+ "flexibility. And, its common approach to data\n",
+ "management, security and governance helps you\n",
+ "operate more efficiently and innovate faster.\n",
+ "\n",
+ "Figure: The Lakehouse Platform spans four workloads: data warehousing, data engineering, data streaming, and data science and ML.\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "#### Benefits of the Databricks Lakehouse Platform\n",
+ "\n",
+ "\n",
+ "**Simple**\n",
+ "\n",
+ "The unified approach simplifies your data\n",
+ "architecture by eliminating the data silos that\n",
+ "traditionally separate analytics, BI, data science\n",
+ "and machine learning. 
With a lakehouse, you\n", + "can eliminate the complexity and expense that\n", + "make it hard to achieve the full potential of\n", + "your analytics and AI initiatives.\n", + "\n", + "\n", + "**Open**\n", + "\n", + "Delta Lake forms the open foundation of\n", + "the lakehouse by providing reliability and\n", + "performance directly on data in the data\n", + "lake. You’re able to avoid proprietary walled\n", + "gardens, easily share data and build your\n", + "modern data stack with unrestricted access\n", + "to the ecosystem of open source data projects\n", + "and the broad Databricks partner network.\n", + "\n", + "\n", + "**Multicloud**\n", + "\n", + "The Databricks Lakehouse Platform offers\n", + "you a consistent management, security and\n", + "governance experience across all clouds. You\n", + "do not need to invest in reinventing processes\n", + "for every cloud platform that you are using to\n", + "support your data and AI efforts. Instead, your\n", + "data teams can simply focus on putting all\n", + "your data to work to discover new insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The Databricks Lakehouse Platform architecture\n", + "\n", + "**Data reliability and performance for lakehouse**\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format storage layer built for the lakehouse that integrates\n", + "with all major analytics tools and works with the widest variety of formats to\n", + "store and process data.\n", + "\n", + "\n", + "**Instant compute and serverless**\n", + "\n", + "Serverless compute is a fully managed service where Databricks provisions and\n", + "manages the compute layer on behalf of the customer in the Databricks cloud\n", + "account instead of the customer account. As of the current release, serverless\n", + "compute is supported for use with Databricks SQL.\n", + "\n", + "In Chapter 6, we explore the details of instant compute and serverless for lakehouse.\n", + "\n", + "\n", + "[Photon](https://databricks.com/product/photon) is the next-generation query engine built for the lakehouse that leverages\n", + "a state-of-the-art vectorized engine for fast querying and provides the best\n", + "performance for all workloads in the lakehouse.\n", + "\n", + "In Chapter 3, we explore the details of data reliability and performance\n", + "\n", + "for the lakehouse.\n", + "\n", + "**Unified governance and security for lakehouse**\n", + "\n", + "The Databricks Lakehouse Platform provides unified governance with enterprise\n", + "scale, security and compliance. 
The [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (UC) provides\n", + "governance for your data and AI assets in the lakehouse — files, tables,\n", + "dashboards, and machine learning models — giving you much better control,\n", + "management and security across clouds.\n", + "\n", + "[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\n", + "data across the organization in real time, independent of the platform\n", + "on which the data resides.\n", + "\n", + "In Chapter 4, we go into the details of unified governance for lakehouse\n", + "\n", + "and, in Chapter 5, we dive into the details of security for lakehouse.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### The Databricks Lakehouse Platform workloads\n", + "\n", + "The Databricks Lakehouse Platform architecture supports different workloads\n", + "such as data warehousing, data engineering, data streaming, data science and\n", + "machine learning on one simple, open and multicloud data platform.\n", + "\n", + "**Data warehousing**\n", + "\n", + "Data warehousing is one of the most business-critical workloads for data teams,\n", + "and the best data warehouse is a lakehouse. The Databricks Lakehouse Platform\n", + "lets you run all your SQL and BI applications at scale with up to 12x better price/\n", + "performance, a unified governance model, open formats and APIs, and your tools\n", + "of choice — no lock-in. Reduce resource management overhead with serverless\n", + "compute, and easily ingest, transform and query all your data in-place to deliver\n", + "real-time business insights faster.\n", + "\n", + "Built on open standards and APIs, the Databricks Lakehouse Platform provides\n", + "the reliability, quality and performance that data lakes natively lack, plus\n", + "integrations with the ecosystem for maximum flexibility.\n", + "\n", + "In Chapter 7, we go into the details of data warehousing on the lakehouse.\n", + "\n", + "**Data engineering**\n", + "\n", + "Data engineering on the lakehouse allows data teams to unify batch and\n", + "streaming operations on a simplified architecture, streamline data pipeline\n", + "development and testing, build reliable data, analytics and AI workflows on\n", + "any cloud platform, and meet regulatory requirements to maintain governance.\n", + "\n", + "\n", + "automates the complexity of building and maintaining pipelines and running ETL\n", + "workloads so data engineers and analysts can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "In Chapter 8, we go into the details of data engineering on the lakehouse.\n", + "\n", + "**Data streaming**\n", + "\n", + "[Data streaming](https://www.databricks.com/product/data-streaming) is one of the fastest growing workloads within the Databricks\n", + "Lakehouse Platform and is the future of all data processing. 
Real-time processing\n", + "provides the freshest possible data to an organization’s analytics and machine\n", + "learning models enabling them to make better, faster decisions, more accurate\n", + "predictions, offer improved customer experiences and more.\n", + "\n", + "The Databricks Lakehouse Platform Dramatically simplifies data streaming to\n", + "deliver real-time analytics, machine learning and applications on one platform.\n", + "\n", + "In Chapter 9, we go into the details of data streaming on the lakehouse.\n", + "\n", + "**Data science and machine learning**\n", + "\n", + "Data science and machine learning (DSML) on the lakehouse is a powerful\n", + "workload that is unique to many other data offerings. DSML on the lakehouse\n", + "provides a data-native and collaborative solution for the full ML lifecycle. It\n", + "can maximize data and ML team productivity, streamline collaboration, empower\n", + "ML teams to prepare, process and manage data in a self-service manner,\n", + "and standardize the ML lifecycle from experimentation to production.\n", + "\n", + "In Chapter 10, we go into the details of DSML on the lakehouse.\n", + "\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform that\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks Lakehouse Platform and your**\n", + "**modern data stack**\n", + "\n", + "The Databricks Lakehouse Platform is open and provides the flexibility to\n", + "continue using existing infrastructure, to easily share data and build your modern\n", + "data stack with unrestricted access to the ecosystem of open source data\n", + "projects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\n", + "\n", + "In Chapter 11, we go into the details of our technology partners and the\n", + "\n", + "modern data stack.\n", + "\n", + "#### Global adoption of the Databricks Lakehouse Platform\n", + "\n", + "\n", + "Today, Databricks has over 7,000 [customers](https://databricks.com/customers) , from Fortune 500 to unicorns\n", + "across industries doing transformational work. Organizations around the globe\n", + "are driving change and delivering a new generation of data, analytics and AI\n", + "applications. We believe that the unfulfilled promise of data and AI can finally\n", + "be fulfilled with one platform for data analytics, data science and machine\n", + "learning with the Databricks Lakehouse Platform.\n", + "\n", + "\n", + "**Learn more**\n", + "\n", + "[Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)\n", + "\n", + "[Databricks Lakehouse Platform Demo Hub](https://databricks.com/discover/demos)\n", + "\n", + "[Databricks Lakehouse Platform Customer Stories](https://databricks.com/customers)\n", + "\n", + "[Databricks Lakehouse Platform Documentation](https://databricks.com/documentation)\n", + "\n", + "[Databricks Lakehouse Platform Training and Certification](https://databricks.com/learn/training/home)\n", + "\n", + "[Databricks Lakehouse Platform Resources](https://databricks.com/resources)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 03\n", + "\n", + "\n", + "### Data reliability and performance\n", + "\n", + "To bring openness, reliability and lifecycle management to data lakes,\n", + "the Databricks Lakehouse Platform is built on the foundation of Delta\n", + "Lake. 
Delta Lake solves challenges around structured and unstructured data\n", + "ingestion, the application of data quality, difficulties with deleting data for\n", + "compliance, and issues with modifying data for change data capture.\n", + "\n", + "Although data lakes are great solutions for holding large quantities of raw\n", + "data, they lack important attributes for data reliability and quality and\n", + "often don’t offer good performance when compared to data warehouses.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Problems with today’s data lakes\n", + "\n", + "When it comes to data reliability and quality, examples of these\n", + "missing attributes include:\n", + "\n", + "**•** **Lack of ACID transactions:** Makes it impossible to mix updates,\n", + "appends and reads\n", + "\n", + "**•** **Lack of schema enforcement:** Creates inconsistent and low-quality data —\n", + "for example, writes that don’t match a table’s schema are silently accepted\n", + "rather than rejected\n", + "\n", + "**•** **Lack of integration with data catalog:** Results in dark data and no single\n", + "source of truth\n", + "\n", + "Even just the absence of these three attributes can cause a lot of extra work\n", + "for data engineers as they strive to ensure consistent high-quality data in the\n", + "pipelines they create.\n", + "\n", + "\n", + "These challenges are solved with two key technologies that are at the foundation\n", + "of the lakehouse: Delta Lake and Photon.\n", + "\n", + "**What is Delta Lake?**\n", + "\n", + "Delta Lake is a file-based, open source storage format that provides ACID\n", + "transactions and scalable metadata handling, and unifies streaming and batch\n", + "data processing. It runs on top of existing data lakes and is compatible with\n", + "Apache Spark™ and other processing engines.\n", + "\n", + "Delta Lake uses Delta Tables, which are based on Apache Parquet, a commonly\n", + "used format for structured data already utilized by many organizations. Therefore,\n", + "switching existing Parquet tables to Delta Tables is easy and quick. Delta\n", + "Tables can also be used with semi-structured and unstructured data, providing\n", + "versioning, reliability, metadata management, and time travel capabilities that\n", + "make these types of data easily managed as well.\n", + "\n", + "\n", + "As for performance, data lakes use object storage, so data is mostly kept in\n", + "immutable files leading to the following problems:\n", + "\n", + "**•** **Ineffective partitioning:** In many cases, data engineers resort to “poor man’s”\n", + "indexing practices in the form of partitioning that leads to hundreds of dev hours\n", + "spent tuning file sizes to improve read/write performance. Often, partitioning\n", + "proves to be ineffective over time if the wrong field was selected for partitioning\n", + "or due to high cardinality columns.\n", + "\n", + "**•** **Too many small files:** With no support for transactions, appending new data\n", + "takes the form of adding more and more files, leading to “small file problems,”\n", + "a known root cause of query performance degradation.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake features**\n", + "\n", + "\n", + "**ACID guarantees**\n", + "\n", + "Delta Lake ensures that all data changes\n", + "written to storage are committed for durability\n", + "and made visible to readers atomically. 
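As a rough illustration of that guarantee, here is a minimal PySpark sketch, assuming a Spark session with Delta Lake available (for example, a Databricks runtime or a local session configured with the open source `delta-spark` package); the database and table names are illustrative:

```python
from pyspark.sql import SparkSession

# Assumes Delta Lake is available in the session (Databricks Runtime or a local
# Spark session configured with the open source delta-spark package).
spark = SparkSession.builder.getOrCreate()

events = spark.createDataFrame(
    [(1, "click"), (2, "purchase")],
    ["event_id", "event_type"],
)

# Each write is an atomic transaction: readers see the table either before or
# after the append, never a half-written set of files.
events.write.format("delta").mode("append").saveAsTable("demo.events")

# An interrupted write leaves no visible trace, because only fully committed
# versions are added to the transaction log.
spark.read.table("demo.events").show()
```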
In other\n", + "words, no more partial or corrupted files.\n", + "\n", + "**Scalable data and metadata handling**\n", + "\n", + "Since Delta Lake is built on data lakes, all reads\n", + "and writes using Spark or other distributed\n", + "processing engines are inherently scalable to\n", + "petabyte-scale. However, unlike most other\n", + "storage formats and query engines, Delta Lake\n", + "leverages Spark to scale out all the metadata\n", + "processing, thus efficiently handling metadata\n", + "of billions of files for petabyte-scale tables.\n", + "\n", + "\n", + "**Audit history and time travel**\n", + "\n", + "The Delta Lake transaction log records details\n", + "about every change made to data, providing a full\n", + "audit trail of the changes. These data snapshots\n", + "allow developers to access and revert to earlier\n", + "versions of data for audits, rollbacks or to\n", + "reproduce experiments.\n", + "\n", + "**Schema enforcement and schema evolution**\n", + "\n", + "Delta Lake automatically prevents the insertion of\n", + "data with an incorrect schema, i.e., not matching\n", + "the table schema. And when needed, it allows the\n", + "table schema to be explicitly and safely evolved to\n", + "accommodate ever-changing data.\n", + "\n", + "\n", + "**Support for deletes, updates and merges**\n", + "\n", + "Most distributed processing frameworks do not\n", + "support atomic data modification operations on\n", + "data lakes. Delta Lake supports merge, update\n", + "and delete operations to enable complex use\n", + "cases including but not limited to change data\n", + "capture (CDC), slowly changing dimension (SCD)\n", + "operations and streaming upserts.\n", + "\n", + "**Streaming and batch unification**\n", + "\n", + "A Delta Lake table can work both in batch\n", + "and as a streaming source and sink. The\n", + "ability to work across a wide variety of latencies,\n", + "ranging from streaming data ingestion to batch\n", + "historic backfill, to interactive queries all work\n", + "out of the box.\n", + "\n", + "\n", + "-----\n", + "\n", + "**The Delta Lake transaction log**\n", + "\n", + "A key to understanding how Delta Lake provides all these capabilities is the\n", + "transaction log. The Delta Lake transaction log is the common thread that runs\n", + "through many of Delta Lake’s most notable features, including ACID transactions,\n", + "scalable metadata handling, time travel and more. The Delta Lake transaction log\n", + "is an ordered record of every transaction that has ever been performed on\n", + "a Delta Lake table since its inception.\n", + "\n", + "Delta Lake is built on top of Spark to allow multiple readers and writers of a\n", + "given table to work on a table at the same time. To always show users correct\n", + "views of the data, the transaction log serves as a single source of truth: the\n", + "central repository that tracks all changes that users make to the table.\n", + "\n", + "When a user reads a Delta Lake table for the first time or runs a new query on\n", + "an open table that has been modified since the last time it was read, Spark\n", + "checks the transaction log to see what new transactions are posted to the table.\n", + "Then, Spark updates the table with those recent changes. 
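Because every committed change is an entry in that log, the features described above — audit history, time travel and atomic upserts — are available directly from SQL or the DataFrame API. A minimal sketch, assuming the illustrative `demo.events` table from the previous example and an `updates` staging table or view:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Audit history: one row per committed transaction in the Delta log.
spark.sql("DESCRIBE HISTORY demo.events").select(
    "version", "timestamp", "operation"
).show()

# Time travel: query the table as it existed at an earlier version,
# e.g., for audits, rollbacks or reproducing an experiment.
spark.sql("SELECT * FROM demo.events VERSION AS OF 0").show()

# Upserts (MERGE) run as a single atomic transaction against the same log.
spark.sql("""
    MERGE INTO demo.events AS target
    USING updates AS source
    ON target.event_id = source.event_id
    WHEN MATCHED THEN UPDATE SET *
    WHEN NOT MATCHED THEN INSERT *
""")
```

All of these operations append entries to the same transaction log that readers check before each query.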
This ensures that a\n", + "user’s version of a table is always synchronized with the master record as of the\n", + "most recent query, and that users cannot make divergent, conflicting changes\n", + "to a table.\n", + "\n", + "\n", + "**Flexibility and broad industry support**\n", + "\n", + "Delta Lake is an open source project, with an engaged community of\n", + "contributors building and growing the Delta Lake ecosystem atop a set of open\n", + "APIs and is part of the Linux Foundation. With the growing adoption of Delta Lake\n", + "as an open storage standard in different environments and use cases, comes a\n", + "broad set of integration with industry-leading tools, technologies and formats.\n", + "\n", + "Organizations leveraging Delta Lake on the Databricks Lakehouse Platform gain\n", + "flexibility in how they ingest, store and query data. They are not limited in storing\n", + "data in a single cloud provider and can implement a true multicloud approach to\n", + "data storage.\n", + "\n", + "Connectors to tools, such as Fivetran, allow you to leverage Databricks’\n", + "ecosystem of partner solutions, so organizations have full control of building the\n", + "right ingestion pipelines for their use cases. Finally, consuming data via queries\n", + "for exploration or business intelligence (BI) is also flexible and open.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delta Lake integrates with all major analytics tools**\n", + "\n", + "Eliminates unnecessary data movement and duplication\n", + "\n", + "\n", + "-----\n", + "\n", + "In addition to a wide ecosystem of tools and technologies, Delta Lake supports\n", + "a broad set of data formats for structured, semi-structured and unstructured\n", + "data. These formats include image binary data that can be stored in Delta\n", + "Tables, graph data format, geospatial data types and key-value stores.\n", + "\n", + "**Learn more**\n", + "\n", + "[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "[Documentation](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n", + "\n", + "[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n", + "\n", + "\n", + "**What is Photon?**\n", + "\n", + "As many organizations standardize on the lakehouse paradigm, this new\n", + "architecture poses challenges with the underlying query execution engine\n", + "for accessing and processing structured and unstructured data. The execution\n", + "engine needs to provide the performance of a data warehouse and the scalability\n", + "of data lakes.\n", + "\n", + "Photon is the next-generation query engine on the Databricks Lakehouse\n", + "Platform that provides dramatic infrastructure cost savings and speedups for\n", + "all use cases — from data ingestion, ETL, streaming, data science and interactive\n", + "queries — directly on your data lake. Photon is compatible with Spark APIs and\n", + "implements a more general execution framework that allows efficient processing\n", + "of data with support of the Spark API. This means getting started is as easy as\n", + "turning it on — no code change and no lock-in. 
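For example, on Databricks the switch is a cluster-level setting rather than a code change. A hedged sketch using the Databricks Python SDK (`databricks-sdk`); the cluster name, runtime version and node type below are placeholders, and the field names follow the public Clusters API:

```python
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.compute import RuntimeEngine

# Credentials are picked up from the environment or a configuration profile.
w = WorkspaceClient()

# The only Photon-specific choice is the runtime engine; the Spark SQL and
# DataFrame code you run on the cluster stays exactly the same.
cluster = w.clusters.create(
    cluster_name="photon-demo",            # placeholder
    spark_version="14.3.x-scala2.12",      # placeholder DBR version
    node_type_id="i3.xlarge",              # placeholder node type
    num_workers=2,
    runtime_engine=RuntimeEngine.PHOTON,
).result()

print(cluster.cluster_id)
```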
With Photon, typical customers are\n", + "seeing up to 80% TCO savings over traditional Databricks Runtime (Spark) and up\n", + "to 85% reduction in VM compute hours.\n", + "\n", + "_Diagram: the Photon engine accepts both Spark and Photon instructions and includes a Photon writer to Delta/Parquet_\n", + "\n", + "\n", + "-----\n", + "\n", + "Why process queries with Photon?\n", + "\n", + "\n", + "Query performance on Databricks has steadily increased over the years,\n", + "powered by Spark and thousands of optimizations packaged as part of the\n", + "Databricks Runtime (DBR). Photon provides an additional 2x speedup per the\n", + "TPC-DS 1TB benchmark compared to the latest DBR versions.\n", + "\n", + "_Chart: Relative speedup to DBR 2.1 by DBR version, by release date (TPC-DS 1TB, 10 x i3xl); higher is better_\n", + "\n", + "\n", + "**Customers have observed significant speedups using**\n", + "**Photon on workloads such as:**\n", + "\n", + "**•** **SQL-based jobs:** Accelerate large-scale production jobs on\n", + "SQL and Spark DataFrames\n", + "\n", + "**•** **IoT use cases:** Faster time-series analysis using Photon\n", + "compared to Spark and traditional Databricks Runtime\n", + "\n", + "**•** **Data privacy and compliance:** Query petabyte-scale data\n", + "sets to identify and delete records without duplicating data\n", + "with Delta Lake, production jobs and Photon\n", + "\n", + "**•** **Loading data into Delta and Parquet:** Vectorized I/O\n", + "speeds up data loads for Delta and Parquet tables, lowering\n", + "overall runtime and costs of data engineering jobs\n", + "\n", + "\n", + "-----\n", + "\n", + "_Chart: 100TB TPC-DS price/performance (lower is better), comparing Databricks SQL (spot and on-demand) with three cloud data warehouses_\n", + "\n", + "\n", + "Best price/performance for analytics\n", + "in the cloud\n", + "\n", + "Written from the ground up in C++, Photon takes\n", + "advantage of modern hardware for faster queries,\n", + "providing up to 12x better price/performance\n", + "compared to other cloud data warehouses —\n", + "all natively on your data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "Works with your existing code\n", + "and avoids vendor lock-in\n", + "\n", + "Photon is designed to be compatible with the\n", + "Apache Spark DataFrame and SQL APIs to ensure\n", + "workloads run seamlessly without code changes.\n", + "All you do is turn it on. Photon will seamlessly\n", + "coordinate work and resources and transparently\n", + "accelerate portions of your SQL and Spark queries.\n", + "No tuning or user intervention required.\n", + "\n", + "\n", + "_Diagram — Photon in the Databricks Lakehouse Platform (lifecycle of a Photon query): the client submits SQL; the Spark driver (JVM) handles parsing, Catalyst analysis/planning/optimization and scheduling; tasks execute on Spark executors running mixed JVM/native (Photon) code_\n", + "\n", + "\n", + "-----\n", + "\n", + "Optimizing for all data use cases\n", + "and workloads\n", + "\n", + "Photon is the first purpose-built lakehouse engine\n", + "designed to accelerate all data and analytics\n", + "workloads: data ingestion, ETL, streaming, data\n", + "science, and interactive queries. 
While we started\n", + "Photon primarily focused on SQL to provide\n", + "customers with world-class data warehousing\n", + "performance on their data lakes, we’ve significantly\n", + "increased the scope of ingestion sources, formats,\n", + "APIs and methods supported by Photon since\n", + "then. As a result, customers have seen dramatic\n", + "infrastructure cost savings and speedups on\n", + "Photon across all their modern Spark (e.g., Spark\n", + "SQL and DataFrame) workloads.\n", + "\n", + "\n", + "Query optimizer\n", + "\n", + "Native execution engine\n", + "\n", + "Caching\n", + "\n", + "\n", + "_Accelerating all workloads on the lakehouse_\n", + "\n", + "**Learn more**\n", + "\n", + "[Announcing Photon Public Preview: The Next-Generation](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n", + "[Query Engine on the Databricks Lakehouse Platform](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n", + "\n", + "[Databricks Sets Official Data Warehousing Performance Record](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 04\n", + "\n", + "\n", + "### Unified governance and sharing for data, analytics and AI\n", + "\n", + "Today, more and more organizations recognize the importance of making\n", + "high-quality data readily available to data teams to drive actionable insights\n", + "and business value. At the same time, organizations also understand the risks\n", + "of data breaches which negatively impact brand value and inevitably lead to\n", + "erosion of customer trust. Governance is one of the most critical components\n", + "of a lakehouse data platform architecture; it helps ensure that data assets\n", + "are securely managed throughout the enterprise. However, many companies\n", + "are using different incompatible governance models leading to complex and\n", + "expensive solutions.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Key challenges with data and AI governance\n", + "\n", + "**Diversity of data and AI assets**\n", + "\n", + "The increased use of data and the added complexity of the data landscape\n", + "have left organizations with a difficult time managing and governing all types\n", + "of their data-related assets. No longer is data stored in files or tables. Data\n", + "assets today take many forms, including dashboards, machine learning models\n", + "and unstructured data like video and images that legacy data governance\n", + "solutions simply are not built to govern and manage.\n", + "\n", + "\n", + "**Rising multicloud adoption**\n", + "\n", + "More and more organizations now leverage a multicloud strategy to optimize\n", + "costs, avoid vendor lock-in, and meet compliance and privacy regulations. With\n", + "nonstandard, cloud-specific governance models, data governance across clouds\n", + "is complex and requires familiarity with cloud-specific security and governance\n", + "concepts, such as identity and access management (IAM).\n", + "\n", + "**Disjointed tools for data governance on the lakehouse**\n", + "\n", + "Today, data teams must deal with a myriad of fragmented tools and services for\n", + "their data governance requirements, such as data discovery, cataloging, auditing,\n", + "sharing, access controls, etc. 
This inevitably leads to operational inefficiencies\n", + "and poor performance due to multiple integration points and network latency\n", + "between the services.\n", + "\n", + "\n", + "**Two disparate and incompatible data platforms**\n", + "\n", + "Organizations today use two different platforms for their data analytics and\n", + "AI efforts — data warehouses for BI and data lakes for AI. This results in data\n", + "replication across two platforms, presenting a major governance challenge.\n", + "With no unified view of the data landscape, it is difficult to see where data is\n", + "stored, who has access to what data, and consistently define and enforce data\n", + "access policies across the two platforms with different governance models.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### One security and governance approach\n", + "\n", + "Lakehouse systems provide a uniform way to manage access control, data\n", + "quality and compliance across all of an organization’s data using standard\n", + "interfaces similar to those in data warehouses by adding a management\n", + "interface on top of data lake storage.\n", + "\n", + "Modern lakehouse systems support fine-grained (row, column and view level)\n", + "access control via SQL, query auditing, attribute-based access control, data\n", + "versioning and data quality constraints and monitoring. These features are\n", + "generally provided using standard interfaces familiar to database administrators\n", + "(for example, SQL GRANT commands) to allow existing personnel to manage\n", + "all the data in an organization in a uniform way. Centralizing all the data in\n", + "a lakehouse system with a single management interface also reduces the\n", + "administrative burden and potential for error that comes with managing\n", + "multiple separate systems.\n", + "\n", + "\n", + "#### What is Unity Catalog?\n", + "\n", + "Unity Catalog is a unified governance solution for all data, analytics and AI\n", + "assets including files, tables, dashboards and machine learning models in your\n", + "lakehouse on any cloud. Unity Catalog simplifies governance by empowering\n", + "data teams with a common governance model based on ANSI-SQL to define\n", + "and enforce fine-grained access controls. With attribute-based access controls,\n", + "data administrators can enable fine-grained access controls on rows and\n", + "columns using tags (attributes). Built-in data search and discovery allows\n", + "data teams to quickly find and reference relevant data for any use case. 
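To make that ANSI SQL-based model concrete, here is a minimal sketch of Unity Catalog-style grants issued from a notebook; the catalog, schema, table and group names are illustrative:

```python
# `spark` is the session provided in a Databricks notebook.
# Standard SQL GRANT statements define who can see what; names are illustrative.
spark.sql("GRANT USE CATALOG ON CATALOG sales TO `data-analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA sales.reporting TO `data-analysts`")
spark.sql("GRANT SELECT ON TABLE sales.reporting.orders TO `data-analysts`")

# Column-level restriction via a view that exposes only non-sensitive columns
# to the same group.
spark.sql("""
    CREATE VIEW IF NOT EXISTS sales.reporting.orders_public AS
    SELECT order_id, order_date, amount
    FROM sales.reporting.orders
""")
spark.sql("GRANT SELECT ON VIEW sales.reporting.orders_public TO `data-analysts`")
```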
Unity\n", + "Catalog offers automated data lineage for all workloads in SQL, R, Scala and\n", + "Python, to build a better understanding of the data and its flow in the lakehouse.\n", + "Unity Catalog also allows data sharing across or within organizations and\n", + "seamless integrations with your existing data governance tools.\n", + "\n", + "With Unity Catalog, data teams can simplify governance for all data and AI\n", + "assets with one consistent model to discover, access and share data, giving\n", + "you much better native performance, management and security across clouds.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key benefits**\n", + "\n", + "\n", + "The common metadata layer for cross-workspace metadata is at the account\n", + "level and eases collaboration by allowing different workspaces to access Unity\n", + "Catalog metadata through a common interface and break down data silos.\n", + "Further, the data permissions in Unity Catalog are applied to account-level\n", + "identities, rather than identities that are local to a workspace, allowing\n", + "a consistent view of users and groups across all workspaces.\n", + "\n", + "\n", + "Catalog, secure and audit access to all data assets on any cloud\n", + "\n", + "Unity Catalog provides centralized metadata, enabling data teams to create\n", + "a single source of truth for all data assets ranging from files, tables, dashboards\n", + "to machine learning models in one place.\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog offers a unified data access layer that provides a simple and\n", + "streamlined way to define and connect to your data through managed tables,\n", + "external tables, or files, while managing their access controls. Unity Catalog\n", + "centralizes access controls for files, tables and views.\n", + "\n", + "It allows fine-grained access controls for restricting access to certain rows\n", + "and columns to the users and groups who are authorized to query them. With\n", + "Attribute-Based Access Controls (ABAC), you can control access to multiple\n", + "data items at once based on user and data attributes, further simplifying\n", + "governance at scale. For example, you will be able to tag multiple columns\n", + "as personally identifiable information (PII) and manage access to all columns\n", + "tagged as PII in a single rule.\n", + "\n", + "Today, organizations are dealing with an increased burden of regulatory\n", + "compliance, and data access auditing is a critical component to ensure your\n", + "organization is set up for success while meeting compliance requirements.\n", + "Unity Catalog also provides centralized fine-grained auditing by capturing an\n", + "audit log of operations such as create, read, update and delete (CRUD) that have\n", + "been performed against the data. 
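A hedged sketch of consuming that audit trail from a notebook, assuming audit logs are exposed through Unity Catalog system tables — the `system.access.audit` table and the column names below are illustrative and may differ by cloud and release:

```python
# `spark` is the session provided in a Databricks notebook.
# Table and column names are illustrative; verify them against your workspace's
# system table schema before relying on this query.
recent_access = spark.sql("""
    SELECT event_time,
           user_identity.email AS user,
           action_name,
           service_name
    FROM system.access.audit
    WHERE event_date >= current_date() - INTERVAL 7 DAYS
    ORDER BY event_time DESC
    LIMIT 100
""")
recent_access.show(truncate=False)
```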
This allows a fine-grained audit trail showing\n", + "who accessed a given data set and helps you meet your compliance and\n", + "business requirements.\n", + "\n", + "\n", + "-----\n", + "\n", + "Built-in data search and discovery\n", + "\n", + "Data discovery is a critical component to break\n", + "down data silos and democratize data across\n", + "your organization to make data-driven decisions.\n", + "Unity Catalog provides a rich user interface for\n", + "data search and discovery, enabling data teams to\n", + "quickly search relevant data assets across the data\n", + "landscape and reference them for all use cases —\n", + "BI, analytics and machine learning — accelerating\n", + "time-to-value and boosting productivity.\n", + "\n", + "\n", + "-----\n", + "\n", + "Automated data lineage for all workloads\n", + "\n", + "Data lineage describes the transformations and\n", + "refinements of data from source to insight. Lineage\n", + "includes capturing all the relevant metadata and\n", + "events associated with the data in its lifecycle,\n", + "including the source of the data set, what other\n", + "data sets were used to create it, who created it and\n", + "when, what transformations were performed, which\n", + "other data sets leverage it, and many other events\n", + "and attributes. Unity Catalog offers automated data\n", + "lineage down to table and column level, enabling\n", + "data teams to get an end-to-end view of where\n", + "data is coming from, what transformations were\n", + "performed on the data and how data is consumed\n", + "by end applications such as notebooks, workflows,\n", + "dashboards, machine learning models, etc.\n", + "\n", + "With automated data lineage for all workloads —\n", + "SQL, R, Python and Scala, data teams can quickly\n", + "identify and perform root cause analysis of any\n", + "errors in the data pipelines or end applications.\n", + "Second, data teams can perform impact analysis\n", + "to see dependencies of any data changes\n", + "on downstream consumers and notify them\n", + "about the potential impact. Finally, data lineage\n", + "also empowers data teams with increased\n", + "understanding of their data and reduces tribal\n", + "knowledge. Unity Catalog can also capture lineage\n", + "associated with non-data entities, such as notebooks,\n", + "workflows and dashboards. 
Lineage can be\n", + "\n", + "\n", + "_Data lineage with Unity Catalog_\n", + "\n", + "retrieved via REST APIs to support integrations\n", + "with other catalogs.\n", + "\n", + "Integrated with your existing tools\n", + "\n", + "\n", + "**Resources**\n", + "\n", + "[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n", + "\n", + "[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n", + "\n", + "[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)\n", + "\n", + "\n", + "Unity Catalog helps you to future-proof your data\n", + "and AI governance with the flexibility to leverage\n", + "your existing data catalogs and governance\n", + "solutions — Collibra, Alation, Immuta, Privacera,\n", + "Microsoft Purview and AWS Lakeformation.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Open data sharing and collaboration\n", + "\n", + "Data sharing has become important in the digital\n", + "economy as enterprises wish to exchange data\n", + "easily and securely with their customers, partners,\n", + "suppliers and internal lines of business to better\n", + "collaborate and unlock value from that data. But\n", + "to date, a lack of standards-based data sharing\n", + "protocol has resulted in data sharing solutions\n", + "tied to a single vendor or commercial product,\n", + "introducing vendor lock-in risks. What the industry\n", + "deserves is an open approach to data sharing.\n", + "\n", + "**Why data sharing is hard**\n", + "\n", + "Data sharing has evolved from an optional feature\n", + "of a few data platforms to a business necessity\n", + "and success factor for organizations. Our solution\n", + "architects encounter daily the classic scenarios\n", + "of a retailer looking to publish sales data to their\n", + "suppliers in real time or a supplier that wants to\n", + "share real-time inventory.\n", + "\n", + "As a reminder, data sharing recently triggered\n", + "the most impressive scientific development that\n", + "humankind has ever seen. On January 5, 2021, the\n", + "first sample of the genome of the coronavirus was\n", + "\n", + "\n", + "uploaded to the internet. It wasn’t a lung biopsy\n", + "from a patient in Wuhan, but a shared digital\n", + "genomic data set that triggered the development\n", + "of the first batch of COVID vaccines worldwide.\n", + "\n", + "\n", + "treatments, tests and tracking mutations as they\n", + "are passed down through a lineage, a branch of\n", + "the coronavirus family tree. 
The above graphic\n", + "shows such a [publicly shared mutation data set](https://www.ncbi.nlm.nih.gov/genbank/) .\n", + "\n", + "\n", + "Since then, coronavirus experts have daily\n", + "exchanged public data sets, looking for better\n", + "\n", + "\n", + "-----\n", + "\n", + "Sharing data, as well as consuming data from\n", + "external sources, allows you to collaborate with\n", + "partners, establish new partnerships, enable\n", + "research and can generate new revenue streams\n", + "with data monetization.\n", + "\n", + "Despite those promising examples, existing data\n", + "sharing technologies come with several limitations:\n", + "\n", + "**•** Traditional data sharing technologies, such as\n", + "Secure File Transfer Protocol (SFTP), do not\n", + "scale well and only serve files offloaded to a\n", + "server\n", + "\n", + "**•** Cloud object stores operate on an object level\n", + "and are cloud-specific\n", + "\n", + "**•** Commercial data sharing offerings baked into\n", + "vendor products often share tables instead of\n", + "files, but scaling them is expensive and they\n", + "are not open and, therefore, do not permit data\n", + "sharing with a different platform\n", + "\n", + "The following table compares proprietary vendor\n", + "solutions with SFTP, cloud object stores and Delta\n", + "Sharing.\n", + "\n", + "\n", + "\n", + "|Col1|Proprietary vendor solutions|SFTP|Cloud object store|Delta Sharing|\n", + "|---|---|---|---|---|\n", + "|Secure|||||\n", + "|Cheap|||||\n", + "|Vendor agnostic|||||\n", + "|Multicloud|||||\n", + "|Open source|||||\n", + "|Table/DataFrame abstraction|||||\n", + "|Live data|||||\n", + "|Predicate pushdown|||||\n", + "|Object store bandwidth|||||\n", + "|Zero compute cost|||||\n", + "|Scalability|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "**Open source data sharing and Databricks**\n", + "\n", + "To address the limitations of existing data sharing solutions, Databricks developed\n", + "[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\n", + "to the Linux Foundation.\n", + "\n", + "An open source–based solution, such as Delta Sharing, eliminates the lock-in\n", + "of commercial solutions and brings a number of additional benefits such as\n", + "community-developed integrations with popular, open source data processing\n", + "frameworks. In addition, open protocols allow the easy integration of commercial\n", + "clients, such as BI tools.\n", + "\n", + "**What is Databricks Delta Sharing?**\n", + "\n", + "Databricks Delta Sharing provides an open solution to securely share live data\n", + "from your lakehouse to any computing platform. Recipients don’t have to be\n", + "on the Databricks platform or on the same cloud or a cloud at all. Data providers\n", + "can share live data, without replicating or moving it to another system. Recipients\n", + "benefit from always having access to the latest version of data and can quickly\n", + "query shared data using tools of their choice for BI, analytics and machine\n", + "learning, reducing time-to-value. 
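From the recipient's side, consuming a share takes only a few lines with the open source `delta-sharing` Python connector. A sketch assuming the provider has sent a credential (profile) file and that the share, schema and table names below exist:

```python
import delta_sharing

# Path to the credential file the data provider shared; placeholder location.
profile = "/dbfs/FileStore/config.share"

# Discover what has been shared with this recipient.
client = delta_sharing.SharingClient(profile)
print(client.list_all_tables())

# Load one shared table straight into pandas. No copy of the provider's data is
# made; Parquet files are read directly from the provider's cloud storage.
table_url = f"{profile}#retail_share.sales.orders"  # <share>.<schema>.<table>, illustrative
orders = delta_sharing.load_as_pandas(table_url)
print(orders.head())
```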
Data providers can centrally manage, govern,\n", + "audit and track usage of the shared data on one platform.\n", + "\n", + "Unity Catalog natively supports [Delta Sharing](https://databricks.com/product/delta-sharing) , the world’s first open protocol\n", + "for data sharing, enabling organizations to share live, large-scale data without\n", + "replication and make data easily and quickly accessible from tools of your\n", + "choice, with enterprise-grade security.\n", + "\n", + "\n", + "**Key benefits**\n", + "\n", + "Open cross-platform sharing\n", + "\n", + "Easily share existing data in Delta Lake and Apache Parquet formats between\n", + "different vendors. Consumers don’t have to be on the Databricks platform, same\n", + "cloud or a cloud at all. Native integration with Power BI, Tableau, Spark, pandas\n", + "and Java allow recipients to consume shared data directly from the tools of their\n", + "choice. Delta Sharing eliminates the need to set up a new ingestion process to\n", + "consume data. Data recipients can directly access the fresh data and query it\n", + "using tools of their choice. Recipients can also enrich data with data sets from\n", + "popular data providers.\n", + "\n", + "Sharing live data without copying it\n", + "\n", + "Share live ready-to-query data, without replicating or moving it to another system.\n", + "Most enterprise data today is stored in cloud data lakes. Any of the existing data\n", + "sets on the provider’s data lake can easily be shared across clouds, regions or\n", + "data platforms without any data replication or physical movement of data. Data\n", + "providers can update their data sets reliably in real time and provide a fresh and\n", + "consistent view of their data to recipients.\n", + "\n", + "Centralized administration and governance\n", + "\n", + "You can centrally govern, track and audit access to the shared data from a single\n", + "point of enforcement to meet compliance requirements. Detailed user-access\n", + "audit logs are kept to know who is accessing the data and monitor usage of the\n", + "shared data down to table, partition and version level.\n", + "\n", + "\n", + "-----\n", + "\n", + "An open Marketplace for data solutions\n", + "\n", + "The demand for third-party data to make data-driven innovations is greater than ever,\n", + "\n", + "and data marketplaces act as a bridge between data providers and data consumers to\n", + "\n", + "help facilitate the discovery and distribution of data sets.\n", + "\n", + "Databricks Marketplace provides an open marketplace for exchanging data products\n", + "\n", + "such as data sets, notebooks, dashboards and machine learning models. To accelerate\n", + "\n", + "insights, data consumers can discover, evaluate and access more data products from\n", + "\n", + "third-party vendors than ever before. Providers can now commercialize new offerings\n", + "\n", + "and shorten sales cycles by providing value-added services on top of their data.\n", + "\n", + "Databricks Marketplace is powered by Delta Sharing, allowing consumers to access\n", + "\n", + "data products without having to be on the Databricks platform. 
This open approach\n", + "\n", + "allows data providers to broaden their addressable market without forcing consumers\n", + "\n", + "into vendor lock-in.\n", + "\n", + "_Databricks Marketplace_\n", + "\n", + "\n", + "Privacy-safe data cleanrooms\n", + "\n", + "Powered by open source Delta Sharing, the Databricks Lakehouse Platform provides\n", + "\n", + "a flexible data cleanroom solution allowing businesses to easily collaborate with their\n", + "\n", + "customers and partners on any cloud in a privacy-safe way. Participants in the data\n", + "\n", + "cleanrooms can share and join their existing data, and run complex workloads in any\n", + "\n", + "language — Python, R, SQL, Java and Scala — on the data while maintaining data\n", + "\n", + "privacy. Additionally, data cleanroom participants don’t have to do cost-intensive\n", + "\n", + "data replication across clouds or regions with other participants, which simplifies data\n", + "\n", + "operations and reduces cost.\n", + "\n", + "_Data cleanrooms with Databricks Lakehouse Platform_\n", + "\n", + "\n", + "-----\n", + "\n", + "**How it works**\n", + "\n", + "Delta Sharing is designed to be simple, scalable, non-proprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing\n", + "is natively integrated with Unity Catalog, which allows customers to add fine-grained governance and security controls, making it easy and safe to share data internally\n", + "or externally.\n", + "\n", + "Delta Sharing is a simple REST protocol that securely shares access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3,\n", + "Azure ADLS or Google’s GCS — to reliably transfer large data sets. Here’s how it works for data providers and data recipients.\n", + "\n", + "**Data provider** **Data recipient**\n", + "\n", + "Data science And many more On-premises\n", + "\n", + "The data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider\n", + "decides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages access for recipients. To manage\n", + "shares and recipients, you can use SQL commands or the Unity Catalog CLI or the intuitive user interface.\n", + "\n", + "The data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\n", + "Spark, Java and Python, and is working with partners on many more.\n", + "\n", + "\n", + "-----\n", + "\n", + "The Delta Sharing data exchange follows three efficient steps:\n", + "\n", + "1. The recipient’s client authenticates to the sharing server and asks to query\n", + "a specific table. The client can also provide filters on the data (for example,\n", + "“country=US”) as a hint to read just a subset of the data.\n", + "\n", + "2. The server verifies whether the client is allowed to access the data, logs the\n", + "request, and then determines which data to send back. This will be a subset\n", + "of the data objects in cloud storage systems that make up the table.\n", + "\n", + "3. 
To transfer the data, the server generates short-lived presigned URLs that\n", + "allow the client to read these Parquet files directly from the cloud provider,\n", + "so that the transfer can happen in parallel at massive bandwidth, without\n", + "streaming through the sharing server.\n", + "\n", + "**Learn more**\n", + "\n", + "[Try Delta Sharing](https://databricks.com/product/delta-sharing)\n", + "\n", + "[Delta Sharing Demo](https://youtu.be/wRT1Vpbyy88)\n", + "\n", + "[Introducing Delta Sharing: An Open Protocol for Secure Data Sharing](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Introducing Data Cleanrooms for the Lakehouse](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Introducing Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n", + "\n", + "[Delta Sharing ODSC Webinar](https://www.youtube.com/watch?v=YrNHtaWlkM8)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 05\n", + "\n", + "\n", + "### Security\n", + "\n", + "Organizations that operate in multicloud environments need a unified, reliable\n", + "and consistent approach to secure data. We’ve learned from our customers that\n", + "a simple and unified approach to data security for the lakehouse is one of the\n", + "most critical requirements for modern data solutions. Databricks is trusted by\n", + "the world’s largest organizations to provide a powerful lakehouse platform with\n", + "high security and scalability. In fact, thousands of customers trust Databricks\n", + "with their most sensitive data to analyze and build data products using machine\n", + "learning (ML). With significant investment in building a highly secure and scalable\n", + "platform, Databricks delivers end-to-end platform security for data and users.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Platform architecture reduces risk\n", + "\n", + "The Databricks Lakehouse architecture is split into\n", + "two separate planes to simplify your permissions,\n", + "avoid data duplication and reduce risk. The control\n", + "plane is the management plane where Databricks\n", + "runs the workspace application and manages\n", + "notebooks, configuration and clusters. Unless you\n", + "choose to use [serverless compute](https://docs.databricks.com/serverless-compute/index.html) , the data plane\n", + "runs inside your cloud service provider account,\n", + "processing your data without taking it out of your\n", + "account. 
You can embed Databricks in your data\n", + "exfiltration protection architecture using features\n", + "like customer-managed VPCs/VNets and admin\n", + "console options that disable export.\n", + "\n", + "While certain data, such as your notebooks,\n", + "configurations, logs, and user information, is\n", + "present within the control plane, that information\n", + "is encrypted at rest, and communication to and\n", + "from the control plane is encrypted in transit.\n", + "\n", + "\n", + "\n", + "**Users**\n", + "\n", + "**Interactive**\n", + "**users**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Col1|Control pane|Col3|\n", + "|---|---|---|\n", + "||Web application Configurations Notebooks, repos, DBSQL|Cluster Cluste Your cloud s Your cloud s|\n", + "||Cluster manager||\n", + "\n", + "\n", + "You also have choices for where certain data lives:\n", + "You can host your own store of metadata about\n", + "your data tables (Hive metastore), or store query\n", + "\n", + "\n", + "**Data**\n", + "\n", + "\n", + "**DBFS root**\n", + "\n", + "\n", + "results in your cloud service provider account and\n", + "decide whether to use the [Databricks Secrets API.](https://docs.databricks.com/dev-tools/api/latest/secrets.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Step-by-step example\n", + "\n", + "\n", + "\n", + "**Users**\n", + "\n", + "**Interactive**\n", + "**users**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "**DBFS root**\n", + "\n", + "|Col1|ample|Col3|Col4|Col5|\n", + "|---|---|---|---|---|\n", + "||Control pane 1 4||||\n", + "|||Web application Configurations Notebooks, repos, DBSQL Cluster manager|6|Cluster Cluste YYoouurr cclloouudd s|\n", + "||||||\n", + "||||||\n", + "||||||\n", + "||||||\n", + "||||||\n", + "\n", + "\n", + "-----\n", + "\n", + "Suppose you have a data engineer that signs in to Databricks and\n", + "writes a notebook that transforms raw data in Kafka to a normalized\n", + "data set sent to storage such as Amazon S3 or Azure Data Lake\n", + "Storage. Six steps make that happen:\n", + "\n", + "1. The data engineer seamlessly authenticates, via your single sign-on\n", + "if desired, to the Databricks web UI in the control plane, hosted in\n", + "the Databricks account.\n", + "\n", + "2. As the data engineer writes code, their web browser sends it to\n", + "the control plane. JDBC/ODBC requests also follow the same path,\n", + "authenticating with a token.\n", + "\n", + "3. When ready, the control plane uses Cloud Service Provider APIs to\n", + "create a Databricks cluster, made of new instances in the data plane,\n", + "in your CSP account. Administrators can apply cluster policies to\n", + "enforce security profiles.\n", + "\n", + "4. Once the instances launch, the cluster manager sends the data\n", + "engineer’s code to the cluster.\n", + "\n", + "5. The cluster pulls from Kafka in your account, transforms the data\n", + "in your account and writes it to a storage in your account.\n", + "\n", + "6. 
The cluster reports status and any outputs back to the cluster manager.\n", + "\n", + "The data engineer does not need to worry about many of the details —\n", + "simply write the code and Databricks runs it.\n", + "\n", + "\n", + "#### Network and server security\n", + "\n", + "Here is how Databricks interacts with your cloud service provider\n", + "account to manage network and server security\n", + "\n", + "**Networking**\n", + "\n", + "Regardless of where you choose to host the data plane, Databricks networking\n", + "is straightforward. If you host it yourself, Databricks by default will still configure\n", + "networking for you, but you can also control data plane networking with your\n", + "own managed VPC or VNet.\n", + "\n", + "The serverless data plane network infrastructure is managed by Databricks in\n", + "a Databricks cloud service provider account and shared among customers,\n", + "with additional network boundaries between workspaces and between clusters.\n", + "\n", + "Databricks does not rewrite or change your data structure in your storage, nor\n", + "does it change or modify any of your security and governance policies. Local\n", + "firewalls complement security groups and subnet firewall policies to block\n", + "unexpected inbound connections.\n", + "\n", + "Customers at the enterprise tier can also use the IP access list feature on\n", + "the control plane to limit which IP addresses can connect to the web UI or\n", + "REST API — for example, to allow only VPN or office IPs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Servers**\n", + "\n", + "In the data plane, Databricks clusters automatically run the latest hardened\n", + "system image. Users cannot choose older (less secure) images or code. For AWS\n", + "and Azure deployments, images are typically updated every two-to-four weeks.\n", + "GCP is responsible for its system image.\n", + "\n", + "Databricks runs scans for every release, including:\n", + "\n", + "**•** System image scanning for vulnerabilities\n", + "\n", + "**•** Container OS and library scanning\n", + "\n", + "\n", + "**Severity** **Remediation time**\n", + "\n", + "**Critical** **< 14 days**\n", + "\n", + "**High** **< 30 days**\n", + "\n", + "**Medium** **< 60 days**\n", + "\n", + "**Low** **When appropriate**\n", + "\n", + "\n", + "\n", + "**•** Static and dynamic code scanning\n", + "\n", + "**Databricks access**\n", + "\n", + "\n", + "Databricks code is peer reviewed by developers who have security training.\n", + "Significant design documents go through comprehensive security reviews.\n", + "Scans run fully authenticated, with all checks enabled, and issues are\n", + "tracked against the timeline shown in this table.\n", + "\n", + "Note that Databricks clusters are typically short-lived (often terminated\n", + "after a job completes) and do not persist data after they terminate. Clusters\n", + "typically share the same permission level (excluding high concurrency or\n", + "Databricks SQL clusters, where more robust security controls are in place).\n", + "Your code is launched in an unprivileged container to maintain system\n", + "stability. This security design provides protection against persistent attackers\n", + "and privilege escalation.\n", + "\n", + "\n", + "Databricks access to your environment is limited to cloud service provider APIs\n", + "for our automation and support access. Automated access allows the Databricks\n", + "control plane to configure resources in your environment using the cloud service\n", + "provider APIs. 
The specific APIs vary based on the cloud. For instance, an AWS\n", + "cross-account IAM role, or Azure-owned automation or GKE automation do not\n", + "grant access to your data sets (see the next section).\n", + "\n", + "Databricks has a custom-built system that allows staff to fix issues or handle\n", + "support requests — for example, when you open a support request and check the\n", + "box authorizing access to your workspace. Access requires either a support ticket\n", + "or engineering ticket tied expressly to your workspace and is limited to a subset of\n", + "employees and for limited time periods. Additionally, if you have configured audit\n", + "log delivery, the audit logs show the initial access event and the staff’s actions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Identity and access**\n", + "\n", + "Databricks supports robust ACLs and SCIM. AWS customers can configure\n", + "SAML 2.0 and block non-SSO logins. Azure Databricks and Databricks on\n", + "GCP automatically integrate with Azure Active Directory or GCP identity.\n", + "\n", + "Databricks supports a variety of ways to enable users to access their data.\n", + "\n", + "**Examples include:**\n", + "\n", + "**•** The Table ACLs feature uses traditional SQL-based statements to\n", + "manage access to data and enable fine-grained view-based access\n", + "\n", + "**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\n", + "users of that cluster automatically access allowed resources without\n", + "explicit credentials\n", + "\n", + "**•** External storage can be mounted or accessed using a securely\n", + "stored access key\n", + "\n", + "**•** The Secrets API separates credentials from code when accessing\n", + "external resources\n", + "\n", + "\n", + "**Data security**\n", + "\n", + "Databricks provides encryption, isolation and auditing.\n", + "\n", + "**Databricks encryption capabilities are**\n", + "**in place both at rest and in motion**\n", + "\n", + "\n", + "\n", + "|For data-at-rest encryption: • Control plane is encrypted • Data plane supports local encryption • Customers can use encrypted storage buckets • Customers at some tiers can confgi ure customer-managed keys for managed services|For data-in-motion encryption: • Control plane <-> data plane is encrypted • Offers optional intra-cluster encryption • Customer code can be written to avoid unencrypted services (e.g., FTP)|\n", + "|---|---|\n", + "\n", + "\n", + "**Customers can isolate users at multiple levels:**\n", + "\n", + "**•** **Workspace level:** Each team or department can use a separate workspace\n", + "\n", + "**•** **Cluster level:** Cluster ACLs can restrict the users who can attach notebooks\n", + "\n", + "to a given cluster\n", + "\n", + "**•** **High concurrency clusters:** Process isolation, JVM whitelisting and limited\n", + "languages (SQL, Python) allow for the safe coexistence of users of different\n", + "privilege levels, and is used with Table ACLs\n", + "\n", + "**•** **Single-user cluster:** Users can create a private dedicated cluster\n", + "\n", + "Activities of Databricks users are logged and can be delivered automatically to\n", + "a cloud storage bucket. 
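For instance, the Secrets API mentioned above keeps credentials out of notebook code. A minimal sketch from a Databricks notebook, where `dbutils` is provided automatically; the scope, key and external endpoint are illustrative:

```python
import requests

# Secret scope and key names are illustrative; secret values are redacted in
# notebook output and never appear in source control.
api_key = dbutils.secrets.get(scope="prod-credentials", key="payments-api-key")

resp = requests.get(
    "https://api.example.com/v1/status",  # placeholder external service
    headers={"Authorization": f"Bearer {api_key}"},
    timeout=10,
)
print(resp.status_code)
```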
Customers can also monitor provisioning activities by\n", + "monitoring cloud audit logs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Compliance**\n", + "\n", + "**Databricks supports the following compliance standards on**\n", + "\n", + "**our multi-tenant platform:**\n", + "\n", + "**•** **SOC 2 Type II**\n", + "\n", + "**•** **ISO 27001**\n", + "\n", + "**•** **ISO 27017**\n", + "\n", + "**•** **ISO 27018**\n", + "\n", + "Certain clouds support Databricks deployment options for FedRAMP\n", + "High, HITRUST, HIPAA and PCI. Databricks Inc. and the Databricks platform\n", + "are also GDPR and CCPA ready.\n", + "\n", + "**Learn more**\n", + "\n", + "To learn more about Databricks security,\n", + "visit the [Security and Trust Center](https://databricks.com/trust)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 06\n", + "\n", + "\n", + "### Instant compute and serverless\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Benefits of Databricks Serverless SQL\n", + "\n", + "Serverless SQL is much easier to administer with Databricks taking on the\n", + "responsibility of deploying, configuring and managing your cluster VMs. Databricks\n", + "can transfer compute capacity to user queries typically in about 15 seconds — so\n", + "you no longer need to wait for clusters to start up or scale out to run your queries.\n", + "\n", + "Serverless SQL also has built-in connectors to your favorite tools such as Tableau,\n", + "Power BI, Qlik, etc. These connectors use optimized JDBC/ODBC drivers for easy\n", + "authentication support and high performance. And finally, you save on cost\n", + "because you do not need to overprovision or pay for the idle capacity.\n", + "\n", + "\n", + "#### What is serverless compute?\n", + "\n", + "Serverless compute is a fully managed service where Databricks provisions\n", + "and manages the compute layer on behalf of the customer in the Databricks\n", + "cloud account instead of the customer account. As of the current release,\n", + "serverless compute is supported for use with Databricks SQL. This new\n", + "capability for Databricks SQL provides instant compute to users for their\n", + "BI and SQL workloads, with minimal management required and capacity\n", + "optimizations that can lower overall cost by 20%-40% on average. This\n", + "makes it even easier for organizations to expand adoption of the lakehouse\n", + "for business analysts who are looking to access the rich, real-time data sets\n", + "of the lakehouse with a simple and performant solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Inside Serverless SQL**\n", + "\n", + "\n", + "**Databricks Serverless SQL**\n", + "\n", + "**Managed servers**\n", + "\n", + "**Serverless SQL**\n", + "**compute**\n", + "\n", + "**Secure**\n", + "**Instant compute**\n", + "\n", + "\n", + "At the core of Serverless SQL is a compute\n", + "platform that operates a pool of servers located\n", + "in a Databricks’ account, running Kubernetes\n", + "containers that can be assigned to a user\n", + "within seconds.\n", + "\n", + "When many users are running reports or queries\n", + "at the same time, the compute platform adds more\n", + "servers to the cluster (again, within seconds) to\n", + "handle the concurrent load. 
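None of that pooling is visible to the client: connecting to a serverless SQL warehouse from Python looks like connecting to any other SQL endpoint. A sketch using the open source `databricks-sql-connector` package; the hostname, HTTP path and token are placeholders taken from a warehouse's connection details:

```python
from databricks import sql  # pip install databricks-sql-connector

# Placeholders -- copy the real values from the SQL warehouse's
# "Connection details" tab and keep the token in a secret, not in code.
with sql.connect(
    server_hostname="adb-1234567890123456.7.azuredatabricks.net",
    http_path="/sql/1.0/warehouses/abcdef1234567890",
    access_token="dapi-REDACTED",
) as connection:
    with connection.cursor() as cursor:
        cursor.execute("SELECT current_catalog(), current_schema()")
        print(cursor.fetchall())
```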
Databricks manages\n", + "the entire configuration of the server and\n", + "automatically performs the patching and upgrades\n", + "as needed.\n", + "\n", + "Each server is running a secure configuration and\n", + "all processing is secured by three layers of isolation:\n", + "The Kubernetes container hosting the runtime; the\n", + "virtual machine (VM) hosting the container; and\n", + "the virtual network for the workspace. Each layer\n", + "is isolated to one workspace with no sharing or\n", + "cross-network traffic allowed. The containers use\n", + "hardened configurations, VMs are shut down and\n", + "not reused, and network traffic is restricted\n", + "to nodes in the same cluster.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Performance of Serverless SQL\n", + "\n", + "We ran a set of internal tests to compare\n", + "Databricks Serverless SQL to the current\n", + "Databricks SQL and several traditional cloud\n", + "data warehouses. We found Serverless SQL\n", + "to be the most cost-efficient and performant\n", + "environment to run SQL workloads when\n", + "considering cluster startup time, query\n", + "execution time and overall cost.\n", + "\n", + "\n", + "**Databricks Serverless SQL is the highest**\n", + "**performing and most cost-effective solution**\n", + "\n", + "**Cloud SQL solutions compared**\n", + "\n", + "\n", + "**Faster**\n", + "\n", + "**Query**\n", + "**execution**\n", + "**time**\n", + "\n", + "**Slower**\n", + "\n", + "\n", + "**Serverless**\n", + "**SQL**\n", + "\n", + "**CDW1**\n", + "\n", + "**CDW3**\n", + "\n", + "\n", + "**Cost Estimate**\n", + "\n", + "**High**\n", + "\n", + "**Medium**\n", + "\n", + "**Low**\n", + "\n", + "\n", + "**CDW2**\n", + "\n", + "\n", + "**CDW4**\n", + "\n", + "\n", + "**Slower** **Faster**\n", + "**(~5min)** **Startup time** **(~2-3sec)**\n", + "\n", + "**Learn more**\n", + "\n", + "The feature is currently in Public Preview. Sign up to\n", + "[request access to Serverless SQL](https://databricks.com/p/ebook/serverless-sql-preview-sign-up) . To learn more about\n", + "Serverless SQL, visit our [documentation page.](https://docs.databricks.com/serverless-compute/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 07\n", + "\n", + "\n", + "### Data warehousing\n", + "\n", + "Data warehouses are not keeping up with today’s world. The explosion of\n", + "languages other than SQL and unstructured data, machine learning, IoT and\n", + "streaming analytics are forcing organizations to adopt a bifurcated architecture\n", + "of disjointed systems: Data warehouses for BI and data lakes for ML. While SQL\n", + "is ubiquitous and known by millions of professionals, it has never been treated\n", + "as a first-class citizen on data lakes, until the lakehouse.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is data warehousing\n", + "\n", + "The Databricks Lakehouse Platform provides a simplified multicloud and\n", + "serverless architecture for your data warehousing workloads. Data warehousing on\n", + "the lakehouse allows SQL analytics and BI at scale with a common governance\n", + "model. Now you can ingest, transform and query all your data in-place — using\n", + "your SQL and BI tools of choice — to deliver real-time business insights at the\n", + "best price/performance. 
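In practice, 'in-place' means the warehouse tables are simply governed Delta tables in the lakehouse. A minimal sketch of a BI-facing gold aggregate built directly on lakehouse data; the catalog, schema and table names are illustrative:

```python
# `spark` is the session provided in a Databricks notebook; names are illustrative.
spark.sql("""
    CREATE OR REPLACE TABLE sales.gold.daily_revenue AS
    SELECT order_date,
           region,
           SUM(amount) AS revenue,
           COUNT(*)    AS order_count
    FROM sales.silver.orders
    GROUP BY order_date, region
""")

# The same governed table serves SQL warehouses and BI tools; ad hoc checks can
# run right here as well.
spark.sql("""
    SELECT * FROM sales.gold.daily_revenue
    ORDER BY order_date DESC
    LIMIT 10
""").show()
```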
Built on open standards and APIs, the lakehouse\n", + "provides the reliability, quality and performance that data lakes natively lack,\n", + "and integrations with the ecosystem for maximum flexibility — no lock-in.\n", + "\n", + "With data warehousing on the lakehouse, organizations can unify all analytics\n", + "and simplify their architecture to enable their business with real-time business\n", + "insights at the best price/performance.\n", + "\n", + "\n", + "#### Key benefits\n", + "\n", + "**Best price/performance**\n", + "\n", + "Lower costs, get the best price/performance and eliminate\n", + "resource management overhead\n", + "\n", + "On-premises data warehouses have reached their limits — they physically\n", + "cannot scale to handle the growing volumes of data, and don’t provide the\n", + "elasticity customers need to respond to ever-changing business needs.\n", + "Cloud data warehouses are a great alternative to on-premises data\n", + "warehouses, providing greater scale and elasticity, but cloud costs for\n", + "proprietary cloud data warehouses typically yield to an exponential cost\n", + "increase following the growth of data volume.\n", + "\n", + "The Databricks Lakehouse Platform provides instant, elastic SQL serverless\n", + "compute — decoupled from storage on cheap cloud object stores — and\n", + "thousands of performance optimizations that can lower overall infrastructure\n", + "costs by [an average of 40%](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) . Databricks automatically determines instance\n", + "types and configuration for the best price/performance — [up to 12x better](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[than traditional cloud data warehouses](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) — and scale for high concurrency\n", + "use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Built-in governance**\n", + "\n", + "One source of truth and one unified\n", + "governance layer across all data teams\n", + "\n", + "Underpinned by Delta Lake, the Databricks\n", + "Lakehouse Platform simplifies your architecture by\n", + "allowing you to establish one single copy of all your\n", + "data for in-place analytics and ETL/ELT on your\n", + "existing data lakes — no more data movements\n", + "and copies in disjointed systems. Then, seamless\n", + "integration with Databricks Unity Catalog lets you\n", + "easily discover, secure and manage all your data\n", + "with fine-grained governance, data lineage, and\n", + "standard SQL.\n", + "\n", + "**Rich ecosystem**\n", + "\n", + "Ingest, transform and query all your\n", + "data in-place with your favorite tools\n", + "\n", + "Very few tools exist to conduct BI on data lakes.\n", + "Generally, doing so has required data analysts to\n", + "\n", + "submit Spark jobs or use a developer interface.\n", + "While these tools are common for data scientists,\n", + "they require knowledge of languages and\n", + "interfaces that are not traditionally part of a data\n", + "analyst’s tool set. 
As a result, the learning curve for\n", + "an analyst to make use of a data lake is too high\n", + "when well-established tools and methods already\n", + "exist for data warehouses.\n", + "\n", + "\n", + "The Databricks Lakehouse Platform works with\n", + "your preferred tools like dbt, Fivetran, Power BI or\n", + "Tableau, allowing analysts and analytical engineers\n", + "to easily ingest, transform and query the most\n", + "recent and complete data, without having to move\n", + "it into a separate data warehouse. Additionally, it\n", + "empowers every analyst across your organization\n", + "to quickly and collaboratively find and share new\n", + "insights with a built-in SQL editor, visualizations\n", + "and dashboards.\n", + "\n", + "**Break down silos**\n", + "\n", + "Accelerate time from raw to actionable\n", + "data and go effortlessly from BI to ML\n", + "\n", + "\n", + "applications, organizations will need to manage\n", + "an entirely different system than their SQL-only\n", + "data warehouse, slowing down collaboration and\n", + "innovation.\n", + "\n", + "The Databricks Lakehouse Platform provides the\n", + "most complete end-to-end data warehousing\n", + "solution for all your modern analytics needs,\n", + "and more. Now you can empower data teams\n", + "and business users to access the latest data\n", + "faster for downstream real-time analytics and go\n", + "effortlessly from BI to ML. Speed up the time from\n", + "raw to actionable data at any scale — in batch and\n", + "streaming. And go from descriptive to advanced\n", + "analytics effortlessly to uncover new insights.\n", + "\n", + "\n", + "It is challenging for data engineering teams to\n", + "enable analysts at the speed that the business\n", + "requires. Data warehouses need data to be\n", + "ingested and processed ahead of time before\n", + "analysts can access and query it using BI tools.\n", + "Because traditional data warehouses lack\n", + "real-time processing and do not scale well for\n", + "large ETL jobs, they create new data movements\n", + "and bottlenecks for the data engineering team,\n", + "and make it slow for analysts to access the\n", + "latest data. 
And for advanced analytics (ML)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data warehousing on Databricks**\n", + "\n", + "**Truly decoupled, serverless, compute layer**\n", + "\n", + "\n", + "**Data consumers**\n", + "\n", + "\n", + "**Data processing**\n", + "\n", + "**Unity Catalog**\n", + "\n", + "\n", + "**ETL** **ETL**\n", + "\n", + "**Bronze raw** **Silver staging** **Gold DW/marts**\n", + "\n", + "\n", + "**Open storage layer**\n", + "\n", + "**Data ingest**\n", + "\n", + "**Data sources**\n", + "\n", + "\n", + "**Databricks**\n", + "**Partner Connect**\n", + "\n", + "\n", + "**Continuous**\n", + "**ingest**\n", + "\n", + "\n", + "**Batch**\n", + "**ingest**\n", + "\n", + "\n", + "**On-premises**\n", + "\n", + "**OLTP** **OLAP** **Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n", + "\n", + "**DWH**\n", + "\n", + "\n", + "**On-premises**\n", + "\n", + "**Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n", + "\n", + "**DWH**\n", + "\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Try Databricks SQL for free](https://dbricks.co/dbsql)\n", + "\n", + "[Databricks SQL Demo](https://databricks.com/discover/demos/databricks-sql)\n", + "\n", + "[Databricks SQL Data](https://youtu.be/jlEdoVpWwNc)\n", + "[Warehousing Admin Demo](https://youtu.be/jlEdoVpWwNc)\n", + "\n", + "\n", + "[On-demand Webinar: Learn](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n", + "[Databricks SQL From the Experts](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n", + "\n", + "[eBook: Inner Workings of the](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n", + "[Lakehouse for Analytics and BI](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 08\n", + "\n", + "\n", + "### Data engineering\n", + "\n", + "Organizations realize the value data plays as a strategic asset for growing\n", + "revenues, improving the customer experience, operating efficiently or improving\n", + "a product or service. Data is really the driver of all these initiatives. Nowadays,\n", + "data is often streamed and ingested from hundreds of different data sources,\n", + "sometimes acquired from a data exchange, cleaned in various ways with\n", + "different orchestrated steps, versioned and shared for analytics and AI.\n", + "And increasingly, data is being monetized.\n", + "\n", + "Data teams rely on getting the right data at the right time for analytics, data\n", + "science and machine learning, but often are faced with challenges meeting\n", + "the needs of their initiatives for data engineering.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Why data engineering is hard\n", + "\n", + "One of the biggest challenges is accessing and managing the increasingly\n", + "complex data that lives across the organization. Most of the complexity\n", + "arises with the explosion of data volumes and data types, with organizations\n", + "amassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "\n", + "With this volume, managing data pipelines to transform and process data\n", + "is slow and difficult, and increasingly expensive. 
And to top off the complexity,\n", + "most businesses are putting an increased emphasis on multicloud\n", + "environments which can be even more difficult to maintain.\n", + "\n", + "[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\n", + "that data itself has become a product, and the challenging goal of the data\n", + "engineer is to build and run the machinery that creates this high-fidelity\n", + "data product all the way from ingestion to monetization.\n", + "\n", + "\n", + "Despite current technological advances data engineering remains\n", + "difficult for several reasons:\n", + "\n", + "**Complex data ingestion methods**\n", + "\n", + "Data ingestion means retrieving batch and streaming data from various\n", + "sources and in various formats. Ingesting data is hard and complex since you\n", + "either need to use an always-running streaming platform like Apache Kafka\n", + "or you need to be able to keep track of which files haven’t been ingested yet.\n", + "Data engineers are required to spend a lot of time hand-coding repetitive\n", + "and error-prone data ingestion tasks.\n", + "\n", + "**Data engineering principles**\n", + "\n", + "These days, large operations teams are often just a memory of the past.\n", + "Modern data engineering principles are based on agile software development\n", + "methodologies. They apply the well-known “you build it, you run it” paradigm,\n", + "use isolated development and production environments, CI/CD, and version\n", + "control transformations that are pushed to production after validation. Tooling\n", + "needs to support these principles.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Third-party tools**\n", + "\n", + "Data engineers are often required to run additional third-party tools for\n", + "orchestration to automate tasks such as ELT/ETL or customer code in\n", + "notebooks. Running third-party tools increases the operational overhead\n", + "and decreases the reliability of the system.\n", + "\n", + "**Performance tuning**\n", + "\n", + "Finally, with all pipelines and workflows written, data engineers need to\n", + "constantly focus on performance, tuning pipelines and architectures to meet\n", + "SLAs. Tuning such architectures requires in-depth knowledge of the underlying\n", + "architecture and constantly observing throughput parameters.\n", + "\n", + "Most organizations are dealing with a complex landscape of data warehouses\n", + "and data lakes these days. Each of those platforms has its own limitations,\n", + "workloads, development languages and governance model.\n", + "\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. 
The lakehouse platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability\n", + "to drive valuable insights.\n", + "\n", + "Data engineering in the lakehouse allows data teams to unify batch and\n", + "streaming operations on a simplified architecture, streamline data pipeline\n", + "development and testing, build reliable data, analytics and AI workflows\n", + "on any cloud platform, and meet regulatory requirements to maintain\n", + "world-class governance.\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform\n", + "that automates the complexity of building and maintaining pipelines and\n", + "running ETL workloads so data engineers and analysts can focus on quality\n", + "and reliability to drive valuable insights.\n", + "\n", + "\n", + "#### Databricks makes modern data engineering simple\n", + "\n", + "There is no industry-wide definition of modern data engineering.\n", + "This should come close:\n", + "\n", + "_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\n", + "_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\n", + "**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\n", + "_kinds of workflows._\n", + "\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "#### Benefits of data engineering on the lakehouse\n", + "\n", + "By simplifying and modernizing with the lakehouse architecture, data engineers\n", + "gain an enterprise-grade and enterprise-ready approach to building data\n", + "pipelines. The following are eight key differentiating capabilities that a data\n", + "engineering solution team can enable with the Databricks Lakehouse Platform:\n", + "\n", + "**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\n", + "engineers can enable fast, reliable, scalable and automatic data ingestion\n", + "for analytics, data science or machine learning.\n", + "\n", + "\n", + "\n", + "**•** **Data pipeline observability:** Monitor overall data pipeline estate status\n", + "from a dataflow graph dashboard and visually track end-to-end pipeline\n", + "health for performance, quality, status and latency.\n", + "\n", + "**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\n", + "analytics and machine learning use cases by enabling easy and automatic\n", + "data pipeline deployments into production or roll back pipelines and\n", + "minimize downtime.\n", + "\n", + "**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\n", + "of data processing tasks for data and machine learning pipelines with the\n", + "ability to run multiple non-interactive tasks as a directed acyclic graph\n", + "(DAG) on a Databricks compute cluster.\n", + "\n", + "\n", + "\n", + "**•** **Automated ETL pipelines:** Data engineers can reduce development\n", + "time and effort and focus on implementing business logic and data\n", + "quality checks within the data pipeline using SQL or Python.\n", + "\n", + "**•** **Data quality checks:** Improve data reliability throughout the data\n", + "lakehouse so data teams can confidently trust the information for\n", + "downstream initiatives with the ability to define data quality and\n", + "automatically address errors.\n", + "\n", + "**•** **Batch and streaming:** Allow data engineers to set tunable data latency\n", + "with 
cost controls without having to know complex stream processing and implement recovery logic.

**•** **Automatic recovery:** Handle transient errors and use automatic recovery for most common error conditions that can occur during the operation of a pipeline with fast, scalable fault-tolerance.

-----

**Data engineering is all about data quality**

The goal of modern data engineering is to distill data with a quality that is fit for downstream analytics and AI. Within the Lakehouse, data quality is achieved on three different levels.

1. On a **technical level**, data quality is guaranteed by enforcing and evolving schemas for data storage and ingestion.

2. On an **architectural level**, data quality is often achieved by implementing the medallion architecture. A medallion architecture is a data design pattern used to logically organize data in a [lakehouse](https://databricks.com/glossary/data-lakehouse) with the goal of incrementally and progressively improving the structure and quality of data as it flows through each layer of the architecture, e.g., from Bronze to Silver to Gold layer tables.

3. The **Databricks Unity Catalog** comes with robust data quality management with built-in quality controls, testing, monitoring and enforcement to ensure accurate and useful data is available for downstream BI, analytics and machine learning workloads.

[Figure: medallion architecture. Sources such as Kinesis streams, CSV/JSON/TXT files and the data lake feed a Bronze layer (raw ingestion and history), refined into Silver (filtered, cleaned, augmented) and Gold (business-level aggregates) tables that serve streaming analytics, BI and reporting, and data science and ML. Quality increases with each layer.]

-----

#### Data ingestion

With the Databricks Lakehouse Platform, data engineers can build robust hyper-scale ingestion pipelines in streaming and batch mode. They can incrementally process new files as they land on cloud storage — with no need to manage state information — in scheduled or continuous jobs.

Data engineers can efficiently track new files (with the ability to scale to billions of files) without having to list them in a directory. Databricks automatically infers the schema from the source data and evolves it as the data loads into the Delta Lake lakehouse. Efforts continue with enhancing and supporting Auto Loader, our powerful data ingestion tool for the Lakehouse.

**What is Auto Loader?**

Have you ever imagined that ingesting data could become as easy as dropping a file into a folder? Welcome to Databricks Auto Loader.

[Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) is an optimized data ingestion tool that incrementally and efficiently processes new data files as they arrive in the cloud storage built into the Databricks Lakehouse. 
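As an illustrative sketch of that drop-a-file-and-ingest workflow, the snippet below uses Auto Loader's `cloudFiles` source to incrementally load JSON files into a Bronze table. The bucket path, checkpoint location and table name are placeholders, and `spark` is the notebook's SparkSession on Databricks.

```python
# Sketch: incremental file ingestion with Auto Loader (the cloudFiles source).
# Source path, schema/checkpoint locations and target table are placeholders.
(spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/tmp/checkpoints/orders/_schema")
    .load("s3://my-bucket/landing/orders/")
    .writeStream
    .option("checkpointLocation", "/tmp/checkpoints/orders")
    .trigger(once=True)   # "trigger once": process what has arrived, then stop
    .toTable("bronze_orders"))
```

Only files not yet seen by the checkpoint are processed on each run, which is what removes the need to track ingestion state by hand.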
Auto Loader can detect and enforce the\n", + "schema of your data and, therefore, guarantee data quality. New files or\n", + "files that have been changed since the last time new data was processed\n", + "are identified automatically and ingested. Noncompliant data sets are\n", + "quarantined into rescue data columns. You can use the [trigger once]\n", + "option with Auto Loader to turn it into a job that turns itself off.\n", + "\n", + "\n", + "**Ingestion for data analysts: COPY INTO**\n", + "\n", + "Ingestion also got much easier for data analysts and analytics engineers working\n", + "with Databricks SQL. [COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) is a simple SQL command that follows the\n", + "lake-first approach and loads data from a folder location into a Delta Lake table.\n", + "COPY INTO can be scheduled and called by a job repeatedly. When run, only new\n", + "files from the source location will be processed.\n", + "\n", + "#### Data transformation\n", + "\n", + "Turning SQL queries into production ETL pipelines typically involves a lot\n", + "of tedious, complicated operational work. Even at a small scale, the majority\n", + "of a data practitioner’s time is spent on tooling and managing infrastructure.\n", + "\n", + "Although the medallion architecture is an established and reliable pattern\n", + "for improving data quality, the implementation of this pattern is challenging\n", + "for many data engineering teams.\n", + "\n", + "While hand-coding the medallion architecture was hard for data engineers,\n", + "creating data pipelines was outright impossible for data analysts not being\n", + "able to code with Spark Structured Streaming in Scala or Python.\n", + "\n", + "Even at a small scale, most data engineering time is spent on tooling and\n", + "managing infrastructure rather than transformation. Auto-scaling, observability\n", + "and governance are difficult to implement and, as a result, often left out of the\n", + "solution entirely.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### What is Delta Live Tables?\n", + "\n", + "Delta Live Tables (DLT) is the first ETL framework that uses a simple **declarative approach** to building reliable data pipelines. DLT automatically auto-scales your\n", + "infrastructure so data analysts and engineers can spend less time on tooling and focus on getting value from data. Engineers are able to **treat their data as code**\n", + "and apply modern software engineering best practices like testing, error-handling, monitoring and documentation to deploy reliable pipelines at scale. DLT fully supports\n", + "both Python and SQL and is tailored to work with both streaming and batch workloads.\n", + "\n", + "With DLT you write a Delta Live Table in a SQL notebook, create a pipeline under Workflows and simply click [Start].\n", + "\n", + "\n", + "**Write** **create live table**\n", + "\n", + "\n", + "**Create** **a pipeline** **Click** **Start**\n", + "\n", + "Start\n", + "\n", + "\n", + "-----\n", + "\n", + "DLT reduces the implementation time by accelerating development and\n", + "automating complex operational tasks. Since DLT can use plain SQL, it also\n", + "enables data analysts to create production pipelines and turns them into\n", + "the often discussed “analytics engineer.” At runtime, DLT speeds up pipeline\n", + "execution applied with Photon.\n", + "\n", + "Software engineering principles are applied for data engineering to foster the\n", + "idea of treating your data as code. 
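A minimal sketch of such a declarative pipeline in Python is shown below, assuming illustrative table names, source path and quality rule; the pipeline itself would still be created under Workflows and started as described above.

```python
# Sketch: a declarative Delta Live Tables pipeline defined in Python.
# Table names, the source path and the expectation are illustrative.
import dlt
from pyspark.sql.functions import col

@dlt.table(comment="Raw orders ingested incrementally with Auto Loader")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("s3://my-bucket/landing/orders/")
    )

@dlt.table(comment="Cleaned orders ready for analytics")
@dlt.expect_or_drop("valid_amount", "amount > 0")   # expectation: drop bad rows
def orders_silver():
    return (
        dlt.read_stream("orders_bronze")
        .select("order_id", "customer_id", col("amount").cast("double"), "order_date")
    )
```

Because the tables are declared rather than wired together by hand, DLT infers the dependency graph and manages the underlying clusters and retries.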
Your data is the sole source of truth for what is going on inside your business.

Beyond just the transformations, there are many things that should be included in the code that defines your data. Declaratively express entire data flows in SQL or Python. Natively enable modern software engineering best practices like separate development and production environments, the ability to easily test before deploying, deploy and manage environments using parameterization, unit testing and documentation.

DLT also automatically scales compute, providing the option to set the minimum and maximum number of instances and let DLT size up the cluster according to cluster utilization. In addition, tasks like orchestration, error handling and recovery, and performance optimization are all handled automatically.

[Figure: DLT capabilities include dependency management, full refresh, checkpointing and retries, and incremental computation (coming soon).]

-----

Expectations in the code help prevent bad data from flowing into tables, track data quality over time, and provide tools to troubleshoot bad data with granular pipeline observability. This enables a high-fidelity lineage diagram of your pipeline to track dependencies and aggregate data quality metrics across all your pipelines.

Unlike other products that force you to deal with streaming and batch workloads separately, DLT supports any type of data workload with a single API so data engineers and analysts alike can build cloud-scale data pipelines faster without the need for advanced data engineering skills.

#### Data orchestration

The lakehouse makes it much easier for businesses to undertake ambitious data and machine learning (ML) initiatives. However, orchestrating and managing end-to-end production workflows remains a bottleneck for most organizations, relying on external tools or cloud-specific solutions that are not part of their lakehouse platform. Tools that decouple task orchestration from the underlying data processing platform reduce the overall reliability of their production workloads, limit observability, and increase complexity for end users.

#### What is Databricks Workflows?

[Databricks Workflows](https://databricks.com/product/workflows) is the first fully managed and integrated lakehouse [orchestration](https://databricks.com/glossary/orchestration) service that allows data teams to build reliable workflows on any cloud.

Workflows lets you orchestrate data flow pipelines (written in DLT or dbt), as well as machine learning pipelines, or any other tasks such as notebooks or Python wheels. Since Databricks Workflows is fully managed, it eliminates operational overhead for data engineers, enabling them to focus on their workflows rather than on managing infrastructure. 
It provides an easy point-and-click\n", + "authoring experience for all your data teams, not just those with specialized skills.\n", + "Deep integration with the underlying lakehouse platform ensures you will create\n", + "and run reliable production workloads on any cloud while providing deep and\n", + "centralized monitoring with simplicity for end users.\n", + "\n", + "Sharing job clusters over multiple tasks reduces the time a job takes, reduces\n", + "costs by eliminating overhead and increases cluster utilization with parallel tasks.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks Workflows’ deep integration with the lakehouse can best be seen with its monitoring and observability features. The matrix view in the following graphic\n", + "shows a history of runs for a job. Failed tasks are marked in red. A failed job can be repaired and rerun with the click of a button. Rerunning a failed task detects and\n", + "triggers the execution of all dependent tasks.\n", + "\n", + "You can create workflows with the UI, but also through the Databricks Workflows API, or with external orchestrators such as Apache Airflow. Even if you are using an\n", + "\n", + "external orchestrator, Databricks Workflows’ monitoring acts as a single pane of glass that includes externally triggered workflows.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Orchestrate anything\n", + "\n", + "Remember that DLT is one of many task types for Databricks Workflows.\n", + "This is where the managed data flow pipelines with DLT tie together with\n", + "the easy point-and-click authoring experience of Databricks Workflows.\n", + "\n", + "In the following example, you can see an end-to-end workflow built with\n", + "customers in a workshop: Data is streamed from Twitter according to search\n", + "terms, then ingested with Auto Loader using automatic schema detection and\n", + "enforcement. In the next step, the data is cleaned and transformed with Delta\n", + "Live table pipelines written in SQL, and finally run through a pre-trained BERT\n", + "language model from Hugging Face for sentiment analysis of the tweets.\n", + "Different task types for ingest, cleanse/transform and ML are combined\n", + "in a single workflow.\n", + "\n", + "Using Workflows, these tasks can be scheduled to provide a daily overview of\n", + "social media coverage and customer sentiment for a business. 
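As a hedged sketch of the API route, the snippet below creates a two-task job with a shared job cluster and a daily schedule through the Jobs 2.1 `jobs/create` endpoint. The workspace URL, token, notebook paths and cluster settings are placeholders.

```python
# Sketch: creating a multi-task job through the Jobs 2.1 REST API.
# Workspace URL, token, notebook paths and cluster settings are placeholders.
import requests

job_spec = {
    "name": "social-sentiment-daily",
    "tasks": [
        {
            "task_key": "ingest",
            "notebook_task": {"notebook_path": "/Repos/demo/ingest_tweets"},
            "job_cluster_key": "shared_cluster",
        },
        {
            "task_key": "score_sentiment",
            "depends_on": [{"task_key": "ingest"}],   # runs only after ingest succeeds
            "notebook_task": {"notebook_path": "/Repos/demo/clean_and_score"},
            "job_cluster_key": "shared_cluster",
        },
    ],
    "job_clusters": [
        {
            "job_cluster_key": "shared_cluster",   # one cluster shared by both tasks
            "new_cluster": {
                "spark_version": "11.3.x-scala2.12",
                "node_type_id": "i3.xlarge",
                "num_workers": 2,
            },
        }
    ],
    "schedule": {"quartz_cron_expression": "0 0 6 * * ?", "timezone_id": "UTC"},
}

resp = requests.post(
    "https://<workspace-host>/api/2.1/jobs/create",
    headers={"Authorization": "Bearer <personal-access-token>"},
    json=job_spec,
)
resp.raise_for_status()
print("Created job:", resp.json()["job_id"])
```

Defining one `job_cluster_key` and referencing it from both tasks is what lets the tasks share a single job cluster, as described above.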
After streaming\n", + "tweets with filtering for keywords such as “data engineering,” “lakehouse” and\n", + "“Delta Lake,” we curated a list of those tweets that were classified as positive\n", + "with the highest probability score.\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Data Engineering on the](https://databricks.com/solutions/data-pipelines)\n", + "[Lakehouse](https://databricks.com/solutions/data-pipelines)\n", + "\n", + "\n", + "[Delta Live Tables](https://databricks.com/product/delta-live-tables)\n", + "\n", + "[Databricks Workflows](https://www.databricks.com/product/workflows)\n", + "\n", + "\n", + "[Big Book of Data Engineering](https://databricks.com/p/ebook/the-big-book-of-data-engineering?itm_data=datapipelines-promo-bigbookofde)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Data streaming\n", + "# 09\n", + "\n", + "\n", + "**CHAPTER**\n", + "\n", + "\n", + "There are two types of data processing: batch processing\n", + "and streaming processing.\n", + "\n", + "\n", + "Batch processing refers to the discontinuous, periodic processing\n", + "of data that has been stored for a period of time. For example,\n", + "an organization may need to run weekly reports on a set of\n", + "predictable transaction data. There is no need for this data\n", + "to be streaming — it can be processed on a weekly basis.\n", + "\n", + "Streaming processing, on the other hand, refers to unbounded\n", + "processing of data as it arrives.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data Streaming Challenges**\n", + "\n", + "However, getting value from streaming data can be a tricky practice. While most\n", + "data today can be considered streaming data, organizations are overwhelmed by\n", + "the need to access, process and analyze the volume, speed and variety of this\n", + "data moving through their platforms. To keep pace with innovation, they must\n", + "quickly make sense of data streams decisively, consistently and in real time.\n", + "\n", + "Three common technical challenges organizations experience\n", + "with implementing real-time data streaming include:\n", + "\n", + "**•** **Specialized APIs and language skills:** Data practitioners encounter\n", + "barriers to adopting streaming skillsets because there are new languages,\n", + "APIs and tools to learn.\n", + "\n", + "**•** **Operational complexity:** To implement data streaming at scale, data\n", + "teams need to integrate and manage streaming-specific tools with\n", + "their other cloud services. They also have to manually build complex\n", + "operational tooling to help these systems recover from failure, restart\n", + "workloads without reprocessing data, optimize performance, scale the\n", + "underlying infrastructure, and so on.\n", + "\n", + "**•** **Incompatible governance models:** Different governance and security\n", + "models across real-time and historical data platforms makes it difficult\n", + "to provide the right access to the right users, see the end-to-end data\n", + "lineage, and/or meet compliance requirements.\n", + "\n", + "\n", + "In a wide variety of cases, an organization might find it useful to\n", + "leverage streaming data. 
Here are some common examples:\n", + "\n", + "**•** **Retail:** Real-time inventory updates help support business activities, such\n", + "as inventory and pricing optimization and optimization of the supply chain,\n", + "logistics and just-in-time delivery.\n", + "\n", + "**•** **Smart energy:** Smart meter monitoring in real time allows for smart\n", + "electricity pricing models and connection with renewable energy sources\n", + "to optimize power generation and distribution.\n", + "\n", + "**•** **Preventative maintenance:** By reducing unplanned outages and\n", + "unnecessary site and maintenance visits, real-time streaming analytics can\n", + "lower operational and equipment costs.\n", + "\n", + "**•** **Industrial automation:** Manufacturers can use streaming and predictive\n", + "analytics to improve production processes and product quality, including\n", + "setting up automated alerts.\n", + "\n", + "**•** **Healthcare:** To optimize care recommendations, real-time data allows\n", + "for the integration of various smart sensors to monitor patient condition,\n", + "medication levels and even recovery speed.\n", + "\n", + "**•** **Financial institutions:** Firms can conduct real-time analysis of\n", + "\n", + "transactions to detect fraudulent transactions and send alerts. They\n", + "can use fraud analytics to identify patterns and feed data into machine\n", + "learning algorithms.\n", + "\n", + "\n", + "Regardless of specific use cases, the central tenet of streaming data is that it\n", + "gives organizations the opportunity to leverage the freshest possible insights for\n", + "better decision-making and more optimized customer experiences.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data streaming architecture**\n", + "\n", + "Before addressing these challenges head-on, it may help to take a step back and\n", + "discuss the ingredients of a streaming data pipeline. Then, we will explain how\n", + "the Databricks Lakehouse Platform operates within this context to address the\n", + "aforementioned challenges.\n", + "\n", + "Every application of streaming data requires a pipeline that brings the data from\n", + "its origin point — whether sensors, IoT devices or database transactions — to its\n", + "final destination.\n", + "\n", + "In building this pipeline, streaming architectures typically employ two layers.\n", + "First, streaming capture systems **capture** and temporarily store streaming data\n", + "for processing. Sometimes these systems are also called messaging systems\n", + "or messaging buses. These systems are optimized for small payloads and high\n", + "frequency inputs/outputs. Second, streaming **processing** systems continuously\n", + "process data from streaming capture systems and other storage systems.\n", + "\n", + "**Capturing** **Processing**\n", + "\n", + "\n", + "It may help to think of a simplified streaming pipeline\n", + "according to the following seven phases:\n", + "\n", + "1. Data is continuously generated at origin points\n", + "\n", + "2. The generated data is captured from those origin points by\n", + "a capture system like Apache Kafka (with limited retention)\n", + "\n", + "**3. The captured data is extracted and incrementally ingested to**\n", + "**a processing platform like Databricks; data is ingested exactly**\n", + "**once and stored permanently, even if this step is rerun**\n", + "\n", + "**4. The ingested data is converted into a workable format**\n", + "\n", + "**5. 
The formatted data is cleansed, transformed and joined in**\n", + "**a number of pipeline steps**\n", + "\n", + "**6. The transformed data is processed downstream through**\n", + "**analysis or ML modeling**\n", + "\n", + "7. The resulting analysis or model is used for some sort of practical\n", + "application, which may be anything from basic reporting to an\n", + "event-driven software application\n", + "\n", + "You will notice four of the steps in this list are in boldface. This is because the\n", + "lakehouse architecture is specifically designed to optimize this part of the\n", + "pipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\n", + "analyze and model on streaming data _alongside_ batch-processed data. It can\n", + "accommodate both structured _and_ unstructured data. It is here that the value\n", + "of unifying the best pieces of data lakes and data warehouses really shines for\n", + "complex enterprise use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data Streaming on the Lakehouse**\n", + "\n", + "Now let’s zoom in a bit and see how the Databricks Lakehouse\n", + "Platform addresses each part of the pipeline mentioned above.\n", + "\n", + "**Streaming data ingestion and transformation** begins with continuously\n", + "and incrementally collecting raw data from streaming sources through a\n", + "feature called Auto Loader. Once the data is ingested, it can be transformed\n", + "from raw, messy data into clean, fresh, reliable data appropriate for downstream\n", + "analytics, ML or applications. [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) makes it easy to build and\n", + "manage these data pipelines while automatically taking care of infrastructure\n", + "management and scaling, data quality, error testing and other administrative\n", + "tasks. DLT is a high-level abstraction built on Spark Structured Streaming,\n", + "a scalable and fault-tolerant stream processing engine.\n", + "\n", + "**[Real-time analytics](https://www.databricks.com/product/databricks-sql)** refers to the downstream analytical application\n", + "of streaming data. With fresher data streaming into SQL analytics or BI\n", + "reporting, more actionable insights can be achieved, resulting in better\n", + "business outcomes.\n", + "\n", + "**[Real-time ML](https://www.databricks.com/product/machine-learning)** involves deploying ML models in a streaming mode. This\n", + "deployment is supported with structured streaming for continuous inference\n", + "from a live data stream. Like real-time analytics, real-time ML is a downstream\n", + "impact of streaming data, but for different business use cases (i.e., AI instead\n", + "of BI). Real-time modeling has many benefits, including more accurate\n", + "predictions about the future.\n", + "\n", + "\n", + "**Real-time applications** process data directly from streaming pipelines and\n", + "trigger programmatic actions, such as displaying a relevant ad, updating the\n", + "price on a pricing page, stopping a fraudulent transaction, etc. 
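To illustrate the continuous-inference pattern described above, the sketch below scores a live stream of transactions with a registered MLflow model and writes high-risk events to an alerts table. The model name, table names and threshold are illustrative, and `spark` is the active SparkSession.

```python
# Sketch: continuous inference on a live stream with Structured Streaming.
# Model URI, table names and the 0.9 threshold are illustrative placeholders.
import mlflow.pyfunc
from pyspark.sql.functions import col, struct

# Wrap a registered MLflow model as a Spark UDF for streaming scoring.
score = mlflow.pyfunc.spark_udf(spark, model_uri="models:/fraud_detector/Production")

transactions = spark.readStream.table("silver_transactions")

scored = transactions.withColumn("fraud_score", score(struct(*transactions.columns)))

(scored.filter(col("fraud_score") > 0.9)   # keep only high-risk events for downstream action
    .writeStream
    .option("checkpointLocation", "/tmp/checkpoints/fraud_alerts")
    .toTable("fraud_alerts"))
```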
There typically\n", + "is no human-in-the-loop for such applications.\n", + "\n", + "\n", + "Data in cloud storage and message stores\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks Lakehouse Platform differentiators**\n", + "\n", + "Understanding what the lakehouse architecture provides is one\n", + "\n", + "thing, but it is useful to understand how Databricks uniquely\n", + "\n", + "approaches the common challenges mentioned earlier around\n", + "\n", + "working with streaming data.\n", + "\n", + "**Databricks empowers unified data teams.** Data engineers, data scientists\n", + "and analysts can easily build streaming data workloads with the languages\n", + "and tools they already know and the APIs they already use.\n", + "\n", + "**Databricks simplifies development and operations.** Organizations can\n", + "focus on getting value from data by reducing complexity and automating\n", + "much of the production aspects associated with building and maintaining\n", + "real-time data workloads.\n", + "\n", + "\n", + "See why customers love streaming on the Databricks\n", + "Lakehouse Platform with these resources.\n", + "\n", + "**Learn more**\n", + "\n", + "[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n", + "\n", + "[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n", + "[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n", + "\n", + "[Structured Streaming Documentation](https://docs.databricks.com/spark/latest/structured-streaming/index.html)\n", + "\n", + "[Streaming — Getting Started With Apache Spark on Databricks](https://databricks.com/spark/getting-started-with-apache-spark/streaming)\n", + "\n", + "\n", + "**Databricks is one platform for streaming and batch data.** Organizations\n", + "can eliminate data silos, centralize security and governance models, and\n", + "provide complete support for all their real-time use cases under one roof —\n", + "the roof of the lakehouse.\n", + "\n", + "Finally — and perhaps most important — Delta Lake, the core of the [Databricks](https://www.databricks.com/product/data-lakehouse)\n", + "\n", + "[Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , was built for streaming from the ground up. Delta Lake is\n", + "deeply integrated with Spark Structured Streaming and overcomes many of\n", + "the limitations typically associated with streaming systems and files.\n", + "\n", + "In summary, the Databricks Lakehouse Platform dramatically simplifies data\n", + "streaming to deliver real-time analytics, machine learning and applications on\n", + "one platform. And, that platform is built on a foundation with streaming at its\n", + "core. This means organizations of all sizes can use their data in motion and\n", + "make more informed decisions faster than ever.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Data science and machine learning\n", + "# 10\n", + "\n", + "\n", + "**CHAPTER**\n", + "\n", + "\n", + "While most companies are aware of the potential benefits of applying\n", + "machine learning and AI, realizing these potentials can often be quite\n", + "challenging for those brave enough to take the leap. 
Some of the\n", + "largest hurdles come from siloed/disparate data systems, complex\n", + "experimentation environments, and getting models served in a\n", + "production setting.\n", + "\n", + "\n", + "Fortunately, the Databricks Lakehouse Platform provides a helping\n", + "hand and lets you use data to derive innovative insights, build\n", + "powerful predictive models, and enable data scientists, ML engineers,\n", + "and developers of all kinds to create within the space of machine\n", + "learning and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Databricks Machine Learning\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Exploratory data analysis\n", + "\n", + "With all the data in one place, data is easily\n", + "explored and visualized from within the\n", + "notebook-style experience that provides support\n", + "for various languages (R, SQL, Python and Scala)\n", + "as well as built-in visualizations and dashboards.\n", + "Confidently and securely share code with\n", + "co-authoring, commenting, automatic versioning,\n", + "Git integrations and role-based access controls.\n", + "The platform provides laptop-like simplicity at\n", + "production-ready scale.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Model creation and management\n", + "\n", + "From data ingestion to model training and tuning, all the way through to\n", + "production model serving and versioning, the Lakehouse brings the tools\n", + "needed to simplify those tasks.\n", + "\n", + "Get right into experimenting with the Databricks ML runtimes, optimized and\n", + "preconfigured to include most popular libraries like scikit-learn, XGBoost and\n", + "more. Massively scale thanks to built-in support for distributed training and\n", + "hardware acceleration with GPUs.\n", + "\n", + "From within the runtimes, you can track model training sessions, package and\n", + "reuse models easily with [MLflow](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) , an open source machine learning platform\n", + "created by Databricks and included as a managed service within the Lakehouse.\n", + "It provides a centralized location from which to manage models and package\n", + "code in an easily reusable way.\n", + "\n", + "Training these models often involves the use of features housed in a centralized\n", + "feature store. Fortunately, Databricks has a built-in feature store that allows you\n", + "to create new features, explore and re-use existing features, select features for\n", + "training and scoring machine learning models, and publish features to low-latency\n", + "online stores for real-time inference.\n", + "\n", + "If you are looking to get a head start, [AutoML](https://databricks.com/blog/2022/04/18/supercharge-your-machine-learning-projects-with-databricks-automl-now-generally-available.html) allows for low to no-code\n", + "experimentation by pointing to your data set and automatically training models\n", + "and tuning hyperparameters to save both novice and advanced users precious\n", + "time in the machine learning process.\n", + "\n", + "\n", + "AutoML will also report back metrics related to the model training results as well\n", + "as the code needed to repeat the training already custom-tailored to your data\n", + "set. 
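A minimal sketch of MLflow experiment tracking is shown below, using a public scikit-learn data set and an arbitrary model purely for illustration.

```python
# Sketch: tracking a training run and logging the model with MLflow.
# The data set, features and model choice are illustrative only.
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_absolute_error
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run(run_name="rf_baseline"):
    model = RandomForestRegressor(n_estimators=200, max_depth=6, random_state=42)
    model.fit(X_train, y_train)

    # Parameters, metrics and the serialized model are stored with the run.
    mlflow.log_params({"n_estimators": 200, "max_depth": 6})
    mlflow.log_metric("mae", mean_absolute_error(y_test, model.predict(X_test)))
    mlflow.sklearn.log_model(model, artifact_path="model")
```

On Databricks the run appears in the workspace experiment UI automatically; the logged model can later be registered and promoted through the model lifecycle stages described below.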
This glass box approach ensures that you are never trapped or suffer from\n", + "vendor lock-in.\n", + "\n", + "In that regard, the Lakehouse supports the industry’s widest range of data tools,\n", + "development environments, and a thriving ISV ecosystem so you can make your\n", + "workspace your own and put out your best work.\n", + "\n", + "##### Compute platform\n", + "\n", + "**Any ML workload optimized and accelerated**\n", + "\n", + "**Databricks Machine Learning Runtime**\n", + "\n", + "- Optimized and preconfigured ML frameworks\n", + "\n", + "- Turnkey distribution ML\n", + "\n", + "- Built-in AutoML\n", + "\n", + "- GPU support out of the box\n", + "\n", + "\n", + "Built-in **ML frameworks**\n", + "and **model explainability**\n", + "\n", + "Built-in support for **AutoML**\n", + "and **hyperparameter tuning**\n", + "\n", + "\n", + "Built-in support for\n", + "**distributed training**\n", + "\n", + "Built-in support for\n", + "**hardware accelerators**\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Deploy your models to production\n", + "\n", + "Exploring and creating your machine learning models\n", + "typically represents only part of the task. Once the\n", + "models exist and perform well, they must become\n", + "part of a pipeline that keeps models updated,\n", + "monitored and available for use by others.\n", + "\n", + "**Webhooks** allow registering of\n", + "\n", + "\n", + "Databricks can help here by providing a world-class\n", + "experience for model versioning, monitoring and\n", + "serving within the same platform that you can use\n", + "to generate the models themselves. This means you\n", + "can make all your ML pipelines in the same place,\n", + "monitor them for drift, retrain them with new data,\n", + "and promote and serve them easily and at scale.\n", + "\n", + "Throughout the ML lifecycle, rest assured knowing\n", + "that lineage and governance are being tracked the\n", + "entire way. This means regulatory compliance and\n", + "security woes are significantly reduced, potentially\n", + "saving costly issues down the road.\n", + "\n", + "\n", + "callbacks on events like stage\n", + "\n", + "transitions to integrate with CI/CD\n", + "\n", + "automation.\n", + "\n", + "**Tags** allow storing deployment\n", + "\n", + "— specific metadata with model\n", + "\n", + "versions, e.g., whether the\n", + "\n", + "deployment was successful.\n", + "\n", + "\n", + "**Model lifecycle management**\n", + "\n", + "Staging Production Archived\n", + "\n", + "\n", + "Logged\n", + "model\n", + "\n", + "**Comments** allow communication\n", + "\n", + "and collaboration between\n", + "\n", + "teammates when reviewing\n", + "\n", + "model versions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Learn more**\n", + "\n", + "[Databricks Machine Learning](https://databricks.com/product/machine-learning)\n", + "\n", + "[Databricks Data Science](https://databricks.com/product/data-science)\n", + "\n", + "[Databricks ML Runtime Documentation](https://docs.databricks.com/runtime/mlruntime.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 11\n", + "\n", + "\n", + "### Databricks Technology Partners and the modern data stack\n", + "\n", + "Databricks Technology Partners integrate their solutions with Databricks to\n", + "provide complementary capabilities for ETL, data ingestion, business intelligence,\n", + "machine learning and governance. 
These integrations allow customers to leverage\n", + "the Databricks Lakehouse Platform’s reliability and scalability to innovate faster\n", + "while deriving valuable data insights. Use preferred analytical tools with optimized\n", + "connectors for fast performance, low latency and high user concurrency to your\n", + "data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "With [Partner Connect](https://databricks.com/partnerconnect) , you can bring together all your data, analytics and AI tools on one open platform. Databricks provides a fast and easy way to connect your existing\n", + "tools to your lakehouse using validated integrations and helps you discover and try new solutions.\n", + "\n", + "**Databricks thrives within your modern data stack**\n", + "\n", + "**BI and dashboards** **Machine learning** **Data science**\n", + "\n", + "\n", + "**Data governance**\n", + "\n", + "**Data pipelines**\n", + "\n", + "**Data ingestion**\n", + "\n", + "\n", + "Data Data Data\n", + "warehousing engineering streaming\n", + "\n", + "**Unity Catalog**\n", + "\n", + "\n", + "Data science\n", + "and ML\n", + "\n", + "\n", + "**Consulting**\n", + "**and SI partners**\n", + "\n", + "\n", + "**Delta Lake**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "**Learn more**\n", + "\n", + "\n", + "[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n", + "\n", + "[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n", + "\n", + "\n", + "[Partner Connect](https://databricks.com/partnerconnect)\n", + "\n", + "[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "### Get started with the Databricks Lakehouse Platform\n", + "# 12\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Databricks Trial\n", + "\n", + "Get a collaborative environment for data teams to build solutions together with interactive\n", + "notebooks to use Apache Spark TM , SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras,\n", + "scikit-learn and more.\n", + "\n", + "**•** Available as a 14-day full trial in your own cloud or as a lightweight trial\n", + "hosted by Databricks\n", + "\n", + "**[Try Databricks for free](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n", + "\n", + "\n", + "**[Databricks documentation](https://databricks.com/documentation)**\n", + "\n", + "Get detailed documentation to get started with\n", + "the Databricks Lakehouse Platform on your cloud\n", + "of choice: Databricks on AWS, Azure Databricks\n", + "and [Databricks on Google Cloud](https://docs.gcp.databricks.com/?_gl=1*16ovt38*_gcl_aw*R0NMLjE2NTI1NDYxNjIuQ2owS0NRandwdjJUQmhEb0FSSXNBTEJuVm5saU9ydGpfX21uT1U5NU5iRThSbmI5a3o2OGdDNUY0UTRzYThtTGhVZHZVb0NhTkRBMmlWc2FBcEN6RUFMd193Y0I.&_ga=2.135042808.863708747.1652113196-1440404449.1635787641&_gac=1.225252968.1652546163.Cj0KCQjwpv2TBhDoARIsALBnVnliOrtj__mnOU95NbE8Rnb9kz68gC5F4Q4sa8mLhUdvUoCaNDA2iVsaApCzEALw_wcB) .\n", + "\n", + "**[Databricks Demo Hub](https://databricks.com/discover/demos)**\n", + "\n", + "Get a firsthand look at Databricks from the\n", + "practitioner’s perspective with these simple\n", + "on-demand videos. 
Each demo is paired with\n", + "related materials — including notebooks, videos\n", + "and eBooks — so that you can try it out for\n", + "yourself on Databricks.\n", + "\n", + "\n", + "**[Databricks Academy](https://databricks.com/learn/training/home)**\n", + "\n", + "Whether you are new to the data lake or building on\n", + "an existing skill set, you can find a curriculum tailored\n", + "to your role or interest. With training and certification\n", + "through Databricks Academy, you will learn to master\n", + "the Databricks Lakehouse Platform for all your big\n", + "data analytics projects.\n", + "\n", + "**[Databricks Community](https://community.databricks.com/)**\n", + "\n", + "\n", + "**[Databricks Labs](https://databricks.com/learn/labs)**\n", + "\n", + "Databricks Labs are projects created by the\n", + "field to help customers get their use cases\n", + "into production faster.\n", + "\n", + "**[Databricks customers](https://databricks.com/customers)**\n", + "\n", + "Discover how innovative companies across\n", + "every industry are leveraging the Databricks\n", + "Lakehouse Platform.\n", + "\n", + "\n", + "Get answers, network with peers and solve\n", + "the world’s toughest problems, together.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000\n", + "organizations worldwide — including Comcast, Condé Nast,\n", + "H&M and over 40% of the Fortune 500 — rely on the Databricks\n", + "Lakehouse Platform to unify their data, analytics and AI. Databricks\n", + "is headquartered in San Francisco, with offices around the globe.\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "and MLflow, Databricks is on a mission to help data teams solve the\n", + "world’s toughest problems. To learn more, follow Databricks on\n", + "[Twitter](https://twitter.com/databricks) **,** [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "© Databricks 2022. All rights reserved. Apache, Apache Spark, Spark and the Spark\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf2024-09-19T16:57:20Z
##### Guide\n", + "\n", + "## 6 Strategies for Building Personalized Customer Experiences\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "**Introduction** ................................................................................................................................................................................................................. **3**\n", + "\n", + "**1.** **Building a Foundation for Personalization**\n", + "Leveraging ML-Based Customer Entity Resolution ............................................................................................................................... **4**\n", + "\n", + "**2.** **Estimating Customer Lifetime Value**\n", + "Building Brand Loyalty With Data ................................................................................................................................................................. **6**\n", + "\n", + "**3.** **Mitigating Customer Churn**\n", + "Balancing Acquisition and Retention .......................................................................................................................................................... **10**\n", + "\n", + "**4.** **Streamlining Customer Analysis and Targeting**\n", + "Creating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n", + "\n", + "**5.** **Assessing Consumer Interest Data**\n", + "Fine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n", + "\n", + "**6.** **Delivering Personalized Customer Journeys**\n", + "Crafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n", + "\n", + "**Conclusion**\n", + "Building a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "In today’s experience-driven world, the most beloved brands are the ones that\n", + "know their customers. Customers are loyal to brands that recognize their needs\n", + "and preferences — and tailor user journeys and engagements accordingly.\n", + "\n", + "A study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\n", + "buying from a brand that personalizes the shopping and user experience to the\n", + "wants and needs of the customer. And as organizations pursue omnichannel\n", + "excellence, these same high expectations of online experiences also extend to\n", + "brick-and-mortar locations — revealing for many merchants that personalized\n", + "engagement is fundamental to attracting customers and expanding share of wallet.\n", + "\n", + "But achieving a 360-degree view of your customers to serve personalized\n", + "experiences requires integrating various types of data — including demographics,\n", + "behavioral and transactional — to develop robust profiles. 
This guide focuses on six\n", + "actionable strategic pillars for businesses to leverage automation, real-time data,\n", + "AI-driven analysis and well-tuned ML models to architect and deliver customized\n", + "customer experiences at every touch point.\n", + "\n", + "\n", + "# 76%\n", + "\n", + "of consumers are more\n", + "likely to purchase due to\n", + "personalization\n", + "\n", + "\n", + "# 76%\n", + "\n", + "\n", + "-----\n", + "\n", + "### Building a Foundation for Personalization\n", + "\n", + "Get a 360-degree view of the customer by leveraging ML-based entity resolution\n", + "\n", + "\n", + "To create truly personalized interactions, you need actionable insights\n", + "about your customers. Start by establishing a common customer profile and\n", + "accurately linking together customer records across disparate data sets.\n", + "\n", + "Get a 360-degree view of your target customer by bringing together:\n", + "\n", + "- Sales and traffic-driven first-party data\n", + "\n", + "- Product ratings and surveys\n", + "\n", + "- Customer surveys and support center calls\n", + "\n", + "- Third-party data purchased from data aggregators and online trackers\n", + "\n", + "- Zero-party data provided by customers themselves\n", + "\n", + "Location\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Personalizing‌ ‌experiences‌ with‌ ‌data‌ ‌and‌ ‌ML‌**\n", + "\n", + "Grab is the largest online-to-offline platform in Southeast Asia and\n", + "has generated over 6 billion transactions for transport, food and\n", + "grocery delivery, and digital payments. Grab uses Databricks to create\n", + "sophisticated customer segmentation and recommendation engines\n", + "that can now ingest and optimize thousands of user-generated signals\n", + "and data sources simultaneously, enhancing data integrity and security,\n", + "and reducing weeks of work to only hours.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/grab)\n", + "\n", + "\n", + "\n", + "Demographics\n", + "\n", + "\n", + "Orders\n", + "\n", + "Network/\n", + "Usage\n", + "\n", + "\n", + "“The C360 platform empowered teams to create\n", + "consumer features at scale, which in turn allows\n", + "for these features to be extended to other markets\n", + "and used by other teams. This helps to reduce the\n", + "engineering overhead and costs exponentially.”\n", + "\n", + "**N I K H I L DWA R A K A N AT H**\n", + "Head of Analytics, Grab\n", + "\n", + "\n", + "Social\n", + "\n", + "Apps/\n", + "Clickstream\n", + "\n", + "|Col1|Col2|Col3|Col4|Col5|Col6|\n", + "|---|---|---|---|---|---|\n", + "|||||||\n", + "||Cus 3|t 6|o|mer 0||\n", + "|||||||\n", + "|||||||\n", + "\n", + "\n", + "\n", + "Service Call/\n", + "Records\n", + "\n", + "\n", + "Customer\n", + "360\n", + "\n", + "\n", + "Billing\n", + "\n", + "Devices\n", + "\n", + "\n", + "-----\n", + "\n", + "Given the different data sources and data types, automated matching can still\n", + "be incredibly challenging due to inconsistent formats, misinterpretation of data,\n", + "and entry errors across various systems. 
And even if inconsistent, all that data\n", + "may be perfectly valid — but to accurately connect the millions of customer\n", + "identities most retailers manage, businesses must lean on automation.\n", + "\n", + "In a machine learning (ML) approach to entity resolution, text attributes like\n", + "name, address and phone number are translated into numerical representations\n", + "that can be used to quantify the degree of similarity between any two attribute\n", + "values. But your ability to train such a model depends on your access to\n", + "accurately labeled training data. It’s a time-consuming exercise, but if done right,\n", + "the model learns to reflect the judgments of the human reviewers.\n", + "\n", + "Many organizations rely on libraries encapsulating this knowledge to build their\n", + "applications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\n", + "bringing together ML-based approaches to intelligent candidate pair generation\n", + "and pair-scoring. Oriented toward the construction of custom workflows, Zingg\n", + "presents these capabilities within the context of commonly employed steps\n", + "such as training data label assignment, model training, data set deduplication,\n", + "and (cross-data set) record matching.\n", + "\n", + "Built as a native Apache Spark TM application, Zingg scales well to apply these\n", + "techniques to enterprise-sized data sets. Organizations can then use Zingg in\n", + "combination with platforms such as Databricks Lakehouse to provide the back\n", + "end to human-in-the-middle workflow applications that automate the bulk of\n", + "the entity resolution work and present data experts with a more manageable\n", + "set of edge case pairs to interpret.\n", + "\n", + "\n", + "As an active-learning solution, models can be retrained to take advantage of\n", + "this additional human input to improve future predictions and further reduce\n", + "the number of cases requiring expert review. Finally, these technologies can be\n", + "assembled to enable their own enterprise-scaled customer entity resolution\n", + "workflow applications.\n", + "\n", + "**Need help building your foundation for a**\n", + "**360-degree view of your customers?**\n", + "\n", + "Get pre-built code sample data and step-by-step instructions\n", + "in a Databricks notebook in the **Customer Entity Resolution**\n", + "**Solution Accelerator.**\n", + "\n", + "**•** Translating text attributes (like name, address, phone number)\n", + "into quantifiable numerical representations\n", + "\n", + "**•** Training ML models to determine if these numerical labels\n", + "form a match\n", + "\n", + "**•** Scoring the confidence of each match\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-entity-resolution)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Estimating Customer Lifetime Value\n", + "\n", + "Building brand loyalty to drive share of wallet with data\n", + "\n", + "\n", + "Once you’ve set up a 360-degree view of the customer, the next challenge\n", + "is how to spend money to profitably grow the brand. The goal is to spend\n", + "marketing dollars on activities that attract loyal customers and avoid spending on\n", + "unprofitable customers or activities that damage the brand. Keep in mind, that\n", + "making decisions solely based on ROI isn’t the answer. 
This one-track approach\n", + "could ultimately weaken your brand equity and make you more dependent on\n", + "lowering your price through promotions as a way to generate sales.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "\n", + "**Identifying and engaging brand loyalists**\n", + "\n", + "Today’s customer has overwhelmingly abundant options in products and\n", + "services to choose from. That’s why personalizing customer experiences is so\n", + "important, as it increases revenue, marketing efficiency and customer retention.\n", + "\n", + "Not every customer carries the same potential for profitability. Different\n", + "customers derive different value from your products and services, which directly\n", + "translates into differences in the overall amount of value a business can expect\n", + "in return. Mutually beneficial relationships carefully align customer acquisition\n", + "cost (CAC) and retention rates with the total revenue or customer lifetime value\n", + "(CLV).\n", + "\n", + "\n", + "**Predicting and increasing customer lifetime value with ML**\n", + "\n", + "\n", + "Kolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\n", + "attracts over 10 million monthly active users. With Databricks, they\n", + "achieved a 30% increase in player LTV, improved data team productivity\n", + "by 3x, and reduced ML model-to-production time by 40x.\n", + "\n", + "[Get the full story](https://databricks.com/customers/kolibri-games)\n", + "\n", + "Within your existing customer base are people ranging from brand loyalists to\n", + "brand transients. Brand loyalists are highly engaged with your brand, are willing\n", + "to share their experience with others, and are the most likely to purchase\n", + "again. Brand transients have no loyalty to your brand and shop based on price.\n", + "Your focus should be on growing the group of brand loyalists while minimizing\n", + "interactions with brand transients.\n", + "\n", + "\n", + "**Calculating customers’ lifetime intent**\n", + "\n", + "To assess the remaining lifetime in a customer relationship, businesses must\n", + "\n", + "carefully examine the transactional signals and other indicators from previous\n", + "customer engagements and transactions.\n", + "\n", + "For example, if a frequent customer slows down their buying habits — or simply\n", + "doesn’t make a purchase for an extended period of time — it may signal the\n", + "upcoming end of the relationship. However, in the case of another customer\n", + "who engages infrequently, the same extended absence may not signal anything\n", + "notable. The infrequent buyer may continue to purchase even after a long pause\n", + "in activity.\n", + "\n", + "\n", + "-----\n", + "\n", + "Customer A\n", + "\n", + "Customer B\n", + "\n", + "Customer C\n", + "\n", + "\n", + "Past Future\n", + "\n", + "Different customers with the same number of transactions, but signaling different lifetime intent. The probability of re-engagement (P_alive) relative to a customer’s history of purchases.\n", + "\n", + "\n", + "Every customer relationship with a business has a lifespan. Understanding what\n", + "point in the lifespan at a given time provides critical insight to inform marketing\n", + "and sales tactics. By proactively discovering shifts in the relationship, you can\n", + "adapt how to respond to each customer at the optimal time. 
For example, a\n", + "certain signal might prompt a change in how to deliver products and services,\n", + "which could help maximize revenue.\n", + "\n", + "Transactional signals can be used to estimate the probability that a customer\n", + "is active and likely to return in the future. Popularized as the Buy ’til You Die\n", + "(BTYD) model, analysts can compare a customer’s frequency and recency of\n", + "\n", + "engagement to similar patterns across their user population to accurately\n", + "predict individual CLV.\n", + "\n", + "\n", + "The mathematics behind these predictive CLV models is complex, but the logic\n", + "behind these critical models is accessible through a popular Python library\n", + "named Lifetimes, which allows the input of simple summary metrics in order to\n", + "derive customer-specific lifetime estimates.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**How personalized experiences keep customers coming**\n", + "**back for more**\n", + "\n", + "Publicis Groupe empowers brands to transform retail experiences with\n", + "digital technologies, but data challenges and team silos stood in the\n", + "way of delivering the personalization that their customers required.\n", + "See how they use Databricks to create a single customer view that\n", + "allows them to drive customer loyalty and retention. As a result, they’ve\n", + "seen a 45%–50% increase in customer campaign revenue.\n", + "\n", + "[Get the full story](https://databricks.com/customers/publicis-groupe)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Delivering customer lifetime estimates to the business**\n", + "\n", + "\n", + "Spark natively distributes this work across a multi-server environment, enabling\n", + "consistent, accurate and efficient analysis. Spark’s flexibility allows models to\n", + "adapt in real time as new information is ingested, eliminating the bottlenecks\n", + "that come with manual data mapping and profile building.\n", + "\n", + "With per customer metrics calculated, the Lifetimes library can be used to train\n", + "multiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\n", + "predict engagements over time using proprietary data can take several months\n", + "and thousands of training runs. [Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\n", + "businesses tap into the infrastructure behind their Spark environments and\n", + "distribute the training outputs across models.\n", + "\n", + "\n", + "Using the Lifetimes library to calculate customer-specific probabilities at speed\n", + "and scale can be challenging — from processing large volumes of transaction\n", + "data to deriving data curves and value distribution patterns and, eventually,\n", + "to integration with business initiatives. 
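At small scale, the core of that workflow is only a few lines. Here is a minimal sketch using the Lifetimes library, assuming a transaction file and column names that are purely illustrative.

```python
# Minimal sketch of the Lifetimes workflow described above.
# The transaction file and column names are assumptions.
import pandas as pd
from lifetimes import BetaGeoFitter
from lifetimes.utils import summary_data_from_transaction_data

transactions = pd.read_csv("transactions.csv")  # hypothetical: customer_id, order_date

# Derive the per-customer frequency / recency / age (T) summary metrics
summary = summary_data_from_transaction_data(
    transactions, customer_id_col="customer_id", datetime_col="order_date"
)

# Fit a BG/NBD ("Buy 'til You Die") model on those summary metrics
bgf = BetaGeoFitter(penalizer_coef=0.001)
bgf.fit(summary["frequency"], summary["recency"], summary["T"])

# Probability each customer is still "alive" and expected purchases in the next 30 days
summary["p_alive"] = bgf.conditional_probability_alive(
    summary["frequency"], summary["recency"], summary["T"]
)
summary["purchases_30d"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    30, summary["frequency"], summary["recency"], summary["T"]
)
print(summary.sort_values("purchases_30d", ascending=False).head())
```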
But with the proper approach, you can\n", + "resolve all of them.\n", + "\n", + "These models depend on three key per customer metrics:\n", + "\n", + "**FREQUENCY**\n", + "The number of times within a given time period in which a repeat\n", + "transaction is observed\n", + "\n", + "**AGE**\n", + "The length of time between the occurrence of an initial transaction\n", + "to the end of a given time period\n", + "\n", + "**RECENCY**\n", + "\n", + "The “age” of a customer (how long they’ve engaged with a brand)\n", + "at the time of their latest repeat transaction\n", + "\n", + "\n", + "-----\n", + "\n", + "**Solution deployment**\n", + "\n", + "\n", + "Once properly trained, these models can determine the probability that a\n", + "customer will re-engage, as well as the number of engagements a business\n", + "can expect from that customer over time. But the real challenge is putting\n", + "these predictive capabilities into the hands of those that determine\n", + "customer engagement.\n", + "\n", + "Matrices illustrating the probability a customer is alive (left) and the number of future\n", + "purchases in a 30-day window given a customer’s frequency and recency metrics (right).\n", + "\n", + "\n", + "Businesses need a way to develop and deploy solutions in a highly scalable\n", + "environment with a limited upfront cost. Databricks Solution Accelerators\n", + "leverage real-world sample data sets and pre-built code to show how raw data\n", + "can be transformed into real solutions — including step-by-step instructions\n", + "ready to go in a Databricks notebook.\n", + "\n", + "**Need help determining your customers’**\n", + "**lifetime value?**\n", + "\n", + "Use the **Customer Lifetime Value Accelerator** to\n", + "\n", + "**•** Ingest sample retail data\n", + "\n", + "**•** Use pre-built code to develop visualizations and explore\n", + "past purchase behavior\n", + "\n", + "**•** Apply machine learning to predict the likelihood and\n", + "nature of future purchases\n", + "\n", + "**[GET THE ACCELERATOR](https://databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Mitigating Customer Churn\n", + "\n", + "Balancing acquisition and retention with personalized experiences\n", + "\n", + "\n", + "There are no guarantees of success. With a bevy of options at their disposal,\n", + "customer churn is a reality that companies face and are focused on overcoming\n", + "every day. One [recent analysis](https://info.recurly.com/annual-subscription-billling-metrics-report?submissionGuid=3c21cde7-5f58-4d86-9218-332d697e7b3e) of consumer-oriented subscription services\n", + "estimated a segment average 7.2% monthly rate of churn. When narrowed to\n", + "brands focused on consumer goods, that rate jumped to 10.0%. This figure\n", + "translates to a lifetime of 10 months for the average subscription box service,\n", + "leaving businesses of this kind with little time to recover acquisition costs and\n", + "bring subscribers to net profitability.\n", + "\n", + "**C A S E S T U DY**\n", + "##### Riot Games\n", + "\n", + "**Creating an optimal in-game experience for League of Legends**\n", + "\n", + "Riot Games is one of the top PC game developers in the world, with over\n", + "100 million monthly active users, 500 billion data points, and over 26\n", + "petabytes of data and counting. 
They turned to Databricks to build a more\n", + "\n", + "efficient and scalable way to leverage data and improve the overall gaming\n", + "experience — ensuring customer engagement and reducing churn.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/riot-games)\n", + "\n", + "Organizations must take an honest look at the cost of acquisition relative to a\n", + "customer’s lifetime value (LTV) earned. These figures need to be brought into a\n", + "\n", + "healthy balance and treated as a “chronic condition” [to be managed.](https://retailtouchpoints.com/features/trend-watch/can-subscription-retail-solve-its-customer-retention-problem)\n", + "\n", + "\n", + "**Understanding attrition predictability through subscriptions:**\n", + "**Examining retention-based acquisition variables**\n", + "\n", + "Public data for subscription services is extremely hard to come by. KKBox, a\n", + "Taiwan-based music streaming service, recently released over two years of\n", + "anonymized [subscription data](https://www.kaggle.com/c/kkbox-churn-prediction-challenge) to examine customer churn. Through analyzing\n", + "the data, we uncover customer dynamics familiar to any subscription provider.\n", + "\n", + "Most subscribers join the KKBox service through a 30-day trial offer. Customers\n", + "then appear to enlist in one-year subscriptions, which provide the service with\n", + "a steady flow of revenue. Subscribers typically churn at the end of the 30-day\n", + "trial and at regular one-year intervals.\n", + "\n", + "The Survival Rate reflects the proportion of the initial (Day 1) subscriber population that is\n", + "retained over time, first at the roll-to-pay milestone, and then at the renewal milestone.\n", + "\n", + "\n", + "-----\n", + "\n", + "By Initial Payment Method\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers registering via different payment methods.\n", + "\n", + "By Initial Payment Plan Days\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers selecting different initial payment methods and terms/days.\n", + "\n", + "\n", + "This pattern of high initial drop-off, followed by a period of slower but continuing\n", + "drop-off cycles makes intuitive sense. Where it gets interesting is when the\n", + "data changes. The patterns of customer churn become vastly different as time\n", + "passes and new or changing elements are introduced (e.g., payment methods\n", + "and options, membership tiers, etc.).\n", + "\n", + "By Registration Channel\n", + "\n", + "timeline\n", + "\n", + "Customer attrition by subscription day on the KKBox streaming service for\n", + "customers registering via different channels.\n", + "\n", + "\n", + "-----\n", + "\n", + "These patterns seem to indicate that KKBox _could_ potentially differentiate\n", + "between customers based on their lifetime potential, using only the information\n", + "available at subscriber acquisition. In the same way, non-subscription businesses\n", + "could use similar data techniques to get an accurate illustration of the total\n", + "lifetime value of a particular customer, even before collecting historical data.\n", + "\n", + "This information can help businesses target certain shoppers with effective\n", + "discounts or promotions as early as trial registration. 
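Retention curves like the ones described above are commonly produced with survival analysis. Below is a minimal sketch using the lifelines library, one possible choice that is not prescribed by the text; the subscription data layout is an assumption.

```python
# Minimal sketch of per-channel retention (survival) curves with lifelines.
# The subscription data layout below is hypothetical.
import pandas as pd
from lifelines import KaplanMeierFitter

subs = pd.read_csv("subscriptions.csv")
# assumed columns:
#   tenure_days -- days from registration to churn (or to end of observation)
#   churned     -- 1 if the subscription ended, 0 if still active (censored)
#   channel     -- registration channel

kmf = KaplanMeierFitter()
ax = None
for channel, grp in subs.groupby("channel"):
    kmf.fit(grp["tenure_days"], event_observed=grp["churned"], label=str(channel))
    ax = kmf.plot_survival_function(ax=ax)  # one retention curve per channel
```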
Nevertheless, it’s always\n", + "important to consider more than individual data points.\n", + "\n", + "The baseline risk of customer attrition over a subscription lifespan.\n", + "\n", + "\n", + "The channel and payment method multipliers combine to explain a customer’s risk of attrition\n", + "at various points in time. The higher the value, the higher the proportional risk of churn in the\n", + "associated period.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Applying churn analytics to your data**\n", + "\n", + "This analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n", + "**2)** to paint a quantitative picture of the specific factors that explain that risk,\n", + "giving analysts a clearer understanding of what to focus on, what to ignore and\n", + "what to investigate further. The main challenge is organizing the input data.\n", + "\n", + "The data required to examine customer attrition may be scattered across\n", + "multiple systems, making an integrated analysis difficult. [Data lakes](https://databricks.com/discover/data-lakes/introduction) support\n", + "the creation of transparent, sustainable data processing pipelines that are\n", + "flexible, scalable and highly cost-efficient. Remember that **churn is a chronic**\n", + "**condition to be managed** , and attrition data should be periodically revisited to\n", + "maintain alignment between acquisition and retention efforts.\n", + "\n", + "**Need help predicting customer churn?**\n", + "\n", + "Use the **Subscriber Churn Prediction Accelerator** to analyze\n", + "behavioral data, identify subscribers with an increased risk of\n", + "cancellation, and predict attrition. Machine learning lets you\n", + "quantify a user’s likelihood to churn, identifying factors that\n", + "explain the risk.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Streamlining Customer Analysis and Targeting\n", + "\n", + "Creating efficient and highly targeted customer experiences with behavioral data\n", + "\n", + "\n", + "Effective targeting comes down to one fundamental element: the cost of\n", + "delivering a good or service relative to what a consumer is willing to pay.\n", + "\n", + "In the earliest applications of segmentation, manufacturers recognized that\n", + "specialized product lines targeting specific consumer groups could help\n", + "brands stand out against competitors.\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Finding that special something every time**\n", + "\n", + "Pandora is a jewelry company with global reach. They built their master\n", + "consumer view (MCV) dashboard on the Databricks Lakehouse Platform,\n", + "giving them the insights necessary to deliver highly targeted messaging\n", + "and personalization — resulting in 80% growth in email marketing\n", + "success, a 50% increase in click-to-open rate across 65 million emails,\n", + "and 255M DKK (Danish Krone) in quarterly revenue.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/pandora)\n", + "\n", + "This mode of thinking extends beyond product development and into every\n", + "customer-oriented business function, requiring specific means of ideation,\n", + "production and delivery. The work put into segmentation doesn’t need to be\n", + "a gamble. Scrutinizing customers and testing responsiveness is an ongoing\n", + "process. 
Organizations must analyze and adapt to shifting markets, changing\n", + "consumer demand and evolving business objectives.\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Powering insight-driven dashboards to increase customer**\n", + "**acquisition**\n", + "\n", + "Bagelcode is a global game company with more than 50 million global\n", + "users. By using the Databricks Lakehouse Platform, they are now able to\n", + "support more diversified indicators, such as a user’s level of frequency\n", + "and the amount of time they use a specific function for each game,\n", + "enabling more well-informed responses. In addition, the company is\n", + "mitigating customer churn by better predicting gamer behavior and\n", + "providing personalized experiences at scale.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/bagelcode)\n", + "\n", + "“Thanks to Databricks Lakehouse, we can support\n", + "real-time business decision-making based on data\n", + "analysis results that are automatically updated on\n", + "an hourly and daily basis, even as data volumes have\n", + "increased by nearly 1,000 times.”\n", + "\n", + "**J O O H Y U N K I M**\n", + "Vice President, Data and AI, Bagelcode\n", + "\n", + "\n", + "-----\n", + "\n", + "A brand’s goal with segmentation should be to define a shared customer\n", + "perspective on customers, allowing the organization to engage users consistently\n", + "and cohesively. But any adjustments to customer engagement require careful\n", + "consideration of [organizational change concerns](https://www.researchgate.net/publication/45348436_Bridging_the_segmentation_theorypractice_divide) .\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Responding to global demand shifts with ease**\n", + "\n", + "Reckitt produces some of the world’s most recognizable and trusted\n", + "consumer brands in hygiene, health and nutrition. With Databricks\n", + "Lakehouse on Azure, they’re able to meet the needs of billions of\n", + "consumers worldwide by surfacing real-time, highly accurate, deep\n", + "customer insights, leading to a better understanding of trends and\n", + "demand, allowing them to provide best-in-class experiences in\n", + "every market.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/reckitt)\n", + "\n", + "\n", + "**A segmentation walk-through: Grocery chain promotions**\n", + "\n", + "A promotions management team for a large grocery chain is responsible for\n", + "running a number of promotional campaigns, each of which is intended to drive\n", + "greater overall sales. Today, these marketing campaigns include leaflets and\n", + "coupons mailed to individual households, manufacturer coupon matching,\n", + "in-store discounts and the stocking of various private-label alternatives to\n", + "popular national brands.\n", + "\n", + "Recognizing uneven response rates between households, the team is eager to\n", + "determine if customers might be segmented based on their responsiveness\n", + "to these promotions. They anticipate that such segmentation may allow the\n", + "promotions management team to better target individual households, driving\n", + "overall higher response rates for each promotional dollar spent.\n", + "\n", + "Using historical data from point-of-sale systems along with campaign\n", + "information from their promotions management systems, the team derives\n", + "a number of features that capture the behavior of various households with\n", + "regard to promotions. 
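The clustering step that follows might look like the minimal sketch below: scale the derived features, fit k-means, and profile the resulting segments. The feature names and input file are hypothetical.

```python
# Minimal sketch of the household segmentation step: scale the derived
# promotion-response features, fit k-means, then profile the segments.
# Feature names and the input file are assumptions.
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.cluster import KMeans

households = pd.read_parquet("household_promo_features.parquet")
features = ["coupon_redemption_rate", "leaflet_response_rate",
            "instore_discount_share", "private_label_share"]

X = StandardScaler().fit_transform(households[features])

kmeans = KMeans(n_clusters=4, n_init=10, random_state=42)
households["segment"] = kmeans.fit_predict(X)

# Profile each segment by its average responsiveness to each promotion type
print(households.groupby("segment")[features].mean().round(3))
```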
Applying standard data preparation techniques, the data\n", + "is organized for analysis and using a variety of clustering algorithms, such as\n", + "k-means and hierarchical clustering, the team settles on two potentially useful\n", + "cluster designs.\n", + "\n", + "\n", + "-----\n", + "\n", + "Overlapping segment designs separating households based on their responsiveness to\n", + "various promotional offerings. Profiling of clusters to identify differences in behavior across clusters.\n", + "\n", + "**Assessing results**\n", + "\n", + "\n", + "Comparing households by demographic factors not used in developing the\n", + "clusters themselves, some interesting patterns separating cluster members\n", + "by age and other factors are identified. While this information may be useful\n", + "\n", + "in not only predicting cluster membership and designing more effective\n", + "campaigns targeted to specific groups of households, the team recognizes\n", + "the need to collect additional demographic data before putting too much\n", + "emphasis on these results.\n", + "\n", + "\n", + "With profiling, marketers can discern those customer households in the\n", + "highlighted example fall into two groups: those who are responsive to coupons\n", + "and mailed leaflets, and those who are not. Further divisions show differing\n", + "degrees of responsiveness to other promotional offers.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Need help segmenting your customers for**\n", + "**more targeted marketing?**\n", + "\n", + "Use the **Customer Segmentation Accelerator** and drive\n", + "better purchasing predictions based on behaviors. Through\n", + "sales data, campaigns and promotions systems, you can\n", + "build useful customer clusters to effectively target various\n", + "households with different promos and offers.\n", + "\n", + "Age-based differences in cluster composition of behavior-based customer segments.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n", + "\n", + "The results of the analysis now drive a dialog between the data scientists and\n", + "the promotions management team. Based on initial findings, a revised analysis\n", + "will be performed focused on what appear to be the most critical features\n", + "differentiating households as a means to simplify the cluster design and evaluate\n", + "overall cluster stability. Subsequent analyses will also examine the revenue\n", + "\n", + "generated by various households to understand how changes in promotional\n", + "engagement may impact customer spending.\n", + "\n", + "Using this information, the team believes they will have the ability to make a case\n", + "for change to upper management. Should a change in promotions targeting be\n", + "approved, the team makes plans to monitor household spending, promotions\n", + "spend and campaign responsiveness rates using much of the same data used in\n", + "this analysis. This will allow the team to assess the impact of these efforts and\n", + "identify when the segmentation design needs to be revisited.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Assessing Consumer Interest Data to Inform Engagement Strategies\n", + "\n", + "Fine-tuning ML recommendations to boost conversions\n", + "\n", + "\n", + "Personalization is a [journey](https://www.bcg.com/publications/2021/the-fast-track-to-digital-marketing-maturity) . 
To operationalize personalized experiences, it’s\n", + "important to identify high-value audiences who have the highest likelihood of\n", + "specific actions. Here’s where **propensity scoring** comes in.\n", + "\n", + "Specifically, this process allows companies to estimate customers’ potential\n", + "receptiveness to an offer or to content related to a subset of products, and\n", + "determine which messaging to apply. Calculating propensity scores requires\n", + "assessment of past interactions and data points (e.g., frequency of purchases,\n", + "percentage of spend associated with a particular product category, days since\n", + "last purchase and other historical data).\n", + "\n", + "Databricks provides critical capabilities for propensity scoring (like the Feature\n", + "Store, AutoML and MLflow) to help businesses answer three key considerations\n", + "and develop a robust process:\n", + "\n", + "**1.** How to maintain the significant number of features used\n", + "to train propensity models\n", + "\n", + "**2.** How to rapidly train models aligned with new campaigns\n", + "\n", + "**3.** How to rapidly re-deploy models, retrained as customer\n", + "patterns drift, into the scoring pipeline\n", + "\n", + "**Boosting model training efficiency**\n", + "\n", + "With the [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) , data scientists can easily reuse features\n", + "created by others.\n", + "\n", + "\n", + "The feature store is a centralized repository that enables the persistence,\n", + "discovery and sharing of features across various model training exercises.\n", + "As features are captured, lineage and other metadata are captured. Standard\n", + "security models ensure that only permitted users and processes may\n", + "employ these features, enforcing the organization’s data access policies on\n", + "data science processes.\n", + "\n", + "**Extracting the complexities of ML**\n", + "\n", + "[Databricks AutoML](https://docs.databricks.com/applications/machine-learning/automl.html) allows you to quickly generate models by leveraging industry\n", + "best practices. As a glass box solution, AutoML first generates a collection of\n", + "notebooks representing various aligned model variations. In addition to iteratively\n", + "training models, AutoML allows you to access the notebooks associated with each\n", + "model, creating an editable starting point for further exploration.\n", + "\n", + "**Streamlining the overall ML lifecycle**\n", + "\n", + "[MLflow](https://docs.databricks.com/applications/mlflow/index.html) is an open source machine learning model repository, managed within the\n", + "Databricks Lakehouse. This repository enables tracking and analysis of the various\n", + "model iterations generated by both AutoML and custom training cycles alike.\n", + "\n", + "When used in combination with the Databricks Feature Store, models persisted\n", + "with MLflow can retain knowledge of the features used during training. 
As models\n", + "are retrieved, this same information allows the model to retrieve relevant features\n", + "from the Feature Store, greatly simplifying the scoring workflow and enabling\n", + "rapid deployment.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How to build a propensity scoring workflow with Databricks**\n", + "\n", + "Using these features in combination, many organizations implement propensity\n", + "scoring as part of a three-part workflow:\n", + "\n", + "**1.** Data engineers work with data scientists to define features relevant\n", + "to the propensity scoring exercise and persist these to the Feature Store.\n", + "Daily or even real-time feature engineering processes are then defined\n", + "to calculate up-to-date feature values as new data inputs arrive.\n", + "\n", + "Model Training\n", + "and Deployment\n", + "\n", + "\n", + "**2.** As part of the inference workflow, customer identifiers are presented to\n", + "previously trained models in order to generate propensity scores based on\n", + "the latest features available. Feature Store information captured with the\n", + "model allows data engineers to retrieve these features and easily generate\n", + "the desired scores, which can then be used for analysis within Databricks\n", + "Lakehouse or published to downstream marketing systems.\n", + "\n", + "**3.** In the model-training workflow, data scientists periodically retrain the\n", + "propensity score models to capture shifts in customer behaviors. As these\n", + "models are persisted to MLfLow, change management processes are used\n", + "to evaluate and elevate those models that meet organizational criteria-toproduction status. In the next iteration of the inference workflow, the latest\n", + "production version of each model is retrieved to generate customer scores.\n", + "\n", + "\n", + "Score Generation\n", + "and Publication ETL\n", + "\n", + "**Need help assessing interest from your**\n", + "**target audience?**\n", + "\n", + "\n", + "Feature\n", + "Engineering ETL\n", + "\n", + "Feature Store Profiles\n", + "\n", + "\n", + "Sales\n", + "\n", + "Promotions\n", + "\n", + "Customer\n", + "\n", + "\n", + "Use the **Propensity Scoring Accelerator** to estimate\n", + "customers’ potential receptiveness to an offer or to\n", + "content related to a subset of products. 
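Condensed to its essentials, the training and inference hand-off in that workflow can be sketched with MLflow. In a full implementation the features would be looked up from the Feature Store; here they are read directly, and the table, column and model names are assumptions.

```python
# Minimal sketch of the train / register / batch-score hand-off described above.
# Feature table, column and model names are assumptions; in practice features
# come from the Databricks Feature Store rather than a file.
import mlflow
import pandas as pd
from sklearn.ensemble import GradientBoostingClassifier

features = pd.read_parquet("customer_features.parquet")
X, y = features.drop(columns=["customer_id", "responded"]), features["responded"]

# Model-training workflow: train and register a new propensity model version
with mlflow.start_run():
    model = GradientBoostingClassifier().fit(X, y)
    mlflow.sklearn.log_model(model, "model",
                             registered_model_name="campaign_propensity")

# Inference workflow: load the latest production version (after the new version
# has been promoted to Production in the registry) and publish scores
loaded = mlflow.sklearn.load_model("models:/campaign_propensity/Production")
features["propensity"] = loaded.predict_proba(X)[:, 1]
```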
Using these scores,\n", + "marketers can determine which of the many messages at\n", + "their disposal should be presented to a specific customer.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n", + "\n", + "\n", + "Downstream\n", + "Applications\n", + "\n", + "\n", + "A three-part propensity scoring workflow.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Delivering Personalized Customer Journeys\n", + "\n", + "Strategies for crafting a real-time recommendation engine\n", + "\n", + "\n", + "As the economy continues to weather unpredictable disruptions, shortages and\n", + "demand, delivering personalized customer experiences at speed and scale will\n", + "require adaptability on the ground and within a company’s operational tech stack.\n", + "\n", + "\n", + "With the Databricks Lakehouse, Al-Futtaim has transformed their data\n", + "strategy and operations, allowing them to create a “golden customer\n", + "record” that improves all decision-making from forecasting demand to\n", + "powering their global loyalty program.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/al-futtaim)\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "\n", + "“Databricks Lakehouse allows every division in our\n", + "organization — from automotive to retail — to gain\n", + "a unified view of our customer across businesses.\n", + "With these insights, we can optimize everything from\n", + "forecasting and supply chain, to powering our loyalty\n", + "program through personalized marketing campaigns,\n", + "cross-sell strategies and offers.”\n", + "\n", + "**D M I T R I Y D O V G A N**\n", + "Head of Data Science, Al-Futtaim Group\n", + "\n", + "As COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\n", + "safety and community, brands most attuned to changing needs and sentiments\n", + "saw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\n", + "business and many lost, organizations that had already begun the journey toward\n", + "improved customer experience saw better outcomes, closely mirroring patterns\n", + "[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\n", + "\n", + "\n", + "**Creating a unified view across 200+ brands**\n", + "\n", + "As a driving force for economic growth in the Middle East, Al-Futtaim\n", + "impacts the lives of millions of people across the region through the\n", + "distribution and operations of global brands like Toyota, IKEA, Ace\n", + "Hardware and Marks & Spencer.\n", + "\n", + "Al-Futtaim’s focus is to harness their data to improve all areas of the\n", + "business, from streamlining the supply chain to optimizing marketing\n", + "strategies. 
But with the brands capturing such a wide variety of data,\n", + "Al-Futtaim’s legacy systems struggled to provide a single view into\n", + "the customer due to data silos and the inability to scale efficiently to\n", + "meet analytical needs.\n", + "\n", + "\n", + "-----\n", + "\n", + "The personalization of customer experiences will remain a key focus for B2C\n", + "and [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\n", + "experience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\n", + "long-established players.\n", + "\n", + "**Focus on the customer journey**\n", + "\n", + "Personalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\n", + "The [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\n", + "how they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\n", + "[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Personalizing the beauty product shopping experience**\n", + "\n", + "Flaconi wanted to leverage data and AI to become the No. 1 online\n", + "beauty product destination in Europe. However, they struggled with\n", + "massive volumes of streaming data and with infrastructure complexity\n", + "that was resource-intensive and costly to scale. See how they used\n", + "Databricks to increase time-to-market by 200x, reduce staff costs by\n", + "40% and increase net order income.\n", + "\n", + "Get the full story\n", + "\n", + "\n", + "¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\n", + "Experience Performance Index in 2007-09.\n", + "\n", + "Source: Forrester Customer Experience Performance Index (2007-09); press search\n", + "\n", + "CX leaders outperform laggards, even in a down market, in this visualization of the Forrester\n", + "Customer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\n", + "\n", + "\n", + "-----\n", + "\n", + "Careful consideration of how customers interact with various assets — and how\n", + "these interactions may be interpreted as expressions of preference — can unlock\n", + "a wide range of data that enables personalization.\n", + "\n", + "\n", + "The complexity of these engines requires that they be deployed thoughtfully, using\n", + "limited pilots and customer response assessments. 
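Such an engine can be prototyped directly on interaction data with collaborative filtering. Below is a minimal sketch using Spark MLlib's ALS; the table and column names are assumptions, and `spark` is the notebook's SparkSession.

```python
# Minimal sketch of a collaborative filtering recommender with Spark MLlib ALS.
# Table and column names are assumptions; customer_id and product_id are
# assumed to be integer IDs, as ALS requires.
from pyspark.ml.recommendation import ALS

# Implicit-feedback interactions, e.g. views or purchases per customer and product
interactions = spark.table("sales.customer_product_interactions") \
    .selectExpr("customer_id", "product_id", "CAST(interaction_count AS FLOAT) AS rating")

als = ALS(
    userCol="customer_id", itemCol="product_id", ratingCol="rating",
    implicitPrefs=True, rank=32, regParam=0.1, coldStartStrategy="drop",
)
model = als.fit(interactions)

# Top 10 product recommendations per customer, ready to publish downstream
recs = model.recommendForAllUsers(10)
recs.show(5, truncate=False)
```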
And in those assessments,\n", + "it’s important to keep in mind that there is no expectation of perfection — only\n", + "incremental improvement over the prior solution.\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Need help generating personalized**\n", + "**recommendations?**\n", + "\n", + "\n", + "**Connecting shoppers to savings with data-driven**\n", + "**personalization‌**\n", + "\n", + "\n", + "Use the **Recommendation Engines Accelerator** to estimate\n", + "customers’ potential receptiveness to an offer or to\n", + "content related to a subset of products. Using these scores,\n", + "marketers can determine which of the many messages at\n", + "their disposal should be presented to a specific customer.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n", + "\n", + "\n", + "Flipp is an online marketplace that aggregates weekly shopping circulars,\n", + "so consumers get deals and discounts without clipping coupons. Siloed\n", + "customer data sources once made getting insights difficult. Now with\n", + "Databricks, Flipp’s data teams can access and democratize data, helping\n", + "them do their jobs more effectively while bringing better deals to users,\n", + "more meaningful insights to partners, and a 10% jump in foot traffic to\n", + "brick-and-mortar retailers.\n", + "\n", + "Get the full story\n", + "\n", + "The engines we use to serve content based on customer preferences are known\n", + "as recommenders. With some recommenders, a heavy focus on the shared\n", + "preferences of similar customers helps define what recommendations will actually\n", + "make an impact. With others, it can be more useful to focus on the properties of\n", + "the content itself (e.g., product descriptions).\n", + "\n", + "\n", + "-----\n", + "\n", + "### Building a Direct Path to Winning the Minds and Wallets of Your Customers\n", + "\n", + "\n", + "Providing deep, effective personalized experiences to customers depends\n", + "on a brand’s ability to intelligently leverage consumer and market data from a\n", + "wide variety of sources to fuel faster, smarter decisions — without sacrificing\n", + "accuracy for speed. The Databricks Lakehouse Platform is purpose-built for\n", + "exactly that, offering a scalable data architecture that unifies all your data,\n", + "analytics and AI to deliver unforgettable customer experiences.\n", + "\n", + "Created on open source and open standards, Databricks offers a robust\n", + "and cost-effective platform for brands to collaborate with partners, clients,\n", + "manufacturers and distributors to unleash more innovation and efficiencies\n", + "at every touch point. 
Businesses can rapidly ingest available data in real time,\n", + "\n", + "\n", + "at scale, and create accessible, data-driven insights that enable actionable\n", + "strategies across the value chain.\n", + "\n", + "Databricks is a multicloud platform, designed for quick enterprise development.\n", + "Teams using the Lakehouse can more effectively reveal the 360-degree view into\n", + "their company’s operational health and the evolving needs of their customers\n", + "— all while empowering teams to easily unify data efforts, perform fine-grained\n", + "analyses and streamline cross-functional data operations using a single,\n", + "sophisticated solution.\n", + "\n", + "\n", + "###### Learn more about Databricks Lakehouse for industries\n", + " like Retail & Consumer Goods, Media & Entertainment\n", + " and more at databricks.com/solutions\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide —\n", + "\n", + "including Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n", + "\n", + "the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n", + "\n", + "is headquartered in San Francisco, with offices around the globe. Founded by\n", + "\n", + "the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n", + "\n", + "a mission to help data teams solve the world’s toughest problems. To learn more,\n", + "\n", + "follow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=563736421186&utm_term=databricks%20free%20trial&gclid=Cj0KCQjwpeaYBhDXARIsAEzItbHzQGCu2K58-lnVCepMI5MYP6jTXkgfvqmzwAMqrlVwVOniebOE43UaAk3OEALw_wcB)**\n", + "\n", + "##### Contact us for a personalized demo databricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf2024-09-19T16:57:19Z
#### eBook\n", + "\n", + "# Big Book of Retail\n", + " & Consumer Goods Use Cases\n", + "\n", + "##### Driving real-time decisions\n", + " with the Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents (1/2) C H A P T E R 1 : \u0007 Introduction 4\n", + "\n", + "**C H A P T E R 2 :** \u0007 **Modern Data Platform for Real-Time Retail** 6\n", + "\n", + "Common challenges 6\n", + "\n", + "The Lakehouse for Retail 8\n", + "\n", + "**C H A P T E R 3 :** **Use Case: Real-Time Supply Chain Data** \u0007 12\n", + "\n", + "Case Study: Gousto 14\n", + "\n", + "Case Study: ButcherBox 14\n", + "\n", + "**C H A P T E R 4 :** \u0007 **Use Case: Truck Monitoring** 15\n", + "\n", + "Case Study: Embark 16\n", + "\n", + "**C H A P T E R 5 :** **Use Case: Inventory Allocation** \u0007 17\n", + "\n", + "Case Study: H&M 19\n", + "\n", + "Case Study: Edmunds 19\n", + "\n", + "**C H A P T E R 6 :** **Use Case: Point of Sale and Clickstream** \u0007 20\n", + "\n", + "**C H A P T E R 7 :** **Use Case: On-Shelf Availability** \u0007 22\n", + "\n", + "Case Study: Reckitt 25\n", + "\n", + "**C H A P T E R 8 :** **�Use Case: Customer and Vehicle Identification** 26\n", + "\n", + "**C H A P T E R 9 :** \u0007 **Use Case: Recommendation Engines** 28\n", + "\n", + "Case Study: Wehkamp 31\n", + "\n", + "Case Study: Columbia 31\n", + "\n", + "Case Study: Pandora 31\n", + "\n", + "**C H A P T E R 1 0 :** \u0007 **Use Case: Perpetual Inventory** 32\n", + "\n", + "**C H A P T E R 1 1 :** \u0007 **Use Case: Automated Replenishments** 34\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents (2/2) C H A P T E R 1 2 : \u0007 Use Case: Fresh Food Forecasting 36\n", + "\n", + "Case Study: ButcherBox 37\n", + "\n", + "Case Study: Sam’s Club 37\n", + "\n", + "**C H A P T E R 1 3 :** \u0007 **Use Case: Propensity-to-Buy** 38\n", + "\n", + "**C H A P T E R 1 4 :** \u0007 **Use Case: Next Best Action** 41\n", + "\n", + "**C H A P T E R 1 5 :** **Customers That Innovate With Databricks Lakehouse for Retail** \u0007 43\n", + "\n", + "**C H A P T E R 1 6 :** \u0007 **Conclusion** 43\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 1:**\n", + "### Introduction\n", + "\n", + "\n", + "Retailers are increasingly being challenged to make time-sensitive decisions in their operations. Consolidating\n", + "\n", + "e-commerce orders. Optimizing distribution to ensure item availability. Routing delivery vehicles. These\n", + "\n", + "decisions happen thousands of times daily and have a significant financial impact. Retailers need real-time data\n", + "\n", + "to support these decisions, but legacy systems are limited to data that’s hours or days old.\n", + "\n", + "**When seconds matter, only the Lakehouse delivers better decisions**\n", + "\n", + "Retail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n", + "\n", + "The integration of physical and e-commerce customer experiences into an omnichannel journey has been\n", + "\n", + "happening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n", + "\n", + "purchasing patterns.\n", + "\n", + "In reaction to these industry changes, retailers have responded with significant, rapid investments — including\n", + "\n", + "stronger personalization, order fulfillment, and delivery and loyalty systems. 
While these new targeted\n", + "\n", + "capabilities have addressed the immediate need — and created expectations of making decisions in real\n", + "\n", + "time — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n", + "\n", + "Unfortunately, most legacy systems are only able to process information in hours or days.\n", + "\n", + "The delays caused by waiting for data are leading to significant risks and costs for the industry.\n", + "\n", + "**Grocers** need to consolidate order picking to achieve profitability in e-commerce, but this requires up-to-\n", + "\n", + "the-minute order data. Not having this information causes them to spend more resources on having people\n", + "\n", + "pick orders separately, at a higher operating cost.\n", + "\n", + "**Apparel retailers** must be able to present the correct available inventory on their website. This requires\n", + "\n", + "that in-store sales be immediately reflected in their online systems. Inaccurate information can lead to lost\n", + "\n", + "sales, or worse, the customer becoming unsatisfied and moving to different retailers.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Convenience fuel retailers** must collaborate with distribution centers, direct-to-store delivery distributors\n", + "\n", + "and other partners. Having delayed data can lead to out-of-stocks, costing stores thousands of dollars per\n", + "\n", + "week.\n", + "\n", + "The margin of error in retail has always been razor thin, but with a pandemic and inflationary pressures, it’s at\n", + "\n", + "zero. Reducing the error rate requires better predictions and real-time data.\n", + "\n", + "**Use Case Guide**\n", + "\n", + "In this use case guide, we show how the Databricks Lakehouse for Retail is helping leading organizations take\n", + "\n", + "**all of their data in a single lakehouse architecture, streamline their data engineering and management,**\n", + "\n", + "**make it ready for SQL and ML/AI** , and **do so very fast within their own cloud infrastructure environment**\n", + "\n", + "**based on open source and open standards** . These capabilities are all delivered at world-record-setting\n", + "\n", + "performance, while achieving a market-leading total cost of ownership.\n", + "\n", + "Databricks Lakehouse for Retail has become the industry standard for enabling retailers to drive decisions\n", + "\n", + "in real time. This use case guide also highlights common use cases across the industry, and offers additional\n", + "\n", + "resources in the form of Solution Accelerators and reference architectures to help as you embark on your own\n", + "\n", + "journey to drive better customer experiences with data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 2:**\n", + "### Modern Data Platform\n", + " for Real-Time Retail\n", + "\n", + "\n", + "Retailers continue to adapt to rapidly shifting dynamics across the omnichannel. 
In navigating these\n", + "\n", + "changes, retailers are increasingly focused on improving the real-time availability of data and insights, and\n", + "\n", + "performing advanced analytics delivered within tight business service windows.\n", + "\n", + "**Common challenges**\n", + "\n", + "In response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n", + "\n", + "in modernizing distribution centers, partnering with delivery companies, and investing in customer\n", + "\n", + "engagement systems.\n", + "\n", + "Warehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n", + "\n", + "distribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n", + "\n", + "that became accustomed to having fast, same-day, and sometimes even overnight delivery options\n", + "\n", + "during the pandemic now expect them as the norm. Retailers understand that the shipping and delivery\n", + "\n", + "experience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n", + "\n", + "## $41B Market | Retail Warehouse Automation\n", + "\n", + "Yet while retailers modernize different areas of their operations, they’re constrained by a single point of\n", + "\n", + "weakness, as they are reliant on legacy data platforms to bring together all of this data.\n", + "\n", + "Powering real-time decisions in modern retail requires real-time ingestion of data, transformation,\n", + "\n", + "governance of information, and powering business intelligence and predictive analytics all within the time\n", + "\n", + "required by retail operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Ingesting large volumes of transactional data in real time.** The biggest blocker to crucial insights\n", + "\n", + "is the ability to ingest data from transaction systems in real time. Transaction logs from point-of-sale\n", + "\n", + "systems, clickstreams, mobile applications, advertising and promotions, as well as inventory, logistics\n", + "\n", + "and other systems, are constantly streaming data. Big data sets need to be ingested, cleansed and\n", + "\n", + "aggregated and integrated with each other before they can be used. The problem? Retailers have used\n", + "\n", + "legacy data warehouses that are built around batch processing. And worse, increasing the frequency\n", + "\n", + "of how often data is processed leads to a “hockey stick” in costs. As a result of these limitations,\n", + "\n", + "merchants resort to ingesting data nightly to deal with the large volumes of data and integration with\n", + "\n", + "other data sets. The result? Accurate data to drive decisions can be delayed by days.\n", + "\n", + "**Performing fine-grained analysis at scale within tight time windows.** Retailers have accepted a\n", + "\n", + "trade-off when performing analysis. Predictions can be detailed and accurate, or they can be fast.\n", + "\n", + "Running forecasts or price models at a day, store and SKU level can improve accuracy by 10% or more,\n", + "\n", + "but doing so requires tens of millions of model calculations that need to be performed in narrow service\n", + "\n", + "windows. This is well beyond the capability of legacy data platforms. 
As a result, companies have been\n", + "\n", + "forced to accept the trade-off and live with less accurate predictions.\n", + "\n", + "**\u0007Powering real-time decisions on the front line.** Data is only useful if it drives decisions, but serving\n", + "\n", + "real-time data to thousands of employees is a daunting task. While data warehouses are capable\n", + "\n", + "of serving reports to large groups of users, they’re still limited to stale data. Most retailers limit the\n", + "\n", + "frequency of reports to daily or weekly updates and depend on the staff to use their best judgment for\n", + "\n", + "decisions that are more frequent.\n", + "\n", + "**\u0007Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is\n", + "\n", + "focused on delivering personalized experiences throughout the omnichannel. Retailers have access to\n", + "\n", + "a trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation\n", + "\n", + "struggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to\n", + "\n", + "deliver personalized experiences at scale to win in retail.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### The Lakehouse for Retail\n", + "\n", + "Databricks Lakehouse for Retail solves these core challenges. The Lakehouse unlocks the ability to unify\n", + "\n", + "all types of data — from images to structured data — in real time, provide enterprise-class management\n", + "\n", + "and governance, and then immediately turn that data into actionable insights with real-time reporting and\n", + "\n", + "predictive analytics. It does this with record-setting speed and industry-leading total cost of ownership\n", + "\n", + "(TCO) in a platform-as-a-service (PaaS) that allows customers to solve these pressing problems.\n", + "\n", + "**Any structure** **Reliable, real-time** **Capabilities for** **Data sharing**\n", + "**or frequency** **processing** **any persona** **& collaboration**\n", + "\n", + "_Semi-structured batch_\n", + "\n", + "\n", + "**All of**\n", + "**your sources**\n", + "\n", + "Competitive activity\n", + "\n", + "E-commerce\n", + "\n", + "Mobile Applications\n", + "\n", + "Video & Images\n", + "\n", + "Point of Sale\n", + "\n", + "Distribution & Logistics\n", + "\n", + "Customer & Loyalty\n", + "\n", + "Delivery & Partners\n", + "\n", + "\n", + "_Structured real-time_\n", + "\n", + "_Semi-structured real-time_\n", + "\n", + "_Unstructured batch_\n", + "\n", + "_Semi-structured real-time_\n", + "\n", + "_Structured real-time_\n", + "\n", + "_Structured batch_\n", + "\n", + "\n", + "Data Lakehouse\n", + "\n", + "Data Management and Governance\n", + "\n", + "Process, manage and query all of your data\n", + "\n", + "\n", + "Ad Hoc Data Science\n", + "\n", + "**Internal Teams**\n", + "\n", + "Production\n", + "Machine Learning\n", + "\n", + "**Customers**\n", + "\n", + "BI Reporting\n", + "& Dashboarding\n", + "\n", + "**Partners**\n", + "\n", + "Real-time Applications\n", + "\n", + "\n", + "Any Cloud\n", + "\n", + "\n", + "_Structured real-time_\n", + "\n", + "\n", + "-----\n", + "\n", + "**Reference Architecture**\n", + "\n", + "At the core of the Databricks Lakehouse for Retail is technology that enables retailers to avoid the trade-\n", + "\n", + "offs between speed and accuracy. 
Technology such as Delta Lake enables the Lakehouse — a new paradigm that combines the best elements of data warehouses and data lakes — to directly address these factors by enabling you to unify all of your data — structured and unstructured, batch and real-time — in one centrally managed and governed location. Once in the Lakehouse, e-commerce systems, reporting users, analysts, data scientists and data engineers can all leverage this information to serve models for applications and power real-time reporting, advanced analytics, large-scale forecasting models and more.

[Figure: Lakehouse for Retail reference architecture spanning edge, hybrid and cloud environments. Data is replicated from operational systems and ingested into Raw (Bronze), Clean (Silver) and Refined (Gold) Delta tables; machine learning operations (tracking, registry) and REST model serving feed applications, while real-time and batch outputs power business applications such as Power BI.]

-----

###### How it works

The Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends simplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is differentiated capabilities that help retailers win.

|  | Data in real time | Use all of your data | Robust data management | Time-sensitive machine learning | Real-time reporting |
|---|---|---|---|---|---|
| **Legacy data warehouse** | **No.** Data warehouses are batch oriented, restricting data updates to hours or days. | **No.** Data warehouses have very limited support for unstructured data. | **Limited.** EDWs support the management of structured data. | **No.** EDWs must extract data and send it to a third party for machine learning. | **No.** EDWs offer quick access to reports on old data. |
| **Data lakes (Hadoop)** | **No.** Data lakes are batch oriented. | **Yes.** Data lakes offer support for all types of data. | **No.** Data lakes lack enterprise-class data management tools. | **No.** Data lakes are able to support large analytics, but lack the ability to meet business SLAs. | **No.** Data lakes were not designed for reporting, let alone real-time reporting. |
| **Lakehouse for Retail** | **Yes.** Support for real-time streaming data. | **Yes.** Supports all types of data in a centrally managed platform. | **Yes.** Delta and Unity Catalog offer native data management and governance of all data types. | **Yes.** The Lakehouse can scale to process the most demanding predictions within business SLAs. | **Yes.** Data views can be materialized, enabling front-line employees with real-time data. |

-----

**\u0007Data in real time.** Retail operates in real time and so should your data. The Lakehouse offers support for streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce and point-of-sale data. And Delta Lake enables this world-record-leading performance while maintaining support for ACID transactions.

**\u0007Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images and a growing variety of other data sources. This data is extremely powerful in helping to improve our understanding of consumer behavior and operations. The Lakehouse for Retail enables companies to take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse architecture.

**\u0007Robust data management and governance** that companies need to protect sensitive data, but was lacking from earlier big data systems. The Lakehouse offers transactional integrity with ACID compliance, detailed data security, schema enforcement, time travel, data lineage and more. Moving to a modern data architecture does not require sacrificing enterprise maturity.

**\u0007High-performance predictive analytics.** Machine learning models, such as demand forecasting or recommendation engines, can be run in hours without compromising accuracy. The Lakehouse can scale to support tens of millions of predictions in tight windows, unlocking critical and time-sensitive analytics such as allocating inventory, optimizing load tenders and logistics, calculating item availability and out-of-stocks, and delivering highly personalized predictions.

**Value with Databricks**

By using Databricks to build and support your lakehouse, you can empower your business with even more speed, agility and cost savings. The flexibility of the Databricks Lakehouse Platform means that you can start with the use case that will have the most impact on your business. As you implement the pattern, you will find that you’re able to tackle use cases quicker and more easily than before. To get you started, this guidebook contains the use cases we most commonly see across the Retail and Consumer Goods industry.

-----

**CHAPTER 3**
### Use Case: Real-Time Supply Chain Data

**Overview**

As companies see a surge in demand from e-commerce and delivery services, and seek increasing efficiencies with plant or distribution centers, real-time data is becoming a key part of the technical roadmap.
Real-time supply chain data allows customers to deal with problems as they happen and before items are sent downstream or shipped to consumers, which is the first step in enabling a supply chain control tower.

**R E L E V A N T F O R**

Retail | Consumer Goods | Manufacturers | Distributors | Logistics | Restaurants

**Challenges**

**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is happening and when a customer can act on it

**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies have the added pressure to take immediate action on it

**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain

-----

**Value with the Databricks Lakehouse**

Databricks has enabled real-time streaming of supply chain data across a variety of customers for specific plant operations or as part of a supply chain control tower.

**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000% improvement in speed to data, with greater reliability

**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers report that they were able to move from batch to real-time at neutral costs

**\u0007Simplified architecture and maintenance** — leveraging Delta for ingestion streamlines the pattern for real-time data ingestion. Customers frequently report that the amount of code required to support streaming ingestion is 50% less than previous solutions.

**\u0007Immediate enablement of additional use cases** — customers can now prevent problems as they’re happening, predict and prevent issues, and even gain days on major changes such as production schedules between shifts

**Solution overview**

Databricks allows for both streaming and batch data sets to be ingested and made available to enable real-time supply chain use cases. Delta Lake simplifies the change data capture process while providing ACID transactions and scalable metadata handling, and unifying streaming and batch data processing. And Delta Lake supports versioning and enables rollbacks, full historical audit trails, and reproducible machine learning experiments. A minimal sketch of this ingestion pattern is shown below.

**Typical use case data sources include:**

Supply planning, procurement, manufacturing execution, warehousing, order fulfillment, shop floor/historian data, IoT sensor, transportation management

-----

**CASE STUDY**

With Databricks, Gousto was able to implement real-time visibility in their supply chain. Gousto moved from daily batch updates to near real-time streaming data, utilizing Auto Loader and Delta Lake.
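The ingestion pattern referenced here, Auto Loader feeding a Bronze Delta table with Structured Streaming, can be sketched as follows. This is a minimal illustration rather than Gousto's actual pipeline; the catalog, schema, file paths and trigger settings are placeholders to adapt to your environment.

```python
# A minimal sketch (illustrative names only): Auto Loader streams raw point-of-sale files
# into a Bronze Delta table.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

bronze_stream = (
    spark.readStream.format("cloudFiles")                  # Databricks Auto Loader source
    .option("cloudFiles.format", "json")                   # raw POS / sensor file drops
    .option("cloudFiles.schemaLocation", "/Volumes/main/retail/_schemas/pos")
    .load("/Volumes/main/retail/raw_pos/")
)

(
    bronze_stream.writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/main/retail/_checkpoints/pos_bronze")
    .trigger(availableNow=True)            # or processingTime="1 minute" for continuous runs
    .toTable("main.retail.pos_bronze")
)
```

Downstream Silver and Gold tables would typically be built as additional streaming reads against this Bronze table.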
The platform\n", + "\n", + "provided by Databricks has allowed Gousto to respond to increased demand during the coronavirus\n", + "\n", + "outbreak by providing real-time insight into performance on the factory picking lines.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "As a young e-commerce company, ButcherBox needed to act nimbly to make the most of the data from its\n", + "\n", + "hundreds of thousands of subscribers. With Databricks Lakehouse, the company could pull 18 billion rows of\n", + "\n", + "data in under three minutes.\n", + "\n", + "Now, ButcherBox has a near real-time understanding of its customers, and can also act proactively to\n", + "\n", + "address any logistical and delivery issues.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 4**\n", + "### Use Case: Truck Monitoring\n", + "\n", + "\n", + "With many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n", + "\n", + "of trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n", + "\n", + "manner. Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n", + "\n", + "delays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n", + "\n", + "\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issues\n", + "\n", + "\u0007Not having effective automation and AI increases the risk of human error, which can result in vehicular\n", + "\n", + "accidents or shipment delays\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with the Databricks Lakehouse**\n", + "\n", + "Databricks empowers companies to get real-time insights into their fleet performance, from manufacturing\n", + "\n", + "to delivery.\n", + "\n", + "**Near real-time insights** — the greater speed to data means a quicker response to issues and the\n", + "\n", + "ability to monitor driver safety more immediately\n", + "\n", + "**Ability to scale** — although consumer demands are constantly evolving, Databricks can handle fleet\n", + "\n", + "expansion without sacrificing data quality and speed\n", + "\n", + "**Optimizing with AI/ML** — implementing AI and ML models can lead to more effective route monitoring,\n", + "\n", + "proactive maintenance and reduced risk of accidents\n", + "\n", + "**Solution overview**\n", + "\n", + "Databricks enables better truck monitoring, quickly ingesting data on everything from vehicle manufacturing\n", + "\n", + "to route optimization. 
This results in a more complete and real-time view of a company’s fleet, and these\n", + "\n", + "analytics provide companies with the tools they need to scale and improve their operations.\n", + "\n", + "**Typical use case data sources include:**\n", + "\n", + "Supply planning, transportation management, manufacturing, predictive maintenance\n", + "\n", + "**CASE STUDY**\n", + "\n", + "With 94% of vehicular accidents attributed to human error, Embark used the Databricks Lakehouse Platform\n", + "\n", + "to unlock thousands of hours of recorded data from its trucks and then collaboratively analyze that data\n", + "\n", + "via dashboards. This has resulted in more efficient ML model training as Embark speeds toward fully\n", + "\n", + "autonomous trucks.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 5**\n", + "### Use Case: Inventory Allocation\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Replenishment planning is the process of determining what needs to go where. It is used by replenishment\n", + "\n", + "planning, distributors and consumer goods companies performing vendor-managed replenishment (VMR) or\n", + "\n", + "vendor-managed inventory (VMI) to make daily decisions on which product needs to be sent to which store\n", + "\n", + "and on what day.\n", + "\n", + "Replenishment is challenging for companies because it deals with rapidly changing data and the need to\n", + "\n", + "make complex decisions on that data in narrow service windows. Retailers need to stream in real-time sales\n", + "\n", + "data to signal how much of a product has been sold in order. Inaccurate sales data leads to an insufficient\n", + "\n", + "number of products being sent to stores. This results in lost sales and low customer satisfaction.\n", + "\n", + "Inventory allocation is a process that might be performed multiple times a day during peak seasons, or\n", + "\n", + "daily during slower seasons. 
Companies need the ability to scale to perform tens of millions of predictions\n", + "\n", + "multiple times a day — on demand and dynamically — during peak season without paying a premium for\n", + "\n", + "this capability throughout the year.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Restaurants\n", + "\n", + "\n", + "-----\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Customers must complete tens of millions of inventory allocation predictions within tight time windows.\n", + "\n", + "This information is used to determine which products get put on trucks and go to specific stores.\n", + "\n", + "\u0007Traditional inventory allocation rules cause trade-offs in accuracy in order to calculate all possibilities in\n", + "\n", + "the service windows\n", + "\n", + "\u0007Legacy tools have rudimentary capabilities and have limited ability to consider flavors, sizes and other\n", + "\n", + "attributes that may be more or less popular by store\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Customers are able to complete inventory allocation models within SLAs with no trade-off for accuracy.\n", + "\n", + "\u0007 **Speed —** on average, customers moving to Databricks for demand forecasting report a double-digit\n", + "\n", + "improvement in forecast accuracy\n", + "\n", + "\u0007 **Ability to scale** and perform fine-grained (day, store, item) level allocations\n", + "\n", + "\u0007 **Provide more robust allocations** by incorporating causal factors that may increase demand, or include\n", + "\n", + "information on flavors or apparel sizes for specific stores\n", + "\n", + "**Solution overview**\n", + "\n", + "The objective of inventory allocation is to quickly determine when to distribute items and where — from\n", + "\n", + "warehouses and distribution centers to stores. Inventory allocation begins by looking at the consumption\n", + "\n", + "rate of products, the available inventory and the shipping schedules, and then using this information to\n", + "\n", + "create an optimized manifest of what items should be carried on which trucks, at what point, and at what\n", + "\n", + "time. This becomes the plan for route accounting systems that arrange deliveries.\n", + "\n", + "Inventory allocation also deals with trade-offs related to scarcity of items. If an item has not been available\n", + "\n", + "in a store for a long time, that store may receive heightened priority for the item in the allocation.\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "\n", + "**Typical use case data sources include:** point of sale, digital sales, replenishment data, modeled safety\n", + "\n", + "stock, promotions data, weather\n", + "\n", + "**View our webinar covering demand forecasting with Starbucks and then read our blog about**\n", + "\n", + "**demand forecasting.**\n", + "\n", + "**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n", + "\n", + "Our most popular notebook at Databricks. 
This blog walks you through the business and technical\n", + "\n", + "challenges of performing demand forecasting and explains how we approached solving it.\n", + "\n", + "**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n", + "\n", + "Video and Q&A from our webinar with Starbucks.\n", + "\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "H&M turned to the Databricks Lakehouse Platform to simplify its infrastructure management, enable\n", + "\n", + "performant data pipelines at scale, and simplify the machine learning lifecycle. The result was a more data-\n", + "\n", + "driven organization that could better forecast operations to streamline costs and boost revenue.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Edmunds is on a mission to make car shopping an easy experience for all. With the Databricks Lakehouse\n", + "\n", + "Platform, they are able to simplify access to their disparate data sources and build ML models that make\n", + "\n", + "predictions off data streams. With real-time insights, they can ensure that the inventory of vehicle listings\n", + "\n", + "on their website is accurate and up to date, improving overall customer satisfaction.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 6**\n", + "### Use Case: Point of Sale\n", + " and Clickstream\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Disruptions in the supply chain — from reduced product supply and diminished warehouse capacity —\n", + "\n", + "coupled with rapidly shifting consumer expectations for seamless omnichannel experiences are driving\n", + "\n", + "retailers to rethink how they use data to manage their operations. Historically, point-of-sale (POS) systems\n", + "\n", + "recorded all in-store transactions, but were traditionally kept in a system that was physically in the store.\n", + "\n", + "This would result in a delay in actionable insights. And now with consumers increasingly shopping online, it’s\n", + "\n", + "crucial to not only collect and analyze that clickstream data quickly, but also unify it with POS data to get a\n", + "\n", + "complete and real-time snapshot of each customer’s shopping behavior.\n", + "\n", + "Near real-time availability of information means that retailers can continuously update their estimates of\n", + "\n", + "item availability. 
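To make that continuously updated view concrete, the sketch below unifies streaming POS and clickstream events into a single Delta table that downstream jobs can query in near real time. It is a simplified example; the table names, columns and event types are illustrative assumptions, not a reference implementation.

```python
# A minimal sketch: unify streaming POS and clickstream events into one Delta table so that
# downstream jobs can keep a near real-time view of shopper behavior. Names are illustrative.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

pos = (
    spark.readStream.table("main.retail.pos_bronze")
    .select(
        "customer_id",
        "item_id",
        F.lit("pos_sale").alias("event_type"),
        F.col("sold_at").alias("event_ts"),
    )
)

clicks = (
    spark.readStream.table("main.retail.clickstream_bronze")
    .select(
        "customer_id",
        "item_id",
        F.col("action").alias("event_type"),    # e.g. page_view, add_to_cart
        F.col("event_time").alias("event_ts"),
    )
)

(
    pos.unionByName(clicks)
    .writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/main/retail/_checkpoints/unified_events")
    .toTable("main.retail.unified_customer_events")
)
```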
No longer is the business managing operations based on their knowledge of inventory\n", + "\n", + "states as they were a day prior, but instead is taking actions based on their knowledge of inventory states as\n", + "\n", + "they are now.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "Retail E-commerce\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Retailers with legacy POS systems in their brick-and-mortar stores are working with siloed and\n", + "\n", + "incomplete sales data\n", + "\n", + "\u0007Both POS and clickstream data need to be unified and ingested in real time\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks brings POS and clickstream data together for a unified data source that leads to real-time\n", + "\n", + "insights and a clearer understanding of customer behavior.\n", + "\n", + "\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n", + "\n", + "clickstream data\n", + "\n", + "\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n", + "\n", + "customer purchasing behaviors and trends\n", + "\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 7**\n", + "### Use Case: On-Shelf Availability\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Ensuring the availability of a product on shelf is the single largest problem in retail. Retailers globally are\n", + "\n", + "missing out on nearly $1 trillion in sales because they don’t have on hand what customers want to buy in\n", + "\n", + "their stores. Shoppers encounter out-of-stock scenarios as often as one in three shopping trips. All told,\n", + "\n", + "worldwide, shoppers experience $984 billion worth of out-of-stocks, $144.9 billion in North America alone,\n", + "\n", + "according to industry research firm IHL.\n", + "\n", + "In the past, if a customer faced an out-of-stock, they would most likely select a substitute item. The cost\n", + "\n", + "of going to another store prevented switching. Today, e-commerce loyalty members, such as those who\n", + "\n", + "belong to Walmart+ and Amazon Prime, are 52% more likely than other consumers to purchase out-of-stock\n", + "\n", + "items online. It is believed that a quarter of Amazon’s retail revenue comes from customers who first tried to\n", + "\n", + "buy a product in-store. In all, an estimated $36 billion is lost to brick-and-mortar competition, and another\n", + "\n", + "$34.8 billion is lost to Amazon or another e-retailer, according to IHL.\n", + "\n", + "On-shelf availability takes on a different meaning in pure e-commerce applications. An item can be\n", + "\n", + "considered in stock when it is actually in a current customer’s basket. If another customer places the same\n", + "\n", + "item in their basket, there is the possibility that the first customer will purchase the last available item\n", + "\n", + "before the second customer. This problem is exacerbated by retailers who use stores to keep inventory. In\n", + "\n", + "these situations, customers may order an item that is picked for delivery at a much later time. 
The window\n", + "\n", + "between ordering and picking creates the probability of out-of-stocks.\n", + "\n", + "On-shelf availability predicts the depletion of inventory by item, factors in safety stock levels and\n", + "\n", + "replenishment points, and generates a signal that suggests an item may be out of stock. This information is\n", + "\n", + "used to generate alerts to retail staff, distributors, brokers and consumer goods companies. Every day, tens\n", + "\n", + "of thousands of people around the world do work that is generated by these algorithms.\n", + "\n", + "The sheer volume of data used to calculate on-shelf availability prevents most companies from analyzing\n", + "\n", + "all of their products. Companies have between midnight and 4 AM to collect all of the needed information\n", + "\n", + "and run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n", + "\n", + "the priority categories or products to analyze, which means a significant percentage of their unavailable\n", + "\n", + "products will not be proactively addressed.\n", + "\n", + "\n", + "-----\n", + "\n", + "One of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n", + "\n", + "While some retailers are investing in computer vision and robots, and others employ the use of people to\n", + "\n", + "manually survey item availability, most retailers default to a signal of determining when an item has not been\n", + "\n", + "scanned in an acceptable time.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "The biggest challenge to generating on-shelf availability alerts is time. Companies may receive their final sales\n", + "\n", + "data from the preceding day shortly after midnight. They have less than 4 hours from that point to ingest large\n", + "\n", + "volumes of t-log data and calculate probabilities of item availability. Most firms are encumbered by a data\n", + "\n", + "warehouse process that only releases data after it has been ingested and aggregates have been calculated, a\n", + "\n", + "process that can require multiple hours per night.\n", + "\n", + "For this reason, most firms make sacrifices in their analysis. They may alternate categories they analyze by\n", + "\n", + "different days, prioritize only high-impact SKUs, or run analysis at higher-level and less-accurate aggregate\n", + "\n", + "levels. Among the challenges:\n", + "\n", + "\u0007Processing large volumes of highly detailed data and running millions of models in a narrow time window\n", + "\n", + "\u0007Companies are spending hundreds of thousands of dollars annually to generate these daily alerts for a\n", + "\n", + "few categories\n", + "\n", + "\u0007Dealing with false positives and negatives in predictions\n", + "\n", + "Distributing information quickly and efficiently to internal systems and external partners\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables customers to generate on-shelf availability (OSA) predictions at scale with no\n", + "\n", + "compromises.\n", + "\n", + "**\u0007** Delta removes the data processing bottleneck. 
Delta enables retailers to stream in real time or to batch process large volumes of highly detailed and frequently changing point-of-sale transaction data.

**\u0007** Easily scale to process all OSA predictions within tight service windows using Apache Spark™

**\u0007** Manage features and localize models with additional causal data to improve accuracy with MLflow

**\u0007** Easily deploy information via streams, through API for mobile applications or partners, or to Delta for reporting

**\u0007** Enable retailers to monetize their data by directly licensing OSA alerts

**Solution overview**

Databricks enables companies to perform on-shelf availability analysis without making compromises to the breadth or quality of predictions.

It begins with Delta Lake — a nearly perfect platform for ingesting and managing t-log data. One of the biggest challenges in t-log data is the frequent changes to a transaction that can occur within a day. Delta Lake simplifies this with transaction awareness using a transaction log, and creates additional metadata for easier retrieval. Data is made available in a fraction of the time needed in data warehouse-based systems. This is why the largest retailers in the world are using Delta Lake for processing t-log data.

Once data is available, users need to generate predictions about item availability on the shelf. With its extremely performant engine and the ability to distribute computation across countless nodes, Spark provides the perfect platform for calculating out-of-stocks. Customers no longer need to run in aggregate or against a subset of data. A simplified illustration of this out-of-stock signal is shown below.

-----

**HOW TO GET STARTED**

[Solution Accelerator: On-Shelf Availability](https://www.databricks.com/solutions/accelerators/on-shelf-availability)

In this solution, we show how the Databricks Lakehouse Platform enables real-time insights to rapidly respond

And lastly, data is only useful if it drives better outcomes. Databricks can write the resulting data into Delta Lake for further reporting, or to any downstream application via APIs, feeds or other integrations. Users can feed their predictive alerts to downstream retail operations systems or even to external partners within the tightest service windows, and in enough time to drive actions on that day.

**Typical use case data sources include:** point-of-sale data, replenishment data, safety stock calculations, manual inventory data (optional), robotic or computer vision inventory data (optional)

**CASE STUDY**

Reckitt distributes its products to millions of consumers in over 60 countries, which was causing the organization to struggle with the complexity of forecasting demand, especially with large volumes of different types of data across many disjointed pipelines.
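A simplified version of the out-of-stock signal described in the solution overview, an item that has not scanned within an acceptable time, can be expressed directly over the t-log data. The sketch below flags items whose time since last sale exceeds a fixed multiple of their historical inter-sale gap; the table names and threshold are illustrative assumptions, and a production model would typically replace the fixed multiple with a fitted probability.

```python
# Minimal sketch: flag probable out-of-stocks by comparing the time since an item last scanned
# against its historical inter-sale gap. Table and column names are illustrative placeholders.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

tlog = spark.table("main.retail.pos_transactions")  # store_id, item_id, sold_at

gaps = (
    tlog.groupBy("store_id", "item_id")
    .agg(
        F.max("sold_at").alias("last_sale"),
        # crude expected gap: average seconds between sales over the history window
        ((F.max("sold_at").cast("long") - F.min("sold_at").cast("long"))
         / F.greatest(F.count("*") - F.lit(1), F.lit(1))).alias("avg_gap_seconds"),
    )
)

oos_flags = gaps.withColumn(
    "probable_oos",
    (F.current_timestamp().cast("long") - F.col("last_sale").cast("long"))
    > 3 * F.col("avg_gap_seconds"),   # 3x the usual gap with no scan -> likely out of stock
)

oos_flags.filter("probable_oos").write.mode("overwrite").saveAsTable("main.retail.osa_alerts")
```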
Thanks to the Databricks Lakehouse Platform, Reckitt now\n", + "\n", + "uses predictive analytics, product placement and business forecasting to better support neighborhood\n", + "\n", + "grocery stores.\n", + "\n", + "\n", + "to demand, drive more sales by\n", + "\n", + "ensuring stock is available on shelf, and\n", + "\n", + "scale out your forecasting models to\n", + "\n", + "accommodate any size operation.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 8**\n", + "### Use Case: Customer and Vehicle Identification\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "COVID-19 led to increased consumer demand for curbside pickup, drive-through and touchless payment\n", + "\n", + "options. Retailers that were able to implement these new services have been able to differentiate overall\n", + "\n", + "customer experiences and mitigate catastrophic hits on revenue levels.\n", + "\n", + "For retailers to create a seamless contactless experience for customers, they need real-time data to\n", + "\n", + "know when a customer has arrived and where they’re located, as well as provide updates throughout the\n", + "\n", + "pickup journey. And through the use of computer vision, they can capture that data by employing optical\n", + "\n", + "recognition on images to read vehicle license plates.\n", + "\n", + "Retailers can also use information captured from license plates to make recommendations on buying\n", + "\n", + "patterns. Looking ahead, facial recognition also has the potential to provide retailers with valuable\n", + "\n", + "information to better serve their customers in real time.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Drive-Through\n", + "Food Retailers\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "\u0007Ineffective data processing can lead to suboptimal order preparation timing\n", + "\n", + "\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status\n", + "\n", + "\n", + "-----\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks makes it possible to not only identify customers and vehicles in real time but also provide real-\n", + "\n", + "time communications throughout the entire shopping and curbside or drive-through experience.\n", + "\n", + "\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order\n", + "\n", + "preparation timing\n", + "\n", + "\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensure\n", + "\n", + "each subsequent visit is equally as or more seamless than the last\n", + "\n", + "\u0007 **Optimizing with AI/ML** — implementing AI and ML models can lead to more effective geofencing,\n", + "\n", + "vehicle identification and order prediction\n", + "\n", + "**CASE STUDY**\n", + "\n", + "**CASE STUDY**\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 9**\n", + "### Use Case: Recommendation Engines\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Customers that feel understood by a retailer are more likely to spend more per purchase, purchase more\n", + "\n", + "frequently with that retailer, and deliver higher profitability per customer. 
The way that retailers achieve this\n", + "\n", + "is by recommending products and services that align with customer needs.\n", + "\n", + "Providing an experience that makes customers feel understood helps retailers stand out from the crowd\n", + "\n", + "of mass merchants and build loyalty. This was true before COVID, but shifting consumer preferences make\n", + "\n", + "this more critical than ever for retail organizations. With research showing the cost of customer acquisition\n", + "\n", + "is as much as five times as retaining existing ones, organizations looking to succeed in the new normal must\n", + "\n", + "continue to build deeper connections with existing customers in order to retain a solid consumer base.\n", + "\n", + "There is no shortage of options and incentives for today’s consumers to rethink long-established patterns\n", + "\n", + "of spending.\n", + "\n", + "Recommendation engines are used to create personalized experiences for users across retail channels.\n", + "\n", + "These recommendations are generated based on the data collected from purchases, items interacted\n", + "\n", + "with, users’ behavior across physical and digital channels, and other data such as from customer service\n", + "\n", + "interactions and reviews. Leveraging a Customer 360 architecture that collects all user clickstream and\n", + "\n", + "behavioral data, marketers are able to create recommendations that are integrated with other business\n", + "\n", + "objectives such as highlighting items that are on promotion or product availability.\n", + "\n", + "Creating recommendations is not a monolithic activity. Recommendation engines are used to personalize\n", + "\n", + "the customer experience in every possible area of consumer engagement, from proactive notifications and\n", + "\n", + "offers, to landing page optimization, suggested products, automated shipment recommendations, cross-sell\n", + "\n", + "and upsell, and even suggestions for complementary items after the purchase.\n", + "\n", + "\n", + "-----\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "Media Telecom Financial Services\n", + "(any B2B or B2C\n", + "company)\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Recommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n", + "\n", + "but traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n", + "\n", + "recommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n", + "\n", + "irrelevant.\n", + "\n", + "**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n", + "\n", + "is improved by having recent data, and yet most systems struggle to handle the large volumes of\n", + "\n", + "information involved.\n", + "\n", + "**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n", + "\n", + "touchpoints in one place are critical to enabling this use case. 
More data, including transaction and\n", + "\n", + "clickstream data, is critical for driving accuracy and precision in messaging.\n", + "\n", + "**Processing speed.** Retailers need to be able to frequently refresh models based on constantly\n", + "\n", + "changing dynamics, and deliver real-time recommendations via APIs.\n", + "\n", + "**Automation.** This is an “always-on” use case where automation is essential for scalability and\n", + "\n", + "responsiveness based on frequent model updates.\n", + "\n", + "\n", + "-----\n", + "\n", + "Many firms choose to use recommender systems from Amazon or Google. Using these systems trains\n", + "\n", + "the general recommendation engine in a way that helps competitors improve the accuracy of their own\n", + "\n", + "recommendations.\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Recommendations are one of the most critical capabilities that a retailer maintains. This is a capability that\n", + "\n", + "retailers must own, and Databricks provides a solid platform for enabling this.\n", + "\n", + "Using Databricks as the foundation for their Customer 360 architecture to deliver omnichannel\n", + "\n", + "personalization, sample value metrics from a media agency include:\n", + "\n", + "**200% ROI for 70% of retailers** engaging in advanced personalization\n", + "\n", + "**10% improvement** in conversions\n", + "\n", + "**35% improvement** in purchase frequency\n", + "\n", + "**37% improvement** in customer lifetime value\n", + "\n", + "**Solution overview**\n", + "\n", + "Recommendations are only as good as the data that powers them. Delta Lake provides the best platform for\n", + "\n", + "capturing and managing huge volumes of highly atomic and frequently changing data. It allows organizations\n", + "\n", + "to combine various sources of data in a timely and efficient manner, from transactions, demographics and\n", + "\n", + "preference information across products, to clickstream, digital journey and marketing analytics data to bring\n", + "\n", + "a 360 view of customer interactions to enable omnichannel personalization.\n", + "\n", + "By identifying changes in user behavior or engagement, retailers are able to detect early signals that\n", + "\n", + "indicate a propensity to buy or a change in preferences, and recommend products and services that will\n", + "\n", + "keep consumers engaged.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Typical use case data sources include:** Customer 360 data, CRM, loyalty data, transaction data,\n", + "\n", + "clickstream data, mobile data:\n", + "\n", + "**Engagement data** — transaction log data, clickstream data, promotion interaction\n", + "\n", + "**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n", + "\n", + "children, location\n", + "\n", + "**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n", + "\n", + "to churn\n", + "\n", + "**CASE STUDY**\n", + "\n", + "For Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n", + "\n", + "for help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n", + "\n", + "personalized to each of their customers.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Columbia’s legacy ETL was unable to support batch and real-time use cases at scale. 
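A common starting point for the recommendation engines described above is collaborative filtering over interaction data. The sketch below uses Spark MLlib's ALS with implicit feedback; the table, columns and hyperparameters are illustrative assumptions, and it presumes integer customer and item IDs (string keys would first need to be indexed).

```python
# Minimal sketch: collaborative-filtering recommendations with Spark MLlib ALS over implicit
# interaction counts. Table and column names are illustrative placeholders.
from pyspark.sql import SparkSession, functions as F
from pyspark.ml.recommendation import ALS

spark = SparkSession.builder.getOrCreate()

interactions = (
    spark.table("main.retail.unified_customer_events")   # assumes integer customer_id / item_id
    .groupBy("customer_id", "item_id")
    .agg(F.count("*").alias("interactions"))
)

als = ALS(
    userCol="customer_id",
    itemCol="item_id",
    ratingCol="interactions",
    implicitPrefs=True,          # counts are implicit feedback, not explicit ratings
    rank=32,
    regParam=0.1,
    coldStartStrategy="drop",
)

model = als.fit(interactions)

# Top 10 item recommendations per customer, written back to Delta for downstream personalization.
recs = model.recommendForAllUsers(10)
recs.write.mode("overwrite").saveAsTable("main.retail.customer_recommendations")
```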
After migrating to\n", + "\n", + "Databricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n", + "\n", + "business decisions.\n", + "\n", + "**CASE STUDY**\n", + "\n", + "Pandora wanted to drive stronger online engagement with their customers, so they used the Databricks\n", + "\n", + "Lakehouse Platform to create more personalized experiences and boost both click-to-open rates and\n", + "\n", + "quarterly revenue.\n", + "\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Databricks has created [four](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n", + "\n", + "[Recommendation Engine accelerators,](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n", + "\n", + "with content-based and collaborative\n", + "\n", + "filter methods, and both item-\n", + "\n", + "and user-based analysis. These\n", + "\n", + "accelerators have been further refined\n", + "\n", + "to be highly performant to enable\n", + "\n", + "frequent retraining of models.\n", + "\n", + "To begin working on recommendation\n", + "\n", + "engines, contact your Databricks\n", + "\n", + "account team.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 10**\n", + "### Use Case: Perpetual Inventory\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "With the rapid adoption of digital channels for retail, staying on top of your inventory is crucial to meeting\n", + "\n", + "customer demand. As a result, the periodic inventory system is now outdated — instead, using a perpetual\n", + "\n", + "inventory model allows businesses to perform immediate and real-time tracking of sales and inventory\n", + "\n", + "levels. This has the added benefit of reducing labor costs and human error, ensuring that you always have an\n", + "\n", + "accurate overview of your inventory and can better forecast demand to avoid costly stockouts.\n", + "\n", + "The key to building a perpetual inventory system is real-time data. 
By capturing real-time transaction\n", + "\n", + "records related to sold inventory, retailers can make smarter inventory decisions that streamline operations\n", + "\n", + "and lower overall costs.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Supply Chain\n", + "\n", + "\n", + "Inventory\n", + "Management\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Companies need to scale to handle ever-increasing inventory and the data associated with the products\n", + "\n", + "**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n", + "\n", + "view of inventory\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables real-time inventory updates, giving businesses the insights they need to properly\n", + "\n", + "manage inventory and to forecast more accurately.\n", + "\n", + "**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n", + "\n", + "the latest sales data\n", + "\n", + "**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n", + "\n", + "companies know they’re getting the most accurate information at any point\n", + "\n", + "**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n", + "\n", + "management costs\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 11**\n", + "### Use Case: Automated\n", + " Replenishments\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Customers favor convenience more than ever when it comes to their goods, and automated replenishments\n", + "\n", + "help meet that need. 
Whether it’s through a connected device or smartphone app, real-time data plays a\n", + "\n", + "key role in ensuring consumers get a refill automatically delivered at the right time.\n", + "\n", + "On the manufacturing side, this real-time data can also help with vendor-managed replenishment (VMR),\n", + "\n", + "reducing the time needed to forecast, order and receive thousands of items.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Distributors Logistics Direct to\n", + "Customer\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Being able to ingest large amounts of data quickly is crucial to actually fulfilling the\n", + "\n", + "replenishment orders\n", + "\n", + "With VMR, there may be a disconnect between the vendor and customer, resulting in a forecast\n", + "\n", + "for replenishment even when the customer can’t fulfill that order\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team\n", + "\n", + "to have them perform a free proof-of-\n", + "\n", + "concept with your real-time data.\n", + "\n", + "\n", + "**Value with Databricks**\n", + "\n", + "Databricks enables real-time inventory updates, giving businesses the insights they need to properly\n", + "\n", + "manage inventory and to forecast more accurately.\n", + "\n", + "**\u0007Near real-time insights** — the greater speed to data means businesses can stay on top of\n", + "\n", + "customer needs\n", + "\n", + "**\u0007Scalability** — companies can scale with Databricks to handle thousands of SKUs, each with its own\n", + "\n", + "unique properties and expiry dates\n", + "\n", + "**\u0007Optimizing with AI/ML** — using AI and ML can lead to better forecasting and predictions\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 12**\n", + "### Use Case: Fresh Food Forecasting\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Fresh food typically accounts for up to 40% of revenue for grocers, and plays an important role in driving\n", + "\n", + "store traffic. But fresh food is also incredibly complex to manage — prices can be volatile, there is a wide\n", + "\n", + "range of suppliers to work with and the products expire, which creates significant amounts of waste.\n", + "\n", + "In order to avoid losing significant revenue, businesses need to properly forecast when food is nearing its\n", + "\n", + "sell-by date, the current levels of customer demand (also taking into account seasonality), and the proper\n", + "\n", + "timing for replenishing food stock. Being able to tap into real-time data is key to staying on top of the ever-\n", + "\n", + "changing needs around fresh food.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "Retail E-commerce Distributors Logistics Restaurants\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Because of the perishable nature of fresh food, customers need to be able to ingest data quickly\n", + "\n", + "enough to conduct daily forecasting and daily replenishment\n", + "\n", + "**\u0007** Customers are running aggregate-level forecasts, which are less accurate than fine-grained forecasting\n", + "\n", + "**\u0007** Customers are forced to compromise on what they can analyze\n", + "\n", + "\n", + "-----\n", + "\n", + "HOW TO GET STARTED\n", + "\n", + "Contact your Databricks account team to get\n", + "\n", + "started with inventory allocation. 
Databricks does not have a Solution Accelerator.

View our webinar covering demand forecasting with Starbucks and then read our blog about demand forecasting.

[Fine-grained time series forecasting at scale.](https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html)

This blog details the importance of time series forecasting, walks through building a simple model to show the use of Facebook Prophet, and then shows off the combination of Facebook Prophet and Apache Spark to scale to hundreds of models (a simplified sketch of this pattern is shown below).

[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)

Video and Q&A from our webinar with Starbucks on demand forecasting.

**Value with Databricks**

Customers average double-digit improvement in forecast accuracy, leading to a reduction in lost sales and in spoiled products, as well as lower inventory and handling costs.

**\u0007Improved accuracy** — on average, customers moving to Databricks for demand forecasting report a double-digit improvement in forecast accuracy

**\u0007Ability to scale and perform fine-grained (day, store, item) level forecasts** — rapidly scale to tens of millions of model iterations in narrow service windows. Companies need accurate demand forecasts in a few hours.

**\u0007Eliminate compromises on what to analyze** — customers do not need to select winners or losers among the products they forecast. They can predict demand for all products as frequently as required.

**Solution overview:**

Databricks is well suited to handling forecasting for fresh food at scale. Forecasting begins with the Databricks Solution Accelerator. It enables companies to rapidly build fine-grained forecasting of items — forecasting that can be efficiently scaled to tens of millions of predictions in tight service windows.

**Typical use case data sources include:** historic point-of-sale data, shipment data, promotions, pricing, expiration dates and weather.

**CASE STUDY**

ButcherBox faced the complex challenges of securing inventory with enough lead time, meeting highly variable customer order preferences and unpredictable customer sign-ups, and managing delivery logistics. With Databricks, the company was able to create a predictive solution to adapt quickly and integrate tightly with the rest of its data estate.

**CASE STUDY**

Sam’s Club needed to build out an enterprise-scale data platform to handle the billions of transactions and trillions of events going through the company.
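The fine-grained pattern described in the blog referenced above, one model per store and item trained in parallel, can be sketched with Prophet and Spark's applyInPandas. This is a simplified illustration rather than the Solution Accelerator code; the table, columns and 14-day horizon are placeholders, and it assumes the prophet package is installed on the cluster.

```python
# Minimal sketch: fit one Prophet model per (store, item) in parallel with Spark.
# Table and column names are illustrative placeholders.
import pandas as pd
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType, DoubleType
from prophet import Prophet

spark = SparkSession.builder.getOrCreate()

history = spark.table("main.retail.daily_item_sales")  # store_id, item_id, ds (date), y (units)

result_schema = StructType([
    StructField("store_id", StringType()),
    StructField("item_id", StringType()),
    StructField("ds", TimestampType()),
    StructField("yhat", DoubleType()),
])

def forecast_group(pdf: pd.DataFrame) -> pd.DataFrame:
    # One Prophet model per store/item group, forecasting 14 days ahead.
    model = Prophet(weekly_seasonality=True, yearly_seasonality=True)
    model.fit(pdf[["ds", "y"]])
    future = model.make_future_dataframe(periods=14)
    forecast = model.predict(future)[["ds", "yhat"]]
    forecast["store_id"] = pdf["store_id"].iloc[0]
    forecast["item_id"] = pdf["item_id"].iloc[0]
    return forecast[["store_id", "item_id", "ds", "yhat"]]

forecasts = (
    history.repartition("store_id", "item_id")
    .groupBy("store_id", "item_id")
    .applyInPandas(forecast_group, schema=result_schema)
)

forecasts.write.mode("overwrite").saveAsTable("main.retail.item_forecasts")
```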
Find out how Databricks became a key component in the shift\n", + "from on premises Hadoop clusters to a cloud based platform\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 13**\n", + "### Use Case: Propensity-to-Buy\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Customers often have repeatable purchase patterns that may not be noticed upon initial observation.\n", + "\n", + "While we know that commuting office workers are likely to purchase coffee at a coffee shop on weekday\n", + "\n", + "mornings, do we understand why they visit on Thursday afternoons? And more importantly, how do we\n", + "\n", + "predict these buying moments when customers are not in our stores?\n", + "\n", + "The purpose of a propensity-to-buy model is to predict when a customer is predisposed to make a\n", + "\n", + "purchase and subsequently act on that information by engaging customers. Traditional propensity-to-buy\n", + "\n", + "models leveraged internal sales and loyalty data to identify patterns of consumption. These models are\n", + "\n", + "useful, but are limited in understanding the full behavior of customers. More advanced propensity-to-buy\n", + "\n", + "models are now incorporating alternative data sets to identify trips to competing retailers, competitive scan\n", + "\n", + "data from receipts, and causal data that helps to explain when and why customers make purchases.\n", + "\n", + "Propensity-to-buy models create a signal that is sent to downstream systems such as those for promotion\n", + "\n", + "management, email and mobile alerts, recommendations and others.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "Retail E-commerce Direct to\n", + "Consumer\n", + "\n", + "\n", + "-----\n", + "\n", + "**Challenges**\n", + "\n", + "**\u0007** Customers do not want to be inundated with messages from retailers. Companies need to limit their\n", + "\n", + "outreach to customers to avoid angering them.\n", + "\n", + "Companies need to traverse and process vast sums of customer data and generate probabilities of\n", + "\n", + "purchase frequently\n", + "\n", + "Companies need to look at external data that helps build a propensity-to-buy model that captures the full\n", + "\n", + "share of the customer wallet. They need to quickly test and incorporate additional data that improves the\n", + "\n", + "accuracy of their models.\n", + "\n", + "**Value with Databricks**\n", + "\n", + "**\u0007** Databricks allows companies to efficiently traverse huge volumes of customer data over time, and\n", + "\n", + "efficiently synthesize this into data for analysis\n", + "\n", + "**\u0007** Companies need to traverse and process vast sums of customer data and generate probabilities of\n", + "\n", + "purchase frequency\n", + "\n", + "**\u0007** Companies need to look at external data that helps build a propensity-to-buy model that captures the full\n", + "\n", + "share of the customer wallet. They need to quickly test and incorporate additional data that improves the\n", + "\n", + "accuracy of their models.\n", + "\n", + "**Solution overview:**\n", + "\n", + "Propensity-to-buy analytics determine the signals that indicate the probability a customer is in a buying\n", + "\n", + "moment. Historic propensity models relied on sales data to identify buying patterns, but newer approaches are\n", + "\n", + "incorporating behavioral data. Proximity to a coffee shop might push a consumer over the threshold of a buying\n", + "\n", + "moment. Traditional, batch-oriented operations are insufficient to solve this problem. 
If you wait until that night, or even later in the day, you have lost the opportunity to act.

-----

**HOW TO GET STARTED**

To begin working on propensity-to-buy, leverage our [Propensity Scoring Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)

For propensity-to-buy, speed becomes a critical force in determining key inflection points. Databricks enables marketers to ingest data in real time and update probabilities. Lightweight queries can be automated to refresh models, and the resulting data can be fed automatically to downstream promotions, web or mobile systems, where the consumer can be engaged.

As this data is streamed into Delta Lake, data teams can quickly capture the data for broader analysis. Calculating a propensity to buy requires traversing interactions that are episodic in nature, and span broad periods of time. Delta Lake helps simplify this with scalable metadata handling, ACID transactions and data skipping. Delta Lake even manages schema evolution to provide users with flexibility as their needs evolve.

**Typical use case data sources include:** point-of-sale data with tokens, loyalty data, e-commerce sales data, mobile application data, competitive scan or receipt data (optional), place of interest data (optional)

-----

**CHAPTER 14**
### Use Case: Next Best Action

**Overview**

The e-commerce boom over the last couple of years has given consumers ample choice for digital shopping options. If your business isn’t engaging customers at every point in their purchasing journey, you risk losing them to a competitor.
By applying AI/ML to automatically determine — in real time — the next\n", + "\n", + "best action for customers, you can greatly increase your conversion rates.\n", + "\n", + "**R E L E V A N T F O R**\n", + "\n", + "\n", + "Retail Consumer\n", + "Goods\n", + "\n", + "\n", + "Direct to\n", + "Consumer\n", + "\n", + "\n", + "E-commerce\n", + "\n", + "\n", + "**Challenges**\n", + "\n", + "Siloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n", + "\n", + "resulting in suboptimal recommendations for the next best action\n", + "\n", + "Companies need to ingest large amounts of data in real time and then take action on it immediately\n", + "\n", + "Many businesses still struggle with training their ML models to properly determine the next best action\n", + "\n", + "(and self-optimize based on the results)\n", + "\n", + "\n", + "-----\n", + "\n", + "**HOW TO GET STARTED**\n", + "\n", + "To begin working on propensity-to-\n", + "\n", + "buy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n", + "\n", + "\n", + "**Value with Databricks:**\n", + "\n", + "Databricks provides all the tools needed to **process large volumes of data and find the next best**\n", + "\n", + "**action** at any given point in the customer journey\n", + "\n", + "**Near real-time insights** — the greater speed to data means businesses can react immediately to\n", + "\n", + "customer actions\n", + "\n", + "**Single source of truth** — break down data silos by unifying all of a company’s customer data (including\n", + "\n", + "basic information, transactional data, online behavior/purchase history, and more) to get a complete\n", + "\n", + "customer profile\n", + "\n", + "**Optimizing with AI/ML** — use AI to create self-optimizing ML models that are trained to find the best next\n", + "\n", + "step for customers\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER 15**\n", + "### Customers That Innovate With Databricks Lakehouse for Retail\n", + "\n", + "\n", + "Some of the top retail and consumer packaged goods companies in the world turn to Databricks Lakehouse\n", + "\n", + "for Retail to deliver real-time experiences to their customers.\n", + "\n", + "Today, data is at the core of every innovation in the retail and consumer packaged goods industry.\n", + "\n", + "Databricks Lakehouse for Retail enables companies across every sector of retail and consumer goods to\n", + "\n", + "harness the power of real-time data and analytics to solve strategic challenges and deliver more engaging\n", + "\n", + "experiences to customers.\n", + "\n", + "Get started with a free trial of Lakehouse for Retail and start building better data applications today.\n", + "\n", + "**[Start your free trial](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo at: [databricks.com/contact](http://databricks.com/contact\r)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than\n", + "\n", + "7,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune\n", + "\n", + "500 — rely on the Databricks Lakehouse Platform\n", + "\n", + "to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around\n", + "\n", + "the globe. 
Founded by the original creators of\n", + "\n", + "Apache Spark™, Delta Lake and MLflow, Databricks\n", + "\n", + "is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks\n", + "\n", + "on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf2024-09-19T16:57:21Z
**eBook**\n", + "\n",
+ "# Accelerate Digital Transformation in Insurance With Data, Analytics and AI\n", + "\n",
+ "### Real-world use cases with Databricks Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n",
+ "## Contents\n", + "\n",
+ "Introduction\n", + "\n",
+ "Three Trends Driving Transformation in Insurance\n", + "\n",
+ "The Need for Modern Data Infrastructure\n", + "\n",
+ "Common Challenges Insurers Face Using Legacy Technology\n", + "\n",
+ "Why Lakehouse for Insurance\n", + "\n",
+ "Key Use Cases for Insurance:\n", + "\n",
+ "**C L A I M S A U T O M A T I O N A N D T R A N S F O R M A T I O N**\n", + "\n",
+ "**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G**\n", + "\n",
+ "**A N O M A L Y D E T E C T I O N A N D F R A U D U L E N T C L A I M S**\n", + "\n",
+ "**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z A T I O N**\n", + "\n",
+ "Global Regulatory Impact in Insurance\n", + "\n",
+ "**I N D U S T R Y S O L U T I O N S :** Get Started With Accelerators, Brickbuilders and Enablers\n", + "\n",
+ "Get Started With Industry Solutions\n", + "\n",
+ "Conclusion\n", + "\n", + "\n", + "-----\n", + "\n",
+ "## Introduction\n", + "\n",
+ "With the rapid advancement of technology, rising consumer expectations, and strong competition between insurtechs and incumbents resulting from the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. 
Today, new\n", + "insights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\n", + "data from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\n", + "clickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.\n", + "\n", + "Consumers want stronger reassurance for what they value most: financial security and greater peace of mind.\n", + "Insurers have always prided themselves on delivering such protection and security. However, customer needs\n", + "have changed, and insurers that move most swiftly to satisfy them will be in the best position to navigate\n", + "challenging times. The bottom line is that insurers must adapt to these changes and meet the evolving needs of\n", + "their customers to remain competitive.\n", + "\n", + "Data-driven insurers will seek opportunities to improve the customer experience, develop more sophisticated\n", + "pricing models, and increase their operational resilience. More than ever, the total cost of ownership (TCO) of\n", + "digital investments and enterprise data strategy has become a top priority for boards and senior executives\n", + "in the insurance industry. So, what does this mean from a data and analytics perspective? It all comes down\n", + "to having one reliable source of truth for data, which is derived from batch and streaming data, structured and\n", + "unstructured data, from multiple clouds and jurisdictions.\n", + "\n", + "\n", + "In a regulated and risk-averse industry where data sharing was once seen as optional, it has now become\n", + "fundamental. To compete in the digital economy, insurers need an open and secure approach to data sharing.\n", + "Databricks Lakehouse for Insurance plays a critical role in helping insurance providers accelerate innovation and\n", + "transform their businesses, resulting in significant operational efficiencies and improved customer experiences\n", + "at a fraction of the cost of data warehouses. This eBook provides an in-depth exploration of key challenges\n", + "and common use cases in the insurance industry. Most importantly, you will gain insight into how Databricks\n", + "Lakehouse can unlock the true value of your data through practical Solution Accelerators and a wide range of\n", + "partners available to assist you on your journey.\n", + "\n", + "\n", + "**The future of insurance will**\n", + "\n", + "**become increasingly data-driven,**\n", + "\n", + "**and analytics enabled.”**\n", + "\n", + "**[EY’s](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)** [“Five principles for the future of protection”](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)\n", + "\n", + "\n", + "-----\n", + "\n", + "The Lakehouse reference architecture below illustrates a sample framework upon\n", + "which insurers can build. Moving from left to right in the diagram, the first layer\n", + "represents various data sources such as on-premises systems, web and mobile\n", + "applications, IoT sensors, enterprise data warehouses, and third-party APIs. Data\n", + "is then ingested through automated data pipelines, and processed within the\n", + "Lakehouse platform across three layers (Bronze, Silver and Gold). 
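\n", + "\n",
+ "To make this flow more concrete, the sketch below shows one way the Bronze and Silver steps of such a pipeline might look in a Databricks notebook. It is illustrative only: the catalog, table and volume names are hypothetical, it assumes a `spark` session is available (as in a Databricks notebook), and it uses Auto Loader (cloudFiles) for incremental ingestion.\n", + "\n",
+ "```python\n",
+ "from pyspark.sql import functions as F\n",
+ "\n",
+ "# Bronze: incrementally land raw claim files as-is with Auto Loader\n",
+ "(spark.readStream.format('cloudFiles')\n",
+ "    .option('cloudFiles.format', 'json')\n",
+ "    .option('cloudFiles.schemaLocation', '/Volumes/insurance/raw/_schemas/claims')\n",
+ "    .load('/Volumes/insurance/raw/claims/')\n",
+ "    .writeStream\n",
+ "    .option('checkpointLocation', '/Volumes/insurance/raw/_checkpoints/claims_bronze')\n",
+ "    .toTable('insurance.bronze.claims'))\n",
+ "\n",
+ "# Silver: basic cleansing and typing of the Bronze records\n",
+ "(spark.readStream.table('insurance.bronze.claims')\n",
+ "    .where(F.col('claim_id').isNotNull())\n",
+ "    .withColumn('claim_amount', F.col('claim_amount').cast('double'))\n",
+ "    .writeStream\n",
+ "    .option('checkpointLocation', '/Volumes/insurance/raw/_checkpoints/claims_silver')\n",
+ "    .toTable('insurance.silver.claims'))\n",
+ "```\n", + "\n",
+ "A Gold step would then typically aggregate the Silver table into business-level views (for example, claims by policy or by region) for reporting and analytics.\n", + "\n", + "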
These layers are responsible for data preparation, including ML model registry, centralized governance, workflow orchestration, and job scheduling. They ensure a compliant and secure infrastructure that sits atop the cloud layer (or multiple clouds), eliminating the need for data duplication. Finally, the transformed data is delivered as actionable insights and supports use cases such as automated reporting, business analytics, customer 360, and claims analytics. These use cases not only mitigate risk but also drive revenue.\n", + "\n", + "\n",
+ "**Reference architecture (diagram):** data sources (on-premises servers, web and mobile applications, collaborative data sources, Internet-of-Things (IoT) devices, enterprise data warehouses, and third-party APIs and services) flow through automated data pipelines (batch or streaming) into the Lakehouse for Financial Services, with Bronze (raw entity data), Silver (curated feature sets) and Gold (aggregated business views) layers supported by an ML model registry, centralized data governance, workflow orchestration and job scheduling, and serving automated reporting, business analytics and interactive dashboards, and productionized referenced data and models.\n", + "\n", + "\n", + "-----\n", + "\n",
+ "## Three Trends Driving Transformation in Insurance\n", + "\n",
+ "Over the next decade, technology-enabled insurance companies will bear little resemblance to today’s organizations. The following three trends are driving this transformation in the insurance industry:\n", + "\n", + "\n",
+ "**The rapid emergence of large language models and generative AI**\n", + "\n",
+ "In recent years, there has been a significant breakthrough in the field of artificial intelligence with the emergence of large language models (LLMs) and generative AI. These models, such as GPT-4 and its predecessors, Databricks Dolly and others, are built using deep learning techniques and massive amounts of training data, enabling them to generate human-like text and perform a wide range of natural language processing tasks. LLMs and generative AI can help insurance companies automate repetitive tasks such as underwriting, claims processing, and customer service, improving efficiency and reducing costs. 
They can also help insurers to better\n", + "understand customer needs and preferences,\n", + "leading to more personalized products and services.\n", + "However, as with any disruptive technology, the\n", + "adoption of LLMs and generative AI will require\n", + "careful consideration of ethical and regulatory\n", + "issues, such as data privacy and algorithmic bias.\n", + "\n", + "\n", + "**Transformed ecosystems**\n", + "**and open insurance**\n", + "\n", + "[According to EY](https://assets.ey.com/content/dam/ey-sites/ey-com/en_gl/topics/insurance/ey-2022-global-insurance-outlook-report.pdf) , leading companies leverage\n", + "insurtechs in their ecosystems to achieve high\n", + "margins in commoditized products. Open insurance,\n", + "which involves sharing and managing insurancerelated data through APIs, is more than an item in\n", + "the regulatory agenda. It can give consumers access\n", + "to better products and accurate pricing, as well as\n", + "enable them to execute transactions more easily.\n", + "In its [annual Chief Data Officer Survey](https://www.gartner.com/smarterwithgartner/data-sharing-is-a-business-necessity-to-accelerate-digital-business) , Gartner\n", + "found that organizations that promote external data\n", + "sharing have three times the measurable economic\n", + "\n", + "benefit across a variety of performance metrics\n", + "compared to their peers.\n", + "\n", + "\n", + "**Revised target operating model**\n", + "**with a focus on talent**\n", + "\n", + "Demographic shifts and perennial cost pressures\n", + "make it critical for insurers to attract and retain\n", + "talent. Consequently, it’s important for insurers\n", + "to equip their workforces with the right tools\n", + "and technologies to help them identify business\n", + "processes that can be optimized to differentiate\n", + "themselves from their competitors, with an emphasis\n", + "on moments that matter in the customer journey,\n", + "according to EY. Recent research from Deloitte\n", + "highlights the advantages of upskilling and building\n", + "a future-ready workforce. One of the benefits\n", + "\n", + "of AI adoption in the workforce is that it enables\n", + "organizations to automate a wide range of business\n", + "processes, boosting speed and efficiency. But what’s\n", + "even more important is that it enables employees to\n", + "focus on higher-value work, according to Deloitte.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Need for Modern Data Infrastructure\n", + "\n", + "**Insurers turning to cloud and data analytics**\n", + "\n", + "\n", + "The insurance industry has undergone significant changes over the years, and\n", + "one of the areas that has evolved the most is data management. With the\n", + "growing need for advanced analytics and digital transformation, many insurance\n", + "companies are turning to cloud technology and modern data infrastructures\n", + "to enhance their data management strategies. The benefits of adopting cloud\n", + "technology are numerous, particularly the ability to efficiently store and quickly\n", + "access vast amounts of data, which is crucial in a heavily regulated and datadriven industry like insurance. 
Additionally, the flexibility of the cloud enables\n", + "insurers to scale costs, adapt to changing work environments, and meet evolving\n", + "customer and business requirements.\n", + "\n", + "\n", + "dynamic pricing and underwriting, and form the foundation for claims automation.\n", + "By implementing advanced analytics, insurers can innovate more easily, scale their\n", + "businesses, and bring new products to market more quickly.\n", + "\n", + "To remain competitive, insurance companies must increase their investment in\n", + "cloud technology and data analytics, as this will accelerate insightful decisionmaking across various functions such as claims management, underwriting,\n", + "policy administration, and customer satisfaction. Overall, the adoption of cloud\n", + "technology and data analytics is imperative for insurance providers to enhance\n", + "operational efficiency, improve business processes, and stay relevant in today’s\n", + "fast-paced business landscape.\n", + "\n", + "\n", + "Furthermore, insurance providers can leverage the cloud to analyze customer\n", + "data at scale, gaining insights into behaviors that drive hyper-personalization,\n", + "\n", + "\n", + "-----\n", + "\n", + "**Let’s take a closer look look at a few examples:**\n", + "\n", + "\n", + "**Auto insurers** need to integrate new data sources, such as weather and traffic,\n", + "to build solutions capable of real-time processing. This enables them to alert\n", + "emergency services promptly and gain a better understanding of drivers’ driving\n", + "patterns. It also enables the development of sophisticated machine learningbased risk assessment, underwriting and claims models.\n", + "\n", + "**Commercial insurance** , including property, general liability, cyber insurance and\n", + "business income insurance, utilizes ML-based automation of actuarial models.\n", + "This automation facilitates underwriting, claims forecasting and dynamic pricing\n", + "for their customers. Another notable trend in recent years is the use of IoT-\n", + "\n", + "\n", + "based alerting for sensitive or valuable commodities. For example, in the case of\n", + "vaccines, IoT sensors can monitor the temperature in real time and send alerts to\n", + "the appropriate team or person if the temperature exceeds acceptable thresholds.\n", + "This is crucial as vaccines must be stored within specific temperature ranges.\n", + "\n", + "In **life insurance** , complex ML models can be employed to create a profile of\n", + "the customer’s lifestyle and, importantly, detect any changes to it. 
This deeper understanding and 360-degree view of the customer enable more customized underwriting and pricing based on the policyholder’s current health, lifestyle and eating habits.\n", + "\n", + "\n",
+ "|Type of Data Source|Typical Vendors|Claims Automation and Transformation|Dynamic Pricing and Underwriting|Anomaly Detection and Fraudulent Claims|Customer 360 and Hyper-Personalization|\n",
+ "|---|---|---|---|---|---|\n",
+ "|Policy data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork|||||\n",
+ "|Claims data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork, TransUnion|||||\n",
+ "|Real-time ingestions|Cambridge Mobile Telematics, Zendrive, Custom|||||\n",
+ "|Alternative / Supplemental data|Experian, Equifax, Verisk, IBM Weather|||||\n",
+ "|Marketing data|Salesforce, HubSpot, Google Analytics|||||\n", + "\n", + "\n",
+ "**Figure 1.** Innovating with data and analytics — high-priority business use cases made possible and key data sources from popular insurance vendors\n", + "\n", + "\n", + "-----\n", + "\n",
+ "## Common Challenges Insurers Face Using Legacy Technology\n", + "\n", + "\n",
+ "Modernization is not an easy process for insurers, and while transforming IT ecosystems is necessary to improve business outcomes, ensuring business continuity is absolutely critical. However, the volume of data insurers collect, changes in user behavior, and legacy systems that can’t handle this amount of data are forcing insurance providers to accelerate their modernization journeys.\n", + "\n",
+ "Insurance providers face several challenges when using legacy technology, including:\n", + "\n",
+ "**Legacy on-premises systems:** Legacy on-premises systems are not only expensive to maintain, but they also store large amounts of big data in silos across the business. This makes it difficult to access the data, hindering data analytics efforts and limiting executives’ ability to make informed business decisions.\n", + "\n",
+ "**Ingesting large volumes of transactional data in real time:** The inability to ingest data from transaction systems in real time is a major obstacle to obtaining critical insights. Transaction logs from operations such as policy administration, enrollment and claims constantly stream data. However, many insurance companies still rely on legacy data warehouses built around batch processing, which is not suitable for ingesting and integrating large data sets. As a result, insurers often opt to ingest data nightly, leading to delays in receiving accurate data for decision-making.\n", + "\n", + "\n",
+ "**Performing fine-grained analysis at scale within tight time frames:** Legacy technology forces insurers to make a trade-off when analyzing data for user intent. They can choose between detailed and accurate predictions or fast predictions. Running detailed forecasts can improve accuracy, but it requires performing millions of model calculations within narrow service windows, which exceeds the capability of legacy data platforms. Consequently, insurance companies have to accept less accurate predictions.\n", + "\n",
+ "**Powering real-time decisions on the front line:** Serving real-time data to thousands of workers is a complex task. While data warehouses can serve reports to large groups of users, they are limited to providing stale data. 
As a result, most\n", + "insurers only provide daily or weekly updates to reports and rely on employees’\n", + "judgment for more frequent decisions.\n", + "\n", + "**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim\n", + "to deliver personalized experiences across every channel, both digital and offline.\n", + "While insurance providers have access to vast amounts of customer data, off-theshelf tools for personalization and customer segmentation struggle to handle such\n", + "high volumes, leading to inaccurate analytics. To succeed in the insurance industry,\n", + "companies must deliver personalized experiences at scale.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks Lakehouse for Insurance addresses the key challenges faced across the\n", + "insurance value chain. The lakehouse enables the integration of various data types,\n", + "including images and structured data, in real time. It offers robust management\n", + "and governance capabilities, and rapidly transforms data into actionable insights\n", + "\n", + "\n", + "through real-time reporting and predictive analytics. This platform-as-a-service\n", + "solution delivers exceptional speed and industry-leading total cost of ownership,\n", + "providing insurers with faster insights to enhance the customer experience and\n", + "gain a competitive edge.\n", + "\n", + "\n", + "**Product**\n", + "**Development &**\n", + "**Feature Selection**\n", + "\n", + "\n", + "**Application**\n", + "**Review &**\n", + "**Submission**\n", + "\n", + "\n", + "**Policy Issue,**\n", + "**Service &**\n", + "**Administration**\n", + "\n", + "\n", + "**Sales & Lead**\n", + "**Management**\n", + "\n", + "**Hyperpersonalization/**\n", + "**life events**\n", + "\n", + "\n", + "**Underwriting**\n", + "**and Pricing**\n", + "\n", + "**UW rules**\n", + "**guidelines &**\n", + "**technical pricing**\n", + "\n", + "\n", + "**Rating Offer &**\n", + "**Endorsements**\n", + "\n", + "**Evaluate**\n", + "**rate options,**\n", + "**pricing and**\n", + "**endorsements**\n", + "\n", + "\n", + "**Claims**\n", + "\n", + "\n", + "**Coverage/** **Review policy**\n", + "**features/riders** **documents**\n", + "**(submission)**\n", + "\n", + "\n", + "**Omnichannel** **Fraud, frequency,**\n", + "**severity and**\n", + "**reserves**\n", + "\n", + "\n", + "**We continuously develop solution accelerators and enablers to accelerate the time to market.**\n", + "\n", + "\n", + "\n", + "**•** Dynamic segmentation\n", + "\n", + "**•** Personas\n", + "\n", + "**•** Hyper-personalization\n", + "\n", + "**•** Intelligent automation\n", + "\n", + "\n", + "\n", + "**•** Product architecture and\n", + "manufacturing\n", + "\n", + "**•** Configurable products\n", + "\n", + "**•** Competitor rates\n", + "\n", + "\n", + "\n", + "**•** Reflexive questionnaire\n", + "\n", + "**•** LLM assistance for\n", + "document summarization\n", + "\n", + "**•** NLP for unstructured data\n", + "\n", + "\n", + "\n", + "**•** Evaluation of risk within\n", + "appetite\n", + "\n", + "**•** Validation of UW\n", + "requirements\n", + "\n", + "**•** Straight-through\n", + "processing optimization\n", + "\n", + "**•** Risk assessment via\n", + "actuarial pricing\n", + "\n", + "**•** Triaging of risk to\n", + "underwriter SME for policy/\n", + "exposure changes\n", + "\n", + "\n", + "\n", + "**•** Predict loss cost\n", + "(frequency and severity)\n", + "\n", + "**•** Computer vision on images\n", + "to identify loss\n", + "\n", + "**•** Auto-adjudication and\n", + "triaging of claims 
to claim\n", + "adjuster\n", + "\n", + "**•** Tailor communication by\n", + "segment (e.g., email, text,\n", + "mail, or omnichannel)\n", + "\n", + "**•** Identify Fraud, Waste and\n", + "Abuse, route to ICU\n", + "\n", + "\n", + "**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs)\n", + "\n", + "\n", + "-----\n", + "\n", + "## Why Lakehouse for Insurance\n", + "\n", + "Databricks Lakehouse for Insurance combines simplicity, flexibility and reusability, enabling insurers to meet the demands of the market with speed and agility. It offers\n", + "best-in-industry performance and serves as a modern data architecture that provides differentiated capabilities for insurers to thrive in a competitive industry.\n", + "\n", + "\n", + "\n", + "**•** Insurance companies can store any type of\n", + "data using Databricks Lakehouse for Insurance,\n", + "leveraging the low-cost object storage supported\n", + "by cloud providers. This helps break down data\n", + "silos that hinder efforts to aggregate data for\n", + "advanced analytics, such as claim triaging and\n", + "fraud identification, regulatory reporting, or\n", + "compute-intensive risk workloads. Another critical\n", + "feature is the time-travel capabilities of the\n", + "lakehouse architecture, allowing insurers to access\n", + "any historical version of their data.\n", + "\n", + "\n", + "\n", + "**•** Supporting streaming use cases, such as\n", + "monitoring transaction data, is easier with the\n", + "lakehouse. It utilizes Apache Spark ™ as the data\n", + "processing engine and Delta Lake as the storage\n", + "layer. Spark enables seamless switching between\n", + "batch and streaming workloads with just a single\n", + "line of code. Delta Lake’s native support for ACID\n", + "transactions ensures reliable and high-performing\n", + "streaming workloads.\n", + "\n", + "\n", + "\n", + "**•** For both machine learning and non-machine\n", + "learning insurance models, a comprehensive\n", + "governance framework is provided. Data, code,\n", + "libraries and models are linked and independently\n", + "version controlled using technologies like Delta\n", + "Lake and MLflow. Delta Lake ensures stability by\n", + "allowing insurance companies to declare their\n", + "expectations for data quality upfront. MLflow\n", + "enables training models in any language and\n", + "deploying them anywhere, minimizing the need for\n", + "complex handoffs between data science practices,\n", + "independent validation units and operational teams.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Level-up value with Databricks Lakehouse for insurance**\n", + "\n", + "Building your data lakehouse with the Databricks Lakehouse Platform empowers your organization with the speed, agility and flexibility needed to address critical insurance\n", + "use cases that have a significant impact on your customers and your business. Additionally, it helps lower the total cost of ownership (TCO).\n", + "\n", + "With a modern and unified data architecture, the Databricks platform enables the implementation of your data, analytics and AI strategy at scale on a unified and modern\n", + "cloud data architecture. The key benefits include:\n", + "\n", + "\n", + "**1. 
Cost and complexity reduction**\n", + "\n", + "The Databricks Lakehouse provides an open, simple\n", + "and unified cloud data management architecture\n", + "that streamlines operational inefficiencies, reduces\n", + "IT infrastructure costs, and enhances productivity\n", + "across teams.\n", + "\n", + "\n", + "**2. Enhanced risk management and control**\n", + "\n", + "By unlocking the value of enterprise data, the\n", + "platform helps reduce corporate governance and\n", + "security risks. It facilitates data-driven decisionmaking through governed discovery, access and\n", + "data sharing.\n", + "\n", + "\n", + "**3. Accelerated innovation**\n", + "\n", + "The platform enables the acceleration of digital\n", + "transformation, modernization and cloud migration\n", + "initiatives, fostering new growth opportunities\n", + "and driving innovation for improved customer and\n", + "workforce experiences.\n", + "\n", + "\n", + "To help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Reference Architecture for Smart Claims**\n", + "\n", + "\n", + "**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n", + "\n", + "or incrementally through change data capture (CDC). These\n", + "\n", + "include structured and unstructured data sets like images, text,\n", + "\n", + "and video, such as IoT sensor data, operational data like claims\n", + "\n", + "and policies, and on-prem or third-party data such as from\n", + "\n", + "credit bureaus, weather, and driving records. Partner Connect\n", + "\n", + "offers a range of ingest tools from different vendors that you can\n", + "\n", + "directly use from the Databricks portal.\n", + "\n", + "\n", + "**2.** \u0007Delta Live Tables (DLT) is the preferred ETL\n", + "\n", + "path to transform the data based on business\n", + "\n", + "requirements. All the data resides in cloud storage,\n", + "\n", + "where Delta refines it into Bronze, Silver and Gold\n", + "\n", + "zones of a medallion pipeline blueprint. Databricks\n", + "\n", + "Workflows provide orchestration of the various\n", + "\n", + "dependent tasks, with advanced capabilities like\n", + "\n", + "\n", + "**3.** \u0007Databricks SQL, with Photon\n", + "\n", + "and serverless options, caters\n", + "\n", + "to BI consumption use cases to\n", + "\n", + "refresh a dashboard monitoring\n", + "\n", + "key metrics and KPIs, with\n", + "\n", + "query history and alerts on\n", + "\n", + "critical events.\n", + "\n", + "\n", + "**4.** \u0007Databricks ML Runtime,\n", + "\n", + "MLFlow, along with\n", + "\n", + "Feature Store, Auto ML,\n", + "\n", + "and real-time Model\n", + "\n", + "Serving enable ML\n", + "\n", + "use cases to provide\n", + "\n", + "\n", + "**5.** \u0007Delta Sharing provides\n", + "\n", + "a secure and governed\n", + "\n", + "way of sharing data\n", + "\n", + "internally and externally\n", + "\n", + "without copying it,\n", + "\n", + "using Unity Catalog.\n", + "\n", + "\n", + "predictive insights.\n", + "\n", + "\n", + "retry, repair and job status notifications.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Secure data sharing with Delta Lake**\n", + "\n", + "At the heart of Databricks Lakehouse for Insurance is a technology that allows insurers to overcome the trade-offs between speed and accuracy. Technologies like Delta\n", + "Lake enable the lakehouse, which combines the strengths of data warehouses and data lakes, to directly address these challenges. 
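\n", + "\n",
+ "As a rough sketch of what this looks like in practice (the table and path names below are hypothetical, and a `spark` session is assumed as in a Databricks notebook), the same governed Delta table can serve both one-off batch analysis and continuous streaming jobs, and switching between the two is largely a matter of using spark.read or spark.readStream:\n", + "\n",
+ "```python\n",
+ "# Batch: one-off analysis or backfill over the full claims history\n",
+ "claims_batch = spark.read.table('insurance.silver.claims')\n",
+ "\n",
+ "# Streaming: continuously process new records from the same table\n",
+ "claims_stream = spark.readStream.table('insurance.silver.claims')\n",
+ "\n",
+ "# Example continuous job: keep a filtered copy of high-value claims up to date\n",
+ "(claims_stream\n",
+ "    .where('claim_amount > 10000')\n",
+ "    .writeStream\n",
+ "    .option('checkpointLocation', '/Volumes/insurance/gold/_checkpoints/high_value_claims')\n",
+ "    .toTable('insurance.gold.high_value_claims'))\n",
+ "```\n", + "\n", + "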
With Delta Lake, insurance providers can\n", + "unify all their data — structured and unstructured, batch and real-time — in one centrally managed and governed location.\n", + "\n", + "Once the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information.\n", + "They can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases.\n", + "\n", + "**Business intelligence**\n", + "\n", + "**Streaming**\n", + "\n", + "**Centralized**\n", + "**governance**\n", + "\n", + "\n", + "##### Lakehouse Platform\n", + "\n", + "\n", + "**Data science / ML**\n", + "\n", + "**One copy**\n", + "**of data**\n", + "\n", + "**Data warehouse**\n", + "\n", + "**Orchestration**\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "\n", + "## Claims automation and transformation\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "Insurers are entering a new era of claims transformation, supported by evolving technological advancements\n", + "and increasing data availability. Leveraging the Databricks Lakehouse, organizations can deal with the massive\n", + "amount of structured and unstructured data coming in from different sources, in different formats, and time\n", + "frames. Every touchpoint in the claims journey — beginning even before an incident occurs — can be supported\n", + "by a combination of technology and human intervention that seamlessly expedites the process.\n", + "\n", + "**Business problem**\n", + "\n", + "Missing data, or data that is “not in good order” and needs to be corrected before processing, leads to claims\n", + "leakage and inefficient processes in triaging claims to the right resource.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Enable triaging of claims and resources by leveraging big data processing and integrated ML and AI capabilities,\n", + "including MLflow model lifecycle management.\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "**•** Decrease in annual claims payout\n", + "\n", + "**•** Increase in claim fraud detection/prevention\n", + "\n", + "**•** Improve efficiencies by 15%\n", + "\n", + "**“Applying AI as broadly, as aggressively**\n", + "\n", + "**and as enthusiastically as possible. No part**\n", + "\n", + "**of our business should be untouched by it.”**\n", + "\n", + "— \u0007Masashi Namatame, Group Chief Digital Officer,\n", + "Managing Executive Officer, Tokio Marine\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**Tokio Marine: Striving to**\n", + "**become Al-driven**\n", + "\n", + "Insurers of all types now routinely use AI\n", + "models to drive underwriting, streamline claims\n", + "processing and accelerate claims adjudication,\n", + "protect against insurance fraud, and improve\n", + "risk forecasting, for example. 
Tokio Marine —\n", + "Japan’s oldest insurance company, which has\n", + "done business since 1879 — has been applying\n", + "advanced uses of AI, particularly in its auto\n", + "insurance business, says Masashi Namatame,\n", + "Group Chief Digital Officer and Managing\n", + "Executive Officer at Tokio Marine: “To assess\n", + "collision damages, the company uses an AIbased computer vision solution to analyze\n", + "photos from accident scenes.” Comparing these\n", + "with what he describes as “thousands or even\n", + "millions” of photos of past analogous incidents,\n", + "the model produces liability assessments of the\n", + "parties involved and projects anticipated repair\n", + "costs. AI has also provided the company with\n", + "tangible benefits in online sales — especially in\n", + "personalized product recommendations and\n", + "contract writing, according to Namatame. Read\n", + "the case study in the [MIT CIO vision 2025 report](https://www.databricks.com/resources/whitepaper/mit-cio-vision-2025) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "## Dynamic pricing and underwriting\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "In modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\n", + "carriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\n", + "This involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\n", + "geolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\n", + "offers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n", + "\n", + "**Business problem**\n", + "\n", + "Actuaries are spending valuable time on low-value activities, which hampers agility and advanced analytical\n", + "capabilities in pricing and underwriting, hindering improvements in risk and pricing modeling.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "**•** Unified cloud-native platform\n", + "\n", + "**•** Scalability for ingesting IoT data from millions of trips, expanding the customer base\n", + "\n", + "**•** Reduced total cost of ownership compared to legacy Hadoop systems\n", + "\n", + "**•** Usage-based pricing, leading to lower premiums for customers and reduced risk for insurance carriers, thereby\n", + "lowering loss ratios\n", + "\n", + "**•** Enables the creation of a digitally enabled, end-to-end underwriting experience\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**American financial services**\n", + "**mutual organization**\n", + "\n", + "This organization aimed to leverage the vast\n", + "amounts of structured and unstructured data\n", + "it collected to enhance its underwriting and\n", + "decision-making processes, enabling greater\n", + "efficiency and effectiveness. However, the\n", + "company’s legacy infrastructure struggled\n", + "to scale with the increasing data volume and\n", + "processing demands, limiting its ability to\n", + "analyze the data and derive actionable insights.\n", + "\n", + "With Databricks, the insurer centralized\n", + "everything on one unified Lakehouse platform,\n", + "\n", + "supporting all operational and analytical\n", + "use cases. 
This allowed them to analyze\n", + "broader sets of data for superior underwriting\n", + "performance and create a digitally empowered,\n", + "end-to-end underwriting experience.\n", + "\n", + "\n", + "\n", + "**•** Improve competitive position\n", + "\n", + "**•** Decrease combined ratio\n", + "\n", + "**•** 15% improvement in efficiencies\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "## Anomaly detection and fraudulent claims\n", + "\n", + "**Overview**\n", + "\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**One of the largest U.S.**\n", + "**insurance companies and a**\n", + "**leading small business insurer**\n", + "\n", + "The increasing availability of data and market\n", + "competition challenge insurance providers to\n", + "offer better pricing to their customers. This\n", + "U.S.-based insurer, with hundreds of millions of\n", + "insurance records to analyze for downstream\n", + "ML, realized that their legacy batch analysis\n", + "process was slow and inaccurate, providing\n", + "limited insight for predicting the frequency\n", + "and severity of claims. With Databricks, they\n", + "were able to scale up the use of deep learning\n", + "models, resulting in more accurate pricing\n", + "predictions and increased revenue from\n", + "claims. By leveraging Databricks Lakehouse,\n", + "they harmonized data, analytics and AI at\n", + "scale, enabling accurate pricing predictions\n", + "and supporting various use cases from vehicle\n", + "telematics to actuarial modeling.\n", + "\n", + "\n", + "Fraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\n", + "American consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\n", + "in 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\n", + "change to support new channels and services, offering transactional features and facilitating payments through\n", + "digital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\n", + "consumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\n", + "models. It often involves a complex decision science process that combines a rules engine with a robust and\n", + "scalable machine learning platform.\n", + "\n", + "**Business problem**\n", + "\n", + "Insurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Modernized approaches in insurance require full digital transformation, including the adoption of usagebased pricing to reduce premiums. 
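\n", + "\n",
+ "Returning to the business problem above, one simplified way to surface suspicious activity for review is a statistical outlier score over incoming claims. The sketch below is illustrative only: the table and column names are hypothetical, and a production fraud framework would combine signals like this with a rules engine and governed ML models, as described earlier.\n", + "\n",
+ "```python\n",
+ "from pyspark.sql import functions as F\n",
+ "from pyspark.sql.window import Window\n",
+ "\n",
+ "claims = spark.read.table('insurance.silver.claims')\n",
+ "\n",
+ "# Score each claim against peers with the same coverage type\n",
+ "w = Window.partitionBy('coverage_type')\n",
+ "scored = (claims\n",
+ "    .withColumn('mean_amount', F.mean('claim_amount').over(w))\n",
+ "    .withColumn('stddev_amount', F.stddev('claim_amount').over(w))\n",
+ "    .withColumn('z_score', (F.col('claim_amount') - F.col('mean_amount')) / F.col('stddev_amount'))\n",
+ "    .withColumn('needs_review', F.abs(F.col('z_score')) > 3))\n",
+ "\n",
+ "# Persist flagged claims for investigators and downstream rules\n",
+ "(scored.where('needs_review')\n",
+ "    .write.mode('overwrite')\n",
+ "    .saveAsTable('insurance.gold.claims_review_queue'))\n",
+ "```\n", + "\n", + "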
Insurance providers now consume data from the largest mobile telematics\n", + "providers (e.g., CMT) to obtain granular sensor and trip summaries for users of online insurance applications.\n", + "This data is crucial not only for pricing but also for underwriting scenarios to mitigate risks for carriers.\n", + "\n", + "**$1 of fraud costs companies 3.36x in chargeback,**\n", + "**replacement and operational costs**\n", + "\n", + "\n", + "[Lexis Nexis](https://risk.lexisnexis.com/insights-resources/research/2020-true-cost-of-fraud-retail)\n", + "\n", + "\n", + "-----\n", + "\n", + "**K E Y U S E C A S E**\n", + "\n", + "## Customer 360 and hyper-personalization\n", + "\n", + "\n", + "**Overview**\n", + "\n", + "Winning the hearts and minds of your customers\n", + "starts with personalizing the user experience. The\n", + "ability to offer complementary products to meet\n", + "the needs of your customers lets you build deeper\n", + "relationships with them and engender their loyalty.\n", + "In addition, a better understanding of the finer\n", + "details within accounts allows you to offer them\n", + "more personalized products. To do this, you need\n", + "360-degree customer views, which requires you to\n", + "locate and consolidate all your customers’ contact\n", + "data from every digital tool that you use and house\n", + "it in one central location. With Databricks Lakehouse,\n", + "insurers can “hyper-personalize,” increase\n", + "cross-sell/upsell opportunities, enhance customer\n", + "360 and bring new products to market faster.\n", + "\n", + "**Business problem**\n", + "\n", + "The inability to reconcile customer records across\n", + "different lines of business limits real-time customer\n", + "insights necessary for upselling and cross-selling.\n", + "Siloed data makes it challenging to create accurate\n", + "and comprehensive customer profiles, resulting in\n", + "suboptimal recommendations for the next best action.\n", + "\n", + "\n", + "**Solution/value with Databricks**\n", + "\n", + "Databricks provides the tools needed to process\n", + "large volumes of data and determine the next best\n", + "action at any point in the customer journey.\n", + "\n", + "**•** Eliminates data silos by unifying all customer data,\n", + "including basic information, transactional data,\n", + "online behavior/purchase history, etc., to create\n", + "complete customer profiles\n", + "\n", + "**•** Integrated data security ensures that security\n", + "measures are incorporated at every layer of the\n", + "Databricks Lakehouse Platform\n", + "\n", + "**•** Delta improves data quality, providing a single\n", + "source of truth for real-time streams and ensuring\n", + "reliable and high-quality data for data teams\n", + "\n", + "**•** Integrated ML and AI capabilities utilize AI to\n", + "create self-optimizing ML models that determine\n", + "the next best step for each customer\n", + "\n", + "**•** MLflow model lifecycle management helps manage\n", + "the entire machine learning lifecycle reliably,\n", + "securely and at scale\n", + "\n", + "\n", + "**Business outcomes and benefits**\n", + "\n", + "**•** Use AI, ML, automation and real-time data to\n", + "gain deeper customer insights and understand\n", + "their needs\n", + "\n", + "**•** Improve competitive positioning\n", + "\n", + "**•** Enhance the customer experience\n", + "\n", + "**C U S T O M E R C A S E S T U D Y**\n", + "\n", + "**160-year-old U.S.**\n", + "**insurance company**\n", + "\n", + "This insurance provider underwent a significant\n", 
+ "digital transformation to provide a more\n", + "personalized financial services experience to\n", + "its 10,000 advisors and millions of customers\n", + "across various touchpoints. Recognizing the\n", + "importance of becoming data-driven, the\n", + "company leveraged Databricks in its client\n", + "360 platform to aggregate transactional and\n", + "behavioral data, along with core attributes,\n", + "providing business users with next-best-action\n", + "recommendations for seamless customer\n", + "engagement.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Global Regulatory Impact in Insurance\n", + "\n", + "\n", + "**Navigating global regulations**\n", + "**with technical implementation**\n", + "\n", + "Digital innovation continues to reshape the insurance sector. The pace and scale\n", + "of technological change are likely to increase due to factors such as artificial\n", + "intelligence (AI), cloud computing, and the entry of new players like insurtechs,\n", + "e-tailers, and manufacturers from outside the insurance industry.\n", + "\n", + "To succeed and thrive in today’s economic environment, insurers should prioritize\n", + "upgrading their infrastructure and technology, rather than solely focusing on\n", + "transforming operations. For example, migrating from on-premises systems to the\n", + "cloud can bring significant benefits, according to global consultancy [Deloitte](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf) [.](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf)\n", + "\n", + "As insurers upgrade their compliance processes to meet new global regulations,\n", + "such as IFRS 17 and LDTI, the impact of regulatory updates becomes more\n", + "complex for organizations operating across multiple jurisdictions. Instead of\n", + "merely responding to regulatory and industry requirements, insurance companies\n", + "should make data-focused investments that help them anticipate and meet the\n", + "expectations of distributors and policyholders.\n", + "\n", + "\n", + "**IFRS-17**\n", + "\n", + "IFRS 17 is an International Finance Reporting Standard (IFRS) for\n", + "insurance contracts. IFRS 17 aims to standardize insurance accounting\n", + "by providing consistent principles for all facets of accounting for\n", + "insurance contracts. IFRS 17 removes existing inconsistencies so\n", + "analysts, investors and others can more easily compare companies,\n", + "contracts and industries.\n", + "\n", + "**LDTI for long-duration contracts**\n", + "\n", + "The Financial Accounting Standards Board long-duration targeted\n", + "improvements (LDTI) introduced changes to the U.S. GAAP accounting\n", + "model to simplify and improve the financial reporting of long-duration\n", + "contracts, including providing financial statement users with more\n", + "timely and relevant information about those contracts.\n", + "\n", + "\n", + "It is crucial for insurers to redirect their focus toward developing advanced data\n", + "management and utilization capabilities that offer better insights and improved\n", + "performance. 
These investments serve as not only a foundation for regulatory\n", + "compliance but also a starting point for more comprehensive and proactive\n", + "transformation initiatives.\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N D U S T R Y S O L U T I O N S**\n", + "\n", + "## Get Started With Accelerators, Brickbuilders and Enablers\n", + "\n", + "Insurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n", + "\n", + "**Adoption challenges**\n", + "\n", + "\n", + "Numerous challenges hinder organizations from developing and implementing the\n", + "necessary technical solutions to enhance their operational effectiveness, increase\n", + "revenue, and stay competitive. These challenges include:\n", + "\n", + "**•** Lack of technical skills (data scientists/data engineers): Companies often\n", + "struggle to find employees proficient in Python or Scala, or individuals who\n", + "possess extensive experience in data science.\n", + "\n", + "\n", + "\n", + "**•** Business problems require in-depth data science and industry knowledge:\n", + "Businesses seek solutions tailored to address specific problems, rather than\n", + "generic technical features.\n", + "\n", + "**•** Companies seek actionable insights: Organizations prefer readily applicable\n", + "patterns that can be quickly implemented, rather than custom data science\n", + "solutions that come with potential costs and risks of implementation failure.\n", + "\n", + "\n", + "**What are accelerators/enablers?**\n", + "\n", + "\n", + "**Solution Accelerators**\n", + "\n", + "Save hours on discovery, design, development and\n", + "testing with Databricks Solution Accelerators. Our\n", + "purpose-built guides, including fully functional\n", + "notebooks and best practices, expedite results for\n", + "your most common and high-impact use cases. With\n", + "these accelerators, you can go from idea to proof of\n", + "concept (PoC) in as little as two weeks.\n", + "\n", + "\n", + "**Brickbuilders**\n", + "\n", + "Brickbuilder Solutions are data and AI solutions\n", + "designed by leading consulting companies to\n", + "address industry-specific business requirements.\n", + "Built on the Databricks Lakehouse Platform and\n", + "backed by the industry experience of these\n", + "consultancies, businesses can have confidence\n", + "in solutions tailored to their specific use cases.\n", + "Brickbuilder Solutions can be implemented at any\n", + "stage of the customer journey.\n", + "\n", + "\n", + "**Solution Enablers**\n", + "\n", + "Solution enablers consist of targeted collections\n", + "of notebooks and materials, such as webinars and\n", + "blog posts, designed to support larger solutions.\n", + "They aim to solve pain points or address specific\n", + "layers of business capabilities, such as resolving data\n", + "ingestion challenges.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Get Started With Industry Solutions\n", + "\n", + "\n", + "**Claims transformation:**\n", + "**automation and fraud prevention**\n", + "\n", + "Insurers are entering a new era of claims transformation, supported by evolving\n", + "technological advancements and growing data availability. The end-to-end claims\n", + "process, from extracting relevant information from documentation submitted\n", + "when filing a claim to triaging and routing claims and the underwriting process,\n", + "is ripe for digital transformation. 
By leveraging the Databricks Lakehouse,\n", + "organizations can handle millions of data points coming in different formats and\n", + "time frames, from various sources, at an unprecedented volume. Every touchpoint\n", + "in the claims journey, starting even before an incident occurs, will be supported by\n", + "a combination of technology and human intervention that seamlessly expedites\n", + "the process. Personalizing the claims experience by anticipating needs, providing\n", + "real-time status alerts, and reducing friction in the process increases customer\n", + "loyalty and retention.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**Accelerate underwriting through collaboration and efficient ML**\n", + "\n", + "A leading P&C insurer took full advantage of the MongoDB and Databricks\n", + "integration, leveraging both platforms to foster collaboration between their data\n", + "and developer teams. The integration provides a more natural development\n", + "experience for Spark users and exposes all of Spark’s libraries. This allows\n", + "MongoDB data to be materialized as DataFrames and data sets for analysis\n", + "using machine learning, graph, streaming and SQL APIs. The insurer also benefits\n", + "from automatic schema inference. With this integration, the insurer was able to\n", + "train and observe their ML models (MongoDB Atlas Charts) more efficiently and\n", + "incorporate them into business applications.\n", + "\n", + "As a result, crucial underwriting processes that previously took days are now executed\n", + "in seconds. In addition to the time and cost savings, the company can provide a more\n", + "immediate response to customers within its digital experience platform.\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[F R A U D D E T E C T I O N](https://notebooks.databricks.com/notebooks/FSI/geospatial_analysis/index.html#geospatial_analysis_1-0.html)**\n", + "\n", + "**Claims processing is the process whereby an insurer receives,**\n", + "\n", + "\n", + "**verifies and processes a claim report submitted by a policyholder.**\n", + "\n", + "**It accounts for** **[70% of a property insurer’s expenses](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)** **and is a**\n", + "\n", + "**criticial component of customer satisfaction with their carrier.”**\n", + "\n", + "\n", + "**[C L A I M S A U T O M AT I O N E N A B L E R](https://www.databricks.com/blog/2023/02/01/design-patterns-batch-processing-financial-services.html)**\n", + "\n", + "\n", + "[Laying the](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n", + "[Foundation for](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n", + "[Claims Automation](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n", + "\n", + "\n", + "**[C A R C L A I M S I M A G E C L A S S I F I C AT I O N](https://github.com/databricks-industry-solutions/car-classification)**\n", + "\n", + "\n", + "**Deloitte,** [”Preserving the human touch in insurance claims transformations”](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)\n", + "\n", + "**[S M A R T C L A I M S : C L A I M S A U T O M AT I O N](https://www.databricks.com/blog/2023/04/03/claims-automation-databricks-lakehouse.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Risk management:**\n", + "**dynamic pricing and underwriting**\n", + "\n", + "Modernized approaches at insurance carriers require a full 
digital transformation,\n", + "and one aspect of this transformation involves dynamic pricing and underwriting\n", + "to reduce premiums. Insurance providers are now consuming data from the largest\n", + "mobile telematics providers to obtain the most granular sensor and trip summaries\n", + "for users of online insurance applications. Not only is this data critical for pricing,\n", + "but it is also critical for underwriting scenarios to de-risk carriers. Dynamic pricing\n", + "and underwriting automate routine tasks and provide teams with alternative\n", + "data sources to empower actuarial and underwriting professionals to become\n", + "“exponential.” This allows teams to focus on key aspects of risk selection and\n", + "analysis that drive competitive advantage and market differentiation. By leveraging\n", + "personalized data points, insurers can deliver near real-time underwriting\n", + "decisions for life insurance applicants, reducing policy abandonment and costs.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**Automated extraction of medical risk factors for life insurance underwriting**\n", + "**(John Snow Labs)**\n", + "\n", + "Life insurance underwriting considers an applicant’s medical risk factors in\n", + "addition to mortality risk characteristics. These risk factors are often found\n", + "in free-text documents. New insurance-specific natural language processing\n", + "(NLP) models can automatically extract relevant medical history and risk factors\n", + "from such documents. Forward-thinking companies are embracing accelerated\n", + "underwriting, which utilizes new data along with algorithmic tools and modeling\n", + "techniques to quickly assess and group applicants without requiring bodily fluids,\n", + "physician’s notes, and so on. This joint Solution Accelerator from Databricks and\n", + "John Snow Labs simplifies the implementation of this approach, creating a faster,\n", + "more consistent, and scalable underwriting experience.\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[R I S K M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/market-risk)**\n", + "\n", + "**Risk is highly influenced by behavior, and 80% of morbidity in**\n", + "\n", + "\n", + "**healthcare risk is driven by factors such as smoking, drinking**\n", + "\n", + "**alcohol, physical activity and diet. In the case of driving,**\n", + "\n", + "**60% of fatal accidents are a result of behavior alone. 
If insurers**\n", + "\n", + "**can change customer behaviors and help them make better**\n", + "\n", + "**choices, then the risk curve shifts.”**\n", + "\n", + "\n", + "**[A C T U A R I A L W O R K B E N C H](https://github.com/koernigo/databricksActuarialWorkbench)**\n", + "\n", + "**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n", + "\n", + "**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "\n", + "\n", + "[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "\n", + "\n", + "**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Product distribution:**\n", + "**segmentation and personalization**\n", + "\n", + "The most forward-thinking and data-driven insurers are\n", + "focused on achieving personalization at scale. They are\n", + "exploring new partnerships and business models to create\n", + "integrated, value-added experiences that prioritize the\n", + "overall health and financial wellness of their customers,\n", + "rather than just their insurance needs. These insurers\n", + "are investing in new data sources, analytics platforms,\n", + "and artificial intelligence (AI)-powered decision engines\n", + "that enable them to connect producers with like-minded\n", + "customers or engage customers with enticing offers\n", + "and actionable steps based on their previous choices.\n", + "The outcome is more efficient and effective service\n", + "from producers, trusted and convenient interactions for\n", + "consumers, and increased customer engagement and\n", + "growth for insurers in an increasingly digital-oriented world.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n", + "\n", + "[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\n", + "insurance companies. It enables them to complete, unify and comprehensively capture customer profiles\n", + "using a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n", + "360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\n", + "as call center recordings. 
By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n", + "360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n", + "\n", + "With Persona 360, you can:\n", + "\n", + "**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n", + "1,695+ attributes and segments\n", + "\n", + "**•** Seamlessly connect the workflows of data scientists (via Databricks) and marketing specialists (via\n", + "Persona 360), making it easy for data experts to incorporate their findings and enabling nontechnical\n", + "users to comprehend and activate the data\n", + "\n", + "**•** Leverage tools that can increase engagement by 37% and conversion rates by 45% through\n", + "personalized campaigns\n", + "\n", + "\n", + "**Learn more:**\n", + "\n", + "\n", + "**Watch video:**\n", + "\n", + "\n", + "**[N E X T B E S T O F F E R](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n", + "\n", + "**Demand for hyper-personalized and real-time risk protection**\n", + "\n", + "\n", + "**requires broad adoption of artificial** **intelligence (AI), machine**\n", + "\n", + "**learning and digital platforms.**\n", + "\n", + "**EY,** [”Nine customer types defining the next wave of insurance”](https://www.ey.com/en_us/insurance/nine-customer-types-defining-the-next-wave-of-insurance)\n", + "\n", + "\n", + "**[C U S T O M E R L I F E T I M E VA L U E (C LT V )](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n", + "\n", + "**[C U S T O M E R S E G M E N TAT I O N](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n", + "\n", + "\n", + "[The Impact of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[Analytics and AI](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[on the Future of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "[Insurance](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n", + "\n", + "\n", + "**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n", + "\n", + "**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n", + "**by insurance provider type**\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Product distribution Personalization Given the volume of data required, the complexity of operating AI from experiments (POCs) to enterprise scale data pipelines, combined with strict data and privacy regulations on the use of customer data on cloud infrastructure, the Lakehouse has quickly emerged as the strategic platform to accelerate digital transformation.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Next best offer Customers have different needs at each stage of the buyer journey. Choose the right recommender model for your scenario to find the next best action at any given point in the customer journey.|||||\n", + "|Customer Analyzing customer lifetime value is critical to improving marketing decision-making, campaign ROI and lifetime value customer retention. 
Learn how to identify your most valuable customers with Databricks’ Customer Lifetime Value Solution Accelerator.|||||\n", + "|Churn prediction Earning loyalty and getting the largest number of customers to stick around is something that is in your best interest as well as your customers’ best interest. Develop an understanding of how a customer lifetime should progress and examine where in that lifetime journey customers are likely to churn so you can effectively manage retention and reduce your churn rate.|||||\n", + "|Customer Personalization is touted as the gold standard of customer engagement. Using sales data, campaigns segmentation and promotions systems, this solution helps you create advanced customer segments to drive better purchasing predictions based on behaviors.|||||\n", + "|Reputation Harness the Databricks Lakehouse Platform to build a risk engine that can analyze customer feedback management securely and in realtime to power an early assessment of reputation risks.|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "|Anomaly detection and fraudulent claims Anomaly Anomaly detection is the technique of identifying rare events or observations which can raise suspicions detection by being statistically different from the rest of the observations.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Fraudulent A large-scale fraud prevention system is usually a complex ecosystem made of various controls (all with claims critical SLAs), a mix of traditional rules and AI and a patchwork of technologies between proprietary on- premises systems and open source cloud technologies.|||||\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Underwriting Machine learning provides a decision support system for underwriting processes to help you improve your automation underwriting outcomes.|||||\n", + "|Actuarial You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine workbench Learning (ML) for underwriting, claims forecasting, etc.|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "|Claims transformation Anomaly detection Preempt fraud with rule-based patterns and select ML algorithms for reliable fraud detection. Use and claims fraud anomaly detection and fraud prediction to respond to bad actors rapidly.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n", + "|---|---|---|---|---|\n", + "|Car claims image By applying transfer learning on pre-trained neural networks, Databricks helps insurance companies classification kickstart their AI/computer vision journeys toward claim assessment and damage estimation.|||||\n", + "|Claims automation Insurers are entering a new era of claims transformation, supported by evolving technological advancement and growing data availability. 
You can simplify and scale your claims lifecycle with data and AI.|||||\n", + "|Medical claims Using advanced natural language processing, you can extract text from medical records and enable automation.|||||\n", + "|Guidewire claims Data ingestion enabler for distributed ledger technology that has predefined schemas and mapping to/ center data from Guidewire data format. integration|||||\n", + "\n", + "\n", + "-----\n", + "\n", + "## Conclusion\n", + "\n", + "Today, data and AI are at the center of every innovation in the insurance industry. Databricks Lakehouse for\n", + "Insurance empowers insurance providers to leverage the potential of data and analytics to address strategic\n", + "challenges, make informed decisions, mitigate risks, enhance customer experiences, and accelerate innovation.\n", + "\n", + "**Customers that innovate with Databricks Lakehouse for Insurance**\n", + "\n", + "Some of the top property and casualty, life and health insurance companies and reinsurers in the world turn\n", + "to Databricks Lakehouse to harness the power of data and analytics to solve strategic challenges and make\n", + "smarter decisions that minimize risk, deliver superior customer experiences and fast-track innovation.\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000 organizations worldwide — including\n", + "\n", + "Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco,\n", + "\n", + "with offices around the globe. Founded by the original creators of Apache Spark ™ , Delta\n", + "\n", + "Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest\n", + "\n", + "problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , LinkedIn and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "#### Begin your journey with a free trial of Databricks Lakehouse for Insurance and start developing advanced data and AI applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n", + "\n", + "###### Contact us for a personalized demo at:\n", + " dbricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf2024-09-19T16:57:21Z
```\n", + "TECHNICAL GUIDE\n", + "\n", + "```\n", + "\n", + "# Solving Common Data Challenges \n", + "\n", + "\n", + "#### Startups and Digital Native Businesses\n", + "\n", + "\n", + "-----\n", + "\n", + "### Table of Contents\n", + "\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Creating a unified data architecture for data quality, governance and efficiency\n", + "\n", + "# 03\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building effective machine learning operations\n", + "\n", + "```\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building a data architecture to support scale and performance\n", + "\n", + "# 04\n", + "SUMMARY:\n", + "\n", + "###### The Databricks Lakehouse Platform addresses these challenges\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "\n", + "This guide shares how the lakehouse architecture can increase\n", + "productivity and cost-efficiently support all your data, analytics\n", + "and AI workloads, and flexibly scale with the pace of growth\n", + "for your company. Read the entire guide or dive straight into a\n", + "specific challenge.\n", + "\n", + "With the advent of cloud infrastructure, a new generation of\n", + "startups has rapidly built and scaled their businesses. The use of\n", + "cloud infrastructure, once seen as innovative, has now become\n", + "table stakes. The differentiator for the fastest-moving startups\n", + "and digital natives now comes from the effective use of data\n", + "at scale, primarily analytics and AI. Digital natives — defined\n", + "as fast-moving, lean, and technically savvy, born-in-the-cloud\n", + "organizations — are beginning to focus on new data-driven use\n", + "cases such as real-time machine learning and personalized\n", + "customer experiences.\n", + "\n", + "To pursue these new data-intensive use cases and initiatives,\n", + "organizations must look beyond the technologies that delivered\n", + "them to this point in time. Over time, these technologies, such\n", + "as transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n", + "\n", + "This guide examines some of the biggest data challenges and\n", + "solutions for startups and for scaling digital native businesses\n", + "that have reached the point where an end-to-end modern data\n", + "platform is a smart investment. Some key considerations include:\n", + "systems that are not cost-efficient and require time-consuming\n", + "administration and engineering toil. In addition to growing\n", + "maintenance needs, data is often stored in disparate locations\n", + "and formats, with little or no governance, making real-time use\n", + "cases, analytics and AI difficult or impossible.\n", + "\n", + "\n", + "**Consolidating on a unified data platform**\n", + "As mentioned above, siloed data storage and management add administrative and\n", + "financial cost. You can benefit significantly when you unify your data in one location\n", + "with a flexible architecture that scales with your needs and delivers performance\n", + "for future success. For this, you will want an open platform that supports all your\n", + "data including batch and streaming workloads, data analytics and machine learning.\n", + "With data unification, you create a more efficient, integrated approach to ingesting,\n", + "cleaning and organizing your data. 
You also need automation to make data analysis\n", + "easier for the nontechnical users in the company. But broader data access also\n", + "means more focus on security, privacy, compliance and access control, which can\n", + "create overhead for a growing.\n", + "\n", + "**Scaling up capacity and increasing performance**\n", + "**and usability of the data solutions**\n", + "Data teams at growing digital native organizations find it time intensive and costly to\n", + "handle the growing volume and velocity of their data being ingested from multiple\n", + "sources, across multiple clouds. You now need a unified and simplified platform that\n", + "can instantly scale up capacity and deliver more computing power on demand to\n", + "free up your data teams to produce outputs more quickly. This lowers the total cost\n", + "for the overall infrastructure by eliminating redundant licensing, infrastructure and\n", + "administration costs.\n", + "\n", + "**Building effective machine learning operations**\n", + "For data teams beginning their machine learning journeys, the challenge of training\n", + "data models can increase in management complexity. Many teams with disparate\n", + "coding needs for the entire model lifecycle suffer inefficiencies from transferring\n", + "data and code across many separate services. To build and manage effective\n", + "ML operations, consider an end-to-end MLOps environment that brings all data\n", + "together in one place and incorporates managed services for experiment tracking,\n", + "model training, feature development and feature and model serving.\n", + "\n", + "\n", + "-----\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 01\n", + "\n", + "### Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "As cloud-born companies grow, data volumes rapidly increase, leading to new\n", + "challenges and use cases. Among the challenges:\n", + "\n", + "\n", + "Application stacks optimized for transaction\n", + "use cases aren’t able to handle the volume,\n", + "velocity and variety of data that modern data\n", + "teams require. For example, this leads to query\n", + "performance issues as data volume grows.\n", + "\n", + "Data silos develop as each team within an\n", + "organization chooses different ETL/ELT and\n", + "storage solutions for their needs. As the\n", + "organization grows and changes, these pipelines\n", + "and storage solutions become brittle, hard to\n", + "maintain and nearly impossible to integrate.\n", + "\n", + "\n", + "These data silos lead to discoverability,\n", + "integration and access issues, which prevent\n", + "teams from leveraging the full value of the\n", + "organization’s available data.\n", + "\n", + "Data governance is hard. Disparate ETL/ELT\n", + "and storage solutions lead to governance,\n", + "compliance, auditability and access control\n", + "challenges, which expose organizations to\n", + "tremendous risk.\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides\n", + "a unified set of tools for building, deploying,\n", + "sharing and maintaining data solutions at scale.\n", + "It integrates with cloud storage and the security\n", + "in your cloud account, manages and deploys\n", + "cloud infrastructure on your behalf. 
Your data\n", + "practitioners no longer need separate storage\n", + "systems for their data. And you don’t have to rely\n", + "on your cloud provider for security. The lakehouse\n", + "has its own robust security built into the platform.\n", + "\n", + "\n", + "For all the reasons above, the most\n", + "consistent advice from successful data\n", + "practitioners is to create a “single source\n", + "of truth” by unifying all data on a single\n", + "platform. With the Databricks Lakehouse\n", + "Platform, you can unify all your data on one\n", + "platform, reducing data infrastructure costs\n", + "and compute. You don’t need excess data\n", + "copies and you can retire expensive\n", + "legacy infrastructure.\n", + "```\n", + " 01\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: GRAMMARLY\n", + "\n", + "### Helping 30 million people and 50,000 teams communicate more effectively\n", + "\n", + "```\n", + "\n", + "While its business is based on analytics, [Grammarly](http://www.grammarly.com)\n", + "\n", + "for many years relied on a homegrown analytics\n", + "\n", + "platform to drive its AI writing assistant to\n", + "\n", + "help users improve multiple aspects of written\n", + "\n", + "communications. As teams developed their own\n", + "\n", + "requirements, data silos inevitably emerged as\n", + "\n", + "different business areas implemented analytics\n", + "\n", + "tools individually.\n", + "\n", + "“Every team decided to solve their analytics\n", + "\n", + "needs in the best way they saw fit,” said Chris\n", + "\n", + "Locklin, Engineering Manager, Data Platforms,\n", + "\n", + "at Grammarly. “That created challenges in\n", + "\n", + "consistency and knowing which data set\n", + "\n", + "was correct.”\n", + "\n", + "To better scale and improve data storage and\n", + "\n", + "query capabilities, Grammarly brought all its\n", + "\n", + "analytical data into the Databricks Lakehouse\n", + "\n", + "Platform and created a central hub for all data\n", + "\n", + "producers and consumers across the company.\n", + "\n", + "Grammarly had several goals with the lakehouse,\n", + "\n", + "including better access control, security, ingestion\n", + "\n", + "\n", + "flexibility, reducing costs and fueling collaboration. “Access control in a\n", + "\n", + "distributed file system is difficult, and it only gets more complicated as\n", + "\n", + "you ingest more data sources,” said Locklin. To manage access control,\n", + "\n", + "enable end-to-end observability and monitor data quality, Grammarly\n", + "\n", + "relies on the data lineage capabilities within Unity Catalog. “Data lineage\n", + "\n", + "allows us to effectively monitor usage of our data and ensure it upholds\n", + "\n", + "the standards we set as a data platform team,” said Locklin. “Lineage is\n", + "\n", + "the last crucial piece for access control.”\n", + "\n", + "Data analysts within Grammarly now have a consolidated interface for\n", + "\n", + "analytics, which leads to a single source of truth and confidence in the\n", + "\n", + "accuracy and availability of all data managed by the data platform team.\n", + "\n", + "Having a consistent data source across the company also resulted in\n", + "\n", + "greater speed and efficiency and reduced costs. Data practitioners\n", + "\n", + "experienced 110% faster querying at 10% of the cost to ingest compared\n", + "\n", + "to a data warehouse. 
Grammarly can now make its 5 billion daily events\n", + "\n", + "available for analytics in under 15 minutes rather than 4 hours. Migrating\n", + "\n", + "off its rigid legacy infrastructure gave Grammarly the flexibility to do\n", + "\n", + "more and the confidence that the platform will evolve with its needs.\n", + "\n", + "Grammarly is now able to sustain a flexible, scalable and highly secure\n", + "\n", + "analytics platform that helps 30 million people and 50,000 teams\n", + "\n", + "worldwide write more effectively every day.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/grammarly)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to unify the data infrastructure with Databricks\n", + "\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\n", + "is composed of two primary parts:\n", + "\n", + "- The infrastructure to deploy, configure and\n", + "manage the platform and services\n", + "\n", + "- The customer-owned infrastructure managed in\n", + "collaboration between Databricks and the customer\n", + "\n", + "\n", + "You can build a Databricks workspace by configuring\n", + "secure integrations between the Databricks platform\n", + "and your cloud account, and then Databricks deploys\n", + "temporary Apache Spark™/Photon clusters using cloud\n", + "resources in your account to process and store data\n", + "in object storage and other integrated services you\n", + "control. Here are three steps to get started with the\n", + "Databricks Lakehouse Platform:\n", + "\n", + "**Understand the architecture**\n", + "The lakehouse provides a unified architecture,\n", + "meaning that all data is stored in the same\n", + "accessible place. The diagram shows how data\n", + "comes in from sources like a customer relationship\n", + "management (CRM) system, an enterprise resource\n", + "planning (ERP) system, websites or unstructured\n", + "customer emails.\n", + "\n", + "**Optimize the storage layer**\n", + "All data is stored in cloud storage while Databricks\n", + "provides tooling to assist with ingestion, such as\n", + "Auto Loader, and we recommend [open-source](https://delta.io/)\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\n", + "Delta Lake is the optimized storage layer that provides the\n", + "foundation for storing data and tables in the\n", + "Databricks Lakehouse Platform. Having all your\n", + "data in the same optimized, open storage keeps\n", + "all your use cases in the same place, thus enabling\n", + "collaboration and removing software tool overhead.\n", + "\n", + "\n", + "The lakehouse handles all varieties of data (structured, semi-structured, unstructured),\n", + "as well as all velocities of data (streaming, batch or somewhere in the middle).\n", + "\n", + "[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Lakehouse organizes data stored with Delta Lake in cloud object\n", + "storage with familiar concepts like database, tables and views. Delta Lake extends\n", + "Parquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\n", + "scalable metadata handling. 
Delta Lake is fully compatible with Apache Spark APIs,\n", + "and was developed for tight integration with Structured Streaming, allowing you to\n", + "easily use a single copy of data for both batch and streaming operations to provide\n", + "incremental processing at scale.This model combines many of the benefits of a data\n", + "warehouse with the scalability and flexibility of a data lake.\n", + "\n", + "To learn more about the optimized storage layer that provides the foundation for\n", + "storing data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n", + "[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n", + "\n", + "The first step in unifying your data architecture is setting up how data is to be\n", + "accessed and used across the organization. We’ll discuss this as a series of steps:\n", + "\n", + "**1** Set up governance with Unity Catalog\n", + "\n", + "**2** Grant secure access to the data\n", + "\n", + "\n", + "###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n", + " – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n", + "\n", + "[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "**3** Capture audit logs\n", + "\n", + "**4** View data lineage\n", + "\n", + "**5** Set up data sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Configure unified governance**\n", + "Databricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\n", + "means that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\n", + "is secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\n", + "personas and automatically captures user-level audit logs that record access to your data.\n", + "\n", + "Data stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\n", + "languages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n", + "\n", + "To set up Unity Catalog for your organization,\n", + "you do the following:\n", + "\n", + "\n", + "**1** Configure an S3 bucket and IAM role that\n", + "Unity Catalog can use to store and access\n", + "data in your AWS account.\n", + "\n", + "**2** Create a metastore for each region in\n", + "\n", + "which your organization operates, and\n", + "attach workspaces to the metastore. 
Each\n", + "workspace will have the same view of the\n", + "data you manage in Unity Catalog.\n", + "\n", + "\n", + "**3** If you have a new account, add users,\n", + "groups and service principals to your\n", + "Databricks account.\n", + "\n", + "**4** Next, create and grant access to\n", + "\n", + "catalogs, schemas and tables.\n", + "\n", + "\n", + "For complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How Unity Catalog works\n", + "\n", + "\n", + "You will notice that the hierarchy of primary data\n", + "objects in Unity Catalog flows from metastore to table:\n", + "\n", + "**Metastore** is the top-level container for metadata.\n", + "Each metastore exposes a three-level namespace\n", + "(catalog.schema.table) that organizes your data.\n", + "\n", + "\n", + "**Metastore** **Catalog** **Schemas**\n", + "\n", + "\n", + "**Views**\n", + "\n", + "**Managed**\n", + "**Tables**\n", + "\n", + "\n", + "**Catalog** is the first layer of the object hierarchy, used\n", + "to organize your data assets.\n", + "\n", + "\n", + "**Schemas** , also known as databases, are the second\n", + "layer of the object hierarchy and contain tables and\n", + "views.\n", + "\n", + "**Table** is the lowest level in the object hierarchy, and\n", + "tables can be external (stored in external locations in\n", + "your cloud storage of choice) or managed (stored in a\n", + "storage container in your cloud storage that you create\n", + "\n", + "expressly for Databricks). You can also create readonly **Views** from tables.\n", + "\n", + "\n", + "**External**\n", + "**tables**\n", + "\n", + "The diagram below represents the file system\n", + "hierarchy of a single storage bucket:\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog uses the identities in the Databricks\n", + "account to resolve users, service principals, and groups\n", + "and to enforce permissions. To configure identities in\n", + "the account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n", + "[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\n", + "service principals, and groups when you create\n", + "[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n", + "\n", + "Unity Catalog users, service principals, and groups\n", + "must also be added to workspaces to access Unity\n", + "Catalog data in a notebook, a Databricks SQL query,\n", + "Data Explorer or a REST API command. The assignment\n", + "of users, service principals, and groups to workspaces\n", + "is called identity federation. All workspaces attached\n", + "to a Unity Catalog metastore are enabled for identity\n", + "federation.\n", + "\n", + "Securable objects in Unity Catalog are hierarchical,\n", + "meaning that granting a privilege on a catalog or schema\n", + "automatically grants the privilege to all current and\n", + "future objects within the catalog or schema. 
For more\n", + "on granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\n", + "A common scenario is to set up a schema per team\n", + "where only that team has USE SCHEMA and CREATE on\n", + "the schema. This means that any tables produced by\n", + "team members can only be shared within the team.\n", + "Data Explorer uses the privileges configured by Unity\n", + "Catalog administrators to ensure that users are only\n", + "able to see catalogs, databases, tables and views that\n", + "they have permission to query.\n", + "\n", + "\n", + "[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\n", + "many Unity Catalog features. Use Data Explorer to view\n", + "schema details, preview sample data, and see table\n", + "details and properties. Administrators can view and\n", + "change owners. Admins and data object owners can grant\n", + "and revoke permissions through this interface.\n", + "\n", + "**Set up secure access**\n", + "In Unity Catalog, data is secure by default. Initially, users\n", + "have no access to data in a metastore. Access can\n", + "be granted by either a metastore admin, the owner of\n", + "an object, or the owner of the catalog or schema that\n", + "contains the object. Securable objects in Unity Catalog\n", + "are hierarchical and privileges are inherited downward.\n", + "\n", + "Unity Catalog’s security model is based on standard ANSI\n", + "SQL and allows administrators to grant permissions in\n", + "their existing data lake using familiar syntax, at the level of\n", + "catalogs, databases (schema), tables and views. Privileges\n", + "and metastores are shared across workspaces, allowing\n", + "administrators to set secure permissions once against\n", + "\n", + "groups synced from identity providers and know that\n", + "end users only have access to the proper data in any\n", + "Databricks workspace they enter.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: BUTCHERBOX\n", + "\n", + "### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n", + "\n", + "```\n", + "\n", + "As a young e-commerce company,\n", + "\n", + "[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n", + "\n", + "customers’ needs change, which means it is\n", + "\n", + "constantly considering behavioral patterns,\n", + "\n", + "distribution center efficiency, a growing list of\n", + "\n", + "marketing and communication channels, and\n", + "\n", + "order processing systems.\n", + "\n", + "The meat and seafood subscription company\n", + "\n", + "collects data on hundreds of thousands\n", + "\n", + "of subscribers. 
It deployed the Databricks\n", + "\n", + "Lakehouse Platform to gain visibility across\n", + "\n", + "its diverse range of data systems and enable\n", + "\n", + "its analytics team to securely view and\n", + "\n", + "export data in the formats needed.\n", + "\n", + "With so much data feeding in from different\n", + "\n", + "sources — from email systems to its website\n", + "\n", + "— the data team at ButcherBox quickly\n", + "\n", + "discovered that data silos were a significant\n", + "\n", + "\n", + "“We knew we needed to migrate from our legacy data warehouse\n", + "\n", + "environment to a data analytics platform that would unify our\n", + "\n", + "data and make it easily accessible for quick analysis to improve\n", + "\n", + "supply chain operations, forecast demand and, most importantly,\n", + "\n", + "keep up with our growing customer base,” explained Jake Stone,\n", + "\n", + "Senior Manager, Business Analytics, at ButcherBox.\n", + "\n", + "The platform allows analysts to share builds and iterate on a\n", + "\n", + "project without getting into the code. Querying a table of 18\n", + "\n", + "billion rows would have been problematic with a traditional\n", + "\n", + "platform. With Databricks, ButcherBox can do it in three minutes.\n", + "\n", + "“Delta Lake provides us with a single source of truth for all of\n", + "\n", + "our data,” said Stone. “Now our data engineers are able to build\n", + "\n", + "reliable data pipelines that thread the needle on key topics such\n", + "\n", + "as inventory management, allowing us to identify in near real-\n", + "\n", + "time what our trends are so we can figure out how to effectively\n", + "\n", + "move inventory.”\n", + "\n", + "[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "problem because they blocked complete\n", + "\n", + "visibility into critical insights needed to make\n", + "\n", + "strategic and marketing decisions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Set up secure data sharing**\n", + "Databricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\n", + "to share data with other entities regardless of their\n", + "computing platforms. Delta Sharing is integrated with\n", + "Unity Catalog. Your data must be registered with Unity\n", + "Catalog to manage, govern, audit and track usage of the\n", + "shared data on the Lakehouse Platform. The primary\n", + "concepts of Delta Sharing are shares (read-only\n", + "collections of tables and table partitions to be shared)\n", + "and recipients (objects that associate an organization\n", + "with a credential or secure sharing identifier).\n", + "\n", + "As a data provider, you generate a token and share\n", + "it securely with the recipient. They use the token to\n", + "authenticate and get read access to the tables you’ve\n", + "included in the shares you’ve given them access\n", + "to. Recipients access the shared data in read-only\n", + "format. Whenever the data provider updates data\n", + "tables in their own Databricks account, the updates\n", + "appear in near real-time in the recipient’s system.\n", + "\n", + "\n", + "**Capture audit logs**\n", + "Unity Catalog captures an audit log of actions\n", + "performed against the metastore. To access audit\n", + "logs for Unity Catalog events, you must enable and\n", + "configure audit logs for your account. 
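As a minimal sketch of the provider-side Delta Sharing flow described above, the share, recipient and table names below are hypothetical; the commands assume a Unity Catalog-enabled workspace:

```python
# Hypothetical share, recipient and table names.
spark.sql("CREATE SHARE quarterly_sales")
spark.sql("ALTER SHARE quarterly_sales ADD TABLE sales.reporting.orders")

# Creating a recipient produces the credential the other party uses to authenticate.
spark.sql("CREATE RECIPIENT partner_analytics")
spark.sql("GRANT SELECT ON SHARE quarterly_sales TO RECIPIENT partner_analytics")
```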
Audit logs for\n", + "each workspace and account-level activities are\n", + "delivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n", + "[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n", + "\n", + "**View data lineage**\n", + "You can use Unity Catalog to capture runtime data\n", + "lineage across queries in any language executed on\n", + "a Databricks cluster or SQL warehouse. Lineage can\n", + "be visualized in Data Explorer in near real-time and\n", + "retrieved with the Databricks REST API. Lineage is\n", + "aggregated across all workspaces attached to Unity\n", + "Catalog and captured down to the column level, and\n", + "includes notebooks, workflows and dashboards related\n", + "to the query. To understand the requirements and how\n", + "to capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n", + "[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n", + "\n", + "\n", + "Unity Catalog Metastore\n", + "\n", + "\n", + "Catalog\n", + "\n", + "\n", + "Data providers can use Databricks audit logging to\n", + "monitor the creation and modification of shares,\n", + "and recipients can monitor recipient activity on\n", + "shares. Data recipients who use shared data in a\n", + "Databricks account can use Databricks audit logging\n", + "to understand who is accessing which data.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n", + "\n", + "- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n", + "\n", + "- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n", + "\n", + "- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n", + "\n", + "- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "\n", + "\n", + "###### Key Takeaways\n", + "\n", + "- With the Databricks Lakehouse Platform, you can\n", + "unify and simplify all your data on one platform\n", + "to better scale and improve data storage and\n", + "query capabilities\n", + "\n", + "- The lakehouse helps reduce data infrastructure\n", + "and compute costs. 
You don’t need excess\n", + "data copies and can retire expensive legacy\n", + "infrastructure.\n", + "\n", + "\n", + "Leverage Delta Lake as the open format\n", + "storage layer to deliver reliability, security and\n", + "performance on your data lake — for both\n", + "streaming and batch operations — replacing\n", + "data silos with a single home for structured,\n", + "semi-structured and unstructured data\n", + "\n", + "With Unity Catalog you can centralize\n", + "governance for all data and AI assets including\n", + "files, tables, machine learning models and\n", + "dashboards in your lakehouse on any cloud\n", + "\n", + "The Databricks Lakehouse Platform is open\n", + "source with multicloud flexibility so that you can\n", + "use your data however and wherever you want —\n", + "no vendor lock-in\n", + "\n", + "\n", + "-----\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 02\n", + "\n", + "### Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "As modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\n", + "the increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\n", + "suddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\n", + "cost-effective. The relational databases and traditional data warehouses that met the needs of the businesses once\n", + "upon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n", + "\n", + "Here are some common challenges around managing data and performance at scale:\n", + "\n", + "\n", + "**Volume and velocity** — Exponentially\n", + "increasing data sources, and the speed at\n", + "which they capture and create data.\n", + "\n", + "**Latency requirements** — The demands of\n", + "downstream applications and users have\n", + "evolved (people want data and the results\n", + "from the data faster).\n", + "\n", + "\n", + "**Governance** — Cataloging, auditing, securing and\n", + "reporting on data is burdensome at scale when\n", + "using old systems not built with data access\n", + "controls and compliance in mind.\n", + "\n", + "**Multicloud** is really hard.\n", + "\n", + "\n", + "**Data storage** — Storing data in the wrong\n", + "format is slow to access, query and is\n", + "expensive at scale.\n", + "\n", + "\n", + "**Data format** — Supporting structured, semistructured and unstructured data formats\n", + "is now a requirement. Most data storage\n", + "solutions are designed to handle only one type\n", + "of data, requiring multiple products\n", + "to be stitched together.\n", + "\n", + "```\n", + "02\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Lakehouse solves scale and performance challenges\n", + "\n", + "\n", + "The solution for growing digital companies is a unified\n", + "and simplified platform that can instantly scale up\n", + "capacity to deliver more computing power on demand,\n", + "freeing up teams to go after the much-needed data\n", + "and produce outputs more quickly. With a lakehouse,\n", + "they can replace their data silos with a single home for\n", + "their structured, semi-structured and unstructured\n", + "data. 
Users and applications throughout the enterprise\n", + "environment can connect to the same single copy of\n", + "the data to drive diverse workloads.\n", + "\n", + "The lakehouse architecture is cost-efficient for\n", + "scaling, lowering the total cost of ownership for the\n", + "overall infrastructure by consolidating all data estate\n", + "and use cases onto a single platform and eliminating\n", + "redundant licensing, infrastructure and administration\n", + "costs. Unlike other warehouse options that can only\n", + "scale horizontally, the Databricks Lakehouse can scale\n", + "horizontally and vertically based on workload demands.\n", + "\n", + "With the Databricks Lakehouse, you can optimize the\n", + "compute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\n", + "research by the Barcelona Supercomputing Center.\n", + "And your data teams are more productive by focusing\n", + "on more strategic initiatives versus managing multiple\n", + "data solutions.\n", + "\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "### Driving into the future of electric transportation\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "```\n", + "\n", + "With more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n", + "\n", + "day, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n", + "\n", + "legacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n", + "\n", + "Before Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n", + "\n", + "decreased output, prevented collaboration and increased operational costs. Rivian chose to modernize its data\n", + "\n", + "infrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n", + "\n", + "downstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n", + "\n", + "actionable insights for different use cases, from predictive maintenance to smarter product development.\n", + "\n", + "“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n", + "\n", + "performant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n", + "\n", + "Wassym Bensaid, Vice President of Software Development at Rivian.\n", + "\n", + "For instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n", + "\n", + "accelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n", + "\n", + "roll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n", + "\n", + "connected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n", + "\n", + "smart features and the control that drivers have over them. 
By leveraging the Databricks Lakehouse Platform, Rivian\n", + "\n", + "has seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/rivian)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to ensure scalability and performance with Databricks\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\n", + "scalability and performance for your data architecture\n", + "based on the following features and capabilities:\n", + "\n", + "- A simplified and cost-efficient architecture that\n", + "increases productivity\n", + "\n", + "- A platform that ensures reliable, high performing\n", + "ETL workloads — for streaming and batch data\n", + "— while Databricks automatically manages your\n", + "infrastructure\n", + "\n", + "- The ability to ingest, transform and query all your\n", + "data in one place, and scale on demand with\n", + "serverless compute\n", + "\n", + "- Enables real-time data access for all data,\n", + "analytics and AI use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "The following section will provide a short series of\n", + "steps for understanding the key components of the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "\n", + "**Step 2**\n", + "**Understand the common Delta Lake operations**\n", + "The Databricks Lakehouse Platform simplifies the\n", + "entire data lifecycle, from data ingestion to monitoring\n", + "and governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\n", + "open-source storage system based on the Delta\n", + "format providing reliability through ACID transactions\n", + "and scalable metadata handling. Large quantities of\n", + "raw files in blob storage can be converted to Delta to\n", + "organize and store the data cheaply. This allows for\n", + "flexibility of data movement while being performant\n", + "and less expensive.\n", + "\n", + "\n", + "**Step 1**\n", + "**Get a trial Databricks account**\n", + "Start your 14-day free trial with Databricks on\n", + "AWS in a few easy steps.\n", + "[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . During the 14day free trial, all Databricks usage is free, but Databricks\n", + "uses compute and S3 storage resources in your cloud\n", + "provider account.\n", + "\n", + "\n", + "and writing data can occur simultaneously without risk\n", + "of many queries resulting in performance degradation\n", + "or deadlock for business-critical workloads.\n", + "\n", + "This means that users and applications throughout\n", + "the enterprise environment can connect to the same\n", + "single copy of the data to drive diverse workloads, with\n", + "all viewers guaranteed to receive the most current\n", + "version of the data at the time their query executes.\n", + "With performance features like indexing, Delta Lake\n", + "customers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n", + "[up to 48x faster.](https://www.databricks.com/customers/columbia)\n", + "\n", + "\n", + "[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\n", + "and learn how to create, manage and query tables.\n", + "With support for ACID transactions and schema\n", + "enforcement, Delta Lake provides the reliability that\n", + "traditional data lakes lack. 
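As a starting point for Step 2, here is a minimal sketch of those common operations in PySpark. The table and column names are hypothetical, and `spark` is the session a Databricks notebook provides:

```python
# Hypothetical table and column names; Delta is the default table format on Databricks.
from pyspark.sql import functions as F

events = spark.range(1_000).withColumn("event_date", F.current_date())

# Create (or overwrite) a managed Delta table.
events.write.format("delta").mode("overwrite").saveAsTable("demo.bronze.events")

# Batch query the table.
spark.table("demo.bronze.events").groupBy("event_date").count().show()

# Read the same copy of the data incrementally with Structured Streaming.
events_stream = spark.readStream.table("demo.bronze.events")
```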
This enables you to scale\n", + "reliable data insights throughout the organization and\n", + "run analytics and other data projects directly on your\n", + "data lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n", + "\n", + "Delta Lake transactions use log files stored alongside\n", + "data files to provide ACID guarantees at a table level.\n", + "Because the data and log files backing Delta Lake\n", + "tables live together in cloud object storage, reading\n", + "\n", + "\n", + "-----\n", + "\n", + "All data in Delta Lake is stored in open Apache Parquet\n", + "format, allowing data to be read by any compatible\n", + "reader. APIs are open and compatible with Apache\n", + "Spark, so you have access to a vast open-source\n", + "ecosystem to avoid data lock-in from proprietary\n", + "formats and conversions, which have embedded and\n", + "added costs.\n", + "\n", + "###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n", + "\n", + " — Steve Pulec, Chief Technology Officer, YipitData\n", + "\n", + "[Learn more](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 3**\n", + "**Ingest data efficiently at scale**\n", + "With a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\n", + "from hundreds of data sources for analytics, AI and\n", + "streaming applications into one place.\n", + "\n", + "Databricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\n", + "data ingestion. To ingest any file that can land in a data\n", + "lake, Auto Loader incrementally and automatically\n", + "processes new data files as they arrive in cloud storage\n", + "in scheduled or continuous jobs. Auto Loader scales to\n", + "support near real-time ingestion of millions of files\n", + "per hour.\n", + "\n", + "For pushing data in Delta Lake, the SQL command\n", + "[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\n", + "into Delta Lake. COPY INTO is best used when the input\n", + "directory contains thousands of files or fewer, and the\n", + "user prefers SQL. COPY INTO can be used over JDBC\n", + "to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "**Step 4**\n", + "**Leverage production-ready tools**\n", + "**to automate ETL pipelines**\n", + "Once the raw data is ingested, Databricks provides\n", + "a suite of production-ready tools that allow data\n", + "professionals to quickly develop and deploy extract,\n", + "\n", + "transform and load (ETL) pipelines. 
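For illustration, a minimal sketch of the two ingestion paths from Step 3 follows. The storage paths and table names are hypothetical, and the `availableNow` trigger assumes a recent Databricks Runtime:

```python
# 1) Auto Loader: incrementally pick up new files as they land in cloud storage.
orders_raw = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/tmp/schemas/orders")
    .load("s3://example-bucket/raw/orders/")  # hypothetical source path
)

(
    orders_raw.writeStream
    .option("checkpointLocation", "/tmp/checkpoints/orders")
    .trigger(availableNow=True)  # process everything available, then stop
    .toTable("demo.bronze.orders")
)

# 2) COPY INTO: batch-style, idempotent file ingestion driven from SQL
#    (the target table must already exist).
spark.sql(
    "COPY INTO demo.bronze.orders "
    "FROM 's3://example-bucket/raw/orders/' "
    "FILEFORMAT = JSON"
)
```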
Databricks SQL\n", + "allows analysts to run SQL queries against the same\n", + "tables used in production ETL workloads, allowing for\n", + "real-time business intelligence at scale.\n", + "\n", + "With your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "[your first extract, transform and load (ETL) pipelines](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "for data orchestration and learn how easy it is to create\n", + "a cluster, create a Databricks notebook, configure\n", + "[Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for ingestion into [Delta Lake](https://docs.databricks.com/delta/index.html) , process and\n", + "interact with the data, and schedule a job.\n", + "\n", + "\n", + "Databricks supports workloads in SQL, Python, Scala\n", + "and R, allowing users with diverse skill sets and\n", + "technical backgrounds to leverage their knowledge\n", + "to derive analytic insights. You can use all languages\n", + "supported by Databricks to define production jobs, and\n", + "notebooks can leverage a combination of languages.\n", + "\n", + "This means that you can promote queries written by\n", + "SQL analysts for last-mile ETL into production data\n", + "engineering code with almost no effort. Queries and\n", + "workloads defined by personas across the organization\n", + "leverage the same data sets, so there’s no need to\n", + "reconcile field names or make sure dashboards are up\n", + "to date before sharing code and results with\n", + "other teams.\n", + "\n", + "\n", + "-----\n", + "\n", + "With [Delta Live Tables](https://www.databricks.com/product/delta-live-tables) (DLT), data professionals have\n", + "a framework that uses a simple declarative approach\n", + "to build ETL and ML pipelines on batch or streaming\n", + "data while automating operational complexities such\n", + "as infrastructure management, task orchestration,\n", + "error handling and recovery, retries, and performance\n", + "optimization.\n", + "\n", + "Delta Live Tables extends functionality in Apache Spark\n", + "Structured Streaming and allows you to write just a\n", + "few lines of declarative Python or SQL to deploy a\n", + "production-quality data pipeline with:\n", + "\n", + "- [Autoscaling compute infrastructure](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#auto-scaling) for cost savings\n", + "\n", + "- Data quality checks with [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html)\n", + "\n", + "- Automatic [schema evolution](https://docs.databricks.com/ingestion/auto-loader/schema.html) handling\n", + "\n", + "- Monitoring via metrics in the [event log](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-event-log.html)\n", + "\n", + "With DLT, engineers can also treat their data as code\n", + "and apply software engineering best practices like\n", + "testing, monitoring and documentation to deploy\n", + "reliable pipelines at scale. 
You can easily define end-toend data pipelines in SQL or Python and automatically\n", + "maintain all data dependencies across the pipeline and\n", + "reuse ETL pipelines with environment-independent\n", + "data management.\n", + "\n", + "```\n", + "CUSTOMER STORY: ABNORMAL SECURITY\n", + "\n", + "### Stopping sophisticated ransomware in its tracks\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: ABNORMAL SECURITY\n", + "\n", + "```\n", + "\n", + "The increase in email phishing and ransomware attacks requires the type of protection that can scale and evolve\n", + "\n", + "to meet the challenges of modern cyberattacks. [Abnormal Security](https://abnormalsecurity.com/) , a cloud-native email security provider, knew\n", + "\n", + "that scalability would become a major focus to stay ahead of attack strategies with frequent product updates.\n", + "\n", + "Abnormal also required a data analytics infrastructure robust enough to meet the scale requirements for its data\n", + "\n", + "pipelines and constantly refined ML models.\n", + "\n", + "“We were spending too much time managing our Spark infrastructure,” said Carlos Gasperi, Software Engineer at\n", + "\n", + "Abnormal Security. “What we needed to be doing with that time was building the pipelines that would make the\n", + "\n", + "product better.”\n", + "\n", + "The company implemented the Databricks Lakehouse Platform, which simplified its data architecture and\n", + "\n", + "maximized the performance of data pipelines and analytics. Data practitioners are now able to ingest data\n", + "\n", + "directly from S3 and query it in near real-time with the help of Delta Lake, an open-format storage layer that\n", + "\n", + "delivers reliability, security and performance on the data lake for both streaming and batch operations. With\n", + "\n", + "Databricks SQL, data scientists are then able to create visualizations using rich dashboards to drive product\n", + "\n", + "decisions and improve detection efficacy.\n", + "\n", + "Databricks also provided the collaborative environment that Abnormal’s data teams needed to increase their\n", + "\n", + "productivity and work in the same space without constantly competing for compute resources.\n", + "\n", + "With Databricks, Abnormal has seen a 20% reduction in successful email attacks, a 40% reduction in\n", + "\n", + "infrastructure costs and a 30% increase in productivity. [Read the full story here.](https://www.databricks.com/customers/abnormal)\n", + "\n", + "\n", + "-----\n", + "\n", + "Delta Live Tables Enhanced Autoscaling is designed to handle streaming workloads\n", + "that trigger intermittently and are unpredictable. It optimizes cluster utilization\n", + "by only scaling up to the necessary number of nodes while maintaining endto-end SLAs, and gracefully shuts down nodes when utilization is low to avoid\n", + "unnecessary idle node capacity.\n", + "\n", + "\n", + "Delta Live Tables helps prevent bad data from flowing into tables through validation,\n", + "integrity checks and predefined error policies. 
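For illustration, here is a minimal sketch of a declarative Delta Live Tables pipeline in Python with one such expectation. The dataset names, source path and quality rule are hypothetical, and the code runs as a DLT pipeline rather than interactively:

```python
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw orders ingested incrementally with Auto Loader.")
def orders_bronze():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("s3://example-bucket/raw/orders/")  # hypothetical source path
    )

@dlt.table(comment="Validated orders.")
@dlt.expect_or_drop("valid_amount", "amount > 0")  # violating rows are dropped and logged
def orders_silver():
    return dlt.read_stream("orders_bronze").withColumn(
        "processed_at", F.current_timestamp()
    )
```

The expectation is recorded in the pipeline event log, which is where the quality trend monitoring mentioned above comes from.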
In addition, you can monitor data\n", + "\n", + "quality trends over time to get insight into how your data is evolving and where\n", + "changes may be necessary.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 5**\n", + "**Use Databricks SQL for serverless compute**\n", + "[Databricks SQL (DB SQL)](https://www.databricks.com/product/databricks-sql) is a serverless data\n", + "warehouse on the Lakehouse Platform for running your\n", + "SQL and BI applications at scale with up to 12x better\n", + "price/performance. It’s imperative for younger, growing\n", + "companies to reduce resource contention, and one way\n", + "to accomplish that is with serverless compute. Running\n", + "serverless removes the need to manage, configure or\n", + "scale cloud infrastructure on the lakehouse, freeing up\n", + "your data team for what they do best.\n", + "\n", + "\n", + "See for yourself in this tutorial on [how to run and visualize](https://docs.databricks.com/sql/get-started/user-quickstart.html)\n", + "[a query in Databrick SQL](https://docs.databricks.com/sql/get-started/user-quickstart.html) and create dashboards on data\n", + "stored in your data lake.\n", + "\n", + "The Databricks SQL REST API supports services to\n", + "manage queries and dashboards, query history and SQL\n", + "warehouses.\n", + "\n", + "\n", + "Databricks SQL warehouses provide instant, elastic\n", + "SQL compute — decoupled from storage — and will\n", + "automatically scale to provide unlimited concurrency\n", + "without disruption, for high concurrency use cases. DB\n", + "SQL has data governance and security built in. Handle\n", + "high concurrency with fully managed load balancing\n", + "and scaling of compute resources.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Faster queries with Photon**\n", + "[Photon](https://www.databricks.com/product/photon) is a new vectorized query engine designed\n", + "to deliver dramatic infrastructure cost savings and\n", + "accelerate all data and analytics workloads: data\n", + "ingestion, ETL, streaming, interactive queries, data\n", + "science and machine learning.\n", + "\n", + "Photon is used by default in Databricks SQL. To\n", + "enable Photon acceleration, select the **Use Photon**\n", + "**Acceleration** checkbox when you create the cluster.\n", + "If you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\n", + "set runtime_engine to PHOTON.\n", + "\n", + "Photon supports a number of instance types on\n", + "the driver and worker nodes. Photon instance types\n", + "consume DBUs at a different rate than the same\n", + "instance type running the non-Photon runtime. For\n", + "more information about Photon instances and DBU\n", + "consumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)\n", + "\n", + "Photon will seamlessly coordinate work and resources\n", + "and transparently accelerate portions of your SQL and\n", + "Spark queries. No tuning or user intervention required.\n", + "Photon is compatible with Apache Spark APIs, so\n", + "getting started is as easy as turning it on — no code\n", + "change and no lock- in. 
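\n", + "\n", + "If you prefer to automate cluster creation, the following sketch calls the Clusters REST API\n", + "with the runtime_engine field described above; the host and token environment variables, the\n", + "runtime version and the instance type are placeholders for your own workspace values:\n", + "\n", + "```python\n", + "import os\n", + "import requests\n", + "\n", + "host = os.environ['DATABRICKS_HOST']    # e.g. https://<workspace>.cloud.databricks.com\n", + "token = os.environ['DATABRICKS_TOKEN']\n", + "\n", + "payload = {\n", + "    'cluster_name': 'photon-demo',\n", + "    'spark_version': '13.3.x-scala2.12',   # any Photon-capable Databricks Runtime\n", + "    'node_type_id': 'i3.xlarge',           # pick a Photon-supported instance type\n", + "    'num_workers': 2,\n", + "    'runtime_engine': 'PHOTON',            # enables Photon acceleration\n", + "}\n", + "\n", + "resp = requests.post(\n", + "    host + '/api/2.0/clusters/create',\n", + "    headers={'Authorization': 'Bearer ' + token},\n", + "    json=payload,\n", + "    timeout=60,\n", + ")\n", + "resp.raise_for_status()\n", + "print(resp.json()['cluster_id'])\n", + "```\n", + "\n", + "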
Written entirely in C++, Photon\n", + "provides an additional [2x speedup over Apache Spark](https://www.databricks.com/product/photon)\n", + "per the TPC-DS 1TB benchmark, and customers have\n", + "observed 3x–8x speedups on average.\n", + "\n", + "\n", + "With Photon, typical customers are seeing up to [80% TCO savings](https://www.databricks.com/blog/2022/08/03/announcing-photon-engine-general-availability-on-the-databricks-lakehouse-platform.html#:~:text=Up%20to%2080%25%20TCO%20cost%20savings%20%2830%25%20on,Photon%203-8x%20faster%20queries%20on%20interactive%20SQL%20workloads) over traditional\n", + "Databricks Runtime (Apache Spark) and up to 85% reduction in VM compute hours.\n", + "\n", + "Learn how to connect BI tools to Databricks SQL\n", + "compute resources with the following user guides:\n", + "\n", + "\n", + "[Queries](https://docs.databricks.com/sql/user/queries/index.html)\n", + "\n", + "[Visualizations](https://docs.databricks.com/sql/user/visualizations/index.html)\n", + "\n", + "\n", + "[Favorites and tags](https://docs.databricks.com/sql/user/favorites-tags.html)\n", + "\n", + "[Workspace browser](https://docs.databricks.com/sql/user/workspace-browser/index.html)\n", + "\n", + "\n", + "[Dashboards](https://docs.databricks.com/sql/user/dashboards/index.html)\n", + "\n", + "[Alerts](https://docs.databricks.com/sql/user/alerts/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 6**\n", + "**Orchestrate workflows**\n", + "Databricks provides a comprehensive suite of tools and integrations to support your\n", + "data processing workflows.\n", + "\n", + "Databricks [Workflows](https://www.databricks.com/product/workflows) removes operational overhead by offering fully managed\n", + "orchestration service for all your teams, so you can focus on your workflows, not on\n", + "managing your infrastructure. Orchestrate diverse workloads for the full lifecycle\n", + "including Delta Live Tables, [Jobs](https://docs.databricks.com/workflows/index.html) for SQL, [Spark](https://www.databricks.com/product/spark) , notebooks, dbt, ML models and more.\n", + "\n", + "Here’s a tutorial on how to [create your first workflow with a Databricks job](https://docs.databricks.com/workflows/jobs/jobs-quickstart.html) . You will\n", + "learn how to create notebooks, create and run a job, view the run details, and run jobs\n", + "with different parameters.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 7**\n", + "**Run an end-to-end analytics pipeline**\n", + "This where you can see how everything works together to run efficiently at scale. 
First\n", + "take the quickstart: [Running end-to-end lakehouse analytics pipelines](https://docs.databricks.com/getting-started/lakehouse-e2e.html) , where you\n", + "will write to and read data from an external location managed by Unity Catalog and\n", + "configure Auto Loader to ingest data to Unity Catalog.\n", + "\n", + "###### Resources:\n", + "\n", + "- [Databricks Lakehouse free trial](https://www.databricks.com/try-databricks?itm_data=DataLakehouse-HeroCTA-Trial#account)\n", + "\n", + "- [The Lakehouse for companies born in the cloud](https://www.databricks.com/solutions/audience/digital-native)\n", + "\n", + "- [How DuPont achieved 11x latency reduction and 4x cost reduction with Photon](https://www.databricks.com/blog/2022/10/04/how-dupont-achieved-11x-latency-reduction-and-4x-cost-reduction-photon.html)\n", + "\n", + "- [Apache Spark on Databricks](https://docs.databricks.com/spark/index.html)\n", + "\n", + "- [Discover Lakehouse solutions](https://www.databricks.com/solutions)\n", + "\n", + "- [Databricks documentation](https://docs.databricks.com/)\n", + "\n", + "\n", + "###### “Databricks Workflows allows our analysts to easily create, run, monitor and repair data pipelines without managing any infrastructure. This enables them to have full autonomy in designing and improving ETL processes that produce must-have insights for our clients. We are excited to move our Airflow pipelines over to Databricks Workflows.”\n", + " —Anup Segu, Senior Software Engineer, YipitData\n", + "\n", + "[Learn more.](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----\n", + "\n", + "# 03\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Building effective machine-learning operations\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 03\n", + "\n", + "### Building effective machine-learning operations\n", + "\n", + "```\n", + "Growing startups and digital native companies face several challenges when they\n", + "start building, maintaining and scaling machine learning operations (MLOps) for their\n", + "data science teams.\n", + "\n", + "\n", + "MLOps is different from DevOps. DevOps practices\n", + "and tooling alone are insufficient because ML\n", + "applications rely on an assortment of artifacts (e.g.,\n", + "models, data, code) that can each require different\n", + "methods of experiment tracking, model training,\n", + "feature development, governance, feature and\n", + "model serving.\n", + "\n", + "For data teams beginning their machine learning\n", + "journeys, the challenge of training data models can\n", + "be labor-intensive and not cost-effective because\n", + "the data has to be converted into features and\n", + "\n", + "trained on a separate machine learning platform\n", + "\n", + "\n", + "Data teams often perform development in\n", + "disjointed, siloed stacks spanning DataOps,\n", + "ModelOps and DevOps\n", + "\n", + "Development and training environment\n", + "disconnect. Moving code and data between\n", + "personal development environments and\n", + "machine learning platforms for model training\n", + "at scale is error prone and cumbersome. The\n", + "“it worked on my machine” problem.\n", + "\n", + "Gathering high-quality data. Data that is siloed\n", + "across the organization is hard to discover,\n", + "collect, clean and use. 
This leads to stale data\n", + "and delays in development of models.\n", + "\n", + "\n", + "See **Create a unified data architecture.**\n", + "```\n", + " 03\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Siloed stacks spanning DataOps, ModelOps and DevOps\n", + "\n", + "When data engineers help ingest, refine and prep\n", + "data, they do so on their own stack. This data has\n", + "to be converted into features and then trained on\n", + "a separate machine learning platform. This cross-\n", + "platform handoff often results in data staleness,\n", + "difficulty in maintaining versions, and eventually,\n", + "poorly performing models. Even after you have\n", + "trained your model, you have to deal with yet another\n", + "tech stack for model deployment. It’s challenging\n", + "to serve features in real time and difficult to trace\n", + "problems in production back to the data.\n", + "\n", + "The downstream business impact is massive —\n", + "longer and more expensive projects, and lower\n", + "model accuracy in production leading to declining\n", + "business metrics.\n", + "\n", + "If you are looking at launching or scaling your\n", + "MLOps, you should probably focus on an incremental\n", + "strategy. At Databricks, we see firsthand how\n", + "customers develop their MLOps approaches across\n", + "a huge variety of teams and businesses. [Check out](https://www.youtube.com/watch?v=JApPzAnbfPI)\n", + "[this Data +AI Summit session](https://www.youtube.com/watch?v=JApPzAnbfPI) to learn more about\n", + "building robust MLOps practices.\n", + "\n", + "\n", + "###### Databricks solution:\n", + "\n", + "Databricks Machine Learning is an integrated\n", + "end-to-end machine learning environment\n", + "incorporating managed services for experiment\n", + "tracking, model training, feature development and\n", + "management, and model serving. The capabilities\n", + "of Databricks map directly to the steps of model\n", + "development and deployment. With Databricks\n", + "Machine Learning, you can:\n", + "\n", + "\n", + "Train models either manually or with AutoML\n", + "\n", + "Track training parameters and models using\n", + "experiments with MLflow tracking\n", + "\n", + "Create feature tables and access them for model\n", + "training and inference\n", + "\n", + "Share, manage and serve models using MLflow\n", + "Model Registry\n", + "\n", + "Deploy models for Serverless Real-time Inference\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Use MLOps on the Databricks Lakehouse Platform\n", + "\n", + "To gain efficiencies and reduce costs, many smaller\n", + "digital companies are employing machine learning\n", + "operations. MLOps is a set of processes and\n", + "automation for managing models, data and code, and\n", + "unique library dependencies to improve performance\n", + "stability and long-term efficiency in ML systems.\n", + "\n", + "To describe it simply, MLOps = ModelOps + DataOps +\n", + "DevOps. The aim of MLOps is to improve the long-term\n", + "performance, stability and success rate of ML systems\n", + "while maximizing the efficiency of the teams who\n", + "build them.\n", + "\n", + "\n", + "Not only does MLOps improve organizational efficiency,\n", + "it also allows the models to iterate faster and react\n", + "to real-life changes in the data. 
This ability separates\n", + "companies that can grow to meet their customer’s\n", + "challenges in a reactive manner versus those that will\n", + "spend significant time on data updates/processes and\n", + "miss the opportunity to do something with\n", + "their models.\n", + "\n", + "The absence of MLOps is typically marked by an\n", + "overabundance of manual processes which are slower\n", + "\n", + "\n", + "and more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck,\n", + "capping the ability for a data team to take on new projects. The process is complex. In larger organizations,\n", + "several specialists and stakeholders can be involved in one ML project. But data practitioners at smaller digital\n", + "natives and high-growth startups may be forced to wear several hats.\n", + "\n", + "\n", + "-----\n", + "\n", + "And once an ML project goes into production, the\n", + "MLOps continues, since the models, data and code\n", + "change over time due to regulatory and business\n", + "requirements. But the ML system must be resilient and\n", + "flexible. Addressing these challenges with a defined\n", + "MLOps strategy can dramatically reduce the iteration\n", + "cycle of delivering models to production.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Steps in machine learning model development and deployment:\n", + "\n", + "\n", + "**Step 1**\n", + "**Data preparation**\n", + "Manually preparing and labeling data is a thankless,\n", + "time-consuming job. With Databricks, teams can\n", + "label data with human effort, machine learning\n", + "models in Databricks, or a combination of both.\n", + "Teams can also employ a [model-assisted labeling](https://labelbox.com/product/automation )\n", + "workflow that allows humans to easily inspect and\n", + "correct a model’s predicted labels. This process can\n", + "drastically reduce the amount of unstructured data\n", + "you need to achieve strong model performance.\n", + "\n", + "The [Databricks Runtime for Machine Learning](https://docs.databricks.com/runtime/mlruntime.html) is a\n", + "ready-to-go environment with many external\n", + "libraries, including TensorFlow, PyTorch, Horovod,\n", + "scikit-learn and XGBoost. It provides\n", + "extensions to improve performance, including GPU\n", + "acceleration in XGBoost, distributed deep\n", + "learning using HorovodRunner, and model\n", + "checkpointing.\n", + "\n", + "To use Databricks Runtime ML, select the ML version\n", + "of the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\n", + "access data in Unity Catalog for machine learning\n", + "workflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\n", + "isolation clusters are not compatible with Databricks\n", + "Runtime for Machine Learning.\n", + "\n", + "\n", + "Machine learning applications often\n", + "need to use shared storage for data\n", + "loading and model checkpointing. You\n", + "can load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\n", + "files. 
A table is a collection of\n", + "structured data stored as a directory\n", + "on cloud object storage.\n", + "\n", + "For [data preprocessing](https://docs.databricks.com/machine-learning/preprocess-data/index.html) , you can\n", + "use [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) to create\n", + "new features, explore and reuse\n", + "existing features, track lineage and\n", + "feature creation code, and publish\n", + "features to low-latency online stores\n", + "for real-time inference. The Feature\n", + "Store is a centralized repository\n", + "that enables data scientists to find\n", + "and share features. It ensures that\n", + "the same code used to compute\n", + "the feature values is used for model\n", + "training and inference. The Feature\n", + "Store library is available only on\n", + "Databricks Runtime for Machine\n", + "Learning and is accessible through\n", + "Databricks notebooks and workflows.\n", + "\n", + "\n", + "###### Resources:\n", + "\n", + "- [The Comprehensive Guide to Feature Stores](https://www.databricks.com/resources/ebook/the-comprehensive-guide-to-feature-stores)\n", + "\n", + "- [Load data for machine learning and deep learning](https://docs.databricks.com/machine-learning/load-data/index.html)\n", + "\n", + "- [Preprocess data for machine learning and](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n", + "[deep learning](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "C `USTOMER STORY: ZIPLINE`\n", + "\n", + "### Data-driven drones deliver lifesaving medical aid around the world\n", + "\n", + "\n", + "Automated logistics and delivery system\n", + "\n", + "provider [Zipline](https://www.flyzipline.com/ ) is redefining logistics by using\n", + "\n", + "cutting-edge drone technology and a global\n", + "\n", + "autonomous logistics network to save lives\n", + "\n", + "\n", + "information they need to accurately measure success, find\n", + "\n", + "the metrics that relate to customer experiences or logistics,\n", + "\n", + "and improve on them exponentially as more data is ingested\n", + "\n", + "and machine learning models are refined.\n", + "\n", + "\n", + "by giving remote communities access to\n", + "\n", + "\n", + "emergency and preparatory medical aid and\n", + "\n", + "resources, regardless of where they are in the\n", + "\n", + "world.\n", + "\n", + "Doing so requires the ability to ingest and\n", + "\n", + "analyze huge chunks of time series data in real\n", + "\n", + "time. This data is produced every time a drone\n", + "\n", + "takes flight and includes performance data,\n", + "\n", + "in-flight battery management, regional weather\n", + "\n", + "patterns, geographic obstacles, landing errors\n", + "\n", + "and a litany of other information that must be\n", + "\n", + "processed.\n", + "\n", + "\n", + "“About 30% of the deliveries we do are lifesaving emergency\n", + "\n", + "deliveries, where the product being delivered does not exist\n", + "\n", + "at the hospital. 
We have to be fast, and we have to be able\n", + "\n", + "to rely on all the different kinds of data to predict failures\n", + "\n", + "before they occur so that we can guarantee a really, really\n", + "\n", + "high service level to the people who are literally depending\n", + "\n", + "on us with their lives,” said Zipline CEO Keller Rinaudo.\n", + "\n", + "“Databricks gives us confidence in our operations, and\n", + "\n", + "enables us to continuously improve our technology, expand\n", + "\n", + "our impact, and provide lifesaving aid where and when it’s\n", + "\n", + "needed, every single day.”\n", + "\n", + "[Read full story here.](https://www.databricks.com/customers/zipline)\n", + "\n", + "\n", + "Every Zipline flight generates a gigabyte of data\n", + "\n", + "with potential life-or-death consequences,\n", + "\n", + "but accessing and federating the data for both\n", + "\n", + "internal and external decision-making was\n", + "\n", + "challenging. With Databricks as the common\n", + "\n", + "platform, Zipline’s data team can access all the\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 2**\n", + "**Model training**\n", + "For training machine learning and deep learning\n", + "models, you can use [AutoML](https://docs.databricks.com/machine-learning/automl/index.html) , which automatically\n", + "prepares a data set for model training, performs a set\n", + "of trials using open-source libraries such as scikit-learn\n", + "and XGBoost, and creates a Python notebook with\n", + "the source code for each trial run so you can review,\n", + "reproduce and modify the code.\n", + "\n", + "In Databricks, [notebooks](https://docs.databricks.com/notebooks/index.html) are the primary tool for\n", + "creating data science and machine learning workflows\n", + "and collaborating with colleagues. Databricks\n", + "notebooks provide real-time coauthoring in multiple\n", + "languages, automatic versioning and built-in data\n", + "visualizations.\n", + "\n", + "\n", + "###### Resources:\n", + "\n", + "- [Model training examples](https://docs.databricks.com/machine-learning/train-model/index.html)\n", + "\n", + "- [Training models with Feature Store](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n", + "\n", + "- [Best practices for deep learning on Databricks](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n", + "\n", + "- [Machine learning quickstart notebook](https://docs.databricks.com/machine-learning/train-model/ml-quickstart.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n", + "\n", + "- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n", + "\n", + "- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n", + "\n", + "- [Track ML Model training data with Delta Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n", + "\n", + "- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)\n", + "\n", + "\n", + "**Step 3**\n", + "**Track model development**\n", + "The model development process is iterative, and can\n", + "be challenging. 
You can use [MLflow tracking](https://mlflow.org/docs/latest/tracking.html) to help\n", + "you keep track of the model development process,\n", + "including parameter settings or combinations you have\n", + "tried and how they affected the model’s performance.\n", + "\n", + "MLflow tracking uses experiments and runs to log\n", + "and track your model development. A run is a single\n", + "execution of model code. An experiment is a collection\n", + "of related runs. Within an experiment, you can compare\n", + "and filter runs to understand how your model performs\n", + "and how its performance depends on the parameter\n", + "settings, input data, etc.\n", + "\n", + "MLflow can automatically log training code written\n", + "in many ML frameworks. This is the easiest way to\n", + "get started using MLflow tracking. With MLflow’s\n", + "autologging capabilities, a single line of code\n", + "automatically logs the resulting model.\n", + "\n", + "\n", + "A hosted version of MLflow Model Registry can help\n", + "[manage the full lifecycle](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) of MLflow models. You can\n", + "apply webhooks to automatically trigger actions based\n", + "on registry events. For example, you can trigger CI\n", + "builds when a new model version is created or notify\n", + "your team members through Slack each time a model\n", + "transition to production is requested. This promotes\n", + "a traceable version control work process. You can\n", + "leverage this feature for web traffic A/B testing and\n", + "funneled to versions of deployed models for more\n", + "precise population studies.\n", + "\n", + "\n", + "**Step 4**\n", + "**Deploy machine learning models**\n", + "You can use MLflow to deploy models for batch or\n", + "streaming inference or to set up a REST endpoint to\n", + "serve the model. Simplify your model deployment by\n", + "registering models to [the MLflow Model Registry](https://docs.databricks.com/mlflow/model-registry.html) . After\n", + "you have registered your model, you can [automatically](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb)\n", + "[generate a notebook](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb) for batch inference or configure\n", + "the model for online serving with Serverless RealTime Inference or [Classic MLflow Model Serving on](https://docs.databricks.com/archive/classic-model-serving/model-serving.html)\n", + "\n", + "[Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html) . For model inference for deep learning\n", + "applications, Databricks recommends the following\n", + "workflow.\n", + "\n", + "To debug and tune model inference on Databricks,\n", + "using GPUs (graphics processing units) can efficiently\n", + "optimize the running speed for model inference. As\n", + "GPUs and other accelerators become faster, it is\n", + "important that the data input pipeline keep up with\n", + "demand. 
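\n", + "\n", + "Pulling Steps 3 and 4 together, here is a minimal sketch of autologging a training run and\n", + "registering the result in the Model Registry; the scikit-learn sample data and the registered\n", + "model name are placeholders used only for illustration:\n", + "\n", + "```python\n", + "import mlflow\n", + "from sklearn.datasets import load_iris\n", + "from sklearn.ensemble import RandomForestClassifier\n", + "from sklearn.model_selection import train_test_split\n", + "\n", + "mlflow.autolog()   # one line: parameters, metrics and the model are logged\n", + "\n", + "X, y = load_iris(return_X_y=True)\n", + "X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)\n", + "\n", + "with mlflow.start_run() as run:\n", + "    model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)\n", + "    print('test accuracy:', model.score(X_test, y_test))\n", + "\n", + "# Register the autologged model so it can be promoted and served later.\n", + "mlflow.register_model('runs:/' + run.info.run_id + '/model', 'demo_classifier')\n", + "```\n", + "\n", + "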
The data input pipeline reads the data into\n", + "Spark DataFrames, transforms it and loads it as the\n", + "input for model inference.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: ITERABLE\n", + "\n", + "### Optimizing touch points across the entire customer journey\n", + "\n", + "```\n", + "“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meet\n", + "\n", + "rising consumer demands for more personalized experiences that drive revenue and results.” —Sinéad Cheung,\n", + "\n", + "Principal Product Manager, [Iterable](https://iterable.com/)\n", + "\n", + "Captivating an audience and understanding customer journeys are essential to creating deeper brand- customer\n", + "\n", + "connections that drive growth, loyalty and revenue. From helping medical practitioners build trust with new\n", + "\n", + "patients to ensuring that food delivery users feel connected to their culinary community, Iterable helps more\n", + "\n", + "than 1,000 brands optimize and humanize their marketing in today’s competitive landscape.\n", + "\n", + "This need to build personalized and automated customer experiences for its clients drove the company to find a\n", + "\n", + "fully managed platform that would simplify infrastructure management, make collaboration possible, and give it\n", + "\n", + "the ability to scale for analytics and AI.\n", + "\n", + "With Databricks Lakehouse, Iterable can harness diverse, complex data sets — including conversion events,\n", + "\n", + "unique user labels, engagement patterns and business insights — and facilitate rapid prototyping of machine\n", + "\n", + "learning models that deliver top-notch and personalized user experiences for higher-converting marketing\n", + "\n", + "campaigns. [Read the full story here.](https://www.databricks.com/customers/iterable)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### ML Stages\n", + "\n", + "ML workflows include the following key assets: code,\n", + "models and data. These assets need to be developed\n", + "(dev), tested (staging) and deployed (production).\n", + "Each stage needs to operate within an execution\n", + "environment. So the execution environments, code,\n", + "models and data are divided into dev, staging and\n", + "production.\n", + "\n", + "ML project code is often stored in a version control\n", + "repository (such as Git), with most organizations using\n", + "branches corresponding to the lifecycle phases of\n", + "development, staging or production.\n", + "\n", + "Since model lifecycles do not correspond one-toone with code lifecycles, it makes sense for model\n", + "management to have its own service. MLflow and its\n", + "Model Registry support managing model artifacts\n", + "directly via UI and APIs. The loose coupling of model\n", + "artifacts and code provides flexibility to update\n", + "production models without code changes, streamlining\n", + "the deployment process in many cases.\n", + "\n", + "Databricks recommends creating separate\n", + "environments for the different stages of ML code and\n", + "model development with clearly defined transitions\n", + "between stages. The recommended MLOps workflow is\n", + "broken into these three stages:\n", + "\n", + "\n", + "[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\n", + "is experimentation. 
Data scientists develop features\n", + "and models and run experiments to optimize model\n", + "performance. The output of the development process is\n", + "ML pipeline code that can include feature computation,\n", + "model training inference and monitoring\n", + "\n", + "\n", + "-----\n", + "\n", + "[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\n", + "This stage focuses on testing the ML pipeline code\n", + "for production readiness, including code for model\n", + "training as well as feature engineering pipelines and\n", + "inference code. The output of the staging process is a\n", + "release branch that triggers the CI/CD system to start\n", + "the production stage.\n", + "\n", + "\n", + "-----\n", + "\n", + "[Production](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#production-stage)\n", + "ML engineers own the production environment\n", + "where ML pipelines are deployed. These pipelines\n", + "compute fresh feature values, train and test new model\n", + "versions, publish predictions to downstream tables\n", + "or applications, and monitor the entire process to\n", + "avoid performance degradation and instability. Data\n", + "scientists have visibility to test results, logs, model\n", + "artifacts and production pipeline status to allow them\n", + "to identify and diagnose problems in production.\n", + "\n", + "The Databricks Machine Learning home page provides\n", + "quick access to all the machine learning resources. To\n", + "access this page, move your mouse or pointer over\n", + "the left sidebar in the Databricks workspace. From\n", + "the persona switcher at the top of the sidebar, select\n", + "\n", + "Machine Learning.\n", + "\n", + "From the shortcuts menu, you can create\n", + "a [notebook](https://docs.databricks.com/notebooks/index.html) , [start AutoML](https://docs.databricks.com/machine-learning/automl/index.html) or open a [tutorial notebook](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) .\n", + "The center of the screen includes any recently viewed\n", + "items, and the sidebar provides quick access to\n", + "the [Experiments page](https://docs.databricks.com/mlflow/tracking.html#mlflow-experiments) , [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) and\n", + "[Model Registry.](https://docs.databricks.com/mlflow/model-registry.html)\n", + "New users can get started with a series of [tutorials](https://docs.databricks.com/machine-learning/tutorial/index.html)\n", + "that illustrate how to use Databricks throughout the\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [MLOps Virtual Event: Standardizing MLOps at Scale](https://www.databricks.com/p/webinar/mlops-virtual-event)\n", + "\n", + "- [Virtual Event — Automating the ML Lifecycle With](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n", + "[Databricks Machine Learning](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n", + "\n", + "- [MLOps Virtual Event “Operationalizing Machine](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n", + "[Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n", + "\n", + "- [The Big Book of 
MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)\n", + "\n", + "- [Machine learning on Databricks](https://www.databricks.com/product/machine-learning)\n", + "\n", + "- [Watch the demos](https://www.databricks.com/discover/demos)\n", + "\n", + "\n", + "ML lifecycle or access the [in-product quickstart](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html)\n", + "for a model-training tutorial notebook that steps\n", + "through loading data, training and tuning a model,\n", + "comparing and analyzing model performance and\n", + "using the model for inference.\n", + "\n", + "Also be sure to download the [Big Book of MLOps](https://www.databricks.com/p/thank-you/the-big-book-of-mlops) to\n", + "learn how your organization can build a robust MLOps\n", + "practice incrementally.\n", + "\n", + "\n", + "-----\n", + "\n", + "# 04\n", + "```\n", + "SUMMARY: \u0003\n", + "\n", + "## The Databricks Lakehouse Platform addresses these challenges\n", + " 04\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "### Summary\n", + "\n", + "We’ve organized the common data challenges for startups and growing digital native\n", + "\n", + "businesses into three main buckets: building a **unified data architecture** ; ensuring\n", + "\n", + "**scalability and performance** ; and building effective **machine learning**\n", + "\n", + "**operations** , all with an eye on cost efficiency and increased productivity.\n", + "\n", + "The Lakehouse Platform provides an efficient and scalable architecture that solves\n", + "these challenges and will support your data, analytics and AI workloads now and as\n", + "you scale.\n", + "\n", + "With [Databricks](https://www.databricks.com/) you can unify all your data with cost-efficient architecture for highly\n", + "performant digital native applications and analytic workloads — designed to scale as\n", + "you grow. Use your data however and wherever you want with open-source flexibility,\n", + "leverage open formats, APIs and your tools of choice. Ensure reliable, high-performing\n", + "data workloads while Databricks automatically manages your infrastructure as you\n", + "scale. Leverage serverless Databricks SQL to increase productivity and scale on\n", + "demand with up to 12x better price/performance.\n", + "\n", + "Easily access data for ML models and accelerate the full ML lifecycle from\n", + "experimentation to production.\n", + "\n", + "Discover more about the lakehouse for companies born in the cloud.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Get started with Databricks Trial\n", + "\n", + "Get a collaborative environment for data teams to build\n", + "solutions together with interactive notebooks to use\n", + "Apache Spark™, SQL, Python, Scala, Delta Lake, MLflow,\n", + "TensorFlow, Keras, scikit-learn and more.\n", + "\n", + "Available as a 14-day full trial in your own cloud or as\n", + "a lightweight trial hosted by Databricks.\n", + "\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the lakehouse company. More than 7,000\n", + "organizations worldwide — including Comcast, Condé\n", + "Nast and over 50% of the Fortune 500 — rely on the\n", + "Databricks Lakehouse Platform to unify their data,\n", + "\n", + "analytics and AI. 
Databricks is headquartered in San\n", + "Francisco, with offices around the globe. Founded by\n", + "the original creators of Apache Spark™, Delta Lake and\n", + "MLflow, Databricks is on a mission to help data teams\n", + "solve the world’s toughest problems. To learn more,\n", + "follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "**[TRY DATABRICKS FOR FREE](https://www.databricks.com/try-databricks?itm_data=H#account)**\n", + "\n", + "\n", + "© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark\n", + "\n", + "\n", + "-----
**EBOOK**\n", + "\n", + "# Four Forces Driving Intelligent Manufacturing\n", + "\n", + "### A data-driven business built on Lakehouse for Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction .................................................................................................................................................................................................................................................. **03**\n", + "\n", + "The four driving forces of change ..................................................................................................................................................................................................... **04**\n", + "\n", + "Digital transformation is not a destination, it’s a journey .......................................................................................................................................................... **05**\n", + "\n", + "Manufacturing – use case maturity matrix ...................................................................................................................................................................................... **06**\n", + "\n", + "The foundations for data-driven manufacturing ............................................................................................................................................................................ **07**\n", + "\n", + "DRIVING FORCE NO. 1\n", + "The shift from manufacturing to Intelligent Manufacturing ...................................................................................................................................................... **08**\n", + "\n", + "DRIVING FORCE NO. 2\n", + "Transparency, visibility, data: optimizing the supply chain ........................................................................................................................................................ **10**\n", + "\n", + "DRIVING FORCE NO. 3\n", + "Future opportunities for manufacturing business models ......................................................................................................................................................... **13**\n", + "\n", + "DRIVING FORCE NO. 4\n", + "The focus on sustainability ....................................................................................................................................................................................................................... **15**\n", + "\n", + "Leveraging the Databricks Lakehouse for Manufacturing ........................................................................................................................................................... **17**\n", + "\n", + "The building blocks of Lakehouse for Manufacturing .................................................................................................................................................................... **18**\n", + "\n", + "Manufacturers’ end goals .......................................................................................................................................................................................................................... 
**19**\n", + "\n", + "2 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "##### Manufacturing has always been an evolutionary business, grounded upon infrastructure, business processes, and manufacturing operations built over decades in a continuum of successes, insights and learnings. The methods and processes used to approach the development, release and optimization of products and capital spend are the foundation of the industry’s evolution.\n", + "\n", + "\n", + "But today it’s data- and AI-driven businesses that\n", + "are being rewarded because they’re using process\n", + "and product optimization not previously possible,\n", + "able to forecast and sense supply chain demand,\n", + "and, crucially, introduce new forms of revenue\n", + "based upon service rather than product.\n", + "\n", + "The drivers for this evolution will be the emergence\n", + "of what we refer to as “Intelligent Manufacturing”\n", + "that has been enabled by the rise of computational\n", + "power at the Edge and in the Cloud. As well as\n", + "new levels of connectivity speed enabled by 5G\n", + "and fiber optic, combined with increased use of\n", + "advanced analytics and machine learning (ML).\n", + "\n", + "\n", + "Yet, even with all the technological advances\n", + "enabling these new data-driven businesses,\n", + "challenges exist.\n", + "\n", + "McKinsey’s recent research with the World\n", + "Economic Forum estimates the value creation\n", + "potential of manufacturers and suppliers that\n", + "implement Industry 4.0 in their operations\n", + "at USD$37 trillion by 2025. Truly a huge number.\n", + "But the challenge that most companies still\n", + "struggle with is the move from piloting point\n", + "solutions to delivering sustainable impact at scale.\n", + "[Only 30% of companies are capturing value from](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n", + "[Industry 4.0 solutions in manufacturing today.](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n", + "\n", + "\n", + "##### 80% of manufacturers\n", + "[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n", + "[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n", + "\n", + "##### 57% of manufacturing leaders feel their organization\n", + "[lacks skilled workers to support](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n", + "[their smart manufacturing plans](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n", + "\n", + "[A lack of supply 
chain](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n", + "[integration could stall smart](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n", + "[factory initiatives for](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf) **[3 in 5](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)**\n", + "##### manufacturers by 2025\n", + "\n", + "\n", + "3 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## The four driving forces of change\n", + "\n", + "###### Over the last two years, demand imbalances and supply chain swings have added a sense of urgency for manufacturers to digitally transform. But in truth, the main challenges facing the industry have existed, and will continue to exist, outside these recent exceptional circumstances. Manufacturers will always strive for greater levels of visibility across their supply chain, always seek to optimize and streamline operations to improve margins. In the continuing quest for improved efficiency, productivity, adaptability and resilience, manufacturers are commonly tackling these major challenges:\n", + "\n", + "\n", + "###### Skills and production gaps\n", + "\n", + "The rise of the digital economy is demanding a new set of skills.\n", + "For today’s Intelligent Manufacturing organizations, there’s a fundamental\n", + "need for computer and programming skills for automation, along\n", + "with critical-thinking abilities. Also important is the ability to use\n", + "collaboration systems and new advanced assistance tools, such as\n", + "automation, virtual reality (VR) and augmented reality (AR). The deficit\n", + "of workers with these skills is of critical concern to manufacturers.\n", + "\n", + "In addition, the industry dynamics are pushing companies to increase\n", + "and refine both partner/supplier relationships, optimize internal\n", + "operations and build robust supply chains that do not rely upon\n", + "safety stock to weather supply chain swings. Historical focus on\n", + "operational use cases is now extending to building agile supply chains.\n", + "\n", + "###### Supply chain volatility\n", + "\n", + "If the events of the last few years proved anything, it’s that supply\n", + "chains need to be robust and resilient. Historically, supply chain volatility\n", + "was smoothed by holding “safety stock,” which added costs without\n", + "financial value. Then the pendulum swung to “just in time delivery,”\n", + "where efficient use of working capital disregarded demand risks.\n", + "\n", + "Recent experiences have highlighted that demand sensing is needed\n", + "in addition to safety stock for high-risk parts or raw materials. The ability\n", + "to monitor, predict and respond to external factors – including natural\n", + "disasters, shipping and warehouse constraints, and geopolitical disruption\n", + "– is vital to reduce risk and promote agility. 
Many of these external\n", + "data sources leverage unstructured data (news, social posts, videos\n", + "and images), and being able to manage both structured and unstructured\n", + "data available to measure and analyze this volatility is key.\n", + "\n", + "\n", + "###### Need for new and additional sources of revenue\n", + "\n", + "Manufacturers’ growth historically has been limited\n", + "to new product introduction rate or expansion into\n", + "new geographies. The emergence of “equipment\n", + "as-a-service” is changing that dynamic. It’s pivoting\n", + "the business from product-centric growth to one\n", + "leveraging added services, which are not slaves to the\n", + "product development introduction cycle and can be highly\n", + "differentiated depending on the market segment and types\n", + "of products. Real-time data plays an outsize role, as now\n", + "businesses are in unison with use cases such as predictive\n", + "maintenance, stock replenishment and worker safety.\n", + "\n", + "###### An increased focus on sustainability\n", + "\n", + "Manufacturers have always focused on efficiency,\n", + "but they’re increasingly seeing the value chain as circular.\n", + "It’s no longer enough to consider an organization’s own\n", + "carbon footprint – it needs to also include indirect\n", + "emissions and other environmental impacts from the\n", + "activities it doesn’t own or control. This requires a\n", + "360-degree view of sustainability, which includes both\n", + "internal and external factors in measuring compliance\n", + "with ESG programs.\n", + "\n", + "**This eBook will look closer at these four key challenges**\n", + "**and their associated use cases, as well as some**\n", + "**of the most effective technologies and solutions**\n", + "**that can be implemented to respond to them.**\n", + "\n", + "\n", + "4 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Digital transformation is not a destination, it’s a journey\n", + "\n", + "##### Digitalization is reshaping many areas of manufacturing and logistics, product design, production and quality of goods as well as sustainability and energy output.\n", + "\n", + "This transition from manual operations to automated\n", + "solutions is enhancing and optimizing operational\n", + "efficiency and decision-making, while also making\n", + "supply chains more frictionless and reliable, as well\n", + "as enabling organizations to become more responsive\n", + "and adaptable to market and customer needs.\n", + "\n", + "This disruption has been driven by a rush of new\n", + "technologies including artificial intelligence, machine\n", + "learning, advanced analytics, digital twins, Internet\n", + "of Things (IoT), and automation. These, in turn, have\n", + "been enabled by the greater network capabilities of 5G.\n", + "Industry 4.0 is well underway. 
Intelligent Manufacturing\n", + "isn’t the future, it’s what competitive organizations\n", + "have established today.\n", + "\n", + "\n", + "## The data and AI maturity curve\n", + "### From descriptive to prescriptive\n", + "\n", + "Prescriptive\n", + "Analytics\n", + "\n", + "Predictive\n", + "Modeling\n", + "\n", + "**How** can we make it happen?\n", + "\n", + "Data\n", + "Exploration\n", + "\n", + "\n", + "**What** will happen?\n", + "\n", + "**Why** did it happen?\n", + "\n", + "\n", + "Ad Hoc\n", + "Queries\n", + "\n", + "\n", + "Reports\n", + "\n", + "\n", + "Cleaned\n", + "Data\n", + "\n", + "**What** happened?\n", + "\n", + "Analytics Maturity\n", + "\n", + "\n", + "Raw\n", + "Data\n", + "\n", + "\n", + "5 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Manufacturing – use case maturity matrix\n", + "\n", + "\n", + "No\n", + "\n", + "1\n", + "\n", + "2\n", + "\n", + "3\n", + "\n", + "4\n", + "\n", + "5\n", + "\n", + "6\n", + "\n", + "7\n", + "\n", + "8\n", + "\n", + "9\n", + "\n", + "10\n", + "\n", + "11\n", + "\n", + "12\n", + "\n", + "13\n", + "\n", + "14\n", + "\n", + "15\n", + "\n", + "16\n", + "\n", + "17\n", + "\n", + "18\n", + "\n", + "19\n", + "\n", + "20\n", + "\n", + "21\n", + "\n", + "22\n", + "\n", + "23\n", + "\n", + "\n", + "Use case name\n", + "\n", + "EDW offload\n", + "\n", + "Product 360\n", + "\n", + "Voice of customer insights\n", + "\n", + "Testing & simulation optimization\n", + "\n", + "Supplier 360\n", + "\n", + "Spend analytics\n", + "\n", + "Sourcing event optimization\n", + "\n", + "Process & quality monitoring\n", + "\n", + "Process 360\n", + "\n", + "Equipment predictive maintenance\n", + "\n", + "Quality & yield optimization\n", + "\n", + "Supply chain 360\n", + "\n", + "Demand analytics\n", + "\n", + "Inventory visibility & tracking\n", + "\n", + "Inventory optimization\n", + "\n", + "Logistics route optimization\n", + "\n", + "Customer 360\n", + "\n", + "Marketing & sales personalization\n", + "\n", + "Recommendation engine\n", + "\n", + "Asset/Vehicle 360\n", + "\n", + "Connected asset & value-added services\n", + "\n", + "Quality event detection & traceability\n", + "\n", + "Asset predictive maintenance\n", + "\n", + "\n", + "Peer Competitive Scale\n", + "\n", + "Standard among peer group\n", + "\n", + "Common among peer group\n", + "\n", + "Strategic among peer group\n", + "\n", + "\n", + "Design\n", + "\n", + "\n", + "Purchasing\n", + "\n", + "**11**\n", + "\n", + "**10**\n", + "\n", + "**13**\n", + "\n", + "**12**\n", + "\n", + "**17**\n", + "\n", + "\n", + "New innovations\n", + "\n", + "Manufacturing\n", + "\n", + "Supply Chain\n", + "\n", + "\n", + "That is not to say that the digital transformation\n", + "journey is simple. Replacing legacy systems, breaking\n", + "down data and organizational silos, bridging the gap\n", + "between operational technology (OT) and informational\n", + "technology (IT), reskilling workforces, and much more\n", + "requires a clear and determined digitalization strategy,\n", + "and to reach new levels of IT and data maturity.\n", + "\n", + "\n", + "**16**\n", + "\n", + "\n", + "Much of the aforementioned transformation requires\n", + "a foundation of effective data management and\n", + "architecture to be in place. 
Without this ability to\n", + "control the vast amounts of structured data (highly\n", + "organized and easily decipherable) and unstructured\n", + "data (qualitative, no predefined data model),\n", + "manufacturers cannot generate actionable insights\n", + "from their data, derive value from machine learning,\n", + "monitor and analyze supply chains, or coordinate\n", + "decisions across the business.\n", + "\n", + "\n", + "**15**\n", + "\n", + "\n", + "**14**\n", + "\n", + "\n", + "Marketing & Sales\n", + "\n", + "Service\n", + "\n", + "\n", + "**19**\n", + "\n", + "\n", + "**18**\n", + "\n", + "\n", + "**23**\n", + "\n", + "\n", + "**22**\n", + "**21**\n", + "**20**\n", + "\n", + "\n", + "Awareness\n", + "\n", + "\n", + "Exploration Optimization Transformation\n", + "\n", + "Maturity Stages\n", + "\n", + "\n", + "6 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## The foundations for data-driven manufacturing\n", + "\n", + "###### Cloud-native platforms\n", + "\n", + "Improve data management, enhance data analytics\n", + "and expand the use of enterprise data, including streaming\n", + "structured and unstructured data\n", + "\n", + "###### Technology-enabled collaboration\n", + "\n", + "Democratize analytics and ML capabilities – ensure the right\n", + "users have access to the right data driving business value\n", + "\n", + "###### The ability to scale machine learning use cases\n", + "\n", + "A central place to store and discover ML models and enabling\n", + "greater collaboration between ML, data and business users\n", + "\n", + "\n", + "##### 95% agree that\n", + "[digital transformation](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[in manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[is essential to their](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[company’s future success](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "\n", + "\n", + "[Global spending on](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n", + "[digital transformation](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n", + "[is forecast to reach](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n", + "##### USD$2.8 trillion by 2025\n", + "\n", + "\n", + "##### 85% have accelerated\n", + "[their digital transformation](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n", + "[strategies since 2020](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n", + "\n", + "\n", + "###### Open standards and open data architectures\n", + "\n", + "Leverage open source standards and open data formats\n", + "to accelerate innovation and enable the integration\n", + "of best-of-breed, third-party tools and services\n", + "\n", + "\n", + "7 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 
1\n", + "\n", + "## The shift from manufacturing to Intelligent Manufacturing\n", + "\n", + "##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion . The immediate response would be to point the finger at the pandemic. Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n", + "\n", + "\n", + "Yet the reasons for the lack of manufacturing\n", + "talent today are manifold, and COVID-19 has only\n", + "contributed to an existing problem. For instance,\n", + "many highly experienced baby boomers are\n", + "retiring from the workforce, leaving fewer people\n", + "with the in-depth knowledge of custom equipment\n", + "and machines. Meanwhile, younger generations\n", + "have a poor perception of what manufacturing jobs\n", + "are like and are reluctant to step into the industry.\n", + "Meaning not only a problem with retaining skills,\n", + "but also attracting them.\n", + "\n", + "And, of course, there is a growing gap between\n", + "the current capabilities of industrial workers and\n", + "the skill sets needed for today’s data-driven,\n", + "sensor-filled, 5G-enabled Intelligent Manufacturing.\n", + "\n", + "\n", + "With the drive to optimize operations, stabilize\n", + "supply chains and reinvent business models\n", + "through equipment-as-a-service, the skill sets\n", + "have radically changed from even a decade ago.\n", + "\n", + "Intelligent Manufacturing’s use cases are placing\n", + "a high demand on robotics programmers and\n", + "technicians, cybersecurity experts, digital twin\n", + "architects, supply network analysts, and people\n", + "who can leverage AI and machine learning\n", + "algorithms because deployment of these common\n", + "use cases is producing multiples of returns for\n", + "those embracing Intelligent Manufacturing.\n", + "\n", + "\n", + "8 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Those manufacturers with a strategy for upskilling existing talent, while also changing the perception of the incoming workforce, need to take advantage of the following use cases:\n", + "\n", + "\n", + "##### 44% report difficulty\n", + "[hiring manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[talent with the required](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "[digital expertise](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n", + "\n", + "##### 83% of manufacturing workers are interested\n", + "[in learning new digital skills](https://www.mendix.com/press/welcome-news-to-jumpstart-the-post-pandemic-economy-mendix-survey-shows-78-of-u-s-manufacturing-workers-want-to-help-with-digital-transformation/)\n", + "\n", + "##### 56% of Gen Z say\n", + "[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[changed their perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[of manufacturing. 
77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n", + "\n", + "### Proof through customer success\n", + "\n", + "##### Watch our case study\n", + "\n", + "\n", + "###### Digital twins\n", + "\n", + "Ingesting information from sensors and other data sources,\n", + "these virtual replicas of physical assets create models\n", + "to which a layer of visualization can be applied. This enables\n", + "users to predict failures, assess performance and reveal\n", + "opportunities for optimization. Digital twins unlock the ability\n", + "for manufacturers to monitor and manage production remotely,\n", + "as well as explore “what-if” scenarios.\n", + "\n", + "###### Process and quality optimization\n", + "\n", + "Process and quality optimization generally covers the\n", + "optimization of equipment, operating procedures, and control\n", + "loops. It requires access to accurate, up-to-date data about\n", + "conditions, collected through IoT devices to monitor every\n", + "aspect. The introduction of deep learning architectures is\n", + "enabling manufacturing machinery to identify visual clues\n", + "that are indicative of quality issues in manufactured goods,\n", + "while digital twins can be used to spot inefficiencies without\n", + "the need to pause production.\n", + "\n", + "###### Throughput optimization\n", + "\n", + "Increasing throughput is critical for meeting delivery schedules,\n", + "and manufacturers are always looking for ways to identify\n", + "and eliminate bottlenecks, reduce inventory and increase\n", + "the utilization of assets. Throughput optimization makes\n", + "use of data-driven algorithms to identify, rank and resolve\n", + "labor, equipment or inventory bottlenecks.\n", + "\n", + "\n", + "###### Equipment predictive maintenance\n", + "\n", + "Rather than wait for a piece of equipment to fail or\n", + "stick to a fixed schedule, predictive maintenance adopts\n", + "a predictive approach to equipment maintenance.\n", + "By monitoring real-time data collected from hundreds\n", + "of IoT sensors, machine learning techniques can detect\n", + "anomalies in operations and possible defects in equipment\n", + "and processes. Predictive maintenance correlates data across\n", + "many more dimensions than traditional inspection techniques,\n", + "to anticipate failures and prevent costly breakdowns.\n", + "\n", + "###### Quality and yield optimization (with computer vision)\n", + "\n", + "Quality assurance focuses on the use of data analytics,\n", + "AI and machine learning to identify and prevent defects\n", + "during the manufacturing process. 
[This type of edge AI](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n", + "[is an approach that can increase productivity by 50%](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n", + "[and detection rates by up to 90%.](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process) Making use of image\n", + "recognition and machine learning, computer vision\n", + "can automate visual inspections, detecting faults\n", + "and imperfections faster and more cost effectively\n", + "than manual approaches.\n", + "\n", + "\n", + "9 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 2\n", + "\n", + "## Transparency, visibility, data: optimizing the supply chain\n", + "\n", + "##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n", + "\n", + "\n", + "Such resiliency requires a combination\n", + "of technologies and solutions. For example,\n", + "decision support tools with predictive capabilities\n", + "– to monitor the supply chain and analyze\n", + "what-if scenarios. Demand sensing and forecasting\n", + "in combination with enterprise critical systems\n", + "(ERP) needs to combine data from a wide variety\n", + "of sources.\n", + "\n", + "10 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "Working together, combining millions of data points\n", + "from across organizations’ operations along with\n", + "other external sources, these technologies can\n", + "be used to optimize supply chains, reduce costs\n", + "and improve customer service and loyalty.\n", + "However, achieving this – embracing the latest\n", + "in AI, machine learning and predictive analytics –\n", + "means being able to manage and maintain\n", + "a flow of accurate, relevant data and to be able\n", + "to translate this data into actionable insights.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Successful supply chain optimization depends on up-to-the-minute, end-to-end visibility that can be applied across all stages of the supply chain, from design to planning to execution. This will incorporate a range of solutions that can include:\n", + "\n", + "\n", + "###### Demand, inventory, logistics\n", + "\n", + "\n", + "###### Purchasing\n", + "\n", + "**Spend analytics:** Most obviously, transparency and insight into where\n", + "cash is spent is vital for identifying opportunities to reduce external\n", + "spending across supply markets, suppliers and locations. However, spend\n", + "analytics are also hugely important to supply chain agility and resilience.\n", + "This requires a single source of data truth for finance and procurement\n", + "departments. 
For example, integrating purchase order, invoice,\n", + "accounts payable, and general-ledger account data to create a level of\n", + "transparency, visibility and consistency to inform supplier discussions\n", + "and deploy strategies to manage cash better during times\n", + "of disruption.\n", + "\n", + "###### Cross supply chain collaboration\n", + "\n", + "**Supply chain 360:** With real-time insights and aggregated supply\n", + "chain data in a single business intelligence dashboard, manufacturers\n", + "are empowered with greater levels of visibility, transparency\n", + "and insights for more informed decision-making. This dashboard\n", + "can be used to identify risks and take corrective steps,\n", + "assess suppliers, control costs and more.\n", + "\n", + "\n", + "**Demand analytics:** By collecting and analyzing millions –\n", + "if not billions – of data points about market and customer\n", + "behavior and product performance, manufacturers can\n", + "use this understanding to improve operations and support\n", + "strategic decisions that affect the demand of products\n", + "and services. [Around 80% say that using this form of data](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[analysis has improved decision-making, while 26% say](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[having this level of know-how to predict, shape and meet](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "[demands has increased their profits.](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n", + "\n", + "**Inventory visibility and tracking:**\n", + "Inventory visibility is the ability to view and track\n", + "inventory in real time, with insights into SKU stock levels\n", + "and which warehouse or fulfillment center it is stored at.\n", + "With complete oversight of inventory across multiple\n", + "channels, this helps improve supply chain efficiency,\n", + "demand forecasting and order accuracy, while ultimately\n", + "enhancing the customer experience.\n", + "\n", + "\n", + "**Inventory optimization:** The practice of having the right\n", + "amount of available inventory to meet demand, both in the\n", + "present and the future, enables manufacturers to address\n", + "demand expectations, and reduce the costs of common\n", + "inventory issues. Inventory optimization incorporates\n", + "data for demand forecasting, inventory strategy and\n", + "stock replenishment. With the addition of AI reinforced\n", + "learning models, this can help improve demand prediction,\n", + "recommend stock levels, and automatically order\n", + "raw materials to fulfill orders, while also detecting\n", + "and responding to shifts in demand.\n", + "\n", + "**Logistics route optimization:** Using AI, route optimization\n", + "can help manufacturers go beyond normal route planning\n", + "and include parameters to further drive logistics efficiency.\n", + "What-if scenarios present route options that help cut\n", + "transportation costs, boost productivity and execute\n", + "on-time deliveries.\n", + "\n", + "\n", + "**Supply chain network design:** By building and modeling the supply\n", + "chain, it enables manufacturers to understand the costs and time\n", + "to bring goods and services to market. 
Supply chain network design\n", + "helps to evaluate delivery at the lowest possible cost, optimal sources\n", + "and inventory deployment, as well as define distribution strategies.\n", + "\n", + "11 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "[Successfully implementing AI-enabled supply](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n", + "[chain management has enabled early adopters to](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n", + "##### improve logistics costs by 15%, inventory levels by 35%, and service levels by 65%\n", + "\n", + " Only 6% of companies believe\n", + "[they’ve achieved full supply chain visibility](https://www.supplychaindive.com/news/supply-chain-visibility-failure-survey-geodis/517751/\r)\n", + "\n", + "##### 57% believe that supply chain management \n", + "[gives them a competitive edge that enables them](https://financesonline.com/supply-chain-statistics/\r)\n", + "[to further develop their business](https://financesonline.com/supply-chain-statistics/\r)\n", + "\n", + "### Supply chain optimization case study\n", + "\n", + "##### Watch our case study\n", + "\n", + "12 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 3\n", + "\n", + "## Future opportunities for manufacturing business models\n", + "\n", + "##### When looking at the rapid evolution and growth of e-commerce, manufacturers have some catching up to do. Particularly when it comes to embracing new and customer-centric business models. For example, when shifting from a product to a service mindset, the product lifecycle becomes more holistic and the client relationship is maintained beyond the point of purchase.\n", + "\n", + "\n", + "These new opportunities are forming part\n", + "of a longer-term industry shift from the sale\n", + "of goods (CapEx) to recurring revenue streams,\n", + "such as through Equipment-as-a-Service (EaaS)\n", + "models. While this approach is not new to many\n", + "(Rolls-Royce’s “Power-by-the-Hour” engine\n", + "subscription model has been around since 1962),\n", + "customer demand, advances in industrial IoT\n", + "technology, and a continuing decline in\n", + "sales and margins have seen EaaS emerge\n", + "as an imperative for manufacturers.\n", + "\n", + "\n", + "Opening up some of these new revenue streams,\n", + "of course, demands operational flexibility, but more\n", + "importantly, digital maturity. This means cloud\n", + "technologies that allow employees new levels\n", + "of access to data, the ability to work anywhere,\n", + "and adapt rapidly to new needs. 
The introduction\n", + "of a microservices architecture, to allow the agile\n", + "development and deployment of new IT services.\n", + "And the democratization of data, so the entire\n", + "organization and its ecosystem of partners\n", + "and suppliers have access to information\n", + "about market demand, operations, production,\n", + "logistics and transportation.\n", + "\n", + "\n", + "13 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "##### By 2023, 20% of industrial equipment manufacturers will\n", + "[support EaaS with remote](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n", + "[Industrial IoT capabilities](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n", + "\n", + "##### In 2025, the global EaaS market is estimated\n", + "[to grow to $131B compared](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n", + "[to $22B in 2019](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n", + "\n", + "##### In the U.S., 34% said\n", + "[pay-per-use models represent](https://relayr.io/pr-forsa-survey/)\n", + "[a big or a very big competitive](https://relayr.io/pr-forsa-survey/)\n", + "[advantage, while 29% consider](https://relayr.io/pr-forsa-survey/)\n", + "[it a slight advantage](https://relayr.io/pr-forsa-survey/)\n", + "\n", + "### Equipment as a service case study\n", + "\n", + "##### Read our case study\n", + "\n", + "\n", + "### This level of visibility and collaboration is not only beneficial to lower maintenance costs, capital expenditure and human capital management, but also in empowering all stakeholders to make smarter and more informed decisions.\n", + "\n", + "\n", + "###### Connected assets\n", + "\n", + "The digital connectivity of high-value\n", + "physical assets is helping to drive a\n", + "more efficient use of assets and cost\n", + "savings. Connected assets can provide\n", + "continuous, real-time data on their\n", + "operating conditions, even if they are on\n", + "the other side of the world. Connected\n", + "assets can also be used as the foundation\n", + "of as-a-service business models to\n", + "track the usage of rented machines, and\n", + "for automakers to use with connected\n", + "vehicles and electrification strategies.\n", + "\n", + "\n", + "###### Quality event detection and traceability\n", + "\n", + "Manufacturers are increasingly seeking\n", + "end-to-end supply chain traceability —\n", + "to be able to identify and trace\n", + "the history, distribution, location\n", + "and application of products, parts\n", + "and materials. With event-based\n", + "traceability, typically using blockchain\n", + "ledgers, manufacturers can record\n", + "events along the supply chain.\n", + "This can help aid legal compliance,\n", + "support quality assurance and brand\n", + "trust, and provide full supply chain\n", + "visibility for better risk management.\n", + "\n", + "\n", + "###### Demand-driven manufacturing\n", + "\n", + "**Equipment-as-a-Service:**\n", + "Startup organizations without\n", + "the in-house infrastructure can\n", + "use a third-party to realize their\n", + "concepts, while manufacturers\n", + "with the production capabilities\n", + "can ensure minimal downtime\n", + "for their assets. 
This involves\n", + "greater risk for the manufacturer,\n", + "but also the potential for higher\n", + "and annuitized revenues.\n", + "\n", + "\n", + "14 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Driving Force No. 4\n", + "\n", + "## The focus on sustainability\n", + "\n", + "##### It’s an inescapable truth that Earth’s resources are finite, and we need to change our present, linear business models for something that minimizes our use of resources and eliminates waste. Manufacturers need to take a more sustainable approach, where they can limit their negative environmental impacts, while also conserving energy and natural resources.\n", + "\n", + "\n", + "When looking at the entire manufacturing\n", + "value chain, there are many areas where\n", + "more sustainable practices can deliver\n", + "measurable change. Products can be\n", + "designed in a way that reduces waste\n", + "and increases their longevity; materials\n", + "can be selected and sourced in a more\n", + "ethical way; operational efficiency and\n", + "green energy can improve production;\n", + "and the introduction of sustainable\n", + "practices for transportation and\n", + "shipping can help reduce carbon\n", + "footprints. [These are part of the move](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[toward more circular business models](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[and establishing what PwC has called the](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[four Rs of the circular economy: Reduce,](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "[Refurbish/Reuse, Recycle and Recover.](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n", + "\n", + "\n", + "There are a number of business\n", + "operating models that employ the four\n", + "Rs and support the circular economy.\n", + "Sharing platforms and aaS models help\n", + "optimize manufacturing capacity and\n", + "enable businesses to rent rather than\n", + "buy the machinery and equipment\n", + "they need. Product use extension helps\n", + "extend the lifecycle of products through\n", + "repair and refurbishment, while resource\n", + "recovery means recovering raw materials\n", + "from end-of-life products.\n", + "\n", + "Achieving this means establishing\n", + "a redesigned supply chain that\n", + "leverages many use cases, technologies\n", + "and solutions we covered earlier.\n", + "\n", + "\n", + "It will require greater levels of\n", + "collaboration between suppliers\n", + "and vendors. 
It will require optimizing\n", + "production lines and transportation.\n", + "It will require greater levels of customer\n", + "engagement to extend product lifecycles\n", + "and close the loop of the supply chain.\n", + "\n", + "But most of all, it will require data,\n", + "to provide visibility and intelligence\n", + "across the network, and to be able\n", + "to make the decisions to improve\n", + "efficiency in the present, as well as\n", + "longer-term decisions based on a\n", + "broad view of sustainability impacts.\n", + "\n", + "\n", + "15 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "### Sustainability Solution Accelerator\n", + "\n", + "##### Read now\n", + "\n", + "\n", + "[The manufacturing industry alone](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)\n", + "[is responsible for](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[54% of the](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n", + "##### world’s energy consumption\n", + "[and](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[20% of carbon emissions](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n", + "\n", + "\n", + "##### 80% of the world’s leading companies \n", + "[are now incorporating sustainability](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n", + "[into their operations and goals](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n", + "\n", + "\n", + "##### 78% of industrial, manufacturing and metals organizations now report on sustainability — up from 68% in 2017\n", + "\n", + "\n", + "16 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Leveraging the Databricks Lakehouse for Manufacturing\n", + "\n", + "Our open, simple and collaborative Lakehouse for Manufacturing enables automotive, electronics, industrial,\n", + "and transportation & logistics organizations to unlock more value and transform how they use data and AI.\n", + "\n", + "\n", + "All your sources Any structure or frequency\n", + "\n", + "\n", + "Reliable, real-time processing Analytics capabilities for any use case or persona\n", + "\n", + "\n", + "Competitor News\n", + "& Social\n", + "\n", + "Consumer Devices\n", + "\n", + "Video & Images\n", + "\n", + "IoT & Shop Floor\n", + "\n", + "Enterprise Resource\n", + "Planning\n", + "\n", + "Sales Transaction\n", + "& Syndicated\n", + "\n", + "Inventory & Logistics\n", + "\n", + "\n", + "Unstructured batch\n", + "\n", + "\n", + "Ad Hoc Data Science\n", + "\n", + "Low-cost, rapid experimentation\n", + "with new data and models.\n", + "\n", + "Production Machine Learning\n", + "\n", + "High volume, fine-grained analysis at scale\n", + "served in the tightest of service windows.\n", + "\n", + "BI Reporting and Dashboarding\n", + "\n", + "Power real-time dashboarding directly,\n", + "or feed data to a data warehouse for\n", + "high-concurrency reporting.\n", + "\n", + "Real-Time Applications\n", + "\n", + "\n", + "Lakehouse enables a real-time\n", + "data-driven business with the ability\n", + "to ingest structured, semi-structured\n", + "and unstructured data from ERP,\n", + "SCM, IoT, social or other sources\n", + "in your value chain so that predictive\n", + "AI and ML insights 
can be realized.\n", + "This enables them to operate their\n", + "business in real time, deliver more\n", + "accurate analytics that leverage all\n", + "their data, and drive collaboration\n", + "and innovation across their value\n", + "chain. Most important for capital\n", + "intensive manufacturing business,\n", + "it enables them to move quickly\n", + "from proof-of-concept (PoC)\n", + "ideation to ROI quickly.\n", + "\n", + "\n", + "Semi-structured real-time\n", + "\n", + "Unstructured batch\n", + "\n", + "Semi-structured real-time\n", + "\n", + "Structured real-time\n", + "\n", + "Structured batch\n", + "\n", + "Structured real-time\n", + "\n", + "\n", + "Data Lakehouse\n", + "\n", + "Process, manage, and\n", + "query all your data.\n", + "\n", + "Any cloud\n", + "\n", + "\n", + "Provide real-time data to downstream\n", + "applications or power applications via APIs.\n", + "\n", + "\n", + "17 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## The building blocks of Lakehouse for Manufacturing\n", + "\n", + "\n", + "###### Real Time\n", + "\n", + "Make data-informed decisions\n", + "\n", + "\n", + "###### Solution Accelerators\n", + "\n", + "Accelerate the possibilities\n", + "of capabilities\n", + "\n", + "\n", + "###### Partner Solutions\n", + "\n", + "Accelerate the\n", + "creation of insights\n", + "\n", + "\n", + "###### Speed\n", + "\n", + "Delivering fast ROI\n", + "\n", + "\n", + "**Real-time data to make informed**\n", + "**decisions:** The Lakehouse Platform\n", + "streamlines data ingestion and\n", + "management in a way that makes it easy\n", + "to automate and secure data with fast,\n", + "real-time performance. This means you\n", + "can consolidate and enhance data from\n", + "across the organization and turn it into\n", + "accessible, actionable insights.\n", + "\n", + "\n", + "**Solution Accelerators for new**\n", + "**capabilities:** Through our Solution\n", + "Accelerators, manufacturers can\n", + "easily access and deploy common and\n", + "high-impact use cases. For manufacturers\n", + "restricted by time and resources, these\n", + "accelerators provide the tools and\n", + "pre-built code to deliver PoCs in\n", + "less than two weeks.\n", + "\n", + "\n", + "**Pre-built applications to deliver**\n", + "**solutions faster:** We make it easy\n", + "for you to discover data, analytics\n", + "and AI tools, using pre-built integrations\n", + "to connect with partner solutions,\n", + "integrating them (and existing solutions)\n", + "into the Lakehouse Platform to rapidly\n", + "expand capabilities in a few clicks.\n", + "\n", + "\n", + "**The speed to deliver fast ROI:**\n", + "With faster data ingestion and access\n", + "to insights combined with easier, quicker\n", + "deployments, this means accelerated\n", + "digital transformation and higher ROI.\n", + "\n", + "\n", + "18 Four Forces Driving Intelligent Manufacturing\n", + "\n", + "\n", + "-----\n", + "\n", + "## Manufacturers’ end goals\n", + "\n", + "##### Intelligent Manufacturing leaders leverage a combination of familiar manufacturing techniques and recent value producing and differentiating use of data-led use cases.\n", + "\n", + "This means making use of IIoT, cloud computing, data analytics,\n", + "machine learning and more to create an end-to-end digital ecosystem\n", + "across the entire value chain and build scalable architectures\n", + "that take data from edge to AI. 
It means embracing automation\n", + "and robotics, optimizing how organizations use assets and\n", + "augmenting the capabilities of workforces, and introducing new\n", + "levels of connectivity to accelerate performance. Not to mention\n", + "opening the door to new platform and as-a-service business models\n", + "with the potential to generate new revenue streams.\n", + "\n", + "Also key to the data-driven transformation of manufacturing is visibility:\n", + "a 360-degree, end-to-end view of the supply chain. Not only is this\n", + "critical for the efficiency, optimization and profitability of operations,\n", + "it is needed to be able to take new strides in sustainability.\n", + "\n", + "Of course, better data management is not only about unlocking\n", + "insight, empowering AI, and enabling decision-making. It’s also about\n", + "governance: acknowledging format issues, adhering to compliance,\n", + "protecting IP, ensuring data security. All this needs to be taken into\n", + "consideration when bringing an ISV on board to establish a modern,\n", + "unified architecture for data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide —\n", + "including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on\n", + "the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n", + "is headquartered in San Francisco, with offices around the globe. Founded by\n", + "the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a\n", + "mission to help data teams solve the world’s toughest problems. To learn more,\n", + "follow Databricks on [Twitter](https://twitter.com/databricks), [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc).\n", + "\n", + "Get started with a free trial of Databricks and\n", + "start building data applications today\n", + "\n", + "##### Start your free trial\n", + "\n", + "To learn more, visit us at:\n", + "\n", + "**[Databricks for Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf2024-09-19T16:57:21Z
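The Lakehouse ingestion flow sketched in the section above (streaming structured, semi-structured and unstructured sources such as ERP, SCM and IoT data into one queryable layer) can be illustrated with a few lines of PySpark. This is a minimal sketch rather than a reference implementation: the `spark` session is assumed to come from a Databricks notebook, Auto Loader (`cloudFiles`) is Databricks-specific, and the volume paths and table name below are hypothetical placeholders.

```python
# Minimal sketch: stream raw shop-floor/IoT JSON events into a bronze Delta table.
# All paths and table names here are illustrative placeholders.
from pyspark.sql import functions as F

raw_events = (
    spark.readStream.format('cloudFiles')              # Databricks Auto Loader
    .option('cloudFiles.format', 'json')               # semi-structured source files
    .option('cloudFiles.schemaLocation', '/Volumes/main/manufacturing/_schemas/iot')
    .load('/Volumes/main/manufacturing/raw_iot/')      # landing zone for sensor feeds
)

(
    raw_events
    .withColumn('ingested_at', F.current_timestamp())  # capture ingestion time
    .writeStream.format('delta')
    .option('checkpointLocation', '/Volumes/main/manufacturing/_checkpoints/iot_bronze')
    .outputMode('append')
    .toTable('main.manufacturing.iot_bronze')          # one table usable by SQL, BI and ML
)
```

Once the data lands in a Delta table, the same records can feed BI dashboards, predictive-maintenance models or real-time applications, which is the point the Lakehouse diagram above is making.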
## Driving Innovation and Transformation in the Federal Government With Data + AI\n", + "\n", + "Empowering the federal government\n", + "to efficiently deliver on mission objectives\n", + "and better serve citizens\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "State of the union: Data and AI in the federal government **03**\n", + "\n", + "Recognizing the opportunity for data and AI **04**\n", + "\n", + "Challenges to innovation **07**\n", + "\n", + "The Databricks Lakehouse Platform: Modernizing the federal government to achieve mission objectives **09**\n", + "\n", + "Customer story: U.S. Citizenship and Immigration Services **13**\n", + "\n", + "Conclusion **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "### State of the union: Data and AI in the federal government\n", + "\n", + "For the private sector, the growth, maturation and application of data analytics and\n", + "\n", + "artificial intelligence (AI) have driven innovation. This has resulted in solutions that have\n", + "\n", + "helped to improve efficiencies in everything from optimizing supply chains to accelerating\n", + "\n", + "drug development to creating personalized customer experiences and much more.\n", + "\n", + "Unfortunately, the federal government and many of its agencies are just beginning to take\n", + "\n", + "advantage of the benefits that data, analytics and AI can deliver. This inability to innovate\n", + "\n", + "is largely due to aging technology investments, resulting in a sprawl of legacy systems\n", + "\n", + "siloed by agencies and departments.\n", + "\n", + "Additionally, the government is one of the largest employers in the world, which introduces\n", + "\n", + "significant complexity, operational inefficiencies and a lack of transparency that limit the\n", + "\n", + "ability of its agencies to leverage the data at their disposal for even basic analytics – let\n", + "\n", + "alone advanced data analytic techniques, such as machine learning.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Recognizing the opportunity for data and AI\n", + "\n", + "The opportunity for the federal government to leverage data analytics and AI cannot be\n", + "\n", + "overstated. With access to some of the largest current and historical data sets available to the\n", + "\n", + "\n", + "United States — and with vast personnel resources and some of the best private sector use\n", + "\n", + "cases and applications of AI available in the world — the federal government has the ability to\n", + "\n", + "transform the efficiency and effectiveness of many of its agencies.\n", + "\n", + "In fact, the federal government plans to spend $4.3 billion in artificial intelligence research and\n", + "\n", + "development across agencies in fiscal year 2023, according to a recent report from Bloomberg\n", + "\n", + "Government. 
These priorities are validated by a recent Gartner study of government CIOs\n", + "\n", + "across all levels (including state and local), confirming that the top game-changing technologies\n", + "\n", + "are AI, data analytics and the cloud.\n", + "\n", + "And as an indication of the potential impact, a recent study by Deloitte shows the government\n", + "\n", + "can save upward of $3 billion annually on the low end to more than $41 billion annually on the\n", + "\n", + "high end from data-driven automation and AI.\n", + "\n", + "Sources:\n", + "\n", + "[• Gartner Survey Finds Government CIOs to Focus Technology Investments on Data Analytics and Cybersecurity in 2019](https://www.gartner.com/en/newsroom/press-releases/2019-01-23-gartner-survey-finds-government-cios-to-focus-technol)\n", + "\n", + "[• Administration Projects Agencies Will Spend $1 Billion on Artificial Intelligence Next Year](https://www.nextgov.com/emerging-tech/2019/09/administration-projects-agencies-will-spend-1-billion-artificial-intelligence-next-year/159781/)\n", + "\n", + "\n", + "Investment in AI to\n", + "\n", + "automate repetitive tasks\n", + "\n", + "can improve efficiencies\n", + "\n", + "across government agencies,\n", + "\n", + "which could save **96.7**\n", + "#### million federal hours annually, with a potential\n", + "\n", + "savings of **$3.3 billion.**\n", + "\n", + "**WILLIAM EGGERS, PETER VIECHNICKI**\n", + "\n", + "**AND DAVID SCHATSKY**\n", + "\n", + "[Deloitte Insights](https://www2.deloitte.com/us/en/insights/focus/cognitive-technologies/artificial-intelligence-government.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**An increased focus on cloud, analytics and AI = operational efficiency**\n", + "\n", + "1. AI/ML\n", + "2. Data Analytics\n", + "3. Cloud\n", + "\n", + "**$1B** **TOP PRIORITIES** **$41B+**\n", + "\n", + "Data and AI Research and Government CIOs’ top Estimated government\n", + "Development Initiative game-changing technologies savings from data-driven\n", + "automation\n", + "\n", + "**U.S. 
Government**\n", + "\n", + "Fortunately, the President’s Management Agenda (PMA) has recognized the need to\n", + "\n", + "modernize their existing infrastructure, federate data for easier access and build more\n", + "\n", + "advanced data analytics capabilities by establishing mandates for modernization, data\n", + "\n", + "openness and the progression of AI innovations.\n", + "\n", + "\n", + "**IT Modernization Act**\n", + "\n", + "Allows agencies to invest\n", + "\n", + "in modern technology\n", + "\n", + "solutions to improve\n", + "\n", + "service to the public,\n", + "\n", + "secure sensitive systems\n", + "\n", + "and data, and save\n", + "\n", + "taxpayer dollars.\n", + "\n", + "\n", + "**Federal Data Strategy**\n", + "\n", + "A 10-year vision for how\n", + "\n", + "the federal government will\n", + "\n", + "accelerate the use of data to\n", + "\n", + "achieve its mission, serve the\n", + "\n", + "public and steward resources,\n", + "\n", + "while protecting security,\n", + "\n", + "privacy and confidentiality.\n", + "\n", + "\n", + "**AI Executive Order**\n", + "\n", + "Makes AI a top research\n", + "\n", + "and development priority for\n", + "\n", + "federal agencies, provides\n", + "\n", + "a shared ethics framework\n", + "\n", + "for developing and using AI,\n", + "\n", + "and expands job rotation\n", + "\n", + "programs to increase\n", + "\n", + "the number of AI experts\n", + "\n", + "at agencies.\n", + "\n", + "\n", + "This will put agencies in a better position to leverage the scale of the cloud and democratize\n", + "\n", + "secure access to data in order to enable downstream business intelligence and AI use cases.\n", + "\n", + "\n", + "The end result will be transformative innovation that can not only improve the operational\n", + "\n", + "efficiencies of each agency, but also support the delivery of actionable insights in real time\n", + "\n", + "\n", + "for more informed decision-making. 
This benefits citizens in the form of better services,\n", + "\n", + "stronger national security and smarter resource management.\n", + "\n", + "\n", + "-----\n", + "\n", + "Top data and AI use cases in the government\n", + "\n", + "\n", + "**H E A LT H C A R E**\n", + "\n", + "Improve the delivery and quality of healthcare services for citizens with powerful analytics and a 360°\n", + "\n", + "view of patients.\n", + "\n", + "- Patient 360 - Insurance management\n", + "\n", + "- Population health - Genomics\n", + "\n", + "- Supply chain optimization - Drug discovery and delivery\n", + "\n", + "\n", + "Across the federal government, data and AI is providing the insights and predictive\n", + "\n", + "capabilities to thwart cyberattacks and national threats, provide better social services more\n", + "\n", + "efficiently, and improve the delivery and quality of healthcare services.\n", + "\n", + "**H O M E L A N D S E C U R I T Y**\n", + "\n", + "\n", + "Detect and prevent criminal activities and national threats with real-time analytics and data-driven\n", + "\n", + "decision-making.\n", + "\n", + "\n", + "\n", + "- Customs and border protection - Counter-terrorism\n", + "\n", + "- Immigration and citizenship - Federal emergency aid management\n", + "\n", + "**D E F E N S E**\n", + "\n", + "\n", + "**E N E R G Y**\n", + "\n", + "Improve energy management with data insights that ensure energy resiliency and sustainability.\n", + "\n", + "- Security of energy infrastructure - Energy exploration\n", + "\n", + "- Smarter energy management - Electrical grid reliability\n", + "\n", + "\n", + "Apply the power of predictive analytics to geospatial, IoT and surveillance data to improve operations\n", + "\n", + "\n", + "**C O M M E R C E**\n", + "\n", + "Proactively detect anomalies with machine learning to mitigate risk and prevent fraudulent activity.\n", + "\n", + "- Tax fraud and collection - Grants management\n", + "\n", + "- Process and operations management - Customer 360\n", + "\n", + "**I N T E L L I G E N C E C O M M U N I T Y**\n", + "\n", + "Leverage real-time insights to make informed decisions that can impact the safety of our citizens and\n", + "\n", + "the world.\n", + "\n", + "- Threat detection - Intelligence surveillance and reconnaissance\n", + "\n", + "- Neutralize cyberattacks - Social media analytics\n", + "\n", + "\n", + "and protect the nation.\n", + "\n", + "- Logistics - Surveillance and reconnaissance\n", + "\n", + "- Predictive maintenance - Law enforcement and readiness\n", + "\n", + "\n", + "-----\n", + "\n", + "### Challenges to innovation\n", + "\n", + "The opportunity to drive innovation throughout the federal government is massive and\n", + "\n", + "has implications for every U.S. citizen. But there are several critical barriers preventing\n", + "\n", + "\n", + "Ten of the existing legacy systems\n", + "most in need of modernization\n", + "cost about **$337 million a year**\n", + "to operate and maintain.\n", + "\n", + "\n", + "agencies from making the progress needed to realize the value of their data and delivering\n", + "\n", + "those innovations.\n", + "\n", + "**THE GOVERNMENT ACCOUNTABILITY OFFICE,**\n", + "\n", + "**INFORMATION TECHNOLOGY REPORT TO CONGRESS, JUNE 2019**\n", + "\n", + "The complexities and impact of legacy data warehouses and marts\n", + "\n", + "Multiple federal agencies are burdened with a legacy IT infrastructure that is being left\n", + "\n", + "\n", + "behind by the technological advancements seen in the private sector. 
This infrastructure\n", + "\n", + "is traditionally built with on-premises data warehouses and data marts that are highly\n", + "\n", + "complex to maintain, costly to scale as compute is coupled with storage, limited from a\n", + "\n", + "data science perspective, and they lack support for the growing volumes of unstructured\n", + "\n", + "data. This inhibits data-driven innovation and blocks the use of AI, leaving agencies to\n", + "\n", + "search for data science tools to fill the gaps.\n", + "\n", + "Infrastructure also becomes harder and more expensive to maintain as it ages. Over time,\n", + "\n", + "these environments become more complex due to their need for specialized patches and\n", + "\n", + "updates that keep these systems available while doing nothing to solve the issues of poor\n", + "\n", + "interoperability, ever-decreasing processing speeds, and an inability to scale – all of which\n", + "\n", + "are critically necessary to support today’s more data-intensive use cases. For example,\n", + "\n", + "systems at the departments of Education, Health and Human Services, Treasury, and Social\n", + "\n", + "Security are over 40 years old.¹ This is causing pain in a variety of areas.\n", + "\n", + "\n", + "often requires significant customization and, even then, there is still a chance that the final\n", + "\n", + "integration won’t be successful. These systems also keep personnel from spending their\n", + "\n", + "energy and resources on emerging technologies such as AI.\n", + "\n", + "And data reliability is a big concern. Replication of data occurs across data marts as\n", + "\n", + "various teams try to access and explore it, creating data management and governance\n", + "\n", + "challenges. Without a single source of truth, teams struggle with data inconsistencies,\n", + "\n", + "which can result in inaccurate analysis and model performance that is only compounded\n", + "\n", + "over time.\n", + "\n", + "Thankfully, there are initiatives in place, such as the Data Center and Cloud Optimization\n", + "\n", + "Initiative Program Management Office (DCCOI PMO), which are investing in modernizing IT\n", + "\n", + "infrastructure for federal agencies.²\n", + "\n", + "\n", + "Maintaining these systems requires a massive investment of both time and money\n", + "\n", + "compared to modern cloud-based systems. For the technical teams that are tasked with\n", + "\n", + "\n", + "trying to integrate any of these legacy systems with third-party tooling or services, this\n", + "\n", + "\n", + "[¹ Agencies Need to Develop Modernization Plans for Critical Legacy Systems](https://www.gao.gov/assets/gao-19-471.pdf)\n", + "\n", + "[² IT Modernization](https://www.gsa.gov/technology/government-it-initiatives/data-center-optimization-initiative-dcoi)\n", + "\n", + "\n", + "-----\n", + "\n", + "Data is critical … and complicated\n", + "\n", + "Data is both the greatest asset and one of the greatest challenges that federal agencies must\n", + "\n", + "learn to manage. While the volume and usefulness of data collected by federal agencies are\n", + "\n", + "not in question, much of it is locked in legacy source systems, comes in diverse structured\n", + "\n", + "\n", + "Data silos hamper any data-driven advancements\n", + "\n", + "In any data-driven organization, the need to have trusted, timely and efficient access to\n", + "\n", + "data is critical. 
For the data teams responsible for driving the digital transformation of\n", + "\n", + "federal agencies, the challenges they face are myriad.\n", + "\n", + "\n", + "and unstructured formats, and is subject to a variety of governance models.\n", + "\n", + "We have already seen how existing, legacy infrastructure, as well as the integration of\n", + "\n", + "\n", + "Not only is this data siloed and very difficult to integrate, but the data volumes collected\n", + "\n", + "by federal agencies are massive. At Health and Human Services, for example, or the\n", + "\n", + "Department of Veterans Affairs, healthcare data sets will be sized by population and include\n", + "\n", + "electronic health records, clinical data, imaging and more. For the Department of Defense\n", + "\n", + "\n", + "fragmented data sources, will strain data engineering teams trying to deliver high-quality\n", + "\n", + "data at scale. Their challenge includes developing the right data pipelines that will take\n", + "\n", + "the massive volumes of raw data coming from fragmented sources into one centralized\n", + "\n", + "location with clean, secure and compliant data for agency decision-makers.\n", + "\n", + "\n", + "and the Department of Homeland Security, data includes everything from mapping, satellite\n", + "\n", + "\n", + "Data scientists and analysts alike must have the right toolset to collaboratively investigate,\n", + "\n", + "extract and report meaningful insights from this data. Unfortunately, data silos extend\n", + "\n", + "to organizational silos, which make collaboration inside an agency as well as between\n", + "\n", + "agencies very difficult. With different groups of data teams leveraging their own coding\n", + "\n", + "and analytical tools, communicating insights and working across teams — let alone\n", + "\n", + "across agencies — is almost impossible. This lack of collaboration can drastically limit\n", + "\n", + "the capabilities of any data analytics or AI initiatives — from the deployment of shared\n", + "\n", + "business intelligence (BI) reports and dashboards for data investigation and decision-\n", + "\n", + "making to the training of machine learning models to automate processes and make\n", + "\n", + "predictions. Compounding these challenges is an overall lack of data science expertise and\n", + "\n", + "skills within federal agencies. As a result, even with access to their data, without intuitive\n", + "\n", + "tooling it’s very difficult to deliver advanced analytic use cases with ML and AI.\n", + "\n", + "Organizational silos also impact the effectiveness of data analysts, who are responsible\n", + "\n", + "for analyzing and reporting insights from the data to better inform subject-matter experts\n", + "\n", + "or policy — and decision-makers. Without a data platform that eliminates these silos and\n", + "\n", + "enables visualization of and reporting on shared data, data analysts will be limited in how\n", + "\n", + "they are able to drive the organizational and policy agendas of their respective agencies.\n", + "\n", + "\n", + "imagery and intelligence data to payroll and human resources data. 
The Social Security\n", + "\n", + "Administration and Internal Revenue Service manage personal data for every single citizen in\n", + "\n", + "the United States.\n", + "\n", + "Combining these various forms of data from disparate legacy systems that are not\n", + "\n", + "integrated — and doing it across different government agencies and departments — can be\n", + "\n", + "slow and error prone, hindering downstream analytics and actionable insights. The teams\n", + "\n", + "that are responsible for this are faced with not only integrating these data sources, but also\n", + "\n", + "managing the entire ETL workflow in order to enable the application of basic analytics, let\n", + "\n", + "alone machine learning and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**THE DATABRICKS LAKEHOUSE PLATFORM:**\n", + "### Modernizing the federal government to achieve mission objectives\n", + "\n", + "\n", + "Databricks provides federal agencies with a Lakehouse Platform that combines the best of data warehouses and data\n", + "\n", + "lakes — to store and manage all your data for all your analytics workloads. Databricks federates all data and democratizes\n", + "\n", + "access for downstream use cases, empowering federal agencies to unlock the full potential of their data to deliver on\n", + "\n", + "their mission objectives and better serve citizens.\n", + "\n", + "\n", + "Federal agencies that are\n", + "powering impactful innovations\n", + "with Databricks Lakehouse\n", + "\n", + "\n", + "Lakehouse offers a single solution for all major data workloads, whether structured or unstructured, and supports use\n", + "\n", + "\n", + "cases from streaming analytics to BI, data science and AI.\n", + "\n", + "\n", + "Using predictive\n", + "analytics for better\n", + "passenger safety and\n", + "experience\n", + "\n", + "Enabling operational\n", + "efficiencies through\n", + "process automation\n", + "to streamline the path\n", + "to citizenship\n", + "\n", + "\n", + "All your\n", + "government data\n", + "\n", + "\n", + "Reliable, Analytics capabilities\n", + "real-time processing for every use case\n", + "\n", + "AD HOC\n", + "DATA SCIENCE\n", + "\n", + "\n", + "Health\n", + "\n", + "Surveillance\n", + "\n", + "Social Security\n", + "\n", + "Demographics\n", + "\n", + "Crime\n", + "\n", + "Audio/Visual\n", + "\n", + "Geospatial\n", + "\n", + "\n", + "Structured batch\n", + "\n", + "Unstructured stream\n", + "\n", + "Structured batch\n", + "\n", + "Structured batch\n", + "\n", + "Unstructured batch\n", + "\n", + "Unstructured stream\n", + "\n", + "Unstructured stream\n", + "\n", + "\n", + "PRODUCTION\n", + "MACHINE LEARNING\n", + "\n", + "\n", + "**DATA LAKEHOUSE**\n", + "\n", + "Process, manage\n", + "and query all your data\n", + "\n", + "\n", + "BI REPORTING AND\n", + "SCORECARDING\n", + "\n", + "\n", + "Leveraging advanced\n", + "analytics to improve\n", + "outcomes for patients\n", + "through Medicare and\n", + "Medicaid services\n", + "\n", + "\n", + "The Databricks Lakehouse Platform has three unique characteristics that address head-on the biggest challenges that\n", + "\n", + "federal agencies are facing:\n", + "\n", + "\n", + "It offers simplicity with regard to data\n", + "\n", + "management, in that the Databricks\n", + "\n", + "Lakehouse is architected to support all\n", + "\n", + "of an agency’s data workloads on one\n", + "\n", + "\n", + "It is built on open standards so\n", + "\n", + "that any existing investments\n", + "\n", + "in tooling or resources can\n", + "\n", + "remain effective\n", 
+ "\n", + "\n", + "And it’s collaborative, enabling\n", + "\n", + "agency data engineers, analysts\n", + "\n", + "and data scientists to work\n", + "\n", + "together much more easily\n", + "\n", + "\n", + "common platform\n", + "\n", + "\n", + "-----\n", + "\n", + "Managing federal data with a unified approach\n", + "\n", + "\n", + "Databricks enables aggregation and processing of massive collections of diverse and\n", + "\n", + "sensitive agency data that currently exists in silos, both structured and unstructured. As\n", + "\n", + "we’ve seen, for many agencies this would be incredibly difficult with the infrastructure\n", + "\n", + "challenges they are experiencing. The Databricks Lakehouse leverages Delta Lake to unify\n", + "\n", + "\n", + "By providing a unified data foundation for business intelligence, data science and machine\n", + "\n", + "learning, federal agencies can add reliability, performance and quality to existing data lakes\n", + "\n", + "while simplifying data engineering and infrastructure management with automation to\n", + "\n", + "simplify the development and management of data pipelines.\n", + "\n", + "\n", + "the very large and diverse amounts of data that government agencies are working with.\n", + "\n", + "Delta Lake is an open format, centralized data storage layer that delivers reliability, security\n", + "\n", + "and performance — for both streaming and batch operations.\n", + "\n", + "The Lakehouse Platform combines the best elements of data lakes and data warehouses — delivering the data management and performance\n", + "typically found in data warehouses with the low-cost, flexible object stores offered by data lakes\n", + "\n", + "\n", + "-----\n", + "\n", + "Break down the institutional silos limiting collaboration\n", + "\n", + "Foster collaboration at every step with the latest machine learning tools that allow everyone\n", + "\n", + "to work and build value together — from data scientists to researchers to business\n", + "\n", + "decision-makers. Close the glaring skills gap within these government organizations by\n", + "\n", + "providing tooling that simplifies the ML lifecycle and empowers the data teams that do not\n", + "\n", + "have the data science expertise to still be productive with their data through integrating BI\n", + "\n", + "tools and SQL analytics capabilities.\n", + "\n", + "Empower data scientists with an intuitive and interactive workspace where they can easily\n", + "\n", + "collaborate on data, share models and code, and manage the entire machine learning\n", + "\n", + "lifecycle in one place. Databricks notebooks natively support Python, R, SQL and Scala so\n", + "\n", + "practitioners can work together with the languages and libraries of their choice.\n", + "\n", + "Deliver on mission objectives with powerful analytics across agencies\n", + "\n", + "The Databricks Lakehouse Platform includes a business intelligence capability — Databricks\n", + "\n", + "SQL. Databricks SQL allows data analysts and users to query and run reports against all of\n", + "\n", + "an agency’s unified data. Databricks SQL integrates with BI tools, like Tableau and Microsoft\n", + "\n", + "Power BI, and complements any existing BI tools with a SQL-native interface, allowing data\n", + "\n", + "analysts and data scientists to query data directly within Databricks.\n", + "\n", + "Additionally, with Databricks SQL, the data team can turn insights from real-world data into\n", + "\n", + "\n", + "powerful visualizations designed for machine learning. 
Visualizations can then be turned\n", + "\n", + "into interactive dashboards to share insights with peers across agencies, policymakers,\n", + "\n", + "\n", + "Easily create visualizations and share dashboards via integrations with BI tools, like Tableau and Microsoft Power BI\n", + "\n", + "\n", + "regulators and decision-makers.\n", + "\n", + "\n", + "-----\n", + "\n", + "Ensure data security and compliance at scale\n", + "\n", + "Databricks is fully aware of the sensitivity of the data that many of our federal agencies are\n", + "\n", + "responsible for. From national security and defense data to individual health and financial\n", + "\n", + "information to national infrastructure and energy data — all of it is critical. Data is protected\n", + "\n", + "at every level of the platform through deep integration with fine-grained, cloud-provider\n", + "\n", + "access control mechanisms. The Databricks Lakehouse is a massively secure and scalable\n", + "\n", + "multicloud platform running millions of machines every day. It is independently audited\n", + "\n", + "and compliant with FedRAMP security assessment protocols on the Azure cloud and can\n", + "\n", + "provide a HIPAA-compliant deployment on both AWS and Azure clouds.\n", + "\n", + "The platform’s administration capabilities include tools to manage user access, control\n", + "\n", + "spend, audit usage, and analyze activity across every workspace, all while seamlessly\n", + "\n", + "enforcing user and data governance, at any scale.\n", + "\n", + "With complete AWS accreditation, Databricks runs across all major networks including\n", + "\n", + "GovCloud, SC2S, C2S and commercial; all networks, including public, NIPR, SIPR and JWICS;\n", + "\n", + "and ATOs, including FISMA, IL5, IL6, ICD 503 INT-A and INT-B.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CUSTOMER STORY: U.S. CITIZENSHIP AND IMMIGRATION SERVICES**\n", + "### Streamlining the path to citizenship with data\n", + "\n", + "##### 24x faster\n", + "\n", + "query\n", + "performance\n", + "\n", + "\n", + "##### 10 minutes\n", + "\n", + "to process tables\n", + "with 120 million rows\n", + "\n", + "\n", + "##### 40 million\n", + "\n", + "applications\n", + "processed\n", + "\n", + "\n", + "The U.S. Citizenship and Immigration Services (USCIS) gains actionable insights from\n", + "\n", + "dashboards via Tableau to better understand how to streamline operations and more quickly\n", + "\n", + "process immigration and employment applications as well as petitions. Today, their data\n", + "\n", + "analyst team has over 6,000 Tableau dashboards running — all powered by Databricks.\n", + "\n", + "The U.S. Citizenship and Immigration Services is the government agency that oversees\n", + "\n", + "\n", + "lawful immigration to the United States. Over the last decade, the volume of immigration-\n", + "\n", + "and citizenship-related applications has skyrocketed across naturalizations, green cards,\n", + "\n", + "employment authorizations and other categories. With millions of applications and petitions\n", + "\n", + "flooding the USCIS, processing delays were reaching crisis levels — with overall case\n", + "\n", + "processing times increasing 91% since FY2014.\n", + "\n", + "\n", + "-----\n", + "\n", + "Processing delays fueled by on-premises, legacy architecture\n", + "\n", + "Core to these issues was an on-premises, legacy architecture that was complex, slow and\n", + "\n", + "costly to scale. 
By migrating to AWS and Databricks, USCIS adopted a unified approach\n", + "\n", + "to data analytics with more big data processing power and the federation of data\n", + "\n", + "across dozens of disparate sources. This has unlocked operational efficiencies and new\n", + "\n", + "opportunities for their entire data organization to drive business intelligence and fuel ML\n", + "\n", + "innovations designed to streamline application and petition processes.\n", + "\n", + "\n", + "Removing complexities with a fully managed cloud platform\n", + "\n", + "\n", + "Since migrating to the cloud and integrating Databricks into their data analytics workflows,\n", + "\n", + "USCIS has been able to make smarter decisions that help streamline processes and\n", + "\n", + "leverage ML to reduce application processing times. These newfound efficiencies and\n", + "\n", + "capabilities have allowed them to scale their data footprint from about 30 data sources to\n", + "\n", + "75 without issue.\n", + "\n", + "\n", + "Databricks provided USCIS with significant impact where it mattered most — faster\n", + "\n", + "processing speeds that enabled data analysts to deliver timely reports to decision-\n", + "\n", + "makers — and that freed up data scientists to build ML models to help improve operations.\n", + "\n", + "Leveraging the efficiencies of the cloud and Delta Lake, they were able to easily provision a\n", + "\n", + "26-node cluster within minutes and ingest tables with 120 million rows into S3 in under 10\n", + "\n", + "minutes. Prior to Databricks, performing the same processes would have taken somewhere\n", + "\n", + "between two and three hours.\n", + "\n", + "\n", + "A new era of data-driven innovation improves operations\n", + "\n", + "USCIS now has the ability to understand their data more quickly, which has unlocked new\n", + "\n", + "opportunities for innovation. With Databricks, they are able to run queries in 19 minutes,\n", + "\n", + "something that used to take an entire day — a 24x performance gain. This means they are\n", + "\n", + "spending far less time troubleshooting and more time creating value.\n", + "\n", + "\n", + "We discovered Databricks, and\n", + "the light bulb really clicked for\n", + "us on what we needed to do\n", + "moving forward to stay relevant.\n", + "\n", + "\n", + "**SHAWN BENJAMIN**\n", + "\n", + "**CHIEF OF DATA AND BUSINESS INTELLIGENCE, USCIS**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Conclusion\n", + "\n", + "Enabling federal agencies to take advantage of data analytics and AI will help them execute\n", + "\n", + "their missions both effectively and efficiently. The Databricks Lakehouse Platform will unify\n", + "\n", + "data, analytics and AI workloads, making agencies data-driven and giving policymakers\n", + "\n", + "access to deeper, more meaningful insights for decision-making. It will also eliminate data\n", + "\n", + "silos and increase communication and collaboration across agencies to ensure the best\n", + "\n", + "results for all citizens.\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 5,000 organizations worldwide —\n", + "\n", + "including Comcast, Condé Nast, H&M, and over 40% of the Fortune 500 — rely on\n", + "\n", + "the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe. 
Founded by the original\n", + "\n", + "creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n", + "\n", + "data teams solve the world’s toughest problems.\n", + "\n", + "Get started with a free trial of Databricks and\n", + "start building data applications today\n", + "\n", + "**START YOUR FREE TRIAL**\n", + "\n", + "To learn more, visit us at: **dbricks.co/federal**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Data-AI-in-Fed-Gov-Ebook.pdf2024-09-19T16:57:19Z
**eBook**\n", + "\n", + "# Cybersecurity in Financial Services\n", + "\n", + "### Protecting financial institutions with advanced analytics and AI\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "The State of the Industry .................................................................................................................................................................................... **03**\n", + "\n", + "A New Commitment to Cybersecurity ....................................................................................................................................................... **04**\n", + "\n", + "The Biggest Challenge With Security Analytics ..................................................................................................................................... **05**\n", + "\n", + "Journey of SecOps: Destination Lakehouse ............................................................................................................................................ **06**\n", + "\n", + "Rethinking Cybersecurity in Financial Services With Databricks Lakehouse ......................................................................... **07**\n", + "\n", + "Lakehouse in Financial Services ..................................................................................................................................................................... **08**\n", + "\n", + "Lakehouse and SIEM: The Pattern for Cloud-Scale Security Operations .................................................................................. **12**\n", + "\n", + "Common Use Cases ................................................................................................................................................................................................ **14**\n", + "\n", + "Getting Started With Databricks for Cybersecurity ............................................................................................................................. **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "## The State of the Industry\n", + "\n", + "\n", + "Cloud, cost and complexity of customer data and cybersecurity are\n", + "top of mind for every financial services security leader today. As\n", + "financial services institutions (FSIs) continue to accelerate their digital\n", + "transformation, cybercriminals, fraudsters and state-sponsored actors\n", + "continue with more sophisticated threats. The impact of these attacks\n", + "ranges from the exposure of highly sensitive data to the disruption\n", + "of services and the exploitation of backdoors for future attacks — all\n", + "resulting in both financial and non-financial costs. 
Responding quickly\n", + "to potential threats requires security tools capable of analyzing billions\n", + "of threat signals in real-time.\n", + "\n", + "Recently, it seems like every week reveals a new data breach or ransomware assault,\n", + "and the cost is skyrocketing: more than $4 million per incident, up 10 percent from\n", + "2020, and about $401 million for a substantial [breach at a large corporation](https://www.ibm.com/security/data-breach) .\n", + "\n", + "\n", + "**Cybersecurity is no longer just a back-office cost and now**\n", + "**poses critical business risks, such as:**\n", + "\n", + "**•** Operational disruption\n", + "\n", + "**•** Material customer loss\n", + "\n", + "**•** Increase in insurance premiums\n", + "\n", + "**•** Lawsuits or fines\n", + "\n", + "**•** Systemic destabilization\n", + "\n", + "**•** Credit downgrade\n", + "\n", + "**•** Reputational damage\n", + "\n", + "Source: Navigating Cyber 2022, FS-ISAC, Annual Cyber Threat Review and Predictions\n", + "\n", + "\n", + "-----\n", + "\n", + "## A New Commitment to Cybersecurity\n", + "\n", + "\n", + "It comes as no surprise that in recent years FSIs have seen an amplified\n", + "commitment to cybersecurity. As business leaders look to new solutions, large\n", + "portions of IT budgets are now devoted to leveraging data and AI to thwart\n", + "cyberattacks.\n", + "\n", + "Furthermore, regulators are taking notice of the increased risk of cybersecurity\n", + "threats. Growing geopolitical tensions have also prompted federal agencies such\n", + "as the Cybersecurity and Infrastructure Security Agency and the Federal Bureau\n", + "of Investigation [to warn](https://www.wsj.com/livecoverage/russia-ukraine-latest-news-2022-04-05/card/banks-haven-t-seen-rise-in-cyberattacks-from-russia-yet-p3F5ebzAhTauVjsNx46E) that “tough sanctions imposed on Russia could prompt a\n", + "spate of cyberattacks against critical infrastructure such as banks.” Additionally,\n", + "the Securities and Exchange Commission released its [2022 Exam Priorities](https://www.sec.gov/news/press-release/2022-57) , which\n", + "include information security, and specifically “how firms are safeguarding their\n", + "customers’ records and assets from cyber threats, including oversight of thirdparty providers, identification of red flags related to identity theft, response to\n", + "incidents, including to ransomware attacks and management of operational risk in\n", + "light of ‘a dispersed workforce.’”\n", + "\n", + "However, as is often the case, implementing new cybersecurity strategies and\n", + "processes is easier said than done.\n", + "\n", + "\n", + "**Cybersecurity needs a transformation**\n", + "**... breaches, cost and complexity are growing**\n", + "\n", + "\n", + "## 100%\n", + "of organizations surveyed have had\n", + "breaches.\n", + "**The average breach costs $4M**\n", + "\n", + "## 85%\n", + "**will increase their cyber budget**\n", + "next FY. Cybersecurity industry will\n", + "grow to $366B by ‘28\n", + "\n", + "\n", + "## 67%\n", + "of organizations were **breached at**\n", + "**least three times** . 
A mega breach\n", + "costs $401M.\n", + "\n", + "**Cost, Complexity, Cloud**\n", + "\n", + "- \u0007Hundreds of tools with expanding\n", + "footprints\n", + "\n", + "- \u0007Data locked in vendor proprietary\n", + "tools\n", + "\n", + "- \u0007Humans compensating for\n", + "analytical and integration\n", + "deficiencies\n", + "\n", + "\n", + "In this eBook, we’ll take a closer look at the challenges associated with replacing\n", + "the infrastructure of a legacy data analytics system, and how financial institutions\n", + "are solving them with Databricks.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Biggest Challenge With Security Analytics\n", + "\n", + "\n", + "For many FSIs, on-premises security incident and event management (SIEM)\n", + "technologies have been the go-to solution for threat detection, analysis and\n", + "investigations. However, these legacy technologies were built for a world where big\n", + "data was measured in gigabytes, not today’s terabytes or petabytes. This means\n", + "that not only are legacy SIEMs unable to scale to today’s data volumes, but they\n", + "are also unable to serve the modern, distributed enterprise.\n", + "\n", + "By now, the advantages of moving to the cloud are no secret to anyone. For FSIs,\n", + "scalability, simplicity, efficiency and cost are absolutely essential components of\n", + "success. Many within FinServ are looking to cloud computing to make this possible,\n", + "adding detection and response in the cloud to the security team’s responsibility.\n", + "\n", + "Because legacy SIEMs predate the emergence of cloud, artificial intelligence and\n", + "machine learning (AI/ML) in the mainstream, they’re unable to address the complex\n", + "data and AI-driven analytics needed for threat detection, threat hunting, in-stream\n", + "threat intelligence enrichment, analytical automation and analyst collaboration.\n", + "\n", + "In other words, legacy SIEMs are no longer suitable for the modern enterprise or\n", + "the current threat landscape.\n", + "\n", + "\n", + "**Counting the Financial Cost of Legacy SIEMs**\n", + "\n", + "The financial cost of the continued use of legacy SIEMs continues to rise because\n", + "most SIEM providers charge their customers based on the volume of data\n", + "ingested. While some legacy technologies are available in the cloud, they’re either\n", + "not designed to be cloud-native applications or confined to a single cloud service\n", + "provider. As a result, security teams have to employ multiple tools for detection,\n", + "investigation and response — or pay exorbitant egress charges for data transiting\n", + "from one cloud provider to another. This causes operational slowdowns, errors\n", + "driven by complexity, and inconsistent implementation of security policies.\n", + "\n", + "A lack of support for multiple clouds also means an increase in maintenance\n", + "overhead. Security staff members are often stressed because analysts have to\n", + "learn different tools for different cloud platforms. For some, it also creates an\n", + "implicit cloud vendor lock-in, meaning that security teams are unable to support\n", + "missions because their tools are not portable across multiple cloud providers.\n", + "\n", + "Collectively, these drawbacks to legacy SIEMs result in a much weaker security\n", + "posture for FSIs.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Journey of SecOps: Destination Lakehouse\n", + "\n", + "How did security analytics get to this point? 
In the early days, there was a need to aggregate alerts from antiviruses and intrusion detection systems. SIEMs were born, built\n", + "on data warehouses, relational databases or NoSQL database management systems. But as incident investigation needs evolved, those data warehouses weren’t able to\n", + "handle the volume and variety of data, which led to the development of data lakes. Data lakes were cost-effective and scalable but didn’t have strong data governance and\n", + "data hygiene, earning them the moniker of “data swamps.” Simply integrating the two tech stacks is really complicated because of varying governance models, data silos\n", + "and inconsistent use case support. Fast-forward to today, security teams now need AI/ML at scale in a multicloud world.\n", + "\n", + "Why choose one or the other? The lakehouse architecture has emerged in recent years to help address these concerns with a single unified architecture for all your threat\n", + "data, analytics and AI in the cloud. The governance and transactional capabilities of the data warehouse, the scale and flexibility of a data lake, AI/ML from the ground up\n", + "and multicloud native deployments in one platform – this is a modern architecture called the lakehouse (data lake and data warehouse).\n", + "\n", + "**Current Challenges** **Introducing the Data Lakehouse**\n", + "\n", + "\n", + "**Cloud Storage**\n", + "No support for\n", + "analytics or\n", + "investigations\n", + "\n", + "**SIEMs**\n", + "No attack chaining.\n", + "Poor for high\n", + "cardinality search.\n", + "\n", + "\n", + "**UBA tools**\n", + "No historical search,\n", + "blackbox,\n", + "proprietary storage\n", + "\n", + "**No SIEM/Log**\n", + "solution is\n", + "multicloud\n", + "native\n", + "\n", + "\n", + "**Curated Alerts** **Cloud-scale**\n", + "**search**\n", + "\n", + "**ML/AI** **Multicloud**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Rethinking Cybersecurity in Financial Services With Databricks Lakehouse\n", + "\n", + "Databricks introduced the first data lakehouse platform to the industry, and today over 7,000 customers use it worldwide. With Databricks Lakehouse, FSIs that are ready to\n", + "modernize their data infrastructure and analytics capabilities for better protection against cyber threats now have one cost-effective solution that addresses the needs of\n", + "all their teams.\n", + "\n", + "The Databricks Lakehouse Platform combines the best elements of data lakes and data warehouses, delivering the low-cost, flexible object stores offered by data lakes and\n", + "the data management and performance typically found in data warehouses. This unified platform simplifies existing architecture by eliminating the data silos that traditionally\n", + "separate analytics, data science and ML. It’s built on open source, open data and open standards to maximize flexibility, and its inherent collaborative capabilities accelerate\n", + "the ability to work across teams and innovate faster. Moreover, because it’s multicloud, it works the same way no matter which cloud provider is used.\n", + "\n", + "ETL and Enrichment\n", + "\n", + "**Proof Point**\n", + "\n", + "**Firewall**\n", + "\n", + "**Antivirus**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Lakehouse in Financial Services\n", + "\n", + "By unifying data with analytics and AI, Lakehouse allows FSIs to easily access all their data for downstream advanced analytics capabilities to support complex security\n", + "use cases. 
Lakehouse facilitates collaboration between threat intelligence teams and cyber operations, enables security operations teams to detect advanced threats, and\n", + "reduces human resource burnout through analytical automation and collaboration. Importantly, Lakehouse also accelerates investigations from days to minutes.\n", + "\n", + "Along with a more modern architecture, the Lakehouse Platform includes Delta Lake, which unifies all security data in a transactional data lake to feed advanced analytics.\n", + "The analytics and collaboration are done in notebooks, and security teams can use multiple languages — SQL, Python, R and Scala — in the same notebook. This makes\n", + "it easy for security practitioners to explore data and develop advanced analytics and reporting using their favorite methods. Additionally, a separation of compute from\n", + "storage means performance at scale without impacting overall storage costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**C A S E S T U D Y**\n", + "\n", + "**When It Comes to Security, Data Is the Best Defense***\n", + "\n", + "**Protecting HSBC’s 40 million customers begins with collecting and processing data from billions**\n", + "**of signals to make previously impossible threat detection possible**\n", + "\n", + "security operation departments, creating an enhanced relationship that results\n", + "in better defenses, insight into the security posture of the organization, and the\n", + "ability to respond at the pace of the adversary.\n", + "\n", + "\n", + "The old way of thinking about security — stronger locks, higher walls — is outdated\n", + "and ineffective. “When defending an organization, too often we just focus heavily\n", + "on tools, technology, and reactive scenarios,” said T.J. Campana, managing director\n", + "of global defense and chief technology officer at HSBC, the multinational bank. “But\n", + "the security business is a data business. And the data always has a story to tell us.”\n", + "\n", + "The quality of security, he added, is proportional to the information that can be\n", + "\n", + "distilled from petabytes of data that endlessly flows through company networks.\n", + "That means “empowering people to get the right insights, in the right way to\n", + "quickly prevent, detect, and respond to threats, wherever and whenever they\n", + "occur,” said George Webster, executive director of global cybersecurity science\n", + "and analytics at HSBC.\n", + "\n", + "If a big organization is made up of tens of millions of parts that must click together\n", + "seamlessly, security keeps those seals tight. Data gathering, analytical tools, and\n", + "human intellect work together as one. This involves fusing the data science and\n", + "\n", + "\n", + "But working across years of data at petabyte scale is not an easy task, especially\n", + "when a long time is measured in minutes and the adversary is constantly working\n", + "against you. To put this in perspective, the security teams at HSBC intake 10 times\n", + "the amount of data contained in all of the books in the U.S. Library of Congress\n", + "every day, and must process months, if not years, of data at a time. That is where\n", + "innovative design, smart people, and leveraging the right technology come into\n", + "play. “We have to break the paradigm of the tool being the end goal of defense\n", + "and instead view the tools as an enabler of our people,” said Webster. 
“It is always\n", + "about the people,” added Campana.\n", + "\n", + "HSBC turned away from the common security paradigm by leveraging the big data\n", + "processing techniques from Azure Databricks. In many ways, their open source\n", + "Delta Lake is the key enabler, with Spark being the engine. Delta Lake allows these\n", + "teams to structure, optimize, and unlock data at scale, while Spark allows multiple\n", + "complex programs to seamlessly crunch through the data. This enables HSBC’s\n", + "security teams to constantly evolve their defenses, create new capabilities at\n", + "pace, and perform investigations that were previously impossible. When a new\n", + "threat emerges, the bank doesn’t have the luxury to wait for the security market to\n", + "identify, respond, and mitigate. Instead, the bank turns to its people and creates\n", + "what is needed at breathtaking speed.\n", + "\n", + "\n", + "-----\n", + "\n", + "**C A S E S T U D Y : C O N T I N U E D**\n", + "\n", + "\n", + "It’s an essential function for HSBC, which needs to continually think about how to\n", + "keep more than 40 million customers in 64 countries and territories safe. Taken\n", + "together, it’s an all-brains-on-deck moment with data and people guiding the\n", + "ship. It’s also a tall task for a company as massive and multifaceted as HSBC.\n", + "Headquartered in the UK, it is one of the largest global banks (total assets: a\n", + "whopping $2.968 trillion), with operations across Africa, Europe, Asia, and the\n", + "Americas. It’s also the largest bank in Hong Kong and even prints some of the local\n", + "currency, which bears the HSBC name.\n", + "\n", + "The bank’s cybersecurity approach involves fusing the data science and security\n", + "operation departments, creating an enhanced relationship that results in more\n", + "efficient threat discovery, rapid development of operational use cases and AI\n", + "models. This enables the continuous creation of capabilities that stop adversaries\n", + "before they even start. “We have to get out of the mindset that security is a walled\n", + "garden,” said Webster. “We must create truly collaborative environments for our\n", + "people to enable the business to operate,” said Campana.\n", + "\n", + "Staffing this symbiotic power center will be someone Campana optimistically calls\n", + "“the analyst of the future,” a description that’s both mindset and skillset: threat\n", + "hunter and data scientist.\n", + "\n", + "In addition, when another organization is hit by cybercrime, HSBC analyzes it\n", + "to understand how it may have responded and then improves its defenses\n", + "accordingly. That’s in contrast to the industry norm; a Ponemon survey revealed\n", + "\n", + "\n", + "that 47 percent of organizations have not assessed the readiness of their incident\n", + "response teams. That means the first time they test their plans will be at the worst\n", + "possible time — in the middle of a cyber attack.\n", + "\n", + "The proactive approach is a far cry from the old reactive conveyor belt model of\n", + "security when alert tickets were received from tooling and processed in a slow\n", + "and linear way. Today, cross-disciplinary security teams don’t just react; they\n", + "continually search for the signals in the noise — tiny aberrations that indicate\n", + "something’s not right – and send up red flags in real-time. “We’re scanning\n", + "hundreds of billions of signals per day. I cannot wait. 
We need situational\n", + "awareness right now,” said Campana.\n", + "\n", + "That increased speed is critical for threat assessment. Information theft may be\n", + "the most expensive and fastest-rising consequence of cybercrime, but data is not\n", + "the only target. Core systems are being hacked in a dangerous trend to disrupt\n", + "and destroy. Regulators are also increasingly asking banks for controls in place to\n", + "detect and preempt financial crimes. That’s where big data tooling like Delta Lake\n", + "and Spark shine, and where it will continually be called on to address the security\n", + "needs of new initiatives.\n", + "\n", + "“Digital security is about organically adjusting to risks,” said Webster. “It’s a journey\n", + "of continual discovery with one central goal: to protect customers. They want\n", + "things easy and they want them quick. It’s our job to make sure that it’s secure.”\n", + "\n", + "*This story previously appeared in [WIRED Brand Lab for Databricks](https://www.wired.com/sponsored/story/when-it-comes-to-security-data-is-the-best-defense/) .\n", + "\n", + "\n", + "-----\n", + "\n", + "**Advantages of a Lakehouse**\n", + "\n", + "\n", + "**A cost-efficient upgrade**\n", + "\n", + "Databricks customers only pay for the data they\n", + "analyze, not for what they collect. This means that\n", + "security teams can collect any amount of data\n", + "without worrying about ingest-based pricing, and\n", + "only pay for the data that’s actually used for analysis\n", + "— for example, an incident investigation or a data\n", + "call for an audit. This pricing model enables security\n", + "teams to collect data that was previously out of\n", + "reach, such as netflow data, endpoint detection and\n", + "response data, and application and services data.\n", + "\n", + "Further, Databricks is a fully managed service,\n", + "meaning that security teams don’t have to\n", + "pre-commit to hardware capital expenditures.\n", + "With no hardware to manage and no big data\n", + "implementations to maintain, security teams\n", + "can significantly reduce their management and\n", + "maintenance costs.\n", + "\n", + "\n", + "**Multicloud**\n", + "\n", + "Databricks is cloud-native on AWS, Microsoft Azure\n", + "and Google Cloud. This creates freedom for the\n", + "security teams to use whatever cloud provider they\n", + "like. Additionally, teams can acquire and maintain\n", + "operational consistency across all providers when\n", + "they have multiple cloud footprints. This enables\n", + "consistent policy implementation, reduced\n", + "complexity for staff and increased efficiency.\n", + "\n", + "Additionally, Databricks enables faster detection,\n", + "investigation and response across the enterprise\n", + "because analytics can be reused across the\n", + "major cloud providers through a unified platform\n", + "that centralizes data for easy sharing and fosters\n", + "collaboration across teams.\n", + "\n", + "\n", + "**Enterprise security and**\n", + "**360° risk management**\n", + "\n", + "The Lakehouse Platform is easy to set up, manage,\n", + "scale and, most importantly, secure. This is because\n", + "Lakehouse easily integrates with existing security\n", + "and management tools, enabling users to extend\n", + "their policies for peace of mind and greater control.\n", + "\n", + "With multicloud management, security admins and\n", + "data teams get a consistent experience across all\n", + "major cloud providers. 
This saves valuable time\n", + "and the resources required to upskill talent on\n", + "proprietary services for data, analytics and AI.\n", + "\n", + "Security, risk and compliance leaders are also\n", + "able to give team members a range of security\n", + "permissions that come with thorough audit trails.\n", + "This allows teams to quickly spin up and wind down\n", + "collaborative workspaces for any project and to\n", + "manage use cases from end to end — from enabling\n", + "user access and controlling spend to auditing usage\n", + "and analyzing activity across every workspace to\n", + "enforce user and data governance.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Lakehouse and SIEM: The Pattern for Cloud-Scale Security Operations\n", + "\n", + "\n", + "According to George Webster, head of cybersecurity sciences and analytics at\n", + "HSBC, Lakehouse and SIEM is the pattern for security operations. What does\n", + "it look like? It leverages the strengths of the two components: Lakehouse for\n", + "multicloud native storage and analytics, SIEM for security operations workflows.\n", + "For Databricks customers like HSBC, there are two general patterns for this\n", + "integration that are both underpinned by what Webster calls the cybersecurity\n", + "data lake with Lakehouse.\n", + "\n", + "In the first pattern, Lakehouse stores all the data for the maximum retention\n", + "period. A subset of the data is then sent to the SIEM and stored for a fraction of\n", + "the time. This pattern has the advantage of allowing analysts to query near-term\n", + "\n", + "\n", + "data using the SIEM while having the ability to do historical analysis and more\n", + "sophisticated analytics in Databricks. It also lets them manage any licensing or\n", + "storage costs for the SIEM deployment.\n", + "\n", + "The second pattern is to send the highest-volume data sources to Databricks —\n", + "for example, cloud-native logs, endpoint threat detection and response logs, DNS\n", + "data and network events. Low-volume data sources such as alerts, e-mail logs\n", + "and vulnerability scan data go to the SIEM. This pattern enables Tier 1 analysts to\n", + "quickly handle high-priority alerts in the SIEM. Threat-hunt teams and investigators\n", + "can leverage the advanced analytical capabilities of Databricks. This pattern has a\n", + "cost-benefit of offloading processing, ingestion and storage from the SIEM.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Databricks and Splunk:**\n", + "**A Case Study in Cost-Savings**\n", + "\n", + "Databricks integrates with your preferred SIEM, like\n", + "Splunk, and the Splunk-certified Databricks add-on\n", + "can be used to meet SOC needs without changing\n", + "the user interface. This example features a global\n", + "financial institution’s security operation, where\n", + "the organization grew throughput from 25TB per\n", + "day with only 180 days lookback, to 100TB per day\n", + "with 395 days lookback using the Databricks SIEM\n", + "augmentation. 
The total cost of ownership savings, including infrastructure and license costs, saved tens of millions (more than $80mn per year) in cloud costs.\n",
+ "\n",
+ "\n",
+ "##### FinServ Security Operations\n",
+ "\n",
+ "Databricks + Splunk **Drastically** Lowered Costs\n",
+ "\n",
+ "[Chart: current state vs. future option. Throughput grows from 25 TB per day with Splunk only to 100 TB per day with Splunk + Databricks; the lookback period grows from 180 days to 395 days. TCO savings with Splunk and Databricks vs. Splunk-only solution: $81M]\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "## Common Use Cases\n",
+ "\n",
+ "As FSIs focus on modernizing their data analytics and warehousing capabilities, the Databricks Lakehouse Platform brings a new level of empowerment to FSIs, allowing them to unlock the full potential of their data to deliver on their objectives and better serve their customers.\n",
+ "\n",
+ "**Common use cases include:**\n",
+ "\n",
+ "**•** **Threat hunting:** Empower security teams to proactively detect and discover advanced threats using months or years of data\n",
+ "\n",
+ "**•** **Incident investigation:** Gain complete visibility across network, endpoint, cloud and application data to respond to incidents\n",
+ "\n",
+ "**•** **Phishing threat detection:** Uncover social engineering attacks that are often used to steal user data, including log-in credentials and credit card numbers\n",
+ "\n",
+ "**•** **Supply chain monitoring:** Leverage ML to identify suspicious behavior within your software supply chain\n",
+ "\n",
+ "**•** **Ransomware detection:** Scope the impact and spread of ransomware attacks to inform complete mitigation and remediation\n",
+ "\n",
+ "**•** **Credentials-abuse detection:** Identify and investigate anomalous credential usage across your infrastructure\n",
+ "\n",
+ "**•** **Insider-threats detection:** Find and respond to malicious threats from people within an organization who have inside information about security practices, data and computer systems\n",
+ "\n",
+ "**•** **Network traffic analysis:** Examine real-time network availability and activity to identify anomalies, vulnerabilities and malware\n",
+ "\n",
+ "**•** **Analytics automation:** Automatically contextualize and enrich multiple streaming and batch analytics to accelerate analyst workflows and decision-making\n",
+ "\n",
+ "**•** **Augmenting anti-money laundering practices (AML):** Using structured and unstructured data to maintain a list of politically exposed individuals, often referred to as PEP, to augment a bank’s AML processes. This includes pulling data from an organization externally (keeping the PEP list up-to-date including out-of-country officials and diplomats) as well as internally (including critical personnel, network admins, etc.) 
who\n", + "need extra scrutiny.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started With Databricks for Cybersecurity\n", + "\n", + "Getting up and running on Databricks to address your cybersecurity needs is easy with our Solution\n", + "Accelerators. Databricks Solution Accelerators are highly optimized, fully functional analytics solutions that\n", + "provide customers with a fast start to solving their data problems.\n", + "\n", + "**•** [Cybersecurity analytics and AI at scale with Splunk and Databricks](https://databricks.com/solutions/accelerators/cybersecurity-analytics-and-ai) : Rapidly detect threats,\n", + "investigate the impact and reduce risks with the Databricks add-on for Splunk\n", + "\n", + "**•** [Threat detection at scale with DNS analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html) : Recognize cybercriminals using DNS,\n", + "threat intelligence feeds and ML\n", + "\n", + "Databricks Solution Accelerators are free. Join the hundreds of Databricks customers using Solution\n", + "Accelerators to drive better outcomes in their businesses.\n", + "\n", + "If you’d like to learn more about how we are helping financial services institutions securely leverage data and AI,\n", + "please visit us at [dbricks.co/fiserv](https://databricks.com/solutions/industries/financial-services) or reach out to us at [cybersecurity@databricks.com](mailto:cybersecurity%40databricks.com?subject=) .\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including\n", + "\n", + "Comcast, Condé Nast, Acosta and over 40% of the Fortune 500 — rely on the Databricks\n", + "\n", + "Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San\n", + "\n", + "Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™\n", + "\n", + "Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s\n", + "\n", + "toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "#### Get started with a free trial of Databricks and start building data applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n", + "\n", + "###### To learn more, visit us at:\n", + " dbricks.com/fiserv\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-eBook-finServ-cyber.pdf2024-09-19T16:57:20Z
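\n",
+ "\n",
+ "Relating back to the Lakehouse-and-SIEM integration patterns described earlier in this eBook, the minimal sketch below illustrates the second pattern: high-volume telemetry is retained in Delta tables on the lakehouse, while only a small, high-signal subset is staged for forwarding to the SIEM. The table names, paths, columns and severity rule are all hypothetical, and `spark` is assumed to be the SparkSession provided by a Databricks notebook.\n",
+ "\n",
+ "```python\n",
+ "from pyspark.sql import functions as F\n",
+ "\n",
+ "# Land high-volume telemetry (for example, DNS logs) in a Delta table for long-term, low-cost retention.\n",
+ "dns_logs = spark.read.json('/landing/dns_logs/')\n",
+ "dns_logs.write.format('delta').mode('append').saveAsTable('security.bronze.dns_logs')\n",
+ "\n",
+ "# Derive a much smaller, high-signal subset of events to hand off to the SIEM.\n",
+ "siem_alerts = (\n",
+ "    spark.table('security.bronze.dns_logs')\n",
+ "    .where(F.col('severity') >= 7)\n",
+ "    .select('event_time', 'src_ip', 'domain', 'severity')\n",
+ ")\n",
+ "siem_alerts.write.format('delta').mode('append').saveAsTable('security.gold.siem_alerts')\n",
+ "\n",
+ "# A forwarder (for example, the Databricks add-on for Splunk) can then push the alerts table to the\n",
+ "# SIEM, while analysts keep the full-fidelity history queryable in the lakehouse for threat hunting.\n",
+ "```\n",
+ "\n",
+ "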
**EBOOK**\n", + "\n", + "## Why the Data Lakehouse Is Your Next Data Warehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "Preface .......................................................................................................................................................................................................................................... **3**\n", + "\n", + "Introduction ............................................................................................................................................................................................................................. **4**\n", + "\n", + "Our Approach: The Databricks Lakehouse Platform ................................................................................................................................... **5**\n", + "\n", + "Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse ...................................................................................... **6**\n", + "\n", + "Why Databricks SQL? ............................................................................................................................................................................................... 6\n", + "\n", + "Common use cases .................................................................................................................................................................................................... 7\n", + "\n", + "The Inner Workings of the Lakehouse ................................................................................................................................................................... **8**\n", + "\n", + "**PA R T 1 :** Storage layer .............................................................................................................................................................................................. 8\n", + "\n", + "**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13\n", + "\n", + "**PA R T 3 :** Consumption layer ................................................................................................................................................................................ 19\n", + "\n", + "Conclusion ............................................................................................................................................................................................................................. **24**\n", + "\n", + "Customer Stories ............................................................................................................................................................................................................... **25**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Preface\n", + "\n", + "Historically, data teams have had to resort to a bifurcated architecture to run traditional\n", + "BI and analytics workloads, copying subsets of the data already stored in their data lake\n", + "to a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\n", + "governance inherent in proprietary architectures.\n", + "\n", + "Our customers have asked us to simplify their data architecture. 
We decided to accelerate\n", + "our investments to do just that.\n", + "\n", + "\n", + "We introduced [Databricks SQL](https://databricks.com/product/databricks-sql) to simplify and provide data warehousing capabilities and\n", + "first-class support for SQL on the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) , for all your existing tools.\n", + "We use the term “lakehouse” to reflect our customers’ desire to combine the best of data\n", + "warehouses and data lakes. With the lakehouse, you can now establish one source of truth\n", + "for all data and enable all workloads from AI to BI on one platform. And we want to provide\n", + "you with ease-of-use and state-of-the-art performance at the lowest cost.\n", + "\n", + "\n", + "**Reynold Xin**\n", + "\n", + "Original Creator of Apache Spark, TM\n", + "Co-founder and Chief Architect,\n", + "Databricks\n", + "\n", + "\n", + "This eBook covers how we went back to the drawing board to build Databricks SQL — the\n", + "last mile of enabling data warehousing capabilities for your existing data lakes — as part of\n", + "the Databricks Lakehouse Platform.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "Most organizations operate their business with a complex data architecture that\n", + "combines data warehouses and data lakes. For one thing, data lakes are great\n", + "for machine learning (ML). They support open formats and a large ecosystem.\n", + "But data lakes have poor support for business intelligence (BI) and suffer\n", + "complex data quality problems. Data warehouses, on the other hand, are great\n", + "for BI applications. But they have limited support for ML workloads, can’t handle\n", + "natural language data, large-scale structured data, or raw, video, audio or image\n", + "files, and are proprietary systems with only a SQL interface.\n", + "\n", + "As a result, data is moved around the organization through data pipelines and\n", + "systems that create a multitude of data silos. A large amount of time is spent\n", + "maintaining these pipelines and systems rather than creating new value from\n", + "data, and downstream consumers struggle to get a single source of truth of the\n", + "data due to the inherent siloing of data that takes place. The situation becomes\n", + "very expensive, and decision-making speed and quality are negatively affected.\n", + "\n", + "Unifying these systems can be transformational in how we think about data.\n", + "\n", + "\n", + "##### The need for simplification\n", + "\n", + "It is time for a new data architecture that can meet both today’s and tomorrow’s\n", + "needs. Without any compromise. Advanced analytics and ML are one of the\n", + "most strategic priorities for data-driven organizations today, and the amount\n", + "of unstructured data is growing exponentially. So it makes sense to position\n", + "the data lake as the center of the data infrastructure. 
However, for this to be\n", + "achievable, the data lake needs to adopt the strengths of data warehouses.\n", + "\n", + "The answer is the [lakehouse](https://databricks.com/blog/2021/05/19/evolution-to-the-data-lakehouse.html) , an open data architecture enabled by a new open\n", + "and standardized system design: one that implements data structure and data\n", + "management features similar to those in a data warehouse, directly on the lowcost storage used for data lakes.\n", + "\n", + "**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**\n", + "\n", + "##### Building the Data Lakehouse\n", + "[Bill Immon, Father of the Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)\n", + "\n", + "\n", + "-----\n", + "\n", + "### Our Approach: The Databricks Lakehouse Platform\n", + "\n", + "Our customers have asked us for simplification. This is why we’ve embarked on\n", + "this journey to deliver one simple, open and collaborative platform for all your\n", + "data, AI and BI workloads on your existing data lakes.\n", + "\n", + "The [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) greatly simplifies data architectures by\n", + "combining the data management and performance typically found in data\n", + "warehouses with the low-cost, flexible object stores offered by data lakes.\n", + "\n", + "It’s built on open source and open standards to maximize flexibility, and lets you\n", + "store all your data — structured, semi-structured and unstructured — in your\n", + "existing data lake while still getting the data quality, performance, security and\n", + "governance you’d expect from a data warehouse. Data only needs to exist once\n", + "to support all of your data, AI and BI workloads on one common platform\n", + "— establishing one source of truth.\n", + "\n", + "Finally, the Lakehouse Platform provides tailored and collaborative\n", + "experiences so data engineers, data scientists and analysts can work together\n", + "on one common platform across the entire data lifecycle — from ingestion to\n", + "consumption and the serving of data products — and innovate faster.\n", + "\n", + "Let’s look at how, with the right data structures and data management\n", + "capabilities in place, we can now deliver data warehouse and analytics\n", + "capabilities on your lakehouse. That’s where Databricks SQL (DB SQL) comes in.\n", + "\n", + "**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n", + "\n", + "\n", + "Databricks SQL is a serverless data warehouse on the Databricks Lakehouse\n", + "Platform that lets you run all your SQL and BI applications at scale with up to 12x\n", + "better price/performance, a unified governance model, open formats and APIs,\n", + "and your tools of choice — no vendor lock-in. 
Reduce resource management\n", + "overhead with serverless compute, and easily ingest, transform and query\n", + "all your data in place to deliver real-time business insights faster. In fact, DB\n", + "SQL now holds the new world record in 100TB TPC-DS, the gold standard\n", + "performance benchmark for data warehousing.\n", + "\n", + "Built on open standards and APIs, the lakehouse provides an open, simplified and\n", + "multicloud architecture that brings the best of data warehousing and data lakes\n", + "together, and integrations with a rich ecosystem for maximum flexibility.\n", + "\n", + "\n", + "##### Why Databricks SQL?\n", + "\n", + "Best Price/Performance\n", + "Lower costs, get world-class performance, and eliminate the need to manage,\n", + "configure or scale cloud infrastructure with serverless.\n", + "\n", + "Built-In Governance\n", + "Establish one single copy for all your data using open standards, and one unified\n", + "governance layer across all data teams using standard SQL.\n", + "\n", + "Rich Ecosystem\n", + "Use SQL and any tool like Fivetran, dbt, Power BI or Tableau along with Databricks\n", + "to ingest, transform and query all your data in place.\n", + "\n", + "Break Down Silos\n", + "Empower every analyst to access the latest data faster for downstream real-time\n", + "analytics, and go effortlessly from BI to ML.\n", + "\n", + "**[WATCH A DEMO](https://databricks.com/discover/demos/databricks-sql)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Common use cases\n", + "\n", + "Thousands of customers like [Atlassian](https://www.google.com/search?q=atlassian+databricks+keynote&oq=atlassian+databricks+keynote&aqs=chrome..69i57j69i60j69i65l3j69i60j69i64l2.6409j0j1&sourceid=chrome&ie=UTF-8#:~:text=12%3A26,May%2026%2C%202021) , [SEGA](https://youtu.be/SzeXHcwPDSE) and [Punchh](https://databricks.com/customers-4/punchh) are using Databricks SQL to enable self-served analytics\n", + "for hundreds of analysts across their organizations, and to build custom data applications to better serve their\n", + "customers. Below are some examples of use cases for Databricks SQL.\n", + "\n", + "**At Atlassian, we have proven**\n", + "\n", + "\n", + "**Query data lake data with** **Collaboratively explore** **Build rich and custom**\n", + "**your BI tools of choice** **the freshest data** **data applications**\n", + "\n", + "\n", + "**that there is no longer a need**\n", + "\n", + "**for two separate data things.**\n", + "\n", + "**Technology has advanced**\n", + "\n", + "**far enough for us to consider**\n", + "\n", + "**one single unified lakehouse**\n", + "\n", + "**architecture.**\n", + "\n", + "**Rohan Dhupelia**\n", + "Data Platform Senior Manager,\n", + "Atlassian\n", + "\n", + "\n", + "Enable business analysts to\n", + "directly query data lake data\n", + "using their favorite BI tool and\n", + "avoid data silos. Reengineered\n", + "and optimized connectors\n", + "ensure fast performance,\n", + "low latency and high user\n", + "concurrency to your data lake.\n", + "Now analysts can use the best\n", + "tool for the job on one single\n", + "source of truth for your data.\n", + "\n", + "\n", + "Empower every analyst and SQL\n", + "professional in your organization\n", + "to quickly find and share new\n", + "insights by providing them with\n", + "a collaborative and self-served\n", + "analytics experience. 
Confidently\n", + "manage data permissions with\n", + "fine-grained governance, share and\n", + "reuse queries, and quickly analyze\n", + "and share results using interactive\n", + "visualizations and dashboards.\n", + "\n", + "\n", + "Build more effective and\n", + "tailored data applications\n", + "for your own organization or\n", + "your customers. Benefit from\n", + "the ease of connectivity,\n", + "management and better price/\n", + "performance of DB SQL to\n", + "simplify development of dataenhanced applications at scale,\n", + "all served from your data lake.\n", + "\n", + "\n", + "-----\n", + "\n", + "### The Inner Workings of the Lakehouse\n", + "\n", + "\n", + "In the next chapter, we’ll unpack the three foundational layers of the Databricks\n", + "Lakehouse Platform and how we went back to the drawing board to build this\n", + "experience. Specifically, we’ll dive into how we built Databricks SQL to deliver\n", + "analytics and data warehousing workloads on your lakehouse.\n", + "\n", + "\n", + "Those layers are:\n", + "\n", + "**1 .** The storage layer, or how we store and govern data\n", + "\n", + "**2 .** The compute layer, or how we process queries\n", + "\n", + "**3 .** The consumption layer, or the tools you can use to interface with the system\n", + "\n", + "\n", + "###### PART 1: STORAGE LAYER\n", + "\n", + "In order to bring the best of data lakes and data\n", + "warehouses, we needed to support the openness\n", + "and flexibility of data lakes, as well as the quality,\n", + "performance and governance you’d expect from a\n", + "data warehouse.\n", + "\n", + "\n", + "**Storage layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake Open format|Data Warehouse Closed, proprietary format|Data Lakehouse Open format|\n", + "|---|---|---|\n", + "|Low quality, “data swamp”|High-quality, reliable data|High-quality, reliable data|\n", + "|File-level access control|Fine-grained governance (tables row/columnar level)|Fine-grained governance (tables row/columnar level)|\n", + "|All data types|Structured only|All data types|\n", + "|Requires manually specifying how to lay out data|Automatically lays out data to query efficiently|Automatically lays out data to query efficiently|\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Transactional guarantees for your data lake\n", + "\n", + "\n", + "The open source format [Delta Lake](https://delta.io/) — based on Parquet — solves historical data\n", + "lake challenges around data quality and reliability. It is the foundation for the\n", + "lakehouse, and Databricks SQL stores and processes data using Delta Lake.\n", + "\n", + "For example, it provides ACID transactions to ensure that every operation either\n", + "fully succeeds or fully aborts for later retries — without requiring new data\n", + "pipelines to be created. It unifies batch and streaming pipelines so you can\n", + "easily merge existing and new data at the speed required for your business. With\n", + "Time Travel, Delta Lake automatically records all past transactions, so it’s easy\n", + "to access and use previous versions of your data for compliance needs or for\n", + "ML applications. Advanced indexing, caching and auto-tuning allow optimization\n", + "of Delta tables for the best query performance. 
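\n",
+ "\n",
+ "As a quick, concrete illustration of the Delta Lake capabilities described in this section (ACID transactions, Time Travel and row-level updates), here is a minimal PySpark sketch. The catalog, table and column names are hypothetical, and `spark` is assumed to be the SparkSession that a Databricks notebook provides.\n",
+ "\n",
+ "```python\n",
+ "from delta.tables import DeltaTable\n",
+ "\n",
+ "# Hypothetical incremental updates arriving from an operational system.\n",
+ "updates_df = spark.createDataFrame(\n",
+ "    [(1, 'alice@example.com'), (2, 'bob@example.com')],\n",
+ "    'customer_id INT, email STRING',\n",
+ ")\n",
+ "\n",
+ "# Row-level upsert (MERGE) into an existing Delta table, executed as a single ACID transaction.\n",
+ "customers = DeltaTable.forName(spark, 'main.crm.customers')\n",
+ "(\n",
+ "    customers.alias('t')\n",
+ "    .merge(updates_df.alias('u'), 't.customer_id = u.customer_id')\n",
+ "    .whenMatchedUpdateAll()\n",
+ "    .whenNotMatchedInsertAll()\n",
+ "    .execute()\n",
+ ")\n",
+ "\n",
+ "# Time Travel: query an earlier version of the table, e.g., for an audit or to reproduce a model.\n",
+ "previous = spark.sql('SELECT * FROM main.crm.customers VERSION AS OF 12')\n",
+ "```\n",
+ "\n",
+ "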
Delta Lake also acts as the\n", + "foundation for fine-grained, role-based access controls on the lakehouse.\n", + "\n", + "As a result, Delta Lake allows you to treat tables in Databricks SQL just like you\n", + "treat tables in a database: updates, inserts and merges can take place with high\n", + "performance at the row level. This is particularly useful if you are inserting new\n", + "\n", + "\n", + "data rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n", + "(e.g., for compliance laws such as GDPR). Furthermore, Delta Lake provides you\n", + "with one open and standard format — not only for SQL but also for Python, Scala\n", + "and other languages — so you can run all analytical and ML use cases on the\n", + "same data.\n", + "\n", + "**Delta Lake provides the key**\n", + "\n", + "An open format storage layer built for lake-first architecture\n", + "\n", + "ACID transactions, Time Travel, highly available\n", + "\n", + "Advanced indexing, caching, auto-tuning\n", + "\n", + "Fine-grained, role-based access controls\n", + "\n", + "Streaming & batch, analytics & ML\n", + "\n", + "Python, SQL, R, Scala\n", + "\n", + "Delta Lake brings data quality, performance and governance to the lakehouse\n", + "\n", + "**[DOWNLOAD NOW](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)**\n", + "##### Delta Lake: The Definitive Guide\n", + "[by O’Reilly](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)\n", + "\n", + "\n", + "-----\n", + "\n", + "##### A framework for building a curated data lake\n", + "\n", + "\n", + "With the ability to ingest petabytes of data with auto-evolving schemas, Delta\n", + "Lake helps turn raw data into actionable data by incrementally and efficiently\n", + "processing data as it arrives from files or streaming sources like Kafka, Kinesis,\n", + "Event Hubs, DBMS and NoSQL. It can also automatically and efficiently track data\n", + "as it arrives with no manual intervention, as well as infer schema, detect column\n", + "changes for structured and unstructured data formats, and prevent data loss by\n", + "rescuing data columns that don’t meet data quality specifications. And now with\n", + "[Partner Connect](https://www.databricks.com/partnerconnect) , it’s never been easier to bring in critical business data from\n", + "various sources.\n", + "\n", + "As you refine the data, you can add more structure to it. Databricks recommends\n", + "the Bronze, Silver and Gold pattern. It lets you easily merge and transform new\n", + "and existing data — in batch or streaming — while benefiting from the low-cost,\n", + "flexible object storage offered by data lakes. Bronze is the initial landing zone\n", + "for the pipeline. We recommend copying data that’s as close to its raw form as\n", + "possible to easily replay the whole pipeline from the beginning, if needed. Silver\n", + "is where the raw data gets cleansed (think data quality checks), transformed\n", + "and potentially enriched with external data sets. Gold is the production-grade\n", + "data that your entire company can rely on for business intelligence, descriptive\n", + "statistics, and data science/machine learning.\n", + "\n", + "\n", + "By the time you get to Gold, the tables are high-value business-level metrics\n", + "that have all the schema enforcement and constraints applied. 
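\n",
+ "\n",
+ "To make the Bronze, Silver and Gold pattern described above more tangible, here is a minimal sketch of a curated pipeline that uses Auto Loader and Delta tables. The paths, table names and columns are hypothetical, the Silver and Gold steps are shown as simple batch jobs for brevity, and `spark` is assumed to be the SparkSession in a Databricks notebook.\n",
+ "\n",
+ "```python\n",
+ "from pyspark.sql import functions as F\n",
+ "\n",
+ "# Bronze: incrementally ingest raw JSON files with Auto Loader, keeping records close to their raw form.\n",
+ "bronze_stream = (\n",
+ "    spark.readStream.format('cloudFiles')\n",
+ "    .option('cloudFiles.format', 'json')\n",
+ "    .option('cloudFiles.schemaLocation', '/checkpoints/orders/schema')\n",
+ "    .load('/raw/orders')\n",
+ ")\n",
+ "(\n",
+ "    bronze_stream.writeStream\n",
+ "    .option('checkpointLocation', '/checkpoints/orders/bronze')\n",
+ "    .toTable('main.sales.orders_bronze')\n",
+ ")\n",
+ "\n",
+ "# Silver: cleanse and de-duplicate the raw records.\n",
+ "silver = (\n",
+ "    spark.table('main.sales.orders_bronze')\n",
+ "    .where(F.col('order_id').isNotNull())\n",
+ "    .dropDuplicates(['order_id'])\n",
+ ")\n",
+ "silver.write.format('delta').mode('overwrite').saveAsTable('main.sales.orders_silver')\n",
+ "\n",
+ "# Gold: business-level aggregates that BI tools and dashboards can rely on.\n",
+ "gold = (\n",
+ "    spark.table('main.sales.orders_silver')\n",
+ "    .groupBy('order_date')\n",
+ "    .agg(F.sum('amount').alias('daily_revenue'))\n",
+ ")\n",
+ "gold.write.format('delta').mode('overwrite').saveAsTable('main.sales.daily_revenue_gold')\n",
+ "```\n",
+ "\n",
+ "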
This way, you can retain the flexibility of the data lake at the Bronze and Silver levels, and then use the Gold level for high-quality business data.\n",
+ "\n",
+ "[Diagram: a curated data lake with Bronze (raw ingestion and history), Silver (filtered, cleaned and augmented) and Gold (business-level aggregates) tables, fed by Auto Loader, Structured Streaming, batch COPY INTO and partner ingest]\n",
+ "\n",
+ "**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "##### An aside on batch and streaming data pipelines\n",
+ "\n",
+ "The best way to set up and run data pipelines in the Bronze/Silver/Gold pattern recommended on the previous page is in Delta Live Tables (DLT). DLT makes it easy to build and manage reliable batch and streaming data pipelines that deliver high-quality data. It helps data engineering teams simplify ETL development and management with declarative pipeline development, automatic data testing, and deep visibility for monitoring and recovery.\n",
+ "\n",
+ "The fact that you can run all your batch and streaming pipelines together in one simple, declarative framework makes data engineering easy on the Databricks Lakehouse Platform. We regularly talk to customers who have been able to reduce pipeline development time from weeks — or months — to mere minutes with Delta Live Tables. And by the way, even data analysts can easily interrogate DLT pipelines for the queries they need to run, without knowing any sort of specialized programming language or niche skills.\n",
+ "\n",
+ "One of the top benefits of DLT, and Delta Lake in general, is that it is built with streaming pipelines in mind. Today, the world operates in real time, and businesses are increasingly expected to analyze and respond to their data in real time. With streaming data pipelines built on DLT, analysts can easily access, query and analyze data with greater accuracy and actionability than with conventional batch processing. Delta Live Tables makes real-time analytics a reality for our customers.\n",
+ "\n",
+ "\n",
+ "-----\n",
+ "\n",
+ "##### Fine-grained governance on the lakehouse\n",
+ "\n",
+ "Delta Lake is the foundation for open and secure [data sharing](https://databricks.com/blog/2021/05/26/introducing-delta-sharing-an-open-protocol-for-secure-data-sharing.html) and governance on the lakehouse. It underpins the [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (in preview), which provides fine-grained governance across clouds, data and ML assets. 
Among the\n", + "benefits of the Unity Catalog, it allows you to:\n", + "\n", + "**• Discover, audit and govern data assets in one place:** A user-friendly\n", + "interface, automated data lineage across tables, columns, notebooks,\n", + "workflows and dashboards, role-based security policies, table or\n", + "column-level tags, and central auditing capabilities make it easy for\n", + "data stewards to discover, manage and secure data access to meet\n", + "compliance and privacy needs directly on the lakehouse.\n", + "\n", + "\n", + "\n", + "**• Grant and manage permissions using SQL:** Unity Catalog brings finegrained centralized governance to data assets across clouds through the\n", + "open standard SQL DCL. This means database administrators can easily\n", + "grant permission to arbitrary, user-specific views, or set permissions on\n", + "all columns tagged together, using familiar SQL.\n", + "\n", + "**• Centrally manage and audit shared data across organizations:** Every\n", + "organization needs to share data with customers, partners and suppliers\n", + "to better collaborate and to unlock value from their data. Unity Catalog\n", + "builds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\n", + "shared assets within and across organizations.\n", + "\n", + "\n", + "The Unity Catalog makes it easy for data stewards to discover, manage and secure data access\n", + "to meet compliance and privacy needs on the lakehouse.\n", + "\n", + "**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n", + "\n", + "\n", + "-----\n", + "\n", + "###### PART 2: COMPUTE LAYER\n", + "\n", + "\n", + "The next layer to look at is the compute layer, or how we process queries.\n", + "\n", + "Apache Spark TM has been the de facto standard for data lake compute. It’s great\n", + "for processing terabytes and petabytes of data cheaply, but historically Spark\n", + "SQL uses a nonstandard syntax and can be difficult to configure.\n", + "\n", + "\n", + "Data warehouses, on the other hand, tend to support short running queries\n", + "really well, especially when you have a lot of users issuing queries concurrently.\n", + "They tend to be easier to set up, but don’t necessarily scale or they become\n", + "too costly.\n", + "\n", + "\n", + "**Compute layer attributes — data lake vs. data warehouse vs. data lakehouse**\n", + "\n", + "|Data Lake High performance for large jobs (TBs to PBs)|Data Warehouse High concurrency|Data Lakehouse High performance for large jobs (TBs to PBs)|\n", + "|---|---|---|\n", + "|Economical|Scaling is exponentially more expensive|Economical|\n", + "|High operational complexity|Ease of use|Ease of use|\n", + "||||\n", + "\n", + "\n", + "A popular belief is that large workloads require a drastically different system\n", + "than low latency, high concurrency workloads. For example, there’s the classic\n", + "trade-off in computer systems between latency and throughput.\n", + "\n", + "But after spending a lot of time analyzing these systems, we found that it was\n", + "possible to simultaneously improve large query performance and concurrency\n", + "\n", + "\n", + "and latency. Although the classic trade-offs definitely existed, they were only\n", + "explicit when we optimized the system to the very theoretical optimal. 
-----

###### PART 2: COMPUTE LAYER

The next layer to look at is the compute layer, or how we process queries.

Apache Spark™ has been the de facto standard for data lake compute. It’s great for processing terabytes and petabytes of data cheaply, but historically Spark SQL has used a nonstandard syntax and can be difficult to configure.

Data warehouses, on the other hand, tend to support short-running queries really well, especially when you have a lot of users issuing queries concurrently. They tend to be easier to set up, but they don’t necessarily scale, or they become too costly as they do.

**Compute layer attributes — data lake vs. data warehouse vs. data lakehouse**

|Data Lake|Data Warehouse|Data Lakehouse|
|---|---|---|
|High performance for large jobs (TBs to PBs)|High concurrency|High performance for large jobs (TBs to PBs)|
|Economical|Scaling is exponentially more expensive|Economical|
|High operational complexity|Ease of use|Ease of use|

A popular belief is that large workloads require a drastically different system than low-latency, high-concurrency workloads. For example, there’s the classic trade-off in computer systems between latency and throughput.

But after spending a lot of time analyzing these systems, we found that it was possible to simultaneously improve large query performance, concurrency and latency. Although the classic trade-offs definitely exist, they only become apparent once a system is pushed close to its theoretical optimum. It turned out that the vast majority of software — and this includes all data warehouse systems and Databricks — was far from that optimum.

-----

##### Simplified administration and instant, elastic SQL compute — decoupled from storage

To achieve world-class performance for analytics on the lakehouse, we chose to completely rebuild the compute layer. But performance isn’t everything. We also want it to be simple to administer and cheaper to use. Databricks SQL leverages serverless SQL warehouses that let you get started in seconds, and it’s powered by a new native MPP vectorized engine: Photon.

Databricks SQL warehouses are optimized and elastic SQL compute resources. Just pick the cluster size and Databricks automatically determines the best instance types and VM configuration for the best price/performance. This means you don’t have to worry about estimating peak demand or paying too much by overprovisioning. You just need to click a few buttons to operate. To further streamline the experience, simply use [Databricks SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html). With the serverless capability, queries start rapidly with zero infrastructure management or configuration overhead. This lowers your total cost, as you pay only for what you consume without idle time or overprovisioned resources.

Since CPU clock speeds have plateaued, we also wanted to find new ways to process data faster, beyond raw compute power. One of the most impactful methods has been to improve the amount of data that can be processed in parallel. However, data processing engines need to be specifically architected to take advantage of this parallelism. So, from the ground up, we built [Photon](https://databricks.com/product/photon), a new C++ based vectorized query processing engine that dramatically improves query performance while remaining fully compatible with open Spark APIs. Databricks SQL warehouses are powered by Photon, which seamlessly coordinates work and resources and transparently accelerates portions of your SQL queries directly on your data lake. No need to move the data to a data warehouse.

**[READ NOW](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)**
##### Photon: A Fast Query Engine for Lakehouse Systems

[SIGMOD 2022 Best Industry Paper Award](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)

-----

**Did you know?**

Databricks SQL warehouses scale automatically throughout the day to better suit your business needs. Administration is simplified: you set the minimum and maximum number of clusters that can scale out, and Databricks SQL auto-scales within those bounds as needed. This ensures that you have ample compute to serve your needs, without overprovisioning. Administrators appreciate the better control over consumption costs, while users appreciate that their queries process as fast and efficiently as possible. A minimal scripted sketch of creating such an autoscaling warehouse follows.
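The ebook describes this configuration through the UI; as an assumption on our part, the same warehouse can also be scripted with the Databricks SDK for Python, roughly as sketched below. The warehouse name, size and scaling bounds are illustrative, and the call signatures should be verified against the installed SDK version.

```python
# Hypothetical sketch: create an autoscaling SQL warehouse programmatically.
# Assumes `pip install databricks-sdk` and that workspace authentication is already configured;
# the name, size and scaling bounds are illustrative, not recommendations.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()
warehouse = w.warehouses.create(
    name="bi-analytics-wh",
    cluster_size="Medium",          # T-shirt size; Databricks picks instance types for you
    min_num_clusters=1,             # scale-out floor
    max_num_clusters=4,             # scale-out ceiling for concurrency spikes
    auto_stop_mins=10,              # release compute when idle
    enable_serverless_compute=True, # start in seconds, pay only while running
).result()                          # wait for the warehouse to finish starting

print(warehouse.id, warehouse.state)
```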
For most BI and analytics use cases, using medium-size warehouses with scaling is a great balance of price/performance that fits most business needs.

In the next section, we will discuss examples of Databricks SQL performance results on large-scale analytic workloads as well as highly concurrent workloads.

[Screenshot: SQL warehouse cluster scaling status — Running, Scheduled, Starting, Cluster Scale.]

-----

##### Large query performance: the fastest data warehouse

The industry standard benchmark used by data warehouses is TPC-DS. It includes 100 queries that range from very simple to very sophisticated to simulate decision support workloads. This benchmark was created by a committee formed by data warehousing vendors. The chart below shows price/performance results running the 100TB version of TPC-DS, since for large workloads the numbers that ultimately matter pertain to the performance cost. As you can see, Databricks SQL outperforms all cloud data warehouses we have measured.

**[LEARN MORE](https://dbricks.co/benchmark)**

[Chart: 100TB TPC-DS price/performance benchmark (lower is better) — Databricks SQL (Spot and On-Demand) compared with Cloud Data Warehouses 1–3; reported price/performance figures range from $146 to $1,791.]

**Did you know?**

Databricks SQL has set a [new world record in 100TB TPC-DS](http://tpc.org/5013), the gold standard performance benchmark for data warehousing. Databricks SQL outperformed the previous record by 2.2x. And this result has been formally audited and reviewed by the TPC council.

-----

##### Highly concurrent analytics workloads

Beyond large queries, it is also common for highly concurrent analytics workloads to execute over small data sets. To optimize concurrency, we used the same TPC-DS benchmark, but on a much smaller scale (10GB) and with 32 concurrent streams. We analyzed the results to identify and remove bottlenecks, and built hundreds of optimizations to improve concurrency. Databricks SQL now outperforms some of the best cloud data warehouses for both large queries and small queries with lots of users.

Real-world workloads, however, are not just about either large or small queries. Databricks SQL also provides intelligent workload management with a dual queuing system and highly parallel reads.

[Chart: 10GB TPC-DS queries/hr at 32 concurrent streams (higher is better) — an L-size Databricks SQL warehouse measured in July 2020, Jan 2021 and Oct 2022 versus a cloud data warehouse; reported values of 4,672, 11,690, 12,248 and 16,523 queries/hr, a roughly 3x difference.]

-----

##### Intelligent workload management with smart queuing system

Real-world workloads typically include a mix of small and large queries.
Therefore, the smart queuing and load balancing capabilities of Databricks SQL need to account for that too. Databricks SQL uses a smart dual queuing system (in preview) that prioritizes small queries over large, as analysts typically care more about the latency of short queries than of large ones.

##### Highly parallel reads with improved I/O performance

It is common for some tables in a lakehouse to be composed of many files — for example, in streaming scenarios such as IoT ingestion, where data arrives continuously. In legacy systems, the execution engine can spend far more time listing these files than actually executing the query. Our customers told us they do not want to sacrifice performance for data freshness. With async and highly parallel I/O, when executing a query, Databricks SQL now automatically reads the next blocks of data from cloud storage while the current block is being processed. This considerably increases overall query performance on small files (by 12x for 1MB files) and on “cold data” (data that is not cached) use cases as well.

**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**

-----

###### PART 3: CONSUMPTION LAYER

The third layer of the Databricks Lakehouse Platform would similarly have to bridge the best of both data lakes and data warehouses. In the lakehouse, you would have to be able to work seamlessly with your tools of choice — whether you are a business analyst, data scientist, or ML or data engineer.

The lakehouse must treat Python, Scala, R and SQL programming languages and ecosystems as first-class citizens to truly unify data engineering, ML and BI workloads in one place.

**Consumption layer attributes — data lake vs. data warehouse vs. data lakehouse**

|Data Lake|Data Warehouse|Data Lakehouse|
|---|---|---|
|Notebooks (great for data scientists)|Lack of support for data science/ML|Notebooks (great for data scientists)|
|Openness with rich ecosystem (Python, R, Scala)|Limited to SQL only|Openness with rich ecosystem (Python, R, Scala)|
|BI/SQL not 1st-class citizen|BI/SQL 1st-class citizen|BI/SQL 1st-class citizen|

-----

##### A platform for your tools of choice

At Databricks we believe strongly in open platforms and meeting our customers where they are. We work very closely with a large number of software vendors to make sure you can easily use your tools of choice on Databricks, like [Tableau](https://databricks.com/blog/2021/05/07/improved-tableau-databricks-connector-with-azure-ad-authentication-support.html), [Power BI](https://databricks.com/blog/2021/02/26/announcing-general-availability-ga-of-the-power-bi-connector-for-databricks.html) or [dbt](https://databricks.com/blog/2021/12/06/deploying-dbt-on-databricks-just-got-even-simpler.html).
With [Partner Connect](https://www.databricks.com/partnerconnect), it’s easier than ever to connect with your favorite tools, easier to get data in, easier to authenticate using single sign-on, and of course, with all the concurrency and performance improvements, we make sure that the direct and live query experience is great.

**“Now more than ever, organizations need a data strategy that enables speed and agility to be adaptable. As organizations are rapidly moving their data to the cloud, we’re seeing growing interest in doing analytics on the data lake. The introduction of Databricks SQL delivers an entirely new experience for customers to tap into insights from massive volumes of data with the performance, reliability and scale they need. We’re proud to partner with Databricks to bring that opportunity to life.”**

**Francois Ajenstat**
Chief Product Officer, Tableau

[Figure: Logos of supported BI and SQL tools — plus any other Apache Spark-compatible client.]

-----

##### Faster BI results retrieval with Cloud Fetch

Once query results are computed, cloud data warehouses often collect and stream back results to BI clients on a single thread. This can create a bottleneck and greatly slows down the experience if you are fetching anything more than a few megabytes of results in size. To provide analysts with the best experience from their favorite BI tools, we also needed to speed up how the system delivers results to BI tools like Power BI or Tableau once computed.

That’s why we’ve reimagined this approach with a new architecture called [Cloud Fetch](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html). For large results, Databricks SQL now writes results in parallel across all of the compute nodes to cloud storage, and then sends the list of files using pre-signed URLs back to the client. The client then can download in parallel all the data from cloud storage. This approach provides up to 10x performance improvement in real-world scenarios.

[Diagram: the SQL endpoint’s cluster writes results as parallel data transfers to cloud storage, and the BI client downloads them in parallel (customer benchmark: Tableau extract).]

Cloud Fetch enables faster, higher-bandwidth connectivity to and from your BI tools.
**[LEARN MORE](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html)**

-----

##### A first-class SQL development experience

In addition to supporting your favorite tools, we are also focused on providing a native first-class SQL development experience.
We’ve talked to\n", + "hundreds of analysts using various SQL editors\n", + "like SQL Workbench every day, and worked with\n", + "them to provide the dream set of capabilities\n", + "for SQL development.\n", + "\n", + "For example, Databricks SQL now supports\n", + "[standard ANSI SQL](https://databricks.com/blog/2021/11/16/evolution-of-the-sql-language-at-databricks-ansi-standard-by-default-and-easier-migrations-from-data-warehouses.html) , so you don’t need to learn a\n", + "special SQL dialect. Query tabs allow you to work\n", + "on multiple queries at once, autosave gives you\n", + "peace of mind so you never have to worry about\n", + "losing your drafts, integrated history lets you\n", + "easily look at what you have run in the past, and\n", + "intelligent auto-complete understands subqueries\n", + "and aliases for a delightful experience.\n", + "\n", + "\n", + "The built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n", + "\n", + "\n", + "-----\n", + "\n", + "Finally, with Databricks SQL, analysts can easily\n", + "make sense of query results through a wide variety\n", + "of rich visualizations and quickly build dashboards\n", + "with an intuitive drag-and-drop interface. To keep\n", + "everyone current, dashboards can be shared and\n", + "configured to automatically refresh, as well as to\n", + "alert the team to meaningful changes in the data.\n", + "\n", + "\n", + "Easily combine visualizations to build rich dashboards that can be shared with stakeholders.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Conclusion\n", + "\n", + "Databricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\n", + "into actionable data, combining the flexibility and openness of data lakes\n", + "with the reliability and performance of data warehouses. The Unity Catalog\n", + "provides fine-grained governance on the lakehouse across all clouds using\n", + "one friendly interface and standard SQL.\n", + "\n", + "Databricks SQL also holds the [new world record in 100TB TPC-DS](https://dbricks.co/benchmark) , the gold\n", + "standard performance benchmark for data warehousing. It is powered by\n", + "Photon, the new vectorized query engine for the lakehouse, and by SQL\n", + "warehouses for instant, elastic compute decoupled from storage.\n", + "\n", + "Finally, Databricks SQL offers a native first-class SQL development\n", + "experience, with a built-in SQL editor, rich visualizations and dashboards,\n", + "and integrates seamlessly with your favorite BI- and SQL-based tools for\n", + "maximum productivity.\n", + "\n", + "\n", + "Databricks SQL under the hood.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Atlassian\n", + "\n", + "\n", + "Atlassian is a leading provider of collaboration, development and issue-tracking\n", + "\n", + "software for teams. With over 150,000 global customers (including 85 of the Fortune\n", + "\n", + "100), Atlassian is advancing the power of collaboration with products including Jira,\n", + "\n", + "Confluence, Bitbucket, Trello and more.\n", + "\n", + "USE CASE\n", + "\n", + "Atlassian uses the Databricks Lakehouse Platform to democratize data across the enterprise and drive\n", + "down operational costs. 
Atlassian currently has a number of use cases focused on putting the\n", + "customer experience at the forefront.\n", + "\n", + "**Customer support and service experience**\n", + "With the majority of their customers being server-based (using products like Jira and Confluence),\n", + "Atlassian set out to move those customers into the cloud to leverage deeper insights that enrich the\n", + "customer support experience.\n", + "\n", + "**Marketing personalization**\n", + "The same insights could also be used to deliver personalized marketing emails to drive\n", + "engagement with new features and products.\n", + "\n", + "**Anti-abuse and fraud detection**\n", + "They can predict license abuse and fraudulent behavior through anomaly detection and\n", + "predictive analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "SOLUTION AND BENEFITS\n", + "\n", + "Atlassian is using the Databricks Lakehouse Platform to enable data democratization at scale, both internally\n", + "and externally. They have moved from a data warehousing paradigm to standardization on Databricks,\n", + "enabling the company to become more data driven across the organization. Over 3,000 internal users in\n", + "areas ranging from HR and marketing to finance and R&D — more than half the organization — are accessing\n", + "insights from the platform on a monthly basis via open technologies like Databricks SQL. Atlassian is also\n", + "using the platform to drive more personalized support and service experiences to their customers.\n", + "\n", + "**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\n", + "finance, sales, support and R&D\n", + "\n", + "**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n", + "\n", + "**•** MLflow streamlines MLOps for faster delivery\n", + "\n", + "**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n", + "\n", + "With cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\n", + "access all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\n", + "Already the company has:\n", + "\n", + "**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\n", + "jobs from EMR to Databricks with minimal effort and low-code change\n", + "\n", + "**•** Decreased delivery time by 30% with shorter dev cycles\n", + "\n", + "**•** Reduced data team dependencies by 70% with more self-service enabled throughout the organization\n", + "\n", + "**[LEARN MORE](https://www.youtube.com/watch?v=Xo1U617T-mU)**\n", + "\n", + "\n", + "**At Atlassian, we need to ensure**\n", + "**teams can collaborate well**\n", + "**across functions to achieve**\n", + "**constantly evolving goals. A**\n", + "**simplified lakehouse architecture**\n", + "**would empower us to ingest high**\n", + "**volumes of user data and run the**\n", + "**analytics necessary to better**\n", + "**predict customer needs and**\n", + "**improve the experience of our**\n", + "**customers. 
A single, easy-to-use**\n", + "**cloud analytics platform allows**\n", + "**us to rapidly improve and build**\n", + "**new collaboration tools based on**\n", + "**actionable insights.**\n", + "\n", + "**Rohan Dhupelia**\n", + "Data Platform Senior Manager, Atlassian\n", + "\n", + "\n", + "-----\n", + "\n", + "### ABN AMRO\n", + "\n", + "\n", + "As an established bank, ABN AMRO wanted to modernize their business but were hamstrung\n", + "\n", + "by legacy infrastructure and data warehouses that complicated access to data across various\n", + "\n", + "sources and created inefficient data processes and workflows. Today, Azure Databricks\n", + "\n", + "empowers ABN AMRO to democratize data and AI for a team of 500+ empowered engineers,\n", + "\n", + "scientists and analysts who work collaboratively on improving business operations and\n", + "\n", + "introducing new go-to-market capabilities across the company.\n", + "\n", + "USE CASE\n", + "\n", + "ABN AMRO uses the Databricks Lakehouse Platform to deliver financial services transformation on a global scale,\n", + "providing automation and insight across operations.\n", + "\n", + "**Personalized finance**\n", + "ABN AMRO leverages real-time data and customer insights to provide products and services tailored to\n", + "customers’ needs. For example, they use machine learning to power targeted messaging within their automated\n", + "marketing campaigns to help drive engagement and conversion.\n", + "\n", + "**Risk management**\n", + "Using data-driven decision-making, they are focused on mitigating risk for both the company and their\n", + "customers. For example, they generate reports and dashboards that internal decision makers and leaders use to\n", + "better understand risk and keep it from impacting ABN AMRO’s business.\n", + "\n", + "**Fraud detection**\n", + "With the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\n", + "impacts their customers. Among the activities they’re trying to address are money laundering and fake credit\n", + "card applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "SOLUTION AND BENEFITS\n", + "\n", + "Today, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\n", + "scientists and analysts who work collaboratively on improving business operations and introducing new\n", + "go-to-market capabilities across the company.\n", + "\n", + "**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\n", + "downstream analytics\n", + "\n", + "**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\n", + "through reports and dashboards\n", + "\n", + "**•** MLflow speeds deployment of new models that improve the customer experience — with new use\n", + "cases delivered in under two months\n", + "\n", + "\n", + "**Databricks has changed the way**\n", + "**we do business. 
It has put us in**\n", + "**a better position to succeed in**\n", + "**our data and AI transformation**\n", + "**as a company by enabling data**\n", + "**professionals with advanced data**\n", + "**capabilities in a controlled and**\n", + "**scalable way.**\n", + "\n", + "**Stefan Groot**\n", + "Head of Analytics Engineering,\n", + "ABN AMRO\n", + "\n", + "\n", + "#### 10x faster\n", + "\n", + "time to market — use cases\n", + "deployed in two months\n", + "\n", + "\n", + "#### 100+ \n", + "\n", + "use cases to be delivered\n", + "over the coming year\n", + "\n", + "\n", + "#### 500+\n", + "\n", + "empowered business\n", + "and IT users\n", + "\n", + "\n", + "**[LEARN MORE](https://databricks.com/customers/abn-amro)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### SEGA Europe\n", + "\n", + "**Improving the player experience**\n", + "\n", + "# “ is at the heart of everything\n", + "\n", + "**we do, and we very much**\n", + "**see Databricks as a key**\n", + "**partner, supporting us to drive**\n", + "**forward the next generation of**\n", + "**community gaming.**\n", + "\n", + "**Felix Baker**\n", + "Data Services Manager, SEGA Europe\n", + "\n", + "\n", + "SEGA® Europe, the worldwide leader in interactive entertainment, is using the Databricks\n", + "\n", + "Lakehouse Platform to personalize the player experience and build its own machine\n", + "\n", + "learning algorithm to help target and tailor games for over 30 million of its customers.\n", + "\n", + "As housebound gamers looked to pass the time during the first lockdowns of 2020, some SEGA Europe\n", + "titles, including Football Manager,™ saw over double the number of sales during the first lockdown\n", + "compared to the year before. Furthermore, a number of SEGA titles experienced a more than 50% increase\n", + "in players over the course of the COVID-19 pandemic. With more anonymized data being collected through\n", + "an analytics pipeline than ever before, the team needed a dedicated computing resource to handle the\n", + "sheer volume of data, extract meaningful insights from it and enable the data science team to improve\n", + "general workflow.\n", + "\n", + "**[LEARN MORE](https://www.youtube.com/watch?v=SzeXHcwPDSE)**\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the lakehouse company. More than 7,000 organizations\n", + "\n", + "worldwide — including Comcast, Condé Nast and over 50% of the\n", + "\n", + "Fortune 500 — rely on the Databricks Lakehouse Platform to unify their\n", + "\n", + "data, analytics and AI. Databricks is headquartered in San Francisco,\n", + "\n", + "with offices around the globe. Founded by the original creators of\n", + "\n", + "Apache Spark, TM Delta Lake and MLflow, Databricks is on a mission to help\n", + "\n", + "data teams solve the world’s toughest problems. To learn more, follow\n", + "\n", + "Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "Contact us for a personalized demo\n", + "**databricks.com/contact**\n", + "\n", + "**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf2024-09-19T16:57:19Z
# Big Book of Data and AI Use Cases for the Public Sector\n", + "\n", + "### Best practices, customer stories and solution templates for government agencies interested in building on the Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "The State of Data and AI in the Government .......................................................................................... 3\n", + "\n", + "The Need for a Modern Data Architecture ............................................................................................. 5\n", + "\n", + "Introducing the Lakehouse for Public Sector ......................................................................................... 6\n", + "\n", + "**U S E C A S E :** Cybersecurity ........................................................................................................................... 9\n", + "\n", + "**U S E C A S E :** Predictive Maintenance .......................................................................................................... 12\n", + "\n", + "**U S E C A S E :** Fraud Detection ....................................................................................................................... 15\n", + "\n", + "**U S E C A S E :** Money Laundering ................................................................................................................. 17\n", + "\n", + "**U S E C A S E :** Entity Analytics ...................................................................................................................... 19\n", + "\n", + "**U S E C A S E :** Geospatial Analytics .............................................................................................................. 21\n", + "\n", + "**U S E C A S E :** Public Health Management .................................................................................................. 24\n", + "\n", + "Conclusion ................................................................................................................................................. 26\n", + "\n", + "\n", + "-----\n", + "\n", + "## The State of Data and AI in the Government\n", + "\n", + "###### Over the last decade, data and AI have redefined every industry on the planet. Retailers have improved the shopping experience with personalized recommendations, financial institutions have strengthened risk management through the use of advanced analytics, and the healthcare industry is tapping into the power of machine learning to predict and prevent chronic disease. The public sector is no exception.\n", + "\n", + "\n", + "In 2018, the U.S. Federal Government embarked on one of its most ambitious\n", + "efforts since putting a man on the moon — embedding data into all aspects of\n", + "decision-making. By enacting the Evidence-Based Policymaking Act of 2018,\n", + "Congress set in motion requirements for agencies to modernize their data and\n", + "analytics capabilities, including the appointment of agency-level chief data\n", + "officers. A year later came the Federal Data Strategy, which provided further\n", + "guidance for how agencies should manage and use data by 2030.\n", + "\n", + "\n", + "With all of this guidance, agencies are starting to make meaningful improvements\n", + "to their data strategy, but when it comes to innovating with data, agencies still\n", + "lag behind the private sector. This begs the question: what’s standing in the way?\n", + "The hurdles aren’t due to a lack of effort on the part of agency leaders. 
In fact, they can largely be attributed to a patchwork of legacy technologies that have been amassed over the last 30 to 40 years. While these hurdles stand in the way, a number of innovative agencies are making significant progress as they embrace new data and AI capabilities.

-----

Federal spending on artificial intelligence rose to [nearly $1 billion](https://www.federaltimes.com/thought-leadership/2021/09/28/why-the-government-market-for-artificial-intelligence-technology-is-expanding/) in 2020, up 50% from 2018. There’s a good reason for this level of spend: Deloitte recently published a report, “AI-augmented Government,” that estimates the federal government could free up as many as 1.2 billion hours of work and save up to $41.1 billion annually through the use of AI-driven automation. Early adopters of advanced analytics are starting to see the fruits of their labor. For example, [USCIS modernized their analytics stack](https://databricks.com/customers/uscis) on Databricks to accelerate insights on applicants by 24x, automate the processing of millions of applications, and reduce appointment no-show rates with predictive analytics. The [Orange County Courts](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/) also recently shared how they are automating legacy paper-based workflows with machine learning.

In this eBook, we explore the hurdles of legacy technologies and how a modern data lakehouse can help agencies unlock innovative data and analytics use cases at all levels of government. Over the following seven example use cases, covering everything from cyber threat detection to improving public health, we demonstrate how the Databricks Lakehouse for Public Sector is critical to improving citizen services and delivering on mission objectives. This guide also includes resources in the form of Solution Accelerators, reference architectures and real-world customer stories to help as you embark on your own journey to drive a safer and more prosperous nation through the use of data and AI.

**An increased focus on cloud, analytics and AI = operational efficiency**

[Infographic: U.S. Government — Government CIOs’ top game-changing technologies: 1. AI/ML, 2. Data Analytics, 3. Cloud; $1B Data and AI Research and Development Initiative; $41B+ estimated government savings from data-driven automation.]

-----

## The Need for a Modern Data Architecture

###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decision-making with predictive insights that can better keep pace with the dynamic needs of citizens and global communities.
That being said, there are a number of barriers standing in their way.\n", + "\n", + "##### Common challenges\n", + "\n", + "\n", + "Many government agencies are burdened with a legacy IT infrastructure that is\n", + "built with on-premises data warehouses that are complex to maintain, are costly\n", + "to scale as compute is coupled with storage, and lack support for unstructured\n", + "data and advanced analytics. This severely inhibits data-driven innovation.\n", + "Maintaining these systems requires a massive investment of both time and\n", + "money compared to modern cloud-based systems and creates a number of\n", + "avoidable challenges:\n", + "\n", + "\n", + "government is often done in weekly or daily batches, but decision-making\n", + "needs to happen in real time. Critical events like cyber attacks and health\n", + "pandemics can’t wait a week.\n", + "\n", + "**Lack of citizen insights**\n", + "\n", + "When data is siloed, teams get an incomplete view of the citizen,\n", + "resulting in missed opportunities to improve the delivery of services that\n", + "impact the quality of life for their constituents.\n", + "\n", + "\n", + "**Lack of reliability**\n", + "\n", + "\n", + "Siloed systems result in data replication as teams spin up new data marts\n", + "to support their one-off use cases. Without a single source of truth, teams\n", + "struggle with data inconsistencies, which can result in inaccurate analysis\n", + "and model performance that is only compounded over time.\n", + "\n", + "**Lack of agility**\n", + "\n", + "Disjointed analytics tools and legacy infrastructure hinder the ability of\n", + "teams to conduct real-time analytics. Most data processing in the\n", + "\n", + "\n", + "**Lack of productivity**\n", + "\n", + "Data scientists and data analysts alike must have the right tool set to\n", + "collaboratively investigate, extract and report meaningful insights from\n", + "their data. Unfortunately, data silos lead to organizational silos, which make\n", + "collaboration inside an agency as well as between agencies very difficult.\n", + "With different groups of data teams leveraging their own coding and\n", + "analytical tools, communicating insights and working across teams —\n", + "let alone across agencies — is almost impossible. This lack of collaboration\n", + "can drastically limit the capabilities of any data analytics or AI initiative.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introducing the Lakehouse for Public Sector\n", + "\n", + "\n", + "The reason that the Databricks Lakehouse is\n", + "able to deliver the simplicity, flexibility and\n", + "speed that a government agency requires is\n", + "that it fundamentally reimagines the modern\n", + "data architecture. Databricks provides federal,\n", + "state and local agencies with a cloud-native\n", + "Lakehouse Platform that combines the best\n", + "of data warehouses and data lakes — to store\n", + "and manage all your data for all your analytics\n", + "workloads. 
With this modern architecture, agencies can federate all their data and democratize access for downstream use cases, empowering their teams to deliver on their mission objectives by unlocking the full potential of their data.

**Delivering real-time data insight in support of the mission**

- Fraud, Waste & Abuse

- Cybersecurity

- Medicaid Dashboards & Reporting

- Process Improvement

- Predictive Maintenance

- SCM & Demand Forecasting

- Smart Military/Sensor Data

- Military Health

- COVID Response/Decision Support

- Smart Cities/Connected Vehicles

- Citizen Engagement

- Data-Driven Decision-Making

-----

**Federate all of your agency’s data**

Any type of data can be stored because, like a data lake, the Databricks Lakehouse is built using the low-cost object storage supported by cloud providers. Leveraging this capability helps break down the data silos that hinder efforts to aggregate data for advanced analytics (e.g., predictive maintenance) or compute-intensive workloads like detecting cyber threats across billions of signals. Probably even more important is the ability of the lakehouse architecture to travel back in time, ensuring full audit compliance and high governance standards for analytics and AI.

**Power real-time decision-making**

Streaming use cases such as IoT analytics or disease spread tracking are simpler to support because the lakehouse uses Apache Spark™ as the data processing engine and Delta Lake as a storage layer. With Spark, you can toggle between batch and streaming workloads with just a line of code (see the sketch at the end of this section). With Delta Lake, native support for ACID transactions means that you can deploy streaming workloads without the overhead of common reliability and performance issues. These capabilities make real-time analytics possible.

**Unlock collaborative analytics for all personas**

The Databricks Lakehouse for Public Sector is your one-stop shop for all your analytics and AI. The platform includes a business intelligence capability — Databricks SQL — that empowers data analysts to query and run reports against all of an agency’s unified data. Databricks SQL integrates with BI tools like Tableau and Microsoft Power BI and complements any existing BI tools with a SQL-native interface, allowing data analysts and data scientists to query data directly within Databricks and build powerful dashboards.

-----

**Deliver on your mission with predictive insights**

In the same environment, data scientists can build, share and collaborate on machine learning models for advanced use cases like fraud detection or geospatial analytics. Additionally, MLflow, an open source toolkit for managing the ML lifecycle, is built into the Lakehouse so data scientists can manage everything in one place. Databricks natively supports Python, R, SQL and Scala so practitioners can work together with the languages and libraries of their choice, reducing the need for separate tools.
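Here is the sketch referenced above: the same Delta table consumed first as a batch job and then as a stream, with only the read call changing. It assumes a Databricks notebook where `spark` exists; the `main.iot.sensor_events` table and its columns are illustrative.

```python
# The "one line of code" toggle between batch and streaming on the same Delta table.
# Assumes a Databricks notebook (`spark` is defined); table and column names are illustrative.
from pyspark.sql import functions as F

def clean(df):
    """Identical transformation logic for batch and streaming DataFrames."""
    return (df.where(F.col("reading").isNotNull())
              .withColumn("event_ts", F.to_timestamp("event_ts")))

batch_df  = clean(spark.read.table("main.iot.sensor_events"))        # batch read
stream_df = clean(spark.readStream.table("main.iot.sensor_events"))  # streaming read

# Batch: one-off backfill into a cleaned table.
batch_df.write.mode("append").saveAsTable("main.iot.sensor_events_clean")

# Streaming: keep the same cleaned table continuously up to date.
(stream_df.writeStream
    .option("checkpointLocation", "/tmp/_checkpoints/sensor_events_clean")
    .toTable("main.iot.sensor_events_clean"))
```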
With these\n", + "capabilities, data teams can turn insights from real-world data into powerful\n", + "visualizations designed for machine learning. Visualizations can then be\n", + "turned into interactive dashboards to share insights with peers across\n", + "agencies, policymakers, regulators and decision-makers.\n", + "\n", + "\n", + "##### Customers That Innovate With Databricks Lakehouse for Public Sector\n", + "\n", + "Some of the top government agencies in the world turn to the\n", + "Databricks Lakehouse for Public Sector to bring analytics and AI-driven\n", + "automation and innovation to the communities they serve.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Cybersecurity\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Limited window of data**\n", + "Given the high cost of storage, most agencies retain only a few weeks of threat\n", + "data. This can be a real problem in scenarios where a perpetrator gains access\n", + "to a network but waits months before doing anything malicious. Without a long\n", + "historical record, security teams can’t analyze cyberattacks over long-term\n", + "horizons or conduct deep forensic reviews.\n", + "\n", + "##### Solution overview\n", + "\n", + "For government agencies that are ready to modernize their security data\n", + "infrastructure and analyze data at petabyte-scale more cost-effectively,\n", + "Databricks provides an open lakehouse platform that augments existing SIEMs\n", + "to help democratize access to data for downstream analytics and AI. Built\n", + "on Apache Spark and Delta Lake, Databricks is optimized to process large\n", + "volumes of streaming and historic data for real-time threat analysis and incident\n", + "response. Security teams can query threat data going years into the past in just\n", + "minutes and build ML models to detect new threat patterns and reduce false\n", + "positives. Additionally, Databricks created a Splunk-certified add-on to augment\n", + "Splunk for Enterprise Security (ES) for cost-efficient log and retention expansion.\n", + "\n", + "\n", + "Cyberattacks from bad actors and nation states are a huge and growing threat\n", + "to government agencies. Recent large-scale attacks like the ones on SolarWinds,\n", + "log4j, Colonial Pipeline and HAFNIUM highlight the sophistication and increasing\n", + "frequency of broad-reaching cyberattacks. Data breaches cost the federal\n", + "government more than $4 million per incident in 2021 and threaten national\n", + "security. 
Staying ahead of the next threat requires continuous monitoring of\n", + "security data from an agency’s entire attack surface before, during and after\n", + "an incident.\n", + "\n", + "##### Challenges\n", + "\n", + "**Scaling existing SIEM solutions**\n", + "Agencies looking to expand existing SIEM tools for today’s petabytes of data can\n", + "expect increased licensing, storage, compute and integration resources resulting\n", + "in tens of millions of dollars in additional costs per year.\n", + "\n", + "**Rules-based systems**\n", + "Many legacy SIEM tools lack the critical analytics capabilities — such as\n", + "advanced analytics, graph processing and machine learning — needed to detect\n", + "unknown threat patterns or deliver on a broader set of security use cases like\n", + "behavioral analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Detect Criminal](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n", + "[Threats Using DNS Analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n", + "\n", + "Detecting criminals and nation states through DNS analytics. In order to address\n", + "common cybersecurity challenges such as deployment complexity, tech\n", + "limitation and cost, security teams need a real-time data analytics platform that\n", + "can handle cloud scale, analyze data wherever it is, natively support streaming\n", + "and batch analytics, and have collaborative content development capabilities.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=5BRGqxq4iQw)**\n", + "\n", + "**Fighting Cyber Threats in Real Time**\n", + "Since partnering with Databricks, HSBC has reduced costs, accelerated threat\n", + "detection and response, and improved their security posture. Not only can\n", + "they process all of their required data, but they’ve also increased online query\n", + "retention from just days to months at petabyte scale. HSBC is now able to\n", + "execute 2-3x more threat hunts per analyst.\n", + "\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n", + "\n", + "Designed for cloud-scale security operations, the add-on provides Splunk\n", + "analysts with access to all data stored in the Lakehouse. Bidirectional pipelines\n", + "between Splunk and Databricks allow agency analysts to integrate directly into\n", + "Splunk visualizations and security workflows.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Predictive Maintenance\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Integrating unstructured data**\n", + "Equipment data doesn’t just come in the form of IoT data. Agencies can gather\n", + "rich unstructured signals like audio, visual (e.g., video inspections) and text\n", + "(e.g., maintenance logs). Most legacy data architectures are unable to integrate\n", + "structured and unstructured data sources.\n", + "\n", + "**Operationalizing machine learning**\n", + "Most agencies lack the advanced analytics tools needed to build models that\n", + "can predict potential equipment failures. 
Those that do typically have their\n", + "data scientists working in a siloed set of tools, resulting in unnecessary data\n", + "replication and inefficient workflows.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse is tailor-made for building IoT applications at scale.\n", + "With Databricks, agencies can easily manage large streaming volumes of small\n", + "files, with ACID transaction guarantees and reduced job fails compared to\n", + "traditional data warehouse architectures. Additionally, the Lakehouse is cloud\n", + "native and built on Apache Spark, so scaling for petabytes of data is not an issue.\n", + "With the Lakehouse, agencies can bring together all of their structured and\n", + "unstructured data with a unified set of tooling for data engineering, model building\n", + "and production rollout. With these capabilities, operations teams can quickly\n", + "detect and act on pending equipment failures before they affect performance.\n", + "\n", + "\n", + "Predictive maintenance is oftentimes associated with the manufacturing sector,\n", + "but in reality it extends far beyond the factory floor. Consider this for a moment:\n", + "the U.S. Government operates a fleet of over [640,000 vehicles](https://www.government-fleet.com/301786/federal-vs-state-local-fleets) including public\n", + "buses, postal delivery trucks, drones, helicopters and jet fighters. Many of these\n", + "vehicles — like multimillion-dollar aircraft — contain sensors that generate\n", + "massive amounts of data on the use and conditions of various components. And\n", + "it’s not just vehicles. Modern public utilities stream data through connected IoT\n", + "devices. All of this data can be analyzed to identify the root cause of a failure\n", + "and predict future maintenance, helping to avoid costly repairs and critical\n", + "assets from being out of service.\n", + "\n", + "##### Challenges\n", + "\n", + "**Managing IoT data at scale**\n", + "With billions of sensors generating information, most data systems are unable to\n", + "handle the sheer volume of data. 
Before agencies can even start analyzing their data, legacy data warehouse–based tools require preprocessing of data, making real-time analysis impossible.

-----

##### How to get started

**Solution Accelerator: Predictive Maintenance**
Learn how to ingest real-time IoT data from field devices, perform complex time series processing in Delta Lake and leverage machine learning to build predictive maintenance models.

[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)

[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)

[Part 3: Using ML to predict maintenance](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)

[Watch the Demo: Predictive Maintenance on Azure Databricks](https://vimeo.com/580864758/5a5bc42bb9)

##### Customer story

**[LEARN MORE](https://www.tallan.com/blog/client-stories/dc-water/)**

**Protecting the Water Supply for 700,000 Residents**
Utilizing machine learning for predictive analytics to help stop water main breaks before they occur, potentially saving hundreds of thousands of dollars in repairs while reducing service interruption.

-----

##### Reference architecture

[Diagram: weather sensor readings and wind turbine telematics (semi-structured) plus maintenance logs (unstructured) stream in real time into the Databricks Lakehouse Platform. Raw data is appended to the Bronze layer, merged and joined into granular and enriched readings in the Silver layer, then aggregated into hourly readings and used to build the predictive maintenance model in the Gold layer, feeding real-time dashboards for optimizing performance.]

-----

###### USE CASE:
## Fraud Detection

##### Overview

According to [McKinsey & Company](https://www.mckinsey.com/~/media/McKinsey/Industries/Public%20Sector/Our%20Insights/Cracking%20down%20on%20government%20fraud%20with%20data%20analytics/Cracking-down-on-government-fraud-with-data-analytics-vF.pdf), more than half of the federal government’s monetary losses to fraud, waste and abuse go undetected and total tens of billions of dollars. Financial fraud comes in many forms, from individuals taking advantage of relief programs to complex networks of criminal organizations working together to falsify medical claims and rebate forms.
Investigative teams\n", + "hoping to stay ahead of fraudsters need advanced analytics techniques so they\n", + "can detect anomalous behavior buried in a sea of data.\n", + "\n", + "##### Challenges\n", + "\n", + "**Lack of machine learning**\n", + "A rules-based approach is not enough. Bad actors are getting more and more\n", + "sophisticated in how they take advantage of government programs, necessitating\n", + "an AI-driven approach.\n", + "\n", + "**Unreliable data**\n", + "Getting high-quality, clean data and maintaining a rich feature store is critical\n", + "for identifying ever-evolving fraud patterns while maintaining a strict record of\n", + "previous data points.\n", + "\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables teams to develop complex ML models with\n", + "high governance standards and bridge the gap between data science and\n", + "technology to address the challenge of analyzing large volumes of data at scale\n", + "— 40 billion financial transactions a year are made in the United States alone.\n", + "Additionally, Databricks makes it possible to combine modern AI techniques\n", + "with the legacy rules-based methods that underpin current approaches to fraud\n", + "detection all within a common and efficient Spark-based orchestration engine.\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Fraud Detection](https://databricks.com/blog/2021/01/19/combining-rules-based-and-ai-models-to-combat-financial-fraud.html)\n", + "\n", + "Due to an ever-changing landscape, building a financial fraud detection\n", + "framework often goes beyond just creating a highly accurate machine learning\n", + "model. Oftentimes it involves a complex-decision science setup that combines\n", + "a rules engine with a need for a robust and scalable machine learning platform.\n", + "In this example, we show how to build a holistic fraud detection solution on\n", + "Databricks using data from a financial institution.\n", + "\n", + "\n", + "**Analytics at scale**\n", + "Training complex ML models with hundreds of features on gigabytes of\n", + "structured, semi-structured and unstructured data can be impossible without a\n", + "highly scalable and distributed infrastructure.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=Ca1MMNpBSHM)**\n", + "\n", + "**Identifying Financial Fraud at Scale**\n", + "Processes hundreds of billions of market events\n", + "per day on the Databricks Lakehouse and uses\n", + "the power of machine learning to identify illicit\n", + "activity in near real-time.\n", + "\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Money Laundering\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "Approximately [$300 billion](https://home.treasury.gov/system/files/136/2018NMLRA_12-18.pdf) is laundered through the United States each year,\n", + "and with criminal organizations — both at home and abroad — implementing\n", + "increasingly sophisticated methods for laundering funds, it’s getting harder to\n", + "stop. 
While the federal government continues to apply pressure on the financial\n", + "sector through heightened regulation, more is needed to combat laundering.\n", + "Modern AI techniques such as graph analytics and computer vision can be\n", + "used to process different types of structured (e.g., financial transactions) and\n", + "unstructured (e.g., real estate images) data and identify illicit behavior. This\n", + "allows investigative teams to automate labor-intensive activities like confirming\n", + "a residential address or reviewing transaction histories, and instead dig into\n", + "priority threats.\n", + "\n", + "##### Challenges\n", + "\n", + "**Complex data science**\n", + "Modern anti-money laundering (AML) practices require multiple ML capabilities\n", + "such as entity resolution, computer vision and graph analytics on entity\n", + "metadata, which is typically not supported by any one data platform.\n", + "\n", + "\n", + "**Time-consuming false positives**\n", + "Any reported suspicious activity must be investigated manually to ensure\n", + "accuracy. Many legacy solutions generate a high number of false positives or fail\n", + "to identify unknown patterns, resulting in wasted effort by investigators.\n", + "\n", + "##### Solution overview\n", + "\n", + "AML solutions face the operational burden of processing billions of transactions\n", + "a day. The Databricks Lakehouse Platform combines the low storage cost\n", + "benefits of cloud data lakes with the robust transaction capabilities of data\n", + "warehouses, making it the ideal foundation for building AML analytics at massive\n", + "scale. At the core of Databricks is Delta Lake, which can store and combine\n", + "both unstructured and structured data to build entity relationships; moreover,\n", + "Databricks Delta Engine provides efficient access using the new Photon compute\n", + "to speed up BI queries on tables spanning billions of transactions. On top of\n", + "these capabilities, ML is a first-class citizen in the Lakehouse, which means\n", + "analysts and data scientists do not waste time subsampling or moving data to\n", + "share dashboards and stay one step ahead of bad actors.\n", + "\n", + "\n", + "**Model transparency**\n", + "Although AI can be used to address many money laundering use cases, the lack\n", + "of transparency in the development of ML models offers little explainability,\n", + "inhibiting broader adoption.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator: Modern](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n", + "[Anti-Money Laundering Techniques](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n", + "\n", + "\n", + "Lakehouse Platform leveraging a series of next-gen machine learning techniques\n", + "including NLP, computer vision, entity resolution and graph analytics. This\n", + "approach helps teams better adapt to the reality of modern laundering practices.\n", + "\n", + "\n", + "Current anti-money laundering practices bear little resemblance to those of the\n", + "last decade. In today’s digital world, financial institutions are processing billions\n", + "of transactions daily, increasing the surface area of money laundering. 
With this\n", + "accelerator, we demonstrate how to build a scalable AML solution on the\n", + "\n", + "\n", + "##### Reference architecture\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Entity Analytics\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**No machine learning capabilities**\n", + "Entity resolution typically relies on basic rules-based logic to compare records\n", + "(e.g., matching on name and address), but with messy, large volumes of data,\n", + "advanced analytics is needed to improve accuracy and accelerate efforts.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse is an ideal platform for building entity analytics at\n", + "scale. With support for a wide range of data formats and a rich and extensible\n", + "set of data transformation and ML capabilities, Databricks enables agencies to\n", + "bring together all of their data in a central location and move beyond simple\n", + "rules-based methods for entity resolution. Data teams can easily explore\n", + "different machine learning techniques like natural language processing,\n", + "classification and graph analytics to automate entity matching. And one-click\n", + "provisioning and deprovisioning of cloud resources makes it easy for teams to\n", + "cost-effectively allocate the necessary compute resources for any size job so\n", + "they can uncover findings faster.\n", + "\n", + "\n", + "Entity analytics aims to connect disparate data sources to build a full view of\n", + "a person or an organization. This has many applications in the public sector,\n", + "such as fraud detection, national security and population health. For example,\n", + "Medicare fraud teams need to understand which prescriptions are filled, claims\n", + "filed and facilities visited across geographies to uncover suspicious behavior.\n", + "Before teams can even look for suspicious behavior, they must first determine\n", + "which records are associated. In the United States, nearly 50,000 people share\n", + "the name John Smith (and there are thousands of others with similar names).\n", + "Imagine trying to identify the right John Smith for this type of analysis. That’s no\n", + "easy task.\n", + "\n", + "##### Challenges\n", + "\n", + "**Disjointed data**\n", + "Managing complex and brittle ETL pipelines in order to cleanse and join data\n", + "across siloed systems and data stores.\n", + "\n", + "\n", + "**Compute intensive**\n", + "Identifying related entities across population-level data sets requires massive\n", + "compute power that far outstrips legacy on-prem data architectures.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Virtual Workshop: Entity Analytics](https://drive.google.com/file/d/1wGGT9Fn5EZF5Rgrabuttt1xdua5csrBa/view?usp=sharing)\n", + "\n", + "Learn from Databricks experts on how entity analytics is being deployed\n", + "in the public sector and watch a demo that shows how to use ML to link\n", + "payments and treatments across millions of records in a public CMS data set.\n", + "\n", + "[Solution Accelerator:](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n", + "[Machine Learning-Based Item Matching](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n", + "\n", + "While focused on retail, this accelerator has applications for any organization\n", + "working on entity matching, especially as it relates to items that might be stored\n", + "across locations. 
In this notebook, we demonstrate how to use machine learning\n", + "and the Databricks Lakehouse Platform to resolve differences between product\n", + "definitions and descriptions, and determine which items are likely pairs and\n", + "which are distinct across disparate data sets.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na21/entity-resolution-using-patient-records-at-cmmi)**\n", + "\n", + "In this talk, NewWave shares the specifics on CMS’s entity resolution use case,\n", + "the ML necessary for this data and the unique uses of Databricks in providing\n", + "this capability.\n", + "\n", + "##### Sample workflow\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Geospatial Analytics\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "**Broad range of analytics capabilities**\n", + "Enterprises require a diverse set of data applications — including SQL-based\n", + "analytics, real-time monitoring, data science and machine learning — to support\n", + "geospatial workloads given the diverse nature of the data and use cases.\n", + "\n", + "##### Solution overview\n", + "\n", + "With Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\n", + "workloads, as it provides a single source of truth for all types of structured,\n", + "unstructured, streaming and batch data, enabling seamless spatio-temporal\n", + "unification and cross-querying with tabular and raster-based data. Built on\n", + "Apache Spark, the Lakehouse easily scales for data sets consisting of billions\n", + "of rows of data with distributed processing in the cloud. To expand on the core\n", + "capabilities of the Lakehouse, Databricks has introduced the Mosaic library,\n", + "an extension to the Apache Spark framework, built for fast and easy processing\n", + "of large geospatial data sets. Popular frameworks such as Apache Sedona or\n", + "GeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\n", + "Lakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\n", + "to support all types of geospatial use cases.\n", + "\n", + "\n", + "Every day billions of handheld and IoT devices, along with thousands of\n", + "airborne and satellite remote sensing platforms, generate hundreds of exabytes\n", + "of location-aware data. This boom of geospatial big data combined with\n", + "advancements in machine learning is enabling government agencies to develop\n", + "new capabilities. The potential use cases for geospatial analytics and AI touch\n", + "every part of the government, including disaster recovery (e.g., flood/earthquake\n", + "mapping), defense and intel (e.g., detecting threats using drone footage),\n", + "infrastructure (e.g., public transportation planning), civilian safety (e.g., crime\n", + "prediction), public health (e.g., disease spread tracking), and much more. 
Every\n", + "agency at the state and federal level needs to consider how they can tap into\n", + "geospatial data.\n", + "\n", + "##### Challenges\n", + "\n", + "**Massive volumes of geospatial data**\n", + "With the proliferation of low-cost sensor arrays, GPS technologies and high-resolution imaging, organizations are collecting tens of TBs of geospatial data\n", + "daily, outpacing their ability to store and process this data at scale.\n", + "\n", + "\n", + "**Compute-intensive spatial workloads**\n", + "Geospatial data is complex in structure, with various formats not well suited for\n", + "legacy data warehouses, as well as being compute intensive, with geospatial-specific transformations and queries requiring hours and hours of compute.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "[Mosaic for Geospatial Analytics](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "\n", + "Build a Lakehouse to support all of your geospatial analytics and AI use cases\n", + "with the Mosaic library. Mosaic provides a number of capabilities including easy\n", + "conversion between common spatial data encodings, constructors to easily\n", + "generate new geometries from Spark native data types, many of the OGC SQL\n", + "standard ST_ functions implemented as Spark Expressions for transforming,\n", + "aggregating and joining spatial data sets, and optimizations for performing point-in-polygon joins using an approach we co-developed with Ordnance Survey —\n", + "all provided with the flexibility of a Scala, SQL or Python API.\n", + "\n", + "[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n", + "\n", + "Learn how to build powerful geospatial insights and visualizations with a\n", + "Lakehouse for all your geospatial data processing, analytics and AI.\n", + "\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**\n", + "\n", + "**Analyzing Flight Data to Improve Aviation**\n", + "To help airlines better serve their millions of passengers, USDOT built a\n", + "modern analytics architecture on Databricks that incorporates data such as\n", + "weather, flight, aeronautical and surveillance information. 
With this new\n", + "platform, they reduced compute costs by 90% and can now power use cases\n", + "such as predicting air cargo traffic patterns, flight delays and the financial\n", + "impact of flight cancellations.\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://www.youtube.com/watch?v=LP198QMdDbY&t=1070s)**\n", + "\n", + "**Customer Story: Flood Prediction With Machine Learning**\n", + "In an effort to improve the safety of civil projects, Stantec built a machine\n", + "learning model on Databricks leveraging large volumes of weather and geological\n", + "data — oftentimes consisting of trillions of data points — to predict the impact\n", + "of flash floods on various regions and adjust civil planning accordingly.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Reference architecture\n", + "\n", + "Mosaic Kepler Magics\n", + "Geometry Display Functions\n", + "for Map Display\n", + "\n", + "ESRI Java API for\n", + "Geometry Operations\n", + "\n", + "\n", + "Built-In Indexing\n", + "System Support\n", + "\n", + "\n", + "JTS Java API for\n", + "Geometry Operations\n", + "\n", + "\n", + "-----\n", + "\n", + "###### USE CASE:\n", + "## Public Health Management\n", + "\n", + "##### Overview\n", + "\n", + "\n", + "In their lifetime, every human is expected to generate a million gigabytes of\n", + "health data spanning electronic health records, medical images, claims, wearable\n", + "data, genomics and more. This data is critical to understanding the health of\n", + "the individual, but when aggregated and analyzed across large populations,\n", + "government agencies can glean important insights like disease trends, the\n", + "impact of various treatment guidelines and the effectiveness of resources. By\n", + "adding in [Social Determinants of Health (SDOH)](https://databricks.com/blog/2022/04/18/increasing-healthcare-equity-with-data.html) data — such as geographical\n", + "location, income level, education, housing — agencies can better identify\n", + "underserved communities and the critical factors that contribute to positive\n", + "health outcomes.\n", + "\n", + "##### Challenges\n", + "\n", + "**Rapidly growing health data**\n", + "Healthcare data is growing exponentially. Unfortunately, legacy on-premises data\n", + "architectures are complex to manage and too costly to scale for populationscale analytics.\n", + "\n", + "\n", + "**Complexities of ML in healthcare**\n", + "The legacy analytics platforms that underpin healthcare lack the robust data\n", + "science capabilities needed for predictive health use cases like disease risk\n", + "scoring. There’s also the challenge of managing reproducibility, which is critical\n", + "when building ML models that can impact patient outcomes.\n", + "\n", + "##### Solution overview\n", + "\n", + "The Databricks Lakehouse enables public health agencies to bring together all\n", + "their research and patient data in a HIPAA-certified environment and marry it\n", + "with powerful analytics and AI capabilities to deliver real-time and predictive\n", + "insights at population scale. The Lakehouse eliminates the need for legacy\n", + "data architectures, which have historically inhibited innovation in patient care\n", + "by creating data silos and making advanced analytics difficult. 
Databricks-led\n", + "open source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\n", + "that make it easy to ingest and prepare healthcare-specific data modalities for\n", + "downstream analytics.\n", + "\n", + "\n", + "**Fragmented patient data**\n", + "It is widely accepted that over 80% of medical data is unstructured, yet most\n", + "organizations still focus their attention on data warehouses designed to only\n", + "support structured data and SQL-based analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### How to get started\n", + "\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "[NLP for Healthcare](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n", + "\n", + "Our joint solutions with John Snow Labs bring together the power of Spark NLP\n", + "for Healthcare with the collaborative analytics and AI capabilities of Databricks.\n", + "Informatics teams can ingest raw unstructured medical text files into Databricks,\n", + "extract meaningful insights using natural language processing techniques,\n", + "and make the data available for downstream analytics. We have specific NLP\n", + "solutions for [extracting oncology insights](https://databricks.com/solutions/accelerators/nlp-oncology) from lab reports, automating the de-identification of PHI and [identifying adverse drug events](https://databricks.com/blog/2022/01/17/improving-drug-safety-with-adverse-event-detection-using-nlp.html).\n", + "\n", + "[Solution Accelerator:](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n", + "[Disease Risk Prediction](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n", + "\n", + "One of the most powerful tools for identifying patients at risk for a chronic\n", + "condition is the analysis of real world data (RWD). This Solution Accelerator\n", + "notebook provides a template for building a machine learning model that\n", + "assesses the risk of a patient for a given condition within a given window of time\n", + "based on a patient’s encounter history and demographics information.\n", + "\n", + "\n", + "[Demo: Real-Time](https://www.youtube.com/watch?v=_ltDF2obiSc)\n", + "[COVID-19 Contact Tracing](https://www.youtube.com/watch?v=_ltDF2obiSc)\n", + "\n", + "Databricks COVID-19 surveillance solution takes a data-driven approach to\n", + "adaptive response, applying predictive analytics to COVID-19 data sets to\n", + "help drive more effective shelter-in-place policies.\n", + "\n", + "##### Customer story\n", + "\n", + "**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n", + "\n", + "**From Vaccine Management to ICU Planning**\n", + "During the pandemic, the Chesapeake Regional Information System for our\n", + "Patients implemented a modern data architecture on Databricks to address\n", + "critical reporting needs. 
This allowed them to analyze 400 billion data points\n", + "\n", + "for innovative use cases like real-time disease spread tracking, vaccine\n", + "distribution and prioritizing vulnerable populations.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Conclusion\n", + "\n", + "Today, data is at the core of how government agencies operate and AI is at the\n", + "\n", + "forefront of driving innovation into the future. The Databricks Lakehouse for\n", + "\n", + "Public Sector enables government agencies at the federal, state and local level\n", + "\n", + "to harness the full power of data and analytics to solve strategic challenges and\n", + "\n", + "make smarter decisions that improve the safety and quality of life of all citizens.\n", + "\n", + "Get started with a free trial of Databricks Lakehouse and start building better\n", + "\n", + "data applications today.\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "###### Contact us for a personalized demo databricks.com/contact\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n", + "unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\n", + "mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf2024-09-19T16:57:20Z
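The Disease Risk Prediction accelerator described above frames risk scoring as a supervised classification problem over a patient's encounter history and demographics. As a rough, hedged illustration of that framing only — not the accelerator's actual code — the sketch below trains a baseline classifier on a prepared feature table; the column names (`label` plus feature columns) and the choice of scikit-learn logistic regression are assumptions made for this example.

```python
# Minimal sketch of a condition-risk classifier over encounter-derived features.
# Feature and label column names are hypothetical stand-ins, not the accelerator's schema.
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score


def train_risk_model(features: pd.DataFrame, label_col: str = "label"):
    """Fit a baseline risk model and report held-out AUC."""
    X = features.drop(columns=[label_col])
    y = features[label_col]
    X_train, X_test, y_train, y_test = train_test_split(
        X, y, test_size=0.2, stratify=y, random_state=42
    )
    model = LogisticRegression(max_iter=1000)
    model.fit(X_train, y_train)
    auc = roc_auc_score(y_test, model.predict_proba(X_test)[:, 1])
    return model, auc
```

In practice the accelerator works over much richer RWD feature pipelines; the point of the sketch is only the shape of the problem: a labeled feature row per patient and a probability of developing the condition within the prediction window.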
###### EBOOK\n", + "\n", + "# Lakehouse for Manufacturing\n", + "\n", + "###### Build a connected customer experience, optimize operations and unify your data ecosystem\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "Introduction .......................................................................................................................... **3**\n", + "\n", + "Manufacturing Transformation Trends .............................................................................. **5**\n", + "\n", + "Manufacturing Data Challenges ......................................................................................... **9**\n", + "\n", + "Databricks Lakehouse for Manufacturing ....................................................................... **10**\n", + "\n", + "Building Innovative Solutions on the Lakehouse ............................................................. **12**\n", + "\n", + "**SOLUTION:** Part-Level Demand Forecasting ....................................................................... 12\n", + "\n", + "**SOLUTION:** Overall Equipment Effectiveness & KPI Monitoring ............................................. 14\n", + "\n", + "**SOLUTION:** Digital Twins ................................................................................................... 15\n", + "\n", + "**SOLUTION:** Computer Vision ............................................................................................ 16\n", + "\n", + "An Ecosystem on the Lakehouse for Manufacturing ...................................................... **17**\n", + "\n", + "**SOLUTION:** Avanade Intelligent Manufacturing .................................................................. **18**\n", + "\n", + "**SOLUTION:** DataSentics Quality Inspector ........................................................................ **18**\n", + "\n", + "SOLUTION: Tredence Predictive Supply Risk Management ................................................. **19**\n", + "\n", + "Leading Manufacturing Companies That Choose Us ................................................... **20**\n", + "\n", + "\n", + "-----\n", + "\n", + "## Introduction\n", + "\n", + "Market conditions in manufacturing are more challenging than ever. Operating margins\n", + "and growth are impacted by the rising cost of labor, materials, energy and transportation, all\n", + "peaking at the same time. Disruptive events in the supply chain are increasing in frequency\n", + "and intensity, leading to significant revenue losses and damaged brand reputation.\n", + "\n", + "Effective acquisition and retention of next-generation talent is a considerable issue for\n", + "manufacturers. There are more jobs in the industry than there are people to do them, further\n", + "compounding the problem of slower than expected industrial productivity growth over the\n", + "last 15 years. The industry is also one of the largest consumers of energy, and faces a direct\n", + "challenge of transforming operations to be more sustainable as governments are prioritizing\n", + "net-zero policies that require a step change in energy efficiency and transition to low-carbon\n", + "energy sources.\n", + "\n", + "The manufacturing industry generates massive amounts of new data every day — estimated\n", + "to be two to four times more in size than in industries such as communications, media,\n", + "retail and financial services. 
This explosion of data has opened the door for the global\n", + "manufacturing ecosystem to boost productivity, quality, sustainability and growth beyond\n", + "what was previously thought possible.\n", + "\n", + "Unfortunately, legacy data warehouse-based architectures weren’t built for the massive\n", + "volumes and type of data coming in through today’s factories, products, processes and\n", + "workers, let alone to support the advanced AI/ML use cases required to meet the customer\n", + "expectations of shorter lead times, reliable delivery and smarter products.\n", + "\n", + "\n", + "-----\n", + "\n", + "For that, companies need to adopt a modern data architecture that provides the speed, scale and\n", + "collaboration needed by broad teams of data engineers, data scientists, and analysts. Manufacturers need\n", + "a comprehensive data platform that can not only handle massive volumes of data, but effectively and\n", + "seamlessly operationalize the value from data, analytics and AI.\n", + "\n", + "This is achieved by:\n", + "\n", + "Removing data silos by placing all data, regardless of type or frequency, in a single, open\n", + "architecture — including unstructured data from sensors, telemetry, natural language logs,\n", + "videos and images — helping you to gain end-to-end visibility into your business\n", + "\n", + "Ensuring your data is “always on” so that the freshest and highest quality data is available\n", + "for the full spectrum of enterprise analytics and AI/ML use cases, allowing you to drive IT-OT convergence\n", + "\n", + "Having a comprehensive open architecture so IT and data teams can move with agility\n", + "to bring AI and ML to where it’s needed, when it’s needed, including in connectivity-constrained environments\n", + "\n", + "Maintaining fine-grained governance and access control on your data assets, protecting\n", + "\n", + "sensitive intellectual property and customer data\n", + "\n", + "The Databricks Lakehouse for Manufacturing does just this. It’s a comprehensive approach that empowers\n", + "teams in the industry to collaborate and innovate around data, analytics and AI. It eliminates the technical\n", + "limitations of legacy technologies and gives data teams the ability to drive deeper, end-to-end insight\n", + "into supply chains, automate processes to reduce costs and grow productivity, and achieve sustainable\n", + "transformation for a more prosperous future. Welcome to the Lakehouse for Manufacturing.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Manufacturing Transformation Trends\n", + "\n", + "\n", + "The future of manufacturing is smart, sustainable and service oriented. Today’s\n", + "forward-thinking leaders are preparing the foundation they need to support that\n", + "future by leveraging fast and connected data from all corners of the enterprise.\n", + "There are four key trends driving transformation in manufacturing:\n", + "\n", + "**Boosting industrial productivity through automation**\n", + "\n", + "A spike in labor costs, as well as the cost of energy and materials, puts significant\n", + "pressure on operating margins. At the same time, industrial productivity has\n", + "plateaued — it is at the same level today as it was in the late 2000s. 
In the face\n", + "of these macro challenges and economic uncertainty, there has never been a\n", + "more burning need to reduce costs and improve productivity through greater\n", + "visibility and automation.\n", + "\n", + "The industry has made strides in collecting data from machines and performing\n", + "predictive analytics on sensor readings, with 47% of manufacturers citing the\n", + "use of predictive maintenance to reduce operational costs with considerable\n", + "upside ahead.\n", + "\n", + "However, there is an entirely different class of unstructured data in the form of\n", + "images, videos and LiDAR that is opening the door to game-changing automation\n", + "in quality inspection, flow optimization and production scheduling. Historically,\n", + "these critical processes have depended on manual and visual inspection of\n", + "products and operations, which is resource intensive and less accurate than\n", + "ML-driven computer vision techniques. This untapped data and capability\n", + "is allowing manufacturers to deliver higher product quality and deliver on\n", + "production demands using fewer resources. Andrew Ng, a machine learning\n", + "\n", + "\n", + "pioneer, rightly describes the massive opportunity for these technologies in\n", + "his quote: “It is incumbent on every CEO in any manufacturing or industrial\n", + "automation company to figure out how to make deep learning technology work\n", + "for your business.”\n", + "\n", + "**CUSTOMER STORY SPOTLIGHT:**\n", + "##### Corning\n", + "\n", + "#### $2 million in cost avoidance through \n", + "\n", + "manufacturing upset event reduction\n", + "\n", + "**Driving Better Efficiency in Manufacturing Process With ML**\n", + "\n", + "Corning has been one of the world’s leading innovators in materials science for\n", + "\n", + "nearly 200 years. Delivering high-quality products is a key objective across the\n", + "\n", + "company’s manufacturing facilities around the world, and it’s always on a mission\n", + "\n", + "to explore how ML can help deliver on that goal. Databricks has been central\n", + "\n", + "to the company’s digital transformation, as it provides a simplified and unified\n", + "\n", + "platform where teams can centralize all data and ML work. Now, they can train\n", + "\n", + "models, register them in MLflow, generate all additional artifacts — like exported\n", + "\n", + "formats — and track them in the same place as the base model.\n", + "\n", + "[LEARN MORE](https://www.databricks.com/blog/2023/01/05/how-corning-built-end-end-ml-databricks-lakehouse-platform.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Gaining end-to-end operations and**\n", + "**supply chain visibility**\n", + "\n", + "Modern customer expectations are forcing manufacturers to focus on more\n", + "customer-centric KPIs: quality, on-time commitments and speed of delivery.\n", + "That’s not to say that asset and labor efficiency are less important — however,\n", + "with customer expectations of shorter lead times and more reliable delivery,\n", + "the success measures in manufacturing are shifting to a mantra of “measure\n", + "what your customer values.”\n", + "\n", + "High-performing manufacturers that embed this deep into their operational\n", + "playbook also perform best on productivity and ROIC growth results, as\n", + "evidenced in a recent study by the World Economic Forum and the International\n", + "Centre of Industrial Transformation. The problem? 
In a post-pandemic world,\n", + "operations and supply chains are persistently constrained, with increasing\n", + "disruptions, spiraling costs and unpredictable performance. The business\n", + "impact is considerable — studies have shown that a 30-day disruption can\n", + "reduce EBITDA by 5% and impact annual revenue by as much as 20%.\n", + "\n", + "Manufacturing companies need to be able to deliver on customer expectations,\n", + "commitments and service levels, all while lowering costs and increasing\n", + "productivity. Manufacturers need an enterprise data platform that can provide\n", + "real-time visibility into order flows, production processes, supplier performance,\n", + "inventory and logistics execution, breaking down departmental silos to maximize\n", + "customer responsiveness, improve manufacturing agility and boost performance.\n", + "\n", + "\n", + "**Transforming your business model through**\n", + "**tech-fueled services**\n", + "\n", + "Servitization, defined as the process of building revenue streams from services,\n", + "has been trending for some time. The adaptation of the business model has\n", + "been considerably profitable: on average, services account for ~30% of industrial\n", + "manufacturing companies but contribute 60%+ of profit.\n", + "\n", + "In aftersale services, a clear customer preference for business outcome-based\n", + "offerings has emerged in almost every corner of the manufacturing industry.\n", + "The use of data, analytics and AI is foundational to delivering more personalized\n", + "customer outcomes, proactive field service delivery and differentiated missioncritical applications to their customers.\n", + "\n", + "With greater autonomy, connectivity and sensorization, manufacturers operate\n", + "in a paradigm where their products generate more and more data every second,\n", + "opening up numerous new addressable opportunities for value creation. The\n", + "business of manufacturing is no longer linear, and manufacturers will need to\n", + "reimagine their businesses to go beyond merely providing the primary unit of\n", + "production — the next SKU, machine, vehicle or airplane — and leverage this data\n", + "to operate a platform business with higher growth, stickier revenue streams and\n", + "greater resilience to demand shocks.\n", + "\n", + "\n", + "-----\n", + "\n", + "**CUSTOMER STORY SPOTLIGHT:**\n", + "##### Rolls-Royce\n", + "\n", + "**Aerospace Goes Green With Data and AI**\n", + "\n", + "While most people think of luxury cars when they hear “Rolls-Royce,” the\n", + "\n", + "Civil Aerospace branch is its own company, having separated from the car\n", + "\n", + "manufacturing arm in 1971. The now wildly successful manufacturer of commercial\n", + "\n", + "airplane engines is a leader in its industry for innovation. Today, Rolls-Royce\n", + "\n", + "\n", + "_“We employed Databricks to optimize inventory planning using data and analytics,_\n", + "_positioning parts where they need to be, based on the insight we gain from our_\n", + "_connected engines in real time and usage patterns we see in our service network. 
This_\n", + "_has helped us minimize risks to engine availability, reduce lead times for spare parts_\n", + "_and drive more efficiency in stock turns — all of this enables us to deliver TotalCare,_\n", + "_the aviation industry’s leading Power-by-the-Hour (PBH) maintenance program.”_\n", + "\n", + "**S T U A R T H U G H E S**\n", + "\n", + "Chief Information and Digital Officer\n", + "Rolls-Royce Civil Aerospace\n", + "\n", + "\n", + "obtains information directly from the airlines’ engines and funnels it into the\n", + "\n", + "Databricks platform. This gives the company insights into how the engines are\n", + "\n", + "performing and ways to improve maintenance schedules, translating to less\n", + "\n", + "downtime, delays, and rerouting — all of which reduce carbon footprint.\n", + "\n", + "[LEARN MORE](https://www.wired.com/sponsored/story/how-tech-is-helping-to-save-the-world/)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Driving a more sustainable approach**\n", + "**to manufacturing**\n", + "\n", + "Global efforts on reducing greenhouse gas (GHG)\n", + "emissions are accelerating, with over 70 countries\n", + "representing more than 75% of global emissions\n", + "having signed agreements to reach net-zero\n", + "emissions by 2050. Manufacturing-centric sectors\n", + "are critical to achieving net-zero sustainability\n", + "commitments around the world, as they represent\n", + "over 50% of global energy consumption and\n", + "contribute to ~25% of global emissions.\n", + "\n", + "Those at the forefront of data, analytics and\n", + "AI are setting science-based targets and are\n", + "driving favorable sustainability outcomes today\n", + "by deriving better insights from their operations,\n", + "supply chains and the outcomes that their\n", + "products generate for their end customers.\n", + "\n", + "\n", + "**CUSTOMER STORY SPOTLIGHT:**\n", + "##### Shell\n", + "\n", + "**Delivering Innovative Energy Solutions for a Cleaner World**\n", + "\n", + "\n", + "Shell has been at the forefront of creating a cleaner tomorrow by investing in digital\n", + "\n", + "technologies to tackle climate change and become a net-zero emissions energy\n", + "\n", + "business. Across the business, they are turning to data and AI to improve operational\n", + "\n", + "efficiencies, drive customer engagement, and tap into new innovations like renewable\n", + "\n", + "energy. Hampered by large volumes of data, Shell chose Databricks to be one of\n", + "\n", + "the foundational components of its Shell.ai platform. Today, Databricks empowers\n", + "\n", + "hundreds of Shell’s engineers, scientists and analysts to innovate together as part of\n", + "\n", + "their ambition to deliver cleaner energy solutions more rapidly and efficiently.\n", + "\n", + "[LEARN MORE](https://www.google.com/url?q=https://www.databricks.com/customers/shell&sa=D&source=editors&ust=1679097620349908&usg=AOvVaw00lb46oTfGRpOREXOI1Ue3)\n", + "\n", + "_“Shell has been undergoing a digital transformation as part of our ambition to deliver more_\n", + "_and cleaner energy solutions. As part of this, we have been investing heavily in our data lake_\n", + "_architecture. Our ambition has been to enable our data teams to rapidly query our massive_\n", + "_data sets in the simplest possible way. The ability to execute rapid queries on petabyte_\n", + "_scale data sets using standard BI tools is a game changer for us. 
Our co-innovation_\n", + "_approach with Databricks has allowed us to influence the product road map, and we are_\n", + "_excited to see this come to market.”_\n", + "\n", + "\n", + "### Millions\n", + "of dollars saved in potential engine repair costs\n", + "\n", + "### 250\n", + "data team members supporting 160+ high-value use cases\n", + "\n", + "### 9x\n", + "faster – 5 minutes to validate a label, reduced from 45 minutes\n", + "\n", + "\n", + "**D A N I E L J E A V O N S**\n", + "General Manager – Advanced Analytics CoE\n", + "\n", + "Shell\n", + "\n", + "\n", + "-----\n", + "\n", + "## Manufacturing Data Challenges\n", + "\n", + "\n", + "**Massive unstructured/OT data volumes**\n", + "\n", + "The industry is seeing immense growth in data volumes: much of this massive\n", + "growth is due to semi-structured and unstructured data from connected workers,\n", + "buildings, vehicles and factories. This growth in multi-modal data from IoT sensors,\n", + "process historians, product telemetry, images, cameras and perception systems\n", + "has outpaced legacy data warehouse-centric technologies. On-prem and cloud\n", + "data warehouse tech-based architectures are too complex and too costly for the\n", + "large and heterogeneous data sets prevalent in the industry.\n", + "\n", + "**Driving IT-OT convergence**\n", + "\n", + "The success and pace of data modernization efforts in manufacturing is so often\n", + "muted by critical data being stuck in multiple closed systems and proprietary\n", + "formats, making it difficult and cost-prohibitive to extract the full potential of IT\n", + "and OT data sets. In addition, data quality issues such as outdated or inaccurate\n", + "data can often lead to a disjointed and incomplete view of customers, operations\n", + "and assets. For years, companies have lacked a common foundation for complex\n", + "and heterogeneous manufacturing data — from IoT-generated data streams to\n", + "financial metrics stored in ERP applications — and it has impacted their ability to\n", + "provide the freshest, highest-quality and most complete data for analytics.\n", + "\n", + "\n", + "**Bringing AI/ML to where it’s needed**\n", + "\n", + "To realize the promise of AI/ML in manufacturing, machine learning models need\n", + "to be brought as close to the decision as possible, often at the edge in facilities\n", + "and locations with limited or intermittent connectivity to the internet or cloud.\n", + "This requires deployment flexibility to on-premises or edge devices, with an\n", + "experience comparable to that in the cloud.\n", + "\n", + "**Inability to innovate at scale**\n", + "\n", + "CDOs want to be able to quickly and efficiently reproduce successes at global\n", + "scale. Technical and business users want to simply and quickly know what data\n", + "sets are available to solve the business issue at hand. Analysts want flexibility to\n", + "use the tools they are most familiar with in order to stay responsive to business\n", + "needs. 
Fragmented approaches to architecture and tooling make scaling\n", + "business impact very difficult, which results in talent churn, slower development\n", + "and duplicative efforts — all leading to higher costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Databricks Lakehouse for Manufacturing\n", + "\n", + "**Deliver personalized outcomes and frictionless experiences**\n", + "\n", + "**Millions of assets streaming IoT data**\n", + "\n", + "**5%–10% reduction in unplanned downtime and cost**\n", + "\n", + "**Accurate prices across 1,000s of locations and millions of dealers**\n", + "\n", + "**200%+ increase in offer conversion rates**\n", + "\n", + "With Databricks Lakehouse for Manufacturing, manufacturers can gain a\n", + "single view of their customers that combines data from each stage of the\n", + "customer journey. With a 360-degree view in place, manufacturers can drive\n", + "more differentiated sales strategies and precise service outcomes in the\n", + "field, delivering higher revenue growth, profitability and CSAT scores.\n", + "\n", + "With the Databricks Lakehouse, you can analyze product telemetry data,\n", + "customer insights and service networks to deliver highest uptime, quality of\n", + "service and economic value through the product lifecycle.\n", + "\n", + "**Optimize the supply chain, production processes and fulfillment logistics**\n", + "\n", + "**with real-time analytics and AI.**\n", + "\n", + "The Databricks Lakehouse for Manufacturing is the only enterprise data platform\n", + "that helps manufacturing organizations optimize their supply chains, boost\n", + "product innovation, increase operational efficiencies, predict fulfillment needs\n", + "and reduce overall costs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Gain real-time insight for agile manufacturing and logistics**\n", + "\n", + "**30%–50% improvement in forecast accuracy**\n", + "\n", + "**90% lower cost for new manufacturing line**\n", + "\n", + "**4%–8% reduction in logistics costs**\n", + "\n", + "**10% improvement in carbon footprint**\n", + "\n", + "The Databricks Lakehouse lets you build a resilient and predictive supply\n", + "chain by eliminating the trade-off between accuracy or depth of analysis\n", + "and time. With scalable, fine-grained forecasts to predict or sense demand,\n", + "or perform supply chain planning and optimization, Databricks improves\n", + "accuracy of decisions, leading to higher revenue growth and lower costs.\n", + "\n", + "The lakehouse provides an “always on” architecture that makes IT-OT\n", + "convergence a reality, by continuously putting all data to work regardless of the\n", + "frequency at which it arrives (periodic, event-driven or real-time streaming)\n", + "and creates valuable data products that can empower decision makers. This\n", + "creates real-time insight into performance with data from connected factory\n", + "equipment, order flows and production processes to drive the most effective\n", + "resource scheduling.\n", + "\n", + "\n", + "**Empower the manufacturing workforce of the future**\n", + "\n", + "**25% improvement in data team productivity**\n", + "\n", + "**50x faster time to insight**\n", + "\n", + "**50% reduction in workplace injuries**\n", + "\n", + "With Databricks, manufacturers can increase the impact and decrease the\n", + "time-to-value of their data assets, ultimately making data and AI central to every\n", + "part of their operation. 
And by empowering data teams across engineering,\n", + "analytics and AI to work together, Databricks frees up employees to self-serve\n", + "and focus on realizing maximum business value — improving product quality,\n", + "reducing downtime and exceeding customer expectations.\n", + "\n", + "**Execute product innovation at the speed of data**\n", + "\n", + "**90% decrease in time to market of new innovations**\n", + "\n", + "**20x faster data processing of vehicle and road data**\n", + "\n", + "It is critical that manufacturers are offering the most desirable value\n", + "propositions so end consumers don’t look elsewhere. By tapping into product\n", + "performance and attribute data along with market trends and operations\n", + "information, manufacturers can make strategic decisions.\n", + "\n", + "With Databricks, manufacturers can decrease time to market with new products\n", + "to increase sales by analyzing customer behavior and insights (structured,\n", + "unstructured and semi-structured), product telemetry (streaming, RFID, computer\n", + "vision) and digital twins, and leveraging that data to drive product decisions.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Building Innovative Solutions on the Lakehouse\n", + "\n", + "\n", + "The flexibility of the Databricks Lakehouse Platform means that you can start\n", + "with the use case that will have the most impact on your business. Through\n", + "our experience working with some of the largest and most cutting-edge\n", + "manufacturers in the world, we’ve developed Solution Accelerators based\n", + "on the most common needs of manufacturers to help you get started. These\n", + "purpose-built guides — fully functional notebooks and best practices — speed\n", + "up results across your most common and high-impact use cases. Go from idea\n", + "to proof of concept (PoC) in as little as two weeks. Check out the full list of\n", + "Solution Accelerators [here](https://www.databricks.com/solutions/accelerators) .\n", + "\n", + "**S O L U T I O N**\n", + "**Part-Level Demand**\n", + "**Forecasting**\n", + "\n", + "\n", + "Demand forecasting is a critical business process for manufacturing and\n", + "supply chains. McKinsey estimates that over the next 10 years, supply\n", + "chain disruptions can cost close to half (~45%) of a year’s worth of profits\n", + "for companies. Having accurate and up-to-date forecasts is vital to plan\n", + "the scaling of manufacturing operations, ensure sufficient inventory and\n", + "guarantee customer fulfillment.\n", + "\n", + "In recent years, manufacturers have been investing heavily in quantitativebased forecasting that is driven by historical data and powered using either\n", + "statistical or machine learning techniques. 
Benefits include:\n", + "\n", + "**•** Better sales planning and revenue forecasting\n", + "\n", + "**•** Optimized safety stock to maximize turn-rates and\n", + "service-delivery performance\n", + "\n", + "**•** Improved production planning by tracing back\n", + "production outputs to raw material levels\n", + "\n", + "**A disruption lasting just 30 days or less could**\n", + "\n", + "**equal losses of** **3%-5% of EBITDA.**\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks Lakehouse can enable large-scale forecasting solutions to help\n", + "manufacturers navigate the most common data challenges when trying to\n", + "forecast demand.\n", + "\n", + "**C O M M O N U S E C A S E S :**\n", + "\n", + "Scalable, accurate forecasts across large numbers of store-item\n", + "combinations experiencing intermittent demand\n", + "\n", + "Automated model selection to ensure the best model is selected\n", + "for each store-item combination\n", + "\n", + "Metrics to identify the optimal frequency with which to generate\n", + "new predictions\n", + "\n", + "Manage material shortages and predict overplanning\n", + "\n", + "**Try our** **[Parts-Level Solution Accelerator](https://www.databricks.com/solutions/accelerators/demand-forecasting)** **to facilitate**\n", + "\n", + "**fine-grained demand forecasts and planning.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**S O L U T I O N**\n", + "**Overall Equipment Effectiveness**\n", + "**& KPI Monitoring**\n", + "\n", + "\n", + "The need to monitor and measure manufacturing equipment performance is\n", + "critical for operational teams within manufacturing. Today, Overall Equipment\n", + "Effectiveness (OEE) is considered the standard for measuring manufacturing\n", + "equipment productivity. According to Engineering USA, an OEE value of 85% or\n", + "above is considered world-leading. However, many manufacturers typically achieve\n", + "a range of between 40% and 60%. Reasons for underachievement often include:\n", + "\n", + "**•** Delayed inputs due to manual processes that are prone to human error\n", + "\n", + "**•** Bottlenecks created by data silos, impeding the flow of fresh data to\n", + "stakeholders\n", + "\n", + "**•** A lack of collaboration capabilities, keeping stakeholders from working on the\n", + "same information at the same time\n", + "\n", + "**Poor OEE value** **can be a result of poor parts quality, slow**\n", + "**production performance and production availability issues.**\n", + "\n", + "Databricks Lakehouse can help manufacturers maneuver through the\n", + "challenges of ingesting and converging operational technology (OT) data with\n", + "traditional data from IT systems to build forecasting solutions.\n", + "\n", + "**C O M M O N U S E C A S E S**\n", + "\n", + "Incrementally ingest and process sensor data from IoT devices\n", + "in a variety of formats\n", + "\n", + "Compute and surface KPIs and metrics to drive valuable insights\n", + "\n", + "Optimize plant operations with data-driven decisions\n", + "\n", + "**Try our** **[Solution Accelerator for OEE and KPI Monitoring](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)** **for**\n", + "**performant and scalable end-to-end monitoring.**\n", + "\n", + "\n", + "-----\n", + "\n", + "Market dynamics and volatility are requiring manufacturers to bring products to\n", + "market more quickly, optimize production processes and build agile supply chains\n", + "at scale at a lower price. 
To do so, many manufacturers have turned to building\n", + "digital twins, which are virtual representations of objects, products, pieces of\n", + "equipment, people, processes or even complete manufacturing ecosystems.\n", + "\n", + "Digital twins provide insights — derived from sensors (often IoT or IIoT) that\n", + "are embedded in the original equipment — that have the potential to transform\n", + "the manufacturing industry by driving greater efficiency, reducing costs and\n", + "improving quality.\n", + "\n", + "\n", + "**S O L U T I O N**\n", + "**Digital Twins**\n", + "\n", + "\n", + "**Digital twin technologies can improve product**\n", + "\n", + "**quality by** **up to 25%.**\n", + "\n", + "Databricks Lakehouse can bring digital twins to life through fault-tolerant\n", + "processing of streaming workloads generated by IoT sensor data and complex\n", + "event processing (important for modeling physical processes).\n", + "\n", + "**C O M M O N U S E C A S E S**\n", + "\n", + "Process real-world data in real time\n", + "\n", + "Compute insights at scale and deliver to multiple downstream applications\n", + "\n", + "Optimize plant operations with data-driven decisions\n", + "\n", + "**Try our** **[Solution Accelerator for Digital Twins](https://www.databricks.com/solutions/accelerators/digital-twins)** **to accelerate**\n", + "**time to market of new innovations.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**S O L U T I O N**\n", + "**Computer Vision**\n", + "\n", + "The rise in computer vision has been fueled by the rapid developments in\n", + "neural network technologies, which use AI to better understand and interpret\n", + "images with near-perfect precision. In manufacturing, computer vision can\n", + "transform operations by, for example, identifying product defects to improve\n", + "quality control, detecting safety hazards on the production floor, and tracking\n", + "and managing inventory levels.\n", + "\n", + "**As per the American Society for Quality, cost of poor quality for**\n", + "\n", + "**companies can be as high as** **20% of revenue.**\n", + "\n", + "\n", + "Databricks Lakehouse can easily ingest complex, unstructured image and video\n", + "data at massive scale. Through the most popular computer vision libraries, data\n", + "teams can scale AI models that leverage computer vision to recognize patterns,\n", + "detect objects and make predictions with 99% accuracy.\n", + "\n", + "**C O M M O N U S E C A S E S**\n", + "\n", + "Quickly identify defects and ensure that products and processes meet\n", + "quality standards\n", + "\n", + "Automate positioning and guidance to ensure that parts and products are\n", + "properly aligned and assembled\n", + "\n", + "Predict maintenance issues to reduce downtime and maintenance costs,\n", + "improve parts reliability, and increase safety for workers\n", + "\n", + "**Try our** **[Solution Accelerator for Computer Vision](https://www.databricks.com/blog/2021/12/17/enabling-computer-vision-applications-with-the-data-lakehouse.html)** **to improve**\n", + "**efficiency, reduce costs and enhance overall safety.**\n", + "\n", + "\n", + "-----\n", + "\n", + "## An Ecosystem on the Lakehouse for Manufacturing\n", + "\n", + "We’ve partnered with leading consulting firms and\n", + "independent software vendors to deliver innovative,\n", + "manufacturing-specific solutions. Databricks\n", + "Brickbuilder Solutions help you cut costs and\n", + "increase value from your data. 
Backed by decades\n", + "of industry expertise — and built for the Databricks\n", + "Lakehouse Platform — Brickbuilder Solutions are\n", + "tailored to your exact needs.\n", + "\n", + "We also work with technology partners like Alteryx,\n", + "AtScale, Fivetran, Microsoft Power BI, Qlik, Sigma,\n", + "Simplement, Tableau and ThoughtSpot to accelerate\n", + "the availability and value of data. This allows\n", + "businesses to unify data from complex source\n", + "systems and operationalize it for analytics, AI and\n", + "ML on the Databricks Lakehouse Platform.\n", + "\n", + "\n", + "-----\n", + "\n", + "**S O L U T I O N**\n", + "**Avanade Intelligent Manufacturing**\n", + "\n", + "Every year, businesses lose millions of dollars due to equipment failure,\n", + "unscheduled downtime and lack of control in maintenance scheduling. Along\n", + "with lost dollars, businesses will experience lower employee morale when\n", + "stations are in and out of service. Avanade’s Intelligent Manufacturing solution\n", + "supports connected production facilities and assets, workers, products and\n", + "consumers to create value through enhanced insights and improved outcomes.\n", + "Manufacturers can harness data to drive interoperability and enhanced insights\n", + "at scale using analytics and AI. Outcomes include improvements across\n", + "production (e.g., uptime, quantity and yield), better experiences for workers,\n", + "and greater insight into what customers want.\n", + "\n", + "**Try our joint solution,** **[Intelligent Manufacturing](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/avanade-intelligent-manufacturing)** **, to drive value and**\n", + "**operationalize team coordination and productivity.**\n", + "\n", + "\n", + "**S O L U T I O N**\n", + "**DataSentics Quality Inspector**\n", + "\n", + "Quality control is a crucial aspect of any production process, but traditional\n", + "methods can be time-consuming and prone to human error. Quality\n", + "Inspector by DataSentics, an Atos company, offers a solution that is\n", + "both efficient and reliable. With out-of-the-box models for visual quality\n", + "inspection, which are tailored to meet specific business requirements,\n", + "organizations will experience stable, scalable quality control that’s easy to\n", + "improve over time. Quality Inspector is an end-to-end solution that can be\n", + "seamlessly integrated into an existing setup, delivering high performance\n", + "and reliability.\n", + "\n", + "**Try our joint solution,** **[Quality Inspector](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to automate production quality**\n", + "**control with an increase in accuracy and quicker time to value.**\n", + "\n", + "\n", + "-----\n", + "\n", + "Tredence PSRM solution diagram: Predict Supply Risk | Real-Time Shipment Visibility | Delay Alerts\n", + "\n", + "\n", + "**S O L U T I O N**\n", + "**Tredence Predictive Supply Risk Management**\n", + "\n", + "Customers today are faced with multiple supply risks including lack of\n", + "in-transit visibility, disruptions caused by weather, local events, among\n", + "others. 
Tredence’s Predictive Supply Risk Management solution, built on\n", + "the Databricks Lakehouse Platform, helps businesses meet supply risk\n", + "challenges by providing a scalable, cloud-based solution that can be\n", + "tailored to the specific needs of each organization. The platform’s flexibility\n", + "and scalability allow businesses to keep pace with changing regulations\n", + "and customer demands, while their comprehensive suite of tools helps\n", + "identify and mitigate risks across the enterprise.\n", + "\n", + "**Try our joint solution,** **[Predictive Supply Risk Management](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to**\n", + "**predict order delays, identify root causes and quantify supply**\n", + "**chain impact.**\n", + "\n", + "Visit our [site](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview) to learn more about our Databricks Partner Solutions.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Leading Manufacturing Companies That Choose Us\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks is the lakehouse company. More than 9,000 organizations worldwide\n", + "\n", + "— including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the\n", + "\n", + "Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe. Founded by the\n", + "\n", + "original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission\n", + "\n", + "to help data teams solve the world’s toughest problems. To learn more, follow\n", + "\n", + "Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "###### Get started with a free trial of Databricks and start building data applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=634147899783&utm_term=try%20databricks&gclid=CjwKCAiAr4GgBhBFEiwAgwORrTnkJaDf9SpIDy2RxOV28a2G2HtUDvJnLXiVWBsqcAWa_XmSvabkVRoCiwgQAvD_BwE#account)**\n", + "\n", + "To learn more, visit us at:\n", + "**[Manufacturing Industry Solutions](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Lakehouse-for-Manufacturing.pdf2024-09-19T16:57:19Z
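The Part-Level Demand Forecasting section above describes generating scalable, fine-grained forecasts across large numbers of store-item combinations. One common way to express that pattern on Databricks is to fan out one lightweight model per series with Spark's `applyInPandas`; the sketch below is a hedged illustration of that idea under stated assumptions — the `sales` table name, the `store`/`item`/`ds`/`y` columns, the 28-day horizon and the simple Holt-Winters model are all assumptions for this example, not the Solution Accelerator's actual implementation.

```python
# Hypothetical sketch: train one small forecasting model per (store, item) series in parallel.
# Assumes an ambient Databricks `spark` session and a `sales` table with columns store, item, ds, y.
import pandas as pd
from statsmodels.tsa.holtwinters import ExponentialSmoothing

HORIZON = 28  # days to forecast per series (illustrative choice)


def forecast_series(pdf: pd.DataFrame) -> pd.DataFrame:
    """Fit a simple additive-trend model on one series and return its forecast rows."""
    pdf = pdf.sort_values("ds")
    fit = ExponentialSmoothing(pdf["y"].astype(float), trend="add").fit()
    future = pd.date_range(pdf["ds"].max(), periods=HORIZON + 1, freq="D")[1:]
    return pd.DataFrame({
        "store": pdf["store"].iloc[0],
        "item": pdf["item"].iloc[0],
        "ds": future,
        "yhat": fit.forecast(HORIZON).values,
    })


forecasts = (
    spark.table("sales")
    .groupBy("store", "item")
    .applyInPandas(
        forecast_series,
        schema="store string, item string, ds timestamp, yhat double",
    )
)
```

Because each group is handled independently, the same pattern scales from a handful of series to the intermittent-demand, many-series setting the accelerator targets; swapping in a different per-series model (or automated model selection) only changes the body of `forecast_series`.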
**2 0 2 0 E D I T I O N** | U P D A T E D\n", + "\n", + "# Standardizing the Machine Learning Lifecycle\n", + "\n", + "### From experimentation to production with MLflow\n", + "\n", + "[MLflow](https://mlflow.org)\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "#### Contents\n", + "\n", + "Chapter 1: Machine Learning\n", + "Lifecycle Challenges 3\n", + "\n", + "Chapter 2: Applying Good Engineering\n", + "Principles to Machine Learning 7\n", + "\n", + "Chapter 3: Introducing MLflow 9\n", + "\n", + "Chapter 4: A Closer Look at MLflow\n", + "Model Registry 16\n", + "\n", + "Chapter 5: Making Organizations\n", + "Successful with ML 19\n", + "\n", + "Chapter 6: Introducing the Unified\n", + "Data Analytics Platform 20\n", + "\n", + "Chapter 7: Standardizing the Machine\n", + "Learning Lifecycle on Databricks 25\n", + "\n", + "Chapter 8: Getting Started 26\n", + "\n", + "Chapter 9: Comparison Matrix 27\n", + "\n", + "\n", + "#### Preface\n", + "\n", + "##### Technology changes quickly. Data science and machine learning (ML) are moving\n", + " even faster. In the short time since we first published this eBook, businesses across industries have rapidly matured their machine learning operations (MLOps) — implementing ML applications and moving their first models into production. This has turned ML models into corporate assets that need to be managed across the lifecycle.\n", + "\n", + " That’s why MLflow, an open-source platform developed by Databricks, has emerged\n", + " as a leader in automating the end-to-end ML lifecycle. With 1.8 million downloads a month — and growing support in the developer community — this open-source platform is simplifying the complex process of standardizing and productionizing MLOps. This updated eBook explores the advantages of MLflow and introduces you to the newest component: MLflow Model Registry. You’ll also discover how MLflow fits into the Databricks Unified Data Analytics Platform for data engineering, science and analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 1: **Machine Learning**\n", + "\n", + "#### Lifecycle Challenges\n", + "\n", + "\n", + "Building machine learning models is hard. Putting them into production is harder. Enabling others — data\n", + "\n", + "scientists, engineers or even yourself — to reproduce your pipeline and results is equally challenging. How\n", + "\n", + "many times have you or your peers had to discard previous work because it was either not documented\n", + "\n", + "properly or too difficult to replicate?\n", + "\n", + "Getting models up to speed in the first place is significant enough that it can be easy to overlook long-term\n", + "\n", + "management. What does this involve in practice? In essence, we have to compare the results of\n", + "\n", + "different versions of ML models along with corresponding artifacts — code, dependencies, visualizations,\n", + "\n", + "intermediate data and more — to track what’s running where, and to redeploy and roll back updated models\n", + "\n", + "as needed. 
Each of these requires its own specific tools, and it’s these changes that make the ML lifecycle\n", + "\n", + "so challenging compared with traditional software development lifecycle (SDLC) management.\n", + "\n", + "This represents a serious shift and creates challenges compared with a more traditional software\n", + "\n", + "development lifecycle for the following reasons:\n", + "\n", + "\n", + "The diversity and number of ML\n", + "\n", + "tools involved, coupled with a\n", + "\n", + "lack of standardization across\n", + "\n", + "ML libraries and frameworks\n", + "\n", + "\n", + "The continuous nature of ML\n", + "\n", + "development, accompanied by a\n", + "\n", + "lack of tracking and management\n", + "\n", + "tools for machine learning models\n", + "\n", + "and experiments\n", + "\n", + "\n", + "The complexity of productionizing\n", + "\n", + "ML models due to the lack of\n", + "\n", + "integration among data pipelines,\n", + "\n", + "ML environments and production\n", + "\n", + "services\n", + "\n", + "\n", + "Let’s look at each of these areas in turn.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The diversity and number of ML tools involved\n", + "\n", + "\n", + "While the traditional software development process leads to the\n", + "\n", + "rationalization and governance of tools and platforms used for developing and\n", + "\n", + "managing applications, the ML lifecycle relies on data scientists’ ability to use\n", + "\n", + "multiple tools, whether for preparing data and training models, or deploying\n", + "\n", + "them for production use. Data scientists will seek the latest algorithms from\n", + "\n", + "\n", + "However, due to the variety of available tools and the lack of detailed tracking,\n", + "\n", + "teams often have trouble getting the same code to work again in the same way.\n", + "\n", + "Reproducing the ML workflow is a critical challenge, whether a data scientist\n", + "\n", + "needs to pass training code to an engineer for use in production or go back to\n", + "\n", + "past work to debug a problem.\n", + "\n", + "\n", + "the most up-to-date ML libraries and frameworks available to compare results\n", + "\n", + "and improve performance.\n", + "\n", + "**PREP DATA** **BUILD MODEL** **DEPLOY MODEL**\n", + "\n", + "Azure ML\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The continuous nature of ML development\n", + "\n", + "Technology never stands still. New data, algorithms,\n", + "\n", + "libraries and frameworks impact model performance\n", + "\n", + "continuously and, thus, need to be tested. Therefore,\n", + "\n", + "machine learning development requires a continuous\n", + "\n", + "\n", + "approach, along with tracking capabilities to\n", + "\n", + "compare and reproduce results. 
The performance\n", + "\n", + "of ML models depends not only on the algorithms\n", + "\n", + "used, but also on the quality of the data sets and the\n", + "\n", + "parameter values for the models.\n", + "\n", + "\n", + "**P R E P**\n", + "**D ATA**\n", + "\n", + "**B U I L D**\n", + "**M O D E L**\n", + "\n", + "\n", + "Whether practitioners work alone or on teams, it’s\n", + "\n", + "still very difficult to track which parameters, code\n", + "\n", + "and data went into each experiment to produce a\n", + "\n", + "model, due to the intricate nature of the ML\n", + "\n", + "lifecycle itself.\n", + "\n", + "**D E P L O Y**\n", + "**M O D E L**\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The complexity of productionizing ML models\n", + "\n", + "\n", + "In software development, the architecture is set early on, based on the target\n", + "\n", + "application. Once the infrastructure and architecture have been chosen, they\n", + "\n", + "won’t be updated or changed due to the sheer amount of work involved in\n", + "\n", + "rebuilding applications from scratch. Modern developments, such as the move\n", + "\n", + "to microservices, are making this easier, but for the most part, SDLC focuses on\n", + "\n", + "maintaining and improving what already exists.\n", + "\n", + "\n", + "One of today’s key challenges is to effectively transition models from\n", + "\n", + "experimentation to staging and production — without needing to rewrite the code\n", + "\n", + "for production use. This is time-consuming and risky as it can introduce new\n", + "\n", + "bugs. There are many solutions available to productionize a model quickly, but\n", + "\n", + "practitioners need the ability to choose and deploy models across any platform,\n", + "\n", + "and scale resources as needed to manage model inference effectively on big data,\n", + "\n", + "in batch or real time.\n", + "\n", + "\n", + "With machine learning the first goal is to build a model. And keep in mind: a\n", + "\n", + "model’s performance in terms of accuracy and sensitivity is agnostic from the\n", + "\n", + "deployment mode. However, models can be heavily dependent on latency, and\n", + "\n", + "the chosen architecture requires significant scalability based on the business\n", + "\n", + "application. End-to-end ML pipeline designs can be great for batch analytics and\n", + "\n", + "looking at streaming data, but they can involve different approaches for real-time\n", + "\n", + "scoring when an application is based on a microservice architecture working via\n", + "\n", + "REST APIs, etc.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 2: **\u0007** **Applying Good Engineering**\n", + "\n", + "#### Principles to Machine Learning\n", + "\n", + "\n", + "Many data science and machine learning projects fail due to preventable issues that have been resolved\n", + "\n", + "in software engineering for more than a decade. However, those solutions need to be adapted due to key\n", + "\n", + "differences between developing code and training ML models.\n", + "\n", + "- \u0007 **Expertise, code and data** — With the addition of data, data science and ML, code not only needs to deal\n", + "\n", + "with data dependencies but also handle the inherent nondeterministic characteristics of statistical\n", + "\n", + "modeling. 
ML models are not guaranteed to behave the same way when trained twice, unlike traditional\n", + "\n", + "code, which can be easily unit tested.\n", + "\n", + "- \u0007 **Model artifacts** — In addition to application code, ML products and features also depend on models\n", + "\n", + "that are the result of a training process. Those model artifacts can often be large — on the order of\n", + "\n", + "gigabytes — and often need to be served differently from code itself.\n", + "\n", + "- \u0007 **Collaboration** — In large organizations, models that are deployed in an application are usually not trained\n", + "\n", + "by the same people responsible for the deployment. Handoffs between experimentation, testing and\n", + "\n", + "production deployments are similar but not identical to approval processes in software engineering.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### The need for standardization\n", + "\n", + "Some of the world’s largest tech companies have already begun solving these problems internally with\n", + "\n", + "their own machine learning platforms and lifecycle management tools. 2 These internal platforms have\n", + "\n", + "been extremely successful and are designed to accelerate the ML lifecycle by standardizing the process of\n", + "\n", + "data preparation, model training, and deployment via APIs built for data scientists. The platforms not only\n", + "\n", + "help standardize the ML lifecycle but also play a major role in retaining knowledge and best practices, and\n", + "\n", + "maximizing data science team productivity and collaboration, thereby leading to greater ROI.\n", + "\n", + "Internally driven strategies still have limitations. First, they are limited to a few algorithms or frameworks.\n", + "\n", + "Adoption of new tools or libraries can lead to significant bottlenecks. Of course, data scientists always\n", + "\n", + "want to try the latest and the best algorithms, libraries and frameworks — the most recent versions of\n", + "\n", + "PyTorch, TensorFlow and so on. Unfortunately, production teams cannot easily incorporate these into\n", + "\n", + "the custom ML platform without significant rework. The second limitation is that each platform is tied\n", + "\n", + "to a specific company’s infrastructure. This can limit sharing of efforts among data scientists. As each\n", + "\n", + "framework is so specific, options for deployment can be limited.\n", + "\n", + "The question then is: Can similar benefits to these systems be provided in an open manner? This evaluation\n", + "\n", + "must be based on the widest possible mix of tools, languages, libraries and infrastructures. Without this\n", + "\n", + "approach, it will be very difficult for data scientists to evolve their ML models and keep pace with industry\n", + "\n", + "developments. Moreover, by making it available as open source, the wider industry will be able to join in and\n", + "\n", + "contribute to ML’s wider adoption. This also makes it easier to move between various tools and libraries\n", + "\n", + "over time.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 3: **\u0007** **Introducing MLflow**\n", + "\n", + "**M AT E I Z A H A R I A**\n", + "\n", + "Co-founder and Chief Technologist at Databricks\n", + "\n", + "\n", + "At Databricks, we believe that there should be a better way to manage the ML lifecycle. 
So in June 2018,\n", + "\n", + "we unveiled [MLflow](https://mlflow.org/) , an open-source machine learning platform for managing the complete ML lifecycle.\n", + "\n", + "###### “MLflow is designed to be a cross-cloud, modular, API-first framework, to work well with\n", + " all popular ML frameworks and libraries. It is open and extensible by design, and platform\n", + " agnostic for maximum flexibility.”\n", + "\n", + "With MLflow, data scientists can now package code as reproducible runs, execute and\n", + "\n", + "compare hundreds of parallel experiments, and leverage any hardware or software platform\n", + "\n", + "for training, hyperparameter tuning and more. Also, organizations can deploy and manage\n", + "\n", + "models in production on a variety of clouds and serving platforms.\n", + "\n", + "###### “ With MLflow, data science teams can systematically package and reuse models\n", + " across frameworks, track and share experiments locally or in the cloud, and deploy\n", + " models virtually anywhere,” says Zaharia. “The flurry of interest and contributions we’ve\n", + " seen from the data science community validates the need for an open-source framework to\n", + " streamline the machine learning lifecycle.”\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Key benefits\n", + "\n", + "**EXPERIMENT TRACKING** As mentioned previously, getting ML models to perform takes significant trial and error, and continuous configuration, building, tuning, testing,\n", + "\n", + "etc. Therefore, it is imperative to allow data science teams to track all that goes into a specific run, along with the results. With MLflow, data scientists can quickly record\n", + "\n", + "runs and keep track of model parameters, results, code and data from each experiment, all in one place.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Key benefits\n", + "\n", + "\n", + "**FLEXIBLE DEPLOYMENT** There is virtually no limit to what machine learning can\n", + "\n", + "do for your business. However, there are different ways to architect ML applications\n", + "\n", + "for production, and various tools can be used for deploying models, which often\n", + "\n", + "lead to code rewrites prior to deploying ML models into production. With MLflow,\n", + "\n", + "your data scientists can quickly download or deploy any saved models to various\n", + "\n", + "platforms — locally or in the cloud — from experimentation to production.\n", + "\n", + "\n", + "**REPRODUCIBLE PROJECTS** The ability to reproduce a project — entirely or just\n", + "\n", + "parts of it — is key to data science productivity, knowledge sharing and, hence,\n", + "\n", + "accelerating innovation. With MLflow, data scientists can build and package\n", + "\n", + "composable projects, capture dependencies and code history for reproducible\n", + "\n", + "results, and quickly share projects with their peers.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Key benefits\n", + "\n", + "**MODEL MANAGEMENT** Use one central place to share ML models, collaborate on moving them from experimentation to online testing and production, integrate with\n", + "\n", + "approval and governance workflows, and monitor ML deployments and their performance. 
This is powered by the latest MLflow component, MLflow Model Registry.\n", + "\n", + "**M O D E L D E P L O Y M E N T A N D M O N I T O R I N G**\n", + "\n", + "**I N - L I N E C O D E**\n", + "\n", + "��\n", + "\n", + "**M L L I B R A R I E S**\n", + "\n", + "###### Model Format\n", + "\n", + "**C O N TA I N E R S**\n", + "\n", + "\n", + "**F L AV O R 1**\n", + "\n", + "\n", + "**F L AV O R 2**\n", + "\n", + "**B AT C H A N D S T R E A M S C O R I N G**\n", + "\n", + "\n", + "Simple model flavors\n", + "usable by many tools\n", + "\n", + "\n", + "**C L O U D I N F E R E N C E S E R V I C E S**\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Use case examples\n", + "\n", + "Let‘s examine three use cases to explore how users can leverage some of the MLflow components.\n", + "\n", + "\n", + "**EXPERIMENT TRACKING** A European energy\n", + "\n", + "company is using MLflow to track and update\n", + "\n", + "hundreds of energy-grid models. This company’s\n", + "\n", + "goal is to build a time-series model for every major\n", + "\n", + "energy producer (e.g., power plant) and consumer\n", + "\n", + "(e.g., factory), monitor these models using standard\n", + "\n", + "metrics, and combine the predictions to drive\n", + "\n", + "business processes, such as pricing. Because a\n", + "\n", + "single team is responsible for hundreds of models,\n", + "\n", + "possibly using different ML libraries, it’s important to\n", + "\n", + "have a standard development and tracking process.\n", + "\n", + "The team has standardized on Jupyter notebooks\n", + "\n", + "for development, MLflow Tracking for metrics, and\n", + "\n", + "Databricks Jobs for inference.\n", + "\n", + "\n", + "**REPRODUCIBLE PROJECTS** An online marketplace\n", + "\n", + "is using MLflow to package deep learning jobs using\n", + "\n", + "Keras and run them in the cloud. Each data scientist\n", + "\n", + "develops models locally on a laptop using a small\n", + "\n", + "data set, checks them into a Git repository with\n", + "\n", + "an MLproject file, and submits remote runs of the\n", + "\n", + "project to GPU instances in the cloud for large-scale\n", + "\n", + "training or hyperparameter search. Using MLflow\n", + "\n", + "Projects makes it easy to create the same software\n", + "\n", + "environment in the cloud and share project code\n", + "\n", + "among data scientists.\n", + "\n", + "\n", + "**MODEL PACKAGING** An e-commerce site’s data\n", + "\n", + "science team is using MLflow Model Registry to\n", + "\n", + "package recommendation models for use by\n", + "\n", + "application engineers. This presents a technical\n", + "\n", + "challenge because the recommendation\n", + "\n", + "application includes both a standard, off-the-shelf\n", + "\n", + "recommendation model and custom business logic\n", + "\n", + "for pre- and post-processing. For example, the\n", + "\n", + "application might include custom code to ensure the\n", + "\n", + "recommended items are diverse. This business logic\n", + "\n", + "needs to change in sync with the model, and the data\n", + "\n", + "science team wants to control both the business logic\n", + "\n", + "and the model, without having to submit a patch to\n", + "\n", + "the web application each time the logic has to change.\n", + "\n", + "Moreover, the team wants to A/B test distinct models\n", + "\n", + "with distinct versions of the processing logic. 
The\n", + "\n", + "solution was to package both the recommendation\n", + "\n", + "model and the custom logic using the python_\n", + "\n", + "function flavor in an MLflow Model, which can then\n", + "\n", + "be deployed and tested as a single unit.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Open and extensible by design\n", + "\n", + "Since we [unveiled](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) and open sourced MLflow in June 2018 at the Spark + AI Summit in San Francisco, community engagement and contributions have led to an impressive\n", + "\n", + "array of new features and integrations:\n", + "\n", + "\n", + "**SUPPORT FOR MULTIPLE**\n", + "\n", + "**PROGRAMMING LANGUAGES**\n", + "\n", + "To give developers a choice, MLflow supports R,\n", + "\n", + "Python, Java and Scala, along with a REST server\n", + "\n", + "interface that can be used from any language.\n", + "\n", + "\n", + "**INTEGRATION WITH POPULAR ML**\n", + "\n", + "**LIBRARIES AND FRAMEWORKS**\n", + "\n", + "MLflow has built-in integrations with the most popular\n", + "\n", + "machine learning libraries — such as scikit-learn,\n", + "\n", + "TensorFlow, Keras, PyTorch, H2O, and Apache Spark™\n", + "\n", + "MLlib — to help teams build, test and deploy machine\n", + "\n", + "learning applications.\n", + "\n", + "\n", + "**CROSS-CLOUD SUPPORT**\n", + "\n", + "Organizations can use MLflow to quickly deploy\n", + "\n", + "machine learning models to multiple cloud services,\n", + "\n", + "including Databricks, Azure Machine Learning and\n", + "\n", + "Amazon SageMaker, depending on their needs.\n", + "\n", + "MLflow leverages AWS S3, Google Cloud Storage and\n", + "\n", + "Azure Data Lake Storage, allowing teams to easily\n", + "\n", + "track and share artifacts from their code.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Rapid community adoption\n", + "\n", + "## 2.5M\n", + "#### monthly downloads\n", + "\n", + "## 200+\n", + "#### code contributors\n", + "\n", + "\n", + "## 100+\n", + "#### contributing organizations\n", + "\n", + "\n", + "Organizations using and contributing to MLflow\n", + "\n", + "Source: [mlflow.org](https://mlflow.org)\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 4: **\u0007** **A Closer Look at**\n", + "\n", + "#### MLflow Model Registry\n", + "\n", + "\n", + "MLflow originally introduced the ability to [track metrics, parameters and artifacts](https://www.mlflow.org/docs/latest/tracking.html#) as part of experiments,\n", + "\n", + "[package models and reproducible ML projects](https://www.mlflow.org/docs/latest/projects.html) , and [deploy models to batch or to real-time serving platforms](https://www.mlflow.org/docs/latest/models.html) .\n", + "\n", + "The latest MLflow component — MLflow Model Registry — builds on MLflow’s original capabilities to\n", + "\n", + "provide organizations with one central place to share ML models, collaborate on moving them from\n", + "\n", + "experimentation to testing and production, and implement approval and governance workflows.\n", + "\n", + "��\n", + "\n", + "\n", + "**Model Registry**\n", + "\n", + "\n", + "**D O W N S T R E A M**\n", + "\n", + "\n", + "��\n", + "\n", + "**Tracking Server**\n", + "\n", + "\n", + "Data Scientists\n", + "\n", + "**Staging**\n", + "\n", + "\n", + 
"Data Engineers\n", + "\n", + "**Production** **Archived**\n", + "\n", + "**A U T O M AT E D J O B S**\n", + "\n", + "\n", + "**Parameters**\n", + "\n", + "\n", + "**Metrics** **Artifacts**\n", + "\n", + "\n", + "The Model Registry gives MLflow users new\n", + "\n", + "\n", + "tools for sharing, reviewing and managing\n", + "\n", + "ML models throughout their lifecycle\n", + "\n", + "\n", + "**Metadata** **Models**\n", + "\n", + "**R E S T S E R V I N G**\n", + "\n", + "**R E V I E W E R S + C I / C D T O O L S**\n", + "\n", + "The MLflow Model Registry complements the MLflow offering and is designed to help organizations\n", + "\n", + "implement good engineering principles with machine learning initiatives, such as collaboration,\n", + "\n", + "governance, reproducibility and knowledge management. The next few pages highlight some of the key\n", + "\n", + "features of this new component.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "###### One hub for managing ML models collaboratively\n", + "\n", + "Building and deploying ML models is a team sport. Not only are the responsibilities\n", + "\n", + "along the machine learning model lifecycle often split across multiple people\n", + "\n", + "(e.g., data scientists train models whereas production engineers deploy them),\n", + "\n", + "but also at each lifecycle stage, teams can benefit from collaboration and sharing\n", + "\n", + "\n", + "###### Flexible CI/CD pipelines to manage stage transitions\n", + "\n", + "MLflow lets you manage your models’ lifecycles either manually or through\n", + "\n", + "automated tools. Analogous to the approval process in software engineering,\n", + "\n", + "users can manually request to move a model to a new lifecycle stage (e.g., from\n", + "\n", + "staging to production), and review or comment on other users’ transition requests.\n", + "\n", + "\n", + "(e.g., a fraud model built in one part of the organization could be reused in others).\n", + "\n", + "Alternatively, you can use the Model Registry’s API to plug in continuous integration\n", + "\n", + "\n", + "MLflow facilitates sharing of expertise and knowledge across teams by making ML\n", + "\n", + "models more discoverable and providing collaborative features to jointly improve\n", + "\n", + "on common ML tasks. Simply register an MLflow model from your experiments to\n", + "\n", + "\n", + "and deployment (CI/CD) tools, such as Jenkins, to automatically test and transition\n", + "\n", + "your models. Each model also links to the experiment run that built it — in MLflow\n", + "\n", + "Tracking — to let you easily review models.\n", + "\n", + "\n", + "get started. 
The MLflow Model Registry will then let you track multiple versions\n", + "\n", + "of the model and mark each one with a lifecycle stage: development, staging,\n", + "\n", + "production or archived.\n", + "\n", + "\n", + "Sample machine learning\n", + "models displayed via the\n", + "MLflow Model Registry\n", + "dashboard\n", + "\n", + "\n", + "The machine learning model\n", + "page view in MLflow, showing\n", + "how users can request and\n", + "review changes to a model’s\n", + "stage\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Visibility and governance for the full ML lifecycle\n", + "\n", + "In large enterprises, the number of ML models that are in development, staging\n", + "\n", + "and production at any given point in time may be in the hundreds or thousands.\n", + "\n", + "Having full visibility into which models exist, what stages they are in and who\n", + "\n", + "has collaborated on and changed the deployment stages of a model allows\n", + "\n", + "organizations to better manage their ML efforts.\n", + "\n", + "MLflow provides full visibility and enables governance by keeping track of each\n", + "\n", + "model’s history and managing who can approve changes to the model’s stages.\n", + "\n", + "Identify versions, stages and\n", + "authors of each model\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 5: **\u0007** **Making Organizations**\n", + "\n", + "#### Successful with ML\n", + "\n", + "\n", + "Standardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\n", + "\n", + "track experiments, compare results, reproduce runs and productionize faster.\n", + "\n", + "In addition to increasing data science team productivity and collaboration and applying good engineering\n", + "\n", + "practices to machine learning, organizations also need to do the following:\n", + "\n", + "\n", + "**Reliably ingest, ETL and**\n", + "\n", + "**catalog big data**\n", + "\n", + "\n", + "**Work with state-of-the-art**\n", + "\n", + "**ML frameworks and tools**\n", + "\n", + "\n", + "**Easily scale compute from**\n", + "\n", + "**single to multi-node**\n", + "\n", + "\n", + "Databricks excels at all the above. Learn more at [databricks.com](https://databricks.com)\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "CHAPTER 6: **\u0007** **Introducing the Unified**\n", + "\n", + "#### Data Analytics Platform\n", + "\n", + "\n", + "Databricks accelerates innovation by unifying data science, engineering and business. 
Through a fully\n", + "\n", + "managed, cloud-based service built by the original creators of Apache Spark, Delta Lake and MLflow, the\n", + "\n", + "Databricks Unified Data Analytics Platform lowers the barrier for enterprises to innovate with AI and\n", + "\n", + "accelerates their innovation.\n", + "\n", + "**DATA ENGINEERS** **DATA SCIENTISTS** **ML ENGINEERS** **DATA ANALYSTS**\n", + "\n", + "\n", + "**BI INTEGRATIONS**\n", + "\n", + "**Access all your data**\n", + "\n", + "\n", + "**DATA SCIENCE WORKSPACE**\n", + "\n", + "**Collaboration across the lifecycle**\n", + "\n", + "**UNIFIED DATA SERVICE**\n", + "\n", + "**High-quality data with great performance**\n", + "\n", + "\n", + "\n", + "**ENTERPRISE CLOUD SERVICE**\n", + "\n", + "**A simple, scalable and secure managed service**\n", + "\n", + "##### RAW DATA LAKE\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "\n", + "###### Data engineering\n", + "\n", + "Speed up the preparation of high-quality\n", + "\n", + "data, essential for best-in-class ML\n", + "\n", + "applications, at scale\n", + "\n", + "\n", + "###### Data science\n", + "\n", + "Collaboratively explore large data sets,\n", + "\n", + "build models iteratively and deploy across\n", + "\n", + "multiple platforms\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Providing managed MLflow on Databricks\n", + "\n", + "MLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\n", + "\n", + "packaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\n", + "\n", + "By using MLflow as part of Databricks, data scientists can:\n", + "\n", + "\n", + "**WORKSPACES**\n", + "\n", + "Benefit from a streamlined\n", + "\n", + "experiment tracking experience\n", + "\n", + "with Databricks Workspace and\n", + "\n", + "collaborative Notebooks\n", + "\n", + "\n", + "**BIG DATA SNAPSHOTS**\n", + "\n", + "Track large-scale data that fed\n", + "\n", + "the models, along with all the\n", + "\n", + "other model parameters, then\n", + "\n", + "\n", + "**JOBS**\n", + "\n", + "Easily initiate jobs remotely, from\n", + "\n", + "an on-premises environment or\n", + "\n", + "from Databricks notebooks\n", + "\n", + "\n", + "**SECURITY**\n", + "\n", + "Take advantage of one common\n", + "\n", + "security model for the entire\n", + "\n", + "machine learning lifecycle\n", + "\n", + "\n", + "reproduce training runs reliably\n", + "\n", + "\n", + "Read our [blog](https://databricks.com/blog/2019/03/06/managed-mlflow-on-databricks-now-in-public-preview.html) to learn more about these integrations.\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Getting data ready for ML with Delta Lake\n", + "\n", + "Delta Lake is a storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions and scalable metadata handling, and it unifies streaming and batch\n", + "\n", + "data processing. 
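As a rough sketch of that unification, the same Delta table can be populated by a one-off batch load and kept fresh by a streaming append, with ACID guarantees on both paths; the paths, schema and table name below are hypothetical placeholders.\n", + "\n", + "```python\n", + "# Minimal sketch (PySpark): one Delta table fed by both batch and streaming writers.\n", + "# Paths, schema and table name are hypothetical placeholders.\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "# Batch: backfill historical events into a Delta table\n", + "(\n", + "    spark.read.format('parquet').load('/mnt/history/events/')\n", + "    .write.format('delta').mode('overwrite').saveAsTable('ml.events')\n", + ")\n", + "\n", + "# Streaming: continuously append new events to the same table\n", + "(\n", + "    spark.readStream.format('json')\n", + "    .schema('event_id STRING, event_ts TIMESTAMP, payload STRING')\n", + "    .load('/mnt/incoming/events/')\n", + "    .writeStream.format('delta')\n", + "    .option('checkpointLocation', '/mnt/checkpoints/events/')\n", + "    .toTable('ml.events')\n", + ")\n", + "```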
Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs.\n", + "\n", + "By using Delta Lake, data engineers and data scientists can keep track of data used for model training.\n", + "\n", + "Files ML Runtime\n", + "\n", + "- \u0007Schema enforced high\n", + "\n", + "quality data\n", + "\n", + "\n", + "\n", + "- Optimized performance\n", + "\n", + "��\n", + "\n", + "- \u0007Full data lineage /\n", + "\n", + "governance\n", + "\n", + "- \u0007reproductibility through\n", + "\n", + "time travel\n", + "\n", + "\n", + "Streaming\n", + "\n", + "Batch\n", + "\n", + "\n", + "Ingestion\n", + "\n", + "Tables\n", + "\n", + "\n", + "Ingestion\n", + "\n", + "\n", + "Data\n", + "\n", + "Catalog\n", + "\n", + "\n", + "Data\n", + "\n", + "\n", + "Feature\n", + "\n", + "Store\n", + "\n", + "\n", + "Feature\n", + "\n", + "\n", + "**Y O U R E X I S T I N G D E LTA L A K E**\n", + "\n", + "\n", + "3rd Party Data\n", + "\n", + "Marketplace\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "###### Ready-to-use ML environments\n", + "\n", + "Databricks Runtime for Machine Learning provides data scientists and ML practitioners with on-demand access to ready-to-use machine learning clusters that are\n", + "\n", + "preconfigured with the latest and most popular machine learning frameworks, including TensorFlow, Keras, PyTorch, scikit-learn, XGBoost and Horovod.\n", + "\n", + "By using the Databricks Runtime for ML, data scientists can get to results faster with one-click access to ML clusters, optimized performance on popular ML algorithms,\n", + "\n", + "and simplified distributed deep learning on Horovod and GPUs. It also supports Conda for further customization.\n", + "\n", + "\n", + "**P A C K A G E S A N D O P T I M I Z E S M O S T**\n", + "\n", + "**C O M M O N M L F R A M E W O R K S**\n", + "\n", + "\n", + "**C U S T O M I Z E D E N V I R O N M E N T S**\n", + "\n", + "**U S I N G C O N D A**\n", + "\n", + "\n", + "**C U S T O M I Z E D E N V I R O N M E N T S**\n", + "\n", + "\n", + "requirements.txt\n", + "conda.yaml\n", + "\n", + "\n", + "**...**\n", + "\n", + "\n", + "**B U I LT- I N O P T I M I Z AT I O N F O R**\n", + "\n", + "**D I S T R I B U T E D D E E P L E A R N I N G**\n", + "\n", + "Distribute and Scale any Single-Machine\n", + "ML Code to thousands of machines\n", + "\n", + "\n", + "**B U I LT- I N A U T O M L A N D**\n", + "\n", + "**E X P E R I M E N T T R A C K I N G**\n", + "\n", + "\n", + "Machine\n", + "\n", + "Learning\n", + "\n", + "\n", + "Machine\n", + "\n", + "\n", + "\n", + "Auto ML and Tracking /\n", + "Visualizations with MLflow\n", + "\n", + "\n", + "Conda-\n", + "\n", + "Based\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "CHAPTER 7: **\u0007** **Standardizing the**\n", + "\n", + "#### Machine Learning\n", + " Lifecycle on Databricks\n", + "\n", + "**B U I L D M O D E L**\n", + "**P R E P D ATA**\n", + "\n", + "��\n", + "\n", + "Azure ML\n", + "\n", + "**D E P L O Y M O D E L**\n", + "\n", + "��\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "CHAPTER 8: **\u0007** **Getting Started**\n", + "Take the next step toward standardizing your ML lifecycle — test drive MLflow and the\n", + "\n", + "Databricks Unified Data Analytics Platform.\n", + "\n", + "**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z 
E D D E M O](https://databricks.com/contact)**\n", + "\n", + "**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**M A C H I N E L E A R N I N G L I F E C Y C L E**\n", + "\n", + "CHAPTER 8: **\u0007** **Comparison Matrix**\n", + "\n", + "|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W   Self-hosted                |M A N A G E D M L F L O W O N D ATA B R I C K S   Fully managed    With remote execution             |\n", + "|---|---|---|\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf2024-09-19T16:57:20Z
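To make the tracking and Model Registry workflow described in this eBook concrete, here is a minimal sketch using the open source MLflow APIs; the model name, parameters and toy data are hypothetical placeholders, and the stage-based registry calls shown here are being superseded by model aliases in newer MLflow releases.\n", + "\n", + "```python\n", + "# Minimal sketch: track a run, register the model, then promote it through stages.\n", + "# The model name 'churn_model' and the toy data are hypothetical placeholders.\n", + "import mlflow\n", + "import mlflow.sklearn\n", + "from mlflow.tracking import MlflowClient\n", + "from sklearn.datasets import make_classification\n", + "from sklearn.linear_model import LogisticRegression\n", + "\n", + "X, y = make_classification(n_samples=500, random_state=42)\n", + "\n", + "with mlflow.start_run():\n", + "    model = LogisticRegression(C=0.5).fit(X, y)\n", + "    mlflow.log_param('C', 0.5)                              # parameters\n", + "    mlflow.log_metric('train_accuracy', model.score(X, y))  # metrics\n", + "    mlflow.sklearn.log_model(                               # artifact + registration\n", + "        model, 'model', registered_model_name='churn_model'\n", + "    )\n", + "\n", + "# Promote the newly registered version from None to Staging\n", + "client = MlflowClient()\n", + "version = client.get_latest_versions('churn_model', stages=['None'])[0].version\n", + "client.transition_model_version_stage(name='churn_model', version=version, stage='Staging')\n", + "```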
-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_improper_payments_eBook_v4_image.pdf2024-09-19T16:57:20Z
### Technical Migration Guide\n", + "\n", + "# Strategies to Evolve Your Data Warehouse to the Databricks Lakehouse\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents Lakehouse Architecture 3\n", + "\n", + "The Databricks Lakehouse Platform 4\n", + "\n", + "Business Value 5\n", + "\n", + "Single source of truth 5\n", + "\n", + "Data team 6\n", + "\n", + "Future-proof 6\n", + "\n", + "Migration to Lakehouse 7\n", + "\n", + "Overview 7\n", + "\n", + "Migration strategy 8\n", + "\n", + "Migration planning 9\n", + "\n", + "ELT approach 12\n", + "\n", + "Agile modernization 15\n", + "\n", + "Security and data governance 17\n", + "\n", + "Team involvement 19\n", + "\n", + "Conclusion 19\n", + "\n", + "\n", + "-----\n", + "\n", + "## Lakehouse Architecture\n", + "\n", + "\n", + "Data warehouses were designed to provide a central data repository\n", + "\n", + "with analytic compute capabilities to help business leaders\n", + "\n", + "get analytical insights, support decision-making and business\n", + "\n", + "intelligence (BI). Legacy on-premises data warehouse architectures\n", + "\n", + "are difficult to scale and make it difficult for data teams to keep up\n", + "\n", + "with the exponential growth of data. Oftentimes data teams publish\n", + "\n", + "and use a subset of well-defined data for development and testing.\n", + "\n", + "This slows down both innovation and time to insight.\n", + "\n", + "Cloud data warehouses (CDW) were an attempt to tackle the\n", + "\n", + "on-premises data warehouse challenges. CDWs removed the\n", + "\n", + "administrative burden of tasks such as setup, upgrades and\n", + "\n", + "backups. CDWs also improved scalability and introduced cloud’s\n", + "\n", + "pay-as-you-go model to reduce cost. CDWs leverage a proprietary\n", + "\n", + "data format to achieve cloud-scale and performance; however, this\n", + "\n", + "also leads to customers locked into these formats with difficult\n", + "\n", + "\n", + "But enterprise data teams don’t need a better data warehouse.\n", + "\n", + "They need an innovative, simple solution that provides reliable\n", + "\n", + "performance, elastic scale and allows self-service to unblock\n", + "\n", + "analytics to access all data at a reasonable cost. The answer is\n", + "\n", + "the lakehouse.\n", + "\n", + "The lakehouse pattern represents a paradigm shift from traditional\n", + "\n", + "on-premises data warehouse systems that are expensive and\n", + "\n", + "complex to manage. It uses an open data management architecture\n", + "\n", + "that combines the flexibility, cost-efficiency and scale of data\n", + "\n", + "lakes with the data management and ACID semantics of data\n", + "\n", + "warehouses. A lakehouse pattern enables data transformation,\n", + "\n", + "cleansing and validation to support both business intelligence and\n", + "\n", + "machine learning (ML) users on all data. Lakehouse is cloud-centric\n", + "\n", + "and unifies a complete up-to-date data set for teams, allowing\n", + "\n", + "collaboration across an organization.\n", + "\n", + "\n", + "paths to support use cases outside the data warehouse itself\n", + "\n", + "(i.e., machine learning). 
Customers often find themselves with a bifurcated architecture, which ultimately leads to a more costly and complex data platform over time.\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Databricks Lakehouse Platform\n", + "\n", + "The Databricks Lakehouse Platform is **simple**; it unifies your data, governance, analytics and AI on one platform. It’s **open** — the open source format Delta Lake unifies your data ecosystem with open standards and data formats. Databricks is **multicloud** — delivering one **consistent experience across all clouds** so you don’t need to reinvent the wheel for every cloud platform that you’re using to support your data and AI efforts.\n", + "\n", + "Databricks SQL stores and processes data using Delta Lake to simplify and enhance data warehousing capabilities. Analysts can use their favorite language, SQL, popular transformation tools such as dbt, and preferred BI tools like Power BI and Tableau to analyze data. The built-in query editor reduces contextual switching and improves productivity. Administrators enjoy simplified workload management via serverless compute and auto-scaling to meet high-concurrency workload needs. All this at a fraction of the cost of traditional data warehouses.\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "Data Warehousing | Data Engineering | Data Streaming | Data Science and ML\n", + "\n", + "Unity Catalog: Fine-grained governance for data and AI\n", + "\n", + "Delta Lake: Data reliability and performance\n", + "\n", + "Cloud Data Lake: All structured and unstructured data\n", + "\n", + "Simple | Open | Multicloud\n", + "\n", + "\n", + "-----\n", + "\n", + "## Business Value\n", + "\n", + "#### Single source of truth\n", + "\n", + "Databricks Delta Lake leverages cloud-based blob storage to provide an infinitely scalable storage layer where you can store all your data, including raw and historical data, alongside structured data tables in the data warehouse. The lakehouse pattern avoids data silos and shares the same elastic scale and governance across all use cases: BI, data engineering, streaming and AI/ML. This means that data engineering teams don’t have to move data to a proprietary data warehouse for business analysts or create a separate data store to support data science.\n", + "\n", + "Instead, data teams can access the open format Delta tables directly and combine data sets in the lakehouse, as needed. Data scientists can also work collaboratively on common data with access to versioned history to facilitate repeatable experiments. 
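That versioned history comes from Delta Lake time travel, which lets a data scientist pin an experiment to the exact snapshot of a table that was used for training; a minimal sketch follows, with a hypothetical table name.\n", + "\n", + "```python\n", + "# Minimal sketch (PySpark on Databricks): reading pinned snapshots of a Delta table.\n", + "# The table name 'main.sales.transactions' is a hypothetical placeholder.\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "current_df = spark.read.table('main.sales.transactions')\n", + "\n", + "# Reproduce an earlier experiment by reading the table as of a specific version ...\n", + "v1_df = spark.read.format('delta').option('versionAsOf', 1).table('main.sales.transactions')\n", + "\n", + "# ... or as of a point in time\n", + "snapshot_df = (\n", + "    spark.read.format('delta')\n", + "    .option('timestampAsOf', '2024-01-01')\n", + "    .table('main.sales.transactions')\n", + ")\n", + "\n", + "print(current_df.count(), v1_df.count(), snapshot_df.count())\n", + "```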
A single source\n", + "\n", + "of truth facilitates moving from descriptive to predictive analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Data team\n", + "\n", + "\n", + "With central data governance and fine-grained access control\n", + "\n", + "capabilities to secure the lakehouse, you can enable self-service\n", + "\n", + "SQL analytics for everyone on the Databricks Lakehouse Platform.\n", + "\n", + "This allows each team to be more agile and innovate faster.\n", + "\n", + "**Data Analysts** — Using the Databricks SQL editor\n", + "\n", + "or their tools of choice (DBT, Power BI, Tableau), SQL\n", + "\n", + "analysts can leverage familiar toolsets.\n", + "\n", + "**Data Engineers** — Utilizing Delta Lake as a unified\n", + "\n", + "storage layer, data engineering teams can eliminate\n", + "\n", + "duplicate data and ETL jobs that move data across\n", + "\n", + "various systems. Databricks supports both batch and\n", + "\n", + "streaming workloads to reduce bottlenecks and serve\n", + "\n", + "the most up-to-date data to downstream users and\n", + "\n", + "applications.\n", + "\n", + "**Administrators** — The pay-as-you-go, decentralized\n", + "\n", + "compute resource allows each team to run their\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides a reliable ETL and data\n", + "\n", + "management framework to simplify ETL pipelines. Data teams can\n", + "\n", + "build end-to-end data transformations in a single pipeline instead of\n", + "\n", + "many small ETL tasks. Databricks supports data quality enforcement\n", + "\n", + "to ensure reliability with auto-scalable infrastructure. Your teams\n", + "\n", + "can onboard new data sources quickly to power new use cases with\n", + "\n", + "fresh data. This not only allows your team to efficiently and reliably\n", + "\n", + "deliver high-quality data in a timely manner, it also reduces ETL\n", + "\n", + "workload cost significantly.\n", + "\n", + "#### Future-proof\n", + "\n", + "Unlike CDWs that lock customers in, Databricks offers an open\n", + "\n", + "platform with open standards, open protocols and open data\n", + "\n", + "formats. It supports a full range of popular languages (SQL, Python,\n", + "\n", + "R, Scala) and popular BI tools. You can leverage the performant\n", + "\n", + "and low-cost distributed compute layer for data processing — or\n", + "\n", + "use a variety of tools and engines to efficiently access the data via\n", + "\n", + "Databricks APIs. Databricks also allows data consumption with a rich\n", + "\n", + "partner ecosystem. Teams can handle all existing BI and AI use cases\n", + "\n", + "with the flexibility to support future use cases as they emerge.\n", + "\n", + "\n", + "workload in isolated environments without worrying\n", + "\n", + "about contention. Serverless SQL endpoint frees your\n", + "\n", + "team from infrastructure management challenges.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Migration to Lakehouse\n", + "\n", + "#### Overview\n", + "\n", + "A lakehouse is the ideal data architecture for data-driven organizations. It combines the\n", + "\n", + "best qualities of data warehouses and data lakes to provide a single solution for all major\n", + "\n", + "data workloads and supports use cases from streaming analytics to BI, data science and\n", + "\n", + "AI. The Databricks Lakehouse Platform leverages low-cost, durable cloud storage and\n", + "\n", + "only consumes (charges for) compute resources when workloads are running. 
This pay-\n", + "\n", + "\n", + "**C U S T O M E R S T O R Y**\n", + "##### Building the Lakehouse\n", + " at Atlassian\n", + "\n", + "[Watch now](https://www.youtube.com/watch?v=Xo1U617T-mU)\n", + "\n", + "\n", + "as-you-go model means compute resources are automatically shut down if no processing\n", + "\n", + "is needed. Data teams can use small clusters that can power individual workloads\n", + "\n", + "they plan to migrate. They can make the choice to leverage serverless SQL endpoints\n", + "\n", + "and completely free data teams from infrastructure capacity planning and cluster\n", + "\n", + "maintenance. The auto-scaling, elastic nature of Databricks clusters leads to significant\n", + "\n", + "savings on infrastructure cost and maintenance. Organizations typically achieve 50% TCO\n", + "\n", + "savings compared to other cloud data warehouses.\n", + "\n", + "Data warehouse migration is never an easy task. Databricks aims to mitigate the things\n", + "\n", + "that can go wrong in these demanding migration projects. The Databricks Lakehouse\n", + "\n", + "Platform provides many out-of-the-box features to mitigate migration risks.\n", + "\n", + "**C U S T O M E R S T O R Y**\n", + "##### Driving Freight Transportation Into the Future\n", + "\n", + "[Read more](https://databricks.com/customers/jbhunt)\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Migration strategy\n", + "\n", + "\n", + "Migration is a huge effort and very expensive. Yet, almost every\n", + "\n", + "enterprise has to migrate to new platforms every 3–5 years because\n", + "\n", + "the old platform cannot support new use cases, catch up with\n", + "\n", + "data growth or meet scaling needs. To get better ROI on migration,\n", + "\n", + "implement a migration strategy that can reduce future re-platform\n", + "\n", + "needs and extend to your future data and AI strategy.\n", + "\n", + "Use the opportunity of a data migration to standardize your data\n", + "\n", + "in open Delta format to allow existing and future tools to access\n", + "\n", + "it directly without moving or converting it. Merge your siloed\n", + "\n", + "data warehouses into the unified storage layer in the Databricks\n", + "\n", + "Lakehouse Platform — without worrying about storage capacity.\n", + "\n", + "The unified storage layer allows your team to deploy a unified data\n", + "\n", + "governance on top to secure all data access consistently. Simplify\n", + "\n", + "your data governance story with Databricks Unity Catalog.\n", + "\n", + "\n", + "Move toward a single, consistent approach to data pipelining\n", + "\n", + "and refinement. Merge batch and streaming into a single end-\n", + "\n", + "to-end pipeline to get fresher data and provide more real-time\n", + "\n", + "decisions. Take a metadata-driven approach to align the dataflow\n", + "\n", + "with business processes and have data validation and quality\n", + "\n", + "check built-in. Through a series of curation and refinement steps,\n", + "\n", + "the output results in highly consumable and trusted data for\n", + "\n", + "downstream use cases.\n", + "\n", + "The lakehouse architecture makes it possible for the organization\n", + "\n", + "to create “data assets” by taking a stepwise approach to improving\n", + "\n", + "data and serving all essential use cases. Encourage your BI/analyst\n", + "\n", + "team to leverage Databricks serverless endpoints for self-serve\n", + "\n", + "and agility. 
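To make the 'standardize your data in open Delta format' step above concrete, here is a minimal sketch of two common patterns, an in-place conversion of existing Parquet files and a rewrite of exported warehouse data as a Delta table; the paths and table names are hypothetical placeholders.\n", + "\n", + "```python\n", + "# Minimal sketch (PySpark on Databricks); paths and table names are hypothetical.\n", + "from pyspark.sql import SparkSession\n", + "\n", + "spark = SparkSession.builder.getOrCreate()\n", + "\n", + "# Pattern 1: convert an existing Parquet directory to Delta in place\n", + "spark.sql('CONVERT TO DELTA parquet.`/mnt/landing/legacy_dw/orders`')\n", + "\n", + "# Pattern 2: rewrite data exported from the legacy warehouse as a managed Delta table\n", + "(\n", + "    spark.read.format('parquet').load('/mnt/landing/legacy_dw/customers/')\n", + "    .write.format('delta').mode('overwrite')\n", + "    .saveAsTable('migration_bronze.customers')\n", + ")\n", + "```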
Each team can evaluate their top priority workloads and\n", + "\n", + "migrate them in parallel to speed up migration.\n", + "\n", + "Take advantage of Databricks’ rich partner ecosystem. Your favorite\n", + "\n", + "partners are likely already integrated via Partner Connect and\n", + "\n", + "can be set up with a few clicks. There are also many ISV and SI\n", + "\n", + "consulting partners who can help your migration journey.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Migration planning\n", + "\n", + "Migrating a data warehouse to the cloud can be time consuming and challenging for your\n", + "\n", + "data teams. It’s important to agree on the data architecture, migration strategy and process/\n", + "\n", + "frameworks to be used before undertaking a data migration. Databricks provides Migration\n", + "\n", + "Assessment and Architecture Review sessions to develop a joint migration roadmap. This\n", + "\n", + "process is designed to help organizations to successfully migrate to a lakehouse architecture.\n", + "\n", + "Based on information collected and business objectives, the Databricks team will work with\n", + "\n", + "customers to propose a target architecture and provide a tailored migration roadmap.\n", + "\n", + "These assessments help get a full picture of current data systems and the future vision. They\n", + "\n", + "clarify what you are migrating and do proper use case discovery. This includes identifying\n", + "\n", + "workloads and data source dependency, for example:\n", + "\n", + "Sample migration assessment checklist:\n", + "\n", + "Identify upstream data sources and workload dependencies\n", + "\n", + "Identify active/inactive data sets and database objects\n", + "\n", + "Identify downstream application dependencies and data freshness requirements\n", + "\n", + "Define a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n", + "\n", + "Define security requirements and data governance\n", + "\n", + "Clarify access management need, document needed permissions per user/group\n", + "\n", + "Outline current tooling (ingestion, ETL and BI) and what’s needed\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s important to identify key stakeholders and keep them engaged during the migration to\n", + "\n", + "make sure they are aligned with the overall objectives. The workload assessment result will\n", + "\n", + "be reviewed with key stakeholders. Through the review process, data teams can get a better\n", + "\n", + "understanding of which workloads can most benefit from modernization.\n", + "\n", + "Databricks often works with partners to provide a workload assessment and help customers\n", + "\n", + "understand their migration complexity and properly plan a budget. Databricks also partners\n", + "\n", + "with third-party vendors that provide migration tools to securely automate major migration\n", + "\n", + "tasks. 
Databricks Partner Connect makes it easy to connect with this ecosystem of tools to help with the migration, including:\n", + "\n", + "- Code conversion tooling that can automatically translate 70%–95% of the SQL code in your current system to Databricks optimized code with Delta and other best practices\n", + "\n", + "- Converters that automate multiple GUI-based ETL/ELT platform conversion to reduce migration time and cost\n", + "\n", + "- Data migration tools that can migrate data from on-premises storage to cloud storage 2x–3x faster than what was previously possible\n", + "\n", + "\n", + "-----\n", + "\n", + "#### We can use Automated conversion for most workload types\n", + "\n", + "| | EDWs | Databricks Lakehouse |\n", + "|---|---|---|\n", + "| Data Migration | DB locked formats on disks | Open Cloud Storage (ADLS, S3, GCP Storage) |\n", + "| Metastore Migration | Databases, Tables, Views | Databricks Tables, Views |\n", + "| SQL Migration | Ad-hoc SQL queries | Spark SQL in Databricks Notebooks |\n", + "| SQL Migration | T-SQL, PL/SQL, BTEQ | Spark SQL + a little bit of Python or Scala |\n", + "| SQL Migration | Reports from PBI, Tableau etc. | Runs on Databricks JDBC/ODBC |\n", + "| Security | GRANTs, Roles | Databricks permissions - Table ACLs |\n", + "| Security | External tables - File permissions | Credential Pass-throughs to Files |\n", + "| ETL Tools | DataStage, PowerCenter, Ab Initio etc. | Big Data ETL tools, Databricks Notebooks |\n", + "| Orchestration | ETL Schedulers | Airflow DAGs, ADF, Databricks Jobs and any other Enterprise Schedulers |\n", + "\n", + "\n", + "-----\n", + "\n", + "#### ELT approach\n", + "\n", + "The separation of storage and compute makes ELT on lakehouse a better choice than traditional ETL. You can ingest all raw data to Delta Lake, leverage low-cost storage and create a Medallion data implementation from raw/Bronze to curated/Gold depending on what’s needed to support use cases. During ingestion, basic data validation can occur, but establishing a Bronze data layer is the foundation of a single-pane-of-glass for the business. Teams can leverage compute resources as needed without a fixed compute infrastructure. Establishing a Silver layer further enriches data by exploring and applying transformations. ELT allows data teams to break pipelines into smaller “migrations,” starting with a simple workload, then improving the pipeline design iteratively.\n", + "\n", + "**IMPROVE DATA QUALITY:** Data Lake (CSV, TXT, JSON) -> Bronze Tables (raw integration) -> Silver Tables (filtered, cleaned, augmented) -> Gold Tables (business-level aggregates) -> Streaming Analytics and Reporting\n", + "\n", + "\n", + "-----\n", + "\n", + "We highly recommend leveraging [Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables), a new cloud-native managed service in the Databricks Lakehouse Platform that provides a reliable ETL framework to modernize your data pipeline at scale. 
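For illustration, a declarative pipeline over the Bronze and Silver layers described above might look roughly like the sketch below; it assumes the dlt Python module available inside a Databricks Delta Live Tables pipeline, and the source path, table names and quality rule are hypothetical placeholders.\n", + "\n", + "```python\n", + "# Minimal sketch of a Delta Live Tables pipeline (runs inside a DLT pipeline on Databricks,\n", + "# where both the dlt module and the spark session are provided by the runtime).\n", + "# Source path, table names and the expectation rule are hypothetical placeholders.\n", + "import dlt\n", + "from pyspark.sql import functions as F\n", + "\n", + "@dlt.table(comment='Raw orders ingested incrementally from cloud storage (Bronze).')\n", + "def orders_bronze():\n", + "    return (\n", + "        spark.readStream.format('cloudFiles')   # Auto Loader for incremental ingestion\n", + "        .option('cloudFiles.format', 'json')\n", + "        .load('/Volumes/landing/raw/orders/')\n", + "    )\n", + "\n", + "@dlt.table(comment='Cleaned and validated orders (Silver).')\n", + "@dlt.expect_or_drop('valid_order_id', 'order_id IS NOT NULL')\n", + "def orders_silver():\n", + "    return dlt.read_stream('orders_bronze').withColumn('order_ts', F.to_timestamp('order_ts'))\n", + "```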
Instead of migrating multiple ETL tasks one by one in a traditional data warehouse, you can focus on source and expected output, and create your entire dataflow graph declaratively. Delta Live Tables offers:

• A metadata-driven approach — you just specify what data should be in each table or view rather than the details of how processing should be done

• An end-to-end data pipeline with data quality and freshness checks, end-to-end monitoring/visibility, error recovery, and lineage, which reduces the strain on data engineering teams and improves time-to-value in building data pipelines

• Automatic management of all the dependencies within the pipeline. This ensures all tables are populated correctly, whether continuously or on a regular schedule. For example, updating one table will automatically trigger all downstream table updates to keep data up to date.

• All pipelines are built code-first, which makes editing, debugging and testing of data pipelines simpler and easier. DLT can also automatically recover from common error conditions, reducing operational overhead.

-----

#### Agile modernization

Agile development allows teams to move quickly, knowing migrated pipelines can be revisited in a later cycle and evolving data models are supported within the architecture. Allowing business impact to drive priorities via an agile approach helps mitigate migration risks. Prioritizing and selecting use cases where modernization brings business benefits quickly is a good starting point. Focus on the 20% of workloads that consume 80% of the budget. By breaking workflows down into components and managing data stories, teams can adjust priorities over time. Changes can be made in collaboration with the user community to fit the business definition of value.

Migrating to a lakehouse architecture leverages the separation of storage and compute to remove resource contention between ETL and BI workloads. As a result, the migration process can be more agile, allowing you to evolve your design iteratively without a big-bang effort:

• Reduce time spent during the initial phase on full capacity planning and scoping

• Flexible cloud infrastructure and unlimited, autoscaling storage

• Workload management is much simpler: you can isolate each workload with a dedicated compute resource, without worrying about managing workload contention

• Auto-scale and tear down the compute resources after the job is done to achieve cost efficiency

All of this allows you to take a more iterative and business-focused approach to migration instead of a full planning, execution, test/validation approach. Here are more approaches that help facilitate this phased implementation:

• Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html). Auto Loader helps to ingest new data into pipelines quicker to get data in near real-time.

• Delta Live Tables (DLT) improves data quality during data transformation and automatically scales to address data volume change. DLT can also support schema evolution and quarantine bad data or data that needs to be reprocessed at a later stage.

• Use dedicated clusters to isolate workloads, lower the total cost of ownership and improve overall performance. By using multiple clusters, we can shut down resources when not in use and move away from managing fixed resources in a single large cluster.

-----

Leverage Databricks’ deep bench of expertise to build reusable assets along the migration:

• Create a migration factory for an iterative migration process

• Determine and implement a security and governance framework

• Establish a to-be environment and move use cases/workloads in logical units

• Prove business value and scale over time

• Add new functionality continuously so important business requirements are not left on hold during migration

By taking this iterative and templated approach, migration speed will accelerate: customers can finish migration 15%–20% faster and reduce the amount of tech debt created during the migration.

[Figure: build foundations (“make it work”), then parallelize the iterations (“make it work right,” “make it work fast”) across full-lifecycle lakehouse workloads, with each workload moving through migration, functionality, and optimization/Delta stages. Leverage Databricks’ deep bench of expertise to build out templates for the most effective Databricks implementation, and take an iterative, bite-sized approach to migration to reduce tech debt and rework and bring forward the value of the solution earlier.]

-----

To maximize the value of your lakehouse, you should consider retiring some legacy architecture design patterns. Leverage the migration process to simplify data warehousing tasks. Regardless of how you complete your migration, you could utilize lakehouse strengths to improve architectural patterns:

• Merge your siloed data warehouses on your unified lakehouse platform and unify data access and data governance via Unity Catalog. The lakehouse architecture provides a unified storage layer for all your data where there is no physical boundary between data.
There is no need to keep data copies for each system using the data set. Clean up and remove jobs that were created to keep data in sync across various data systems. Keep a single copy of raw data in your lakehouse as a single source of truth.

• The Databricks Lakehouse Platform allows you to merge batch and streaming into a single system to build a simple continuous data flow model that processes data as it arrives. Process data in near real-time and enable data-driven decisions with the most recent updates.

• Simplify your workload isolation and management by running jobs in dedicated clusters. Separating storage and compute allows you to easily isolate each task with isolated compute resources. There is no need to squeeze them into a single large data appliance and spend lots of time managing and coordinating resources. Leverage the elasticity of the Databricks compute layer to automatically handle workload concurrency changes at peak time instead of paying for over-provisioned resources most of the time. This greatly simplifies the workload management effort that traditional data warehouses require.

• Simplify disaster recovery. Storage and compute separation allows easy disaster recovery. Cloud storage provides very good data redundancy and supports automated replication to another region. Customers can spin up compute resources quickly in another region and maintain service availability in case of an outage.

-----

#### Security and data governance

Security is paramount in any data-driven organization. Data security should enforce the business needs for both internal and external data, so the lakehouse should be set up to meet your organization’s security requirements. Databricks provides built-in security to protect your data during and after migration.

• Encrypt data at rest and in transit, using a cloud-managed key or your own

• Set up a custom network policy and use IP ranges to control access

• Leverage Private Link to keep network traffic from traversing the public internet

• Enable SSO and integrate with Active Directory and other IdPs

• Control data access to database objects using RBAC

• Enable audit logs to monitor user activities

The challenge with the traditional data warehouse and data lake architecture is that data is stored in multiple stores and your data team also needs to manage data access and data governance twice. The lakehouse pattern uses unified storage, which simplifies governance. The Databricks Lakehouse Platform provides a unified governance layer across all your data teams. Migrating to Databricks Unity Catalog provides data discovery, data lineage, role-based security policies, table or row/column-level access control, and central auditing capabilities that make the data platform easy for data stewards to confidently manage and secure data access to meet compliance and privacy needs, directly on the lakehouse.

-----

[Figure: centralized governance, with account-level user management, credentials, audit log, access control (ACL store), metastore, lineage explorer and data explorer managed under a single governance layer.]

-----

#### Team involvement

Plan to educate and train your team iteratively throughout the migration process. As new workloads are migrated, new teams will gain exposure to the lakehouse pattern. Plan to ramp up new team members as the migration process progresses, developing a data Center of Excellence within the organization. Databricks provides a cost-effective platform for ad hoc work to be performed, and a sandbox environment can be leveraged for teams to get exposure to Databricks technology and gain hands-on experience. Databricks also provides [learning path](https://databricks.com/learn/training/home) training for customers. Encourage teams to get hands-on experience relevant to their immediate tasks, gain exposure to new things and try new ideas.

#### Conclusion

Data warehouse migration touches many business areas and impacts many teams, but the Databricks Lakehouse Platform simplifies this transition, reduces risks and accelerates your ROI. The Databricks Business Value Consulting team can work with you to quantify the impact of your use cases to both data and business teams. And the Databricks team of solution architects, professional services, and partners are ready to help.

Reach out to your Databricks account team or send a message to [sales@databricks.com](mailto:sales%40databricks.com?subject=) to get started.

#### Additional resources

[Migrate to Databricks](https://databricks.com/solutions/migration)

[Modernize Your Data Warehouse](https://databricks.com/p/webinar/apj-modernize-your-data-warehouse)

-----

##### About Databricks

Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems.
To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[Sign up for a free trial](https://databricks.com/try-databricks)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf2024-09-19T16:57:21Z
**The Delta Lake Series: Lakehouse**

Combining the best elements of data lakes and data warehouses

-----

#### What’s inside?

The Delta Lake Series of eBooks is published by Databricks to help leaders and practitioners understand the full capabilities of Delta Lake as well as the landscape it resides in. This eBook, **The Delta Lake Series — Lakehouse**, focuses on the lakehouse. Here’s what you’ll find inside:

**Introduction:** What is Delta Lake?

**Chapter 01:** What Is a Lakehouse?

**Chapter 02:** Diving Deep Into the Inner Workings of the Lakehouse and Delta Lake

**Chapter 03:** Understanding Delta Engine

#### What’s next?

After reading this eBook, you’ll not only understand what Delta Lake offers, but you’ll also understand how its features result in substantial performance improvements.

-----

#### What is Delta Lake?

[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully compatible with Apache Spark™ APIs.

At Databricks, we’ve seen how Delta Lake can bring reliability, performance and lifecycle management to data lakes. Our customers have found that Delta Lake solves challenges around malformed data ingestion, difficulties deleting data for compliance, and issues modifying data for data capture.

With Delta Lake, you can accelerate the velocity at which high-quality data can get into your data lake and the rate at which teams can leverage that data, with a secure and scalable cloud service.

-----

**What Is a Lakehouse?**
### CHAPTER 01

-----

Over the past few years at Databricks, we’ve seen a new data management architecture that emerged independently across many customers and use cases: the **lakehouse**. In this chapter, we’ll describe this new architecture and its advantages over previous approaches.

Data warehouses have a long history in decision support and business intelligence applications. Since its inception in the late 1980s, data warehouse technology has continued to evolve, and MPP architectures led to systems that were able to handle larger data sizes.

But while warehouses were great for structured data, many modern enterprises have to deal with unstructured data, semi-structured data, and data with high variety, velocity and volume.
Data warehouses are not suited for many of these use cases, and\n", + "\n", + "they are certainly not the most cost-efficient.\n", + "\n", + "As companies began to collect large amounts of data from many different sources,\n", + "\n", + "architects began envisioning a single system to house data for many different\n", + "\n", + "analytic products and workloads.\n", + "\n", + "About a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n", + "\n", + "in a variety of formats. While suitable for storing data, data lakes lack some critical\n", + "\n", + "features: They do not support transactions, they do not enforce data quality, and their\n", + "\n", + "lack of consistency / isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "\n", + "-----\n", + "\n", + "**A lakehouse combines the best elements**\n", + "**of data lakes and data warehouses**\n", + "\n", + "A lakehouse is a new data architecture that combines the best elements of data lakes\n", + "\n", + "and data warehouses.\n", + "\n", + "\n", + "and batch and streaming jobs. For these reasons, many of the promises of data lakes\n", + "\n", + "have not materialized and, in many cases, lead to a loss of many of the benefits of data\n", + "\n", + "warehouses.\n", + "\n", + "\n", + "The need for a flexible, high-performance system hasn’t abated. Companies\n", + "\n", + "\n", + "require systems for diverse data applications including SQL analytics, real-time\n", + "\n", + "monitoring, data science and machine learning. Most of the recent advances in\n", + "\n", + "AI have been in better models to process unstructured data (text, images, video,\n", + "\n", + "audio), but these are precisely the types of data that a data warehouse is not\n", + "\n", + "optimized for.\n", + "\n", + "A common approach is to use multiple systems — a data lake, several data\n", + "\n", + "warehouses, and other specialized systems such as streaming, time-series, graph\n", + "\n", + "and image databases. Having a multitude of systems introduces complexity and,\n", + "\n", + "more importantly, introduces delay as data professionals invariably need to move\n", + "\n", + "or copy data between different systems.\n", + "\n", + "\n", + "Lakehouses are enabled by a new system design: implementing similar data struc-\n", + "\n", + "tures and data management features to those in a data warehouse, directly on the\n", + "\n", + "kind of low-cost storage used for data lakes. They are what you would get if you had\n", + "\n", + "to redesign data warehouses in the modern world, now that cheap and highly reliable\n", + "\n", + "storage (in the form of object stores) are available.\n", + "\n", + "A lakehouse has the following key features:\n", + "\n", + "- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n", + "\n", + "be reading and writing data concurrently. Support for ACID transactions ensures\n", + "\n", + "consistency as multiple parties concurrently read or write data, typically using\n", + "\n", + "SQL.\n", + "\n", + "\n", + "-----\n", + "\n", + "- **\u0007Schema enforcement and governance:** The lakehouse should have a way to\n", + "\n", + "support schema enforcement and evolution, supporting DW schema paradigms\n", + "\n", + "such as star/snowflake-schemas. 
The system should be able to reason about data\n", + "\n", + "integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n", + "\n", + "reduces staleness and improves recency, reduces latency and lowers the cost of\n", + "\n", + "having to operationalize two copies of the data in both a data lake and a warehouse.\n", + "\n", + "- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n", + "\n", + "compute use separate clusters, thus these systems are able to scale to many more\n", + "\n", + "concurrent users and larger data sizes. Some modern data warehouses also have\n", + "\n", + "this property.\n", + "\n", + "- **\u0007Openness:** The storage formats they use are open and standardized, such as\n", + "\n", + "Parquet, and they provide an API so a variety of tools and engines, including\n", + "\n", + "machine learning and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n", + "\n", + "The lakehouse can be used to store, refine, analyze and access data types needed\n", + "\n", + "for many new data applications, including images, video, audio, semi-structured\n", + "\n", + "data, and text.\n", + "\n", + "- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n", + "\n", + "analytics. Multiple tools might be needed to support all these workloads, but they all\n", + "\n", + "rely on the same data repository.\n", + "\n", + "- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n", + "\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "\n", + "serving real-time data applications.\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional\n", + "\n", + "features. Tools for security and access control are basic requirements. Data governance\n", + "\n", + "capabilities including auditing, retention and lineage have become essential particularly\n", + "\n", + "in light of recent privacy regulations. Tools that enable data discovery such as data\n", + "\n", + "catalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n", + "\n", + "features only need to be implemented, tested and administered for a single system.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Read the research**\n", + "**Delta Lake: High-Performance ACID**\n", + "**Table Storage Over Cloud Object Stores**\n", + "\n", + "**Abstract**\n", + "\n", + "Cloud object stores such as Amazon S3 are some of the largest and most\n", + "\n", + "cost-effective storage systems on the planet, making the main attractive\n", + "\n", + "target to store large data warehouses and data lakes. Unfortunately, their\n", + "\n", + "implementation as key-value stores makes it difficult to achieve ACID\n", + "\n", + "transactions and high performance: Metadata operations, such as listing\n", + "\n", + "objects, are expensive, and consistency guarantees are limited. In this paper,\n", + "\n", + "we present Delta Lake, an open source ACID table storage layer over cloud\n", + "\n", + "object stores initially developed at Databricks. 
Delta Lake uses a transaction log\n", + "\n", + "that is compacted into Apache Parquet format to provide ACID properties, time\n", + "\n", + "travel, and significantly faster metadata operations for large tabular data sets\n", + "\n", + "(e.g., the ability to quickly search billions of table partitions for those relevant\n", + "\n", + "to a query). It also leverages this design to provide high-level features such\n", + "\n", + "as automatic data layout optimization, upserts, caching, and audit logs. Delta\n", + "\n", + "Lake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and\n", + "\n", + "other systems. Delta Lake is deployed at thousands of Databricks customers\n", + "\n", + "that process exabytes of data per day, with the largest instances managing\n", + "\n", + "exabyte-scale data sets and billions of objects.\n", + "\n", + "Authors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n", + "\n", + "Zhu, Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja\n", + "\n", + "Łuszczak, Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n", + "\n", + "Boncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n", + "\n", + "Read the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Some early examples**\n", + "\n", + "The [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n", + "\n", + "Microsoft’s Azure Synapse Analytics service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n", + "\n", + "enables a similar lakehouse pattern. Other managed services such as BigQuery and\n", + "\n", + "Redshift Spectrum have some of the lakehouse features listed above, but they are\n", + "\n", + "examples that focus primarily on BI and other SQL applications.\n", + "\n", + "Companies that want to build and implement their own systems have access to open\n", + "\n", + "source file formats (Delta Lake, Apache Iceberg, Apache Hudi) that are suitable for\n", + "\n", + "building a lakehouse.\n", + "\n", + "Merging data lakes and data warehouses into a single system means that data teams\n", + "\n", + "can move faster as they are able to use data without needing to access multiple systems.\n", + "\n", + "The level of SQL support and integration with BI tools among these early lakehouses\n", + "\n", + "is generally sufficient for most enterprise data warehouses. Materialized views and\n", + "\n", + "\n", + "A note about technical building blocks. While distributed file systems can be\n", + "\n", + "used for the storage layer, object stores are more commonly used in lakehouses.\n", + "\n", + "Object stores provide low-cost, highly available storage that excels at massively\n", + "\n", + "parallel reads — an essential requirement for modern data warehouses.\n", + "\n", + "**From BI to AI**\n", + "\n", + "The lakehouse is a new data management architecture that radically simplifies\n", + "\n", + "enterprise data infrastructure and accelerates innovation in an age when\n", + "\n", + "machine learning is poised to disrupt every industry. 
In the past, most of the\n", + "\n", + "data that went into a company’s products or decision-making was structured\n", + "\n", + "data from operational systems, whereas today, many products incorporate\n", + "\n", + "AI in the form of computer vision and speech models, text mining and others.\n", + "\n", + "Why use a lakehouse instead of a data lake for AI? A lakehouse gives you data\n", + "\n", + "versioning, governance, security and ACID properties that are needed even for\n", + "\n", + "unstructured data.\n", + "\n", + "\n", + "stored procedures are available, but users may need to employ other mechanisms that\n", + "\n", + "\n", + "aren’t equivalent to those found in traditional data warehouses. The latter is particularly\n", + "\n", + "important for “lift and shift scenarios,” which require systems that achieve semantics\n", + "\n", + "that are almost identical to those of older, commercial data warehouses.\n", + "\n", + "What about support for other types of data applications? Users of a lakehouse have\n", + "\n", + "access to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n", + "\n", + "libraries) for non-BI workloads like data science and machine learning. Data\n", + "\n", + "exploration and refinement are standard for many analytic and data science\n", + "\n", + "applications. Delta Lake is designed to let users incrementally improve the quality of\n", + "\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specialized\n", + "\n", + "systems (such as data warehouses) that have years of investments and real-\n", + "\n", + "world deployments behind them. Users may favor certain tools (BI tools, IDEs,\n", + "\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "\n", + "and other issues will be addressed as the technology continues to mature and\n", + "\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "\n", + "diverse data applications.\n", + "\n", + "\n", + "data in their lakehouse until it is ready for consumption.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the Inner Workings**\n", + "**of the Lakehouse and Delta Lake**\n", + "\n", + "### CHAPTER 02\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "# 02\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "\n", + "paper that describes some of the core technological challenges and solutions that\n", + "\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. 
You\n", + "\n", + "can read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "\n", + "[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "\n", + "storage, this pattern has been playing out for years. Vendors continue to try to reinvent\n", + "\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "\n", + "object stores like Amazon S3 have become some of the largest and most cost-\n", + "\n", + "effective storage systems in the world, which makes them an attractive platform to\n", + "\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,\n", + "\n", + "performance is hampered by expensive metadata operations (e.g., listing objects)\n", + "\n", + "and limited consistency guarantees.\n", + "\n", + "Based on the characteristics of cloud object stores, three approaches have emerged.\n", + "\n", + "**1. Data lakes**\n", + "\n", + "The first is directories of files (i.e., data lakes) that store the table as a collection\n", + "\n", + "of objects, typically in columnar format such as Apache Parquet. It’s an attractive\n", + "\n", + "approach because the table is just a group of objects that can be accessed from\n", + "\n", + "a wide variety of tools without a lot of additional data stores or systems. However,\n", + "\n", + "both performance and consistency problems are common. Hidden data corruption\n", + "\n", + "is common due to failed transactions, eventual consistency leads to inconsistent\n", + "\n", + "queries, latency is high, and basic management capabilities like table versioning and\n", + "\n", + "audit logs are unavailable.\n", + "\n", + "**2. Custom storage engines**\n", + "\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. 
Finally, and most egregiously, the proprietary metadata service locks\n", + "\n", + "customers into a specific service provider, leaving customers to contend with\n", + "\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "\n", + "adopt a new approach later.\n", + "\n", + "**3. Lakehouse**\n", + "\n", + "With Delta Lake, an open source ACID table storage layer atop cloud object stores,\n", + "\n", + "we sought to build a car instead of a faster horse with not just a better data store,\n", + "\n", + "but a fundamental change in how data is stored and used via the lakehouse. A\n", + "\n", + "lakehouse is a new architecture that combines the best elements of data lakes and\n", + "\n", + "data warehouses. Lakehouses are enabled by a new system design: implementing\n", + "\n", + "similar data structures and data management features to those in a data warehouse,\n", + "\n", + "directly on the kind of low-cost storage used for data lakes. They are what you would\n", + "\n", + "get if you had to redesign storage engines in the modern world, now that cheap and\n", + "\n", + "highly reliable storage (in the form of object stores) are available.\n", + "\n", + "Delta Lake maintains information about which objects are part of a Delta table in an\n", + "\n", + "ACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n", + "\n", + "the cloud object store. This design allows clients to update multiple objects at once,\n", + "\n", + "replace a subset of the objects with another, etc., in a serializable manner that still\n", + "\n", + "achieves high parallel read/write performance from the objects. The log also provides\n", + "\n", + "significantly faster metadata operations for large tabular data sets. Additionally, Delta\n", + "\n", + "Lake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n", + "\n", + "snapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n", + "\n", + "caching, and audit logs. Together, these features improve both the manageability and\n", + "\n", + "performance of working with data in cloud object stores, ultimately opening the door\n", + "\n", + "to the lakehouse architecture that combines the key features of data warehouses and\n", + "\n", + "data lakes to create a better, simpler data architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "Today, Delta Lake is used across thousands of Databricks customers, processing\n", + "\n", + "exabytes of structured and unstructured data each day, as well as many organizations\n", + "\n", + "in the open source community. These use cases span a variety of data sources and\n", + "\n", + "applications. The data types stored include Change Data Capture (CDC) logs from\n", + "\n", + "enterprise OLTP systems, application logs, time-series data, graphs, aggregate\n", + "\n", + "tables for reporting, and image or feature data for machine learning. The applications\n", + "\n", + "include SQL workloads (most commonly), business intelligence, streaming, data\n", + "\n", + "science, machine learning and graph analytics. 
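As a concrete illustration of the upserts and transaction log described above, here is a minimal sketch using the open source delta-spark Python API on a Databricks or Delta-enabled Spark environment. The `spark` session is assumed to exist, and the path, schema and rows are purely illustrative.

```python
from delta.tables import DeltaTable

path = "/tmp/delta_demo/orders"  # illustrative Delta table location

# Create a small Delta table to work against
spark.createDataFrame([(1, "open"), (2, "open")], ["order_id", "status"]) \
     .write.format("delta").mode("overwrite").save(path)

# Upsert: update matching rows and insert new ones in a single ACID commit
updates = spark.createDataFrame([(2, "shipped"), (3, "open")], ["order_id", "status"])
(DeltaTable.forPath(spark, path).alias("t")
    .merge(updates.alias("u"), "t.order_id = u.order_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute())

# Every commit, including the merge above, is recorded in the transaction log
spark.sql(f"DESCRIBE HISTORY delta.`{path}`") \
     .select("version", "operation", "operationMetrics") \
     .show(truncate=False)
```

The same log is what powers time travel: reading the table with `spark.read.format("delta").option("versionAsOf", 0).load(path)` returns it exactly as it was before the merge.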
Overall, Delta Lake has proven itself to\n", + "\n", + "be a good fit for most data lake applications that would have used structured storage\n", + "\n", + "formats like Parquet or ORC, and many traditional data warehousing workloads.\n", + "\n", + "Across these use cases, we found that customers often use Delta Lake to significantly\n", + "\n", + "simplify their data architecture by running more workloads directly against cloud\n", + "\n", + "object stores, and increasingly, by creating a lakehouse with both data lake and\n", + "\n", + "transactional features to replace some or all of the functionality provided by message\n", + "\n", + "queues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n", + "\n", + "Amazon Redshift).\n", + "\n", + "**[In the research paper,](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **the authors explain:**\n", + "\n", + "- The characteristics and challenges of object stores\n", + "\n", + "- The Delta Lake storage format and access protocols\n", + "\n", + "- The current features, benefits and limitations of Delta Lake\n", + "\n", + "- Both the core and specialized use cases commonly employed today\n", + "\n", + "- Performance experiments, including TPC-DS performance\n", + "\n", + "Through the paper, you’ll gain a better understanding of Delta Lake and how it\n", + "\n", + "enables a wide range of DBMS-like performance and management features for data\n", + "\n", + "held in low-cost cloud storage. As well as how the Delta Lake storage format and\n", + "\n", + "access protocols make it simple to operate, highly available, and able to deliver high-\n", + "\n", + "bandwidth access to the object store.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding Delta Engine**\n", + "\n", + "### CHAPTER 03\n", + "\n", + "\n", + "-----\n", + "\n", + "**Understanding**\n", + "**Delta Engine**\n", + "# 03\n", + "\n", + "The Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n", + "\n", + "engine to take advantage of modern CPU architecture with optimizations to Spark\n", + "\n", + "3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n", + "\n", + "Runtime 7.0. Together, these features significantly accelerate query performance on\n", + "\n", + "data lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n", + "\n", + "adopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n", + "\n", + "**Scaling execution performance**\n", + "\n", + "One of the big hardware trends over the last several years is that CPU clock speeds\n", + "\n", + "have plateaued. The reasons are outside the scope of this chapter, but the takeaway\n", + "\n", + "is that we have to find new ways to process data faster beyond raw compute power.\n", + "\n", + "One of the most impactful methods has been to improve the amount of data that can\n", + "\n", + "be processed in parallel. However, data processing engines need to be specifically\n", + "\n", + "architected to take advantage of this parallelism.\n", + "\n", + "In addition, data teams are being given less and less time to properly model data as\n", + "\n", + "the pace of business increases. Poorer modeling in the interest of better business\n", + "\n", + "agility drives poorer query performance. 
Naturally, this is not a desired state, and\n", + "\n", + "organizations want to find ways to maximize both agility and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Announcing Delta Engine for**\n", + "**high-performance query execution**\n", + "\n", + "Delta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n", + "\n", + "workloads through three components: an improved query optimizer, a caching\n", + "\n", + "layer that sits between the execution layer and the cloud object storage, and a native\n", + "\n", + "vectorized execution engine that’s written in C++.\n", + "\n", + "The improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n", + "\n", + "optimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n", + "\n", + "statistics to deliver up to 18x increased performance in star schema workloads.\n", + "\n", + "Delta Engine’s caching layer automatically chooses which input data to cache for the\n", + "\n", + "user, transcoding it along the way in a more CPU-efficient format to better leverage\n", + "\n", + "the increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n", + "\n", + "performance for virtually all workloads.\n", + "\n", + "However, the biggest innovation in Delta Engine to tackle the challenges facing\n", + "\n", + "data teams today is the native execution engine, which we call Photon. (We know.\n", + "\n", + "\n", + "-----\n", + "\n", + "It’s in an engine within the engine…). This completely rewritten execution engine for\n", + "\n", + "Databricks has been built to maximize the performance from the new changes in\n", + "\n", + "modern cloud hardware. It brings performance improvements to all workload types\n", + "\n", + "while remaining fully compatible with open Spark APIs.\n", + "\n", + "**Getting started with Delta Engine**\n", + "\n", + "By linking these three components together, we think it will be easier for customers\n", + "\n", + "to understand how improvements in multiple places within the Databricks code\n", + "\n", + "aggregate into significantly faster performance for analytics workloads on data lakes.\n", + "\n", + "We’re excited about the value that Delta Engine delivers to our customers. 
While the\n", + "\n", + "time and cost savings are already valuable, its role in the lakehouse pattern supports\n", + "\n", + "new advances in how data teams design their data architectures for increased\n", + "\n", + "unification and simplicity.\n", + "\n", + "For more information on the Delta Engine, watch this keynote address from\n", + "\n", + "[Spark + AI Summit 2020:](https://www.youtube.com/watch?v=o54YMz8zvCY) [Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n", + "\n", + "\n", + "-----\n", + "\n", + "## What’s next?\n", + "\n", + "\n", + "Now that you understand Delta Lake and how its features can improve\n", + "\n", + "performance, it may be time to take a look at some additional resources.\n", + "\n", + "**Data + AI Summit Europe 2020 >**\n", + "\n", + "- [Photon Technical Deep Dive: How to Think Vectorized](https://databricks.com/session_eu20/photon-technical-deep-dive-how-to-think-vectorized)\n", + "\n", + "\n", + "**Explore subsequent eBooks in the collection >**\n", + "\n", + "- The Delta Lake Series — Fundamentals and Performance\n", + "\n", + "- The Delta Lake Series — Features\n", + "\n", + "- The Delta Lake Series — Streaming\n", + "\n", + "- The Delta Lake Series — Customer Use Cases\n", + "\n", + "\n", + "\n", + "- [MLflow, Delta Lake and Lakehouse Use Cases Meetup and AMA](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup)\n", + "\n", + "- [Common Strategies for Improving Performance on Your Delta Lakehouse](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n", + "\n", + "\n", + "\n", + "- [Achieving Lakehouse Models With Spark 3.0](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0)\n", + "\n", + "- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n", + "\n", + "\n", + "**Do a deep dive into Delta Lake >**\n", + "\n", + "- [Analytics on the Data Lake With Tableau and the Lakehouse Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n", + "\n", + "- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n", + "\n", + "\n", + "**Vodcasts and podcasts >**\n", + "\n", + "\n", + "\n", + "- [Welcome to Lakehouse. Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)\n", + "\n", + "- [Data Brew by Databricks | Season 1: Lakehouses](https://databricks.com/discover/data-brew)\n", + "\n", + "\n", + "**[Try Databricks for free >](https://databricks.com/product/delta-lake-on-databricks)**\n", + "**[Learn more >](https://databricks.com/product/delta-lake-on-databricks)**\n", + "\n", + "\n", + "\n", + "- [Data Alone Is Not Enough: The Evolution of Data Architectures](https://a16z.com/2020/10/22/data-alone-is-not-enough-the-evolution-of-data-architectures/)\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf2024-09-19T16:57:19Z
**EBOOK**

# All Roads Lead to the Lakehouse

#### A deep dive into data ingestion with the lakehouse

-----

## Contents

Introduction

Life of a Data Engineer

Ingesting From Cloud Object Stores

- COPY INTO

- Auto Loader

Ingesting Data From External Applications

- Partner Connect

-----

### Introduction

Organizations today are inundated with data siloed across various on-premises application systems, databases, data warehouses and SaaS applications. This fragmentation makes it difficult to support new use cases for analytics or machine learning, so many IT teams are now centralizing all of their data with a lakehouse architecture built on top of Delta Lake, an open format storage layer.

The first thing data engineers need to do to support the lakehouse architecture is to efficiently move data from various systems into their lakehouse. Ingesting data is a critical first step in the data engineering and management lifecycle.

-----

### Life of a Data Engineer

The primary focus of data engineers is to provide timely and reliable data to downstream data teams at an organization. Requests for data can come from a variety of teams, and for a variety of data types.
For example:\n", + "\n", + "**•** Marketing team requests for Facebook and Google ad data in order to analyze spend and\n", + "\n", + "better allocate their budget for ads\n", + "\n", + "**•** Security team looking to get access to a table with low latency security data from Kafka,\n", + "\n", + "in order to run rules to detect intrusions into the network\n", + "\n", + "**•** Sales operations requesting customer data from Salesforce to enrich existing tables\n", + "\n", + "**•** Finance team hoping to find a way to automatically ingest critical data from Google\n", + "\n", + "Sheets or transaction data from AWS Kinesis\n", + "\n", + "In each of these common scenarios, data engineers must create usable and easily\n", + "\n", + "queryable tables from semi-structured and unstructured data. Beyond writing queries to\n", + "\n", + "retrieve and transform all this data, the data engineering team must also be concerned\n", + "\n", + "with performance, because running these queries on an ongoing basis can be a big load on\n", + "\n", + "the system.\n", + "\n", + "Data engineers face the challenge of constant requests and ongoing business\n", + "\n", + "\n", + "###### W H AT I S \n", + " D E LTA L A K E ?\n", + "\n", + "Before thinking about ingestion into Delta Lake, it’s important to\n", + "\n", + "understand why ingesting into Delta Lake is the right solution in\n", + "\n", + "the first place. [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format data management\n", + "\n", + "layer that brings data warehouse capabilities to your open data\n", + "\n", + "lake. Across industries, enterprises have enabled true collaboration\n", + "\n", + "among their data teams with a reliable single source of truth\n", + "\n", + "enabled by Delta Lake. By delivering quality, reliability, security and\n", + "\n", + "performance on your data lake — for both streaming and batch\n", + "\n", + "operations — Delta Lake eliminates data silos and makes analytics\n", + "\n", + "accessible across the enterprise. With Delta Lake, customers can\n", + "\n", + "build a cost-efficient, highly scalable lakehouse that eliminates\n", + "\n", + "data silos and provides self-serving analytics to end users.\n", + "\n", + "\n", + "requirements, as well as an ever-changing ecosystem. As business requirements change,\n", + "\n", + "so do the requirements around schemas, necessitating custom code to handle the\n", + "\n", + "changes. With all of these challenges, the work of a data engineer is extremely critical, and\n", + "\n", + "increasingly complex, with many steps involved before getting data to a state where it can\n", + "\n", + "actually be queried by the business stakeholders. So how do data engineers get the data\n", + "\n", + "that each of these teams need at the frequency, with the freshness, and in the format\n", + "\n", + "required?\n", + "\n", + "\n", + "-----\n", + "\n", + "### Ingesting From Cloud Object Stores\n", + "\n", + "There are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n", + "\n", + "cloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. 
Typically, customers are looking to migrate\n", + "\n", + "existing tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,\n", + "\n", + "[COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) , and [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . We will focus on Auto Loader and COPY INTO here.\n", + "\n", + "\n", + "**Auto Loader**\n", + "\n", + "Auto Loader is an optimized data ingestion tool that incrementally and efficiently\n", + "\n", + "processes new data files as they arrive in cloud storage with minimal DevOps effort. You\n", + "\n", + "just need to provide a source directory path and start a streaming job. The new structured\n", + "\n", + "streaming source, called “cloudFiles”, will automatically set up file notification services that\n", + "\n", + "\n", + "**COPY INTO**\n", + "\n", + "COPY INTO is a SQL command that allows you to perform batch file ingestion into Delta\n", + "\n", + "Lake. COPY INTO is a command that ingests files with exactly-once semantics, best used\n", + "\n", + "when the input directory contains thousands of files or fewer, and the user prefers SQL.\n", + "\n", + "COPY INTO can be used over JDBC to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "subscribe file events from the input directory and process new files as they arrive, with the\n", + "\n", + "option of also processing existing files in that directory. Auto Loader has interfaces through\n", + "\n", + "Python and Scala, and can be used with SQL through Delta Live Tables.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### COPY INTO\n", + "\n", + "\n", + "COPY INTO is a powerful yet simple SQL command that allows you to perform batch file\n", + "\n", + "ingestion into Delta Lake and perform many of the use cases outlined in this section. COPY\n", + "\n", + "INTO can be run once, in an ad hoc manner, and can be scheduled through Databricks jobs.\n", + "\n", + "```\n", + "FILEFORMAT = CSV\n", + "FORMAT_OPTIONS (‘header’ = ‘true’)\n", + "\n", + "```\n", + "\n", + "While COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n", + "\n", + "\n", + "events by using cloud functions such as AWS Lambda or through orchestrators like Apache\n", + "\n", + "Airflow. COPY INTO supports incremental appends and simple transformations.\n", + "\n", + "COPY INTO is a great command to use when your source directory contains a small number\n", + "\n", + "of files (i.e., thousands of files or less). To ingest a larger number of files, we recommend\n", + "\n", + "Auto Loader, which we will cover later in this eBook.\n", + "\n", + "**Common Use Cases for COPY INTO**\n", + "\n", + "**Ingesting data to a new Delta table**\n", + "\n", + "A common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n", + "\n", + "table. 
To copy data into a new Delta table, users can use CREATE TABLE command first,\n", + "\n", + "followed by COPY INTO.\n", + "\n", + "Step 1: `CREATE TABLE` `my_table (id` `INT` `, name STRING, age` `INT` `);`\n", + "Step 2 1 : `COPY INTO` `my_table`\n", + "```\n", + " FROM ‘s3://my_bucket/my_path’ WITH (\n", + " CREDENTIAL (\n", + " AWS_ACCESS_KEY = ‘*****’,\n", + " AWS_SECRET_KEY = ‘*****’,\n", + " AWS_SESSION_TOKEN = ‘*****’\n", + " )\n", + " ENCRYPTION (\n", + " TYPE = ‘AWS_SSE_C’,\n", + " MASTER_KEY = ‘*****’\n", + "\n", + "```\n", + "\n", + "The code block above covers the AWS temporary in-line credential format. When you use\n", + "\n", + "in-line credentials in Azure and AWS, the following parameters are required for each type of\n", + "\n", + "credential and encryption:\n", + "\n", + "\n", + "|Credential Name|Required Parameters|\n", + "|---|---|\n", + "|AWS temporary credentials|AWS_ACCESS_KEY AWS_SECRET_KEY|\n", + "||AWS_SESSION_TOKEN|\n", + "|Azure SAS token|AZURE_SAS_TOKEN|\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "|Encryption Name|Required Parameters|\n", + "|---|---|\n", + "|AWS server-side encryption with customer-provided encryption key|TYPE = ‘AWS_SSE_C’ MASTER_KEY|\n", + "|Azure client-provided encryption key|ATYPE = ‘AZURE_CSE’ MASTER_KEY|\n", + "\n", + "\n", + "**Appending data to your Delta table**\n", + "\n", + "To append data to a Delta table, users can leverage the COPY INTO command. COPY INTO\n", + "\n", + "is a powerful SQL command that is idempotent and incremental. When using COPY INTO,\n", + "\n", + "users point to a location of files, and once those files are ingested, Delta Lake will keep\n", + "\n", + "1 If you only have temporary access to a cloud object store, you can use temporary in-line credentials to ingest data from\n", + "the cloud object store. When you are an admin or with ANY FILE access, and the instance profile has been set for the\n", + "cloud object store, you do not need to specify credentials in-line for COPY INTO.\n", + "\n", + "\n", + "-----\n", + "\n", + "track of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n", + "\n", + "get idempotency with COPY INTO, which means users are prevented from ingesting the\n", + "\n", + "same data twice to the same table.\n", + "```\n", + " COPY INTO table_identifier\n", + " FROM [ file_location | ( SELECT expression_list FROM file_location)]\n", + " FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n", + " [ FILES = [file_name [,...] | PATTERN = ‘regex_pattern’ ]\n", + " [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n", + " [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n", + "\n", + "```\n", + "One of the main benefits of COPY INTO is that users don’t have to worry about providing a\n", + "\n", + "schema, because the schema is automatically inferred from your data files. Here is a very\n", + "\n", + "simple example of how you would ingest data from CSV files that have headers, where you\n", + "\n", + "leave the tool to infer the schema and the proper data types. 
It’s as simple as that.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n", + "\n", + "```\n", + "**Using COPY INTO without an existing table** 2\n", + "\n", + "```\n", + " CREATE TABLE my_delta_table (dummy string);\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS (\n", + " ‘header’ = ‘true’ ,\n", + " ‘inferSchema’ = ‘true’ ,\n", + " ‘mergeSchema’ = ‘true’\n", + " )\n", + " COPY_OPTIONS ( ‘overwrite’ = ‘true’ , ‘overwriteSchema’ = ‘true’ )\n", + "\n", + "```\n", + "**Ingesting a CSV file without headers**\n", + "\n", + "If you are looking to ingest a CSV file that doesn’t have headers, columns will be named as\n", + "\n", + "_c0 or _c1, with the index of the column. You can use the double colon syntax to cast the\n", + "\n", + "data type that you want and then alias these columns to whatever you want to call them.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM ( SELECT\n", + " _c0::int as key,\n", + " _c1::double value,\n", + " _c2::timestamp event_time\n", + " FROM ‘s3://my-bucket/path/to/csv_files’ )\n", + " FILEFORMAT = CSV\n", + "\n", + "```\n", + "\n", + "In the most common case, in order to use COPY INTO, a table definition is required.\n", + "\n", + "However, if you would like to get started quickly and don’t have an existing table or require\n", + "\n", + "a specific schema, you can create your table with a dummy schema. Then, once you run\n", + "\n", + "COPY INTO, you can overwrite the table and overwrite the schema. COPY INTO will actually\n", + "\n", + "infer the data types, and then change your Delta table to have the required schema.\n", + "\n", + "2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Evolving schema over time for CSV files** 3\n", + "\n", + "When ingesting CSV files that have a different number of columns than your existing table,\n", + "\n", + "you can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n", + "\n", + "as FORMAT_OPTIONS and COPY_OPTIONS. FORMAT_OPTIONS applies to the source data.\n", + "\n", + "Once “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n", + "\n", + "files and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n", + "\n", + "when you’re running the COPY INTO command. When “mergeSchema” is provided as a\n", + "\n", + "copy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n", + "\n", + "evolution only allows the addition of new columns. 
Data type changes for existing columns\n", + "\n", + "are not supported.\n", + "```\n", + " COPY INTO my_delta_table\n", + " FROM (SELECT\n", + " _C0::int as key,\n", + " _C1::double value,\n", + " _C2::timestamp event_time,\n", + " ...\n", + " FROM ‘s3://my-bucket/path/to/csv_files’ )\n", + " FILEFORMAT = CSV\n", + " FORMAT_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n", + " COPY_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n", + "\n", + "```\n", + "\n", + "**Fixing bad data**\n", + "\n", + "If you find that there is a mistake in the source data file and some of the data you ingested\n", + "\n", + "is bad, you can use RESTORE on your Delta table and set it to the timestamp or version of\n", + "\n", + "the Delta table that you want to roll back to (e.g., to restore to yesterday’s data). Then you\n", + "\n", + "can rerun your COPY INTO command.\n", + "\n", + "Alternatively, if running a RESTORE is not possible, COPY INTO supports reloading files by\n", + "\n", + "the use of the “force” copy option. You can manually remove the old data from your Delta\n", + "\n", + "Lake table by running a DELETE operation and then using COPY INTO with “force” = “true”.\n", + "\n", + "You can use the PATTERN keyword to provide a file name pattern, or you can specify the file\n", + "\n", + "names with the FILES keyword to reload a subset of files in conjunction with “force”.\n", + "```\n", + " RESTORE my_delta_table TO TIMESTAMP AS OF date_sub(current_date(),\n", + " 1);\n", + " COPY INTO my_delta_table\n", + " FROM ‘s3://my-bucket/path/to/csv_files’\n", + " FILEFORMAT = CSV\n", + " PATTERN = ‘2021-09-08*.csv’\n", + " FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n", + " COPY_OPTIONS ( ‘force’ = ‘true’ )\n", + "\n", + "```\n", + "3 Limitation: schema evolution with “mergeSchema” in COPY_OPTIONS does not work in Databricks SQL workspace or\n", + "clusters enabled with table ACLs.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Auto Loader\n", + "\n", + "\n", + "While COPY INTO can solve a lot of the key use cases our customers face, due to its\n", + "\n", + "limitations (scalability), there are many scenarios where we recommend Auto Loader\n", + "\n", + "for data ingestion. Auto Loader is a data source on Databricks that incrementally and\n", + "\n", + "efficiently processes new data files as they arrive in cloud storage with minimal DevOps\n", + "\n", + "effort. Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n", + "\n", + "Auto Loader is an incremental streaming source that provides exactly-once ingestion\n", + "\n", + "guarantees. It keeps track of which files have been ingested using a durable key-value store.\n", + "\n", + "It can discover new files very efficiently and is extremely scalable. Auto Loader has been\n", + "\n", + "battle tested. We have seen customers running Auto Loader on millions of files an hour, and\n", + "\n", + "petabytes of data per day.\n", + "\n", + "To use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating\n", + "\n", + "that you will use Auto Loader to load files from the cloud object stores. 
Next, you specify\n", + "\n", + "the format of the file — for example, JSON — as an option to Auto Loader, and you specify\n", + "\n", + "where to load it from.\n", + "```\n", + " df = spark.readStream.format( “cloudFiles” )\n", + " .option( “cloudfiles.format” , “json” )\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "Under the hood, when data lands in your cloud storage, Auto Loader discovers files either\n", + "\n", + "through directory listing or file notifications. Given permissions to the underlying storage\n", + "\n", + "bucket or container, Auto Loader can list the directory that you want to load data from\n", + "\n", + "in an efficient and scalable manner and load data immediately. Alternatively, Auto Loader\n", + "\n", + "can also automatically set up file notifications on your storage account, which allows it\n", + "\n", + "\n", + "from queues, deduplicate these notifications using its key-value store and then process\n", + "\n", + "the underlying files. If there are any failures, Auto Loader will replay what hasn’t been\n", + "\n", + "processed, giving you exactly-once semantics.\n", + "\n", + "Directory listing mode is very easy to get started with. If your files are uploaded to your\n", + "\n", + "cloud storage system in a lexicographical order, Auto Loader will optimize the discovery of\n", + "\n", + "files by starting directory listing from the latest uploaded files, saving you both time and\n", + "\n", + "money. If files cannot be uploaded in a lexicographical order and you need Auto Loader\n", + "\n", + "to scale to high volumes, Databricks recommends using the file notification mode. Cloud\n", + "\n", + "services such as AWS Kinesis Firehose, AWS DMS and Azure Data Factory can be configured\n", + "\n", + "to upload files in a lexical order, typically by providing the upload time of records in the file\n", + "\n", + "path, such as /base/path/yyyy/MM/dd/HH/file.format.\n", + "\n", + "**Common Use Cases for Auto Loader**\n", + "\n", + "**New to Auto Loader**\n", + "\n", + "As a new user to the Databricks Lakehouse, you’ll want to ingest data from cloud object\n", + "\n", + "stores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n", + "\n", + "example using Python to demonstrate the ease and flexibility of Auto Loader with a few\n", + "\n", + "defined options. You can run the code in a notebook.\n", + "```\n", + " stream = spark.readStream \\\n", + " .format( “cloudFiles” ) \\\n", + " .option( “cloudFiles.format” , “csv” ) \\\n", + " .option( “cloudFiles.schemaLocation” , schema_location) \\\n", + " .load(raw_data_location)\n", + "\n", + "```\n", + "\n", + "to efficiently discover newly arriving files. When a file lands in file notification mode, the\n", + "\n", + "cloud storage system sends a notification to a queuing system. For example, in AWS, S3\n", + "\n", + "will send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n", + "\n", + "On Google, it’ll be sent to Pub/Sub. 
Auto Loader can then fetch these event notifications\n", + "\n", + "\n", + "-----\n", + "\n", + "In order to write to a Delta table from the stream, follow the example below:\n", + "```\n", + " stream.writeStream \\\n", + " .option( “mergeSchema” , “true” ) \\\n", + " .option( “checkpointLocation” , checkpoint_location) \\\n", + " .start(target_delta_table_location)\n", + "\n", + "```\n", + "**Migrating to Auto Loader**\n", + "\n", + "As a Spark user, you may be using an existing Spark structured streaming to process data.\n", + "\n", + "To migrate to Auto Loader, all a user needs to do is take existing streaming code and turn\n", + "\n", + "two lines of it into ‘cloudFiles’, specifying the file format within an option.\n", + "\n", + "\n", + "**Migrating a livestreaming pipeline**\n", + "\n", + "Migrating a livestreaming pipeline can be challenging, but with Auto Loader, as with COPY\n", + "\n", + "INTO, you can specify a timestamp when the source files are updated or created and Auto\n", + "\n", + "Loader will ingest all modified data after that point.\n", + "```\n", + " df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.format” , “json” )\n", + " .option( “modifiedAfter” , “2021-09-09 00:00:00” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "**Schema inference and evolution**\n", + "\n", + "Auto Loader provides schema inference and management capabilities. With a schema\n", + "\n", + "location specified, Auto Loader can store the changes to the inferred schema over time. For\n", + "\n", + "file formats like JSON and CSV, where the schemas can get fuzzy, schema inference on Auto\n", + "\n", + "Loader can automatically infer data types or treat everything as a string.\n", + "\n", + "When data does not match your schema (e.g., an unknown column or format), Auto Loader\n", + "\n", + "has a data rescue capability that will “rescue” all data in a separate column, stored as a\n", + "\n", + "JSON string, to investigate later. See [rescued data column](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader-schema.html#rescued-data-column) for more details.\n", + "\n", + "Auto Loader supports three schema evolution modes: add new columns as they are\n", + "\n", + "discovered, fail if an unexpected column is seen, or rescue new columns.\n", + "\n", + "```\n", + "df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.\n", + "format” , “json” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "```\n", + "df = spark.readStream\n", + " .format( “json” )\n", + " .options(format_options)\n", + " .schema(schema)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "\n", + "Once it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n", + "\n", + "Loader can scale to trillions of files, unlike the open-source file streaming source. One of\n", + "\n", + "the ways that Auto Loader does this is with asynchronous backfills. 
Instead of needing\n", + "\n", + "to discover files first, then plan, Auto Loader discovers and processes files concurrently,\n", + "\n", + "making it much more efficient and leading to cost reductions in compute resources.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fixing a file that was processed with Auto Loader**\n", + "\n", + "To fix a file that was already processed, Auto Loader supports an option called\n", + "\n", + "‘allowOverwrites’. With this option, Auto Loader can re-ingest and reprocess a file with a\n", + "\n", + "new timestamp. If you want to enable this option in an existing Auto Loader stream, you\n", + "\n", + "need to stop and restart the Auto Loader stream with the enabled option.\n", + "```\n", + " df = spark.readStream\n", + " .format( “cloudFiles” )\n", + " .option( “cloudFiles.format” , “json” )\n", + " .schema(schema)\n", + " .option( “cloudFiles.allowOverwrites” , “true” )\n", + " .options(format_options)\n", + " .load( “/path/to/table” )\n", + "\n", + "```\n", + "**Discover missing data**\n", + "\n", + "While event notification is a very scalable method to collect all data, it relies on cloud\n", + "\n", + "services, which are distributed systems and are not always reliable. With Auto Loader, you\n", + "\n", + "can additionally specify a backfill interval, where Auto Loader will perform asynchronous\n", + "\n", + "backfills at whatever interval you set up. This can be enabled with a once trigger,\n", + "\n", + "```\n", + " df = spark.readStream\n", + " .format(“cloudFiles”)\n", + " .option(“cloudFiles.format”, “json”)\n", + " .schema(schema)\n", + " .option( “cloudFiles.backfillInterval” , “1 week” )\n", + " .options(format_options)\n", + " .load(“/path/to/table”)\n", + " .writeStream\n", + " .trigger(Trigger.AvailableNow())\n", + " .option(“checkpointLocation”, checkpointDir)\n", + " .start()\n", + "\n", + "```\n", + "The trigger tells Auto Loader how frequently to process incoming data. A processing time\n", + "\n", + "trigger will have Auto Loader run continuously and schedule micro-batches at the trigger\n", + "\n", + "interval which you have set. The “Once” and “AvailableNow” triggers instruct Auto Loader to\n", + "\n", + "process all new data that has been added until the start of your application. Once the data\n", + "\n", + "is processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n", + "\n", + "process all the new data in a single micro-batch, which requires it to first discover all the\n", + "\n", + "new files. With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n", + "\n", + "and perform rate limiting, which makes it a preferable alternative to Trigger Once.\n", + "\n", + "\n", + "processing time trigger and available now trigger. The following example shows how to use\n", + "\n", + "backfill internal and trigger availableNow together:\n", + "\n", + "\n", + "-----\n", + "\n", + "**Using Auto Loader in SQL with Delta Live Tables**\n", + "\n", + "Delta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n", + "\n", + "framework to develop, test, monitor, manage and operationalize data pipelines at scale to\n", + "\n", + "drive insights for data science, machine learning and analytics. 
Auto Loader is available in\n", + "\n", + "Delta Live Tables.\n", + "\n", + "```\n", + "CREATE INCREMENTAL LIVE TABLE\n", + " autoloader_test\n", + "AS\n", + "SELECT\n", + " *,\n", + " id + id2 AS new_id\n", + "FROM\n", + " CLOUD_FILES (\n", + " “some/cloud/path” , – the path to the data\n", + " “json” – the file format\n", + " );\n", + "\n", + "```\n", + "\n", + "**Live Tables understands**\n", + "\n", + "**and coordinates data flow**\n", + "\n", + "**between your queries**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Ingesting Data From External Applications\n", + "\n", + "While Auto Loader and COPY INTO are powerful tools, not all data is available as files\n", + "\n", + "in cloud object stores. In order to enable a lakehouse, it is critical to incorporate all of\n", + "\n", + "your data and break down the silos between sources and downstream teams. To do this,\n", + "\n", + "customers need to discover and connect a broad set of data, BI and AI tools, and systems\n", + "\n", + "to the data within their lakehouse.\n", + "\n", + "##### Partner Connect\n", + "\n", + "Historically, stitching multiple enterprise tools and data sources together has been a burden\n", + "\n", + "on the end user, making it very complicated and expensive to execute at any scale. Partner\n", + "\n", + "Connect solves this challenge by making it easy for you to integrate data, analytics and AI\n", + "\n", + "tools directly within their Databricks Lakehouse. It also allows you to discover new, pre-\n", + "\n", + "validated solutions from Databricks partners that support your expanding analytics needs.\n", + "\n", + "To ingest into the lakehouse, select the partner tile in Partner Connect via the left\n", + "\n", + "navigation bar in Databricks. Partner Connect will automatically configure resources such\n", + "\n", + "as clusters, tokens and connection files for you to connect with your data ingestion tools\n", + "\n", + "of choice. You can finish signing up for a trial account on the partner’s website or directly\n", + "\n", + "log in if you already used Partner Connect to create a trial account. Once you log in, you will\n", + "\n", + "see that Databricks is already configured as a destination in the partner portal and ready\n", + "\n", + "to be used.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Common Use Case for Partner Connect**\n", + "\n", + "**Ingest Salesforce data via Fivetran into Delta Lake**\n", + "\n", + "Clicking on the Fivetran tile in Partner Connect starts an automated workflow between\n", + "\n", + "the two products. Databricks automatically provisions a SQL endpoint and associated\n", + "\n", + "credentials for Fivetran to interact with, and passes the user’s identity and the SQL\n", + "\n", + "\n", + "endpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n", + "\n", + "Databricks destination is automatically created. This destination is configured to ingest into\n", + "\n", + "Delta via the SQL endpoint that was auto-configured by Partner Connect.\n", + "\n", + "The customer now selects their choice of data source in Fivetran from hundreds of pre-\n", + "\n", + "built connectors — for example, Salesforce. The user authenticates to the Salesforce\n", + "\n", + "source, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "(in this case the Account & Contact objects) and starts the initial sync. 
This automation\n", + "\n", + "has saved users dozens of manual steps and copying/pasting of configuration if they\n", + "\n", + "manually set up the connection. It also protects the user from making any unintentional\n", + "\n", + "configuration errors and spending time debugging those errors. The Salesforce tables\n", + "\n", + "are now available to query, join and analyze in Databricks SQL. Watch the [demo](https://databricks.com/partnerconnect#partner-demos) for more\n", + "\n", + "details or check out the [Partner Connect guide](https://docs.databricks.com/integrations/partner-connect/index.html?_gl=1*1mz2ts6*_gcl_aw*R0NMLjE2MzY2NzU1NDcuQ2p3S0NBaUFtN09NQmhBUUVpd0FydkdpM0ZHS3ptZTR5Z2YzR3E4ajVrYTNaUExOUEFnaTZIMnNRU05EMC1RYzl0dGxXQjl6ajRuNU14b0N0OGdRQXZEX0J3RQ..&_ga=2.83627156.328510291.1641248936-1825366797.1612985070) to learn more.\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 5,000 organizations worldwide — including Comcast,\n", + "\n", + "Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n", + "\n", + "unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the\n", + "\n", + "globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a\n", + "\n", + "mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , LinkedIn and Facebook .\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf2024-09-19T16:57:19Z
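The eBook above walks through COPY INTO for idempotent batch ingestion and Auto Loader for incremental ingestion of newly arriving files. As a quick recap, here is a minimal, hypothetical PySpark sketch of the two patterns side by side; it assumes a Databricks runtime with a pre-defined `spark` session, and the table name, bucket path, schema location and checkpoint location are illustrative placeholders, not values taken from the eBook.

```python
# Minimal sketch of the two ingestion patterns described above (Databricks runtime assumed).
# All table names and paths below are hypothetical placeholders.

# Pattern 1: COPY INTO — idempotent, incremental batch ingestion into an existing Delta table.
spark.sql("""
  COPY INTO my_delta_table
  FROM 's3://my-bucket/path/to/csv_files'
  FILEFORMAT = CSV
  FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true')
""")

# Pattern 2: Auto Loader — streaming ingestion that tracks which files have already
# been processed and stores the inferred schema at cloudFiles.schemaLocation.
stream = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "csv")
    .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/my_delta_table")
    .load("s3://my-bucket/path/to/csv_files")
)

(
    stream.writeStream
    .option("mergeSchema", "true")
    .option("checkpointLocation", "s3://my-bucket/_checkpoints/my_delta_table")
    .trigger(availableNow=True)  # process all new files, then shut down
    .toTable("my_delta_table")
)
```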
# 2023 State of Data + AI

Powered by the Databricks Lakehouse

-----

[Cover graphic: “We’re in the golden age of data and AI”]

-----

INTRO

In the 6 months since ChatGPT launched, the world has woken up to the vast potential of AI. The unparalleled pace of AI discoveries, model improvements and new products on the market puts data and AI strategy at the top of conversations across every organization around the world. We believe that AI will usher in the next generation of product and software innovation, and we’re already seeing this play out in the market. The next generation of winning companies and executives will be those who understand and leverage AI.

In this report, we examine patterns and trends in data and AI adoption across more than 9,000 global Databricks customers. By unifying business intelligence (BI) and AI applications across companies’ entire data estates, the Databricks Lakehouse provides a unique vantage point into the state of data and AI, including which products and technologies are the fastest growing, the types of data science and machine learning (DS/ML) applications being developed and more.

-----

Here are the major stories we uncovered:

Companies are adopting machine learning and large language models (LLMs) at a rapid pace. Natural language processing (NLP) is dominating use cases, with an accelerated focus on LLMs.

Organizations are investing in data integration products as they prioritize more DS/ML initiatives. 
50% of our fastestgrowing products represent\n", + "the data integration category.\n", + "\n", + "\n", + "Organizations are increasingly\n", + "using the Lakehouse for data\n", + "warehousing, as evidenced\n", + "by the high growth of data\n", + "integration tools dbt and\n", + "Fivetran, and the accelerated\n", + "adoption of Databricks SQL.\n", + "\n", + "\n", + "We hope that by sharing these trends, data leaders will be able to benchmark\n", + "their organizations and gain insights that help inform their strategies for an\n", + "era defined by data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Summary of\n", + "\n", + "Key Findings\n", + " DATA SCIENCE AND MACHINE LEARNING:\n", + "\n", + " NLP AND LLMS ARE IN HIGH DEMAND\n", + " 1\n", + "\n", + "```\n", + "**•** The number of companies using SaaS LLM APIs (used to access\n", + "services like ChatGPT) has grown 1310% between the end of\n", + "November 2022 and the beginning of May 2023\n", + "\n", + "**•** NLP accounts for 49% of daily Python data science library usage,\n", + "making it the most popular application\n", + "\n", + "**•** Organizations are putting substantially more models into production\n", + "(411% YoY growth) while also increasing their ML experimentation\n", + "(54% YoY growth)\n", + "\n", + "**•** Organizations are getting more efficient with ML; for every three\n", + "\n", + "experimental models, roughly one is put into production, compared\n", + "to five experimental models a year prior\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "FASTEST-GROWING DATA\n", + "AND AI PRODUCTS\n", + "\n", + "```\n", + "```\n", + "ADOPTION AND\n", + "MIGRATION TRENDS\n", + "\n", + "```\n", + "61% of customers migrating to the\n", + "Lakehouse are coming from onprem and cloud data warehouses\n", + "\n", + "The volume of data in Delta Lake\n", + "has grown 304% YoY\n", + "\n", + "The Lakehouse is increasingly\n", + "being used for data warehousing,\n", + "including serverless data\n", + "warehousing with Databricks\n", + "SQL, which grew 144% YoY\n", + "\n", + "\n", + "BI is the top data and AI market, but\n", + "growth trends in other markets show that\n", + "companies are increasingly looking at\n", + "more advanced data use cases\n", + "\n", + "The fastest-growing data and AI product\n", + "is dbt, which grew 206% YoY by number\n", + "of customers\n", + "\n", + "Data integration is the fastest-growing\n", + "data and AI market on the Databricks\n", + "Lakehouse with 117% YoY growth\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Methodology: How did Databricks\n", + "\n", + "create this report?\n", + "\n", + "```\n", + "The _2023 State of Data + AI_ is built from fully-aggregated, anonymized data\n", + "collected from our customers based on how they are using the Databricks\n", + "Lakehouse and its broad ecosystem of integrated tools. This report focuses\n", + "on machine learning adoption, data architecture (integrations and migrations)\n", + "and use cases. 
The customers in this report represent every major industry\n", + "and range in size from startups to many of the world’s largest enterprises.\n", + "\n", + "Unless otherwise noted, this report presents and analyzes data from February 1,\n", + "2022, to January 31, 2023, and usage is measured by number of customers.\n", + "When possible, we provide YoY comparisons to showcase growth trends over time.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Data Science and\n", + "\n", + "Machine Learning\n", + "NATURAL LANGUAGE PROCESSING AND LARGE\n", + "LANGUAGE MODELS ARE IN HIGH DEMAND\n", + "\n", + "```\n", + "Across all industries, companies leverage data science and\n", + "machine learning (DS/ML) to accelerate growth, improve\n", + "predictability and enhance customer experiences. Recent\n", + "advancements in large language models (LLMs) are propelling\n", + "companies to rethink AI within their own data strategies.\n", + "Given the rapidly evolving DS/ML landscape, we wanted to\n", + "understand several aspects of the market:\n", + "\n", + "- Which types of DS/ML applications are companies investing\n", + "in? In particular, given the recent buzz, what does the data\n", + "around LLMs look like?\n", + "\n", + "- Are companies making headway on operationalizing\n", + "\n", + "their machine learning models (MLOps)?\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Time Series Time Series\n", + "Speech Recognition\n", + "Simulations &\u0003\n", + "\n", + "Optimizations\n", + "Recommender Systems\n", + "Natural\n", + "\n", + "\u0003Language \u0003\n", + "\n", + "Processing\n", + "Industry Data Modeling\n", + "Graph\n", + "Geospatial\n", + "Computer Vision\n", + "Anomaly Detection\n", + "\u0003& Segmentation\n", + "\n", + "```\n", + "```\n", + " SPECIALIZED PYTHON \u0003DS/ML\n", + "\n", + " LIBRARIES FROM \u0003FEBRUARY 2022 \n", + "\n", + " TO JANUARY 2023\n", + "\n", + "```\n", + "\n", + "Note: This chart reflects the unique\n", + "number of notebooks using ML\n", + "libraries per day in each of the\n", + "categories. It includes libraries used\n", + "for the particular problem-solving use\n", + "cases mentioned. It does not include\n", + "libraries used in tooling for data\n", + "preparations and modeling.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Natural language processing dominates\n", + "\n", + "machine learning use cases\n", + "\n", + "```\n", + "\n", + "Our second most popular DS/ML application is\n", + "simulations and optimization, which accounts for 30% of\n", + "all use cases. This signals organizations are using data to\n", + "model prototypes and solve problems cost-effectively.\n", + "\n", + "\n", + "To understand how organizations are applying AI and\n", + "ML within the Lakehouse, we aggregated the usage\n", + "of specialized Python libraries, which include NLTK,\n", + "Transformers and FuzzyWuzzy, into popular data science\n", + "use cases. 1 We look at data from these libraries because\n", + "Python is on the cutting edge of new developments in ML,\n", + "advanced analytics and AI, and has consistently ranked\n", + "as one of the [most popular programming languages](https://www.tiobe.com/tiobe-index/) in\n", + "recent years.\n", + "\n", + "Our most popular use case is natural language processing\n", + "(NLP), a rapidly growing field that enables businesses to\n", + "gain value from unstructured textual data. 
This opens the\n", + "door for users to accomplish tasks that were previously\n", + "too abstract for code, such as summarizing content or\n", + "extracting sentiment from customer reviews. In our data\n", + "set, 49% of libraries used are associated with NLP. LLMs\n", + "also fall within this bucket. Given the innovations launched\n", + "in recent months, we expect to see NLP take off even\n", + "more in coming years as it is applied to use cases like\n", + "chatbots, research assistance, fraud detection, content\n", + "generation and more.\n", + "\n", + "```\n", + " In our data set, 49% of\n", + " specialized Python libraries\n", + " used are associated with NLP\n", + "\n", + "```\n", + "Many of the DS/ML use cases are predominantly\n", + "leveraged by specific industries. While they take up a\n", + "smaller share of the total, they are mission-critical for\n", + "many organizations. For example, time series includes\n", + "forecasting, a use case that is especially popular in\n", + "industries such as Retail and CPG, which rely heavily\n", + "on the ability to forecast the need for every item in\n", + "every store.\n", + "\n", + "\n", + "1. This data does not include general-purpose ML libraries, including\n", + "scikit-learn or TensorFlow.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + " USE OF LARGE LANGUAGE MODELS (LLMS)\n", + "\n", + "```\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "\n", + "We have rolled these libraries up into groupings based on the type of functionality they provide.\n", + "\n", + "|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|Col26|Col27|Col28|Col29|Col30|Col31|\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "||||||||||||||||||||||||||||||||\n", + "|||||||||||||||||||||||Ma||rch 24, Dolly La||2023 unch|||||\n", + "||||sformer|-Related|||||||||||\u0003C||||||||||||||||\n", + "|||Tran||||||||||||||||, 2022 Launch|||||||||||||\n", + "|||\u0003Libr|aries LLM AP|Is|||||||||||||||||||||||||||\n", + "|||SaaS|||||||||||||||||||||||||||||\n", + "|||LLM|Tools||||||||||||||||||||||||||||\n", + "||||||||||||||||||||||||||||||||\n", + "||||||||||||||||||||||||||||||||\n", + "||||||||||||||||||||||||||||||||\n", + "||||||||||||||||||||||||||||||||\n", + "||||||||||||||||||||||||||||||||\n", + "||||||||||||||||||||||||||||||||\n", + "|Feb|Mar|A|pr|May|June||July||Au||g S|ept||Oct||Nov||De||c J|an|Feb||Mar||Apr||M|ay||\n", + "|2022||||||||||||||||||||20|23||||||||||\n", + "||||||||||||||||||||||||||||||||\n", + "||Note: T These|here ar libraries|e several provide|popular pretrain|types o ed mod||f Python els and||librarie tools for||s that a buildin|re comm g, trainin||only us g and d||ed for L eploying||LMs. LLMs.|||||||||||||\n", + "\n", + "\n", + "\n", + "D t i t tl di i th l t k f D b d t lit\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Large language models are\n", + "\n", + "the “it” tool\n", + "\n", + "```\n", + "LLMs are currently one of the hottest and most-watched areas\n", + "in the field of NLP. 
LLMs have been instrumental in enabling\n", + "machines to understand, interpret and generate human language\n", + "in a way that was previously impossible, powering everything\n", + "from machine translation to content creation to virtual assistants\n", + "and chatbots.\n", + "\n", + "Transformer-related libraries have been growing in popularity\n", + "even before ChatGPT thrust LLMs into the public consciousness.\n", + "Within the last 6 months, our data shows two accelerating\n", + "trends: organizations are building their own LLMs, which models\n", + "like [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) show can be quite accessible and inexpensive. And,\n", + "they are using proprietary models like ChatGPT. Transformerrelated libraries, such as Hugging Face, which are used to train\n", + "LLMs, have the highest adoption within the Lakehouse.\n", + "\n", + "The second most popular type is SaaS LLMs, which are used\n", + "to access models like OpenAI. This category has grown\n", + "exponentially in parallel with the [launch of ChatGPT](https://openai.com/blog/chatgpt) : the\n", + "number of Lakehouse customers using SaaS LLMs has grown\n", + "\n", + "\n", + "Organizations can leverage LLMs either by\n", + "using SaaS LLM APIs to call services like\n", + "ChatGPT from OpenAI or by operating their\n", + "own LLMs in-house.\n", + "\n", + "Thinking of building your own modern LLM\n", + "application? This approach could entail\n", + "the use of specialized transformer-related\n", + "Python libraries to train the model, as well as\n", + "LLM tools like LangChain to develop prompt\n", + "interfaces or integrations to other systems.\n", + "```\n", + "LLM DEFINITIONS\n", + "\n", + "```\n", + "**◊** **Transformer-related libraries:**\n", + "Python libraries used to train LLMs\n", + "(example: Hugging Face)\n", + "\n", + "**◊** **SaaS LLM APIs:** Libraries used to access\n", + "LLMs as a service (example: OpenAI)\n", + "\n", + "**◊** **LLM tools:** Toolchains for working\n", + "with and building proprietary LLMs\n", + "(example: LangChain)\n", + "\n", + "\n", + "an impressive 1310% between the end of November 2022 and\n", + "the beginning of May 2023. (In contrast, transformer-related\n", + "libraries grew 82% in this same period.)\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + " ac e ea g e pe e a o a d p oduc o\n", + "take off across industries\n", + "\n", + "```\n", + "\n", + "The increasing demand for ML solutions and the growing\n", + "availability of technologies have led to a significant\n", + "increase in experimentation and production, two distinct\n", + "parts of the ML model lifecycle. We look at the _logging_ and\n", + "_registering_ of models in MLflow, an open source platform\n", + "developed by Databricks, to understand how ML is\n", + "trending and being adopted within organizations.\n", + "```\n", + " LOGGED MODELS AND\n", + "\n", + " ML EXPERIMENTATION\n", + "\n", + "```\n", + "During the experimentation phase of ML, data scientists\n", + "develop models designed to solve given tasks. After training\n", + "the models, they test them to evaluate their accuracy,\n", + "precision, recall (the percentage of correctly predicted\n", + "positive instances out of all actual positive instances), and\n", + "more. 
These metrics are logged (recorded) in order to analyze\n", + "the various models’ performance and identify which approach\n", + "works best for the given task.\n", + "\n", + "We have chosen logged models as a proxy to measure ML\n", + "experimentation because the MLflow Tracking Server is\n", + "\n", + "designed to facilitate experiment tracking and reproducibility.\n", + "\n", + "\n", + "MLflow Model Registry launched in May 2021. Overall, the\n", + "number of logged models has grown 54% since February\n", + "2022, while the number of registered models has grown\n", + "411% over the same period. This growth in volume suggests\n", + "organizations are understanding the value of investing in\n", + "and allocating more people power to ML.\n", + "```\n", + "REGISTERED MODELS AND ML PRODUCTION\n", + "\n", + "```\n", + "Production models have undergone the experimentation\n", + "phase and are then deployed in real-world applications. They\n", + "are typically used to make predictions or decisions based on\n", + "new data. Registering a model is the process of recording and\n", + "storing metadata about a trained model in a centralized location\n", + "that allows users to easily access and reuse existing models.\n", + "Registering models prior to production enables organizations to\n", + "ensure consistency and reliability in model deployment and scale.\n", + "\n", + "We have chosen registered models to represent ML production\n", + "because the MLflow Model Registry is designed to manage\n", + "models that have left the experimentation phase through the\n", + "\n", + "rest of their lifecycle.\n", + "\n", + "\n", + "-----\n", + "\n", + "g y yi p\n", + "\n", + "was registered. Recent advances in ML, such as improved\n", + "open source libraries like MLflow and Hugging Face, have\n", + "\n", + "radically simplified building and putting models into\n", + "production. The result is that 34% of logged models are\n", + "now candidates for production today, an improvement\n", + "from over 20% just a year ago.\n", + "\n", + "\n", + "before committing an ML model to production. We wanted\n", + "to understand, “How many models do data scientists\n", + "\n", + "experiment with before moving to production?”\n", + "\n", + "Our data shows the ratio of logged to registered models\n", + "is 2.9 : 1 as of January 2023. This means that for roughly\n", + "every three experimental models, one model will get\n", + "registered as a candidate for production. This ratio has\n", + "improved significantly from just a year prior, when we\n", + "\n", + "\n", + "|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|Col26|Col27|Col28|\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "||||||VS. 
S|||||||||||||||||||||||\n", + "|RA RE|TIO GIST|OF ERE|LOGG D MO|ED DEL||||||||||||||||||||||||\n", + "|||||||||||||||||||||||||||||\n", + "||||||Models|||||||||||||||||||||||\n", + "||||||ber of|||||||||||||||||||||||\n", + "||||||Num|||||||||||||||||||||||\n", + "|||||||||||||||||||||||||||||\n", + "|2.|9 :|1||||||||||||||||||||||||||\n", + "\n", + "```\n", + "Ratio of Logged to Registered\n", + "\n", + " Feb Mar Apr May June July Aug Sept Oct Nov Dec Jan\n", + "Models in Jan 2023 2023\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "The Modern Data\n", + "and AI Stack\n", + "\n", + "```\n", + "Over the last several years, the trend toward building\n", + "open, unified data architectures has played out in our\n", + "own data. We see that data leaders are opting to preserve\n", + "choice, leverage the best products and deliver innovation\n", + "across their organizations by democratizing access to\n", + "data for more people.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + " FASTEST-GROWING DATA AND AI PRODUCTS\n", + " dbt 206%\n", + "\n", + "```\n", + "```\n", + "Fivetran\n", + "Informatica\n", + "Qlik Data Integration\n", + "Esri\n", + "Looker\n", + "Hugging Face\n", + "\n", + "```\n", + "```\n", + " 181%\n", + " 174%\n", + " 152%\n", + " 145%\n", + " 141%\n", + "110%\n", + "\n", + "```\n", + "```\n", + "Lytics\n", + "Great Expectations\n", + "Kepler.gl\n", + "\n", + "```\n", + "```\n", + " 101%\n", + " 100%\n", + "95%\n", + "\n", + "```\n", + "```\n", + "0% 50% 100% 150% 200%\n", + " Year-Over-Year Growth by Number of Customers\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "DBT IS THE FASTEST-GROWING DATA\n", + "\n", + "AND AI PRODUCT OF 2023\n", + "\n", + "```\n", + "As companies move quickly to develop more advanced\n", + "use cases with their data, they are investing in newer\n", + "products that produce trusted data sets for reporting,\n", + "ML modeling and operational workflows. Hence, we see\n", + "the rapid rise of data integration products. dbt, a data\n", + "transformation tool, and Fivetran, which automates\n", + "data pipelines, are our two fastest-growing data and AI\n", + "products. This suggests a new era of the data integration\n", + "market with challenger tools making headway as\n", + "companies shift to prioritize DS/ML initiatives. With Great\n", + "Expectations from Superconductive in the ninth spot,\n", + "a full 50% of our fastest-growing products represent\n", + "the data integration category.\n", + "\n", + "\n", + "-----\n", + "\n", + "|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|\n", + "|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n", + "|GR|OWTH|OF|DAT|A A|ND A|I M|ARKE|TS||||||||||||\n", + "|||||||||||||||||||||\n", + "|||||||||||||||||Busi|ness I|ntelli|gence|\n", + "|||||||||||||||||Data & Se Data|Gover curity Scien|nance ce &||\n", + "|ers||||||||||||||||Mach Data|ine Le Integ|arning ration||\n", + "|Custom||||||||||||||||||||\n", + "|ber of||||||||||||||||||||\n", + "|Num||||||||||||||||||||\n", + "|||||||||||||||||||||\n", + "|||||||||||||||||||||\n", + "||Feb 2022|Mar|Apr|M|ay|June|July|Aug|Se|pt|Oct|Nov|Dec|Ja 20|n 23|||||\n", + "|||||||||||||||||||||\n", + "|||||||||||||||||||||\n", + "\n", + "\n", + "Note: In this chart, we count the number of customers deploying one or more data and AI products in each category. 
These four\n", + "categories do not encompass all products Databricks products such as Unity Catalog are not included in this data\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + " a a a d a e s bus ess e ge ce s\n", + "standard, organizations invest in their machine\n", + "learning foundation\n", + "\n", + "```\n", + "\n", + "To understand how organizations are prioritizing their data\n", + "initiatives, we aggregated all data and AI products on the\n", + "Databricks Lakehouse and categorized them into four\n", + "core markets: BI, data governance and security, DS/ML,\n", + "and data integration. Our data set confirms that BI tools\n", + "are more widely adopted across organizations relative to\n", + "more nascent categories — and they continue to grow,\n", + "with a 66% YoY increase in adoption. This aligns with the\n", + "broader trend of more organizations performing data\n", + "warehousing on a Lakehouse, covered in the next section,\n", + "Views from the Lakehouse.\n", + "\n", + "\n", + "While BI is often where organizations start their data\n", + "journey, companies are increasingly looking at more\n", + "advanced data and AI use cases.\n", + "```\n", + "DEMAND FOR DATA INTEGRATION PRODUCTS\n", + "\n", + "IS GROWING FAST\n", + "\n", + "```\n", + "We see the fastest growth in the data integration market.\n", + "These tools enable a company to integrate vast amounts\n", + "of upstream and downstream data in one consolidated\n", + "view. Data integration products ensure that all BI and DS/\n", + "ML initiatives are built on solid foundation.\n", + "\n", + "While it’s easier for smaller markets to experience\n", + "faster growth, at 117% YoY increased adoption, the data\n", + "integration market is growing substantially faster than BI.\n", + "This trend dovetails with the rapid growth of ML adoption\n", + "we see across the Lakehouse, covered in the DS/ML\n", + "section of the report.\n", + "\n", + "```\n", + "Data integration is the\n", + "fastest-growing market,\n", + "\n", + " with 117% YoY growth\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "Views from\n", + "the Lakehouse\n", + "MIGRATION AND DATA\n", + "\n", + "FORMAT TRENDS\n", + "\n", + "```\n", + "Data migration is a major undertaking: it can be risky,\n", + "expensive and delay companies’ timelines. It’s not a\n", + "task to jump into lightly. As organizations run into the\n", + "limitations, scalability challenges and the cost burden\n", + "of legacy data platforms, they are increasingly likely\n", + "to migrate to a new type of architecture.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "Migration trends:\n", + "\n", + "the best data warehouse\n", + "\n", + "is a Lakehouse\n", + "\n", + "```\n", + "The Lakehouse Platform is an attractive\n", + "alternative to traditional data warehouses\n", + "because it supports advanced use cases and\n", + "DS/ML, allowing organizations to boost their\n", + "overall data strategy. As evidenced by the most\n", + "popular data and AI products, with BI and data\n", + "integration tools at the top, organizations are\n", + "increasingly using the data lakehouse for data\n", + "warehousing. To better understand which legacy\n", + "platforms organizations are moving away from,\n", + "\n", + "we look at the migrations of new customers\n", + "to Databricks.\n", + "\n", + "An interesting takeaway is that roughly half of the\n", + "companies moving to the Lakehouse are coming\n", + "from data warehouses. 
This includes the 22% that are moving from cloud data warehouses. It also demonstrates a growing focus on running data warehousing workloads on a Lakehouse and unifying data platforms to reduce cost.

[Chart: Source of new customer migrations to Databricks — segments of 39%, 27%, 22% and 12%]

-----

Rising tides: the volume of data in Delta Lake has grown 304% YoY

As the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf), an increasingly large proportion is in the form of semi-structured and unstructured data. Previously, organizations had to manage multiple different platforms for their structured, unstructured and semi-structured data, which caused unnecessary complexity and high costs. The Lakehouse solves this problem by providing a unified platform for all data types and formats.

Delta Lake is the foundation of the Databricks Lakehouse. The Delta Lake format encompasses structured, unstructured and semi-structured data. Use has surged over the past 2 years. When compared to the steady, flat or declining growth in other storage formats (e.g., text, JSON and CSV), our data shows that a growing number of organizations are turning to Delta Lake to manage their data. In June 2022, Delta Lake surpassed Parquet as the most popular data lake source, reaching 304% YoY growth.

[Chart: Volume of data managed, by storage format, Jan 2019 to Jan 2023 — Delta, Parquet, Text, ORC, CSV, JSON, Avro]

-----

Data warehousing is growing, with emphasis on serverless

Over the past 2 years, companies have vastly increased their usage of data warehousing on the Lakehouse Platform. 
This is especially demonstrated by use of Databricks SQL — the serverless data warehouse on the Lakehouse — which shows 144% YoY growth. This suggests that organizations are increasingly ditching traditional data warehouses and are able to perform all their BI and analytics on a Lakehouse.

[Diagram labels: Data Warehouse, Lakehouse Platform]

[Chart: Data warehousing on Lakehouse with Databricks SQL — number of customers, Jan 2021 to Jan 2023. Note: There is a spike in October 2021 as a result of the ungated preview launch of Databricks SQL, followed by General Availability in December 2021. Data consistently dips in the last week of December due to seasonality.]

-----

CONCLUSION

Generation AI

We’re excited that companies are progressing into more advanced ML and AI use cases, and the modern data and AI stack is evolving to keep up. Along with the rapid growth of data integration tools (including our fastest growing, dbt), we’re seeing the rapid rise of NLP and LLM usage in our own data set, and there’s no doubt that the next few years will see an explosion in these technologies. It’s never been more clear: the companies that harness the power of DS/ML will lead the next generation of data.

-----

About Databricks

Databricks is the data and AI company. More than 9,000 organizations worldwide — including Comcast, Condé Nast, and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on Twitter, LinkedIn and Instagram.

[DISCOVER LAKEHOUSE](https://www.databricks.com/product/data-lakehouse)

© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation | Terms of Use

-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf2024-09-19T16:57:20Z
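The report above uses MLflow logged models as a proxy for ML experimentation and registered models as a proxy for production candidates. For readers unfamiliar with that distinction, here is a minimal, hypothetical MLflow sketch; the model, metric name and registry name are illustrative, not taken from the report.

```python
import mlflow
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Train a toy model on synthetic data.
X, y = make_classification(n_samples=200, n_features=4, random_state=42)
model = LogisticRegression().fit(X, y)

# Experimentation: log the trained model and its metrics to an MLflow run.
with mlflow.start_run() as run:
    mlflow.log_metric("train_accuracy", model.score(X, y))
    mlflow.sklearn.log_model(model, artifact_path="model")

# Production candidate: register the logged model in the MLflow Model Registry.
model_uri = f"runs:/{run.info.run_id}/model"
mlflow.register_model(model_uri, name="demo_classifier")
```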
**eBook**\n", + "\n", + "# Making Your Digital Twin Come to Life\n", + "\n", + "##### With the Lakehouse for Manufacturing and Tredence\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "Introduction ................................................................................................................................................................................................................ **03**\n", + "\n", + "Digital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\n", + "\n", + "What Are Digital Twins? ........................................................................................................................................................................................ **07**\n", + "\n", + "Digital Twin Architectures .................................................................................................................................................................................. **08**\n", + "\n", + "How to Build a Digital Twin ................................................................................................................................................................................ **09**\n", + "\n", + "Why Is Manufacturing Struggling With Data and AI? ............................................................................................................................ **12**\n", + "\n", + "Why Databricks for Digital Twins? ................................................................................................................................................................... **13**\n", + "\n", + "Why Tredence for Digital Twins? ...................................................................................................................................................................... **14**\n", + "\n", + "Using Digital Twins to Drive Insights .............................................................................................................................................................. **15**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "The concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\n", + "over 25 years ago, during the early phases of foundation and cofferdam construction for the\n", + "London Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\n", + "the years since this first application, edge computing, AI, data connectivity, 5G connectivity\n", + "and the improvements of the Internet of Things (IoT) have enabled digital twins to become\n", + "cost-effective and are now an imperative in today’s data-driven businesses.\n", + "\n", + "Today’s manufacturing industries are expected to streamline and optimize all the processes in their value\n", + "chain from product development and design, through operations and supply chain optimization to obtaining\n", + "feedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\n", + "and is addressing a multitude of challenges within manufacturing, logistics and transportation.\n", + "\n", + "\n", + "[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[“profit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[approximately 10 hours.”](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "\n", + "\n", + "**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 10%\n", + "\n", + "\n", + "**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 50%\n", + "\n", + "\n", + "**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "\n", + "**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 25%\n", + "\n", + "\n", + "**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Introduction (continued)**\n", + "\n", + "\n", + "**Digital twin market growth rate accelerates**\n", + "\n", + "Digital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\n", + "is forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\n", + "at a CAGR of 58%, riding on the wave of Industry 4.0.\n", + "\n", + "\n", + "**But challenges remain**\n", + "\n", + "The most common challenges faced by the manufacturing industry that digital\n", + "twins are addressing include:\n", + "\n", + "**•** Product designs are more complex, resulting in higher cost and increasingly\n", + "longer development times\n", + "\n", + "**•** The supply chain is opaque\n", + "\n", + "**•** Production lines are not optimized – performance variations, unknown defects\n", + "and the projection of operating cost is obscure\n", + "\n", + "**•** Poor quality management – overreliance on theory, managed by\n", + "individual departments\n", + "\n", + "**•** Reactive maintenance costs are too high, resulting in excessive downtime or\n", + "process disruptions\n", + "\n", + "**•** Incongruous collaborations between departments\n", + "\n", + "**•** Invisibility of customer demand for gathering real-time feedback\n", + "\n", + "\n", + "The growth rate for digital twins is staggering with common adoption reported\n", + "to be in the 25-40% CAGR growth rate.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twins Bring Broad Benefits to Manufacturing\n", + "\n", + "Industry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\n", + "would have come at significant costs without digital twin technology.\n", + "\n", + "**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**\n", + "\n", + "\n", + "\n", + "**•** Product design and development is performed with\n", + "less cost and is completed in less time as iterative\n", + "simulations, using multiple constraints, deliver the\n", + "best or most optimized design. All commercial\n", + "aircraft are designed using digital twins.\n", + "\n", + "**•** Digital twins provide the awareness of how long\n", + "inventory will last, when to replenish and how to\n", + "minimize the supply chain disruptions. The oil and gas\n", + "industry, for example, uses supply chain–oriented\n", + "digital twins to reduce supply chain bottlenecks in\n", + "storage and midstream delivery, schedule tanker\n", + "off-loads and model demand with externalities.\n", + "\n", + "\n", + "\n", + "**•** Continuous quality checks on produced items\n", + "with ML/AI generated feedback pre-emptively\n", + "assuring improved product quality. 
Final paint\n", + "inspection in the automotive industry, for example,\n", + "is performed with computer vision built on top of\n", + "digital twin technology.\n", + "\n", + "**•** Striking the sweet spot between replacing\n", + "a part before the process degrades or breaks\n", + "down and utilizing the components to their fullest,\n", + "digital twins provide manufacturers with real-time\n", + "feedback. Digital twins are the backbone of\n", + "building an asset performance management suite.\n", + "\n", + "\n", + "**•** Digital twins create the opportunity to have\n", + "multiple departments in sync by providing\n", + "necessary instructions modularly to attain\n", + "a required throughput. Digital twins are the\n", + "backbone of kaizen events that optimize\n", + "manufacturing process flow.\n", + "\n", + "**•** Customer feedback loops can be modeled through\n", + "inputs from point-of-sale customer behavior,\n", + "buying preferences or product performance and\n", + "then integrated into the product development\n", + "process, forming a closed loop that delivers an\n", + "improved product design.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\n", + "\n", + "The top four use cases are heavily focused on operational processes and are typically the first to be deployed\n", + "in manufacturing by a majority of companies. Those that have a lower adoption rate are more complex to\n", + "deploy, but typically offer higher and longer-lasting value.\n", + "\n", + "**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\n", + "\n", + "|Use Case|Deployment|\n", + "|---|---|\n", + "|Improve product quality|**34%**|\n", + "|Reduce manufacturing costs|**30%**|\n", + "|Reduce unplanned downtime|**28%**|\n", + "|Increase throughput|**25%**|\n", + "|Ensure safe manufacturing|**24%**|\n", + "|Test new design ideas|**16%**|\n", + "|Develop product enhancements|**14%**|\n", + "|Digital transformation of enterprise|**13%**|\n", + "|Speed new product introduction|**13%**|\n", + "|Reduce planned downtime|**11%**|\n", + "|Meet new regulatory challenges|**10%**|\n", + "|Training for new manufacturing processes|**8%**|\n", + "|Design changes to production line|**8%**|\n", + "|Provide service to end users/customers|**5%**|\n", + "|Update products in the field|**1%**|\n", + "\n", + "\n", + "Can you imagine the cost to change\n", + "an oil refinery’s crude distillation\n", + "unit process conditions to improve\n", + "the output of diesel one week\n", + "and gasoline the next to address\n", + "changes in demand and ensure\n", + "maximum economic value? Can you\n", + "imagine how to replicate even a\n", + "simple supply chain to model risk?\n", + "\n", + "\n", + "-----\n", + "\n", + "### What Are Digital Twins?\n", + "\n", + "\n", + "Knowing the business challenges and benefits digital twins deliver, let’s turn to\n", + "the basics and explore what digital twins are and how a modern data stack is\n", + "necessary to build effective and timely digital twins. 
The classic definition of a\n", + "digital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .”\n", + "\n", + "\n", + "For a discrete or continuous manufacturing process, a digital twin gathers system\n", + "and process state data with the help of various IoT sensors [operational\n", + "technology (OT) data] and enterprise data [information technology (IT)] to form a\n", + "virtual model, which is then used to run simulations, study performance issues and\n", + "generate possible insights.\n", + "\n", + "\n", + "**Types of Digital Twins**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twin Architectures\n", + "\n", + "Classic digital twins have been physics-based models of specific systems. More recently,\n", + "**data-driven digital twins, which work on real-time system data, are gaining prominence** .\n", + "\n", + "\n", + "These twins provide the opportunity to not just monitor and simulate system performance under specific\n", + "conditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\n", + "the industrial environment.\n", + "\n", + "Digital twins undergo a series of changes during their lifecycle to become completely autonomous.\n", + "\n", + "**Data-Driven Operational Digital Twins: Maturity Journey**\n", + "\n", + "Monitor & Alert: real-time operations monitoring and alerting\n", + "\n", + "Predict & Diagnose: predictive maintenance, process improvements and root causing\n", + "\n", + "Simulate & Optimize (AI): identify the next best action and integrate with actuation systems\n", + "\n", + "Foundation components: IoT, Edge/Cloud, Digital Twins, ERP\n", + "\n", + "\n", + "**[Digital twins have reduced automotive product design lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "\n", + "# 6-8 years to 18-24 months\n", + "\n", + "\n", + "**[Digital warehouse design lets companies test and learn using a digital twin, which can improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n", + "\n", + "# 20% to 25%\n", + "\n", + "\n", + "-----\n", + "\n", + "### How to Build a Digital Twin\n", + "\n", + "\n", + "A data architecture capability is needed to capture\n", + "and collect the ever-expanding volume and variety\n", + "of data streaming in real time from protocols\n", + "such as ABB Total Flow, Allen Bradley,\n", + "Emerson, Fanuc, GE, Hitachi and Mitsubishi.\n", + "\n", + "\n", + "Data collection, data analytics, application\n", + "enablement and data integration orchestrate the\n", + "time-series data stream and transfer it to the cloud.\n", + "Azure IoT Hub is used to securely ingest data from\n", + "edge to cloud.\n", + "\n", + "\n", + "Cloud infrastructure and analytics capabilities are\n", + "offered within the flexibility of the cloud. Azure\n", + "Digital Twin is used to model and visualize process\n", + "workflows. Databricks MLflow and Delta Lake scale to\n", + "deliver real-time predictive analytics.\n", + "\n", + "\n", + "-----\n", + "\n", + "**How to Build a Digital Twin (continued)**\n", + "\n", + "**Digital Twins: Technical Architecture**\n", + "\n", + "\n", + "-----\n", + "\n", + "**How to Build a Digital Twin (continued)**\n", + "\n", + "**Building a digital twin doesn’t have to be a daunting task. 
Below are some simple steps:**\n", + "\n", + "\n", + "**System and use case discovery**\n", + "**and blueprinting**\n", + "\n", + "**•** Identify priority plant processes and systems\n", + "to model, with focused use cases (e.g., asset\n", + "maintenance, energy management, process\n", + "monitoring/optimization, etc.)\n", + "\n", + "**•** Develop a validated process outline, blueprint and\n", + "key performance indicators\n", + "\n", + "**•** Develop a set of process variables, control\n", + "variables and manipulated variables\n", + "\n", + "**•** Design control loop\n", + "\n", + "**•** Validate and document process and asset FMEA\n", + "for all assets and sub-systems\n", + "\n", + "\n", + "**Technology infrastructure requirements**\n", + "\n", + "**•** Technical edge infrastructure onsite — to sense,\n", + "collect and transmit real-time information\n", + "\n", + "**•** Clean, reliable data availability in the cloud\n", + "\n", + "**•** Data processing and analytics platform — to\n", + "design, develop and implement solutions\n", + "\n", + "**•** Stream processing and deployment of models for\n", + "predictions and soft sensing\n", + "\n", + "**•** Edge platform to orchestrate the data, insights and\n", + "actions between the cloud and site IT systems\n", + "\n", + "**•** Cloud to edge integration — to enable seamless\n", + "monitoring, alerting and integration with plant\n", + "OT/IT systems\n", + "\n", + "\n", + "**Visualization delivered**\n", + "\n", + "**•** Information communication — visual\n", + "representation of digital twin along with remote\n", + "controlling functions (e.g., Power BI dashboards,\n", + "time series insights, web app-based digital\n", + "twin portals)\n", + "\n", + "**•** Closed-loop feedback — to send the insights and\n", + "actions back to form a closed loop — Azure Event\n", + "Grid and Event Hub with connection from IoT Hub to\n", + "Azure IoT edge devices and control systems is used\n", + "\n", + "\n", + "-----\n", + "\n", + "### Why Is Manufacturing Struggling With Data and AI?\n", + "\n", + "|Challenge|Root Cause|Goal|\n", + "|---|---|---|\n", + "|**Siloed data across the value chain**|Siloed data from systems designed for on-premises 30 years ago|Aggregate high volumes and velocities of structured and unstructured data to power predictive analytics (e.g., images, IoT, ERP/SCM)|\n", + "|**Unable to scale enterprise data sets**|Legacy architectures such as data historians that can’t handle semi-structured or unstructured data|Data architectures that scale for TBs/PBs of enterprise IT and OT data|\n", + "|**Lack real-time insights**|Batch-oriented data transfer|Address manufacturing issues or track granular supply chain issues in the real world|\n", + "|**Can’t meet intellectual property requirements**|Systems that do not establish data lineage|Data lineage established across organizational silos and disjointed workflows|\n", + "\n", + "\n", + "### 
Data architecture is the root cause of this struggle.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Why Databricks for Digital Twins?\n", + "\n", + "\n", + "Lakehouse for Manufacturing’s simple, open and collaborative data platform consolidates and enhances data\n", + "from across the organization and turns it into accessible, actionable insights. Scalable machine learning powers\n", + "digital twins with predictive insights across the value chain from product development to optimizing operations\n", + "to building agile supply chains to robust customer insights.\n", + "\n", + "\n", + "Databricks open Lakehouse\n", + "\n", + "Platform has shown time and\n", + "\n", + "again that it is the foundational\n", + "\n", + "enabling technology to power\n", + "\n", + "digital twins for manufacturing. But\n", + "\n", + "the real power is the Databricks\n", + "\n", + "partnership with Tredence that\n", + "\n", + "speeds implementation for\n", + "\n", + "tailored use cases that deliver\n", + "\n", + "superior ROI in less time.”\n", + "\n", + "**Dr. Bala Amavasai** ,\n", + "\n", + "Manufacturing CTO, Databricks\n", + "\n", + "\n", + "**Supports Real-Time**\n", + "**Decisions**\n", + "\n", + "Lakehouse for Manufacturing\n", + "leverages any enterprise data\n", + "source — from business critical\n", + "ERP data to edge sensor data in\n", + "one integrated platform, making it\n", + "easy to automate and secure data\n", + "with fast, real-time performance.\n", + "\n", + "\n", + "**Faster and More**\n", + "**Accurate Analysis**\n", + "\n", + "The true benefits of digital twins\n", + "are not the business intelligence\n", + "dashboards, but machine\n", + "learning insights generated\n", + "from incorporating real-time\n", + "data. Scalable and shareable\n", + "notebook-based machine learning\n", + "accelerates ROI.\n", + "\n", + "\n", + "**Open Data Sharing**\n", + "**and Collaboration**\n", + "\n", + "Drive stronger customer insights\n", + "and greater service with partners\n", + "leveraging open and secure\n", + "data collaboration between\n", + "departments or your supply chain\n", + "delivering faster ROI.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Why Tredence for Digital Twins?\n", + "\n", + "\n", + "Over the last few years, Tredence’s unique Manufacturing and Supply Chain practice has coupled functional\n", + "expertise with cutting-edge AI-driven solutions to create measurable business impact for their customers.\n", + "Now, Tredence’s partnership with Databricks is all set to unlock the power of real-time analytics and actions, to\n", + "further strengthen their ‘’last mile impact’’ vision.\n", + "\n", + "\n", + "Tredence is excited to\n", + "\n", + "co-innovate with Databricks to\n", + "\n", + "deliver the solutions required for\n", + "\n", + "enterprises to create digital twins\n", + "\n", + "from the ground up and implement\n", + "\n", + "them swiftly to maximize their ROI.\n", + "\n", + "Our partnership enables clients to\n", + "\n", + "get the most out of Tredence’s data\n", + "\n", + "science capabilities to build decision\n", + "\n", + "intelligence around manufacturing\n", + "\n", + "processes and Databricks’\n", + "\n", + "Lakehouse Platform to realize the full\n", + "\n", + "promise of digital twins.”\n", + "\n", + "**Naresh Agarwal** ,\n", + "\n", + "Head of Industrials, Tredence\n", + "\n", + "\n", + "**Global Reach**\n", + "\n", + "Tredence offers a global team with\n", + "the subject matter expertise that\n", + "delivers practitioner and useroriented solutions to identify\n", + 
"and solve for challenges in\n", + "digital transformation design\n", + "and implementation.\n", + "\n", + "\n", + "**Purpose-Built Solutions**\n", + "\n", + "Adopt contextual edge to cloud,\n", + "purpose-built AIoT solutions\n", + "that unify your ecosystems with\n", + "connected insights and enhance\n", + "productivity, while enabling\n", + "efficient cost structures.\n", + "\n", + "\n", + "**Focused Dedication**\n", + "\n", + "A dedicated centre of excellence\n", + "(CoE) for AIoT and smart\n", + "manufacturing solutions —\n", + "serving the entire manufacturing\n", + "value chain from product\n", + "development to manufacturing and\n", + "downstream operations.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Using Digital Twins to Drive Insights\n", + "\n", + "\n", + "**Use Case**\n", + "\n", + "**Predictive Maintenance**\n", + "\n", + "- \u0007Rolls-Royce sought to use real-time\n", + "engine data to reduce unplanned\n", + "maintenance and downtime\n", + "\n", + "- \u0007Legacy systems were unable to\n", + "scale data ingestion of engine\n", + "sensor data in real time for ML\n", + "\n", + "**Impact**\n", + "\n", + "\n", + "**Why Databricks?**\n", + "\n", + "- \u0007The Lakehouse Platform on Azure unifies in-flight data\n", + "streams with external environmental conditions data to\n", + "predict engine performance issues\n", + "\n", + "- \u0007Delta Lake underpins ETL pipelines that feed ML workloads\n", + "across use cases\n", + "\n", + "- \u0007MLflow speeds deployment of new models and reduces\n", + "incidents of grounded planes\n", + "\n", + "\n", + "Rolls-Royce uses Databricks\n", + "to drive insights around predictive\n", + "maintenance, improving\n", + "airframe reliability and reducing\n", + "carbon emissions.\n", + "\n", + "\n", + "#### 22 million tons\n", + "of carbon emissions saved\n", + "\n", + "\n", + "#### 5% reduction\n", + "in unplanned airplane groundings\n", + "\n", + "\n", + "#### Millions of pounds\n", + "in inventory cost savings from a 50%\n", + "improvement in maintenance efficiency\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé\n", + "\n", + "Nast, Acosta and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data,\n", + "\n", + "analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the\n", + "\n", + "original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n", + "\n", + "###### Get started with a free trial of Databricks and start building data applications today\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n", + "\n", + "To learn more, visit us at:\n", + "\n", + "**[databricks.com/manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf2024-09-19T16:57:22Z
### EBOOK\n", + "\n", + "# A Compact Guide to Large Language Models\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 1\n", + "## Introduction\n", + "\n", + "##### Definition of large language models (LLMs)\n", + "\n", + "Large language models are AI systems that are designed to process and analyze\n", + "vast amounts of natural language data and then use that information to generate\n", + "responses to user prompts. These systems are trained on massive data sets\n", + "using advanced machine learning algorithms to learn the patterns and structures\n", + "of human language, and are capable of generating natural language responses to\n", + "a wide range of written inputs. Large language models are becoming increasingly\n", + "important in a variety of applications such as natural language processing,\n", + "machine translation, code and text generation, and more.\n", + "\n", + "While this guide will focus on language models, it’s important to understand that\n", + "they are only one aspect under a larger generative AI umbrella. Other noteworthy\n", + "generative AI implementations include projects such as art generation from text,\n", + "audio and video generation, and certainly more to come in the near future.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Extremely brief historical background and development of LLMs\n", + "\n", + "\n", + "###### 1950s–1990s\n", + "Initial attempts are made to map hard rules around languages and\n", + "follow logical steps to accomplish tasks like translating a sentence\n", + "from one language to another.\n", + "\n", + "While this works sometimes, strictly defined rules only work for\n", + "concrete, well-defined tasks that the system has knowledge about.\n", + "\n", + "###### 1990s \n", + "Language models begin evolving into statistical models and\n", + "language patterns start being analyzed, but larger-scale projects\n", + "are limited by computing power.\n", + "\n", + "###### 2000s \n", + "Advancements in machine learning increase the complexity of\n", + "language models, and the wide adoption of the internet sees an\n", + "\n", + "enormous increase in available training data.\n", + "\n", + "###### 2012 \n", + "Advancements in deep learning architectures and larger data sets\n", + "lead to the development of GPT (Generative Pre-trained Transformer).\n", + "\n", + "\n", + "###### 2018\n", + "Google introduces BERT (Bidirectional Encoder Representations\n", + "from Transformers), which is a big leap in architecture and paves\n", + "the way for future large language models.\n", + "\n", + "###### 2020\n", + "OpenAI releases GPT-3, which becomes the largest model at\n", + "175B parameters and sets a new performance benchmark for\n", + "language-related tasks.\n", + "\n", + "###### 2022\n", + "ChatGPT is launched, which turns GPT-3 and similar models into\n", + "a service that is widely accessible to users through a web interface\n", + "and kicks off a huge increase in public awareness of LLMs and\n", + "generative AI.\n", + "\n", + "###### 2023\n", + "Open source LLMs begin showing increasingly impressive results\n", + "with releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna.\n", + "GPT-4 is also released, setting a new benchmark for both parameter\n", + "size and performance.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2\n", + "## Understanding Large Language Models\n", + "\n", + "\n", + "##### What are language models and how do they work?\n", + "\n", + "Large language models are advanced artificial intelligence systems that take\n", + 
"some input and generate humanlike text as a response. They work by first\n", + "analyzing vast amounts of data and creating an internal structure that models\n", + "the natural language data sets that they’re trained on. Once this internal\n", + "structure has been developed, the models can then take input in the form of\n", + "natural language and approximate a good response.\n", + "\n", + "##### If they’ve been around for so many years, why are they just now making headlines?\n", + "\n", + "A few recent advancements have really brought the spotlight to generative AI\n", + "and large language models:\n", + "\n", + "**A D VA N C E M E N T S I N T E C H N I Q U E S**\n", + "Over the past few years, there have been significant advancements in the\n", + "techniques used to train these models, resulting in big leaps in performance.\n", + "Notably, one of the largest jumps in performance has come from integrating\n", + "human feedback directly into the training process.\n", + "\n", + "\n", + "**I N C R E A S E D A C C E S S I B I L I T Y**\n", + "The release of ChatGPT opened the door for anyone with internet access\n", + "to interact with one of the most advanced LLMs through a simple web\n", + "interface. This brought the impressive advancements of LLMs into the\n", + "spotlight, since previously these more powerful LLMs were only available\n", + "to researchers with large amounts of resources and those with very deep\n", + "technical knowledge.\n", + "\n", + "**G R O W I N G C O M P U TAT I O N A L P O W E R**\n", + "The availability of more powerful computing resources, such as graphics\n", + "processing units (GPUs), and better data processing techniques allowed\n", + "researchers to train much larger models, improving the performance of\n", + "these language models.\n", + "\n", + "**I M P R O V E D T R A I N I N G D ATA**\n", + "As we get better at collecting and analyzing large amounts of data, the\n", + "\n", + "model performance has improved dramatically. In fact, Databricks showed\n", + "that you can get amazing results training a relatively small model with a\n", + "high-quality data set with [Dolly 2.0](https://huggingface.co/databricks/dolly-v2-12b) (and we released the data set as well\n", + "with the databricks-dolly-15k [data set](http://databricks/databricks-dolly-15k) ).\n", + "\n", + "\n", + "-----\n", + "\n", + "##### So what are organizations using large language models for?\n", + "\n", + "Here are just a few examples of common use cases for large language models:\n", + "\n", + "**C H AT B O T S A N D V I R T U A L A S S I S TA N T S**\n", + "One of the most common implementations, LLMs can be used by\n", + "organizations to provide help with things like customer support,\n", + "troubleshooting, or even having open-ended conversations with userprovided prompts.\n", + "\n", + "**C O D E G E N E R AT I O N A N D D E B U G G I N G**\n", + "LLMs can be trained on large amounts of code examples and give\n", + "useful code snippets as a response to a request written in natural language.\n", + "With the proper techniques, LLMs can also be built in a way to reference\n", + "other relevant data that it may not have been trained with, such as a\n", + "company’s documentation, to help provide more accurate responses.\n", + "\n", + "**S E N T I M E N T A N A LY S I S**\n", + "Often a hard task to quantify, LLMs can help take a piece of text and gauge\n", + "emotion and opinions. 
This can help organizations gather the data and\n", + "\n", + "feedback needed to improve customer satisfaction.\n", + "\n", + "\n", + "**L A N G U A G E T R A N S L AT I O N**\n", + "Globalize all your content without hours of painstaking work by simply\n", + "feeding your web pages through the proper LLMs and translating them to\n", + "different languages. As more LLMs are trained in other languages, quality\n", + "and availability will continue to improve.\n", + "\n", + "**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\n", + "Entire customer calls or meetings could be efficiently summarized so that\n", + "others can more easily digest the content. LLMs can take large amounts of\n", + "text and boil it down to just the most important bytes.\n", + "\n", + "**C O N T E N T G E N E R AT I O N**\n", + "Start with a detailed prompt and have an LLM develop an outline for you.\n", + "Then continue on with those prompts and LLMs can generate a good first\n", + "draft for you to build off. Use them to brainstorm ideas, and ask the LLM\n", + "questions to help you draw inspiration from.\n", + "\n", + "**_Note:_** Most LLMs are _not_ trained to be fact machines. They know how to use\n", + "language, but they might not know who won the big sporting event last year.\n", + "It’s always important to fact check and understand the responses before\n", + "\n", + "using them as a reference.\n", + "\n", + "\n", + "**T E X T C L A S S I F I C AT I O N A N D C L U S T E R I N G**\n", + "The ability to categorize and sort large volumes of data enables the\n", + "identification of common themes and trends, supporting informed\n", + "decision-making and more targeted strategies.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 3\n", + "## Applying Large Language Models\n", + "\n", + "\n", + "There are a few paths that one can take when looking to apply large language\n", + "models for their given use case. Generally speaking, you can break them down\n", + "into two categories, but there’s some crossover between each. We’ll briefly cover\n", + "the pros and cons of each and what scenarios fit best for each.\n", + "\n", + "##### Proprietary services\n", + "\n", + "As the first widely available LLM powered service, OpenAI’s ChatGPT was the\n", + "explosive charge that brought LLMs into the mainstream. ChatGPT provides\n", + "a nice user interface (or API) where users can feed prompts to one of many\n", + "models (GPT-3.5, GPT-4, and more) and typically get a fast response. These are\n", + "among the highest-performing models, trained on enormous data sets, and are\n", + "capable of extremely complex tasks both from a technical standpoint, such as\n", + "code generation, as well as from a creative perspective like writing poetry in a\n", + "specific style.\n", + "\n", + "The downside of these services is the absolutely enormous amount of compute\n", + "required not only to train them (OpenAI has said GPT-4 cost them over $100\n", + "million to develop) but also to serve the responses. For this reason, these\n", + "extremely large models will likely always be under the control of organizations,\n", + "\n", + "\n", + "and require you to send your data to their servers in order to interact with their\n", + "language models. This raises privacy and security concerns, and also subjects\n", + "users to “black box” models, whose training and guardrails they have no control\n", + "over. 
Also, due to the compute required, these services are not free beyond a\n", + "very limited use, so cost becomes a factor in applying these at scale.\n", + "\n", + "In summary: Proprietary services are great to use if you have very complex tasks,\n", + "are okay with sharing your data with a third party, and are prepared to incur\n", + "costs if operating at any significant scale.\n", + "\n", + "##### Open source models\n", + "\n", + "The other avenue for language models is to go to the open source community,\n", + "where there has been similarly explosive growth over the past few years.\n", + "Communities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n", + "\n", + "from contributors that can help solve tons of specific use cases such as text\n", + "generation, summarization and classification. The open source community has\n", + "been quickly catching up to the performance of the proprietary models, but\n", + "ultimately still hasn’t matched the performance of something like GPT-4.\n", + "\n", + "\n", + "-----\n", + "\n", + "It does currently take a little bit more work to grab an open source model and\n", + "start using it, but progress is moving very quickly to make them more accessible\n", + "to users. On Databricks, for example, we’ve made [improvements to open source](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html)\n", + "[frameworks](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html) like MLflow to make it very easy for someone with a bit of Python\n", + "experience to pull any Hugging Face transformer model and use it as a Python\n", + "object. Oftentimes, you can find an open source model that solves your specific\n", + "problem that is **orders of magnitude** smaller than ChatGPT, allowing you to bring\n", + "the model into your environment and host it yourself. This means that you can\n", + "keep the data in your control for privacy and governance concerns as well as\n", + "manage your costs.\n", + "\n", + "\n", + "##### Conclusion and general guidelines\n", + "\n", + "Ultimately, every organization is going to have unique challenges to overcome,\n", + "and there isn’t a one-size-fits-all approach when it comes to LLMs. As the world\n", + "becomes more data driven, everything, including LLMs, will be reliant on having\n", + "a strong foundation of data. LLMs are incredible tools, but they have to be used\n", + "and implemented on top of this strong data foundation. Databricks brings both\n", + "that strong data foundation as well as the integrated tools to let you use and\n", + "fine-tune LLMs in your domain.\n", + "\n", + "\n", + "Another huge upside to using open source models is the ability to fine-tune\n", + "them to your own data. Since you’re not dealing with a black box of a proprietary\n", + "service, there are techniques that let you take open source models and train\n", + "them to your specific data, greatly improving their performance on your\n", + "specific domain. We believe the future of language models is going to move\n", + "in this direction, as more and more organizations will want full control and\n", + "understanding of their LLMs.\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 4\n", + "## So What Do I Do Next If I Want to Start Using LLMs?\n", + "\n", + "\n", + "That depends where you are on your journey! 
Fortunately, we have a few paths\n", + "for you.\n", + "\n", + "If you want to go a little deeper into LLMs but aren’t quite ready to do it yourself,\n", + "you can watch one of Databricks’ most talented developers and speakers go\n", + "over these concepts in more detail during the on-demand talk “ [How to Build](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n", + "[Your Own Large Language Model Like Dolly.](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly) ”\n", + "\n", + "If you’re ready to dive a little deeper and expand your education and\n", + "understanding of LLM foundations, we’d recommend checking out our\n", + "[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\n", + "and dive into the theory behind foundation models.\n", + "\n", + "If your hands are already shaking with excitement and you already have some\n", + "working knowledge of Python and Databricks, we’ll provide some great examples\n", + "with sample code that can get you up and running with LLMs right away!\n", + "\n", + "\n", + "###### Getting started with NLP using Hugging Face transformers pipelines\n", + "\n", + " Fine-Tuning Large Language Models with Hugging Face and DeepSpeed\n", + "\n", + " Introducing AI Functions: Integrating Large Language Models with Databricks SQL\n", + "\n", + "\n", + "-----\n", + "\n", + "## About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 9,000\n", + "\n", + "organizations worldwide — including Comcast, Condé Nast and\n", + "\n", + "over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n", + "\n", + "Platform to unify their data, analytics and AI. Databricks is\n", + "\n", + "headquartered in San Francisco, with offices around the globe.\n", + "\n", + "Founded by the original creators of Apache Spark™, Delta Lake\n", + "\n", + "and MLflow, Databricks is on a mission to help data teams solve\n", + "\n", + "the world’s toughest problems. To learn more, follow Databricks on\n", + "\n", + "[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n", + "\n", + "**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n", + "\n", + "#### Contact us for a personalized demo: databricks.com/contact\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf2024-09-19T16:57:20Z
# Building Reliable Data Lakes at Scale With Delta Lake\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "#### Data Engineering Drivers 2\n", + "\n", + " Data Pipeline Key Goals 4\n", + "\n", + " Apache Spark™: The First Unified Analytics Engine 5\n", + "\n", + " Data Reliability Challenges With Data Lakes 6\n", + "\n", + " Delta Lake: A New Storage Layer 7\n", + "\n", + " Delta Lake: Key Features 8\n", + "\n", + " Getting Started With Delta Lake 10\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "#### Data Engineering Drivers\n", + "\n", + "Data engineering professionals are needing to respond to several different drivers.\n", + "\n", + "Chief among the drivers they face are:\n", + "\n", + "**Rise of Advanced Analytics** — Advanced analytics, including methods\n", + "\n", + "based on machine learning techniques, have evolved to such a degree that\n", + "\n", + "organizations seek to derive far more value from their corporate assets.\n", + "\n", + "**Widespread Adoption** — Once the province of leading edge, high-tech\n", + "\n", + "companies, these advanced approaches are being adopted across a\n", + "\n", + "multitude of industries from retail to hospitality to healthcare and across\n", + "\n", + "private as well as public sector organizations. This is further driving the need\n", + "\n", + "for strong data engineering practices.\n", + "\n", + "**Regulation** — With the growth of data generation and data collection,\n", + "\n", + "there is increased interest in how the data is protected and managed.\n", + "\n", + "Regulatory regimes such as GDPR (General Data Protection Regulation)\n", + "\n", + "from the EU and other jurisdictions mandate very specific ways in which\n", + "\n", + "data must be managed.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "**Technology Innovation** — The move to cloud-based analytics architectures\n", + "\n", + "that is now well underway is being propelled further by innovations such as\n", + "\n", + "analytics-focused chipsets, pipeline automation and the unification of data\n", + "\n", + "and machine learning. All these offer data professionals new approaches for\n", + "\n", + "their data initiatives.\n", + "\n", + "**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n", + "\n", + "also subject to increasing scrutiny. There is also a greater understanding of\n", + "\n", + "data as a valuable asset. Deriving value from data must be done in a manner\n", + "\n", + "that is financially responsible and actually value adding to the enterprise and\n", + "\n", + "meeting ROI hurdles.\n", + "\n", + "**Role Evolution** — Reflecting the importance of managing the data and\n", + "\n", + "maximizing value extraction, the Chief Data Officer (CDO) role is becoming\n", + "\n", + "more prominent and newer roles such as Data Curator are emerging.\n", + "\n", + "They must balance the needs of governance, security and democratization.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Key Goals\n", + "\n", + "#### Data Pipeline Key Goals\n", + "\n", + "Making quality data available in a reliable manner is a major determinant of success for data\n", + "\n", + "analytics initiatives be they regular dashboards or reports, or advanced analytics projects\n", + "\n", + "drawing on state-of-the-art machine learning techniques. 
Data engineers tasked with this\n", + "\n", + "responsibility need to take account of a broad set of dependencies and requirements as they\n", + "\n", + "design and build their data pipelines.\n", + "\n", + "Three primary goals that data engineers typically seek to address as they work to enable the\n", + "\n", + "analytics professionals in their organizations are:\n", + "\n", + "**Deliver quality data in less time** — When it comes to data, quality and timeliness\n", + "\n", + "are key. Data with gaps or errors (which can arise for many reasons) is\n", + "\n", + "“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n", + "\n", + "users. Equally well, many applications require up-to-date information (who\n", + "\n", + "wants to use last night’s closing stock price or weather forecast) and are of\n", + "\n", + "limited value without it.\n", + "\n", + "**Enable faster queries** — Wanting fast responses to queries is natural enough\n", + "\n", + "in today’s “New York minute,” online world. Achieving this is particularly\n", + "\n", + "demanding when the queries are based on very large data sets.\n", + "\n", + "**Simplify data engineering at scale** — It is one thing to have high reliability and\n", + "\n", + "performance in a limited, development or test environment. What matters\n", + "\n", + "more is the ability to have robust, production data pipelines at scale without\n", + "\n", + "requiring high operational overhead.\n", + "\n", + "\n", + "-----\n", + "\n", + "### ™\n", + "## Apache Spark\n", + "\n", + "#### Apache Spark ™ : The First Unified Analytics Engine\n", + "\n", + "Originally developed at UC Berkeley in 2009, Apache Spark can be\n", + "\n", + "considered the first unified analytics engine. Uniquely bringing data\n", + "\n", + "\n", + "and AI technologies together, Spark comes packaged with higher-level\n", + "\n", + "libraries, including support for SQL queries, streaming data, machine\n", + "\n", + "learning and graph processing. These standard libraries increase\n", + "\n", + "developer productivity and can be seamlessly combined to create\n", + "\n", + "\n", + "Customer\n", + "Data\n", + "\n", + "Emails/\n", + "Web Pages\n", + "\n", + "\n", + "Click\n", + "Streams\n", + "\n", + "Video/\n", + "Speech\n", + "\n", + "...\n", + "\n", + "Sensor\n", + "Data (IoT)\n", + "\n", + "\n", + "complex workflows.\n", + "\n", + "\n", + "#### Big Data Processing\n", + "\n", + "\n", + "#### Machine Learning\n", + "\n", + "\n", + "Since its release, Apache Spark, has seen rapid adoption by\n", + "\n", + "enterprises across a wide range of industries. 
Internet powerhouses\n", + "\n", + "\n", + "ETL + SQL + Streaming MLlib + SparkR\n", + "\n", + "\n", + "such as Netflix, Yahoo and eBay have deployed Spark at massive scale,\n", + "\n", + "\n", + "collectively processing multiple petabytes of data on clusters of over\n", + "\n", + "8,000 nodes making it the de facto choice for new analytics initiatives.\n", + "\n", + "It has quickly become the largest open source community in big data,\n", + "\n", + "with over 1000 contributors from 250+ organizations.\n", + "\n", + "\n", + "##### While Spark has had a significant impact in taking data analytics to the next level, practitioners continue to face data reliability and performance challenges with their data lakes.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Data Reliability Challenges With Data Lakes\n", + "\n", + "\n", + "**Failed Writes** — If a production job that is writing data experiences failures which\n", + "\n", + "are inevitable in large distributed environments, it can result in data corruption\n", + "\n", + "through partial or multiple writes. What is needed is a mechanism that is able to\n", + "\n", + "ensure that either a write takes place completely or not at all (and not multiple times,\n", + "\n", + "adding spurious data). Failed jobs can impose a considerable burden to recover\n", + "\n", + "to a clean state.\n", + "\n", + "\n", + "**Schema Mismatch** — When ingesting content from multiple sources, typical of\n", + "\n", + "large, modern big data environments, it can be difficult to ensure that the same\n", + "\n", + "data is encoded in the same way i.e., the schema matches. A similar challenge\n", + "\n", + "arises when the formats for data elements are changed without informing the\n", + "\n", + "data engineering team. Both can result in low quality, inconsistent data that\n", + "\n", + "requires cleaning up to improve its usability. The ability to observe and enforce\n", + "\n", + "schema would serve to mitigate this.\n", + "\n", + "\n", + "**Lack of Consistency** — In a complex big data environment, one may be interested\n", + "\n", + "in considering a mix of both batch and streaming data. Trying to read data while\n", + "\n", + "it is being appended to provides a challenge since on the one hand there is a\n", + "\n", + "desire to keep ingesting new data while on the other hand anyone reading the\n", + "\n", + "data prefers a consistent view. This is especially an issue when there are multiple\n", + "\n", + "readers and writers at work. It is undesirable and impractical, of course, to\n", + "\n", + "stop read access while writes complete or stop write access while reads are\n", + "\n", + "in progress.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Delta Lake: A New Storage Layer\n", + "\n", + "[Delta Lake](https://delta.io/) is an open source storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable metadata handling, and unifies\n", + "\n", + "streaming and batch data processing. Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs. Raw data is ingested\n", + "\n", + "from various batch and streaming input sources. Simple, reliable data pipelines help create a curated data lake containing tables of differing degrees of\n", + "\n", + "refinement based on business needs. 
The data in these tables is then made available via the standard Spark APIs or special connectors for various use cases\n", + "\n", + "such as machine learning, SQL analytics or feeding to a data warehouse.\n", + "\n", + "Streaming\n", + "\n", + "###### Analytics and Machine Learning\n", + "\n", + "\n", + "Batch\n", + "\n", + "\n", + "Ingestion Tables Refined Tables\n", + "(Bronze) (Silver)\n", + "\n", + "\n", + "Feature/Agg Data Store\n", + "(Gold)\n", + "\n", + "\n", + "###### Your Existing Data Lake\n", + "\n", + "\n", + "-----\n", + "\n", + "## Delta Lake: Key Features\n", + "\n", + "\n", + "**ACID Transactions —** Data lakes typically have multiple data pipelines reading\n", + "\n", + "and writing data concurrently, and data engineers have to go through a tedious\n", + "\n", + "process to ensure data integrity, due to the lack of transactions. Delta Lake\n", + "\n", + "brings ACID transactions to your data lakes. It provides serializability, the\n", + "\n", + "\n", + "**Scalable Metadata Handling —** In big data, even the metadata itself can be “big\n", + "\n", + "data.” Delta Lake treats metadata just like data, leveraging Spark’s distributed\n", + "\n", + "processing power to handle all its metadata. As a result, Delta Lake can handle\n", + "\n", + "petabyte-scale tables with billions of partitions and files at ease.\n", + "\n", + "\n", + "strongest level of isolation level.\n", + "\n", + "\n", + "**Time Travel (data versioning) —** Delta Lake provides snapshots of data enabling\n", + "\n", + "developers to access and revert to earlier versions of data for audits, rollbacks or\n", + "\n", + "to reproduce experiments. For further details, please see this [documentation](https://www.google.com/url?q=https://docs.delta.io/latest/delta-batch.html%23-deltatimetravel&sa=D&source=editors&ust=1666305658154469&usg=AOvVaw0Zh1svr9wsqkIDKGQTgtLh) .\n", + "\n", + "\n", + "**Schema Enforcement —** Delta Lake provides the ability to specify your schema\n", + "\n", + "and enforce it. This helps ensure that the data types are correct and required\n", + "\n", + "columns are present, preventing bad data from causing data corruption.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Delta Lake: Key Features\n", + "\n", + "Parquet\n", + "\n", + "\n", + "**Open Format —** All data in Delta Lake is stored in Apache Parquet format,\n", + "\n", + "enabling Delta Lake to leverage the efficient compression and encoding schemes\n", + "\n", + "that are native to Parquet.\n", + "\n", + "**Unified Batch and Streaming Source and Sink** — A table in Delta Lake is both a\n", + "\n", + "batch table, as well as a streaming source and sink. Streaming data ingest, batch\n", + "\n", + "historic backfill, and interactive queries all just work out of the box.\n", + "\n", + "\n", + "**Schema Evolution —** Big data is continuously changing. Delta Lake\n", + "\n", + "enables you to make changes to a table schema that can be applied\n", + "\n", + "automatically, without the need for cumbersome DDL.\n", + "\n", + "**100% Compatible With Apache Spark API —** Developers can use Delta\n", + "\n", + "Lake with their existing data pipelines with minimal change as it is fully\n", + "\n", + "compatible with Spark, the commonly used big data processing engine.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Getting Started With Delta Lake\n", + "\n", + "**Getting started with Delta Lake is easy. 
Specifically, to create a Delta table, simply specify Delta instead of Parquet.**\n", + "\n", + "\n", + "#### Instead of parquet ...\n", + "```\n", + "dataframe\n", + ".write\n", + ".format(\"parquet\")\n", + ".save(\"/data\")\n", + "\n", + "```\n", + "\n", + "#### … simply say delta\n", + "```\n", + "dataframe\n", + ".write\n", + ".format(\"delta\")\n", + ".save(\"/data\")\n", + "\n", + "```\n", + "\n", + "##### Learn more about Delta Lake:\n", + "\n", + "[Delta Lake Blogs](https://delta.io/blog)\n", + "\n", + "Delta Lake Tutorials\n", + "\n", + "[Delta Lake Integrations](https://delta.io/integrations/)\n", + "\n", + "**For more information, please refer to the** **[documentation](https://docs.delta.io/latest/index.html)** **.**\n", + "\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf2024-09-19T16:57:20Z
#### eBook\n", + "\n", + "# The CDP Build vs Buy Guide:\n", + "\n", + "### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Need for a Customer Data Platform\n", + "\n", + "\n", + "Organizations need to deliver personalized experiences to their customers to stay ahead\n", + "of the curve — that means they need a customer data platform (CDP). Through a CDP, data\n", + "from every touch point, along with third-party information, is brought together to provide\n", + "a unified view of the customer. This enables your marketing team to analyze, identify and\n", + "activate customers with targeted content.\n", + "\n", + "The key question for all IT teams at these organizations is whether to build or to buy.\n", + "\n", + "A CDP that sounds like music to the ears of business leaders may be perceived as noise\n", + "by enterprise IT leaders. The business side of the house needs immediate enablement, and\n", + "an out-of-the-box system dedicated to the specialized needs of marketers seems like the\n", + "fastest path to a solution.\n", + "\n", + "But for IT, the CDP is yet another system, bringing stack baggage and redundancies to\n", + "existing marketing and analytics systems.. The cost of adding another system to the\n", + "landscape and the redundancy of sensitive customer data creates a governance challenge\n", + "that has immediate consequences.\n", + "\n", + "**Critical IT Needs** **Critical Business Needs**\n", + "\n", + "\n", + "Keep control of data access and\n", + "governance; ability to architecture a\n", + "customer data stack with decisions on\n", + "where data is stored and where queries\n", + "are executed\n", + "\n", + "\n", + "Get customer data access via a no-code\n", + "interface to generate insights; build customer\n", + "experiences and activate data within\n", + "business applications\n", + "\n", + "\n", + "-----\n", + "\n", + "The question of whether to build or buy seems to leave legitimate needs and concerns by one\n", + "side or the other unaddressed — which is why so many organizations who have built a CDP\n", + "have expressed dissatisfaction regardless of which side of the fence they came down upon.\n", + "\n", + "**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\n", + "**both sides of the debate and provide organizations a third choice of both building and**\n", + "**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\n", + "the business with no-code and ease of use interface along with the flexibility and centralized\n", + "governance IT desires. By shifting the conversation from building or buying to building _and_\n", + "buying, we’ve opened the door to finding the right balance of approaches for our customer\n", + "organizations, helping organizations find greater success in their personalization journey.\n", + "\n", + "**“We made an attempt to internally build a CDP platform and while we**\n", + "**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\n", + "**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\n", + "**or offer a campaign interface to our product marketers that could empower**\n", + "**them to create and manage those journeys. 
It was going to take at least two**\n", + "**years for us to build all of that functionality in house.”**\n", + "\n", + "– Sravan Gupta, Senior Manager of GTM Systems, Atlassian\n", + "\n", + "\n", + "-----\n", + "\n", + "## Combining the Build and Buy Approaches\n", + "\n", + "\n", + "Bringing together the best of build and buy involves the deployment of the CDP alongside or\n", + "within the lakehouse platform. There are three approaches to this:\n", + "\n", + "**Bundled** **Composable**\n", + "\n", + "**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\n", + "\n", + "\n", + "Compute\n", + "\n", + "Storage\n", + "\n", + "\n", + "Compute\n", + "\n", + "Storage\n", + "(Local & Views)\n", + "\n", + "\n", + "Query\n", + "Virtualization\n", + "\n", + "Metadata\n", + "\n", + "\n", + "Data Copy\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "Storage\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "\n", + "Compute Compute\n", + "\n", + "Storage Storage\n", + "\n", + "\n", + "-----\n", + "\n", + "Deployment Type\n", + "\n", + "**Bundled**\n", + "\n", + "**Composable –**\n", + "**Hybrid**\n", + "\n", + "**Composable –**\n", + "**Lakehouse-Only**\n", + "\n", + "\n", + "Description\n", + "\n", + "The CDP and the lakehouse are managed as two separate systems. Connectors in either system (as well as\n", + "third-party tools) allow data to be exchanged, typically as part of an ad hoc or batch process. This approach\n", + "allows the organization to leverage the functionality of both systems but data is duplicated making governance\n", + "an on-going concern.\n", + "\n", + "The CDP and the lakehouse are managed as two separate systems, but deeper integrations between the two\n", + "allow the organization to decide within which system a specific dataset should reside. Real-time integrations\n", + "between the systems allow CDP users to select information assets in the lakehouse and generate queries\n", + "spanning data on either side of the platform divide. This approach minimizes the need for data duplication\n", + "which simplifies data governance, even though it must be implemented within two separate systems.\n", + "\n", + "All CDP information assets reside within the lakehouse. User interfaces built on other technologies, directly\n", + "interact with the lakehouse for access to data. This approach minimizes redundancy and allows organizations\n", + "to implement a centralized data governance strategy for all consumers of customer-relevant data.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Deployment Architectures \n", + "\n", + "\n", + "The choice of which of these deployment architectures is best depends on the functional\n", + "requirements of a specific organization. Each has its benefits, and in the case of parallel\n", + "and federated deployments, organizations can easily transition between deployment\n", + "architectures over time. 
The following table captures many of the typical benefits associated with the different deployment architectures.

| Typical User | Component | Description | Bundled CDP | Composable CDP – Hybrid | Composable CDP – Lakehouse-Only |
|---|---|---|---|---|---|
| **IT** | Digital Touchpoints | Collect and integrate data from digital channels (website, app, etc.) | Included with CDP via a tag | Works with any digital touchpoint collection system | Works with any digital touchpoint collection system |
| **IT** | Data Modeling | Unify and model data to make it usable by other applications | Sometimes included with CDP | Either within the CDP or in Lakehouse via real-time integration | Unified environment with minimal data replication and centralized governance in Lakehouse |
| **IT** | Identity Resolution | Deduplicate records to build a private ID graph with a single view of the customer | Primarily with CDP or other tools (MDM, Lakehouse) | CDP, MDM, or Lakehouse | Built with Lakehouse and additional tools |
| **IT** | Data Governance | Control data access and permitted actions on the data | Included with CDP | Both CDP and Lakehouse | Managed centrally from Lakehouse |
| **Business** | Predictive Scoring | Create and execute models predicting user behaviors such as purchase or churn | Included with CDP, with supplemental scoring from Lakehouse | CDP, or automatically present with Lakehouse | Automatically present with Lakehouse |
| **Business** | Marketing Audience Segments | Use a self-service UI to build rule-based or model-based audiences | Included with CDP | Included with CDP | Included with CDP |
| **Business** | Customer Journey Orchestration | Define and optimize the customer journey and interactions with the brand across every channel and every phase of the customer lifecycle | Sometimes included with CDP | CDP, marketing automation, or additional tools | CDP, marketing automation, or additional tools |
| **Business** | Data Activations | Integrate seamlessly with delivery systems for both inbound and outbound customer experiences | Included with CDP | Included with CDP | CDP, or additional tools |
| **Business** | Analytics | Understand audience and customer journey performance | Sometimes included with CDP | Sometimes included with CDP, or built with Lakehouse and additional tools | Built with Lakehouse and additional tools |

-----

## About Databricks

Databricks is the data and AI company. More than 9,000 organizations worldwide — including Comcast, Condé Nast, H&M, and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems.

## About ActionIQ

AIQ brings order to CX chaos. Our Customer Experience Hub empowers everyone to be a CX champion by giving business teams the freedom to explore and action on customer data while helping technical teams regain control of where data lives and how it’s used.

**[Get in touch](https://www.actioniq.com/get-started/)** with our experts to learn more.

-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf2024-09-19T16:57:20Z
-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "-----SUCCESS/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf2024-09-19T16:57:19Z
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "**EBOOK**\n\n## The Big Book of Data Engineering 2nd Edition\n\nA collection of technical\nblogs, including code\nsamples and notebooks\n\n##### With all-new content\n\n\n-----\n\n#### Contents\n\n**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n\n**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n\n**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n\n**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n\n**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n\n**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n\n**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n\n**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n\n**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n\n**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n\n**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n\n**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n\n**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n\n**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n\n**4 . 1** Akamai .................................................................................................................................................................................................... 77\n\n**4 . 
2** Grammarly ........................................................................................................................................................................................... 80\n\n**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n\n**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n\n**4 . 5** Rivian .................................................................................................................................................................................................... 90\n\n**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n\n\n-----\n\n**SECTION**\n\n# 01\n\n\n### Introduction to Data Engineering on Databricks\n\n\n-----\n\nOrganizations realize the value data plays as a strategic asset for various\nbusiness-related initiatives, such as growing revenues, improving the customer\nexperience, operating efficiently or improving a product or service. However,\naccessing and managing data for these initiatives has become increasingly\ncomplex. Most of the complexity has arisen with the explosion of data volumes\nand data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\nto increase, 73% of the data goes unused for analytics or decision-making. In\norder to try and decrease this percentage and make more data usable, data\nengineering teams are responsible for building data pipelines to efficiently and\nreliably deliver data. But the process of building these complex data pipelines\ncomes with a number of difficulties:\n\n**•** In order to get data into a data lake, data engineers are required\nto spend immense time hand-coding repetitive data ingestion tasks\n\n**•** Since data platforms continuously change, data engineers\nspend time building and maintaining, and then rebuilding, complex\nscalable infrastructure\n\n**•** As data pipelines become more complex, data engineers are\nrequired to find reliable tools to orchestrate these pipelines\n\n**•** With the increasing importance of real-time data, low latency data\npipelines are required, which are even more difficult to build and maintain\n\n**•** Finally, with all pipelines written, data engineers need to constantly\nfocus on performance, tuning pipelines and architectures to meet SLAs\n\n\n**How can Databricks help?**\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. 
The Lakehouse Platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability to\ndrive valuable insights.\n\nLakehouse Platform\n\n**One platform to support multiple personas**\n\n\n**BI & Data**\n**Warehousing**\n\n\n**Data**\n**Engineering**\n\n\n**Data**\n**Streaming**\n\n\n**Data**\n**Science & ML**\n\n\n©2023 Databricks Inc. — All rights reserved\n\n\n**Unity Catalog**\n**Fine-grained governance for data and AI**\n\n**Delta Lake**\n**Data reliability and performance**\n\n**Cloud Data Lake**\n\nAll Raw Data (Logs, Texts, Audio, Video, Images)\n\n\nFigure 1\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n\n\n-----\n\n**Key differentiators for successful data engineering**\n**with Databricks**\n\nBy simplifying on a lakehouse architecture, data engineers need an\nenterprise-grade and enterprise-ready approach to building data pipelines.\nTo be successful, a data engineering solution team must embrace these eight\nkey differentiating capabilities:\n\n**Data ingestion at scale**\nWith the ability to ingest petabytes of data with auto-evolving schemas,\ndata engineers can deliver fast, reliable, scalable and automatic data for\nanalytics, data science or machine learning. This includes:\n\n**•** Incrementally and efficiently processing data as it arrives\nfrom files or streaming sources like Kafka, DBMS and NoSQL\n\n**•** Automatically inferring schema and detecting column\nchanges for structured and unstructured data formats\n\n**•** Automatically and efficiently tracking data as it arrives with\n\nno manual intervention\n\n**•** Preventing data loss by rescuing data columns\n\n\n**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----\n\n**Unified orchestration of data workflows**\nSimple, clear and reliable orchestration of data processing tasks for data,\nanalytics and machine learning pipelines with the ability to run multiple\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\ncompute cluster. 
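As one possible illustration of such a DAG, the sketch below uses the databricks-sdk Python package (an assumption, not part of the original text; notebook paths and the cluster ID are placeholders) to define a two-task job in which a transform task depends on an ingest task.

```
from databricks.sdk import WorkspaceClient
from databricks.sdk.service import jobs

w = WorkspaceClient()  # authenticates from the environment or .databrickscfg

created = w.jobs.create(
    name="nightly_etl_example",
    tasks=[
        jobs.Task(
            task_key="ingest",
            existing_cluster_id="<cluster-id>",  # placeholder
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/etl/ingest"),
        ),
        jobs.Task(
            task_key="transform",
            depends_on=[jobs.TaskDependency(task_key="ingest")],
            existing_cluster_id="<cluster-id>",  # placeholder
            notebook_task=jobs.NotebookTask(notebook_path="/Repos/etl/transform"),
        ),
    ],
)
print(f"Created job {created.job_id}")
```

The same dependency structure can be defined through the Workflows UI or a job YAML; the API form is shown here only to make the DAG explicit.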
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\nin a DAG using Databricks Workflows, an orchestration tool included in the\nlakehouse with no need to maintain or pay for an external orchestration service.\n\n**•** Easily create and manage multiple tasks with dependencies via UI,\nAPI or from your IDE\n\n**•** Have full observability to all workflow runs and get alerted when\ntasks fail for fast troubleshooting and efficient repair and rerun\n\n**•** Leverage high reliability of 99.95% uptime\n\n**•** Use performance optimization clusters that parallelize jobs and\nminimize data movement with cluster reuse\n\n**Data quality validation and monitoring**\nImprove data reliability throughout the data lakehouse so data teams can\nconfidently trust the information for downstream initiatives by:\n\n**•** Defining data quality and integrity controls within the pipeline\nwith defined data expectations\n\n**•** Addressing data quality errors with predefined policies\n(fail, drop, alert, quarantine)\n\n**•** Leveraging the data quality metrics that are captured, tracked\nand reported for the entire data pipeline\n\n\nData\nSources\n\nData\nWarehouses\n\nOn-premises\nSystems\n\nSaaS\nApplications\n\nMachine &\nApplication Logs\n\nApplication\nEvents\n\nMobile & IoT\nData\n\n\nCloud\nStorage\n\nMessag\ne Buses\n\n\n**Lakehouse Platform**\n\n**Workflows** for end-to-end orchestration\n\n\nReal-Time BI Apps\n\nReal-Time AI Apps\n\n\nReal-Time Analytics with\n**Databricks SQL**\n\nReal-Time Machine Learning\nwith\n**Databricks ML**\n\n\nStreaming ETL with\n**Delta Live Tables**\n\n\nPredictive\nMaintenance\n\n\nPersonalized\nOffers\n\n\nPatient\nDiagnostics\n\n\nReal-Time Operational\nApps\n\n\nReal-Time Applications with\n**Spark Structured Streaming**\n\n**Photon** for lightning-fast data processing\n\n**Unity Catalog** for data governance and sharing\n\n**Delta Lake** for open and reliable data storage\n\n\nAlerts Detection Fraud\n\n\nDynamic\nPricing\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 2\nA unified set of tools for real-time data processing\n\n\n-----\n\n**Fault tolerant and automatic recovery**\nHandle transient errors and recover from most common error conditions\noccurring during the operation of a pipeline with fast, scalable automatic\nrecovery that includes:\n\n**•** Fault tolerant mechanisms to consistently recover the state of data\n\n**•** The ability to automatically track progress from the source with\ncheckpointing\n\n**•** The ability to automatically recover and restore the data pipeline state\n\n**Data pipeline observability**\nMonitor overall data pipeline status from a dataflow graph dashboard and\nvisually track end-to-end pipeline health for performance, quality and latency.\nData pipeline observability capabilities include:\n\n**•** A high-quality, high-fidelity lineage diagram that provides visibility\ninto how data flows for impact analysis\n\n**•** Granular logging with performance and status of the data pipeline\nat a row level\n\n**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n\n\n**Automatic deployments and operations**\nEnsure reliable and predictable delivery of data for analytics and machine\nlearning use cases by enabling easy and automatic data pipeline deployments\nand rollbacks to minimize downtime. 
Benefits include:\n\n**•** Complete, parameterized and automated deployment for the\ncontinuous delivery of data\n\n**•** End-to-end orchestration, testing and monitoring of data pipeline\ndeployment across all major cloud providers\n\n**Migrations**\nAccelerating and de-risking the migration journey to the lakehouse, whether\nfrom legacy on-prem systems or disparate cloud services.\n\nThe migration process starts with a detailed discovery and assessment to\nget insights on legacy platform workloads and estimate migration as well as\nDatabricks platform consumption costs. Get help with the target architecture\nand how the current technology stack maps to Databricks, followed by a\nphased implementation based on priorities and business needs. Throughout\nthis journey companies can leverage:\n\n**•** Automation tools from Databricks and its ISV partners\n\n**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n\n**•** Databricks Professional Services and training\n\nThis is the recommended approach for a successful migration, whereby\ncustomers have seen a 25-50% reduction in costs and 2-3x faster time to value\nfor their use cases.\n\n\n-----\n\n**Unified governance**\nWith Unity Catalog, data engineering and governance teams benefit from an\nenterprisewide data catalog with a single interface to manage permissions,\ncentralize auditing, automatically track data lineage down to the column level,\nand share data across platforms, clouds and regions. Benefits:\n\n**•** Discover all your data in one place, no matter where it lives,\nand centrally manage fine-grained access permissions using an\nANSI SQL-based interface\n\n**•** Leverage automated column-level data lineage to perform impact\nanalysis of any data changes across the pipeline and conduct\nroot cause analysis of any errors in the data pipelines\n\n**•** Centrally audit data entitlements and access\n\n**•** Share data across clouds, regions and data platforms,\nwhile maintaining a single copy of your data in your cloud storage\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 3\nThe Databricks Lakehouse Platform integrates with a large collection of technologies\n\n\n**A rich ecosystem of data solutions**\nThe Databricks Lakehouse Platform is built on open source technologies and\nuses open standards so leading data solutions can be leveraged with anything\nyou build on the lakehouse. A large collection of technology partners make it\neasy and simple to integrate the technologies you rely on when migrating to\nDatabricks and to know you are not locked into a closed data technology stack.\n\n\n-----\n\n**Conclusion**\n\nAs organizations strive to become data-driven, data engineering is a focal\npoint for success. To deliver reliable, trustworthy data, data engineers shouldn’t\nneed to spend time manually developing and maintaining an end-to-end\nETL lifecycle. 
Data engineering teams need an efficient, scalable way to\nsimplify ETL development, improve data reliability and manage operations.\n\nAs described, the eight key differentiating capabilities simplify the\nmanagement of the ETL lifecycle by automating and maintaining all data\ndependencies, leveraging built-in quality controls with monitoring and by\nproviding deep visibility into pipeline operations with automatic recovery.\nData engineering teams can now focus on easily and rapidly building reliable\nend-to-end production-ready data pipelines using only SQL or Python\nfor batch and streaming that deliver high-value data for analytics, data\nscience or machine learning.\n\n\n**Follow proven best practices**\n\nIn the next section, we describe best practices for data engineering\nend-to end use cases drawn from real-world examples. From data ingestion\nand real-time processing to analytics and machine learning, you’ll learn\nhow to translate raw data into actionable data.\n\nAs you explore the rest of this guide, you can find data sets and code\nsamples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\nget your hands dirty as you explore all aspects of the data lifecycle on the\nDatabricks Lakehouse Platform.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 02\n\n\n### Guidance and Best Practices\n\n**2.1** Top 5 Databricks Performance Tips\n\n**2.2** How to Profile PySpark\n\n**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n\n**2.4** Streaming in Production: Collected Best Practices\n\n**2.5** Streaming in Production: Collected Best Practices, Part 2\n\n**2.6** Building Geospatial Data Products\n\n**2.7** Data Lineage With Unity Catalog\n\n**2.8** Easy Ingestion to Lakehouse With COPY INTO\n\n**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n\n**2.10** Best Practices for Cross-Government Data Sharing\n\n\n-----\n\nSECTION 2.1\n\n**Top 5 Databricks Performance Tips**\n\nby **B R YA N S M I T H** and **R O B S A K E R**\n\nMarch 10, 2022\n\n\nAs solutions architects, we work closely with customers every day to help them\nget the best performance out of their jobs on Databricks — and we often end\nup giving the same advice. It’s not uncommon to have a conversation with a\ncustomer and get double, triple, or even more performance with just a few\ntweaks. So what’s the secret? How are we doing this? Here are the top 5 things\nwe see that can make a huge impact on the performance customers get\nfrom Databricks.\n\nHere’s a TLDR:\n\n**•** **Use larger clusters.** It may sound obvious, but this is the number\none problem we see. It’s actually not any more expensive to use a large\ncluster for a workload than it is to use a smaller one. It’s just faster.\nIf there’s anything you should take away from this article, it’s this.\n\nRead section 1. Really.\n\n**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\nto learn more. You won’t regret it.\n\n\n\n**•** **Clean out your configurations** . Configurations carried from one\nApache Spark™ version to the next can cause massive problems. 
Clean up!\nRead section 3 to learn more.\n\n**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\ncorrectly, if at all. See Section 4 to learn more.\n\n**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\nyou’re writing Spark code, jump to section 5.\n\n**•** **Bonus tip! Table design is super important** . We’ll go into this in a future\nblog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n\n**1. Give your clusters horsepower!**\n\nThis is the number one mistake customers make. Many customers create tiny\nclusters of two workers with four cores each, and it takes forever to do anything.\nThe concern is always the same: they don’t want to spend too much money on\nlarger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n\n\n-----\n\nThe key is that you’re renting the cluster for the length of the workload. So, if\nyou spin up that two worker cluster and it takes an hour, you’re paying for those\nworkers for the full hour. However, if you spin up a four worker cluster and it takes\nonly half an hour, the cost is actually the same! And that trend continues as long\nas there’s enough work for the cluster to do.\n\nHere’s a hypothetical scenario illustrating the point:\n\n**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n\n1 $1 2 $2\n\n2 $2 1 $2\n\n4 $4 0.5 $2\n\n8 $8 0.25 $2\n\nNotice that the total cost of the workload stays the same while the real-world\ntime it takes for the job to run drops significantly. So, bump up your Databricks\ncluster specs and speed up your workloads without spending any more money. It\n\ncan’t really get any simpler than that.\n\n**2. Use Photon**\n\nOur colleagues in engineering have rewritten the Spark execution engine in C++\nand dubbed it Photon. The results are impressive!\n\n\nBeyond the obvious improvements due to running the engine in native code,\nthey’ve also made use of CPU-level performance features and better memory\n\nmanagement. On top of this, they’ve rewritten the Parquet writer in C++. So this\nmakes writing to Parquet and Delta (based on Parquet) super fast as well!\n\nBut let’s also be clear about what Photon is speeding up. It improves\ncomputation speed for any built-in functions or operations, as well as writes to\nParquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\nspending most of its time reading from an ancient on-prem database? Won’t\nhelp there either, unfortunately.\n\n\n-----\n\nThe good news is that it helps where it can. So even if part of your job can’t be\nsped up, it will speed up the other parts. Also, most jobs are written with the\nnative operations and spend a lot of time writing to Delta, and Photon helps a lot\nthere. So give it a try. You may be amazed by the results!\n\n**3. Clean out old configurations**\n\nYou know those Spark configurations you’ve been carrying along from version to\nversion and no one knows what they do anymore? They may not be harmless.\nWe’ve seen jobs go from running for hours down to minutes simply by cleaning\nout old configurations. 
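A quick way to see what a cluster is actually carrying is to list the configurations that have been explicitly set; a minimal sketch (assuming it runs in a Databricks notebook where `spark` is the active session) looks like this:

```
# List Spark configurations that were explicitly set on this cluster/session,
# so stale or mystery settings can be reviewed and removed.
explicitly_set = spark.sparkContext.getConf().getAll()  # list of (key, value) pairs

for key, value in sorted(explicitly_set):
    # Focus on the SQL and shuffle tuning knobs, which are the usual suspects.
    if key.startswith("spark.sql") or "shuffle" in key:
        print(f"{key} = {value}")
```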
There may have been a quirk in a particular version of\nSpark, a performance tweak that has not aged well, or something pulled off\nsome blog somewhere that never really made sense. At the very least, it’s worth\nrevisiting your Spark configurations if you’re in this situation. Often the default\nconfigurations are the best, and they’re only getting better. Your configurations\nmay be holding you back.\n\n**4. The Delta Cache is your friend**\n\nThis may seem obvious, but you’d be surprised how many people are not using\nthe [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\nthe workers’ SSDs for faster access.\n\n\nIf you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\nby default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\nyour “hot” tables when you’re starting an endpoint. This will ensure blazing fast\nspeeds for any queries on those tables.\n\nIf you’re using regular clusters, be sure to use the i3 series on Amazon Web\nServices (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\nall have fast SSDs and caching enabled by default.\n\nOf course, your mileage may vary. If you’re doing BI, which involves reading the\nsame tables over and over again, caching gives an amazing boost. However, if\nyou’re simply reading a table once and writing out the results as in some ETL\njobs, you may not get much benefit. You know your jobs better than anyone.\nGo forth and conquer.\n\n\n-----\n\n**5. Be aware of lazy evaluation**\n\n\nHowever, there is a catch here. Every time you try to display or write out\nresults, it runs the execution plan again. Let’s look at the same block of code\nbut extend it and do a few more operations.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n.filter(...)\n)\n\n_# Now run the execution plan to get results_\ndf2.display()\n\n_# Unfortunately this will run the plan again, including filtering, joining,_\n_etc_\ndf2.display()\n\n_# So will this…_\ndf2.count()\n—------\n\n\nIf you’re a data analyst or data scientist only using SQL or doing BI you can skip\nthis section. However, if you’re in data engineering and writing pipelines or doing\nprocessing using Databricks/Spark, read on.\n\nWhen you’re writing Spark code like select, groupBy, filter, etc., you’re really\nbuilding an execution plan. You’ll notice the code returns almost immediately when\nyou run these functions. That’s because it’s not actually doing any computation. So\neven if you have petabytes of data, it will return in less than a second.\n\nHowever, once you go to write your results out you’ll notice it takes longer. This\nis due to lazy evaluation. It’s not until you try to display or write results that your\nexecution plan is actually run.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n\n\n_# Now run the execution plan to get results_\ndf2.display()\n—------\n\n\n-----\n\nThe developer of this code may very well be thinking that they’re just printing\nout results three times, but what they’re really doing is kicking off the same\nprocessing three times. Oops. That’s a lot of extra work. This is a very common\nmistake we run into. 
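To make the pattern above concrete, here is a self-contained sketch (the toy tables and column names are assumptions for illustration) in which each action re-runs the entire plan:

```
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Toy inputs standing in for real tables.
facts = spark.range(1_000_000).withColumn("key", F.col("id") % 100)
dims = (spark.range(100)
        .withColumnRenamed("id", "key")
        .withColumn("label", F.concat(F.lit("k"), F.col("key").cast("string"))))

# Build an execution plan; this returns immediately, nothing has run yet.
df2 = (
    facts.join(dims, "key")
         .select("id", "key", "label")
         .filter(F.col("id") % 2 == 0)
)

# Each action below executes the full plan again: the join and filter run twice.
print(df2.count())
print(df2.count())
```

Calling `df2.cache()` before the first action, or persisting the result as shown next, avoids the duplicate work.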
So why is there lazy evaluation, and what do we do about it?\n\nIn short, processing with lazy evaluation is way faster than without it.\nDatabricks/Spark looks at the full execution plan and finds opportunities\nfor optimization that can reduce processing time by orders of magnitude.\nSo that’s great, but how do we avoid the extra computation? The answer\nis pretty straightforward: save computed results you will reuse.\n\n\nThis works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\nbenefit greatly from lazy evaluation, but it’s something a lot of customers trip\nover. So be aware of its existence and save results you reuse in order to avoid\nunnecessary computation.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nLet’s look at the same block of code again, but this time let’s avoid the\nrecomputation:\n\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n)\n\n_# save it_\ndf2.write.save(path)\n\n_# load it back in_\ndf3 = spark.read.load(path)\n\n_# now use it_\ndf3.display()\n\n_# this is not doing any extra computation anymore. No joins, filtering,_\n_etc. It’s already done and saved._\ndf3.display()\n\n_# nor is this_\ndf3.count()\n\n\n-----\n\nSECTION 2.2 \u0007\n\n**How to Profile PySpark**\n\nby **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n\nOctober 6, 2022\n\n\nIn Apache Spark™, declarative Python APIs are supported for big data workloads.\nThey are powerful enough to handle most common use cases. Furthermore,\nPySpark UDFs offer more flexibility since they enable users to run arbitrary\nPython code on top of the Apache Spark™ engine. Users only have to state\n“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\nPySpark easier to use, but it can be difficult to identify performance bottlenecks\nand apply custom optimizations.\n\nTo address the difficulty mentioned above, PySpark supports various profiling\ntools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n[implementations](https://docs.python.org/3/library/profile.html) . PySpark Profilers provide information such as the number\nof function calls, total time spent in the given function, and filename, as well\nas line number to help navigation. That information is essential to exposing\ntight loops in your PySpark programs, and allowing you to make performance\n\nimprovement decisions.\n\n\n**Driver profiling**\n\nPySpark applications run as independent sets of processes on a cluster,\ncoordinated by the SparkContext object in the driver program. On the driver\nside, PySpark is a regular Python process; thus, we can profile it as a normal\nPython program using cProfile as illustrated below:\n\nimport cProfile\n\nwith cProfile.Profile() as pr:\n_# Your code_\n\npr.print_stats()\n\n**Workers profiling**\n\nExecutors are distributed on worker nodes in the cluster, which introduces\ncomplexity because we need to aggregate profiles. 
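Before moving to the worker side, the driver-side snippet above can be filled out into a runnable sketch (the workload is a toy aggregation, and pstats, from the standard library, is used to rank the output):

```
import cProfile
import pstats

from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

# Profile the driver process: plan construction and result collection happen
# here, while the heavy lifting runs on the executors.
with cProfile.Profile() as pr:
    df = spark.range(1_000_000).withColumn("bucket", F.col("id") % 10)
    rows = df.groupBy("bucket").count().collect()

pstats.Stats(pr).sort_stats("cumulative").print_stats(10)  # top 10 entries by cumulative time
```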
Furthermore, a Python worker\nprocess is spawned per executor for PySpark UDF execution, which makes the\nprofiling more intricate.\n\n\n-----\n\nThe UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\nand becomes a major tool to profile workers for PySpark applications. We’ll\nillustrate how to use the UDF profiler with a simple Pandas UDF example.\n\nFirstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n```\n sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n by 'id' )\n ).withColumn( 'v' , rand())\n\n```\nLater, we will group by the id column, which results in 8 groups with 1,000 rows\nper group.\n\nThe Pandas UDF plus_one is then created and applied as shown below:\n```\n import pandas as pd\n def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf.apply( lambda x: x + 1 , axis= 1 )\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n res.collect()\n\n```\n\nExecuting the example above and running sc.show_profiles() prints the\nfollowing profile. The profile below can also be dumped to disk by sc.dump_\nprofiles(path).\n\nThe UDF id in the profile (271, highlighted above) matches that in the Spark plan\nfor res. The Spark plan can be shown by calling res.explain() .\n\n\nNote that plus_one takes a pandas DataFrame and returns another pandas\nDataFrame. For each group, all columns are passed together as a pandas\nDataFrame to the plus_one UDF, and the returned pandas DataFrames are\ncombined into a PySpark DataFrame.\n\n\n-----\n\nThe first line in the profile’s body indicates the total number of calls that were\nmonitored. The column heading includes\n\n**•** ncalls , for the number of calls.\n\n**•** tottime , for the total time spent in the given function (excluding time\nspent in calls to sub-functions)\n\n**•** percall , the quotient of tottime divided by ncalls\n\n**•** cumtime , the cumulative time spent in this and all subfunctions (from\ninvocation till exit)\n\n**•** percall , the quotient of cumtime divided by primitive calls\n\n**•** filename:lineno(function) , which provides the respective information\nfor each function\n\nDigging into the column details: plus_one is triggered once per group, 8 times\nin total; _arith_method of pandas Series is called once per row, 8,000 times\nin total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\nrow, thus suffering from high invocation overhead.\n\nWe can reduce such overhead by substituting the pandas.DataFrame.apply\nwith pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\nfollows:\n```\n import pandas as pd\n def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf + 1\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n schema)\n res.collect()\n\n```\n\nThe updated profile is as shown below.\n\nWe can summarize the optimizations as follows:\n\n**•** Arithmetic operation from 8,000 calls to 8 calls\n\n**•** Total function calls from 2,898,160 calls to 2,384 calls\n\n**•** Total execution time from 2.300 seconds to 0.004 seconds\n\nThe short example above demonstrates how the UDF profiler helps us deeply\nunderstand the execution, identify the performance bottleneck and enhance\nthe overall performance of the user-defined function.\n\nThe UDF profiler was implemented based on the executor-side profiler,\nwhich is designed for PySpark RDD API. 
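For reference, these profiles are only collected when Python profiling is enabled. One way to switch it on and retrieve the results is sketched below; the configuration has to be set before the SparkContext starts (for example, in the cluster's Spark config), and the dump path is a placeholder.

```
# In the cluster's Spark configuration (set before the SparkContext is created):
#   spark.python.profile  true

# After running the UDF workload (e.g., res.collect() above), print or persist
# the aggregated worker profiles from the driver:
sc = spark.sparkContext
sc.show_profiles()                           # print the collected profiles
sc.dump_profiles("/dbfs/tmp/udf_profiles")   # or dump them to a directory for later analysis
```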
The executor-side profiler is available\nin all active Databricks Runtime versions.\n\n\n-----\n\nBoth the UDF profiler and the executor-side profiler run on Python workers.\nThey are controlled by the spark.python.profile Spark configuration, which\nis false by default. We can enable that Spark configuration on a Databricks\nRuntime cluster as shown below.\n\n\n**Conclusion**\n\nPySpark profilers are implemented based on cProfile; thus, the profile reporting\nrelies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\ncollecting profile reports from Python workers.\n\nPowerful profilers are provided by PySpark in order to identify hot loops and\nsuggest potential improvements. They are easy to use and critical to enhance\nthe performance of PySpark programs. The UDF profiler, which is available\nstarting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\nchallenges and brings insights to user-defined functions.\n\nIn addition, there is an ongoing effort in the Apache Spark™ open source\ncommunity to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\nmore information.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.3 \u0007\n\n**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n**and Apache Kafka**\n\nby **F R A N K M U N Z**\n\nAugust 9, 2022\n\n\n[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\napproach for creating reliable data pipelines and fully manages the underlying\ninfrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\nactionable insights derived from near real-time data. Delta Live Tables enables\nlow-latency streaming data pipelines to support such use cases with low\nlatencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n\nThis article will walk through using DLT with Apache Kafka while providing the\nrequired Python code to ingest streams. The recommended system architecture\nwill be explained, and related DLT settings worth considering will be explored\nalong the way.\n\n**Streaming platforms**\n\nEvent buses or message buses decouple message producers from consumers.\nA popular streaming use case is the collection of click-through data from\nusers navigating a website where every user interaction is stored as an event in\n\n\nApache Kafka. The event stream from Kafka is then used for real-time streaming\ndata analytics. Multiple message consumers can read the same data from Kafka\nand use the data to learn about audience interests, conversion rates, and bounce\nreasons. The real-time, streaming event data from the user interactions often\nalso needs to be correlated with actual purchases stored in a billing database.\n\n**Apache Kafka**\n\n[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. 
Kafka uses the concept of a\ntopic, an append-only distributed log of events where messages are buffered for\na certain amount of time. Although messages in Kafka are not deleted once they\nare consumed, they are also not stored indefinitely. The message retention for\n\nKafka can be configured per topic and defaults to 7 days. Expired messages will\nbe deleted eventually.\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to many other event busses or messaging systems.\n\n\n-----\n\n**Streaming data pipelines**\n\n\nIn a data flow pipeline, Delta Live Tables and their dependencies can be declared\nwith a standard SQL Create Table As Select (CTAS) statement and the DLT\nkeyword “live.”\n\nWhen developing DLT with Python, the @dlt.table decorator is used to create a\nDelta Live Table. To ensure the data quality in a pipeline, DLT uses [Expectations](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html)\nwhich are simple SQL constraints clauses that define the pipeline’s behavior with\ninvalid records.\n\nSince streaming workloads often come with unpredictable data volumes,\nDatabricks employs [enhanced autoscaling](https://databricks.com/blog/2022/06/29/delta-live-tables-announces-new-capabilities-and-performance-optimizations.html) for data flow pipelines to minimize the\noverall end-to-end latency while reducing cost by shutting down unnecessary\ninfrastructure.\n\n**Delta Live Tables** are fully recomputed, in the right order, exactly once for each\npipeline run.\n\nIn contrast, **streaming Delta Live Tables** are stateful, incrementally computed\nand only process data that has been added since the last pipeline run. If the\nquery which defines a streaming live tables changes, new data will be processed\nbased on the new query but existing data is not recomputed. Streaming live\ntables always use a streaming source and only work over append-only streams,\nsuch as Kafka, Kinesis, or Auto Loader. Streaming DLTs are based on top of Spark\nStructured Streaming.\n\n\nYou can chain multiple streaming pipelines, for example, workloads with very\nlarge data volume and low latency requirements.\n\n**Direct ingestion from streaming engines**\n\nDelta Live Tables written in Python can directly ingest data from an event bus like\nKafka using Spark Structured Streaming. You can set a short retention period for\nthe Kafka topic to avoid compliance issues, reduce costs and then benefit from\nthe cheap, elastic and governable storage that Delta provides.\n\nAs a first step in the pipeline, we recommend ingesting the data as is to a Bronze\n(raw) table and avoid complex transformations that could drop important data.\nLike any Delta table the Bronze table will retain the history and allow it to perform\nGDPR and other compliance tasks.\n\nIngest streaming data from Apache Kafka\n\n\n-----\n\nWhen writing DLT pipelines in Python, you use the @dlt.table annotation\nto create a DLT table. There is no special attribute to mark streaming DLTs in\nPython; simply use spark.readStream() to access the stream. Example code\nfor creating a DLT table with the name kafka_bronze that is consuming data\nfrom a Kafka topic looks as follows:\n\nimport dlt\nfrom pyspark.sql.functions import - \nfrom pyspark.sql.types import - \n\nTOPIC = \"tracker-events\"\nKAFKA_BROKER = spark.conf.get( \"KAFKA_SERVER\" )\n_# subscribe to TOPIC at KAFKA_BROKER_\nraw_kafka_events = (spark.readStream\n. 
format ( \"kafka\" )\n.option( \"subscribe\" , TOPIC)\n.option( \"kafka.bootstrap.servers\" , KAFKA_BROKER)\n.option( \"startingOffsets\" , \"earliest\" )\n.load()\n)\n\n**@dlt.table(table_properties={** **\"pipelines.reset.allowed\"** **:** **\"false\"** **})**\n```\n def kafka_bronze ():\n\n```\nreturn raw_kafka_events\n\npipelines.reset.allowed\n\nNote that event buses typically expire messages after a certain period of time,\nwhereas Delta is designed for infinite retention.\n\nThis might lead to the effect that source data on Kafka has already been deleted\nwhen running a full refresh for a DLT pipeline. In this case, not all historic data\ncould be backfilled from the messaging platform, and data would be missing in\nDLT tables. To prevent dropping data, use the following DLT table property:\n\n\npipelines.reset.allowed=false\n\nSetting pipelines.reset.allowed to false prevents refreshes to the table but\ndoes not prevent incremental writes to the tables or new data from flowing into\nthe table.\n\n**Checkpointing**\n\nIf you are an experienced Spark Structured Streaming developer, you will notice\nthe absence of checkpointing in the above code. In Spark Structured Streaming\ncheckpointing is required to persist progress information about what data has\nbeen successfully processed and upon failure, this metadata is used to restart a\nfailed query exactly where it left off.\n\nWhereas checkpoints are necessary for failure recovery with exactly-once\nguarantees in Spark Structured Streaming, DLT handles state automatically\nwithout any manual configuration or explicit checkpointing required.\n\n**Mixing SQL and Python for a DLT pipeline**\n\nA DLT pipeline can consist of multiple notebooks but one DLT notebook is\nrequired to be written entirely in either SQL or Python (unlike other Databricks\nnotebooks where you can have cells of different languages in a single notebook).\n\nNow, if your preference is SQL, you can code the data ingestion from Apache\nKafka in one notebook in Python and then implement the transformation logic of\nyour data pipelines in another notebook in SQL.\n\n\n-----\n\n**Schema mapping**\n\nWhen reading data from messaging platform, the data stream is opaque and a\nschema has to be provided.\n\nThe Python example below shows the schema definition of events from a fitness\ntracker, and how the value part of the [Kafka message is mapped](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html) to that schema.\n\nevent_schema = StructType([ \\\nStructField( \"time\" , TimestampType(), True ) , \\\nStructField( \"version\" , StringType(), True ), \\\nStructField( \"model\" , StringType(), True ) , \\\nStructField( \"heart_bpm\" , IntegerType(), True ), \\\nStructField( \"kcal\" , IntegerType(), True ) \\\n])\n\n_# temporary table, visible in pipeline but not in data browser,_\n_# cannot be queried interactively_\n**@dlt.table(comment=** **\"real schema for Kakfa payload\"** **,**\n**temporary=** **True** **)**\n```\n def kafka_silver ():\n\n```\nreturn (\n_# kafka streams are (timestamp,value)_\n_# value contains the kafka payload_\n\ndlt.read_stream( \"kafka_bronze\" )\n.select(col( \"timestamp\" ),from_json(col( \"value\" )\n.cast( \"string\" ), event_schema).alias( \"event\" ))\n.select( \"timestamp\" , \"event.*\" )\n\n\n**Benefits**\n\nReading streaming data in DLT directly from a message broker minimizes the\narchitectural complexity and provides lower end-to-end latency since data is\ndirectly streamed from the messaging broker and no 
intermediary step is involved.\n\n**Streaming ingest with cloud object store intermediary**\n\nFor some specific use cases, you may want to offload data from Apache Kafka,\ne.g., using a Kafka connector, and store your streaming data in a cloud object\nintermediary. In a Databricks workspace, the cloud vendor-specific objectstore can then be mapped via the Databricks Files System (DBFS) as a cloudindependent folder. Once the data is offloaded, [Databricks Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) can\ningest the files.\n\nAuto Loader can ingest data with a single line of SQL code. The syntax to ingest\nJSON files into a DLT table is shown below (it is wrapped across two lines for\nreadability).\n\n_-- INGEST with Auto Loader_\ncreate or replace streaming live table raw\nas select `*` FROM cloud_files(\"dbfs:/data/twitter\", \"json\")\n\n\n-----\n\nNote that Auto Loader itself is a streaming data source and all newly arrived files\nwill be processed exactly once, hence the streaming keyword for the raw table\nthat indicates data is ingested incrementally to that table.\n\nSince offloading streaming data to a cloud object store introduces an additional\nstep in your system architecture it will also increase the end-to-end latency\nand create additional storage costs. Keep in mind that the Kafka connector\nwriting event data to the cloud object store needs to be managed, increasing\noperational complexity.\n\nTherefore Databricks recommends as a best practice to directly access event\nbus data from DLT using [Spark Structured Streaming](https://www.databricks.com/blog/2022/08/09/low-latency-streaming-data-pipelines-with-delta-live-tables-and-apache-kafka.html#described) as described above.\n\n**Other event buses or messaging systems**\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to other event buses or messaging systems. DLT supports any data\nsource that Databricks Runtime directly supports.\n\n**Amazon Kinesis**\nIn Kinesis, you write messages to a fully managed serverless stream. Same as\nKafka, Kinesis does not permanently store messages. The default message\nretention in Kinesis is one day.\n\nWhen using Amazon Kinesis, replace format(“kafka”) with format(“kinesis”) in the\nPython code for streaming ingestion above and add Amazon Kinesis-specific\nsettings with option(). For more information, check the section about Kinesis\nIntegration in the Spark Structured Streaming documentation.\n\n\n**Azure Event Hubs**\n\nFor Azure Event Hubs settings, check the official [documentation at Microsoft](https://docs.microsoft.com/en-us/azure/event-hubs/event-hubs-kafka-spark-tutorial) and\nthe article [Delta Live Tables recipes: Consuming from Azure Event Hubs](https://alexott.blogspot.com/2022/06/delta-live-tables-recipes-consuming.html) .\n\n**Summary**\n\nDLT is much more than just the “T” in ETL. With DLT, you can easily ingest from\nstreaming and batch sources, cleanse and transform data on the Databricks\nLakehouse Platform on any cloud with guaranteed data quality.\n\nData from Apache Kafka can be ingested by directly connecting to a Kafka broker\nfrom a DLT notebook in Python. Data loss can be prevented for a full pipeline\nrefresh even when the source data in the Kafka streaming layer expired.\n\n**Get started**\n\nIf you are a Databricks customer, simply follow the [guide to get started](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) . 
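One more code-level note: the expectations mentioned earlier can be attached directly to a downstream table. A small sketch is shown below (the constraint thresholds and column names follow the fitness-tracker example and are purely illustrative):

```
import dlt
from pyspark.sql.functions import col

# Drop rows that violate basic sanity checks on the fitness-tracker events;
# the thresholds below are illustrative, not recommendations.
@dlt.table(comment="Cleansed fitness-tracker events")
@dlt.expect_or_drop("positive_heart_rate", "heart_bpm > 0")
@dlt.expect_or_drop("non_negative_kcal", "kcal >= 0")
def kafka_gold():
    return dlt.read_stream("kafka_silver").select("timestamp", "heart_bpm", "kcal")
```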
Read the\nrelease notes to learn more about what’s included in this GA release. If you are\nnot an existing Databricks customer, [sign up for a free trial](https://www.databricks.com/try-databricks) , and you can view our\ndetailed [DLT pricing here](https://www.databricks.com/product/pricing) .\n\nJoin the conversation in the [Databricks Community](https://community.databricks.com/s/topic/0TO8Y000000VJEhWAO/summit22) where data-obsessed peers\nare chatting about Data + AI Summit 2022 announcements and updates. Learn.\nNetwork.\n\nLast but not least, enjoy the [Dive Deeper into Data Engineering](https://youtu.be/uhZabeKxXBw) session from the\nsummit. In that session, I walk you through the code of another streaming data\nexample with a Twitter livestream, Auto Loader, Delta Live Tables in SQL, and\nHugging Face sentiment analysis.\n\n\n-----\n\nSECTION 2.4 \u0007\n\n**Streaming in Production: Collected Best Practices**\n\nby **B Y A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nDecember 12, 2022\n\n\nReleasing any data pipeline or application into a production state requires\nplanning, testing, monitoring, and maintenance. Streaming pipelines are no\ndifferent in this regard; in this blog we present some of the most important\nconsiderations for deploying streaming pipelines and applications to a\nproduction environment.\n\nAt Databricks, we offer two different ways of building and running streaming\npipelines and applications — [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) and [Databricks Workflows](https://www.databricks.com/product/workflows) .\nDLT is our flagship, fully managed ETL product that supports both batch and\nstreaming pipelines. It offers declarative development, automated operations,\ndata quality, advanced observability capabilities, and more. Workflows enable\ncustomers to run Apache Spark™ workloads in Databricks’ optimized runtime\nenvironment (i.e., Photon) with access to unified governance (Unity Catalog) and\nstorage (Delta Lake). Regarding streaming workloads, both DLT and Workflows\n\nshare the same core streaming engine — Spark Structured Streaming. In the\ncase of DLT, customers program against the DLT API and DLT uses the Structured\nStreaming engine under the hood. In the case of Jobs, customers program\nagainst the Spark API directly.\n\n\nThe recommendations in this blog post are written from the Structured\nStreaming engine perspective, most of which apply to both DLT and Workflows\n(although DLT does take care of some of these automatically, like Triggers and\nCheckpoints). We group the recommendations under the headings “Before\nDeployment” and “After Deployment” to highlight when these concepts will\nneed to be applied and are releasing this blog series with this split between\nthe two. There will be additional deep-dive content for some of the sections\nbeyond as well. We recommend reading all sections before beginning work\nto productionalize a streaming pipeline or application, and revisiting these\nrecommendations as you promote it from dev to QA and eventually production.\n\n**Before deployment**\n\nThere are many things you need to consider when creating your streaming\napplication to improve the production experience. Some of these topics, like\nunit testing, checkpoints, triggers, and state management, will determine how\nyour streaming application performs. 
Others, like naming conventions and how\nmany streams to run on which clusters, have more to do with managing multiple\nstreaming applications in the same environment.\n\n\n-----\n\n**Unit testing**\n\n\nThe cost associated with finding and fixing a bug goes up exponentially\nthe farther along you get in the SDLC process, and a Structured Streaming\napplication is no different. When you’re turning that prototype into a hardened\nproduction pipeline you need a CI/CD process with built-in tests. So how do you\ncreate those tests?\n\nAt first you might think that unit testing a streaming pipeline requires something\nspecial, but that isn’t the case. The general guidance for streaming pipelines is\nno different than [guidance you may have heard for Spark batch jobs](https://docs.databricks.com/notebooks/testing.html) . It starts by\norganizing your code so that it can be unit tested effectively:\n\n**•** Divide your code into testable chunks\n\n**•** Organize your business logic into functions calling other functions.\nIf you have a lot of logic in a [foreachBatch](https://docs.databricks.com/structured-streaming/foreach.html) or you’ve implemented\n[mapGroupsWithState](https://docs.databricks.com/structured-streaming/initial-state-map-groups-with-state.html) or flatMapGroupsWithState, organize that code into\nmultiple functions that can be individually tested.\n\n**•** Do not code in dependencies on the global state or external systems\n\n**•** Any function manipulating a DataFrame or data set should be organized\nto take the DataFrame/data set/configuration as input and output the\nDataFrame/data set\n\nOnce your code is separated out in a logical manner you can implement unit\ntests for each of your functions. Spark-agnostic functions can be tested like any\nother function in that language. For testing UDFs and functions with DataFrames\nand data sets, there are multiple Spark testing frameworks available. These\n\n\nframeworks support all of the DataFrame/data set APIs so that you can easily\ncreate input, and they have specialized assertions that allow you to compare\nDataFrame content and schemas. Some examples are:\n\n**•** The built-in Spark test suite, designed to test all parts of Spark\n\n**•** spark-testing-base, which has support for both Scala and Python\n\n**•** spark-fast-tests, for testing Scala Spark 2 & 3\n\n**•** chispa, a Python version of spark-fast-tests\n\nCode examples for each of these libraries can be found [here](https://github.com/alexott/spark-playground/tree/master/testing) .\n\nBut wait! I’m testing a streaming application here — don’t I need to make\nstreaming DataFrames for my unit tests? The answer is no; you do not! Even\nthough a streaming DataFrame represents a data set with no defined ending,\nwhen functions are executed on it they are executed on a microbatch — a\ndiscrete set of data. You can use the same unit tests that you would use for a\nbatch application, for both stateless and stateful streams. One of the advantages\nof Structured Streaming over other frameworks is the ability to use the same\ntransformation code for both streaming and with other batch operations for\nthe same sink. This allows you to simplify some operations, like backfilling\ndata, for example, where rather than trying to sync the logic between two\ndifferent applications, you can just modify the input sources and write to the\nsame destination. 
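To ground this guidance, here is a minimal pytest-style sketch (chispa is assumed as the assertion library; the transformation and column names are illustrative) that exercises the same function you would reuse across the streaming and batch paths:

```
from chispa import assert_df_equality
from pyspark.sql import SparkSession, functions as F


def add_event_date(df):
    # Business logic under test: pure DataFrame-in / DataFrame-out,
    # so it behaves identically on a microbatch and on a batch table.
    return df.withColumn("event_date", F.to_date("event_ts"))


def test_add_event_date():
    spark = SparkSession.builder.master("local[1]").getOrCreate()
    input_df = spark.createDataFrame([("2022-12-12 10:15:00",)], ["event_ts"])
    expected = input_df.withColumn("event_date", F.lit("2022-12-12").cast("date"))
    assert_df_equality(add_event_date(input_df), expected, ignore_nullable=True)
```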
If the sink is a Delta table, you can even do these operations\nconcurrently if both processes are append-only operations.\n\n\n-----\n\n**Triggers**\n\n\nprocess a microbatch in order to maximize resource utilization, but setting the\ninterval longer would make sense if your stream is running on a shared cluster\nand you don’t want it to constantly take the cluster resources.\n\nIf you do not need your stream to run continuously, either because data doesn’t\ncome that often or your SLA is 10 minutes or greater, then you can use the\nTrigger.Once option. This option will start up the stream, check for anything new\nsince the last time it ran, process it all in one big batch, and then shut down.\nJust like with a continuously running stream when using Trigger.Once, the\ncheckpoint that guarantees fault tolerance (see below) will guarantee exactlyonce processing.\n\nSpark has a new version of Trigger.Once called Trigger.AvailableNow. While\nTrigger.Once will process everything in one big batch, which depending on your\ndata size may not be ideal, Trigger.AvailableNow will split up the data based on\nmaxFilesPerTrigger and maxBytesPerTrigger settings. This allows the data to be\nprocessed in multiple batches. Those settings are ignored with Trigger.Once.\nYou can see examples for setting triggers [here](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#triggers) .\n\n**Pop quiz —** how do you turn your streaming process into a batch process\nthat automatically keeps track of where it left off with just one line of code?\n\n**Answer —** change your processing time trigger to Trigger.Once/Trigger.\nAvailableNow! Exact same code, running on a schedule, that will neither miss nor\nreprocess any records.\n\n\nNow that you know your code works, you need to determine how often your\nstream will look for new data. This is where [triggers](https://docs.databricks.com/structured-streaming/triggers.html) come in. Setting a trigger is\none of the options for the writeStream command, and it looks like this:\n\n_// Scala/Java_\n.trigger(Trigger.ProcessingTime( \"30 seconds\" ))\n\n_# Python_\n.trigger(processingTime= '30 seconds' )\n\nIn the above example, if a microbatch completes in less than 30 seconds,\nthen the engine will wait for the rest of the time before kicking off the next\nmicrobatch. If a microbatch takes longer than 30 seconds to complete, then the\nengine will start the next microbatch immediately after the previous one finishes.\n\nThe two factors you should consider when setting your trigger interval are how\nlong you expect your stream to process a microbatch and how often you want\nthe system to check for new data. You can lower the overall processing latency\nby using a shorter trigger interval and increasing the resources available for\nthe streaming query by adding more workers or using compute or memory\noptimized instances tailored to your application’s performance. These increased\nresources come with increased costs, so if your goal is to minimize costs, then a\nlonger trigger interval with less compute can work. Normally you would not set a\ntrigger interval longer than what it would typically take for your stream to\n\n\n-----\n\n**Name your stream**\n\n\nYou name your children, you name your pets, now it’s time to name your streams.\nThere’s a writeStream option called .queryName that allows you to provide a\nfriendly name for your stream. Why bother? Well, suppose you don’t name it. 
In that case, all you'll have to go on in the Structured Streaming tab in the Spark UI is the unintelligible GUID that is automatically generated as the stream's unique identifier. If you have more than one stream running on a cluster, and all of them have only unintelligible strings as identifiers, how do you find the one you want? If you're exporting metrics, how do you tell which is which?

Make it easy on yourself, and name your streams. When you're managing them in production you'll be glad you did, and while you're at it, go and name your batch queries in any foreachBatch() code you have.

**Fault tolerance**

How does your stream recover from being shut down? There are a few different cases where this can come into play, like cluster node failures or intentional halts, but the solution is to set up checkpointing. Checkpoints with write-ahead logs provide a degree of protection from your streaming application being interrupted, ensuring it will be able to pick up again where it last left off.

Checkpoints store the current offsets and state values (e.g., aggregate values) for your stream. Checkpoints are stream specific, so each should be set to its own location. Doing this will let you recover more gracefully from shutdowns, failures in your application code, or unexpected cloud provider failures or limitations.

To configure checkpoints, add the checkpointLocation option to your stream definition:

_// Scala/Java/Python_
streamingDataFrame.writeStream
.format( "delta" )
.option( "path" , "" )
.queryName( "TestStream" )
.option( "checkpointLocation" , "" )
.start()

To keep it simple — every time you call .writeStream, you must specify the checkpoint option with a unique checkpoint location. Even if you're using foreachBatch and the writeStream itself doesn't specify a path or table option, you must still specify that checkpoint. It's how Spark Structured Streaming gives you hassle-free fault tolerance.

Efforts to manage the checkpointing in your stream should be of little concern in general. As [Tathagata Das has said](https://youtu.be/rl8dIzTpxrI?t=454), "The simplest way to perform streaming analytics is not having to reason about streaming at all." That said, one setting deserves mention as questions around the maintenance of checkpoint files come up occasionally. Though it is an internal setting that doesn't require direct configuration, the setting spark.sql.streaming.minBatchesToRetain (default 100) controls the number of checkpoint files that get created. Basically, the number of files will be roughly this number times two, as there is a file created noting the offsets at the beginning of the batch (offsets, a.k.a. write-ahead logs) and another on completing the batch (commits). The number of files is checked periodically for cleanup as part of the internal processes. This simplifies at least one aspect of long-term streaming application maintenance for you.

-----

It is also important to note that some changes to your application code can invalidate the checkpoint. Checking for any of these changes during code reviews before deployment is recommended.
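Putting the naming, trigger and checkpoint guidance together, here is a minimal sketch of a stream definition in Python. The source and target tables, the query name and the checkpoint path are placeholders, and availableNow assumes Spark 3.3 or later; on older versions, Trigger.Once plays the same role.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

events = spark.readStream.table("raw.events")   # placeholder streaming source

(events.writeStream
    .format("delta")
    .queryName("orders_bronze_to_silver")        # friendly name shown in the Structured Streaming tab
    .option("checkpointLocation", "/checkpoints/orders_bronze_to_silver")  # unique per stream
    .trigger(availableNow=True)                  # incremental batch run that stops when caught up
    .toTable("silver.orders"))
```

Swapping availableNow=True for processingTime='30 seconds' turns the same code back into a continuously running stream. Be careful when evolving a stream like this, though: some changes to the application code invalidate the existing checkpoint.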
You can find examples of changes\nwhere this can happen in [Recovery Semantics after Changes in a Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query)\n[Query](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#recovery-semantics-after-changes-in-a-streaming-query) . Suppose you want to look at checkpointing in more detail or consider\nwhether asynchronous checkpointing might improve the latency in your\nstreaming application. In that case, these are covered in greater depth in\n[Speed Up Streaming Queries With Asynchronous State Checkpointing](https://www.databricks.com/blog/2022/05/02/speed-up-streaming-queries-with-asynchronous-state-checkpointing.html) .\n\n**State management and RocksDB**\n\nStateful streaming applications are those where current records may depend\non previous events, so Spark has to retain data in between microbatches.\nThe data it retains is called state, and Spark will store it in a state store and\nread, update and delete it during each microbatch. Typical stateful operations\nare streaming aggregations, streaming dropDuplicates, stream-stream joins,\nmapGroupsWithState, or flatMapGroupsWithState. Some common types of\nexamples where you’ll need to think about your application state could be\nsessionization or hourly aggregation using group by methods to calculate\n\nbusiness metrics. Each record in the state store is identified by a key that is used\nas part of the stateful computation, and the more unique keys that are required\nthe larger the amount of state data that will be stored.\n\nWhen the amount of state data needed to enable these stateful operations\ngrows large and complex, it can degrade your workloads’ performance, leading\nto increased latency or even failures. A typical indicator of the state store being\n\n\nthe culprit of added latency is large amounts of time spent in garbage collection\n(GC) pauses in the JVM. If you are monitoring the microbatch processing time,\nthis could look like a continual increase or wildly varying processing time across\nmicrobatches.\n\nThe default configuration for a state store, which is sufficient for most general\nstreaming workloads, is to store the state data in the executors’ JVM memory.\nLarge number of keys (typically millions, see the Monitoring & Instrumentation\nsection in part 2 of this blog) can add excessive memory pressure on the\nmachine memory and increase the frequency of hitting these GC pauses as it\ntries to free up resources.\n\nOn the Databricks Runtime (now also supported in [Apache Spark 3.2+](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#rocksdb-state-store-implementation) ) you can\nuse [RocksDB](http://rocksdb.org/) as an alternative state store provider to alleviate this source of\nmemory pressure. RocksDB is an embeddable persistent key-value store for fast\nstorage. It features high performance through a log-structured database engine\nwritten entirely in C++ and optimized for fast, low-latency storage.\n\nLeveraging RocksDB as the state store provider still uses machine memory\nbut no longer occupies space in the JVM and makes for a more efficient\nstate management system for large amounts of keys. 
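Since bounding how much state you keep matters as much as where it is stored, here is a minimal sketch of using a watermark so that Spark can drop deduplication state it no longer needs. The source table and column names are placeholders.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

deduped = (
    spark.readStream.table("raw.events")            # placeholder streaming source
    .withWatermark("event_time", "30 minutes")      # state older than the watermark can be dropped
    .dropDuplicates(["event_id", "event_time"])     # include the event-time column in the dedup keys
)
```

Watermarks bound how much state accumulates over time; RocksDB, as described above, changes where that state lives and how it is managed.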
This doesn’t come for\nfree, however, as it introduces an extra step in processing every microbatch.\nIntroducing RocksDB shouldn’t be expected to reduce latency except when it is\nrelated to memory pressure from state data storage in the JVM. The RocksDBbacked state store still provides the same degree of fault tolerance as the\nregular state storage as it is included in the stream checkpointing.\n\n\n-----\n\nRocksDB configuration, like checkpoint configuration, is minimal by design and so\nyou only need to declare it in your overall Spark configuration:\n\nspark.conf. set (\n\"spark.sql.streaming.stateStore.providerClass\" ,\n\"com.databricks.sql.streaming.state.RocksDBStateStoreProvider\" )\n\nIf you are monitoring your stream using the streamingQueryListener class, then\nyou will also notice that RocksDB metrics will be included in the stateOperators\nfield. For more detailed information on this see the [RocksDB State Store Metrics](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics)\n[section](https://docs.databricks.com/spark/latest/structured-streaming/production.html#rocksdb-state-store-metrics) of “Structured Streaming in Production.”\n\nIt’s worth noting that large numbers of keys can have other adverse impacts in\naddition to raising memory consumption, especially with unbounded or nonexpiring state keys. With or without RocksDB, the state from the application\nalso gets backed up in checkpoints for fault tolerance. So it makes sense that\nif you have state files being created so that they will not expire, you will keep\naccumulating files in the checkpoint, increasing the amount of storage required\nand potentially the time to write it or recover from failures as well. For the data\nin memory (see the Monitoring & Instrumentation section in part 2 of this blog)\n\nthis situation can lead to somewhat vague out-of-memory errors, and for the\ncheckpointed data written to cloud storage you might observe unexpected\nand unreasonable growth. Unless you have a business need to retain streaming\nstate for all the data that has been processed (and that is rare), read the [Spark](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html)\n[Structured Streaming documentation](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) and make sure to implement your stateful\noperations so that the system can drop state records that are no longer needed\n(pay close attention to dropDuplicates and stream-stream joins).\n\n\n**Running multiple streams on a cluster**\n\nOnce your streams are fully tested and configured, it’s time to figure out how to\norganize them in production. It’s a common pattern to stack multiple streams on\nthe same Spark cluster to maximize resource utilization and save cost. This is fine\nto a point, but there are limits to how much you can add to one cluster before\nperformance is affected. The driver has to manage all of the streams running on\nthe cluster, and all streams will compete for the same cores across the workers.\nYou need to understand what your streams are doing and plan your capacity\nappropriately to stack effectively.\n\nHere is what you should take into account when you’re planning on stacking\nmultiple streams on the same cluster:\n\n**•** Make sure your driver is big enough to manage all of your streams. Is your\ndriver struggling with a high CPU utilization and garbage collection? That\nmeans it’s struggling to manage all of your streams. 
Either reduce the\nnumber of streams or increase the size of your driver.\n\n**•** Consider the amount of data each stream is processing. The more data\nyou are ingesting and writing to a sink, the more cores you will need in\norder to maximize your throughput for each stream. You’ll need to reduce\nthe number of streams or increase the number of workers depending on\nhow much data is being processed. For sources like Kafka you will need to\nconfigure how many cores are being used to ingest with the minPartitions\noption if you don’t have enough cores for all of the partitions across all of\nyour streams.\n\n\n-----\n\n**•** Consider the complexity and data volume of your streams. If all of the\nstreams are doing minimal manipulation and just appending to a sink, then\neach stream will need fewer resources per microbatch and you’ll be able to\nstack more. If the streams are doing stateful processing or computation/\nmemory-intensive operations, that will require more resources for good\nperformance and you’ll want to stack fewer streams.\n\n**•** Consider [scheduler pools](https://spark.apache.org/docs/latest/job-scheduling.html#fair-scheduler-pools) . When stacking streams they will all be\ncontending for the same workers and cores, and one stream that needs a\nlot of cores will cause the other streams to wait. Scheduler pools enable\nyou to have different streams execute on different parts of the cluster.\nThis will enable streams to execute in parallel with a subset of the available\nresources.\n\n\n**Conclusion**\n\nSome of the ideas we’ve addressed here certainly deserve their own time\nand special treatment with a more in-depth discussion, which you can look\nforward to in later deep dives. However, we hope these recommendations are\nuseful as you begin your journey or seek to enhance your production streaming\nexperience. Be sure to continue with the next post, “Streaming in Production:\nCollected Best Practices, Part 2.”\n\n**[Review Databrick’s Structured Streaming Getting Started Guide](https://www.databricks.com/spark/getting-started-with-apache-spark/streaming)**\n\n\n\n**•** Consider your SLA. If you have mission critical streams, isolate them as a\nbest practice so lower-criticality streams do not affect them.\n\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nOn Databricks we typically see customers stack between 10-30 streams on a\ncluster, but this varies depending on the use case. Consider the factors above so\nthat you can have a good experience with performance, cost and maintainability.\n\n\n-----\n\nSECTION 2.5 \u0007\n\n**Streaming in Production: Collected Best Practices, Part 2**\n\nby **A N G E L A C H U** and **T R I S T E N W E N T L I N G**\n\nJanuary 10, 2023\n\n\nIn our two-part blog series titled “Streaming in Production: Collected Best\nPractices,” this is the second article. Here we discuss the “After Deployment”\nconsiderations for a Structured Streaming Pipeline. 
The majority of the\nsuggestions in this post are relevant to both Structured Streaming Jobs and\nDelta Live Tables (our flagship and fully managed ETL product that supports\nboth batch and streaming pipelines).\n\n**After deployment**\n\nAfter the deployment of your streaming application, there are typically three\nmain things you’ll want to know:\n\n**•** How is my application running?\n\n**•** Are resources being used efficiently?\n\n**•** How do I manage any problems that come up?\n\nWe’ll start with an introduction to these topics, followed by a deeper dive later in\nthis blog series.\n\n\n**Monitoring and instrumentation (How is my application running?)**\n\nStreaming workloads should be pretty much hands-off once deployed to\nproduction. However, one thing that may sometimes come to mind is: “how is my\napplication running?” Monitoring applications can take on different levels and\nforms depending on:\n\n**•** the metrics collected for your application (batch duration/latency,\nthroughput, …)\n\n**•** where you want to monitor the application from\n\nAt the simplest level, there is a streaming dashboard ( [A Look at the New](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html)\n[Structured Streaming UI](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) ) and built-in logging directly in the Spark UI that can be\nused in a variety of situations.\n\nThis is in addition to setting up failure alerts on jobs running streaming\nworkloads.\n\nIf you want more fine-grained metrics or to create custom actions based on\nthese metrics as part of your code base, then the StreamingQueryListener is\nbetter aligned with what you’re looking for.\n\n\n-----\n\nIf you want the Spark metrics to be reported (including machine level traces for\ndrivers or workers) you should use the platform’s [metrics sink](https://spark.apache.org/docs/latest/monitoring.html#metrics) .\n\nThe Apache Spark Structured Streaming UI\n\n\nAnother point to consider is where you want to surface these metrics for\nobservability. There is a Ganglia dashboard at the cluster level, integrated partner\napplications like [Datadog](https://www.datadoghq.com/blog/databricks-monitoring-datadog/) for monitoring streaming workloads, or even more open\nsource options you can build using tools like Prometheus and Grafana. Each\nhas advantages and disadvantages to consider around cost, performance, and\nmaintenance requirements.\n\nWhether you have low volumes of streaming workloads where interactions in the\nUI are sufficient or have decided to invest in a more robust monitoring platform,\nyou should know how to observe your production streaming workloads. Further\n“Monitoring and Alerting” posts later in this series will contain a more thorough\ndiscussion. In particular, we’ll see different measures on which to monitor\nstreaming applications and then later take a deeper look at some of the tools\nyou can leverage for observability.\n\n**Application optimization (Are resources being used effectively?**\n\n**Think “cost”)**\n\nThe next concern we have after deploying to production is “is my application\n\nusing resources effectively?” As developers, we understand (or quickly learn) the\ndistinction between working code and well-written code. Improving the way your\ncode runs is usually very satisfying, but what ultimately matters is the overall\ncost of running it. 
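Before digging into cost, it is worth making the monitoring hook mentioned earlier concrete. Below is a minimal sketch of a custom StreamingQueryListener, assuming PySpark 3.4 or later, where the listener API is exposed to Python; on earlier versions the same hook exists in Scala/Java.

```python
from pyspark.sql import SparkSession
from pyspark.sql.streaming import StreamingQueryListener


class SimpleProgressListener(StreamingQueryListener):
    """Logs a one-line summary for every completed microbatch."""

    def onQueryStarted(self, event):
        print(f"Stream started: name={event.name} id={event.id}")

    def onQueryProgress(self, event):
        p = event.progress
        print(f"{p.name}: batch={p.batchId} rows={p.numInputRows} "
              f"rows/sec={p.processedRowsPerSecond}")

    def onQueryTerminated(self, event):
        print(f"Stream terminated: id={event.id}")


spark = SparkSession.builder.getOrCreate()
spark.streams.addListener(SimpleProgressListener())
```

In production you would forward these metrics to your observability tool of choice rather than printing them; the same numbers (batch duration, input and processing rates) are also the raw material for the sizing and cost decisions that follow.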
Cost considerations for Structured Streaming applications will\nbe largely similar to those for other Spark applications. One notable difference\nis that failing to optimize for production workloads can be extremely costly,\nas these workloads are frequently “always-on” applications, and thus wasted\nexpenditure can quickly compound. Because assistance with cost optimization is\n\n\n-----\n\nfrequently requested, a separate post in this series will address it. The key points\nthat we’ll focus on will be efficiency of usage and sizing.\n\nGetting the cluster sizing right is one of the most significant differences between\nefficiency and wastefulness in streaming applications. This can be particularly\ntricky because in some cases it’s difficult to estimate the full load conditions of\nthe application in production before it’s actually there. In other cases, it may be\ndifficult due to natural variations in volume handled throughout the day, week, or\nyear. When first deploying, it can be beneficial to oversize slightly, incurring the\nextra expense to avoid inducing performance bottlenecks. Utilize the monitoring\ntools you chose to employ after the cluster has been running for a few weeks\nto ensure proper cluster utilization. For example, are CPU and memory levels\nbeing used at a high level during peak load or is the load generally small and the\ncluster may be downsized? Maintain regular monitoring of this and keep an eye\nout for changes in data volume over time; if either occurs, a cluster resize may be\nrequired to maintain cost-effective operation.\n\nAs a general guideline, you should avoid excessive shuffle operations, joins, or an\nexcessive or extreme watermark threshold (don’t exceed your needs), as each\ncan increase the number of resources you need to run your application. A large\nwatermark threshold will cause Structured Streaming to keep more data in the\nstate store between batches, leading to an increase in memory requirements\nacross the cluster. Also, pay attention to the type of VM configured — are you\nusing memory-optimized for your memory-intense stream? Compute-optimized\nfor your computationally-intensive stream? If not, look at the utilization levels\nfor each and consider trying a machine type that could be a better fit. Newer\nfamilies of servers from cloud providers with more optimal CPUs often lead to\nfaster execution, meaning you might need fewer of them to meet your SLA.\n\n\n**Troubleshooting (How do I manage any problems that come up?)**\n\nThe last question we ask ourselves after deployment is “how do I manage any\nproblems that come up?” As with cost optimization, troubleshooting streaming\napplications in Spark often looks the same as other applications since most of\nthe mechanics remain the same under the hood. For streaming applications,\nissues usually fall into two categories — failure scenarios and latency scenarios\n\n**Failure scenarios**\n\nFailure scenarios typically manifest with the stream stopping with an error,\nexecutors failing or a driver failure causing the whole cluster to fail. Common\ncauses for this are:\n\n**•** Too many streams running on the same cluster, causing the driver to be\noverwhelmed. 
On Databricks, this can be seen in Ganglia, where the driver node will show up as overloaded before the cluster fails.

**•** Too few workers in a cluster or a worker size with too small of a core-to-memory ratio, causing executors to fail with an Out Of Memory error. This can also be seen on Databricks in Ganglia before an executor fails, or in the Spark UI under the executors tab.

**•** Using a collect to send too much data to the driver, causing it to fail with an Out Of Memory error.

-----

**Latency scenarios**

For latency scenarios, your stream will not execute as fast as you want or expect. A latency issue can be intermittent or constant. Too many streams or too small of a cluster can be the cause of this as well. Some other common causes are:

**•** Data skew — when a few tasks end up with much more data than the rest of the tasks. With skewed data, these tasks take longer to execute than the others, often spilling to disk. Your stream can only run as fast as its slowest task.

**•** Executing a stateful query without defining a watermark or defining a very long one will cause your state to grow very large, slowing down your stream over time and potentially leading to failure.

**•** Poorly optimized sink. For example, performing a merge into an over-partitioned Delta table as part of your stream.

**•** Stable but high latency (batch execution time). Depending on the cause, adding more workers to increase the number of cores concurrently available for Spark tasks can help. Increasing the number of input partitions and/or decreasing the load per core through batch size settings can also reduce the latency.

Just like troubleshooting a batch job, you'll use Ganglia to check cluster utilization and the Spark UI to find performance bottlenecks. There is a specific [Structured Streaming tab](https://www.databricks.com/blog/2020/07/29/a-look-at-the-new-structured-streaming-ui-in-apache-spark-3-0.html) in the Spark UI created to help monitor and troubleshoot streaming applications. On that tab each stream that is running will be listed, and you'll see either your stream name if you named your stream, or an unintelligible identifier if you didn't. You'll also see a stream ID that will be visible on the Jobs tab of the Spark UI so that you can tell which jobs are for a given stream.

You'll notice above we said which jobs are for a given stream. It's a common misconception that if you were to look at a streaming application in the Spark UI you would just see one job in the Jobs tab running continuously. Instead, depending on your code, you will see one or more jobs that start and complete for each microbatch. Each job will have the stream ID from the Structured Streaming tab and a microbatch number in the description, so you'll be able to tell which jobs go with which stream. You can click into those jobs to find the longest running stages and tasks, check for disk spills, and search by Job ID in the SQL tab to find the slowest queries and check their explain plans.

The Jobs tab in the Apache Spark UI

-----

If you click on your stream in the Structured Streaming tab you'll see how much time the different streaming operations are taking for each microbatch, such as adding a batch, query planning and committing (see earlier screenshot of the Apache Spark Structured Streaming UI).
You can also see how many rows are\nbeing processed as well as the size of your state store for a stateful stream.\nThis can give insights into where potential latency issues are.\n\nWe will go more in-depth with troubleshooting later in this blog series, where\nwe’ll look at some of the causes and remedies for both failure scenarios and\nlatency scenarios as we outlined above.\n\n**Conclusion**\n\nYou may have noticed that many of the topics covered here are very similar to\nhow other production Spark applications should be deployed. Whether your\nworkloads are primarily streaming applications or batch processes, the majority\nof the same principles will apply. We focused more on things that become\nespecially important when building out streaming applications, but as we’re\n\n\nsure you’ve noticed by now, the topics we discussed should be included in\nmost production deployments.\n\nAcross the majority of industries in the world today information is needed\nfaster than ever, but that won’t be a problem for you. With Spark Structured\nStreaming you’re set to make it happen at scale in production. Be on the lookout\nfor more in-depth discussions on some of the topics we’ve covered in this blog,\nand in the meantime keep streaming!\n\n**[Review Databricks Structured Streaming in](https://docs.databricks.com/structured-streaming/production.html)**\n**[Production Documentation](https://docs.databricks.com/structured-streaming/production.html)**\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.6 \u0007\n\n**Building Geospatial Data Products**\n\nby **M I L O S C O L I C**\n\nJanuary 6, 2023\n\n\nGeospatial data has been driving innovation for centuries, through use of\nmaps, cartography and more recently through digital content. For example,\nthe oldest map has been found etched in a piece of mammoth tusk and dates\n[approximately 25,000 BC](https://en.wikipedia.org/wiki/History_of_cartography) . This makes geospatial data one of the oldest data\nsources used by society to make decisions. A more recent example, labeled\nas the birth of spatial analysis, is that of Charles Picquet in 1832 who used\ngeospatial data to analyze [Cholera outbreaks in Paris](https://gallica.bnf.fr/ark:/12148/bpt6k842918.image) ; a couple of decades\nlater John Snow in 1854 followed the same approach for [Cholera outbreaks in](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak)\n[London](https://en.wikipedia.org/wiki/1854_Broad_Street_cholera_outbreak) . These two individuals used geospatial data to solve one of the toughest\nproblems of their times and in effect save countless lives. Fast-forwarding to the\n20th century, the concept of [Geographic Information Systems (GIS)](https://education.nationalgeographic.org/resource/geographic-information-system-gis) was [first](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf)\n[introduced](https://gisandscience.files.wordpress.com/2012/08/3-an-introduction-to-the-geo-information-system-of-the-canada-land-inventory.pdf) in 1967 in Ottawa, Canada, by the Department of Forestry and\nRural Development.\n\nToday we are in the midst of the cloud computing industry revolution —\nsupercomputing scale available to any organization, virtually infinitely scalable\nfor both storage and compute. 
Concepts like [data mesh](https://www.databricks.com/blog/2022/10/19/building-data-mesh-based-databricks-lakehouse-part-2.html) and [data marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html)\nare emerging within the data community to address questions like platform\nfederation and interoperability. How can we adopt these concepts to geospatial\ndata, spatial analysis and GIS systems? By adopting the concept of data\nproducts and approaching the design of geospatial data as a product.\n\n\nIn this blog we will provide a point of view on how to design scalable geospatial\ndata products that are modern and robust. We will discuss how Databricks\nLakehouse Platform can be used to unlock the full potential of geospatial\nproducts that are one of the most valuable assets in solving the toughest\nproblems of today and the future.\n\n**What is a data product? And how to design one?**\n\nThe most broad and the most concise definition of a “data product” was coined\nby DJ Patil (the first U.S. Chief Data Scientist) in _Data Jujitsu: The Art of Turning_\n_Data into Product:_ “a product that facilitates an end goal through the use of\ndata.” The complexity of this definition (as admitted by Patil himself) is needed to\nencapsulate the breadth of possible products, to include dashboards, reports, Excel\n\nspreadsheets, and even CSV extracts shared via emails. You might notice that the\nexamples provided deteriorate rapidly in quality, robustness and governance.\n\nWhat are the concepts that differentiate a successful product versus an\nunsuccessful one? Is it the packaging? Is it the content? Is it the quality of the\ncontent? Or is it only the product adoption in the market? Forbes defines the\n10 must-haves of a successful product. A good framework to summarize this is\nthrough the value pyramid.\n\n\n-----\n\nFigure 1: Product value pyramid (source)\n\nThe value pyramid provides a priority on each aspect of the product. Not every\nvalue question we ask about the product carries the same amount of weight. If\nthe output is not useful none of the other aspects matter — the output isn’t really\na product but becomes more of a data pollutant to the pool of useful results.\nLikewise, scalability only matters after simplicity and explainability are addressed.\n\nHow does the value pyramid relate to the data products? Each data output, in\norder to be a data product:\n\n**•** **Should have clear usefulness.** The amount of the data society is\ngenerating is rivaled only by the amount of data pollutants we are\ngenerating. These are outputs lacking clear value and use, much less a\nstrategy for what to do with them.\n\n\n\n**•** **Should be explainable.** With the emergence of AI/ML, explainability has\nbecome even more important for data driven decision-making. Data\nis as good as the metadata describing it. Think of it in terms of food —\ntaste does matter, but a more important factor is the nutritional value\nof ingredients.\n\n**•** **Should be simple.** An example of product misuse is using a fork to eat\ncereal instead of using a spoon. Furthermore, simplicity is essential but\nnot sufficient — beyond simplicity the products should be intuitive.\nWhenever possible both intended and unintended uses of the data\nshould be obvious.\n\n**•** **Should be scalable.** Data is one of the few resources that grows with\nuse. The more data you process the more data you have. 
If both inputs\nand outputs of the system are unbounded and ever-growing, then the\nsystem has to be scalable in compute power, storage capacity and\ncompute expressive power. Cloud data platforms like Databricks are in\na unique position to answer for all of the three aspects.\n\n**•** **Should generate habits.** In the data domain we are not concerned\nwith customer retention as is the case for the retail products. However,\nthe value of habit generation is obvious if applied to best practices.\nThe systems and data outputs should exhibit the best practices and\npromote them — it should be easier to use the data and the system in\nthe intended way than the opposite.\n\nThe geospatial data should adhere to all the aforementioned aspects — any data\nproducts should. On top of this tall order, geospatial data has some specific needs.\n\n\n-----\n\n**Geospatial data standards**\n\n\n\n**•** **“Advocate the understanding and use of geospatial data standards**\n**within other sectors of government.”** — Value pyramid applies to\nthe standards as well — concepts like ease of adherence (usefulness/\nsimplicity), purpose of the standard (explainability/usefulness), adoption\n(habit generation) are critical for the value generation of a standard.\n\nA critical tool for achieving the data standards mission is the [FAIR](https://www.go-fair.org/fair-principles/) data\nprinciples:\n\n**•** **Findable** — The first step in (re)using data is to find them. Metadata\nand data should be easy to find for both humans and computers.\nMachine-readable metadata are essential for automatic discovery of\ndata sets and services.\n\n**•** **Accessible** — Once the user finds the required data, she/he/they\nneed to know how they can be accessed, possibly including\nauthentication and authorization.\n\n**•** **Interoperable** — The data usually needs to be integrated with\nother data. In addition, the data needs to interoperate with\napplications or workflows for analysis, storage, and processing.\n\n**•** **Reusable** — The ultimate goal of FAIR is to optimize the reuse of data.\nTo achieve this, metadata and data should be well-described so that\nthey can be replicated and/or combined in different settings.\n\n\nGeospatial data standards are used to ensure that geographic data is collected,\norganized, and shared in a consistent and reliable way. These standards can\ninclude guidelines for things like data formatting, coordinate systems, map\nprojections, and metadata. Adhering to standards makes it easier to share data\nbetween different organizations, allowing for greater collaboration and broader\naccess to geographic information.\n\nThe Geospatial Commision (UK government) has defined the UK Geospatial\nData Standards Register as a central repository for data standards to be applied\nin the case of geospatial data. Furthermore, the mission of this registry is to:\n\n**•** **“Ensure UK geospatial data is more consistent and coherent and usable**\n**across a wider range of systems.”** — These concepts are a callout for the\nimportance of explainability, usefulness and habit generation (possibly\nother aspects of the value pyramid).\n\n**•** **“Empower the UK geospatial community to become more engaged with**\n**the relevant standards and standards bodies.”** — Habit generation within\nthe community is as important as the robust and critical design on the\nstandard. 
If not adopted standards are useless.\n\n\n-----\n\nWe share the belief that the FAIR principles are crucial for the design of scalable\ndata products we can trust. To be fair, FAIR is based on common sense, so why\nis it key to our considerations? _“What I see in FAIR is not new in itself, but what it_\n_does well is to articulate, in an accessible way, the need for a holistic approach_\n_to data improvement. This ease in communication is why FAIR is being used_\n_increasingly widely as an umbrella for data improvement — and not just in the_\n_geospatial community.”_ — [A FAIR wind sets our course for data improvement](https://geospatialcommission.blog.gov.uk/2022/03/02/a-fair-wind-sets-our-course-for-data-improvement/) .\n\nTo further support this approach, the [Federal Geographic Data Committee](https://www.fgdc.gov/standards) has\ndeveloped the [National Spatial Data Infrastructure (NSDI) Strategic Plan](https://www.fgdc.gov/nsdi-plan/nsdi-strategic-plan-2021-2024.pdf) that\ncovers the years 2021-2024 and was approved in November 2020. The goals\nof NSDI are in essence FAIR principles and convey the same message of designing\nsystems that promote the circular economy of data — data products that flow\nbetween organizations following common standards and in each step through the\ndata supply chain unlock new value and new opportunities. The fact that these\nprinciples are permeating different jurisdictions and are adopted across different\nregulators is a testament to the robustness and soundness of the approach.\n\n\nThe FAIR concepts weave really well together with the data product design.\nIn fact FAIR is traversing the whole product value pyramid and forms a value\ncycle. By adopting both the value pyramid and FAIR principles we design data\nproducts with both internal and external outlook. This promotes data reuse\nas opposed to data accumulation.\n\nWhy do FAIR principles matter for geospatial data and geospatial data\n\nproducts? FAIR is transcendent to geospatial data, it is actually transcendent\nto data, it is a simple yet coherent system of guiding principles for good design\n— and that good design can be applied to anything including geospatial data\nand geospatial systems.\n\n\nFigure 2:\nNDSI Strategic Goals\n\n\n-----\n\n**Grid index systems**\n\nIn traditional GIS solutions’ performance of spatial operations are usually\nachieved by building tree structures ( [KD trees](https://en.wikipedia.org/wiki/K-d_tree) , [ball trees](https://www.researchgate.net/publication/283471105_Ball-tree_Efficient_spatial_indexing_for_constrained_nearest-neighbor_search_in_metric_spaces) , [Quad trees](https://en.wikipedia.org/wiki/Quadtree) , etc).\nThe issue with tree approaches is that they eventually break the scalability\nprinciple — when the data is too big to be processed in order to build the tree\nand the computation required to build the tree is too long and defeats the\npurpose. This also negatively affects the accessibility of data; if we cannot\nconstruct the tree we cannot access the complete data and in effect we cannot\nreproduce the results. In this case, grid index systems provide a solution.\n\n\nGrid index systems are built from the start with the scalability aspects of the\ngeospatial data in mind. Rather than building the trees, they define a series of\ngrids that cover the area of interest. 
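To make that idea concrete before looking at specific systems, the sketch below uses the H3 expressions built into Databricks (an assumption: Databricks Runtime 11.2 or later, where these SQL functions are available) to map a longitude/latitude pair to its grid cell at two resolutions; the coordinates are arbitrary.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Same point, two resolutions: coarser cells for regional rollups,
# finer cells for local analysis.
spark.sql("""
    SELECT
      h3_longlatash3string(-0.1278, 51.5074, 7)  AS cell_res_7,
      h3_longlatash3string(-0.1278, 51.5074, 11) AS cell_res_11
""").show(truncate=False)
```

Because the mapping from location to cell is purely computational, it can be applied in parallel across arbitrarily large datasets without first building an index.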
In the case of [H3](https://h3geo.org/) (pioneered by Uber),\nthe grid covers the area of the Earth; in the case of local grid index systems\n(e.g., [British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) ) they may only cover the specific area of interest.\nThese grids are composed of cells that have unique identifiers. There is a\nmathematical relationship between location and the cell in the grid. This makes\nthe grid index systems very scalable and parallel in nature.\n\n\nFigure 4: Grid Index Systems (H3, British National Grid)\n\n\n-----\n\nAnother important aspect of grid index systems is that they are open source,\nallowing index values to be universally leveraged by data producers and\nconsumers alike. Data can be enriched with the grid index information at any\nstep of its journey through the data supply chain. This makes the grid index\nsystems an example of community driven data standards. Community driven\ndata standards by nature do not require enforcement, which fully adheres\nto the habit generation aspect of value pyramid and meaningfully addresses\ninteroperability and accessibility principles of FAIR.\n\n\nDatabricks has recently announced [native support for the H3 grid index system](https://www.databricks.com/blog/2022/09/14/announcing-built-h3-expressions-geospatial-processing-and-analytics.html)\nfollowing the same value proposition. Adopting common industry standards\ndriven by the community is the only way to properly drive habit generation and\ninteroperability. To strengthen this statement, organizations like [CARTO](https://carto.com/blog/hexagons-for-location-intelligence/) , [ESRI](https://www.esri.com/arcgis-blog/products/bus-analyst/analytics/using-uber-h3-hexagons-arcgis-business-analyst-pro/)\nand [Google](https://opensource.googleblog.com/2017/12/announcing-s2-library-geometry-on-sphere.html) have been promoting the usage of grid index systems for scalable\nGIS system design. In addition, Databricks Labs project [Mosaic](https://databrickslabs.github.io/mosaic/) supports the\n[British National Grid](https://en.wikipedia.org/wiki/Ordnance_Survey_National_Grid) as the standard grid index system that is widely used in\nthe UK government. Grid index systems are key for the scalability of geospatial\ndata processing and for properly designing solutions for complex problems\n(e.g., figure 5 — flight holding patterns using H3).\n\n**Geospatial data diversity**\n\nGeospatial data standards spend a solid amount of effort regarding data\nformat standardization, and format for that matter is one of the most\nimportant considerations when it comes to interoperability and reproducibility.\nFurthermore, if the reading of your data is complex — how can we talk about\nsimplicity? Unfortunately geospatial data formats are typically complex, as\ndata can be produced in a number of formats including both open source\n\nand vendor-specific formats. Considering only vector data, we can expect\ndata to arrive in WKT, WKB, GeoJSON, web CSV, CSV, Shape File, GeoPackage,\nand many others. On the other hand, if we are considering raster data we can\nexpect data to arrive in any number of formats such as GeoTiff, netCDF, GRIB, or\nGeoDatabase; for a comprehensive list of formats please consult this [blog](https://gisgeography.com/gis-formats/) .\n\n\nFigure 5: Example of using H3 to express flight holding patterns\n\n\n-----\n\nGeospatial data domain is so diverse and has organically grown over the years\naround the use cases it was addressing. 
Unification of such a diverse ecosystem\nis a massive challenge. A recent effort by the Open Geospatial Consortium\n(OGC) to standardize to [Apache Parquet](https://parquet.apache.org/) and its geospatial schema specification\n[GeoParquet](https://geoparquet.org/) is a step in the right direction. Simplicity is one of the key aspects\nof designing a good scalable and robust product — unification leads to simplicity\nand addresses one of the main sources of friction in the ecosystem — the data\ningestion. Standardizing to GeoParquet brings a lot of value that addresses all of\nthe aspects of FAIR data and value pyramid.\n\nFigure 6: Geoparquet as a geospatial standard data format\n\n\nWhy introduce another format into an already complex ecosystem? GeoParquet\nisn’t a new format — it is a schema specification for Apache Parquet format that\nis already widely adopted and used by the industry and the community. Parquet\nas the base format supports binary columns and allows for storage of arbitrary\ndata payload. At the same time the format supports structured data columns\nthat can store metadata together with the data payload. This makes it a choice\nthat promotes interoperability and reproducibility. Finally, [Delta Lake](https://delta.io/) format\nhas been built on top of parquet and brings [ACID](https://en.wikipedia.org/wiki/ACID) properties to the table. ACID\nproperties of a format are crucial for reproducibility and for trusted outputs. In\naddition, Delta is the format used by scalable data sharing solution [Delta Sharing](https://www.databricks.com/product/delta-sharing) .\n\nDelta Sharing enables enterprise scale data sharing between any public cloud\nusing Databricks (DIY options for private cloud are available using open source\nbuilding blocks). Delta Sharing completely abstracts the need for custom built\nRest APIs for exposing data to other third parties. Any data asset stored in Delta\n(using GeoParquet schema) automatically becomes a data product that can be\nexposed to external parties in a controlled and governed manner. Delta Sharing\nhas been built from the ground up with [security best practices in mind](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html?utm_source=bambu&utm_medium=social&utm_campaign=advocacy&blaid=3352307) .\n\n\n-----\n\nFigure 7: Delta Sharing simplifying data access in the ecosystem\n\n**Circular data economy**\n\n\nBorrowing the concepts from the sustainability domain, we can define a circular\ndata economy as a system in which data is collected, shared, and used in a way\nthat maximizes its value while minimizing waste and negative impacts, such as\nunnecessary compute time, untrustworthy insights, or biased actions based\ndata pollutants. Reusability is the key concept in this consideration — how can\nwe minimize the \"reinvention of the wheel.\" There are countless data assets out\nin the wild that represent the same area, same concepts with just ever slight\nalterations to better match a specific use case. Is this due to the actual\n\n\noptimizations or due to the fact it was easier to create a new copy of the assets\nthan to reuse the existing ones? 
Or was it too hard to find the existing data\nassets, or maybe it was too complex to define data access patterns.\n\nData asset duplication has many negative aspects in both FAIR considerations\nand data value pyramid considerations — having many disparate similar (but\ndifferent) data assets that represent the same area and same concepts can\ndeteriorate simplicity considerations of the data domain — it becomes hard\nto identify the data asset we actually can trust. It can also have very negative\n\n\n-----\n\nimplications toward habit generation. Many niche communities will emerge\nthat will standardize to themselves ignoring the best practices of the wider\necosystem, or worse yet they will not standardize at all.\n\nIn a circular data economy, data is treated as a valuable resource that can be\nused to create new products and services, as well as improving existing ones.\nThis approach encourages the reuse and recycling of data, rather than treating it\nas a disposable commodity. Once again, we are using the sustainability analogy\nin a literal sense — we argue that this is the correct way of approaching the\nproblem. Data pollutants are a real challenge for organizations both internally and\nexternally. An article by The Guardian states that less than 1% of collected data is\nactually analyzed. There is too much data duplication, the majority of data is hard\nto access and deriving actual value is too cumbersome. Circular data economy\npromotes best practices and reusability of existing data assets allowing for a more\nconsistent interpretation and insights across the wider data ecosystem.\n\n\nFigure 8: Databricks Marketplace\n\n\n-----\n\nInteroperability is a key component of FAIR data principles, and from\ninteroperability a question of circularity comes to mind. How can we design an\necosystem that maximizes data utilization and data reuse? Once again, FAIR\ntogether with the value pyramid holds answers. Findability of the data is key to\nthe data reuse and to solving for data pollution. With data assets that can be\ndiscovered easily we can avoid the recreation of same data assets in multiple\nplaces with just slight alteration. Instead we gain a coherent data ecosystem\nwith data that can be easily combined and reused. Databricks has recently\nannounced the [Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-databricks-marketplace-an-open-marketplace-for-all-data-and-ai-assets.html) . The idea behind the marketplace is in\nline with the original definition of data product by DJ Patel. The marketplace\nwill support sharing of data sets, notebooks, dashboards, and machine learning\nmodels. The critical building block for such a marketplace is the concept of\nDelta Sharing — the scalable, flexible and robust channel for sharing any data —\ngeospatial data included.\n\n\nDesigning scalable data products that will live in the marketplace is crucial.\nIn order to maximize the value add of each data product one should strongly\nconsider FAIR principles and the product value pyramid. Without these guiding\nprinciples we will only increase the issues that are already present in the\ncurrent systems. 
Each data product should solve a unique problem and should\nsolve it in a simple, reproducible and robust way.\n\n**You can read more on how Databricks Lakehouse**\n**Platform can help you accelerate time to value from**\n**your data products in the eBook:** **[A New Approach](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)**\n**[to Data Sharing](https://www.databricks.com/p/ebook/a-new-approach-to-data-sharing)** **.**\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.7 \u0007\n\n**Data Lineage With Unity Catalog**\n\nby **P A U L R O O M E , TA O F E N G A N D S A C H I N T H A K U R**\n\nJune 8, 2022\n\n\nThis blog will discuss the importance of data lineage, some of the common\nuse cases, our vision for better data transparency and data understanding with\ndata lineage.\n\n**What is data lineage and why is it important?**\n\nData lineage describes the transformations and refinements of data from source\nto insight. Lineage includes capturing all the relevant metadata and events\nassociated with the data in its lifecycle, including the source of the data set,\nwhat other data sets were used to create it, who created it and when, what\ntransformations were performed, what other data sets leverage it, and many other\nevents and attributes. With a data lineage solution, data teams get an end-to-end\nview of how data is transformed and how it flows across their data estate.\n\nAs more and more organizations embrace a data-driven culture and set up\nprocesses and tools to democratize and scale data and AI, data lineage is\nbecoming an essential pillar of a pragmatic data management and governance\nstrategy.\n\nTo understand the importance of data lineage, we have highlighted some of the\ncommon use cases we have heard from our customers below.\n\n\n**Impact analysis**\nData goes through multiple updates or revisions over its lifecycle, and\nunderstanding the potential impact of any data changes on downstream\nconsumers becomes important from a risk management standpoint. With data\nlineage, data teams can see all the downstream consumers — applications,\ndashboards, machine learning models or data sets, etc. — impacted by data\nchanges, understand the severity of the impact, and notify the relevant\nstakeholders. Lineage also helps IT teams proactively communicate data\nmigrations to the appropriate teams, ensuring business continuity.\n\n**Data understanding and transparency**\nOrganizations deal with an influx of data from multiple sources, and building\na better understanding of the context around data is paramount to ensure\nthe trustworthiness of the data. Data lineage is a powerful tool that enables\ndata leaders to drive better transparency and understanding of data in their\norganizations. Data lineage also empowers data consumers such as data scientists,\ndata engineers and data analysts to be context-aware as they perform analyses,\nresulting in better quality outcomes. Finally, data stewards can see which data sets\nare no longer accessed or have become obsolete to retire unnecessary data and\nensure data quality for end business users .\n\n\n-----\n\n**Debugging and diagnostics**\nYou can have all the checks and balances in place, but something will eventually\nbreak. Data lineage helps data teams perform a root cause analysis of any errors\nin their data pipelines, applications, dashboards, machine learning models, etc.,\nby tracing the error to its source. 
This significantly reduces the debugging time,\nsaving days, or in many cases, months of manual effort.\n\n**Compliance and audit readiness**\nMany compliance regulations, such as the General Data Protection Regulation\n(GDPR), California Consumer Privacy Act (CCPA), Health Insurance Portability and\nAccountability Act (HIPPA), Basel Committee on Banking Supervision (BCBS) 239,\nand Sarbanes-Oxley Act (SOX), require organizations to have clear understanding\nand visibility of data flow. As a result, data traceability becomes a key requirement\nin order for their data architecture to meet legal regulations. Data lineage helps\norganizations be compliant and audit-ready, thereby alleviating the operational\noverhead of manually creating the trails of data flows for audit reporting purposes.\n\n\n**Effortless transparency and proactive control with**\n**data lineage**\n\nThe [lakehouse](https://www.databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) provides a pragmatic data management architecture that\nsubstantially simplifies enterprise data infrastructure and accelerates innovation\nby unifying your data warehousing and AI use cases on a single platform.\nWe believe data lineage is a key enabler of better data transparency and data\nunderstanding in your lakehouse, surfacing the relationships between data,\njobs, and consumers, and helping organizations move toward proactive data\nmanagement practices. For example:\n\n**•** As the owner of a dashboard, do you want to be notified next time that a\ntable your dashboard depends upon wasn’t loaded correctly?\n\n**•** As a machine learning practitioner developing a model, do you want to be\nalerted that a critical feature in your model will be deprecated soon?\n\n**•** As a governance admin, do you want to automatically control access to\ndata based on its provenance?\n\nAll of these capabilities rely upon the automatic collection of data lineage across\nall use cases and personas — which is why the lakehouse and data lineage are a\npowerful combination.\n\n\n-----\n\nData lineage for tables\n\nData lineage for table columns\n\n\nData Lineage for notebooks, workflows, dashboards\n\n**Built-in security:** Lineage graphs in Unity Catalog are privilege-aware and share\nthe same permission model as Unity Catalog. If users do not have access to\na table, they will not be able to explore the lineage associated with the table,\nadding an additional layer of security for privacy considerations.\n\n**Easily exportable via REST API:** Lineage can be visualized in the Data Explorer\nin near real-time, and retrieved via REST API to support integrations with our\ncatalog partners.\n\n**Getting started with data lineage in Unity Catalog**\n\nData lineage is available with Databricks Premium and Enterprise tiers for\nno additional cost. If you already are a Databricks customer, follow the data\nlineage guides ( [AWS](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) | [Azure](https://docs.microsoft.com/azure/databricks/data-governance/unity-catalog/data-lineage) ) to get started. 
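For the REST API mentioned above, a minimal sketch of pulling table lineage from an existing workspace might look like the following. The endpoint path and request shape are assumptions based on the Unity Catalog lineage API and should be verified against the current API reference; the host, token and table name are placeholders.

```python
import requests

host = "https://<your-workspace>.cloud.databricks.com"   # placeholder
token = "<personal-access-token>"                         # placeholder

resp = requests.get(
    f"{host}/api/2.0/lineage-tracking/table-lineage",     # assumed endpoint path
    headers={"Authorization": f"Bearer {token}"},
    json={"table_name": "main.sales.orders",              # placeholder three-level name
          "include_entity_lineage": True},
)
resp.raise_for_status()
print(resp.json())   # upstream and downstream tables, notebooks, workflows, ...
```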
If you are not an existing Databricks\ncustomer, sign up for a [free trial](https://www.databricks.com/try-databricks) with a Premium or Enterprise workspace.\n\n\n-----\n\nSECTION 2.8\n\n**Easy Ingestion to Lakehouse With COPY INTO**\n\nby **A E M R O A M A R E , E M M A L I U , A M I T K A R A** and **J A S R A J D A N G E**\n\nJanuary 17, 2023\n\n\nA new data management architecture known as the data lakehouse emerged\nindependently across many organizations and use cases to support AI and BI\ndirectly on vast amounts of data. One of the key success factors for using the\ndata lakehouse for analytics and machine learning is the ability to quickly and\neasily ingest data of various types, including data from on-premises storage\nplatforms (data warehouses, mainframes), real-time streaming data, and bulk\ndata assets.\n\nAs data ingestion into the lakehouse is an ongoing process that feeds the\nproverbial ETL pipeline, you will need multiple options to ingest various formats,\ntypes and latency of data. For data stored in cloud object stores such as AWS\nS3, Google Cloud Storage and Azure Data Lake Storage, Databricks offers\nAuto Loader, a natively integrated feature, that allows data engineers to ingest\nmillions of files from the cloud storage continuously. In other streaming cases\n\n(e.g., IoT sensor or clickstream data), Databricks provides native connectors\nfor Apache Spark Structured Streaming to quickly ingest data from popular\nmessage queues, such as [Apache Kafka](https://docs.databricks.com/spark/latest/structured-streaming/kafka.html?_ga=2.117268486.126296912.1643033657-734003504.1641217794) , Azure Event Hubs or AWS Kinesis at low\nlatencies. Furthermore, many customers can leverage popular ingestion tools\n\n\nthat integrate with Databricks, such as Fivetran — to easily ingest data from\nenterprise applications, databases, mainframes and more into the lakehouse.\nFinally, analysts can use the simple “COPY INTO” command to pull new data into\nthe lakehouse automatically, without the need to keep track of which files have\nalready been processed.\n\nThis blog focuses on COPY INTO, a simple yet powerful SQL command that allows\nyou to perform batch file ingestion into Delta Lake from cloud object stores.\nIt’s idempotent, which guarantees to ingest files with exactly-once semantics\nwhen executed multiple times, supporting incremental appends and simple\ntransformations. It can be run once, in an ad hoc manner, and can be scheduled\nthrough Databricks Workflows. In recent Databricks [Runtime releases](https://docs.databricks.com/release-notes/runtime/releases.html) , COPY\nINTO introduced new functionalities for data preview, validation, enhanced error\nhandling, and a new way to copy into a schemaless Delta Lake table so that users\n\ncan get started quickly, completing the end-to-end user journey to ingest from\ncloud object stores. Let’s take a look at the popular COPY INTO use cases.\n\n\n-----\n\n**1. Ingesting data for the first time**\n\n\nThe default for data validation is to parse all the data in the source directory to\nensure that there aren’t any issues, but the rows returned for preview are limited.\nOptionally, you can provide the number of rows to preview after VALIDATE.\n\nThe COPY_OPTION “mergeSchema” specifies that it is okay to evolve the schema\nof your target Delta table. Schema evolution only allows the addition of new\ncolumns, and does not support data type changes for existing columns. 
In other\nuse cases, you can omit this option if you intend to manage your table schema\nmore strictly as your data pipeline may have strict schema requirements and\nmay not want to evolve the schema at all times. However, our target Delta table\nin the example above is an empty, columnless table at the moment; therefore,\nwe have to specify the COPY_OPTION “mergeSchema” here.\n\nFigure 1: COPY INTO VALIDATE mode output\n\n\nCOPY INTO requires a table to exist as it ingests the data into a target Delta\ntable. However, you have no idea what your data looks like. You first create an\nempty Delta table.\n```\n CREATE TABLE my_example_data;\n\n```\nBefore you write out your data, you may want to preview it and ensure the\ndata looks correct. The COPY INTO Validate mode is a new feature in\nDatabricks Runtime [10.3](https://docs.databricks.com/release-notes/runtime/10.3.html) and above that allows you to preview and validate\nsource data before ingesting many files from the cloud object stores.\nThese validations include:\n\n**•** if the data can be parsed\n\n**•** the schema matches that of the target table or if the schema\nneeds to be evolved\n\n**•** all nullability and check constraints on the table are met\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nVALIDATE\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\n\n-----\n\n**2. Configuring COPY INTO**\n\n\nFigure 2 shows the validate output that the header is properly parsed.\n\nFigure 2: COPY INTO VALIDATE mode output with enabled header and inferSchema\n\n**3. Appending data to a Delta table**\n\nNow that the preview looks good, we can remove the VALIDATE keyword and\nexecute the COPY INTO command.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n'true' )\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\n\nWhen looking over the results of VALIDATE (see Figure 1), you may notice that\nyour data doesn’t look like what you want. Aren’t you glad you previewed your\ndata set first? The first thing you notice is the column names are not what is\nspecified in the CSV header. What’s worse, the header is shown as a row in your\ndata. You can configure the CSV parser by specifying FORMAT_OPTIONS.\nLet’s add those next.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT `=` CSV\nVALIDATE\nFORMAT_OPTIONS ( 'header' `=` 'true' , 'inferSchema' `=` 'true' , 'mergeSchema' `=`\n'true' )\nCOPY_OPTIONS ( 'mergeSchema' `=` 'true' )\n\nWhen using the FORMAT OPTION, you can tell COPY INTO to infer the data types\nof the CSV file by specifying the inferSchema option; otherwise, all default\ndata types are STRINGs. On the other hand, binary file formats like AVRO and\nPARQUET do not need this option since they define their own schema. Another\n\noption, “mergeSchema” states that the schema should be inferred over a\ncomprehensive sample of CSV files rather than just one. The comprehensive list\nof format-specific options can be found in the [documentation](https://learn.microsoft.com/en-us/azure/databricks/sql/language-manual/delta-copy-into#format-options) .\n\n\n-----\n\nCOPY INTO keeps track of the state of files that\nhave been ingested. 
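As a quick illustration of this state tracking (a minimal sketch; the bucket path, table name and\noptions simply reuse the examples above), running the exact same command a second time loads\nno new files and writes no duplicate rows:\n\n```\n-- Second execution against an unchanged source: files already recorded in the\n-- ingestion state are skipped, so the target table is left untouched.\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleData'\nFILEFORMAT = CSV\nFORMAT_OPTIONS ( 'header' = 'true' , 'inferSchema' = 'true' , 'mergeSchema' = 'true' )\nCOPY_OPTIONS ( 'mergeSchema' = 'true' )\n```\n\nOn the repeat run, the row counts reported by the command should come back as zero.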
Unlike commands like INSERT\nINTO, users get idempotency with COPY INTO,\nwhich means users won’t get duplicate data in\nthe target table when running COPY INTO multiple\ntimes from the same source data.\n\nCOPY INTO can be run once, in an ad hoc manner,\nand can be scheduled with Databricks Workflows.\nWhile COPY INTO does not natively support\nlow-latency ingestion, you can trigger COPY INTO\nthrough orchestrators like Apache Airflow.\n\n\nFigure 3: Databricks workflow UI to schedule a task\n\n\n-----\n\n**4. Secure data access with COPY INTO**\n\nCOPY INTO supports secure access in several ways. In this section, we want to\nhighlight two new options you can use in both [Databricks SQL](https://dbricks.co/dbsql) and notebooks\nfrom recent releases:\n\n**Unity Catalog**\nWith the general availability of Databricks Unity Catalog, you can use COPY INTO\nto ingest data to Unity Catalog managed or external tables from any source and\nfile format supported by COPY INTO. Unity Catalog also adds new options for\nconfiguring secure access to raw data, allowing you to use Unity Catalog external\nlocations or storage credentials to access data in cloud object storage. Learn\nmore about how to use [COPY INTO with Unity Catalog](https://docs.databricks.com/ingestion/copy-into/unity-catalog.html#use-copy-into-to-load-data-with-unity-catalog) .\n\n**Temporary Credentials**\nWhat if you have not configured Unity Catalog or an instance profile? How about\ndata from a trusted third-party bucket? Here is a convenient COPY INTO feature\nthat allows you to [ingest data with inline temporary credentials](https://docs.databricks.com/ingestion/copy-into/temporary-credentials.html) to handle the ad\nhoc bulk ingestion use case.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath' WITH (\nCREDENTIAL (AWS_ACCESS_KEY `=` '...' , AWS_SECRET_KEY `=` '...' ,\nAWS_SESSION_TOKEN `=` '...' )\n)\nFILEFORMAT `=` CSV\n\n\n**5. Filtering files for ingestion**\n\nWhat about ingesting a subset of files where the filenames match a pattern? You\ncan apply glob patterns — a glob pattern identifies the files to load from the\nsource directory. For example, let’s filter and ingest files which contain the word\n`raw_data` in the filename below.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data*.csv'\nFORMAT_OPTIONS ( 'header' `=` 'true' )\n\n**6. Ingest files in a time period**\n\nIn data engineering, it is frequently necessary to ingest files that have been\nmodified before or after a specific timestamp. Data between two timestamps\nmay also be of interest. The ‘modifiedAfter’ and ‘modifiedBefore’ format options\noffered by COPY INTO allow users to ingest data from a chosen time window into\na Delta table.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data_*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' , 'modifiedAfter' `=` '2022-09-12T10:53:11.000+0000' )\n\n\n-----\n\n**7. Correcting data with the force option**\n\nBecause COPY INTO is by default idempotent, running the same query against\nthe same source files more than once has no effect on the destination table\nafter the initial execution. In real-world circumstances, however, source data files\nin cloud object storage may be altered or corrected at a later time, and those\nchanges must be propagated to the target table. 
In such a case, it is possible to first\nerase the data from the target table before ingesting the more recent data files\nfrom the source. For this operation, you only need to set the copy option ‘force’\nto ‘true’.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nPATTERN `=` '*raw_data_2022*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' )\nCOPY_OPTIONS ( 'force' `=` 'true' )\n\n\n**8. Applying simple transformations**\n\nWhat if you want to rename columns? Or the source data has changed and a\nprevious column has been renamed to something else? You don’t want to ingest\nthat data as two separate columns, but as a single column. We can leverage the\nSELECT statement in COPY INTO to perform simple transformations.\n\nCOPY INTO demo.my_example_data\nFROM ( SELECT concat(first_name, \" \", last_name) as full_name,\n`*` EXCEPT (first_name, last_name)\nFROM 's3://my-bucket/exampleDataPath'\n)\nFILEFORMAT `=` CSV\nPATTERN `=` '*.csv'\nFORMAT_OPTIONS( 'header' `=` 'true' )\nCOPY_OPTIONS ( 'force' `=` 'true' )\n\n**9. Error handling and observability with COPY INTO**\n\n**Error handling:**\nHow about ingesting data with file corruption issues? Common examples of file\ncorruption are:\n\n**•** Files with an incorrect file format\n\n**•** Failure to decompress\n\n**•** Unreadable files (e.g., invalid Parquet)\n\n\n-----\n\nCOPY INTO’s format option ignoreCorruptFiles helps skip those files while\nprocessing. The result of the COPY INTO command returns the number of files\nskipped in the num_skipped_corrupt_files column. In addition, these corrupt\nfiles aren’t tracked by the ingestion state in COPY INTO; therefore, they can be\nreloaded in a subsequent execution once the corruption is fixed. This option is\navailable in Databricks [Runtime 11.0+](https://docs.databricks.com/release-notes/runtime/11.0.html) .\n\nYou can see which files have been detected as corrupt by running COPY INTO in\nVALIDATE mode.\n\nCOPY INTO my_example_data\nFROM 's3://my-bucket/exampleDataPath'\nFILEFORMAT `=` CSV\nVALIDATE ALL\nFORMAT_OPTIONS( 'ignoreCorruptFiles' `=` 'true' )\n\n**Observability:**\nIn Databricks Runtime 10.5, the [file metadata column](https://docs.databricks.com/ingestion/file-metadata-column.html) was introduced to provide\ninput file metadata information, which allows users to monitor and get key\nproperties of the ingested files like path, name, size and modification time, by\nquerying a hidden STRUCT column called _metadata. To include this information\nin the destination, you must explicitly reference the _metadata column in your\nCOPY INTO query.\n\nCOPY INTO my_example_data\nFROM (\nSELECT `*` , _metadata source_metadata\nFROM 's3://my-bucket/exampleDataPath'\n)\nFILEFORMAT `=` CSV\n\n\n**How does it compare to Auto Loader?**\n\nCOPY INTO is a simple and powerful command to use when your source\ndirectory contains a small number of files (i.e., thousands of files or less), and if\nyou prefer SQL. In addition, COPY INTO can be used over JDBC to push data into\nDelta Lake at your convenience, a common pattern used by many ingestion partners.\nTo ingest a larger number of files in both streaming and batch, we recommend\nusing [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) . 
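For comparison, here is a rough sketch of what the same ingestion could look like with Auto Loader\ninside a Delta Live Tables pipeline (illustrative only; the bucket path reuses the COPY INTO examples\nabove, while the table name and options shown here are assumptions):\n\n```\n-- Auto Loader (cloud_files) continuously picks up new CSV files as they land in\n-- the source directory, instead of being re-run on demand like COPY INTO.\nCREATE STREAMING LIVE TABLE my_example_data_bronze\nCOMMENT \"Incrementally ingests CSV files from cloud object storage\"\nAS SELECT *\nFROM cloud_files(\n  's3://my-bucket/exampleData',\n  'csv',\n  map('header', 'true', 'cloudFiles.inferColumnTypes', 'true')\n);\n```\n\nThe trade-off is operational rather than functional: COPY INTO stays a simple ad hoc SQL command,\nwhile Auto Loader is designed to track and continuously ingest millions of files at scale.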
In addition, for a modern data pipeline based on the [medallion architecture](https://www.databricks.com/glossary/medallion-architecture) ,\nwe recommend using Auto Loader in [Delta Live Tables pipelines](https://docs.databricks.com/ingestion/auto-loader/dlt.html) ,\nleveraging advanced capabilities of automatic error handling, quality control,\ndata lineage and setting [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html) in a declarative approach.\n\n**How to get started?**\n\nTo get started, you can go to the **[Databricks SQL](https://dbricks.co/dbsql)** query editor, update and run the\nexample SQL commands to ingest from your cloud object stores. Check out\nthe options in No. 4 to establish secure access to your data for querying it in\nDatabricks SQL. To get familiar with COPY INTO in Databricks SQL, you can also\nfollow this [quickstart tutorial](https://docs.databricks.com/ingestion/copy-into/tutorial-dbsql.html) .\n\nAs an alternative, you can use this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/db-385-demo_copy_into.html) in Data Science & Engineering and\nMachine Learning workspaces to learn most of the COPY INTO features in this\nblog, where source data and target Delta tables are generated in DBFS.\n\nMore tutorials for COPY INTO can be found [here](https://docs.databricks.com/ingestion/copy-into/index.html) .\n\n\n-----\n\nSECTION 2.9\n\n**Simplifying Change Data Capture With Databricks Delta Live Tables**\n\nby **M O J G A N M A Z O U C H I**\n\nApril 25, 2022\n\n\nThis guide will demonstrate how you can leverage change data capture in Delta\nLive Tables pipelines to identify new records and capture changes made to the\ndata set in your data lake. Delta Live Tables pipelines enable you to develop\nscalable, reliable and low-latency data pipelines, while performing change data\ncapture in your data lake with minimal required compute resources and\nseamless out-of-order data handling.\n\n**Note:** We recommend following [Getting Started with Delta Live Tables](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables) ,\nwhich explains creating scalable and reliable pipelines using Delta Live Tables\n(DLT) and its declarative ETL definitions.\n\n**Background on change data capture**\n\nChange data capture ( [CDC](https://en.wikipedia.org/wiki/Change_data_capture) ) is a process that identifies and captures incremental\nchanges (data deletes, inserts and updates) in databases, like tracking customer,\norder or product status for near-real-time data applications. CDC provides\nreal-time data evolution by processing data in a continuous incremental fashion as\nnew events occur.\n\n\nSince [over 80% of organizations plan on implementing multicloud strategies by 2025](https://solutionsreview.com/data-integration/whats-changed-2020-gartner-magic-quadrant-for-data-integration-tools/) ,\nchoosing the right approach for your business that allows seamless\nreal-time centralization of all data changes in your ETL pipeline across multiple\nenvironments is critical.\n\nBy capturing CDC events, Databricks users can re-materialize the source table\nas a Delta table in the lakehouse and run their analysis on top of it, while being able\nto combine data with external systems. 
The MERGE INTO command in Delta Lake\non Databricks enables customers to efficiently upsert and delete records in\ntheir data lakes — you can check out our previous deep dive on the topic [here](https://www.databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html) .\nThis is a common use case that we observe many of Databricks customers are\nleveraging Delta Lakes to perform, and keeping their data lakes up to date with\nreal-time business data.\n\nWhile Delta Lake provides a complete solution for real-time CDC synchronization\nin a data lake, we are now excited to announce the change data capture feature\nin Delta Live Tables that makes your architecture even simpler, more efficient and\nscalable. DLT allows users to ingest CDC data seamlessly using SQL and Python.\n\nEarlier CDC solutions with Delta tables were using MERGE INTO operation, which\nrequires manually ordering the data to avoid failure when multiple rows of the\nsource data set match while attempting to update the same rows of the target\n\n\n-----\n\nDelta table. To handle the out-of-order data, there was an extra step required to\npreprocess the source table using a foreachBatch implementation to eliminate\nthe possibility of multiple matches, retaining only the latest change for each\nkey (see the [change data capture example](https://www.databricks.com/blog/2022/04/25/simplifying-change-data-capture-with-databricks-delta-live-tables.html#) ). The new APPLY CHANGES INTO\noperation in DLT pipelines automatically and seamlessly handles out-of-order\ndata without any need for data engineering manual intervention.\n\n**CDC with Databricks Delta Live Tables**\n\nIn this blog, we will demonstrate how to use the APPLY CHANGES INTO command\nin Delta Live Tables pipelines for a common CDC use case where the CDC data\nis coming from an external system. A variety of CDC tools are available such\nas Debezium, Fivetran, Qlik Replicate, Talend, and StreamSets. While specific\nimplementations differ, these tools generally capture and record the history\nof data changes in logs; downstream applications consume these CDC logs. In\nour example, data is landed in cloud object storage from a CDC tool such as\nDebezium, Fivetran, etc.\n\nWe have data from various CDC tools landing in a cloud object storage or a\nmessage queue like Apache Kafka. Typically we see CDC used in an ingestion\nto what we refer as the medallion architecture. A medallion architecture is a\ndata design pattern used to logically organize data in a Lakehouse, with the\ngoal of incrementally and progressively improving the structure and quality of\ndata as it flows through each layer of the architecture. Delta Live Tables allows\nyou to seamlessly apply changes from CDC feeds to tables in your Lakehouse;\ncombining this functionality with the medallion architecture allows for\n\n\nincremental changes to easily flow through analytical workloads at scale. Using\nCDC together with the medallion architecture provides multiple benefits to users\nsince only changed or added data needs to be processed. 
Thus, it enables users\nto cost-effectively keep Gold tables up-to-date with the latest business data.\n\n**NOTE:** The example here applies to both SQL and Python versions of CDC\nand also on a specific way to use the operations; to evaluate variations,\nplease see the official documentation [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#python) .\n\n**Prerequisites**\n\nTo get the most out of this guide, you should have a basic familiarity with:\n\n**•** SQL or Python\n\n**•** Delta Live Tables\n\n**•** Developing ETL pipelines and/or working with Big Data systems\n\n**•** Databricks interactive notebooks and clusters\n\n**•** You must have access to a Databricks Workspace with permissions\nto create new clusters, run jobs, and save data to a location on\nexternal cloud object storage or [DBFS](https://docs.gcp.databricks.com/data/databricks-file-system.html)\n\n**•** For the pipeline we are creating in this blog, “Advanced” product\nedition which supports enforcement of data quality constraints,\nneeds to be selected\n\n\n-----\n\n**The data set**\n\nHere we are consuming realistic looking CDC data from an external database. In\nthis pipeline, we will use the [Faker](https://github.com/joke2k/faker) library to generate the data set that a CDC\ntool like Debezium can produce and bring into cloud storage for the initial ingest\nin Databricks. Using [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) we incrementally load the messages from cloud\nobject storage, and store them in the Bronze table as it stores the raw messages.\nThe Bronze tables are intended for data ingestion which enable quick access to a\nsingle source of truth. Next we perform APPLY CHANGES INTO from the cleaned\nBronze layer table to propagate the updates downstream to the Silver table. As\ndata flows to Silver tables, generally it becomes more refined and optimized\n(“just-enough”) to provide an enterprise a view of all its key business entities.\nSee the diagram below.\n\n\nThis blog focuses on a simple example that requires a JSON message with\nfour fields of customer’s name, email, address and id along with the two fields:\noperation (which stores operation code (DELETE, APPEND, UPDATE, CREATE) and\noperation_date (which stores the date and timestamp for the record came for\neach operation action) to describe the changed data.\n\nTo generate a sample data set with the above fields, we are using a Python\npackage that generates fake data, Faker. You can find the notebook related to this\ndata generation section [here](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/1-cdc-data-generator.html) . In this notebook we provide the name and storage\nlocation to write the generated data there. We are using the DBFS functionality of\nDatabricks; see the [DBFS documentation](https://docs.databricks.com/user-guide/dbfs-databricks-file-system.html) to learn more about how it works. Then,\nwe use a PySpark user-defined function to generate the synthetic data set for\neach field, and write the data back to the defined storage location, which we will\nrefer to in other notebooks for accessing the synthetic data set.\n\n**Ingesting the raw data set using Auto Loader**\n\nAccording to the medallion architecture paradigm, the Bronze layer holds the\nmost raw data quality. At this stage we can incrementally read new data using\nAuto Loader from a location in cloud storage. 
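For reference, a single generated change record landing in that storage location might look roughly\nlike the following (an illustrative, made-up example; field values and exact formatting will vary with\nthe CDC tool that produces them):\n\n```\n{\n  \"id\": \"c0001\",\n  \"firstname\": \"Jane\",\n  \"lastname\": \"Doe\",\n  \"email\": \"jane.doe@example.com\",\n  \"address\": \"123 Main Street\",\n  \"operation\": \"UPDATE\",\n  \"operation_date\": \"2022-04-25 10:12:30\"\n}\n```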
Here we are adding the path to our\ngenerated data set to the configuration section under pipeline settings, which\nallows us to load the source path as a variable. So now our configuration under\npipeline settings looks like below:\n\n\"configuration\" : {\n\"source\" : \"/tmp/demo/cdc_raw\"\n}\n\n\n-----\n\nThen we load this configuration property in our notebooks.\n\nLet’s take a look at the Bronze table we will ingest: a. in SQL, and b. using Python.\n\n**A . S Q L**\n\nSET spark.source;\nCREATE STREAMING LIVE TABLE customer_bronze\n(\naddress string,\nemail string,\nid string,\nfirstname string,\nlastname string,\noperation string,\noperation_date string,\n_rescued_data string\n)\nTBLPROPERTIES (\"quality\" = \"bronze\")\nCOMMENT \"New customer data incrementally ingested from cloud object storage landing zone\"\nAS\nSELECT *\nFROM cloud_files(\"${source}/customers\", \"json\", map(\"cloudFiles.inferColumnTypes\", \"true\"));\n\n\n**B . P Y T H O N**\n\nimport dlt\nfrom pyspark.sql.functions import *\nfrom pyspark.sql.types import *\n\nsource = spark.conf.get(\"source\")\n\n@dlt.table(name=\"customer_bronze\",\n  comment=\"New customer data incrementally ingested from cloud object storage landing zone\",\n  table_properties={\n    \"quality\": \"bronze\"\n  }\n)\ndef customer_bronze():\n  return (\n    spark.readStream.format(\"cloudFiles\")\n      .option(\"cloudFiles.format\", \"json\")\n      .option(\"cloudFiles.inferColumnTypes\", \"true\")\n      .load(f\"{source}/customers\")\n  )\n\nThe above statements use Auto Loader to create a streaming live table\ncalled customer_bronze from JSON files. When using Auto Loader in Delta Live\nTables, you do not need to provide any location for schema or checkpoint, as\nthose locations will be managed automatically by your DLT pipeline.\n\nAuto Loader provides a Structured Streaming source called cloud_files in\nSQL and cloudFiles in Python, which takes a cloud storage path and format as\nparameters.\n\nTo reduce compute costs, we recommend running the DLT pipeline in\nTriggered mode as a micro-batch, assuming you do not have very low latency\nrequirements.\n\n\n-----\n\n**Expectations and high-quality data**\n\nIn the next step to create a high-quality, diverse, and accessible data set,\nwe impose quality check expectation criteria using Constraints. Currently,\na constraint can be either retain, drop, or fail. For more detail see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-expectations.html) . All\nconstraints are logged to enable streamlined quality monitoring.\n\n**A . S Q L**\n\nCREATE TEMPORARY STREAMING LIVE TABLE customer_bronze_clean_v(\nCONSTRAINT valid_id EXPECT (id IS NOT NULL) ON VIOLATION DROP ROW,\nCONSTRAINT valid_address EXPECT (address IS NOT NULL),\nCONSTRAINT valid_operation EXPECT (operation IS NOT NULL) ON VIOLATION DROP ROW\n)\nTBLPROPERTIES (\"quality\" = \"silver\")\nCOMMENT \"Cleansed bronze customer view (i.e. what will become Silver)\"\nAS SELECT *\nFROM STREAM(LIVE.customer_bronze);\n\n**B . P Y T H O N**\n\n@dlt.view(name=\"customer_bronze_clean_v\",\n  comment=\"Cleansed bronze customer view (i.e. what will become Silver)\")\n@dlt.expect_or_drop(\"valid_id\", \"id IS NOT NULL\")\n@dlt.expect(\"valid_address\", \"address IS NOT NULL\")\n@dlt.expect_or_drop(\"valid_operation\", \"operation IS NOT NULL\")\ndef customer_bronze_clean_v():\n  return dlt.read_stream(\"customer_bronze\") \\\n    .select(\"address\", \"email\", \"id\", \"firstname\", \"lastname\",\n            \"operation\", \"operation_date\", \"_rescued_data\")\n\n\n-----\n\n**Using APPLY CHANGES INTO statement to propagate changes to**\n\n**downstream target table**\n\nPrior to executing the Apply Changes Into query, we must ensure that a target\nstreaming table which we want to hold the most up-to-date data exists. If it\ndoes not exist, we need to create one. The cells below are examples of creating a\ntarget streaming table. Note that at the time of publishing this blog, the target\nstreaming table creation statement is required along with the Apply Changes\nInto query, and both need to be present in the pipeline — otherwise your table\ncreation query will fail.\n\n**A . S Q L**\n\nCREATE STREAMING LIVE TABLE customer_silver\nTBLPROPERTIES (\"quality\" = \"silver\")\nCOMMENT \"Clean, merged customers\";\n\n**B . P Y T H O N**\n\ndlt.create_target_table(name=\"customer_silver\",\n  comment=\"Clean, merged customers\",\n  table_properties={\n    \"quality\": \"silver\"\n  })\n\n\n-----\n\nNow that we have a target streaming table, we can propagate changes to the\ndownstream target table using the Apply Changes Into query. While a CDC feed\ncomes with INSERT, UPDATE and DELETE events, the DLT default behavior is to apply\nINSERT and UPDATE events from any record in the source data set matching\non primary keys, and sequenced by a field which identifies the order of events.\nMore specifically, it updates any row in the existing target table that matches\nthe primary key(s) or inserts a new row when a matching record does not exist\nin the target streaming table. We can use APPLY AS DELETE WHEN in SQL, or its\nequivalent apply_as_deletes argument in Python, to handle DELETE events.\n\nIn this example we used \"id\" as the primary key, which uniquely identifies the\ncustomers and allows CDC events to apply to those identified customer records\nin the target streaming table. Since \"operation_date\" keeps the logical order of\nCDC events in the source data set, we use \"SEQUENCE BY operation_date\" in\nSQL, or its equivalent \"sequence_by = col(\"operation_date\")\" in Python, to handle\nchange events that arrive out of order. Keep in mind that the field value we use\nwith SEQUENCE BY (or sequence_by) should be unique among all updates to\nthe same key. In most cases, the sequence by column will be a column with\ntimestamp information.\n\nFinally, we used \"COLUMNS * EXCEPT (operation, operation_date, _rescued_data)\"\nin SQL, or its equivalent \"except_column_list\" = [\"operation\", \"operation_date\",\n\"_rescued_data\"] in Python, to exclude the three columns \"operation\",\n\"operation_date\" and \"_rescued_data\" from the target streaming table. By default, all\nthe columns are included in the target streaming table when we do not specify\nthe \"COLUMNS\" clause.\n\n\n**A . S Q L**\n\nAPPLY CHANGES INTO LIVE.customer_silver\nFROM stream(LIVE.customer_bronze_clean_v)\nKEYS (id)\nAPPLY AS DELETE WHEN operation = \"DELETE\"\nSEQUENCE BY operation_date\nCOLUMNS * EXCEPT (operation, operation_date, _rescued_data);\n\n**B . 
P Y T H O N**\n```\n dlt.apply_changes(\n target = \"customer_silver\",\n source = \"customer_bronze_clean_v\",\n keys = [\"id\"],\n sequence_by = col(\"operation_date\"),\n apply_as_deletes = expr(\"operation = 'DELETE'\"),\n except_column_list = [\"operation\", \"operation_date\", \"_rescued_data\"])\n\n```\nTo check out the full list of available clauses see [here](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html#requirements) .\n\nPlease note that, at the time of publishing this blog, a table that reads from the\ntarget of an APPLY CHANGES INTO query or apply_changes function must be a\nlive table, and cannot be a streaming live table.\n\nA [SQL](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-retail-dlt-cdc-sql.html) and [Python](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/2-Retail_DLT_CDC_Python.html) notebook is available for reference for this section. Now that\nwe have all the cells ready, let’s create a pipeline to ingest data from cloud object\nstorage. Open Jobs in a new tab or window in your workspace, and select “Delta\nLive Tables.”\n\n\n-----\n\nThe pipeline associated with this blog has the following DLT pipeline settings:\n\n{\n\"clusters\" : [\n{\n\"label\" : \"default\" ,\n\"num_workers\" : 1\n}\n],\n\"development\" : true ,\n\"continuous\" : false ,\n\"edition\" : \"advanced\" ,\n\"photon\" : false ,\n\"libraries\" : [\n{\n\"notebook\" : {\n\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\nnotebooks/1-CDC_DataGenerator\"\n}\n},\n{\n\"notebook\" : {\n\"path\" : \"/Repos/mojgan.mazouchi@databricks.com/Delta-Live-Tables/\nnotebooks/2-Retail_DLT_CDC_sql\"\n}\n}\n],\n\"name\" : \"CDC_blog\" ,\n\"storage\" : \"dbfs:/home/mydir/myDB/dlt_storage\" ,\n\"configuration\" : {\n\"source\" : \"/tmp/demo/cdc_raw\" ,\n\"pipelines.applyChangesPreviewEnabled\" : \"true\"\n},\n\"target\" : \"my_database\"\n\n\n1. Select “Create Pipeline” to create a new pipeline\n\n2. Specify a name such as “Retail CDC Pipeline”\n\n3. Specify the Notebook Paths that you already created earlier, one for the\ngenerated data set using Faker package, and another path for the ingestion\nof the generated data in DLT. The second notebook path can refer to the\nnotebook written in SQL, or Python depending on your language of choice.\n\n4. To access the data generated in the first notebook, add the data set path in\nconfiguration. Here we stored data in “/tmp/demo/cdc_raw/customers”, so\nwe set “source” to “/tmp/demo/cdc_raw/” to reference “source/customers” in\nour second notebook.\n\n5. Specify the Target (which is optional and referring to the target database),\nwhere you can query the resulting tables from your pipeline\n\n6. Specify the Storage Location in your object storage (which is optional), to\naccess your DLT produced data sets and metadata logs for your pipeline\n\n7. Set Pipeline Mode to Triggered. In Triggered mode, DLT pipeline will consume\nnew data in the source all at once, and once the processing is done it will\nterminate the compute resource automatically. You can toggle between\nTriggered and Continuous modes when editing your pipeline settings. Setting\n“continuous”: false in the JSON is equivalent to setting the pipeline to\nTriggered mode.\n\n8. For this workload you can disable the autoscaling under Autopilot Options,\nand use only one worker cluster. For production workloads, we recommend\nenabling autoscaling and setting the maximum numbers of workers needed\nfor cluster size.\n\n9. 
Select “Start”\n\n10. Your pipeline is created and running now!\n\n\n-----\n\nYou can check out our previous deep dive on the topic [here](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) . Try this [notebook](https://www.databricks.com/wp-content/uploads/notebooks/DB-129/3-retail-dlt-cdc-monitoring.html)\nto see pipeline observability and data quality monitoring on the example DLT\npipeline associated with this blog.\n\n**Conclusion**\n\nIn this blog, we showed how we made it seamless for users to efficiently\nimplement change data capture (CDC) into their lakehouse platform with Delta\nLive Tables (DLT). DLT provides built-in quality controls with deep visibility into\npipeline operations, observing pipeline lineage, monitoring schema, and quality\nchecks at each step in the pipeline. DLT supports automatic error handling and\nbest in class auto-scaling capability for streaming workloads, which enables\nusers to have quality data with optimum resources required for their workload.\n\nData engineers can now easily implement CDC with a new declarative [APPLY](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability)\n[CHANGES INTO API](https://www.databricks.com/discover/pages/getting-started-with-delta-live-tables#pipeline-observability) with DLT in either SQL or Python. This new capability lets\nyour ETL pipelines easily identify changes and apply those changes across tens\nof thousands of tables with low-latency support.\n\n**Ready to get started and try out CDC in Delta Live Tables for yourself?**\nPlease watch this [webinar](https://www.databricks.com/p/webinar/tackle-data-transformation) to learn how Delta Live Tables simplifies the\ncomplexity of data transformation and ETL, and see our [Change data capture](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE)\n[with Delta Live Tables](https://docs.databricks.com/data-engineering/delta-live-tables/delta-live-tables-cdc.html?_gl=1*d51pfv*_gcl_aw*R0NMLjE2NDYyNTYzOTkuQ2p3S0NBaUF5UHlRQmhCNkVpd0FGVXVha29wck1CWldNUG5INUNpczB3cnMwUGZfd2JxOV9vRWU4bVFITkptZWVaOV9lVFVIYVk0a3Bob0NkYWtRQXZEX0J3RQ..&_ga=2.123024395.1232434169.1646524051-1547688913.1627598437&_gac=1.158632392.1646256400.CjwKCAiAyPyQBhB6EiwAFUuakoprMBZWMPnH5Cis0wrs0Pf_wbq9_oEe8mQHNJmeeZ9_eTUHaY4kphoCdakQAvD_BwE) document, official [github](https://github.com/databricks/delta-live-tables-notebooks) and follow the steps in this\n[video](https://vimeo.com/700994477) to create your pipeline!\n\n\n**DLT pipeline lineage observability and data quality**\n**monitoring**\n\nAll DLT pipeline logs are stored in the pipeline’s storage location. You can specify\nyour storage location only when you are creating your pipeline. 
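As an illustration, the event log can be queried directly from that storage location (a hedged sketch:\nthe path below reuses the example pipeline settings shown earlier, and it assumes the event log is\nwritten to the system/events folder under the configured storage location):\n\n```\n-- Inspect recent DLT pipeline events from the event log Delta table\nSELECT id, timestamp, event_type, message\nFROM delta.`dbfs:/home/mydir/myDB/dlt_storage/system/events`\nORDER BY timestamp DESC;\n```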
Note that once\nthe pipeline is created you can no longer modify storage location.\n\n\n-----\n\nSECTION 2.10 \u0007\n\n**Best Practices for Cross-Government Data Sharing**\n\nby **M I L O S C O L I C , P R I T E S H P AT E L , R O B E R T W H I F F I N , R I C H A R D J A M E S W I L S O N ,**\n\n**M A R C E L L F E R E N C Z** and **E D W A R D K E L LY**\n\nFebruary 21, 2023\n\n\nGovernment data exchange is the practice of sharing data between different\ngovernment agencies and often partners in commercial sectors. Government\ncan share data for various reasons, such as to improve government operations’\nefficiency, provide better services to the public, or support research and policymaking. In addition, data exchange in the public sector can involve sharing with the\nprivate sector or receiving data from the private sector. The considerations span\nmultiple jurisdictions and over almost all industries. In this blog, we will address the\nneeds disclosed as part of national data strategies and how modern technologies,\nparticularly Delta Sharing, Unity Catalog, and clean rooms, can help you design,\nimplement and manage a future-proof and sustainable data ecosystem.\n\n**Data sharing and public sector**\n\n“The miracle is this: the more we share the more we have.” — [Leonard Nimoy.](https://en.wikipedia.org/wiki/Leonard_Nimoy)\n\nProbably the quote about sharing that applies the most profoundly to the\ntopic of data sharing. To the extent that the purpose of sharing the data is to\ncreate new information, new insights, and new data. The importance of data\nsharing is even more amplified in the government context, where federation\n\n\nbetween departments allows for increased focus. Still, the very same federation\nintroduces challenges around data completeness, data quality, data access,\nsecurity and control, [FAIR](https://en.wikipedia.org/wiki/FAIR_data) -ness of data, etc. These challenges are far from trivial\nand require a strategic, multifaceted approach to be addressed appropriately.\nTechnology, people, process, legal frameworks, etc., require dedicated\nconsideration when designing a robust data sharing ecosystem.\n\n[The National Data Strategy](https://www.gov.uk/government/publications/uk-national-data-strategy/national-data-strategy) (NDS) by the UK government outlines five actionable\nmissions through which we can materialize the value of data for the citizen and\nsociety-wide benefits.\n\n\n-----\n\nIt comes as no surprise that each and every one of the missions is strongly\nrelated to the concept of data sharing, or more broadly, data access both within\nand outside of government departments:\n\n**1. Unlocking the value of the data across the economy** — Mission 1 of the\nNDS aims to assert government and the regulators as enablers of the value\nextraction from data through the adoption of best practices. The UK data\neconomy was estimated to be near [£125 billion in 2021](https://www.gov.uk/government/publications/uks-digital-strategy/uk-digital-strategy) with an upwards trend.\nIn this context, it is essential to understand that the government-collected\nand provided open data can be crucial for addressing many of the challenges\nacross all industries.\n\nFor example, insurance providers can better assess the risk of insuring\nproperties by ingesting and integrating [Flood areas](https://environment.data.gov.uk/flood-monitoring/doc/reference#flood-areas) provided by [DEFRA](https://www.gov.uk/government/organisations/department-for-environment-food-rural-affairs) . 
On\nthe other hand, capital market investors could better understand the risk of\ntheir investments by ingesting and integrating the [Inflation Rate Index](https://www.ons.gov.uk/economy/inflationandpriceindices/timeseries/l55o/mm23) by [ONS](https://www.ons.gov.uk/) .\nReversely, it is crucial for regulators to have well-defined data access and\ndata sharing patterns for conducting their regulatory activities. This clarity\ntruly enables the economic actors that interact with government data.\n\n\n**2. Securing a pro-growth and trusted data regime** — The key aspect of\nMission 2 is data trust, or more broadly, adherence to data quality norms.\nData quality considerations become further amplified for data sharing and\ndata exchange use cases where we are considering the whole ecosystem\nat once, and quality implications transcend the boundaries of our own\nplatform. This is precisely why we have to adopt “data sustainability.” What\nwe mean by sustainable data products are data products that harness the\nexisting sources over reinvention of the same/similar assets, accumulation of\nunnecessary data (data pollutants) and that anticipate future uses.\n\nUngoverned and unbounded data sharing could negatively impact data\nquality and hinder the growth and value of data. The quality of how the data\nis shared should be a key consideration of data quality frameworks. For\nthis reason, we require a solid set of standards and best practices for data\nsharing with governance and quality assurance built into the process and\ntechnologies. Only this way can we ensure the sustainability of our data and\nsecure a pro-growth trusted data regime.\n\n\n-----\n\n**3. Transforming government’s use of data to drive efficiency and improve**\n**public services** — “By 2025 data assets are organized and supported as\nproducts, regardless of whether they’re used by internal teams or external\ncustomers… Data products continuously evolve in an agile manner to meet\nthe needs of consumers… these products provide data solutions that can\nmore easily and repeatedly be used to meet various business challenges and\nreduce the time and cost of delivering new AI-driven capabilities.” —\n[The data-driven enterprise of 2025](https://www.mckinsey.com/capabilities/quantumblack/our-insights/the-data-driven-enterprise-of-2025) by McKinsey. AI and ML can be powerful\nenablers of digital transformation for both the public and private sectors.\n\nAI, ML, reports, and dashboards are just a few examples of data products\nand services that extract value from data. The quality of these solutions is\ndirectly reflected in the quality of data used for building them and our ability\nto access and leverage available data assets both internally and externally.\nWhilst there is a vast amount of data available for us to build new intelligent\nsolutions for driving efficiency for better processes, better decision-making,\nand better policies — there are numerous barriers that can trap the data,\nsuch as legacy systems, data silos, fragmented standards, proprietary\nformats, etc. Modeling data solutions as data products and standardizing\nthem to a unified format allows us to abstract such barriers and truly\nleverage the data ecosystem.\n\n\n**4. 
Ensuring the security and resilience of the infrastructure on which**\n**data relies** — Reflecting on the vision of the year 2025 — this isn’t that far\nfrom now and even in a not so distant future, we will be required to rethink\nour approach to data, more specifically — what is our digital supply chain\ninfrastructure/data sharing infrastructure? Data and data assets are products\nand should be managed as products. If data is a product, we need a coherent\nand unified way of providing those products.\n\nIf data is to be used across industries and across both private and public\nsectors, we need an open protocol that drives adoption and habit generation.\nTo drive adoption, the technologies we use must be resilient, robust, trusted\nand usable by/for all. Vendor lock-in, platform lock-in or cloud lock-in are all\nboundaries to achieving this vision.\n\n**5. Championing the international flow of data** — Data exchange between\njurisdictions and across governments will likely be one of the most\ntransformative applications of data at scale. Some of the world’s toughest\nchallenges depend on the efficient exchange of data between governments\n— prevention of criminal activities, counterterrorism activities, net-zero\nemission goals, international trade, the list goes on and on. Some steps in\nthis direction are already materializing: the U.S. federal government and UK\ngovernment have agreed on data exchange for countering serious crime\nactivities. This is a true example of championing international flow data and\nusing data for good. It is imperative that for these use cases, we approach\ndata sharing from a security-first angle. Data sharing standards and protocols\nneed to adhere to security and privacy best practices.\n\n\n-----\n\nWhile originally built with a focus on the UK government and how to better\nintegrate data as a key asset of a modern government, these concepts apply in\na much wider global public sector context. In the same spirit, the U.S. Federal\nGovernment proposed the [Federal Data Strategy](https://strategy.data.gov/overview/) as a collection of principles,\npractices, action steps and timeline through which government can leverage\nthe full value of Federal data for mission, service and the public good.\n\nThe principles are grouped into three primary topics:\n\n**•** **Ethical governance** — Within the domain of ethics, the sharing of data\nis a fundamental tool for promoting transparency, accountability and\nexplainability of decision-making. It is practically impossible to uphold\nethics without some form of audit conducted by an independent party.\nData (and metadata) exchange is a critical enabler for continuous robust\nprocesses that ensure we are using the data for good and we are using data\nwe can trust.\n\n\n\n**•** **Conscious design** — These principles are strongly aligned with the idea of\ndata sustainability. The guidelines promote forward thinking around usability\nand interoperability of the data and user-centric design principles of\nsustainable data products.\n\n**•** **Learning culture** — Data sharing, or alternatively knowledge sharing, has\nan important role in building a scalable learning ecosystem and learning\nculture. Data is front and center of knowledge synthesis, and from a\nscientific angle, data proves factual knowledge. 
Another critical component\nof knowledge is the “Why?” and data is what we need to address the\n“Why?” component of any decisions we make, which policy to enforce, who\nto sanction, who to support with grants, how to improve the efficiency of\ngovernment services, how to better serve citizens and society.\n\nIn contrast to afore discussed qualitative analysis of the value of data sharing\nacross governments, the European Commission forecasts the economic value\nof the European data economy will [exceed €800 billion by 2027](https://commission.europa.eu/strategy-and-policy/priorities-2019-2024/europe-fit-digital-age/european-data-strategy_en) — roughly the\nsame size as the [Dutch economy in 2021](https://ec.europa.eu/eurostat/databrowser/view/NAMA_10_GDP/default/table?lang=en&category=na10.nama10.nama_10_ma) ! Furthermore, they predict more than 10\nmillion data professionals in Europe alone. The technology and infrastructure to\nsupport the data society have to be accessible to all, interoperable, extensible,\nflexible and open. Imagine a world in which you’d need a different truck to\ntransport products between different warehouses because each road requires a\ndifferent set of tires — the whole supply chain would collapse. When it comes to\ndata, we often experience the “one set of tires for one road” paradox. Rest APIs\nand data exchange protocols have been proposed in the past but have failed\nto address the need for simplicity, ease of use and cost of scaling up with the\nnumber of data products.\n\n\n-----\n\n**Delta Sharing — the new data**\n**highway**\n\nDelta Sharing provides an open protocol for\nsecure data sharing to any computing platform.\nThe protocol is based on Delta data format and is\nagnostic concerning the cloud of choice.\n\nDelta is an open source data format that avoids\nvendor, platform and cloud lock-in, thus fully\nadhering to the principles of data sustainability,\nconscious design of the U.S. Federal Data Strategy\nand mission 4 of the UK National Data Strategy.\nDelta provides a governance layer on top of the\nParquet data format. Furthermore, it provides many\nperformance optimizations not available in Parquet\nout of the box. The openness of the data format\nis a critical consideration. It is the main factor for\ndriving the habit generation and adoption of best\npractices and standards.\n\n\n-----\n\nDelta Sharing is a protocol based on a lean set of REST APIs to manage sharing,\npermissions and access to any data asset stored in Delta or Parquet formats.\nThe protocol defines two main actors, the data provider (data supplier, data\nowner) and the data recipient (data consumer). The recipient, by definition, is\nagnostic to the data format at the source. Delta Sharing provides the necessary\nabstractions for governed data access in many different languages and tools.\n\nDelta Sharing is uniquely positioned to answer many of the challenges of data\nsharing in a scalable manner within the context of highly regulated domains like\nthe public sector:\n\n**• Privacy and security concerns** — Personally identifiable data or otherwise\nsensitive or restricted data is a major part of the data exchange needs of a\ndata-driven and modernized government. Given the sensitive nature of such\ndata, it is paramount that the governance of data sharing is maintained in a\ncoherent and unified manner. Any unnecessary process and technological\ncomplexities increase the risk of over-sharing data. 
With this in mind,\nDelta Sharing has been designed with [security best practices](https://www.databricks.com/blog/2022/08/01/security-best-practices-for-delta-sharing.html) from the\nvery inception. The protocol provides end-to-end encryption, short-lived\ncredentials, and accessible and intuitive audit and governance features. All\nof these capabilities are available in a centralized way across all your Delta\ntables across all clouds.\n\n**• Quality and accuracy** — Another challenge of data sharing is ensuring\nthat the data being shared is of high quality and accuracy. Given that\nthe underlying data is stored as Delta tables, we can guarantee that the\n[transactional nature of data](https://docs.delta.io/latest/concurrency-control.html#concurrency-control) is respected; Delta ensures ACID properties\nof data. Furthermore, Delta supports [data constraints](https://docs.delta.io/latest/delta-constraints.html#constraints) to guarantee data\n\n\nquality requirements at storage. Unfortunately, other formats such as [CSV](https://en.wikipedia.org/wiki/Comma-separated_values) ,\n[CSVW](https://csvw.org/) , [ORC](https://www.google.com/search?q=orc+data+format&rlz=1C5GCEM_enGB931GB932&ei=CzHRY6KqI4S78gL7hoigCw&oq=ORC+da&gs_lcp=Cgxnd3Mtd2l6LXNlcnAQARgAMgUIABCRAjIFCAAQkQIyBQgAEIAEMgUIABCABDIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjIHCAAQgAQQCjoKCAAQRxDWBBCwAzoHCAAQsAMQQzoNCAAQ5AIQ1gQQsAMYAToPCC4Q1AIQyAMQsAMQQxgCOgwILhDIAxCwAxBDGAI6FQguEMcBENEDENQCEMgDELADEEMYAjoECAAQQzoGCAAQChBDOgoIABCxAxCDARBDOgcIABCxAxBDSgQIQRgASgQIRhgBUCxY3AJg3QxoAXABeACAAW6IAbgCkgEDMC4zmAEAoAEByAETwAEB2gEGCAEQARgJ2gEGCAIQARgI&sclient=gws-wiz-serp) , [Avro](https://en.wikipedia.org/wiki/Apache_Avro) , [XML](https://en.wikipedia.org/wiki/XML) , etc., do not have such properties without significant\nadditional effort. The issue becomes even more emphasized by the fact\nthat data quality cannot be ensured in the same way on both the data\nprovider and data recipient side without the exact reimplementation of the\nsource systems. It is critical to embed quality and metadata together with\ndata to ensure quality travels together with data. Any decoupled approach\nto managing data, metadata and quality separately increases the risk of\nsharing and can lead to undesirable outcomes.\n\n**• Lack of standardization** — Another challenge of data sharing is the lack\nof standardization in how data is collected, organized, and stored. This is\nparticularly pronounced in the context of governmental activities. While\ngovernments have proposed standard formats (e.g., Office for National\nStatistics [promotes usage of CSVW](https://www.ons.gov.uk/aboutus/transparencyandgovernance/datastrategy/datastandards#metadata-exchange) ), aligning all private and public\nsector companies to standards proposed by such initiatives is a massive\nchallenge. Other industries may have different requirements for scalability,\ninteroperability, format complexity, lack of structure in data, etc. Most of\nthe currently advocated standards are lacking in multiple such aspects.\nDelta is the most mature candidate for assuming the central role in the\nstandardization of data exchange format. It has been built as a transactional\nand scalable data format, it supports structured, semi-structured and\nunstructured data, it stores data schema and metadata together with data\nand it provides a scalable enterprise-grade sharing protocol through Delta\nSharing. 
Finally, Delta is one of the most popular open source projects\nin the ecosystem and, since May 2022, has surpassed [7 million monthly](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/)\n[downloads](https://delta.io/blog/2022-08-02-delta-2-0-the-foundation-of-your-data-lake-is-open/) .\n\n\n-----\n\n**• Cultural and organizational barriers** — These challenges can be\nsummarized by one word: friction. Unfortunately, it’s a common problem\nfor civil servants to struggle to obtain access to both internal and external\ndata due to over-cumbersome processes, policies and outdated standards.\nThe principles we are using to build our data platforms and our data sharing\nplatforms have to be self-promoting, have to drive adoption and have to\ngenerate habits that adhere to best practices.\n\nIf there is friction with standard adoption, the only way to ensure standards\nare respected is by enforcement and that itself is yet another barrier to\nachieving data sustainability. Organizations have already adopted Delta\nSharing both in the private and public sectors. For example, [U.S. Citizenship](https://www.uscis.gov/)\n[and Immigration Services](https://www.uscis.gov/) (USCIS) uses Delta Sharing to satisfy several\n[interagency data-sharing](https://delta.io/blog/2022-12-08-data-sharing-across-government-delta-sharing/) requirements. Similarly, Nasdaq describes Delta\nSharing as the “ [future of financial data sharing,](https://www.nasdaq.com/articles/delta-sharing-protocol%3A-the-evolution-of-financial-data-sharing-2021-05-26) ” and that future is open\nand governed.\n\n\n\n**• Technical challenges** — Federation at the government scale or even\nfurther across multiple industries and geographies poses technical\nchallenges. Each organization within this federation owns its platform\nand drives technological, architectural, platform and tooling choices.\n\nHow can we promote interoperability and data exchange in this vast,\ndiverse technological ecosystem? The data is the only viable integration\nvehicle. As long as the data formats we utilize are scalable, open and\ngoverned, we can use them to abstract from individual platforms and\ntheir intrinsic complexities.\n\nDelta format and Delta Sharing solve this wide array of requirements and\nchallenges in a scalable, robust and open way. This positions Delta Sharing\nas the strongest choice for unification and simplification of the protocol and\nmechanism through which we share data across both private and public sectors.\n\n\n-----\n\n**Data Sharing through data clean rooms**\n\n\n[Data clean rooms](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html) address this particular need. With data clean rooms you can\nshare data with third parties in a privacy-safe environment. With Unity Catalog ,\nyou can enable fine-grained access controls on the data and meet your privacy\nrequirements. In this architecture, the data participants never get access to\nthe raw data. The only outputs from the clean rooms are those data assets\ngenerated in a pre-agreed, governed and fully controlled manner that ensures\ncompliance with the requirements of all parties involved.\n\nFinally, data clean rooms and Delta Sharing can address hybrid on-premise-offpremise deployments, where the data with the most restricted access remains\non the premise. In contrast, less restricted data is free to leverage the power\nof the cloud offerings. 
In said scenario, there may be a need to combine the\npower of the cloud with the restricted data to solve advanced use cases where\ncapabilities are unavailable on the on-premises data platforms. Data clean rooms\ncan ensure that no physical data copies of the raw restricted data are created,\nresults are produced within the clean room’s controlled environment and results\nare shared back to the on-premises environment (if the results maintain the\nrestricted access within the defined policies) or are forwarded to any other\ncompliant and predetermined destination system.\n\n\nTaking the complexities of data sharing within highly regulated space and the\npublic sector one step further — what if we require to share the knowledge\ncontained in the data without ever granting direct access to the source data to\nexternal parties? These requirements may prove achievable and desirable where\nthe data sharing risk appetite is very low.\n\nIn many public sector contexts, there are concerns that combining the data that\ndescribes citizens could lead to a big brother scenario where simply too much\ndata about an individual is concentrated in a single data asset. If it were to fall\ninto the wrong hands, such a hypothetical data asset could lead to immeasurable\nconsequences for individuals and the trust in public sector services could\nerode. On the other hand, the value of a 360 view of the citizen could accelerate\nimportant decision-making. It could immensely improve the quality of policies\nand services provided to the citizens.\n\n\n-----\n\n**Citizen value of data sharing**\n\nEvery decision made by the government is a decision that affects its citizens.\nWhether the decision is a change to a policy, granting a benefit or preventing\ncrime, it can significantly influence the quality of our society. Data is a key factor\nin making the right decisions and justifying the decisions made. Simply put,\nwe can’t expect high-quality decisions without the high quality of data and a\ncomplete view of the data (within the permitted context). Without data sharing,\nwe will remain in a highly fragmented position where our ability to make those\ndecisions is severely limited or even completely compromised. In this blog, we\nhave covered several technological solutions available within the lakehouse that\ncan derisk and accelerate how the government is leveraging the data ecosystem\nin a sustainable and scalable way.\n\nFor more details on the industry use cases that Delta Sharing is addressing\nplease consult [A New Approach to Data Sharing](https://www.databricks.com/product/unity-catalog) eBook.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 03\n\n\n### Ready-to-Use Notebooks and Data Sets\n\n\n-----\n\n**Digital Twins**\n\nLeverage digital twins — virtual\nrepresentations of devices and\nobjects — to optimize operations and\ngain insights\n\n\nThis section includes several Solution Accelerators — free, ready-to-use\n\nexamples of data solutions from different industries ranging from retail to\n\nmanufacturing and healthcare. Each of the following scenarios includes\n\nnotebooks with code and step-by-step instructions to help you get\n\nstarted. 
Get hands-on experience with the Databricks Lakehouse Platform\n\n\nby trying the following for yourself: **[Explore the Solution](https://databricks.com/solutions/accelerators/digital-twins)**\n\n\n**Overall Equipment**\n**Effectiveness**\n\nIngest equipment sensor data for\nmetric generation and data driven\ndecision-making\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)**\n\n**Real-time point of**\n**sale analytics**\n\nCalculate current inventories for\nvarious products across multiple store\nlocations with Delta Live Tables\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/real-time-point-of-sale-analytics)**\n\n\n**Recommendation Engines**\n**for Personalization**\n\nImprove customers’ user experience\nand conversion with personalized\nrecommendations\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n**Understanding Price**\n**Transparency Data**\n\nEfficiently ingest large healthcare data\nsets to create price transparency for\nbetter understanding of healthcare costs\n\n**[Explore the Solution](https://www.databricks.com/solutions/accelerators/price-transparency-data)**\n\nAdditional Solution Accelerators with ready-to-use notebooks can be found here:\n\n**[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)**\n\n\n-----\n\n**SECTION**\n\n# 04\n\n\n### Case Studies\n\n**4.1** Akamai\n\n**4.2** Grammarly\n\n**4.3** Honeywell\n\n**4.4** Wood Mackenzie\n\n**4.5** Rivian\n\n**4.6** AT&T\n\n\n-----\n\nSECTION 4.1\n**Akamai delivers real-time security**\n**analytics using Delta Lake**\n\n\n###### <1\n\n**Min ingestion time,**\n**reduced from 15 min**\n\n\n###### <85%\n\n**Of queries have a response**\n**time of 7 seconds or less**\n\n\n**I N D U S T R Y**\n[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n\n**S O L U T I O N**\n[Threat Detection](https://databricks.com/solutions/accelerators/threat-detection)\n\n**P L AT F O R M U S E C A S E**\nDelta Lake, Data Streaming, Photon,\n[Databricks SQL](https://databricks.com/product/databricks-sql)\n\n**C LO U D**\n[Azure](https://www.databricks.com/product/azure)\n\n\nAkamai runs a pervasive, highly distributed content delivery network (CDN). Its CDN\n\nuses approximately 345,000 servers in more than 135 countries and over 1,300 networks\n\nworldwide to route internet traffic for some of the largest enterprises in media, commerce,\n\nfinance, retail and many other industries. About 30% of the internet’s traffic flows through\n\nAkamai servers. Akamai also provides cloud security solutions.\n\nIn 2018, the company launched a web security analytics tool that offers Akamai customers\n\na single, unified interface for assessing a wide range of streaming security events and\n\nperforming analysis of those events. The web analytics tool helps Akamai customers to\n\ntake informed actions in relation to security events in real time. Akamai is able to stream\n\nmassive amounts of data and meet the strict SLAs it provides to customers by leveraging\n\nDelta Lake and the Databricks Lakehouse Platform for the web analytics tool.\n\n\n-----\n\n**Ingesting and streaming enormous amounts of data**\n\nAkamai’s web security analytics tool ingests approximately 10GB of data related\nto security events per second. 
Data volume can increase significantly when\nretail customers conduct a large number of sales — or on big shopping days like\nBlack Friday or Cyber Monday. The web security analytics tool stores several\npetabytes of data for analysis purposes. Those analyses are performed to\nprotect Akamai’s customers and provide them with the ability to explore and\nquery security events on their own.\n\nThe web security analytics tool initially relied on an on-premises architecture\nrunning Apache Spark™ on Hadoop. Akamai offers strict service level agreements\n(SLAs) to its customers of 5 to 7 minutes from when an attack occurs until it is\ndisplayed in the tool. The company sought to improve ingestion and query speed\nto meet those SLAs. “Data needs to be as real-time as possible so customers\ncan see what is attacking them,” says Tomer Patel, Engineering Manager at\nAkamai. “Providing queryable data to customers quickly is critical. We wanted to\nmove away from on-prem to improve performance and our SLAs so the latency\nwould be seconds rather than minutes.”\n\n**Delta Lake allows us to not only query the data better but to**\n**also acquire an increase in the data volume. We’ve seen an**\n**80% increase in traffic and data in the last year, so being able**\n**to scale fast is critical.**\n\n\nAfter conducting proofs of concept with several companies, Akamai chose to\nbase its streaming analytics architecture on Spark and the Databricks Lakehouse\nPlatform. “Because of our scale and the demands of our SLA, we determined that\nDatabricks was the right solution for us,” says Patel. “When we consider storage\noptimization, and data caching, if we went with another solution, we couldn’t\nachieve the same level of performance.”\n\n**Improving speed and reducing costs**\n\nToday, the web security analytics tool ingests and transforms data, stores it\nin cloud storage, and sends the location of the file via Kafka. It then uses a\nDatabricks Job as the ingest application. Delta Lake, the open source storage\nformat at the base of the Databricks Lakehouse Platform, supports real-time\nquerying on the web security analytics data. Delta Lake also enables Akamai to\nscale quickly. “Delta Lake allows us to not only query the data better but to also\nacquire an increase in the data volume,” says Patel. “We’ve seen an 80% increase\nin traffic and data in the last year, so being able to scale fast is critical.”\n\nAkamai also uses Databricks SQL (DBSQL) and Photon, which provide extremely\n\nfast query performance. Patel added that Photon provided a significant boost\nto query performance. Overall, Databricks’ streaming architecture combined\nwith DBSQL and Photon enables Akamai to achieve real-time analytics, which\ntranslates to real-time business benefits.\n\n\n**Tomer Patel**\nEngineering Manager, Akamai\n\n\n-----\n\nPatel says he likes that Delta Lake is open source, as the company has benefitted\nfrom a community of users working to improve the product. “The fact that Delta\nLake is open source and there’s a big community behind it means we don’t need\nto implement everything ourselves,” says Patel. “We benefit from fixed bugs that\nothers have encountered and from optimizations that are contributed to the\nproject.” Akamai worked closely with Databricks to ensure Delta Lake can meet\nthe scale and performance requirements Akamai defined. 
These improvements\nhave been contributed back to the project (many of which were made available as\npart of Delta Lake 2.0), and so any user running Delta Lake now benefits from the\ntechnology being tested at such a large scale in a real-world production scenario.\n\n\n**Meeting aggressive requirements for scale,**\n**reliability and performance**\n\nUsing Spark Structured Streaming on the Databricks Lakehouse Platform enables\nthe web security analytics tool to stream vast volumes of data and provide\nlow-latency, real-time analytics-as-a-service to Akamai’s customers. That way\nAkamai is able to make available security event data to customers within the\nSLA of 5 to 7 minutes from when an attack occurs. “Our focus is performance,\nperformance, performance,” says Patel. “The platform’s performance and\nscalability are what drives us.”\n\nUsing the Databricks Lakehouse Platform, it now takes under 1 minute to ingest\nthe security event data. “Reducing ingestion time from 15 minutes to under 1\nminute is a huge improvement,” says Patel. “It benefits our customers because\nthey can see the security event data faster and they have a view of what exactly\nis happening as well as the capability to filter all of it.”\n\nAkamai’s biggest priority is to provide customers with a good experience and\nfast response times. To date, Akamai has moved about 70% of security event\ndata from its on-prem architecture to Databricks, and the SLA for customer\nquery and response time has improved significantly as a result. “Now, with the\nmove to Databricks, our customers experience much better response time, with\nover 85% of queries completing under 7 seconds.” Providing that kind of realtime data means Akamai can help its customers stay vigilant and maintain an\noptimal security configuration.\n\n\n-----\n\nSECTION 4.2\n**Grammarly uses Databricks Lakehouse to improve**\n**user experience**\n\n\n###### 110%\n\n**Faster querying, at 10% of the cost**\n**to ingest, than a data warehouse**\n\n\n###### 5 billion\n\n**Daily events available for**\n**analytics in under 15 minutes**\n\n\nGrammarly’s mission is to improve lives by improving communication. The company’s\n\ntrusted AI-powered communication assistance provides real-time suggestions to\n\nhelp individuals and teams write more confidently and achieve better results. Its\n\ncomprehensive offerings — [Grammarly Premium](https://www.grammarly.com/premium) , [Grammarly Business](https://www.grammarly.com/business) , [Grammarly for](https://www.grammarly.com/edu)\n\n[Education](https://www.grammarly.com/edu) and [Grammarly for Developers](https://developer.grammarly.com/) — deliver leading communication support\n\nwherever writing happens. 
As the company grew over the years, its legacy, homegrown\n\nanalytics system made it challenging to evaluate large data sets quickly and cost-\n\neffectively.\n\nBy migrating to the Databricks Lakehouse Platform, Grammarly is now able to sustain a\n\nflexible, scalable and highly secure analytics platform that helps 30 million people and\n\n50,000 teams worldwide write more effectively every day.\n\n\n**I N D U S T R Y**\n[Technology and Software](https://www.databricks.com/solutions/industries/technology-and-software)\n\n**S O L U T I O N**\nRecommendation Engines, Advertising\nEffectiveness, Customer Lifetime Value\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Delta Lake, Unity Catalog,\n[Machine Learning, ETL](https://www.databricks.com/product/machine-learning)\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Harnessing data to improve communications for millions of**\n**users and thousands of teams**\n\nWhen people use Grammarly’s AI communication assistance, they receive\nsuggestions to help them improve multiple dimensions of communication,\nincluding spelling and grammar correctness, clarity and conciseness, word\nchoice, style, and tone. Grammarly receives feedback when users accept, reject\nor ignore its suggestions through app-created events, which total about 5 billion\nevents per day.\n\nHistorically, Grammarly relied on a homegrown legacy analytics platform and\nleveraged an in-house SQL-like language that was time-intensive to learn and\nmade it challenging to onboard new hires. As the company grew, Grammarly\ndata analysts found that the platform did not sufficiently meet the needs of its\nessential business functions, especially marketing, sales and customer success.\nAnalysts found themselves copying and pasting data from spreadsheets\nbecause the existing system couldn’t effectively ingest the external data needed\nto answer questions such as, “Which marketing channel delivers the highest\nROI?” Reporting proved challenging because the existing system didn’t support\nTableau dashboards, and company leaders and analysts needed to ensure they\ncould make decisions quickly and confidently.\n\n\n**Databricks Lakehouse has given us the flexibility to unleash**\n**our data without compromise. That flexibility has allowed us**\n**to speed up analytics to a pace we’ve never achieved before.**\n\n**Chris Locklin**\nEngineering Manager, Data Platforms, Grammarly\n\nGrammarly also sought to unify its data warehouses in order to scale and\nimprove data storage and query capabilities. As it stood, large Amazon EMR\nclusters ran 24/7 and drove up costs. With the various data sources, the team\nalso needed to maintain access control. “Access control in a distributed file\nsystem is difficult, and it only gets more complicated as you ingest more data\nsources,” says Chris Locklin, Engineering Manager, Data Platforms at Grammarly.\nMeanwhile, reliance on a single streaming workflow made collaboration among\nteams challenging. Data silos emerged as different business areas implemented\nanalytics tools individually. “Every team decided to solve their analytics needs in\nthe best way they saw fit,” says Locklin. “That created challenges in consistency\nand knowing which data set was correct.”\n\n\n-----\n\nAs its data strategy was evolving, Grammarly’s priority was to get the most out\nof analytical data while keeping it secure. 
This was crucial because security is\nGrammarly’s number-one priority and most important feature, both in how it\nprotects its users’ data and how it ensures its own company data remains secure.\nTo accomplish that, Grammarly’s data platform team sought to consolidate\ndata and unify the company on a single platform. That meant sustaining a highly\nsecure infrastructure that could scale alongside the company’s growth, improving\ningestion flexibility, reducing costs and fueling collaboration.\n\n**Improving analytics, visualization and decision-making**\n**with the lakehouse**\n\nAfter conducting several proofs of concept to enhance its infrastructure,\nGrammarly migrated to the Databricks Lakehouse Platform. Bringing all the\nanalytical data into the lakehouse created a central hub for all data producers\nand consumers across Grammarly, with Delta Lake at the core.\n\nUsing the lakehouse architecture, data analysts within Grammarly now have a\nconsolidated interface for analytics, which leads to a single source of truth and\n\nconfidence in the accuracy and availability of all data managed by the data\nplatform team. Across the organization, teams are using Databricks SQL to\nconduct queries within the platform on both internally generated product data\nand external data from digital advertising platform partners. Now, they can easily\nconnect to Tableau and create dashboards and visualizations to present to\nexecutives and key stakeholders.\n\n\n“Security is of utmost importance at Grammarly, and our team’s numberone objective is to own and protect our analytical data,” says Locklin. “Other\ncompanies ask for your data, hold it for you, and then let you perform analytics\non it. Just as Grammarly ensures our users’ data always remains theirs, we\nwanted to ensure our company data remained ours. Grammarly’s data stays\ninside of Grammarly.”\n\nWith its data consolidated in the lakehouse, different areas of Grammarly’s\nbusiness can now analyze data more thoroughly and effectively. For example,\nGrammarly’s marketing team uses advertising to attract new business. Using\nDatabricks, the team can consolidate data from various sources to extrapolate\na user’s lifetime value, compare it with customer acquisition costs and get rapid\nfeedback on campaigns. Elsewhere, data captured from user interactions flow\ninto a set of tables used by analysts for ad hoc analysis to inform and improve\nthe user experience.\n\nBy consolidating data onto one unified platform, Grammarly has eliminated data\nsilos. “The ability to bring all these capabilities, data processing and analysis\nunder the same platform using Databricks is extremely valuable,” says Sergey\nBlanket, Head of Business Intelligence at Grammarly. “Doing everything from ETL\nand engineering to analytics and ML under the same umbrella removes barriers\nand makes it easy for everyone to work with the data and each other.”\n\n\n-----\n\nTo manage access control, enable end-to-end observability and monitor data\nquality, Grammarly relies on the data lineage capabilities within Unity Catalog.\n“Data lineage allows us to effectively monitor usage of our data and ensure it\nupholds the standards we set as a data platform team,” says Locklin. “Lineage is\nthe last crucial piece for access control. 
It allows analysts to leverage data to do\ntheir jobs while adhering to all usage standards and access controls, even when\nrecreating tables and data sets in another environment.”\n\n**Faster time to insight drives more intelligent**\n**business decisions**\n\nUsing the Databricks Lakehouse Platform, Grammarly’s engineering teams now\nhave a tailored, centralized platform and a consistent data source across the\ncompany, resulting in greater speed and efficiency and reduced costs. The\nlakehouse architecture has led to 110% faster querying, at 10% of the cost to\ningest, than a data warehouse. Grammarly can now make its 5 billion daily events\navailable for analytics in under 15 minutes rather than 4 hours, enabling lowlatency data aggregation and query optimization. This allows the team to quickly\n\nreceive feedback about new features being rolled out and understand if they are\nbeing adopted as expected. Ultimately, it helps them understand how groups\nof users engage with the UX, improving the experience and ensuring features\nand product releases bring the most value to users. “Everything my team does\nis focused on creating a rich, personalized experience that empowers people to\ncommunicate more effectively and achieve their potential,” says Locklin.\n\n\nMoving to the lakehouse architecture also solved the challenge of access control\nover distributed file systems, while Unity Catalog enabled fine-grained, rolebased access controls and real-time data lineage. “Unity Catalog gives us the\nability to manage file permissions with more flexibility than a database would\nallow,” says Locklin. “It solved a problem my team couldn’t solve at scale. While\nusing Databricks allows us to keep analytical data in-house, Unity Catalog helps\nus continue to uphold the highest standards of data protection by controlling\naccess paradigms inside our data. That opens a whole new world of things that\nwe can do.”\n\nUltimately, migrating to the Databricks Lakehouse Platform has helped\nGrammarly to foster a data-driven culture where employees get fast access\nto analytics without having to write complex queries, all while maintaining\nGrammarly’s enterprise-grade security practices. “Our team’s mission is to help\nGrammarly make better, faster business decisions,” adds Blanket. “My team\nwould not be able to effectively execute on that mission if we did not have a\nplatform like Databricks available to us.” Perhaps most critically, migrating off its\nrigid legacy infrastructure gives Grammarly the adaptability to do more while\nknowing the platform will evolve as its needs evolve. “Databricks has given us the\nflexibility to unleash our data without compromise,” says Locklin. “That flexibility\nhas allowed us to speed up analytics to a pace we’ve never achieved before.”\n\n\n-----\n\nSECTION 4.3\n**Honeywell selects Delta Live Tables for streaming data**\n\nCompanies are under growing pressure to reduce energy use, while at the same time\n\nthey are looking to lower costs and improve efficiency. Honeywell delivers industry-\n\nspecific solutions that include aerospace products and services, control technologies\n\nfor buildings and industry, and performance materials globally. 
Honeywell’s Energy\n\nand Environmental Solutions division uses IoT sensors and other technologies to help\n\nbusinesses worldwide manage energy demand, reduce energy consumption and carbon\n\nemissions, optimize indoor air quality, and improve occupant well-being.\n\nAccomplishing this requires Honeywell to collect vast amounts of data. Using Delta Live\n\nTables on the Databricks Lakehouse Platform, Honeywell’s data team can now ingest\n\nbillions of rows of sensor data into Delta Lake and automatically build SQL endpoints for\n\nreal-time queries and multilayer insights into data at scale — helping Honeywell improve\n\nhow it manages data and extract more value from it, both for itself and for its customers.\n\n\n**I N D U S T R Y**\n[Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Delta Lake, Delta Live Tables\n\n\n**C LO U D**\n[Azure](https://databricks.com/product/azure) **Databricks helps us pull together many different data sources, do**\n**aggregations, and bring the significant amount of data we collect**\n**from our buildings under control so we can provide customers value.**\n\n**Dr. Chris Inkpen**\nGlobal Solutions Architect, Honeywell Energy and Environmental Solutions\n\n\n-----\n\n**Processing billions of IoT data points per day**\n\nHoneywell’s solutions and services are used in millions of buildings around the\nworld. Helping its customers create buildings that are safe, more sustainable\nand productive can require thousands of sensors per building. Those sensors\nmonitor key factors such as temperature, pressure, humidity and air quality.\nIn addition to the data collected by sensors inside a building, data is also\ncollected from outside, such as weather and pollution data. Another data set\nconsists of information about the buildings themselves — such as building\ntype, ownership, floor plan, square footage of each floor and square footage\nof each room. That data set is combined with the two disparate data streams,\nadding up to a lot of data across multiple structured and unstructured formats,\nincluding images and video streams, telemetry data, event data, etc. At peaks,\nHoneywell ingests anywhere between 200 to 1,000 events per second for any\nbuilding, which equates to billions of data points per day. Honeywell’s existing\ndata infrastructure was challenged to meet such demand. It also made it difficult\nfor Honeywell’s data team to query and visualize its disparate data so it could\nprovide customers with fast, high-quality information and analysis.\n\n**ETL simplified: high-quality, reusable data pipelines**\n\nWith Delta Live Tables (DLT) on the Databricks Lakehouse Platform, Honeywell’s\ndata team can now ingest billions of rows of sensor data into Delta Lake and\nautomatically build SQL endpoints for real-time queries and multilayer insights\ninto data at scale. “We didn’t have to do anything to get DLT to scale,” says Dr.\n\n\nChris Inkpen, Global Solutions Architect at Honeywell Energy and Environmental\nSolutions. “We give the system more data, and it copes. Out of the box, it’s given\nus the confidence that it will handle whatever we throw at it.”\n\nHoneywell credits the Databricks Lakehouse Platform for helping it to unify its\nvast and varied data — batch, streaming, structured and unstructured — into\none platform. “We have many different data types. 
The Databricks Lakehouse\nPlatform allows us to use things like Apache Kafka and Auto Loader to load and\nprocess multiple types of data and treat everything as a stream of data, which is\nawesome. Once we’ve got structured data from unstructured data, we can write\nstandardized pipelines.”\n\nHoneywell data engineers can now build and leverage their own ETL pipelines\nwith Delta Live Tables and gain insights and analytics quickly. ETL pipelines can\nbe reused regardless of environment, and data can run in batches or streams. It’s\nalso helped Honeywell’s data team transition from a small team to a larger team.\n“When we wrote our first few pipelines before DLT existed, only one person could\nwork in one part of the functionality. Now that we’ve got DLT and the ability to\nhave folders with common functionality, we’ve got a really good platform where\nwe can easily spin off different pipelines.”\n\nDLT also helped Honeywell establish standard log files to monitor and costjustify its product pipelines. “Utilizing DLT, we can analyze which parts of our\npipeline need optimization,” says Inkpen. “With standard pipelines, that was\nmuch more chaotic.”\n\n\n-----\n\n**Enabling ease, simplicity and scalability across the**\n**infrastructure**\n\nDelta Live Tables has helped Honeywell’s data team consistently query\ncomplex data while offering simplicity of scale. It also enables end-to-end data\nvisualization of Honeywell’s data streams as they flow into its infrastructure, are\ntransformed, and then flow out. “Ninety percent of our ETL is now captured in\ndiagrams, so that’s helped considerably and improves data governance. DLT\nencourages — and almost enforces — good design,” says Inkpen.\n\nUsing the lakehouse as a shared workspace has helped promote teamwork and\ncollaboration at Honeywell. “The team collaborates beautifully now, working\ntogether every day to divvy up the pipeline into their own stories and workloads,”\nsays Inkpen.\n\nMeanwhile, the ability to manage streaming data with low latency and better\nthroughput has improved accuracy and reduced costs. “Once we’ve designed\nsomething using DLT, we’re pretty safe from scalability issues — certainly a\nhundred times better than if we hadn’t written it in DLT,” says Inkpen. “We can\nthen go back and look at how we can take a traditional job and make it more\nperformant and less costly. We’re in a much better position to try and do that\nfrom DLT.”\n\n\nUsing Databricks and DLT also helps the Honeywell team perform with greater\nagility, which allows them to innovate faster while empowering developers to\nrespond to user requirements almost immediately. “Our previous architecture\nmade it impossible to know what bottlenecks we had and what we needed to\nscale. Now we can do data science in near real-time.”\n\nUltimately, Honeywell can now more quickly provide its customers with the\ndata and analysis they need to make their buildings more efficient, healthier\nand safer for occupants. “I’m continuously looking for ways to improve our\nlifecycles, time to market, and data quality,” says Inkpen. “Databricks helps\nus pull together many different data sources, do aggregations, and bring the\nsignificant amount of data we collect from our buildings under control so we\ncan provide customers value.”\n\n**Ready to get started? 
Learn more about** **[Delta Live Tables here](https://www.databricks.com/product/delta-live-tables)** **.**\n\n\n-----\n\nSECTION 4.4\n**Wood Mackenzie helps customers transition to a more**\n**sustainable future**\n\n\n###### 12 Billion\n\n**Data points processed**\n**each week**\n\n\n###### 80-90%\n\n**Reduction in**\n**processing time**\n\n\n###### Cost Savings\n\n**In operations through**\n**workflow automation**\n\n\nWood Mackenzie offers customized consulting and analysis for a wide range of clients\n\nin the energy and natural resources sectors. Founded in Edinburgh, the company first\n\ncultivated deep expertise in upstream oil and gas, then broadened its focus to deliver\n\ndetailed insight for every interconnected sector of the energy, chemicals, metals and\n\nmining industries.\n\nToday it sees itself playing an important role in the transition to a more sustainable\n\nfuture. Using Databricks Workflows to automate ETL pipelines helps Wood Mackenzie\n\ningest and process massive amounts of data. Using a common workflow provided\n\nhigher visibility to engineering team members, encouraging better collaboration. With\n\nan automated, transparent workflow in place, the team saw improved productivity and\n\ndata quality and an easier path to fix pipeline issues when they arise.\n\n\n**I N D U S T R Y**\n[Energy and Utilities](https://www.databricks.com/solutions/industries/oil-and-gas)\n\n**P L AT F O R M U S E C A S E**\nLakehouse, Workflows\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Delivering insights to the energy industry**\n\nFulfilling Wood Mackenzie’s mission, the Lens product is a data analytics platform\nbuilt to deliver insights at key decision points for customers in the energy sector.\nFeeding into Lens are vast amounts of data collected from various data sources\nand sensors used to monitor energy creation, oil and gas production, and more.\nThose data sources update about 12 billion data points every week that must\nbe ingested, cleaned and processed as part of the input for the Lens platform.\nYanyan Wu, Vice President of Data at Wood Mackenzie, manages a team of big\ndata professionals that build and maintain the ETL pipeline that provides input\ndata for Lens. The team is leveraging the Databricks Lakehouse Platform and\nuses Apache Spark™ for parallel processing, which provides greater performance\nand scalability benefits compared to an earlier single-node system working\nsequentially. “We saw a reduction of 80-90% in data processing time, which\nresults in us providing our clients with more up-to-date, more complete and\nmore accurate data,” says Wu.\n\n**Our mission is to transform the way we power the planet.**\n**Our clients in the energy sector need data, consulting services**\n**and research to achieve that transformation. Databricks**\n**Workflows gives us the speed and flexibility to deliver the**\n**insights our clients need.**\n\n\n**Improved collaboration and transparency with a common**\n**workflow**\n\nThe data pipeline managed by the team includes several stages for standardizing\nand cleaning raw data, which can be structured or unstructured and may be in\nthe form of PDFs or even handwritten notes.\n\nDifferent members of the data team are responsible for different parts of\nthe pipeline, and there is a dependency between the processing stages each\nteam member owns. Using [Databricks Workflows](https://www.databricks.com/product/workflows) , the team defined a common\nworkstream that the entire team uses. 
Each stage of the pipeline is implemented\nin a Python notebook, which is run as a job in the main workflow.\n\nEach team member can now see exactly what code is running on each stage,\nmaking it easy to find the cause of the issue. Knowing who owns the part of the\npipeline that originated the problem makes fixing issues much faster. “Without\na common workflow, different members of the team would run their notebooks\nindependently, not knowing that failure in their run affected stages downstream,”\nsays Meng Zhang, Principal Data Analyst at Wood Mackenzie. “When trying to\nrerun notebooks, it was hard to tell which notebook version was initially run and\nthe latest version to use.”\n\n\n**Yanyan Wu**\nVice President of Data, Wood Mackenzie\n\n\n-----\n\nUsing Workflows’ alerting capabilities to notify the team when a workflow task\nfails ensures everyone knows a failure occurred and allows the team to work\ntogether to resolve the issue quickly. The definition of a common workflow\ncreated consistency and transparency that made collaboration easier. “Using\nDatabricks Workflows allowed us to encourage collaboration and break up the\nwalls between different stages of the process,” explains Wu. “It allowed us all to\nspeak the same language.”\n\nCreating transparency and consistency is not the only advantage the team saw.\nUsing Workflows to automate notebook runs also led to cost savings compared\nto running interactive notebooks manually.\n\n**Improved code development productivity**\n\nThe team’s ETL pipeline development process involves iteration on PySpark\nnotebooks. Leveraging [interactive notebooks](https://www.databricks.com/product/collaborative-notebooks) in the Databricks UI makes it easy\nfor data professionals on the team to manually develop and test a notebook.\nBecause Databricks Workflows supports running notebooks as task type\n(along with Python files, JAR files and other types), when the code is ready for\nproduction, it’s easy and cost effective to automate it by adding it to a workflow.\nThe workflow can then be easily revised by adding or removing any steps to\nor from the defined flow. This way of working keeps the benefit of manually\ndeveloping notebooks with the interactive notebook UI while leveraging the\npower of automation, which reduces potential issues that may happen when\nrunning notebooks manually.\n\nThe team has gone even further in increasing productivity by developing a\nCI/CD process. “By connecting our source control code repository, we know\nthe workflow always runs the latest code version we committed to the repo,”\nexplains Zhang. “It’s also easy to switch to a development branch to develop a\nnew feature, fix a bug and run a development workflow. When the code passes\nall tests, it is merged back to the main branch and the production workflow is\nautomatically updated with the latest code.”\n\nGoing forward, Wood Mackenzie plans to optimize its use of Databricks\nWorkflows to automate machine learning processes such as model training,\nmodel monitoring and handling model drift. The firm uses ML to improve its data\nquality and extract insights to provide more value to its clients. “Our mission is to\ntransform how we power the planet,” Wu says. “Our clients in the energy sector\nneed data, consulting services and research to achieve that transformation.\nDatabricks Workflows gives us the speed and flexibility to deliver the insights our\nclients need.”\n\n\n-----\n\n
SECTION 4.5\n**Rivian redefines driving experience with**\n**the Databricks Lakehouse**\n\n###### 250 platform users\n\n**A 50x increase from a year ago**\n\nRivian is preserving the natural world for future generations with revolutionary Electric\n\nAdventure Vehicles (EAVs). With over 25,000 EAVs on the road generating multiple\n\nterabytes of IoT data per day, the company is using data insights and machine\n\nlearning to improve vehicle health and performance. However, with legacy cloud\n\ntooling, it struggled to scale pipelines cost-effectively and spent significant resources\n\non maintenance — slowing its ability to be truly data driven.\n\nSince moving to the Databricks Lakehouse Platform, Rivian can now understand how\n\na vehicle is performing and how this impacts the driver using it. Equipped with these\n\ninsights, Rivian is innovating faster, reducing costs, and ultimately, delivering a better\n\ndriving experience to customers.\n\n\n**I N D U S T R Y**\n[Manufacturing](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)\n\n**S O L U T I O N**\nPredictive Maintenance, Scaling ML Models\nfor IoT, Data-Driven ESG\n\n**P L AT F O R M**\n[Lakehouse](https://www.databricks.com/product/data-lakehouse) , [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , [Unity Catalog](https://www.databricks.com/product/unity-catalog)\n\n**C LO U D**\n[AWS](https://www.databricks.com/product/aws)\n\n\n-----\n\n**Struggling to democratize data on a legacy platform**\n\n\nBuilding a world that will continue to be enjoyed by future generations requires\na shift in the way we operate. At the forefront of this movement is Rivian —\nan electric vehicle manufacturer focused on shifting our planet’s energy and\ntransportation systems entirely away from fossil fuel. Today, Rivian’s fleet\nincludes personal vehicles and involves a partnership with Amazon to deliver\n100,000 commercial vans. Each vehicle uses IoT sensors and cameras to\ncapture petabytes of data ranging from how the vehicle drives to how various\nparts function. With all this data at its fingertips, Rivian is using machine learning\nto improve the overall customer experience with predictive maintenance so that\npotential issues are addressed before they impact the driver.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility\nand tooling limitations that decreased output, prevented collaboration and\nincreased operational costs. It had 30 to 50 large and operationally complicated\ncompute clusters at any given time, which was costly. Not only was the system\ndifficult to manage, but the company experienced frequent cluster outages\nas well, forcing teams to dedicate more time to troubleshooting than to data\nanalysis. Additionally, data silos created by disjointed systems slowed the\nsharing of data, which further contributed to productivity issues. Required data\nlanguages and specific expertise of toolsets created a barrier to entry that\nlimited developers from making full use of the data available. Jason Shiverick,\nPrincipal Data Scientist at Rivian, said the biggest issue was the data access. “I\nwanted to open our data to a broader audience of less technical users so they\ncould also leverage data more easily.”\n\nRivian knew that once its EAVs hit the market, the amount of data ingested would\nexplode. In order to deliver the reliability and performance it promised, Rivian\nneeded an architecture that would not only democratize data access, but also\nprovide a common platform to build innovative solutions that can help ensure a\nreliable and enjoyable driving experience.\n\n**Databricks Lakehouse empowers us to lower the barrier of**\n**entry for data access across our organization so we can build**\n**the most innovative and reliable electric vehicles in the world.**\n\n**Wassym Bensaid**\nVice President of Software Development, Rivian\n\n\n-----\n\n
**Predicting maintenance issues with Databricks Lakehouse**\n\nRivian chose to modernize its data infrastructure on the Databricks Lakehouse\nPlatform, giving it the ability to unify all of its data into a common view for\ndownstream analytics and machine learning. Now, unique data teams have\na range of accessible tools to deliver actionable insights for different use\ncases, from predictive maintenance to smarter product development. Venkat\nSivasubramanian, Senior Director of Big Data at Rivian, says, “We were able\nto build a culture around an open data platform that provided a system for\nreally democratizing data and analysis in an efficient way.” Databricks’ flexible\nsupport of all programming languages and seamless integration with a variety of\ntoolsets eliminated access roadblocks and unlocked new opportunities. Wassym\nBensaid, Vice President of Software Development at Rivian, explains, “Today we\nhave various teams, both technical and business, using Databricks Lakehouse\nto explore our data, build performant data pipelines, and extract actionable\nbusiness and product insights via visual dashboards.”\n\nRivian’s ADAS (advanced driver-assistance systems) Team can now easily\nprepare telemetric accelerometer data to understand all EAV motions. This core\nrecording data includes information about pitch, roll, speed, suspension and\nairbag activity, to help Rivian understand vehicle performance, driving patterns\nand connected car system predictability. Based on these key performance\nmetrics, Rivian can improve the accuracy of smart features and the control\nthat drivers have over them. Designed to take the stress out of long drives and\ndriving in heavy traffic, features like adaptive cruise control, lane change assist,\nautomatic emergency driving, and forward collision warning can be honed over\ntime to continuously optimize the driving experience for customers.\n\nSecure data sharing and collaboration was also facilitated with the Databricks\nUnity Catalog. Shiverick describes how unified governance for the lakehouse\nbenefits Rivian productivity. “Unity Catalog gives us a truly centralized data\ncatalog across all of our different teams,” he said. “Now we have proper access\nmanagement and controls.” Venkat adds, “With Unity Catalog, we are centralizing\ndata catalog and access management across various teams and workspaces,\nwhich has simplified governance.” End-to-end version controlled governance\nand auditability of sensitive data sources, like the ones used for autonomous\ndriving systems, produces a simple but secure solution for feature engineering.\nThis gives Rivian a competitive advantage in the race to capture the autonomous\ndriving grid.\n\n\n-----\n\n
**Accelerating into an electrified and sustainable world**\n\n\nBy scaling its capacity to deliver valuable data insights with speed, efficiency\nand cost-effectiveness, Rivian is primed to leverage more data to improve\noperations and the performance of its vehicles to enhance the customer\nexperience. Venkat says, “The flexibility that lakehouse offers saves us a lot of\nmoney from a cloud perspective, and that’s a huge win for us.” With Databricks\nLakehouse providing a unified and open source approach to data and analytics,\nthe Vehicle Reliability Team is able to better understand how people are using\ntheir vehicles, and that helps to inform the design of future generations of\nvehicles. By leveraging the Databricks Lakehouse Platform, they have seen a\n30%–50% increase in runtime performance, which has led to faster insights and\nmodel performance.\n\nShiverick explains, “From a reliability standpoint, we can make sure that\ncomponents will withstand appropriate lifecycles. It can be as simple as\nmaking sure door handles are beefy enough to endure constant usage, or as\ncomplicated as predictive and preventative maintenance to eliminate the\nchance of failure in the field. Generally speaking, we’re improving software quality\nbased on key vehicle metrics for a better customer experience.”\n\n\nFrom a design optimization perspective, Rivian’s unobstructed data view is also\nproducing new diagnostic insights that can improve fleet health, safety, stability\nand security. Venkat says, “We can perform remote diagnostics to triage a\nproblem quickly, or have a mobile service come in, or potentially send an OTA\nto fix the problem with the software. All of this needs so much visibility into\nthe data, and that’s been possible with our partnership and integration on the\nplatform itself.” Developers are actively building vehicle software to improve\nissues along the way.\n\nMoving forward, Rivian is seeing rapid adoption of Databricks Lakehouse across\ndifferent teams — increasing the number of platform users from 5 to 250 in only\none year. This has unlocked new use cases including using machine learning to\noptimize battery efficiency in colder temperatures, increasing the accuracy of\nautonomous driving systems, and serving commercial depots with vehicle health\ndashboards for early and ongoing maintenance. As more EAVs ship, and its fleet\nof commercial vans expands, Rivian will continue to leverage the troves of data\ngenerated by its EAVs to deliver new innovations and driving experiences that\nrevolutionize sustainable transportation.\n\n\n-----\n\nSECTION 4.6\n**Migrating to the cloud to better serve**\n**millions of customers**\n\n\n###### 300%\n\n**ROI from OpEx savings**\n**and cost avoidance**\n\n\n###### 3X\n\n**Faster delivery of ML/data**\n**science use cases**\n\n\nConsistency in innovation is what keeps customers with a telecommunications company\n\nand is why AT&T is ranked among the best. However, AT&T’s massive on-premises legacy\n\nHadoop system proved complex and costly to manage, impeding operational agility\n\nand efficiency and engineering resources. The need to pivot to cloud to better support\n\nhundreds of millions of subscribers was apparent.\n\nMigrating from Hadoop to Databricks on the Azure cloud, AT&T experienced significant\n\nsavings in operating costs. 
Additionally, the new cloud-based environment has unlocked\n\naccess to petabytes of data for correlative analytics and an AI-as-a-Service offering for\n\n2,500+ users across 60+ business units. AT&T can now leverage all its data — without\n\noverburdening its engineering team or exploding operational costs — to deliver new\n\nfeatures and innovations to its millions of end users.\n\n\n**I N D U S T R Y**\n[Communication Service Providers](https://www.databricks.com/solutions/industries/telco-industry-solutions)\n\n**S O L U T I O N**\nCustomer Retention, Subscriber Churn\nPrediction, Threat Detection\n\n**P L AT F O R M**\nLakehouse, Data Science, Machine Learning,\n[Data Streaming](https://www.databricks.com/product/data-streaming)\n\n**C LO U D**\n[Azure](https://www.databricks.com/product/azure)\n\n\n-----\n\n**Hadoop technology adds operational complexity and**\n**unnecessary costs**\n\nAT&T is a technology giant with hundreds of millions of subscribers and ingests\n10+ petabytes[ [a](https://www.databricks.com/blog/2022/04/11/data-att-modernization-lakehouse.html) ] of data across the entire data platform each day. To harness\nthis data, it has a team of 2,500+ data users across 60+ business units to ensure\nthe business is data powered — from building analytics to ensure decisions are\nbased on the best data-driven situation awareness to building ML models that\nbring new innovations to its customers. To support these requirements, AT&T\nneeded to democratize and establish a data single version of truth (SVOT) while\nsimplifying infrastructure management to increase agility and lower overall costs.\n\nHowever, physical infrastructure was too resource intensive. The combination\nof a highly complex hardware setup (12,500 data sources and 1,500+ servers)\ncoupled with an on-premises Hadoop architecture proved complex to\nmaintain and expensive to manage. Not only were the operational costs to\nsupport workloads high, but there were also additional capital costs around\ndata centers, licensing and more. Up to 70% of the on-prem platform had to\n\nbe prioritized to ensure 50K data pipeline jobs succeeded and met SLAs and\n\ndata quality objectives. Engineers’ time was focused on managing updates,\n\nfixing performance issues or simply provisioning resources rather than focusing\n\non higher-valued tasks. The resource constraints of physical infrastructure\n\nalso drove serialization of data science activities, slowing innovation. Another\n\nhurdle faced in operationalizing petabytes of data was the challenge of building\n\nstreaming data pipelines for real-time analytics, an area that was key to\n\nsupporting innovative use cases required to better serve its customers.\n\n\nWith these deeply rooted technology issues, AT&T was not in the best position\nto achieve its goals of increasing its use of insights for improving its customer\nexperience and operating more efficiently. “To truly democratize data across\nthe business, we needed to pivot to a cloud-native technology environment,”\nsaid Mark Holcomb, Distinguished Solution Architect at AT&T. “This has freed\nup resources that had been focused on managing our infrastructure and move\nthem up the value chain, as well as freeing up capital for investing in growth-oriented initiatives.”\n\n**A seamless migration journey to Databricks**\n\nAs part of its due diligence, AT&T ran a comprehensive cost analysis and\nconcluded that Databricks was both the fastest and achieved the best price/\nperformance for data pipelines and machine learning workloads. AT&T knew the\nmigration would be a massive undertaking. As such, the team did a lot of upfront\nplanning — they prioritized migrating their largest workloads first to immediately\nreduce their infrastructure footprint. They also decided to migrate their data\nbefore migrating users to ensure a smooth transition and experience for their\nthousands of data practitioners.\n\n\n**The migration from Hadoop to Databricks enables us to bring**\n**more value to our customers and do it more cost-efficiently**\n**and much faster than before.**\n\n**Mark Holcomb**\nDistinguished Solution Architect, AT&T\n\n\n-----\n\n
They spent a year deduplicating and synchronizing data to the cloud before\nmigrating any users. This was a critical step in ensuring the successful migration\nof such a large, complex multi-tenant environment of 2,500+ users from 60+\nbusiness units and their workloads. The user migration process occurred over\nnine months and enabled AT&T to retire on-premises hardware in parallel with\nmigration to accelerate savings as early as possible. Plus, due to the horizontal,\nscalable nature of Databricks, AT&T didn’t need to have everything in one\ncontiguous environment. Separating data and compute, and across multiple\naccounts and workspaces, ensured analytics worked seamlessly without any API\ncall limits or bandwidth issues and consumption clearly attributed to the 60+\nbusiness units.\n\nAll in all, AT&T migrated over 1,500 servers, more than 50,000 production CPUs,\n12,500 data sources and 300 schemas. The entire process took about two and a\nhalf years. And it was able to manage the entire migration with the equivalent of\n15 full-time internal resources. “Databricks was a valuable collaborator throughout\nthe process,” said Holcomb. “The team worked closely with us to resolve product\nfeatures and security concerns to support our migration timeline.”\n\n**Databricks reduces TCO and opens new paths to**\n**innovation**\n\nOne of the immediate benefits of moving to Databricks was huge cost savings.\nAT&T was able to rationalize about 30% of its data by identifying and not\nmigrating underutilized and duplicate data. And prioritizing the migration of\nthe largest workloads allowed half the on-prem equipment to be rationalized\n\n\nduring the course of the migration. “By prioritizing the migration of our most\ncompute-intensive workloads to Databricks, we were able to significantly drive\ndown costs while putting us in position to scale more efficiently moving forward,”\nexplained Holcomb. The result is an anticipated 300% five-year migration ROI\nfrom OpEx savings and cost avoidance (e.g., not needing to refresh data center\nhardware).\n\nWith data readily available and the means to analyze data at any scale, teams\nof citizen data scientists and analysts can now spend more time innovating,\ninstead of serializing analytics efforts or waiting on engineering to provide the\nnecessary resources — or having data scientists spend their valuable time\non less complex or less insightful analyses. 
Data scientists are now able to\ncollaborate more effectively and speed up machine learning workflows so that\nteams can deliver value more quickly, with a 3x faster time to delivery for new\ndata science use cases.\n\n“Historically you would have had operations in one system and analytics in a\nseparate one,” said Holcomb. “Now we can do more use cases like operational\nanalytics in a platform that fosters cross-team collaboration, reduces cost and\nimproves the consistency of answers.” Since migrating to Databricks, AT&T now\nhas a single version of truth to create new data-driven opportunities, including\na self-serve AI-as-a-Service analytics platform that will enable new revenue\nstreams and help it continue delivering exceptional innovations to its millions\nof customers.\n\n\n-----\n\n#### About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "##### EBOOK\n\n# 8 Steps to Becoming an AI-Forward Retailer\n\n\n-----\n\n## Contents\n\n\nIntroduction .............................................................................................................................................................................................. **3**\n\nThe State of the Retail Industry:\n\nThe Diverging Performance of Data Leaders vs. Data Laggards ...................................................................................... **4**\n\nBegin With a Shared Vision of Success ....................................................................................................................................... **6**\n\nWhy Companies Struggle With Setting Clear Business Outcomes for AI ................................................................... **7**\n\nBefore Diving In: Assess Your Readiness ..................................................................................................................................... **9**\n\nGetting Started: Putting Some Wins on the Board .................................................................................................................. **11**\n\nGoing Big: Learning to Embrace Transformational Change ............................................................................................... **12**\n\nNormalizing the Process: Engraining a Data-Driven Mindset\n\nInto the Fabric of the Business ...................................................................................................................................................... **14**\n\nFrom Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise .......................................... 
**16**\n\nThe 8 Steps to Building a Data-Forward Retailer ................................................................................................................... **17**\n\nTransform Retail Data Into Actionable Insights ....................................................................................................................... **21**\n\n\n-----\n\n## Introduction\n\n\nIn a world where data is king, retailers have historically been trailblazers, pioneering data technology\nadoption to supercharge their operations, enhance customer understanding and sharpen\npersonalization. The journey began with the simple cash register about 150 years ago, progressed to\nstandardized product reporting with the introduction of the UPC and EAN, and has evolved to include\ncutting-edge technologies such as RFID and machine learning.\n\nToday, we stand on the brink of “Generation AI,” defined by sophisticated language models and\nimages. Retailers, with their history of embracing data technologies, find themselves in a strong\nposition to reap the benefits of this new era. Automation of customer service, supply chain modeling\nwith digital twins and delivering hyper-personalized experiences in real time are all in the cards,\npromising to bolster revenue, improve margins and slash costs for early adopters.\n\nAccording to an internal analysis by Databricks, data pioneers are already outstripping their\ncompetition. The “Databricks 30” — an index tracking the publicly traded data and AI leaders across\nsix major industry sectors, including retail — shows these front-runners outperforming the rest of the\nmarket by an impressive and increasing margin. It’s clear: retailers integrating data and AI strategies\nare setting themselves up for significant gains and a robust competitive advantage.\n\nHowever, for retailers mired in the landscape of outdated data platforms, the transformation into an\nAI-driven organization can seem a Herculean task. Embracing this wave of innovative technologies may\nfeel overwhelming, yet it’s clear that those who make the leap stand to gain significantly in the rapidly\nevolving retail landscape.\n\nTo help you navigate the rapidly evolving world of retail and consumer goods, this eBook provides a\nroad map for organizations embarking on digital transformation journeys — a shift that is as much\nabout culture as it is about technology, if not more so. The core advice? Start with a crystal-clear\nvision for transformation, outlining a compelling case for why such change is vital for the company’s\nlong-term survival. Then, initiate the process by introducing AI to make gradual enhancements in\ncritical business procedures.\n\n\n-----\n\n## The State of the Retail Industry: The Diverging Performance of Data Leaders vs. Data Laggards\n\n\nThe pandemic’s fallout has led to a widening chasm between the retail industry’s\nleaders and laggards. McKinsey & Company encapsulated this trend succinctly:\n“Companies with tech-forward business models, who were already pulling ahead\npre-crisis, left their competitors in the dust.”\n\nBut what exactly is a “tech-forward business model”? It isn’t a simple narrative of\ndigital natives dethroning traditional retailers. Heavyweights like Walmart, Target\nand Costco held their own against Amazon. 
Nor was it purely a matter of scale —\nsmaller brands like Warby Parker or Everlane managed to carve out substantial\nconsumer bases, competing against larger, established players.\n\n**The common denominator among all victors**\n**was their ability to harness data, analytics and AI**\n**to rapidly react to shifts in consumer behavior.**\n\n\nThese businesses deftly used consumer demand insights to understand the\neffects of supply chain disruptions and labor shortages and reallocate resources\nto mitigate the most harmful impacts. They adeptly introduced new delivery\nmethods, optimizing operations to alleviate the pressure these modes exerted\non margins. They successfully established tighter partnerships with suppliers\nand logistic entities, collaborating toward shared triumphs.\n\nIn all these instances, it was their timely access to information, foresight\ndriven by this data, and the exploration of probable outcomes that set these\norganizations apart. Infusing data-driven decision-making into core processes\nwithin the organization, as well as those crossing partner boundaries, unlocked\nthis approach’s full potential.\n\nTo illustrate the significance of prioritizing data and AI, we developed the\nDatabricks 30 Index. Drawing inspiration from Morgan Stanley’s “Data Era”\nstocks research, this index tracks marquee customers across our top five\nverticals and partners. The Databricks 30 is an equal-weight price index,\n\ncomposed of five marquee customers each across Retail/Consumer Products,\nFinancial Services, Healthcare, Media/Entertainment, Manufacturing/Logistics,\nplus five strategic partners.\n\n\n-----\n\nOur analysis reveals that companies in the Databricks 30 Index outpaced the\nS&P 500 by an impressive +21 percentage points (pp) over the past three years.\nIn other words, if the stock market rose by 50% during this period, the Databricks\n30 Index would have soared by 71% (outperforming by 21pp). Even more\nremarkable, excluding tech entirely from the Databricks 30, the Databricks 30\nex-Tech index outperforms the S&P 500 by an even larger margin over the same\ntime frame: +23pp.\n\n\n[Chart: DB30 vs. DOW30 index performance, 01-01-2019 to 01-01-2023]\n\n\nSimilar to Morgan Stanley’s analysis, we find that non-tech U.S. companies that\nare investing in cloud, data and innovation do, in fact, win.\n\n\nSo now that we see the impact, let’s dive into the steps retail organizations can\ntake to put themselves on a trajectory of continued growth and success amid an\never-changing landscape.\n\n\n-----\n\n## Begin With a Shared Vision of Success\n\n\nThe most overlooked activity in becoming an AI-forward retailer is the most\ncrucial. In the rush to secure a position on the AI frontier, many companies\nare leaping before they look, embarking on AI initiatives without a clear\nunderstanding of what they want to achieve. Simply adopting the newest,\nshiniest tech tools isn’t a silver bullet. Many companies set themselves up for\nfailure by neglecting to clearly define the expected business outcomes at the\nonset of the initiative, a strategic move that can effectively reduce project risk\nand costs and lead to the ultimate success of the program. In fact, in an attempt\nto accelerate results, this cavalier approach can instead spiral into expensive\nmistakes, wasted resources and a decrease in trust for stakeholders from\nunmet expectations. 
It’s like setting sail on an open ocean without a destination\nin mind; the journey might provide some interesting detours, but it lacks\ndirection and purpose.\n\nHowever, when organizations take the time to articulate their expected\nbusiness outcomes before deploying AI and data-driven programs, they position\nthemselves to reduce project risk and costs. By aligning AI initiatives with\nspecific business objectives and creating a shared vision with stakeholders,\nthe focus becomes less about the technology itself and more about how it\ncan be used to reach these defined goals.\n\n\nTechnology decisions, too, are improved by having a known target. Without\nclear business outcomes in mind, companies tend to design, develop and\nimplement technologies that _might_ be needed to solve the problem. Aligning\nthe technical road map and activities with business outcomes mitigates the\nrisk of misallocated resources and the potential fallout from the unfulfilled\npromise of AI.\n\nFurthermore, a clear understanding of expected business outcomes allows\nfor efficient project management and cost control. Companies can set key\nperformance indicators (KPIs) tied directly to these outcomes. This not only\nprovides a means to measure progress, but also helps control costs by\nensuring that resources are targeted toward initiatives that deliver value.\n\nIt’s not just about numbers either; having explicit objectives aids in cultivating\n\nstakeholder buy-in. Clear communication about the purpose and potential\nbenefits of an AI initiative can foster support from executives, employees,\ninvestors and customers alike. This collective backing can further mitigate risk\nand cut costs by ensuring that everyone is pulling in the same direction.\n\n\n-----\n\n## Why Companies Struggle With Setting Clear Business Outcomes for AI\n\n\nGetting started with AI at your organization might be daunting, and that’s\nbecause it is a big undertaking! Struggling to define clear outcomes for AI\nprojects is a common issue among many businesses for a variety of reasons.\nHere are some key factors that contribute to this challenge:\n\n**They believe the data strategy is a technology problem.**\n\nCompanies often hire a chief data officer, or make the data strategy\nthe responsibility of the technology organization.\n\n**They lack an understanding of their business processes**\nAn alarming number of businesses jump onto the AI bandwagon without\nunderstanding how their business operates. Decisions are made at\nthe leadership level, but how they translate to operational decisions is\nmuddled. Data and AI are fundamentally business process technologies,\n\nand without fully understanding how the business works, any initiative\nin data and AI is bound to have limited success.\n\n\n**They lack a data culture**\n\nSomewhat related to the previous point, many companies have teams\nthat make decisions based on experience and intuition. These should\nnot be discounted, but the reason for intuition is often a result of a\npoor definition of processes, which prevents the ability to measure\nand improve processes.\n\n**They struggle to get high-quality data**\n\nAI projects require good-quality, relevant data. 
Many businesses\nstruggle with issues related to data access, quality, privacy and\nsecurity, which can complicate the process of defining clear outcomes.\n\n**They lack the organizational structures required**\n\nImplementing AI often requires significant changes in business\n\nprocesses, organizational structures and even corporate culture.\nMany companies find it hard to manage these changes, leading to\ndifficulties in setting and achieving clear outcomes.\n\n\n-----\n\nData and AI programs are a business process problem first, and a\ntechnology problem last. Familiarity with technology is important, but\nirrelevant if companies do not understand it.\n\nAddressing these challenges often requires companies to invest in\neducation about AI capabilities, to formulate clear strategies, to manage\nchange effectively, and to bring on board the necessary skills either\nby hiring new talent or upskilling existing employees. It’s a journey that\nrequires commitment, but the potential benefits of successful AI initiatives\nmake it a worthwhile venture.\n\n\n**They don’t have the right people in place**\n\nThere’s often a gap between the skills available within a company and\nthe skills needed to define and achieve AI outcomes. Without team\nmembers who understand AI, data analysis and project management,\nbusinesses can struggle to set clear objectives for AI initiatives.\n\n**They struggle to quantify the value of AI projects**\n\nAI’s benefits can sometimes be intangible or long-term, making them\ndifficult to quantify. Companies may struggle to define outcomes in\nmeasurable terms, complicating the process of setting objectives\nand monitoring progress.\n\n\n-----\n\n## Before Diving In: Assess Your Readiness\n\n\nThere is a growing sense of urgency for organizations relatively new to data\nand AI-driven enablement to “get in the game.” Profiles of top performers and\nheadline-making achievements create a clearer sense of what is possible\nand what can be gained, leaving those entering into the space eager to achieve\nsimilar results.\n\nBut what’s missing in those articles are the sustained investments in\nprocess, people and technology and the numerous challenges, missteps and\noutright failures that had to occur before success was achieved. Data-driven\ntransformation is a journey, and before any successful journey is pursued,\nit’s wise to reflect on the organization’s readiness so that you can anticipate\nchallenges and identify areas for remediation and improvement that will\ndeliver you to your intended destination.\n\nWith this in mind, we encourage organizations new to this space to\nassess their maturity in terms of the use and management of their existing\ninformation assets:\n\n1. How easily discoverable and accessible are data in\nyour environment?\n\n\n3. Is the quality of these data formally verified?\n\n4. Are key entities such as products and customers actively\nmanaged, and can data related to these items be easily linked\nacross various data sources?\n\n5. How quickly are data made available for analysis following their\ncreation or modification? Is this latency aligned with how you\nmight use this data?\n\n6. Are processes established for determining appropriate uses of\ndata, governing access and providing oversight on consumption?\n\n7. 
Is there one individual responsible for effective data management\nacross the enterprise, and has this person established a\n\nprocess for receiving and responding to feedback and shifting\norganizational priorities?\n\nThis list of questions is by no means exhaustive, but it should help to identify\nblockers that are likely to become impediments down the road.\n\n\n2. How well understood are these information assets?\n\n\n-----\n\nSimilarly, we would encourage organizations to assess their maturity in terms of\nanalytics capabilities:\n\n1. Is business performance at all levels assessed in terms of\nkey metrics?\n\n2. How frequently are data-driven analyses used in making key\nbusiness decisions?\n\n3. To what degree are advanced analytics techniques\n— i.e., data science — used in decision-making processes?\n\n4. Are predictive models regularly leveraged as part of operational\nbusiness processes?\n\n5. How is experimentation used to assess the performance of\nvarious initiatives?\n\n\nLastly, and probably most importantly, we’d encourage the organization to\nperform a frank assessment of its readiness to embrace change. Becoming a\ndata-driven enterprise is fundamentally about operating differently than before.\nDecision-making authority becomes more diffuse and often more automated.\nProject outcomes become less certain as the organization focuses on innovation\nwhere learning is emphasized over predictable results. Process silos often\nbecome more intertwined as new modes of engagement evolve.\n\nWhen done right, this transition creates a healthy tension between what’s\nneeded to be successful today and what’s needed to be successful tomorrow.\nBut this can also manifest itself as employee resistance and political infighting\nas processes and organizational structures evolve. What’s often needed to\novercome this is strong leadership, a clear vision and mandate for change as\nwell as a reassessment of incentive structures and active organizational change\nmanagement as the organization transitions into this new way of working.\n\n\n6. Are predictive models used to automate key business decisions?\n\n\n7. Has the organization embraced a model of continuous deployment\nfor the regular update of model-driven processes?\n\n\n**TRADITIONAL APPROACH**\n\n**Upfront reqs** **Technical implementation** **Production**\n\n\n**ITERATIVE APPROACH**\n\n\nContinuous feedback\n\n\n**Business questions** **Testing** **Production** **Optimization**\n\nContinuous learning and optimization\n\nAn iterative approach involves the use of data to continually optimize the performance of data products.\n\n\n-----\n\n## Getting Started: Putting Some Wins on the Board\n\n\nWith the organization ready to proceed, the next phase is about learning to\ndeliver new solutions within your organization. There will be new technologies\nto deploy and new skills to develop, and there will be new patterns for\nintegration into business workflows and procedures for incremental updates\nand improvements. But most importantly, there will need to be a new level of\npartnership and trust between the business and the technology sides of the\norganization that needs to be carefully nurtured.\n\nThe best way we have found to do this is to start with projects that improve\non existing operational workflows, i.e., do what you do, but do it smarter.\nThe business is often familiar with existing pain points and can more clearly\nenvision how a new capability can be folded into its processes. 
They are also\nfamiliar with how to assess the impact a new approach may have on their\nbusiness and can help design tests to validate whether the intended results\n\n\nAs capabilities demonstrating value over the status quo are developed, they\nare folded into business processes. This is not a one-and-done effort but part\nof an ongoing cycle of deployment to continue so long as the team has a line\nof sight to meaningful gains. The team does not wait for the ideal solution but\ninstead focuses on incremental improvements that deliver measurable value\nalong the way.\n\nOversight for this process is provided by another body, one tasked with the\nsuccess of the overall transformative efforts within the business. As success\nis delivered, there will be growing demand for the time and talents of these\nteams, and the organization will need to prioritize resources across an increasing\nnumber of opportunities. This steering committee will need to be responsible for\nallocating limited resources and advocating for additional ones as well to strike\nthe right balance of investments for the organization.\n\n\nare or are not being delivered.\n\n\n**DEMAND FORECASTING**\n\nDemand forecasting is a massive challenge for retail and consumer goods\n\norganizations. And one where even an incremental change can have a massive impact,\n\nso it’s often one of the first projects organizations identify to put a win on the board.\n\nAccording to [McKinsey](https://www.mckinsey.com/featured-insights/artificial-intelligence/notes-from-the-ai-frontier-applications-and-value-of-deep-learning) , a 10% to 20% improvement in supply chain forecasting\n\naccuracy is likely to produce a 5% reduction in inventory costs and a 2% to 3%\n\nincrease in revenues. To hit the ground running, check out the [Databricks Solution](https://www.databricks.com/solutions/accelerators/demand-forecasting)\n\n[Accelerators for Demand Forecasting](https://www.databricks.com/solutions/accelerators/demand-forecasting) — pre-built notebooks and best practices for\n\nkey use cases.\n\n\nWork on these projects is a collaborative effort between the business and IT.\nTogether, the project team explores a potential solution with a notion of how it\nmay be integrated in mind from the outset. As the project unfolds, all members\nare part of the iterative cycles and help to steer the solution in new directions\nuntil an item of value is derived.\n\n\n-----\n\n## Going Big: Learning to Embrace Transformational Change\n\n\nWith some experience under your belt, it’s time to build on the organizational\nmuscle developed during initial efforts and flex for more transformative impact.\nAgain, the focus is on established functions within the business, but instead of\npointed, incremental improvements, the team begins to create a vision for the\npart of the organization that would operate if it were to fully embrace data and\nAI enablement.\n\nIt’s at this phase that many of the concerns about organizational resistance\nmentioned earlier are most likely to manifest themselves. Ideally, initial\nimplementation efforts have built champions within the business, but it’s still\nimportant to be mindful of pushback that can emerge as the organization more\nfully begins to change. 
Having and maintaining strong business sponsorship\nin this phase is critical, and having that sponsor articulate and regularly\nreinforce a clear vision for the change that’s now underway can help everyone\n\nunderstand the need to support these efforts.\n\n\nSo far in this exploration of the journey to data and AI transformation, we’ve\nminimized the importance of technology in order to focus on the business and\norganizational aspects that often get neglected in this conversation. But it’s\nat this stage that the organization needs to have established its preference\nfor data and analytics platforms. Because of the breadth of needs that will\nhave to be addressed and the ongoing innovation taking place in the data\nscience community, we strongly suggest standardizing on a platform that is\nopen and flexible while also providing cost-effective use of both infrastructure\nand people resources and strong data governance and protection. For many\norganizations, the Databricks Lakehouse Platform has proven itself to be the\nideal platform to meet these needs.\n\n**WHY STANDARDIZE ON DATABRICKS?**\n\nThe Databricks Lakehouse is the only enterprise data and AI\n\nplatform that allows retailers to leverage all of their data, from any\n\nsource, on any workload to always offer more engaging customer\n\nexperiences driven by real-time data, at the lowest cost and with\n\nthe greatest investment protection.\n\n\n-----\n\nBut simply standardizing on a platform is not enough. The organization\nneeds to work through the roles and responsibilities around the use of this\nplatform and processes for moving things from experimentation and formal\ndevelopment to testing and operationalization.\n\nThe importance of having an MLOps strategy really comes to life at this\nphase. This doesn’t mean your strategy around MLOps can’t change, but this\nphase is when you want to think about and define your answers to some key\nquestions such as the following:\n\n1. How do we evaluate new and existing (retrained) models as\npart of their movement from development to production?\n\n2. How do we determine when a model should be retrained?\n\n3. What are the preferred mechanisms for production deployment?\n\n4. How do we fall back should we have a deployment problem?\n\n5. What are the service level expectations for the\ndeployment processes?\n\n\n###### ”Databricks Lakehouse has simplified the adoption of AI so that we can deliver better shopping experiences for our customers.”\n\n**Numan Ali**\n\nSolutions Architect, Data and Analytics Center of Excellence at Pandora\n\n\n-----\n\n## Normalizing the Process: Engraining a Data-Driven Mindset Into the Fabric of the Business\n\n\nToo often, leadership views innovation as a destination and not a process\n(“Let’s launch an LLM app!”). An enterprise doesn’t simply transform into a\ndata-driven organization overnight and then it’s done. Yes, there will be an\nupfront investment, but there will also be ongoing investment in order to\nsupport sustained innovation.\n\nIronically, one of the major obstacles to this change is viewing the goal as\nsimply delivering a project or projects. Think about it — just 12 months ago,\nonly a few specialists in academia and industry were talking about generative\nAI and large language models (LLMs). 
Today, [retailers have to integrate this](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html)\n[new technology](https://www.databricks.com/blog/2023/04/13/retail-age-generative-ai.html) or fall behind others who will find a way to create more\npersonalized consumer experiences with it.\n\n\nTechnology, especially when it comes to data and AI, moves far too quickly.\nWhat retailer tech teams need to deliver at the end of the day is applications,\nof course, but also the ability to react quickly to change. What sort of ongoing\ninvestments in terms of people, process and technology do retailers need to\nfoster in order to ingrain an innovation mindset?\n\nThis is an ongoing balancing act where organizations need to innovate and look\nfor new opportunities but also sustain that innovation in a way that is realistic\nfor the business. For this, let’s consider the 70-20-10 rule: the idea that\ncompanies should allocate 70% of innovation investment to core initiatives,\n20% to adjacent ones and 10% to transformational ones, or “moonshots.” While\nnot a hard-and-fast rule, this concept was touted by Google co-founder Larry\nPage in a [Fortune magazine article](https://www.google.com/url?q=https://money.cnn.com/2008/04/29/magazines/fortune/larry_page_change_the_world.fortune/&sa=D&source=editors&ust=1690998645852122&usg=AOvVaw2AHj-fx8XkEeMKP2Ts5gDu) , and was validated by a [study conducted](https://hbr.org/2012/05/managing-your-innovation-portfolio)\n[by Harvard Business Review](https://hbr.org/2012/05/managing-your-innovation-portfolio) , which found that companies following the rule\n\noutperformed their peers, typically realizing a P/E premium of 10% to 20%.\n\n\n-----\n\nThe goal of the 70-20-10 rule is to help guide the organization toward\nsustained innovation and spend the bulk of time on the core business. This is\npart of why we recommend starting first with fast (just 2- to 3-month total)\npilot projects to use AI on existing business use cases like demand forecasting\nand call center optimization. By working in these areas with a focus on learning\nand iterating, retailers will soon find where data silos and rigidity exist in the\nsystem. As these foundational barriers are knocked down, it then makes it\npossible to tackle more transformational use cases and start to build the\ncharacteristics of a data-forward enterprise. 
In other words, start to utilize\ndata and data-driven insights as a primary driver for decision-making and\noperations, while also prioritizing continuous data analysis and improvement.\n\n\n**TRANSFORMATIVE**\n\n\n**ADJACENT**\n\n\n**CORE**\n\n\n###### Companies that allocated about 70% of their innovation activity to core initiatives, \n### 20% to adjacent ones and 10% to\n###### transformational ones outperformed their peers.\n\n**Bansi Nagji & Geoff Tuff**\n_Managing Your Innovation Portfolio_\nHarvard Business Review, May 2012\n\n\n-----\n\n## From Hindsight to Foresight: The Journey to Becoming a Data-Forward Enterprise\n\n\nSo what does it take to successfully embark on this\njourney to becoming a data-forward enterprise?\nFirst and foremost, you need to not only establish\na baseline understanding of what has occurred by\nexamining historical data but leverage advancements\nin technologies (e.g., streaming, computer vision,\nvoice recognition) to make predictions of the future.\n\nThrough the use of both historical data and\npredictive techniques such as forecasting,\nrecommendations, prescriptive care and nextbest-action, organizations can begin to improve\ndecisions and, in some cases, automate certain\ndecision-making processes. But rather than moving\n\nfrom historical views to predictive actions in a\nlinear fashion, this journey involves addressing both\napproaches simultaneously. Once you are able to\nunify historical and predictive analysis, you can then\ntake significant steps toward becoming a dataforward enterprise.\n\n\n##### The Data-Forward Enterprise\n\nData, analytics and AI working in concert\n\n\n**Data Purgatory**\nThings are better, but data isn’t\ndriving the business\n\n\n**Data Maturity**\nEvery aspect of the\nbusiness is supported\nby insights and AI\n\n\n**Data Siloed**\nData and teams are segregated\ninto different systems\n\nDATA MATURITY\n\nBeing data-forward means silos cease to exist, and data, analytics and AI are informing every aspect of the business.\n\n\n-----\n\n## The 8 Steps to Building a Data-Forward Retailer\n\n\nBefore you start your data-forward journey, a few critical steps must be\nconsidered to establish a solid foundation to build upon. Based on our\nwork with the largest and most successful retailers in the world, spanning\nstartups to global giants, we at Databricks have seen that the most successful\nfollowed these steps to effectively gain wallet share, whereas those who\ncouldn’t would often leave major gaps that competitors could take advantage\nof. These steps are the basics to prepare businesses for where they need\nto be both now and in the near future.\n\n\n**2** **Get grounded: Understand the technology**\n\nTo start, business leaders need to ground themselves in technology, especially\nwhen it comes to AI. AI can do amazing things, but it is not magical and vendors\nare prone to overpromising and underdelivering. 
Less than getting deep into\ncode, the purpose is to understand the limitations and ideal use cases.\n\nDatabricks provides several [free resources for retailers](https://www.databricks.com/explore/retail-resources) , but we recommend\nstarting with [The Big Book of Retail & Consumer Goods Use Cases](https://www.databricks.com/resources/ebook/big-book-of-retail-consumer-goods-use-cases) for a C-level\nperspective of how different brands are using data, analytics and AI to drive\nrevenue or cut operational costs.\n\n\n**1** **Set the foundation: Define goals and objectives**\n\n\nThe best way to avoid shiny object syndrome (where you start out with a\n\ntechnology and then try to figure out what to do with it) is to first identify the\nproblems you want to solve. From there, you can set goals around innovation\nto align incentives, and, most importantly, ensure you are driving specific\nbusiness outcomes such as improving customer engagement, optimizing\ninventory management or increasing sales.\n\n\n**3** **Understand the skills and processes in your business**\n\nAs we will get into in step 4, starting with smaller pilot projects enables you\nto not just deliver a quick win and validate the use of AI in the enterprise, but\nalso understand the in-house capabilities in terms of people, process and\ntechnology to deliver technical projects. And if required, be willing and ready\nto hire people with the right skill sets that can help you make the most of your\ndata. For example, building a core team of data analysts can help extract deep\ninsights that lead to better decision-making and identify opportunities for\ngrowth. It is critical at this step to define the roles you need, determine how\nyou will source for those roles (via external hiring or internal transfer), and\nensure those roles have opportunities for career progression.\n\n\n-----\n\nFor inspiration and a head start, check out our [Solution Accelerators for Retail](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods)\n[& Consumer Goods](https://www.databricks.com/solutions/accelerators?industry=Retail%20and%20Consumer%20Goods) . These free resources were created to help our customers\nsave hours of discovery, design, development and testing. Our purpose-built\nguides — fully functional notebooks and best practices — speed up results\nacross your most common and high-impact use cases and enable you to go\nfrom idea to proof of concept (PoC) in as little as two weeks. We have over\n20 accelerators built specifically for critical retail and consumer goods use\ncases, from Demand Forecasting and On-Shelf Availability to Recommendation\nEngines and Customer Lifetime Value. We also have a set of Solution\nAccelerators specifically for [LLMs in Retail & Consumer Goods.](https://www.databricks.com/solutions/accelerators/large-language-models-retail)\n\n**5** **Implement data management and governance early**\n\nThe first step to successfully implementing AI/ML in your business broadly\nis to ensure you have accurate, reliable and current data to train your\nmodels against. This data can (and should) come from a variety of sources,\nso it’s key to unify all data types and sources (sales transactions, customer\nfeedback, social media) in a centralized location that is easily accessible,\nwhile not losing sight of data security to maintain customer trust. 
Setting\nup data governance parameters to control who has which kinds of access\nto what data, and being able to audit the history of this access, will actually\naccelerate innovation while ensuring data security and compliance.\n\n\n**Delivering exactly what customers want,**\n**every time, and on time**\n\nData is at the heart of Gousto’s mission to change the\nway people eat through the delivery of boxes of fresh\ningredients and easy-to-follow recipes. However, even\nas their business exploded at the start of the pandemic,\ntheir systems couldn’t ingest data fast enough, couldn’t\ntalk to each other and wouldn’t scale — forcing them to\ntemporarily stop accepting new customers. Now Gousto is\nset up to achieve exciting ambitions for menu expansion,\nsophisticated personalization and next-day delivery. Learn\nhow they did it.\n\n**[READ THE FULL GOUSTO STORY](https://www.databricks.com/customers/gousto)**\n\n**4** **Start small: Pilot a project**\n\nThere is no substitute for rolling your sleeves up and running a pilot project to\nevaluate the feasibility and potential impact of a project before implementing\nit on a larger scale. When selecting a pilot project, we recommend starting with\na project that will deliver clear business value, such as incremental revenue\nor clear cost savings, yet only takes 2-3 months to complete. The more time\nthere is between project inception and seeing results, the more likely it will lose\nmomentum internally.\n\n\n-----\n\n**6** **Incorporate AI across the business (starting with daily tasks)**\n\nGiven the large upfront investment in data scientists and engineers to build\nan AI program, the ROI will come from using it at scale. Constantly look to\nuncover patterns and repeatable processes that can be optimized or fully\nautomated with AI.\n\n**Building a global fashion icon with a**\n**customer-first approach**\n\nBritish luxury brand Burberry was seeking an efficient way to\nannotate its thousands of highly specific marketing assets\nfor better targeting. Working with Labelbox within Databricks\nLakehouse, they are now able to complete image annotation\nprojects in hours instead of months. And marketing team\nmembers now have access to powerful content insights\nwithout needing to ask data scientists for help.\n\n**[READ THE FULL BURBERRY STORY](https://www.databricks.com/customers/burberry)**\n\n**Customizing interactions that convert clicks**\n**to revenue with Databricks Lakehouse**\n\nGlobal jewelry manufacturer and retailer Pandora needed a\nunified view of all their data where they could easily segment,\ncategorize and analyze to deliver custom messaging to\nconsumers. With Databricks Lakehouse, they now have the\ninsights they need to deliver highly targeted messaging —\nincreasing consumer engagement from the initial opening of\na marketing email to maximizing shopping bag conversions to\ndriving revenue on the website.\n\n**[READ THE FULL PANDORA STORY](https://www.databricks.com/customers/pandora)**\n\n\n**Building an operationally efficient**\n**omnichannel business**\n\nThe Hershey Company analyzes the data they need to\nstay in front of changing human behavior and delight their\ncustomers. 
With Databricks Lakehouse, they can analyze\ndata feeds from their largest retail customer — uncovering\ninsights that will help extend their industry leadership.\n\n**[READ THE FULL HERSHEY STORY](https://www.databricks.com/customers/hershey)**\n\n\n**Ushering in a new era**\n**of data-driven retailing**\n\nOutdoor apparel brand Columbia Sportswear has enabled\ndata and analytics self-service throughout the organization in\na way that ensures everyone is working from a single source\nof truth. Whichever data team needs access to the data,\nDatabricks Lakehouse gives them the confidence that the\ndata is reliable and consistent.\n\n**[READ THE FULL COLUMBIA SPORTSWEAR STORY](https://www.google.com/url?q=https://www.databricks.com/customers/columbia&sa=D&source=editors&ust=1690998645853115&usg=AOvVaw0_kRasuzyi4ESz1SMB0n-K)**\n\n\n-----\n\n**7** **Foster a culture of data-driven decision-making**\n\nWhat does it mean to have a culture of data-driven decision-making? In\npractice, it means empowering all employees to use data to inform their\ndecisions. Only some strategic decisions will be based on complete and\naccurate information. It’s unwise to assume otherwise. The right approach\nis to leverage as much data as possible, from past tests or current efforts,\nto mitigate risk. Leaders need to not only ask for data but also ensure\nthat their employees will be able to find the data they need.\n\n**Unlocking critical trends and insights**\n**needed to serve our 180 million customers**\n\nReckitt, the maker of Lysol as well as hundreds of other\nhousehold brands, was looking to deliver best-in-class\ncustomer experiences to their over 180 million customers\nspanning the globe. With Databricks Lakehouse, Reckitt\nhas established a data-first culture by surfacing real-time,\nhighly accurate, deep customer data insights that have\nled to a better understanding of international market\ntrends and demand across the multiple product lines\nthey support.\n\n**[READ THE FULL RECKITT STORY](https://www.databricks.com/customers/reckitt)**\n\n\n**Customer 360 to enable faster speed**\n**to market, better results**\n\nThe Middle East’s Al-Futtaim serves as a local distributor\nfor global brands such as Toyota, IKEA and Ace Hardware.\nWith Databricks Lakehouse serving as a unified platform to\naggregate and analyze various data sources on all customers,\nthey have created a “golden customer record” that improves\nall decision-making, from forecasting demand to powering\ntheir global loyalty program.\n\n**[READ THE FULL AL-FUTTAIM STORY](https://www.google.com/url?q=https://www.databricks.com/customers/al-futtaim&sa=D&source=editors&ust=1690998645853527&usg=AOvVaw3cs-6mM2ANTKDCzTdTvEYH)**\n\n**8** **Continuously evaluate and improve**\n\nRecognize that establishing a data-driven culture is an ongoing journey and\nnever a set destination. Constantly evaluate your data collection, analysis and\ndecision-making process to identify areas for improvement. Even small and\nconstant incremental improvements will deliver large gains in absolute terms\nwhen applied at scale. You can always personalize more, forecast better, or\nbetter manage your supply chain as you bring in better data sources and refine\nyour models.\n\n\n-----\n\n## Transform Retail Data Into Actionable Insights\n\n\nBecoming data forward is not a crazy idea. 
Too often, leaders or organizations\nallow themselves to be intimidated by focusing on large-scale transformations.\nBut it’s the small operational changes that can make your business more efficient\nas well as shift the larger culture forward. Once you’ve set this foundation, it then\nallows you to move toward bigger things. These steps may fail, but it’s actually\npositive to have these setbacks to learn from to try again. The bigger risk is to\nnot try and thus fall behind competitors who are embracing the internal changes\nneeded to take advantage of AI and machine learning.\n\nCore to delivering on these steps to become a data-forward retailer is a solid\ndata foundation that can unify your data and AI workloads with sharing and\ngovernance built in, so internal and external teams can get access to the\ndata they need when they need it. With the [Databricks Lakehouse for Retail](https://www.databricks.com/solutions/industries/retail-industry-solutions) ,\ncompanies gain valuable insights into customer behavior, optimize supply chain\n\noperations and make informed business decisions in real time.\n\n\nEXPLORE DATABRICKS LAKEHOUSE FOR RETAIL\n\nAccess key resources to understanding how a lakehouse\nfor retail can set you on the path toward becoming a\ndata-forward organization.\n\n**[LEARN MORE](https://www.databricks.com/explore/retail-resources)**\n\n\n#### Visit our website to learn more about Databricks Lakehouse for Retail.\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast, and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks#account)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/8-steps-to-becoming-a-ai-forward-retailer-ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "### eBook\n\n# The Big Book\n of MLOps\n\n#### A data-centric approach\n to build and scale AI,\n including LLMOps\n\nM o d e l O p s D a t a O p s D e �O p s\n\n\n-----\n\n## Contents\n\n**A U T H O R S :**\n\n**Joseph Bradley**\n\nLead Product Specialist\n\n**Rafi Kurlansik**\n\nLead Product Specialist\n\n**Matt Thomson**\n\nDirector, EMEA Product Specialists\n\n**Niall Turbitt**\n\nLead Data Scientist\n\n\n**C H A P T E R 1 :** \u0007 **Introduction** 3\n\n###### People and process 4\n\n People 5\n\n Process 6\n\n Why should I care about MLOps? 
8\n\n Guiding principles 9\n\n**C H A P T E R 2 :** \u0007 **Fundamentals of MLOps** 11\n\n###### Semantics of dev, staging and prod 11\n\n ML deployment patterns 15\n\n**C H A P T E R 3 :** **MLOps Architecture and Process** \u0007 19\n\n###### Architecture components 19\n\n Data Lakehouse 19\n\n MLflow 19\n\n Databricks and MLflow Autologging 20\n\n Feature Store 20\n\n MLflow Model Serving 20\n\n Databricks SQL 20\n\n Databricks Workflows and Jobs 20\n\n Reference architecture 21\n\n Overview 22\n\n Dev 23\n\n Staging 27\n\n Prod 30\n\n**C H A P T E R 4 :** \u0007 **LLMOps – Large Language Model Operations** 36\n\n###### Discussion of key topics for LLMOps 39\n\n Reference architecture 46\n\n Looking ahead 48\n\n\n-----\n\n**CHAPTER 1:**\n## Introduction\n\n**Note:** Our prescription for MLOps is general to\n\nany set of tools and applications, though we give\n\nconcrete examples using Databricks features\n\nand functionality. We also note that no single\n\narchitecture or prescription will work for all\n\norganizations or use cases. Therefore, while we\n\nprovide guidelines for building MLOps, we call out\n\nimportant options and variations. This whitepaper\n\nis written primarily for ML engineers and data\n\nscientists wanting to learn more about MLOps,\n\nwith high-level guidance and pointers to more\n\nresources.\n\n\nThe past decade has seen rapid growth in the adoption of machine learning (ML). While the early\n\nadopters were a small number of large technology companies that could afford the necessary resources,\n\nin recent times ML-driven business cases have become ubiquitous in all industries. Indeed, according to\n\nMIT Sloan Management Review, 83% of CEOs report that [artificial intelligence (AI) is a strategic priority](https://sloanreview.mit.edu/projects/artificial-intelligence-in-business-gets-real/) .\n\nThis democratization of ML across industries has brought huge economic benefits, with [Gartner estimating](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018)\n\n[that $3.9T in business value](https://www.gartner.com/en/newsroom/press-releases/2018-04-25-gartner-says-global-artificial-intelligence-business-value-to-reach-1-point-2-trillion-in-2018) will be created by AI in 2022.\n\nHowever, building and deploying ML models is complex. There are many options available for achieving\n\nthis but little in the way of well-defined and accessible standards. As a result, over the past few years we\n\nhave seen the emergence of the machine learning operations (MLOps) field. **MLOps is a set of processes**\n\n**and automation for managing models, data and code to improve performance stability and long-term**\n\n**efficiency in ML systems.** Put simply, MLOps = [ModelOps](https://en.wikipedia.org/wiki/ModelOps) + [DataOps](https://en.wikipedia.org/wiki/DataOps) + [DevOps](https://en.wikipedia.org/wiki/DevOps) .\n\nThe concept of developer operations (DevOps) is nothing new. It has been used for decades to deploy\n\nsoftware applications, and the deployment of ML applications has much to gain from it. However, strong\n\nDevOps practices and tooling alone are insufficient because ML applications rely on a constellation of\n\nartifacts (e.g., models, data, code) that require special treatment. 
Any MLOps solution must take into account the various people and processes that interact with these artifacts.

Here at Databricks we have seen firsthand how customers develop their MLOps approaches, some of which work better than others. We launched the open source [MLflow](https://www.mlflow.org/) project to help make our customers successful with MLOps, and with over 10 million downloads/month from PyPI as of May 2022, MLflow’s adoption is a testament to the appetite for operationalizing ML models.

This whitepaper aims to explain how your organization can build robust MLOps practices incrementally. First, we describe the people and process involved in deploying ML applications and the need for operational rigor. We also provide general principles to help guide your planning and decision-making. Next, we go through the fundamentals of MLOps, defining terms and broad strategies for deployment. Finally, we introduce a general MLOps reference architecture, the details of its processes, and best practices.

-----

#### People and process

**Figure 1: ML workflow and personas.** The workflow spans Data Preparation, Exploratory Data Analysis, Feature Engineering, Model Training, Model Validation, Deployment and Monitoring, with the Data Governance Officer, Data Engineer, Data Scientist, ML Engineer and Business Stakeholder personas participating at different stages.

-----

#### People

Building ML applications is a team sport, and while in the real world people “wear many hats,” it is still useful to think in terms of archetypes. They help us understand roles and responsibilities and where handoffs are required, and they highlight areas of complexity within the system. We distinguish between the following personas:

**M L P E R S O N A S**

**Data Governance Officer:** Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.

**Data Engineer:** Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.

**Data Scientist:** Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.

**ML Engineer:** Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment ([CI/CD](https://en.wikipedia.org/wiki/CI/CD)).

**Business Stakeholder:** Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate.

-----

#### Process

Together, these people develop and maintain ML applications. While the development process follows a distinct pattern, it is not entirely monolithic. The way you deploy a model has an impact on the steps you take, and using techniques like reinforcement learning or online learning will change some details. Nevertheless, these steps and personas involved are variations on a core theme, as illustrated in Figure 1 above.

Let’s walk through the process step by step. 
Keep in mind that this is an iterative process, the frequency of\n\nwhich will be determined by the particular business case and data.\n\n**M L P R O C E S S**\n\n\nData\nPreparation\n\n\nExploratory\nData Analysis\n\n\nFeature\nEngineering\n\n\nModel\nTraining\n\n\nModel\nValidation\n\n\nDeployment Monitoring\n\n\n###### Data preparation\n\nPrior to any data science or ML work lies the data engineering needed to prepare production data and make\n\nit available for consumption. This data may be referred to as “raw data,” and in later steps, data scientists\n\nwill extract features and labels from the raw data.\n\n###### Exploratory data analysis (EDA)\n\nAnalysis is conducted by data scientists to assess statistical properties of the data available, and determine\n\nif they address the business question. This requires frequent communication and iteration with business\n\nstakeholders.\n\n\n-----\n\n###### Feature engineering\n\nData scientists clean data and apply business logic and specialized transformations to engineer features for\n\nmodel training. These data, or features, are split into training, testing and validation sets.\n\n###### Model training\n\nData scientists explore multiple algorithms and hyperparameter configurations using the prepared data, and\n\na best-performing model is determined according to predefined evaluation metric(s).\n\n###### Model validation\n\nPrior to deployment a selected model is subjected to a validation step to ensure that it exceeds\n\nsome baseline level of performance, in addition to meeting any other technical, business or regulatory\n\nrequirements. This necessitates collaboration between data scientists, business stakeholders and ML\n\nengineers.\n\n###### Deployment\n\nML engineers will deploy a validated model via batch, streaming or online serving, depending on the\n\nrequirements of the use case.\n\n###### Monitoring\n\nML engineers will monitor deployed models for signs of performance degradation or errors. Data scientists\n\nwill often be involved in early monitoring phases to ensure that new models perform as expected after\n\ndeployment. This will inform if and when the deployed model should be updated by returning to earlier\n\nstages in the workflow.\n\nThe data governance officer is ultimately responsible for making sure this entire process is compliant with\n\ncompany and regulatory policies.\n\n\n-----\n\n#### Why should I care about MLOps?\n\nConsider that the typical ML application depends on the aforementioned people and process, as well\n\nas regulatory and ethical requirements. These dependencies change over time — and your models, data\n\nand code must change as well. The data that were a reliable signal yesterday become noise; open source\n\nlibraries become outdated; regulatory environments evolve; and teams change. ML systems must be\n\nresilient to these changes. Yet this broad scope can be a lot for organizations to manage — there are many\n\nmoving parts! Addressing these challenges with a defined MLOps strategy can dramatically reduce the\n\niteration cycle of delivering models to production, thereby accelerating time to business value.\n\nThere are two main types of risk in ML systems: **technical risk** inherent to the system itself and **risk of**\n\n**noncompliance** with external systems. 
Both of these risks derive from the dependencies described above. For example, if data pipeline infrastructure, KPIs, model monitoring and documentation are lacking, then you risk your system becoming destabilized or ineffective. On the other hand, even a well-designed system that fails to comply with corporate, regulatory and ethical requirements runs the risk of losing funding, receiving fines or incurring reputational damage. Recently, one private company’s data collection practices were found to have violated the Children’s Online Privacy Protection Rule (COPPA). The [FTC fined](https://www.protocol.com/policy/ftc-algorithm-destroy-data-privacy) the company $1.5 million and [ordered](https://www.ftc.gov/system/files/ftc_gov/pdf/wwkurbostipulatedorder.pdf) it to destroy or delete the illegally harvested data, and all models or algorithms developed with that data.

With respect to efficiency, the absence of MLOps is typically marked by an overabundance of manual processes. These steps are slower and more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck, capping the ability for a data team to take on new projects.

Seen through these lenses, the aim of MLOps becomes clear: improve the long-term performance stability and success rate of ML systems while maximizing the efficiency of teams who build them. In the introduction, we defined MLOps to address this aim: MLOps is a **set of processes and automation** to manage **models, data and code** to meet the two goals of **stable performance and long-term efficiency in ML systems**. _MLOps = ModelOps + DataOps + DevOps_.

With clear goals we are ready to discuss principles that guide design decisions and planning for MLOps.

-----

Given the complexity of ML processes and the different personas involved, it is helpful to start from simpler, high-level guidance. We propose several broadly applicable principles to guide MLOps decisions. They inform our design choices in later sections, and we hope they can be adapted to support whatever your business use case may be.

#### Guiding principles

###### Always keep your business goals in mind

Just as the core purpose of ML in a business is to enable data-driven decisions and products, the core purpose of MLOps is to ensure that those data-driven applications remain stable, are kept up to date and continue to have positive impacts on the business. When prioritizing technical work on MLOps, consider the business impact: Does it enable new business use cases? Does it improve data teams’ productivity? Does it reduce operational costs or risks?

###### Take a data-centric approach to machine learning

Feature engineering, training, inference and monitoring pipelines are data pipelines. As such, they need to be as robust as other production data engineering processes. Data quality is crucial in any ML application, so ML data pipelines should employ systematic approaches to monitoring and mitigating data quality issues. Avoid tools that make it difficult to join data from ML predictions, model monitoring, etc., with the rest of your data. The simplest way to achieve this is to develop ML applications on the same platform used to manage production data. For example, instead of downloading training data to a laptop, where it is hard to govern and reproduce results, secure the data in cloud storage and make that storage available to your training process.

-----

###### Implement MLOps in a modular fashion

As with any software application, code quality is paramount for an ML application. Modularized code enables testing of individual components and mitigates difficulties with future code refactoring. Define clear steps (e.g., training, evaluation or deployment), supersteps (e.g., training-to-deployment pipeline) and responsibilities to clarify the modular structure of your ML application.

###### Process should guide automation

We automate processes to improve productivity and lower risk of human error, but not every step of a process can or should be automated. People still determine the business question, and some models will always need human oversight before deployment. Therefore, the development process is primary and each module in the process should be automated as needed. This allows incremental build-out of automation and customization. Furthermore, when it comes to particular automation tools, choose those that align to your people and process. For example, instead of building a model logging framework around a generic database, you can choose a specialized tool like MLflow, which has been designed with the ML model lifecycle in mind.

-----

**CHAPTER 2:**
## Fundamentals of MLOps

**Note:** In our experience with customers, there can be variations in these three stages, such as splitting staging into separate “test” and “QA” substages. However, the principles remain the same and we stick to a dev, staging and prod setup within this paper.

#### Semantics of dev, staging and prod

ML workflows include the following key assets: code, models and data. These assets need to be developed (dev), tested (staging) and deployed (prod). For each stage, we also need to operate within an execution environment. Thus, all the above — execution environments, code, models and data — are divided into dev, staging and prod.

These divisions can best be understood in terms of quality guarantees and access control. On one end, assets in prod are generally business critical, with the highest guarantee of quality and tightest control on who can modify them. Conversely, dev assets are more widely accessible to people but offer no guarantee of quality.

For example, many data scientists will work together in a dev environment, freely producing dev model prototypes. Any flaws in these models are relatively low risk for the business, as they are separate from the live product. In contrast, the staging environment replicates the execution environment of production. Here, code changes made in the dev environment are tested prior to code being deployed to production. The staging environment acts as a gateway for code to reach production, and accordingly, fewer people are given access to staging. Code promoted to production is considered a live product. 
In the production environment, human error can pose the greatest risk to business continuity, and so the least number of people have permission to modify production models.

One might be tempted to say that code, models and data each share a one-to-one correspondence with the execution environment — e.g., all dev code, models and data are in the dev environment. That is often close to true but is rarely correct. Therefore, we will next discuss the precise semantics of dev, staging and prod for execution environments, code, models and data. We also discuss mechanisms for restricting access to each.

-----

###### Execution environments

An execution environment is the place where models and data are created or consumed by code. Each execution environment consists of compute instances, their runtimes and libraries, and automated jobs.

With Databricks, an “environment” can be defined via dev/staging/prod separation at a few levels. An organization could create distinct environments across multiple cloud accounts, multiple Databricks workspaces in the same cloud account, or within a single Databricks workspace. These separation patterns are illustrated in Figure 2 below.

**Figure 2: Environment separation patterns.** Dev, staging and prod can be separated across multiple cloud accounts, across multiple Databricks workspaces in the same cloud account, or within a single Databricks workspace using workspace access controls.

-----

Databricks released Delta Lake to the open source community in 2019. Delta Lake provides all the data lifecycle management functions that are needed to make cloud-based object stores reliable and performant. This design allows clients to update multiple objects at once and to replace a subset of the objects with another, etc., in a serializable manner that still achieves high parallel read/write performance from the objects — while offering advanced capabilities like time travel (e.g., query point-in-time snapshots or rollback of erroneous updates), automatic data layout optimization, upserts, caching and audit logs.

###### Code

ML project code is often stored in a version control repository (such as Git), with most organizations using branches corresponding to the lifecycle phases of development, staging or production. There are a few common patterns. Some use only development branches (dev) and one main branch (staging/prod). Others use main and development branches (dev), branches cut for testing potential releases (staging), and branches cut for final releases (prod). Regardless of which convention you choose, separation is enforced through Git repository branches.

As a best practice, code should only be run in an execution environment that corresponds to it or in one that’s higher. For example, the dev environment can run any code, but the prod environment can only run prod code.

###### Models

While models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to note that model and code lifecycle phases often operate asynchronously**. That is, you may want to push a new model version before you push a code change, and vice versa. Consider the following scenarios:

- To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. Deploying the code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle of being generated, tested and marked as “production” to predict on the most recent transactions. In this case the code lifecycle is slower than the model lifecycle.

- To classify documents using large deep neural networks, training and deploying the model is often a one-time process due to cost. Updates to the serving and monitoring code in the project may be deployed more frequently than a new version of the model. In this case the model lifecycle is slower than the code.

Since model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model management to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts directly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update production models without code changes, streamlining the deployment process in many cases. Model artifacts are secured using MLflow access controls or cloud storage permissions.

-----

###### Data

Some organizations label data as either dev, staging or prod, depending on which environment it originated in. For example, all prod data is produced in the prod environment, but dev and staging environments may have read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data may be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around reliability and freshness. Access to data in each environment is controlled with table access controls ([AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) | [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) | [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html)) or cloud storage permissions.

In summary, when it comes to MLOps, you will always have operational separation between dev, staging and prod. Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod will be the highest quality and tightly controlled.

|ASSET|SEMANTICS|SEPARATED BY|
|---|---|---|
|Execution environments|Labeled according to where development, testing and connections with production systems happen|Cloud provider and Databricks Workspace access controls|
|Models|Labeled according to model lifecycle phase|MLflow access controls or cloud storage permissions|
|Data|Labeled according to its origin in dev, staging or prod execution environments|Table access controls or cloud storage permissions|
|Code|Labeled according to software development lifecycle phase|Git repository branches|

**Table 1**

-----

#### ML deployment patterns

The fact that models and code can be managed separately results in multiple possible patterns for getting ML artifacts through staging and into production. 
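That separation is exactly what the Model Registry APIs described above make possible: a model artifact can be registered and promoted through lifecycle stages without any accompanying code release. The following is only a minimal sketch of that handoff, assuming MLflow’s stage-based registry API; the model name `fraud_model` and the toy training step are purely illustrative:

```python
import mlflow
from mlflow.tracking import MlflowClient
from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

# Train a toy model (stand-in for the weekly fraud-model retraining scenario above).
X, y = make_classification(n_samples=500, n_features=10, random_state=42)
model = LogisticRegression(max_iter=1000).fit(X, y)

# Log the model artifact and register a new version in the Model Registry.
with mlflow.start_run() as run:
    mlflow.sklearn.log_model(model, artifact_path="model")
new_version = mlflow.register_model(f"runs:/{run.info.run_id}/model", name="fraud_model")

# Promote this version without touching any pipeline code.
MlflowClient().transition_model_version_stage(
    name="fraud_model", version=new_version.version, stage="Production"
)
```

In practice, the promotion step would only run after the model passes whatever validation gates your process defines.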
We explain two major patterns below.\n\n**D E P L O Y M O D E L S**\n\ndev staging prod\n\n**D E P L O Y C O D E**\n\ndev staging prod\n\nThese two patterns differ in terms of whether the model artifact or the training code that produces the\n\nmodel artifact is promoted toward production.\n\n\n-----\n\n###### Deploy models\n\nIn the first pattern, the model artifact is generated by training code in the development environment.\n\nThis artifact is then tested in staging for compliance and performance before finally being deployed into\n\nproduction. This is a simpler handoff for data scientists, and in cases where model training is prohibitively\n\nexpensive, training the model once and managing that artifact may be preferable. However, this simpler\n\narchitecture comes with limitations. If production data is not accessible from the development environment\n\n(e.g., for security reasons), this architecture may not be viable. This architecture does not naturally support\n\nautomated model retraining. While you could automate retraining in the development environment, you\n\nwould then be treating “dev” training code as production ready, which many deployment teams would not\n\naccept. This option hides the fact that ancillary code for featurization, inference and monitoring needs to be\n\ndeployed to production, requiring a separate code deployment path.\n\n###### Deploy code\n\nIn the second pattern, the code to train models is developed in the dev environment, and this code is\n\nmoved to staging and then production. Models will be trained in each environment: initially in the dev\n\nenvironment as part of model development, in staging (on a limited subset of data) as part of integration\n\ntests, and finally in the production environment (on the full production data) to produce the final model.\n\nIf an organization restricts data scientists’ access to production data from dev or staging environments,\n\ndeploying code allows training on production data while respecting access controls. Since training code\n\ngoes through code review and testing, it is safer to set up automated retraining. Ancillary code follows the\n\nsame pattern as model training code, and both can go through integration tests in staging. However, the\n\nlearning curve for handing code off to collaborators can be steep for many data scientists, so opinionated\n\nproject templates and workflows are helpful. Finally, data scientists need visibility into training results from\n\nthe production environment, for only they have the knowledge to identify and fix ML-specific issues.\n\n\n-----\n\nThe diagram below contrasts the code lifecycle for the above deployment patterns across the different\n\nexecution environments.\n\n\nCode\ndevelopment\n\nDevelopment\nenvironment\n\n\nUnit\ntests\n\n\nIntegration\ntests\n\nDevelopment\nenvironment\n\nStaging\nenvironment\n\n\nModel\ntraining\n\n\nContinuous\ndeployment\n\nStaging\nenvironment\n\nProduction\nenvironment\n\n\nDeploy\npipelines\n\nProduction\nenvironment\n\n\n#### Deploy models\n\n Deploy code\n\n\n**In general we recommend following the “deploy code” approach, and the reference architecture in**\n\n**this document is aligned to it.** Nevertheless, there is no perfect process that covers every scenario, and\n\nthe options outlined above are not mutually exclusive. Within a single organization, you may find some use\n\ncases deploying training code and others deploying model artifacts. 
Your choice of process will depend on\n\nthe business use case, resources available and what is most likely to succeed.\n\n\n-----\n\n|Col1|Col2|DEPLOY MODELS|DEPLOY CODE|\n|---|---|---|---|\n|Process|Dev|Develop training code. Develop ancillary code.1 Train model on prod data.  Promote model and ancillary code.|Develop training code. Develop ancillary code.  Promote code.|\n||Staging|Test model and ancillary code.  Promote model and ancillary code.|Train model on data subset. Test ancillary code.  Promote code.|\n||Prod|Deploy model. Deploy ancillary pipelines.|Train model on prod data. Test model. Deploy model. Deploy ancillary pipelines.|\n|Trade-offs|Automation| Does not support automated retraining in locked-down env.| Supports automated retraining in locked-down env.|\n||Data access control| Dev env needs read access to prod training data.| Only prod env needs read access to prod training data.|\n||Reproducible models| Less eng control over training env, so harder to ensure reproducibility.| Eng control over training env, which helps to simplify reproducibility.|\n||Data science familiarity| DS team builds and can directly test models in their dev env.| DS team must learn to write and hand off modular code to eng.|\n||Support for large projects| T\u0007his pattern does not force the DS team to use modular code for model training, and it has less iterative testing.| \u0007This pattern forces the DS team to use modular code and iterative testing, which helps with coordination and development in larger projects.|\n||Eng setup and maintenance| Has the simplest setup, with less CI/CD infra required.| \u0007Requires CI/CD infra for unit and integration tests, even for one-off models.|\n|When to use||Use this pattern when your model is a one-off or when model training is very expensive. Use when dev, staging and prod are not strictly separated envs.|Use this pattern by default. Use when dev, staging and prod are strictly separated envs.|\n\n\n**Table 2** **1** “\u0007Ancillary code” refers to code for ML pipelines other than the model training pipeline. Ancillary code could be featurization, inference, monitoring or other pipelines.\n\n\n-----\n\n**CHAPTER 3:**\n## MLOps Architecture\n and Process\n\n###### Lakehouse Platform\n\n\n#### Architecture components\n\nBefore unpacking the reference architecture, take a moment to familiarize yourself with the Databricks\n\nfeatures used to facilitate MLOps in the workflow prescribed.\n\n###### Data Lakehouse\n\nA [Data Lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) unifies the best elements of data lakes and data warehouses — delivering\n\ndata management and performance typically found in data warehouses with the low-cost, flexible object\n\nstores offered by data lakes. Data in the lakehouse are typically organized using a “medallion” architecture\n\nof Bronze, Silver and Gold tables of increasing refinement and quality.\n\n###### MLflow\n\n[MLflow](https://www.mlflow.org/) is an open source project for managing the end-to-end machine learning lifecycle. It has the\n\nfollowing primary components:\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData S�ien��\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData relia)ility and .erfor2ance\n\nCloud Data Lake\nAll structured and unstructured data\n\n\n\u0007 **Tracking:** Allows you to track experiments to record and compare parameters, metrics and model\n\nartifacts. 
See documentation for [AWS](https://docs.databricks.com/applications/mlflow/tracking.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/tracking) [GCP](https://docs.gcp.databricks.com/applications/mlflow/tracking.html) .\n| |\n\n\n\u0007 **Models (“MLflow flavors”):** Allows you to store and deploy models from any ML library to a variety of\n\nmodel serving and inference platforms. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/models.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/models) [GCP](https://docs.gcp.databricks.com/applications/mlflow/models.html) .\n| |\n\n\u0007 **Model Registry:** Provides a centralized model store for managing models’ full lifecycle stage transitions:\n\n\nfrom staging to production, with capabilities for versioning and annotating. The registry also provides\n\nwebhooks for automation and continuous deployment. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/model-registry.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-registry) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-registry.html) .\n| |\n\nDatabricks also provides a fully managed and hosted version of MLflow with enterprise security features,\n\nhigh availability, and other Databricks workspace features such as experiment and run management and\n\nnotebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\n\nmachine learning model training runs and running machine learning projects.\n\n\n-----\n\n###### Databricks and MLflow Autologging\n\nDatabricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\n\nexperiment tracking for machine learning training sessions on Databricks. Databricks Autologging\n\n\nautomatically captures model parameters, metrics, files and lineage information when you train models with\n\ntraining runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html) .\n| |\n\n###### Feature Store\n\nThe Databricks Feature Store is a centralized repository of features. It enables feature sharing and discovery\n\n\nacross an organization and also ensures that the same feature computation code is used for model training\n\nand inference. See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html) .\n| |\n\n###### MLflow Model Serving\n\nMLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints\n\n\nthat are updated automatically based on the availability of model versions and their stages. 
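Several of these MLflow components recur throughout the reference architecture that follows, so a minimal sketch of how experiment tracking and model logging fit together in code may be useful; the experiment path, parameter and metric names here are illustrative, not prescribed by the architecture.

```
import mlflow
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split

X, y = make_classification(n_samples=1_000, random_state=42)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

mlflow.set_experiment("/Shared/mlops_example")  # illustrative experiment path

with mlflow.start_run() as run:
    model = RandomForestClassifier(n_estimators=100).fit(X_train, y_train)
    mlflow.log_param("n_estimators", 100)
    mlflow.log_metric("test_accuracy", accuracy_score(y_test, model.predict(X_test)))
    # Log the model artifact so that later steps can register and serve it
    mlflow.sklearn.log_model(model, artifact_path="model")
```

The run ID captured here is what downstream registration and deployment steps use to locate the artifact.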
See\n\ndocumentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html) .\n| |\n\n###### Databricks SQL\n\nDatabricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their\n\n\ndata lake, create multiple visualization types to explore query results from different perspectives, and build\n\nand share dashboards. See documentation for [AWS](https://docs.databricks.com/sql/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/sql/) [GCP](https://docs.gcp.databricks.com/sql/index.html) .\n| |\n\n###### Databricks Workflows and Jobs\n\nDatabricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive\n\n\nways. For ML, Jobs can be used to define pipelines for computing features, training models, or other ML\n\nsteps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html) .\n| |\n\n\n-----\n\n#### Reference architecture\n\nWe are now ready to review a general reference architecture for implementing MLOps on the Databricks\n\nLakehouse platform using the recommended “deploy code” pattern from earlier. This is intended to cover\n\nthe majority of use cases and ML techniques, but it is by no means comprehensive. When appropriate,\n\nwe will highlight alternative approaches to implementing different parts of the process.\n\nWe begin with an overview of the system end-to-end, followed by more detailed views of the process\n\nin development, staging and production environments. These diagrams show the system as it operates\n\nin a steady state, with the finer details of iterative development cycles omitted. This structure is\n\nsummarized below.\n\n**O V E R V I E W**\n```\n dev staging prod\n\n```\n\n\u0007Data\n\n\u0007Exploratory data analysis (EDA)\n\n\u0007Project code\n\n\u0007Feature table refresh\n\n\u0007Model training\n\n\u0007Commit code\n\n\n\u0007Merge request\n\n\u0007Unit tests (CI)\n\n\u0007Integration tests (CI)\n\n\u0007Merge\n\n\u0007Cut release branch\n\n\n\u0007Feature table refresh\n\n\u0007Model training\n\n\u0007Continuous deployment (CD)\n\n\u0007Online serving (REST APIs)\n\n\u0007Inference: batch or streaming\n\n\u0007Monitoring\n\n\u0007Retraining\n\n\n-----\n\n###### Overview\n\nSource control\n\ndev staging (main) release\n\nMerge reIuest to staging Cut release branch Pull from release branch to production\n\n\n**Figure 3**\n\n\nDevelopment\nenvironment\n\nExploratory\ndata analysis\n\n\nStaging\nenvironment\n\nCreate dev branch Commit code C} trigger Merge\n\n\nProduction\nenvironment\n\nModel Registry\n\nSt�ge{ �one St�ge{ St�ging St�ge{ Production\n\n\n. . 
.\n\n\nInference & serving dev\n\nFeature table refresh dev\n\n\nUnit tests\n(CI)\n\n\nPush model to registr� Load model for testing Load model for inference\n\nIntegration\ntests (CI)\n\n\ndev\n\n\ndev\n\n\nPromote to production\n\n\nInference & serving\n\n\nModel training dev\n\nrelease\n\ndev\n\n\nFeature\ntable refresh\n\nrelease\n\n\nMode�\ntraining\n\nrelease\n\n\nContinuous\nDeployment (CD)\n\nrelease\n\n\nMonitoring\n\nrelease\n\n\nData tables Feature tables Feature tables Data tables Feature tables Metrics tables\n\nHere we see the overall process for deploying code and model artifacts, the inputs and outputs for pipelines,\n\nand model lifecycle stages in production. Code source control is the primary conduit for deploying ML\n\npipelines from development to production. Pipelines and models are prototyped on a dev branch in the\n\ndevelopment environment, and changes to the codebase are committed back to source control. Upon merge\n\nrequest to the staging branch (usually the “main” branch), a continuous integration (CI) process tests the\n\ncode in the staging environment. If the tests pass, new code can be deployed to production by cutting a\n\ncode release. In production, a model is trained on the full production data and pushed to the MLflow Model\n\nRegistry. A continuous deployment (CD) process tests the model and promotes it toward the production\n\nstage in the registry. The Model Registry’s production model can be served via batch, streaming or REST API.\n\n\n-----\n\n###### Dev\n\nIn the development environment, data scientists and ML engineers can collaborate on all pipelines in\n\nan ML project, committing their changes to source control. While engineers may help to configure this\n\nenvironment, data scientists typically have significant control over the libraries, compute resources and\n\ncode that they use.\n\n\n**Figure 4** Development environment\n\n0�\n\nE�ploratory\ndata analysis\n\n0�\n\n\ndev\n\n\nSource control\n\nTracking Server\n\nMetrics Parameters Models\n\ndev\n\n\n. . .\n\nmodels\n\n\ntrain.py\n\ndeploy.py\n\nin(erence.py\n\nmonitoring.py\n\ndat<\n\n(eaturization.py\n\ntests\n\nunit.py\n\nintegration.py\n\n\nInference: Streaming or batch\n\n\nFeature table refresh\n\nData\nFeaturization\npreparation\n\n\nModel training\n\nTraining and\nEvaluation\ntuning\n\n\nCreate dev mrancg\n\n0u\n\nCommit code\n\n\n04\n\n\n\ndev\n\n\ndev\n\n\n0�\n\n\nLakehouse\n\n\nFeature tamles Bronze / Silver / Gold\n\nprod data\n\n\nFeature tamles Temp tamles\n\ndev data\n\n\n-----\n\n###### Data\n\nData scientists working in the dev environment possess read-only access to production data. They also\n\nrequire read-write access to a separate dev storage environment to develop and experiment with new\n\nfeatures and other data tables.\n\n###### Exploratory data analysis (EDA)\n\nThe data scientist explores and analyzes data in an interactive, iterative process. This process is used to\n\nassess whether the available data has the potential to address the business problem. EDA is also where the\n\ndata scientist will begin discerning what data preparation and featurization are required for model training.\n\nThis ad hoc process is generally not part of a pipeline that will be deployed in other execution environments.\n\n###### Project code\n\nThis is a code repository containing all of the pipelines or modules involved in the ML system. Dev branches\n\nare used to develop changes to existing pipelines or to create new ones. 
Even during EDA and initial phases of\n\na project, it is recommended to develop within a repository to help with tracking changes and sharing code.\n\n\n-----\n\n###### Feature table refresh\n\nThis pipeline reads from raw data tables and feature tables and writes to tables in the Feature Store. The\n\npipeline consists of two steps:\n\n\u0007 **Data preparation**\n\nThis step checks for and corrects any data quality issues prior to featurization.\n\n**\u0007Featurization**\n\nIn the dev environment, new features and updated featurization logic can be tested by writing to feature\n\ntables in dev storage, and these dev feature tables can be used for model prototyping. Once this\n\nfeaturization code is promoted to production, these changes will affect the production feature tables.\n\nFeatures already available in production feature tables can be read directly for development.\n\nIn some organizations, feature engineering pipelines are managed separately from ML projects. In such\n\ncases, the featurization pipeline can be omitted from this architecture.\n\n\n-----\n\n###### Model training\n\nData scientists develop the model training pipeline in the dev environment with dev or prod feature tables.\n\n\u0007 **Training and tuning**\n\nThe training process reads features from the feature store and/or Silver- or Gold-level Lakehouse tables,\n\nand it logs model parameters, metrics and artifacts to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . After training and\n\nhyperparameter tuning, the final model artifact is logged to the tracking server to record a robust link\n\nbetween the model, its input data, and the code used to generate it.\n\n**\u0007Evaluation**\n\nModel quality is evaluated by testing on held-out data. The results of these tests are logged to the\n\nMLflow tracking server.\n\nIf governance requires additional metrics or supplemental documentation about the model, this is the\n\ntime to add them using MLflow tracking. Model interpretations (e.g., plots produced by [SHAP](https://shap.readthedocs.io/en/latest/index.html) or [LIME](https://arxiv.org/abs/1602.04938) )\n\nand plain text descriptions are common, but defining the specifics for such governance requires input\n\nfrom business stakeholders or a data governance officer.\n\n**\u0007Model output**\n\nThe output of this pipeline is an ML model artifact stored in the MLflow tracking server. When this\n\ntraining pipeline is run in staging or production, ML engineers (or their CI/CD code) can load the model\n\nvia the model URI (or path) and then push the model to the Model Registry for management and testing.\n\n###### Commit code\n\nAfter developing code for featurization, training, inference and other pipelines, the data scientist or\n\nML engineer commits the dev branch changes into source control. This section does not discuss the\n\ncontinuous deployment, inference or monitoring pipelines in detail; see the “Prod” section below for more\n\ninformation on those.\n\n\n-----\n\n###### Staging\n\nThe transition of code from development to production occurs in the staging environment. This code\n\nincludes model training code and ancillary code for featurization, inference, etc. 
Both data scientists and ML\n\nengineers are responsible for writing tests for code and models, but ML engineers manage the continuous\n\nintegration pipelines and orchestration.\n\nSource control\n\n0] 0_\n\ndev staging >main< release\n\nMerge reHuest to staging Cut release branch\n\nStaging environment\n\nCI trigger Merge\n\n0�\n\n\n**Figure 5**\n\n\nUnit tests\n(CI)\n\n\nTracking Server\n\n0�\n\nModel Registry\n\ndev\n\n\n03\n\nIntegration tests (CI)\n\n\nFeature\nStore tests\n\n\nModel\ntraining tests\n\n\nModel\ndeployment\ntests\n\n\nInference\ntests\n\n\nModel\nmonitoring\ntests\n\n\nLakehouse\n\n\ndev\n\nFeature tables Temp tables\n\nstaging data\n\n\n-----\n\n###### Data\n\nThe staging environment may have its own storage area for testing feature tables and ML pipelines. This\n\ndata is generally temporary and only retained long enough to run tests and to investigate test failures. This\n\ndata can be made readable from the development environment for debugging.\n\n###### Merge code\n\n\u0007 **Merge request**\n\nThe deployment process begins when a merge (or pull) request is submitted against the staging branch\n\nof the project in source control. It is common to use the “main” branch as the staging branch.\n\n**\u0007Unit tests (CI)**\n\nThis merge request automatically builds source code and triggers unit tests. If tests fail, the merge\n\nrequest is rejected.\n\n\n-----\n\n###### Integration tests (CI)\n\nThe merge request then goes through integration tests, which run all pipelines to confirm that they function\n\ncorrectly together. The staging environment should mimic the production environment as much as is\n\nreasonable, running and testing pipelines for featurization, model training, inference and monitoring.\n\nIntegration tests can trade off fidelity of testing for speed and cost. For example, when models are\n\nexpensive to train, it is common to test model training on small data sets or for fewer iterations to reduce\n\ncost. When models are deployed behind REST APIs, some high-SLA models may merit full-scale load\n\ntesting within these integration tests, whereas other models may be tested with small batch jobs or a few\n\nqueries to temporary REST endpoints.\n\nOnce integration tests pass on the staging branch, the code may be promoted toward production.\n\n\u0007 **Merge**\n\nIf all tests pass, the new code is merged into the staging branch of the project. If tests fail, the CI/CD\n\nsystem should notify users and post results on the merge (pull) request.\n\nNote: It can be useful to schedule periodic integration tests on the staging branch, especially if the branch is\n\nupdated frequently with concurrent merge requests.\n\n###### Cut release branch\n\nOnce CI tests have passed on a commit in the staging branch, ML engineers can cut a release branch from\n\nthat commit.\n\n\n-----\n\n**Figure 6**\n\n\n###### Prod\n\nThe production environment is typically managed by a select set of ML engineers and is where ML pipelines\n\ndirectly serve the business or application. These pipelines compute fresh feature values, train and test new\n\nmodel versions, publish predictions to downstream tables or applications, and monitor the entire process to\n\navoid performance degradation and instability. 
While we illustrate batch and streaming inference alongside online serving below, most ML applications will use only one of these methods, depending on the business requirements.

[Figure 6 depicts the production environment: a feature table refresh job (data preparation, featurization); model training (training and tuning, evaluation, register and request transition); continuous deployment (compliance checks, compare Staging vs. Production, request model transition to Production); online serving and batch or streaming inference loading the registered model and logging requests and predictions; and monitoring (data ingest, check model performance and data drift, publish metrics, trigger model training), all reading from and writing to data, feature and monitoring tables in the Lakehouse. The Model Registry tracks versions through stage=None, stage=Staging and stage=Production.]

-----

Though data scientists may not have write or compute access in the production environment, it is important to provide them with visibility to test results, logs, model artifacts and the status of ML pipelines in production. This visibility allows them to identify and diagnose problems in production.

###### Feature table refresh

This pipeline transforms the latest production Lakehouse data into production feature tables. It can use batch or streaming computation, depending on the freshness requirements for downstream training and inference. The pipeline can be defined as a [Databricks Job](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.rxs6npet1ull) which is scheduled, triggered or continuously running.

###### Model training

The model training pipeline runs either when code changes affect upstream featurization or training logic, or when automated retraining is scheduled or triggered. This pipeline runs on the full production data.

\u0007 **Training and tuning**

During the training process, logs are recorded to the [MLflow tracking server](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) . These include model metrics, parameters, tags and the model itself.

During development, data scientists may test many algorithms and hyperparameters, but it is common to restrict those choices to the top-performing options in the production training code. Restricting tuning can reduce the variance from tuning in automated retraining, and it can make training and tuning faster.

**\u0007Evaluation**

Model quality is evaluated by testing on held-out production data. The results of these tests are logged to the MLflow tracking server. During development, data scientists will have selected meaningful evaluation metrics for the use case, and those metrics or their custom logic will be used in this step.

**\u0007Register and request transition**

Following model training, the model artifact is registered to the [MLflow Model Registry](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) of the production environment, set initially to ’stage=None’.
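A minimal sketch of this registration step and the stage-transition request that follows, using the MLflow client APIs; the run ID and model name are illustrative, and a direct transition is shown where a gated setup would instead raise a transition request for human approval:

```
import mlflow
from mlflow.tracking import MlflowClient

run_id = "<run-id-from-training-pipeline>"  # produced by the training run above
model_uri = f"runs:/{run_id}/model"

# Register the trained artifact; new versions start at stage=None
registered = mlflow.register_model(model_uri, name="fraud_model")

# Shown as a direct transition for brevity; a gated workflow would submit a
# transition request (e.g., reviewed via the MLflow UI or automated webhooks)
MlflowClient().transition_model_version_stage(
    name="fraud_model",
    version=registered.version,
    stage="Staging",
)
```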
The final step of this pipeline is to request a transition of the\n\n\n-----\n\n###### Continuous deployment (CD)\n\nThe CD pipeline is executed when the training pipeline finishes and requests to transition the model to\n\n‘stage=Staging’. There are three key tasks in this pipeline:\n\n\u0007 **Compliance checks**\n\nThese tests load the model from the Model Registry, perform compliance checks (for tags, documentation,\n\netc.), and approve or reject the request based on test results. If compliance checks require human\n\nexpertise, this automated step can compute statistics or visualizations for people to review in a manual\n\napproval step at the end of the CD pipeline. Regardless of the outcome, results for that model version\n\nare recorded to the Model Registry through metadata in tags and comments in descriptions.\n\nThe MLflow UI can be used to manage stage transition requests manually, but requests and transitions\n\ncan be automated via MLflow APIs and [webhooks](https://docs.databricks.com/applications/mlflow/model-registry-webhooks.html) . If the model passes the compliance checks, then\n\nthe transition request is approved and the model is promoted to ‘stage=Staging’. If the model fails, the\n\ntransition request is rejected and the model is moved to ‘stage=Archived’ in the Model Registry.\n\n**\u0007Compare staging vs. production**\n\nTo prevent performance degradation, models promoted to ‘stage=Staging’ must be compared to the\n\n‘stage=Production’ models they are meant to replace. The metric(s) for comparison should be defined\n\naccording to the use case, and the method for comparison can vary from canary deployments to A/B\n\ntests. All comparison results are saved to metrics tables in the lakehouse.\n\nIf this is the first deployment and there is no ‘stage=Production’ model yet, the ‘stage=Staging’ model\n\nshould be compared to a business heuristic or other threshold as a baseline. For a new version\n\nof an existing ‘stage=Production’ model, the ‘stage=Staging’ model is compared with the current\n\n‘stage=Production’ model.\n\n\n-----\n\n**\u0007Request model transition to production**\n\nIf the candidate model passes the comparison tests, a request is made to transition it to\n\n‘stage=Production’ in the Model Registry. As with other stage transition requests, notifications,\n\napprovals and rejections can be managed manually via the MLflow UI or automatically through APIs and\n\nwebhooks. This is also a good time to consider human oversight, as it is the last step before a model is\n\nfully available to downstream applications. A person can manually review the compliance checks and\n\nperformance comparisons to perform checks which are difficult to automate.\n\n###### Online serving (REST APIs)\n\nFor lower throughput and lower latency use cases, online serving is generally necessary. With MLflow, it is\n\nsimple to deploy models to [Databricks Model Serving](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.72shqep1kelf) , cloud provider serving endpoints, or on-prem or\n\ncustom serving layers.\n\nIn all cases, the serving system loads the production model from the Model Registry upon initialization. On\n\neach request, it fetches features from an online Feature Store, scores the data and returns predictions. 
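Whether the model sits behind a REST endpoint or runs inside a batch or streaming job, the loading step looks the same: resolve the registered model's 'Production' stage and score through the generic pyfunc interface. A minimal sketch with an illustrative model name and feature columns:

```
import mlflow
import pandas as pd

# "models:/<name>/<stage>" resolves to the latest version in that stage
model = mlflow.pyfunc.load_model("models:/fraud_model/Production")

batch = pd.DataFrame(
    {"amount": [12.5, 980.0], "num_visits": [3, 1]}  # illustrative feature columns
)
predictions = model.predict(batch)
# A batch job would publish `predictions` to a Lakehouse table or message queue
```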
The\n\nserving system, data transport layer or the model itself could log requests and predictions.\n\n###### Inference: batch or streaming\n\nThis pipeline is responsible for reading the latest data from the Feature Store, loading the model from\n\n‘stage=Production’ in the Model Registry, performing inference and publishing predictions. For higher\n\nthroughput, higher latency use cases, batch or streaming inference is generally the most cost-effective\n\noption.\n\nA batch job would likely publish predictions to Lakehouse tables, over a JDBC connection, or to flat files.\n\nA streaming job would likely publish predictions either to Lakehouse tables or to message queues like\n\nApache Kafka.®\n\n\n-----\n\n###### Monitoring\n\nInput data and model predictions are monitored, both for statistical properties (data drift, model\n\nperformance, etc.) and for computational performance (errors, throughput, etc.). These metrics are\n\npublished for dashboards and alerts.\n\n\u0007 **Data ingestion**\n\nThis pipeline reads in logs from batch, streaming or online inference.\n\n**\u0007Check accuracy and data drift**\n\nThe pipeline then computes metrics about the input data, the model’s predictions and the infrastructure\n\nperformance. Metrics that measure statistical properties are generally chosen by data scientists during\n\ndevelopment, whereas metrics for infrastructure are generally chosen by ML engineers.\n\n\u0007 **Publish metrics**\n\nThe pipeline writes to Lakehouse tables for analysis and reporting. Tools such as [Databricks SQL](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.nsthucrt9k77) are used\n\nto produce monitoring dashboards, allowing for health checks and diagnostics. The monitoring job or the\n\ndashboarding tool issues notifications when health metrics surpass defined thresholds.\n\n**\u0007Trigger model training**\n\nWhen the model monitoring metrics indicate performance issues, or when a model inevitably becomes\n\nout of date, the data scientist may need to return to the development environment and develop a new\n\nmodel version.\n\n\n-----\n\n**Note:** While automated retraining is supported\n\nin this architecture, it isn’t required, and caution\n\n\n###### Retraining\n\nThis architecture supports automatic retraining using the same model training pipeline above. While we\n\nrecommend beginning with manually triggered retraining, organizations can add scheduled and/or triggered\n\nretraining when needed.\n\n\u0007 **Scheduled**\n\nIf fresh data are regularly made available, rerunning model training on a defined schedule can help models\n\nto keep up with changing trends and behavior.\n\n**\u0007Triggered**\n\nIf the monitoring pipeline can identify model performance issues and send alerts, it can additionally\n\ntrigger retraining. For example, if the distribution of incoming data changes significantly or if the model\n\nperformance degrades, automatic retraining and redeployment can boost model performance with\n\nminimal human intervention.\n\n\nmust be taken in cases where it is implemented.\n\n\nIt is inherently difficult to automate selecting the\n\ncorrect action to take from model monitoring\n\n\nWhen the featurization or retraining pipelines themselves begin to exhibit performance issues, the data\n\nscientist may need to return to the dev environment and resume experimentation to address such issues.\n\n\nalerts. 
For example, if data drift is observed, does\n\nit indicate that we should automatically retrain, or\n\ndoes it indicate that we should engineer additional\n\nfeatures to encode some new signal in the data?\n\n\n-----\n\n**CHAPTER 4:**\n## LLMOps – Large Language Model Operations\n\n\n#### Large language models\n\nLLMs have splashed into the mainstream of business and news, and there is no doubt that they will disrupt\n\ncountless industries. In addition to bringing great potential, they present a new set of questions for MLOps:\n\n\u0007Is prompt engineering part of operations, and if so, what is needed?\n\n\u0007Since the “large” in “LLM” is an understatement, how do cost/performance trade-offs change?\n\n\u0007Is it better to use paid APIs or to fine-tune one’s own model?\n\n…and many more!\n\nThe good news is that “LLMOps” (MLOps for LLMs) is not that different from traditional MLOps. However,\n\nsome parts of your MLOps platform and process may require changes, and your team will need to learn a\n\nmental model of how LLMs coexist alongside traditional ML in your operations.\n\nIn this section, we will explain what may change for MLOps when introducing LLMs. We will discuss several\n\nkey topics in detail, from prompt engineering to packaging, to cost/performance trade-offs. We also provide\n\na reference architecture diagram to illustrate what may change in your production environment.\n\n###### What changes with LLMs?\n\nFor those not familiar with large language models (LLMs), see [this summary](https://www.databricks.com/product/machine-learning/large-language-models) for a quick introduction. The\n\none-sentence summary is: LLMs are a new class of natural language processing (NLP) models that have\n\nsignificantly surpassed their predecessors in performance across a variety of tasks, such as open-ended\n\nquestion answering, summarization and execution of near-arbitrary instructions.\n\nFrom the perspective of MLOps, LLMs bring new requirements, with implications for MLOps practices and\n\nplatforms. We briefly summarize key properties of LLMs and the implications for MLOps here, and we delve\n\ninto more detail in the next section.\n\n\n-----\n\n**Table 3**\n\n\n\n|KEY PROPERTIES OF LLMS|IMPLICATIONS FOR MLOPS|\n|---|---|\n|LLMs are available in many forms: \u0007Very general proprietary models behind paid APIs \u0007Open source models that vary from general to specific applications \u0007Custom models fine-tuned for specific applications|Development process: Projects often develop incrementally, starting from existing, third-party or open source models and ending with custom fine-tuned models.|\n|Many LLMs take general natural language queries and instructions as input. Those queries can contain carefully engineered “prompts” to elicit the desired responses.|Development process: Designing text templates for querying LLMs is often an important part of developing new LLM pipelines. Packaging ML artifacts: Many LLM pipelines will use existing LLMs or LLM serving endpoints; the ML logic developed for those pipelines may focus on prompt templates, agents or “chains” instead of the model itself. 
The ML artifacts packaged and promoted to production may frequently be these pipelines, rather than models.|\n|Many LLMs can be given prompts with examples and context, or additional information to help answer the query.|Serving infrastructure: When augmenting LLM queries with context, it is valuable to use previously uncommon tooling such as vector databases to search for relevant context.|\n|LLMs are very large deep learning models, often ranging from gigabytes to hundreds of gigabytes.|Serving infrastructure: Many LLMs may require GPUs for real-time model serving. Cost/performance trade-offs: Since larger models require more computation and are thus more expensive to serve, techniques for reducing model size and computation may be required.|\n|LLMs are hard to evaluate via traditional ML metrics since there is often no single “right” answer.|Human feedback: Since human feedback is essential for evaluating and testing LLMs, it must be incorporated more directly into the MLOps process, both for testing and monitoring and for future fine-tuning.|\n\n\n-----\n\nThe list above may look long, but as we will see in the next section, many existing tools and processes\n\nonly require small adjustments in order to adapt to these new requirements. Moreover, many aspects\n\ndo not change:\n\n\u0007The separation of development, staging and production remains the same\n\n\u0007Git version control and model registries remain the primary conduits for promoting pipelines and\n\nmodels toward production\n\n\u0007The lakehouse architecture for managing data remains valid and essential for efficiency\n\n\u0007Existing CI/CD infrastructure should not require changes\n\n\u0007The modular structure of MLOps remains the same, with pipelines for data refresh, model tuning,\n\nmodel inference, etc.\n\n\n-----\n\n#### Discussion of key topics for LLMOps\n\nSo far, we have listed top potential changes to MLOps as you introduce LLMs. In this section, we will dive into\n\nmore details about selected topics.\n\n###### Prompt engineering\n\nPrompt engineering is the practice of adjusting the text prompt given to an LLM in order to elicit better\n\nresponses — using engineering techniques. It is a very new practice, but some best practices are emerging.\n\nWe will cover a few tips and best practices and link to useful resources.\n\n**1** \u0007Prompts and prompt engineering are model-specific. A prompt given to two different models will\n\ngenerally _not_ produce the same results. Similarly, prompt engineering tips do not apply to all models.\n\nIn the extreme case, many LLMs have been fine-tuned for specific NLP tasks and do not even require\n\nprompts. On the other hand, very general LLMs benefit greatly from carefully crafted prompts.\n\n**2** \u0007When approaching prompt engineering, go from simple to complex: track, templatize and automate.\n\n\u0007Start by tracking queries and responses so that you can compare them and iterate to improve\n\nprompts. Existing tools such as MLflow provide tracking capabilities; see [MLflow LLM Tracking](https://mlflow.org/docs/latest/llm-tracking.html) for\n\nmore details. Checking structured LLM pipeline code into version control also helps with prompt\n\ndevelopment, for git diffs allow you to review changes to prompts over time. 
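A minimal sketch of that tracking step, logging a prompt template and a batch of query/response pairs to an MLflow run; the template and the `call_llm` helper are placeholders for whatever pipeline is being iterated on:

```
import json
import mlflow

PROMPT_TEMPLATE = "Summarize the following support ticket in one sentence:\n{ticket}"

def call_llm(prompt: str) -> str:
    return "<model response>"  # placeholder for an API call or local model

queries = ["Customer cannot reset their password.", "App crashes on startup."]

with mlflow.start_run(run_name="prompt_iteration_01"):
    mlflow.log_param("prompt_template", PROMPT_TEMPLATE)
    records = [
        {"query": q, "response": call_llm(PROMPT_TEMPLATE.format(ticket=q))}
        for q in queries
    ]
    # Store raw pairs as an artifact so prompt versions can be compared later
    mlflow.log_text(json.dumps(records, indent=2), "prompt_responses.json")
```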
Also see the section\n\nbelow on packaging model and pipelines for more information about tracking prompt versions.\n\n\u0007Then, consider using tools for building prompt templates, especially if your prompts become complex.\n\nNewer LLM-specific tools such as [LangChain](https://python.langchain.com/en/latest/index.html) and [LlamaIndex](https://gpt-index.readthedocs.io/en/latest/) provide such templates and more.\n\n\u0007Finally, consider automating prompt engineering by replacing manual engineering with automated\n\ntuning. Prompt tuning turns prompt development into a data-driven process akin to hyperparameter\n\ntuning for traditional ML. The [Demonstrate-Search-Predict (DSP) Framework](https://github.com/stanfordnlp/dsp) is a good example of a\n\ntool for prompt tuning.\n\n\n-----\n\n###### Resources\n\nThere are lots of good resources about\nprompt engineering, especially for popular\n\nmodels and services:\n\n\u0007DeepLearning.AI course on [ChatGPT](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n\n[Prompt Engineering](https://www.deeplearning.ai/short-courses/chatgpt-prompt-engineering-for-developers/)\n\n\u0007DAIR.AI [Prompt Engineering Guide](https://www.promptingguide.ai/)\n\n\u0007 [Best practices for prompt engineering](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n[with the OpenAI API](https://help.openai.com/en/articles/6654000-best-practices-for-prompt-engineering-with-openai-api)\n\n\n**3** \u0007Most prompt engineering tips currently published online are for ChatGPT, due to its immense\n\npopularity. Some of these generalize to other models as well. We will provide a few tips here:\n\n\u0007Use clear, specific prompts, which may include an instruction, context (if needed), a user query or\n\ninput, and a description of the desired output type or format\n\n\u0007Provide examples in your prompt (“few-shot learning”) to help the LLM to understand what you want\n\n\u0007Tell the model how to behave, such as telling it to admit if it cannot answer a question\n\n\u0007Tell the model to think step-by-step or explain its reasoning\n\n\u0007If your prompt includes user input, use techniques to prevent prompt hacking, such as making it very\n\nclear which parts of the prompt correspond to your instruction vs. user input\n\n\n-----\n\n###### Packaging models or pipelines for deployment\n\nIn traditional ML, there are generally two types of ML logic to package for deployment: models and\n\npipelines. These artifacts are generally managed toward production via a Model Registry and Git version\n\ncontrol, respectively.\n\nWith LLMs, it is common to package ML logic in new forms. These may include:\n\n\u0007A lightweight call to an LLM API service (third party or internal)\n\n\u0007A “chain” from LangChain or an analogous pipeline from another tool. The chain may call an LLM API or a\n\nlocal LLM model.\n\n\u0007An LLM or an LLM+tokenizer pipeline, such as a [Hugging Face](https://huggingface.co/) pipeline. This pipeline may use a\n\npretrained model or a custom fine-tuned model.\n\n\u0007An engineered prompt, possibly stored as a template in a tool such as LangChain\n\nThough LLMs add new terminology and tools for composing ML logic, all of the above still constitute models\n\nand pipelines. Thus, the same tooling such as [MLflow](https://mlflow.org/) can be used to package LLMs and LLM pipelines for\n\ndeployment. 
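As one concrete option, a prompt template plus an LLM call can be wrapped as a custom pyfunc model and versioned like any other MLflow model; the call inside `predict` is a placeholder rather than any specific provider's SDK:

```
import mlflow
import pandas as pd

class SummarizerPipeline(mlflow.pyfunc.PythonModel):
    """Packages a prompt template and an LLM call as one deployable artifact."""

    PROMPT = "Summarize in one sentence:\n{text}"

    def _call_llm(self, prompt: str) -> str:
        return "<call your LLM API or local model here>"  # placeholder

    def predict(self, context, model_input: pd.DataFrame):
        return [self._call_llm(self.PROMPT.format(text=t)) for t in model_input["text"]]

with mlflow.start_run():
    mlflow.pyfunc.log_model(artifact_path="summarizer", python_model=SummarizerPipeline())
```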
[Built-in model flavors](https://mlflow.org/docs/latest/models.html) include:\n\n\u0007PyTorch and TensorFlow\n\n\u0007Hugging Face Transformers (relatedly, see Hugging Face Transformers’s [MLflowCallback](https://huggingface.co/docs/transformers/en/main_classes/callback#transformers.integrations.MLflowCallback) )\n\n\u0007LangChain\n\n\u0007OpenAI API\n\n\u0007(See the [documentation](https://mlflow.org/docs/latest/models.html) for a complete list)\n\nFor other LLM pipelines, MLflow can package the pipelines via the [MLflow pyfunc flavor](https://mlflow.org/docs/latest/models.html#python-function-python-function) , which can store\n\narbitrary Python code.\n\n\n**Note about prompt versioning:** Just as it is helpful\n\nto track model versions, it is helpful to track prompt\n\nversions (and LLM pipeline versions, more generally).\n\nPackaging prompts and pipelines as MLflow Models\n\nsimplifies versioning. Just as a newly retrained\n\nmodel can be tracked as a new model version in the\n\nMLflow Model Registry, a newly updated prompt can\n\nbe tracked as a new model version.\n\n**Note about deploying models vs. code:** Your\n\ndecisions around packaging ML logic as version\n\ncontrolled code vs. registered models will help\n\nto inform your decision about choosing between\n\nthe deploy models, deploy code and hybrid\n\narchitectures. Review the subsection below about\n\nhuman feedback, and make sure that you have a\n\nwell-defined testing process for whatever artifacts\n\nyou choose to deploy.\n\n\n-----\n\n###### Managing cost/performance trade-offs\n\nOne of the big Ops topics for LLMs is managing cost/performance trade-offs, especially for inference\n\nand serving. With “small” LLMs having hundreds of millions of parameters and large LLMs having hundreds\n\nof billions of parameters, computation can become a major expense. Thankfully, there are many ways to\n\nmanage and reduce costs when needed. We will review some key tips for balancing productivity and costs.\n\n**1** \u0007Start simple, but plan for scaling. When developing a new LLM-powered application, speed of\n\ndevelopment is key, so it is acceptable to use more expensive options, such as paid APIs for existing\n\nmodels. As you go, make sure to collect data such as queries and responses. In the future, you can use\n\nthat data to fine-tune a smaller, cheaper model which you can own.\n\n**2** \u0007Scope out your costs. How many queries per second do you expect? Will requests come in bursts?\n\nHow much does each query cost? These estimates will inform you about project feasibility and will help\n\nyou to decide when to consider bringing the model in-house with open source models and fine-tuning.\n\n**3** \u0007Reduce costs by tweaking LLMs and queries. There are many LLM-specific techniques for reducing\n\ncomputation and costs. These include shortening queries, tweaking inference configurations and using\n\nsmaller versions of models.\n\n**4** \u0007Get human feedback. 
It is easy to reduce costs but hard to say how changes impact your results,\n\nunless you get human feedback from end users.\n\n\n-----\n\n###### Resources\n\n**Fine-tuning**\n\n\u0007 [Fine-Tuning Large Language Models with](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n[Hugging Face and DeepSpeed](https://www.databricks.com/blog/2023/03/20/fine-tuning-large-language-models-hugging-face-and-deepspeed.html)\n\n\u0007Webinar: [Build Your Own Large Language](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n\n[Model Like Dolly: How to fine-tune and](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n[deploy your custom LLM](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n\n**Model distillation,**\n**quantization and pruning**\n\n\n###### Methods for reducing costs of inference\n\n**Use a smaller model**\n\n\u0007Pick a different existing model. Try smaller versions of models (such as “t5-small” instead of “t5-base”)\n\nor alternate architectures.\n\n\u0007Fine-tune a custom model. With the right training data, a fine-tuned model can often be smaller and/or\n\nperform better than a generic model.\n\n\u0007Use model distillation (or knowledge distillation). This technique “distills” the knowledge of the original\n\nmodel into a smaller model.\n\n\u0007Reduce floating point precision (quantization). Models can sometimes use lower precision arithmetic\n\nwithout losing much in quality.\n\n\n\u0007 [Gentle Introduction to 8-bit Matrix](https://huggingface.co/blog/hf-bitsandbytes-integration)\n\n\n**\u0007Reduce computation for a given model**\n\n\u0007Shorten queries and responses. Computation scales with input and output sizes, so using more concise\n\nqueries and responses reduces costs.\n\n\u0007Tweak inference configurations. Some types of inference, such as beam search, require more computation.\n\n**Other**\n\n\u0007Split traffic. If your return on investment (ROI) for an LLM query is low, then consider splitting traffic so that\n\nlow ROI queries are handled by simpler, faster models or methods. Save LLM queries for high ROI traffic.\n\n\u0007Use pruning techniques. If you are training your own LLMs, there are pruning techniques that allow\n\nmodels to use sparse computation during inference. This reduces computation for most or all queries.\n\n\n[Multiplication for transformers at scale](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[using Hugging Face Transformers,](https://huggingface.co/blog/hf-bitsandbytes-integration)\n[Accelerate and bitsandbytes](https://huggingface.co/blog/hf-bitsandbytes-integration)\n\n\u0007 [Large Transformer Model Inference](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n[Optimization](https://lilianweng.github.io/posts/2023-01-10-inference-optimization/)\n\n\u0007 [Making LLMs even more accessible with](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n\n[bitsandbytes, 4-bit quantization and](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n[QLoRA](https://huggingface.co/blog/4bit-transformers-bitsandbytes)\n\n\n-----\n\n###### Human feedback, testing, and monitoring\n\nWhile human feedback is important in many traditional ML applications, it becomes much more important\n\nfor LLMs. Since most LLMs output natural language, it is very difficult to evaluate the outputs via traditional\n\nmetrics. 
For example, suppose an LLM were used to summarize a news article. Two equally good summaries\n\nmight have almost completely different words and word orders, so even defining a “ground-truth” label\n\nbecomes difficult or impossible.\n\nHumans — ideally your end users — become essential for validating LLM output. While you can pay human\n\nlabelers to compare or rate model outputs, the best practice for user-facing applications is to build human\n\nfeedback into the applications from the outset. For example, a tech support chatbot may have a “click here\n\nto chat with a human” option, which provides implicit feedback indicating whether the chatbot’s responses\n\nwere helpful.\n\nIn terms of operations, not much changes from traditional MLOps:\n\n\u0007 **Data:** Human feedback is simply data, and it should be treated like any other data. Store it in your\n\nlakehouse, and process it using the same data pipeline tooling as other data.\n\n\u0007 **Testing and monitoring:** A/B testing and incremental rollouts of new models and pipelines may become\n\nmore important, superceding offline quality tests. If you can collect user feedback, then these rollout\n\nmethods can validate models before they are fully deployed.\n\n\u0007 **Fine-tuning:** Human feedback becomes especially important for LLMs when it can be incorporated into\n\nfine-tuning models via techniques like Reinforcement Learning from Human Feedback (RLHF). Even if you\n\nstart with an existing or generic model, you can eventually customize it for your purposes via fine-tuning.\n\n\n###### Resources\n\n**Reinforcement Learning from**\n**Human Feedback (RLHF)**\n\n\u0007Chip Huyen blog post on\n\n[“RLHF: Reinforcement Learning from](https://huyenchip.com/2023/05/02/rlhf.html)\n\n[Human Feedback”](https://huyenchip.com/2023/05/02/rlhf.html)\n\n\u0007Hugging Face blog post on\n\n[“Illustrating Reinforcement Learning from](https://huggingface.co/blog/rlhf)\n\n[Human Feedback (RLHF)”](https://huggingface.co/blog/rlhf)\n\n\u0007 [Wikipedia](https://en.wikipedia.org/wiki/Reinforcement_learning_from_human_feedback)\n\n\n-----\n\n###### Other topics\n\n\u0007 **Scaling out:** Practices around scaling out training, fine-tuning and inference are similar to traditional ML,\n\nbut some of your tools may change. Tools like [Apache Spark](https://spark.apache.org/) [™](https://spark.apache.org/) and [Delta Lake](https://delta.io/) remain general enough for\n\nyour LLM data pipelines and for batch and streaming inference, and they may be helpful for distributing\n\nfine-tuning. To handle LLM fine-tuning and training, you may need to adopt some new tools such as\n\n[distributed PyTorch](https://pytorch.org/tutorials/beginner/dist_overview.html) , [distributed TensorFlow](https://www.tensorflow.org/guide/distributed_training) , and [DeepSpeed](https://www.deepspeed.ai/) .\n\n\u0007 **[Model serving:](https://www.databricks.com/product/model-serving)** If you manage the serving system for your LLMs, then you may need to make\n\nadjustments to handle larger models. While serving with CPUs can work for smaller deep learning\n\nmodels, most LLMs will benefit from or require GPUs for serving and inference.\n\n\u0007 **Vector databases:** Some but not all LLM applications require vector databases for efficient similarity-\n\nbased lookups of documents or other data. Vector databases may be an important addition to your\n\nserving infrastructure. 
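To make the similarity-based lookup concrete: documents are embedded once, the query is embedded at request time, and the nearest neighbors by a similarity metric are returned as context. A minimal in-memory sketch with a placeholder `embed` function; a real pipeline would use an embedding model and a vector database rather than NumPy:

```
import numpy as np

def embed(texts):
    # Placeholder: a real pipeline would call an embedding model here
    rng = np.random.default_rng(0)
    return rng.normal(size=(len(texts), 384))

docs = [
    "Reset your password from the login page.",
    "Crashes on startup are usually caused by stale caches.",
    "Billing questions are handled by the finance team.",
]
doc_vecs = embed(docs)
doc_vecs /= np.linalg.norm(doc_vecs, axis=1, keepdims=True)

query_vec = embed(["The app crashes when I open it"])[0]
query_vec /= np.linalg.norm(query_vec)

scores = doc_vecs @ query_vec          # cosine similarity against every document
top_k = np.argsort(scores)[::-1][:2]   # indices of the most similar documents
context = [docs[i] for i in top_k]     # appended to the LLM prompt as context
```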
Operationally, it is analogous to a feature store: it is a specialized tool for storing\n\npreprocessed data which can be queried by inference jobs or model serving systems.\n\n\n-----\n\n#### Reference architecture\n\nTo illustrate potential adjustments to your reference architecture from traditional MLOps, we provide a\n\nmodified version of the previous production architecture.\n\nProduction environment\n\nModel Registry\n\nStage: �one Stage: Staging Stage: Production\n\nLoad model for testing Load model for inference\n\n\nPush model to registry Promote to production\n\n\nModel serving\n\n\nLLM API request\n\nrelease\n\n\nFine-Tine LLM\n\nrelease\n\n\nVector Database\nUpdate\n\nrelease\n\n\nContinuous\nDeployment (CD)\n\nrelease\n\n\nMonitoring &\nEvaluation\n\nrelease\n\n\nInternal/External Data tables Vector database Metrics tables Human feedback\nmodel hub\n\n**Figure 7**\n\n\n-----\n\n###### Additional resources\n\nWith LLMs being such a novel field, we link to\nseveral LLM resources below, which are not\n\nnecessarily “LLMOps” but may prove useful\nto you.\n\n\u0007 [edX: Professional Certificate in Large](https://www.edx.org/professional-certificate/databricks-large-language-models)\n\n[Language Models](https://www.edx.org/professional-certificate/databricks-large-language-models)\n\n\u0007Chip Huyen blog post on [“Building LLM](https://huyenchip.com/2023/04/11/llm-engineering.html)\n\n[applications for production”](https://huyenchip.com/2023/04/11/llm-engineering.html)\n\nLLM lists and leaderboards\n\n\u0007 [LMSYS Leaderboard](https://chat.lmsys.org/?leaderboard)\n\n\u0007 [Hugging Face Open LLM Leaderboard](https://huggingface.co/spaces/HuggingFaceH4/open_llm_leaderboard)\n\n\u0007 [Stanford Center for Research on](https://crfm.stanford.edu/)\n\n[Foundation Models](https://crfm.stanford.edu/)\n\n\u0007 [Ecosystem graphs](https://crfm.stanford.edu/ecosystem-graphs/index.html)\n\u0007 [\u0007HELM](https://crfm.stanford.edu/helm/latest/?)\n\n\u0007Blog post on [“Open Source ChatGPT](https://www.saattrupdan.com/posts/2023-04-16-open-source-chatgpt-alternatives)\n\n\nThe primary changes to this production architecture are:\n\n\u0007 **Internal/External Model Hub:** Since LLM applications often make use of existing, pretrained models,\n\nan internal or external model hub becomes a valuable part of the infrastructure. It appears here in\n\nproduction to illustrate using an existing base model that is then fine-tuned in production. Without fine-\n\ntuning, this hub would mainly be used in development.\n\n\u0007 **Fine-Tune LLM:** Instead of de novo Model Training, LLM applications will generally fine-tune an existing\n\nmodel (or use an existing model without any tuning). Fine-tuning is a lighter-weight process than training,\n\nbut it is similar operationally.\n\n\u0007 **Vector Database:** Some (but not all) LLM applications use vector databases for fast similarity searches,\n\nmost often to provide context or domain knowledge in LLM queries. We replaced the Feature Store (and\n\nits Feature Table Refresh job) with the Vector Database (and its Vector Database Update job) to illustrate\n\nthat these data stores and jobs are analogous in terms of operations.\n\n\u0007 **Model Serving:** The architectural change illustrated here is that some LLM pipelines will make external\n\nAPI calls, such as to internal or third-party LLM APIs. 
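A minimal sketch of such an external call with a timeout and a simple retry; the endpoint URL, credential variable and payload shape are placeholders rather than any specific provider's API:

```
import os
import time
import requests

ENDPOINT = "https://llm.example.com/v1/completions"  # placeholder endpoint
TOKEN = os.environ["LLM_API_TOKEN"]                   # placeholder credential

def complete(prompt: str, retries: int = 3, timeout: float = 30.0) -> str:
    payload = {"prompt": prompt, "max_tokens": 256}
    for attempt in range(retries):
        try:
            resp = requests.post(
                ENDPOINT,
                json=payload,
                headers={"Authorization": f"Bearer {TOKEN}"},
                timeout=timeout,
            )
            resp.raise_for_status()
            return resp.json()["text"]  # placeholder response field
        except requests.RequestException:
            if attempt == retries - 1:
                raise
            time.sleep(2 ** attempt)  # simple exponential backoff
```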
Operationally, this adds complexity in terms of potential latency or flakiness from third-party APIs, as well as another layer of credential management.\n\n- **Human Feedback in Monitoring and Evaluation:** Human feedback loops may be used in traditional ML but become essential in most LLM applications. Human feedback should be managed like other data, ideally incorporated into monitoring based on near real-time streaming.\n\n\n-----\n\n#### Looking ahead\n\nLLMs only became mainstream in late 2022, and countless libraries and technologies are being built to support and leverage LLM use cases. You should expect rapid changes. More powerful LLMs will be open-sourced; tools and techniques for customizing LLMs and LLM pipelines will become more plentiful and flexible; and an explosion of techniques and ideas will gradually coalesce into more standardized practices.\n\nWhile this technological leap provides us all with great opportunities, the use of cutting-edge technologies requires extra care in LLMOps to build and maintain stable, reliable LLM-powered applications. The good news is that much of your existing MLOps tooling, practices and knowledge will transfer smoothly over to LLMs. With the additional tips and practices mentioned in this section, you should be well set up to harness the power of large language models.\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than 9,000 organizations worldwide — including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark ™ , Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "-----\n\n# TABLE OF CONTENTS\n\n\n##### Welcome to Data, Analytics and AI\n\n**Do you know what you’re getting into?**\n\n**How to use this book**\n\n##### Business Value\n\n**Talking to the business (feels like combat)**\n\n**Creating Value Alignment**\n\n**Goals and Outcomes**\n\n##### Ultimate Class Build Guide
**Creating a character**\n\n- Data Engineers\n\n- Data Scientists\n\n- Data Analysts\n\n##### Diving In\n\n**Producing game data**\n\n**And receiving it in cloud**\n\n**Getting data from your game to the cloud**\n\n##### The Value of Data Throughout the Game Developer Lifecycle\n\n**Lifecycle overview**\n\n**Use data to develop a next-generation customer experience**\n\n##### Getting Started with Gaming Use Cases\n\n**Where do I start? Start with Game Analytics**\n\n**Understand your audience**\n\n- Player Segmentation\n\n- Player Lifetime Value\n\n- Social Media Monitoring\n\n- Player Feedback Analysis\n\n- Toxicity Detection\n\n**Find your audience**\n\n- Multi-Touch Attribution\n\n**Activating Your Playerbase**\n\n- Player Recommendations\n\n- Next Best Offer/Action\n\n- Churn Prediction & Prevention\n\n- Real-time Ad Targeting\n\n**Operational Use Cases**\n\n- Anomaly Detection\n\n- Build Pipeline\n\n- Crash Analytics\n\n##### Things to Look Forward To\n\n##### Appendix
**Ultimate Class Build Guide**\n\n- Creating a Character\n\n- Data Engineers\n\n- Data Scientists\n\n- Data Analysts\n\n**Data Access and the Major Cloud Providers**\n\n- Cloud Rosetta Stone\n\n- Jargon Glossary\n\n- Getting started with the major cloud providers\n\n**Getting Started with Detailed Use Cases**\n\n- Game analytics\n\n- Player Segmentation\n\n- Player Lifetime Value\n\n- Social Media Monitoring\n\n- Player Feedback Analysis\n\n- Toxicity Detection\n\n- Multi-Touch Attribution and Media Mix Modeling\n\n- Player Recommendations\n\n- Next Best Offer/Action\n\n- Churn Prediction & Prevention\n\n- Real-time Ad Targeting\n\n**Getting Started with Operational Use Cases**\n\n- Anomaly Detection\n\n- Build Pipeline\n\n- Crash Analytics\n\n\n-----\n\n# Welcome to Data, Analytics, and AI\n\n\n### Do you know what you’re getting into?\n\nYou may have heard the stories of game studios spending countless hours trying to more effectively acquire, engage, and retain players. Well, did you know that data, analytics, and AI play a central role in the development and operation of today’s top-grossing video games? Studios globally struggle with fragmented views of their audience, with data often outpacing legacy technologies.
Today, the need for real-\n\ntime capabilities and the leap from descriptive to predictive\n\nanalytics has made it so that data, analytics, and AI are no\n\nlonger a “nice-to-have”, but table stakes for success.\n\nThe objective of this handbook is to guide you on the\n\nrole data, analytics, and AI plays in the development\n\nand operations of video games. We’ll cover who the key\n\nstakeholders are and how to align people across business\n\nunits. Then we’ll talk through strategies to help you\n\nsuccessfully advocate for data, analytics, and AI projects\n\ninternally. Finally, we dive deep through the most common\n\nuse cases. We want to give you enough information to feel\n\n\nwell as helpful tips when operating as or working with one of\n\nthese classes.\n\nWe follow this with the fundamentals for building a Proof\n\nof Concept (POC) or Minimum Viable Product (MVP). That\n\nis, connecting to the cloud; accessing your data; and\n\nmost importantly, being able to represent the value you’re\n\nseeking to unlock as you sell your project into your team and\n\nbroader organization.\n\nFinally, we’ll dive into the most common use cases for data,\n\nanalytics, and AI within game development. Similar to a tech-\n\ntree in a video game, we begin with the most basic use cases\n\n- setting up your game analytics. Then we progress through\n\nmore advanced data use cases such as player segmentation,\n\nassessing lifetime value, detecting and mitigating toxicity,\n\nmulti-touch attribution, recommendation engines, player\n\nchurn prediction and prevention, and more.\n\nDon’t forget to review the Appendix. You’ll find a handy\n\n“ Jargon Glossary ”, “ Cloud Rosetta Stone ”, and “ get started\n\nguide for the three major cloud providers ”. All incredibly\n\nhelpful assets to keep as hotkeys.\n\n\nempowered to make a demonstrable impact. Just by reading\n\nthis you are adding incredible insight and value to yourself as\n\n\nan industry professional. Quest on!\n\n### How to use this book\n\nThis book is primarily intended for technical professionals\n\nwho are engaging with data within game studios. 
No matter your role in the gaming industry, you will be able to glean key takeaways that will make you more effective in your individual role and within the larger team — be that production, art, engineering, marketing, or otherwise.\n\nBegin your journey by reviewing the “ **Data, Analytics, and AI Ground Rules** ” section to the right, which presents some rules and guidelines for interpreting the role that data plays in the game development lifecycle.\n\nNext, it’s time to learn about the key professions (aka character classes) that interact and engage with data, analytics, and AI on a consistent basis within a game studio. This section breaks down each of the classes, providing an overview of each character’s strengths and weaknesses as\n\n\n**Data, Analytics, and AI Ground Rules**\n\nThis guide assumes you understand the following:\n\n- You understand the basics of data, analytics, and AI: How and why data is stored in a system, why data is transformed, the different types of output that data can feed into — such as a report, an analysis answering a question, or a machine learning model. If this is the first time you’re creating a character, we highly recommend reviewing our data, analytics, and AI tutorial — aka getting started training and documentation, available at [dbricks.co/training](https://www.databricks.com/learn/training/home)\n\n- You have a basic understanding of cloud infrastructure. Specifically what it is, who are the key players, and associated terms (e.g., virtual machines, APIs, applications)\n\n- You are generally aware of the game development lifecycle; pre-production, production, testing/QA, launch, operation\n\n\n-----\n\n# Business Value\n\n\nDemonstrating business value is important when working on data, analytics, and AI projects because it helps ensure that the efforts of the project are aligned with the goals and objectives of the business. By showing how the project can positively impact a game’s key performance indicators (KPIs) and bottom-line metrics, such as game revenue, player satisfaction, and operational efficiency, studio stakeholders are more likely to support and invest in the project. Additionally, demonstrating business value can help justify the resources, time, and money that are required to execute the project, and can also help prioritize which projects should be pursued. By focusing on business value, data, analytics, and AI projects can become strategic initiatives that contribute to the long-term success of your game studio.\n\n### Talking to the business (feels like combat)\n\nWhile we highly encourage everyone to read this section, you may already feel confident understanding the needs and concerns of your internal stakeholders, and how to sell-in a project successfully. If so, feel free to skip this section.\n\nWe would love to dive into the data to explore and discover as much as possible, unfortunately in most environments, we are limited by resources and time.
Understanding both\n\nthe businesses pain points and strategic goals is crucial to\n\nchoosing projects that will benefit the business, create value\n\nand make your message much easier to sell.\n\nWhenever we embark on a proof-of-concept (PoC) or\n\nminimum viable product (MVP) — to prove out a new\n\n**Questions to ask:**\n\n- What other strategic goals and pain points can\n\nyou list out and how would you prioritize them as\n\na business leader?\n\n- Does your prioritization match how your team,\n\nmanager and/or leadership would prioritize?\n\nTypically the closer the match, the easier initial\n\nprojects will be to “sell”.\n\n\nmethodology or technology — we will need to pitch it back\n\nfor adoption. The technology could be revolutionary and\n\nabsolutely amazing, but without the value proposition and tie\n\nback to goals, it is likely to land flat or fail to be adopted.\n\nIt is key to talk to your stakeholders to understand their\n\nperception of pain points and positions on potential projects\n\nto add value. Much like stopping at the Tavern when the\n\nadventuring party gets to town, these can be informal\n\nconversations where you socialize potential solutions while\n\ngathering information about what matters.\n\n### Creating value alignment\n\nSo what are your strategic goals and pain points and how\n\nmight they be addressed through a use case from a PoC or\n\nMVP leveraging your data?\n\nA few examples of strategic goals that are top of mind for our\n\ncustomers at the beginning of any fiscal or calendar year:\n\n- Reduce costs\n\n- Simplify your infrastructure\n\n- Acquire more players\n\n- Monetize your playerbase\n\n- Retain your players (aka prevent churn)\n\nHere are four ways the Databricks Lakehouse can provide\n\nvalue that aligns with your strategic goals and pain points:\n\n`1.` **\u0007Improved collaboration:** Databricks platform allows\n\neveryone to share and collaborate on data, notebooks and\n\nmodels between data scientists, engineers and business\n\nusers. This enables for a more efficient and streamlined\n\nprocess for data analysis and decision making.\n\n`2.` **Find and explore your data:** The data in the Lakehouse is\n\ncataloged and accessible, which enables business users\n\nto explore and query the data easily and discover insights\n\nby themselves.\n\n`3.` **\u0007Uncover actionable business insights:** By putting\n\nyour game’s data into a Lakehouse architecture, it\n\ncan be better analyzed using various tools provided\n\nby Databricks such as SQL, dashboards, notebooks,\n\nvisualization and machine learning to better understand\n\nyour playerbase, providing valuable insights into player\n\nbehavior and performance. These insights can help the\n\n\n-----\n\nand retention, and use that information to improve the\n\ngame and grow monetization.\n\n`4.` **\u0007Lead with data-driven decisions:** A Lakehouse\n\narchitecture provides a single source of truth for your\n\norganization’s data. 
Data engineers write once, data analysts interpret the data, and data scientists can run machine learning models on the same data.\n\n_This cannot be understated in the value this provides an organization from a total cost of ownership perspective._\n\nWith the ability to access and analyze all the data in one place, the business can make unified data-driven decisions, rather than relying on intuition or fragmented data.\n\n### Goals and outcomes\n\nLike many projects, starting with a strong foundation of ‘what success looks like’ will significantly improve your likelihood of achieving your objectives. Here are a few best-practices we recommend:\n\n`1.` **Set goals:** Define your hypothesis, then use your data and process to prove or disprove your hypothesis. You have a goal in mind, make it part of the experiment. If the outcome differs from the expectation, that is part of experiments and we can learn from it to improve the next experiment. This is all about shortening the feedback loop between insight and action.\n\n`2.` **Scope the project appropriately:** For example, are you doing this as a side project? Do you have 2 sprints to show progress? It’s important to scope your project based on the time, resources, and quality needed for the project to be a success.\n\n`3.` **Scope down:** Ruthlessly control scope for any PoC or MVP. Prioritization is your best friend. Stakeholders and your own internal team will naturally want to increase scope because there’s no shortage of good ideas. But by controlling scope, you improve your chances of shipping on time and on budget. Don’t let perfection be the enemy of good. There are always exceptions to this, but that is what the next sprint is for.\n\n`4.` **Deliver on time:** Recovering lost goodwill is incredibly difficult - strive to always deliver on time. Make sure your goals, constraints and scope creep will not explode your timeline as creating tight feedback loops and iteration cycles is what will make you more agile than the competition.\n\n`5.` **Socialize early, and often:** Show quantifiable value as quickly as possible, both to your immediate team and business stakeholders. Measure the value as frequently as makes sense, and socialize early and often to promote visibility of the project and ensure tight alignment across teams. This will empower you to create tighter feedback loops that will help improve any future iterations of your product, platform, or technology.\n\n# Ultimate Class Build Guide\n\n\n### Creating a character\n\nHave you rolled your character already? Data engineers, data scientists, and data analysts form the heart of mature game data teams, making sense of large amounts of data. Depending on studio size and resources, game developers may also be pulled in from time to time to perform data engineering and/or data science tasks. Depending on the size of the organization, individuals may be required to multiclass in order to address the needs of the team. In smaller studios, it’s often developers who wear multiple hats, including those in data engineering, analytics and data science. Though for the sake of this guide, we’ll keep focus on the roles of data engineers, data scientists, and data analysts. Key characters include:
There are many aspects to\n\nthese roles, but they can be summarized in that Data Engineers\n\ncreate and maintain critical data workflows, Data Analysts\n\ninterpret data and create reports that keep the business teams\n\nrunning seamlessly, and Data Scientists are responsible for\n\n\n**Data Engineers**\n\nData engineers build systems that collect, manage, and\n\nconvert source data into usable information for data\n\nscientists and business analysts to interpret. Their ultimate\n\ngoal is to make data accessible so that teams can use it to\n\nevaluate and optimize a goal or objective.\n\n\n-----\n\nData scientists determine the questions their team should\n\nbe asking and figure out how to answer those questions\n\nusing data. They often develop predictive models for\n\ntheorizing and forecasting.\n\n**Data Analysts**\n\n\nto report on the health of a title or building a recommendation\n\nengine for your players, this guide will help you better\n\nunderstand the unique classes required to develop and\n\nmaintain an effective data, analytics, and AI platform.\n\n**Learn more about these character classes**\n\n\nA data analyst reviews data to identify key insights into a\n\ngame studio’s customers and ways the data can be used to\n\nsolve problems.\n\n# Diving In\n\n\nBefore we get to the primary use cases of game data,\n\nanalytics, and AI, we need to cover some basics. That is, the\n\ndifferent types of game data and how they are produced.\n\nAnd the subsequent receiving of that data in the cloud to\n\n\n### Producing game data…\n\nSpeaking in generalities, there are four buckets of data as it\n\nrelates to your video game.\n\n\ncollect, clean, and prepare for analysis.\n\n**1. Game Telemetry**\n\nGame telemetry refers to the data collected about player\n\nbehavior and interactions within a video game. The primary\n\ndata source is the game engine. And the goal of game\n\ntelemetry is to gather information that can help game\n\ndevelopers understand player behavior and improve the\n\noverall game experience.\n\nSome of the primary metrics that are typically tracked in\n\ngame telemetry include:\n\n- **Player engagement:** Track the amount of time players\n\nspend playing the game, and their level of engagement\n\nwith different parts of the game.\n\n- **Game progress:** Monitor player progress through\n\ndifferent levels and milestones in the game.\n\n- **In-game purchases:** Track the number and value of\n\nin-game purchases made by players.\n\n- **Player demographics:** Collect demographic information\n\nabout players, such as age, gender, location, and device type.\n\n- **Session length:** Monitor the length of each player session,\n\nand how often players return to the game.\n\n- **Retention:** Track the percentage of players who return to\n\nthe game after their first session.\n\n\n-----\n\nsuch as the types of actions taken, the number of deaths,\n\nand the use of power-ups.\n\n- **User Acquisition:** Track the number of new players\n\nacquired through different marketing channels.\n\n**2. Business KPIs**\n\nThe second bucket of data is business key performance\n\nindicators (or KPIs). Business KPIs are metrics that measure\n\nthe performance and success of a video game from a\n\nbusiness perspective. 
The primary data source for business\n\nKPIs include game telemetry, stores, and marketplaces.\n\nThese KPIs help game studios understand the financial and\n\noperational performance of their games and make informed\n\ndecisions about future development and growth.\n\nSome of the primary business metrics that are typically\n\ntracked include:\n\n- **Revenue:** Track the total revenue generated by the game,\n\nincluding sales of the game itself, in-game purchases,\n\nand advertising.\n\n- **Player Acquisition Cost (CAC):** Calculate the cost\n\nof acquiring a new player, including marketing and\n\nadvertising expenses.\n\n- **Lifetime Value (LTV):** Estimate the amount of revenue a\n\nplayer will generate over the course of their time playing\n\nthe game.\n\n- **Player Retention:** Track the percentage of players who\n\ncontinue to play the game over time, and how long they\n\nplay for.\n\n- **Engagement:** Measure the level of engagement of players\n\nwith the game, such as the number of sessions played,\n\ntime spent playing, and in-game actions taken.\n\n- **User Acquisition:** Track the number of new players\n\nacquired through different marketing channels and the\n\ncost of acquiring each player.\n\n- **Conversion Rate:** Measure the percentage of players who\n\nmake an in-game purchase or complete a specific action.\n\n- **Gross Margin:** Calculate the profit generated by the game\n\nafter subtracting the cost of goods sold, such as the cost\n\nof game development and server hosting.\n\n**3. Game Services**\n\nSimilar to game telemetry, game services provide critical\n\ninfrastructure that requires careful monitoring and management.\n\nThese services include things like game server hosting,\n\n\nand more. Here the source of data is the game services used.\n\nSome of the common metrics game teams typically track for\n\nthese services include:\n\n- **Concurrent Players:** Track the number of players who are\n\nsimultaneously connected to the game servers to ensure\n\nthat the servers have enough capacity to handle the\n\nplayer demand.\n\n- **Server Availability:** Monitor the uptime and downtime of\n\nthe game servers to ensure that players have access to\n\nthe game when they want to play, particularly important\n\nfor global live service games where demand fluctuates\n\nthrought the day.\n\n- **Latency:** Measure the time it takes for data to travel\n\nfrom the player’s device to the game server and back,\n\nto ensure that players have a smooth and responsive\n\ngaming experience.\n\n- **Network Bandwidth:** Monitor the amount of data being\n\ntransmitted between the player’s device and the game\n\nserver to ensure that players have a high-quality gaming\n\nexperience, even on slow internet connections.\n\n- **Live Operations:** Monitor the success of in-game events,\n\npromotions, and other live operations to understand what\n\nresonates with players and what doesn’t.\n\n- **Player Feedback:** Monitor player feedback and reviews,\n\nincluding ratings and comments on social media, forums,\n\nand app stores, to understand what players like and dislike\n\nabout the game.\n\n- **Chat Activity:** Track the number of messages and\n\ninteractions between players in the game’s chat channels\n\nto understand the level of social engagement and\n\ncommunity building in the game.\n\n**4. Data beyond the game**\n\nThe last bucket comes from data sources beyond the video\n\ngame. 
These typically include the following:\n\n- **Social Media Data:** Social media platforms, such as Facebook, Twitter, TikTok and Instagram, can provide valuable insights into player behavior, feedback and preferences, as well as help game teams understand how players are talking about their games online with different communities.\n\n- **Forum Data:** Online forums and discussion boards, such as Reddit and Discord, can be rich sources of player feedback and opinions about the game.\n\n\n-----\n\n#### The secret to success is bringing all of the disparate data sources together, so you have as complete a 360-degree view as possible of what’s happening in and around your game.\n\n- **Player Reviews:** Ratings and reviews on app stores, such as Steam, Epic, Google Play and the Apple App Store, can provide valuable feedback on player experiences and help game teams identify areas for improvement.\n\n- **Third-Party Data:** Third-party data sources, such as market research firms and industry data providers, can provide valuable insights into broader gaming trends and help game teams make informed decisions about their games and marketing strategies.\n\nThis is a lot of data. And it’s no wonder that studios globally struggle with fragmented views of their audience, with data often outpacing legacy technologies. Today, the need for real-time capabilities and the leap from descriptive to predictive analytics has made it so that data, analytics, and AI are now table stakes for a game to be successful. Tapping into these four buckets of data sources, you’ll find actionable insights that drive better understanding of your playerbase, more efficient acquisition, stronger and longer lasting engagement, and monetization that deepens the relationship with your players. That’s what we’re going to dig into throughout the rest of this book.\n\n**Let’s begin with how to get data out of your game!**\n\nThere are a variety of ways to get data out of the game and into cloud resources. In this section, we will provide resources for producing data streams in Unity and Unreal. In addition, we will also provide a generic approach that will work for any game engine, as long as you are able to send HTTP requests.\n\n**Unity**\n\nSince Unity supports C#, you would use a .NET SDK from the cloud provider of your choice. All three major cloud providers have .NET SDKs to use and I have linked the documentation for each below.\n\n- **AWS:** [AWS .NET SDK - Unity considerations](https://docs.aws.amazon.com/sdk-for-net/v3/developer-guide/unity-special.html)\n\n- **GCP:** [GCP .NET SDK Documentation](https://cloud.google.com/dotnet/docs/reference)\n\n- **Azure:** [Azure .NET SDK Overview](https://learn.microsoft.com/en-us/dotnet/azure/sdk/azure-sdk-for-dotnet)\n\n- **Kafka (Open-source alternative):** [Kafka .NET connector](https://github.com/confluentinc/confluent-kafka-dotnet)\n\nNo matter the cloud provider, if you want to use an SDK you install it through the NuGet package manager into your Unity project. [A walkthrough of how to implement the .NET SDK using AWS](https://www.youtube.com/watch?v=yv4ynyCytdU) is provided here. From here, the SDK is used to send data to a messaging service. These messaging services will be covered in more detail in the next section.\n\n**Unreal Engine**\n\nUnreal supports development with C++, so you could use C++ SDKs or Blueprint interfaces to those SDKs. The resources for each SDK are provided here:\n\n- **AWS:** [How to integrate AWS C++ SDK with Unreal Engine](https://aws.amazon.com/blogs/gametech/how-to-integrate-the-aws-c-sdk-with-unreal-engine/)\n\n- **Azure:** [Azure C++ SDK with PlayFab](https://learn.microsoft.com/en-us/gaming/playfab/sdks/unreal/)\n\n- **Kafka (Open-source alternative):** [Getting started with Kafka and C++](https://docs.confluent.io/kafka-clients/librdkafka/current/overview.html#ak-cplus)\n\nJust like with the Unity example above, from here the data is sent to a messaging streaming service.\n\nOther engines may not support C++ or C#, but there is still a way to get your data into the cloud, no matter the language! By hitting an API Gateway with an HTTP POST request, you are able to send data to cloud services from many more types of applications. A sample high level architecture of this solution in AWS and Azure can be seen below (a minimal sketch of such a request appears at the end of this section):\n\n**AWS:** _(architecture diagram)_\n\n**Azure:** _(architecture diagram)_\n\n\n-----\n\nOnce the data has been sent from the game into an event-streaming service, how do we get that data to a more permanent home? Here we will start by outlining what these messaging services do and how we can use them to point our data to a desired location.\n\nMessaging services ingest real-time event data, being streamed to them from a number of different sources, and then send them to their appropriate target locations. These target locations can be databases, compute clusters or cloud object stores. A key property of the messaging services is to preserve the time in which the events arrive, so that it is always known the order that events occurred.\n\nExamples of cloud messaging services include AWS Kinesis Firehose, Google PubSub, and Azure Event Hubs Messaging. If you prefer to use open-source products, Apache Kafka is a very popular open-source alternative.
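To make the engine-agnostic path above concrete, here is a minimal, hedged sketch of posting one telemetry event over HTTP. The endpoint URL, API key header and payload fields are placeholders invented for illustration; they do not correspond to a real gateway, and a production client would batch events and handle retries.

```python
# Minimal sketch: send a telemetry event to an HTTP ingestion endpoint
# (e.g., an API Gateway fronting Kinesis, Event Hubs, Pub/Sub, or a Kafka proxy).
# The URL, API key, and payload fields are placeholders for illustration only.
import json
import time
import urllib.request

INGEST_URL = "https://example.com/prod/telemetry"  # placeholder endpoint
API_KEY = "replace-me"                             # placeholder credential

event = {
    "event_type": "level_complete",
    "player_id": "player-123",
    "level": 7,
    "duration_seconds": 182.4,
    "client_timestamp": int(time.time()),
}

request = urllib.request.Request(
    INGEST_URL,
    data=json.dumps(event).encode("utf-8"),
    headers={"Content-Type": "application/json", "x-api-key": API_KEY},
    method="POST",
)

with urllib.request.urlopen(request, timeout=5) as response:
    print("ingest status:", response.status)
```

The same pattern works from any engine or companion app that can issue HTTPS requests, which is why the API Gateway route is a useful fallback when a native SDK is not available.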
### Getting data from your game to the cloud\n\nMoving to the cloud platform part of the journey involves building a gaming Lakehouse. The gaming Lakehouse allows gaming companies to store, manage, and analyze large volumes of gaming data, such as player behavior, performance metrics, and financial transactions, to gain valuable insights and make data-driven decisions to improve their business outcomes.\n\n**Next here are the basics of the Databricks platform simplified.**\n\n**Data Ingestion:**\n\n- Data can be ingested into the Gaming Lakehouse using various built-in data ingestion capabilities provided by Databricks, such as Structured Streaming and Delta Live Tables, which offer a single, simple API that handles streaming or batch pipelines (see the sketch at the end of this section).\n\n- Data can be ingested in real-time or batch mode from various sources such as game clients, servers or APIs. Data can be cleaned, transformed and enriched with additional data sources, making it ready for analysis.\n\n**Data Storage:**\n\n- Data is stored in object storage such as S3, Azure Storage or GCP Buckets using Delta Lake.\n\n- Delta Lake is an open-source storage framework that makes it easy to maintain data consistency and track changes.\n\n**Data Governance & Cataloging:**\n\n- Unity Catalog in Databricks provides tools for data governance that help with compliance and controlling access to data in the lake.\n\n- Unity Catalog also allows you to track data lineage, auditing and data discovery with the use of data catalogs and governance.\n\n- Metadata about the data, including the structure, format, and location of the data, can be stored in a data catalog.\n\n**Data Quality:**\n\n- The Databricks platform enables you to validate, clean and enrich data using built-in libraries and rule-based validation using Delta Live Tables.\n\n- It also allows tracking data quality issues and missing values by using Delta Live Tables.\n\n**Data Security:**\n\n- Databricks provides a comprehensive security model to secure data stored in the lake.\n\n- Access to data can be controlled through robust access controls on objects such as catalogs, schemas, tables, rows, columns, models, experiments, and clusters.\n\n**Analytics:**\n\n- The processed data can be analyzed using various tools provided by Databricks such as SQL Dashboards, Notebooks, visualizations and ML.\n\n- Game studios can gain insights into player performance and behavior to better engage players and improve their games.\n\n**Get started with your preferred cloud**
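As a companion to the Data Ingestion bullets above, here is a minimal, hedged sketch of the Structured Streaming pattern: continuously read newly arriving JSON telemetry files from cloud storage and append them to a Delta table. The landing path, checkpoint location, schema and table name are illustrative assumptions, not a prescribed layout.

```python
# Minimal sketch: stream JSON telemetry files into a Delta table with Structured
# Streaming. Paths, schema, and table name below are illustrative assumptions.
from pyspark.sql import SparkSession
from pyspark.sql.types import (DoubleType, LongType, StringType, StructField,
                               StructType)

spark = SparkSession.builder.getOrCreate()

telemetry_schema = StructType([
    StructField("event_type", StringType()),
    StructField("player_id", StringType()),
    StructField("level", LongType()),
    StructField("duration_seconds", DoubleType()),
    StructField("client_timestamp", LongType()),
])

raw_events = (
    spark.readStream
    .schema(telemetry_schema)
    .json("/Volumes/examples/game/telemetry_landing/")  # assumed landing path
)

query = (
    raw_events.writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/examples/game/_checkpoints/telemetry")
    .toTable("examples.game.raw_telemetry")  # assumed catalog.schema.table
)
```

Roughly the same code can be run in batch mode by swapping readStream/writeStream for read/write, which is the single-API-for-streaming-or-batch idea the bullets describe.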
-----\n\n# The Value of Data Throughout the Game Development Lifecycle\n\n\n### Lifecycle overview\n\nOver the last decade, the way games have been developed and monetized has changed dramatically. Most if not all top grossing games are now built using a games-as-a-service strategy, meaning titles are shipped in cycles of constant iteration to increase engagement and monetization of players over time. Games-as-a-Service models have the ability to create sticky, high-margin games, but they also heavily depend on cloud-based services such as game play analytics, multiplayer servers and matchmaking, player relationship management, performance marketing and more.\n\nData plays an integral role in the development and operation of video games. Teams need tools and services to optimize player lifetime value (LTV) with databases that can process terabytes-petabytes of evolving data, analytics solutions that can access that data with near real-time latency, and machine learning (ML) models that can translate insights into actionable and innovative gameplay features.\n\nA game’s development lifecycle is unique to each studio. With different skillsets, resources, and genres of games, there is no one model. Below is a simplified view of a game development lifecycle for a studio running a games-as-a-service model.\n\n**Game Development Lifecycle** _(Games-as-a-Service (GaaS) / Games-as-a-Community (GaaC))_\n\n**1. Pre-Production:** Brainstorm how to give life to the many ideas laid out in the planning phase\n\n**2. Production:** Most of the time, effort, and resources spent on developing video games are spent in the production stage\n\n**3. Testing:** Every feature and mechanic in the game needs to be tested for game loop and quality control\n\n**4. Launch:** Whether developing alongside the community with alpha and beta releases, or launching into general availability, a game launch is a critical milestone\n\n**5. Operation:** As studios increasingly adopt games-as-a-service models, the ongoing operation of a video game is as critical as the launch itself\n\n_(Diagram stage labels: DISCOVERY & COMPATIBILITY, INTEGRATION, RELEASE, PUBLISH, AWARENESS, ONBOARDING, BUILD & TEST, FLIGHTING AND EXPERIMENTATION, OPERATE, MEASURE, ENGAGE, MONETIZE)_\n\nWhat’s important to remember is that throughout your title’s development lifecycle, there is data that can help you better understand your audience, more effectively find and acquire players, and more easily activate and engage them. Whether using game play data to optimize creative decision making during pre-production, tapping machine learning models to predict and prevent churn, or identifying the next best offer or action for your players in real-time, **data is your friend** .\n\n### Use data to develop a next-generation customer experience\n\nIn the game industry, customer experience (CX) is an important factor that can impact a player’s enjoyment of a game and the length they choose to play that game over time. In today’s highly competitive and fast-paced games industry, a game studio’s ability to deliver exceptional and seamless customer experiences can be a strategic differentiator when it comes to cutting through the noise and winning a gamer’s\n\n\n-----\n\ncan help drive value through customer experience:\n\n`1.` **Personalization:** Game studios can use data analytics and machine learning to personalize the game experience for each player based on their preferences and behavior. This can include personalized recommendations for content, in-game events, and other features that are tailored to the player’s interests.\n\n`2.` **Omnichannel support:** Players often use multiple channels, such as social media, forums, and in-game support, to communicate with game studios. Next generation customer experience involves providing a seamless and integrated support experience across all these channels in near-real time.\n\n`3.` **Continuous improvement:** Game studios can use data and feedback from players to continuously improve
There are many\n\nways teams can use data throughout a game’s development\n\nlifecycle, but far and away the most valuable focus area will be\n\nin building and refining the customer experience.\n\nThroughout the rest of this guide, we will dig into the most\n\ncommon use cases for data, analytics, and AI in game\n\ndevelopment, starting with where we recommend everyone\n\nbegins: game analytics.\n\n\n# Getting Started with Gaming Use Cases\n\n\n### Where do I start? Start with game analytics\n\n**Overview**\n\nBig question: Where’s the best place to start when it comes\n\nto game data, analytics, and AI? For most game studios,\n\nthe best place to start is with game analytics. Setting up a\n\ndashboard for your game analytics that helps you correlate\n\ndata across disparate sources is infinitely valuable in a world\n\n\nwhere there is no one gaming data source to rule them all.\n\nAn effective dashboard should include your game telemetry\n\ndata, data from any game services you’re running, and data\n\nsources outside of your game such as stores, marketplaces,\n\nand social media. See below.\n\n**What we’re trying to solve/achieve**\n\nGetting a strong foundation in game analytics unlocks more\n\nadvanced data, analytics, and AI use cases. For example,\n\nconcurrent player count plus store and marketplace data\n\n\n**GAME TELEMETRY**\n\n\n**Data Sources**\n\n**GAME SERVICES** **OTHER SOURCES**\n\n\n-----\n\nand lifetime value. Usage telemetry combined with crash\n\nreporting and social media listening helps you more quickly\n\nuncover where players might be getting frustrated. And\n\ncorrelating chat logs, voice transcriptions, and or discord\n\n\nthat are relevant and engaging to your players, giving you\n\ntools to effectively market and monetize with your audience.\n\n**Let’s start with Player Segmentation.**\n\n\nand reddit forums can help you identify disruptive behavior\n\n\nbefore it gets out of hand, giving you the tools to take\n\nactionable steps to mitigate toxicity within your community.\n\n**Get started and set up your Analytics Dashboard**\n\n### Understand your audience\n\nWith your analytics pipelines set up, the first area of focus is to\n\nbetter understand your audience. This can help you inform a\n\nvariety of key business decisions, from the highest macro order\n\nof “what game(s) to develop”, to how to market and monetize\n\nthose games, and how to optimize the player experience.\n\nBy understanding the demographics, preferences, and\n\nbehaviors of their audience, a game studio can create games\n\nthat are more likely to appeal to their target market and be\n\nsuccessful. You can also use this understanding to tailor your\n\nmarketing and monetization strategies to the needs and\n\npreferences of your players.\n\nAdditionally, understanding your audience can help you\n\n\n##### Player Segmentation\n\n**Overview**\n\nPlayer segmentation is the practice of dividing players\n\ninto groups based on shared characteristics or behaviors.\n\nSegmentation has a number of benefits. You can better\n\nunderstand your players, create more personalized content,\n\nimprove player retention, and optimize monetization, all of\n\nwhich contributes to an improved player experience.\n\n**What we’re trying to solve/achieve**\n\nThe primary objective of segmentation is to ensure you’re\n\nnot treating your entire playerbase the exact same. Humans\n\nare different, and your players have different motivations,\n\npreferences and behaviors. 
Recognizing this and engaging\n\nwith them in a way that meets them where they’re at\n\nis one of the most impactful ways you can cultivate\n\nengagement with your game. As we mentioned above,\n\nthe benefits of segmentation are broad reaching. Through\n\nbetter understanding of your playerbase, you can better\n\npersonalize experiences, tailoring content and customer\n\nexperience to specific groups of players that increases\n\nengagement and satisfaction. Better understanding of\n\nyour players also helps in improving player retention. By\n\nidentifying common characteristics of players who are at\n\nrisk of churning (i.e., stopping play), you can develop targeted\n\nstrategies that only reach specific audiences.\n\nCreate advanced customer segments to build out more\n\neffective user stories, and identify potential purchasing\n\npredictions based on behaviors. Leverage existing sales\n\ndata, campaigns and promotions systems to create robust\n\nsegments with actionable behavior insights to inform your\n\nproduct roadmap. You can then use this information to build\n\nuseful customer clusters that are targetable with different\n\npromos and offers to drive more efficient acquisition and\n\ndeeper engagement with existing players.\n\n\nidentify potential pain points or areas for improvement\n\n\nwithin your games, allowing you to proactively make changes\n\n\n**Get started with Player Segmentation**\n\n\nto address these issues and improve the player experience\n\nbefore a player potentially churns.\n\n\n-----\n\n**Overview**\n\nPlayer lifetime value (LTV) is a measure of the value that a\n\nplayer brings to a game over the lifetime they play that game.\n\nIt is typically calculated by multiplying the average revenue\n\nper user (ARPU) by the average player lifespan. For example,\n\nif the average player spends $50 per year and plays the\n\ngame for 2 years, their LTV would be $50 * 2 = $100.\n\n**What we’re trying to solve/achieve**\n\nGame studios care about LTV because it helps them\n\nunderstand the long-term value of their players and make\n\ninformed decisions about how to invest in player acquisition\n\nand retention. For example, if the LTV of a player is higher\n\nthan the cost of acquiring them (e.g., through advertising),\n\nit may be worth investing more in player acquisition. On the\n\nother hand, if the LTV of a player is lower than the cost of\n\nacquiring them, it may be more cost-effective to focus on\n\nretaining existing players rather than acquiring new ones.\n\nLTV is one of the more important metrics that game studios,\n\nparticularly those building live service games, can use to\n\nunderstand the value of their players. It is important to\n\nconsider other metrics as well, such as player retention,\n\nmonetization, and engagement.\n\n**Get started with Player Lifetime Value**\n\n##### Social Media Monitoring\n\n**Overview**\n\nAs the great Warren Buffet once said, “It takes 20 years to\n\nbuild a reputation and five minutes to ruin it. If you think\n\nabout that, you’ll do things differently.” Now more than ever,\n\npeople are able to use social media and instantly amplify\n\ntheir voices to thousands of people who share similar\n\ninterests and hobbies. Take Reddit as an example. 
r/gaming,\n\nthe largest video game community (also called a subreddit)\n\nhas over 35 million members with nearly 500 new posts\n\nand 10,000 new comments per day, while over 120 game-\n\nspecific subreddits have more than 10,000 members each,\n\nthe largest being League of Legends with over 700,000\n\nmembers. The discourse that takes place on online social\n\nplatforms generates massive amounts of raw and organic\n\n\nbe used to understand how customers think and discover\n\nexactly what they want.\n\nThe act and process of monitoring content online across the\n\ninternet and social media for keyword mentions and trends\n\nfor downstream processing and analytics is called media\n\nmonitoring. By applying media monitoring to social media\n\nplatforms, game developers are able to gain new advantages\n\nthat previously might not have been possible, including:\n\n- Programmatically aggregate product ideas for new\n\nfeature prioritization\n\n- Promote a better user experience by automatically\n\nresponding to positive or negative comments\n\n- Understand the top influencers in the industry who can\n\nsway public opinion\n\n- Monitor broader industry trends and emerging segments\n\nsuch as free-to-play games\n\n- Detect and react to controversies or crises as they begin\n\n- Get organic and unfiltered feedback of games and features\n\n- Understand customer sentiment at scale\n\n- Make changes faster to keep customer satisfaction high\n\nand prevent churn\n\nBy failing to monitor, understand, and act on what customers\n\nare saying about the games and content you release as\n\nwell as broader industry trends, you risk those customers\n\nleaving for a better experience that meets the demands and\n\nrequirements of what customers want.\n\n**What we’re trying to solve/achieve**\n\nBy monitoring and listening to what existing and potential\n\ncustomers are saying on social media, game developers\n\nare able to get a natural and organic understanding of how\n\ncustomers actually feel about the games and products they\n\nrelease, or gauge consumer interest before investing time\n\nand money in a new idea. The main process for social media\n\nmonitoring is to gather data from different social media\n\nplatforms, such as Twitter or YouTube, process those comments\n\nor tweets, then take action on the processed data. While\n\ncustomer feedback can be manually discovered and processed\n\nin search of certain keyword mentions or feedback, it is a much\n\nbetter idea to automate it and do it programmatically.\n\n**Get started with Social Media Monitoring**\n\n\n-----\n\n**Overview**\n\nPlayer feedback analysis is the process of collecting,\n\nanalyzing, and acting on player feedback to inform game\n\ndevelopment. It involves collecting player feedback from\n\nmultiple sources, such as in-game surveys, customer\n\nsupport tickets, social media, marketplace reviews, and\n\nforums, and using data analytics tools to identify patterns,\n\ntrends, and insights. The goal of player feedback analysis is\n\nto better understand player needs, preferences, and pain\n\npoints, and use this information to inform game development\n\ndecisions and improve the overall player experience.\n\nPlayer feedback analysis is an important part of game\n\ndevelopment as it helps ensure that the game continues to\n\nmeet player needs and expectations. 
By regularly collecting and analyzing player feedback, game studios can make data-driven decisions to improve the game, increase player engagement and retention, and ultimately drive success and growth.\n\nFor this use case, we’re going to focus on taking online reviews for your video game and categorizing the different topics players are talking about (bucketing topics) in order to better understand the themes (via positive or negative sentiment) affecting your community (a minimal sketch of this bucketing step appears at the end of this section).\n\n**What we’re trying to solve/achieve**\n\nThis is incredibly helpful, providing data-driven customer insight into your development process. Whether used in pre-production, such as looking at games that are similar with reviews to learn where those games have strengths and weaknesses; or using player feedback analysis with a live service title to identify themes that can apply to your product roadmap, player feedback analysis helps teams better support and cultivate engagement with the player community.\n\nUltimately, player feedback analysis does two things. 1) It can help you stack rank themes according to positive and negative sentiment, and 2) you can weight those themes according to impact on player engagement, toxicity, monetization, churn, and more. We’ve all read reviews that are overly positive, or overly negative. The process of player feedback analysis helps to normalize feedback across the community (keeping in mind, only for those who have written a review), so you’re not over indexing on one review, or a single theme that may seem in the moment very pressing.\n\n**Get started with Player Feedback Analysis**
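As a toy illustration of the review-bucketing idea above, the sketch below tags each review with coarse topics via keyword matching and a naive sentiment flag. The topic keywords, negative-word list and sample reviews are invented for the example; a production pipeline would typically use proper NLP models rather than keyword rules.

```python
# Minimal sketch: bucket player reviews into coarse topics and a naive sentiment
# label. Keywords and sample reviews are illustrative only, not a real model.
from collections import Counter

TOPIC_KEYWORDS = {
    "performance": ["lag", "fps", "crash", "stutter"],
    "monetization": ["price", "pay to win", "microtransaction"],
    "matchmaking": ["queue", "matchmaking", "teammates"],
}
NEGATIVE_WORDS = {"hate", "broken", "refund", "worst", "unplayable"}

def bucket_review(text: str) -> dict:
    lowered = text.lower()
    topics = [t for t, words in TOPIC_KEYWORDS.items() if any(w in lowered for w in words)]
    sentiment = "negative" if any(w in lowered for w in NEGATIVE_WORDS) else "positive"
    return {"topics": topics or ["other"], "sentiment": sentiment}

reviews = [
    "Constant lag and the matchmaking queue is broken",
    "Love the new season, great value for the price",
]

counts = Counter()
for review in reviews:
    tagged = bucket_review(review)
    for topic in tagged["topics"]:
        counts[(topic, tagged["sentiment"])] += 1

print(counts)  # e.g. {('performance', 'negative'): 1, ('matchmaking', 'negative'): 1, ...}
```

Counts like these can then be stack ranked by sentiment and weighted by their impact on engagement, churn or monetization, as described above.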
##### Toxicity Detection\n\n**Overview**\n\nAcross massively multiplayer online video games (MMOs), multiplayer online battle arena games (MOBAs) and other forms of online gaming, players continuously interact in real time to either coordinate or compete as they move toward a common goal — winning. This interactivity is integral to game play dynamics, but at the same time, it’s a prime opening for toxic behavior — an issue pervasive throughout the online video gaming sphere.\n\nToxic behavior manifests in many forms, such as the varying degrees of griefing, cyberbullying and sexual harassment that are illustrated in the matrix below from [Behaviour Interactive](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant) , which lists the types of interactions seen within the multiplayer game, _Dead by Daylight_ .\n\n_(Matrix: Survivor and Killer interactions in Dead by Daylight arranged from less toxic to most toxic, including gen rushing, hiding, activating emotes, looping, rush unhooking, blinding, sandbagging, teabagging, reporting, text chatting, hatch farming, disconnecting, farming, camping, being away from keyboard (AFK), dribbling, tunneling, lobby dodging, body blocking, face camping and slugging.)_\n\nIn addition to the [personal toll](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) that toxic behavior can have on gamers and the community -- an issue that cannot be\n\n\n-----\n\ngame studios. For example, a study from [Michigan State](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity)\n\n
Given this, it’s essential for studios\n\nto integrate analytics into games early in the development\n\nlifecycle and then design for the ongoing management of\n\ntoxic interactions.\n\n**What we’re trying to solve/achieve**\n\nToxicity in gaming is clearly a multifaceted issue that\n\nhas become a part of video game culture and cannot be\n\naddressed universally in a single way. That said, addressing\n\ntoxicity within in-game chat can have a huge impact given\n\nthe frequency of toxic behavior and the ability to automate\n\nthe detection of it using natural language processing (NLP). In\n\nsummary, by leveraging machine learning to better identify\n\ndisruptive behavior so that better-informed decisions\n\naround handling actions can be made.\n\n**Get started with Toxicity Detection**\n\n\n##### Multi-Touch Attribution\n\n**Overview**\n\nMulti-touch attribution is a method of attributing credit to\n\ndifferent marketing channels or touchpoints that contribute to\n\na sale or conversion. In other words, it is a way of understanding\n\nhow different marketing efforts influence a customer’s decision\n\nto make a purchase or take a desired action.\n\nThere are a variety of different attribution models that can\n\nbe used to assign credit to different touchpoints, each with\n\nits own strengths and limitations. For example, the last-\n\nclick model attributes all credit to the last touchpoint that\n\nthe customer interacted with before making a purchase,\n\nwhile the first-click model attributes all credit to the first\n\ntouchpoint. Other models, such as the linear model or\n\nthe time decay model, distribute credit across multiple\n\ntouchpoints based on different algorithms.\n\n**What we’re trying to solve/achieve**\n\nMulti-touch attribution can be useful for game studios because\n\nit can help them understand which marketing channels or\n\nefforts are most effective at driving conversions and inform their\n\nmarketing strategy. However, it is important to choose the right\n\nattribution model for your title based on your business model\n\n(one-time purchase, subscription, free-to-play, freemium,\n\nin-game advertising, etc.) and regularly review and optimize your\n\nattribution efforts to ensure they are accurate and effective.\n\n**Get started with Multi-Touch Attribution**\n\n\n-----\n\n### Activating Your Playerbase\n\nSo far, we’ve discussed how to better understand your\n\nplayers, and how to acquire more of your target audience.\n\nNext, we’re going to dig into how to better activate your\n\nplayers to create a more engaged and loyal playerbase that\n\nstays with your game for the long-term. Here, we’re going to\n\nfocus on strategies that differentiate your gamer experience.\n\n##### Player Recommendations\n\n\nand make in-game purchases. Additionally, personalized\n\nrecommendations can help improve the overall player\n\nexperience and increase satisfaction.\n\nGame studios can use a variety of techniques to create player\n\nrecommendations, such as machine learning algorithms,\n\ncollaborative filtering, and manual curation. It is important\n\nto regularly review and optimize these recommendations to\n\nensure that they are effective and relevant to players.\n\n**Get started with Player Recommendations**\n\n\n**Overview**\n\nPlayer recommendations are suggestions for content or actions\n\n\nthat a game studio makes to individual players based on their\n\ninterests and behaviors. 
These recommendations can be used\n\nto promote specific in-game items, encourage players to try\n\nnew features, or simply provide a personalized experience.\n\n**What we’re trying to solve/achieve**\n\nPlayer recommendations matter to game studios because\n\nthey can help improve player retention, engagement, and\n\nmonetization. By providing players with recommendations\n\nthat are relevant and engaging, studios can increase the\n\nlikelihood that players will continue to play their games\n\n\n##### Next Best Offer/Action\n\n**Overview**\n\nNext best offer (NBO) and next best action (NBA) are\n\ntechniques that businesses use to make personalized\n\nrecommendations to their customers. NBO refers to the\n\npractice of recommending the most relevant product or\n\nservice to a customer based on their past purchases and\n\nbehaviors. NBA refers to the practice of recommending the\n\nmost relevant action or interaction to a customer based on\n\nthe same information.\n\n\n-----\n\nin-game purchase to a player based on their past spending\n\nhabits and the items they have shown an interest in. They\n\nmight use NBA to recommend a specific level or event to a\n\nplayer based on their progress and interests.\n\n**What we’re trying to solve/achieve**\n\nIt’s important to remember that next best offer is a specific\n\nuse case within personalization that involves making\n\nrecommendations to players on the most valuable in-game\n\nitem or action they should take next. For example, a next\n\nbest offer recommendation in a mobile game might suggest\n\nthat a player purchase a specific in-game currency or unlock\n\na new character.\n\nBoth NBO and NBA can be used to improve customer\n\nretention, engagement, and monetization by providing\n\npersonalized recommendations that are more likely to be\n\nrelevant and appealing to individual customers. They can be\n\nimplemented using a variety of techniques, such as machine\n\nlearning algorithms or manual curation.\n\n**Get started with Next Best Offer/Action**\n\n##### Churn Prediction & Prevention\n\n**Overview**\n\nVideo games live and die by their player base. For Games-\n\n\nmay overwhelm the ability of these players to consume,\n\nreinforcing the overall problem of player churn.\n\nAt some point, it becomes critical for teams to take a cold,\n\nhard look at the cost of acquisition relative to the subscriber\n\nlifetime value (LTV) earned. These figures need to be brought\n\ninto a healthy balance, and retention needs to be actively\n\nmanaged, not as a point-in-time problem to be solved, but\n\nas a “chronic condition” which needs to be managed for the\n\nongoing health of the title.\n\nHeadroom for continued acquisition-driven growth can\n\nbe created by carefully examining why some players leave\n\nand some players stay. When centered on factors known\n\nat the time of acquisition, gaming studios may have the\n\nopportunity to rethink key aspects of their acquisition\n\nstrategy that promote higher average retention rates, which\n\ncan lead to higher average revenue per user.\n\n**Prerequisites for use case**\n\nThis use case assumes a certain level of existing data\n\ncollection infrastructure in the studio. Notably, a studio ready\n\nto implement a churn prediction and prevention model\n\nshould have\n\n- A cloud environment where player data is stored\n\n- This source data should contain player behavior and\n\nsession telemetry events from within the game. 
This is the foundation that insights can be built on top of.

as-a-Service (GaaS) titles, engagement is the most important metric a team can measure. Naturally, proactively preventing churn is critical to sustained engagement and growth. Through churn prediction and prevention, you will be able to analyze behavioral data to identify subscribers with an increased risk of churn. Next, you will use machine learning to quantify the likelihood of a subscriber to churn, as well as indicate which factors create that risk.

**What we’re trying to solve/achieve**

Balancing customer acquisition and retention is critical. This is the central challenge to the long-term success of any live service game. This is particularly challenging in that successful customer acquisition strategies needed to get games to scale tend to be followed by service disruptions or declines in quality and customer experience, accelerating player abandonment. To replenish lost subscribers, the acquisition engine continues to grind and expenses mount. As games reach for customers beyond the core playerbase they may have initially targeted, the title may not resonate

**Get started with Churn Prediction & Prevention**

##### Real-time Ad Targeting

**Overview**

Real-time ad targeting in the context of game development focuses on using data to deliver personalized and relevant advertisements to players in near real-time, while they are playing a game. Real-time targeting is performance based, using highly personalized messaging that is achieved by using data to precisely determine the most opportune moments to display ads, based on factors such as player behavior, game state, and other contextual information. Knowing when to send those ads is based on data. This use case is specific to titles using in-game advertising as a business model. It’s important to note that in-game real-time ad targeting requires a sophisticated tech stack, with integration with the bigger ad ecosystem, ad networks and partners. The Databricks Lakehouse platform is an optimal foundation as it already contains many of the connectors required to enable this use case.

**What we’re trying to solve/achieve**

The goal of in-game real-time ad targeting is to provide a more immersive and relevant advertising experience for players, while also increasing the effectiveness of the ads for advertisers. By delivering targeted ads that are relevant to each player’s interests, game developers can create a more enjoyable and personalized gaming experience, which can help to reduce churn and increase the lifetime value of each player. Additionally, real-time ad targeting can also help game developers monetize their games more effectively, as advertisers are willing to pay a premium for hyper-targeted and engaged audiences.

**Get started with Real-time Ad Targeting**

### Operational use cases

In the game development industry, operational analytics

**Overview**

Anomaly detection plays an important role in the operation of a live service video game by helping to identify and diagnose unexpected behaviors in real-time. Patterns and anomalies identified in player behavior, system performance, and network traffic can then be used to detect and diagnose server crashes, performance bottlenecks, and hacking attempts.
The ability to understand\n\nif there will be an issue before it becomes widespread is\n\nimmensely valuable. Without anomaly detection, which is\n\na form of advanced analytics, you’re always in a reactive\n\n(rather than proactive) state. Anomaly detection is a type of\n\nquality of service solution.\n\n**What we’re trying to solve/achieve**\n\nThe goal of anomaly detection is to ensure that players\n\nhave a stable and enjoyable gaming experience. This has\n\nan impact across your game, from reducing downtime,\n\nto minimizing player churn, and improving your game’s\n\nreputation and revenue. Additionally, the insights gained from\n\nanomaly detection can also be used to mitigate cheating and\n\ndisruptive behavior.\n\n**Get started with Anomaly Detection**\n\n\nare essential for ensuring a smooth and efficient production\n\n\nprocess. One common use case is anomaly detection, where\n\ndata analytics is utilized to identify any unusual patterns\n\nor behaviors in the game, such as crashes or performance\n\nissues. This helps developers quickly identify and fix\n\nproblems, improving the overall quality of the game. Another\n\nexample is build pipelines, where data analytics can be used\n\nto monitor and optimize the process of creating new builds\n\nof the game. By tracking key metrics such as build time,\n\nerror rates, and resource utilization, developers can make\n\ninformed decisions about how to optimize the build process\n\nfor maximum efficiency. Other operational use cases in game\n\ndevelopment include tracking player behavior, measuring\n\nserver performance, and analyzing sales and marketing data.\n\nLets explore a few of these below.\n\n\n##### Build Pipeline\n\n**Overview**\n\nA build pipeline is a set of automated processes that\n\nare used to compile and assemble the code, assets, and\n\nresources that make up a game project. The build pipeline\n\ntypically includes several stages, such as code compilation,\n\noptimization, testing, and release. The purpose of a build\n\npipeline is to streamline the game development process\n\nand ensure that each stage of development is completed\n\nefficiently and effectively. A build pipeline can be configured\n\nto run automatically, so that new builds are generated\n\nwhenever changes are made to the code or assets. This\n\nhelps to ensure that the game is always up-to-date and\n\nready for testing and release. The logs are collected are in\n\nnear-real time from build servers. A simplified example:Dev\n\nX is committing code on title Y, submitted on day Z,\n\nalong with the log files from the pipeline and build server.\n\nBuilds typically take multiple hours to complete, requiring\n\nsignificant amounts of compute via build farms. Being able to\n\n\n-----\n\nare wasting compute, and being able to predict which builds\n\nwill fail as they goes through the pipeline are ways to curb\n\noperational expenses.\n\n**What we’re trying to solve/achieve**\n\nWith this use case, we’re seeking to reduce wasted compute\n\nand build a foundational view of what was developed, by\n\nwho, when and how testing performed. In an ideal state, our\n\nautomated build pipeline could send a notification to the\n\ndeveloper with a confidence metric on the build making it\n\nthrough, allowing them to decide whether to continue or\n\nmove another build through the pipeline. Often, developers\n\ndo not have clear visibility until the build has completed\n\nor failed. 
By providing more insight to devs into the build\n\npipeline process, we can increase the rate at which builds\n\nare completed efficiently and effectively.\n\n**Get started with Build Pipeline**\n\n##### Crash Analytics\n\n\nresources were being used. How long crash testing takes\n\ncan vary, depending on the game’s business model, amount\n\nof content, and scope. For a title with a one-time release,\n\nwhere there is a large amount of content and a complex\n\nstoryline, the chances of hidden crashes causing errors while\n\nin development are high, making it require more time to\n\nperform testing before the game can be published. For titles\n\nbuilt in a game-as-a-service model, i.e. a game shipped in\n\ncycles of constant iteration, crash detection should be done\n\ncontinuously, since errors in newly released content might\n\naffect the base game and lead to crashes.\n\nIncreasingly, titles are being released in alpha (where\n\ndevelopers do the testing), closed beta (which includes a\n\nlimited group of testers/sample-users who do the gameplay\n\ntesting) and open betas (where anyone interested can register\n\nto try the game). All of which happens before the game is\n\n“officially” released. Regardless of alpha, beta, or GA, players\n\nmay stumble over game crashes, which triggers crash reports\n\nthat are sent to the developers for fixing. But sometimes, it\n\ncan be challenging to understand the issue that caused the\n\ncrash from crash reports provided by your game’s platform.\n\n**What we’re trying to solve/achieve**\n\nUltimately, the purpose of crash analytics is to identify the\n\nroot cause of a crash, and help you take steps to prevent\n\nsimilar crashes from happening in the future. This feedback\n\nloop can be tightened through automation in the data\n\npipeline. For example, by tracking crashes caused on builds\n\nfrom committers, the data can provide build suggestions\n\nto improve crash rate. Furthermore, teams can automate\n\ndeduplication when multiple players experience the same\n\nerrors, helping to reduce noise in the alerts received.\n\n**Get started with Crash Analytics**\n\n\n**Overview**\n\nGames crash, it is a fact of game development. The\n\ncombination of drivers, hardware, software, and\n\nconfigurations create unique challenges in tracking, resolving\n\nand managing the user experience.\n\nCrash analytics and reporting is the process of collecting\n\ninformation about crashes or unexpected failures in a\n\nsoftware application, in this case, a video game. A crash\n\nreport typically includes information about the state of the\n\ngame at the time of the crash, such as what the player was\n\n\n-----\n\n# Things to look forward to\n\n\nThis eBook was created to help game developers better\n\nwrap their heads around the general concepts in which data,\n\nanalytics, and AI can be used to support the development\n\nand growth of video games. **If you only have 5 minutes,**\n\n**these takeaways are critical to your success** .\n\nFor more information on advanced data, analytics, and AI use\n\ncases, as well as education resources, we highly recommend\n\nDatabricks training portal [dbricks.co/training](http://dbricks.co/training) .\n\n**Top takeaways:**\n\nIf you take nothing else from this guide, here are the most\n\nimportant takeaways we want to leave with you on your journey.\n\n`1.` **Data is fundamental. 
Data, analytics, and AI play a role**\n\nthroughout the entire game development lifecycle - from\n\ndiscovery to pre-production, development to operating\n\na game as a live service. Build better games, cultivate\n\ndeeper player engagements, and operate more effectively\n\n\nby utilizing the full potential of your data.\n\n`2.` **Define your goals.** Start by establishing the goals of what\n\nyou’re hoping to learn and or understand around your\n\ngame. Clear goals make it easier to identify key metrics\n\nto track, example goals include; developing high-quality\n\ngames that provide engaging and satisfying player\n\nexperiences, increasing player engagement and retention\n\nby analyzing and improving gameplay and mechanics, and\n\nbuilding a strong and positive brand reputation through\n\neffective marketing and community outreach.\n\n`3.` **Identify and understand your data sources.** Spend time\n\nto identify and understand the breadth of data sources\n\nyou are already collecting, be that game telemetry,\n\nmarketplace, game services, or sources beyond the game\n\nlike social media. It is critical to collect the right data, and\n\ntrack the right metrics based on the goals and objectives\n\nyou have set for your game.\n\n`4.` **Start small, and iterate quickly.** Recognize that goals and\n\nobjectives evolve as you learn more about the interaction\n\n\n-----\n\nare most effective when scoped small with tight feedback\n\nloops, allowing you to quickly adapt with your community\n\nand alongside shifting market conditions.\n\n`5.` **Game analytics forms the foundation.** Start by getting a\n\ngame analytics dashboard up and running. The process of\n\nbuilding out a dashboard will naturally require connecting\n\nand transforming your data in a way to unlock more\n\nadvanced use cases down the road.\n\n`6.` **Plan and revisit your data strategy frequently.** Once\n\ndashboarding is set up, you’ll have a better picture of what\n\ndownstream data use cases make the most sense for\n\nyour game and business objectives. As you move to use\n\ncases such as player segmentation, churn analysis, and\n\nplayer lifetime value, revisit your data strategy frequently\n\nto ensure you’re spending time on use cases that drive\n\nactionable insights for you and your team.\n\n`7.` **Show value broad and wide.** Whether your data strategy\n\nis new or well established on the team, build the habit\n\nof communicating broadly to stakeholders across the\n\ncompany. Early in the process, it is important to gather\n\ncritical feedback on what data is helpful and where there\n\nare opportunities for improvement. The worst thing that\n\ncan happen is you create something that no one uses.\n\nThat is a waste of everyone’s time and money.\n\n`8.` **Ask for help.** Engage with your technical partners. There\n\nare humans who can help ensure you’re developing your\n\ndata and analytics platform in a way that is efficient and\n\neffective. There are numerous partners with domain\n\nexpertise in data science and data engineering that can\n\naccelerate your data journey - here is our recommended\n\npartner list for [data, analytics, and AI workloads](https://www.databricks.com/company/partners/consulting-and-si) .\n\n`9.` **Participate in the community.** The community for game\n\nanalytics is large and growing. It is important to research and\n\n\nyour needs and interests. 
Here are a few of our favorites:\n\n`a.` [IGDA Game Analytics](https://igda.org/sigs/analytics/) : The IGDA has a number of\n\nSpecial Interest Groups that bring together user\n\nresearchers, designers, data engineers and data\n\nscientists focused on understanding player behavior\n\nand experiences. They offer resources and events\n\nfor those working in games user research, including a\n\nyearly Games User Research Summit.\n\n`b.` [Data Science Society](https://www.datasciencesociety.net/) : The Data Science Society is a\n\nglobal community of data scientists and engineers.\n\nWhile not specifically focused on game development,\n\nthey offer a wealth of resources and opportunities for\n\nlearning, networking, and collaboration in the field of\n\ndata science.\n\n`c.` [Hugging Face](https://huggingface.co/) : is hub of open source models for Natural\n\nLanguage Processing, computer vision, and other fields\n\nwhere AI plays its role. They also provide an online\n\nplatform where users can access pre-trained models\n\nand tools, share their own models and datasets, and\n\ncollaborate with other developers in the community.\n\n`d.` [Data Engineering subreddit](https://www.reddit.com/r/dataengineering/) : The Data Engineering\n\nsubreddit is a forum for data engineers to discuss\n\ntopics related to building and managing data pipelines,\n\ndata warehousing, and related technologies. While\n\nnot specifically focused on game development, it\n\ncan be a valuable resource for those working on data\n\nengineering in the gaming industry.\n\n`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n\nfirst step in your data journey. Imagine how the output of\n\nyour data can be presented in a way to help stakeholders\n\nacross your company achieve more. For example, dropping\n\ndata into an application that can help game designers\n\nmake balancing decisions based on player events.\n\n\n-----\n\n# APPENDIX Ultimate class build guide\n\n\n### Creating a character\n\nThe heart and soul of mature data teams are formed by this\n\ntrio of classes. There are many aspects to these roles, but\n\nthey can be summarized in that Data Engineers create and\n\nmaintain critical data workflows, Data Analysts interpret data\n\nand create reports that keep the business teams running\n\nseamlessly, and Data Scientists are responsible for making\n\nsense of large amounts of data. Depending on the size of\n\nthe organization, individuals may be required to multiclass\n\nin order to address needs of the team. In smaller studios, it’s\n\noften developers who wear multiple hats, including those in\n\ndata engineering, analytics and data science.\n\nWhether you’re looking to stand-up an analytics dashboard\n\nto report on the health of a title or building a recommendation\n\nengine for your players, this guide will help you better\n\nunderstand the unique classes required to develop and\n\nmaintain an effective data, analytics, and AI platform.\n\n##### Data Engineers\n\n\n**Goals and Priorities of Data Engineers**\n\n- Enable access to usable data for real-time insights — data\n\nthat both enables timely decision-making and is accurate\n\nand reproducible\n\n- Increase user confidence and trust in data. 
This involves\n\nensuring high consistency and reliability in ETL processes\n\n- Limit the issues and failures experienced by other\n\nengineers and data scientists, allowing those roles to\n\nfocus less on troubleshooting and more on drawing\n\nmeaningful conclusions from data and building new\n\nproducts / features\n\n**What Data Engineers care about:**\n\n- Enabling access to data for real-time insights — data that\n\nboth enables timely decision-making and is accurate and\n\nreproducible\n\n- Building high-performance, reliable and scalable pipelines\n\nfor data processing\n\n- Delivering data for consumption from a variety of sources\n\nby Data Analysts and Data Scientists against tight SLAs\n\n- A Data Engineer’s biggest challenge? Collaboration\n\nacross teams\n\n\nData engineers build systems that collect, manage, and\n\n\nconvert source data into usable information for data\n\nscientists and business analysts to interpret. Their ultimate\n\ngoal is to make data accessible so that teams can use it to\n\nevaluate and optimize a goal or objective.\n\n**Responsibilities:**\n\n- Data Engineers are responsible for data migration,\n\nmanipulation, and integration of data (joining dissimilar\n\ndata systems)\n\n- Setup and maintenance of ETL pipelines to convert\n\nsource data into actionable data for insights. It is the\n\nresponsibility of the data engineer to make sure these\n\npipelines run efficiently and are well orchestrated.\n\n- The Data Engineer sets up the workflow process\n\nto orchestrate pipelines for the studio’s data and\n\ncontinuously validates it\n\n- Managing workflows to enable data scientists and data\n\nanalysts, and ensuring workflows are well-integrated with\n\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\n\n\n##### Data Scientists\n\nData scientists determine the questions their team should\n\nbe asking and figure out how to answer those questions\n\nusing data. 
They often develop predictive models for\n\ntheorizing and forecasting.\n\n**Responsibilities:**\n\n- Responsible for making sense of the large amounts of data\n\ncollected for a given game title, such as game telemetry,\n\nbusiness KPIs, game health and quality, and sources\n\nbeyond the game such as social media listening\n\n- The analytics portion of a Data Scientist’s job means\n\nlooking at new and existing data to try and discover new\n\nthings within it\n\n- The engineering component may include writing out\n\npipeline code and deploying it to a repository\n\n- Data Scientists are responding for building, maintaining, and\n\nmonitoring models used for analytics and/or data products\n\n\n-----\n\n**Goals and Priorities:**\n\n- Developing new business capabilities (such as behavioral\n\nsegmentation, churn prediction, recommendations) and\n\noptimizing processes around those capabilities\n\n- Increase ROI by building algorithms and tools that are\n\nmaintainable and reusable\n\n- Exploring (or further expanding) the use of machine\n\nlearning models for specific use cases\n\n- Bridges the gap between engineering and analytics,\n\nbetween the technology teams and business teams\n\n- Provides business side of studio with data that is crucial\n\nin decision-making, for example a churn model that helps\n\npredict the impact of a new feature set\n\n**What Data Scientists care about:**\n\n- Creating exploratory analysis or models to accurately\n\npredict business metrics, e.g., customer spend, churn,\n\netc., and provide data-driven recommendations\n\n- Enable team with actionable insights that are easy to\n\nunderstand and well curated\n\n- Create and move models from experimentation to\n\nproduction\n\n- A Data Scientist’s biggest challenge? Keeping up with\n\nadvancements and innovation in data science, and\n\nknowing which tools and libraries to use\n\n##### Data Analysts\n\nA data analyst reviews data to identify key insights into a\n\ngame studio’s customers and ways the data can be used to\n\nsolve problems.\n\n**Responsibilities:**\n\n- Often serves as the go-to point of contact for non-\n\n\n\n- Analysts often interpret data and create reports or other\n\ndocumentation for studio leadership\n\n- Analysts typically are responsible for mining and\n\ncompiling data\n\n- Streamline and or simplify processes when possible\n\n**Goals and Priorities:**\n\n- Empower stakeholder and business teams with\n\nactionable data\n\n- “Catch things before they break”. Proactively mitigate\n\npotential data issues before they occur (for internal and\n\nexternal customers)\n\n- Analysts are often recruited to assist other teams (i.e., BI\n\nteams) with their domain knowledge\n\n- Driving business impact through documentation and\n\nreliable data\n\n**What Data Analysts care about:**\n\n- Easy access to high quality data.\n\n- Quickly find insights from data with SQL queries and\n\ninteractive visualizations.\n\n- The ability to easily share insights and while creating\n\nimpactful assets for others to consume (dashboards, reports).\n\n- A Data Analyst’s biggest challenge? Working with complex\n\nprocesses and complicated technologies that are filled\n\nwith messy data. While fighting these challenges, Analysts\n\nare often left alone or forced through paths that prevent\n\ncollaboration with others across team/organization.\n\n- Untrustworthy data: often Analysts get asked to provide\n\nanswers to leadership that will leverage the data to\n\ndetermine the direction of the company. 
When the data is\n\nuntrustworthy or incorrect due to previously mentioned\n\nchallenges this can eventually lead to lack of trust in the\n\ndata teams from leadership or the business.\n\n\ntechnical business / operations colleagues for data\n\naccess / analysis questions\n\n\n-----\n\n# Data access and the major cloud providers\n\n\n### Cloud Rosetta Stone\n\n[AWS / Azure / GCP Service Comparison - Click Here](https://cloud.google.com/free/docs/aws-azure-gcp-service-comparison)\n\nIf you are newer to the cloud computing space, it is easy to\n\nget lost between the hundreds of different services between\n\nthe three major cloud providers. The table below is meant to\n\nhighlight the important data, analytics, and AI services used\n\nby the various hyperscale service providers Amazon,\n\nMicrosoft, and Google. In addition, it aims to pair up services\n\nfrom different cloud providers that serve the same purpose.\n\n### Getting started with the major cloud providers\n\nHere are some quick ways to get started with the three major\n\ncloud providers: AWS, Azure, and GCP:\n\n**AWS:**\n\n`1.` **[Create an AWS account](https://portal.aws.amazon.com/billing/signup)** **:** The first step is to create an\n\naccount on the AWS website. This will give you access to\n\nthe AWS Management Console, which is the web-based\n\ninterface for managing your AWS resources.\n\n\n`2.` **Use the AWS free tier:** AWS offers a free tier of service\n\nthat provides a limited amount of free resources each\n\nmonth. This is a great way to get started and try out\n\nvarious AWS services without incurring any charges.\n\n`3.` **Explore the AWS Management Console:** Once you have\n\nan account and are logged in, take some time to explore\n\nthe AWS Management Console and familiarize yourself\n\nwith the various services that are available.\n\n`4.` **Next you can search for Databricks:** In the AWS\n\nManagement Console, use the search bar in the top-left\n\ncorner of the page and search for “Databricks”.\n\n`5.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`6.` **Launch Databricks Workspace:** To launch the Databricks\n\nWorkspace on AWS, you can use the CloudFormation\n\ntemplate provided by Databricks. Databricks\n\nCloudFormation template creates an IAM role, security\n\ngroup, and Databricks Workspace in your AWS account.\n\n**Azure:**\n\n`1.` **[Create an Azure account](https://azure.microsoft.com/en-us/free/gaming/)** **:** The first step is to create\n\nan account on Azure portal. This will give you access to\n\nthe Azure portal, which is the web-based interface for\n\nmanaging your Azure resources.\n\n\n\n\n\n\n\n\n\n\n\n|Service Type|Service Description|AWS Service|Azure Service|GCP Service|\n|---|---|---|---|---|\n|Storage|Object storage for various file types and artifacts (CSV, JSON, Delta, JAR). Objects can be retrieved by other services|Amazon Simple Storage Service (S3)|Azure Blob Storage|Google Cloud Storage|\n|Compute|High-performance VMs to run applications. Platform where data transformations are run in Big Data apps.|Amazon Elastic Compute (EC2)|Azure Virtual Machines|Google Compute Engine|\n|Messaging|Real-time event streaming services to write data to object stores or data warehouses. One OSS version is Kafka|Amazon Kinesis|Azure Service Bus Messaging|Google Pub/Sub|\n|Data Warehouse|Traditional data storage layer for structured data, to then be used by data analysts. 
Often used to read from a Data Lake, which acts as a single source of truth|Redshift or Databricks|Synapse or Databricks|BigQuery or Databricks|\n\n\n-----\n\n**Jargon Glossary**\n\n|CDP|Customer Data Platform (CDP). A CDP is a piece of software that combines data from multiple tools to create a single centralized customer database containing data on all touch points and interactions with your product or service.|\n|---|---|\n|ETL|Extract, Transform, Load. In computing, extract, transform, load is a three-phase process where data is extracted, transformed and loaded into an output data container. The data can be collated from one or more sources and it can also be outputted to one or more destinations|\n|KPI|Key Performance Indicator, a quantifiable measure of performance over time for a specifci objective. KPIs provide targets for teams to shoot for, milestones to gauge progress, and insights that help people across the organization make better decisions.|\n|POC|Proof of Concept (PoC). A proof of concept is a prototype or initial implementation of a solution that is developed to demonstrate the feasibility of a concept or idea. It is often used to test the effectiveness of a new tool or approach to data analysis or machine learning before investing in a full-scale implementation.|\n|MVP|Minimum Viable Product (MVP). An MVP refers to the smallest possible solution that can be delivered to meet a specific business need. The goal of an MVP is to quickly validate assumptions and prove the potential value of a larger project. By delivering a smaller solution first, stakeholders can gain confidence in the project and see a return on investment sooner, while also providing feedback to improve the larger project.|\n|ROI|Return on investment (ROI), which is calculated by dividing the profit earned on an investment by the cost of that investment.|\n|Serverless computing|Using compute platforms that are completely managed by service providers. When using serverless computing, you simply execute queries or deploy applications and the service provider (AWS, Databricks, etc.) handles necessary server maintenance.|\n|VPC|Virtual Private Cloud. A VPC is a virtual cloud networking environment, which helps organize and give you control of your resources. You also define how resources within your VPC can communicate with other regions, VPCs, and the public internet with traffic rules and security groups.|\n\n\n`2.` **Take Azure tutorials:** Azure provides tutorials,\n\ndocumentation, and sample templates to help you get\n\nstarted. These resources can help you understand the\n\nbasics of Azure and how to use its services.\n\n`3.` **You can search for Databricks:** In the Azure portal, use the\n\nsearch bar at the top of the page and search for “Databricks”.\n\n`4.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`5.` **Create a new Databricks workspace:** To create a new\n\nDatabricks workspace, you can use the Azure portal, Azure\n\nCLI or Azure Powershell. 
Once created, you’ll be able to\n\naccess your Databricks Workspace through the Azure portal.\n\n`6.` **Other Azure Services:** Once you have a Databricks\n\nworkspace setup, you can easily connect it to other Azure\n\nServices such as Azure Storage, Event Hubs, Azure Data\n\nLake Storage, Azure SQL and Cosmos DB for example.\n\n\n**GCP:**\n\n`1.` **[Create a GCP account](https://console.cloud.google.com/freetrial)** **:** the first step is to create an\n\naccount on GCP portal. This will give you access to the\n\nGCP Console, which is the web-based interface for\n\nmanaging your GCP resources.\n\n`2.` **Explore the GCP Console:** Once you have an account\n\nand are logged in, take some time to explore the GCP\n\nConsole and familiarize yourself with the various services\n\nthat are available.\n\n`3.` **Search for Databricks:** In the GCP Console, use the search bar\n\nin the top-left corner of the page and search for “Databricks”.\n\n`4.` **Navigate to the Databricks page:** Once you have found\n\nthe Databricks page, you can access it to get started with\n\nthe Databricks service.\n\n`5.` **Create a new Databricks workspace:** To create a new\n\nDatabricks workspace, you can use the GCP Console or\n\nthe gcloud command-line tool. Once created, you’ll be\n\nable to access your Databricks Workspace through the\n\nGCP Console.\n\n\n-----\n\n# Detailed Use Cases\n\n\n### Getting started with game analytics\n\nFortunately, standing up an effective analytics dashboard\n\nis getting easier. It all starts with getting your data into an\n\narchitecture that sets your team up for success. Selecting\n\nany of the major cloud providers — [AWS](https://portal.aws.amazon.com/billing/signup) [,](https://portal.aws.amazon.com/billing/signup) [Azure](https://azure.microsoft.com/en-us/free/gaming/) [,](https://azure.microsoft.com/en-us/free/gaming/) [GCP](https://console.cloud.google.com/freetrial) —\n\nyou can land all your data into a cloud data lake, then use\n\nDatabricks Lakehouse architecture to run real-time and\n\nreliable processing. Databricks can then help you visualize\n\nthat data in a dashboard, or send to a visual analytics\n\nplatform, such as Tableau.\n\n`1.` **Sign up for a Databricks account:** You’ll need to create\n\nan account on the Databricks website in order to use the\n\nplatform.\n\n`2.` **Access the Databricks portal:** Interact with the\n\nDatabricks platform and run tasks such as creating\n\nclusters, running jobs, and accessing data.\n\n`3.` **Set up a development environment:** You’ll need a\n\ndevelopment environment where you can write and\n\ntest your code, whether you’re using a local IDE or the\n\nDatabricks Workspace.\n\n`4.` **Collect data:** Once you have your development environment\n\nset up, you can start collecting data from your game. This\n\ncan involve integrating or building a SDK into your game\n\ncode, or using another tool to send data to cloud storage.\n\n`5.` **Process and analyze the data:** Once you have collected\n\nyour data, you can use Databricks to process and analyze\n\nit. This can involve cleaning and transforming the data,\n\nrunning queries or machine learning algorithms, or\n\ncreating visualizations.\n\n`6.` **Monitor and optimize:** Regularly monitor your analytics\n\nto ensure that they are accurate and relevant, and use the\n\ninsights you gain to optimize your game.\n\nKeep in mind that these are just general steps to get started\n\nwith Databricks for game analytics. 
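To make steps 4 and 5 a little more concrete, here is a minimal, illustrative sketch of landing raw game telemetry and producing a simple daily-active-users table on Databricks. The storage path, column names, and table name are placeholders, not part of any particular SDK.

```python
# Minimal sketch of steps 4-5: read raw telemetry landed in cloud storage,
# clean it, and compute a daily-active-users (DAU) table for a dashboard.
# The path, column names, and table name are hypothetical placeholders.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.getOrCreate()

raw_events = (
    spark.read.json("s3://your-bucket/game-telemetry/raw/")  # landed by your SDK or ingest tool
    .where(F.col("player_id").isNotNull())                   # drop malformed events
    .withColumn("event_date", F.to_date("event_timestamp"))
)

dau = (
    raw_events
    .groupBy("event_date")
    .agg(F.countDistinct("player_id").alias("daily_active_users"))
)

# Persist as a Delta table that Databricks SQL or Tableau can query.
dau.write.format("delta").mode("overwrite").saveAsTable("analytics.game_dau")
```

The same pattern extends to retention, session length, and revenue metrics feeding your dashboard.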
The specific steps you’ll\n\nneed to take will depend on your specific use case and needs.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n\n[out](https://databricks.com/company/contact) to us.\n\n\n**Tips / Best Practices**\n\n- **Define your goals:** What do you want to learn from your\n\nanalytics data? Having clear goals will help you focus on\n\ncollecting the right data and making meaningful use of it.\n\n- **Plan your data collection:** Determine what data you need\n\nto collect, how you will collect it, and how you will store it.\n\n- **Consider privacy:** Make sure you are transparent with your\n\nplayers about what data you are collecting and how you\n\nwill use it, and give them the option to opt out if they wish.\n\n- **Use analytics to inform design:** Leverage your analytics data\n\nto inform decisions around game design, such as any balance\n\nchanges or new content targeting a specific audience.\n\n- **Monitor and test your analytics implementation:** Regularly\n\ncheck your analytics to ensure that data is being collected\n\ncorrectly, and conduct tests to validate the accuracy of\n\nyour data.\n\n- **Visualize your data:** Dashboarding your data is one of the\n\nmost effective ways to quickly and effectively make sense\n\nof what’s happening at a given moment in time.\n\n- **Use data to improve player retention:** Analyze player\n\nbehavior and use the insights you gain to improve player\n\nretention, such as by identifying and addressing pain\n\npoints or by providing personalized content.\n\n- **Collaborate with your team:** Share your analytics\n\nfindings with your team and encourage them to use the\n\ndata to inform their work.\n\n- **Keep it simple:** Don’t try to collect too much data or\n\ncreate overly complex analytics systems. Keep it simple\n\nand focused on your goals.\n\n- **Start where you are:** If you’ve yet to gather all of your\n\ndata, don’t go build some fancy model. Start with the data\n\nyou have available to you and build from there.\n\n### Getting started with Player Segmentation\n\nPlayer segmentation is crucial to studios as it allows them\n\nto better understand their audience and tailor their game\n\nexperience to meet their specific needs and preferences.\n\nBy dividing players into different segments based on factors\n\nsuch as demographics, playing styles, and in-game behavior,\n\n\n-----\n\nstudios can gain valuable insights into what motivates and\n\nengages their players. This information can then be used\n\nto design games that not only provide a more enjoyable\n\nexperience for players, but also drive player retention\n\nand increase revenue for the studio. In a competitive\n\nindustry where player satisfaction is key to success, player\n\nsegmentation is an essential tool for studios to stay ahead of\n\nthe game.\n\nStart by evaluating the segmentation goals such as:\n\n- **Personalize the experience:** Changing or creating\n\nexperience specific designs to the player.\n\n- **Create relevant content:** Surface the best content to\n\nplayers based on features and behaviors that will matter\n\nthe most depending on the player’s place in the games\n\nlife cycle.\n\n- **Monetization:** Create tailored monetization strategies\n\nthat effectively reach and convert each player group. 
For\n\nexample, you may have a group of highly engaged players\n\nwho are more likely to make in-app purchases, while\n\nanother group is less likely to spend money but may be\n\nmore receptive to advertisements.\n\nThe next steps would be to identify, collect and analyze\n\nplayer data. By gathering information on player behavior,\n\npreferences, and demographics, you can gain insights\n\ninto their motivations, pain points, and what drives their\n\nengagement with your game.\n\nThere are multiple types of player data to collect, including:\n\n- **Player Behavior:** Track player behavior and actions\n\nwithin your game to gain insights into their play style,\n\npreferences, and patterns.\n\n- **Surveys:** Ask players directly about their preferences,\n\nmotivations, and feedback through in-game surveys, email\n\nquestionnaires, or other forms of direct communication.\n\n- **Focus groups:** Gather a small group of players to discuss\n\nand provide feedback on specific aspects of your game\n\nand player experience.\n\n- **Social media listening:** Monitor social media platforms\n\nto gather insights into how players are engaging with and\n\ntalking about your game.\n\n**[Customer Segmentation solution accelerator](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\n\n**Tips / Best Practices**\n\nDefine your segmentation goals: Determine what you want\n\nto learn about your players and why. This will help you focus\n\nyour analysis and ensure that your segments are meaningful\n\nand actionable.\n\n- **Use meaningful criteria:** Choose criteria that are relevant\n\nto your goals and that differentiate players in meaningful\n\nways. This could include demographic information, in-game\n\nbehavior, spending habits, or a combination of factors.\n\n- **Analyze player data:** Use data from your players to inform\n\nyour segmentation strategy. This could include data\n\non in-game behavior, spending habits, or demographic\n\ninformation.\n\n- **Use multiple methods:** We recommend using a\n\ncombination of methods, such as clustering to create\n\nsegments that are statistically meaningful and actionable\n\nto your game.\n\n- **Validate your segments:** Test your segments to ensure\n\nthat they accurately reflect the differences you observed\n\nin your player data. This could involve comparing the\n\nsegments to each other, or validating the segments\n\nagainst external data sources.\n\n- **Consider ethical and privacy concerns:** Ensure that\n\nyour segmentation strategy is ethical and complies\n\nwith privacy laws and regulations. This could involve\n\nanonymizing your player data, obtaining consent from\n\nplayers, or other measures to protect player privacy.\n\n- **Monitor and refine your segments:** Regularly review\n\nyour segments to ensure that they remain relevant and\n\nmeaningful. Refine your segments as necessary to reflect\n\nchanges in your player data or your goals.\n\n### Getting Started with Player Lifetime Value\n\nAssuming you’ve followed the steps to collecting, storing, and\n\npreparing your player data for analysis; To calculate player\n\nlifetime value (LTV), the quick and dirty way of assessing\n\noverall player LTV is to divide the total revenue by the total\n\nnumber of registered players. Note, LTV is a critical calculation\n\nfor return on investment, which is player lifetime spend versus\n\nthe amount spent on player acquisition. 
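As a quick, back-of-the-envelope illustration of that calculation (every figure below is made up):

```python
# Quick-and-dirty LTV and ROI check; all numbers are hypothetical.
total_revenue = 1_250_000          # lifetime revenue to date, in dollars
registered_players = 500_000       # total registered players
acquisition_spend = 900_000        # total spend on player acquisition

ltv = total_revenue / registered_players        # $2.50 per registered player
cac = acquisition_spend / registered_players    # $1.80 acquisition cost per player

print(f"LTV: ${ltv:.2f}  CAC: ${cac:.2f}  LTV/CAC: {ltv / cac:.2f}")
```

A ratio above 1.0 means lifetime spend is covering what you paid to acquire those players.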
Ideally, you want lifetime spend to be equal to or more than cost of acquisition.

As long as your game and its community are currently active, any player lifetime value calculations should be considered models, not exact numbers. This is because many of the players you’re considering are likely actively registered and actively playing, so the exact player LTV number is a moving target.

[Figure: LTV estimation approaches ranked by accuracy, from historical averages and benchmarks, to simple predictive models, to advanced predictive models.]

But these models are not entirely accurate since they don’t take into account the players who are registered but have yet to generate any revenue. Instead, a data-driven approach pivoted around player segmentation or cohorts will generally yield more actionable insight, far more than calculating a single LTV for the entire player base.

You can define your game’s cohorts in multiple ways. Perhaps the most obvious in terms of calculating LTV is going by daily active cohorts, or users who joined your game on the same day. You could also organize cohorts by users who joined your game through a certain ad campaign or promotional effort, by country or geographic location, or by the type of device used.

The first calculation is relatively simple. We suggest using average revenue per user (ARPU), which is a game’s daily revenue divided by the number of active users, to help you calculate lifetime value. First, you’ll need to define what an active player is using retention values, which can be set to a week, multi-day, or multi-week period of time depending on how your game has performed to date. You can then look at the number of users who churn on a given day, averaging with the number of days from the player’s first visit to the current date (or the specific date you’ve considered the end for said exercise). This is your playerbase lifetime value (note: not player lifetime value). To get player LTV, take your ARPU (daily revenue divided by the number of daily active users) and multiply it by that playerbase lifetime value; for example, an ARPU of $0.50 per day and an average playerbase lifetime of 60 days would model out to a player LTV of $30.

It’s important to note that while calculating player lifetime value, the term is not entirely accurate since most player lifetimes are not over (particularly true for live service games). But for the purpose of modeling, we recommend keeping the amount of time that you consider a lifetime relatively short, allowing you to extrapolate. Keeping the time period shorter helps mitigate inaccuracies; specifically, the longer you stretch out what you consider a lifetime, the more likely you are to collect inactive users in your count.

**[Lifetime Value solution accelerator](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**

**Tips / Best Practices**

- **Use multiple data sources:** To get a complete picture of a player’s value, be sure to consider data from a variety of sources, including in-game purchases, ad revenue, and other monetization strategies.

- **Consider player retention:** Player retention is a key factor in LTV, so be sure to consider how long players are likely to play your game when calculating LTV.

- **Use accurate data:** Make sure you are using accurate data when calculating LTV. This might involve cleaning and processing your data, or using trusted sources such as in-game analytics tools.

- **Regularly review and update your LTV estimates:** Player LTV can change over time, so be sure to regularly review and update your estimates to ensure they are accurate.

- **Test and optimize:** Use experimentation methods such as A/B testing to see how different variables, such as in-game events or pricing strategies, affect LTV. Use the insights you gain to optimize your LTV calculations.

- **Be aware of outside factors:** Your calculations should consider the many outside factors that can affect your LTV, such as the virality of your game, any spikes or surges in visitors due to unexpected promotions (influencers, reviewers talking about your game), any significant changes to your game that users respond well to, and other organic lifts that are difficult to predict with existing data.

-----

### Getting Started with Social Media Monitoring

Social media monitoring has three primary components: collecting the data, processing the results, and taking action on the findings. When it comes to collecting the data, whether you’re looking for tweets, YouTube comments, or Reddit posts, it can be very easy to get started since many social media platforms such as Twitter, YouTube, and Reddit all provide their own detailed and comprehensive APIs, making it easy to start gathering data from those platforms, with proper documentation and code examples to help along the way.

Once the data has been collected, the next step is to process it and prepare it to be used in the next step. Processing your data can range in complexity from a simple keyword filter to a more complicated approach such as filtering by location, removing emojis, and censoring and substituting words. With the data collected and processed, it can move to the final stage and be analyzed for downstream use and actionable insights by applying sentiment analysis or text mining.

If a game studio is looking to save time and have the above steps performed for them, it may be appealing to buy a pre-built tool. The primary benefits of buying an off-the-shelf solution are that it is often faster and easier to get started with, and that the development of the tool is handled by a third party who will have experience in building media monitoring solutions. On the other hand, building your own custom solution will provide more flexibility and control. Many pre-built media monitoring tools might not have the capabilities required to effectively process video, audio, and image data, and may not be able to control the frequency in which data is processed, whether it be near real-time or batch. Additionally, pre-built solutions tend to take a generalist approach for NLP, whether it be keyword extraction, topic filtering, or sentiment analysis, which often leads to poor results and feedback, especially for an industry as unique as the gaming industry where certain industry-specific slang or terminology is frequently used.
Overall, building your\n\nown media monitoring tool will provide greater control and\n\nflexibility leading to a better tailored return on investment,\n\nand luckily Databricks makes it even easier to get started.\n\nWith the Databricks Lakehouse platform, all data engineering,\n\ndata science, machine learning, and data analytics can\n\nbe done in a single place without having to stitch multiple\n\nsystems and tools together.\n\nData engineers can use Workflows and Jobs to call social\n\nmedia platform APIs on a scheduled basis and use Delta Live\n\nTables to create declarative data pipelines for cleaning and\n\nprocessing the data that comes in. Data scientists can use\n\ntools such as ML-specific Databricks runtimes (DBRs) that\n\ncome with many of the most popular and common libraries\n\nalready installed, MLflow which makes model development,\n\n\n-----\n\ntracking, and serving easy and efficient, and various other\n\ntools such as AutoML and Bamboolib. Data analysts are able\n\nto create real-time alerts, dashboards, and visualizations\n\nusing Databricks SQL. Each of the three personas will be able\n\nto effectively collaborate with each other and integrate each\n\npiece of their work into the broader data architecture.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://databricks.com/company/contact)\n\n[out](https://databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\nWhile social media monitoring can be easy to get started\n\nwith, there are a few key points to keep in mind.\n\n- Remember the Pareto principle (roughly 80% of impact\n\ncomes from 20% of activity) and diminishing returns. While\n\nit’s important to monitor large platforms such as Reddit,\n\nTwitter, and YouTube, it might not be worthwhile to monitor\n\nsmaller platforms (in terms of engagement) as the bulk of\n\ncustomer feedback will be on those major platforms.\n\n- Monitor other sources of information. It is also useful to\n\nmonitor mentions of key company personnel such as\n\nexecutives or public facing employees.\n\n- While follower count does matter on platforms such as\n\nTwitter, don’t ignore users with low-follower counts. It only\n\ntakes one or two re-tweets from other users to become a\n\nlarge issue.\n\n- On social media, customers can see through generic\n\ncorporate responses to complaints, so it is important\n\nto get a clear understanding of the issue and provide a\n\nclear response.\n\n### Getting Started with Player Feedback Analysis\n\nThe easiest place to start is gathering your data. With\n\naccounts set up on Steam, Epic, Apple, Google, Xbox, Sony,\n\nNintendo (or whatever platform you’re using), identify the ID\n\nfor your game(s), and pull the reviews corresponding to that\n\ngame into Databricks through an API call.\n\n\nFrom here, you clean the data using some of the pre-\n\nprocessing available in Python that removes any emojis and\n\nASCII characters. Once complete, run through Spark NLP\n\npipeline which does the basic natural language processing\n\nsteps such as normalization, stemming, lemmatization. We\n\nrecommend running through pre-trained models, such as Word\n\nEmbeddings and Named Entity Recognition models from John\n\nSnow Labs. 
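A minimal sketch of that cleaning-plus-NLP flow might look like the following. It assumes the reviews have already been pulled into a table with a `review_text` column; the table names, the regex cleanup, and the specific pre-trained Spark NLP stages shown here are illustrative choices, not the only ones.

```python
# Hedged sketch: strip emojis / non-ASCII characters from raw review text,
# then run a basic Spark NLP pipeline (tokenize, normalize, lemmatize,
# embed, tag entities). Table and model names are illustrative.
import sparknlp
from pyspark.ml import Pipeline
from pyspark.sql import functions as F
from sparknlp.base import DocumentAssembler
from sparknlp.annotator import Tokenizer, Normalizer, LemmatizerModel, WordEmbeddingsModel, NerDLModel

spark = sparknlp.start()

reviews = (
    spark.table("bronze.game_reviews")  # reviews previously pulled in via the platform API
    .withColumn("text", F.regexp_replace("review_text", r"[^\x00-\x7F]+", ""))  # remove emojis / non-ASCII
)

pipeline = Pipeline(stages=[
    DocumentAssembler().setInputCol("text").setOutputCol("document"),
    Tokenizer().setInputCols(["document"]).setOutputCol("token"),
    Normalizer().setInputCols(["token"]).setOutputCol("normalized").setLowercase(True),
    LemmatizerModel.pretrained().setInputCols(["normalized"]).setOutputCol("lemma"),
    WordEmbeddingsModel.pretrained("glove_100d").setInputCols(["document", "token"]).setOutputCol("embeddings"),
    NerDLModel.pretrained().setInputCols(["document", "token", "embeddings"]).setOutputCol("ner"),
])

annotated = pipeline.fit(reviews).transform(reviews)
annotated.write.format("delta").mode("overwrite").saveAsTable("silver.game_review_aspects")
```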
This should complete the pipeline and generate the aspects for the reviews provided by the community.

This data is then loaded into a Delta table for further analysis, such as using a visual dashboard (built on SQL queries inside Databricks) to analyze and understand the aspects the community is talking about, which can then be shared back with the development team for analysis and action. This is a great exercise to run once per month.

If you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.

**Tips / Best Practices**

- **Check for word groupings:** Make sure your word groupings are accurate to improve the analysis. For example, if your game is called Football Manager, and the shorthand is FM, make sure both of those are grouped appropriately.

- **Leverage domain knowledge:** Clean the reviews based on your domain knowledge. There are generic steps one could take, but that will not be as effective as someone with domain and game-specific knowledge of your title.

- **Experiment with models:** Feel free to try multiple pre-trained models, and/or tweak the pre-trained models based on your understanding of the domain to improve the accuracy of your results.

- **Work one title at a time:** This process works best when pulling reviews for a single title, specifically one version of one title at a time.

- **Let the model do the heavy lifting, but use humans to double-check:** The sentiment corresponding to the aspects in the model will be labeled as Positive or Negative. In the case of a neutral review, the model will do its best to determine whether that is more positive or negative. A best practice is to spend time going back through the aspects early to determine model accuracy and make updates accordingly.

-----

### Getting Started with Toxicity Detection

Our recommendation on tackling the toxicity issue is to leverage cloud-agnostic and flexible tooling that can consume chat data from a variety of sources, such as chat logs, voice transcriptions, or sources like Discord and Reddit forums. No matter if the data is in log form from game servers or events from a message system, Databricks can provide quick and easy ways to ingest the data.

As the simplified architecture in the diagram above shows, no matter the source, getting chat data in for inferencing and model development can be simple. While we leveraged a pre-built model from John Snow Labs to accelerate development, you can bring the ML framework of your choice to the platform.

**[Gaming Toxicity solution accelerator](https://notebooks.databricks.com/notebooks/CME/Toxicity_Detection_in_Gaming/index.html)**

**Tips / Best Practices - things to consider**

- **Define what toxic and disruptive behavior looks like within your community:** Clearly define what you consider to be toxic behavior, as this will determine how you measure and detect it. This might include things like hateful language, harassment, or cheating.

- **Collect relevant data:** Make sure you are collecting the right data to help you detect toxicity.
This might include\n\ndata on in-game chat, player reports, and other sources.\n\n- **Use machine learning:** Use machine learning algorithms\n\nto analyze your data and identify patterns of toxic\n\nbehavior. This will allow you to more accurately detect\n\ntoxicity and prioritize cases for review.\n\n- **Test and optimize:** Regularly review and test your toxicity\n\ndetection systems to ensure they are accurate and\n\neffective. Use experimentation methods such as A/B\n\ntesting to see how different strategies impact toxicity rates.\n\n- **Be transparent:** Make sure you are transparent with your\n\nplayers about how you are detecting toxicity, and give\n\nthem the option to opt out if they wish.\n\n- **Take action:** When toxic behavior is detected, take\n\nappropriate action to address it. The health and wellness\n\nof your community depends on it. This might involve\n\nbanning players, issuing warnings, or taking other\n\ndisciplinary measures.\n\n\n-----\n\n### Getting Started with Multi-Touch Attribution and Media Mix Modeling\n\nTo get started with multi-touch attribution, you need to first\n\nselect an attribution model. There are a variety of different\n\nattribution models to choose from, each with its own\n\n\nattribution credit according to your chosen model (above).\n\nWe highly recommend you regularly review and test your\n\nattribution efforts to ensure they are accurate and effective.\n\nUse experimentation methods such as A/B testing to see\n\nhow different strategies impact conversion rates.\n\n**[Multi-Touch Attribution solution accelerator](https://notebooks.databricks.com/notebooks/CME/Multi-touch_Attribution/index.html#Multi-touch_Attribution_1.html)**\n\n\nstrengths and limitations.\n\n\n`1.` **Last-click model:** This model attributes all credit to the\n\nlast touchpoint that the customer interacted with before\n\nmaking a purchase or taking a desired action.\n\n`2.` **First-click model:** This model attributes all credit to the\n\nfirst touchpoint that the customer interacted with.\n\n`3.` **Linear model:** This model attributes equal credit to each\n\ntouchpoint that the customer interacted with.\n\n`4.` **Time decay model:** This model attributes more credit to\n\ntouchpoints that are closer in time to the purchase\n\nor desired action.\n\n`5.` **Position-based model:** This model attributes a portion of\n\nthe credit to the first and last touchpoints, and the remainder\n\nis distributed evenly among the other touchpoints.\n\n`6.` **Custom model:** Some businesses create their own\n\nattribution model based on specific business needs or goals.\n\nEach attribution model has its own strengths and limitations,\n\nand the right model for a particular video game will depend\n\non a variety of factors, including the goals of your title, the\n\ncustomer journey, and the types of marketing channels being\n\nused. It is important to carefully consider the pros and cons\n\nof each model and choose the one that best aligns with the\n\nneeds of your game.\n\nNext, you’re going to want to set up tracking. In order to\n\nattribute credit to different touchpoints, you’ll need to set up\n\ntracking to capture data on customer interactions. 
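Before wiring up tracking, it can help to see how the models above differ in practice. The snippet below is illustrative only: it assigns credit for a single hypothetical converting journey under the last-click, linear and position-based models described above.

```python
# Illustrative only: credit assignment for one hypothetical player journey
# under three of the attribution models described above. Touchpoints are
# assumed to be unique and listed in chronological order.
journey = ["paid_social_ad", "influencer_video", "store_search", "retargeting_ad"]

def last_click(touchpoints):
    return {t: (1.0 if i == len(touchpoints) - 1 else 0.0) for i, t in enumerate(touchpoints)}

def linear(touchpoints):
    return {t: 1.0 / len(touchpoints) for t in touchpoints}

def position_based(touchpoints, endpoint_weight=0.4):
    # 40% to the first and last touchpoints, remaining 20% split across the middle
    if len(touchpoints) == 1:
        return {touchpoints[0]: 1.0}
    if len(touchpoints) == 2:
        return {touchpoints[0]: 0.5, touchpoints[-1]: 0.5}
    credit = {t: 0.0 for t in touchpoints}
    credit[touchpoints[0]] += endpoint_weight
    credit[touchpoints[-1]] += endpoint_weight
    middle = touchpoints[1:-1]
    for t in middle:
        credit[t] += (1.0 - 2 * endpoint_weight) / len(middle)
    return credit

print(last_click(journey))
print(linear(journey))
print(position_based(journey))
```

Once a model is chosen, credit can only be assigned to touchpoints you actually capture, which is where the tracking setup comes in.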
This might\n\ninvolve integrating tracking code into the game, or using a\n\nthird-party tracking tool.\n\nWith tracking set up, you’ll start collecting data on player\n\ninteractions and be able to use that information to calculate\n\n\n**Tips / Best Practices - things to consider**\n\n- **Define clear goals:** Sounds simple, but by clearly defining\n\nthe goals of your acquisition campaign and what success\n\nlooks like, you will be able to guide your decision-making\n\nand ensure that you are measuring the right metrics -\n\nsuch as cost per install, return on ad spend, conversion\n\nrate, lifetime value, retention rate, and more.\n\n- **Use a data-driven approach:** Use data to inform your\n\ndecision-making. Collect data on all touchpoints in the\n\nplayer journey, including ad impressions, clicks, installs,\n\nand in-game actions.\n\n- **Choose the right attribution model:** Select the right\n\nattribution model that accurately reflects the player\n\njourney for your specific genre of game. This can be a\n\ncomplex process. A couple of things to keep in mind\n\n- Consider the touchpoints that are most important for\n\nyour player journey, such as first ad impression, first\n\nclick, or first in-game action\n\n- Consider the business goals you’re trying to achieve.\n\nFor example, if you are focused on maximizing return\n\non investment, a last-click attribution model may be\n\nmost appropriate. On the other hand, if you are looking\n\nto understand the impact of each touchpoint, a multi-\n\ntouch attribution model may be more appropriate.\n\n- Consider the data you have available, including ad\n\nimpressions, clicks, installs, and in-game actions.\n\n- **Continuously monitor and optimize:** Continuously\n\nmonitor and optimize your acquisition campaigns based on\n\nthe data. Test different approaches, make adjustments as\n\nneeded, and use A/B testing to determine what works best.\n\n\n-----\n\n### Getting Started with Player Recommendations\n\nRecommendations is an advanced use case. We don’t\n\nrecommend (hehe) that you start here, instead, we’re\n\nassuming that you’ve done the work to set up your game\n\nanalytics (collecting, cleaning, and preparing data for analysis)\n\nand that you’ve done basic segmentation to place your\n\nplayers in cohorts based on their interests and behaviors.\n\nRecommendations can come in many forms for video games.\n\nFor this context, we’re going to focus on the wide-and-deep\n\nlearning for recommender systems, which has the ability\n\nto both memorize and generalize recommendations based\n\non player behavior and interactions. First [introduced by](https://arxiv.org/abs/1606.07792)\n\n[Google](https://arxiv.org/abs/1606.07792) for use in its Google Play app store, the wide-and-\n\ndeep machine learning (ML) model has become popular in a\n\nvariety of online scenarios for its ability to personalize user\n\nengagements, even in ‘cold start problem’ scenarios with\n\nsparse data inputs.\n\nThe goal with wide-and-deep recommenders is to provide\n\n\n**Understanding the model design**\n\nTo understand the concept of wide-and-deep recommend­\n\nations, it’s best to think of it as two separate, but collaborating,\n\nengines. The wide model, often referred to in the literature as\n\nthe linear model, memorizes users and their past choices. 
The wide model’s inputs may consist simply of a user identifier and a product identifier, though other attributes relevant to the pattern (such as time of day) may also be incorporated.\n\nThe deep portion of the model, so named as it is a deep neural network, examines the generalizable attributes of a user and their choices. From these, the model learns the broader characteristics that tend to favor user selections.\n\nTogether, the wide-and-deep submodels are trained on historical product selections by individual users to predict future selections. The end result is a single model capable of calculating the probability with which a user will purchase a given item, given both memorized past choices and generalizations about a user’s preferences. These probabilities form the basis for user-specific rankings, which can be used for making recommendations.\n\n\nan intimate level of player understanding. This model uses explicit and implicit feedback to expand the consideration set for players. Wide-and-deep recommenders go beyond simple weighted averaging of player feedback found in some collaborative filters to balance what is understood about the individual with what is known about similar gamers. If done properly, the recommendations make the gamer feel understood (by your title) and this should translate into greater value for both the player and you as the business.\n\n\n**Building the model**\n\nThe intuitive logic of the wide-and-deep recommender belies the complexity of its actual construction. Inputs must be defined separately for each of the wide and deep portions of the model, and each must be trained in a coordinated manner to arrive at a single output, but tuned using optimizers specific to the nature of each submodel. Thankfully, the [Tensorflow DNNLinearCombinedClassifier estimator](https://www.tensorflow.org/api_docs/python/tf/estimator/DNNLinearCombinedClassifier) provides a pre-packaged architecture, greatly simplifying the assembly of an overall model.\n\n\n[Figure: The wide & deep model. User identity and attributes feed the wide sub-model, product identity and attributes feed the deep sub-model, and the combined wide & deep model outputs the probability of User A purchasing Product B.]\n\n\n-----\n\n**Training**\n\nThe challenge for most teams is then training the recommender on the large number of user-product combinations found within their data. Using [Petastorm](https://petastorm.readthedocs.io/en/latest/), an open-source library for serving large datasets assembled in Apache Spark™ to Tensorflow (and other ML libraries), one can cache the data on high-speed, temporary storage and then read that data in manageable increments to the model during training. In doing so, we limit the memory overhead associated with the training exercise while preserving performance.\n\n**Tuning**\n\nTuning the model becomes the next challenge. Various model parameters control its ability to arrive at an optimal solution. The most efficient way to work through the potential parameter combinations is simply to iterate through some number of training cycles, comparing the models’ evaluation metrics with each run to identify the ideal parameter combinations. 
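One way to run these training cycles in parallel on Databricks is Hyperopt with `SparkTrials` (an assumption here, not a requirement of the approach above). The sketch below uses a stand-in `train_and_evaluate()` helper in place of actually building the wide-and-deep model, so the skeleton runs as written.

```python
# Illustrative sketch: distributing hyperparameter trials with Hyperopt + SparkTrials.
# train_and_evaluate() is a stand-in for building and evaluating one wide-and-deep
# candidate model (e.g., with DNNLinearCombinedClassifier).
from hyperopt import fmin, tpe, hp, STATUS_OK, SparkTrials

search_space = {
    "dnn_hidden_units": hp.choice("dnn_hidden_units", [[64, 32], [128, 64, 32]]),
    "dnn_dropout": hp.uniform("dnn_dropout", 0.0, 0.5),
    "batch_size": hp.choice("batch_size", [256, 512, 1024]),
}

def train_and_evaluate(params):
    # Placeholder: train the model with `params` and return a validation loss.
    # A dummy value keeps the sketch runnable end to end.
    return float(params["dnn_dropout"])

def objective(params):
    return {"loss": train_and_evaluate(params), "status": STATUS_OK}

best = fmin(
    fn=objective,
    space=search_space,
    algo=tpe.suggest,
    max_evals=32,
    trials=SparkTrials(parallelism=8),  # runs trials concurrently across the cluster
)
print(best)
```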
By running these trials in parallel across many compute nodes, the optimization can be performed in a timely manner.\n\n**Deploying**\n\nFinally, we need to deploy the model for integration with various retail applications. Leveraging [MLflow](https://www.mlflow.org/) allows us to both persist our model and package it for deployment across a wide variety of microservices layers, including Azure Machine Learning, AWS SageMaker, Kubernetes and Databricks Model Serving.\n\nWhile this seems like a large number of technologies to bring together just to build a single model, Databricks integrates all of these technologies within a single platform, providing data scientists, data engineers and [MLOps](https://www.databricks.com/glossary/mlops) engineers a unified experience. The pre-integration of these technologies means various personas can work faster and leverage additional capabilities, such as the [automated tracking](https://docs.databricks.com/machine-learning/automl-hyperparam-tuning/index.html#automated-mlflow-tracking) of models, to enhance the transparency of the organization’s model building efforts.\n\nTo see an end-to-end example of how a wide-and-deep recommender model may be built on Databricks, please check out the following notebooks: [Get the notebook](https://d1r5llqwmkrl74.cloudfront.net/notebooks/RCG/Wide_and_Deep/index.html#Wide_and_Deep_1.html)\n\n**[Recommendation Engines solution accelerator](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n\n**Tips / Best Practices - things to consider**\n\n- **Use data to inform recommendations:** Use data from your analytics, player feedback, and other sources to understand what players like and dislike. This will help you create recommendations that are more likely to be relevant and engaging for individual players.\n\n- **Segment your players:** Consider segmenting your players based on characteristics such as playstyle, spending habits, and demographic information. This will allow you to create more targeted recommendations for different groups of players.\n\n- **Consider the player’s current context:** When creating recommendations, consider the player’s current context, such as what they are doing in the game and what content they have already consumed. This will help you create recommendations that are more likely to be relevant and timely.\n\n- **Test and optimize your recommendations:** Use experimentation methods such as A/B testing to see how different recommendations perform with different player segments. Use the insights you gain to optimize your recommendations.\n\n- **Be transparent:** Make sure you are transparent with players about how you are creating recommendations and give them the option to opt out if they wish.\n\n- **Use recommendations to improve the player experience:** Use personalized recommendations to improve the player experience and increase engagement and satisfaction.\n\n### Getting Started with Next Best Offer/Action\n\nSince NBO/NBA is a specific use case of personalization, how a team might get started implementing this will look very similar to how they would with broader personalization activities. Begin with ensuring you are appropriately collecting player data (behavior, preferences, in-game purchases, etc.), storing it in your cloud data lake using a service such as Delta Lake from Databricks. 
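As a minimal starting point, landing raw player events in Delta Lake can look like the sketch below; the paths, table and column names are hypothetical.

```python
# Minimal sketch: land raw player events (behavior, preferences, purchases)
# in a Delta table for downstream cleaning and analysis. All names are hypothetical.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

raw_events = (
    spark.read.json("/mnt/raw/player_events/")               # assumed landing zone
    .withColumn("event_date", F.to_date("event_timestamp"))  # assumes an event_timestamp field
    .withColumn("ingested_at", F.current_timestamp())
)

(
    raw_events.write.format("delta")
    .mode("append")
    .partitionBy("event_date")
    .saveAsTable("game_analytics.player_events_bronze")
)
```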
From here, you’ll prepare the data using Databricks to clean, transform, and prepare it for analysis. This may include aggregating data from multiple sources, removing duplicates and outliers, and transforming the data into a format suitable for analysis. As you analyze the player data, seek to identify patterns and trends in player behavior\n\n\n-----\n\nand preferences that will give you signal on which actions are more likely to be successful.\n\nFrom here, you can build a recommendation model based on the player data analysis, and incorporate information on in-game items and player preferences to make personalized recommendations.\n\nIf you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Define your goals:** Like every use case, starting with clearly defined goals helps to ensure your implementation of NBO and NBA will be as effective and efficient as possible. Your goals will also help you determine what data to collect and how it will be used.\n\n- **Collect relevant data:** Based on your goals, make sure you are collecting the right data to inform your NBO and NBA recommendations. This might include data on player behavior, engagement, and spending habits.\n\n- **Leverage machine learning to scale your recommendations:** Use machine learning algorithms to analyze your data and make personalized recommendations to your players. This will allow you to identify trends and patterns that might not be immediately apparent.\n\n- **Test and optimize:** THIS IS CRITICAL. Use experimentation methods such as A/B testing to see how different recommendations perform with different player segments. Past performance is not a perfect indicator of future success. Consistent testing allows you to tune your NBO and NBA recommendations so they evolve with your playerbase.\n\n- **Consider the player’s context:** When making recommendations, consider the player’s current context, such as what they are doing in the game and what content they have already consumed. This will help you create recommendations that are more likely to be relevant and timely.\n\n- **Be transparent:** Make sure you are transparent with your players about how you are using their data to make recommendations, and give them the option to opt out if they wish.\n\n- **Collaborate with your team:** Share your NBO and NBA efforts with your team and encourage them to use the data to inform their work.\n\n\n### Getting Started with Churn Prediction & Prevention\n\nThe exciting part of this analysis is that not only does it help to quantify the risk of customer churn but it paints a quantitative picture of exactly which factors explain that risk. It’s important that we not draw too rash a conclusion with regard to the causal linkage between a particular attribute and its associated hazard, but it’s an excellent starting point for identifying where an organization needs to focus its attention for further investigation.\n\nThe hard part in this analysis is not the analytic techniques. The Kaplan-Meier curves and Cox Proportional Hazard models used to perform the analysis above are well established and widely supported across analytics platforms. The principal challenge is organizing the input data.
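To make the techniques named above concrete, here is a minimal sketch using the lifelines library (one common choice; the solution accelerator may use different tooling), over a hypothetical pandas DataFrame with one row per subscriber.

```python
# Illustrative sketch: Kaplan-Meier retention curve and Cox Proportional Hazards
# model over hypothetical subscription data. Columns: duration_days (tenure),
# churned (1/0), plus a couple of example covariates.
import pandas as pd
from lifelines import KaplanMeierFitter, CoxPHFitter

subs = pd.read_parquet("/dbfs/tmp/subscription_attrition.parquet")  # hypothetical path

kmf = KaplanMeierFitter()
kmf.fit(durations=subs["duration_days"], event_observed=subs["churned"])
print("Median time to churn (days):", kmf.median_survival_time_)

cph = CoxPHFitter()
cph.fit(
    subs[["duration_days", "churned", "payment_plan_days", "is_auto_renew"]],
    duration_col="duration_days",
    event_col="churned",
)
cph.print_summary()  # hazard ratios indicate which attributes are associated with churn risk
```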
The vast majority of subscription services are fairly new as businesses. As such, the data required to examine customer attrition may be scattered across multiple systems, making an integrated analysis more difficult. Data lakes are a starting point for solving this problem, but the complex transformations required to cleanse and restructure data that has evolved as the business itself has (often rapidly) evolved require considerable processing power. This is certainly the case with the KKBox information assets and is a point noted by the data provider in their public challenge.\n\nThe key to successfully completing this work is the establishment of transparent, maintainable data processing pipelines executed on an elastically scalable (and therefore cost-efficient) infrastructure, a key driver behind the [Delta Lake pattern](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html). While most organizations may not be overly cost-conscious in their initial approach, it’s important to remember the point made above that churn is a chronic condition to be managed. As such, this is an analysis that should be periodically revisited to ensure acquisition and retention practices are aligned.\n\nTo support this, we are making the code behind our analysis available for download and review. If you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n**[Churn Prediction solution accelerator](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n\n\n-----\n\n**Tips / Best Practices**\n\n- **Define churn:** Clearly define what you consider to be player churn, as this will determine how you measure and predict it. For example, you might consider churn to be when a player stops playing your game for a certain number of days, or when they uninstall it.\n\n- **Collect relevant data:** Make sure you are collecting the right data to help you predict and prevent churn. This might include data on player behavior, engagement, and spending habits.\n\n- **Use machine learning:** Use machine learning algorithms to analyze your data and predict which players are at risk of churning. This will allow you to identify trends and patterns that might not be immediately apparent.\n\n- **Test and optimize:** Use experimentation methods such as A/B testing to see how different strategies impact churn rates. Use the insights you gain to optimize your churn prevention efforts.\n\n- **Focus on retention:** Implement retention strategies that are tailored to the needs and preferences of your players. 
This might involve providing personalized content, addressing pain points, or offering incentives to continue playing.\n\n- **Be transparent:** Make sure you are transparent with your players about how you are using their data to predict and prevent churn, and give them the option to opt out if they wish.\n\n- **Collaborate with your team:** Share your churn prediction and prevention efforts with your team and encourage them to use the data to inform their work.\n\n### Getting Started with Real-time Ad Targeting\n\nTypically, implementing a real-time ad targeting strategy begins outside of your game (in services such as Google Ads, Unity Advertising), where your game becomes the delivery point for the advertisement. Here, you will need to integrate with ad networks that provide real-time ad targeting capabilities. That will allow you to access a range of available ad assets to dynamically select and display the most relevant ads to players. Both Google AdMob and Unity Ads are great for banner ads, native ads, and rewarded video ads. Your role is to ensure that the data you’re collecting is fed back into the advertising platform to better serve targeted ads to your playerbase.\n\n\nTo use a service like Databricks to manage the data needed to provide real-time ad targeting in your application, you can follow the steps below:\n\n`1.` **Collect and store player data:** Collect data on player behavior, preferences, and demographics, and store it in a data lake using Databricks. Popular analytics tools such as Google Analytics or Mixpanel can be integrated into the game to collect data on player behavior. These tools, just like tracking website traffic, can track in-game events, provide insights on player behavior and demographics, and give you access to detailed reports and dashboards. Another option is to build in-house tracking systems to collect data on player behavior: logging events such as in-game purchases or player actions and activities such as “at which level does a player quit playing,” and storing this in a database for analysis. The downside of building in-house tracking systems is that you will need to host and maintain your own logging servers.\n\n`2.` **Prepare the data:** Use Databricks to clean, transform, and prepare the player data for analysis. This may include aggregating data from multiple sources, removing duplicates and outliers, and transforming the data into a format suitable for analysis.\n\n`3.` **Analyze the data:** Use Databricks’ built-in machine learning and data analytics capabilities to analyze the player data and identify patterns and trends.\n\n`4.` **Create audience segments:** Based on the analysis, use Databricks to create audience segments based on common characteristics such as interests, behaviors, and preferences.\n\n`5.` **Integrate with the ad server:** When an ad opportunity presents itself within the game, a call is made to the ad server. This call includes information about the player, such as the audience segment that they belong to. 
The ad server then uses this information to decide what ad to deliver to the player.\n\n`6.` **Monitor and optimize:** Use Databricks to monitor the performance of the ad targeting and make optimizations as needed, such as adjusting the audience segments or adjusting the targeting algorithms.\n\nBy using a service like Databricks to manage the data needed for real-time ad targeting, game developers can effectively leverage their player data to create more personalized and engaging experiences, increase revenue, and reduce churn.\n\n\n-----\n\nIf you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Focus on player data:** Make player data the center of your targeting strategy by collecting and storing comprehensive information on player behavior, preferences, and demographics. Here, it’s critical to ensure the game code data trackers are properly implemented in order to collect this data (see Game Analytics section for detail).\n\n- **Segment your audience:** Create audience segments based on common characteristics such as interests, behaviors, and preferences, and use these segments to deliver targeted ads.\n\n- **Test and iterate:** Continuously test and iterate your targeting strategy to refine your audience segments and improve targeting accuracy.\n\n- **Balance relevance and privacy:** Balance the need for relevant, personalized ads with players’ privacy by only collecting and using data that is necessary for targeting and obtaining player consent.\n\n- **Monitor performance:** Regularly monitor the performance of your targeting strategy to ensure that it is delivering the desired results and make optimizations as needed.\n\n- **Partner with the right ad platform:** Choose an ad platform that is well-suited to your needs and aligns with your goals, and work closely with them to ensure that your targeting strategy is delivering the best results.\n\n# Operational use cases\n\n\n### Anomaly Detection\n\nThe first step is to begin collecting data, such as game server and client logs, out of your project. Then consume this into Databricks Delta to have a continuous anomaly detection model running. Focus this on the key pieces of information you want to monitor. For live service games, for example, this is going to be infrastructure and network-related metrics such as ping and server health (number of clients connected, server uptime, server usage, CPU/RAM, number of sessions, and session length).\n\nOnce the model is ingesting data and is tuned for the metrics above, you would build out alerts or notifications that fire when those metrics hit a threshold you define as needing attention. 
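As one concrete illustration of that detection step, the sketch below flags ping readings that deviate sharply from each server's trailing one-hour average; the table, columns and threshold are hypothetical, and a production setup would typically use a tuned model running as a streaming job.

```python
# Minimal sketch: flag anomalous ping readings per server using a rolling
# one-hour mean and standard deviation. Table and column names are hypothetical.
from pyspark.sql import SparkSession, functions as F
from pyspark.sql.window import Window

spark = SparkSession.builder.getOrCreate()

metrics = spark.table("ops.server_health")  # server_id, ts (timestamp), ping_ms, cpu_pct, sessions

one_hour = Window.partitionBy("server_id").orderBy(F.col("ts").cast("long")).rangeBetween(-3600, 0)

flagged = (
    metrics
    .withColumn("ping_mean_1h", F.avg("ping_ms").over(one_hour))
    .withColumn("ping_std_1h", F.stddev("ping_ms").over(one_hour))
    .withColumn("is_anomaly", (F.col("ping_ms") - F.col("ping_mean_1h")) > 3 * F.col("ping_std_1h"))
)

# Persist flagged rows so alerts and notifications can be driven off this table
flagged.filter("is_anomaly").write.format("delta").mode("append").saveAsTable("ops.server_health_alerts")
```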
From here, you can build out automated systems\n\nto mitigate those effects - such as migrating players to a\n\ndifferent server, canceling matches, scaling infrastructure,\n\ncreating tickets for admins to review.\n\nIf you have any questions about how this solution can be\n\ndeployed in your environment, please don’t hesitate to [reach](https://www.databricks.com/company/contact)\n\n[out](https://www.databricks.com/company/contact) to us.\n\n\n**Tips / Best Practices**\n\n- **Define the problem and objectives clearly:** Before\n\nimplementing an anomaly detection solution, it is\n\nimportant to define the problem you are trying to solve\n\nand your specific objectives. This will help ensure that\n\nyou have the right data sources and use the appropriate\n\nalgorithms to achieve your goals.\n\n- **Choose the right data sources:** To effectively detect\n\nanomalies, you need to have the right data sources.\n\nConsider data from player behavior, system performance,\n\nand network traffic, as well as any other data sources that\n\nare relevant to your problem and objectives.\n\n- **Clean and preprocess the data:** To ensure that the\n\ndata you use for anomaly detection is accurate and\n\nmeaningful, it is important to clean and preprocess the\n\ndata. This includes removing any irrelevant or invalid data,\n\nhandling missing values, and normalizing the data\n\nif necessary.\n\n- **Choose the right algorithms:** There are many algorithms\n\nthat can be used for anomaly detection, including\n\nstatistical methods, machine learning algorithms, and\n\nrule-based systems. Choose the algorithms that are best\n\n\n-----\n\nsuited to your data and problem, and that provide the\n\nright level of accuracy, speed, and scalability.\n\n- **Validate the results:** Before deploying the anomaly\n\ndetection solution in production, it is important to validate\n\nthe results by testing the solution on a small subset of\n\ndata and comparing the results to expected outcomes.\n\n- **Monitor and update the solution:** Once the anomaly\n\ndetection solution is deployed, it is important to monitor\n\nits performance and accuracy, and update the solution as\n\nneeded. This may include retraining the algorithms, adding\n\nor removing data sources, and updating the parameters\n\nand thresholds used by the algorithms.\n\nAdditionally, there are some key gotchas to look out for when\n\nimplementing an anomaly detection solution.\n\n- **Avoid overfitting:** Overfitting occurs when the anomaly\n\ndetection solution is too complex and learns the noise\n\nin the data rather than the underlying patterns. To avoid\n\noverfitting, it is important to choose algorithms that are\n\nappropriate for the size and complexity of the data, and to\n\nvalidate the results using a separate test dataset.\n\n- **False positive and false negative results:** False positive\n\nand false negative results can occur when the anomaly\n\ndetection solution is not properly calibrated, or when\n\nthe solution is applied to data that is significantly\n\ndifferent from the training data. To minimize the risk of\n\nfalse positive and false negative results, it is important\n\nto validate the results using a separate test dataset, and\n\nto fine-tune the parameters and thresholds used by the\n\nalgorithms as needed.\n\n- **Scalability:** Scalability can be a concern when\n\nimplementing an anomaly detection solution, especially\n\nwhen dealing with large amounts of data. 
To ensure that\n\nthe solution can scale to meet the demands of a growing\n\nplayer base, it is important to choose algorithms that\n\nare fast and scalable, and to deploy the solution using a\n\nscalable infrastructure.\n\n### Getting Started with Build Pipeline\n\nAn operational goal game projects have is to make sure\n\ngame project builds are generated, delivered quickly and\n\nefficiently to internal testing & external users.\n\n\nA few of the key metrics and capabilities with analyzing your\n\nbuild pipelines are the below:\n\n- **Build time and speed:** This includes metrics such as\n\nthe time it takes to create a build, number of builds, and\n\ncompute spent.\n\n- **Build size and storage:** size of the builds, amount of\n\nstorage, and network costs.\n\n- **Bug tracking and resolution:** This includes metrics such\n\nas the number of bugs reported, the time it takes to\n\nresolve them, and the number of bugs that are resolved in\n\neach build.\n\n- **Code quality and efficiency:** This includes metrics such\n\nas code complexity, code duplication, and the number of\n\ncode lines written.\n\n- **Collaboration and communication:** Such as the number\n\nof code reviews, the number of team meetings, and the\n\nnumber of code commits.\n\n- **Advanced capabilities:** Such as Predicting real time build\n\nfailure to reduce spend and combining build data with\n\nCrash Analytics (see below) to have “commit to build”\n\nvisibility for accelerated bug fixing.\n\nBefore you start implementing your build pipeline, it’s\n\nimportant to define your requirements. What are the key\n\ngoals of your build pipeline? Choosing the right CI/CD tools is\n\ncritical to the success of your build pipeline. There are many\n\ndifferent tools available, including Jenkins, Azure Devops,\n\nPerforce, gitlab and more. When choosing a CI/CD tool,\n\nconsider factors such as ease of use, scalability, and cost. In\n\naddition, consider the specific needs of your game project,\n\nand choose a tool that can meet those needs.\n\nThe general recommendation is to look at automating your\n\nbuild process early. Once you’ve chosen your CI/CD tools, you\n\ncan automate your build process by setting up a build server,\n\nconfiguring your CI/CD tool, and creating a script to build your\n\ngame project. The build process should be automated as much\n\nas possible, and it should include steps to compile your code,\n\nrun automated tests, and generate a build of your project.\n\nOnce you have automated your build process, often the\n\nnext step is to implement CD (Continuous Delivery). This\n\ninvolves automating the deployment of your game builds\n\ndelivery to stakeholders, such as QA testers, beta testers, or\n\nend-users via publishing platforms. CD can help ensure that\n\nstakeholders have access to the latest version of your game\n\n\n-----\n\nas soon as possible, allowing them to provide feedback and\n\nhelp drive the development process forward.\n\nFinally, it’s important to monitor and measure your build\n\npipeline to ensure that it’s working as expected. This can\n\ninvolve using tools such as Databricks Dashboards to\n\nvisualize the status of your pipeline, or using metrics such\n\nas build times, test results, and deployment success rates\n\nto evaluate the performance of your pipeline. 
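As an illustration, if your CI/CD tool's build telemetry lands in a Delta table (the `devops.build_runs` table below is hypothetical), a few lines of PySpark can produce the weekly metrics such a dashboard would sit on top of.

```python
# Illustrative sketch: weekly build-pipeline metrics from a hypothetical Delta
# table populated by your CI/CD tool's webhooks or APIs.
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

builds = spark.table("devops.build_runs")  # build_id, started_at, duration_min, size_mb, status

weekly = (
    builds
    .groupBy(F.date_trunc("week", F.col("started_at")).alias("week"))
    .agg(
        F.count("*").alias("builds"),
        F.avg("duration_min").alias("avg_duration_min"),
        F.avg("size_mb").alias("avg_size_mb"),
        F.avg(F.when(F.col("status") == "SUCCESS", 1).otherwise(0)).alias("success_rate"),
    )
    .orderBy("week")
)

weekly.show()  # or display(weekly) in a Databricks notebook
```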
By monitoring and measuring your build pipeline, you can identify areas for improvement and make changes as needed to ensure that your pipeline continues to meet your needs.\n\nIf you have any questions about how Databricks can integrate into your DevOps solution, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n**Tips / Best Practices**\n\n- **Seek to automate early and often:** Automate as much of the build process as possible, from checking code into version control to generating builds and distributing them to stakeholders. This can help reduce errors and save time, allowing game teams to focus on higher-value tasks.\n\n- **Version control, version control, version control:** Use a version control system to manage the source code and other assets. This ensures that changes to the codebase are tracked and can be easily undone if needed.\n\n- **Implement continuous integration and delivery:** Continuous integration (CI) involves automatically building and testing after code changes are checked into version control. With CI, new changes to the codebase do not break existing functionality. By automating the build process, CI helps to reduce errors and save time. CD, on the other hand, involves automatically delivering builds to stakeholders, such as QA testers, beta testers, or end-users, after they have passed the automated tests. By combining CI and CD, a video game project can ensure that builds are generated and delivered quickly and efficiently, without the need for manual intervention.\n\n- **Build for scalability:** As your game project grows, you will need a build pipeline solution that is scalable and can handle the needs of your game team.\n\n- **Integration with other tools:** Integrate the build pipeline solution with other tools and systems, such as issue tracking, testing, and deployment tools, to ensure a smooth and efficient workflow.\n\n\n**Reference Architecture**\n\n[Figure: Reference architecture showing game infrastructure feeding Databricks SQL, with dashboards in Power BI and AWS QuickSight.]\n\n\n-----\n\n### Getting Started with Crash Analytics\n\nBuilding a pipeline for a holistic view to support crash analytics means joining together data coming from multiple different sources and at different velocities. The number of data sources depends on your game project’s publishing platforms; some may come from console providers such as Sony, Xbox, and Nintendo, or PC platforms like Steam, Epic Games Marketplace, GOG and many others.\n\n**High level steps**\n\n- Determine what platforms your game is running on and how to interface with them to collect data.\n\n- **Collect crash data:** Implement crash reporting tools in your game to collect data on crashes. The source data may be delivered in varying formats such as JSON or CSV.\n\n- **Load crash data into Databricks:** Use Databricks’ data ingestion tools to load the crash data into your workspace. This could involve using Databricks’ built-in data source connectors, or programmatically ingesting files to load the data.\n\n- **Transform and clean the crash data:** Use Databricks’ data processing and transformation tools to clean and prepare the crash data for analysis. 
This could involve using Databricks’ capabilities like DLT, or using SQL to perform custom transformations.\n\n- **Visualize crash data:** Use Databricks’ dashboarding tools to create visualizations that help you understand the patterns and trends in your crash data. This could involve using Databricks’ built-in visualization tools, or integrating with external visualization tools like Tableau or Power BI.\n\n- **Analyze crash data:** Use Databricks’ machine learning and statistical analysis tools to identify the root causes of crashes. This could involve using Spark MLlib or many of the popular tools to build machine learning models, or using SQL to perform custom analyses.\n\n- **Monitor and refine your pipeline:** Regularly review your pipeline to ensure that it remains relevant and useful. Refine your pipeline as necessary to reflect changes in your crash data or your goals.\n\nIf you have any questions about how this solution can be deployed in your environment, please don’t hesitate to [reach out](https://www.databricks.com/company/contact) to us.\n\n\n-----\n\n**Tips / Best Practices**\n\n- **Automated collection and aggregation of crash reports:** Collecting crash reports should be an automated process that is integrated into the output of the build pipeline for the game. The crash reports should be automatically aggregated and made available for analysis in near real-time.\n\n- **Clear reporting and prioritization of issues:** The solution should provide clear reporting on the most common issues and allow game developers to prioritize fixing the most impactful problems first.\n\n- **Integration with other analytics tools:** The crash analytics solution should integrate with other analytics tools, such as player behavior tracking, to provide a more complete picture of how crashes are impacting the player experience.\n\n- **Flexibility and scalability:** As the game grows, the solution should be able to scale to accommodate an increasing number of players and crashes.\n\nAdditionally, there are some key gotchas to look out for when implementing a crash analytics solution.\n\n- **Data privacy and security:** Ensure that crash reports do not contain sensitive information that could be used to identify individual players.\n\n- **Scalability:** As the number of players and crashes increases, it may become difficult to manage and analyze the growing volume of data.\n\n- **Integration with other tools:** Be aware when integrating crash analytics with other tools and systems, especially if the tools use different data formats or data structures.\n\n- **Prioritization of issues:** Determine which crashes are the most impactful and prioritize fixes accordingly. This can be a complex process, especially if there are a large number of different crash types and causes.\n\n**Data privacy and security:** It’s important to consider data privacy and security when implementing a crash analytics solution. 
This may involve implementing measures to\n\nanonymize crash reports, or taking steps to ensure that\n\nsensitive information is not included in the reports.\n\n**Reference Architecture**\n\n**Databricks**\n**SQL**\n\n**Power BI**\n\n**AWS**\n\n**Quicksight**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "### Executive Guide\n\n# Transform and Scale Your Organization With Data and AI\n\n#### A guide for CIOs, CDOs, and\n data and AI executives\n\n\n-----\n\n## Contents\n\n**A U T H O R :**\n\n**Chris D’Agostino**\n\nGlobal Field CTO\n\nDatabricks\n\n**E D I T O R S :**\n\nManveer Sahota\n\n\n**C H A P T E R 1 :** \u0007 **Executive Summary** 3\n\n**C H A P T E R 2 :** \u0007 **Define the Strategy** 6\n\n**1.** Establish the goals and business value 8\n\n**2.** Identify and prioritize use cases 19\n\n**3.** Build successful data teams 22\n\n**4.** Deploy a modern data stack 28\n\n**5.** Improve data governance and compliance 36\n\n**6.** Democratize access to quality data 41\n\n**7.** Dramatically increase productivity of your workforce 47\n\n**8.** Make informed build vs. buy decisions 52\n\n**9.** Allocate, monitor and optimize costs 55\n\n**10.** Move to production and scale adoption 58\n\n\nJessica Barbieri\n\n\nToby Balfre\n\n\n**C H A P T E R 3 :** **Conclusion** \u0007 63\n\n\n-----\n\n**CHAPTER 1:**\n## Executive Summary\n\nData and AI leaders are faced with the challenge\n\nof future-proofing their architecture and platform\n\ninvestments. The Lakehouse implementation from\n\nDatabricks combines the best features of EDWs\n\nand data lakes by enabling all their workloads using\n\nopen source and open standards — avoiding the\n\nvendor lock-in, black box design and proprietary\n\ndata formats of other cloud vendors.\n\n\nIt’s not surprising that many industry experts say data is the most valuable resource in the modern\n\neconomy — some even go so far as to describe it as the “new oil.” But at Databricks, we think of data as\n\nwater. Its core compound never changes, and it can be transformed to whatever use case is desired,\n\nwith the ability to get it back to its original form. Furthermore, just as water is essential to life, data is now\n\nessential to survival, competitive differentiation and innovation for every business. Clearly, the impact and\n\nimportance of data are growing exponentially in both our professional and personal lives, while artificial\n\nintelligence (AI) is being infused in more of our daily digital interactions. 
The explosion in data availability\n\nover the last decade and the forecast for growth at a compounded [annual growth rate (CAGR) of 23%](https://www.google.com/url?q=https://www.idc.com/getdoc.jsp?containerId%3DprUS47560321&sa=D&source=docs&ust=1651117260200496&usg=AOvVaw3jdZ_6YHlXGQlUMJK8ULux) over\n\n2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n\n(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n\nEvery organization is working to improve business outcomes while effectively managing a variety of risks —\n\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n\nYour organization’s data and the systems that process it play a critical role in not only enabling your financial\n\ngoals but also in minimizing these seven key business risks.\n\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\n\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\n\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\n\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\n\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\n\nsignificant return on investment (ROI) — one that starts in months, not years.\n\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\n\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\n\nto deliver on their data strategy — including how to deploy a modern data architecture, leverage data\n\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\n\nidentify and execute on AI opportunities.\n\n\n-----\n\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\n\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\n\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\n\norganizations have the option of moving away from closed, proprietary systems offered by a variety\n\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\n\nindustry standards.\n\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\n\nwe’ve hired industry experts and thought leaders to help organizations better understand the steps involved\n\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\n\narchitecture, which decouples data storage from compute while providing the best price/performance\n\nmetrics for all your data workloads — including data warehousing. 
We have captured the lessons learned and summarized them in this series of Executive Guides — which are designed to serve as blueprints for CIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation initiatives for data, analytics and AI using a _modern data stack_. Databricks is the first company to deliver a unified data platform that realizes the data lakehouse architecture and enables the data personas in your organization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as shown in Figure 1.\n\n\n###### Lakehouse Platform\n\n[Figure 1: The Databricks Lakehouse Platform. Data Warehousing, Data Engineering, Data Streaming, and Data Science and ML workloads run on Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and the Cloud Data Lake (all structured and unstructured data).]\n\n\n-----\n\n**The lakehouse architecture benefits organizations in several ways:**\n\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n\n**2.** It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n\n**3.** It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.\n\nOur intention is to present key considerations and equip you with the knowledge to ask informed questions, make the most critical decisions early in the process, and develop the comprehensive strategy that most organizations lack.\n\nIn addition, we have created an easy-to-follow Data and AI Maturity Model and provided a comprehensive professional services offering that organizations can leverage to measure their readiness, reskill their staff and track progress as they embark on their data transformation initiative.\n\n\n-----\n\n**CHAPTER 2:**\n## Define the Strategy\n\n\nThe most critical step to enable data, analytics and AI at scale is to develop a comprehensive and executable strategy for how your organization will leverage people, processes and platforms to drive measurable business results against your corporate priorities. The strategy serves as a set of principles that every member of your organization can refer to when making decisions. The strategy should cover the roles and responsibilities of teams within your organization for how you capture, store, curate and process data to run your business — including the internal and external resources (labor and budget) needed to be successful.\n\n\nEstablish the\ngoals and\nbusiness value\n\n\nBuild\nsuccessful\ndata teams\n\n\nEase data\ngovernance and\ncompliance\n\n\nSimplify\nthe user\nexperience\n\n\nAllocate,\nmonitor and\noptimize costs\n\n\nIdentify and\nprioritize\nuse cases\n\n\nDeploy a modern\ndata architecture\n\n\nDemocratize\naccess to\nquality data\n\n\nMake informed\nbuild vs. 
buy\ndecisions\n\n\nMove to\nproduction and\ndrive adoption\n\n\n**Figure 2:**\nThe 10 steps to a winning data and AI strategy\n\n\n-----\n\n#### Here are 10 key considerations\n\n**1.** \u0007Secure buy-in and alignment on the overall business goals, timeline and appetite for the initiative.\n\n**2.** \u0007Identify, evaluate and prioritize use cases that actually provide a significant ROI.\n\n**3.** \u0007Create high-performing teams and empower your business analyst, data scientist, machine learning\n\nand data engineering talent.\n\n**4.** \u0007Future-proof your technology investment with a modern data architecture.\n\n**5.** \u0007Ensure you satisfy the European Union’s General Data Protection Regulation (GDPR), the California\n\nConsumer Privacy Act (CCPA) and other emerging data compliance and governance regulations.\n\n**6.** \u0007Implement needed policies, procedures and technology to guarantee data quality and enable secure\n\ndata access and the sharing of all your data across the organization.\n\n**7.** \u0007Streamline the user experience (UX), improve collaboration and simplify the complexity of your tooling.\n\n**8.** \u0007Make informed build vs. buy decisions and ensure you are focusing your limited resources on the most\n\nimportant problems.\n\n**9.** \u0007Establish the initial budgets and allocate and optimize costs based on SLAs and usage patterns.\n\n**10.** \u0007Codify best practices for moving into production and how to measure progress, rate of adoption and\n\nuser satisfaction.\n\nThe strategy should clearly answer these 10 topics and more, and should be captured in a living document,\n\nowned and governed by the CDO and made available for everyone in the organization to review and provide\n\nfeedback on. The strategy will evolve based on the changing market/political conditions, evolving business,\n\nthe technology landscape or a combination of any of these — but it should serve as the North Star for\n\nhow you will navigate the many decisions and trade-offs that you will need to make over the course of the\n\ntransformation.\n\n\nThis guide takes a stepwise approach to\n\naddressing each of these 10 topics.\n\n\n-----\n\nStudies have shown that data scientists spend 80%\n\nof their time collecting and compiling data sets\n\n\n#### 1. Establish the goals and business value\n\nMost organizations on a data, analytics and AI journey establish a set of goals for the resulting investment.\n\nThe goals generally fall into one of three categories:\n\n**1.** **Business outcomes**\n\n**2.** **People**\n\n**3.** **Technology**\n\n\nand only 20% of their time developing insights and\n\n\nIn terms of business outcomes, organizations need to adapt more quickly to market opportunities and\n\nemerging risks, and their legacy-based information systems make that difficult to achieve. As a result,\n\nbusiness leaders see the digital transformation as an opportunity to build a new technology foundation\n\nfrom which to run their business and increase business value. One that is more agile, scalable, secure and\n\neasier to use — making the organization better positioned to adapt, innovate and thrive in the modern and\n\ndynamic economy.\n\nFor organizations today, people are one of their most valuable assets — you cannot succeed in data,\n\nanalytics and AI without them. The battle for top talent is as fierce as ever, and the way that people work\n\nimpacts your ability to hire and retain the skills you need to succeed. 
It is important to make sure that employees work in a frictionless data environment, to the extent possible, so they feel productive each day and can do their best work.\n\nFinally, from a technology perspective, organizations have grown tired of the high costs associated with complex system architectures, vendor lock-in, and proprietary solutions that are slow to evolve. The industry trend is to move away from large capital expenditures (capex) to pay for network and server capacity in advance — and toward a “just-in-time” and “pay-for-what-you-use” operating expense (opex) approach. Your data analytics environment should support this trend as well — using open standards, low-cost storage and on-demand compute that efficiently spins up to perform data workloads and spins down once they are complete.\n\n\nalgorithms. Organizations that are able to invert these numbers benefit in two ways — happier employees and improved time to market for use cases. These employers create more favorable working environments and lower the risk of burnout and the resulting regrettable attrition.\n\n\n-----\n\n**Executive buy-in and support**\n\nLarge organizations are difficult to change — but it’s not impossible. In order to be successful, you need to have unwavering buy-in and support from the highest levels of management — including the CEO and board of directors. With this support, you have the leverage you need to develop the strategy, decide on an architecture and implement a solution that can truly change the way your business is run. Without it, you have a very expensive science project that has little hope of succeeding. Why? Because the majority of people in your organization are busy doing their day jobs. The added work to support the initiative must be offset by a clear articulation of the resulting benefits — not only for the business but for the personnel within it. The transformation should result in a positive change to how people do their jobs on a daily basis.\n\nTransformation for data, analytics and AI needs to be a company-wide initiative that has the support from all the leaders. Even if the approach is to enable data and AI one business unit (BU) at a time, the plan needs to be something that is fully embraced in order to succeed. Ideally, the senior-most executives serve as vocal proponents.\n\n\n-----\n\n**Evolve to an AI-first company — not just a data-first company**\n\nData and AI transformations should truly transform the way organizations use data, not just evolve it. For decades, businesses have operated using traditional business processes and leveraged Structured Query Language (SQL) and business intelligence (BI) tools to query, manipulate and report on a subset of their data. There are five major challenges with this approach:\n\n**1.** A true self-assessment of where your organization is on the AI maturity curve. Most organizations will use pockets of success with analytics and AI to move higher up the maturity curve, but in reality the ability to replicate and scale the results is nearly impossible.\n\n[Figure 3: The Data Maturity Curve, moving from hindsight to foresight. Maturity progresses from clean data, reports, ad hoc queries and data exploration (what happened?) through predictive modeling and prescriptive analytics (what will happen? how should we respond?) to automated decision-making (automatically make the best decision), with data and AI maturity on the horizontal axis. Tech leaders are to the right of the Data Maturity Curve.]\n\n\n-----\n\n**2.** Data volumes and types have outgrown even the most modern approaches to SQL-based data processing.\n\n**3.** These large data volumes also make it nearly impossible for your workforce to continue to programmatically state, in a priority manner, how data insights can be achieved or how the business should react to changing data.\n\n**4.** Organizations need to reduce the costs of processing all this data. You simply cannot afford to hire the number of people needed to respond to every piece of data flowing into your environment. Machines scale, people do not.\n\n**5.** Advances in machine learning and AI have simplified the steps and reduced the expertise needed to gain game-changing insights. For these reasons, plus many others, the organizations that thrive in the 21st century will do so based on their ability to leverage all the data at their disposal. Traditional ways of processing and managing data will not work. Using ML and AI will empower your workforce to leverage data to make better decisions for managing risk, helping your organization succeed in the modern economy.\n\n**Go “all in” on the cloud**\n\nThe COVID-19 pandemic has caused rapid adoption of cloud-based solutions for collaboration and videoconferencing — and organizations are now using this time to reevaluate their use of on-premises and cloud-based services. The cloud vendors provide many benefits to organizations, including Infrastructure as a Service (IaaS), Platform as a Service (PaaS) and Software as a Service (SaaS) solutions. These benefits, especially when combined with the use of open source software (OSS), increase the speed at which organizations can use the latest technologies while also reducing their capex in these budget-conscious times.\n\nFor AWS, Microsoft, Google and other cloud providers, the game is about data acquisition. The more corporate data that resides in a specific cloud, the more sticky the customer is to the vendor. At the same time, multicloud support is both a selling point and an aspirational goal for many organizations. Companies are well aware of vendor lock-in and want to abstract their applications so they can be moved across clouds if there is a compelling business reason.\n\n\n-----\n\nApproaching your technology choices with a multicloud point of view gives the organization more sovereignty over the data — flexibility to run workloads anywhere, ease of integration when acquiring businesses that run on different cloud providers and simplified compliance with emerging regulations that may require companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information. As a result, data portability and the ability to run workloads on different cloud providers are becoming increasingly important.\n\n**Modernize business applications**\n\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift” approach. The majority of on-premises applications are not built with the cloud in mind. They usually differ in the way that they handle security, resiliency, scalability and failover. Their application designs often store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and CCPA standards. 
Finally, the features and capabilities of the application may be monolithic in nature and,\n\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\n\nservices and APIs to easily provide access to an application’s functionality.\n\nCloud-based architectures, commodity databases and software application development frameworks make\n\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\n\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n\na backing database) has become straightforward with the latest tooling available to your application\n\ndevelopment teams.\n\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\n\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n\napplications that generate and store a significant amount of the data consumed within an organization. Using\n\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n\n\n“We are on an amazing journey. Being among\n\nthe fastest-growing enterprise software cloud\n\ncompanies on record was unimaginable when\n\nwe started Databricks. To get here, we’ve stayed\n\nfocused on the three big bets we made when\n\nfounding the company — cloud, open source\n\nand machine learning. Fast-forward seven years,\n\nthousands of data teams around the globe are\n\nworking better together on Databricks.”\n\n**Ali Ghodsi**\n\nCo-founder and CEO\n\nDatabricks\n\n\n-----\n\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n\nother applications within your environment to store copies of the data — unless absolutely necessary for\n\nperformance reasons. In this case, it is best to “cache” the data for use in the non-SOR application and sync\n\nthe data from the actual SOR.\n\nData from these SORs should be made available in three ways:\n\n**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n\n**2.** \u0007Ensure that copies of the data land in the data lake.\n\n**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n\nconsumption by downstream applications.\n\n**Move toward real-time decisioning**\n\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n\nand the second is to view data as an individual event. This so-called “time value of data” is an important\n\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n\nthe same data platform.\n\nOn the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\n\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\n\nnewly created or arriving data event gives you the opportunity to make decisions — in the moment — that\n\ncan positively affect your ability to reduce risk, better service your customers or lower your operating costs.\n\nThe goal is to act immediately — with reliability and accuracy — upon the arrival of a new streaming event.\n\nThis “time value of data” is shown in Figure 4 on the next page.\n\n\n-----\n\nFor example, real-time processing of clickstream data from your customer-facing mobile application can\n\nindicate when the customer is having trouble and may need to call into your call center. This insight gives\n\nyou the opportunity to interject with a digital assistant or to pass on “just-in-time” information to your call\n\ncenter agents — improving the customer experience and lowering customer churn.\n\nData, analytics and AI rely on the ”time value of data” — a powerful concept that allows you to train your\n\nmachine learning models using historical data and provides you with the ability to make real-time decisions\n\nas new events take place. For example, credit card fraud models can use deep historical data about a given\n\ncustomer’s buying patterns (location, day of week, time of day, retailer, average purchase amount, etc.) to\n\nbuild rich models that are then executed for each new credit card transaction. This real-time execution,\n\ncombined with historical data, enables the best possible customer experience.\n\n#### Time Value of Data\n\n\nThe Databricks Lakehouse Platform allows you to\n\ncombine real-time streaming and batch processing\n\nusing one architecture and a consistent set of\n\nprogramming APIs.\n\n**Figure 4:**\nTime Value of Data\n\n\nValue of an individual data\n\nrecord is very high once created\nbut decreases over time\n\n\nValue of data records\n\nin aggregate increases\nover time\n\n\nReal-Time Decisioning Real-Time Analysis Trend Analysis Model Training\n\n\n-----\n\n**Land** **_all_** **data in a data lake**\n\nIn order to effectively drive data, analytics and AI adoption, relevant data needs to be made available to the\n\nuser as quickly as possible. Data is often siloed in various business applications and is hard and/or slow to\n\naccess. Likewise, organizations can no longer afford to wait for data to be loaded into data stores like a data\n\nwarehouse, with predefined schemas that are designed to allow you to ask very specific questions about\n\nthat data only. What do you do when you want to ask a different question? To further complicate matters,\n\nhow do you handle new data sets that cannot easily be manipulated to fit into your predefined data stores?\n\nHow do you find new insights as quickly as possible?\n\nThe overall goal is to gain insights from the data as quickly as possible — which can happen at any step\n\nalong the data pipeline — including raw, refined and curated data states.\n\nThis phenomenon has led to the concept known as the four Vs of data — specifically, _volume_ , _velocity_ ,\n\n_variety_ and _veracity_ . Data-, analytics- and AI-driven organizations need to be able to store and process\n\nall their data, regardless of size, shape or speed. In addition, data lineage and provenance are critical to\n\nknowing whether or not you can trust the data.\n\n**Change the way people work**\n\nWhen done correctly, organizations get value from data, analytics and AI in three ways — infrastructure\n\nsavings, productivity gains and business-impacting use cases. 
Productivity gains require a true focus on\n\nminimizing the number of steps needed to produce results with data. This can be accomplished by:\n\n**1.** \u0007 Making data more accessible and ensuring it can be trusted\n\n**2.** Minimizing the number of tools/systems needed to perform work\n\n**3.** Creating a flywheel effect by leveraging the work of others\n\n\n“We believe that the data lakehouse architecture\n\npresents an opportunity comparable to the one\n\nwe saw during early years of the data warehouse\n\nmarket. The unique ability of the lakehouse to\n\nmanage data in an open environment, blend all\n\nvarieties of data from all parts of the enterprise and\n\ncombine the data science focus of the data lake\n\nwith the end-user analytics of the data warehouse\n\nwill unlock incredible value for organizations.”\n\n**Bill Inmon**\n\nThe father of the data warehouse\n\n\n-----\n\nIn large organizations, it’s understandable why application and data silos are prevalent. Each business unit\n\nis laser-focused on achieving their goals, and the use of information technology is viewed as an enabler.\n\nSystems and applications get built over time to satisfy specific needs within a line of business. As a result,\n\nit’s not surprising to learn that employees must jump through a large number of hoops to get access to the\n\ndata they need to do their jobs. It should be as simple as getting your identity and PC.\n\nWith Databricks, users can collaborate and perform\n\n\nA primary goal of your data and AI transformation should be to focus on improving the user experience —\n\nin other words, improving how your entire organization interacts with data. Data must be easily discoverable\n\nwith default access to users based on their role(s) — with a simple process to compliantly request access to\n\ndata sets that are currently restricted. The tooling you make available should satisfy the principal needs of\n\nthe various personas — data engineers, data scientists, machine learning engineers, business analysts, etc.\n\nFinally, the results of the work performed by a user or system upstream should be made available to users\n\nand systems downstream as “data assets” that can drive business value.\n\nOrganizations that maximize the productivity of their workforce and enable employees to do their best work\n\nunder optimal conditions are the ones that have the greatest chance to recruit and retain top talent.\n\n**Minimize time in the “seam”**\n\nAs you begin your data transformation, it is important to know that the longer it takes, the more risk and\n\ncost you introduce into your organization. The stepwise approach to migrating your existing data ecosystem\n\nto a modern data stack will require you to operate in two environments simultaneously, the old and the new,\n\nfor some period of time. This will have a series of momentary adverse effects on your business:\n\n\u0007It will increase your operational costs substantially, as you will run two sets of infrastructure\n\n\u0007It will increase your data governance risk, since you will have multiple copies of your data sitting in two\n\nvery different ecosystems\n\n\ntheir work more efficiently, regardless of their\n\npersona or role. 
The user experience is designed\n\nto support the workloads of data analysts, SQL\n\ndevelopers, data engineers, data scientists and\n\nmachine learning professionals.\n\n\n-----\n\n\u0007It increases the cyberattack footprint and vectors, as the platforms will likely have very different security\n\nmodels and cyber defenses\n\n\u0007It will cause strain on your IT workforce due to the challenges of running multiple environments\n\n\u0007It will require precise communications to ensure that your business partners know which environment to\n\nuse and for what data workloads\n\nTo mitigate some of the strain on the IT workforce, some organizations hire staff augmentation firms to\n\n“keep the lights on” for the legacy systems while the new systems are being implemented and rolled out.\n\nIt’s important to remember this is a critical but short-lived experience for business continuity.\n\n**Shut down legacy platforms**\n\nIn keeping with the goal of minimizing time in the seam, the project plan and timeline must include the\n\nsteps and sequencing for shutting down legacy platforms. For example, many companies migrate their on-\n\npremises Apache Hadoop data lake to a cloud-based object store. The approach for shutting down the on-\n\npremises Hadoop system is generally as follows:\n\n**1.** \u0007Identify the stakeholders (business and IT) who own the jobs that run in the Hadoop environment.\n\n**2.** \u0007Declare that no changes can be made to the Hadoop environment — with the exception of emergency\n\nfixes or absolutely critical new business use cases.\n\n**3.** \u0007Inventory the data flow paths that feed data into the Hadoop environment.\n\n**4.** \u0007Identify the source systems that feed the data.\n\n**5.** \u0007Inventory the data that is currently stored in the Hadoop environment and understand the rate of change.\n\n**6.** \u0007Inventory the software processes (aka jobs) that handle the data and understand the output of the jobs.\n\n**7.** \u0007Determine the downstream consumers of the output from the jobs.\n\n\n-----\n\n**8.** \u0007Prioritize the jobs to move to the modern data architecture.\n\n**9.** \u0007One by one, port the data input, job execution, job output and downstream consumers to the new\n\narchitecture.\n\n**10.** \u0007Run legacy and new jobs in parallel for a set amount of time — in order to validate that things are\n\nworking smoothly.\n\n**11.** \u0007Shut down the legacy data feeds, job execution and consumption. Wait. Look for smoke.\n\n**12.** \u0007Rinse and repeat — until all jobs are migrated.\n\n**13.** \u0007Shut down the Hadoop cluster.\n\nA similar model can also be applied to legacy on-premises enterprise data warehouses.\n\nYou can follow the same process for other legacy systems in your environment. Some of these systems\n\nmay be more complex and require the participation of more stakeholders to identify the fastest way to\n\nrationalize the data and processes. It is important, however, to make sure that the organization has the\n\nfortitude to hold the line when there is pressure to make changes to the legacy environments or extend\n\ntheir lifespan. Setting firm dates for when these legacy systems will be retired will serve as a forcing function\n\nfor teams when they onboard to the new modern data architecture. Having the executive buy-in from page\n\n9 plays a crucial role in seeing the shutdown of legacy platforms through.\n\n\n-----\n\n#### 2. 
Identify and prioritize use cases\n\nAn important next step in enabling data, analytics and AI to transform your business is to identify use cases\n\nthat drive business value — while prioritizing the ones that are achievable under the current conditions\n\n(people, processes, data and infrastructure). There are typically hundreds of use cases within an organization\n\nthat could benefit from better data and AI — but not all use cases are of equal importance or feasibility.\n\nLeaders require a systematic approach for identifying, evaluating, prioritizing and implementing use cases.\n\n**Establish the list of potential use cases**\n\nThe first step is to ideate by bringing together various stakeholders from across the organization and\n\nunderstand the overall business drivers — especially those that are monitored by the CEO and board of\n\ndirectors. The second step is to identify use case opportunities in collaboration with business stakeholders,\n\nand understand the business processes and the data required to implement the use case. After steps one and\n\ntwo, the next step is to prioritize these cases by calculating the expected ROI. To avoid this becoming a pet\n\nproject within the data/IT teams, it’s important to have a line of business champion at the executive level.\n\nThere needs to be a balance between use cases that are complex and ones that are considered low-\n\nhanging fruit. For example, determining if a web visitor is an existing or net new customer requires a fairly\n\nstraightforward algorithm that uses web browser cookie data and the correlation of the devices used by a\n\ngiven individual or household. However, developing a sophisticated credit card fraud model that takes into\n\naccount geospatial, temporal, merchant and customer-purchasing behavior requires a broader set of data\n\nto perform the analytics.\n\nIn terms of performance, thought should be given to the speed at which the use case must execute. In\n\ngeneral, the greater the performance, the higher the cost. Therefore, it’s worth considering grouping use\n\ncases into three categories:\n\n**1.** Sub-second response\n\n**2.** Multi-second response\n\n**3.** Multi-minute response\n\n\n-----\n\nBeing pragmatic about the true service level agreement (SLA) will save time and money by avoiding over-\n\nengineering the design and infrastructure.\n\n**Thinking in terms of “data assets”**\n\nMachine learning algorithms require data — data that is readily available, of high quality and relevant — to\n\nperform the experiments, train the models, and then execute the model when it is deployed to production.\n\nThe quality and veracity of the data used to perform these machine learning steps are key to deploying\n\nmodels into production that produce a tangible ROI.\n\nIt is critical to understand what steps are needed in order to make the data available for a given use case.\n\nOne point to consider is to prioritize use cases that make use of similar or adjacent data. If your engineering\n\nteams need to perform work to make data available for one use case, then look for opportunities to have the\n\nengineers do incremental work in order to surface data for adjacent use cases.\n\nMature data and AI companies embrace the concept of “data assets” or “data products” to indicate\n\nthe importance of adopting a design strategy and data asset roadmap for the organization. 
Taking this\n\napproach helps stakeholders avoid fit-for-purpose data sets that drive only a single use case — and raise\n\nthe level of thinking to focus on data assets that can fuel many more business functions. The “data asset”\n\nroadmap helps data source owners understand the priority and complexity of the data assets that need to\n\nbe created. Using this approach, data becomes part of the fabric of the company, evolves the culture, and\n\ninfluences the design of business applications and other systems within the organization.\n\n**Determine the highest impact/priority**\n\nAs shown in Figure 5, organizations can evaluate a given use case using a scorecard approach that takes into\n\naccount three factors: strategic importance, feasibility and tangible ROI. Strategic importance measures\n\nwhether or not the use case helps meet immediate corporate goals and has the potential to drive growth or\n\nreduce risk. Feasibility measures whether or not the organization has the data and IT infrastructure, plus the\n\ndata science talent readily available, to implement the use case. The ROI score indicates whether or not the\n\norganization can easily measure the impact to the P/L.\n\n\n-----\n\nScoring guidelines (relative scoring); each criterion is scored by business or technology stakeholders:\n\n|Score|Criterion|1 = LOW SCORE, DO LATER|5 = AVERAGE, NICE TO HAVE|10 = HIGH, MUST HAVE|\n|---|---|---|---|---|\n|Strategic Importance Score: How important is it to business success?|Business Alignment|Not required for any corporate goals|Not required for immediate corporate goals|Required for immediate corporate goals|\n||Business Driver|Does not drive growth/profitability (P&L) or competitiveness|Could drive some growth/profitability (P&L)|Significantly drives growth/profitability (P&L) and competitiveness|\n||IT Foundation|No BI/IT dependencies|BI/IT best practice|BI/IT foundational element|\n|Feasibility Score: What is the current data and AI readiness?|Data Access and Trust (Adjusting Based on Availability)|Low awareness of available data (internal and external) or the problems it can solve|Some ingestion and exploration of large-scale data is possible|Large-scale data is available for exploration in the cloud|\n||Delivery (Data Engineers, Data Scientists, Data Analysts)|Limited in-house resources|Hiring plan for data science and engineering resources, few available in-house|Scaled data science, engineering, cloud and deployment organization|\n||Architecture|Current thinking on architecture resembles on-prem traditional data warehousing solution with batch processes rather than a data lakehouse approach|Architecture has been built and tested, some use cases are underway with multiple data sources now available in the cloud|The platform is utilized at scale across the business and is able to evolve to meet the demands of new business lines and services driven by data|\n|ROI Score: How tangible and large is the ROI?|ROI Potential|Mostly productivity gains, “soft intangible benefits”|Some P&L impact, not easily tangible|Significant P&L impact, “hard measured benefits”|\n\n\n**Figure 5:**\nMethodology for scoring use cases\n\n**Ensure business and technology leadership alignment**\n\nPrioritizing use cases requires striking a balance between offensive- and defensive-oriented use cases.\n\nIt is important for executives to evaluate use cases in terms of opportunity growth (offensive) and risk\n\nreduction (defensive). 
For example, data governance and compliance use cases should take priority\n\nover offensive-oriented use cases when the cost of a data breach or noncompliance is higher than the\n\nacquisition of a new customer.\n\n\n-----\n\nThe Databricks Professional Services team can\n\nhelp customers identify revenue-generating and\n\ncost-saving opportunities for data and AI use cases\n\nthat provide a significant ROI when adopting the\n\n\n#### 3. Build successful data teams\n\nIn order to succeed with data, analytics and AI, companies must find and organize the right talent into high-\n\nperforming teams — ones that can execute against a well-defined strategy with the proper tools, processes,\n\ntraining and leadership. Digital transformations require executive-level support and are likely to fail without\n\nit — especially in large organizations.\n\nHowever, it’s not enough to simply hire the best data and AI talent — the organization must want to succeed, at\n\nan enterprise level. In other words, they must also evolve their company culture into one that embraces data,\n\ndata literacy, collaboration, experimentation and agile principles. We define these companies as “data native.”\n\n\nlakehouse architecture.\n\n**Chief information officers and chief data officers — two sides of the data coin**\n\nData native companies generally have a single, accountable executive who is responsible for areas such\n\nas data science, business analytics, data strategy, data governance and data management. The data\n\nmanagement aspects include registering data sets in a data catalog, tracing data lineage as data sets flow\n\nthrough the environment, performing data quality checks and scanning for sensitive data in the clear.\n\nMany organizations are rapidly adding the chief data officer (CDO) role to their executive ranks in order\n\nto oversee and manage these responsibilities. The CDO works closely with CIOs and other business\n\nstakeholders to establish the overall project plan, design and implementation — and to align project\n\nmanagement, product management, business analysis, data engineering, data scientist and machine\n\nlearning talent.\n\nThe CDO and CIO will need to build a broad coalition of support from stakeholders who are incentivized to\n\nmake the transformation a success and help drive organization-wide adoption. To do this, the stakeholders\n\nmust understand the benefits of — and their role and responsibilities in — supporting the initiative.\n\n\n-----\n\nThere are two organizational constructs that are found in most successful data native companies. The first is\n\nthe creation of an _AI/ML center of excellence_ (COE) that is designed to establish in-house expertise around\n\nML and AI, and which is then used to educate the rest of the organization on best practices. The second is\n\nthe formation of a _data and AI transformation steering committee_ that will oversee and guide decisions and\n\npriorities for the transformative data, analytics and AI initiatives, plus help remove obstacles.\n\nFurthermore, CDOs need to bring their CIOs along early in the journey.\n\n**Creating an AI/ML COE**\n\nData science is a fast-evolving discipline with an ever-growing set of frameworks and algorithms to enable\n\neverything from statistical analysis to supervised learning to deep learning using neural networks. 
While it is\n\ndifficult to establish specific and exact boundaries between the various disciplines, for the purposes of this\n\ndocument, we use “data science” as an umbrella term to cover machine learning and artificial intelligence.\n\nHowever, the general distinction is that data science is used to produce insights, machine learning is used to\n\nproduce predictions, and artificial intelligence is used to produce actions. In contrast, while a data scientist\n\nis expected to forecast the future based on past patterns, data analysts extract meaningful insights from\n\nvarious data sources. A data scientist creates questions, while a data analyst finds answers to the existing\n\nset of questions.\n\nOrganizations wanting to build a data science competency should consider hiring talent into a centralized\n\norganization, or COE, for the purposes of establishing the tools, techniques and processes for performing\n\ndata science. The COE works with the rest of the organization to educate and promote the appropriate use\n\nof data science for various use cases.\n\n\n-----\n\nA common approach is to have the COE report into the CDO, but still have data scientists dotted line into\n\nthe business units or department. Using this approach, you achieve two goals:\n\n\u0007The data scientists are closer to the business stakeholders, have a better understanding of the data\n\nwithin a business unit and can help identify use cases that drive value\n\n\u0007Having the data scientists reporting into the CDO provides a structure that encourages collaboration\n\nand consistency in how work is performed among the cohort and brings that to the entire organization\n\n**Data and AI transformation steering committee**\n\nThe purpose of the steering committee is to provide governance and guidance to the data transformation\n\ninitiative. The CDO and CIO should co-chair the committee along with one business executive who can be\n\na vocal advocate and help drive adoption. The level of executive engagement is critical to success of the\n\ninitiative.\n\nThe steering committee should meet regularly with leaders from across the organization to hear status\n\nreports and resolve any conflicts and remove obstacles, if possible. 
The leaders should represent a broad\n\ngroup of stakeholders, including:\n\n\u0007\n**Program/project management:** To report the status of progress for deploying the new data\n\necosystem and driving adoption through use cases\n\n\u0007\n**Business partners:** To provide insight and feedback on how easy or difficult it is to drive adoption\n\nof the platform\n\n\u0007\n**Engineering:** To report the status of the implementation and what technology trade-offs need\n\nto be made\n\n\u0007\n**Data science:** To report on the progress made by the COE on educating the organization about\n\nuse cases for ML and to report the status of various implementations\n\n\n-----\n\n\u0007\n**InfoSec:** To review the overall security, including network, storage, application and data\n\nencryption and tokenization\n\n\u0007\n**Architecture:** To oversee that the implementation adheres to architectural standards\n\nand guardrails\n\n\u0007\n**Risk, compliance and legal:** To oversee the approach to data governance\n\nand ethics in ML\n\n\u0007\n**User experience:** To serve as the voice of the end users who will perform their jobs using\n\nthe new data ecosystem\n\n\u0007\n**Communication:** To provide up-to-date communications to the organization about next\n\nsteps and how to drive adoption\n\n**Partnering with architecture and InfoSec**\n\nEarly on, the CDO and CIO should engage the engineering and architecture community within the\n\norganization to ensure that everyone understands the technical implications of the overall strategy. This\n\nminimizes the chances that the engineering teams will build separate and competing data platforms. In\n\nregulated industries that require a named enterprise architect (EA), this will be a key relationship to foster.\n\nThe EA is responsible for validating that the overall technology design and data management features\n\nsupport the performance and regulatory compliance requirements — specifically, whether the proposed\n\ndesign can meet the anticipated SLAs of the most demanding use cases and support the volume, velocity,\n\nvariety and veracity (four Vs) of the data environment.\n\n\nIt is important to fully understand which\n\nenvironments and accounts your data is stored\n\nin. The goal is to minimize the number of copies of\n\nyour data and to keep the data within your cloud\n\naccount — and not the vendor’s.\n\nMake sure the architecture and security model for\n\nprotecting data is well understood.\n\n\n-----\n\nFrom an InfoSec perspective, the CDO must work to ensure that the proper controls and security are\n\napplied to the new data ecosystem and that the authentication, authorization and access control methods\n\nmeet all the data governance requirements. An industry best practice is to enable self-service registration\n\nof data sets, by the data owner, and support the assignment of security groups or roles to help automate\n\nthe access control process. This allows data sets to be accessible only to the personnel that belong to a\n\ngiven group. The group membership could be based primarily on job function or role within the organization.\n\nThis approach provides fast onboarding of new employees, but caution should be taken not to proliferate\n\ntoo many access control groups — in other words, do not get too fine grained with group permissions, as\n\nthey will become increasingly difficult to manage. A better strategy is to be more coarse-grained and use\n\nrow- and column-level security sparingly.\n\n**Centralized vs. 
federated labor strategy**\n\nIn most organizations today, managers work in silos, making decisions with the best intentions but focused\n\non their own functional areas. The primary risk to the status quo is that there will be multiple competing and\n\nconflicting approaches to creating enterprise data and AI platforms. This duplication of effort will waste time\n\nand money and potentially erode the confidence and motivation of the various teams. While it certainly is\n\nbeneficial to compare and contrast different approaches to implementing an architecture, the approaches\n\nshould be strictly managed, with everyone designing for the same goals and requirements — as described in\n\nthis strategy document and adhering to the architectural principles and best practices.\n\nEven still, the roles of the CDO and CIO together should deliver a data analytics and AI platform with the\n\nleast amount of complexity as possible, and one that can easily scale across the organization. It is very\n\nchallenging to merge disparate data platform efforts into a single, cohesive design. It is best to get out\n\nin front of this wave of innovation and take input from the various teams to create a single, centralized\n\nplatform. Having the data engineering teams centralized, reporting into a CIO, makes it easier to design a\n\nmodern data stack — while ensuring that there is no duplication of effort when implementing the platform\n\ncomponents. Figure 6 shows one possible structure.\n\n\n-----\n\n**Figure 6:**\nCentralized teams with matrixed responsibilities\n\n\n**Data Scientist**\nModel and predict with data\n\n**Data Analyst**\nVisualize and describe data\n\n\n**Team A ($1.1M)** **Team B ($1.3M)** **Team C ($1.5M)**\n\n**Data Engineer**\nStore, process, maintain data\n\n**Business Partners**\n**and Domain Experts**\n\n\nCentralize data scientists under CDO — embed in lines of business for day-to-day tasking\n\nCentralize data engineers under CIO/CTO — initially as an enterprise function\n\n**Hiring, training and upskilling your talent**\n\nWhile this guide does not cover recruiting strategies, it is important to note that data engineering and data\n\nscience talent is very difficult to find in this competitive market. As a result, every organization should\n\nconsider what training and upskilling opportunities exist for their current staff. A large number of online\n\ncourses, at relatively low cost, teach the fundamentals of data science and AI. It will still be important to\n\naugment your existing staff with experienced data scientists and machine learning experts. You will then\n\nneed to establish clear training paths, resources and timelines to upskill your talent.\n\nUsing the COE construct, it is easier to upskill a mix of data science talent by having the experts mentor the\n\nless experienced staff. The majority of Ph.D.-level talent comes from academia and has a vested interest\n\nin educating others. It’s important to set up the structure and allow time in the schedule for knowledge\n\ntransfer, experimentation and a safe environment in which to fail. 
A key aspect in accelerating the\n\nexperience of your talent is to enable data science using production-like data and creating a collaborative\n\nenvironment for code sharing.\n\n\n-----\n\nThe Databricks training, [documentation](https://docs.databricks.com) and\n\n[certification](https://databricks.com/learn/certification) available to customers is industry-\n\nleading, and our [Solution Accelerators](https://databricks.com/solutions/accelerators) provide\n\n\n#### 4. Deploy a modern data stack\n\nThe modern data architecture can most easily be described as the evolution of the enterprise data\n\nwarehouse (EDW) from the 1980s and the Hadoop-style data lakes from the mid-2000s. The capabilities,\n\nlimitations and lessons learned from working with these two legacy data architectures inspired the next\n\ngeneration of data architecture — what the industry now refers to as the lakehouse.\n\nFigure 7 shows how the architectures have evolved as networking, storage, memory and CPU performance\n\nhave improved over time.\n\n\nexemplar code for organizations to hit the ground\n\nrunning with data and AI.\n\n**Figure 7:**\nA brief history of data architectures\n\n\n-----\n\n**Evolving beyond the enterprise data warehouse and data lake**\n\nThe EDW provided organizations with the ability to easily load structured and semi-structured data into\n\nwell-organized tables — like rows and columns in a spreadsheet — and execute Structured Query Language\n\n(SQL) queries and generate business intelligence (BI) reports to measure the health and performance of\n\nthe business. Though the EDW coupled storage and compute, it provided organizations with the ability to\n\ncatalog data, apply robust security and audit, monitor costs and support a large number of simultaneous\n\nusers — while still being performant. The EDW served its purpose for decades. However, most of the recent\n\nadvances in AI have been in better models to process unstructured data (text, images, video, audio), but\n\nthese are precisely the types of data that an EDW is not optimized for.\n\nTherefore, in the mid-2000s, organizations wanted to take advantage of new data sets — _ones that_\n\n_contained unstructured data_ — and apply new analytics — _ones that leveraged emerging data science_\n\n_algorithms_ . In order to accomplish this, massive investments in on-premises data lakes occurred — most\n\noften leveraging Apache Hadoop and its distributed file system, known as HDFS, running on low-cost,\n\ncommodity hardware. The Hadoop-style data lake provided the separation of compute from storage that\n\norganizations were seeking — thus eliminating the risk of vendor lock-in and opening the doors to a wide\n\nrange of new analytics. Despite all these benefits, the architecture proved to be difficult to use, with a\n\ncomplex programming model known as MapReduce, and the performance fell short of the majority of real-\n\ntime use cases.\n\nOver time, Hadoop workloads were often migrated to Apache Spark™ workloads, which run 100x faster by\n\nprocessing data in-memory across a cluster — with the ability to massively scale. The Spark programming\n\nmodel was also simpler to use and provided a consistent set of application programming interfaces (APIs)\n\nfor languages such as Python, SQL, R, Java and Scala. 
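To make the point about consistent APIs concrete, here is a minimal sketch showing the same daily revenue aggregation expressed twice, once with Spark's Python DataFrame API and once in SQL; both forms are planned and executed by the same engine. The `sales.transactions` table and its columns are hypothetical and used for illustration only.

```python
# A minimal sketch of Spark's consistent APIs. The table and column names
# (sales.transactions, transaction_date, amount) are hypothetical.
from pyspark.sql import SparkSession
from pyspark.sql import functions as F

spark = SparkSession.builder.appName("consistent-spark-apis").getOrCreate()

# Python DataFrame API: aggregate revenue per day.
daily_revenue_df = (
    spark.table("sales.transactions")
         .groupBy("transaction_date")
         .agg(F.sum("amount").alias("revenue"))
)

# The same logic expressed in SQL; both run against the same data
# and compile down to the same execution plan.
daily_revenue_sql = spark.sql("""
    SELECT transaction_date, SUM(amount) AS revenue
    FROM sales.transactions
    GROUP BY transaction_date
""")
```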
Spark was the first major step in separating compute\n\nfrom storage and providing the scale needed for distributed workloads.\n\n\n-----\n\nA data lakehouse combines the best of data lakes and data warehouses, enabling BI and ML\n\non all data on a simple, open and multicloud modern data stack.\n\n\n**Cloud-based data lakes**\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud object stores like\n\nAmazon S3 and Azure Data Lake Storage (ADLS) have become some of the largest, most cost-effective\n\nstorage systems in the world — which make them an attractive platform to serve as the next generation\n\nof data lakes. Object stores excel at massively parallel reads — an essential requirement for modern data\n\nwarehouses.\n\nHowever, data lakes lack some critical features: They do not support transactions, they do not enforce\n\ndata quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\n\n**Lakehouse — the modern data architecture**\n\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\n\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\n\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n\narchitecture possible.\n\n\n-----\n\n**Figure 8:**\nThe building blocks for a modern data architecture: data sources (batch and real-time) spanning unstructured (image, video, audio, free text, blob), semi-structured (logs, clickstream, CSV, JSON, XML) and structured (systems of record, operational DBs) data feed a curated data lake with raw data ingest (“Bronze”), filtered/cleaned/augmented (“Silver”) and business-level aggregate (“Gold”) layers, with data quality enforced throughout, serving exploratory data science, production machine learning and BI/ad hoc SQL analytics.\n\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\n\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\n\ntarget-state architecture supports loading all the data types that might be interesting to an organization —\n\nstructured, semi-structured and unstructured — and provides a single processing layer, using consistent\n\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\n\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\n\ntime, money and duplication of effort. 
Data arrives in a landing zone and is then moved through a series of\n\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\n\nThe architecture makes possible the efficient creation of “data assets” for the organization by taking a\n\nstepwise approach to improving data.\n\n\n-----\n\n**Lakehouse key features**\n\nTo effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\n\navailable for stakeholders to run business-critical production workloads:\n\n\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\n\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\n\nmonitoring and recovery.\n\n\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n\nread or write data, typically using SQL.\n\n\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\n\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n\n\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n\nlakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\n\nunstructured data like tables, files, models and dashboards in concert with existing data, storage and\n\ncatalogs.\n\n\u0007 **Storage is decoupled from compute:** In practice this means storage and compute use separate\n\nclusters, thus these systems are able to scale to many more concurrent users and larger data sizes.\n\nSome modern data warehouses also have this property.\n\n\u0007 **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide\n\nan API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently\n\naccess the data directly.\n\n\nDatabricks released Delta Lake to the open source\n\ncommunity in 2019. Delta Lake provides all the data\n\nlifecycle management functions that are needed\n\nto make cloud-based object stores reliable and\n\nperformant. This design allows clients to update\n\nmultiple objects at once, replace a subset of\n\nthe objects with another, etc., in a serializable\n\nmanner that still achieves high parallel read/write\n\nperformance from the objects — while offering\n\nadvanced capabilities like time travel (e.g., query\n\npoint-in-time snapshots or rollback of erroneous\n\nupdates), automatic data layout optimization,\n\nupserts, caching and audit logs.\n\n\n-----\n\n\u0007 **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be\n\nused to store, refine, analyze and access data types needed for many new data applications, including\n\nimages, video, audio, semi-structured data and text.\n\n\u0007 **Support for diverse workloads:** This includes data science, machine learning, SQL and analytics. 
Multiple\n\ntools might be needed to support all these workloads, but they all rely on the same data repository.\n\n\u0007 **End-to-end streaming:** Real-time reports are the norm in many enterprises. Support for streaming\n\neliminates the need for separate systems dedicated to serving real-time data applications.\n\n\u0007 **BI support:** Lakehouses enable the use of BI tools directly on the source data. This reduces staleness,\n\nimproves recency, reduces latency and lowers the cost of having to operationalize two copies of the\n\ndata in both a data lake and a warehouse.\n\n\u0007 **Multicloud:** The Databricks Lakehouse Platform offers you a consistent management, security and\n\ngovernance experience across all clouds. You don’t need to invest in reinventing processes for every\n\ncloud platform that you’re using to support your data and AI efforts. Instead, your data teams can simply\n\nfocus on putting all your data to work to discover new insights and create business value.\n\n\n**Figure 9:**\nDelta Lake is the open data storage layer that delivers reliability, security and performance on your data lake — for both streaming and batch operations. The Lakehouse Platform supports data warehousing, data engineering, data streaming, and data science and ML workloads on top of Unity Catalog (fine-grained governance for data and AI), Delta Lake (data reliability and performance) and the cloud data lake (all structured and unstructured data).\n\n\n-----\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools\n\nfor security and access control are basic requirements. Data governance capabilities, including auditing,\n\nretention and lineage, have become essential, particularly in light of recent privacy regulations. Tools that\n\nenable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse,\n\nsuch enterprise features only need to be implemented, tested and administered for a single system.\n\n**Databricks — innovation driving performance**\n\nAdvanced analytics and machine learning on unstructured and large-scale data are two of the most\n\nstrategic priorities for enterprises today — and the growth of unstructured data is going to increase\n\nexponentially — so it makes sense for CIOs and CDOs to think about positioning their data lake as the\n\ncenter of their data infrastructure. The main challenge is whether or not it can perform reliably and fast\n\nenough to meet the SLAs of the various workloads — especially SQL-based analytics.\n\nDatabricks has focused its engineering efforts on incorporating a wide range of industry-leading software\n\nand hardware improvements in order to implement the first lakehouse solution. Our approach capitalizes\n\non the computing advances of the Apache Spark framework and the latest networking, storage and CPU\n\ntechnologies to provide the performance customers need to simplify their architecture. These innovations\n\ncombine to provide a single architecture that can store and process all the data sets within an organization —\n\nsupporting the range of analytics outlined above.\n\n**BI and SQL workloads**\n\nPerhaps the most significant challenge for the lakehouse architecture is the ability to support SQL queries\n\nfor star/snowflake schemas in support of BI workloads. Part of the reason EDWs have remained a major\n\npart of the data ecosystem is because they provide low-latency, high-concurrency query support. 
In order\n\nto compete with the EDW, optimizations must be found within the lakehouse architecture that provide\n\nsatisfactory query performance for the majority of BI workloads. Fortunately, advances in query plan, query\n\nexecution, statistical analysis of files in the object store, and hardware and software improvements make it\n\npossible to deliver on this promise.\n\n\nDatabricks is the only cloud-native vendor to be recognized as a Leader in both\n\n[2021 Magic Quadrant reports:](https://databricks.com/p/ebook/databricks-named-leader-by-gartner)\n\n**Cloud Database Management Systems** and\n\n**Data Science and Machine Learning Platforms**\n\n\n-----\n\n**A word about the data mesh architecture**\n\nIn 2019, another architectural concept, called the data mesh, was introduced. This architecture addresses\n\nwhat some designers identify as weaknesses of a centralized data lake. Namely, that you fill the data lake\n\nusing a series of extract, transform, load (ETL) processes — which unnecessarily adds complexity. The data\n\nmesh approach avoids centralizing data in one location and encourages the source systems to create\n\n“data products” or “data assets” that are served up directly to consumers for data and AI workloads. The\n\ndesigners advocate for a federated approach to data and AI — while using enterprise policies to govern how\n\nsource systems make data assets available.\n\nThere are several challenges with this approach. First, the data mesh assumes that each source system\n\ncan dynamically scale to meet the demands of the consumers — particularly challenging when data assets\n\nbecome “hot spots” within the ecosystem. Second, centralized policies oftentimes leave the implementation\n\ndetails to the individual teams. This can result in inconsistent implementations, which may lead to\n\nperformance degradations and differing cost profiles. Finally, the data mesh approach assumes that each\n\nsource system team has the necessary skills, or can acquire them, to build robust data products.\n\nThe lakehouse architecture is not at odds with the data mesh philosophy — as ingesting higher-quality data\n\nfrom the source systems reduces the curation steps needed inside the data lake itself.\n\n\n-----\n\n#### 5. Improve data governance and compliance\n\nData governance is perhaps the most challenging aspect of data transformation initiatives. Every\n\nstakeholder recognizes the importance of making data readily available, of high quality and relevant to help\n\ndrive business value. Likewise, organizations understand the risks of failing to get it right — the potential for\n\nundetected data breaches, negative impact on the brand and the potential for significant fines in regulated\n\nenvironments. However, organizations shouldn’t perceive data governance or a defensive data strategy as\n\na blocker or deterrent to business value. 
In fact, many organizations have leveraged their strong stance on\n\ndata governance as a competitive differentiator to earn and maintain customer trust, ensure sound data\n\nand privacy practices, and protect their data assets\n\n**Why data governance fails**\n\nWhile most people agree that data governance is a set of principles, practices and tooling that helps\n\nmanage the complete lifecycle of your data, what is often not discussed is what constitutes a pragmatic\n\napproach — one that balances realistic policies with automation and scalability.\n\nToo often the policies developed around data governance define very strict data management principles —\n\nfor example, the development of an enterprise-wide ontological model that all data must adhere to.\n\nOrganizations can spend months, if not years, trying to define the perfect set of policies. The engineering\n\neffort to automate the enforcement of the new policies is not prioritized, or takes too long, due to the\n\ncomplexity of the requirements. Meanwhile, data continues to flow through the organization without a\n\nconsistent approach to governance, and too much of the effort is done manually and fraught with human error.\n\n\nWhat are the basic building blocks of a sound data\n\ngovernance approach?\n\n\n-----\n\n**A pragmatic approach to data governance**\n\nAt a high level, organizations should enable the following data management capabilities:\n\n**\u0007Identify all sources of data**\n\n\u0007Identify all data-producing and data-storing applications\n\n\u0007Identify the systems of record (SOR) for each data set\n\n\u0007Label data sets as internal or external (third party)\n\n\u0007Identify where sensitive data is stored — GDPR/CCPA scope\n\n\u0007Limit which operational data stores (ODSs) can re-store SOR data\n\n**\u0007Catalog data sets**\n\n\u0007Register all data sets in a centralized data catalog\n\n\u0007Create a lightweight, self-service data registration process\n\n\u0007Limit manual entry as much as possible\n\n\u0007Record the schema, if any, for the data set\n\n\u0007Use an inference engine or tool to extract the data set schema\n\n\u0007Add business and technical metadata to make it meaningful\n\n\u0007Use machine learning to classify data sets\n\n\u0007Use crowdsourcing to validate the machine-based results\n\n**Track data lineage**\n\n\u0007Track data set flow and what systems act on data\n\n\u0007Create an enumerated list of action values for specific operations\n\n\u0007Emit lineage events via streaming layer and aggregate in data lake lineage event schema:\n\n\n\n\u0007Optional: Add a source code repository URL for action traceability\n\n\n-----\n\n**\u0007Perform data quality checks**\n\n\u0007Create a rules library that is centrally managed and versioned\n\n\u0007Update the rules library periodically with new rules\n\n\u0007Use a combination of checks — null/not null, regex, valid values\n\n\u0007Perform schema enforcement checks against data set registration\n\nBy minimizing the number of copies of your data\n\n\n**\u0007Scan for sensitive data**\n\n\u0007Establish a tokenization strategy for sensitive data — GDPR/CCPA\n\n\u0007Tokenize all sensitive data stored in the data lake — avoid cleartext\n\n\u0007Use fixed-length tokens to preserve analytic value\n\n\u0007Determine the approach for token lookup/resolution when needed\n\n\u0007Ensure that any central token stores are secure with rotating keys\n\n\u0007Identify which data elements from GDPR/CCPA to include in scans\n\n\u0007Efficiently scan for 
sensitive data in cleartext using the rules library\n\n**\u0007Establish approved data flow patterns**\n\n\u0007Determine pathways for data flow (source —> target)\n\n\u0007Limit the ways to get SOR data (APIs, streaming, data lake, etc.)\n\n\u0007Determine read/write patterns for the data lake\n\n\u0007Strictly enforce data flow pathways to/from data lake\n\n\u0007Detect violations and anomalies using lineage event analysis\n\n\u0007Identify offending systems and shut down or grant exception\n\n\u0007Record data flow exceptions and set a remediation deadline\n\n**\u0007Centralize data access controls**\n\n\u0007Establish a common governance model for all data and AI assets\n\n\u0007Centrally define access policies for all data and AI assets\n\n\u0007Enable fine-grained access controls at row and column levels\n\n\u0007Centrally enforce access policies across all workloads — BI, analytics, ML\n\n\nand moving to a single data processing layer where\n\nall your data governance controls can run together,\n\nyou improve your chances of staying in compliance\n\nand detecting a data breach.\n\n\n-----\n\n**\u0007Make data discovery easy**\n\n\u0007Establish a data discovery model\n\n\u0007Use manual or automatic data classification\n\n\u0007Provide a visual interface for data discovery across your data estate\n\n\u0007Simplify data discovery with rich keyword- or business glossary-based search\n\n**\u0007Centralize data access auditing**\n\n\u0007Establish a framework or best practices for access auditing\n\n\u0007Capture audit logs for all CRUD operations performed on data\n\n\u0007Make auditing reports easily accessible to data stewards/admins for ensuring compliance\n\nThis is not intended to be an exhaustive list of features and requirements but rather a framework to\n\nevaluate your data governance approach. There will be violations at runtime, so it will be important to have\n\nprocedures in place for how to handle these violations. In some cases, you may want to be very strict and\n\nshut down the data flow of the offending system. In other cases, you may want to quarantine the data until\n\nthe offending system is fixed. Finally, some SLAs may require the data to flow regardless of a violation. In\n\nthese cases, the receiving systems must have their own methodology for dealing with bad data.\n\n\n-----\n\n**Hidden cost of data governance**\n\nThere are numerous examples of high-profile data breaches and failure to comply with consumer data\n\nprotection legislation. You don’t have to look very far to see reports of substantial fines levied against\n\norganizations that were not able to fully protect the data within their data ecosystem. As organizations\n\nproduce and collect more and more data, it’s important to remember that while storage is cheap, failing\n\nto enforce proper data governance is very, very expensive.\n\nIn order to catalog, lineage trace, quality check, and scan your data effectively, you will need a lot of\n\ncompute power when you consider the massive amounts of data that exist in your organization. Each\n\ntime you copy a piece of data to load it into another tool or platform, you need to determine what data\n\ngovernance techniques exist there and how you ensure that you truly know where all your data resides.\n\nImagine the scenario where data flows through your environment and is loaded into multiple platforms\n\nusing various ETL processes. How do you handle the situation when you discover that sensitive data is\n\nin cleartext? 
Without a consistent set of data governance tools, you may not be able to remediate the\n\nproblem before it’s flagged for violation.\n\nHaving a smaller attack surface and fewer ingress/egress routes helps guard your data and protect your\n\norganization’s brand and balance sheet.\n\nThe bottom line is that the more complex your data ecosystem architecture is, the more difficult and costly\n\nit is to get data governance right.\n\n\n-----\n\n#### 6. Democratize access to quality data\n\nEffective data and AI solutions rely more on the amount of quality data available than on the sophistication\n\nor complexity of the model or algorithm. Google published a paper titled “The Unreasonable Effectiveness of\n\nData” demonstrating this point. The takeaway is that organizations should focus their efforts on making sure\n\ndata scientists have access to the widest selection of relevant and high-quality data to perform their jobs —\n\nwhich is to create new opportunities for revenue growth, cost reduction and risk reduction.\n\n**The 80/20 data science dilemma**\n\nMost existing data environments have their data stored primarily in different operational data stores within a\n\ngiven business unit (BU) — creating several challenges:\n\n\u0007Most business units deploy use cases that are based only on their own data — without taking advantage\n\nof cross-BU opportunities\n\n\u0007The schemas are generally not well understood outside of BU or department — with only the database\n\ndesigners and power users being able to make efficient use of the data. This is referred to as the “tribal\n\nknowledge” phenomenon.\n\n\u0007The approval process and different system-level security models make it difficult and time-consuming\n\nfor data scientists to gain the proper access to the data they need\n\nIn order to perform analysis, users are forced to log in to multiple systems to collect their data. This is most\n\noften done using single-node data science and generates unnecessary copies of data stored on local disk\n\ndrives, various network shares or user-controlled cloud storage. In some cases, the data is copied to “user\n\nspaces” within production platform environments. This has the strong potential of degrading the overall\n\nperformance for true production workloads.\n\nTo make matters worse, these copies of data are generally much smaller than the full-size data sets that would\n\nbe needed in order to get the best model performance for your ML and AI workloads.\n\n\n-----\n\nSmall data sets reduce the effectiveness of exploration, experimentation, model development and model\n\ntraining — resulting in inaccurate models when deployed into production and used with full-size data sets.\n\nAs a result, data science teams are spending 80% of their time wrangling data sets and only 20% of their\n\ntime performing analytic work — work that may need to be redone once they have access to the full-size\n\ndata sets. This is a serious problem for organizations that want to remain competitive and generate game-\n\nchanging results.\n\nAnother factor contributing to reduced productivity is the way in which end users are typically granted\n\naccess to data. 
Security policies usually require both coarse-grained and fine-grained data protections.\n\nIn other words, granting access at a data set level but limiting access to specific rows and columns (fine-\n\ngrained) within the data set.\n\n**Rationalize data access roles**\n\nThe most common approach to providing coarse-grained and fine-grained access is to use what’s known\n\nas role-based access control (RBAC). Individual users log on to system-level accounts or via a single sign-on\n\n(SSO) authentication and access control solution.\n\nUsers can access data by being added to one or more Lightweight Directory Access Protocol (LDAP) groups.\n\nThere are different strategies for identifying and creating these groups — but typically, they are done on a\n\nsystem-by-system basis, with a 1:1 mapping for each coarse- and fine-grained access control combination.\n\nThis approach to data access usually produces a proliferation of user groups. It is not unusual to see several\n\nthousand discrete security groups for large organizations — despite having a much smaller number of\n\ndefined job functions.\n\nThis approach creates one of the biggest security challenges in large organizations. When personnel leave\n\nthe company, it is fairly straightforward to remove them from the various security groups. However, when\n\npersonnel move around within the organization, their old security group assignments often remain intact\n\nand new ones are assigned based on their new job function. This leads to personnel continuing to have\n\naccess to data that they no longer have a “need to know.”\n\n\nThe Databricks Lakehouse Platform brings together\n\nall the data and AI personas into one environment\n\nand makes it easy to collaborate, share code and\n\ninsights, and operate against the same view of data.\n\n\n-----\n\n**Data classification**\n\nHaving all your data sets stored in a single, well-managed data lake gives you the ability to use partition\n\nstrategies to segment your data based on “need to know.” Some organizations create a partition based\n\non which business unit owns the data and which one owns the data classification. For example, in a\n\nfinancial services company, credit card customers’ data could be stored separately from that of debit card\n\ncustomers, and access to GDPR/CCPA-related fields could be handled using classification labels.\n\nThe simplest approach to data classification is to use three labels:\n\n\u0007 **Public data:** Data that can be freely disclosed to the public. This would include your annual report, press\n\nreleases, etc.\n\n\u0007 **Internal data:** Data that has low security requirements but should not be shared with the public or\n\ncompetitors. This would include strategy briefings and market or customer segmentation research.\n\n\u0007 **Restricted data:** Highly sensitive data regarding customers or internal business operations. Disclosure\n\ncould negatively affect operations and put the organization at financial or legal risk. Restricted data\n\nrequires the highest level of security protection.\n\nSome organizations introduce additional labels, but care should be taken to make sure that everyone clearly\n\nunderstands how to apply them.\n\nThe data classification requirements should be clearly documented and mapped to any legal or regulatory\n\nrequirements. 
For example, CCPA is so sweeping that it includes 11 categories of personal information —\n\nand defines “personal information” as “information that identifies, relates to, describes, is capable of\n\nbeing associated with, or could reasonably be linked, directly or indirectly, with a particular consumer or\n\nhousehold.”\n\n\n-----\n\nJust examining one CCPA category, _Customer Records Information_ , we see that the following information is\n\nto be protected: name, signature, social security number, physical characteristics or description, address,\n\ntelephone number, passport number, driver’s license or state identification card number, insurance policy\n\nnumber, education, employment, employment history, bank account number, credit or debit card number,\n\nother financial information, medical information, and health insurance information.\n\nThere are generally three different approaches in industry to performing data classification:\n\n**1. \u0007Content-based:** Scans or inspects and interprets files to find sensitive information. This is generally\n\ndone using regular expressions and lookup tables to map values to actual entities stored inside the\n\norganization (e.g., customer SSN).\n\n**2. \u0007Context-based:** Evaluates the source of the data (e.g., application, location or creator) to determine\n\nthe sensitivity of the data.\n\n**3. \u0007User-based:** Relies on a manual, end-user selection of each data set or element and requires expert\n\ndomain knowledge to ensure accuracy.\n\nTaking all this into account, an organization could implement a streamlined set of roles for RBAC that\n\nuses the convention where “domain” might be the\n\nbusiness unit within an organization, “entity” is the noun that the role is valid for, “data set” or “data asset” is\n\nthe ID, and “classification” is one of the three values (public, internal, restricted).\n\nThere is a “deny all default” policy that does not allow access to any data unless there is a corresponding\n\nrole assignment. Wild cards can be used to grant access to eliminate the need to enumerate every\n\ncombination.\n\n\n-----\n\nFor example, gives a user or a system access to all the\n\ndata fields that describe a credit card transaction for a customer, including the 16-digit credit card number.\n\nWhereas would allow the user or system\n\naccess only to nonsensitive data regarding the transaction.\n\nThis gives organizations the chance to rationalize their security groups by using a domain naming\n\nconvention to provide coarse-grained and fine-grained access without the need for creating tons of LDAP\n\ngroups. It also dramatically eases the administration of granting access to data for a given user.\n\n**Everyone working from the same view of data**\n\nThe modern data stack, when combined with a simplified security group approach and a robust data\n\ngovernance methodology, gives organizations an opportunity to rethink how data is accessed — and greatly\n\nimproves time to market for their analytic use cases. All analytic workloads can now operate from a single,\n\nshared view of your data.\n\nCombining this with a sensitive data tokenization strategy can make it straightforward to empower data\n\nscientists to do their job and shift the 80/20 ratio in their favor. 
It’s now easier to work with full-size data\n\nsets that both obfuscate NPI/PII information and preserve analytic value.\n\nNow, data discovery is easier because data sets have been registered in the catalog with full descriptions\n\nand business metadata — with some organizations going as far as showing realistic sample data for a\n\nparticular data set. If a user does not have access to the underlying data files, having data in one physical\n\nlocation eases the burden of granting access, and then it’s easier to deploy access-control policies and\n\ncollect/analyze audit logs to monitor data usage and to look for bad actors.\n\n\nAdopting the Databricks Lakehouse Platform allows\n\nyou to add data sets into a well-managed data lake\n\nusing low-cost object stores, and makes it easy to\n\npartition data based on domain, entity, data set and\n\nclassification levels to provide fine-grained (row-\n\nlevel and column-level) security.\n\n\n-----\n\n**Data security, validation and curation — in one place**\n\nThe modern data architecture using Databricks Lakehouse makes it easy to take a consistent approach to\n\nprotecting, validating and improving your organization’s data. Data governance policies can be enforced\n\nusing the built-in features of schema validation, expectations and pipelines — the three main steps to data\n\ncuration. Databricks enables moving data through well-defined states: Raw —> Refined —> Curated or, as we\n\nrefer to it at Databricks, Bronze —> Silver —> Gold.\n\nThe raw data is known as “Bronze-level” data and serves as the landing zone for all your important analytic\n\ndata. Bronze data functions as the starting point for a series of curation steps that filter, clean and augment\n\nthe data for use by downstream systems. The first major refinement results in data being stored in “Silver-\n\nlevel” tables within the data lake. These tables carry all the benefits of the Delta Lake product — for example,\n\nACID transactions and time travel. The final step in the process is to produce business-level aggregates, or\n\n“Gold-level” tables, that combine data sets from across the organization. It’s a set of data used to improve\n\ncustomer service across the full line of products, perform GDPR/CCPA reporting or look for opportunities to\n\ncross-sell to increase customer retention. For the first time, organizations can truly optimize data curation\n\nand ETL — eliminating unnecessary copies of data and the duplication of effort that often happens in ETL\n\njobs with legacy data ecosystems. This “solve once, access many times” approach speeds time to market,\n\nimproves the user experience and helps retain talent.\n\n**Extend the impact of your data with secure data sharing**\n\nData sharing is crucial to drive business value in today’s digital economy. More and more organizations\n\nare now looking to securely share trusted data with their partners/suppliers, internal lines of business or\n\ncustomers to drive collaboration, improve internal efficiency and generate new revenue streams with data\n\nmonetization. Additionally, organizations are interested in leveraging external data to drive new product\n\ninnovations and services.\n\nBusiness executives must establish and promote a data sharing culture in their organizations to build\n\ncompetitive advantage.\n\n\n-----\n\n#### 7. 
Dramatically increase productivity of your workforce\n\nNow that you have deployed a modern data stack and have landed all your analytical data in a well-managed data lake with a rationalized approach to access control, the next question is, “What tools should I provide to the user community so they can be most effective at using the new data ecosystem?”\n\n**Design thinking: working backward from the user experience**\n\nDesign thinking is a human-centered approach to innovation — focused on understanding customer needs, rapid prototyping and generating creative ideas — that will transform the way you develop products, services, processes and organizations. Design thinking was introduced as a technique to not only improve but also bring joy to the way people work. The essence of design thinking is to determine what motivates people to do their job, where their current pain points are and what could be improved to make their jobs enjoyable.\n\n**Moving beyond best of breed**\n\nIf you look across a large enterprise, you will find no shortage of database design, ETL, data cleansing, model training and model deployment tools. Many organizations take a “best of breed” approach in providing tooling for their end users. This typically occurs because leaders genuinely want to empower business units, departments and teams to select the tool that best suits their specific needs — so-called federated tool selection. Data science tooling, in particular, tends not to be procured at the “enterprise” level at first — given the high cost of rolling it out to the entire user population.\n\n\n-----\n\nWhen tool selection becomes localized, there are a few things to consider:\n\n\u0007Tools are generally thought of as discrete components within an ecosystem and, therefore, interchangeable with criteria that are established within a specific tool category. The tool with the best overall score gets selected\n\n\u0007The selection criteria for a tool usually contain a subjective list of “must-have” features based on personal preference or adoption within a department, or because a given tool is better suited to support a current business process\n\n\u0007Discrete tools tend to leapfrog one another and add features based on market demand rather quickly\n\n\u0007Evaluations that are performed over many months likely become outdated by the time the tool has moved into production\n\n\u0007The “enterprise” requirements are often limited to ensuring that the tool fits into the overall architecture and security environment but nothing more\n\n\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number of tools in play or streamlining the user experience\n\n\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the partnership model, the ability to influence the roadmap and professional services support\n\nFor these reasons and more, it’s worth considering an architecture and procurement strategy that centers on selecting a data platform that enables seamless integration with point solutions rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\n\n\n-----\n\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how data processing, validation and curation should work. It’s the integration between the discrete functions of the platform that saves time, conserves effort and improves the user experience. Many companies try to take on the integration of different technology stacks, which increases risk, cost and complexity. The consequences of not doing the integration properly can be serious — in terms of security, compliance, efficiency, cost, etc.\n\nSo, find a vendor that you can develop a true partnership with — one that is more likely to take feedback and incorporate your requirements into their platform product roadmap. This will require some give-and-take from both parties — sometimes calling for an organization to adjust their processes to better fit how the platform works. There are many instances where a given business process could be simplified or recast to work with the platform as is. Sometimes it will require the vendor to add features that support your processes. The vendor will always be market driven and will want to build features in such a way that they apply to the broadest set of customers.\n\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given tool. Users must make a significant investment to learn how the tool works and the most efficient way of performing their job. The more discrete tools in an environment, the more challenging this becomes. Minimizing the number of tools and their different interfaces, styles of interaction and approach to security and collaboration helps improve the user experience and decreases time to market.\n\n\nDatabricks is a leading data and AI company — partly due to the innovations in the [open source software](https://databricks.com/product/open-source) that runs our platform — and as a result of listening to the needs of thousands of customers and having our engineers work side by side with customer teams to deliver real business value using data and AI.\n\n\n-----\n\n**Unified platform, unified personas**\n\nDeploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern data stack — will provide an integrated suite of tools for the full range of personas in your organization, including business analysts, SQL developers, data engineers and data scientists. You will immediately increase productivity and reduce risk because you’ll be better able to share the key aspects of data pipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development and deployment. All the work streams function off a single view of the data, and the handoffs between subsystems are well managed.\n\nData processing happens in one auditable environment, and the number of copies of data is kept to an absolute minimum — with each user benefiting from the data assets created by others. Redundant work is eliminated.\n\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they are now able to spend more time working with rather than collecting the data. It’s difficult to decide which algorithm will work best — shifting the 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed differently — for example, changing a string to an integer. 
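A rough sketch of how a downstream consumer might detect that kind of change before it breaks a pipeline, assuming a Spark environment; the table and column names are hypothetical:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Schema the downstream consumer was built against (hypothetical columns)
expected = {"customer_id": "string", "order_total": "double", "order_ts": "timestamp"}

# Schema as it exists right now in the lake (hypothetical table name)
actual = {f.name: f.dataType.simpleString() for f in spark.table("sales.orders").schema.fields}

added = set(actual) - set(expected)
retyped = {c for c in expected if c in actual and actual[c] != expected[c]}

if added or retyped:
    print(f"Schema drift detected - new columns: {sorted(added)}, retyped columns: {sorted(retyped)}")
```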
This has a cascading effect, and the downstream\n\nconsumers must be able to adjust by monitoring the execution and detecting the changes. The data\n\nscientist, in turn, must update and test new models on the new data. Your data platform should make the\n\ndetection and remediation easier, not harder.\n\nFor the data engineers, their primary focus is extracting data from source systems and moving it into the\n\nnew data ecosystem. The data pipeline function can be simplified with a unified data platform because\n\nthe programming model and APIs are consistent across programming languages (e.g., Scala, Python). This\n\nresults in improved operations and maintenance (O&M). The runtime environment is easier to troubleshoot\n\nand debug since the compute layer is consistent, and the logging and auditing associated with the data\n\nprocessing and data management is centralized and of more value.\n\n\n-----\n\n**Maximize the productivity of your workforce**\n\nOnce you have a data platform that brings together your full range of personas, you should focus on the\n\nnext step for increasing productivity — namely, self-service environments.\n\nIn large organizations, there needs to be a strategy for how solutions are promoted up through the runtime\n\nenvironments for development, testing and production. These environments need to be nearly identical to\n\none another — using the same version of software while limiting the number, size and horsepower of the\n\ncompute nodes. To the extent possible, development and test should be performed with realistic test/\n\nsynthetic data. One strategy to support this is to tap into the flow of production data and siphon off a small\n\npercentage that is then changed in randomized fashion — obfuscating the real data but keeping the same\n\ngeneral shape and range of values.\n\nThe **DEV** environment should be accessible to everyone without any organizational red tape. The DEV\n\nenvironments should be small and controlled with policies that spin them up and tear them down efficiently.\n\nEvery aspect of the DEV infrastructure should be treated as ephemeral. Nothing should exist in the\n\nenvironment that cannot be destroyed and easily rebuilt.\n\nThe **TEST** environment should mimic the PROD environment as much as possible, including the monitoring\n\ntools — within obvious cost/budget constraints. The use of the TEST environment can be requested by\n\nthe developers, but the process is governed using a workflow/sign-off approval approach — signed off by\n\nmanagement.\n\nMoving to **PROD** is the final step, and there usually is a “separation of duties” that is required so that\n\ndevelopers cannot randomly promote software to run in production. Again, this process should be\n\nstrictly governed using a workflow/sign-off approval approach — signed off by management as well.\n\nMany organizations fully automate the steps, except the sign-offs, and support the notion of continuous\n\ndeployments.\n\n\n**DEV** **TEST**\n\n**PROD**\n\n\n-----\n\n#### 8. Make informed build vs. buy decisions\n\nA key piece of the strategy will involve the decision around which components of the data ecosystem are\n\nbuilt by the in-house engineering team and which components are purchased through a vendor relationship.\n\nThere is increased emphasis within engineering teams on taking a “builder” approach. 
In other words, the\n\nengineering teams prefer to develop their own solutions in-house rather than rely on vendor products.\n\n**Competitive advantage**\n\nThis “roll your own’’ approach has some advantages — including being able to establish the overall product\n\nvision, prioritize features and directly allocate the resources to build the software. However, it is important to\n\nkeep in mind which aspects of your development effort give you the most competitive advantage.\n\nSpend some time working with the data transformation steering committee and other stakeholders to\n\ndebate the pros and cons of building out various pieces of the data ecosystem. The primary factor should\n\ncome down to whether or not a given solution offers true competitive advantage for the organization. Does\n\nbuilding this piece of software make it harder for your competitors to compete with you? If the answer is no,\n\nthen it is better to focus your engineering and data science resources on deriving insights from your data.\n\n**Beware: becoming your own software vendor**\n\nAs many engineering leaders know, building your own software is an exciting challenge. However, it does\n\ncome with added responsibility — namely, managing the overall project timeline and costs, and being\n\nresponsible for the design, implementation, testing, documentation, training, and ongoing maintenance and\n\nupdates. You basically are becoming your own software vendor for every component of the ecosystem\n\nthat you build yourself. When you consider the cost of a standard-sized team, it is not uncommon to spend\n\nseveral million dollars per year building out individual component parts of the new data system. This doesn’t\n\ninclude the cost to operate and maintain the software once it is in production.\n\n\n-----\n\nTo offset the anticipated development costs, engineering teams will oftentimes make the argument that\n\nthey are starting with open source software and extending it to meet the “unique requirements” of your\n\norganization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n\nunique and b) the development offers the competitive advantage that you need.\n\nEven software built on top of open source still requires significant investment in integration and testing.\n\nThe integration work is particularly challenging because of the large number of open source libraries that\n\nare required in the data science space. The question becomes, “Is this really the area that you want your\n\nengineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n\n**How long will it take? Can the organization afford to wait?**\n\nEven if you decide the software component provides a competitive advantage and is something worth\n\nbuilding in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\n\ntake longer and cost more money than initially planned.\n\nThe organization should understand the impact to the overall performance and capabilities of the daily\n\necosystem for any features tied to the in-house development effort. Your business partners likely do\n\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n\nis reliable and is delivered on time. 
Carefully weigh the trade-offs among competitive advantage, cost,\n\nfeatures and schedule.\n\n\nDatabricks is built on top of popular open source\n\nsoftware that it created. Engineering teams can\n\nimprove the underpinnings of the Databricks\n\nplatform by submitting code via pull request and\n\nbecoming committers to the projects. The benefit\n\nto organizations is that their engineers contribute\n\nto the feature set of the data platform while\n\nDatabricks remains responsible for all integration\n\nand performance testing plus all the runtime\n\nsupport, including failover and disaster recovery.\n\n\n-----\n\n**Don’t forget about the data**\n\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\n\n“data assets” consumable to the end users or systems. Data insights, model training and model execution\n\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n\nsets from multiple lines of business or departments. Focusing your data engineering and data science\n\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n\ncreating true competitive advantage.\n\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n\nserve up data for analysis should not be underestimated. The value of this work is equally important to\n\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\n\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n\nengineers innovate on components that don’t bring true competitive advantage.\n\n\n-----\n\n#### 9. Allocate, monitor and optimize costs\n\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n\nand increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\n\ncould be easily shared and reused by other members of the team. The more the team used the unified\n\nplatform, the more they collaborated and their level of expertise increased.\n\n**Reduce complexity, reduce costs**\n\nThe architectures of enterprise data warehouses (EDWs) and data lakes were either more limited or\n\nmore complex — resulting in increased time to market and increased costs. This was mainly due to the\n\nrequirement to perform ETL to explore data in the EDW or the need to split data using multiple pipelines\n\nfor the data lake. The data lakehouse architecture simplifies the cost allocation because all the processing,\n\nserving and analytics are performed in a single compute layer.\n\nOrganizations can rightsize the data environments and control costs using policies. 
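One hedged sketch of what such a policy can look like, using the Databricks SDK for Python: the policy below caps autoscaling and forces auto-termination for DEV clusters. The policy name, tag value and limits are illustrative assumptions, not prescriptive settings.

```python
import json
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # assumes workspace credentials are already configured

# Illustrative policy: small, short-lived clusters tagged for the DEV environment
policy_definition = {
    "autoscale.max_workers": {"type": "range", "maxValue": 4},
    "autotermination_minutes": {"type": "fixed", "value": 30, "hidden": True},
    "custom_tags.environment": {"type": "fixed", "value": "DEV"},
}

w.cluster_policies.create(
    name="dev-small-clusters",
    definition=json.dumps(policy_definition),
)
```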
The centralized and consistent approach to security, auditing and monitoring makes it easier to spot inefficiencies and bottlenecks in the data ecosystem. Performance improvements can be gained quickly as more platform expertise is developed within the workforce.\n\n\nThe Databricks platform optimizes costs for your data and AI workloads by intelligently provisioning infrastructure only as you need it. Customers can establish policies that govern the size of clusters based on DEV, TEST, PROD environments or anticipated workloads.\n\n\n-----\n\n**Centralized funding model**\n\nAs previously mentioned, data transformation initiatives require substantial funding. Centralizing the budget under the CDO provides consistency and visibility into how funds are allocated and spent — increasing the likelihood of a positive ROI. Funding at the beginning of the initiative will be significantly higher than the funding in the out-years. It’s not uncommon to see 3- to 5-year project plans for larger organizations. Funding for years 1 and 2 is often reduced in years 3 and 4 and further reduced in year 5 — until it reaches a steady state that is more sustainable.\n\nThe budget takes into account the cost of the data engineering function, commercial software licenses and building out the center of excellence to accelerate the data science capabilities of the organization. Again, the CDO must partner closely with the CIO and the enterprise architect to make sure that the resources are focused on the overall implementation plan and to make sound build vs. buy decisions.\n\nIt’s common to see the full budget controlled by the CDO, with a significant portion allocated to resources in the CIO’s organization to perform the data engineering tasks. The data science community reports into the CDO and is matrixed into the lines of business in order to better understand the business drivers and the data sets. Finally, investing in data governance cannot wait until the company has suffered from a major regulatory challenge, a data breach or some other serious defense-related problem. CDOs should spend the necessary time to educate leaders throughout the organization on the value of data governance.\n\n\nDatabricks monitors and records usage and allows organizations to easily track costs on a data and AI workload basis. This provides the ability to implement an enterprise-wide chargeback model and put in place appropriate spending limits.\n\n\n-----\n\n**Chargeback models**\n\nTo establish the centralized budget to fund the data transformation initiative, some organizations impose a “tax” on each part of the organization — based on size as well as profit and loss. This base-level funding should be used to build the data engineering and data science teams needed to deploy the building blocks of the new data ecosystem. However, as different teams, departments and business units begin using the new data ecosystem, the infrastructure costs, both compute and storage, will begin to grow. The costs will not be evenly distributed, due to different levels of usage from the various parts of the organization. The groups with the heavier usage should obviously cover their pro rata share of the costs. This requires the ability to monitor and track usage — not only based on compute but also on the amount of data generated and consumed. 
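A minimal sketch of what that usage tracking can look like on Databricks, assuming Unity Catalog system billing tables are enabled in the account; the `cost_center` tag is a hypothetical tag applied by cluster policies, and the system table schema may differ by platform version:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Aggregate consumption per cost-center tag and SKU for the current month
usage_by_team = spark.sql("""
    SELECT
      usage_date,
      sku_name,
      custom_tags['cost_center'] AS cost_center,
      SUM(usage_quantity)        AS dbus_consumed
    FROM system.billing.usage
    WHERE usage_date >= date_trunc('MONTH', current_date())
    GROUP BY usage_date, sku_name, custom_tags['cost_center']
""")

usage_by_team.show()
```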
This so-called chargeback model is an effective and fair way to cover the cost deltas over\n\nand above the base-level funding.\n\nPlus, not all the departments or lines of business will require the same level of compute power or fault\n\ntolerance. The architecture should support the ability to separate out the runtime portions of the data\n\necosystem and isolate the workloads based on the specific SLAs for the use cases in each environment.\n\nSome workloads cannot fail and their SLAs will require full redundancy, thus increasing the number of\n\nnodes in the cluster or even requiring multiple clusters operating in different cloud regions. In contrast, less\n\ncritical workloads that can fail and be restarted can run on less costly infrastructure. This makes it easier to\n\nbetter manage the ecosystem by avoiding a one-size-fits-all approach and allocating costs to where the\n\nperformance is needed most.\n\n\n-----\n\n#### 10. Move to production and scale adoption\n\nNow that you’ve completed the hard work outlined in the first nine steps, it is time to put the new data\n\necosystem to use. In order to get truly game-changing results, organizations must be really disciplined at\n\nmanaging and using data to enable use cases that drive business value. They must also establish a clear\n\nset of metrics to measure adoption and track the net promoter score (NPS) so that the user experience\n\ncontinues to improve over time.\n\n**If you build it, they will come**\n\nKeep in mind that your business partners are likely the ones to do the heavy lifting when it comes to data\n\nset registration. Without a robust set of relevant, quality data to use, the data ecosystem will be useless.\n\nA high level of automation for the registration process is important because it’s not uncommon to see\n\nthousands of data sets in large organizations. The business and technical metadata plus the data quality\n\nrules will help guarantee that the data lake is filled with consumable data. The lineage solution should\n\nprovide a visualization that shows the data movement and verifies that the approved data flow paths are\n\nbeing followed.\n\nSome key metrics to keep an eye on are:\n\n\u0007Percentage of source systems contributing data to the ecosystem\n\n\u0007Percentage of real-time streaming relative to API and batch transfers\n\n\u0007Percentage of registered data sets with full business and technical metadata\n\n\u0007Volume of data written to the data lake\n\n\u0007Percentage of raw data that enters a data curation pipeline\n\n\u0007Volume of data consumed from the data lake\n\n\u0007Number of tables defined and populated with curated data\n\n\u0007Number of models trained with data from the data lake\n\n\u0007Lineage reports and anomaly detection incidents\n\n\u0007Number of users running Python, SQL, Scala and R workloads\n\n\nIn 2018, Databricks released MLflow — an open\n\nsource platform to manage the ML lifecycle,\n\nincluding experimentation, reproducibility,\n\ndeployment and a central model registry. MLflow\n\nis included in the Databricks Lakehouse Platform\n\nand accelerates the adoption of machine learning\n\nand AI in organizations.\n\n\n-----\n\n**Communication plan**\n\nCommunication is critical throughout the data transformation initiative — however, it is particularly\n\nimportant once you move into production. 
Time is precious and you want to avoid rework, if at all possible.\n\nOrganizations often overlook the emotional and cultural toll that a long transformation process takes on\n\nthe workforce. The seam between the legacy environment and the new data ecosystem is an expensive\n\nand exhausting place to be — because your business partners are busy supporting two data worlds. Most\n\nusers just want to know when the new environment will be ready. They don’t want to work with partially\n\ncompleted features, especially while performing double duty.\n\nEstablish a solid communication plan and set expectations for when features will come online. Make sure\n\nthere is detailed documentation, training and a support/help desk to field users’ questions.\n\n**DevOps — software development + IT operations**\n\nMature organizations develop a series of processes and standards for how software and data are developed,\n\nmanaged and delivered. The term “DevOps” comes from the software engineering world and refers to\n\ndeveloping and operating large-scale software systems. DevOps defines how an organization, its developers,\n\noperations staff and other stakeholders establish the goal of delivering quality software reliably and\n\nrepeatedly. In short, DevOps is a culture that consists of two practices: continuous integration (CI) and\n\ncontinuous delivery (CD).\n\nThe CI portion of the process is the practice of frequently integrating newly written or changed code\n\nwith the existing code repository. As software is written, it is continuously saved back to the source code\n\nrepository, merged with other changes, built, integrated and tested — and this should occur frequently\n\nenough that the window between commit and build is narrow enough that no errors can occur without\n\ndevelopers noticing them and correcting them immediately.\n\nThis is particularly important for large, distributed teams to ensure that the software is always in a working\n\nstate — despite the frequent changes from various developers. Only software that passes the CI steps is\n\ndeployed — resulting in shortened development cycles, increased deployment velocity and the creation of\n\ndependable releases.\n\n\nSoftware development IT operations\n\n\n-----\n\n**DataOps — data processing + IT operations**\n\nDataOps is a relatively new focus area for the data engineering and data science communities. Its goal is to\n\nuse the well-established processes from DevOps to consistently and reliably improve the quality of data\n\nused to power data and AI use cases. DataOps automates and streamlines the lifecycle management tasks\n\nneeded for large volumes of data — basically, ensuring that the volume, velocity, variety and veracity of the\n\ndata are taken into account as data flows through the environment. DataOps aims to reduce the end-to-\n\nend cycle time of data analytics — from idea, to exploration, to visualizations and to the creation of new\n\ndata sets, data assets and models that create value.\n\nFor DataOps to be effective, it must encourage collaboration, innovation and reuse among the stakeholders,\n\nand the data tooling should be designed to support the workflow and make all aspects of data curation and\n\nETL more efficient.\n\n**MLOps — machine learning + IT operations**\n\nNot surprisingly, the term “MLOps” takes the DevOps approach and applies it to the machine learning and\n\ndeep learning space — automating or streamlining the core workflow for data scientists. 
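The MLflow tracking API mentioned earlier is one way to cover the logging and reproducibility aspects of this workflow. A minimal, hedged sketch; the experiment path, parameter names and metric value are placeholders:

```python
import mlflow

mlflow.set_experiment("/Shared/churn-model")  # hypothetical experiment path

with mlflow.start_run(run_name="baseline"):
    # Record the knobs and data version used for this iteration
    mlflow.log_param("algorithm", "gradient_boosting")
    mlflow.log_param("training_data_version", "2022-05-01")

    # ... train the model here ...

    # Record the results so the iteration is reproducible and comparable
    mlflow.log_metric("auc", 0.87)
```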
MLOps is a bit unique when compared with DevOps and DataOps because the approach to deploying effective machine learning models is far more iterative and requires much more experimentation — data scientists try different features, parameters and models in a tight iteration cycle. In all these iterations, they must manage the code base, understand the data used to perform the training and create reproducible results. The logging aspect of the ML development lifecycle is critical.\n\nMLOps aims to manage deployment of machine learning and deep learning models in large-scale production environments while also focusing on business and regulatory requirements. The ideal MLOps environment would include data science tools where models are constructed and analytical engines where computations are performed.\n\n\n-----\n\nThe overall workflow for deploying production ML models is shown in Figure 10.\n\nUnlike most software applications that execute a series of discrete operations, ML platforms are not deterministic and are highly dependent on the statistical profile of the data they use. ML platforms can suffer performance degradation of the system due to changing data profiles. Therefore, the model has to be refreshed even if it currently “works” — leading to more iterations of the ML workflow. The ML platform should natively support this style of iterative data science.\n\n**Ethics in AI**\n\nAs more organizations deploy data and AI solutions, there is growing concern around a number of issues related to ethics — in particular, how do you ensure the data and algorithms used to make decisions are fair and ethical, and that the outcomes have the appropriate impact on the target audience? Organizations must ensure that the “black box” algorithms that produce results have the transparency, interpretability and explainability to satisfy legal and regulatory safeguards.\n\nThe vast majority of AI work still involves software development by human beings and the use of curated data sets. There is the obvious potential for bias and the application of AI in domains that are ethically questionable. CDOs are faced with the added challenge of needing to be able to defend the use of AI, explain how it works and describe the impact of its existence on the target audience — whether internal workers or customers.\n\n\n**Figure 10:** Workflow for deploying production ML models — data extraction, data preparation, data analysis, model training, model evaluation, model serving and execution, and model monitoring\n\n\n-----\n\n**Data and AI Maturity Model**\n\nWhen data and AI become part of the fabric of the company and the stakeholders in the organization adopt a data asset and AI mindset, the company moves further along a well-defined maturity curve, as shown in Figure 11.\n\n**Figure 11:** The Data and AI Maturity Model — **Top-Line Categories and Ranking Criteria** , from low maturity/value to high maturity/value:\n\n**1. Explore:** Organization is beginning to explore big data and AI, and understand the possibilities and potential of a few starter projects and experiment\n\n**2. Experiment:** Organization builds the basic capabilities and foundations to begin exploring a more expansive data and AI strategy, but it lacks vision, long-term objectives or leadership buy-in\n\n**3. Formalize:** Data and AI are budding into drivers of value for BUs aligned to specific projects and initiatives as the core tenets of data and AI are integrated into corporate strategy\n\n**4. Optimize:** Data and AI are core drivers of value across the organization, structured and central to corporate strategy, with a scalable architecture that meets business needs and buy-in from across the organization\n\n**5. Transform:** Data and AI are at the heart of the corporate strategy and are invaluable differentiators and drivers of competitive advantage\n\n\nDatabricks partners with its customers to enable them to do an internal self-assessment. The output of the self-assessment allows organizations to:\n\n\u0007Understand the current state of their journey to data and AI maturity\n\n\u0007Identify key gaps in realizing (more) value from data and AI\n\n\u0007Plot a path to increase maturity with specific actions\n\n\u0007Identify Databricks resources who can help support their journey\n\n\n-----\n\n**CHAPTER 3:**\n## Conclusion\n\n\nAfter a decade in which most enterprises took a hybrid approach to their data architecture — and struggled with the complexity, cost and compromise that come with supporting both data warehouses and data lakes — the lakehouse paradigm represents a breakthrough. Choosing the right modern data stack will be critical to future-proofing your investment and enabling data and AI at scale. The simple, open and multicloud architecture of the Databricks Lakehouse Platform delivers the simplicity and scalability you need to unleash the power of your data teams to collaborate like never before — in real time, with all their data, for every use case.\n\nFor more information, please visit [Databricks](https://databricks.com/solutions/roles/data-leaders) or [contact us](https://databricks.com/company/contact) .\n\n**A B O U T T H E A U T H O R**\n\nChris D’Agostino is the Global Field CTO at Databricks, having joined the company in January 2020. His role is to provide thought leadership and serve as a trusted advisor to our top customers, globally.\n\nPrior to Databricks, Chris ran a 1,000-person data engineering function for a top 10 U.S. bank. In that role, he led a team that was responsible for building out a modern data architecture that emphasized the key attributes of the lakehouse architecture.\n\nChris has also held leadership roles at a number of technology companies.\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest problems. 
To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf", + "2024-09-19T16:57:23Z" + ], + [ + "### eBook\n\n# A New Approach to Data Sharing\n\n#### Open data sharing and collaboration for data, analytics, and AI\n\n### Second Edition\n\n\n-----\n\n## Contents Introduction — Data Sharing in Today’s Digital Economy 4\n\n**Chapter 1: What Is Data Sharing and Why Is It Important?** **5**\n\nCommon data sharing use cases 6\n\nData monetization 6\n\nData sharing with partners or suppliers (B2B) 6\n\nInternal lines of business (LOBs) sharing 6\n\nKey benefits of data sharing 7\n\n**Chapter 2: Conventional Methods of Data Sharing and Their Challenges** **8**\n\nLegacy and homegrown solutions 9\n\nProprietary vendor solutions 11\n\nCloud object storage 13\n\n**Chapter 3: Delta Sharing — An Open Standard for Secure Sharing of Data Assets** **14**\n\nWhat is Delta Sharing? 14\n\nKey benefits of Delta Sharing 16\n\nMaximizing value of data with Delta Sharing 18\n\nData monetization with Delta Sharing 19\n\nB2B sharing with Delta Sharing 21\n\nInternal data sharing with Delta Sharing 23\n\n**Chapter 4: How Delta Sharing Works** **26**\n\n\n-----\n\n**Chapter 5: Introducing Databricks Marketplace** **28**\n## Contents\n\nWhat is Databricks Marketplace? 30\n\nKey benefits of Databricks Marketplace 30\n\nEnable collaboration and accelerate innovation 32\n\nPowered by a fast, growing ecosystem 32\n\nUse cases for an open marketplace 32\n\nNew upcoming feature: AI model sharing 33\n\n**Chapter 6: Share securely with Databricks Clean Rooms** **34**\n\nWhat is a data clean room? 34\n\nCommon data clean room use cases 36\n\nShortcomings of existing data clean rooms 38\n\nKey benefits of Databricks Clean Rooms 39\n\n**Resources: Getting started with Data Sharing and Collaboration** **40**\n\n**About the Authors** **42**\n\n\n-----\n\n## Introduction\n Data Sharing in Today’s Digital Economy\n\n\nToday’s economy revolves around data. Everyday, more and more\n\norganizations must exchange data with their customers, suppliers\n\nand partners. Security is critical. And yet, efficiency and immediate\n\naccessibility are equally important.\n\nWhere data sharing may have been considered optional, it’s now\n\nrequired. More organizations are investing in streamlining internal\n\nand external data sharing across the value chain. But they still face\n\nmajor roadblocks — from human inhibition to legacy solutions to\n\nvendor lock-in.\n\nTo be truly data-driven, organizations need a better way to share\n\ndata. 
[Gartner predicts that by 2024](https://www.gartner.com/en/documents/3999501) , organizations that promote data sharing will outperform their peers on most business value metrics. In addition, Gartner recently found that Chief Data Officers who have successfully executed data sharing initiatives are 1.7x more effective in showing business value and return on investment from their data analytics strategy.\n\nTo compete in the digital economy, organizations need an open — and secure — approach to data sharing.\n\nThis eBook takes a deep dive into the modern era of data sharing and collaboration, from common use cases and key benefits to conventional approaches and the challenges of those methods. You’ll get an overview of our open approach to data sharing and find out how Databricks allows you to share your data across platforms, to share all your data and AI, and to share all your data securely with unified governance in a privacy-safe way.\n\n\n-----\n\n## Chapter 1\n What Is Data Sharing and Why Is It Important?\n\nData sharing is the ability to make the same data available to one or many stakeholders — both external and internal. Nowadays, the ever-growing amount of data has become a strategic asset for any company. Data sharing — within your organization or externally — is an enabling technology for data commercialization and enhanced analysis. Sharing data as well as consuming data from external sources allows companies to collaborate with partners, establish new partnerships and generate new revenue streams with data monetization. Data sharing can deliver benefits to business groups across the enterprise. For those business groups, data sharing can enable access to data needed to make critical decisions. This includes but is not limited to roles such as the data analyst, data scientist and data engineer.\n\n\n-----\n\n#### Common data sharing use cases\n\n\n#### Data monetization\n\nCompanies across industries are commercializing data. Large multinational organizations have formed exclusively to monetize data, while other organizations are looking for ways to monetize their data and generate additional revenue streams. Examples of these companies can range from an agency with an identity graph to a telecommunication company with proprietary 5G data or to retailers that have a unique ability to combine online and offline data. Data vendors are growing in importance as companies realize they need external data for better decision-making.\n\n\n#### Data sharing with partners or suppliers (B2B)\n\nMany companies now strive to share data with partners and suppliers as similarly as they share it across their own organizations. For example, retailers and their suppliers continue to work more closely together as they seek to keep their products moving in an era of ever-changing consumer tastes. Retailers can keep suppliers posted by sharing sales data by SKU in real time, while suppliers can share real-time inventory data with retailers so they know what to expect. Scientific research organizations can make their data available to pharmaceutical companies engaged in drug discovery. 
Public safety\n\nagencies can provide real-time public data feeds\n\nof environmental data, such as climate change\n\nstatistics or updates on potential volcanic eruptions.\n\n\n#### Internal lines of business\n (LOBs) sharing\n\nWithin any company, different departments, lines\n\nof business and subsidiaries seek to share data so\n\neveryone can make decisions based on a complete\n\nview of the current business reality. For example,\n\nfinance and HR departments need to share data\n\nas they analyze the true costs of each employee.\n\nMarketing and sales teams need a common view\n\nof data to determine the effectiveness of recent\n\nmarketing campaigns. And different subsidiaries\n\nof the same company need a unified view of the\n\nhealth of the business. Removing data silos — which\n\nare often established for the important purpose of\n\npreventing unauthorized access to data — is critical\n\nfor digital transformation initiatives and maximizing\n\nthe business value of data.\n\n\n-----\n\n#### Key benefits of data sharing\n\nAs you can see from the use cases described above, there are many benefits of data sharing, including:\n\n\n**Greater collaboration with existing partners.** In today’s hyper-\n\nconnected digital economy, no single organization can advance its\n\nbusiness objectives without partnerships. Data sharing helps solidify\n\nexisting partnerships and can help organizations establish new ones.\n\n\u0007 **Ability to generate new revenue streams.** With data sharing,\n\norganizations can generate new revenue streams by offering data\n\nproducts or data services to their end consumers.\n\n\n**Ease of producing new products, services or business models.**\n\nProduct teams can leverage both first-party data and third-party\n\ndata to refine their products and services and expand their product/\n\nservice catalog.\n\n**Greater efficiency of internal operations.** Teams across the\n\norganization can meet their business goals far more quickly when\n\nthey don’t have to spend time figuring out how to free data from\n\nsilos. When teams have access to live data, there’s no lag time\n\nbetween the need for data and the connection with the appropriate\n\ndata source.\n\n\n-----\n\n## Chapter 2\n Conventional Methods of Data Sharing and Their Challenges\n\nSharing data across different platforms, companies and clouds is no easy task. In the past,\n\norganizations have hesitated to share data more freely because of the perceived lack\n\nof secure technology, competitive concerns and the cost of implementing data sharing\n\nsolutions.\n\nEven for companies that have the budget to implement data sharing technology, many of\n\nthe current approaches can’t keep up with today’s requirements for open-format, multi-\n\ncloud, high-performance solutions. 
Most data sharing solutions are tied to a single vendor,\n\nwhich creates friction for data providers and data consumers who use non-compatible\n\nplatforms.\n\nOver the past 30 years, data sharing solutions have come in three forms: legacy and\n\nhomegrown solutions, cloud object storage and closed source commercial solutions.\n\nEach of these approaches comes with its pros and cons.\n\n\n-----\n\n#### Legacy and homegrown solutions\n\nMany companies have built homegrown data sharing solutions based on legacy\n\ntechnologies such as email, (S)FTP or APIs.\n\n\nProvider\n\nETL\n\n\nConsumer\n\n\nBatch data\nfrom provider\n\n\nTable �\n\nTable 2\n\n\nFTP/SSH/API\nServer\n\n\nFTP/SSH/API ETL Database Analyst Run Analysis\nServer\n\n\n**Figure 1:**\nLegacy data\nsharing solutions\n\n\n**Pros**\n\n\u0007 **Vendor agnostic.** FTP, email and APIs are all well-documented protocols. Data\n\nconsumers can leverage a suite of clients to access data provided to them.\n\n\u0007 **Flexibility.** Many homegrown solutions are built on open source technologies\n\nand will work both on-prem and on clouds.\n\n\n-----\n\n**Cons**\n\n\u0007 **Data movement.** It takes significant effort to extract data from cloud storage, transform\n\nit and host it on an FTP server for different recipients. Additionally, this approach\n\nresults in creating copies of data sets. Data copying causes duplication and prevents\n\norganizations from instantly accessing live data.\n\n\u0007 **Complexity of sharing data.** Homegrown solutions are typically built on complex\n\narchitectures due to replication and provisioning. This can add considerable time to\n\ndata sharing activities and result in out-of-date data for end consumers.\n\n\u0007 **Operational overhead for data recipients.** Data recipients have to extract, transform\n\nand load (ETL) the shared data for their end use cases, which further delays the time to\n\ninsights. For any new data updates from the providers, the consumers have to rerun ETL\n\npipelines again and again.\n\n\u0007 **Security and governance.** As modern data requirements become more stringent,\n\nhomegrown and legacy technologies have become more difficult to secure and govern.\n\n\u0007 **Scalability.** Such solutions are costly to manage and maintain and don’t scale to\n\naccommodate large data sets.\n\n\n-----\n\n#### Proprietary vendor solutions\n\nCommercial data sharing solutions are a popular option among companies that don’t want\n\nto devote the time and resources to building an in-house solution yet also want more\n\ncontrol than what cloud object storage can offer.\n\n\nVendor 1 Platform\n\nProprietary\ndata format\n\n\nVendor V Platform\n\nProprietary\ndata format\n\n\nData Provider 1\n\nData;\nProvider\n\n\nData Provider 1\n\n\nData;\nConsumer\n\nShared data set\n\n\nData;\nProvider\n\nShared dataset\n\n\nData;\nConsumer\n\n\nNo cross-platform\nsharing\n\n\n**Figure 2:**\nProprietary\nvendor solutions\n\n\nShared dataset\n\nShared data set\n\n\nShared data set\n\n\nShared data set\n\n\nSharing limited to recipients\non the same platform\n\nData;\nConsumer\n\n\nData;\nConsumere\n\n\n**Pros**\n\n\u0007 **Simplicity.** Commercial solutions allow users to share data easily with anyone else who uses\n\nthe same platform.\n\n\n-----\n\n**Cons**\n\n\u0007 **Vendor lock-in.** Commercial solutions don’t interop with other platforms well. While\n\ndata sharing is easy among fellow customers, it’s usually impossible with those who\n\nuse competing solutions. 
This reduces the reach of data, resulting in vendor lock-in. Furthermore, platform differences between data providers and recipients introduce data sharing complexities.\n\n\u0007 **Data movement.** Data must be loaded onto the platform, requiring additional ETL and data copies.\n\n\u0007 **Scalability.** Commercial data sharing comes with scaling limits from the vendors.\n\n\u0007 **Cost.** All the above challenges create additional cost for sharing data with potential consumers, as data providers have to replicate data for different recipients on different cloud platforms.\n\n\n-----\n\n#### Cloud object storage\n\nObject storage is considered a good fit for the cloud because it is elastic and can more easily scale into multiple petabytes to support unlimited data growth. The big three cloud providers all offer object storage services (AWS S3, Azure Blob, Google Cloud Storage) that are cheap, scalable and extremely reliable.\n\nAn interesting feature of cloud object storage is the ability to generate signed URLs, which grant time-limited permission to download objects. Anyone who receives the presigned URL can then access the specified objects, making this a convenient way to share data.\n\n**Pros**\n\n\u0007 **Sharing data in place.** Object storage can be shared in place, allowing consumers to access the latest available data.\n\n\u0007 **Scalability.** Cloud object storage profits from availability and durability guarantees that typically cannot be achieved on-premises. Data consumers retrieve data directly from the cloud providers, saving bandwidth for the providers.\n\n**Cons**\n\n\u0007 **Limited to a single cloud provider.** Recipients have to be on the same cloud to access the objects.\n\n\u0007 **Cumbersome security and governance.** Assigning permissions and managing access is complex. Custom application logic is needed to generate signed URLs.\n\n\u0007 **Complexity.** Personas managing data sharing (DBAs, analysts) find it difficult to understand Identity Access Management (IAM) policies and how data is mapped to underlying files. For companies with large volumes of data, sharing via cloud storage is time-consuming, cumbersome and nearly impossible to scale.\n\n\u0007 **Operational overhead for data recipients.** The data recipients have to run extract, transform and load (ETL) pipelines on the raw files before consuming them for their end use cases.\n\nThe lack of a comprehensive solution makes it challenging for data providers and consumers to easily share data. Cumbersome and incomplete data sharing processes also constrain the development of business opportunities from shared data.\n\n\n-----\n\n## Chapter 3\n Delta Sharing — An Open Standard for Secure Sharing of Data Assets\n\n\nWe believe the future of data sharing should be characterized by open technology. Data sharing shouldn’t be tied to a proprietary technology that introduces unnecessary limitations and financial burdens to the process. It should be readily available to anyone who wants to share data at scale. This philosophy inspired us to develop and release a new protocol for sharing data: Delta Sharing.\n\n#### What is Delta Sharing?\n\nDelta Sharing provides an open solution to securely share live data from your lakehouse to any computing platform. Recipients don’t have to be on the Databricks platform or on the same cloud or a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value. 
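To make that concrete, a recipient with nothing more than the open source `delta-sharing` Python connector and a share profile file issued by the provider can list and load shared tables. A minimal sketch; the share, schema and table names below are placeholders:

```python
import delta_sharing

# Profile file issued by the data provider (contains the endpoint and bearer token)
profile = "config.share"

# Discover what has been shared with this recipient
client = delta_sharing.SharingClient(profile)
print(client.list_all_tables())

# Load one shared table directly into pandas, with no copy pipeline required
table_url = f"{profile}#retail_share.sales.orders"
orders_df = delta_sharing.load_as_pandas(table_url)
print(orders_df.head())
```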
Delta Sharing is natively\n\nintegrated with [Unity Catalog](https://databricks.com/product/unity-catalog) , enabling organizations to centrally\n\nmanage and audit shared data across organizations and confidently\n\nshare data assets while meeting security and compliance needs.\n\nWith Delta Sharing, organizations can easily share existing large-\n\nscale data sets based on the open source formats Apache Parquet\n\nand Delta Lake without moving data. Teams gain the flexibility to\n\nquery, visualize, transform, ingest or enrich shared data with their\n\ntools of choice.\n\n\nhave to be on the Databricks platform or on the same cloud or a\n\ncloud at all. Data providers can share live data without replicating\n\nit or moving it to another system. Recipients benefit from always\n\nhaving access to the latest version of data and can quickly query\n\nshared data using tools of their choice for BI, analytics and machine\n\nlearning, reducing time-to-value.\n\n\n-----\n\nData ����i�e�\n\n\nAny u�e cy�e\n\nAnalytics\n\nBI\n\nData Science\n\n\nData Recipient\n\nAny sool\n\nAnd many more\n\n\nAny cloud/on-prem\n\nOn-premises\n\n\nAccess permissions\n\nDelta Sharing Protocol\n\n\nDelta �a�e �a�le Delta Sharing Ser�er\n\n\nNo replication\nEasy to manage\nSecure\n\n\n**Figure 3:**\nDelta Sharing\n\n\nDatabricks designed Delta Sharing with five goals in mind:\n\n\u0007Provide an open cross-platform sharing solution\n\n\u0007Share live data without copying it to another system\n\n\u0007Support a wide range of clients such as Power BI, Tableau, Apache Spark™, pandas and Java, and\n\nprovide flexibility to consume data using the tools of choice for BI, machine learning and AI use cases\n\n\u0007Provide strong security, auditing and governance\n\n\u0007Scale to massive structured data sets and also allow sharing of unstructured data and future data\n\nderivatives such as ML models, dashboards and notebooks, in addition to tabular data\n\n\n-----\n\n#### Key benefits of Delta Sharing\n\nBy eliminating the obstacles and shortcomings associated with typical data sharing\n\napproaches, Delta Sharing delivers several key benefits, including:\n\n\n**Open cross-platform sharing.** Delta Sharing establishes a new\n\nopen standard for secure data sharing and supports open source\n\nDelta and Apache Parquet formats. Data recipients don’t have to be\n\non the Databricks platform or on the same cloud, as Delta Sharing\n\nworks across clouds and even from cloud to on-premises setups. To\n\ngive customers even greater flexibility, Databricks has also released\n\nopen source connectors for pandas, Apache Spark, Elixir and\n\nPython, and is working with partners on many more.\n\n\u0007 **Securely share live data without replication.** Most enterprise\n\n\n**Centralized governance.** With Databricks Delta Sharing, data\n\nproviders can grant, track, audit and even revoke access to shared\n\ndata sets from a single point of enforcement to meet compliance and\n\nother regulatory requirements. Databricks Delta Sharing users get:\n\n\u0007Implementation of Delta Sharing as part of Unity Catalog, the\n\ngovernance offering for Databricks Lakehouse\n\n\u0007Simple, more secure setup and management of shares\n\n\u0007The ability to create and manage recipients and data shares\n\n\u0007Audit logging captured automatically as part of Unity Catalog\n\n\u0007Direct integration with the rest of the Databricks ecosystem\n\n\u0007No separate compute for providing and managing shares\n\n\ndata today is stored in cloud data lakes. 
Any of these existing data\n\nsets on the provider’s data lake can easily be shared without any\n\ndata replication or physical movement of data. Data providers can\n\nupdate their data sets reliably in real time and provide a fresh and\n\nconsistent view of their data to recipients.\n\n\n-----\n\n**Share data products, including AI models, dashboards and**\n\n**notebooks, with greater flexibility.** Data providers can choose\n\nbetween sharing anentire table or sharing only a version or\n\nspecific partitions of a table. However, sharing just tabular data\n\nis not enough to meet today’s consumer demands. Delta Sharing\n\nalso supports sharing of non-tabular data and data derivatives\n\nsuch as data streams, AI models, SQL views and arbitrary files,\n\nenablingincreased collaboration and innovation. Data providers can\n\nbuild, package and distribute data products including data sets,\n\nAI and notebooks, allowingdata recipients to get insights faster.\n\nFurthermore, this approach promotes and empowers the exchange\n\nof knowledge — not just data — between different organizations.\n\n\n**Share data at a lower cost.** Delta Sharing lowers the cost of\n\nmanaging and consuming shares for both data providers and\n\nrecipients. Providers can share data from their cloud object store\n\nwithout replicating, thereby reducing the cost of storage. Incontrast,\n\nexisting data sharing platforms require data providers to first move\n\ntheir data into their platform or store data in proprietary formats in\n\ntheir managed storage, which often costs more and results in data\n\nduplication. With Delta Sharing, data providers don’t need to set\n\nup separate computing environments to share data. Consumers\n\ncan access shared data directly using their tools of choice without\n\nsetting up specific consumption ecosystems, thereby reducing\n\ncosts.\n\n\nWith Delta Sharing we are able to achieve a truly open marketplace\n\nand truly open ecosystem. In contrast, commercial products are\n\nmostly limited to sharing raw tabular data and cannot be used to\n\n\nshare these higher-valued data derivatives.\n\n\n\u0007 **Reduced time-to-value.** Delta Sharing eliminates the need to\n\nset up a new ingestion process to consume data. Data recipients\n\ncan directly access the fresh data and query it using tools of their\n\nchoice. Recipients can also enrich data with data sets from popular\n\ndata providers. The Delta Sharing ecosystem of open source and\n\ncommercial partners is growing every day.\n\n\n-----\n\n#### Maximizing value of data with Delta Sharing\n\nDelta Sharing is already transforming data sharing activities for companies in a wide range of industries. Given the sheer\n\nvariety of data available and the technologies that are emerging, it is hard to anticipate all the possible use cases Delta\n\nSharing can address. The Delta Sharing approach is to share any data anytime with anyone easily and securely.\n\nIn this section we will explore the building blocks of such an approach and the use cases emerging from these.\n\n\n“Delta Sharing helped us streamline our data delivery process\n\nfor large data sets. This enables our clients to bring their own\n\ncompute environment to read fresh curated data with little-to-\n\nno integration work, and enables us to continue expanding our\n\ncatalog of unique, high-quality data products.”\n\n— **William Dague** , Head of Alternative Data, Nasdaq\n\n\n“We recognize that openness of data will play a key role in\n\nachieving Shell’s Carbon Net Zero ambitions. 
Delta Sharing\n\nprovides Shell with a standard, controlled and secure protocol\n\nfor sharing vast amounts of data easily with our partners to work\n\ntoward these goals without requiring our partners be on the same\n\ndata sharing platform.”\n\n— **Bryce Bartmann** , Chief Digital Technology Advisor, Shell\n\n\n“Leveraging the powerful capabilities of Delta Sharing from\n\n\nDatabricks enables Pumpjack Dataworks to have a faster\n\nonboarding experience, removing the need for exporting,\n\nimporting and remodeling of data, which brings immediate\n\nvalue to our clients. Faster results yield greater commercial\n\nopportunity for our clients and their partners.”\n\n\n“Data accessibility is a massive consideration for us. We believe\n\nthat Delta Sharing will simplify data pipelines by enabling us to\n\nquery fresh data from the place where it lives, and we are not\n\nlocked into any platform or data format.”\n\n— **Rayne Gaisford** , Global Head of Data Strategy, Jefferies\n\n\n— **Corey Zwart** , Head of Engineering, Pumpjack Dataworks\n\n“As a data company, giving our customers access to our data sets\n\nis critical. The Databricks Lakehouse Platform with Delta Sharing\n\nreally streamlines that process, allowing us to securely reach a\n\nmuch broader user base regardless of cloud or platform.”\n\n— **Felix Cheung** , VP of Engineering, SafeGraph\n\n\n-----\n\n#### Data monetization with Delta Sharing\n\nDelta Sharing enables companies to monetize their data product simply and with necessary governance.\n\nData /on.2-er $\n\n\nCloud Storage\n\n\nFulfllleen\n\nEntitles various data products\n\nData Vendor\n\nUnity\nCatalog\n\n\nUnity\nCatalog\n\nCloud Storage\n\nData /on.2-er �\n\nN o n - D ata b r i c k s C u s t o m e r\n\nO n a n y c lo u d o r o n - p r e m i s e s\n\nStorage\n\n\nR/O\n\nR/O\n\n\n**Figure 4:**\nData monetization\nwith Delta Sharing\n\n\nDelta\nSharing\n\n\nBillieg Audit Log\n\n\n-----\n\nWith Delta Sharing, a data provider can seamlessly share large data sets and overcome\n\nthe scalability issues associated with SFTP servers. Data providers can easily expand their\n\ndata product lines since Delta Sharing doesn’t require you to build a dedicated service\n\nfor each of your data products like API services would. The company simply grants and\n\nmanages access to the data recipients instead of replicating the data — thereby reducing\n\ncomplexity and latency. Any data that exits your ELT/ETL pipelines becomes a candidate\n\nfor a data product. Any data that exists on your platform can be securely shared with your\n\nconsumers. This grants a wider addressable market — your products have appeal to a\n\nbroader range of consumers, from those who say “we need access to your raw data only”\n\nto those who say “we want only small subsets of your Gold layer data.”\n\nTo mitigate cost concerns, Delta Sharing maintains an audit log that tracks any permitted\n\naccess to the data. 
Data providers can use this information to determine the costs\n\nassociated with any of the data products and evaluate if such products are commercially\n\nviable and sensible.\n\n\n-----\n\n#### B2B sharing with Delta Sharing\n\nCloud Storage\n\nPartner A\n\nUnity\nCatalog\n\n\nPartner U\n\n\nUnity\nCatalog\n\nCloud Storage\n\nPartner B\n\nN o n - D ata b r i c k s C u s t o m e r\nO n a n y c lo u d o r o n - p r e m i s e s\n\nStorage\n\n\nR/O R/O\n\nR/O\n\n\n**Figure 5:**\nB2B sharing with\nDelta Sharing\n\n\nDelta\nSharing\n\n\n-----\n\nDelta Sharing applies in the case of bidirectional exchange of data.\n\nCompanies use Delta Sharing to incorporate partners and suppliers\n\nseamlessly into their workflows. Traditionally, this is not an easy task.\n\nAn organization typically has no control over how their partners are\n\nimplementing their own data platforms. The complexity increases\n\nwhen we consider that the partners and suppliers can reside in\n\na public cloud, private cloud or an on-premises deployed data\n\nplatform. The choices of platform and architecture are not imposed\n\non your partners and suppliers. Due to its open protocol, Delta\n\nSharing addresses this requirement foundationally. Through a wide\n\narray of existing connectors (and many more being implemented),\n\nyour data can land anywhere your partners and suppliers need to\n\nconsume it.\n\n\nIn addition to the location of data consumer residency, the\n\ncomplexity of data arises as a consideration. The traditional\n\napproach to sharing data using APIs is inflexible and imposes\n\nadditional development cycles on both ends of the exchange in\n\norder to implement both the provider pipelines and consumer\n\npipelines. With Delta Sharing, this problem can be abstracted. Data\n\ncan be shared as soon as it lands in the Delta table and when the\n\nshares and grants are defined. There are no implementation costs\n\non the provider side. On the consumer side, data simply needs\n\nto be ingested and transformed into an expected schema for the\n\ndownstream processes.\n\nThis means that you can form much more agile data exchange\n\npatterns with your partners and suppliers and attain value from your\n\ncombined data much quicker than ever before.\n\n\n-----\n\n#### Internal data sharing with Delta Sharing\n\nInternal data sharing is becoming an increasingly important consideration for any modern\n\norganization, particularly where data describing the same concepts have been produced in\n\ndifferent ways and in different data silos across the organization. In this situation it is important\n\nto design systems and platforms that allow governed and intentional federation of data and\n\nprocesses, and at the same time allow easy and seamless integration of said data and processes.\n\nArchitectural design patterns such as Data Mesh have emerged to address these specific\n\nchallenges and considerations. Data Mesh architecture assumes a federated design and\n\ndissemination of ownership and responsibility to business units or divisions. This, in fact, has\n\nseveral advantages, chief among them that data is owned by the parts of the organization closest\n\nto the source of the data. Data residence is naturally enforced since data sits within the geo-\n\nlocality where it has been generated. Finally, data volumes and data variety are kept in control\n\ndue to the localization within a data domain (or data node). 
On the other hand, the architecture\n\npromotes exchange of data between different data domains when that data is needed to deliver\n\noutcomes and better insights.\n\n\n-----\n\nBusiness Unit 1 Business Unit ,\ni n R e g i o n A i n R e g i o n -\n\nCloud Storage\n\nUnity\nCatalog\n\nR/O R/O\n\n\nUnity\nCatalog\n\nCloud Storage\n\n\nDelta\nSharing\n\n\nBusiness Unit B\n\ni n R e g i o n A\n\n\nDelta\nSharing\n\nR/O R/O\n\n\nCloud Storage\n\nBusiness Unit �\nN o n - D ata b r i c k s C u s t o m e r\n\nO n a n y c lo u d o r o n - p r e m i s e s\n\nStorage\n\n\n**Figure 6:**\nBuilding a Data Mesh\nwith Delta Sharing\n\n\n-----\n\nUnity Catalog enables consolidated data access control across\n\ndifferent data domains within an organization using the Lakehouse\n\non Databricks. In addition, Unity Catalog adds a set of simple and\n\neasy-to-use declarative APIsto govern and control data exchange\n\npatterns between the data domains in the Data Mesh.\n\nTo make matters even more complicated, organizations can grow\n\nthrough mergers and acquisitions. In such cases we cannot assume\n\nthat organizations being acquired have followed the same set of\n\nrules and standards to define their platforms and produce their\n\ndata. Furthermore, we cannot even assume that they have used\n\nthe same cloud providers, nor can we assume the complexity of\n\ntheir data models. Delta Sharing can simplify and accelerate the\n\n\nunification and assimilation of newly acquired organizations and\n\ntheir data and processes.. Individual organizations can be treated\n\nas new data domains in the overarching mesh. Only selected data\n\nsources can be exchanged between the different platforms. This\n\nenables teams to move freely between the organizations that are\n\nmerging without losing their data — if anything, they are empowered\n\nto drive insights of higher quality by combining the data of both.\n\nWith Unity Catalog and Delta Sharing, the Lakehouse architecture\n\nseamlessly combines with the Data Mesh architecture to deliver\n\nmore power than ever before, pushing the boundaries of what’s\n\npossible and simplifying activities that were deemed daunting not\n\nso long ago.\n\n\n-----\n\n## Chapter 4\n How Delta Sharing Works\n\n\nDelta Sharing is designed to be simple, scalable, nonproprietary\n\nand cost-effective for organizations that are serious about getting\n\nmore from their data. Delta Sharing is natively integrated with Unity\n\nCatalog, which enables customers to add fine-grained governance\n\nand security controls, making it easy and safe to share data\n\n\nDelta Sharing is a simple REST protocol that securely grants\n\ntemporary access to part of a cloud data set. It leverages modern\n\ncloud storage systems — such as AWS S3, Azure ADLS or Google’s\n\nGCS — to reliably grant read-only access to large data sets. Here’s\n\nhow it works for data providers and data recipients.\n\n\ninternally or externally.\n\nData PJQIiLeJ Data Recipient\n\nAccess permissions\n\nRequest table\n\nPre-signed short-lived URLs\n\n\nDelta Lake\n\nParquet `iles\n\n\nDelta Sharing Server\n\n\n**Figure 7:**\nHow Delta Sharing\nworks connecting data\nproviders and data\nrecipients\n\n\nTemporary direct access to fles Parquet ormatt\nin the object store — AWS S3, GCP, ADLS\n\n\n\n- • •\nDelta Sharing Client\n\n\n-----\n\n#### Data providers\n\nThe data provider shares existing tables or parts thereof (such as\n\nspecific table versions or partitions) stored on the cloud data lake\n\nin Delta Lake format. 
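As noted just below, shares and recipients are managed with SQL commands, the Unity Catalog CLI or the user interface. The following is a minimal provider-side sketch, assuming a Unity Catalog-enabled Databricks workspace (run from a notebook where `spark` is predefined); the share, recipient and table names are hypothetical:

```python
# Expose an existing Delta table through a Delta Sharing share -- no data is
# copied or moved; the share simply references the table in Unity Catalog.
share_name = "sales_share"               # hypothetical share name
recipient_name = "partner_acme"          # hypothetical recipient name
table_name = "main.sales.transactions"   # hypothetical Unity Catalog table

# Create the share and add the table to it.
spark.sql(f"CREATE SHARE IF NOT EXISTS {share_name}")
spark.sql(f"ALTER SHARE {share_name} ADD TABLE {table_name}")

# Create a recipient and grant it read-only access to the share.
spark.sql(f"CREATE RECIPIENT IF NOT EXISTS {recipient_name}")
spark.sql(f"GRANT SELECT ON SHARE {share_name} TO RECIPIENT {recipient_name}")

# Review everything currently exposed through the share.
spark.sql(f"SHOW ALL IN SHARE {share_name}").show(truncate=False)
```
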
The provider decides what data they want to\n\nshare and runs a sharing server in front of it that implements the\n\nDelta Sharing protocol and manages recipient access. . To manage\n\nshares and recipients, you can use SQL commands,the Unity\n\nCatalog CLI or the intuitive user interface.\n\n#### Data recipients\n\nThe data recipient only needs one of the many Delta Sharing clients\n\nthat support the protocol. Databricks has released open source\n\nconnectors for pandas, Apache Spark, Java and Python, and is\n\nworking with partners on many more.\n\n\n#### The data exchange\n\nThe Delta Sharing data exchange follows three efficient steps:\n\n**1.** \u0007The recipient’s client authenticates to the sharing server and\n\nasks to query a specific table. The client can also provide filters\n\non the data (for example, “country=US”) as a hint to read just a\n\nsubset of the data.\n\n**2.** \u0007The server verifies whether the client is allowed to access the\n\ndata, logs the request, and then determines which data to send\n\nback. This will be a subset of the data objects in cloud storage\n\nsystems that make up the table.\n\n**3.** \u0007To allow temporary access to the data, the server generates\n\nshort-lived presigned URLs that allow the client to read Parquet\n\nfiles directly from the cloud provider so that the read-only\n\naccess can happen in parallel at massive bandwidth, without\n\nstreaming through the sharing server.\n\n\n-----\n\n## Chapter 5\n Introducing Databricks Marketplace\n\n\nEnterprises need open collaboration for data and AI. Data sharing\n\n— within an organization or externally — allows companies to\n\ncollaborate with partners, establish new partnerships and generate\n\nnew revenue streams with data monetization.\n\nThe demand for generative AI is driving disruption across industries,\n\nincreasing the urgency for technical teams to build generative AI\n\nmodels and Large Language Models (LLMs) on top of their own data\n\nto differentiate their offerings.\n\n\nTraditional data marketplaces are restricted and offer only data or\n\nsimple applications, therefore limiting their value to data consumers.\n\nThey also don’t offer tools to evaluate the data assets beyond basic\n\ndescriptions or examples. Finally, data delivery is limited, often\n\nrequiring ETL or a proprietary delivery mechanism.\n\nEnterprises need a better way to share data and AI that is flexible,\n\nsecure and unlocks business value. 
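To see how lightweight open sharing is on the consumer side, here is a minimal recipient-side sketch using the open source `delta-sharing` Python connector (`pip install delta-sharing`); the profile path and the share, schema and table names below are hypothetical and must match what the provider has granted:

```python
import delta_sharing

# Credential file ("profile") issued by the data provider.
profile_path = "/path/to/config.share"   # hypothetical path

# Discover everything the provider has shared with this recipient.
client = delta_sharing.SharingClient(profile_path)
for table in client.list_all_tables():
    print(table.share, table.schema, table.name)

# Read a shared table straight into pandas. Under the hood the connector runs
# the three-step exchange described above: authenticate to the sharing server,
# request the table, then read Parquet files via short-lived presigned URLs.
table_url = f"{profile_path}#sales_share.sales.transactions"
df = delta_sharing.load_as_pandas(table_url)
print(df.head())
```

The same `<profile>#<share>.<schema>.<table>` URL scheme works with `delta_sharing.load_as_spark` for Spark-based consumers.
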
An ecosystem makes data\n\nsharing and collaboration powerful.\n\n\n**Today, data marketplaces present many challenges and collaboration can be complex for both data consumers and data providers.**\n\n**Data Consumers** **Data Providers**\n\n\nFocus on data only\nor simple applications\n\nLengthy discovery and\nevaluation\n\nDelayed time-to-insights\nwith vendor lock-in\n\n\nLimited opportunities to\n\nmonetize new types of assets\n\n\nLimited opportunities to\n\n\nDifficulty reaching\n\nmore users\n\n\nDifficulty reaching\n\n\nLack of secure technology\n\nand unified governance\n\n\nLack of secure technology\n\n\n-----\n\n#### Challenges in today's data marketplaces\n\n**Data Consumers** **Data Providers**\n\n\n\u0007 **Focus on data only or simple applications:** Accessing only\n\ndata sets means organizations looking to take advantage of\n\nAI/ML need to look elsewhere or start from scratch, causing\n\ndelays in driving business insights.\n\n\u0007 **Lengthy discovery and evaluation:** The tools most\n\nmarketplaces provide for data consumers to evaluate data\n\nare simply descriptions and example SQL statements. Minimal\n\n\n\u0007 **Limited opportunities to monetize new types of assets:**\n\nA data-only approach means organizations are limited to\n\nmonetizing anything beyond a data set and will face more\n\nfriction to create new revenue opportunities with non-\n\ncompatible platforms.\n\n**Difficulty reaching more users:** Data providers must choose\n\nbetween forgoing potential business or incurring the expense\n\nof replicating data.\n\n\nevaluation tools mean it takes more time to figure out if a data\n\nproduct is right for you, which might include more time in\n\nback-and-forth messages with a provider or searching for a\n\nnew provider altogether.\n\n\n**Delayed time-to-insights with vendor lock-in:** Delivery\n\nthrough proprietary sharing technologies or FTP means either\n\nvendor lock-in or lengthy ETL processes to get the data where\n\nyou need to work with it.\n\n\n**Lack of secure technology and unified governance:** Without\n\nopen standards for sharing data securely across platforms\n\nand clouds, data providers must use multiple tools to secure\n\naccess to scattered data, leading to inconsistent governance.\n\n\n-----\n\n#### What is Databricks Marketplace?\n\n\napproach allows you to put your data to work more quickly in\n\nevery cloud with your tools of choice.\n\nMarketplace brings together a vast ecosystem of data\n\nconsumers and data providers to collaborate across a wide\n\narray of data sets without platform dependencies, complicated\n\nETL, expensive replication and vendor lock-in.\n\n\nDatabricks Marketplace is an open marketplace for all your data,\n\nanalytics and AI, powered by Delta Sharing.\n\nSince Marketplace is powered by Delta Sharing, you can benefit\n\nfrom open source flexibility and no vendor lock-in, enabling you\n\nto collaborate across all platforms, clouds and regions. 
This open\n\n\n#### Key Benefits of Databricks Marketplace\n\n**Consumers** **Providers**\n\n\nDatabricks\nMarketplace\nprovides key benefits\nfor both data\nconsumers and data\nproviders.\n\n\nDiscover more\n\nthan just data\n\n\nReach users\n\non any platform\n\n\nReach users\n\n\nEvaluate data\n\nproducts faster\n\nAvoid vendor lock-in\n\n\nMonetize more\n\nthan just data\n\n\nMonetize more\n\n\nShare data securely\n\n\n-----\n\n#### Databricks Marketplace drives innovation and expands revenue opportunities\n\n\n##### Data Consumers\n\n For data consumers, the Databricks Marketplace dramatically expands the opportunity to deliver innovation and advance analytics and AI initiatives.\n\n**Discover more than just data:** Access more than just data sets,\n\nincluding AI models, notebooks, applications and solutions.\n\n**Evaluate data products faster:** Pre-built notebooks and sample\n\ndata help you quickly evaluate and have much greater confidence\n\nthat a data product is right for your AI or analytics initiatives.\n\nObtain the fastest and simplest time to insight.\n\n**Avoid vendor lock-in:** Substantially reduce the time to deliver\n\ninsights and avoid lock-in with open and seamless sharing and\n\ncollaboration across clouds, regions, or platforms. Directly\n\nintegrate with your tools of choice and right where you work.\n\n\n##### Data Providers\n\n For data providers, the Databricks Marketplace enables them the ability to reach new users and unlock new revenue opportunities.\n\n**Reach users on any platform:** Expand your reach across\n\nplatforms and access a massive ecosystem beyond walled\n\ngardens. Streamline delivery of simple data sharing to any cloud\n\nor region, without replication.\n\n**Monetize more than just data:** Monetize the broadest set of\n\ndata assets including data sets, notebooks, AI models to reach\n\nmore data consumers.\n\n**Share data securely:** Share all your data sets, notebooks, AI\n\nmodels, dashboards and more securely across clouds, regions\n\nand data platforms.\n\n\n-----\n\n#### Enable collaboration and accelerate innovation\n\n\n#### Powered by a fast, growing ecosystem\n\nEnterprises need open collaboration for data and AI. 
In the past few\n\nmonths, we've continued to increase partners across industries,\n\nincluding Retail, Communications and Media & Entertainment,\n\n\n\u0007 **Advertising and Retail**\n\nIncorporate shopper behavior analysis | Ads uplift/\n\nperformance | Demand forecasting | “Next best SKU”\n\nprediction | Inventory analysis | Live weather data\n\n\nFinancial Services, with 520+ listings you can explore in our open\n\n\n\u0007 **Finance**\n\nIncorporate data from stock exchange to predict\n\neconomic impact | Market research | Public census and\n\nhousing data to predict insurance sales\n\n\u0007 **Healthcare and Life Sciences**\n\nGenomic target identification | Patient risk scoring\n\nAccelerating drug discovery | Commercial effectiveness |\n\nClinical research\n\nFor more on Databricks Marketplace,\n\ngo to [marketplace.databricks.com](http://marketplace.databricks.com) , or refer to the\n\nResources section on page 41 .\n\n\nMarketplace from 80+ providers and counting.\n\n#### Use cases for an open marketplace\n\nOrganizations across all industries have many use cases for\n\nconsuming and sharing third-party data from the simple (dataset\n\njoins) to the more advanced (AI notebooks, applications and\n\ndashboards).\n\n\n-----\n\n#### New upcoming feature: AI model sharing\n\n\nNowadays, it may seem like every organization wants to become\n\nan AI organization. However, most organizations are new to AI.\n\nDatabricks has heard from customers that they want to discover\n\nout-of-the-box AI models on Marketplace to help them kickstart\n\ntheir AI innovation journey.\n\nTo meet this demand, Databricks will be adding AI model sharing\n\ncapabilities on Marketplace to provide users access to both OSS\n\nand proprietary AI (both first-and third-party) models. This will\n\nenable data consumers and providers to discover and monetize AI\n\nmodels and integrate AI into their data solutions.\n\n\nUsing this feature, data consumers can evaluate AI models with\n\nrich previews, including visualizations and pre-built notebooks with\n\nsample data. With Databricks Marketplace, there are no difficult\n\ndata delivery mechanisms — you can get the AI models instantly\n\nwith the click of a button. All of this works out-of-the-box with the AI\n\ncapabilities of the Databricks Lakehouse Platform for both real-time\n\nand batch inference. For real-time inference, you can use model\n\nserving endpoints. For batch inference, you can invoke the models\n\nas functions directly from DBSQL or notebooks.\n\nWith AI model sharing, Databricks customers will have access\n\nto best-in-class models from leading providers, as well as OSS\n\nmodels published by Databricks which can be quickly and securely\n\napplied on top of their data. Databricks will curate and publish\n\nits own open source models across common use cases, such as\n\ninstruction-following and text summarization, and optimize tuning or\n\ndeployment of these models.\n\nUsing AI models from Databricks Marketplace can help your\n\norganization summarize complex information quickly and easily to\n\nhelp accelerate the pace of innovation.\n\n\n-----\n\n## Chapter 6\n Share securely with Databricks Clean Rooms\n\n\nWhile the demand for external data to make data-driven\n\ninnovations is greater than ever, there is growing concern among\n\norganizations around data privacy. 
The need for organizations to\n\nshare data and collaborate with their partners and customers in a\n\nsecure, governed and privacy-centric way is driving the concept\n\nof “data clean rooms.”\n\n\n#### What is a data clean room?\n\nA data clean room provides a secure, governed and privacy-safe\n\nenvironment where participants can bring their sensitive data, which\n\nmight include personally identifiable information (PII), and perform\n\njoint analysis on that private data. Participants have full control\n\nof the data and can decide which participants can perform what\n\nanalysis without exposing any sensitive data.\n\n\n###### Collaborator A\n Data Cleanroom\nE.G., AGENCIES, PUBLISHERS, MVPDS, RETAILERS\n\n\u0007What is our audience overlap?\n\n\n###### Collaborator B\n\nE.G., ADVERTISERTS\n\n\n**Figure 8:**\nData clean room\ndiagram example\nfor audience\noverlap analysis in\nadvertising\n\n\nHow did my campaign do in\n\nterms of reach and frequency?\n\n\n\u0007What is the lift in purchases\n\namong those in-segment versus\nthose out-of-segment?\n\n**Collaborator A-owned sensitive data** **Secure and privacy-preserving environment** **Collaborator B-owned sensitive data**\n\n\n-----\n\nA data clean room is not a new concept. Google introduced the idea in 2017 when it announced Ads Data Hub, which allows\n\nadvertisers to gain impression-level insights about cross-device media campaigns in a more secure, privacy-safe environment. In\n\nthe last few years, the demand for clean rooms has accelerated. IDC predicts that by 2024, 65% of G2000 enterprises will form data\n\nsharing partnerships with external stakeholders via data clean rooms to increase interdependence while safeguarding data privacy.\n\nThere are various compelling needs driving this demand:\n\n\n**Privacy-first world.** Stringent data privacy regulations such as\n\nGDPR and CCPA, along with sweeping changes in third-party\n\nmeasurement, have transformed how organizations collect, use\n\nand share data. For example, Apple’s [App Tracking Transparency](https://developer.apple.com/app-store/user-privacy-and-data-use/)\n\n[Framework](https://developer.apple.com/app-store/user-privacy-and-data-use/) (ATT) provides users of Apple devices the freedom\n\nand flexibility to easily opt out of app tracking. Google also plans\n\nto [phase out support for third-party cookies in Chrome](https://blog.google/products/chrome/updated-timeline-privacy-sandbox-milestones/) by late\n\n2024. As these privacy laws and practices evolve, the demand\n\nfor data cleanrooms is likely to rise as the industry moves to new\n\n\n**Collaboration in a fragmented ecosystem.** Today, consumers have\n\nmore options than ever before when it comes to where, when and\n\nhow they engage with content. As a result, the digital footprint of\n\nconsumers is fragmented across different platforms, necessitating\n\nthat companies collaborate with their partners to create a unified\n\nview of their customers’ needs and requirements. To facilitate\n\ncollaboration across organizations, cleanrooms provide a secure\n\nand private way to combine their data with other data to unlock new\n\ninsights or capabilities.\n\n\nidentifiers that are PII based, such as UID 2.0, and organizations\n\ntry to find new ways to share and join data with customers and\n\npartners in a privacy-centric way.\n\n**New ways to monetize data.** Most organizations are looking to\n\nmonetize their data in one form or another. 
With today’s privacy\n\nlaws, companies will try to find any possible advantages to monetize\n\ntheir data without the risk of breaking privacy rules. This creates an\n\nopportunity for data vendors or publishers to join data for big data\n\nanalytics without having direct access to the data.\n\n\n-----\n\n#### Common data clean room uses cases\n\n\n#### Category management for retail and consumer goods\n\nClean rooms enable real-time collaboration between retailers\n\nand suppliers, ensuring secure information exchange for demand\n\nforecasting, inventory planning and supply chain optimization.\n\nThis improves product availability, reduces costs and streamlines\n\noperations for both parties.\n\n#### Real-world evidence (RWE) for healthcare\n\nClean rooms provide secure access to sensitive healthcare data sets,\n\nallowing collaborators to connect and query multiple sources of data\n\nwithout comprising data privacy. This supports RWE use cases such\n\nas regulatory decisions, safety, clinical trial design and observational\n\nresearch.\n\n\n#### Audience overlap exploration for media and entertainment\n\nBy creating a clean room environment, media companies can\n\nsecurely share their audience data with advertisers or other media\n\npartners. This allows them to perform in-depth analysis and identify\n\nshared audience segments without directly accessing or exposing\n\nindividual user information.\n\n#### Know Your Customer (KYC) in banking\n\nKYC standards are designed to combat financial fraud, money\n\nlaundering and terrorism financing. Clean rooms can be used within a\n\ngiven jurisdiction to allow financial services companies to collaborate\n\nand run shared analytics to build a holistic view of a transaction for\n\ninvestigations.\n\n\n-----\n\n#### Personalization with expanded interests for retailers\n\nRetailers want to target consumers based on past purchases, as\n\nwell as other purchases with different retailers. Clean rooms enable\n\nretailers to augment their knowledge of consumers to suggest new\n\nproducts and services that are relevant to the individual but have\n\n\n#### 5G data monetization for telecom\n\n5G data monetization enables telecoms to capitalize on data\n\nfrom 5G networks. Clean rooms provide a secure environment\n\nfor collaboration with trusted partners, ensuring privacy while\n\nmaximizing data value for optimized services, personalized\n\nexperiences and targeted advertising.\n\n\nnot yet been purchased.\n\n\n-----\n\n#### Shortcomings of existing data clean rooms\n\n\nOrganizations exploring clean room options are finding some glaring\n\nshortcomings in the existing solutions that limit the full potential of the\n\n“clean rooms” concept.\n\nFirst, many existing data clean room vendors require data to be on the\n\nsame cloud, same region, and/or same data platform. Participants then\n\nhave to move data into proprietary platforms, which results in lock-in\n\nand additional data storage costs.\n\n\nSecond, most existing solutions are not scalable to expand\n\ncollaboration beyond a few collaborators at a time. For example,\n\nan advertiser might want to get a detailed view of their ad\n\nperformance across different platforms, which requires analysis\n\nof the aggregated data from multiple data publishers. 
With\n\ncollaboration limited to just a few participants, organizations get\n\npartial insights on one clean room platform and end up moving\n\ntheir data to another clean room vendor to aggregate the data,\n\nincurring the operational overhead of collating partial insights.\n\nFinally, existing clean room solutions do not provide the flexibility\n\nto run arbitrary analysis and are mainly restricted to SQL, a\n\nsubset of Python, and pre-defined templates. While SQL is\n\nabsolutely needed for clean rooms, there are times when you\n\nrequire complex computations such as machine learning or\n\nintegration with APIs where SQL doesn’t satisfy the full depth of\n\nthe technical requirements.\n\n\n-----\n\n#### Key benefits of Databricks Clean Rooms\n\nDatabricks Clean Rooms allow businesses to easily collaborate with their customers and partners in a secure environment on\n\nany cloud in a privacy-safe way. Key benefits of Databricks Clean Rooms include:\n\n\n**Flexible - your language and workload of**\n\n**choice.** Databricks Clean Rooms empower\n\ncollaborators to share and join their existing\n\ndata and run complex workloads in any\n\nlanguage —Python, R, SQL, Java and Scala —\n\non the data while maintaining data privacy.\n\nBeyond traditional SQL, users can run arbitrary\n\nworkloads and languages, allowing them to train\n\nmachine learning models, perform inference\n\nand utilize open-source or third-party privacy-\n\nenhancing technologies. This flexibility enables\n\ndata scientists and analysts to achieve more\n\ncomprehensive and advanced data analysis\n\nwithin the secure Clean Room environment.\n\n\n**Scalable, multi-party collaboration.**\n\nWith Databricks Clean Rooms, you can\n\nlaunch a clean room and work with multiple\n\ncollaborators at a time. This capability\n\nenables real-time collaboration, fostering\n\nefficient and rapid results. Moreover,\n\nDatabricks Clean Rooms seamlessly\n\nintegrate with identity service providers,\n\nallowing users to leverage offerings from\n\nthese providers during collaboration. The\n\nability to collaborate with multiple parties\n\nand leverage identity services enhances the\n\noverall data collaboration experience within\n\nDatabricks Clean Rooms.\n\n\n**Interoperable - any data source**\n\n**with no replication.** Databricks Clean\n\nRooms excel in interoperability, ensuring\n\nsmooth collaboration across diverse\n\nenvironments. With Delta Sharing,\n\ncollaborators can seamlessly work\n\ntogether across different cloud providers,\n\nregions and even data platforms without\n\nthe need for extensive data movement.\n\nThis eliminates data silos and enables\n\norganizations to leverage existing\n\ninfrastructure and data ecosystems while\n\nmaintaining the utmost security and\n\ncompliance.\n\n\n-----\n\n## Resources\n Getting started with Data Sharing and Collaboration\n\n\nData sharing plays a key role in business processes across the\n\nenterprise, from product development and internal operations to\n\ncustomer experience and compliance. However, most businesses\n\nhave been slow to move forward because of incompatibility\n\nbetween systems, complexity and security concerns.\n\nData-driven organizations need an open — and secure — approach\n\nto data sharing.\n\n\nDatabricks offers an open approach to data sharing and\n\ncollaboration with a variety of tools to:\n\n\u0007 **Share across platforms:** You can share live data sets, as well\n\nas AI models, dashboards and notebooks across platforms,\n\nclouds and regions. 
This open approach is powered by\n\nDelta Sharing, the world’s first open protocol for secure data\n\nsharing, which allows organizations to share data for any use\n\ncase, any tool and on any cloud.\n\n\u0007 **Share all your data and AI: Databricks Marketplace** is an\n\nopen marketplace for all your data, analytics and AI, enabling\n\nboth data consumers and data providers with the ability to\n\ndeliver innovation and advance analytics and AI initiatives.\n\n\u0007 **Share securely: Databricks Clean Rooms** allows businesses\n\nto easily collaborate with customers and partners on any\n\ncloud in a privacy-safe way. With Delta Sharing, clean room\n\nparticipants can securely share data from their data lakes\n\nwithout any data replication across clouds or regions. Your\n\ndata stays with you without vendor lock-in, and you can\n\ncentrally audit and monitor the usage of your data.\n\n\n-----\n\nGet started with these products by exploring the resources below.\n\n\n**Delta Sharing**\n\n\u0007 [Data Sharing on Databricks](https://www.databricks.com/product/delta-sharing)\n\n[\u0007Learn about Databricks Unity Catalog](https://www.databricks.com/product/unity-catalog)\n\n[\u0007Blog post: What’s new with Data Sharing and Collaboration on the](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n\n[Lakehouse](https://www.databricks.com/blog/whats-new-data-sharing-and-collaboration-lakehouse)\n\n[\u0007Learn about open source Delta Sharing](https://delta.io/sharing/)\n\n[Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n\n**Databricks Marketplace**\n\n[\u0007Learn about Databricks Marketplace](https://www.databricks.com/product/marketplace)\n\n[\u0007Explore Databricks Marketplace](https://marketplace.databricks.com/)\n\n[\u0007Video: Databricks Marketplace - Going Beyond Data and](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[Applications](https://youtu.be/d11QcTaqHE4?feature=shared)\n\n[\u0007Demo: Databricks Marketplace](https://www.databricks.com/resources/demos/videos/data-sharing/marketplace)\n\n[\u0007AWS Documentation: What is Databricks Marketplace](https://docs.databricks.com/en/marketplace/index.html)\n\n[\u0007Azure Documentation: What is Databricks Marketplace](https://learn.microsoft.com/en-us/azure/databricks/marketplace/)\n\n\n[AWS Documentation](https://docs.databricks.com/en/data-sharing/index.html)\n\n\n**Databricks Clean Rooms**\n\n\u0007 [Learn about Databricks Clean Rooms](https://www.databricks.com/product/clean-room)\n\n[\u0007Video: What’s new with Data Sharing and Collaboration on](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[the Lakehouse](https://youtu.be/imSi6dYBXSg?feature=shared)\n\n[\u0007eBook: The Definitive Guide to Data Clean Rooms](https://www.databricks.com/resources/ebook/market-smarter-data-clean-rooms)\n\n[\u0007Webinar: Unlock the Power of Secure Data Collaboration](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n\n[with Clean 
Rooms](https://events.databricks.com/202304-AMER-VE-Clean-Room-Panel?utm_source=habu&_gl=1*1r1w5jw*_gcl_au*NTc4ODMxMjE4LjE2ODg5MjQ0Njk.*rs_ga*ODM5OTc3OTgtOTdmYy00ZmZhLTkwMTktZTlhYmFhNzlmZWE2*rs_ga_PQSEQ3RZQC*MTY5Mjg4ODIzNzc4NC45OC4xLjE2OTI4ODgzMDYuNTkuMC4w&_ga=2.161567100.1599267366.1692625473-835843671.1688924469)\n\n\n[Azure Documentation](https://learn.microsoft.com/en-us/azure/databricks/data-sharing/)\n\n\n-----\n\n## About the Authors\n\n\n**Vuong Nguyen** is a Solution Architect at Databricks, focusing on\n\nmaking analytics and AI simple for customers by leveraging the\n\npower of the Databricks Lakehouse Platform. You can reach Vuong\n\non [LinkedIn](https://www.linkedin.com/in/vuong-nguyen) .\n\n\n**Sachin Thakur** is a Principal Product Marketing Manager on the\n\nDatabricks Data Engineering and Analytics team. His area of focus\n\nis data governance with Unity Catalog, and he is passionate about\n\nhelping organizations democratize data and AI with the Databricks\n\nLakehouse Platform. You can reach Sachin on [LinkedIn](https://www.linkedin.com/in/sachin10thakur/) .\n\n\n**Milos Colic** is a Senior Solution Architect at Databricks. His\n\n\npassion is to help customers with their data exchange and data\n\nmonetization needs. Furthermore, he is passionate about geospatial\n\ndata processing and ESG. You can reach Milos on [LinkedIn](https://www.linkedin.com/in/milos-colic/) .\n\n\n**Jay Bhankharia** is a Senior Director on the Databricks Data\n\nPartnerships team. His passion is to help customers gain insights\n\nfrom data to use the power of the Databricks Lakehouse Platform\n\nfor their analytics needs. You can reach Jay on [LinkedIn](https://www.linkedin.com/in/jay-bhankharia-cfa-b9835612/) .\n\n\n**Itai Weiss** is a Lead Delta Sharing Specialist at Databricks and has\n\n\nover 20 years of experience in helping organizations of any size\n\nbuild data solutions. He focuses on data monetization and loves to\n\nhelp customers and businesses get more value from the data they\n\nhave. You can reach Itai on [LinkedIn](https://www.linkedin.com/in/itai-weiss/) .\n\n**Somasekar Natarajan** (Som) is a Solution Architect at\n\nDatabricks specializing in enterprise data management. Som has\n\nworked with Fortune organizations spanning three continents for\n\nclose to two decades with one objective — helping customers to\n\n\n**Giselle Goicochea** is a Senior Product Marketing Manager\n\non the Databricks Data Engineering and Analytics team. Her area\n\nof focus is data sharing and collaboration with Delta Sharing and\n\nDatabricks Marketplace. You can reach Giselle on [LinkedIn](https://www.linkedin.com/in/giselle-goicochea/) .\n\n**Kelly Albano** is a Product Marketing Manager on the Databricks\n\nData Engineering and Analytics team. Her area of focus is security,\n\ncompliance and Databricks Clean Rooms. You can reach\n\nKelly on [LinkedIn](https://www.linkedin.com/in/kellyalbano/) .\n\n\nharness the power of data. You can reach Som on [LinkedIn](https://www.linkedin.com/in/somasekar-natarajan/) .\n\n\n-----\n\n##### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. 
Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n© Databricks 2023 All rights reserved\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "##### The Delta Lake Series Complete Collection\n\n\n-----\n\n### What is Delta Lake?\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\ncompatible with Apache Spark™ APIs.\n\nAt Databricks, we’ve seen how Delta Lake can bring reliability, performance and\nlifecycle management to data lakes. With Delta Lake, there will be no more\nmalformed data ingestion, difficulties deleting data for compliance, or issues\nmodifying data for data capture.\n\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\nyour data lake and the rate that teams can leverage that data with a secure and\nscalable cloud service.\n\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\n\n\n-----\n\nContents Processes Petabytes With Data Skipping and Z-Ordering\n\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\n\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\n\nPerformance Matter **you’ll find inside** 5 Features 22\n\n\n\nProcesses Petabytes With Data Skipping and Z-Ordering\n\n\nRollbacks 39\n\nPinned view of a continuously updating\n\nDelta Lake table across multiple downstream jobs\n\nQueries for time series analytics made simple\n\nEasily Clone Your Delta Lake\n\nfor Testing, Sharing and ML\n\nReproducibility 41\n\nWhat are clones? 41\n\n\nA lakehouse combines the best elements\n\nof data lakes and data warehouses 52\n\nSome early examples 55\n\nFrom BI to AI 55\n\nDiving Deep Into the\n\nInner Workings of the Lakehouse and Delta Lake 56\n\n1. Data lakes 57\n\n2. Custom storage engines 57\n\n\nCreating the Dashboard /\n\nVirtual Network Operation Centers 82\n\nCreating (near) real-time alerts 85\n\nNext steps: machine learning 86\n\nPoint-of-failure prediction and remediation 87\n\nCustomer churn 87\n\nGetting started with the Databricks streaming video QoS solution 87\n\nCustomer Use Cases 88\n\nHealthdirect Australia 89\n\nData quality and governance issues, silos, and the inability to scale 89\n\n\nFundamentals & Performance\n\n\nUsing data skipping and Z-Order clustering 21\n\n\nThe Fundamentals of Delta Lake: Why Reliability and\n\n\nExploring the details 21\n\n\nPerformance Matter\n\n\nFeatures\n\n\nChallenges with data lakes\n\nDelta Lake’s key functionalities\n\nUnpacking the Transaction Log\n\nImplementing atomicity to ensure\n\n\nWhy Use MERGE\n\nWith Delta Lake?\n\nWhen are upserts necessary? 
24\n\nWhy upserts into data lakes have\n\n\noperations complete fully\n\n\noperations complete fully 9\n\nDealing with multiple concurrent reads and writes **Chapter**\n\nTime travel, data lineage and debugging 10\n\nHow to Use Schema Enforcement and Evolution\n\nUnderstanding table schemas 11\n\n#### 01\n\n\nFundamentals and Performance traditionally been challenging 25\n\n\ntraditionally been challenging\n\n\nShallow clones\n\nDeep clones\n\n\n**Chapter**\n\n42\n\n42\n\n#### 04\n\n\n3. Lakehouse\n\n\nDealing with multiple concurrent reads and writes\n\n\nIntroducing MERGE in Delta Lake\n\n\nIn the research paper, the authors explain: 59\n\n\n3. Lakehouse Streaming 58\n\n\n\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\nand Performance Matter Deleting data due to GDPR 26\n\n\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\n\nDelta Engine **•** **USE CASE #1:** 60 Simplifying Streaming Stock Faster data pipelines result in better patient-driven healthcare 91\n\nScaling execution performance Data Analysis Using Delta Lake 60 Comcast 93\n\n\nTime travel, data lineage and debugging\n\n\nSimplifying use cases with MERGE\n\n\nWhere do clones help?\n\n\nUnderstanding\n\n\nModernizing analytics with Databricks and Delta Lake\n\n\nHow to Use Schema Enforcement and Evolution\n\n\nDeleting data due to GDPR\n\n\nTesting and experimentation with a production table\n\n\nDelta Engine\n\n\nFaster data pipelines result in better patient-driven healthcare\n\n\n\n- Unpacking the Transaction Log Applying change data from databases 26\n\n- How to Use Schema Enforcement and Evolution Updating session information from streaming pipelines 27\n\n- Delta Lake DML Internals How to start using Delta Lake 28\n\n- How Delta Lake Quickly Processes Petabytes Loading and saving our Delta Lake data 29\nWith Data Skipping and Z-Ordering In-place conversion to Delta Lake 30\n\n\nUnderstanding table schemas\n\n\nApplying change data from databases\n\n\nStaging major changes to a production table\n\n\nScaling execution performance\n\n\nComcast\n\n\nAnnouncing Delta Engine for **•** **USE CASE #2:** How Tilting Point Does Streaming Infrastructure unable to support data and ML needs\n\nhigh-performance query execution Ingestion Into Delta Lake 61 Automated infrastructure, faster data\n\n\nWhat is schema enforcement?\n\nHow does schema enforcement work?\n\nHow is schema enforcement useful?\n\nWhat is schema evolution?\n\nHow does schema evolution work?\n\n\nUpdating session information from streaming pipelines\n\n\nMachine learning result reproducibility\n\nData migration\n\nData sharing\n\nData archiving\n\nLooks awesome! 
Any gotchas?\n\nHow can I use it?\n\nEnabling Spark SQL DDL\n\n\nAnnouncing Delta Engine for\n\n\nInfrastructure unable to support data and ML needs\n\n\nHow to start using Delta Lake\n\n\nhigh-performance query execution\n\n\nAutomated infrastructure, faster data\n\n\nGetting started with Delta Engine **•** **USE CASE #3:** 62 Building a Quality of Service pipelines with Delta Lake 95\n\nStreaming Analytics Solution for Streaming Video Services 63 Delivering personalized experiences with ML\n\n\nLoading and saving our Delta Lake data\n\n\nGetting started with Delta Engine\n\n\npipelines with Delta Lake\n\n\nIn-place conversion to Delta Lake\n\n\nStreaming\n\n\nDelivering personalized experiences with ML\n\n\nDelete our flight data\n\nUpdate our flight data 31\n\nMerge our flight data 31\n\n\nHow Delta Lake Solves Common Pain Points in Streaming\n\n\nBanco Hipotecario 97\n\nLegacy analytics tools are slow, rigid and\n\nimpossible to scale 98\n\n\nHow is schema evolution useful? 14\n\nSummary **Chapter** 14\n\nDelta Lake\n\nDML Internals 15\n\nDelta Lake DML: UPDATE 15\n\n#### 02\n\n\nFeatures\n\n\n#### 05 Chapter\n\n\nData lake pain points Customer Use Cases 64\n\n\nHow is schema evolution useful?\n\n\nData lake pain points\n\n\nSummary\n\n\nData warehouse pain points\n\n\n\n- Why Use MERGE With Delta Lake? View table history 32\n\n- Simple, Reliable Upserts and Deletes on Delta Lake Travel back in time with table history 33\nTables Using Python APIs Clean up old table versions with vacuum 33\n\n\nHow Delta Lake on Databricks solves these issues **•** **USE CASE #1:** Healthdirect Australia Provides Personalized 65 A unified platform powers the data lake\n\nSimplifying Streaming Stock Data Analysis Using Delta Lake and Secure Online Patient Care With Databricks 66 and easy collaboration 99\n\n\nDelta Lake\n\n\nView table history\n\n\nand DML in Delta Lake on\n\n\nHow Delta Lake on Databricks solves these issues\n\n\nA unified platform powers the data lake\n\n\nDML Internals\n\n\nTravel back in time with table history\n\n\nApache Spark 3.0\n\n\nSimplifying Streaming Stock Data Analysis Using Delta Lake\n\n\nand easy collaboration\n\n\nImplement your streaming **•** **USE CASE #2:** Comcast Uses Delta Lake and MLflow to An efficient team maximizes customer\n\nstock analysis solution with Delta Lake Transform the Viewer Experience 67 acquisition and retention 100\n\n\nDelta Lake DML: UPDATE\n\n\nClean up old table versions with vacuum\n\n\nSupport for SQL DDL commands\n\n\nImplement your streaming\n\n\nAn efficient team maximizes customer\n\n\n\n- Time Travel for Large-Scale Data Lakes Common challenges with changing data 35\n\n- Easily Clone Your Delta Lake for Testing, Sharing Working with Time Travel 36\nand ML Reproducibility 1. 
Using a timestamp 36\n\n\nUPDATE: Under the hood 16\n\nUPDATE + Delta Lake time travel = Easy debugging\n\nUPDATE: Performance tuning tips 16\n\nDelta Lake DML: DELETE 16\n\nDELETE: Under the hood 17\n\nDELETE + VACUUM: Cleaning up old data files\n\n\nCommon challenges with changing data\n\n\nto define tables in the Hive metastore\n\n\nstock analysis solution with Delta Lake\n\n\nacquisition and retention\n\n\nAnalyze streaming stock data in real time 69 **•** **USE CASE #3:** Banco Hipotecario Personalizes the Banking Viacom18 101\n\nHow Tilting Point Does Streaming Ingestion Into Delta Lake Experience With Data and ML 71 Growth in subscribers and terabytes of viewing data push Hadoop to its limits 102\n\n\nWorking with Time Travel\n\n\nCreate or replace tables\n\n\nAnalyze streaming stock data in real time 69\n\n\nViacom18\n\n\n1. Using a timestamp\n\n\nExplicitly alter the table schema\n\n\nHow Tilting Point Does Streaming Ingestion Into Delta Lake\n\n\nGrowth in subscribers and terabytes of viewing data push Hadoop to its limits\n\n\n\n- Enabling Spark SQL DDL and DML in Delta Lake Scala syntax 36\non Apache Spark 3.0 Python syntax 37\n\n\nHow data flows and associated challenges 72 **•** **USE CASE #4:** Viacom18 Migrates From Hadoop to Rapid data processing for analytics\n\nLeveraging Structured Streaming with blob store as Databricks to Deliver More Engaging Experiences 72 and ML with Databricks 103\n\n\nScala syntax\n\n\nSupport for SQL Insert, Delete, Update and Merge\n\nAutomatic and incremental Presto/Athena manifest generation\n\nConfiguring your table through table properties\n\nSupport for adding user-defined metadata\n\nin Delta Lake table commits 48\n\nOther highlights 49\n\nLakehouse 50\n\nWhat Is a\n\nLakehouse? 51\n\n\nHow data flows and associated challenges 72\n\n\nRapid data processing for analytics\n\n\nPython syntax\n\n\nLeveraging Structured Streaming with blob store as\n\n\nand ML with Databricks\n\n\nSQL syntax 37\n\n2. Using a version number\n\nScala syntax\n\n\nsource and Delta Lake tables as sink\n\n\nLeveraging viewer data to power personalized viewing experiences 104\n\n\nDELETE: Performance tuning tips 18\n\nDelta Lake DML: MERGE **Chapter** 18\n\nHere’s how an upsert works: 18\n\nMERGE: Under the hood 19\n\nMERGE: Performance tuning tips **03** 19\n\n\nDELETE: Performance tuning tips\n\n\nLakehouse\n\n\nBuilding a Quality of Service Analytics Solution for Streaming Video Services 75\n\nDatabricks Quality of Service solution overview 76\n\nVideo QoS solution architecture 77\n\nMaking your data ready for analytics 79\n\nVideo applications events 80\n\nCDN logs 81\n\n\nDelta Lake DML: MERGE\n\n\n\n- What Is a Lakehouse? Python syntax 38\n\n- Diving Deep Into the Inner Workings of the SQL syntax 38\nLakehouse and Delta Lake Audit data changes 39\n\n\nHere’s how an upsert works:\n\n\nPython syntax\n\n\nMERGE: Under the hood\n\n\nSQL syntax\n\n\nMERGE: Performance tuning tips\n\n\nAudit data changes\n\n\nHow Delta Lake Quickly\n\n\n\n- Understanding Delta Engine Reproduce experiments and reports 39\n\n\n-----\n\n**Fundamentals and Performance**\nBoost data reliability for machine learning and\nbusiness intelligence with Delta Lake\n\n## CHAPTER 01\n\n\n-----\n\n**The Fundamentals of Delta**\n**Lake: Why Reliability and**\n**Performance Matter**\n\nWhen it comes to data reliability, performance — the speed at which your programs\nrun — is of utmost importance. 
Because of the ACID transactional protections that\nDelta Lake provides, you’re able to get the reliability and performance you need.\n\nWith Delta Lake, you can stream and batch concurrently, perform CRUD operations,\nand save money because you’re now using fewer VMs. It’s easier to maintain your data\nengineering pipelines by taking advantage of streaming, even for batch jobs.\n\nDelta Lake is a storage layer that brings reliability to your data lakes built on HDFS and\ncloud object storage by providing ACID transactions through optimistic concurrency\ncontrol between writes and snapshot isolation for consistent reads during writes.\nDelta Lake also provides built-in data versioning for easy rollbacks and reproducing\nreports.\n\nIn this chapter, we’ll share some of the common challenges with data lakes as well as\nthe Delta Lake features that address them.\n\n**Challenges with data lakes**\nData lakes are a common element within modern data architectures. They serve as a\ncentral ingestion point for the plethora of data that organizations seek to gather and\nmine. While a good step forward in getting to grips with the range of data, they run\ninto the following common problems:\n\n\n-----\n\n**1. \u0007Reading and writing into data lakes is not reliable.** Data engineers often run into\nthe problem of unsafe writes into data lakes that cause readers to see garbage\ndata during writes. They have to build workarounds to ensure readers always see\nconsistent data during writes.\n\n**2. \u0007The data quality in data lakes is low.** Dumping unstructured data into a data\nlake is easy, but this comes at the cost of data quality. Without any mechanisms\nfor validating schema and the data, data lakes suffer from poor data quality. As a\nconsequence, analytics projects that strive to mine this data also fail.\n\n**3. Poor performance with increasing amounts of data.** As the amount of data\nthat gets dumped into a data lake increases, the number of files and directories\nalso increases. Big data jobs and query engines that process the data spend a\nsignificant amount of time handling the metadata operations. This problem is more\npronounced in the case of streaming jobs or handling many concurrent batch jobs.\n\n**4. \u0007Modifying, updating or deleting records in data lakes is hard.** Engineers need to\nbuild complicated pipelines to read entire partitions or tables, modify the data and\nwrite them back. Such pipelines are inefficient and hard to maintain.\n\nBecause of these challenges, many big data projects fail to deliver on their vision or\nsometimes just fail altogether. We need a solution that enables data practitioners to\nmake use of their existing data lakes, while ensuring data quality.\n\n**Delta Lake’s key functionalities**\nDelta Lake addresses the above problems to simplify how you build your data lakes.\nDelta Lake offers the following key functionalities:\n\n**• ACID transactions:** Delta Lake provides ACID transactions between multiple\nwrites. Every write is a transaction, and there is a serial order for writes recorded in\na transaction log. The transaction log tracks writes at file level and uses [optimistic](https://en.wikipedia.org/wiki/Optimistic_concurrency_control)\n\n\n-----\n\n[concurrency control](https://en.wikipedia.org/wiki/Optimistic_concurrency_control) , which is ideally suited for data lakes since multiple writes\ntrying to modify the same files don’t happen that often. 
In scenarios where\nthere is a conflict, Delta Lake throws a concurrent modification exception for\nusers to handle them and retry their jobs. Delta Lake also offers the highest level\nof isolation possible ( [serializable isolation](https://en.wikipedia.org/wiki/Isolation_(database_systems)#Serializable) ) that allows engineers to continuously\nkeep writing to a directory or table and consumers to keep reading from the same\ndirectory or table. Readers will see the latest snapshot that existed at the time the\nreading started.\n\n**• \u0007Schema management:** Delta Lake automatically validates that the schema of the\nDataFrame being written is compatible with the schema of the table. Columns that\nare present in the table but not in the DataFrame are set to null. If there are extra\ncolumns in the DataFrame that are not present in the table, this operation throws\nan exception. Delta Lake has DDL to add new columns explicitly and the ability to\nupdate the schema automatically.\n\n**• \u0007Scalable metadata handling:** Delta Lake stores the metadata information of\na table or directory in the transaction log instead of the metastore. This allows\nDelta Lake to list files in large directories in constant time and be efficient while\nreading data.\n\n**• Data versioning and time travel:** Delta Lake allows users to read a previous\nsnapshot of the table or directory. When files are modified during writes, Delta\nLake creates newer versions of the files and preserves the older versions. When\n\n\nusers want to read the older versions of the table or directory, they can provide\na timestamp or a version number to Apache Spark’s read APIs, and Delta Lake\nconstructs the full snapshot as of that timestamp or version based on the\ninformation in the transaction log. This allows users to reproduce experiments and\nreports and also revert a table to its older versions, if needed.\n\n**• Unified batch and streaming sink:** Apart from batch writes, Delta Lake can also\nbe used as an efficient streaming sink with [Apache Spark’s structured streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) .\nCombined with ACID transactions and scalable metadata handling, the efficient\nstreaming sink enables lots of near real-time analytics use cases without having to\nmaintain a complicated streaming and batch pipeline.\n\n**• Record update and deletion:** Delta Lake will support merge, update and delete\nDML commands. This allows engineers to easily upsert and delete records in data\nlakes and simplify their change data capture and GDPR use cases. Since Delta Lake\ntracks and modifies data at file-level granularity, it is much more efficient than\nreading and overwriting entire partitions or tables.\n\n**• Data expectations (coming soon):** Delta Lake will also support a new API to set\ndata expectations on tables or directories. Engineers will be able to specify a\nboolean condition and tune the severity to handle data expectations. When Apache\nSpark jobs write to the table or directory, Delta Lake will automatically validate\nthe records and when there is a violation, it will handle the records based on the\nseverity provided.\n\n\n-----\n\n**Unpacking the**\n**Transaction Log**\n\nThe transaction log is key to understanding Delta Lake because it is the common thread\nthat runs through many of its most important features, including ACID transactions,\nscalable metadata handling, time travel and more. 
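Concretely, the log is nothing more than an ordered set of JSON commit files stored in a _delta_log directory inside the table. The short PySpark sketch below is one way to peek at it; the table path is a hypothetical example and an active spark session is assumed, so treat it as an illustration rather than part of the original walkthrough.\n\nimport os\n\n# The transaction log lives alongside the data files, under _delta_log (hypothetical table path)\nlog_dir = "/tmp/delta/events/_delta_log"\n\n# Commits are zero-padded, ordered JSON files: 00000000000000000000.json, 00000000000000000001.json, ...\nprint(sorted(f for f in os.listdir(log_dir) if f.endswith(".json")))\n\n# Each commit is newline-delimited JSON describing actions such as add, remove, metaData and commitInfo\nspark.read.json(os.path.join(log_dir, "00000000000000000000.json")).printSchema()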
The Delta Lake transaction log is\nan ordered record of every transaction that has ever been performed on a Delta Lake\ntable since its inception.\n\nDelta Lake is built on top of [Apache Spark](https://databricks.com/spark/about) to allow multiple readers and writers of a\ngiven table to work on the table at the same time. To show users correct views of the\ndata at all times, the transaction log serves as a single source of truth: the central\nrepository that tracks all changes that users make to the table.\n\nWhen a user reads a Delta Lake table for the first time or runs a new query on an\nopen table that has been modified since the last time it was read, Spark checks the\ntransaction log to see what new transactions are posted to the table. Then, Spark\nupdates the end user’s table with those new changes. This ensures that a user’s\nversion of a table is always synchronized with the master record as of the most recent\nquery and that users cannot make divergent, conflicting changes to a table.\n\nIn this chapter, we’ll explore how the Delta Lake transaction log offers an elegant\nsolution to the problem of multiple concurrent reads and writes.\n\n\n-----\n\n**Implementing atomicity to ensure**\n**operations complete fully**\nAtomicity is one of the four properties of ACID transactions that guarantees that\noperations (like an INSERT or UPDATE) performed on your [data lake](https://databricks.com/glossary/data-lake) either complete\nfully or don’t complete at all. Without this property, it’s far too easy for a hardware\nfailure or a software bug to cause data to be only partially written to a table, resulting\nin messy or corrupted data.\n\nThe transaction log is the mechanism through which Delta Lake is able to offer\nthe guarantee of atomicity. For all intents and purposes, if it’s not recorded in the\ntransaction log, it never happened. By only recording transactions that execute fully\nand completely, and using that record as the single source of truth, the Delta Lake\ntransaction log allows users to reason about their data and have peace of mind about\nits fundamental trustworthiness, at petabyte scale.\n\n**Dealing with multiple concurrent reads and writes**\nBut how does Delta Lake deal with multiple concurrent reads and writes? Since Delta\nLake is powered by Apache Spark, it’s not only possible for multiple users to modify a\n\n\ntable at once — it’s expected. To handle these situations, Delta Lake employs **optimistic**\n**concurrency control** .\n\nOptimistic concurrency control is a method of dealing with concurrent transactions\nthat assumes the changes made to a table by different users can complete without\nconflicting with one another. It is incredibly fast because when dealing with petabytes\nof data, there’s a high likelihood that users will be working on different parts of the data\naltogether, allowing them to complete non-conflicting transactions simultaneously.\n\nOf course, even with optimistic concurrency control, sometimes users do try to\nmodify the same parts of the data at the same time. Luckily, Delta Lake has a protocol\nfor that. 
Delta Lake handles these cases by implementing a rule of mutual exclusion,\nthen it attempts to solve any conflict optimistically.\n\nThis protocol allows Delta Lake to deliver on the ACID principle of isolation, which\nensures that the resulting state of the table after multiple, concurrent writes is the\nsame as if those writes had occurred serially, in isolation from one another.\n\n\n-----\n\nAs all the transactions made on Delta Lake tables are stored directly to disk, this\nprocess satisfies the ACID property of durability, meaning it will persist even in the\nevent of system failure.\n\n**Time travel, data lineage and debugging**\nEvery table is the result of the sum total of all the commits recorded in the Delta Lake\ntransaction log — no more and no less. The transaction log provides a step-by-step\ninstruction guide, detailing exactly how to get from the table’s original state to its\ncurrent state.\n\nTherefore, we can recreate the state of a table at any point in time by starting with\nan original table, and processing only commits made after that point. This powerful\nability is known as “time travel,” or data versioning, and can be a lifesaver in any number\n\n\nof situations. For more information, please refer to [Introducing Delta Time Travel for](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n[Large-Scale Data Lakes](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) and [Getting Data Ready for Data Science With Delta Lake and](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n[MLflow.](https://www.youtube.com/watch?v=hQaENo78za0&list=PLTPXxbhUt-YVPwG3OWNQ-1bJI_s_YRvqP&index=21&t=112s)\n\nAs the definitive record of every change ever made to a table, the Delta Lake\ntransaction log offers users a verifiable data lineage that is useful for governance,\naudit and compliance purposes. It can also be used to trace the origin of an\ninadvertent change or a bug in a pipeline back to the exact action that caused it. Users\ncan run the [DESCRIBE HISTORY](https://docs.delta.io/latest/delta-utility.html#describe-history) command to see metadata around the changes\nthat were made.\n\n**Want to learn more about Delta Lake’s transaction log?**\n\nRead our blog post > Watch our tech talk >\n\n\n-----\n\n**How to Use Schema**\n**Enforcement and**\n**Evolution**\n\nAs business problems and requirements evolve over time, so does the structure of\nyour data. With Delta Lake, incorporating new columns or objects is easy; users have\naccess to simple semantics to control the schema of their tables. At the same time,\nit is important to call out the importance of schema enforcement to prevent users\nfrom accidentally polluting their tables with mistakes or garbage data in addition to\nschema evolution, which enables them to automatically add new columns of rich data\nwhen those columns belong.\n\n**Schema enforcement rejects any new columns or other schema changes that**\n**aren’t compatible with your table.** By setting and upholding these high standards,\nanalysts and engineers can trust that their data has the highest levels of integrity and\ncan reason about it with clarity, allowing them to make better business decisions.\n\nOn the flip side of the coin, schema evolution complements enforcement by making it\neasy for intended schema changes to take place automatically. 
After all, it shouldn’t\nbe hard to add a column.\n\nSchema enforcement is the yin to schema evolution’s yang. When used together,\nthese features make it easier than ever to block out the noise and tune in to the signal.\n\n**Understanding table schemas**\nEvery DataFrame in Apache Spark contains a schema, a blueprint that defines the\nshape of the data, such as data types and columns, and metadata. With Delta Lake,\nthe table’s schema is saved in JSON format inside the transaction log.\n\n\n-----\n\n**What is schema enforcement?**\nSchema enforcement, or schema validation, is a safeguard in Delta Lake that ensures\ndata quality by rejecting writes to a table that don’t match the table’s schema.\n\nLike the front-desk manager at a busy restaurant who only accepts reservations, it\nchecks to see whether each column of data inserted into the table is on its list of\nexpected columns (in other words, whether each one has a “reservation”), and rejects\nany writes with columns that aren’t on the list.\n\n**How does schema enforcement work?**\nDelta Lake uses **schema validation on write,** which means that all new writes to a\ntable are checked for compatibility with the target table’s schema at write time. If the\nschema is not compatible, Delta Lake cancels the transaction altogether (no data is\nwritten), and raises an exception to let the user know about the mismatch.\n\nTo determine whether a write to a table is compatible, Delta Lake uses the following\nrules. The DataFrame to be written cannot contain:\n\n**• Any additional columns that are not present in the target table’s schema.**\nConversely, it’s OK if the incoming data doesn’t contain every column in the table —\nthose columns will simply be assigned null values.\n\n**• \u0007Column data types that differ from the column data types in the target table.**\nIf a target table’s column contains StringType data, but the corresponding column\nin the DataFrame contains IntegerType data, schema enforcement will raise an\nexception and prevent the write operation from taking place.\n\n**• Column names that differ only by case.** This means that you cannot have columns\nsuch as “Foo” and “foo” defined in the same table. While Spark can be used in case\nsensitive or insensitive (default) mode, Delta Lake is case-preserving but insensitive\nwhen storing the schema. [Parquet](https://databricks.com/glossary/what-is-parquet) is case sensitive when storing and returning\ncolumn information. To avoid potential mistakes, data corruption or loss issues (which\nwe’ve personally experienced at Databricks), we decided to add this restriction.\n\n\n-----\n\nRather than automatically adding the new columns, Delta Lake enforces the schema,\nand stops the write from occurring. To help identify which column(s) caused the\nmismatch, Spark prints out both schemas in the stack trace for comparison.\n\n**How is schema enforcement useful?**\nBecause it’s such a stringent check, schema enforcement is an excellent tool to use\nas a gatekeeper for a clean, fully transformed data set that is ready for production or\nconsumption. It’s typically enforced on tables that directly feed:\n\n- Machine learning algorithms\n\n- BI dashboards\n\n- Data analytics and visualization tools\n\n- Any production system requiring highly structured,\nstrongly typed, semantic schemas\n\nIn order to prepare their data for this final hurdle, many users employ a simple multihop architecture that progressively adds structure to their tables. 
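To make those rules concrete, the small sketch below appends a DataFrame that carries an extra column and lets schema enforcement reject it. The table path and column names are hypothetical, and the exception is caught generically because the exact class can vary by Spark and Delta Lake version.\n\nfrom pyspark.sql import Row\n\npath = "/tmp/delta/loans"  # hypothetical table location\n\n# Create a small Delta table with two columns\n(spark.createDataFrame([Row(loan_id=1, amount=1000.0)])\n    .write.format("delta").mode("overwrite").save(path))\n\n# This DataFrame carries a column the table does not know about...\nextra = spark.createDataFrame([Row(loan_id=2, amount=500.0, credit_score=712)])\n\ntry:\n    # ...so schema validation on write cancels the transaction and raises\n    extra.write.format("delta").mode("append").save(path)\nexcept Exception as e:\n    print("Append rejected by schema enforcement:", type(e).__name__)\n\nAdding the mergeSchema option described in the next section to the same write is what turns this rejection into an intentional, automatic schema change.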
To learn more, take\na look at [Productionizing Machine Learning With Delta Lake.](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n\n**What is schema evolution?**\nSchema evolution is a feature that allows users to easily change a table’s current\nschema to accommodate data that is changing over time. Most commonly, it’s used\nwhen performing an append or overwrite operation, to automatically adapt the\nschema to include one or more new columns.\n\n**How does schema evolution work?**\nFollowing up on the example from the previous section, developers can\neasily use schema evolution to add the new columns that were previously\nrejected due to a schema mismatch. Schema evolution is activated by adding\n.option(‘mergeSchema’, ‘true’) to your .write or .writeStream\nSpark command, as shown in the following example.\n\n\n#Add the mergeSchema option\n\nloans.write.format( “delta” ) \\\n\n.option( “mergeSchema” , “true” ) \\\n\n.mode( “append” ) \\\n\n.save(DELTALAKE_SILVER_PATH)\n\nBy including the mergeSchema option in your query, any columns that are present\n\nin the DataFrame but not in the target table are automatically added to the end of the\n\nschema as part of a write transaction. Nested fields can also be added, and these\n\nfields will get added to the end of their respective struct columns as well.\n\nData engineers and scientists can use this option to add new columns (perhaps a\n\nnewly tracked metric, or a column of this month’s sales figures) to their existing ML\n\nproduction tables without breaking existing models that rely on the old columns.\n\nThe following types of schema changes are eligible for schema evolution during table\n\nappends or overwrites:\n\n- Adding new columns (this is the most common scenario)\n\n- \u0007Changing of data types from NullType → any other type, or upcasts from ByteType\n\n→ ShortType → IntegerType\n\nOther changes, not eligible for schema evolution, require that the schema and data\n\nare overwritten by adding .option(“overwriteSchema”,“true”) . Those\n\nchanges include:\n\n- Dropping a column\n\n- Changing an existing column’s data typeC (in place)\n\n- \u0007Renaming column names that differ onlyC by case (e.g., “Foo” and “foo”)\n\n\n-----\n\nFinally, with the release of Spark 3.0, explicit DDL (using ALTER TABLE ) is fully\nsupported, allowing users to perform the following actions on table schemas:\n\n- Adding columns\n\n- Changing column comments\n\n- Setting table properties that define the behavior of the table, such as setting the\nretention duration of the transaction log\n\n**How is schema evolution useful?**\nSchema evolution can be used anytime you _intend_ to change the schema of your table\n(as opposed to where you accidentally added columns to your DataFrame that shouldn’t\nbe there). It’s the easiest way to migrate your schema because it automatically adds the\ncorrect column names and data types, without having to declare them explicitly.\n\n**Summary**\nSchema enforcement rejects any new columns or other schema changes that\naren’t compatible with your table. By setting and upholding these high standards,\nanalysts and engineers can trust that their data has the highest levels of integrity and\ncan reason about it with clarity, allowing them to make better business decisions.\nOn the flip side of the coin, schema evolution complements enforcement by making it\neasy for intended schema changes to take place automatically. 
After all, it shouldn’t\nbe hard to add a column.\n\nSchema enforcement is the yin to schema evolution’s yang. When used together, these\nfeatures make it easier than ever to block out the noise and tune in to the signal.\n\n**Want to learn more about schema enforcement and evolution?**\n\nRead our blog post > Watch our tech talk >\n\n\n-----\n\n**Delta Lake**\n**DML Internals**\n\nDelta Lake supports data manipulation language (DML) commands including UPDATE,\nDELETE and MERGE. These commands simplify change data capture (CDC), audit and\ngovernance, and GDPR/CCPA workflows, among others.\n\nIn this chapter, we will demonstrate how to use each of these DML commands,\ndescribe what Delta Lake is doing behind the scenes, and offer some performance\ntuning tips for each one.\n\n**Delta Lake DML: UPDATE**\nYou can use the UPDATE operation to selectively update any rows that match a\nfiltering condition, also known as a predicate. The code below demonstrates how\nto use each type of predicate as part of an UPDATE statement. Note that Delta Lake\noffers APIs for Python, Scala and SQL, but for the purposes of this eBook, we’ll include\nonly the SQL code.\n\n-- Update events\n\nUPDATE events SET eventType= ‘click’ WHERE buttonPress = 1\n\n\n-----\n\n**UPDATE: Under the hood**\nDelta Lake performs an UPDATE on a table in two steps:\n\n1. Find and select the files containing data that match the predicate and, therefore,\nneed to be updated. Delta Lake uses [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) whenever possible to speed up\nthis process.\n\n2. \u0007Read each matching file into memory, update the relevant rows, and write out the\nresult into a new data file.\n\nOnce Delta Lake has executed the UPDATE successfully, it adds a commit in the\ntransaction log indicating that the new data file will be used in place of the old one\nfrom now on. The old data file is not deleted, though. Instead, it’s simply “tombstoned”\n— recorded as a data file that applied to an older version of the table, but not the\ncurrent version. Delta Lake is able to use it to provide data versioning and time travel.\n\n**UPDATE + Delta Lake time travel = Easy debugging**\nKeeping the old data files turns out to be very useful for debugging because you can\nuse Delta Lake “time travel” to go back and query previous versions of a table at any\n\n\ntime. In the event that you update your table incorrectly and want to figure out what\nhappened, you can easily compare two versions of a table to one another to see what\nhas changed.\n\nSELECT - FROM events VERSION AS OF 11 EXCEPT ALL SELECT\n\n- FROM mytable VERSION AS OF 12\n\n**UPDATE: Performance tuning tips**\nThe main way to improve the performance of the UPDATE command on Delta Lake\nis to add more predicates to narrow down the search space. The more specific the\nsearch, the fewer files Delta Lake needs to scan and/or modify.\n\n**Delta Lake DML: DELETE**\nYou can use the DELETE command to selectively delete rows based upon a\npredicate (filtering condition).\n\nDELETE FROM events WHERE date < ‘2017-01-01’\n\n\n-----\n\nIn the event that you want to revert an accidental DELETE operation, you can use time\ntravel to roll back your table to the way it was.\n\n**DELETE: Under the hood**\nDELETE works just like UPDATE under the hood. Delta Lake makes two scans of\nthe data: The first scan is to identify any data files that contain rows matching the\npredicate condition. 
The second scan reads the matching data files into memory,\nat which point Delta Lake deletes the rows in question before writing out the newly\nclean data to disk.\n\nAfter Delta Lake completes a DELETE operation successfully, the old data files are\nnot deleted entirely — they’re still retained on disk, but recorded as “tombstoned” (no\nlonger part of the active table) in the Delta Lake transaction log. Remember, those old\nfiles aren’t deleted immediately because you might still need them to time travel back\nto an earlier version of the table. If you want to delete files older than a certain time\nperiod, you can use the VACUUM command.\n\n**DELETE + VACUUM: Cleaning up old data files**\nRunning the VACUUM command permanently deletes all data files that are:\n\n1. No longer part of the active table and\n2. \u0007Older than the retention threshold, which is seven days by default\n\nDelta Lake does not automatically VACUUM old files — you must run the command\nyourself, as shown below. If you want to specify a retention period that is different\nfrom the default of seven days, you can provide it as a parameter.\n\nfrom delta.tables import - deltaTable.\n\n# vacuum files older than 30 days(720 hours)\n\ndeltaTable.vacuum( 720 )\n\n\n-----\n\n**DELETE: Performance tuning tips**\nJust like with the UPDATE command, the main way to improve the performance of\na DELETE operation on Delta Lake is to add more predicates to narrow down the\nsearch space. The Databricks managed version of Delta Lake also features other\nperformance enhancements like improved [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) , the use of bloom filters, and\n[Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering) (multi-dimensional clustering). [Read more about Z-Order Optimize](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n[on Databricks.](https://docs.databricks.com/delta/optimizations/file-mgmt.html#z-ordering-multi-dimensional-clustering)\n\n**Delta Lake DML: MERGE**\nThe Delta Lake MERGE command allows you to perform upserts, which are a mix of\nan UPDATE and an INSERT. To understand upserts, imagine that you have an existing\ntable (aka a target table), and a source table that contains a mix of new records and\nupdates to existing records.\n\n\n**Here’s how an upsert works:**\n\n- When a record from the source table matches a preexisting record in the target\ntable, Delta Lake updates the record.\n\n- When there is no such match, Delta Lake inserts the new record.\n\nThe Delta Lake MERGE command greatly simplifies workflows that can be complex\nand cumbersome with other traditional data formats like Parquet. 
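As a concrete illustration, the same upsert can be expressed through the Python DeltaTable API. The table path, the source of updates and the join key below are illustrative assumptions rather than part of the original example.\n\nfrom delta.tables import DeltaTable\n\n# Hypothetical target table and a DataFrame of new and changed records\ntarget = DeltaTable.forPath(spark, "/tmp/delta/customers")\nupdates_df = spark.read.format("delta").load("/tmp/delta/customer_updates")\n\n(target.alias("t")\n    .merge(updates_df.alias("s"), "t.customerId = s.customerId")\n    .whenMatchedUpdateAll()      # matching record in the target: update it\n    .whenNotMatchedInsertAll()   # no match: insert the new record\n    .execute())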
Common scenarios\nwhere merges/upserts come in handy include change data capture, GDPR/CCPA\ncompliance, sessionization, and deduplication of records.\n\n**For more information about upserts, read:**\n\n[Efficient Upserts Into Data Lakes With Databricks Delta](https://databricks.com/blog/2019/03/19/efficient-upserts-into-data-lakes-databricks-delta.html)\n\n[Simple, Reliable Upserts and Deletes on Delta Lake Tables Using Python APIs](https://databricks.com/blog/2019/10/03/simple-reliable-upserts-and-deletes-on-delta-lake-tables-using-python-apis.html)\n\n[Schema Evolution in Merge Operations and Operational Metrics in Delta Lake](https://databricks.com/blog/2020/05/19/schema-evolution-in-merge-operations-and-operational-metrics-in-delta-lake.html)\n\n\n-----\n\n**MERGE: Under the hood**\nDelta Lake completes a MERGE in two steps:\n\n1. Perform an inner join between the target table and source table to select all files\nthat have matches.\n2. Perform an outer join between the selected files in the target and source tables\nand write out the updated/deleted/inserted data.\n\nThe main way that this differs from an UPDATE or a DELETE under the hood is that\nDelta Lake uses joins to complete a MERGE. This fact allows us to utilize some unique\nstrategies when seeking to improve performance.\n\n**MERGE: Performance tuning tips**\nTo improve performance of the MERGE command, you need to determine which of the\ntwo joins that make up the merge is limiting your speed.\n\nIf the inner join is the bottleneck (i.e., finding the files that Delta Lake needs to rewrite\ntakes too long), try the following strategies:\n\n- Add more predicates to narrow down the search space.\n\n- Adjust shuffle partitions.\n\n- Adjust broadcast join thresholds.\n\n- Compact the small files in the table if there are lots of them, but don’t compact them\ninto files that are too large, since Delta Lake has to copy the entire file to rewrite it.\n\n\n**On Databricks’ managed Delta Lake, use Z-Order optimize to exploit the**\n**locality of updates.**\n\nOn the other hand, if the outer join is the bottleneck (i.e., rewriting the actual files\nthemselves takes too long), try the strategies below.\n\n- **Adjust shuffle partitions:** Reduce files by enabling automatic repartitioning\nbefore writes (with Optimized Writes in Databricks Delta Lake).\n\n- **\u0007Adjust broadcast thresholds:** If you’re doing a full outer join, Spark cannot do a\nbroadcast join, but if you’re doing a right outer join, Spark can do one, and you can\nadjust the broadcast thresholds as needed.\n\n- **Cache the source table / DataFrame:** Caching the source table can speed up the\nsecond scan, but be sure not to cache the target table, as this can lead to cache\ncoherency issues.\n\nDelta Lake supports DML commands including UPDATE, DELETE and MERGE INTO, which\ngreatly simplify the workflow for many common big data operations. In this chapter, we\ndemonstrated how to use these commands in Delta Lake, shared information about\nhow each one works under the hood, and offered some performance tuning tips.\n\n**Want a deeper dive into DML internals, including snippets of code?**\n\n[Read our blog post >](https://databricks.com/blog/2020/09/29/diving-into-delta-lake-dml-internals-update-delete-merge.html)\n\n\n-----\n\n**How Delta Lake Quickly**\n**Processes Petabytes With**\n**Data Skipping and Z-Ordering**\n\nDelta Lake is capable of sifting through petabytes of data within seconds. 
Much of this\nspeed is owed to two features: (1) data skipping and (2) Z-Ordering.\n\nCombining these features helps the [Databricks Runtime](https://databricks.com/product/databricks-runtime) to dramatically reduce the\namount of data that needs to be scanned to answer selective queries against large\nDelta tables, which typically translates into substantial runtime improvements and\ncost savings.\n\nUsing Delta Lake’s built-in data skipping and ZORDER clustering features, large cloud\ndata lakes can be queried in a matter of seconds by skipping files not relevant to\nthe query. For example, 93.2% of the records in a 504 TB data set were skipped for a\ntypical query in a real-world cybersecurity analysis use case, reducing query times by\nup to two orders of magnitude. In other words, Delta Lake can speed up your queries\nby as much as 100x.\n\n**Want to see data skipping and Z-Ordering in action?**\n\nApple’s Dominique Brezinski and Databricks’ Michael Armbrust demonstrated how to\n\nuse Delta Lake as a unified solution for data engineering and data science in the context\n\nof cybersecurity monitoring and threat response. Watch their keynote speech, Threat\n\n[Detection and Response at Scale.](https://databricks.com/session/keynote-from-apple)\n\n\n-----\n\nAND / OR / NOT are also supported as well as “literal op column” predicates.\n\nEven though data skipping kicks in when the above conditions are met, it may not\nalways be effective. But, if there are a few columns that you frequently filter by and\nwant to make sure that’s fast, then you can explicitly optimize your data layout with\nrespect to skipping effectiveness by running the following command:\n\nOPTIMIZE [ WHERE ]\nZORDER BY ( [, …])\n\n**Exploring the details**\nApart from partition pruning, another common technique that’s used in the data\nwarehousing world, but which Spark currently lacks, is I/O pruning based on [small](https://dl.acm.org/doi/10.5555/645924.671173)\n[materialized aggregates](https://dl.acm.org/doi/10.5555/645924.671173) . In short, the idea is to keep track of simple statistics such\nas minimum and maximum values at a certain granularity that are correlated with I/O\ngranularity. And we want to leverage those statistics at query planning time in order\nto avoid unnecessary I/O.\n\nThis is exactly what Delta Lake’s [data skipping](https://docs.databricks.com/delta/optimizations/file-mgmt.html#data-skipping) feature is about. As new data is\ninserted into a Delta Lake table, file-level min/max statistics are collected for all\ncolumns (including nested ones) of supported types. Then, when there’s a lookup\nquery against the table, Delta Lake first consults these statistics in order to determine\nwhich files can safely be skipped.\n\n**Want to learn more about data skipping and Z-Ordering, including**\n**how to apply it within a cybersecurity analysis?**\n\n[Read our blog post >](https://databricks.com/blog/2018/07/31/processing-petabytes-of-data-in-seconds-with-databricks-delta.html)\n\n\n**Using data skipping and Z-Order clustering**\nData skipping and Z-Ordering are used to improve the performance of needle-in-thehaystack queries against huge data sets. 
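As a filled-in example of the OPTIMIZE command shown above, the sketch below clusters recent data by the columns used in selective filters. The table name, partition column and Z-Order columns are hypothetical, and the command requires a runtime that supports OPTIMIZE with ZORDER BY (Databricks, or a sufficiently recent Delta Lake release).\n\n# Cluster recent partitions by the columns most often used in selective filters (hypothetical names)\nspark.sql("OPTIMIZE connections WHERE date >= '2024-06-01' ZORDER BY (src_ip, dst_ip)")\n\n# Point-lookup style queries over those columns can now skip most files\nspark.sql("SELECT count(*) FROM connections WHERE src_ip = '10.0.0.1'").show()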
Data skipping is an automatic feature of\nDelta Lake, kicking in whenever your SQL queries or data set operations include filters\nof the form “column op literal,” where:\n\n- column is an attribute of some Delta Lake table, be it top-level or nested, whose\ndata type is string / numeric / date/ timestamp\n\n- op is a binary comparison operator, StartsWith / LIKE pattern%’, or IN\n\n\n- literal is an explicit (list of) value(s) of the same data type as a column\n\n\n-----\n\n**Features**\nUse Delta Lake’s robust features\nto reliably manage your data\n\n## CHAPTER 02\n\n\n-----\n\n**Why Use MERGE**\n**With Delta Lake?**\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , the next-generation engine built on top of Apache Spark, supports the\nMERGE command, which allows you to efficiently upsert and delete records in your\ndata lakes.\n\nMERGE dramatically simplifies how a number of common data pipelines can be built\n-- all the complicated multi-hop processes that inefficiently rewrote entire partitions\ncan now be replaced by simple MERGE queries.\n\nThis finer-grained update capability simplifies how you build your big data\npipelines for various use cases ranging from change data capture to GDPR. You\nno longer need to write complicated logic to overwrite tables and overcome a lack\nof snapshot isolation.\n\nWith changing data, another critical capability required is the ability to roll back, in\ncase of bad writes. Delta Lake also offers [rollback capabilities with the Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html)\n[feature](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) , so that if you do a bad merge, you can easily roll back to an earlier version.\n\nIn this chapter, we’ll discuss common use cases where existing data might need to be\nupdated or deleted. We’ll also explore the challenges inherent to upserts and explain\nhow MERGE can address them.\n\n\n-----\n\n**When are upserts necessary?**\nThere are a number of common use cases where existing data in a data lake needs to\nbe updated or deleted:\n\n- \u0007 **General Data Protection Regulation (GDPR) compliance:** With the introduction of\nthe right to be forgotten (also known as data erasure) in GDPR, organizations must\nremove a user’s information upon request. This data erasure includes deleting user\ninformation in the data lake as well.\n\n- **Change data capture from traditional databases:** In a service-oriented\narchitecture, typically web and mobile applications are served by microservices\nbuilt on traditional SQL/NoSQL databases that are optimized for low latency. One\nof the biggest challenges organizations face is joining data across these various\nsiloed data systems, and hence data engineers build pipelines to consolidate\nall data sources into a central data lake to facilitate analytics. These pipelines\noften have to periodically read changes made on a traditional SQL/NoSQL table\nand apply them to corresponding tables in the data lake. Such changes can take\nvarious forms: Tables with slowly changing dimensions, change data capture of all\ninserted/updated/deleted rows, etc.\n\n- \u0007 **Sessionization:** Grouping multiple events into a single session is a common use\ncase in many areas ranging from product analytics to targeted advertising to\npredictive maintenance. 
Building continuous applications to track sessions and\nrecording the results that write into data lakes is difficult because data lakes have\nalways been optimized for appending data.\n\n- **\u0007De-duplication:** A common data pipeline use case is to collect system logs into a\nDelta Lake table by appending data to the table. However, often the sources can\ngenerate duplicate records and downstream de-duplication steps are needed to\ntake care of them.\n\n\n-----\n\n**Why upserts into data lakes have**\n**traditionally been challenging**\nSince data lakes are fundamentally based on files, they have always been optimized\nfor appending data rather than for changing existing data. Hence, building the above\nuse case has always been challenging.\n\nUsers typically read the entire table (or a subset of partitions) and then overwrite\nthem. Therefore, every organization tries to reinvent the wheel for their requirement\nby handwriting complicated queries in SQL, Spark, etc. This approach is:\n\n- **\u0007Inefficient:** Reading and rewriting entire partitions (or entire tables) to update a few\nrecords causes pipelines to be slow and costly. Hand-tuning the table layout and\nquery optimization is tedious and requires deep domain knowledge.\n\n- **\u0007Possibly incorrect:** Handwritten code modifying data is very prone to logical and\nhuman errors. For example, multiple pipelines concurrently modifying the same table\nwithout any transactional support can lead to unpredictable data inconsistencies\nand in the worst case, data losses. Often, even a single handwritten pipeline can\neasily cause data corruptions due to errors in encoding the business logic.\n\n- **\u0007Hard to maintain:** Fundamentally such handwritten code is hard to understand,\nkeep track of and maintain. In the long term, this alone can significantly increase\nthe organizational and infrastructural costs.\n\n**Introducing MERGE in Delta Lake**\nWith Delta Lake, you can easily address the use cases above without any of the\naforementioned problems using the following MERGE command:\n\nMERGE INTO\n\nUSING\n\nON\n\n[ WHEN MATCHED [ AND ] THEN ]\n\n\n\n[ WHEN NOT MATCHED [ AND ] THEN ]\n\nwhere\n\n=\n\nDELETE |\n\nUPDATE SET - |\n\nUPDATE SET column1 = value1 [, column2 = value2 ...]\n\n=\n\nINSERT - |\n\nINSERT (column1 [, column2 ...]) VALUES (value1 [, value2 ...])\n\nLet’s understand how to use MERGE with a simple example. Suppose you have a\n[slowly changing dimension](https://en.wikipedia.org/wiki/Slowly_changing_dimension) table that maintains user information like addresses.\nFurthermore, you have a table of new addresses for both existing and new users. To\nmerge all the new addresses to the main user table, you can run the following:\n\nMERGE INTO users\n\nUSING updates\n\nON users.userId = updates.userId\n\nWHEN MATCHED THEN\n\nUPDATE SET address = updates.addresses\n\nWHEN NOT MATCHED THEN\nINSERT (userId, address) VALUES (updates.userId, updates.address)\n\nThis will perform exactly what the syntax says -- for existing users (i.e., MATCHED\nclause), it will update the address column, and for new users (i.e., NOT MATCHED\nclause) it will insert all the columns. For large tables with TBs of data, this Delta Lake\nMERGE operation can be orders of magnitude faster than overwriting entire partitions\nor tables since Delta Lake reads only relevant files and updates them. 
Specifically,\nDelta Lake's MERGE has the following advantages:\n\n\n\n[ WHEN MATCHED [ AND ] THEN ]\n\n\n-----\n\n**Simplifying use cases with MERGE**\n**Deleting data due to GDPR**\nComplying with the “right to be forgotten” clause of GDPR for data in data lakes cannot\nget any easier. You can set up a simple scheduled job with an example code, like\nbelow, to delete all the users who have opted out of your service.\n\nMERGE INTO users\n\nUSING opted_out_users\n\nON opted_out_users.userId = users.userId\n\nWHEN MATCHED THEN DELETE\n\n**Applying change data from databases**\nYou can easily apply all data changes — updates, deletes, inserts — generated from an\nexternal database into a Delta Lake table with the MERGE syntax as follows:\n\nMERGE INTO users\n\nUSING (\n\nSELECT userId, latest.address AS address, latest.deleted AS deleted FROM\n\n(\n\nSELECT userId, MAX(struct(TIME, address, deleted)) AS latest\n\nFROM changes GROUP BY userId\n\n)\n\n) latestChange\n\nON latestChange.userId = users.userId\n\nWHEN MATCHED AND latestChange.deleted = TRUE THEN\n\nDELETE\n\nWHEN MATCHED THEN\n\nUPDATE SET address = latestChange.address\n\nWHEN NOT MATCHED AND latestChange.deleted = FALSE THEN\n\nINSERT (userId, address) VALUES (userId, address)\n\n\n\n- **\u0007Fine-grained:** The operation rewrites data at the granularity of files and not\npartitions. This eliminates all the complications of rewriting partitions, updating\nthe Hive metastore with MSCK and so on.\n\n- **\u0007Efficient:** Delta Lake’s data skipping makes the MERGE efficient at finding files to\nrewrite, thus eliminating the need to hand-optimize your pipeline. Furthermore,\nDelta Lake with all its I/O and processing optimizations makes all the reading and\nwriting data by MERGE significantly faster than similar operations in Apache Spark.\n\n- **\u0007Transactional:** Delta Lake uses optimistic concurrency control to ensure that\nconcurrent writers update the data correctly with ACID transactions, and concurrent\nreaders always see a consistent snapshot of the data.\n\nHere is a visual explanation of how MERGE compares with handwritten pipelines.\n\n\n-----\n\n**Updating session information from streaming**\n**pipelines**\nIf you have streaming event data flowing in and if you want to sessionize the streaming\nevent data and incrementally update and store sessions in a Delta Lake table, you\ncan accomplish this using the foreachBatch in Structured Streaming and MERGE.\nFor example, suppose you have a Structured Streaming DataFrame that computes\nupdated session information for each user. 
You can start a streaming query that\napplies all the sessions update to a Delta Lake table as follows (Scala).\n\nstreamingSessionUpdatesDF.writeStream\n\n.foreachBatch { (microBatchOutputDF: DataFrame , batchId: Long ) =>\n\nmicroBatchOutputDF.createOrReplaceTempView(“updates”)\n\nmicroBatchOutputDF.sparkSession.sql(s”””\n\nMERGE INTO sessions\n\nUSING updates\n\nON sessions.sessionId = updates.sessionId\n\nWHEN MATCHED THEN UPDATE SET *\n\nWHEN NOT MATCHED THEN INSERT * “”” )\n\n}.start()\n\nFor a complete working example of each Batch and MERGE, see this notebook\n( [Azure](https://docs.azuredatabricks.net/_static/notebooks/merge-in-streaming.html) | [AWS](https://docs.databricks.com/_static/notebooks/merge-in-streaming.html) ).\n\n**Additional resources**\n\n[Tech Talk | Addressing GDPR and CCPA Scenarios With Delta Lake and Apache Spark](https://www.youtube.com/watch?v=tCPslvUjG1w)\n\n[Tech Talk | Using Delta as a Change Data Capture Source](https://www.youtube.com/watch?v=7y0AAQ6qX5w)\n\n[Simplifying Change Data Capture With Databricks Delta](https://databricks.com/blog/2018/10/29/simplifying-change-data-capture-with-databricks-delta.html)\n\n[Building Sessionization Pipeline at Scale With Databricks Delta](https://databricks.com/session/building-sessionization-pipeline-at-scale-with-databricks-delta)\n\n[Tech Chat | Slowly Changing Dimensions (SCD) Type 2](https://www.youtube.com/watch?v=HZWwZG07hzQ)\n\n\n-----\n\n**Simple, Reliable Upserts and**\n**Deletes on Delta Lake Tables**\n**Using Python APIs**\n\nIn this chapter, we will demonstrate how to use Python and the new Python APIs in Delta\nLake within the context of an on-time flight performance scenario. We will show how\nto upsert and delete data, query old versions of data with time travel, and vacuum\nolder versions for cleanup.\n\n**How to start using Delta Lake**\nThe Delta Lake package is installable through PySpark by using the --packages\noption. In our example, we will also demonstrate the ability to VACUUM files and execute\nDelta Lake SQL commands within Apache Spark. As this is a short demonstration, we\nwill also enable the following configurations:\n\n\u0007spark.databricks.delta.retentionDurationCheck.enabled=false\n\nto allow us to vacuum files shorter than the default retention duration of seven days.\nNote, this is only required for the SQL command VACUUM\n\n\u0007spark.sql.extensions=io.delta.sql.DeltaSparkSessionExtension\n\nto enable Delta Lake SQL commands within Apache Spark; this is not required for\nPython or Scala API calls.\n\n# Using Spark Packages\n\n./bin/pyspark --packages io.delta:delta-core_2.11:0.4.0 --conf “spark.\n\ndatabricks.delta.retentionDurationCheck.enabled=false” --conf “spark.\n\nsql.extensions=io.delta.sql.DeltaSparkSessionExtension”\n\n\n-----\n\n**Loading and saving our Delta Lake data**\nThis scenario will be using the On-Time Flight Performance or Departure Delays data\nset generated from the RITA BTS Flight Departure Statistics; some examples of this data\nin action include the and OnTime Flight Performance with GraphFrames for Apache Spark™. 
Within PySpark, start [2014 Flight Departure Performance via d3.js Crossfilter](https://dennyglee.com/2014/06/06/2014-flight-departure-performance-via-d3-js-crossfilter/)\nby reading the data set.\n\n\u0007# Location variables\n\n\n/departureDelays.delta$ ls l\n\n.\n\n..\n\n_delta_log\n\npart- 00000 -df6f69ea-e6aa- 424b -bc0e-f3674c4f1906-c000.snappy.parquet\n\npart- 00001 -711bcce3-fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n\npart- 00002 - 778ba97d - 89b8 - 4942 -a495- 5f6238830b68 -c000.snappy.parquet\n\nPart- 00003 - 1a791c4a-6f11-49a8 -8837-8 093a3220581 -c000.snappy.parquet\n\n\ntripdelaysFilePath = “/root/data/departuredelays.csv”\n\npathToEventsTable = “/root/deltalake/departureDelays.delta”\n\nNow, let’s reload the data, but this time our DataFrame will be backed by Delta Lake.\n\n# Read flight delay data\n\n\ndepartureDelays = spark.read \\\n\n.option( “header” , “true” ) \\\n\n.option( “inferSchema” , “true” ) \\\n\n.csv(tripdelaysFilePath)\n\nNext, let’s save our departureDelays data set to a Delta Lake table. By saving this table\nto Delta Lake storage, we will be able to take advantage of its features including ACID\ntransactions, unified batch and streaming and time travel.\n\n# Save flight delay data into Delta Lake format\n\ndepartureDelays \\\n\n.write \\\n\n\n# Load flight delay data in Delta Lake format\n\ndelays_delta = spark \\\n\n.read \\\n\n.format( “delta” ) \\\n\n.load( “departureDelays.delta” )\n\n# Create temporary view\n\ndelays_delta.createOrReplaceTempView(“delays_delta”)\n\n# How many flights are between Seattle and San Francisco\n\nspark.sql(“select count(1) from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’”).show()\n\n\n.format( “delta” ) \\\n\n.mode( “overwrite” ) \\\n\n.save( “departureDelays.delta” )\n\nNote, this approach is similar to how you would normally save Parquet data; instead\nof specifying format(“parquet”) , you will now specify format(“delta”) . If\nyou were to take a look at the underlying file system, you will notice four files created\nfor the departureDelays Delta Lake table.\n\n\n-----\n\nFinally, lets determine the number of flights originating from Seattle to San Francisco; in\nthis data set, there are 1698 flights.\n\n**In-place conversion to Delta Lake**\nIf you have existing Parquet tables, you have the ability to convert them to Delta Lake\nformat in place, thus not needing to rewrite your table. To convert the table, you can\nrun the following commands.\n\n\ndeltaTable DeltaTable .forPath(spark, pathToEventsTable\n\n)\n\n# Delete all on-time and early flights\n\ndeltaTable. delete ( “delay < 0” )\n\n# How many flights are between Seattle and San Francisco\n\nspark.sql( “select count(1) from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’” ).show()\n\n\nfrom delta.tables import - \n\n# Convert non partitioned parquet table at path ‘/path/to/table’\n\ndeltaTable = DeltaTable .convertToDelta(spark, “parquet.`/path/to/\n\ntable`” )\n\n# Convert partitioned parquet table at path ‘/path/to/table’ and\n\npartitioned by integer column named ‘part’\n\n\nAfter we delete (more on this below) all of the on-time and early flights, as you can\nsee from the preceding query there are 837 late flights originating from Seattle to\nSan Francisco. 
If you review the file system, you will notice there are more files even\nthough you deleted data.\n\n/departureDelays.delta$ ls -l\n\n_delta_log\n\npart- 00000 -a2a19ba4- 17e 9- 4931 - 9bbf - 3c9d4997780 b-c000.snappy.parquet\n\npart-00000-df6f69ea-e6aa-424b-bc0e-f3674c4f1906-c000.snappy.parquet\n\npart- 00001 - 711bcce3 -fe9e- 466e -a22c- 8256f8b54930 -c000.snappy.parquet\n\npart- 00001 -a0423a18- 62eb - 46b3 -a82f-ca9aac1f1e93-c000.snappy.parquet\n\npart- 00002 - 778ba97d - 89b8 - 4942 -a495-5f6238830b68-c000.snappy.parquet\n\npart- 00002 -bfaa0a2a- 0a31 - 4abf -aa63- 162402f802cc -c000.snappy.parquet\n\npart- 00003 - 1a791c4a - 6f11 - 49a8 -8837- 8093a3220581 -c000.snappy.parquet\n\npart- 00003 -b0247e1d-f5ce- 4b45 - 91cd - 16413c784a66 -c000.snappy.parquet\n\n\npartitionedDeltaTable = DeltaTable .convertToDelta(spark,\n\n“parquet.`/path/to/table`”, “part int” )\n\n**Delete our flight data**\nTo delete data from a traditional data lake table, you will need to:\n\n1. Select all of the data from your table not including the rows you want to delete\n2. Create a new table based on the previous query\n3. Delete the original table\n4. Rename the new table to the original table name for downstream dependencies\n\nInstead of performing all of these steps, with Delta Lake, we can simplify this process\nby running a DELETE statement. To show this, let’s delete all of the flights that had\narrived early or on-time (i.e., delay < 0).\n\n\nfrom delta.tables import - \n\nfrom pyspark.sql.functions import - \n\n# Access the Delta Lake table\n\n\n-----\n\nIn traditional data lakes, deletes are performed by rewriting the entire table\nexcluding the values to be deleted. With Delta Lake, deletes are instead performed\nby selectively writing new versions of the files containing the data to be deleted and\nonly marks the previous files as deleted. This is because Delta Lake uses multiversion\nconcurrency control (MVCC) to do atomic operations on the table: For example, while\none user is deleting data, another user may be querying the previous version of the\ntable. This multiversion model also enables us to travel back in time (i.e., [time travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) )\nand query previous versions as we will see later.\n\n**Update our flight data**\nTo update data from your traditional Data Lake table, you will need to:\n\n1. Select all of the data from your table not including the rows you want to modify\n2. Modify the rows that need to be updated/changed\n3. Merge these two tables to create a new table\n4. Delete the original table\n5. Rename the new table to the original table name for downstream dependencies\n\nInstead of performing all of these steps, with Delta Lake, we can simplify this\nprocess by running an UPDATE statement. To show this, let’s update all of the flights\noriginating from Detroit to Seattle.\n\n\nWith the Detroit flights now tagged as Seattle flights, we now have 986 flights\noriginating from Seattle to San Francisco. If you were to list the file system for\nyour departureDelays folder (i.e., $../departureDelays/ls -l ), you will\nnotice there are now 11 files (instead of the 8 right after deleting the files and the four\nfiles after creating the table).\n\n**Merge our flight data**\nA common scenario when working with a data lake is to continuously append data to\nyour table. 
This often results in duplicate data (rows you do not want to be inserted\ninto your table again), new rows that need to be inserted, and some rows that need to\nbe updated. With Delta Lake, all of this can be achieved by using the merge operation\n(similar to the SQL MERGE statement).\n\nLet’s start with a sample data set that you will want to be updated, inserted or\nde-duplicated with the following query.\n\n\n# Update all flights originating from Detroit to now be\n\n\noriginating from Seattle\n\ndeltaTable.update(“origin = ‘DTW’”, { “origin”: “’SEA’” } )\n\n\n# What flights between SEA and SFO for these date periods\n\nspark.sql( “select * from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n\n\n# How many flights are between Seattle and San Francisco\n\n\nThe output of this query looks like the following table. Note, the color-coding has been\nadded to clearly identify which rows are de-duplicated (blue), updated (yellow) and\ninserted (green).\n\n\nspark.sql( “select count(1) from delays_delta where origin = ‘SEA’\n\nand destination = ‘SFO’” ).show()\n\n\n-----\n\nNext, let’s generate our own merge_table that contains data we will insert, update\nor de-duplicate with the following code snippet.\n\nitems = [( 1010710 , 31 , 590 , ‘SEA’, ‘SFO’), ( 1010521 , 10 , 590 ,\n\n‘SEA’ , ‘SFO’ ),\n\n(1010822, 31, 590, ‘SEA’, ‘SFO’)]\n\n\nWith Delta Lake, this can be easily achieved via a merge statement as noted in the\nfollowing code snippet.\n\n# Merge merge_table with flights\n\ndeltaTable. alias( “flights” ) \\\n\n.merge(merge_table. alias ( “updates”),”flights.date =\n\nupdates.date” ) \\\n\n.whenMatchedUpdate(set = { “delay” : “updates.delay” } ) \\\n\n.whenNotMatchedInsertAll() \\\n\n.execute()\n\n# What flights between SEA and SFO for these date periods\n\nspark.sql( “select * from delays_delta where origin = ‘SEA’ and\n\ndestination = ‘SFO’ and date like ‘1010%’ limit 10” ).show()\n\n\ncols = [ ‘date’ , ‘delay’ , ‘distance’ , ‘origin’ , ‘destination’ ]\n\n\nmerge_table = spark.createDataFrame(items, cols)\n\nmerge_table.toPandas()\n\nIn the preceding table ( merge_table ), there are three rows with a unique date value:\n\n1. 1010521: This row needs to _update_ the _flights_ table with a new delay value (yellow)\n2. 1010710: This row is a _duplicate_ (blue)\n3. 1010832: This is a new row to be _inserted_ (green)\n\n\nAll three actions of de-duplication, update and insert were efficiently completed with\none statement.\n\n**View table history**\nAs previously noted, after each of our transactions (delete, update), there were more\nfiles created within the file system. 
This is because for each transaction, there are\ndifferent versions of the Delta Lake table.\n\n\n-----\n\nThis can be seen by using the DeltaTable.history() method as noted below\n\nNote: You can also perform the same task with SQL:\n\nspark.sql(“DESCRIBE HISTORY ‘” + pathToEventsTable + “’”).show()\n\nAs you can see, there are three rows representing the different versions of the table\n(below is an abridged version to help make it easier to read) for each of the operations\n(create table, delete and update):\n\n**Travel back in time with table history**\nWith Time Travel, you can review the Delta Lake table as of the version or timestamp.\nTo view historical data, specify the version or timestamp option; in the following code\nsnippet, we will specify the version option.\n\n\n# Load DataFrames for each version\n\ndfv0 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n0 ).load( “departureDelays.delta” )\n\ndfv1 = spark.read.format(“delta”).option( “versionAsOf” ,\n\n1 ).load( “departureDelays.delta” )\n\ndfv2 = spark.read.format( “delta” ).option( “versionAsOf” ,\n\n2 ).load( “departureDelays.delta” )\n\n# Calculate the SEA to SFO flight counts for each version of history\n\ncnt0 = dfv0. where( “origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\ncnt1 = dfv1. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\ncnt2 = dfv2. where (“origin = ‘SEA’”). where ( “destination = ‘SFO’” ).count()\n\n# Print out the value\n\nprint ( “SEA -> SFO Counts: Create Table: %s, Delete: %s, Update: %s” %\n\n(cnt0, cnt1, cnt2))\n\n## Output\n\nSEA -> SFO Counts : Create Table: 1698 , Delete: 837, Update: 986\n\nWhether for governance, risk management and compliance (GRC) or rolling back\nerrors, the Delta Lake table contains both the metadata (e.g., recording the fact that a\ndelete had occurred with these operators) and data (e.g., the actual rows deleted). But\nhow do we remove the data files either for compliance or size reasons?\n\n**Clean up old table versions with vacuum**\nThe [Delta Lake vacuum](https://docs.delta.io/0.7.0/delta-utility.html#vacuum) method will delete all of the rows (and files) by default that are\nolder than seven days’ reference. 
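If you want to see which files a vacuum would remove before deleting anything, the SQL form also accepts a DRY RUN clause. The sketch below reuses the pathToEventsTable variable defined earlier in this chapter and relies on the retentionDurationCheck setting shown at the start of it to allow a zero-hour retention; treat it as an optional preview step rather than part of the original walkthrough.\n\n# Preview the files that a zero-hour vacuum would delete, without removing anything\nspark.sql("VACUUM '" + pathToEventsTable + "' RETAIN 0 HOURS DRY RUN").show(truncate=False)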
If you were to view the file system, you'll notice the 11 files for your table.\n\n/departureDelays.delta$ ls -l\n_delta_log\npart-00000-5e52736b-0e63-48f3-8d56-50f7cfa0494d-c000.snappy.parquet\npart-00000-69eb53d5-34b4-408f-a7e4-86e000428c37-c000.snappy.parquet\npart-00000-f8edaf04-712e-4ac4-8b42-368d0bbdb95b-c000.snappy.parquet\npart-00001-20893eed-9d4f-4c1f-b619-3e6ea1fdd05f-c000.snappy.parquet\npart-00001-9b68b9f6-bad3-434f-9498-f92dc4f503e3-c000.snappy.parquet\npart-00001-d4823d2e-8f9d-42e3-918d-4060969e5844-c000.snappy.parquet\npart-00002-24da7f4e-7e8d-40d1-b664-95bf93ffeadb-c000.snappy.parquet\npart-00002-3027786c-20a9-4b19-868d-dc7586c275d4-c000.snappy.parquet\npart-00002-f2609f27-3478-4bf9-aeb7-2c78a05e6ec1-c000.snappy.parquet\npart-00003-850436a6-c4dd-4535-a1c0-5dc0f01d3d55-c000.snappy.parquet\npart-00003-b9292122-99a7-4223-aaa9-8646c281f199-c000.snappy.parquet\n\nTo delete all of the files so that you only keep the current snapshot of data, you will specify a small value for the vacuum method (instead of the default retention of 7 days).\n\n# Remove all files older than 0 hours old\ndeltaTable.vacuum(0)\n\nNote, you can also perform the same task via SQL syntax:\n\n# Remove all files older than 0 hours old\nspark.sql("VACUUM '" + pathToEventsTable + "' RETAIN 0 HOURS")\n\nOnce the vacuum has completed, when you review the file system you will notice fewer files as the historical data has been removed.\n\n/departureDelays.delta$ ls -l\n_delta_log\npart-00000-f8edaf04-712e-4ac4-8b42-368d0bbdb95b-c000.snappy.parquet\npart-00001-9b68b9f6-bad3-434f-9498-f92dc4f503e3-c000.snappy.parquet\npart-00002-24da7f4e-7e8d-40d1-b664-95bf93ffeadb-c000.snappy.parquet\npart-00003-b9292122-99a7-4223-aaa9-8646c281f199-c000.snappy.parquet\n\nNote, the ability to time travel back to a version older than the retention period is lost after running vacuum.\n\n\n-----\n\n**Time Travel for Large-Scale Data Lakes**\n\nTime travel capabilities are available in [Delta Lake](https://databricks.com/product/delta-lake-on-databricks). [Delta Lake](https://delta.io/) is an [open-source storage layer](https://github.com/delta-io/delta) that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable metadata handling, and unifies streaming and batch data processing. Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs.\n\nWith this feature, Delta Lake automatically versions the big data that you store in your data lake, and you can access any historical version of that data. This temporal data management simplifies your data pipeline by making it easy to audit, roll back data in case of accidental bad writes or deletes, and reproduce experiments and reports.\n\nYour organization can finally standardize on a clean, centralized, versioned big data repository in your own cloud storage for your analytics.\n\n**Common challenges with changing data**\n\n- **Audit data changes:** Auditing data changes is critical both in terms of data compliance as well as simple debugging to understand how data has changed over time.
Organizations moving from traditional data systems to big data technologies and the cloud struggle in such scenarios.\n\n- **Reproduce experiments and reports:** During model training, data scientists run various experiments with different parameters on a given set of data. When scientists revisit their experiments after a period of time to reproduce the models, typically the source data has been modified by upstream pipelines. A lot of times, they are caught unaware by such upstream data changes and hence struggle to reproduce their experiments. Some scientists and organizations engineer best practices by creating multiple copies of the data, leading to increased storage costs. The same is true for analysts generating reports.\n\n- **Rollbacks:** Data pipelines can sometimes write bad data for downstream consumers. This can happen because of issues ranging from infrastructure instabilities to messy data to bugs in the pipeline. For pipelines that do simple appends to directories or a table, rollbacks can easily be addressed by date-based partitioning. With updates and deletes, this can become very complicated, and data engineers typically have to engineer a complex pipeline to deal with such scenarios.\n\n**Working with Time Travel**\nDelta Lake's time travel capabilities simplify building data pipelines for the above use cases. Time Travel in Delta Lake improves developer productivity tremendously. It helps:\n\n- Data scientists manage their experiments better\n\n- Data engineers simplify their pipelines and roll back bad writes\n\n- Data analysts do easy reporting\n\nOrganizations can finally standardize on a clean, centralized, versioned big data repository in their own cloud storage for analytics. We are thrilled to see what you will be able to accomplish with this feature.\n\nAs you write into a Delta Lake table or directory, every operation is automatically versioned. You can access the different versions of the data two different ways:\n\n**1. Using a timestamp**\n**Scala syntax**\nYou can provide the timestamp or date string as an option to the DataFrame reader:\n\nval df = spark.read\n  .format("delta")\n  .option("timestampAsOf", "2019-01-01")\n  .load("/path/to/my/table")\n\n**Python syntax**\n\ndf = spark.read \\\n  .format("delta") \\\n  .option("timestampAsOf", "2019-01-01") \\\n  .load("/path/to/my/table")\n\n**SQL syntax**\n\nSELECT count(*) FROM my_table TIMESTAMP AS OF "2019-01-01"\nSELECT count(*) FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1)\nSELECT count(*) FROM my_table TIMESTAMP AS OF "2019-01-01 01:30:00.000"\n\nIf the reader code is in a library that you don't have access to, and if you are passing input parameters to the library to read data, you can still travel back in time for a table by passing the timestamp in yyyyMMddHHmmssSSS format to the path:\n\nval inputPath = "/path/to/my/table@20190101000000000"\nval df = loadData(inputPath)\n\n// Function in a library that you don't have access to\ndef loadData(inputPath: String): DataFrame = {\n  spark.read\n    .format("delta")\n    .load(inputPath)\n}\n\ninputPath = "/path/to/my/table@20190101000000000"\ndf = loadData(inputPath)\n\n# Function in a library that you don't have access to\ndef loadData(inputPath):\n  return spark.read \\\n    .format("delta") \\\n    .load(inputPath)
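Since the timestamp travels inside the path itself, it can be built programmatically before being handed to the library. A minimal sketch in Python (the path and date are illustrative):\n\nfrom datetime import datetime\n\n# Build the @-suffixed path described above (timestamp in yyyyMMddHHmmssSSS format)\nts = datetime(2019, 1, 1).strftime("%Y%m%d%H%M%S") + "000"  # append milliseconds\ninputPath = "/path/to/my/table@" + ts\n\ndf = loadData(inputPath)  # the untouched library function now reads the historical version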
**2. Using a version number**\nIn Delta Lake, every write has a version number, and you can use the version number to travel back in time as well.\n\n**Scala syntax**\n\nval df = spark.read\n  .format("delta")\n  .option("versionAsOf", "5238")\n  .load("/path/to/my/table")\n\nval df = spark.read\n  .format("delta")\n  .load("/path/to/my/table@v5238")\n\n**Python syntax**\n\ndf = spark.read \\\n  .format("delta") \\\n  .option("versionAsOf", "5238") \\\n  .load("/path/to/my/table")\n\ndf = spark.read \\\n  .format("delta") \\\n  .load("/path/to/my/table@v5238")\n\n**SQL syntax**\n\nSELECT count(*) FROM my_table VERSION AS OF 5238\n\n**Audit data changes**\nYou can look at the history of table changes using the DESCRIBE HISTORY command or through the UI.\n\n**Reproduce experiments and reports**\nTime travel also plays an important role in machine learning and data science. Reproducibility of models and experiments is a key consideration for data scientists because they often create hundreds of models before they put one into production, and in that time-consuming process would like to go back to earlier models. However, because data management is often separate from data science tools, this is really hard to accomplish.\n\nDatabricks solves this reproducibility problem by integrating Delta Lake's Time Travel capabilities with [MLflow](https://mlflow.org/), an open-source platform for the machine learning lifecycle. For reproducible machine learning training, you can simply log a timestamped URL to the path as an MLflow parameter to track which version of the data was used for each training job.\n\nThis enables you to go back to earlier settings and data sets to reproduce earlier models. You neither need to coordinate with upstream teams on the data nor worry about cloning data for different experiments. This is the power of unified analytics, whereby data science is closely married with data engineering.\n\n**Rollbacks**\nTime travel also makes it easy to do rollbacks in case of bad writes. For example, if your GDPR pipeline job had a bug that accidentally deleted user information, you can easily fix the pipeline:\n\nINSERT INTO my_table\nSELECT * FROM my_table TIMESTAMP AS OF date_sub(current_date(), 1)\nWHERE userId = 111\n\nYou can also fix incorrect updates as follows:\n\n-- Will use the latest version of the table for all operations below\nMERGE INTO my_table target\nUSING my_table TIMESTAMP AS OF date_sub(current_date(), 1) source\nON source.userId = target.userId\nWHEN MATCHED THEN UPDATE SET *\n\nIf you simply want to roll back to a previous version of your table, you can do so with either of the following commands:\n\nRESTORE TABLE my_table VERSION AS OF [version_number]\nRESTORE TABLE my_table TIMESTAMP AS OF [timestamp]
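In newer releases of Delta Lake (1.2.0 and above), the same rollback is also available from the Python API; a minimal sketch, assuming the table is registered as my_table and the version number is illustrative:\n\nfrom delta.tables import DeltaTable\n\n# Roll the table back to an earlier state\ndeltaTable = DeltaTable.forName(spark, "my_table")\ndeltaTable.restoreToVersion(5238)\n# deltaTable.restoreToTimestamp("2019-01-01")  # or restore by timestamp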
**Pinned view of a continuously updating Delta Lake table across multiple downstream jobs**\nWith AS OF queries, you can now pin the snapshot of a continuously updating Delta Lake table for multiple downstream jobs. Consider a situation where a Delta Lake table is being continuously updated, say every 15 seconds, and there is a downstream job that periodically reads from this Delta Lake table and updates different destinations. In such scenarios, typically you want a consistent view of the source Delta Lake table so that all destination tables reflect the same state.\n\nYou can now easily handle such scenarios as follows:\n\nversion = spark.sql("SELECT max(version) FROM (DESCRIBE HISTORY my_table)").collect()\n\ndata = spark.table("my_table@v%s" % version[0][0])\ndata.where("event_type = e1").write.jdbc("table1")\ndata.where("event_type = e2").write.jdbc("table2")\n...\ndata.where("event_type = e10").write.jdbc("table10")\n\n**Queries for time series analytics made simple**\nTime travel also simplifies time series analytics. For example, if you want to find out how many new customers you added over the last week, your query could be a very simple one like this:\n\nSELECT count(distinct userId) - (\n  SELECT count(distinct userId)\n  FROM my_table TIMESTAMP AS OF date_sub(current_date(), 7))\nFROM my_table
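The same calculation can be written with the DataFrame reader shown earlier; a minimal sketch, with an illustrative table path and the cutoff date computed in Python:\n\nimport datetime\n\ntable_path = "/path/to/my/table"\nweek_ago = (datetime.date.today() - datetime.timedelta(days=7)).isoformat()\n\ncurrent = spark.read.format("delta").load(table_path)\nprevious = spark.read.format("delta").option("timestampAsOf", week_ago).load(table_path)\n\n# New customers = distinct users now minus distinct users as of a week ago\nnew_customers = (current.select("userId").distinct().count()\n                 - previous.select("userId").distinct().count())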
**Additional resources**\n\n[Tech Talk | Diving Into Delta Lake: Unpacking the Transaction Log](https://databricks.com/discover/diving-into-delta-lake-talks/unpacking-transaction-log)\n\n[Tech Talk | Getting Data Ready for Data Science With Delta Lake and MLflow](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks/getting-data-ready-data-science-delta-lake-mlflow)\n\n[Data + AI Summit Europe 2020 | Data Time Travel by Delta Time Machine](https://databricks.com/session_eu20/data-time-travel-by-delta-time-machine-2)\n\n[Spark + AI Summit NA 2020 | Machine Learning Data Lineage With MLflow and Delta Lake](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n\n[Productionizing Machine Learning With Delta Lake](https://databricks.com/blog/2019/08/14/productionizing-machine-learning-with-delta-lake.html)\n\n\n-----\n\n**Easily Clone Your Delta Lake for Testing, Sharing and ML Reproducibility**\n\nDelta Lake has a feature called **Table Cloning**, which makes it easy to test, share and recreate tables for ML reproducibility. Creating copies of tables in a data lake or data warehouse has several practical uses. However, given the volume of data in tables in a data lake and the rate of its growth, making physical copies of tables is an expensive operation.\n\nDelta Lake now makes the process simpler and cost-effective with the help of table clones.\n\n**What are clones?**\nClones are replicas of a source table at a given point in time. They have the same metadata as the source table: same schema, constraints, column descriptions, statistics and partitioning. However, they behave as a separate table with a separate lineage or history. Any changes made to clones only affect the clone and not the source. Any changes that happen to the source during or after the cloning process also do not get reflected in the clone due to Snapshot Isolation. In Delta Lake we have two types of clones: shallow or deep.\n\n**Shallow clones**\nA _shallow_ (also known as a Zero-Copy) clone only duplicates the metadata of the table being cloned; the data files of the table itself are not copied. This type of cloning does not create another physical copy of the data, resulting in minimal storage costs. Shallow clones are inexpensive and can be extremely fast to create.\n\nThese clones are not self-contained and depend on the source from which they were cloned as the source of data. If the files in the source that the clone depends on are removed, for example with VACUUM, a shallow clone may become unusable. Therefore, shallow clones are typically used for short-lived use cases such as testing and experimentation.\n\n**Deep clones**\nShallow clones are great for short-lived use cases, but some scenarios require a separate and independent copy of the table's data. A deep clone makes a full copy of the metadata and the data files of the table being cloned. In that sense, it is similar in functionality to copying with a CTAS command (CREATE TABLE... AS... SELECT...). But it is simpler to specify since it makes a faithful copy of the original table at the specified version, and you don't need to re-specify partitioning, constraints and other information as you have to do with CTAS. In addition, it is much faster, more robust and can work in an incremental manner against failures.\n\nWith deep clones, we copy additional metadata, such as your streaming application transactions and COPY INTO transactions, so you can continue your ETL applications exactly where they left off on a deep clone.\n\n**Where do clones help?**\nSometimes I wish I had a clone to help with my chores or magic tricks. However, we're not talking about human clones here. There are many scenarios where you need a copy of your data sets — for exploring, sharing or testing ML models or analytical queries. Below are some examples of customer use cases.\n\n**Testing and experimentation with a production table**\nWhen users need to test a new version of their data pipeline they often have to rely on sample test data sets that are not representative of all the data in their production environment. Data teams may also want to experiment with various indexing techniques to improve the performance of queries against massive tables. These experiments and tests cannot be carried out in a production environment without risking production data processes and affecting users.\n\nIt can take many hours or even days to spin up copies of your production tables for a test or a development environment. Add to that the extra storage costs for your development environment to hold all the duplicated data — there is a large overhead in setting up a test environment reflective of the production data. With a shallow clone, this is trivial:\n\n-- SQL\nCREATE TABLE delta.`/some/test/location` SHALLOW CLONE prod.events\n\n# Python\nDeltaTable.forName(spark, "prod.events").clone("/some/test/location", isShallow=True)\n\n// Scala\nDeltaTable.forName(spark, "prod.events").clone("/some/test/location", isShallow=true)\n\nAfter creating a shallow clone of your table in a matter of seconds, you can start running a copy of your pipeline to test out your new code, or try optimizing your table in different dimensions to see how you can improve your query performance, and much much more.
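To confirm what the clone points at before experimenting, it can help to inspect its metadata with the Delta DESCRIBE DETAIL command; a minimal sketch, reusing the path from the example above:\n\n# Inspect the newly created clone's metadata\nspark.sql("DESCRIBE DETAIL delta.`/some/test/location`") \\\n    .select("format", "location", "numFiles", "sizeInBytes") \\\n    .show(truncate=False)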
These changes will only affect your shallow clone, not your original table.\n\n**Staging major changes to a production table**\nSometimes, you may need to perform some major changes to your production table.\nThese changes may consist of many steps, and you don’t want other users to see the\nchanges that you’re making until you’re done with all of your work. A shallow clone can\nhelp you out here:\n\n\n-----\n\n-- SQL\n\nCREATE TABLE temp.staged_changes SHALLOW CLONE prod.events;\n\nDELETE FROM temp.staged_changes WHERE event_id is null;\n\nUPDATE temp.staged_changes SET change_date = current_date()\n\nWHERE change_date is null;\n\n...\n\n-- Perform your verifications\n\nOnce you’re happy with the results, you have two options. If no other change has\nbeen made to your source table, you can replace your source table with the clone.\nIf changes have been made to your source table, you can merge the changes into\nyour source table.\n\n-- If no changes have been made to the source\n\nREPLACE TABLE prod.events CLONE temp.staged_changes;\n\n-- If the source table has changed\n\nMERGE INTO prod.events USING temp.staged_changes\n\nON events.event_id <=> staged_changes.event_id\n\nWHEN MATCHED THEN UPDATE SET *;\n\n-- Drop the staged table\n\nDROP TABLE temp.staged_changes;\n\n**Machine learning result reproducibility**\nComing up with an effective ML model is an iterative process. Throughout this process\nof tweaking the different parts of the model, data scientists need to assess the\naccuracy of the model against a fixed data set.\n\nThis is hard to do in a system where the data is constantly being loaded or updated. A\nsnapshot of the data used to train and test the model is required. This snapshot allows\nthe results of the ML model to be reproducible for testing or model governance purposes.\n\n\n-----\n\nWe recommend leveraging [Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) to run multiple experiments across a snapshot; an\nexample of this in action can be seen in [Machine Learning Data Lineage With MLflow](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n[and Delta Lake.](https://databricks.com/session_na20/machine-learning-data-lineage-with-mlflow-and-delta-lake)\n\nOnce you’re happy with the results and would like to archive the data for later retrieval,\nfor example, next Black Friday, you can use deep clones to simplify the archiving process.\nMLflow integrates really well with Delta Lake, and the autologging feature (mlflow.spark.\nautolog() ) will tell you which version of the table was used to run a set of experiments.\n\n# Run your ML workloads using Python and then\n\nDeltaTable.forName(spark, “feature_store”).cloneAtVersion(128, “feature_\n\nstore_bf2020”)\n\n**Data migration**\nA massive table may need to be moved to a new, dedicated bucket or storage system\nfor performance or governance reasons. 
The original table will not receive new\nupdates going forward and will be deactivated and removed at a future point in time.\nDeep clones make the copying of massive tables more robust and scalable.\n\n-- SQL\n\nCREATE TABLE delta.`zz://my-new-bucket/events` CLONE prod.events;\n\nALTER TABLE prod.events SET LOCATION ‘zz://my-new-bucket/events’;\n\nWith deep clones, since we copy your streaming application transactions and\nCOPY INTO transactions, you can continue your ETL applications from exactly where\nit left off after this migration!\n\n**Data sharing**\nIn an organization, it is often the case that users from different departments are\nlooking for data sets that they can use to enrich their analysis or models. You may\nwant to share your data with other users across the organization. But rather than\nsetting up elaborate pipelines to move the data to yet another store, it is often easier\nand economical to create a copy of the relevant data set for users to explore and\n\n\n-----\n\n**Looks awesome! Any gotchas?**\nJust to reiterate some of the gotchas mentioned above as a single list, here’s what you\nshould be wary of:\n\n- \u0007 \u0007Clones are executed on a snapshot of your data. Any changes that are made to\nthe source table after the cloning process starts will not be reflected in the\nclone.\n\n- \u0007 \u0007Shallow clones are not self-contained tables like deep clones. If the data is\ndeleted in the source table (for example through VACUUM), your shallow clone\nmay not be usable.\n\n- \u0007 \u0007Clones have a separate, independent history from the source table. Time travel\nqueries on your source table and clone may not return the same result.\n\n- \u0007 \u0007Shallow clones do not copy stream transactions or COPY INTO metadata. Use\ndeep clones to migrate your tables and continue your ETL processes from\nwhere it left off.\n\n**How can I use it?**\nShallow and deep clones support new advances in how data teams test and manage\ntheir modern cloud data lakes and warehouses. Table clones can help your team\nimplement production-level testing of their pipelines, fine-tune their indexing for optimal\nquery performance, create table copies for sharing — all with minimal overhead and\nexpense. If this is a need in your organization, we hope you will take table cloning for\na spin and give us your feedback — we look forward to hearing about new use cases and\nextensions you would like to see in the future.\n\n**Additional resource**\n\n[Simplifying Disaster Recovery With Delta Lake](https://databricks.com/session_na20/simplifying-disaster-recovery-with-delta-lake)\n\n\ntest the data to see if it is a fit for their needs without affecting your own production\nsystems. Here deep clones again come to the rescue.\n\n-- The following code can be scheduled to run at your convenience\n\nCREATE OR REPLACE TABLE data_science.events CLONE prod.events;\n\n**Data archiving**\nFor regulatory or archiving purposes, all data in a table needs to be preserved for a\ncertain number of years, while the active table retains data for a few months. If you\nwant your data to be updated as soon as possible, but you have a requirement to keep\ndata for several years, storing this data in a single table and performing time travel\nmay become prohibitively expensive.\n\nIn this case, archiving your data in a daily, weekly or monthly manner is a better\nsolution. 
The incremental cloning capability of deep clones will really help you here.\n\n-- The following code can be scheduled to run at your convenience\nCREATE OR REPLACE TABLE archive.events CLONE prod.events;\n\nNote that this table will have an independent history compared to the source table; therefore, time travel queries on the source table and the clone may return different results based on your frequency of archiving.\n\n\n-----\n\n**Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0**\n\nThe release of [Delta Lake 0.7.0](https://github.com/delta-io/delta/releases/tag/v0.7.0) coincided with the release of [Apache Spark 3.0](https://github.com/delta-io/delta/releases/tag/v0.7.0), thus enabling a new set of features that were simplified using Delta Lake from SQL. Here are some of the key features.\n\n**Support for SQL DDL commands to define tables in the [Hive metastore](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore)**\nYou can now define Delta tables in the [Hive](https://spark.apache.org/docs/latest/sql-data-sources-hive-tables.html#interacting-with-different-versions-of-hive-metastore) metastore and use the table name in all SQL operations when creating (or replacing) tables.\n\n**Create or replace tables**\n\n-- Create table in the metastore\nCREATE TABLE events (\n  date DATE,\n  eventId STRING,\n  eventType STRING,\n  data STRING)\nUSING DELTA\nPARTITIONED BY (date)\nLOCATION '/delta/events'\n\n-- If a table with the same name already exists, the table is replaced\n-- with the new configuration, else it is created\nCREATE OR REPLACE TABLE events (\n  date DATE,\n  eventId STRING,\n  eventType STRING,\n  data STRING)\nUSING DELTA\nPARTITIONED BY (date)\nLOCATION '/delta/events'\n\n**Explicitly alter the table schema**\n\n-- Alter table and schema\nALTER TABLE table_name ADD COLUMNS (\n  col_name data_type\n    [COMMENT col_comment]\n    [FIRST|AFTER colA_name],\n  ...)\n\nYou can also use the Scala/Java/Python APIs:\n\n- DataFrame.saveAsTable(tableName) and DataFrameWriterV2 APIs ([#307](https://github.com/delta-io/delta/issues/307)).\n\n- DeltaTable.forName(tableName) API to create instances of io.delta.tables.DeltaTable, which is useful for executing Update/Delete/Merge operations in Scala/Java/Python.
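The same table definition can also be produced from an existing DataFrame with the Python APIs mentioned above; a minimal sketch, assuming events_df is a DataFrame with the schema shown earlier:\n\nfrom delta.tables import DeltaTable\n\n# Create (or overwrite) a partitioned Delta table in the metastore from a DataFrame\nevents_df.write \\\n    .format("delta") \\\n    .partitionBy("date") \\\n    .mode("overwrite") \\\n    .saveAsTable("events")\n\n# Obtain a DeltaTable handle by name for update/delete/merge operations\nevents = DeltaTable.forName(spark, "events")\nevents.delete("date < '2017-01-01'")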
**Support for SQL Insert, Delete, Update and Merge**\nOne of the most frequent questions through our [Delta Lake Tech Talks](https://databricks.com/discover/diving-into-delta-lake-talks) was when would DML operations such as delete, update and merge be available in Spark SQL? Wait no more, these operations are now available in SQL! Below are examples of how you can write delete, update and merge (which covers insert, update, delete and de-duplication) operations using Spark SQL.\n\n-- Using append mode, you can atomically add new data to an existing Delta table\nINSERT INTO events SELECT * FROM newEvents\n\n-- To atomically replace all of the data in a table, you can use overwrite mode\nINSERT OVERWRITE events SELECT * FROM newEvents\n\n-- Delete events\nDELETE FROM events WHERE date < '2017-01-01'\n\n-- Update events\nUPDATE events SET eventType = 'click' WHERE eventType = 'click'\n\n-- Upsert data to a target Delta table using merge\nMERGE INTO events\nUSING updates\nON events.eventId = updates.eventId\nWHEN MATCHED THEN\n  UPDATE SET events.data = updates.data\nWHEN NOT MATCHED THEN\n  INSERT (date, eventId, data) VALUES (date, eventId, data)\n\nIt is worth noting that the merge operation in Delta Lake supports more advanced syntax than standard ANSI SQL syntax. For example, merge supports:\n\n- Delete actions -- Delete a target when matched with a source row. For example, "... WHEN MATCHED THEN DELETE ..."\n\n- Multiple matched actions with clause conditions -- Greater flexibility when target and source rows match. For example:\n\n...\nWHEN MATCHED AND events.shouldDelete THEN DELETE\nWHEN MATCHED THEN UPDATE SET events.data = updates.data\n\n- Star syntax -- Shorthand for setting a target column value with the similarly named source column. For example:\n\nWHEN MATCHED THEN UPDATE SET *\nWHEN NOT MATCHED THEN INSERT *\n-- equivalent to updating/inserting with events.date = updates.date,\n-- events.eventId = updates.eventId, events.data = updates.data\n\n**Automatic and incremental Presto/Athena manifest generation**\nAs noted in [Query Delta Lake Tables From Presto and Athena, Improved Operations Concurrency, and Merge Performance](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html), Delta Lake supports other processing engines to read Delta Lake by using manifest files; the manifest files contain the list of the most current version of files as of manifest generation. As described in the preceding chapter, you will need to:\n\n- Generate a Delta Lake manifest file\n\n- Configure Presto or Athena to read the generated manifests\n\n- Manually re-generate (update) the manifest file\n\nNew for Delta Lake 0.7.0 is the capability to update the manifest file automatically with the following command:\n\nALTER TABLE delta.`pathToDeltaTable`\nSET TBLPROPERTIES(\n  delta.compatibility.symlinkFormatManifest.enabled=true\n)
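The manual generation step listed above can also be run from Python when needed; a minimal sketch, reusing the pathToDeltaTable placeholder from the ALTER TABLE example above:\n\nfrom delta.tables import DeltaTable\n\n# One-off (re-)generation of the symlink manifest for Presto/Athena readers\ndeltaTable = DeltaTable.forPath(spark, pathToDeltaTable)\ndeltaTable.generate("symlink_format_manifest")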
**Configuring your table through table properties**\nWith the ability to set table properties on your table by using ALTER TABLE SET TBLPROPERTIES, you can enable, disable or configure many features of Delta Lake, such as automated manifest generation. For example, with [table properties](https://www.youtube.com/watch?v=o54YMz8zvCY), you can block deletes and updates in a Delta table using delta.appendOnly=true.\n\nYou can also easily control the history of your Delta Lake table retention by the following [properties](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html):\n\n- delta.logRetentionDuration: Controls how long the history for a table (i.e., transaction log history) is kept. By default, 30 days of history is kept, but you may want to alter this value based on your requirements (e.g., GDPR historical context)\n\n- delta.deletedFileRetentionDuration: Controls how long ago a file must have been deleted before being a candidate for VACUUM. By default, data files older than seven days are deleted.\n\nAs of Delta Lake 0.7.0, you can use ALTER TABLE SET TBLPROPERTIES to configure these properties.\n\nALTER TABLE delta.`pathToDeltaTable`\nSET TBLPROPERTIES(\n  delta.logRetentionDuration = "interval <interval>"\n  delta.deletedFileRetentionDuration = "interval <interval>"\n)\n\n**Support for adding user-defined metadata in Delta Lake table commits**\nYou can specify user-defined strings as metadata in commits made by Delta Lake table operations, either using the DataFrameWriter option userMetadata or the SparkSession configuration spark.databricks.delta.commitInfo.userMetadata.\n\nIn the following example, we are deleting a user (1xsdf1) from our data lake per user request. To ensure we associate the user's request with the deletion, we have also added the DELETE request ID into the userMetadata.\n\nSET spark.databricks.delta.commitInfo.userMetadata={\n  "GDPR":"DELETE Request 1x891jb23"\n};\nDELETE FROM user_table WHERE user_id = '1xsdf1'\n\nWhen reviewing the [history](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine) operations of the user table (user_table), you can easily identify the associated deletion request within the transaction log.\n\n**Other highlights**\nOther highlights for the Delta Lake 0.7.0 release include:\n\n- Support for Azure Data Lake Storage Gen2 — Spark 3.0 has support for Hadoop 3.2 libraries, which enables support for Azure Data Lake Storage Gen2.\n\n- Improved support for streaming one-time triggers — With Spark 3.0, we now ensure that a [one-time trigger](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup) (Trigger.Once) processes all outstanding data in a Delta Lake table in a single micro-batch even if rate limits are set with the DataStreamReader option maxFilesPerTrigger.\n\nThere were a lot of great questions during the AMA concerning Structured Streaming and using trigger.once. For more information, some good resources explaining this concept include:\n\n- [Running Streaming Jobs Once a Day for 10x Cost Savings](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n\n- [Beyond Lambda: Introducing Delta Architecture](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0): Specifically the cost vs. latency trade-off discussed here.\n\n**Additional resources**\n\n[Tech Talk | Delta Lake 0.7.0 + Spark 3.0 AMA](https://www.youtube.com/watch?v=xzKqjCB8SWU)\n\n[Tech Talks | Apache Spark 3.0 + Delta Lake](https://www.youtube.com/watch?v=x6RqJYqLoPI&list=PLTPXxbhUt-YWnAgh3RE8DOb46qZF57byx)\n\n[Enabling Spark SQL DDL and DML in Delta Lake on Apache Spark 3.0](https://databricks.com/blog/2020/08/27/enabling-spark-sql-ddl-and-dml-in-delta-lake-on-apache-spark-3-0.html)\n\n\n-----\n\n**Lakehouse**\nCombining the best elements of data\nlakes and data warehouses\n\n## CHAPTER 03\n\n\n-----\n\n**What Is a**\n**Lakehouse?**\n\nOver the past few years at Databricks, we've seen a new data management architecture that emerged independently across many customers and use cases: the **lakehouse.** In this chapter, we'll describe this new architecture and its
advantages over previous\napproaches.\n\nData warehouses have a long history of decision support and business intelligence\napplications. Since its inception in the late 1980s, data warehouse technology\ncontinued to evolve and MPP architectures led to systems that were able to handle\nlarger data sizes.\n\nBut while warehouses were great for structured data, a lot of modern enterprises\nhave to deal with unstructured data, semi-structured data, and data with high variety,\nvelocity and volume. Data warehouses are not suited for many of these use cases, and\nthey are certainly not the most cost-efficient.\n\nAs companies began to collect large amounts of data from many different sources,\narchitects began envisioning a single system to house data for many different\nanalytic products and workloads.\n\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\nin a variety of formats. While suitable for storing data, data lakes lack some critical\nfeatures: They do not support transactions, they do not enforce data quality, and their\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\n\n\n-----\n\n**A lakehouse combines the best elements**\n**of data lakes and data warehouses**\nA lakehouse is a new data architecture that combines the best elements of data lakes\nand data warehouses.\n\nLakehouses are enabled by a new system design: implementing similar data structures and data management features to those in a data warehouse, directly on the\nkind of low-cost storage used for data lakes. They are what you would get if you had\nto redesign data warehouses in the modern world, now that cheap and highly reliable\nstorage (in the form of object stores) are available.\n\nA lakehouse has the following key features:\n\n- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\nbe reading and writing data concurrently. Support for ACID transactions ensures\nconsistency as multiple parties concurrently read or write data, typically using SQL.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\nwarehouses.\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\nrequire systems for diverse data applications including SQL analytics, real-time\nmonitoring, data science and machine learning. Most of the recent advances in\nAI have been in better models to process unstructured data (text, images, video,\naudio), but these are precisely the types of data that a data warehouse is not\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\nwarehouses, and other specialized systems such as streaming, time-series, graph\nand image databases. Having a multitude of systems introduces complexity and,\nmore importantly, introduces delay as data professionals invariably need to move\nor copy data between different systems.\n\n\n-----\n\n**\u0007Schema enforcement and governance:** The lakehouse should have a way to\nsupport schema enforcement and evolution, supporting DW schema paradigms\nsuch as star/snowflake-schemas. 
The system should be able to [reason about data integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html), and it should have robust governance and auditing mechanisms.\n\n- **BI support:** Lakehouses enable using BI tools directly on the source data. This reduces staleness and improves recency, reduces latency and lowers the cost of having to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **Storage is decoupled from compute:** In practice, this means storage and compute use separate clusters, thus these systems are able to scale to many more concurrent users and larger data sizes. Some modern data warehouses also have this property.\n\n- **Openness:** The storage formats they use are open and standardized, such as Parquet, and they provide an API so a variety of tools and engines, including machine learning and Python/R libraries, can efficiently access the data directly.\n\n- **Support for diverse data types ranging from unstructured to structured data:** The lakehouse can be used to store, refine, analyze and access data types needed for many new data applications, including images, video, audio, semi-structured data, and text.\n\n- **Support for diverse workloads:** Including data science, machine learning and SQL analytics. Multiple tools might be needed to support all these workloads, but they all rely on the same data repository.\n\n- **End-to-end streaming:** Real-time reports are the norm in many enterprises. Support for streaming eliminates the need for separate systems dedicated to serving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional features. Tools for security and access control are basic requirements. Data governance capabilities including auditing, retention and lineage have become essential, particularly in light of recent privacy regulations. Tools that enable data discovery such as data catalogs and data usage metrics are also needed. With a lakehouse, such enterprise features only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**\n\n**Abstract**\nCloud object stores such as Amazon S3 are some of the largest and most cost-effective storage systems on the planet, making them an attractive target to store large data warehouses and data lakes. Unfortunately, their implementation as key-value stores makes it difficult to achieve ACID transactions and high performance: Metadata operations, such as listing objects, are expensive, and consistency guarantees are limited. In this paper, we present Delta Lake, an open source ACID table storage layer over cloud object stores initially developed at Databricks. Delta Lake uses a transaction log that is compacted into Apache Parquet format to provide ACID properties, time travel, and significantly faster metadata operations for large tabular data sets (e.g., the ability to quickly search billions of table partitions for those relevant to a query). It also leverages this design to provide high-level features such as automatic data layout optimization, upserts, caching, and audit logs. Delta Lake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and other systems.
Delta Lake is deployed at\nthousands of Databricks customers that process exabytes of data per day, with\nthe largest instances managing exabyte-scale data sets and billions of objects.\n\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu,\nMukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja Łuszczak,\nMichał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi,\nSameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n\nRead the full research paper on the [inner workings of the lakehouse](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores) [.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n\n\n-----\n\n**Some early examples**\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\nMicrosoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\nenables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\n[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\nexamples that focus primarily on BI and other SQL applications.\n\nCompanies that want to build and implement their own systems have access to open\nsource file formats (Delta Lake, [Apache Iceberg](https://iceberg.apache.org) , [Apache Hudi](https://hudi.apache.org) ) that are suitable for\nbuilding a lakehouse.\n\nMerging data lakes and data warehouses into a single system means that data teams\ncan move faster as they are able to use data without needing to access multiple systems.\nThe level of SQL support and integration with BI tools among these early lakehouses\nis generally sufficient for most enterprise data warehouses. Materialized views and\nstored procedures are available, but users may need to employ other mechanisms that\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\nimportant for “ [lift and shift scenarios](https://whatis.techtarget.com/definition/lift-and-shift) ,” which require systems that achieve semantics\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\nlibraries) for non-BI workloads like data science and machine learning. Data\nexploration and refinement are standard for many analytic and data science\napplications. Delta Lake is designed to let users incrementally improve the quality of\ndata in their lakehouse until it is ready for consumption.\n\n\nA note about technical building blocks. 
While distributed file systems can be\nused for the storage layer, object stores are more commonly used in lakehouses.\nObject stores provide low-cost, highly available storage that excels at massively\nparallel reads — an essential requirement for modern data warehouses.\n\n**From BI to AI**\nThe lakehouse is a new data management architecture that radically simplifies\nenterprise data infrastructure and accelerates innovation in an age when\nmachine learning is poised to disrupt every industry. In the past, most of the\ndata that went into a company’s products or decision-making was structured\ndata from operational systems, whereas today, many products incorporate\nAI in the form of computer vision and speech models, text mining and others.\nWhy use a lakehouse instead of a data lake for AI? A lakehouse gives you data\nversioning, governance, security and ACID properties that are needed even for\nunstructured data.\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\nnotebooks) over others so lakehouses will also need to improve their UX and their\nconnectors to popular tools so they can appeal to a variety of personas. These\nand other issues will be addressed as the technology continues to mature and\ndevelop. Over time, lakehouses will close these gaps while retaining the core\nproperties of being simpler, more cost-efficient and more capable of serving\ndiverse data applications.\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\nadopting the lakehouse pattern. The blog created a massive amount of interest\nfrom technology enthusiasts. While lots of people praised it as the next-generation\ndata architecture, some people thought the lakehouse is the same thing as\nthe data lake. Recently, several of our engineers and founders wrote a research\npaper that describes some of the core technological challenges and solutions that\nset the lakehouse architecture apart from the data lake, and it was accepted and\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\ncan read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\nthey would have said faster horses.” The crux of this statement is that people often\nenvision a better solution to a problem as an evolution of what they already know\nrather than rethinking the approach to the problem altogether. In the world of data\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\nstore data warehouses and data lakes. 
However, their nature as key-value stores\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\nperformance is hampered by expensive metadata operations (e.g., listing objects)\nand limited consistency guarantees.\n\nBased on the characteristics of cloud object stores, three approaches have emerged.\n\n**1. Data lakes**\nThe first is directories of files (i.e., data lakes) that store the table as a collection\nof objects, typically in columnar format such as Apache Parquet. It’s an attractive\napproach because the table is just a group of objects that can be accessed from\na wide variety of tools without a lot of additional data stores or systems. However,\nboth performance and consistency problems are common. Hidden data corruption\nis common due to failed transactions, eventual consistency leads to inconsistent\nqueries, latency is high, and basic management capabilities like table versioning and\naudit logs are unavailable.\n\n**2. Custom storage engines**\nThe second approach is custom storage engines, such as proprietary systems built for\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\nservice that’s able to provide a single source of truth. However, all I/O operations need\nto connect to this metadata service, which can increase cloud resource costs and\nreduce performance and availability. Additionally, it takes a lot of engineering work to\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\nand PyTorch, which can be challenging for data teams that use a variety of computing\nengines on their data. Engineering challenges can be exacerbated by unstructured\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. Finally, and most egregiously, the proprietary metadata service locks\ncustomers into a specific service provider, leaving customers to contend with\nconsistently high prices and expensive, time-consuming migrations if they decide to\nadopt a new approach later.\n\n**3. Lakehouse**\nWith Delta Lake, an open source ACID table storage layer atop cloud object stores,\nwe sought to build a car instead of a faster horse with not just a better data store,\nbut a fundamental change in how data is stored and used via the lakehouse. A\nlakehouse is a new architecture that combines the best elements of data lakes and\ndata warehouses. Lakehouses are enabled by a new system design: implementing\nsimilar data structures and data management features to those in a data warehouse,\ndirectly on the kind of low-cost storage used for data lakes. They are what you would\nget if you had to redesign storage engines in the modern world, now that cheap and\nhighly reliable storage (in the form of object stores) are available.\n\nDelta Lake maintains information about which objects are part of a Delta table in an\nACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\nthe cloud object store. This design allows clients to update multiple objects at once,\nreplace a subset of the objects with another, etc., in a serializable manner that still\nachieves high parallel read/write performance from the objects. The log also provides\nsignificantly faster metadata operations for large tabular data sets. 
Additionally, Delta\nLake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\nsnapshots or roll back erroneous updates), automatic data layout optimization, upserts,\ncaching, and audit logs. Together, these features improve both the manageability and\nperformance of working with data in cloud object stores, ultimately opening the door\nto the lakehouse architecture that combines the key features of data warehouses and\ndata lakes to create a better, simpler data architecture.\n\n\n-----\n\nToday, Delta Lake is used across thousands of Databricks customers, processing\nexabytes of structured and unstructured data each day, as well as many organizations\nin the open source community. These use cases span a variety of data sources and\napplications. The data types stored include Change Data Capture (CDC) logs from\nenterprise OLTP systems, application logs, time-series data, graphs, aggregate\ntables for reporting, and image or feature data for machine learning. The applications\ninclude SQL workloads (most commonly), business intelligence, streaming, data\nscience, machine learning and graph analytics. Overall, Delta Lake has proven itself to\nbe a good fit for most data lake applications that would have used structured storage\nformats like Parquet or ORC, and many traditional data warehousing workloads.\n\nAcross these use cases, we found that customers often use Delta Lake to significantly\nsimplify their data architecture by running more workloads directly against cloud\nobject stores, and increasingly, by creating a lakehouse with both data lake and\ntransactional features to replace some or all of the functionality provided by message\nqueues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\nAmazon Redshift).\n\n**[In the research paper](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **, the authors explain:**\n\n- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance\n\nThrough the paper, you’ll gain a better understanding of Delta Lake and how it\nenables a wide range of DBMS-like performance and management features for data\nheld in low-cost cloud storage. As well as how the Delta Lake storage format and\naccess protocols make it simple to operate, highly available, and able to deliver highbandwidth access to the object store.\n\n\n-----\n\n**Understanding**\n**Delta Engine**\n\nThe Delta Engine ties together a 100% Apache Spark-compatible vectorized query\nengine to take advantage of modern CPU architecture with optimizations to Spark\n3.0’s query optimizer and caching capabilities that were launched as part of Databricks\nRuntime 7.0. Together, these features significantly accelerate query performance on\ndata lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\nadopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n\n**Scaling execution performance**\nOne of the big hardware trends over the last several years is that CPU clock speeds\nhave plateaued. 
The reasons are outside the scope of this chapter, but the takeaway\nis that we have to find new ways to process data faster beyond raw compute power.\nOne of the most impactful methods has been to improve the amount of data that can\nbe processed in parallel. However, data processing engines need to be specifically\narchitected to take advantage of this parallelism.\n\nIn addition, data teams are being given less and less time to properly model data as\nthe pace of business increases. Poorer modeling in the interest of better business\nagility drives poorer query performance. Naturally, this is not a desired state, and\norganizations want to find ways to maximize both agility and performance.\n\n\n-----\n\n**Announcing Delta Engine for**\n**high-performance query execution**\nDelta Engine accelerates the performance of Delta Lake for SQL and DataFrame\nworkloads through three components: an improved query optimizer, a caching\nlayer that sits between the execution layer and the cloud object storage, and a native\nvectorized execution engine that’s written in C++.\n\nThe improved query optimizer extends the functionality already in Spark 3.0 (cost-based\noptimizer, adaptive query execution, and dynamic runtime filters) with more advanced\nstatistics to deliver up to 18x increased performance in star schema workloads.\n\nDelta Engine’s caching layer automatically chooses which input data to cache for the\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\ndata teams today is the native execution engine, which we call Photon. (We know.\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\n\n-----\n\nDatabricks has been built to maximize the performance from the new changes in\nmodern cloud hardware. It brings performance improvements to all workload types\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\nBy linking these three components together, we think it will be easier for customers\nto understand how improvements in multiple places within the Databricks code\naggregate into significantly faster performance for analytics workloads on data lakes.\n\nWe’re excited about the value that Delta Engine delivers to our customers. 
While the\ntime and cost savings are already valuable, its role in the lakehouse pattern supports\nnew advances in how data teams design their data architectures for increased\nunification and simplicity.\n\nFor more information on the Delta Engine, watch this keynote address from\n[Spark + AI Summit 2020: Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n\n\n-----\n\n**Streaming**\nUsing Delta Lake to express\ncomputation on streaming data\n\n## CHAPTER 04\n\n\n-----\n\n**How Delta Lake Solves Common**\n**Pain Points in Streaming**\n\nThe pain points of a traditional streaming and data warehousing solution can be\nbroken into two groups: data lake and data warehouse pains.\n\n**Data lake pain points**\nWhile data lakes allow you to flexibly store an immense amount of data in a file system,\nthere are many pain points including (but not limited to):\n\n- Consolidation of streaming data from many disparate systems is difficult.\n\n- Updating data in a data lake is nearly impossible, and much of the streaming\ndata needs to be updated as changes are made. This is especially important in\nscenarios involving financial reconciliation and subsequent adjustments.\n\n- Query speeds for a data lake are typically very slow.\n\n- Optimizing storage and file sizes is very difficult and often requires complicated logic.\n\n**Data warehouse pain points**\nThe power of a data warehouse is that you have a persistent performant store of your\ndata. But the pain points for building modern continuous applications include (but are\nnot limited to):\n\n- Constrained to SQL queries (i.e., no machine learning or advanced analytics).\n\n- Accessing streaming data and stored data together is very difficult, if at all possible.\n\n- Data warehouses do not scale very well.\n\n- Tying compute and storage together makes using a warehouse very expensive.\n\n\n-----\n\n**How Delta Lake on Databricks solves these issues**\n[Delta Lake](https://docs.databricks.com/delta/index.html) is a unified data management system that brings data reliability and\nperformance optimizations to cloud data lakes. More succinctly, Delta Lake combines\nthe advantages of data lakes and data warehouses with Apache Spark™ to allow you\nto do incredible things.\n\n- Delta Lake, along with Structured Streaming, makes it possible to analyze\nstreaming and historical data together at high speeds.\n\n- When Delta Lake tables are used as sources and destinations of streaming big\ndata, it is easy to consolidate disparate data sources.\n\n- Upserts are supported on Delta Lake tables.\n\n- Delta Lake is ACID compliant, making it easy to create a compliant data solution.\n\n- Easily include machine learning scoring and advanced analytics into ETL\nand queries.\n\n- Decouples compute and storage for a completely scalable solution.\n\nIn the following use cases, we’ll share what this looks like in practice.\n\n\n-----\n\n**Simplifying Streaming Stock**\n**Data Analysis Using Delta Lake**\n\nReal-time analysis of stock data is a complicated endeavor. After all, there are many\nchallenges in maintaining a streaming system and ensuring transactional consistency\nof legacy and streaming data concurrently.\n\nThankfully, [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) helps solve many of the pain points of building a streaming\nsystem to analyze stock data in real time. 
In this section, we’ll share how to simplify\nthe streaming of stock data analysis using Delta Lake.\n\nIn the following diagram, you can see a high-level architecture that simplifies this\nproblem. We start by ingesting two different sets of data into two Delta Lake tables.\nThe two data sets are stock prices and fundamentals.\n\nAfter ingesting the data into their respective tables, we then join the data in an ETL\nprocess and write the data out into a third Delta Lake table for downstream analysis.\n\nDelta Lake helps solve these problems by combining the scalability, streaming and\naccess to the advanced analytics of Apache Spark with the performance and ACID\ncompliance of a data warehouse.\n\n\n-----\n\n# Create Fundamental Data (Databricks Delta table)\n\ndfBaseFund = spark \\\\\n\n.read \\\\\n\n.format( ‘delta’ ) \\\\\n\n.load( ‘/delta/stocksFundamentals’ )\n\n# Create Price Data (Databricks Delta table)\n\ndfBasePrice = spark \\\\\n\n.read \\\\\n\n.format( ‘delta’ ) \\\\\n\n.load( ‘/delta/stocksDailyPrices’ )\n\n\n**Implement your streaming**\n**stock analysis solution with Delta Lake**\nDelta Lake and Apache Spark do most of the work for our solution; you can try out the\nfull [notebook](https://pages.databricks.com/rs/094-YMS-629/images/streaming-stock-data-analysis-setup.html) and follow along with the code samples below.\n\nAs noted in the preceding diagram, we have two data sets to process — one for\nfundamentals and one for price data. To create our two Delta Lake tables, we specify\nthe .format(‘delta’) against our Databricks File System ( [DBFS](https://docs.databricks.com/data/databricks-file-system.html) ) locations.\n\n\n-----\n\nWhile we’re updating the stockFundamentals and stocksDailyPrices ,\nwe will consolidate this data through a series of ETL jobs into a consolidated view\n( stocksDailyPricesWFund ).\n\nWith the following code snippet, we can determine the start and end date of available\ndata and then combine the price and fundamentals data for that date range into DBFS.\n\n# Determine start and end date of available data\n\nrow = dfBasePrice.agg(\n\nfunc.max(dfBasePrice.price_date) .alias ( “maxDate” ),\n\nfunc.min(dfBasePrice.price_date) .alias ( “minDate” )\n\n).collect()[ 0 ]\n\nstartDate = row[ “minDate” ]\n\nendDate = row[ “maxDate” ]\n\n# Define our date range function\n\n\n# Save data to DBFS\n\ndfPriceWFund\n\n.write\n\n.format( ‘delta’ )\n\n.mode( ‘append’ )\n\n.save( ‘/delta/stocksDailyPricesWFund’ )\n\n# Loop through dates to complete fundamentals + price ETL process\n\nfor single_date in daterange(\n\nstartDate, (endDate + datetime.timedelta(days= 1 ))\n\n):\n\nprint ‘Starting ’ + single_date.strftime( ‘%Y-%m-%d’ )\n\nstart = datetime.datetime.now()\n\ncombinePriceAndFund(single_date)\n\nend = datetime.datetime.now()\n\nprint ( end - start)\n\n\ndef daterange(start_date, end_date):\n\n\nNow we have a stream of consolidated fundamentals and price data that is being\npushed into [DBFS](https://docs.databricks.com/data/databricks-file-system.html) in the /delta/stocksDailyPricesWFund location. We can build a\nDelta Lake table by specifying .format(“delta”) against that DBFS location.\n\n\nfor n in range( int ((end_date - start_date).days)):\n\nyield start_date + datetime.timedelta(n)\n\n\n# Define combinePriceAndFund information by date and\n\n\ndef combinePriceAndFund(theDate):\n\ndfFund = dfBaseFund. where (dfBaseFund.price_date == theDate)\n\ndfPrice = dfBasePrice. 
where (\n\ndfBasePrice.price_date == theDate\n\n\ndfPriceWithFundamentals = spark\n\n.readStream\n\n.format( “delta” )\n\n.load( “/delta/stocksDailyPricesWFund” )\n\n\n).drop( ‘price_date’ )\n\n\n# Drop the updated column\n\ndfPriceWFund = dfPrice.join(dfFund, [ ‘ticker’ ]).drop( ‘updated’ )\n\n\n// Create temporary view of the data\n\ndfPriceWithFundamentals.createOrReplaceTempView( “priceWithFundamentals” )\n\n\n-----\n\nNow that we have created our initial Delta Lake table, let’s create a view that will\nallow us to calculate the price/earnings ratio in real time (because of the underlying\nstreaming data updating our Delta Lake table).\n\n%sql\n\nCREATE OR REPLACE TEMPORARY VIEW viewPE AS\n\nselect ticker,\n\nprice_date,\n\nfirst(close) as price,\n\n(close/eps_basic_net) as pe\n\nfrom priceWithFundamentals\n\nwhere eps_basic_net > 0\n\ngroup by ticker, price_date, pe\n\n**Analyze streaming stock data in real time**\nWith our view in place, we can quickly analyze our data using Spark SQL.\n\n%sql\n\nselect - \n\nfrom viewPE\n\nwhere ticker == “AAPL”\n\norder by price_date\n\n\n-----\n\nAs the underlying source of this consolidated data set is a Delta Lake table, this view\nisn’t just showing the batch data but also any new streams of data that are coming in\nas per the following streaming dashboard.\n\nUnderneath the covers, Structured Streaming isn’t just writing the data to Delta Lake\ntables but also keeping the state of the distinct number of keys (in this case ticker\nsymbols) that need to be tracked.\n\n\nBecause you are using Spark SQL, you can execute aggregate queries at scale\nand in real time.\n\n%sql\n\nSELECT ticker, AVG(close) as Average_Close\n\nFROM priceWithFundamentals\n\nGROUP BY ticker\n\nORDER BY Average_Close\n\nIn closing, we demonstrated how to simplify streaming stock data analysis using\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) . By combining Spark Structured Streaming and Delta Lake, we can use the\nDatabricks integrated workspace to create a performant, scalable solution that has\nthe advantages of both data lakes and data warehouses.\n\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) removes the data engineering complexities\ncommonly associated with streaming and transactional consistency, enabling\ndata engineering and data science teams to focus on understanding the trends in\ntheir stock data.\n\n\n-----\n\n**How Tilting Point Does Streaming**\n**Ingestion Into Delta Lake**\n\nTilting Point is a new-generation games partner that provides top development\nstudios with expert resources, services and operational support to optimize\nhigh-quality live games for success. Through its user acquisition fund and its\nworld-class technology platform, Tilting Point funds and runs performance\nmarketing management and live games operations to help developers achieve\nprofitable scale.\n\nBy leveraging Delta Lake, Tilting Point is able to leverage quality data and make\nit readily available for analytics to improve the business. Diego Link, VP of\nEngineering at Tilting Point, provided insights for this use case.\n\nThe team at Tilting Point was running daily and hourly batch jobs for reporting on\ngame analytics. 
They wanted to make their reporting near real-time, getting insights\nwithin 5–10 minutes.\n\nThey also wanted to make their in-game LiveOps decisions based on real-time player\nbehavior for giving real-time data to a bundles-and-offer system, provide up-to-theminute alerting on LiveOPs changes that actually might have unforeseen detrimental\neffects and even alert on service interruptions in game operations. The goal was to\nensure that the game experience was as robust as possible for their players.\n\nAdditionally, they had to store encrypted Personally Identifiable Information (PII) data\nseparately in order to maintain GDPR compliance.\n\n\n-----\n\n**How data flows and associated challenges**\nTilting Point has a proprietary software development kit that developers integrate\nwith to send data from game servers to an ingest server hosted in AWS. This service\nremoves all PII data and then sends the raw data to an Amazon Firehose endpoint.\nFirehose then dumps the data in JSON format continuously to S3.\n\nTo clean up the raw data and make it available quickly for analytics, the team\nconsidered pushing the continuous data from Firehose to a message bus (e.g.,\nKafka, Kinesis) and then using [Apache Spark’s Structured Streaming](https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html) to continuously\nprocess data and write to Delta Lake tables.\n\nWhile that architecture sounds ideal for low latency requirements of processing\ndata in seconds, Tilting Point didn’t have such low latency needs for their ingestion\npipeline. They wanted to make the data available for analytics in a few minutes, not\nseconds. Hence they decided to simplify our architecture by eliminating a message\nbus and instead use S3 as a continuous source for their structured streaming job.\n\nBut the key challenge in using S3 as a continuous source is identifying files that\nchanged recently.\n\nListing all files every few minutes has two major issues:\n\n- **Higher latency:** Listing all files in a directory with a large number of files has high\noverhead and increases processing time.\n\n- **Higher cost:** Listing lots of files every few minutes can quickly add to the S3 cost.\n\n**Leveraging Structured Streaming with blob store as**\n**source and Delta Lake tables as sink**\nTo continuously stream data from cloud blob storage like S3, Tilting Point uses\n[Databricks’ S3-SQS source](https://docs.databricks.com/spark/latest/structured-streaming/sqs.html#optimized-s3-file-source-with-sqs) . The S3-SQS source provides an easy way to incrementally\nstream data from S3 without the need to write any state management code on what\nfiles were recently processed.\n\n\n-----\n\nThis is how Tilting Point’s ingestion pipeline looks:\n\n- [Configure Amazon S3 event notifications](https://docs.aws.amazon.com/AmazonS3/latest/dev/NotificationHowTo.html) to send new file arrival information\nto SQS via SNS.\n\n- Tilting Point uses the S3-SQS source to read the new data arriving in S3. The S3SQS source reads the new file names that arrived in S3 from SQS and uses that\ninformation to read the actual file contents in S3. An example code below:\n\nspark.readStream \\\n\n.format( “s3-sqs” ) \\\n\n. option ( “fileFormat” , “json” ) \\\n\n. option ( “queueUrl” , ...) \\\n\n. schema (...) \\\n\n. 
load ()\n\n- Tilting Point’s structured streaming job then cleans up and transforms the data.\nBased on the game data, the streaming job uses the foreachBatch API of Spark\nstreaming and writes to 30 different Delta Lake tables.\n\n- The streaming job produces lots of small files. This affects performance of\ndownstream consumers. So, an optimize job runs daily to compact small files in\nthe table and store them as right file sizes so that consumers of the data have\ngood performance while reading the data from Delta Lake tables. Tilting Point\nalso runs a weekly optimize job for a second round of compaction. Architecture showing continuous data ingest into Delta Lake tables\n\n\n-----\n\nThe above Delta Lake ingestion architecture helps in the following ways:\n\n- **Incremental loading:** The S3-SQS source incrementally loads the new files in S3.\nThis helps quickly process the new files without too much overhead in listing files.\n\n- **No explicit file state management:** There is no explicit file state management\nneeded to look for recent files.\n\n- **Lower operational burden:** Since we use S3 as a checkpoint between Firehose\nand Structured Streaming jobs, the operational burden to stop streams and reprocess data is relatively low.\n\n- **Reliable ingestion:** Delta Lake uses [optimistic concurrency control](https://docs.databricks.com/delta/optimizations/isolation-level.html) to offer ACID\ntransactional guarantees. This helps with reliable data ingestion.\n\n- **File compaction:** One of the major problems with streaming ingestion is tables\nending up with a large number of small files that can affect read performance.\nBefore Delta Lake, we had to set up a different table to write the compacted\ndata. With Delta Lake, thanks to ACID transactions, we can compact the files and\nrewrite the data back to the same table safely.\n\n- **Snapshot isolation:** Delta Lake’s snapshot isolation allows us to expose the\ningestion tables to downstream consumers while data is being appended by a\nstreaming job and modified during compaction.\n\n- **Rollbacks:** In case of bad writes, [Delta Lake’s Time Travel](https://databricks.com/blog/2019/02/04/introducing-delta-time-travel-for-large-scale-data-lakes.html) helps us roll back to a\nprevious version of the table.\n\nIn this section, we walked through Tilting Point’s use cases and how they do\nstreaming ingestion using Databricks’ S3-SQS source into Delta Lake tables\nefficiently without too much operational overhead to make good quality data\nreadily available for analytics.\n\n\n-----\n\n**Building a Quality of Service**\n**Analytics Solution for Streaming**\n**Video Services**\n\nAs traditional pay TV , content owners have embraced directto-consumer (D2C) subscription and ad-supported streaming for monetizing their [continues to stagnate](https://nscreenmedia.com/us-tv-market-svod-exceed-pay-tv-2020/)\nlibraries of content. For companies whose entire business model revolved around\nproducing great content, which they then licensed to distributors, the shift to now\nowning the entire glass-to-glass experience has required new capabilities, such as\nbuilding media supply chains for content delivery to consumers, supporting apps for\na myriad of devices and operating systems, and performing customer relationship\nfunctions like billing and customer service.\n\nWith most services renewing on a monthly basis, subscription service operators need\nto prove value to their subscribers at all times. 
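As a minimal sketch of the fan-out pattern described in the Tilting Point section above (one incremental S3-SQS stream routed to many Delta Lake tables via foreachBatch), the following assumes an illustrative event_type routing column, schema, queue URL, checkpoint path and table names rather than Tilting Point's actual configuration:

```python
# Minimal sketch of the incremental ingest + foreachBatch fan-out described above.
# The schema, queue URL, checkpoint path and table names are illustrative
# assumptions; the "s3-sqs" source is the Databricks-specific source named in the text.
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, StringType, TimestampType

spark = SparkSession.builder.getOrCreate()

event_schema = StructType([
    StructField("event_type", StringType()),
    StructField("player_id", StringType()),
    StructField("event_time", TimestampType()),
])

raw_events = (
    spark.readStream.format("s3-sqs")            # Databricks S3-SQS source
    .option("fileFormat", "json")
    .option("queueUrl", "<your-sqs-queue-url>")  # placeholder
    .schema(event_schema)
    .load()
)


def write_to_delta_tables(batch_df, batch_id):
    # Fan each micro-batch out to one Delta Lake table per event type
    # (the text mentions ~30 such tables; three are shown for brevity).
    batch_df.persist()
    for event_type in ["session_start", "purchase", "level_complete"]:
        (batch_df.filter(batch_df.event_type == event_type)
            .write.format("delta")
            .mode("append")
            .saveAsTable(f"game_analytics.{event_type}"))
    batch_df.unpersist()


(raw_events.writeStream
    .foreachBatch(write_to_delta_tables)
    .option("checkpointLocation", "/delta/checkpoints/game_events")  # placeholder
    .start())
```

A scheduled OPTIMIZE job, as mentioned above, would then compact the small files these frequent appends produce.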
General quality of streaming video\nissues (encompassing buffering, latency, pixelation, jitter, packet loss and the blank\nscreen) have significant business impacts, whether it’s increased [subscriber churn](https://www.streamingmedia.com/Articles/ReadArticle.aspx?ArticleID=112209) or\n[decreased video engagement](https://www.tvtechnology.com/opinions/why-buffering-remains-every-video-providers-worst-nightmare) .\n\nWhen you start streaming, you realize there are so many places where breaks can\nhappen and the viewer experience can suffer. There may be an issue at the source in\nthe servers on-premises or in the cloud; in transit at either the CDN level or ISP level\nor the viewer’s home network; or at the playout level with player/client issues. What\nbreaks at n x 104 concurrent streamers is different from what breaks at n x 105 or n\nx 106. There is no pre-release testing that can quite replicate real-world users and\ntheir ability to push even the most redundant systems to their breaking point as they\n\n\n-----\n\nchannel surf, click in and out of the app, sign on from different devices simultaneously\nand so on. And because of the nature of TV, things will go wrong during the most\nimportant, high-profile events drawing the largest audiences. If you start [receiving](https://downdetector.com/)\n[complaints on social media](https://downdetector.com/) , how can you tell if they are unique to that one user or\nrather regional or a national issue? If national, is it across all devices or only certain\ntypes (e.g., possibly the OEM updated the OS on an older device type, which ended up\ncausing compatibility issues with the client)?\n\nIdentifying, remediating and preventing viewer quality of experience issues becomes\na big data problem when you consider the number of users, the number of actions\nthey are taking and the number of handoffs in the experience (servers to CDN to ISP to\nhome network to client). Quality of Service (QoS) helps make sense of these streams\nof data so you can understand what is going wrong, where and why. Eventually you\ncan get into predictive analytics around what could go wrong and how to remediate\nit before anything breaks.\n\n**Databricks Quality of Service solution overview**\nThe aim of this solution is to provide the core for any streaming video platform that\nwants to improve their QoS system. It is based on the [AWS Streaming Media Analytics](https://github.com/awslabs/aws-streaming-media-analytics)\n[Solution](https://github.com/awslabs/aws-streaming-media-analytics) provided by AWS Labs, which we then built on top of to add Databricks as\na Unified Data Analytics Platform for both the real-time insights and the advanced\nanalytics capabilities.\n\n[By using Databricks](https://databricks.com/customers) , streaming platforms can get faster insights by always\nleveraging the most complete and recent data sets powered by robust and reliable\ndata pipelines. This decreases time to market for new features by accelerating\ndata science using a collaborative environment. 
It provides support for managing\nthe end-to-end machine learning lifecycle and reduces operational costs across\nall cycles of software development by having a unified platform for both data\nengineering and data science.\n\n\n-----\n\n**Video QoS solution architecture**\nWith complexities like low-latency monitoring alerts and highly scalable infrastructure\nrequired for peak video traffic hours, the straightforward architectural choice was\nthe Delta Architecture — both standard big data architectures like Lambda and Kappa\nArchitectures have disadvantages around the operational effort required to maintain\nmultiple types of pipelines (streaming and batch) and lack support for a unified data\nengineering and data science approach.\n\nThe Delta Architecture is the next-generation paradigm that enables all the data\npersonas in your organization to be more productive:\n\n- Data engineers can develop data pipelines in a cost-efficient manner\ncontinuously without having to choose between batch and streaming\n\n- Data analysts can get near real-time insights and faster answers to their BI queries\n\n- Data scientists can develop better machine learning models using more reliable data\nsets with support for time travel that facilitates reproducible experiments and reports Delta Architecture using the “multi-hop” approach for data pipelines\n\n\n-----\n\nWriting data pipelines using the Delta Architecture follows the best practices of\nhaving a multi-layer “multi-hop” approach where we progressively add structure to\ndata: “Bronze” tables or Ingestion tables are usually raw data sets in the native format\n(JSON, CSV or txt), “Silver” tables represent cleaned/transformed data sets ready for\nreporting or data science, and “Gold” tables are the final presentation layer.\n\nFor the pure streaming use cases, the option of materializing the DataFrames in\nintermediate Delta Lake tables is basically just a trade-off between latency/SLAs and\ncost (an example being real-time monitoring alerts vs. 
updates of the recommender\nsystem based on new content).\n\nA streaming architecture can still be achieved while materializing DataFrames in Delta Lake tables\n\nThe number of “hops” in this approach is directly impacted by the number of consumers\ndownstream, complexity of the aggregations (e.g., Structured Streaming enforces\ncertain limitations around chaining multiple aggregations) and the maximization of\noperational efficiency.\n\nThe QoS solution architecture is focused around best practices for data processing\nand is not a full video-on-demand (VoD) solution — with some standard components\nlike the “front door” service Amazon API Gateway being avoided from the high-level\narchitecture in order to keep the focus on data and analytics.\n\n\n-----\n\nHigh-level architecture for the QoS platform\n\n\n**Making your data ready for analytics**\nBoth sources of data included in the QoS solution (application events and CDN logs)\nare using the JSON format, great for data exchange — allowing you to represent\ncomplex nested structures, but not scalable and difficult to maintain as a storage\nformat for your data lake / analytics system.\n\n\nIn order to make the data directly queryable across the entire organization, the\nBronze to Silver pipeline (the “make your data available to everyone” pipeline) should\ntransform any raw formats into Delta Lake and include all the quality checks or data\nmasking required by any regulatory agencies.\n\n\n-----\n\nRaw format of the app events\n\n**Video applications events**\nBased on the architecture, the video application events are pushed directly to\nKinesis Streams and then just ingested to a Delta Lake append-only table without\nany changes to the schema.\n\nUsing this pattern allows a high number of consumers downstream to process the\ndata in a streaming paradigm without having to scale the throughput of the Kinesis\nstream. As a side effect of using a Delta Lake table as a sink (which supports [optimize](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-optimize.html) !),\nwe don’t have to worry about the way the size of the processing window will impact the\nnumber of files in your target table — known as the “small files” issue in the big data world.\n\nBoth the timestamp and the type of message are being extracted from the JSON\nevent in order to be able to partition the data and allow consumers to choose the\ntype of events they want to process. Again combining a single Kinesis stream for\nthe events with a Delta Lake “Events” table reduces the operational complexity while\nmaking things easier for scaling during peak hours.\n\n\nAll the details are extracted from JSON for the Silver table\n\n\n-----\n\n**CDN logs**\nThe CDN logs are delivered to S3, so the easiest way to process them is the Databricks\nAuto Loader, which incrementally and efficiently processes new data files as they\narrive in S3 without any additional setup.\n\nauto_loader_df = spark.readStream.format( “cloudFiles” ) \\\n\n.option( “cloudFiles.format” , “json” ) \\\n\n.option( “cloudFiles.region” , region) \\\n\n.load(input_location)\n\nanonymized_df = auto_loader_df. select ( ‘*’ , ip_\n\nanonymizer( ‘requestip’ ). 
alias ( ‘ip’ ))\\\n\n.drop( ‘requestip’ )\\\n\n.withColumn( “origin” , map_ip_to_location(col( ‘ip’ )))\n\nanonymized_df.writeStream \\\n\n.option( ‘checkpointLocation’ , checkpoint_location)\\\n\n.format( ‘delta’ ) \\\n\n.table(silver_database + ‘.cdn_logs’ )\n\nAs the logs contain IPs — considered personal data under the GDPR regulations — the\n“make your data available to everyone” pipeline has to include an anonymization step.\nDifferent techniques can be used, but we decided to just strip the last octet from IPv4\nand the last 80 bits from IPv6. On top, the data set is also enriched with information\naround the origin country and the ISP provider, which will be used later in the Network\nOperation Centers for localization.\n\n\n-----\n\n**Creating the Dashboard /**\n**Virtual Network Operation Centers**\nStreaming companies need to monitor network performance and the user experience\nas near real-time as possible, tracking down to the individual level with the ability to\nabstract at the segment level, easily defining new segments such as those defined by\ngeos, devices, networks and/or current and historical viewing behavior.\n\nFor streaming companies that has meant adopting the concept of Network Operation\nCenters (NOC) from telco networks for monitoring the health of the streaming\nexperience for their users at a macro level, flagging and responding to any issues\nearly on. At their most basic, NOCs should have dashboards that compare the current\nexperience for users against a performance baseline so that the product teams can\nquickly and easily identify and attend to any service anomalies.\n\nIn the QoS solution we have incorporated a [Databricks dashboard](https://docs.databricks.com/notebooks/dashboards.html) . BI tools can also\nbe effortlessly connected in order to build more complex visualizations, but based\non customer feedback, built-in dashboards are, most of the time, the fastest way to\npresent the insights to business users.\n\nThe aggregated tables for the NOC will basically be the Gold layer of our Delta\nArchitecture — a combination of CDN logs and the application events. Example of Network Operations Center dashboard\n\n\n-----\n\nThe dashboard is just a way to visually package the results of SQL queries or Python\n/ R transformation — each notebook supports multiple dashboards so in case of\nmultiple end users with different requirements we don’t have to duplicate the code —\nas a bonus the refresh can also be scheduled as a Databricks job.\n\nVisualization of the results of a SQL query\n\nLoading time for videos (time to first frame) allows better understanding of the\nperformance for individual locations of your CDN — in this case the AWS CloudFront\nEdge nodes — which has a direct impact in your strategy for improving this KPI —\neither by spreading the user traffic over multi-CDNs or maybe just implementing a\ndynamic origin selection in case of AWS CloudFront using Lambda@Edge.\n\n\n-----\n\nFailure to understand the reasons for high levels of buffering — and the poor video\nquality experience that it brings — has a significant impact on subscriber churn rate.\nOn top of that, advertisers are not willing to spend money on ads responsible for\nreducing the viewer engagement — as they add extra buffering on top, so the profits\non the advertising business usually are impacted too. 
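For reference, here is a minimal sketch of the ip_anonymizer step used in the Auto Loader pipeline shown earlier, stripping the last IPv4 octet as described above. The IPv6 branch and the map_ip_to_location geolocation lookup are omitted and would be separate, assumed UDFs:

```python
# Minimal sketch of the ip_anonymizer UDF referenced in the Auto Loader pipeline
# above: drop the last octet of IPv4 addresses. (The text also strips the last
# 80 bits of IPv6; that branch is omitted here.) map_ip_to_location is assumed
# to be a separate geolocation UDF and is not implemented in this sketch.
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType


@udf(returnType=StringType())
def ip_anonymizer(ip):
    if ip is None:
        return None
    parts = ip.split(".")
    if len(parts) == 4:            # IPv4: replace the last octet
        return ".".join(parts[:3]) + ".0"
    return ip                      # IPv6 handling omitted in this sketch


# Usage, mirroring the snippet above (column names from the original):
# anonymized_df = (auto_loader_df
#     .select("*", ip_anonymizer("requestip").alias("ip"))
#     .drop("requestip"))
```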
In this context, collecting as\nmuch information as possible from the application side is crucial to allow the analysis\nto be done not only at video level but also browser or even type / version of application.\n\nOn the content side, events for the application can provide useful information about\nuser behavior and overall quality of experience. How many people that paused a video\nhave actually finished watching that episode / video? What caused the stoppage: The\nquality of the content or delivery issues? Of course, further analyses can be done by\nlinking all the sources together (user behavior, performance of CDNs /ISPs) to not only\ncreate a user profile but also to forecast churn.\n\n\n-----\n\n**Creating (near) real-time alerts**\nWhen dealing with the velocity, volume and variety of data generated in video\nstreaming from millions of concurrent users, dashboard complexity can make it\nharder for human operators in the NOC to focus on the most important data at the\nmoment and zero-in on root cause issues. With this solution, you can easily set up\nautomated alerts when performance crosses certain thresholds that can help the\nhuman operators of the network as well as set off automatic remediation protocols\nvia a Lambda function. For example:\n\n- If a CDN is having latency much higher than baseline (e.g., if it’s more than 10%\nlatency vs. baseline average), initiate automatic CDN traffic shifts.\n\n- If more than [some threshold, e.g., 5%] of clients report playback errors, alert the\nproduct team that there is likely a client issue for a specific device.\n\n- If viewers on a certain ISP are having higher-than-average buffering and\npixelation issues, alert frontline customer representatives on responses and ways\nto decrease issues (e.g., set stream quality lower).\n\nFrom a technical perspective, generating real-time alerts requires a streaming\nengine capable of processing data real time and publish-subscribe service to push\nnotifications.\n\n\nupdates of web applications) or Amazon SQS for other consumers. The [custom for](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html)\n[each writer](https://docs.databricks.com/spark/latest/structured-streaming/foreach.html) option makes the writing of a pipeline to send email notifications based\non a rule-based engine (e.g., validating the percentage of errors for each individual\ntype of app over a period of time) really straightforward.\n\ndef send_error_notification(row):\n\nsns_client = boto3.client( ‘sns’ , region)\n\nerror_message = ‘Number of errors for the App has exceeded the\n\nthreshold {}’ .format(row[ ‘percentage’ ])\n\nresponse = sns_client.publish(\n\nTopicArn =,\n\nMessage = error_message,\n\nSubject =,\n\nMessageStructure = ‘string’ )\n\n# Structured Streaming Job\n\ngetKinesisStream( “player_events” )\\\n\n.selectExpr( “type” , “app_type” )\\\n\n.groupBy( “app_type” )\\\n\n.apply(calculate_error_percentage)\\\n\n. where ( “percentage > {}” .format(threshold)) \\\n\n.writeStream\\\n\n. 
foreach (send_error_notification)\\\n\n.start()\n\n\nIntegrating microservices using Amazon SNS and Amazon SQS\n\nSending email notifications using AWS SNS\n\nThe QoS solution implements the [AWS best practices for integrating microservices](https://docs.aws.amazon.com/whitepapers/latest/microservices-on-aws/introduction.html)\nby using Amazon SNS and its integrations with Amazon Lambda (see below for the\n\n\n-----\n\nOn top of the basic email use case, the Demo Player includes three widgets updated\nin real time using AWS AppSync: the number of active users, the most popular videos\nand the number of users concurrently watching a video.\n\nUpdating the application with the results of real-time aggregations\n\nThe QoS solution is applying a similar approach — Structured Streaming and Amazon\nSNS — to update all the values allowing for extra consumers to be plugged in using AWS\nSQS. This is a common pattern when huge volumes of events have to be enhanced and\nanalyzed; pre-aggregate data once and allow each service (consumer) to make their\nown decision downstream.\n\n**Next steps: machine learning**\nManually making sense of the historical data is important but is also very slow. If\nwe want to be able to make automated decisions in the future, we have to integrate\nmachine learning algorithms.\n\nAs a Unified Data Platform, Databricks empowers data scientists to build better data\nscience products using features like Runtime for Machine Learning with built-in\nor the integration with MLflow, the end-toend machine learning lifecycle management tool. support for [Hyperopt](https://docs.databricks.com/applications/machine-learning/automl-hyperparam-tuning/index.html#hyperopt-overview) / [Horvod](https://docs.databricks.com/applications/machine-learning/train-model/distributed-training/horovod-runner.html) / [AutoML](https://databricks.com/product/automl-on-databricks)\n\n\n-----\n\nWe have already explored a few important use cases across our customer base while\nfocusing on the possible extensions to the QoS solution.\n\n**Point-of-failure prediction and remediation**\nAs D2C streamers reach more users, the costs of even momentary loss of service\nincreases. ML can help operators move from reporting to prevention by forecasting\nwhere issues could come up and remediating before anything goes wrong (e.g.,\na spike in concurrent viewers leads to switching CDNs to one with more capacity\nautomatically).\n\n**Customer churn**\nCritical to growing subscription services is keeping the subscribers you have. By\nunderstanding the quality of service at the individual level, you can add QoS as a\nvariable in churn and customer lifetime value models. Additionally, you can create\ncustomer cohorts for those who have had video quality issues in order to test\nproactive messaging and save offers.\n\n\n**Getting started with the Databricks streaming video**\n**QoS solution**\nProviding consistent quality in the streaming video experience is table stakes at this\npoint to keep fickle audiences with ample entertainment options on your platform.\nWith this solution we have sought to create a quick start for most streaming video\nplatform environments to embed this QoS real-time streaming analytics solution in\na way that:\n1. Scales to any audience size\n2. Quickly flags quality performance issues at key parts of the distribution workflow\n3. 
Is flexible and modular enough to easily customize for your audience and your\nneeds, such as creating new automated alerts or enabling data scientists to test\nand roll out predictive analytics and machine learning\n\nTo get started, download the notebooks for the [Databricks streaming video QoS](https://databricks.com/notebooks/QoS/index.html#00.config.html)\n[solution](https://databricks.com/notebooks/QoS/index.html#00.config.html) . For more guidance on how to unify batch and streaming data into a single\nsystem, view the [Delta Architecture webinar](https://pages.databricks.com/201908-WB-Delta-Architecture-A-Step-Beyond-Lambda-Architecture_Reg.html) .\n\n\n-----\n\n**Customer Use Cases**\nSee how customers are using\nDelta Lake to rapidly innovate\n\n## CHAPTER 05\n\n\n-----\n\n**Healthdirect Australia**\nProvides Personalized and Secure Online\nPatient Care With Databricks\n\nAs the shepherds of the National Health Services Directory (NHSD), Healthdirect\nis focused on leveraging terabytes of data covering time-driven, activity-based\nhealthcare transactions to improve health care services and support. With\ngovernance requirements, siloed teams and a legacy system that was difficult\nto scale, they moved to Databricks. This boosted data processing for downstream\nmachine learning while improving data security to meet HIPAA requirements.\n\n**Spotlight on Healthdirect**\n**Industry:** Healthcare and life sciences\n6x\nImprovement in data processing\n20M\nRecords ingested in minutes\n\n**Data quality and governance issues, silos, and the**\n**inability to scale**\nDue to regulatory pressures, Healthdirect Australia set forth to improve overall data\nquality and ensure a level of governance on top of that, but they ran into challenges\nwhen it came to data storage and access. On top of that, data silos were blocking the\nteam from efficiently preparing data for downstream analytics. These disjointed data\n\n\n-----\n\nsources impacted the consistency of data reads, as data was oftentimes out-of-sync\nbetween the various systems in their stack. The low-quality data also led to higher\nerror rates and processing inefficiencies. This fragmented architecture created\nsignificant operational overhead and limited their ability to have a comprehensive\nview of the patient.\n\nFurther, they needed to ingest over 1 billion data points due to a changing landscape\nof customer demand such as bookings, appointments, pricing, eHealth transaction\nactivity, etc. — estimated at over 1TB of data.\n\n“We had a lot of data challenges. We just couldn’t process efficiently enough. We\nwere starting to get batch overruns. We were starting to see that a 24-hour window\nisn’t the most optimum time in which we want to be able to deliver healthcare data\nand services,” explained Peter James, Chief Architect at Healthdirect Australia.\n\nUltimately, Healthdirect realized they needed to modernize their end-to-end process\nand tech stack to properly support the business.\n\n**Modernizing analytics with Databricks and Delta Lake**\nDatabricks provides Healthdirect Australia with a Unified Data Platform that simplifies\ndata engineering and accelerates data science innovation. The notebook environment\nenables them to make content changes in a controlled fashion rather than having to\nrun bespoke jobs each time.\n\n“Databricks has provided a big uplift for our teams and our data operations,” said\nJames. “The analysts were working directly with the data operations teams. 
They are\nable to achieve the same pieces of work together within the same time frames that\nused to take twice as long. They’re working together, and we’re seeing just a massive\nacceleration in the speed at which we can deliver service.”\n\n\n-----\n\nWith Delta Lake, they’ve created logical data zones: Landing, Raw, Staging and Gold.\nWithin these zones, they store their data “as is,” in their structured or unstructured\nstate, in Delta Lake tables. From there, they use a metadata-driven schema and hold\nthe data within a nested structure within that table. What this allows them to do is\nhandle data consistently from every source and simplifies the mapping of data to the\nvarious applications pulling the data.\n\nMeanwhile, through Structured Streaming, they were able to convert all of their\nETL batch jobs into streaming ETL jobs that could serve multiple applications\nconsistently. Overall, the advent of Spark Structured Streaming, Delta Lake and the\nDatabricks Unified Data Platform provides significant architectural improvements\nthat have boosted performance, reduced operational overheads and increased\nprocess efficiencies.\n\n\n**Faster data pipelines result in better patient-driven**\n**healthcare**\nAs a result of the performance gains delivered by Databricks and the improved data\nreliability through Delta Lake, Healthdirect Australia realized improved accuracy of\ntheir fuzzy name match algorithm from less than 80% with manual verification to 95%\nand no manual intervention.\n\nThe processing improvements with Delta Lake and Structured Streaming allowed\nthem to process more than 30,000 automated updates per month. Prior to Databricks,\nthey had to use unreliable batch jobs that were highly manual to process the same\nnumber of updates over a span of 6 months — a 6x improvement in data processing.\n\n“Databricks delivered the time to market as well as the analytics and operational\nuplift that we needed in order to be able to meet the new demands of the\nhealthcare sector.”\n\n– Peter James, Chief Architect, Healthdirect Australia\n\n\n-----\n\nThey were also able to increase their data load rate to 1 million records per minute,\nloading their entire 20 million record data set in 20 minutes. Before the adoption\nof Databricks, this used to take more than 24 hours to process the same 1 million\ntransactions, blocking analysts from making swift decisions to drive results.\n\nLast, data security, which was critical to meet compliance requirements, was greatly\nimproved. Databricks provides standard security accreditations like HIPAA, and\nHealthdirect was able to use Databricks to meet Australia’s security requirements.\nThis yielded significant cost reductions and gave them continuous data assurance\nby monitoring changes to access privileges like changes in roles, metadata-level\nsecurity changes, data leakage, etc.\n\n“Databricks delivered the time to market as well as the analytics and operational\nuplift that we needed in order to be able to meet the new demands of the healthcare\nsector,” said James.\n\nWith the help of Databricks, they have proven the value of data and analytics and how\nit can impact their business vision. 
With transparent access to data that boasts\nwell-documented lineage and quality, participation across various business and\nanalyst groups has increased — empowering teams to collaborate and more\neasily and quickly extract value from their data with the goal of improving\nhealthcare for everyone.\n\n\n-----\n\n**Comcast**\nUses Delta Lake and MLflow to\nTransform the Viewer Experience\n\n**Spotlight on Comcast**\n**Industry:** Media and entertainment\n10x\nReduction in overall compute costs to process data\n90%\nReduction in required DevOps resources to manage infrastructure\nReduced\nDeployment times from weeks to minutes\n\nAs a global technology and media company connecting millions of customers to\npersonalized experiences, Comcast struggled with massive data, fragile data pipelines\n\nand poor data science collaboration. With Databricks — leveraging Delta Lake and MLflow\n— they can build performant data pipelines for petabytes of data and easily manage the\nlifecycle of hundreds of models to create a highly innovative, unique and award-winning\nviewer experience using voice recognition and machine learning.\n\n\n-----\n\n**Infrastructure unable to support data and ML needs**\nInstantly answering a customer’s voice request for a particular program while turning\nbillions of individual interactions into actionable insights, strained Comcast’s IT\ninfrastructure and data analytics and data science teams. To make matters more\ncomplicated, Comcast needed to deploy models to a disjointed and disparate range\nof environments: cloud, on-premises and even directly to devices in some instances.\n\n- **Massive data:** Billions of events generated by the entertainment system and 20+\nmillion voice remotes, resulting in petabytes of data that need to be sessionized\nfor analysis.\n\n- **Fragile pipelines:** Complicated data pipelines that frequently failed and were\nhard to recover. Small files were difficult to manage, slowing data ingestion for\ndownstream machine learning.\n\n- **Poor collaboration:** Globally dispersed data scientists working in different\nscripting languages struggled to share and reuse code.\n\n- **Manage management of ML models:** Developing, training and deploying hundreds\nof models was highly manual, slow and hard to replicate, making it difficult to scale.\n\n- **Friction between dev and deployment:** Dev teams wanted to use the latest tools\nand models while ops wanted to deploy on proven infrastructure.\n\n\n-----\n\n**Automated infrastructure, faster data**\n**pipelines with Delta Lake**\nComcast realized they needed to modernize their entire approach to analytics from\ndata ingest to the deployment of machine learning models to delivering new features\nthat delight their customers. 
Today, the Databricks Unified Data Platform enables\nComcast to build rich data sets and optimize machine learning at scale, streamline\nworkflows across teams, foster collaboration, reduce infrastructure complexity, and\ndeliver superior customer experiences.\n\n- **Simplified infrastructure management:** Reduced operational costs through\nautomated cluster management and cost management features such as\nautoscaling and spot instances.\n\n\n\n- **Performant data pipelines:** Delta Lake is used for the ingest, data enrichment and\ninitial processing of the raw telemetry from video and voice applications and devices.\n\n- **Reliably manage small files:** Delta Lake enabled them to optimize files for rapid\nand reliable ingestion at scale.\n\n- **Collaborative workspaces:** Interactive notebooks improve cross-team\ncollaboration and data science creativity, allowing Comcast to greatly accelerate\nmodel prototyping for faster iteration.\n\n- **Simplified ML lifecycle:** Managed MLflow simplifies the machine learning lifecycle\nand model serving via the Kubeflow environment, allowing them to track and\nmanage hundreds of models with ease.\n\n- **Reliable ETL at scale:** Delta Lake provides efficient analytics pipelines at scale\nthat can reliably join historic and streaming data for richer insights.\n\n\n-----\n\n**Delivering personalized experiences with ML**\nIn the intensely competitive entertainment industry, there is no time to press the\nPause button. Armed with a unified approach to analytics, Comcast can now fastforward into the future of AI-powered entertainment — keeping viewers engaged and\ndelighted with competition-beating customer experiences.\n\n- **Emmy-winning viewer experience:** Databricks helps enable Comcast to create\na highly innovative and award-winning viewer experience with intelligent voice\ncommands that boosts engagement.\n\n- **Reduced compute costs by 10x:** Delta Lake has enabled Comcast to optimize data\ningestion, replacing 640 machines with 64 while improving performance. Teams\ncan spend more time on analytics and less time on infrastructure management.\n\n- **Less DevOps:** Reduced the number of DevOps full-time employees required for\nonboarding 200 users from 5 to 0.5.\n\n- **Higher data science productivity:** Fostered collaboration between global data\nscientists by enabling different programming languages through a single\ninteractive workspace. Also, Delta Lake has enabled the data team to use data at\nany point within the data pipeline, allowing them to act more quickly in building\nand training new models.\n\n- **Faster model deployment:** Reduced deployment times from weeks to minutes as\noperations teams deployed models on disparate platforms.\n\n\n-----\n\n**Banco Hipotecario**\nPersonalizes the Banking\nExperience With Data and ML\n\nBanco Hipotecario — a leading Argentinian commercial bank — is on a mission\nto leverage machine learning to deliver new insights and services that will delight\ncustomers and create upsell opportunities. 
With a legacy analytics and data\nwarehousing system that was rigid and complex to scale, they turned to Databricks\nto unify data science, engineering and analytics.\n\nAs a result of this partnership, they were able to significantly increase customer\nacquisition and cross-sells while lowering the cost for acquisition, greatly impacting\noverall customer retention and profitability.\n\n**Spotlight on Banco Hipotecario**\n**Industry:** Financial services\n35%\n\nReduction in cost of acquisition\n**Technical use cases:** Ingest and ETL, machine learning and SQL Analytics\n\n\n-----\n\n**Legacy analytics tools are slow, rigid and**\n**impossible to scale**\nBanco Hipotecario set forth to increase customer acquisition by reducing risk and\nimproving the customer experience. With data analytics and machine learning\nanchoring their strategy, they hoped to influence a range of use cases from fraud\ndetection and risk analysis to serving product recommendations to drive upsell and\ncross-sell opportunities and forecast sales.\n\nBanco Hipotecario faced a number of the challenges that often come along with\noutdated technology and processes: disorganized or inaccurate data; poor crossteam collaboration; the inability to innovate and scale; resource-intensive workflows,\n— the list goes on.\n\n“In order to execute on our data analytics strategy, new technologies were needed\nin order to improve data engineering and boost data science productivity,” said\nDaniel Sanchez, Enterprise Data Architect at Banco Hipotecario. “The first steps we\ntook were to move to a cloud-based data lake, which led us to Azure Databricks\nand Delta Lake.”\n\n\n-----\n\n**A unified platform powers the data lake**\n**and easy collaboration**\nBanco Hipotecario turned to Databricks to modernize their data warehouse\nenvironment, improve cross-team collaboration, and drive data science innovation.\nFully managed in Microsoft Azure, they were able to easily and reliably ingest massive\nvolumes of data, spinning up their whole infrastructure in 90 days. With Databricks’\nautomated cluster management capabilities, they are able to scale clusters ondemand to support large workloads.\n\nDelta Lake has been especially useful in bringing reliability and performance to Banco\nHipotecario’s data lake environment. With Delta Lake, they are now able to build\nreliable and performant ETL pipelines like never before.\n\n\nMeanwhile, performing SQL Analytics on Databricks has helped them do data\nexploration, cleansing and generate data sets in order to create models, enabling the\nteam to deploy their first model within the first three months, and the second model\ngenerated was rolled out in just two weeks.\n\nAt the same time, data scientists were finally able to collaborate, thanks to interactive\nnotebooks; this meant faster builds, training and deployment. And MLflow streamlined\nthe ML lifecycle and removed the overreliance on data engineering.\n\n“Databricks gives our data scientists the means to easily create our own experiments\nand deploy them to production in weeks, rather than months,” said Miguel Villalba,\nHead of Data Engineering and Data Science.\n\n\n-----\n\n**An efficient team maximizes customer**\n**acquisition and retention**\nSince moving to Databricks, the data team at Banco Hipotecario could not be happier,\nas Databricks has unified them across functions in an integrated fashion.\n\nThe results of data unification and markedly improved collaboration and autonomy\ncannot be overstated. 
Since deploying Databricks, Banco Hipotecario has increased\ntheir cross-sell into new products by a whopping 90%, while machine learning has\nreduced the cost of customer acquisition by 35%.\n\n\n-----\n\n**Viacom18**\nMigrates From Hadoop to Databricks to\nDeliver More Engaging Experiences\n\nViacom18 Media Pvt. Ltd. is one of India’s fastest-growing entertainment networks\nwith 40x growth over the past decade. They offer multi-platform, multigenerational\nand multicultural brand experiences to 600+ million monthly viewers.\n\nIn order to deliver more engaging experiences for their millions of viewers, Viacom18\nmigrated from their Hadoop environment due to its inability to process data at scale\nefficiently. With Databricks, they have streamlined their infrastructure management,\nincreased data pipeline speeds and increased productivity among their data teams.\n\nToday, Viacom18 is able to deliver more relevant viewing experiences to their\nsubscribers, while identifying opportunities to optimize the business and drive\ngreater ROI.\n\n**Spotlight on Viacom18**\n**Industry:** Media and entertainment\n26%\nIncrease in operational efficiency lowers overall costs\n\n\n-----\n\n**Growth in subscribers and terabytes of viewing data**\n**push Hadoop to its limits**\nViacom18, a joint venture between Network18 and ViacomCBS, is focused on\nproviding its audiences with highly personalized viewing experiences. The core\nof this strategy requires implementing an enterprise data architecture that enables\nthe building of powerful customer analytics on daily viewer data. But with millions of\nconsumers across India, the sheer amount of data was tough to wrangle: They were\ntasked with ingesting and processing over 45,000 hours of daily content on VOOT\n(Viacom18’s on-demand video subscription platform), which easily generated 700GB\nto 1TB of data per day.\n\n“Content is at the heart of what we do,” explained Parijat Dey, Viacom18’s Assistant\nVice President of Digital Transformation and Technology. “We deliver personalized\ncontent recommendations across our audiences around the world based on\nindividual viewing history and preferences in order to increase viewership and\ncustomer loyalty.”\n\nViacom18’s data lake, which was leveraging on-premises Hadoop for operations,\nwasn’t able to optimally process 90 days of rolling data within their management’s\ndefined SLAs, limiting their ability to deliver on their analytics needs, which impacted\nnot only the customer experience but also overall costs.\n\nTo meet this challenge head-on, Viacom18 needed a modern data warehouse with the\nability to analyze data trends for a longer period of time instead of daily snapshots. They\nalso needed a platform that simplified infrastructure by allowing their team to easily\nprovision clusters with features like auto-scaling to help reduce compute costs.\n\n\n-----\n\n**Rapid data processing for analytics**\n**and ML with Databricks**\nTo enable the processing power and data science capabilities they required, Viacom18\npartnered with Celebal Technologies, a premier Salesforce, data analytics and big data\nconsulting organization based in India. 
The team at Celebal leveraged Azure Databricks\nto provide Viacom18 with a unified data platform that modernizes its data warehousing\ncapabilities and accelerates data processing at scale.\n\nThe ability to cache data within Delta Lake resulted in the much-needed acceleration\nof queries, while cluster management with auto-scaling and the decoupling of\n\n\nstorage and compute simplified Viacom18’s infrastructure management and\noptimized operational costs. “Delta Lake has created a streamlined approach to\nthe management of data pipelines,” explained Dey. “This has led to a decrease in\noperational costs while speeding up time-to-insight for downstream analytics and\ndata science.”\n\nThe notebooks feature was an unexpected bonus for Viacom18, as a common workspace\ngave data teams a way to collaborate and increase productivity on everything from\nmodel training to ad hoc analysis, dashboarding and reporting via PowerBI.\n\n\n-----\n\n**Leveraging viewer data to power personalized**\n**viewing experiences**\nCelebal Technologies and Databricks have enabled Viacom18 to deliver innovative\ncustomer solutions and insights with increased cross-team collaboration and\nproductivity. With Databricks, Viacom18’s data team is now able to seamlessly\nnavigate their data while better serving their customers.\n\n“With Databricks, Viacom18’s engineers can now slice and dice large volumes of data\nand deliver customer behavioral and engagement insights to the analysts and data\nscientists,” said Dey.\n\nIn addition to performance gains, the faster query times have also lowered the overall\ncost of ownership, even with daily increases in data volumes. “Azure Databricks has\ngreatly streamlined processes and improved productivity by an estimated 26%,”\nconcluded Dey.\n\nOverall, Dey cites the migration from Hadoop to Databricks has delivered significant\nbusiness value — reducing the cost of failure, accelerating processing speeds at\nscale, and simplifying ad hoc analysis for easier data exploration and innovations that\ndeliver highly engaging customer experiences.\n\n\n-----\n\n# What’s next?\n\nNow that you understand Delta Lake, it may be time to take a look\nat some additional resources.\n\n**Do a deep dive into Delta Lake >**\n\n- [Getting Started With Delta Lake Tech Talk Series](https://databricks.com/discover/getting-started-with-delta-lake-tech-talks)\n\n- [Diving Into Delta Lake Tech Talk Series](https://databricks.com/discover/diving-into-delta-lake-talks)\n\n- [Visit the site](https://databricks.com/product/delta-lake-on-databricks) for additional resources\n\n**[Try Databricks for free >](https://databricks.com/try-databricks)**\n**[Learn more >](https://pages.databricks.com/delta-lake-open-source-reliability-for-data-lakes-reg.html)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**eBook**\n\n## The Data Team’s Guide to the Databricks Lakehouse Platform\n\n\n-----\n\n#### Contents\n\n\n**C H A P TE R 1**\n\n**C H A P TE R 2**\n\n**C H A P TE R 3**\n\n**C H A P TE R 4**\n\n**C H A P TE R 5**\n\n**C H A P TE R 6**\n\n**C H A P TE R 7**\n\n**C H A P TE R 8**\n\n**C H A P TE R 9**\n\n**C H A P TE R 10**\n\n**C H A P TE R 11**\n\n**C H A P TE R 12**\n\n\n**The data lakehouse** 
........ **4**\n\n**The Databricks Lakehouse Platform** ........ **11**\n\n**Data reliability and performance** ........ **18**\n\n**Unified governance and sharing for data, analytics and AI** ........ **28**\n\n**Security** ........ **41**\n\n**Instant compute and serverless** ........ **48**\n\n**Data warehousing** ........ **52**\n\n**Data engineering** ........ **56**\n\n**Data streaming** ........ **68**\n\n**Data science and machine learning** ........ **73**\n\n**Databricks Technology Partners and the modern data stack** ........ **79**\n\n**Get started with the Databricks Lakehouse Platform** ........ **81**\n\n\n-----\n\n**INTRODUCTION**\n\n#### The Data Team’s Guide to the Databricks Lakehouse Platform\n\n_The Data Team’s Guide to the Databricks Lakehouse Platform_ is\ndesigned for data practitioners and leaders who are embarking\non their journey into the data lakehouse architecture.\n\nIn this eBook, you will learn the full capabilities of the data lakehouse architecture\nand how the Databricks Lakehouse Platform helps organizations of all sizes — from\nenterprises to startups in every industry — with all their data, analytics, AI and\nmachine learning use cases on one platform.\n\nYou will see how the platform combines the best elements of data warehouses\nand data lakes to increase the reliability, performance and scalability of your\ndata platform. 
Discover how the lakehouse simplifies complex workloads in data\nengineering, data warehousing, data streaming, data science and machine learning\n— and bolsters collaboration for your data teams, allowing them to maintain new\nlevels of governance, flexibility and agility in an open and multicloud environment.\n\n\n-----\n\n**CHAPTER**\n\n### The data lakehouse\n# 01\n\n\n-----\n\n#### The evolution of data architectures\n\n\nData has moved front and center within every organization as data-driven insights\nhave fueled innovation, competitive advantage and better customer experiences.\n\nHowever, as companies place mandates on becoming more data-driven,\ntheir data teams are left in a sprint to deliver the right data for business\ninsights and innovation. With the widespread adoption of cloud, data teams\noften invest in large-scale complex data systems that have capabilities for\nstreaming, business intelligence, analytics and machine learning to support\nthe overall business objectives.\n\nTo support these objectives, data teams have deployed cloud data\n\nwarehouses and data lakes.\n\n\nTraditional data systems: The data warehouse and data lake\n\nWith the advent of big data, companies began collecting large amounts of\ndata from many different sources, such as weblogs, sensor data and images.\nData warehouses — which have a long history as the foundation for decision\nsupport and business intelligence applications — cannot handle large volumes\nof data.\n\nWhile data warehouses are great for structured data and historical analysis,\nthey weren’t designed for unstructured data, semi-structured data, and data\nwith high variety, velocity and volume, making them unsuitable for many types\nof data.\n\nThis led to the introduction of data lakes, providing a single repository of raw\ndata in a variety of formats. While suitable for storing big data, data lakes do\nnot support transactions, nor do they enforce data quality, and their lack of\nconsistency/isolation makes it almost impossible to read, write or process data.\n\nFor these reasons, many of the promises of data lakes never materialized and,\nin many cases, reduced the benefits of data warehouses.\n\nAs companies discovered new use cases for data exploration, predictive modeling\nand prescriptive analytics, the need for a single, flexible, high-performance system\nonly grew. Data teams require systems for diverse data applications including SQL\nanalytics, real-time analytics, data science and machine learning.\n\n\n-----\n\nTo solve for new use cases and new users, a common approach is to use multiple\nsystems — a data lake, several data warehouses and other specialized systems\nsuch as streaming, time-series, graph and image databases. But having multiple\nsystems introduces complexity and delay, as data teams invariably need to\nmove or copy data between different systems, effectively losing oversight and\ngovernance over data usage.\n\n\nYou have now duplicated data in two different systems and the changes you\nmake in one system are unlikely to find their way to the other. 
So, you are going\nto have data drift almost immediately, not to mention paying to store the same\ndata multiple times.\n\nThen, because governance is happening at two distinct levels across these\nplatforms, you are not able to control things consistently.\n\n\n**Challenges with data, analytics and AI**\n\nIn a recent [Accenture](https://www.accenture.com/_acnmedia/pdf-108/accenture-closing-data-value-gap-fixed.pdf) study, only 32% of companies reported tangible and\nmeasurable value from data. The challenge is that most companies continue to\nimplement two different platforms: data warehouses for BI and data lakes for AI.\nThese platforms are incompatible with each other, but data from both systems\nis generally needed to deliver game-changing outcomes, which makes success\nwith AI extremely difficult.\n\nToday, most of the data is landing in the data lake, and a lot of it is unstructured.\nIn fact, according to [IDC](https://www.idc.com/getdoc.jsp?containerId=US47998321) , about 80% of the data in any organization will be\nunstructured by 2025. But, this data is where much of the value from AI resides.\nSubsets of the data are then copied to the data warehouse into structured\ntables, and back again in some cases.\n\nYou also must secure and govern the data in both warehouses and offer\nfine-grained governance, while lakes tend to be coarser grained at the file level.\nThen, you stand up different stacks of tools on these platforms to do either\nBI or AI.\n\n\n-----\n\nFinally, the tool stacks on top of these platforms\nare fundamentally different, which makes it difficult\nto get any kind of collaboration going between the\nteams that support them.\n\nThis is why AI efforts fail. There is a tremendous\namount of complexity and rework being introduced\ninto the system. Time and resources are being\nwasted trying to get the right data to the right\npeople, and everything is happening too slowly\nto get in front of the competition.\n\n\n**Realizing this requires two disparate,**\n**incompatible data platforms**\n\n\n**Business** **SQL** **Incomplete** **Data science** **Data**\n\n**support for**\n\n**intelligence** **analytics** **and ML** **streaming**\n\n\n**SQL**\n**analytics**\n\n\n**Incomplete**\n**support for**\n**use cases**\n\n\n**Incompatible**\n**security and**\n**governance models**\n\n**Copy subsets of data**\n\n\n\n|Col1|Col2|Col3|Col4|\n|---|---|---|---|\n|Governa T|n a|c b|e and security le ACLs|\n|||||\n\n|Col1|Col2|Col3|Col4|\n|---|---|---|---|\n|Governa File|n s|c a|e and security nd blobs|\n|||||\n\n\n**Disjointed**\n**and duplicative**\n\n**Data warehouse** **data silos** **Data lake**\nStructured tables Unstructured files:\nlogs, text, images, video\n\n\n-----\n\n**Moving forward with a lakehouse architecture**\n\nTo satisfy the need to support AI and BI directly on vast amounts of data stored\nin data lakes (on low-cost cloud storage), a new data management architecture\nemerged independently across many organizations and use cases: the\ndata lakehouse.\n\nThe data lakehouse can store _all_ and _any_ type of data once in a data lake and\nmake that data accessible directly for AI and BI. The lakehouse paradigm has\nspecific capabilities to efficiently allow both AI and BI on all the enterprise’s data\nat a massive scale. Namely, it has the SQL and performance capabilities such as\nindexing, caching and MPP processing to make BI work fast on data lakes. 
It also\nhas direct file access and direct native support for Python, data science and AI\nframeworks without the need for a separate data warehouse.\n\nIn short, a lakehouse is a data architecture that combines the best elements\nof data warehouses and data lakes. Lakehouses are enabled by a new system\ndesign, which implements similar data structures and data management features\nfound in a data warehouse directly on the low-cost storage used for data lakes.\n\n\n-----\n\n##### Data lakehouse\n\nOne platform to unify all your data, analytics and AI workloads\n\n###### Lakehouse Platform\n\nAll machine learning, SQL,\nBI, and streaming use cases\n\nOne security and governance\napproach for all data assets\non all clouds\n\n\n-----\n\n**Key features for a lakehouse**\n\nRecent innovations with the data lakehouse architecture can help simplify\nyour data and AI workloads, ease collaboration for data teams, and maintain\nthe kind of flexibility and openness that allows your organization to stay agile\nas you scale. Here are key features to consider when evaluating data lakehouse\narchitectures:\n\nTransaction support: In an enterprise lakehouse, many data pipelines will\noften be reading and writing data concurrently. Support for ACID (Atomicity,\nConsistency, Isolation and Durability) transactions ensures consistency as\nmultiple parties concurrently read or write data.\n\nSchema enforcement and governance: The lakehouse should have\na way to support schema enforcement and evolution, supporting data\nwarehouse schema paradigms such as star/snowflake. The system should\nbe able to reason about data integrity, and it should have robust governance\nand auditing mechanisms.\n\nData governance: Capabilities including auditing, retention and lineage\nhave become essential, particularly considering recent privacy regulations.\n\nTools that allow data discovery have become popular, such as data catalogs\nand data usage metrics.\n\nBI support: Lakehouses allow the use of BI tools directly on the source\ndata. This reduces staleness and latency, improves recency and lowers cost\nby not having to operationalize two copies of the data in both a data lake\nand a warehouse.\n\n\nStorage decoupled from compute: In practice, this means storage and\ncompute use separate clusters, thus these systems can scale to many more\nconcurrent users and larger data sizes. 
Some modern data warehouses also\nhave this property.\n\nOpenness: The storage formats, such as Apache Parquet, are open and\nstandardized, so a variety of tools and engines, including machine learning\nand Python/R libraries, can efficiently access the data directly.\n\nSupport for diverse data types (unstructured and structured):\nThe lakehouse can be used to store, refine, analyze and access data types\nneeded for many new data applications, including images, video, audio,\nsemi-structured data and text.\n\nSupport for diverse workloads: Use the same data repository for a range\nof workloads including data science, machine learning and SQL analytics.\nMultiple tools might be needed to support all these workloads.\n\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\n**Learn more**\n\n**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n\n**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n\n**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n\n\n-----\n\n**CHAPTER**\n\n# 02\n\n\n### The Databricks Lakehouse Platform\n\n\n-----\n\n#### Lakehouse: A new generation of open platforms\n\n\n###### This is the lakehouse paradigm\n\n\nDatabricks is the inventor and pioneer of the\ndata lakehouse architecture. The data lakehouse\narchitecture was coined in the research paper,\n[Lakehouse: A New Generation of Open Platforms that](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n[Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf) ,\nintroduced by Databricks’ founders, UC Berkeley\nand Stanford University at the 11th Conference on\nInnovative Data Systems Research (CIDR) in 2021.\n\nAt Databricks, we are continuously innovating on\nthe lakehouse architecture to help customers deliver\non their data, analytics and AI aspirations. The ideal\ndata, analytics and AI platform needs to operate\ndifferently. Rather than copying and transforming\ndata in multiple systems, you need one platform\nthat accommodates all data types.\n\n\n**Data science** **Data**\n**and ML** **streaming**\n\n\n**All ML, SQL, BI**\n**and streaming use cases**\n\n**One security and governance**\n**approach for all data assets**\n**on all clouds**\n\n**A reliable data platform**\n**to efficiently handle**\n**all data types**\n\n\n**Persona-based**\n**use cases**\n\n**Unity Catalog**\nFine-grained governance\nfor data and AI\n\n**Delta Lake**\nData reliability and performance\n\n\n**Business**\n**intelligence**\n\n\n**SQL**\n**analytics**\n\n\nFiles and blobs and table ACLs\n\n\nIdeally, the platform must be open, so that you\nare not locked into any walled gardens. 
You would\nalso have one security and governance model.\nIt would not only manage all data types, but it\nwould also be cloud-agnostic to govern data\nwherever it is stored.\n\nLast, it would support all major data, analytics and AI\nworkloads, so that your teams can easily collaborate\nand get access to all the data they need to innovate.\n\n\n-----\n\n#### What is the Databricks Lakehouse Platform?\n\nThe Databricks Lakehouse Platform unifies your\ndata warehousing and AI uses cases on a single\nplatform. It combines the best elements of data\nlakes and data warehouses to deliver the reliability,\nstrong governance and performance of data\nwarehouses with the openness, flexibility and\nmachine learning support of data lakes.\n\nThis unified approach simplifies your modern data\nstack by eliminating the data silos that traditionally\nseparate and complicate data engineering, analytics,\nBI, data science and machine learning. It’s built\non open source and open standards to maximize\nflexibility. And, its common approach to data\nmanagement, security and governance helps you\n\noperate more efficiently and innovate faster.\n\n\n**Lakehouse Platform**\n\nData Data Data Data science\nwarehousing engineering streaming and ML\n\n\n-----\n\n#### Benefits of the Databricks Lakehouse Platform\n\n\n**Simple**\n\nThe unified approach simplifies your data\narchitecture by eliminating the data silos that\ntraditionally separate analytics, BI, data science\nand machine learning. With a lakehouse, you\ncan eliminate the complexity and expense that\nmake it hard to achieve the full potential of\nyour analytics and AI initiatives.\n\n\n**Open**\n\nDelta Lake forms the open foundation of\nthe lakehouse by providing reliability and\nperformance directly on data in the data\nlake. You’re able to avoid proprietary walled\ngardens, easily share data and build your\nmodern data stack with unrestricted access\nto the ecosystem of open source data projects\nand the broad Databricks partner network.\n\n\n**Multicloud**\n\nThe Databricks Lakehouse Platform offers\nyou a consistent management, security and\ngovernance experience across all clouds. You\ndo not need to invest in reinventing processes\nfor every cloud platform that you are using to\nsupport your data and AI efforts. Instead, your\ndata teams can simply focus on putting all\nyour data to work to discover new insights.\n\n\n-----\n\n#### The Databricks Lakehouse Platform architecture\n\n**Data reliability and performance for lakehouse**\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format storage layer built for the lakehouse that integrates\nwith all major analytics tools and works with the widest variety of formats to\nstore and process data.\n\n\n**Instant compute and serverless**\n\nServerless compute is a fully managed service where Databricks provisions and\nmanages the compute layer on behalf of the customer in the Databricks cloud\naccount instead of the customer account. 
As of the current release, serverless\ncompute is supported for use with Databricks SQL.\n\nIn Chapter 6, we explore the details of instant compute and serverless for lakehouse.\n\n\n[Photon](https://databricks.com/product/photon) is the next-generation query engine built for the lakehouse that leverages\na state-of-the-art vectorized engine for fast querying and provides the best\nperformance for all workloads in the lakehouse.\n\nIn Chapter 3, we explore the details of data reliability and performance\n\nfor the lakehouse.\n\n**Unified governance and security for lakehouse**\n\nThe Databricks Lakehouse Platform provides unified governance with enterprise\nscale, security and compliance. The [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (UC) provides\ngovernance for your data and AI assets in the lakehouse — files, tables,\ndashboards, and machine learning models — giving you much better control,\nmanagement and security across clouds.\n\n[Delta Sharing](https://databricks.com/product/delta-sharing) is an open protocol that allows companies to securely share\ndata across the organization in real time, independent of the platform\non which the data resides.\n\nIn Chapter 4, we go into the details of unified governance for lakehouse\n\nand, in Chapter 5, we dive into the details of security for lakehouse.\n\n\n-----\n\n#### The Databricks Lakehouse Platform workloads\n\nThe Databricks Lakehouse Platform architecture supports different workloads\nsuch as data warehousing, data engineering, data streaming, data science and\nmachine learning on one simple, open and multicloud data platform.\n\n**Data warehousing**\n\nData warehousing is one of the most business-critical workloads for data teams,\nand the best data warehouse is a lakehouse. The Databricks Lakehouse Platform\nlets you run all your SQL and BI applications at scale with up to 12x better price/\nperformance, a unified governance model, open formats and APIs, and your tools\nof choice — no lock-in. Reduce resource management overhead with serverless\ncompute, and easily ingest, transform and query all your data in-place to deliver\nreal-time business insights faster.\n\nBuilt on open standards and APIs, the Databricks Lakehouse Platform provides\nthe reliability, quality and performance that data lakes natively lack, plus\nintegrations with the ecosystem for maximum flexibility.\n\nIn Chapter 7, we go into the details of data warehousing on the lakehouse.\n\n**Data engineering**\n\nData engineering on the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows on\nany cloud platform, and meet regulatory requirements to maintain governance.\n\n\nautomates the complexity of building and maintaining pipelines and running ETL\nworkloads so data engineers and analysts can focus on quality and reliability to\ndrive valuable insights.\n\nIn Chapter 8, we go into the details of data engineering on the lakehouse.\n\n**Data streaming**\n\n[Data streaming](https://www.databricks.com/product/data-streaming) is one of the fastest growing workloads within the Databricks\nLakehouse Platform and is the future of all data processing. 
Real-time processing\nprovides the freshest possible data to an organization’s analytics and machine\nlearning models enabling them to make better, faster decisions, more accurate\npredictions, offer improved customer experiences and more.\n\nThe Databricks Lakehouse Platform Dramatically simplifies data streaming to\ndeliver real-time analytics, machine learning and applications on one platform.\n\nIn Chapter 9, we go into the details of data streaming on the lakehouse.\n\n**Data science and machine learning**\n\nData science and machine learning (DSML) on the lakehouse is a powerful\nworkload that is unique to many other data offerings. DSML on the lakehouse\nprovides a data-native and collaborative solution for the full ML lifecycle. It\ncan maximize data and ML team productivity, streamline collaboration, empower\nML teams to prepare, process and manage data in a self-service manner,\nand standardize the ML lifecycle from experimentation to production.\n\nIn Chapter 10, we go into the details of DSML on the lakehouse.\n\n\nThe lakehouse provides an end-to-end data engineering and ETL platform that\n\n\n-----\n\n**Databricks Lakehouse Platform and your**\n**modern data stack**\n\nThe Databricks Lakehouse Platform is open and provides the flexibility to\ncontinue using existing infrastructure, to easily share data and build your modern\ndata stack with unrestricted access to the ecosystem of open source data\nprojects and the broad Databricks partner network with [Partner Connect](https://databricks.com/partnerconnect) .\n\nIn Chapter 11, we go into the details of our technology partners and the\n\nmodern data stack.\n\n#### Global adoption of the Databricks Lakehouse Platform\n\n\nToday, Databricks has over 7,000 [customers](https://databricks.com/customers) , from Fortune 500 to unicorns\nacross industries doing transformational work. Organizations around the globe\nare driving change and delivering a new generation of data, analytics and AI\napplications. We believe that the unfulfilled promise of data and AI can finally\nbe fulfilled with one platform for data analytics, data science and machine\nlearning with the Databricks Lakehouse Platform.\n\n\n**Learn more**\n\n[Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse)\n\n[Databricks Lakehouse Platform Demo Hub](https://databricks.com/discover/demos)\n\n[Databricks Lakehouse Platform Customer Stories](https://databricks.com/customers)\n\n[Databricks Lakehouse Platform Documentation](https://databricks.com/documentation)\n\n[Databricks Lakehouse Platform Training and Certification](https://databricks.com/learn/training/home)\n\n[Databricks Lakehouse Platform Resources](https://databricks.com/resources)\n\n\n-----\n\n**CHAPTER**\n\n# 03\n\n\n### Data reliability and performance\n\nTo bring openness, reliability and lifecycle management to data lakes,\nthe Databricks Lakehouse Platform is built on the foundation of Delta\nLake. 
Delta Lake solves challenges around unstructured/structured data\ningestion, the application of data quality, difficulties with deleting data for\ncompliance or issues with modifying data for data capture.\n\nAlthough data lakes are great solutions for holding large quantities of raw\ndata, they lack important attributes for data reliability and quality and\noften don’t offer good performance when compared to data warehouses.\n\n\n-----\n\n#### Problems with today’s data lakes\n\nWhen it comes to data reliability and quality, examples of these\nmissing attributes include:\n\n**•** **Lack of ACID transactions:** Makes it impossible to mix updates,\nappends and reads\n\n**•** **Lack of schema enforcement:** Creates inconsistent and low-quality data.\nFor example, rejecting writes that don’t match a table’s schema.\n\n**•** **Lack of integration with data catalog:** Results in dark data and no single\nsource of truth\n\nEven just the absence of these three attributes can cause a lot of extra work\nfor data engineers as they strive to ensure consistent high-quality data in the\npipelines they create.\n\n\nThese challenges are solved with two key technologies that are at the foundation\nof the lakehouse: Delta Lake and Photon.\n\n**What is Delta Lake?**\n\nDelta Lake is a file-based, open source storage format that provides ACID\ntransactions and scalable metadata handling, and unifies streaming and batch\ndata processing. It runs on top of existing data lakes and is compatible with\nApache Spark™ and other processing engines.\n\nDelta Lake uses Delta Tables which are based on Apache Parquet, a commonly\nused format for structured data already utilized by many organizations. Therefore,\nswitching existing Parquet tables to Delta Tables is easy and quick. Delta\nTables can also be used with semi-structured and unstructured data, providing\nversioning, reliability, metadata management, and time travel capabilities that\nmake these types of data easily managed as well.\n\n\nAs for performance, data lakes use object storage, so data is mostly kept in\nimmutable files leading to the following problems:\n\n**•** **Ineffective partitioning:** In many cases, data engineers resort to “poor man’s”\nindexing practices in the form of partitioning that leads to hundreds of dev hours\nspent tuning file sizes to improve read/write performance. Often, partitioning\nproves to be ineffective over time if the wrong field was selected for partitioning\nor due to high cardinality columns.\n\n**•** **Too many small files:** With no support for transactions, appending new data\ntakes the form of adding more and more files, leading to “small file problems,”\na known root cause of query performance degradation.\n\n\n-----\n\n**Delta Lake features**\n\n\n**ACID guarantees**\n\nDelta Lake ensures that all data changes\nwritten to storage are committed for durability\nand made visible to readers atomically. In other\nwords, no more partial or corrupted files.\n\n**Scalable data and metadata handling**\n\nSince Delta Lake is built on data lakes, all reads\nand writes using Spark or other distributed\nprocessing engines are inherently scalable to\npetabyte-scale. 
However, unlike most other\nstorage formats and query engines, Delta Lake\nleverages Spark to scale out all the metadata\nprocessing, thus efficiently handling metadata\nof billions of files for petabyte-scale tables.\n\n\n**Audit history and time travel**\n\nThe Delta Lake transaction log records details\nabout every change made to data, providing a full\naudit trail of the changes. These data snapshots\nallow developers to access and revert to earlier\nversions of data for audits, rollbacks or to\nreproduce experiments.\n\n**Schema enforcement and schema evolution**\n\nDelta Lake automatically prevents the insertion of\ndata with an incorrect schema, i.e., not matching\nthe table schema. And when needed, it allows the\ntable schema to be explicitly and safely evolved to\naccommodate ever-changing data.\n\n\n**Support for deletes, updates and merges**\n\nMost distributed processing frameworks do not\nsupport atomic data modification operations on\ndata lakes. Delta Lake supports merge, update\nand delete operations to enable complex use\ncases including but not limited to change data\ncapture (CDC), slowly changing dimension (SCD)\noperations and streaming upserts.\n\n**Streaming and batch unification**\n\nA Delta Lake table can work both in batch\nand as a streaming source and sink. The\nability to work across a wide variety of latencies,\nranging from streaming data ingestion to batch\nhistoric backfill, to interactive queries all work\nout of the box.\n\n\n-----\n\n**The Delta Lake transaction log**\n\nA key to understanding how Delta Lake provides all these capabilities is the\ntransaction log. The Delta Lake transaction log is the common thread that runs\nthrough many of Delta Lake’s most notable features, including ACID transactions,\nscalable metadata handling, time travel and more. The Delta Lake transaction log\nis an ordered record of every transaction that has ever been performed on\na Delta Lake table since its inception.\n\nDelta Lake is built on top of Spark to allow multiple readers and writers of a\ngiven table to work on a table at the same time. To always show users correct\nviews of the data, the transaction log serves as a single source of truth: the\ncentral repository that tracks all changes that users make to the table.\n\nWhen a user reads a Delta Lake table for the first time or runs a new query on\nan open table that has been modified since the last time it was read, Spark\nchecks the transaction log to see what new transactions are posted to the table.\nThen, Spark updates the table with those recent changes. This ensures that a\nuser’s version of a table is always synchronized with the master record as of the\nmost recent query, and that users cannot make divergent, conflicting changes\nto a table.\n\n\n**Flexibility and broad industry support**\n\nDelta Lake is an open source project, with an engaged community of\ncontributors building and growing the Delta Lake ecosystem atop a set of open\nAPIs and is part of the Linux Foundation. With the growing adoption of Delta Lake\nas an open storage standard in different environments and use cases, comes a\nbroad set of integration with industry-leading tools, technologies and formats.\n\nOrganizations leveraging Delta Lake on the Databricks Lakehouse Platform gain\nflexibility in how they ingest, store and query data. 
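To make the features above concrete, here is a minimal PySpark sketch of the behaviors described in this chapter: converting an existing Parquet directory to Delta, an ACID MERGE upsert, and time travel back to an earlier version. The path and column names are illustrative only, and the sketch assumes a Databricks cluster or a local Spark session configured with the open source delta-spark package.

```python
# Minimal sketch of the Delta Lake behaviors described above.
# Assumes a Databricks cluster or a Spark session with delta-spark configured;
# the path and the event_id/event_type columns are illustrative only.
from delta.tables import DeltaTable
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Switch an existing Parquet directory to a Delta table in place.
spark.sql("CONVERT TO DELTA parquet.`/mnt/raw/events`")

# ACID upsert (MERGE) into the converted table -- no partial or corrupted files.
updates = spark.createDataFrame(
    [(1, "click"), (2, "view")], ["event_id", "event_type"]
)
target = DeltaTable.forPath(spark, "/mnt/raw/events")
(
    target.alias("t")
    .merge(updates.alias("u"), "t.event_id = u.event_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute()
)

# Time travel: read the table as it was before the merge.
previous = spark.read.format("delta").option("versionAsOf", 0).load("/mnt/raw/events")
previous.show()
```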
They are not limited in storing\ndata in a single cloud provider and can implement a true multicloud approach to\ndata storage.\n\nConnectors to tools, such as Fivetran, allow you to leverage Databricks’\necosystem of partner solutions, so organizations have full control of building the\nright ingestion pipelines for their use cases. Finally, consuming data via queries\nfor exploration or business intelligence (BI) is also flexible and open.\n\n\n-----\n\n**Delta Lake integrates with all major analytics tools**\n\nEliminates unnecessary data movement and duplication\n\n\n-----\n\nIn addition to a wide ecosystem of tools and technologies, Delta Lake supports\na broad set of data formats for structured, semi-structured and unstructured\ndata. These formats include image binary data that can be stored in Delta\nTables, graph data format, geospatial data types and key-value stores.\n\n**Learn more**\n\n[Delta Lake on the Databricks Lakehouse](https://databricks.com/product/delta-lake-on-databricks)\n\n[Documentation](https://docs.databricks.com/delta/index.html)\n\n[Delta Lake Open Source Project](https://docs.databricks.com/delta/index.html)\n\n[eBooks: The Delta Lake Series](https://databricks.com/p/ebook/the-definitive-guide-to-delta-lake-series)\n\n\n**What is Photon?**\n\nAs many organizations standardize on the lakehouse paradigm, this new\narchitecture poses challenges with the underlying query execution engine\nfor accessing and processing structured and unstructured data. The execution\nengine needs to provide the performance of a data warehouse and the scalability\nof data lakes.\n\nPhoton is the next-generation query engine on the Databricks Lakehouse\nPlatform that provides dramatic infrastructure cost savings and speedups for\nall use cases — from data ingestion, ETL, streaming, data science and interactive\nqueries — directly on your data lake. Photon is compatible with Spark APIs and\nimplements a more general execution framework that allows efficient processing\nof data with support of the Spark API. This means getting started is as easy as\nturning it on — no code change and no lock-in. With Photon, typical customers are\nseeing up to 80% TCO savings over traditional Databricks Runtime (Spark) and up\nto 85% reduction in VM compute hours.\n\nSpark instructions Photon instructions\n\n\nPhoton engine\n\n\nDelta/Parquet\n\nPhoton writer\nto Delta/Parquet\n\n\n-----\n\nWhy process queries with Photon?\n\n\nQuery performance on Databricks has steadily increased over the years,\npowered by Spark and thousands of optimizations packaged as part of the\nDatabricks Runtime (DBR). 
Photon provides an additional 2x speedup per the\nTPC-DS 1TB benchmark compared to the latest DBR versions.\n\n**Relative speedup to DBR 2.1 by DBR version**\nHigher is better\n\n\n**Customers have observed significant speedups using**\n**Photon on workloads such as:**\n\n**•** **SQL-based jobs:** Accelerate large-scale production jobs on\nSQL and Spark DataFrames\n\n**•** **IoT use cases:** Faster time-series analysis using Photon\ncompared to Spark and traditional Databricks Runtime\n\n**•** **Data privacy and compliance:** Query petabytes-scale data\nsets to identify and delete records without duplicating data\nwith Delta Lake, production jobs and Photon\n\n**•** **Loading data into Delta and Parquet:** Vectorized I/O\nspeeds up data loads for Delta and Parquet tables, lowering\noverall runtime and costs of data engineering jobs\n\n\nRelease date - DBR version (TPC-DS 1TB 10 x i3xl)\n\n\n-----\n\n**100TB TPC-DS price/performance**\nLower is better\n\n\nBest price/performance for analytics\nin the cloud\n\nWritten from the ground up in C++, Photon takes\nadvantage of modern hardware for faster queries,\nproviding up to 12x better price/performance\ncompared to other cloud data warehouses —\nall natively on your data lake.\n\n\nDatabricks SQL Databricks SQL Cloud data Cloud data Cloud data\nspot on-demand warehouse 1 warehouse 2 warehouse 3\n\n**System**\n\n\n-----\n\nWorks with your existing code\nand avoids vendor lock-in\n\nPhoton is designed to be compatible with the\nApache Spark DataFrame and SQL APIs to ensure\nworkloads run seamlessly without code changes.\nAll you do is turn it on. Photon will seamlessly\ncoordinate work and resources and transparently\naccelerate portions of your SQL and Spark queries.\nNo tuning or user intervention required.\n\n\n**Photon in the Databricks Lakehouse Platform**\n\n**Client: submit SQL**\n\nParsing\nCatalyst: analysis/\nplanning/optimization\nscheduling\n\nExecute task Execute task Execute task Execute task\n\n_Lifecycle of a Photon query_\n\n\nSpark\ndriver\nJVM\n\nSpark\nexecutors mixed\nJVM/Native\n\n\n-----\n\nOptimizing for all data use cases\nand workloads\n\nPhoton is the first purpose-built lakehouse engine\ndesigned to accelerate all data and analytics\nworkloads: data ingestion, ETL, streaming, data\nscience, and interactive queries. While we started\nPhoton primarily focused on SQL to provide\ncustomers with world-class data warehousing\nperformance on their data lakes, we’ve significantly\nincreased the scope of ingestion sources, formats,\nAPIs and methods supported by Photon since\nthen. 
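As a rough illustration of the "just turn it on" point, the sketch below creates a Photon-enabled cluster with the Databricks SDK for Python. The cluster name, node type and runtime version are placeholders, and the exact SDK surface may differ slightly across releases.

```python
# Hedged sketch: creating a Photon-enabled cluster with the Databricks SDK for Python.
# Workspace authentication is taken from the environment; cluster name, Spark version
# and node type are placeholders -- adjust for your workspace.
from databricks.sdk import WorkspaceClient
from databricks.sdk.service.compute import RuntimeEngine

w = WorkspaceClient()

cluster = w.clusters.create_and_wait(
    cluster_name="photon-demo",
    spark_version="14.3.x-scala2.12",
    node_type_id="i3.xlarge",
    num_workers=2,
    autotermination_minutes=60,
    runtime_engine=RuntimeEngine.PHOTON,  # existing Spark SQL / DataFrame code runs unchanged
)
print(cluster.cluster_id)
```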
As a result, customers have seen dramatic\ninfrastructure cost savings and speedups on\nPhoton across all their modern Spark (e.g., Spark\nSQL and DataFrame) workloads.\n\n\nQuery optimizer\n\nNative execution engine\n\nCaching\n\n\n_Accelerating all workloads on the lakehouse_\n\n**Learn more**\n\n[Announcing Photon Public Preview: The Next-Generation](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n[Query Engine on the Databricks Lakehouse Platform](https://www.databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html)\n\n[Databricks Sets Official Data Warehousing Performance Record](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n\n\n-----\n\n**CHAPTER**\n\n# 04\n\n\n### Unified governance and sharing for data, analytics and AI\n\nToday, more and more organizations recognize the importance of making\nhigh-quality data readily available to data teams to drive actionable insights\nand business value. At the same time, organizations also understand the risks\nof data breaches which negatively impact brand value and inevitably lead to\nerosion of customer trust. Governance is one of the most critical components\nof a lakehouse data platform architecture; it helps ensure that data assets\nare securely managed throughout the enterprise. However, many companies\nare using different incompatible governance models leading to complex and\nexpensive solutions.\n\n\n-----\n\n#### Key challenges with data and AI governance\n\n**Diversity of data and AI assets**\n\nThe increased use of data and the added complexity of the data landscape\nhave left organizations with a difficult time managing and governing all types\nof their data-related assets. No longer is data stored in files or tables. Data\nassets today take many forms, including dashboards, machine learning models\nand unstructured data like video and images that legacy data governance\nsolutions simply are not built to govern and manage.\n\n\n**Rising multicloud adoption**\n\nMore and more organizations now leverage a multicloud strategy to optimize\ncosts, avoid vendor lock-in, and meet compliance and privacy regulations. With\nnonstandard, cloud-specific governance models, data governance across clouds\nis complex and requires familiarity with cloud-specific security and governance\nconcepts, such as identity and access management (IAM).\n\n**Disjointed tools for data governance on the lakehouse**\n\nToday, data teams must deal with a myriad of fragmented tools and services for\ntheir data governance requirements, such as data discovery, cataloging, auditing,\nsharing, access controls, etc. This inevitably leads to operational inefficiencies\nand poor performance due to multiple integration points and network latency\nbetween the services.\n\n\n**Two disparate and incompatible data platforms**\n\nOrganizations today use two different platforms for their data analytics and\nAI efforts — data warehouses for BI and data lakes for AI. 
This results in data\nreplication across two platforms, presenting a major governance challenge.\nWith no unified view of the data landscape, it is difficult to see where data is\nstored, who has access to what data, and consistently define and enforce data\naccess policies across the two platforms with different governance models.\n\n\n-----\n\n#### One security and governance approach\n\nLakehouse systems provide a uniform way to manage access control, data\nquality and compliance across all of an organization’s data using standard\ninterfaces similar to those in data warehouses by adding a management\ninterface on top of data lake storage.\n\nModern lakehouse systems support fine-grained (row, column and view level)\naccess control via SQL, query auditing, attribute-based access control, data\nversioning and data quality constraints and monitoring. These features are\ngenerally provided using standard interfaces familiar to database administrators\n(for example, SQL GRANT commands) to allow existing personnel to manage\nall the data in an organization in a uniform way. Centralizing all the data in\na lakehouse system with a single management interface also reduces the\nadministrative burden and potential for error that comes with managing\nmultiple separate systems.\n\n\n#### What is Unity Catalog?\n\nUnity Catalog is a unified governance solution for all data, analytics and AI\nassets including files, tables, dashboards and machine learning models in your\nlakehouse on any cloud. Unity Catalog simplifies governance by empowering\ndata teams with a common governance model based on ANSI-SQL to define\nand enforce fine-grained access controls. With attribute-based access controls,\ndata administrators can enable fine-grained access controls on rows and\ncolumns using tags (attributes). Built-in data search and discovery allows\ndata teams to quickly find and reference relevant data for any use case. Unity\nCatalog offers automated data lineage for all workloads in SQL, R, Scala and\nPython, to build a better understanding of the data and its flow in the lakehouse.\nUnity Catalog also allows data sharing across or within organizations and\nseamless integrations with your existing data governance tools.\n\nWith Unity Catalog, data teams can simplify governance for all data and AI\nassets with one consistent model to discover, access and share data, giving\nyou much better native performance, management and security across clouds.\n\n\n-----\n\n**Key benefits**\n\n\nThe common metadata layer for cross-workspace metadata is at the account\nlevel and eases collaboration by allowing different workspaces to access Unity\nCatalog metadata through a common interface and break down data silos.\nFurther, the data permissions in Unity Catalog are applied to account-level\nidentities, rather than identities that are local to a workspace, allowing\na consistent view of users and groups across all workspaces.\n\n\nCatalog, secure and audit access to all data assets on any cloud\n\nUnity Catalog provides centralized metadata, enabling data teams to create\na single source of truth for all data assets ranging from files, tables, dashboards\nto machine learning models in one place.\n\n\n-----\n\nUnity Catalog offers a unified data access layer that provides a simple and\nstreamlined way to define and connect to your data through managed tables,\nexternal tables, or files, while managing their access controls. 
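For example, the ANSI SQL grant model can be exercised directly from a notebook; the catalog, schema, table and group names below are purely illustrative.

```python
# Minimal sketch of Unity Catalog's ANSI SQL grant model, run from a Databricks notebook.
# The catalog, schema, table and group names are illustrative.
spark.sql("GRANT USE CATALOG ON CATALOG main TO `data-analysts`")
spark.sql("GRANT USE SCHEMA ON SCHEMA main.sales TO `data-analysts`")
spark.sql("GRANT SELECT ON TABLE main.sales.orders TO `data-analysts`")

# Review the grants currently in place on the table.
spark.sql("SHOW GRANTS ON TABLE main.sales.orders").show(truncate=False)
```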
Unity Catalog\ncentralizes access controls for files, tables and views.\n\nIt allows fine-grained access controls for restricting access to certain rows\nand columns to the users and groups who are authorized to query them. With\nAttribute-Based Access Controls (ABAC), you can control access to multiple\ndata items at once based on user and data attributes, further simplifying\ngovernance at scale. For example, you will be able to tag multiple columns\nas personally identifiable information (PII) and manage access to all columns\ntagged as PII in a single rule.\n\nToday, organizations are dealing with an increased burden of regulatory\ncompliance, and data access auditing is a critical component to ensure your\norganization is set up for success while meeting compliance requirements.\nUnity Catalog also provides centralized fine-grained auditing by capturing an\naudit log of operations such as create, read, update and delete (CRUD) that have\nbeen performed against the data. This allows a fine-grained audit trail showing\nwho accessed a given data set and helps you meet your compliance and\nbusiness requirements.\n\n\n-----\n\nBuilt-in data search and discovery\n\nData discovery is a critical component to break\ndown data silos and democratize data across\nyour organization to make data-driven decisions.\nUnity Catalog provides a rich user interface for\ndata search and discovery, enabling data teams to\nquickly search relevant data assets across the data\nlandscape and reference them for all use cases —\nBI, analytics and machine learning — accelerating\ntime-to-value and boosting productivity.\n\n\n-----\n\nAutomated data lineage for all workloads\n\nData lineage describes the transformations and\nrefinements of data from source to insight. Lineage\nincludes capturing all the relevant metadata and\nevents associated with the data in its lifecycle,\nincluding the source of the data set, what other\ndata sets were used to create it, who created it and\nwhen, what transformations were performed, which\nother data sets leverage it, and many other events\nand attributes. Unity Catalog offers automated data\nlineage down to table and column level, enabling\ndata teams to get an end-to-end view of where\ndata is coming from, what transformations were\nperformed on the data and how data is consumed\nby end applications such as notebooks, workflows,\ndashboards, machine learning models, etc.\n\nWith automated data lineage for all workloads —\nSQL, R, Python and Scala, data teams can quickly\nidentify and perform root cause analysis of any\nerrors in the data pipelines or end applications.\nSecond, data teams can perform impact analysis\nto see dependencies of any data changes\non downstream consumers and notify them\nabout the potential impact. Finally, data lineage\nalso empowers data teams with increased\nunderstanding of their data and reduces tribal\nknowledge. Unity Catalog can also capture lineage\nassociated with non-data entities, such as notebooks,\nworkflows and dashboards. 
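As a hedged sketch of how this captured metadata can be inspected, the queries below read the lineage and audit system tables from a notebook. They assume system tables are enabled for the workspace; the table and column names follow the documented system table schemas but may vary by release, and the example table name is illustrative.

```python
# Hedged sketch: inspecting lineage and audit information captured by Unity Catalog
# via Databricks system tables (assumes system tables are enabled; schemas may vary).
upstream = spark.sql("""
    SELECT source_table_full_name, entity_type, event_time
    FROM system.access.table_lineage
    WHERE target_table_full_name = 'main.sales.orders'
    ORDER BY event_time DESC
""")
upstream.show(truncate=False)

recent_activity = spark.sql("""
    SELECT event_time, user_identity.email AS user, action_name
    FROM system.access.audit
    WHERE service_name = 'unityCatalog'
    ORDER BY event_time DESC
    LIMIT 20
""")
recent_activity.show(truncate=False)
```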
Lineage can be\n\n\n_Data lineage with Unity Catalog_\n\nretrieved via REST APIs to support integrations\nwith other catalogs.\n\nIntegrated with your existing tools\n\n\n**Resources**\n\n[Learn more about Unity Catalog](https://databricks.com/product/unity-catalog)\n\n[AWS Documentation](https://docs.databricks.com/data-governance/unity-catalog/index.html)\n\n[Azure Documentation](https://docs.microsoft.com/en-us/azure/databricks/data-governance/unity-catalog/)\n\n\nUnity Catalog helps you to future-proof your data\nand AI governance with the flexibility to leverage\nyour existing data catalogs and governance\nsolutions — Collibra, Alation, Immuta, Privacera,\nMicrosoft Purview and AWS Lakeformation.\n\n\n-----\n\n#### Open data sharing and collaboration\n\nData sharing has become important in the digital\neconomy as enterprises wish to exchange data\neasily and securely with their customers, partners,\nsuppliers and internal lines of business to better\ncollaborate and unlock value from that data. But\nto date, a lack of standards-based data sharing\nprotocol has resulted in data sharing solutions\ntied to a single vendor or commercial product,\nintroducing vendor lock-in risks. What the industry\ndeserves is an open approach to data sharing.\n\n**Why data sharing is hard**\n\nData sharing has evolved from an optional feature\nof a few data platforms to a business necessity\nand success factor for organizations. Our solution\narchitects encounter daily the classic scenarios\nof a retailer looking to publish sales data to their\nsuppliers in real time or a supplier that wants to\nshare real-time inventory.\n\nAs a reminder, data sharing recently triggered\nthe most impressive scientific development that\nhumankind has ever seen. On January 5, 2021, the\nfirst sample of the genome of the coronavirus was\n\n\nuploaded to the internet. It wasn’t a lung biopsy\nfrom a patient in Wuhan, but a shared digital\ngenomic data set that triggered the development\nof the first batch of COVID vaccines worldwide.\n\n\ntreatments, tests and tracking mutations as they\nare passed down through a lineage, a branch of\nthe coronavirus family tree. 
The above graphic\nshows such a [publicly shared mutation data set](https://www.ncbi.nlm.nih.gov/genbank/) .\n\n\nSince then, coronavirus experts have daily\nexchanged public data sets, looking for better\n\n\n-----\n\nSharing data, as well as consuming data from\nexternal sources, allows you to collaborate with\npartners, establish new partnerships, enable\nresearch and can generate new revenue streams\nwith data monetization.\n\nDespite those promising examples, existing data\nsharing technologies come with several limitations:\n\n**•** Traditional data sharing technologies, such as\nSecure File Transfer Protocol (SFTP), do not\nscale well and only serve files offloaded to a\nserver\n\n**•** Cloud object stores operate on an object level\nand are cloud-specific\n\n**•** Commercial data sharing offerings baked into\nvendor products often share tables instead of\nfiles, but scaling them is expensive and they\nare not open and, therefore, do not permit data\nsharing with a different platform\n\nThe following table compares proprietary vendor\nsolutions with SFTP, cloud object stores and Delta\nSharing.\n\n\n\n|Col1|Proprietary vendor solutions|SFTP|Cloud object store|Delta Sharing|\n|---|---|---|---|---|\n|Secure|||||\n|Cheap|||||\n|Vendor agnostic|||||\n|Multicloud|||||\n|Open source|||||\n|Table/DataFrame abstraction|||||\n|Live data|||||\n|Predicate pushdown|||||\n|Object store bandwidth|||||\n|Zero compute cost|||||\n|Scalability|||||\n\n\n-----\n\n**Open source data sharing and Databricks**\n\nTo address the limitations of existing data sharing solutions, Databricks developed\n[Delta Sharing](https://github.com/delta-io/delta-sharing) , with various contributions from the OSS community, and donated it\nto the Linux Foundation.\n\nAn open source–based solution, such as Delta Sharing, eliminates the lock-in\nof commercial solutions and brings a number of additional benefits such as\ncommunity-developed integrations with popular, open source data processing\nframeworks. In addition, open protocols allow the easy integration of commercial\nclients, such as BI tools.\n\n**What is Databricks Delta Sharing?**\n\nDatabricks Delta Sharing provides an open solution to securely share live data\nfrom your lakehouse to any computing platform. Recipients don’t have to be\non the Databricks platform or on the same cloud or a cloud at all. Data providers\ncan share live data, without replicating or moving it to another system. Recipients\nbenefit from always having access to the latest version of data and can quickly\nquery shared data using tools of their choice for BI, analytics and machine\nlearning, reducing time-to-value. Data providers can centrally manage, govern,\naudit and track usage of the shared data on one platform.\n\nUnity Catalog natively supports [Delta Sharing](https://databricks.com/product/delta-sharing) , the world’s first open protocol\nfor data sharing, enabling organizations to share live, large-scale data without\nreplication and make data easily and quickly accessible from tools of your\nchoice, with enterprise-grade security.\n\n\n**Key benefits**\n\nOpen cross-platform sharing\n\nEasily share existing data in Delta Lake and Apache Parquet formats between\ndifferent vendors. Consumers don’t have to be on the Databricks platform, same\ncloud or a cloud at all. Native integration with Power BI, Tableau, Spark, pandas\nand Java allow recipients to consume shared data directly from the tools of their\nchoice. 
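A minimal sketch of both sides of a share is shown below, assuming the provider runs Unity Catalog SQL on Databricks and the recipient uses the open source delta-sharing Python client with a profile file received from the provider. The share, recipient, table and file names are illustrative.

```python
# --- Provider side (runs on Databricks with Unity Catalog) ------------------
# Illustrative share/recipient/table names; run from a notebook or the SQL editor.
spark.sql("CREATE SHARE IF NOT EXISTS retail_share")
spark.sql("ALTER SHARE retail_share ADD TABLE main.sales.orders")
spark.sql("CREATE RECIPIENT IF NOT EXISTS acme_corp")  # produces an activation link for the recipient
spark.sql("GRANT SELECT ON SHARE retail_share TO RECIPIENT acme_corp")

# --- Recipient side (any Python environment, no Databricks required) --------
# `pip install delta-sharing`; config.share is the profile file the provider sends.
import delta_sharing

table_url = "config.share#retail_share.sales.orders"
df = delta_sharing.load_as_pandas(table_url)   # or delta_sharing.load_as_spark(table_url)
print(df.head())
```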
Delta Sharing eliminates the need to set up a new ingestion process to\nconsume data. Data recipients can directly access the fresh data and query it\nusing tools of their choice. Recipients can also enrich data with data sets from\npopular data providers.\n\nSharing live data without copying it\n\nShare live ready-to-query data, without replicating or moving it to another system.\nMost enterprise data today is stored in cloud data lakes. Any of the existing data\nsets on the provider’s data lake can easily be shared across clouds, regions or\ndata platforms without any data replication or physical movement of data. Data\nproviders can update their data sets reliably in real time and provide a fresh and\nconsistent view of their data to recipients.\n\nCentralized administration and governance\n\nYou can centrally govern, track and audit access to the shared data from a single\npoint of enforcement to meet compliance requirements. Detailed user-access\naudit logs are kept to know who is accessing the data and monitor usage of the\nshared data down to table, partition and version level.\n\n\n-----\n\nAn open Marketplace for data solutions\n\nThe demand for third-party data to make data-driven innovations is greater than ever,\n\nand data marketplaces act as a bridge between data providers and data consumers to\n\nhelp facilitate the discovery and distribution of data sets.\n\nDatabricks Marketplace provides an open marketplace for exchanging data products\n\nsuch as data sets, notebooks, dashboards and machine learning models. To accelerate\n\ninsights, data consumers can discover, evaluate and access more data products from\n\nthird-party vendors than ever before. Providers can now commercialize new offerings\n\nand shorten sales cycles by providing value-added services on top of their data.\n\nDatabricks Marketplace is powered by Delta Sharing, allowing consumers to access\n\ndata products without having to be on the Databricks platform. This open approach\n\nallows data providers to broaden their addressable market without forcing consumers\n\ninto vendor lock-in.\n\n_Databricks Marketplace_\n\n\nPrivacy-safe data cleanrooms\n\nPowered by open source Delta Sharing, the Databricks Lakehouse Platform provides\n\na flexible data cleanroom solution allowing businesses to easily collaborate with their\n\ncustomers and partners on any cloud in a privacy-safe way. Participants in the data\n\ncleanrooms can share and join their existing data, and run complex workloads in any\n\nlanguage — Python, R, SQL, Java and Scala — on the data while maintaining data\n\nprivacy. Additionally, data cleanroom participants don’t have to do cost-intensive\n\ndata replication across clouds or regions with other participants, which simplifies data\n\noperations and reduces cost.\n\n_Data cleanrooms with Databricks Lakehouse Platform_\n\n\n-----\n\n**How it works**\n\nDelta Sharing is designed to be simple, scalable, non-proprietary and cost-effective for organizations that are serious about getting more from their data. Delta Sharing\nis natively integrated with Unity Catalog, which allows customers to add fine-grained governance and security controls, making it easy and safe to share data internally\nor externally.\n\nDelta Sharing is a simple REST protocol that securely shares access to part of a cloud data set. It leverages modern cloud storage systems — such as AWS S3,\nAzure ADLS or Google’s GCS — to reliably transfer large data sets. 
Here’s how it works for data providers and data recipients.\n\n**Data provider** **Data recipient**\n\nData science And many more On-premises\n\nThe data provider shares existing tables or parts thereof (such as specific table versions or partitions) stored on the cloud data lake in Delta Lake format. The provider\ndecides what data they want to share and runs a sharing server in front of it that implements the Delta Sharing protocol and manages access for recipients. To manage\nshares and recipients, you can use SQL commands or the Unity Catalog CLI or the intuitive user interface.\n\nThe data recipient only needs one of the many Delta Sharing clients that supports the protocol. Databricks has released open source connectors for pandas, Apache\nSpark, Java and Python, and is working with partners on many more.\n\n\n-----\n\nThe Delta Sharing data exchange follows three efficient steps:\n\n1. The recipient’s client authenticates to the sharing server and asks to query\na specific table. The client can also provide filters on the data (for example,\n“country=US”) as a hint to read just a subset of the data.\n\n2. The server verifies whether the client is allowed to access the data, logs the\nrequest, and then determines which data to send back. This will be a subset\nof the data objects in cloud storage systems that make up the table.\n\n3. To transfer the data, the server generates short-lived presigned URLs that\nallow the client to read these Parquet files directly from the cloud provider,\nso that the transfer can happen in parallel at massive bandwidth, without\nstreaming through the sharing server.\n\n**Learn more**\n\n[Try Delta Sharing](https://databricks.com/product/delta-sharing)\n\n[Delta Sharing Demo](https://youtu.be/wRT1Vpbyy88)\n\n[Introducing Delta Sharing: An Open Protocol for Secure Data Sharing](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Introducing Data Cleanrooms for the Lakehouse](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Introducing Databricks Marketplace](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html)\n\n[Delta Sharing ODSC Webinar](https://www.youtube.com/watch?v=YrNHtaWlkM8)\n\n\n-----\n\n**CHAPTER**\n\n# 05\n\n\n### Security\n\nOrganizations that operate in multicloud environments need a unified, reliable\nand consistent approach to secure data. We’ve learned from our customers that\na simple and unified approach to data security for the lakehouse is one of the\nmost critical requirements for modern data solutions. Databricks is trusted by\nthe world’s largest organizations to provide a powerful lakehouse platform with\nhigh security and scalability. In fact, thousands of customers trust Databricks\nwith their most sensitive data to analyze and build data products using machine\nlearning (ML). With significant investment in building a highly secure and scalable\nplatform, Databricks delivers end-to-end platform security for data and users.\n\n\n-----\n\n#### Platform architecture reduces risk\n\nThe Databricks Lakehouse architecture is split into\ntwo separate planes to simplify your permissions,\navoid data duplication and reduce risk. The control\nplane is the management plane where Databricks\nruns the workspace application and manages\nnotebooks, configuration and clusters. 
Unless you\nchoose to use [serverless compute](https://docs.databricks.com/serverless-compute/index.html) , the data plane\nruns inside your cloud service provider account,\nprocessing your data without taking it out of your\naccount. You can embed Databricks in your data\nexfiltration protection architecture using features\nlike customer-managed VPCs/VNets and admin\nconsole options that disable export.\n\nWhile certain data, such as your notebooks,\nconfigurations, logs, and user information, is\npresent within the control plane, that information\nis encrypted at rest, and communication to and\nfrom the control plane is encrypted in transit.\n\n\n\n**Users**\n\n**Interactive**\n**users**\n\n\n\n\n\n\n|Col1|Control pane|Col3|\n|---|---|---|\n||Web application Configurations Notebooks, repos, DBSQL|Cluster Cluste Your cloud s Your cloud s|\n||Cluster manager||\n\n\nYou also have choices for where certain data lives:\nYou can host your own store of metadata about\nyour data tables (Hive metastore), or store query\n\n\n**Data**\n\n\n**DBFS root**\n\n\nresults in your cloud service provider account and\ndecide whether to use the [Databricks Secrets API.](https://docs.databricks.com/dev-tools/api/latest/secrets.html)\n\n\n-----\n\n#### Step-by-step example\n\n\n\n**Users**\n\n**Interactive**\n**users**\n\n\n\n\n\n\n\n\n**DBFS root**\n\n|Col1|ample|Col3|Col4|Col5|\n|---|---|---|---|---|\n||Control pane 1 4||||\n|||Web application Configurations Notebooks, repos, DBSQL Cluster manager|6|Cluster Cluste YYoouurr cclloouudd s|\n||||||\n||||||\n||||||\n||||||\n||||||\n\n\n-----\n\nSuppose you have a data engineer that signs in to Databricks and\nwrites a notebook that transforms raw data in Kafka to a normalized\ndata set sent to storage such as Amazon S3 or Azure Data Lake\nStorage. Six steps make that happen:\n\n1. The data engineer seamlessly authenticates, via your single sign-on\nif desired, to the Databricks web UI in the control plane, hosted in\nthe Databricks account.\n\n2. As the data engineer writes code, their web browser sends it to\nthe control plane. JDBC/ODBC requests also follow the same path,\nauthenticating with a token.\n\n3. When ready, the control plane uses Cloud Service Provider APIs to\ncreate a Databricks cluster, made of new instances in the data plane,\nin your CSP account. Administrators can apply cluster policies to\nenforce security profiles.\n\n4. Once the instances launch, the cluster manager sends the data\nengineer’s code to the cluster.\n\n5. The cluster pulls from Kafka in your account, transforms the data\nin your account and writes it to a storage in your account.\n\n6. The cluster reports status and any outputs back to the cluster manager.\n\nThe data engineer does not need to worry about many of the details —\nsimply write the code and Databricks runs it.\n\n\n#### Network and server security\n\nHere is how Databricks interacts with your cloud service provider\naccount to manage network and server security\n\n**Networking**\n\nRegardless of where you choose to host the data plane, Databricks networking\nis straightforward. 
If you host it yourself, Databricks by default will still configure\nnetworking for you, but you can also control data plane networking with your\nown managed VPC or VNet.\n\nThe serverless data plane network infrastructure is managed by Databricks in\na Databricks cloud service provider account and shared among customers,\nwith additional network boundaries between workspaces and between clusters.\n\nDatabricks does not rewrite or change your data structure in your storage, nor\ndoes it change or modify any of your security and governance policies. Local\nfirewalls complement security groups and subnet firewall policies to block\nunexpected inbound connections.\n\nCustomers at the enterprise tier can also use the IP access list feature on\nthe control plane to limit which IP addresses can connect to the web UI or\nREST API — for example, to allow only VPN or office IPs.\n\n\n-----\n\n**Servers**\n\nIn the data plane, Databricks clusters automatically run the latest hardened\nsystem image. Users cannot choose older (less secure) images or code. For AWS\nand Azure deployments, images are typically updated every two-to-four weeks.\nGCP is responsible for its system image.\n\nDatabricks runs scans for every release, including:\n\n**•** System image scanning for vulnerabilities\n\n**•** Container OS and library scanning\n\n\n**Severity** **Remediation time**\n\n**Critical** **< 14 days**\n\n**High** **< 30 days**\n\n**Medium** **< 60 days**\n\n**Low** **When appropriate**\n\n\n\n**•** Static and dynamic code scanning\n\n**Databricks access**\n\n\nDatabricks code is peer reviewed by developers who have security training.\nSignificant design documents go through comprehensive security reviews.\nScans run fully authenticated, with all checks enabled, and issues are\ntracked against the timeline shown in this table.\n\nNote that Databricks clusters are typically short-lived (often terminated\nafter a job completes) and do not persist data after they terminate. Clusters\ntypically share the same permission level (excluding high concurrency or\nDatabricks SQL clusters, where more robust security controls are in place).\nYour code is launched in an unprivileged container to maintain system\nstability. This security design provides protection against persistent attackers\nand privilege escalation.\n\n\nDatabricks access to your environment is limited to cloud service provider APIs\nfor our automation and support access. Automated access allows the Databricks\ncontrol plane to configure resources in your environment using the cloud service\nprovider APIs. The specific APIs vary based on the cloud. For instance, an AWS\ncross-account IAM role, or Azure-owned automation or GKE automation do not\ngrant access to your data sets (see the next section).\n\nDatabricks has a custom-built system that allows staff to fix issues or handle\nsupport requests — for example, when you open a support request and check the\nbox authorizing access to your workspace. Access requires either a support ticket\nor engineering ticket tied expressly to your workspace and is limited to a subset of\nemployees and for limited time periods. Additionally, if you have configured audit\nlog delivery, the audit logs show the initial access event and the staff’s actions.\n\n\n-----\n\n**Identity and access**\n\nDatabricks supports robust ACLs and SCIM. AWS customers can configure\nSAML 2.0 and block non-SSO logins. 
Azure Databricks and Databricks on\nGCP automatically integrate with Azure Active Directory or GCP identity.\n\nDatabricks supports a variety of ways to enable users to access their data.\n\n**Examples include:**\n\n**•** The Table ACLs feature uses traditional SQL-based statements to\nmanage access to data and enable fine-grained view-based access\n\n**•** IAM instance profiles enable AWS clusters to assume an IAM role, so\nusers of that cluster automatically access allowed resources without\nexplicit credentials\n\n**•** External storage can be mounted or accessed using a securely\nstored access key\n\n**•** The Secrets API separates credentials from code when accessing\nexternal resources\n\n\n**Data security**\n\nDatabricks provides encryption, isolation and auditing.\n\n**Databricks encryption capabilities are**\n**in place both at rest and in motion**\n\n\n\n|For data-at-rest encryption: • Control plane is encrypted • Data plane supports local encryption • Customers can use encrypted storage buckets • Customers at some tiers can confgi ure customer-managed keys for managed services|For data-in-motion encryption: • Control plane <-> data plane is encrypted • Offers optional intra-cluster encryption • Customer code can be written to avoid unencrypted services (e.g., FTP)|\n|---|---|\n\n\n**Customers can isolate users at multiple levels:**\n\n**•** **Workspace level:** Each team or department can use a separate workspace\n\n**•** **Cluster level:** Cluster ACLs can restrict the users who can attach notebooks\n\nto a given cluster\n\n**•** **High concurrency clusters:** Process isolation, JVM whitelisting and limited\nlanguages (SQL, Python) allow for the safe coexistence of users of different\nprivilege levels, and is used with Table ACLs\n\n**•** **Single-user cluster:** Users can create a private dedicated cluster\n\nActivities of Databricks users are logged and can be delivered automatically to\na cloud storage bucket. Customers can also monitor provisioning activities by\nmonitoring cloud audit logs.\n\n\n-----\n\n**Compliance**\n\n**Databricks supports the following compliance standards on**\n\n**our multi-tenant platform:**\n\n**•** **SOC 2 Type II**\n\n**•** **ISO 27001**\n\n**•** **ISO 27017**\n\n**•** **ISO 27018**\n\nCertain clouds support Databricks deployment options for FedRAMP\nHigh, HITRUST, HIPAA and PCI. Databricks Inc. and the Databricks platform\nare also GDPR and CCPA ready.\n\n**Learn more**\n\nTo learn more about Databricks security,\nvisit the [Security and Trust Center](https://databricks.com/trust)\n\n\n-----\n\n**CHAPTER**\n\n# 06\n\n\n### Instant compute and serverless\n\n\n-----\n\n#### Benefits of Databricks Serverless SQL\n\nServerless SQL is much easier to administer with Databricks taking on the\nresponsibility of deploying, configuring and managing your cluster VMs. Databricks\ncan transfer compute capacity to user queries typically in about 15 seconds — so\nyou no longer need to wait for clusters to start up or scale out to run your queries.\n\nServerless SQL also has built-in connectors to your favorite tools such as Tableau,\nPower BI, Qlik, etc. These connectors use optimized JDBC/ODBC drivers for easy\nauthentication support and high performance. 
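As a small illustration of two of the data access patterns listed in the security section above (the Secrets API and SQL-based Table ACLs), here is a minimal sketch intended for a Databricks notebook, where `spark` and `dbutils` are predefined. The secret scope, key, storage account, table and group names are all hypothetical.

```python
# Minimal sketch; all names below are placeholders.

# Read a storage credential from a secret scope instead of hard-coding it.
storage_key = dbutils.secrets.get(scope="prod-credentials", key="adls-access-key")

# Use the credential to access external storage (hypothetical ADLS account).
spark.conf.set(
    "fs.azure.account.key.examplestorage.dfs.core.windows.net",
    storage_key,
)

# Grant fine-grained, SQL-based access on a table to an analyst group.
spark.sql("GRANT SELECT ON TABLE sales.orders TO `analysts`")
```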
And finally, you save on cost\nbecause you do not need to overprovision or pay for the idle capacity.\n\n\n#### What is serverless compute?\n\nServerless compute is a fully managed service where Databricks provisions\nand manages the compute layer on behalf of the customer in the Databricks\ncloud account instead of the customer account. As of the current release,\nserverless compute is supported for use with Databricks SQL. This new\ncapability for Databricks SQL provides instant compute to users for their\nBI and SQL workloads, with minimal management required and capacity\noptimizations that can lower overall cost by 20%-40% on average. This\nmakes it even easier for organizations to expand adoption of the lakehouse\nfor business analysts who are looking to access the rich, real-time data sets\nof the lakehouse with a simple and performant solution.\n\n\n-----\n\n**Inside Serverless SQL**\n\n\n**Databricks Serverless SQL**\n\n**Managed servers**\n\n**Serverless SQL**\n**compute**\n\n**Secure**\n**Instant compute**\n\n\nAt the core of Serverless SQL is a compute\nplatform that operates a pool of servers located\nin a Databricks’ account, running Kubernetes\ncontainers that can be assigned to a user\nwithin seconds.\n\nWhen many users are running reports or queries\nat the same time, the compute platform adds more\nservers to the cluster (again, within seconds) to\nhandle the concurrent load. Databricks manages\nthe entire configuration of the server and\nautomatically performs the patching and upgrades\nas needed.\n\nEach server is running a secure configuration and\nall processing is secured by three layers of isolation:\nThe Kubernetes container hosting the runtime; the\nvirtual machine (VM) hosting the container; and\nthe virtual network for the workspace. Each layer\nis isolated to one workspace with no sharing or\ncross-network traffic allowed. The containers use\nhardened configurations, VMs are shut down and\nnot reused, and network traffic is restricted\nto nodes in the same cluster.\n\n\n-----\n\n#### Performance of Serverless SQL\n\nWe ran a set of internal tests to compare\nDatabricks Serverless SQL to the current\nDatabricks SQL and several traditional cloud\ndata warehouses. We found Serverless SQL\nto be the most cost-efficient and performant\nenvironment to run SQL workloads when\nconsidering cluster startup time, query\nexecution time and overall cost.\n\n\n**Databricks Serverless SQL is the highest**\n**performing and most cost-effective solution**\n\n**Cloud SQL solutions compared**\n\n\n**Faster**\n\n**Query**\n**execution**\n**time**\n\n**Slower**\n\n\n**Serverless**\n**SQL**\n\n**CDW1**\n\n**CDW3**\n\n\n**Cost Estimate**\n\n**High**\n\n**Medium**\n\n**Low**\n\n\n**CDW2**\n\n\n**CDW4**\n\n\n**Slower** **Faster**\n**(~5min)** **Startup time** **(~2-3sec)**\n\n**Learn more**\n\nThe feature is currently in Public Preview. Sign up to\n[request access to Serverless SQL](https://databricks.com/p/ebook/serverless-sql-preview-sign-up) . To learn more about\nServerless SQL, visit our [documentation page.](https://docs.databricks.com/serverless-compute/index.html)\n\n\n-----\n\n**CHAPTER**\n\n# 07\n\n\n### Data warehousing\n\nData warehouses are not keeping up with today’s world. The explosion of\nlanguages other than SQL and unstructured data, machine learning, IoT and\nstreaming analytics are forcing organizations to adopt a bifurcated architecture\nof disjointed systems: Data warehouses for BI and data lakes for ML. 
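The optimized JDBC/ODBC connectivity mentioned in the Serverless SQL discussion above is also available directly from Python through the open source `databricks-sql-connector` package. A minimal sketch follows; the hostname, HTTP path and access token are placeholders you would copy from your own SQL warehouse.

```python
# Minimal sketch of querying a Databricks SQL warehouse from Python
# (pip install databricks-sql-connector). All credentials are placeholders.
from databricks import sql

connection = sql.connect(
    server_hostname="<workspace-host>.cloud.databricks.com",  # placeholder
    http_path="/sql/1.0/warehouses/<warehouse-id>",            # placeholder
    access_token="<personal-access-token>",                    # placeholder
)

cursor = connection.cursor()
cursor.execute("SELECT current_date() AS today, count(*) AS orders FROM sales.orders")
for row in cursor.fetchall():
    print(row)

cursor.close()
connection.close()
```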
While SQL\nis ubiquitous and known by millions of professionals, it has never been treated\nas a first-class citizen on data lakes, until the lakehouse.\n\n\n-----\n\n#### What is data warehousing\n\nThe Databricks Lakehouse Platform provides a simplified multicloud and\nserverless architecture for your data warehousing workloads. Data warehousing on\nthe lakehouse allows SQL analytics and BI at scale with a common governance\nmodel. Now you can ingest, transform and query all your data in-place — using\nyour SQL and BI tools of choice — to deliver real-time business insights at the\nbest price/performance. Built on open standards and APIs, the lakehouse\nprovides the reliability, quality and performance that data lakes natively lack,\nand integrations with the ecosystem for maximum flexibility — no lock-in.\n\nWith data warehousing on the lakehouse, organizations can unify all analytics\nand simplify their architecture to enable their business with real-time business\ninsights at the best price/performance.\n\n\n#### Key benefits\n\n**Best price/performance**\n\nLower costs, get the best price/performance and eliminate\nresource management overhead\n\nOn-premises data warehouses have reached their limits — they physically\ncannot scale to handle the growing volumes of data, and don’t provide the\nelasticity customers need to respond to ever-changing business needs.\nCloud data warehouses are a great alternative to on-premises data\nwarehouses, providing greater scale and elasticity, but cloud costs for\nproprietary cloud data warehouses typically yield to an exponential cost\nincrease following the growth of data volume.\n\nThe Databricks Lakehouse Platform provides instant, elastic SQL serverless\ncompute — decoupled from storage on cheap cloud object stores — and\nthousands of performance optimizations that can lower overall infrastructure\ncosts by [an average of 40%](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) . Databricks automatically determines instance\ntypes and configuration for the best price/performance — [up to 12x better](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[than traditional cloud data warehouses](https://databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) — and scale for high concurrency\nuse cases.\n\n\n-----\n\n**Built-in governance**\n\nOne source of truth and one unified\ngovernance layer across all data teams\n\nUnderpinned by Delta Lake, the Databricks\nLakehouse Platform simplifies your architecture by\nallowing you to establish one single copy of all your\ndata for in-place analytics and ETL/ELT on your\nexisting data lakes — no more data movements\nand copies in disjointed systems. Then, seamless\nintegration with Databricks Unity Catalog lets you\neasily discover, secure and manage all your data\nwith fine-grained governance, data lineage, and\nstandard SQL.\n\n**Rich ecosystem**\n\nIngest, transform and query all your\ndata in-place with your favorite tools\n\nVery few tools exist to conduct BI on data lakes.\nGenerally, doing so has required data analysts to\n\nsubmit Spark jobs or use a developer interface.\nWhile these tools are common for data scientists,\nthey require knowledge of languages and\ninterfaces that are not traditionally part of a data\nanalyst’s tool set. 
As a result, the learning curve for\nan analyst to make use of a data lake is too high\nwhen well-established tools and methods already\nexist for data warehouses.\n\n\nThe Databricks Lakehouse Platform works with\nyour preferred tools like dbt, Fivetran, Power BI or\nTableau, allowing analysts and analytical engineers\nto easily ingest, transform and query the most\nrecent and complete data, without having to move\nit into a separate data warehouse. Additionally, it\nempowers every analyst across your organization\nto quickly and collaboratively find and share new\ninsights with a built-in SQL editor, visualizations\nand dashboards.\n\n**Break down silos**\n\nAccelerate time from raw to actionable\ndata and go effortlessly from BI to ML\n\n\napplications, organizations will need to manage\nan entirely different system than their SQL-only\ndata warehouse, slowing down collaboration and\ninnovation.\n\nThe Databricks Lakehouse Platform provides the\nmost complete end-to-end data warehousing\nsolution for all your modern analytics needs,\nand more. Now you can empower data teams\nand business users to access the latest data\nfaster for downstream real-time analytics and go\neffortlessly from BI to ML. Speed up the time from\nraw to actionable data at any scale — in batch and\nstreaming. And go from descriptive to advanced\nanalytics effortlessly to uncover new insights.\n\n\nIt is challenging for data engineering teams to\nenable analysts at the speed that the business\nrequires. Data warehouses need data to be\ningested and processed ahead of time before\nanalysts can access and query it using BI tools.\nBecause traditional data warehouses lack\nreal-time processing and do not scale well for\nlarge ETL jobs, they create new data movements\nand bottlenecks for the data engineering team,\nand make it slow for analysts to access the\nlatest data. And for advanced analytics (ML)\n\n\n-----\n\n**Data warehousing on Databricks**\n\n**Truly decoupled, serverless, compute layer**\n\n\n**Data consumers**\n\n\n**Data processing**\n\n**Unity Catalog**\n\n\n**ETL** **ETL**\n\n**Bronze raw** **Silver staging** **Gold DW/marts**\n\n\n**Open storage layer**\n\n**Data ingest**\n\n**Data sources**\n\n\n**Databricks**\n**Partner Connect**\n\n\n**Continuous**\n**ingest**\n\n\n**Batch**\n**ingest**\n\n\n**On-premises**\n\n**OLTP** **OLAP** **Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n\n**DWH**\n\n\n**On-premises**\n\n**Hadoop** **Third-party data** **loT devices** **SaaS applications** **Social**\n\n**DWH**\n\n\n**Learn more**\n\n\n[Try Databricks SQL for free](https://dbricks.co/dbsql)\n\n[Databricks SQL Demo](https://databricks.com/discover/demos/databricks-sql)\n\n[Databricks SQL Data](https://youtu.be/jlEdoVpWwNc)\n[Warehousing Admin Demo](https://youtu.be/jlEdoVpWwNc)\n\n\n[On-demand Webinar: Learn](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n[Databricks SQL From the Experts](https://databricks.com/p/webinar/learn-databricks-sql-from-the-experts)\n\n[eBook: Inner Workings of the](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n[Lakehouse for Analytics and BI](https://databricks.com/p/ebook/data-lakehouse-is-your-next-data-warehouse)\n\n\n-----\n\n**CHAPTER**\n\n# 08\n\n\n### Data engineering\n\nOrganizations realize the value data plays as a strategic asset for growing\nrevenues, improving the customer experience, operating efficiently or improving\na product or service. 
Data is really the driver of all these initiatives. Nowadays,\ndata is often streamed and ingested from hundreds of different data sources,\nsometimes acquired from a data exchange, cleaned in various ways with\ndifferent orchestrated steps, versioned and shared for analytics and AI.\nAnd increasingly, data is being monetized.\n\nData teams rely on getting the right data at the right time for analytics, data\nscience and machine learning, but often are faced with challenges meeting\nthe needs of their initiatives for data engineering.\n\n\n-----\n\n#### Why data engineering is hard\n\nOne of the biggest challenges is accessing and managing the increasingly\ncomplex data that lives across the organization. Most of the complexity\narises with the explosion of data volumes and data types, with organizations\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n\nWith this volume, managing data pipelines to transform and process data\nis slow and difficult, and increasingly expensive. And to top off the complexity,\nmost businesses are putting an increased emphasis on multicloud\nenvironments which can be even more difficult to maintain.\n\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\nthat data itself has become a product, and the challenging goal of the data\nengineer is to build and run the machinery that creates this high-fidelity\ndata product all the way from ingestion to monetization.\n\n\nDespite current technological advances data engineering remains\ndifficult for several reasons:\n\n**Complex data ingestion methods**\n\nData ingestion means retrieving batch and streaming data from various\nsources and in various formats. Ingesting data is hard and complex since you\neither need to use an always-running streaming platform like Apache Kafka\nor you need to be able to keep track of which files haven’t been ingested yet.\nData engineers are required to spend a lot of time hand-coding repetitive\nand error-prone data ingestion tasks.\n\n**Data engineering principles**\n\nThese days, large operations teams are often just a memory of the past.\nModern data engineering principles are based on agile software development\nmethodologies. They apply the well-known “you build it, you run it” paradigm,\nuse isolated development and production environments, CI/CD, and version\ncontrol transformations that are pushed to production after validation. Tooling\nneeds to support these principles.\n\n\n-----\n\n**Third-party tools**\n\nData engineers are often required to run additional third-party tools for\norchestration to automate tasks such as ELT/ETL or customer code in\nnotebooks. Running third-party tools increases the operational overhead\nand decreases the reliability of the system.\n\n**Performance tuning**\n\nFinally, with all pipelines and workflows written, data engineers need to\nconstantly focus on performance, tuning pipelines and architectures to meet\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\narchitecture and constantly observing throughput parameters.\n\nMost organizations are dealing with a complex landscape of data warehouses\nand data lakes these days. 
Each of those platforms has its own limitations,\nworkloads, development languages and governance model.\n\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The lakehouse platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability\nto drive valuable insights.\n\nData engineering in the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows\non any cloud platform, and meet regulatory requirements to maintain\nworld-class governance.\n\nThe lakehouse provides an end-to-end data engineering and ETL platform\nthat automates the complexity of building and maintaining pipelines and\nrunning ETL workloads so data engineers and analysts can focus on quality\nand reliability to drive valuable insights.\n\n\n#### Databricks makes modern data engineering simple\n\nThere is no industry-wide definition of modern data engineering.\nThis should come close:\n\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\n_kinds of workflows._\n\n\n-----\n\n-----\n\n#### Benefits of data engineering on the lakehouse\n\nBy simplifying and modernizing with the lakehouse architecture, data engineers\ngain an enterprise-grade and enterprise-ready approach to building data\npipelines. 
The following are eight key differentiating capabilities that a data\nengineering solution team can enable with the Databricks Lakehouse Platform:\n\n**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\nengineers can enable fast, reliable, scalable and automatic data ingestion\nfor analytics, data science or machine learning.\n\n**•** **Data pipeline observability:** Monitor overall data pipeline estate status\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\nhealth for performance, quality, status and latency.\n\n**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\nanalytics and machine learning use cases by enabling easy and automatic\ndata pipeline deployments into production or roll back pipelines and\nminimize downtime.\n\n**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\nof data processing tasks for data and machine learning pipelines with the\nability to run multiple non-interactive tasks as a directed acyclic graph\n(DAG) on a Databricks compute cluster.\n\n**•** **Automated ETL pipelines:** Data engineers can reduce development\ntime and effort and focus on implementing business logic and data\nquality checks within the data pipeline using SQL or Python.\n\n**•** **Data quality checks:** Improve data reliability throughout the data\nlakehouse so data teams can confidently trust the information for\ndownstream initiatives with the ability to define data quality and\nautomatically address errors.\n\n**•** **Batch and streaming:** Allow data engineers to set tunable data latency\nwith cost controls without having to know complex stream processing\nand implement recovery logic.\n\n**•** **Automatic recovery:** Handle transient errors and use automatic recovery\nfor most common error conditions that can occur during the operation of\na pipeline with fast, scalable fault-tolerance.\n\n\n-----\n\n**Data engineering is all about data quality**\n\nThe goal of modern data engineering is to distill data with a quality that is fit for\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\nthree different levels.\n\n1. On a **technical level**, data quality is\nguaranteed by enforcing and evolving\nschemas for data storage and ingestion.\n\n2. On an **architectural level**, data quality is\noften achieved by implementing the medallion\narchitecture. A medallion architecture is a data\ndesign pattern used to logically organize data in\na [lakehouse](https://databricks.com/glossary/data-lakehouse) with the goal of incrementally and\nprogressively improving the structure and quality\nof data as it flows through each layer of the\narchitecture, e.g., from Bronze to Silver to Gold\nlayer tables.\n\n3. The **Databricks Unity Catalog** comes\nwith robust data quality management with\nbuilt-in quality controls, testing, monitoring\nand enforcement to ensure accurate and\nuseful data is available for downstream BI,\nanalytics and machine learning workloads.\n\n[Figure: Medallion architecture. Sources such as Kinesis and raw files (CSV, JSON, TXT) land in the data lake, then flow from Bronze (raw ingestion and history) to Silver (filtered, cleaned, augmented) to Gold (business-level aggregates), with quality increasing at each layer and the results feeding streaming analytics, BI and reporting, and data science and ML.]\n\n-----\n\n#### Data ingestion\n\nWith the Databricks Lakehouse Platform, data engineers can build robust\nhyper-scale ingestion pipelines in streaming and batch mode.
They can\nincrementally process new files as they land on cloud storage — with no\nneed to manage state information — in scheduled or continuous jobs.\n\nData engineers can efficiently track new files (with the ability to scale\nto billions of files) without having to list them in a directory. Databricks\nautomatically infers the schema from the source data and evolves it as\nthe data loads into the Delta Lake lakehouse. Efforts continue with\nenhancing and supporting Auto Loader, our powerful data ingestion\ntool for the Lakehouse.\n\n**What is Auto Loader?**\n\nHave you ever imagined that ingesting data could become as easy\nas dropping a file into a folder? Welcome to Databricks Auto Loader.\n\n[Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) is an optimized data ingestion tool that incrementally and\nefficiently processes new data files as they arrive in the cloud storage built\ninto the Databricks Lakehouse. Auto Loader can detect and enforce the\nschema of your data and, therefore, guarantee data quality. New files or\nfiles that have been changed since the last time new data was processed\nare identified automatically and ingested. Noncompliant data sets are\nquarantined into rescue data columns. You can use the [trigger once]\noption with Auto Loader to turn it into a job that turns itself off.\n\n\n**Ingestion for data analysts: COPY INTO**\n\nIngestion also got much easier for data analysts and analytics engineers working\nwith Databricks SQL. [COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) is a simple SQL command that follows the\nlake-first approach and loads data from a folder location into a Delta Lake table.\nCOPY INTO can be scheduled and called by a job repeatedly. When run, only new\nfiles from the source location will be processed.\n\n#### Data transformation\n\nTurning SQL queries into production ETL pipelines typically involves a lot\nof tedious, complicated operational work. Even at a small scale, the majority\nof a data practitioner’s time is spent on tooling and managing infrastructure.\n\nAlthough the medallion architecture is an established and reliable pattern\nfor improving data quality, the implementation of this pattern is challenging\nfor many data engineering teams.\n\nWhile hand-coding the medallion architecture was hard for data engineers,\ncreating data pipelines was outright impossible for data analysts not being\nable to code with Spark Structured Streaming in Scala or Python.\n\nEven at a small scale, most data engineering time is spent on tooling and\nmanaging infrastructure rather than transformation. Auto-scaling, observability\nand governance are difficult to implement and, as a result, often left out of the\nsolution entirely.\n\n\n-----\n\n#### What is Delta Live Tables?\n\nDelta Live Tables (DLT) is the first ETL framework that uses a simple **declarative approach** to building reliable data pipelines. DLT automatically auto-scales your\ninfrastructure so data analysts and engineers can spend less time on tooling and focus on getting value from data. Engineers are able to **treat their data as code**\nand apply modern software engineering best practices like testing, error-handling, monitoring and documentation to deploy reliable pipelines at scale. 
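Before going deeper into DLT, here is a minimal sketch of the two ingestion paths just described: Auto Loader for incremental pipelines and COPY INTO for SQL-first loading. The paths, table names, schema and checkpoint locations are placeholders, and the COPY INTO target table is assumed to already exist.

```python
# Minimal sketch of the two ingestion options described above; assumes it runs
# in a Databricks notebook where `spark` is predefined.

# 1) Auto Loader: incrementally pick up new files as they land in cloud storage,
#    with schema inference/evolution and exactly-once processing.
(spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/tmp/schemas/raw_events")
    .load("s3://example-bucket/raw/events/")
    .writeStream
    .option("checkpointLocation", "/tmp/checkpoints/raw_events")
    .trigger(once=True)                 # the "trigger once" style: run, then stop
    .toTable("bronze.raw_events"))

# 2) COPY INTO: the SQL command for analysts; re-running it loads only new files.
spark.sql("""
    COPY INTO bronze.raw_events_sql
    FROM 's3://example-bucket/raw/events/'
    FILEFORMAT = JSON
    COPY_OPTIONS ('mergeSchema' = 'true')
""")
```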
DLT fully supports\nboth Python and SQL and is tailored to work with both streaming and batch workloads.\n\nWith DLT you write a Delta Live Table in a SQL notebook, create a pipeline under Workflows and simply click [Start].\n\n\n**Write** **create live table**\n\n\n**Create** **a pipeline** **Click** **Start**\n\nStart\n\n\n-----\n\nDLT reduces the implementation time by accelerating development and\nautomating complex operational tasks. Since DLT can use plain SQL, it also\nenables data analysts to create production pipelines and turns them into\nthe often discussed “analytics engineer.” At runtime, DLT speeds up pipeline\nexecution applied with Photon.\n\nSoftware engineering principles are applied for data engineering to foster the\nidea of treating your data as code. Your data is the sole source of truth for what\nis going on inside your business.\n\nBeyond just the transformations, there are many things that should be included\n\nDependency\nFull refresh\nmanagement\n\n*Coming soon\n\n\nin the code that define your data. Declaratively express entire data flows in SQL\nor Python. Natively enable modern software engineering best practices like\nseparate development and production environments, the ability to easily test\nbefore deploying, deploy and manage environments using parameterization, unit\ntesting and documentation.\n\nDLT also automatically scales compute, providing the option to set the minimum\nand maximum number of instances and let DLT size up the cluster according\nto cluster utilization. In addition, tasks like orchestration, error handling and\nrecovery, and performance optimization are all handled automatically.\n\n\nIncremental\ncomputation*\n\n\nCheckpointing\nand retries\n\n\n-----\n\nExpectations in the code help prevent bad data from flowing into tables, track\ndata quality over time, and provide tools to troubleshoot bad data with granular\npipeline observability. This enables a high-fidelity lineage diagram of your\npipeline to track dependencies and aggregate data quality metrics across all\nyour pipelines.\n\nUnlike other products that force you to deal with streaming and batch workloads\nseparately, DLT supports any type of data workload with a single API so data\nengineers and analysts alike can build cloud-scale data pipelines faster without\nthe need for advanced data engineering skills.\n\n#### Data orchestration\n\nThe lakehouse makes it much easier for businesses to undertake ambitious data\nand machine learning (ML) initiatives. However, orchestrating and managing\nend-to-end production workflows remains a bottleneck for most organizations,\nrelying on external tools or cloud-specific solutions that are not part of their\nlakehouse platform. Tools that decouple task orchestration from the underlying\ndata processing platform reduce the overall reliability of their production\nworkloads, limit observability, and increase complexity for end users.\n\n#### What is Databricks Workflows?\n\n[Databricks Workflows](https://databricks.com/product/workflows) is the first fully managed and integrated lakehouse\n[orchestration](https://databricks.com/glossary/orchestration) service that allows data teams to build reliable workflows on\nany cloud.\n\n\nWorkflows lets you orchestrate data flow pipelines (written in DLT or dbt),\nas well as machine learning pipelines, or any other tasks such as notebooks\nor Python wheels. 
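Before turning to Workflows in more detail, here is a minimal sketch of the declarative DLT pattern described above, written with the Python API (DLT supports both SQL and Python). The table names, source path and expectation are illustrative, and the file is meant to be attached to a DLT pipeline rather than run directly.

```python
# Minimal sketch of a Delta Live Tables pipeline in Python. The `dlt` module
# and `spark` are provided by the DLT runtime; names below are placeholders.
import dlt
from pyspark.sql import functions as F


@dlt.table(comment="Raw events ingested incrementally with Auto Loader")
def bronze_events():
    return (
        spark.readStream.format("cloudFiles")
        .option("cloudFiles.format", "json")
        .load("s3://example-bucket/raw/events/")
    )


@dlt.table(comment="Cleaned events ready for analytics")
@dlt.expect_or_drop("valid_user", "user_id IS NOT NULL")  # data quality expectation
def silver_events():
    return (
        dlt.read_stream("bronze_events")
        .withColumn("ingested_at", F.current_timestamp())
    )
```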
Since Databricks Workflows is fully managed, it eliminates\noperational overhead for data engineers, enabling them to focus on your\nworkflows not on managing your infrastructure. It provides an easy point-and-click\nauthoring experience for all your data teams, not just those with specialized skills.\nDeep integration with the underlying lakehouse platform ensures you will create\nand run reliable production workloads on any cloud while providing deep and\ncentralized monitoring with simplicity for end users.\n\nSharing job clusters over multiple tasks reduces the time a job takes, reduces\ncosts by eliminating overhead and increases cluster utilization with parallel tasks.\n\n\n-----\n\nDatabricks Workflows’ deep integration with the lakehouse can best be seen with its monitoring and observability features. The matrix view in the following graphic\nshows a history of runs for a job. Failed tasks are marked in red. A failed job can be repaired and rerun with the click of a button. Rerunning a failed task detects and\ntriggers the execution of all dependent tasks.\n\nYou can create workflows with the UI, but also through the Databricks Workflows API, or with external orchestrators such as Apache Airflow. Even if you are using an\n\nexternal orchestrator, Databricks Workflows’ monitoring acts as a single pane of glass that includes externally triggered workflows.\n\n\n-----\n\n#### Orchestrate anything\n\nRemember that DLT is one of many task types for Databricks Workflows.\nThis is where the managed data flow pipelines with DLT tie together with\nthe easy point-and-click authoring experience of Databricks Workflows.\n\nIn the following example, you can see an end-to-end workflow built with\ncustomers in a workshop: Data is streamed from Twitter according to search\nterms, then ingested with Auto Loader using automatic schema detection and\nenforcement. In the next step, the data is cleaned and transformed with Delta\nLive table pipelines written in SQL, and finally run through a pre-trained BERT\nlanguage model from Hugging Face for sentiment analysis of the tweets.\nDifferent task types for ingest, cleanse/transform and ML are combined\nin a single workflow.\n\nUsing Workflows, these tasks can be scheduled to provide a daily overview of\nsocial media coverage and customer sentiment for a business. After streaming\ntweets with filtering for keywords such as “data engineering,” “lakehouse” and\n“Delta Lake,” we curated a list of those tweets that were classified as positive\nwith the highest probability score.\n\n**Learn more**\n\n\n[Data Engineering on the](https://databricks.com/solutions/data-pipelines)\n[Lakehouse](https://databricks.com/solutions/data-pipelines)\n\n\n[Delta Live Tables](https://databricks.com/product/delta-live-tables)\n\n[Databricks Workflows](https://www.databricks.com/product/workflows)\n\n\n[Big Book of Data Engineering](https://databricks.com/p/ebook/the-big-book-of-data-engineering?itm_data=datapipelines-promo-bigbookofde)\n\n\n-----\n\n**CHAPTER**\n\n### Data streaming\n# 09\n\n\n**CHAPTER**\n\n\nThere are two types of data processing: batch processing\nand streaming processing.\n\n\nBatch processing refers to the discontinuous, periodic processing\nof data that has been stored for a period of time. For example,\nan organization may need to run weekly reports on a set of\npredictable transaction data. 
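To make those authoring options concrete, the sketch below defines a two-task job with a shared job cluster through the Jobs 2.1 REST API, which is one way to create the workflows described above. The workspace URL, token, notebook paths and cluster settings are placeholders.

```python
# Minimal sketch of creating a Workflows job via the Jobs 2.1 API using the
# `requests` library. All identifiers below are placeholders.
import requests

HOST = "https://<your-workspace>.cloud.databricks.com"
TOKEN = "<personal-access-token>"

job_spec = {
    "name": "daily_social_sentiment",
    "job_clusters": [
        {
            "job_cluster_key": "shared_cluster",  # shared across tasks to cut startup overhead
            "new_cluster": {
                "spark_version": "13.3.x-scala2.12",
                "node_type_id": "i3.xlarge",
                "num_workers": 2,
            },
        }
    ],
    "tasks": [
        {
            "task_key": "ingest_tweets",
            "job_cluster_key": "shared_cluster",
            "notebook_task": {"notebook_path": "/Repos/demo/ingest_tweets"},
        },
        {
            "task_key": "score_sentiment",
            "depends_on": [{"task_key": "ingest_tweets"}],
            "job_cluster_key": "shared_cluster",
            "notebook_task": {"notebook_path": "/Repos/demo/score_sentiment"},
        },
    ],
}

resp = requests.post(
    f"{HOST}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {TOKEN}"},
    json=job_spec,
)
resp.raise_for_status()
print("Created job:", resp.json()["job_id"])
```

Defining the job cluster once and referencing it from both tasks mirrors the cluster-sharing benefit mentioned above.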
There is no need for this data\nto be streaming — it can be processed on a weekly basis.\n\nStreaming processing, on the other hand, refers to unbounded\nprocessing of data as it arrives.\n\n\n-----\n\n**Data Streaming Challenges**\n\nHowever, getting value from streaming data can be a tricky practice. While most\ndata today can be considered streaming data, organizations are overwhelmed by\nthe need to access, process and analyze the volume, speed and variety of this\ndata moving through their platforms. To keep pace with innovation, they must\nquickly make sense of data streams decisively, consistently and in real time.\n\nThree common technical challenges organizations experience\nwith implementing real-time data streaming include:\n\n**•** **Specialized APIs and language skills:** Data practitioners encounter\nbarriers to adopting streaming skillsets because there are new languages,\nAPIs and tools to learn.\n\n**•** **Operational complexity:** To implement data streaming at scale, data\nteams need to integrate and manage streaming-specific tools with\ntheir other cloud services. They also have to manually build complex\noperational tooling to help these systems recover from failure, restart\nworkloads without reprocessing data, optimize performance, scale the\nunderlying infrastructure, and so on.\n\n**•** **Incompatible governance models:** Different governance and security\nmodels across real-time and historical data platforms makes it difficult\nto provide the right access to the right users, see the end-to-end data\nlineage, and/or meet compliance requirements.\n\n\nIn a wide variety of cases, an organization might find it useful to\nleverage streaming data. Here are some common examples:\n\n**•** **Retail:** Real-time inventory updates help support business activities, such\nas inventory and pricing optimization and optimization of the supply chain,\nlogistics and just-in-time delivery.\n\n**•** **Smart energy:** Smart meter monitoring in real time allows for smart\nelectricity pricing models and connection with renewable energy sources\nto optimize power generation and distribution.\n\n**•** **Preventative maintenance:** By reducing unplanned outages and\nunnecessary site and maintenance visits, real-time streaming analytics can\nlower operational and equipment costs.\n\n**•** **Industrial automation:** Manufacturers can use streaming and predictive\nanalytics to improve production processes and product quality, including\nsetting up automated alerts.\n\n**•** **Healthcare:** To optimize care recommendations, real-time data allows\nfor the integration of various smart sensors to monitor patient condition,\nmedication levels and even recovery speed.\n\n**•** **Financial institutions:** Firms can conduct real-time analysis of\n\ntransactions to detect fraudulent transactions and send alerts. They\ncan use fraud analytics to identify patterns and feed data into machine\nlearning algorithms.\n\n\nRegardless of specific use cases, the central tenet of streaming data is that it\ngives organizations the opportunity to leverage the freshest possible insights for\nbetter decision-making and more optimized customer experiences.\n\n\n-----\n\n**Data streaming architecture**\n\nBefore addressing these challenges head-on, it may help to take a step back and\ndiscuss the ingredients of a streaming data pipeline. 
Then, we will explain how\nthe Databricks Lakehouse Platform operates within this context to address the\naforementioned challenges.\n\nEvery application of streaming data requires a pipeline that brings the data from\nits origin point — whether sensors, IoT devices or database transactions — to its\nfinal destination.\n\nIn building this pipeline, streaming architectures typically employ two layers.\nFirst, streaming capture systems **capture** and temporarily store streaming data\nfor processing. Sometimes these systems are also called messaging systems\nor messaging buses. These systems are optimized for small payloads and high\nfrequency inputs/outputs. Second, streaming **processing** systems continuously\nprocess data from streaming capture systems and other storage systems.\n\n**Capturing** **Processing**\n\n\nIt may help to think of a simplified streaming pipeline\naccording to the following seven phases:\n\n1. Data is continuously generated at origin points\n\n2. The generated data is captured from those origin points by\na capture system like Apache Kafka (with limited retention)\n\n**3. The captured data is extracted and incrementally ingested to**\n**a processing platform like Databricks; data is ingested exactly**\n**once and stored permanently, even if this step is rerun**\n\n**4. The ingested data is converted into a workable format**\n\n**5. The formatted data is cleansed, transformed and joined in**\n**a number of pipeline steps**\n\n**6. The transformed data is processed downstream through**\n**analysis or ML modeling**\n\n7. The resulting analysis or model is used for some sort of practical\napplication, which may be anything from basic reporting to an\nevent-driven software application\n\nYou will notice four of the steps in this list are in boldface. This is because the\nlakehouse architecture is specifically designed to optimize this part of the\npipeline. Uniquely, the Databricks Lakehouse Platform can ingest, transform,\nanalyze and model on streaming data _alongside_ batch-processed data. It can\naccommodate both structured _and_ unstructured data. It is here that the value\nof unifying the best pieces of data lakes and data warehouses really shines for\ncomplex enterprise use cases.\n\n\n-----\n\n**Data Streaming on the Lakehouse**\n\nNow let’s zoom in a bit and see how the Databricks Lakehouse\nPlatform addresses each part of the pipeline mentioned above.\n\n**Streaming data ingestion and transformation** begins with continuously\nand incrementally collecting raw data from streaming sources through a\nfeature called Auto Loader. Once the data is ingested, it can be transformed\nfrom raw, messy data into clean, fresh, reliable data appropriate for downstream\nanalytics, ML or applications. [Delta Live Tables (DLT)](https://www.databricks.com/product/delta-live-tables) makes it easy to build and\nmanage these data pipelines while automatically taking care of infrastructure\nmanagement and scaling, data quality, error testing and other administrative\ntasks. DLT is a high-level abstraction built on Spark Structured Streaming,\na scalable and fault-tolerant stream processing engine.\n\n**[Real-time analytics](https://www.databricks.com/product/databricks-sql)** refers to the downstream analytical application\nof streaming data. 
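The capture and processing layers described above map naturally onto Spark Structured Streaming, the engine that DLT builds on. Below is a minimal sketch, assuming a Kafka capture system; the broker, topic, schema and table names are placeholders.

```python
# Minimal sketch: read from a streaming capture system (Kafka), parse the
# payload, and continuously write to a Delta table. Assumes a Databricks
# notebook where `spark` is predefined; all names are placeholders.
from pyspark.sql import functions as F

raw = (
    spark.readStream
    .format("kafka")
    .option("kafka.bootstrap.servers", "broker-1:9092")
    .option("subscribe", "transactions")
    .load()
)

parsed = (
    raw.select(F.from_json(F.col("value").cast("string"),
                           "id STRING, amount DOUBLE, ts TIMESTAMP").alias("event"))
       .select("event.*")
)

(parsed.writeStream
    .option("checkpointLocation", "/tmp/checkpoints/transactions")  # enables exactly-once delivery
    .toTable("silver.transactions"))
```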
With fresher data streaming into SQL analytics or BI\nreporting, more actionable insights can be achieved, resulting in better\nbusiness outcomes.\n\n**[Real-time ML](https://www.databricks.com/product/machine-learning)** involves deploying ML models in a streaming mode. This\ndeployment is supported with structured streaming for continuous inference\nfrom a live data stream. Like real-time analytics, real-time ML is a downstream\nimpact of streaming data, but for different business use cases (i.e., AI instead\nof BI). Real-time modeling has many benefits, including more accurate\npredictions about the future.\n\n\n**Real-time applications** process data directly from streaming pipelines and\ntrigger programmatic actions, such as displaying a relevant ad, updating the\nprice on a pricing page, stopping a fraudulent transaction, etc. There typically\nis no human-in-the-loop for such applications.\n\n\nData in cloud storage and message stores\n\n\n-----\n\n**Databricks Lakehouse Platform differentiators**\n\nUnderstanding what the lakehouse architecture provides is one\n\nthing, but it is useful to understand how Databricks uniquely\n\napproaches the common challenges mentioned earlier around\n\nworking with streaming data.\n\n**Databricks empowers unified data teams.** Data engineers, data scientists\nand analysts can easily build streaming data workloads with the languages\nand tools they already know and the APIs they already use.\n\n**Databricks simplifies development and operations.** Organizations can\nfocus on getting value from data by reducing complexity and automating\nmuch of the production aspects associated with building and maintaining\nreal-time data workloads.\n\n\nSee why customers love streaming on the Databricks\nLakehouse Platform with these resources.\n\n**Learn more**\n\n[Data Streaming Webpage](https://www.databricks.com/product/data-streaming)\n\n[Project Lightspeed: Faster and Simpler Stream Processing](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n[With Apache Spark](https://www.databricks.com/blog/2022/06/28/project-lightspeed-faster-and-simpler-stream-processing-with-apache-spark.html)\n\n[Structured Streaming Documentation](https://docs.databricks.com/spark/latest/structured-streaming/index.html)\n\n[Streaming — Getting Started With Apache Spark on Databricks](https://databricks.com/spark/getting-started-with-apache-spark/streaming)\n\n\n**Databricks is one platform for streaming and batch data.** Organizations\ncan eliminate data silos, centralize security and governance models, and\nprovide complete support for all their real-time use cases under one roof —\nthe roof of the lakehouse.\n\nFinally — and perhaps most important — Delta Lake, the core of the [Databricks](https://www.databricks.com/product/data-lakehouse)\n\n[Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , was built for streaming from the ground up. Delta Lake is\ndeeply integrated with Spark Structured Streaming and overcomes many of\nthe limitations typically associated with streaming systems and files.\n\nIn summary, the Databricks Lakehouse Platform dramatically simplifies data\nstreaming to deliver real-time analytics, machine learning and applications on\none platform. And, that platform is built on a foundation with streaming at its\ncore. 
This means organizations of all sizes can use their data in motion and\nmake more informed decisions faster than ever.\n\n\n-----\n\n**CHAPTER**\n\n### Data science and machine learning\n# 10\n\n\n**CHAPTER**\n\n\nWhile most companies are aware of the potential benefits of applying\nmachine learning and AI, realizing these potentials can often be quite\nchallenging for those brave enough to take the leap. Some of the\nlargest hurdles come from siloed/disparate data systems, complex\nexperimentation environments, and getting models served in a\nproduction setting.\n\n\nFortunately, the Databricks Lakehouse Platform provides a helping\nhand and lets you use data to derive innovative insights, build\npowerful predictive models, and enable data scientists, ML engineers,\nand developers of all kinds to create within the space of machine\nlearning and AI.\n\n\n-----\n\n#### Databricks Machine Learning\n\n\n-----\n\n#### Exploratory data analysis\n\nWith all the data in one place, data is easily\nexplored and visualized from within the\nnotebook-style experience that provides support\nfor various languages (R, SQL, Python and Scala)\nas well as built-in visualizations and dashboards.\nConfidently and securely share code with\nco-authoring, commenting, automatic versioning,\nGit integrations and role-based access controls.\nThe platform provides laptop-like simplicity at\nproduction-ready scale.\n\n\n-----\n\n#### Model creation and management\n\nFrom data ingestion to model training and tuning, all the way through to\nproduction model serving and versioning, the Lakehouse brings the tools\nneeded to simplify those tasks.\n\nGet right into experimenting with the Databricks ML runtimes, optimized and\npreconfigured to include most popular libraries like scikit-learn, XGBoost and\nmore. Massively scale thanks to built-in support for distributed training and\nhardware acceleration with GPUs.\n\nFrom within the runtimes, you can track model training sessions, package and\nreuse models easily with [MLflow](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) , an open source machine learning platform\ncreated by Databricks and included as a managed service within the Lakehouse.\nIt provides a centralized location from which to manage models and package\ncode in an easily reusable way.\n\nTraining these models often involves the use of features housed in a centralized\nfeature store. Fortunately, Databricks has a built-in feature store that allows you\nto create new features, explore and re-use existing features, select features for\ntraining and scoring machine learning models, and publish features to low-latency\nonline stores for real-time inference.\n\nIf you are looking to get a head start, [AutoML](https://databricks.com/blog/2022/04/18/supercharge-your-machine-learning-projects-with-databricks-automl-now-generally-available.html) allows for low to no-code\nexperimentation by pointing to your data set and automatically training models\nand tuning hyperparameters to save both novice and advanced users precious\ntime in the machine learning process.\n\n\nAutoML will also report back metrics related to the model training results as well\nas the code needed to repeat the training already custom-tailored to your data\nset. 
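As a small, self-contained illustration of the experiment tracking described above, the following sketch logs parameters, a metric and the trained model to MLflow. The data set and model choice are purely illustrative.

```python
# Minimal sketch of MLflow tracking with scikit-learn on a toy public data set.
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run(run_name="rf_baseline"):
    model = RandomForestRegressor(n_estimators=200, max_depth=6, random_state=42)
    model.fit(X_train, y_train)

    mlflow.log_param("n_estimators", 200)
    mlflow.log_param("max_depth", 6)
    mlflow.log_metric("r2", r2_score(y_test, model.predict(X_test)))
    mlflow.sklearn.log_model(model, "model")  # packages the model for later registration
```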
This glass box approach ensures that you are never trapped or suffer from\nvendor lock-in.\n\nIn that regard, the Lakehouse supports the industry’s widest range of data tools,\ndevelopment environments, and a thriving ISV ecosystem so you can make your\nworkspace your own and put out your best work.\n\n##### Compute platform\n\n**Any ML workload optimized and accelerated**\n\n**Databricks Machine Learning Runtime**\n\n- Optimized and preconfigured ML frameworks\n\n- Turnkey distribution ML\n\n- Built-in AutoML\n\n- GPU support out of the box\n\n\nBuilt-in **ML frameworks**\nand **model explainability**\n\nBuilt-in support for **AutoML**\nand **hyperparameter tuning**\n\n\nBuilt-in support for\n**distributed training**\n\nBuilt-in support for\n**hardware accelerators**\n\n\n-----\n\n#### Deploy your models to production\n\nExploring and creating your machine learning models\ntypically represents only part of the task. Once the\nmodels exist and perform well, they must become\npart of a pipeline that keeps models updated,\nmonitored and available for use by others.\n\n**Webhooks** allow registering of\n\n\nDatabricks can help here by providing a world-class\nexperience for model versioning, monitoring and\nserving within the same platform that you can use\nto generate the models themselves. This means you\ncan make all your ML pipelines in the same place,\nmonitor them for drift, retrain them with new data,\nand promote and serve them easily and at scale.\n\nThroughout the ML lifecycle, rest assured knowing\nthat lineage and governance are being tracked the\nentire way. This means regulatory compliance and\nsecurity woes are significantly reduced, potentially\nsaving costly issues down the road.\n\n\ncallbacks on events like stage\n\ntransitions to integrate with CI/CD\n\nautomation.\n\n**Tags** allow storing deployment\n\n— specific metadata with model\n\nversions, e.g., whether the\n\ndeployment was successful.\n\n\n**Model lifecycle management**\n\nStaging Production Archived\n\n\nLogged\nmodel\n\n**Comments** allow communication\n\nand collaboration between\n\nteammates when reviewing\n\nmodel versions.\n\n\n-----\n\n**Learn more**\n\n[Databricks Machine Learning](https://databricks.com/product/machine-learning)\n\n[Databricks Data Science](https://databricks.com/product/data-science)\n\n[Databricks ML Runtime Documentation](https://docs.databricks.com/runtime/mlruntime.html)\n\n\n-----\n\n**CHAPTER**\n\n# 11\n\n\n### Databricks Technology Partners and the modern data stack\n\nDatabricks Technology Partners integrate their solutions with Databricks to\nprovide complementary capabilities for ETL, data ingestion, business intelligence,\nmachine learning and governance. These integrations allow customers to leverage\nthe Databricks Lakehouse Platform’s reliability and scalability to innovate faster\nwhile deriving valuable data insights. Use preferred analytical tools with optimized\nconnectors for fast performance, low latency and high user concurrency to your\ndata lake.\n\n\n-----\n\nWith [Partner Connect](https://databricks.com/partnerconnect) , you can bring together all your data, analytics and AI tools on one open platform. 
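Returning to the model lifecycle controls described above (stages, tags and comments), here is a minimal sketch that uses the MLflow Model Registry client. The model name, run URI and reviewer note are placeholders.

```python
# Minimal sketch of registering a logged model and promoting it through
# lifecycle stages with the MLflow client; identifiers are placeholders.
import mlflow
from mlflow.tracking import MlflowClient

client = MlflowClient()

# Register a model that was logged in an earlier training run.
version = mlflow.register_model("runs:/<run-id>/model", "churn_classifier")

# Tags hold deployment-specific metadata; descriptions capture reviewer comments.
client.set_model_version_tag("churn_classifier", version.version, "validation", "passed")
client.update_model_version(
    "churn_classifier", version.version,
    description="Reviewed by the ML team; approved for staging.",
)

# Stage transitions (Staging -> Production -> Archived) can trigger registered
# webhooks to drive CI/CD automation.
client.transition_model_version_stage("churn_classifier", version.version, stage="Staging")
```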
Databricks provides a fast and easy way to connect your existing\ntools to your lakehouse using validated integrations and helps you discover and try new solutions.\n\n**Databricks thrives within your modern data stack**\n\n**BI and dashboards** **Machine learning** **Data science**\n\n\n**Data governance**\n\n**Data pipelines**\n\n**Data ingestion**\n\n\nData Data Data\nwarehousing engineering streaming\n\n**Unity Catalog**\n\n\nData science\nand ML\n\n\n**Consulting**\n**and SI partners**\n\n\n**Delta Lake**\n\n**Cloud Data Lake**\n\n**Learn more**\n\n\n[Become a Partner](https://databricks.com/p/register-your-interest-for-databricks-partner-program)\n\n[Partner Connect demos](https://databricks.com/partnerconnect#partner-demos)\n\n\n[Partner Connect](https://databricks.com/partnerconnect)\n\n[Databricks Partner Connect Guide](https://docs.databricks.com/integrations/partner-connect/index.html)\n\n\n-----\n\n**CHAPTER**\n\n### Get started with the Databricks Lakehouse Platform\n# 12\n\n\n-----\n\n#### Databricks Trial\n\nGet a collaborative environment for data teams to build solutions together with interactive\nnotebooks to use Apache Spark TM , SQL, Python, Scala, Delta Lake, MLflow, TensorFlow, Keras,\nscikit-learn and more.\n\n**•** Available as a 14-day full trial in your own cloud or as a lightweight trial\nhosted by Databricks\n\n**[Try Databricks for free](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n\n\n**[Databricks documentation](https://databricks.com/documentation)**\n\nGet detailed documentation to get started with\nthe Databricks Lakehouse Platform on your cloud\nof choice: Databricks on AWS, Azure Databricks\nand [Databricks on Google Cloud](https://docs.gcp.databricks.com/?_gl=1*16ovt38*_gcl_aw*R0NMLjE2NTI1NDYxNjIuQ2owS0NRandwdjJUQmhEb0FSSXNBTEJuVm5saU9ydGpfX21uT1U5NU5iRThSbmI5a3o2OGdDNUY0UTRzYThtTGhVZHZVb0NhTkRBMmlWc2FBcEN6RUFMd193Y0I.&_ga=2.135042808.863708747.1652113196-1440404449.1635787641&_gac=1.225252968.1652546163.Cj0KCQjwpv2TBhDoARIsALBnVnliOrtj__mnOU95NbE8Rnb9kz68gC5F4Q4sa8mLhUdvUoCaNDA2iVsaApCzEALw_wcB) .\n\n**[Databricks Demo Hub](https://databricks.com/discover/demos)**\n\nGet a firsthand look at Databricks from the\npractitioner’s perspective with these simple\non-demand videos. Each demo is paired with\nrelated materials — including notebooks, videos\nand eBooks — so that you can try it out for\nyourself on Databricks.\n\n\n**[Databricks Academy](https://databricks.com/learn/training/home)**\n\nWhether you are new to the data lake or building on\nan existing skill set, you can find a curriculum tailored\nto your role or interest. With training and certification\nthrough Databricks Academy, you will learn to master\nthe Databricks Lakehouse Platform for all your big\ndata analytics projects.\n\n**[Databricks Community](https://community.databricks.com/)**\n\n\n**[Databricks Labs](https://databricks.com/learn/labs)**\n\nDatabricks Labs are projects created by the\nfield to help customers get their use cases\ninto production faster.\n\n**[Databricks customers](https://databricks.com/customers)**\n\nDiscover how innovative companies across\nevery industry are leveraging the Databricks\nLakehouse Platform.\n\n\nGet answers, network with peers and solve\nthe world’s toughest problems, together.\n\n\n-----\n\n#### About Databricks\n\nDatabricks is the data and AI company. 
More than 7,000\norganizations worldwide — including Comcast, Condé Nast,\nH&M and over 40% of the Fortune 500 — rely on the Databricks\nLakehouse Platform to unify their data, analytics and AI. Databricks\nis headquartered in San Francisco, with offices around the globe.\nFounded by the original creators of Apache Spark™, Delta Lake\nand MLflow, Databricks is on a mission to help data teams solve the\nworld’s toughest problems. To learn more, follow Databricks on\n[Twitter](https://twitter.com/databricks) **,** [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n© Databricks 2022. All rights reserved. Apache, Apache Spark, Spark and the Spark\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "##### Guide\n\n## 6 Strategies for Building Personalized Customer Experiences\n\n\n-----\n\n### Contents\n\n**Introduction** ................................................................................................................................................................................................................. **3**\n\n**1.** **Building a Foundation for Personalization**\nLeveraging ML-Based Customer Entity Resolution ............................................................................................................................... **4**\n\n**2.** **Estimating Customer Lifetime Value**\nBuilding Brand Loyalty With Data ................................................................................................................................................................. **6**\n\n**3.** **Mitigating Customer Churn**\nBalancing Acquisition and Retention .......................................................................................................................................................... **10**\n\n**4.** **Streamlining Customer Analysis and Targeting**\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n\n**5.** **Assessing Consumer Interest Data**\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n\n**6.** **Delivering Personalized Customer Journeys**\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n\n**Conclusion**\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n\n\n-----\n\n### Introduction\n\nIn today’s experience-driven world, the most beloved brands are the ones that\nknow their customers. Customers are loyal to brands that recognize their needs\nand preferences — and tailor user journeys and engagements accordingly.\n\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\nbuying from a brand that personalizes the shopping and user experience to the\nwants and needs of the customer. 
And as organizations pursue omnichannel\nexcellence, these same high expectations of online experiences also extend to\nbrick-and-mortar locations — revealing for many merchants that personalized\nengagement is fundamental to attracting customers and expanding share of wallet.\n\nBut achieving a 360-degree view of your customers to serve personalized\nexperiences requires integrating various types of data — including demographics,\nbehavioral and transactional — to develop robust profiles. This guide focuses on six\nactionable strategic pillars for businesses to leverage automation, real-time data,\nAI-driven analysis and well-tuned ML models to architect and deliver customized\ncustomer experiences at every touch point.\n\n\n# 76%\n\nof consumers are more\nlikely to purchase due to\npersonalization\n\n\n# 76%\n\n\n-----\n\n### Building a Foundation for Personalization\n\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\n\n\nTo create truly personalized interactions, you need actionable insights\nabout your customers. Start by establishing a common customer profile and\naccurately linking together customer records across disparate data sets.\n\nGet a 360-degree view of your target customer by bringing together:\n\n- Sales and traffic-driven first-party data\n\n- Product ratings and surveys\n\n- Customer surveys and support center calls\n\n- Third-party data purchased from data aggregators and online trackers\n\n- Zero-party data provided by customers themselves\n\nLocation\n\n\n**C A S E S T U DY**\n\n**Personalizing‌ ‌experiences‌ with‌ ‌data‌ ‌and‌ ‌ML‌**\n\nGrab is the largest online-to-offline platform in Southeast Asia and\nhas generated over 6 billion transactions for transport, food and\ngrocery delivery, and digital payments. Grab uses Databricks to create\nsophisticated customer segmentation and recommendation engines\nthat can now ingest and optimize thousands of user-generated signals\nand data sources simultaneously, enhancing data integrity and security,\nand reducing weeks of work to only hours.\n\n[Get the full story](https://www.databricks.com/customers/grab)\n\n\n\nDemographics\n\n\nOrders\n\nNetwork/\nUsage\n\n\n“The C360 platform empowered teams to create\nconsumer features at scale, which in turn allows\nfor these features to be extended to other markets\nand used by other teams. This helps to reduce the\nengineering overhead and costs exponentially.”\n\n**N I K H I L DWA R A K A N AT H**\nHead of Analytics, Grab\n\n\nSocial\n\nApps/\nClickstream\n\n|Col1|Col2|Col3|Col4|Col5|Col6|\n|---|---|---|---|---|---|\n|||||||\n||Cus 3|t 6|o|mer 0||\n|||||||\n|||||||\n\n\n\nService Call/\nRecords\n\n\nCustomer\n360\n\n\nBilling\n\nDevices\n\n\n-----\n\nGiven the different data sources and data types, automated matching can still\nbe incredibly challenging due to inconsistent formats, misinterpretation of data,\nand entry errors across various systems. And even if inconsistent, all that data\nmay be perfectly valid — but to accurately connect the millions of customer\nidentities most retailers manage, businesses must lean on automation.\n\nIn a machine learning (ML) approach to entity resolution, text attributes like\nname, address and phone number are translated into numerical representations\nthat can be used to quantify the degree of similarity between any two attribute\nvalues. But your ability to train such a model depends on your access to\naccurately labeled training data. 
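To make that translation step concrete, here is a minimal sketch — not Zingg's actual API, just the underlying idea — in which the text attributes of a candidate pair of records are turned into numerical similarity features and a simple classifier scores the pair. The record fields and the labeled examples are hypothetical.

```python
# Illustrative only: turn text attributes for a candidate pair of records into
# numerical similarity features, then score the pair with a trained classifier.
from difflib import SequenceMatcher

from sklearn.linear_model import LogisticRegression


def pair_features(rec_a: dict, rec_b: dict) -> list:
    """Similarity of name / address / phone, each scaled to [0, 1]."""
    return [
        SequenceMatcher(None, rec_a["name"], rec_b["name"]).ratio(),
        SequenceMatcher(None, rec_a["address"], rec_b["address"]).ratio(),
        1.0 if rec_a["phone"] == rec_b["phone"] else 0.0,
    ]


# Hypothetical pairs labeled by human reviewers (1 = same customer).
labeled_pairs = [
    ({"name": "Jane Doe", "address": "1 Main St", "phone": "555-0100"},
     {"name": "J. Doe", "address": "1 Main Street", "phone": "555-0100"}, 1),
    ({"name": "Jane Doe", "address": "1 Main St", "phone": "555-0100"},
     {"name": "John Roe", "address": "9 Elm Ave", "phone": "555-0199"}, 0),
]

X = [pair_features(a, b) for a, b, _ in labeled_pairs]
y = [label for _, _, label in labeled_pairs]
model = LogisticRegression().fit(X, y)

# Score a new candidate pair: probability the two records are the same entity.
candidate = pair_features(
    {"name": "Jane A. Doe", "address": "1 Main St.", "phone": "555-0100"},
    {"name": "Jane Doe", "address": "1 Main Street", "phone": "555-0100"},
)
print(model.predict_proba([candidate])[0][1])
```

At enterprise scale this pairwise work is what libraries such as Zingg distribute over Spark, adding candidate-pair (blocking) logic so that not every record has to be compared with every other. The handful of labeled pairs above stands in for the human-reviewed matches a real project needs.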
It’s a time-consuming exercise, but if done right,\nthe model learns to reflect the judgments of the human reviewers.\n\nMany organizations rely on libraries encapsulating this knowledge to build their\napplications and workflows. One such library is [Zingg](https://www.zingg.ai/) , an open source library\nbringing together ML-based approaches to intelligent candidate pair generation\nand pair-scoring. Oriented toward the construction of custom workflows, Zingg\npresents these capabilities within the context of commonly employed steps\nsuch as training data label assignment, model training, data set deduplication,\nand (cross-data set) record matching.\n\nBuilt as a native Apache Spark TM application, Zingg scales well to apply these\ntechniques to enterprise-sized data sets. Organizations can then use Zingg in\ncombination with platforms such as Databricks Lakehouse to provide the back\nend to human-in-the-middle workflow applications that automate the bulk of\nthe entity resolution work and present data experts with a more manageable\nset of edge case pairs to interpret.\n\n\nAs an active-learning solution, models can be retrained to take advantage of\nthis additional human input to improve future predictions and further reduce\nthe number of cases requiring expert review. Finally, these technologies can be\nassembled to enable their own enterprise-scaled customer entity resolution\nworkflow applications.\n\n**Need help building your foundation for a**\n**360-degree view of your customers?**\n\nGet pre-built code sample data and step-by-step instructions\nin a Databricks notebook in the **Customer Entity Resolution**\n**Solution Accelerator.**\n\n**•** Translating text attributes (like name, address, phone number)\ninto quantifiable numerical representations\n\n**•** Training ML models to determine if these numerical labels\nform a match\n\n**•** Scoring the confidence of each match\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-entity-resolution)**\n\n\n-----\n\n### Estimating Customer Lifetime Value\n\nBuilding brand loyalty to drive share of wallet with data\n\n\nOnce you’ve set up a 360-degree view of the customer, the next challenge\nis how to spend money to profitably grow the brand. The goal is to spend\nmarketing dollars on activities that attract loyal customers and avoid spending on\nunprofitable customers or activities that damage the brand. Keep in mind, that\nmaking decisions solely based on ROI isn’t the answer. This one-track approach\ncould ultimately weaken your brand equity and make you more dependent on\nlowering your price through promotions as a way to generate sales.\n\n**C A S E S T U DY**\n\n\n**Identifying and engaging brand loyalists**\n\nToday’s customer has overwhelmingly abundant options in products and\nservices to choose from. That’s why personalizing customer experiences is so\nimportant, as it increases revenue, marketing efficiency and customer retention.\n\nNot every customer carries the same potential for profitability. Different\ncustomers derive different value from your products and services, which directly\ntranslates into differences in the overall amount of value a business can expect\nin return. 
Mutually beneficial relationships carefully align customer acquisition\ncost (CAC) and retention rates with the total revenue or customer lifetime value\n(CLV).\n\n\n**Predicting and increasing customer lifetime value with ML**\n\n\nKolibri Games, creators of Idle Miner Tycoon and Idle Factory Tycoon,\nattracts over 10 million monthly active users. With Databricks, they\nachieved a 30% increase in player LTV, improved data team productivity\nby 3x, and reduced ML model-to-production time by 40x.\n\n[Get the full story](https://databricks.com/customers/kolibri-games)\n\nWithin your existing customer base are people ranging from brand loyalists to\nbrand transients. Brand loyalists are highly engaged with your brand, are willing\nto share their experience with others, and are the most likely to purchase\nagain. Brand transients have no loyalty to your brand and shop based on price.\nYour focus should be on growing the group of brand loyalists while minimizing\ninteractions with brand transients.\n\n\n**Calculating customers’ lifetime intent**\n\nTo assess the remaining lifetime in a customer relationship, businesses must\n\ncarefully examine the transactional signals and other indicators from previous\ncustomer engagements and transactions.\n\nFor example, if a frequent customer slows down their buying habits — or simply\ndoesn’t make a purchase for an extended period of time — it may signal the\nupcoming end of the relationship. However, in the case of another customer\nwho engages infrequently, the same extended absence may not signal anything\nnotable. The infrequent buyer may continue to purchase even after a long pause\nin activity.\n\n\n-----\n\nCustomer A\n\nCustomer B\n\nCustomer C\n\n\nPast Future\n\nDifferent customers with the same number of transactions, but signaling different lifetime intent. The probability of re-engagement (P_alive) relative to a customer’s history of purchases.\n\n\nEvery customer relationship with a business has a lifespan. Understanding what\npoint in the lifespan at a given time provides critical insight to inform marketing\nand sales tactics. By proactively discovering shifts in the relationship, you can\nadapt how to respond to each customer at the optimal time. For example, a\ncertain signal might prompt a change in how to deliver products and services,\nwhich could help maximize revenue.\n\nTransactional signals can be used to estimate the probability that a customer\nis active and likely to return in the future. Popularized as the Buy ’til You Die\n(BTYD) model, analysts can compare a customer’s frequency and recency of\n\nengagement to similar patterns across their user population to accurately\npredict individual CLV.\n\n\nThe mathematics behind these predictive CLV models is complex, but the logic\nbehind these critical models is accessible through a popular Python library\nnamed Lifetimes, which allows the input of simple summary metrics in order to\nderive customer-specific lifetime estimates.\n\n**C A S E S T U DY**\n\n**How personalized experiences keep customers coming**\n**back for more**\n\nPublicis Groupe empowers brands to transform retail experiences with\ndigital technologies, but data challenges and team silos stood in the\nway of delivering the personalization that their customers required.\nSee how they use Databricks to create a single customer view that\nallows them to drive customer loyalty and retention. 
As a result, they’ve\nseen a 45%–50% increase in customer campaign revenue.\n\n[Get the full story](https://databricks.com/customers/publicis-groupe)\n\n\n-----\n\n**Delivering customer lifetime estimates to the business**\n\n\nSpark natively distributes this work across a multi-server environment, enabling\nconsistent, accurate and efficient analysis. Spark’s flexibility allows models to\nadapt in real time as new information is ingested, eliminating the bottlenecks\nthat come with manual data mapping and profile building.\n\nWith per customer metrics calculated, the Lifetimes library can be used to train\nmultiple BTYD models, such as Pareto/NBD and BG/NBD. Training models to\npredict engagements over time using proprietary data can take several months\nand thousands of training runs. [Hyperopt](http://hyperopt.github.io/hyperopt/) , a specialized snippet library, helps\nbusinesses tap into the infrastructure behind their Spark environments and\ndistribute the training outputs across models.\n\n\nUsing the Lifetimes library to calculate customer-specific probabilities at speed\nand scale can be challenging — from processing large volumes of transaction\ndata to deriving data curves and value distribution patterns and, eventually,\nto integration with business initiatives. But with the proper approach, you can\nresolve all of them.\n\nThese models depend on three key per customer metrics:\n\n**FREQUENCY**\nThe number of times within a given time period in which a repeat\ntransaction is observed\n\n**AGE**\nThe length of time between the occurrence of an initial transaction\nto the end of a given time period\n\n**RECENCY**\n\nThe “age” of a customer (how long they’ve engaged with a brand)\nat the time of their latest repeat transaction\n\n\n-----\n\n**Solution deployment**\n\n\nOnce properly trained, these models can determine the probability that a\ncustomer will re-engage, as well as the number of engagements a business\ncan expect from that customer over time. But the real challenge is putting\nthese predictive capabilities into the hands of those that determine\ncustomer engagement.\n\nMatrices illustrating the probability a customer is alive (left) and the number of future\npurchases in a 30-day window given a customer’s frequency and recency metrics (right).\n\n\nBusinesses need a way to develop and deploy solutions in a highly scalable\nenvironment with a limited upfront cost. Databricks Solution Accelerators\nleverage real-world sample data sets and pre-built code to show how raw data\ncan be transformed into real solutions — including step-by-step instructions\nready to go in a Databricks notebook.\n\n**Need help determining your customers’**\n**lifetime value?**\n\nUse the **Customer Lifetime Value Accelerator** to\n\n**•** Ingest sample retail data\n\n**•** Use pre-built code to develop visualizations and explore\npast purchase behavior\n\n**•** Apply machine learning to predict the likelihood and\nnature of future purchases\n\n**[GET THE ACCELERATOR](https://databricks.com/solutions/accelerators/customer-lifetime-value)**\n\n\n-----\n\n### Mitigating Customer Churn\n\nBalancing acquisition and retention with personalized experiences\n\n\nThere are no guarantees of success. With a bevy of options at their disposal,\ncustomer churn is a reality that companies face and are focused on overcoming\nevery day. 
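The transactional signals that flag churn risk are the same ones that feed the CLV models described in the previous section. As a rough sketch of that workflow — using the open source Lifetimes library and its bundled CDNOW sample data rather than any proprietary data set — the per-customer summary metrics can be fitted with a BG/NBD model and turned into P(alive) and expected-purchase estimates:

```python
# A rough sketch (not production code) of the CLV approach described earlier,
# using the open source Lifetimes library and its bundled CDNOW sample data.
# On real data, lifetimes.utils.summary_data_from_transaction_data() derives
# the same frequency / recency / T (age) columns from a raw transaction log.
from lifetimes import BetaGeoFitter
from lifetimes.datasets import load_cdnow_summary

summary = load_cdnow_summary(index_col=[0])  # columns: frequency, recency, T

# Fit a BG/NBD ("Buy 'til You Die") model on the per-customer summary metrics.
bgf = BetaGeoFitter(penalizer_coef=0.0)
bgf.fit(summary["frequency"], summary["recency"], summary["T"])

# Probability each customer is still "alive", plus expected purchases over the
# next 30 days -- the P_alive and future-purchase views described above.
summary["p_alive"] = bgf.conditional_probability_alive(
    summary["frequency"], summary["recency"], summary["T"]
)
summary["purchases_next_30d"] = bgf.conditional_expected_number_of_purchases_up_to_time(
    30, summary["frequency"], summary["recency"], summary["T"]
)
print(summary.sort_values("purchases_next_30d", ascending=False).head())
```

Customers whose P(alive) is sliding are natural candidates for the retention efforts discussed below.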
One [recent analysis](https://info.recurly.com/annual-subscription-billling-metrics-report?submissionGuid=3c21cde7-5f58-4d86-9218-332d697e7b3e) of consumer-oriented subscription services\nestimated a segment average 7.2% monthly rate of churn. When narrowed to\nbrands focused on consumer goods, that rate jumped to 10.0%. This figure\ntranslates to a lifetime of 10 months for the average subscription box service,\nleaving businesses of this kind with little time to recover acquisition costs and\nbring subscribers to net profitability.\n\n**C A S E S T U DY**\n##### Riot Games\n\n**Creating an optimal in-game experience for League of Legends**\n\nRiot Games is one of the top PC game developers in the world, with over\n100 million monthly active users, 500 billion data points, and over 26\npetabytes of data and counting. They turned to Databricks to build a more\n\nefficient and scalable way to leverage data and improve the overall gaming\nexperience — ensuring customer engagement and reducing churn.\n\n[Get the full story](https://www.databricks.com/customers/riot-games)\n\nOrganizations must take an honest look at the cost of acquisition relative to a\ncustomer’s lifetime value (LTV) earned. These figures need to be brought into a\n\nhealthy balance and treated as a “chronic condition” [to be managed.](https://retailtouchpoints.com/features/trend-watch/can-subscription-retail-solve-its-customer-retention-problem)\n\n\n**Understanding attrition predictability through subscriptions:**\n**Examining retention-based acquisition variables**\n\nPublic data for subscription services is extremely hard to come by. KKBox, a\nTaiwan-based music streaming service, recently released over two years of\nanonymized [subscription data](https://www.kaggle.com/c/kkbox-churn-prediction-challenge) to examine customer churn. Through analyzing\nthe data, we uncover customer dynamics familiar to any subscription provider.\n\nMost subscribers join the KKBox service through a 30-day trial offer. Customers\nthen appear to enlist in one-year subscriptions, which provide the service with\na steady flow of revenue. Subscribers typically churn at the end of the 30-day\ntrial and at regular one-year intervals.\n\nThe Survival Rate reflects the proportion of the initial (Day 1) subscriber population that is\nretained over time, first at the roll-to-pay milestone, and then at the renewal milestone.\n\n\n-----\n\nBy Initial Payment Method\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers registering via different payment methods.\n\nBy Initial Payment Plan Days\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers selecting different initial payment methods and terms/days.\n\n\nThis pattern of high initial drop-off, followed by a period of slower but continuing\ndrop-off cycles makes intuitive sense. Where it gets interesting is when the\ndata changes. The patterns of customer churn become vastly different as time\npasses and new or changing elements are introduced (e.g., payment methods\nand options, membership tiers, etc.).\n\nBy Registration Channel\n\ntimeline\n\nCustomer attrition by subscription day on the KKBox streaming service for\ncustomers registering via different channels.\n\n\n-----\n\nThese patterns seem to indicate that KKBox _could_ potentially differentiate\nbetween customers based on their lifetime potential, using only the information\navailable at subscriber acquisition. 
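Attrition curves like the ones described here can be reproduced with the open source lifelines library. The sketch below assumes a hypothetical `subs` DataFrame with a tenure column, a churn flag and the registration channel; the same pattern works for initial payment method or plan days.

```python
# Illustrative sketch: survival (retention) curves per registration channel,
# built with the open source `lifelines` library. The `subs` DataFrame and its
# columns (tenure_days, churned, channel) are hypothetical.
import pandas as pd
from lifelines import KaplanMeierFitter

subs = pd.DataFrame({
    "tenure_days": [30, 30, 45, 365, 400, 30, 200, 365, 720, 15],
    "churned":     [1,  1,  1,  1,   0,   1,  0,   1,   0,   1],  # 0 = still active
    "channel":     ["web", "web", "app", "app", "app",
                    "partner", "web", "partner", "app", "web"],
})

kmf = KaplanMeierFitter()

# One survival curve per registration channel, mirroring the
# "attrition by registration channel" view discussed above.
for channel, grp in subs.groupby("channel"):
    kmf.fit(grp["tenure_days"], event_observed=grp["churned"], label=channel)
    print(channel)
    print(kmf.survival_function_.tail(1))  # share of subscribers still active
```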
In the same way, non-subscription businesses\ncould use similar data techniques to get an accurate illustration of the total\nlifetime value of a particular customer, even before collecting historical data.\n\nThis information can help businesses target certain shoppers with effective\ndiscounts or promotions as early as trial registration. Nevertheless, it’s always\nimportant to consider more than individual data points.\n\nThe baseline risk of customer attrition over a subscription lifespan.\n\n\nThe channel and payment method multipliers combine to explain a customer’s risk of attrition\nat various points in time. The higher the value, the higher the proportional risk of churn in the\nassociated period.\n\n\n-----\n\n**Applying churn analytics to your data**\n\nThis analysis is useful in two ways: **1)** to quantify the risk of customer churn and\n**2)** to paint a quantitative picture of the specific factors that explain that risk,\ngiving analysts a clearer understanding of what to focus on, what to ignore and\nwhat to investigate further. The main challenge is organizing the input data.\n\nThe data required to examine customer attrition may be scattered across\nmultiple systems, making an integrated analysis difficult. [Data lakes](https://databricks.com/discover/data-lakes/introduction) support\nthe creation of transparent, sustainable data processing pipelines that are\nflexible, scalable and highly cost-efficient. Remember that **churn is a chronic**\n**condition to be managed** , and attrition data should be periodically revisited to\nmaintain alignment between acquisition and retention efforts.\n\n**Need help predicting customer churn?**\n\nUse the **Subscriber Churn Prediction Accelerator** to analyze\nbehavioral data, identify subscribers with an increased risk of\ncancellation, and predict attrition. Machine learning lets you\nquantify a user’s likelihood to churn, identifying factors that\nexplain the risk.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/survivorship-and-churn)**\n\n\n-----\n\n### Streamlining Customer Analysis and Targeting\n\nCreating efficient and highly targeted customer experiences with behavioral data\n\n\nEffective targeting comes down to one fundamental element: the cost of\ndelivering a good or service relative to what a consumer is willing to pay.\n\nIn the earliest applications of segmentation, manufacturers recognized that\nspecialized product lines targeting specific consumer groups could help\nbrands stand out against competitors.\n\n**C A S E S T U DY**\n\n**Finding that special something every time**\n\nPandora is a jewelry company with global reach. They built their master\nconsumer view (MCV) dashboard on the Databricks Lakehouse Platform,\ngiving them the insights necessary to deliver highly targeted messaging\nand personalization — resulting in 80% growth in email marketing\nsuccess, a 50% increase in click-to-open rate across 65 million emails,\nand 255M DKK (Danish Krone) in quarterly revenue.\n\n[Get the full story](https://www.databricks.com/customers/pandora)\n\nThis mode of thinking extends beyond product development and into every\ncustomer-oriented business function, requiring specific means of ideation,\nproduction and delivery. The work put into segmentation doesn’t need to be\na gamble. Scrutinizing customers and testing responsiveness is an ongoing\nprocess. 
Organizations must analyze and adapt to shifting markets, changing\nconsumer demand and evolving business objectives.\n\n\n**C A S E S T U DY**\n\n**Powering insight-driven dashboards to increase customer**\n**acquisition**\n\nBagelcode is a global game company with more than 50 million global\nusers. By using the Databricks Lakehouse Platform, they are now able to\nsupport more diversified indicators, such as a user’s level of frequency\nand the amount of time they use a specific function for each game,\nenabling more well-informed responses. In addition, the company is\nmitigating customer churn by better predicting gamer behavior and\nproviding personalized experiences at scale.\n\n[Get the full story](https://www.databricks.com/customers/bagelcode)\n\n“Thanks to Databricks Lakehouse, we can support\nreal-time business decision-making based on data\nanalysis results that are automatically updated on\nan hourly and daily basis, even as data volumes have\nincreased by nearly 1,000 times.”\n\n**J O O H Y U N K I M**\nVice President, Data and AI, Bagelcode\n\n\n-----\n\nA brand’s goal with segmentation should be to define a shared customer\nperspective on customers, allowing the organization to engage users consistently\nand cohesively. But any adjustments to customer engagement require careful\nconsideration of [organizational change concerns](https://www.researchgate.net/publication/45348436_Bridging_the_segmentation_theorypractice_divide) .\n\n**C A S E S T U DY**\n\n**Responding to global demand shifts with ease**\n\nReckitt produces some of the world’s most recognizable and trusted\nconsumer brands in hygiene, health and nutrition. With Databricks\nLakehouse on Azure, they’re able to meet the needs of billions of\nconsumers worldwide by surfacing real-time, highly accurate, deep\ncustomer insights, leading to a better understanding of trends and\ndemand, allowing them to provide best-in-class experiences in\nevery market.\n\n[Get the full story](https://www.databricks.com/customers/reckitt)\n\n\n**A segmentation walk-through: Grocery chain promotions**\n\nA promotions management team for a large grocery chain is responsible for\nrunning a number of promotional campaigns, each of which is intended to drive\ngreater overall sales. Today, these marketing campaigns include leaflets and\ncoupons mailed to individual households, manufacturer coupon matching,\nin-store discounts and the stocking of various private-label alternatives to\npopular national brands.\n\nRecognizing uneven response rates between households, the team is eager to\ndetermine if customers might be segmented based on their responsiveness\nto these promotions. They anticipate that such segmentation may allow the\npromotions management team to better target individual households, driving\noverall higher response rates for each promotional dollar spent.\n\nUsing historical data from point-of-sale systems along with campaign\ninformation from their promotions management systems, the team derives\na number of features that capture the behavior of various households with\nregard to promotions. Applying standard data preparation techniques, the data\nis organized for analysis and using a variety of clustering algorithms, such as\nk-means and hierarchical clustering, the team settles on two potentially useful\ncluster designs.\n\n\n-----\n\nOverlapping segment designs separating households based on their responsiveness to\nvarious promotional offerings. 
Profiling of clusters to identify differences in behavior across clusters.\n\n**Assessing results**\n\n\nComparing households by demographic factors not used in developing the\nclusters themselves, some interesting patterns separating cluster members\nby age and other factors are identified. While this information may be useful\n\nin not only predicting cluster membership and designing more effective\ncampaigns targeted to specific groups of households, the team recognizes\nthe need to collect additional demographic data before putting too much\nemphasis on these results.\n\n\nWith profiling, marketers can discern those customer households in the\nhighlighted example fall into two groups: those who are responsive to coupons\nand mailed leaflets, and those who are not. Further divisions show differing\ndegrees of responsiveness to other promotional offers.\n\n\n-----\n\n**Need help segmenting your customers for**\n**more targeted marketing?**\n\nUse the **Customer Segmentation Accelerator** and drive\nbetter purchasing predictions based on behaviors. Through\nsales data, campaigns and promotions systems, you can\nbuild useful customer clusters to effectively target various\nhouseholds with different promos and offers.\n\nAge-based differences in cluster composition of behavior-based customer segments.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\nThe results of the analysis now drive a dialog between the data scientists and\nthe promotions management team. Based on initial findings, a revised analysis\nwill be performed focused on what appear to be the most critical features\ndifferentiating households as a means to simplify the cluster design and evaluate\noverall cluster stability. Subsequent analyses will also examine the revenue\n\ngenerated by various households to understand how changes in promotional\nengagement may impact customer spending.\n\nUsing this information, the team believes they will have the ability to make a case\nfor change to upper management. Should a change in promotions targeting be\napproved, the team makes plans to monitor household spending, promotions\nspend and campaign responsiveness rates using much of the same data used in\nthis analysis. This will allow the team to assess the impact of these efforts and\nidentify when the segmentation design needs to be revisited.\n\n\n-----\n\n#### Assessing Consumer Interest Data to Inform Engagement Strategies\n\nFine-tuning ML recommendations to boost conversions\n\n\nPersonalization is a [journey](https://www.bcg.com/publications/2021/the-fast-track-to-digital-marketing-maturity) . To operationalize personalized experiences, it’s\nimportant to identify high-value audiences who have the highest likelihood of\nspecific actions. Here’s where **propensity scoring** comes in.\n\nSpecifically, this process allows companies to estimate customers’ potential\nreceptiveness to an offer or to content related to a subset of products, and\ndetermine which messaging to apply. 
Calculating propensity scores requires\nassessment of past interactions and data points (e.g., frequency of purchases,\npercentage of spend associated with a particular product category, days since\nlast purchase and other historical data).\n\nDatabricks provides critical capabilities for propensity scoring (like the Feature\nStore, AutoML and MLflow) to help businesses answer three key considerations\nand develop a robust process:\n\n**1.** How to maintain the significant number of features used\nto train propensity models\n\n**2.** How to rapidly train models aligned with new campaigns\n\n**3.** How to rapidly re-deploy models, retrained as customer\npatterns drift, into the scoring pipeline\n\n**Boosting model training efficiency**\n\nWith the [Databricks Feature Store](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) , data scientists can easily reuse features\ncreated by others.\n\n\nThe feature store is a centralized repository that enables the persistence,\ndiscovery and sharing of features across various model training exercises.\nAs features are captured, lineage and other metadata are captured. Standard\nsecurity models ensure that only permitted users and processes may\nemploy these features, enforcing the organization’s data access policies on\ndata science processes.\n\n**Extracting the complexities of ML**\n\n[Databricks AutoML](https://docs.databricks.com/applications/machine-learning/automl.html) allows you to quickly generate models by leveraging industry\nbest practices. As a glass box solution, AutoML first generates a collection of\nnotebooks representing various aligned model variations. In addition to iteratively\ntraining models, AutoML allows you to access the notebooks associated with each\nmodel, creating an editable starting point for further exploration.\n\n**Streamlining the overall ML lifecycle**\n\n[MLflow](https://docs.databricks.com/applications/mlflow/index.html) is an open source machine learning model repository, managed within the\nDatabricks Lakehouse. This repository enables tracking and analysis of the various\nmodel iterations generated by both AutoML and custom training cycles alike.\n\nWhen used in combination with the Databricks Feature Store, models persisted\nwith MLflow can retain knowledge of the features used during training. As models\nare retrieved, this same information allows the model to retrieve relevant features\nfrom the Feature Store, greatly simplifying the scoring workflow and enabling\nrapid deployment.\n\n\n-----\n\n**How to build a propensity scoring workflow with Databricks**\n\nUsing these features in combination, many organizations implement propensity\nscoring as part of a three-part workflow:\n\n**1.** Data engineers work with data scientists to define features relevant\nto the propensity scoring exercise and persist these to the Feature Store.\nDaily or even real-time feature engineering processes are then defined\nto calculate up-to-date feature values as new data inputs arrive.\n\nModel Training\nand Deployment\n\n\n**2.** As part of the inference workflow, customer identifiers are presented to\npreviously trained models in order to generate propensity scores based on\nthe latest features available. 
Feature Store information captured with the\nmodel allows data engineers to retrieve these features and easily generate\nthe desired scores, which can then be used for analysis within Databricks\nLakehouse or published to downstream marketing systems.\n\n**3.** In the model-training workflow, data scientists periodically retrain the\npropensity score models to capture shifts in customer behaviors. As these\nmodels are persisted to MLfLow, change management processes are used\nto evaluate and elevate those models that meet organizational criteria-toproduction status. In the next iteration of the inference workflow, the latest\nproduction version of each model is retrieved to generate customer scores.\n\n\nScore Generation\nand Publication ETL\n\n**Need help assessing interest from your**\n**target audience?**\n\n\nFeature\nEngineering ETL\n\nFeature Store Profiles\n\n\nSales\n\nPromotions\n\nCustomer\n\n\nUse the **Propensity Scoring Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nDownstream\nApplications\n\n\nA three-part propensity scoring workflow.\n\n\n-----\n\n### Delivering Personalized Customer Journeys\n\nStrategies for crafting a real-time recommendation engine\n\n\nAs the economy continues to weather unpredictable disruptions, shortages and\ndemand, delivering personalized customer experiences at speed and scale will\nrequire adaptability on the ground and within a company’s operational tech stack.\n\n\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\nstrategy and operations, allowing them to create a “golden customer\nrecord” that improves all decision-making from forecasting demand to\npowering their global loyalty program.\n\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\n\n\n**C A S E S T U DY**\n\n\n“Databricks Lakehouse allows every division in our\norganization — from automotive to retail — to gain\na unified view of our customer across businesses.\nWith these insights, we can optimize everything from\nforecasting and supply chain, to powering our loyalty\nprogram through personalized marketing campaigns,\ncross-sell strategies and offers.”\n\n**D M I T R I Y D O V G A N**\nHead of Data Science, Al-Futtaim Group\n\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\nsafety and community, brands most attuned to changing needs and sentiments\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. 
While some segments gained\nbusiness and many lost, organizations that had already begun the journey toward\nimproved customer experience saw better outcomes, closely mirroring patterns\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\n\n\n**Creating a unified view across 200+ brands**\n\nAs a driving force for economic growth in the Middle East, Al-Futtaim\nimpacts the lives of millions of people across the region through the\ndistribution and operations of global brands like Toyota, IKEA, Ace\nHardware and Marks & Spencer.\n\nAl-Futtaim’s focus is to harness their data to improve all areas of the\nbusiness, from streamlining the supply chain to optimizing marketing\nstrategies. But with the brands capturing such a wide variety of data,\nAl-Futtaim’s legacy systems struggled to provide a single view into\nthe customer due to data silos and the inability to scale efficiently to\nmeet analytical needs.\n\n\n-----\n\nThe personalization of customer experiences will remain a key focus for B2C\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\nlong-established players.\n\n**Focus on the customer journey**\n\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\n\n**C A S E S T U DY**\n\n**Personalizing the beauty product shopping experience**\n\nFlaconi wanted to leverage data and AI to become the No. 1 online\nbeauty product destination in Europe. However, they struggled with\nmassive volumes of streaming data and with infrastructure complexity\nthat was resource-intensive and costly to scale. 
See how they used\nDatabricks to increase time-to-market by 200x, reduce staff costs by\n40% and increase net order income.\n\nGet the full story\n\n\n¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\nExperience Performance Index in 2007-09.\n\nSource: Forrester Customer Experience Performance Index (2007-09); press search\n\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\n\n\n-----\n\nCareful consideration of how customers interact with various assets — and how\nthese interactions may be interpreted as expressions of preference — can unlock\na wide range of data that enables personalization.\n\n\nThe complexity of these engines requires that they be deployed thoughtfully, using\nlimited pilots and customer response assessments. And in those assessments,\nit’s important to keep in mind that there is no expectation of perfection — only\nincremental improvement over the prior solution.\n\n\n**C A S E S T U DY**\n\n**Need help generating personalized**\n**recommendations?**\n\n\n**Connecting shoppers to savings with data-driven**\n**personalization‌**\n\n\nUse the **Recommendation Engines Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nFlipp is an online marketplace that aggregates weekly shopping circulars,\nso consumers get deals and discounts without clipping coupons. Siloed\ncustomer data sources once made getting insights difficult. Now with\nDatabricks, Flipp’s data teams can access and democratize data, helping\nthem do their jobs more effectively while bringing better deals to users,\nmore meaningful insights to partners, and a 10% jump in foot traffic to\nbrick-and-mortar retailers.\n\nGet the full story\n\nThe engines we use to serve content based on customer preferences are known\nas recommenders. With some recommenders, a heavy focus on the shared\npreferences of similar customers helps define what recommendations will actually\nmake an impact. With others, it can be more useful to focus on the properties of\nthe content itself (e.g., product descriptions).\n\n\n-----\n\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\n\n\nProviding deep, effective personalized experiences to customers depends\non a brand’s ability to intelligently leverage consumer and market data from a\nwide variety of sources to fuel faster, smarter decisions — without sacrificing\naccuracy for speed. 
The Databricks Lakehouse Platform is purpose-built for\nexactly that, offering a scalable data architecture that unifies all your data,\nanalytics and AI to deliver unforgettable customer experiences.\n\nCreated on open source and open standards, Databricks offers a robust\nand cost-effective platform for brands to collaborate with partners, clients,\nmanufacturers and distributors to unleash more innovation and efficiencies\nat every touch point. Businesses can rapidly ingest available data in real time,\n\n\nat scale, and create accessible, data-driven insights that enable actionable\nstrategies across the value chain.\n\nDatabricks is a multicloud platform, designed for quick enterprise development.\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\ntheir company’s operational health and the evolving needs of their customers\n— all while empowering teams to easily unify data efforts, perform fine-grained\nanalyses and streamline cross-functional data operations using a single,\nsophisticated solution.\n\n\n###### Learn more about Databricks Lakehouse for industries\n like Retail & Consumer Goods, Media & Entertainment\n and more at databricks.com/solutions\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n\nis headquartered in San Francisco, with offices around the globe. Founded by\n\nthe original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n\na mission to help data teams solve the world’s toughest problems. To learn more,\n\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=563736421186&utm_term=databricks%20free%20trial&gclid=Cj0KCQjwpeaYBhDXARIsAEzItbHzQGCu2K58-lnVCepMI5MYP6jTXkgfvqmzwAMqrlVwVOniebOE43UaAk3OEALw_wcB)**\n\n##### Contact us for a personalized demo databricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "#### eBook\n\n# Big Book of Retail\n & Consumer Goods Use Cases\n\n##### Driving real-time decisions\n with the Lakehouse\n\n\n-----\n\n### Contents (1/2) C H A P T E R 1 : \u0007 Introduction 4\n\n**C H A P T E R 2 :** \u0007 **Modern Data Platform for Real-Time Retail** 6\n\nCommon challenges 6\n\nThe Lakehouse for Retail 8\n\n**C H A P T E R 3 :** **Use Case: Real-Time Supply Chain Data** \u0007 12\n\nCase Study: Gousto 14\n\nCase Study: ButcherBox 14\n\n**C H A P T E R 4 :** \u0007 **Use Case: Truck Monitoring** 15\n\nCase Study: Embark 16\n\n**C H A P T E R 5 :** **Use Case: Inventory Allocation** \u0007 17\n\nCase Study: H&M 19\n\nCase Study: Edmunds 19\n\n**C H A P T E R 6 :** **Use Case: Point of Sale and Clickstream** \u0007 20\n\n**C H A P T E R 7 :** **Use Case: On-Shelf Availability** \u0007 22\n\nCase Study: Reckitt 25\n\n**C H A P T E R 8 :** **�Use Case: Customer and Vehicle Identification** 26\n\n**C H A P T E R 9 :** \u0007 **Use 
Case: Recommendation Engines** 28\n\nCase Study: Wehkamp 31\n\nCase Study: Columbia 31\n\nCase Study: Pandora 31\n\n**C H A P T E R 1 0 :** \u0007 **Use Case: Perpetual Inventory** 32\n\n**C H A P T E R 1 1 :** \u0007 **Use Case: Automated Replenishments** 34\n\n\n-----\n\n### Contents (2/2) C H A P T E R 1 2 : \u0007 Use Case: Fresh Food Forecasting 36\n\nCase Study: ButcherBox 37\n\nCase Study: Sam’s Club 37\n\n**C H A P T E R 1 3 :** \u0007 **Use Case: Propensity-to-Buy** 38\n\n**C H A P T E R 1 4 :** \u0007 **Use Case: Next Best Action** 41\n\n**C H A P T E R 1 5 :** **Customers That Innovate With Databricks Lakehouse for Retail** \u0007 43\n\n**C H A P T E R 1 6 :** \u0007 **Conclusion** 43\n\n\n-----\n\n**CHAPTER 1:**\n### Introduction\n\n\nRetailers are increasingly being challenged to make time-sensitive decisions in their operations. Consolidating\n\ne-commerce orders. Optimizing distribution to ensure item availability. Routing delivery vehicles. These\n\ndecisions happen thousands of times daily and have a significant financial impact. Retailers need real-time data\n\nto support these decisions, but legacy systems are limited to data that’s hours or days old.\n\n**When seconds matter, only the Lakehouse delivers better decisions**\n\nRetail is a 24/7 business where customers expect accurate information and immediate relevant feedback.\n\nThe integration of physical and e-commerce customer experiences into an omnichannel journey has been\n\nhappening for the past 20 years, but the pandemic provided a jolt to consumer trends that dramatically shifted\n\npurchasing patterns.\n\nIn reaction to these industry changes, retailers have responded with significant, rapid investments — including\n\nstronger personalization, order fulfillment, and delivery and loyalty systems. While these new targeted\n\ncapabilities have addressed the immediate need — and created expectations of making decisions in real\n\ntime — most retailers still rely on legacy data systems, which impedes their ability to scale these innovations.\n\nUnfortunately, most legacy systems are only able to process information in hours or days.\n\nThe delays caused by waiting for data are leading to significant risks and costs for the industry.\n\n**Grocers** need to consolidate order picking to achieve profitability in e-commerce, but this requires up-to-\n\nthe-minute order data. Not having this information causes them to spend more resources on having people\n\npick orders separately, at a higher operating cost.\n\n**Apparel retailers** must be able to present the correct available inventory on their website. This requires\n\nthat in-store sales be immediately reflected in their online systems. Inaccurate information can lead to lost\n\nsales, or worse, the customer becoming unsatisfied and moving to different retailers.\n\n\n-----\n\n**Convenience fuel retailers** must collaborate with distribution centers, direct-to-store delivery distributors\n\nand other partners. Having delayed data can lead to out-of-stocks, costing stores thousands of dollars per\n\nweek.\n\nThe margin of error in retail has always been razor thin, but with a pandemic and inflationary pressures, it’s at\n\nzero. 
Reducing the error rate requires better predictions and real-time data.\n\n**Use Case Guide**\n\nIn this use case guide, we show how the Databricks Lakehouse for Retail is helping leading organizations take\n\n**all of their data in a single lakehouse architecture, streamline their data engineering and management,**\n\n**make it ready for SQL and ML/AI** , and **do so very fast within their own cloud infrastructure environment**\n\n**based on open source and open standards** . These capabilities are all delivered at world-record-setting\n\nperformance, while achieving a market-leading total cost of ownership.\n\nDatabricks Lakehouse for Retail has become the industry standard for enabling retailers to drive decisions\n\nin real time. This use case guide also highlights common use cases across the industry, and offers additional\n\nresources in the form of Solution Accelerators and reference architectures to help as you embark on your own\n\njourney to drive better customer experiences with data and AI.\n\n\n-----\n\n**CHAPTER 2:**\n### Modern Data Platform\n for Real-Time Retail\n\n\nRetailers continue to adapt to rapidly shifting dynamics across the omnichannel. In navigating these\n\nchanges, retailers are increasingly focused on improving the real-time availability of data and insights, and\n\nperforming advanced analytics delivered within tight business service windows.\n\n**Common challenges**\n\nIn response to the surge in e-commerce and volatility in their supply chains, retailers are investing millions\n\nin modernizing distribution centers, partnering with delivery companies, and investing in customer\n\nengagement systems.\n\nWarehouse automation is expected to become a $41B market according to Bloomberg. Increasingly,\n\ndistribution centers are being automated with robotics to power dynamic routing and delivery. Shoppers\n\nthat became accustomed to having fast, same-day, and sometimes even overnight delivery options\n\nduring the pandemic now expect them as the norm. Retailers understand that the shipping and delivery\n\nexperience is now one of many touchpoints that merchants can use to develop customer brand loyalty.\n\n## $41B Market | Retail Warehouse Automation\n\nYet while retailers modernize different areas of their operations, they’re constrained by a single point of\n\nweakness, as they are reliant on legacy data platforms to bring together all of this data.\n\nPowering real-time decisions in modern retail requires real-time ingestion of data, transformation,\n\ngovernance of information, and powering business intelligence and predictive analytics all within the time\n\nrequired by retail operations.\n\n\n-----\n\n**Ingesting large volumes of transactional data in real time.** The biggest blocker to crucial insights\n\nis the ability to ingest data from transaction systems in real time. Transaction logs from point-of-sale\n\nsystems, clickstreams, mobile applications, advertising and promotions, as well as inventory, logistics\n\nand other systems, are constantly streaming data. Big data sets need to be ingested, cleansed and\n\naggregated and integrated with each other before they can be used. The problem? Retailers have used\n\nlegacy data warehouses that are built around batch processing. And worse, increasing the frequency\n\nof how often data is processed leads to a “hockey stick” in costs. As a result of these limitations,\n\nmerchants resort to ingesting data nightly to deal with the large volumes of data and integration with\n\nother data sets. The result? 
Accurate data to drive decisions can be delayed by days.\n\n**Performing fine-grained analysis at scale within tight time windows.** Retailers have accepted a\n\ntrade-off when performing analysis. Predictions can be detailed and accurate, or they can be fast.\n\nRunning forecasts or price models at a day, store and SKU level can improve accuracy by 10% or more,\n\nbut doing so requires tens of millions of model calculations that need to be performed in narrow service\n\nwindows. This is well beyond the capability of legacy data platforms. As a result, companies have been\n\nforced to accept the trade-off and live with less accurate predictions.\n\n**\u0007Powering real-time decisions on the front line.** Data is only useful if it drives decisions, but serving\n\nreal-time data to thousands of employees is a daunting task. While data warehouses are capable\n\nof serving reports to large groups of users, they’re still limited to stale data. Most retailers limit the\n\nfrequency of reports to daily or weekly updates and depend on the staff to use their best judgment for\n\ndecisions that are more frequent.\n\n**\u0007Delivering a hyper-personalized omnichannel experience.** The storefront of the 21st century is\n\nfocused on delivering personalized experiences throughout the omnichannel. Retailers have access to\n\na trove of customer data, and yet off-the-shelf tools for personalization and customer segmentation\n\nstruggle to deal with high volumes, and the analytics have high rates of inaccuracy. Retailers need to\n\ndeliver personalized experiences at scale to win in retail.\n\n\n-----\n\n###### The Lakehouse for Retail\n\nDatabricks Lakehouse for Retail solves these core challenges. The Lakehouse unlocks the ability to unify\n\nall types of data — from images to structured data — in real time, provide enterprise-class management\n\nand governance, and then immediately turn that data into actionable insights with real-time reporting and\n\npredictive analytics. It does this with record-setting speed and industry-leading total cost of ownership\n\n(TCO) in a platform-as-a-service (PaaS) that allows customers to solve these pressing problems.\n\n**Any structure** **Reliable, real-time** **Capabilities for** **Data sharing**\n**or frequency** **processing** **any persona** **& collaboration**\n\n_Semi-structured batch_\n\n\n**All of**\n**your sources**\n\nCompetitive activity\n\nE-commerce\n\nMobile Applications\n\nVideo & Images\n\nPoint of Sale\n\nDistribution & Logistics\n\nCustomer & Loyalty\n\nDelivery & Partners\n\n\n_Structured real-time_\n\n_Semi-structured real-time_\n\n_Unstructured batch_\n\n_Semi-structured real-time_\n\n_Structured real-time_\n\n_Structured batch_\n\n\nData Lakehouse\n\nData Management and Governance\n\nProcess, manage and query all of your data\n\n\nAd Hoc Data Science\n\n**Internal Teams**\n\nProduction\nMachine Learning\n\n**Customers**\n\nBI Reporting\n& Dashboarding\n\n**Partners**\n\nReal-time Applications\n\n\nAny Cloud\n\n\n_Structured real-time_\n\n\n-----\n\n**Reference Architecture**\n\nAt the core of the Databricks Lakehouse for Retail is technology that enables retailers to avoid the trade-\n\noffs between speed and accuracy. Technology such as Delta Lake enables the Lakehouse — a new paradigm\n\nthat combines the best elements of data warehouses and data lakes — to directly address these factors by\n\nenabling you to unify all of your data — structured and unstructured, batch and real-time — in one centrally\n\nmanaged and governed location. 
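As an illustration of that pattern, the sketch below streams point-of-sale files into a Bronze Delta table with Auto Loader. It assumes a Databricks notebook where `spark` is predefined; the source path, checkpoint location and table name are hypothetical.

```python
# A minimal sketch of the streaming-ingestion pattern described above. The
# source path, checkpoint location and table name are illustrative only.
raw_pos_events = (
    spark.readStream.format("cloudFiles")          # Auto Loader
    .option("cloudFiles.format", "json")
    .load("/Volumes/retail/pos/raw_events/")       # landing zone for POS data
)

# Continuously append the raw stream into a Bronze Delta table; Silver and
# Gold tables can be derived downstream with the same streaming pattern.
(
    raw_pos_events.writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/retail/pos/_checkpoints/bronze")
    .outputMode("append")
    .toTable("retail.pos_bronze")
)
```

Because the Bronze, Silver and Gold layers can all be kept up to date with this same readStream/writeStream pattern, reporting and ML features downstream see fresh data rather than nightly batches.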
Once in the Lakehouse, e-commerce systems, reporting users, analysts,\n\ndata scientists and data engineers can all leverage this information to serve models for applications and\n\npower real-time reporting, advanced analytics, large-scale forecasting models and more.\n\n**EDGE** **HYBRID** **CLOUD**\n\n\n\nREST Model Serving\n\n|Machine Learning Operations Tracking Registery|RES|\n|---|---|\n||Application|\n\n\n\nReplication\n\n\nAutomatic DBs\n\n|Col1|Real-tim|\n|---|---|\n|||\n\n\nRaw Data\n\n(Bronze Table)\n\n\nClean Data\n\n(Silver Table)\n\n\nRefined Data\n\n(Gold Table)\n\n\nBusiness\nApplications\n\nPower BI\n\n\nBatch\n\n\n-----\n\n###### How it works\n\nThe Lakehouse for Retail was built from the ground up to solve the needs of modern retail. It blends\n\nsimplicity, flexibility and lower cost of ownership with best-in-industry performance. The result is\n\ndifferentiated capabilities that help retailers win.\n\nRobust data Time-sensitive machine\nData in real time Use all of your data Real-time reporting\nmanagement learning\n\n\n**Limited.** EDWs support the\n\nmanagement of structured\n\ndata.\n\n**No.** Data lakes lack\n\nenterprise-class data\n\nmanagement tools.\n\n**Yes.** Delta and Unity\n\nCatalog offer native\n\ndata management and\n\ngovernance of all data types.\n\n\n**No.** EDWs offer quick access\n\nto reports on old data.\n\n**No.** Data lakes were not\n\ndesigned for reporting, let\n\nalone real-time reporting.\n\n**No.** Data lakes are able to\n\nsupport large analytics,\n\nbut lack the ability to meet\n\nbusiness SLAs.\n\n\n**No.** EDWs must extract data\n\nand send it to a third party\n\nfor machine learning.\n\n**Yes.** Data views can be\n\nmaterialized, enabling front-\n\nline employees with real-\n\ntime data.\n\n**Yes.** The Lakehouse can\n\nscale to process the most\n\ndemanding predictions\n\nwithin business SLAs.\n\n\n**No.** Data warehouses are\n\nbatch oriented, restricting\n\ndata updates to hours or days.\n\n**No.** Data lakes are batch\n\noriented.\n\n**Yes.** Support for real-time\n\nstreaming data.\n\n\n**No.** Data warehouses have\n\nvery limited support for\n\nunstructured data.\n\n**Yes.** Data lakes offer support\n\nfor all types of data.\n\n**Yes.** Supports all types of\n\ndata in a centrally managed\n\nplatform.\n\n\n**LEGACY DATA**\n\n**WAREHOUSE**\n\n\n**LEGACY DATA**\n\n\n**DATA LAKES**\n\n**(HADOOP)**\n\n\n**DATA LAKES**\n\n\n**ROBUST**\n\n**DATA**\n\n\n**ROBUST**\n\n\n-----\n\n**\u0007Data in real time.** Retail operates in real time and so should your data. The Lakehouse offers support\n\nfor streaming data from clickstream, mobile applications, IoT sensors and even real-time e-commerce\n\nand point-of-sale data. And Delta Lake enables this world-record-leading performance while\n\nmaintaining support for ACID transactions.\n\n**\u0007Use all of your data.** Retailers are increasingly capturing data from mobile devices, video, images\n\nand a growing variety of other data sources. This data is extremely powerful in helping to improve our\n\nunderstanding of consumer behavior and operations. The Lakehouse for Retail enables companies\n\nto take full advantage of all types of data in a cost-efficient way, in a single unified lakehouse\n\narchitecture.\n\n**\u0007Robust data management and governance** that companies need to protect sensitive data, but\n\nwas lacking from earlier big data systems. 
The Lakehouse offers transactional integrity with ACID\n\ncompliance, detailed data security, schema enforcement, time travel, data lineage and more. Moving\n\nto a modern data architecture does not require sacrificing enterprise maturity.\n\n**\u0007High-performance predictive analytics.** Machine learning models, such as demand forecasting\n\nor recommendation engines, can be run in hours without compromising accuracy. The Lakehouse\n\ncan scale to support tens of millions of predictions in tight windows, unlocking critical and time-\n\nsensitive analytics such as allocating inventory, optimizing load tenders and logistics, calculating item\n\navailability and out-of-stocks, and delivering highly personalized predictions.\n\n**Value with Databricks**\n\nBy using Databricks to build and support your lakehouse, you can empower your business with even more\n\nspeed, agility and cost savings. The flexibility of the Databricks Lakehouse Platform means that you can\n\nstart with the use case that will have the most impact on your business. As you implement the pattern, you\n\nwill find that you’re able to tackle use cases quicker and more easily than before. To get you started, this\n\nguidebook contains the use cases we most commonly see across the Retail and Consumer Goods industry.\n\n\n-----\n\n**CHAPTER 3**\n### Use Case:\n Real-Time Supply\n Chain Data\n\n\n**Overview**\n\nAs companies see a surge in demand from e-commerce and delivery services, and seek increasing\n\nefficiencies with plant or distribution centers, real-time data is becoming a key part of the technical\n\nroadmap. Real-time supply chain data allows customers to deal with problems as they happen and before\n\nitems are sent downstream or shipped to consumers, which is the first step in enabling a supply chain\n\ncontrol tower.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nManufacturers Distributors Logistics Restaurants\n\n\n**Challenges**\n\n**\u0007Batch data** — existing data warehouses bring data in batch, creating a lag between when something is\n\nhappening and when a customer can act on it\n\n**\u0007Complex analysis in real time** — if ingesting data in real time wasn’t a big enough challenge, companies\n\nhave the added pressure to take immediate action on it\n\n**\u0007Complex maintenance** — ETL tools to bring data in batch are often complex and costly to maintain\n\n\n-----\n\n**Value with the Databricks Lakehouse**\n\nDatabricks has enabled real-time streaming of supply chain data across a variety of customers for specific\n\nplant operations or as part of a supply chain control tower.\n\n**\u0007Near real-time ingestion and visibility of data** — one customer experienced a 48,000%\n\nimprovement in speed to data, with greater reliability\n\n**\u0007Cost-neutral** — because Delta’s efficient engine requires smaller instances, many customers report\n\nthat they were able to move from batch to real-time at neutral costs\n\n**�Simplified architecture and maintenance** — leveraging Delta for ingestion streamlines the pattern for\n\nreal-time data ingestions. 
Customers frequently report that the amount of code required to support\n\nstreaming ingestion is 50% less than previous solutions.\n\n**\u0007Immediate enablement of additional use cases** — customers can now prevent problems as they’re\n\nhappening, predict and prevent issues, and even gain days on major changes such as production\n\nschedules between shifts\n\n**Solution overview**\n\nDatabricks allows for both streaming and batch data sets to be ingested and made available to enable\n\nreal-time supply chain use cases. Delta Lake simplifies the change data capture process while providing\n\nACID transactions and scalable metadata handling, and unifying streaming and batch data processing. And\n\nDelta Lake supports versioning and enables rollbacks, full historical audit trails, and reproducible machine\n\nlearning experiments.\n\n**Typical use case data sources include:**\n\nSupply planning, procurement, manufacturing execution, warehousing, order fulfillment, shop floor/historian\n\ndata, IoT sensor, transportation management\n\n\n-----\n\n**CASE STUDY**\n\nWith Databricks, Gousto was able to implement real-time visibility in their supply chain. Gousto moved from\n\ndaily batch updates to near real-time streaming data, utilizing Auto Loader and Delta Lake. The platform\n\nprovided by Databricks has allowed Gousto to respond to increased demand during the coronavirus\n\noutbreak by providing real-time insight into performance on the factory picking lines.\n\n**CASE STUDY**\n\nAs a young e-commerce company, ButcherBox needed to act nimbly to make the most of the data from its\n\nhundreds of thousands of subscribers. With Databricks Lakehouse, the company could pull 18 billion rows of\n\ndata in under three minutes.\n\nNow, ButcherBox has a near real-time understanding of its customers, and can also act proactively to\n\naddress any logistical and delivery issues.\n\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n-----\n\n**CHAPTER 4**\n### Use Case: Truck Monitoring\n\n\nWith many industries still feeling the effects of supply chain issues, being able to increase the efficiency\n\nof trucks on the road can make all the difference in getting goods into the hands of customers in a timely\n\nmanner. 
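To make the real-time ingestion pattern behind these supply chain and fleet use cases concrete, here is a minimal PySpark sketch that streams raw events into a Bronze Delta table with Auto Loader, in the spirit of the solution overview above. The volume paths and table name are hypothetical placeholders, and `spark` is the ambient SparkSession in a Databricks notebook.

```python
# Minimal sketch (Databricks): stream raw supply chain events into a Bronze Delta table.
# Paths and table names below are hypothetical placeholders.
from pyspark.sql import functions as F

raw_events = (
    spark.readStream.format("cloudFiles")            # Auto Loader incremental file ingestion
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/main/supply_chain/checkpoints/schema")
    .load("/Volumes/main/supply_chain/raw_events/")
)

(
    raw_events
    .withColumn("ingested_at", F.current_timestamp())  # capture arrival time for latency tracking
    .writeStream.format("delta")
    .option("checkpointLocation", "/Volumes/main/supply_chain/checkpoints/bronze_events")
    .trigger(availableNow=True)                       # incremental batch run; drop for continuous streaming
    .toTable("main.supply_chain.bronze_events")       # Bronze table; Silver/Gold refinement follows the same pattern
)
```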
Real-time data is making it easier for companies to get immediate insights into truck manufacturing\n\ndelays, maintenance issues, supply chain issues, delivery schedules and driver safety.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics\n\n\n**Challenges**\n\n**\u0007** Siloed data makes it difficult to get a comprehensive understanding of fleet performance\n\n\u0007A lack of real-time insights can delay responses to manufacturing or supply chain issues\n\n\u0007Not having effective automation and AI increases the risk of human error, which can result in vehicular\n\naccidents or shipment delays\n\n\n-----\n\n**Value with the Databricks Lakehouse**\n\nDatabricks empowers companies to get real-time insights into their fleet performance, from manufacturing\n\nto delivery.\n\n**Near real-time insights** — the greater speed to data means a quicker response to issues and the\n\nability to monitor driver safety more immediately\n\n**Ability to scale** — although consumer demands are constantly evolving, Databricks can handle fleet\n\nexpansion without sacrificing data quality and speed\n\n**Optimizing with AI/ML** — implementing AI and ML models can lead to more effective route monitoring,\n\nproactive maintenance and reduced risk of accidents\n\n**Solution overview**\n\nDatabricks enables better truck monitoring, quickly ingesting data on everything from vehicle manufacturing\n\nto route optimization. This results in a more complete and real-time view of a company’s fleet, and these\n\nanalytics provide companies with the tools they need to scale and improve their operations.\n\n**Typical use case data sources include:**\n\nSupply planning, transportation management, manufacturing, predictive maintenance\n\n**CASE STUDY**\n\nWith 94% of vehicular accidents attributed to human error, Embark used the Databricks Lakehouse Platform\n\nto unlock thousands of hours of recorded data from its trucks and then collaboratively analyze that data\n\nvia dashboards. This has resulted in more efficient ML model training as Embark speeds toward fully\n\nautonomous trucks.\n\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n-----\n\n**CHAPTER 5**\n### Use Case: Inventory Allocation\n\n\n**Overview**\n\nReplenishment planning is the process of determining what needs to go where. It is used by replenishment\n\nplanning, distributors and consumer goods companies performing vendor-managed replenishment (VMR) or\n\nvendor-managed inventory (VMI) to make daily decisions on which product needs to be sent to which store\n\nand on what day.\n\nReplenishment is challenging for companies because it deals with rapidly changing data and the need to\n\nmake complex decisions on that data in narrow service windows. Retailers need to stream in real-time sales\n\ndata to signal how much of a product has been sold in order. Inaccurate sales data leads to an insufficient\n\nnumber of products being sent to stores. This results in lost sales and low customer satisfaction.\n\nInventory allocation is a process that might be performed multiple times a day during peak seasons, or\n\ndaily during slower seasons. 
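As a rough illustration of the fine-grained, store/item-level scale this use case calls for, the sketch below forecasts each (store, item) combination independently with Spark's grouped pandas API so the work parallelizes across the cluster. The table and column names and the naive moving-average baseline are illustrative assumptions, standing in for a real forecasting model.

```python
# Illustrative sketch only: score demand independently per (store, item) in parallel.
# Table/column names and the naive moving-average "model" are hypothetical stand-ins.
import pandas as pd

def forecast_group(pdf: pd.DataFrame) -> pd.DataFrame:
    pdf = pdf.sort_values("date")
    # Naive baseline: average of the trailing 28 days of sales.
    prediction = pdf["units_sold"].tail(28).mean()
    return pd.DataFrame({
        "store_id": [pdf["store_id"].iloc[0]],
        "item_id": [pdf["item_id"].iloc[0]],
        "forecast_units": [float(prediction)],
    })

history = spark.table("main.retail.daily_store_item_sales")  # hypothetical sales history table

forecasts = (
    history
    .groupBy("store_id", "item_id")
    .applyInPandas(forecast_group,
                   schema="store_id string, item_id string, forecast_units double")
)

forecasts.write.mode("overwrite").saveAsTable("main.retail.store_item_forecasts")
```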
Companies need the ability to scale to perform tens of millions of predictions\n\nmultiple times a day — on demand and dynamically — during peak season without paying a premium for\n\nthis capability throughout the year.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Restaurants\n\n\n-----\n\n**Challenges**\n\n\u0007Customers must complete tens of millions of inventory allocation predictions within tight time windows.\n\nThis information is used to determine which products get put on trucks and go to specific stores.\n\n\u0007Traditional inventory allocation rules cause trade-offs in accuracy in order to calculate all possibilities in\n\nthe service windows\n\n\u0007Legacy tools have rudimentary capabilities and have limited ability to consider flavors, sizes and other\n\nattributes that may be more or less popular by store\n\n**Value with Databricks**\n\nCustomers are able to complete inventory allocation models within SLAs with no trade-off for accuracy.\n\n\u0007 **Speed —** on average, customers moving to Databricks for demand forecasting report a double-digit\n\nimprovement in forecast accuracy\n\n\u0007 **Ability to scale** and perform fine-grained (day, store, item) level allocations\n\n\u0007 **Provide more robust allocations** by incorporating causal factors that may increase demand, or include\n\ninformation on flavors or apparel sizes for specific stores\n\n**Solution overview**\n\nThe objective of inventory allocation is to quickly determine when to distribute items and where — from\n\nwarehouses and distribution centers to stores. Inventory allocation begins by looking at the consumption\n\nrate of products, the available inventory and the shipping schedules, and then using this information to\n\ncreate an optimized manifest of what items should be carried on which trucks, at what point, and at what\n\ntime. This becomes the plan for route accounting systems that arrange deliveries.\n\nInventory allocation also deals with trade-offs related to scarcity of items. If an item has not been available\n\nin a store for a long time, that store may receive heightened priority for the item in the allocation.\n\n\n-----\n\nHOW TO GET STARTED\n\n\n**Typical use case data sources include:** point of sale, digital sales, replenishment data, modeled safety\n\nstock, promotions data, weather\n\n**View our webinar covering demand forecasting with Starbucks and then read our blog about**\n\n**demand forecasting.**\n\n**[Demand forecasting with causal factors.](https://www.databricks.com/blog/2020/03/26/new-methods-for-improving-supply-chain-demand-forecasting.html)**\n\nOur most popular notebook at Databricks. This blog walks you through the business and technical\n\nchallenges of performing demand forecasting and explains how we approached solving it.\n\n**[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)**\n\nVideo and Q&A from our webinar with Starbucks.\n\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n**CASE STUDY**\n\nH&M turned to the Databricks Lakehouse Platform to simplify its infrastructure management, enable\n\nperformant data pipelines at scale, and simplify the machine learning lifecycle. 
The result was a more data-\n\ndriven organization that could better forecast operations to streamline costs and boost revenue.\n\n**CASE STUDY**\n\nEdmunds is on a mission to make car shopping an easy experience for all. With the Databricks Lakehouse\n\nPlatform, they are able to simplify access to their disparate data sources and build ML models that make\n\npredictions off data streams. With real-time insights, they can ensure that the inventory of vehicle listings\n\non their website is accurate and up to date, improving overall customer satisfaction.\n\n\n-----\n\n**CHAPTER 6**\n### Use Case: Point of Sale\n and Clickstream\n\n\n**Overview**\n\nDisruptions in the supply chain — from reduced product supply and diminished warehouse capacity —\n\ncoupled with rapidly shifting consumer expectations for seamless omnichannel experiences are driving\n\nretailers to rethink how they use data to manage their operations. Historically, point-of-sale (POS) systems\n\nrecorded all in-store transactions, but were traditionally kept in a system that was physically in the store.\n\nThis would result in a delay in actionable insights. And now with consumers increasingly shopping online, it’s\n\ncrucial to not only collect and analyze that clickstream data quickly, but also unify it with POS data to get a\n\ncomplete and real-time snapshot of each customer’s shopping behavior.\n\nNear real-time availability of information means that retailers can continuously update their estimates of\n\nitem availability. No longer is the business managing operations based on their knowledge of inventory\n\nstates as they were a day prior, but instead is taking actions based on their knowledge of inventory states as\n\nthey are now.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce\n\n**Challenges**\n\n\u0007Retailers with legacy POS systems in their brick-and-mortar stores are working with siloed and\n\nincomplete sales data\n\n\u0007Both POS and clickstream data need to be unified and ingested in real time\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\n\n**Value with Databricks**\n\nDatabricks brings POS and clickstream data together for a unified data source that leads to real-time\n\ninsights and a clearer understanding of customer behavior.\n\n\u0007 **Single source of truth** — a centralized, cloud-based POS system means it can be merged with\n\nclickstream data\n\n\u0007 **Near real-time insights** — the greater speed to data means businesses get the latest insights into\n\ncustomer purchasing behaviors and trends\n\n\nto have them perform a free proof-of-\n\n\nconcept with your real-time data.\n\n\n\u0007 **Scalability** — companies can scale with Databricks to handle data from countless transactions\n\n\n-----\n\n**CHAPTER 7**\n### Use Case: On-Shelf Availability\n\n\n**Overview**\n\nEnsuring the availability of a product on shelf is the single largest problem in retail. Retailers globally are\n\nmissing out on nearly $1 trillion in sales because they don’t have on hand what customers want to buy in\n\ntheir stores. Shoppers encounter out-of-stock scenarios as often as one in three shopping trips. All told,\n\nworldwide, shoppers experience $984 billion worth of out-of-stocks, $144.9 billion in North America alone,\n\naccording to industry research firm IHL.\n\nIn the past, if a customer faced an out-of-stock, they would most likely select a substitute item. The cost\n\nof going to another store prevented switching. 
Today, e-commerce loyalty members, such as those who\n\nbelong to Walmart+ and Amazon Prime, are 52% more likely than other consumers to purchase out-of-stock\n\nitems online. It is believed that a quarter of Amazon’s retail revenue comes from customers who first tried to\n\nbuy a product in-store. In all, an estimated $36 billion is lost to brick-and-mortar competition, and another\n\n$34.8 billion is lost to Amazon or another e-retailer, according to IHL.\n\nOn-shelf availability takes on a different meaning in pure e-commerce applications. An item can be\n\nconsidered in stock when it is actually in a current customer’s basket. If another customer places the same\n\nitem in their basket, there is the possibility that the first customer will purchase the last available item\n\nbefore the second customer. This problem is exacerbated by retailers who use stores to keep inventory. In\n\nthese situations, customers may order an item that is picked for delivery at a much later time. The window\n\nbetween ordering and picking creates the probability of out-of-stocks.\n\nOn-shelf availability predicts the depletion of inventory by item, factors in safety stock levels and\n\nreplenishment points, and generates a signal that suggests an item may be out of stock. This information is\n\nused to generate alerts to retail staff, distributors, brokers and consumer goods companies. Every day, tens\n\nof thousands of people around the world do work that is generated by these algorithms.\n\nThe sheer volume of data used to calculate on-shelf availability prevents most companies from analyzing\n\nall of their products. Companies have between midnight and 4 AM to collect all of the needed information\n\nand run these models, which is beyond the capability of legacy data systems. Instead, companies choose\n\nthe priority categories or products to analyze, which means a significant percentage of their unavailable\n\nproducts will not be proactively addressed.\n\n\n-----\n\nOne of the biggest challenges with on-shelf availability is determining when an item is actually out of stock.\n\nWhile some retailers are investing in computer vision and robots, and others employ the use of people to\n\nmanually survey item availability, most retailers default to a signal of determining when an item has not been\n\nscanned in an acceptable time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nE-commerce Direct to\nConsumer\n\n\n**Challenges**\n\nThe biggest challenge to generating on-shelf availability alerts is time. Companies may receive their final sales\n\ndata from the preceding day shortly after midnight. They have less than 4 hours from that point to ingest large\n\nvolumes of t-log data and calculate probabilities of item availability. Most firms are encumbered by a data\n\nwarehouse process that only releases data after it has been ingested and aggregates have been calculated, a\n\nprocess that can require multiple hours per night.\n\nFor this reason, most firms make sacrifices in their analysis. They may alternate categories they analyze by\n\ndifferent days, prioritize only high-impact SKUs, or run analysis at higher-level and less-accurate aggregate\n\nlevels. 
Among the challenges:\n\n\u0007Processing large volumes of highly detailed data and running millions of models in a narrow time window\n\n\u0007Companies are spending hundreds of thousands of dollars annually to generate these daily alerts for a\n\nfew categories\n\n\u0007Dealing with false positives and negatives in predictions\n\nDistributing information quickly and efficiently to internal systems and external partners\n\n\n-----\n\n**Value with Databricks**\n\nDatabricks enables customers to generate on-shelf availability (OSA) predictions at scale with no\n\ncompromises.\n\n**\u0007** Delta removes the data processing bottleneck. Delta enables retailers to stream in real time or to batch\n\nprocess large volumes of highly detailed and frequently changing point-of-sale transaction data.\n\n**\u0007** Easily scale to process all OSA predictions within tight service windows using Apache Spark TM\n\n**\u0007** Manage features and localize models with additional causal data to improve accuracy with MLflow\n\n**\u0007** Easily deploy information via streams, through API for mobile applications or partners, or to Delta for\n\nreporting\n\n**\u0007** Enable retailers to monetize their data by directly licensing OSA alerts\n\n**Solution overview**\n\nDatabricks enables companies to perform on-shelf availability analysis without making compromises to the\n\nbreadth or quality of predictions.\n\nIt begins with Delta Lake — a nearly perfect platform for ingesting and managing t-log data. One of the\n\nbiggest challenges in t-log data is the frequent number of changes to a transaction that can occur within\n\na data. Delta Lake simplifies this with transaction awareness using a transaction log, and creates additional\n\nmetadata for easier retrieval. Data is made available in a fraction of the time needed in data warehouse-\n\nbased systems. This is why the largest retailers in the world are using Delta Lake for processing t-log data.\n\nOnce data is available, users need to generate predictions about item availability on the shelf. With its\n\nextremely performant engine and the ability to distribute computation across countless nodes, Spark\n\nprovides the perfect platform for calculating out-of-stocks. Customers no longer need to run in aggregate\n\nor against a subset of data.\n\n\n-----\n\n**HOW TO GET STARTED**\n\n[Solution Accelerator:](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\n[On-Shelf Availability](https://www.databricks.com/solutions/accelerators/on-shelf-availability)\n\nIn this solution, we show how the\n\nDatabricks Lakehouse Platform enables\n\nreal-time insights to rapidly respond\n\n\nAnd lastly, data is only useful if it drives better outcomes. Databricks can write the resulting data into Delta\n\nLake for further reporting, or to any downstream application via APIs, feeds or other integrations. 
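As a rough sketch of that pattern, the example below scores every store/item combination in parallel and lands the resulting alerts in Delta for downstream consumers. A simple hours-since-last-scan rule stands in for a real availability model, and the table and column names are hypothetical.

```python
# Rough sketch: flag potential on-shelf availability (OSA) issues for every store/item.
# An hours-since-last-scan threshold stands in for a real probabilistic model;
# table and column names are hypothetical.
from pyspark.sql import functions as F

tlog = spark.table("main.retail.silver_pos_transactions")   # cleaned t-log data in Delta

last_scan = (
    tlog.groupBy("store_id", "item_id")
    .agg(F.max("scan_ts").alias("last_scan_ts"))
)

osa_alerts = (
    last_scan
    .withColumn("hours_since_scan",
                (F.unix_timestamp(F.current_timestamp()) - F.unix_timestamp("last_scan_ts")) / 3600)
    .withColumn("possible_out_of_stock", F.col("hours_since_scan") > 24)  # illustrative threshold
    .filter("possible_out_of_stock")
)

# Land alerts in Delta so reporting tools, APIs or partner feeds can pick them up.
osa_alerts.write.mode("overwrite").saveAsTable("main.retail.osa_alerts")
```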
Users can\n\nfeed their predictive alerts to downstream retail operations systems or even to external partners within the\n\ntightest service windows, and in enough time to drive actions on that day.\n\n**Typical use case data sources include:** point-of-sale data, replenishment data, safety stock calculations,\n\nmanual inventory data (optional), robotic or computer vision inventory data (optional)\n\n**CASE STUDY**\n\nReckitt distributes its products to millions of consumers in over 60 countries, which was causing the\n\norganization to struggle with the complexity of forecast demand, especially with large volumes of different\n\ntypes of data across many disjointed pipelines. Thanks to the Databricks Lakehouse Platform, Reckitt now\n\nuses predictive analytics, product placement and business forecasting to better support neighborhood\n\ngrocery stores.\n\n\nto demand, drive more sales by\n\nensuring stock is available on shelf, and\n\nscale out your forecasting models to\n\naccommodate any size operation.\n\n\n-----\n\n**CHAPTER 8**\n### Use Case: Customer and Vehicle Identification\n\n\n**Overview**\n\nCOVID-19 led to increased consumer demand for curbside pickup, drive-through and touchless payment\n\noptions. Retailers that were able to implement these new services have been able to differentiate overall\n\ncustomer experiences and mitigate catastrophic hits on revenue levels.\n\nFor retailers to create a seamless contactless experience for customers, they need real-time data to\n\nknow when a customer has arrived and where they’re located, as well as provide updates throughout the\n\npickup journey. And through the use of computer vision, they can capture that data by employing optical\n\nrecognition on images to read vehicle license plates.\n\nRetailers can also use information captured from license plates to make recommendations on buying\n\npatterns. Looking ahead, facial recognition also has the potential to provide retailers with valuable\n\ninformation to better serve their customers in real time.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDrive-Through\nFood Retailers\n\n\n**Challenges**\n\n\u0007Ineffective data processing can lead to suboptimal order preparation timing\n\n\u0007Without real-time data, it can be difficult to provide customers with live updates on their order status\n\n\n-----\n\n**Value with Databricks**\n\nDatabricks makes it possible to not only identify customers and vehicles in real time but also provide real-\n\ntime communications throughout the entire shopping and curbside or drive-through experience.\n\n\u0007 **Near real-time insights** — the greater speed to data means retailers can get the right order\n\npreparation timing\n\n\u0007 **Recommendations** — being able to quickly access and refer to data from previous visits will ensure\n\neach subsequent visit is equally as or more seamless than the last\n\n\u0007 **Optimizing with AI/ML** — implementing AI and ML models can lead to more effective geofencing,\n\nvehicle identification and order prediction\n\n**CASE STUDY**\n\n**CASE STUDY**\n\n\n-----\n\n**CHAPTER 9**\n### Use Case: Recommendation Engines\n\n\n**Overview**\n\nCustomers that feel understood by a retailer are more likely to spend more per purchase, purchase more\n\nfrequently with that retailer, and deliver higher profitability per customer. 
The way that retailers achieve this\n\nis by recommending products and services that align with customer needs.\n\nProviding an experience that makes customers feel understood helps retailers stand out from the crowd\n\nof mass merchants and build loyalty. This was true before COVID, but shifting consumer preferences make\n\nthis more critical than ever for retail organizations. With research showing the cost of customer acquisition\n\nis as much as five times as retaining existing ones, organizations looking to succeed in the new normal must\n\ncontinue to build deeper connections with existing customers in order to retain a solid consumer base.\n\nThere is no shortage of options and incentives for today’s consumers to rethink long-established patterns\n\nof spending.\n\nRecommendation engines are used to create personalized experiences for users across retail channels.\n\nThese recommendations are generated based on the data collected from purchases, items interacted\n\nwith, users’ behavior across physical and digital channels, and other data such as from customer service\n\ninteractions and reviews. Leveraging a Customer 360 architecture that collects all user clickstream and\n\nbehavioral data, marketers are able to create recommendations that are integrated with other business\n\nobjectives such as highlighting items that are on promotion or product availability.\n\nCreating recommendations is not a monolithic activity. Recommendation engines are used to personalize\n\nthe customer experience in every possible area of consumer engagement, from proactive notifications and\n\noffers, to landing page optimization, suggested products, automated shipment recommendations, cross-sell\n\nand upsell, and even suggestions for complementary items after the purchase.\n\n\n-----\n\n**R E L E V A N T F O R**\n\n\nRetail E-commerce Direct to\nConsumer\n\n\nMedia Telecom Financial Services\n(any B2B or B2C\ncompany)\n\n\n**Challenges**\n\nRecommendation engines are very difficult to do well. Many companies use off-the-shelf recommenders,\n\nbut traditional off-the-shelf systems suffer from high rates of inaccuracy. In our analysis, we found general\n\nrecommenders with 29% variance, meaning that of every 10 recommendations delivered, 3 would be\n\nirrelevant.\n\n**Massive volumes of highly detailed and frequently changing data.** Recommendation accuracy\n\nis improved by having recent data, and yet most systems struggle to handle the large volumes of\n\ninformation involved.\n\n**Creating a 360 view of the customer.** Identity and being able to stitch together all customer\n\ntouchpoints in one place are critical to enabling this use case. More data, including transaction and\n\nclickstream data, is critical for driving accuracy and precision in messaging.\n\n**Processing speed.** Retailers need to be able to frequently refresh models based on constantly\n\nchanging dynamics, and deliver real-time recommendations via APIs.\n\n**Automation.** This is an “always-on” use case where automation is essential for scalability and\n\nresponsiveness based on frequent model updates.\n\n\n-----\n\nMany firms choose to use recommender systems from Amazon or Google. Using these systems trains\n\nthe general recommendation engine in a way that helps competitors improve the accuracy of their own\n\nrecommendations.\n\n**Value with Databricks**\n\nRecommendations are one of the most critical capabilities that a retailer maintains. 
This is a capability that\n\nretailers must own, and Databricks provides a solid platform for enabling this.\n\nUsing Databricks as the foundation for their Customer 360 architecture to deliver omnichannel\n\npersonalization, sample value metrics from a media agency include:\n\n**200% ROI for 70% of retailers** engaging in advanced personalization\n\n**10% improvement** in conversions\n\n**35% improvement** in purchase frequency\n\n**37% improvement** in customer lifetime value\n\n**Solution overview**\n\nRecommendations are only as good as the data that powers them. Delta Lake provides the best platform for\n\ncapturing and managing huge volumes of highly atomic and frequently changing data. It allows organizations\n\nto combine various sources of data in a timely and efficient manner, from transactions, demographics and\n\npreference information across products, to clickstream, digital journey and marketing analytics data to bring\n\na 360 view of customer interactions to enable omnichannel personalization.\n\nBy identifying changes in user behavior or engagement, retailers are able to detect early signals that\n\nindicate a propensity to buy or a change in preferences, and recommend products and services that will\n\nkeep consumers engaged.\n\n\n-----\n\n**Typical use case data sources include:** Customer 360 data, CRM, loyalty data, transaction data,\n\nclickstream data, mobile data:\n\n**Engagement data** — transaction log data, clickstream data, promotion interaction\n\n**Identity** — loyalty data, person ID, device ID, email, IP address, name, gender, income, presence of\n\nchildren, location\n\n**User lifecycle** — subscription status, payment history, cost of acquisition, lifetime value, propensity\n\nto churn\n\n**CASE STUDY**\n\nFor Wehkamp to provide the best shopping experience for their customers, they turned to Databricks\n\nfor help with their data analytics and machine learning needs, resulting in a highly engaging web shop\n\npersonalized to each of their customers.\n\n**CASE STUDY**\n\nColumbia’s legacy ETL was unable to support batch and real-time use cases at scale. After migrating to\n\nDatabricks, the company is now able to more efficiently and reliably work with its data, resulting in smarter\n\nbusiness decisions.\n\n**CASE STUDY**\n\nPandora wanted to drive stronger online engagement with their customers, so they used the Databricks\n\nLakehouse Platform to create more personalized experiences and boost both click-to-open rates and\n\nquarterly revenue.\n\n\nHOW TO GET STARTED\n\nDatabricks has created [four](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n\n[Recommendation Engine accelerators,](https://www.databricks.com/solutions/accelerators/recommendation-engines)\n\nwith content-based and collaborative\n\nfilter methods, and both item-\n\nand user-based analysis. These\n\naccelerators have been further refined\n\nto be highly performant to enable\n\nfrequent retraining of models.\n\nTo begin working on recommendation\n\nengines, contact your Databricks\n\naccount team.\n\n\n-----\n\n**CHAPTER 10**\n### Use Case: Perpetual Inventory\n\n\n**Overview**\n\nWith the rapid adoption of digital channels for retail, staying on top of your inventory is crucial to meeting\n\ncustomer demand. As a result, the periodic inventory system is now outdated — instead, using a perpetual\n\ninventory model allows businesses to perform immediate and real-time tracking of sales and inventory\n\nlevels. 
This has the added benefit of reducing labor costs and human error, ensuring that you always have an\n\naccurate overview of your inventory and can better forecast demand to avoid costly stockouts.\n\nThe key to building a perpetual inventory system is real-time data. By capturing real-time transaction\n\nrecords related to sold inventory, retailers can make smarter inventory decisions that streamline operations\n\nand lower overall costs.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Supply Chain\n\n\nInventory\nManagement\n\n\n**Challenges**\n\n**\u0007** Companies need to scale to handle ever-increasing inventory and the data associated with the products\n\n**\u0007** Data needs to be ingested and then processed in real time (or near real-time) to provide a truly accurate\n\nview of inventory\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n**Value with Databricks**\n\nDatabricks enables real-time inventory updates, giving businesses the insights they need to properly\n\nmanage inventory and to forecast more accurately.\n\n**\u0007Near real-time insights** — the greater speed to data means inventory is automatically updated with\n\nthe latest sales data\n\n**\u0007Detailed records** — with all inventory updates and movements being tracked as they happen,\n\ncompanies know they’re getting the most accurate information at any point\n\n**\u0007Optimizing with AI/ML** — using AI and ML can help with forecasting demand and reducing inventory\n\nmanagement costs\n\n\n-----\n\n**CHAPTER 11**\n### Use Case: Automated\n Replenishments\n\n\n**Overview**\n\nCustomers favor convenience more than ever when it comes to their goods, and automated replenishments\n\nhelp meet that need. Whether it’s through a connected device or smartphone app, real-time data plays a\n\nkey role in ensuring consumers get a refill automatically delivered at the right time.\n\nOn the manufacturing side, this real-time data can also help with vendor-managed replenishment (VMR),\n\nreducing the time needed to forecast, order and receive thousands of items.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDistributors Logistics Direct to\nCustomer\n\n\n**Challenges**\n\n**\u0007** Being able to ingest large amounts of data quickly is crucial to actually fulfilling the\n\nreplenishment orders\n\nWith VMR, there may be a disconnect between the vendor and customer, resulting in a forecast\n\nfor replenishment even when the customer can’t fulfill that order\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team\n\nto have them perform a free proof-of-\n\nconcept with your real-time data.\n\n\n**Value with Databricks**\n\nDatabricks enables real-time inventory updates, giving businesses the insights they need to properly\n\nmanage inventory and to forecast more accurately.\n\n**\u0007Near real-time insights** — the greater speed to data means businesses can stay on top of\n\ncustomer needs\n\n**\u0007Scalability** — companies can scale with Databricks to handle thousands of SKUs, each with its own\n\nunique properties and expiry dates\n\n**\u0007Optimizing with AI/ML** — using AI and ML can lead to better forecasting and predictions\n\n\n-----\n\n**CHAPTER 12**\n### Use Case: Fresh Food Forecasting\n\n\n**Overview**\n\nFresh food typically accounts for up to 40% of revenue for grocers, and plays an important role in driving\n\nstore traffic. 
But fresh food is also incredibly complex to manage — prices can be volatile, there is a wide\n\nrange of suppliers to work with and the products expire, which creates significant amounts of waste.\n\nIn order to avoid losing significant revenue, businesses need to properly forecast when food is nearing its\n\nsell-by date, the current levels of customer demand (also taking into account seasonality), and the proper\n\ntiming for replenishing food stock. Being able to tap into real-time data is key to staying on top of the ever-\n\nchanging needs around fresh food.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce Distributors Logistics Restaurants\n\n**Challenges**\n\n**\u0007** Because of the perishable nature of fresh food, customers need to be able to ingest data quickly\n\nenough to conduct daily forecasting and daily replenishment\n\n**\u0007** Customers are running aggregate-level forecasts, which are less accurate than fine-grained forecasting\n\n**\u0007** Customers are forced to compromise on what they can analyze\n\n\n-----\n\nHOW TO GET STARTED\n\nContact your Databricks account team to get\n\nstarted with inventory allocation. Databricks\n\ndoes not have a Solution Accelerator.\n\nView our webinar covering demand forecasting\n\nwith Starbucks and then read our blog about\n\ndemand forecasting.\n\n[Fine-grained time series forecasting at scale.](https://www.databricks.com/blog/2021/04/06/fine-grained-time-series-forecasting-at-scale-with-facebook-prophet-and-apache-spark-updated-for-spark-3.html)\n\nThis blog details the importance of time series\n\nforecasting, walks through building a simple\n\nmodel to show the use of Facebook Prophet, and\n\nthen shows off the combination of Facebook\n\nProphet and Adobe Spark to scale to hundreds\n\nof models.\n\n[On-demand webinar for demand forecasting.](https://www.databricks.com/blog/2020/02/21/on-demand-webinar-granular-demand-forecasting-at-scale.html)\n\nVideo and Q&A from our webinar with Starbucks\n\n\n**Value with Databricks**\n\nCustomers average double-digit improvement in forecast accuracy, leading to a reduction in lost sales and in\nspoiled products, as well as lower inventory and handling costs.\n\n**\u0007Improved accuracy** — on average, customers moving to Databricks for demand forecasting report a\n\ndouble-digit improvement in forecast accuracy\n\n**�Ability to scale and perform fine-grained (day, store, item) level forecasts** — rapidly scale to tens of\n\nmillions of model iterations in narrow service windows. Companies need accurate demand forecasts in a\nfew hours.\n\n**\u0007Eliminate compromises on what to analyze** — customers do not need to select winners or losers among\n\nthe products they forecast. They can predict demand for all products as frequently as required.\n\n**Solution overview:**\n\nDatabricks is well suited to handling forecasting for fresh food at scale. Forecasting begins with the Databricks\nSolution Accelerator. It enables companies to rapidly build fine-grained forecasting of items — forecasting that\ncan be efficiently scaled to tens of millions of predictions in tight service windows.\n\n**Typical use case data sources include:** historic point-of-sale data, shipment data, promotions, pricing,\n\nexpiration dates and weather.\n\n**CASE STUDY**\n\nButcherBox faced the complex challenges of securing inventory with enough lead time, meeting highly variable\ncustomer order preferences and unpredictable customer sign-ups, and managing delivery logistics. 
With\nDatabricks, the company was able to create a predictive solution to adapt quickly and integrate tightly with the\nrest of its data estate.\n\n\non demand forecasting.\n\n**CASE STUDY**\n\nSam’s Club needed to build out an enterprise-scale data platform to handle the billions of transactions and\ntrillions of events going through the company. Find out how Databricks became a key component in the shift\nfrom on premises Hadoop clusters to a cloud based platform\n\n\n-----\n\n**CHAPTER 13**\n### Use Case: Propensity-to-Buy\n\n\n**Overview**\n\nCustomers often have repeatable purchase patterns that may not be noticed upon initial observation.\n\nWhile we know that commuting office workers are likely to purchase coffee at a coffee shop on weekday\n\nmornings, do we understand why they visit on Thursday afternoons? And more importantly, how do we\n\npredict these buying moments when customers are not in our stores?\n\nThe purpose of a propensity-to-buy model is to predict when a customer is predisposed to make a\n\npurchase and subsequently act on that information by engaging customers. Traditional propensity-to-buy\n\nmodels leveraged internal sales and loyalty data to identify patterns of consumption. These models are\n\nuseful, but are limited in understanding the full behavior of customers. More advanced propensity-to-buy\n\nmodels are now incorporating alternative data sets to identify trips to competing retailers, competitive scan\n\ndata from receipts, and causal data that helps to explain when and why customers make purchases.\n\nPropensity-to-buy models create a signal that is sent to downstream systems such as those for promotion\n\nmanagement, email and mobile alerts, recommendations and others.\n\n**R E L E V A N T F O R**\n\nRetail E-commerce Direct to\nConsumer\n\n\n-----\n\n**Challenges**\n\n**\u0007** Customers do not want to be inundated with messages from retailers. Companies need to limit their\n\noutreach to customers to avoid angering them.\n\nCompanies need to traverse and process vast sums of customer data and generate probabilities of\n\npurchase frequently\n\nCompanies need to look at external data that helps build a propensity-to-buy model that captures the full\n\nshare of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Value with Databricks**\n\n**\u0007** Databricks allows companies to efficiently traverse huge volumes of customer data over time, and\n\nefficiently synthesize this into data for analysis\n\n**\u0007** Companies need to traverse and process vast sums of customer data and generate probabilities of\n\npurchase frequency\n\n**\u0007** Companies need to look at external data that helps build a propensity-to-buy model that captures the full\n\nshare of the customer wallet. They need to quickly test and incorporate additional data that improves the\n\naccuracy of their models.\n\n**Solution overview:**\n\nPropensity-to-buy analytics determine the signals that indicate the probability a customer is in a buying\n\nmoment. Historic propensity models relied on sales data to identify buying patterns, but newer approaches are\n\nincorporating behavioral data. Proximity to a coffee shop might push a consumer over the threshold of a buying\n\nmoment. Traditional, batch-oriented operations are insufficient to solve this problem. 
If you wait until that night,\n\nor even later in the day you have lost the opportunity to act\n\n\n-----\n\n**HOW TO GET STARTED**\n\nTo begin working on propensity-to-\n\nbuy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n\nWith the propensity to buy, speed becomes a critical force in determining key inflection points. Databricks\n\nenables marketers to ingest data in real time and update probabilities. Lightweight queries can be automated\n\nto refresh models, and the resulting data can be fed automatically to downstream promotions, web or mobile\n\nsystems, where the consumer can be engaged.\n\nAs this data is streamed into Delta Lake, data teams can quickly capture the data for broader analysis.\n\nCalculating a propensity to buy requires traversing interactions that are episodic in nature, and span broad\n\nperiods of time. Delta Lake helps simplify this with scalable metadata handling, ACID transactions and data\n\nskipping. Delta Lake even manages schema evolution to provide users with flexibility as their needs evolve.\n\n**Typical use case data sources include:** point-of-sale data with tokens, loyalty data, e-commerce sales data,\n\nmobile application data, competitive scan or receipt data (optional), place of interest data (optional)\n\n\n-----\n\n**CHAPTER 14**\n### Use Case: Next Best Action\n\n\n**Overview**\n\nThe e-commerce boom over the last couple of years has given consumers ample choice for digital\n\nshopping options. If your business isn’t engaging customers at every point in their purchasing journey, you\n\nrisk losing them to a competitor. By applying AI/ML to automatically determine — in real time — the next\n\nbest action for customers, you can greatly increase your conversion rates.\n\n**R E L E V A N T F O R**\n\n\nRetail Consumer\nGoods\n\n\nDirect to\nConsumer\n\n\nE-commerce\n\n\n**Challenges**\n\nSiloed data makes it difficult to create an accurate and comprehensive profile of each customer,\n\nresulting in suboptimal recommendations for the next best action\n\nCompanies need to ingest large amounts of data in real time and then take action on it immediately\n\nMany businesses still struggle with training their ML models to properly determine the next best action\n\n(and self-optimize based on the results)\n\n\n-----\n\n**HOW TO GET STARTED**\n\nTo begin working on propensity-to-\n\nbuy, leverage our [Propensity Scoring](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n[Solution Accelerator](https://www.databricks.com/solutions/accelerators/propensity-scoring)\n\n\n**Value with Databricks:**\n\nDatabricks provides all the tools needed to **process large volumes of data and find the next best**\n\n**action** at any given point in the customer journey\n\n**Near real-time insights** — the greater speed to data means businesses can react immediately to\n\ncustomer actions\n\n**Single source of truth** — break down data silos by unifying all of a company’s customer data (including\n\nbasic information, transactional data, online behavior/purchase history, and more) to get a complete\n\ncustomer profile\n\n**Optimizing with AI/ML** — use AI to create self-optimizing ML models that are trained to find the best next\n\nstep for customers\n\n\n-----\n\n**CHAPTER 15**\n### Customers That Innovate With Databricks Lakehouse for Retail\n\n\nSome of the top retail and consumer packaged goods 
companies in the world turn to Databricks Lakehouse\n\nfor Retail to deliver real-time experiences to their customers.\n\nToday, data is at the core of every innovation in the retail and consumer packaged goods industry.\n\nDatabricks Lakehouse for Retail enables companies across every sector of retail and consumer goods to\n\nharness the power of real-time data and analytics to solve strategic challenges and deliver more engaging\n\nexperiences to customers.\n\nGet started with a free trial of Lakehouse for Retail and start building better data applications today.\n\n**[Start your free trial](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo at: [databricks.com/contact](http://databricks.com/contact\r)\n\n\n-----\n\n###### About Databricks\n\nDatabricks is the data and AI company. More than\n\n7,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune\n\n500 — rely on the Databricks Lakehouse Platform\n\nto unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around\n\nthe globe. Founded by the original creators of\n\nApache Spark™, Delta Lake and MLflow, Databricks\n\nis on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks\n\non [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "**eBook**\n\n# Accelerate Digital Transformation in Insurance With Data, Analytics and AI\n\n### Real-world use cases with Databricks Lakehouse\n\n\n-----\n\n## Contents\n\nIntroduction ................................................................................................................................................................................................................ **03**\n\nThree Trends Driving Transformation in Insurance .............................................................................................................................. **05**\n\nThe Need for Modern Data Infrastructure ................................................................................................................................................. **06**\n\nCommon Challenges Insurers Face Using Legacy Technology ...................................................................................................... **08**\n\nWhy Lakehouse for Insurance ............................................................................................................................................................................ **10**\n\nKey Use Cases for Insurance:\n\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\n\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... 
**15**\n\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\n\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\n\nGlobal Regulatory Impact in Insurance ......................................................................................................................................................... **18**\n\n**I N D U S T R Y S O L U T I O N S :** Get Started With Accelerators, Brickbuilders and Enablers ............................................................ **19**\n\nGet Started With Industry Solutions ............................................................................................................................................................. **20**\n\nConclusion ................................................................................................................................................................................................................... **26**\n\n\n-----\n\n## Introduction\n\nWith the rapid advancement of technology, rising consumer expectations, and strong competition between insuretechs and incumbents resulting\nfrom the dissolution of industry boundaries, it is clear that insurers must continue to accelerate their data transformation journey. Today, new\ninsights are derived as quickly as data can move in the insurance industry. This speed has increased as insurers collect vast amounts of customer\ndata from new sources, such as IoT sensors, smartwatches that provide insight into consumers’ health data, and online behavior that includes\nclickstream data, spending habits, and frequented websites. As a result, the data strategy has become even more complex.\n\nConsumers want stronger reassurance for what they value most: financial security and greater peace of mind.\nInsurers have always prided themselves on delivering such protection and security. However, customer needs\nhave changed, and insurers that move most swiftly to satisfy them will be in the best position to navigate\nchallenging times. The bottom line is that insurers must adapt to these changes and meet the evolving needs of\ntheir customers to remain competitive.\n\nData-driven insurers will seek opportunities to improve the customer experience, develop more sophisticated\npricing models, and increase their operational resilience. More than ever, the total cost of ownership (TCO) of\ndigital investments and enterprise data strategy has become a top priority for boards and senior executives\nin the insurance industry. So, what does this mean from a data and analytics perspective? It all comes down\nto having one reliable source of truth for data, which is derived from batch and streaming data, structured and\nunstructured data, from multiple clouds and jurisdictions.\n\n\nIn a regulated and risk-averse industry where data sharing was once seen as optional, it has now become\nfundamental. 
To compete in the digital economy, insurers need an open and secure approach to data sharing. Databricks Lakehouse for Insurance plays a critical role in helping insurance providers accelerate innovation and transform their businesses, resulting in significant operational efficiencies and improved customer experiences at a fraction of the cost of data warehouses. This eBook provides an in-depth exploration of key challenges and common use cases in the insurance industry. Most importantly, you will gain insight into how Databricks Lakehouse can unlock the true value of your data through practical Solution Accelerators and a wide range of partners available to assist you on your journey.\n\n**“The future of insurance will become increasingly data-driven, and analytics enabled.”**\n\n**[EY’s](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)** [“Five principles for the future of protection”](https://www.ey.com/en_us/insurance/five-principles-for-the-future-of-protection)\n\n-----\n\nThe Lakehouse reference architecture below illustrates a sample framework upon which insurers can build. Moving from left to right in the diagram, the first layer represents various data sources such as on-premises systems, web and mobile applications, IoT sensors, enterprise data warehouses, and third-party APIs. Data is then ingested through automated data pipelines, and processed within the Lakehouse platform across three layers (Bronze, Silver and Gold). These layers are responsible for data preparation, including ML model registry, centralized governance, workflow orchestration, and job scheduling. They ensure a compliant and secure infrastructure that sits atop the cloud layer (or multiple clouds), eliminating the need for data duplication. Finally, the transformed data is delivered as actionable insights and supports use cases such as automated reporting, business analytics, customer 360, and claims analytics. These use cases not only mitigate risk but also drive revenue.\n\n[Figure: Lakehouse for Financial Services reference architecture. Data sources (on-premises servers, web and mobile applications, IoT devices, enterprise data warehouses, third-party APIs and services, collaborative data sources) are ingested through automated data pipelines (batch or streaming) into the Bronze, Silver and Gold layers (raw entity data, curated feature sets, aggregated business views), supported by an ML model registry, centralized data governance, workflow orchestration, job scheduling, and productionized referenced data and models, and served as automated reporting, business analytics and interactive dashboards.]\n\n-----\n\n## Three Trends Driving Transformation in Insurance\n\nOver the next decade, technology-enabled insurance companies will bear little resemblance to today’s organizations. The following three trends are driving this transformation in the insurance industry:\n\n**The rapid emergence of large language models and generative AI**\n\nIn recent years, there has been a significant breakthrough in the field of artificial intelligence with the emergence of large language models (LLMs) and generative AI. 
These models, such as GPT-4 and\nits predecessors, Databricks Dolly and others are\nbuilt using deep learning techniques and massive\namounts of training data, enabling them to generate\nhuman-like text and perform a wide range of natural\nlanguage processing tasks. LLMs and generative AI\ncan help insurance companies automate repetitive\ntasks such as underwriting, claims processing,\n\nand customer service, improving efficiency and\nreducing costs. They can also help insurers to better\nunderstand customer needs and preferences,\nleading to more personalized products and services.\nHowever, as with any disruptive technology, the\nadoption of LLMs and generative AI will require\ncareful consideration of ethical and regulatory\nissues, such as data privacy and algorithmic bias.\n\n\n**Transformed ecosystems**\n**and open insurance**\n\n[According to EY](https://assets.ey.com/content/dam/ey-sites/ey-com/en_gl/topics/insurance/ey-2022-global-insurance-outlook-report.pdf) , leading companies leverage\ninsurtechs in their ecosystems to achieve high\nmargins in commoditized products. Open insurance,\nwhich involves sharing and managing insurancerelated data through APIs, is more than an item in\nthe regulatory agenda. It can give consumers access\nto better products and accurate pricing, as well as\nenable them to execute transactions more easily.\nIn its [annual Chief Data Officer Survey](https://www.gartner.com/smarterwithgartner/data-sharing-is-a-business-necessity-to-accelerate-digital-business) , Gartner\nfound that organizations that promote external data\nsharing have three times the measurable economic\n\nbenefit across a variety of performance metrics\ncompared to their peers.\n\n\n**Revised target operating model**\n**with a focus on talent**\n\nDemographic shifts and perennial cost pressures\nmake it critical for insurers to attract and retain\ntalent. Consequently, it’s important for insurers\nto equip their workforces with the right tools\nand technologies to help them identify business\nprocesses that can be optimized to differentiate\nthemselves from their competitors, with an emphasis\non moments that matter in the customer journey,\naccording to EY. Recent research from Deloitte\nhighlights the advantages of upskilling and building\na future-ready workforce. One of the benefits\n\nof AI adoption in the workforce is that it enables\norganizations to automate a wide range of business\nprocesses, boosting speed and efficiency. But what’s\neven more important is that it enables employees to\nfocus on higher-value work, according to Deloitte.\n\n\n-----\n\n## The Need for Modern Data Infrastructure\n\n**Insurers turning to cloud and data analytics**\n\n\nThe insurance industry has undergone significant changes over the years, and\none of the areas that has evolved the most is data management. With the\ngrowing need for advanced analytics and digital transformation, many insurance\ncompanies are turning to cloud technology and modern data infrastructures\nto enhance their data management strategies. The benefits of adopting cloud\ntechnology are numerous, particularly the ability to efficiently store and quickly\naccess vast amounts of data, which is crucial in a heavily regulated and datadriven industry like insurance. 
Additionally, the flexibility of the cloud enables\ninsurers to scale costs, adapt to changing work environments, and meet evolving\ncustomer and business requirements.\n\n\ndynamic pricing and underwriting, and form the foundation for claims automation.\nBy implementing advanced analytics, insurers can innovate more easily, scale their\nbusinesses, and bring new products to market more quickly.\n\nTo remain competitive, insurance companies must increase their investment in\ncloud technology and data analytics, as this will accelerate insightful decisionmaking across various functions such as claims management, underwriting,\npolicy administration, and customer satisfaction. Overall, the adoption of cloud\ntechnology and data analytics is imperative for insurance providers to enhance\noperational efficiency, improve business processes, and stay relevant in today’s\nfast-paced business landscape.\n\n\nFurthermore, insurance providers can leverage the cloud to analyze customer\ndata at scale, gaining insights into behaviors that drive hyper-personalization,\n\n\n-----\n\n**Let’s take a closer look look at a few examples:**\n\n\n**Auto insurers** need to integrate new data sources, such as weather and traffic,\nto build solutions capable of real-time processing. This enables them to alert\nemergency services promptly and gain a better understanding of drivers’ driving\npatterns. It also enables the development of sophisticated machine learningbased risk assessment, underwriting and claims models.\n\n**Commercial insurance** , including property, general liability, cyber insurance and\nbusiness income insurance, utilizes ML-based automation of actuarial models.\nThis automation facilitates underwriting, claims forecasting and dynamic pricing\nfor their customers. Another notable trend in recent years is the use of IoT-\n\n\nbased alerting for sensitive or valuable commodities. For example, in the case of\nvaccines, IoT sensors can monitor the temperature in real time and send alerts to\nthe appropriate team or person if the temperature exceeds acceptable thresholds.\nThis is crucial as vaccines must be stored within specific temperature ranges.\n\nIn **life insurance** , complex ML models can be employed to create a profile of\nthe customer’s lifestyle and, importantly, detect any changes to it. This deeper\nunderstanding and 360-degree view of the customer enable more customized\nunderwriting and pricing based on the policyholder’s current health, lifestyle and\neating habits.\n\n\n|Type of Data Source|Typical Vendors|High-priority business use caes Claims Automation Dynamic Pricing Anomoly Detection Customer 360 and and Transformation and Underwriting and Fraudulent Claims Hyper-Personalization|Col4|Col5|Col6|\n|---|---|---|---|---|---|\n|Policy data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork|||||\n|Claims data|Guidewire, Duck Creek, Majesco, FINEOS, EIS, Unqork, TransUnion|||||\n|Real-time ingestions|Cambridge Mobile Telematics, Zendrive, Custom|||||\n|Alternative / Supplemental data|Experian, Equifax, Verisk, IBM Weather|||||\n|Marketing data|Salesforce, HubSpot, Google Analytics|||||\n\n\n**Figure 1.** Innovating with data and analytics — use cases made possible and key data sources from popular insurance vendors\n\n\n-----\n\n## Common Challenges Insurers Face Using Legacy Technology\n\n\nModernization is not an easy process for insurers, and while transforming IT\necosystems is necessary to improve business outcomes, ensuring business\ncontinuity is absolutely critical. 
However, the volume of data they collect, along\nwith changes in user behavior and legacy systems that can’t handle this amount of\ndata, are forcing insurance providers to accelerate their modernization journeys.\n\nInsurance providers face several challenges when using legacy technology, including:\n\n**Legacy on-premises systems:** Legacy on-premises systems are not only\nexpensive to maintain, but they also store large amounts of big data in silos across\nthe business. This makes it difficult to access the data, hindering data analytics\nefforts and limiting executives’ ability to make informed business decisions.\n\n**Ingesting large volumes of transactional data in real time:** The inability to\ningest data from transaction systems in real time is a major obstacle to obtaining\ncritical insights. Transaction logs from operations such as policy administration,\nenrollment and claims constantly stream data. However, many insurance\ncompanies still rely on legacy data warehouses built around batch processing,\nwhich is not suitable for ingesting and integrating large data sets. As a result,\ninsurers often opt to ingest data nightly, leading to delays in receiving accurate\ndata for decision-making.\n\n\n**Performing fine-grained analysis at scale within tight time frames:** Legacy\ntechnology forces insurers to make a trade-off when analyzing data for user intent.\nThey can choose between detailed and accurate predictions or fast predictions.\nRunning detailed forecasts can improve accuracy, but it requires performing\nmillions of model calculations within narrow service windows, which exceeds the\ncapability of legacy data platforms. Consequently, insurance companies have to\naccept less accurate predictions.\n\n**Powering real-time decisions on the front line:** Serving real-time data to\nthousands of workers is a complex task. While data warehouses can serve reports\nto large groups of users, they are limited to providing stale data. As a result, most\ninsurers only provide daily or weekly updates to reports and rely on employees’\njudgment for more frequent decisions.\n\n**Delivering a hyper-personalized omnichannel experience:** Today’s insurers aim\nto deliver personalized experiences across every channel, both digital and offline.\nWhile insurance providers have access to vast amounts of customer data, off-theshelf tools for personalization and customer segmentation struggle to handle such\nhigh volumes, leading to inaccurate analytics. To succeed in the insurance industry,\ncompanies must deliver personalized experiences at scale.\n\n\n-----\n\nDatabricks Lakehouse for Insurance addresses the key challenges faced across the\ninsurance value chain. The lakehouse enables the integration of various data types,\nincluding images and structured data, in real time. It offers robust management\nand governance capabilities, and rapidly transforms data into actionable insights\n\n\nthrough real-time reporting and predictive analytics. 
This platform-as-a-service solution delivers exceptional speed and industry-leading total cost of ownership, providing insurers with faster insights to enhance the customer experience and gain a competitive edge.\n\n**Figure 2.** Evaluating data maturity across the insurance value chain and lines of business (LOBs). The value chain stages shown are: Product Development & Feature Selection; Application Review & Submission (coverage/features/riders, review of policy documents); Policy Issue, Service & Administration; Sales & Lead Management (hyper-personalization/life events); Underwriting and Pricing (UW rules, guidelines & technical pricing); Rating, Offer & Endorsements (evaluate rate options, pricing and endorsements); and Claims (omnichannel; fraud, frequency, severity and reserves). We continuously develop solution accelerators and enablers to accelerate the time to market across these stages, including: dynamic segmentation, personas, hyper-personalization and intelligent automation; product architecture and manufacturing, configurable products and competitor rates; reflexive questionnaires, LLM assistance for document summarization and NLP for unstructured data; evaluation of risk within appetite, validation of UW requirements, straight-through processing optimization, risk assessment via actuarial pricing and triaging of risk to underwriter SMEs for policy/exposure changes; and prediction of loss cost (frequency and severity), computer vision on images to identify loss, auto-adjudication and triaging of claims to claims adjusters, communication tailored by segment (e.g., email, text, mail or omnichannel) and identification of Fraud, Waste and Abuse routed to the ICU.\n\n-----\n\n## Why Lakehouse for Insurance\n\nDatabricks Lakehouse for Insurance combines simplicity, flexibility and reusability, enabling insurers to meet the demands of the market with speed and agility. It offers best-in-industry performance and serves as a modern data architecture that provides differentiated capabilities for insurers to thrive in a competitive industry.\n\n**•** Insurance companies can store any type of data using Databricks Lakehouse for Insurance, leveraging the low-cost object storage supported by cloud providers. This helps break down data silos that hinder efforts to aggregate data for advanced analytics, such as claim triaging and fraud identification, regulatory reporting, or compute-intensive risk workloads. Another critical feature is the time-travel capabilities of the lakehouse architecture, allowing insurers to access any historical version of their data.\n\n**•** Supporting streaming use cases, such as monitoring transaction data, is easier with the lakehouse. It utilizes Apache Spark™ as the data processing engine and Delta Lake as the storage layer. Spark enables seamless switching between batch and streaming workloads with just a single line of code. Delta Lake’s native support for ACID transactions ensures reliable and high-performing streaming workloads.\n\n**•** For both machine learning and non-machine learning insurance models, a comprehensive governance framework is provided. 
Data, code,\nlibraries and models are linked and independently\nversion controlled using technologies like Delta\nLake and MLflow. Delta Lake ensures stability by\nallowing insurance companies to declare their\nexpectations for data quality upfront. MLflow\nenables training models in any language and\ndeploying them anywhere, minimizing the need for\ncomplex handoffs between data science practices,\nindependent validation units and operational teams.\n\n\n-----\n\n**Level-up value with Databricks Lakehouse for insurance**\n\nBuilding your data lakehouse with the Databricks Lakehouse Platform empowers your organization with the speed, agility and flexibility needed to address critical insurance\nuse cases that have a significant impact on your customers and your business. Additionally, it helps lower the total cost of ownership (TCO).\n\nWith a modern and unified data architecture, the Databricks platform enables the implementation of your data, analytics and AI strategy at scale on a unified and modern\ncloud data architecture. The key benefits include:\n\n\n**1. Cost and complexity reduction**\n\nThe Databricks Lakehouse provides an open, simple\nand unified cloud data management architecture\nthat streamlines operational inefficiencies, reduces\nIT infrastructure costs, and enhances productivity\nacross teams.\n\n\n**2. Enhanced risk management and control**\n\nBy unlocking the value of enterprise data, the\nplatform helps reduce corporate governance and\nsecurity risks. It facilitates data-driven decisionmaking through governed discovery, access and\ndata sharing.\n\n\n**3. Accelerated innovation**\n\nThe platform enables the acceleration of digital\ntransformation, modernization and cloud migration\ninitiatives, fostering new growth opportunities\nand driving innovation for improved customer and\nworkforce experiences.\n\n\nTo help you get started, this guidebook includes the most commonly observed use cases across the insurance industry.\n\n\n-----\n\n**Reference Architecture for Smart Claims**\n\n\n**1.** \u0007The Lakehouse ingests various types of data, either in bulk\n\nor incrementally through change data capture (CDC). These\n\ninclude structured and unstructured data sets like images, text,\n\nand video, such as IoT sensor data, operational data like claims\n\nand policies, and on-prem or third-party data such as from\n\ncredit bureaus, weather, and driving records. Partner Connect\n\noffers a range of ingest tools from different vendors that you can\n\ndirectly use from the Databricks portal.\n\n\n**2.** \u0007Delta Live Tables (DLT) is the preferred ETL\n\npath to transform the data based on business\n\nrequirements. All the data resides in cloud storage,\n\nwhere Delta refines it into Bronze, Silver and Gold\n\nzones of a medallion pipeline blueprint. 
Databricks Workflows provide orchestration of the various dependent tasks, with advanced capabilities like retry, repair and job status notifications.\n\n**3.** Databricks SQL, with Photon and serverless options, caters to BI consumption use cases to refresh a dashboard monitoring key metrics and KPIs, with query history and alerts on critical events.\n\n**4.** Databricks ML Runtime and MLflow, along with Feature Store, AutoML, and real-time Model Serving, enable ML use cases that provide predictive insights.\n\n**5.** Delta Sharing provides a secure and governed way of sharing data internally and externally without copying it, using Unity Catalog.\n\n-----\n\n**Secure data sharing with Delta Lake**\n\nAt the heart of Databricks Lakehouse for Insurance is a technology that allows insurers to overcome the trade-offs between speed and accuracy. Technologies like Delta Lake enable the lakehouse, which combines the strengths of data warehouses and data lakes, to directly address these challenges. With Delta Lake, insurance providers can unify all their data — structured and unstructured, batch and real-time — in one centrally managed and governed location.\n\nOnce the data is in the lakehouse, various stakeholders such as e-commerce systems, reporting users, analysts, data scientists and data engineers can leverage this information. They can use it to develop models for applications, power real-time reporting, perform advanced analytics, and create large-scale forecasting models, among other use cases. The lakehouse platform serves business intelligence, streaming, data science / ML and data warehouse workloads from one copy of data, with centralized governance and orchestration.\n\n-----\n\n**K E Y U S E C A S E**\n\n## Claims automation and transformation\n\n**Overview**\n\nInsurers are entering a new era of claims transformation, supported by evolving technological advancements and increasing data availability. Leveraging the Databricks Lakehouse, organizations can deal with the massive amount of structured and unstructured data coming in from different sources, in different formats, and time frames. Every touchpoint in the claims journey — beginning even before an incident occurs — can be supported by a combination of technology and human intervention that seamlessly expedites the process.\n\n**Business problem**\n\nMissing data, or data that is “not in good order” and needs to be corrected before processing, leads to claims leakage and inefficient processes in triaging claims to the right resource.\n\n**Solution/value with Databricks**\n\nEnable triaging of claims and resources by leveraging big data processing and integrated ML and AI capabilities, including MLflow model lifecycle management.\n\n**Business outcomes and benefits**\n\n**•** Decrease in annual claims payout\n\n**•** Increase in claim fraud detection/prevention\n\n**•** Improve efficiencies by 15%\n\n**“Applying AI as broadly, as aggressively**\n\n**and as enthusiastically as possible. 
No part**\n\n**of our business should be untouched by it.”**\n\n— \u0007Masashi Namatame, Group Chief Digital Officer,\nManaging Executive Officer, Tokio Marine\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**Tokio Marine: Striving to**\n**become Al-driven**\n\nInsurers of all types now routinely use AI\nmodels to drive underwriting, streamline claims\nprocessing and accelerate claims adjudication,\nprotect against insurance fraud, and improve\nrisk forecasting, for example. Tokio Marine —\nJapan’s oldest insurance company, which has\ndone business since 1879 — has been applying\nadvanced uses of AI, particularly in its auto\ninsurance business, says Masashi Namatame,\nGroup Chief Digital Officer and Managing\nExecutive Officer at Tokio Marine: “To assess\ncollision damages, the company uses an AIbased computer vision solution to analyze\nphotos from accident scenes.” Comparing these\nwith what he describes as “thousands or even\nmillions” of photos of past analogous incidents,\nthe model produces liability assessments of the\nparties involved and projects anticipated repair\ncosts. AI has also provided the company with\ntangible benefits in online sales — especially in\npersonalized product recommendations and\ncontract writing, according to Namatame. Read\nthe case study in the [MIT CIO vision 2025 report](https://www.databricks.com/resources/whitepaper/mit-cio-vision-2025) .\n\n\n-----\n\n**K E Y U S E C A S E**\n## Dynamic pricing and underwriting\n\n**Overview**\n\n\nIn modernized insurance platforms, there is a growing trend toward personalized approaches, where insurance\ncarriers utilize metrics from trip summaries to inform pricing strategies for individuals based on their behavior.\nThis involves leveraging unstructured and streaming data, including IoT telematics driver data, weather information,\ngeolocation, traffic patterns and crash history. The Lakehouse platform is well suited for these new use cases as it\noffers native support for streaming, making it easy for insurance carriers to incrementally ingest data.\n\n**Business problem**\n\nActuaries are spending valuable time on low-value activities, which hampers agility and advanced analytical\ncapabilities in pricing and underwriting, hindering improvements in risk and pricing modeling.\n\n**Solution/value with Databricks**\n\n**•** Unified cloud-native platform\n\n**•** Scalability for ingesting IoT data from millions of trips, expanding the customer base\n\n**•** Reduced total cost of ownership compared to legacy Hadoop systems\n\n**•** Usage-based pricing, leading to lower premiums for customers and reduced risk for insurance carriers, thereby\nlowering loss ratios\n\n**•** Enables the creation of a digitally enabled, end-to-end underwriting experience\n\n**Business outcomes and benefits**\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**American financial services**\n**mutual organization**\n\nThis organization aimed to leverage the vast\namounts of structured and unstructured data\nit collected to enhance its underwriting and\ndecision-making processes, enabling greater\nefficiency and effectiveness. However, the\ncompany’s legacy infrastructure struggled\nto scale with the increasing data volume and\nprocessing demands, limiting its ability to\nanalyze the data and derive actionable insights.\n\nWith Databricks, the insurer centralized\neverything on one unified Lakehouse platform,\n\nsupporting all operational and analytical\nuse cases. 
This allowed them to analyze\nbroader sets of data for superior underwriting\nperformance and create a digitally empowered,\nend-to-end underwriting experience.\n\n\n\n**•** Improve competitive position\n\n**•** Decrease combined ratio\n\n**•** 15% improvement in efficiencies\n\n\n-----\n\n**K E Y U S E C A S E**\n## Anomaly detection and fraudulent claims\n\n**Overview**\n\n\n**C U S T O M E R C A S E S T U D Y**\n\n**One of the largest U.S.**\n**insurance companies and a**\n**leading small business insurer**\n\nThe increasing availability of data and market\ncompetition challenge insurance providers to\noffer better pricing to their customers. This\nU.S.-based insurer, with hundreds of millions of\ninsurance records to analyze for downstream\nML, realized that their legacy batch analysis\nprocess was slow and inaccurate, providing\nlimited insight for predicting the frequency\nand severity of claims. With Databricks, they\nwere able to scale up the use of deep learning\nmodels, resulting in more accurate pricing\npredictions and increased revenue from\nclaims. By leveraging Databricks Lakehouse,\nthey harmonized data, analytics and AI at\nscale, enabling accurate pricing predictions\nand supporting various use cases from vehicle\ntelematics to actuarial modeling.\n\n\nFraud continues to grow at a rapid rate, posing a threat to the revenue and growth of companies. For example,\nAmerican consumers reported losing more than $5.8 billion to fraud in 2021, a 70% increase from $3.4 billion\nin 2020, according to the Federal Trade Commission. The insurance industry is undergoing transformational\nchange to support new channels and services, offering transactional features and facilitating payments through\ndigital channels to remain competitive. However, the speed and convenience of these capabilities benefit both\nconsumers and fraudsters. Building a fraud framework requires more than just highly accurate machine learning\nmodels. It often involves a complex decision science process that combines a rules engine with a robust and\nscalable machine learning platform.\n\n**Business problem**\n\nInsurers need the ability to identify fraudulent activity and respond to new suspicious trends in near real-time.\n\n**Solution/value with Databricks**\n\nModernized approaches in insurance require full digital transformation, including the adoption of usagebased pricing to reduce premiums. Insurance providers now consume data from the largest mobile telematics\nproviders (e.g., CMT) to obtain granular sensor and trip summaries for users of online insurance applications.\nThis data is crucial not only for pricing but also for underwriting scenarios to mitigate risks for carriers.\n\n**$1 of fraud costs companies 3.36x in chargeback,**\n**replacement and operational costs**\n\n\n[Lexis Nexis](https://risk.lexisnexis.com/insights-resources/research/2020-true-cost-of-fraud-retail)\n\n\n-----\n\n**K E Y U S E C A S E**\n\n## Customer 360 and hyper-personalization\n\n\n**Overview**\n\nWinning the hearts and minds of your customers\nstarts with personalizing the user experience. The\nability to offer complementary products to meet\nthe needs of your customers lets you build deeper\nrelationships with them and engender their loyalty.\nIn addition, a better understanding of the finer\ndetails within accounts allows you to offer them\nmore personalized products. 
To do this, you need\n360-degree customer views, which requires you to\nlocate and consolidate all your customers’ contact\ndata from every digital tool that you use and house\nit in one central location. With Databricks Lakehouse,\ninsurers can “hyper-personalize,” increase\ncross-sell/upsell opportunities, enhance customer\n360 and bring new products to market faster.\n\n**Business problem**\n\nThe inability to reconcile customer records across\ndifferent lines of business limits real-time customer\ninsights necessary for upselling and cross-selling.\nSiloed data makes it challenging to create accurate\nand comprehensive customer profiles, resulting in\nsuboptimal recommendations for the next best action.\n\n\n**Solution/value with Databricks**\n\nDatabricks provides the tools needed to process\nlarge volumes of data and determine the next best\naction at any point in the customer journey.\n\n**•** Eliminates data silos by unifying all customer data,\nincluding basic information, transactional data,\nonline behavior/purchase history, etc., to create\ncomplete customer profiles\n\n**•** Integrated data security ensures that security\nmeasures are incorporated at every layer of the\nDatabricks Lakehouse Platform\n\n**•** Delta improves data quality, providing a single\nsource of truth for real-time streams and ensuring\nreliable and high-quality data for data teams\n\n**•** Integrated ML and AI capabilities utilize AI to\ncreate self-optimizing ML models that determine\nthe next best step for each customer\n\n**•** MLflow model lifecycle management helps manage\nthe entire machine learning lifecycle reliably,\nsecurely and at scale\n\n\n**Business outcomes and benefits**\n\n**•** Use AI, ML, automation and real-time data to\ngain deeper customer insights and understand\ntheir needs\n\n**•** Improve competitive positioning\n\n**•** Enhance the customer experience\n\n**C U S T O M E R C A S E S T U D Y**\n\n**160-year-old U.S.**\n**insurance company**\n\nThis insurance provider underwent a significant\ndigital transformation to provide a more\npersonalized financial services experience to\nits 10,000 advisors and millions of customers\nacross various touchpoints. Recognizing the\nimportance of becoming data-driven, the\ncompany leveraged Databricks in its client\n360 platform to aggregate transactional and\nbehavioral data, along with core attributes,\nproviding business users with next-best-action\nrecommendations for seamless customer\nengagement.\n\n\n-----\n\n## Global Regulatory Impact in Insurance\n\n\n**Navigating global regulations**\n**with technical implementation**\n\nDigital innovation continues to reshape the insurance sector. The pace and scale\nof technological change are likely to increase due to factors such as artificial\nintelligence (AI), cloud computing, and the entry of new players like insurtechs,\ne-tailers, and manufacturers from outside the insurance industry.\n\nTo succeed and thrive in today’s economic environment, insurers should prioritize\nupgrading their infrastructure and technology, rather than solely focusing on\ntransforming operations. 
For example, migrating from on-premises systems to the\ncloud can bring significant benefits, according to global consultancy [Deloitte](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf) [.](https://www2.deloitte.com/content/dam/insights/articles/us175368_cfs_fsi-outlook-insurance/DI_US175368_CFS_FSI-Outlook-Insurance.pdf)\n\nAs insurers upgrade their compliance processes to meet new global regulations,\nsuch as IFRS 17 and LDTI, the impact of regulatory updates becomes more\ncomplex for organizations operating across multiple jurisdictions. Instead of\nmerely responding to regulatory and industry requirements, insurance companies\nshould make data-focused investments that help them anticipate and meet the\nexpectations of distributors and policyholders.\n\n\n**IFRS-17**\n\nIFRS 17 is an International Finance Reporting Standard (IFRS) for\ninsurance contracts. IFRS 17 aims to standardize insurance accounting\nby providing consistent principles for all facets of accounting for\ninsurance contracts. IFRS 17 removes existing inconsistencies so\nanalysts, investors and others can more easily compare companies,\ncontracts and industries.\n\n**LDTI for long-duration contracts**\n\nThe Financial Accounting Standards Board long-duration targeted\nimprovements (LDTI) introduced changes to the U.S. GAAP accounting\nmodel to simplify and improve the financial reporting of long-duration\ncontracts, including providing financial statement users with more\ntimely and relevant information about those contracts.\n\n\nIt is crucial for insurers to redirect their focus toward developing advanced data\nmanagement and utilization capabilities that offer better insights and improved\nperformance. These investments serve as not only a foundation for regulatory\ncompliance but also a starting point for more comprehensive and proactive\ntransformation initiatives.\n\n\n-----\n\n**I N D U S T R Y S O L U T I O N S**\n\n## Get Started With Accelerators, Brickbuilders and Enablers\n\nInsurance Solution Accelerators and enablers are pre-built collateral to help customers rapidly develop and deploy technical capabilities to accelerate value.\n\n**Adoption challenges**\n\n\nNumerous challenges hinder organizations from developing and implementing the\nnecessary technical solutions to enhance their operational effectiveness, increase\nrevenue, and stay competitive. These challenges include:\n\n**•** Lack of technical skills (data scientists/data engineers): Companies often\nstruggle to find employees proficient in Python or Scala, or individuals who\npossess extensive experience in data science.\n\n\n\n**•** Business problems require in-depth data science and industry knowledge:\nBusinesses seek solutions tailored to address specific problems, rather than\ngeneric technical features.\n\n**•** Companies seek actionable insights: Organizations prefer readily applicable\npatterns that can be quickly implemented, rather than custom data science\nsolutions that come with potential costs and risks of implementation failure.\n\n\n**What are accelerators/enablers?**\n\n\n**Solution Accelerators**\n\nSave hours on discovery, design, development and\ntesting with Databricks Solution Accelerators. Our\npurpose-built guides, including fully functional\nnotebooks and best practices, expedite results for\nyour most common and high-impact use cases. 
With\nthese accelerators, you can go from idea to proof of\nconcept (PoC) in as little as two weeks.\n\n\n**Brickbuilders**\n\nBrickbuilder Solutions are data and AI solutions\ndesigned by leading consulting companies to\naddress industry-specific business requirements.\nBuilt on the Databricks Lakehouse Platform and\nbacked by the industry experience of these\nconsultancies, businesses can have confidence\nin solutions tailored to their specific use cases.\nBrickbuilder Solutions can be implemented at any\nstage of the customer journey.\n\n\n**Solution Enablers**\n\nSolution enablers consist of targeted collections\nof notebooks and materials, such as webinars and\nblog posts, designed to support larger solutions.\nThey aim to solve pain points or address specific\nlayers of business capabilities, such as resolving data\ningestion challenges.\n\n\n-----\n\n## Get Started With Industry Solutions\n\n\n**Claims transformation:**\n**automation and fraud prevention**\n\nInsurers are entering a new era of claims transformation, supported by evolving\ntechnological advancements and growing data availability. The end-to-end claims\nprocess, from extracting relevant information from documentation submitted\nwhen filing a claim to triaging and routing claims and the underwriting process,\nis ripe for digital transformation. By leveraging the Databricks Lakehouse,\norganizations can handle millions of data points coming in different formats and\ntime frames, from various sources, at an unprecedented volume. Every touchpoint\nin the claims journey, starting even before an incident occurs, will be supported by\na combination of technology and human intervention that seamlessly expedites\nthe process. Personalizing the claims experience by anticipating needs, providing\nreal-time status alerts, and reducing friction in the process increases customer\nloyalty and retention.\n\n\n**Customer/Partner Successes**\n\n**Accelerate underwriting through collaboration and efficient ML**\n\nA leading P&C insurer took full advantage of the MongoDB and Databricks\nintegration, leveraging both platforms to foster collaboration between their data\nand developer teams. The integration provides a more natural development\nexperience for Spark users and exposes all of Spark’s libraries. This allows\nMongoDB data to be materialized as DataFrames and data sets for analysis\nusing machine learning, graph, streaming and SQL APIs. The insurer also benefits\nfrom automatic schema inference. With this integration, the insurer was able to\ntrain and observe their ML models (MongoDB Atlas Charts) more efficiently and\nincorporate them into business applications.\n\nAs a result, crucial underwriting processes that previously took days are now executed\nin seconds. 
In addition to the time and cost savings, the company can provide a more\nimmediate response to customers within its digital experience platform.\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[F R A U D D E T E C T I O N](https://notebooks.databricks.com/notebooks/FSI/geospatial_analysis/index.html#geospatial_analysis_1-0.html)**\n\n**Claims processing is the process whereby an insurer receives,**\n\n\n**verifies and processes a claim report submitted by a policyholder.**\n\n**It accounts for** **[70% of a property insurer’s expenses](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)** **and is a**\n\n**criticial component of customer satisfaction with their carrier.”**\n\n\n**[C L A I M S A U T O M AT I O N E N A B L E R](https://www.databricks.com/blog/2023/02/01/design-patterns-batch-processing-financial-services.html)**\n\n\n[Laying the](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n[Foundation for](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n[Claims Automation](https://www.youtube.com/watch?v=LkckhRjezxs\r)\n\n\n**[C A R C L A I M S I M A G E C L A S S I F I C AT I O N](https://github.com/databricks-industry-solutions/car-classification)**\n\n\n**Deloitte,** [”Preserving the human touch in insurance claims transformations”](https://www2.deloitte.com/us/en/insights/industry/financial-services/insurance-claims-transformation.html)\n\n**[S M A R T C L A I M S : C L A I M S A U T O M AT I O N](https://www.databricks.com/blog/2023/04/03/claims-automation-databricks-lakehouse.html)**\n\n\n-----\n\n**Risk management:**\n**dynamic pricing and underwriting**\n\nModernized approaches at insurance carriers require a full digital transformation,\nand one aspect of this transformation involves dynamic pricing and underwriting\nto reduce premiums. Insurance providers are now consuming data from the largest\nmobile telematics providers to obtain the most granular sensor and trip summaries\nfor users of online insurance applications. Not only is this data critical for pricing,\nbut it is also critical for underwriting scenarios to de-risk carriers. Dynamic pricing\nand underwriting automate routine tasks and provide teams with alternative\ndata sources to empower actuarial and underwriting professionals to become\n“exponential.” This allows teams to focus on key aspects of risk selection and\nanalysis that drive competitive advantage and market differentiation. By leveraging\npersonalized data points, insurers can deliver near real-time underwriting\ndecisions for life insurance applicants, reducing policy abandonment and costs.\n\n\n**Customer/Partner Successes**\n\n**Automated extraction of medical risk factors for life insurance underwriting**\n**(John Snow Labs)**\n\nLife insurance underwriting considers an applicant’s medical risk factors in\naddition to mortality risk characteristics. These risk factors are often found\nin free-text documents. New insurance-specific natural language processing\n(NLP) models can automatically extract relevant medical history and risk factors\nfrom such documents. Forward-thinking companies are embracing accelerated\nunderwriting, which utilizes new data along with algorithmic tools and modeling\ntechniques to quickly assess and group applicants without requiring bodily fluids,\nphysician’s notes, and so on. 
This joint Solution Accelerator from Databricks and\nJohn Snow Labs simplifies the implementation of this approach, creating a faster,\nmore consistent, and scalable underwriting experience.\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[R I S K M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/market-risk)**\n\n**Risk is highly influenced by behavior, and 80% of morbidity in**\n\n\n**healthcare risk is driven by factors such as smoking, drinking**\n\n**alcohol, physical activity and diet. In the case of driving,**\n\n**60% of fatal accidents are a result of behavior alone. If insurers**\n\n**can change customer behaviors and help them make better**\n\n**choices, then the risk curve shifts.”**\n\n\n**[A C T U A R I A L W O R K B E N C H](https://github.com/koernigo/databricksActuarialWorkbench)**\n\n**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n\n\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n\n\n**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n\n\n-----\n\n**Product distribution:**\n**segmentation and personalization**\n\nThe most forward-thinking and data-driven insurers are\nfocused on achieving personalization at scale. They are\nexploring new partnerships and business models to create\nintegrated, value-added experiences that prioritize the\noverall health and financial wellness of their customers,\nrather than just their insurance needs. These insurers\nare investing in new data sources, analytics platforms,\nand artificial intelligence (AI)-powered decision engines\nthat enable them to connect producers with like-minded\ncustomers or engage customers with enticing offers\nand actionable steps based on their previous choices.\nThe outcome is more efficient and effective service\nfrom producers, trusted and convenient interactions for\nconsumers, and increased customer engagement and\ngrowth for insurers in an increasingly digital-oriented world.\n\n\n**Customer/Partner Successes**\n\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\nusing a smart data model. 
Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n\nWith Persona 360, you can:\n\n**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n1,695+ attributes and segments\n\n**•** Seamlessly connect the workflows of data scientists (via Databricks) and marketing specialists (via\nPersona 360), making it easy for data experts to incorporate their findings and enabling nontechnical\nusers to comprehend and activate the data\n\n**•** Leverage tools that can increase engagement by 37% and conversion rates by 45% through\npersonalized campaigns\n\n\n**Learn more:**\n\n\n**Watch video:**\n\n\n**[N E X T B E S T O F F E R](https://www.databricks.com/solutions/accelerators/recommendation-engines)**\n\n**Demand for hyper-personalized and real-time risk protection**\n\n\n**requires broad adoption of artificial** **intelligence (AI), machine**\n\n**learning and digital platforms.**\n\n**EY,** [”Nine customer types defining the next wave of insurance”](https://www.ey.com/en_us/insurance/nine-customer-types-defining-the-next-wave-of-insurance)\n\n\n**[C U S T O M E R L I F E T I M E VA L U E (C LT V )](https://www.databricks.com/solutions/accelerators/customer-lifetime-value)**\n\n**[C U S T O M E R S E G M E N TAT I O N](https://www.databricks.com/solutions/accelerators/customer-segmentation)**\n\n\n[The Impact of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[Analytics and AI](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[on the Future of](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n[Insurance](https://www.youtube.com/watch?v=7qZ14bGip5g&t=3s)\n\n\n**[R E P U TAT I O N M A N A G E M E N T](https://www.databricks.com/solutions/accelerators/reputation-risk)**\n\n**[C H U R N P R E D I C T I O N](https://www.databricks.com/solutions/accelerators/retention-management)**\n\n\n-----\n\n**Summary and applicability of Solution Accelerators based on insurance provider type / Solution Accelerator matrix**\n**by insurance provider type**\n\n\n\n\n\n\n\n\n\n|Product distribution Personalization Given the volume of data required, the complexity of operating AI from experiments (POCs) to enterprise scale data pipelines, combined with strict data and privacy regulations on the use of customer data on cloud infrastructure, the Lakehouse has quickly emerged as the strategic platform to accelerate digital transformation.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Next best offer Customers have different needs at each stage of the buyer journey. Choose the right recommender model for your scenario to find the next best action at any given point in the customer journey.|||||\n|Customer Analyzing customer lifetime value is critical to improving marketing decision-making, campaign ROI and lifetime value customer retention. Learn how to identify your most valuable customers with Databricks’ Customer Lifetime Value Solution Accelerator.|||||\n|Churn prediction Earning loyalty and getting the largest number of customers to stick around is something that is in your best interest as well as your customers’ best interest. 
Develop an understanding of how a customer lifetime should progress and examine where in that lifetime journey customers are likely to churn so you can effectively manage retention and reduce your churn rate.|||||\n|Customer Personalization is touted as the gold standard of customer engagement. Using sales data, campaigns segmentation and promotions systems, this solution helps you create advanced customer segments to drive better purchasing predictions based on behaviors.|||||\n|Reputation Harness the Databricks Lakehouse Platform to build a risk engine that can analyze customer feedback management securely and in realtime to power an early assessment of reputation risks.|||||\n\n\n-----\n\n|Anomaly detection and fraudulent claims Anomaly Anomaly detection is the technique of identifying rare events or observations which can raise suspicions detection by being statistically different from the rest of the observations.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Fraudulent A large-scale fraud prevention system is usually a complex ecosystem made of various controls (all with claims critical SLAs), a mix of traditional rules and AI and a patchwork of technologies between proprietary on- premises systems and open source cloud technologies.|||||\n\n\n\n\n\n\n|Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse. Risk management Adopt a more agile approach to risk management, including actuarial and underwriting intelligence by unifying data and AI in the Lakehouse.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Underwriting Machine learning provides a decision support system for underwriting processes to help you improve your automation underwriting outcomes.|||||\n|Actuarial You can use the Databricks Lakehouse Platform to automate actuarial models and leverage Machine workbench Learning (ML) for underwriting, claims forecasting, etc.|||||\n\n\n-----\n\n|Claims transformation Anomaly detection Preempt fraud with rule-based patterns and select ML algorithms for reliable fraud detection. Use and claims fraud anomaly detection and fraud prediction to respond to bad actors rapidly.|Consumer Lines (Auto/Home/ Personal Lines)|Commercial Lines|Life Insurance|Reinsurance|\n|---|---|---|---|---|\n|Car claims image By applying transfer learning on pre-trained neural networks, Databricks helps insurance companies classification kickstart their AI/computer vision journeys toward claim assessment and damage estimation.|||||\n|Claims automation Insurers are entering a new era of claims transformation, supported by evolving technological advancement and growing data availability. You can simplify and scale your claims lifecycle with data and AI.|||||\n|Medical claims Using advanced natural language processing, you can extract text from medical records and enable automation.|||||\n|Guidewire claims Data ingestion enabler for distributed ledger technology that has predefined schemas and mapping to/ center data from Guidewire data format. integration|||||\n\n\n-----\n\n## Conclusion\n\nToday, data and AI are at the center of every innovation in the insurance industry. 
Databricks Lakehouse for\nInsurance empowers insurance providers to leverage the potential of data and analytics to address strategic\nchallenges, make informed decisions, mitigate risks, enhance customer experiences, and accelerate innovation.\n\n**Customers that innovate with Databricks Lakehouse for Insurance**\n\nSome of the top property and casualty, life and health insurance companies and reinsurers in the world turn\nto Databricks Lakehouse to harness the power of data and analytics to solve strategic challenges and make\nsmarter decisions that minimize risk, deliver superior customer experiences and fast-track innovation.\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000 organizations worldwide — including\n\nComcast, Condé Nast and over 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is headquartered in San Francisco,\n\nwith offices around the globe. Founded by the original creators of Apache Spark ™ , Delta\n\nLake and MLflow, Databricks is on a mission to help data teams solve the world’s toughest\n\nproblems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , LinkedIn and [Facebook](https://www.facebook.com/databricksinc) .\n\n#### Begin your journey with a free trial of Databricks Lakehouse for Insurance and start developing advanced data and AI applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n\n###### Contact us for a personalized demo at:\n dbricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "```\nTECHNICAL GUIDE\n\n```\n\n# Solving Common Data Challenges \n\n\n#### Startups and Digital Native Businesses\n\n\n-----\n\n### Table of Contents\n\n\n# 01\n```\nCHALLENGE:\n \u0003\n\n###### Creating a unified data architecture for data quality, governance and efficiency\n\n# 03\nCHALLENGE:\n \u0003\n\n###### Building effective machine learning operations\n\n```\n\n# 02\n```\nCHALLENGE:\n \u0003\n\n###### Building a data architecture to support scale and performance\n\n# 04\nSUMMARY:\n\n###### The Databricks Lakehouse Platform addresses these challenges\n\n```\n\n-----\n\n**I N T R O D U C T I O N**\n\n\nThis guide shares how the lakehouse architecture can increase\nproductivity and cost-efficiently support all your data, analytics\nand AI workloads, and flexibly scale with the pace of growth\nfor your company. Read the entire guide or dive straight into a\nspecific challenge.\n\nWith the advent of cloud infrastructure, a new generation of\nstartups has rapidly built and scaled their businesses. The use of\ncloud infrastructure, once seen as innovative, has now become\ntable stakes. The differentiator for the fastest-moving startups\nand digital natives now comes from the effective use of data\nat scale, primarily analytics and AI. Digital natives — defined\nas fast-moving, lean, and technically savvy, born-in-the-cloud\norganizations — are beginning to focus on new data-driven use\ncases such as real-time machine learning and personalized\ncustomer experiences.\n\nTo pursue these new data-intensive use cases and initiatives,\norganizations must look beyond the technologies that delivered\nthem to this point in time. 
Over time, these technologies, such\nas transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n\nThis guide examines some of the biggest data challenges and\nsolutions for startups and for scaling digital native businesses\nthat have reached the point where an end-to-end modern data\nplatform is a smart investment. Some key considerations include:\nsystems that are not cost-efficient and require time-consuming\nadministration and engineering toil. In addition to growing\nmaintenance needs, data is often stored in disparate locations\nand formats, with little or no governance, making real-time use\ncases, analytics and AI difficult or impossible.\n\n\n**Consolidating on a unified data platform**\nAs mentioned above, siloed data storage and management add administrative and\nfinancial cost. You can benefit significantly when you unify your data in one location\nwith a flexible architecture that scales with your needs and delivers performance\nfor future success. For this, you will want an open platform that supports all your\ndata including batch and streaming workloads, data analytics and machine learning.\nWith data unification, you create a more efficient, integrated approach to ingesting,\ncleaning and organizing your data. You also need automation to make data analysis\neasier for the nontechnical users in the company. But broader data access also\nmeans more focus on security, privacy, compliance and access control, which can\ncreate overhead for a growing.\n\n**Scaling up capacity and increasing performance**\n**and usability of the data solutions**\nData teams at growing digital native organizations find it time intensive and costly to\nhandle the growing volume and velocity of their data being ingested from multiple\nsources, across multiple clouds. You now need a unified and simplified platform that\ncan instantly scale up capacity and deliver more computing power on demand to\nfree up your data teams to produce outputs more quickly. This lowers the total cost\nfor the overall infrastructure by eliminating redundant licensing, infrastructure and\nadministration costs.\n\n**Building effective machine learning operations**\nFor data teams beginning their machine learning journeys, the challenge of training\ndata models can increase in management complexity. Many teams with disparate\ncoding needs for the entire model lifecycle suffer inefficiencies from transferring\ndata and code across many separate services. To build and manage effective\nML operations, consider an end-to-end MLOps environment that brings all data\ntogether in one place and incorporates managed services for experiment tracking,\nmodel training, feature development and feature and model serving.\n\n\n-----\n\n# 01\n```\nCHALLENGE: \u0003\n\n## Create a unified data architecture for data quality, governance and efficiency\n\n```\n\n-----\n\n```\nCHALLENGE 01\n\n### Create a unified data architecture for data quality, governance and efficiency\n\n```\nAs cloud-born companies grow, data volumes rapidly increase, leading to new\nchallenges and use cases. Among the challenges:\n\n\nApplication stacks optimized for transaction\nuse cases aren’t able to handle the volume,\nvelocity and variety of data that modern data\nteams require. For example, this leads to query\nperformance issues as data volume grows.\n\nData silos develop as each team within an\norganization chooses different ETL/ELT and\nstorage solutions for their needs. 
As the\norganization grows and changes, these pipelines\nand storage solutions become brittle, hard to\nmaintain and nearly impossible to integrate.\n\n\nThese data silos lead to discoverability,\nintegration and access issues, which prevent\nteams from leveraging the full value of the\norganization’s available data.\n\nData governance is hard. Disparate ETL/ELT\nand storage solutions lead to governance,\ncompliance, auditability and access control\nchallenges, which expose organizations to\ntremendous risk.\n\n\nThe Databricks Lakehouse Platform provides\na unified set of tools for building, deploying,\nsharing and maintaining data solutions at scale.\nIt integrates with cloud storage and the security\nin your cloud account, manages and deploys\ncloud infrastructure on your behalf. Your data\npractitioners no longer need separate storage\nsystems for their data. And you don’t have to rely\non your cloud provider for security. The lakehouse\nhas its own robust security built into the platform.\n\n\nFor all the reasons above, the most\nconsistent advice from successful data\npractitioners is to create a “single source\nof truth” by unifying all data on a single\nplatform. With the Databricks Lakehouse\nPlatform, you can unify all your data on one\nplatform, reducing data infrastructure costs\nand compute. You don’t need excess data\ncopies and you can retire expensive\nlegacy infrastructure.\n```\n 01\n\n```\n\n-----\n\n```\nCUSTOMER STORY: GRAMMARLY\n\n### Helping 30 million people and 50,000 teams communicate more effectively\n\n```\n\nWhile its business is based on analytics, [Grammarly](http://www.grammarly.com)\n\nfor many years relied on a homegrown analytics\n\nplatform to drive its AI writing assistant to\n\nhelp users improve multiple aspects of written\n\ncommunications. As teams developed their own\n\nrequirements, data silos inevitably emerged as\n\ndifferent business areas implemented analytics\n\ntools individually.\n\n“Every team decided to solve their analytics\n\nneeds in the best way they saw fit,” said Chris\n\nLocklin, Engineering Manager, Data Platforms,\n\nat Grammarly. “That created challenges in\n\nconsistency and knowing which data set\n\nwas correct.”\n\nTo better scale and improve data storage and\n\nquery capabilities, Grammarly brought all its\n\nanalytical data into the Databricks Lakehouse\n\nPlatform and created a central hub for all data\n\nproducers and consumers across the company.\n\nGrammarly had several goals with the lakehouse,\n\nincluding better access control, security, ingestion\n\n\nflexibility, reducing costs and fueling collaboration. “Access control in a\n\ndistributed file system is difficult, and it only gets more complicated as\n\nyou ingest more data sources,” said Locklin. To manage access control,\n\nenable end-to-end observability and monitor data quality, Grammarly\n\nrelies on the data lineage capabilities within Unity Catalog. “Data lineage\n\nallows us to effectively monitor usage of our data and ensure it upholds\n\nthe standards we set as a data platform team,” said Locklin. “Lineage is\n\nthe last crucial piece for access control.”\n\nData analysts within Grammarly now have a consolidated interface for\n\nanalytics, which leads to a single source of truth and confidence in the\n\naccuracy and availability of all data managed by the data platform team.\n\nHaving a consistent data source across the company also resulted in\n\ngreater speed and efficiency and reduced costs. 
Data practitioners\n\nexperienced 110% faster querying at 10% of the cost to ingest compared\n\nto a data warehouse. Grammarly can now make its 5 billion daily events\n\navailable for analytics in under 15 minutes rather than 4 hours. Migrating\n\noff its rigid legacy infrastructure gave Grammarly the flexibility to do\n\nmore and the confidence that the platform will evolve with its needs.\n\nGrammarly is now able to sustain a flexible, scalable and highly secure\n\nanalytics platform that helps 30 million people and 50,000 teams\n\nworldwide write more effectively every day.\n\n[Read the full story here.](https://www.databricks.com/customers/grammarly)\n\n\n-----\n\n###### How to unify the data infrastructure with Databricks\n\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\nis composed of two primary parts:\n\n- The infrastructure to deploy, configure and\nmanage the platform and services\n\n\nYou can build a Databricks workspace by configuring\nsecure integrations between the Databricks platform\nand your cloud account, and then Databricks deploys\ntemporary Apache Spark™/Photon clusters using cloud\nresources in your account to process and store data\nin object storage and other integrated services you\ncontrol. Here are three steps to get started with the\nDatabricks Lakehouse Platform:\n\n**Understand the architecture**\nThe lakehouse provides a unified architecture,\nmeaning that all data is stored in the same\naccessible place. The diagram shows how data\ncomes in from sources like a customer relationship\nmanagement (CRM) system, an enterprise resource\nplanning (ERP) system, websites or unstructured\ncustomer emails.\n\n**Optimize the storage layer**\nAll data is stored in cloud storage while Databricks\nprovides tooling to assist with ingestion, such as\nAuto Loader, and we recommend [open-source](https://delta.io/)\n[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\nDelta optimized storage layer that provides the\nfoundation for storing data and tables in the\nDatabricks Lakehouse Platform. Having all your\ndata in the same optimized, open storage keeps\nall your use cases in the same place, thus enabling\ncollaboration and removing software tool overhead.\n\n\n\n- the customer-owned infrastructure managed in\ncollaboration by Databricks and the customer.\n\n\nThe lakehouse handles all varieties of data (structured, semi-structured, unstructured),\nas well as all velocities of data (streaming, batch or somewhere in the middle).\n\n[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n\n\n-----\n\nThe Databricks Lakehouse organizes data stored with Delta Lake in cloud object\nstorage with familiar concepts like database, tables and views. Delta Lake extends\nParquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\nscalable metadata handling. 
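To make the storage-layer guidance above concrete, here is a minimal, illustrative sketch (not part of the original guide) showing Auto Loader incrementally ingesting files from cloud object storage into a Delta table, and then reading that single copy of data as either a batch or a streaming source. It assumes a Databricks notebook where `spark` is already defined; the bucket paths and the `main.bronze.events` table name are placeholder examples.\n\n```python\n# Minimal sketch: ingest raw JSON files from cloud object storage into a Delta\n# table with Auto Loader, then read the same table as batch or as a stream.\n# The paths and table name below are illustrative placeholders.\nraw_path = 's3://my-bucket/raw/events/'             # hypothetical landing zone\ncheckpoint = 's3://my-bucket/_checkpoints/events/'  # stream progress + inferred schema\n\n(spark.readStream\n    .format('cloudFiles')                      # Auto Loader source\n    .option('cloudFiles.format', 'json')       # format of the incoming files\n    .option('cloudFiles.schemaLocation', checkpoint)\n    .load(raw_path)\n    .writeStream\n    .option('checkpointLocation', checkpoint)\n    .trigger(availableNow=True)                # process what is available, then stop\n    .toTable('main.bronze.events'))            # managed Delta table\n\n# The same Delta table serves both batch and streaming consumers.\nbatch_df = spark.read.table('main.bronze.events')\nstream_df = spark.readStream.table('main.bronze.events')\n```\n\nThe point of the sketch is the single copy of data: batch queries and incremental streaming reads both go against the same Delta table rather than against separate systems.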
Delta Lake is fully compatible with Apache Spark APIs,\nand was developed for tight integration with Structured Streaming, allowing you to\neasily use a single copy of data for both batch and streaming operations to provide\nincremental processing at scale.This model combines many of the benefits of a data\nwarehouse with the scalability and flexibility of a data lake.\n\nTo learn more about the optimized storage layer that provides the foundation for\nstoring data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n\nThe first step in unifying your data architecture is setting up how data is to be\naccessed and used across the organization. We’ll discuss this as a series of steps:\n\n**1** Set up governance with Unity Catalog\n\n**2** Grant secure access to the data\n\n\n###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n\n[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\n**3** Capture audit logs\n\n**4** View data lineage\n\n**5** Set up data sharing\n\n\n-----\n\n**Configure unified governance**\nDatabricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\nmeans that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\nis secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\npersonas and automatically captures user-level audit logs that record access to your data.\n\nData stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\nlanguages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n\nTo set up Unity Catalog for your organization,\nyou do the following:\n\n\n**1** Configure an S3 bucket and IAM role that\nUnity Catalog can use to store and access\ndata in your AWS account.\n\n**2** Create a metastore for each region in\n\nwhich your organization operates, and\nattach workspaces to the metastore. 
Each\nworkspace will have the same view of the\ndata you manage in Unity Catalog.\n\n\n**3** If you have a new account, add users,\ngroups and service principals to your\nDatabricks account.\n\n**4** Next, create and grant access to\n\ncatalogs, schemas and tables.\n\n\nFor complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n\n\n-----\n\n###### How Unity Catalog works\n\n\nYou will notice that the hierarchy of primary data\nobjects in Unity Catalog flows from metastore to table:\n\n**Metastore** is the top-level container for metadata.\nEach metastore exposes a three-level namespace\n(catalog.schema.table) that organizes your data.\n\n\n**Metastore** **Catalog** **Schemas**\n\n\n**Views**\n\n**Managed**\n**Tables**\n\n\n**Catalog** is the first layer of the object hierarchy, used\nto organize your data assets.\n\n\n**Schemas** , also known as databases, are the second\nlayer of the object hierarchy and contain tables and\nviews.\n\n**Table** is the lowest level in the object hierarchy, and\ntables can be external (stored in external locations in\nyour cloud storage of choice) or managed (stored in a\nstorage container in your cloud storage that you create\n\nexpressly for Databricks). You can also create readonly **Views** from tables.\n\n\n**External**\n**tables**\n\nThe diagram below represents the file system\nhierarchy of a single storage bucket:\n\n\n-----\n\nUnity Catalog uses the identities in the Databricks\naccount to resolve users, service principals, and groups\nand to enforce permissions. To configure identities in\nthe account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\nservice principals, and groups when you create\n[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n\nUnity Catalog users, service principals, and groups\nmust also be added to workspaces to access Unity\nCatalog data in a notebook, a Databricks SQL query,\nData Explorer or a REST API command. The assignment\nof users, service principals, and groups to workspaces\nis called identity federation. All workspaces attached\nto a Unity Catalog metastore are enabled for identity\nfederation.\n\nSecurable objects in Unity Catalog are hierarchical,\nmeaning that granting a privilege on a catalog or schema\nautomatically grants the privilege to all current and\nfuture objects within the catalog or schema. For more\non granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\nA common scenario is to set up a schema per team\nwhere only that team has USE SCHEMA and CREATE on\nthe schema. This means that any tables produced by\nteam members can only be shared within the team.\nData Explorer uses the privileges configured by Unity\nCatalog administrators to ensure that users are only\nable to see catalogs, databases, tables and views that\nthey have permission to query.\n\n\n[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\nmany Unity Catalog features. 
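As a concrete illustration of the schema-per-team pattern described above, the sketch below grants a team group the USE SCHEMA and CREATE TABLE privileges on its own schema. The `main.team_sales` schema and `sales-team` group are placeholder names, and `spark` is assumed to be an active Databricks session.

```python
# Illustrative only: give one team its own schema and the privileges to work in it.
spark.sql("CREATE SCHEMA IF NOT EXISTS main.team_sales")
spark.sql("GRANT USE SCHEMA ON SCHEMA main.team_sales TO `sales-team`")
spark.sql("GRANT CREATE TABLE ON SCHEMA main.team_sales TO `sales-team`")

# Because privileges are inherited downward, anything granted (or revoked) at the
# schema level applies to all current and future tables inside it.
spark.sql("SHOW GRANTS ON SCHEMA main.team_sales").show(truncate=False)
```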
Use Data Explorer to view\nschema details, preview sample data, and see table\ndetails and properties. Administrators can view and\nchange owners. Admins and data object owners can grant\nand revoke permissions through this interface.\n\n**Set up secure access**\nIn Unity Catalog, data is secure by default. Initially, users\nhave no access to data in a metastore. Access can\nbe granted by either a metastore admin, the owner of\nan object, or the owner of the catalog or schema that\ncontains the object. Securable objects in Unity Catalog\nare hierarchical and privileges are inherited downward.\n\nUnity Catalog’s security model is based on standard ANSI\nSQL and allows administrators to grant permissions in\ntheir existing data lake using familiar syntax, at the level of\ncatalogs, databases (schema), tables and views. Privileges\nand metastores are shared across workspaces, allowing\nadministrators to set secure permissions once against\n\ngroups synced from identity providers and know that\nend users only have access to the proper data in any\nDatabricks workspace they enter.\n\n\n-----\n\n```\nCUSTOMER STORY: BUTCHERBOX\n\n### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n\n```\n\nAs a young e-commerce company,\n\n[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n\ncustomers’ needs change, which means it is\n\nconstantly considering behavioral patterns,\n\ndistribution center efficiency, a growing list of\n\nmarketing and communication channels, and\n\norder processing systems.\n\nThe meat and seafood subscription company\n\ncollects data on hundreds of thousands\n\nof subscribers. It deployed the Databricks\n\nLakehouse Platform to gain visibility across\n\nits diverse range of data systems and enable\n\nits analytics team to securely view and\n\nexport data in the formats needed.\n\nWith so much data feeding in from different\n\nsources — from email systems to its website\n\n— the data team at ButcherBox quickly\n\ndiscovered that data silos were a significant\n\n\n“We knew we needed to migrate from our legacy data warehouse\n\nenvironment to a data analytics platform that would unify our\n\ndata and make it easily accessible for quick analysis to improve\n\nsupply chain operations, forecast demand and, most importantly,\n\nkeep up with our growing customer base,” explained Jake Stone,\n\nSenior Manager, Business Analytics, at ButcherBox.\n\nThe platform allows analysts to share builds and iterate on a\n\nproject without getting into the code. Querying a table of 18\n\nbillion rows would have been problematic with a traditional\n\nplatform. With Databricks, ButcherBox can do it in three minutes.\n\n“Delta Lake provides us with a single source of truth for all of\n\nour data,” said Stone. 
“Now our data engineers are able to build\n\nreliable data pipelines that thread the needle on key topics such\n\nas inventory management, allowing us to identify in near real-\n\ntime what our trends are so we can figure out how to effectively\n\nmove inventory.”\n\n[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\nproblem because they blocked complete\n\nvisibility into critical insights needed to make\n\nstrategic and marketing decisions.\n\n\n-----\n\n**Set up secure data sharing**\nDatabricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\nto share data with other entities regardless of their\ncomputing platforms. Delta Sharing is integrated with\nUnity Catalog. Your data must be registered with Unity\nCatalog to manage, govern, audit and track usage of the\nshared data on the Lakehouse Platform. The primary\nconcepts of Delta Sharing are shares (read-only\ncollections of tables and table partitions to be shared)\nand recipients (objects that associate an organization\nwith a credential or secure sharing identifier).\n\nAs a data provider, you generate a token and share\nit securely with the recipient. They use the token to\nauthenticate and get read access to the tables you’ve\nincluded in the shares you’ve given them access\nto. Recipients access the shared data in read-only\nformat. Whenever the data provider updates data\ntables in their own Databricks account, the updates\nappear in near real-time in the recipient’s system.\n\n\n**Capture audit logs**\nUnity Catalog captures an audit log of actions\nperformed against the metastore. To access audit\nlogs for Unity Catalog events, you must enable and\nconfigure audit logs for your account. Audit logs for\neach workspace and account-level activities are\ndelivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n\n**View data lineage**\nYou can use Unity Catalog to capture runtime data\nlineage across queries in any language executed on\na Databricks cluster or SQL warehouse. Lineage can\nbe visualized in Data Explorer in near real-time and\nretrieved with the Databricks REST API. Lineage is\naggregated across all workspaces attached to Unity\nCatalog and captured down to the column level, and\nincludes notebooks, workflows and dashboards related\nto the query. To understand the requirements and how\nto capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n\n\nUnity Catalog Metastore\n\n\nCatalog\n\n\nData providers can use Databricks audit logging to\nmonitor the creation and modification of shares,\nand recipients can monitor recipient activity on\nshares. 
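The provider-side Delta Sharing objects described above can be expressed with a few SQL statements. This is only a sketch: the share, recipient and table names are placeholders, and how you deliver the activation token to the recipient depends on your sharing setup.

```python
# Sketch of creating a share, adding a table to it, and granting a recipient access.
spark.sql("CREATE SHARE IF NOT EXISTS quarterly_metrics")
spark.sql("ALTER SHARE quarterly_metrics ADD TABLE main.reporting.orders")

# For open sharing, the recipient's activation link delivers the token that the
# other organization uses to authenticate and read the shared tables.
spark.sql("CREATE RECIPIENT IF NOT EXISTS partner_org")
spark.sql("GRANT SELECT ON SHARE quarterly_metrics TO RECIPIENT partner_org")
```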
Data recipients who use shared data in a\nDatabricks account can use Databricks audit logging\nto understand who is accessing which data.\n\n\n-----\n\n###### Resources:\n\n- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n\n- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n\n- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n\n- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n\n- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n\n\n###### Key Takeaways\n\n- With the Databricks Lakehouse Platform, you can\nunify and simplify all your data on one platform\nto better scale and improve data storage and\nquery capabilities\n\n- The lakehouse helps reduce data infrastructure\nand compute costs. You don’t need excess\ndata copies and can retire expensive legacy\ninfrastructure.\n\n\nLeverage Delta Lake as the open format\nstorage layer to deliver reliability, security and\nperformance on your data lake — for both\nstreaming and batch operations — replacing\ndata silos with a single home for structured,\nsemi-structured and unstructured data\n\nWith Unity Catalog you can centralize\ngovernance for all data and AI assets including\nfiles, tables, machine learning models and\ndashboards in your lakehouse on any cloud\n\nThe Databricks Lakehouse Platform is open\nsource with multicloud flexibility so that you can\nuse your data however and wherever you want —\nno vendor lock-in\n\n\n-----\n\n# 02\n```\nCHALLENGE: \u0003\n\n## Build your data architecture to support scale and performance\n\n```\n\n-----\n\n```\nCHALLENGE 02\n\n### Build your data architecture to support scale and performance\n\n```\nAs modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\nthe increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\nsuddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\ncost-effective. 
The relational databases and traditional data warehouses that met the needs of the businesses once\nupon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n\nHere are some common challenges around managing data and performance at scale:\n\n\n**Volume and velocity** — Exponentially\nincreasing data sources, and the speed at\nwhich they capture and create data.\n\n**Latency requirements** — The demands of\ndownstream applications and users have\nevolved (people want data and the results\nfrom the data faster).\n\n\n**Governance** — Cataloging, auditing, securing and\nreporting on data is burdensome at scale when\nusing old systems not built with data access\ncontrols and compliance in mind.\n\n**Multicloud** is really hard.\n\n\n**Data storage** — Storing data in the wrong\nformat is slow to access, query and is\nexpensive at scale.\n\n\n**Data format** — Supporting structured, semistructured and unstructured data formats\nis now a requirement. Most data storage\nsolutions are designed to handle only one type\nof data, requiring multiple products\nto be stitched together.\n\n```\n02\n\n```\n\n-----\n\n###### Lakehouse solves scale and performance challenges\n\n\nThe solution for growing digital companies is a unified\nand simplified platform that can instantly scale up\ncapacity to deliver more computing power on demand,\nfreeing up teams to go after the much-needed data\nand produce outputs more quickly. With a lakehouse,\nthey can replace their data silos with a single home for\ntheir structured, semi-structured and unstructured\ndata. Users and applications throughout the enterprise\nenvironment can connect to the same single copy of\nthe data to drive diverse workloads.\n\nThe lakehouse architecture is cost-efficient for\nscaling, lowering the total cost of ownership for the\noverall infrastructure by consolidating all data estate\nand use cases onto a single platform and eliminating\nredundant licensing, infrastructure and administration\ncosts. Unlike other warehouse options that can only\nscale horizontally, the Databricks Lakehouse can scale\nhorizontally and vertically based on workload demands.\n\nWith the Databricks Lakehouse, you can optimize the\ncompute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\nresearch by the Barcelona Supercomputing Center.\nAnd your data teams are more productive by focusing\non more strategic initiatives versus managing multiple\ndata solutions.\n\n```\nCUSTOMER STORY: RIVIAN\n\n### Driving into the future of electric transportation\n\n```\n```\nCUSTOMER STORY: RIVIAN\n\n```\n\nWith more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n\nday, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n\nlegacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n\ndecreased output, prevented collaboration and increased operational costs. 
Rivian chose to modernize its data\n\ninfrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n\ndownstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n\nactionable insights for different use cases, from predictive maintenance to smarter product development.\n\n“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n\nperformant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n\nWassym Bensaid, Vice President of Software Development at Rivian.\n\nFor instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n\naccelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n\nroll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n\nconnected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n\nsmart features and the control that drivers have over them. By leveraging the Databricks Lakehouse Platform, Rivian\n\nhas seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n\n[Read the full story here.](https://www.databricks.com/customers/rivian)\n\n\n-----\n\n###### How to ensure scalability and performance with Databricks\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\nscalability and performance for your data architecture\nbased on the following features and capabilities:\n\n- A simplified and cost-efficient architecture that\nincreases productivity\n\n- A platform that ensures reliable, high performing\nETL workloads — for streaming and batch data\n— while Databricks automatically manages your\ninfrastructure\n\n- The ability to ingest, transform and query all your\ndata in one place, and scale on demand with\nserverless compute\n\n- Enables real-time data access for all data,\nanalytics and AI use cases\n\n\n-----\n\nThe following section will provide a short series of\nsteps for understanding the key components of the\nDatabricks Lakehouse Platform.\n\n\n**Step 2**\n**Understand the common Delta Lake operations**\nThe Databricks Lakehouse Platform simplifies the\nentire data lifecycle, from data ingestion to monitoring\nand governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\nopen-source storage system based on the Delta\nformat providing reliability through ACID transactions\nand scalable metadata handling. Large quantities of\nraw files in blob storage can be converted to Delta to\norganize and store the data cheaply. This allows for\nflexibility of data movement while being performant\nand less expensive.\n\n\n**Step 1**\n**Get a trial Databricks account**\nStart your 14-day free trial with Databricks on\nAWS in a few easy steps.\n[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . 
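As an illustration of the point in Step 2 above about converting raw files in blob storage to Delta, here is a minimal sketch. The S3 path and target table name are hypothetical, and `spark` is assumed to be available.

```python
# Hypothetical path: a directory of raw Parquet files landed in object storage.
raw_path = "s3://my-bucket/raw/events/"

# One-time, in-place conversion: keeps the files where they are and adds a Delta transaction log.
spark.sql(f"CONVERT TO DELTA parquet.`{raw_path}`")

# Alternatively, rewrite the raw files as a managed Delta table.
(spark.read.parquet(raw_path)
     .write.format("delta")
     .mode("overwrite")
     .saveAsTable("main.bronze.events"))
```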
During the 14day free trial, all Databricks usage is free, but Databricks\nuses compute and S3 storage resources in your cloud\nprovider account.\n\n\nand writing data can occur simultaneously without risk\nof many queries resulting in performance degradation\nor deadlock for business-critical workloads.\n\nThis means that users and applications throughout\nthe enterprise environment can connect to the same\nsingle copy of the data to drive diverse workloads, with\nall viewers guaranteed to receive the most current\nversion of the data at the time their query executes.\nWith performance features like indexing, Delta Lake\ncustomers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n[up to 48x faster.](https://www.databricks.com/customers/columbia)\n\n\n[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\nand learn how to create, manage and query tables.\nWith support for ACID transactions and schema\nenforcement, Delta Lake provides the reliability that\ntraditional data lakes lack. This enables you to scale\nreliable data insights throughout the organization and\nrun analytics and other data projects directly on your\ndata lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n\nDelta Lake transactions use log files stored alongside\ndata files to provide ACID guarantees at a table level.\nBecause the data and log files backing Delta Lake\ntables live together in cloud object storage, reading\n\n\n-----\n\nAll data in Delta Lake is stored in open Apache Parquet\nformat, allowing data to be read by any compatible\nreader. APIs are open and compatible with Apache\nSpark, so you have access to a vast open-source\necosystem to avoid data lock-in from proprietary\nformats and conversions, which have embedded and\nadded costs.\n\n###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n\n — Steve Pulec, Chief Technology Officer, YipitData\n\n[Learn more](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n**Step 3**\n**Ingest data efficiently at scale**\nWith a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\nfrom hundreds of data sources for analytics, AI and\nstreaming applications into one place.\n\nDatabricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\ndata ingestion. To ingest any file that can land in a data\nlake, Auto Loader incrementally and automatically\nprocesses new data files as they arrive in cloud storage\nin scheduled or continuous jobs. Auto Loader scales to\nsupport near real-time ingestion of millions of files\nper hour.\n\nFor pushing data in Delta Lake, the SQL command\n[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\ninto Delta Lake. COPY INTO is best used when the input\ndirectory contains thousands of files or fewer, and the\nuser prefers SQL. COPY INTO can be used over JDBC\nto push data into Delta Lake at your convenience.\n\n\n**Step 4**\n**Leverage production-ready tools**\n**to automate ETL pipelines**\nOnce the raw data is ingested, Databricks provides\na suite of production-ready tools that allow data\nprofessionals to quickly develop and deploy extract,\n\ntransform and load (ETL) pipelines. 
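A minimal sketch of the two ingestion options described in Step 3 follows, assuming placeholder S3 paths and a `main.bronze.orders` target table.

```python
# Auto Loader: incrementally pick up new files as they land in cloud storage.
(spark.readStream.format("cloudFiles")
     .option("cloudFiles.format", "json")
     .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/orders")
     .load("s3://my-bucket/landing/orders/")
     .writeStream
     .option("checkpointLocation", "s3://my-bucket/_checkpoints/orders")
     .toTable("main.bronze.orders"))

# COPY INTO: idempotent batch ingestion, a good fit when the input directory
# contains thousands of files or fewer and you prefer SQL.
spark.sql("""
  COPY INTO main.bronze.orders
  FROM 's3://my-bucket/landing/orders/'
  FILEFORMAT = JSON
""")
```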
Databricks SQL\nallows analysts to run SQL queries against the same\ntables used in production ETL workloads, allowing for\nreal-time business intelligence at scale.\n\nWith your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n[your first extract, transform and load (ETL) pipelines](https://docs.databricks.com/getting-started/etl-quick-start.html)\nfor data orchestration and learn how easy it is to create\na cluster, create a Databricks notebook, configure\n[Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for ingestion into [Delta Lake](https://docs.databricks.com/delta/index.html) , process and\ninteract with the data, and schedule a job.\n\n\nDatabricks supports workloads in SQL, Python, Scala\nand R, allowing users with diverse skill sets and\ntechnical backgrounds to leverage their knowledge\nto derive analytic insights. You can use all languages\nsupported by Databricks to define production jobs, and\nnotebooks can leverage a combination of languages.\n\nThis means that you can promote queries written by\nSQL analysts for last-mile ETL into production data\nengineering code with almost no effort. Queries and\nworkloads defined by personas across the organization\nleverage the same data sets, so there’s no need to\nreconcile field names or make sure dashboards are up\nto date before sharing code and results with\nother teams.\n\n\n-----\n\nWith [Delta Live Tables](https://www.databricks.com/product/delta-live-tables) (DLT), data professionals have\na framework that uses a simple declarative approach\nto build ETL and ML pipelines on batch or streaming\ndata while automating operational complexities such\nas infrastructure management, task orchestration,\nerror handling and recovery, retries, and performance\noptimization.\n\nDelta Live Tables extends functionality in Apache Spark\nStructured Streaming and allows you to write just a\nfew lines of declarative Python or SQL to deploy a\nproduction-quality data pipeline with:\n\n- [Autoscaling compute infrastructure](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-concepts.html#auto-scaling) for cost savings\n\n- Data quality checks with [expectations](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-expectations.html)\n\n- Automatic [schema evolution](https://docs.databricks.com/ingestion/auto-loader/schema.html) handling\n\n- Monitoring via metrics in the [event log](https://docs.databricks.com/workflows/delta-live-tables/delta-live-tables-event-log.html)\n\nWith DLT, engineers can also treat their data as code\nand apply software engineering best practices like\ntesting, monitoring and documentation to deploy\nreliable pipelines at scale. You can easily define end-toend data pipelines in SQL or Python and automatically\nmaintain all data dependencies across the pipeline and\nreuse ETL pipelines with environment-independent\ndata management.\n\n```\nCUSTOMER STORY: ABNORMAL SECURITY\n\n### Stopping sophisticated ransomware in its tracks\n\n```\n```\nCUSTOMER STORY: ABNORMAL SECURITY\n\n```\n\nThe increase in email phishing and ransomware attacks requires the type of protection that can scale and evolve\n\nto meet the challenges of modern cyberattacks. 
[Abnormal Security](https://abnormalsecurity.com/) , a cloud-native email security provider, knew\n\nthat scalability would become a major focus to stay ahead of attack strategies with frequent product updates.\n\nAbnormal also required a data analytics infrastructure robust enough to meet the scale requirements for its data\n\npipelines and constantly refined ML models.\n\n“We were spending too much time managing our Spark infrastructure,” said Carlos Gasperi, Software Engineer at\n\nAbnormal Security. “What we needed to be doing with that time was building the pipelines that would make the\n\nproduct better.”\n\nThe company implemented the Databricks Lakehouse Platform, which simplified its data architecture and\n\nmaximized the performance of data pipelines and analytics. Data practitioners are now able to ingest data\n\ndirectly from S3 and query it in near real-time with the help of Delta Lake, an open-format storage layer that\n\ndelivers reliability, security and performance on the data lake for both streaming and batch operations. With\n\nDatabricks SQL, data scientists are then able to create visualizations using rich dashboards to drive product\n\ndecisions and improve detection efficacy.\n\nDatabricks also provided the collaborative environment that Abnormal’s data teams needed to increase their\n\nproductivity and work in the same space without constantly competing for compute resources.\n\nWith Databricks, Abnormal has seen a 20% reduction in successful email attacks, a 40% reduction in\n\ninfrastructure costs and a 30% increase in productivity. [Read the full story here.](https://www.databricks.com/customers/abnormal)\n\n\n-----\n\nDelta Live Tables Enhanced Autoscaling is designed to handle streaming workloads\nthat trigger intermittently and are unpredictable. It optimizes cluster utilization\nby only scaling up to the necessary number of nodes while maintaining endto-end SLAs, and gracefully shuts down nodes when utilization is low to avoid\nunnecessary idle node capacity.\n\n\nDelta Live Tables helps prevent bad data from flowing into tables through validation,\nintegrity checks and predefined error policies. In addition, you can monitor data\n\nquality trends over time to get insight into how your data is evolving and where\nchanges may be necessary.\n\n\n-----\n\n**Step 5**\n**Use Databricks SQL for serverless compute**\n[Databricks SQL (DB SQL)](https://www.databricks.com/product/databricks-sql) is a serverless data\nwarehouse on the Lakehouse Platform for running your\nSQL and BI applications at scale with up to 12x better\nprice/performance. It’s imperative for younger, growing\ncompanies to reduce resource contention, and one way\nto accomplish that is with serverless compute. Running\nserverless removes the need to manage, configure or\nscale cloud infrastructure on the lakehouse, freeing up\nyour data team for what they do best.\n\n\nSee for yourself in this tutorial on [how to run and visualize](https://docs.databricks.com/sql/get-started/user-quickstart.html)\n[a query in Databrick SQL](https://docs.databricks.com/sql/get-started/user-quickstart.html) and create dashboards on data\nstored in your data lake.\n\nThe Databricks SQL REST API supports services to\nmanage queries and dashboards, query history and SQL\nwarehouses.\n\n\nDatabricks SQL warehouses provide instant, elastic\nSQL compute — decoupled from storage — and will\nautomatically scale to provide unlimited concurrency\nwithout disruption, for high concurrency use cases. 
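Returning to the Delta Live Tables pattern from Step 4, the sketch below shows the declarative style: a streaming bronze table fed by Auto Loader and a cleaned table guarded by an expectation. It is illustrative only, would run as part of a DLT pipeline rather than as a plain notebook cell, and the source path is a placeholder.

```python
import dlt
from pyspark.sql.functions import col

@dlt.table(comment="Raw orders ingested with Auto Loader")
def orders_raw():
    return (spark.readStream.format("cloudFiles")
            .option("cloudFiles.format", "json")
            .load("s3://my-bucket/landing/orders/"))

# The expectation drops rows that fail the quality check and records the metric in the event log.
@dlt.table(comment="Orders with basic quality checks applied")
@dlt.expect_or_drop("valid_amount", "amount > 0")
def orders_clean():
    return dlt.read_stream("orders_raw").where(col("customer_id").isNotNull())
```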
DB\nSQL has data governance and security built in. Handle\nhigh concurrency with fully managed load balancing\nand scaling of compute resources.\n\n\n-----\n\n**Faster queries with Photon**\n[Photon](https://www.databricks.com/product/photon) is a new vectorized query engine designed\nto deliver dramatic infrastructure cost savings and\naccelerate all data and analytics workloads: data\ningestion, ETL, streaming, interactive queries, data\nscience and machine learning.\n\nPhoton is used by default in Databricks SQL. To\nenable Photon acceleration, select the **Use Photon**\n**Acceleration** checkbox when you create the cluster.\nIf you [create the cluster](https://docs.databricks.com/clusters/configure.html#photon-image) using [the clusters API](https://docs.databricks.com/dev-tools/api/latest/clusters.html) ,\nset runtime_engine to PHOTON.\n\nPhoton supports a number of instance types on\nthe driver and worker nodes. Photon instance types\nconsume DBUs at a different rate than the same\ninstance type running the non-Photon runtime. For\nmore information about Photon instances and DBU\nconsumption, see the [Databricks pricing page.](https://www.databricks.com/product/pricing/product-pricing/instance-types)\n\nPhoton will seamlessly coordinate work and resources\nand transparently accelerate portions of your SQL and\nSpark queries. No tuning or user intervention required.\nPhoton is compatible with Apache Spark APIs, so\ngetting started is as easy as turning it on — no code\nchange and no lock- in. Written entirely in C++, Photon\nprovides an additional [2x speedup over Apache Spark](https://www.databricks.com/product/photon)\nper the TPC-DS 1TB benchmark, and customers have\nobserved 3x–8x speedups on average.\n\n\nWith Photon, typical customers are seeing up to [80% TCO savings](https://www.databricks.com/blog/2022/08/03/announcing-photon-engine-general-availability-on-the-databricks-lakehouse-platform.html#:~:text=Up%20to%2080%25%20TCO%20cost%20savings%20%2830%25%20on,Photon%203-8x%20faster%20queries%20on%20interactive%20SQL%20workloads) over traditional\nDatabricks Runtime (Apache Spark) and up to 85% reduction in VM compute hours.\n\nLearn how to connect BI tools to Databricks SQL\ncompute resources with the following user guides:\n\n\n[Queries](https://docs.databricks.com/sql/user/queries/index.html)\n\n[Visualizations](https://docs.databricks.com/sql/user/visualizations/index.html)\n\n\n[Favorites and tags](https://docs.databricks.com/sql/user/favorites-tags.html)\n\n[Workspace browser](https://docs.databricks.com/sql/user/workspace-browser/index.html)\n\n\n[Dashboards](https://docs.databricks.com/sql/user/dashboards/index.html)\n\n[Alerts](https://docs.databricks.com/sql/user/alerts/index.html)\n\n\n-----\n\n**Step 6**\n**Orchestrate workflows**\nDatabricks provides a comprehensive suite of tools and integrations to support your\ndata processing workflows.\n\nDatabricks [Workflows](https://www.databricks.com/product/workflows) removes operational overhead by offering fully managed\norchestration service for all your teams, so you can focus on your workflows, not on\nmanaging your infrastructure. Orchestrate diverse workloads for the full lifecycle\nincluding Delta Live Tables, [Jobs](https://docs.databricks.com/workflows/index.html) for SQL, [Spark](https://www.databricks.com/product/spark) , notebooks, dbt, ML models and more.\n\nHere’s a tutorial on how to [create your first workflow with a Databricks job](https://docs.databricks.com/workflows/jobs/jobs-quickstart.html) . 
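As a sketch of what the workflow tutorial walks through, the snippet below creates a simple notebook job through the Jobs API and, tying back to the Photon note above, sets `runtime_engine` to `PHOTON` on the job cluster. The workspace URL, token, notebook path, node type and runtime version are all placeholders you would adjust for your environment.

```python
import requests

host = "https://<your-workspace>.cloud.databricks.com"   # placeholder
token = "<personal-access-token>"                        # placeholder

job_spec = {
    "name": "nightly-etl",
    "tasks": [
        {
            "task_key": "ingest",
            "notebook_task": {"notebook_path": "/Repos/demo/etl_notebook"},
            "new_cluster": {
                "spark_version": "13.3.x-scala2.12",
                "node_type_id": "i3.xlarge",
                "num_workers": 2,
                "runtime_engine": "PHOTON",  # the Photon setting called out above
            },
        }
    ],
    "schedule": {"quartz_cron_expression": "0 0 2 * * ?", "timezone_id": "UTC"},
}

resp = requests.post(
    f"{host}/api/2.1/jobs/create",
    headers={"Authorization": f"Bearer {token}"},
    json=job_spec,
)
print(resp.json())  # returns the new job_id on success
```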
You will\nlearn how to create notebooks, create and run a job, view the run details, and run jobs\nwith different parameters.\n\n\n-----\n\n**Step 7**\n**Run an end-to-end analytics pipeline**\nThis where you can see how everything works together to run efficiently at scale. First\ntake the quickstart: [Running end-to-end lakehouse analytics pipelines](https://docs.databricks.com/getting-started/lakehouse-e2e.html) , where you\nwill write to and read data from an external location managed by Unity Catalog and\nconfigure Auto Loader to ingest data to Unity Catalog.\n\n###### Resources:\n\n- [Databricks Lakehouse free trial](https://www.databricks.com/try-databricks?itm_data=DataLakehouse-HeroCTA-Trial#account)\n\n- [The Lakehouse for companies born in the cloud](https://www.databricks.com/solutions/audience/digital-native)\n\n- [How DuPont achieved 11x latency reduction and 4x cost reduction with Photon](https://www.databricks.com/blog/2022/10/04/how-dupont-achieved-11x-latency-reduction-and-4x-cost-reduction-photon.html)\n\n- [Apache Spark on Databricks](https://docs.databricks.com/spark/index.html)\n\n- [Discover Lakehouse solutions](https://www.databricks.com/solutions)\n\n- [Databricks documentation](https://docs.databricks.com/)\n\n\n###### “Databricks Workflows allows our analysts to easily create, run, monitor and repair data pipelines without managing any infrastructure. This enables them to have full autonomy in designing and improving ETL processes that produce must-have insights for our clients. We are excited to move our Airflow pipelines over to Databricks Workflows.”\n —Anup Segu, Senior Software Engineer, YipitData\n\n[Learn more.](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n# 03\n```\nCHALLENGE: \u0003\n\n## Building effective machine-learning operations\n\n```\n\n-----\n\n```\nCHALLENGE 03\n\n### Building effective machine-learning operations\n\n```\nGrowing startups and digital native companies face several challenges when they\nstart building, maintaining and scaling machine learning operations (MLOps) for their\ndata science teams.\n\n\nMLOps is different from DevOps. DevOps practices\nand tooling alone are insufficient because ML\napplications rely on an assortment of artifacts (e.g.,\nmodels, data, code) that can each require different\nmethods of experiment tracking, model training,\nfeature development, governance, feature and\nmodel serving.\n\nFor data teams beginning their machine learning\njourneys, the challenge of training data models can\nbe labor-intensive and not cost-effective because\nthe data has to be converted into features and\n\ntrained on a separate machine learning platform\n\n\nData teams often perform development in\ndisjointed, siloed stacks spanning DataOps,\nModelOps and DevOps\n\nDevelopment and training environment\ndisconnect. Moving code and data between\npersonal development environments and\nmachine learning platforms for model training\nat scale is error prone and cumbersome. The\n“it worked on my machine” problem.\n\nGathering high-quality data. Data that is siloed\nacross the organization is hard to discover,\ncollect, clean and use. This leads to stale data\nand delays in development of models.\n\n\nSee **Create a unified data architecture.**\n```\n 03\n\n```\n\n-----\n\n###### Siloed stacks spanning DataOps, ModelOps and DevOps\n\nWhen data engineers help ingest, refine and prep\ndata, they do so on their own stack. 
This data has\nto be converted into features and then trained on\na separate machine learning platform. This cross-\nplatform handoff often results in data staleness,\ndifficulty in maintaining versions, and eventually,\npoorly performing models. Even after you have\ntrained your model, you have to deal with yet another\ntech stack for model deployment. It’s challenging\nto serve features in real time and difficult to trace\nproblems in production back to the data.\n\nThe downstream business impact is massive —\nlonger and more expensive projects, and lower\nmodel accuracy in production leading to declining\nbusiness metrics.\n\nIf you are looking at launching or scaling your\nMLOps, you should probably focus on an incremental\nstrategy. At Databricks, we see firsthand how\ncustomers develop their MLOps approaches across\na huge variety of teams and businesses. [Check out](https://www.youtube.com/watch?v=JApPzAnbfPI)\n[this Data +AI Summit session](https://www.youtube.com/watch?v=JApPzAnbfPI) to learn more about\nbuilding robust MLOps practices.\n\n\n###### Databricks solution:\n\nDatabricks Machine Learning is an integrated\nend-to-end machine learning environment\nincorporating managed services for experiment\ntracking, model training, feature development and\nmanagement, and model serving. The capabilities\nof Databricks map directly to the steps of model\ndevelopment and deployment. With Databricks\nMachine Learning, you can:\n\n\nTrain models either manually or with AutoML\n\nTrack training parameters and models using\nexperiments with MLflow tracking\n\nCreate feature tables and access them for model\ntraining and inference\n\nShare, manage and serve models using MLflow\nModel Registry\n\nDeploy models for Serverless Real-time Inference\n\n\n-----\n\n###### Use MLOps on the Databricks Lakehouse Platform\n\nTo gain efficiencies and reduce costs, many smaller\ndigital companies are employing machine learning\noperations. MLOps is a set of processes and\nautomation for managing models, data and code, and\nunique library dependencies to improve performance\nstability and long-term efficiency in ML systems.\n\nTo describe it simply, MLOps = ModelOps + DataOps +\nDevOps. The aim of MLOps is to improve the long-term\nperformance, stability and success rate of ML systems\nwhile maximizing the efficiency of the teams who\nbuild them.\n\n\nNot only does MLOps improve organizational efficiency,\nit also allows the models to iterate faster and react\nto real-life changes in the data. This ability separates\ncompanies that can grow to meet their customer’s\nchallenges in a reactive manner versus those that will\nspend significant time on data updates/processes and\nmiss the opportunity to do something with\ntheir models.\n\nThe absence of MLOps is typically marked by an\noverabundance of manual processes which are slower\n\n\nand more prone to error, affecting the quality of models, data and code. Eventually they form a bottleneck,\ncapping the ability for a data team to take on new projects. The process is complex. In larger organizations,\nseveral specialists and stakeholders can be involved in one ML project. But data practitioners at smaller digital\nnatives and high-growth startups may be forced to wear several hats.\n\n\n-----\n\nAnd once an ML project goes into production, the\nMLOps continues, since the models, data and code\nchange over time due to regulatory and business\nrequirements. But the ML system must be resilient and\nflexible. 
Addressing these challenges with a defined\nMLOps strategy can dramatically reduce the iteration\ncycle of delivering models to production.\n\n\n-----\n\n###### Steps in machine learning model development and deployment:\n\n\n**Step 1**\n**Data preparation**\nManually preparing and labeling data is a thankless,\ntime-consuming job. With Databricks, teams can\nlabel data with human effort, machine learning\nmodels in Databricks, or a combination of both.\nTeams can also employ a [model-assisted labeling](https://labelbox.com/product/automation )\nworkflow that allows humans to easily inspect and\ncorrect a model’s predicted labels. This process can\ndrastically reduce the amount of unstructured data\nyou need to achieve strong model performance.\n\nThe [Databricks Runtime for Machine Learning](https://docs.databricks.com/runtime/mlruntime.html) is a\nready-to-go environment with many external\nlibraries, including TensorFlow, PyTorch, Horovod,\nscikit-learn and XGBoost. It provides\nextensions to improve performance, including GPU\nacceleration in XGBoost, distributed deep\nlearning using HorovodRunner, and model\ncheckpointing.\n\nTo use Databricks Runtime ML, select the ML version\nof the runtime when you [create your cluster](https://docs.databricks.com/clusters/index.html) . To\naccess data in Unity Catalog for machine learning\nworkflows, you must use a [single user cluster](https://docs.databricks.com/data-governance/unity-catalog/compute.html) . User\nisolation clusters are not compatible with Databricks\nRuntime for Machine Learning.\n\n\nMachine learning applications often\nneed to use shared storage for data\nloading and model checkpointing. You\ncan load tabular data from [tables](https://docs.databricks.com/lakehouse/data-objects.html#table) or\nfiles. A table is a collection of\nstructured data stored as a directory\non cloud object storage.\n\nFor [data preprocessing](https://docs.databricks.com/machine-learning/preprocess-data/index.html) , you can\nuse [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) to create\nnew features, explore and reuse\nexisting features, track lineage and\nfeature creation code, and publish\nfeatures to low-latency online stores\nfor real-time inference. The Feature\nStore is a centralized repository\nthat enables data scientists to find\nand share features. It ensures that\nthe same code used to compute\nthe feature values is used for model\ntraining and inference. 
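A brief sketch of the Feature Store workflow described above, assuming a `customer_features_df` DataFrame keyed by `customer_id` and a placeholder feature table name:

```python
from databricks.feature_store import FeatureStoreClient

fs = FeatureStoreClient()

# `customer_features_df` is assumed to exist and be keyed by `customer_id`.
fs.create_table(
    name="main.ml.customer_features",   # placeholder feature table name
    primary_keys=["customer_id"],
    df=customer_features_df,
    description="Aggregated customer features for churn models",
)

# Reusing the same feature definitions at training and inference time is what
# keeps offline and online feature values consistent.
features = fs.read_table("main.ml.customer_features")
```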
The Feature\nStore library is available only on\nDatabricks Runtime for Machine\nLearning and is accessible through\nDatabricks notebooks and workflows.\n\n\n###### Resources:\n\n- [The Comprehensive Guide to Feature Stores](https://www.databricks.com/resources/ebook/the-comprehensive-guide-to-feature-stores)\n\n- [Load data for machine learning and deep learning](https://docs.databricks.com/machine-learning/load-data/index.html)\n\n- [Preprocess data for machine learning and](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n[deep learning](https://docs.databricks.com/machine-learning/preprocess-data/index.html)\n\n\n-----\n\nC `USTOMER STORY: ZIPLINE`\n\n### Data-driven drones deliver lifesaving medical aid around the world\n\n\nAutomated logistics and delivery system\n\nprovider [Zipline](https://www.flyzipline.com/ ) is redefining logistics by using\n\ncutting-edge drone technology and a global\n\nautonomous logistics network to save lives\n\n\ninformation they need to accurately measure success, find\n\nthe metrics that relate to customer experiences or logistics,\n\nand improve on them exponentially as more data is ingested\n\nand machine learning models are refined.\n\n\nby giving remote communities access to\n\n\nemergency and preparatory medical aid and\n\nresources, regardless of where they are in the\n\nworld.\n\nDoing so requires the ability to ingest and\n\nanalyze huge chunks of time series data in real\n\ntime. This data is produced every time a drone\n\ntakes flight and includes performance data,\n\nin-flight battery management, regional weather\n\npatterns, geographic obstacles, landing errors\n\nand a litany of other information that must be\n\nprocessed.\n\n\n“About 30% of the deliveries we do are lifesaving emergency\n\ndeliveries, where the product being delivered does not exist\n\nat the hospital. We have to be fast, and we have to be able\n\nto rely on all the different kinds of data to predict failures\n\nbefore they occur so that we can guarantee a really, really\n\nhigh service level to the people who are literally depending\n\non us with their lives,” said Zipline CEO Keller Rinaudo.\n\n“Databricks gives us confidence in our operations, and\n\nenables us to continuously improve our technology, expand\n\nour impact, and provide lifesaving aid where and when it’s\n\nneeded, every single day.”\n\n[Read full story here.](https://www.databricks.com/customers/zipline)\n\n\nEvery Zipline flight generates a gigabyte of data\n\nwith potential life-or-death consequences,\n\nbut accessing and federating the data for both\n\ninternal and external decision-making was\n\nchallenging. With Databricks as the common\n\nplatform, Zipline’s data team can access all the\n\n\n-----\n\n**Step 2**\n**Model training**\nFor training machine learning and deep learning\nmodels, you can use [AutoML](https://docs.databricks.com/machine-learning/automl/index.html) , which automatically\nprepares a data set for model training, performs a set\nof trials using open-source libraries such as scikit-learn\nand XGBoost, and creates a Python notebook with\nthe source code for each trial run so you can review,\nreproduce and modify the code.\n\nIn Databricks, [notebooks](https://docs.databricks.com/notebooks/index.html) are the primary tool for\ncreating data science and machine learning workflows\nand collaborating with colleagues. 
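Before moving on, here is a minimal sketch of the AutoML entry point mentioned above. The `training_df` DataFrame and the `churned` target column are assumptions for the example.

```python
from databricks import automl

# Runs a set of trials and returns a summary object describing the best one.
summary = automl.classify(
    dataset=training_df,
    target_col="churned",
    timeout_minutes=30,
)

# Each trial produces a generated notebook and an MLflow run you can inspect and modify.
print(summary.best_trial.notebook_url)
print(summary.best_trial.model_path)
```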
Databricks\nnotebooks provide real-time coauthoring in multiple\nlanguages, automatic versioning and built-in data\nvisualizations.\n\n\n###### Resources:\n\n- [Model training examples](https://docs.databricks.com/machine-learning/train-model/index.html)\n\n- [Training models with Feature Store](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n\n- [Best practices for deep learning on Databricks](https://docs.databricks.com/machine-learning/feature-store/train-models-with-feature-store.html)\n\n- [Machine learning quickstart notebook](https://docs.databricks.com/machine-learning/train-model/ml-quickstart.html)\n\n\n-----\n\n###### Resources:\n\n- [MLflow quickstart (Python)](https://docs.databricks.com/_extras/notebooks/source/mlflow/mlflow-quick-start-python.html)\n\n- [Track machine learning training runs](https://docs.databricks.com/mlflow/tracking.html)\n\n- [Automatically log training runs to MLflow](https://docs.databricks.com/mlflow/quick-start-python.html#automatically-log-training-runs-to-mlflow)\n\n- [Track ML Model training data with Delta Lake](https://docs.databricks.com/mlflow/tracking-ex-delta.html)\n\n- [Log, load, register, and deploy MLflow models](https://docs.databricks.com/mlflow/models.html)\n\n\n**Step 3**\n**Track model development**\nThe model development process is iterative, and can\nbe challenging. You can use [MLflow tracking](https://mlflow.org/docs/latest/tracking.html) to help\nyou keep track of the model development process,\nincluding parameter settings or combinations you have\ntried and how they affected the model’s performance.\n\nMLflow tracking uses experiments and runs to log\nand track your model development. A run is a single\nexecution of model code. An experiment is a collection\nof related runs. Within an experiment, you can compare\nand filter runs to understand how your model performs\nand how its performance depends on the parameter\nsettings, input data, etc.\n\nMLflow can automatically log training code written\nin many ML frameworks. This is the easiest way to\nget started using MLflow tracking. With MLflow’s\nautologging capabilities, a single line of code\nautomatically logs the resulting model.\n\n\nA hosted version of MLflow Model Registry can help\n[manage the full lifecycle](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html) of MLflow models. You can\napply webhooks to automatically trigger actions based\non registry events. For example, you can trigger CI\nbuilds when a new model version is created or notify\nyour team members through Slack each time a model\ntransition to production is requested. This promotes\na traceable version control work process. You can\nleverage this feature for web traffic A/B testing and\nfunneled to versions of deployed models for more\nprecise population studies.\n\n\n**Step 4**\n**Deploy machine learning models**\nYou can use MLflow to deploy models for batch or\nstreaming inference or to set up a REST endpoint to\nserve the model. Simplify your model deployment by\nregistering models to [the MLflow Model Registry](https://docs.databricks.com/mlflow/model-registry.html) . 
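The autologging and registration flow described in Steps 3 and 4 can be sketched in a few lines. The scikit-learn model, synthetic training data and the `churn_classifier` registry name are placeholders for illustration.

```python
import mlflow
from sklearn.datasets import make_classification
from sklearn.ensemble import RandomForestClassifier

# A single line turns on autologging of parameters, metrics and the model artifact.
mlflow.autolog()

X_train, y_train = make_classification(n_samples=500, n_features=10, random_state=42)

with mlflow.start_run() as run:
    model = RandomForestClassifier(n_estimators=100)
    model.fit(X_train, y_train)

# Register the autologged model so its lifecycle can be managed in the Model Registry.
mlflow.register_model(f"runs:/{run.info.run_id}/model", "churn_classifier")
```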
After\nyou have registered your model, you can [automatically](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb)\n[generate a notebook](https://docs.databricks.com/machine-learning/manage-model-lifecycle/index.html#generate-inference-nb) for batch inference or configure\nthe model for online serving with Serverless RealTime Inference or [Classic MLflow Model Serving on](https://docs.databricks.com/archive/classic-model-serving/model-serving.html)\n\n[Databricks](https://docs.databricks.com/archive/classic-model-serving/model-serving.html) . For model inference for deep learning\napplications, Databricks recommends the following\nworkflow.\n\nTo debug and tune model inference on Databricks,\nusing GPUs (graphics processing units) can efficiently\noptimize the running speed for model inference. As\nGPUs and other accelerators become faster, it is\nimportant that the data input pipeline keep up with\ndemand. The data input pipeline reads the data into\nSpark DataFrames, transforms it and loads it as the\ninput for model inference.\n\n\n-----\n\n```\nCUSTOMER STORY: ITERABLE\n\n### Optimizing touch points across the entire customer journey\n\n```\n“With Databricks Lakehouse, we can efficiently deploy powerful ML and AI solutions to help our customers meet\n\nrising consumer demands for more personalized experiences that drive revenue and results.” —Sinéad Cheung,\n\nPrincipal Product Manager, [Iterable](https://iterable.com/)\n\nCaptivating an audience and understanding customer journeys are essential to creating deeper brand- customer\n\nconnections that drive growth, loyalty and revenue. From helping medical practitioners build trust with new\n\npatients to ensuring that food delivery users feel connected to their culinary community, Iterable helps more\n\nthan 1,000 brands optimize and humanize their marketing in today’s competitive landscape.\n\nThis need to build personalized and automated customer experiences for its clients drove the company to find a\n\nfully managed platform that would simplify infrastructure management, make collaboration possible, and give it\n\nthe ability to scale for analytics and AI.\n\nWith Databricks Lakehouse, Iterable can harness diverse, complex data sets — including conversion events,\n\nunique user labels, engagement patterns and business insights — and facilitate rapid prototyping of machine\n\nlearning models that deliver top-notch and personalized user experiences for higher-converting marketing\n\ncampaigns. [Read the full story here.](https://www.databricks.com/customers/iterable)\n\n\n-----\n\n###### ML Stages\n\nML workflows include the following key assets: code,\nmodels and data. These assets need to be developed\n(dev), tested (staging) and deployed (production).\nEach stage needs to operate within an execution\nenvironment. So the execution environments, code,\nmodels and data are divided into dev, staging and\nproduction.\n\nML project code is often stored in a version control\nrepository (such as Git), with most organizations using\nbranches corresponding to the lifecycle phases of\ndevelopment, staging or production.\n\nSince model lifecycles do not correspond one-toone with code lifecycles, it makes sense for model\nmanagement to have its own service. MLflow and its\nModel Registry support managing model artifacts\ndirectly via UI and APIs. 
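For the batch inference path mentioned above, a registered model can be applied to a Delta table as a Spark UDF. This is a sketch only; the model name, stage and table names are placeholders, and `spark` is assumed to be available.

```python
import mlflow
from pyspark.sql.functions import struct

# Load the Production version of a registered model as a Spark UDF.
predict_udf = mlflow.pyfunc.spark_udf(spark, "models:/churn_classifier/Production")

# Score a Delta table in batch and write the predictions back out.
scored = (spark.table("main.ml.customer_features")
          .withColumn("prediction", predict_udf(struct("*"))))
scored.write.format("delta").mode("overwrite").saveAsTable("main.ml.customer_predictions")
```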
The loose coupling of model\nartifacts and code provides flexibility to update\nproduction models without code changes, streamlining\nthe deployment process in many cases.\n\nDatabricks recommends creating separate\nenvironments for the different stages of ML code and\nmodel development with clearly defined transitions\nbetween stages. The recommended MLOps workflow is\nbroken into these three stages:\n\n\n[Development](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#development-stage) — The focus of the development stage\nis experimentation. Data scientists develop features\nand models and run experiments to optimize model\nperformance. The output of the development process is\nML pipeline code that can include feature computation,\nmodel training inference and monitoring\n\n\n-----\n\n[Staging](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#staging-stage)\nThis stage focuses on testing the ML pipeline code\nfor production readiness, including code for model\ntraining as well as feature engineering pipelines and\ninference code. The output of the staging process is a\nrelease branch that triggers the CI/CD system to start\nthe production stage.\n\n\n-----\n\n[Production](https://docs.databricks.com/machine-learning/mlops/mlops-workflow.html#production-stage)\nML engineers own the production environment\nwhere ML pipelines are deployed. These pipelines\ncompute fresh feature values, train and test new model\nversions, publish predictions to downstream tables\nor applications, and monitor the entire process to\navoid performance degradation and instability. Data\nscientists have visibility to test results, logs, model\nartifacts and production pipeline status to allow them\nto identify and diagnose problems in production.\n\nThe Databricks Machine Learning home page provides\nquick access to all the machine learning resources. To\naccess this page, move your mouse or pointer over\nthe left sidebar in the Databricks workspace. 
From\nthe persona switcher at the top of the sidebar, select\n\nMachine Learning.\n\nFrom the shortcuts menu, you can create\na [notebook](https://docs.databricks.com/notebooks/index.html) , [start AutoML](https://docs.databricks.com/machine-learning/automl/index.html) or open a [tutorial notebook](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html) .\nThe center of the screen includes any recently viewed\nitems, and the sidebar provides quick access to\nthe [Experiments page](https://docs.databricks.com/mlflow/tracking.html#mlflow-experiments) , [Databricks Feature Store](https://docs.databricks.com/machine-learning/feature-store/index.html) and\n[Model Registry.](https://docs.databricks.com/mlflow/model-registry.html)\nNew users can get started with a series of [tutorials](https://docs.databricks.com/machine-learning/tutorial/index.html)\nthat illustrate how to use Databricks throughout the\n\n\n-----\n\n###### Resources:\n\n- [MLOps Virtual Event: Standardizing MLOps at Scale](https://www.databricks.com/p/webinar/mlops-virtual-event)\n\n- [Virtual Event — Automating the ML Lifecycle With](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n[Databricks Machine Learning](https://www.databricks.com/p/webinar/automating-the-ml-lifecycle-with-databricks-machine-learning?itm_data=product-resources-automatingMLlifecycle)\n\n- [MLOps Virtual Event “Operationalizing Machine](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n[Learning at Scale”](https://www.databricks.com/p/webinar/operationalizing-machine-learning-at-scale)\n\n- [The Big Book of MLOps](https://www.databricks.com/p/ebook/the-big-book-of-mlops)\n\n- [Machine learning on Databricks](https://www.databricks.com/product/machine-learning)\n\n- [Watch the demos](https://www.databricks.com/discover/demos)\n\n\nML lifecycle or access the [in-product quickstart](https://docs.databricks.com/machine-learning/tutorial/ml-quickstart.html)\nfor a model-training tutorial notebook that steps\nthrough loading data, training and tuning a model,\ncomparing and analyzing model performance and\nusing the model for inference.\n\nAlso be sure to download the [Big Book of MLOps](https://www.databricks.com/p/thank-you/the-big-book-of-mlops) to\nlearn how your organization can build a robust MLOPs\npractice incrementally.\n\n\n-----\n\n# 04\n```\nSUMMARY: \u0003\n\n## The Databricks Lakehouse Platform addresses these challenges\n 04\n\n```\n\n-----\n\n### Summary\n\nWe’ve organized the common data challenges for startups and growing digital native\n\nbusinesses into three main buckets: Building a **unified data architecture** — one that\n\nsupports **scalability and performance** ; and building effective **machine learning**\n\n**operations** , all with an eye on cost efficiency and increased productivity.\n\nThe Lakehouse Platform provides an efficient and scalable architecture that solves\nthese challenges and will support your data, analytics and AI workloads now and as\nyou scale.\n\nWith [Databricks](https://www.databricks.com/) you can unify all your data with cost-efficient architecture for highly\nperformant digital native applications and analytic workloads — designed to scale as\nyou grow. Use your data however and wherever you want with open-source flexibility,\nleverage open formats, APIs and your tools of choice. 
Ensure reliable, high-performing\ndata workloads while Databricks automatically manages your infrastructure as you\nscale. Leverage serverless Databricks SQL to increase productivity and scale on\ndemand with up to 12x better price/performance.\n\nEasily access data for ML models and accelerate the full ML lifecycle from\nexperimentation to production.\n\nDiscover more about the lakehouse for companies born in the cloud.\n\n\n-----\n\n### Get started with Databricks Trial\n\nGet a collaborative environment for data teams to build\nsolutions together with interactive notebooks to use\nApache Spark™, SQL, Python, Scala, Delta Lake, MLflow,\nTensorFlow, Keras, scikit-learn and more.\n\n- Available as a 14-day full trial in your own cloud or as\na lightweight trial hosted by Databricks.\n\n**[TRY DATABRICKS FOR FREE](https://www.databricks.com/try-databricks?itm_data=H#account)**\n\n\n### About Databricks\n\nDatabricks is the lakehouse company. More than 7,000\norganizations worldwide — including Comcast, Condé\nNast and over 50% of the Fortune 500 — rely on the\nDatabricks Lakehouse Platform to unify their data,\nanalytics and AI. Databricks is headquartered in San\nFrancisco, with offices around the globe. Founded by\nthe original creators of Apache Spark™, Delta Lake and\nMLflow, Databricks is on a mission to help data teams\nsolve the world’s toughest problems. To learn more,\nfollow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n\n© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "**EBOOK**\n\n# Four Forces Driving Intelligent Manufacturing\n\n### A data-driven business built on Lakehouse for Manufacturing\n\n\n-----\n\n## Contents\n\nIntroduction .................................................................................................................................................................................................................................................. **03**\n\nThe four driving forces of change ..................................................................................................................................................................................................... **04**\n\nDigital transformation is not a destination, it’s a journey .......................................................................................................................................................... **05**\n\nManufacturing – use case maturity matrix ...................................................................................................................................................................................... 
**06**\n\nThe foundations for data-driven manufacturing ............................................................................................................................................................................ **07**\n\nDRIVING FORCE NO. 1\nThe shift from manufacturing to Intelligent Manufacturing ...................................................................................................................................................... **08**\n\nDRIVING FORCE NO. 2\nTransparency, visibility, data: optimizing the supply chain ........................................................................................................................................................ **10**\n\nDRIVING FORCE NO. 3\nFuture opportunities for manufacturing business models ......................................................................................................................................................... **13**\n\nDRIVING FORCE NO. 4\nThe focus on sustainability ....................................................................................................................................................................................................................... **15**\n\nLeveraging the Databricks Lakehouse for Manufacturing ........................................................................................................................................................... **17**\n\nThe building blocks of Lakehouse for Manufacturing .................................................................................................................................................................... **18**\n\nManufacturers’ end goals .......................................................................................................................................................................................................................... **19**\n\n2 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Introduction\n\n##### Manufacturing has always been an evolutionary business, grounded upon infrastructure, business processes, and manufacturing operations built over decades in a continuum of successes, insights and learnings. The methods and processes used to approach the development, release and optimization of products and capital spend are the foundation of the industry’s evolution.\n\n\nBut today it’s data- and AI-driven businesses that\nare being rewarded because they’re using process\nand product optimization not previously possible,\nable to forecast and sense supply chain demand,\nand, crucially, introduce new forms of revenue\nbased upon service rather than product.\n\nThe drivers for this evolution will be the emergence\nof what we refer to as “Intelligent Manufacturing”\nthat has been enabled by the rise of computational\npower at the Edge and in the Cloud. As well as\nnew levels of connectivity speed enabled by 5G\nand fiber optic, combined with increased use of\nadvanced analytics and machine learning (ML).\n\n\nYet, even with all the technological advances\nenabling these new data-driven businesses,\nchallenges exist.\n\nMcKinsey’s recent research with the World\nEconomic Forum estimates the value creation\npotential of manufacturers and suppliers that\nimplement Industry 4.0 in their operations\nat USD$37 trillion by 2025. 
Truly a huge number.\nBut the challenge that most companies still\nstruggle with is the move from piloting point\nsolutions to delivering sustainable impact at scale.\n[Only 30% of companies are capturing value from](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n[Industry 4.0 solutions in manufacturing today.](https://www.mckinsey.com/~/media/mckinsey/industries/advanced%20electronics/our%20insights/capturing%20value%20at%20scale%20in%20discrete%20manufacturing%20with%20industry%204%200/industry-4-0-capturing-value-at-scale-in-discrete-manufacturing-vf.pdf)\n\n\n##### 80% of manufacturers\n[see smart manufacturing as](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n[key to their future success](https://roboticsandautomationnews.com/2021/03/10/new-study-reveals-80-percent-of-manufacturers-see-smart-manufacturing-as-key-to-future-success/41322/)\n\n##### 57% of manufacturing leaders feel their organization\n[lacks skilled workers to support](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n[their smart manufacturing plans](https://www.gartner.com/en/newsroom/press-releases/2021-05-11-gartner-survey-shows-57-percent-of-manufacturing-leaders-feel-their-organization-lacks-skilled-workers-to-support-smart-manufacturing-digitization-plans)\n\n[A lack of supply chain](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n[integration could stall smart](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)\n[factory initiatives for](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf) **[3 in 5](https://www2.deloitte.com/content/dam/Deloitte/us/Documents/energy-resources/us-2021-manufacturing-industry-outlook.pdf)**\n##### manufacturers by 2025\n\n\n3 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The four driving forces of change\n\n###### Over the last two years, demand imbalances and supply chain swings have added a sense of urgency for manufacturers to digitally transform. But in truth, the main challenges facing the industry have existed, and will continue to exist, outside these recent exceptional circumstances. Manufacturers will always strive for greater levels of visibility across their supply chain, always seek to optimize and streamline operations to improve margins. In the continuing quest for improved efficiency, productivity, adaptability and resilience, manufacturers are commonly tackling these major challenges:\n\n\n###### Skills and production gaps\n\nThe rise of the digital economy is demanding a new set of skills.\nFor today’s Intelligent Manufacturing organizations, there’s a fundamental\nneed for computer and programming skills for automation, along\nwith critical-thinking abilities. Also important is the ability to use\ncollaboration systems and new advanced assistance tools, such as\nautomation, virtual reality (VR) and augmented reality (AR). 
The deficit\nof workers with these skills is of critical concern to manufacturers.\n\nIn addition, the industry dynamics are pushing companies to increase\nand refine both partner/supplier relationships, optimize internal\noperations and build robust supply chains that do not rely upon\nsafety stock to weather supply chain swings. Historical focus on\noperational use cases is now extending to building agile supply chains.\n\n###### Supply chain volatility\n\nIf the events of the last few years proved anything, it’s that supply\nchains need to be robust and resilient. Historically, supply chain volatility\nwas smoothed by holding “safety stock,” which added costs without\nfinancial value. Then the pendulum swung to “just in time delivery,”\nwhere efficient use of working capital disregarded demand risks.\n\nRecent experiences have highlighted that demand sensing is needed\nin addition to safety stock for high-risk parts or raw materials. The ability\nto monitor, predict and respond to external factors – including natural\ndisasters, shipping and warehouse constraints, and geopolitical disruption\n– is vital to reduce risk and promote agility. Many of these external\ndata sources leverage unstructured data (news, social posts, videos\nand images), and being able to manage both structured and unstructured\ndata available to measure and analyze this volatility is key.\n\n\n###### Need for new and additional sources of revenue\n\nManufacturers’ growth historically has been limited\nto new product introduction rate or expansion into\nnew geographies. The emergence of “equipment\nas-a-service” is changing that dynamic. It’s pivoting\nthe business from product-centric growth to one\nleveraging added services, which are not slaves to the\nproduct development introduction cycle and can be highly\ndifferentiated depending on the market segment and types\nof products. Real-time data plays an outsize role, as now\nbusinesses are in unison with use cases such as predictive\nmaintenance, stock replenishment and worker safety.\n\n###### An increased focus on sustainability\n\nManufacturers have always focused on efficiency,\nbut they’re increasingly seeing the value chain as circular.\nIt’s no longer enough to consider an organization’s own\ncarbon footprint – it needs to also include indirect\nemissions and other environmental impacts from the\nactivities it doesn’t own or control. 
This requires a\n360-degree view of sustainability, which includes both\ninternal and external factors in measuring compliance\nwith ESG programs.\n\n**This eBook will look closer at these four key challenges**\n**and their associated use cases, as well as some**\n**of the most effective technologies and solutions**\n**that can be implemented to respond to them.**\n\n\n4 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Digital transformation is not a destination, it’s a journey\n\n##### Digitalization is reshaping many areas of manufacturing and logistics, product design, production and quality of goods as well as sustainability and energy output.\n\nThis transition from manual operations to automated\nsolutions is enhancing and optimizing operational\nefficiency and decision-making, while also making\nsupply chains more frictionless and reliable, as well\nas enabling organizations to become more responsive\nand adaptable to market and customer needs.\n\nThis disruption has been driven by a rush of new\ntechnologies including artificial intelligence, machine\nlearning, advanced analytics, digital twins, Internet\nof Things (IoT), and automation. These, in turn, have\nbeen enabled by the greater network capabilities of 5G.\nIndustry 4.0 is well underway. Intelligent Manufacturing\nisn’t the future, it’s what competitive organizations\nhave established today.\n\n\n## The data and AI maturity curve\n### From descriptive to prescriptive\n\nPrescriptive\nAnalytics\n\nPredictive\nModeling\n\n**How** can we make it happen?\n\nData\nExploration\n\n\n**What** will happen?\n\n**Why** did it happen?\n\n\nAd Hoc\nQueries\n\n\nReports\n\n\nCleaned\nData\n\n**What** happened?\n\nAnalytics Maturity\n\n\nRaw\nData\n\n\n5 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Manufacturing – use case maturity matrix\n\n\nNo\n\n1\n\n2\n\n3\n\n4\n\n5\n\n6\n\n7\n\n8\n\n9\n\n10\n\n11\n\n12\n\n13\n\n14\n\n15\n\n16\n\n17\n\n18\n\n19\n\n20\n\n21\n\n22\n\n23\n\n\nUse case name\n\nEDW offload\n\nProduct 360\n\nVoice of customer insights\n\nTesting & simulation optimization\n\nSupplier 360\n\nSpend analytics\n\nSourcing event optimization\n\nProcess & quality monitoring\n\nProcess 360\n\nEquipment predictive maintenance\n\nQuality & yield optimization\n\nSupply chain 360\n\nDemand analytics\n\nInventory visibility & tracking\n\nInventory optimization\n\nLogistics route optimization\n\nCustomer 360\n\nMarketing & sales personalization\n\nRecommendation engine\n\nAsset/Vehicle 360\n\nConnected asset & value-added services\n\nQuality event detection & traceability\n\nAsset predictive maintenance\n\n\nPeer Competitive Scale\n\nStandard among peer group\n\nCommon among peer group\n\nStrategic among peer group\n\n\nDesign\n\n\nPurchasing\n\n**11**\n\n**10**\n\n**13**\n\n**12**\n\n**17**\n\n\nNew innovations\n\nManufacturing\n\nSupply Chain\n\n\nThat is not to say that the digital transformation\njourney is simple. Replacing legacy systems, breaking\ndown data and organizational silos, bridging the gap\nbetween operational technology (OT) and informational\ntechnology (IT), reskilling workforces, and much more\nrequires a clear and determined digitalization strategy,\nand to reach new levels of IT and data maturity.\n\n\n**16**\n\n\nMuch of the aforementioned transformation requires\na foundation of effective data management and\narchitecture to be in place. 
Without this ability to\ncontrol the vast amounts of structured data (highly\norganized and easily decipherable) and unstructured\ndata (qualitative, no predefined data model),\nmanufacturers cannot generate actionable insights\nfrom their data, derive value from machine learning,\nmonitor and analyze supply chains, or coordinate\ndecisions across the business.\n\n\n**15**\n\n\n**14**\n\n\nMarketing & Sales\n\nService\n\n\n**19**\n\n\n**18**\n\n\n**23**\n\n\n**22**\n**21**\n**20**\n\n\nAwareness\n\n\nExploration Optimization Transformation\n\nMaturity Stages\n\n\n6 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The foundations for data-driven manufacturing\n\n###### Cloud-native platforms\n\nImprove data management, enhance data analytics\nand expand the use of enterprise data, including streaming\nstructured and unstructured data\n\n###### Technology-enabled collaboration\n\nDemocratize analytics and ML capabilities – ensure the right\nusers have access to the right data driving business value\n\n###### The ability to scale machine learning use cases\n\nA central place to store and discover ML models and enabling\ngreater collaboration between ML, data and business users\n\n\n##### 95% agree that\n[digital transformation](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[in manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[is essential to their](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[company’s future success](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n\n\n[Global spending on](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n[digital transformation](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n[is forecast to reach](https://www.idc.com/getdoc.jsp?containerId=prUS48372321)\n##### USD$2.8 trillion by 2025\n\n\n##### 85% have accelerated\n[their digital transformation](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n[strategies since 2020](https://www.mckinsey.com/featured-insights/future-of-work/what-800-executives-envision-for-the-postpandemic-workforce)\n\n\n###### Open standards and open data architectures\n\nLeverage open source standards and open data formats\nto accelerate innovation and enable the integration\nof best-of-breed, third-party tools and services\n\n\n7 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 1\n\n## The shift from manufacturing to Intelligent Manufacturing\n\n##### If left unaddressed, a Deloitte study calculates that the manufacturing skills gap will leave 2.1 million jobs unfilled by 2030, costing the U.S. economy up to $1 trillion . The immediate response would be to point the finger at the pandemic. Indeed, the same study found that approximately 1.4 million positions were lost at the start of the pandemic, and only 63% of those have since been recouped.\n\n\nYet the reasons for the lack of manufacturing\ntalent today are manifold, and COVID-19 has only\ncontributed to an existing problem. 
For instance,\nmany highly experienced baby boomers are\nretiring from the workforce, leaving fewer people\nwith the in-depth knowledge of custom equipment\nand machines. Meanwhile, younger generations\nhave a poor perception of what manufacturing jobs\nare like and are reluctant to step into the industry.\nMeaning not only a problem with retaining skills,\nbut also attracting them.\n\nAnd, of course, there is a growing gap between\nthe current capabilities of industrial workers and\nthe skill sets needed for today’s data-driven,\nsensor-filled, 5G-enabled Intelligent Manufacturing.\n\n\nWith the drive to optimize operations, stabilize\nsupply chains and reinvent business models\nthrough equipment-as-a-service, the skill sets\nhave radically changed from even a decade ago.\n\nIntelligent Manufacturing’s use cases are placing\na high demand on robotics programmers and\ntechnicians, cybersecurity experts, digital twin\narchitects, supply network analysts, and people\nwho can leverage AI and machine learning\nalgorithms because deployment of these common\nuse cases is producing multiples of returns for\nthose embracing Intelligent Manufacturing.\n\n\n8 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Those manufacturers with a strategy for upskilling existing talent, while also changing the perception of the incoming workforce, need to take advantage of the following use cases:\n\n\n##### 44% report difficulty\n[hiring manufacturing](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[talent with the required](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n[digital expertise](https://www.fictiv.com/ebooks/2021-state-of-manufacturing?utm_source=forbes&utm_medium=column&utm_campaign=som21&utm_content=report)\n\n##### 83% of manufacturing workers are interested\n[in learning new digital skills](https://www.mendix.com/press/welcome-news-to-jumpstart-the-post-pandemic-economy-mendix-survey-shows-78-of-u-s-manufacturing-workers-want-to-help-with-digital-transformation/)\n\n##### 56% of Gen Z say\n[that the pandemic has](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[changed their perception](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[of manufacturing. 77% now](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n[view it as more important](https://skillsgapp.com/how-the-pandemic-shifted-gen-zs-perception-of-manufacturing-careers/)\n\n### Proof through customer success\n\n##### Watch our case study\n\n\n###### Digital twins\n\nIngesting information from sensors and other data sources,\nthese virtual replicas of physical assets create models\nto which a layer of visualization can be applied. This enables\nusers to predict failures, assess performance and reveal\nopportunities for optimization. Digital twins unlock the ability\nfor manufacturers to monitor and manage production remotely,\nas well as explore “what-if” scenarios.\n\n###### Process and quality optimization\n\nProcess and quality optimization generally covers the\noptimization of equipment, operating procedures, and control\nloops. It requires access to accurate, up-to-date data about\nconditions, collected through IoT devices to monitor every\naspect. 
The introduction of deep learning architectures is\nenabling manufacturing machinery to identify visual clues\nthat are indicative of quality issues in manufactured goods,\nwhile digital twins can be used to spot inefficiencies without\nthe need to pause production.\n\n###### Throughput optimization\n\nIncreasing throughput is critical for meeting delivery schedules,\nand manufacturers are always looking for ways to identify\nand eliminate bottlenecks, reduce inventory and increase\nthe utilization of assets. Throughput optimization makes\nuse of data-driven algorithms to identify, rank and resolve\nlabor, equipment or inventory bottlenecks.\n\n\n###### Equipment predictive maintenance\n\nRather than wait for a piece of equipment to fail or\nstick to a fixed schedule, predictive maintenance adopts\na predictive approach to equipment maintenance.\nBy monitoring real-time data collected from hundreds\nof IoT sensors, machine learning techniques can detect\nanomalies in operations and possible defects in equipment\nand processes. Predictive maintenance correlates data across\nmany more dimensions than traditional inspection techniques,\nto anticipate failures and prevent costly breakdowns.\n\n###### Quality and yield optimization (with computer vision)\n\nQuality assurance focuses on the use of data analytics,\nAI and machine learning to identify and prevent defects\nduring the manufacturing process. [This type of edge AI](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n[is an approach that can increase productivity by 50%](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process)\n[and detection rates by up to 90%.](https://www.qualitymag.com/articles/96231-how-edge-ai-can-improve-the-visual-inspection-process) Making use of image\nrecognition and machine learning, computer vision\ncan automate visual inspections, detecting faults\nand imperfections faster and more cost effectively\nthan manual approaches.\n\n\n9 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 2\n\n## Transparency, visibility, data: optimizing the supply chain\n\n##### Over the last few years, organizations have experienced the biggest disruption to their supply chains since the 1940s. In the short term, this meant having to adapt to global lockdowns and restrictions, material shortages and compromised workforces. Longer term, there will be economic downturns and new consumer and customer demands and habits to contend with. Resilience and end-to-end visibility are key, with manufacturers given a harsh reminder of how important it is to be able to forecast and respond to disruption.\n\n\nSuch resiliency requires a combination\nof technologies and solutions. For example,\ndecision support tools with predictive capabilities\n– to monitor the supply chain and analyze\nwhat-if scenarios. 
Demand sensing and forecasting\nin combination with enterprise critical systems\n(ERP) needs to combine data from a wide variety\nof sources.\n\n10 Four Forces Driving Intelligent Manufacturing\n\n\nWorking together, combining millions of data points\nfrom across organizations’ operations along with\nother external sources, these technologies can\nbe used to optimize supply chains, reduce costs\nand improve customer service and loyalty.\nHowever, achieving this – embracing the latest\nin AI, machine learning and predictive analytics –\nmeans being able to manage and maintain\na flow of accurate, relevant data and to be able\nto translate this data into actionable insights.\n\n\n-----\n\n#### Successful supply chain optimization depends on up-to-the-minute, end-to-end visibility that can be applied across all stages of the supply chain, from design to planning to execution. This will incorporate a range of solutions that can include:\n\n\n###### Demand, inventory, logistics\n\n\n###### Purchasing\n\n**Spend analytics:** Most obviously, transparency and insight into where\ncash is spent is vital for identifying opportunities to reduce external\nspending across supply markets, suppliers and locations. However, spend\nanalytics are also hugely important to supply chain agility and resilience.\nThis requires a single source of data truth for finance and procurement\ndepartments. For example, integrating purchase order, invoice,\naccounts payable, and general-ledger account data to create a level of\ntransparency, visibility and consistency to inform supplier discussions\nand deploy strategies to manage cash better during times\nof disruption.\n\n###### Cross supply chain collaboration\n\n**Supply chain 360:** With real-time insights and aggregated supply\nchain data in a single business intelligence dashboard, manufacturers\nare empowered with greater levels of visibility, transparency\nand insights for more informed decision-making. This dashboard\ncan be used to identify risks and take corrective steps,\nassess suppliers, control costs and more.\n\n\n**Demand analytics:** By collecting and analyzing millions –\nif not billions – of data points about market and customer\nbehavior and product performance, manufacturers can\nuse this understanding to improve operations and support\nstrategic decisions that affect the demand of products\nand services. 
[Around 80% say that using this form of data](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[analysis has improved decision-making, while 26% say](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[having this level of know-how to predict, shape and meet](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n[demands has increased their profits.](https://paperzz.com/doc/8615467/the-demand-analytics-premium---strategy)\n\n**Inventory visibility and tracking:**\nInventory visibility is the ability to view and track\ninventory in real time, with insights into SKU stock levels\nand which warehouse or fulfillment center it is stored at.\nWith complete oversight of inventory across multiple\nchannels, this helps improve supply chain efficiency,\ndemand forecasting and order accuracy, while ultimately\nenhancing the customer experience.\n\n\n**Inventory optimization:** The practice of having the right\namount of available inventory to meet demand, both in the\npresent and the future, enables manufacturers to address\ndemand expectations, and reduce the costs of common\ninventory issues. Inventory optimization incorporates\ndata for demand forecasting, inventory strategy and\nstock replenishment. With the addition of AI reinforced\nlearning models, this can help improve demand prediction,\nrecommend stock levels, and automatically order\nraw materials to fulfill orders, while also detecting\nand responding to shifts in demand.\n\n**Logistics route optimization:** Using AI, route optimization\ncan help manufacturers go beyond normal route planning\nand include parameters to further drive logistics efficiency.\nWhat-if scenarios present route options that help cut\ntransportation costs, boost productivity and execute\non-time deliveries.\n\n\n**Supply chain network design:** By building and modeling the supply\nchain, it enables manufacturers to understand the costs and time\nto bring goods and services to market. Supply chain network design\nhelps to evaluate delivery at the lowest possible cost, optimal sources\nand inventory deployment, as well as define distribution strategies.\n\n11 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n[Successfully implementing AI-enabled supply](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n[chain management has enabled early adopters to](https://www.mckinsey.com/industries/metals-and-mining/our-insights/succeeding-in-the-ai-supply-chain-revolution)\n##### improve logistics costs by 15%, inventory levels by 35%, and service levels by 65%\n\n Only 6% of companies believe\n[they’ve achieved full supply chain visibility](https://www.supplychaindive.com/news/supply-chain-visibility-failure-survey-geodis/517751/\r)\n\n##### 57% believe that supply chain management \n[gives them a competitive edge that enables them](https://financesonline.com/supply-chain-statistics/\r)\n[to further develop their business](https://financesonline.com/supply-chain-statistics/\r)\n\n### Supply chain optimization case study\n\n##### Watch our case study\n\n12 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 3\n\n## Future opportunities for manufacturing business models\n\n##### When looking at the rapid evolution and growth of e-commerce, manufacturers have some catching up to do. Particularly when it comes to embracing new and customer-centric business models. 
For example, when shifting from a product to a service mindset, the product lifecycle becomes more holistic and the client relationship is maintained beyond the point of purchase.\n\n\nThese new opportunities are forming part\nof a longer-term industry shift from the sale\nof goods (CapEx) to recurring revenue streams,\nsuch as through Equipment-as-a-Service (EaaS)\nmodels. While this approach is not new to many\n(Rolls-Royce’s “Power-by-the-Hour” engine\nsubscription model has been around since 1962),\ncustomer demand, advances in industrial IoT\ntechnology, and a continuing decline in\nsales and margins have seen EaaS emerge\nas an imperative for manufacturers.\n\n\nOpening up some of these new revenue streams,\nof course, demands operational flexibility, but more\nimportantly, digital maturity. This means cloud\ntechnologies that allow employees new levels\nof access to data, the ability to work anywhere,\nand adapt rapidly to new needs. The introduction\nof a microservices architecture, to allow the agile\ndevelopment and deployment of new IT services.\nAnd the democratization of data, so the entire\norganization and its ecosystem of partners\nand suppliers have access to information\nabout market demand, operations, production,\nlogistics and transportation.\n\n\n13 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n##### By 2023, 20% of industrial equipment manufacturers will\n[support EaaS with remote](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n[Industrial IoT capabilities](https://www.gartner.com/en/newsroom/press-releases/2021-07-28-gartner-identifies-top5-manufacturing-trends-2021)\n\n##### In 2025, the global EaaS market is estimated\n[to grow to $131B compared](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n[to $22B in 2019](https://iot-analytics.com/entering-the-decade-of-equipment-as-a-service/)\n\n##### In the U.S., 34% said\n[pay-per-use models represent](https://relayr.io/pr-forsa-survey/)\n[a big or a very big competitive](https://relayr.io/pr-forsa-survey/)\n[advantage, while 29% consider](https://relayr.io/pr-forsa-survey/)\n[it a slight advantage](https://relayr.io/pr-forsa-survey/)\n\n### Equipment as a service case study\n\n##### Read our case study\n\n\n### This level of visibility and collaboration is not only beneficial to lower maintenance costs, capital expenditure and human capital management, but also in empowering all stakeholders to make smarter and more informed decisions.\n\n\n###### Connected assets\n\nThe digital connectivity of high-value\nphysical assets is helping to drive a\nmore efficient use of assets and cost\nsavings. Connected assets can provide\ncontinuous, real-time data on their\noperating conditions, even if they are on\nthe other side of the world. Connected\nassets can also be used as the foundation\nof as-a-service business models to\ntrack the usage of rented machines, and\nfor automakers to use with connected\nvehicles and electrification strategies.\n\n\n###### Quality event detection and traceability\n\nManufacturers are increasingly seeking\nend-to-end supply chain traceability —\nto be able to identify and trace\nthe history, distribution, location\nand application of products, parts\nand materials. 
With event-based\ntraceability, typically using blockchain\nledgers, manufacturers can record\nevents along the supply chain.\nThis can help aid legal compliance,\nsupport quality assurance and brand\ntrust, and provide full supply chain\nvisibility for better risk management.\n\n\n###### Demand-driven manufacturing\n\n**Equipment-as-a-Service:**\nStartup organizations without\nthe in-house infrastructure can\nuse a third-party to realize their\nconcepts, while manufacturers\nwith the production capabilities\ncan ensure minimal downtime\nfor their assets. This involves\ngreater risk for the manufacturer,\nbut also the potential for higher\nand annuitized revenues.\n\n\n14 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Driving Force No. 4\n\n## The focus on sustainability\n\n##### It’s an inescapable truth that Earth’s resources are finite, and we need to change our present, linear business models for something that minimizes our use of resources and eliminates waste. Manufacturers need to take a more sustainable approach, where they can limit their negative environmental impacts, while also conserving energy and natural resources.\n\n\nWhen looking at the entire manufacturing\nvalue chain, there are many areas where\nmore sustainable practices can deliver\nmeasurable change. Products can be\ndesigned in a way that reduces waste\nand increases their longevity; materials\ncan be selected and sourced in a more\nethical way; operational efficiency and\ngreen energy can improve production;\nand the introduction of sustainable\npractices for transportation and\nshipping can help reduce carbon\nfootprints. [These are part of the move](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[toward more circular business models](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[and establishing what PwC has called the](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[four Rs of the circular economy: Reduce,](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n[Refurbish/Reuse, Recycle and Recover.](https://www.strategyand.pwc.com/de/en/industries/industrials/importance-of-the-circular-economy-for-manufacturing.html)\n\n\nThere are a number of business\noperating models that employ the four\nRs and support the circular economy.\nSharing platforms and aaS models help\noptimize manufacturing capacity and\nenable businesses to rent rather than\nbuy the machinery and equipment\nthey need. Product use extension helps\nextend the lifecycle of products through\nrepair and refurbishment, while resource\nrecovery means recovering raw materials\nfrom end-of-life products.\n\nAchieving this means establishing\na redesigned supply chain that\nleverages many use cases, technologies\nand solutions we covered earlier.\n\n\nIt will require greater levels of\ncollaboration between suppliers\nand vendors. 
It will require optimizing\nproduction lines and transportation.\nIt will require greater levels of customer\nengagement to extend product lifecycles\nand close the loop of the supply chain.\n\nBut most of all, it will require data,\nto provide visibility and intelligence\nacross the network, and to be able\nto make the decisions to improve\nefficiency in the present, as well as\nlonger-term decisions based on a\nbroad view of sustainability impacts.\n\n\n15 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n### Sustainability Solution Accelerator\n\n##### Read now\n\n\n[The manufacturing industry alone](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)\n[is responsible for](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[54% of the](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n##### world’s energy consumption\n[and](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/) **[20% of carbon emissions](https://blogs.3ds.com/delmia/leverage-the-power-of-digitalization-for-more-sustainable-manufacturing/)**\n\n\n##### 80% of the world’s leading companies \n[are now incorporating sustainability](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n[into their operations and goals](https://assets.kpmg/content/dam/kpmg/xx/pdf/2020/11/the-time-has-come.pdf)\n\n\n##### 78% of industrial, manufacturing and metals organizations now report on sustainability — up from 68% in 2017\n\n\n16 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Leveraging the Databricks Lakehouse for Manufacturing\n\nOur open, simple and collaborative Lakehouse for Manufacturing enables automotive, electronics, industrial,\nand transportation & logistics organizations to unlock more value and transform how they use data and AI.\n\n\nAll your sources Any structure or frequency\n\n\nReliable, real-time processing Analytics capabilities for any use case or persona\n\n\nCompetitor News\n& Social\n\nConsumer Devices\n\nVideo & Images\n\nIoT & Shop Floor\n\nEnterprise Resource\nPlanning\n\nSales Transaction\n& Syndicated\n\nInventory & Logistics\n\n\nUnstructured batch\n\n\nAd Hoc Data Science\n\nLow-cost, rapid experimentation\nwith new data and models.\n\nProduction Machine Learning\n\nHigh volume, fine-grained analysis at scale\nserved in the tightest of service windows.\n\nBI Reporting and Dashboarding\n\nPower real-time dashboarding directly,\nor feed data to a data warehouse for\nhigh-concurrency reporting.\n\nReal-Time Applications\n\n\nLakehouse enables a real-time\ndata-driven business with the ability\nto ingest structured, semi-structured\nand unstructured data from ERP,\nSCM, IoT, social or other sources\nin your value chain so that predictive\nAI and ML insights can be realized.\nThis enables them to operate their\nbusiness in real time, deliver more\naccurate analytics that leverage all\ntheir data, and drive collaboration\nand innovation across their value\nchain. 
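As a rough illustration of the ingestion pattern described above, the sketch below streams semi-structured shop-floor events into a Delta table that downstream ML and BI workloads can query. It assumes a Databricks notebook where `spark` is predefined; the volume paths and target table name are hypothetical placeholders.

```python
# Illustrative sketch only: continuously land semi-structured IoT/shop-floor
# events in a Delta table using Databricks Auto Loader ("cloudFiles").
# Paths, schema location and the target table name are hypothetical.
raw_events = (
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/Volumes/mfg/bronze/_schemas/shop_floor")
    .load("/Volumes/mfg/landing/shop_floor_events/")
)

(
    raw_events.writeStream
    .option("checkpointLocation", "/Volumes/mfg/bronze/_checkpoints/shop_floor")
    .trigger(availableNow=True)               # process what has arrived, then stop
    .toTable("mfg.bronze.shop_floor_events")  # Delta table for downstream ML and BI
)
```

The same pattern applies to ERP extracts, logistics feeds or social data; only the source format and path change, which is what lets a single lakehouse serve structured, semi-structured and unstructured sources side by side.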
Most important for capital\nintensive manufacturing business,\nit enables them to move quickly\nfrom proof-of-concept (PoC)\nideation to ROI quickly.\n\n\nSemi-structured real-time\n\nUnstructured batch\n\nSemi-structured real-time\n\nStructured real-time\n\nStructured batch\n\nStructured real-time\n\n\nData Lakehouse\n\nProcess, manage, and\nquery all your data.\n\nAny cloud\n\n\nProvide real-time data to downstream\napplications or power applications via APIs.\n\n\n17 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## The building blocks of Lakehouse for Manufacturing\n\n\n###### Real Time\n\nMake data-informed decisions\n\n\n###### Solution Accelerators\n\nAccelerate the possibilities\nof capabilities\n\n\n###### Partner Solutions\n\nAccelerate the\ncreation of insights\n\n\n###### Speed\n\nDelivering fast ROI\n\n\n**Real-time data to make informed**\n**decisions:** The Lakehouse Platform\nstreamlines data ingestion and\nmanagement in a way that makes it easy\nto automate and secure data with fast,\nreal-time performance. This means you\ncan consolidate and enhance data from\nacross the organization and turn it into\naccessible, actionable insights.\n\n\n**Solution Accelerators for new**\n**capabilities:** Through our Solution\nAccelerators, manufacturers can\neasily access and deploy common and\nhigh-impact use cases. For manufacturers\nrestricted by time and resources, these\naccelerators provide the tools and\npre-built code to deliver PoCs in\nless than two weeks.\n\n\n**Pre-built applications to deliver**\n**solutions faster:** We make it easy\nfor you to discover data, analytics\nand AI tools, using pre-built integrations\nto connect with partner solutions,\nintegrating them (and existing solutions)\ninto the Lakehouse Platform to rapidly\nexpand capabilities in a few clicks.\n\n\n**The speed to deliver fast ROI:**\nWith faster data ingestion and access\nto insights combined with easier, quicker\ndeployments, this means accelerated\ndigital transformation and higher ROI.\n\n\n18 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## Manufacturers’ end goals\n\n##### Intelligent Manufacturing leaders leverage a combination of familiar manufacturing techniques and recent value producing and differentiating use of data-led use cases.\n\nThis means making use of IIoT, cloud computing, data analytics,\nmachine learning and more to create an end-to-end digital ecosystem\nacross the entire value chain and build scalable architectures\nthat take data from edge to AI. It means embracing automation\nand robotics, optimizing how organizations use assets and\naugmenting the capabilities of workforces, and introducing new\nlevels of connectivity to accelerate performance. Not to mention\nopen the door to new platform and as-a-service business models\nwith the potential to generate new revenue streams.\n\nAlso key to the data-driven transformation of manufacturing is visibility:\na 360-degree, end-end-to view of the supply chain. Not only is this\ncritical for the efficiency, optimization and profitability of operations,\nit is needed to be able to take new strides in sustainability.\n\nOf course, better data management is not only about unlocking\ninsight, empowering AI, and enabling decision-making. It’s also about\ngovernance: acknowledging format issues, adhering to compliance,\nprotecting IP, ensuring data security. 
All this needs to be taken into\nconsideration when bringing onboard an ISV to establish a modern,\nunified architecture for data and AI.\n\n19 Four Forces Driving Intelligent Manufacturing\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\nincluding Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\nis headquartered in San Francisco, with offices around the globe. Founded by\nthe original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\nmission to help data teams solve the world’s toughest problems. To learn more,\nfollow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\nGet started with a free trial of Databricks and\nstart building data applications today\n\n##### Start your free trial\n\nTo learn more, visit us at:\n\n**[Databricks for Manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/four-forces-driving-intelligent-manufacturing-v7.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "## Driving Innovation and Transformation in the Federal Government With Data + AI\n\nEmpowering the federal government\nto efficiently deliver on mission objectives\nand better serve citizens\n\n\n-----\n\n### Contents\n\nState of the union: Data and AI in the federal government **03**\n\nRecognizing the opportunity for data and AI **04**\n\nChallenges to innovation **07**\n\nThe Databricks Lakehouse Platform: Modernizing the federal government to achieve mission objectives **09**\n\nCustomer story: U.S. Citizenship and Immigration Services **13**\n\nConclusion **15**\n\n\n-----\n\n### State of the union: Data and AI in the federal government\n\nFor the private sector, the growth, maturation and application of data analytics and\n\nartificial intelligence (AI) have driven innovation. This has resulted in solutions that have\n\nhelped to improve efficiencies in everything from optimizing supply chains to accelerating\n\ndrug development to creating personalized customer experiences and much more.\n\nUnfortunately, the federal government and many of its agencies are just beginning to take\n\nadvantage of the benefits that data, analytics and AI can deliver. This inability to innovate\n\nis largely due to aging technology investments, resulting in a sprawl of legacy systems\n\nsiloed by agencies and departments.\n\nAdditionally, the government is one of the largest employers in the world, which introduces\n\nsignificant complexity, operational inefficiencies and a lack of transparency that limit the\n\nability of its agencies to leverage the data at their disposal for even basic analytics – let\n\nalone advanced data analytic techniques, such as machine learning.\n\n\n-----\n\n### Recognizing the opportunity for data and AI\n\nThe opportunity for the federal government to leverage data analytics and AI cannot be\n\noverstated. 
With access to some of the largest current and historical data sets available to the\n\n\nUnited States — and with vast personnel resources and some of the best private sector use\n\ncases and applications of AI available in the world — the federal government has the ability to\n\ntransform the efficiency and effectiveness of many of its agencies.\n\nIn fact, the federal government plans to spend $4.3 billion in artificial intelligence research and\n\ndevelopment across agencies in fiscal year 2023, according to a recent report from Bloomberg\n\nGovernment. These priorities are validated by a recent Gartner study of government CIOs\n\nacross all levels (including state and local), confirming that the top game-changing technologies\n\nare AI, data analytics and the cloud.\n\nAnd as an indication of the potential impact, a recent study by Deloitte shows the government\n\ncan save upward of $3 billion annually on the low end to more than $41 billion annually on the\n\nhigh end from data-driven automation and AI.\n\nSources:\n\n[• Gartner Survey Finds Government CIOs to Focus Technology Investments on Data Analytics and Cybersecurity in 2019](https://www.gartner.com/en/newsroom/press-releases/2019-01-23-gartner-survey-finds-government-cios-to-focus-technol)\n\n[• Administration Projects Agencies Will Spend $1 Billion on Artificial Intelligence Next Year](https://www.nextgov.com/emerging-tech/2019/09/administration-projects-agencies-will-spend-1-billion-artificial-intelligence-next-year/159781/)\n\n\nInvestment in AI to\n\nautomate repetitive tasks\n\ncan improve efficiencies\n\nacross government agencies,\n\nwhich could save **96.7**\n#### million federal hours annually, with a potential\n\nsavings of **$3.3 billion.**\n\n**WILLIAM EGGERS, PETER VIECHNICKI**\n\n**AND DAVID SCHATSKY**\n\n[Deloitte Insights](https://www2.deloitte.com/us/en/insights/focus/cognitive-technologies/artificial-intelligence-government.html)\n\n\n-----\n\n**An increased focus on cloud, analytics and AI = operational efficiency**\n\n1. AI/ML\n2. Data Analytics\n3. Cloud\n\n**$1B** **TOP PRIORITIES** **$41B+**\n\nData and AI Research and Government CIOs’ top Estimated government\nDevelopment Initiative game-changing technologies savings from data-driven\nautomation\n\n**U.S. 
Government**\n\nFortunately, the President’s Management Agenda (PMA) has recognized the need to\n\nmodernize their existing infrastructure, federate data for easier access and build more\n\n\n**IT Modernization Act**\n\nAllows agencies to invest\n\nin modern technology\n\nsolutions to improve\n\nservice to the public,\n\nsecure sensitive systems\n\nand data, and save\n\ntaxpayer dollars.\n\n\n**Federal Data Strategy**\n\nA 10-year vision for how\n\nthe federal government will\n\naccelerate the use of data to\n\nachieve its mission, serve the\n\npublic and steward resources,\n\nwhile protecting security,\n\nprivacy and confidentiality.\n\n\n**AI Executive Order**\n\nMakes AI a top research\n\nand development priority for\n\nfederal agencies, provides\n\na shared ethics framework\n\nfor developing and using AI,\n\nand expands job rotation\n\nprograms to increase\n\nthe number of AI experts\n\nat agencies.\n\n\nadvanced data analytics capabilities by establishing mandates for modernization, data\n\nopenness and the progression of AI innovations.\n\n\nThis will put agencies in a better position to leverage the scale of the cloud and democratize\n\n\nThis will put agencies in a better position to leverage the scale of the cloud and democratize The end result will be transformative innovation that can not only improve the operational\n\nsecure access to data in order to enable downstream business intelligence and AI use cases. efficiencies of each agency, but also support the delivery of actionable insights in real time\n\n\nefficiencies of each agency, but also support the delivery of actionable insights in real time\n\n\nfor more informed decision-making. This benefits citizens in the form of better services,\n\nstronger national security and smarter resource management.\n\n\n-----\n\nTop data and AI use cases in the government\n\n\n**H E A LT H C A R E**\n\nImprove the delivery and quality of healthcare services for citizens with powerful analytics and a 360°\n\nview of patients.\n\n- Patient 360 - Insurance management\n\n- Population health - Genomics\n\n- Supply chain optimization - Drug discovery and delivery\n\n\nAcross the federal government, data and AI is providing the insights and predictive\n\ncapabilities to thwart cyberattacks and national threats, provide better social services more\n\nefficiently, and improve the delivery and quality of healthcare services.\n\n**H O M E L A N D S E C U R I T Y**\n\n\nDetect and prevent criminal activities and national threats with real-time analytics and data-driven\n\ndecision-making.\n\n\n\n- Customs and border protection - Counter-terrorism\n\n- Immigration and citizenship - Federal emergency aid management\n\n**D E F E N S E**\n\n\n**E N E R G Y**\n\nImprove energy management with data insights that ensure energy resiliency and sustainability.\n\n- Security of energy infrastructure - Energy exploration\n\n- Smarter energy management - Electrical grid reliability\n\n\nApply the power of predictive analytics to geospatial, IoT and surveillance data to improve operations\n\n\n**C O M M E R C E**\n\nProactively detect anomalies with machine learning to mitigate risk and prevent fraudulent activity.\n\n- Tax fraud and collection - Grants management\n\n- Process and operations management - Customer 360\n\n**I N T E L L I G E N C E C O M M U N I T Y**\n\nLeverage real-time insights to make informed decisions that can impact the safety of our citizens and\n\nthe world.\n\n- Threat detection - Intelligence surveillance and reconnaissance\n\n- 
Neutralize cyberattacks - Social media analytics\n\n\nand protect the nation.\n\n- Logistics - Surveillance and reconnaissance\n\n- Predictive maintenance - Law enforcement and readiness\n\n\n-----\n\n### Challenges to innovation\n\nThe opportunity to drive innovation throughout the federal government is massive and\n\nhas implications for every U.S. citizen. But there are several critical barriers preventing\n\n\nTen of the existing legacy systems\nmost in need of modernization\ncost about **$337 million a year**\nto operate and maintain.\n\n\nagencies from making the progress needed to realize the value of their data and delivering\n\nthose innovations.\n\n**THE GOVERNMENT ACCOUNTABILITY OFFICE,**\n\n**INFORMATION TECHNOLOGY REPORT TO CONGRESS, JUNE 2019**\n\nThe complexities and impact of legacy data warehouses and marts\n\nMultiple federal agencies are burdened with a legacy IT infrastructure that is being left\n\n\nbehind by the technological advancements seen in the private sector. This infrastructure\n\nis traditionally built with on-premises data warehouses and data marts that are highly\n\ncomplex to maintain, costly to scale as compute is coupled with storage, limited from a\n\ndata science perspective, and they lack support for the growing volumes of unstructured\n\ndata. This inhibits data-driven innovation and blocks the use of AI, leaving agencies to\n\nsearch for data science tools to fill the gaps.\n\nInfrastructure also becomes harder and more expensive to maintain as it ages. Over time,\n\nthese environments become more complex due to their need for specialized patches and\n\nupdates that keep these systems available while doing nothing to solve the issues of poor\n\ninteroperability, ever-decreasing processing speeds, and an inability to scale – all of which\n\nare critically necessary to support today’s more data-intensive use cases. For example,\n\nsystems at the departments of Education, Health and Human Services, Treasury, and Social\n\nSecurity are over 40 years old.¹ This is causing pain in a variety of areas.\n\n\noften requires significant customization and, even then, there is still a chance that the final\n\nintegration won’t be successful. These systems also keep personnel from spending their\n\nenergy and resources on emerging technologies such as AI.\n\nAnd data reliability is a big concern. Replication of data occurs across data marts as\n\nvarious teams try to access and explore it, creating data management and governance\n\nchallenges. Without a single source of truth, teams struggle with data inconsistencies,\n\nwhich can result in inaccurate analysis and model performance that is only compounded\n\nover time.\n\nThankfully, there are initiatives in place, such as the Data Center and Cloud Optimization\n\nInitiative Program Management Office (DCCOI PMO), which are investing in modernizing IT\n\ninfrastructure for federal agencies.²\n\n\nMaintaining these systems requires a massive investment of both time and money\n\ncompared to modern cloud-based systems. 
For the technical teams that are tasked with\n\n\ntrying to integrate any of these legacy systems with third-party tooling or services, this\n\n\n[¹ Agencies Need to Develop Modernization Plans for Critical Legacy Systems](https://www.gao.gov/assets/gao-19-471.pdf)\n\n[² IT Modernization](https://www.gsa.gov/technology/government-it-initiatives/data-center-optimization-initiative-dcoi)\n\n\n-----\n\nData is critical … and complicated\n\nData is both the greatest asset and one of the greatest challenges that federal agencies must\n\nlearn to manage. While the volume and usefulness of data collected by federal agencies are\n\nnot in question, much of it is locked in legacy source systems, comes in diverse structured\n\n\nData silos hamper any data-driven advancements\n\nIn any data-driven organization, the need to have trusted, timely and efficient access to\n\ndata is critical. For the data teams responsible for driving the digital transformation of\n\nfederal agencies, the challenges they face are myriad.\n\n\nand unstructured formats, and is subject to a variety of governance models.\n\nWe have already seen how existing, legacy infrastructure, as well as the integration of\n\n\nNot only is this data siloed and very difficult to integrate, but the data volumes collected\n\nby federal agencies are massive. At Health and Human Services, for example, or the\n\nDepartment of Veterans Affairs, healthcare data sets will be sized by population and include\n\nelectronic health records, clinical data, imaging and more. For the Department of Defense\n\n\nfragmented data sources, will strain data engineering teams trying to deliver high-quality\n\ndata at scale. Their challenge includes developing the right data pipelines that will take\n\nthe massive volumes of raw data coming from fragmented sources into one centralized\n\nlocation with clean, secure and compliant data for agency decision-makers.\n\n\nand the Department of Homeland Security, data includes everything from mapping, satellite\n\n\nData scientists and analysts alike must have the right toolset to collaboratively investigate,\n\nextract and report meaningful insights from this data. Unfortunately, data silos extend\n\nto organizational silos, which make collaboration inside an agency as well as between\n\nagencies very difficult. With different groups of data teams leveraging their own coding\n\nand analytical tools, communicating insights and working across teams — let alone\n\nacross agencies — is almost impossible. This lack of collaboration can drastically limit\n\nthe capabilities of any data analytics or AI initiatives — from the deployment of shared\n\nbusiness intelligence (BI) reports and dashboards for data investigation and decision-\n\nmaking to the training of machine learning models to automate processes and make\n\npredictions. Compounding these challenges is an overall lack of data science expertise and\n\nskills within federal agencies. As a result, even with access to their data, without intuitive\n\ntooling it’s very difficult to deliver advanced analytic use cases with ML and AI.\n\nOrganizational silos also impact the effectiveness of data analysts, who are responsible\n\nfor analyzing and reporting insights from the data to better inform subject-matter experts\n\nor policy — and decision-makers. 
Without a data platform that eliminates these silos and\n\nenables visualization of and reporting on shared data, data analysts will be limited in how\n\nthey are able to drive the organizational and policy agendas of their respective agencies.\n\n\nimagery and intelligence data to payroll and human resources data. The Social Security\n\nAdministration and Internal Revenue Service manage personal data for every single citizen in\n\nthe United States.\n\nCombining these various forms of data from disparate legacy systems that are not\n\nintegrated — and doing it across different government agencies and departments — can be\n\nslow and error prone, hindering downstream analytics and actionable insights. The teams\n\nthat are responsible for this are faced with not only integrating these data sources, but also\n\nmanaging the entire ETL workflow in order to enable the application of basic analytics, let\n\nalone machine learning and AI.\n\n\n-----\n\n**THE DATABRICKS LAKEHOUSE PLATFORM:**\n### Modernizing the federal government to achieve mission objectives\n\n\nDatabricks provides federal agencies with a Lakehouse Platform that combines the best of data warehouses and data\n\nlakes — to store and manage all your data for all your analytics workloads. Databricks federates all data and democratizes\n\naccess for downstream use cases, empowering federal agencies to unlock the full potential of their data to deliver on\n\ntheir mission objectives and better serve citizens.\n\n\nFederal agencies that are\npowering impactful innovations\nwith Databricks Lakehouse\n\n\nLakehouse offers a single solution for all major data workloads, whether structured or unstructured, and supports use\n\n\ncases from streaming analytics to BI, data science and AI.\n\n\nUsing predictive\nanalytics for better\npassenger safety and\nexperience\n\nEnabling operational\nefficiencies through\nprocess automation\nto streamline the path\nto citizenship\n\n\nAll your\ngovernment data\n\n\nReliable, Analytics capabilities\nreal-time processing for every use case\n\nAD HOC\nDATA SCIENCE\n\n\nHealth\n\nSurveillance\n\nSocial Security\n\nDemographics\n\nCrime\n\nAudio/Visual\n\nGeospatial\n\n\nStructured batch\n\nUnstructured stream\n\nStructured batch\n\nStructured batch\n\nUnstructured batch\n\nUnstructured stream\n\nUnstructured stream\n\n\nPRODUCTION\nMACHINE LEARNING\n\n\n**DATA LAKEHOUSE**\n\nProcess, manage\nand query all your data\n\n\nBI REPORTING AND\nSCORECARDING\n\n\nLeveraging advanced\nanalytics to improve\noutcomes for patients\nthrough Medicare and\nMedicaid services\n\n\nThe Databricks Lakehouse Platform has three unique characteristics that address head-on the biggest challenges that\n\nfederal agencies are facing:\n\n\nIt offers simplicity with regard to data\n\nmanagement, in that the Databricks\n\nLakehouse is architected to support all\n\nof an agency’s data workloads on one\n\n\nIt is built on open standards so\n\nthat any existing investments\n\nin tooling or resources can\n\nremain effective\n\n\nAnd it’s collaborative, enabling\n\nagency data engineers, analysts\n\nand data scientists to work\n\ntogether much more easily\n\n\ncommon platform\n\n\n-----\n\nManaging federal data with a unified approach\n\n\nDatabricks enables aggregation and processing of massive collections of diverse and\n\nsensitive agency data that currently exists in silos, both structured and unstructured. 
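As a concrete aside on what that unified aggregation can look like in practice, the sketch below lands a structured batch extract and a semi-structured streaming feed side by side as Delta tables. This is a minimal, hedged example rather than anything from the eBook itself: the catalog, schema, paths and columns are illustrative placeholders, and `spark` is assumed to be the session provided by a Databricks notebook.

```python
from pyspark.sql import functions as F

# Illustrative locations only: replace with your agency's own storage paths.
BATCH_CSV_PATH = "/Volumes/agency/raw/benefits_claims_csv"    # structured batch extract
STREAM_JSON_PATH = "/Volumes/agency/raw/sensor_events_json"   # semi-structured stream

# Structured batch source -> Delta table
(spark.read
    .option("header", "true")
    .csv(BATCH_CSV_PATH)
    .withColumn("ingested_at", F.current_timestamp())
    .write.format("delta")
    .mode("append")
    .saveAsTable("agency_lakehouse.bronze.benefits_claims"))

# Semi-structured streaming source -> the same lakehouse, also stored as Delta
(spark.readStream
    .format("json")
    .schema("device_id STRING, reading DOUBLE, event_time TIMESTAMP")
    .load(STREAM_JSON_PATH)
    .writeStream
    .format("delta")
    .option("checkpointLocation", "/Volumes/agency/checkpoints/sensor_events")
    .toTable("agency_lakehouse.bronze.sensor_events"))
```

Because both workloads write the same open Delta format, downstream analytics and ML can query them together without maintaining separate copies.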
As\n\nwe’ve seen, for many agencies this would be incredibly difficult with the infrastructure\n\nchallenges they are experiencing. The Databricks Lakehouse leverages Delta Lake to unify\n\n\nBy providing a unified data foundation for business intelligence, data science and machine\n\nlearning, federal agencies can add reliability, performance and quality to existing data lakes\n\nwhile simplifying data engineering and infrastructure management with automation to\n\nsimplify the development and management of data pipelines.\n\n\nthe very large and diverse amounts of data that government agencies are working with.\n\nDelta Lake is an open format, centralized data storage layer that delivers reliability, security\n\nand performance — for both streaming and batch operations.\n\nThe Lakehouse Platform combines the best elements of data lakes and data warehouses — delivering the data management and performance\ntypically found in data warehouses with the low-cost, flexible object stores offered by data lakes\n\n\n-----\n\nBreak down the institutional silos limiting collaboration\n\nFoster collaboration at every step with the latest machine learning tools that allow everyone\n\nto work and build value together — from data scientists to researchers to business\n\ndecision-makers. Close the glaring skills gap within these government organizations by\n\nproviding tooling that simplifies the ML lifecycle and empowers the data teams that do not\n\nhave the data science expertise to still be productive with their data through integrating BI\n\ntools and SQL analytics capabilities.\n\nEmpower data scientists with an intuitive and interactive workspace where they can easily\n\ncollaborate on data, share models and code, and manage the entire machine learning\n\nlifecycle in one place. Databricks notebooks natively support Python, R, SQL and Scala so\n\npractitioners can work together with the languages and libraries of their choice.\n\nDeliver on mission objectives with powerful analytics across agencies\n\nThe Databricks Lakehouse Platform includes a business intelligence capability — Databricks\n\nSQL. Databricks SQL allows data analysts and users to query and run reports against all of\n\nan agency’s unified data. Databricks SQL integrates with BI tools, like Tableau and Microsoft\n\nPower BI, and complements any existing BI tools with a SQL-native interface, allowing data\n\nanalysts and data scientists to query data directly within Databricks.\n\nAdditionally, with Databricks SQL, the data team can turn insights from real-world data into\n\n\npowerful visualizations designed for machine learning. Visualizations can then be turned\n\ninto interactive dashboards to share insights with peers across agencies, policymakers,\n\n\nEasily create visualizations and share dashboards via integrations with BI tools, like Tableau and Microsoft Power BI\n\n\nregulators and decision-makers.\n\n\n-----\n\nEnsure data security and compliance at scale\n\nDatabricks is fully aware of the sensitivity of the data that many of our federal agencies are\n\nresponsible for. From national security and defense data to individual health and financial\n\ninformation to national infrastructure and energy data — all of it is critical. Data is protected\n\nat every level of the platform through deep integration with fine-grained, cloud-provider\n\naccess control mechanisms. The Databricks Lakehouse is a massively secure and scalable\n\nmulticloud platform running millions of machines every day. 
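As a brief aside on those fine-grained controls, permissions on lakehouse objects can be expressed as plain SQL grants. The snippet below is a hedged sketch only: the securable names and principals are hypothetical, and the exact privilege model depends on whether Unity Catalog or workspace table access controls are in use.

```python
# Hypothetical objects and principals: adjust to your own governance model.
spark.sql("""
    GRANT SELECT ON TABLE agency_lakehouse.gold.benefits_summary
    TO `analysts@agency.gov`
""")

spark.sql("""
    GRANT SELECT, MODIFY ON SCHEMA agency_lakehouse.silver
    TO `data_engineers@agency.gov`
""")

# Review effective permissions from the same SQL interface.
spark.sql("SHOW GRANTS ON TABLE agency_lakehouse.gold.benefits_summary").show(truncate=False)
```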
It is independently audited\n\nand compliant with FedRAMP security assessment protocols on the Azure cloud and can\n\nprovide a HIPAA-compliant deployment on both AWS and Azure clouds.\n\nThe platform’s administration capabilities include tools to manage user access, control\n\nspend, audit usage, and analyze activity across every workspace, all while seamlessly\n\nenforcing user and data governance, at any scale.\n\nWith complete AWS accreditation, Databricks runs across all major networks including\n\nGovCloud, SC2S, C2S and commercial; all networks, including public, NIPR, SIPR and JWICS;\n\nand ATOs, including FISMA, IL5, IL6, ICD 503 INT-A and INT-B.\n\n\n-----\n\n**CUSTOMER STORY: U.S. CITIZENSHIP AND IMMIGRATION SERVICES**\n### Streamlining the path to citizenship with data\n\n##### 24x faster\n\nquery\nperformance\n\n\n##### 10 minutes\n\nto process tables\nwith 120 million rows\n\n\n##### 40 million\n\napplications\nprocessed\n\n\nThe U.S. Citizenship and Immigration Services (USCIS) gains actionable insights from\n\ndashboards via Tableau to better understand how to streamline operations and more quickly\n\nprocess immigration and employment applications as well as petitions. Today, their data\n\nanalyst team has over 6,000 Tableau dashboards running — all powered by Databricks.\n\nThe U.S. Citizenship and Immigration Services is the government agency that oversees\n\n\nlawful immigration to the United States. Over the last decade, the volume of immigration-\n\nand citizenship-related applications has skyrocketed across naturalizations, green cards,\n\nemployment authorizations and other categories. With millions of applications and petitions\n\nflooding the USCIS, processing delays were reaching crisis levels — with overall case\n\nprocessing times increasing 91% since FY2014.\n\n\n-----\n\nProcessing delays fueled by on-premises, legacy architecture\n\nCore to these issues was an on-premises, legacy architecture that was complex, slow and\n\ncostly to scale. By migrating to AWS and Databricks, USCIS adopted a unified approach\n\nto data analytics with more big data processing power and the federation of data\n\nacross dozens of disparate sources. This has unlocked operational efficiencies and new\n\n\nA new era of data-driven innovation improves operations\n\nUSCIS now has the ability to understand their data more quickly, which has unlocked new\n\nopportunities for innovation. With Databricks, they are able to run queries in 19 minutes,\n\nsomething that used to take an entire day — a 24x performance gain. This means they are\n\nspending far less time troubleshooting and more time creating value.\n\n\nopportunities for their entire data organization to drive business intelligence and fuel ML\n\ninnovations designed to streamline application and petition processes.\n\nRemoving complexities with a fully managed cloud platform\n\n\nSince migrating to the cloud and integrating Databricks into their data analytics workflows,\n\nUSCIS has been able to make smarter decisions that help streamline processes and\n\nleverage ML to reduce application processing times. 
These newfound efficiencies and\n\ncapabilities have allowed them to scale their data footprint from about 30 data sources to\n\n75 without issue.\n\nDatabricks provided USCIS with significant impact where it mattered most — faster\n\nprocessing speeds that enabled data analysts to deliver timely reports to decision-\n\n\nWe discovered Databricks, and\nthe light bulb really clicked for\nus on what we needed to do\nmoving forward to stay relevant.\n\n\nmakers — and that freed up data scientists to build ML models to help improve operations.\n\nLeveraging the efficiencies of the cloud and Delta Lake, they were able to easily provision a\n\n\n26-node cluster within minutes and ingest tables with 120 million rows into S3 in under 10\n\nminutes. Prior to Databricks, performing the same processes would have taken somewhere\n\n\n**SHAWN BENJAMIN**\n\n**CHIEF OF DATA AND BUSINESS INTELLIGENCE, USCIS**\n\n\nbetween two and three hours.\n\n\n-----\n\n### Conclusion\n\nEnabling federal agencies to take advantage of data analytics and AI will help them execute\n\ntheir missions both effectively and efficiently. The Databricks Lakehouse Platform will unify\n\ndata, analytics and AI workloads, making agencies data-driven and giving policymakers\n\naccess to deeper, more meaningful insights for decision-making. It will also eliminate data\n\nsilos and increase communication and collaboration across agencies to ensure the best\n\nresults for all citizens.\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 5,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M, and over 40% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe. Founded by the original\n\ncreators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems.\n\nGet started with a free trial of Databricks and\nstart building data applications today\n\n**START YOUR FREE TRIAL**\n\nTo learn more, visit us at: **dbricks.co/federal**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Data-AI-in-Fed-Gov-Ebook.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**eBook**\n\n# Cybersecurity in Financial Services\n\n### Protecting financial institutions with advanced analytics and AI\n\n\n-----\n\n## Contents\n\nThe State of the Industry .................................................................................................................................................................................... **03**\n\nA New Commitment to Cybersecurity ....................................................................................................................................................... **04**\n\nThe Biggest Challenge With Security Analytics ..................................................................................................................................... **05**\n\nJourney of SecOps: Destination Lakehouse ............................................................................................................................................ **06**\n\nRethinking Cybersecurity in Financial Services With Databricks Lakehouse ......................................................................... 
**07**\n\nLakehouse in Financial Services ..................................................................................................................................................................... **08**\n\nLakehouse and SIEM: The Pattern for Cloud-Scale Security Operations .................................................................................. **12**\n\nCommon Use Cases ................................................................................................................................................................................................ **14**\n\nGetting Started With Databricks for Cybersecurity ............................................................................................................................. **15**\n\n\n-----\n\n**I N T R O D U C T I O N**\n\n## The State of the Industry\n\n\nCloud, cost and complexity of customer data and cybersecurity are\ntop of mind for every financial services security leader today. As\nfinancial services institutions (FSIs) continue to accelerate their digital\ntransformation, cybercriminals, fraudsters and state-sponsored actors\ncontinue with more sophisticated threats. The impact of these attacks\nranges from the exposure of highly sensitive data to the disruption\nof services and the exploitation of backdoors for future attacks — all\nresulting in both financial and non-financial costs. Responding quickly\nto potential threats requires security tools capable of analyzing billions\nof threat signals in real-time.\n\nRecently, it seems like every week reveals a new data breach or ransomware assault,\nand the cost is skyrocketing: more than $4 million per incident, up 10 percent from\n2020, and about $401 million for a substantial [breach at a large corporation](https://www.ibm.com/security/data-breach) .\n\n\n**Cybersecurity is no longer just a back-office cost and now**\n**poses critical business risks, such as:**\n\n**•** Operational disruption\n\n**•** Material customer loss\n\n**•** Increase in insurance premiums\n\n**•** Lawsuits or fines\n\n**•** Systemic destabilization\n\n**•** Credit downgrade\n\n**•** Reputational damage\n\nSource: Navigating Cyber 2022, FS-ISAC, Annual Cyber Threat Review and Predictions\n\n\n-----\n\n## A New Commitment to Cybersecurity\n\n\nIt comes as no surprise that in recent years FSIs have seen an amplified\ncommitment to cybersecurity. As business leaders look to new solutions, large\nportions of IT budgets are now devoted to leveraging data and AI to thwart\ncyberattacks.\n\nFurthermore, regulators are taking notice of the increased risk of cybersecurity\nthreats. 
Growing geopolitical tensions have also prompted federal agencies such\nas the Cybersecurity and Infrastructure Security Agency and the Federal Bureau\nof Investigation [to warn](https://www.wsj.com/livecoverage/russia-ukraine-latest-news-2022-04-05/card/banks-haven-t-seen-rise-in-cyberattacks-from-russia-yet-p3F5ebzAhTauVjsNx46E) that “tough sanctions imposed on Russia could prompt a\nspate of cyberattacks against critical infrastructure such as banks.” Additionally,\nthe Securities and Exchange Commission released its [2022 Exam Priorities](https://www.sec.gov/news/press-release/2022-57) , which\ninclude information security, and specifically “how firms are safeguarding their\ncustomers’ records and assets from cyber threats, including oversight of thirdparty providers, identification of red flags related to identity theft, response to\nincidents, including to ransomware attacks and management of operational risk in\nlight of ‘a dispersed workforce.’”\n\nHowever, as is often the case, implementing new cybersecurity strategies and\nprocesses is easier said than done.\n\n\n**Cybersecurity needs a transformation**\n**... breaches, cost and complexity are growing**\n\n\n## 100%\nof organizations surveyed have had\nbreaches.\n**The average breach costs $4M**\n\n## 85%\n**will increase their cyber budget**\nnext FY. Cybersecurity industry will\ngrow to $366B by ‘28\n\n\n## 67%\nof organizations were **breached at**\n**least three times** . A mega breach\ncosts $401M.\n\n**Cost, Complexity, Cloud**\n\n- \u0007Hundreds of tools with expanding\nfootprints\n\n- \u0007Data locked in vendor proprietary\ntools\n\n- \u0007Humans compensating for\nanalytical and integration\ndeficiencies\n\n\nIn this eBook, we’ll take a closer look at the challenges associated with replacing\nthe infrastructure of a legacy data analytics system, and how financial institutions\nare solving them with Databricks.\n\n\n-----\n\n## The Biggest Challenge With Security Analytics\n\n\nFor many FSIs, on-premises security incident and event management (SIEM)\ntechnologies have been the go-to solution for threat detection, analysis and\ninvestigations. However, these legacy technologies were built for a world where big\ndata was measured in gigabytes, not today’s terabytes or petabytes. This means\nthat not only are legacy SIEMs unable to scale to today’s data volumes, but they\nare also unable to serve the modern, distributed enterprise.\n\nBy now, the advantages of moving to the cloud are no secret to anyone. For FSIs,\nscalability, simplicity, efficiency and cost are absolutely essential components of\nsuccess. Many within FinServ are looking to cloud computing to make this possible,\nadding detection and response in the cloud to the security team’s responsibility.\n\nBecause legacy SIEMs predate the emergence of cloud, artificial intelligence and\nmachine learning (AI/ML) in the mainstream, they’re unable to address the complex\ndata and AI-driven analytics needed for threat detection, threat hunting, in-stream\nthreat intelligence enrichment, analytical automation and analyst collaboration.\n\nIn other words, legacy SIEMs are no longer suitable for the modern enterprise or\nthe current threat landscape.\n\n\n**Counting the Financial Cost of Legacy SIEMs**\n\nThe financial cost of the continued use of legacy SIEMs continues to rise because\nmost SIEM providers charge their customers based on the volume of data\ningested. 
While some legacy technologies are available in the cloud, they’re either\nnot designed to be cloud-native applications or confined to a single cloud service\nprovider. As a result, security teams have to employ multiple tools for detection,\ninvestigation and response — or pay exorbitant egress charges for data transiting\nfrom one cloud provider to another. This causes operational slowdowns, errors\ndriven by complexity, and inconsistent implementation of security policies.\n\nA lack of support for multiple clouds also means an increase in maintenance\noverhead. Security staff members are often stressed because analysts have to\nlearn different tools for different cloud platforms. For some, it also creates an\nimplicit cloud vendor lock-in, meaning that security teams are unable to support\nmissions because their tools are not portable across multiple cloud providers.\n\nCollectively, these drawbacks to legacy SIEMs result in a much weaker security\nposture for FSIs.\n\n\n-----\n\n## Journey of SecOps: Destination Lakehouse\n\nHow did security analytics get to this point? In the early days, there was a need to aggregate alerts from antiviruses and intrusion detection systems. SIEMs were born, built\non data warehouses, relational databases or NoSQL database management systems. But as incident investigation needs evolved, those data warehouses weren’t able to\nhandle the volume and variety of data, which led to the development of data lakes. Data lakes were cost-effective and scalable but didn’t have strong data governance and\ndata hygiene, earning them the moniker of “data swamps.” Simply integrating the two tech stacks is really complicated because of varying governance models, data silos\nand inconsistent use case support. Fast-forward to today, security teams now need AI/ML at scale in a multicloud world.\n\nWhy choose one or the other? The lakehouse architecture has emerged in recent years to help address these concerns with a single unified architecture for all your threat\ndata, analytics and AI in the cloud. The governance and transactional capabilities of the data warehouse, the scale and flexibility of a data lake, AI/ML from the ground up\nand multicloud native deployments in one platform – this is a modern architecture called the lakehouse (data lake and data warehouse).\n\n**Current Challenges** **Introducing the Data Lakehouse**\n\n\n**Cloud Storage**\nNo support for\nanalytics or\ninvestigations\n\n**SIEMs**\nNo attack chaining.\nPoor for high\ncardinality search.\n\n\n**UBA tools**\nNo historical search,\nblackbox,\nproprietary storage\n\n**No SIEM/Log**\nsolution is\nmulticloud\nnative\n\n\n**Curated Alerts** **Cloud-scale**\n**search**\n\n**ML/AI** **Multicloud**\n\n\n-----\n\n## Rethinking Cybersecurity in Financial Services With Databricks Lakehouse\n\nDatabricks introduced the first data lakehouse platform to the industry, and today over 7,000 customers use it worldwide. With Databricks Lakehouse, FSIs that are ready to\nmodernize their data infrastructure and analytics capabilities for better protection against cyber threats now have one cost-effective solution that addresses the needs of\nall their teams.\n\nThe Databricks Lakehouse Platform combines the best elements of data lakes and data warehouses, delivering the low-cost, flexible object stores offered by data lakes and\nthe data management and performance typically found in data warehouses. 
This unified platform simplifies existing architecture by eliminating the data silos that traditionally\nseparate analytics, data science and ML. It’s built on open source, open data and open standards to maximize flexibility, and its inherent collaborative capabilities accelerate\nthe ability to work across teams and innovate faster. Moreover, because it’s multicloud, it works the same way no matter which cloud provider is used.\n\nETL and Enrichment\n\n**Proof Point**\n\n**Firewall**\n\n**Antivirus**\n\n\n-----\n\n## Lakehouse in Financial Services\n\nBy unifying data with analytics and AI, Lakehouse allows FSIs to easily access all their data for downstream advanced analytics capabilities to support complex security\nuse cases. Lakehouse facilitates collaboration between threat intelligence teams and cyber operations, enables security operations teams to detect advanced threats, and\nreduces human resource burnout through analytical automation and collaboration. Importantly, Lakehouse also accelerates investigations from days to minutes.\n\nAlong with a more modern architecture, the Lakehouse Platform includes Delta Lake, which unifies all security data in a transactional data lake to feed advanced analytics.\nThe analytics and collaboration are done in notebooks, and security teams can use multiple languages — SQL, Python, R and Scala — in the same notebook. This makes\nit easy for security practitioners to explore data and develop advanced analytics and reporting using their favorite methods. Additionally, a separation of compute from\nstorage means performance at scale without impacting overall storage costs.\n\n\n-----\n\n**C A S E S T U D Y**\n\n**When It Comes to Security, Data Is the Best Defense***\n\n**Protecting HSBC’s 40 million customers begins with collecting and processing data from billions**\n**of signals to make previously impossible threat detection possible**\n\nsecurity operation departments, creating an enhanced relationship that results\nin better defenses, insight into the security posture of the organization, and the\nability to respond at the pace of the adversary.\n\n\nThe old way of thinking about security — stronger locks, higher walls — is outdated\nand ineffective. “When defending an organization, too often we just focus heavily\non tools, technology, and reactive scenarios,” said T.J. Campana, managing director\nof global defense and chief technology officer at HSBC, the multinational bank. “But\nthe security business is a data business. And the data always has a story to tell us.”\n\nThe quality of security, he added, is proportional to the information that can be\n\ndistilled from petabytes of data that endlessly flows through company networks.\nThat means “empowering people to get the right insights, in the right way to\nquickly prevent, detect, and respond to threats, wherever and whenever they\noccur,” said George Webster, executive director of global cybersecurity science\nand analytics at HSBC.\n\nIf a big organization is made up of tens of millions of parts that must click together\nseamlessly, security keeps those seals tight. Data gathering, analytical tools, and\nhuman intellect work together as one. This involves fusing the data science and\n\n\nBut working across years of data at petabyte scale is not an easy task, especially\nwhen a long time is measured in minutes and the adversary is constantly working\nagainst you. 
To put this in perspective, the security teams at HSBC intake 10 times\nthe amount of data contained in all of the books in the U.S. Library of Congress\nevery day, and must process months, if not years, of data at a time. That is where\ninnovative design, smart people, and leveraging the right technology come into\nplay. “We have to break the paradigm of the tool being the end goal of defense\nand instead view the tools as an enabler of our people,” said Webster. “It is always\nabout the people,” added Campana.\n\nHSBC turned away from the common security paradigm by leveraging the big data\nprocessing techniques from Azure Databricks. In many ways, their open source\nDelta Lake is the key enabler, with Spark being the engine. Delta Lake allows these\nteams to structure, optimize, and unlock data at scale, while Spark allows multiple\ncomplex programs to seamlessly crunch through the data. This enables HSBC’s\nsecurity teams to constantly evolve their defenses, create new capabilities at\npace, and perform investigations that were previously impossible. When a new\nthreat emerges, the bank doesn’t have the luxury to wait for the security market to\nidentify, respond, and mitigate. Instead, the bank turns to its people and creates\nwhat is needed at breathtaking speed.\n\n\n-----\n\n**C A S E S T U D Y : C O N T I N U E D**\n\n\nIt’s an essential function for HSBC, which needs to continually think about how to\nkeep more than 40 million customers in 64 countries and territories safe. Taken\ntogether, it’s an all-brains-on-deck moment with data and people guiding the\nship. It’s also a tall task for a company as massive and multifaceted as HSBC.\nHeadquartered in the UK, it is one of the largest global banks (total assets: a\nwhopping $2.968 trillion), with operations across Africa, Europe, Asia, and the\nAmericas. It’s also the largest bank in Hong Kong and even prints some of the local\ncurrency, which bears the HSBC name.\n\nThe bank’s cybersecurity approach involves fusing the data science and security\noperation departments, creating an enhanced relationship that results in more\nefficient threat discovery, rapid development of operational use cases and AI\nmodels. This enables the continuous creation of capabilities that stop adversaries\nbefore they even start. “We have to get out of the mindset that security is a walled\ngarden,” said Webster. “We must create truly collaborative environments for our\npeople to enable the business to operate,” said Campana.\n\nStaffing this symbiotic power center will be someone Campana optimistically calls\n“the analyst of the future,” a description that’s both mindset and skillset: threat\nhunter and data scientist.\n\nIn addition, when another organization is hit by cybercrime, HSBC analyzes it\nto understand how it may have responded and then improves its defenses\naccordingly. That’s in contrast to the industry norm; a Ponemon survey revealed\n\n\nthat 47 percent of organizations have not assessed the readiness of their incident\nresponse teams. That means the first time they test their plans will be at the worst\npossible time — in the middle of a cyber attack.\n\nThe proactive approach is a far cry from the old reactive conveyor belt model of\nsecurity when alert tickets were received from tooling and processed in a slow\nand linear way. Today, cross-disciplinary security teams don’t just react; they\ncontinually search for the signals in the noise — tiny aberrations that indicate\nsomething’s not right – and send up red flags in real-time. 
“We’re scanning\nhundreds of billions of signals per day. I cannot wait. We need situational\nawareness right now,” said Campana.\n\nThat increased speed is critical for threat assessment. Information theft may be\nthe most expensive and fastest-rising consequence of cybercrime, but data is not\nthe only target. Core systems are being hacked in a dangerous trend to disrupt\nand destroy. Regulators are also increasingly asking banks for controls in place to\ndetect and preempt financial crimes. That’s where big data tooling like Delta Lake\nand Spark shine, and where it will continually be called on to address the security\nneeds of new initiatives.\n\n“Digital security is about organically adjusting to risks,” said Webster. “It’s a journey\nof continual discovery with one central goal: to protect customers. They want\nthings easy and they want them quick. It’s our job to make sure that it’s secure.”\n\n*This story previously appeared in [WIRED Brand Lab for Databricks](https://www.wired.com/sponsored/story/when-it-comes-to-security-data-is-the-best-defense/) .\n\n\n-----\n\n**Advantages of a Lakehouse**\n\n\n**A cost-efficient upgrade**\n\nDatabricks customers only pay for the data they\nanalyze, not for what they collect. This means that\nsecurity teams can collect any amount of data\nwithout worrying about ingest-based pricing, and\nonly pay for the data that’s actually used for analysis\n— for example, an incident investigation or a data\ncall for an audit. This pricing model enables security\nteams to collect data that was previously out of\nreach, such as netflow data, endpoint detection and\nresponse data, and application and services data.\n\nFurther, Databricks is a fully managed service,\nmeaning that security teams don’t have to\npre-commit to hardware capital expenditures.\nWith no hardware to manage and no big data\nimplementations to maintain, security teams\ncan significantly reduce their management and\nmaintenance costs.\n\n\n**Multicloud**\n\nDatabricks is cloud-native on AWS, Microsoft Azure\nand Google Cloud. This creates freedom for the\nsecurity teams to use whatever cloud provider they\nlike. Additionally, teams can acquire and maintain\noperational consistency across all providers when\nthey have multiple cloud footprints. This enables\nconsistent policy implementation, reduced\ncomplexity for staff and increased efficiency.\n\nAdditionally, Databricks enables faster detection,\ninvestigation and response across the enterprise\nbecause analytics can be reused across the\nmajor cloud providers through a unified platform\nthat centralizes data for easy sharing and fosters\ncollaboration across teams.\n\n\n**Enterprise security and**\n**360° risk management**\n\nThe Lakehouse Platform is easy to set up, manage,\nscale and, most importantly, secure. This is because\nLakehouse easily integrates with existing security\nand management tools, enabling users to extend\ntheir policies for peace of mind and greater control.\n\nWith multicloud management, security admins and\ndata teams get a consistent experience across all\nmajor cloud providers. 
This saves valuable time\nand the resources required to upskill talent on\nproprietary services for data, analytics and AI.\n\nSecurity, risk and compliance leaders are also\nable to give team members a range of security\npermissions that come with thorough audit trails.\nThis allows teams to quickly spin up and wind down\ncollaborative workspaces for any project and to\nmanage use cases from end to end — from enabling\nuser access and controlling spend to auditing usage\nand analyzing activity across every workspace to\nenforce user and data governance.\n\n\n-----\n\n## Lakehouse and SIEM: The Pattern for Cloud-Scale Security Operations\n\n\nAccording to George Webster, head of cybersecurity sciences and analytics at\nHSBC, Lakehouse and SIEM is the pattern for security operations. What does\nit look like? It leverages the strengths of the two components: Lakehouse for\nmulticloud native storage and analytics, SIEM for security operations workflows.\nFor Databricks customers like HSBC, there are two general patterns for this\nintegration that are both underpinned by what Webster calls the cybersecurity\ndata lake with Lakehouse.\n\nIn the first pattern, Lakehouse stores all the data for the maximum retention\nperiod. A subset of the data is then sent to the SIEM and stored for a fraction of\nthe time. This pattern has the advantage of allowing analysts to query near-term\n\n\ndata using the SIEM while having the ability to do historical analysis and more\nsophisticated analytics in Databricks. It also lets them manage any licensing or\nstorage costs for the SIEM deployment.\n\nThe second pattern is to send the highest-volume data sources to Databricks —\nfor example, cloud-native logs, endpoint threat detection and response logs, DNS\ndata and network events. Low-volume data sources such as alerts, e-mail logs\nand vulnerability scan data go to the SIEM. This pattern enables Tier 1 analysts to\nquickly handle high-priority alerts in the SIEM. Threat-hunt teams and investigators\ncan leverage the advanced analytical capabilities of Databricks. This pattern has a\ncost-benefit of offloading processing, ingestion and storage from the SIEM.\n\n\n-----\n\n**Databricks and Splunk:**\n**A Case Study in Cost-Savings**\n\nDatabricks integrates with your preferred SIEM, like\nSplunk, and the Splunk-certified Databricks add-on\ncan be used to meet SOC needs without changing\nthe user interface. This example features a global\nfinancial institution’s security operation, where\nthe organization grew throughput from 25TB per\nday with only 180 days lookback, to 100TB per day\nwith 395 days lookback using the Databricks SIEM\naugmentation. The total cost of ownership savings,\nincluding infrastructure and license costs, saved tens\nof millions (more than $80mn per year) in cloud costs.\n\n\n##### FinServ Security Operations\n\nDatabricks + Splunk **Drastically** Lowered Costs\n\nChart: throughput (TB per day) and lookback period (days), Splunk only vs. Splunk + Databricks; throughput grew from 25 to 100 TB per day and lookback from 180 to 395 days.\n\nTCO savings with Splunk and Databricks vs. 
Splunk only solution: $81M\n\n\n-----\n\n## Common Use Cases\n\nAs FSIs focus on modernizing their data analytics and warehousing capabilities, the Databricks Lakehouse Platform\nbrings a new level of empowerment to FSIs, allowing them to unlock the full potential of their data to deliver on their\nobjectives and better serve their customers.\n\n**Common use cases include:**\n\n\n\n**•** **Threat hunting:** Empower security teams to\nproactively detect and discover advanced\nthreats using months or years of data\n\n**•** **Incident investigation:** Gain complete visibility\nacross network, endpoint, cloud and application\ndata to respond to incidents\n\n**•** **Phishing threat detection:** Uncover social\nengineering attacks that are often used to steal\nuser data, including log-in credentials and credit\ncard numbers\n\n**•** **Supply chain monitoring:** Leverage ML to\nidentify suspicious behavior within your software\nsupply chain\n\n\n\n**•** **Ransomware detection:** Scope the impact\nand spread of ransomware attacks to inform\ncomplete mitigation and remediation\n\n**•** **Credentials-abuse detection:** Identify and\ninvestigate anomalous credential usage across\nyour infrastructure\n\n**•** **Insider-threats detection:** Find and respond\nto malicious threats from people within an\norganization who have inside information about\nsecurity practices, data and computer systems\n\n**•** **Network traffic analysis:** Examine real-time\nnetwork availability and activity to identify\nanomalies, vulnerabilities and malware\n\n\n\n**•** **Analytics automation:** Automatically\ncontextualize and enrich multiple streaming and\nbatch analytics to accelerate analyst workflows\nand decision-making\n\n**•** **Augmenting anti-money laundering practices**\n**(AML):** Using structured and unstructured\ndata to maintain a list of politically exposed\nindividuals, often referred to as PEP, to augment a\nbank’s AML processes. This includes pulling data\nfrom an organization externally (keeping the PEP\nlist up-to-date including out-of-country officials\nand diplomats) as well as internally (including\ncritical personnel, network admins, etc.) who\nneed extra scrutiny.\n\n\n-----\n\n## Getting Started With Databricks for Cybersecurity\n\nGetting up and running on Databricks to address your cybersecurity needs is easy with our Solution\nAccelerators. Databricks Solution Accelerators are highly optimized, fully functional analytics solutions that\nprovide customers with a fast start to solving their data problems.\n\n**•** [Cybersecurity analytics and AI at scale with Splunk and Databricks](https://databricks.com/solutions/accelerators/cybersecurity-analytics-and-ai) : Rapidly detect threats,\ninvestigate the impact and reduce risks with the Databricks add-on for Splunk\n\n**•** [Threat detection at scale with DNS analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html) : Recognize cybercriminals using DNS,\nthreat intelligence feeds and ML\n\nDatabricks Solution Accelerators are free. 
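To make the threat hunting use case above more tangible, here is a hedged sketch of a hunt over a long-retention DNS table in the lakehouse: it surfaces domains that appear for the first time in the last day, a common beaconing indicator. The table and column names (`security.bronze.dns_events`, `domain`, `src_ip`, `event_date`) are hypothetical, and `spark` is assumed to come from a Databricks notebook.

```python
from pyspark.sql import functions as F

# Hypothetical Delta table of DNS events retained for months or years.
dns = spark.read.table("security.bronze.dns_events")

# Domains seen in the last day but never in the preceding 90 days.
recent = dns.where(F.col("event_date") >= F.date_sub(F.current_date(), 1))
baseline = (dns.where(F.col("event_date").between(
                F.date_sub(F.current_date(), 91), F.date_sub(F.current_date(), 2)))
            .select("domain").distinct())

new_domains = (recent.join(baseline, on="domain", how="left_anti")
               .groupBy("domain")
               .agg(F.countDistinct("src_ip").alias("distinct_hosts"),
                    F.count("*").alias("lookups"))
               .orderBy(F.desc("distinct_hosts")))

display(new_domains.limit(50))
```

Because storage is decoupled from compute, the same query pattern can be run across months or years of history without first pre-aggregating the data into a SIEM.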
Join the hundreds of Databricks customers using Solution\nAccelerators to drive better outcomes in their businesses.\n\nIf you’d like to learn more about how we are helping financial services institutions securely leverage data and AI,\nplease visit us at [dbricks.co/fiserv](https://databricks.com/solutions/industries/financial-services) or reach out to us at [cybersecurity@databricks.com](mailto:cybersecurity%40databricks.com?subject=) .\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including\n\nComcast, Condé Nast, Acosta and over 40% of the Fortune 500 — rely on the Databricks\n\nLakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San\n\nFrancisco, with offices around the globe. Founded by the original creators of Apache Spark,™\n\nDelta Lake and MLflow, Databricks is on a mission to help data teams solve the world’s\n\ntoughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n#### Get started with a free trial of Databricks and start building data applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=Homepage-HeroCTA-Trial)**\n\n###### To learn more, visit us at:\n dbricks.com/fiserv\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-eBook-finServ-cyber.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "**EBOOK**\n\n## Why the Data Lakehouse Is Your Next Data Warehouse\n\n\n-----\n\n### Contents\n\nPreface .......................................................................................................................................................................................................................................... **3**\n\nIntroduction ............................................................................................................................................................................................................................. **4**\n\nOur Approach: The Databricks Lakehouse Platform ................................................................................................................................... **5**\n\nIntroducing Databricks SQL: The Best Data Warehouse Is a Lakehouse ...................................................................................... **6**\n\nWhy Databricks SQL? ............................................................................................................................................................................................... 6\n\nCommon use cases .................................................................................................................................................................................................... 7\n\nThe Inner Workings of the Lakehouse ................................................................................................................................................................... **8**\n\n**PA R T 1 :** Storage layer .............................................................................................................................................................................................. 
8\n\n**PA R T 2 :** Compute layer ......................................................................................................................................................................................... 13\n\n**PA R T 3 :** Consumption layer ................................................................................................................................................................................ 19\n\nConclusion ............................................................................................................................................................................................................................. **24**\n\nCustomer Stories ............................................................................................................................................................................................................... **25**\n\n\n-----\n\n### Preface\n\nHistorically, data teams have had to resort to a bifurcated architecture to run traditional\nBI and analytics workloads, copying subsets of the data already stored in their data lake\nto a legacy data warehouse. Unfortunately, this led to the lock-in, high costs and complex\ngovernance inherent in proprietary architectures.\n\nOur customers have asked us to simplify their data architecture. We decided to accelerate\nour investments to do just that.\n\n\nWe introduced [Databricks SQL](https://databricks.com/product/databricks-sql) to simplify and provide data warehousing capabilities and\nfirst-class support for SQL on the [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) , for all your existing tools.\nWe use the term “lakehouse” to reflect our customers’ desire to combine the best of data\nwarehouses and data lakes. With the lakehouse, you can now establish one source of truth\nfor all data and enable all workloads from AI to BI on one platform. And we want to provide\nyou with ease-of-use and state-of-the-art performance at the lowest cost.\n\n\n**Reynold Xin**\n\nOriginal Creator of Apache Spark, TM\nCo-founder and Chief Architect,\nDatabricks\n\n\nThis eBook covers how we went back to the drawing board to build Databricks SQL — the\nlast mile of enabling data warehousing capabilities for your existing data lakes — as part of\nthe Databricks Lakehouse Platform.\n\n\n-----\n\n### Introduction\n\n\nMost organizations operate their business with a complex data architecture that\ncombines data warehouses and data lakes. For one thing, data lakes are great\nfor machine learning (ML). They support open formats and a large ecosystem.\nBut data lakes have poor support for business intelligence (BI) and suffer\ncomplex data quality problems. Data warehouses, on the other hand, are great\nfor BI applications. But they have limited support for ML workloads, can’t handle\nnatural language data, large-scale structured data, or raw, video, audio or image\nfiles, and are proprietary systems with only a SQL interface.\n\nAs a result, data is moved around the organization through data pipelines and\nsystems that create a multitude of data silos. A large amount of time is spent\nmaintaining these pipelines and systems rather than creating new value from\ndata, and downstream consumers struggle to get a single source of truth of the\ndata due to the inherent siloing of data that takes place. 
The situation becomes\nvery expensive, and decision-making speed and quality are negatively affected.\n\nUnifying these systems can be transformational in how we think about data.\n\n\n##### The need for simplification\n\nIt is time for a new data architecture that can meet both today’s and tomorrow’s\nneeds. Without any compromise. Advanced analytics and ML are one of the\nmost strategic priorities for data-driven organizations today, and the amount\nof unstructured data is growing exponentially. So it makes sense to position\nthe data lake as the center of the data infrastructure. However, for this to be\nachievable, the data lake needs to adopt the strengths of data warehouses.\n\nThe answer is the [lakehouse](https://databricks.com/blog/2021/05/19/evolution-to-the-data-lakehouse.html) , an open data architecture enabled by a new open\nand standardized system design: one that implements data structure and data\nmanagement features similar to those in a data warehouse, directly on the lowcost storage used for data lakes.\n\n**[DOWNLOAD NOW](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)**\n\n##### Building the Data Lakehouse\n[Bill Immon, Father of the Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse?utm_medium=paid+search&utm_source=google&utm_campaign=14925739153&utm_adgroup=133613202892&utm_content=ebook&utm_offer=building-the-data-lakehouse&utm_ad=552195081555&utm_term=data%20lakehouse%20databricks&gclid=Cj0KCQiAzMGNBhCyARIsANpUkzPYW8MmlNjO9tOWa_35rFFe7Jti32z5Debcr_nG5QU_1-GEuznzUy8aAm-PEALw_wcB)\n\n\n-----\n\n### Our Approach: The Databricks Lakehouse Platform\n\nOur customers have asked us for simplification. This is why we’ve embarked on\nthis journey to deliver one simple, open and collaborative platform for all your\ndata, AI and BI workloads on your existing data lakes.\n\nThe [Databricks Lakehouse Platform](https://databricks.com/product/data-lakehouse) greatly simplifies data architectures by\ncombining the data management and performance typically found in data\nwarehouses with the low-cost, flexible object stores offered by data lakes.\n\nIt’s built on open source and open standards to maximize flexibility, and lets you\nstore all your data — structured, semi-structured and unstructured — in your\nexisting data lake while still getting the data quality, performance, security and\ngovernance you’d expect from a data warehouse. Data only needs to exist once\nto support all of your data, AI and BI workloads on one common platform\n— establishing one source of truth.\n\nFinally, the Lakehouse Platform provides tailored and collaborative\nexperiences so data engineers, data scientists and analysts can work together\non one common platform across the entire data lifecycle — from ingestion to\nconsumption and the serving of data products — and innovate faster.\n\nLet’s look at how, with the right data structures and data management\ncapabilities in place, we can now deliver data warehouse and analytics\ncapabilities on your lakehouse. 
That’s where Databricks SQL (DB SQL) comes in.\n\n**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n\n\n-----\n\n### Introducing Databricks SQL: The Best Data Warehouse Is a Lakehouse\n\n\nDatabricks SQL is a serverless data warehouse on the Databricks Lakehouse\nPlatform that lets you run all your SQL and BI applications at scale with up to 12x\nbetter price/performance, a unified governance model, open formats and APIs,\nand your tools of choice — no vendor lock-in. Reduce resource management\noverhead with serverless compute, and easily ingest, transform and query\nall your data in place to deliver real-time business insights faster. In fact, DB\nSQL now holds the new world record in 100TB TPC-DS, the gold standard\nperformance benchmark for data warehousing.\n\nBuilt on open standards and APIs, the lakehouse provides an open, simplified and\nmulticloud architecture that brings the best of data warehousing and data lakes\ntogether, and integrations with a rich ecosystem for maximum flexibility.\n\n\n##### Why Databricks SQL?\n\nBest Price/Performance\nLower costs, get world-class performance, and eliminate the need to manage,\nconfigure or scale cloud infrastructure with serverless.\n\nBuilt-In Governance\nEstablish one single copy for all your data using open standards, and one unified\ngovernance layer across all data teams using standard SQL.\n\nRich Ecosystem\nUse SQL and any tool like Fivetran, dbt, Power BI or Tableau along with Databricks\nto ingest, transform and query all your data in place.\n\nBreak Down Silos\nEmpower every analyst to access the latest data faster for downstream real-time\nanalytics, and go effortlessly from BI to ML.\n\n**[WATCH A DEMO](https://databricks.com/discover/demos/databricks-sql)**\n\n\n-----\n\n### Common use cases\n\nThousands of customers like [Atlassian](https://www.google.com/search?q=atlassian+databricks+keynote&oq=atlassian+databricks+keynote&aqs=chrome..69i57j69i60j69i65l3j69i60j69i64l2.6409j0j1&sourceid=chrome&ie=UTF-8#:~:text=12%3A26,May%2026%2C%202021) , [SEGA](https://youtu.be/SzeXHcwPDSE) and [Punchh](https://databricks.com/customers-4/punchh) are using Databricks SQL to enable self-served analytics\nfor hundreds of analysts across their organizations, and to build custom data applications to better serve their\ncustomers. Below are some examples of use cases for Databricks SQL.\n\n**At Atlassian, we have proven**\n\n\n**Query data lake data with** **Collaboratively explore** **Build rich and custom**\n**your BI tools of choice** **the freshest data** **data applications**\n\n\n**that there is no longer a need**\n\n**for two separate data things.**\n\n**Technology has advanced**\n\n**far enough for us to consider**\n\n**one single unified lakehouse**\n\n**architecture.**\n\n**Rohan Dhupelia**\nData Platform Senior Manager,\nAtlassian\n\n\nEnable business analysts to\ndirectly query data lake data\nusing their favorite BI tool and\navoid data silos. Reengineered\nand optimized connectors\nensure fast performance,\nlow latency and high user\nconcurrency to your data lake.\nNow analysts can use the best\ntool for the job on one single\nsource of truth for your data.\n\n\nEmpower every analyst and SQL\nprofessional in your organization\nto quickly find and share new\ninsights by providing them with\na collaborative and self-served\nanalytics experience. 
Confidently\nmanage data permissions with\nfine-grained governance, share and\nreuse queries, and quickly analyze\nand share results using interactive\nvisualizations and dashboards.\n\n\nBuild more effective and\ntailored data applications\nfor your own organization or\nyour customers. Benefit from\nthe ease of connectivity,\nmanagement and better price/\nperformance of DB SQL to\nsimplify development of dataenhanced applications at scale,\nall served from your data lake.\n\n\n-----\n\n### The Inner Workings of the Lakehouse\n\n\nIn the next chapter, we’ll unpack the three foundational layers of the Databricks\nLakehouse Platform and how we went back to the drawing board to build this\nexperience. Specifically, we’ll dive into how we built Databricks SQL to deliver\nanalytics and data warehousing workloads on your lakehouse.\n\n\nThose layers are:\n\n**1 .** The storage layer, or how we store and govern data\n\n**2 .** The compute layer, or how we process queries\n\n**3 .** The consumption layer, or the tools you can use to interface with the system\n\n\n###### PART 1: STORAGE LAYER\n\nIn order to bring the best of data lakes and data\nwarehouses, we needed to support the openness\nand flexibility of data lakes, as well as the quality,\nperformance and governance you’d expect from a\ndata warehouse.\n\n\n**Storage layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake Open format|Data Warehouse Closed, proprietary format|Data Lakehouse Open format|\n|---|---|---|\n|Low quality, “data swamp”|High-quality, reliable data|High-quality, reliable data|\n|File-level access control|Fine-grained governance (tables row/columnar level)|Fine-grained governance (tables row/columnar level)|\n|All data types|Structured only|All data types|\n|Requires manually specifying how to lay out data|Automatically lays out data to query efficiently|Automatically lays out data to query efficiently|\n\n\n-----\n\n##### Transactional guarantees for your data lake\n\n\nThe open source format [Delta Lake](https://delta.io/) — based on Parquet — solves historical data\nlake challenges around data quality and reliability. It is the foundation for the\nlakehouse, and Databricks SQL stores and processes data using Delta Lake.\n\nFor example, it provides ACID transactions to ensure that every operation either\nfully succeeds or fully aborts for later retries — without requiring new data\npipelines to be created. It unifies batch and streaming pipelines so you can\neasily merge existing and new data at the speed required for your business. With\nTime Travel, Delta Lake automatically records all past transactions, so it’s easy\nto access and use previous versions of your data for compliance needs or for\nML applications. Advanced indexing, caching and auto-tuning allow optimization\nof Delta tables for the best query performance. Delta Lake also acts as the\nfoundation for fine-grained, role-based access controls on the lakehouse.\n\nAs a result, Delta Lake allows you to treat tables in Databricks SQL just like you\ntreat tables in a database: updates, inserts and merges can take place with high\nperformance at the row level. This is particularly useful if you are inserting new\n\n\ndata rapidly (e.g., in IoT or e-commerce use cases), or if you are redacting data\n(e.g., for compliance laws such as GDPR). 
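To ground this, the following is a minimal PySpark sketch of those row-level operations on a Delta table, plus a Time Travel read. The table names are hypothetical and `spark` is assumed to be a Databricks or Delta-enabled session.

```python
from delta.tables import DeltaTable
from pyspark.sql import functions as F

# Hypothetical Delta tables.
orders = DeltaTable.forName(spark, "lakehouse.silver.orders")
updates = spark.read.table("lakehouse.bronze.order_updates")

# Row-level upsert: apply a batch of new and changed rows in one MERGE.
(orders.alias("t")
    .merge(updates.alias("s"), "t.order_id = s.order_id")
    .whenMatchedUpdateAll()
    .whenNotMatchedInsertAll()
    .execute())

# Row-level delete, e.g., to redact a single customer for a GDPR request.
orders.delete(F.col("customer_id") == "c-123")

# Time Travel: query the table as it was before these changes.
previous = spark.sql("SELECT * FROM lakehouse.silver.orders VERSION AS OF 0")
```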
Furthermore, Delta Lake provides you\nwith one open and standard format — not only for SQL but also for Python, Scala\nand other languages — so you can run all analytical and ML use cases on the\nsame data.\n\n**Delta Lake provides the key**\n\nAn open format storage layer built for lake-first architecture\n\nACID transactions, Time Travel, highly available\n\nAdvanced indexing, caching, auto-tuning\n\nFine-grained, role-based access controls\n\nStreaming & batch, analytics & ML\n\nPython, SQL, R, Scala\n\nDelta Lake brings data quality, performance and governance to the lakehouse\n\n**[DOWNLOAD NOW](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)**\n##### Delta Lake: The Definitive Guide\n[by O’Reilly](https://databricks.com/p/ebook/delta-lake-the-definitive-guide-by-oreilly)\n\n\n-----\n\n##### A framework for building a curated data lake\n\n\nWith the ability to ingest petabytes of data with auto-evolving schemas, Delta\nLake helps turn raw data into actionable data by incrementally and efficiently\nprocessing data as it arrives from files or streaming sources like Kafka, Kinesis,\nEvent Hubs, DBMS and NoSQL. It can also automatically and efficiently track data\nas it arrives with no manual intervention, as well as infer schema, detect column\nchanges for structured and unstructured data formats, and prevent data loss by\nrescuing data columns that don’t meet data quality specifications. And now with\n[Partner Connect](https://www.databricks.com/partnerconnect) , it’s never been easier to bring in critical business data from\nvarious sources.\n\nAs you refine the data, you can add more structure to it. Databricks recommends\nthe Bronze, Silver and Gold pattern. It lets you easily merge and transform new\nand existing data — in batch or streaming — while benefiting from the low-cost,\nflexible object storage offered by data lakes. Bronze is the initial landing zone\nfor the pipeline. We recommend copying data that’s as close to its raw form as\npossible to easily replay the whole pipeline from the beginning, if needed. Silver\nis where the raw data gets cleansed (think data quality checks), transformed\nand potentially enriched with external data sets. Gold is the production-grade\ndata that your entire company can rely on for business intelligence, descriptive\nstatistics, and data science/machine learning.\n\n\nBy the time you get to Gold, the tables are high-value business-level metrics\nthat have all the schema enforcement and constraints applied. This way, you can\nretain the flexibility of the data lake at the Bronze and Silver levels, and then use\nthe Gold level for high-quality business data.\n\nAuto Loader\n\n\nBRONZE\n\n\nSILVER GOLD\n\n\nStructured Streaming\n\nBatch\n\nCOPY INTO\n\nPartners\n\n\nRaw ingestion Filtered, cleaned Business-level\nand history and augmented aggregates\n\n|Col1|Col2|\n|---|---|\n||R|\n\n\n**[LEARN MORE](https://youtu.be/n9cRw6AkNDQ)**\n\n\n-----\n\n##### An aside on batch and streaming data pipelines\n\n\nThe best way to set up and run data pipelines in the Bronze/Silver/Gold\npattern recommended on the previous page is in Delta Live Tables (DLT).\nDLT makes it easy to build and manage reliable batch and streaming\ndata pipelines that deliver high-quality data. 
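As a sketch of what the Bronze/Silver/Gold pattern can look like in Delta Live Tables, the Python below declares three dependent tables. It only runs inside a DLT pipeline, and the landing path, table names and expectation are illustrative assumptions rather than an example from this eBook.

```python
import dlt
from pyspark.sql import functions as F

@dlt.table(comment="Raw events landed as-is (Bronze).")
def bronze_events():
    return (spark.readStream
            .format("cloudFiles")                    # Auto Loader
            .option("cloudFiles.format", "json")
            .load("/Volumes/lakehouse/raw/events"))  # illustrative landing path

@dlt.table(comment="Cleansed, validated events (Silver).")
@dlt.expect_or_drop("valid_event", "event_id IS NOT NULL AND event_time IS NOT NULL")
def silver_events():
    return (dlt.read_stream("bronze_events")
            .select("event_id", "event_time", "payload"))

@dlt.table(comment="Business-level daily aggregates (Gold).")
def gold_daily_counts():
    return (dlt.read("silver_events")
            .groupBy(F.to_date("event_time").alias("event_date"))
            .count())
```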
It helps data engineering\nteams simplify ETL development and management with declarative\npipeline development, automatic data testing, and deep visibility for\nmonitoring and recovery.\n\nThe fact that you can run all your batch and streaming pipelines together\nin one simple, declarative framework makes data engineering easy on the\nDatabricks Lakehouse Platform. We regularly talk to customers who have\nbeen able to reduce pipeline development time from weeks — or months\n— to mere minutes with Delta Live Tables. And by the way, even data\n\n\nanalysts can easily interrogate DLT pipelines for the queries they need\nto run, without knowing any sort of specialized programming language\nor niche skills.\n\nOne of the top benefits of DLT, and Delta Lake in general, is that it is built\nwith streaming pipelines in mind. Today, the world operates in real time, and\nbusinesses are increasingly expected to analyze and respond to their data in\nreal time. With streaming data pipelines built on DLT, analysts can easily access,\nquery and analyze data with greater accuracy and actionability than with\nconventional batch processing. Delta Live Tables makes real-time analytics a\nreality for our customers.\n\n\n-----\n\n##### Fine-grained governance on the lakehouse\n\nDelta Lake is the foundation for open and secure [data sharing](https://databricks.com/blog/2021/05/26/introducing-delta-sharing-an-open-protocol-for-secure-data-sharing.html) and governance\non the lakehouse. It underpins the [Databricks Unity Catalog](https://databricks.com/product/unity-catalog) (in preview), which\nprovides fine-grained governance across clouds, data and ML assets. Among the\nbenefits of the Unity Catalog, it allows you to:\n\n**• Discover, audit and govern data assets in one place:** A user-friendly\ninterface, automated data lineage across tables, columns, notebooks,\nworkflows and dashboards, role-based security policies, table or\ncolumn-level tags, and central auditing capabilities make it easy for\ndata stewards to discover, manage and secure data access to meet\ncompliance and privacy needs directly on the lakehouse.\n\n\n\n**• Grant and manage permissions using SQL:** Unity Catalog brings finegrained centralized governance to data assets across clouds through the\nopen standard SQL DCL. This means database administrators can easily\ngrant permission to arbitrary, user-specific views, or set permissions on\nall columns tagged together, using familiar SQL.\n\n**• Centrally manage and audit shared data across organizations:** Every\norganization needs to share data with customers, partners and suppliers\nto better collaborate and to unlock value from their data. Unity Catalog\nbuilds on open source [Delta Sharing](http://delta.io/sharing) to centrally manage and govern\nshared assets within and across organizations.\n\n\nThe Unity Catalog makes it easy for data stewards to discover, manage and secure data access\nto meet compliance and privacy needs on the lakehouse.\n\n**[LEARN MORE](https://databricks.com/blog/2021/05/26/introducing-databricks-unity-catalog-fine-grained-governance-for-data-and-ai-on-the-lakehouse.html)**\n\n\n-----\n\n###### PART 2: COMPUTE LAYER\n\n\nThe next layer to look at is the compute layer, or how we process queries.\n\nApache Spark TM has been the de facto standard for data lake compute. 
It’s great\nfor processing terabytes and petabytes of data cheaply, but historically Spark\nSQL uses a nonstandard syntax and can be difficult to configure.\n\n\nData warehouses, on the other hand, tend to support short running queries\nreally well, especially when you have a lot of users issuing queries concurrently.\nThey tend to be easier to set up, but don’t necessarily scale or they become\ntoo costly.\n\n\n**Compute layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake High performance for large jobs (TBs to PBs)|Data Warehouse High concurrency|Data Lakehouse High performance for large jobs (TBs to PBs)|\n|---|---|---|\n|Economical|Scaling is exponentially more expensive|Economical|\n|High operational complexity|Ease of use|Ease of use|\n||||\n\n\nA popular belief is that large workloads require a drastically different system\nthan low latency, high concurrency workloads. For example, there’s the classic\ntrade-off in computer systems between latency and throughput.\n\nBut after spending a lot of time analyzing these systems, we found that it was\npossible to simultaneously improve large query performance and concurrency\n\n\nand latency. Although the classic trade-offs definitely existed, they were only\nexplicit when we optimized the system to the very theoretical optimal. It turned\nout the vast majority of software — and this includes all data warehouse systems\nand Databricks — were far away from optimal.\n\n\n-----\n\n##### Simplified administration and instant, elastic SQL compute — decoupled from storage\n\n\nTo achieve world-class performance for analytics on the lakehouse, we chose to\ncompletely rebuild the compute layer. But performance isn’t everything. We also\nwant it to be simple to administer and cheaper to use. Databricks SQL leverages\nserverless SQL warehouses that let you get started in seconds, and it’s powered\nby a new native MPP vectorized engine: Photon.\n\nDatabricks SQL warehouses are optimized and elastic SQL compute resources.\nJust pick the cluster size and Databricks automatically determines the best\ninstance types and VMs configuration for the best price/performance. This\nmeans you don’t have to worry about estimating peak demand or paying too\nmuch by overprovisioning. You just need to click a few buttons to operate.\nTo further streamline the experience, simply use [Databrick SQL Serverless](https://databricks.com/blog/2021/08/30/announcing-databricks-serverless-sql.html) .\nWith the serverless capability, queries start rapidly with zero infrastructure\nmanagement or configuration overhead. This lowers your total cost, as you pay\nonly for what you consume without idle time or overprovisioned resources.\n\n\nSince CPU clock speeds have plateaued, we also wanted to find new ways to\nprocess data faster, beyond raw compute power. One of the most impactful\nmethods has been to improve the amount of data that can be processed in\nparallel. However, data processing engines need to be specifically architected to\ntake advantage of this parallelism. So, from the ground up, we built [Photon](https://databricks.com/product/photon) , a new\nC++ based vectorized query processing engine that dramatically improves query\nperformance while remaining fully compatible with open Spark APIs. Databricks\nSQL warehouses are powered by Photon, which seamlessly coordinates work and\nresources and transparently accelerates portions of your SQL queries directly on\nyour data lake. 
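For teams that prefer to stand these up programmatically, here is a minimal sketch using the databricks-sdk Python package; the warehouse name and sizing values are illustrative assumptions, and the keyword arguments mirror the fields of the SQL Warehouses API (T-shirt cluster size, min/max clusters for scale-out, auto-stop and serverless).

```python
# Minimal sketch: create an auto-scaling, serverless SQL warehouse with the
# databricks-sdk package. Name and sizing values are hypothetical.
from databricks.sdk import WorkspaceClient

w = WorkspaceClient()  # resolves workspace URL and credentials from the environment

warehouse = w.warehouses.create(
    name="analytics-wh",             # hypothetical warehouse name
    cluster_size="Medium",           # pick a size; Databricks picks instance types and VM config
    min_num_clusters=1,              # scale-out bounds used for concurrency
    max_num_clusters=4,
    auto_stop_mins=10,               # stop when idle so you only pay for what you consume
    enable_serverless_compute=True,  # serverless: fast startup, no infrastructure to manage
).result()                           # wait for the warehouse to reach a running state

print(warehouse.id, warehouse.state)
```

Queries sent to this warehouse are executed by Photon without any change to the SQL itself.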
No need to move the data to a data warehouse.\n\n**[READ NOW](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)**\n##### Photon: A Fast Query Engine for Lakehouse Systems\n\n[SIGMOD 2022 Best Industry Paper Award](https://cs.stanford.edu/~matei/papers/2022/sigmod_photon.pdf)\n\n\n-----\n\n**Did you know?**\n\nDatabricks SQL warehouses scale automatically throughout the day to\nbetter suit your business needs. Administration is simplified by identifying\nhow many clusters can scale out with min and max, and Databricks SQL will\nauto-scale as needed. This ensures that you have ample compute to serve\nyour needs, without overprovisioning. Administrators appreciate the ability\nto have better control over consumption costs, while users appreciate that\ntheir queries process as fast and efficiently as possible. For most BI and\nanalytics use cases, using medium-size warehouses with scaling is a great\nbalance of price/performance that fits most business needs.\n\nIn the next section, we will discuss examples of Databricks SQL performance results\non large-scale analytic workloads as well as highly concurrent workloads.\n\n\nRunning Scheduled Starting Cluster Scale\n\n\n-----\n\n##### Large query performance: the fastest data warehouse\n\n\nThe industry standard benchmark used by data warehouses is TPC-DS. It includes\n100 queries that range from very simple to very sophisticated to simulate decision\nsupport workloads. This benchmark was created by a committee formed by\ndata warehousing vendors. The chart at right shows price/performance results\nrunning the 100TB version of TPC-DS, since for large workloads the numbers that\nultimately matter pertain to the performance cost. As you can see, Databricks SQL\noutperforms all cloud data warehouses we have measured.\n\n**[LEARN MORE](https://dbricks.co/benchmark)**\n\n**Did you know?**\n\n\n**$2,000**\n\n**$1,791**\n\n**$1,500**\n\n**$1,000**\n\n**$952**\n\n\n**$500**\n\n\n**$242**\n**$146**\n\n\n**$358**\n\n\n**$0**\nDatabricks SQL Databricks SQL Cloud Data Cloud Data Cloud Data\nSpot On-Demand Warehouse 1 Warehouse 2 Warehouse 3\n\nSystem\n\n100TB TPC-DS price/performance benchmark (lower is better).\n\n\nDatabricks SQL has set a [new world record in](http://tpc.org/5013)\n[100TB TPC-DS](http://tpc.org/5013) , the gold standard performance\nbenchmark for data warehousing. Databricks\nSQL outperformed the previous record by 2.2x.\nAnd this result has been formally audited and\nreviewed by the TPC council.\n\n\n-----\n\n##### Highly concurrent analytics workloads\n\nBeyond large queries, it is also common for highly concurrent analytics workloads\nto execute over small data sets. To optimize concurrency, we used the same\nTPC-DS benchmark, but on a much smaller scale (10GB) and with 32 concurrent\nstreams. We analyzed the results to identify and remove bottlenecks, and\nbuilt hundreds of optimizations to improve concurrency. 
Databricks SQL now\noutperforms some of the best cloud data warehouses for both large queries and\nsmall queries with lots of users.\n\nReal-world workloads, however, are not just about either large or small queries.\nDatabricks SQL also provides intelligent workload management with a dual\nqueuing system and highly parallel reads.\n\n\n16,523\n\n12,248\n\n###### ~3X\n\n4,672\n\n\n11,690\n\n\nJuly 2020\n\n\nJan 2021 Oct 2022\n\n\nCLOUD DW X SQL WAREHOUSE X - L SIZE\n\n10GB TPC-DS queries/hr at 32 concurrent streams (higher is better).\n\n\n-----\n\n##### Intelligent workload management with smart queuing system\n\nReal-world workloads typically include a mix of small and large queries. Therefore\nthe smart queuing and load balancing capabilities of Databricks SQL need to\naccount for that too. Databrick SQL uses a smart dual queuing system (in preview)\nthat prioritizes small queries over large, as analysts typically care more about the\nlatency of short queries than large ones.\n\n\n##### Highly parallel reads with improved I/O performance\n\nIt is common for some tables in a lakehouse to be composed of many files — for\nexample, in streaming scenarios such as IoT ingest when data arrives continuously.\nIn legacy systems, the execution engine can spend far more time listing these\nfiles than actually executing the query. Our customers told us they do not want to\nsacrifice performance for data freshness. With async and highly parallel I/O, when\nexecuting a query, Databricks SQL now automatically reads the next blocks of data\nfrom cloud storage while the current block is being processed. This considerably\nincreases overall query performance on small files (by 12x for 1MB files) and “cold\ndata” (data that is not cached) use cases as well.\n\n**[LEARN MORE](https://databricks.com/blog/2021/09/08/new-performance-improvements-in-databricks-sql.html)**\n\n\n-----\n\n###### PART 3: CONSUMPTION LAYER\n\n\nThe third layer of the Databricks Lakehouse Platform would similarly have to bridge\nthe best of both data lakes and data warehouses. In the lakehouse, you would\nhave to be able to work seamlessly with your tools of choice — whether you are a\nbusiness analyst, data scientist, or ML or data engineer.\n\n\nThe lakehouse must treat Python, Scala, R and SQL programming languages\nand ecosystems as first-class citizens to truly unify data engineering, ML and BI\nworkloads in one place.\n\n\n**Consumption layer attributes — data lake vs. data warehouse vs. data lakehouse**\n\n|Data Lake Notebooks (great for data scientists)|Data Warehouse Lack of support for data science/ML|Data Lakehouse Notebooks (great for data scientists)|\n|---|---|---|\n|Openness with rich ecosystem (Python, R, Scala)|Limited to SQL only|Openness with rich ecosystem (Python, R, Scala)|\n|BI/SQL not 1st-class citizen|BI/SQL 1st-class citizen|BI/SQL 1st-class citizen|\n||||\n\n\n-----\n\n##### A platform for your tools of choice\n\n\nAt Databricks we believe strongly in open platforms and meeting our customers where they are. We work very\nclosely with a large number of software vendors to make sure you can easily use your tools of choice\non Databricks, like [Tableau](https://databricks.com/blog/2021/05/07/improved-tableau-databricks-connector-with-azure-ad-authentication-support.html) , [Power BI](https://databricks.com/blog/2021/02/26/announcing-general-availability-ga-of-the-power-bi-connector-for-databricks.html) or [dbt](https://databricks.com/blog/2021/12/06/deploying-dbt-on-databricks-just-got-even-simpler.html) . 
With [Partner Connect](https://www.databricks.com/partnerconnect) , it’s easier than ever to connect with\nyour favorite tools, easier to get data in, easier to authenticate using single sign-on, and of course, with all the\nconcurrency and performance improvements, we make sure that the direct and live query experience is great.\n\n\n**Now more than ever, organizations**\n\n**need a data strategy that enables**\n\n**speed and agility to be adaptable.**\n\n**As organizations are rapidly moving**\n\n**their data to the cloud, we’re**\n\n**seeing growing interest in doing**\n\n**analytics on the data lake. The**\n\n**introduction of Databricks SQL**\n\n**delivers an entirely new experience**\n\n**for customers to tap into insights**\n\n**from massive volumes of data with**\n\n**the performance, reliability and**\n\n**scale they need. We’re proud to**\n\n**partner with Databricks to bring**\n\n**that opportunity to life.**\n\n**Francois Ajenstat**\nChief Product Officer, Tableau\n\n\n+ Any other Apache Spark-compatible client\n\n\n-----\n\n##### Faster BI results retrieval with Cloud Fetch\n\nOnce query results are computed, cloud data warehouses often collect and\nstream back results to BI clients on a single thread. This can create a bottleneck\nand greatly slows down the experience if you are fetching anything more than a\nfew megabytes of results in size. To provide analysts with the best experience\nfrom their favorite BI tools, we also needed to speed up how the system delivers\nresults to BI tools like Power BI or Tableau once computed.\n\nThat’s why we’ve reimagined this approach with a new architecture called\n[Cloud Fetch](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html) . For large results, Databricks SQL now writes results in parallel across\nall of the compute nodes to cloud storage, and then sends the list of files using\npre-signed URLs back to the client. The client then can download in parallel\nall the data from cloud storage. This approach provides up to 10x performance\nimprovement in real-world scenarios.\n\n\nparallel\ndata\ntransfers\n\n\nCloud Storage\n\n**Cluster**\n\n\nSQL Endpoint\n\n\nCUSTOMER BENCHMARK\nTABLEAU EXTRACT\n\n\nCloud Fetch enables faster, higher-bandwidth connectivity to and from your BI tools.\n**[LEARN MORE](https://databricks.com/blog/2021/08/11/how-we-achieved-high-bandwidth-connectivity-with-bi-tools.html)**\n\n\n-----\n\n##### A first-class SQL development experience\n\nIn addition to supporting your favorite tools, we\nare also focused on providing a native first-class\nSQL development experience. We’ve talked to\nhundreds of analysts using various SQL editors\nlike SQL Workbench every day, and worked with\nthem to provide the dream set of capabilities\nfor SQL development.\n\nFor example, Databricks SQL now supports\n[standard ANSI SQL](https://databricks.com/blog/2021/11/16/evolution-of-the-sql-language-at-databricks-ansi-standard-by-default-and-easier-migrations-from-data-warehouses.html) , so you don’t need to learn a\nspecial SQL dialect. 
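As a small illustration, the sketch below runs a plain ANSI SQL query against a SQL warehouse from Python using the databricks-sql-connector package, the same high-bandwidth result path your BI tools use. The hostname, HTTP path, token and table name are placeholder assumptions.

```python
# Minimal sketch: query a Databricks SQL warehouse with databricks-sql-connector.
# Hostname, HTTP path, access token and the table are placeholders.
from databricks import sql

with sql.connect(
    server_hostname="<workspace-hostname>",
    http_path="/sql/1.0/warehouses/<warehouse-id>",
    access_token="<personal-access-token>",
) as connection:
    with connection.cursor() as cursor:
        # Standard ANSI SQL: no special dialect required.
        cursor.execute("""
            SELECT order_date, COUNT(*) AS orders
            FROM main.sales.orders
            GROUP BY order_date
            ORDER BY order_date DESC
            LIMIT 10
        """)
        for row in cursor.fetchall():
            print(row)
```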
Query tabs allow you to work\non multiple queries at once, autosave gives you\npeace of mind so you never have to worry about\nlosing your drafts, integrated history lets you\neasily look at what you have run in the past, and\nintelligent auto-complete understands subqueries\nand aliases for a delightful experience.\n\n\nThe built-in SQL query editor allows you to quickly explore available databases, query and visualize results.\n\n\n-----\n\nFinally, with Databricks SQL, analysts can easily\nmake sense of query results through a wide variety\nof rich visualizations and quickly build dashboards\nwith an intuitive drag-and-drop interface. To keep\neveryone current, dashboards can be shared and\nconfigured to automatically refresh, as well as to\nalert the team to meaningful changes in the data.\n\n\nEasily combine visualizations to build rich dashboards that can be shared with stakeholders.\n\n\n-----\n\n### Conclusion\n\nDatabricks SQL leverages open source standard [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) to turn raw data\ninto actionable data, combining the flexibility and openness of data lakes\nwith the reliability and performance of data warehouses. The Unity Catalog\nprovides fine-grained governance on the lakehouse across all clouds using\none friendly interface and standard SQL.\n\nDatabricks SQL also holds the [new world record in 100TB TPC-DS](https://dbricks.co/benchmark) , the gold\nstandard performance benchmark for data warehousing. It is powered by\nPhoton, the new vectorized query engine for the lakehouse, and by SQL\nwarehouses for instant, elastic compute decoupled from storage.\n\nFinally, Databricks SQL offers a native first-class SQL development\nexperience, with a built-in SQL editor, rich visualizations and dashboards,\nand integrates seamlessly with your favorite BI- and SQL-based tools for\nmaximum productivity.\n\n\nDatabricks SQL under the hood.\n\n\n-----\n\n### Atlassian\n\n\nAtlassian is a leading provider of collaboration, development and issue-tracking\n\nsoftware for teams. With over 150,000 global customers (including 85 of the Fortune\n\n100), Atlassian is advancing the power of collaboration with products including Jira,\n\nConfluence, Bitbucket, Trello and more.\n\nUSE CASE\n\nAtlassian uses the Databricks Lakehouse Platform to democratize data across the enterprise and drive\ndown operational costs. Atlassian currently has a number of use cases focused on putting the\ncustomer experience at the forefront.\n\n**Customer support and service experience**\nWith the majority of their customers being server-based (using products like Jira and Confluence),\nAtlassian set out to move those customers into the cloud to leverage deeper insights that enrich the\ncustomer support experience.\n\n**Marketing personalization**\nThe same insights could also be used to deliver personalized marketing emails to drive\nengagement with new features and products.\n\n**Anti-abuse and fraud detection**\nThey can predict license abuse and fraudulent behavior through anomaly detection and\npredictive analytics.\n\n\n-----\n\nSOLUTION AND BENEFITS\n\nAtlassian is using the Databricks Lakehouse Platform to enable data democratization at scale, both internally\nand externally. They have moved from a data warehousing paradigm to standardization on Databricks,\nenabling the company to become more data driven across the organization. 
Over 3,000 internal users in\nareas ranging from HR and marketing to finance and R&D — more than half the organization — are accessing\ninsights from the platform on a monthly basis via open technologies like Databricks SQL. Atlassian is also\nusing the platform to drive more personalized support and service experiences to their customers.\n\n**•** Delta Lake underpins a single lakehouse for PBs of data accessed by 3,000+ users across HR, marketing,\nfinance, sales, support and R&D\n\n**•** BI workloads powered by Databricks SQL enable dashboard reporting for more users\n\n**•** MLflow streamlines MLOps for faster delivery\n\n**•** Data platform unification eases governance, and self-managed clusters enable autonomy\n\nWith cloud-scale architecture, improved productivity through cross-team collaboration, and the ability to\naccess all of their customer data for analytics and ML, the impact on Atlassian is projected to be immense.\nAlready the company has:\n\n**•** Reduced the cost of IT operations (specifically compute costs) by 60% through moving 50,000+ Spark\njobs from EMR to Databricks with minimal effort and low-code change\n\n**•** Decreased delivery time by 30% with shorter dev cycles\n\n**•** Reduced data team dependencies by 70% with more self-service enabled throughout the organization\n\n**[LEARN MORE](https://www.youtube.com/watch?v=Xo1U617T-mU)**\n\n\n**At Atlassian, we need to ensure**\n**teams can collaborate well**\n**across functions to achieve**\n**constantly evolving goals. A**\n**simplified lakehouse architecture**\n**would empower us to ingest high**\n**volumes of user data and run the**\n**analytics necessary to better**\n**predict customer needs and**\n**improve the experience of our**\n**customers. A single, easy-to-use**\n**cloud analytics platform allows**\n**us to rapidly improve and build**\n**new collaboration tools based on**\n**actionable insights.**\n\n**Rohan Dhupelia**\nData Platform Senior Manager, Atlassian\n\n\n-----\n\n### ABN AMRO\n\n\nAs an established bank, ABN AMRO wanted to modernize their business but were hamstrung\n\nby legacy infrastructure and data warehouses that complicated access to data across various\n\nsources and created inefficient data processes and workflows. Today, Azure Databricks\n\nempowers ABN AMRO to democratize data and AI for a team of 500+ empowered engineers,\n\nscientists and analysts who work collaboratively on improving business operations and\n\nintroducing new go-to-market capabilities across the company.\n\nUSE CASE\n\nABN AMRO uses the Databricks Lakehouse Platform to deliver financial services transformation on a global scale,\nproviding automation and insight across operations.\n\n**Personalized finance**\nABN AMRO leverages real-time data and customer insights to provide products and services tailored to\ncustomers’ needs. For example, they use machine learning to power targeted messaging within their automated\nmarketing campaigns to help drive engagement and conversion.\n\n**Risk management**\nUsing data-driven decision-making, they are focused on mitigating risk for both the company and their\ncustomers. For example, they generate reports and dashboards that internal decision makers and leaders use to\nbetter understand risk and keep it from impacting ABN AMRO’s business.\n\n**Fraud detection**\nWith the goal of preventing malicious activity, they’re using predictive analytics to identify fraud before it\nimpacts their customers. 
Among the activities they’re trying to address are money laundering and fake credit\ncard applications.\n\n\n-----\n\nSOLUTION AND BENEFITS\n\nToday, Azure Databricks empowers ABN AMRO to democratize data and AI for a team of 500+ engineers,\nscientists and analysts who work collaboratively on improving business operations and introducing new\ngo-to-market capabilities across the company.\n\n**•** Delta Lake enables fast and reliable data pipelines to feed accurate and complete data for\ndownstream analytics\n\n**•** Integration with Power BI enables easy SQL analytics and feeds insights to 500+ business users\nthrough reports and dashboards\n\n**•** MLflow speeds deployment of new models that improve the customer experience — with new use\ncases delivered in under two months\n\n\n**Databricks has changed the way**\n**we do business. It has put us in**\n**a better position to succeed in**\n**our data and AI transformation**\n**as a company by enabling data**\n**professionals with advanced data**\n**capabilities in a controlled and**\n**scalable way.**\n\n**Stefan Groot**\nHead of Analytics Engineering,\nABN AMRO\n\n\n#### 10x faster\n\ntime to market — use cases\ndeployed in two months\n\n\n#### 100+ \n\nuse cases to be delivered\nover the coming year\n\n\n#### 500+\n\nempowered business\nand IT users\n\n\n**[LEARN MORE](https://databricks.com/customers/abn-amro)**\n\n\n-----\n\n### SEGA Europe\n\n**Improving the player experience**\n\n# “ is at the heart of everything\n\n**we do, and we very much**\n**see Databricks as a key**\n**partner, supporting us to drive**\n**forward the next generation of**\n**community gaming.**\n\n**Felix Baker**\nData Services Manager, SEGA Europe\n\n\nSEGA® Europe, the worldwide leader in interactive entertainment, is using the Databricks\n\nLakehouse Platform to personalize the player experience and build its own machine\n\nlearning algorithm to help target and tailor games for over 30 million of its customers.\n\nAs housebound gamers looked to pass the time during the first lockdowns of 2020, some SEGA Europe\ntitles, including Football Manager,™ saw over double the number of sales during the first lockdown\ncompared to the year before. Furthermore, a number of SEGA titles experienced a more than 50% increase\nin players over the course of the COVID-19 pandemic. With more anonymized data being collected through\nan analytics pipeline than ever before, the team needed a dedicated computing resource to handle the\nsheer volume of data, extract meaningful insights from it and enable the data science team to improve\ngeneral workflow.\n\n**[LEARN MORE](https://www.youtube.com/watch?v=SzeXHcwPDSE)**\n\n\n-----\n\n### About Databricks\n\nDatabricks is the lakehouse company. More than 7,000 organizations\n\nworldwide — including Comcast, Condé Nast and over 50% of the\n\nFortune 500 — rely on the Databricks Lakehouse Platform to unify their\n\ndata, analytics and AI. Databricks is headquartered in San Francisco,\n\nwith offices around the globe. Founded by the original creators of\n\nApache Spark, TM Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. 
To learn more, follow\n\nDatabricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\nContact us for a personalized demo\n**databricks.com/contact**\n\n**[DISCOVER LAKEHOUSE](https://databricks.com/discoverlakehouse)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Why-the-Data-Lakehouse-Is-Your-Next-Data-Warehouse-Ebook-2nd%20Edition.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "# Big Book of Data and AI Use Cases for the Public Sector\n\n### Best practices, customer stories and solution templates for government agencies interested in building on the Lakehouse\n\n\n-----\n\n## Contents\n\nThe State of Data and AI in the Government .......................................................................................... 3\n\nThe Need for a Modern Data Architecture ............................................................................................. 5\n\nIntroducing the Lakehouse for Public Sector ......................................................................................... 6\n\n**U S E C A S E :** Cybersecurity ........................................................................................................................... 9\n\n**U S E C A S E :** Predictive Maintenance .......................................................................................................... 12\n\n**U S E C A S E :** Fraud Detection ....................................................................................................................... 15\n\n**U S E C A S E :** Money Laundering ................................................................................................................. 17\n\n**U S E C A S E :** Entity Analytics ...................................................................................................................... 19\n\n**U S E C A S E :** Geospatial Analytics .............................................................................................................. 21\n\n**U S E C A S E :** Public Health Management .................................................................................................. 24\n\nConclusion ................................................................................................................................................. 26\n\n\n-----\n\n## The State of Data and AI in the Government\n\n###### Over the last decade, data and AI have redefined every industry on the planet. Retailers have improved the shopping experience with personalized recommendations, financial institutions have strengthened risk management through the use of advanced analytics, and the healthcare industry is tapping into the power of machine learning to predict and prevent chronic disease. The public sector is no exception.\n\n\nIn 2018, the U.S. Federal Government embarked on one of its most ambitious\nefforts since putting a man on the moon — embedding data into all aspects of\ndecision-making. By enacting the Evidence-Based Policymaking Act of 2018,\nCongress set in motion requirements for agencies to modernize their data and\nanalytics capabilities, including the appointment of agency-level chief data\nofficers. 
A year later came the Federal Data Strategy, which provided further\nguidance for how agencies should manage and use data by 2030.\n\n\nWith all of this guidance, agencies are starting to make meaningful improvements\nto their data strategy, but when it comes to innovating with data, agencies still\nlag behind the private sector. This begs the question: what’s standing in the way?\nThe hurdles aren’t due to a lack of effort on the part of agency leaders. In fact,\nthey can largely be attributed to a patchwork of legacy technologies that have\nbeen amassed over the last 30 to 40 years. While these hurdles stand in the\nway, a number of innovative agencies are making significant progress as they\nembrace new data and AI capabilities.\n\n\n-----\n\nFederal spending on artificial intelligence rose to [nearly $1 billion](https://www.federaltimes.com/thought-leadership/2021/09/28/why-the-government-market-for-artificial-intelligence-technology-is-expanding/) in 2020, up\n50% from 2018. There’s a good reason for this level of spend: Deloitte recently\npublished a report, “AI-augmented Government,” that estimates the federal\ngovernment could free up as many as 1.2 billion hours of work and save up to\n$41.1 billion annually through the use of AI-driven automation. Early adopters\nof advanced analytics are starting to see the fruits of their labor. For example,\n[USCIS modernized their analytics stack](https://databricks.com/customers/uscis) on Databricks to accelerate insights\non applicants by 24x, automate the processing of millions of applications,\nand reduce appointment no-show rates with predictive analytics. The [Orange](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/)\n[County Courts](https://www.govloop.com/how-a-california-county-court-elevated-data-driven-decision-making-for-the-state/) also recently shared how they are automating legacy paperbased workflows with machine learning.\n\nIn this eBook, we explore the hurdles of legacy technologies and how a modern\ndata lakehouse can help agencies unlock innovative data and analytics use cases\nat all levels of government. Over the following seven example use cases, covering\neverything from cyber threat detection to improving public health,\n\n\n**An increased focus on cloud, analytics and AI = operational efficiency**\n\n1. AI/ML\n2. Data Analytics\n3. Cloud\n\n**$1B** **TOP PRIORITIES** **$41B+**\n\nData and AI Research and Government CIOs’ top Estimated government\nDevelopment Initiative game-changing technologies savings from data-driven\nautomation\n\n**U.S. Government**\n\nwe demonstrate how the Databricks Lakehouse for Public Sector is critical to\nimproving citizen services and delivering on mission objectives. This guide also\nincludes resources in the form of Solution Accelerators, reference architectures\nand real-world customer stories to help as you embark on your own journey to\ndrive a safer and more prosperous nation through the use of data and AI.\n\n\n-----\n\n## The Need for a Modern Data Architecture\n\n###### Government agencies are now turning to the cloud and modern data technologies to federate and make sense of their massive volumes of data. Building on that foundation, agencies are starting to adopt advanced analytics and AI to automate costly, outdated and resource-intensive operations as well as improve decisionmaking with predictive insights that can better keep pace with the dynamic needs of citizens and global communities. 
That being said, there are a number of barriers standing in their way.\n\n##### Common challenges\n\n\nMany government agencies are burdened with a legacy IT infrastructure that is\nbuilt with on-premises data warehouses that are complex to maintain, are costly\nto scale as compute is coupled with storage, and lack support for unstructured\ndata and advanced analytics. This severely inhibits data-driven innovation.\nMaintaining these systems requires a massive investment of both time and\nmoney compared to modern cloud-based systems and creates a number of\navoidable challenges:\n\n\ngovernment is often done in weekly or daily batches, but decision-making\nneeds to happen in real time. Critical events like cyber attacks and health\npandemics can’t wait a week.\n\n**Lack of citizen insights**\n\nWhen data is siloed, teams get an incomplete view of the citizen,\nresulting in missed opportunities to improve the delivery of services that\nimpact the quality of life for their constituents.\n\n\n**Lack of reliability**\n\n\nSiloed systems result in data replication as teams spin up new data marts\nto support their one-off use cases. Without a single source of truth, teams\nstruggle with data inconsistencies, which can result in inaccurate analysis\nand model performance that is only compounded over time.\n\n**Lack of agility**\n\nDisjointed analytics tools and legacy infrastructure hinder the ability of\nteams to conduct real-time analytics. Most data processing in the\n\n\n**Lack of productivity**\n\nData scientists and data analysts alike must have the right tool set to\ncollaboratively investigate, extract and report meaningful insights from\ntheir data. Unfortunately, data silos lead to organizational silos, which make\ncollaboration inside an agency as well as between agencies very difficult.\nWith different groups of data teams leveraging their own coding and\nanalytical tools, communicating insights and working across teams —\nlet alone across agencies — is almost impossible. This lack of collaboration\ncan drastically limit the capabilities of any data analytics or AI initiative.\n\n\n-----\n\n## Introducing the Lakehouse for Public Sector\n\n\nThe reason that the Databricks Lakehouse is\nable to deliver the simplicity, flexibility and\nspeed that a government agency requires is\nthat it fundamentally reimagines the modern\ndata architecture. Databricks provides federal,\nstate and local agencies with a cloud-native\nLakehouse Platform that combines the best\nof data warehouses and data lakes — to store\nand manage all your data for all your analytics\nworkloads. With this modern architecture,\nagencies can federate all their data and\ndemocratize access for downstream use\ncases, empowering their teams to deliver on\ntheir mission objectives by unlocking the full\npotential of their data.\n\n\n**Delivering real-time data insight in support of the mission**\n\n- Fraud, Waste & Abuse\n\n- Cybersecurity\n\n- Medicaid Dashboards &\nReporting\n\n- Process Improvement\n\n- Predictive Maintenance\n\n- SCM & Demand Forecasting\n\n- Smart Military/Censor Data\n\n- Military Heatlh\n\n- COVID Response/Decision\nSupport\n\n- Smart Cities/Connected\nVehicles\n\n- Citizen Engagement\n\n- Data-Driven Decision-Making\n\n\n-----\n\n**Federate all of your agency’s data**\n\nAny type of data can be stored because, like a data lake, the Databricks\nLakehouse is built using the low-cost object storage supported by cloud\nproviders. 
Leveraging this capability helps break down the data silos that\nhinder efforts to aggregate data for advanced analytics (e.g., predictive\nmaintenance) or compute-intensive workloads like detecting cyber\nthreats across billions of signals. Probably even more important is the\nability of the lakehouse architecture to travel back in time, ensuring full\naudit compliance and high governance standards for analytics and AI.\n\n**Power real-time decision-making**\n\nStreaming use cases such as IoT analytics or disease spread tracking is\nsimpler to support because the lakehouse uses Apache Spark TM as the\ndata processing engine and Delta Lake as a storage layer. With Spark,\nyou can toggle between batch and streaming workloads with just a line\nof code. With Delta Lake, native support for ACID transactions means\nthat you can deploy streaming workloads without the overhead of\ncommon reliability and performance issues. These capabilities make\nreal-time analytics possible.\n\n\n**Unlock collaborative analytics for all personas**\n\nThe Databricks Lakehouse for Public Sector is your one-stop shop for\nall your analytics and AI. The platform includes a business intelligence\ncapability — Databricks SQL — that empowers data analysts to query and run\nreports against all of an agency’s unified data. Databricks SQL integrates with\nBI tools like Tableau and Microsoft Power BI and complements any existing BI\ntools with a SQL-native interface, allowing data analysts and data scientists\nto query data directly within Databricks and build powerful dashboards.\n\n\n-----\n\n**Deliver on your mission with predictive insights**\nIn the same environment, data scientists can build, share and collaborate\non machine learning models for advanced use cases like fraud detection\nor geospatial analytics. Additionally, MLflow, an open source toolkit for\nmanaging the ML lifecycle, is built into the Lakehouse so data scientists\ncan manage everything in one place. Databricks natively supports Python,\nR, SQL and Scala so practitioners can work together with the languages and\nlibraries of their choice, reducing the need for separate tools. With these\ncapabilities, data teams can turn insights from real-world data into powerful\nvisualizations designed for machine learning. Visualizations can then be\nturned into interactive dashboards to share insights with peers across\nagencies, policymakers, regulators and decision-makers.\n\n\n##### Customers That Innovate With Databricks Lakehouse for Public Sector\n\nSome of the top government agencies in the world turn to the\nDatabricks Lakehouse for Public Sector to bring analytics and AI-driven\nautomation and innovation to the communities they serve.\n\n\n-----\n\n###### USE CASE:\n## Cybersecurity\n\n##### Overview\n\n\n**Limited window of data**\nGiven the high cost of storage, most agencies retain only a few weeks of threat\ndata. This can be a real problem in scenarios where a perpetrator gains access\nto a network but waits months before doing anything malicious. Without a long\nhistorical record, security teams can’t analyze cyberattacks over long-term\nhorizons or conduct deep forensic reviews.\n\n##### Solution overview\n\nFor government agencies that are ready to modernize their security data\ninfrastructure and analyze data at petabyte-scale more cost-effectively,\nDatabricks provides an open lakehouse platform that augments existing SIEMs\nto help democratize access to data for downstream analytics and AI. 
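To make the earlier point about toggling between batch and streaming concrete, here is a minimal sketch over a hypothetical Delta table of DNS logs: the same table backs a multi-year historical query and a continuous stream, and only the read call changes. The table and column names are illustrative assumptions.

```python
# Minimal sketch: one Delta table of security logs read as batch history and
# as a live stream. Table and column names are hypothetical.
from pyspark.sql import functions as F

# Batch: scan years of retained threat data for a forensic review.
historic = (
    spark.read.table("security.dns_logs")
    .where(F.col("event_date") >= "2019-01-01")
    .groupBy("domain")
    .agg(F.count("*").alias("lookups"))
)

# Streaming: switching read to readStream processes new log records as they
# land, enabling real-time detection over the very same table.
live = (
    spark.readStream.table("security.dns_logs")
    .where(F.col("suspicious_score") > 0.9)
)
```

The streaming DataFrame can be written straight back to another Delta table, feeding the real-time threat analysis described next.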
Built\non Apache Spark and Delta Lake, Databricks is optimized to process large\nvolumes of streaming and historic data for real-time threat analysis and incident\nresponse. Security teams can query threat data going years into the past in just\nminutes and build ML models to detect new threat patterns and reduce false\npositives. Additionally, Databricks created a Splunk-certified add-on to augment\nSplunk for Enterprise Security (ES) for cost-efficient log and retention expansion.\n\n\nCyberattacks from bad actors and nation states are a huge and growing threat\nto government agencies. Recent large-scale attacks like the ones on SolarWinds,\nlog4j, Colonial Pipeline and HAFNIUM highlight the sophistication and increasing\nfrequency of broad-reaching cyberattacks. Data breaches cost the federal\ngovernment more than $4 million per incident in 2021 and threaten national\nsecurity. Staying ahead of the next threat requires continuous monitoring of\nsecurity data from an agency’s entire attack surface before, during and after\nan incident.\n\n##### Challenges\n\n**Scaling existing SIEM solutions**\nAgencies looking to expand existing SIEM tools for today’s petabytes of data can\nexpect increased licensing, storage, compute and integration resources resulting\nin tens of millions of dollars in additional costs per year.\n\n**Rules-based systems**\nMany legacy SIEM tools lack the critical analytics capabilities — such as\nadvanced analytics, graph processing and machine learning — needed to detect\nunknown threat patterns or deliver on a broader set of security use cases like\nbehavioral analytics.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator: Detect Criminal](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n[Threats Using DNS Analytics](https://databricks.com/blog/2020/10/05/detecting-criminals-and-nation-states-through-dns-analytics.html)\n\nDetecting criminals and nation states through DNS analytics. In order to address\ncommon cybersecurity challenges such as deployment complexity, tech\nlimitation and cost, security teams need a real-time data analytics platform that\ncan handle cloud scale, analyze data wherever it is, natively support streaming\nand batch analytics, and have collaborative content development capabilities.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=5BRGqxq4iQw)**\n\n**Fighting Cyber Threats in Real Time**\nSince partnering with Databricks, HSBC has reduced costs, accelerated threat\ndetection and response, and improved their security posture. Not only can\nthey process all of their required data, but they’ve also increased online query\nretention from just days to months at petabyte scale. HSBC is now able to\nexecute 2-3x more threat hunts per analyst.\n\n\n[Solution Accelerator:](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n[Databricks Add-On for Splunk](https://databricks.com/blog/2021/07/23/augment-your-siem-for-cybersecurity-at-cloud-scale.html)\n\nDesigned for cloud-scale security operations, the add-on provides Splunk\nanalysts with access to all data stored in the Lakehouse. 
Bidirectional pipelines\nbetween Splunk and Databricks allow agency analysts to integrate directly into\nSplunk visualizations and security workflows.\n\n\n-----\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Predictive Maintenance\n\n##### Overview\n\n\n**Integrating unstructured data**\nEquipment data doesn’t just come in the form of IoT data. Agencies can gather\nrich unstructured signals like audio, visual (e.g., video inspections) and text\n(e.g., maintenance logs). Most legacy data architectures are unable to integrate\nstructured and unstructured data sources.\n\n**Operationalizing machine learning**\nMost agencies lack the advanced analytics tools needed to build models that\ncan predict potential equipment failures. Those that do typically have their\ndata scientists working in a siloed set of tools, resulting in unnecessary data\nreplication and inefficient workflows.\n\n##### Solution overview\n\nThe Databricks Lakehouse is tailor-made for building IoT applications at scale.\nWith Databricks, agencies can easily manage large streaming volumes of small\nfiles, with ACID transaction guarantees and reduced job fails compared to\ntraditional data warehouse architectures. Additionally, the Lakehouse is cloud\nnative and built on Apache Spark, so scaling for petabytes of data is not an issue.\nWith the Lakehouse, agencies can bring together all of their structured and\nunstructured data with a unified set of tooling for data engineering, model building\nand production rollout. With these capabilities, operations teams can quickly\ndetect and act on pending equipment failures before they affect performance.\n\n\nPredictive maintenance is oftentimes associated with the manufacturing sector,\nbut in reality it extends far beyond the factory floor. Consider this for a moment:\nthe U.S. Government operates a fleet of over [640,000 vehicles](https://www.government-fleet.com/301786/federal-vs-state-local-fleets) including public\nbuses, postal delivery trucks, drones, helicopters and jet fighters. Many of these\nvehicles — like multimillion-dollar aircraft — contain sensors that generate\nmassive amounts of data on the use and conditions of various components. And\nit’s not just vehicles. Modern public utilities stream data through connected IoT\ndevices. All of this data can be analyzed to identify the root cause of a failure\nand predict future maintenance, helping to avoid costly repairs and critical\nassets from being out of service.\n\n##### Challenges\n\n**Managing IoT data at scale**\nWith billions of sensors generating information, most data systems are unable to\nhandle the sheer volume of data. 
Before agencies can even start analyzing their\ndata, legacy data warehouse–based tools require preprocessing of data, making\nreal-time analysis impossible.\n\n\n-----\n\n##### How to get started\n\n\n**Solution Accelerator: Predictive Maintenance**\nLearn how to ingest real-time IoT data from field devices, perform complex\ntime series processing in Delta Lake and leverage machine learning to build\npredictive maintenance models.\n\n[Part 1: Use case overview](https://databricks.com/blog/2020/08/03/modern-industrial-iot-analytics-on-azure-part-1.html)\n\n[Part 2: Ingest real-time IoT data and perform time series processing](https://databricks.com/blog/2020/08/11/modern-industrial-iot-analytics-on-azure-part-2.html)\n\n[Part 3: Using ML to predict maintenance.](https://databricks.com/blog/2020/08/20/modern-industrial-iot-analytics-on-azure-part-3.html)\n\n\n[Watch the Demo:](https://vimeo.com/580864758/5a5bc42bb9)\n[Predictive Maintenance on Azure Databricks](https://vimeo.com/580864758/5a5bc42bb9)\n\n##### Customer story\n\n**[LEARN MORE](https://www.tallan.com/blog/client-stories/dc-water/)**\n\n**Protecting the Water Supply for 700,000 Residents**\nUtilizing machine learning for predictive analytics to help stop water main\nbreaks before they occur, potentially saving hundreds of thousands of dollars\nin repairs while reducing service interruption.\n\n\n-----\n\n##### Reference architecture\n\nWeather Sensor\nReadings\n(semi-structured)\n\nReal-time\nstreaming\n\nWind Turbine\nTelematics\n(semi-structured)\n\nMaintenance Logs\n(unstructured)\n\n\n#### Databricks Lakehouse Platform\n\nBronze Layer Silver Layer Gold Layer\n\n\nAppend Raw\nMerge Data\nData\n\n\nJoin Streams and\nAnalyze Data\n\nEnriched\nReadings\n\n\nOutput\n\n\nBuild Predictive\nMaintenance Model\n\n\nGranular\nReadings\n\n\nAggregated\nHourly\nReadings\n\n\nReal-time Dashboards for Real-Time Dashboards for\nOptimizing Performance Optimizing Performance\n\n|Col1|Col2|Col3|\n|---|---|---|\n\n\n-----\n\n###### USE CASE:\n## Fraud Detection\n\n\n##### Overview\n\nAccording to [McKinsey & Company](https://www.mckinsey.com/~/media/McKinsey/Industries/Public%20Sector/Our%20Insights/Cracking%20down%20on%20government%20fraud%20with%20data%20analytics/Cracking-down-on-government-fraud-with-data-analytics-vF.pdf) , more than half of the federal government’s\nmonetary losses to fraud, waste and abuse go undetected and total tens of\nbillions of dollars. Financial fraud comes in many forms, from individuals taking\nadvantage of relief programs to complex networks of criminal organizations\nworking together to falsify medical claims and rebate forms. Investigative teams\nhoping to stay ahead of fraudsters need advanced analytics techniques so they\ncan detect anomalous behavior buried in a sea of data.\n\n##### Challenges\n\n**Lack of machine learning**\nA rules-based approach is not enough. 
Bad actors are getting more and more\nsophisticated in how they take advantage of government programs, necessitating\nan AI-driven approach.\n\n**Unreliable data**\nGetting high-quality, clean data and maintaining a rich feature store is critical\nfor identifying ever-evolving fraud patterns while maintaining a strict record of\nprevious data points.\n\n\n##### Solution overview\n\nThe Databricks Lakehouse enables teams to develop complex ML models with\nhigh governance standards and bridge the gap between data science and\ntechnology to address the challenge of analyzing large volumes of data at scale\n— 40 billion financial transactions a year are made in the United States alone.\nAdditionally, Databricks makes it possible to combine modern AI techniques\nwith the legacy rules-based methods that underpin current approaches to fraud\ndetection all within a common and efficient Spark-based orchestration engine.\n\n##### How to get started\n\n[Solution Accelerator: Fraud Detection](https://databricks.com/blog/2021/01/19/combining-rules-based-and-ai-models-to-combat-financial-fraud.html)\n\nDue to an ever-changing landscape, building a financial fraud detection\nframework often goes beyond just creating a highly accurate machine learning\nmodel. Oftentimes it involves a complex-decision science setup that combines\na rules engine with a need for a robust and scalable machine learning platform.\nIn this example, we show how to build a holistic fraud detection solution on\nDatabricks using data from a financial institution.\n\n\n**Analytics at scale**\nTraining complex ML models with hundreds of features on gigabytes of\nstructured, semi-structured and unstructured data can be impossible without a\nhighly scalable and distributed infrastructure.\n\n\n-----\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=Ca1MMNpBSHM)**\n\n**Identifying Financial Fraud at Scale**\nProcesses hundreds of billions of market events\nper day on the Databricks Lakehouse and uses\nthe power of machine learning to identify illicit\nactivity in near real-time.\n\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Money Laundering\n\n##### Overview\n\n\nApproximately [$300 billion](https://home.treasury.gov/system/files/136/2018NMLRA_12-18.pdf) is laundered through the United States each year,\nand with criminal organizations — both at home and abroad — implementing\nincreasingly sophisticated methods for laundering funds, it’s getting harder to\nstop. While the federal government continues to apply pressure on the financial\nsector through heightened regulation, more is needed to combat laundering.\nModern AI techniques such as graph analytics and computer vision can be\nused to process different types of structured (e.g., financial transactions) and\nunstructured (e.g., real estate images) data and identify illicit behavior. This\nallows investigative teams to automate labor-intensive activities like confirming\na residential address or reviewing transaction histories, and instead dig into\npriority threats.\n\n##### Challenges\n\n**Complex data science**\nModern anti-money laundering (AML) practices require multiple ML capabilities\nsuch as entity resolution, computer vision and graph analytics on entity\nmetadata, which is typically not supported by any one data platform.\n\n\n**Time-consuming false positives**\nAny reported suspicious activity must be investigated manually to ensure\naccuracy. 
Many legacy solutions generate a high number of false positives or fail\nto identify unknown patterns, resulting in wasted effort by investigators.\n\n##### Solution overview\n\nAML solutions face the operational burden of processing billions of transactions\na day. The Databricks Lakehouse Platform combines the low storage cost\nbenefits of cloud data lakes with the robust transaction capabilities of data\nwarehouses, making it the ideal foundation for building AML analytics at massive\nscale. At the core of Databricks is Delta Lake, which can store and combine\nboth unstructured and structured data to build entity relationships; moreover,\nDatabricks Delta Engine provides efficient access using the new Photon compute\nto speed up BI queries on tables spanning billions of transactions. On top of\nthese capabilities, ML is a first-class citizen in the Lakehouse, which means\nanalysts and data scientists do not waste time subsampling or moving data to\nshare dashboards and stay one step ahead of bad actors.\n\n\n**Model transparency**\nAlthough AI can be used to address many money laundering use cases, the lack\nof transparency in the development of ML models offers little explainability,\ninhibiting broader adoption.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator: Modern](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n[Anti-Money Laundering Techniques](https://databricks.com/blog/2021/07/16/aml-solutions-at-scale-using-databricks-lakehouse-platform.html)\n\n\nLakehouse Platform leveraging a series of next-gen machine learning techniques\nincluding NLP, computer vision, entity resolution and graph analytics. This\napproach helps teams better adapt to the reality of modern laundering practices.\n\n\nCurrent anti-money laundering practices bear little resemblance to those of the\nlast decade. In today’s digital world, financial institutions are processing billions\nof transactions daily, increasing the surface area of money laundering. With this\naccelerator, we demonstrate how to build a scalable AML solution on the\n\n\n##### Reference architecture\n\n\n-----\n\n###### USE CASE:\n## Entity Analytics\n\n##### Overview\n\n\n**No machine learning capabilities**\nEntity resolution typically relies on basic rules-based logic to compare records\n(e.g., matching on name and address), but with messy, large volumes of data,\nadvanced analytics is needed to improve accuracy and accelerate efforts.\n\n##### Solution overview\n\nThe Databricks Lakehouse is an ideal platform for building entity analytics at\nscale. With support for a wide range of data formats and a rich and extensible\nset of data transformation and ML capabilities, Databricks enables agencies to\nbring together all of their data in a central location and move beyond simple\nrules-based methods for entity resolution. Data teams can easily explore\ndifferent machine learning techniques like natural language processing,\nclassification and graph analytics to automate entity matching. And one-click\nprovisioning and deprovisioning of cloud resources makes it easy for teams to\ncost-effectively allocate the necessary compute resources for any size job so\nthey can uncover findings faster.\n\n\nEntity analytics aims to connect disparate data sources to build a full view of\na person or an organization. This has many applications in the public sector,\nsuch as fraud detection, national security and population health. 
For example,\nMedicare fraud teams need to understand which prescriptions are filled, claims\nfiled and facilities visited across geographies to uncover suspicious behavior.\nBefore teams can even look for suspicious behavior, they must first determine\nwhich records are associated. In the United States, nearly 50,000 people share\nthe name John Smith (and there are thousands of others with similar names).\nImagine trying to identify the right John Smith for this type of analysis. That’s no\neasy task.\n\n##### Challenges\n\n**Disjointed data**\nManaging complex and brittle ETL pipelines in order to cleanse and join data\nacross siloed systems and data stores.\n\n\n**Compute intensive**\nIdentifying related entities across population-level data sets requires massive\ncompute power that far outstrips legacy on-prem data architectures.\n\n\n-----\n\n##### How to get started\n\n[Virtual Workshop: Entity Analytics](https://drive.google.com/file/d/1wGGT9Fn5EZF5Rgrabuttt1xdua5csrBa/view?usp=sharing)\n\nLearn from Databricks experts on how entity analytics is being deployed\nin the public sector and watch a demo that shows how to use ML to link\npayments and treatments across millions of records in a public CMS data set.\n\n[Solution Accelerator:](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n[Machine Learning-Based Item Matching](https://drive.google.com/file/d/1a5xdaRSNQjQvgztOZg0tCiCajjVpvVPA/view?usp=sharing)\n\nWhile focused on retail, this accelerator has applications for any organization\nworking on entity matching, especially as it relates to items that might be stored\nacross locations. In this notebook, we demonstrate how to use machine learning\nand the Databricks Lakehouse Platform to resolve differences between product\ndefinitions and descriptions, and determine which items are likely pairs and\nwhich are distinct across disparate data sets.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na21/entity-resolution-using-patient-records-at-cmmi)**\n\nIn this talk, NewWave shares the specifics on CMS’s entity resolution use case,\nthe ML necessary for this data and the unique uses of Databricks in providing\nthis capability.\n\n##### Sample workflow\n\n\n-----\n\n###### USE CASE:\n## Geospatial Analytics\n\n##### Overview\n\n\n**Broad range of analytics capabilities**\nEnterprises require a diverse set of data applications — including SQL-based\nanalytics, real-time monitoring, data science and machine learning — to support\ngeospatial workloads given the diverse nature of the data and use cases.\n\n##### Solution overview\n\nWith Delta Lake at the core, the Databricks Lakehouse is ideal for geospatial\nworkloads, as it provides a single source of truth for all types of structured,\nunstructured, streaming and batch data, enabling seamless spatio-temporal\nunification and cross-querying with tabular and raster-based data. Built on\nApache Spark, the Lakehouse easily scales for data sets consisting of billions\nof rows of data with distributed processing in the cloud. To expand on the core\ncapabilities of the Lakehouse, Databricks has introduced the Mosaic library,\nan extension to the Apache Spark framework, built for fast and easy processing\nof large geospatial data sets. 
Popular frameworks such as Apache Sedona or\nGeoMesa can still be used alongside Mosaic, and because Mosaic sits on top of\nLakehouse architecture, it unlocks AI/ML and advanced analytics capabilities\nto support all types of geospatial use cases.\n\n\nEvery day billions of handheld and IoT devices, along with thousands of\nairborne and satellite remote sensing platforms, generate hundreds of exabytes\nof location-aware data. This boom of geospatial big data combined with\nadvancements in machine learning is enabling government agencies to develop\nnew capabilities. The potential use cases for geospatial analytics and AI touch\nevery part of the government, including disaster recovery (e.g., flood/earthquake\nmapping), defense and intel (e.g., detecting threats using drone footage),\ninfrastructure (e.g., public transportation planning), civilian safety (e.g., crime\nprediction), public health (e.g., disease spread tracking), and much more. Every\nagency at the state and federal level needs to consider how they can tap into\ngeospatial data.\n\n##### Challenges\n\n**Massive volumes of geospatial data**\nWith the proliferation of low-cost sensor arrays, GPS technologies and high-resolution imaging, organizations are collecting tens of TBs of geospatial data\ndaily, outpacing their ability to store and process this data at scale.\n\n\n**Compute-intensive spatial workloads**\nGeospatial data is complex in structure, with various formats not well suited for\nlegacy data warehouses, as well as being compute intensive, with geospatial-specific transformations and queries requiring hours and hours of compute.\n\n\n-----\n\n##### How to get started\n\n[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n[Mosaic for Geospatial Analytics](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n\nBuild a Lakehouse to support all of your geospatial analytics and AI use cases\nwith the Mosaic library. Mosaic provides a number of capabilities including easy\nconversion between common spatial data encodings, constructors to easily\ngenerate new geometries from Spark native data types, many of the OGC SQL\nstandard ST_ functions implemented as Spark Expressions for transforming,\naggregating and joining spatial data sets, and optimizations for performing point-in-polygon joins using an approach we co-developed with Ordnance Survey —\nall provided with the flexibility of a Scala, SQL or Python API.\n\n[Virtual Workshop: Geospatial](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n[Analytics and AI at Scale](https://databricks.com/p/webinar/workshop-geospatial-analytics-and-ai-at-scale)\n\nLearn how to build powerful geospatial insights and visualizations with a\nLakehouse for all your geospatial data processing, analytics and AI.\n\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na20/automating-federal-aviation-administrations-faa-system-wide-information-management-swim-data-ingestion-and-analysis)**\n\n**Analyzing Flight Data to Improve Aviation**\nTo help airlines better serve their millions of passengers, USDOT built a\nmodern analytics architecture on Databricks that incorporates data such as\nweather, flight, aeronautical and surveillance information. 
With this new\nplatform, they reduced compute costs by 90% and can now power use cases\nsuch as predicting air cargo traffic patterns, flight delays and the financial\nimpact of flight cancellations.\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://www.youtube.com/watch?v=LP198QMdDbY&t=1070s)**\n\n**Customer Story: Flood Prediction With Machine Learning**\nIn an effort to improve the safety of civil projects, Stantec built a machine\nlearning model on Databricks leveraging large volumes of weather and geological\ndata — oftentimes consisting of trillions of data points — to predict the impact\nof flash floods on various regions and adjust civil planning accordingly.\n\n\n-----\n\n##### Reference architecture\n\n_Reference architecture components: Mosaic Kepler magics and geometry display functions for map display, a built-in indexing system, and the ESRI and JTS Java APIs for geometry operations._\n\n\n-----\n\n###### USE CASE:\n## Public Health Management\n\n##### Overview\n\n\nIn their lifetime, every human is expected to generate a million gigabytes of\nhealth data spanning electronic health records, medical images, claims, wearable\ndata, genomics and more. This data is critical to understanding the health of\nthe individual, but when aggregated and analyzed across large populations,\ngovernment agencies can glean important insights like disease trends, the\nimpact of various treatment guidelines and the effectiveness of resources. By\nadding in [Social Determinants of Health (SDOH)](https://databricks.com/blog/2022/04/18/increasing-healthcare-equity-with-data.html) data — such as geographical\nlocation, income level, education, housing — agencies can better identify\nunderserved communities and the critical factors that contribute to positive\nhealth outcomes.\n\n##### Challenges\n\n**Rapidly growing health data**\nHealthcare data is growing exponentially. Unfortunately, legacy on-premises data\narchitectures are complex to manage and too costly to scale for population-scale analytics.\n\n\n**Complexities of ML in healthcare**\nThe legacy analytics platforms that underpin healthcare lack the robust data\nscience capabilities needed for predictive health use cases like disease risk\nscoring. There’s also the challenge of managing reproducibility, which is critical\nwhen building ML models that can impact patient outcomes.\n\n##### Solution overview\n\nThe Databricks Lakehouse enables public health agencies to bring together all\ntheir research and patient data in a HIPAA-certified environment and marry it\nwith powerful analytics and AI capabilities to deliver real-time and predictive\ninsights at population scale. The Lakehouse eliminates the need for legacy\ndata architectures, which have historically inhibited innovation in patient care\nby creating data silos and making advanced analytics difficult. 
Databricks led\nopen source projects — like [Glow for genomics](https://databricks.com/blog/2021/11/17/databricks-open-source-genomics-toolkit-outperforms-leading-tools.html) and [Smolder for EHR data](https://databricks.com/blog/2021/01/28/burning-through-electronic-health-records-in-real-time-with-smolder.html) —\nthat make it easy to ingest and prepare healthcare-specific data modalities for\ndownstream analytics.\n\n\n**Fragmented patient data**\nIt is widely accepted that over 80% of medical data is unstructured, yet most\norganizations still focus their attention on data warehouses designed to only\nsupport structured data and SQL-based analytics.\n\n\n-----\n\n##### How to get started\n\n\n[Solution Accelerator:](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n[NLP for Healthcare](https://databricks.com/blog/2022/05/02/high-scale-geospatial-processing-with-mosaic.html)\n\nOur joint solutions with John Snow Labs bring together the power of Spark NLP\nfor Healthcare with the collaborative analytics and AI capabilities of Databricks.\nInformatics teams can ingest raw unstructured medical text files into Databricks,\nextract meaningful insights using natural language processing techniques,\nand make the data available for downstream analytics. We have specific NLP\nsolutions for [extracting oncology insights](https://databricks.com/solutions/accelerators/nlp-oncology) from lab reports, automating the de-identification of PHI and [identifying adverse drug events](https://databricks.com/blog/2022/01/17/improving-drug-safety-with-adverse-event-detection-using-nlp.html).\n\n[Solution Accelerator:](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n[Disease Risk Prediction](https://databricks.com/blog/2020/10/20/detecting-at-risk-patients-with-real-world-data.html)\n\nOne of the most powerful tools for identifying patients at risk for a chronic\ncondition is the analysis of real world data (RWD). This Solution Accelerator\nnotebook provides a template for building a machine learning model that\nassesses the risk of a patient for a given condition within a given window of time\nbased on a patient’s encounter history and demographics information.\n\n\n[Demo: Real-Time](https://www.youtube.com/watch?v=_ltDF2obiSc)\n[COVID-19 Contact Tracing](https://www.youtube.com/watch?v=_ltDF2obiSc)\n\nDatabricks’ COVID-19 surveillance solution takes a data-driven approach to\nadaptive response, applying predictive analytics to COVID-19 data sets to\nhelp drive more effective shelter-in-place policies.\n\n##### Customer story\n\n**[WATCH THE VIDEO](https://databricks.com/session_na21/from-vaccine-management-to-icu-planning-how-crisp-unlocked-the-power-of-data-during-a-pandemic)**\n\n**From Vaccine Management to ICU Planning**\nDuring the pandemic, the Chesapeake Regional Information System for our\nPatients implemented a modern data architecture on Databricks to address\ncritical reporting needs. This allowed them to analyze 400 billion data points\n\nfor innovative use cases like real-time disease spread tracking, vaccine\ndistribution and prioritizing vulnerable populations.\n\n\n-----\n\n## Conclusion\n\nToday, data is at the core of how government agencies operate and AI is at the\n\nforefront of driving innovation into the future. 
The Databricks Lakehouse for\n\nPublic Sector enables government agencies at the federal, state and local level\n\nto harness the full power of data and analytics to solve strategic challenges and\n\nmake smarter decisions that improve the safety and quality of life of all citizens.\n\nGet started with a free trial of Databricks Lakehouse and start building better\n\ndata applications today.\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\n###### Contact us for a personalized demo databricks.com/contact\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\nunify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the original creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a\nmission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "###### EBOOK\n\n# Lakehouse for Manufacturing\n\n###### Build a connected customer experience, optimize operations and unify your data ecosystem\n\n\n-----\n\n## Contents\n\nIntroduction .......................................................................................................................... **3**\n\nManufacturing Transformation Trends .............................................................................. **5**\n\nManufacturing Data Challenges ......................................................................................... **9**\n\nDatabricks Lakehouse for Manufacturing ....................................................................... **10**\n\nBuilding Innovative Solutions on the Lakehouse ............................................................. **12**\n\n**SOLUTION:** Part-Level Demand Forecasting ....................................................................... 12\n\n**SOLUTION:** Overall Equipment Effectiveness & KPI Monitoring ............................................. 14\n\n**SOLUTION:** Digital Twins ................................................................................................... 15\n\n**SOLUTION:** Computer Vision ............................................................................................ 16\n\nAn Ecosystem on the Lakehouse for Manufacturing ...................................................... **17**\n\n**SOLUTION:** Avanade Intelligent Manufacturing .................................................................. **18**\n\n**SOLUTION:** DataSentics Quality Inspector ........................................................................ **18**\n\nSOLUTION: Tredence Predictive Supply Risk Management ................................................. **19**\n\nLeading Manufacturing Companies That Choose Us ................................................... **20**\n\n\n-----\n\n## Introduction\n\nMarket conditions in manufacturing are more challenging than ever. Operating margins\nand growth are impacted by the rising cost of labor, materials, energy and transportation, all\npeaking at the same time. 
Disruptive events in the supply chain are increasing in frequency\nand intensity, leading to significant revenue losses and damaged brand reputation.\n\nEffective acquisition and retention of next-generation talent is a considerable issue for\nmanufacturers. There are more jobs in the industry than there are people to do them, further\ncompounding the problem of slower than expected industrial productivity growth over the\nlast 15 years. The industry is also one of the largest consumers of energy, and faces a direct\nchallenge of transforming operations to be more sustainable as governments are prioritizing\nnet-zero policies that require a step change in energy efficiency and transition to low-carbon\nenergy sources.\n\nThe manufacturing industry generates massive amounts of new data every day — estimated\nto be two to four times more in size than in industries such as communications, media,\nretail and financial services. This explosion of data has opened the door for the global\nmanufacturing ecosystem to boost productivity, quality, sustainability and growth beyond\nwhat was previously thought possible.\n\nUnfortunately, legacy data warehouse-based architectures weren’t built for the massive\nvolumes and type of data coming in through today’s factories, products, processes and\nworkers, let alone to support the advanced AI/ML use cases required to meet the customer\nexpectations of shorter lead times, reliable delivery and smarter products.\n\n\n-----\n\nFor that, companies need to adopt a modern data architecture that provides the speed, scale and\ncollaboration needed by broad teams of data engineers, data scientists, and analysts. Manufacturers need\na comprehensive data platform that can not only handle massive volumes of data, but effectively and\nseamlessly operationalize the value from data, analytics and AI.\n\nThis is achieved by:\n\nRemoving data silos by placing all data, regardless of type or frequency, in a single, open\narchitecture — including unstructured data from sensors, telemetry, natural language logs,\nvideos and images — helping you to gain end-to-end visibility into your business\n\nEnsuring your data is “always on” so that the freshest and highest quality data is available for\nthe full spectrum of enterprise analytics and AI/ML use cases, allowing you to drive IT-OT convergence\n\nHaving a comprehensive open architecture so IT and data teams can move with agility\nto bring AI and ML to where it’s needed, when it’s needed, including in connectivity-constrained environments\n\nMaintaining fine-grained governance and access control on your data assets, protecting\n\nsensitive intellectual property and customer data\n\nThe Databricks Lakehouse for Manufacturing does just this. It’s a comprehensive approach that empowers\nteams in the industry to collaborate and innovate around data, analytics and AI. It eliminates the technical\nlimitations of legacy technologies and gives data teams the ability to drive deeper, end-to-end insight\ninto supply chains, automate processes to reduce costs and grow productivity, and achieve sustainable\ntransformation for a more prosperous future. Welcome to the Lakehouse for Manufacturing.\n\n\n-----\n\n## Manufacturing Transformation Trends\n\n\nThe future of manufacturing is smart, sustainable and service-oriented. 
Today’s\nforward-thinking leaders are preparing the foundation they need to support that\nfuture by leveraging fast and connected data from all corners of the enterprise.\nThere are four key trends driving transformation in manufacturing:\n\n**Boosting industrial productivity through automation**\n\nA spike in labor costs, as well as the cost of energy and materials, puts significant\npressure on operating margins. At the same time, industrial productivity has\nplateaued — it is at the same level today as it was in the late 2000s. In the face\nof these macro challenges and economic uncertainty, there has never been a\nmore burning need to reduce costs and improve productivity through greater\nvisibility and automation.\n\nThe industry has made strides in collecting data from machines and performing\npredictive analytics on sensor readings, with 47% of manufacturers citing the\nuse of predictive maintenance to reduce operational costs with considerable\nupside ahead.\n\nHowever, there is an entirely different class of unstructured data in the form of\nimages, videos and LiDAR that is opening the door to game-changing automation\nin quality inspection, flow optimization and production scheduling. Historically,\nthese critical processes have depended on manual and visual inspection of\nproducts and operations, which is resource intensive and less accurate than\nML-driven computer vision techniques. This untapped data and capability\nis allowing manufacturers to deliver higher product quality and deliver on\nproduction demands using fewer resources. Andrew Ng, a machine learning\n\n\npioneer, rightly describes the massive opportunity for these technologies in\nhis quote: “It is incumbent on every CEO in any manufacturing or industrial\nautomation company to figure out how to make deep learning technology work\nfor your business.”\n\n**CUSTOMER STORY SPOTLIGHT:**\n##### Corning\n\n#### $2 million in cost avoidance through \n\nmanufacturing upset event reduction\n\n**Driving Better Efficiency in Manufacturing Process With ML**\n\nCorning has been one of the world’s leading innovators in materials science for\n\nnearly 200 years. Delivering high-quality products is a key objective across the\n\ncompany’s manufacturing facilities around the world, and it’s always on a mission\n\nto explore how ML can help deliver on that goal. Databricks has been central\n\nto the company’s digital transformation, as it provides a simplified and unified\n\nplatform where teams can centralize all data and ML work. 
Now, they can train\n\nmodels, register them in MLflow, generate all additional artifacts — like exported\n\nformats — and track them in the same place as the base model.\n\n[LEARN MORE](https://www.databricks.com/blog/2023/01/05/how-corning-built-end-end-ml-databricks-lakehouse-platform.html)\n\n\n-----\n\n**Gaining end-to-end operations and**\n**supply chain visibility**\n\nModern customer expectations are forcing manufacturers to focus on more\ncustomer-centric KPIs: quality, on-time commitments and speed of delivery.\nThat’s not to say that asset and labor efficiency are less important — however,\nwith customer expectations of shorter lead times and more reliable delivery,\nthe success measures in manufacturing are shifting to a mantra of “measure\nwhat your customer values.”\n\nHigh-performing manufacturers that embed this deep into their operational\nplaybook also perform best on productivity and ROIC growth results, as\nevidenced in a recent study by the World Economic Forum and the International\nCentre of Industrial Transformation. The problem? In a post-pandemic world,\noperations and supply chains are persistently constrained, with increasing\ndisruptions, spiraling costs and unpredictable performance. The business\nimpact is considerable — studies have shown that a 30-day disruption can\nreduce EBITDA by 5% and impact annual revenue by as much as 20%.\n\nManufacturing companies need to be able to deliver on customer expectations,\ncommitments and service levels, all while lowering costs and increasing\nproductivity. Manufacturers need an enterprise data platform that can provide\nreal-time visibility into order flows, production processes, supplier performance,\ninventory and logistics execution, breaking down departmental silos to maximize\ncustomer responsiveness, improve manufacturing agility and boost performance.\n\n\n**Transforming your business model through**\n**tech-fueled services**\n\nServitization, defined as the process of building revenue streams from services,\nhas been trending for some time. The adaptation of the business model has\nbeen considerably profitable: on average, services account for ~30% of industrial\nmanufacturing companies but contribute 60%+ of profit.\n\nIn aftersale services, a clear customer preference for business outcome-based\nofferings has emerged in almost every corner of the manufacturing industry.\nThe use of data, analytics and AI is foundational to delivering more personalized\ncustomer outcomes, proactive field service delivery and differentiated missioncritical applications to their customers.\n\nWith greater autonomy, connectivity and sensorization, manufacturers operate\nin a paradigm where their products generate more and more data every second,\nopening up numerous new addressable opportunities for value creation. The\nbusiness of manufacturing is no longer linear, and manufacturers will need to\nreimagine their businesses to go beyond merely providing the primary unit of\nproduction — the next SKU, machine, vehicle or airplane — and leverage this data\nto operate a platform business with higher growth, stickier revenue streams and\ngreater resilience to demand shocks.\n\n\n-----\n\n**CUSTOMER STORY SPOTLIGHT:**\n##### Rolls-Royce\n\n**Aerospace Goes Green With Data and AI**\n\nWhile most people think of luxury cars when they hear “Rolls-Royce,” the\n\nCivil Aerospace branch is its own company, having separated from the car\n\nmanufacturing arm in 1971. 
The now wildly successful manufacturer of commercial\n\nairplane engines is a leader in its industry for innovation. Today, Rolls-Royce\n\n\n_“We employed Databricks to optimize inventory planning using data and analytics,_\n_positioning parts where they need to be, based on the insight we gain from our_\n_connected engines in real time and usage patterns we see in our service network. This_\n_has helped us minimize risks to engine availability, reduce lead times for spare parts_\n_and drive more efficiency in stock turns — all of this enables us to deliver TotalCare,_\n_the aviation industry’s leading Power-by-the-Hour (PBH) maintenance program.”_\n\n**S T U A R T H U G H E S**\n\nChief Information and Digital Officer\nRolls-Royce Civil Aerospace\n\n\nobtains information directly from the airlines’ engines and funnels it into the\n\nDatabricks platform. This gives the company insights into how the engines are\n\nperforming and ways to improve maintenance schedules, translating to less\n\ndowntime, delays, and rerouting — all of which reduce carbon footprint.\n\n[LEARN MORE](https://www.wired.com/sponsored/story/how-tech-is-helping-to-save-the-world/)\n\n\n-----\n\n**Driving a more sustainable approach**\n**to manufacturing**\n\nGlobal efforts on reducing greenhouse gas (GHG)\nemissions are accelerating, with over 70 countries\nrepresenting more than 75% of global emissions\nhaving signed agreements to reach net-zero\nemissions by 2050. Manufacturing-centric sectors\nare critical to achieving net-zero sustainability\ncommitments around the world, as they represent\nover 50% of global energy consumption and\ncontribute to ~25% of global emissions.\n\nThose at the forefront of data, analytics and\nAI are setting science-based targets and are\ndriving favorable sustainability outcomes today\nby deriving better insights from their operations,\nsupply chains and the outcomes that their\nproducts generate for their end customers.\n\n\n**CUSTOMER STORY SPOTLIGHT:**\n##### Shell\n\n**Delivering Innovative Energy Solutions for a Cleaner World**\n\n\nShell has been at the forefront of creating a cleaner tomorrow by investing in digital\n\ntechnologies to tackle climate change and become a net-zero emissions energy\n\nbusiness. Across the business, they are turning to data and AI to improve operational\n\nefficiencies, drive customer engagement, and tap into new innovations like renewable\n\nenergy. Hampered by large volumes of data, Shell chose Databricks to be one of\n\nthe foundational components of its Shell.ai platform. Today, Databricks empowers\n\nhundreds of Shell’s engineers, scientists and analysts to innovate together as part of\n\ntheir ambition to deliver cleaner energy solutions more rapidly and efficiently.\n\n[LEARN MORE](https://www.google.com/url?q=https://www.databricks.com/customers/shell&sa=D&source=editors&ust=1679097620349908&usg=AOvVaw00lb46oTfGRpOREXOI1Ue3)\n\n_“Shell has been undergoing a digital transformation as part of our ambition to deliver more_\n_and cleaner energy solutions. As part of this, we have been investing heavily in our data lake_\n_architecture. Our ambition has been to enable our data teams to rapidly query our massive_\n_data sets in the simplest possible way. The ability to execute rapid queries on petabyte_\n_scale data sets using standard BI tools is a game changer for us. 
Our co-innovation_\n_approach with Databricks has allowed us to influence the product road map, and we are_\n_excited to see this come to market.”_\n\n\n### Millions\nof dollars saved in\npotential engine\nrepair costs\n\ndata team\n### 250\nmembers supporting\n160+ high-value use\ncases\n\nfaster –\n### 9x\n5 minutes to validate\na label, reduced from\n45 minutes\n\n\n**D A N I E L J E AV O N S**\nGeneral Manager – Advanced Analytics CoE\n\nShell\n\n\n-----\n\n## Manufacturing Data Challenges\n\n\n**Massive unstructured/OT data volumes**\n\nThe industry is seeing immense growth in data volumes: much of this massive\ngrowth is due to semi-structured and unstructured data from connected workers,\nbuildings, vehicles and factories. This growth in multi-modal data from IoT sensors,\nprocess historians, product telemetry, images, cameras and perception systems\nhas outpaced legacy data warehouse-centric technologies. On-prem and cloud\ndata warehouse tech-based architectures are too complex and too costly for the\nlarge and heterogeneous data sets prevalent in the industry.\n\n**Driving IT-OT convergence**\n\nThe success and pace of data modernization efforts in manufacturing is so often\nmuted by critical data being stuck in multiple closed systems and proprietary\nformats, making it difficult and cost-prohibitive to extract the full potential of IT\nand OT data sets. In addition, data quality issues such as outdated or inaccurate\ndata can often lead to a disjointed and incomplete view of customers, operations\nand assets. For years, companies have lacked a common foundation for complex\nand heterogeneous manufacturing data — from IoT-generated data streams to\nfinancial metrics stored in ERP applications — and it has impacted their ability to\nprovide the freshest, highest-quality and most complete data for analytics.\n\n\n**Bringing AI/ML to where it’s needed**\n\nTo realize the promise of AI/ML in manufacturing, machine learning models need\nto be brought as close to the decision as possible, often at the edge in facilities\nand locations with limited or intermittent connectivity to the internet or cloud.\nThis requires deployment flexibility to on-premises or edge devices, with an\nexperience comparable to that in the cloud.\n\n**Inability to innovate at scale**\n\nCDOs want to be able to quickly and efficiently reproduce successes at global\nscale. Technical and business users want to simply and quickly know what data\nsets are available to solve the business issue at hand. Analysts want flexibility to\nuse the tools they are most familiar with in order to stay responsive to business\nneeds. Fragmented approaches to architecture and tooling make scaling\nbusiness impact very difficult, which results in talent churn, slower development\nand duplicative efforts — all leading to higher costs.\n\n\n-----\n\n## Databricks Lakehouse for Manufacturing\n\n**Deliver personalized outcomes and frictionless experiences**\n\n**Millions of assets streaming IoT data**\n\n**5%–10% reduction in unplanned downtime and cost**\n\n**Accurate prices across 1,000s of locations and millions of dealers**\n\n**200%+ increase in offer conversion rates**\n\nWith Databricks Lakehouse for Manufacturing, manufacturers can gain a\nsingle view of their customers that combines data from each stage of the\ncustomer journey. 
With a 360-degree view in place, manufacturers can drive\nmore differentiated sales strategies and precise service outcomes in the\nfield, delivering higher revenue growth, profitability and CSAT scores.\n\nWith the Databricks Lakehouse, you can analyze product telemetry data,\ncustomer insights and service networks to deliver highest uptime, quality of\nservice and economic value through the product lifecycle.\n\n**Optimize the supply chain, production processes and fulfillment logistics**\n\n**with real-time analytics and AI.**\n\nThe Databricks Lakehouse for Manufacturing is the only enterprise data platform\nthat helps manufacturing organizations optimize their supply chains, boost\nproduct innovation, increase operational efficiencies, predict fulfillment needs\nand reduce overall costs.\n\n\n-----\n\n**Gain real-time insight for agile manufacturing and logistics**\n\n**30%–50% improvement in forecast accuracy**\n\n**90% lower cost for new manufacturing line**\n\n**4%–8% reduction in logistics costs**\n\n**10% improvement in carbon footprint**\n\nThe Databricks Lakehouse lets you build a resilient and predictive supply\nchain by eliminating the trade-off between accuracy or depth of analysis\nand time. With scalable, fine-grained forecasts to predict or sense demand,\nor perform supply chain planning and optimization, Databricks improves\naccuracy of decisions, leading to higher revenue growth and lower costs.\n\nThe lakehouse provides an “always on” architecture that makes IT-OT\nconvergence a reality, by continuously putting all data to work regardless of the\nfrequency at which it arrives (periodic, event-driven or real-time streaming)\nand creates valuable data products that can empower decision makers. This\ncreates real-time insight into performance with data from connected factory\nequipment, order flows and production processes to drive the most effective\nresource scheduling.\n\n\n**Empower the manufacturing workforce of the future**\n\n**25% improvement in data team productivity**\n\n**50x faster time to insight**\n\n**50% reduction in workplace injuries**\n\nWith Databricks, manufacturers can increase the impact and decrease the\ntime-to-value of their data assets, ultimately making data and AI central to every\npart of their operation. And by empowering data teams across engineering,\nanalytics and AI to work together, Databricks frees up employees to self-serve\nand focus on realizing maximum business value — improving product quality,\nreducing downtime and exceeding customer expectations.\n\n**Execute product innovation at the speed of data**\n\n**90% decrease in time to market of new innovations**\n\n**20x faster data processing of vehicle and road data**\n\nIt is critical that manufacturers are offering the most desirable value\npropositions so end consumers don’t look elsewhere. By tapping into product\nperformance and attribute data along with market trends and operations\ninformation, manufacturers can make strategic decisions.\n\nWith Databricks, manufacturers can decrease time to market with new products\nto increase sales by analyzing customer behavior and insights (structured,\nunstructured and semi-structured), product telemetry (streaming, RFID, computer\nvision) and digital twins, and leveraging that data to drive product decisions.\n\n\n-----\n\n## Building Innovative Solutions on the Lakehouse\n\n\nThe flexibility of the Databricks Lakehouse Platform means that you can start\nwith the use case that will have the most impact on your business. 
Through\nour experience working with some of the largest and most cutting-edge\nmanufacturers in the world, we’ve developed Solution Accelerators based\non the most common needs of manufacturers to help you get started. These\npurpose-built guides — fully functional notebooks and best practices — speed\nup results across your most common and high-impact use cases. Go from idea\nto proof of concept (PoC) in as little as two weeks. Check out the full list of\nSolution Accelerators [here](https://www.databricks.com/solutions/accelerators) .\n\n**S O L U T I O N**\n**Part-Level Demand**\n**Forecasting**\n\n\nDemand forecasting is a critical business process for manufacturing and\nsupply chains. McKinsey estimates that over the next 10 years, supply\nchain disruptions can cost close to half (~45%) of a year’s worth of profits\nfor companies. Having accurate and up-to-date forecasts is vital to plan\nthe scaling of manufacturing operations, ensure sufficient inventory and\nguarantee customer fulfillment.\n\nIn recent years, manufacturers have been investing heavily in quantitative-based forecasting that is driven by historical data and powered using either\nstatistical or machine learning techniques. Benefits include:\n\n**•** Better sales planning and revenue forecasting\n\n**•** Optimized safety stock to maximize turn-rates and\nservice-delivery performance\n\n**•** Improved production planning by tracing back\nproduction outputs to raw material levels\n\n**A disruption lasting just 30 days or less could**\n\n**equal losses of** **3%-5% of EBITDA.**\n\n\n-----\n\nDatabricks Lakehouse can enable large-scale forecasting solutions to help\nmanufacturers navigate the most common data challenges when trying to\nforecast demand.\n\n**C O M M O N U S E C A S E S :**\n\nScalable, accurate forecasts across large numbers of store-item\ncombinations experiencing intermittent demand\n\nAutomated model selection to ensure the best model is selected\nfor each store-item combination\n\nMetrics to identify the optimal frequency with which to generate\nnew predictions\n\nManage material shortages and predict overplanning\n\n**Try our** **[Parts-Level Solution Accelerator](https://www.databricks.com/solutions/accelerators/demand-forecasting)** **to facilitate**\n\n**fine-grained demand forecasts and planning.**\n\n\n-----\n\n**S O L U T I O N**\n**Overall Equipment Effectiveness**\n**& KPI Monitoring**\n\n\nThe need to monitor and measure manufacturing equipment performance is\ncritical for operational teams within manufacturing. Today, Overall Equipment\nEffectiveness (OEE) is considered the standard for measuring manufacturing\nequipment productivity. According to Engineering USA, an OEE value of 85% or\nabove is considered world-leading. However, many manufacturers typically achieve\na range of between 40% and 60%. 
Reasons for underachievement often include:\n\n**•** Delayed inputs due to manual processes that are prone to human error\n\n**•** Bottlenecks created by data silos, impeding the flow of fresh data to\nstakeholders\n\n**•** A lack of collaboration capabilities, keeping stakeholders from working on the\nsame information at the same time\n\n**Poor OEE value** **can be a result of poor parts quality, slow**\n**production performance and production availability issues.**\n\nDatabricks Lakehouse can help manufacturers maneuver through the\nchallenges of ingesting and converging operational technology (OT) data with\ntraditional data from IT systems to build forecasting solutions.\n\n**C O M M O N U S E C A S E S**\n\nIncrementally ingest and process sensor data from IoT devices\nin a variety of formats\n\nCompute and surface KPIs and metrics to drive valuable insights\n\nOptimize plant operations with data-driven decisions\n\n**Try our** **[Solution Accelerator for OEE and KPI Monitoring](https://www.databricks.com/solutions/accelerators/overall-equipment-effectiveness)** **for**\n**performant and scalable end-to-end monitoring.**\n\n\n-----\n\nMarket dynamics and volatility are requiring manufacturers to bring products to\nmarket more quickly, optimize production processes and build agile supply chains\nat scale at a lower price. To do so, many manufacturers have turned to building\ndigital twins, which are virtual representations of objects, products, pieces of\nequipment, people, processes or even complete manufacturing ecosystems.\n\nDigital twins provide insights — derived from sensors (often IoT or IIoT) that\nare embedded in the original equipment — that have the potential to transform\nthe manufacturing industry by driving greater efficiency, reducing costs and\nimproving quality.\n\n\n**S O L U T I O N**\n**Digital Twins**\n\n\n**Digital twin technologies can improve product**\n\n**quality by** **up to 25%.**\n\nDatabricks Lakehouse can bring digital twins to life through fault-tolerant\nprocessing of streaming workloads generated by IoT sensor data and complex\nevent processing (important for modeling physical processes).\n\n**C O M M O N U S E C A S E S**\n\nProcess real-world data in real time\n\nCompute insights at scale and deliver to multiple downstream applications\n\nOptimize plant operations with data-driven decisions\n\n**Try our** **[Solution Accelerator for Digital Twins](https://www.databricks.com/solutions/accelerators/digital-twins)** **to accelerate**\n**time to market of new innovations.**\n\n\n-----\n\n**S O L U T I O N**\n**Computer Vision**\n\nThe rise in computer vision has been fueled by the rapid developments in\nneural network technologies, which use AI to better understand and interpret\nimages with near-perfect precision. In manufacturing, computer vision can\ntransform operations by, for example, identifying product defects to improve\nquality control, detecting safety hazards on the production floor, and tracking\nand managing inventory levels.\n\n**As per the American Society for Quality, cost of poor quality for**\n\n**companies can be as high as** **20% of revenue.**\n\n\nDatabricks Lakehouse can easily ingest complex, unstructured image and video\ndata at massive scale. 
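To make the ingestion claim above concrete, here is a minimal sketch of loading inspection images into a Delta table with Spark's built-in `binaryFile` reader; the volume path and table name are illustrative placeholders, not part of the accelerator.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Read raw image files (path, modification time, length, binary content) in parallel.
# The source path and target table below are hypothetical placeholders.
raw_images = (
    spark.read.format("binaryFile")
    .option("pathGlobFilter", "*.jpg")        # pick up only JPEG frames
    .option("recursiveFileLookup", "true")    # walk nested production-line folders
    .load("/Volumes/main/quality/inspection_images/")
)

# Land the images in Delta so CV training and scoring jobs share one governed copy.
(
    raw_images.write.format("delta")
    .mode("append")
    .saveAsTable("main.quality.raw_inspection_images")
)
```

From there, the binary `content` column can be decoded with any of the common computer vision libraries discussed next.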
Through the most popular computer vision libraries, data\nteams can scale AI models that leverage computer vision to recognize patterns,\ndetect objects and make predictions with 99% accuracy.\n\n**C O M M O N U S E C A S E S**\n\nQuickly identify defects and ensure that products and processes meet\nquality standards\n\nAutomate positioning and guidance to ensure that parts and products are\nproperly aligned and assembled\n\nPredict maintenance issues to reduce downtime and maintenance costs,\nimprove parts reliability, and increase safety for workers\n\n**Try our** **[Solution Accelerator for Computer Vision](https://www.databricks.com/blog/2021/12/17/enabling-computer-vision-applications-with-the-data-lakehouse.html)** **to improve**\n**efficiency, reduce costs and enhance overall safety.**\n\n\n-----\n\n## An Ecosystem on the Lakehouse for Manufacturing\n\nWe’ve partnered with leading consulting firms and\nindependent software vendors to deliver innovative,\nmanufacturing-specific solutions. Databricks\nBrickbuilder Solutions help you cut costs and\nincrease value from your data. Backed by decades\nof industry expertise — and built for the Databricks\nLakehouse Platform — Brickbuilder Solutions are\ntailored to your exact needs.\n\nWe also work with technology partners like Alteryx,\nAtScale, Fivetran, Microsoft Power BI, Qlik, Sigma,\nSimplement, Tableau and ThoughtSpot to accelerate\nthe availability and value of data. This allows\nbusinesses to unify data from complex source\nsystems and operationalize it for analytics, AI and\nML on the Databricks Lakehouse Platform.\n\n\n-----\n\n**S O L U T I O N**\n**Avanade Intelligent Manufacturing**\n\nEvery year, businesses lose millions of dollars due to equipment failure,\nunscheduled downtime and lack of control in maintenance scheduling. Along\nwith lost dollars, businesses will experience lower employee morale when\nstations are in and out of service. Avanade’s Intelligent Manufacturing solution\nsupports connected production facilities and assets, workers, products and\nconsumers to create value through enhanced insights and improved outcomes.\nManufacturers can harness data to drive interoperability and enhanced insights\nat scale using analytics and AI. Outcomes include improvements across\nproduction (e.g., uptime, quantity and yield), better experiences for workers,\nand greater insight into what customers want.\n\n**Try our joint solution,** **[Intelligent Manufacturing](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/avanade-intelligent-manufacturing)** **, to drive value and**\n**operationalize team coordination and productivity.**\n\n\n**S O L U T I O N**\n**DataSentics Quality Inspector**\n\nQuality control is a crucial aspect of any production process, but traditional\nmethods can be time-consuming and prone to human error. Quality\nInspector by DataSentics, an Atos company, offers a solution that is\nboth efficient and reliable. With out-of-the-box models for visual quality\ninspection, which are tailored to meet specific business requirements,\norganizations will experience stable, scalable quality control that’s easy to\nimprove over time. 
Quality Inspector is an end-to-end solution that can be\nseamlessly integrated into an existing setup, delivering high performance\nand reliability.\n\n**Try our joint solution,** **[Quality Inspector](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to automate production quality**\n**control with an increase in accuracy and quicker time to value.**\n\n\n-----\n\nTREDENCE PSRM_1”: PREDICT SUPPLY RISK\n\nTREDENCE PSRM_2”: REAL-TIME SHIPMENT VISIBILITY\n\nTREDENCE PSRM_3”: DELAY ALERTS\n\n\n**S O L U T I O N**\n**Tredence Predictive Supply Risk Management**\n\nCustomers today are faced with multiple supply risks including lack of\nin-transit visibility, disruptions caused by weather, local events, among\nothers. Tredence’s Predictive Supply Risk Management solution, built on\nthe Databricks Lakehouse Platform, helps businesses meet supply risk\nchallenges by providing a scalable, cloud-based solution that can be\ntailored to the specific needs of each organization. The platform’s flexibility\nand scalability allow businesses to keep pace with changing regulations\nand customer demands, while their comprehensive suite of tools helps\nidentify and mitigate risks across the enterprise.\n\n**Try our joint solution,** **[Predictive Supply Risk Management](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview)** **, to**\n**predict order delays, identify root causes and quantify supply**\n**chain impact.**\n\nVisit our [site](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions?itm_data=menu-item-brickbuildersoverview) to learn more about our Databricks Partner Solutions.\n\n\n-----\n\n## Leading Manufacturing Companies That Choose Us\n\n\n-----\n\nDatabricks is the lakehouse company. More than 9,000 organizations worldwide\n\n— including Comcast, Condé Nast and over 50% of the Fortune 500 — rely on the\n\nDatabricks Lakehouse Platform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe. Founded by the\n\noriginal creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission\n\nto help data teams solve the world’s toughest problems. 
To learn more, follow\n\nDatabricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n###### Get started with a free trial of Databricks and start building data applications today\n\n**[START YOUR FREE TRIAL](https://www.databricks.com/try-databricks?utm_medium=paid+search&utm_source=google&utm_campaign=14272820537&utm_adgroup=126939742998&utm_content=trial&utm_offer=try-databricks&utm_ad=634147899783&utm_term=try%20databricks&gclid=CjwKCAiAr4GgBhBFEiwAgwORrTnkJaDf9SpIDy2RxOV28a2G2HtUDvJnLXiVWBsqcAWa_XmSvabkVRoCiwgQAvD_BwE#account)**\n\nTo learn more, visit us at:\n**[Manufacturing Industry Solutions](https://www.databricks.com/solutions/industries/manufacturing-industry-solutions)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Lakehouse-for-Manufacturing.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**2 0 2 0 E D I T I O N** | U P D AT E D\n\n# Standardizing the Machine Learning Lifecycle\n\n### From experimentation to production with MLflow\n\n[��](https://mlflow.org)\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\n#### Contents\n\nChapter 1: \u0007Machine Learning\nLifecycle Challenges 3\n\nChapter 2: \u0007Applying Good Engineering\nPrinciples to Machine Learning 7\n\nChapter 3: \u0007Introducing MLflow 9\n\nChapter 4: \u0007A Closer Look at MLflow\nModel Registry 16\n\nChapter 5: \u0007Making Organizations\nSuccessful with ML 19\n\nChapter 6: \u0007Introducing the Unified\nData Analytics Platform 20\n\nChapter 7: \u0007Standardizing the Machine\nLearning Lifecycle on Databricks 25\n\nChapter 8: \u0007Getting Started 26\n\nChapter 9: \u0007Comparison Matrix 27\n\n\n#### Preface\n\n##### Technology changes quickly. Data science and machine learning (ML) are moving\n even faster. In the short time since we first published this eBook, businesses across industries have rapidly matured their machine learning operations (MLOps) — implementing ML applications and moving their first models into production. This has turned ML models into corporate assets that need to be managed across the lifecycle.\n\n That’s why MLflow, an open-source platform developed by Databricks, has emerged\n as a leader in automating the end-to-end ML lifecycle. With 1.8 million 1 downloads a month — and growing support in the developer community — this open-source platform is simplifying the complex process of standardizing and productionizing MLOps. This updated eBook explores the advantages of MLflow and introduces you to the newest component: MLflow Model Registry. You’ll also discover how MLflow fits into the Databricks Unified Data Analytics Platform for data engineering, science and analytics.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 1: **\u0007** **Machine Learning**\n\n#### Lifecycle Challenges\n\n\nBuilding machine learning models is hard. Putting them into production is harder. Enabling others — data\n\nscientists, engineers or even yourself — to reproduce your pipeline and results is equally challenging. How\n\nmany times have you or your peers had to discard previous work because it was either not documented\n\nproperly or too difficult to replicate?\n\nGetting models up to speed in the first place is significant enough that it can be easy to overlook long-\n\nterm management. What does this involve in practice? 
In essence, we have to compare the results of\n\ndifferent versions of ML models along with corresponding artifacts — code, dependencies, visualizations,\n\nintermediate data and more — to track what’s running where, and to redeploy and roll back updated models\n\nas needed. Each of these requires its own specific tools, and it’s these changes that make the ML lifecycle\n\nso challenging compared with traditional software development lifecycle (SDLC) management.\n\nThis represents a serious shift and creates challenges compared with a more traditional software\n\ndevelopment lifecycle for the following reasons:\n\n\nThe diversity and number of ML\n\ntools involved, coupled with a\n\nlack of standardization across\n\nML libraries and frameworks\n\n\nThe continuous nature of ML\n\ndevelopment, accompanied by a\n\nlack of tracking and management\n\ntools for machine learning models\n\nand experiments\n\n\nThe complexity of productionizing\n\nML models due to the lack of\n\nintegration among data pipelines,\n\nML environments and production\n\nservices\n\n\nLet’s look at each of these areas in turn.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The diversity and number of ML tools involved\n\n\nWhile the traditional software development process leads to the\n\nrationalization and governance of tools and platforms used for developing and\n\nmanaging applications, the ML lifecycle relies on data scientists’ ability to use\n\nmultiple tools, whether for preparing data and training models, or deploying\n\nthem for production use. Data scientists will seek the latest algorithms from\n\n\nHowever, due to the variety of available tools and the lack of detailed tracking,\n\nteams often have trouble getting the same code to work again in the same way.\n\nReproducing the ML workflow is a critical challenge, whether a data scientist\n\nneeds to pass training code to an engineer for use in production or go back to\n\npast work to debug a problem.\n\n\nthe most up-to-date ML libraries and frameworks available to compare results\n\nand improve performance.\n\n**PREP DATA** **BUILD MODEL** **DEPLOY MODEL**\n\nAzure ML\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The continuous nature of ML development\n\nTechnology never stands still. New data, algorithms,\n\nlibraries and frameworks impact model performance\n\ncontinuously and, thus, need to be tested. Therefore,\n\nmachine learning development requires a continuous\n\n\napproach, along with tracking capabilities to\n\ncompare and reproduce results. The performance\n\nof ML models depends not only on the algorithms\n\nused, but also on the quality of the data sets and the\n\nparameter values for the models.\n\n\n**P R E P**\n**D ATA**\n\n**B U I L D**\n**M O D E L**\n\n\nWhether practitioners work alone or on teams, it’s\n\nstill very difficult to track which parameters, code\n\nand data went into each experiment to produce a\n\nmodel, due to the intricate nature of the ML\n\nlifecycle itself.\n\n**D E P L O Y**\n**M O D E L**\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The complexity of productionizing ML models\n\n\nIn software development, the architecture is set early on, based on the target\n\napplication. Once the infrastructure and architecture have been chosen, they\n\nwon’t be updated or changed due to the sheer amount of work involved in\n\nrebuilding applications from scratch. 
Modern developments, such as the move\n\nto microservices, are making this easier, but for the most part, SDLC focuses on\n\nmaintaining and improving what already exists.\n\n\nOne of today’s key challenges is to effectively transition models from\n\nexperimentation to staging and production — without needing to rewrite the code\n\nfor production use. This is time-consuming and risky as it can introduce new\n\nbugs. There are many solutions available to productionize a model quickly, but\n\npractitioners need the ability to choose and deploy models across any platform,\n\nand scale resources as needed to manage model inference effectively on big data,\n\nin batch or real time.\n\n\nWith machine learning the first goal is to build a model. And keep in mind: a\n\nmodel’s performance in terms of accuracy and sensitivity is agnostic from the\n\ndeployment mode. However, models can be heavily dependent on latency, and\n\nthe chosen architecture requires significant scalability based on the business\n\napplication. End-to-end ML pipeline designs can be great for batch analytics and\n\nlooking at streaming data, but they can involve different approaches for real-time\n\nscoring when an application is based on a microservice architecture working via\n\nREST APIs, etc.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 2: **\u0007** **Applying Good Engineering**\n\n#### Principles to Machine Learning\n\n\nMany data science and machine learning projects fail due to preventable issues that have been resolved\n\nin software engineering for more than a decade. However, those solutions need to be adapted due to key\n\ndifferences between developing code and training ML models.\n\n- \u0007 **Expertise, code and data** — With the addition of data, data science and ML, code not only needs to deal\n\nwith data dependencies but also handle the inherent nondeterministic characteristics of statistical\n\nmodeling. ML models are not guaranteed to behave the same way when trained twice, unlike traditional\n\ncode, which can be easily unit tested.\n\n- \u0007 **Model artifacts** — In addition to application code, ML products and features also depend on models\n\nthat are the result of a training process. Those model artifacts can often be large — on the order of\n\ngigabytes — and often need to be served differently from code itself.\n\n- \u0007 **Collaboration** — In large organizations, models that are deployed in an application are usually not trained\n\nby the same people responsible for the deployment. Handoffs between experimentation, testing and\n\nproduction deployments are similar but not identical to approval processes in software engineering.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### The need for standardization\n\nSome of the world’s largest tech companies have already begun solving these problems internally with\n\ntheir own machine learning platforms and lifecycle management tools. 2 These internal platforms have\n\nbeen extremely successful and are designed to accelerate the ML lifecycle by standardizing the process of\n\ndata preparation, model training, and deployment via APIs built for data scientists. The platforms not only\n\nhelp standardize the ML lifecycle but also play a major role in retaining knowledge and best practices, and\n\nmaximizing data science team productivity and collaboration, thereby leading to greater ROI.\n\nInternally driven strategies still have limitations. 
First, they are limited to a few algorithms or frameworks.\n\nAdoption of new tools or libraries can lead to significant bottlenecks. Of course, data scientists always\n\nwant to try the latest and the best algorithms, libraries and frameworks — the most recent versions of\n\nPyTorch, TensorFlow and so on. Unfortunately, production teams cannot easily incorporate these into\n\nthe custom ML platform without significant rework. The second limitation is that each platform is tied\n\nto a specific company’s infrastructure. This can limit sharing of efforts among data scientists. As each\n\nframework is so specific, options for deployment can be limited.\n\nThe question then is: Can similar benefits to these systems be provided in an open manner? This evaluation\n\nmust be based on the widest possible mix of tools, languages, libraries and infrastructures. Without this\n\napproach, it will be very difficult for data scientists to evolve their ML models and keep pace with industry\n\ndevelopments. Moreover, by making it available as open source, the wider industry will be able to join in and\n\ncontribute to ML’s wider adoption. This also makes it easier to move between various tools and libraries\n\nover time.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 3: **\u0007** **Introducing MLflow**\n\n**M AT E I Z A H A R I A**\n\nCo-founder and Chief Technologist at Databricks\n\n\nAt Databricks, we believe that there should be a better way to manage the ML lifecycle. So in June 2018,\n\nwe unveiled [MLflow](https://mlflow.org/) , an open-source machine learning platform for managing the complete ML lifecycle.\n\n###### “MLflow is designed to be a cross-cloud, modular, API-first framework, to work well with\n all popular ML frameworks and libraries. It is open and extensible by design, and platform\n agnostic for maximum flexibility.”\n\nWith MLflow, data scientists can now package code as reproducible runs, execute and\n\ncompare hundreds of parallel experiments, and leverage any hardware or software platform\n\nfor training, hyperparameter tuning and more. Also, organizations can deploy and manage\n\nmodels in production on a variety of clouds and serving platforms.\n\n###### “ With MLflow, data science teams can systematically package and reuse models\n across frameworks, track and share experiments locally or in the cloud, and deploy\n models virtually anywhere,” says Zaharia. “The flurry of interest and contributions we’ve\n seen from the data science community validates the need for an open-source framework to\n streamline the machine learning lifecycle.”\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Key benefits\n\n**EXPERIMENT TRACKING** As mentioned previously, getting ML models to perform takes significant trial and error, and continuous configuration, building, tuning, testing,\n\netc. Therefore, it is imperative to allow data science teams to track all that goes into a specific run, along with the results. With MLflow, data scientists can quickly record\n\nruns and keep track of model parameters, results, code and data from each experiment, all in one place.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Key benefits\n\n\n**FLEXIBLE DEPLOYMENT** There is virtually no limit to what machine learning can\n\ndo for your business. 
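As a concrete aside before continuing with deployment, the sketch below illustrates the experiment tracking workflow just described, recording the parameters, a metric and the fitted model for a single run; the experiment name, data set and model choice are placeholder assumptions rather than anything prescribed by MLflow.

```python
import mlflow
import mlflow.sklearn
from sklearn.datasets import load_diabetes
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

mlflow.set_experiment("mlflow-ebook-demo")  # placeholder experiment name

X, y = load_diabetes(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

with mlflow.start_run(run_name="rf_baseline"):
    params = {"n_estimators": 200, "max_depth": 6}
    model = RandomForestRegressor(**params, random_state=42).fit(X_train, y_train)

    # Everything that went into the run — and how it performed — lands in one place.
    mlflow.log_params(params)
    mlflow.log_metric("mse", mean_squared_error(y_test, model.predict(X_test)))
    mlflow.sklearn.log_model(model, artifact_path="model")
```

Each run then appears in the tracking UI next to its siblings, which is what makes side-by-side comparison and reproduction practical.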
However, there are different ways to architect ML applications\n\nfor production, and various tools can be used for deploying models, which often\n\nlead to code rewrites prior to deploying ML models into production. With MLflow,\n\nyour data scientists can quickly download or deploy any saved models to various\n\nplatforms — locally or in the cloud — from experimentation to production.\n\n\n**REPRODUCIBLE PROJECTS** The ability to reproduce a project — entirely or just\n\nparts of it — is key to data science productivity, knowledge sharing and, hence,\n\naccelerating innovation. With MLflow, data scientists can build and package\n\ncomposable projects, capture dependencies and code history for reproducible\n\nresults, and quickly share projects with their peers.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Key benefits\n\n**MODEL MANAGEMENT** Use one central place to share ML models, collaborate on moving them from experimentation to online testing and production, integrate with\n\napproval and governance workflows, and monitor ML deployments and their performance. This is powered by the latest MLflow component, MLflow Model Registry.\n\n**M O D E L D E P L O Y M E N T A N D M O N I T O R I N G**\n\n**I N - L I N E C O D E**\n\n��\n\n**M L L I B R A R I E S**\n\n###### Model Format\n\n**C O N TA I N E R S**\n\n\n**F L AV O R 1**\n\n\n**F L AV O R 2**\n\n**B AT C H A N D S T R E A M S C O R I N G**\n\n\nSimple model flavors\nusable by many tools\n\n\n**C L O U D I N F E R E N C E S E R V I C E S**\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Use case examples\n\nLet‘s examine three use cases to explore how users can leverage some of the MLflow components.\n\n\n**EXPERIMENT TRACKING** A European energy\n\ncompany is using MLflow to track and update\n\nhundreds of energy-grid models. This company’s\n\ngoal is to build a time-series model for every major\n\nenergy producer (e.g., power plant) and consumer\n\n(e.g., factory), monitor these models using standard\n\nmetrics, and combine the predictions to drive\n\nbusiness processes, such as pricing. Because a\n\nsingle team is responsible for hundreds of models,\n\npossibly using different ML libraries, it’s important to\n\nhave a standard development and tracking process.\n\nThe team has standardized on Jupyter notebooks\n\nfor development, MLflow Tracking for metrics, and\n\nDatabricks Jobs for inference.\n\n\n**REPRODUCIBLE PROJECTS** An online marketplace\n\nis using MLflow to package deep learning jobs using\n\nKeras and run them in the cloud. Each data scientist\n\ndevelops models locally on a laptop using a small\n\ndata set, checks them into a Git repository with\n\nan MLproject file, and submits remote runs of the\n\nproject to GPU instances in the cloud for large-scale\n\ntraining or hyperparameter search. Using MLflow\n\nProjects makes it easy to create the same software\n\nenvironment in the cloud and share project code\n\namong data scientists.\n\n\n**MODEL PACKAGING** An e-commerce site’s data\n\nscience team is using MLflow Model Registry to\n\npackage recommendation models for use by\n\napplication engineers. This presents a technical\n\nchallenge because the recommendation\n\napplication includes both a standard, off-the-shelf\n\nrecommendation model and custom business logic\n\nfor pre- and post-processing. For example, the\n\napplication might include custom code to ensure the\n\nrecommended items are diverse. 
This business logic\n\nneeds to change in sync with the model, and the data\n\nscience team wants to control both the business logic\n\nand the model, without having to submit a patch to\n\nthe web application each time the logic has to change.\n\nMoreover, the team wants to A/B test distinct models\n\nwith distinct versions of the processing logic. The\n\nsolution was to package both the recommendation\n\nmodel and the custom logic using the python_\n\nfunction flavor in an MLflow Model, which can then\n\nbe deployed and tested as a single unit.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Open and extensible by design\n\nSince we [unveiled](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) and open sourced MLflow in June 2018 at the Spark + AI Summit in San Francisco, community engagement and contributions have led to an impressive\n\narray of new features and integrations:\n\n\n**SUPPORT FOR MULTIPLE**\n\n**PROGRAMMING LANGUAGES**\n\nTo give developers a choice, MLflow supports R,\n\nPython, Java and Scala, along with a REST server\n\ninterface that can be used from any language.\n\n\n**INTEGRATION WITH POPULAR ML**\n\n**LIBRARIES AND FRAMEWORKS**\n\nMLflow has built-in integrations with the most popular\n\nmachine learning libraries — such as scikit-learn,\n\nTensorFlow, Keras, PyTorch, H2O, and Apache Spark™\n\nMLlib — to help teams build, test and deploy machine\n\nlearning applications.\n\n\n**CROSS-CLOUD SUPPORT**\n\nOrganizations can use MLflow to quickly deploy\n\nmachine learning models to multiple cloud services,\n\nincluding Databricks, Azure Machine Learning and\n\nAmazon SageMaker, depending on their needs.\n\nMLflow leverages AWS S3, Google Cloud Storage and\n\nAzure Data Lake Storage, allowing teams to easily\n\ntrack and share artifacts from their code.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Rapid community adoption\n\n## 2.5M\n#### monthly downloads\n\n## 200+\n#### code contributors\n\n\n## 100+\n#### contributing organizations\n\n\nOrganizations using and contributing to MLflow\n\nSource: [mlflow.org](https://mlflow.org)\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 4: **\u0007** **A Closer Look at**\n\n#### MLflow Model Registry\n\n\nMLflow originally introduced the ability to [track metrics, parameters and artifacts](https://www.mlflow.org/docs/latest/tracking.html#) as part of experiments,\n\n[package models and reproducible ML projects](https://www.mlflow.org/docs/latest/projects.html) , and [deploy models to batch or to real-time serving platforms](https://www.mlflow.org/docs/latest/models.html) .\n\nThe latest MLflow component — MLflow Model Registry — builds on MLflow’s original capabilities to\n\nprovide organizations with one central place to share ML models, collaborate on moving them from\n\nexperimentation to testing and production, and implement approval and governance workflows.\n\n��\n\n\n**Model Registry**\n\n\n**D O W N S T R E A M**\n\n\n��\n\n**Tracking Server**\n\n\nData Scientists\n\n**Staging**\n\n\nData Engineers\n\n**Production** **Archived**\n\n**A U T O M AT E D J O B S**\n\n\n**Parameters**\n\n\n**Metrics** **Artifacts**\n\n\nThe Model Registry gives MLflow users new\n\n\ntools for sharing, reviewing and managing\n\nML models throughout their lifecycle\n\n\n**Metadata** **Models**\n\n**R E S T S E R V I N G**\n\n**R E V I E W E R S + C I / C D T O O L S**\n\nThe MLflow Model Registry 
complements the MLflow offering and is designed to help organizations\n\nimplement good engineering principles with machine learning initiatives, such as collaboration,\n\ngovernance, reproducibility and knowledge management. The next few pages highlight some of the key\n\nfeatures of this new component.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\n###### One hub for managing ML models collaboratively\n\nBuilding and deploying ML models is a team sport. Not only are the responsibilities\n\nalong the machine learning model lifecycle often split across multiple people\n\n(e.g., data scientists train models whereas production engineers deploy them),\n\nbut also at each lifecycle stage, teams can benefit from collaboration and sharing\n\n\n###### Flexible CI/CD pipelines to manage stage transitions\n\nMLflow lets you manage your models’ lifecycles either manually or through\n\nautomated tools. Analogous to the approval process in software engineering,\n\nusers can manually request to move a model to a new lifecycle stage (e.g., from\n\nstaging to production), and review or comment on other users’ transition requests.\n\n\n(e.g., a fraud model built in one part of the organization could be reused in others).\n\nAlternatively, you can use the Model Registry’s API to plug in continuous integration\n\n\nMLflow facilitates sharing of expertise and knowledge across teams by making ML\n\nmodels more discoverable and providing collaborative features to jointly improve\n\non common ML tasks. Simply register an MLflow model from your experiments to\n\n\nand deployment (CI/CD) tools, such as Jenkins, to automatically test and transition\n\nyour models. Each model also links to the experiment run that built it — in MLflow\n\nTracking — to let you easily review models.\n\n\nget started. 
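As a minimal sketch of that step (the model name and run ID are illustrative placeholders), registration and a manual stage transition can be performed through the MLflow API:

```python
import mlflow
from mlflow.tracking import MlflowClient

# Register the model logged in an earlier experiment run
run_id = "<run id from MLflow Tracking>"  # illustrative placeholder
version = mlflow.register_model(f"runs:/{run_id}/model", "fraud_detection")

# Move the new version through lifecycle stages, or wire this call into CI/CD
client = MlflowClient()
client.transition_model_version_stage(
    name="fraud_detection",
    version=version.version,
    stage="Staging",
)
```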
The MLflow Model Registry will then let you track multiple versions\n\nof the model and mark each one with a lifecycle stage: development, staging,\n\nproduction or archived.\n\n\nSample machine learning\nmodels displayed via the\nMLflow Model Registry\ndashboard\n\n\nThe machine learning model\npage view in MLflow, showing\nhow users can request and\nreview changes to a model’s\nstage\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Visibility and governance for the full ML lifecycle\n\nIn large enterprises, the number of ML models that are in development, staging\n\nand production at any given point in time may be in the hundreds or thousands.\n\nHaving full visibility into which models exist, what stages they are in and who\n\nhas collaborated on and changed the deployment stages of a model allows\n\norganizations to better manage their ML efforts.\n\nMLflow provides full visibility and enables governance by keeping track of each\n\nmodel’s history and managing who can approve changes to the model’s stages.\n\nIdentify versions, stages and\nauthors of each model\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 5: **\u0007** **Making Organizations**\n\n#### Successful with ML\n\n\nStandardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\n\ntrack experiments, compare results, reproduce runs and productionize faster.\n\nIn addition to increasing data science team productivity and collaboration and applying good engineering\n\npractices to machine learning, organizations also need to do the following:\n\n\n**Reliably ingest, ETL and**\n\n**catalog big data**\n\n\n**Work with state-of-the-art**\n\n**ML frameworks and tools**\n\n\n**Easily scale compute from**\n\n**single to multi-node**\n\n\nDatabricks excels at all the above. Learn more at [databricks.com](https://databricks.com)\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\nCHAPTER 6: **\u0007** **Introducing the Unified**\n\n#### Data Analytics Platform\n\n\nDatabricks accelerates innovation by unifying data science, engineering and business. 
Through a fully\n\nmanaged, cloud-based service built by the original creators of Apache Spark, Delta Lake and MLflow, the\n\nDatabricks Unified Data Analytics Platform lowers the barrier for enterprises to innovate with AI and\n\naccelerates their innovation.\n\n**DATA ENGINEERS** **DATA SCIENTISTS** **ML ENGINEERS** **DATA ANALYSTS**\n\n\n**BI INTEGRATIONS**\n\n**Access all your data**\n\n\n**DATA SCIENCE WORKSPACE**\n\n**Collaboration across the lifecycle**\n\n**UNIFIED DATA SERVICE**\n\n**High-quality data with great performance**\n\n\n\n**ENTERPRISE CLOUD SERVICE**\n\n**A simple, scalable and secure managed service**\n\n##### RAW DATA LAKE\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n\n###### Data engineering\n\nSpeed up the preparation of high-quality\n\ndata, essential for best-in-class ML\n\napplications, at scale\n\n\n###### Data science\n\nCollaboratively explore large data sets,\n\nbuild models iteratively and deploy across\n\nmultiple platforms\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Providing managed MLflow on Databricks\n\nMLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\n\npackaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\n\nBy using MLflow as part of Databricks, data scientists can:\n\n\n**WORKSPACES**\n\nBenefit from a streamlined\n\nexperiment tracking experience\n\nwith Databricks Workspace and\n\ncollaborative Notebooks\n\n\n**BIG DATA SNAPSHOTS**\n\nTrack large-scale data that fed\n\nthe models, along with all the\n\nother model parameters, then\n\n\n**JOBS**\n\nEasily initiate jobs remotely, from\n\nan on-premises environment or\n\nfrom Databricks notebooks\n\n\n**SECURITY**\n\nTake advantage of one common\n\nsecurity model for the entire\n\nmachine learning lifecycle\n\n\nreproduce training runs reliably\n\n\nRead our [blog](https://databricks.com/blog/2019/03/06/managed-mlflow-on-databricks-now-in-public-preview.html) to learn more about these integrations.\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Getting data ready for ML with Delta Lake\n\nDelta Lake is a storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions and scalable metadata handling, and it unifies streaming and batch\n\ndata processing. 
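A minimal sketch of that unification (paths are illustrative; on Databricks the `spark` session is already provided, while outside Databricks the delta-spark package must be installed and configured): the same Delta table is written once and then read by both batch jobs and streaming queries.

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # already available in Databricks notebooks

# Write training data once, in Delta format
events = spark.range(1_000).withColumnRenamed("id", "event_id")
events.write.format("delta").mode("overwrite").save("/tmp/delta/events")

# The same table serves batch reads for model training...
batch_df = spark.read.format("delta").load("/tmp/delta/events")

# ...and incremental streaming reads, with ACID guarantees on both paths
stream_df = spark.readStream.format("delta").load("/tmp/delta/events")
```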
Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs.\n\nBy using Delta Lake, data engineers and data scientists can keep track of data used for model training.\n\nFiles ML Runtime\n\n- \u0007Schema enforced high\n\nquality data\n\n\n\n- Optimized performance\n\n��\n\n- \u0007Full data lineage /\n\ngovernance\n\n- \u0007reproductibility through\n\ntime travel\n\n\nStreaming\n\nBatch\n\n\nIngestion\n\nTables\n\n\nIngestion\n\n\nData\n\nCatalog\n\n\nData\n\n\nFeature\n\nStore\n\n\nFeature\n\n\n**Y O U R E X I S T I N G D E LTA L A K E**\n\n\n3rd Party Data\n\nMarketplace\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\n###### Ready-to-use ML environments\n\nDatabricks Runtime for Machine Learning provides data scientists and ML practitioners with on-demand access to ready-to-use machine learning clusters that are\n\npreconfigured with the latest and most popular machine learning frameworks, including TensorFlow, Keras, PyTorch, scikit-learn, XGBoost and Horovod.\n\nBy using the Databricks Runtime for ML, data scientists can get to results faster with one-click access to ML clusters, optimized performance on popular ML algorithms,\n\nand simplified distributed deep learning on Horovod and GPUs. It also supports Conda for further customization.\n\n\n**P A C K A G E S A N D O P T I M I Z E S M O S T**\n\n**C O M M O N M L F R A M E W O R K S**\n\n\n**C U S T O M I Z E D E N V I R O N M E N T S**\n\n**U S I N G C O N D A**\n\n\n**C U S T O M I Z E D E N V I R O N M E N T S**\n\n\nrequirements.txt\nconda.yaml\n\n\n**...**\n\n\n**B U I LT- I N O P T I M I Z AT I O N F O R**\n\n**D I S T R I B U T E D D E E P L E A R N I N G**\n\nDistribute and Scale any Single-Machine\nML Code to thousands of machines\n\n\n**B U I LT- I N A U T O M L A N D**\n\n**E X P E R I M E N T T R A C K I N G**\n\n\nMachine\n\nLearning\n\n\nMachine\n\n\n\nAuto ML and Tracking /\nVisualizations with MLflow\n\n\nConda-\n\nBased\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\nCHAPTER 7: **\u0007** **Standardizing the**\n\n#### Machine Learning\n Lifecycle on Databricks\n\n**B U I L D M O D E L**\n**P R E P D ATA**\n\n��\n\nAzure ML\n\n**D E P L O Y M O D E L**\n\n��\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\nCHAPTER 8: **\u0007** **Getting Started**\nTake the next step toward standardizing your ML lifecycle — test drive MLflow and the\n\nDatabricks Unified Data Analytics Platform.\n\n**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z E D D E M O](https://databricks.com/contact)**\n\n**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\n\n\n-----\n\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\n\nCHAPTER 8: **\u0007** **Comparison Matrix**\n\n|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W   Self-hosted             
   |M A N A G E D M L F L O W O N D ATA B R I C K S   Fully managed    With remote execution             |\n|---|---|---|\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_improper_payments_eBook_v4_image.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "### Technical Migration Guide\n\n# Strategies to Evolve Your Data Warehouse to the Databricks Lakehouse\n\n\n-----\n\n## Contents Lakehouse Architecture 3\n\nThe Databricks Lakehouse Platform 4\n\nBusiness Value 5\n\nSingle source of truth 5\n\nData team 6\n\nFuture-proof 6\n\nMigration to Lakehouse 7\n\nOverview 7\n\nMigration strategy 8\n\nMigration planning 9\n\nELT approach 12\n\nAgile modernization 15\n\nSecurity and data governance 17\n\nTeam involvement 19\n\nConclusion 19\n\n\n-----\n\n## Lakehouse Architecture\n\n\nData warehouses were designed to provide a central data repository\n\nwith analytic compute capabilities to help business leaders\n\nget analytical insights, support decision-making and business\n\nintelligence (BI). Legacy on-premises data warehouse architectures\n\nare difficult to scale and make it difficult for data teams to keep up\n\nwith the exponential growth of data. Oftentimes data teams publish\n\nand use a subset of well-defined data for development and testing.\n\nThis slows down both innovation and time to insight.\n\nCloud data warehouses (CDW) were an attempt to tackle the\n\non-premises data warehouse challenges. CDWs removed the\n\nadministrative burden of tasks such as setup, upgrades and\n\nbackups. CDWs also improved scalability and introduced cloud’s\n\npay-as-you-go model to reduce cost. CDWs leverage a proprietary\n\ndata format to achieve cloud-scale and performance; however, this\n\nalso leads to customers locked into these formats with difficult\n\n\nBut enterprise data teams don’t need a better data warehouse.\n\nThey need an innovative, simple solution that provides reliable\n\nperformance, elastic scale and allows self-service to unblock\n\nanalytics to access all data at a reasonable cost. The answer is\n\nthe lakehouse.\n\nThe lakehouse pattern represents a paradigm shift from traditional\n\non-premises data warehouse systems that are expensive and\n\ncomplex to manage. It uses an open data management architecture\n\nthat combines the flexibility, cost-efficiency and scale of data\n\nlakes with the data management and ACID semantics of data\n\nwarehouses. A lakehouse pattern enables data transformation,\n\ncleansing and validation to support both business intelligence and\n\nmachine learning (ML) users on all data. Lakehouse is cloud-centric\n\nand unifies a complete up-to-date data set for teams, allowing\n\ncollaboration across an organization.\n\n\npaths to support use cases outside the data warehouse itself\n\n(i.e., machine learning). Customers often find themselves with a\n\nbifurcated architecture, which ultimately leads to a more costly and\n\ncomplex data platform over time.\n\n\n-----\n\n## The Databricks Lakehouse Platform\n\nThe Databricks Lakehouse Platform is **simple** ; it unifies your data, governance, analytics\n\nand AI on one platform. 
It’s **open** — the open source format Delta Lake unifies your data\n\necosystem with open standards and data formats. Databricks is **multicloud** — delivering\n\none **consistent experience across all clouds** so you don’t need to reinvent the wheel for\n\nevery cloud platform that you’re using to support your data and AI efforts.\n\nDatabricks SQL stores and processes data using Delta Lake to simplify and enhance\n\ndata warehousing capabilities. Analysts can use their favorite language, SQL, popular\n\ntransformation tools such as dbt, and preferred BI tools like Power BI and Tableau to\n\nanalyze data. The built-in query editor reduces contextual switching and improves\n\nproductivity. Administrators enjoy simplified workload management via serverless\n\ncompute and auto-scaling to meet high-concurrency workload needs. All this at a\n\nfraction of the cost of traditional data warehouses.\n\n\n###### Lakehouse Platform\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData S�ien��\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData relia)ility and .erfor2ance\n\nCloud Data Lake\nAll structured and unstructured data\n\nSimple Open Multicloud\n\n\n-----\n\n## Business Value\n\n#### Single source of truth\n\nDatabricks Delta Lake leverages cloud-based blob storage to provide an infinitely\n\nscalable storage layer where you can store all your data, including raw and historical data,\n\nalongside structured data tables in the data warehouse. The lakehouse pattern avoids\n\ndata silos and shares the same elastic scale and governance across all use cases: BI, data\n\nengineering, streaming and AI/ML. This means that data engineering teams don’t have to\n\nmove data to a proprietary data warehouse for business analysts or create a separate\n\ndata store to support data science.\n\nInstead, data teams can access the open format Delta tables directly and combine data\n\nsets in the lakehouse, as needed. Data scientists can also work collaboratively on common\n\ndata with access to versioned history to facilitate repeatable experiments. A single source\n\nof truth facilitates moving from descriptive to predictive analytics.\n\n\n-----\n\n#### Data team\n\n\nWith central data governance and fine-grained access control\n\ncapabilities to secure the lakehouse, you can enable self-service\n\nSQL analytics for everyone on the Databricks Lakehouse Platform.\n\nThis allows each team to be more agile and innovate faster.\n\n**Data Analysts** — Using the Databricks SQL editor\n\nor their tools of choice (DBT, Power BI, Tableau), SQL\n\nanalysts can leverage familiar toolsets.\n\n**Data Engineers** — Utilizing Delta Lake as a unified\n\nstorage layer, data engineering teams can eliminate\n\nduplicate data and ETL jobs that move data across\n\nvarious systems. Databricks supports both batch and\n\nstreaming workloads to reduce bottlenecks and serve\n\nthe most up-to-date data to downstream users and\n\napplications.\n\n**Administrators** — The pay-as-you-go, decentralized\n\ncompute resource allows each team to run their\n\n\nThe Databricks Lakehouse Platform provides a reliable ETL and data\n\nmanagement framework to simplify ETL pipelines. Data teams can\n\nbuild end-to-end data transformations in a single pipeline instead of\n\nmany small ETL tasks. Databricks supports data quality enforcement\n\nto ensure reliability with auto-scalable infrastructure. Your teams\n\ncan onboard new data sources quickly to power new use cases with\n\nfresh data. 
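One common way to onboard new sources incrementally on Databricks is Auto Loader (covered again in the migration section later); a minimal sketch, with illustrative landing path, schema location and table name:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()  # provided in Databricks notebooks

# Incrementally ingest newly arriving JSON files into a Bronze Delta table
bronze = (
    spark.readStream.format("cloudFiles")              # Databricks Auto Loader
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/tmp/schemas/orders")
    .load("/mnt/landing/orders")
)

(
    bronze.writeStream
    .option("checkpointLocation", "/tmp/checkpoints/orders_bronze")
    .trigger(availableNow=True)                         # drain what's available, then stop
    .toTable("bronze_orders")
)
```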
This not only allows your team to efficiently and reliably\n\ndeliver high-quality data in a timely manner, it also reduces ETL\n\nworkload cost significantly.\n\n#### Future-proof\n\nUnlike CDWs that lock customers in, Databricks offers an open\n\nplatform with open standards, open protocols and open data\n\nformats. It supports a full range of popular languages (SQL, Python,\n\nR, Scala) and popular BI tools. You can leverage the performant\n\nand low-cost distributed compute layer for data processing — or\n\nuse a variety of tools and engines to efficiently access the data via\n\nDatabricks APIs. Databricks also allows data consumption with a rich\n\npartner ecosystem. Teams can handle all existing BI and AI use cases\n\nwith the flexibility to support future use cases as they emerge.\n\n\nworkload in isolated environments without worrying\n\nabout contention. Serverless SQL endpoint frees your\n\nteam from infrastructure management challenges.\n\n\n-----\n\n## Migration to Lakehouse\n\n#### Overview\n\nA lakehouse is the ideal data architecture for data-driven organizations. It combines the\n\nbest qualities of data warehouses and data lakes to provide a single solution for all major\n\ndata workloads and supports use cases from streaming analytics to BI, data science and\n\nAI. The Databricks Lakehouse Platform leverages low-cost, durable cloud storage and\n\nonly consumes (charges for) compute resources when workloads are running. This pay-\n\n\n**C U S T O M E R S T O R Y**\n##### Building the Lakehouse\n at Atlassian\n\n[Watch now](https://www.youtube.com/watch?v=Xo1U617T-mU)\n\n\nas-you-go model means compute resources are automatically shut down if no processing\n\nis needed. Data teams can use small clusters that can power individual workloads\n\nthey plan to migrate. They can make the choice to leverage serverless SQL endpoints\n\nand completely free data teams from infrastructure capacity planning and cluster\n\nmaintenance. The auto-scaling, elastic nature of Databricks clusters leads to significant\n\nsavings on infrastructure cost and maintenance. Organizations typically achieve 50% TCO\n\nsavings compared to other cloud data warehouses.\n\nData warehouse migration is never an easy task. Databricks aims to mitigate the things\n\nthat can go wrong in these demanding migration projects. The Databricks Lakehouse\n\nPlatform provides many out-of-the-box features to mitigate migration risks.\n\n**C U S T O M E R S T O R Y**\n##### Driving Freight Transportation Into the Future\n\n[Read more](https://databricks.com/customers/jbhunt)\n\n\n-----\n\n#### Migration strategy\n\n\nMigration is a huge effort and very expensive. Yet, almost every\n\nenterprise has to migrate to new platforms every 3–5 years because\n\nthe old platform cannot support new use cases, catch up with\n\ndata growth or meet scaling needs. To get better ROI on migration,\n\nimplement a migration strategy that can reduce future re-platform\n\nneeds and extend to your future data and AI strategy.\n\nUse the opportunity of a data migration to standardize your data\n\nin open Delta format to allow existing and future tools to access\n\nit directly without moving or converting it. Merge your siloed\n\ndata warehouses into the unified storage layer in the Databricks\n\nLakehouse Platform — without worrying about storage capacity.\n\nThe unified storage layer allows your team to deploy a unified data\n\ngovernance on top to secure all data access consistently. 
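Both the format standardization and the unified storage layer described above are typically realized with Delta; as a rough sketch (paths and table names are illustrative), existing Parquet data can be converted in place without rewriting it, and new extracts can be written straight to Delta tables:

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Convert an existing Parquet directory to a Delta table in place
spark.sql("CONVERT TO DELTA parquet.`/mnt/datalake/sales`")

# Land migrated warehouse extracts directly as managed Delta tables
(
    spark.read.parquet("/mnt/exports/dim_customer")
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("dim_customer")
)
```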
Simplify\n\nyour data governance story with Databricks Unity Catalog.\n\n\nMove toward a single, consistent approach to data pipelining\n\nand refinement. Merge batch and streaming into a single end-\n\nto-end pipeline to get fresher data and provide more real-time\n\ndecisions. Take a metadata-driven approach to align the dataflow\n\nwith business processes and have data validation and quality\n\ncheck built-in. Through a series of curation and refinement steps,\n\nthe output results in highly consumable and trusted data for\n\ndownstream use cases.\n\nThe lakehouse architecture makes it possible for the organization\n\nto create “data assets” by taking a stepwise approach to improving\n\ndata and serving all essential use cases. Encourage your BI/analyst\n\nteam to leverage Databricks serverless endpoints for self-serve\n\nand agility. Each team can evaluate their top priority workloads and\n\nmigrate them in parallel to speed up migration.\n\nTake advantage of Databricks’ rich partner ecosystem. Your favorite\n\npartners are likely already integrated via Partner Connect and\n\ncan be set up with a few clicks. There are also many ISV and SI\n\nconsulting partners who can help your migration journey.\n\n\n-----\n\n#### Migration planning\n\nMigrating a data warehouse to the cloud can be time consuming and challenging for your\n\ndata teams. It’s important to agree on the data architecture, migration strategy and process/\n\nframeworks to be used before undertaking a data migration. Databricks provides Migration\n\nAssessment and Architecture Review sessions to develop a joint migration roadmap. This\n\nprocess is designed to help organizations to successfully migrate to a lakehouse architecture.\n\nBased on information collected and business objectives, the Databricks team will work with\n\ncustomers to propose a target architecture and provide a tailored migration roadmap.\n\nThese assessments help get a full picture of current data systems and the future vision. They\n\nclarify what you are migrating and do proper use case discovery. This includes identifying\n\nworkloads and data source dependency, for example:\n\nSample migration assessment checklist:\n\nIdentify upstream data sources and workload dependencies\n\nIdentify active/inactive data sets and database objects\n\nIdentify downstream application dependencies and data freshness requirements\n\nDefine a cost-tracking mechanism, such as tag rules for chargeback and cost attribution\n\nDefine security requirements and data governance\n\nClarify access management need, document needed permissions per user/group\n\nOutline current tooling (ingestion, ETL and BI) and what’s needed\n\n\n-----\n\nIt’s important to identify key stakeholders and keep them engaged during the migration to\n\nmake sure they are aligned with the overall objectives. The workload assessment result will\n\nbe reviewed with key stakeholders. Through the review process, data teams can get a better\n\nunderstanding of which workloads can most benefit from modernization.\n\nDatabricks often works with partners to provide a workload assessment and help customers\n\nunderstand their migration complexity and properly plan a budget. Databricks also partners\n\nwith third-party vendors that provide migration tools to securely automate major migration\n\ntasks. 
Databricks Partner Connect makes it easy to connect with this ecosystem of tools to help with the migration, including:

- Code conversion tooling that can automatically translate 70%–95% of the SQL code in your current system to Databricks-optimized code with Delta and other best practices

- Converters that automate the conversion of multiple GUI-based ETL/ELT platforms to reduce migration time and cost

- Data migration tools that can migrate data from on-premises storage to cloud storage 2x–3x faster than what was previously possible

-----

#### We can use automated conversion for most workload types

| Migration area | EDW source | Databricks equivalent |
|---|---|---|
| Data migration | DB-locked formats on disks | Open cloud storage (ADLS, S3, GCP Storage) |
| Metastore migration | Databases, tables, views | Databricks tables and views |
| SQL migration | Ad hoc SQL queries; T-SQL, PL/SQL, BTEQ; reports from Power BI, Tableau, etc. | Spark SQL in Databricks notebooks; Spark SQL with a little Python or Scala; runs on Databricks via JDBC/ODBC |
| Security | GRANTs, roles; external tables and file permissions | Databricks permissions (table ACLs); credential pass-through to files |
| ETL tools | DataStage, PowerCenter, Ab Initio, etc. | Big data ETL tools, Databricks notebooks |
| Orchestration | ETL schedulers | Airflow DAGs, ADF, Databricks Jobs and any other enterprise schedulers |

-----

#### ELT approach

The separation of storage and compute makes ELT on lakehouse a better choice than traditional ETL. You can ingest all raw data to Delta Lake, leverage low-cost storage and create a medallion data implementation from raw/Bronze to curated/Gold depending on what’s needed to support use cases. During ingestion, basic data validation can occur, but establishing a Bronze data layer is the foundation of a single pane of glass for the business. Teams can leverage compute resources as needed without a fixed compute infrastructure. Establishing a Silver layer further enriches data by exploring and applying transformations. ELT allows data teams to break pipelines into smaller “migrations,” starting with a simple workload, then improving the pipeline design iteratively.

Figure: Improve data quality — raw data (CSV, TXT, JSON; streaming and batch) lands in Bronze tables (raw integration), is refined into Silver tables (filtered, cleaned, augmented) and aggregated into Gold tables (business-level aggregates) that feed streaming analytics and reporting from the data lake.

-----

We highly recommend leveraging [Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables), a cloud-native managed service in the Databricks Lakehouse Platform that provides a reliable ETL framework to modernize your data pipeline at scale. Instead of migrating multiple ETL tasks one by one in a traditional data warehouse, you can focus on sources and expected outputs, and create your entire dataflow graph declaratively. Delta Live Tables offers:

- A metadata-driven approach — you just specify what data should be in each table or view rather than the details of how processing should be done

- An end-to-end data pipeline with data quality and freshness checks, end-to-end monitoring/visibility, error recovery, and lineage, which reduces the strain on data engineering teams and improves time-to-value in building data pipelines

- Automatic management of all the dependencies within the pipeline. 
This ensures all tables\n\nare populated correctly, whether continuously or on a regular schedule. For example,\n\nupdating one table will automatically trigger all downstream table updates to keep data\n\nup-to-date.\n\n\u0007All pipelines are built code-first, which makes editing, debugging and testing of data\n\npipelines simpler and easier. DLT can also automatically recover from common error\n\nconditions, reducing operational overhead.\n\n\n-----\n\n#### Agile modernization\n\n\nAgile development allows teams to move quickly knowing migrated\n\npipelines can be revisited at a later cycle and evolving data models\n\nare supported within the architecture. Allowing business impact to\n\ndrive priorities via an agile approach helps mitigate migration risks.\n\nPrioritizing and selecting use cases where modernization brings\n\nbusiness benefits quickly is a good starting point. Focus on the 20%\n\nof workloads that consume 80% of budget. By breaking workflows\n\ndown into components and managing data stories, teams can adjust\n\npriorities over time. Changes can be made in collaboration with the\n\nuser community to fit the business definition of value.\n\nMigrating to a lakehouse architecture leverages separation of storage\n\nand compute to remove resource contention between ETL and BI\n\nworkloads. As a result, the migration process can be more agile,\n\nallowing you to evolve your design iteratively without big-bang effort:\n\n\u0007Reduce time during the initial phase on full capacity plan and\n\n\nAll of this allows you to take a more iterative and business-focused\n\napproach for migration instead of a full planning, execution, test/\n\nvalidation approach. Here are more approaches that help facilitate\n\nthis phased implementation:\n\n\u0007Leverage [Databricks Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . Auto Loader helps to ingest\n\nnew data into pipelines quicker to get data in near real-time.\n\n\u0007Delta Live Tables (DLT) improves data quality during data\n\ntransformation and automatically scales to address data volume\n\nchange. DLT can also support schema evolution and quarantine\n\nbad data or data that needs to be reprocessed at a later stage.\n\n\u0007Use dedicated clusters to isolate workloads, lower the total cost\n\nof ownership and improve overall performance. By using multiple\n\nclusters, we can shut down resources when not in use and move\n\naway from managing fixed resources in a single large cluster.\n\n\nscoping\n\n\u0007Flexible cloud infrastructure and unlimited, autoscaling storage\n\n\u0007Workload management is much simpler, you can isolate each\n\nworkload with a dedicated compute resource, without worrying\n\nabout managing workload contention\n\n\u0007Auto-scale and tear down the compute resources after the job\n\nis done to achieve cost efficiency\n\n\n-----\n\nLeverage Databricks’ deep bench of expertise to build reusable assets along the migration:\n\n\u0007Create a migration factory for iterative migration process\n\n\u0007Determine and implement a security and governance framework\n\n\u0007Establish a to-be environment and move use cases/workloads in logical units\n\n\u0007Prove business value and scale over time\n\n\u0007Add new functionality continuously so important business requirements are not left on hold during migration\n\nTake this iterative and templated approach. Migration speed will accelerate. 
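To give a flavor of the declarative style Delta Live Tables brings to these iterations, here is a sketch of a single pipeline stage (it only runs inside a DLT pipeline, and the table, path and expectation names are illustrative):

```python
import dlt
from pyspark.sql.functions import col

# One declarative pipeline stage: describe the output table and its quality rules;
# DLT resolves dependencies, handles retries and keeps downstream tables up to date.
@dlt.table(comment="Cleaned orders ready for business-level aggregation")
@dlt.expect_or_drop("positive_amount", "amount > 0")   # drop rows failing the check
def silver_orders():
    return (
        dlt.read_stream("bronze_orders")               # upstream table managed by DLT
        .where(col("order_status").isNotNull())
    )
```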
Customers can finish migration 15%–20% faster and reduce the amount of tech debt created during the migration.

Figure: an iterative delivery model — build foundations first, then parallelize iterations of “make it work,” “make it work right” and “make it fast” across full-lifecycle (lighthouse) workloads, where each iteration covers migration, functionality, and optimization with Delta. Leverage Databricks’ deep bench of expertise to build out some **templates for the most effective Databricks implementation.** Take an **iterative, bite-sized approach** to migration, reduce tech debt and rework, and bring forward the value of the solution earlier.

-----

To maximize the value of your lakehouse, you should consider retiring some legacy architecture design patterns. Leverage the migration process to simplify data warehousing tasks. Regardless of how you complete your migration, you can utilize lakehouse strengths to improve architectural patterns:

- Merge your siloed data warehouses on your unified lakehouse platform and unify data access and data governance via Unity Catalog. The lakehouse architecture provides a unified storage layer for all your data where there is no physical boundary between data. There is no need to keep data copies for each system using the data set. Clean up and remove jobs that are created to keep data in sync across various data systems. Keep a single copy of raw data in your lakehouse as a single source of truth.

- The Databricks Lakehouse Platform allows you to merge batch and streaming into a single system to build a simple continuous data flow model to process data as it arrives. Process data in near real-time and enable data-driven decisions with the most recent updates.

- Simplify your workload isolation and management by running jobs in dedicated clusters. Separating storage and compute allows you to easily isolate each task with isolated compute resources. There is no need to squeeze them into a single large data appliance and spend lots of time managing and coordinating resources. Leverage the elasticity of the Databricks compute layer to automatically handle workload concurrency changes at peak time instead of paying for over-provisioned resources for most of the time. This greatly simplifies the workload management effort that traditional data warehouses require.

- Simplify disaster recovery. Storage and compute separation allows easy disaster recovery. Cloud storage provides very good data redundancy and supports automated replication to another region. Customers can spin up compute resources quickly in another region and maintain service availability in case of an outage.

-----

#### Security and data governance

Security is paramount in any data-driven organization. Data security should enforce the business needs for both internal and external data, so the lakehouse should be set up to meet your organization’s security requirements. 
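At the data layer, those requirements usually translate into explicit grants on tables and views; a minimal sketch (catalog, schema, table and group names are illustrative):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Grant read access to an analyst group and revoke it when no longer needed
spark.sql("GRANT SELECT ON TABLE main.sales.orders TO `data-analysts`")
spark.sql("REVOKE SELECT ON TABLE main.sales.orders FROM `contractors`")
```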
Databricks provides built-in security to protect your data during and after migration.

- Encrypt data at rest and in transit, using a cloud-managed key or your own

- Set up a custom network policy and use IP ranges to control access

- Leverage Private Link so network traffic does not traverse the public internet

- Enable SSO and integrate with Active Directory and other IdPs

- Control data access to database objects using RBAC

- Enable audit logs to monitor user activities

The challenge with the traditional data warehouse and data lake architecture is that data is stored in multiple stores and your data team also needs to manage data access and data governance twice. The lakehouse pattern uses unified storage, which simplifies governance. The Databricks Lakehouse Platform provides a unified governance layer across all your data teams. Migrating to Databricks Unity Catalog provides data discovery, data lineage, role-based security policies, table or row/column-level access control, and central auditing capabilities that make the data platform easy for data stewards to confidently manage and secure data access to meet compliance and privacy needs, directly on the lakehouse.

-----

Figure: centralized governance — account-level user management, credentials, access control (ACL store), metastore, audit log, lineage explorer and data explorer.

-----

#### Team involvement

Plan to educate and train your team iteratively throughout the migration process. As new workloads are migrated, new teams will gain exposure to the lakehouse pattern. Plan to ramp up new team members as the migration process progresses, developing a data Center of Excellence within the organization. Databricks provides a cost-effective platform for ad hoc work to be performed. A sandbox environment can be leveraged for teams to get exposure to Databricks technology and gain hands-on experience. Databricks also provides [learning path](https://databricks.com/learn/training/home) training for customers. Encourage teams to get hands-on experience relevant to their immediate tasks, gain exposure to new things and try new ideas.

#### Conclusion

Data warehouse migration touches many business areas and impacts many teams, but the Databricks Lakehouse Platform simplifies this transition, reduces risks and accelerates your ROI. The Databricks Business Value Consulting team can work with you to quantify the impact of your use cases to both data and business teams. And the Databricks team of solution architects, professional services, and partners are ready to help.

Reach out to your Databricks account team or send a message to [sales@databricks.com](mailto:sales%40databricks.com?subject=) to get started.

#### Additional resources

[Migrate to Databricks](https://databricks.com/solutions/migration)

[Modernize Your Data Warehouse](https://databricks.com/p/webinar/apj-modernize-your-data-warehouse)

-----

##### About Databricks

Databricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. 
Founded by the original\n\ncreators of Apache Spark™, Delta Lake and MLflow, Databricks is on a mission to help\n\ndata teams solve the world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[Sign up for a free trial](https://databricks.com/try-databricks)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "2024-09-19T16:57:21Z" + ], + [ + "**The**\n**Delta Lake**\n**Series**\n**Lakehouse**\n\nCombining the best elements of\ndata lakes and data warehouses\n\n\n-----\n\n###### Here’s what\n#### What’s \n###### you’ll find inside\n#### inside?\n\n\nThe Delta Lake Series of eBooks is published\n\n\nby Databricks to help leaders and practitioners\n\nunderstand the full capabilities of Delta Lake as\n\n\n**Introduction**\n**What is Delta Lake?**\n\n\nwell as the landscape it resides in. This eBook,\n\n\n**The Delta Lake Series — Lakehouse** , focuses\n\non lakehouse.\n\n\n**Chapter** **01**\n\n##### 02 Chapter\n 03 Chapter\n\n\nWhat Is\na Lakehouse?\n\nDiving Deep Into the Inner Workings\nof the Lakehouse and Delta Lake\n\nUnderstanding\nDelta Engine\n\n\n#### What’s next?\n\nAfter reading this eBook, you’ll not only\n\n\nunderstand what Delta Lake offers, but you’ll\n\nalso understand how its features result in\n\nsubstantial performance improvements.\n\n\n-----\n\n#### What is Delta Lake?\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\n\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\n\ncompatible with Apache Spark™ APIs.\n\nAt Databricks, we’ve seen how Delta Lake can bring reliability, performance and\n\nlifecycle management to data lakes. Our customers have found that Delta Lake\n\nsolves for challenges around malformed data ingestion, difficulties deleting data for\n\ncompliance, or issues modifying data for data capture.\n\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\n\nyour data lake and the rate that teams can leverage that data with a secure and\n\nscalable cloud service.\n\n\n-----\n\n**What Is a Lakehouse?**\n### CHAPTER 01\n\n\n-----\n\n**What Is a**\n**Lakehouse?**\n# 01\n\nOver the past few years at Databricks, we’ve seen a new data management architecture\n\nthat emerged independently across many customers and use cases: the **lakehouse.**\n\nIn this chapter, we’ll describe this new architecture and its advantages over previous\n\napproaches.\n\nData warehouses have a long history of decision support and business intelligence\n\napplications. Since its inception in the late 1980s, data warehouse technology\n\ncontinued to evolve and MPP architectures led to systems that were able to handle\n\nlarger data sizes.\n\nBut while warehouses were great for structured data, a lot of modern enterprises\n\nhave to deal with unstructured data, semi-structured data, and data with high variety,\n\nvelocity and volume. 
Data warehouses are not suited for many of these use cases, and\n\nthey are certainly not the most cost-efficient.\n\nAs companies began to collect large amounts of data from many different sources,\n\narchitects began envisioning a single system to house data for many different\n\nanalytic products and workloads.\n\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\n\nin a variety of formats. While suitable for storing data, data lakes lack some critical\n\nfeatures: They do not support transactions, they do not enforce data quality, and their\n\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\n\n\n-----\n\n**A lakehouse combines the best elements**\n**of data lakes and data warehouses**\n\nA lakehouse is a new data architecture that combines the best elements of data lakes\n\nand data warehouses.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\n\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\n\nwarehouses.\n\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\n\n\nrequire systems for diverse data applications including SQL analytics, real-time\n\nmonitoring, data science and machine learning. Most of the recent advances in\n\nAI have been in better models to process unstructured data (text, images, video,\n\naudio), but these are precisely the types of data that a data warehouse is not\n\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\n\nwarehouses, and other specialized systems such as streaming, time-series, graph\n\nand image databases. Having a multitude of systems introduces complexity and,\n\nmore importantly, introduces delay as data professionals invariably need to move\n\nor copy data between different systems.\n\n\nLakehouses are enabled by a new system design: implementing similar data struc-\n\ntures and data management features to those in a data warehouse, directly on the\n\nkind of low-cost storage used for data lakes. They are what you would get if you had\n\nto redesign data warehouses in the modern world, now that cheap and highly reliable\n\nstorage (in the form of object stores) are available.\n\nA lakehouse has the following key features:\n\n- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n\nbe reading and writing data concurrently. Support for ACID transactions ensures\n\nconsistency as multiple parties concurrently read or write data, typically using\n\nSQL.\n\n\n-----\n\n- **\u0007Schema enforcement and governance:** The lakehouse should have a way to\n\nsupport schema enforcement and evolution, supporting DW schema paradigms\n\nsuch as star/snowflake-schemas. The system should be able to reason about data\n\nintegrity, and it should have robust governance and auditing mechanisms.\n\n- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n\nreduces staleness and improves recency, reduces latency and lowers the cost of\n\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **\u0007Storage is decoupled from compute:** In practice, this means storage and\n\ncompute use separate clusters, thus these systems are able to scale to many more\n\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\n\nthis property.\n\n- **\u0007Openness:** The storage formats they use are open and standardized, such as\n\nParquet, and they provide an API so a variety of tools and engines, including\n\nmachine learning and Python/R libraries, can efficiently access the data directly.\n\n- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n\nThe lakehouse can be used to store, refine, analyze and access data types needed\n\nfor many new data applications, including images, video, audio, semi-structured\n\ndata, and text.\n\n- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n\nanalytics. Multiple tools might be needed to support all these workloads, but they all\n\nrely on the same data repository.\n\n- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n\nSupport for streaming eliminates the need for separate systems dedicated to\n\nserving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\n\nfeatures. Tools for security and access control are basic requirements. Data governance\n\ncapabilities including auditing, retention and lineage have become essential particularly\n\nin light of recent privacy regulations. Tools that enable data discovery such as data\n\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n\nfeatures only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**\n\n**Abstract**\n\nCloud object stores such as Amazon S3 are some of the largest and most\n\ncost-effective storage systems on the planet, making the main attractive\n\ntarget to store large data warehouses and data lakes. Unfortunately, their\n\nimplementation as key-value stores makes it difficult to achieve ACID\n\ntransactions and high performance: Metadata operations, such as listing\n\nobjects, are expensive, and consistency guarantees are limited. In this paper,\n\nwe present Delta Lake, an open source ACID table storage layer over cloud\n\nobject stores initially developed at Databricks. Delta Lake uses a transaction log\n\nthat is compacted into Apache Parquet format to provide ACID properties, time\n\ntravel, and significantly faster metadata operations for large tabular data sets\n\n(e.g., the ability to quickly search billions of table partitions for those relevant\n\nto a query). It also leverages this design to provide high-level features such\n\nas automatic data layout optimization, upserts, caching, and audit logs. Delta\n\nLake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and\n\nother systems. 
Delta Lake is deployed at thousands of Databricks customers\n\nthat process exabytes of data per day, with the largest instances managing\n\nexabyte-scale data sets and billions of objects.\n\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\n\nZhu, Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja\n\nŁuszczak, Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\n\nBoncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\n\nRead the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\n\n\n-----\n\n**Some early examples**\n\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\n\nMicrosoft’s Azure Synapse Analytics service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\n\nenables a similar lakehouse pattern. Other managed services such as BigQuery and\n\nRedshift Spectrum have some of the lakehouse features listed above, but they are\n\nexamples that focus primarily on BI and other SQL applications.\n\nCompanies that want to build and implement their own systems have access to open\n\nsource file formats (Delta Lake, Apache Iceberg, Apache Hudi) that are suitable for\n\nbuilding a lakehouse.\n\nMerging data lakes and data warehouses into a single system means that data teams\n\ncan move faster as they are able to use data without needing to access multiple systems.\n\nThe level of SQL support and integration with BI tools among these early lakehouses\n\nis generally sufficient for most enterprise data warehouses. Materialized views and\n\n\nA note about technical building blocks. While distributed file systems can be\n\nused for the storage layer, object stores are more commonly used in lakehouses.\n\nObject stores provide low-cost, highly available storage that excels at massively\n\nparallel reads — an essential requirement for modern data warehouses.\n\n**From BI to AI**\n\nThe lakehouse is a new data management architecture that radically simplifies\n\nenterprise data infrastructure and accelerates innovation in an age when\n\nmachine learning is poised to disrupt every industry. In the past, most of the\n\ndata that went into a company’s products or decision-making was structured\n\ndata from operational systems, whereas today, many products incorporate\n\nAI in the form of computer vision and speech models, text mining and others.\n\nWhy use a lakehouse instead of a data lake for AI? A lakehouse gives you data\n\nversioning, governance, security and ACID properties that are needed even for\n\nunstructured data.\n\n\nstored procedures are available, but users may need to employ other mechanisms that\n\n\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\n\nimportant for “lift and shift scenarios,” which require systems that achieve semantics\n\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\n\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n\nlibraries) for non-BI workloads like data science and machine learning. 
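For example (table and column names are illustrative), a data scientist can pull a curated lakehouse table straight into a familiar Python library:

```python
from pyspark.sql import SparkSession
from sklearn.ensemble import RandomForestRegressor

spark = SparkSession.builder.getOrCreate()

# Read a curated (Gold) table from the lakehouse and hand it to scikit-learn
pdf = spark.table("gold_customer_features").toPandas()
X, y = pdf.drop(columns=["churn_score"]), pdf["churn_score"]

model = RandomForestRegressor(n_estimators=100, random_state=0).fit(X, y)
print(model.score(X, y))
```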
Data\n\nexploration and refinement are standard for many analytic and data science\n\napplications. Delta Lake is designed to let users incrementally improve the quality of\n\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized\n\nsystems (such as data warehouses) that have years of investments and real-\n\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n\nnotebooks) over others so lakehouses will also need to improve their UX and their\n\nconnectors to popular tools so they can appeal to a variety of personas. These\n\nand other issues will be addressed as the technology continues to mature and\n\ndevelop. Over time, lakehouses will close these gaps while retaining the core\n\nproperties of being simpler, more cost-efficient and more capable of serving\n\ndiverse data applications.\n\n\ndata in their lakehouse until it is ready for consumption.\n\n\n-----\n\n**Diving Deep Into the Inner Workings**\n**of the Lakehouse and Delta Lake**\n\n### CHAPTER 02\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n# 02\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n\nadopting the lakehouse pattern. The blog created a massive amount of interest\n\nfrom technology enthusiasts. While lots of people praised it as the next-generation\n\ndata architecture, some people thought the lakehouse is the same thing as\n\nthe data lake. Recently, several of our engineers and founders wrote a research\n\npaper that describes some of the core technological challenges and solutions that\n\nset the lakehouse architecture apart from the data lake, and it was accepted and\n\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\n\ncan read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\n\nthey would have said faster horses.” The crux of this statement is that people often\n\nenvision a better solution to a problem as an evolution of what they already know\n\nrather than rethinking the approach to the problem altogether. In the world of data\n\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\n\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\n\nobject stores like Amazon S3 have become some of the largest and most cost-\n\neffective storage systems in the world, which makes them an attractive platform to\n\nstore data warehouses and data lakes. However, their nature as key-value stores\n\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\n\nperformance is hampered by expensive metadata operations (e.g., listing objects)\n\nand limited consistency guarantees.\n\nBased on the characteristics of cloud object stores, three approaches have emerged.\n\n**1. Data lakes**\n\nThe first is directories of files (i.e., data lakes) that store the table as a collection\n\nof objects, typically in columnar format such as Apache Parquet. 
It’s an attractive\n\napproach because the table is just a group of objects that can be accessed from\n\na wide variety of tools without a lot of additional data stores or systems. However,\n\nboth performance and consistency problems are common. Hidden data corruption\n\nis common due to failed transactions, eventual consistency leads to inconsistent\n\nqueries, latency is high, and basic management capabilities like table versioning and\n\naudit logs are unavailable.\n\n**2. Custom storage engines**\n\nThe second approach is custom storage engines, such as proprietary systems built for\n\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\n\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\n\nservice that’s able to provide a single source of truth. However, all I/O operations need\n\nto connect to this metadata service, which can increase cloud resource costs and\n\nreduce performance and availability. Additionally, it takes a lot of engineering work to\n\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\n\nand PyTorch, which can be challenging for data teams that use a variety of computing\n\nengines on their data. Engineering challenges can be exacerbated by unstructured\n\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. Finally, and most egregiously, the proprietary metadata service locks\n\ncustomers into a specific service provider, leaving customers to contend with\n\nconsistently high prices and expensive, time-consuming migrations if they decide to\n\nadopt a new approach later.\n\n**3. Lakehouse**\n\nWith Delta Lake, an open source ACID table storage layer atop cloud object stores,\n\nwe sought to build a car instead of a faster horse with not just a better data store,\n\nbut a fundamental change in how data is stored and used via the lakehouse. A\n\nlakehouse is a new architecture that combines the best elements of data lakes and\n\ndata warehouses. Lakehouses are enabled by a new system design: implementing\n\nsimilar data structures and data management features to those in a data warehouse,\n\ndirectly on the kind of low-cost storage used for data lakes. They are what you would\n\nget if you had to redesign storage engines in the modern world, now that cheap and\n\nhighly reliable storage (in the form of object stores) are available.\n\nDelta Lake maintains information about which objects are part of a Delta table in an\n\nACID manner, using a write-ahead log, compacted into Parquet, that is also stored in\n\nthe cloud object store. This design allows clients to update multiple objects at once,\n\nreplace a subset of the objects with another, etc., in a serializable manner that still\n\nachieves high parallel read/write performance from the objects. The log also provides\n\nsignificantly faster metadata operations for large tabular data sets. Additionally, Delta\n\nLake offers advanced capabilities like time travel (i.e., the ability to query point-in-time\n\nsnapshots or roll back erroneous updates), automatic data layout optimization, upserts,\n\ncaching, and audit logs. 
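As a rough illustration of the time travel and audit log capabilities just listed, a Delta table's commit history can be inspected and an earlier snapshot queried. This is a minimal sketch; the table name events, the version number and the path are placeholders:

```
# Inspect the transaction log and query an earlier snapshot of a Delta table.
spark.sql("DESCRIBE HISTORY events").show()                      # commit-level audit log
spark.sql("SELECT COUNT(*) FROM events VERSION AS OF 3").show()  # point-in-time query
old = spark.read.format("delta").option("versionAsOf", 3).load("/path/to/events")  # same snapshot via the DataFrame reader
```

Rolling back an erroneous update then amounts to restoring or overwriting the table from one of these earlier versions.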
Together, these features improve both the manageability and\n\nperformance of working with data in cloud object stores, ultimately opening the door\n\nto the lakehouse architecture that combines the key features of data warehouses and\n\ndata lakes to create a better, simpler data architecture.\n\n\n-----\n\nToday, Delta Lake is used across thousands of Databricks customers, processing\n\nexabytes of structured and unstructured data each day, as well as many organizations\n\nin the open source community. These use cases span a variety of data sources and\n\napplications. The data types stored include Change Data Capture (CDC) logs from\n\nenterprise OLTP systems, application logs, time-series data, graphs, aggregate\n\ntables for reporting, and image or feature data for machine learning. The applications\n\ninclude SQL workloads (most commonly), business intelligence, streaming, data\n\nscience, machine learning and graph analytics. Overall, Delta Lake has proven itself to\n\nbe a good fit for most data lake applications that would have used structured storage\n\nformats like Parquet or ORC, and many traditional data warehousing workloads.\n\nAcross these use cases, we found that customers often use Delta Lake to significantly\n\nsimplify their data architecture by running more workloads directly against cloud\n\nobject stores, and increasingly, by creating a lakehouse with both data lake and\n\ntransactional features to replace some or all of the functionality provided by message\n\nqueues (e.g., Apache Kafka), data lakes or cloud data warehouses (e.g., Snowflake,\n\nAmazon Redshift).\n\n**[In the research paper,](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)** **the authors explain:**\n\n- The characteristics and challenges of object stores\n\n- The Delta Lake storage format and access protocols\n\n- The current features, benefits and limitations of Delta Lake\n\n- Both the core and specialized use cases commonly employed today\n\n- Performance experiments, including TPC-DS performance\n\nThrough the paper, you’ll gain a better understanding of Delta Lake and how it\n\nenables a wide range of DBMS-like performance and management features for data\n\nheld in low-cost cloud storage. As well as how the Delta Lake storage format and\n\naccess protocols make it simple to operate, highly available, and able to deliver high-\n\nbandwidth access to the object store.\n\n\n-----\n\n**Understanding Delta Engine**\n\n### CHAPTER 03\n\n\n-----\n\n**Understanding**\n**Delta Engine**\n# 03\n\nThe Delta Engine ties together a 100% Apache Spark-compatible vectorized query\n\nengine to take advantage of modern CPU architecture with optimizations to Spark\n\n3.0’s query optimizer and caching capabilities that were launched as part of Databricks\n\nRuntime 7.0. Together, these features significantly accelerate query performance on\n\ndata lakes, especially those enabled by [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) , to make it easier for customers to\n\nadopt and scale a [lakehouse architecture](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) .\n\n**Scaling execution performance**\n\nOne of the big hardware trends over the last several years is that CPU clock speeds\n\nhave plateaued. 
The reasons are outside the scope of this chapter, but the takeaway\n\nis that we have to find new ways to process data faster beyond raw compute power.\n\nOne of the most impactful methods has been to improve the amount of data that can\n\nbe processed in parallel. However, data processing engines need to be specifically\n\narchitected to take advantage of this parallelism.\n\nIn addition, data teams are being given less and less time to properly model data as\n\nthe pace of business increases. Poorer modeling in the interest of better business\n\nagility drives poorer query performance. Naturally, this is not a desired state, and\n\norganizations want to find ways to maximize both agility and performance.\n\n\n-----\n\n**Announcing Delta Engine for**\n**high-performance query execution**\n\nDelta Engine accelerates the performance of Delta Lake for SQL and DataFrame\n\nworkloads through three components: an improved query optimizer, a caching\n\nlayer that sits between the execution layer and the cloud object storage, and a native\n\nvectorized execution engine that’s written in C++.\n\nThe improved query optimizer extends the functionality already in Spark 3.0 (cost-based\n\noptimizer, adaptive query execution, and dynamic runtime filters) with more advanced\n\nstatistics to deliver up to 18x increased performance in star schema workloads.\n\nDelta Engine’s caching layer automatically chooses which input data to cache for the\n\nuser, transcoding it along the way in a more CPU-efficient format to better leverage\n\nthe increased storage speeds of NVMe SSDs. This delivers up to 5x faster scan\n\nperformance for virtually all workloads.\n\nHowever, the biggest innovation in Delta Engine to tackle the challenges facing\n\ndata teams today is the native execution engine, which we call Photon. (We know.\n\n\n-----\n\nIt’s in an engine within the engine…). This completely rewritten execution engine for\n\nDatabricks has been built to maximize the performance from the new changes in\n\nmodern cloud hardware. It brings performance improvements to all workload types\n\nwhile remaining fully compatible with open Spark APIs.\n\n**Getting started with Delta Engine**\n\nBy linking these three components together, we think it will be easier for customers\n\nto understand how improvements in multiple places within the Databricks code\n\naggregate into significantly faster performance for analytics workloads on data lakes.\n\nWe’re excited about the value that Delta Engine delivers to our customers. 
While the\n\ntime and cost savings are already valuable, its role in the lakehouse pattern supports\n\nnew advances in how data teams design their data architectures for increased\n\nunification and simplicity.\n\nFor more information on the Delta Engine, watch this keynote address from\n\n[Spark + AI Summit 2020:](https://www.youtube.com/watch?v=o54YMz8zvCY) [Delta Engine: High-Performance Query Engine for Delta Lake](https://www.youtube.com/watch?v=o54YMz8zvCY) .\n\n\n-----\n\n## What’s next?\n\n\nNow that you understand Delta Lake and how its features can improve\n\nperformance, it may be time to take a look at some additional resources.\n\n**Data + AI Summit Europe 2020 >**\n\n- [Photon Technical Deep Dive: How to Think Vectorized](https://databricks.com/session_eu20/photon-technical-deep-dive-how-to-think-vectorized)\n\n\n**Explore subsequent eBooks in the collection >**\n\n- The Delta Lake Series — Fundamentals and Performance\n\n- The Delta Lake Series — Features\n\n- The Delta Lake Series — Streaming\n\n- The Delta Lake Series — Customer Use Cases\n\n\n\n- [MLflow, Delta Lake and Lakehouse Use Cases Meetup and AMA](https://databricks.com/session_eu20/mlflow-delta-lake-and-lakehouse-use-cases-meetup)\n\n- [Common Strategies for Improving Performance on Your Delta Lakehouse](https://databricks.com/session_eu20/common-strategies-for-improving-performance-on-your-delta-lakehouse)\n\n\n\n- [Achieving Lakehouse Models With Spark 3.0](https://databricks.com/session_eu20/achieving-lakehouse-models-with-spark-3-0)\n\n- [Radical Speed for Your SQL Queries With Delta Engine](https://databricks.com/session_eu20/radical-speed-for-your-sql-queries-with-delta-engine)\n\n\n**Do a deep dive into Delta Lake >**\n\n- [Analytics on the Data Lake With Tableau and the Lakehouse Architecture](https://databricks.com/blog/2020/11/11/analytics-on-the-data-lake-with-tableau-and-the-lakehouse-architecture.html)\n\n- [Visit the site for additional resources](https://databricks.com/product/delta-lake-on-databricks)\n\n\n**Vodcasts and podcasts >**\n\n\n\n- [Welcome to Lakehouse. Data Brew | Episode 2](https://www.youtube.com/watch?v=HVqxI7sFbKc)\n\n- [Data Brew by Databricks | Season 1: Lakehouses](https://databricks.com/discover/data-brew)\n\n\n**[Try Databricks for free >](https://databricks.com/product/delta-lake-on-databricks)**\n**[Learn more >](https://databricks.com/product/delta-lake-on-databricks)**\n\n\n\n- [Data Alone Is Not Enough: The Evolution of Data Architectures](https://a16z.com/2020/10/22/data-alone-is-not-enough-the-evolution-of-data-architectures/)\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "**EBOOK**\n\n# All Roads Lead to the Lakehouse\n\n#### A deep dive into data ingestion with the lakehouse\n\n\n-----\n\n## Contents\n\nIntroduction...................................................................................................................................................................................................................... **03**\n\nLife of a Data Engineer ............................................................................................................................................................................................... 
**04**\n\nIngesting From Cloud Object Stores...................................................................................................................................................................... **05**\n\nCOPY INTO ......................................................................................................................................................................................................... **06**\n\nAuto Loader ....................................................................................................................................................................................................... **09**\n\nIngesting Data From External Applications .......................................................................................................................................................... **13**\n\nPartner Connect ............................................................................................................................................................................................... **13**\n\n\n-----\n\n### Introduction\n\nOrganizations today are inundated with data siloed across various on-premises\napplication systems, databases, data warehouses and SaaS applications. This\nfragmentation makes it difficult to support new use cases for analytics or machine\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\narchitecture built on top of Delta Lake, an open format storage layer.\n\nThe first thing data engineers need to do to support the lakehouse architecture is to\nefficiently move data from various systems into their lakehouse. Ingesting data is a\ncritical first step in the data engineering and management lifecycle.\n\n\n-----\n\n### Life of a Data Engineer\n\nThe primary focus of data engineers is to provide timely and reliable data to downstream\n\ndata teams at an organization. Requests for data can come from a variety of teams, and for\n\n\na variety of data types. For example:\n\n**•** Marketing team requests for Facebook and Google ad data in order to analyze spend and\n\nbetter allocate their budget for ads\n\n**•** Security team looking to get access to a table with low latency security data from Kafka,\n\nin order to run rules to detect intrusions into the network\n\n**•** Sales operations requesting customer data from Salesforce to enrich existing tables\n\n**•** Finance team hoping to find a way to automatically ingest critical data from Google\n\nSheets or transaction data from AWS Kinesis\n\nIn each of these common scenarios, data engineers must create usable and easily\n\nqueryable tables from semi-structured and unstructured data. Beyond writing queries to\n\nretrieve and transform all this data, the data engineering team must also be concerned\n\nwith performance, because running these queries on an ongoing basis can be a big load on\n\nthe system.\n\nData engineers face the challenge of constant requests and ongoing business\n\n\n###### W H AT I S \n D E LTA L A K E ?\n\nBefore thinking about ingestion into Delta Lake, it’s important to\n\nunderstand why ingesting into Delta Lake is the right solution in\n\nthe first place. [Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is an open format data management\n\nlayer that brings data warehouse capabilities to your open data\n\nlake. Across industries, enterprises have enabled true collaboration\n\namong their data teams with a reliable single source of truth\n\nenabled by Delta Lake. 
By delivering quality, reliability, security and\n\nperformance on your data lake — for both streaming and batch\n\noperations — Delta Lake eliminates data silos and makes analytics\n\naccessible across the enterprise. With Delta Lake, customers can\n\nbuild a cost-efficient, highly scalable lakehouse that eliminates\n\ndata silos and provides self-serving analytics to end users.\n\n\nrequirements, as well as an ever-changing ecosystem. As business requirements change,\n\nso do the requirements around schemas, necessitating custom code to handle the\n\nchanges. With all of these challenges, the work of a data engineer is extremely critical, and\n\nincreasingly complex, with many steps involved before getting data to a state where it can\n\nactually be queried by the business stakeholders. So how do data engineers get the data\n\nthat each of these teams need at the frequency, with the freshness, and in the format\n\nrequired?\n\n\n-----\n\n### Ingesting From Cloud Object Stores\n\nThere are a number of common ways in which data engineers ingest data into Delta Lake. First and foremost is ingesting files from\n\ncloud object stores such as Azure Data Lake Storage, AWS S3 or Google Cloud Storage. Typically, customers are looking to migrate\n\nexisting tables or perform incremental ingestion into Delta Lake, and to do so, they can leverage tools like [CONVERT TO DELTA](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-convert-to-delta.html) ,\n\n[COPY INTO](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-copy-into.html) , and [Auto Loader](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader.html) . We will focus on Auto Loader and COPY INTO here.\n\n\n**Auto Loader**\n\nAuto Loader is an optimized data ingestion tool that incrementally and efficiently\n\nprocesses new data files as they arrive in cloud storage with minimal DevOps effort. You\n\njust need to provide a source directory path and start a streaming job. The new structured\n\nstreaming source, called “cloudFiles”, will automatically set up file notification services that\n\n\n**COPY INTO**\n\nCOPY INTO is a SQL command that allows you to perform batch file ingestion into Delta\n\nLake. COPY INTO is a command that ingests files with exactly-once semantics, best used\n\nwhen the input directory contains thousands of files or fewer, and the user prefers SQL.\n\nCOPY INTO can be used over JDBC to push data into Delta Lake at your convenience.\n\n\nsubscribe file events from the input directory and process new files as they arrive, with the\n\noption of also processing existing files in that directory. Auto Loader has interfaces through\n\nPython and Scala, and can be used with SQL through Delta Live Tables.\n\n\n-----\n\n##### COPY INTO\n\n\nCOPY INTO is a powerful yet simple SQL command that allows you to perform batch file\n\ningestion into Delta Lake and perform many of the use cases outlined in this section. COPY\n\nINTO can be run once, in an ad hoc manner, and can be scheduled through Databricks jobs.\n\n```\nFILEFORMAT = CSV\nFORMAT_OPTIONS (‘header’ = ‘true’)\n\n```\n\nWhile COPY INTO does not support low latencies, you can trigger a COPY INTO based on\n\n\nevents by using cloud functions such as AWS Lambda or through orchestrators like Apache\n\nAirflow. COPY INTO supports incremental appends and simple transformations.\n\nCOPY INTO is a great command to use when your source directory contains a small number\n\nof files (i.e., thousands of files or less). 
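Because COPY INTO tracks which files it has already loaded, the same statement can simply be re-run on whatever schedule a Databricks job (or an Airflow or Lambda trigger) provides. Here is a minimal sketch of wrapping it in Python so a notebook job can execute it; the table name and bucket path are placeholders:

```
# Re-runnable ingestion step: files loaded on earlier runs are skipped, new files are appended.
spark.sql("""
  COPY INTO my_delta_table
  FROM 's3://my-bucket/landing/'
  FILEFORMAT = CSV
  FORMAT_OPTIONS ('header' = 'true', 'inferSchema' = 'true')
""")
```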
To ingest a larger number of files, we recommend\n\nAuto Loader, which we will cover later in this eBook.\n\n**Common Use Cases for COPY INTO**\n\n**Ingesting data to a new Delta table**\n\nA common ad hoc ingestion use case using COPY INTO is to ingest data into a new Delta\n\ntable. To copy data into a new Delta table, users can use CREATE TABLE command first,\n\nfollowed by COPY INTO.\n\nStep 1: `CREATE TABLE` `my_table (id` `INT` `, name STRING, age` `INT` `);`\nStep 2 1 : `COPY INTO` `my_table`\n```\n FROM ‘s3://my_bucket/my_path’ WITH (\n CREDENTIAL (\n AWS_ACCESS_KEY = ‘*****’,\n AWS_SECRET_KEY = ‘*****’,\n AWS_SESSION_TOKEN = ‘*****’\n )\n ENCRYPTION (\n TYPE = ‘AWS_SSE_C’,\n MASTER_KEY = ‘*****’\n\n```\n\nThe code block above covers the AWS temporary in-line credential format. When you use\n\nin-line credentials in Azure and AWS, the following parameters are required for each type of\n\ncredential and encryption:\n\n\n|Credential Name|Required Parameters|\n|---|---|\n|AWS temporary credentials|AWS_ACCESS_KEY AWS_SECRET_KEY|\n||AWS_SESSION_TOKEN|\n|Azure SAS token|AZURE_SAS_TOKEN|\n\n\n\n\n\n|Encryption Name|Required Parameters|\n|---|---|\n|AWS server-side encryption with customer-provided encryption key|TYPE = ‘AWS_SSE_C’ MASTER_KEY|\n|Azure client-provided encryption key|ATYPE = ‘AZURE_CSE’ MASTER_KEY|\n\n\n**Appending data to your Delta table**\n\nTo append data to a Delta table, users can leverage the COPY INTO command. COPY INTO\n\nis a powerful SQL command that is idempotent and incremental. When using COPY INTO,\n\nusers point to a location of files, and once those files are ingested, Delta Lake will keep\n\n1 If you only have temporary access to a cloud object store, you can use temporary in-line credentials to ingest data from\nthe cloud object store. When you are an admin or with ANY FILE access, and the instance profile has been set for the\ncloud object store, you do not need to specify credentials in-line for COPY INTO.\n\n\n-----\n\ntrack of the state of files that have been ingested. Unlike commands like INSERT INTO, users\n\nget idempotency with COPY INTO, which means users are prevented from ingesting the\n\nsame data twice to the same table.\n```\n COPY INTO table_identifier\n FROM [ file_location | ( SELECT expression_list FROM file_location)]\n FILEFORMAT = JSON | CSV | TEXT | PARQUET | AVRO | ORC | BINARYFILE\n [ FILES = [file_name [,...] | PATTERN = ‘regex_pattern’ ]\n [ FORMAT_OPTIONS ( ‘data_source_reader_option’ = ‘value’ [, ...])]\n [ COPY_OPTIONS ( ’OPTION’ = ‘VALUE’ [,...])]\n\n```\nOne of the main benefits of COPY INTO is that users don’t have to worry about providing a\n\nschema, because the schema is automatically inferred from your data files. Here is a very\n\nsimple example of how you would ingest data from CSV files that have headers, where you\n\nleave the tool to infer the schema and the proper data types. 
It’s as simple as that.\n```\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n\n```\n**Using COPY INTO without an existing table** 2\n\n```\n CREATE TABLE my_delta_table (dummy string);\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n FORMAT_OPTIONS (\n ‘header’ = ‘true’ ,\n ‘inferSchema’ = ‘true’ ,\n ‘mergeSchema’ = ‘true’\n )\n COPY_OPTIONS ( ‘overwrite’ = ‘true’ , ‘overwriteSchema’ = ‘true’ )\n\n```\n**Ingesting a CSV file without headers**\n\nIf you are looking to ingest a CSV file that doesn’t have headers, columns will be named as\n\n_c0 or _c1, with the index of the column. You can use the double colon syntax to cast the\n\ndata type that you want and then alias these columns to whatever you want to call them.\n```\n COPY INTO my_delta_table\n FROM ( SELECT\n _c0::int as key,\n _c1::double value,\n _c2::timestamp event_time\n FROM ‘s3://my-bucket/path/to/csv_files’ )\n FILEFORMAT = CSV\n\n```\n\nIn the most common case, in order to use COPY INTO, a table definition is required.\n\nHowever, if you would like to get started quickly and don’t have an existing table or require\n\na specific schema, you can create your table with a dummy schema. Then, once you run\n\nCOPY INTO, you can overwrite the table and overwrite the schema. COPY INTO will actually\n\ninfer the data types, and then change your Delta table to have the required schema.\n\n2 This use case will not work in Databricks SQL workspace, as it currently only works on clusters without table ACLs.\n\n\n-----\n\n**Evolving schema over time for CSV files** 3\n\nWhen ingesting CSV files that have a different number of columns than your existing table,\n\nyou can use the option “‘mergeSchema’ = ‘true’”. This option needs to be provided both\n\nas FORMAT_OPTIONS and COPY_OPTIONS. FORMAT_OPTIONS applies to the source data.\n\nOnce “mergeSchema” is provided as a format option, Databricks will look at multiple CSV\n\nfiles and infer the schema across those files. COPY_OPTIONS applies to your Delta table\n\nwhen you’re running the COPY INTO command. When “mergeSchema” is provided as a\n\ncopy option, you’re instructing Delta Lake that it is safe to evolve the schema. Schema\n\nevolution only allows the addition of new columns. Data type changes for existing columns\n\nare not supported.\n```\n COPY INTO my_delta_table\n FROM (SELECT\n _C0::int as key,\n _C1::double value,\n _C2::timestamp event_time,\n ...\n FROM ‘s3://my-bucket/path/to/csv_files’ )\n FILEFORMAT = CSV\n FORMAT_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n COPY_OPTIONS ( ‘mergeSchema’ = ‘true’ )\n\n```\n\n**Fixing bad data**\n\nIf you find that there is a mistake in the source data file and some of the data you ingested\n\nis bad, you can use RESTORE on your Delta table and set it to the timestamp or version of\n\nthe Delta table that you want to roll back to (e.g., to restore to yesterday’s data). Then you\n\ncan rerun your COPY INTO command.\n\nAlternatively, if running a RESTORE is not possible, COPY INTO supports reloading files by\n\nthe use of the “force” copy option. 
You can manually remove the old data from your Delta\n\nLake table by running a DELETE operation and then using COPY INTO with “force” = “true”.\n\nYou can use the PATTERN keyword to provide a file name pattern, or you can specify the file\n\nnames with the FILES keyword to reload a subset of files in conjunction with “force”.\n```\n RESTORE my_delta_table TO TIMESTAMP AS OF date_sub(current_date(),\n 1);\n COPY INTO my_delta_table\n FROM ‘s3://my-bucket/path/to/csv_files’\n FILEFORMAT = CSV\n PATTERN = ‘2021-09-08*.csv’\n FORMAT_OPTIONS ( ‘header’ = ‘true’ , ‘inferSchema’ = ‘true’ )\n COPY_OPTIONS ( ‘force’ = ‘true’ )\n\n```\n3 Limitation: schema evolution with “mergeSchema” in COPY_OPTIONS does not work in Databricks SQL workspace or\nclusters enabled with table ACLs.\n\n\n-----\n\n##### Auto Loader\n\n\nWhile COPY INTO can solve a lot of the key use cases our customers face, due to its\n\nlimitations (scalability), there are many scenarios where we recommend Auto Loader\n\nfor data ingestion. Auto Loader is a data source on Databricks that incrementally and\n\nefficiently processes new data files as they arrive in cloud storage with minimal DevOps\n\neffort. Auto Loader is available in Python and Scala, and also in SQL in [Delta Live Tables](https://databricks.com/product/delta-live-tables) .\n\nAuto Loader is an incremental streaming source that provides exactly-once ingestion\n\nguarantees. It keeps track of which files have been ingested using a durable key-value store.\n\nIt can discover new files very efficiently and is extremely scalable. Auto Loader has been\n\nbattle tested. We have seen customers running Auto Loader on millions of files an hour, and\n\npetabytes of data per day.\n\nTo use Auto Loader, you simply specify ‘readStream’ and the format “cloudFiles”, indicating\n\nthat you will use Auto Loader to load files from the cloud object stores. Next, you specify\n\nthe format of the file — for example, JSON — as an option to Auto Loader, and you specify\n\nwhere to load it from.\n```\n df = spark.readStream.format( “cloudFiles” )\n .option( “cloudfiles.format” , “json” )\n .load( “/path/to/table” )\n\n```\nUnder the hood, when data lands in your cloud storage, Auto Loader discovers files either\n\nthrough directory listing or file notifications. Given permissions to the underlying storage\n\nbucket or container, Auto Loader can list the directory that you want to load data from\n\nin an efficient and scalable manner and load data immediately. Alternatively, Auto Loader\n\ncan also automatically set up file notifications on your storage account, which allows it\n\n\nfrom queues, deduplicate these notifications using its key-value store and then process\n\nthe underlying files. If there are any failures, Auto Loader will replay what hasn’t been\n\nprocessed, giving you exactly-once semantics.\n\nDirectory listing mode is very easy to get started with. If your files are uploaded to your\n\ncloud storage system in a lexicographical order, Auto Loader will optimize the discovery of\n\nfiles by starting directory listing from the latest uploaded files, saving you both time and\n\nmoney. If files cannot be uploaded in a lexicographical order and you need Auto Loader\n\nto scale to high volumes, Databricks recommends using the file notification mode. 
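A minimal sketch of opting into file notification mode, assuming Auto Loader's cloudFiles.useNotifications option; the paths below are placeholders:

```
# File notification mode: Auto Loader subscribes to storage events instead of listing the directory.
df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "json")
      .option("cloudFiles.useNotifications", "true")    # assumed option name; switches discovery to notifications
      .option("cloudFiles.schemaLocation", "/mnt/schemas/events")
      .load("/mnt/landing/events"))
```

With sufficient permissions on the storage account, Auto Loader can create the notification and queue resources on your behalf, as described in this section.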
Cloud\n\nservices such as AWS Kinesis Firehose, AWS DMS and Azure Data Factory can be configured\n\nto upload files in a lexical order, typically by providing the upload time of records in the file\n\npath, such as /base/path/yyyy/MM/dd/HH/file.format.\n\n**Common Use Cases for Auto Loader**\n\n**New to Auto Loader**\n\nAs a new user to the Databricks Lakehouse, you’ll want to ingest data from cloud object\n\nstores into Delta Lake as part of your data pipeline for incremental loading. Here is a simple\n\nexample using Python to demonstrate the ease and flexibility of Auto Loader with a few\n\ndefined options. You can run the code in a notebook.\n```\n stream = spark.readStream \\\n .format( “cloudFiles” ) \\\n .option( “cloudFiles.format” , “csv” ) \\\n .option( “cloudFiles.schemaLocation” , schema_location) \\\n .load(raw_data_location)\n\n```\n\nto efficiently discover newly arriving files. When a file lands in file notification mode, the\n\ncloud storage system sends a notification to a queuing system. For example, in AWS, S3\n\nwill send a notification to AWS SQS. On Azure, a notification is sent to Azure queue storage.\n\nOn Google, it’ll be sent to Pub/Sub. Auto Loader can then fetch these event notifications\n\n\n-----\n\nIn order to write to a Delta table from the stream, follow the example below:\n```\n stream.writeStream \\\n .option( “mergeSchema” , “true” ) \\\n .option( “checkpointLocation” , checkpoint_location) \\\n .start(target_delta_table_location)\n\n```\n**Migrating to Auto Loader**\n\nAs a Spark user, you may be using an existing Spark structured streaming to process data.\n\nTo migrate to Auto Loader, all a user needs to do is take existing streaming code and turn\n\ntwo lines of it into ‘cloudFiles’, specifying the file format within an option.\n\n\n**Migrating a livestreaming pipeline**\n\nMigrating a livestreaming pipeline can be challenging, but with Auto Loader, as with COPY\n\nINTO, you can specify a timestamp when the source files are updated or created and Auto\n\nLoader will ingest all modified data after that point.\n```\n df = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.format” , “json” )\n .option( “modifiedAfter” , “2021-09-09 00:00:00” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n**Schema inference and evolution**\n\nAuto Loader provides schema inference and management capabilities. With a schema\n\nlocation specified, Auto Loader can store the changes to the inferred schema over time. For\n\nfile formats like JSON and CSV, where the schemas can get fuzzy, schema inference on Auto\n\nLoader can automatically infer data types or treat everything as a string.\n\nWhen data does not match your schema (e.g., an unknown column or format), Auto Loader\n\nhas a data rescue capability that will “rescue” all data in a separate column, stored as a\n\nJSON string, to investigate later. 
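As a rough sketch of how that rescue behavior surfaces in practice, rescued values land in a dedicated column (named _rescued_data by default in Databricks Auto Loader) that can be filtered for later investigation. The column name is stated here as an assumption based on Auto Loader's documented default, and the paths are placeholders:

```
# Rows whose fields did not match the inferred schema carry the leftover data as a JSON string.
df = (spark.readStream
      .format("cloudFiles")
      .option("cloudFiles.format", "csv")
      .option("cloudFiles.schemaLocation", "/mnt/schemas/orders")
      .load("/mnt/landing/orders"))
bad_rows = df.filter("_rescued_data IS NOT NULL")   # inspect these before trusting downstream tables
```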
See [rescued data column](https://docs.databricks.com/spark/latest/structured-streaming/auto-loader-schema.html#rescued-data-column) for more details.\n\nAuto Loader supports three schema evolution modes: add new columns as they are\n\ndiscovered, fail if an unexpected column is seen, or rescue new columns.\n\n```\ndf = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.\nformat” , “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n```\ndf = spark.readStream\n .format( “json” )\n .options(format_options)\n .schema(schema)\n .load( “/path/to/table” )\n\n```\n\nOnce it’s converted, users will see instant benefits like scalability and cost reduction. Auto\n\nLoader can scale to trillions of files, unlike the open-source file streaming source. One of\n\nthe ways that Auto Loader does this is with asynchronous backfills. Instead of needing\n\nto discover files first, then plan, Auto Loader discovers and processes files concurrently,\n\nmaking it much more efficient and leading to cost reductions in compute resources.\n\n\n-----\n\n**Fixing a file that was processed with Auto Loader**\n\nTo fix a file that was already processed, Auto Loader supports an option called\n\n‘allowOverwrites’. With this option, Auto Loader can re-ingest and reprocess a file with a\n\nnew timestamp. If you want to enable this option in an existing Auto Loader stream, you\n\nneed to stop and restart the Auto Loader stream with the enabled option.\n```\n df = spark.readStream\n .format( “cloudFiles” )\n .option( “cloudFiles.format” , “json” )\n .schema(schema)\n .option( “cloudFiles.allowOverwrites” , “true” )\n .options(format_options)\n .load( “/path/to/table” )\n\n```\n**Discover missing data**\n\nWhile event notification is a very scalable method to collect all data, it relies on cloud\n\nservices, which are distributed systems and are not always reliable. With Auto Loader, you\n\ncan additionally specify a backfill interval, where Auto Loader will perform asynchronous\n\nbackfills at whatever interval you set up. This can be enabled with a once trigger,\n\n```\n df = spark.readStream\n .format(“cloudFiles”)\n .option(“cloudFiles.format”, “json”)\n .schema(schema)\n .option( “cloudFiles.backfillInterval” , “1 week” )\n .options(format_options)\n .load(“/path/to/table”)\n .writeStream\n .trigger(Trigger.AvailableNow())\n .option(“checkpointLocation”, checkpointDir)\n .start()\n\n```\nThe trigger tells Auto Loader how frequently to process incoming data. A processing time\n\ntrigger will have Auto Loader run continuously and schedule micro-batches at the trigger\n\ninterval which you have set. The “Once” and “AvailableNow” triggers instruct Auto Loader to\n\nprocess all new data that has been added until the start of your application. Once the data\n\nis processed, Auto Loader will automatically shut down. Trigger Once will have Auto Loader\n\nprocess all the new data in a single micro-batch, which requires it to first discover all the\n\nnew files. With Trigger AvailableNow, Auto Loader can discover and process files concurrently\n\nand perform rate limiting, which makes it a preferable alternative to Trigger Once.\n\n\nprocessing time trigger and available now trigger. 
The following example shows how to use\n\nbackfill internal and trigger availableNow together:\n\n\n-----\n\n**Using Auto Loader in SQL with Delta Live Tables**\n\nDelta Live Tables is a cloud-native ETL service on Databricks that provides a reliable\n\nframework to develop, test, monitor, manage and operationalize data pipelines at scale to\n\ndrive insights for data science, machine learning and analytics. Auto Loader is available in\n\nDelta Live Tables.\n\n```\nCREATE INCREMENTAL LIVE TABLE\n autoloader_test\nAS\nSELECT\n *,\n id + id2 AS new_id\nFROM\n CLOUD_FILES (\n “some/cloud/path” , – the path to the data\n “json” – the file format\n );\n\n```\n\n**Live Tables understands**\n\n**and coordinates data flow**\n\n**between your queries**\n\n\n-----\n\n### Ingesting Data From External Applications\n\nWhile Auto Loader and COPY INTO are powerful tools, not all data is available as files\n\nin cloud object stores. In order to enable a lakehouse, it is critical to incorporate all of\n\nyour data and break down the silos between sources and downstream teams. To do this,\n\ncustomers need to discover and connect a broad set of data, BI and AI tools, and systems\n\nto the data within their lakehouse.\n\n##### Partner Connect\n\nHistorically, stitching multiple enterprise tools and data sources together has been a burden\n\non the end user, making it very complicated and expensive to execute at any scale. Partner\n\nConnect solves this challenge by making it easy for you to integrate data, analytics and AI\n\ntools directly within their Databricks Lakehouse. It also allows you to discover new, pre-\n\nvalidated solutions from Databricks partners that support your expanding analytics needs.\n\nTo ingest into the lakehouse, select the partner tile in Partner Connect via the left\n\nnavigation bar in Databricks. Partner Connect will automatically configure resources such\n\nas clusters, tokens and connection files for you to connect with your data ingestion tools\n\nof choice. You can finish signing up for a trial account on the partner’s website or directly\n\nlog in if you already used Partner Connect to create a trial account. Once you log in, you will\n\nsee that Databricks is already configured as a destination in the partner portal and ready\n\nto be used.\n\n\n-----\n\n**Common Use Case for Partner Connect**\n\n**Ingest Salesforce data via Fivetran into Delta Lake**\n\nClicking on the Fivetran tile in Partner Connect starts an automated workflow between\n\nthe two products. Databricks automatically provisions a SQL endpoint and associated\n\ncredentials for Fivetran to interact with, and passes the user’s identity and the SQL\n\n\nendpoint configuration to Fivetran automatically via a secure API. Within Fivetran, a\n\nDatabricks destination is automatically created. This destination is configured to ingest into\n\nDelta via the SQL endpoint that was auto-configured by Partner Connect.\n\nThe customer now selects their choice of data source in Fivetran from hundreds of pre-\n\nbuilt connectors — for example, Salesforce. The user authenticates to the Salesforce\n\nsource, chooses the Salesforce objects they want to ingest into Delta Lake on Databricks\n\n\n-----\n\n(in this case the Account & Contact objects) and starts the initial sync. This automation\n\nhas saved users dozens of manual steps and copying/pasting of configuration if they\n\nmanually set up the connection. 
It also protects the user from making any unintentional\n\nconfiguration errors and spending time debugging those errors. The Salesforce tables\n\nare now available to query, join and analyze in Databricks SQL. Watch the [demo](https://databricks.com/partnerconnect#partner-demos) for more\n\ndetails or check out the [Partner Connect guide](https://docs.databricks.com/integrations/partner-connect/index.html?_gl=1*1mz2ts6*_gcl_aw*R0NMLjE2MzY2NzU1NDcuQ2p3S0NBaUFtN09NQmhBUUVpd0FydkdpM0ZHS3ptZTR5Z2YzR3E4ajVrYTNaUExOUEFnaTZIMnNRU05EMC1RYzl0dGxXQjl6ajRuNU14b0N0OGdRQXZEX0J3RQ..&_ga=2.83627156.328510291.1641248936-1825366797.1612985070) to learn more.\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 5,000 organizations worldwide — including Comcast,\n\nCondé Nast, H&M and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to\n\nunify their data, analytics and AI. Databricks is headquartered in San Francisco, with offices around the\n\nglobe. Founded by the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on a\n\nmission to help data teams solve the world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , LinkedIn and Facebook .\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf", + "2024-09-19T16:57:19Z" + ], + [ + "# 2023 State\n of Data + AI\n```\nPowered by the Databricks Lakehouse\n\n```\n2023 STATE OF DATA + AI\n\n\n-----\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||W|e’|r|e|in||th|e|||||||\n|||||||go|l|de|n|a|ge||of|||||||\n|||||||||||||||||||||\n|||||||d|a|ta|a|n|d|A|I|||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n\n\n-----\n\nINTRO\n\nIn the 6 months since ChatGPT launched, the world has woken up to the vast potential\nof AI. The unparalleled pace of AI discoveries, model improvements and new products\non the market puts data and AI strategy at the top of conversations across every\norganization around the world. We believe that AI will usher in the next generation of\nproduct and software innovation, and we’re already seeing this play out in the market.\nThe next generation of winning companies and executives will be those who understand\nand leverage AI.\n\nIn this report, we examine patterns and trends in data and AI adoption across more\nthan 9,000 global Databricks customers. By unifying business intelligence (BI) and AI\napplications across companies’ entire data estates, the Databricks Lakehouse provides\na unique vantage point into the state of data and AI, including which products and\ntechnologies are the fastest growing, the types of data science and machine learning\n(DS/ML) applications being developed and more.\n\n\n-----\n\n```\nHere are the major stories we uncovered:\n\n```\n\nCompanies are adopting\nmachine learning and large\nlanguage models (LLMs)\nat a rapid pace. Natural\nlanguage processing (NLP)\nis dominating use cases,\nwith an accelerated focus\non LLMs.\n\n\nOrganizations are investing in\ndata integration products as\nthey prioritize more DS/ML\ninitiatives. 
50% of our fastestgrowing products represent\nthe data integration category.\n\n\nOrganizations are increasingly\nusing the Lakehouse for data\nwarehousing, as evidenced\nby the high growth of data\nintegration tools dbt and\nFivetran, and the accelerated\nadoption of Databricks SQL.\n\n\nWe hope that by sharing these trends, data leaders will be able to benchmark\ntheir organizations and gain insights that help inform their strategies for an\nera defined by data and AI.\n\n\n-----\n\n```\nSummary of\n\nKey Findings\n DATA SCIENCE AND MACHINE LEARNING:\n\n NLP AND LLMS ARE IN HIGH DEMAND\n 1\n\n```\n**•** The number of companies using SaaS LLM APIs (used to access\nservices like ChatGPT) has grown 1310% between the end of\nNovember 2022 and the beginning of May 2023\n\n**•** NLP accounts for 49% of daily Python data science library usage,\nmaking it the most popular application\n\n**•** Organizations are putting substantially more models into production\n(411% YoY growth) while also increasing their ML experimentation\n(54% YoY growth)\n\n**•** Organizations are getting more efficient with ML; for every three\n\nexperimental models, roughly one is put into production, compared\nto five experimental models a year prior\n\n\n-----\n\n```\nFASTEST-GROWING DATA\nAND AI PRODUCTS\n\n```\n```\nADOPTION AND\nMIGRATION TRENDS\n\n```\n61% of customers migrating to the\nLakehouse are coming from onprem and cloud data warehouses\n\nThe volume of data in Delta Lake\nhas grown 304% YoY\n\nThe Lakehouse is increasingly\nbeing used for data warehousing,\nincluding serverless data\nwarehousing with Databricks\nSQL, which grew 144% YoY\n\n\nBI is the top data and AI market, but\ngrowth trends in other markets show that\ncompanies are increasingly looking at\nmore advanced data use cases\n\nThe fastest-growing data and AI product\nis dbt, which grew 206% YoY by number\nof customers\n\nData integration is the fastest-growing\ndata and AI market on the Databricks\nLakehouse with 117% YoY growth\n\n\n-----\n\n```\nMethodology: How did Databricks\n\ncreate this report?\n\n```\nThe _2023 State of Data + AI_ is built from fully-aggregated, anonymized data\ncollected from our customers based on how they are using the Databricks\nLakehouse and its broad ecosystem of integrated tools. This report focuses\non machine learning adoption, data architecture (integrations and migrations)\nand use cases. The customers in this report represent every major industry\nand range in size from startups to many of the world’s largest enterprises.\n\nUnless otherwise noted, this report presents and analyzes data from February 1,\n2022, to January 31, 2023, and usage is measured by number of customers.\nWhen possible, we provide YoY comparisons to showcase growth trends over time.\n\n\n-----\n\n```\nData Science and\n\nMachine Learning\nNATURAL LANGUAGE PROCESSING AND LARGE\nLANGUAGE MODELS ARE IN HIGH DEMAND\n\n```\nAcross all industries, companies leverage data science and\nmachine learning (DS/ML) to accelerate growth, improve\npredictability and enhance customer experiences. Recent\nadvancements in large language models (LLMs) are propelling\ncompanies to rethink AI within their own data strategies.\nGiven the rapidly evolving DS/ML landscape, we wanted to\nunderstand several aspects of the market:\n\n- Which types of DS/ML applications are companies investing\nin? 
In particular, given the recent buzz, what does the data\naround LLMs look like?\n\n- Are companies making headway on operationalizing\n\ntheir machine learning models (MLOps)?\n\n\n-----\n\n```\nTime Series Time Series\nSpeech Recognition\nSimulations &\u0003\n\nOptimizations\nRecommender Systems\nNatural\n\n\u0003Language \u0003\n\nProcessing\nIndustry Data Modeling\nGraph\nGeospatial\nComputer Vision\nAnomaly Detection\n\u0003& Segmentation\n\n```\n```\n SPECIALIZED PYTHON \u0003DS/ML\n\n LIBRARIES FROM \u0003FEBRUARY 2022 \n\n TO JANUARY 2023\n\n```\n\nNote: This chart reflects the unique\nnumber of notebooks using ML\nlibraries per day in each of the\ncategories. It includes libraries used\nfor the particular problem-solving use\ncases mentioned. It does not include\nlibraries used in tooling for data\npreparations and modeling.\n\n\n-----\n\n```\nNatural language processing dominates\n\nmachine learning use cases\n\n```\n\nOur second most popular DS/ML application is\nsimulations and optimization, which accounts for 30% of\nall use cases. This signals organizations are using data to\nmodel prototypes and solve problems cost-effectively.\n\n\nTo understand how organizations are applying AI and\nML within the Lakehouse, we aggregated the usage\nof specialized Python libraries, which include NLTK,\nTransformers and FuzzyWuzzy, into popular data science\nuse cases. 1 We look at data from these libraries because\nPython is on the cutting edge of new developments in ML,\nadvanced analytics and AI, and has consistently ranked\nas one of the [most popular programming languages](https://www.tiobe.com/tiobe-index/) in\nrecent years.\n\nOur most popular use case is natural language processing\n(NLP), a rapidly growing field that enables businesses to\ngain value from unstructured textual data. This opens the\ndoor for users to accomplish tasks that were previously\ntoo abstract for code, such as summarizing content or\nextracting sentiment from customer reviews. In our data\nset, 49% of libraries used are associated with NLP. LLMs\nalso fall within this bucket. Given the innovations launched\nin recent months, we expect to see NLP take off even\nmore in coming years as it is applied to use cases like\nchatbots, research assistance, fraud detection, content\ngeneration and more.\n\n```\n In our data set, 49% of\n specialized Python libraries\n used are associated with NLP\n\n```\nMany of the DS/ML use cases are predominantly\nleveraged by specific industries. While they take up a\nsmaller share of the total, they are mission-critical for\nmany organizations. For example, time series includes\nforecasting, a use case that is especially popular in\nindustries such as Retail and CPG, which rely heavily\non the ability to forecast the need for every item in\nevery store.\n\n\n1. 
This data does not include general-purpose ML libraries, including\nscikit-learn or TensorFlow.\n\n\n-----\n\n```\n USE OF LARGE LANGUAGE MODELS (LLMS)\n\n```\n\n\n\n\n\n\n\nWe have rolled these libraries up into groupings based on the type of functionality they provide.\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|Col26|Col27|Col28|Col29|Col30|Col31|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||||||||||||||||||||||||||||||||\n|||||||||||||||||||||||Ma||rch 24, Dolly La||2023 unch|||||\n||||sformer|-Related|||||||||||\u0003C||||||||||||||||\n|||Tran||||||||||||||||, 2022 Launch|||||||||||||\n|||\u0003Libr|aries LLM AP|Is|||||||||||||||||||||||||||\n|||SaaS|||||||||||||||||||||||||||||\n|||LLM|Tools||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n||||||||||||||||||||||||||||||||\n|Feb|Mar|A|pr|May|June||July||Au||g S|ept||Oct||Nov||De||c J|an|Feb||Mar||Apr||M|ay||\n|2022||||||||||||||||||||20|23||||||||||\n||||||||||||||||||||||||||||||||\n||Note: T These|here ar libraries|e several provide|popular pretrain|types o ed mod||f Python els and||librarie tools for||s that a buildin|re comm g, trainin||only us g and d||ed for L eploying||LMs. LLMs.|||||||||||||\n\n\n\nD t i t tl di i th l t k f D b d t lit\n\n\n-----\n\n```\nLarge language models are\n\nthe “it” tool\n\n```\nLLMs are currently one of the hottest and most-watched areas\nin the field of NLP. LLMs have been instrumental in enabling\nmachines to understand, interpret and generate human language\nin a way that was previously impossible, powering everything\nfrom machine translation to content creation to virtual assistants\nand chatbots.\n\nTransformer-related libraries have been growing in popularity\neven before ChatGPT thrust LLMs into the public consciousness.\nWithin the last 6 months, our data shows two accelerating\ntrends: organizations are building their own LLMs, which models\nlike [Dolly](https://www.databricks.com/blog/2023/04/12/dolly-first-open-commercially-viable-instruction-tuned-llm) show can be quite accessible and inexpensive. And,\nthey are using proprietary models like ChatGPT. Transformerrelated libraries, such as Hugging Face, which are used to train\nLLMs, have the highest adoption within the Lakehouse.\n\nThe second most popular type is SaaS LLMs, which are used\nto access models like OpenAI. This category has grown\nexponentially in parallel with the [launch of ChatGPT](https://openai.com/blog/chatgpt) : the\nnumber of Lakehouse customers using SaaS LLMs has grown\n\n\nOrganizations can leverage LLMs either by\nusing SaaS LLM APIs to call services like\nChatGPT from OpenAI or by operating their\nown LLMs in-house.\n\nThinking of building your own modern LLM\napplication? 
This approach could entail\nthe use of specialized transformer-related\nPython libraries to train the model, as well as\nLLM tools like LangChain to develop prompt\ninterfaces or integrations to other systems.\n```\nLLM DEFINITIONS\n\n```\n**◊** **Transformer-related libraries:**\nPython libraries used to train LLMs\n(example: Hugging Face)\n\n**◊** **SaaS LLM APIs:** Libraries used to access\nLLMs as a service (example: OpenAI)\n\n**◊** **LLM tools:** Toolchains for working\nwith and building proprietary LLMs\n(example: LangChain)\n\n\nan impressive 1310% between the end of November 2022 and\nthe beginning of May 2023. (In contrast, transformer-related\nlibraries grew 82% in this same period.)\n\n\n-----\n\n```\n ac e ea g e pe e a o a d p oduc o\ntake off across industries\n\n```\n\nThe increasing demand for ML solutions and the growing\navailability of technologies have led to a significant\nincrease in experimentation and production, two distinct\nparts of the ML model lifecycle. We look at the _logging_ and\n_registering_ of models in MLflow, an open source platform\ndeveloped by Databricks, to understand how ML is\ntrending and being adopted within organizations.\n```\n LOGGED MODELS AND\n\n ML EXPERIMENTATION\n\n```\nDuring the experimentation phase of ML, data scientists\ndevelop models designed to solve given tasks. After training\nthe models, they test them to evaluate their accuracy,\nprecision, recall (the percentage of correctly predicted\npositive instances out of all actual positive instances), and\nmore. These metrics are logged (recorded) in order to analyze\nthe various models’ performance and identify which approach\nworks best for the given task.\n\nWe have chosen logged models as a proxy to measure ML\nexperimentation because the MLflow Tracking Server is\n\ndesigned to facilitate experiment tracking and reproducibility.\n\n\nMLflow Model Registry launched in May 2021. Overall, the\nnumber of logged models has grown 54% since February\n2022, while the number of registered models has grown\n411% over the same period. This growth in volume suggests\norganizations are understanding the value of investing in\nand allocating more people power to ML.\n```\nREGISTERED MODELS AND ML PRODUCTION\n\n```\nProduction models have undergone the experimentation\nphase and are then deployed in real-world applications. They\nare typically used to make predictions or decisions based on\nnew data. Registering a model is the process of recording and\nstoring metadata about a trained model in a centralized location\nthat allows users to easily access and reuse existing models.\nRegistering models prior to production enables organizations to\nensure consistency and reliability in model deployment and scale.\n\nWe have chosen registered models to represent ML production\nbecause the MLflow Model Registry is designed to manage\nmodels that have left the experimentation phase through the\n\nrest of their lifecycle.\n\n\n-----\n\ng y yi p\n\nwas registered. Recent advances in ML, such as improved\nopen source libraries like MLflow and Hugging Face, have\n\nradically simplified building and putting models into\nproduction. The result is that 34% of logged models are\nnow candidates for production today, an improvement\nfrom over 20% just a year ago.\n\n\nbefore committing an ML model to production. We wanted\nto understand, “How many models do data scientists\n\nexperiment with before moving to production?”\n\nOur data shows the ratio of logged to registered models\nis 2.9 : 1 as of January 2023. 
This means that for roughly\nevery three experimental models, one model will get\nregistered as a candidate for production. This ratio has\nimproved significantly from just a year prior, when we\n\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|Col26|Col27|Col28|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||||||VS. S|||||||||||||||||||||||\n|RA RE|TIO GIST|OF ERE|LOGG D MO|ED DEL||||||||||||||||||||||||\n|||||||||||||||||||||||||||||\n||||||Models|||||||||||||||||||||||\n||||||ber of|||||||||||||||||||||||\n||||||Num|||||||||||||||||||||||\n|||||||||||||||||||||||||||||\n|2.|9 :|1||||||||||||||||||||||||||\n\n```\nRatio of Logged to Registered\n\n Feb Mar Apr May June July Aug Sept Oct Nov Dec Jan\nModels in Jan 2023 2023\n\n```\n\n-----\n\n```\nThe Modern Data\nand AI Stack\n\n```\nOver the last several years, the trend toward building\nopen, unified data architectures has played out in our\nown data. We see that data leaders are opting to preserve\nchoice, leverage the best products and deliver innovation\nacross their organizations by democratizing access to\ndata for more people.\n\n\n-----\n\n```\n FASTEST-GROWING DATA AND AI PRODUCTS\n dbt 206%\n\n```\n```\nFivetran\nInformatica\nQlik Data Integration\nEsri\nLooker\nHugging Face\n\n```\n```\n 181%\n 174%\n 152%\n 145%\n 141%\n110%\n\n```\n```\nLytics\nGreat Expectations\nKepler.gl\n\n```\n```\n 101%\n 100%\n95%\n\n```\n```\n0% 50% 100% 150% 200%\n Year-Over-Year Growth by Number of Customers\n\n```\n\n-----\n\n```\nDBT IS THE FASTEST-GROWING DATA\n\nAND AI PRODUCT OF 2023\n\n```\nAs companies move quickly to develop more advanced\nuse cases with their data, they are investing in newer\nproducts that produce trusted data sets for reporting,\nML modeling and operational workflows. Hence, we see\nthe rapid rise of data integration products. dbt, a data\ntransformation tool, and Fivetran, which automates\ndata pipelines, are our two fastest-growing data and AI\nproducts. This suggests a new era of the data integration\nmarket with challenger tools making headway as\ncompanies shift to prioritize DS/ML initiatives. With Great\nExpectations from Superconductive in the ninth spot,\na full 50% of our fastest-growing products represent\nthe data integration category.\n\n\n-----\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n|GR|OWTH|OF|DAT|A A|ND A|I M|ARKE|TS||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||Busi|ness I|ntelli|gence|\n|||||||||||||||||Data & Se Data|Gover curity Scien|nance ce &||\n|ers||||||||||||||||Mach Data|ine Le Integ|arning ration||\n|Custom||||||||||||||||||||\n|ber of||||||||||||||||||||\n|Num||||||||||||||||||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n||Feb 2022|Mar|Apr|M|ay|June|July|Aug|Se|pt|Oct|Nov|Dec|Ja 20|n 23|||||\n|||||||||||||||||||||\n|||||||||||||||||||||\n\n\nNote: In this chart, we count the number of customers deploying one or more data and AI products in each category. 
These four\ncategories do not encompass all products Databricks products such as Unity Catalog are not included in this data\n\n\n-----\n\n```\n a a a d a e s bus ess e ge ce s\nstandard, organizations invest in their machine\nlearning foundation\n\n```\n\nTo understand how organizations are prioritizing their data\ninitiatives, we aggregated all data and AI products on the\nDatabricks Lakehouse and categorized them into four\ncore markets: BI, data governance and security, DS/ML,\nand data integration. Our data set confirms that BI tools\nare more widely adopted across organizations relative to\nmore nascent categories — and they continue to grow,\nwith a 66% YoY increase in adoption. This aligns with the\nbroader trend of more organizations performing data\nwarehousing on a Lakehouse, covered in the next section,\nViews from the Lakehouse.\n\n\nWhile BI is often where organizations start their data\njourney, companies are increasingly looking at more\nadvanced data and AI use cases.\n```\nDEMAND FOR DATA INTEGRATION PRODUCTS\n\nIS GROWING FAST\n\n```\nWe see the fastest growth in the data integration market.\nThese tools enable a company to integrate vast amounts\nof upstream and downstream data in one consolidated\nview. Data integration products ensure that all BI and DS/\nML initiatives are built on solid foundation.\n\nWhile it’s easier for smaller markets to experience\nfaster growth, at 117% YoY increased adoption, the data\nintegration market is growing substantially faster than BI.\nThis trend dovetails with the rapid growth of ML adoption\nwe see across the Lakehouse, covered in the DS/ML\nsection of the report.\n\n```\nData integration is the\nfastest-growing market,\n\n with 117% YoY growth\n\n```\n\n-----\n\n```\nViews from\nthe Lakehouse\nMIGRATION AND DATA\n\nFORMAT TRENDS\n\n```\nData migration is a major undertaking: it can be risky,\nexpensive and delay companies’ timelines. It’s not a\ntask to jump into lightly. As organizations run into the\nlimitations, scalability challenges and the cost burden\nof legacy data platforms, they are increasingly likely\nto migrate to a new type of architecture.\n\n\n-----\n\n```\nMigration trends:\n\nthe best data warehouse\n\nis a Lakehouse\n\n```\nThe Lakehouse Platform is an attractive\nalternative to traditional data warehouses\nbecause it supports advanced use cases and\nDS/ML, allowing organizations to boost their\noverall data strategy. As evidenced by the most\npopular data and AI products, with BI and data\nintegration tools at the top, organizations are\nincreasingly using the data lakehouse for data\nwarehousing. To better understand which legacy\nplatforms organizations are moving away from,\n\nwe look at the migrations of new customers\nto Databricks.\n\nAn interesting takeaway is that roughly half of the\ncompanies moving to the Lakehouse are coming\nfrom data warehouses. 
This includes the 22%\nthat are moving from cloud data warehouses.\nIt also demonstrates a growing focus on running\ndata warehousing workloads on a Lakehouse\nand unifying data platforms to reduce cost.\n\n```\n SOURCE OF NEW CUSTOMER \u0003\n\n MIGRATIONS TO DATABRICKS\n\n```\n```\n12%\n\n```\n```\n39%\n\n```\n```\n27%\n\n```\n```\n22%\n\n```\n\n-----\n\n```\nRising tides: the volume\n\nof data in Delta Lake\n\nhas grown 304% YoY\n\n```\nAs the [volume of data explodes](https://www.researchgate.net/profile/Adanma-Eberendu/publication/309393428_Unstructured_Data_an_overview_of_the_data_of_Big_Data/links/5bc89b5c458515f7d9c65beb/Unstructured-Data-an-overview-of-the-data-of-Big-Data.pdf) , an increasingly\nlarge proportion is in the form of semi-structured\nand unstructured data. Previously, organizations\nhad to manage multiple different platforms for\ntheir structured, unstructured and semi-structured\ndata, which caused unnecessary complexity and\nhigh costs. The Lakehouse solves this problem by\nproviding a unified platform for all data types\nand formats.\n\nDelta Lake is the foundation of the Databricks\nLakehouse. The Delta Lake format encompasses\nstructured, unstructured and semi-structured\ndata. Use has surged over the past 2 years.\nWhen compared to the steady, flat or declining\ngrowth in other storage formats (e.g., text, JSON\nand CSV), our data shows that a growing number\nof organizations are turning to Delta Lake to manage\ntheir data. In June 2022, Delta Lake surpassed\nParquet as the most popular data lake source,\nreaching 304% YoY growth.\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n\n|Col1|VO|LUME|Col4|OF|Col6|DAT|Col8|A M|ANAG|ED,|Col12|Col13|Col14|Col15|Col16|Col17|Col18|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||BY|STO||RAG||E FO||RMA|T|||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|ata||||||||||||||||||\n|e of D||||||||||||||||||\n|Volum||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n|||||||||||||||||||\n||Jan|||||||J|an|||Jan||||Ja||\n|||||Jan||||||||||||||\n|2|019|||2020||||20|21|||2022||||202||\n|||||||||Delta|Te|xt||CSV||Av||ro||\n|||||||||Parquet|OR|C||JSON||||||\n|||||||||||||||||||\n\n\n-----\n\n```\n g g ,\nwith emphasis on serverless\n\n```\n\nOver the past 2 years, companies have vastly increased their usage\nof data warehousing on the Lakehouse Platform. 
This is especially\ndemonstrated by use of Databricks SQL ­— the serverless data\nwarehouse on the Lakehouse — which shows 144% YoY growth.\nThis suggests that organizations are increasingly ditching traditional\ndata warehouses and are able to perform all their BI and analytics\non a Lakehouse.\n\n```\n Data \nWarehouse\n\n```\n```\nData \n\n```\n```\nLakehouse\nPlatform\n\n```\n```\nLakehouse\n\n```\n\n\n\n\n\n\n\n\n\n|Col1|Col2|Col3|Col4|Col5|Col6|Col7|Col8|Col9|Col10|Col11|Col12|Col13|Col14|Col15|Col16|Col17|Col18|Col19|Col20|Col21|Col22|Col23|Col24|Col25|\n|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|\n||||||||||||||||||||||||||\n||DA|TA W|ARE|HOUS|ING||||||||||||||||||||\n||ON|LAK|EHO|USE|WIT L|H|rs||||||||||||||||||\n||DA|TABR|ICK|S SQ|||||||||||||||||||||\n||||||||ustome||||||||||||||||||\n||||||||r of C||||||||||||||||||\n||Note: T as a re|here is a sult of th|spike in e ungat|Octobe ed previ|r 2021 ew||Numbe||||||||||||||||||\n||launch Genera|of Datab l Availab|ricks SQ ility in D|L, follow ecembe|ed by r 2021.||||||||||||||||||||\n||Data c of Dec|onsisten ember d|tly dips i ue to se|n the las asonalit|t week y.||J 2|an 021||Jul 202||y 1||Jan 2022||||July 2022||||Jan 2023|||\n\n\n-----\n\nCONCLUSION\n```\nGeneration AI\n\n```\nWe’re excited that companies are progressing into more\nadvanced ML and AI use cases, and the modern data and\nAI stack is evolving to keep up. Along with the rapid growth\nof data integration tools (including our fastest growing,\ndbt), we’re seeing the rapid rise of NLP and LLM usage in\nour own data set, and there’s no doubt that the next few\nyears will see an explosion in these technologies. It’s never\nbeen more clear: the companies that harness the power\nof DS/ML will lead the next generation of data.\n\n\n-----\n\n```\nAbout Databricks\n\n```\nDatabricks is the data and AI company. More than 9,000\norganizations worldwide — including Comcast, Condé Nast, and\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\nPlatform to unify their data, analytics and AI. Databricks is\nheadquartered in San Francisco, with offices around the globe.\nFounded by the original creators of Apache Spark™, Delta Lake\nand MLflow, Databricks is on a mission to help data teams solve\nthe world’s toughest problems. To learn more, follow Databricks\non Twitter, LinkedIn and Instagram.\n\n[DISCOVER LAKEHOUSE](https://www.databricks.com/product/data-lakehouse)\n\n© Databricks 2023. All rights reserved. Apache, Apache Spark, Spark and the Spark logo are trademarks of the Apache Software Foundation | Terms of Use\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "**eBook**\n\n# Making Your Digital Twin Come to Life\n\n##### With the Lakehouse for Manufacturing and Tredence\n\n\n-----\n\n### Contents\n\nIntroduction ................................................................................................................................................................................................................ **03**\n\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\n\nWhat Are Digital Twins? 
........................................................................................................................................................................................ **07**\n\nDigital Twin Architectures .................................................................................................................................................................................. **08**\n\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\n\nWhy Is Manufacturing Struggling With Data and AI? ............................................................................................................................ **12**\n\nWhy Databricks for Digital Twins? ................................................................................................................................................................... **13**\n\nWhy Tredence for Digital Twins? ...................................................................................................................................................................... **14**\n\nUsing Digital Twins to Drive Insights .............................................................................................................................................................. **15**\n\n\n-----\n\n### Introduction\n\n\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\ncost-effective and are now an imperative in today’s data-driven businesses.\n\nToday’s manufacturing industries are expected to streamline and optimize all the processes in their value\nchain from product development and design, through operations and supply chain optimization to obtaining\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\n\n\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[“profit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[was implemented. 
Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[approximately 10 hours.”](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n\n\n**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 10%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 50%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 25%\n\n\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n-----\n\n**Introduction (continued)**\n\n\n**Digital twin market growth rate accelerates**\n\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\nis forecasted to reach $48 billion in 2026. 
This figure is up from $3.1 billion in 2020\nat a CAGR of 58%, riding on the wave of Industry 4.0.\n\n\n**But challenges remain**\n\nThe most common challenges faced by the manufacturing industry that digital\ntwins are addressing include:\n\n**•** Product designs are more complex, resulting in higher cost and increasingly\nlonger development times\n\n**•** The supply chain is opaque\n\n**•** Production lines are not optimized – performance variations, unknown defects\nand the projection of operating cost is obscure\n\n**•** Poor quality management – overreliance on theory, managed by\nindividual departments\n\n**•** Reactive maintenance costs are too high, resulting in excessive downtime or\nprocess disruptions\n\n**•** Incongruous collaborations between departments\n\n**•** Invisibility of customer demand for gathering real-time feedback\n\n\nThe growth rate for digital twins is staggering with common adoption reported\nto be in the 25-40% CAGR growth rate.\n\n\n-----\n\n### Digital Twins Bring Broad Benefits to Manufacturing\n\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\nwould have come at significant costs without digital twin technology.\n\n**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**\n\n\n\n**•** Product design and development is performed with\nless cost and is completed in less time as iterative\nsimulations, using multiple constraints, deliver the\nbest or most optimized design. All commercial\naircraft are designed using digital twins.\n\n**•** Digital twins provide the awareness of how long\ninventory will last, when to replenish and how to\nminimize the supply chain disruptions. The oil and gas\nindustry, for example, uses supply chain–oriented\ndigital twins to reduce supply chain bottlenecks in\nstorage and midstream delivery, schedule tanker\noff-loads and model demand with externalities.\n\n\n\n**•** Continuous quality checks on produced items\nwith ML/AI generated feedback pre-emptively\nassuring improved product quality. Final paint\ninspection in the automotive industry, for example,\nis performed with computer vision built on top of\ndigital twin technology.\n\n**•** Striking the sweet spot between when to replace\na part before the process degrades or breaks\ndown and utilizing the components to their fullest,\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\nbuilding an asset performance management suite.\n\n\n\n**•** Digital twins create the opportunity to have\nmultiple departments in sync by providing\nnecessary instructions modularly to attain\na required throughput. Digital twins are the\nbackbone of kaizen events that optimize\nmanufacturing process flow.\n\n**•** Customer feedback loops can be modeled through\ninputs, from point of sale customer behavior,\nbuying preferences, or product performance and\nthen integrated into the product development\nprocess, forming a closed loop providing an\nimproved product design.\n\n\n-----\n\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\n\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\ndeployment, but typically offer higher and longer-lasting value.\n\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\n\n\nImprove product quality\n\nReduce manufacturing costs\n\nReduce unplanned downtime\n\nIncrease throughput\n\nEnsure safe manufacturing\n\nTest new design ideas\n\nDevelop product enhancements\n\nDigital transformation of enterprise\n\nSpeed new product introduction\n\nReduce planned downtime\n\nMeet new regulatory challenges\n\nTraining for new manufacturing processes\n\nDesign changes to production line\n\nProvide service to end users customers\n\nUpdate products in the field\n\n\n**34%**\n\n\n**30%**\n\n**28%**\n**25%**\n\n**24%**\n\n\n**16%**\n\n**14%**\n\n**13%**\n\n**13%**\n\n**11%**\n**10%**\n\n**8%**\n**8%**\n\n\nCan you imagine the cost to change\nan oil refinery’s crude distillation\nunit process conditions to improve\nthe output of diesel one week\nand gasoline the next to address\nchanges in demand and ensure\nmaximum economic value? Can you\nimagine how to replicate an even\nsimple supply chain to model risk?\n\n\n**5%**\n\n\n**1%**\n\n\n-----\n\n### What Are Digital Twins?\n\n\nKnowing the business challenges and benefits digital twins deliver, let’s turn to\nthe basics and explore what digital twins are and how a modern data stack is\nnecessary to build effective and timely digital twins. The classic definition of\ndigital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .”\n\n\nFor a discrete or continuous manufacturing process, a digital twin gathers system\nand processes state data with the help of various IoT sensors [operational\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\nvirtual model which is then used to run simulations, study performance issues and\ngenerate possible insights.\n\n\n**Types of Digital Twins**\n\n\n-----\n\n### Digital Twin Architectures\n\nClassic digital twins have been physics-based models of specific systems. 
More recently,\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\n\n\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\nthe industrial environment.\n\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\n\n**Data-Driven Operational Digital Twins: Maturity Journey**\n\n**AI**\n\nSimulate & Optimize\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n\n# 6-8 18-24\n## years to months\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n\n\n**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[using a digital twin, which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n\n# 20% to 25%\n\n\n**[Digital warehouse design lets](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[companies test and learn](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[using a digital twin, which can](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n**[improve efficiency by](https://www.mckinsey.com/business-functions/operations/our-insights/improving-warehouse-operations-digitally)**\n\n\nIdentify next best action and\nintegrate with actuation systems\n\n\n**IoT**\n\n**Edge/**\n**Cloud**\n\n\n**Digital Twins**\n\n**ERP**\n\n\nPredict & Diagnose\n\n|Col1|I i|\n|---|---|\n\n\n\nPredictive maintenance, process\nimprovements and Root Causing\n\n\nMonitor & Alert\n\n|Col1|P i|\n|---|---|\n\n\nReal-time operations monitoring\nand alerting\n\n\n-----\n\n### How to Build a Digital Twin\n\n\nA data architecture capability is needed to capture\nand collect the ever-expanding volume and variety\nof data streaming in real time from example\nprotocols, such as ABB Total Flow, Allen Bradley,\nEmerson, Fanuc, GE, Hitachi and Mitsubishi.\n\n\nData collection, data analytics, application\nenablement and data integration orchestrate the\ntime-series data stream and transfer to the cloud.\nAzure IoT Hub is used to securely ingest data from\nedge to cloud.\n\n\nCloud infrastructure and analytics capabilities are\noffered within the flexibility of the cloud. 
Azure\nDigital Twin is used to model and visualize process\nworkflows. Databricks MLflow and Delta Lake scale to\ndeliver real-time predictive analytics.\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Digital Twins: Technical Architecture**\n\n\n-----\n\n**How to Build a Digital Twin (continued)**\n\n**Building a digital twin doesn’t have to be a daunting task. Below are some simplistic steps:**\n\n\n**System and use case discovery**\n**and blueprinting**\n\n**•** Identify priority plant processes and systems\nto model, with focused use cases (e.g., asset\nmaintenance, energy management, process\nmonitoring/optimization, etc.)\n\n**•** Develop a validated process outline, blueprint and\nkey performance indicators\n\n**•** Develop a set of process variables, control\nvariables and manipulated variables\n\n**•** Design control loop\n\n**•** Validate and document process and asset FMEA\nfor all assets and sub-systems\n\n\n**Technology infrastructure requirements**\n\n**•** Technical edge infrastructure onsite — to sense,\ncollect and transmit real-time information\n\n**•** Clean, reliable data availability in the cloud\n\n**•** Data processing and analytics platform — to\ndesign, develop and implement solutions\n\n**•** Stream processing and deployment of models for\npredictions and soft sensing\n\n\n**Visualization delivered**\n\n**•** Information communication — visual\nrepresentation of digital twin along with remote\ncontrolling functions (e.g., Power BI dashboards,\ntime series insights, web app-based digital\ntwin portals)\n\n**•** Closed-loop feedback — to send the insights and\nactions back to form a closed loop — Azure – Event\nGrid and Event Hub with connection from IoT Hub to\nAzure IoT edge devices and control systems is used\n\n\n\n**•** Edge platform to orchestrate the data, insights and\nactions between the cloud and site IT systems\n\n**•** Cloud to edge integration — to enable seamless\nmonitoring, alerting and integration with plant\nOT/IT systems\n\n\n-----\n\n### Why Is Manufacturing Struggling With Data and AI?\n\n**Challenge** **Root Cause** **Goal**\n\n\nAggregate high volumes and velocities of\n\nstructured and unstructured data to power\n\npredictive analytics (e.g., images, IoT, ERP/SCM)\n\nData architectures that scale for TBs /PBs of\n\nenterprise IT and OT data\n\n\nSiloed data from systems designed\n**Siloed data across the value chain**\n\nfor on-premises 30 years ago\n\n\nSiloed data from systems designed\n**Siloed data across the value chain**\n\n\nLegacy architectures such as data\n\nhistorians that can’t handle semi-structured\n\nor unstructured data\n\n\n**Unable to scale enterprise data sets**\n\n\nAddress manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\ngranular supply chain issues in the real world\n\n\nAddress manufacturing issues or track\n**Lack real-time insights** Batch-oriented data transfer\n\n\n**Can’t meet intellectual property**\n\n\n**Can’t meet intellectual property** Data lineage established across organizational\n\nSystems that do not establish data lineage\n**requirements** silos and disjointed workflows\n\n\nsilos and disjointed workflows\n\n\n### Data architecture is the root cause of this struggle.\n\n\n-----\n\n### Why Databricks for Digital Twins?\n\n\nLakehouse for Manufacturing’s simple, open and collaborative data platform consolidates and enhances data\nfrom across the organization and turns it into accessible, actionable insights. 
Scalable machine learning powers\ndigital twins with predictive insights across the value chain from product development to optimizing operations\nto building agile supply chains to robust customer insights.\n\n\nDatabricks open Lakehouse\n\nPlatform has shown time and\n\nagain that it is the foundational\n\nenabling technology to power\n\ndigital twins for manufacturing. But\n\nthe real power is the Databricks\n\npartnership with Tredence that\n\nspeeds implementation for\n\ntailored use cases that deliver\n\nsuperior ROI in less time.”\n\n**Dr. Bala Amavasai** ,\n\nManufacturing CTO, Databricks\n\n\n**Supports Real-Time**\n**Decisions**\n\nLakehouse for Manufacturing\nleverages any enterprise data\nsource — from business critical\nERP data to edge sensor data in\none integrated platform, making it\neasy to automate and secure data\nwith fast, real-time performance.\n\n\n**Faster and More**\n**Accurate Analysis**\n\nThe true benefits of digital twins\nare not the business intelligence\ndashboards, but machine\nlearning insights generated\nfrom incorporating real-time\ndata. Scalable and shareable\nnotebook-based machine learning\naccelerates ROI.\n\n\n**Open Data Sharing**\n**and Collaboration**\n\nDrive stronger customer insights\nand greater service with partners\nleveraging open and secure\ndata collaboration between\ndepartments or your supply chain\ndelivering faster ROI.\n\n\n-----\n\n### Why Tredence for Digital Twins?\n\n\nOver the last few years, Tredence’s unique Manufacturing and Supply Chain practice has coupled functional\nexpertise with cutting-edge AI-driven solutions to create measurable business impact for their customers.\nNow, Tredence’s partnership with Databricks is all set to unlock the power of real-time analytics and actions, to\nfurther strengthen their ‘’last mile impact’’ vision.\n\n\nTredence is excited to\n\nco-innovate with Databricks to\n\ndeliver the solutions required for\n\nenterprises to create digital twins\n\nfrom the ground up and implement\n\nthem swiftly to maximize their ROI.\n\nOur partnership enables clients to\n\nget the most out of Tredence’s data\n\nscience capabilities to build decision\n\nintelligence around manufacturing\n\nprocesses and Databricks’\n\nLakehouse Platform to realize the full\n\npromise of digital twins.”\n\n**Naresh Agarwal** ,\n\nHead of Industrials, Tredence\n\n\n**Global Reach**\n\nTredence offers a global team with\nthe subject matter expertise that\ndelivers practitioner and useroriented solutions to identify\nand solve for challenges in\ndigital transformation design\nand implementation.\n\n\n**Purpose-Built Solutions**\n\nAdopt contextual edge to cloud,\npurpose-built AIoT solutions\nthat unify your ecosystems with\nconnected insights and enhance\nproductivity, while enabling\nefficient cost structures.\n\n\n**Focused Dedication**\n\nA dedicated centre of excellence\n(CoE) for AIoT and smart\nmanufacturing solutions —\nserving the entire manufacturing\nvalue chain from product\ndevelopment to manufacturing and\ndownstream operations.\n\n\n-----\n\n### Using Digital Twins to Drive Insights\n\n\n**Use Case**\n\n**Predictive Maintenance**\n\n- \u0007Rolls-Royce sought to use real-time\nengine data to reduce unplanned\nmaintenance and downtime\n\n- \u0007Legacy systems were unable to\nscale data ingestion of engine\nsensor data in real time for ML\n\n**Impact**\n\n\n**Why Databricks?**\n\n- \u0007The Lakehouse Platform on Azure unifies in-flight data\nstreams with external environmental conditions data to\npredict engine 
performance issues\n\n- \u0007Delta Lake underpins ETL pipelines that feed ML workloads\nacross use cases\n\n- \u0007MLflow speeds deployment of new models and reduces\nincidents of grounded planes\n\n\nRolls-Royce uses Databricks\nto drive insights around predictive\nmaintenance, improving\nairframe reliability and reducing\ncarbon emissions.\n\n\n#### 22 million tons\nof carbon emissions saved\n\n\n#### 5% reduction\nin unplanned airplane groundings\n\n\n#### Millions of pounds\nin inventory cost savings from a 50%\nimprovement in maintenance efficiency\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide — including Comcast, Condé\n\nNast, Acosta and over 40% of the Fortune 500 — rely on the Databricks Lakehouse Platform to unify their data,\n\nanalytics and AI. Databricks is headquartered in San Francisco, with offices around the globe. Founded by the\n\noriginal creators of Apache Spark,™ Delta Lake and MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on [Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc) .\n\n###### Get started with a free trial of Databricks and start building data applications today\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks?itm_data=NavBar-TryDatabricks-Trial)**\n\nTo learn more, visit us at:\n\n**[databricks.com/manufacturing](https://databricks.com/solutions/industries/manufacturing-industry-solutions)**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "2024-09-19T16:57:22Z" + ], + [ + "### EBOOK\n\n# A Compact Guide to Large Language Models\n\n\n-----\n\nSECTION 1\n## Introduction\n\n##### Definition of large language models (LLMs)\n\nLarge language models are AI systems that are designed to process and analyze\nvast amounts of natural language data and then use that information to generate\nresponses to user prompts. These systems are trained on massive data sets\nusing advanced machine learning algorithms to learn the patterns and structures\nof human language, and are capable of generating natural language responses to\na wide range of written inputs. Large language models are becoming increasingly\nimportant in a variety of applications such as natural language processing,\nmachine translation, code and text generation, and more.\n\nWhile this guide will focus on language models, it’s important to understand that\nthey are only one aspect under a larger generative AI umbrella. 
Other noteworthy\ngenerative AI implementations include projects such as art generation from text,\naudio and video generation, and certainly more to come in the near future.\n\n\n-----\n\n##### Extremely brief historical background and development of LLMs\n\n\n###### 1950s–1990s\nInitial attempts are made to map hard rules around languages and\nfollow logical steps to accomplish tasks like translating a sentence\nfrom one language to another.\n\nWhile this works sometimes, strictly defined rules only work for\nconcrete, well-defined tasks that the system has knowledge about.\n\n###### 1990s \nLanguage models begin evolving into statistical models and\nlanguage patterns start being analyzed, but larger-scale projects\nare limited by computing power.\n\n###### 2000s \nAdvancements in machine learning increase the complexity of\nlanguage models, and the wide adoption of the internet sees an\n\nenormous increase in available training data.\n\n###### 2012 \nAdvancements in deep learning architectures and larger data sets\nlead to the development of GPT (Generative Pre-trained Transformer).\n\n\n###### 2018\nGoogle introduces BERT (Bidirectional Encoder Representations\nfrom Transformers), which is a big leap in architecture and paves\nthe way for future large language models.\n\n###### 2020\nOpenAI releases GPT-3, which becomes the largest model at\n175B parameters and sets a new performance benchmark for\nlanguage-related tasks.\n\n###### 2022\nChatGPT is launched, which turns GPT-3 and similar models into\na service that is widely accessible to users through a web interface\nand kicks off a huge increase in public awareness of LLMs and\ngenerative AI.\n\n###### 2023\nOpen source LLMs begin showing increasingly impressive results\nwith releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna.\nGPT-4 is also released, setting a new benchmark for both parameter\nsize and performance.\n\n\n-----\n\nSECTION 2\n## Understanding Large Language Models\n\n\n##### What are language models and how do they work?\n\nLarge language models are advanced artificial intelligence systems that take\nsome input and generate humanlike text as a response. They work by first\nanalyzing vast amounts of data and creating an internal structure that models\nthe natural language data sets that they’re trained on. Once this internal\nstructure has been developed, the models can then take input in the form of\nnatural language and approximate a good response.\n\n##### If they’ve been around for so many years, why are they just now making headlines?\n\nA few recent advancements have really brought the spotlight to generative AI\nand large language models:\n\n**A D VA N C E M E N T S I N T E C H N I Q U E S**\nOver the past few years, there have been significant advancements in the\ntechniques used to train these models, resulting in big leaps in performance.\nNotably, one of the largest jumps in performance has come from integrating\nhuman feedback directly into the training process.\n\n\n**I N C R E A S E D A C C E S S I B I L I T Y**\nThe release of ChatGPT opened the door for anyone with internet access\nto interact with one of the most advanced LLMs through a simple web\ninterface. 
This brought the impressive advancements of LLMs into the\nspotlight, since previously these more powerful LLMs were only available\nto researchers with large amounts of resources and those with very deep\ntechnical knowledge.\n\n**G R O W I N G C O M P U TAT I O N A L P O W E R**\nThe availability of more powerful computing resources, such as graphics\nprocessing units (GPUs), and better data processing techniques allowed\nresearchers to train much larger models, improving the performance of\nthese language models.\n\n**I M P R O V E D T R A I N I N G D ATA**\nAs we get better at collecting and analyzing large amounts of data, the\n\nmodel performance has improved dramatically. In fact, Databricks showed\nthat you can get amazing results training a relatively small model with a\nhigh-quality data set with [Dolly 2.0](https://huggingface.co/databricks/dolly-v2-12b) (and we released the data set as well\nwith the databricks-dolly-15k [data set](http://databricks/databricks-dolly-15k) ).\n\n\n-----\n\n##### So what are organizations using large language models for?\n\nHere are just a few examples of common use cases for large language models:\n\n**C H AT B O T S A N D V I R T U A L A S S I S TA N T S**\nOne of the most common implementations, LLMs can be used by\norganizations to provide help with things like customer support,\ntroubleshooting, or even having open-ended conversations with userprovided prompts.\n\n**C O D E G E N E R AT I O N A N D D E B U G G I N G**\nLLMs can be trained on large amounts of code examples and give\nuseful code snippets as a response to a request written in natural language.\nWith the proper techniques, LLMs can also be built in a way to reference\nother relevant data that it may not have been trained with, such as a\ncompany’s documentation, to help provide more accurate responses.\n\n**S E N T I M E N T A N A LY S I S**\nOften a hard task to quantify, LLMs can help take a piece of text and gauge\nemotion and opinions. This can help organizations gather the data and\n\nfeedback needed to improve customer satisfaction.\n\n\n**L A N G U A G E T R A N S L AT I O N**\nGlobalize all your content without hours of painstaking work by simply\nfeeding your web pages through the proper LLMs and translating them to\ndifferent languages. As more LLMs are trained in other languages, quality\nand availability will continue to improve.\n\n**S U M M A R I Z AT I O N A N D PA R A P H R A S I N G**\nEntire customer calls or meetings could be efficiently summarized so that\nothers can more easily digest the content. LLMs can take large amounts of\ntext and boil it down to just the most important bytes.\n\n**C O N T E N T G E N E R AT I O N**\nStart with a detailed prompt and have an LLM develop an outline for you.\nThen continue on with those prompts and LLMs can generate a good first\ndraft for you to build off. Use them to brainstorm ideas, and ask the LLM\nquestions to help you draw inspiration from.\n\n**_Note:_** Most LLMs are _not_ trained to be fact machines. 
They know how to use\nlanguage, but they might not know who won the big sporting event last year.\nIt’s always important to fact check and understand the responses before\n\nusing them as a reference.\n\n\n**T E X T C L A S S I F I C AT I O N A N D C L U S T E R I N G**\nThe ability to categorize and sort large volumes of data enables the\nidentification of common themes and trends, supporting informed\ndecision-making and more targeted strategies.\n\n\n-----\n\nSECTION 3\n## Applying Large Language Models\n\n\nThere are a few paths that one can take when looking to apply large language\nmodels for their given use case. Generally speaking, you can break them down\ninto two categories, but there’s some crossover between each. We’ll briefly cover\nthe pros and cons of each and what scenarios fit best for each.\n\n##### Proprietary services\n\nAs the first widely available LLM powered service, OpenAI’s ChatGPT was the\nexplosive charge that brought LLMs into the mainstream. ChatGPT provides\na nice user interface (or API) where users can feed prompts to one of many\nmodels (GPT-3.5, GPT-4, and more) and typically get a fast response. These are\namong the highest-performing models, trained on enormous data sets, and are\ncapable of extremely complex tasks both from a technical standpoint, such as\ncode generation, as well as from a creative perspective like writing poetry in a\nspecific style.\n\nThe downside of these services is the absolutely enormous amount of compute\nrequired not only to train them (OpenAI has said GPT-4 cost them over $100\nmillion to develop) but also to serve the responses. For this reason, these\nextremely large models will likely always be under the control of organizations,\n\n\nand require you to send your data to their servers in order to interact with their\nlanguage models. This raises privacy and security concerns, and also subjects\nusers to “black box” models, whose training and guardrails they have no control\nover. Also, due to the compute required, these services are not free beyond a\nvery limited use, so cost becomes a factor in applying these at scale.\n\nIn summary: Proprietary services are great to use if you have very complex tasks,\nare okay with sharing your data with a third party, and are prepared to incur\ncosts if operating at any significant scale.\n\n##### Open source models\n\nThe other avenue for language models is to go to the open source community,\nwhere there has been similarly explosive growth over the past few years.\nCommunities like [Hugging Face](https://huggingface.co/) gather hundreds of thousands of models\n\nfrom contributors that can help solve tons of specific use cases such as text\ngeneration, summarization and classification. The open source community has\nbeen quickly catching up to the performance of the proprietary models, but\nultimately still hasn’t matched the performance of something like GPT-4.\n\n\n-----\n\nIt does currently take a little bit more work to grab an open source model and\nstart using it, but progress is moving very quickly to make them more accessible\nto users. 
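The open source section above talks about pulling a Hugging Face transformer model and using it directly as a Python object. As a hedged sketch, assuming a publicly available summarization checkpoint (the model name below is an arbitrary example, not one recommended by the guide), the `transformers` pipeline API makes this a few lines of Python:

```python
from transformers import pipeline

# Assumed model choice for illustration; any summarization checkpoint from the
# Hugging Face Hub could be substituted here.
summarizer = pipeline("summarization", model="sshleifer/distilbart-cnn-12-6")

text = (
    "Large language models are AI systems trained on massive data sets to learn "
    "the patterns of human language and generate natural language responses to "
    "a wide range of written inputs."
)

# The pipeline returns a list of dicts; "summary_text" holds the generated summary.
print(summarizer(text, max_length=40, min_length=10, do_sample=False)[0]["summary_text"])
```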
On Databricks, for example, we’ve made [improvements to open source](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html)\n[frameworks](https://www.databricks.com/blog/2023/04/18/introducing-mlflow-23-enhanced-native-llm-support-and-new-features.html) like MLflow to make it very easy for someone with a bit of Python\nexperience to pull any Hugging Face transformer model and use it as a Python\nobject. Oftentimes, you can find an open source model that solves your specific\nproblem that is **orders of magnitude** smaller than ChatGPT, allowing you to bring\nthe model into your environment and host it yourself. This means that you can\nkeep the data in your control for privacy and governance concerns as well as\nmanage your costs.\n\n\n##### Conclusion and general guidelines\n\nUltimately, every organization is going to have unique challenges to overcome,\nand there isn’t a one-size-fits-all approach when it comes to LLMs. As the world\nbecomes more data driven, everything, including LLMs, will be reliant on having\na strong foundation of data. LLMs are incredible tools, but they have to be used\nand implemented on top of this strong data foundation. Databricks brings both\nthat strong data foundation as well as the integrated tools to let you use and\nfine-tune LLMs in your domain.\n\n\nAnother huge upside to using open source models is the ability to fine-tune\nthem to your own data. Since you’re not dealing with a black box of a proprietary\nservice, there are techniques that let you take open source models and train\nthem to your specific data, greatly improving their performance on your\nspecific domain. We believe the future of language models is going to move\nin this direction, as more and more organizations will want full control and\nunderstanding of their LLMs.\n\n\n-----\n\nSECTION 4\n## So What Do I Do Next If I Want to Start Using LLMs?\n\n\nThat depends where you are on your journey! Fortunately, we have a few paths\nfor you.\n\nIf you want to go a little deeper into LLMs but aren’t quite ready to do it yourself,\nyou can watch one of Databricks’ most talented developers and speakers go\nover these concepts in more detail during the on-demand talk “ [How to Build](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly)\n[Your Own Large Language Model Like Dolly.](https://www.databricks.com/resources/webinar/build-your-own-large-language-model-dolly) ”\n\nIf you’re ready to dive a little deeper and expand your education and\nunderstanding of LLM foundations, we’d recommend checking out our\n[course on LLMs](https://www.edx.org/course/large-language-models-application-through-production) . You’ll learn how to develop production-ready LLM applications\nand dive into the theory behind foundation models.\n\nIf your hands are already shaking with excitement and you already have some\nworking knowledge of Python and Databricks, we’ll provide some great examples\nwith sample code that can get you up and running with LLMs right away!\n\n\n###### Getting started with NLP using Hugging Face transformers pipelines\n\n Fine-Tuning Large Language Models with Hugging Face and DeepSpeed\n\n Introducing AI Functions: Integrating Large Language Models with Databricks SQL\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. 
More than 9,000\n\norganizations worldwide — including Comcast, Condé Nast and\n\nover 50% of the Fortune 500 — rely on the Databricks Lakehouse\n\nPlatform to unify their data, analytics and AI. Databricks is\n\nheadquartered in San Francisco, with offices around the globe.\n\nFounded by the original creators of Apache Spark™, Delta Lake\n\nand MLflow, Databricks is on a mission to help data teams solve\n\nthe world’s toughest problems. To learn more, follow Databricks on\n\n[Twitter](https://twitter.com/databricks) , [LinkedIn](https://www.linkedin.com/company/databricks/) and [Facebook](https://www.facebook.com/databricksinc/) .\n\n**[START YOUR FREE TRIAL](https://databricks.com/try-databricks)**\n\n#### Contact us for a personalized demo: databricks.com/contact\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "# Building Reliable Data Lakes at Scale With Delta Lake\n\n\n-----\n\n## Contents\n\n#### Data Engineering Drivers 2\n\n Data Pipeline Key Goals 4\n\n Apache Spark™: The First Unified Analytics Engine 5\n\n Data Reliability Challenges With Data Lakes 6\n\n Delta Lake: A New Storage Layer 7\n\n Delta Lake: Key Features 8\n\n Getting Started With Delta Lake 10\n\n\n-----\n\n## Drivers\n\n#### Data Engineering Drivers\n\nData engineering professionals are needing to respond to several different drivers.\n\nChief among the drivers they face are:\n\n**Rise of Advanced Analytics** — Advanced analytics, including methods\n\nbased on machine learning techniques, have evolved to such a degree that\n\norganizations seek to derive far more value from their corporate assets.\n\n**Widespread Adoption** — Once the province of leading edge, high-tech\n\ncompanies, these advanced approaches are being adopted across a\n\nmultitude of industries from retail to hospitality to healthcare and across\n\nprivate as well as public sector organizations. This is further driving the need\n\nfor strong data engineering practices.\n\n**Regulation** — With the growth of data generation and data collection,\n\nthere is increased interest in how the data is protected and managed.\n\nRegulatory regimes such as GDPR (General Data Protection Regulation)\n\nfrom the EU and other jurisdictions mandate very specific ways in which\n\ndata must be managed.\n\n\n-----\n\n## Drivers\n\n**Technology Innovation** — The move to cloud-based analytics architectures\n\nthat is now well underway is being propelled further by innovations such as\n\nanalytics-focused chipsets, pipeline automation and the unification of data\n\nand machine learning. All these offer data professionals new approaches for\n\ntheir data initiatives.\n\n**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n\nalso subject to increasing scrutiny. There is also a greater understanding of\n\ndata as a valuable asset. 
Deriving value from data must be done in a manner\n\nthat is financially responsible and actually value adding to the enterprise and\n\nmeeting ROI hurdles.\n\n**Role Evolution** — Reflecting the importance of managing the data and\n\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\n\nmore prominent and newer roles such as Data Curator are emerging.\n\nThey must balance the needs of governance, security and democratization.\n\n\n-----\n\n## Key Goals\n\n#### Data Pipeline Key Goals\n\nMaking quality data available in a reliable manner is a major determinant of success for data\n\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\n\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\n\nresponsibility need to take account of a broad set of dependencies and requirements as they\n\ndesign and build their data pipelines.\n\nThree primary goals that data engineers typically seek to address as they work to enable the\n\nanalytics professionals in their organizations are:\n\n**Deliver quality data in less time** — When it comes to data, quality and timeliness\n\nare key. Data with gaps or errors (which can arise for many reasons) is\n\n“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n\nusers. Equally well, many applications require up-to-date information (who\n\nwants to use last night’s closing stock price or weather forecast) and are of\n\nlimited value without it.\n\n**Enable faster queries** — Wanting fast responses to queries is natural enough\n\nin today’s “New York minute,” online world. Achieving this is particularly\n\ndemanding when the queries are based on very large data sets.\n\n**Simplify data engineering at scale** — It is one thing to have high reliability and\n\nperformance in a limited, development or test environment. What matters\n\nmore is the ability to have robust, production data pipelines at scale without\n\nrequiring high operational overhead.\n\n\n-----\n\n### ™\n## Apache Spark\n\n#### Apache Spark ™ : The First Unified Analytics Engine\n\nOriginally developed at UC Berkeley in 2009, Apache Spark can be\n\nconsidered the first unified analytics engine. Uniquely bringing data\n\n\nand AI technologies together, Spark comes packaged with higher-level\n\nlibraries, including support for SQL queries, streaming data, machine\n\nlearning and graph processing. These standard libraries increase\n\ndeveloper productivity and can be seamlessly combined to create\n\n\nCustomer\nData\n\nEmails/\nWeb Pages\n\n\nClick\nStreams\n\nVideo/\nSpeech\n\n...\n\nSensor\nData (IoT)\n\n\ncomplex workflows.\n\n\n#### Big Data Processing\n\n\n#### Machine Learning\n\n\nSince its release, Apache Spark, has seen rapid adoption by\n\nenterprises across a wide range of industries. 
Internet powerhouses\n\n\nETL + SQL + Streaming MLlib + SparkR\n\n\nsuch as Netflix, Yahoo and eBay have deployed Spark at massive scale,\n\n\ncollectively processing multiple petabytes of data on clusters of over\n\n8,000 nodes making it the de facto choice for new analytics initiatives.\n\nIt has quickly become the largest open source community in big data,\n\nwith over 1000 contributors from 250+ organizations.\n\n\n##### While Spark has had a significant impact in taking data analytics to the next level, practitioners continue to face data reliability and performance challenges with their data lakes.\n\n\n-----\n\n## Data Reliability Challenges With Data Lakes\n\n\n**Failed Writes** — If a production job that is writing data experiences failures which\n\nare inevitable in large distributed environments, it can result in data corruption\n\nthrough partial or multiple writes. What is needed is a mechanism that is able to\n\nensure that either a write takes place completely or not at all (and not multiple times,\n\nadding spurious data). Failed jobs can impose a considerable burden to recover\n\nto a clean state.\n\n\n**Schema Mismatch** — When ingesting content from multiple sources, typical of\n\nlarge, modern big data environments, it can be difficult to ensure that the same\n\ndata is encoded in the same way i.e., the schema matches. A similar challenge\n\narises when the formats for data elements are changed without informing the\n\ndata engineering team. Both can result in low quality, inconsistent data that\n\nrequires cleaning up to improve its usability. The ability to observe and enforce\n\nschema would serve to mitigate this.\n\n\n**Lack of Consistency** — In a complex big data environment, one may be interested\n\nin considering a mix of both batch and streaming data. Trying to read data while\n\nit is being appended to provides a challenge since on the one hand there is a\n\ndesire to keep ingesting new data while on the other hand anyone reading the\n\ndata prefers a consistent view. This is especially an issue when there are multiple\n\nreaders and writers at work. It is undesirable and impractical, of course, to\n\nstop read access while writes complete or stop write access while reads are\n\nin progress.\n\n\n-----\n\n## Delta Lake: A New Storage Layer\n\n[Delta Lake](https://delta.io/) is an open source storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions, scalable metadata handling, and unifies\n\nstreaming and batch data processing. Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs. Raw data is ingested\n\nfrom various batch and streaming input sources. Simple, reliable data pipelines help create a curated data lake containing tables of differing degrees of\n\nrefinement based on business needs. The data in these tables is then made available via the standard Spark APIs or special connectors for various use cases\n\nsuch as machine learning, SQL analytics or feeding to a data warehouse.\n\nStreaming\n\n###### Analytics and Machine Learning\n\n\nBatch\n\n\nIngestion Tables Refined Tables\n(Bronze) (Silver)\n\n\nFeature/Agg Data Store\n(Gold)\n\n\n###### Your Existing Data Lake\n\n\n-----\n\n## Delta Lake: Key Features\n\n\n**ACID Transactions —** Data lakes typically have multiple data pipelines reading\n\nand writing data concurrently, and data engineers have to go through a tedious\n\nprocess to ensure data integrity, due to the lack of transactions. 
Delta Lake\n\nbrings ACID transactions to your data lakes. It provides serializability, the\n\n\n**Scalable Metadata Handling —** In big data, even the metadata itself can be “big\n\ndata.” Delta Lake treats metadata just like data, leveraging Spark’s distributed\n\nprocessing power to handle all its metadata. As a result, Delta Lake can handle\n\npetabyte-scale tables with billions of partitions and files at ease.\n\n\nstrongest level of isolation level.\n\n\n**Time Travel (data versioning) —** Delta Lake provides snapshots of data enabling\n\ndevelopers to access and revert to earlier versions of data for audits, rollbacks or\n\nto reproduce experiments. For further details, please see this [documentation](https://www.google.com/url?q=https://docs.delta.io/latest/delta-batch.html%23-deltatimetravel&sa=D&source=editors&ust=1666305658154469&usg=AOvVaw0Zh1svr9wsqkIDKGQTgtLh) .\n\n\n**Schema Enforcement —** Delta Lake provides the ability to specify your schema\n\nand enforce it. This helps ensure that the data types are correct and required\n\ncolumns are present, preventing bad data from causing data corruption.\n\n\n-----\n\n## Delta Lake: Key Features\n\nParquet\n\n\n**Open Format —** All data in Delta Lake is stored in Apache Parquet format,\n\nenabling Delta Lake to leverage the efficient compression and encoding schemes\n\nthat are native to Parquet.\n\n**Unified Batch and Streaming Source and Sink** — A table in Delta Lake is both a\n\nbatch table, as well as a streaming source and sink. Streaming data ingest, batch\n\nhistoric backfill, and interactive queries all just work out of the box.\n\n\n**Schema Evolution —** Big data is continuously changing. Delta Lake\n\nenables you to make changes to a table schema that can be applied\n\nautomatically, without the need for cumbersome DDL.\n\n**100% Compatible With Apache Spark API —** Developers can use Delta\n\nLake with their existing data pipelines with minimal change as it is fully\n\ncompatible with Spark, the commonly used big data processing engine.\n\n\n-----\n\n## Getting Started With Delta Lake\n\n**Getting started with Delta Lake is easy. Specifically, to create a Delta table simply specify Delta instead of using Parquet.**\n\n\n#### Instead of parquet ...\n```\ndataframe\n.write\n.format(“ parquet ”)\n.save(“/data”)\n\n```\n\n#### … simply say delta\n```\ndataframe\n.write\n.format(“ delta ”)\n.save(“/data”)\n\n```\n\n##### Learn more about Delta Lake :\n\n[Delta Lake Blogs](https://delta.io/blog)\n\nDelta Lake Tutorials\n\n[Delta Lake Integrations](https://delta.io/integrations/)\n\n**For more information, please refer to the** **[documentation](https://docs.delta.io/latest/index.html)** **.**\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "#### eBook\n\n# The CDP Build vs Buy Guide:\n\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\n\n\n-----\n\n## The Need for a Customer Data Platform\n\n\nOrganizations need to deliver personalized experiences to their customers to stay ahead\nof the curve — that means they need a customer data platform (CDP). Through a CDP, data\nfrom every touch point, along with third-party information, is brought together to provide\na unified view of the customer. 
This enables your marketing team to analyze, identify and\nactivate customers with targeted content.\n\nThe key question for all IT teams at these organizations is whether to build or to buy.\n\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\nfastest path to a solution.\n\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\nexisting marketing and analytics systems.. The cost of adding another system to the\nlandscape and the redundancy of sensitive customer data creates a governance challenge\nthat has immediate consequences.\n\n**Critical IT Needs** **Critical Business Needs**\n\n\nKeep control of data access and\ngovernance; ability to architecture a\ncustomer data stack with decisions on\nwhere data is stored and where queries\nare executed\n\n\nGet customer data access via a no-code\ninterface to generate insights; build customer\nexperiences and activate data within\nbusiness applications\n\n\n-----\n\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\nside or the other unaddressed — which is why so many organizations who have built a CDP\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\n\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\n**both sides of the debate and provide organizations a third choice of both building and**\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\nthe business with no-code and ease of use interface along with the flexibility and centralized\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\nbuying, we’ve opened the door to finding the right balance of approaches for our customer\norganizations, helping organizations find greater success in their personalization journey.\n\n**“We made an attempt to internally build a CDP platform and while we**\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\n**or offer a campaign interface to our product marketers that could empower**\n**them to create and manage those journeys. It was going to take at least two**\n**years for us to build all of that functionality in house.”**\n\n– Sravan Gupta, Senior Manager of GTM Systems, Atlassian\n\n\n-----\n\n## Combining the Build and Buy Approaches\n\n\nBringing together the best of build and buy involves the deployment of the CDP alongside or\nwithin the lakehouse platform. There are three approaches to this:\n\n**Bundled** **Composable**\n\n**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\n\n\nCompute\n\nStorage\n\n\nCompute\n\nStorage\n(Local & Views)\n\n\nQuery\nVirtualization\n\nMetadata\n\n\nData Copy\n\n\nLakehouse\n\nStorage\n\n\nLakehouse\n\n\nLakehouse\n\n\nCompute Compute\n\nStorage Storage\n\n\n-----\n\nDeployment Type\n\n**Bundled**\n\n**Composable –**\n**Hybrid**\n\n**Composable –**\n**Lakehouse-Only**\n\n\nDescription\n\nThe CDP and the lakehouse are managed as two separate systems. 
Connectors in either system (as well as\nthird-party tools) allow data to be exchanged, typically as part of an ad hoc or batch process. This approach\nallows the organization to leverage the functionality of both systems but data is duplicated making governance\nan on-going concern.\n\nThe CDP and the lakehouse are managed as two separate systems, but deeper integrations between the two\nallow the organization to decide within which system a specific dataset should reside. Real-time integrations\nbetween the systems allow CDP users to select information assets in the lakehouse and generate queries\nspanning data on either side of the platform divide. This approach minimizes the need for data duplication\nwhich simplifies data governance, even though it must be implemented within two separate systems.\n\nAll CDP information assets reside within the lakehouse. User interfaces built on other technologies, directly\ninteract with the lakehouse for access to data. This approach minimizes redundancy and allows organizations\nto implement a centralized data governance strategy for all consumers of customer-relevant data.\n\n\n-----\n\n## Deployment Architectures \n\n\nThe choice of which of these deployment architectures is best depends on the functional\nrequirements of a specific organization. Each has its benefits, and in the case of parallel\nand federated deployments, organizations can easily transition between deployment\narchitectures over time. The following table captures many of the typical benefits\nassociated with the different deployment architectures.\n\n\nBundled CDP\nDeployment Composable CDPHybrid Composable CDPLakehouse-Only\n\n\nTypical\nUser\n\n**IT**\n\n\nComponent\n\nDigital Touchpoints\n\nData Modeling\n\nIdentity Resolution\n\nData Governance\n\n\nDescription\n\nCollect and integrate\ndata from digital\nchannels (website,\napp, etc.)\n\nUnify and model data\nto make it usable by\nother applications\n\nDeduplicate records to\nbuild a private ID graph\nwith a single view of\nthe customer\n\nControl data access\nand permitted actions\non the data\n\n\nIncluded with CDP\nvia a tag\n\nSometimes included\nwith CDP\n\nPrimarily with CDP\nor other tools (MDM,\nLakehouse)\n\nIncluded with CDP\n\n\nWorks with any digital\ntouchpoint collection\nsystem\n\nEither within the CDP\nor in Lakehouse via\nreal-time integration\n\nCDP, MDM, or\nLakehouse\n\nBoth CDP and\nLakehouse\n\n\nWorks with any digital\ntouchpoint collection\nsystem\n\nUnified environment with\nminimal data replication\nin and centralized\ngovernance in Lakehouse\n\nBuilt with Lakehouse and\nadditional tools\n\nManaged centrally from\nLakehouse\n\n\n-----\n\nBundled CDP\nDeployment Composable CDPHybrid Composable CDPLakehouse-Only\n\n\nTypical\nUser\n\n**Business**\n\n\nComponent\n\nPredictive Scoring\n\nMarketing Audience\nSegments\n\nCustomer Journey\nOrchestration\n\nData Activations\n\nAnalytics\n\n\nDescription\n\nCreate and execute\nmodels predicting\nuser behaviors such as\npurchase or churn\n\nUse a self-service UI\nto build rule-based\nor model-based\naudiences\n\nDefine and optimize\nthe customer journey\nand interactions with\nthe brand across every\nchannel and every\nphase of the customer\nlifecycle\n\nIntegrate seamlessly\nwith delivery systems\nfor both inbound and\noutbound customer\nexperiences\n\nUnderstand audience\nand customer journey\nperformance\n\n\nIncluded with CDP\nwith supplement\nscoring from\nLakehouse\n\nIncluded with CDP\n\nSometimes included\nwith CDP\n\nIncluded with CDP\n\nSometimes 
included\nwith CDP\n\n\nCDP, or automatically\npresent with Lakehouse\n\nIncluded with CDP\n\nCDP, marketing\nautomation, or\nadditional tools\n\nIncluded with CDP\n\nSometimes included\nwith CDP or built\nwith Lakehouse and\nadditional tools\n\n\nAutomatically present\nwith Lakehouse\n\nIncluded with CDP\n\nCDP, marketing\nautomation, or\nadditional tools\n\nCDP, or additional tools\n\nBuilt with Lakehouse\nand additional tools\n\n\n-----\n\n## About Databricks\n\nDatabricks is the data and AI company. More than 9,000 organizations worldwide —\nincluding Comcast, Condé Nast, H&M, and over 50% of the Fortune 500 — rely on\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\nis headquartered in San Francisco, with offices around the globe. Founded by the\noriginal creators of Apache SparkTM, Delta Lake and MLflow, Databricks is on a\nmission to help data teams solve the world’s toughest problems.\n\n## About ActionIQ\n\nAIQ brings order to CX chaos. Our Customer Experience Hub empowers\neveryone to be a CX champion by giving business teams the freedom to explore\nand action on customer data while helping technical teams regain control of\nwhere data lives and how it’s used.\n\n**[Get in touch](https://www.actioniq.com/get-started/)** with our experts to learn more.\n\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf", + "2024-09-19T16:57:20Z" + ], + [ + "-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----\n\n-----", + "SUCCESS", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf", + "2024-09-19T16:57:19Z" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "content", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "parser_status", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "doc_uri", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "last_modified", + "type": "\"timestamp\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "from cookbook.config.data_pipeline import DataPipelineConfig\n", + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "\n", + "datapipeline_config: DataPipelineConfig= load_serializable_config_from_yaml_file('./configs/data_pipeline_config.yaml')\n", + "\n", + "source_documents = spark.table(datapipeline_config.output.parsed_docs_table)\n", + "\n", + "display(source_documents.toPandas())" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1e11493e-9bda-4895-a612-e8812a2b4ace", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ Run the synthetic evaluation data generation\n", + "\n", + "Optionally, you can customize the guidelines to guide the synthetic data generation. 
By default, guidelines are not applied - to apply the guidelines, uncomment `guidelines=guidelines` in the `generate_evals_df(...)` call. See our [documentation](https://docs.databricks.com/en/generative-ai/agent-evaluation/synthesize-evaluation-set.html) for more details." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a7cb950a-84b1-4e1d-a7fb-5179a0aa69de", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "c4436835e5f148ada2a48849c83f068a", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Generating evaluations: 0%| | 0/10 evals generated [Elapsed: 00:00, Remaining: ?]" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
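For reference, a minimal sketch of what the generation call described above could look like, assuming the `generate_evals_df` helper from the `databricks-agents` package and the `source_documents` DataFrame loaded earlier. The `guidelines` keyword mirrors the wording in this notebook and the guideline text is only an example, so check the linked documentation for the exact signature in your version:

```python
from databricks.agents.evals import generate_evals_df

# Optional guidelines to steer question style and topics (hypothetical example text).
guidelines = """
Questions should be answerable from the Databricks documentation
and phrased the way a data engineer would ask them.
"""

evals = generate_evals_df(
    source_documents.toPandas(),  # expects `content` and `doc_uri` columns
    num_evals=10,                 # number of synthetic evaluation rows to generate
    # guidelines=guidelines,      # uncomment to apply the guidelines above
)
display(evals)
```

The table that follows shows the synthetic evaluation rows produced by this step.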
request_id | request | expected_retrieved_context | expected_facts | source_type | source_id
de1daac1a320379ce055bdc8b8342a2d7ca8d1ea08483081801f8219f41dc69dList(List(List(What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?, user)))List(List(“In today’s experience-driven world, the most beloved brands are the ones that know their customers. Customers are loyal to brands that recognize their needs and preferences — and tailor user journeys and engagements accordingly.\n", + "\n", + "A study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer. And as organizations pursue omnichannel excellence, these same high expectations of online experiences also extend to brick-and-mortar locations — revealing for many merchants that personalized engagement is fundamental to attracting customers and expanding share of wallet.\n", + "\n", + "But achieving a 360-degree view of your customers to serve personalized experiences requires integrating various types of data — including demographics, behavioral and transactional — to develop robust profiles. This guide focuses on six actionable strategic pillars for businesses to leverage automation, real-time data, AI-driven analysis and well-tuned ML models to architect and deliver customized customer experiences at every touch point.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf))List(76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf
4b452a4426892dea5c35302c50dc70d62c0b2993f478af59a42b59d7c258bfa0List(List(List(What are two key challenges mentioned for predictive maintenance in government agencies?, user)))List(List(##### Overview\n", + "\n", + "**Integrating unstructured data**\n", + "Equipment data doesn’t just come in the form of IoT data. Agencies can gather rich unstructured signals like audio, visual (e.g., video inspections) and text (e.g., maintenance logs). Most legacy data architectures are unable to integrate structured and unstructured data sources.\n", + "\n", + "**Operationalizing machine learning**\n", + "Most agencies lack the advanced analytics tools needed to build models that can predict potential equipment failures. Those that do typically have their data scientists working in a siloed set of tools, resulting in unnecessary data replication and inefficient workflows., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf))List(Difficulty integrating structured and unstructured data sources due to legacy data architectures., Inefficient workflows caused by a lack of advanced analytics tools and siloed environments for data scientists.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf
6d1c05783fb5945cc9b121919eabdc2194c9c64809821e3c30b7f758a4d12a40List(List(List(What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?, user)))List(List(```\n", + "Our most popular use case is natural language processing\n", + "(NLP), a rapidly growing field that enables businesses to\n", + "gain value from unstructured textual data. This opens the\n", + "door for users to accomplish tasks that were previously\n", + "too abstract for code, such as summarizing content or\n", + "extracting sentiment from customer reviews. In our data\n", + "set, 49% of libraries used are associated with NLP. LLMs\n", + "also fall within this bucket. Given the innovations launched\n", + "in recent months, we expect to see NLP take off even\n", + "more in coming years as it is applied to use cases like\n", + "chatbots, research assistance, fraud detection, content\n", + "generation and more.\n", + "```, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf))List(49% of specialized Python libraries in the data set are associated with NLP., Examples of tasks enabled by NLP include summarizing content, extracting sentiment from customer reviews, chatbots, research assistance, fraud detection, and content generation.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf
8fc168f55c01c3d4059869879a9e54e8601faef19e46f011ac239c44dbe72f40List(List(List(Why is real-time data crucial for retail operations, and what problems do legacy systems cause?, user)))List(List(“Retailers need real-time data to support these decisions, but legacy systems are limited to data that’s hours or days old. When seconds matter, only the Lakehouse delivers better decisions [...] most retailers still rely on legacy data systems, which impedes their ability to scale these innovations. Unfortunately, most legacy systems are only able to process information in hours or days. The delays caused by waiting for data are leading to significant risks and costs for the industry.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf))List(Real-time data enables immediate decision-making., Real-time data enables better decision-making in critical moments., Legacy systems process outdated data., Legacy systems cause delays., Legacy systems lead to risks for the retail industry., Legacy systems lead to costs for the retail industry.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf
66725804819c75f5e3005072cb81414f01272d64b1b0a8ea89a58392599b1ff7List(List(List(What are the key features and advantages of the lakehouse pattern?, user)))List(List(“The lakehouse pattern represents a paradigm shift from traditional on-premises data warehouse systems that are expensive and complex to manage. It uses an open data management architecture that combines the flexibility, cost-efficiency, and scale of data lakes with the data management and ACID semantics of data warehouses. A lakehouse pattern enables data transformation, cleansing, and validation to support both business intelligence and machine learning (ML) users on all data. Lakehouse is cloud-centric and unifies a complete up-to-date data set for teams, allowing collaboration across an organization.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf))List(The lakehouse pattern has an open data management architecture., It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics., It supports data transformation, cleansing, and validation., The lakehouse pattern is cloud-centric., It enhances support for both business intelligence and machine learning., It is cost-efficient., It offers an up-to-date unified data set., It improves collaboration across the organization.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf
1373db51df7476c934e04796eaceed4d4475d7b7a70efcb3405b121c71e96923List(List(List(What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?, user)))List(List(Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine. And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.\n", + "\n", + "Some of the primary metrics that are typically tracked in game telemetry include:\n", + "\n", + "- **Player engagement:** Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.\n", + "- **Game progress:** Monitor player progress through different levels and milestones in the game.\n", + "- **In-game purchases:** Track the number and value of in-game purchases made by players.\n", + "- **Player demographics:** Collect demographic information about players, such as age, gender, location, and device type.\n", + "- **Session length:** Monitor the length of each player session, and how often players return to the game.\n", + "- **Retention:** Track the percentage of players who return to the game after their first session.\n", + "- **User Acquisition:** Track the number of new players acquired through different marketing channels., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf))List(Game telemetry is data collected about player behavior and interactions within a video game., The data is primarily sourced from the game engine., Primary metrics tracked in game telemetry include:\n", + " - player engagement\n", + " - game progress\n", + " - in-game purchases\n", + " - player demographics\n", + " - session length\n", + " - retention\n", + " - user acquisition)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf
3b231daee5434db054e2ee8b4aee9b4edba19aa8886c0d491daa1b36b743142fList(List(List(What are some of the common problems faced by data lakes according to the document?, user)))List(List(**Challenges with data lakes**\n", + "Data lakes are a common element within modern data architectures. They serve as a\n", + "central ingestion point for the plethora of data that organizations seek to gather and\n", + "mine. While a good step forward in getting to grips with the range of data, they run\n", + "into the following common problems:\n", + "\n", + "**1. Reading and writing into data lakes is not reliable.** Data engineers often run into\n", + "the problem of unsafe writes into data lakes that cause readers to see garbage\n", + "data during writes. They have to build workarounds to ensure readers always see\n", + "consistent data during writes.\n", + "\n", + "**2. The data quality in data lakes is low.** Dumping unstructured data into a data\n", + "lake is easy, but this comes at the cost of data quality. Without any mechanisms\n", + "for validating schema and the data, data lakes suffer from poor data quality. As a\n", + "consequence, analytics projects that strive to mine this data also fail.\n", + "\n", + "**3. Poor performance with increasing amounts of data.** As the amount of data\n", + "that gets dumped into a data lake increases, the number of files and directories\n", + "also increases. Big data jobs and query engines that process the data spend a\n", + "significant amount of time handling the metadata operations. This problem is more\n", + "pronounced in the case of streaming jobs or handling many concurrent batch jobs.\n", + "\n", + "**4. Modifying, updating or deleting records in data lakes is hard.** Engineers need to\n", + "build complicated pipelines to read entire partitions or tables, modify the data and\n", + "write them back. Such pipelines are inefficient and hard to maintain., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf))List(Unreliable reading and writing operations, Low data quality due to the lack of validation mechanisms, Poor performance with increasing data volume, Difficulty in modifying, updating, or deleting records)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf
9673989eb3b8242fc0a48d6338f31191260dd7cf6c7eacb26f2ed1512af803a2List(List(List(What new opportunities can data sharing create for organizations looking to generate additional revenue?, user)))List(List(**Key benefits of data sharing**\n", + "\n", + "As you can see from the use cases described above, there are many benefits of data sharing, including:\n", + "\n", + "**Greater collaboration with existing partners.** In today’s hyper-connected digital economy, no single organization can advance its business objectives without partnerships. Data sharing helps solidify existing partnerships and can help organizations establish new ones.\n", + "**Ability to generate new revenue streams.** With data sharing, organizations can generate new revenue streams by offering data products or data services to their end consumers.\n", + "**Ease of producing new products, services or business models.** Product teams can leverage both first-party data and third-party data to refine their products and services and expand their product/service catalog.\n", + "**Greater efficiency of internal operations.** Teams across the organization can meet their business goals far more quickly when they don’t have to spend time figuring out how to free data from silos. When teams have access to live data, there’s no lag time between the need for data and the connection with the appropriate data source., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf))List(Data sharing can enable organizations to offer data products., Data sharing can enable organizations to offer data services.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf
21866cbed9a5ba0daafc9367a06f6679f7e6290dd05b59cfd45d36fdbc8fbe73List(List(List(Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?, user)))List(List(**EBOOK**\n", + "\n", + "## The Big Book of Data Engineering 2nd Edition\n", + "\n", + "A collection of technical\n", + "blogs, including code\n", + "samples and notebooks\n", + "\n", + "##### With all-new content\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Contents\n", + "\n", + "**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n", + "\n", + "**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n", + "\n", + "**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n", + "\n", + "**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n", + "\n", + "**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n", + "\n", + "**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n", + "\n", + "**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n", + "\n", + "**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n", + "\n", + "**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n", + "\n", + "**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n", + "\n", + "**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n", + "\n", + "**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n", + "\n", + "**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n", + "\n", + "**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n", + "\n", + "**4 . 
1** Akamai .................................................................................................................................................................................................... 77\n", + "\n", + "**4 . 2** Grammarly ........................................................................................................................................................................................... 80\n", + "\n", + "**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n", + "\n", + "**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n", + "\n", + "**4 . 5** Rivian .................................................................................................................................................................................................... 90\n", + "\n", + "**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 01\n", + "\n", + "\n", + "### Introduction to Data Engineering on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "Organizations realize the value data plays as a strategic asset for various\n", + "business-related initiatives, such as growing revenues, improving the customer\n", + "experience, operating efficiently or improving a product or service. However,\n", + "accessing and managing data for these initiatives has become increasingly\n", + "complex. Most of the complexity has arisen with the explosion of data volumes\n", + "and data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\n", + "to increase, 73% of the data goes unused for analytics or decision-making. In\n", + "order to try and decrease this percentage and make more data usable, data\n", + "engineering teams are responsible for building data pipelines to efficiently and\n", + "reliably deliver data. 
But the process of building these complex data pipelines\n", + "comes with a number of difficulties:\n", + "\n", + "**•** In order to get data into a data lake, data engineers are required\n", + "to spend immense time hand-coding repetitive data ingestion tasks\n", + "\n", + "**•** Since data platforms continuously change, data engineers\n", + "spend time building and maintaining, and then rebuilding, complex\n", + "scalable infrastructure\n", + "\n", + "**•** As data pipelines become more complex, data engineers are\n", + "required to find reliable tools to orchestrate these pipelines\n", + "\n", + "**•** With the increasing importance of real-time data, low latency data\n", + "pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "**•** Finally, with all pipelines written, data engineers need to constantly\n", + "focus on performance, tuning pipelines and architectures to meet SLAs\n", + "\n", + "\n", + "**How can Databricks help?**\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The Lakehouse Platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "Lakehouse Platform\n", + "\n", + "**One platform to support multiple personas**\n", + "\n", + "\n", + "**BI & Data**\n", + "**Warehousing**\n", + "\n", + "\n", + "**Data**\n", + "**Engineering**\n", + "\n", + "\n", + "**Data**\n", + "**Streaming**\n", + "\n", + "\n", + "**Data**\n", + "**Science & ML**\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "\n", + "**Unity Catalog**\n", + "**Fine-grained governance for data and AI**\n", + "\n", + "**Delta Lake**\n", + "**Data reliability and performance**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "All Raw Data (Logs, Texts, Audio, Video, Images)\n", + "\n", + "\n", + "Figure 1\n", + "The Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key differentiators for successful data engineering**\n", + "**with Databricks**\n", + "\n", + "By simplifying on a lakehouse architecture, data engineers need an\n", + "enterprise-grade and enterprise-ready approach to building data pipelines.\n", + "To be successful, a data engineering solution team must embrace these eight\n", + "key differentiating capabilities:\n", + "\n", + "**Data ingestion at scale**\n", + "With the ability to ingest petabytes of data with auto-evolving schemas,\n", + "data engineers can deliver fast, reliable, scalable and automatic data for\n", + "analytics, data science or machine learning. 
This includes:\n", + "\n", + "**•** Incrementally and efficiently processing data as it arrives\n", + "from files or streaming sources like Kafka, DBMS and NoSQL\n", + "\n", + "**•** Automatically inferring schema and detecting column\n", + "changes for structured and unstructured data formats\n", + "\n", + "**•** Automatically and efficiently tracking data as it arrives with\n", + "\n", + "no manual intervention\n", + "\n", + "**•** Preventing data loss by rescuing data columns\n", + "\n", + "\n", + "**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified orchestration of data workflows**\n", + "Simple, clear and reliable orchestration of data processing tasks for data,\n", + "analytics and machine learning pipelines with the ability to run multiple\n", + "non-interactive tasks as a directed acyclic graph (DAG) on a Databricks\n", + "compute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\n", + "in a DAG using Databricks Workflows, an orchestration tool included in the\n", + "lakehouse with no need to maintain or pay for an external orchestration service.\n", + "\n", + "**•** Easily create and manage multiple tasks with dependencies via UI,\n", + "API or from your IDE\n", + "\n", + "**•** Have full observability to all workflow runs and get alerted when\n", + "tasks fail for fast troubleshooting and efficient repair and rerun\n", + "\n", + "**•** Leverage high reliability of 99.95% uptime\n", + "\n", + "**•** Use performance optimization clusters that parallelize jobs and\n", + "minimize data movement with cluster reuse\n", + "\n", + "**Data quality validation and monitoring**\n", + "Improve data reliability throughout the data lakehouse so data teams can\n", + "confidently trust the information for downstream initiatives by:\n", + "\n", + "**•** Defining data quality and integrity controls within the pipeline\n", + "with defined data expectations\n", + "\n", + "**•** Addressing data quality errors with predefined policies\n", + "(fail, drop, alert, quarantine)\n", + "\n", + "**•** Leveraging the data quality metrics that are captured, tracked\n", + "and reported for the entire data pipeline\n", + "\n", + "\n", + "Data\n", + "Sources\n", + "\n", + "Data\n", + "Warehouses\n", + "\n", + "On-premises\n", + "Systems\n", + "\n", + "SaaS\n", + "Applications\n", + "\n", + "Machine &\n", + "Application Logs\n", + "\n", + "Application\n", + "Events\n", + "\n", + "Mobile & IoT\n", + "Data\n", + "\n", + "\n", + "Cloud\n", + "Storage\n", + "\n", + "Messag\n", + "e Buses\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "**Workflows** for end-to-end orchestration\n", + "\n", + "\n", + "Real-Time BI Apps\n", + "\n", + "Real-Time AI Apps\n", + "\n", + "\n", + "Real-Time Analytics with\n", + "**Databricks SQL**\n", + "\n", + "Real-Time Machine Learning\n", + "with\n", + "**Databricks ML**\n", + "\n", + "\n", + "Streaming ETL with\n", + "**Delta Live Tables**\n", + "\n", + "\n", + "Predictive\n", + "Maintenance\n", + "\n", + "\n", + "Personalized\n", + "Offers\n", + "\n", + "\n", + "Patient\n", + "Diagnostics\n", + "\n", + "\n", + "Real-Time Operational\n", + "Apps\n", + "\n", + "\n", + "Real-Time Applications with\n", + "**Spark Structured Streaming**\n", + "\n", + "**Photon** for lightning-fast data processing\n", + "\n", + "**Unity Catalog** for data governance and sharing\n", + "\n", + "**Delta Lake** for open and reliable data storage\n", + "\n", + "\n", + "Alerts Detection Fraud\n", + "\n", + "\n", + "Dynamic\n", + "Pricing\n", + "\n", + "\n", + "©2023 Databricks Inc. 
— All rights reserved\n", + "\n", + "Figure 2\n", + "A unified set of tools for real-time data processing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fault tolerant and automatic recovery**\n", + "Handle transient errors and recover from most common error conditions\n", + "occurring during the operation of a pipeline with fast, scalable automatic\n", + "recovery that includes:\n", + "\n", + "**•** Fault tolerant mechanisms to consistently recover the state of data\n", + "\n", + "**•** The ability to automatically track progress from the source with\n", + "checkpointing\n", + "\n", + "**•** The ability to automatically recover and restore the data pipeline state\n", + "\n", + "**Data pipeline observability**\n", + "Monitor overall data pipeline status from a dataflow graph dashboard and\n", + "visually track end-to-end pipeline health for performance, quality and latency.\n", + "Data pipeline observability capabilities include:\n", + "\n", + "**•** A high-quality, high-fidelity lineage diagram that provides visibility\n", + "into how data flows for impact analysis\n", + "\n", + "**•** Granular logging with performance and status of the data pipeline\n", + "at a row level\n", + "\n", + "**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n", + "\n", + "\n", + "**Automatic deployments and operations**\n", + "Ensure reliable and predictable delivery of data for analytics and machine\n", + "learning use cases by enabling easy and automatic data pipeline deployments\n", + "and rollbacks to minimize downtime. Benefits include:\n", + "\n", + "**•** Complete, parameterized and automated deployment for the\n", + "continuous delivery of data\n", + "\n", + "**•** End-to-end orchestration, testing and monitoring of data pipeline\n", + "deployment across all major cloud providers\n", + "\n", + "**Migrations**\n", + "Accelerating and de-risking the migration journey to the lakehouse, whether\n", + "from legacy on-prem systems or disparate cloud services.\n", + "\n", + "The migration process starts with a detailed discovery and assessment to\n", + "get insights on legacy platform workloads and estimate migration as well as\n", + "Databricks platform consumption costs. Get help with the target architecture\n", + "and how the current technology stack maps to Databricks, followed by a\n", + "phased implementation based on priorities and business needs. Throughout\n", + "this journey companies can leverage:\n", + "\n", + "**•** Automation tools from Databricks and its ISV partners\n", + "\n", + "**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n", + "\n", + "**•** Databricks Professional Services and training\n", + "\n", + "This is the recommended approach for a successful migration, whereby\n", + "customers have seen a 25-50% reduction in costs and 2-3x faster time to value\n", + "for their use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified governance**\n", + "With Unity Catalog, data engineering and governance teams benefit from an\n", + "enterprisewide data catalog with a single interface to manage permissions,\n", + "centralize auditing, automatically track data lineage down to the column level,\n", + "and share data across platforms, clouds and regions. 
Benefits:\n", + "\n", + "**•** Discover all your data in one place, no matter where it lives,\n", + "and centrally manage fine-grained access permissions using an\n", + "ANSI SQL-based interface\n", + "\n", + "**•** Leverage automated column-level data lineage to perform impact\n", + "analysis of any data changes across the pipeline and conduct\n", + "root cause analysis of any errors in the data pipelines\n", + "\n", + "**•** Centrally audit data entitlements and access\n", + "\n", + "**•** Share data across clouds, regions and data platforms,\n", + "while maintaining a single copy of your data in your cloud storage\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 3\n", + "The Databricks Lakehouse Platform integrates with a large collection of technologies\n", + "\n", + "\n", + "**A rich ecosystem of data solutions**\n", + "The Databricks Lakehouse Platform is built on open source technologies and\n", + "uses open standards so leading data solutions can be leveraged with anything\n", + "you build on the lakehouse. A large collection of technology partners make it\n", + "easy and simple to integrate the technologies you rely on when migrating to\n", + "Databricks and to know you are not locked into a closed data technology stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Conclusion**\n", + "\n", + "As organizations strive to become data-driven, data engineering is a focal\n", + "point for success. To deliver reliable, trustworthy data, data engineers shouldn’t\n", + "need to spend time manually developing and maintaining an end-to-end\n", + "ETL lifecycle. Data engineering teams need an efficient, scalable way to\n", + "simplify ETL development, improve data reliability and manage operations.\n", + "\n", + "As described, the eight key differentiating capabilities simplify the\n", + "management of the ETL lifecycle by automating and maintaining all data\n", + "dependencies, leveraging built-in quality controls with monitoring and by\n", + "providing deep visibility into pipeline operations with automatic recovery.\n", + "Data engineering teams can now focus on easily and rapidly building reliable\n", + "end-to-end production-ready data pipelines using only SQL or Python\n", + "for batch and streaming that deliver high-value data for analytics, data\n", + "science or machine learning.\n", + "\n", + "\n", + "**Follow proven best practices**\n", + "\n", + "In the next section, we describe best practices for data engineering\n", + "end-to end use cases drawn from real-world examples. 
From data ingestion\n", + "and real-time processing to analytics and machine learning, you’ll learn\n", + "how to translate raw data into actionable data.\n", + "\n", + "As you explore the rest of this guide, you can find data sets and code\n", + "samples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\n", + "get your hands dirty as you explore all aspects of the data lifecycle on the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### Guidance and Best Practices\n", + "\n", + "**2.1** Top 5 Databricks Performance Tips\n", + "\n", + "**2.2** How to Profile PySpark\n", + "\n", + "**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n", + "\n", + "**2.4** Streaming in Production: Collected Best Practices\n", + "\n", + "**2.5** Streaming in Production: Collected Best Practices, Part 2\n", + "\n", + "**2.6** Building Geospatial Data Products\n", + "\n", + "**2.7** Data Lineage With Unity Catalog\n", + "\n", + "**2.8** Easy Ingestion to Lakehouse With COPY INTO\n", + "\n", + "**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n", + "\n", + "**2.10** Best Practices for Cross-Government Data Sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.1\n", + "\n", + "**Top 5 Databricks Performance Tips**\n", + "\n", + "by **B R YA N S M I T H** and **R O B S A K E R**\n", + "\n", + "March 10, 2022\n", + "\n", + "\n", + "As solutions architects, we work closely with customers every day to help them\n", + "get the best performance out of their jobs on Databricks — and we often end\n", + "up giving the same advice. It’s not uncommon to have a conversation with a\n", + "customer and get double, triple, or even more performance with just a few\n", + "tweaks. So what’s the secret? How are we doing this? Here are the top 5 things\n", + "we see that can make a huge impact on the performance customers get\n", + "from Databricks.\n", + "\n", + "Here’s a TLDR:\n", + "\n", + "**•** **Use larger clusters.** It may sound obvious, but this is the number\n", + "one problem we see. It’s actually not any more expensive to use a large\n", + "cluster for a workload than it is to use a smaller one. It’s just faster.\n", + "If there’s anything you should take away from this article, it’s this.\n", + "\n", + "Read section 1. Really.\n", + "\n", + "**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\n", + "to learn more. You won’t regret it.\n", + "\n", + "\n", + "\n", + "**•** **Clean out your configurations** . Configurations carried from one\n", + "Apache Spark™ version to the next can cause massive problems. Clean up!\n", + "Read section 3 to learn more.\n", + "\n", + "**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\n", + "correctly, if at all. See Section 4 to learn more.\n", + "\n", + "**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\n", + "you’re writing Spark code, jump to section 5.\n", + "\n", + "**•** **Bonus tip! 
Table design is super important** . We’ll go into this in a future\n", + "blog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n", + "\n", + "**1. Give your clusters horsepower!**\n", + "\n", + "This is the number one mistake customers make. Many customers create tiny\n", + "clusters of two workers with four cores each, and it takes forever to do anything.\n", + "The concern is always the same: they don’t want to spend too much money on\n", + "larger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n", + "**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n", + "\n", + "\n", + "-----\n", + "\n", + "The key is that you’re renting the cluster for the length of the workload. So, if\n", + "you spin up that two worker cluster and it takes an hour, you’re paying for those\n", + "workers for the full hour. However, if you spin up a four worker cluster and it takes\n", + "only half an hour, the cost is actually the same! And that trend continues as long\n", + "as there’s enough work for the cluster to do.\n", + "\n", + "Here’s a hypothetical scenario illustrating the point:\n", + "\n", + "**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n", + "\n", + "1 $1 2 $2\n", + "\n", + "2 $2 1 $2\n", + "\n", + "4 $4 0.5 $2\n", + "\n", + "8 $8 0.25 $2\n", + "\n", + "Notice that the total cost of the workload stays the same while the real-world\n", + "time it takes for the job to run drops significantly. So, bump up your Databricks\n", + "cluster specs and speed up your workloads without spending any more money. It\n", + "\n", + "can’t really get any simpler than that.\n", + "\n", + "**2. Use Photon**\n", + "\n", + "Our colleagues in engineering have rewritten the Spark execution engine in C++\n", + "and dubbed it Photon. The results are impressive!\n", + "\n", + "\n", + "Beyond the obvious improvements due to running the engine in native code,\n", + "they’ve also made use of CPU-level performance features and better memory\n", + "\n", + "management. On top of this, they’ve rewritten the Parquet writer in C++. So this\n", + "makes writing to Parquet and Delta (based on Parquet) super fast as well!\n", + "\n", + "But let’s also be clear about what Photon is speeding up. It improves\n", + "computation speed for any built-in functions or operations, as well as writes to\n", + "Parquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n", + "(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\n", + "spending most of its time reading from an ancient on-prem database? Won’t\n", + "help there either, unfortunately.\n", + "\n", + "\n", + "-----\n", + "\n", + "The good news is that it helps where it can. So even if part of your job can’t be\n", + "sped up, it will speed up the other parts. Also, most jobs are written with the\n", + "native operations and spend a lot of time writing to Delta, and Photon helps a lot\n", + "there. So give it a try. You may be amazed by the results!\n", + "\n", + "**3. Clean out old configurations**\n", + "\n", + "You know those Spark configurations you’ve been carrying along from version to\n", + "version and no one knows what they do anymore? They may not be harmless.\n", + "We’ve seen jobs go from running for hours down to minutes simply by cleaning\n", + "out old configurations. 
There may have been a quirk in a particular version of\n", + "Spark, a performance tweak that has not aged well, or something pulled off\n", + "some blog somewhere that never really made sense. At the very least, it’s worth\n", + "revisiting your Spark configurations if you’re in this situation. Often the default\n", + "configurations are the best, and they’re only getting better. Your configurations\n", + "may be holding you back.\n", + "\n", + "**4. The Delta Cache is your friend**\n", + "\n", + "This may seem obvious, but you’d be surprised how many people are not using\n", + "the [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\n", + "the workers’ SSDs for faster access.\n", + "\n", + "\n", + "If you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\n", + "by default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\n", + "your “hot” tables when you’re starting an endpoint. This will ensure blazing fast\n", + "speeds for any queries on those tables.\n", + "\n", + "If you’re using regular clusters, be sure to use the i3 series on Amazon Web\n", + "Services (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\n", + "all have fast SSDs and caching enabled by default.\n", + "\n", + "Of course, your mileage may vary. If you’re doing BI, which involves reading the\n", + "same tables over and over again, caching gives an amazing boost. However, if\n", + "you’re simply reading a table once and writing out the results as in some ETL\n", + "jobs, you may not get much benefit. You know your jobs better than anyone.\n", + "Go forth and conquer.\n", + "\n", + "\n", + "-----\n", + "\n", + "**5. Be aware of lazy evaluation**\n", + "\n", + "\n", + "However, there is a catch here. Every time you try to display or write out\n", + "results, it runs the execution plan again. Let’s look at the same block of code\n", + "but extend it and do a few more operations.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ".filter(...)\n", + ")\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "\n", + "_# Unfortunately this will run the plan again, including filtering, joining,_\n", + "_etc_\n", + "df2.display()\n", + "\n", + "_# So will this…_\n", + "df2.count()\n", + "—------\n", + "\n", + "\n", + "If you’re a data analyst or data scientist only using SQL or doing BI you can skip\n", + "this section. However, if you’re in data engineering and writing pipelines or doing\n", + "processing using Databricks/Spark, read on.\n", + "\n", + "When you’re writing Spark code like select, groupBy, filter, etc., you’re really\n", + "building an execution plan. You’ll notice the code returns almost immediately when\n", + "you run these functions. That’s because it’s not actually doing any computation. So\n", + "even if you have petabytes of data, it will return in less than a second.\n", + "\n", + "However, once you go to write your results out you’ll notice it takes longer. This\n", + "is due to lazy evaluation. 
It’s not until you try to display or write results that your\n", + "execution plan is actually run.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + "\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "—------\n", + "\n", + "\n", + "-----\n", + "\n", + "The developer of this code may very well be thinking that they’re just printing\n", + "out results three times, but what they’re really doing is kicking off the same\n", + "processing three times. Oops. That’s a lot of extra work. This is a very common\n", + "mistake we run into. So why is there lazy evaluation, and what do we do about it?\n", + "\n", + "In short, processing with lazy evaluation is way faster than without it.\n", + "Databricks/Spark looks at the full execution plan and finds opportunities\n", + "for optimization that can reduce processing time by orders of magnitude.\n", + "So that’s great, but how do we avoid the extra computation? The answer\n", + "is pretty straightforward: save computed results you will reuse.\n", + "\n", + "\n", + "This works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\n", + "benefit greatly from lazy evaluation, but it’s something a lot of customers trip\n", + "over. So be aware of its existence and save results you reuse in order to avoid\n", + "unnecessary computation.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "Let’s look at the same block of code again, but this time let’s avoid the\n", + "recomputation:\n", + "\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + ")\n", + "\n", + "_# save it_\n", + "df2.write.save(path)\n", + "\n", + "_# load it back in_\n", + "df3 = spark.read.load(path)\n", + "\n", + "_# now use it_\n", + "df3.display()\n", + "\n", + "_# this is not doing any extra computation anymore. No joins, filtering,_\n", + "_etc. It’s already done and saved._\n", + "df3.display()\n", + "\n", + "_# nor is this_\n", + "df3.count()\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.2 \u0007\n", + "\n", + "**How to Profile PySpark**\n", + "\n", + "by **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n", + "\n", + "October 6, 2022\n", + "\n", + "\n", + "In Apache Spark™, declarative Python APIs are supported for big data workloads.\n", + "They are powerful enough to handle most common use cases. Furthermore,\n", + "PySpark UDFs offer more flexibility since they enable users to run arbitrary\n", + "Python code on top of the Apache Spark™ engine. Users only have to state\n", + "“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\n", + "PySpark easier to use, but it can be difficult to identify performance bottlenecks\n", + "and apply custom optimizations.\n", + "\n", + "To address the difficulty mentioned above, PySpark supports various profiling\n", + "tools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n", + "[implementations](https://docs.python.org/3/library/profile.html) . 
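As a refresher on what cProfile itself reports, here is a small standalone sketch that profiles a plain Python function and sorts the report with the `pstats.Stats` class; the `work` function is just a stand-in workload:

```python
import cProfile
import pstats

def work():
    # Trivial CPU-bound workload to produce a non-empty profile.
    return sum(i * i for i in range(1_000_000))

with cProfile.Profile() as pr:
    work()

# Sort by cumulative time and show the top 10 entries.
pstats.Stats(pr).sort_stats("cumulative").print_stats(10)
```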
PySpark Profilers provide information such as the number\n", + "of function calls, total time spent in the given function, and filename, as well\n", + "as line number to help navigation. That information is essential to exposing\n", + "tight loops in your PySpark programs, and allowing you to make performance\n", + "\n", + "improvement decisions.\n", + "\n", + "\n", + "**Driver profiling**\n", + "\n", + "PySpark applications run as independent sets of processes on a cluster,\n", + "coordinated by the SparkContext object in the driver program. On the driver\n", + "side, PySpark is a regular Python process; thus, we can profile it as a normal\n", + "Python program using cProfile as illustrated below:\n", + "\n", + "import cProfile\n", + "\n", + "with cProfile.Profile() as pr:\n", + "_# Your code_\n", + "\n", + "pr.print_stats()\n", + "\n", + "**Workers profiling**\n", + "\n", + "Executors are distributed on worker nodes in the cluster, which introduces\n", + "complexity because we need to aggregate profiles. Furthermore, a Python worker\n", + "process is spawned per executor for PySpark UDF execution, which makes the\n", + "profiling more intricate.\n", + "\n", + "\n", + "-----\n", + "\n", + "The UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\n", + "and becomes a major tool to profile workers for PySpark applications. We’ll\n", + "illustrate how to use the UDF profiler with a simple Pandas UDF example.\n", + "\n", + "Firstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n", + "```\n", + " sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n", + " 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n", + " by 'id' )\n", + " ).withColumn( 'v' , rand())\n", + "\n", + "```\n", + "Later, we will group by the id column, which results in 8 groups with 1,000 rows\n", + "per group.\n", + "\n", + "The Pandas UDF plus_one is then created and applied as shown below:\n", + "```\n", + " import pandas as pd\n", + " def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf.apply( lambda x: x + 1 , axis= 1 )\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "Executing the example above and running sc.show_profiles() prints the\n", + "following profile. The profile below can also be dumped to disk by sc.dump_\n", + "profiles(path).\n", + "\n", + "The UDF id in the profile (271, highlighted above) matches that in the Spark plan\n", + "for res. The Spark plan can be shown by calling res.explain() .\n", + "\n", + "\n", + "Note that plus_one takes a pandas DataFrame and returns another pandas\n", + "DataFrame. For each group, all columns are passed together as a pandas\n", + "DataFrame to the plus_one UDF, and the returned pandas DataFrames are\n", + "combined into a PySpark DataFrame.\n", + "\n", + "\n", + "-----\n", + "\n", + "The first line in the profile’s body indicates the total number of calls that were\n", + "monitored. 
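For convenience, here is the walk-through above collected into one runnable block. It assumes a Databricks notebook where `spark` and `sc` are predefined and the cluster was started with `spark.python.profile` enabled (enabling the flag is covered below):

```python
import pandas as pd
from pyspark.sql.functions import col, rand

# 8,000 rows split into 8 groups of 1,000, as in the example above.
sdf = spark.range(0, 8 * 1000).withColumn(
    "id", (col("id") % 8).cast("integer")
).withColumn("v", rand())

def plus_one(pdf: pd.DataFrame) -> pd.DataFrame:
    # Row-by-row apply; intentionally slow, to show up in the profile.
    return pdf.apply(lambda x: x + 1, axis=1)

res = sdf.groupby("id").applyInPandas(plus_one, schema=sdf.schema)
res.collect()

# Print the aggregated worker-side profile (requires spark.python.profile=true).
sc.show_profiles()
```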
The column heading includes\n", + "\n", + "**•** ncalls , for the number of calls.\n", + "\n", + "**•** tottime , for the total time spent in the given function (excluding time\n", + "spent in calls to sub-functions)\n", + "\n", + "**•** percall , the quotient of tottime divided by ncalls\n", + "\n", + "**•** cumtime , the cumulative time spent in this and all subfunctions (from\n", + "invocation till exit)\n", + "\n", + "**•** percall , the quotient of cumtime divided by primitive calls\n", + "\n", + "**•** filename:lineno(function) , which provides the respective information\n", + "for each function\n", + "\n", + "Digging into the column details: plus_one is triggered once per group, 8 times\n", + "in total; _arith_method of pandas Series is called once per row, 8,000 times\n", + "in total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\n", + "row, thus suffering from high invocation overhead.\n", + "\n", + "We can reduce such overhead by substituting the pandas.DataFrame.apply\n", + "with pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\n", + "follows:\n", + "```\n", + " import pandas as pd\n", + " def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n", + " return pdf + 1\n", + " res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n", + " schema)\n", + " res.collect()\n", + "\n", + "```\n", + "\n", + "The updated profile is as shown below.\n", + "\n", + "We can summarize the optimizations as follows:\n", + "\n", + "**•** Arithmetic operation from 8,000 calls to 8 calls\n", + "\n", + "**•** Total function calls from 2,898,160 calls to 2,384 calls\n", + "\n", + "**•** Total execution time from 2.300 seconds to 0.004 seconds\n", + "\n", + "The short example above demonstrates how the UDF profiler helps us deeply\n", + "understand the execution, identify the performance bottleneck and enhance\n", + "the overall performance of the user-defined function.\n", + "\n", + "The UDF profiler was implemented based on the executor-side profiler,\n", + "which is designed for PySpark RDD API. The executor-side profiler is available\n", + "in all active Databricks Runtime versions.\n", + "\n", + "\n", + "-----\n", + "\n", + "Both the UDF profiler and the executor-side profiler run on Python workers.\n", + "They are controlled by the spark.python.profile Spark configuration, which\n", + "is false by default. We can enable that Spark configuration on a Databricks\n", + "Runtime cluster as shown below.\n", + "\n", + "\n", + "**Conclusion**\n", + "\n", + "PySpark profilers are implemented based on cProfile; thus, the profile reporting\n", + "relies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\n", + "collecting profile reports from Python workers.\n", + "\n", + "Powerful profilers are provided by PySpark in order to identify hot loops and\n", + "suggest potential improvements. They are easy to use and critical to enhance\n", + "the performance of PySpark programs. 
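The screenshot that originally accompanied this paragraph is not reproduced here. As a sketch, the flag can be added to the cluster's Spark config on Databricks, or passed to the session builder when you manage PySpark yourself; it has to be in place before the SparkContext starts:

```python
# On Databricks, add this line to the cluster's Spark config
# (e.g. Advanced options > Spark):
#
#   spark.python.profile true
#
# For self-managed PySpark, set it when building the session instead:
from pyspark.sql import SparkSession

spark = (
    SparkSession.builder
    .appName("udf-profiling")
    .config("spark.python.profile", "true")
    .getOrCreate()
)
```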
The UDF profiler, which is available\n", + "starting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\n", + "challenges and brings insights to user-defined functions.\n", + "\n", + "In addition, there is an ongoing effort in the Apache Spark™ open source\n", + "community to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\n", + "more information.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.3 \u0007\n", + "\n", + "**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n", + "**and Apache Kafka**\n", + "\n", + "by **F R A N K M U N Z**\n", + "\n", + "August 9, 2022\n", + "\n", + "\n", + "[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\n", + "approach for creating reliable data pipelines and fully manages the underlying\n", + "infrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\n", + "actionable insights derived from near real-time data. Delta Live Tables enables\n", + "low-latency streaming data pipelines to support such use cases with low\n", + "latencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n", + "[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n", + "\n", + "This article will walk through using DLT with Apache Kafka while providing the\n", + "required Python code to ingest streams. The recommended system architecture\n", + "will be explained, and related DLT settings worth considering will be explored\n", + "along the way.\n", + "\n", + "**Streaming platforms**\n", + "\n", + "Event buses or message buses decouple message producers from consumers.\n", + "A popular streaming use case is the collection of click-through data from\n", + "users navigating a website where every user interaction is stored as an event in\n", + "\n", + "\n", + "Apache Kafka. The event stream from Kafka is then used for real-time streaming\n", + "data analytics. Multiple message consumers can read the same data from Kafka\n", + "and use the data to learn about audience interests, conversion rates, and bounce\n", + "reasons. The real-time, streaming event data from the user interactions often\n", + "also needs to be correlated with actual purchases stored in a billing database.\n", + "\n", + "**Apache Kafka**\n", + "\n", + "[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\n", + "topic, an append-only distributed log of events where messages are buffered for\n", + "a certain amount of time. Although messages in Kafka are not deleted once they\n", + "are consumed, they are also not stored indefinitely. The message retention for\n", + "\n", + "Kafka can be configured per topic and defaults to 7 days. 
Expired messages will\n", + "be deleted eventually.\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to many other event busses or messaging systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming data pipelines**\n", + "\n", + "\n", + "In a data flow pipeline, Delta Live Tables and their dependencies can be declared\n", + "with a standard SQL Create Table As Select (CTAS) statement and the DLT\n", + "keyword “live.”\n", + "\n", + "When developing DLT with Python, the @dlt.table decorator is used to, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf))List(Larger clusters execute workloads faster in Databricks., The faster execution reduces the total time required for workload completion., The overall cost efficiency is balanced due to reduced workload completion time despite higher hourly costs.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf
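The Python snippet referenced above is truncated in this extract; a typical DLT-on-Kafka ingestion sketch looks like the following, where the broker address and topic name are placeholders:

```python
import dlt
from pyspark.sql.functions import col

KAFKA_BOOTSTRAP = "host1:9092"    # placeholder broker address
TOPIC = "clickstream_events"      # hypothetical topic name

@dlt.table(comment="Raw click events ingested from Kafka")
def kafka_raw():
    return (
        spark.readStream.format("kafka")
        .option("kafka.bootstrap.servers", KAFKA_BOOTSTRAP)
        .option("subscribe", TOPIC)
        .option("startingOffsets", "latest")
        .load()
        .select(
            col("key").cast("string"),
            col("value").cast("string"),
            col("timestamp"),
        )
    )
```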
088c4943384eaa6a228c3d68ff70fbef6bcbe9c50176180e73244de1d7f3be1aList(List(List(What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?, user)))List(List(```\n", + "TECHNICAL GUIDE\n", + "\n", + "```\n", + "\n", + "# Solving Common Data Challenges \n", + "\n", + "\n", + "#### Startups and Digital Native Businesses\n", + "\n", + "\n", + "-----\n", + "\n", + "### Table of Contents\n", + "\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Creating a unified data architecture for data quality, governance and efficiency\n", + "\n", + "# 03\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building effective machine learning operations\n", + "\n", + "```\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building a data architecture to support scale and performance\n", + "\n", + "# 04\n", + "SUMMARY:\n", + "\n", + "###### The Databricks Lakehouse Platform addresses these challenges\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "\n", + "This guide shares how the lakehouse architecture can increase\n", + "productivity and cost-efficiently support all your data, analytics\n", + "and AI workloads, and flexibly scale with the pace of growth\n", + "for your company. Read the entire guide or dive straight into a\n", + "specific challenge.\n", + "\n", + "With the advent of cloud infrastructure, a new generation of\n", + "startups has rapidly built and scaled their businesses. The use of\n", + "cloud infrastructure, once seen as innovative, has now become\n", + "table stakes. The differentiator for the fastest-moving startups\n", + "and digital natives now comes from the effective use of data\n", + "at scale, primarily analytics and AI. Digital natives — defined\n", + "as fast-moving, lean, and technically savvy, born-in-the-cloud\n", + "organizations — are beginning to focus on new data-driven use\n", + "cases such as real-time machine learning and personalized\n", + "customer experiences.\n", + "\n", + "To pursue these new data-intensive use cases and initiatives,\n", + "organizations must look beyond the technologies that delivered\n", + "them to this point in time. Over time, these technologies, such\n", + "as transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n", + "\n", + "This guide examines some of the biggest data challenges and\n", + "solutions for startups and for scaling digital native businesses\n", + "that have reached the point where an end-to-end modern data\n", + "platform is a smart investment. Some key considerations include:\n", + "systems that are not cost-efficient and require time-consuming\n", + "administration and engineering toil. In addition to growing\n", + "maintenance needs, data is often stored in disparate locations\n", + "and formats, with little or no governance, making real-time use\n", + "cases, analytics and AI difficult or impossible.\n", + "\n", + "\n", + "**Consolidating on a unified data platform**\n", + "As mentioned above, siloed data storage and management add administrative and\n", + "financial cost. You can benefit significantly when you unify your data in one location\n", + "with a flexible architecture that scales with your needs and delivers performance\n", + "for future success. 
For this, you will want an open platform that supports all your\n", + "data including batch and streaming workloads, data analytics and machine learning.\n", + "With data unification, you create a more efficient, integrated approach to ingesting,\n", + "cleaning and organizing your data. You also need automation to make data analysis\n", + "easier for the nontechnical users in the company. But broader data access also\n", + "means more focus on security, privacy, compliance and access control, which can\n", + "create overhead for a growing.\n", + "\n", + "**Scaling up capacity and increasing performance**\n", + "**and usability of the data solutions**\n", + "Data teams at growing digital native organizations find it time intensive and costly to\n", + "handle the growing volume and velocity of their data being ingested from multiple\n", + "sources, across multiple clouds. You now need a unified and simplified platform that\n", + "can instantly scale up capacity and deliver more computing power on demand to\n", + "free up your data teams to produce outputs more quickly. This lowers the total cost\n", + "for the overall infrastructure by eliminating redundant licensing, infrastructure and\n", + "administration costs.\n", + "\n", + "**Building effective machine learning operations**\n", + "For data teams beginning their machine learning journeys, the challenge of training\n", + "data models can increase in management complexity. Many teams with disparate\n", + "coding needs for the entire model lifecycle suffer inefficiencies from transferring\n", + "data and code across many separate services. To build and manage effective\n", + "ML operations, consider an end-to-end MLOps environment that brings all data\n", + "together in one place and incorporates managed services for experiment tracking,\n", + "model training, feature development and feature and model serving.\n", + "\n", + "\n", + "-----\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 01\n", + "\n", + "### Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "As cloud-born companies grow, data volumes rapidly increase, leading to new\n", + "challenges and use cases. Among the challenges:\n", + "\n", + "\n", + "Application stacks optimized for transaction\n", + "use cases aren’t able to handle the volume,\n", + "velocity and variety of data that modern data\n", + "teams require. For example, this leads to query\n", + "performance issues as data volume grows.\n", + "\n", + "Data silos develop as each team within an\n", + "organization chooses different ETL/ELT and\n", + "storage solutions for their needs. As the\n", + "organization grows and changes, these pipelines\n", + "and storage solutions become brittle, hard to\n", + "maintain and nearly impossible to integrate.\n", + "\n", + "\n", + "These data silos lead to discoverability,\n", + "integration and access issues, which prevent\n", + "teams from leveraging the full value of the\n", + "organization’s available data.\n", + "\n", + "Data governance is hard. 
Disparate ETL/ELT\n", + "and storage solutions lead to governance,\n", + "compliance, auditability and access control\n", + "challenges, which expose organizations to\n", + "tremendous risk.\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides\n", + "a unified set of tools for building, deploying,\n", + "sharing and maintaining data solutions at scale.\n", + "It integrates with cloud storage and the security\n", + "in your cloud account, manages and deploys\n", + "cloud infrastructure on your behalf. Your data\n", + "practitioners no longer need separate storage\n", + "systems for their data. And you don’t have to rely\n", + "on your cloud provider for security. The lakehouse\n", + "has its own robust security built into the platform.\n", + "\n", + "\n", + "For all the reasons above, the most\n", + "consistent advice from successful data\n", + "practitioners is to create a “single source\n", + "of truth” by unifying all data on a single\n", + "platform. With the Databricks Lakehouse\n", + "Platform, you can unify all your data on one\n", + "platform, reducing data infrastructure costs\n", + "and compute. You don’t need excess data\n", + "copies and you can retire expensive\n", + "legacy infrastructure.\n", + "```\n", + " 01\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: GRAMMARLY\n", + "\n", + "### Helping 30 million people and 50,000 teams communicate more effectively\n", + "\n", + "```\n", + "\n", + "While its business is based on analytics, [Grammarly](http://www.grammarly.com)\n", + "\n", + "for many years relied on a homegrown analytics\n", + "\n", + "platform to drive its AI writing assistant to\n", + "\n", + "help users improve multiple aspects of written\n", + "\n", + "communications. As teams developed their own\n", + "\n", + "requirements, data silos inevitably emerged as\n", + "\n", + "different business areas implemented analytics\n", + "\n", + "tools individually.\n", + "\n", + "“Every team decided to solve their analytics\n", + "\n", + "needs in the best way they saw fit,” said Chris\n", + "\n", + "Locklin, Engineering Manager, Data Platforms,\n", + "\n", + "at Grammarly. “That created challenges in\n", + "\n", + "consistency and knowing which data set\n", + "\n", + "was correct.”\n", + "\n", + "To better scale and improve data storage and\n", + "\n", + "query capabilities, Grammarly brought all its\n", + "\n", + "analytical data into the Databricks Lakehouse\n", + "\n", + "Platform and created a central hub for all data\n", + "\n", + "producers and consumers across the company.\n", + "\n", + "Grammarly had several goals with the lakehouse,\n", + "\n", + "including better access control, security, ingestion\n", + "\n", + "\n", + "flexibility, reducing costs and fueling collaboration. “Access control in a\n", + "\n", + "distributed file system is difficult, and it only gets more complicated as\n", + "\n", + "you ingest more data sources,” said Locklin. To manage access control,\n", + "\n", + "enable end-to-end observability and monitor data quality, Grammarly\n", + "\n", + "relies on the data lineage capabilities within Unity Catalog. “Data lineage\n", + "\n", + "allows us to effectively monitor usage of our data and ensure it upholds\n", + "\n", + "the standards we set as a data platform team,” said Locklin. 
“Lineage is\n", + "\n", + "the last crucial piece for access control.”\n", + "\n", + "Data analysts within Grammarly now have a consolidated interface for\n", + "\n", + "analytics, which leads to a single source of truth and confidence in the\n", + "\n", + "accuracy and availability of all data managed by the data platform team.\n", + "\n", + "Having a consistent data source across the company also resulted in\n", + "\n", + "greater speed and efficiency and reduced costs. Data practitioners\n", + "\n", + "experienced 110% faster querying at 10% of the cost to ingest compared\n", + "\n", + "to a data warehouse. Grammarly can now make its 5 billion daily events\n", + "\n", + "available for analytics in under 15 minutes rather than 4 hours. Migrating\n", + "\n", + "off its rigid legacy infrastructure gave Grammarly the flexibility to do\n", + "\n", + "more and the confidence that the platform will evolve with its needs.\n", + "\n", + "Grammarly is now able to sustain a flexible, scalable and highly secure\n", + "\n", + "analytics platform that helps 30 million people and 50,000 teams\n", + "\n", + "worldwide write more effectively every day.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/grammarly)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to unify the data infrastructure with Databricks\n", + "\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\n", + "is composed of two primary parts:\n", + "\n", + "- The infrastructure to deploy, configure and\n", + "manage the platform and services\n", + "\n", + "\n", + "You can build a Databricks workspace by configuring\n", + "secure integrations between the Databricks platform\n", + "and your cloud account, and then Databricks deploys\n", + "temporary Apache Spark™/Photon clusters using cloud\n", + "resources in your account to process and store data\n", + "in object storage and other integrated services you\n", + "control. Here are three steps to get started with the\n", + "Databricks Lakehouse Platform:\n", + "\n", + "**Understand the architecture**\n", + "The lakehouse provides a unified architecture,\n", + "meaning that all data is stored in the same\n", + "accessible place. The diagram shows how data\n", + "comes in from sources like a customer relationship\n", + "management (CRM) system, an enterprise resource\n", + "planning (ERP) system, websites or unstructured\n", + "customer emails.\n", + "\n", + "**Optimize the storage layer**\n", + "All data is stored in cloud storage while Databricks\n", + "provides tooling to assist with ingestion, such as\n", + "Auto Loader, and we recommend [open-source](https://delta.io/)\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\n", + "Delta optimized storage layer that provides the\n", + "foundation for storing data and tables in the\n", + "Databricks Lakehouse Platform. 
Having all your\n", + "data in the same optimized, open storage keeps\n", + "all your use cases in the same place, thus enabling\n", + "collaboration and removing software tool overhead.\n", + "\n", + "\n", + "\n", + "- the customer-owned infrastructure managed in\n", + "collaboration by Databricks and the customer.\n", + "\n", + "\n", + "The lakehouse handles all varieties of data (structured, semi-structured, unstructured),\n", + "as well as all velocities of data (streaming, batch or somewhere in the middle).\n", + "\n", + "[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Lakehouse organizes data stored with Delta Lake in cloud object\n", + "storage with familiar concepts like database, tables and views. Delta Lake extends\n", + "Parquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\n", + "scalable metadata handling. Delta Lake is fully compatible with Apache Spark APIs,\n", + "and was developed for tight integration with Structured Streaming, allowing you to\n", + "easily use a single copy of data for both batch and streaming operations to provide\n", + "incremental processing at scale.This model combines many of the benefits of a data\n", + "warehouse with the scalability and flexibility of a data lake.\n", + "\n", + "To learn more about the optimized storage layer that provides the foundation for\n", + "storing data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n", + "[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n", + "\n", + "The first step in unifying your data architecture is setting up how data is to be\n", + "accessed and used across the organization. We’ll discuss this as a series of steps:\n", + "\n", + "**1** Set up governance with Unity Catalog\n", + "\n", + "**2** Grant secure access to the data\n", + "\n", + "\n", + "###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n", + " – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n", + "\n", + "[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "**3** Capture audit logs\n", + "\n", + "**4** View data lineage\n", + "\n", + "**5** Set up data sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Configure unified governance**\n", + "Databricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\n", + "means that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\n", + "is secured, accessed and shared. 
Unity Catalog offers a single place to administer data access policies that apply across all workspace and\n", + "personas and automatically captures user-level audit logs that record access to your data.\n", + "\n", + "Data stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\n", + "languages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n", + "\n", + "To set up Unity Catalog for your organization,\n", + "you do the following:\n", + "\n", + "\n", + "**1** Configure an S3 bucket and IAM role that\n", + "Unity Catalog can use to store and access\n", + "data in your AWS account.\n", + "\n", + "**2** Create a metastore for each region in\n", + "\n", + "which your organization operates, and\n", + "attach workspaces to the metastore. Each\n", + "workspace will have the same view of the\n", + "data you manage in Unity Catalog.\n", + "\n", + "\n", + "**3** If you have a new account, add users,\n", + "groups and service principals to your\n", + "Databricks account.\n", + "\n", + "**4** Next, create and grant access to\n", + "\n", + "catalogs, schemas and tables.\n", + "\n", + "\n", + "For complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How Unity Catalog works\n", + "\n", + "\n", + "You will notice that the hierarchy of primary data\n", + "objects in Unity Catalog flows from metastore to table:\n", + "\n", + "**Metastore** is the top-level container for metadata.\n", + "Each metastore exposes a three-level namespace\n", + "(catalog.schema.table) that organizes your data.\n", + "\n", + "\n", + "**Metastore** **Catalog** **Schemas**\n", + "\n", + "\n", + "**Views**\n", + "\n", + "**Managed**\n", + "**Tables**\n", + "\n", + "\n", + "**Catalog** is the first layer of the object hierarchy, used\n", + "to organize your data assets.\n", + "\n", + "\n", + "**Schemas** , also known as databases, are the second\n", + "layer of the object hierarchy and contain tables and\n", + "views.\n", + "\n", + "**Table** is the lowest level in the object hierarchy, and\n", + "tables can be external (stored in external locations in\n", + "your cloud storage of choice) or managed (stored in a\n", + "storage container in your cloud storage that you create\n", + "\n", + "expressly for Databricks). You can also create readonly **Views** from tables.\n", + "\n", + "\n", + "**External**\n", + "**tables**\n", + "\n", + "The diagram below represents the file system\n", + "hierarchy of a single storage bucket:\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog uses the identities in the Databricks\n", + "account to resolve users, service principals, and groups\n", + "and to enforce permissions. To configure identities in\n", + "the account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n", + "[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . 
Refer to those users,\n", + "service principals, and groups when you create\n", + "[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n", + "\n", + "Unity Catalog users, service principals, and groups\n", + "must also be added to workspaces to access Unity\n", + "Catalog data in a notebook, a Databricks SQL query,\n", + "Data Explorer or a REST API command. The assignment\n", + "of users, service principals, and groups to workspaces\n", + "is called identity federation. All workspaces attached\n", + "to a Unity Catalog metastore are enabled for identity\n", + "federation.\n", + "\n", + "Securable objects in Unity Catalog are hierarchical,\n", + "meaning that granting a privilege on a catalog or schema\n", + "automatically grants the privilege to all current and\n", + "future objects within the catalog or schema. For more\n", + "on granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\n", + "A common scenario is to set up a schema per team\n", + "where only that team has USE SCHEMA and CREATE on\n", + "the schema. This means that any tables produced by\n", + "team members can only be shared within the team.\n", + "Data Explorer uses the privileges configured by Unity\n", + "Catalog administrators to ensure that users are only\n", + "able to see catalogs, databases, tables and views that\n", + "they have permission to query.\n", + "\n", + "\n", + "[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\n", + "many Unity Catalog features. Use Data Explorer to view\n", + "schema details, preview sample data, and see table\n", + "details and properties. Administrators can view and\n", + "change owners. Admins and data object owners can grant\n", + "and revoke permissions through this interface.\n", + "\n", + "**Set up secure access**\n", + "In Unity Catalog, data is secure by default. Initially, users\n", + "have no access to data in a metastore. Access can\n", + "be granted by either a metastore admin, the owner of\n", + "an object, or the owner of the catalog or schema that\n", + "contains the object. Securable objects in Unity Catalog\n", + "are hierarchical and privileges are inherited downward.\n", + "\n", + "Unity Catalog’s security model is based on standard ANSI\n", + "SQL and allows administrators to grant permissions in\n", + "their existing data lake using familiar syntax, at the level of\n", + "catalogs, databases (schema), tables and views. 
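As an illustration of that familiar syntax, here is a sketch of the schema-per-team scenario described above; the catalog, schema and group names are hypothetical, and the CREATE privilege is shown as CREATE TABLE:

```python
# Hypothetical names: catalog "main", schema "data_team", group "data-team".
spark.sql("CREATE SCHEMA IF NOT EXISTS main.data_team")
spark.sql("GRANT USE CATALOG ON CATALOG main TO `data-team`")
spark.sql("GRANT USE SCHEMA ON SCHEMA main.data_team TO `data-team`")
spark.sql("GRANT CREATE TABLE ON SCHEMA main.data_team TO `data-team`")
```

Because privileges are inherited downward, any table the team creates in this schema is visible only to members of that group unless access is granted more broadly.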
Privileges\n", + "and metastores are shared across workspaces, allowing\n", + "administrators to set secure permissions once against\n", + "\n", + "groups synced from identity providers and know that\n", + "end users only have access to the proper data in any\n", + "Databricks workspace they enter.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: BUTCHERBOX\n", + "\n", + "### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n", + "\n", + "```\n", + "\n", + "As a young e-commerce company,\n", + "\n", + "[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n", + "\n", + "customers’ needs change, which means it is\n", + "\n", + "constantly considering behavioral patterns,\n", + "\n", + "distribution center efficiency, a growing list of\n", + "\n", + "marketing and communication channels, and\n", + "\n", + "order processing systems.\n", + "\n", + "The meat and seafood subscription company\n", + "\n", + "collects data on hundreds of thousands\n", + "\n", + "of subscribers. It deployed the Databricks\n", + "\n", + "Lakehouse Platform to gain visibility across\n", + "\n", + "its diverse range of data systems and enable\n", + "\n", + "its analytics team to securely view and\n", + "\n", + "export data in the formats needed.\n", + "\n", + "With so much data feeding in from different\n", + "\n", + "sources — from email systems to its website\n", + "\n", + "— the data team at ButcherBox quickly\n", + "\n", + "discovered that data silos were a significant\n", + "\n", + "\n", + "“We knew we needed to migrate from our legacy data warehouse\n", + "\n", + "environment to a data analytics platform that would unify our\n", + "\n", + "data and make it easily accessible for quick analysis to improve\n", + "\n", + "supply chain operations, forecast demand and, most importantly,\n", + "\n", + "keep up with our growing customer base,” explained Jake Stone,\n", + "\n", + "Senior Manager, Business Analytics, at ButcherBox.\n", + "\n", + "The platform allows analysts to share builds and iterate on a\n", + "\n", + "project without getting into the code. Querying a table of 18\n", + "\n", + "billion rows would have been problematic with a traditional\n", + "\n", + "platform. With Databricks, ButcherBox can do it in three minutes.\n", + "\n", + "“Delta Lake provides us with a single source of truth for all of\n", + "\n", + "our data,” said Stone. “Now our data engineers are able to build\n", + "\n", + "reliable data pipelines that thread the needle on key topics such\n", + "\n", + "as inventory management, allowing us to identify in near real-\n", + "\n", + "time what our trends are so we can figure out how to effectively\n", + "\n", + "move inventory.”\n", + "\n", + "[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "problem because they blocked complete\n", + "\n", + "visibility into critical insights needed to make\n", + "\n", + "strategic and marketing decisions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Set up secure data sharing**\n", + "Databricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\n", + "to share data with other entities regardless of their\n", + "computing platforms. Delta Sharing is integrated with\n", + "Unity Catalog. 
Your data must be registered with Unity\n", + "Catalog to manage, govern, audit and track usage of the\n", + "shared data on the Lakehouse Platform. The primary\n", + "concepts of Delta Sharing are shares (read-only\n", + "collections of tables and table partitions to be shared)\n", + "and recipients (objects that associate an organization\n", + "with a credential or secure sharing identifier).\n", + "\n", + "As a data provider, you generate a token and share\n", + "it securely with the recipient. They use the token to\n", + "authenticate and get read access to the tables you’ve\n", + "included in the shares you’ve given them access\n", + "to. Recipients access the shared data in read-only\n", + "format. Whenever the data provider updates data\n", + "tables in their own Databricks account, the updates\n", + "appear in near real-time in the recipient’s system.\n", + "\n", + "\n", + "**Capture audit logs**\n", + "Unity Catalog captures an audit log of actions\n", + "performed against the metastore. To access audit\n", + "logs for Unity Catalog events, you must enable and\n", + "configure audit logs for your account. Audit logs for\n", + "each workspace and account-level activities are\n", + "delivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n", + "[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n", + "\n", + "**View data lineage**\n", + "You can use Unity Catalog to capture runtime data\n", + "lineage across queries in any language executed on\n", + "a Databricks cluster or SQL warehouse. Lineage can\n", + "be visualized in Data Explorer in near real-time and\n", + "retrieved with the Databricks REST API. Lineage is\n", + "aggregated across all workspaces attached to Unity\n", + "Catalog and captured down to the column level, and\n", + "includes notebooks, workflows and dashboards related\n", + "to the query. To understand the requirements and how\n", + "to capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n", + "[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n", + "\n", + "\n", + "Unity Catalog Metastore\n", + "\n", + "\n", + "Catalog\n", + "\n", + "\n", + "Data providers can use Databricks audit logging to\n", + "monitor the creation and modification of shares,\n", + "and recipients can monitor recipient activity on\n", + "shares. 
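A sketch of the share-and-recipient workflow described above, using hypothetical share, table and recipient names (the table must already be registered in Unity Catalog):

```python
# Create a share and add an existing Unity Catalog table to it.
spark.sql("CREATE SHARE IF NOT EXISTS quarterly_metrics")
spark.sql("ALTER SHARE quarterly_metrics ADD TABLE main.analytics.orders")

# Create the recipient and grant it read access to the share; the credential
# or activation link is then passed to the receiving organization out of band.
spark.sql("CREATE RECIPIENT IF NOT EXISTS partner_org")
spark.sql("GRANT SELECT ON SHARE quarterly_metrics TO RECIPIENT partner_org")
```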
Data recipients who use shared data in a\n", + "Databricks account can use Databricks audit logging\n", + "to understand who is accessing which data.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n", + "\n", + "- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n", + "\n", + "- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n", + "\n", + "- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n", + "\n", + "- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "\n", + "\n", + "###### Key Takeaways\n", + "\n", + "- With the Databricks Lakehouse Platform, you can\n", + "unify and simplify all your data on one platform\n", + "to better scale and improve data storage and\n", + "query capabilities\n", + "\n", + "- The lakehouse helps reduce data infrastructure\n", + "and compute costs. You don’t need excess\n", + "data copies and can retire expensive legacy\n", + "infrastructure.\n", + "\n", + "\n", + "Leverage Delta Lake as the open format\n", + "storage layer to deliver reliability, security and\n", + "performance on your data lake — for both\n", + "streaming and batch operations — replacing\n", + "data silos with a single home for structured,\n", + "semi-structured and unstructured data\n", + "\n", + "With Unity Catalog you can centralize\n", + "governance for all data and AI assets including\n", + "files, tables, machine learning models and\n", + "dashboards in your lakehouse on any cloud\n", + "\n", + "The Databricks Lakehouse Platform is open\n", + "source with multicloud flexibility so that you can\n", + "use your data however and wherever you want —\n", + "no vendor lock-in\n", + "\n", + "\n", + "-----\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 02\n", + "\n", + "### Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "As modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\n", + "the increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\n", + "suddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\n", + "cost-effective. 
The relational databases and traditional data warehouses that met the needs of the businesses once\n", + "upon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n", + "\n", + "Here are some common challenges around managing data and performance at scale:\n", + "\n", + "\n", + "**Volume and velocity** — Exponentially\n", + "increasing data sources, and the speed at\n", + "which they capture and create data.\n", + "\n", + "**Latency requirements** — The demands of\n", + "downstream applications and users have\n", + "evolved (people want data and the results\n", + "from the data faster).\n", + "\n", + "\n", + "**Governance** — Cataloging, auditing, securing and\n", + "reporting on data is burdensome at scale when\n", + "using old systems not built with data access\n", + "controls and compliance in mind.\n", + "\n", + "**Multicloud** is really hard.\n", + "\n", + "\n", + "**Data storage** — Storing data in the wrong\n", + "format is slow to access, query and is\n", + "expensive at scale.\n", + "\n", + "\n", + "**Data format** — Supporting structured, semistructured and unstructured data formats\n", + "is now a requirement. Most data storage\n", + "solutions are designed to handle only one type\n", + "of data, requiring multiple products\n", + "to be stitched together.\n", + "\n", + "```\n", + "02\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Lakehouse solves scale and performance challenges\n", + "\n", + "\n", + "The solution for growing digital companies is a unified\n", + "and simplified platform that can instantly scale up\n", + "capacity to deliver more computing power on demand,\n", + "freeing up teams to go after the much-needed data\n", + "and produce outputs more quickly. With a lakehouse,\n", + "they can replace their data silos with a single home for\n", + "their structured, semi-structured and unstructured\n", + "data. Users and applications throughout the enterprise\n", + "environment can connect to the same single copy of\n", + "the data to drive diverse workloads.\n", + "\n", + "The lakehouse architecture is cost-efficient for\n", + "scaling, lowering the total cost of ownership for the\n", + "overall infrastructure by consolidating all data estate\n", + "and use cases onto a single platform and eliminating\n", + "redundant licensing, infrastructure and administration\n", + "costs. 
Unlike other warehouse options that can only\n", + "scale horizontally, the Databricks Lakehouse can scale\n", + "horizontally and vertically based on workload demands.\n", + "\n", + "With the Databricks Lakehouse, you can optimize the\n", + "compute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\n", + "research by the Barcelona Supercomputing Center.\n", + "And your data teams are more productive by focusing\n", + "on more strategic initiatives versus managing multiple\n", + "data solutions.\n", + "\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "### Driving into the future of electric transportation\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "```\n", + "\n", + "With more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n", + "\n", + "day, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n", + "\n", + "legacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n", + "\n", + "Before Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n", + "\n", + "decreased output, prevented collaboration and increased operational costs. Rivian chose to modernize its data\n", + "\n", + "infrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n", + "\n", + "downstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n", + "\n", + "actionable insights for different use cases, from predictive maintenance to smarter product development.\n", + "\n", + "“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n", + "\n", + "performant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n", + "\n", + "Wassym Bensaid, Vice President of Software Development at Rivian.\n", + "\n", + "For instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n", + "\n", + "accelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n", + "\n", + "roll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n", + "\n", + "connected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n", + "\n", + "smart features and the control that drivers have over them. 
By leveraging the Databricks Lakehouse Platform, Rivian\n", + "\n", + "has seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/rivian)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to ensure scalability and performance with Databricks\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\n", + "scalability and performance for your data architecture\n", + "based on the following features and capabilities:\n", + "\n", + "- A simplified and cost-efficient architecture that\n", + "increases productivity\n", + "\n", + "- A platform that ensures reliable, high performing\n", + "ETL workloads — for streaming and batch data\n", + "— while Databricks automatically manages your\n", + "infrastructure\n", + "\n", + "- The ability to ingest, transform and query all your\n", + "data in one place, and scale on demand with\n", + "serverless compute\n", + "\n", + "- Enables real-time data access for all data,\n", + "analytics and AI use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "The following section will provide a short series of\n", + "steps for understanding the key components of the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "\n", + "**Step 2**\n", + "**Understand the common Delta Lake operations**\n", + "The Databricks Lakehouse Platform simplifies the\n", + "entire data lifecycle, from data ingestion to monitoring\n", + "and governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\n", + "open-source storage system based on the Delta\n", + "format providing reliability through ACID transactions\n", + "and scalable metadata handling. Large quantities of\n", + "raw files in blob storage can be converted to Delta to\n", + "organize and store the data cheaply. This allows for\n", + "flexibility of data movement while being performant\n", + "and less expensive.\n", + "\n", + "\n", + "**Step 1**\n", + "**Get a trial Databricks account**\n", + "Start your 14-day free trial with Databricks on\n", + "AWS in a few easy steps.\n", + "[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . During the 14day free trial, all Databricks usage is free, but Databricks\n", + "uses compute and S3 storage resources in your cloud\n", + "provider account.\n", + "\n", + "\n", + "and writing data can occur simultaneously without risk\n", + "of many queries resulting in performance degradation\n", + "or deadlock for business-critical workloads.\n", + "\n", + "This means that users and applications throughout\n", + "the enterprise environment can connect to the same\n", + "single copy of the data to drive diverse workloads, with\n", + "all viewers guaranteed to receive the most current\n", + "version of the data at the time their query executes.\n", + "With performance features like indexing, Delta Lake\n", + "customers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n", + "[up to 48x faster.](https://www.databricks.com/customers/columbia)\n", + "\n", + "\n", + "[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\n", + "and learn how to create, manage and query tables.\n", + "With support for ACID transactions and schema\n", + "enforcement, Delta Lake provides the reliability that\n", + "traditional data lakes lack. 
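As a sketch of converting raw files in blob storage to Delta, assuming a hypothetical Parquet landing path and target table name:

```python
raw_path = "/mnt/landing/events_parquet"   # hypothetical path to raw Parquet files

# Option 1: read the raw files and write them out as a managed Delta table.
(
    spark.read.format("parquet").load(raw_path)
    .write.format("delta")
    .mode("overwrite")
    .saveAsTable("main.bronze.events")
)

# Option 2: convert the files in place, without rewriting the data.
spark.sql(f"CONVERT TO DELTA parquet.`{raw_path}`")
```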
This enables you to scale\n", + "reliable data insights throughout the organization and\n", + "run analytics and other data projects directly on your\n", + "data lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n", + "\n", + "Delta Lake transactions use log files stored alongside\n", + "data files to provide ACID guarantees at a table level.\n", + "Because the data and log files backing Delta Lake\n", + "tables live together in cloud object storage, reading\n", + "\n", + "\n", + "-----\n", + "\n", + "All data in Delta Lake is stored in open Apache Parquet\n", + "format, allowing data to be read by any compatible\n", + "reader. APIs are open and compatible with Apache\n", + "Spark, so you have access to a vast open-source\n", + "ecosystem to avoid data lock-in from proprietary\n", + "formats and conversions, which have embedded and\n", + "added costs.\n", + "\n", + "###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n", + "\n", + " — Steve Pulec, Chief Technology Officer, YipitData\n", + "\n", + "[Learn more](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 3**\n", + "**Ingest data efficiently at scale**\n", + "With a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\n", + "from hundreds of data sources for analytics, AI and\n", + "streaming applications into one place.\n", + "\n", + "Databricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\n", + "data ingestion. To ingest any file that can land in a data\n", + "lake, Auto Loader incrementally and automatically\n", + "processes new data files as they arrive in cloud storage\n", + "in scheduled or continuous jobs. Auto Loader scales to\n", + "support near real-time ingestion of millions of files\n", + "per hour.\n", + "\n", + "For pushing data in Delta Lake, the SQL command\n", + "[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\n", + "into Delta Lake. COPY INTO is best used when the input\n", + "directory contains thousands of files or fewer, and the\n", + "user prefers SQL. COPY INTO can be used over JDBC\n", + "to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "**Step 4**\n", + "**Leverage production-ready tools**\n", + "**to automate ETL pipelines**\n", + "Once the raw data is ingested, Databricks provides\n", + "a suite of production-ready tools that allow data\n", + "professionals to quickly develop and deploy extract,\n", + "\n", + "transform and load (ETL) pipelines. 
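For Step 3 above, here is a minimal Auto Loader sketch on a recent runtime, with COPY INTO as the batch alternative; all paths and table names are placeholders:

```python
landing = "/mnt/landing/json_events"   # hypothetical landing-zone path

# Incremental ingestion with Auto Loader: new files are picked up as they land.
(
    spark.readStream.format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "/mnt/landing/_schemas/json_events")
    .load(landing)
    .writeStream
    .option("checkpointLocation", "/mnt/landing/_checkpoints/json_events")
    .trigger(availableNow=True)
    .toTable("main.bronze.events")
)

# Batch alternative for directories with a modest number of files.
spark.sql(f"""
    COPY INTO main.bronze.events
    FROM '{landing}'
    FILEFORMAT = JSON
""")
```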
Databricks SQL\n", + "allows analysts to run SQL queries against the same\n", + "tables used in production ETL workloads, allowing for\n", + "real-time business intelligence at scale.\n", + "\n", + "With your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "[your first extract, transform a, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf))List(Increasing volume and velocity of data as companies mature., Need for faster data access and reduced latency., Challenges in data governance, including cataloging, auditing, and securing data., Complexities of using multiple cloud environments., Data storage issues such as slow access, poor query performance, and high costs., Requirement to support structured, semi-structured, and unstructured data formats.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf
7bf67f12c5d95da350ca553480cfdc9af32b7ccbf14f70f9ce8f4706b04e96e0List(List(List(What is the issue with the provided document?, user)))List(List(Unfortunately, the document does not contain any readable content., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf))List(The document lacks readable content.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf
fdc9f7bcc67a7cc4785f7f8df330c023da14c8d768905f0ad46af5323f28e480List(List(List(What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?, user)))List(List(“In a case study published in MIT Technology Review, profit margins increased and manufacturing time was reduced when digital-twin technology was implemented. Automobile manufacturing profit margins increased by 41% to 54% per model. The estimated average automobile manufacturing time was reduced to approximately 10 hours.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf))List(The increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%., The reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf
fc67f25c728d8c264f373417e09fd8ecbf4cea9ec52a0fbd9d282dae461fc310List(List(List(What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?, user)))List(List(Most of the complexity has arisen with the explosion of data volumes and data types, with organizations amassing an estimated 80% of data in unstructured and semi-structured format. As the collection of data continues to increase, 73% of the data goes unused for analytics or decision-making. In order to try and decrease this percentage and make more data usable, data engineering teams are responsible for building data pipelines to efficiently and reliably deliver data. But the process of building these complex data pipelines comes with a number of difficulties:\n", + "\n", + "• In order to get data into a data lake, data engineers are required to spend immense time hand-coding repetitive data ingestion tasks\n", + "\n", + "• Since data platforms continuously change, data engineers spend time building and maintaining, and then rebuilding, complex scalable infrastructure\n", + "\n", + "• As data pipelines become more complex, data engineers are required to find reliable tools to orchestrate these pipelines\n", + "\n", + "• With the increasing importance of real-time data, low latency data pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "• Finally, with all pipelines written, data engineers need to constantly focus on performance, tuning pipelines and architectures to meet SLAs, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf))List(Manually hand-coding repetitive data ingestion tasks, Continuously maintaining and rebuilding scalable infrastructure due to changing data platforms, Finding reliable tools for orchestrating complex pipelines, Building and maintaining low-latency pipelines for real-time data, Constantly tuning pipeline performance to meet SLAs)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf
d94a588c739512457882ea9bd39758fb222c0bef855b4c2e4d75dd8bf582c14dList(List(List(What significant advancement in large language model development happened in 2012?, user)))List(List(“ 1950s–1990s\n", + "Initial attempts are made to map hard rules around languages and follow logical steps to accomplish tasks like translating a sentence from one language to another.\n", + "\n", + "While this works sometimes, strictly defined rules only work for concrete, well-defined tasks that the system has knowledge about.\n", + "\n", + "1990s \n", + "Language models begin evolving into statistical models and language patterns start being analyzed, but larger-scale projects are limited by computing power.\n", + "\n", + "2000s \n", + "Advancements in machine learning increase the complexity of language models, and the wide adoption of the internet sees an enormous increase in available training data.\n", + "\n", + "2012 \n", + "Advancements in deep learning architectures and larger data sets lead to the development of GPT (Generative Pre-trained Transformer).\n", + "\n", + "2018 \n", + "Google introduces BERT (Bidirectional Encoder Representations from Transformers), which is a big leap in architecture and paves the way for future large language models.\n", + "\n", + "2020 \n", + "OpenAI releases GPT-3, which becomes the largest model at 175B parameters and sets a new performance benchmark for language-related tasks.\n", + "\n", + "2022 \n", + "ChatGPT is launched, which turns GPT-3 and similar models into a service that is widely accessible to users through a web interface and kicks off a huge increase in public awareness of LLMs and generative AI.\n", + "\n", + "2023 \n", + "Open source LLMs begin showing increasingly impressive results with releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna. GPT-4 is also released, setting a new benchmark for both parameter size and performance.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf))List(GPT (Generative Pre-trained Transformer) was developed, This development occurred in 2012)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf
24f005435d7bf0df92c1685538b14427bd0ad2db5e845a5b4741c13ff438c1c3List(List(List(What types of metrics are tracked in game telemetry to understand player behavior and improve the game experience?, user)))List(List(Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine. And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.\n", + "\n", + "Some of the primary metrics that are typically tracked in game telemetry include:\n", + "- Player engagement: Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.\n", + "- Game progress: Monitor player progress through different levels and milestones in the game.\n", + "- In-game purchases: Track the number and value of in-game purchases made by players.\n", + "- Player demographics: Collect demographic information about players, such as age, gender, location, and device type.\n", + "- Session length: Monitor the length of each player session, and how often players return to the game.\n", + "- Retention: Track the percentage of players who return to the game after their first session., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf))List(Metrics tracked include player engagement., Metrics tracked include game progress., Metrics tracked include in-game purchases., Metrics tracked include player demographics., Metrics tracked include session length., Metrics tracked include retention.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf
4517d3fb5f3f1e83efba44630c78e94b18db1d8f94f780810adb3550c851891bList(List(List(What are the responsibilities of a Data Engineer according to the document?, user)))List(List(We distinguish between the following personas:\n", + "Data Governance Officer: Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.\n", + "Data Engineer: Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.\n", + "Data Scientist: Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.\n", + "ML Engineer: Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment (CI/CD).\n", + "Business Stakeholder: Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf))List(A Data Engineer is responsible for building data pipelines., A Data Engineer is responsible for processing data sets., A Data Engineer is responsible for organizing data sets., A Data Engineer is responsible for persisting data sets., The responsibilities support machine learning and other downstream applications.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf
7fb5788f38a067b7fcc5b768a4161ef7068f40e35a83e3426e06608c5fcc954eList(List(List(What are the benefits of Delta Sharing in terms of data accessibility and platform compatibility?, user)))List(List(Delta Sharing provides an open solution to securely share live data from your lakehouse to any computing platform. Recipients don’t have to be on the Databricks platform or on the same cloud or a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf))List(Delta Sharing provides access to the latest version of the data without replication or moving the data., Delta Sharing allows sharing data across different computing platforms., Recipients do not need to be on Databricks., Recipients do not need to be on the same cloud or any cloud., Delta Sharing supports the use of recipients' preferred BI, analytics, and machine learning tools.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf
87873effb62b308f6aafbb34e7c24aec0439d6b92eb32b330105aa9d6aca2286List(List(List(What are the critical needs for IT and business when it comes to implementing a customer data platform?, user)))List(List(Critical IT Needs\n", + "Keep control of data access and governance; ability to architecture a customer data stack with decisions on where data is stored and where queries are executed.\n", + "Critical Business Needs\n", + "Get customer data access via a no-code interface to generate insights; build customer experiences and activate data within business applications., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf))List(IT needs to maintain control over data access and governance., IT needs the ability to make decisions on data storage and query execution., Business needs to access customer data through a no-code interface to generate insights., Business needs to build customer experiences and activate data within business applications.)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf
c345dd4a67b6fb4dd4bda2ba745bcc1182174d70624e581bbc488008fb1ff876List(List(List(What are the main advantages of the lakehouse architecture over traditional on-premises data warehouses and cloud data warehouses?, user)))List(List(“Data warehouses were designed to provide a central data repository with analytic compute capabilities to help business leaders get analytical insights, support decision-making and business intelligence (BI). Legacy on-premises data warehouse architectures are difficult to scale and make it difficult for data teams to keep up with the exponential growth of data. Oftentimes data teams publish and use a subset of well-defined data for development and testing. This slows down both innovation and time to insight. Cloud data warehouses (CDW) were an attempt to tackle the on-premises data warehouse challenges. CDWs removed the administrative burden of tasks such as setup, upgrades and backups. CDWs also improved scalability and introduced cloud’s pay-as-you-go model to reduce cost. CDWs leverage a proprietary data format to achieve cloud-scale and performance; however, this also leads to customers locked into these formats with difficult But enterprise data teams don’t need a better data warehouse. They need an innovative, simple solution that provides reliable performance, elastic scale and allows self-service to unblock analytics to access all data at a reasonable cost. The answer is the lakehouse. The lakehouse pattern represents a paradigm shift from traditional on-premises data warehouse systems that are expensive and complex to manage. It uses an open data management architecture that combines the flexibility, cost-efficiency and scale of data lakes with the data management and ACID semantics of data warehouses.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf))List(Flexibility, Cost-efficiency, Open data management architecture (avoids customer lock-in), Better scalability, Ease of management, Reliable performance, Self-service capabilities, Unblocks analytics, Supports rapid innovation, Access to all data at a reasonable cost)SYNTHETIC_FROM_DOC/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "de1daac1a320379ce055bdc8b8342a2d7ca8d1ea08483081801f8219f41dc69d", + [ + [ + [ + "What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?", + "user" + ] + ] + ], + [ + [ + "“In today’s experience-driven world, the most beloved brands are the ones that know their customers. Customers are loyal to brands that recognize their needs and preferences — and tailor user journeys and engagements accordingly.\n\nA study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer. And as organizations pursue omnichannel excellence, these same high expectations of online experiences also extend to brick-and-mortar locations — revealing for many merchants that personalized engagement is fundamental to attracting customers and expanding share of wallet.\n\nBut achieving a 360-degree view of your customers to serve personalized experiences requires integrating various types of data — including demographics, behavioral and transactional — to develop robust profiles. This guide focuses on six actionable strategic pillars for businesses to leverage automation, real-time data, AI-driven analysis and well-tuned ML models to architect and deliver customized customer experiences at every touch point.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf" + ] + ], + [ + "76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf" + ], + [ + "4b452a4426892dea5c35302c50dc70d62c0b2993f478af59a42b59d7c258bfa0", + [ + [ + [ + "What are two key challenges mentioned for predictive maintenance in government agencies?", + "user" + ] + ] + ], + [ + [ + "##### Overview\n\n**Integrating unstructured data**\nEquipment data doesn’t just come in the form of IoT data. Agencies can gather rich unstructured signals like audio, visual (e.g., video inspections) and text (e.g., maintenance logs). Most legacy data architectures are unable to integrate structured and unstructured data sources.\n\n**Operationalizing machine learning**\nMost agencies lack the advanced analytics tools needed to build models that can predict potential equipment failures. Those that do typically have their data scientists working in a siloed set of tools, resulting in unnecessary data replication and inefficient workflows.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf" + ] + ], + [ + "Difficulty integrating structured and unstructured data sources due to legacy data architectures.", + "Inefficient workflows caused by a lack of advanced analytics tools and siloed environments for data scientists." 
+ ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf" + ], + [ + "6d1c05783fb5945cc9b121919eabdc2194c9c64809821e3c30b7f758a4d12a40", + [ + [ + [ + "What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?", + "user" + ] + ] + ], + [ + [ + "```\nOur most popular use case is natural language processing\n(NLP), a rapidly growing field that enables businesses to\ngain value from unstructured textual data. This opens the\ndoor for users to accomplish tasks that were previously\ntoo abstract for code, such as summarizing content or\nextracting sentiment from customer reviews. In our data\nset, 49% of libraries used are associated with NLP. LLMs\nalso fall within this bucket. Given the innovations launched\nin recent months, we expect to see NLP take off even\nmore in coming years as it is applied to use cases like\nchatbots, research assistance, fraud detection, content\ngeneration and more.\n```", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf" + ] + ], + [ + "49% of specialized Python libraries in the data set are associated with NLP.", + "Examples of tasks enabled by NLP include summarizing content, extracting sentiment from customer reviews, chatbots, research assistance, fraud detection, and content generation." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf" + ], + [ + "8fc168f55c01c3d4059869879a9e54e8601faef19e46f011ac239c44dbe72f40", + [ + [ + [ + "Why is real-time data crucial for retail operations, and what problems do legacy systems cause?", + "user" + ] + ] + ], + [ + [ + "“Retailers need real-time data to support these decisions, but legacy systems are limited to data that’s hours or days old. When seconds matter, only the Lakehouse delivers better decisions [...] most retailers still rely on legacy data systems, which impedes their ability to scale these innovations. Unfortunately, most legacy systems are only able to process information in hours or days. The delays caused by waiting for data are leading to significant risks and costs for the industry.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf" + ] + ], + [ + "Real-time data enables immediate decision-making.", + "Real-time data enables better decision-making in critical moments.", + "Legacy systems process outdated data.", + "Legacy systems cause delays.", + "Legacy systems lead to risks for the retail industry.", + "Legacy systems lead to costs for the retail industry." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf" + ], + [ + "66725804819c75f5e3005072cb81414f01272d64b1b0a8ea89a58392599b1ff7", + [ + [ + [ + "What are the key features and advantages of the lakehouse pattern?", + "user" + ] + ] + ], + [ + [ + "“The lakehouse pattern represents a paradigm shift from traditional on-premises data warehouse systems that are expensive and complex to manage. It uses an open data management architecture that combines the flexibility, cost-efficiency, and scale of data lakes with the data management and ACID semantics of data warehouses. 
A lakehouse pattern enables data transformation, cleansing, and validation to support both business intelligence and machine learning (ML) users on all data. Lakehouse is cloud-centric and unifies a complete up-to-date data set for teams, allowing collaboration across an organization.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf" + ] + ], + [ + "The lakehouse pattern has an open data management architecture.", + "It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics.", + "It supports data transformation, cleansing, and validation.", + "The lakehouse pattern is cloud-centric.", + "It enhances support for both business intelligence and machine learning.", + "It is cost-efficient.", + "It offers an up-to-date unified data set.", + "It improves collaboration across the organization." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf" + ], + [ + "1373db51df7476c934e04796eaceed4d4475d7b7a70efcb3405b121c71e96923", + [ + [ + [ + "What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?", + "user" + ] + ] + ], + [ + [ + "Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine. And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.\n\nSome of the primary metrics that are typically tracked in game telemetry include:\n\n- **Player engagement:** Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.\n- **Game progress:** Monitor player progress through different levels and milestones in the game.\n- **In-game purchases:** Track the number and value of in-game purchases made by players.\n- **Player demographics:** Collect demographic information about players, such as age, gender, location, and device type.\n- **Session length:** Monitor the length of each player session, and how often players return to the game.\n- **Retention:** Track the percentage of players who return to the game after their first session.\n- **User Acquisition:** Track the number of new players acquired through different marketing channels.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf" + ] + ], + [ + "Game telemetry is data collected about player behavior and interactions within a video game.", + "The data is primarily sourced from the game engine.", + "Primary metrics tracked in game telemetry include:\n - player engagement\n - game progress\n - in-game purchases\n - player demographics\n - session length\n - retention\n - user acquisition" + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf" + ], + [ + "3b231daee5434db054e2ee8b4aee9b4edba19aa8886c0d491daa1b36b743142f", + [ + [ + [ + "What are some of the common problems faced by data lakes according to the document?", + "user" + ] + ] + ], + [ + [ + "**Challenges with data lakes**\nData lakes are a common element within modern data architectures. 
They serve as a\ncentral ingestion point for the plethora of data that organizations seek to gather and\nmine. While a good step forward in getting to grips with the range of data, they run\ninto the following common problems:\n\n**1. Reading and writing into data lakes is not reliable.** Data engineers often run into\nthe problem of unsafe writes into data lakes that cause readers to see garbage\ndata during writes. They have to build workarounds to ensure readers always see\nconsistent data during writes.\n\n**2. The data quality in data lakes is low.** Dumping unstructured data into a data\nlake is easy, but this comes at the cost of data quality. Without any mechanisms\nfor validating schema and the data, data lakes suffer from poor data quality. As a\nconsequence, analytics projects that strive to mine this data also fail.\n\n**3. Poor performance with increasing amounts of data.** As the amount of data\nthat gets dumped into a data lake increases, the number of files and directories\nalso increases. Big data jobs and query engines that process the data spend a\nsignificant amount of time handling the metadata operations. This problem is more\npronounced in the case of streaming jobs or handling many concurrent batch jobs.\n\n**4. Modifying, updating or deleting records in data lakes is hard.** Engineers need to\nbuild complicated pipelines to read entire partitions or tables, modify the data and\nwrite them back. Such pipelines are inefficient and hard to maintain.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf" + ] + ], + [ + "Unreliable reading and writing operations", + "Low data quality due to the lack of validation mechanisms", + "Poor performance with increasing data volume", + "Difficulty in modifying, updating, or deleting records" + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf" + ], + [ + "9673989eb3b8242fc0a48d6338f31191260dd7cf6c7eacb26f2ed1512af803a2", + [ + [ + [ + "What new opportunities can data sharing create for organizations looking to generate additional revenue?", + "user" + ] + ] + ], + [ + [ + "**Key benefits of data sharing**\n\nAs you can see from the use cases described above, there are many benefits of data sharing, including:\n\n**Greater collaboration with existing partners.** In today’s hyper-connected digital economy, no single organization can advance its business objectives without partnerships. Data sharing helps solidify existing partnerships and can help organizations establish new ones.\n**Ability to generate new revenue streams.** With data sharing, organizations can generate new revenue streams by offering data products or data services to their end consumers.\n**Ease of producing new products, services or business models.** Product teams can leverage both first-party data and third-party data to refine their products and services and expand their product/service catalog.\n**Greater efficiency of internal operations.** Teams across the organization can meet their business goals far more quickly when they don’t have to spend time figuring out how to free data from silos. 
When teams have access to live data, there’s no lag time between the need for data and the connection with the appropriate data source.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf" + ] + ], + [ + "Data sharing can enable organizations to offer data products.", + "Data sharing can enable organizations to offer data services." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf" + ], + [ + "21866cbed9a5ba0daafc9367a06f6679f7e6290dd05b59cfd45d36fdbc8fbe73", + [ + [ + [ + "Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?", + "user" + ] + ] + ], + [ + [ + "**EBOOK**\n\n## The Big Book of Data Engineering 2nd Edition\n\nA collection of technical\nblogs, including code\nsamples and notebooks\n\n##### With all-new content\n\n\n-----\n\n#### Contents\n\n**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n\n**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n\n**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n\n**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n\n**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n\n**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n\n**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n\n**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n\n**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n\n**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n\n**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n\n**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n\n**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... 
**74**\n\n**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n\n**4 . 1** Akamai .................................................................................................................................................................................................... 77\n\n**4 . 2** Grammarly ........................................................................................................................................................................................... 80\n\n**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n\n**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n\n**4 . 5** Rivian .................................................................................................................................................................................................... 90\n\n**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n\n\n-----\n\n**SECTION**\n\n# 01\n\n\n### Introduction to Data Engineering on Databricks\n\n\n-----\n\nOrganizations realize the value data plays as a strategic asset for various\nbusiness-related initiatives, such as growing revenues, improving the customer\nexperience, operating efficiently or improving a product or service. However,\naccessing and managing data for these initiatives has become increasingly\ncomplex. Most of the complexity has arisen with the explosion of data volumes\nand data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\nto increase, 73% of the data goes unused for analytics or decision-making. In\norder to try and decrease this percentage and make more data usable, data\nengineering teams are responsible for building data pipelines to efficiently and\nreliably deliver data. 
But the process of building these complex data pipelines\ncomes with a number of difficulties:\n\n**•** In order to get data into a data lake, data engineers are required\nto spend immense time hand-coding repetitive data ingestion tasks\n\n**•** Since data platforms continuously change, data engineers\nspend time building and maintaining, and then rebuilding, complex\nscalable infrastructure\n\n**•** As data pipelines become more complex, data engineers are\nrequired to find reliable tools to orchestrate these pipelines\n\n**•** With the increasing importance of real-time data, low latency data\npipelines are required, which are even more difficult to build and maintain\n\n**•** Finally, with all pipelines written, data engineers need to constantly\nfocus on performance, tuning pipelines and architectures to meet SLAs\n\n\n**How can Databricks help?**\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The Lakehouse Platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability to\ndrive valuable insights.\n\nLakehouse Platform\n\n**One platform to support multiple personas**\n\n\n**BI & Data**\n**Warehousing**\n\n\n**Data**\n**Engineering**\n\n\n**Data**\n**Streaming**\n\n\n**Data**\n**Science & ML**\n\n\n©2023 Databricks Inc. — All rights reserved\n\n\n**Unity Catalog**\n**Fine-grained governance for data and AI**\n\n**Delta Lake**\n**Data reliability and performance**\n\n**Cloud Data Lake**\n\nAll Raw Data (Logs, Texts, Audio, Video, Images)\n\n\nFigure 1\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n\n\n-----\n\n**Key differentiators for successful data engineering**\n**with Databricks**\n\nBy simplifying on a lakehouse architecture, data engineers need an\nenterprise-grade and enterprise-ready approach to building data pipelines.\nTo be successful, a data engineering solution team must embrace these eight\nkey differentiating capabilities:\n\n**Data ingestion at scale**\nWith the ability to ingest petabytes of data with auto-evolving schemas,\ndata engineers can deliver fast, reliable, scalable and automatic data for\nanalytics, data science or machine learning. This includes:\n\n**•** Incrementally and efficiently processing data as it arrives\nfrom files or streaming sources like Kafka, DBMS and NoSQL\n\n**•** Automatically inferring schema and detecting column\nchanges for structured and unstructured data formats\n\n**•** Automatically and efficiently tracking data as it arrives with\n\nno manual intervention\n\n**•** Preventing data loss by rescuing data columns\n\n\n**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. 
This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----\n\n**Unified orchestration of data workflows**\nSimple, clear and reliable orchestration of data processing tasks for data,\nanalytics and machine learning pipelines with the ability to run multiple\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\nin a DAG using Databricks Workflows, an orchestration tool included in the\nlakehouse with no need to maintain or pay for an external orchestration service.\n\n**•** Easily create and manage multiple tasks with dependencies via UI,\nAPI or from your IDE\n\n**•** Have full observability to all workflow runs and get alerted when\ntasks fail for fast troubleshooting and efficient repair and rerun\n\n**•** Leverage high reliability of 99.95% uptime\n\n**•** Use performance optimization clusters that parallelize jobs and\nminimize data movement with cluster reuse\n\n**Data quality validation and monitoring**\nImprove data reliability throughout the data lakehouse so data teams can\nconfidently trust the information for downstream initiatives by:\n\n**•** Defining data quality and integrity controls within the pipeline\nwith defined data expectations\n\n**•** Addressing data quality errors with predefined policies\n(fail, drop, alert, quarantine)\n\n**•** Leveraging the data quality metrics that are captured, tracked\nand reported for the entire data pipeline\n\n\nData\nSources\n\nData\nWarehouses\n\nOn-premises\nSystems\n\nSaaS\nApplications\n\nMachine &\nApplication Logs\n\nApplication\nEvents\n\nMobile & IoT\nData\n\n\nCloud\nStorage\n\nMessag\ne Buses\n\n\n**Lakehouse Platform**\n\n**Workflows** for end-to-end orchestration\n\n\nReal-Time BI Apps\n\nReal-Time AI Apps\n\n\nReal-Time Analytics with\n**Databricks SQL**\n\nReal-Time Machine Learning\nwith\n**Databricks ML**\n\n\nStreaming ETL with\n**Delta Live Tables**\n\n\nPredictive\nMaintenance\n\n\nPersonalized\nOffers\n\n\nPatient\nDiagnostics\n\n\nReal-Time Operational\nApps\n\n\nReal-Time Applications with\n**Spark Structured Streaming**\n\n**Photon** for lightning-fast data processing\n\n**Unity Catalog** for data governance and sharing\n\n**Delta Lake** for open and reliable data storage\n\n\nAlerts Detection Fraud\n\n\nDynamic\nPricing\n\n\n©2023 Databricks Inc. 
— All rights reserved\n\nFigure 2\nA unified set of tools for real-time data processing\n\n\n-----\n\n**Fault tolerant and automatic recovery**\nHandle transient errors and recover from most common error conditions\noccurring during the operation of a pipeline with fast, scalable automatic\nrecovery that includes:\n\n**•** Fault tolerant mechanisms to consistently recover the state of data\n\n**•** The ability to automatically track progress from the source with\ncheckpointing\n\n**•** The ability to automatically recover and restore the data pipeline state\n\n**Data pipeline observability**\nMonitor overall data pipeline status from a dataflow graph dashboard and\nvisually track end-to-end pipeline health for performance, quality and latency.\nData pipeline observability capabilities include:\n\n**•** A high-quality, high-fidelity lineage diagram that provides visibility\ninto how data flows for impact analysis\n\n**•** Granular logging with performance and status of the data pipeline\nat a row level\n\n**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n\n\n**Automatic deployments and operations**\nEnsure reliable and predictable delivery of data for analytics and machine\nlearning use cases by enabling easy and automatic data pipeline deployments\nand rollbacks to minimize downtime. Benefits include:\n\n**•** Complete, parameterized and automated deployment for the\ncontinuous delivery of data\n\n**•** End-to-end orchestration, testing and monitoring of data pipeline\ndeployment across all major cloud providers\n\n**Migrations**\nAccelerating and de-risking the migration journey to the lakehouse, whether\nfrom legacy on-prem systems or disparate cloud services.\n\nThe migration process starts with a detailed discovery and assessment to\nget insights on legacy platform workloads and estimate migration as well as\nDatabricks platform consumption costs. Get help with the target architecture\nand how the current technology stack maps to Databricks, followed by a\nphased implementation based on priorities and business needs. Throughout\nthis journey companies can leverage:\n\n**•** Automation tools from Databricks and its ISV partners\n\n**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n\n**•** Databricks Professional Services and training\n\nThis is the recommended approach for a successful migration, whereby\ncustomers have seen a 25-50% reduction in costs and 2-3x faster time to value\nfor their use cases.\n\n\n-----\n\n**Unified governance**\nWith Unity Catalog, data engineering and governance teams benefit from an\nenterprisewide data catalog with a single interface to manage permissions,\ncentralize auditing, automatically track data lineage down to the column level,\nand share data across platforms, clouds and regions. Benefits:\n\n**•** Discover all your data in one place, no matter where it lives,\nand centrally manage fine-grained access permissions using an\nANSI SQL-based interface\n\n**•** Leverage automated column-level data lineage to perform impact\nanalysis of any data changes across the pipeline and conduct\nroot cause analysis of any errors in the data pipelines\n\n**•** Centrally audit data entitlements and access\n\n**•** Share data across clouds, regions and data platforms,\nwhile maintaining a single copy of your data in your cloud storage\n\n\n©2023 Databricks Inc. 
— All rights reserved\n\nFigure 3\nThe Databricks Lakehouse Platform integrates with a large collection of technologies\n\n\n**A rich ecosystem of data solutions**\nThe Databricks Lakehouse Platform is built on open source technologies and\nuses open standards so leading data solutions can be leveraged with anything\nyou build on the lakehouse. A large collection of technology partners make it\neasy and simple to integrate the technologies you rely on when migrating to\nDatabricks and to know you are not locked into a closed data technology stack.\n\n\n-----\n\n**Conclusion**\n\nAs organizations strive to become data-driven, data engineering is a focal\npoint for success. To deliver reliable, trustworthy data, data engineers shouldn’t\nneed to spend time manually developing and maintaining an end-to-end\nETL lifecycle. Data engineering teams need an efficient, scalable way to\nsimplify ETL development, improve data reliability and manage operations.\n\nAs described, the eight key differentiating capabilities simplify the\nmanagement of the ETL lifecycle by automating and maintaining all data\ndependencies, leveraging built-in quality controls with monitoring and by\nproviding deep visibility into pipeline operations with automatic recovery.\nData engineering teams can now focus on easily and rapidly building reliable\nend-to-end production-ready data pipelines using only SQL or Python\nfor batch and streaming that deliver high-value data for analytics, data\nscience or machine learning.\n\n\n**Follow proven best practices**\n\nIn the next section, we describe best practices for data engineering\nend-to end use cases drawn from real-world examples. From data ingestion\nand real-time processing to analytics and machine learning, you’ll learn\nhow to translate raw data into actionable data.\n\nAs you explore the rest of this guide, you can find data sets and code\nsamples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\nget your hands dirty as you explore all aspects of the data lifecycle on the\nDatabricks Lakehouse Platform.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 02\n\n\n### Guidance and Best Practices\n\n**2.1** Top 5 Databricks Performance Tips\n\n**2.2** How to Profile PySpark\n\n**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n\n**2.4** Streaming in Production: Collected Best Practices\n\n**2.5** Streaming in Production: Collected Best Practices, Part 2\n\n**2.6** Building Geospatial Data Products\n\n**2.7** Data Lineage With Unity Catalog\n\n**2.8** Easy Ingestion to Lakehouse With COPY INTO\n\n**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n\n**2.10** Best Practices for Cross-Government Data Sharing\n\n\n-----\n\nSECTION 2.1\n\n**Top 5 Databricks Performance Tips**\n\nby **B R YA N S M I T H** and **R O B S A K E R**\n\nMarch 10, 2022\n\n\nAs solutions architects, we work closely with customers every day to help them\nget the best performance out of their jobs on Databricks — and we often end\nup giving the same advice. It’s not uncommon to have a conversation with a\ncustomer and get double, triple, or even more performance with just a few\ntweaks. So what’s the secret? How are we doing this? 
Here are the top 5 things\nwe see that can make a huge impact on the performance customers get\nfrom Databricks.\n\nHere’s a TLDR:\n\n**•** **Use larger clusters.** It may sound obvious, but this is the number\none problem we see. It’s actually not any more expensive to use a large\ncluster for a workload than it is to use a smaller one. It’s just faster.\nIf there’s anything you should take away from this article, it’s this.\n\nRead section 1. Really.\n\n**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\nto learn more. You won’t regret it.\n\n\n\n**•** **Clean out your configurations** . Configurations carried from one\nApache Spark™ version to the next can cause massive problems. Clean up!\nRead section 3 to learn more.\n\n**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\ncorrectly, if at all. See Section 4 to learn more.\n\n**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\nyou’re writing Spark code, jump to section 5.\n\n**•** **Bonus tip! Table design is super important** . We’ll go into this in a future\nblog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n\n**1. Give your clusters horsepower!**\n\nThis is the number one mistake customers make. Many customers create tiny\nclusters of two workers with four cores each, and it takes forever to do anything.\nThe concern is always the same: they don’t want to spend too much money on\nlarger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n\n\n-----\n\nThe key is that you’re renting the cluster for the length of the workload. So, if\nyou spin up that two worker cluster and it takes an hour, you’re paying for those\nworkers for the full hour. However, if you spin up a four worker cluster and it takes\nonly half an hour, the cost is actually the same! And that trend continues as long\nas there’s enough work for the cluster to do.\n\nHere’s a hypothetical scenario illustrating the point:\n\n**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n\n1 $1 2 $2\n\n2 $2 1 $2\n\n4 $4 0.5 $2\n\n8 $8 0.25 $2\n\nNotice that the total cost of the workload stays the same while the real-world\ntime it takes for the job to run drops significantly. So, bump up your Databricks\ncluster specs and speed up your workloads without spending any more money. It\n\ncan’t really get any simpler than that.\n\n**2. Use Photon**\n\nOur colleagues in engineering have rewritten the Spark execution engine in C++\nand dubbed it Photon. The results are impressive!\n\n\nBeyond the obvious improvements due to running the engine in native code,\nthey’ve also made use of CPU-level performance features and better memory\n\nmanagement. On top of this, they’ve rewritten the Parquet writer in C++. So this\nmakes writing to Parquet and Delta (based on Parquet) super fast as well!\n\nBut let’s also be clear about what Photon is speeding up. It improves\ncomputation speed for any built-in functions or operations, as well as writes to\nParquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! 
That UDF\n(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\nspending most of its time reading from an ancient on-prem database? Won’t\nhelp there either, unfortunately.\n\n\n-----\n\nThe good news is that it helps where it can. So even if part of your job can’t be\nsped up, it will speed up the other parts. Also, most jobs are written with the\nnative operations and spend a lot of time writing to Delta, and Photon helps a lot\nthere. So give it a try. You may be amazed by the results!\n\n**3. Clean out old configurations**\n\nYou know those Spark configurations you’ve been carrying along from version to\nversion and no one knows what they do anymore? They may not be harmless.\nWe’ve seen jobs go from running for hours down to minutes simply by cleaning\nout old configurations. There may have been a quirk in a particular version of\nSpark, a performance tweak that has not aged well, or something pulled off\nsome blog somewhere that never really made sense. At the very least, it’s worth\nrevisiting your Spark configurations if you’re in this situation. Often the default\nconfigurations are the best, and they’re only getting better. Your configurations\nmay be holding you back.\n\n**4. The Delta Cache is your friend**\n\nThis may seem obvious, but you’d be surprised how many people are not using\nthe [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\nthe workers’ SSDs for faster access.\n\n\nIf you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\nby default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\nyour “hot” tables when you’re starting an endpoint. This will ensure blazing fast\nspeeds for any queries on those tables.\n\nIf you’re using regular clusters, be sure to use the i3 series on Amazon Web\nServices (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\nall have fast SSDs and caching enabled by default.\n\nOf course, your mileage may vary. If you’re doing BI, which involves reading the\nsame tables over and over again, caching gives an amazing boost. However, if\nyou’re simply reading a table once and writing out the results as in some ETL\njobs, you may not get much benefit. You know your jobs better than anyone.\nGo forth and conquer.\n\n\n-----\n\n**5. Be aware of lazy evaluation**\n\n\nHowever, there is a catch here. Every time you try to display or write out\nresults, it runs the execution plan again. Let’s look at the same block of code\nbut extend it and do a few more operations.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n.filter(...)\n)\n\n_# Now run the execution plan to get results_\ndf2.display()\n\n_# Unfortunately this will run the plan again, including filtering, joining,_\n_etc_\ndf2.display()\n\n_# So will this…_\ndf2.count()\n—------\n\n\nIf you’re a data analyst or data scientist only using SQL or doing BI you can skip\nthis section. However, if you’re in data engineering and writing pipelines or doing\nprocessing using Databricks/Spark, read on.\n\nWhen you’re writing Spark code like select, groupBy, filter, etc., you’re really\nbuilding an execution plan. You’ll notice the code returns almost immediately when\nyou run these functions. That’s because it’s not actually doing any computation. 
So\neven if you have petabytes of data, it will return in less than a second.\n\nHowever, once you go to write your results out you’ll notice it takes longer. This\nis due to lazy evaluation. It’s not until you try to display or write results that your\nexecution plan is actually run.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n\n\n_# Now run the execution plan to get results_\ndf2.display()\n—------\n\n\n-----\n\nThe developer of this code may very well be thinking that they’re just printing\nout results three times, but what they’re really doing is kicking off the same\nprocessing three times. Oops. That’s a lot of extra work. This is a very common\nmistake we run into. So why is there lazy evaluation, and what do we do about it?\n\nIn short, processing with lazy evaluation is way faster than without it.\nDatabricks/Spark looks at the full execution plan and finds opportunities\nfor optimization that can reduce processing time by orders of magnitude.\nSo that’s great, but how do we avoid the extra computation? The answer\nis pretty straightforward: save computed results you will reuse.\n\n\nThis works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\nbenefit greatly from lazy evaluation, but it’s something a lot of customers trip\nover. So be aware of its existence and save results you reuse in order to avoid\nunnecessary computation.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nLet’s look at the same block of code again, but this time let’s avoid the\nrecomputation:\n\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n)\n\n_# save it_\ndf2.write.save(path)\n\n_# load it back in_\ndf3 = spark.read.load(path)\n\n_# now use it_\ndf3.display()\n\n_# this is not doing any extra computation anymore. No joins, filtering,_\n_etc. It’s already done and saved._\ndf3.display()\n\n_# nor is this_\ndf3.count()\n\n\n-----\n\nSECTION 2.2 \u0007\n\n**How to Profile PySpark**\n\nby **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n\nOctober 6, 2022\n\n\nIn Apache Spark™, declarative Python APIs are supported for big data workloads.\nThey are powerful enough to handle most common use cases. Furthermore,\nPySpark UDFs offer more flexibility since they enable users to run arbitrary\nPython code on top of the Apache Spark™ engine. Users only have to state\n“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\nPySpark easier to use, but it can be difficult to identify performance bottlenecks\nand apply custom optimizations.\n\nTo address the difficulty mentioned above, PySpark supports various profiling\ntools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n[implementations](https://docs.python.org/3/library/profile.html) . PySpark Profilers provide information such as the number\nof function calls, total time spent in the given function, and filename, as well\nas line number to help navigation. 
That information is essential to exposing\ntight loops in your PySpark programs, and allowing you to make performance\n\nimprovement decisions.\n\n\n**Driver profiling**\n\nPySpark applications run as independent sets of processes on a cluster,\ncoordinated by the SparkContext object in the driver program. On the driver\nside, PySpark is a regular Python process; thus, we can profile it as a normal\nPython program using cProfile as illustrated below:\n\nimport cProfile\n\nwith cProfile.Profile() as pr:\n_# Your code_\n\npr.print_stats()\n\n**Workers profiling**\n\nExecutors are distributed on worker nodes in the cluster, which introduces\ncomplexity because we need to aggregate profiles. Furthermore, a Python worker\nprocess is spawned per executor for PySpark UDF execution, which makes the\nprofiling more intricate.\n\n\n-----\n\nThe UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\nand becomes a major tool to profile workers for PySpark applications. We’ll\nillustrate how to use the UDF profiler with a simple Pandas UDF example.\n\nFirstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n```\n sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n by 'id' )\n ).withColumn( 'v' , rand())\n\n```\nLater, we will group by the id column, which results in 8 groups with 1,000 rows\nper group.\n\nThe Pandas UDF plus_one is then created and applied as shown below:\n```\n import pandas as pd\n def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf.apply( lambda x: x + 1 , axis= 1 )\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n res.collect()\n\n```\n\nExecuting the example above and running sc.show_profiles() prints the\nfollowing profile. The profile below can also be dumped to disk by sc.dump_\nprofiles(path).\n\nThe UDF id in the profile (271, highlighted above) matches that in the Spark plan\nfor res. The Spark plan can be shown by calling res.explain() .\n\n\nNote that plus_one takes a pandas DataFrame and returns another pandas\nDataFrame. For each group, all columns are passed together as a pandas\nDataFrame to the plus_one UDF, and the returned pandas DataFrames are\ncombined into a PySpark DataFrame.\n\n\n-----\n\nThe first line in the profile’s body indicates the total number of calls that were\nmonitored. The column heading includes\n\n**•** ncalls , for the number of calls.\n\n**•** tottime , for the total time spent in the given function (excluding time\nspent in calls to sub-functions)\n\n**•** percall , the quotient of tottime divided by ncalls\n\n**•** cumtime , the cumulative time spent in this and all subfunctions (from\ninvocation till exit)\n\n**•** percall , the quotient of cumtime divided by primitive calls\n\n**•** filename:lineno(function) , which provides the respective information\nfor each function\n\nDigging into the column details: plus_one is triggered once per group, 8 times\nin total; _arith_method of pandas Series is called once per row, 8,000 times\nin total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\nrow, thus suffering from high invocation overhead.\n\nWe can reduce such overhead by substituting the pandas.DataFrame.apply\nwith pdf + 1, which is vectorized in pandas. 
The optimized Pandas UDF looks as\nfollows:\n```\n import pandas as pd\n def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf + 1\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n schema)\n res.collect()\n\n```\n\nThe updated profile is as shown below.\n\nWe can summarize the optimizations as follows:\n\n**•** Arithmetic operation from 8,000 calls to 8 calls\n\n**•** Total function calls from 2,898,160 calls to 2,384 calls\n\n**•** Total execution time from 2.300 seconds to 0.004 seconds\n\nThe short example above demonstrates how the UDF profiler helps us deeply\nunderstand the execution, identify the performance bottleneck and enhance\nthe overall performance of the user-defined function.\n\nThe UDF profiler was implemented based on the executor-side profiler,\nwhich is designed for PySpark RDD API. The executor-side profiler is available\nin all active Databricks Runtime versions.\n\n\n-----\n\nBoth the UDF profiler and the executor-side profiler run on Python workers.\nThey are controlled by the spark.python.profile Spark configuration, which\nis false by default. We can enable that Spark configuration on a Databricks\nRuntime cluster as shown below.\n\n\n**Conclusion**\n\nPySpark profilers are implemented based on cProfile; thus, the profile reporting\nrelies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\ncollecting profile reports from Python workers.\n\nPowerful profilers are provided by PySpark in order to identify hot loops and\nsuggest potential improvements. They are easy to use and critical to enhance\nthe performance of PySpark programs. The UDF profiler, which is available\nstarting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\nchallenges and brings insights to user-defined functions.\n\nIn addition, there is an ongoing effort in the Apache Spark™ open source\ncommunity to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\nmore information.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.3 \u0007\n\n**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n**and Apache Kafka**\n\nby **F R A N K M U N Z**\n\nAugust 9, 2022\n\n\n[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\napproach for creating reliable data pipelines and fully manages the underlying\ninfrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\nactionable insights derived from near real-time data. Delta Live Tables enables\nlow-latency streaming data pipelines to support such use cases with low\nlatencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n\nThis article will walk through using DLT with Apache Kafka while providing the\nrequired Python code to ingest streams. 
The recommended system architecture\nwill be explained, and related DLT settings worth considering will be explored\nalong the way.\n\n**Streaming platforms**\n\nEvent buses or message buses decouple message producers from consumers.\nA popular streaming use case is the collection of click-through data from\nusers navigating a website where every user interaction is stored as an event in\n\n\nApache Kafka. The event stream from Kafka is then used for real-time streaming\ndata analytics. Multiple message consumers can read the same data from Kafka\nand use the data to learn about audience interests, conversion rates, and bounce\nreasons. The real-time, streaming event data from the user interactions often\nalso needs to be correlated with actual purchases stored in a billing database.\n\n**Apache Kafka**\n\n[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\ntopic, an append-only distributed log of events where messages are buffered for\na certain amount of time. Although messages in Kafka are not deleted once they\nare consumed, they are also not stored indefinitely. The message retention for\n\nKafka can be configured per topic and defaults to 7 days. Expired messages will\nbe deleted eventually.\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to many other event busses or messaging systems.\n\n\n-----\n\n**Streaming data pipelines**\n\n\nIn a data flow pipeline, Delta Live Tables and their dependencies can be declared\nwith a standard SQL Create Table As Select (CTAS) statement and the DLT\nkeyword “live.”\n\nWhen developing DLT with Python, the @dlt.table decorator is used to", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ] + ], + [ + "Larger clusters execute workloads faster in Databricks.", + "The faster execution reduces the total time required for workload completion.", + "The overall cost efficiency is balanced due to reduced workload completion time despite higher hourly costs." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ], + [ + "088c4943384eaa6a228c3d68ff70fbef6bcbe9c50176180e73244de1d7f3be1a", + [ + [ + [ + "What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?", + "user" + ] + ] + ], + [ + [ + "```\nTECHNICAL GUIDE\n\n```\n\n# Solving Common Data Challenges \n\n\n#### Startups and Digital Native Businesses\n\n\n-----\n\n### Table of Contents\n\n\n# 01\n```\nCHALLENGE:\n \u0003\n\n###### Creating a unified data architecture for data quality, governance and efficiency\n\n# 03\nCHALLENGE:\n \u0003\n\n###### Building effective machine learning operations\n\n```\n\n# 02\n```\nCHALLENGE:\n \u0003\n\n###### Building a data architecture to support scale and performance\n\n# 04\nSUMMARY:\n\n###### The Databricks Lakehouse Platform addresses these challenges\n\n```\n\n-----\n\n**I N T R O D U C T I O N**\n\n\nThis guide shares how the lakehouse architecture can increase\nproductivity and cost-efficiently support all your data, analytics\nand AI workloads, and flexibly scale with the pace of growth\nfor your company. Read the entire guide or dive straight into a\nspecific challenge.\n\nWith the advent of cloud infrastructure, a new generation of\nstartups has rapidly built and scaled their businesses. 
The use of\ncloud infrastructure, once seen as innovative, has now become\ntable stakes. The differentiator for the fastest-moving startups\nand digital natives now comes from the effective use of data\nat scale, primarily analytics and AI. Digital natives — defined\nas fast-moving, lean, and technically savvy, born-in-the-cloud\norganizations — are beginning to focus on new data-driven use\ncases such as real-time machine learning and personalized\ncustomer experiences.\n\nTo pursue these new data-intensive use cases and initiatives,\norganizations must look beyond the technologies that delivered\nthem to this point in time. Over time, these technologies, such\nas transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n\nThis guide examines some of the biggest data challenges and\nsolutions for startups and for scaling digital native businesses\nthat have reached the point where an end-to-end modern data\nplatform is a smart investment. Some key considerations include:\nsystems that are not cost-efficient and require time-consuming\nadministration and engineering toil. In addition to growing\nmaintenance needs, data is often stored in disparate locations\nand formats, with little or no governance, making real-time use\ncases, analytics and AI difficult or impossible.\n\n\n**Consolidating on a unified data platform**\nAs mentioned above, siloed data storage and management add administrative and\nfinancial cost. You can benefit significantly when you unify your data in one location\nwith a flexible architecture that scales with your needs and delivers performance\nfor future success. For this, you will want an open platform that supports all your\ndata including batch and streaming workloads, data analytics and machine learning.\nWith data unification, you create a more efficient, integrated approach to ingesting,\ncleaning and organizing your data. You also need automation to make data analysis\neasier for the nontechnical users in the company. But broader data access also\nmeans more focus on security, privacy, compliance and access control, which can\ncreate overhead for a growing.\n\n**Scaling up capacity and increasing performance**\n**and usability of the data solutions**\nData teams at growing digital native organizations find it time intensive and costly to\nhandle the growing volume and velocity of their data being ingested from multiple\nsources, across multiple clouds. You now need a unified and simplified platform that\ncan instantly scale up capacity and deliver more computing power on demand to\nfree up your data teams to produce outputs more quickly. This lowers the total cost\nfor the overall infrastructure by eliminating redundant licensing, infrastructure and\nadministration costs.\n\n**Building effective machine learning operations**\nFor data teams beginning their machine learning journeys, the challenge of training\ndata models can increase in management complexity. Many teams with disparate\ncoding needs for the entire model lifecycle suffer inefficiencies from transferring\ndata and code across many separate services. 
To build and manage effective\nML operations, consider an end-to-end MLOps environment that brings all data\ntogether in one place and incorporates managed services for experiment tracking,\nmodel training, feature development and feature and model serving.\n\n\n-----\n\n# 01\n```\nCHALLENGE: \u0003\n\n## Create a unified data architecture for data quality, governance and efficiency\n\n```\n\n-----\n\n```\nCHALLENGE 01\n\n### Create a unified data architecture for data quality, governance and efficiency\n\n```\nAs cloud-born companies grow, data volumes rapidly increase, leading to new\nchallenges and use cases. Among the challenges:\n\n\nApplication stacks optimized for transaction\nuse cases aren’t able to handle the volume,\nvelocity and variety of data that modern data\nteams require. For example, this leads to query\nperformance issues as data volume grows.\n\nData silos develop as each team within an\norganization chooses different ETL/ELT and\nstorage solutions for their needs. As the\norganization grows and changes, these pipelines\nand storage solutions become brittle, hard to\nmaintain and nearly impossible to integrate.\n\n\nThese data silos lead to discoverability,\nintegration and access issues, which prevent\nteams from leveraging the full value of the\norganization’s available data.\n\nData governance is hard. Disparate ETL/ELT\nand storage solutions lead to governance,\ncompliance, auditability and access control\nchallenges, which expose organizations to\ntremendous risk.\n\n\nThe Databricks Lakehouse Platform provides\na unified set of tools for building, deploying,\nsharing and maintaining data solutions at scale.\nIt integrates with cloud storage and the security\nin your cloud account, manages and deploys\ncloud infrastructure on your behalf. Your data\npractitioners no longer need separate storage\nsystems for their data. And you don’t have to rely\non your cloud provider for security. The lakehouse\nhas its own robust security built into the platform.\n\n\nFor all the reasons above, the most\nconsistent advice from successful data\npractitioners is to create a “single source\nof truth” by unifying all data on a single\nplatform. With the Databricks Lakehouse\nPlatform, you can unify all your data on one\nplatform, reducing data infrastructure costs\nand compute. You don’t need excess data\ncopies and you can retire expensive\nlegacy infrastructure.\n```\n 01\n\n```\n\n-----\n\n```\nCUSTOMER STORY: GRAMMARLY\n\n### Helping 30 million people and 50,000 teams communicate more effectively\n\n```\n\nWhile its business is based on analytics, [Grammarly](http://www.grammarly.com)\n\nfor many years relied on a homegrown analytics\n\nplatform to drive its AI writing assistant to\n\nhelp users improve multiple aspects of written\n\ncommunications. As teams developed their own\n\nrequirements, data silos inevitably emerged as\n\ndifferent business areas implemented analytics\n\ntools individually.\n\n“Every team decided to solve their analytics\n\nneeds in the best way they saw fit,” said Chris\n\nLocklin, Engineering Manager, Data Platforms,\n\nat Grammarly. 
“That created challenges in\n\nconsistency and knowing which data set\n\nwas correct.”\n\nTo better scale and improve data storage and\n\nquery capabilities, Grammarly brought all its\n\nanalytical data into the Databricks Lakehouse\n\nPlatform and created a central hub for all data\n\nproducers and consumers across the company.\n\nGrammarly had several goals with the lakehouse,\n\nincluding better access control, security, ingestion\n\n\nflexibility, reducing costs and fueling collaboration. “Access control in a\n\ndistributed file system is difficult, and it only gets more complicated as\n\nyou ingest more data sources,” said Locklin. To manage access control,\n\nenable end-to-end observability and monitor data quality, Grammarly\n\nrelies on the data lineage capabilities within Unity Catalog. “Data lineage\n\nallows us to effectively monitor usage of our data and ensure it upholds\n\nthe standards we set as a data platform team,” said Locklin. “Lineage is\n\nthe last crucial piece for access control.”\n\nData analysts within Grammarly now have a consolidated interface for\n\nanalytics, which leads to a single source of truth and confidence in the\n\naccuracy and availability of all data managed by the data platform team.\n\nHaving a consistent data source across the company also resulted in\n\ngreater speed and efficiency and reduced costs. Data practitioners\n\nexperienced 110% faster querying at 10% of the cost to ingest compared\n\nto a data warehouse. Grammarly can now make its 5 billion daily events\n\navailable for analytics in under 15 minutes rather than 4 hours. Migrating\n\noff its rigid legacy infrastructure gave Grammarly the flexibility to do\n\nmore and the confidence that the platform will evolve with its needs.\n\nGrammarly is now able to sustain a flexible, scalable and highly secure\n\nanalytics platform that helps 30 million people and 50,000 teams\n\nworldwide write more effectively every day.\n\n[Read the full story here.](https://www.databricks.com/customers/grammarly)\n\n\n-----\n\n###### How to unify the data infrastructure with Databricks\n\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\nis composed of two primary parts:\n\n- The infrastructure to deploy, configure and\nmanage the platform and services\n\n\nYou can build a Databricks workspace by configuring\nsecure integrations between the Databricks platform\nand your cloud account, and then Databricks deploys\ntemporary Apache Spark™/Photon clusters using cloud\nresources in your account to process and store data\nin object storage and other integrated services you\ncontrol. Here are three steps to get started with the\nDatabricks Lakehouse Platform:\n\n**Understand the architecture**\nThe lakehouse provides a unified architecture,\nmeaning that all data is stored in the same\naccessible place. The diagram shows how data\ncomes in from sources like a customer relationship\nmanagement (CRM) system, an enterprise resource\nplanning (ERP) system, websites or unstructured\ncustomer emails.\n\n**Optimize the storage layer**\nAll data is stored in cloud storage while Databricks\nprovides tooling to assist with ingestion, such as\nAuto Loader, and we recommend [open-source](https://delta.io/)\n[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\nDelta optimized storage layer that provides the\nfoundation for storing data and tables in the\nDatabricks Lakehouse Platform. 
Having all your\ndata in the same optimized, open storage keeps\nall your use cases in the same place, thus enabling\ncollaboration and removing software tool overhead.\n\n\n\n- the customer-owned infrastructure managed in\ncollaboration by Databricks and the customer.\n\n\nThe lakehouse handles all varieties of data (structured, semi-structured, unstructured),\nas well as all velocities of data (streaming, batch or somewhere in the middle).\n\n[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n\n\n-----\n\nThe Databricks Lakehouse organizes data stored with Delta Lake in cloud object\nstorage with familiar concepts like database, tables and views. Delta Lake extends\nParquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\nscalable metadata handling. Delta Lake is fully compatible with Apache Spark APIs,\nand was developed for tight integration with Structured Streaming, allowing you to\neasily use a single copy of data for both batch and streaming operations to provide\nincremental processing at scale.This model combines many of the benefits of a data\nwarehouse with the scalability and flexibility of a data lake.\n\nTo learn more about the optimized storage layer that provides the foundation for\nstoring data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n\nThe first step in unifying your data architecture is setting up how data is to be\naccessed and used across the organization. We’ll discuss this as a series of steps:\n\n**1** Set up governance with Unity Catalog\n\n**2** Grant secure access to the data\n\n\n###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n\n[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\n**3** Capture audit logs\n\n**4** View data lineage\n\n**5** Set up data sharing\n\n\n-----\n\n**Configure unified governance**\nDatabricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\nmeans that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\nis secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\npersonas and automatically captures user-level audit logs that record access to your data.\n\nData stewards can securely grant access to a broad set of users to discover and analyze data at scale. 
These users can use a variety of\nlanguages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n\nTo set up Unity Catalog for your organization,\nyou do the following:\n\n\n**1** Configure an S3 bucket and IAM role that\nUnity Catalog can use to store and access\ndata in your AWS account.\n\n**2** Create a metastore for each region in\n\nwhich your organization operates, and\nattach workspaces to the metastore. Each\nworkspace will have the same view of the\ndata you manage in Unity Catalog.\n\n\n**3** If you have a new account, add users,\ngroups and service principals to your\nDatabricks account.\n\n**4** Next, create and grant access to\n\ncatalogs, schemas and tables.\n\n\nFor complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n\n\n-----\n\n###### How Unity Catalog works\n\n\nYou will notice that the hierarchy of primary data\nobjects in Unity Catalog flows from metastore to table:\n\n**Metastore** is the top-level container for metadata.\nEach metastore exposes a three-level namespace\n(catalog.schema.table) that organizes your data.\n\n\n**Metastore** **Catalog** **Schemas**\n\n\n**Views**\n\n**Managed**\n**Tables**\n\n\n**Catalog** is the first layer of the object hierarchy, used\nto organize your data assets.\n\n\n**Schemas** , also known as databases, are the second\nlayer of the object hierarchy and contain tables and\nviews.\n\n**Table** is the lowest level in the object hierarchy, and\ntables can be external (stored in external locations in\nyour cloud storage of choice) or managed (stored in a\nstorage container in your cloud storage that you create\n\nexpressly for Databricks). You can also create readonly **Views** from tables.\n\n\n**External**\n**tables**\n\nThe diagram below represents the file system\nhierarchy of a single storage bucket:\n\n\n-----\n\nUnity Catalog uses the identities in the Databricks\naccount to resolve users, service principals, and groups\nand to enforce permissions. To configure identities in\nthe account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\nservice principals, and groups when you create\n[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n\nUnity Catalog users, service principals, and groups\nmust also be added to workspaces to access Unity\nCatalog data in a notebook, a Databricks SQL query,\nData Explorer or a REST API command. The assignment\nof users, service principals, and groups to workspaces\nis called identity federation. All workspaces attached\nto a Unity Catalog metastore are enabled for identity\nfederation.\n\nSecurable objects in Unity Catalog are hierarchical,\nmeaning that granting a privilege on a catalog or schema\nautomatically grants the privilege to all current and\nfuture objects within the catalog or schema. 
For more\non granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\nA common scenario is to set up a schema per team\nwhere only that team has USE SCHEMA and CREATE on\nthe schema. This means that any tables produced by\nteam members can only be shared within the team.\nData Explorer uses the privileges configured by Unity\nCatalog administrators to ensure that users are only\nable to see catalogs, databases, tables and views that\nthey have permission to query.\n\n\n[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\nmany Unity Catalog features. Use Data Explorer to view\nschema details, preview sample data, and see table\ndetails and properties. Administrators can view and\nchange owners. Admins and data object owners can grant\nand revoke permissions through this interface.\n\n**Set up secure access**\nIn Unity Catalog, data is secure by default. Initially, users\nhave no access to data in a metastore. Access can\nbe granted by either a metastore admin, the owner of\nan object, or the owner of the catalog or schema that\ncontains the object. Securable objects in Unity Catalog\nare hierarchical and privileges are inherited downward.\n\nUnity Catalog’s security model is based on standard ANSI\nSQL and allows administrators to grant permissions in\ntheir existing data lake using familiar syntax, at the level of\ncatalogs, databases (schema), tables and views. Privileges\nand metastores are shared across workspaces, allowing\nadministrators to set secure permissions once against\n\ngroups synced from identity providers and know that\nend users only have access to the proper data in any\nDatabricks workspace they enter.\n\n\n-----\n\n```\nCUSTOMER STORY: BUTCHERBOX\n\n### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n\n```\n\nAs a young e-commerce company,\n\n[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n\ncustomers’ needs change, which means it is\n\nconstantly considering behavioral patterns,\n\ndistribution center efficiency, a growing list of\n\nmarketing and communication channels, and\n\norder processing systems.\n\nThe meat and seafood subscription company\n\ncollects data on hundreds of thousands\n\nof subscribers. It deployed the Databricks\n\nLakehouse Platform to gain visibility across\n\nits diverse range of data systems and enable\n\nits analytics team to securely view and\n\nexport data in the formats needed.\n\nWith so much data feeding in from different\n\nsources — from email systems to its website\n\n— the data team at ButcherBox quickly\n\ndiscovered that data silos were a significant\n\n\n“We knew we needed to migrate from our legacy data warehouse\n\nenvironment to a data analytics platform that would unify our\n\ndata and make it easily accessible for quick analysis to improve\n\nsupply chain operations, forecast demand and, most importantly,\n\nkeep up with our growing customer base,” explained Jake Stone,\n\nSenior Manager, Business Analytics, at ButcherBox.\n\nThe platform allows analysts to share builds and iterate on a\n\nproject without getting into the code. Querying a table of 18\n\nbillion rows would have been problematic with a traditional\n\nplatform. With Databricks, ButcherBox can do it in three minutes.\n\n“Delta Lake provides us with a single source of truth for all of\n\nour data,” said Stone. 
“Now our data engineers are able to build\n\nreliable data pipelines that thread the needle on key topics such\n\nas inventory management, allowing us to identify in near real-\n\ntime what our trends are so we can figure out how to effectively\n\nmove inventory.”\n\n[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\nproblem because they blocked complete\n\nvisibility into critical insights needed to make\n\nstrategic and marketing decisions.\n\n\n-----\n\n**Set up secure data sharing**\nDatabricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\nto share data with other entities regardless of their\ncomputing platforms. Delta Sharing is integrated with\nUnity Catalog. Your data must be registered with Unity\nCatalog to manage, govern, audit and track usage of the\nshared data on the Lakehouse Platform. The primary\nconcepts of Delta Sharing are shares (read-only\ncollections of tables and table partitions to be shared)\nand recipients (objects that associate an organization\nwith a credential or secure sharing identifier).\n\nAs a data provider, you generate a token and share\nit securely with the recipient. They use the token to\nauthenticate and get read access to the tables you’ve\nincluded in the shares you’ve given them access\nto. Recipients access the shared data in read-only\nformat. Whenever the data provider updates data\ntables in their own Databricks account, the updates\nappear in near real-time in the recipient’s system.\n\n\n**Capture audit logs**\nUnity Catalog captures an audit log of actions\nperformed against the metastore. To access audit\nlogs for Unity Catalog events, you must enable and\nconfigure audit logs for your account. Audit logs for\neach workspace and account-level activities are\ndelivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n\n**View data lineage**\nYou can use Unity Catalog to capture runtime data\nlineage across queries in any language executed on\na Databricks cluster or SQL warehouse. Lineage can\nbe visualized in Data Explorer in near real-time and\nretrieved with the Databricks REST API. Lineage is\naggregated across all workspaces attached to Unity\nCatalog and captured down to the column level, and\nincludes notebooks, workflows and dashboards related\nto the query. To understand the requirements and how\nto capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n\n\nUnity Catalog Metastore\n\n\nCatalog\n\n\nData providers can use Databricks audit logging to\nmonitor the creation and modification of shares,\nand recipients can monitor recipient activity on\nshares. 
Data recipients who use shared data in a\nDatabricks account can use Databricks audit logging\nto understand who is accessing which data.\n\n\n-----\n\n###### Resources:\n\n- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n\n- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n\n- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n\n- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n\n- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n\n\n###### Key Takeaways\n\n- With the Databricks Lakehouse Platform, you can\nunify and simplify all your data on one platform\nto better scale and improve data storage and\nquery capabilities\n\n- The lakehouse helps reduce data infrastructure\nand compute costs. You don’t need excess\ndata copies and can retire expensive legacy\ninfrastructure.\n\n\nLeverage Delta Lake as the open format\nstorage layer to deliver reliability, security and\nperformance on your data lake — for both\nstreaming and batch operations — replacing\ndata silos with a single home for structured,\nsemi-structured and unstructured data\n\nWith Unity Catalog you can centralize\ngovernance for all data and AI assets including\nfiles, tables, machine learning models and\ndashboards in your lakehouse on any cloud\n\nThe Databricks Lakehouse Platform is open\nsource with multicloud flexibility so that you can\nuse your data however and wherever you want —\nno vendor lock-in\n\n\n-----\n\n# 02\n```\nCHALLENGE: \u0003\n\n## Build your data architecture to support scale and performance\n\n```\n\n-----\n\n```\nCHALLENGE 02\n\n### Build your data architecture to support scale and performance\n\n```\nAs modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\nthe increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\nsuddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\ncost-effective. 
The relational databases and traditional data warehouses that met the needs of the businesses once\nupon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n\nHere are some common challenges around managing data and performance at scale:\n\n\n**Volume and velocity** — Exponentially\nincreasing data sources, and the speed at\nwhich they capture and create data.\n\n**Latency requirements** — The demands of\ndownstream applications and users have\nevolved (people want data and the results\nfrom the data faster).\n\n\n**Governance** — Cataloging, auditing, securing and\nreporting on data is burdensome at scale when\nusing old systems not built with data access\ncontrols and compliance in mind.\n\n**Multicloud** is really hard.\n\n\n**Data storage** — Storing data in the wrong\nformat is slow to access, query and is\nexpensive at scale.\n\n\n**Data format** — Supporting structured, semistructured and unstructured data formats\nis now a requirement. Most data storage\nsolutions are designed to handle only one type\nof data, requiring multiple products\nto be stitched together.\n\n```\n02\n\n```\n\n-----\n\n###### Lakehouse solves scale and performance challenges\n\n\nThe solution for growing digital companies is a unified\nand simplified platform that can instantly scale up\ncapacity to deliver more computing power on demand,\nfreeing up teams to go after the much-needed data\nand produce outputs more quickly. With a lakehouse,\nthey can replace their data silos with a single home for\ntheir structured, semi-structured and unstructured\ndata. Users and applications throughout the enterprise\nenvironment can connect to the same single copy of\nthe data to drive diverse workloads.\n\nThe lakehouse architecture is cost-efficient for\nscaling, lowering the total cost of ownership for the\noverall infrastructure by consolidating all data estate\nand use cases onto a single platform and eliminating\nredundant licensing, infrastructure and administration\ncosts. Unlike other warehouse options that can only\nscale horizontally, the Databricks Lakehouse can scale\nhorizontally and vertically based on workload demands.\n\nWith the Databricks Lakehouse, you can optimize the\ncompute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\nresearch by the Barcelona Supercomputing Center.\nAnd your data teams are more productive by focusing\non more strategic initiatives versus managing multiple\ndata solutions.\n\n```\nCUSTOMER STORY: RIVIAN\n\n### Driving into the future of electric transportation\n\n```\n```\nCUSTOMER STORY: RIVIAN\n\n```\n\nWith more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n\nday, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n\nlegacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n\ndecreased output, prevented collaboration and increased operational costs. 
Rivian chose to modernize its data\n\ninfrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n\ndownstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n\nactionable insights for different use cases, from predictive maintenance to smarter product development.\n\n“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n\nperformant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n\nWassym Bensaid, Vice President of Software Development at Rivian.\n\nFor instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n\naccelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n\nroll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n\nconnected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n\nsmart features and the control that drivers have over them. By leveraging the Databricks Lakehouse Platform, Rivian\n\nhas seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n\n[Read the full story here.](https://www.databricks.com/customers/rivian)\n\n\n-----\n\n###### How to ensure scalability and performance with Databricks\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\nscalability and performance for your data architecture\nbased on the following features and capabilities:\n\n- A simplified and cost-efficient architecture that\nincreases productivity\n\n- A platform that ensures reliable, high performing\nETL workloads — for streaming and batch data\n— while Databricks automatically manages your\ninfrastructure\n\n- The ability to ingest, transform and query all your\ndata in one place, and scale on demand with\nserverless compute\n\n- Enables real-time data access for all data,\nanalytics and AI use cases\n\n\n-----\n\nThe following section will provide a short series of\nsteps for understanding the key components of the\nDatabricks Lakehouse Platform.\n\n\n**Step 2**\n**Understand the common Delta Lake operations**\nThe Databricks Lakehouse Platform simplifies the\nentire data lifecycle, from data ingestion to monitoring\nand governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\nopen-source storage system based on the Delta\nformat providing reliability through ACID transactions\nand scalable metadata handling. Large quantities of\nraw files in blob storage can be converted to Delta to\norganize and store the data cheaply. This allows for\nflexibility of data movement while being performant\nand less expensive.\n\n\n**Step 1**\n**Get a trial Databricks account**\nStart your 14-day free trial with Databricks on\nAWS in a few easy steps.\n[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . 
During the 14day free trial, all Databricks usage is free, but Databricks\nuses compute and S3 storage resources in your cloud\nprovider account.\n\n\nand writing data can occur simultaneously without risk\nof many queries resulting in performance degradation\nor deadlock for business-critical workloads.\n\nThis means that users and applications throughout\nthe enterprise environment can connect to the same\nsingle copy of the data to drive diverse workloads, with\nall viewers guaranteed to receive the most current\nversion of the data at the time their query executes.\nWith performance features like indexing, Delta Lake\ncustomers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n[up to 48x faster.](https://www.databricks.com/customers/columbia)\n\n\n[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\nand learn how to create, manage and query tables.\nWith support for ACID transactions and schema\nenforcement, Delta Lake provides the reliability that\ntraditional data lakes lack. This enables you to scale\nreliable data insights throughout the organization and\nrun analytics and other data projects directly on your\ndata lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n\nDelta Lake transactions use log files stored alongside\ndata files to provide ACID guarantees at a table level.\nBecause the data and log files backing Delta Lake\ntables live together in cloud object storage, reading\n\n\n-----\n\nAll data in Delta Lake is stored in open Apache Parquet\nformat, allowing data to be read by any compatible\nreader. APIs are open and compatible with Apache\nSpark, so you have access to a vast open-source\necosystem to avoid data lock-in from proprietary\nformats and conversions, which have embedded and\nadded costs.\n\n###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n\n — Steve Pulec, Chief Technology Officer, YipitData\n\n[Learn more](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n**Step 3**\n**Ingest data efficiently at scale**\nWith a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\nfrom hundreds of data sources for analytics, AI and\nstreaming applications into one place.\n\nDatabricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\ndata ingestion. To ingest any file that can land in a data\nlake, Auto Loader incrementally and automatically\nprocesses new data files as they arrive in cloud storage\nin scheduled or continuous jobs. Auto Loader scales to\nsupport near real-time ingestion of millions of files\nper hour.\n\nFor pushing data in Delta Lake, the SQL command\n[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\ninto Delta Lake. COPY INTO is best used when the input\ndirectory contains thousands of files or fewer, and the\nuser prefers SQL. COPY INTO can be used over JDBC\nto push data into Delta Lake at your convenience.\n\n\n**Step 4**\n**Leverage production-ready tools**\n**to automate ETL pipelines**\nOnce the raw data is ingested, Databricks provides\na suite of production-ready tools that allow data\nprofessionals to quickly develop and deploy extract,\n\ntransform and load (ETL) pipelines. 
Databricks SQL\nallows analysts to run SQL queries against the same\ntables used in production ETL workloads, allowing for\nreal-time business intelligence at scale.\n\nWith your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n[your first extract, transform a", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf" + ] + ], + [ + "Increasing volume and velocity of data as companies mature.", + "Need for faster data access and reduced latency.", + "Challenges in data governance, including cataloging, auditing, and securing data.", + "Complexities of using multiple cloud environments.", + "Data storage issues such as slow access, poor query performance, and high costs.", + "Requirement to support structured, semi-structured, and unstructured data formats." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf" + ], + [ + "7bf67f12c5d95da350ca553480cfdc9af32b7ccbf14f70f9ce8f4706b04e96e0", + [ + [ + [ + "What is the issue with the provided document?", + "user" + ] + ] + ], + [ + [ + "Unfortunately, the document does not contain any readable content.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf" + ] + ], + [ + "The document lacks readable content." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf" + ], + [ + "fdc9f7bcc67a7cc4785f7f8df330c023da14c8d768905f0ad46af5323f28e480", + [ + [ + [ + "What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?", + "user" + ] + ] + ], + [ + [ + "“In a case study published in MIT Technology Review, profit margins increased and manufacturing time was reduced when digital-twin technology was implemented. Automobile manufacturing profit margins increased by 41% to 54% per model. The estimated average automobile manufacturing time was reduced to approximately 10 hours.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ] + ], + [ + "The increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%.", + "The reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ], + [ + "fc67f25c728d8c264f373417e09fd8ecbf4cea9ec52a0fbd9d282dae461fc310", + [ + [ + [ + "What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?", + "user" + ] + ] + ], + [ + [ + "Most of the complexity has arisen with the explosion of data volumes and data types, with organizations amassing an estimated 80% of data in unstructured and semi-structured format. As the collection of data continues to increase, 73% of the data goes unused for analytics or decision-making. 
In order to try and decrease this percentage and make more data usable, data engineering teams are responsible for building data pipelines to efficiently and reliably deliver data. But the process of building these complex data pipelines comes with a number of difficulties:\n\n• In order to get data into a data lake, data engineers are required to spend immense time hand-coding repetitive data ingestion tasks\n\n• Since data platforms continuously change, data engineers spend time building and maintaining, and then rebuilding, complex scalable infrastructure\n\n• As data pipelines become more complex, data engineers are required to find reliable tools to orchestrate these pipelines\n\n• With the increasing importance of real-time data, low latency data pipelines are required, which are even more difficult to build and maintain\n\n• Finally, with all pipelines written, data engineers need to constantly focus on performance, tuning pipelines and architectures to meet SLAs", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ] + ], + [ + "Manually hand-coding repetitive data ingestion tasks", + "Continuously maintaining and rebuilding scalable infrastructure due to changing data platforms", + "Finding reliable tools for orchestrating complex pipelines", + "Building and maintaining low-latency pipelines for real-time data", + "Constantly tuning pipeline performance to meet SLAs" + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ], + [ + "d94a588c739512457882ea9bd39758fb222c0bef855b4c2e4d75dd8bf582c14d", + [ + [ + [ + "What significant advancement in large language model development happened in 2012?", + "user" + ] + ] + ], + [ + [ + "“ 1950s–1990s\nInitial attempts are made to map hard rules around languages and follow logical steps to accomplish tasks like translating a sentence from one language to another.\n\nWhile this works sometimes, strictly defined rules only work for concrete, well-defined tasks that the system has knowledge about.\n\n1990s \nLanguage models begin evolving into statistical models and language patterns start being analyzed, but larger-scale projects are limited by computing power.\n\n2000s \nAdvancements in machine learning increase the complexity of language models, and the wide adoption of the internet sees an enormous increase in available training data.\n\n2012 \nAdvancements in deep learning architectures and larger data sets lead to the development of GPT (Generative Pre-trained Transformer).\n\n2018 \nGoogle introduces BERT (Bidirectional Encoder Representations from Transformers), which is a big leap in architecture and paves the way for future large language models.\n\n2020 \nOpenAI releases GPT-3, which becomes the largest model at 175B parameters and sets a new performance benchmark for language-related tasks.\n\n2022 \nChatGPT is launched, which turns GPT-3 and similar models into a service that is widely accessible to users through a web interface and kicks off a huge increase in public awareness of LLMs and generative AI.\n\n2023 \nOpen source LLMs begin showing increasingly impressive results with releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna. 
GPT-4 is also released, setting a new benchmark for both parameter size and performance.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf" + ] + ], + [ + "GPT (Generative Pre-trained Transformer) was developed", + "This development occurred in 2012" + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf" + ], + [ + "24f005435d7bf0df92c1685538b14427bd0ad2db5e845a5b4741c13ff438c1c3", + [ + [ + [ + "What types of metrics are tracked in game telemetry to understand player behavior and improve the game experience?", + "user" + ] + ] + ], + [ + [ + "Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine. And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.\n\nSome of the primary metrics that are typically tracked in game telemetry include:\n- Player engagement: Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.\n- Game progress: Monitor player progress through different levels and milestones in the game.\n- In-game purchases: Track the number and value of in-game purchases made by players.\n- Player demographics: Collect demographic information about players, such as age, gender, location, and device type.\n- Session length: Monitor the length of each player session, and how often players return to the game.\n- Retention: Track the percentage of players who return to the game after their first session.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf" + ] + ], + [ + "Metrics tracked include player engagement.", + "Metrics tracked include game progress.", + "Metrics tracked include in-game purchases.", + "Metrics tracked include player demographics.", + "Metrics tracked include session length.", + "Metrics tracked include retention." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf" + ], + [ + "4517d3fb5f3f1e83efba44630c78e94b18db1d8f94f780810adb3550c851891b", + [ + [ + [ + "What are the responsibilities of a Data Engineer according to the document?", + "user" + ] + ] + ], + [ + [ + "We distinguish between the following personas:\nData Governance Officer: Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. 
Not typically involved in day-to-day operations.\nData Engineer: Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.\nData Scientist: Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.\nML Engineer: Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment (CI/CD).\nBusiness Stakeholder: Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf" + ] + ], + [ + "A Data Engineer is responsible for building data pipelines.", + "A Data Engineer is responsible for processing data sets.", + "A Data Engineer is responsible for organizing data sets.", + "A Data Engineer is responsible for persisting data sets.", + "The responsibilities support machine learning and other downstream applications." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf" + ], + [ + "7fb5788f38a067b7fcc5b768a4161ef7068f40e35a83e3426e06608c5fcc954e", + [ + [ + [ + "What are the benefits of Delta Sharing in terms of data accessibility and platform compatibility?", + "user" + ] + ] + ], + [ + [ + "Delta Sharing provides an open solution to securely share live data from your lakehouse to any computing platform. Recipients don’t have to be on the Databricks platform or on the same cloud or a cloud at all. Data providers can share live data without replicating it or moving it to another system. Recipients benefit from always having access to the latest version of data and can quickly query shared data using tools of their choice for BI, analytics and machine learning, reducing time-to-value.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf" + ] + ], + [ + "Delta Sharing provides access to the latest version of the data without replication or moving the data.", + "Delta Sharing allows sharing data across different computing platforms.", + "Recipients do not need to be on Databricks.", + "Recipients do not need to be on the same cloud or any cloud.", + "Delta Sharing supports the use of recipients' preferred BI, analytics, and machine learning tools." 
+ ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf" + ], + [ + "87873effb62b308f6aafbb34e7c24aec0439d6b92eb32b330105aa9d6aca2286", + [ + [ + [ + "What are the critical needs for IT and business when it comes to implementing a customer data platform?", + "user" + ] + ] + ], + [ + [ + "Critical IT Needs\nKeep control of data access and governance; ability to architecture a customer data stack with decisions on where data is stored and where queries are executed.\nCritical Business Needs\nGet customer data access via a no-code interface to generate insights; build customer experiences and activate data within business applications.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf" + ] + ], + [ + "IT needs to maintain control over data access and governance.", + "IT needs the ability to make decisions on data storage and query execution.", + "Business needs to access customer data through a no-code interface to generate insights.", + "Business needs to build customer experiences and activate data within business applications." + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf" + ], + [ + "c345dd4a67b6fb4dd4bda2ba745bcc1182174d70624e581bbc488008fb1ff876", + [ + [ + [ + "What are the main advantages of the lakehouse architecture over traditional on-premises data warehouses and cloud data warehouses?", + "user" + ] + ] + ], + [ + [ + "“Data warehouses were designed to provide a central data repository with analytic compute capabilities to help business leaders get analytical insights, support decision-making and business intelligence (BI). Legacy on-premises data warehouse architectures are difficult to scale and make it difficult for data teams to keep up with the exponential growth of data. Oftentimes data teams publish and use a subset of well-defined data for development and testing. This slows down both innovation and time to insight. Cloud data warehouses (CDW) were an attempt to tackle the on-premises data warehouse challenges. CDWs removed the administrative burden of tasks such as setup, upgrades and backups. CDWs also improved scalability and introduced cloud’s pay-as-you-go model to reduce cost. CDWs leverage a proprietary data format to achieve cloud-scale and performance; however, this also leads to customers locked into these formats with difficult But enterprise data teams don’t need a better data warehouse. They need an innovative, simple solution that provides reliable performance, elastic scale and allows self-service to unblock analytics to access all data at a reasonable cost. The answer is the lakehouse. The lakehouse pattern represents a paradigm shift from traditional on-premises data warehouse systems that are expensive and complex to manage. 
It uses an open data management architecture that combines the flexibility, cost-efficiency and scale of data lakes with the data management and ACID semantics of data warehouses.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf" + ] + ], + [ + "Flexibility", + "Cost-efficiency", + "Open data management architecture (avoids customer lock-in)", + "Better scalability", + "Ease of management", + "Reliable performance", + "Self-service capabilities", + "Unblocks analytics", + "Supports rapid innovation", + "Access to all data at a reasonable cost" + ], + "SYNTHETIC_FROM_DOC", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf" + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": false, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "request_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "request", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"messages\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"content\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"role\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "expected_retrieved_context", + "type": "{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"content\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"doc_uri\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true}" + }, + { + "metadata": "{}", + "name": "expected_facts", + "type": "{\"type\":\"array\",\"elementType\":\"string\",\"containsNull\":true}" + }, + { + "metadata": "{}", + "name": "source_type", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "source_id", + "type": "\"string\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "from databricks.agents.evals import generate_evals_df\n", + "\n", + "# NOTE: The guidelines you provide are a free-form string. The markdown string below is the suggested formatting for the set of guidelines, however you are free\n", + "# to add your sections here. Note that this will be prompt-engineering an LLM that generates the synthetic data, so you may have to iterate on these guidelines before\n", + "# you get the results you desire.\n", + "guidelines = \"\"\"\n", + "# Task Description\n", + "The Agent is a RAG chatbot that answers questions about using Spark on Databricks. The Agent has access to a corpus of Databricks documents, and its task is to answer the user's questions by retrieving the relevant docs from the corpus and synthesizing a helpful, accurate response. The corpus covers a lot of info, but the Agent is specifically designed to interact with Databricks users who have questions about Spark. 
So questions outside of this scope are considered irrelevant.\n", + "\n", + "# User personas\n", + "- A developer who is new to the Databricks platform\n", + "- An experienced, highly technical Data Scientist or Data Engineer\n", + "\n", + "# Example questions\n", + "- what API lets me parallelize operations over rows of a delta table?\n", + "- Which cluster settings will give me the best performance when using Spark?\n", + "\n", + "# Additional Guidelines\n", + "- Questions should be succinct, and human-like\n", + "\"\"\"\n", + "\n", + "synthesized_evals_df = generate_evals_df(\n", + " docs=source_documents,\n", + " # The number of evaluations to generate for each doc.\n", + " num_evals=10,\n", + " # A optional set of guidelines that help guide the synthetic generation. This is a free-form string that will be used to prompt the generation.\n", + " # guidelines=guidelines\n", + ")\n", + "\n", + "# Write the synthetic evaluation data to the evaluation set table\n", + "spark.createDataFrame(synthesized_evals_df).write.format(\"delta\").mode(\"append\").saveAsTable(agent_storage_config.evaluation_set_uc_table)\n", + "\n", + "# Display the synthetic evaluation data\n", + "eval_set_df = spark.table(agent_storage_config.evaluation_set_uc_table)\n", + "display(eval_set_df.toPandas())" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "client": "1" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "03_create_synthetic_eval", + "widgets": {} + }, + "kernelspec": { + "display_name": "genai-cookbook-T2SdtsNM-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/autogen_agent_app_sample_code/04_create_tools.ipynb b/autogen_agent_app_sample_code/04_create_tools.ipynb index c7587b0..bc62bc2 100644 --- a/autogen_agent_app_sample_code/04_create_tools.ipynb +++ b/autogen_agent_app_sample_code/04_create_tools.ipynb @@ -1,688 +1,2013 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC ## 👉 START HERE: How to use this notebook -# MAGIC -# MAGIC # Step 2: Create tools for your Agent -# MAGIC -# MAGIC - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC **Important note:** Throughout this notebook, we indicate which cell's code you: -# MAGIC - ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality. -# MAGIC - 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent -# MAGIC -# MAGIC *Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.* - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Install Python libraries -# MAGIC -# MAGIC You do not need to modify this cell unless you need additional Python packages in your Agent. - -# COMMAND ---------- - -# MAGIC %pip install -qqqq -U -r requirements.txt -# MAGIC # Restart to load the packages into the Python environment -# MAGIC dbutils.library.restartPython() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Connect to Databricks -# MAGIC -# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. 
If this running in a Databricks Notebook, these values are already set. - -# COMMAND ---------- - -from mlflow.utils import databricks_utils as du -import os - -if not du.is_in_databricks_notebook(): - from databricks.connect import DatabricksSession - - spark = DatabricksSession.builder.getOrCreate() - os.environ["MLFLOW_TRACKING_URI"] = "databricks" - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment -# MAGIC -# MAGIC This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook. - -# COMMAND ---------- - -from cookbook.config.shared.agent_storage_location import AgentStorageConfig -from cookbook.databricks_utils import get_mlflow_experiment_url -from cookbook.config import load_serializable_config_from_yaml_file -import mlflow - -# Load the Agent's storage locations -agent_storage_config: AgentStorageConfig= load_serializable_config_from_yaml_file("./configs/agent_storage_config.yaml") - -# Show the Agent's storage locations -agent_storage_config.pretty_print() - -# set the MLflow experiment -experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name) -# If running in a local IDE, set the MLflow experiment name as an environment variable -os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name - -print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC # create tools -# MAGIC -# MAGIC - we will store all tools in the `user_tools` folder -# MAGIC - first, create a local function & test it with pytest -# MAGIC - then, deploy it as a UC tool & test it with pytest -# MAGIC - then, add the tool to the Agent - -# COMMAND ---------- - -# MAGIC %md -# MAGIC always reload the tool's code - -# COMMAND ---------- - -# MAGIC %load_ext autoreload -# MAGIC %autoreload 3 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## lets do an example of a simple, but fake tool that translates old to new SKUs. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC 1, create the python function that will become your UC function. you need to annotate the function with docstrings & type hints - these are used to create the tool's metadata in UC. - -# COMMAND ---------- - -# MAGIC %%writefile tools/sample_tool.py -# MAGIC -# MAGIC def sku_sample_translator(old_sku: str) -> str: -# MAGIC """ -# MAGIC Translates a pre-2024 SKU formatted as "OLD-XXX-YYYY" to the new SKU format "NEW-YYYY-XXX". -# MAGIC -# MAGIC Args: -# MAGIC old_sku (str): The old SKU in the format "OLD-XXX-YYYY". -# MAGIC -# MAGIC Returns: -# MAGIC str: The new SKU in the format "NEW-YYYY-XXX". -# MAGIC -# MAGIC Raises: -# MAGIC ValueError: If the SKU format is invalid, providing specific error details. 
-# MAGIC """ -# MAGIC import re -# MAGIC -# MAGIC if not isinstance(old_sku, str): -# MAGIC raise ValueError("SKU must be a string") -# MAGIC -# MAGIC # Normalize input by removing extra whitespace and converting to uppercase -# MAGIC old_sku = old_sku.strip().upper() -# MAGIC -# MAGIC # Define the regex pattern for the old SKU format -# MAGIC pattern = r"^OLD-([A-Z]{3})-(\d{4})$" -# MAGIC -# MAGIC # Match the old SKU against the pattern -# MAGIC match = re.match(pattern, old_sku) -# MAGIC if not match: -# MAGIC if not old_sku.startswith("OLD-"): -# MAGIC raise ValueError("SKU must start with 'OLD-'") -# MAGIC if not re.match(r"^OLD-[A-Z]{3}-\d{4}$", old_sku): -# MAGIC raise ValueError( -# MAGIC "SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit" -# MAGIC ) -# MAGIC raise ValueError("Invalid SKU format") -# MAGIC -# MAGIC # Extract the letter code and numeric part -# MAGIC letter_code, numeric_part = match.groups() -# MAGIC -# MAGIC # Additional validation for numeric part -# MAGIC if not (1 <= int(numeric_part) <= 9999): -# MAGIC raise ValueError("Numeric part must be between 0001 and 9999") -# MAGIC -# MAGIC # Construct the new SKU -# MAGIC new_sku = f"NEW-{numeric_part}-{letter_code}" -# MAGIC return new_sku -# MAGIC - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Now, let's import the tool and test it locally - -# COMMAND ---------- - -from tools.sample_tool import sku_sample_translator - -sku_sample_translator("OLD-XXX-1234") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC now, lets write some pyTest unit tests for the tool - these are just samples, you will need to write your own - -# COMMAND ---------- - -# MAGIC %%writefile tools/test_sample_tool.py -# MAGIC import pytest -# MAGIC from tools.sample_tool import sku_sample_translator -# MAGIC -# MAGIC -# MAGIC -# MAGIC def test_valid_sku_translation(): -# MAGIC """Test successful SKU translation with valid input.""" -# MAGIC assert sku_sample_translator("OLD-ABC-1234") == "NEW-1234-ABC" -# MAGIC assert sku_sample_translator("OLD-XYZ-0001") == "NEW-0001-XYZ" -# MAGIC assert sku_sample_translator("old-def-5678") == "NEW-5678-DEF" # Test case insensitivity -# MAGIC -# MAGIC -# MAGIC def test_whitespace_handling(): -# MAGIC """Test that the function handles extra whitespace correctly.""" -# MAGIC assert sku_sample_translator(" OLD-ABC-1234 ") == "NEW-1234-ABC" -# MAGIC assert sku_sample_translator("\tOLD-ABC-1234\n") == "NEW-1234-ABC" -# MAGIC -# MAGIC -# MAGIC def test_invalid_input_type(): -# MAGIC """Test that non-string inputs raise ValueError.""" -# MAGIC with pytest.raises(ValueError, match="SKU must be a string"): -# MAGIC sku_sample_translator(123) -# MAGIC with pytest.raises(ValueError, match="SKU must be a string"): -# MAGIC sku_sample_translator(None) -# MAGIC -# MAGIC -# MAGIC def test_invalid_prefix(): -# MAGIC """Test that SKUs not starting with 'OLD-' raise ValueError.""" -# MAGIC with pytest.raises(ValueError, match="SKU must start with 'OLD-'"): -# MAGIC sku_sample_translator("NEW-ABC-1234") -# MAGIC with pytest.raises(ValueError, match="SKU must start with 'OLD-'"): -# MAGIC sku_sample_translator("XXX-ABC-1234") -# MAGIC -# MAGIC -# MAGIC def test_invalid_format(): -# MAGIC """Test various invalid SKU formats.""" -# MAGIC invalid_skus = [ -# MAGIC "OLD-AB-1234", # Too few letters -# MAGIC "OLD-ABCD-1234", # Too many letters -# MAGIC "OLD-123-1234", # Numbers instead of letters -# MAGIC "OLD-ABC-123", # Too few digits -# MAGIC "OLD-ABC-12345", # Too many digits -# MAGIC "OLD-ABC-XXXX", # Letters 
instead of numbers -# MAGIC "OLD-A1C-1234", # Mixed letters and numbers in middle -# MAGIC ] -# MAGIC -# MAGIC for sku in invalid_skus: -# MAGIC with pytest.raises( -# MAGIC ValueError, -# MAGIC match="SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit", -# MAGIC ): -# MAGIC sku_sample_translator(sku) -# MAGIC - -# COMMAND ---------- - -# MAGIC %md -# MAGIC now, lets run the tests - -# COMMAND ---------- - -import pytest - -# Run tests from test_sku_translator.py -pytest.main(["-v", "tools/test_sample_tool.py"]) - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Now, lets deploy the tool to Unity catalog. - -# COMMAND ---------- - -from unitycatalog.ai.core.databricks import DatabricksFunctionClient -from tools.sample_tool import sku_sample_translator - -client = DatabricksFunctionClient() -CATALOG = "casaman_ssa" # Change me! -SCHEMA = "demos" # Change me if you want - -# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints -tool_uc_info = client.create_python_function(func=sku_sample_translator, catalog=CATALOG, schema=SCHEMA, replace=True) - -# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function -# Print the deployed Unity Catalog function name -print(f"Deployed Unity Catalog function name: {tool_uc_info.full_name}") - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Now, wrap it into a UCTool that will be used by our Agent. UC tool is just a Pydnatic base model that is serializable to YAML that will load the tool's metadata from UC and wrap it in a callable object. - -# COMMAND ---------- - -from cookbook.tools.uc_tool import UCTool - -# wrap the tool into a UCTool which can be passed to our Agent -translate_sku_tool = UCTool(uc_function_name=tool_uc_info.full_name) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Now, let's test the UC tool - the UCTool is a directly callable wrapper around the UC function, so it can be used just like a local function, but the output will be put into a dictionary with either the output in a 'value' key or an 'error' key if an error is raised. -# MAGIC -# MAGIC when an error happens, the UC tool will also return an instruction prompt to show the agent how to think about handling the error. this can be changed via the `error_prompt` parameter in the UCTool.. -# MAGIC - -# COMMAND ---------- - -# successful call -translate_sku_tool(old_sku="OLD-XXX-1234") - -# COMMAND ---------- - -# unsuccessful call -translate_sku_tool(old_sku="OxxLD-XXX-1234") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC now, let's convert our pytests to work with the UC tool. this requires a bit of transformation to the test code to account for the fact that the output is in a dictionary & exceptions are not raised directly. - -# COMMAND ---------- - -# MAGIC %%writefile tools/test_sample_tool_uc.py -# MAGIC import pytest -# MAGIC from cookbook.tools.uc_tool import UCTool -# MAGIC -# MAGIC # Load the function from the UCTool versus locally -# MAGIC @pytest.fixture -# MAGIC def uc_tool(): -# MAGIC """Fixture to translate a UC tool into a local function.""" -# MAGIC UC_FUNCTION_NAME = "ep.cookbook_local_test.sku_sample_translator" -# MAGIC loaded_tool = UCTool(uc_function_name=UC_FUNCTION_NAME) -# MAGIC return loaded_tool -# MAGIC -# MAGIC -# MAGIC # Note: The value will be post processed into the `value` key, so we must check the returned value there. 
-# MAGIC def test_valid_sku_translation(uc_tool): -# MAGIC """Test successful SKU translation with valid input.""" -# MAGIC assert uc_tool(old_sku="OLD-ABC-1234")["value"] == "NEW-1234-ABC" -# MAGIC assert uc_tool(old_sku="OLD-XYZ-0001")["value"] == "NEW-0001-XYZ" -# MAGIC assert ( -# MAGIC uc_tool(old_sku="old-def-5678")["value"] == "NEW-5678-DEF" -# MAGIC ) # Test case insensitivity -# MAGIC -# MAGIC -# MAGIC # Note: The value will be post processed into the `value` key, so we must check the returned value there. -# MAGIC def test_whitespace_handling(uc_tool): -# MAGIC """Test that the function handles extra whitespace correctly.""" -# MAGIC assert uc_tool(old_sku=" OLD-ABC-1234 ")["value"] == "NEW-1234-ABC" -# MAGIC assert uc_tool(old_sku="\tOLD-ABC-1234\n")["value"] == "NEW-1234-ABC" -# MAGIC -# MAGIC -# MAGIC # Note: the input validation happens BEFORE the function is called by Spark, so we will never get these exceptions from the function. -# MAGIC # Instead, we will get invalid parameters errors from Spark. -# MAGIC def test_invalid_input_type(uc_tool): -# MAGIC """Test that non-string inputs raise ValueError.""" -# MAGIC assert ( -# MAGIC uc_tool(old_sku=123)["error"]["error_message"] -# MAGIC == """Invalid parameters provided: {'old_sku': "Parameter old_sku should be of type STRING (corresponding python type ), but got "}.""" -# MAGIC ) -# MAGIC assert ( -# MAGIC uc_tool(old_sku=None)["error"]["error_message"] -# MAGIC == """Invalid parameters provided: {'old_sku': "Parameter old_sku should be of type STRING (corresponding python type ), but got "}.""" -# MAGIC ) -# MAGIC -# MAGIC -# MAGIC # Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there. -# MAGIC def test_invalid_prefix(uc_tool): -# MAGIC """Test that SKUs not starting with 'OLD-' raise ValueError.""" -# MAGIC assert ( -# MAGIC uc_tool(old_sku="NEW-ABC-1234")["error"]["error_message"] -# MAGIC == "ValueError: SKU must start with 'OLD-'" -# MAGIC ) -# MAGIC assert ( -# MAGIC uc_tool(old_sku="XXX-ABC-1234")["error"]["error_message"] -# MAGIC == "ValueError: SKU must start with 'OLD-'" -# MAGIC ) -# MAGIC -# MAGIC -# MAGIC # Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there. -# MAGIC def test_invalid_format(uc_tool): -# MAGIC """Test various invalid SKU formats.""" -# MAGIC invalid_skus = [ -# MAGIC "OLD-AB-1234", # Too few letters -# MAGIC "OLD-ABCD-1234", # Too many letters -# MAGIC "OLD-123-1234", # Numbers instead of letters -# MAGIC "OLD-ABC-123", # Too few digits -# MAGIC "OLD-ABC-12345", # Too many digits -# MAGIC "OLD-ABC-XXXX", # Letters instead of numbers -# MAGIC "OLD-A1C-1234", # Mixed letters and numbers in middle -# MAGIC ] -# MAGIC -# MAGIC expected_error = "ValueError: SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit" -# MAGIC for sku in invalid_skus: -# MAGIC assert uc_tool(old_sku=sku)["error"]["error_message"] == expected_error -# MAGIC - -# COMMAND ---------- - -import pytest - -# Run tests from test_sku_translator.py -pytest.main(["-v", "tools/test_sample_tool_uc.py"]) - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC # Now, here's another example of a tool that executes python code. - -# COMMAND ---------- - -# MAGIC %%writefile tools/code_exec.py -# MAGIC def python_exec(code: str) -> str: -# MAGIC """ -# MAGIC Executes Python code in the sandboxed environment and returns its stdout. 
The runtime is stateless and you can not read output of the previous tool executions. i.e. No such variables "rows", "observation" defined. Calling another tool inside a Python code is NOT allowed. -# MAGIC Use only standard python libraries and these python libraries: bleach, chardet, charset-normalizer, defusedxml, googleapis-common-protos, grpcio, grpcio-status, jmespath, joblib, numpy, packaging, pandas, patsy, protobuf, pyarrow, pyparsing, python-dateutil, pytz, scikit-learn, scipy, setuptools, six, threadpoolctl, webencodings, user-agents, cryptography. -# MAGIC -# MAGIC Args: -# MAGIC code (str): Python code to execute. Remember to print the final result to stdout. -# MAGIC -# MAGIC Returns: -# MAGIC str: The output of the executed code. -# MAGIC """ -# MAGIC import sys -# MAGIC from io import StringIO -# MAGIC -# MAGIC sys_stdout = sys.stdout -# MAGIC redirected_output = StringIO() -# MAGIC sys.stdout = redirected_output -# MAGIC exec(code) -# MAGIC sys.stdout = sys_stdout -# MAGIC return redirected_output.getvalue() -# MAGIC - -# COMMAND ---------- - -from tools.code_exec import python_exec - -python_exec("print('hello')") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Test it locally - -# COMMAND ---------- - -# MAGIC %%writefile tools/test_code_exec.py -# MAGIC -# MAGIC import pytest -# MAGIC from .code_exec import python_exec -# MAGIC -# MAGIC -# MAGIC def test_basic_arithmetic(): -# MAGIC code = """result = 2 + 2\nprint(result)""" -# MAGIC assert python_exec(code).strip() == "4" -# MAGIC -# MAGIC -# MAGIC def test_multiple_lines(): -# MAGIC code = "x = 5\n" "y = 3\n" "result = x * y\n" "print(result)" -# MAGIC assert python_exec(code).strip() == "15" -# MAGIC -# MAGIC -# MAGIC def test_multiple_prints(): -# MAGIC code = """print('first')\nprint('second')\nprint('third')\n""" -# MAGIC expected = "first\nsecond\nthird\n" -# MAGIC assert python_exec(code) == expected -# MAGIC -# MAGIC -# MAGIC def test_using_pandas(): -# MAGIC code = ( -# MAGIC "import pandas as pd\n" -# MAGIC "data = {'col1': [1, 2], 'col2': [3, 4]}\n" -# MAGIC "df = pd.DataFrame(data)\n" -# MAGIC "print(df.shape)" -# MAGIC ) -# MAGIC assert python_exec(code).strip() == "(2, 2)" -# MAGIC -# MAGIC -# MAGIC def test_using_numpy(): -# MAGIC code = "import numpy as np\n" "arr = np.array([1, 2, 3])\n" "print(arr.mean())" -# MAGIC assert python_exec(code).strip() == "2.0" -# MAGIC -# MAGIC -# MAGIC def test_syntax_error(): -# MAGIC code = "if True\n" " print('invalid syntax')" -# MAGIC with pytest.raises(SyntaxError): -# MAGIC python_exec(code) -# MAGIC -# MAGIC -# MAGIC def test_runtime_error(): -# MAGIC code = "x = 1 / 0\n" "print(x)" -# MAGIC with pytest.raises(ZeroDivisionError): -# MAGIC python_exec(code) -# MAGIC -# MAGIC -# MAGIC def test_undefined_variable(): -# MAGIC code = "print(undefined_variable)" -# MAGIC with pytest.raises(NameError): -# MAGIC python_exec(code) -# MAGIC -# MAGIC -# MAGIC def test_multiline_string_manipulation(): -# MAGIC code = "text = '''\n" "Hello\n" "World\n" "'''\n" "print(text.strip())" -# MAGIC expected = "Hello\nWorld" -# MAGIC assert python_exec(code).strip() == expected -# MAGIC -# MAGIC # Will not fail locally, but will fail in UC. 
-# MAGIC # def test_unauthorized_flask(): -# MAGIC # code = "from flask import Flask\n" "app = Flask(__name__)\n" "print(app)" -# MAGIC # with pytest.raises(ImportError): -# MAGIC # python_exec(code) -# MAGIC -# MAGIC -# MAGIC def test_no_print_statement(): -# MAGIC code = "x = 42\n" "y = x * 2" -# MAGIC assert python_exec(code) == "" -# MAGIC -# MAGIC -# MAGIC def test_calculation_without_print(): -# MAGIC code = "result = sum([1, 2, 3, 4, 5])\n" "squared = [x**2 for x in range(5)]" -# MAGIC assert python_exec(code) == "" -# MAGIC -# MAGIC -# MAGIC def test_function_definition_without_call(): -# MAGIC code = "def add(a, b):\n" " return a + b\n" "result = add(3, 4)" -# MAGIC assert python_exec(code) == "" -# MAGIC -# MAGIC -# MAGIC def test_class_definition_without_instantiation(): -# MAGIC code = ( -# MAGIC "class Calculator:\n" -# MAGIC " def add(self, a, b):\n" -# MAGIC " return a + b\n" -# MAGIC "calc = Calculator()" -# MAGIC ) -# MAGIC assert python_exec(code) == "" -# MAGIC - -# COMMAND ---------- - -import pytest - -# Run tests from test_sku_translator.py -pytest.main(["-v", "tools/test_code_exec.py"]) - - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Deploy to UC - -# COMMAND ---------- - -from unitycatalog.ai.core.databricks import DatabricksFunctionClient -from tools.code_exec import python_exec -from cookbook.tools.uc_tool import UCTool - -client = DatabricksFunctionClient() -CATALOG = "casaman_ssa" # Change me! -SCHEMA = "demos" # Change me if you want - -# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints -python_exec_tool_uc_info = client.create_python_function(func=python_exec, catalog=CATALOG, schema=SCHEMA, replace=True) - -# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function -# Print the deployed Unity Catalog function name -print(f"Deployed Unity Catalog function name: {python_exec_tool_uc_info.full_name}") - - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Test as UC Tool for the Agent - -# COMMAND ---------- - -from cookbook.tools.uc_tool import UCTool - - -# wrap the tool into a UCTool which can be passed to our Agent -python_exec_tool = UCTool(uc_function_name=python_exec_tool_uc_info.full_name) - -python_exec_tool(code="print('hello')") - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC New tests - -# COMMAND ---------- - -# MAGIC %%writefile tools/test_code_exec_as_uc_tool.py -# MAGIC -# MAGIC import pytest -# MAGIC from cookbook.tools.uc_tool import UCTool -# MAGIC -# MAGIC CATALOG = "ep" -# MAGIC SCHEMA = "cookbook_local_test" -# MAGIC -# MAGIC -# MAGIC @pytest.fixture -# MAGIC def python_exec(): -# MAGIC """Fixture to provide the python_exec function from UCTool.""" -# MAGIC python_exec_tool = UCTool(uc_function_name=f"{CATALOG}.{SCHEMA}.python_exec") -# MAGIC return python_exec_tool -# MAGIC -# MAGIC -# MAGIC def test_basic_arithmetic(python_exec): -# MAGIC code = """result = 2 + 2\nprint(result)""" -# MAGIC assert python_exec(code=code)["value"].strip() == "4" -# MAGIC -# MAGIC -# MAGIC def test_multiple_lines(python_exec): -# MAGIC code = "x = 5\n" "y = 3\n" "result = x * y\n" "print(result)" -# MAGIC assert python_exec(code=code)["value"].strip() == "15" -# MAGIC -# MAGIC -# MAGIC def test_multiple_prints(python_exec): -# MAGIC code = """print('first')\nprint('second')\nprint('third')\n""" -# MAGIC expected = "first\nsecond\nthird\n" -# MAGIC assert python_exec(code=code)["value"] == expected -# MAGIC -# MAGIC -# MAGIC def 
test_using_pandas(python_exec): -# MAGIC code = ( -# MAGIC "import pandas as pd\n" -# MAGIC "data = {'col1': [1, 2], 'col2': [3, 4]}\n" -# MAGIC "df = pd.DataFrame(data)\n" -# MAGIC "print(df.shape)" -# MAGIC ) -# MAGIC assert python_exec(code=code)["value"].strip() == "(2, 2)" -# MAGIC -# MAGIC -# MAGIC def test_using_numpy(python_exec): -# MAGIC code = "import numpy as np\n" "arr = np.array([1, 2, 3])\n" "print(arr.mean())" -# MAGIC assert python_exec(code=code)["value"].strip() == "2.0" -# MAGIC -# MAGIC -# MAGIC def test_syntax_error(python_exec): -# MAGIC code = "if True\n" " print('invalid syntax')" -# MAGIC result = python_exec(code=code) -# MAGIC assert "Syntax error at or near 'invalid'." in result["error"]["error_message"] -# MAGIC -# MAGIC -# MAGIC def test_runtime_error(python_exec): -# MAGIC code = "x = 1 / 0\n" "print(x)" -# MAGIC result = python_exec(code=code) -# MAGIC assert "ZeroDivisionError" in result["error"]["error_message"] -# MAGIC -# MAGIC -# MAGIC def test_undefined_variable(python_exec): -# MAGIC code = "print(undefined_variable)" -# MAGIC result = python_exec(code=code) -# MAGIC assert "NameError" in result["error"]["error_message"] -# MAGIC -# MAGIC -# MAGIC def test_multiline_string_manipulation(python_exec): -# MAGIC code = "text = '''\n" "Hello\n" "World\n" "'''\n" "print(text.strip())" -# MAGIC expected = "Hello\nWorld" -# MAGIC assert python_exec(code=code)["value"].strip() == expected -# MAGIC -# MAGIC -# MAGIC def test_unauthorized_flask(python_exec): -# MAGIC code = "from flask import Flask\n" "app = Flask(__name__)\n" "print(app)" -# MAGIC result = python_exec(code=code) -# MAGIC assert ( -# MAGIC "ModuleNotFoundError: No module named 'flask'" -# MAGIC in result["error"]["error_message"] -# MAGIC ) -# MAGIC -# MAGIC -# MAGIC def test_no_print_statement(python_exec): -# MAGIC code = "x = 42\n" "y = x * 2" -# MAGIC assert python_exec(code=code)["value"] == "" -# MAGIC -# MAGIC -# MAGIC def test_calculation_without_print(python_exec): -# MAGIC code = "result = sum([1, 2, 3, 4, 5])\n" "squared = [x**2 for x in range(5)]" -# MAGIC assert python_exec(code=code)["value"] == "" -# MAGIC -# MAGIC -# MAGIC def test_function_definition_without_call(python_exec): -# MAGIC code = "def add(a, b):\n" " return a + b\n" "result = add(3, 4)" -# MAGIC assert python_exec(code=code)["value"] == "" -# MAGIC -# MAGIC -# MAGIC def test_class_definition_without_instantiation(python_exec): -# MAGIC code = ( -# MAGIC "class Calculator:\n" -# MAGIC " def add(self, a, b):\n" -# MAGIC " return a + b\n" -# MAGIC "calc = Calculator()" -# MAGIC ) -# MAGIC assert python_exec(code=code)["value"] == "" -# MAGIC - -# COMMAND ---------- - -import pytest - -# Run tests from test_sku_translator.py -pytest.main(["-v", "tools/test_code_exec_as_uc_tool.py"]) - +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "31661828-f9bb-4fc2-a1bd-94424a27ed52", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## 👉 START HERE: How to use this notebook\n", + "\n", + "# Step 2: Create tools for your Agent\n", + "\n", + "" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d9f685a-fdb7-49a4-9e3a-a4a9e964d045", + "showTitle": false, + "tableResultSettingsMap": {}, + 
"title": "" + } + }, + "source": [ + "\n", + "**Important note:** Throughout this notebook, we indicate which cell's code you:\n", + "- ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality.\n", + "- 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent\n", + "\n", + "*Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bb4f8cc0-1797-4beb-a9f2-df21a9db79f0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Install Python libraries\n", + "\n", + "You do not need to modify this cell unless you need additional Python packages in your Agent." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6d4030e8-ae97-4351-bebd-9651d283578f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\nlangchain 0.1.20 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain 0.1.20 requires langsmith<0.2.0,>=0.1.17, but you have langsmith 0.2.2 which is incompatible.\nlangchain-community 0.0.38 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.24 which is incompatible.\nlangchain-community 0.0.38 requires langsmith<0.2.0,>=0.1.0, but you have langsmith 0.2.2 which is incompatible.\nlangchain-text-splitters 0.0.2 requires langchain-core<0.3,>=0.1.28, but you have langchain-core 0.3.24 which is incompatible.\nydata-profiling 4.5.1 requires pandas!=1.4.0,<2.1,>1.1, but you have pandas 2.2.3 which is incompatible.\nydata-profiling 4.5.1 requires pydantic<2,>=1.8.1, but you have pydantic 2.10.3 which is incompatible.\u001B[0m\u001B[31m\n\u001B[0m\u001B[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "%pip install -qqqq -U -r requirements.txt\n", + "# Restart to load the packages into the Python environment\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d61d907e-f3eb-4611-a54e-59487eef3d63", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Connect to Databricks\n", + "\n", + "If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8bd410de-0bed-41c7-9a6b-37251804a234", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.utils import databricks_utils as du\n", + "import os\n", + "\n", + "if not du.is_in_databricks_notebook():\n", + " from databricks.connect import DatabricksSession\n", + "\n", + " spark = DatabricksSession.builder.getOrCreate()\n", + " os.environ[\"MLFLOW_TRACKING_URI\"] = \"databricks\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "00a9f889-fb7d-4d49-b7c9-ce4757a41aac", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment\n", + "\n", + "This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "33fb11bd-c950-44f1-9cec-2dbed3ec7b30", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n \"uc_model_name\": \"casaman_ssa.demos.my_agent_autogen\",\n \"evaluation_set_uc_table\": \"casaman_ssa.demos.my_agent_autogen_eval_set\",\n \"mlflow_experiment_name\": \"/Users/manffred.calvosanchez@databricks.com/my_agent_autogen_mlflow_experiment\",\n \"class_path\": \"cookbook.config.shared.agent_storage_location.AgentStorageConfig\"\n}\nView the MLflow Experiment `/Users/manffred.calvosanchez@databricks.com/my_agent_autogen_mlflow_experiment` at https://adb-984752964297111.11.azuredatabricks.net/ml/experiments/2822477370659093\n" + ] + } + ], + "source": [ + "from cookbook.config.shared.agent_storage_location import AgentStorageConfig\n", + "from cookbook.databricks_utils import get_mlflow_experiment_url\n", + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "import mlflow \n", + "\n", + "# Load the Agent's storage locations\n", + "agent_storage_config: AgentStorageConfig= load_serializable_config_from_yaml_file(\"./configs/agent_storage_config.yaml\")\n", + "\n", + "# Show the Agent's storage locations\n", + "agent_storage_config.pretty_print()\n", + "\n", + "# set the MLflow experiment\n", + "experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name)\n", + "# If running in a local IDE, set the MLflow experiment name as an environment variable\n", + "os.environ[\"MLFLOW_EXPERIMENT_NAME\"] = agent_storage_config.mlflow_experiment_name\n", + "\n", + "print(f\"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f4e95ef2-6ef7-4ce3-a263-02c9ad7da33d", + "showTitle": false, + "tableResultSettingsMap": 
{}, + "title": "" + } + }, + "source": [ + "# create tools\n", + "\n", + "- we will store all tools in the `user_tools` folder\n", + "- first, create a local function & test it with pytest\n", + "- then, deploy it as a UC tool & test it with pytest\n", + "- then, add the tool to the Agent " + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8a4a5c00-8c0e-4b96-b931-f6aed98b8543", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "always reload the tool's code" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "88d01e38-702d-41b2-aa4b-03cc5771e903", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "732377ed-6c9b-4900-88f6-414be88bfd8f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## lets do an example of a simple, but fake tool that translates old to new SKUs." + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "7e2b84b0-5bd4-4e90-a789-507e8be27313", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "1, create the python function that will become your UC function. you need to annotate the function with docstrings & type hints - these are used to create the tool's metadata in UC." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a3c191bd-4723-41c7-a33f-737f22776bcd", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting tools/sample_tool.py\n" + ] + } + ], + "source": [ + "%%writefile tools/sample_tool.py\n", + "\n", + "def sku_sample_translator(old_sku: str) -> str:\n", + " \"\"\"\n", + " Translates a pre-2024 SKU formatted as \"OLD-XXX-YYYY\" to the new SKU format \"NEW-YYYY-XXX\".\n", + "\n", + " Args:\n", + " old_sku (str): The old SKU in the format \"OLD-XXX-YYYY\".\n", + "\n", + " Returns:\n", + " str: The new SKU in the format \"NEW-YYYY-XXX\".\n", + "\n", + " Raises:\n", + " ValueError: If the SKU format is invalid, providing specific error details.\n", + " \"\"\"\n", + " import re\n", + "\n", + " if not isinstance(old_sku, str):\n", + " raise ValueError(\"SKU must be a string\")\n", + "\n", + " # Normalize input by removing extra whitespace and converting to uppercase\n", + " old_sku = old_sku.strip().upper()\n", + "\n", + " # Define the regex pattern for the old SKU format\n", + " pattern = r\"^OLD-([A-Z]{3})-(\\d{4})$\"\n", + "\n", + " # Match the old SKU against the pattern\n", + " match = re.match(pattern, old_sku)\n", + " if not match:\n", + " if not old_sku.startswith(\"OLD-\"):\n", + " raise ValueError(\"SKU must start with 'OLD-'\")\n", + " if not re.match(r\"^OLD-[A-Z]{3}-\\d{4}$\", old_sku):\n", + " raise ValueError(\n", + " \"SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit\"\n", + " )\n", + " raise ValueError(\"Invalid SKU format\")\n", + "\n", + " # Extract the letter code and numeric part\n", + " letter_code, numeric_part = match.groups()\n", + "\n", + " # Additional validation for numeric part\n", + " if not (1 <= int(numeric_part) <= 9999):\n", + " raise ValueError(\"Numeric part must be between 0001 and 9999\")\n", + "\n", + " # Construct the new SKU\n", + " new_sku = f\"NEW-{numeric_part}-{letter_code}\"\n", + " return new_sku\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "92e79854-26c2-4f75-bc15-0aee5e0b13fd", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Now, let's import the tool and test it locally" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "da020a5e-12ad-4c1f-8921-ce49c736d48c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'NEW-1234-XXX'" + ] + }, + "execution_count": 5, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from tools.sample_tool import sku_sample_translator\n", + "\n", + "sku_sample_translator(\"OLD-XXX-1234\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3910cbdc-4943-455f-b6be-800e55094210", + "showTitle": false, + 
"tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "now, lets write some pyTest unit tests for the tool - these are just samples, you will need to write your own" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "238d0fc6-97d5-4c8b-bf2f-679441b7b7cc", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting tools/test_sample_tool.py\n" + ] + } + ], + "source": [ + "%%writefile tools/test_sample_tool.py\n", + "import pytest\n", + "from tools.sample_tool import sku_sample_translator\n", + "\n", + "\n", + "\n", + "def test_valid_sku_translation():\n", + " \"\"\"Test successful SKU translation with valid input.\"\"\"\n", + " assert sku_sample_translator(\"OLD-ABC-1234\") == \"NEW-1234-ABC\"\n", + " assert sku_sample_translator(\"OLD-XYZ-0001\") == \"NEW-0001-XYZ\"\n", + " assert sku_sample_translator(\"old-def-5678\") == \"NEW-5678-DEF\" # Test case insensitivity\n", + "\n", + "\n", + "def test_whitespace_handling():\n", + " \"\"\"Test that the function handles extra whitespace correctly.\"\"\"\n", + " assert sku_sample_translator(\" OLD-ABC-1234 \") == \"NEW-1234-ABC\"\n", + " assert sku_sample_translator(\"\\tOLD-ABC-1234\\n\") == \"NEW-1234-ABC\"\n", + "\n", + "\n", + "def test_invalid_input_type():\n", + " \"\"\"Test that non-string inputs raise ValueError.\"\"\"\n", + " with pytest.raises(ValueError, match=\"SKU must be a string\"):\n", + " sku_sample_translator(123)\n", + " with pytest.raises(ValueError, match=\"SKU must be a string\"):\n", + " sku_sample_translator(None)\n", + "\n", + "\n", + "def test_invalid_prefix():\n", + " \"\"\"Test that SKUs not starting with 'OLD-' raise ValueError.\"\"\"\n", + " with pytest.raises(ValueError, match=\"SKU must start with 'OLD-'\"):\n", + " sku_sample_translator(\"NEW-ABC-1234\")\n", + " with pytest.raises(ValueError, match=\"SKU must start with 'OLD-'\"):\n", + " sku_sample_translator(\"XXX-ABC-1234\")\n", + "\n", + "\n", + "def test_invalid_format():\n", + " \"\"\"Test various invalid SKU formats.\"\"\"\n", + " invalid_skus = [\n", + " \"OLD-AB-1234\", # Too few letters\n", + " \"OLD-ABCD-1234\", # Too many letters\n", + " \"OLD-123-1234\", # Numbers instead of letters\n", + " \"OLD-ABC-123\", # Too few digits\n", + " \"OLD-ABC-12345\", # Too many digits\n", + " \"OLD-ABC-XXXX\", # Letters instead of numbers\n", + " \"OLD-A1C-1234\", # Mixed letters and numbers in middle\n", + " ]\n", + "\n", + " for sku in invalid_skus:\n", + " with pytest.raises(\n", + " ValueError,\n", + " match=\"SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit\",\n", + " ):\n", + " sku_sample_translator(sku)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "332d2f71-1afa-45cc-9b49-6410f0e06e94", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "now, lets run the tests" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "04d3fc11-af8a-4566-892b-5cfb261c5d08", + 
"showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[1m============================= test session starts ==============================\u001B[0m\nplatform linux -- Python 3.11.0rc1, pytest-8.3.4, pluggy-1.5.0 -- /local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/bin/python\ncachedir: .pytest_cache\nrootdir: /Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code\nconfigfile: pyproject.toml\nplugins: typeguard-2.13.3, anyio-3.5.0\n\u001B[1mcollecting ... \u001B[0mcollected 0 items / 1 error\n\n==================================== ERRORS ====================================\n\u001B[31m\u001B[1m__________________ ERROR collecting tools/test_sample_tool.py __________________\u001B[0m\n\u001B[1m\u001B[31m/usr/lib/python3.11/importlib/__init__.py\u001B[0m:126: in import_module\n \u001B[0m\u001B[94mreturn\u001B[39;49;00m _bootstrap._gcd_import(name[level:], package, level)\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31m\u001B[0m:1206: in _gcd_import\n \u001B[0m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31m\u001B[0m:1178: in _find_and_load\n \u001B[0m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31m\u001B[0m:1149: in _find_and_load_unlocked\n \u001B[0m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31m\u001B[0m:690: in _load_unlocked\n \u001B[0m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/_pytest/assertion/rewrite.py\u001B[0m:163: in exec_module\n \u001B[0mok = try_makedirs(cache_dir)\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/_pytest/assertion/rewrite.py\u001B[0m:1181: in try_makedirs\n \u001B[0mos.makedirs(cache_dir, exist_ok=\u001B[94mTrue\u001B[39;49;00m)\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31m\u001B[0m:225: in makedirs\n \u001B[0m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[04m\u001B[91m?\u001B[39;49;00m\u001B[90m\u001B[39;49;00m\n\u001B[1m\u001B[31mE OSError: [Errno 95] Operation not supported: '/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/tools/__pycache__'\u001B[0m\n\u001B[33m=============================== warnings summary ===============================\u001B[0m\n../../../../../local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/_pytest/config/__init__.py:1277\n /local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/_pytest/config/__init__.py:1277: PytestAssertRewriteWarning: Module already imported so cannot be rewritten: anyio\n self._mark_plugins_for_rewrite(hook)\n\n-- Docs: https://docs.pytest.org/en/stable/how-to/capture-warnings.html\n\u001B[36m\u001B[1m=========================== short test summary info 
============================\u001B[0m\n\u001B[31mERROR\u001B[0m tools/test_sample_tool.py - OSError: [Errno 95] Operation not supported: '/Workspace/Users/manffred.cal...\n!!!!!!!!!!!!!!!!!!!! Interrupted: 1 error during collection !!!!!!!!!!!!!!!!!!!!\n\u001B[31m========================= \u001B[33m1 warning\u001B[0m, \u001B[31m\u001B[1m1 error\u001B[0m\u001B[31m in 4.63s\u001B[0m\u001B[31m ==========================\u001B[0m\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "" + ] + }, + "execution_count": 7, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "import pytest\n", + "\n", + "# Run tests from test_sku_translator.py\n", + "pytest.main([\"-v\", \"tools/test_sample_tool.py\"])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4cf0dbac-305d-4d93-a0e0-b9cac47773cb", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Now, lets deploy the tool to Unity catalog." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6c1da4b4-2b2c-4a77-8f81-dd885375a9c4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Deployed Unity Catalog function name: casaman_ssa.demos.sku_sample_translator\n" + ] + } + ], + "source": [ + "from unitycatalog.ai.core.databricks import DatabricksFunctionClient\n", + "from tools.sample_tool import sku_sample_translator\n", + "\n", + "client = DatabricksFunctionClient()\n", + "CATALOG = \"casaman_ssa\" # Change me!\n", + "SCHEMA = \"demos\" # Change me if you want\n", + "\n", + "# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\n", + "tool_uc_info = client.create_python_function(func=sku_sample_translator, catalog=CATALOG, schema=SCHEMA, replace=True)\n", + "\n", + "# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\n", + "# Print the deployed Unity Catalog function name\n", + "print(f\"Deployed Unity Catalog function name: {tool_uc_info.full_name}\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ac730f0d-3591-48dd-b347-bf48a1e7cef1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Now, wrap it into a UCTool that will be used by our Agent. 
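(Optionally, before wrapping it, you can sanity-check the deployed UC function directly through the same client. A minimal sketch, assuming the `execute_function` API of unitycatalog-ai's `DatabricksFunctionClient`; check the API of your installed version, and note that `client` and `tool_uc_info` come from the deployment cell above.)

# Hedged sketch: call the deployed UC function once to confirm it behaves as expected
# before wiring it into the Agent.
result = client.execute_function(
    function_name=tool_uc_info.full_name,
    parameters={"old_sku": "OLD-ABC-1234"},
)
print(result.value)  # expected: "NEW-1234-ABC"
print(result.error)  # expected: None on a successful call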
UC tool is just a Pydnatic base model that is serializable to YAML that will load the tool's metadata from UC and wrap it in a callable object." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fe421521-5e87-4042-8c98-3d2c67511893", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + } + ], + "source": [ + "from cookbook.tools.uc_tool import UCTool\n", + "\n", + "# wrap the tool into a UCTool which can be passed to our Agent\n", + "translate_sku_tool = UCTool(uc_function_name=tool_uc_info.full_name)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "eb38a3a5-4977-4e6d-a613-09821f5f2d49", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Now, let's test the UC tool - the UCTool is a directly callable wrapper around the UC function, so it can be used just like a local function, but the output will be put into a dictionary with either the output in a 'value' key or an 'error' key if an error is raised.\n", + "\n", + "when an error happens, the UC tool will also return an instruction prompt to show the agent how to think about handling the error. 
this can be changed via the `error_prompt` parameter in the UCTool..\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "904224f2-9380-4f42-b7b4-2fdcaa5cb9c5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'error': None, 'format': 'SCALAR', 'value': 'NEW-1234-XXX', 'truncated': None}" + ] + }, + "execution_count": 12, + "metadata": {}, + "output_type": "execute_result" + }, + { + "output_type": "display_data", + "data": { + "application/databricks.mlflow.trace": "\"tr-7dce8fb8b61140f5a9addd80bff15427\"", + "text/plain": [ + "Trace(request_id=tr-7dce8fb8b61140f5a9addd80bff15427)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# successful call\n", + "translate_sku_tool(old_sku=\"OLD-XXX-1234\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "35feffac-969a-4388-bf37-f462871cc3f7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'error': 'Failed to execute function with command `spark.sql(SELECT `casaman_ssa`.`demos`.`sku_sample_translator`(:old_sku), args={\\'old_sku\\': \\'OxxLD-XXX-1234\\'})`\\nError: [UDF_USER_CODE_ERROR.GENERIC] Execution of function casaman_ssa.demos.sku_sample_translator(OxxLD-XXX-1234) failed. \\n== Error ==\\nValueError: SKU must start with \\'OLD-\\'\\n== Stacktrace ==\\n File \"\", line 17, in main\\n raise ValueError(\"SKU must start with \\'OLD-\\'\") SQLSTATE: 39000\\n== SQL (line 1, position 8) ==\\nSELECT `casaman_ssa`.`demos`.`sku_sample_translator`(:old_sku)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n',\n", + " 'format': None,\n", + " 'value': None,\n", + " 'truncated': None}" + ] + }, + "execution_count": 13, + "metadata": {}, + "output_type": "execute_result" + }, + { + "output_type": "display_data", + "data": { + "application/databricks.mlflow.trace": "\"tr-95ba10558a75408ebffe134315cb7179\"", + "text/plain": [ + "Trace(request_id=tr-95ba10558a75408ebffe134315cb7179)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "# unsuccessful call\n", + "translate_sku_tool(old_sku=\"OxxLD-XXX-1234\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "876b5a6e-6f37-46a5-9ac7-badc28ca6537", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "now, let's convert our pytests to work with the UC tool. this requires a bit of transformation to the test code to account for the fact that the output is in a dictionary & exceptions are not raised directly." 
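Because the UC tool reports failures in the returned dictionary rather than raising, calling code (or the agent loop) has to branch on that dictionary. The snippet below is a minimal sketch of that pattern and is not one of the notebook's committed cells; it assumes the `translate_sku_tool` wrapper created above and the 'value'/'error' keys shown in the outputs.

```python
# Minimal sketch (assumption: `translate_sku_tool` is the UCTool wrapper created above,
# and results carry the 'value' / 'error' keys shown in the cell outputs).
result = translate_sku_tool(old_sku="OLD-XXX-1234")

if result.get("error"):
    # The wrapper surfaces the failure (plus the guidance configured via `error_prompt`)
    # as data instead of raising, so the caller decides whether to retry or report it.
    print("Tool call failed:", result["error"])
else:
    print("Translated SKU:", result["value"])
```

Returning errors as data keeps the agent loop in control of retries instead of letting tool exceptions bubble up, which is also why the pytest conversion below inspects the 'error' key rather than using `pytest.raises`.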
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0d44ae8d-7fca-4725-8bbc-fed8f60092a7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting tools/test_sample_tool_uc.py\n" + ] + } + ], + "source": [ + "%%writefile tools/test_sample_tool_uc.py\n", + "import pytest\n", + "from cookbook.tools.uc_tool import UCTool\n", + "\n", + "# Load the function from the UCTool versus locally\n", + "@pytest.fixture\n", + "def uc_tool():\n", + " \"\"\"Fixture to translate a UC tool into a local function.\"\"\"\n", + " UC_FUNCTION_NAME = \"ep.cookbook_local_test.sku_sample_translator\"\n", + " loaded_tool = UCTool(uc_function_name=UC_FUNCTION_NAME)\n", + " return loaded_tool\n", + "\n", + "\n", + "# Note: The value will be post processed into the `value` key, so we must check the returned value there.\n", + "def test_valid_sku_translation(uc_tool):\n", + " \"\"\"Test successful SKU translation with valid input.\"\"\"\n", + " assert uc_tool(old_sku=\"OLD-ABC-1234\")[\"value\"] == \"NEW-1234-ABC\"\n", + " assert uc_tool(old_sku=\"OLD-XYZ-0001\")[\"value\"] == \"NEW-0001-XYZ\"\n", + " assert (\n", + " uc_tool(old_sku=\"old-def-5678\")[\"value\"] == \"NEW-5678-DEF\"\n", + " ) # Test case insensitivity\n", + "\n", + "\n", + "# Note: The value will be post processed into the `value` key, so we must check the returned value there.\n", + "def test_whitespace_handling(uc_tool):\n", + " \"\"\"Test that the function handles extra whitespace correctly.\"\"\"\n", + " assert uc_tool(old_sku=\" OLD-ABC-1234 \")[\"value\"] == \"NEW-1234-ABC\"\n", + " assert uc_tool(old_sku=\"\\tOLD-ABC-1234\\n\")[\"value\"] == \"NEW-1234-ABC\"\n", + "\n", + "\n", + "# Note: the input validation happens BEFORE the function is called by Spark, so we will never get these exceptions from the function.\n", + "# Instead, we will get invalid parameters errors from Spark.\n", + "def test_invalid_input_type(uc_tool):\n", + " \"\"\"Test that non-string inputs raise ValueError.\"\"\"\n", + " assert (\n", + " uc_tool(old_sku=123)[\"error\"][\"error_message\"]\n", + " == \"\"\"Invalid parameters provided: {'old_sku': \"Parameter old_sku should be of type STRING (corresponding python type ), but got \"}.\"\"\"\n", + " )\n", + " assert (\n", + " uc_tool(old_sku=None)[\"error\"][\"error_message\"]\n", + " == \"\"\"Invalid parameters provided: {'old_sku': \"Parameter old_sku should be of type STRING (corresponding python type ), but got \"}.\"\"\"\n", + " )\n", + "\n", + "\n", + "# Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there.\n", + "def test_invalid_prefix(uc_tool):\n", + " \"\"\"Test that SKUs not starting with 'OLD-' raise ValueError.\"\"\"\n", + " assert (\n", + " uc_tool(old_sku=\"NEW-ABC-1234\")[\"error\"][\"error_message\"]\n", + " == \"ValueError: SKU must start with 'OLD-'\"\n", + " )\n", + " assert (\n", + " uc_tool(old_sku=\"XXX-ABC-1234\")[\"error\"][\"error_message\"]\n", + " == \"ValueError: SKU must start with 'OLD-'\"\n", + " )\n", + "\n", + "\n", + "# Note: The errors will be post processed into the `error_message` key inside the `error` top level key, so we must check for exceptions there.\n", + "def test_invalid_format(uc_tool):\n", + 
" \"\"\"Test various invalid SKU formats.\"\"\"\n", + " invalid_skus = [\n", + " \"OLD-AB-1234\", # Too few letters\n", + " \"OLD-ABCD-1234\", # Too many letters\n", + " \"OLD-123-1234\", # Numbers instead of letters\n", + " \"OLD-ABC-123\", # Too few digits\n", + " \"OLD-ABC-12345\", # Too many digits\n", + " \"OLD-ABC-XXXX\", # Letters instead of numbers\n", + " \"OLD-A1C-1234\", # Mixed letters and numbers in middle\n", + " ]\n", + "\n", + " expected_error = \"ValueError: SKU format must be 'OLD-XXX-YYYY' where X is a letter and Y is a digit\"\n", + " for sku in invalid_skus:\n", + " assert uc_tool(old_sku=sku)[\"error\"][\"error_message\"] == expected_error\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "89b22629-259b-425a-b02f-88336abe7657", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m\n", + "\u001B[0;31mAnalysisException\u001B[0m Traceback (most recent call last)\n", + "File \u001B[0;32m, line 9\u001B[0m\n", + "\u001B[1;32m 6\u001B[0m SCHEMA \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcookbook_local_test\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# Change me if you want\u001B[39;00m\n", + "\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\u001B[39;00m\n", + "\u001B[0;32m----> 9\u001B[0m tool_uc_info \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mcreate_python_function(func\u001B[38;5;241m=\u001B[39msku_sample_translator, catalog\u001B[38;5;241m=\u001B[39mCATALOG, schema\u001B[38;5;241m=\u001B[39mSCHEMA, replace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n", + "\u001B[1;32m 11\u001B[0m \u001B[38;5;66;03m# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\u001B[39;00m\n", + "\u001B[1;32m 12\u001B[0m \u001B[38;5;66;03m# Print the deployed Unity Catalog function name\u001B[39;00m\n", + "\u001B[1;32m 13\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeployed Unity Catalog function name: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mtool_uc_info\u001B[38;5;241m.\u001B[39mfull_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n", + "\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session 
related error in the result, we should directly return the result\u001B[39;00m\n", + "\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n", + "\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n", + "\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 164\u001B[0m ):\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:450\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_python_function\u001B[0;34m(self, func, catalog, schema, replace)\u001B[0m\n", + "\u001B[1;32m 446\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe provided function is not callable.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", + "\u001B[1;32m 448\u001B[0m sql_function_body \u001B[38;5;241m=\u001B[39m generate_sql_function_body(func, catalog, schema, replace)\n", + "\u001B[0;32m--> 450\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcreate_function(sql_function_body\u001B[38;5;241m=\u001B[39msql_function_body)\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n", + "\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n", + "\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n", + "\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n", + "\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 164\u001B[0m ):\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:296\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_function\u001B[0;34m(self, sql_function_body)\u001B[0m\n", + "\u001B[1;32m 294\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_default_spark_session()\n", + "\u001B[1;32m 295\u001B[0m \u001B[38;5;66;03m# TODO: add timeout\u001B[39;00m\n", + "\u001B[0;32m--> 296\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspark\u001B[38;5;241m.\u001B[39msql(sql_function_body)\n", + "\u001B[1;32m 
297\u001B[0m created_function_info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_function(extract_function_name(sql_function_body))\n", + "\u001B[1;32m 298\u001B[0m check_function_info(created_function_info)\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47\u001B[0m, in \u001B[0;36m_wrap_function..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 45\u001B[0m start \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mperf_counter()\n", + "\u001B[1;32m 46\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m---> 47\u001B[0m res \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 48\u001B[0m logger\u001B[38;5;241m.\u001B[39mlog_success(\n", + "\u001B[1;32m 49\u001B[0m module_name, class_name, function_name, time\u001B[38;5;241m.\u001B[39mperf_counter() \u001B[38;5;241m-\u001B[39m start, signature\n", + "\u001B[1;32m 50\u001B[0m )\n", + "\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/sql/session.py:1825\u001B[0m, in \u001B[0;36mSparkSession.sql\u001B[0;34m(self, sqlQuery, args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 1820\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n", + "\u001B[1;32m 1821\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m PySparkTypeError(\n", + "\u001B[1;32m 1822\u001B[0m error_class\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mINVALID_TYPE\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n", + "\u001B[1;32m 1823\u001B[0m message_parameters\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_name\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124margs\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_type\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;28mtype\u001B[39m(args)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m},\n", + "\u001B[1;32m 1824\u001B[0m )\n", + "\u001B[0;32m-> 1825\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m DataFrame(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jsparkSession\u001B[38;5;241m.\u001B[39msql(sqlQuery, litArgs), \u001B[38;5;28mself\u001B[39m)\n", + "\u001B[1;32m 1826\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n", + "\u001B[1;32m 1827\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(kwargs) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355\u001B[0m, in \u001B[0;36mJavaMember.__call__\u001B[0;34m(self, *args)\u001B[0m\n", + "\u001B[1;32m 1349\u001B[0m command \u001B[38;5;241m=\u001B[39m proto\u001B[38;5;241m.\u001B[39mCALL_COMMAND_NAME \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1350\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcommand_header \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1351\u001B[0m args_command \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1352\u001B[0m proto\u001B[38;5;241m.\u001B[39mEND_COMMAND_PART\n", + "\u001B[1;32m 1354\u001B[0m answer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client\u001B[38;5;241m.\u001B[39msend_command(command)\n", + "\u001B[0;32m-> 1355\u001B[0m return_value \u001B[38;5;241m=\u001B[39m get_return_value(\n", + "\u001B[1;32m 1356\u001B[0m answer, 
\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtarget_id, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mname)\n", + "\u001B[1;32m 1358\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m temp_arg \u001B[38;5;129;01min\u001B[39;00m temp_args:\n", + "\u001B[1;32m 1359\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mhasattr\u001B[39m(temp_arg, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_detach\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/errors/exceptions/captured.py:261\u001B[0m, in \u001B[0;36mcapture_sql_exception..deco\u001B[0;34m(*a, **kw)\u001B[0m\n", + "\u001B[1;32m 257\u001B[0m converted \u001B[38;5;241m=\u001B[39m convert_exception(e\u001B[38;5;241m.\u001B[39mjava_exception)\n", + "\u001B[1;32m 258\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(converted, UnknownException):\n", + "\u001B[1;32m 259\u001B[0m \u001B[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001B[39;00m\n", + "\u001B[1;32m 260\u001B[0m \u001B[38;5;66;03m# JVM exception message.\u001B[39;00m\n", + "\u001B[0;32m--> 261\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m converted \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n", + "\u001B[1;32m 262\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n", + "\u001B[1;32m 263\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m\n", + "\n", + "\u001B[0;31mAnalysisException\u001B[0m: [SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\n", + "If you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\n", + "To tolerate the error on drop use DROP SCHEMA IF EXISTS. SQLSTATE: 42704" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "datasetInfos": [], + "jupyterProps": { + "ename": "AnalysisException", + "evalue": "[SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\nTo tolerate the error on drop use DROP SCHEMA IF EXISTS. 
SQLSTATE: 42704" + }, + "metadata": { + "errorSummary": "Command skipped" + }, + "removedWidgets": [], + "sqlProps": { + "errorClass": "SCHEMA_NOT_FOUND", + "pysparkCallSite": null, + "pysparkFragment": null, + "sqlState": "42704", + "stackTrace": null, + "startIndex": null, + "stopIndex": null + }, + "stackFrames": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mAnalysisException\u001B[0m Traceback (most recent call last)", + "File \u001B[0;32m, line 9\u001B[0m\n\u001B[1;32m 6\u001B[0m SCHEMA \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcookbook_local_test\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# Change me if you want\u001B[39;00m\n\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\u001B[39;00m\n\u001B[0;32m----> 9\u001B[0m tool_uc_info \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mcreate_python_function(func\u001B[38;5;241m=\u001B[39msku_sample_translator, catalog\u001B[38;5;241m=\u001B[39mCATALOG, schema\u001B[38;5;241m=\u001B[39mSCHEMA, replace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[1;32m 11\u001B[0m \u001B[38;5;66;03m# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\u001B[39;00m\n\u001B[1;32m 12\u001B[0m \u001B[38;5;66;03m# Print the deployed Unity Catalog function name\u001B[39;00m\n\u001B[1;32m 13\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeployed Unity Catalog function name: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mtool_uc_info\u001B[38;5;241m.\u001B[39mfull_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 164\u001B[0m ):\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:450\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_python_function\u001B[0;34m(self, func, catalog, schema, 
replace)\u001B[0m\n\u001B[1;32m 446\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe provided function is not callable.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 448\u001B[0m sql_function_body \u001B[38;5;241m=\u001B[39m generate_sql_function_body(func, catalog, schema, replace)\n\u001B[0;32m--> 450\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcreate_function(sql_function_body\u001B[38;5;241m=\u001B[39msql_function_body)\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 164\u001B[0m ):\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:296\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_function\u001B[0;34m(self, sql_function_body)\u001B[0m\n\u001B[1;32m 294\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_default_spark_session()\n\u001B[1;32m 295\u001B[0m \u001B[38;5;66;03m# TODO: add timeout\u001B[39;00m\n\u001B[0;32m--> 296\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspark\u001B[38;5;241m.\u001B[39msql(sql_function_body)\n\u001B[1;32m 297\u001B[0m created_function_info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_function(extract_function_name(sql_function_body))\n\u001B[1;32m 298\u001B[0m check_function_info(created_function_info)\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47\u001B[0m, in \u001B[0;36m_wrap_function..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 45\u001B[0m start \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mperf_counter()\n\u001B[1;32m 46\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m---> 47\u001B[0m res \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 48\u001B[0m logger\u001B[38;5;241m.\u001B[39mlog_success(\n\u001B[1;32m 49\u001B[0m module_name, class_name, function_name, time\u001B[38;5;241m.\u001B[39mperf_counter() \u001B[38;5;241m-\u001B[39m start, 
signature\n\u001B[1;32m 50\u001B[0m )\n\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/sql/session.py:1825\u001B[0m, in \u001B[0;36mSparkSession.sql\u001B[0;34m(self, sqlQuery, args, **kwargs)\u001B[0m\n\u001B[1;32m 1820\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 1821\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m PySparkTypeError(\n\u001B[1;32m 1822\u001B[0m error_class\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mINVALID_TYPE\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 1823\u001B[0m message_parameters\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_name\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124margs\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_type\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;28mtype\u001B[39m(args)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m},\n\u001B[1;32m 1824\u001B[0m )\n\u001B[0;32m-> 1825\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m DataFrame(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jsparkSession\u001B[38;5;241m.\u001B[39msql(sqlQuery, litArgs), \u001B[38;5;28mself\u001B[39m)\n\u001B[1;32m 1826\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 1827\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(kwargs) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", + "File \u001B[0;32m/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355\u001B[0m, in \u001B[0;36mJavaMember.__call__\u001B[0;34m(self, *args)\u001B[0m\n\u001B[1;32m 1349\u001B[0m command \u001B[38;5;241m=\u001B[39m proto\u001B[38;5;241m.\u001B[39mCALL_COMMAND_NAME \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1350\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcommand_header \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1351\u001B[0m args_command \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1352\u001B[0m proto\u001B[38;5;241m.\u001B[39mEND_COMMAND_PART\n\u001B[1;32m 1354\u001B[0m answer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client\u001B[38;5;241m.\u001B[39msend_command(command)\n\u001B[0;32m-> 1355\u001B[0m return_value \u001B[38;5;241m=\u001B[39m get_return_value(\n\u001B[1;32m 1356\u001B[0m answer, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtarget_id, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mname)\n\u001B[1;32m 1358\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m temp_arg \u001B[38;5;129;01min\u001B[39;00m temp_args:\n\u001B[1;32m 1359\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mhasattr\u001B[39m(temp_arg, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_detach\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/errors/exceptions/captured.py:261\u001B[0m, in \u001B[0;36mcapture_sql_exception..deco\u001B[0;34m(*a, **kw)\u001B[0m\n\u001B[1;32m 257\u001B[0m converted \u001B[38;5;241m=\u001B[39m convert_exception(e\u001B[38;5;241m.\u001B[39mjava_exception)\n\u001B[1;32m 258\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(converted, UnknownException):\n\u001B[1;32m 259\u001B[0m \u001B[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001B[39;00m\n\u001B[1;32m 260\u001B[0m 
\u001B[38;5;66;03m# JVM exception message.\u001B[39;00m\n\u001B[0;32m--> 261\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m converted \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m 262\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 263\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m\n", + "\u001B[0;31mAnalysisException\u001B[0m: [SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\nTo tolerate the error on drop use DROP SCHEMA IF EXISTS. SQLSTATE: 42704" + ], + "type": "baseError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "import pytest\n", + "\n", + "# Run tests from test_sku_translator.py\n", + "pytest.main([\"-v\", \"tools/test_sample_tool_uc.py\"])\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9e5ff229-40fe-46c0-9612-179aa6f6e8ec", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "# Now, here's another example of a tool that executes python code." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "1a953aa6-7066-4ce5-931e-c089ebba11ba", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting tools/code_exec.py\n" + ] + } + ], + "source": [ + "%%writefile tools/code_exec.py\n", + "def python_exec(code: str) -> str:\n", + " \"\"\"\n", + " Executes Python code in the sandboxed environment and returns its stdout. The runtime is stateless and you can not read output of the previous tool executions. i.e. No such variables \"rows\", \"observation\" defined. Calling another tool inside a Python code is NOT allowed.\n", + " Use only standard python libraries and these python libraries: bleach, chardet, charset-normalizer, defusedxml, googleapis-common-protos, grpcio, grpcio-status, jmespath, joblib, numpy, packaging, pandas, patsy, protobuf, pyarrow, pyparsing, python-dateutil, pytz, scikit-learn, scipy, setuptools, six, threadpoolctl, webencodings, user-agents, cryptography.\n", + "\n", + " Args:\n", + " code (str): Python code to execute. 
Remember to print the final result to stdout.\n", + "\n", + " Returns:\n", + " str: The output of the executed code.\n", + " \"\"\"\n", + " import sys\n", + " from io import StringIO\n", + "\n", + " sys_stdout = sys.stdout\n", + " redirected_output = StringIO()\n", + " sys.stdout = redirected_output\n", + " exec(code)\n", + " sys.stdout = sys_stdout\n", + " return redirected_output.getvalue()\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9602c5ed-d8a6-48ff-a3e3-9aba0aabffe0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'hello\\n'" + ] + }, + "execution_count": 16, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from tools.code_exec import python_exec\n", + "\n", + "python_exec(\"print('hello')\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "149a3c84-4e2d-4dca-810e-2068a7d2bdcd", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Test it locally" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4872d9e4-f46e-4bf5-ba98-5703abe960e2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting tools/test_code_exec.py\n" + ] + } + ], + "source": [ + "%%writefile tools/test_code_exec.py\n", + "\n", + "import pytest\n", + "from .code_exec import python_exec\n", + "\n", + "\n", + "def test_basic_arithmetic():\n", + " code = \"\"\"result = 2 + 2\\nprint(result)\"\"\"\n", + " assert python_exec(code).strip() == \"4\"\n", + "\n", + "\n", + "def test_multiple_lines():\n", + " code = \"x = 5\\n\" \"y = 3\\n\" \"result = x * y\\n\" \"print(result)\"\n", + " assert python_exec(code).strip() == \"15\"\n", + "\n", + "\n", + "def test_multiple_prints():\n", + " code = \"\"\"print('first')\\nprint('second')\\nprint('third')\\n\"\"\"\n", + " expected = \"first\\nsecond\\nthird\\n\"\n", + " assert python_exec(code) == expected\n", + "\n", + "\n", + "def test_using_pandas():\n", + " code = (\n", + " \"import pandas as pd\\n\"\n", + " \"data = {'col1': [1, 2], 'col2': [3, 4]}\\n\"\n", + " \"df = pd.DataFrame(data)\\n\"\n", + " \"print(df.shape)\"\n", + " )\n", + " assert python_exec(code).strip() == \"(2, 2)\"\n", + "\n", + "\n", + "def test_using_numpy():\n", + " code = \"import numpy as np\\n\" \"arr = np.array([1, 2, 3])\\n\" \"print(arr.mean())\"\n", + " assert python_exec(code).strip() == \"2.0\"\n", + "\n", + "\n", + "def test_syntax_error():\n", + " code = \"if True\\n\" \" print('invalid syntax')\"\n", + " with pytest.raises(SyntaxError):\n", + " python_exec(code)\n", + "\n", + "\n", + "def test_runtime_error():\n", + " code = \"x = 1 / 0\\n\" \"print(x)\"\n", + " with pytest.raises(ZeroDivisionError):\n", + " python_exec(code)\n", + "\n", + "\n", + "def test_undefined_variable():\n", + " code = \"print(undefined_variable)\"\n", + " with pytest.raises(NameError):\n", + " 
python_exec(code)\n", + "\n", + "\n", + "def test_multiline_string_manipulation():\n", + " code = \"text = '''\\n\" \"Hello\\n\" \"World\\n\" \"'''\\n\" \"print(text.strip())\"\n", + " expected = \"Hello\\nWorld\"\n", + " assert python_exec(code).strip() == expected\n", + "\n", + "# Will not fail locally, but will fail in UC.\n", + "# def test_unauthorized_flask():\n", + "# code = \"from flask import Flask\\n\" \"app = Flask(__name__)\\n\" \"print(app)\"\n", + "# with pytest.raises(ImportError):\n", + "# python_exec(code)\n", + "\n", + "\n", + "def test_no_print_statement():\n", + " code = \"x = 42\\n\" \"y = x * 2\"\n", + " assert python_exec(code) == \"\"\n", + "\n", + "\n", + "def test_calculation_without_print():\n", + " code = \"result = sum([1, 2, 3, 4, 5])\\n\" \"squared = [x**2 for x in range(5)]\"\n", + " assert python_exec(code) == \"\"\n", + "\n", + "\n", + "def test_function_definition_without_call():\n", + " code = \"def add(a, b):\\n\" \" return a + b\\n\" \"result = add(3, 4)\"\n", + " assert python_exec(code) == \"\"\n", + "\n", + "\n", + "def test_class_definition_without_instantiation():\n", + " code = (\n", + " \"class Calculator:\\n\"\n", + " \" def add(self, a, b):\\n\"\n", + " \" return a + b\\n\"\n", + " \"calc = Calculator()\"\n", + " )\n", + " assert python_exec(code) == \"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b9a5ce5e-53a3-4975-a5f8-c9a5cbaa72cb", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m\n", + "\u001B[0;31mAnalysisException\u001B[0m Traceback (most recent call last)\n", + "File \u001B[0;32m, line 9\u001B[0m\n", + "\u001B[1;32m 6\u001B[0m SCHEMA \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcookbook_local_test\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# Change me if you want\u001B[39;00m\n", + "\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\u001B[39;00m\n", + "\u001B[0;32m----> 9\u001B[0m tool_uc_info \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mcreate_python_function(func\u001B[38;5;241m=\u001B[39msku_sample_translator, catalog\u001B[38;5;241m=\u001B[39mCATALOG, schema\u001B[38;5;241m=\u001B[39mSCHEMA, replace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n", + "\u001B[1;32m 11\u001B[0m \u001B[38;5;66;03m# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\u001B[39;00m\n", + "\u001B[1;32m 12\u001B[0m \u001B[38;5;66;03m# Print the deployed Unity Catalog function name\u001B[39;00m\n", + "\u001B[1;32m 13\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeployed Unity Catalog function name: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mtool_uc_info\u001B[38;5;241m.\u001B[39mfull_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in 
\u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n", + "\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n", + "\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n", + "\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n", + "\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 164\u001B[0m ):\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:450\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_python_function\u001B[0;34m(self, func, catalog, schema, replace)\u001B[0m\n", + "\u001B[1;32m 446\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe provided function is not callable.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", + "\u001B[1;32m 448\u001B[0m sql_function_body \u001B[38;5;241m=\u001B[39m generate_sql_function_body(func, catalog, schema, replace)\n", + "\u001B[0;32m--> 450\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcreate_function(sql_function_body\u001B[38;5;241m=\u001B[39msql_function_body)\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n", + "\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n", + "\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n", + "\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n", + "\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 
164\u001B[0m ):\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:296\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_function\u001B[0;34m(self, sql_function_body)\u001B[0m\n", + "\u001B[1;32m 294\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_default_spark_session()\n", + "\u001B[1;32m 295\u001B[0m \u001B[38;5;66;03m# TODO: add timeout\u001B[39;00m\n", + "\u001B[0;32m--> 296\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspark\u001B[38;5;241m.\u001B[39msql(sql_function_body)\n", + "\u001B[1;32m 297\u001B[0m created_function_info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_function(extract_function_name(sql_function_body))\n", + "\u001B[1;32m 298\u001B[0m check_function_info(created_function_info)\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47\u001B[0m, in \u001B[0;36m_wrap_function..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 45\u001B[0m start \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mperf_counter()\n", + "\u001B[1;32m 46\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m---> 47\u001B[0m res \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 48\u001B[0m logger\u001B[38;5;241m.\u001B[39mlog_success(\n", + "\u001B[1;32m 49\u001B[0m module_name, class_name, function_name, time\u001B[38;5;241m.\u001B[39mperf_counter() \u001B[38;5;241m-\u001B[39m start, signature\n", + "\u001B[1;32m 50\u001B[0m )\n", + "\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/sql/session.py:1825\u001B[0m, in \u001B[0;36mSparkSession.sql\u001B[0;34m(self, sqlQuery, args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 1820\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n", + "\u001B[1;32m 1821\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m PySparkTypeError(\n", + "\u001B[1;32m 1822\u001B[0m error_class\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mINVALID_TYPE\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n", + "\u001B[1;32m 1823\u001B[0m message_parameters\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_name\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124margs\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_type\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;28mtype\u001B[39m(args)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m},\n", + "\u001B[1;32m 1824\u001B[0m )\n", + "\u001B[0;32m-> 1825\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m DataFrame(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jsparkSession\u001B[38;5;241m.\u001B[39msql(sqlQuery, litArgs), \u001B[38;5;28mself\u001B[39m)\n", + "\u001B[1;32m 1826\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n", + "\u001B[1;32m 1827\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(kwargs) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355\u001B[0m, in \u001B[0;36mJavaMember.__call__\u001B[0;34m(self, *args)\u001B[0m\n", + "\u001B[1;32m 1349\u001B[0m command \u001B[38;5;241m=\u001B[39m 
proto\u001B[38;5;241m.\u001B[39mCALL_COMMAND_NAME \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1350\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcommand_header \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1351\u001B[0m args_command \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1352\u001B[0m proto\u001B[38;5;241m.\u001B[39mEND_COMMAND_PART\n", + "\u001B[1;32m 1354\u001B[0m answer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client\u001B[38;5;241m.\u001B[39msend_command(command)\n", + "\u001B[0;32m-> 1355\u001B[0m return_value \u001B[38;5;241m=\u001B[39m get_return_value(\n", + "\u001B[1;32m 1356\u001B[0m answer, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtarget_id, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mname)\n", + "\u001B[1;32m 1358\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m temp_arg \u001B[38;5;129;01min\u001B[39;00m temp_args:\n", + "\u001B[1;32m 1359\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mhasattr\u001B[39m(temp_arg, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_detach\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/errors/exceptions/captured.py:261\u001B[0m, in \u001B[0;36mcapture_sql_exception..deco\u001B[0;34m(*a, **kw)\u001B[0m\n", + "\u001B[1;32m 257\u001B[0m converted \u001B[38;5;241m=\u001B[39m convert_exception(e\u001B[38;5;241m.\u001B[39mjava_exception)\n", + "\u001B[1;32m 258\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(converted, UnknownException):\n", + "\u001B[1;32m 259\u001B[0m \u001B[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001B[39;00m\n", + "\u001B[1;32m 260\u001B[0m \u001B[38;5;66;03m# JVM exception message.\u001B[39;00m\n", + "\u001B[0;32m--> 261\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m converted \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n", + "\u001B[1;32m 262\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n", + "\u001B[1;32m 263\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m\n", + "\n", + "\u001B[0;31mAnalysisException\u001B[0m: [SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\n", + "If you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\n", + "To tolerate the error on drop use DROP SCHEMA IF EXISTS. SQLSTATE: 42704" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "datasetInfos": [], + "jupyterProps": { + "ename": "AnalysisException", + "evalue": "[SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\nTo tolerate the error on drop use DROP SCHEMA IF EXISTS. 
SQLSTATE: 42704" + }, + "metadata": { + "errorSummary": "Command skipped" + }, + "removedWidgets": [], + "sqlProps": { + "errorClass": "SCHEMA_NOT_FOUND", + "pysparkCallSite": null, + "pysparkFragment": null, + "sqlState": "42704", + "stackTrace": null, + "startIndex": null, + "stopIndex": null + }, + "stackFrames": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mAnalysisException\u001B[0m Traceback (most recent call last)", + "File \u001B[0;32m, line 9\u001B[0m\n\u001B[1;32m 6\u001B[0m SCHEMA \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcookbook_local_test\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# Change me if you want\u001B[39;00m\n\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\u001B[39;00m\n\u001B[0;32m----> 9\u001B[0m tool_uc_info \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mcreate_python_function(func\u001B[38;5;241m=\u001B[39msku_sample_translator, catalog\u001B[38;5;241m=\u001B[39mCATALOG, schema\u001B[38;5;241m=\u001B[39mSCHEMA, replace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[1;32m 11\u001B[0m \u001B[38;5;66;03m# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\u001B[39;00m\n\u001B[1;32m 12\u001B[0m \u001B[38;5;66;03m# Print the deployed Unity Catalog function name\u001B[39;00m\n\u001B[1;32m 13\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeployed Unity Catalog function name: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mtool_uc_info\u001B[38;5;241m.\u001B[39mfull_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 164\u001B[0m ):\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:450\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_python_function\u001B[0;34m(self, func, catalog, schema, 
replace)\u001B[0m\n\u001B[1;32m 446\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe provided function is not callable.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 448\u001B[0m sql_function_body \u001B[38;5;241m=\u001B[39m generate_sql_function_body(func, catalog, schema, replace)\n\u001B[0;32m--> 450\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcreate_function(sql_function_body\u001B[38;5;241m=\u001B[39msql_function_body)\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 164\u001B[0m ):\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:296\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_function\u001B[0;34m(self, sql_function_body)\u001B[0m\n\u001B[1;32m 294\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_default_spark_session()\n\u001B[1;32m 295\u001B[0m \u001B[38;5;66;03m# TODO: add timeout\u001B[39;00m\n\u001B[0;32m--> 296\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspark\u001B[38;5;241m.\u001B[39msql(sql_function_body)\n\u001B[1;32m 297\u001B[0m created_function_info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_function(extract_function_name(sql_function_body))\n\u001B[1;32m 298\u001B[0m check_function_info(created_function_info)\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47\u001B[0m, in \u001B[0;36m_wrap_function..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 45\u001B[0m start \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mperf_counter()\n\u001B[1;32m 46\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m---> 47\u001B[0m res \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 48\u001B[0m logger\u001B[38;5;241m.\u001B[39mlog_success(\n\u001B[1;32m 49\u001B[0m module_name, class_name, function_name, time\u001B[38;5;241m.\u001B[39mperf_counter() \u001B[38;5;241m-\u001B[39m start, 
signature\n\u001B[1;32m 50\u001B[0m )\n\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/sql/session.py:1825\u001B[0m, in \u001B[0;36mSparkSession.sql\u001B[0;34m(self, sqlQuery, args, **kwargs)\u001B[0m\n\u001B[1;32m 1820\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 1821\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m PySparkTypeError(\n\u001B[1;32m 1822\u001B[0m error_class\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mINVALID_TYPE\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 1823\u001B[0m message_parameters\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_name\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124margs\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_type\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;28mtype\u001B[39m(args)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m},\n\u001B[1;32m 1824\u001B[0m )\n\u001B[0;32m-> 1825\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m DataFrame(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jsparkSession\u001B[38;5;241m.\u001B[39msql(sqlQuery, litArgs), \u001B[38;5;28mself\u001B[39m)\n\u001B[1;32m 1826\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 1827\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(kwargs) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", + "File \u001B[0;32m/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355\u001B[0m, in \u001B[0;36mJavaMember.__call__\u001B[0;34m(self, *args)\u001B[0m\n\u001B[1;32m 1349\u001B[0m command \u001B[38;5;241m=\u001B[39m proto\u001B[38;5;241m.\u001B[39mCALL_COMMAND_NAME \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1350\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcommand_header \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1351\u001B[0m args_command \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1352\u001B[0m proto\u001B[38;5;241m.\u001B[39mEND_COMMAND_PART\n\u001B[1;32m 1354\u001B[0m answer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client\u001B[38;5;241m.\u001B[39msend_command(command)\n\u001B[0;32m-> 1355\u001B[0m return_value \u001B[38;5;241m=\u001B[39m get_return_value(\n\u001B[1;32m 1356\u001B[0m answer, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtarget_id, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mname)\n\u001B[1;32m 1358\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m temp_arg \u001B[38;5;129;01min\u001B[39;00m temp_args:\n\u001B[1;32m 1359\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mhasattr\u001B[39m(temp_arg, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_detach\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/errors/exceptions/captured.py:261\u001B[0m, in \u001B[0;36mcapture_sql_exception..deco\u001B[0;34m(*a, **kw)\u001B[0m\n\u001B[1;32m 257\u001B[0m converted \u001B[38;5;241m=\u001B[39m convert_exception(e\u001B[38;5;241m.\u001B[39mjava_exception)\n\u001B[1;32m 258\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(converted, UnknownException):\n\u001B[1;32m 259\u001B[0m \u001B[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001B[39;00m\n\u001B[1;32m 260\u001B[0m 
\u001B[38;5;66;03m# JVM exception message.\u001B[39;00m\n\u001B[0;32m--> 261\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m converted \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m 262\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 263\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m\n", + "\u001B[0;31mAnalysisException\u001B[0m: [SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\nTo tolerate the error on drop use DROP SCHEMA IF EXISTS. SQLSTATE: 42704" + ], + "type": "baseError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "import pytest\n", + "\n", + "# Run tests from test_sku_translator.py\n", + "pytest.main([\"-v\", \"tools/test_code_exec.py\"])\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b18d032f-246e-4543-90e0-c2ca9062a365", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Deploy to UC" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "74c78c59-b6a8-46a6-9e1f-214ef81e30dc", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. 
To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Deployed Unity Catalog function name: casaman_ssa.demos.python_exec\n" + ] + } + ], + "source": [ + "from unitycatalog.ai.core.databricks import DatabricksFunctionClient\n", + "from tools.code_exec import python_exec\n", + "from cookbook.tools.uc_tool import UCTool\n", + "\n", + "client = DatabricksFunctionClient()\n", + "CATALOG = \"casaman_ssa\" # Change me!\n", + "SCHEMA = \"demos\" # Change me if you want\n", + "\n", + "# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\n", + "python_exec_tool_uc_info = client.create_python_function(func=python_exec, catalog=CATALOG, schema=SCHEMA, replace=True)\n", + "\n", + "# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\n", + "# Print the deployed Unity Catalog function name\n", + "print(f\"Deployed Unity Catalog function name: {python_exec_tool_uc_info.full_name}\")\n", + "\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d20cf99e-5b30-434e-bb92-7eebf02eb9c7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Test as UC Tool for the Agent" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "60a6dd4f-6ff4-4e1e-8f59-bd631394d1d1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. 
To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'error': None, 'format': 'SCALAR', 'value': 'hello\\n', 'truncated': None}" + ] + }, + "execution_count": 20, + "metadata": {}, + "output_type": "execute_result" + }, + { + "output_type": "display_data", + "data": { + "application/databricks.mlflow.trace": "\"tr-b035d8c2b9df4aca8f57b65b938fac7c\"", + "text/plain": [ + "Trace(request_id=tr-b035d8c2b9df4aca8f57b65b938fac7c)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from cookbook.tools.uc_tool import UCTool\n", + "\n", + "\n", + "# wrap the tool into a UCTool which can be passed to our Agent\n", + "python_exec_tool = UCTool(uc_function_name=python_exec_tool_uc_info.full_name)\n", + "\n", + "python_exec_tool(code=\"print('hello')\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8cd9743c-4644-42db-bbcf-034d0d19d87a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "New tests" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0f45e755-1678-491e-8773-9fd327bdcfa6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "Overwriting tools/test_code_exec_as_uc_tool.py\n" + ] + } + ], + "source": [ + "%%writefile tools/test_code_exec_as_uc_tool.py\n", + "\n", + "import pytest\n", + "from cookbook.tools.uc_tool import UCTool\n", + "\n", + "CATALOG = \"ep\"\n", + "SCHEMA = \"cookbook_local_test\"\n", + "\n", + "\n", + "@pytest.fixture\n", + "def python_exec():\n", + " \"\"\"Fixture to provide the python_exec function from UCTool.\"\"\"\n", + " python_exec_tool = UCTool(uc_function_name=f\"{CATALOG}.{SCHEMA}.python_exec\")\n", + " return python_exec_tool\n", + "\n", + "\n", + "def test_basic_arithmetic(python_exec):\n", + " code = \"\"\"result = 2 + 2\\nprint(result)\"\"\"\n", + " assert python_exec(code=code)[\"value\"].strip() == \"4\"\n", + "\n", + "\n", + "def test_multiple_lines(python_exec):\n", + " code = \"x = 5\\n\" \"y = 3\\n\" \"result = x * y\\n\" \"print(result)\"\n", + " assert python_exec(code=code)[\"value\"].strip() == \"15\"\n", + "\n", + "\n", + "def test_multiple_prints(python_exec):\n", + " code = \"\"\"print('first')\\nprint('second')\\nprint('third')\\n\"\"\"\n", + " expected = \"first\\nsecond\\nthird\\n\"\n", + " assert python_exec(code=code)[\"value\"] == expected\n", + "\n", + "\n", + "def test_using_pandas(python_exec):\n", + " code = (\n", + " \"import pandas as pd\\n\"\n", + " \"data = {'col1': [1, 2], 'col2': [3, 4]}\\n\"\n", + " \"df = pd.DataFrame(data)\\n\"\n", + " \"print(df.shape)\"\n", + " )\n", + " assert python_exec(code=code)[\"value\"].strip() == \"(2, 2)\"\n", + "\n", + "\n", + "def test_using_numpy(python_exec):\n", + " code = \"import numpy as np\\n\" \"arr = np.array([1, 2, 3])\\n\" \"print(arr.mean())\"\n", + " assert python_exec(code=code)[\"value\"].strip() == \"2.0\"\n", + "\n", + "\n", + "def test_syntax_error(python_exec):\n", + 
" code = \"if True\\n\" \" print('invalid syntax')\"\n", + " result = python_exec(code=code)\n", + " assert \"Syntax error at or near 'invalid'.\" in result[\"error\"][\"error_message\"]\n", + "\n", + "\n", + "def test_runtime_error(python_exec):\n", + " code = \"x = 1 / 0\\n\" \"print(x)\"\n", + " result = python_exec(code=code)\n", + " assert \"ZeroDivisionError\" in result[\"error\"][\"error_message\"]\n", + "\n", + "\n", + "def test_undefined_variable(python_exec):\n", + " code = \"print(undefined_variable)\"\n", + " result = python_exec(code=code)\n", + " assert \"NameError\" in result[\"error\"][\"error_message\"]\n", + "\n", + "\n", + "def test_multiline_string_manipulation(python_exec):\n", + " code = \"text = '''\\n\" \"Hello\\n\" \"World\\n\" \"'''\\n\" \"print(text.strip())\"\n", + " expected = \"Hello\\nWorld\"\n", + " assert python_exec(code=code)[\"value\"].strip() == expected\n", + "\n", + "\n", + "def test_unauthorized_flask(python_exec):\n", + " code = \"from flask import Flask\\n\" \"app = Flask(__name__)\\n\" \"print(app)\"\n", + " result = python_exec(code=code)\n", + " assert (\n", + " \"ModuleNotFoundError: No module named 'flask'\"\n", + " in result[\"error\"][\"error_message\"]\n", + " )\n", + "\n", + "\n", + "def test_no_print_statement(python_exec):\n", + " code = \"x = 42\\n\" \"y = x * 2\"\n", + " assert python_exec(code=code)[\"value\"] == \"\"\n", + "\n", + "\n", + "def test_calculation_without_print(python_exec):\n", + " code = \"result = sum([1, 2, 3, 4, 5])\\n\" \"squared = [x**2 for x in range(5)]\"\n", + " assert python_exec(code=code)[\"value\"] == \"\"\n", + "\n", + "\n", + "def test_function_definition_without_call(python_exec):\n", + " code = \"def add(a, b):\\n\" \" return a + b\\n\" \"result = add(3, 4)\"\n", + " assert python_exec(code=code)[\"value\"] == \"\"\n", + "\n", + "\n", + "def test_class_definition_without_instantiation(python_exec):\n", + " code = (\n", + " \"class Calculator:\\n\"\n", + " \" def add(self, a, b):\\n\"\n", + " \" return a + b\\n\"\n", + " \"calc = Calculator()\"\n", + " )\n", + " assert python_exec(code=code)[\"value\"] == \"\"\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "716a4a3b-3bf2-4681-80e4-2ede7993017f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m\n", + "\u001B[0;31mAnalysisException\u001B[0m Traceback (most recent call last)\n", + "File \u001B[0;32m, line 9\u001B[0m\n", + "\u001B[1;32m 6\u001B[0m SCHEMA \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcookbook_local_test\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# Change me if you want\u001B[39;00m\n", + "\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\u001B[39;00m\n", + "\u001B[0;32m----> 9\u001B[0m tool_uc_info \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mcreate_python_function(func\u001B[38;5;241m=\u001B[39msku_sample_translator, catalog\u001B[38;5;241m=\u001B[39mCATALOG, schema\u001B[38;5;241m=\u001B[39mSCHEMA, replace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n", + "\u001B[1;32m 
11\u001B[0m \u001B[38;5;66;03m# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\u001B[39;00m\n", + "\u001B[1;32m 12\u001B[0m \u001B[38;5;66;03m# Print the deployed Unity Catalog function name\u001B[39;00m\n", + "\u001B[1;32m 13\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeployed Unity Catalog function name: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mtool_uc_info\u001B[38;5;241m.\u001B[39mfull_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n", + "\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n", + "\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n", + "\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n", + "\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 164\u001B[0m ):\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:450\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_python_function\u001B[0;34m(self, func, catalog, schema, replace)\u001B[0m\n", + "\u001B[1;32m 446\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe provided function is not callable.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n", + "\u001B[1;32m 448\u001B[0m sql_function_body \u001B[38;5;241m=\u001B[39m generate_sql_function_body(func, catalog, schema, replace)\n", + "\u001B[0;32m--> 450\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcreate_function(sql_function_body\u001B[38;5;241m=\u001B[39msql_function_body)\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n", + "\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m--> 
158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n", + "\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n", + "\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n", + "\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n", + "\u001B[1;32m 164\u001B[0m ):\n", + "\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:296\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_function\u001B[0;34m(self, sql_function_body)\u001B[0m\n", + "\u001B[1;32m 294\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_default_spark_session()\n", + "\u001B[1;32m 295\u001B[0m \u001B[38;5;66;03m# TODO: add timeout\u001B[39;00m\n", + "\u001B[0;32m--> 296\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspark\u001B[38;5;241m.\u001B[39msql(sql_function_body)\n", + "\u001B[1;32m 297\u001B[0m created_function_info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_function(extract_function_name(sql_function_body))\n", + "\u001B[1;32m 298\u001B[0m check_function_info(created_function_info)\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47\u001B[0m, in \u001B[0;36m_wrap_function..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 45\u001B[0m start \u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mperf_counter()\n", + "\u001B[1;32m 46\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n", + "\u001B[0;32m---> 47\u001B[0m res \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n", + "\u001B[1;32m 48\u001B[0m logger\u001B[38;5;241m.\u001B[39mlog_success(\n", + "\u001B[1;32m 49\u001B[0m module_name, class_name, function_name, time\u001B[38;5;241m.\u001B[39mperf_counter() \u001B[38;5;241m-\u001B[39m start, signature\n", + "\u001B[1;32m 50\u001B[0m )\n", + "\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/sql/session.py:1825\u001B[0m, in \u001B[0;36mSparkSession.sql\u001B[0;34m(self, sqlQuery, args, **kwargs)\u001B[0m\n", + "\u001B[1;32m 1820\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n", + "\u001B[1;32m 1821\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m PySparkTypeError(\n", + "\u001B[1;32m 1822\u001B[0m error_class\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mINVALID_TYPE\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n", + "\u001B[1;32m 1823\u001B[0m message_parameters\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_name\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124margs\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_type\u001B[39m\u001B[38;5;124m\"\u001B[39m: 
\u001B[38;5;28mtype\u001B[39m(args)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m},\n", + "\u001B[1;32m 1824\u001B[0m )\n", + "\u001B[0;32m-> 1825\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m DataFrame(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jsparkSession\u001B[38;5;241m.\u001B[39msql(sqlQuery, litArgs), \u001B[38;5;28mself\u001B[39m)\n", + "\u001B[1;32m 1826\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n", + "\u001B[1;32m 1827\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(kwargs) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355\u001B[0m, in \u001B[0;36mJavaMember.__call__\u001B[0;34m(self, *args)\u001B[0m\n", + "\u001B[1;32m 1349\u001B[0m command \u001B[38;5;241m=\u001B[39m proto\u001B[38;5;241m.\u001B[39mCALL_COMMAND_NAME \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1350\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcommand_header \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1351\u001B[0m args_command \u001B[38;5;241m+\u001B[39m\\\n", + "\u001B[1;32m 1352\u001B[0m proto\u001B[38;5;241m.\u001B[39mEND_COMMAND_PART\n", + "\u001B[1;32m 1354\u001B[0m answer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client\u001B[38;5;241m.\u001B[39msend_command(command)\n", + "\u001B[0;32m-> 1355\u001B[0m return_value \u001B[38;5;241m=\u001B[39m get_return_value(\n", + "\u001B[1;32m 1356\u001B[0m answer, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtarget_id, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mname)\n", + "\u001B[1;32m 1358\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m temp_arg \u001B[38;5;129;01min\u001B[39;00m temp_args:\n", + "\u001B[1;32m 1359\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mhasattr\u001B[39m(temp_arg, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_detach\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n", + "\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/errors/exceptions/captured.py:261\u001B[0m, in \u001B[0;36mcapture_sql_exception..deco\u001B[0;34m(*a, **kw)\u001B[0m\n", + "\u001B[1;32m 257\u001B[0m converted \u001B[38;5;241m=\u001B[39m convert_exception(e\u001B[38;5;241m.\u001B[39mjava_exception)\n", + "\u001B[1;32m 258\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(converted, UnknownException):\n", + "\u001B[1;32m 259\u001B[0m \u001B[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001B[39;00m\n", + "\u001B[1;32m 260\u001B[0m \u001B[38;5;66;03m# JVM exception message.\u001B[39;00m\n", + "\u001B[0;32m--> 261\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m converted \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n", + "\u001B[1;32m 262\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n", + "\u001B[1;32m 263\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m\n", + "\n", + "\u001B[0;31mAnalysisException\u001B[0m: [SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\n", + "If you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\n", + "To tolerate the error on drop use DROP SCHEMA IF EXISTS. 
SQLSTATE: 42704" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "datasetInfos": [], + "jupyterProps": { + "ename": "AnalysisException", + "evalue": "[SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\nTo tolerate the error on drop use DROP SCHEMA IF EXISTS. SQLSTATE: 42704" + }, + "metadata": { + "errorSummary": "Command skipped" + }, + "removedWidgets": [], + "sqlProps": { + "errorClass": "SCHEMA_NOT_FOUND", + "pysparkCallSite": null, + "pysparkFragment": null, + "sqlState": "42704", + "stackTrace": null, + "startIndex": null, + "stopIndex": null + }, + "stackFrames": [ + "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", + "\u001B[0;31mAnalysisException\u001B[0m Traceback (most recent call last)", + "File \u001B[0;32m, line 9\u001B[0m\n\u001B[1;32m 6\u001B[0m SCHEMA \u001B[38;5;241m=\u001B[39m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mcookbook_local_test\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;66;03m# Change me if you want\u001B[39;00m\n\u001B[1;32m 8\u001B[0m \u001B[38;5;66;03m# this will deploy the tool to UC, automatically setting the metadata in UC based on the tool's docstring & typing hints\u001B[39;00m\n\u001B[0;32m----> 9\u001B[0m tool_uc_info \u001B[38;5;241m=\u001B[39m client\u001B[38;5;241m.\u001B[39mcreate_python_function(func\u001B[38;5;241m=\u001B[39msku_sample_translator, catalog\u001B[38;5;241m=\u001B[39mCATALOG, schema\u001B[38;5;241m=\u001B[39mSCHEMA, replace\u001B[38;5;241m=\u001B[39m\u001B[38;5;28;01mTrue\u001B[39;00m)\n\u001B[1;32m 11\u001B[0m \u001B[38;5;66;03m# the tool will deploy to a function in UC called `{catalog}.{schema}.{func}` where {func} is the name of the function\u001B[39;00m\n\u001B[1;32m 12\u001B[0m \u001B[38;5;66;03m# Print the deployed Unity Catalog function name\u001B[39;00m\n\u001B[1;32m 13\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeployed Unity Catalog function name: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mtool_uc_info\u001B[38;5;241m.\u001B[39mfull_name\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m 
result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 164\u001B[0m ):\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:450\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_python_function\u001B[0;34m(self, func, catalog, schema, replace)\u001B[0m\n\u001B[1;32m 446\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mValueError\u001B[39;00m(\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mThe provided function is not callable.\u001B[39m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 448\u001B[0m sql_function_body \u001B[38;5;241m=\u001B[39m generate_sql_function_body(func, catalog, schema, replace)\n\u001B[0;32m--> 450\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcreate_function(sql_function_body\u001B[38;5;241m=\u001B[39msql_function_body)\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:158\u001B[0m, in \u001B[0;36mretry_on_session_expiration..wrapper\u001B[0;34m(self, *args, **kwargs)\u001B[0m\n\u001B[1;32m 156\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m attempt \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mrange\u001B[39m(\u001B[38;5;241m1\u001B[39m, max_attempts \u001B[38;5;241m+\u001B[39m \u001B[38;5;241m1\u001B[39m):\n\u001B[1;32m 157\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m--> 158\u001B[0m result \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;28mself\u001B[39m, \u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 159\u001B[0m \u001B[38;5;66;03m# for non-session related error in the result, we should directly return the result\u001B[39;00m\n\u001B[1;32m 160\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m (\n\u001B[1;32m 161\u001B[0m \u001B[38;5;28misinstance\u001B[39m(result, FunctionExecutionResult)\n\u001B[1;32m 162\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 163\u001B[0m \u001B[38;5;129;01mand\u001B[39;00m SESSION_EXCEPTION_MESSAGE \u001B[38;5;129;01min\u001B[39;00m result\u001B[38;5;241m.\u001B[39merror\n\u001B[1;32m 164\u001B[0m ):\n", + "File \u001B[0;32m/local_disk0/.ephemeral_nfs/envs/pythonEnv-4ca65ca9-6afb-4a1a-8c05-563aa66609a0/lib/python3.11/site-packages/unitycatalog/ai/core/databricks.py:296\u001B[0m, in \u001B[0;36mDatabricksFunctionClient.create_function\u001B[0;34m(self, sql_function_body)\u001B[0m\n\u001B[1;32m 294\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mset_default_spark_session()\n\u001B[1;32m 295\u001B[0m \u001B[38;5;66;03m# TODO: add timeout\u001B[39;00m\n\u001B[0;32m--> 296\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mspark\u001B[38;5;241m.\u001B[39msql(sql_function_body)\n\u001B[1;32m 297\u001B[0m created_function_info \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mget_function(extract_function_name(sql_function_body))\n\u001B[1;32m 298\u001B[0m check_function_info(created_function_info)\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/instrumentation_utils.py:47\u001B[0m, in \u001B[0;36m_wrap_function..wrapper\u001B[0;34m(*args, **kwargs)\u001B[0m\n\u001B[1;32m 45\u001B[0m start 
\u001B[38;5;241m=\u001B[39m time\u001B[38;5;241m.\u001B[39mperf_counter()\n\u001B[1;32m 46\u001B[0m \u001B[38;5;28;01mtry\u001B[39;00m:\n\u001B[0;32m---> 47\u001B[0m res \u001B[38;5;241m=\u001B[39m func(\u001B[38;5;241m*\u001B[39margs, \u001B[38;5;241m*\u001B[39m\u001B[38;5;241m*\u001B[39mkwargs)\n\u001B[1;32m 48\u001B[0m logger\u001B[38;5;241m.\u001B[39mlog_success(\n\u001B[1;32m 49\u001B[0m module_name, class_name, function_name, time\u001B[38;5;241m.\u001B[39mperf_counter() \u001B[38;5;241m-\u001B[39m start, signature\n\u001B[1;32m 50\u001B[0m )\n\u001B[1;32m 51\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m res\n", + "File \u001B[0;32m/databricks/spark/python/pyspark/sql/session.py:1825\u001B[0m, in \u001B[0;36mSparkSession.sql\u001B[0;34m(self, sqlQuery, args, **kwargs)\u001B[0m\n\u001B[1;32m 1820\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 1821\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m PySparkTypeError(\n\u001B[1;32m 1822\u001B[0m error_class\u001B[38;5;241m=\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mINVALID_TYPE\u001B[39m\u001B[38;5;124m\"\u001B[39m,\n\u001B[1;32m 1823\u001B[0m message_parameters\u001B[38;5;241m=\u001B[39m{\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_name\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124margs\u001B[39m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124marg_type\u001B[39m\u001B[38;5;124m\"\u001B[39m: \u001B[38;5;28mtype\u001B[39m(args)\u001B[38;5;241m.\u001B[39m\u001B[38;5;18m__name__\u001B[39m},\n\u001B[1;32m 1824\u001B[0m )\n\u001B[0;32m-> 1825\u001B[0m \u001B[38;5;28;01mreturn\u001B[39;00m DataFrame(\u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39m_jsparkSession\u001B[38;5;241m.\u001B[39msql(sqlQuery, litArgs), \u001B[38;5;28mself\u001B[39m)\n\u001B[1;32m 1826\u001B[0m \u001B[38;5;28;01mfinally\u001B[39;00m:\n\u001B[1;32m 1827\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mlen\u001B[39m(kwargs) \u001B[38;5;241m>\u001B[39m \u001B[38;5;241m0\u001B[39m:\n", + "File \u001B[0;32m/databricks/spark/python/lib/py4j-0.10.9.7-src.zip/py4j/java_gateway.py:1355\u001B[0m, in \u001B[0;36mJavaMember.__call__\u001B[0;34m(self, *args)\u001B[0m\n\u001B[1;32m 1349\u001B[0m command \u001B[38;5;241m=\u001B[39m proto\u001B[38;5;241m.\u001B[39mCALL_COMMAND_NAME \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1350\u001B[0m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mcommand_header \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1351\u001B[0m args_command \u001B[38;5;241m+\u001B[39m\\\n\u001B[1;32m 1352\u001B[0m proto\u001B[38;5;241m.\u001B[39mEND_COMMAND_PART\n\u001B[1;32m 1354\u001B[0m answer \u001B[38;5;241m=\u001B[39m \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client\u001B[38;5;241m.\u001B[39msend_command(command)\n\u001B[0;32m-> 1355\u001B[0m return_value \u001B[38;5;241m=\u001B[39m get_return_value(\n\u001B[1;32m 1356\u001B[0m answer, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mgateway_client, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mtarget_id, \u001B[38;5;28mself\u001B[39m\u001B[38;5;241m.\u001B[39mname)\n\u001B[1;32m 1358\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m temp_arg \u001B[38;5;129;01min\u001B[39;00m temp_args:\n\u001B[1;32m 1359\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;28mhasattr\u001B[39m(temp_arg, \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124m_detach\u001B[39m\u001B[38;5;124m\"\u001B[39m):\n", + "File 
\u001B[0;32m/databricks/spark/python/pyspark/errors/exceptions/captured.py:261\u001B[0m, in \u001B[0;36mcapture_sql_exception..deco\u001B[0;34m(*a, **kw)\u001B[0m\n\u001B[1;32m 257\u001B[0m converted \u001B[38;5;241m=\u001B[39m convert_exception(e\u001B[38;5;241m.\u001B[39mjava_exception)\n\u001B[1;32m 258\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;28misinstance\u001B[39m(converted, UnknownException):\n\u001B[1;32m 259\u001B[0m \u001B[38;5;66;03m# Hide where the exception came from that shows a non-Pythonic\u001B[39;00m\n\u001B[1;32m 260\u001B[0m \u001B[38;5;66;03m# JVM exception message.\u001B[39;00m\n\u001B[0;32m--> 261\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m converted \u001B[38;5;28;01mfrom\u001B[39;00m \u001B[38;5;28;01mNone\u001B[39;00m\n\u001B[1;32m 262\u001B[0m \u001B[38;5;28;01melse\u001B[39;00m:\n\u001B[1;32m 263\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m\n", + "\u001B[0;31mAnalysisException\u001B[0m: [SCHEMA_NOT_FOUND] The schema `ep.cookbook_local_test` cannot be found. Verify the spelling and correctness of the schema and catalog.\nIf you did not qualify the name with a catalog, verify the current_schema() output, or qualify the name with the correct catalog.\nTo tolerate the error on drop use DROP SCHEMA IF EXISTS. SQLSTATE: 42704" + ], + "type": "baseError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "import pytest\n", + "\n", + "# Run tests from test_sku_translator.py\n", + "pytest.main([\"-v\", \"tools/test_code_exec_as_uc_tool.py\"])\n", + "\n" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "04_create_tools", + "widgets": {} + }, + "kernelspec": { + "display_name": "genai-cookbook-T2SdtsNM-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/autogen_agent_app_sample_code/05_tool_calling_agent.ipynb b/autogen_agent_app_sample_code/05_tool_calling_agent.ipynb index aa72187..f173720 100644 --- a/autogen_agent_app_sample_code/05_tool_calling_agent.ipynb +++ b/autogen_agent_app_sample_code/05_tool_calling_agent.ipynb @@ -1,445 +1,8960 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC ## 👉 START HERE: How to use this notebook -# MAGIC -# MAGIC # Step 3: Build, evaluate, & deploy your Agent -# MAGIC -# MAGIC Use this notebook to iterate on the code and configuration of your Agent. -# MAGIC -# MAGIC By the end of this notebook, you will have 1+ registered versions of your Agent, each coupled with a detailed quality evaluation. -# MAGIC -# MAGIC Optionally, you can deploy a version of your Agent that you can interact with in the [Mosiac AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) and let your business stakeholders who don't have Databricks accounts interact with it & provide feedback in the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui). 
-# MAGIC -# MAGIC -# MAGIC For each version of your agent, you will have an MLflow run inside your MLflow experiment that contains: -# MAGIC - Your Agent's code & config -# MAGIC - Evaluation metrics for cost, quality, and latency - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC **Important note:** Throughout this notebook, we indicate which cell's code you: -# MAGIC - ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality. -# MAGIC - 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent -# MAGIC -# MAGIC *Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.* - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Install Python libraries -# MAGIC -# MAGIC You do not need to modify this cell unless you need additional Python packages in your Agent. - -# COMMAND ---------- - -# MAGIC %pip install -qqqq -U -r requirements.txt -# MAGIC # # Restart to load the packages into the Python environment -# MAGIC dbutils.library.restartPython() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Connect to Databricks -# MAGIC -# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set. - -# COMMAND ---------- - -from mlflow.utils import databricks_utils as du -import os - -if not du.is_in_databricks_notebook(): - from databricks.connect import DatabricksSession - - spark = DatabricksSession.builder.getOrCreate() - os.environ["MLFLOW_TRACKING_URI"] = "databricks" - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment -# MAGIC -# MAGIC This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook. - -# COMMAND ---------- - -from cookbook.config.shared.agent_storage_location import AgentStorageConfig -from cookbook.databricks_utils import get_mlflow_experiment_url -from cookbook.config import load_serializable_config_from_yaml_file -import mlflow - -# Load the Agent's storage locations -agent_storage_config: AgentStorageConfig= load_serializable_config_from_yaml_file("./configs/agent_storage_config.yaml") - -# Show the Agent's storage locations -agent_storage_config.pretty_print() - -# set the MLflow experiment -experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name) -# If running in a local IDE, set the MLflow experiment name as an environment variable -os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name - -print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Helper method to log the Agent's code & config to MLflow -# MAGIC -# MAGIC Before we start, let's define a helper method to log the Agent's code & config to MLflow. We will use this to log the agent's code & config to MLflow & the Unity Catalog. 
It is used in evaluation & for deploying to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) (a chat UI for your stakeholders to test this agent) and later, deplying the Agent to production. - -# COMMAND ---------- - - -import mlflow -from mlflow.types.llm import CHAT_MODEL_INPUT_SCHEMA -from mlflow.models.rag_signatures import StringResponse -from mlflow.models import ModelConfig -from cookbook.agents.utils.signatures import STRING_RESPONSE_WITH_MESSAGES -from mlflow.models.signature import ModelSignature -from cookbook.agents.function_calling_agent import FunctionCallingAgent -from cookbook.config.agents.function_calling_agent import FunctionCallingAgentConfig - -# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI -# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run. -# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy! -def log_function_calling_agent_to_mlflow(agent_config: FunctionCallingAgentConfig): - # Get the agent's code path from the imported Agent class - agent_code_path = f"{os.getcwd()}/{FunctionCallingAgent.__module__.replace('.', '/')}.py" - - # Get the pip requirements from the requirements.txt file - with open("requirements.txt", "r") as file: - pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark - - logged_agent_info = mlflow.pyfunc.log_model( - artifact_path="agent", - python_model=agent_code_path, - input_example=agent_config.input_example, - model_config=agent_config.model_dump(), - resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc - signature=ModelSignature( - inputs=CHAT_MODEL_INPUT_SCHEMA, - # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature - outputs=StringResponse() - ), - code_paths=[os.path.join(os.getcwd(), "cookbook")], - pip_requirements=pip_requirements, - ) - - return logged_agent_info - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC ## 1️⃣ Iterate on the Agent's code & config to improve quality -# MAGIC -# MAGIC The below cells are used to execute your inner dev loop to improve the Agent's quality. -# MAGIC -# MAGIC We suggest the following process: -# MAGIC 1. Vibe check the Agent for 5 - 10 queries to verify it works -# MAGIC 2. Make any necessary changes to the code/config -# MAGIC 3. Use Agent Evaluation to evaluate the Agent using your evaluation set, which will provide a quality assessment & identify the root causes of any quality issues -# MAGIC 4. Based on that evaluation, make & test changes to the code/config to improve quality -# MAGIC 5. 🔁 Repeat steps 3 and 4 until you are satisified with the Agent's quality -# MAGIC 6. Deploy the Agent to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) for pre-production testing -# MAGIC 7. Use the following notebooks to review that feedback (optionally adding new records to your evaluation set) & identify any further quality issues -# MAGIC 8. 🔁 Repeat steps 3 and 4 to fix any issues identified in step 7 -# MAGIC 9. 
Deploy the Agent to a production-ready REST API endpoint (using the same cells in this notebook as step 6) -# MAGIC - -# COMMAND ---------- - -# Import Cookbook Agent configurations, which are Pydantic models -from cookbook.config import serializable_config_to_yaml_file -from cookbook.config.agents.function_calling_agent import ( - FunctionCallingAgentConfig, -) -from cookbook.config.data_pipeline import ( - DataPipelineConfig, -) -from cookbook.config.shared.llm import LLMConfig, LLMParametersConfig -from cookbook.config import load_serializable_config_from_yaml_file -from cookbook.tools.vector_search import ( - VectorSearchRetrieverTool, - VectorSearchSchema, -) -from cookbook.tools.uc_tool import UCTool - -import json -import mlflow -import yaml - -######################## -# #### 🚫✏️ Load the Vector Index Unity Cataloglocation from the data pipeline configuration -# Usage: -# - If you used `01_data_pipeline` to create your Vector Index, run this cell. -# - If your Vector Index was created elsewhere, comment out this logic and set the UC location in the Retriever config. -######################## - -#data_pipeline_config: DataPipelineConfig = #load_serializable_config_from_yaml_file( -# "./configs/data_pipeline_config.yaml" -#) - -######################## -# #### ✅✏️ Retriever tool that connects to the Vector Search index -######################## - -retriever_tool = VectorSearchRetrieverTool( - name="search_product_docs", - description="Use this tool to search for product documentation.", - vector_search_index="casaman_ssa.demos.test_product_docs_docs_chunked_index__v2", - vector_search_schema=VectorSearchSchema( - # These columns are the default values used in the `01_data_pipeline` notebook - # If you used a different column names in that notebook OR you are using a pre-built vector index, update the column names here. - chunk_text="content_chunked", # Contains the text of each document chunk - document_uri="doc_uri", # The document URI of the chunk e.g., "/Volumes/catalog/schema/volume/file.pdf" - displayed as the document ID in the Review App - # additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM - ), - # Optional parameters, see VectorSearchRetrieverTool.__doc__ for details. The default values are shown below. - # doc_similarity_threshold=0.0, - # vector_search_parameters=VectorSearchParameters( - # num_results=5, - # query_type="ann" - # ), - # Adding columns here will allow the Agent's LLM to dynamically apply filters based on the user's query. - # filterable_columns=[] -) - -######################## -# #### ✅✏️ Add Unity Catalog tools to the Agent -######################## - -translate_sku_tool = UCTool(uc_function_name="casaman_ssa.demos.sku_sample_translator") - - -######################## -# #### ✅✏️ Add a local Python function as a tool in the Agent -######################## - -from cookbook.tools.local_function import LocalFunctionTool -from tools.sample_tool import sku_sample_translator - -# translate_sku_tool = LocalFunctionTool(func=translate_sku, description="Translates a pre-2024 SKU formatted as 'OLD-XXX-YYYY' to the new SKU format 'NEW-YYYY-XXX'.") - -tools = [retriever_tool, translate_sku_tool] - -######################## -#### ✅✏️ Agent's LLM configuration -######################## - -system_prompt = """ -## Role -You are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request. 
- -## Objective -Your goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses. - -## Instructions -1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. - -2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query. - -3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: "I'm sorry, I can't help you with that." -""".strip() - -fc_agent_config = FunctionCallingAgentConfig( - llm_config=LLMConfig( - llm_endpoint_name="casaman-gpt4", # Model serving endpoint w/ a Chat Completions API - llm_system_prompt_template=system_prompt, # System prompt template - llm_parameters=LLMParametersConfig( - temperature=0.01, max_tokens=1500 - ), # LLM parameters - ), - # Add one or more tools that comply with the CookbookTool interface - tools=tools, -) - -# Print the configuration as a JSON string to see it all together -# print(json.dumps(fc_agent_config.model_dump(), indent=4)) - -######################## -##### Dump the configuration to a YAML -# Optional step, this allows the Agent's code file to be run by itself (e.g., outside of this notebook) using the above configuration. -######################## -# Import the default YAML config file name from the Agent's code file -from cookbook.agents.function_calling_agent import FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME - -# Dump the configuration to a YAML file -serializable_config_to_yaml_file(fc_agent_config, "./configs/"+FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ Optionally, adjust the Agent's code -# MAGIC -# MAGIC Here, we import the Agent's code so we can run the Agent locally within the notebook. To modify the code, open the Agent's code file in a separate window, enable reload, make your changes, and re-run this cell. -# MAGIC -# MAGIC **Typically, when building the first version of your agent, we suggest first trying to tune the configuration (prompts, etc) to improve quality. If you need more control to fix quality issues, you can then modify the Agent's code.** - -# COMMAND ---------- - -from cookbook.agents.function_calling_agent import FunctionCallingAgent -import inspect - -# Print the Agent code for inspection -print(inspect.getsource(FunctionCallingAgent)) - -# COMMAND ---------- - -# MAGIC %load_ext autoreload -# MAGIC %autoreload 3 - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ 🅰 Vibe check the Agent for a single query -# MAGIC -# MAGIC Running this cell will produce an MLflow Trace that you can use to see the Agent's outputs and understand the steps it took to produce that output. -# MAGIC -# MAGIC If you are running in a local IDE, browse to the MLflow Experiment page to view the Trace (link to the Experiment UI is at the top of this notebook). If running in a Databricks Notebook, your trace will appear inline below. 
- -# COMMAND ---------- - -from cookbook.databricks_utils import get_mlflow_experiment_traces_url -from cookbook.agents.function_calling_agent import FunctionCallingAgent - -# Load the Agent's code with the above configuration -agent = FunctionCallingAgent(agent_config=fc_agent_config) - -# Vibe check the Agent for a single query -output = agent.predict(model_input={"messages": [{"role": "user", "content": "What is mlflow in databricks?"}]}) -# output = agent.predict(model_input={"messages": [{"role": "user", "content": "Translate the sku `OLD-abs-1234` to the new format"}]}) - -print(f"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}") -print(f"Agent's final response:\n----\n{output['content']}\n----") -print() -# print(f"Agent's full message history (useful for debugging):\n----\n{json.dumps(output['messages'], indent=2)}\n----") - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Now, let's test a multi-turn conversation with the Agent. - -# COMMAND ---------- - -output['content'] - -# COMMAND ---------- - -second_turn = {'messages': output['messages'] + [{"role": "user", "content": "How can I use it for versioning my model?"}]} - -# Run the Agent again with the same input to continue the conversation -second_turn_output = agent.predict(model_input=second_turn) - -print(f"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}") -print(f"Agent's final response:\n----\n{second_turn_output['content']}\n----") -print() -print(f"Agent's full message history (useful for debugging):\n----\n{json.dumps(second_turn_output['messages'], indent=2)}\n----") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC #### ✅✏️ 🅱 Evaluate the Agent using your evaluation set -# MAGIC -# MAGIC Note: If you do not have an evaluation set, you can create a synthetic evaluation set by using the 03_synthetic_evaluation notebook. - -# COMMAND ---------- - -evaluation_set = spark.table(agent_storage_config.evaluation_set_uc_table) - -mlflow.langchain.autolog(disable=True, log_traces=False) -mlflow.autogen.autolog(log_traces=False) - -with mlflow.start_run(): - logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config) - - # Run the agent for these queries, using Agent evaluation to parallelize the calls - eval_results = mlflow.evaluate( - model=logged_agent_info.model_uri, # use the MLflow logged Agent - data=evaluation_set, # Evaluate the Agent for every row of the evaluation set - model_type="databricks-agent", # use Agent Evaluation - ) - - # Show all outputs. Click on a row in this table to display the MLflow Trace. - display(eval_results.tables["eval_results"]) - - # Click 'View Evaluation Results' to see the Agent's inputs/outputs + quality evaluation displayed in a UI - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## 2️⃣ Deploy a version of your Agent - either to the Review App or Production -# MAGIC -# MAGIC Once you have a version of your Agent that has sufficient quality, you will register the Agent's model from the MLflow Experiment into the Unity Catalog & use Agent Framework's `agents.deploy(...)` command to deploy it. Note these steps are the same for deploying to pre-production (e.g., the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) or production. 
-# MAGIC -# MAGIC By the end of this step, you will have deployed a version of your Agent that you can interact with and share with your business stakeholders for feedback, even if they don't have access to your Databricks workspace: -# MAGIC -# MAGIC 1. A production-ready scalable REST API deployed as a Model Serving endpoint that logged every request/request/MLflow Trace to a Delta Table. -# MAGIC - REST API for querying the Agent -# MAGIC - REST API for sending user feedback from your UI to the Agent -# MAGIC 2. Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) connected to these endpoints. -# MAGIC 3. [Mosiac AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) connected to these endpoints. - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Option 1: Deploy the last agent you logged above - -# COMMAND ---------- - -from databricks import agents - -# Use Unity Catalog as the model registry -mlflow.set_registry_uri("databricks-uc") - -# Register the Agent's model to the Unity Catalog -uc_registered_model_info = mlflow.register_model( - model_uri=logged_agent_info.model_uri, name=agent_storage_config.uc_model_name -) - -# Deploy the model to the review app and a model serving endpoint -agents.deploy(agent_storage_config.uc_model_name, uc_registered_model_info.version) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Option 2: Log the latest copy of the Agent's code/config and deploy it - -# COMMAND ---------- - -from databricks import agents - -# Use Unity Catalog as the model registry -mlflow.set_registry_uri("databricks-uc") - -with mlflow.start_run(): - logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config) - - # Register the Agent's model to the Unity Catalog - uc_registered_model_info = mlflow.register_model( - model_uri=logged_agent_info.model_uri, name=agent_storage_config.uc_model_name - ) - -# Deploy the model to the review app and a model serving endpoint -# agents.deploy(agent_storage_config.uc_model_name, uc_registered_model_info.version) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Load the logged model to test it locally - -# COMMAND ---------- - -mlflow.autogen.autolog(log_traces=False) - -# COMMAND ---------- - -import mlflow - -loaded_model = mlflow.pyfunc.load_model(logged_agent_info.model_uri) - -loaded_model.predict({"messages": [{"role": "user", "content": "A test question?"}]}) \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "31661828-f9bb-4fc2-a1bd-94424a27ed52", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## 👉 START HERE: How to use this notebook\n", + "\n", + "# Step 3: Build, evaluate, & deploy your Agent\n", + "\n", + "Use this notebook to iterate on the code and configuration of your Agent.\n", + "\n", + "By the end of this notebook, you will have 1+ registered versions of your Agent, each coupled with a detailed quality evaluation.\n", + "\n", + "Optionally, you can deploy a version of your Agent that you can interact with in the [Mosiac AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) and let your business stakeholders who don't have Databricks accounts interact with it & provide feedback in the [Review 
App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui).\n", + "\n", + "\n", + "For each version of your agent, you will have an MLflow run inside your MLflow experiment that contains:\n", + "- Your Agent's code & config\n", + "- Evaluation metrics for cost, quality, and latency" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5d9f685a-fdb7-49a4-9e3a-a4a9e964d045", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "**Important note:** Throughout this notebook, we indicate which cell's code you:\n", + "- ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality.\n", + "- 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent\n", + "\n", + "*Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "bb4f8cc0-1797-4beb-a9f2-df21a9db79f0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Install Python libraries\n", + "\n", + "You do not need to modify this cell unless you need additional Python packages in your Agent." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6d4030e8-ae97-4351-bebd-9651d283578f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\nlangchain 0.1.20 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.26 which is incompatible.\nlangchain 0.1.20 requires langsmith<0.2.0,>=0.1.17, but you have langsmith 0.2.3 which is incompatible.\nlangchain-community 0.0.38 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.26 which is incompatible.\nlangchain-community 0.0.38 requires langsmith<0.2.0,>=0.1.0, but you have langsmith 0.2.3 which is incompatible.\nlangchain-text-splitters 0.0.2 requires langchain-core<0.3,>=0.1.28, but you have langchain-core 0.3.26 which is incompatible.\nydata-profiling 4.5.1 requires pandas!=1.4.0,<2.1,>1.1, but you have pandas 2.2.3 which is incompatible.\nydata-profiling 4.5.1 requires pydantic<2,>=1.8.1, but you have pydantic 2.10.4 which is incompatible.\u001B[0m\u001B[31m\n\u001B[0m\u001B[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "%pip install -qqqq -U -r requirements.txt\n", + "# # Restart to load the packages into the Python environment\n", + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fd22db88-5cfe-4c58-bad6-2ecc1984123d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Connect to Databricks\n", + "\n", + "If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0519dc68-b91a-44f5-9120-397f867aa5d9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.utils import databricks_utils as du\n", + "import os\n", + "\n", + "if not du.is_in_databricks_notebook():\n", + " from databricks.connect import DatabricksSession\n", + "\n", + " spark = DatabricksSession.builder.getOrCreate()\n", + " os.environ[\"MLFLOW_TRACKING_URI\"] = \"databricks\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "fe11f86b-748b-490b-a8d6-121485f005a0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment\n", + "\n", + "This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook." 
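Before moving on, a quick optional sanity check can confirm the local wiring. This is a hedged sketch (not part of the notebook as committed): it assumes the `spark` session and the `MLFLOW_TRACKING_URI` environment variable created by the cell above, and only prints what is already configured.

import mlflow
from mlflow.utils import databricks_utils as du

if not du.is_in_databricks_notebook():
    # Databricks Connect session created above; .version round-trips to the cluster
    print(f"Connected to Spark: {spark.version}")
    # Should report 'databricks' once MLFLOW_TRACKING_URI is set
    print(f"MLflow tracking URI: {mlflow.get_tracking_uri()}")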
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c0f8cf87-4858-4dd3-97f4-2ce9a7ed01a1", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "{\n \"uc_model_name\": \"casaman_ssa.demos.my_agent_autogen\",\n \"evaluation_set_uc_table\": \"casaman_ssa.demos.my_agent_autogen_eval_set\",\n \"mlflow_experiment_name\": \"/Users/manffred.calvosanchez@databricks.com/my_agent_autogen_mlflow_experiment\",\n \"class_path\": \"cookbook.config.shared.agent_storage_location.AgentStorageConfig\"\n}\nView the MLflow Experiment `/Users/manffred.calvosanchez@databricks.com/my_agent_autogen_mlflow_experiment` at https://adb-984752964297111.11.azuredatabricks.net/ml/experiments/2822477370659093\n" + ] + } + ], + "source": [ + "from cookbook.config.shared.agent_storage_location import AgentStorageConfig\n", + "from cookbook.databricks_utils import get_mlflow_experiment_url\n", + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "import mlflow \n", + "\n", + "# Load the Agent's storage locations\n", + "agent_storage_config: AgentStorageConfig= load_serializable_config_from_yaml_file(\"./configs/agent_storage_config.yaml\")\n", + "\n", + "# Show the Agent's storage locations\n", + "agent_storage_config.pretty_print()\n", + "\n", + "# set the MLflow experiment\n", + "experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name)\n", + "# If running in a local IDE, set the MLflow experiment name as an environment variable\n", + "os.environ[\"MLFLOW_EXPERIMENT_NAME\"] = agent_storage_config.mlflow_experiment_name\n", + "\n", + "print(f\"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "ffbbda4c-68b1-47b9-9663-ca7db4387a31", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Helper method to log the Agent's code & config to MLflow\n", + "\n", + "Before we start, let's define a helper method that logs the Agent's code & config to MLflow & the Unity Catalog. It is used during evaluation & when deploying the Agent to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) (a chat UI for your stakeholders to test this agent) and, later, to production."
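For reference, here is how the helper defined in the next cell is typically invoked later in this notebook. This is a minimal sketch that assumes `fc_agent_config` has already been built as shown in the configuration cell further below.

import mlflow

# Log the Agent's code & config as an MLflow model; keep the URI for evaluation or registration.
with mlflow.start_run():
    logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config)

print(logged_agent_info.model_uri)  # e.g., runs:/<run_id>/agent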
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6aa41c49-ac68-47eb-bd54-770ffdb1748f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception ignored on calling ctypes callback function: .match_module_callback at 0x7fde57cdf6a0>\nTraceback (most recent call last):\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n self._make_module_from_path(filepath)\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n module = module_class(filepath, prefix, user_api, internal_api)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 606, in __init__\n self.version = self.get_version()\n ^^^^^^^^^^^^^^^^^^\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 646, in get_version\n config = get_config().split()\n ^^^^^^^^^^^^^^^^^^\nAttributeError: 'NoneType' object has no attribute 'split'\nWARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + } + ], + "source": [ + "\n", + "import mlflow\n", + "from mlflow.types.llm import CHAT_MODEL_INPUT_SCHEMA\n", + "from mlflow.models.rag_signatures import StringResponse\n", + "from mlflow.models import ModelConfig\n", + "from cookbook.agents.utils.signatures import STRING_RESPONSE_WITH_MESSAGES\n", + "from mlflow.models.signature import ModelSignature\n", + "from cookbook.agents.function_calling_agent import FunctionCallingAgent\n", + "from cookbook.config.agents.function_calling_agent import FunctionCallingAgentConfig\n", + "\n", + "# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI\n", + "# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run.\n", + "# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy!\n", + "def log_function_calling_agent_to_mlflow(agent_config: FunctionCallingAgentConfig):\n", + " # Get the agent's code path from the imported Agent class\n", + " agent_code_path = f\"{os.getcwd()}/{FunctionCallingAgent.__module__.replace('.', '/')}.py\"\n", + "\n", + " # Get the pip requirements from the requirements.txt file\n", + " with open(\"requirements.txt\", \"r\") as file:\n", + " pip_requirements = [line.strip() for line in file.readlines()] + [\"pyspark\"] # manually add pyspark\n", + "\n", + " logged_agent_info = mlflow.pyfunc.log_model(\n", + " artifact_path=\"agent\",\n", + " python_model=agent_code_path,\n", + " input_example=agent_config.input_example,\n", + " model_config=agent_config.model_dump(),\n", + " resources=agent_config.get_resource_dependencies(), # This allows the 
agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc\n", + " signature=ModelSignature(\n", + " inputs=CHAT_MODEL_INPUT_SCHEMA,\n", + " # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature\n", + " outputs=StringResponse()\n", + " ),\n", + " code_paths=[os.path.join(os.getcwd(), \"cookbook\")],\n", + " pip_requirements=pip_requirements,\n", + " )\n", + "\n", + " return logged_agent_info" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9933d05f-29fa-452e-abdc-2a02328fbe22", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "## 1️⃣ Iterate on the Agent's code & config to improve quality\n", + "\n", + "Use the cells below to run your inner dev loop and improve the Agent's quality.\n", + "\n", + "We suggest the following process:\n", + "1. Vibe check the Agent for 5 - 10 queries to verify it works\n", + "2. Make any necessary changes to the code/config\n", + "3. Use Agent Evaluation to evaluate the Agent using your evaluation set, which will provide a quality assessment & identify the root causes of any quality issues (see the sketch below)\n", + "4. Based on that evaluation, make & test changes to the code/config to improve quality\n", + "5. 🔁 Repeat steps 3 and 4 until you are satisfied with the Agent's quality\n", + "6. Deploy the Agent to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) for pre-production testing\n", + "7. Use the following notebooks to review that feedback (optionally adding new records to your evaluation set) & identify any further quality issues\n", + "8. 🔁 Repeat steps 3 and 4 to fix any issues identified in step 7\n", + "9. Deploy the Agent to a production-ready REST API endpoint (using the same cells in this notebook as step 6)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "278bef74-fedf-43be-8187-c302c4442a61", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster.
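The sketch referenced in step 3 above: log the current Agent, run Agent Evaluation over your evaluation set, and inspect the per-row results. This condenses the evaluation cell that appears later in this notebook and assumes `agent_storage_config`, `fc_agent_config`, and the `log_function_calling_agent_to_mlflow` helper defined above.

import mlflow

# Load the evaluation set registered in the Agent setup notebook
evaluation_set = spark.table(agent_storage_config.evaluation_set_uc_table)

with mlflow.start_run():
    logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config)

    # Agent Evaluation runs the logged Agent for every row of the evaluation set
    eval_results = mlflow.evaluate(
        model=logged_agent_info.model_uri,
        data=evaluation_set,
        model_type="databricks-agent",
    )

    # Each row links to the MLflow Trace for that query
    display(eval_results.tables["eval_results"])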
To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + } + ], + "source": [ + "# Import Cookbook Agent configurations, which are Pydantic models\n", + "from cookbook.config import serializable_config_to_yaml_file\n", + "from cookbook.config.agents.function_calling_agent import (\n", + " FunctionCallingAgentConfig,\n", + ")\n", + "from cookbook.config.data_pipeline import (\n", + " DataPipelineConfig,\n", + ")\n", + "from cookbook.config.shared.llm import LLMConfig, LLMParametersConfig\n", + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "from cookbook.tools.vector_search import (\n", + " VectorSearchRetrieverTool,\n", + " VectorSearchSchema,\n", + ")\n", + "from cookbook.tools.uc_tool import UCTool\n", + "\n", + "import json\n", + "import mlflow\n", + "import yaml\n", + "\n", + "########################\n", + "# #### 🚫✏️ Load the Vector Index Unity Catalog location from the data pipeline configuration\n", + "# Usage:\n", + "# - If you used `01_data_pipeline` to create your Vector Index, run this cell.\n", + "# - If your Vector Index was created elsewhere, comment out this logic and set the UC location in the Retriever config.\n", + "########################\n", + "\n", + "# data_pipeline_config: DataPipelineConfig = load_serializable_config_from_yaml_file(\n", + "# \"./configs/data_pipeline_config.yaml\"\n", + "#)\n", + "\n", + "########################\n", + "# #### ✅✏️ Retriever tool that connects to the Vector Search index\n", + "########################\n", + "\n", + "retriever_tool = VectorSearchRetrieverTool(\n", + " name=\"search_product_docs\",\n", + " description=\"Use this tool to search for product documentation.\",\n", + " vector_search_index=\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\",\n", + " vector_search_schema=VectorSearchSchema(\n", + " # These columns are the default values used in the `01_data_pipeline` notebook\n", + " # If you used different column names in that notebook OR you are using a pre-built vector index, update the column names here.\n", + " chunk_text=\"content_chunked\", # Contains the text of each document chunk\n", + " document_uri=\"doc_uri\", # The document URI of the chunk e.g., \"/Volumes/catalog/schema/volume/file.pdf\" - displayed as the document ID in the Review App\n", + " # additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM\n", + " ),\n", + " # Optional parameters, see VectorSearchRetrieverTool.__doc__ for details.
The default values are shown below.\n", + " # doc_similarity_threshold=0.0,\n", + " # vector_search_parameters=VectorSearchParameters(\n", + " # num_results=5,\n", + " # query_type=\"ann\"\n", + " # ),\n", + " # Adding columns here will allow the Agent's LLM to dynamically apply filters based on the user's query.\n", + " # filterable_columns=[]\n", + ")\n", + "\n", + "########################\n", + "# #### ✅✏️ Add Unity Catalog tools to the Agent\n", + "########################\n", + "\n", + "translate_sku_tool = UCTool(uc_function_name=\"casaman_ssa.demos.sku_sample_translator\")\n", + "\n", + "\n", + "########################\n", + "# #### ✅✏️ Add a local Python function as a tool in the Agent\n", + "########################\n", + "\n", + "from cookbook.tools.local_function import LocalFunctionTool\n", + "from tools.sample_tool import sku_sample_translator\n", + "\n", + "# translate_sku_tool = LocalFunctionTool(func=sku_sample_translator, description=\"Translates a pre-2024 SKU formatted as 'OLD-XXX-YYYY' to the new SKU format 'NEW-YYYY-XXX'.\")\n", + "\n", + "tools = [retriever_tool, translate_sku_tool]\n", + "\n", + "########################\n", + "#### ✅✏️ Agent's LLM configuration\n", + "########################\n", + "\n", + "system_prompt = \"\"\"\n", + "## Role\n", + "You are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\n", + "\n", + "## Objective\n", + "Your goal is to provide accurate, relevant, and helpful responses based solely on the outputs from these tools. You are concise and direct in your responses.\n", + "\n", + "## Instructions\n", + "1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \n", + "\n", + "2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\n", + "\n", + "3. **Select the appropriate tool(s) OR ask follow-up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow-up questions to refine their request.
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \"I'm sorry, I can't help you with that.\"\n", + "\"\"\".strip()\n", + "\n", + "fc_agent_config = FunctionCallingAgentConfig(\n", + " llm_config=LLMConfig(\n", + " llm_endpoint_name=\"databricks-meta-llama-3-3-70b-instruct\", # Model serving endpoint w/ a Chat Completions API\n", + " llm_system_prompt_template=system_prompt, # System prompt template\n", + " llm_parameters=LLMParametersConfig(\n", + " temperature=0.01, max_tokens=1500\n", + " ), # LLM parameters\n", + " ),\n", + " # Add one or more tools that comply with the CookbookTool interface\n", + " tools=tools,\n", + ")\n", + "\n", + "# Print the configuration as a JSON string to see it all together\n", + "# print(json.dumps(fc_agent_config.model_dump(), indent=4))\n", + "\n", + "########################\n", + "##### Dump the configuration to a YAML\n", + "# Optional step, this allows the Agent's code file to be run by itself (e.g., outside of this notebook) using the above configuration.\n", + "########################\n", + "# Import the default YAML config file name from the Agent's code file\n", + "from cookbook.agents.function_calling_agent import FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME\n", + "\n", + "# Dump the configuration to a YAML file\n", + "serializable_config_to_yaml_file(fc_agent_config, \"./configs/\"+FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c7ee1de3-71f1-49a0-a511-d629dfca61d4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ Optionally, adjust the Agent's code\n", + "\n", + "Here, we import the Agent's code so we can run the Agent locally within the notebook. To modify the code, open the Agent's code file in a separate window, enable reload, make your changes, and re-run this cell.\n", + "\n", + "**Typically, when building the first version of your agent, we suggest first trying to tune the configuration (prompts, etc) to improve quality. If you need more control to fix quality issues, you can then modify the Agent's code.**" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "f31e7944-03ab-4c71-8a53-841435843ed0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "class FunctionCallingAgent(mlflow.pyfunc.PythonModel):\n \"\"\"\n Class representing an Agent that does function-calling with tools using Autogen\n \"\"\"\n\n def __init__(\n self,\n agent_config: Optional[Union[FunctionCallingAgentConfig, str]] = None\n ):\n super().__init__()\n # Empty variables that will be initialized after loading the agent config.\n self.agent_config = None\n self.tools = None\n\n # load the Agent's configuration. See load_config() for details.\n self.agent_config = load_config(\n passed_agent_config=agent_config,\n default_config_file_name=FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME,\n )\n if not self.agent_config:\n logging.error(\n f\"No agent config found. If you are in your local development environment, make sure you either [1] are calling init(agent_config=...) 
with either an instance of FunctionCallingAgentConfig or the full path to a YAML config file or [2] have a YAML config file saved at {{your_project_root_folder}}/configs/{FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME}.\"\n )\n else:\n logging.info(\"Successfully loaded agent config in __init__.\")\n self.tools = self.agent_config.tools\n\n def create_agents(self, chat_history):\n\n def is_termination_message(message):\n content = message.get(\"content\", \"\")\n return (content and \"TERMINATE\" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message)\n\n # The user proxy agent is used for interacting with the assistant agent\n # and executes tool calls.\n user_proxy = ConversableAgent(\n name=\"User\",\n llm_config=False,\n is_termination_msg=is_termination_message,\n human_input_mode=\"NEVER\",\n )\n \n llm_config = self.agent_config.llm_config\n \n system_prompt = llm_config.llm_system_prompt_template\n\n config_list = [{\n \"model_client_cls\": \"DatabricksModelServingClient\",\n \"model\": llm_config.llm_endpoint_name,\n \"endpoint_name\": llm_config.llm_endpoint_name,\n \"llm_config\": llm_config.llm_parameters.dict()}]\n\n assistant = ConversableAgent(\n name=\"Assistant\",\n system_message=system_prompt,\n llm_config={\"config_list\": config_list, \"cache_seed\": None},\n chat_messages={user_proxy: chat_history}\n )\n\n for tool in self.tools:\n if isinstance(tool, UCTool):\n tool._toolkit.tools[0].register_function(callers = assistant,\n executors = user_proxy )\n else:\n register_function(tool,\n caller=assistant,\n executor=user_proxy, \n name=tool.name,\n description=tool.description)\n\n return assistant, user_proxy\n \n\n @mlflow.trace(name=\"agent\", span_type=\"AGENT\")\n def predict(\n self,\n context: Any = None,\n model_input: Union[ChatCompletionRequest, Dict, pd.DataFrame] = None,\n params: Any = None,\n ) -> StringResponse:\n # Check here to allow the Agent class to be initialized without a configuration file, which is required to import the class as a module in other files.\n if not self.agent_config:\n raise RuntimeError(\"Agent config not loaded. 
Cannot call predict()\")\n\n ##############################################################################\n # Extract `messages` key from the `model_input`\n messages = get_messages_array(model_input)\n\n ##############################################################################\n # Parse `messages` array into the user's query & the chat history\n with mlflow.start_span(name=\"parse_input\", span_type=\"PARSER\") as span:\n span.set_inputs({\"messages\": messages})\n # in a multi-agent setting, the last message can be from another assistant, not the user\n last_message_content = extract_user_query_string(messages)\n last_message_role = messages[-1][\"role\"]\n last_message = {\"role\": last_message_role, \"content\": last_message_content}\n # Save the history inside the Agent's internal state\n chat_history = extract_chat_history(messages)\n span.set_outputs(\n {\n \"last_message\": last_message,\n \"chat_history\": chat_history\n }\n )\n\n ##############################################################################\n # Call the LLM to recursively calls tools and eventually deliver a generation to send back to the user\n (\n model_response,\n messages_log_with_tool_calls,\n ) = self.recursively_call_and_run_tools(last_message=last_message, \n chat_history=chat_history)\n\n return {\n \"content\": model_response['content'],\n # messages should be returned back to the Review App (or any other front end app) and stored there so it can be passed back to this stateless agent with the next turns of converastion.\n \"messages\": messages_log_with_tool_calls,\n }\n\n @mlflow.trace(span_type=\"AGENT\")\n def recursively_call_and_run_tools(self, \n last_message, \n chat_history,\n last_max_iter=10):\n\n assistant, user_proxy = self.create_agents(chat_history)\n\n assistant.register_model_client(model_client_cls=DatabricksModelServingClient)\n\n model_response = user_proxy.initiate_chat(assistant, \n message=last_message['content'],\n max_turns=last_max_iter,\n clear_history=False)\n\n return assistant.last_message(user_proxy), assistant.chat_messages[user_proxy]\n\n" + ] + } + ], + "source": [ + "from cookbook.agents.function_calling_agent import FunctionCallingAgent\n", + "import inspect\n", + "\n", + "# Print the Agent code for inspection\n", + "print(inspect.getsource(FunctionCallingAgent))" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "620cf2e2-dcf6-4390-a616-2d31b3d17f5b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "%load_ext autoreload\n", + "%autoreload 3" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9015eac3-e0bc-4033-b8b5-23caf96dc1e5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ 🅰 Vibe check the Agent for a single query\n", + "\n", + "Running this cell will produce an MLflow Trace that you can use to see the Agent's outputs and understand the steps it took to produce that output.\n", + "\n", + "If you are running in a local IDE, browse to the MLflow Experiment page to view the Trace (link to the Experiment UI is at the top of this notebook). If running in a Databricks Notebook, your trace will appear inline below." 
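Besides browsing traces in the UI, you can also pull them programmatically for quick inspection. A hedged sketch, assuming an MLflow version that ships the tracing search API (`mlflow.search_traces`, available in recent 2.x releases) and the `experiment_info` object set earlier in this notebook.

import mlflow

# Returns a pandas DataFrame with one row per trace (request, response, latency, spans, ...)
traces_df = mlflow.search_traces(
    experiment_ids=[experiment_info.experiment_id],
    max_results=5,
)
display(traces_df)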
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "324e5519-8303-40ce-ac20-26e0c4c88eb7", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:42:54] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:42:54] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:42:54] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat is mlflow in databricks?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_9a1e76b7-0b7f-4598-92f4-c5784c31e556): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"mlflow in databricks\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to 
Assistant):\n\n\u001B[32m***** Response from calling tool (call_9a1e76b7-0b7f-4598-92f4-c5784c31e556) *****\u001B[0m\n[{\"page_content\": \"build models iteratively and deploy across\\n\\nmultiple platforms\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n###### Providing managed MLflow on Databricks\\n\\nMLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\\n\\npackaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\\n\\nBy using MLflow as part of Databricks, data scientists can:\\n\\n\\n**WORKSPACES**\\n\\nBenefit from a streamlined\\n\\nexperiment tracking experience\\n\\nwith Databricks Workspace and\\n\\ncollaborative Notebooks\\n\\n\\n**BIG DATA SNAPSHOTS**\\n\\nTrack large-scale data that fed\\n\\nthe models, along with all the\\n\\nother model parameters, then\\n\\n\\n**JOBS**\\n\\nEasily initiate jobs remotely, from\\n\\nan on-premises environment or\\n\\nfrom Databricks notebooks\\n\\n\\n**SECURITY**\\n\\nTake advantage of one common\\n\\nsecurity model for the entire\\n\\nmachine learning lifecycle\\n\\n\\nreproduce training runs reliably\\n\\n\\nRead our [blog](https://databricks.com/blog/2019/03/06/managed-mlflow-on-databricks-now-in-public-preview.html) to learn more about these integrations.\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n###### Getting data ready for ML with Delta Lake\\n\\nDelta Lake is a storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions and scalable metadata handling, and it unifies streaming and batch\\n\\ndata processing. Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs.\\n\\nBy using Delta Lake, data engineers and data scientists can keep track of data used for model training.\\n\\nFiles ML Runtime\\n\\n- \\u0007Schema enforced high\\n\\nquality data\\n\\n\\n\\n- Optimized performance\\n\\n��\\n\\n- \\u0007Full data lineage /\\n\\ngovernance\\n\\n- \\u0007reproductibility through\\n\\ntime travel\\n\\n\\nStreaming\\n\\nBatch\\n\\n\\nIngestion\\n\\nTables\\n\\n\\nIngestion\\n\\n\\nData\\n\\nCatalog\\n\\n\\nData\\n\\n\\nFeature\\n\\nStore\\n\\n\\nFeature\\n\\n\\n**Y O U R E X I S T I N G D E LTA L A K E**\\n\\n\\n3rd Party Data\\n\\nMarketplace\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n###### Ready-to-use ML environments\\n\\nDatabricks Runtime for Machine Learning provides data scientists and ML practitioners with on-demand access to ready-to-use machine learning clusters that are\\n\\npreconfigured with the latest and most popular machine learning frameworks, including TensorFlow, Keras, PyTorch, scikit-learn, XGBoost and Horovod.\\n\\nBy using the Databricks Runtime for ML, data scientists can get to results faster with one-click access to ML clusters, optimized performance on popular ML algorithms,\\n\\nand simplified distributed deep learning on Horovod and GPUs. 
It also supports Conda for further customization.\\n\\n\\n**P A C K A G E S A N D O P T I M I Z E S M O S T**\\n\\n**C O M M O N M L F R A M E W O R K S**\\n\\n\\n**C U S T O M I Z E D E N V I R O N M E N T S**\\n\\n**U S I N G C O N D A**\", \"metadata\": {\"similarity_score\": 0.006972813, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"8363863c52751f29adfa98d88820bec9\"}, {\"page_content\": \"Databricks also provides a fully managed and hosted version of MLflow with enterprise security features,\\n\\nhigh availability, and other Databricks workspace features such as experiment and run management and\\n\\nnotebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\\n\\nmachine learning model training runs and running machine learning projects.\\n\\n\\n-----\\n\\n###### Databricks and MLflow Autologging\\n\\nDatabricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\\n\\nexperiment tracking for machine learning training sessions on Databricks. Databricks Autologging\\n\\n\\nautomatically captures model parameters, metrics, files and lineage information when you train models with\\n\\ntraining runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html) .\\n| |\\n\\n###### Feature Store\\n\\nThe Databricks Feature Store is a centralized repository of features. It enables feature sharing and discovery\\n\\n\\nacross an organization and also ensures that the same feature computation code is used for model training\\n\\nand inference. See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html) .\\n| |\\n\\n###### MLflow Model Serving\\n\\nMLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints\\n\\n\\nthat are updated automatically based on the availability of model versions and their stages. See\\n\\ndocumentation for [AWS](https://docs.databricks.com/applications/mlflow/model-serving.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/model-serving) [GCP](https://docs.gcp.databricks.com/applications/mlflow/model-serving.html) .\\n| |\\n\\n###### Databricks SQL\\n\\nDatabricks SQL provides a simple experience for SQL users who want to run quick ad hoc queries on their\\n\\n\\ndata lake, create multiple visualization types to explore query results from different perspectives, and build\\n\\nand share dashboards. See documentation for [AWS](https://docs.databricks.com/sql/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/sql/) [GCP](https://docs.gcp.databricks.com/sql/index.html) .\\n| |\\n\\n###### Databricks Workflows and Jobs\\n\\nDatabricks Workflows (Jobs and Delta Live Tables) can execute pipelines in automated, non-interactive\\n\\n\\nways. 
For ML, Jobs can be used to define pipelines for computing features, training models, or other ML\\n\\nsteps or pipelines. See documentation for [AWS](https://docs.databricks.com/data-engineering/jobs/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/data-engineering/jobs/) [GCP](https://docs.gcp.databricks.com/data-engineering/jobs/index.html) .\\n| |\\n\\n\\n-----\\n\\n#### Reference architecture\", \"metadata\": {\"similarity_score\": 0.006358216, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf\"}, \"id\": \"bfb585054ac3d95182da37f6cea4c11b\"}, {\"page_content\": \"###### Use case examples\\n\\nLet‘s examine three use cases to explore how users can leverage some of the MLflow components.\\n\\n\\n**EXPERIMENT TRACKING** A European energy\\n\\ncompany is using MLflow to track and update\\n\\nhundreds of energy-grid models. This company’s\\n\\ngoal is to build a time-series model for every major\\n\\nenergy producer (e.g., power plant) and consumer\\n\\n(e.g., factory), monitor these models using standard\\n\\nmetrics, and combine the predictions to drive\\n\\nbusiness processes, such as pricing. Because a\\n\\nsingle team is responsible for hundreds of models,\\n\\npossibly using different ML libraries, it’s important to\\n\\nhave a standard development and tracking process.\\n\\nThe team has standardized on Jupyter notebooks\\n\\nfor development, MLflow Tracking for metrics, and\\n\\nDatabricks Jobs for inference.\\n\\n\\n**REPRODUCIBLE PROJECTS** An online marketplace\\n\\nis using MLflow to package deep learning jobs using\\n\\nKeras and run them in the cloud. Each data scientist\\n\\ndevelops models locally on a laptop using a small\\n\\ndata set, checks them into a Git repository with\\n\\nan MLproject file, and submits remote runs of the\\n\\nproject to GPU instances in the cloud for large-scale\\n\\ntraining or hyperparameter search. Using MLflow\\n\\nProjects makes it easy to create the same software\\n\\nenvironment in the cloud and share project code\\n\\namong data scientists.\\n\\n\\n**MODEL PACKAGING** An e-commerce site’s data\\n\\nscience team is using MLflow Model Registry to\\n\\npackage recommendation models for use by\\n\\napplication engineers. This presents a technical\\n\\nchallenge because the recommendation\\n\\napplication includes both a standard, off-the-shelf\\n\\nrecommendation model and custom business logic\\n\\nfor pre- and post-processing. For example, the\\n\\napplication might include custom code to ensure the\\n\\nrecommended items are diverse. This business logic\\n\\nneeds to change in sync with the model, and the data\\n\\nscience team wants to control both the business logic\\n\\nand the model, without having to submit a patch to\\n\\nthe web application each time the logic has to change.\\n\\nMoreover, the team wants to A/B test distinct models\\n\\nwith distinct versions of the processing logic. 
The\\n\\nsolution was to package both the recommendation\\n\\nmodel and the custom logic using the python_\\n\\nfunction flavor in an MLflow Model, which can then\\n\\nbe deployed and tested as a single unit.\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n###### Open and extensible by design\\n\\nSince we [unveiled](https://databricks.com/blog/2018/06/05/introducing-mlflow-an-open-source-machine-learning-platform.html) and open sourced MLflow in June 2018 at the Spark + AI Summit in San Francisco, community engagement and contributions have led to an impressive\\n\\narray of new features and integrations:\\n\\n\\n**SUPPORT FOR MULTIPLE**\\n\\n**PROGRAMMING LANGUAGES**\\n\\nTo give developers a choice, MLflow supports R,\\n\\nPython, Java and Scala, along with a REST server\\n\\ninterface that can be used from any language.\\n\\n\\n**INTEGRATION WITH POPULAR ML**\\n\\n**LIBRARIES AND FRAMEWORKS**\\n\\nMLflow has built-in integrations with the most popular\\n\\nmachine learning libraries — such as scikit-learn,\\n\\nTensorFlow, Keras, PyTorch, H2O, and Apache Spark™\\n\\nMLlib — to help teams build, test and deploy machine\\n\\nlearning applications.\\n\\n\\n**CROSS-CLOUD SUPPORT**\\n\\nOrganizations can use MLflow to quickly deploy\\n\\nmachine learning models to multiple cloud services,\\n\\nincluding Databricks, Azure Machine Learning and\\n\\nAmazon SageMaker, depending on their needs.\", \"metadata\": {\"similarity_score\": 0.0060572196, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"636979694b7ea0dbc65eb44b5917fb2a\"}, {\"page_content\": \"Databricks Runtime for Machine Learning provides data scientists and ML practitioners with on-demand access to ready-to-use machine learning clusters that are\\n\\npreconfigured with the latest and most popular machine learning frameworks, including TensorFlow, Keras, PyTorch, scikit-learn, XGBoost and Horovod.\\n\\nBy using the Databricks Runtime for ML, data scientists can get to results faster with one-click access to ML clusters, optimized performance on popular ML algorithms,\\n\\nand simplified distributed deep learning on Horovod and GPUs. 
It also supports Conda for further customization.\\n\\n\\n**P A C K A G E S A N D O P T I M I Z E S M O S T**\\n\\n**C O M M O N M L F R A M E W O R K S**\\n\\n\\n**C U S T O M I Z E D E N V I R O N M E N T S**\\n\\n**U S I N G C O N D A**\\n\\n\\n**C U S T O M I Z E D E N V I R O N M E N T S**\\n\\n\\nrequirements.txt\\nconda.yaml\\n\\n\\n**...**\\n\\n\\n**B U I LT- I N O P T I M I Z AT I O N F O R**\\n\\n**D I S T R I B U T E D D E E P L E A R N I N G**\\n\\nDistribute and Scale any Single-Machine\\nML Code to thousands of machines\\n\\n\\n**B U I LT- I N A U T O M L A N D**\\n\\n**E X P E R I M E N T T R A C K I N G**\\n\\n\\nMachine\\n\\nLearning\\n\\n\\nMachine\\n\\n\\n\\nAuto ML and Tracking /\\nVisualizations with MLflow\\n\\n\\nConda-\\n\\nBased\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\nCHAPTER 7: **\\u0007** **Standardizing the**\\n\\n#### Machine Learning\\n Lifecycle on Databricks\\n\\n**B U I L D M O D E L**\\n**P R E P D ATA**\\n\\n��\\n\\nAzure ML\\n\\n**D E P L O Y M O D E L**\\n\\n��\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\nCHAPTER 8: **\\u0007** **Getting Started**\\nTake the next step toward standardizing your ML lifecycle — test drive MLflow and the\\n\\nDatabricks Unified Data Analytics Platform.\\n\\n**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z E D D E M O](https://databricks.com/contact)**\\n\\n**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\nCHAPTER 8: **\\u0007** **Comparison Matrix**\", \"metadata\": {\"similarity_score\": 0.005511811, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"2dacc7ad0116436f284e92e08d18ac49\"}, {\"page_content\": \"CHAPTER 8: **\\u0007** **Getting Started**\\nTake the next step toward standardizing your ML lifecycle — test drive MLflow and the\\n\\nDatabricks Unified Data Analytics Platform.\\n\\n**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z E D D E M O](https://databricks.com/contact)**\\n\\n**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\nCHAPTER 8: **\\u0007** **Comparison Matrix**\\n\\n|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W   Self-hosted                |M A N A G E D M L F L O W O N D ATA B R I C K S   Fully managed    With remote execution             |\\n|---|---|---|\\n\\n\\n-----\", \"metadata\": {\"similarity_score\": 0.0051848004, \"doc_uri\": 
\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"275743e1dc14a362c47022ab6134084c\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nMLflow is a platform that helps manage the machine learning lifecycle, and it is natively integrated with the Databricks Unified Data Analytics Platform. It provides features such as experiment tracking, reproducible projects, and model packaging, and is designed to be open and extensible. MLflow on Databricks offers an integrated experience for tracking and securing machine learning model training runs and running machine learning projects. It also provides a fully managed and hosted version of MLflow with enterprise security features, high availability, and other Databricks workspace features. Additionally, MLflow has built-in integrations with popular machine learning libraries and frameworks, and supports multiple programming languages.\n\n--------------------------------------------------------------------------------\nView the MLflow Traces at https://adb-984752964297111.11.azuredatabricks.net/ml/experiments/2822477370659093?compareRunsMode=TRACES\nAgent's final response:\n----\nMLflow is a platform that helps manage the machine learning lifecycle, and it is natively integrated with the Databricks Unified Data Analytics Platform. It provides features such as experiment tracking, reproducible projects, and model packaging, and is designed to be open and extensible. MLflow on Databricks offers an integrated experience for tracking and securing machine learning model training runs and running machine learning projects. It also provides a fully managed and hosted version of MLflow with enterprise security features, high availability, and other Databricks workspace features. 
Additionally, MLflow has built-in integrations with popular machine learning libraries and frameworks, and supports multiple programming languages.\n----\n\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/databricks.mlflow.trace": "\"tr-32320289e5be4eea8fd73d53e02c7e87\"", + "text/plain": [ + "Trace(request_id=tr-32320289e5be4eea8fd73d53e02c7e87)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "from cookbook.databricks_utils import get_mlflow_experiment_traces_url\n", + "from cookbook.agents.function_calling_agent import FunctionCallingAgent\n", + "\n", + "# Load the Agent's code with the above configuration\n", + "agent = FunctionCallingAgent(agent_config=fc_agent_config)\n", + "\n", + "# Vibe check the Agent for a single query\n", + "output = agent.predict(model_input={\"messages\": [{\"role\": \"user\", \"content\": \"What is mlflow in databricks?\"}]})\n", + "# output = agent.predict(model_input={\"messages\": [{\"role\": \"user\", \"content\": \"Translate the sku `OLD-abs-1234` to the new format\"}]})\n", + "\n", + "print(f\"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}\")\n", + "print(f\"Agent's final response:\\n----\\n{output['content']}\\n----\")\n", + "print()\n", + "# print(f\"Agent's full message history (useful for debugging):\\n----\\n{json.dumps(output['messages'], indent=2)}\\n----\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "91e615a1-bedf-44e6-9b8f-27ba849d7197", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Now, let's test a multi-turn conversation with the Agent." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8fb2d75a-821f-4479-9bea-0997f556637e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'MLflow is a platform that helps manage the machine learning lifecycle, and it is natively integrated with the Databricks Unified Data Analytics Platform. It provides features such as experiment tracking, reproducible projects, and model packaging, and is designed to be open and extensible. MLflow on Databricks offers an integrated experience for tracking and securing machine learning model training runs and running machine learning projects. It also provides a fully managed and hosted version of MLflow with enterprise security features, high availability, and other Databricks workspace features. 
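The multi-turn output shown next is produced by feeding the first turn's `messages` back into the stateless Agent together with a new user question. A minimal sketch of that call, mirroring the multi-turn cell whose output follows.

# The Agent is stateless: pass the accumulated message history plus the new user turn
second_turn = {
    "messages": output["messages"]
    + [{"role": "user", "content": "How can I use it for versioning my model?"}]
}

second_turn_output = agent.predict(model_input=second_turn)
print(second_turn_output["content"])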
Additionally, MLflow has built-in integrations with popular machine learning libraries and frameworks, and supports multiple programming languages.'" + ] + }, + "execution_count": 8, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "output['content']" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "65b00c1d-bd93-477c-afaa-0f9a38f18672", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:43:10] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:43:10] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:43:10] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nHow can I use it for versioning my model?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_a6a5d604-8249-4f2f-a38a-340d5e923d42): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"MLflow versioning model\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_a6a5d604-8249-4f2f-a38a-340d5e923d42) *****\u001B[0m\n[{\"page_content\": \"**Parameters**\\n\\n\\n**Metrics** **Artifacts**\\n\\n\\nThe Model Registry gives MLflow users new\\n\\n\\ntools for sharing, reviewing and managing\\n\\nML models throughout their 
lifecycle\\n\\n\\n**Metadata** **Models**\\n\\n**R E S T S E R V I N G**\\n\\n**R E V I E W E R S + C I / C D T O O L S**\\n\\nThe MLflow Model Registry complements the MLflow offering and is designed to help organizations\\n\\nimplement good engineering principles with machine learning initiatives, such as collaboration,\\n\\ngovernance, reproducibility and knowledge management. The next few pages highlight some of the key\\n\\nfeatures of this new component.\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n\\n###### One hub for managing ML models collaboratively\\n\\nBuilding and deploying ML models is a team sport. Not only are the responsibilities\\n\\nalong the machine learning model lifecycle often split across multiple people\\n\\n(e.g., data scientists train models whereas production engineers deploy them),\\n\\nbut also at each lifecycle stage, teams can benefit from collaboration and sharing\\n\\n\\n###### Flexible CI/CD pipelines to manage stage transitions\\n\\nMLflow lets you manage your models’ lifecycles either manually or through\\n\\nautomated tools. Analogous to the approval process in software engineering,\\n\\nusers can manually request to move a model to a new lifecycle stage (e.g., from\\n\\nstaging to production), and review or comment on other users’ transition requests.\\n\\n\\n(e.g., a fraud model built in one part of the organization could be reused in others).\\n\\nAlternatively, you can use the Model Registry’s API to plug in continuous integration\\n\\n\\nMLflow facilitates sharing of expertise and knowledge across teams by making ML\\n\\nmodels more discoverable and providing collaborative features to jointly improve\\n\\non common ML tasks. Simply register an MLflow model from your experiments to\\n\\n\\nand deployment (CI/CD) tools, such as Jenkins, to automatically test and transition\\n\\nyour models. Each model also links to the experiment run that built it — in MLflow\\n\\nTracking — to let you easily review models.\\n\\n\\nget started. 
The MLflow Model Registry will then let you track multiple versions\\n\\nof the model and mark each one with a lifecycle stage: development, staging,\\n\\nproduction or archived.\\n\\n\\nSample machine learning\\nmodels displayed via the\\nMLflow Model Registry\\ndashboard\\n\\n\\nThe machine learning model\\npage view in MLflow, showing\\nhow users can request and\\nreview changes to a model’s\\nstage\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n###### Visibility and governance for the full ML lifecycle\\n\\nIn large enterprises, the number of ML models that are in development, staging\\n\\nand production at any given point in time may be in the hundreds or thousands.\\n\\nHaving full visibility into which models exist, what stages they are in and who\\n\\nhas collaborated on and changed the deployment stages of a model allows\\n\\norganizations to better manage their ML efforts.\\n\\nMLflow provides full visibility and enables governance by keeping track of each\\n\\nmodel’s history and managing who can approve changes to the model’s stages.\\n\\nIdentify versions, stages and\\nauthors of each model\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n\\nCHAPTER 5: **\\u0007** **Making Organizations**\\n\\n#### Successful with ML\\n\\n\\nStandardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\\n\\ntrack experiments, compare results, reproduce runs and productionize faster.\\n\\nIn addition to increasing data science team productivity and collaboration and applying good engineering\\n\\npractices to machine learning, organizations also need to do the following:\", \"metadata\": {\"similarity_score\": 0.0045169946, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"5bd6f00e0e10ffbea60bf754bf142b9f\"}, {\"page_content\": \"array of new features and integrations:\\n\\n\\n**SUPPORT FOR MULTIPLE**\\n\\n**PROGRAMMING LANGUAGES**\\n\\nTo give developers a choice, MLflow supports R,\\n\\nPython, Java and Scala, along with a REST server\\n\\ninterface that can be used from any language.\\n\\n\\n**INTEGRATION WITH POPULAR ML**\\n\\n**LIBRARIES AND FRAMEWORKS**\\n\\nMLflow has built-in integrations with the most popular\\n\\nmachine learning libraries — such as scikit-learn,\\n\\nTensorFlow, Keras, PyTorch, H2O, and Apache Spark™\\n\\nMLlib — to help teams build, test and deploy machine\\n\\nlearning applications.\\n\\n\\n**CROSS-CLOUD SUPPORT**\\n\\nOrganizations can use MLflow to quickly deploy\\n\\nmachine learning models to multiple cloud services,\\n\\nincluding Databricks, Azure Machine Learning and\\n\\nAmazon SageMaker, depending on their needs.\\n\\nMLflow leverages AWS S3, Google Cloud Storage and\\n\\nAzure Data Lake Storage, allowing teams to easily\\n\\ntrack and share artifacts from their code.\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n###### Rapid community adoption\\n\\n## 2.5M\\n#### monthly downloads\\n\\n## 200+\\n#### code contributors\\n\\n\\n## 100+\\n#### contributing organizations\\n\\n\\nOrganizations using and contributing to MLflow\\n\\nSource: [mlflow.org](https://mlflow.org)\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n\\nCHAPTER 4: **\\u0007** **A Closer Look at**\\n\\n#### MLflow Model Registry\\n\\n\\nMLflow originally introduced the ability to [track metrics, parameters 
and artifacts](https://www.mlflow.org/docs/latest/tracking.html#) as part of experiments,\\n\\n[package models and reproducible ML projects](https://www.mlflow.org/docs/latest/projects.html) , and [deploy models to batch or to real-time serving platforms](https://www.mlflow.org/docs/latest/models.html) .\\n\\nThe latest MLflow component — MLflow Model Registry — builds on MLflow’s original capabilities to\\n\\nprovide organizations with one central place to share ML models, collaborate on moving them from\\n\\nexperimentation to testing and production, and implement approval and governance workflows.\\n\\n��\\n\\n\\n**Model Registry**\\n\\n\\n**D O W N S T R E A M**\\n\\n\\n��\\n\\n**Tracking Server**\\n\\n\\nData Scientists\\n\\n**Staging**\\n\\n\\nData Engineers\\n\\n**Production** **Archived**\\n\\n**A U T O M AT E D J O B S**\\n\\n\\n**Parameters**\\n\\n\\n**Metrics** **Artifacts**\\n\\n\\nThe Model Registry gives MLflow users new\\n\\n\\ntools for sharing, reviewing and managing\\n\\nML models throughout their lifecycle\\n\\n\\n**Metadata** **Models**\\n\\n**R E S T S E R V I N G**\\n\\n**R E V I E W E R S + C I / C D T O O L S**\\n\\nThe MLflow Model Registry complements the MLflow offering and is designed to help organizations\\n\\nimplement good engineering principles with machine learning initiatives, such as collaboration,\\n\\ngovernance, reproducibility and knowledge management. The next few pages highlight some of the key\\n\\nfeatures of this new component.\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n\\n###### One hub for managing ML models collaboratively\", \"metadata\": {\"similarity_score\": 0.0039051326, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"f3b1fd8e596abc89f1565df7a5376b76\"}, {\"page_content\": \"Having full visibility into which models exist, what stages they are in and who\\n\\nhas collaborated on and changed the deployment stages of a model allows\\n\\norganizations to better manage their ML efforts.\\n\\nMLflow provides full visibility and enables governance by keeping track of each\\n\\nmodel’s history and managing who can approve changes to the model’s stages.\\n\\nIdentify versions, stages and\\nauthors of each model\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n\\nCHAPTER 5: **\\u0007** **Making Organizations**\\n\\n#### Successful with ML\\n\\n\\nStandardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\\n\\ntrack experiments, compare results, reproduce runs and productionize faster.\\n\\nIn addition to increasing data science team productivity and collaboration and applying good engineering\\n\\npractices to machine learning, organizations also need to do the following:\\n\\n\\n**Reliably ingest, ETL and**\\n\\n**catalog big data**\\n\\n\\n**Work with state-of-the-art**\\n\\n**ML frameworks and tools**\\n\\n\\n**Easily scale compute from**\\n\\n**single to multi-node**\\n\\n\\nDatabricks excels at all the above. Learn more at [databricks.com](https://databricks.com)\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n\\nCHAPTER 6: **\\u0007** **Introducing the Unified**\\n\\n#### Data Analytics Platform\\n\\n\\nDatabricks accelerates innovation by unifying data science, engineering and business. 
Through a fully\\n\\nmanaged, cloud-based service built by the original creators of Apache Spark, Delta Lake and MLflow, the\\n\\nDatabricks Unified Data Analytics Platform lowers the barrier for enterprises to innovate with AI and\\n\\naccelerates their innovation.\\n\\n**DATA ENGINEERS** **DATA SCIENTISTS** **ML ENGINEERS** **DATA ANALYSTS**\\n\\n\\n**BI INTEGRATIONS**\\n\\n**Access all your data**\\n\\n\\n**DATA SCIENCE WORKSPACE**\\n\\n**Collaboration across the lifecycle**\\n\\n**UNIFIED DATA SERVICE**\\n\\n**High-quality data with great performance**\\n\\n\\n\\n**ENTERPRISE CLOUD SERVICE**\\n\\n**A simple, scalable and secure managed service**\\n\\n##### RAW DATA LAKE\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n\\n###### Data engineering\\n\\nSpeed up the preparation of high-quality\\n\\ndata, essential for best-in-class ML\\n\\napplications, at scale\\n\\n\\n###### Data science\\n\\nCollaboratively explore large data sets,\\n\\nbuild models iteratively and deploy across\\n\\nmultiple platforms\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\n###### Providing managed MLflow on Databricks\\n\\nMLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\\n\\npackaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\\n\\nBy using MLflow as part of Databricks, data scientists can:\\n\\n\\n**WORKSPACES**\\n\\nBenefit from a streamlined\\n\\nexperiment tracking experience\\n\\nwith Databricks Workspace and\\n\\ncollaborative Notebooks\\n\\n\\n**BIG DATA SNAPSHOTS**\\n\\nTrack large-scale data that fed\\n\\nthe models, along with all the\\n\\nother model parameters, then\\n\\n\\n**JOBS**\", \"metadata\": {\"similarity_score\": 0.0037934575, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"27767fe6eb02560ebf204ec17e2c6b7a\"}, {\"page_content\": \"through Git repository branches.\\n\\n\\nlifecycle management functions that are needed\\n\\n\\nto make cloud-based object stores reliable and\\n\\nperformant. This design allows clients to update\\n\\nmultiple objects at once and to replace a subset\\n\\nof the objects with another, etc., in a serializable\\n\\nmanner that still achieves high parallel read/write\\n\\nperformance from the objects — while offering\\n\\nadvanced capabilities like time travel (e.g., query\\n\\n\\nAs a best practice, code should only be run in an execution environment that corresponds to it or in one\\n\\nthat’s higher. For example, the dev environment can run any code, but the prod environment can only run\\n\\nprod code.\\n\\n###### Models\\n\\nWhile models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to**\\n\\n**note that model and code lifecycle phases often operate asynchronously** . That is, you may want to push\\n\\na new model version before you push a code change, and vice versa. Consider the following scenarios:\\n\\n\\npoint-in-time snapshots or rollback of erroneous\\n\\n\\n\\u0007To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. 
Deploying\\n\\nthe code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle\\n\\nof being generated, tested and marked as “production” to predict on the most recent transactions. In\\n\\nthis case the code lifecycle is slower than the model lifecycle.\\n\\n\\u0007To classify documents using large deep neural networks, training and deploying the model is often a one-\\n\\ntime process due to cost. Updates to the serving and monitoring code in the project may be deployed\\n\\nmore frequently than a new version of the model. In this case the model lifecycle is slower than the code.\\n\\nSince model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model\\n\\nmanagement to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts\\n\\ndirectly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update\\n\\nproduction models without code changes, streamlining the deployment process in many cases. Model\\n\\nartifacts are secured using MLflow access controls or cloud storage permissions\\n\\n\\nupdates), automatic data layout optimization,\\n\\nupserts, caching and audit logs.\\n\\n\\n-----\\n\\n###### Data\\n\\nSome organizations label data as either dev, staging or prod, depending on which environment it originated\\n\\nin. For example, all prod data is produced in the prod environment, but dev and staging environments may\\n\\nhave read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data\\n\\nmay be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around\\n\\n\\nreliability and freshness. Access to data in each environment is controlled with table access controls\\n\\n( [AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html) ) or cloud storage permissions.\\n| |\\n\\nIn summary, when it comes to MLOps, you will always have operational separation between dev, staging and\\n\\nprod. 
Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod\\n\\nwill be the highest quality and tightly controlled.\", \"metadata\": {\"similarity_score\": 0.0037399682, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf\"}, \"id\": \"da9dc2dae18bd9c1aa982e446461522c\"}, {\"page_content\": \"CHAPTER 8: **\\u0007** **Getting Started**\\nTake the next step toward standardizing your ML lifecycle — test drive MLflow and the\\n\\nDatabricks Unified Data Analytics Platform.\\n\\n**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z E D D E M O](https://databricks.com/contact)**\\n\\n**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\\n\\n\\n-----\\n\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\n\\nCHAPTER 8: **\\u0007** **Comparison Matrix**\\n\\n|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W   Self-hosted                |M A N A G E D M L F L O W O N D ATA B R I C K S   Fully managed    With remote execution             |\\n|---|---|---|\\n\\n\\n-----\", \"metadata\": {\"similarity_score\": 0.0035068137, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\"}, \"id\": \"275743e1dc14a362c47022ab6134084c\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nYou can use MLflow for versioning your model by utilizing the MLflow Model Registry, which provides a centralized place to manage and track different versions of your models. The Model Registry allows you to track multiple versions of a model, mark each version with a lifecycle stage (such as development, staging, or production), and manage who can approve changes to the model's stages. This enables you to have full visibility into which models exist, what stages they are in, and who has collaborated on and changed the deployment stages of a model, making it easier to manage your ML efforts.\n\n--------------------------------------------------------------------------------\nView the MLflow Traces at https://adb-984752964297111.11.azuredatabricks.net/ml/experiments/2822477370659093?compareRunsMode=TRACES\nAgent's final response:\n----\nYou can use MLflow for versioning your model by utilizing the MLflow Model Registry, which provides a centralized place to manage and track different versions of your models. 
The Model Registry allows you to track multiple versions of a model, mark each version with a lifecycle stage (such as development, staging, or production), and manage who can approve changes to the model's stages. This enables you to have full visibility into which models exist, what stages they are in, and who has collaborated on and changed the deployment stages of a model, making it easier to manage your ML efforts.\n----\n\nAgent's full message history (useful for debugging):\n----\n[\n {\n \"content\": \"What is mlflow in databricks?\",\n \"role\": \"user\"\n },\n {\n \"tool_calls\": [\n {\n \"id\": \"call_9a1e76b7-0b7f-4598-92f4-c5784c31e556\",\n \"function\": {\n \"arguments\": \"{ \\\"query\\\": \\\"mlflow in databricks\\\", \\\"filters\\\": [] }\",\n \"name\": \"search_product_docs\"\n },\n \"type\": \"function\"\n }\n ],\n \"content\": null,\n \"role\": \"assistant\"\n },\n {\n \"content\": \"[{\\\"page_content\\\": \\\"build models iteratively and deploy across\\\\n\\\\nmultiple platforms\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n###### Providing managed MLflow on Databricks\\\\n\\\\nMLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\\\\n\\\\npackaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\\\\n\\\\nBy using MLflow as part of Databricks, data scientists can:\\\\n\\\\n\\\\n**WORKSPACES**\\\\n\\\\nBenefit from a streamlined\\\\n\\\\nexperiment tracking experience\\\\n\\\\nwith Databricks Workspace and\\\\n\\\\ncollaborative Notebooks\\\\n\\\\n\\\\n**BIG DATA SNAPSHOTS**\\\\n\\\\nTrack large-scale data that fed\\\\n\\\\nthe models, along with all the\\\\n\\\\nother model parameters, then\\\\n\\\\n\\\\n**JOBS**\\\\n\\\\nEasily initiate jobs remotely, from\\\\n\\\\nan on-premises environment or\\\\n\\\\nfrom Databricks notebooks\\\\n\\\\n\\\\n**SECURITY**\\\\n\\\\nTake advantage of one common\\\\n\\\\nsecurity model for the entire\\\\n\\\\nmachine learning lifecycle\\\\n\\\\n\\\\nreproduce training runs reliably\\\\n\\\\n\\\\nRead our [blog](https://databricks.com/blog/2019/03/06/managed-mlflow-on-databricks-now-in-public-preview.html) to learn more about these integrations.\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n###### Getting data ready for ML with Delta Lake\\\\n\\\\nDelta Lake is a storage layer that brings reliability to data lakes. Delta Lake provides ACID transactions and scalable metadata handling, and it unifies streaming and batch\\\\n\\\\ndata processing. 
Delta Lake runs on top of your existing data lake and is fully compatible with Apache Spark APIs.\\\\n\\\\nBy using Delta Lake, data engineers and data scientists can keep track of data used for model training.\\\\n\\\\nFiles ML Runtime\\\\n\\\\n- \\\\u0007Schema enforced high\\\\n\\\\nquality data\\\\n\\\\n\\\\n\\\\n- Optimized performance\\\\n\\\\n\\ufffd\\ufffd\\\\n\\\\n- \\\\u0007Full data lineage /\\\\n\\\\ngovernance\\\\n\\\\n- \\\\u0007reproductibility through\\\\n\\\\ntime travel\\\\n\\\\n\\\\nStreaming\\\\n\\\\nBatch\\\\n\\\\n\\\\nIngestion\\\\n\\\\nTables\\\\n\\\\n\\\\nIngestion\\\\n\\\\n\\\\nData\\\\n\\\\nCatalog\\\\n\\\\n\\\\nData\\\\n\\\\n\\\\nFeature\\\\n\\\\nStore\\\\n\\\\n\\\\nFeature\\\\n\\\\n\\\\n**Y O U R E X I S T I N G D E LTA L A K E**\\\\n\\\\n\\\\n3rd Party Data\\\\n\\\\nMarketplace\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n###### Ready-to-use ML environments\\\\n\\\\nDatabricks Runtime for Machine Learning provides data scientists and ML practitioners with on-demand access to ready-to-use machine learning clusters that are\\\\n\\\\npreconfigured with the latest and most popular machine learning frameworks, including TensorFlow, Keras, PyTorch, scikit-learn, XGBoost and Horovod.\\\\n\\\\nBy using the Databricks Runtime for ML, data scientists can get to results faster with one-click access to ML clusters, optimized performance on popular ML algorithms,\\\\n\\\\nand simplified distributed deep learning on Horovod and GPUs. It also supports Conda for further customization.\\\\n\\\\n\\\\n**P A C K A G E S A N D O P T I M I Z E S M O S T**\\\\n\\\\n**C O M M O N M L F R A M E W O R K S**\\\\n\\\\n\\\\n**C U S T O M I Z E D E N V I R O N M E N T S**\\\\n\\\\n**U S I N G C O N D A**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.006972813, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\\\"}, \\\"id\\\": \\\"8363863c52751f29adfa98d88820bec9\\\"}, {\\\"page_content\\\": \\\"Databricks also provides a fully managed and hosted version of MLflow with enterprise security features,\\\\n\\\\nhigh availability, and other Databricks workspace features such as experiment and run management and\\\\n\\\\nnotebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing\\\\n\\\\nmachine learning model training runs and running machine learning projects.\\\\n\\\\n\\\\n-----\\\\n\\\\n###### Databricks and MLflow Autologging\\\\n\\\\nDatabricks Autologging is a no-code solution that extends [MLflow automatic logging](https://mlflow.org/docs/latest/tracking.html#automatic-logging) to deliver automatic\\\\n\\\\nexperiment tracking for machine learning training sessions on Databricks. Databricks Autologging\\\\n\\\\n\\\\nautomatically captures model parameters, metrics, files and lineage information when you train models with\\\\n\\\\ntraining runs recorded as MLflow tracking runs. See documentation for [AWS](https://docs.databricks.com/applications/mlflow/databricks-autologging.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/mlflow/databricks-autologging) [GCP](https://docs.gcp.databricks.com/applications/mlflow/databricks-autologging.html) .\\\\n| |\\\\n\\\\n###### Feature Store\\\\n\\\\nThe Databricks Feature Store is a centralized repository of features. 
It enables feature sharing and discovery\\\\n\\\\n\\\\nacross an organization and also ensures that the same feature computation code is used for model training\\\\n\\\\nand inference. See documentation for [AWS](https://docs.databricks.com/applications/machine-learning/feature-store/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/applications/machine-learning/feature-store/) [GCP](https://docs.gcp.databricks.com/applications/machine-learning/feature-store/index.html) .\\\\n| |\\\\n\\\\n###### MLflow Model Serving\\\\n\\\\nMLflow Model Serving allows you to host machine learning models from Model Registry as REST endpoints\\\\n\\\\n\\\\nthat are updated automatically based on the availability of model versions and their stages. See\\\\n\\\\ndoc\n\n*** WARNING: max output size exceeded, skipping output. ***\n\nat ML practitioners and engineers can benefit from out-of-the-box tracking,\\\\n\\\\npackaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\\\\n\\\\nBy using MLflow as part of Databricks, data scientists can:\\\\n\\\\n\\\\n**WORKSPACES**\\\\n\\\\nBenefit from a streamlined\\\\n\\\\nexperiment tracking experience\\\\n\\\\nwith Databricks Workspace and\\\\n\\\\ncollaborative Notebooks\\\\n\\\\n\\\\n**BIG DATA SNAPSHOTS**\\\\n\\\\nTrack large-scale data that fed\\\\n\\\\nthe models, along with all the\\\\n\\\\nother model parameters, then\\\\n\\\\n\\\\n**JOBS**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0037934575, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\\\"}, \\\"id\\\": \\\"27767fe6eb02560ebf204ec17e2c6b7a\\\"}, {\\\"page_content\\\": \\\"through Git repository branches.\\\\n\\\\n\\\\nlifecycle management functions that are needed\\\\n\\\\n\\\\nto make cloud-based object stores reliable and\\\\n\\\\nperformant. This design allows clients to update\\\\n\\\\nmultiple objects at once and to replace a subset\\\\n\\\\nof the objects with another, etc., in a serializable\\\\n\\\\nmanner that still achieves high parallel read/write\\\\n\\\\nperformance from the objects \\u2014 while offering\\\\n\\\\nadvanced capabilities like time travel (e.g., query\\\\n\\\\n\\\\nAs a best practice, code should only be run in an execution environment that corresponds to it or in one\\\\n\\\\nthat\\u2019s higher. For example, the dev environment can run any code, but the prod environment can only run\\\\n\\\\nprod code.\\\\n\\\\n###### Models\\\\n\\\\nWhile models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to**\\\\n\\\\n**note that model and code lifecycle phases often operate asynchronously** . That is, you may want to push\\\\n\\\\na new model version before you push a code change, and vice versa. Consider the following scenarios:\\\\n\\\\n\\\\npoint-in-time snapshots or rollback of erroneous\\\\n\\\\n\\\\n\\\\u0007To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. Deploying\\\\n\\\\nthe code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle\\\\n\\\\nof being generated, tested and marked as \\u201cproduction\\u201d to predict on the most recent transactions. 
In\\\\n\\\\nthis case the code lifecycle is slower than the model lifecycle.\\\\n\\\\n\\\\u0007To classify documents using large deep neural networks, training and deploying the model is often a one-\\\\n\\\\ntime process due to cost. Updates to the serving and monitoring code in the project may be deployed\\\\n\\\\nmore frequently than a new version of the model. In this case the model lifecycle is slower than the code.\\\\n\\\\nSince model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model\\\\n\\\\nmanagement to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts\\\\n\\\\ndirectly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update\\\\n\\\\nproduction models without code changes, streamlining the deployment process in many cases. Model\\\\n\\\\nartifacts are secured using MLflow access controls or cloud storage permissions\\\\n\\\\n\\\\nupdates), automatic data layout optimization,\\\\n\\\\nupserts, caching and audit logs.\\\\n\\\\n\\\\n-----\\\\n\\\\n###### Data\\\\n\\\\nSome organizations label data as either dev, staging or prod, depending on which environment it originated\\\\n\\\\nin. For example, all prod data is produced in the prod environment, but dev and staging environments may\\\\n\\\\nhave read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data\\\\n\\\\nmay be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around\\\\n\\\\n\\\\nreliability and freshness. Access to data in each environment is controlled with table access controls\\\\n\\\\n( [AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html) ) or cloud storage permissions.\\\\n| |\\\\n\\\\nIn summary, when it comes to MLOps, you will always have operational separation between dev, staging and\\\\n\\\\nprod. 
Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod\\\\n\\\\nwill be the highest quality and tightly controlled.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0037399682, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf\\\"}, \\\"id\\\": \\\"da9dc2dae18bd9c1aa982e446461522c\\\"}, {\\\"page_content\\\": \\\"CHAPTER 8: **\\\\u0007** **Getting Started**\\\\nTake the next step toward standardizing your ML lifecycle \\u2014 test drive MLflow and the\\\\n\\\\nDatabricks Unified Data Analytics Platform.\\\\n\\\\n**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z E D D E M O](https://databricks.com/contact)**\\\\n\\\\n**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\nCHAPTER 8: **\\\\u0007** **Comparison Matrix**\\\\n\\\\n|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W \\uf058 \\uf058 Self-hosted \\uf410 \\uf410 \\uf058 \\uf058 \\uf410 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf410 \\uf410 \\uf410 \\uf410 \\uf410|M A N A G E D M L F L O W O N D ATA B R I C K S \\uf058 \\uf058 Fully managed \\uf058 \\uf058 \\uf058 With remote execution \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058|\\\\n|---|---|---|\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0035068137, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\\\"}, \\\"id\\\": \\\"275743e1dc14a362c47022ab6134084c\\\"}]\",\n \"tool_responses\": [\n {\n \"tool_call_id\": \"call_a6a5d604-8249-4f2f-a38a-340d5e923d42\",\n \"role\": \"tool\",\n \"content\": \"[{\\\"page_content\\\": \\\"**Parameters**\\\\n\\\\n\\\\n**Metrics** **Artifacts**\\\\n\\\\n\\\\nThe Model Registry gives MLflow users new\\\\n\\\\n\\\\ntools for sharing, reviewing and managing\\\\n\\\\nML models throughout their lifecycle\\\\n\\\\n\\\\n**Metadata** **Models**\\\\n\\\\n**R E S T S E R V I N G**\\\\n\\\\n**R E V I E W E R S + C I / C D T O O L S**\\\\n\\\\nThe MLflow Model Registry complements the MLflow offering and is designed to help organizations\\\\n\\\\nimplement good engineering principles with machine learning initiatives, such as collaboration,\\\\n\\\\ngovernance, reproducibility and knowledge management. The next few pages highlight some of the key\\\\n\\\\nfeatures of this new component.\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n\\\\n###### One hub for managing ML models collaboratively\\\\n\\\\nBuilding and deploying ML models is a team sport. 
Not only are the responsibilities\\\\n\\\\nalong the machine learning model lifecycle often split across multiple people\\\\n\\\\n(e.g., data scientists train models whereas production engineers deploy them),\\\\n\\\\nbut also at each lifecycle stage, teams can benefit from collaboration and sharing\\\\n\\\\n\\\\n###### Flexible CI/CD pipelines to manage stage transitions\\\\n\\\\nMLflow lets you manage your models\\u2019 lifecycles either manually or through\\\\n\\\\nautomated tools. Analogous to the approval process in software engineering,\\\\n\\\\nusers can manually request to move a model to a new lifecycle stage (e.g., from\\\\n\\\\nstaging to production), and review or comment on other users\\u2019 transition requests.\\\\n\\\\n\\\\n(e.g., a fraud model built in one part of the organization could be reused in others).\\\\n\\\\nAlternatively, you can use the Model Registry\\u2019s API to plug in continuous integration\\\\n\\\\n\\\\nMLflow facilitates sharing of expertise and knowledge across teams by making ML\\\\n\\\\nmodels more discoverable and providing collaborative features to jointly improve\\\\n\\\\non common ML tasks. Simply register an MLflow model from your experiments to\\\\n\\\\n\\\\nand deployment (CI/CD) tools, such as Jenkins, to automatically test and transition\\\\n\\\\nyour models. Each model also links to the experiment run that built it \\u2014 in MLflow\\\\n\\\\nTracking \\u2014 to let you easily review models.\\\\n\\\\n\\\\nget started. The MLflow Model Registry will then let you track multiple versions\\\\n\\\\nof the model and mark each one with a lifecycle stage: development, staging,\\\\n\\\\nproduction or archived.\\\\n\\\\n\\\\nSample machine learning\\\\nmodels displayed via the\\\\nMLflow Model Registry\\\\ndashboard\\\\n\\\\n\\\\nThe machine learning model\\\\npage view in MLflow, showing\\\\nhow users can request and\\\\nreview changes to a model\\u2019s\\\\nstage\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n###### Visibility and governance for the full ML lifecycle\\\\n\\\\nIn large enterprises, the number of ML models that are in development, staging\\\\n\\\\nand production at any given point in time may be in the hundreds or thousands.\\\\n\\\\nHaving full visibility into which models exist, what stages they are in and who\\\\n\\\\nhas collaborated on and changed the deployment stages of a model allows\\\\n\\\\norganizations to better manage their ML efforts.\\\\n\\\\nMLflow provides full visibility and enables governance by keeping track of each\\\\n\\\\nmodel\\u2019s history and managing who can approve changes to the model\\u2019s stages.\\\\n\\\\nIdentify versions, stages and\\\\nauthors of each model\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n\\\\nCHAPTER 5: **\\\\u0007** **Making Organizations**\\\\n\\\\n#### Successful with ML\\\\n\\\\n\\\\nStandardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\\\\n\\\\ntrack experiments, compare results, reproduce runs and productionize faster.\\\\n\\\\nIn addition to increasing data science team productivity and collaboration and applying good engineering\\\\n\\\\npractices to machine learning, organizations also need to do the following:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0045169946, \\\"doc_uri\\\": 
\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\\\"}, \\\"id\\\": \\\"5bd6f00e0e10ffbea60bf754bf142b9f\\\"}, {\\\"page_content\\\": \\\"array of new features and integrations:\\\\n\\\\n\\\\n**SUPPORT FOR MULTIPLE**\\\\n\\\\n**PROGRAMMING LANGUAGES**\\\\n\\\\nTo give developers a choice, MLflow supports R,\\\\n\\\\nPython, Java and Scala, along with a REST server\\\\n\\\\ninterface that can be used from any language.\\\\n\\\\n\\\\n**INTEGRATION WITH POPULAR ML**\\\\n\\\\n**LIBRARIES AND FRAMEWORKS**\\\\n\\\\nMLflow has built-in integrations with the most popular\\\\n\\\\nmachine learning libraries \\u2014 such as scikit-learn,\\\\n\\\\nTensorFlow, Keras, PyTorch, H2O, and Apache Spark\\u2122\\\\n\\\\nMLlib \\u2014 to help teams build, test and deploy machine\\\\n\\\\nlearning applications.\\\\n\\\\n\\\\n**CROSS-CLOUD SUPPORT**\\\\n\\\\nOrganizations can use MLflow to quickly deploy\\\\n\\\\nmachine learning models to multiple cloud services,\\\\n\\\\nincluding Databricks, Azure Machine Learning and\\\\n\\\\nAmazon SageMaker, depending on their needs.\\\\n\\\\nMLflow leverages AWS S3, Google Cloud Storage and\\\\n\\\\nAzure Data Lake Storage, allowing teams to easily\\\\n\\\\ntrack and share artifacts from their code.\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n###### Rapid community adoption\\\\n\\\\n## 2.5M\\\\n#### monthly downloads\\\\n\\\\n## 200+\\\\n#### code contributors\\\\n\\\\n\\\\n## 100+\\\\n#### contributing organizations\\\\n\\\\n\\\\nOrganizations using and contributing to MLflow\\\\n\\\\nSource: [mlflow.org](https://mlflow.org)\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n\\\\nCHAPTER 4: **\\\\u0007** **A Closer Look at**\\\\n\\\\n#### MLflow Model Registry\\\\n\\\\n\\\\nMLflow originally introduced the ability to [track metrics, parameters and artifacts](https://www.mlflow.org/docs/latest/tracking.html#) as part of experiments,\\\\n\\\\n[package models and reproducible ML projects](https://www.mlflow.org/docs/latest/projects.html) , and [deploy models to batch or to real-time serving platforms](https://www.mlflow.org/docs/latest/models.html) .\\\\n\\\\nThe latest MLflow component \\u2014 MLflow Model Registry \\u2014 builds on MLflow\\u2019s original capabilities to\\\\n\\\\nprovide organizations with one central place to share ML models, collaborate on moving them from\\\\n\\\\nexperimentation to testing and production, and implement approval and governance workflows.\\\\n\\\\n\\ufffd\\ufffd\\\\n\\\\n\\\\n**Model Registry**\\\\n\\\\n\\\\n**D O W N S T R E A M**\\\\n\\\\n\\\\n\\ufffd\\ufffd\\\\n\\\\n**Tracking Server**\\\\n\\\\n\\\\nData Scientists\\\\n\\\\n**Staging**\\\\n\\\\n\\\\nData Engineers\\\\n\\\\n**Production** **Archived**\\\\n\\\\n**A U T O M AT E D J O B S**\\\\n\\\\n\\\\n**Parameters**\\\\n\\\\n\\\\n**Metrics** **Artifacts**\\\\n\\\\n\\\\nThe Model Registry gives MLflow users new\\\\n\\\\n\\\\ntools for sharing, reviewing and managing\\\\n\\\\nML models throughout their lifecycle\\\\n\\\\n\\\\n**Metadata** **Models**\\\\n\\\\n**R E S T S E R V I N G**\\\\n\\\\n**R E V I E W E R S + C I / C D T O O L S**\\\\n\\\\nThe MLflow Model Registry complements the MLflow offering and is designed to help organizations\\\\n\\\\nimplement good engineering principles with machine learning initiatives, such as collaboration,\\\\n\\\\ngovernance, reproducibility and knowledge 
management. The next few pages highlight some of the key\\\\n\\\\nfeatures of this new component.\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n\\\\n###### One hub for managing ML models collaboratively\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0039051326, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\\\"}, \\\"id\\\": \\\"f3b1fd8e596abc89f1565df7a5376b76\\\"}, {\\\"page_content\\\": \\\"Having full visibility into which models exist, what stages they are in and who\\\\n\\\\nhas collaborated on and changed the deployment stages of a model allows\\\\n\\\\norganizations to better manage their ML efforts.\\\\n\\\\nMLflow provides full visibility and enables governance by keeping track of each\\\\n\\\\nmodel\\u2019s history and managing who can approve changes to the model\\u2019s stages.\\\\n\\\\nIdentify versions, stages and\\\\nauthors of each model\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n\\\\nCHAPTER 5: **\\\\u0007** **Making Organizations**\\\\n\\\\n#### Successful with ML\\\\n\\\\n\\\\nStandardizing the ML lifecycle with MLflow is a great step to ensure that data scientists can share and\\\\n\\\\ntrack experiments, compare results, reproduce runs and productionize faster.\\\\n\\\\nIn addition to increasing data science team productivity and collaboration and applying good engineering\\\\n\\\\npractices to machine learning, organizations also need to do the following:\\\\n\\\\n\\\\n**Reliably ingest, ETL and**\\\\n\\\\n**catalog big data**\\\\n\\\\n\\\\n**Work with state-of-the-art**\\\\n\\\\n**ML frameworks and tools**\\\\n\\\\n\\\\n**Easily scale compute from**\\\\n\\\\n**single to multi-node**\\\\n\\\\n\\\\nDatabricks excels at all the above. Learn more at [databricks.com](https://databricks.com)\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n\\\\nCHAPTER 6: **\\\\u0007** **Introducing the Unified**\\\\n\\\\n#### Data Analytics Platform\\\\n\\\\n\\\\nDatabricks accelerates innovation by unifying data science, engineering and business. 
Through a fully\\\\n\\\\nmanaged, cloud-based service built by the original creators of Apache Spark, Delta Lake and MLflow, the\\\\n\\\\nDatabricks Unified Data Analytics Platform lowers the barrier for enterprises to innovate with AI and\\\\n\\\\naccelerates their innovation.\\\\n\\\\n**DATA ENGINEERS** **DATA SCIENTISTS** **ML ENGINEERS** **DATA ANALYSTS**\\\\n\\\\n\\\\n**BI INTEGRATIONS**\\\\n\\\\n**Access all your data**\\\\n\\\\n\\\\n**DATA SCIENCE WORKSPACE**\\\\n\\\\n**Collaboration across the lifecycle**\\\\n\\\\n**UNIFIED DATA SERVICE**\\\\n\\\\n**High-quality data with great performance**\\\\n\\\\n\\\\n\\\\n**ENTERPRISE CLOUD SERVICE**\\\\n\\\\n**A simple, scalable and secure managed service**\\\\n\\\\n##### RAW DATA LAKE\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n\\\\n###### Data engineering\\\\n\\\\nSpeed up the preparation of high-quality\\\\n\\\\ndata, essential for best-in-class ML\\\\n\\\\napplications, at scale\\\\n\\\\n\\\\n###### Data science\\\\n\\\\nCollaboratively explore large data sets,\\\\n\\\\nbuild models iteratively and deploy across\\\\n\\\\nmultiple platforms\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\n###### Providing managed MLflow on Databricks\\\\n\\\\nMLflow is natively integrated with the Databricks Unified Data Analytics Platform so that ML practitioners and engineers can benefit from out-of-the-box tracking,\\\\n\\\\npackaging, deployment and management capabilities for ML models with enterprise reliability, security and scale.\\\\n\\\\nBy using MLflow as part of Databricks, data scientists can:\\\\n\\\\n\\\\n**WORKSPACES**\\\\n\\\\nBenefit from a streamlined\\\\n\\\\nexperiment tracking experience\\\\n\\\\nwith Databricks Workspace and\\\\n\\\\ncollaborative Notebooks\\\\n\\\\n\\\\n**BIG DATA SNAPSHOTS**\\\\n\\\\nTrack large-scale data that fed\\\\n\\\\nthe models, along with all the\\\\n\\\\nother model parameters, then\\\\n\\\\n\\\\n**JOBS**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0037934575, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\\\"}, \\\"id\\\": \\\"27767fe6eb02560ebf204ec17e2c6b7a\\\"}, {\\\"page_content\\\": \\\"through Git repository branches.\\\\n\\\\n\\\\nlifecycle management functions that are needed\\\\n\\\\n\\\\nto make cloud-based object stores reliable and\\\\n\\\\nperformant. This design allows clients to update\\\\n\\\\nmultiple objects at once and to replace a subset\\\\n\\\\nof the objects with another, etc., in a serializable\\\\n\\\\nmanner that still achieves high parallel read/write\\\\n\\\\nperformance from the objects \\u2014 while offering\\\\n\\\\nadvanced capabilities like time travel (e.g., query\\\\n\\\\n\\\\nAs a best practice, code should only be run in an execution environment that corresponds to it or in one\\\\n\\\\nthat\\u2019s higher. For example, the dev environment can run any code, but the prod environment can only run\\\\n\\\\nprod code.\\\\n\\\\n###### Models\\\\n\\\\nWhile models are usually marked as dev, staging or prod according to their lifecycle phase, **it is important to**\\\\n\\\\n**note that model and code lifecycle phases often operate asynchronously** . That is, you may want to push\\\\n\\\\na new model version before you push a code change, and vice versa. 
Consider the following scenarios:\\\\n\\\\n\\\\npoint-in-time snapshots or rollback of erroneous\\\\n\\\\n\\\\n\\\\u0007To detect fraudulent transactions, you develop an ML pipeline that retrains a model weekly. Deploying\\\\n\\\\nthe code can be a relatively infrequent process, but each week a new model undergoes its own lifecycle\\\\n\\\\nof being generated, tested and marked as \\u201cproduction\\u201d to predict on the most recent transactions. In\\\\n\\\\nthis case the code lifecycle is slower than the model lifecycle.\\\\n\\\\n\\\\u0007To classify documents using large deep neural networks, training and deploying the model is often a one-\\\\n\\\\ntime process due to cost. Updates to the serving and monitoring code in the project may be deployed\\\\n\\\\nmore frequently than a new version of the model. In this case the model lifecycle is slower than the code.\\\\n\\\\nSince model lifecycles do not correspond one-to-one with code lifecycles, it makes sense for model\\\\n\\\\nmanagement to have its own service. [MLflow](https://docs.google.com/document/d/1yCODhUuimWJHR8Sc-sd6xY7vJuN6nPek2pNrfhv7hU4/edit#heading=h.1yd956s4db32) and its Model Registry support managing model artifacts\\\\n\\\\ndirectly via UI and APIs. The loose coupling of model artifacts and code provides flexibility to update\\\\n\\\\nproduction models without code changes, streamlining the deployment process in many cases. Model\\\\n\\\\nartifacts are secured using MLflow access controls or cloud storage permissions\\\\n\\\\n\\\\nupdates), automatic data layout optimization,\\\\n\\\\nupserts, caching and audit logs.\\\\n\\\\n\\\\n-----\\\\n\\\\n###### Data\\\\n\\\\nSome organizations label data as either dev, staging or prod, depending on which environment it originated\\\\n\\\\nin. For example, all prod data is produced in the prod environment, but dev and staging environments may\\\\n\\\\nhave read-only access to them. Marking data this way also indicates a guarantee of data quality: dev data\\\\n\\\\nmay be temporary or not meant for wider use, whereas prod data may offer stronger guarantees around\\\\n\\\\n\\\\nreliability and freshness. Access to data in each environment is controlled with table access controls\\\\n\\\\n( [AWS](https://docs.databricks.com/security/access-control/table-acls/index.html) [Azure](https://docs.microsoft.com/en-us/azure/databricks/security/access-control/table-acls/) [GCP](https://docs.gcp.databricks.com/security/access-control/table-acls/index.html) ) or cloud storage permissions.\\\\n| |\\\\n\\\\nIn summary, when it comes to MLOps, you will always have operational separation between dev, staging and\\\\n\\\\nprod. 
Assets in dev will have the least restrictive access controls and quality guarantees, while those in prod\\\\n\\\\nwill be the highest quality and tightly controlled.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0037399682, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf\\\"}, \\\"id\\\": \\\"da9dc2dae18bd9c1aa982e446461522c\\\"}, {\\\"page_content\\\": \\\"CHAPTER 8: **\\\\u0007** **Getting Started**\\\\nTake the next step toward standardizing your ML lifecycle \\u2014 test drive MLflow and the\\\\n\\\\nDatabricks Unified Data Analytics Platform.\\\\n\\\\n**[S TA R T Y O U R F R E E T R I A L](https://databricks.com/try)** **[R E Q U E S T A P E R S O N A L I Z E D D E M O](https://databricks.com/contact)**\\\\n\\\\n**[L E A R N M O R E](https://databricks.com/mlflow)** **[J O I N T H E C O M M U N I T Y](https://mlflow.org)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**M A C H I N E L E A R N I N G L I F E C Y C L E**\\\\n\\\\nCHAPTER 8: **\\\\u0007** **Comparison Matrix**\\\\n\\\\n|E X P E R I M E N T T R A C K I N G MLflow Tracking API MLflow Tracking Server Notebook Integration Workspace Integration R E P R O D U C I B L E P R O J E C T S MLflow Projects GitHub and Conda Integration Scalable Cloud/Clusters for Project Runs M O D E L M A N A G E M E N T MLflow Model Registry Model Versioning Stage Transitions and Comments CI/CD Workflow Integration Model Stage F L E X I B L E D E P L O Y M E N T MLflow Models Built-In Batch Inference Built-In Streaming Analytics S E C U R I T Y A N D M A N A G E M E N T High Availability Automated Updates Role-Based Access Control|O P E N S O U R C E M L F L O W \\uf058 \\uf058 Self-hosted \\uf410 \\uf410 \\uf058 \\uf058 \\uf410 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf410 \\uf410 \\uf410 \\uf410 \\uf410|M A N A G E D M L F L O W O N D ATA B R I C K S \\uf058 \\uf058 Fully managed \\uf058 \\uf058 \\uf058 With remote execution \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058 \\uf058|\\\\n|---|---|---|\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0035068137, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/LP_2-primary-asset_standardizing-the-ml-lifecycle-ebook-databricks-0626120-v8.pdf\\\"}, \\\"id\\\": \\\"275743e1dc14a362c47022ab6134084c\\\"}]\"\n }\n ],\n \"role\": \"tool\",\n \"name\": \"User\"\n },\n {\n \"content\": \"You can use MLflow for versioning your model by utilizing the MLflow Model Registry, which provides a centralized place to manage and track different versions of your models. The Model Registry allows you to track multiple versions of a model, mark each version with a lifecycle stage (such as development, staging, or production), and manage who can approve changes to the model's stages. 
This enables you to have full visibility into which models exist, what stages they are in, and who has collaborated on and changed the deployment stages of a model, making it easier to manage your ML efforts.\",\n \"role\": \"assistant\",\n \"name\": \"Assistant\"\n }\n]\n----\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/databricks.mlflow.trace": "\"tr-df75a507fff24a89a1e45772ab840e22\"", + "text/plain": [ + "Trace(request_id=tr-df75a507fff24a89a1e45772ab840e22)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "second_turn = {'messages': output['messages'] + [{\"role\": \"user\", \"content\": \"How can I use it for versioning my model?\"}]}\n", + "\n", + "# Run the Agent again with the prior message history plus the new user question to continue the conversation\n", + "second_turn_output = agent.predict(model_input=second_turn)\n", + "\n", + "print(f\"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}\")\n", + "print(f\"Agent's final response:\\n----\\n{second_turn_output['content']}\\n----\")\n", + "print()\n", + "print(f\"Agent's full message history (useful for debugging):\\n----\\n{json.dumps(second_turn_output['messages'], indent=2)}\\n----\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b821e6d5-71f7-44c7-a92b-7842ffa647bc", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "#### ✅✏️ 🅱 Evaluate the Agent using your evaluation set\n", + "\n", + "Note: If you do not have an evaluation set, you can create a synthetic evaluation set by using the 03_create_synthetic_eval notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e49c12cc-a0b6-49fc-b339-6794ca9bfb42", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/12/18 17:43:29 WARNING mlflow.utils.autologging_utils: MLflow autogen autologging is known to be compatible with 0.2.36 <= autogen-agentchat <= 0.2.39, but the installed version is 0.2.40. If you encounter errors during autologging, try upgrading / downgrading autogen-agentchat to a compatible version, or try upgrading MLflow.\n/root/.ipykernel/10605/command-2822477370486938-337563765:30: FutureWarning: ``mlflow.models.rag_signatures.StringResponse`` is deprecated. This method will be removed in a future release. Use ``mlflow.types.llm.ChatCompletionResponse`` instead.\n outputs=StringResponse()\nWARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\nWARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. 
Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:43:36] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat can you help me with?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nI can help you with searching product documentation or translating old SKUs to new ones. If you have a specific question about a product, I can try to find the relevant documentation for you. Alternatively, if you have an old SKU in the format \"OLD-XXX-YYYY\" that you'd like to translate to the new format \"NEW-YYYY-XXX\", I can assist you with that as well. 
What do you need help with?\n\n--------------------------------------------------------------------------------\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/vnd.jupyter.widget-view+json": { + "model_id": "6f3397464ebf44488749935c4858aa1d", + "version_major": 2, + "version_minor": 0 + }, + "text/plain": [ + "Uploading artifacts: 0%| | 0/55 [00:00>>>>>>> USING AUTO REPLY...\u001B[0m\n[autogen.oai.client: 12-18 17:44:12] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are the key features and advantages of the lakehouse pattern?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhy is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are two key challenges mentioned for predictive maintenance in government agencies?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat new opportunities can data sharing create for organizations looking to generate additional revenue?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are some of the common problems faced by data lakes according to the document?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\nWhy is real-time data crucial for retail operations, and what problems do legacy systems 
cause?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_ea4bbe37-71b3-459d-ad21-239c8d95968f): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"lakehouse pattern features and advantages\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_ea4bbe37-71b3-459d-ad21-239c8d95968f) *****\u001B[0m\n[{\"page_content\": \"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\n\\nKey Use Cases for Insurance:\\n\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\n\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\n\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\n\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\", \"metadata\": {\"similarity_score\": 0.00323427, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\"}, \"id\": \"5014f5f2c09c55edb470c8b5528eb000\"}, {\"page_content\": \"In short, a lakehouse is a data architecture that combines the best elements\\nof data warehouses and data lakes. 
Lakehouses are enabled by a new system\\ndesign, which implements similar data structures and data management features\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\n\\n\\n-----\\n\\n##### Data lakehouse\\n\\nOne platform to unify all your data, analytics and AI workloads\\n\\n###### Lakehouse Platform\\n\\nAll machine learning, SQL,\\nBI, and streaming use cases\\n\\nOne security and governance\\napproach for all data assets\\non all clouds\\n\\n\\n-----\\n\\n**Key features for a lakehouse**\\n\\nRecent innovations with the data lakehouse architecture can help simplify\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\nthe kind of flexibility and openness that allows your organization to stay agile\\nas you scale. Here are key features to consider when evaluating data lakehouse\\narchitectures:\\n\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\nConsistency, Isolation and Durability) transactions ensures consistency as\\nmultiple parties concurrently read or write data.\\n\\nSchema enforcement and governance: The lakehouse should have\\na way to support schema enforcement and evolution, supporting data\\nwarehouse schema paradigms such as star/snowflake. The system should\\nbe able to reason about data integrity, and it should have robust governance\\nand auditing mechanisms.\\n\\nData governance: Capabilities including auditing, retention and lineage\\nhave become essential, particularly considering recent privacy regulations.\\n\\nTools that allow data discovery have become popular, such as data catalogs\\nand data usage metrics.\\n\\nBI support: Lakehouses allow the use of BI tools directly on the source\\ndata. This reduces staleness and latency, improves recency and lowers cost\\nby not having to operationalize two copies of the data in both a data lake\\nand a warehouse.\\n\\n\\nStorage decoupled from compute: In practice, this means storage and\\ncompute use separate clusters, thus these systems can scale to many more\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\nhave this property.\\n\\nOpenness: The storage formats, such as Apache Parquet, are open and\\nstandardized, so a variety of tools and engines, including machine learning\\nand Python/R libraries, can efficiently access the data directly.\\n\\nSupport for diverse data types (unstructured and structured):\\nThe lakehouse can be used to store, refine, analyze and access data types\\nneeded for many new data applications, including images, video, audio,\\nsemi-structured data and text.\\n\\nSupport for diverse workloads: Use the same data repository for a range\\nof workloads including data science, machine learning and SQL analytics.\\nMultiple tools might be needed to support all these workloads.\\n\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\nSupport for streaming eliminates the need for separate systems dedicated to\\nserving real-time data applications.\\n\\n**Learn more**\\n\\n**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\n\\n**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\n\\n**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\n\\n\\n-----\\n\\n**CHAPTER**\\n\\n# 02\\n\\n\\n### The Databricks Lakehouse Platform\\n\\n\\n-----\\n\\n#### Lakehouse: A new generation of open platforms\\n\\n\\n###### This is the lakehouse paradigm\", \"metadata\": {\"similarity_score\": 0.0029213156, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\"}, \"id\": \"9cabb87127bfa514fa6f498e9f2831e7\"}, {\"page_content\": \"versioning, governance, security and ACID properties that are needed even for\\n\\nunstructured data.\\n\\n\\nstored procedures are available, but users may need to employ other mechanisms that\\n\\n\\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\\n\\nimportant for “lift and shift scenarios,” which require systems that achieve semantics\\n\\nthat are almost identical to those of older, commercial data warehouses.\\n\\nWhat about support for other types of data applications? Users of a lakehouse have\\n\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\n\\nlibraries) for non-BI workloads like data science and machine learning. Data\\n\\nexploration and refinement are standard for many analytic and data science\\n\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\n\\n\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\n\\nsystems (such as data warehouses) that have years of investments and real-\\n\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\n\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\n\\nconnectors to popular tools so they can appeal to a variety of personas. These\\n\\nand other issues will be addressed as the technology continues to mature and\\n\\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\\n\\nproperties of being simpler, more cost-efficient and more capable of serving\\n\\ndiverse data applications.\\n\\n\\ndata in their lakehouse until it is ready for consumption.\\n\\n\\n-----\\n\\n**Diving Deep Into the Inner Workings**\\n**of the Lakehouse and Delta Lake**\\n\\n### CHAPTER 02\\n\\n\\n-----\\n\\n**Diving Deep Into the**\\n**Inner Workings of the**\\n**Lakehouse and Delta Lake**\\n# 02\\n\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\n\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\n\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\n\\ndata architecture, some people thought the lakehouse is the same thing as\\n\\nthe data lake. Recently, several of our engineers and founders wrote a research\\n\\npaper that describes some of the core technological challenges and solutions that\\n\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\n\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\n\\ncan read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\n\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\\n\\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\\n\\nthey would have said faster horses.” The crux of this statement is that people often\\n\\nenvision a better solution to a problem as an evolution of what they already know\\n\\nrather than rethinking the approach to the problem altogether. In the world of data\\n\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\n\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\n\\n\\n-----\\n\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\n\\nobject stores like Amazon S3 have become some of the largest and most cost-\\n\\neffective storage systems in the world, which makes them an attractive platform to\\n\\nstore data warehouses and data lakes. However, their nature as key-value stores\\n\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\", \"metadata\": {\"similarity_score\": 0.0027414565, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\"}, \"id\": \"b1f28e2afb30602c0205684eb65002df\"}, {\"page_content\": \"Current lakehouses reduce cost, but their performance can still lag specialized\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\nconnectors to popular tools so they can appeal to a variety of personas. These\\nand other issues will be addressed as the technology continues to mature and\\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\\nproperties of being simpler, more cost-efficient and more capable of serving\\ndiverse data applications.\\n\\n\\n-----\\n\\n**Diving Deep Into the**\\n**Inner Workings of the**\\n**Lakehouse and Delta Lake**\\n\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\ndata architecture, some people thought the lakehouse is the same thing as\\nthe data lake. Recently, several of our engineers and founders wrote a research\\npaper that describes some of the core technological challenges and solutions that\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\ncan read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\n[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\n\\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\\nthey would have said faster horses.” The crux of this statement is that people often\\nenvision a better solution to a problem as an evolution of what they already know\\nrather than rethinking the approach to the problem altogether. In the world of data\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\n\\n\\n-----\\n\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\nstore data warehouses and data lakes. However, their nature as key-value stores\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\nand limited consistency guarantees.\\n\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\n\\n**1. Data lakes**\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\nof objects, typically in columnar format such as Apache Parquet. It’s an attractive\\napproach because the table is just a group of objects that can be accessed from\\na wide variety of tools without a lot of additional data stores or systems. However,\\nboth performance and consistency problems are common. Hidden data corruption\\nis common due to failed transactions, eventual consistency leads to inconsistent\\nqueries, latency is high, and basic management capabilities like table versioning and\\naudit logs are unavailable.\\n\\n**2. Custom storage engines**\\nThe second approach is custom storage engines, such as proprietary systems built for\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\nservice that’s able to provide a single source of truth. 
However, all I/O operations need\\nto connect to this metadata service, which can increase cloud resource costs and\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\ndata because these systems are generally optimized for traditional structured\\n\\n\\n-----\\n\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\ncustomers into a specific service provider, leaving customers to contend with\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\nadopt a new approach later.\", \"metadata\": {\"similarity_score\": 0.002695809, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\"}, \"id\": \"8375eac494bff392a37d6dff7c40c1b1\"}, {\"page_content\": \"- **\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\n\\n\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\nwarehouses.\\n\\nThe need for a flexible, high-performance system hasn’t abated. Companies\\nrequire systems for diverse data applications including SQL analytics, real-time\\nmonitoring, data science and machine learning. Most of the recent advances in\\nAI have been in better models to process unstructured data (text, images, video,\\naudio), but these are precisely the types of data that a data warehouse is not\\noptimized for.\\n\\nA common approach is to use multiple systems — a data lake, several data\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\nand image databases. Having a multitude of systems introduces complexity and,\\nmore importantly, introduces delay as data professionals invariably need to move\\nor copy data between different systems.\\n\\n\\n-----\\n\\n**\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\n\\n- **\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\n\\n- **\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\nuse separate clusters, thus these systems are able to scale to many more\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\nthis property.\\n\\n- **\\u0007Openness:** The storage formats they use are open and standardized, such as\\nParquet, and they provide an API so a variety of tools and engines, including\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\n\\n- **\\u0007Support for diverse data types ranging from unstructured to structured data:**\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\nfor many new data applications, including images, video, audio, semi-structured\\ndata, and text.\\n\\n- **\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\nrely on the same data repository.\\n\\n- **\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\nSupport for streaming eliminates the need for separate systems dedicated to\\nserving real-time data applications.\\n\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\nfeatures. Tools for security and access control are basic requirements. Data governance\\ncapabilities including auditing, retention and lineage have become essential particularly\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\nfeatures only need to be implemented, tested and administered for a single system.\\n\\n\\n-----\\n\\n**Read the research**\\n**Delta Lake: High-Performance ACID**\\n**Table Storage Over Cloud Object Stores**\", \"metadata\": {\"similarity_score\": 0.0025942351, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\"}, \"id\": \"accf6ad13717062292245537ffbd0249\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"NLP tasks and libraries percentage\", \"filters\": [{ \"key\": \"category\", \"value\": \"NLP\" }, { \"key\": \"type\", \"value\": \"library\" }] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6) *****\u001B[0m\nError: 'field'\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb) *****\u001B[0m\n[{\"page_content\": \"Al-Futtaim’s focus is to harness their data to improve all areas of the\\nbusiness, from streamlining the supply chain 
to optimizing marketing\\nstrategies. But with the brands capturing such a wide variety of data,\\nAl-Futtaim’s legacy systems struggled to provide a single view into\\nthe customer due to data silos and the inability to scale efficiently to\\nmeet analytical needs.\\n\\n\\n-----\\n\\nThe personalization of customer experiences will remain a key focus for B2C\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\nlong-established players.\\n\\n**Focus on the customer journey**\\n\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\n\\n**C A S E S T U DY**\\n\\n**Personalizing the beauty product shopping experience**\\n\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\nbeauty product destination in Europe. However, they struggled with\\nmassive volumes of streaming data and with infrastructure complexity\\nthat was resource-intensive and costly to scale. See how they used\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\n40% and increase net order income.\\n\\nGet the full story\\n\\n\\n¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\\nExperience Performance Index in 2007-09.\\n\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\n\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\n\\n\\n-----\\n\\nCareful consideration of how customers interact with various assets — and how\\nthese interactions may be interpreted as expressions of preference — can unlock\\na wide range of data that enables personalization.\\n\\n\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\nlimited pilots and customer response assessments. And in those assessments,\\nit’s important to keep in mind that there is no expectation of perfection — only\\nincremental improvement over the prior solution.\\n\\n\\n**C A S E S T U DY**\\n\\n**Need help generating personalized**\\n**recommendations?**\\n\\n\\n**Connecting shoppers to savings with data-driven**\\n**personalization‌**\\n\\n\\nUse the **Recommendation Engines Accelerator** to estimate\\ncustomers’ potential receptiveness to an offer or to\\ncontent related to a subset of products. 
Using these scores,\\nmarketers can determine which of the many messages at\\ntheir disposal should be presented to a specific customer.\\n\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\", \"metadata\": {\"similarity_score\": 0.0032405849, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\"}, \"id\": \"0473e2deba8639930389964be7b25bc7\"}, {\"page_content\": \"**4.** **Streamlining Customer Analysis and Targeting**\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\n\\n**5.** **Assessing Consumer Interest Data**\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\n\\n**6.** **Delivering Personalized Customer Journeys**\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\n\\n**Conclusion**\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\n\\n\\n-----\\n\\n### Introduction\\n\\nIn today’s experience-driven world, the most beloved brands are the ones that\\nknow their customers. Customers are loyal to brands that recognize their needs\\nand preferences — and tailor user journeys and engagements accordingly.\\n\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\nbuying from a brand that personalizes the shopping and user experience to the\\nwants and needs of the customer. And as organizations pursue omnichannel\\nexcellence, these same high expectations of online experiences also extend to\\nbrick-and-mortar locations — revealing for many merchants that personalized\\nengagement is fundamental to attracting customers and expanding share of wallet.\\n\\nBut achieving a 360-degree view of your customers to serve personalized\\nexperiences requires integrating various types of data — including demographics,\\nbehavioral and transactional — to develop robust profiles. This guide focuses on six\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\ncustomer experiences at every touch point.\\n\\n\\n# 76%\\n\\nof consumers are more\\nlikely to purchase due to\\npersonalization\\n\\n\\n# 76%\\n\\n\\n-----\\n\\n### Building a Foundation for Personalization\\n\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\n\\n\\nTo create truly personalized interactions, you need actionable insights\\nabout your customers. 
Start by establishing a common customer profile and\\naccurately linking together customer records across disparate data sets.\\n\\nGet a 360-degree view of your target customer by bringing together:\", \"metadata\": {\"similarity_score\": 0.0031753962, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\"}, \"id\": \"d53c2a5c69cef5febfa62ea961c33d25\"}, {\"page_content\": \"Customer\\n\\n\\nUse the **Propensity Scoring Accelerator** to estimate\\ncustomers’ potential receptiveness to an offer or to\\ncontent related to a subset of products. Using these scores,\\nmarketers can determine which of the many messages at\\ntheir disposal should be presented to a specific customer.\\n\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\n\\n\\nDownstream\\nApplications\\n\\n\\nA three-part propensity scoring workflow.\\n\\n\\n-----\\n\\n### Delivering Personalized Customer Journeys\\n\\nStrategies for crafting a real-time recommendation engine\\n\\n\\nAs the economy continues to weather unpredictable disruptions, shortages and\\ndemand, delivering personalized customer experiences at speed and scale will\\nrequire adaptability on the ground and within a company’s operational tech stack.\\n\\n\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\nstrategy and operations, allowing them to create a “golden customer\\nrecord” that improves all decision-making from forecasting demand to\\npowering their global loyalty program.\\n\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\n\\n\\n**C A S E S T U DY**\\n\\n\\n“Databricks Lakehouse allows every division in our\\norganization — from automotive to retail — to gain\\na unified view of our customer across businesses.\\nWith these insights, we can optimize everything from\\nforecasting and supply chain, to powering our loyalty\\nprogram through personalized marketing campaigns,\\ncross-sell strategies and offers.”\\n\\n**D M I T R I Y D O V G A N**\\nHead of Data Science, Al-Futtaim Group\\n\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\nsafety and community, brands most attuned to changing needs and sentiments\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. 
While some segments gained\\nbusiness and many lost, organizations that had already begun the journey toward\\nimproved customer experience saw better outcomes, closely mirroring patterns\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\\n\\n\\n**Creating a unified view across 200+ brands**\\n\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\nimpacts the lives of millions of people across the region through the\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\nHardware and Marks & Spencer.\\n\\nAl-Futtaim’s focus is to harness their data to improve all areas of the\\nbusiness, from streamlining the supply chain to optimizing marketing\\nstrategies. But with the brands capturing such a wide variety of data,\\nAl-Futtaim’s legacy systems struggled to provide a single view into\\nthe customer due to data silos and the inability to scale efficiently to\\nmeet analytical needs.\\n\\n\\n-----\\n\\nThe personalization of customer experiences will remain a key focus for B2C\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\nlong-established players.\\n\\n**Focus on the customer journey**\", \"metadata\": {\"similarity_score\": 0.0028500317, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\"}, \"id\": \"8f4f8bec235a7c063f9b4a7b7ec6ef4b\"}, {\"page_content\": \"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\n\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\n\\n\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\r)\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\r)\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\r)\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\r)\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\r)\\n\\n\\n**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\\n\\n\\n-----\\n\\n**Product distribution:**\\n**segmentation and personalization**\\n\\nThe most forward-thinking and data-driven insurers are\\nfocused on achieving personalization at scale. They are\\nexploring new partnerships and business models to create\\nintegrated, value-added experiences that prioritize the\\noverall health and financial wellness of their customers,\\nrather than just their insurance needs. 
These insurers\\nare investing in new data sources, analytics platforms,\\nand artificial intelligence (AI)-powered decision engines\\nthat enable them to connect producers with like-minded\\ncustomers or engage customers with enticing offers\\nand actionable steps based on their previous choices.\\nThe outcome is more efficient and effective service\\nfrom producers, trusted and convenient interactions for\\nconsumers, and increased customer engagement and\\ngrowth for insurers in an increasingly digital-oriented world.\\n\\n\\n**Customer/Partner Successes**\\n\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\n\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\n360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\n\\nWith Persona 360, you can:\\n\\n**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\\n1,695+ attributes and segments\", \"metadata\": {\"similarity_score\": 0.002557174, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\"}, \"id\": \"2bc1a24e9f2f35f29d6f23452045b7f7\"}, {\"page_content\": \"-----\\n\\nCareful consideration of how customers interact with various assets — and how\\nthese interactions may be interpreted as expressions of preference — can unlock\\na wide range of data that enables personalization.\\n\\n\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\nlimited pilots and customer response assessments. And in those assessments,\\nit’s important to keep in mind that there is no expectation of perfection — only\\nincremental improvement over the prior solution.\\n\\n\\n**C A S E S T U DY**\\n\\n**Need help generating personalized**\\n**recommendations?**\\n\\n\\n**Connecting shoppers to savings with data-driven**\\n**personalization‌**\\n\\n\\nUse the **Recommendation Engines Accelerator** to estimate\\ncustomers’ potential receptiveness to an offer or to\\ncontent related to a subset of products. Using these scores,\\nmarketers can determine which of the many messages at\\ntheir disposal should be presented to a specific customer.\\n\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\n\\n\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\nso consumers get deals and discounts without clipping coupons. Siloed\\ncustomer data sources once made getting insights difficult. 
Now with\\nDatabricks, Flipp’s data teams can access and democratize data, helping\\nthem do their jobs more effectively while bringing better deals to users,\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\nbrick-and-mortar retailers.\\n\\nGet the full story\\n\\nThe engines we use to serve content based on customer preferences are known\\nas recommenders. With some recommenders, a heavy focus on the shared\\npreferences of similar customers helps define what recommendations will actually\\nmake an impact. With others, it can be more useful to focus on the properties of\\nthe content itself (e.g., product descriptions).\\n\\n\\n-----\\n\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\n\\n\\nProviding deep, effective personalized experiences to customers depends\\non a brand’s ability to intelligently leverage consumer and market data from a\\nwide variety of sources to fuel faster, smarter decisions — without sacrificing\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\nexactly that, offering a scalable data architecture that unifies all your data,\\nanalytics and AI to deliver unforgettable customer experiences.\\n\\nCreated on open source and open standards, Databricks offers a robust\\nand cost-effective platform for brands to collaborate with partners, clients,\\nmanufacturers and distributors to unleash more innovation and efficiencies\\nat every touch point. Businesses can rapidly ingest available data in real time,\\n\\n\\nat scale, and create accessible, data-driven insights that enable actionable\\nstrategies across the value chain.\\n\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\ntheir company’s operational health and the evolving needs of their customers\\n— all while empowering teams to easily unify data efforts, perform fine-grained\\nanalyses and streamline cross-functional data operations using a single,\\nsophisticated solution.\\n\\n\\n###### Learn more about Databricks Lakehouse for industries\\n like Retail & Consumer Goods, Media & Entertainment\\n and more at databricks.com/solutions\\n\\n\\n-----\\n\\n### About Databricks\\n\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\\n\\nincluding Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\\n\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\n\\nis headquartered in San Francisco, with offices around the globe. Founded by\\n\\nthe original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\\n\\na mission to help data teams solve the world’s toughest problems. 
To learn more,\n\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\", \"metadata\": {\"similarity_score\": 0.0025465384, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\"}, \"id\": \"8e054539e38c8a49888991a85b178399\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nI'm sorry, I can't help you with that.\n\n--------------------------------------------------------------------------------\n" + ] + },
+ { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:opentelemetry.sdk.trace:Calling end() on an ended span.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mAssistant\u001B[0m (to User):\n\nAccording to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\n\n--------------------------------------------------------------------------------\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:19] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat is the issue with the provided document?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nI'm sorry, I can't help you with that.\n\n--------------------------------------------------------------------------------\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat significant advancement in large language model development happened in 2012?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + },
+ { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:21] {509} INFO 
- Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:21] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:21] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:21] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:21] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"digital twin technology automobile manufacturing profit margins reduction manufacturing time\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat types of metrics are tracked in game telemetry to understand player behavior and improve the game 
experience?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m\u001B[33mAssistant\u001B[0m (to User):\n\n (to Assistant):\n\nWhat are the responsibilities of a Data Engineer according to the document?\nI'm sorry, I can't help you with that.\n--------------------------------------------------------------------------------\n\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"data pipeline challenges for data lakes\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n[autogen.oai.client: 12-18 17:44:22] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:22] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:22] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are the benefits of Delta Sharing in terms of data accessibility and platform compatibility?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nThe key features of the lakehouse pattern include:\n\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\n4. 
Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\n\nThe advantages of the lakehouse pattern include:\n\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\n\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\n\n--------------------------------------------------------------------------------\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b) *****\u001B[0m\n[{\"page_content\": \"**Declarative ETL pipelines**\\nData engineers can reduce development time and effort and instead focus on\\nimplementing business logic and data quality checks within the data pipeline\\nusing SQL or Python. This can be achieved by:\\n\\n**•** Using intent-driven declarative development to simplify “how” and\\ndefine “what” to solve\\n\\n**•** Automatically creating high-quality lineage and managing table\\ndependencies across the data pipeline\\n\\n**•** Automatically checking for missing dependencies or syntax errors,\\nand managing data pipeline recovery\\n\\n**Real-time data processing**\\nAllow data engineers to tune data latency with cost controls without the\\nneed to know complex stream processing or implement recovery logic.\\n\\n**•** Avoid handling batch and real-time streaming data sources separately\\n\\n**•** Execute data pipeline workloads on automatically provisioned elastic\\nApache Spark™-based compute clusters for scale and performance\\n\\n**•** Remove the need to manage infrastructure and focus on the business\\nlogic for downstream use cases\\n\\n\\n-----\\n\\n**Unified orchestration of data workflows**\\nSimple, clear and reliable orchestration of data processing tasks for data,\\nanalytics and machine learning pipelines with the ability to run multiple\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\nlakehouse with no need to maintain or pay for an external orchestration service.\\n\\n**•** Easily create and manage multiple tasks with dependencies via UI,\\nAPI or from your IDE\\n\\n**•** Have full observability to all workflow runs and get alerted when\\ntasks fail for fast troubleshooting and efficient repair and rerun\\n\\n**•** Leverage high reliability of 99.95% uptime\\n\\n**•** Use performance optimization clusters that parallelize jobs and\\nminimize data movement with cluster reuse\\n\\n**Data quality validation and monitoring**\\nImprove data reliability throughout the data lakehouse so data teams can\\nconfidently trust the information for downstream initiatives by:\\n\\n**•** Defining data quality and integrity controls within the pipeline\\nwith defined data expectations\\n\\n**•** Addressing data quality errors with predefined policies\\n(fail, drop, alert, quarantine)\\n\\n**•** Leveraging the data quality metrics that are captured, tracked\\nand reported for the entire data pipeline\\n\\n\\nData\\nSources\\n\\nData\\nWarehouses\\n\\nOn-premises\\nSystems\\n\\nSaaS\\nApplications\\n\\nMachine &\\nApplication Logs\\n\\nApplication\\nEvents\\n\\nMobile & IoT\\nData\\n\\n\\nCloud\\nStorage\\n\\nMessag\\ne Buses\\n\\n\\n**Lakehouse Platform**\\n\\n**Workflows** for end-to-end orchestration\\n\\n\\nReal-Time BI Apps\\n\\nReal-Time AI Apps\\n\\n\\nReal-Time Analytics with\\n**Databricks SQL**\\n\\nReal-Time Machine Learning\\nwith\\n**Databricks ML**\\n\\n\\nStreaming ETL with\\n**Delta Live Tables**\\n\\n\\nPredictive\\nMaintenance\\n\\n\\nPersonalized\\nOffers\\n\\n\\nPatient\\nDiagnostics\\n\\n\\nReal-Time Operational\\nApps\\n\\n\\nReal-Time Applications with\\n**Spark Structured Streaming**\\n\\n**Photon** for lightning-fast data processing\\n\\n**Unity Catalog** for data governance and sharing\\n\\n**Delta Lake** for open and reliable data storage\\n\\n\\nAlerts Detection Fraud\\n\\n\\nDynamic\\nPricing\\n\\n\\n©2023 Databricks Inc. 
— All rights reserved\\n\\nFigure 2\\nA unified set of tools for real-time data processing\\n\\n\\n-----\\n\\n**Fault tolerant and automatic recovery**\\nHandle transient errors and recover from most common error conditions\\noccurring during the operation of a pipeline with fast, scalable automatic\\nrecovery that includes:\\n\\n**•** Fault tolerant mechanisms to consistently recover the state of data\\n\\n**•** The ability to automatically track progress from the source with\\ncheckpointing\\n\\n**•** The ability to automatically recover and restore the data pipeline state\", \"metadata\": {\"similarity_score\": 0.004132444, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\"}, \"id\": \"d85d526722f3ca9735bc45d98a9ad449\"}, {\"page_content\": \"# Building Reliable Data Lakes at Scale With Delta Lake\\n\\n\\n-----\\n\\n## Contents\\n\\n#### Data Engineering Drivers 2\\n\\n Data Pipeline Key Goals 4\\n\\n Apache Spark™: The First Unified Analytics Engine 5\\n\\n Data Reliability Challenges With Data Lakes 6\\n\\n Delta Lake: A New Storage Layer 7\\n\\n Delta Lake: Key Features 8\\n\\n Getting Started With Delta Lake 10\\n\\n\\n-----\\n\\n## Drivers\\n\\n#### Data Engineering Drivers\\n\\nData engineering professionals are needing to respond to several different drivers.\\n\\nChief among the drivers they face are:\\n\\n**Rise of Advanced Analytics** — Advanced analytics, including methods\\n\\nbased on machine learning techniques, have evolved to such a degree that\\n\\norganizations seek to derive far more value from their corporate assets.\\n\\n**Widespread Adoption** — Once the province of leading edge, high-tech\\n\\ncompanies, these advanced approaches are being adopted across a\\n\\nmultitude of industries from retail to hospitality to healthcare and across\\n\\nprivate as well as public sector organizations. This is further driving the need\\n\\nfor strong data engineering practices.\\n\\n**Regulation** — With the growth of data generation and data collection,\\n\\nthere is increased interest in how the data is protected and managed.\\n\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\n\\nfrom the EU and other jurisdictions mandate very specific ways in which\\n\\ndata must be managed.\\n\\n\\n-----\\n\\n## Drivers\\n\\n**Technology Innovation** — The move to cloud-based analytics architectures\\n\\nthat is now well underway is being propelled further by innovations such as\\n\\nanalytics-focused chipsets, pipeline automation and the unification of data\\n\\nand machine learning. All these offer data professionals new approaches for\\n\\ntheir data initiatives.\\n\\n**Financial Scrutiny** — With a growth in investment, analytics initiatives are\\n\\nalso subject to increasing scrutiny. There is also a greater understanding of\\n\\ndata as a valuable asset. 
Deriving value from data must be done in a manner\\n\\nthat is financially responsible and actually value adding to the enterprise and\\n\\nmeeting ROI hurdles.\\n\\n**Role Evolution** — Reflecting the importance of managing the data and\\n\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\n\\nmore prominent and newer roles such as Data Curator are emerging.\\n\\nThey must balance the needs of governance, security and democratization.\\n\\n\\n-----\\n\\n## Key Goals\\n\\n#### Data Pipeline Key Goals\\n\\nMaking quality data available in a reliable manner is a major determinant of success for data\\n\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\n\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\n\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\n\\ndesign and build their data pipelines.\\n\\nThree primary goals that data engineers typically seek to address as they work to enable the\\n\\nanalytics professionals in their organizations are:\\n\\n**Deliver quality data in less time** — When it comes to data, quality and timeliness\\n\\nare key. Data with gaps or errors (which can arise for many reasons) is\\n\\n“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\\n\\nusers. Equally well, many applications require up-to-date information (who\\n\\nwants to use last night’s closing stock price or weather forecast) and are of\\n\\nlimited value without it.\\n\\n**Enable faster queries** — Wanting fast responses to queries is natural enough\\n\\nin today’s “New York minute,” online world. Achieving this is particularly\\n\\ndemanding when the queries are based on very large data sets.\\n\\n**Simplify data engineering at scale** — It is one thing to have high reliability and\\n\\nperformance in a limited, development or test environment. What matters\\n\\nmore is the ability to have robust, production data pipelines at scale without\\n\\nrequiring high operational overhead.\\n\\n\\n-----\", \"metadata\": {\"similarity_score\": 0.00411582, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\"}, \"id\": \"f6ef96d9f374de069754b3f8d671b16d\"}, {\"page_content\": \"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\n\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\\n\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\n\\n**Lakehouse — the modern data architecture**\\n\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\n\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\n\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\n\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\\n\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\n\\narchitecture possible.\\n\\n\\non all data on a simple, open and multicloud\\n\\nmodern data stack.\\n\\n\\n-----\\n\\n**Exploratory Data Scientist**\\n\\n\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\n\\n\\n**Curated Data Lake**\\n\\n\\n**Raw Data Ingest**\\n“Bronze”\\n\\n\\n**Filtered/Cleaned/Augmented**\\n“Silver”\\n\\n\\n**Business-Level Aggregates**\\n“Gold”\\n\\n\\n**D ATA Q U A L I T Y**\\n\\n**Data Sources (Batch and Real-Time)**\\n\\n\\n**Unstructured**\\n\\n- Image, Video, Audio\\n\\n- Free Text, Blob\\n\\n\\n**Semi-Structured**\\n\\n- Logs, Clickstream\\n\\n- CSV, JSON, XML\\n\\n\\n**Structured**\\n\\n- Systems of Record\\n\\n- Operational DBs\\n\\n\\n**Figure 8:**\\nThe building blocks for a modern data architecture\\n\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\n\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\n\\ntarget-state architecture supports loading all the data types that might be interesting to an organization —\\n\\nstructured, semi-structured and unstructured — and provides a single processing layer, using consistent\\n\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\n\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\n\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\n\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\n\\nThe architecture makes possible the efficient creation of “data assets” for the organization by taking a\\n\\nstepwise approach to improving data.\\n\\n\\n-----\\n\\n**Lakehouse key features**\\n\\nTo effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\\n\\navailable for stakeholders to run business-critical production workloads:\\n\\n\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\n\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\n\\nmonitoring and recovery.\\n\\n\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\n\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\n\\nread or write data, typically using SQL.\\n\\n\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\n\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\n\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\n\\n\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\n\\nlakes across clouds — based on the ANSI SQL open standards. 
The lakehouse enables organizations\\n\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\", \"metadata\": {\"similarity_score\": 0.004092816, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\"}, \"id\": \"a6c4aa57b347d46b3d74ce86a7176024\"}, {\"page_content\": \"##### The Delta Lake Series Complete Collection\\n\\n\\n-----\\n\\n### What is Delta Lake?\\n\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\ncompatible with Apache Spark™ APIs.\\n\\nAt Databricks, we’ve seen how Delta Lake can bring reliability, performance and\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\nmodifying data for data capture.\\n\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\nyour data lake and the rate that teams can leverage that data with a secure and\\nscalable cloud service.\\n\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\n\\n\\n-----\\n\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\n\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\n\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\n\\nPerformance Matter **you’ll find inside** 5 Features 22\\n\\n\\n\\nProcesses Petabytes With Data Skipping and Z-Ordering\\n\\n\\nRollbacks 39\\n\\nPinned view of a continuously updating\\n\\nDelta Lake table across multiple downstream jobs\\n\\nQueries for time series analytics made simple\\n\\nEasily Clone Your Delta Lake\\n\\nfor Testing, Sharing and ML\\n\\nReproducibility 41\\n\\nWhat are clones? 41\\n\\n\\nA lakehouse combines the best elements\\n\\nof data lakes and data warehouses 52\\n\\nSome early examples 55\\n\\nFrom BI to AI 55\\n\\nDiving Deep Into the\\n\\nInner Workings of the Lakehouse and Delta Lake 56\\n\\n1. Data lakes 57\\n\\n2. Custom storage engines 57\\n\\n\\nCreating the Dashboard /\\n\\nVirtual Network Operation Centers 82\\n\\nCreating (near) real-time alerts 85\\n\\nNext steps: machine learning 86\\n\\nPoint-of-failure prediction and remediation 87\\n\\nCustomer churn 87\\n\\nGetting started with the Databricks streaming video QoS solution 87\\n\\nCustomer Use Cases 88\\n\\nHealthdirect Australia 89\\n\\nData quality and governance issues, silos, and the inability to scale 89\\n\\n\\nFundamentals & Performance\\n\\n\\nUsing data skipping and Z-Order clustering 21\\n\\n\\nThe Fundamentals of Delta Lake: Why Reliability and\\n\\n\\nExploring the details 21\\n\\n\\nPerformance Matter\\n\\n\\nFeatures\\n\\n\\nChallenges with data lakes\\n\\nDelta Lake’s key functionalities\\n\\nUnpacking the Transaction Log\\n\\nImplementing atomicity to ensure\\n\\n\\nWhy Use MERGE\\n\\nWith Delta Lake?\\n\\nWhen are upserts necessary? 
24\\n\\nWhy upserts into data lakes have\\n\\n\\noperations complete fully\\n\\n\\noperations complete fully 9\\n\\nDealing with multiple concurrent reads and writes **Chapter**\\n\\nTime travel, data lineage and debugging 10\\n\\nHow to Use Schema Enforcement and Evolution\\n\\nUnderstanding table schemas 11\\n\\n#### 01\\n\\n\\nFundamentals and Performance traditionally been challenging 25\\n\\n\\ntraditionally been challenging\\n\\n\\nShallow clones\\n\\nDeep clones\\n\\n\\n**Chapter**\\n\\n42\\n\\n42\\n\\n#### 04\\n\\n\\n3. Lakehouse\\n\\n\\nDealing with multiple concurrent reads and writes\\n\\n\\nIntroducing MERGE in Delta Lake\\n\\n\\nIn the research paper, the authors explain: 59\\n\\n\\n3. Lakehouse Streaming 58\\n\\n\\n\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\nand Performance Matter Deleting data due to GDPR 26\\n\\n\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\", \"metadata\": {\"similarity_score\": 0.0040403795, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\"}, \"id\": \"1b74eac4a063d67e5f727e36b040965b\"}, {\"page_content\": \"**•** Since data platforms continuously change, data engineers\\nspend time building and maintaining, and then rebuilding, complex\\nscalable infrastructure\\n\\n**•** As data pipelines become more complex, data engineers are\\nrequired to find reliable tools to orchestrate these pipelines\\n\\n**•** With the increasing importance of real-time data, low latency data\\npipelines are required, which are even more difficult to build and maintain\\n\\n**•** Finally, with all pipelines written, data engineers need to constantly\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\n\\n\\n**How can Databricks help?**\\n\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\nend-to-end data engineering solution for ingesting, transforming, processing,\\nscheduling and delivering data. The Lakehouse Platform automates the\\ncomplexity of building and maintaining pipelines and running ETL workloads\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\ndrive valuable insights.\\n\\nLakehouse Platform\\n\\n**One platform to support multiple personas**\\n\\n\\n**BI & Data**\\n**Warehousing**\\n\\n\\n**Data**\\n**Engineering**\\n\\n\\n**Data**\\n**Streaming**\\n\\n\\n**Data**\\n**Science & ML**\\n\\n\\n©2023 Databricks Inc. 
— All rights reserved\\n\\n\\n**Unity Catalog**\\n**Fine-grained governance for data and AI**\\n\\n**Delta Lake**\\n**Data reliability and performance**\\n\\n**Cloud Data Lake**\\n\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\n\\n\\nFigure 1\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\n\\n\\n-----\\n\\n**Key differentiators for successful data engineering**\\n**with Databricks**\\n\\nBy simplifying on a lakehouse architecture, data engineers need an\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\nTo be successful, a data engineering solution team must embrace these eight\\nkey differentiating capabilities:\\n\\n**Data ingestion at scale**\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\nanalytics, data science or machine learning. This includes:\\n\\n**•** Incrementally and efficiently processing data as it arrives\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\n\\n**•** Automatically inferring schema and detecting column\\nchanges for structured and unstructured data formats\\n\\n**•** Automatically and efficiently tracking data as it arrives with\\n\\nno manual intervention\\n\\n**•** Preventing data loss by rescuing data columns\\n\\n\\n**Declarative ETL pipelines**\\nData engineers can reduce development time and effort and instead focus on\\nimplementing business logic and data quality checks within the data pipeline\\nusing SQL or Python. This can be achieved by:\\n\\n**•** Using intent-driven declarative development to simplify “how” and\\ndefine “what” to solve\\n\\n**•** Automatically creating high-quality lineage and managing table\\ndependencies across the data pipeline\\n\\n**•** Automatically checking for missing dependencies or syntax errors,\\nand managing data pipeline recovery\\n\\n**Real-time data processing**\\nAllow data engineers to tune data latency with cost controls without the\\nneed to know complex stream processing or implement recovery logic.\\n\\n**•** Avoid handling batch and real-time streaming data sources separately\\n\\n**•** Execute data pipeline workloads on automatically provisioned elastic\\nApache Spark™-based compute clusters for scale and performance\\n\\n**•** Remove the need to manage infrastructure and focus on the business\\nlogic for downstream use cases\\n\\n\\n-----\", \"metadata\": {\"similarity_score\": 0.003983449, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\"}, \"id\": \"bf114a736c5b9b473f4e1c81c2bbaa5e\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d) *****\u001B[0m\n[{\"page_content\": \"Improve product quality\\n\\nReduce manufacturing costs\\n\\nReduce unplanned downtime\\n\\nIncrease throughput\\n\\nEnsure safe manufacturing\\n\\nTest new design ideas\\n\\nDevelop product enhancements\\n\\nDigital transformation of enterprise\\n\\nSpeed new product introduction\\n\\nReduce planned downtime\\n\\nMeet new regulatory challenges\\n\\nTraining for new 
manufacturing processes\\n\\nDesign changes to production line\\n\\nProvide service to end users customers\\n\\nUpdate products in the field\\n\\n\\n**34%**\\n\\n\\n**30%**\\n\\n**28%**\\n**25%**\\n\\n**24%**\\n\\n\\n**16%**\\n\\n**14%**\\n\\n**13%**\\n\\n**13%**\\n\\n**11%**\\n**10%**\\n\\n**8%**\\n**8%**\\n\\n\\nCan you imagine the cost to change\\nan oil refinery’s crude distillation\\nunit process conditions to improve\\nthe output of diesel one week\\nand gasoline the next to address\\nchanges in demand and ensure\\nmaximum economic value? Can you\\nimagine how to replicate an even\\nsimple supply chain to model risk?\\n\\n\\n**5%**\\n\\n\\n**1%**\\n\\n\\n-----\\n\\n### What Are Digital Twins?\\n\\n\\nKnowing the business challenges and benefits digital twins deliver, let’s turn to\\nthe basics and explore what digital twins are and how a modern data stack is\\nnecessary to build effective and timely digital twins. The classic definition of\\ndigital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.\n\n*** WARNING: max output size exceeded, skipping output. ***\n\nimilarity_score\": 0.003374363, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf\"}, \"id\": \"8c21a4f7b812c6f92740eeb06e59f417\"}, {\"page_content\": \"-----\\n\\n**• Cultural and organizational barriers** — These challenges can be\\nsummarized by one word: friction. Unfortunately, it’s a common problem\\nfor civil servants to struggle to obtain access to both internal and external\\ndata due to over-cumbersome processes, policies and outdated standards.\\nThe principles we are using to build our data platforms and our data sharing\\nplatforms have to be self-promoting, have to drive adoption and have to\\ngenerate habits that adhere to best practices.\\n\\nIf there is friction with standard adoption, the only way to ensure standards\\nare respected is by enforcement and that itself is yet another barrier to\\nachieving data sustainability. Organizations have already adopted Delta\\nSharing both in the private and public sectors. For example, [U.S. Citizenship](https://www.uscis.gov/)\\n[and Immigration Services](https://www.uscis.gov/) (USCIS) uses Delta Sharing to satisfy several\\n[interagency data-sharing](https://delta.io/blog/2022-12-08-data-sharing-across-government-delta-sharing/) requirements. Similarly, Nasdaq describes Delta\\nSharing as the “ [future of financial data sharing,](https://www.nasdaq.com/articles/delta-sharing-protocol%3A-the-evolution-of-financial-data-sharing-2021-05-26) ” and that future is open\\nand governed.\\n\\n\\n\\n**• Technical challenges** — Federation at the government scale or even\\nfurther across multiple industries and geographies poses technical\\nchallenges. Each organization within this federation owns its platform\\nand drives technological, architectural, platform and tooling choices.\\n\\nHow can we promote interoperability and data exchange in this vast,\\ndiverse technological ecosystem? The data is the only viable integration\\nvehicle. As long as the data formats we utilize are scalable, open and\\ngoverned, we can use them to abstract from individual platforms and\\ntheir intrinsic complexities.\\n\\nDelta format and Delta Sharing solve this wide array of requirements and\\nchallenges in a scalable, robust and open way. 
This positions Delta Sharing\\nas the strongest choice for unification and simplification of the protocol and\\nmechanism through which we share data across both private and public sectors.\\n\\n\\n-----\\n\\n**Data Sharing through data clean rooms**\\n\\n\\n[Data clean rooms](https://www.databricks.com/blog/2022/06/28/introducing-data-cleanrooms-for-the-lakehouse.html) address this particular need. With data clean rooms you can\\nshare data with third parties in a privacy-safe environment. With Unity Catalog ,\\nyou can enable fine-grained access controls on the data and meet your privacy\\nrequirements. In this architecture, the data participants never get access to\\nthe raw data. The only outputs from the clean rooms are those data assets\\ngenerated in a pre-agreed, governed and fully controlled manner that ensures\\ncompliance with the requirements of all parties involved.\\n\\nFinally, data clean rooms and Delta Sharing can address hybrid on-premise-offpremise deployments, where the data with the most restricted access remains\\non the premise. In contrast, less restricted data is free to leverage the power\\nof the cloud offerings. In said scenario, there may be a need to combine the\\npower of the cloud with the restricted data to solve advanced use cases where\\ncapabilities are unavailable on the on-premises data platforms. Data clean rooms\\ncan ensure that no physical data copies of the raw restricted data are created,\\nresults are produced within the clean room’s controlled environment and results\\nare shared back to the on-premises environment (if the results maintain the\\nrestricted access within the defined policies) or are forwarded to any other\\ncompliant and predetermined destination system.\\n\\n\\nTaking the complexities of data sharing within highly regulated space and the\\npublic sector one step further — what if we require to share the knowledge\\ncontained in the data without ever granting direct access to the source data to\\nexternal parties? These requirements may prove achievable and desirable where\\nthe data sharing risk appetite is very low.\\n\\nIn many public sector contexts, there are concerns that combining the data that\\ndescribes citizens could lead to a big brother scenario where simply too much\\ndata about an individual is concentrated in a single data asset. If it were to fall\\ninto the wrong hands, such a hypothetical data asset could lead to immeasurable\\nconsequences for individuals and the trust in public sector services could\\nerode. On the other hand, the value of a 360 view of the citizen could accelerate\\nimportant decision-making. It could immensely improve the quality of policies\\nand services provided to the citizens.\\n\\n\\n-----\\n\\n**Citizen value of data sharing**\", \"metadata\": {\"similarity_score\": 0.003356445, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\"}, \"id\": \"efc4fbee742e1cddfcd60a0586c59586\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_f76a7934-43b2-4114-975c-d0320f601492) *****\u001B[0m\n[{\"page_content\": \"advanced data, analytics, and AI use cases. 
For example,\\n\\nconcurrent player count plus store and marketplace data\\n\\n\\n**GAME TELEMETRY**\\n\\n\\n**Data Sources**\\n\\n**GAME SERVICES** **OTHER SOURCES**\\n\\n\\n-----\\n\\nand lifetime value. Usage telemetry combined with crash\\n\\nreporting and social media listening helps you more quickly\\n\\nuncover where players might be getting frustrated. And\\n\\ncorrelating chat logs, voice transcriptions, and or discord\\n\\n\\nthat are relevant and engaging to your players, giving you\\n\\ntools to effectively market and monetize with your audience.\\n\\n**Let’s start with Player Segmentation.**\\n\\n\\nand reddit forums can help you identify disruptive behavior\\n\\n\\nbefore it gets out of hand, giving you the tools to take\\n\\nactionable steps to mitigate toxicity within your community.\\n\\n**Get started and set up your Analytics Dashboard**\\n\\n### Understand your audience\\n\\nWith your analytics pipelines set up, the first area of focus is to\\n\\nbetter understand your audience. This can help you inform a\\n\\nvariety of key business decisions, from the highest macro order\\n\\nof “what game(s) to develop”, to how to market and monetize\\n\\nthose games, and how to optimize the player experience.\\n\\nBy understanding the demographics, preferences, and\\n\\nbehaviors of their audience, a game studio can create games\\n\\nthat are more likely to appeal to their target market and be\\n\\nsuccessful. You can also use this understanding to tailor your\\n\\nmarketing and monetization strategies to the needs and\\n\\npreferences of your players.\\n\\nAdditionally, understanding your audience can help you\\n\\n\\n##### Player Segmentation\\n\\n**Overview**\\n\\nPlayer segmentation is the practice of dividing players\\n\\ninto groups based on shared characteristics or behaviors.\\n\\nSegmentation has a number of benefits. You can better\\n\\nunderstand your players, create more personalized content,\\n\\nimprove player retention, and optimize monetization, all of\\n\\nwhich contributes to an improved player experience.\\n\\n**What we’re trying to solve/achieve**\\n\\nThe primary objective of segmentation is to ensure you’re\\n\\nnot treating your entire playerbase the exact same. Humans\\n\\nare different, and your players have different motivations,\\n\\npreferences and behaviors. Recognizing this and engaging\\n\\nwith them in a way that meets them where they’re at\\n\\nis one of the most impactful ways you can cultivate\\n\\nengagement with your game. As we mentioned above,\\n\\nthe benefits of segmentation are broad reaching. Through\\n\\nbetter understanding of your playerbase, you can better\\n\\npersonalize experiences, tailoring content and customer\\n\\nexperience to specific groups of players that increases\\n\\nengagement and satisfaction. Better understanding of\\n\\nyour players also helps in improving player retention. By\\n\\nidentifying common characteristics of players who are at\\n\\nrisk of churning (i.e., stopping play), you can develop targeted\\n\\nstrategies that only reach specific audiences.\\n\\nCreate advanced customer segments to build out more\\n\\neffective user stories, and identify potential purchasing\\n\\npredictions based on behaviors. Leverage existing sales\\n\\ndata, campaigns and promotions systems to create robust\\n\\nsegments with actionable behavior insights to inform your\\n\\nproduct roadmap. 
You can then use this information to build\\n\\nuseful customer clusters that are targetable with different\\n\\npromos and offers to drive more efficient acquisition and\\n\\ndeeper engagement with existing players.\\n\\n\\nidentify potential pain points or areas for improvement\\n\\n\\nwithin your games, allowing you to proactively make changes\\n\\n\\n**Get started with Player Segmentation**\\n\\n\\nto address these issues and improve the player experience\\n\\nbefore a player potentially churns.\\n\\n\\n-----\\n\\n**Overview**\\n\\nPlayer lifetime value (LTV) is a measure of the value that a\\n\\nplayer brings to a game over the lifetime they play that game.\", \"metadata\": {\"similarity_score\": 0.0037608068, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\"}, \"id\": \"9e966502682ec7b0fdb300a5e74e2770\"}, {\"page_content\": \"with different parts of the game.\\n\\n- **Game progress:** Monitor player progress through\\n\\ndifferent levels and milestones in the game.\\n\\n- **In-game purchases:** Track the number and value of\\n\\nin-game purchases made by players.\\n\\n- **Player demographics:** Collect demographic information\\n\\nabout players, such as age, gender, location, and device type.\\n\\n- **Session length:** Monitor the length of each player session,\\n\\nand how often players return to the game.\\n\\n- **Retention:** Track the percentage of players who return to\\n\\nthe game after their first session.\\n\\n\\n-----\\n\\nsuch as the types of actions taken, the number of deaths,\\n\\nand the use of power-ups.\\n\\n- **User Acquisition:** Track the number of new players\\n\\nacquired through different marketing channels.\\n\\n**2. Business KPIs**\\n\\nThe second bucket of data is business key performance\\n\\nindicators (or KPIs). Business KPIs are metrics that measure\\n\\nthe performance and success of a video game from a\\n\\nbusiness perspective. 
The primary data source for business\\n\\nKPIs include game telemetry, stores, and marketplaces.\\n\\nThese KPIs help game studios understand the financial and\\n\\noperational performance of their games and make informed\\n\\ndecisions about future development and growth.\\n\\nSome of the primary business metrics that are typically\\n\\ntracked include:\\n\\n- **Revenue:** Track the total revenue generated by the game,\\n\\nincluding sales of the game itself, in-game purchases,\\n\\nand advertising.\\n\\n- **Player Acquisition Cost (CAC):** Calculate the cost\\n\\nof acquiring a new player, including marketing and\\n\\nadvertising expenses.\\n\\n- **Lifetime Value (LTV):** Estimate the amount of revenue a\\n\\nplayer will generate over the course of their time playing\\n\\nthe game.\\n\\n- **Player Retention:** Track the percentage of players who\\n\\ncontinue to play the game over time, and how long they\\n\\nplay for.\\n\\n- **Engagement:** Measure the level of engagement of players\\n\\nwith the game, such as the number of sessions played,\\n\\ntime spent playing, and in-game actions taken.\\n\\n- **User Acquisition:** Track the number of new players\\n\\nacquired through different marketing channels and the\\n\\ncost of acquiring each player.\\n\\n- **Conversion Rate:** Measure the percentage of players who\\n\\nmake an in-game purchase or complete a specific action.\\n\\n- **Gross Margin:** Calculate the profit generated by the game\\n\\nafter subtracting the cost of goods sold, such as the cost\\n\\nof game development and server hosting.\\n\\n**3. Game Services**\\n\\nSimilar to game telemetry, game services provide critical\\n\\ninfrastructure that requires careful monitoring and management.\\n\\nThese services include things like game server hosting,\\n\\n\\nand more. Here the source of data is the game services used.\\n\\nSome of the common metrics game teams typically track for\\n\\nthese services include:\\n\\n- **Concurrent Players:** Track the number of players who are\\n\\nsimultaneously connected to the game servers to ensure\\n\\nthat the servers have enough capacity to handle the\\n\\nplayer demand.\\n\\n- **Server Availability:** Monitor the uptime and downtime of\\n\\nthe game servers to ensure that players have access to\\n\\nthe game when they want to play, particularly important\\n\\nfor global live service games where demand fluctuates\\n\\nthrought the day.\\n\\n- **Latency:** Measure the time it takes for data to travel\\n\\nfrom the player’s device to the game server and back,\\n\\nto ensure that players have a smooth and responsive\\n\\ngaming experience.\\n\\n- **Network Bandwidth:** Monitor the amount of data being\\n\\ntransmitted between the player’s device and the game\", \"metadata\": {\"similarity_score\": 0.0036846527, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\"}, \"id\": \"19649d3d466f3b25c73f87e08cb38e3b\"}, {\"page_content\": \"scientists, and data analysts form the heart of mature game\\n\\ndata teams. Though, depending on studio size and resources,\\n\\n\\nmaking sense of large amounts of data. Depending on the size\\n\\nof the organization, individuals may be required to multiclass\\n\\nin order to address needs of the team. In smaller studios, it’s\\n\\noften developers who wear multiple hats, including those in data\\n\\nengineering, analytics and data science. 
Key characters include:\\n\\n\\ngame developers may also be pulled in from time to time to\\n\\n\\nperform data engineering and or data science tasks. Though for\\n\\nthe sake of this guide, we’ll keep focus on roles of data engineers,\\n\\ndata scientists, and data analysts. There are many aspects to\\n\\nthese roles, but they can be summarized in that Data Engineers\\n\\ncreate and maintain critical data workflows, Data Analysts\\n\\ninterpret data and create reports that keep the business teams\\n\\nrunning seamlessly, and Data Scientists are responsible for\\n\\n\\n**Data Engineers**\\n\\nData engineers build systems that collect, manage, and\\n\\nconvert source data into usable information for data\\n\\nscientists and business analysts to interpret. Their ultimate\\n\\ngoal is to make data accessible so that teams can use it to\\n\\nevaluate and optimize a goal or objective.\\n\\n\\n-----\\n\\nData scientists determine the questions their team should\\n\\nbe asking and figure out how to answer those questions\\n\\nusing data. They often develop predictive models for\\n\\ntheorizing and forecasting.\\n\\n**Data Analysts**\\n\\n\\nto report on the health of a title or building a recommendation\\n\\nengine for your players, this guide will help you better\\n\\nunderstand the unique classes required to develop and\\n\\nmaintain an effective data, analytics, and AI platform.\\n\\n**Learn more about these character classes**\\n\\n\\nA data analyst reviews data to identify key insights into a\\n\\ngame studio’s customers and ways the data can be used to\\n\\nsolve problems.\\n\\n# Diving In\\n\\n\\nBefore we get to the primary use cases of game data,\\n\\nanalytics, and AI, we need to cover some basics. That is, the\\n\\ndifferent types of game data and how they are produced.\\n\\nAnd the subsequent receiving of that data in the cloud to\\n\\n\\n### Producing game data…\\n\\nSpeaking in generalities, there are four buckets of data as it\\n\\nrelates to your video game.\\n\\n\\ncollect, clean, and prepare for analysis.\\n\\n**1. Game Telemetry**\\n\\nGame telemetry refers to the data collected about player\\n\\nbehavior and interactions within a video game. The primary\\n\\ndata source is the game engine. And the goal of game\\n\\ntelemetry is to gather information that can help game\\n\\ndevelopers understand player behavior and improve the\\n\\noverall game experience.\\n\\nSome of the primary metrics that are typically tracked in\\n\\ngame telemetry include:\\n\\n- **Player engagement:** Track the amount of time players\\n\\nspend playing the game, and their level of engagement\\n\\nwith different parts of the game.\\n\\n- **Game progress:** Monitor player progress through\\n\\ndifferent levels and milestones in the game.\\n\\n- **In-game purchases:** Track the number and value of\\n\\nin-game purchases made by players.\\n\\n- **Player demographics:** Collect demographic information\\n\\nabout players, such as age, gender, location, and device type.\\n\\n- **Session length:** Monitor the length of each player session,\\n\\nand how often players return to the game.\\n\\n- **Retention:** Track the percentage of players who return to\\n\\nthe game after their first session.\\n\\n\\n-----\\n\\nsuch as the types of actions taken, the number of deaths,\\n\\nand the use of power-ups.\\n\\n- **User Acquisition:** Track the number of new players\\n\\nacquired through different marketing channels.\\n\\n**2. 
Business KPIs**\", \"metadata\": {\"similarity_score\": 0.00360044, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\"}, \"id\": \"e22b1fb61a6fa9c2003b7a8ef3f86a82\"}, {\"page_content\": \"analyzing player feedback, game studios can make data-driven\\n\\ndecisions to improve the game, increase player engagement\\n\\nand retention, and ultimately drive success and growth.\\n\\nFor this use case, we’re going to focus on taking online\\n\\nreviews for your video game and categorizing the different\\n\\ntopics players are talking about (bucketing topics) in order\\n\\nto better understand the themes (via positive or negative\\n\\nsentiment) affecting your community.\\n\\n**What we’re trying to solve/achieve**\\n\\nThis is incredibly helpful, providing data-driven customer\\n\\ninsight into your development process. Whether used in\\n\\n\\n**Overview**\\n\\nAcross massively multiplayer online video games (MMOs),\\n\\nmultiplayer online battle arena games (MOBAs) and other\\n\\nforms of online gaming, players continuously interact in real\\n\\ntime to either coordinate or compete as they move toward a\\n\\ncommon goal — winning. This interactivity is integral to game\\n\\nplay dynamics, but at the same time, it’s a prime opening for\\n\\ntoxic behavior — an issue pervasive throughout the online\\n\\nvideo gaming sphere.\\n\\nToxic behavior manifests in many forms, such as the varying\\n\\ndegrees of griefing, cyberbullying and sexual harassment\\n\\nthat are illustrated in the matrix below from [Behaviour](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant)\\n\\n[Interactive](http://gamestudies.org/2004/articles/deslauriers_iseutlafrancestmartin_bonenfant) , which lists the types of interactions seen within\\n\\nthe multiplayer game, _Dead by Daylight_ .\\n\\n\\npre-production, such as looking at games that are similar\\n\\n\\n**Survivors**\\n\\n\\nwith reviews to learn where those games have strengths and\\n\\nweaknesses; or using player feedback analysis with a live\\n\\nservice title to identify themes that can apply to your product\\n\\nroadmap, player feedback analysis helps teams better\\n\\nsupport and cultivate engagement with the player community.\\n\\n\\n**GEN**\\n\\n**RUSHING**\\n\\n\\n**GEN**\\n\\n\\n**HIDING** **ACTIVATING** **LOOPING**\\n**EMOTES**\\n\\n\\n**RUSH** **BLINDING** **SANDBAGGING**\\n**UNHOOKING**\\n\\n**TEABAGGING**\\n\\n\\n**REPORTING** **REPORTING**\\n\\n\\n**REPORTING** **REPORTING**\\n\\n\\n**TEXT**\\n**CHATTING**\\n\\n\\nUltimately, player feedback analysis does two things. 1) It\\n\\n\\n**Less**\\n\\n**toxic**\\n\\n\\n**Most**\\n**toxic**\\n\\n\\ncan help you stack rank themes according to positive and\\n\\nnegative sentiment, and 2) you can weight those themes\\n\\naccording to impact on player engagement, toxicity,\\n\\nmonetization, churn, and more. We’ve all read reviews that\\n\\nare overly positive, or overly negative. 
The process of player\\n\\nfeedback analysis helps to normalize feedback across the\\n\\ncommunity (keeping in mind, only for those who have written\\n\\na review), so you’re not over indexing on one review, or a\\n\\n\\n**HATCH** **HATCH**\\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\\n\\n\\n**HATCH** **HATCH**\\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\\n\\n**CAMPING** **CAMPING**\\n\\n\\n**FARMING** **FARMING**\\n\\n\\n**CAMPING** **CAMPING**\\n\\n\\n**BEING AWAY**\\n**FROM**\\n**KEYBOARD**\\n**(AFK)**\\n\\n\\n**CAMPING**\\n\\n**DRIBBLING** **TUNNELING**\", \"metadata\": {\"similarity_score\": 0.0034852957, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\"}, \"id\": \"9b831c107aaf2a3678bf3a75ed18d52c\"}, {\"page_content\": \"are overly positive, or overly negative. The process of player\\n\\nfeedback analysis helps to normalize feedback across the\\n\\ncommunity (keeping in mind, only for those who have written\\n\\na review), so you’re not over indexing on one review, or a\\n\\n\\n**HATCH** **HATCH**\\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\\n\\n\\n**HATCH** **HATCH**\\n**DISCONNECTING** **DISCONNECTING** **FARMING** **FARMING**\\n\\n**CAMPING** **CAMPING**\\n\\n\\n**FARMING** **FARMING**\\n\\n\\n**CAMPING** **CAMPING**\\n\\n\\n**BEING AWAY**\\n**FROM**\\n**KEYBOARD**\\n**(AFK)**\\n\\n\\n**CAMPING**\\n\\n**DRIBBLING** **TUNNELING**\\n\\n\\n**LOBBY**\\n**DODGING**\\n\\n**BODY**\\n**BLOCKING**\\n\\n**FACE**\\n**SLUGGING** **CAMPING**\\n\\n\\n**Killers**\\n\\n\\nsingle theme that may seem in the moment very pressing.\\n\\nIn addition to the [personal toll](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) that toxic behavior can have\\n\\n\\n**Get started with Player Feedback Analysis**\\n\\n\\non gamers and the community -- an issue that cannot be\\n\\n\\n-----\\n\\ngame studios. For example, a study from [Michigan State](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity)\\n\\n\\n[University](https://msutoday.msu.edu/news/2021/faculty-voice-gaming-and-toxicity) revealed that 80% of players recently experienced\\n\\ntoxicity, and of those, 20% reported leaving the game due to\\n\\nthese interactions. Similarly, a study from [Tilburg University](https://arno.uvt.nl/show.cgi?fid=145375)\\n\\nshowed that having a disruptive or toxic encounter in the first\\n\\nsession of the game led to players being over three times\\n\\nmore likely to leave the game without returning. Given that\\n\\nplayer retention is a top priority for many studios, particularly\\n\\nas game delivery transitions from physical media releases to\\n\\nlong-lived services, it’s clear that toxicity must be curbed.\\n\\nCompounding this issue related to churn, some companies\\n\\nface challenges related to toxicity early in development,\\n\\neven before launch. For example, [Amazon’s Crucible](https://www.wired.com/story/amazon-crucible-release-first-big-videogame/) was\\n\\nreleased into testing without text or voice chat due in part\\n\\nto not having a system in place to monitor or manage toxic\\n\\n\\nIn this section, we’re going to talk about how to use your data\\n\\nto more effectively find your target audience across the web.\\n\\nWhether you’re engaging in paid advertising, influencer or\\n\\nreferral marketing, PR, cross promotion, community building,\\n\\netc - use data to separate activity from impact. 
You want\\n\\nto focus on the channels and strategies that leverage your\\n\\nresources most effectively, be that time or money.\\n\\nSay you have a cohort of highly engaged players who are\\n\\nspending money on your title, and you want to find more\\n\\ngamers just like that. Doing an analysis on the demographic\\n\\nand behavioral data of this cohort will give you the\\n\\ninformation needed to use an ad platform (such as Meta,\\n\\nGoogle, or Unity) to do lookalike modeling and target those\\n\\npotential gamers for acquisition.\\n\\n\\ngamers and interactions. This illustrates that the scale of\\n\\n\\nthe gaming space has far surpassed most teams’ ability to\\n\\nmanage such behavior through reports or by intervening in\\n\\ndisruptive interactions. Given this, it’s essential for studios\", \"metadata\": {\"similarity_score\": 0.0034753587, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\"}, \"id\": \"e3f8982a30fe6e2560236fad0446093a\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:opentelemetry.sdk.trace:Calling end() on an ended span.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_e09bc078-fe39-426a-9a33-e486e8d6d050): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"Data Engineer responsibilities\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_e09bc078-fe39-426a-9a33-e486e8d6d050) *****\u001B[0m\n[{\"page_content\": \"engineering in the gaming industry.\\n\\n`10. \\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\n\\nfirst step in your data journey. Imagine how the output of\\n\\nyour data can be presented in a way to help stakeholders\\n\\nacross your company achieve more. For example, dropping\\n\\ndata into an application that can help game designers\\n\\nmake balancing decisions based on player events.\\n\\n\\n-----\\n\\n# APPENDIX Ultimate class build guide\\n\\n\\n### Creating a character\\n\\nThe heart and soul of mature data teams are formed by this\\n\\ntrio of classes. There are many aspects to these roles, but\\n\\nthey can be summarized in that Data Engineers create and\\n\\nmaintain critical data workflows, Data Analysts interpret data\\n\\nand create reports that keep the business teams running\\n\\nseamlessly, and Data Scientists are responsible for making\\n\\nsense of large amounts of data. Depending on the size of\\n\\nthe organization, individuals may be required to multiclass\\n\\nin order to address needs of the team. 
In smaller studios, it’s\\n\\noften developers who wear multiple hats, including those in\\n\\ndata engineering, analytics and data science.\\n\\nWhether you’re looking to stand-up an analytics dashboard\\n\\nto report on the health of a title or building a recommendation\\n\\nengine for your players, this guide will help you better\\n\\nunderstand the unique classes required to develop and\\n\\nmaintain an effective data, analytics, and AI platform.\\n\\n##### Data Engineers\\n\\n\\n**Goals and Priorities of Data Engineers**\\n\\n- Enable access to usable data for real-time insights — data\\n\\nthat both enables timely decision-making and is accurate\\n\\nand reproducible\\n\\n- Increase user confidence and trust in data. This involves\\n\\nensuring high consistency and reliability in ETL processes\\n\\n- Limit the issues and failures experienced by other\\n\\nengineers and data scientists, allowing those roles to\\n\\nfocus less on troubleshooting and more on drawing\\n\\nmeaningful conclusions from data and building new\\n\\nproducts / features\\n\\n**What Data Engineers care about:**\\n\\n- Enabling access to data for real-time insights — data that\\n\\nboth enables timely decision-making and is accurate and\\n\\nreproducible\\n\\n- Building high-performance, reliable and scalable pipelines\\n\\nfor data processing\\n\\n- Delivering data for consumption from a variety of sources\\n\\nby Data Analysts and Data Scientists against tight SLAs\\n\\n- A Data Engineer’s biggest challenge? Collaboration\\n\\nacross teams\\n\\n\\nData engineers build systems that collect, manage, and\\n\\n\\nconvert source data into usable information for data\\n\\nscientists and business analysts to interpret. Their ultimate\\n\\ngoal is to make data accessible so that teams can use it to\\n\\nevaluate and optimize a goal or objective.\\n\\n**Responsibilities:**\\n\\n- Data Engineers are responsible for data migration,\\n\\nmanipulation, and integration of data (joining dissimilar\\n\\ndata systems)\\n\\n- Setup and maintenance of ETL pipelines to convert\\n\\nsource data into actionable data for insights. It is the\\n\\nresponsibility of the data engineer to make sure these\\n\\npipelines run efficiently and are well orchestrated.\\n\\n- The Data Engineer sets up the workflow process\\n\\nto orchestrate pipelines for the studio’s data and\\n\\ncontinuously validates it\\n\\n- Managing workflows to enable data scientists and data\\n\\nanalysts, and ensuring workflows are well-integrated with\\n\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\n\\n\\n##### Data Scientists\\n\\nData scientists determine the questions their team should\\n\\nbe asking and figure out how to answer those questions\\n\\nusing data. 
They often develop predictive models for\\n\\ntheorizing and forecasting.\\n\\n**Responsibilities:**\\n\\n- Responsible for making sense of the large amounts of data\\n\\ncollected for a given game title, such as game telemetry,\", \"metadata\": {\"similarity_score\": 0.003443227, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\"}, \"id\": \"1ce1d861d15136fd48438be91479e567\"}, {\"page_content\": \"Data teams rely on getting the right data at the right time for analytics, data\\nscience and machine learning, but often are faced with challenges meeting\\nthe needs of their initiatives for data engineering.\\n\\n\\n-----\\n\\n#### Why data engineering is hard\\n\\nOne of the biggest challenges is accessing and managing the increasingly\\ncomplex data that lives across the organization. Most of the complexity\\narises with the explosion of data volumes and data types, with organizations\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\n\\nWith this volume, managing data pipelines to transform and process data\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\nmost businesses are putting an increased emphasis on multicloud\\nenvironments which can be even more difficult to maintain.\\n\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\nthat data itself has become a product, and the challenging goal of the data\\nengineer is to build and run the machinery that creates this high-fidelity\\ndata product all the way from ingestion to monetization.\\n\\n\\nDespite current technological advances data engineering remains\\ndifficult for several reasons:\\n\\n**Complex data ingestion methods**\\n\\nData ingestion means retrieving batch and streaming data from various\\nsources and in various formats. Ingesting data is hard and complex since you\\neither need to use an always-running streaming platform like Apache Kafka\\nor you need to be able to keep track of which files haven’t been ingested yet.\\nData engineers are required to spend a lot of time hand-coding repetitive\\nand error-prone data ingestion tasks.\\n\\n**Data engineering principles**\\n\\nThese days, large operations teams are often just a memory of the past.\\nModern data engineering principles are based on agile software development\\nmethodologies. They apply the well-known “you build it, you run it” paradigm,\\nuse isolated development and production environments, CI/CD, and version\\ncontrol transformations that are pushed to production after validation. Tooling\\nneeds to support these principles.\\n\\n\\n-----\\n\\n**Third-party tools**\\n\\nData engineers are often required to run additional third-party tools for\\norchestration to automate tasks such as ELT/ETL or customer code in\\nnotebooks. Running third-party tools increases the operational overhead\\nand decreases the reliability of the system.\\n\\n**Performance tuning**\\n\\nFinally, with all pipelines and workflows written, data engineers need to\\nconstantly focus on performance, tuning pipelines and architectures to meet\\nSLAs. 
Tuning such architectures requires in-depth knowledge of the underlying\\narchitecture and constantly observing throughput parameters.\\n\\nMost organizations are dealing with a complex landscape of data warehouses\\nand data lakes these days. Each of those platforms has its own limitations,\\nworkloads, development languages and governance model.\\n\\n\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\nend-to-end data engineering solution for ingesting, transforming, processing,\\nscheduling and delivering data. The lakehouse platform automates the\\ncomplexity of building and maintaining pipelines and running ETL workloads\\ndirectly on a data lake so data engineers can focus on quality and reliability\\nto drive valuable insights.\\n\\nData engineering in the lakehouse allows data teams to unify batch and\\nstreaming operations on a simplified architecture, streamline data pipeline\\ndevelopment and testing, build reliable data, analytics and AI workflows\\non any cloud platform, and meet regulatory requirements to maintain\\nworld-class governance.\\n\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\nthat automates the complexity of building and maintaining pipelines and\\nrunning ETL workloads so data engineers and analysts can focus on quality\\nand reliability to drive valuable insights.\\n\\n\\n#### Databricks makes modern data engineering simple\\n\\nThere is no industry-wide definition of modern data engineering.\\nThis should come close:\", \"metadata\": {\"similarity_score\": 0.0033119193, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\"}, \"id\": \"e577e0ac294ad34249c7d000936d7c72\"}, {\"page_content\": \"# Building Reliable Data Lakes at Scale With Delta Lake\\n\\n\\n-----\\n\\n## Contents\\n\\n#### Data Engineering Drivers 2\\n\\n Data Pipeline Key Goals 4\\n\\n Apache Spark™: The First Unified Analytics Engine 5\\n\\n Data Reliability Challenges With Data Lakes 6\\n\\n Delta Lake: A New Storage Layer 7\\n\\n Delta Lake: Key Features 8\\n\\n Getting Started With Delta Lake 10\\n\\n\\n-----\\n\\n## Drivers\\n\\n#### Data Engineering Drivers\\n\\nData engineering professionals are needing to respond to several different drivers.\\n\\nChief among the drivers they face are:\\n\\n**Rise of Advanced Analytics** — Advanced analytics, including methods\\n\\nbased on machine learning techniques, have evolved to such a degree that\\n\\norganizations seek to derive far more value from their corporate assets.\\n\\n**Widespread Adoption** — Once the province of leading edge, high-tech\\n\\ncompanies, these advanced approaches are being adopted across a\\n\\nmultitude of industries from retail to hospitality to healthcare and across\\n\\nprivate as well as public sector organizations. 
This is further driving the need\\n\\nfor strong data engineering practices.\\n\\n**Regulation** — With the growth of data generation and data collection,\\n\\nthere is increased interest in how the data is protected and managed.\\n\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\n\\nfrom the EU and other jurisdictions mandate very specific ways in which\\n\\ndata must be managed.\\n\\n\\n-----\\n\\n## Drivers\\n\\n**Technology Innovation** — The move to cloud-based analytics architectures\\n\\nthat is now well underway is being propelled further by innovations such as\\n\\nanalytics-focused chipsets, pipeline automation and the unification of data\\n\\nand machine learning. All these offer data professionals new approaches for\\n\\ntheir data initiatives.\\n\\n**Financial Scrutiny** — With a growth in investment, analytics initiatives are\\n\\nalso subject to increasing scrutiny. There is also a greater understanding of\\n\\ndata as a valuable asset. Deriving value from data must be done in a manner\\n\\nthat is financially responsible and actually value adding to the enterprise and\\n\\nmeeting ROI hurdles.\\n\\n**Role Evolution** — Reflecting the importance of managing the data and\\n\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\n\\nmore prominent and newer roles such as Data Curator are emerging.\\n\\nThey must balance the needs of governance, security and democratization.\\n\\n\\n-----\\n\\n## Key Goals\\n\\n#### Data Pipeline Key Goals\\n\\nMaking quality data available in a reliable manner is a major determinant of success for data\\n\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\n\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\n\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\n\\ndesign and build their data pipelines.\\n\\nThree primary goals that data engineers typically seek to address as they work to enable the\\n\\nanalytics professionals in their organizations are:\\n\\n**Deliver quality data in less time** — When it comes to data, quality and timeliness\\n\\nare key. Data with gaps or errors (which can arise for many reasons) is\\n\\n“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\\n\\nusers. Equally well, many applications require up-to-date information (who\\n\\nwants to use last night’s closing stock price or weather forecast) and are of\\n\\nlimited value without it.\\n\\n**Enable faster queries** — Wanting fast responses to queries is natural enough\\n\\nin today’s “New York minute,” online world. Achieving this is particularly\\n\\ndemanding when the queries are based on very large data sets.\\n\\n**Simplify data engineering at scale** — It is one thing to have high reliability and\\n\\nperformance in a limited, development or test environment. 
What matters\\n\\nmore is the ability to have robust, production data pipelines at scale without\\n\\nrequiring high operational overhead.\\n\\n\\n-----\", \"metadata\": {\"similarity_score\": 0.0032034456, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\"}, \"id\": \"f6ef96d9f374de069754b3f8d671b16d\"}, {\"page_content\": \"COPY INTO ......................................................................................................................................................................................................... **06**\\n\\nAuto Loader ....................................................................................................................................................................................................... **09**\\n\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\n\\nPartner Connect ............................................................................................................................................................................................... **13**\\n\\n\\n-----\\n\\n### Introduction\\n\\nOrganizations today are inundated with data siloed across various on-premises\\napplication systems, databases, data warehouses and SaaS applications. This\\nfragmentation makes it difficult to support new use cases for analytics or machine\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\narchitecture built on top of Delta Lake, an open format storage layer.\\n\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\ncritical first step in the data engineering and management lifecycle.\\n\\n\\n-----\\n\\n### Life of a Data Engineer\\n\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\n\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\n\\n\\na variety of data types. For example:\", \"metadata\": {\"similarity_score\": 0.0030519078, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\"}, \"id\": \"e49a7d2e3bd1f6a60e1306c0186dcdd5\"}, {\"page_content\": \"With the Databricks Lakehouse Platform, data engineers have access to an\\nend-to-end data engineering solution for ingesting, transforming, processing,\\nscheduling and delivering data. 
The lakehouse platform automates the\\ncomplexity of building and maintaining pipelines and running ETL workloads\\ndirectly on a data lake so data engineers can focus on quality and reliability\\nto drive valuable insights.\\n\\nData engineering in the lakehouse allows data teams to unify batch and\\nstreaming operations on a simplified architecture, streamline data pipeline\\ndevelopment and testing, build reliable data, analytics and AI workflows\\non any cloud platform, and meet regulatory requirements to maintain\\nworld-class governance.\\n\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\nthat automates the complexity of building and maintaining pipelines and\\nrunning ETL workloads so data engineers and analysts can focus on quality\\nand reliability to drive valuable insights.\\n\\n\\n#### Databricks makes modern data engineering simple\\n\\nThere is no industry-wide definition of modern data engineering.\\nThis should come close:\\n\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\n_kinds of workflows._\\n\\n\\n-----\\n\\n-----\\n\\n#### Benefits of data engineering on the lakehouse\\n\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\ngain an enterprise-grade and enterprise-ready approach to building data\\npipelines. The following are eight key differentiating capabilities that a data\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\n\\n**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\nfor analytics, data science or machine learning.\\n\\n\\n\\n**•** **Data pipeline observability:** Monitor overall data pipeline estate status\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\nhealth for performance, quality, status and latency.\\n\\n**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\\nanalytics and machine learning use cases by enabling easy and automatic\\ndata pipeline deployments into production or roll back pipelines and\\nminimize downtime.\\n\\n**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\nof data processing tasks for data and machine learning pipelines with the\\nability to run multiple non-interactive tasks as a directed acyclic graph\\n(DAG) on a Databricks compute cluster.\\n\\n\\n\\n**•** **Automated ETL pipelines:** Data engineers can reduce development\\ntime and effort and focus on implementing business logic and data\\nquality checks within the data pipeline using SQL or Python.\\n\\n**•** **Data quality checks:** Improve data reliability throughout the data\\nlakehouse so data teams can confidently trust the information for\\ndownstream initiatives with the ability to define data quality and\\nautomatically address errors.\\n\\n**•** **Batch and streaming:** Allow data engineers to set tunable data latency\\nwith cost controls without having to know complex stream processing\\nand implement recovery logic.\\n\\n**•** **Automatic recovery:** Handle transient errors and use automatic recovery\\nfor most common error conditions that can occur during the operation of\\na pipeline with fast, scalable fault-tolerance.\\n\\n\\n-----\\n\\n**Data engineering is all about 
data quality**\\n\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\nthree different levels.\\n\\n\\n1. On a **technical level** , data quality is\\nguaranteed by enforcing and evolving\\nschemas for data storage and ingestion.\\n\\n**Kenesis**\\n\\n**CSV,**\\n**JSON, TXT...**\\n\\n**Data Lake**\", \"metadata\": {\"similarity_score\": 0.0029978286, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\"}, \"id\": \"9f81ac0b52802c7152247bfd5289b744\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n[autogen.oai.client: 12-18 17:44:24] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:24] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:24] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are the critical needs for IT and business when it comes to implementing a customer data platform?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n[autogen.oai.client: 12-18 17:44:25] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:25] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model 
client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:44:25] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat are the main advantages of the lakehouse architecture over traditional on-premises data warehouses and cloud data warehouses?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nAccording to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\n\n--------------------------------------------------------------------------------\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_5daaee03-ba46-4038-bb14-1189c8d60f61): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"critical needs for IT and business when implementing a customer data platform\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_5daaee03-ba46-4038-bb14-1189c8d60f61) *****\u001B[0m\n[{\"page_content\": \"#### eBook\\n\\n# The CDP Build vs Buy Guide:\\n\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\n\\n\\n-----\\n\\n## The Need for a Customer Data Platform\\n\\n\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\nof the curve — that means they need a customer data platform (CDP). Through a CDP, data\\nfrom every touch point, along with third-party information, is brought together to provide\\na unified view of the customer. This enables your marketing team to analyze, identify and\\nactivate customers with targeted content.\\n\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\n\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\nby enterprise IT leaders. 
The business side of the house needs immediate enablement, and\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\nfastest path to a solution.\\n\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\nexisting marketing and analytics systems.. The cost of adding another system to the\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\nthat has immediate consequences.\\n\\n**Critical IT Needs** **Critical Business Needs**\\n\\n\\nKeep control of data access and\\ngovernance; ability to architecture a\\ncustomer data stack with decisions on\\nwhere data is stored and where queries\\nare executed\\n\\n\\nGet customer data access via a no-code\\ninterface to generate insights; build customer\\nexperiences and activate data within\\nbusiness applications\\n\\n\\n-----\\n\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\nside or the other unaddressed — which is why so many organizations who have built a CDP\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\n\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\n**both sides of the debate and provide organizations a third choice of both building and**\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\nbuying, we’ve opened the door to finding the right balance of approaches for our customer\\norganizations, helping organizations find greater success in their personalization journey.\\n\\n**“We made an attempt to internally build a CDP platform and while we**\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\n**or offer a campaign interface to our product marketers that could empower**\\n**them to create and manage those journeys. It was going to take at least two**\\n**years for us to build all of that functionality in house.”**\\n\\n– Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\n\\n\\n-----\\n\\n## Combining the Build and Buy Approaches\\n\\n\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\nwithin the lakehouse platform. There are three approaches to this:\\n\\n**Bundled** **Composable**\\n\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\n\\n\\nCompute\\n\\nStorage\\n\\n\\nCompute\\n\\nStorage\\n(Local & Views)\\n\\n\\nQuery\\nVirtualization\\n\\nMetadata\\n\\n\\nData Copy\\n\\n\\nLakehouse\\n\\nStorage\\n\\n\\nLakehouse\\n\\n\\nLakehouse\\n\\n\\nCompute Compute\\n\\nStorage Storage\\n\\n\\n-----\\n\\nDeployment Type\\n\\n**Bundled**\\n\\n**Composable –**\\n**Hybrid**\\n\\n**Composable –**\\n**Lakehouse-Only**\\n\\n\\nDescription\", \"metadata\": {\"similarity_score\": 0.0029832723, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\"}, \"id\": \"563f0dba5edef5b358685117dfb5a133\"}, {\"page_content\": \"companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\\n\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\n\\nincreasingly important.\\n\\n**Modernize business applications**\\n\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\\n\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\n\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\n\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\n\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\n\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\n\\nservices and APIs to easily provide access to an application’s functionality.\\n\\nCloud-based architectures, commodity databases and software application development frameworks make\\n\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\n\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\n\\na backing database) has become straightforward with the latest tooling available to your application\\n\\ndevelopment teams.\\n\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\n\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\n\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\n\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\n\\n\\n“We are on an amazing journey. Being among\\n\\nthe fastest-growing enterprise software cloud\\n\\ncompanies on record was unimaginable when\\n\\nwe started Databricks. To get here, we’ve stayed\\n\\nfocused on the three big bets we made when\\n\\nfounding the company — cloud, open source\\n\\nand machine learning. Fast-forward seven years,\\n\\nthousands of data teams around the globe are\\n\\nworking better together on Databricks.”\\n\\n**Ali Ghodsi**\\n\\nCo-founder and CEO\\n\\nDatabricks\\n\\n\\n-----\\n\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\n\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\n\\nother applications within your environment to store copies of the data — unless absolutely necessary for\\n\\nperformance reasons. 
In this case, it is best to “cache” the data for use in the non-SOR application and sync\\n\\nthe data from the actual SOR.\\n\\nData from these SORs should be made available in three ways:\\n\\n**1.** \\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\n\\n**2.** \\u0007Ensure that copies of the data land in the data lake.\\n\\n**3.** \\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\n\\nconsumption by downstream applications.\\n\\n**Move toward real-time decisioning**\\n\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\n\\nand the second is to view data as an individual event. This so-called “time value of data” is an important\\n\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\\n\\nthe same data platform.\\n\\nOn the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\\n\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\n\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\\n\\nnewly created or arriving data event gives you the opportunity to make decisions — in the moment — that\", \"metadata\": {\"similarity_score\": 0.0027576878, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\"}, \"id\": \"25ef18d715b47231f6594d1da80303e9\"}, {\"page_content\": \"and security environment but nothing more\\n\\n\\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\n\\nof tools in play or streamlining the user experience\\n\\n\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\n\\npartnership model, the ability to influence the roadmap and professional services support\\n\\nFor these reasons and more, it’s worth considering an architecture and procurement strategy that centers\\n\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\n\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\n\\n\\n-----\\n\\nDatabricks is a leading data and AI company —\\n\\n\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\n\\ndata processing, validation and curation should work. It’s the integration between the discrete functions\\n\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\n\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\n\\nconsequences of not doing the integration properly can be serious — in terms of security, compliance,\\n\\nefficiency, cost, etc.\\n\\n\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\n\\n\\nSo, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\\n\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\n\\ntake from both parties — sometimes calling for an organization to adjust their processes to better fit how\\n\\nthe platform works. 
There are many instances where a given business process could be simplified or recast\\n\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\n\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\n\\napply to the broadest set of customers.\\n\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\n\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\n\\nperforming their job. The more discrete tools in an environment, the more challenging this becomes.\\n\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\n\\nand collaboration helps improve the user experience and decreases time to market.\\n\\n\\n[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\\n\\nlistening to the needs of thousands of customers\\n\\nand having our engineers work side by side with\\n\\ncustomer teams to deliver real business value using\\n\\ndata and AI.\\n\\n\\n-----\\n\\n**Unified platform, unified personas**\\n\\nDeploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\\n\\ndata stack — will provide an integrated suite of tools for the full range of personas in your organization,\\n\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\n\\nincrease productivity and reduce risk because you’ll be better able to share the key aspects of data\\n\\npipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\n\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\n\\nsubsystems are well managed.\\n\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\n\\nabsolute minimum — with each user benefiting from the data assets created by others. Redundant work\\n\\nis eliminated.\\n\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\n\\nworking with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\\n\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\n\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\", \"metadata\": {\"similarity_score\": 0.0027022872, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\"}, \"id\": \"eaff954d65653182857574e043c105f1\"}, {\"page_content\": \"2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\\n\\n(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\\n\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\n\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks —\\n\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\n\\nYour organization’s data and the systems that process it play a critical role in not only enabling your financial\\n\\ngoals but also in minimizing these seven key business risks.\\n\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\n\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\n\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\n\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\n\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\n\\nsignificant return on investment (ROI) — one that starts in months, not years.\\n\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\n\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\n\\nto deliver on their data strategy — including how to deploy a modern data architecture, leverage data\\n\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\n\\nidentify and execute on AI opportunities.\\n\\n\\n-----\\n\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\n\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\n\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\\n\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\n\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\n\\nindustry standards.\\n\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\n\\nwe’ve hired industry experts and thought leaders to help organizations better understand the steps involved\\n\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\n\\narchitecture, which decouples data storage from compute while providing the best price/performance\\n\\nmetrics for all your data workloads — including data warehousing. 
We have captured the lessons learned\\n\\nand summarized them in this series of Executive Guides — which are designed to serve as blueprints for\\n\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\n\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\n\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\n\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\n\\nshown in Figure 1.\\n\\n\\n###### Lakehouse Platform\\n\\n\\nData\\nWarehousing\\n\\n\\nData\\nEngineering\\n\\n\\nData\\nStreaming\\n\\n\\nData S�ien��\\nand ML\\n\\n\\nUnity Catalog\\nFine-grained governance for data and AI\\n\\nDelta Lake\\nData relia)ility and .erfor2ance\\n\\nCloud Data Lake\\nAll structured and unstructured data\\n\\n**Figure 1:**\\nThe Databricks Lakehouse Platform\\n\\n\\n-----\\n\\n**The lakehouse architecture benefits organizations in several ways:**\\n\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\n\\n**2.** \\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\n\\n**3.** \\u0007It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.\", \"metadata\": {\"similarity_score\": 0.0025006814, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\"}, \"id\": \"f545eff42d3b9ae2b565475f4390ed44\"}, {\"page_content\": \"organization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\\n\\nunique and b) the development offers the competitive advantage that you need.\\n\\nEven software built on top of open source still requires significant investment in integration and testing.\\n\\nThe integration work is particularly challenging because of the large number of open source libraries that\\n\\nare required in the data science space. The question becomes, “Is this really the area that you want your\\n\\nengineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\\n\\n**How long will it take? Can the organization afford to wait?**\\n\\nEven if you decide the software component provides a competitive advantage and is something worth\\n\\nbuilding in-house, the next question that you should ask is, “How long will it take?” There is definitely a\\n\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\n\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\n\\ntake longer and cost more money than initially planned.\\n\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\n\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\n\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\n\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\n\\nfeatures and schedule.\\n\\n\\nDatabricks is built on top of popular open source\\n\\nsoftware that it created. 
Engineering teams can\\n\\nimprove the underpinnings of the Databricks\\n\\nplatform by submitting code via pull request and\\n\\nbecoming committers to the projects. The benefit\\n\\nto organizations is that their engineers contribute\\n\\nto the feature set of the data platform while\\n\\nDatabricks remains responsible for all integration\\n\\nand performance testing plus all the runtime\\n\\nsupport, including failover and disaster recovery.\\n\\n\\n-----\\n\\n**Don’t forget about the data**\\n\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\n\\n“data assets” consumable to the end users or systems. Data insights, model training and model execution\\n\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\n\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\n\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\n\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\n\\ncreating true competitive advantage.\\n\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\n\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\n\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\n\\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\\n\\nengineers innovate on components that don’t bring true competitive advantage.\\n\\n\\n-----\\n\\n#### 9. Allocate, monitor and optimize costs\\n\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\\n\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\\n\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\n\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\n\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\n\\nand increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\\n\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\n\\ncould be easily shared and reused by other members of the team. 
The more the team used the unified\\n\\nplatform, the more they collaborated and their level of expertise increased.\", \"metadata\": {\"similarity_score\": 0.0024809677, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\"}, \"id\": \"b5f4bd0258226132f89697f6e660b09b\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_f04da41e-2d5b-49c1-8d0b-0054ae126f07): search_product_docs *****\u001B[0m\nArguments: \n{ \"query\": \"lakehouse architecture advantages over traditional on-premises data warehouses and cloud data warehouses\", \"filters\": [] }\n\u001B[32m************************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_f04da41e-2d5b-49c1-8d0b-0054ae126f07) *****\u001B[0m\n[{\"page_content\": \"In short, a lakehouse is a data architecture that combines the best elements\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\ndesign, which implements similar data structures and data management features\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\n\\n\\n-----\\n\\n##### Data lakehouse\\n\\nOne platform to unify all your data, analytics and AI workloads\\n\\n###### Lakehouse Platform\\n\\nAll machine learning, SQL,\\nBI, and stream\n\n*** WARNING: max output size exceeded, skipping output. ***\n\nine learning and SQL\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\nrely on the same data repository.\\n\\n- **\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\nSupport for streaming eliminates the need for separate systems dedicated to\\nserving real-time data applications.\\n\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\nfeatures. Tools for security and access control are basic requirements. Data governance\\ncapabilities including auditing, retention and lineage have become essential particularly\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\nfeatures only need to be implemented, tested and administered for a single system.\\n\\n\\n-----\\n\\n**Read the research**\\n**Delta Lake: High-Performance ACID**\\n**Table Storage Over Cloud Object Stores**\\n\\n**Abstract**\\nCloud object stores such as Amazon S3 are some of the largest and most costeffective storage systems on the planet, making the main attractive target to\\nstore large data warehouses and data lakes. Unfortunately, their implementation\\nas key-value stores makes it difficult to achieve ACID transactions and high\\nperformance: Metadata operations, such as listing objects, are expensive, and\\nconsistency guarantees are limited. 
In this paper, we present Delta Lake, an\\nopen source ACID table storage layer over cloud object stores initially developed\\nat Databricks. Delta Lake uses a transaction log that is compacted into Apache\\nParquet format to provide ACID properties, time travel, and significantly faster\\nmetadata operations for large tabular data sets (e.g., the ability to quickly search\\nbillions of table partitions for those relevant to a query). It also leverages this\\ndesign to provide high-level features such as automatic data layout optimization,\\nupserts, caching, and audit logs. Delta Lake tables can be accessed from Apache\\nSpark, Hive, Presto, Redshift, and other systems. Delta Lake is deployed at\\nthousands of Databricks customers that process exabytes of data per day, with\\nthe largest instances managing exabyte-scale data sets and billions of objects.\\n\\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong Zhu,\\nMukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja Łuszczak,\\nMichał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter Boncz, Ali Ghodsi,\\nSameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\\n\\nRead the full research paper on the [inner workings of the lakehouse](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores) [.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\\n\\n\\n-----\\n\\n**Some early examples**\\nThe [Databricks Unified Data Platform](https://databricks.com/product/data-lakehouse) has the architectural features of a lakehouse.\\nMicrosoft’s [Azure Synapse Analytics](https://azure.microsoft.com/en-us/blog/simply-unmatched-truly-limitless-announcing-azure-synapse-analytics/) service, which [integrates with Azure Databricks](https://databricks.com/blog/2019/11/04/new-microsoft-azure-data-warehouse-service-and-azure-databricks-combine-analytics-bi-and-data-science.html) ,\\nenables a similar lakehouse pattern. Other managed services such as [BigQuery](https://cloud.google.com/bigquery/) and\\n[Redshift Spectrum](https://docs.aws.amazon.com/redshift/latest/dg/c-using-spectrum.html) have some of the lakehouse features listed above, but they are\\nexamples that focus primarily on BI and other SQL applications.\", \"metadata\": {\"similarity_score\": 0.0062149367, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\"}, \"id\": \"9b85a3fa086f1fa4e09197bc46d91dab\"}, {\"page_content\": \"reduces staleness and improves recency, reduces latency and lowers the cost of\\n\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\n\\n- **\\u0007Storage is decoupled from compute:** In practice, this means storage and\\n\\ncompute use separate clusters, thus these systems are able to scale to many more\\n\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\n\\nthis property.\\n\\n- **\\u0007Openness:** The storage formats they use are open and standardized, such as\\n\\nParquet, and they provide an API so a variety of tools and engines, including\\n\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\n\\n- **\\u0007Support for diverse data types ranging from unstructured to structured data:**\\n\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\n\\nfor many new data applications, including images, video, audio, semi-structured\\n\\ndata, and text.\\n\\n- **\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\n\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\n\\nrely on the same data repository.\\n\\n- **\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\n\\nSupport for streaming eliminates the need for separate systems dedicated to\\n\\nserving real-time data applications.\\n\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\n\\nfeatures. Tools for security and access control are basic requirements. Data governance\\n\\ncapabilities including auditing, retention and lineage have become essential particularly\\n\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\n\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\n\\nfeatures only need to be implemented, tested and administered for a single system.\\n\\n\\n-----\\n\\n**Read the research**\\n**Delta Lake: High-Performance ACID**\\n**Table Storage Over Cloud Object Stores**\\n\\n**Abstract**\\n\\nCloud object stores such as Amazon S3 are some of the largest and most\\n\\ncost-effective storage systems on the planet, making the main attractive\\n\\ntarget to store large data warehouses and data lakes. Unfortunately, their\\n\\nimplementation as key-value stores makes it difficult to achieve ACID\\n\\ntransactions and high performance: Metadata operations, such as listing\\n\\nobjects, are expensive, and consistency guarantees are limited. In this paper,\\n\\nwe present Delta Lake, an open source ACID table storage layer over cloud\\n\\nobject stores initially developed at Databricks. Delta Lake uses a transaction log\\n\\nthat is compacted into Apache Parquet format to provide ACID properties, time\\n\\ntravel, and significantly faster metadata operations for large tabular data sets\\n\\n(e.g., the ability to quickly search billions of table partitions for those relevant\\n\\nto a query). It also leverages this design to provide high-level features such\\n\\nas automatic data layout optimization, upserts, caching, and audit logs. Delta\\n\\nLake tables can be accessed from Apache Spark, Hive, Presto, Redshift, and\\n\\nother systems. 
Delta Lake is deployed at thousands of Databricks customers\\n\\nthat process exabytes of data per day, with the largest instances managing\\n\\nexabyte-scale data sets and billions of objects.\\n\\nAuthors: Michael Armbrust, Tathagata Das, Liwen Sun, Burak Yavuz, Shixiong\\n\\nZhu, Mukul Murthy, Joseph Torres, Herman van H Ö vell, Adrian Ionescu, Alicja\\n\\nŁuszczak, Michał Szafra ́nski, Xiao Li, Takuya Ueshin, Mostafa Mokhtar, Peter\\n\\nBoncz, Ali Ghodsi, Sameer Paranjpye, Pieter Senster, Reynold Xin, Matei Zaharia\\n\\nRead the full research paper on the [inner workings of the lakehouse.](https://databricks.com/research/delta-lake-high-performance-acid-table-storage-overcloud-object-stores)\", \"metadata\": {\"similarity_score\": 0.00619995, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\"}, \"id\": \"d260bbdbcefe5b169f94c612022b7f40\"}, {\"page_content\": \"velocity and volume. Data warehouses are not suited for many of these use cases, and\\n\\nthey are certainly not the most cost-efficient.\\n\\nAs companies began to collect large amounts of data from many different sources,\\n\\narchitects began envisioning a single system to house data for many different\\n\\nanalytic products and workloads.\\n\\nAbout a decade ago, companies began building [data lakes](https://databricks.com/glossary/data-lake) -- repositories for raw data\\n\\nin a variety of formats. While suitable for storing data, data lakes lack some critical\\n\\nfeatures: They do not support transactions, they do not enforce data quality, and their\\n\\nlack of consistency / isolation makes it almost impossible to mix appends and reads,\\n\\n\\n-----\\n\\n**A lakehouse combines the best elements**\\n**of data lakes and data warehouses**\\n\\nA lakehouse is a new data architecture that combines the best elements of data lakes\\n\\nand data warehouses.\\n\\n\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\n\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\n\\nwarehouses.\\n\\n\\nThe need for a flexible, high-performance system hasn’t abated. Companies\\n\\n\\nrequire systems for diverse data applications including SQL analytics, real-time\\n\\nmonitoring, data science and machine learning. Most of the recent advances in\\n\\nAI have been in better models to process unstructured data (text, images, video,\\n\\naudio), but these are precisely the types of data that a data warehouse is not\\n\\noptimized for.\\n\\nA common approach is to use multiple systems — a data lake, several data\\n\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\n\\nand image databases. Having a multitude of systems introduces complexity and,\\n\\nmore importantly, introduces delay as data professionals invariably need to move\\n\\nor copy data between different systems.\\n\\n\\nLakehouses are enabled by a new system design: implementing similar data struc-\\n\\ntures and data management features to those in a data warehouse, directly on the\\n\\nkind of low-cost storage used for data lakes. 
They are what you would get if you had\\n\\nto redesign data warehouses in the modern world, now that cheap and highly reliable\\n\\nstorage (in the form of object stores) are available.\\n\\nA lakehouse has the following key features:\\n\\n- **\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\n\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\n\\nconsistency as multiple parties concurrently read or write data, typically using\\n\\nSQL.\\n\\n\\n-----\\n\\n- **\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\n\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\n\\nsuch as star/snowflake-schemas. The system should be able to reason about data\\n\\nintegrity, and it should have robust governance and auditing mechanisms.\\n\\n- **\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\n\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\n\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\n\\n- **\\u0007Storage is decoupled from compute:** In practice, this means storage and\\n\\ncompute use separate clusters, thus these systems are able to scale to many more\\n\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\n\\nthis property.\\n\\n- **\\u0007Openness:** The storage formats they use are open and standardized, such as\\n\\nParquet, and they provide an API so a variety of tools and engines, including\\n\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\n\\n- **\\u0007Support for diverse data types ranging from unstructured to structured data:**\\n\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\n\\nfor many new data applications, including images, video, audio, semi-structured\", \"metadata\": {\"similarity_score\": 0.006188987, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\"}, \"id\": \"27aeb4ec0df5550cb0a51cb193c439bd\"}, {\"page_content\": \"Current lakehouses reduce cost, but their performance can still lag specialized\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\nconnectors to popular tools so they can appeal to a variety of personas. These\\nand other issues will be addressed as the technology continues to mature and\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\nproperties of being simpler, more cost-efficient and more capable of serving\\ndiverse data applications.\\n\\n\\n-----\\n\\n**Diving Deep Into the**\\n**Inner Workings of the**\\n**Lakehouse and Delta Lake**\\n\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\ndata architecture, some people thought the lakehouse is the same thing as\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\npaper that describes some of the core technological challenges and solutions that\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\ncan read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\n[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\n\\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\\nthey would have said faster horses.” The crux of this statement is that people often\\nenvision a better solution to a problem as an evolution of what they already know\\nrather than rethinking the approach to the problem altogether. In the world of data\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\n\\n\\n-----\\n\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\nstore data warehouses and data lakes. However, their nature as key-value stores\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\nand limited consistency guarantees.\\n\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\n\\n**1. Data lakes**\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\nof objects, typically in columnar format such as Apache Parquet. It’s an attractive\\napproach because the table is just a group of objects that can be accessed from\\na wide variety of tools without a lot of additional data stores or systems. However,\\nboth performance and consistency problems are common. Hidden data corruption\\nis common due to failed transactions, eventual consistency leads to inconsistent\\nqueries, latency is high, and basic management capabilities like table versioning and\\naudit logs are unavailable.\\n\\n**2. Custom storage engines**\\nThe second approach is custom storage engines, such as proprietary systems built for\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\nservice that’s able to provide a single source of truth. However, all I/O operations need\\nto connect to this metadata service, which can increase cloud resource costs and\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\ndata because these systems are generally optimized for traditional structured\\n\\n\\n-----\\n\\ndata types. 
Finally, and most egregiously, the proprietary metadata service locks\\ncustomers into a specific service provider, leaving customers to contend with\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\nadopt a new approach later.\", \"metadata\": {\"similarity_score\": 0.0060871253, \"doc_uri\": \"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\"}, \"id\": \"8375eac494bff392a37d6dff7c40c1b1\"}]\n\u001B[32m**********************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nThe responsibilities of a Data Engineer include:\n\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\n\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\n\n--------------------------------------------------------------------------------\n\u001B[33mAssistant\u001B[0m (to User):\n\nThe critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\n\n--------------------------------------------------------------------------------\n\u001B[33mAssistant\u001B[0m (to User):\n\nThe types of metrics tracked in game telemetry to understand player behavior and improve the game experience include:\n\n1. Player engagement: Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.\n2. Game progress: Monitor player progress through different levels and milestones in the game.\n3. In-game purchases: Track the number and value of in-game purchases made by players.\n4. Player demographics: Collect demographic information about players, such as age, gender, location, and device type.\n5. Session length: Monitor the length of each player session, and how often players return to the game.\n6. Retention: Track the percentage of players who return to the game after their first session.\n7. 
User Acquisition: Track the number of new players acquired through different marketing channels.\n\nAdditionally, game telemetry can also track other metrics such as:\n\n* Concurrent players: Track the number of players who are simultaneously connected to the game servers.\n* Server availability: Monitor the uptime and downtime of the game servers.\n* Latency: Measure the time it takes for data to travel from the player’s device to the game server and back.\n* Network bandwidth: Monitor the amount of data being transmitted between the player’s device and the game server.\n\nThese metrics can help game developers understand player behavior, identify areas for improvement, and make data-driven decisions to enhance the overall game experience.\n\n--------------------------------------------------------------------------------\n\u001B[33mAssistant\u001B[0m (to User):\n\nThe benefits of Delta Sharing in terms of data accessibility and platform compatibility include:\n\n1. **Open cross-platform sharing**: Delta Sharing allows for secure data sharing across different clouds and even from cloud to on-premises setups, without requiring data recipients to be on the same platform or cloud.\n2. **Securely share live data without replication**: Data providers can share live data without replicating or moving it to another system, and recipients can access the latest version of the data.\n3. **Centralized governance**: Data providers can centrally manage, govern, audit, and track usage of shared data sets from a single point of enforcement, meeting compliance and regulatory requirements.\n4. **Reduced time-to-value**: Delta Sharing eliminates the need to set up a new ingestion process to consume data, allowing recipients to directly access fresh data and query it using their tools of choice.\n5. **Lower cost**: Delta Sharing reduces the cost of managing and consuming shares for both data providers and recipients, as it eliminates the need for data replication and separate computing environments.\n6. **Increased flexibility**: Delta Sharing supports sharing of non-tabular data and data derivatives, such as data streams, AI models, SQL views, and arbitrary files, enabling increased collaboration and innovation.\n7. **Simplified data pipelines**: Delta Sharing simplifies data pipelines by enabling data providers to share existing large-scale data sets without moving data, and recipients can query, visualize, transform, ingest, or enrich shared data with their tools of choice.\n\nOverall, Delta Sharing provides an open, secure, and scalable solution for data sharing, allowing organizations to share data across different platforms and clouds, while maintaining control and governance over their data assets.\n\n--------------------------------------------------------------------------------\n\u001B[33mAssistant\u001B[0m (to User):\n\nThe main advantages of the lakehouse architecture over traditional on-premises data warehouses and cloud data warehouses include:\n\n1. **Support for diverse workloads**: Lakehouses allow for data science, machine learning, and SQL analytics to be performed on the same data repository, reducing the need for separate systems.\n2. **End-to-end streaming**: Lakehouses support real-time reports and eliminate the need for separate systems dedicated to serving real-time data applications.\n3. **Storage decoupled from compute**: Lakehouses allow storage and compute to use separate clusters, enabling them to scale to many more concurrent users and larger data sizes.\n4. 
**Openness**: Lakehouses use open and standardized storage formats, such as Apache Parquet, and provide an API for various tools and engines to access the data directly.\n5. **Support for diverse data types**: Lakehouses can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\n6. **Transaction support**: Lakehouses support ACID transactions, ensuring consistency as multiple parties concurrently read or write data.\n7. **Schema enforcement and governance**: Lakehouses have a way to support schema enforcement and evolution, supporting data warehouse schema paradigms such as star/snowflake-schemas.\n8. **BI support**: Lakehouses enable using BI tools directly on the source data, reducing staleness and improving recency, reducing latency, and lowering the cost of having to operationalize two copies of the data in both a data lake and a warehouse.\n\nThese advantages make lakehouses a more flexible, cost-efficient, and capable solution for serving diverse data applications compared to traditional on-premises data warehouses and cloud data warehouses.\n\n--------------------------------------------------------------------------------\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "\n", + "\n", + "\n", + " Evaluation output\n", + " \n", + " \n", + " \n", + "\n", + "\n", + "
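The transcript above follows AutoGen's standard two-agent tool-calling loop: the `Assistant` agent proposes a call to `search_product_docs`, the `User` proxy agent executes it and returns the tool output, and the exchange repeats until the assistant replies in plain text. The sketch below is only a generic illustration of that pattern against the autogen-agentchat 0.2.x API noted in the warning later in this output; it is not the cookbook's `FunctionCallingAgent` implementation (see `cookbook/agents/function_calling_agent.py`), and the LLM config values and the tool body are placeholders.

```python
# Minimal sketch of the two-agent tool-calling loop shown in the transcript above.
# Illustrative only: llm_config values and the tool body are placeholders.
from typing import Annotated, List, Optional

import autogen

llm_config = {
    "config_list": [
        {
            "model": "<model-endpoint-name>",   # placeholder
            "api_key": "<token>",               # placeholder
            "base_url": "<endpoint-base-url>",  # placeholder
        }
    ]
}

assistant = autogen.AssistantAgent(
    name="Assistant",
    system_message="Answer questions, calling search_product_docs when documentation is needed.",
    llm_config=llm_config,
)

# The user proxy auto-replies (no human input) and executes suggested tool calls.
user_proxy = autogen.UserProxyAgent(
    name="User",
    human_input_mode="NEVER",
    code_execution_config=False,
)


def search_product_docs(
    query: Annotated[str, "Search query"],
    filters: Annotated[Optional[List[dict]], "Optional metadata filters"] = None,
) -> List[dict]:
    # Placeholder body: the real tool queries a Vector Search index and returns
    # chunks with page_content / metadata / id fields, as seen in the log above.
    return []


# Register the tool so the assistant can suggest it and the user proxy executes it.
autogen.register_function(
    search_product_docs,
    caller=assistant,
    executor=user_proxy,
    name="search_product_docs",
    description="Retrieve relevant Databricks documentation chunks for a query.",
)

user_proxy.initiate_chat(
    assistant,
    message="What are the advantages of the lakehouse architecture over traditional data warehouses?",
)
```

Running a script like this produces the same `Suggested tool call` / `EXECUTING FUNCTION` / `USING AUTO REPLY` trace structure seen above, with the final plain-text answer ending the loop.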
\n", + "
\n", + " \n", + " \n", + " \n", + " \n", + " \n", + " Evaluation completed: 5/19 pass all assessments\n", + "
\n", + "
\n", + "
    \n", + " \n", + "
  • ℹ️ 8/19 could not run model
  • \n", + " \n", + "
  • ℹ️ 1/19 could not run all selected judges
  • \n", + " \n", + "
\n", + "
\n", + " \n", + "
\n", + "
Python APIs:
\n", + "
    \n", + "
  • mlflow.evaluate() returns a result object that is an instance of EvaluationResult.\n", + "
  • \n", + "
  • \n", + " result.metrics: A dictionary of aggregated evaluation results.\n", + "
  • \n", + "
  • \n", + " result.tables['eval_results']: A DataFrame of per-request evaluation results.\n", + "
  • \n", + "
\n", + "\n", + "
\n", + " For more details, see the mlflow.evaluate\n", + " API reference or the MLflow\n", + " LLM Evaluate Quickstart.\n", + "
\n", + "
\n", + "\n", + "
\n", + "\n", + "" + ] + }, + "metadata": {}, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/12/18 17:44:45 WARNING mlflow.utils.autologging_utils: MLflow autogen autologging is known to be compatible with 0.2.36 <= autogen-agentchat <= 0.2.39, but the installed version is 0.2.40. If you encounter errors during autologging, try upgrading / downgrading autogen-agentchat to a compatible version, or try upgrading MLflow.\n" + ] + }, + { + "output_type": "display_data", + "data": { + "text/html": [ + "
request_idrequestexpected_retrieved_contextexpected_factstracemodel_error_messagesource_idresponseresponse/overall_assessment/ratingresponse/overall_assessment/rationaleresponse/llm_judged/safety/ratingresponse/llm_judged/safety/rationaleresponse/llm_judged/correctness/ratingresponse/llm_judged/correctness/rationaleagent/latency_secondsagent/total_token_countagent/total_input_token_countagent/total_output_token_countretrieval/llm_judged/context_sufficiency/error_messageresponse/llm_judged/groundedness/error_messageretrieved_contextresponse/llm_judged/groundedness/ratingresponse/llm_judged/groundedness/rationaleretrieval/llm_judged/context_sufficiency/ratingretrieval/llm_judged/context_sufficiency/rationaleretrieval/ground_truth/document_ratingsretrieval/ground_truth/document_recall
21866cbed9a5ba0daafc9367a06f6679f7e6290dd05b59cfd45d36fdbc8fbe73List(List(List(Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?, user)))List(List(**EBOOK**\n", + "\n", + "## The Big Book of Data Engineering 2nd Edition\n", + "\n", + "A collection of technical\n", + "blogs, including code\n", + "samples and notebooks\n", + "\n", + "##### With all-new content\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Contents\n", + "\n", + "**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n", + "\n", + "**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n", + "\n", + "**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n", + "\n", + "**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n", + "\n", + "**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n", + "\n", + "**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n", + "\n", + "**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n", + "\n", + "**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n", + "\n", + "**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n", + "\n", + "**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n", + "\n", + "**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n", + "\n", + "**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n", + "\n", + "**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n", + "\n", + "**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n", + "\n", + "**4 . 
1** Akamai .................................................................................................................................................................................................... 77\n", + "\n", + "**4 . 2** Grammarly ........................................................................................................................................................................................... 80\n", + "\n", + "**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n", + "\n", + "**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n", + "\n", + "**4 . 5** Rivian .................................................................................................................................................................................................... 90\n", + "\n", + "**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 01\n", + "\n", + "\n", + "### Introduction to Data Engineering on Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "Organizations realize the value data plays as a strategic asset for various\n", + "business-related initiatives, such as growing revenues, improving the customer\n", + "experience, operating efficiently or improving a product or service. However,\n", + "accessing and managing data for these initiatives has become increasingly\n", + "complex. Most of the complexity has arisen with the explosion of data volumes\n", + "and data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\n", + "to increase, 73% of the data goes unused for analytics or decision-making. In\n", + "order to try and decrease this percentage and make more data usable, data\n", + "engineering teams are responsible for building data pipelines to efficiently and\n", + "reliably deliver data. 
But the process of building these complex data pipelines\n", + "comes with a number of difficulties:\n", + "\n", + "**•** In order to get data into a data lake, data engineers are required\n", + "to spend immense time hand-coding repetitive data ingestion tasks\n", + "\n", + "**•** Since data platforms continuously change, data engineers\n", + "spend time building and maintaining, and then rebuilding, complex\n", + "scalable infrastructure\n", + "\n", + "**•** As data pipelines become more complex, data engineers are\n", + "required to find reliable tools to orchestrate these pipelines\n", + "\n", + "**•** With the increasing importance of real-time data, low latency data\n", + "pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "**•** Finally, with all pipelines written, data engineers need to constantly\n", + "focus on performance, tuning pipelines and architectures to meet SLAs\n", + "\n", + "\n", + "**How can Databricks help?**\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The Lakehouse Platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "Lakehouse Platform\n", + "\n", + "**One platform to support multiple personas**\n", + "\n", + "\n", + "**BI & Data**\n", + "**Warehousing**\n", + "\n", + "\n", + "**Data**\n", + "**Engineering**\n", + "\n", + "\n", + "**Data**\n", + "**Streaming**\n", + "\n", + "\n", + "**Data**\n", + "**Science & ML**\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "\n", + "**Unity Catalog**\n", + "**Fine-grained governance for data and AI**\n", + "\n", + "**Delta Lake**\n", + "**Data reliability and performance**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "All Raw Data (Logs, Texts, Audio, Video, Images)\n", + "\n", + "\n", + "Figure 1\n", + "The Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key differentiators for successful data engineering**\n", + "**with Databricks**\n", + "\n", + "By simplifying on a lakehouse architecture, data engineers need an\n", + "enterprise-grade and enterprise-ready approach to building data pipelines.\n", + "To be successful, a data engineering solution team must embrace these eight\n", + "key differentiating capabilities:\n", + "\n", + "**Data ingestion at scale**\n", + "With the ability to ingest petabytes of data with auto-evolving schemas,\n", + "data engineers can deliver fast, reliable, scalable and automatic data for\n", + "analytics, data science or machine learning. 
This includes:\n", + "\n", + "**•** Incrementally and efficiently processing data as it arrives\n", + "from files or streaming sources like Kafka, DBMS and NoSQL\n", + "\n", + "**•** Automatically inferring schema and detecting column\n", + "changes for structured and unstructured data formats\n", + "\n", + "**•** Automatically and efficiently tracking data as it arrives with\n", + "\n", + "no manual intervention\n", + "\n", + "**•** Preventing data loss by rescuing data columns\n", + "\n", + "\n", + "**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified orchestration of data workflows**\n", + "Simple, clear and reliable orchestration of data processing tasks for data,\n", + "analytics and machine learning pipelines with the ability to run multiple\n", + "non-interactive tasks as a directed acyclic graph (DAG) on a Databricks\n", + "compute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\n", + "in a DAG using Databricks Workflows, an orchestration tool included in the\n", + "lakehouse with no need to maintain or pay for an external orchestration service.\n", + "\n", + "**•** Easily create and manage multiple tasks with dependencies via UI,\n", + "API or from your IDE\n", + "\n", + "**•** Have full observability to all workflow runs and get alerted when\n", + "tasks fail for fast troubleshooting and efficient repair and rerun\n", + "\n", + "**•** Leverage high reliability of 99.95% uptime\n", + "\n", + "**•** Use performance optimization clusters that parallelize jobs and\n", + "minimize data movement with cluster reuse\n", + "\n", + "**Data quality validation and monitoring**\n", + "Improve data reliability throughout the data lakehouse so data teams can\n", + "confidently trust the information for downstream initiatives by:\n", + "\n", + "**•** Defining data quality and integrity controls within the pipeline\n", + "with defined data expectations\n", + "\n", + "**•** Addressing data quality errors with predefined policies\n", + "(fail, drop, alert, quarantine)\n", + "\n", + "**•** Leveraging the data quality metrics that are captured, tracked\n", + "and reported for the entire data pipeline\n", + "\n", + "\n", + "Data\n", + "Sources\n", + "\n", + "Data\n", + "Warehouses\n", + "\n", + "On-premises\n", + "Systems\n", + "\n", + "SaaS\n", + "Applications\n", + "\n", + "Machine &\n", + "Application Logs\n", + "\n", + "Application\n", + "Events\n", + "\n", + "Mobile & IoT\n", + "Data\n", + "\n", + "\n", + "Cloud\n", + "Storage\n", + "\n", + "Messag\n", + "e Buses\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "**Workflows** for end-to-end orchestration\n", + "\n", + "\n", + "Real-Time BI Apps\n", + "\n", + "Real-Time AI Apps\n", + "\n", + "\n", + "Real-Time Analytics with\n", + "**Databricks SQL**\n", + "\n", + "Real-Time Machine Learning\n", + "with\n", + "**Databricks ML**\n", + "\n", + "\n", + "Streaming ETL with\n", + "**Delta Live Tables**\n", + "\n", + "\n", + "Predictive\n", + "Maintenance\n", + "\n", + "\n", + "Personalized\n", + "Offers\n", + "\n", + "\n", + "Patient\n", + "Diagnostics\n", + "\n", + "\n", + "Real-Time Operational\n", + "Apps\n", + "\n", + "\n", + "Real-Time Applications with\n", + "**Spark Structured Streaming**\n", + "\n", + "**Photon** for lightning-fast data processing\n", + "\n", + "**Unity Catalog** for data governance and sharing\n", + "\n", + "**Delta Lake** for open and reliable data storage\n", + "\n", + "\n", + "Alerts Detection Fraud\n", + "\n", + "\n", + "Dynamic\n", + "Pricing\n", + "\n", + "\n", + "©2023 Databricks Inc. 
— All rights reserved\n", + "\n", + "Figure 2\n", + "A unified set of tools for real-time data processing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fault tolerant and automatic recovery**\n", + "Handle transient errors and recover from most common error conditions\n", + "occurring during the operation of a pipeline with fast, scalable automatic\n", + "recovery that includes:\n", + "\n", + "**•** Fault tolerant mechanisms to consistently recover the state of data\n", + "\n", + "**•** The ability to automatically track progress from the source with\n", + "checkpointing\n", + "\n", + "**•** The ability to automatically recover and restore the data pipeline state\n", + "\n", + "**Data pipeline observability**\n", + "Monitor overall data pipeline status from a dataflow graph dashboard and\n", + "visually track end-to-end pipeline health for performance, quality and latency.\n", + "Data pipeline observability capabilities include:\n", + "\n", + "**•** A high-quality, high-fidelity lineage diagram that provides visibility\n", + "into how data flows for impact analysis\n", + "\n", + "**•** Granular logging with performance and status of the data pipeline\n", + "at a row level\n", + "\n", + "**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n", + "\n", + "\n", + "**Automatic deployments and operations**\n", + "Ensure reliable and predictable delivery of data for analytics and machine\n", + "learning use cases by enabling easy and automatic data pipeline deployments\n", + "and rollbacks to minimize downtime. Benefits include:\n", + "\n", + "**•** Complete, parameterized and automated deployment for the\n", + "continuous delivery of data\n", + "\n", + "**•** End-to-end orchestration, testing and monitoring of data pipeline\n", + "deployment across all major cloud providers\n", + "\n", + "**Migrations**\n", + "Accelerating and de-risking the migration journey to the lakehouse, whether\n", + "from legacy on-prem systems or disparate cloud services.\n", + "\n", + "The migration process starts with a detailed discovery and assessment to\n", + "get insights on legacy platform workloads and estimate migration as well as\n", + "Databricks platform consumption costs. Get help with the target architecture\n", + "and how the current technology stack maps to Databricks, followed by a\n", + "phased implementation based on priorities and business needs. Throughout\n", + "this journey companies can leverage:\n", + "\n", + "**•** Automation tools from Databricks and its ISV partners\n", + "\n", + "**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n", + "\n", + "**•** Databricks Professional Services and training\n", + "\n", + "This is the recommended approach for a successful migration, whereby\n", + "customers have seen a 25-50% reduction in costs and 2-3x faster time to value\n", + "for their use cases.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified governance**\n", + "With Unity Catalog, data engineering and governance teams benefit from an\n", + "enterprisewide data catalog with a single interface to manage permissions,\n", + "centralize auditing, automatically track data lineage down to the column level,\n", + "and share data across platforms, clouds and regions. 
Benefits:\n", + "\n", + "**•** Discover all your data in one place, no matter where it lives,\n", + "and centrally manage fine-grained access permissions using an\n", + "ANSI SQL-based interface\n", + "\n", + "**•** Leverage automated column-level data lineage to perform impact\n", + "analysis of any data changes across the pipeline and conduct\n", + "root cause analysis of any errors in the data pipelines\n", + "\n", + "**•** Centrally audit data entitlements and access\n", + "\n", + "**•** Share data across clouds, regions and data platforms,\n", + "while maintaining a single copy of your data in your cloud storage\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "Figure 3\n", + "The Databricks Lakehouse Platform integrates with a large collection of technologies\n", + "\n", + "\n", + "**A rich ecosystem of data solutions**\n", + "The Databricks Lakehouse Platform is built on open source technologies and\n", + "uses open standards so leading data solutions can be leveraged with anything\n", + "you build on the lakehouse. A large collection of technology partners make it\n", + "easy and simple to integrate the technologies you rely on when migrating to\n", + "Databricks and to know you are not locked into a closed data technology stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Conclusion**\n", + "\n", + "As organizations strive to become data-driven, data engineering is a focal\n", + "point for success. To deliver reliable, trustworthy data, data engineers shouldn’t\n", + "need to spend time manually developing and maintaining an end-to-end\n", + "ETL lifecycle. Data engineering teams need an efficient, scalable way to\n", + "simplify ETL development, improve data reliability and manage operations.\n", + "\n", + "As described, the eight key differentiating capabilities simplify the\n", + "management of the ETL lifecycle by automating and maintaining all data\n", + "dependencies, leveraging built-in quality controls with monitoring and by\n", + "providing deep visibility into pipeline operations with automatic recovery.\n", + "Data engineering teams can now focus on easily and rapidly building reliable\n", + "end-to-end production-ready data pipelines using only SQL or Python\n", + "for batch and streaming that deliver high-value data for analytics, data\n", + "science or machine learning.\n", + "\n", + "\n", + "**Follow proven best practices**\n", + "\n", + "In the next section, we describe best practices for data engineering\n", + "end-to end use cases drawn from real-world examples. 
From data ingestion\n", + "and real-time processing to analytics and machine learning, you’ll learn\n", + "how to translate raw data into actionable data.\n", + "\n", + "As you explore the rest of this guide, you can find data sets and code\n", + "samples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\n", + "get your hands dirty as you explore all aspects of the data lifecycle on the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "**SECTION**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### Guidance and Best Practices\n", + "\n", + "**2.1** Top 5 Databricks Performance Tips\n", + "\n", + "**2.2** How to Profile PySpark\n", + "\n", + "**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n", + "\n", + "**2.4** Streaming in Production: Collected Best Practices\n", + "\n", + "**2.5** Streaming in Production: Collected Best Practices, Part 2\n", + "\n", + "**2.6** Building Geospatial Data Products\n", + "\n", + "**2.7** Data Lineage With Unity Catalog\n", + "\n", + "**2.8** Easy Ingestion to Lakehouse With COPY INTO\n", + "\n", + "**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n", + "\n", + "**2.10** Best Practices for Cross-Government Data Sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.1\n", + "\n", + "**Top 5 Databricks Performance Tips**\n", + "\n", + "by **B R YA N S M I T H** and **R O B S A K E R**\n", + "\n", + "March 10, 2022\n", + "\n", + "\n", + "As solutions architects, we work closely with customers every day to help them\n", + "get the best performance out of their jobs on Databricks — and we often end\n", + "up giving the same advice. It’s not uncommon to have a conversation with a\n", + "customer and get double, triple, or even more performance with just a few\n", + "tweaks. So what’s the secret? How are we doing this? Here are the top 5 things\n", + "we see that can make a huge impact on the performance customers get\n", + "from Databricks.\n", + "\n", + "Here’s a TLDR:\n", + "\n", + "**•** **Use larger clusters.** It may sound obvious, but this is the number\n", + "one problem we see. It’s actually not any more expensive to use a large\n", + "cluster for a workload than it is to use a smaller one. It’s just faster.\n", + "If there’s anything you should take away from this article, it’s this.\n", + "\n", + "Read section 1. Really.\n", + "\n", + "**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\n", + "to learn more. You won’t regret it.\n", + "\n", + "\n", + "\n", + "**•** **Clean out your configurations** . Configurations carried from one\n", + "Apache Spark™ version to the next can cause massive problems. Clean up!\n", + "Read section 3 to learn more.\n", + "\n", + "**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\n", + "correctly, if at all. See Section 4 to learn more.\n", + "\n", + "**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\n", + "you’re writing Spark code, jump to section 5.\n", + "\n", + "**•** **Bonus tip! 
Table design is super important** . We’ll go into this in a future\n", + "blog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n", + "\n", + "**1. Give your clusters horsepower!**\n", + "\n", + "This is the number one mistake customers make. Many customers create tiny\n", + "clusters of two workers with four cores each, and it takes forever to do anything.\n", + "The concern is always the same: they don’t want to spend too much money on\n", + "larger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n", + "**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n", + "\n", + "\n", + "-----\n", + "\n", + "The key is that you’re renting the cluster for the length of the workload. So, if\n", + "you spin up that two worker cluster and it takes an hour, you’re paying for those\n", + "workers for the full hour. However, if you spin up a four worker cluster and it takes\n", + "only half an hour, the cost is actually the same! And that trend continues as long\n", + "as there’s enough work for the cluster to do.\n", + "\n", + "Here’s a hypothetical scenario illustrating the point:\n", + "\n", + "**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n", + "\n", + "1 $1 2 $2\n", + "\n", + "2 $2 1 $2\n", + "\n", + "4 $4 0.5 $2\n", + "\n", + "8 $8 0.25 $2\n", + "\n", + "Notice that the total cost of the workload stays the same while the real-world\n", + "time it takes for the job to run drops significantly. So, bump up your Databricks\n", + "cluster specs and speed up your workloads without spending any more money. It\n", + "\n", + "can’t really get any simpler than that.\n", + "\n", + "**2. Use Photon**\n", + "\n", + "Our colleagues in engineering have rewritten the Spark execution engine in C++\n", + "and dubbed it Photon. The results are impressive!\n", + "\n", + "\n", + "Beyond the obvious improvements due to running the engine in native code,\n", + "they’ve also made use of CPU-level performance features and better memory\n", + "\n", + "management. On top of this, they’ve rewritten the Parquet writer in C++. So this\n", + "makes writing to Parquet and Delta (based on Parquet) super fast as well!\n", + "\n", + "But let’s also be clear about what Photon is speeding up. It improves\n", + "computation speed for any built-in functions or operations, as well as writes to\n", + "Parquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n", + "(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\n", + "spending most of its time reading from an ancient on-prem database? Won’t\n", + "help there either, unfortunately.\n", + "\n", + "\n", + "-----\n", + "\n", + "The good news is that it helps where it can. So even if part of your job can’t be\n", + "sped up, it will speed up the other parts. Also, most jobs are written with the\n", + "native operations and spend a lot of time writing to Delta, and Photon helps a lot\n", + "there. So give it a try. You may be amazed by the results!\n", + "\n", + "**3. Clean out old configurations**\n", + "\n", + "You know those Spark configurations you’ve been carrying along from version to\n", + "version and no one knows what they do anymore? They may not be harmless.\n", + "We’ve seen jobs go from running for hours down to minutes simply by cleaning\n", + "out old configurations. 
There may have been a quirk in a particular version of\n", + "Spark, a performance tweak that has not aged well, or something pulled off\n", + "some blog somewhere that never really made sense. At the very least, it’s worth\n", + "revisiting your Spark configurations if you’re in this situation. Often the default\n", + "configurations are the best, and they’re only getting better. Your configurations\n", + "may be holding you back.\n", + "\n", + "**4. The Delta Cache is your friend**\n", + "\n", + "This may seem obvious, but you’d be surprised how many people are not using\n", + "the [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\n", + "the workers’ SSDs for faster access.\n", + "\n", + "\n", + "If you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\n", + "by default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\n", + "your “hot” tables when you’re starting an endpoint. This will ensure blazing fast\n", + "speeds for any queries on those tables.\n", + "\n", + "If you’re using regular clusters, be sure to use the i3 series on Amazon Web\n", + "Services (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\n", + "all have fast SSDs and caching enabled by default.\n", + "\n", + "Of course, your mileage may vary. If you’re doing BI, which involves reading the\n", + "same tables over and over again, caching gives an amazing boost. However, if\n", + "you’re simply reading a table once and writing out the results as in some ETL\n", + "jobs, you may not get much benefit. You know your jobs better than anyone.\n", + "Go forth and conquer.\n", + "\n", + "\n", + "-----\n", + "\n", + "**5. Be aware of lazy evaluation**\n", + "\n", + "\n", + "However, there is a catch here. Every time you try to display or write out\n", + "results, it runs the execution plan again. Let’s look at the same block of code\n", + "but extend it and do a few more operations.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ".filter(...)\n", + ")\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "\n", + "_# Unfortunately this will run the plan again, including filtering, joining,_\n", + "_etc_\n", + "df2.display()\n", + "\n", + "_# So will this…_\n", + "df2.count()\n", + "—------\n", + "\n", + "\n", + "If you’re a data analyst or data scientist only using SQL or doing BI you can skip\n", + "this section. However, if you’re in data engineering and writing pipelines or doing\n", + "processing using Databricks/Spark, read on.\n", + "\n", + "When you’re writing Spark code like select, groupBy, filter, etc., you’re really\n", + "building an execution plan. You’ll notice the code returns almost immediately when\n", + "you run these functions. That’s because it’s not actually doing any computation. So\n", + "even if you have petabytes of data, it will return in less than a second.\n", + "\n", + "However, once you go to write your results out you’ll notice it takes longer. This\n", + "is due to lazy evaluation. 
It’s not until you try to display or write results that your\n", + "execution plan is actually run.\n", + "\n", + "—-------\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + "\n", + "\n", + "_# Now run the execution plan to get results_\n", + "df2.display()\n", + "—------\n", + "\n", + "\n", + "-----\n", + "\n", + "The developer of this code may very well be thinking that they’re just printing\n", + "out results three times, but what they’re really doing is kicking off the same\n", + "processing three times. Oops. That’s a lot of extra work. This is a very common\n", + "mistake we run into. So why is there lazy evaluation, and what do we do about it?\n", + "\n", + "In short, processing with lazy evaluation is way faster than without it.\n", + "Databricks/Spark looks at the full execution plan and finds opportunities\n", + "for optimization that can reduce processing time by orders of magnitude.\n", + "So that’s great, but how do we avoid the extra computation? The answer\n", + "is pretty straightforward: save computed results you will reuse.\n", + "\n", + "\n", + "This works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\n", + "benefit greatly from lazy evaluation, but it’s something a lot of customers trip\n", + "over. So be aware of its existence and save results you reuse in order to avoid\n", + "unnecessary computation.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "Let’s look at the same block of code again, but this time let’s avoid the\n", + "recomputation:\n", + "\n", + "_# Build an execution plan._\n", + "_# This returns in less than a second but does no work_\n", + "df2 = (df\n", + ".join(...)\n", + ".select(...)\n", + ". filter (...)\n", + ")\n", + "\n", + "_# save it_\n", + "df2.write.save(path)\n", + "\n", + "_# load it back in_\n", + "df3 = spark.read.load(path)\n", + "\n", + "_# now use it_\n", + "df3.display()\n", + "\n", + "_# this is not doing any extra computation anymore. No joins, filtering,_\n", + "_etc. It’s already done and saved._\n", + "df3.display()\n", + "\n", + "_# nor is this_\n", + "df3.count()\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.2 \u0007\n", + "\n", + "**How to Profile PySpark**\n", + "\n", + "by **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n", + "\n", + "October 6, 2022\n", + "\n", + "\n", + "In Apache Spark™, declarative Python APIs are supported for big data workloads.\n", + "They are powerful enough to handle most common use cases. Furthermore,\n", + "PySpark UDFs offer more flexibility since they enable users to run arbitrary\n", + "Python code on top of the Apache Spark™ engine. Users only have to state\n", + "“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\n", + "PySpark easier to use, but it can be difficult to identify performance bottlenecks\n", + "and apply custom optimizations.\n", + "\n", + "To address the difficulty mentioned above, PySpark supports various profiling\n", + "tools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n", + "[implementations](https://docs.python.org/3/library/profile.html) . 
PySpark Profilers provide information such as the number\n",
+    "of function calls, total time spent in the given function, and filename, as well\n",
+    "as line number to help navigation. That information is essential to exposing\n",
+    "tight loops in your PySpark programs, and allowing you to make performance\n",
+    "improvement decisions.\n",
+    "\n",
+    "\n",
+    "**Driver profiling**\n",
+    "\n",
+    "PySpark applications run as independent sets of processes on a cluster,\n",
+    "coordinated by the SparkContext object in the driver program. On the driver\n",
+    "side, PySpark is a regular Python process; thus, we can profile it as a normal\n",
+    "Python program using cProfile as illustrated below:\n",
+    "\n",
+    "import cProfile\n",
+    "\n",
+    "with cProfile.Profile() as pr:\n",
+    "    # Your code\n",
+    "\n",
+    "pr.print_stats()\n",
+    "\n",
+    "**Workers profiling**\n",
+    "\n",
+    "Executors are distributed on worker nodes in the cluster, which introduces\n",
+    "complexity because we need to aggregate profiles. Furthermore, a Python worker\n",
+    "process is spawned per executor for PySpark UDF execution, which makes the\n",
+    "profiling more intricate.\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "The UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\n",
+    "and becomes a major tool to profile workers for PySpark applications. We’ll\n",
+    "illustrate how to use the UDF profiler with a simple Pandas UDF example.\n",
+    "\n",
+    "Firstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n",
+    "```\n",
+    " from pyspark.sql.functions import col, rand\n",
+    "\n",
+    " sdf = spark.range(0, 8 * 1000).withColumn(\n",
+    "     'id', (col('id') % 8).cast('integer')  # 1000 rows x 8 groups (if grouped by 'id')\n",
+    " ).withColumn('v', rand())\n",
+    "\n",
+    "```\n",
+    "Later, we will group by the id column, which results in 8 groups with 1,000 rows\n",
+    "per group.\n",
+    "\n",
+    "The Pandas UDF plus_one is then created and applied as shown below:\n",
+    "```\n",
+    " import pandas as pd\n",
+    "\n",
+    " def plus_one(pdf: pd.DataFrame) -> pd.DataFrame:\n",
+    "     return pdf.apply(lambda x: x + 1, axis=1)\n",
+    "\n",
+    " res = sdf.groupby(\"id\").applyInPandas(plus_one, schema=sdf.schema)\n",
+    " res.collect()\n",
+    "\n",
+    "```\n",
+    "\n",
+    "Executing the example above and running sc.show_profiles() prints the\n",
+    "profile report. The profile can also be dumped to disk by calling\n",
+    "sc.dump_profiles(path).\n",
+    "\n",
+    "The UDF id in the profile (271 in this run) matches that in the Spark plan\n",
+    "for res. The Spark plan can be shown by calling res.explain().\n",
+    "\n",
+    "\n",
+    "Note that plus_one takes a pandas DataFrame and returns another pandas\n",
+    "DataFrame. For each group, all columns are passed together as a pandas\n",
+    "DataFrame to the plus_one UDF, and the returned pandas DataFrames are\n",
+    "combined into a PySpark DataFrame.\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "The first line in the profile’s body indicates the total number of calls that were\n",
+    "monitored. 
The column heading includes\n",
+    "\n",
+    "**•** ncalls, for the number of calls.\n",
+    "\n",
+    "**•** tottime, for the total time spent in the given function (excluding time\n",
+    "spent in calls to sub-functions)\n",
+    "\n",
+    "**•** percall, the quotient of tottime divided by ncalls\n",
+    "\n",
+    "**•** cumtime, the cumulative time spent in this and all subfunctions (from\n",
+    "invocation till exit)\n",
+    "\n",
+    "**•** percall, the quotient of cumtime divided by primitive calls\n",
+    "\n",
+    "**•** filename:lineno(function), which provides the respective information\n",
+    "for each function\n",
+    "\n",
+    "Digging into the column details: plus_one is triggered once per group, 8 times\n",
+    "in total; _arith_method of pandas Series is called once per row, 8,000 times\n",
+    "in total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\n",
+    "row, thus suffering from high invocation overhead.\n",
+    "\n",
+    "We can reduce such overhead by substituting the pandas.DataFrame.apply\n",
+    "with pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\n",
+    "follows:\n",
+    "```\n",
+    " import pandas as pd\n",
+    "\n",
+    " def plus_one_optimized(pdf: pd.DataFrame) -> pd.DataFrame:\n",
+    "     return pdf + 1\n",
+    "\n",
+    " res = sdf.groupby(\"id\").applyInPandas(plus_one_optimized, schema=sdf.schema)\n",
+    " res.collect()\n",
+    "\n",
+    "```\n",
+    "\n",
+    "The updated profile confirms the improvement. We can summarize the\n",
+    "optimizations as follows:\n",
+    "\n",
+    "**•** Arithmetic operation from 8,000 calls to 8 calls\n",
+    "\n",
+    "**•** Total function calls from 2,898,160 calls to 2,384 calls\n",
+    "\n",
+    "**•** Total execution time from 2.300 seconds to 0.004 seconds\n",
+    "\n",
+    "The short example above demonstrates how the UDF profiler helps us deeply\n",
+    "understand the execution, identify the performance bottleneck and enhance\n",
+    "the overall performance of the user-defined function.\n",
+    "\n",
+    "The UDF profiler was implemented based on the executor-side profiler,\n",
+    "which is designed for the PySpark RDD API. The executor-side profiler is available\n",
+    "in all active Databricks Runtime versions.\n",
+    "\n",
+    "\n",
+    "-----\n",
+    "\n",
+    "Both the UDF profiler and the executor-side profiler run on Python workers.\n",
+    "They are controlled by the spark.python.profile Spark configuration, which\n",
+    "is false by default. We can enable that Spark configuration on a Databricks\n",
+    "Runtime cluster by adding spark.python.profile true to the cluster’s Spark config.\n",
+    "\n",
+    "\n",
+    "**Conclusion**\n",
+    "\n",
+    "PySpark profilers are implemented based on cProfile; thus, the profile reporting\n",
+    "relies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\n",
+    "collecting profile reports from Python workers.\n",
+    "\n",
+    "Powerful profilers are provided by PySpark in order to identify hot loops and\n",
+    "suggest potential improvements. They are easy to use and critical to enhance\n",
+    "the performance of PySpark programs. 
The UDF profiler, which is available\n", + "starting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\n", + "challenges and brings insights to user-defined functions.\n", + "\n", + "In addition, there is an ongoing effort in the Apache Spark™ open source\n", + "community to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\n", + "more information.\n", + "\n", + "**Start experimenting with these**\n", + "**free Databricks** **notebooks** **.**\n", + "\n", + "\n", + "-----\n", + "\n", + "SECTION 2.3 \u0007\n", + "\n", + "**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n", + "**and Apache Kafka**\n", + "\n", + "by **F R A N K M U N Z**\n", + "\n", + "August 9, 2022\n", + "\n", + "\n", + "[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\n", + "approach for creating reliable data pipelines and fully manages the underlying\n", + "infrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\n", + "actionable insights derived from near real-time data. Delta Live Tables enables\n", + "low-latency streaming data pipelines to support such use cases with low\n", + "latencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n", + "[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n", + "\n", + "This article will walk through using DLT with Apache Kafka while providing the\n", + "required Python code to ingest streams. The recommended system architecture\n", + "will be explained, and related DLT settings worth considering will be explored\n", + "along the way.\n", + "\n", + "**Streaming platforms**\n", + "\n", + "Event buses or message buses decouple message producers from consumers.\n", + "A popular streaming use case is the collection of click-through data from\n", + "users navigating a website where every user interaction is stored as an event in\n", + "\n", + "\n", + "Apache Kafka. The event stream from Kafka is then used for real-time streaming\n", + "data analytics. Multiple message consumers can read the same data from Kafka\n", + "and use the data to learn about audience interests, conversion rates, and bounce\n", + "reasons. The real-time, streaming event data from the user interactions often\n", + "also needs to be correlated with actual purchases stored in a billing database.\n", + "\n", + "**Apache Kafka**\n", + "\n", + "[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. Kafka uses the concept of a\n", + "topic, an append-only distributed log of events where messages are buffered for\n", + "a certain amount of time. Although messages in Kafka are not deleted once they\n", + "are consumed, they are also not stored indefinitely. The message retention for\n", + "\n", + "Kafka can be configured per topic and defaults to 7 days. 
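\n",
+    "\n",
+    "For a concrete sense of what consuming such a topic looks like, here is a minimal\n",
+    "Structured Streaming sketch (the broker address and topic name are placeholders;\n",
+    "DLT pipelines typically wrap this same kind of Kafka source in a table definition):\n",
+    "```\n",
+    "# Minimal sketch: subscribe to a Kafka topic as a streaming DataFrame.\n",
+    "# 'broker1:9092' and 'clickstream' are hypothetical values.\n",
+    "raw_events = (\n",
+    "    spark.readStream.format('kafka')\n",
+    "    .option('kafka.bootstrap.servers', 'broker1:9092')\n",
+    "    .option('subscribe', 'clickstream')\n",
+    "    .load()\n",
+    ")\n",
+    "\n",
+    "# The Kafka payload arrives as binary; cast it before parsing.\n",
+    "events = raw_events.selectExpr('CAST(key AS STRING)', 'CAST(value AS STRING)')\n",
+    "```\n",
+    "\n",
+    "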
Expired messages will\n", + "be deleted eventually.\n", + "\n", + "This article is centered around Apache Kafka; however, the concepts discussed\n", + "also apply to many other event busses or messaging systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Streaming data pipelines**\n", + "\n", + "\n", + "In a data flow pipeline, Delta Live Tables and their dependencies can be declared\n", + "with a standard SQL Create Table As Select (CTAS) statement and the DLT\n", + "keyword “live.”\n", + "\n", + "When developing DLT with Python, the @dlt.table decorator is used to, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf))List(Larger clusters execute workloads faster in Databricks., The faster execution reduces the total time required for workload completion., The overall cost efficiency is balanced due to reduced workload completion time despite higher hourly costs.){\"info\": {\"request_id\": \"tr-fdd84cee84c14b1cbd05fef9afda0573\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852705, \"execution_time_ms\": 1874, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"0928244e-ca9f-4d04-839e-afa0c6c57ecc\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-fdd84cee84c14b1cbd05fef9afda0573/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x1c2b5d55408ec680\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": null, \"start_time\": 1734543852705036571, \"end_time\": 1734543854579582448, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xc378632f7d05e4e5\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, 
\"parent_id\": \"0x1c2b5d55408ec680\", \"start_time\": 1734543852723331103, \"end_time\": 1734543852888574298, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xa3dbaf3bb677995f\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0x1c2b5d55408ec680\", \"start_time\": 1734543852888704900, \"end_time\": 1734543852889256407, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x0027ea0bf61b6abb\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0xa3dbaf3bb677995f\", \"start_time\": 1734543852888865002, \"end_time\": 1734543852889028004, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xb720815986795f4f\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0xa3dbaf3bb677995f\", \"start_time\": 1734543852889092305, \"end_time\": 1734543852889221106, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x456d9ec6e5fd8501\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0x1c2b5d55408ec680\", \"start_time\": 1734543852889318008, 
\"end_time\": 1734543855416920308, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543855416856807, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, 
config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, 
stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0xb9e384e320bcec52\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0x456d9ec6e5fd8501\", \"start_time\": 1734543853073605944, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", 
\"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854579418, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File 
\\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\", \\\"params\\\": null}\", \"response\": null}}Fail to invoke the model with {'messages': [{'content': 'Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdfnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
088c4943384eaa6a228c3d68ff70fbef6bcbe9c50176180e73244de1d7f3be1aList(List(List(What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?, user)))List(List(```\n", + "TECHNICAL GUIDE\n", + "\n", + "```\n", + "\n", + "# Solving Common Data Challenges \n", + "\n", + "\n", + "#### Startups and Digital Native Businesses\n", + "\n", + "\n", + "-----\n", + "\n", + "### Table of Contents\n", + "\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Creating a unified data architecture for data quality, governance and efficiency\n", + "\n", + "# 03\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building effective machine learning operations\n", + "\n", + "```\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE:\n", + " \u0003\n", + "\n", + "###### Building a data architecture to support scale and performance\n", + "\n", + "# 04\n", + "SUMMARY:\n", + "\n", + "###### The Databricks Lakehouse Platform addresses these challenges\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "**I N T R O D U C T I O N**\n", + "\n", + "\n", + "This guide shares how the lakehouse architecture can increase\n", + "productivity and cost-efficiently support all your data, analytics\n", + "and AI workloads, and flexibly scale with the pace of growth\n", + "for your company. Read the entire guide or dive straight into a\n", + "specific challenge.\n", + "\n", + "With the advent of cloud infrastructure, a new generation of\n", + "startups has rapidly built and scaled their businesses. The use of\n", + "cloud infrastructure, once seen as innovative, has now become\n", + "table stakes. The differentiator for the fastest-moving startups\n", + "and digital natives now comes from the effective use of data\n", + "at scale, primarily analytics and AI. Digital natives — defined\n", + "as fast-moving, lean, and technically savvy, born-in-the-cloud\n", + "organizations — are beginning to focus on new data-driven use\n", + "cases such as real-time machine learning and personalized\n", + "customer experiences.\n", + "\n", + "To pursue these new data-intensive use cases and initiatives,\n", + "organizations must look beyond the technologies that delivered\n", + "them to this point in time. Over time, these technologies, such\n", + "as transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n", + "\n", + "This guide examines some of the biggest data challenges and\n", + "solutions for startups and for scaling digital native businesses\n", + "that have reached the point where an end-to-end modern data\n", + "platform is a smart investment. Some key considerations include:\n", + "systems that are not cost-efficient and require time-consuming\n", + "administration and engineering toil. In addition to growing\n", + "maintenance needs, data is often stored in disparate locations\n", + "and formats, with little or no governance, making real-time use\n", + "cases, analytics and AI difficult or impossible.\n", + "\n", + "\n", + "**Consolidating on a unified data platform**\n", + "As mentioned above, siloed data storage and management add administrative and\n", + "financial cost. You can benefit significantly when you unify your data in one location\n", + "with a flexible architecture that scales with your needs and delivers performance\n", + "for future success. 
For this, you will want an open platform that supports all your\n", + "data including batch and streaming workloads, data analytics and machine learning.\n", + "With data unification, you create a more efficient, integrated approach to ingesting,\n", + "cleaning and organizing your data. You also need automation to make data analysis\n", + "easier for the nontechnical users in the company. But broader data access also\n", + "means more focus on security, privacy, compliance and access control, which can\n", + "create overhead for a growing.\n", + "\n", + "**Scaling up capacity and increasing performance**\n", + "**and usability of the data solutions**\n", + "Data teams at growing digital native organizations find it time intensive and costly to\n", + "handle the growing volume and velocity of their data being ingested from multiple\n", + "sources, across multiple clouds. You now need a unified and simplified platform that\n", + "can instantly scale up capacity and deliver more computing power on demand to\n", + "free up your data teams to produce outputs more quickly. This lowers the total cost\n", + "for the overall infrastructure by eliminating redundant licensing, infrastructure and\n", + "administration costs.\n", + "\n", + "**Building effective machine learning operations**\n", + "For data teams beginning their machine learning journeys, the challenge of training\n", + "data models can increase in management complexity. Many teams with disparate\n", + "coding needs for the entire model lifecycle suffer inefficiencies from transferring\n", + "data and code across many separate services. To build and manage effective\n", + "ML operations, consider an end-to-end MLOps environment that brings all data\n", + "together in one place and incorporates managed services for experiment tracking,\n", + "model training, feature development and feature and model serving.\n", + "\n", + "\n", + "-----\n", + "\n", + "# 01\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 01\n", + "\n", + "### Create a unified data architecture for data quality, governance and efficiency\n", + "\n", + "```\n", + "As cloud-born companies grow, data volumes rapidly increase, leading to new\n", + "challenges and use cases. Among the challenges:\n", + "\n", + "\n", + "Application stacks optimized for transaction\n", + "use cases aren’t able to handle the volume,\n", + "velocity and variety of data that modern data\n", + "teams require. For example, this leads to query\n", + "performance issues as data volume grows.\n", + "\n", + "Data silos develop as each team within an\n", + "organization chooses different ETL/ELT and\n", + "storage solutions for their needs. As the\n", + "organization grows and changes, these pipelines\n", + "and storage solutions become brittle, hard to\n", + "maintain and nearly impossible to integrate.\n", + "\n", + "\n", + "These data silos lead to discoverability,\n", + "integration and access issues, which prevent\n", + "teams from leveraging the full value of the\n", + "organization’s available data.\n", + "\n", + "Data governance is hard. 
Disparate ETL/ELT\n", + "and storage solutions lead to governance,\n", + "compliance, auditability and access control\n", + "challenges, which expose organizations to\n", + "tremendous risk.\n", + "\n", + "\n", + "The Databricks Lakehouse Platform provides\n", + "a unified set of tools for building, deploying,\n", + "sharing and maintaining data solutions at scale.\n", + "It integrates with cloud storage and the security\n", + "in your cloud account, manages and deploys\n", + "cloud infrastructure on your behalf. Your data\n", + "practitioners no longer need separate storage\n", + "systems for their data. And you don’t have to rely\n", + "on your cloud provider for security. The lakehouse\n", + "has its own robust security built into the platform.\n", + "\n", + "\n", + "For all the reasons above, the most\n", + "consistent advice from successful data\n", + "practitioners is to create a “single source\n", + "of truth” by unifying all data on a single\n", + "platform. With the Databricks Lakehouse\n", + "Platform, you can unify all your data on one\n", + "platform, reducing data infrastructure costs\n", + "and compute. You don’t need excess data\n", + "copies and you can retire expensive\n", + "legacy infrastructure.\n", + "```\n", + " 01\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: GRAMMARLY\n", + "\n", + "### Helping 30 million people and 50,000 teams communicate more effectively\n", + "\n", + "```\n", + "\n", + "While its business is based on analytics, [Grammarly](http://www.grammarly.com)\n", + "\n", + "for many years relied on a homegrown analytics\n", + "\n", + "platform to drive its AI writing assistant to\n", + "\n", + "help users improve multiple aspects of written\n", + "\n", + "communications. As teams developed their own\n", + "\n", + "requirements, data silos inevitably emerged as\n", + "\n", + "different business areas implemented analytics\n", + "\n", + "tools individually.\n", + "\n", + "“Every team decided to solve their analytics\n", + "\n", + "needs in the best way they saw fit,” said Chris\n", + "\n", + "Locklin, Engineering Manager, Data Platforms,\n", + "\n", + "at Grammarly. “That created challenges in\n", + "\n", + "consistency and knowing which data set\n", + "\n", + "was correct.”\n", + "\n", + "To better scale and improve data storage and\n", + "\n", + "query capabilities, Grammarly brought all its\n", + "\n", + "analytical data into the Databricks Lakehouse\n", + "\n", + "Platform and created a central hub for all data\n", + "\n", + "producers and consumers across the company.\n", + "\n", + "Grammarly had several goals with the lakehouse,\n", + "\n", + "including better access control, security, ingestion\n", + "\n", + "\n", + "flexibility, reducing costs and fueling collaboration. “Access control in a\n", + "\n", + "distributed file system is difficult, and it only gets more complicated as\n", + "\n", + "you ingest more data sources,” said Locklin. To manage access control,\n", + "\n", + "enable end-to-end observability and monitor data quality, Grammarly\n", + "\n", + "relies on the data lineage capabilities within Unity Catalog. “Data lineage\n", + "\n", + "allows us to effectively monitor usage of our data and ensure it upholds\n", + "\n", + "the standards we set as a data platform team,” said Locklin. 
“Lineage is\n", + "\n", + "the last crucial piece for access control.”\n", + "\n", + "Data analysts within Grammarly now have a consolidated interface for\n", + "\n", + "analytics, which leads to a single source of truth and confidence in the\n", + "\n", + "accuracy and availability of all data managed by the data platform team.\n", + "\n", + "Having a consistent data source across the company also resulted in\n", + "\n", + "greater speed and efficiency and reduced costs. Data practitioners\n", + "\n", + "experienced 110% faster querying at 10% of the cost to ingest compared\n", + "\n", + "to a data warehouse. Grammarly can now make its 5 billion daily events\n", + "\n", + "available for analytics in under 15 minutes rather than 4 hours. Migrating\n", + "\n", + "off its rigid legacy infrastructure gave Grammarly the flexibility to do\n", + "\n", + "more and the confidence that the platform will evolve with its needs.\n", + "\n", + "Grammarly is now able to sustain a flexible, scalable and highly secure\n", + "\n", + "analytics platform that helps 30 million people and 50,000 teams\n", + "\n", + "worldwide write more effectively every day.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/grammarly)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to unify the data infrastructure with Databricks\n", + "\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\n", + "is composed of two primary parts:\n", + "\n", + "- The infrastructure to deploy, configure and\n", + "manage the platform and services\n", + "\n", + "\n", + "You can build a Databricks workspace by configuring\n", + "secure integrations between the Databricks platform\n", + "and your cloud account, and then Databricks deploys\n", + "temporary Apache Spark™/Photon clusters using cloud\n", + "resources in your account to process and store data\n", + "in object storage and other integrated services you\n", + "control. Here are three steps to get started with the\n", + "Databricks Lakehouse Platform:\n", + "\n", + "**Understand the architecture**\n", + "The lakehouse provides a unified architecture,\n", + "meaning that all data is stored in the same\n", + "accessible place. The diagram shows how data\n", + "comes in from sources like a customer relationship\n", + "management (CRM) system, an enterprise resource\n", + "planning (ERP) system, websites or unstructured\n", + "customer emails.\n", + "\n", + "**Optimize the storage layer**\n", + "All data is stored in cloud storage while Databricks\n", + "provides tooling to assist with ingestion, such as\n", + "Auto Loader, and we recommend [open-source](https://delta.io/)\n", + "[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\n", + "Delta optimized storage layer that provides the\n", + "foundation for storing data and tables in the\n", + "Databricks Lakehouse Platform. 
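\n",
+    "\n",
+    "As a rough sketch of this ingestion pattern (the paths and table name below are\n",
+    "hypothetical), Auto Loader can incrementally pick up new files from cloud storage\n",
+    "and write them to a Delta table:\n",
+    "```\n",
+    "# Incrementally ingest new JSON files with Auto Loader.\n",
+    "# Paths and table name are placeholders for illustration only.\n",
+    "bronze = (\n",
+    "    spark.readStream.format('cloudFiles')\n",
+    "    .option('cloudFiles.format', 'json')\n",
+    "    .load('s3://example-bucket/raw/events/')\n",
+    ")\n",
+    "\n",
+    "# Write the stream to a Delta table; the checkpoint tracks progress.\n",
+    "(\n",
+    "    bronze.writeStream\n",
+    "    .option('checkpointLocation', 's3://example-bucket/checkpoints/events/')\n",
+    "    .toTable('main.bronze.events')\n",
+    ")\n",
+    "```\n",
+    "\n",
+    "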
Having all your\n", + "data in the same optimized, open storage keeps\n", + "all your use cases in the same place, thus enabling\n", + "collaboration and removing software tool overhead.\n", + "\n", + "\n", + "\n", + "- the customer-owned infrastructure managed in\n", + "collaboration by Databricks and the customer.\n", + "\n", + "\n", + "The lakehouse handles all varieties of data (structured, semi-structured, unstructured),\n", + "as well as all velocities of data (streaming, batch or somewhere in the middle).\n", + "\n", + "[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n", + "\n", + "\n", + "-----\n", + "\n", + "The Databricks Lakehouse organizes data stored with Delta Lake in cloud object\n", + "storage with familiar concepts like database, tables and views. Delta Lake extends\n", + "Parquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\n", + "scalable metadata handling. Delta Lake is fully compatible with Apache Spark APIs,\n", + "and was developed for tight integration with Structured Streaming, allowing you to\n", + "easily use a single copy of data for both batch and streaming operations to provide\n", + "incremental processing at scale.This model combines many of the benefits of a data\n", + "warehouse with the scalability and flexibility of a data lake.\n", + "\n", + "To learn more about the optimized storage layer that provides the foundation for\n", + "storing data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n", + "[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n", + "\n", + "The first step in unifying your data architecture is setting up how data is to be\n", + "accessed and used across the organization. We’ll discuss this as a series of steps:\n", + "\n", + "**1** Set up governance with Unity Catalog\n", + "\n", + "**2** Grant secure access to the data\n", + "\n", + "\n", + "###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n", + " – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n", + "\n", + "[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "**3** Capture audit logs\n", + "\n", + "**4** View data lineage\n", + "\n", + "**5** Set up data sharing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Configure unified governance**\n", + "Databricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\n", + "means that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\n", + "is secured, accessed and shared. 
Unity Catalog offers a single place to administer data access policies that apply across all workspace and\n", + "personas and automatically captures user-level audit logs that record access to your data.\n", + "\n", + "Data stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\n", + "languages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n", + "\n", + "To set up Unity Catalog for your organization,\n", + "you do the following:\n", + "\n", + "\n", + "**1** Configure an S3 bucket and IAM role that\n", + "Unity Catalog can use to store and access\n", + "data in your AWS account.\n", + "\n", + "**2** Create a metastore for each region in\n", + "\n", + "which your organization operates, and\n", + "attach workspaces to the metastore. Each\n", + "workspace will have the same view of the\n", + "data you manage in Unity Catalog.\n", + "\n", + "\n", + "**3** If you have a new account, add users,\n", + "groups and service principals to your\n", + "Databricks account.\n", + "\n", + "**4** Next, create and grant access to\n", + "\n", + "catalogs, schemas and tables.\n", + "\n", + "\n", + "For complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How Unity Catalog works\n", + "\n", + "\n", + "You will notice that the hierarchy of primary data\n", + "objects in Unity Catalog flows from metastore to table:\n", + "\n", + "**Metastore** is the top-level container for metadata.\n", + "Each metastore exposes a three-level namespace\n", + "(catalog.schema.table) that organizes your data.\n", + "\n", + "\n", + "**Metastore** **Catalog** **Schemas**\n", + "\n", + "\n", + "**Views**\n", + "\n", + "**Managed**\n", + "**Tables**\n", + "\n", + "\n", + "**Catalog** is the first layer of the object hierarchy, used\n", + "to organize your data assets.\n", + "\n", + "\n", + "**Schemas** , also known as databases, are the second\n", + "layer of the object hierarchy and contain tables and\n", + "views.\n", + "\n", + "**Table** is the lowest level in the object hierarchy, and\n", + "tables can be external (stored in external locations in\n", + "your cloud storage of choice) or managed (stored in a\n", + "storage container in your cloud storage that you create\n", + "\n", + "expressly for Databricks). You can also create readonly **Views** from tables.\n", + "\n", + "\n", + "**External**\n", + "**tables**\n", + "\n", + "The diagram below represents the file system\n", + "hierarchy of a single storage bucket:\n", + "\n", + "\n", + "-----\n", + "\n", + "Unity Catalog uses the identities in the Databricks\n", + "account to resolve users, service principals, and groups\n", + "and to enforce permissions. To configure identities in\n", + "the account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n", + "[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . 
Refer to those users,\n", + "service principals, and groups when you create\n", + "[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n", + "\n", + "Unity Catalog users, service principals, and groups\n", + "must also be added to workspaces to access Unity\n", + "Catalog data in a notebook, a Databricks SQL query,\n", + "Data Explorer or a REST API command. The assignment\n", + "of users, service principals, and groups to workspaces\n", + "is called identity federation. All workspaces attached\n", + "to a Unity Catalog metastore are enabled for identity\n", + "federation.\n", + "\n", + "Securable objects in Unity Catalog are hierarchical,\n", + "meaning that granting a privilege on a catalog or schema\n", + "automatically grants the privilege to all current and\n", + "future objects within the catalog or schema. For more\n", + "on granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\n", + "A common scenario is to set up a schema per team\n", + "where only that team has USE SCHEMA and CREATE on\n", + "the schema. This means that any tables produced by\n", + "team members can only be shared within the team.\n", + "Data Explorer uses the privileges configured by Unity\n", + "Catalog administrators to ensure that users are only\n", + "able to see catalogs, databases, tables and views that\n", + "they have permission to query.\n", + "\n", + "\n", + "[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\n", + "many Unity Catalog features. Use Data Explorer to view\n", + "schema details, preview sample data, and see table\n", + "details and properties. Administrators can view and\n", + "change owners. Admins and data object owners can grant\n", + "and revoke permissions through this interface.\n", + "\n", + "**Set up secure access**\n", + "In Unity Catalog, data is secure by default. Initially, users\n", + "have no access to data in a metastore. Access can\n", + "be granted by either a metastore admin, the owner of\n", + "an object, or the owner of the catalog or schema that\n", + "contains the object. Securable objects in Unity Catalog\n", + "are hierarchical and privileges are inherited downward.\n", + "\n", + "Unity Catalog’s security model is based on standard ANSI\n", + "SQL and allows administrators to grant permissions in\n", + "their existing data lake using familiar syntax, at the level of\n", + "catalogs, databases (schema), tables and views. 
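\n",
+    "\n",
+    "As an illustration of that syntax, here is a minimal sketch of the schema-per-team\n",
+    "pattern described above (the catalog, schema and group names are hypothetical):\n",
+    "```\n",
+    "# 'main', 'marketing' and 'marketing-team' are placeholder names.\n",
+    "spark.sql('CREATE SCHEMA IF NOT EXISTS main.marketing')\n",
+    "\n",
+    "# The owning team can use the schema and create objects in it.\n",
+    "spark.sql('GRANT USE SCHEMA, CREATE ON SCHEMA main.marketing TO `marketing-team`')\n",
+    "\n",
+    "# USE CATALOG is needed to resolve three-level names such as\n",
+    "# main.marketing.campaigns once table-level access is granted.\n",
+    "spark.sql('GRANT USE CATALOG ON CATALOG main TO `marketing-team`')\n",
+    "```\n",
+    "\n",
+    "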
Privileges\n", + "and metastores are shared across workspaces, allowing\n", + "administrators to set secure permissions once against\n", + "\n", + "groups synced from identity providers and know that\n", + "end users only have access to the proper data in any\n", + "Databricks workspace they enter.\n", + "\n", + "\n", + "-----\n", + "\n", + "```\n", + "CUSTOMER STORY: BUTCHERBOX\n", + "\n", + "### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n", + "\n", + "```\n", + "\n", + "As a young e-commerce company,\n", + "\n", + "[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n", + "\n", + "customers’ needs change, which means it is\n", + "\n", + "constantly considering behavioral patterns,\n", + "\n", + "distribution center efficiency, a growing list of\n", + "\n", + "marketing and communication channels, and\n", + "\n", + "order processing systems.\n", + "\n", + "The meat and seafood subscription company\n", + "\n", + "collects data on hundreds of thousands\n", + "\n", + "of subscribers. It deployed the Databricks\n", + "\n", + "Lakehouse Platform to gain visibility across\n", + "\n", + "its diverse range of data systems and enable\n", + "\n", + "its analytics team to securely view and\n", + "\n", + "export data in the formats needed.\n", + "\n", + "With so much data feeding in from different\n", + "\n", + "sources — from email systems to its website\n", + "\n", + "— the data team at ButcherBox quickly\n", + "\n", + "discovered that data silos were a significant\n", + "\n", + "\n", + "“We knew we needed to migrate from our legacy data warehouse\n", + "\n", + "environment to a data analytics platform that would unify our\n", + "\n", + "data and make it easily accessible for quick analysis to improve\n", + "\n", + "supply chain operations, forecast demand and, most importantly,\n", + "\n", + "keep up with our growing customer base,” explained Jake Stone,\n", + "\n", + "Senior Manager, Business Analytics, at ButcherBox.\n", + "\n", + "The platform allows analysts to share builds and iterate on a\n", + "\n", + "project without getting into the code. Querying a table of 18\n", + "\n", + "billion rows would have been problematic with a traditional\n", + "\n", + "platform. With Databricks, ButcherBox can do it in three minutes.\n", + "\n", + "“Delta Lake provides us with a single source of truth for all of\n", + "\n", + "our data,” said Stone. “Now our data engineers are able to build\n", + "\n", + "reliable data pipelines that thread the needle on key topics such\n", + "\n", + "as inventory management, allowing us to identify in near real-\n", + "\n", + "time what our trends are so we can figure out how to effectively\n", + "\n", + "move inventory.”\n", + "\n", + "[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n", + "\n", + "\n", + "problem because they blocked complete\n", + "\n", + "visibility into critical insights needed to make\n", + "\n", + "strategic and marketing decisions.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Set up secure data sharing**\n", + "Databricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\n", + "to share data with other entities regardless of their\n", + "computing platforms. Delta Sharing is integrated with\n", + "Unity Catalog. 
Your data must be registered with Unity\n", + "Catalog to manage, govern, audit and track usage of the\n", + "shared data on the Lakehouse Platform. The primary\n", + "concepts of Delta Sharing are shares (read-only\n", + "collections of tables and table partitions to be shared)\n", + "and recipients (objects that associate an organization\n", + "with a credential or secure sharing identifier).\n", + "\n", + "As a data provider, you generate a token and share\n", + "it securely with the recipient. They use the token to\n", + "authenticate and get read access to the tables you’ve\n", + "included in the shares you’ve given them access\n", + "to. Recipients access the shared data in read-only\n", + "format. Whenever the data provider updates data\n", + "tables in their own Databricks account, the updates\n", + "appear in near real-time in the recipient’s system.\n", + "\n", + "\n", + "**Capture audit logs**\n", + "Unity Catalog captures an audit log of actions\n", + "performed against the metastore. To access audit\n", + "logs for Unity Catalog events, you must enable and\n", + "configure audit logs for your account. Audit logs for\n", + "each workspace and account-level activities are\n", + "delivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n", + "[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n", + "\n", + "**View data lineage**\n", + "You can use Unity Catalog to capture runtime data\n", + "lineage across queries in any language executed on\n", + "a Databricks cluster or SQL warehouse. Lineage can\n", + "be visualized in Data Explorer in near real-time and\n", + "retrieved with the Databricks REST API. Lineage is\n", + "aggregated across all workspaces attached to Unity\n", + "Catalog and captured down to the column level, and\n", + "includes notebooks, workflows and dashboards related\n", + "to the query. To understand the requirements and how\n", + "to capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n", + "[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n", + "\n", + "\n", + "Unity Catalog Metastore\n", + "\n", + "\n", + "Catalog\n", + "\n", + "\n", + "Data providers can use Databricks audit logging to\n", + "monitor the creation and modification of shares,\n", + "and recipients can monitor recipient activity on\n", + "shares. 
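To make the share and recipient concepts concrete, here is a hedged provider-side sketch using Delta Sharing SQL from a notebook. The share, table and recipient names are assumptions; creating the recipient is the step that produces the credential (activation link and token) you pass securely to the other organization.

```
# Provider-side Delta Sharing sketch. Assumes Delta Sharing is enabled on the
# metastore; all object names are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

spark.sql("CREATE SHARE IF NOT EXISTS quarterly_sales")                 # read-only collection of tables
spark.sql("ALTER SHARE quarterly_sales ADD TABLE main.sales.orders")    # expose a Unity Catalog table
spark.sql("CREATE RECIPIENT IF NOT EXISTS partner_co")                  # generates the sharing credential
spark.sql("GRANT SELECT ON SHARE quarterly_sales TO RECIPIENT partner_co")
```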
Data recipients who use shared data in a\n", + "Databricks account can use Databricks audit logging\n", + "to understand who is accessing which data.\n", + "\n", + "\n", + "-----\n", + "\n", + "###### Resources:\n", + "\n", + "- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n", + "\n", + "- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n", + "\n", + "- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n", + "\n", + "- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n", + "\n", + "- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n", + "\n", + "\n", + "###### Key Takeaways\n", + "\n", + "- With the Databricks Lakehouse Platform, you can\n", + "unify and simplify all your data on one platform\n", + "to better scale and improve data storage and\n", + "query capabilities\n", + "\n", + "- The lakehouse helps reduce data infrastructure\n", + "and compute costs. You don’t need excess\n", + "data copies and can retire expensive legacy\n", + "infrastructure.\n", + "\n", + "\n", + "Leverage Delta Lake as the open format\n", + "storage layer to deliver reliability, security and\n", + "performance on your data lake — for both\n", + "streaming and batch operations — replacing\n", + "data silos with a single home for structured,\n", + "semi-structured and unstructured data\n", + "\n", + "With Unity Catalog you can centralize\n", + "governance for all data and AI assets including\n", + "files, tables, machine learning models and\n", + "dashboards in your lakehouse on any cloud\n", + "\n", + "The Databricks Lakehouse Platform is open\n", + "source with multicloud flexibility so that you can\n", + "use your data however and wherever you want —\n", + "no vendor lock-in\n", + "\n", + "\n", + "-----\n", + "\n", + "# 02\n", + "```\n", + "CHALLENGE: \u0003\n", + "\n", + "## Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "```\n", + "CHALLENGE 02\n", + "\n", + "### Build your data architecture to support scale and performance\n", + "\n", + "```\n", + "As modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\n", + "the increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\n", + "suddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\n", + "cost-effective. 
The relational databases and traditional data warehouses that met the needs of the businesses once\n", + "upon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n", + "\n", + "Here are some common challenges around managing data and performance at scale:\n", + "\n", + "\n", + "**Volume and velocity** — Exponentially\n", + "increasing data sources, and the speed at\n", + "which they capture and create data.\n", + "\n", + "**Latency requirements** — The demands of\n", + "downstream applications and users have\n", + "evolved (people want data and the results\n", + "from the data faster).\n", + "\n", + "\n", + "**Governance** — Cataloging, auditing, securing and\n", + "reporting on data is burdensome at scale when\n", + "using old systems not built with data access\n", + "controls and compliance in mind.\n", + "\n", + "**Multicloud** is really hard.\n", + "\n", + "\n", + "**Data storage** — Storing data in the wrong\n", + "format is slow to access, query and is\n", + "expensive at scale.\n", + "\n", + "\n", + "**Data format** — Supporting structured, semistructured and unstructured data formats\n", + "is now a requirement. Most data storage\n", + "solutions are designed to handle only one type\n", + "of data, requiring multiple products\n", + "to be stitched together.\n", + "\n", + "```\n", + "02\n", + "\n", + "```\n", + "\n", + "-----\n", + "\n", + "###### Lakehouse solves scale and performance challenges\n", + "\n", + "\n", + "The solution for growing digital companies is a unified\n", + "and simplified platform that can instantly scale up\n", + "capacity to deliver more computing power on demand,\n", + "freeing up teams to go after the much-needed data\n", + "and produce outputs more quickly. With a lakehouse,\n", + "they can replace their data silos with a single home for\n", + "their structured, semi-structured and unstructured\n", + "data. Users and applications throughout the enterprise\n", + "environment can connect to the same single copy of\n", + "the data to drive diverse workloads.\n", + "\n", + "The lakehouse architecture is cost-efficient for\n", + "scaling, lowering the total cost of ownership for the\n", + "overall infrastructure by consolidating all data estate\n", + "and use cases onto a single platform and eliminating\n", + "redundant licensing, infrastructure and administration\n", + "costs. 
Unlike other warehouse options that can only\n", + "scale horizontally, the Databricks Lakehouse can scale\n", + "horizontally and vertically based on workload demands.\n", + "\n", + "With the Databricks Lakehouse, you can optimize the\n", + "compute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n", + "[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\n", + "research by the Barcelona Supercomputing Center.\n", + "And your data teams are more productive by focusing\n", + "on more strategic initiatives versus managing multiple\n", + "data solutions.\n", + "\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "### Driving into the future of electric transportation\n", + "\n", + "```\n", + "```\n", + "CUSTOMER STORY: RIVIAN\n", + "\n", + "```\n", + "\n", + "With more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n", + "\n", + "day, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n", + "\n", + "legacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n", + "\n", + "Before Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n", + "\n", + "decreased output, prevented collaboration and increased operational costs. Rivian chose to modernize its data\n", + "\n", + "infrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n", + "\n", + "downstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n", + "\n", + "actionable insights for different use cases, from predictive maintenance to smarter product development.\n", + "\n", + "“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n", + "\n", + "performant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n", + "\n", + "Wassym Bensaid, Vice President of Software Development at Rivian.\n", + "\n", + "For instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n", + "\n", + "accelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n", + "\n", + "roll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n", + "\n", + "connected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n", + "\n", + "smart features and the control that drivers have over them. 
By leveraging the Databricks Lakehouse Platform, Rivian\n", + "\n", + "has seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n", + "\n", + "[Read the full story here.](https://www.databricks.com/customers/rivian)\n", + "\n", + "\n", + "-----\n", + "\n", + "###### How to ensure scalability and performance with Databricks\n", + "\n", + "The [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\n", + "scalability and performance for your data architecture\n", + "based on the following features and capabilities:\n", + "\n", + "- A simplified and cost-efficient architecture that\n", + "increases productivity\n", + "\n", + "- A platform that ensures reliable, high performing\n", + "ETL workloads — for streaming and batch data\n", + "— while Databricks automatically manages your\n", + "infrastructure\n", + "\n", + "- The ability to ingest, transform and query all your\n", + "data in one place, and scale on demand with\n", + "serverless compute\n", + "\n", + "- Enables real-time data access for all data,\n", + "analytics and AI use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "The following section will provide a short series of\n", + "steps for understanding the key components of the\n", + "Databricks Lakehouse Platform.\n", + "\n", + "\n", + "**Step 2**\n", + "**Understand the common Delta Lake operations**\n", + "The Databricks Lakehouse Platform simplifies the\n", + "entire data lifecycle, from data ingestion to monitoring\n", + "and governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\n", + "open-source storage system based on the Delta\n", + "format providing reliability through ACID transactions\n", + "and scalable metadata handling. Large quantities of\n", + "raw files in blob storage can be converted to Delta to\n", + "organize and store the data cheaply. This allows for\n", + "flexibility of data movement while being performant\n", + "and less expensive.\n", + "\n", + "\n", + "**Step 1**\n", + "**Get a trial Databricks account**\n", + "Start your 14-day free trial with Databricks on\n", + "AWS in a few easy steps.\n", + "[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . During the 14day free trial, all Databricks usage is free, but Databricks\n", + "uses compute and S3 storage resources in your cloud\n", + "provider account.\n", + "\n", + "\n", + "and writing data can occur simultaneously without risk\n", + "of many queries resulting in performance degradation\n", + "or deadlock for business-critical workloads.\n", + "\n", + "This means that users and applications throughout\n", + "the enterprise environment can connect to the same\n", + "single copy of the data to drive diverse workloads, with\n", + "all viewers guaranteed to receive the most current\n", + "version of the data at the time their query executes.\n", + "With performance features like indexing, Delta Lake\n", + "customers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n", + "[up to 48x faster.](https://www.databricks.com/customers/columbia)\n", + "\n", + "\n", + "[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\n", + "and learn how to create, manage and query tables.\n", + "With support for ACID transactions and schema\n", + "enforcement, Delta Lake provides the reliability that\n", + "traditional data lakes lack. 
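As a concrete illustration of the Delta Lake operations mentioned above (converting raw files in blob storage and creating, managing and querying tables), the sketch below assumes an existing directory of Parquet files in S3 and an external location configured in Unity Catalog; the bucket path, catalog and table names are placeholders.

```
# Sketch: convert raw Parquet files to Delta in place, register the table, query it.
# Paths and names are illustrative; assumes the storage location is accessible.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

spark.sql("CONVERT TO DELTA parquet.`s3://my-bucket/raw/events`")   # adds the Delta transaction log

spark.sql("""
    CREATE TABLE IF NOT EXISTS main.analytics.events
    USING DELTA LOCATION 's3://my-bucket/raw/events'                -- external Delta table
""")

spark.sql("SELECT COUNT(*) FROM main.analytics.events").show()      # query it like any other table
```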
This enables you to scale\n", + "reliable data insights throughout the organization and\n", + "run analytics and other data projects directly on your\n", + "data lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n", + "\n", + "Delta Lake transactions use log files stored alongside\n", + "data files to provide ACID guarantees at a table level.\n", + "Because the data and log files backing Delta Lake\n", + "tables live together in cloud object storage, reading\n", + "\n", + "\n", + "-----\n", + "\n", + "All data in Delta Lake is stored in open Apache Parquet\n", + "format, allowing data to be read by any compatible\n", + "reader. APIs are open and compatible with Apache\n", + "Spark, so you have access to a vast open-source\n", + "ecosystem to avoid data lock-in from proprietary\n", + "formats and conversions, which have embedded and\n", + "added costs.\n", + "\n", + "###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n", + "\n", + " — Steve Pulec, Chief Technology Officer, YipitData\n", + "\n", + "[Learn more](https://www.databricks.com/customers/yipitdata)\n", + "\n", + "\n", + "-----\n", + "\n", + "**Step 3**\n", + "**Ingest data efficiently at scale**\n", + "With a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\n", + "from hundreds of data sources for analytics, AI and\n", + "streaming applications into one place.\n", + "\n", + "Databricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\n", + "data ingestion. To ingest any file that can land in a data\n", + "lake, Auto Loader incrementally and automatically\n", + "processes new data files as they arrive in cloud storage\n", + "in scheduled or continuous jobs. Auto Loader scales to\n", + "support near real-time ingestion of millions of files\n", + "per hour.\n", + "\n", + "For pushing data in Delta Lake, the SQL command\n", + "[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\n", + "into Delta Lake. COPY INTO is best used when the input\n", + "directory contains thousands of files or fewer, and the\n", + "user prefers SQL. COPY INTO can be used over JDBC\n", + "to push data into Delta Lake at your convenience.\n", + "\n", + "\n", + "**Step 4**\n", + "**Leverage production-ready tools**\n", + "**to automate ETL pipelines**\n", + "Once the raw data is ingested, Databricks provides\n", + "a suite of production-ready tools that allow data\n", + "professionals to quickly develop and deploy extract,\n", + "\n", + "transform and load (ETL) pipelines. 
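The two ingestion paths above, Auto Loader for incremental file ingestion and COPY INTO for batch SQL ingestion, look roughly like the sketch below in a notebook. All paths and table names are placeholders, and in practice you would typically pick one pattern per target table rather than both.

```
# Sketch of the two ingestion options. Paths, schema/checkpoint locations and
# table names are illustrative.
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Auto Loader: incrementally process new files as they land in cloud storage.
(spark.readStream
    .format("cloudFiles")
    .option("cloudFiles.format", "json")
    .option("cloudFiles.schemaLocation", "s3://my-bucket/_schemas/events")
    .load("s3://my-bucket/landing/events")
    .writeStream
    .option("checkpointLocation", "s3://my-bucket/_checkpoints/events")
    .trigger(availableNow=True)                     # run as a scheduled incremental job
    .toTable("main.analytics.events_bronze"))

# COPY INTO: idempotent batch ingestion, best when the input directory holds
# thousands of files or fewer and SQL is preferred.
spark.sql("""
    COPY INTO main.analytics.events_bronze
    FROM 's3://my-bucket/landing/events'
    FILEFORMAT = JSON
""")
```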
Databricks SQL\n", + "allows analysts to run SQL queries against the same\n", + "tables used in production ETL workloads, allowing for\n", + "real-time business intelligence at scale.\n", + "\n", + "With your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n", + "[your first extract, transform a, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf))List(Increasing volume and velocity of data as companies mature., Need for faster data access and reduced latency., Challenges in data governance, including cataloging, auditing, and securing data., Complexities of using multiple cloud environments., Data storage issues such as slow access, poor query performance, and high costs., Requirement to support structured, semi-structured, and unstructured data formats.){\"info\": {\"request_id\": \"tr-0d0e7280dd93452c982596f862357324\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852753, \"execution_time_ms\": 1947, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"6aeb02d6-4b23-4713-9855-f308a0690c05\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-0d0e7280dd93452c982596f862357324/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xf28d1db5327a5457\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": null, \"start_time\": 1734543852753869390, \"end_time\": 1734543854701685330, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xb01f33359ce0cb3e\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, 
\"parent_id\": \"0xf28d1db5327a5457\", \"start_time\": 1734543853027149455, \"end_time\": 1734543853051532964, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x21f0cbaf9967dd49\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0xf28d1db5327a5457\", \"start_time\": 1734543853051633666, \"end_time\": 1734543853052273974, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xd73d767b43d9d53f\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0x21f0cbaf9967dd49\", \"start_time\": 1734543853051864069, \"end_time\": 1734543853052030571, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x0d78175c81efa68e\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0x21f0cbaf9967dd49\", \"start_time\": 1734543853052100872, \"end_time\": 1734543853052239673, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x0628efab3df56975\", \"trace_id\": 
\"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0xf28d1db5327a5457\", \"start_time\": 1734543853052335574, \"end_time\": 1734543856582738553, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856582606151, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File 
\\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n 
return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x426f0109dc63441e\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0x0628efab3df56975\", \"start_time\": 1734543853231591347, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854701552, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\", \\\"params\\\": null}\", \"response\": null}}Fail to invoke the model with {'messages': [{'content': 'What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdfnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
4b452a4426892dea5c35302c50dc70d62c0b2993f478af59a42b59d7c258bfa0List(List(List(What are two key challenges mentioned for predictive maintenance in government agencies?, user)))List(List(##### Overview\n", + "\n", + "**Integrating unstructured data**\n", + "Equipment data doesn’t just come in the form of IoT data. Agencies can gather rich unstructured signals like audio, visual (e.g., video inspections) and text (e.g., maintenance logs). Most legacy data architectures are unable to integrate structured and unstructured data sources.\n", + "\n", + "**Operationalizing machine learning**\n", + "Most agencies lack the advanced analytics tools needed to build models that can predict potential equipment failures. Those that do typically have their data scientists working in a siloed set of tools, resulting in unnecessary data replication and inefficient workflows., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf))List(Difficulty integrating structured and unstructured data sources due to legacy data architectures., Inefficient workflows caused by a lack of advanced analytics tools and siloed environments for data scientists.){\"info\": {\"request_id\": \"tr-0055eec9c95145e9893855e255b52c3a\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852844, \"execution_time_ms\": 1849, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"79f068ca-4f86-4a85-ba45-5d52af1db4c5\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-0055eec9c95145e9893855e255b52c3a/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xcf005142111fa44a\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": null, \"start_time\": 1734543852844458939, \"end_time\": 1734543854693593731, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", 
\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xbce44a78f6c58bb1\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0xcf005142111fa44a\", \"start_time\": 1734543852908150646, \"end_time\": 1734543852934614182, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x232f779bfb5f2ced\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0xcf005142111fa44a\", \"start_time\": 1734543852934718983, \"end_time\": 1734543852935319291, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x19a2db6bb887d125\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0x232f779bfb5f2ced\", \"start_time\": 1734543852934860385, \"end_time\": 1734543852935065588, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xd9e317dca963334e\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0x232f779bfb5f2ced\", \"start_time\": 1734543852935147989, \"end_time\": 1734543852935286590, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x4a4b01cbaba0d526\", \"trace_id\": 
\"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0xcf005142111fa44a\", \"start_time\": 1734543852935381692, \"end_time\": 1734543856726656399, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856726600698, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n 
final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 
1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x99a7f869d3079532\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0x4a4b01cbaba0d526\", \"start_time\": 1734543853110806216, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" 
to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854693436, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File 
\\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\", \\\"params\\\": null}\", \"response\": null}}Fail to invoke the model with {'messages': [{'content': 'What are two key challenges mentioned for predictive maintenance in government agencies?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdfnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
3b231daee5434db054e2ee8b4aee9b4edba19aa8886c0d491daa1b36b743142fList(List(List(What are some of the common problems faced by data lakes according to the document?, user)))List(List(**Challenges with data lakes**\n", + "Data lakes are a common element within modern data architectures. They serve as a\n", + "central ingestion point for the plethora of data that organizations seek to gather and\n", + "mine. While a good step forward in getting to grips with the range of data, they run\n", + "into the following common problems:\n", + "\n", + "**1. Reading and writing into data lakes is not reliable.** Data engineers often run into\n", + "the problem of unsafe writes into data lakes that cause readers to see garbage\n", + "data during writes. They have to build workarounds to ensure readers always see\n", + "consistent data during writes.\n", + "\n", + "**2. The data quality in data lakes is low.** Dumping unstructured data into a data\n", + "lake is easy, but this comes at the cost of data quality. Without any mechanisms\n", + "for validating schema and the data, data lakes suffer from poor data quality. As a\n", + "consequence, analytics projects that strive to mine this data also fail.\n", + "\n", + "**3. Poor performance with increasing amounts of data.** As the amount of data\n", + "that gets dumped into a data lake increases, the number of files and directories\n", + "also increases. Big data jobs and query engines that process the data spend a\n", + "significant amount of time handling the metadata operations. This problem is more\n", + "pronounced in the case of streaming jobs or handling many concurrent batch jobs.\n", + "\n", + "**4. Modifying, updating or deleting records in data lakes is hard.** Engineers need to\n", + "build complicated pipelines to read entire partitions or tables, modify the data and\n", + "write them back. 
Such pipelines are inefficient and hard to maintain., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf))List(Unreliable reading and writing operations, Low data quality due to the lack of validation mechanisms, Poor performance with increasing data volume, Difficulty in modifying, updating, or deleting records){\"info\": {\"request_id\": \"tr-d0aec4bd83d24951a8302b231ac42e47\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852916, \"execution_time_ms\": 1801, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"2fa194f1-5ae7-43ca-9eff-32fef46fce94\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-d0aec4bd83d24951a8302b231ac42e47/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xcaf4f22e6ac48496\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": null, \"start_time\": 1734543852916632754, \"end_time\": 1734543854717960127, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x83ee15f019189952\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0xcaf4f22e6ac48496\", \"start_time\": 1734543852994320939, \"end_time\": 1734543853010690746, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are some of the common 
problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x6a0b4e0ea9f88df9\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0xcaf4f22e6ac48496\", \"start_time\": 1734543853010864249, \"end_time\": 1734543853011405856, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xe66dc0ef40e50aa8\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0x6a0b4e0ea9f88df9\", \"start_time\": 1734543853011011251, \"end_time\": 1734543853011161352, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are some of the common problems faced by data lakes according to the document?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xc4e0d91b12ea6d1e\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0x6a0b4e0ea9f88df9\", \"start_time\": 1734543853011224353, \"end_time\": 1734543853011370255, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x1d0fd15318581e6f\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0xcaf4f22e6ac48496\", \"start_time\": 1734543853011469256, \"end_time\": 1734543856663054927, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856663004627, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in 
generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n 
^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x2ea973dcf6b6f7ae\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0x1d0fd15318581e6f\", \"start_time\": 1734543853244094106, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854717808, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\", \\\"params\\\": null}\", \"response\": null}}Fail to invoke the model with {'messages': [{'content': 'What are some of the common problems faced by data lakes according to the document?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdfnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnullnull
9673989eb3b8242fc0a48d6338f31191260dd7cf6c7eacb26f2ed1512af803a2List(List(List(What new opportunities can data sharing create for organizations looking to generate additional revenue?, user)))List(List(**Key benefits of data sharing**\n", + "\n", + "As you can see from the use cases described above, there are many benefits of data sharing, including:\n", + "\n", + "**Greater collaboration with existing partners.** In today’s hyper-connected digital economy, no single organization can advance its business objectives without partnerships. Data sharing helps solidify existing partnerships and can help organizations establish new ones.\n", + "**Ability to generate new revenue streams.** With data sharing, organizations can generate new revenue streams by offering data products or data services to their end consumers.\n", + "**Ease of producing new products, services or business models.** Product teams can leverage both first-party data and third-party data to refine their products and services and expand their product/service catalog.\n", + "**Greater efficiency of internal operations.** Teams across the organization can meet their business goals far more quickly when they don’t have to spend time figuring out how to free data from silos. When teams have access to live data, there’s no lag time between the need for data and the connection with the appropriate data source., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf))List(Data sharing can enable organizations to offer data products., Data sharing can enable organizations to offer data services.){\"info\": {\"request_id\": \"tr-4d76d88f00b94167bd93849f59c7001d\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852759, \"execution_time_ms\": 1862, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What new opportunities can data ...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"547feb6d-eb71-4dfd-b1e2-fec0f84bf84e\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-4d76d88f00b94167bd93849f59c7001d/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xa92ce145b46f44b7\", \"trace_id\": 
\"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": null, \"start_time\": 1734543852759799465, \"end_time\": 1734543854622031363, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What new opportunities can data ...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x10de2ef062d5aa49\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xa92ce145b46f44b7\", \"start_time\": 1734543852875604334, \"end_time\": 1734543852948946264, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What new opportunities can data ...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xd74b169336501027\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xa92ce145b46f44b7\", \"start_time\": 1734543852949100366, \"end_time\": 1734543852983429601, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xb73870ebb1747a67\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xd74b169336501027\", \"start_time\": 1734543852949327568, \"end_time\": 1734543852949642572, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x91770165c3696c19\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xd74b169336501027\", \"start_time\": 1734543852982528089, \"end_time\": 1734543852983380400, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": 
\"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0xd10cd61bd8992ad9\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xa92ce145b46f44b7\", \"start_time\": 1734543852983553402, \"end_time\": 1734543856658182168, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856658120867, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}

[Notebook output (truncated), trace tr-4d76d88f00b94167bd93849f59c7001d, status ERROR: the recursively_call_and_run_tools span and its child Completions span (model databricks-meta-llama-3-3-70b-instruct; tools search_product_docs and casaman_ssa__demos__sku_sample_translator; tool_choice "auto"; temperature 0.01; max_tokens 1500) fail with openai.RateLimitError, HTTP 429 REQUEST_LIMIT_EXCEEDED, because the workspace rate limit for databricks-meta-llama-3-3-70b-instruct was exceeded; the error text recommends a provisioned throughput Foundation Model APIs endpoint. The stack trace runs from autogen's conversable_agent through the cookbook's databricks_model_serving_client.create into openai chat.completions.create. The evaluation harness therefore records "Fail to invoke the model" for the request "What new opportunities can data sharing create for organizations looking to generate additional revenue?", whose expected source document is /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf; the remaining columns of this evaluation row are empty.]
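Every failing span summarized above raises the same 429 from the pay-per-token endpoint. Below is a minimal sketch of one way to soften transient rate limiting on the client side, assuming an OpenAI-compatible Databricks serving endpoint; the function name, environment variables, and backoff schedule are illustrative and are not the cookbook's databricks_model_serving_client implementation.

import os
import time

from openai import OpenAI, RateLimitError

# Hypothetical client setup: the base URL and token env vars are placeholders for a
# Databricks workspace whose serving endpoint speaks the OpenAI chat completions API.
client = OpenAI(
    base_url=os.environ["DATABRICKS_SERVING_ENDPOINT_URL"],
    api_key=os.environ["DATABRICKS_TOKEN"],
)

def create_with_backoff(messages, tools=None, max_retries=5):
    """Call chat.completions.create, backing off exponentially on HTTP 429."""
    extra = {"tools": tools, "tool_choice": "auto"} if tools else {}
    for attempt in range(max_retries):
        try:
            return client.chat.completions.create(
                model="databricks-meta-llama-3-3-70b-instruct",
                messages=messages,
                temperature=0.01,
                max_tokens=1500,
                **extra,
            )
        except RateLimitError:
            if attempt == max_retries - 1:
                raise  # still rate limited after the final attempt
            time.sleep(2 ** attempt)  # wait 1s, 2s, 4s, ... before retrying

Backoff only smooths short bursts of traffic; the durable fix suggested by the error message itself is to serve the agent from a provisioned throughput Foundation Model APIs endpoint, which carries a higher rate limit.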
[Evaluation row (truncated), chunk 8fc168f55c01c3d4059869879a9e54e8601faef19e46f011ac239c44dbe72f40.
Request: "Why is real-time data crucial for retail operations, and what problems do legacy systems cause?"
Expected retrieved context (/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf): "Retailers need real-time data to support these decisions, but legacy systems are limited to data that's hours or days old. When seconds matter, only the Lakehouse delivers better decisions [...] most retailers still rely on legacy data systems, which impedes their ability to scale these innovations. Unfortunately, most legacy systems are only able to process information in hours or days. The delays caused by waiting for data are leading to significant risks and costs for the industry."
Expected facts: real-time data enables immediate decision-making; real-time data enables better decision-making in critical moments; legacy systems process outdated data; legacy systems cause delays; legacy systems lead to risks for the retail industry; legacy systems lead to costs for the retail industry.
Trace tr-65a2d1b429924041b6ad44564d5466c9 (experiment 2822477370659093, notebook 05_tool_calling_agent, retriever index casaman_ssa.demos.test_product_docs_docs_chunked_index__v2), status ERROR: the parser spans (get_messages_array, parse_input, extract_user_query_string, extract_chat_history) succeed, but recursively_call_and_run_tools and its Completions call to databricks-meta-llama-3-3-70b-instruct fail with the same openai.RateLimitError (429, REQUEST_LIMIT_EXCEEDED), so the harness records "Fail to invoke the model" for this request; the remaining columns of the row are empty.]
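For reference, the tools payload attached to each Completions span above can be written out as a plain Python structure. The two function specs below are transcribed from the trace attributes, so they match what the agent sends on every request.

# OpenAI function-calling tool specs captured in the Completions span attributes.
TOOLS = [
    {
        "type": "function",
        "function": {
            "name": "search_product_docs",
            "description": "Use this tool to search for product documentation.",
            "parameters": {
                "type": "object",
                "properties": {
                    "query": {"type": "string", "description": "query"},
                    "filters": {
                        "type": "array",
                        "items": {"type": "object"},
                        "default": None,
                        "description": "filters",
                    },
                },
                "required": ["query"],
            },
        },
    },
    {
        "type": "function",
        "function": {
            "name": "casaman_ssa__demos__sku_sample_translator",
            "strict": True,
            "description": 'Translates a pre-2024 SKU formatted as "OLD-XXX-YYYY" to the new SKU format "NEW-YYYY-XXX".',
            "parameters": {
                "type": "object",
                "title": "casaman_ssa__demos__sku_sample_translator__params",
                "additionalProperties": False,
                "required": ["old_sku"],
                "properties": {
                    "old_sku": {
                        "anyOf": [{"type": "string"}, {"type": "null"}],
                        "title": "Old Sku",
                        "description": 'The old SKU in the format "OLD-XXX-YYYY".',
                    }
                },
            },
        },
    },
]

The first spec is the retriever tool; the second is the Unity Catalog function exposed to the agent, declared with strict mode and additionalProperties disabled so the model must supply exactly the old_sku argument.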
1373db51df7476c934e04796eaceed4d4475d7b7a70efcb3405b121c71e96923List(List(List(What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?, user)))List(List(Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine. And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.\n", + "\n", + "Some of the primary metrics that are typically tracked in game telemetry include:\n", + "\n", + "- **Player engagement:** Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.\n", + "- **Game progress:** Monitor player progress through different levels and milestones in the game.\n", + "- **In-game purchases:** Track the number and value of in-game purchases made by players.\n", + "- **Player demographics:** Collect demographic information about players, such as age, gender, location, and device type.\n", + "- **Session length:** Monitor the length of each player session, and how often players return to the game.\n", + "- **Retention:** Track the percentage of players who return to the game after their first session.\n", + "- **User Acquisition:** Track the number of new players acquired through different marketing channels., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf))List(Game telemetry is data collected about player behavior and interactions within a video game., The data is primarily sourced from the game engine., Primary metrics tracked in game telemetry include:\n", + " - player engagement\n", + " - game progress\n", + " - in-game purchases\n", + " - player demographics\n", + " - session length\n", + " - retention\n", + " - user acquisition){\"info\": {\"request_id\": \"tr-b120ba49b531438a91c59260d15c29fa\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852726, \"execution_time_ms\": 2065, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is game telemetry, and what...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"7e13913e-74b9-4e7f-b328-821e336dc896\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": 
\\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-b120ba49b531438a91c59260d15c29fa/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x05ce992d2bdd5504\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": null, \"start_time\": 1734543852726459743, \"end_time\": 1734543854791960725, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is game telemetry, and what...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x58e7dd0ff3a51163\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0x05ce992d2bdd5504\", \"start_time\": 1734543852815466171, \"end_time\": 1734543852822157056, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is game telemetry, and what...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xb20572c1aaf198bc\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0x05ce992d2bdd5504\", \"start_time\": 1734543852822376259, \"end_time\": 1734543852823113868, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xd358b99e4bca262c\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0xb20572c1aaf198bc\", \"start_time\": 1734543852822582261, \"end_time\": 1734543852822766664, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x24d875356f746818\", \"trace_id\": 
\"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0xb20572c1aaf198bc\", \"start_time\": 1734543852822906266, \"end_time\": 1734543852823078268, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x8bff9921d08b090e\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0x05ce992d2bdd5504\", \"start_time\": 1734543852823187869, \"end_time\": 1734543856940949499, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856940876898, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n 
return self._request(...), which ultimately raises openai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}

Another `Completions` span in the same record (trace tr-b120ba49b531438a91c59260d15c29fa) captures the request that triggered this failure: model `databricks-meta-llama-3-3-70b-instruct`, tools `search_product_docs` and `casaman_ssa__demos__sku_sample_translator`, `tool_choice: auto`, `temperature: 0.01`, `max_tokens: 1500`, with the system prompt plus the user question "What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?". It fails with the same RateLimitError, the stack trace again running through the OpenAI client's internal retries.

Evaluation record (rate-limited, no response produced):
- request: "What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?"
- harness error: "Fail to invoke the model with {'messages': [{'content': 'What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?', 'role': 'user'}]}" followed by the RateLimitError shown above
- expected source document: /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf
- response, judge ratings, and metrics: null (the model call never succeeded)
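Both rate-limited records in this run failed the same way: the workspace rate limit for `databricks-meta-llama-3-3-70b-instruct` was exceeded. Besides switching to a provisioned throughput endpoint as the error message suggests, a small client-side retry with exponential backoff keeps an evaluation run from losing rows to transient 429s. Below is a minimal sketch, assuming an already-configured OpenAI-compatible client such as the one wrapped by `databricks_model_serving_client.py`; the helper name and retry limits are illustrative and not part of the cookbook.

```python
import random
import time

import openai


def create_with_backoff(client, max_attempts: int = 5, **completion_kwargs):
    """Call chat.completions.create, retrying on 429s with exponential backoff.

    `client` is any OpenAI-compatible client (for example one pointed at a
    Databricks Model Serving endpoint); `completion_kwargs` are passed through
    unchanged (model, messages, tools, temperature, ...).
    """
    for attempt in range(1, max_attempts + 1):
        try:
            return client.chat.completions.create(**completion_kwargs)
        except openai.RateLimitError:
            if attempt == max_attempts:
                raise  # give up after the final attempt
            # Exponential backoff with jitter: roughly 1s, 2s, 4s, ... plus noise.
            time.sleep(2 ** (attempt - 1) + random.random())
```

With a provisioned throughput endpoint the retries simply never trigger, so a wrapper like this is harmless to keep in place during evaluation runs.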
Evaluation record d94a588c739512457882ea9bd39758fb222c0bef855b4c2e4d75dd8bf582c14d

- request: "What significant advancement in large language model development happened in 2012?"
- expected retrieved context (from /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf):

  "1950s-1990s: Initial attempts are made to map hard rules around languages and follow logical steps to accomplish tasks like translating a sentence from one language to another. While this works sometimes, strictly defined rules only work for concrete, well-defined tasks that the system has knowledge about.
  1990s: Language models begin evolving into statistical models and language patterns start being analyzed, but larger-scale projects are limited by computing power.
  2000s: Advancements in machine learning increase the complexity of language models, and the wide adoption of the internet sees an enormous increase in available training data.
  2012: Advancements in deep learning architectures and larger data sets lead to the development of GPT (Generative Pre-trained Transformer).
  2018: Google introduces BERT (Bidirectional Encoder Representations from Transformers), which is a big leap in architecture and paves the way for future large language models.
  2020: OpenAI releases GPT-3, which becomes the largest model at 175B parameters and sets a new performance benchmark for language-related tasks.
  2022: ChatGPT is launched, which turns GPT-3 and similar models into a service that is widely accessible to users through a web interface and kicks off a huge increase in public awareness of LLMs and generative AI.
  2023: Open source LLMs begin showing increasingly impressive results with releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna. GPT-4 is also released, setting a new benchmark for both parameter size and performance."

- expected facts: GPT (Generative Pre-trained Transformer) was developed; this development occurred in 2012.
- trace tr-b99c366618994c5eb9d3b4d72cee2989 (status OK, 708 ms): span structure agent → get_messages_array, parse_input (extract_user_query_string, extract_chat_history) → recursively_call_and_run_tools → Completions. The `Completions` call to `databricks-meta-llama-3-3-70b-instruct` finished with `finish_reason: "stop"` and no tool calls, so neither the retriever nor the SKU tool ran and the agent answered "I'm sorry, I can't help you with that."
- judge results: overall no; root cause correctness ("Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM."); safety yes (no harmful content detected in response); correctness no ("The expected response states that GPT (Generative Pre-trained Transformer) was developed and that this development occurred in 2012. The response does not provide any information about the development of GPT or the year 2012. Therefore, the response is not correct.")
- metrics: 0.708 s execution, 1166 total tokens (1153 prompt, 13 completion)
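The judge columns in these records (overall assessment, correctness rating and rationale, root-cause suggestion) are the quickest way to triage a run like this. The snippet below is a rough sketch of pulling out the rows the correctness judge rejected, assuming the per-row results have been loaded into a pandas DataFrame named `eval_df`; the column names follow the `response/llm_judged/...` pattern used by Agent Evaluation but should be checked against the columns actually present in your results table.

```python
import pandas as pd

# Assumed column names: verify against your evaluation results schema.
RATING_COL = "response/llm_judged/correctness/rating"
RATIONALE_COL = "response/llm_judged/correctness/rationale"


def failed_correctness(eval_df: pd.DataFrame) -> pd.DataFrame:
    """Return the request and correctness rationale for every judged-incorrect row."""
    bad = eval_df[eval_df[RATING_COL] == "no"]
    return bad[["request", RATIONALE_COL]]
```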
Evaluation record 6d1c05783fb5945cc9b121919eabdc2194c9c64809821e3c30b7f758a4d12a40

- request: "What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?"
- expected retrieved context (from /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf):

  "Our most popular use case is natural language processing (NLP), a rapidly growing field that enables businesses to gain value from unstructured textual data. This opens the door for users to accomplish tasks that were previously too abstract for code, such as summarizing content or extracting sentiment from customer reviews. In our data set, 49% of libraries used are associated with NLP. LLMs also fall within this bucket. Given the innovations launched in recent months, we expect to see NLP take off even more in coming years as it is applied to use cases like chatbots, research assistance, fraud detection, content generation and more."

- expected facts: 49% of specialized Python libraries in the data set are associated with NLP; tasks enabled by NLP include summarizing content, extracting sentiment from customer reviews, chatbots, research assistance, fraud detection, and content generation.
- trace tr-1c747ef0201042c7a3b0bd743b10dbf3 (status OK, 2064 ms): `Completions_1` returned a `search_product_docs` tool call with arguments `{"query": "NLP tasks and libraries percentage", "filters": [{"key": "category", "value": "NLP"}, {"key": "type", "value": "library"}]}`. The `vector_search_retriever` span then failed with `KeyError: 'field'` raised in `parse_filters` (cookbook/tools/vector_search.py, line 415: `suggested_field = filter_item["field"]`), so the tool message returned to the model was "Error: 'field'", and `Completions_2` answered "I'm sorry, I can't help you with that."
- judge results: overall no; root cause correctness (same suggested actions as the previous record); safety yes (no harmful content detected in response); correctness no ("The expected response states that 49% of specialized Python libraries in the data set are associated with NLP and lists several tasks enabled by NLP. The response does not provide any information about the percentage of specialized Python libraries associated with NLP or the tasks enabled by NLP. Therefore, the response is not correct."); context_sufficiency and groundedness were skipped ("Missing required field(s): retrieved_context").
- metrics: 2.064 s execution, 2481 total tokens (2415 prompt, 66 completion)
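The `KeyError: 'field'` above shows that the retriever's filter parser expects `{"field": ..., "value": ...}` dictionaries while the model emitted `{"key": ..., "value": ...}`, so the retriever never ran and the agent fell back to a refusal. Two fixes are possible: tighten the tool's `filters` parameter description so the model is told to use `field`, or make the parser tolerant of both spellings. The snippet below is a hedged sketch of the second option only; it is not the cookbook's actual `parse_filters`, which also handles operators and other cases not shown here.

```python
def normalize_filter_items(filters: list[dict]) -> list[dict]:
    """Accept filter dicts written as {"field": ..., "value": ...} or
    {"key": ..., "value": ...} and normalize them to the "field" form.

    Raises a descriptive error instead of a bare KeyError when neither
    spelling is present, so the tool message sent back to the LLM is usable.
    """
    normalized = []
    for item in filters or []:
        field = item.get("field", item.get("key"))
        if field is None or "value" not in item:
            raise ValueError(
                f"Filter {item!r} must contain a 'field' (or 'key') and a 'value'."
            )
        normalized.append({"field": field, "value": item["value"]})
    return normalized
```

A parser built this way could normalize the items before applying its operator logic, and the tool's parameter description can spell out the expected shape so the model stops guessing key names.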
Evaluation record fc67f25c728d8c264f373417e09fd8ecbf4cea9ec52a0fbd9d282dae461fc310

- request: "What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?"
- expected retrieved context (from /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf):

  "Most of the complexity has arisen with the explosion of data volumes and data types, with organizations amassing an estimated 80% of data in unstructured and semi-structured format. As the collection of data continues to increase, 73% of the data goes unused for analytics or decision-making. In order to try and decrease this percentage and make more data usable, data engineering teams are responsible for building data pipelines to efficiently and reliably deliver data. But the process of building these complex data pipelines comes with a number of difficulties:
  • In order to get data into a data lake, data engineers are required to spend immense time hand-coding repetitive data ingestion tasks
  • Since data platforms continuously change, data engineers spend time building and maintaining, and then rebuilding, complex scalable infrastructure
  • As data pipelines become more complex, data engineers are required to find reliable tools to orchestrate these pipelines
  • With the increasing importance of real-time data, low latency data pipelines are required, which are even more difficult to build and maintain
  • Finally, with all pipelines written, data engineers need to constantly focus on performance, tuning pipelines and architectures to meet SLAs"

- expected facts: manually hand-coding repetitive data ingestion tasks; continuously maintaining and rebuilding scalable infrastructure due to changing data platforms; finding reliable tools for orchestrating complex pipelines; building and maintaining low-latency pipelines for real-time data; constantly tuning pipeline performance to meet SLAs.
- trace tr-e6ad0c8144a7437289b16c45349319ed (status ERROR, 3035 ms, no trace output): the parser spans (get_messages_array, parse_input, extract_user_query_string, extract_chat_history) completed, but `recursively_call_and_run_tools` failed with openai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct.
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n 
return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x2451b1bb7bee28bc\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x53cf1c9e349c70f3\", \"start_time\": 1734543860995265691, \"end_time\": 1734543861836786402, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. 
If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_8f33785a-ba7e-4a5c-8dab-871f9c4f9ca0\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"data pipeline challenges for data lakes\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543861, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 25, \\\"prompt_tokens\\\": 1159, \\\"total_tokens\\\": 1184, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x0a9a3d3dfe5a0e3f\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x53cf1c9e349c70f3\", \"start_time\": 1734543861881712247, \"end_time\": 1734543862410990445, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"data pipeline challenges for data lakes\\\", \\\"filters\\\": 
[]}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live 
Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004132444, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"d85d526722f3ca9735bc45d98a9ad449\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. 
All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.00411582, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. 
This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. 
The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004092816, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"a6c4aa57b347d46b3d74ce86a7176024\\\"}, {\\\"page_content\\\": \\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. 
Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0040403795, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"1b74eac4a063d67e5f727e36b040965b\\\"}, {\\\"page_content\\\": \\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003983449, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x1c4b3d53e7e96fa9\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x0a9a3d3dfe5a0e3f\", \"start_time\": 1734543861882739759, \"end_time\": 1734543862409501927, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"data pipeline challenges for data lakes\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta 
Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"d85d526722f3ca9735bc45d98a9ad449\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.004132444], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. 
Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.00411582], [\\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. 
The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"a6c4aa57b347d46b3d74ce86a7176024\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.004092816], [\\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 
24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"1b74eac4a063d67e5f727e36b040965b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0040403795], [\\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.003983449]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0xcdab18046552e33b\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x0a9a3d3dfe5a0e3f\", \"start_time\": 1734543862409676129, \"end_time\": 1734543862410668941, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": 
{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"d85d526722f3ca9735bc45d98a9ad449\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.004132444], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. 
Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.00411582], [\\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. 
The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"a6c4aa57b347d46b3d74ce86a7176024\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.004092816], [\\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 
24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"1b74eac4a063d67e5f727e36b040965b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0040403795], [\\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.003983449]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta 
Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004132444, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"d85d526722f3ca9735bc45d98a9ad449\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. 
Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.00411582, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. 
The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004092816, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"a6c4aa57b347d46b3d74ce86a7176024\\\"}, {\\\"page_content\\\": \\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. 
Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0040403795, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"1b74eac4a063d67e5f727e36b040965b\\\"}, {\\\"page_content\\\": \\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003983449, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0x234d8afed236cb49\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x53cf1c9e349c70f3\", \"start_time\": 1734543862420300157, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"data pipeline challenges for data lakes\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"**Declarative ETL pipelines**\\\\\\\\nData engineers can reduce development time and effort and instead focus on\\\\\\\\nimplementing business logic and data quality checks within the data pipeline\\\\\\\\nusing SQL or Python. This can be achieved by:\\\\\\\\n\\\\\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\\\\\ndefine \\u201cwhat\\u201d to solve\\\\\\\\n\\\\\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\\\\\ndependencies across the data pipeline\\\\\\\\n\\\\\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\\\\\nand managing data pipeline recovery\\\\\\\\n\\\\\\\\n**Real-time data processing**\\\\\\\\nAllow data engineers to tune data latency with cost controls without the\\\\\\\\nneed to know complex stream processing or implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\\\\\n\\\\\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\\\\\n\\\\\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\\\\\nlogic for downstream use cases\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified orchestration of data workflows**\\\\\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\\\\\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\\\\\n\\\\\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\\\\\nAPI or from your IDE\\\\\\\\n\\\\\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\\\\\n\\\\\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\\\\\n\\\\\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\\\\\nminimize data movement with cluster reuse\\\\\\\\n\\\\\\\\n**Data quality validation and monitoring**\\\\\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\\\\\nconfidently trust the information for downstream initiatives by:\\\\\\\\n\\\\\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\\\\\nwith defined data expectations\\\\\\\\n\\\\\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\\\\\n(fail, drop, alert, quarantine)\\\\\\\\n\\\\\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\\\\\nand reported for the entire data pipeline\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nSources\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehouses\\\\\\\\n\\\\\\\\nOn-premises\\\\\\\\nSystems\\\\\\\\n\\\\\\\\nSaaS\\\\\\\\nApplications\\\\\\\\n\\\\\\\\nMachine &\\\\\\\\nApplication Logs\\\\\\\\n\\\\\\\\nApplication\\\\\\\\nEvents\\\\\\\\n\\\\\\\\nMobile & IoT\\\\\\\\nData\\\\\\\\n\\\\\\\\n\\\\\\\\nCloud\\\\\\\\nStorage\\\\\\\\n\\\\\\\\nMessag\\\\\\\\ne Buses\\\\\\\\n\\\\\\\\n\\\\\\\\n**Lakehouse Platform**\\\\\\\\n\\\\\\\\n**Workflows** for end-to-end orchestration\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time BI Apps\\\\\\\\n\\\\\\\\nReal-Time AI Apps\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time Analytics with\\\\\\\\n**Databricks SQL**\\\\\\\\n\\\\\\\\nReal-Time Machine Learning\\\\\\\\nwith\\\\\\\\n**Databricks ML**\\\\\\\\n\\\\\\\\n\\\\\\\\nStreaming ETL with\\\\\\\\n**Delta Live Tables**\\\\\\\\n\\\\\\\\n\\\\\\\\nPredictive\\\\\\\\nMaintenance\\\\\\\\n\\\\\\\\n\\\\\\\\nPersonalized\\\\\\\\nOffers\\\\\\\\n\\\\\\\\n\\\\\\\\nPatient\\\\\\\\nDiagnostics\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time Operational\\\\\\\\nApps\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time Applications with\\\\\\\\n**Spark Structured Streaming**\\\\\\\\n\\\\\\\\n**Photon** for lightning-fast data processing\\\\\\\\n\\\\\\\\n**Unity Catalog** for data governance and sharing\\\\\\\\n\\\\\\\\n**Delta Lake** for open and reliable data storage\\\\\\\\n\\\\\\\\n\\\\\\\\nAlerts Detection Fraud\\\\\\\\n\\\\\\\\n\\\\\\\\nDynamic\\\\\\\\nPricing\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\\\\\n\\\\\\\\nFigure 2\\\\\\\\nA unified set of tools for real-time data processing\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Fault tolerant and automatic recovery**\\\\\\\\nHandle transient errors and recover from most common error conditions\\\\\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\\\\\nrecovery that includes:\\\\\\\\n\\\\\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\\\\\n\\\\\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\\\\\ncheckpointing\\\\\\\\n\\\\\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004132444, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d85d526722f3ca9735bc45d98a9ad449\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. 
All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00411582, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\\\\\n\\\\\\\\nand batch and streaming jobs. 
Also, performance is hampered by expensive metadata operations \\u2014 for\\\\\\\\n\\\\\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\\\\\n\\\\\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\\\\\n\\\\\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\\\\\n\\\\\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\\\\\n\\\\\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\\\\\n\\\\\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\\\\\\\\n\\\\\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\\\\\n\\\\\\\\narchitecture possible.\\\\\\\\n\\\\\\\\n\\\\\\\\non all data on a simple, open and multicloud\\\\\\\\n\\\\\\\\nmodern data stack.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Exploratory Data Scientist**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Curated Data Lake**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Raw Data Ingest**\\\\\\\\n\\u201cBronze\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n**Filtered/Cleaned/Augmented**\\\\\\\\n\\u201cSilver\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n**Business-Level Aggregates**\\\\\\\\n\\u201cGold\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n**D ATA Q U A L I T Y**\\\\\\\\n\\\\\\\\n**Data Sources (Batch and Real-Time)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Unstructured**\\\\\\\\n\\\\\\\\n- Image, Video, Audio\\\\\\\\n\\\\\\\\n- Free Text, Blob\\\\\\\\n\\\\\\\\n\\\\\\\\n**Semi-Structured**\\\\\\\\n\\\\\\\\n- Logs, Clickstream\\\\\\\\n\\\\\\\\n- CSV, JSON, XML\\\\\\\\n\\\\\\\\n\\\\\\\\n**Structured**\\\\\\\\n\\\\\\\\n- Systems of Record\\\\\\\\n\\\\\\\\n- Operational DBs\\\\\\\\n\\\\\\\\n\\\\\\\\n**Figure 8:**\\\\\\\\nThe building blocks for a modern data architecture\\\\\\\\n\\\\\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\\\\\n\\\\\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\\\\\n\\\\\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\\\\\n\\\\\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\\\\\n\\\\\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\\\\\n\\\\\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\\\\\n\\\\\\\\ntime, money and duplication of effort. 
Data arrives in a landing zone and is then moved through a series of\\\\\\\\n\\\\\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\\\\\n\\\\\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\\\\\n\\\\\\\\nstepwise approach to improving data.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Lakehouse key features**\\\\\\\\n\\\\\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\\\\\n\\\\\\\\navailable for stakeholders to run business-critical production workloads:\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\\\\\n\\\\\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\\\\\n\\\\\\\\nmonitoring and recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\\\\\n\\\\\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\\\\\n\\\\\\\\nread or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\\\\\n\\\\\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\\\\\n\\\\\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\\\\\n\\\\\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. The lakehouse enables organizations\\\\\\\\n\\\\\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004092816, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"a6c4aa57b347d46b3d74ce86a7176024\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"##### The Delta Lake Series Complete Collection\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What is Delta Lake?\\\\\\\\n\\\\\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\\\\\n\\\\\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\\\\\nlifecycle management to data lakes. 
With Delta Lake, there will be no more\\\\\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\\\\\nmodifying data for data capture.\\\\\\\\n\\\\\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\\\\\nscalable cloud service.\\\\\\\\n\\\\\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\\\\\n\\\\\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\\\\\n\\\\\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\\\\\n\\\\\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\\\\\n\\\\\\\\n\\\\\\\\nRollbacks 39\\\\\\\\n\\\\\\\\nPinned view of a continuously updating\\\\\\\\n\\\\\\\\nDelta Lake table across multiple downstream jobs\\\\\\\\n\\\\\\\\nQueries for time series analytics made simple\\\\\\\\n\\\\\\\\nEasily Clone Your Delta Lake\\\\\\\\n\\\\\\\\nfor Testing, Sharing and ML\\\\\\\\n\\\\\\\\nReproducibility 41\\\\\\\\n\\\\\\\\nWhat are clones? 41\\\\\\\\n\\\\\\\\n\\\\\\\\nA lakehouse combines the best elements\\\\\\\\n\\\\\\\\nof data lakes and data warehouses 52\\\\\\\\n\\\\\\\\nSome early examples 55\\\\\\\\n\\\\\\\\nFrom BI to AI 55\\\\\\\\n\\\\\\\\nDiving Deep Into the\\\\\\\\n\\\\\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\\\\\n\\\\\\\\n1. Data lakes 57\\\\\\\\n\\\\\\\\n2. Custom storage engines 57\\\\\\\\n\\\\\\\\n\\\\\\\\nCreating the Dashboard /\\\\\\\\n\\\\\\\\nVirtual Network Operation Centers 82\\\\\\\\n\\\\\\\\nCreating (near) real-time alerts 85\\\\\\\\n\\\\\\\\nNext steps: machine learning 86\\\\\\\\n\\\\\\\\nPoint-of-failure prediction and remediation 87\\\\\\\\n\\\\\\\\nCustomer churn 87\\\\\\\\n\\\\\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\\\\\n\\\\\\\\nCustomer Use Cases 88\\\\\\\\n\\\\\\\\nHealthdirect Australia 89\\\\\\\\n\\\\\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\\\\\n\\\\\\\\n\\\\\\\\nFundamentals & Performance\\\\\\\\n\\\\\\\\n\\\\\\\\nUsing data skipping and Z-Order clustering 21\\\\\\\\n\\\\\\\\n\\\\\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\\\\\n\\\\\\\\n\\\\\\\\nExploring the details 21\\\\\\\\n\\\\\\\\n\\\\\\\\nPerformance Matter\\\\\\\\n\\\\\\\\n\\\\\\\\nFeatures\\\\\\\\n\\\\\\\\n\\\\\\\\nChallenges with data lakes\\\\\\\\n\\\\\\\\nDelta Lake\\u2019s key functionalities\\\\\\\\n\\\\\\\\nUnpacking the Transaction Log\\\\\\\\n\\\\\\\\nImplementing atomicity to ensure\\\\\\\\n\\\\\\\\n\\\\\\\\nWhy Use MERGE\\\\\\\\n\\\\\\\\nWith Delta Lake?\\\\\\\\n\\\\\\\\nWhen are upserts necessary? 
24\\\\\\\\n\\\\\\\\nWhy upserts into data lakes have\\\\\\\\n\\\\\\\\n\\\\\\\\noperations complete fully\\\\\\\\n\\\\\\\\n\\\\\\\\noperations complete fully 9\\\\\\\\n\\\\\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\\\\\n\\\\\\\\nTime travel, data lineage and debugging 10\\\\\\\\n\\\\\\\\nHow to Use Schema Enforcement and Evolution\\\\\\\\n\\\\\\\\nUnderstanding table schemas 11\\\\\\\\n\\\\\\\\n#### 01\\\\\\\\n\\\\\\\\n\\\\\\\\nFundamentals and Performance traditionally been challenging 25\\\\\\\\n\\\\\\\\n\\\\\\\\ntraditionally been challenging\\\\\\\\n\\\\\\\\n\\\\\\\\nShallow clones\\\\\\\\n\\\\\\\\nDeep clones\\\\\\\\n\\\\\\\\n\\\\\\\\n**Chapter**\\\\\\\\n\\\\\\\\n42\\\\\\\\n\\\\\\\\n42\\\\\\\\n\\\\\\\\n#### 04\\\\\\\\n\\\\\\\\n\\\\\\\\n3. Lakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nDealing with multiple concurrent reads and writes\\\\\\\\n\\\\\\\\n\\\\\\\\nIntroducing MERGE in Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\nIn the research paper, the authors explain: 59\\\\\\\\n\\\\\\\\n\\\\\\\\n3. Lakehouse Streaming 58\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\\\\\nand Performance Matter Deleting data due to GDPR 26\\\\\\\\n\\\\\\\\n\\\\\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0040403795, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1b74eac4a063d67e5f727e36b040965b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**\\u2022** Since data platforms continuously change, data engineers\\\\\\\\nspend time building and maintaining, and then rebuilding, complex\\\\\\\\nscalable infrastructure\\\\\\\\n\\\\\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\\\\\n\\\\\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\\\\\npipelines are required, which are even more difficult to build and maintain\\\\\\\\n\\\\\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\\\\\n\\\\\\\\n\\\\\\\\n**How can Databricks help?**\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The Lakehouse Platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\\\\\ndrive valuable insights.\\\\\\\\n\\\\\\\\nLakehouse Platform\\\\\\\\n\\\\\\\\n**One platform to support multiple personas**\\\\\\\\n\\\\\\\\n\\\\\\\\n**BI & Data**\\\\\\\\n**Warehousing**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Data**\\\\\\\\n**Engineering**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Data**\\\\\\\\n**Streaming**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Data**\\\\\\\\n**Science & ML**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\\\\\n\\\\\\\\n\\\\\\\\n**Unity Catalog**\\\\\\\\n**Fine-grained governance for data and AI**\\\\\\\\n\\\\\\\\n**Delta Lake**\\\\\\\\n**Data reliability and performance**\\\\\\\\n\\\\\\\\n**Cloud Data Lake**\\\\\\\\n\\\\\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\\\\\n\\\\\\\\n\\\\\\\\nFigure 1\\\\\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key differentiators for successful data engineering**\\\\\\\\n**with Databricks**\\\\\\\\n\\\\\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\\\\\nkey differentiating capabilities:\\\\\\\\n\\\\\\\\n**Data ingestion at scale**\\\\\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\\\\\nanalytics, data science or machine learning. This includes:\\\\\\\\n\\\\\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\\\\\n\\\\\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\\\\\nchanges for structured and unstructured data formats\\\\\\\\n\\\\\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\\\\\n\\\\\\\\nno manual intervention\\\\\\\\n\\\\\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\\\\\n\\\\\\\\n\\\\\\\\n**Declarative ETL pipelines**\\\\\\\\nData engineers can reduce development time and effort and instead focus on\\\\\\\\nimplementing business logic and data quality checks within the data pipeline\\\\\\\\nusing SQL or Python. 
This can be achieved by:\\\\\\\\n\\\\\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\\\\\ndefine \\u201cwhat\\u201d to solve\\\\\\\\n\\\\\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\\\\\ndependencies across the data pipeline\\\\\\\\n\\\\\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\\\\\nand managing data pipeline recovery\\\\\\\\n\\\\\\\\n**Real-time data processing**\\\\\\\\nAllow data engineers to tune data latency with cost controls without the\\\\\\\\nneed to know complex stream processing or implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\\\\\n\\\\\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\\\\\n\\\\\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\\\\\nlogic for downstream use cases\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003983449, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543863868205, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the key challen...\\\", \\\"params\\\": null}\", \"response\": null}}Fail to invoke the model with {'messages': [{'content': 'What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdfnullnullnullnullnullnullnullnullnullnullnullnullnullList(List(**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified orchestration of data workflows**\n", + "Simple, clear and reliable orchestration of data processing tasks for data,\n", + "analytics and machine learning pipelines with the ability to run multiple\n", + "non-interactive tasks as a directed acyclic graph (DAG) on a Databricks\n", + "compute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\n", + "in a DAG using Databricks Workflows, an orchestration tool included in the\n", + "lakehouse with no need to maintain or pay for an external orchestration service.\n", + "\n", + "**•** Easily create and manage multiple tasks with dependencies via UI,\n", + "API or from your IDE\n", + "\n", + "**•** Have full observability to all workflow runs and get alerted when\n", + "tasks fail for fast troubleshooting and efficient repair and rerun\n", + "\n", + "**•** Leverage high reliability of 99.95% uptime\n", + "\n", + "**•** Use performance optimization clusters that parallelize jobs and\n", + "minimize data movement with cluster reuse\n", + "\n", + "**Data quality validation and monitoring**\n", + "Improve data reliability throughout the data lakehouse so data teams can\n", + "confidently trust the information for downstream initiatives by:\n", + "\n", + "**•** Defining data quality and integrity controls within the pipeline\n", + "with defined data expectations\n", + "\n", + "**•** Addressing data quality errors with predefined policies\n", + "(fail, drop, alert, quarantine)\n", + "\n", + "**•** Leveraging the data quality metrics that are captured, tracked\n", + "and reported for the entire data pipeline\n", + "\n", + "\n", + "Data\n", + "Sources\n", + "\n", + "Data\n", + "Warehouses\n", + "\n", + "On-premises\n", + "Systems\n", + "\n", + "SaaS\n", + "Applications\n", + "\n", + "Machine &\n", + "Application Logs\n", + "\n", + "Application\n", + "Events\n", + "\n", + "Mobile & IoT\n", + "Data\n", + "\n", + "\n", + "Cloud\n", + "Storage\n", + "\n", + "Messag\n", + "e Buses\n", + "\n", + "\n", + "**Lakehouse Platform**\n", + "\n", + "**Workflows** for end-to-end orchestration\n", + "\n", + "\n", + "Real-Time BI Apps\n", + "\n", + "Real-Time AI Apps\n", + "\n", + "\n", + "Real-Time Analytics with\n", + "**Databricks SQL**\n", + "\n", + "Real-Time Machine Learning\n", + "with\n", + "**Databricks ML**\n", + "\n", + "\n", + "Streaming ETL with\n", + "**Delta Live Tables**\n", + "\n", + "\n", + "Predictive\n", + "Maintenance\n", + "\n", + "\n", + "Personalized\n", + "Offers\n", + "\n", + "\n", + "Patient\n", + "Diagnostics\n", + "\n", + "\n", + "Real-Time Operational\n", + "Apps\n", + "\n", + "\n", + "Real-Time Applications with\n", + "**Spark Structured Streaming**\n", + "\n", + "**Photon** for lightning-fast data processing\n", + "\n", + "**Unity Catalog** for data governance and sharing\n", + "\n", + "**Delta Lake** for open and reliable data storage\n", + "\n", + "\n", + "Alerts Detection Fraud\n", + "\n", + "\n", + "Dynamic\n", + "Pricing\n", + "\n", + "\n", + "©2023 Databricks Inc. 
— All rights reserved\n", + "\n", + "Figure 2\n", + "A unified set of tools for real-time data processing\n", + "\n", + "\n", + "-----\n", + "\n", + "**Fault tolerant and automatic recovery**\n", + "Handle transient errors and recover from most common error conditions\n", + "occurring during the operation of a pipeline with fast, scalable automatic\n", + "recovery that includes:\n", + "\n", + "**•** Fault tolerant mechanisms to consistently recover the state of data\n", + "\n", + "**•** The ability to automatically track progress from the source with\n", + "checkpointing\n", + "\n", + "**•** The ability to automatically recover and restore the data pipeline state, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf), List(# Building Reliable Data Lakes at Scale With Delta Lake\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "#### Data Engineering Drivers 2\n", + "\n", + " Data Pipeline Key Goals 4\n", + "\n", + " Apache Spark™: The First Unified Analytics Engine 5\n", + "\n", + " Data Reliability Challenges With Data Lakes 6\n", + "\n", + " Delta Lake: A New Storage Layer 7\n", + "\n", + " Delta Lake: Key Features 8\n", + "\n", + " Getting Started With Delta Lake 10\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "#### Data Engineering Drivers\n", + "\n", + "Data engineering professionals are needing to respond to several different drivers.\n", + "\n", + "Chief among the drivers they face are:\n", + "\n", + "**Rise of Advanced Analytics** — Advanced analytics, including methods\n", + "\n", + "based on machine learning techniques, have evolved to such a degree that\n", + "\n", + "organizations seek to derive far more value from their corporate assets.\n", + "\n", + "**Widespread Adoption** — Once the province of leading edge, high-tech\n", + "\n", + "companies, these advanced approaches are being adopted across a\n", + "\n", + "multitude of industries from retail to hospitality to healthcare and across\n", + "\n", + "private as well as public sector organizations. This is further driving the need\n", + "\n", + "for strong data engineering practices.\n", + "\n", + "**Regulation** — With the growth of data generation and data collection,\n", + "\n", + "there is increased interest in how the data is protected and managed.\n", + "\n", + "Regulatory regimes such as GDPR (General Data Protection Regulation)\n", + "\n", + "from the EU and other jurisdictions mandate very specific ways in which\n", + "\n", + "data must be managed.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "**Technology Innovation** — The move to cloud-based analytics architectures\n", + "\n", + "that is now well underway is being propelled further by innovations such as\n", + "\n", + "analytics-focused chipsets, pipeline automation and the unification of data\n", + "\n", + "and machine learning. All these offer data professionals new approaches for\n", + "\n", + "their data initiatives.\n", + "\n", + "**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n", + "\n", + "also subject to increasing scrutiny. There is also a greater understanding of\n", + "\n", + "data as a valuable asset. 
Deriving value from data must be done in a manner\n", + "\n", + "that is financially responsible and actually value adding to the enterprise and\n", + "\n", + "meeting ROI hurdles.\n", + "\n", + "**Role Evolution** — Reflecting the importance of managing the data and\n", + "\n", + "maximizing value extraction, the Chief Data Officer (CDO) role is becoming\n", + "\n", + "more prominent and newer roles such as Data Curator are emerging.\n", + "\n", + "They must balance the needs of governance, security and democratization.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Key Goals\n", + "\n", + "#### Data Pipeline Key Goals\n", + "\n", + "Making quality data available in a reliable manner is a major determinant of success for data\n", + "\n", + "analytics initiatives be they regular dashboards or reports, or advanced analytics projects\n", + "\n", + "drawing on state-of-the-art machine learning techniques. Data engineers tasked with this\n", + "\n", + "responsibility need to take account of a broad set of dependencies and requirements as they\n", + "\n", + "design and build their data pipelines.\n", + "\n", + "Three primary goals that data engineers typically seek to address as they work to enable the\n", + "\n", + "analytics professionals in their organizations are:\n", + "\n", + "**Deliver quality data in less time** — When it comes to data, quality and timeliness\n", + "\n", + "are key. Data with gaps or errors (which can arise for many reasons) is\n", + "\n", + "“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n", + "\n", + "users. Equally well, many applications require up-to-date information (who\n", + "\n", + "wants to use last night’s closing stock price or weather forecast) and are of\n", + "\n", + "limited value without it.\n", + "\n", + "**Enable faster queries** — Wanting fast responses to queries is natural enough\n", + "\n", + "in today’s “New York minute,” online world. Achieving this is particularly\n", + "\n", + "demanding when the queries are based on very large data sets.\n", + "\n", + "**Simplify data engineering at scale** — It is one thing to have high reliability and\n", + "\n", + "performance in a limited, development or test environment. What matters\n", + "\n", + "more is the ability to have robust, production data pipelines at scale without\n", + "\n", + "requiring high operational overhead.\n", + "\n", + "\n", + "-----, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf), List(data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n", + "\n", + "and batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n", + "\n", + "example, efficiently listing the millions of files (objects) that make up most large data lakes.\n", + "\n", + "**Lakehouse — the modern data architecture**\n", + "\n", + "What if it were possible to combine the best of both worlds? The performance, concurrency and data\n", + "\n", + "management of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n", + "\n", + "the target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n", + "\n", + "the complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\n", + "\n", + "of this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n", + "\n", + "architecture possible.\n", + "\n", + "\n", + "on all data on a simple, open and multicloud\n", + "\n", + "modern data stack.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Exploratory Data Scientist**\n", + "\n", + "\n", + "**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\n", + "\n", + "\n", + "**Curated Data Lake**\n", + "\n", + "\n", + "**Raw Data Ingest**\n", + "“Bronze”\n", + "\n", + "\n", + "**Filtered/Cleaned/Augmented**\n", + "“Silver”\n", + "\n", + "\n", + "**Business-Level Aggregates**\n", + "“Gold”\n", + "\n", + "\n", + "**D ATA Q U A L I T Y**\n", + "\n", + "**Data Sources (Batch and Real-Time)**\n", + "\n", + "\n", + "**Unstructured**\n", + "\n", + "- Image, Video, Audio\n", + "\n", + "- Free Text, Blob\n", + "\n", + "\n", + "**Semi-Structured**\n", + "\n", + "- Logs, Clickstream\n", + "\n", + "- CSV, JSON, XML\n", + "\n", + "\n", + "**Structured**\n", + "\n", + "- Systems of Record\n", + "\n", + "- Operational DBs\n", + "\n", + "\n", + "**Figure 8:**\n", + "The building blocks for a modern data architecture\n", + "\n", + "The lakehouse architecture provides a flexible, high-performance design for diverse data applications,\n", + "\n", + "including real-time streaming, batch processing, data warehousing, data science and machine learning. This\n", + "\n", + "target-state architecture supports loading all the data types that might be interesting to an organization —\n", + "\n", + "structured, semi-structured and unstructured — and provides a single processing layer, using consistent\n", + "\n", + "APIs across programming languages, to curate data while applying rigorous data management techniques.\n", + "\n", + "The move toward a single, consistent approach to data pipelining and refinement saves organizations\n", + "\n", + "time, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\n", + "\n", + "curation and refinement steps resulting in highly consumable and trusted data for downstream use cases.\n", + "\n", + "The architecture makes possible the efficient creation of “data assets” for the organization by taking a\n", + "\n", + "stepwise approach to improving data.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Lakehouse key features**\n", + "\n", + "To effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\n", + "\n", + "available for stakeholders to run business-critical production workloads:\n", + "\n", + "\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\n", + "\n", + "management with declarative pipeline development, automatic data testing and deep visibility for\n", + "\n", + "monitoring and recovery.\n", + "\n", + "\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n", + "\n", + "data concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n", + "\n", + "read or write data, typically using SQL.\n", + "\n", + "\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n", + "\n", + "and evolution, supporting DW schema paradigms such as star/snowflake schemas. 
The system should\n", + "\n", + "be able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n", + "\n", + "\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n", + "\n", + "lakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n", + "\n", + "to unify data and AI assets by centrally sharing, auditing, securing and managing structured and, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf), List(##### The Delta Lake Series Complete Collection\n", + "\n", + "\n", + "-----\n", + "\n", + "### What is Delta Lake?\n", + "\n", + "[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\n", + "analytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\n", + "compatible with Apache Spark™ APIs.\n", + "\n", + "At Databricks, we’ve seen how Delta Lake can bring reliability, performance and\n", + "lifecycle management to data lakes. With Delta Lake, there will be no more\n", + "malformed data ingestion, difficulties deleting data for compliance, or issues\n", + "modifying data for data capture.\n", + "\n", + "With Delta Lake, you can accelerate the velocity that high-quality data can get into\n", + "your data lake and the rate that teams can leverage that data with a secure and\n", + "scalable cloud service.\n", + "\n", + "In this eBook, the Databricks team has compiled all of their insights into a comprehensive\n", + "format so that you can gain a full understanding of Delta Lake and its capabilities.\n", + "\n", + "\n", + "-----\n", + "\n", + "Contents Processes Petabytes With Data Skipping and Z-Ordering\n", + "\n", + "Fundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\n", + "\n", + "The Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\n", + "\n", + "Performance Matter **you’ll find inside** 5 Features 22\n", + "\n", + "\n", + "\n", + "Processes Petabytes With Data Skipping and Z-Ordering\n", + "\n", + "\n", + "Rollbacks 39\n", + "\n", + "Pinned view of a continuously updating\n", + "\n", + "Delta Lake table across multiple downstream jobs\n", + "\n", + "Queries for time series analytics made simple\n", + "\n", + "Easily Clone Your Delta Lake\n", + "\n", + "for Testing, Sharing and ML\n", + "\n", + "Reproducibility 41\n", + "\n", + "What are clones? 41\n", + "\n", + "\n", + "A lakehouse combines the best elements\n", + "\n", + "of data lakes and data warehouses 52\n", + "\n", + "Some early examples 55\n", + "\n", + "From BI to AI 55\n", + "\n", + "Diving Deep Into the\n", + "\n", + "Inner Workings of the Lakehouse and Delta Lake 56\n", + "\n", + "1. Data lakes 57\n", + "\n", + "2. 
Custom storage engines 57\n", + "\n", + "\n", + "Creating the Dashboard /\n", + "\n", + "Virtual Network Operation Centers 82\n", + "\n", + "Creating (near) real-time alerts 85\n", + "\n", + "Next steps: machine learning 86\n", + "\n", + "Point-of-failure prediction and remediation 87\n", + "\n", + "Customer churn 87\n", + "\n", + "Getting started with the Databricks streaming video QoS solution 87\n", + "\n", + "Customer Use Cases 88\n", + "\n", + "Healthdirect Australia 89\n", + "\n", + "Data quality and governance issues, silos, and the inability to scale 89\n", + "\n", + "\n", + "Fundamentals & Performance\n", + "\n", + "\n", + "Using data skipping and Z-Order clustering 21\n", + "\n", + "\n", + "The Fundamentals of Delta Lake: Why Reliability and\n", + "\n", + "\n", + "Exploring the details 21\n", + "\n", + "\n", + "Performance Matter\n", + "\n", + "\n", + "Features\n", + "\n", + "\n", + "Challenges with data lakes\n", + "\n", + "Delta Lake’s key functionalities\n", + "\n", + "Unpacking the Transaction Log\n", + "\n", + "Implementing atomicity to ensure\n", + "\n", + "\n", + "Why Use MERGE\n", + "\n", + "With Delta Lake?\n", + "\n", + "When are upserts necessary? 24\n", + "\n", + "Why upserts into data lakes have\n", + "\n", + "\n", + "operations complete fully\n", + "\n", + "\n", + "operations complete fully 9\n", + "\n", + "Dealing with multiple concurrent reads and writes **Chapter**\n", + "\n", + "Time travel, data lineage and debugging 10\n", + "\n", + "How to Use Schema Enforcement and Evolution\n", + "\n", + "Understanding table schemas 11\n", + "\n", + "#### 01\n", + "\n", + "\n", + "Fundamentals and Performance traditionally been challenging 25\n", + "\n", + "\n", + "traditionally been challenging\n", + "\n", + "\n", + "Shallow clones\n", + "\n", + "Deep clones\n", + "\n", + "\n", + "**Chapter**\n", + "\n", + "42\n", + "\n", + "42\n", + "\n", + "#### 04\n", + "\n", + "\n", + "3. Lakehouse\n", + "\n", + "\n", + "Dealing with multiple concurrent reads and writes\n", + "\n", + "\n", + "Introducing MERGE in Delta Lake\n", + "\n", + "\n", + "In the research paper, the authors explain: 59\n", + "\n", + "\n", + "3. 
Lakehouse Streaming 58\n", + "\n", + "\n", + "\n", + "- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\n", + "and Performance Matter Deleting data due to GDPR 26\n", + "\n", + "\n", + "Understanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf), List(**•** Since data platforms continuously change, data engineers\n", + "spend time building and maintaining, and then rebuilding, complex\n", + "scalable infrastructure\n", + "\n", + "**•** As data pipelines become more complex, data engineers are\n", + "required to find reliable tools to orchestrate these pipelines\n", + "\n", + "**•** With the increasing importance of real-time data, low latency data\n", + "pipelines are required, which are even more difficult to build and maintain\n", + "\n", + "**•** Finally, with all pipelines written, data engineers need to constantly\n", + "focus on performance, tuning pipelines and architectures to meet SLAs\n", + "\n", + "\n", + "**How can Databricks help?**\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The Lakehouse Platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability to\n", + "drive valuable insights.\n", + "\n", + "Lakehouse Platform\n", + "\n", + "**One platform to support multiple personas**\n", + "\n", + "\n", + "**BI & Data**\n", + "**Warehousing**\n", + "\n", + "\n", + "**Data**\n", + "**Engineering**\n", + "\n", + "\n", + "**Data**\n", + "**Streaming**\n", + "\n", + "\n", + "**Data**\n", + "**Science & ML**\n", + "\n", + "\n", + "©2023 Databricks Inc. — All rights reserved\n", + "\n", + "\n", + "**Unity Catalog**\n", + "**Fine-grained governance for data and AI**\n", + "\n", + "**Delta Lake**\n", + "**Data reliability and performance**\n", + "\n", + "**Cloud Data Lake**\n", + "\n", + "All Raw Data (Logs, Texts, Audio, Video, Images)\n", + "\n", + "\n", + "Figure 1\n", + "The Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key differentiators for successful data engineering**\n", + "**with Databricks**\n", + "\n", + "By simplifying on a lakehouse architecture, data engineers need an\n", + "enterprise-grade and enterprise-ready approach to building data pipelines.\n", + "To be successful, a data engineering solution team must embrace these eight\n", + "key differentiating capabilities:\n", + "\n", + "**Data ingestion at scale**\n", + "With the ability to ingest petabytes of data with auto-evolving schemas,\n", + "data engineers can deliver fast, reliable, scalable and automatic data for\n", + "analytics, data science or machine learning. 
This includes:\n", + "\n", + "**•** Incrementally and efficiently processing data as it arrives\n", + "from files or streaming sources like Kafka, DBMS and NoSQL\n", + "\n", + "**•** Automatically inferring schema and detecting column\n", + "changes for structured and unstructured data formats\n", + "\n", + "**•** Automatically and efficiently tracking data as it arrives with\n", + "\n", + "no manual intervention\n", + "\n", + "**•** Preventing data loss by rescuing data columns\n", + "\n", + "\n", + "**Declarative ETL pipelines**\n", + "Data engineers can reduce development time and effort and instead focus on\n", + "implementing business logic and data quality checks within the data pipeline\n", + "using SQL or Python. This can be achieved by:\n", + "\n", + "**•** Using intent-driven declarative development to simplify “how” and\n", + "define “what” to solve\n", + "\n", + "**•** Automatically creating high-quality lineage and managing table\n", + "dependencies across the data pipeline\n", + "\n", + "**•** Automatically checking for missing dependencies or syntax errors,\n", + "and managing data pipeline recovery\n", + "\n", + "**Real-time data processing**\n", + "Allow data engineers to tune data latency with cost controls without the\n", + "need to know complex stream processing or implement recovery logic.\n", + "\n", + "**•** Avoid handling batch and real-time streaming data sources separately\n", + "\n", + "**•** Execute data pipeline workloads on automatically provisioned elastic\n", + "Apache Spark™-based compute clusters for scale and performance\n", + "\n", + "**•** Remove the need to manage infrastructure and focus on the business\n", + "logic for downstream use cases\n", + "\n", + "\n", + "-----, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf))nullnullnullnullnullnull
de1daac1a320379ce055bdc8b8342a2d7ca8d1ea08483081801f8219f41dc69dList(List(List(What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?, user)))List(List(“In today’s experience-driven world, the most beloved brands are the ones that know their customers. Customers are loyal to brands that recognize their needs and preferences — and tailor user journeys and engagements accordingly.\n", + "\n", + "A study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer. And as organizations pursue omnichannel excellence, these same high expectations of online experiences also extend to brick-and-mortar locations — revealing for many merchants that personalized engagement is fundamental to attracting customers and expanding share of wallet.\n", + "\n", + "But achieving a 360-degree view of your customers to serve personalized experiences requires integrating various types of data — including demographics, behavioral and transactional — to develop robust profiles. This guide focuses on six actionable strategic pillars for businesses to leverage automation, real-time data, AI-driven analysis and well-tuned ML models to architect and deliver customized customer experiences at every touch point.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf))List(76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.){\"info\": {\"request_id\": \"tr-086a428d0c8e48f696b74292e6de14dd\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852759, \"execution_time_ms\": 4339, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey stud...\"}, \"tags\": {\"eval.requestId\": \"21081e4e-3cec-4efe-b82d-4da881b06daf\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", 
\\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-086a428d0c8e48f696b74292e6de14dd/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x55e8be6f41937068\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": null, \"start_time\": 1734543852759018056, \"end_time\": 1734543857098523511, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. 
This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, 
availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. 
It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. 
With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x5361cc2ae9259abf\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x55e8be6f41937068\", \"start_time\": 1734543852808311281, \"end_time\": 1734543852837210547, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xd08249d64badfb47\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x55e8be6f41937068\", \"start_time\": 1734543852837315448, \"end_time\": 1734543852837962656, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x58eb14ae2c4c7759\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0xd08249d64badfb47\", \"start_time\": 1734543852837461050, \"end_time\": 1734543852837605052, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": 
\"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x02ee0dd883e9b850\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0xd08249d64badfb47\", \"start_time\": 1734543852837666553, \"end_time\": 1734543852837926256, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x5d93eb743b5405d0\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x55e8be6f41937068\", \"start_time\": 1734543852838067058, \"end_time\": 1734543857097985404, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. 
And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. 
This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, 
availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. 
It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. 
With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x587f34a7e122462a\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x5d93eb743b5405d0\", \"start_time\": 1734543853035001055, \"end_time\": 1734543853940320633, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. 
**Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_2aca309c-76e7-43a7-b865-7f13c78ba752\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543853, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 40, \\\"prompt_tokens\\\": 1169, \\\"total_tokens\\\": 1209, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, 
\"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x4c79b1c071988e15\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x5d93eb743b5405d0\", \"start_time\": 1734543853944972392, \"end_time\": 1734543854364782042, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\", \\\"filters\\\": []}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032405849, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"0473e2deba8639930389964be7b25bc7\\\"}, {\\\"page_content\\\": \\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. 
Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0031753962, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"d53c2a5c69cef5febfa62ea961c33d25\\\"}, {\\\"page_content\\\": \\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0028500317, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\"}, {\\\"page_content\\\": \\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002557174, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\"}, \\\"id\\\": \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025465384, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8e054539e38c8a49888991a85b178399\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x685e74c98dec52e7\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x4c79b1c071988e15\", \"start_time\": 1734543853946687214, \"end_time\": 1734543854363491826, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"0473e2deba8639930389964be7b25bc7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0032405849], [\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. 
Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"d53c2a5c69cef5febfa62ea961c33d25\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0031753962], [\\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. 
While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0028500317], [\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.002557174], [\\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"8e054539e38c8a49888991a85b178399\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0025465384]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0x5a90adec32571a53\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x4c79b1c071988e15\", \"start_time\": 1734543854363626328, \"end_time\": 1734543854364452438, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"0473e2deba8639930389964be7b25bc7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0032405849], [\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. 
Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"d53c2a5c69cef5febfa62ea961c33d25\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0031753962], [\\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0028500317], [\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.002557174], [\\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"8e054539e38c8a49888991a85b178399\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0025465384]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032405849, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"0473e2deba8639930389964be7b25bc7\\\"}, {\\\"page_content\\\": \\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. 
Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0031753962, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"d53c2a5c69cef5febfa62ea961c33d25\\\"}, {\\\"page_content\\\": \\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0028500317, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\"}, {\\\"page_content\\\": \\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002557174, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\"}, \\\"id\\\": \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025465384, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8e054539e38c8a49888991a85b178399\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0x63065083d9c8d37f\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x5d93eb743b5405d0\", \"start_time\": 1734543854375613973, \"end_time\": 1734543857095139570, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_c02e8f70-9c86-48ef-aeb6-ac39aa754c68\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543856, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 31, \\\"prompt_tokens\\\": 5748, \\\"total_tokens\\\": 5779, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider 
buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}null/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdfAccording to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.yesnullyesNo harmful content detected in responseyesThe expected response states that 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience. The response confirms this by stating that according to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience. The response is correct.4.3396988.06917.071.0nullnullList(List(Al-Futtaim’s focus is to harness their data to improve all areas of the\n", + "business, from streamlining the supply chain to optimizing marketing\n", + "strategies. But with the brands capturing such a wide variety of data,\n", + "Al-Futtaim’s legacy systems struggled to provide a single view into\n", + "the customer due to data silos and the inability to scale efficiently to\n", + "meet analytical needs.\n", + "\n", + "\n", + "-----\n", + "\n", + "The personalization of customer experiences will remain a key focus for B2C\n", + "and [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\n", + "experience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\n", + "long-established players.\n", + "\n", + "**Focus on the customer journey**\n", + "\n", + "Personalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\n", + "The [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\n", + "how they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\n", + "[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Personalizing the beauty product shopping experience**\n", + "\n", + "Flaconi wanted to leverage data and AI to become the No. 1 online\n", + "beauty product destination in Europe. However, they struggled with\n", + "massive volumes of streaming data and with infrastructure complexity\n", + "that was resource-intensive and costly to scale. See how they used\n", + "Databricks to increase time-to-market by 200x, reduce staff costs by\n", + "40% and increase net order income.\n", + "\n", + "Get the full story\n", + "\n", + "\n", + "¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\n", + "Experience Performance Index in 2007-09.\n", + "\n", + "Source: Forrester Customer Experience Performance Index (2007-09); press search\n", + "\n", + "CX leaders outperform laggards, even in a down market, in this visualization of the Forrester\n", + "Customer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\n", + "\n", + "\n", + "-----\n", + "\n", + "Careful consideration of how customers interact with various assets — and how\n", + "these interactions may be interpreted as expressions of preference — can unlock\n", + "a wide range of data that enables personalization.\n", + "\n", + "\n", + "The complexity of these engines requires that they be deployed thoughtfully, using\n", + "limited pilots and customer response assessments. And in those assessments,\n", + "it’s important to keep in mind that there is no expectation of perfection — only\n", + "incremental improvement over the prior solution.\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Need help generating personalized**\n", + "**recommendations?**\n", + "\n", + "\n", + "**Connecting shoppers to savings with data-driven**\n", + "**personalization‌**\n", + "\n", + "\n", + "Use the **Recommendation Engines Accelerator** to estimate\n", + "customers’ potential receptiveness to an offer or to\n", + "content related to a subset of products. 
Using these scores,\n", + "marketers can determine which of the many messages at\n", + "their disposal should be presented to a specific customer.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf), List(**4.** **Streamlining Customer Analysis and Targeting**\n", + "Creating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n", + "\n", + "**5.** **Assessing Consumer Interest Data**\n", + "Fine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n", + "\n", + "**6.** **Delivering Personalized Customer Journeys**\n", + "Crafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n", + "\n", + "**Conclusion**\n", + "Building a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "In today’s experience-driven world, the most beloved brands are the ones that\n", + "know their customers. Customers are loyal to brands that recognize their needs\n", + "and preferences — and tailor user journeys and engagements accordingly.\n", + "\n", + "A study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\n", + "buying from a brand that personalizes the shopping and user experience to the\n", + "wants and needs of the customer. And as organizations pursue omnichannel\n", + "excellence, these same high expectations of online experiences also extend to\n", + "brick-and-mortar locations — revealing for many merchants that personalized\n", + "engagement is fundamental to attracting customers and expanding share of wallet.\n", + "\n", + "But achieving a 360-degree view of your customers to serve personalized\n", + "experiences requires integrating various types of data — including demographics,\n", + "behavioral and transactional — to develop robust profiles. This guide focuses on six\n", + "actionable strategic pillars for businesses to leverage automation, real-time data,\n", + "AI-driven analysis and well-tuned ML models to architect and deliver customized\n", + "customer experiences at every touch point.\n", + "\n", + "\n", + "# 76%\n", + "\n", + "of consumers are more\n", + "likely to purchase due to\n", + "personalization\n", + "\n", + "\n", + "# 76%\n", + "\n", + "\n", + "-----\n", + "\n", + "### Building a Foundation for Personalization\n", + "\n", + "Get a 360-degree view of the customer by leveraging ML-based entity resolution\n", + "\n", + "\n", + "To create truly personalized interactions, you need actionable insights\n", + "about your customers. 
Start by establishing a common customer profile and\n", + "accurately linking together customer records across disparate data sets.\n", + "\n", + "Get a 360-degree view of your target customer by bringing together:, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf), List(Customer\n", + "\n", + "\n", + "Use the **Propensity Scoring Accelerator** to estimate\n", + "customers’ potential receptiveness to an offer or to\n", + "content related to a subset of products. Using these scores,\n", + "marketers can determine which of the many messages at\n", + "their disposal should be presented to a specific customer.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n", + "\n", + "\n", + "Downstream\n", + "Applications\n", + "\n", + "\n", + "A three-part propensity scoring workflow.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Delivering Personalized Customer Journeys\n", + "\n", + "Strategies for crafting a real-time recommendation engine\n", + "\n", + "\n", + "As the economy continues to weather unpredictable disruptions, shortages and\n", + "demand, delivering personalized customer experiences at speed and scale will\n", + "require adaptability on the ground and within a company’s operational tech stack.\n", + "\n", + "\n", + "With the Databricks Lakehouse, Al-Futtaim has transformed their data\n", + "strategy and operations, allowing them to create a “golden customer\n", + "record” that improves all decision-making from forecasting demand to\n", + "powering their global loyalty program.\n", + "\n", + "[Get the full story](https://www.databricks.com/customers/al-futtaim)\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "\n", + "“Databricks Lakehouse allows every division in our\n", + "organization — from automotive to retail — to gain\n", + "a unified view of our customer across businesses.\n", + "With these insights, we can optimize everything from\n", + "forecasting and supply chain, to powering our loyalty\n", + "program through personalized marketing campaigns,\n", + "cross-sell strategies and offers.”\n", + "\n", + "**D M I T R I Y D O V G A N**\n", + "Head of Data Science, Al-Futtaim Group\n", + "\n", + "As COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\n", + "safety and community, brands most attuned to changing needs and sentiments\n", + "saw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. 
While some segments gained\n", + "business and many lost, organizations that had already begun the journey toward\n", + "improved customer experience saw better outcomes, closely mirroring patterns\n", + "[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\n", + "\n", + "\n", + "**Creating a unified view across 200+ brands**\n", + "\n", + "As a driving force for economic growth in the Middle East, Al-Futtaim\n", + "impacts the lives of millions of people across the region through the\n", + "distribution and operations of global brands like Toyota, IKEA, Ace\n", + "Hardware and Marks & Spencer.\n", + "\n", + "Al-Futtaim’s focus is to harness their data to improve all areas of the\n", + "business, from streamlining the supply chain to optimizing marketing\n", + "strategies. But with the brands capturing such a wide variety of data,\n", + "Al-Futtaim’s legacy systems struggled to provide a single view into\n", + "the customer due to data silos and the inability to scale efficiently to\n", + "meet analytical needs.\n", + "\n", + "\n", + "-----\n", + "\n", + "The personalization of customer experiences will remain a key focus for B2C\n", + "and [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\n", + "experience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\n", + "long-established players.\n", + "\n", + "**Focus on the customer journey**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf), List(**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n", + "\n", + "**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n", + "\n", + "\n", + "[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n", + "\n", + "\n", + "**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n", + "\n", + "\n", + "-----\n", + "\n", + "**Product distribution:**\n", + "**segmentation and personalization**\n", + "\n", + "The most forward-thinking and data-driven insurers are\n", + "focused on achieving personalization at scale. 
They are\n", + "exploring new partnerships and business models to create\n", + "integrated, value-added experiences that prioritize the\n", + "overall health and financial wellness of their customers,\n", + "rather than just their insurance needs. These insurers\n", + "are investing in new data sources, analytics platforms,\n", + "and artificial intelligence (AI)-powered decision engines\n", + "that enable them to connect producers with like-minded\n", + "customers or engage customers with enticing offers\n", + "and actionable steps based on their previous choices.\n", + "The outcome is more efficient and effective service\n", + "from producers, trusted and convenient interactions for\n", + "consumers, and increased customer engagement and\n", + "growth for insurers in an increasingly digital-oriented world.\n", + "\n", + "\n", + "**Customer/Partner Successes**\n", + "\n", + "**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n", + "\n", + "[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\n", + "insurance companies. It enables them to complete, unify and comprehensively capture customer profiles\n", + "using a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n", + "360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\n", + "as call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n", + "360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n", + "\n", + "With Persona 360, you can:\n", + "\n", + "**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n", + "1,695+ attributes and segments, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf), List(-----\n", + "\n", + "Careful consideration of how customers interact with various assets — and how\n", + "these interactions may be interpreted as expressions of preference — can unlock\n", + "a wide range of data that enables personalization.\n", + "\n", + "\n", + "The complexity of these engines requires that they be deployed thoughtfully, using\n", + "limited pilots and customer response assessments. And in those assessments,\n", + "it’s important to keep in mind that there is no expectation of perfection — only\n", + "incremental improvement over the prior solution.\n", + "\n", + "\n", + "**C A S E S T U DY**\n", + "\n", + "**Need help generating personalized**\n", + "**recommendations?**\n", + "\n", + "\n", + "**Connecting shoppers to savings with data-driven**\n", + "**personalization‌**\n", + "\n", + "\n", + "Use the **Recommendation Engines Accelerator** to estimate\n", + "customers’ potential receptiveness to an offer or to\n", + "content related to a subset of products. 
Using these scores,\n", + "marketers can determine which of the many messages at\n", + "their disposal should be presented to a specific customer.\n", + "\n", + "**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n", + "\n", + "\n", + "Flipp is an online marketplace that aggregates weekly shopping circulars,\n", + "so consumers get deals and discounts without clipping coupons. Siloed\n", + "customer data sources once made getting insights difficult. Now with\n", + "Databricks, Flipp’s data teams can access and democratize data, helping\n", + "them do their jobs more effectively while bringing better deals to users,\n", + "more meaningful insights to partners, and a 10% jump in foot traffic to\n", + "brick-and-mortar retailers.\n", + "\n", + "Get the full story\n", + "\n", + "The engines we use to serve content based on customer preferences are known\n", + "as recommenders. With some recommenders, a heavy focus on the shared\n", + "preferences of similar customers helps define what recommendations will actually\n", + "make an impact. With others, it can be more useful to focus on the properties of\n", + "the content itself (e.g., product descriptions).\n", + "\n", + "\n", + "-----\n", + "\n", + "### Building a Direct Path to Winning the Minds and Wallets of Your Customers\n", + "\n", + "\n", + "Providing deep, effective personalized experiences to customers depends\n", + "on a brand’s ability to intelligently leverage consumer and market data from a\n", + "wide variety of sources to fuel faster, smarter decisions — without sacrificing\n", + "accuracy for speed. The Databricks Lakehouse Platform is purpose-built for\n", + "exactly that, offering a scalable data architecture that unifies all your data,\n", + "analytics and AI to deliver unforgettable customer experiences.\n", + "\n", + "Created on open source and open standards, Databricks offers a robust\n", + "and cost-effective platform for brands to collaborate with partners, clients,\n", + "manufacturers and distributors to unleash more innovation and efficiencies\n", + "at every touch point. Businesses can rapidly ingest available data in real time,\n", + "\n", + "\n", + "at scale, and create accessible, data-driven insights that enable actionable\n", + "strategies across the value chain.\n", + "\n", + "Databricks is a multicloud platform, designed for quick enterprise development.\n", + "Teams using the Lakehouse can more effectively reveal the 360-degree view into\n", + "their company’s operational health and the evolving needs of their customers\n", + "— all while empowering teams to easily unify data efforts, perform fine-grained\n", + "analyses and streamline cross-functional data operations using a single,\n", + "sophisticated solution.\n", + "\n", + "\n", + "###### Learn more about Databricks Lakehouse for industries\n", + " like Retail & Consumer Goods, Media & Entertainment\n", + " and more at databricks.com/solutions\n", + "\n", + "\n", + "-----\n", + "\n", + "### About Databricks\n", + "\n", + "Databricks is the data and AI company. More than 7,000 organizations worldwide —\n", + "\n", + "including Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n", + "\n", + "the Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n", + "\n", + "is headquartered in San Francisco, with offices around the globe. 
Founded by\n", + "\n", + "the original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n", + "\n", + "a mission to help data teams solve the world’s toughest problems. To learn more,\n", + "\n", + "follow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) ., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf))yesThe response states that 'According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.' The retrieved context includes a statement: 'A study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer.' This directly supports the percentage and the context provided in the response.yesThe ground truth states that '76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.' The retrieved context includes a statement that 'A study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer.' This directly supports the percentage and context provided in the ground truth.List(yes, yes, yes, no, yes)1.0
7bf67f12c5d95da350ca553480cfdc9af32b7ccbf14f70f9ce8f4706b04e96e0
  request: "What is the issue with the provided document?" (user)
  expected_retrieved_context: "Unfortunately, the document does not contain any readable content." (doc_uri: /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf)
  expected_facts: "The document lacks readable content."
  response: "I'm sorry, I can't help you with that."
  trace: tr-f5bada305fde446d9fce05dc746722ef (experiment 2822477370659093, status OK, execution_time_ms 558). Spans: agent/predict -> get_messages_array -> parse_input (extract_user_query_string, extract_chat_history) -> recursively_call_and_run_tools -> Completions on databricks-meta-llama-3-3-70b-instruct (temperature 0.01, max_tokens 1500) with the tools search_product_docs and casaman_ssa__demos__sku_sample_translator available; the model answered directly without calling a tool.
  overall_assessment: no. [correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. **Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.
  safety: yes. No harmful content detected in response.
  correctness: no. The expected response states that the document lacks readable content. The response contains a single sentence: "What is the issue with the provided document? - I'm sorry, I can't help you with that." This sentence is readable and understandable. Therefore, the response does contain readable content. The response is not correct.
  metrics: latency 0.558 s; tokens 1148 prompt / 13 completion / 1161 total; remaining columns null.
66725804819c75f5e3005072cb81414f01272d64b1b0a8ea89a58392599b1ff7List(List(List(What are the key features and advantages of the lakehouse pattern?, user)))List(List(“The lakehouse pattern represents a paradigm shift from traditional on-premises data warehouse systems that are expensive and complex to manage. It uses an open data management architecture that combines the flexibility, cost-efficiency, and scale of data lakes with the data management and ACID semantics of data warehouses. A lakehouse pattern enables data transformation, cleansing, and validation to support both business intelligence and machine learning (ML) users on all data. Lakehouse is cloud-centric and unifies a complete up-to-date data set for teams, allowing collaboration across an organization.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf))List(The lakehouse pattern has an open data management architecture., It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics., It supports data transformation, cleansing, and validation., The lakehouse pattern is cloud-centric., It enhances support for both business intelligence and machine learning., It is cost-efficient., It offers an up-to-date unified data set., It improves collaboration across the organization.){\"info\": {\"request_id\": \"tr-7807384ebe5b4a52a2e92e5977280592\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852725, \"execution_time_ms\": 9673, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the key features and ad...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. 
Schema enforcement and governance: The lakehouse supports sche...\"}, \"tags\": {\"eval.requestId\": \"a86697d5-8497-4e80-9dbd-1abc4d040fbb\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-7807384ebe5b4a52a2e92e5977280592/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xc190626ad1788551\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": null, \"start_time\": 1734543852725148426, \"end_time\": 1734543862398726898, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the key features and ad...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. 
Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. 
Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. 
Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. 
However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. 
Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. 
Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. 
Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open 
platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. 
[Example notebook output, condensed: MLflow trace tr-7807384ebe5b4a52a2e92e5977280592, recorded for the request "What are the key features and advantages of the lakehouse pattern?"]

- get_messages_array and parse_input (with extract_user_query_string and extract_chat_history) parse the request; the chat history is empty.
- recursively_call_and_run_tools drives the tool-calling loop (last_max_iter = 10).
- Completions_1 calls databricks-meta-llama-3-3-70b-instruct (tools: search_product_docs, casaman_ssa__demos__sku_sample_translator; tool_choice "auto", temperature 0.01, max_tokens 1500) and returns a tool call to search_product_docs with arguments { "query": "lakehouse pattern features and advantages", "filters": [] }.
- vector_search_retriever queries the index casaman_ssa.demos.test_product_docs_docs_chunked_index__v2 and returns chunks from the Databricks lakehouse / Delta Lake PDFs under /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/.
- The final assistant message summarizes the lakehouse's key features (transaction support, schema enforcement and governance, BI support, storage decoupled from compute, openness, support for diverse data types and workloads, end-to-end streaming) and its advantages (simplified architecture, cost-efficiency, improved collaboration, flexibility and openness, support for diverse data applications).
need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. 
Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027414565, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\"}, \\\"id\\\": \\\"b1f28e2afb30602c0205684eb65002df\\\"}, {\\\"page_content\\\": \\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. 
Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002695809, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"8375eac494bff392a37d6dff7c40c1b1\\\"}, {\\\"page_content\\\": \\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. 
Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025942351, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"accf6ad13717062292245537ffbd0249\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x395812f9e90cd8fa\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xd9a8b578a01925e8\", \"start_time\": 1734543853777614470, \"end_time\": 1734543854277947688, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"lakehouse pattern features and advantages\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\n\\\\nKey Use Cases for Insurance:\\\\n\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\n\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\n\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\n\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\", \\\"5014f5f2c09c55edb470c8b5528eb000\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.00323427], [\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\nof data warehouses and data lakes. 
Lakehouses are enabled by a new system\\\\ndesign, which implements similar data structures and data management features\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\n\\\\n\\\\n-----\\\\n\\\\n##### Data lakehouse\\\\n\\\\nOne platform to unify all your data, analytics and AI workloads\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\nAll machine learning, SQL,\\\\nBI, and streaming use cases\\\\n\\\\nOne security and governance\\\\napproach for all data assets\\\\non all clouds\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key features for a lakehouse**\\\\n\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\narchitectures:\\\\n\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\nmultiple parties concurrently read or write data.\\\\n\\\\nSchema enforcement and governance: The lakehouse should have\\\\na way to support schema enforcement and evolution, supporting data\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\nbe able to reason about data integrity, and it should have robust governance\\\\nand auditing mechanisms.\\\\n\\\\nData governance: Capabilities including auditing, retention and lineage\\\\nhave become essential, particularly considering recent privacy regulations.\\\\n\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\nand data usage metrics.\\\\n\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\nby not having to operationalize two copies of the data in both a data lake\\\\nand a warehouse.\\\\n\\\\n\\\\nStorage decoupled from compute: In practice, this means storage and\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\nhave this property.\\\\n\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\nand Python/R libraries, can efficiently access the data directly.\\\\n\\\\nSupport for diverse data types (unstructured and structured):\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\nneeded for many new data applications, including images, video, audio,\\\\nsemi-structured data and text.\\\\n\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\nof workloads including data science, machine learning and SQL analytics.\\\\nMultiple tools might be needed to support all these workloads.\\\\n\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\n**Learn more**\\\\n\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\n\\\\n\\\\n-----\\\\n\\\\n**CHAPTER**\\\\n\\\\n# 02\\\\n\\\\n\\\\n### The Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Lakehouse: A new generation of open platforms\\\\n\\\\n\\\\n###### This is the lakehouse paradigm\\\", \\\"9cabb87127bfa514fa6f498e9f2831e7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029213156], [\\\"versioning, governance, security and ACID properties that are needed even for\\\\n\\\\nunstructured data.\\\\n\\\\n\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. 
These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"b1f28e2afb30602c0205684eb65002df\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\", 0.0027414565], [\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. 
These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"8375eac494bff392a37d6dff7c40c1b1\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.002695809], [\\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. 
This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"accf6ad13717062292245537ffbd0249\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0025942351]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0x98dd020417f25695\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xd9a8b578a01925e8\", \"start_time\": 1734543854278140591, \"end_time\": 1734543854279068702, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ 
**10**\\\\n\\\\nKey Use Cases for Insurance:\\\\n\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\n\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\n\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\n\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\", \\\"5014f5f2c09c55edb470c8b5528eb000\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.00323427], [\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\ndesign, which implements similar data structures and data management features\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\n\\\\n\\\\n-----\\\\n\\\\n##### Data lakehouse\\\\n\\\\nOne platform to unify all your data, analytics and AI workloads\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\nAll machine learning, SQL,\\\\nBI, and streaming use cases\\\\n\\\\nOne security and governance\\\\napproach for all data assets\\\\non all clouds\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key features for a lakehouse**\\\\n\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\narchitectures:\\\\n\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\nmultiple parties concurrently read or write data.\\\\n\\\\nSchema enforcement and governance: The lakehouse should have\\\\na way to support schema enforcement and evolution, supporting data\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\nbe able to reason about data integrity, and it should have robust governance\\\\nand auditing mechanisms.\\\\n\\\\nData governance: Capabilities including auditing, retention and lineage\\\\nhave become essential, particularly considering recent privacy regulations.\\\\n\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\nand data usage metrics.\\\\n\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\nby not having to operationalize two copies of the data in both a data lake\\\\nand a warehouse.\\\\n\\\\n\\\\nStorage decoupled from compute: In practice, this means storage and\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\nhave this property.\\\\n\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\nand Python/R libraries, can efficiently access the data directly.\\\\n\\\\nSupport for diverse data types (unstructured and structured):\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\nneeded for many new data applications, including images, video, audio,\\\\nsemi-structured data and text.\\\\n\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\nof workloads including data science, machine learning and SQL analytics.\\\\nMultiple tools might be needed to support all these workloads.\\\\n\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\n**Learn more**\\\\n\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\n\\\\n\\\\n-----\\\\n\\\\n**CHAPTER**\\\\n\\\\n# 02\\\\n\\\\n\\\\n### The Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Lakehouse: A new generation of open platforms\\\\n\\\\n\\\\n###### This is the lakehouse paradigm\\\", \\\"9cabb87127bfa514fa6f498e9f2831e7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029213156], [\\\"versioning, governance, security and ACID properties that are needed even for\\\\n\\\\nunstructured data.\\\\n\\\\n\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. 
These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"b1f28e2afb30602c0205684eb65002df\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\", 0.0027414565], [\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. 
These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"8375eac494bff392a37d6dff7c40c1b1\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.002695809], [\\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. 
This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"accf6ad13717062292245537ffbd0249\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0025942351]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\n\\\\nKey Use Cases for Insurance:\\\\n\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\n\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\n\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... 
**16**\\\\n\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.00323427, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\"}, \\\"id\\\": \\\"5014f5f2c09c55edb470c8b5528eb000\\\"}, {\\\"page_content\\\": \\\"In short, a lakehouse is a data architecture that combines the best elements\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\ndesign, which implements similar data structures and data management features\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\n\\\\n\\\\n-----\\\\n\\\\n##### Data lakehouse\\\\n\\\\nOne platform to unify all your data, analytics and AI workloads\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\nAll machine learning, SQL,\\\\nBI, and streaming use cases\\\\n\\\\nOne security and governance\\\\napproach for all data assets\\\\non all clouds\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key features for a lakehouse**\\\\n\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\narchitectures:\\\\n\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\nmultiple parties concurrently read or write data.\\\\n\\\\nSchema enforcement and governance: The lakehouse should have\\\\na way to support schema enforcement and evolution, supporting data\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\nbe able to reason about data integrity, and it should have robust governance\\\\nand auditing mechanisms.\\\\n\\\\nData governance: Capabilities including auditing, retention and lineage\\\\nhave become essential, particularly considering recent privacy regulations.\\\\n\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\nand data usage metrics.\\\\n\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\nby not having to operationalize two copies of the data in both a data lake\\\\nand a warehouse.\\\\n\\\\n\\\\nStorage decoupled from compute: In practice, this means storage and\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\nhave this property.\\\\n\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\nand Python/R libraries, can efficiently access the data directly.\\\\n\\\\nSupport for diverse data types (unstructured and structured):\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\nneeded for many new data applications, including images, video, audio,\\\\nsemi-structured data and text.\\\\n\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\nof workloads including data science, machine learning and SQL analytics.\\\\nMultiple tools might be needed to support all these workloads.\\\\n\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\n**Learn more**\\\\n\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\n\\\\n\\\\n-----\\\\n\\\\n**CHAPTER**\\\\n\\\\n# 02\\\\n\\\\n\\\\n### The Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Lakehouse: A new generation of open platforms\\\\n\\\\n\\\\n###### This is the lakehouse paradigm\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029213156, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"9cabb87127bfa514fa6f498e9f2831e7\\\"}, {\\\"page_content\\\": \\\"versioning, governance, security and ACID properties that are needed even for\\\\n\\\\nunstructured data.\\\\n\\\\n\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. 
Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027414565, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\"}, \\\"id\\\": \\\"b1f28e2afb30602c0205684eb65002df\\\"}, {\\\"page_content\\\": \\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. 
Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. 
Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002695809, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"8375eac494bff392a37d6dff7c40c1b1\\\"}, {\\\"page_content\\\": \\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. 
This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025942351, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"accf6ad13717062292245537ffbd0249\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0x7c29470ff7bb88d3\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0x6867fc1fa868809f\", \"start_time\": 1734543854289631330, \"end_time\": 1734543862392847528, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. 
Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open 
platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. 
Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", 
\\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_2a7072d9-bfdf-4b0e-bedc-665497200461\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543855, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 407, \\\"prompt_tokens\\\": 5406, \\\"total_tokens\\\": 5813, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the key features and ad...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. 
Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... 
**15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. 
Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. 
However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. 
Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. 
Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. 
Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open 
platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. 
Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. 
End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}null/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdfThe key features of the lakehouse pattern include:\n", + "\n", + "1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\n", + "2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\n", + "3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\n", + "4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\n", + "5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\n", + "6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\n", + "7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\n", + "8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\n", + "\n", + "The advantages of the lakehouse pattern include:\n", + "\n", + "1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\n", + "2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\n", + "3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\n", + "4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\n", + "5. 
Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\n", + "\n", + "Overall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.no[correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. **Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.yesNo harmful content detected in responsenoThe expected response states several features and advantages of the lakehouse pattern. The response supports the following parts of the expected response: - 'It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics' is supported by the response mentioning 'The lakehouse combines the best elements of data warehouses and data lakes' and 'Transaction support: ACID transactions ensure consistency'. - 'It supports data transformation, cleansing, and validation' is not explicitly mentioned in the response. - 'The lakehouse pattern is cloud-centric' is not mentioned in the response. - 'It enhances support for both business intelligence and machine learning' is supported by the response mentioning 'BI support' and 'Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics'. - 'It is cost-efficient' is supported by the response mentioning 'Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems'. - 'It offers an up-to-date unified data set' is supported by the response mentioning 'Lakehouses enable using BI tools directly on the source data, reducing staleness and latency'. - 'It improves collaboration across the organization' is supported by the response mentioning 'Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data'. Therefore, the response is not correct.9.6736990.06558.0432.0nullnullList(List(Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\n", + "\n", + "Key Use Cases for Insurance:\n", + "\n", + "**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\n", + "\n", + "**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\n", + "\n", + "**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... 
**16**\n", + "\n", + "**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf), List(In short, a lakehouse is a data architecture that combines the best elements\n", + "of data warehouses and data lakes. Lakehouses are enabled by a new system\n", + "design, which implements similar data structures and data management features\n", + "found in a data warehouse directly on the low-cost storage used for data lakes.\n", + "\n", + "\n", + "-----\n", + "\n", + "##### Data lakehouse\n", + "\n", + "One platform to unify all your data, analytics and AI workloads\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "All machine learning, SQL,\n", + "BI, and streaming use cases\n", + "\n", + "One security and governance\n", + "approach for all data assets\n", + "on all clouds\n", + "\n", + "\n", + "-----\n", + "\n", + "**Key features for a lakehouse**\n", + "\n", + "Recent innovations with the data lakehouse architecture can help simplify\n", + "your data and AI workloads, ease collaboration for data teams, and maintain\n", + "the kind of flexibility and openness that allows your organization to stay agile\n", + "as you scale. Here are key features to consider when evaluating data lakehouse\n", + "architectures:\n", + "\n", + "Transaction support: In an enterprise lakehouse, many data pipelines will\n", + "often be reading and writing data concurrently. Support for ACID (Atomicity,\n", + "Consistency, Isolation and Durability) transactions ensures consistency as\n", + "multiple parties concurrently read or write data.\n", + "\n", + "Schema enforcement and governance: The lakehouse should have\n", + "a way to support schema enforcement and evolution, supporting data\n", + "warehouse schema paradigms such as star/snowflake. The system should\n", + "be able to reason about data integrity, and it should have robust governance\n", + "and auditing mechanisms.\n", + "\n", + "Data governance: Capabilities including auditing, retention and lineage\n", + "have become essential, particularly considering recent privacy regulations.\n", + "\n", + "Tools that allow data discovery have become popular, such as data catalogs\n", + "and data usage metrics.\n", + "\n", + "BI support: Lakehouses allow the use of BI tools directly on the source\n", + "data. This reduces staleness and latency, improves recency and lowers cost\n", + "by not having to operationalize two copies of the data in both a data lake\n", + "and a warehouse.\n", + "\n", + "\n", + "Storage decoupled from compute: In practice, this means storage and\n", + "compute use separate clusters, thus these systems can scale to many more\n", + "concurrent users and larger data sizes. 
Some modern data warehouses also\n", + "have this property.\n", + "\n", + "Openness: The storage formats, such as Apache Parquet, are open and\n", + "standardized, so a variety of tools and engines, including machine learning\n", + "and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "Support for diverse data types (unstructured and structured):\n", + "The lakehouse can be used to store, refine, analyze and access data types\n", + "needed for many new data applications, including images, video, audio,\n", + "semi-structured data and text.\n", + "\n", + "Support for diverse workloads: Use the same data repository for a range\n", + "of workloads including data science, machine learning and SQL analytics.\n", + "Multiple tools might be needed to support all these workloads.\n", + "\n", + "End-to-end streaming: Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "**Learn more**\n", + "\n", + "**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n", + "\n", + "**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n", + "\n", + "**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n", + "\n", + "\n", + "-----\n", + "\n", + "**CHAPTER**\n", + "\n", + "# 02\n", + "\n", + "\n", + "### The Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Lakehouse: A new generation of open platforms\n", + "\n", + "\n", + "###### This is the lakehouse paradigm, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf), List(versioning, governance, security and ACID properties that are needed even for\n", + "\n", + "unstructured data.\n", + "\n", + "\n", + "stored procedures are available, but users may need to employ other mechanisms that\n", + "\n", + "\n", + "aren’t equivalent to those found in traditional data warehouses. The latter is particularly\n", + "\n", + "important for “lift and shift scenarios,” which require systems that achieve semantics\n", + "\n", + "that are almost identical to those of older, commercial data warehouses.\n", + "\n", + "What about support for other types of data applications? Users of a lakehouse have\n", + "\n", + "access to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n", + "\n", + "libraries) for non-BI workloads like data science and machine learning. Data\n", + "\n", + "exploration and refinement are standard for many analytic and data science\n", + "\n", + "applications. Delta Lake is designed to let users incrementally improve the quality of\n", + "\n", + "\n", + "Current lakehouses reduce cost, but their performance can still lag specialized\n", + "\n", + "systems (such as data warehouses) that have years of investments and real-\n", + "\n", + "world deployments behind them. 
Users may favor certain tools (BI tools, IDEs,\n", + "\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "\n", + "and other issues will be addressed as the technology continues to mature and\n", + "\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "\n", + "diverse data applications.\n", + "\n", + "\n", + "data in their lakehouse until it is ready for consumption.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the Inner Workings**\n", + "**of the Lakehouse and Delta Lake**\n", + "\n", + "### CHAPTER 02\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "# 02\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "\n", + "paper that describes some of the core technological challenges and solutions that\n", + "\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. You\n", + "\n", + "can read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "\n", + "[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "\n", + "storage, this pattern has been playing out for years. Vendors continue to try to reinvent\n", + "\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "\n", + "object stores like Amazon S3 have become some of the largest and most cost-\n", + "\n", + "effective storage systems in the world, which makes them an attractive platform to\n", + "\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf), List(Current lakehouses reduce cost, but their performance can still lag specialized\n", + "systems (such as data warehouses) that have years of investments and realworld deployments behind them. 
Users may favor certain tools (BI tools, IDEs,\n", + "notebooks) over others so lakehouses will also need to improve their UX and their\n", + "connectors to popular tools so they can appeal to a variety of personas. These\n", + "and other issues will be addressed as the technology continues to mature and\n", + "develop. Over time, lakehouses will close these gaps while retaining the core\n", + "properties of being simpler, more cost-efficient and more capable of serving\n", + "diverse data applications.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Diving Deep Into the**\n", + "**Inner Workings of the**\n", + "**Lakehouse and Delta Lake**\n", + "\n", + "Databricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n", + "adopting the lakehouse pattern. The blog created a massive amount of interest\n", + "from technology enthusiasts. While lots of people praised it as the next-generation\n", + "data architecture, some people thought the lakehouse is the same thing as\n", + "the data lake. Recently, several of our engineers and founders wrote a research\n", + "paper that describes some of the core technological challenges and solutions that\n", + "set the lakehouse architecture apart from the data lake, and it was accepted and\n", + "published at the International Conference on Very Large Databases (VLDB) 2020. You\n", + "can read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n", + "[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n", + "\n", + "Henry Ford is often credited with having said, “If I had asked people what they wanted,\n", + "they would have said faster horses.” The crux of this statement is that people often\n", + "envision a better solution to a problem as an evolution of what they already know\n", + "rather than rethinking the approach to the problem altogether. In the world of data\n", + "storage, this pattern has been playing out for years. Vendors continue to try to reinvent\n", + "the old horses of data warehouses and data lakes rather than seek a new solution.\n", + "\n", + "\n", + "-----\n", + "\n", + "More than a decade ago, the cloud opened a new frontier for data storage. Cloud\n", + "object stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\n", + "store data warehouses and data lakes. However, their nature as key-value stores\n", + "makes it difficult to achieve ACID transactions that many organizations require. Also,\n", + "performance is hampered by expensive metadata operations (e.g., listing objects)\n", + "and limited consistency guarantees.\n", + "\n", + "Based on the characteristics of cloud object stores, three approaches have emerged.\n", + "\n", + "**1. Data lakes**\n", + "The first is directories of files (i.e., data lakes) that store the table as a collection\n", + "of objects, typically in columnar format such as Apache Parquet. It’s an attractive\n", + "approach because the table is just a group of objects that can be accessed from\n", + "a wide variety of tools without a lot of additional data stores or systems. However,\n", + "both performance and consistency problems are common. 
Hidden data corruption\n", + "is common due to failed transactions, eventual consistency leads to inconsistent\n", + "queries, latency is high, and basic management capabilities like table versioning and\n", + "audit logs are unavailable.\n", + "\n", + "**2. Custom storage engines**\n", + "The second approach is custom storage engines, such as proprietary systems built for\n", + "the cloud like the Snowflake data warehouse. These systems can bypass the consistency\n", + "challenges of data lakes by managing the metadata in a separate, strongly consistent\n", + "service that’s able to provide a single source of truth. However, all I/O operations need\n", + "to connect to this metadata service, which can increase cloud resource costs and\n", + "reduce performance and availability. Additionally, it takes a lot of engineering work to\n", + "implement connectors to existing computing engines like Apache Spark, TensorFlow\n", + "and PyTorch, which can be challenging for data teams that use a variety of computing\n", + "engines on their data. Engineering challenges can be exacerbated by unstructured\n", + "data because these systems are generally optimized for traditional structured\n", + "\n", + "\n", + "-----\n", + "\n", + "data types. Finally, and most egregiously, the proprietary metadata service locks\n", + "customers into a specific service provider, leaving customers to contend with\n", + "consistently high prices and expensive, time-consuming migrations if they decide to\n", + "adopt a new approach later., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf), List(- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\n", + "be reading and writing data concurrently. Support for ACID transactions ensures\n", + "consistency as multiple parties concurrently read or write data, typically using SQL.\n", + "\n", + "\n", + "and batch and streaming jobs. For these reasons, many of the promises of data lakes\n", + "have not materialized and, in many cases, lead to a loss of many of the benefits of data\n", + "warehouses.\n", + "\n", + "The need for a flexible, high-performance system hasn’t abated. Companies\n", + "require systems for diverse data applications including SQL analytics, real-time\n", + "monitoring, data science and machine learning. Most of the recent advances in\n", + "AI have been in better models to process unstructured data (text, images, video,\n", + "audio), but these are precisely the types of data that a data warehouse is not\n", + "optimized for.\n", + "\n", + "A common approach is to use multiple systems — a data lake, several data\n", + "warehouses, and other specialized systems such as streaming, time-series, graph\n", + "and image databases. Having a multitude of systems introduces complexity and,\n", + "more importantly, introduces delay as data professionals invariably need to move\n", + "or copy data between different systems.\n", + "\n", + "\n", + "-----\n", + "\n", + "**\u0007Schema enforcement and governance:** The lakehouse should have a way to\n", + "support schema enforcement and evolution, supporting DW schema paradigms\n", + "such as star/snowflake-schemas. 
The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\n", + "[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\n", + "\n", + "- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\n", + "reduces staleness and improves recency, reduces latency and lowers the cost of\n", + "having to operationalize two copies of the data in both a data lake and a warehouse.\n", + "\n", + "- **\u0007Storage is decoupled from compute:** In practice, this means storage and compute\n", + "use separate clusters, thus these systems are able to scale to many more\n", + "concurrent users and larger data sizes. Some modern data warehouses also have\n", + "this property.\n", + "\n", + "- **\u0007Openness:** The storage formats they use are open and standardized, such as\n", + "Parquet, and they provide an API so a variety of tools and engines, including\n", + "machine learning and Python/R libraries, can efficiently access the data directly.\n", + "\n", + "- **\u0007Support for diverse data types ranging from unstructured to structured data:**\n", + "The lakehouse can be used to store, refine, analyze and access data types needed\n", + "for many new data applications, including images, video, audio, semi-structured\n", + "data, and text.\n", + "\n", + "- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\n", + "analytics. Multiple tools might be needed to support all these workloads, but they all\n", + "rely on the same data repository.\n", + "\n", + "- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\n", + "Support for streaming eliminates the need for separate systems dedicated to\n", + "serving real-time data applications.\n", + "\n", + "These are the key attributes of lakehouses. Enterprise-grade systems require additional\n", + "features. Tools for security and access control are basic requirements. Data governance\n", + "capabilities including auditing, retention and lineage have become essential particularly\n", + "in light of recent privacy regulations. Tools that enable data discovery such as data\n", + "catalogs and data usage metrics are also needed. With a lakehouse, such enterprise\n", + "features only need to be implemented, tested and administered for a single system.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Read the research**\n", + "**Delta Lake: High-Performance ACID**\n", + "**Table Storage Over Cloud Object Stores**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf))yesThe response lists several key features and advantages of the lakehouse pattern. The retrieved context supports the following features: 1. Transaction support: ACID transactions are mentioned in the retrieved context. 2. Schema enforcement and governance: The retrieved context mentions schema enforcement, evolution, and robust governance mechanisms. 3. BI support: The retrieved context states that lakehouses enable using BI tools directly on the source data. 4. Storage decoupled from compute: The retrieved context mentions that storage and compute use separate clusters. 5. Openness: The retrieved context mentions open and standardized storage formats like Parquet. 6. 
Support for diverse data types: The retrieved context mentions support for various data types, including images, video, audio, semi-structured data, and text. 7. Support for diverse workloads: The retrieved context mentions support for data science, machine learning, and SQL analytics. 8. End-to-end streaming: The retrieved context mentions support for streaming and real-time reports. The advantages listed in the response are also supported by the retrieved context: 1. Simplified data architecture: The retrieved context mentions that a lakehouse combines the best elements of data warehouses and data lakes. 2. Cost-efficiency: The retrieved context mentions that lakehouses reduce costs. 3. Improved collaboration: The retrieved context mentions easing collaboration for data teams. 4. Flexibility and openness: The retrieved context mentions flexibility and openness. 5. Support for diverse data applications: The retrieved context mentions support for a range of data applications. Therefore, the response is fully supported by the retrieved context.yesThe ground truth states several key features and advantages of the lakehouse pattern. The retrieved context supports the following points: - The lakehouse pattern has an open data management architecture (retrieved context mentions openness and standardized storage formats). - It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics (retrieved context mentions combining the best elements of data lakes and data warehouses, and support for ACID transactions). - It supports data transformation, cleansing, and validation (retrieved context mentions data exploration and refinement). - The lakehouse pattern is cloud-centric (retrieved context mentions cloud object stores like Amazon S3). - It enhances support for both business intelligence and machine learning (retrieved context mentions support for BI tools, machine learning, and diverse workloads). - It is cost-efficient (retrieved context mentions reducing cost). - It offers an up-to-date unified data set (retrieved context mentions reducing staleness and improving recency). - It improves collaboration across the organization (retrieved context mentions easing collaboration for data teams). Therefore, all parts of the ground truth are supported by the retrieved context.List(no, no, no, no, no)0.0
fdc9f7bcc67a7cc4785f7f8df330c023da14c8d768905f0ad46af5323f28e480List(List(List(What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?, user)))List(List(“In a case study published in MIT Technology Review, profit margins increased and manufacturing time was reduced when digital-twin technology was implemented. Automobile manufacturing profit margins increased by 41% to 54% per model. The estimated average automobile manufacturing time was reduced to approximately 10 hours.”, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf))List(The increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%., The reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology.){\"info\": {\"request_id\": \"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543860485, \"execution_time_ms\": 5265, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately...\"}, \"tags\": {\"eval.requestId\": \"a7649f7f-778e-434b-b85a-04318b6b1e61\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-2ac9b1075b2348d9be8ede5ab3f84e4e/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x00ac36763a70235c\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": null, \"start_time\": 1734543860485592707, \"end_time\": 1734543865751173309, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", 
\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. 
The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated 
by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product 
designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. 
Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xa6b201baab85793c\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x00ac36763a70235c\", \"start_time\": 1734543860521575844, \"end_time\": 1734543860528497728, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x2d706fe1d2c901dd\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x00ac36763a70235c\", \"start_time\": 1734543860528634729, \"end_time\": 1734543860529225237, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x7549ef8340c7b553\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x2d706fe1d2c901dd\", \"start_time\": 1734543860528803631, \"end_time\": 1734543860528986334, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time 
for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xcdbb6bf762090d25\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x2d706fe1d2c901dd\", \"start_time\": 1734543860529056335, \"end_time\": 1734543860529191236, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x099d6f4de27e7c6d\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x00ac36763a70235c\", \"start_time\": 1734543860529288237, \"end_time\": 1734543865750738504, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory 
challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x24f475d091bd7546\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x099d6f4de27e7c6d\", \"start_time\": 1734543860608406097, \"end_time\": 1734543861698696326, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. 
**Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_3e00c780-f6a0-4bea-8e51-22c79e244794\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543860, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 29, \\\"prompt_tokens\\\": 1163, \\\"total_tokens\\\": 1192, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x68973af5476e3ebb\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, 
\"parent_id\": \"0x099d6f4de27e7c6d\", \"start_time\": 1734543861711288479, \"end_time\": 1734543862593922839, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\", \\\"filters\\\": []}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004522661, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\"}, {\\\"page_content\\\": \\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004403091, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\"}, {\\\"page_content\\\": \\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as 
iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0043494906, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"929aec8a6e41f875b04a8fd58c7e9553\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. 
In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. 
The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004202808, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"33042520bb456fb0730d8ed53528a953\\\"}, {\\\"page_content\\\": \\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ 
**09**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0036511072, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"77fa3ca534959648d7a8e5eebca4d12e\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x747651a6257c725a\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x68973af5476e3ebb\", \"start_time\": 1734543861712386292, \"end_time\": 1734543862592567023, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. 
The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004522661], [\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated 
by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004403091], [\\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption 
reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"929aec8a6e41f875b04a8fd58c7e9553\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0043494906], [\\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. 
Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"33042520bb456fb0730d8ed53528a953\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004202808], [\\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ 
**09**\\\", \\\"77fa3ca534959648d7a8e5eebca4d12e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0036511072]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0xa692d8258db6db26\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x68973af5476e3ebb\", \"start_time\": 1734543862592802126, \"end_time\": 1734543862593607336, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004522661], [\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004403091], [\\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. 
All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"929aec8a6e41f875b04a8fd58c7e9553\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0043494906], [\\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. 
In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"33042520bb456fb0730d8ed53528a953\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004202808], [\\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ 
**03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\", \\\"77fa3ca534959648d7a8e5eebca4d12e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0036511072]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004522661, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\"}, {\\\"page_content\\\": \\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004403091, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\"}, {\\\"page_content\\\": \\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as 
iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0043494906, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"929aec8a6e41f875b04a8fd58c7e9553\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. 
In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. 
The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004202808, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"33042520bb456fb0730d8ed53528a953\\\"}, {\\\"page_content\\\": \\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ 
**09**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0036511072, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"77fa3ca534959648d7a8e5eebca4d12e\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0xfb2e6f01a480fe00\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x099d6f4de27e7c6d\", \"start_time\": 1734543862609431325, \"end_time\": 1734543865748167773, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. 
The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated 
by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product 
designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. 
Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_12ef0ccb-1f7c-4148-993d-debb77eccddf\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543863, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 48, \\\"prompt_tokens\\\": 4959, \\\"total_tokens\\\": 5007, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when 
digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}null/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdfAccording to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.yesnullyesNo harmful content detected in responseyesThe expected response states that the increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%. The response confirms this by stating that the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model. The expected response also states that the reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology. The response supports this by stating that there was a reduction in manufacturing time of approximately 10 hours. The response is correct.5.2656199.06122.077.0nullnullList(List(Improve product quality\n", + "\n", + "Reduce manufacturing costs\n", + "\n", + "Reduce unplanned downtime\n", + "\n", + "Increase throughput\n", + "\n", + "Ensure safe manufacturing\n", + "\n", + "Test new design ideas\n", + "\n", + "Develop product enhancements\n", + "\n", + "Digital transformation of enterprise\n", + "\n", + "Speed new product introduction\n", + "\n", + "Reduce planned downtime\n", + "\n", + "Meet new regulatory challenges\n", + "\n", + "Training for new manufacturing processes\n", + "\n", + "Design changes to production line\n", + "\n", + "Provide service to end users customers\n", + "\n", + "Update products in the field\n", + "\n", + "\n", + "**34%**\n", + "\n", + "\n", + "**30%**\n", + "\n", + "**28%**\n", + "**25%**\n", + "\n", + "**24%**\n", + "\n", + "\n", + "**16%**\n", + "\n", + "**14%**\n", + "\n", + "**13%**\n", + "\n", + "**13%**\n", + "\n", + "**11%**\n", + "**10%**\n", + "\n", + "**8%**\n", + "**8%**\n", + "\n", + "\n", + "Can you imagine the cost to change\n", + "an oil refinery’s crude distillation\n", + "unit process conditions to improve\n", + "the output of diesel one week\n", + "and gasoline the next to address\n", + "changes in demand and ensure\n", + "maximum economic value? 
Can you\n", + "imagine how to replicate an even\n", + "simple supply chain to model risk?\n", + "\n", + "\n", + "**5%**\n", + "\n", + "\n", + "**1%**\n", + "\n", + "\n", + "-----\n", + "\n", + "### What Are Digital Twins?\n", + "\n", + "\n", + "Knowing the business challenges and benefits digital twins deliver, let’s turn to\n", + "the basics and explore what digital twins are and how a modern data stack is\n", + "necessary to build effective and timely digital twins. The classic definition of\n", + "digital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .”\n", + "\n", + "\n", + "For a discrete or continuous manufacturing process, a digital twin gathers system\n", + "and processes state data with the help of various IoT sensors [operational\n", + "technology data (OT)] and enterprise data [informational technology (IT)] to form a\n", + "virtual model which is then used to run simulations, study performance issues and\n", + "generate possible insights.\n", + "\n", + "\n", + "**Types of Digital Twins**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twin Architectures\n", + "\n", + "Classic digital twins have been physics-based models of specific systems. More recently,\n", + "**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\n", + "\n", + "\n", + "These twins provide the opportunity to not just monitor and simulate system performance under specific\n", + "conditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\n", + "the industrial environment.\n", + "\n", + "Digital twins undergo a series of changes during their lifecycle to become completely autonomous.\n", + "\n", + "**Data-Driven Operational Digital Twins: Maturity Journey**\n", + "\n", + "**AI**\n", + "\n", + "Simulate & Optimize\n", + "\n", + "\n", + "**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "\n", + "# 6-8 18-24\n", + "## years to months\n", + "\n", + "\n", + "**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n", + "**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf), List(**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 10%\n", + "\n", + "\n", + "**[Time to 
market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 50%\n", + "\n", + "\n", + "**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "\n", + "**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "# 25%\n", + "\n", + "\n", + "**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n", + "\n", + "\n", + "-----\n", + "\n", + "**Introduction (continued)**\n", + "\n", + "\n", + "**Digital twin market growth rate accelerates**\n", + "\n", + "Digital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\n", + "is forecasted to reach $48 billion in 2026. 
This figure is up from $3.1 billion in 2020\n", + "at a CAGR of 58%, riding on the wave of Industry 4.0.\n", + "\n", + "\n", + "**But challenges remain**\n", + "\n", + "The most common challenges faced by the manufacturing industry that digital\n", + "twins are addressing include:\n", + "\n", + "**•** Product designs are more complex, resulting in higher cost and increasingly\n", + "longer development times\n", + "\n", + "**•** The supply chain is opaque\n", + "\n", + "**•** Production lines are not optimized – performance variations, unknown defects\n", + "and the projection of operating cost is obscure\n", + "\n", + "**•** Poor quality management – overreliance on theory, managed by\n", + "individual departments\n", + "\n", + "**•** Reactive maintenance costs are too high, resulting in excessive downtime or\n", + "process disruptions\n", + "\n", + "**•** Incongruous collaborations between departments\n", + "\n", + "**•** Invisibility of customer demand for gathering real-time feedback\n", + "\n", + "\n", + "The growth rate for digital twins is staggering with common adoption reported\n", + "to be in the 25-40% CAGR growth rate.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twins Bring Broad Benefits to Manufacturing, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf), List(**But challenges remain**\n", + "\n", + "The most common challenges faced by the manufacturing industry that digital\n", + "twins are addressing include:\n", + "\n", + "**•** Product designs are more complex, resulting in higher cost and increasingly\n", + "longer development times\n", + "\n", + "**•** The supply chain is opaque\n", + "\n", + "**•** Production lines are not optimized – performance variations, unknown defects\n", + "and the projection of operating cost is obscure\n", + "\n", + "**•** Poor quality management – overreliance on theory, managed by\n", + "individual departments\n", + "\n", + "**•** Reactive maintenance costs are too high, resulting in excessive downtime or\n", + "process disruptions\n", + "\n", + "**•** Incongruous collaborations between departments\n", + "\n", + "**•** Invisibility of customer demand for gathering real-time feedback\n", + "\n", + "\n", + "The growth rate for digital twins is staggering with common adoption reported\n", + "to be in the 25-40% CAGR growth rate.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Digital Twins Bring Broad Benefits to Manufacturing\n", + "\n", + "Industry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\n", + "would have come at significant costs without digital twin technology.\n", + "\n", + "**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**\n", + "\n", + "\n", + "\n", + "**•** Product design and development is performed with\n", + "less cost and is completed in less time as iterative\n", + "simulations, using multiple constraints, deliver the\n", + "best or most optimized design. All commercial\n", + "aircraft are designed using digital twins.\n", + "\n", + "**•** Digital twins provide the awareness of how long\n", + "inventory will last, when to replenish and how to\n", + "minimize the supply chain disruptions. 
The oil and gas\n", + "industry, for example, uses supply chain–oriented\n", + "digital twins to reduce supply chain bottlenecks in\n", + "storage and midstream delivery, schedule tanker\n", + "off-loads and model demand with externalities.\n", + "\n", + "\n", + "\n", + "**•** Continuous quality checks on produced items\n", + "with ML/AI generated feedback pre-emptively\n", + "assuring improved product quality. Final paint\n", + "inspection in the automotive industry, for example,\n", + "is performed with computer vision built on top of\n", + "digital twin technology.\n", + "\n", + "**•** Striking the sweet spot between when to replace\n", + "a part before the process degrades or breaks\n", + "down and utilizing the components to their fullest,\n", + "digital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\n", + "building an asset performance management suite.\n", + "\n", + "\n", + "\n", + "**•** Digital twins create the opportunity to have\n", + "multiple departments in sync by providing\n", + "necessary instructions modularly to attain\n", + "a required throughput. Digital twins are the\n", + "backbone of kaizen events that optimize\n", + "manufacturing process flow.\n", + "\n", + "**•** Customer feedback loops can be modeled through\n", + "inputs, from point of sale customer behavior,\n", + "buying preferences, or product performance and\n", + "then integrated into the product development\n", + "process, forming a closed loop providing an\n", + "improved product design.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\n", + "\n", + "The top four use cases are heavily focused on operational processes and are typically the first to be deployed\n", + "in manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\n", + "deployment, but typically offer higher and longer-lasting value.\n", + "\n", + "**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\n", + "\n", + "\n", + "Improve product quality\n", + "\n", + "Reduce manufacturing costs\n", + "\n", + "Reduce unplanned downtime\n", + "\n", + "Increase throughput\n", + "\n", + "Ensure safe manufacturing\n", + "\n", + "Test new design ideas\n", + "\n", + "Develop product enhancements\n", + "\n", + "Digital transformation of enterprise\n", + "\n", + "Speed new product introduction\n", + "\n", + "Reduce planned downtime\n", + "\n", + "Meet new regulatory challenges\n", + "\n", + "Training for new manufacturing processes\n", + "\n", + "Design changes to production line\n", + "\n", + "Provide service to end users customers\n", + "\n", + "Update products in the field\n", + "\n", + "\n", + "**34%**\n", + "\n", + "\n", + "**30%**\n", + "\n", + "**28%**\n", + "**25%**\n", + "\n", + "**24%**\n", + "\n", + "\n", + "**16%**\n", + "\n", + "**14%**\n", + "\n", + "**13%**\n", + "\n", + "**13%**\n", + "\n", + "**11%**\n", + "**10%**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf), List(-----\n", + "\n", + "### Introduction\n", + "\n", + "\n", + "The concept of digital twins is not new. 
In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\n", + "over 25 years ago, during the early phases of foundation and cofferdam construction for the\n", + "London Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\n", + "the years since this first application, edge computing, AI, data connectivity, 5G connectivity\n", + "and the improvements of the Internet of Things (IoT) have enabled digital twins to become\n", + "cost-effective and are now an imperative in today’s data-driven businesses.\n", + "\n", + "Today’s manufacturing industries are expected to streamline and optimize all the processes in their value\n", + "chain from product development and design, through operations and supply chain optimization to obtaining\n", + "feedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\n", + "and is addressing a multitude of challenges within manufacturing, logistics and transportation.\n", + "\n", + "\n", + "[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[“profit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n", + "[approximately 10 hours.”](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf), /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf), List(**eBook**\n", + "\n", + "# Making Your Digital Twin Come to Life\n", + "\n", + "##### With the Lakehouse for Manufacturing and Tredence\n", + "\n", + "\n", + "-----\n", + "\n", + "### Contents\n", + "\n", + "Introduction ................................................................................................................................................................................................................ 
**03**\n", + "\n", + "Digital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\n", + "\n", + "What Are Digital Twins? ........................................................................................................................................................................................ **07**\n", + "\n", + "Digital Twin Architectures .................................................................................................................................................................................. **08**\n", + "\n", + "How to Build a Digital Twin ................................................................................................................................................................................ **09**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf))yesThe response states that the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours. The retrieved context confirms this by stating, 'Automobile manufacturing profit margins increased by 41% to 54% per model. The estimated average automobile manufacturing time was reduced to approximately 10 hours.' Therefore, the response is fully supported by the retrieved context.yesThe ground truth states two points: 1) The increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%, and 2) The reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology. The retrieved context supports both points. It mentions that 'Automobile manufacturing profit margins increased by 41% to 54% per model' and 'The estimated average automobile manufacturing time was reduced to approximately 10 hours.' Therefore, both parts of the ground truth are supported by the retrieved context.List(yes, yes, yes, yes, yes)1.0
4517d3fb5f3f1e83efba44630c78e94b18db1d8f94f780810adb3550c851891bList(List(List(What are the responsibilities of a Data Engineer according to the document?, user)))List(List(We distinguish between the following personas:\n", + "Data Governance Officer: Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.\n", + "Data Engineer: Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.\n", + "Data Scientist: Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.\n", + "ML Engineer: Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment (CI/CD).\n", + "Business Stakeholder: Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf))List(A Data Engineer is responsible for building data pipelines., A Data Engineer is responsible for processing data sets., A Data Engineer is responsible for organizing data sets., A Data Engineer is responsible for persisting data sets., The responsibilities support machine learning and other downstream applications.){\"info\": {\"request_id\": \"tr-0120829e6be846c588aaf60616a84091\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543861615, \"execution_time_ms\": 8860, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. 
Setting up and maintaining ETL pipelines to convert source data into actionable data fo...\"}, \"tags\": {\"eval.requestId\": \"fb8a507b-50e1-4175-ae70-cd532a4da48a\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-0120829e6be846c588aaf60616a84091/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x322d3f08ebd83245\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": null, \"start_time\": 1734543861615056611, \"end_time\": 1734543870475653956, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. 
They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. 
This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. 
\\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. 
Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. 
And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. 
What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address 
errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. 
They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xd22480b5c323e525\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x322d3f08ebd83245\", \"start_time\": 1734543861626761953, \"end_time\": 1734543861637333882, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xc38dd7930788fc7a\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x322d3f08ebd83245\", \"start_time\": 1734543861637470683, \"end_time\": 1734543861638090491, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x69c466d43cc8cb45\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc38dd7930788fc7a\", \"start_time\": 1734543861637639085, \"end_time\": 1734543861637835188, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are the responsibilities of a Data Engineer according to the document?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xa78642c809ff4e5c\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc38dd7930788fc7a\", \"start_time\": 1734543861637912989, \"end_time\": 1734543861638056690, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": 
\"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x4f95a72fa7bfa0d4\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x322d3f08ebd83245\", \"start_time\": 1734543861638154992, \"end_time\": 1734543870475145150, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. 
There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. 
It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. 
Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. 
What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address 
errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. 
Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0xe8a5ad56163ea90b\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x4f95a72fa7bfa0d4\", \"start_time\": 1734543861851646682, \"end_time\": 1734543864504840560, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. 
**Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_55c922a5-e0d1-4855-8c4f-98f0d7977f31\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543863, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 22, \\\"prompt_tokens\\\": 1152, \\\"total_tokens\\\": 1174, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0xc3270e988b4fcff6\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x4f95a72fa7bfa0d4\", \"start_time\": 1734543864509030410, \"end_time\": 1734543864848978788, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", 
\"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"Data Engineer responsibilities\\\", \\\"filters\\\": []}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. 
Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003443227, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\"}, \\\"id\\\": \\\"1ce1d861d15136fd48438be91479e567\\\"}, {\\\"page_content\\\": \\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. 
Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0033119193, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"e577e0ac294ad34249c7d000936d7c72\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. 
There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032034456, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... 
**13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0030519078, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\"}, \\\"id\\\": \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\"}, {\\\"page_content\\\": \\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. 
On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029978286, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"9f81ac0b52802c7152247bfd5289b744\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x21a63398a53168a6\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc3270e988b4fcff6\", \"start_time\": 1734543864513960069, \"end_time\": 1734543864847612271, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"Data Engineer responsibilities\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. 
They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"1ce1d861d15136fd48438be91479e567\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\", 0.003443227], [\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. 
Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"e577e0ac294ad34249c7d000936d7c72\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0033119193], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. 
What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.0032034456], [\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\", 0.0030519078], [\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and 
use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"9f81ac0b52802c7152247bfd5289b744\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029978286]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0x3539119435fb9b51\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc3270e988b4fcff6\", \"start_time\": 1734543864847785474, \"end_time\": 1734543864848655784, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. 
They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"1ce1d861d15136fd48438be91479e567\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\", 0.003443227], [\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. 
Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"e577e0ac294ad34249c7d000936d7c72\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0033119193], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. 
What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.0032034456], [\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\", 0.0030519078], [\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and 
use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"9f81ac0b52802c7152247bfd5289b744\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029978286]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. 
This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003443227, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\"}, \\\"id\\\": \\\"1ce1d861d15136fd48438be91479e567\\\"}, {\\\"page_content\\\": \\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. 
Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0033119193, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"e577e0ac294ad34249c7d000936d7c72\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. 
There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032034456, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... 
**13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0030519078, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\"}, \\\"id\\\": \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\"}, {\\\"page_content\\\": \\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. 
On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029978286, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"9f81ac0b52802c7152247bfd5289b744\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0xe94822ce94a683c1\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x4f95a72fa7bfa0d4\", \"start_time\": 1734543864862687552, \"end_time\": 1734543870466981749, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. 
This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. 
On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_c53ebd3b-0432-428d-af54-e70dd4e12eec\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. 
They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543865, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 188, \\\"prompt_tokens\\\": 5334, \\\"total_tokens\\\": 5522, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. 
There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. 
It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. 
Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. 
What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address 
errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. 
Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}null/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdfThe responsibilities of a Data Engineer include:\n", + "\n", + "1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\n", + "2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\n", + "3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\n", + "4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\n", + "\n", + "Data Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.no[correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. **Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.yesNo harmful content detected in responsenoThe expected response states that a Data Engineer is responsible for building data pipelines, processing data sets, organizing data sets, persisting data sets, and supporting machine learning and other downstream applications. The response mentions responsibilities such as setting up and maintaining ETL pipelines, managing workflows, enabling access to usable data, and building systems that convert source data into usable information. However, the response does not explicitly mention 'processing data sets,' 'organizing data sets,' or 'persisting data sets.' Additionally, while it mentions enabling data scientists and analysts, it does not explicitly state that the responsibilities support machine learning and other downstream applications. Therefore, the response is not correct.8.866696.06486.0210.0nullnullList(List(engineering in the gaming industry.\n", + "\n", + "`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n", + "\n", + "first step in your data journey. 
Imagine how the output of\n", + "\n", + "your data can be presented in a way to help stakeholders\n", + "\n", + "across your company achieve more. For example, dropping\n", + "\n", + "data into an application that can help game designers\n", + "\n", + "make balancing decisions based on player events.\n", + "\n", + "\n", + "-----\n", + "\n", + "# APPENDIX Ultimate class build guide\n", + "\n", + "\n", + "### Creating a character\n", + "\n", + "The heart and soul of mature data teams are formed by this\n", + "\n", + "trio of classes. There are many aspects to these roles, but\n", + "\n", + "they can be summarized in that Data Engineers create and\n", + "\n", + "maintain critical data workflows, Data Analysts interpret data\n", + "\n", + "and create reports that keep the business teams running\n", + "\n", + "seamlessly, and Data Scientists are responsible for making\n", + "\n", + "sense of large amounts of data. Depending on the size of\n", + "\n", + "the organization, individuals may be required to multiclass\n", + "\n", + "in order to address needs of the team. In smaller studios, it’s\n", + "\n", + "often developers who wear multiple hats, including those in\n", + "\n", + "data engineering, analytics and data science.\n", + "\n", + "Whether you’re looking to stand-up an analytics dashboard\n", + "\n", + "to report on the health of a title or building a recommendation\n", + "\n", + "engine for your players, this guide will help you better\n", + "\n", + "understand the unique classes required to develop and\n", + "\n", + "maintain an effective data, analytics, and AI platform.\n", + "\n", + "##### Data Engineers\n", + "\n", + "\n", + "**Goals and Priorities of Data Engineers**\n", + "\n", + "- Enable access to usable data for real-time insights — data\n", + "\n", + "that both enables timely decision-making and is accurate\n", + "\n", + "and reproducible\n", + "\n", + "- Increase user confidence and trust in data. This involves\n", + "\n", + "ensuring high consistency and reliability in ETL processes\n", + "\n", + "- Limit the issues and failures experienced by other\n", + "\n", + "engineers and data scientists, allowing those roles to\n", + "\n", + "focus less on troubleshooting and more on drawing\n", + "\n", + "meaningful conclusions from data and building new\n", + "\n", + "products / features\n", + "\n", + "**What Data Engineers care about:**\n", + "\n", + "- Enabling access to data for real-time insights — data that\n", + "\n", + "both enables timely decision-making and is accurate and\n", + "\n", + "reproducible\n", + "\n", + "- Building high-performance, reliable and scalable pipelines\n", + "\n", + "for data processing\n", + "\n", + "- Delivering data for consumption from a variety of sources\n", + "\n", + "by Data Analysts and Data Scientists against tight SLAs\n", + "\n", + "- A Data Engineer’s biggest challenge? Collaboration\n", + "\n", + "across teams\n", + "\n", + "\n", + "Data engineers build systems that collect, manage, and\n", + "\n", + "\n", + "convert source data into usable information for data\n", + "\n", + "scientists and business analysts to interpret. 
Their ultimate\n", + "\n", + "goal is to make data accessible so that teams can use it to\n", + "\n", + "evaluate and optimize a goal or objective.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Data Engineers are responsible for data migration,\n", + "\n", + "manipulation, and integration of data (joining dissimilar\n", + "\n", + "data systems)\n", + "\n", + "- Setup and maintenance of ETL pipelines to convert\n", + "\n", + "source data into actionable data for insights. It is the\n", + "\n", + "responsibility of the data engineer to make sure these\n", + "\n", + "pipelines run efficiently and are well orchestrated.\n", + "\n", + "- The Data Engineer sets up the workflow process\n", + "\n", + "to orchestrate pipelines for the studio’s data and\n", + "\n", + "continuously validates it\n", + "\n", + "- Managing workflows to enable data scientists and data\n", + "\n", + "analysts, and ensuring workflows are well-integrated with\n", + "\n", + "different parts of the studio (e.g., marketing, test/QA, etc)\n", + "\n", + "\n", + "##### Data Scientists\n", + "\n", + "Data scientists determine the questions their team should\n", + "\n", + "be asking and figure out how to answer those questions\n", + "\n", + "using data. They often develop predictive models for\n", + "\n", + "theorizing and forecasting.\n", + "\n", + "**Responsibilities:**\n", + "\n", + "- Responsible for making sense of the large amounts of data\n", + "\n", + "collected for a given game title, such as game telemetry,, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf), List(Data teams rely on getting the right data at the right time for analytics, data\n", + "science and machine learning, but often are faced with challenges meeting\n", + "the needs of their initiatives for data engineering.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### Why data engineering is hard\n", + "\n", + "One of the biggest challenges is accessing and managing the increasingly\n", + "complex data that lives across the organization. Most of the complexity\n", + "arises with the explosion of data volumes and data types, with organizations\n", + "amassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n", + "\n", + "With this volume, managing data pipelines to transform and process data\n", + "is slow and difficult, and increasingly expensive. And to top off the complexity,\n", + "most businesses are putting an increased emphasis on multicloud\n", + "environments which can be even more difficult to maintain.\n", + "\n", + "[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\n", + "that data itself has become a product, and the challenging goal of the data\n", + "engineer is to build and run the machinery that creates this high-fidelity\n", + "data product all the way from ingestion to monetization.\n", + "\n", + "\n", + "Despite current technological advances data engineering remains\n", + "difficult for several reasons:\n", + "\n", + "**Complex data ingestion methods**\n", + "\n", + "Data ingestion means retrieving batch and streaming data from various\n", + "sources and in various formats. 
Ingesting data is hard and complex since you\n", + "either need to use an always-running streaming platform like Apache Kafka\n", + "or you need to be able to keep track of which files haven’t been ingested yet.\n", + "Data engineers are required to spend a lot of time hand-coding repetitive\n", + "and error-prone data ingestion tasks.\n", + "\n", + "**Data engineering principles**\n", + "\n", + "These days, large operations teams are often just a memory of the past.\n", + "Modern data engineering principles are based on agile software development\n", + "methodologies. They apply the well-known “you build it, you run it” paradigm,\n", + "use isolated development and production environments, CI/CD, and version\n", + "control transformations that are pushed to production after validation. Tooling\n", + "needs to support these principles.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Third-party tools**\n", + "\n", + "Data engineers are often required to run additional third-party tools for\n", + "orchestration to automate tasks such as ELT/ETL or customer code in\n", + "notebooks. Running third-party tools increases the operational overhead\n", + "and decreases the reliability of the system.\n", + "\n", + "**Performance tuning**\n", + "\n", + "Finally, with all pipelines and workflows written, data engineers need to\n", + "constantly focus on performance, tuning pipelines and architectures to meet\n", + "SLAs. Tuning such architectures requires in-depth knowledge of the underlying\n", + "architecture and constantly observing throughput parameters.\n", + "\n", + "Most organizations are dealing with a complex landscape of data warehouses\n", + "and data lakes these days. Each of those platforms has its own limitations,\n", + "workloads, development languages and governance model.\n", + "\n", + "\n", + "With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. 
The lakehouse platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability\n", + "to drive valuable insights.\n", + "\n", + "Data engineering in the lakehouse allows data teams to unify batch and\n", + "streaming operations on a simplified architecture, streamline data pipeline\n", + "development and testing, build reliable data, analytics and AI workflows\n", + "on any cloud platform, and meet regulatory requirements to maintain\n", + "world-class governance.\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform\n", + "that automates the complexity of building and maintaining pipelines and\n", + "running ETL workloads so data engineers and analysts can focus on quality\n", + "and reliability to drive valuable insights.\n", + "\n", + "\n", + "#### Databricks makes modern data engineering simple\n", + "\n", + "There is no industry-wide definition of modern data engineering.\n", + "This should come close:, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf), List(# Building Reliable Data Lakes at Scale With Delta Lake\n", + "\n", + "\n", + "-----\n", + "\n", + "## Contents\n", + "\n", + "#### Data Engineering Drivers 2\n", + "\n", + " Data Pipeline Key Goals 4\n", + "\n", + " Apache Spark™: The First Unified Analytics Engine 5\n", + "\n", + " Data Reliability Challenges With Data Lakes 6\n", + "\n", + " Delta Lake: A New Storage Layer 7\n", + "\n", + " Delta Lake: Key Features 8\n", + "\n", + " Getting Started With Delta Lake 10\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "#### Data Engineering Drivers\n", + "\n", + "Data engineering professionals are needing to respond to several different drivers.\n", + "\n", + "Chief among the drivers they face are:\n", + "\n", + "**Rise of Advanced Analytics** — Advanced analytics, including methods\n", + "\n", + "based on machine learning techniques, have evolved to such a degree that\n", + "\n", + "organizations seek to derive far more value from their corporate assets.\n", + "\n", + "**Widespread Adoption** — Once the province of leading edge, high-tech\n", + "\n", + "companies, these advanced approaches are being adopted across a\n", + "\n", + "multitude of industries from retail to hospitality to healthcare and across\n", + "\n", + "private as well as public sector organizations. This is further driving the need\n", + "\n", + "for strong data engineering practices.\n", + "\n", + "**Regulation** — With the growth of data generation and data collection,\n", + "\n", + "there is increased interest in how the data is protected and managed.\n", + "\n", + "Regulatory regimes such as GDPR (General Data Protection Regulation)\n", + "\n", + "from the EU and other jurisdictions mandate very specific ways in which\n", + "\n", + "data must be managed.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Drivers\n", + "\n", + "**Technology Innovation** — The move to cloud-based analytics architectures\n", + "\n", + "that is now well underway is being propelled further by innovations such as\n", + "\n", + "analytics-focused chipsets, pipeline automation and the unification of data\n", + "\n", + "and machine learning. 
All these offer data professionals new approaches for\n", + "\n", + "their data initiatives.\n", + "\n", + "**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n", + "\n", + "also subject to increasing scrutiny. There is also a greater understanding of\n", + "\n", + "data as a valuable asset. Deriving value from data must be done in a manner\n", + "\n", + "that is financially responsible and actually value adding to the enterprise and\n", + "\n", + "meeting ROI hurdles.\n", + "\n", + "**Role Evolution** — Reflecting the importance of managing the data and\n", + "\n", + "maximizing value extraction, the Chief Data Officer (CDO) role is becoming\n", + "\n", + "more prominent and newer roles such as Data Curator are emerging.\n", + "\n", + "They must balance the needs of governance, security and democratization.\n", + "\n", + "\n", + "-----\n", + "\n", + "## Key Goals\n", + "\n", + "#### Data Pipeline Key Goals\n", + "\n", + "Making quality data available in a reliable manner is a major determinant of success for data\n", + "\n", + "analytics initiatives be they regular dashboards or reports, or advanced analytics projects\n", + "\n", + "drawing on state-of-the-art machine learning techniques. Data engineers tasked with this\n", + "\n", + "responsibility need to take account of a broad set of dependencies and requirements as they\n", + "\n", + "design and build their data pipelines.\n", + "\n", + "Three primary goals that data engineers typically seek to address as they work to enable the\n", + "\n", + "analytics professionals in their organizations are:\n", + "\n", + "**Deliver quality data in less time** — When it comes to data, quality and timeliness\n", + "\n", + "are key. Data with gaps or errors (which can arise for many reasons) is\n", + "\n", + "“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n", + "\n", + "users. Equally well, many applications require up-to-date information (who\n", + "\n", + "wants to use last night’s closing stock price or weather forecast) and are of\n", + "\n", + "limited value without it.\n", + "\n", + "**Enable faster queries** — Wanting fast responses to queries is natural enough\n", + "\n", + "in today’s “New York minute,” online world. Achieving this is particularly\n", + "\n", + "demanding when the queries are based on very large data sets.\n", + "\n", + "**Simplify data engineering at scale** — It is one thing to have high reliability and\n", + "\n", + "performance in a limited, development or test environment. What matters\n", + "\n", + "more is the ability to have robust, production data pipelines at scale without\n", + "\n", + "requiring high operational overhead.\n", + "\n", + "\n", + "-----, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf), List(COPY INTO ......................................................................................................................................................................................................... **06**\n", + "\n", + "Auto Loader ....................................................................................................................................................................................................... **09**\n", + "\n", + "Ingesting Data From External Applications .......................................................................................................................................................... 
**13**\n", + "\n", + "Partner Connect ............................................................................................................................................................................................... **13**\n", + "\n", + "\n", + "-----\n", + "\n", + "### Introduction\n", + "\n", + "Organizations today are inundated with data siloed across various on-premises\n", + "application systems, databases, data warehouses and SaaS applications. This\n", + "fragmentation makes it difficult to support new use cases for analytics or machine\n", + "learning, so many IT teams are now centralizing all of their data with a lakehouse\n", + "architecture built on top of Delta Lake, an open format storage layer.\n", + "\n", + "The first thing data engineers need to do to support the lakehouse architecture is to\n", + "efficiently move data from various systems into their lakehouse. Ingesting data is a\n", + "critical first step in the data engineering and management lifecycle.\n", + "\n", + "\n", + "-----\n", + "\n", + "### Life of a Data Engineer\n", + "\n", + "The primary focus of data engineers is to provide timely and reliable data to downstream\n", + "\n", + "data teams at an organization. Requests for data can come from a variety of teams, and for\n", + "\n", + "\n", + "a variety of data types. For example:, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf), List(With the Databricks Lakehouse Platform, data engineers have access to an\n", + "end-to-end data engineering solution for ingesting, transforming, processing,\n", + "scheduling and delivering data. The lakehouse platform automates the\n", + "complexity of building and maintaining pipelines and running ETL workloads\n", + "directly on a data lake so data engineers can focus on quality and reliability\n", + "to drive valuable insights.\n", + "\n", + "Data engineering in the lakehouse allows data teams to unify batch and\n", + "streaming operations on a simplified architecture, streamline data pipeline\n", + "development and testing, build reliable data, analytics and AI workflows\n", + "on any cloud platform, and meet regulatory requirements to maintain\n", + "world-class governance.\n", + "\n", + "The lakehouse provides an end-to-end data engineering and ETL platform\n", + "that automates the complexity of building and maintaining pipelines and\n", + "running ETL workloads so data engineers and analysts can focus on quality\n", + "and reliability to drive valuable insights.\n", + "\n", + "\n", + "#### Databricks makes modern data engineering simple\n", + "\n", + "There is no industry-wide definition of modern data engineering.\n", + "This should come close:\n", + "\n", + "_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\n", + "_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\n", + "**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\n", + "_kinds of workflows._\n", + "\n", + "\n", + "-----\n", + "\n", + "-----\n", + "\n", + "#### Benefits of data engineering on the lakehouse\n", + "\n", + "By simplifying and modernizing with the lakehouse architecture, data engineers\n", + "gain an enterprise-grade and enterprise-ready approach to building data\n", + "pipelines. 
The following are eight key differentiating capabilities that a data\n", + "engineering solution team can enable with the Databricks Lakehouse Platform:\n", + "\n", + "**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\n", + "engineers can enable fast, reliable, scalable and automatic data ingestion\n", + "for analytics, data science or machine learning.\n", + "\n", + "\n", + "\n", + "**•** **Data pipeline observability:** Monitor overall data pipeline estate status\n", + "from a dataflow graph dashboard and visually track end-to-end pipeline\n", + "health for performance, quality, status and latency.\n", + "\n", + "**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\n", + "analytics and machine learning use cases by enabling easy and automatic\n", + "data pipeline deployments into production or roll back pipelines and\n", + "minimize downtime.\n", + "\n", + "**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\n", + "of data processing tasks for data and machine learning pipelines with the\n", + "ability to run multiple non-interactive tasks as a directed acyclic graph\n", + "(DAG) on a Databricks compute cluster.\n", + "\n", + "\n", + "\n", + "**•** **Automated ETL pipelines:** Data engineers can reduce development\n", + "time and effort and focus on implementing business logic and data\n", + "quality checks within the data pipeline using SQL or Python.\n", + "\n", + "**•** **Data quality checks:** Improve data reliability throughout the data\n", + "lakehouse so data teams can confidently trust the information for\n", + "downstream initiatives with the ability to define data quality and\n", + "automatically address errors.\n", + "\n", + "**•** **Batch and streaming:** Allow data engineers to set tunable data latency\n", + "with cost controls without having to know complex stream processing\n", + "and implement recovery logic.\n", + "\n", + "**•** **Automatic recovery:** Handle transient errors and use automatic recovery\n", + "for most common error conditions that can occur during the operation of\n", + "a pipeline with fast, scalable fault-tolerance.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Data engineering is all about data quality**\n", + "\n", + "The goal of modern data engineering is to distill data with a quality that is fit for\n", + "downstream analytics and AI. Within the Lakehouse, data quality is achieved on\n", + "three different levels.\n", + "\n", + "\n", + "1. On a **technical level** , data quality is\n", + "guaranteed by enforcing and evolving\n", + "schemas for data storage and ingestion.\n", + "\n", + "**Kenesis**\n", + "\n", + "**CSV,**\n", + "**JSON, TXT...**\n", + "\n", + "**Data Lake**, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf))yesThe response states several responsibilities of a Data Engineer. The retrieved context supports the following responsibilities: 1. Data migration, manipulation, and integration of data (joining dissimilar data systems) - this is mentioned under 'Responsibilities' in the document. 2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights - this is also mentioned under 'Responsibilities'. 3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it - this is supported by the document. 4. 
Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.) - this is supported by the document. Additionally, the document mentions enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists, which aligns with the claim. Therefore, the claim is fully supported by the document.yesThe ground truth states several responsibilities of a Data Engineer. According to the retrieved context, Data Engineers are responsible for: \n", + "- Building data pipelines (supported by 'building high-performance, reliable and scalable pipelines for data processing'). \n", + "- Processing data sets (supported by 'convert source data into actionable data for insights'). \n", + "- Organizing data sets (supported by 'managing workflows to enable data scientists and data analysts'). \n", + "- Persisting data sets (supported by 'setup and maintenance of ETL pipelines'). \n", + "- Supporting machine learning and other downstream applications (supported by 'data engineers build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret'). Therefore, all parts of the ground truth are supported by the retrieved context.List(no, no, no, no, no)0.0
87873effb62b308f6aafbb34e7c24aec0439d6b92eb32b330105aa9d6aca2286List(List(List(What are the critical needs for IT and business when it comes to implementing a customer data platform?, user)))List(List(Critical IT Needs\n", + "Keep control of data access and governance; ability to architecture a customer data stack with decisions on where data is stored and where queries are executed.\n", + "Critical Business Needs\n", + "Get customer data access via a no-code interface to generate insights; build customer experiences and activate data within business applications., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf))List(IT needs to maintain control over data access and governance., IT needs the ability to make decisions on data storage and query execution., Business needs to access customer data through a no-code interface to generate insights., Business needs to build customer experiences and activate data within business applications.){\"info\": {\"request_id\": \"tr-def73886c787456096e0d3d7cfe81552\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543864916, \"execution_time_ms\": 5651, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and bu...\"}, \"tags\": {\"eval.requestId\": \"7e27a99b-f6ef-40d0-94f7-d14f455eca3c\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-def73886c787456096e0d3d7cfe81552/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x5baceed3222a960b\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": null, \"start_time\": 1734543864916750101, \"end_time\": 1734543870567771694, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", 
\"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. 
Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x6a147a0324292b48\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x5baceed3222a960b\", \"start_time\": 1734543864929193850, \"end_time\": 1734543864936605139, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xa61737d7e97f75f6\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x5baceed3222a960b\", \"start_time\": 1734543864936732340, \"end_time\": 1734543864937464449, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xf13b9cee85b4b4f1\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xa61737d7e97f75f6\", \"start_time\": 1734543864936890442, \"end_time\": 1734543864937116045, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xeb765ae29e02fd13\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xa61737d7e97f75f6\", \"start_time\": 1734543864937199646, \"end_time\": 1734543864937431549, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the 
critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0xbe98406904952d43\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x5baceed3222a960b\", \"start_time\": 1734543864937535250, \"end_time\": 1734543870567143686, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. 
The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. 
Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x6110b362407c0710\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xbe98406904952d43\", \"start_time\": 1734543865065375583, \"end_time\": 1734543866424542786, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_5d73c95c-8527-4f8f-ac82-ac3ef12181be\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543865, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 31, \\\"prompt_tokens\\\": 1158, \\\"total_tokens\\\": 1189, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x29d9dfbb8949e1ee\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xbe98406904952d43\", \"start_time\": 1734543866434627907, \"end_time\": 1734543866904066738, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"critical needs for IT and business when implementing a customer data platform\\\", \\\"filters\\\": []}\", \"vector_search_index\": 
\"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"#### eBook\\\\n\\\\n# The CDP Build vs Buy Guide:\\\\n\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\n\\\\n\\\\n-----\\\\n\\\\n## The Need for a Customer Data Platform\\\\n\\\\n\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\nactivate customers with targeted content.\\\\n\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\n\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\nfastest path to a solution.\\\\n\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\nexisting marketing and analytics systems.. The cost of adding another system to the\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\nthat has immediate consequences.\\\\n\\\\n**Critical IT Needs** **Critical Business Needs**\\\\n\\\\n\\\\nKeep control of data access and\\\\ngovernance; ability to architecture a\\\\ncustomer data stack with decisions on\\\\nwhere data is stored and where queries\\\\nare executed\\\\n\\\\n\\\\nGet customer data access via a no-code\\\\ninterface to generate insights; build customer\\\\nexperiences and activate data within\\\\nbusiness applications\\\\n\\\\n\\\\n-----\\\\n\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\n\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\n\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\n**them to create and manage those journeys. 
It was going to take at least two**\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\n\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\n\\\\n\\\\n-----\\\\n\\\\n## Combining the Build and Buy Approaches\\\\n\\\\n\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\n\\\\n**Bundled** **Composable**\\\\n\\\\n**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n(Local & Views)\\\\n\\\\n\\\\nQuery\\\\nVirtualization\\\\n\\\\nMetadata\\\\n\\\\n\\\\nData Copy\\\\n\\\\n\\\\nLakehouse\\\\n\\\\nStorage\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nCompute Compute\\\\n\\\\nStorage Storage\\\\n\\\\n\\\\n-----\\\\n\\\\nDeployment Type\\\\n\\\\n**Bundled**\\\\n\\\\n**Composable \\u2013**\\\\n**Hybrid**\\\\n\\\\n**Composable \\u2013**\\\\n**Lakehouse-Only**\\\\n\\\\n\\\\nDescription\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029832723, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\"}, \\\"id\\\": \\\"563f0dba5edef5b358685117dfb5a133\\\"}, {\\\"page_content\\\": \\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\n\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\n\\\\nincreasingly important.\\\\n\\\\n**Modernize business applications**\\\\n\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\n\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\n\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\n\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\n\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\n\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\n\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\n\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\n\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\n\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\n\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\n\\\\ndevelopment teams.\\\\n\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\n\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\n\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\n\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\n\\\\n\\\\n\\u201cWe are on an amazing journey. 
Being among\\\\n\\\\nthe fastest-growing enterprise software cloud\\\\n\\\\ncompanies on record was unimaginable when\\\\n\\\\nwe started Databricks. To get here, we\\u2019ve stayed\\\\n\\\\nfocused on the three big bets we made when\\\\n\\\\nfounding the company \\u2014 cloud, open source\\\\n\\\\nand machine learning. Fast-forward seven years,\\\\n\\\\nthousands of data teams around the globe are\\\\n\\\\nworking better together on Databricks.\\u201d\\\\n\\\\n**Ali Ghodsi**\\\\n\\\\nCo-founder and CEO\\\\n\\\\nDatabricks\\\\n\\\\n\\\\n-----\\\\n\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\n\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\n\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\n\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\n\\\\nthe data from the actual SOR.\\\\n\\\\nData from these SORs should be made available in three ways:\\\\n\\\\n**1.** \\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\n\\\\n**2.** \\\\u0007Ensure that copies of the data land in the data lake.\\\\n\\\\n**3.** \\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\n\\\\nconsumption by downstream applications.\\\\n\\\\n**Move toward real-time decisioning**\\\\n\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\n\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\n\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\n\\\\nthe same data platform.\\\\n\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\n\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\n\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\n\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027576878, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"25ef18d715b47231f6594d1da80303e9\\\"}, {\\\"page_content\\\": \\\"and security environment but nothing more\\\\n\\\\n\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\n\\\\nof tools in play or streamlining the user experience\\\\n\\\\n\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\n\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\n\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\n\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\n\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\n\\\\n\\\\n-----\\\\n\\\\nDatabricks is a leading data and AI company \\u2014\\\\n\\\\n\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\n\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\n\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\n\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\n\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\n\\\\nefficiency, cost, etc.\\\\n\\\\n\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\n\\\\n\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\n\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\n\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\n\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\n\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\n\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\n\\\\napply to the broadest set of customers.\\\\n\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\n\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\n\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\n\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\n\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\n\\\\n\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\n\\\\nlistening to the needs of thousands of customers\\\\n\\\\nand having our engineers work side by side with\\\\n\\\\ncustomer teams to deliver real business value using\\\\n\\\\ndata and AI.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified platform, unified personas**\\\\n\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\n\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\n\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\n\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\n\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\n\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\n\\\\nsubsystems are well managed.\\\\n\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\n\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\n\\\\nis eliminated.\\\\n\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\n\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\n\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\n\\\\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027022872, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"eaff954d65653182857574e043c105f1\\\"}, {\\\"page_content\\\": \\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\n\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\n\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\n\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\n\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\n\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\n\\\\ngoals but also in minimizing these seven key business risks.\\\\n\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\n\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\n\\\\norganizations use and process data. 
Successful data transformation initiatives for data, analytics and AI\\\\n\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\n\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\n\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\n\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\n\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\n\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\n\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\n\\\\nidentify and execute on AI opportunities.\\\\n\\\\n\\\\n-----\\\\n\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\n\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\n\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\\\\n\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\n\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\n\\\\nindustry standards.\\\\n\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\n\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\n\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\n\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\n\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\n\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\n\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\n\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . 
Databricks is the first company to deliver a\\\\n\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\n\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\n\\\\nshown in Figure 1.\\\\n\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\n\\\\nData\\\\nWarehousing\\\\n\\\\n\\\\nData\\\\nEngineering\\\\n\\\\n\\\\nData\\\\nStreaming\\\\n\\\\n\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\nand ML\\\\n\\\\n\\\\nUnity Catalog\\\\nFine-grained governance for data and AI\\\\n\\\\nDelta Lake\\\\nData relia)ility and .erfor2ance\\\\n\\\\nCloud Data Lake\\\\nAll structured and unstructured data\\\\n\\\\n**Figure 1:**\\\\nThe Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\n\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\n\\\\n**2.** \\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\n\\\\n**3.** \\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025006814, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"f545eff42d3b9ae2b565475f4390ed44\\\"}, {\\\"page_content\\\": \\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\n\\\\nunique and b) the development offers the competitive advantage that you need.\\\\n\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\n\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\n\\\\nare required in the data science space. The question becomes, \\u201cIs this really the area that you want your\\\\n\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\n\\\\n**How long will it take? Can the organization afford to wait?**\\\\n\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\n\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\n\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\n\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\n\\\\ntake longer and cost more money than initially planned.\\\\n\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\n\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\n\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\n\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\n\\\\nfeatures and schedule.\\\\n\\\\n\\\\nDatabricks is built on top of popular open source\\\\n\\\\nsoftware that it created. 
Engineering teams can\\\\n\\\\nimprove the underpinnings of the Databricks\\\\n\\\\nplatform by submitting code via pull request and\\\\n\\\\nbecoming committers to the projects. The benefit\\\\n\\\\nto organizations is that their engineers contribute\\\\n\\\\nto the feature set of the data platform while\\\\n\\\\nDatabricks remains responsible for all integration\\\\n\\\\nand performance testing plus all the runtime\\\\n\\\\nsupport, including failover and disaster recovery.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Don\\u2019t forget about the data**\\\\n\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\n\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\n\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\n\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\n\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\n\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\n\\\\ncreating true competitive advantage.\\\\n\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\n\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\n\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\n\\\\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\\\\n\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### 9. Allocate, monitor and optimize costs\\\\n\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\n\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\n\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\n\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\n\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\n\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\n\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\n\\\\ncould be easily shared and reused by other members of the team. 
The more the team used the unified\\\\n\\\\nplatform, the more they collaborated and their level of expertise increased.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0024809677, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"b5f4bd0258226132f89697f6e660b09b\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x88079808d446595d\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x29d9dfbb8949e1ee\", \"start_time\": 1734543866439784969, \"end_time\": 1734543866902149915, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"critical needs for IT and business when implementing a customer data platform\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"#### eBook\\\\n\\\\n# The CDP Build vs Buy Guide:\\\\n\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\n\\\\n\\\\n-----\\\\n\\\\n## The Need for a Customer Data Platform\\\\n\\\\n\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\nactivate customers with targeted content.\\\\n\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\n\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\nfastest path to a solution.\\\\n\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\nthat has immediate consequences.\\\\n\\\\n**Critical IT Needs** **Critical Business Needs**\\\\n\\\\n\\\\nKeep control of data access and\\\\ngovernance; ability to architecture a\\\\ncustomer data stack with decisions on\\\\nwhere data is stored and where queries\\\\nare executed\\\\n\\\\n\\\\nGet customer data access via a no-code\\\\ninterface to generate insights; build customer\\\\nexperiences and activate data within\\\\nbusiness applications\\\\n\\\\n\\\\n-----\\\\n\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\n\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\n\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\n\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\n\\\\n\\\\n-----\\\\n\\\\n## Combining the Build and Buy Approaches\\\\n\\\\n\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\n\\\\n**Bundled** **Composable**\\\\n\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n(Local & Views)\\\\n\\\\n\\\\nQuery\\\\nVirtualization\\\\n\\\\nMetadata\\\\n\\\\n\\\\nData Copy\\\\n\\\\n\\\\nLakehouse\\\\n\\\\nStorage\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nCompute Compute\\\\n\\\\nStorage Storage\\\\n\\\\n\\\\n-----\\\\n\\\\nDeployment Type\\\\n\\\\n**Bundled**\\\\n\\\\n**Composable \\u2013**\\\\n**Hybrid**\\\\n\\\\n**Composable \\u2013**\\\\n**Lakehouse-Only**\\\\n\\\\n\\\\nDescription\\\", \\\"563f0dba5edef5b358685117dfb5a133\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\", 0.0029832723], [\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\n\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\n\\\\nincreasingly important.\\\\n\\\\n**Modernize business applications**\\\\n\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\n\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\n\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\n\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\n\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\n\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\n\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\n\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\n\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\n\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\n\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\n\\\\ndevelopment teams.\\\\n\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\n\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\n\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\n\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\n\\\\n\\\\n\\u201cWe are on an amazing journey. Being among\\\\n\\\\nthe fastest-growing enterprise software cloud\\\\n\\\\ncompanies on record was unimaginable when\\\\n\\\\nwe started Databricks. To get here, we\\u2019ve stayed\\\\n\\\\nfocused on the three big bets we made when\\\\n\\\\nfounding the company \\u2014 cloud, open source\\\\n\\\\nand machine learning. 
Fast-forward seven years,\\\\n\\\\nthousands of data teams around the globe are\\\\n\\\\nworking better together on Databricks.\\u201d\\\\n\\\\n**Ali Ghodsi**\\\\n\\\\nCo-founder and CEO\\\\n\\\\nDatabricks\\\\n\\\\n\\\\n-----\\\\n\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\n\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\n\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\n\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\n\\\\nthe data from the actual SOR.\\\\n\\\\nData from these SORs should be made available in three ways:\\\\n\\\\n**1.** \\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\n\\\\n**2.** \\\\u0007Ensure that copies of the data land in the data lake.\\\\n\\\\n**3.** \\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\n\\\\nconsumption by downstream applications.\\\\n\\\\n**Move toward real-time decisioning**\\\\n\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\n\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\n\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\n\\\\nthe same data platform.\\\\n\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\n\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\n\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\\\\n\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\", \\\"25ef18d715b47231f6594d1da80303e9\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0027576878], [\\\"and security environment but nothing more\\\\n\\\\n\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\n\\\\nof tools in play or streamlining the user experience\\\\n\\\\n\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\n\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\n\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\n\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\n\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\n\\\\n\\\\n-----\\\\n\\\\nDatabricks is a leading data and AI company \\u2014\\\\n\\\\n\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\n\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\n\\\\nof the platform that saves time, conserves effort and improves the user experience. 
Many companies try\\\\n\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\n\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\n\\\\nefficiency, cost, etc.\\\\n\\\\n\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\n\\\\n\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\n\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\n\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\n\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\n\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\n\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\n\\\\napply to the broadest set of customers.\\\\n\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\n\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\n\\\\nperforming their job. The more discrete tools in an environment, the more challenging this becomes.\\\\n\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\n\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\n\\\\n\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\n\\\\nlistening to the needs of thousands of customers\\\\n\\\\nand having our engineers work side by side with\\\\n\\\\ncustomer teams to deliver real business value using\\\\n\\\\ndata and AI.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified platform, unified personas**\\\\n\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\n\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\n\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\n\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\n\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\n\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\n\\\\nsubsystems are well managed.\\\\n\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\n\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\n\\\\nis eliminated.\\\\n\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\n\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\n\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\n\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\", \\\"eaff954d65653182857574e043c105f1\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0027022872], [\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\n\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\n\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\n\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\n\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\n\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\n\\\\ngoals but also in minimizing these seven key business risks.\\\\n\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\n\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\n\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\n\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\n\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\n\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\n\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\n\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\n\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\n\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\n\\\\nidentify and execute on AI opportunities.\\\\n\\\\n\\\\n-----\\\\n\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\n\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\n\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\\\\n\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\n\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\n\\\\nindustry standards.\\\\n\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\n\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\n\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\n\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\n\\\\nmetrics for all your data workloads \\u2014 including data warehousing. 
We have captured the lessons learned\\\\n\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\n\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\n\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\n\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\n\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\n\\\\nshown in Figure 1.\\\\n\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\n\\\\nData\\\\nWarehousing\\\\n\\\\n\\\\nData\\\\nEngineering\\\\n\\\\n\\\\nData\\\\nStreaming\\\\n\\\\n\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\nand ML\\\\n\\\\n\\\\nUnity Catalog\\\\nFine-grained governance for data and AI\\\\n\\\\nDelta Lake\\\\nData relia)ility and .erfor2ance\\\\n\\\\nCloud Data Lake\\\\nAll structured and unstructured data\\\\n\\\\n**Figure 1:**\\\\nThe Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\n\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\n\\\\n**2.** \\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\n\\\\n**3.** \\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\", \\\"f545eff42d3b9ae2b565475f4390ed44\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0025006814], [\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\n\\\\nunique and b) the development offers the competitive advantage that you need.\\\\n\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\n\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\n\\\\nare required in the data science space. The question becomes, \\u201cIs this really the area that you want your\\\\n\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\n\\\\n**How long will it take? Can the organization afford to wait?**\\\\n\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\n\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\n\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\n\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\n\\\\ntake longer and cost more money than initially planned.\\\\n\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\n\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\n\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\n\\\\nis reliable and is delivered on time. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_f8a9b201-62ca-4c70-b72c-98d0edc1030c\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build 
customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543867, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 118, \\\"prompt_tokens\\\": 6128, \\\"total_tokens\\\": 6246, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. 
The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}null/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdfThe critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. 
Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.yesnullyesNo harmful content detected in responseyesThe expected response states that IT needs to maintain control over data access and governance, which is supported by the response as it mentions 'keeping control of data access and governance'. The expected response also states that IT needs the ability to make decisions on data storage and query execution, which is supported by the response as it mentions 'IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed'. The expected response further states that business needs to access customer data through a no-code interface to generate insights, which is supported by the response as it mentions 'getting customer data access via a no-code interface to generate insights'. Finally, the expected response states that business needs to build customer experiences and activate data within business applications, which is supported by the response as it mentions 'build customer experiences' and 'business needs to activate data within business applications'. The response is correct.5.6517435.07286.0149.0nullnullList(List(#### eBook\n", + "\n", + "# The CDP Build vs Buy Guide:\n", + "\n", + "### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\n", + "\n", + "\n", + "-----\n", + "\n", + "## The Need for a Customer Data Platform\n", + "\n", + "\n", + "Organizations need to deliver personalized experiences to their customers to stay ahead\n", + "of the curve — that means they need a customer data platform (CDP). Through a CDP, data\n", + "from every touch point, along with third-party information, is brought together to provide\n", + "a unified view of the customer. This enables your marketing team to analyze, identify and\n", + "activate customers with targeted content.\n", + "\n", + "The key question for all IT teams at these organizations is whether to build or to buy.\n", + "\n", + "A CDP that sounds like music to the ears of business leaders may be perceived as noise\n", + "by enterprise IT leaders. The business side of the house needs immediate enablement, and\n", + "an out-of-the-box system dedicated to the specialized needs of marketers seems like the\n", + "fastest path to a solution.\n", + "\n", + "But for IT, the CDP is yet another system, bringing stack baggage and redundancies to\n", + "existing marketing and analytics systems.. 
The cost of adding another system to the\n", + "landscape and the redundancy of sensitive customer data creates a governance challenge\n", + "that has immediate consequences.\n", + "\n", + "**Critical IT Needs** **Critical Business Needs**\n", + "\n", + "\n", + "Keep control of data access and\n", + "governance; ability to architecture a\n", + "customer data stack with decisions on\n", + "where data is stored and where queries\n", + "are executed\n", + "\n", + "\n", + "Get customer data access via a no-code\n", + "interface to generate insights; build customer\n", + "experiences and activate data within\n", + "business applications\n", + "\n", + "\n", + "-----\n", + "\n", + "The question of whether to build or buy seems to leave legitimate needs and concerns by one\n", + "side or the other unaddressed — which is why so many organizations who have built a CDP\n", + "have expressed dissatisfaction regardless of which side of the fence they came down upon.\n", + "\n", + "**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\n", + "**both sides of the debate and provide organizations a third choice of both building and**\n", + "**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\n", + "the business with no-code and ease of use interface along with the flexibility and centralized\n", + "governance IT desires. By shifting the conversation from building or buying to building _and_\n", + "buying, we’ve opened the door to finding the right balance of approaches for our customer\n", + "organizations, helping organizations find greater success in their personalization journey.\n", + "\n", + "**“We made an attempt to internally build a CDP platform and while we**\n", + "**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\n", + "**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\n", + "**or offer a campaign interface to our product marketers that could empower**\n", + "**them to create and manage those journeys. It was going to take at least two**\n", + "**years for us to build all of that functionality in house.”**\n", + "\n", + "– Sravan Gupta, Senior Manager of GTM Systems, Atlassian\n", + "\n", + "\n", + "-----\n", + "\n", + "## Combining the Build and Buy Approaches\n", + "\n", + "\n", + "Bringing together the best of build and buy involves the deployment of the CDP alongside or\n", + "within the lakehouse platform. There are three approaches to this:\n", + "\n", + "**Bundled** **Composable**\n", + "\n", + "**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\n", + "\n", + "\n", + "Compute\n", + "\n", + "Storage\n", + "\n", + "\n", + "Compute\n", + "\n", + "Storage\n", + "(Local & Views)\n", + "\n", + "\n", + "Query\n", + "Virtualization\n", + "\n", + "Metadata\n", + "\n", + "\n", + "Data Copy\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "Storage\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "\n", + "Lakehouse\n", + "\n", + "\n", + "Compute Compute\n", + "\n", + "Storage Storage\n", + "\n", + "\n", + "-----\n", + "\n", + "Deployment Type\n", + "\n", + "**Bundled**\n", + "\n", + "**Composable –**\n", + "**Hybrid**\n", + "\n", + "**Composable –**\n", + "**Lakehouse-Only**\n", + "\n", + "\n", + "Description, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf), List(companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n", + "\n", + "As a result, data portability and the ability to run workloads on different cloud providers are becoming\n", + "\n", + "increasingly important.\n", + "\n", + "**Modernize business applications**\n", + "\n", + "As organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n", + "\n", + "approach. The majority of on-premises applications are not built with the cloud in mind. They usually\n", + "\n", + "differ in the way that they handle security, resiliency, scalability and failover. Their application designs\n", + "\n", + "often store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n", + "\n", + "CCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n", + "\n", + "therefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\n", + "\n", + "services and APIs to easily provide access to an application’s functionality.\n", + "\n", + "Cloud-based architectures, commodity databases and software application development frameworks make\n", + "\n", + "it easier for developers to build scalable, secure end-to-end applications to run all your internal business\n", + "\n", + "processes. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n", + "\n", + "a backing database) has become straightforward with the latest tooling available to your application\n", + "\n", + "development teams.\n", + "\n", + "As a first step, organizations should inventory their business-critical applications, prioritize them based\n", + "\n", + "on business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n", + "\n", + "applications that generate and store a significant amount of the data consumed within an organization. Using\n", + "\n", + "a consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n", + "\n", + "\n", + "“We are on an amazing journey. Being among\n", + "\n", + "the fastest-growing enterprise software cloud\n", + "\n", + "companies on record was unimaginable when\n", + "\n", + "we started Databricks. To get here, we’ve stayed\n", + "\n", + "focused on the three big bets we made when\n", + "\n", + "founding the company — cloud, open source\n", + "\n", + "and machine learning. 
Fast-forward seven years,\n", + "\n", + "thousands of data teams around the globe are\n", + "\n", + "working better together on Databricks.”\n", + "\n", + "**Ali Ghodsi**\n", + "\n", + "Co-founder and CEO\n", + "\n", + "Databricks\n", + "\n", + "\n", + "-----\n", + "\n", + "The next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n", + "\n", + "A good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n", + "\n", + "other applications within your environment to store copies of the data — unless absolutely necessary for\n", + "\n", + "performance reasons. In this case, it is best to “cache” the data for use in the non-SOR application and sync\n", + "\n", + "the data from the actual SOR.\n", + "\n", + "Data from these SORs should be made available in three ways:\n", + "\n", + "**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n", + "\n", + "**2.** \u0007Ensure that copies of the data land in the data lake.\n", + "\n", + "**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n", + "\n", + "consumption by downstream applications.\n", + "\n", + "**Move toward real-time decisioning**\n", + "\n", + "The value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n", + "\n", + "and the second is to view data as an individual event. This so-called “time value of data” is an important\n", + "\n", + "concept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n", + "\n", + "the same data platform.\n", + "\n", + "On the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n", + "\n", + "aggregate data provides the ability to look back in time and see the complete history of an aspect of your\n", + "\n", + "business and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\n", + "\n", + "newly created or arriving data event gives you the opportunity to make decisions — in the moment — that, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf), List(and security environment but nothing more\n", + "\n", + "\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n", + "\n", + "of tools in play or streamlining the user experience\n", + "\n", + "\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n", + "\n", + "partnership model, the ability to influence the roadmap and professional services support\n", + "\n", + "For these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n", + "\n", + "on selecting a data platform that enables seamless integration with point solutions rather than a suite of\n", + "\n", + "discrete tools that require integration work and may no longer be category leaders over the long haul.\n", + "\n", + "\n", + "-----\n", + "\n", + "Databricks is a leading data and AI company —\n", + "\n", + "\n", + "Keep in mind that data platforms work well because the vendor took an opinionated point of view of how\n", + "\n", + "data processing, validation and curation should work. 
It’s the integration between the discrete functions\n", + "\n", + "of the platform that saves time, conserves effort and improves the user experience. Many companies try\n", + "\n", + "to take on the integration of different technology stacks, which increases risk, cost and complexity. The\n", + "\n", + "consequences of not doing the integration properly can be serious — in terms of security, compliance,\n", + "\n", + "efficiency, cost, etc.\n", + "\n", + "\n", + "partly due to the innovations in the [open source](https://databricks.com/product/open-source)\n", + "\n", + "\n", + "So, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\n", + "\n", + "and incorporate your requirements into their platform product roadmap. This will require some give-and-\n", + "\n", + "take from both parties — sometimes calling for an organization to adjust their processes to better fit how\n", + "\n", + "the platform works. There are many instances where a given business process could be simplified or recast\n", + "\n", + "to work with the platform, as is. Sometimes it will require the vendor to add features that support your\n", + "\n", + "processes. The vendor will always be market driven and will want to build features in such a way that they\n", + "\n", + "apply to the broadest set of customers.\n", + "\n", + "The final point to consider is that it takes a substantial amount of time to become an expert user of a given\n", + "\n", + "tool. Users must make a significant investment to learn how the tool works and the most efficient way of\n", + "\n", + "performing their job. The more discrete tools in an environment, the more challenging this becomes.\n", + "\n", + "Minimizing the number of tools and their different interfaces, styles of interaction and approach to security\n", + "\n", + "and collaboration helps improve the user experience and decreases time to market.\n", + "\n", + "\n", + "[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\n", + "\n", + "listening to the needs of thousands of customers\n", + "\n", + "and having our engineers work side by side with\n", + "\n", + "customer teams to deliver real business value using\n", + "\n", + "data and AI.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Unified platform, unified personas**\n", + "\n", + "Deploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\n", + "\n", + "data stack — will provide an integrated suite of tools for the full range of personas in your organization,\n", + "\n", + "including business analysts, SQL developers, data engineers and data scientists. You will immediately\n", + "\n", + "increase productivity and reduce risk because you’ll be better able to share the key aspects of data\n", + "\n", + "pipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n", + "\n", + "and deployment. All the work streams function off a single view of the data, and the handoffs between\n", + "\n", + "subsystems are well managed.\n", + "\n", + "Data processing happens in one auditable environment, and the number of copies of data is kept to an\n", + "\n", + "absolute minimum — with each user benefiting from the data assets created by others. 
Redundant work\n", + "\n", + "is eliminated.\n", + "\n", + "The 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n", + "\n", + "working with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n", + "\n", + "the 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n", + "\n", + "Another challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed, /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf), List(2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n", + "\n", + "(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n", + "\n", + "governance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n", + "\n", + "Every organization is working to improve business outcomes while effectively managing a variety of risks —\n", + "\n", + "including economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n", + "\n", + "Your organization’s data and the systems that process it play a critical role in not only enabling your financial\n", + "\n", + "goals but also in minimizing these seven key business risks.\n", + "\n", + "Businesses have realized that their legacy information technology (IT) platforms are not able to scale and\n", + "\n", + "meet the increasing demands for better data analytics. As a result, they are looking to transform how their\n", + "\n", + "organizations use and process data. Successful data transformation initiatives for data, analytics and AI\n", + "\n", + "involve not only the design of hardware and software systems but also the alignment of people, processes\n", + "\n", + "and platforms. These initiatives always require a major financial investment and, therefore, need to yield a\n", + "\n", + "significant return on investment (ROI) — one that starts in months, not years.\n", + "\n", + "To guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\n", + "\n", + "Despite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\n", + "\n", + "to deliver on their data strategy — including how to deploy a modern data architecture, leverage data\n", + "\n", + "efficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\n", + "\n", + "identify and execute on AI opportunities.\n", + "\n", + "\n", + "-----\n", + "\n", + "To successfully lead data and AI transformation initiatives, organizations need to develop and execute\n", + "\n", + "a comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\n", + "\n", + "full potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\n", + "\n", + "organizations have the option of moving away from closed, proprietary systems offered by a variety\n", + "\n", + "of cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\n", + "\n", + "industry standards.\n", + "\n", + "At Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\n", + "\n", + "we’ve hired industry experts and thought leaders to help organizations better understand the steps involved\n", + "\n", + "in successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\n", + "\n", + "architecture, which decouples data storage from compute while providing the best price/performance\n", + "\n", + "metrics for all your data workloads — including data warehousing. We have captured the lessons learned\n", + "\n", + "and summarized them in this series of Executive Guides — which are designed to serve as blueprints for\n", + "\n", + "CIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\n", + "\n", + "initiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\n", + "\n", + "unified data platform that realizes the data lakehouse architecture and enables the data personas in your\n", + "\n", + "organization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\n", + "\n", + "shown in Figure 1.\n", + "\n", + "\n", + "###### Lakehouse Platform\n", + "\n", + "\n", + "Data\n", + "Warehousing\n", + "\n", + "\n", + "Data\n", + "Engineering\n", + "\n", + "\n", + "Data\n", + "Streaming\n", + "\n", + "\n", + "Data S�ien��\n", + "and ML\n", + "\n", + "\n", + "Unity Catalog\n", + "Fine-grained governance for data and AI\n", + "\n", + "Delta Lake\n", + "Data relia)ility and .erfor2ance\n", + "\n", + "Cloud Data Lake\n", + "All structured and unstructured data\n", + "\n", + "**Figure 1:**\n", + "The Databricks Lakehouse Platform\n", + "\n", + "\n", + "-----\n", + "\n", + "**The lakehouse architecture benefits organizations in several ways:**\n", + "\n", + "**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n", + "\n", + "**2.** \u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n", + "\n", + "**3.** \u0007It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf), List(organization. It’s worth pressure testing this approach and making sure that a) the requirements truly are\n", + "\n", + "unique and b) the development offers the competitive advantage that you need.\n", + "\n", + "Even software built on top of open source still requires significant investment in integration and testing.\n", + "\n", + "The integration work is particularly challenging because of the large number of open source libraries that\n", + "\n", + "are required in the data science space. The question becomes, “Is this really the area that you want your\n", + "\n", + "engineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n", + "\n", + "**How long will it take? 
Can the organization afford to wait?**\n", + "\n", + "Even if you decide the software component provides a competitive advantage and is something worth\n", + "\n", + "building in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n", + "\n", + "time-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n", + "\n", + "business due to the anticipated delivery schedule. Keep in mind that software development projects usually\n", + "\n", + "take longer and cost more money than initially planned.\n", + "\n", + "The organization should understand the impact to the overall performance and capabilities of the daily\n", + "\n", + "ecosystem for any features tied to the in-house development effort. Your business partners likely do\n", + "\n", + "not care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n", + "\n", + "is reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\n", + "\n", + "features and schedule.\n", + "\n", + "\n", + "Databricks is built on top of popular open source\n", + "\n", + "software that it created. Engineering teams can\n", + "\n", + "improve the underpinnings of the Databricks\n", + "\n", + "platform by submitting code via pull request and\n", + "\n", + "becoming committers to the projects. The benefit\n", + "\n", + "to organizations is that their engineers contribute\n", + "\n", + "to the feature set of the data platform while\n", + "\n", + "Databricks remains responsible for all integration\n", + "\n", + "and performance testing plus all the runtime\n", + "\n", + "support, including failover and disaster recovery.\n", + "\n", + "\n", + "-----\n", + "\n", + "**Don’t forget about the data**\n", + "\n", + "Perhaps the single most important feature of a modern data stack is its ability to help make data sets and\n", + "\n", + "“data assets” consumable to the end users or systems. Data insights, model training and model execution\n", + "\n", + "cannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n", + "\n", + "In large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n", + "\n", + "sets from multiple lines of business or departments. Focusing your data engineering and data science\n", + "\n", + "efforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n", + "\n", + "creating true competitive advantage.\n", + "\n", + "The amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n", + "\n", + "serve up data for analysis should not be underestimated. The value of this work is equally important to\n", + "\n", + "the business. The ability to curate data to enable game-changing insights should be the focus of the work\n", + "\n", + "led by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n", + "\n", + "engineers innovate on components that don’t bring true competitive advantage.\n", + "\n", + "\n", + "-----\n", + "\n", + "#### 9. Allocate, monitor and optimize costs\n", + "\n", + "Beginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n", + "\n", + "class of aircraft. 
This decision allowed the airline to save on both operations and maintenance — requiring\n", + "\n", + "only one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n", + "\n", + "more manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n", + "\n", + "case anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n", + "\n", + "and increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n", + "\n", + "related personas to collaborate and operate from the same point of view. Lessons learned on the platform\n", + "\n", + "could be easily shared and reused by other members of the team. The more the team used the unified\n", + "\n", + "platform, the more they collaborated and their level of expertise increased., /Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf))yesThe response states that the critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. It also mentions that IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, it suggests considering a unified data platform that enables seamless integration with point solutions. The retrieved context supports these points by stating that IT needs to keep control of data access and governance and architecture a customer data stack, while business needs to get customer data access via a no-code interface to generate insights and build customer experiences. The document also mentions the importance of a unified data platform for seamless integration. Therefore, the response is fully supported by the retrieved context.yesThe ground truth states four critical needs for IT and business when implementing a customer data platform. The retrieved context provides a section titled 'Critical IT Needs' and 'Critical Business Needs' which directly addresses these points. The retrieved context mentions that IT needs to 'keep control of data access and governance' and 'ability to architecture a customer data stack with decisions on where data is stored and where queries are executed,' which supports the first two points of the ground truth. For the business needs, the retrieved context states that business needs 'customer data access via a no-code interface to generate insights' and to 'build customer experiences and activate data within business applications,' which supports the last two points of the ground truth. Therefore, all parts of the ground truth are supported by the retrieved context.List(yes, no, no, no, no)1.0
Showing the first 16 rows.
" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "aggData": [], + "aggError": "", + "aggOverflow": false, + "aggSchema": [], + "aggSeriesLimitReached": false, + "aggType": "", + "arguments": {}, + "columnCustomDisplayInfos": {}, + "data": [ + [ + "21866cbed9a5ba0daafc9367a06f6679f7e6290dd05b59cfd45d36fdbc8fbe73", + [ + [ + [ + "Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?", + "user" + ] + ] + ], + [ + [ + "**EBOOK**\n\n## The Big Book of Data Engineering 2nd Edition\n\nA collection of technical\nblogs, including code\nsamples and notebooks\n\n##### With all-new content\n\n\n-----\n\n#### Contents\n\n**S E CTI ON 1** **Introduction to Data Engineering on Databricks** ............................................................................................................. **03**\n\n**S E CTI ON 2** **Guidance and Best Practices** ........................................................................................................................................................................... **10**\n\n**2 .1** Top 5 Databricks Performance Tips ................................................................................................................................................. 11\n\n**2 . 2** How to Profile PySpark ........................................................................................................................................................................ 16\n\n**2 . 3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka .......................................................... 20\n\n**2 . 4** Streaming in Production: Collected Best Practices ................................................................................................................... 25\n\n**2 . 5** Streaming in Production: Collected Best Practices, Part 2 ...................................................................................................... 32\n\n**2 .6** Building Geospatial Data Products ................................................................................................................................................. 37\n\n**2 .7** Data Lineage With Unity Catalog .................................................................................................................................................... 47\n\n**2 . 8** Easy Ingestion to Lakehouse With COPY INTO ............................................................................................................................ 50\n\n**2 .9** Simplifying Change Data Capture With Databricks Delta Live Tables .................................................................................. 57\n\n**2 .1 0** Best Practices for Cross-Government Data Sharing ................................................................................................................. 65\n\n**S E CTI ON 3** **Ready-to-Use Notebooks and Data Sets** ...................................................................................................................................... **74**\n\n**S E CTI ON 4** **Case Studies** ................................................................................................................................................................................................................................. **76**\n\n**4 . 
1** Akamai .................................................................................................................................................................................................... 77\n\n**4 . 2** Grammarly ........................................................................................................................................................................................... 80\n\n**4 . 3** Honeywell .............................................................................................................................................................................................. 84\n\n**4 . 4** Wood Mackenzie ................................................................................................................................................................................. 87\n\n**4 . 5** Rivian .................................................................................................................................................................................................... 90\n\n**4 . 6** AT&T ....................................................................................................................................................................................................... 94\n\n\n-----\n\n**SECTION**\n\n# 01\n\n\n### Introduction to Data Engineering on Databricks\n\n\n-----\n\nOrganizations realize the value data plays as a strategic asset for various\nbusiness-related initiatives, such as growing revenues, improving the customer\nexperience, operating efficiently or improving a product or service. However,\naccessing and managing data for these initiatives has become increasingly\ncomplex. Most of the complexity has arisen with the explosion of data volumes\nand data types, with organizations amassing an estimated [80% of data in](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n[unstructured and semi-structured format](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c) . As the collection of data continues\nto increase, 73% of the data goes unused for analytics or decision-making. In\norder to try and decrease this percentage and make more data usable, data\nengineering teams are responsible for building data pipelines to efficiently and\nreliably deliver data. But the process of building these complex data pipelines\ncomes with a number of difficulties:\n\n**•** In order to get data into a data lake, data engineers are required\nto spend immense time hand-coding repetitive data ingestion tasks\n\n**•** Since data platforms continuously change, data engineers\nspend time building and maintaining, and then rebuilding, complex\nscalable infrastructure\n\n**•** As data pipelines become more complex, data engineers are\nrequired to find reliable tools to orchestrate these pipelines\n\n**•** With the increasing importance of real-time data, low latency data\npipelines are required, which are even more difficult to build and maintain\n\n**•** Finally, with all pipelines written, data engineers need to constantly\nfocus on performance, tuning pipelines and architectures to meet SLAs\n\n\n**How can Databricks help?**\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. 
The Lakehouse Platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability to\ndrive valuable insights.\n\nLakehouse Platform\n\n**One platform to support multiple personas**\n\n\n**BI & Data**\n**Warehousing**\n\n\n**Data**\n**Engineering**\n\n\n**Data**\n**Streaming**\n\n\n**Data**\n**Science & ML**\n\n\n©2023 Databricks Inc. — All rights reserved\n\n\n**Unity Catalog**\n**Fine-grained governance for data and AI**\n\n**Delta Lake**\n**Data reliability and performance**\n\n**Cloud Data Lake**\n\nAll Raw Data (Logs, Texts, Audio, Video, Images)\n\n\nFigure 1\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n\n\n-----\n\n**Key differentiators for successful data engineering**\n**with Databricks**\n\nBy simplifying on a lakehouse architecture, data engineers need an\nenterprise-grade and enterprise-ready approach to building data pipelines.\nTo be successful, a data engineering solution team must embrace these eight\nkey differentiating capabilities:\n\n**Data ingestion at scale**\nWith the ability to ingest petabytes of data with auto-evolving schemas,\ndata engineers can deliver fast, reliable, scalable and automatic data for\nanalytics, data science or machine learning. This includes:\n\n**•** Incrementally and efficiently processing data as it arrives\nfrom files or streaming sources like Kafka, DBMS and NoSQL\n\n**•** Automatically inferring schema and detecting column\nchanges for structured and unstructured data formats\n\n**•** Automatically and efficiently tracking data as it arrives with\n\nno manual intervention\n\n**•** Preventing data loss by rescuing data columns\n\n\n**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----\n\n**Unified orchestration of data workflows**\nSimple, clear and reliable orchestration of data processing tasks for data,\nanalytics and machine learning pipelines with the ability to run multiple\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\nin a DAG using Databricks Workflows, an orchestration tool included in the\nlakehouse with no need to maintain or pay for an external orchestration service.\n\n**•** Easily create and manage multiple tasks with dependencies via UI,\nAPI or from your IDE\n\n**•** Have full observability to all workflow runs and get alerted when\ntasks fail for fast troubleshooting and efficient repair and rerun\n\n**•** Leverage high reliability of 99.95% uptime\n\n**•** Use performance optimization clusters that parallelize jobs and\nminimize data movement with cluster reuse\n\n**Data quality validation and monitoring**\nImprove data reliability throughout the data lakehouse so data teams can\nconfidently trust the information for downstream initiatives by:\n\n**•** Defining data quality and integrity controls within the pipeline\nwith defined data expectations\n\n**•** Addressing data quality errors with predefined policies\n(fail, drop, alert, quarantine)\n\n**•** Leveraging the data quality metrics that are captured, tracked\nand reported for the entire data pipeline\n\n\nData\nSources\n\nData\nWarehouses\n\nOn-premises\nSystems\n\nSaaS\nApplications\n\nMachine &\nApplication Logs\n\nApplication\nEvents\n\nMobile & IoT\nData\n\n\nCloud\nStorage\n\nMessag\ne Buses\n\n\n**Lakehouse Platform**\n\n**Workflows** for end-to-end orchestration\n\n\nReal-Time BI Apps\n\nReal-Time AI Apps\n\n\nReal-Time Analytics with\n**Databricks SQL**\n\nReal-Time Machine Learning\nwith\n**Databricks ML**\n\n\nStreaming ETL with\n**Delta Live Tables**\n\n\nPredictive\nMaintenance\n\n\nPersonalized\nOffers\n\n\nPatient\nDiagnostics\n\n\nReal-Time Operational\nApps\n\n\nReal-Time Applications with\n**Spark Structured Streaming**\n\n**Photon** for lightning-fast data processing\n\n**Unity Catalog** for data governance and sharing\n\n**Delta Lake** for open and reliable data storage\n\n\nAlerts Detection Fraud\n\n\nDynamic\nPricing\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 2\nA unified set of tools for real-time data processing\n\n\n-----\n\n**Fault tolerant and automatic recovery**\nHandle transient errors and recover from most common error conditions\noccurring during the operation of a pipeline with fast, scalable automatic\nrecovery that includes:\n\n**•** Fault tolerant mechanisms to consistently recover the state of data\n\n**•** The ability to automatically track progress from the source with\ncheckpointing\n\n**•** The ability to automatically recover and restore the data pipeline state\n\n**Data pipeline observability**\nMonitor overall data pipeline status from a dataflow graph dashboard and\nvisually track end-to-end pipeline health for performance, quality and latency.\nData pipeline observability capabilities include:\n\n**•** A high-quality, high-fidelity lineage diagram that provides visibility\ninto how data flows for impact analysis\n\n**•** Granular logging with performance and status of the data pipeline\nat a row level\n\n**•** Continuous monitoring of data pipeline jobs to ensure continued operation\n\n\n**Automatic deployments and operations**\nEnsure reliable and predictable delivery of data for analytics and machine\nlearning use cases by enabling easy and automatic data pipeline deployments\nand rollbacks to minimize downtime. 
Benefits include:\n\n**•** Complete, parameterized and automated deployment for the\ncontinuous delivery of data\n\n**•** End-to-end orchestration, testing and monitoring of data pipeline\ndeployment across all major cloud providers\n\n**Migrations**\nAccelerating and de-risking the migration journey to the lakehouse, whether\nfrom legacy on-prem systems or disparate cloud services.\n\nThe migration process starts with a detailed discovery and assessment to\nget insights on legacy platform workloads and estimate migration as well as\nDatabricks platform consumption costs. Get help with the target architecture\nand how the current technology stack maps to Databricks, followed by a\nphased implementation based on priorities and business needs. Throughout\nthis journey companies can leverage:\n\n**•** Automation tools from Databricks and its ISV partners\n\n**•** Global and/or regional SIs who have created Brickbuilder migration solutions\n\n**•** Databricks Professional Services and training\n\nThis is the recommended approach for a successful migration, whereby\ncustomers have seen a 25-50% reduction in costs and 2-3x faster time to value\nfor their use cases.\n\n\n-----\n\n**Unified governance**\nWith Unity Catalog, data engineering and governance teams benefit from an\nenterprisewide data catalog with a single interface to manage permissions,\ncentralize auditing, automatically track data lineage down to the column level,\nand share data across platforms, clouds and regions. Benefits:\n\n**•** Discover all your data in one place, no matter where it lives,\nand centrally manage fine-grained access permissions using an\nANSI SQL-based interface\n\n**•** Leverage automated column-level data lineage to perform impact\nanalysis of any data changes across the pipeline and conduct\nroot cause analysis of any errors in the data pipelines\n\n**•** Centrally audit data entitlements and access\n\n**•** Share data across clouds, regions and data platforms,\nwhile maintaining a single copy of your data in your cloud storage\n\n\n©2023 Databricks Inc. — All rights reserved\n\nFigure 3\nThe Databricks Lakehouse Platform integrates with a large collection of technologies\n\n\n**A rich ecosystem of data solutions**\nThe Databricks Lakehouse Platform is built on open source technologies and\nuses open standards so leading data solutions can be leveraged with anything\nyou build on the lakehouse. A large collection of technology partners make it\neasy and simple to integrate the technologies you rely on when migrating to\nDatabricks and to know you are not locked into a closed data technology stack.\n\n\n-----\n\n**Conclusion**\n\nAs organizations strive to become data-driven, data engineering is a focal\npoint for success. To deliver reliable, trustworthy data, data engineers shouldn’t\nneed to spend time manually developing and maintaining an end-to-end\nETL lifecycle. 
Data engineering teams need an efficient, scalable way to\nsimplify ETL development, improve data reliability and manage operations.\n\nAs described, the eight key differentiating capabilities simplify the\nmanagement of the ETL lifecycle by automating and maintaining all data\ndependencies, leveraging built-in quality controls with monitoring and by\nproviding deep visibility into pipeline operations with automatic recovery.\nData engineering teams can now focus on easily and rapidly building reliable\nend-to-end production-ready data pipelines using only SQL or Python\nfor batch and streaming that deliver high-value data for analytics, data\nscience or machine learning.\n\n\n**Follow proven best practices**\n\nIn the next section, we describe best practices for data engineering\nend-to end use cases drawn from real-world examples. From data ingestion\nand real-time processing to analytics and machine learning, you’ll learn\nhow to translate raw data into actionable data.\n\nAs you explore the rest of this guide, you can find data sets and code\nsamples in the various **[Databricks Solution Accelerators](https://www.databricks.com/solutions/accelerators)** , so you can\nget your hands dirty as you explore all aspects of the data lifecycle on the\nDatabricks Lakehouse Platform.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\n**SECTION**\n\n# 02\n\n\n### Guidance and Best Practices\n\n**2.1** Top 5 Databricks Performance Tips\n\n**2.2** How to Profile PySpark\n\n**2.3** Low-Latency Streaming Data Pipelines With Delta Live Tables and Apache Kafka\n\n**2.4** Streaming in Production: Collected Best Practices\n\n**2.5** Streaming in Production: Collected Best Practices, Part 2\n\n**2.6** Building Geospatial Data Products\n\n**2.7** Data Lineage With Unity Catalog\n\n**2.8** Easy Ingestion to Lakehouse With COPY INTO\n\n**2.9** Simplifying Change Data Capture With Databricks Delta Live Tables\n\n**2.10** Best Practices for Cross-Government Data Sharing\n\n\n-----\n\nSECTION 2.1\n\n**Top 5 Databricks Performance Tips**\n\nby **B R YA N S M I T H** and **R O B S A K E R**\n\nMarch 10, 2022\n\n\nAs solutions architects, we work closely with customers every day to help them\nget the best performance out of their jobs on Databricks — and we often end\nup giving the same advice. It’s not uncommon to have a conversation with a\ncustomer and get double, triple, or even more performance with just a few\ntweaks. So what’s the secret? How are we doing this? Here are the top 5 things\nwe see that can make a huge impact on the performance customers get\nfrom Databricks.\n\nHere’s a TLDR:\n\n**•** **Use larger clusters.** It may sound obvious, but this is the number\none problem we see. It’s actually not any more expensive to use a large\ncluster for a workload than it is to use a smaller one. It’s just faster.\nIf there’s anything you should take away from this article, it’s this.\n\nRead section 1. Really.\n\n**•** **Use** **[Photon](https://databricks.com/blog/2021/06/17/announcing-photon-public-preview-the-next-generation-query-engine-on-the-databricks-lakehouse-platform.html?itm_data=product-cta-announcingPhotonBlog)** , Databricks’ new, super-fast execution engine. Read section 2\nto learn more. You won’t regret it.\n\n\n\n**•** **Clean out your configurations** . Configurations carried from one\nApache Spark™ version to the next can cause massive problems. 
Clean up!\nRead section 3 to learn more.\n\n**•** **Use** **[Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html)** . There’s a good chance you’re not using caching\ncorrectly, if at all. See Section 4 to learn more.\n\n**•** **Be aware of lazy evaluation** . If this doesn’t mean anything to you and\nyou’re writing Spark code, jump to section 5.\n\n**•** **Bonus tip! Table design is super important** . We’ll go into this in a future\nblog, but for now, check out the [guide on Delta Lake best practices](https://docs.databricks.com/delta/best-practices.html) .\n\n**1. Give your clusters horsepower!**\n\nThis is the number one mistake customers make. Many customers create tiny\nclusters of two workers with four cores each, and it takes forever to do anything.\nThe concern is always the same: they don’t want to spend too much money on\nlarger clusters. Here’s the thing: **it’s actually not any more expensive to use a**\n**large cluster for a workload than it is to use a smaller one. It’s just faster.**\n\n\n-----\n\nThe key is that you’re renting the cluster for the length of the workload. So, if\nyou spin up that two worker cluster and it takes an hour, you’re paying for those\nworkers for the full hour. However, if you spin up a four worker cluster and it takes\nonly half an hour, the cost is actually the same! And that trend continues as long\nas there’s enough work for the cluster to do.\n\nHere’s a hypothetical scenario illustrating the point:\n\n**Number of Workers** **Cost Per Hour** **Length of Workload (hours)** **Cost of Workload**\n\n1 $1 2 $2\n\n2 $2 1 $2\n\n4 $4 0.5 $2\n\n8 $8 0.25 $2\n\nNotice that the total cost of the workload stays the same while the real-world\ntime it takes for the job to run drops significantly. So, bump up your Databricks\ncluster specs and speed up your workloads without spending any more money. It\n\ncan’t really get any simpler than that.\n\n**2. Use Photon**\n\nOur colleagues in engineering have rewritten the Spark execution engine in C++\nand dubbed it Photon. The results are impressive!\n\n\nBeyond the obvious improvements due to running the engine in native code,\nthey’ve also made use of CPU-level performance features and better memory\n\nmanagement. On top of this, they’ve rewritten the Parquet writer in C++. So this\nmakes writing to Parquet and Delta (based on Parquet) super fast as well!\n\nBut let’s also be clear about what Photon is speeding up. It improves\ncomputation speed for any built-in functions or operations, as well as writes to\nParquet or Delta. So joins? Yep! Aggregations? Sure! ETL? Absolutely! That UDF\n(user-defined function) you wrote? Sorry, but it won’t help there. The job that’s\nspending most of its time reading from an ancient on-prem database? Won’t\nhelp there either, unfortunately.\n\n\n-----\n\nThe good news is that it helps where it can. So even if part of your job can’t be\nsped up, it will speed up the other parts. Also, most jobs are written with the\nnative operations and spend a lot of time writing to Delta, and Photon helps a lot\nthere. So give it a try. You may be amazed by the results!\n\n**3. Clean out old configurations**\n\nYou know those Spark configurations you’ve been carrying along from version to\nversion and no one knows what they do anymore? They may not be harmless.\nWe’ve seen jobs go from running for hours down to minutes simply by cleaning\nout old configurations. 
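If you are not sure what you are actually carrying along, a quick way to audit the settings is to dump everything explicitly set on the session. This is a minimal sketch using standard PySpark APIs (run it from a notebook or job on the cluster in question):

```python
from pyspark.sql import SparkSession

spark = SparkSession.builder.getOrCreate()

# Every Spark setting that has been explicitly set on this cluster/session.
# Anything nobody on the team can explain is a candidate for removal.
for key, value in sorted(spark.sparkContext.getConf().getAll()):
    print(f"{key} = {value}")
```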
There may have been a quirk in a particular version of\nSpark, a performance tweak that has not aged well, or something pulled off\nsome blog somewhere that never really made sense. At the very least, it’s worth\nrevisiting your Spark configurations if you’re in this situation. Often the default\nconfigurations are the best, and they’re only getting better. Your configurations\nmay be holding you back.\n\n**4. The Delta Cache is your friend**\n\nThis may seem obvious, but you’d be surprised how many people are not using\nthe [Delta Cache](https://docs.databricks.com/delta/optimizations/delta-cache.html) , which loads data off of cloud storage (S3, ADLS) and keeps it on\nthe workers’ SSDs for faster access.\n\n\nIf you’re using Databricks SQL Endpoints you’re in luck. Those have caching on\nby default. In fact, we recommend using [CACHE SELECT * FROM table](https://docs.databricks.com/spark/latest/spark-sql/language-manual/delta-cache.html) to preload\nyour “hot” tables when you’re starting an endpoint. This will ensure blazing fast\nspeeds for any queries on those tables.\n\nIf you’re using regular clusters, be sure to use the i3 series on Amazon Web\nServices (AWS), L series or E series on Azure Databricks, or n2 in GCP. These will\nall have fast SSDs and caching enabled by default.\n\nOf course, your mileage may vary. If you’re doing BI, which involves reading the\nsame tables over and over again, caching gives an amazing boost. However, if\nyou’re simply reading a table once and writing out the results as in some ETL\njobs, you may not get much benefit. You know your jobs better than anyone.\nGo forth and conquer.\n\n\n-----\n\n**5. Be aware of lazy evaluation**\n\n\nHowever, there is a catch here. Every time you try to display or write out\nresults, it runs the execution plan again. Let’s look at the same block of code\nbut extend it and do a few more operations.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n.filter(...)\n)\n\n_# Now run the execution plan to get results_\ndf2.display()\n\n_# Unfortunately this will run the plan again, including filtering, joining,_\n_etc_\ndf2.display()\n\n_# So will this…_\ndf2.count()\n—------\n\n\nIf you’re a data analyst or data scientist only using SQL or doing BI you can skip\nthis section. However, if you’re in data engineering and writing pipelines or doing\nprocessing using Databricks/Spark, read on.\n\nWhen you’re writing Spark code like select, groupBy, filter, etc., you’re really\nbuilding an execution plan. You’ll notice the code returns almost immediately when\nyou run these functions. That’s because it’s not actually doing any computation. So\neven if you have petabytes of data, it will return in less than a second.\n\nHowever, once you go to write your results out you’ll notice it takes longer. This\nis due to lazy evaluation. It’s not until you try to display or write results that your\nexecution plan is actually run.\n\n—-------\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n\n\n_# Now run the execution plan to get results_\ndf2.display()\n—------\n\n\n-----\n\nThe developer of this code may very well be thinking that they’re just printing\nout results three times, but what they’re really doing is kicking off the same\nprocessing three times. Oops. That’s a lot of extra work. This is a very common\nmistake we run into. 
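To make the behavior concrete, here is a small self-contained sketch with synthetic data (sizes and column names are illustrative only); each action below re-runs the same plan:

```python
from pyspark.sql import SparkSession, functions as F

spark = SparkSession.builder.getOrCreate()

events = spark.range(1_000_000).withColumn("value", F.rand())
lookup = spark.range(1_000_000).withColumn("category", F.col("id") % 10)

# Building the plan is nearly instant -- no data is processed here
df2 = events.join(lookup, "id").filter(F.col("value") > 0.5)

df2.explain()                     # prints the plan; still no computation
df2.count()                       # runs the full join + filter
df2.agg(F.sum("value")).show()    # runs the full join + filter again
```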
So why is there lazy evaluation, and what do we do about it?\n\nIn short, processing with lazy evaluation is way faster than without it.\nDatabricks/Spark looks at the full execution plan and finds opportunities\nfor optimization that can reduce processing time by orders of magnitude.\nSo that’s great, but how do we avoid the extra computation? The answer\nis pretty straightforward: save computed results you will reuse.\n\n\nThis works especially well when [Delta Caching](https://docs.databricks.com/delta/optimizations/delta-cache.html) is turned on. In short, you\nbenefit greatly from lazy evaluation, but it’s something a lot of customers trip\nover. So be aware of its existence and save results you reuse in order to avoid\nunnecessary computation.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\nLet’s look at the same block of code again, but this time let’s avoid the\nrecomputation:\n\n_# Build an execution plan._\n_# This returns in less than a second but does no work_\ndf2 = (df\n.join(...)\n.select(...)\n. filter (...)\n)\n\n_# save it_\ndf2.write.save(path)\n\n_# load it back in_\ndf3 = spark.read.load(path)\n\n_# now use it_\ndf3.display()\n\n_# this is not doing any extra computation anymore. No joins, filtering,_\n_etc. It’s already done and saved._\ndf3.display()\n\n_# nor is this_\ndf3.count()\n\n\n-----\n\nSECTION 2.2 \u0007\n\n**How to Profile PySpark**\n\nby **X I N R O N G M E N G , TA K U YA U E S H I N , H Y U K J I N K W O N** and **A L L A N F O LT I N G**\n\nOctober 6, 2022\n\n\nIn Apache Spark™, declarative Python APIs are supported for big data workloads.\nThey are powerful enough to handle most common use cases. Furthermore,\nPySpark UDFs offer more flexibility since they enable users to run arbitrary\nPython code on top of the Apache Spark™ engine. Users only have to state\n“what to do”; PySpark, as a sandbox, encapsulates “how to do it.” That makes\nPySpark easier to use, but it can be difficult to identify performance bottlenecks\nand apply custom optimizations.\n\nTo address the difficulty mentioned above, PySpark supports various profiling\ntools, which are all based on [cProfile](https://docs.python.org/3/library/profile.html#module-cProfile) , one of the standard Python [profiler](https://docs.python.org/3/library/profile.html)\n[implementations](https://docs.python.org/3/library/profile.html) . PySpark Profilers provide information such as the number\nof function calls, total time spent in the given function, and filename, as well\nas line number to help navigation. That information is essential to exposing\ntight loops in your PySpark programs, and allowing you to make performance\n\nimprovement decisions.\n\n\n**Driver profiling**\n\nPySpark applications run as independent sets of processes on a cluster,\ncoordinated by the SparkContext object in the driver program. On the driver\nside, PySpark is a regular Python process; thus, we can profile it as a normal\nPython program using cProfile as illustrated below:\n\nimport cProfile\n\nwith cProfile.Profile() as pr:\n_# Your code_\n\npr.print_stats()\n\n**Workers profiling**\n\nExecutors are distributed on worker nodes in the cluster, which introduces\ncomplexity because we need to aggregate profiles. 
Furthermore, a Python worker\nprocess is spawned per executor for PySpark UDF execution, which makes the\nprofiling more intricate.\n\n\n-----\n\nThe UDF profiler, which is introduced in Spark 3.3, overcomes all those obstacles\nand becomes a major tool to profile workers for PySpark applications. We’ll\nillustrate how to use the UDF profiler with a simple Pandas UDF example.\n\nFirstly, a PySpark DataFrame with 8,000 rows is generated, as shown below.\n```\n sdf = spark.range( 0 , 8 * 1000 ).withColumn(\n 'id' , (col( 'id' ) % 8 ). cast ( 'integer' ) # 1000 rows x 8 groups (if group\n by 'id' )\n ).withColumn( 'v' , rand())\n\n```\nLater, we will group by the id column, which results in 8 groups with 1,000 rows\nper group.\n\nThe Pandas UDF plus_one is then created and applied as shown below:\n```\n import pandas as pd\n def plus_one ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf.apply( lambda x: x + 1 , axis= 1 )\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one, schema=sdf.schema)\n res.collect()\n\n```\n\nExecuting the example above and running sc.show_profiles() prints the\nfollowing profile. The profile below can also be dumped to disk by sc.dump_\nprofiles(path).\n\nThe UDF id in the profile (271, highlighted above) matches that in the Spark plan\nfor res. The Spark plan can be shown by calling res.explain() .\n\n\nNote that plus_one takes a pandas DataFrame and returns another pandas\nDataFrame. For each group, all columns are passed together as a pandas\nDataFrame to the plus_one UDF, and the returned pandas DataFrames are\ncombined into a PySpark DataFrame.\n\n\n-----\n\nThe first line in the profile’s body indicates the total number of calls that were\nmonitored. The column heading includes\n\n**•** ncalls , for the number of calls.\n\n**•** tottime , for the total time spent in the given function (excluding time\nspent in calls to sub-functions)\n\n**•** percall , the quotient of tottime divided by ncalls\n\n**•** cumtime , the cumulative time spent in this and all subfunctions (from\ninvocation till exit)\n\n**•** percall , the quotient of cumtime divided by primitive calls\n\n**•** filename:lineno(function) , which provides the respective information\nfor each function\n\nDigging into the column details: plus_one is triggered once per group, 8 times\nin total; _arith_method of pandas Series is called once per row, 8,000 times\nin total. pandas.DataFrame.apply applies the function lambda x: x + 1 row by\nrow, thus suffering from high invocation overhead.\n\nWe can reduce such overhead by substituting the pandas.DataFrame.apply\nwith pdf + 1, which is vectorized in pandas. The optimized Pandas UDF looks as\nfollows:\n```\n import pandas as pd\n def plus_one_optimized ( pdf: pd.DataFrame ) -> pd.DataFrame:\n return pdf + 1\n res = sdf.groupby( \"id\" ).applyInPandas(plus_one_optimized, schema=sdf.\n schema)\n res.collect()\n\n```\n\nThe updated profile is as shown below.\n\nWe can summarize the optimizations as follows:\n\n**•** Arithmetic operation from 8,000 calls to 8 calls\n\n**•** Total function calls from 2,898,160 calls to 2,384 calls\n\n**•** Total execution time from 2.300 seconds to 0.004 seconds\n\nThe short example above demonstrates how the UDF profiler helps us deeply\nunderstand the execution, identify the performance bottleneck and enhance\nthe overall performance of the user-defined function.\n\nThe UDF profiler was implemented based on the executor-side profiler,\nwhich is designed for PySpark RDD API. 
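As an aside, the executor-side profiler can be exercised directly with the RDD API. The sketch below assumes spark.python.profile has already been enabled on the cluster (enabling it is covered below) and that it runs in a Databricks notebook where sc is predefined:

```python
# Rough sketch: profile arbitrary Python functions running on the workers via the RDD API.
rdd = sc.parallelize(range(10_000))

def slow_square(x):
    # deliberately wasteful so it shows up clearly in the profile
    return sum(i * i for i in range(100)) + x * x

rdd.map(slow_square).count()

# Print the aggregated profiles collected from the Python workers
sc.show_profiles()
```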
The executor-side profiler is available\nin all active Databricks Runtime versions.\n\n\n-----\n\nBoth the UDF profiler and the executor-side profiler run on Python workers.\nThey are controlled by the spark.python.profile Spark configuration, which\nis false by default. We can enable that Spark configuration on a Databricks\nRuntime cluster as shown below.\n\n\n**Conclusion**\n\nPySpark profilers are implemented based on cProfile; thus, the profile reporting\nrelies on the [Stats](https://docs.python.org/3/library/profile.html#the-stats-class) class. [Spark Accumulators](https://spark.apache.org/docs/latest/rdd-programming-guide.html#accumulators) also play an important role when\ncollecting profile reports from Python workers.\n\nPowerful profilers are provided by PySpark in order to identify hot loops and\nsuggest potential improvements. They are easy to use and critical to enhance\nthe performance of PySpark programs. The UDF profiler, which is available\nstarting from Databricks Runtime 11.0 (Spark 3.3), overcomes all the technical\nchallenges and brings insights to user-defined functions.\n\nIn addition, there is an ongoing effort in the Apache Spark™ open source\ncommunity to introduce memory profiling on executors; see [SPARK-40281](https://issues.apache.org/jira/browse/SPARK-40281) for\nmore information.\n\n**Start experimenting with these**\n**free Databricks** **notebooks** **.**\n\n\n-----\n\nSECTION 2.3 \u0007\n\n**Low-Latency Streaming Data Pipelines With Delta Live Tables**\n**and Apache Kafka**\n\nby **F R A N K M U N Z**\n\nAugust 9, 2022\n\n\n[Delta Live Tables (DLT)](https://databricks.com/product/delta-live-tables) is the first ETL framework that uses a simple declarative\napproach for creating reliable data pipelines and fully manages the underlying\ninfrastructure at scale for batch and [streaming data](https://www.databricks.com/product/data-streaming) . Many use cases require\nactionable insights derived from near real-time data. Delta Live Tables enables\nlow-latency streaming data pipelines to support such use cases with low\nlatencies by directly ingesting data from event buses like [Apache Kafka](https://kafka.apache.org/) , [AWS](https://aws.amazon.com/kinesis/)\n[Kinesis](https://aws.amazon.com/kinesis/) , [Confluent Cloud](https://www.confluent.io/confluent-cloud) , [Amazon MSK](https://www.youtube.com/watch?v=HtU9pb18g5Q) , or [Azure Event Hubs](https://docs.microsoft.com/en-us/azure/event-hubs/) .\n\nThis article will walk through using DLT with Apache Kafka while providing the\nrequired Python code to ingest streams. The recommended system architecture\nwill be explained, and related DLT settings worth considering will be explored\nalong the way.\n\n**Streaming platforms**\n\nEvent buses or message buses decouple message producers from consumers.\nA popular streaming use case is the collection of click-through data from\nusers navigating a website where every user interaction is stored as an event in\n\n\nApache Kafka. The event stream from Kafka is then used for real-time streaming\ndata analytics. Multiple message consumers can read the same data from Kafka\nand use the data to learn about audience interests, conversion rates, and bounce\nreasons. The real-time, streaming event data from the user interactions often\nalso needs to be correlated with actual purchases stored in a billing database.\n\n**Apache Kafka**\n\n[Apache Kafka](https://kafka.apache.org/) is a popular open source event bus. 
Kafka uses the concept of a\ntopic, an append-only distributed log of events where messages are buffered for\na certain amount of time. Although messages in Kafka are not deleted once they\nare consumed, they are also not stored indefinitely. The message retention for\n\nKafka can be configured per topic and defaults to 7 days. Expired messages will\nbe deleted eventually.\n\nThis article is centered around Apache Kafka; however, the concepts discussed\nalso apply to many other event busses or messaging systems.\n\n\n-----\n\n**Streaming data pipelines**\n\n\nIn a data flow pipeline, Delta Live Tables and their dependencies can be declared\nwith a standard SQL Create Table As Select (CTAS) statement and the DLT\nkeyword “live.”\n\nWhen developing DLT with Python, the @dlt.table decorator is used to", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ] + ], + [ + "Larger clusters execute workloads faster in Databricks.", + "The faster execution reduces the total time required for workload completion.", + "The overall cost efficiency is balanced due to reduced workload completion time despite higher hourly costs." + ], + "{\"info\": {\"request_id\": \"tr-fdd84cee84c14b1cbd05fef9afda0573\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852705, \"execution_time_ms\": 1874, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"0928244e-ca9f-4d04-839e-afa0c6c57ecc\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-fdd84cee84c14b1cbd05fef9afda0573/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x1c2b5d55408ec680\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": null, \"start_time\": 1734543852705036571, \"end_time\": 1734543854579582448, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", 
\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xc378632f7d05e4e5\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0x1c2b5d55408ec680\", \"start_time\": 1734543852723331103, \"end_time\": 1734543852888574298, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xa3dbaf3bb677995f\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0x1c2b5d55408ec680\", \"start_time\": 1734543852888704900, \"end_time\": 1734543852889256407, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x0027ea0bf61b6abb\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0xa3dbaf3bb677995f\", \"start_time\": 1734543852888865002, \"end_time\": 1734543852889028004, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xb720815986795f4f\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0xa3dbaf3bb677995f\", \"start_time\": 1734543852889092305, \"end_time\": 1734543852889221106, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}]}\", 
\"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x456d9ec6e5fd8501\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0x1c2b5d55408ec680\", \"start_time\": 1734543852889318008, \"end_time\": 1734543855416920308, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543855416856807, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File 
\\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n 
return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0xb9e384e320bcec52\", \"trace_id\": \"0x54a70916a31303468f8a3720ca4e6836\"}, \"parent_id\": \"0x456d9ec6e5fd8501\", \"start_time\": 1734543853073605944, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-fdd84cee84c14b1cbd05fef9afda0573\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854579418, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is it recommended to use lar...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'Why is it recommended to use larger clusters for workloads in Databricks, and how does this affect cost efficiency?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "088c4943384eaa6a228c3d68ff70fbef6bcbe9c50176180e73244de1d7f3be1a", + [ + [ + [ + "What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?", + "user" + ] + ] + ], + [ + [ + "```\nTECHNICAL GUIDE\n\n```\n\n# Solving Common Data Challenges \n\n\n#### Startups and Digital Native Businesses\n\n\n-----\n\n### Table of Contents\n\n\n# 01\n```\nCHALLENGE:\n \u0003\n\n###### Creating a unified data architecture for data quality, governance and efficiency\n\n# 03\nCHALLENGE:\n \u0003\n\n###### Building effective machine learning operations\n\n```\n\n# 02\n```\nCHALLENGE:\n \u0003\n\n###### Building a data architecture to support scale and performance\n\n# 04\nSUMMARY:\n\n###### The Databricks Lakehouse Platform addresses these challenges\n\n```\n\n-----\n\n**I N T R O D U C T I O N**\n\n\nThis guide shares how the lakehouse architecture can increase\nproductivity and cost-efficiently support all your data, analytics\nand AI workloads, and flexibly scale with the pace of growth\nfor your company. Read the entire guide or dive straight into a\nspecific challenge.\n\nWith the advent of cloud infrastructure, a new generation of\nstartups has rapidly built and scaled their businesses. The use of\ncloud infrastructure, once seen as innovative, has now become\ntable stakes. The differentiator for the fastest-moving startups\nand digital natives now comes from the effective use of data\nat scale, primarily analytics and AI. Digital natives — defined\nas fast-moving, lean, and technically savvy, born-in-the-cloud\norganizations — are beginning to focus on new data-driven use\ncases such as real-time machine learning and personalized\ncustomer experiences.\n\nTo pursue these new data-intensive use cases and initiatives,\norganizations must look beyond the technologies that delivered\nthem to this point in time. Over time, these technologies, such\nas transactional databases, streaming/batch pipelines and firstgeneration analytics engines, have led to brittle\n\nThis guide examines some of the biggest data challenges and\nsolutions for startups and for scaling digital native businesses\nthat have reached the point where an end-to-end modern data\nplatform is a smart investment. 
Some key considerations include:\nsystems that are not cost-efficient and require time-consuming\nadministration and engineering toil. In addition to growing\nmaintenance needs, data is often stored in disparate locations\nand formats, with little or no governance, making real-time use\ncases, analytics and AI difficult or impossible.\n\n\n**Consolidating on a unified data platform**\nAs mentioned above, siloed data storage and management add administrative and\nfinancial cost. You can benefit significantly when you unify your data in one location\nwith a flexible architecture that scales with your needs and delivers performance\nfor future success. For this, you will want an open platform that supports all your\ndata including batch and streaming workloads, data analytics and machine learning.\nWith data unification, you create a more efficient, integrated approach to ingesting,\ncleaning and organizing your data. You also need automation to make data analysis\neasier for the nontechnical users in the company. But broader data access also\nmeans more focus on security, privacy, compliance and access control, which can\ncreate overhead for a growing.\n\n**Scaling up capacity and increasing performance**\n**and usability of the data solutions**\nData teams at growing digital native organizations find it time intensive and costly to\nhandle the growing volume and velocity of their data being ingested from multiple\nsources, across multiple clouds. You now need a unified and simplified platform that\ncan instantly scale up capacity and deliver more computing power on demand to\nfree up your data teams to produce outputs more quickly. This lowers the total cost\nfor the overall infrastructure by eliminating redundant licensing, infrastructure and\nadministration costs.\n\n**Building effective machine learning operations**\nFor data teams beginning their machine learning journeys, the challenge of training\ndata models can increase in management complexity. Many teams with disparate\ncoding needs for the entire model lifecycle suffer inefficiencies from transferring\ndata and code across many separate services. To build and manage effective\nML operations, consider an end-to-end MLOps environment that brings all data\ntogether in one place and incorporates managed services for experiment tracking,\nmodel training, feature development and feature and model serving.\n\n\n-----\n\n# 01\n```\nCHALLENGE: \u0003\n\n## Create a unified data architecture for data quality, governance and efficiency\n\n```\n\n-----\n\n```\nCHALLENGE 01\n\n### Create a unified data architecture for data quality, governance and efficiency\n\n```\nAs cloud-born companies grow, data volumes rapidly increase, leading to new\nchallenges and use cases. Among the challenges:\n\n\nApplication stacks optimized for transaction\nuse cases aren’t able to handle the volume,\nvelocity and variety of data that modern data\nteams require. For example, this leads to query\nperformance issues as data volume grows.\n\nData silos develop as each team within an\norganization chooses different ETL/ELT and\nstorage solutions for their needs. As the\norganization grows and changes, these pipelines\nand storage solutions become brittle, hard to\nmaintain and nearly impossible to integrate.\n\n\nThese data silos lead to discoverability,\nintegration and access issues, which prevent\nteams from leveraging the full value of the\norganization’s available data.\n\nData governance is hard. 
Disparate ETL/ELT\nand storage solutions lead to governance,\ncompliance, auditability and access control\nchallenges, which expose organizations to\ntremendous risk.\n\n\nThe Databricks Lakehouse Platform provides\na unified set of tools for building, deploying,\nsharing and maintaining data solutions at scale.\nIt integrates with cloud storage and the security\nin your cloud account, manages and deploys\ncloud infrastructure on your behalf. Your data\npractitioners no longer need separate storage\nsystems for their data. And you don’t have to rely\non your cloud provider for security. The lakehouse\nhas its own robust security built into the platform.\n\n\nFor all the reasons above, the most\nconsistent advice from successful data\npractitioners is to create a “single source\nof truth” by unifying all data on a single\nplatform. With the Databricks Lakehouse\nPlatform, you can unify all your data on one\nplatform, reducing data infrastructure costs\nand compute. You don’t need excess data\ncopies and you can retire expensive\nlegacy infrastructure.\n```\n 01\n\n```\n\n-----\n\n```\nCUSTOMER STORY: GRAMMARLY\n\n### Helping 30 million people and 50,000 teams communicate more effectively\n\n```\n\nWhile its business is based on analytics, [Grammarly](http://www.grammarly.com)\n\nfor many years relied on a homegrown analytics\n\nplatform to drive its AI writing assistant to\n\nhelp users improve multiple aspects of written\n\ncommunications. As teams developed their own\n\nrequirements, data silos inevitably emerged as\n\ndifferent business areas implemented analytics\n\ntools individually.\n\n“Every team decided to solve their analytics\n\nneeds in the best way they saw fit,” said Chris\n\nLocklin, Engineering Manager, Data Platforms,\n\nat Grammarly. “That created challenges in\n\nconsistency and knowing which data set\n\nwas correct.”\n\nTo better scale and improve data storage and\n\nquery capabilities, Grammarly brought all its\n\nanalytical data into the Databricks Lakehouse\n\nPlatform and created a central hub for all data\n\nproducers and consumers across the company.\n\nGrammarly had several goals with the lakehouse,\n\nincluding better access control, security, ingestion\n\n\nflexibility, reducing costs and fueling collaboration. “Access control in a\n\ndistributed file system is difficult, and it only gets more complicated as\n\nyou ingest more data sources,” said Locklin. To manage access control,\n\nenable end-to-end observability and monitor data quality, Grammarly\n\nrelies on the data lineage capabilities within Unity Catalog. “Data lineage\n\nallows us to effectively monitor usage of our data and ensure it upholds\n\nthe standards we set as a data platform team,” said Locklin. “Lineage is\n\nthe last crucial piece for access control.”\n\nData analysts within Grammarly now have a consolidated interface for\n\nanalytics, which leads to a single source of truth and confidence in the\n\naccuracy and availability of all data managed by the data platform team.\n\nHaving a consistent data source across the company also resulted in\n\ngreater speed and efficiency and reduced costs. Data practitioners\n\nexperienced 110% faster querying at 10% of the cost to ingest compared\n\nto a data warehouse. Grammarly can now make its 5 billion daily events\n\navailable for analytics in under 15 minutes rather than 4 hours. 
Migrating\n\noff its rigid legacy infrastructure gave Grammarly the flexibility to do\n\nmore and the confidence that the platform will evolve with its needs.\n\nGrammarly is now able to sustain a flexible, scalable and highly secure\n\nanalytics platform that helps 30 million people and 50,000 teams\n\nworldwide write more effectively every day.\n\n[Read the full story here.](https://www.databricks.com/customers/grammarly)\n\n\n-----\n\n###### How to unify the data infrastructure with Databricks\n\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) architecture\nis composed of two primary parts:\n\n- The infrastructure to deploy, configure and\nmanage the platform and services\n\n\nYou can build a Databricks workspace by configuring\nsecure integrations between the Databricks platform\nand your cloud account, and then Databricks deploys\ntemporary Apache Spark™/Photon clusters using cloud\nresources in your account to process and store data\nin object storage and other integrated services you\ncontrol. Here are three steps to get started with the\nDatabricks Lakehouse Platform:\n\n**Understand the architecture**\nThe lakehouse provides a unified architecture,\nmeaning that all data is stored in the same\naccessible place. The diagram shows how data\ncomes in from sources like a customer relationship\nmanagement (CRM) system, an enterprise resource\nplanning (ERP) system, websites or unstructured\ncustomer emails.\n\n**Optimize the storage layer**\nAll data is stored in cloud storage while Databricks\nprovides tooling to assist with ingestion, such as\nAuto Loader, and we recommend [open-source](https://delta.io/)\n[Delta Lake](https://docs.databricks.com/delta/index.html) as the storage format of choice.\nDelta optimized storage layer that provides the\nfoundation for storing data and tables in the\nDatabricks Lakehouse Platform. Having all your\ndata in the same optimized, open storage keeps\nall your use cases in the same place, thus enabling\ncollaboration and removing software tool overhead.\n\n\n\n- the customer-owned infrastructure managed in\ncollaboration by Databricks and the customer.\n\n\nThe lakehouse handles all varieties of data (structured, semi-structured, unstructured),\nas well as all velocities of data (streaming, batch or somewhere in the middle).\n\n[Sign up for a free trial](https://www.databricks.com/try-databricks#account) account with the instructions on the [get started page.](https://docs.databricks.com/getting-started/index.html)\n\n\n-----\n\nThe Databricks Lakehouse organizes data stored with Delta Lake in cloud object\nstorage with familiar concepts like database, tables and views. Delta Lake extends\nParquet data files with a file-based transaction log for [ACID transactions](https://docs.databricks.com/lakehouse/acid.html) and\nscalable metadata handling. 
Delta Lake is fully compatible with Apache Spark APIs,\nand was developed for tight integration with Structured Streaming, allowing you to\neasily use a single copy of data for both batch and streaming operations to provide\nincremental processing at scale.This model combines many of the benefits of a data\nwarehouse with the scalability and flexibility of a data lake.\n\nTo learn more about the optimized storage layer that provides the foundation for\nstoring data and tables in the Databricks Lakehouse Platform, see [Getting started](https://docs.databricks.com/getting-started/delta.html)\n[with Delta Lake](https://docs.databricks.com/getting-started/delta.html) [.](https://docs.databricks.com/getting-started/delta.html)\n\nThe first step in unifying your data architecture is setting up how data is to be\naccessed and used across the organization. We’ll discuss this as a series of steps:\n\n**1** Set up governance with Unity Catalog\n\n**2** Grant secure access to the data\n\n\n###### “Delta Lake provides us with a single source of truth for all of our data,” said Stone. “Now our data engineers are able to build reliable data pipelines that thread the needle on key topics, such as inventory management, allowing us to identify in near real-time what our trends are so we can figure out how to effectively move inventory.”\n – Jake Stone, Senior Manager, Business Analytics at ButcherBox \n\n[Learn more](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\n**3** Capture audit logs\n\n**4** View data lineage\n\n**5** Set up data sharing\n\n\n-----\n\n**Configure unified governance**\nDatabricks recommends using catalogs to provide an easily searchable inventory of data, notebooks, dashboards and models. Often this\nmeans that catalogs can correspond to software development environment scope, team or business unit. [Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/get-started.html) manages how data\nis secured, accessed and shared. Unity Catalog offers a single place to administer data access policies that apply across all workspace and\npersonas and automatically captures user-level audit logs that record access to your data.\n\nData stewards can securely grant access to a broad set of users to discover and analyze data at scale. These users can use a variety of\nlanguages and tools, including SQL and Python, to create derivative data sets, models and dashboards that can be shared across teams.\n\nTo set up Unity Catalog for your organization,\nyou do the following:\n\n\n**1** Configure an S3 bucket and IAM role that\nUnity Catalog can use to store and access\ndata in your AWS account.\n\n**2** Create a metastore for each region in\n\nwhich your organization operates, and\nattach workspaces to the metastore. 
Each\nworkspace will have the same view of the\ndata you manage in Unity Catalog.\n\n\n**3** If you have a new account, add users,\ngroups and service principals to your\nDatabricks account.\n\n**4** Next, create and grant access to\n\ncatalogs, schemas and tables.\n\n\nFor complete setup instructions, see [Get started using Unity Catalog.](https://docs.databricks.com/data-governance/unity-catalog/get-started.html#:~:text=To%20enable%20your%20Databricks%20account%20to%20use%20Unity,Transfer%20your%20metastore%20admin%20role%20to%20a%20group.)\n\n\n-----\n\n###### How Unity Catalog works\n\n\nYou will notice that the hierarchy of primary data\nobjects in Unity Catalog flows from metastore to table:\n\n**Metastore** is the top-level container for metadata.\nEach metastore exposes a three-level namespace\n(catalog.schema.table) that organizes your data.\n\n\n**Metastore** **Catalog** **Schemas**\n\n\n**Views**\n\n**Managed**\n**Tables**\n\n\n**Catalog** is the first layer of the object hierarchy, used\nto organize your data assets.\n\n\n**Schemas** , also known as databases, are the second\nlayer of the object hierarchy and contain tables and\nviews.\n\n**Table** is the lowest level in the object hierarchy, and\ntables can be external (stored in external locations in\nyour cloud storage of choice) or managed (stored in a\nstorage container in your cloud storage that you create\n\nexpressly for Databricks). You can also create readonly **Views** from tables.\n\n\n**External**\n**tables**\n\nThe diagram below represents the file system\nhierarchy of a single storage bucket:\n\n\n-----\n\nUnity Catalog uses the identities in the Databricks\naccount to resolve users, service principals, and groups\nand to enforce permissions. To configure identities in\nthe account, follow the instructions in [Manage users,](https://docs.databricks.com/administration-guide/users-groups/index.html)\n[service principals, and groups](https://docs.databricks.com/administration-guide/users-groups/index.html) . Refer to those users,\nservice principals, and groups when you create\n[access-control policies](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/index.html) in Unity Catalog.\n\nUnity Catalog users, service principals, and groups\nmust also be added to workspaces to access Unity\nCatalog data in a notebook, a Databricks SQL query,\nData Explorer or a REST API command. The assignment\nof users, service principals, and groups to workspaces\nis called identity federation. All workspaces attached\nto a Unity Catalog metastore are enabled for identity\nfederation.\n\nSecurable objects in Unity Catalog are hierarchical,\nmeaning that granting a privilege on a catalog or schema\nautomatically grants the privilege to all current and\nfuture objects within the catalog or schema. For more\non granting privileges, see the [Inheritance model](https://docs.databricks.com/data-governance/unity-catalog/manage-privileges/privileges.html#inheritance) .\nA common scenario is to set up a schema per team\nwhere only that team has USE SCHEMA and CREATE on\nthe schema. This means that any tables produced by\nteam members can only be shared within the team.\nData Explorer uses the privileges configured by Unity\nCatalog administrators to ensure that users are only\nable to see catalogs, databases, tables and views that\nthey have permission to query.\n\n\n[Databricks Data Explorer](https://docs.databricks.com/data/index.html) is the main user interface for\nmany Unity Catalog features. 
Use Data Explorer to view\nschema details, preview sample data, and see table\ndetails and properties. Administrators can view and\nchange owners. Admins and data object owners can grant\nand revoke permissions through this interface.\n\n**Set up secure access**\nIn Unity Catalog, data is secure by default. Initially, users\nhave no access to data in a metastore. Access can\nbe granted by either a metastore admin, the owner of\nan object, or the owner of the catalog or schema that\ncontains the object. Securable objects in Unity Catalog\nare hierarchical and privileges are inherited downward.\n\nUnity Catalog’s security model is based on standard ANSI\nSQL and allows administrators to grant permissions in\ntheir existing data lake using familiar syntax, at the level of\ncatalogs, databases (schema), tables and views. Privileges\nand metastores are shared across workspaces, allowing\nadministrators to set secure permissions once against\n\ngroups synced from identity providers and know that\nend users only have access to the proper data in any\nDatabricks workspace they enter.\n\n\n-----\n\n```\nCUSTOMER STORY: BUTCHERBOX\n\n### How Butcherbox Uses Data Insights to Provide Quality Food Tailored to Each Customer’s Unique Taste\n\n```\n\nAs a young e-commerce company,\n\n[ButcherBox](https://www.butcherbox.com/) has to be nimble as its\n\ncustomers’ needs change, which means it is\n\nconstantly considering behavioral patterns,\n\ndistribution center efficiency, a growing list of\n\nmarketing and communication channels, and\n\norder processing systems.\n\nThe meat and seafood subscription company\n\ncollects data on hundreds of thousands\n\nof subscribers. It deployed the Databricks\n\nLakehouse Platform to gain visibility across\n\nits diverse range of data systems and enable\n\nits analytics team to securely view and\n\nexport data in the formats needed.\n\nWith so much data feeding in from different\n\nsources — from email systems to its website\n\n— the data team at ButcherBox quickly\n\ndiscovered that data silos were a significant\n\n\n“We knew we needed to migrate from our legacy data warehouse\n\nenvironment to a data analytics platform that would unify our\n\ndata and make it easily accessible for quick analysis to improve\n\nsupply chain operations, forecast demand and, most importantly,\n\nkeep up with our growing customer base,” explained Jake Stone,\n\nSenior Manager, Business Analytics, at ButcherBox.\n\nThe platform allows analysts to share builds and iterate on a\n\nproject without getting into the code. Querying a table of 18\n\nbillion rows would have been problematic with a traditional\n\nplatform. With Databricks, ButcherBox can do it in three minutes.\n\n“Delta Lake provides us with a single source of truth for all of\n\nour data,” said Stone. 
“Now our data engineers are able to build\n\nreliable data pipelines that thread the needle on key topics such\n\nas inventory management, allowing us to identify in near real-\n\ntime what our trends are so we can figure out how to effectively\n\nmove inventory.”\n\n[Read the full story here.](https://www.databricks.com/blog/2022/02/07/how-butcherbox-uses-data-insights-to-provide-quality-food-tailored-to-each-customers-unique-taste.html)\n\n\nproblem because they blocked complete\n\nvisibility into critical insights needed to make\n\nstrategic and marketing decisions.\n\n\n-----\n\n**Set up secure data sharing**\nDatabricks uses an open protocol called [Delta Sharing](https://docs.databricks.com/data-sharing/index.html)\nto share data with other entities regardless of their\ncomputing platforms. Delta Sharing is integrated with\nUnity Catalog. Your data must be registered with Unity\nCatalog to manage, govern, audit and track usage of the\nshared data on the Lakehouse Platform. The primary\nconcepts of Delta Sharing are shares (read-only\ncollections of tables and table partitions to be shared)\nand recipients (objects that associate an organization\nwith a credential or secure sharing identifier).\n\nAs a data provider, you generate a token and share\nit securely with the recipient. They use the token to\nauthenticate and get read access to the tables you’ve\nincluded in the shares you’ve given them access\nto. Recipients access the shared data in read-only\nformat. Whenever the data provider updates data\ntables in their own Databricks account, the updates\nappear in near real-time in the recipient’s system.\n\n\n**Capture audit logs**\nUnity Catalog captures an audit log of actions\nperformed against the metastore. To access audit\nlogs for Unity Catalog events, you must enable and\nconfigure audit logs for your account. Audit logs for\neach workspace and account-level activities are\ndelivered to your account. See how to [configure audit](https://docs.databricks.com/data-governance/unity-catalog/audit.html)\n[logs](https://docs.databricks.com/data-governance/unity-catalog/audit.html) and create a dashboard to analyze audit log data.\n\n**View data lineage**\nYou can use Unity Catalog to capture runtime data\nlineage across queries in any language executed on\na Databricks cluster or SQL warehouse. Lineage can\nbe visualized in Data Explorer in near real-time and\nretrieved with the Databricks REST API. Lineage is\naggregated across all workspaces attached to Unity\nCatalog and captured down to the column level, and\nincludes notebooks, workflows and dashboards related\nto the query. To understand the requirements and how\nto capture lineage data, see [Capture and view data](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html)\n[lineage with Unity Catalog](https://docs.databricks.com/data-governance/unity-catalog/data-lineage.html) .\n\n\nUnity Catalog Metastore\n\n\nCatalog\n\n\nData providers can use Databricks audit logging to\nmonitor the creation and modification of shares,\nand recipients can monitor recipient activity on\nshares. 
Data recipients who use shared data in a\nDatabricks account can use Databricks audit logging\nto understand who is accessing which data.\n\n\n-----\n\n###### Resources:\n\n- [Databricks documentation](https://docs.databricks.com/?_ga=2.8076210.1659353804.1668454132-1193545868.1666711643)\n\n- [Getting Started With Delta Lake](https://docs.databricks.com/delta/index.html)\n\n- [Webinar: Deep Dive Into Lakehouse With Delta Lake](https://www.databricks.com/p/webinar/deep-dive-into-lakehouse-with-delta-lake-complimentary-training)\n\n- [Big Book of Data Engineering Use Cases](https://www.databricks.com/explore/de-data-warehousing/big-book-of-data-engineering#page=1)\n\n- [10 Powerful Features to Simplify Semi-structured](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n[Data Management in the Databricks Lakehouse](https://www.databricks.com/blog/2021/11/11/10-powerful-features-to-simplify-semi-structured-data-management-in-the-databricks-lakehouse.html)\n\n\n###### Key Takeaways\n\n- With the Databricks Lakehouse Platform, you can\nunify and simplify all your data on one platform\nto better scale and improve data storage and\nquery capabilities\n\n- The lakehouse helps reduce data infrastructure\nand compute costs. You don’t need excess\ndata copies and can retire expensive legacy\ninfrastructure.\n\n\nLeverage Delta Lake as the open format\nstorage layer to deliver reliability, security and\nperformance on your data lake — for both\nstreaming and batch operations — replacing\ndata silos with a single home for structured,\nsemi-structured and unstructured data\n\nWith Unity Catalog you can centralize\ngovernance for all data and AI assets including\nfiles, tables, machine learning models and\ndashboards in your lakehouse on any cloud\n\nThe Databricks Lakehouse Platform is open\nsource with multicloud flexibility so that you can\nuse your data however and wherever you want —\nno vendor lock-in\n\n\n-----\n\n# 02\n```\nCHALLENGE: \u0003\n\n## Build your data architecture to support scale and performance\n\n```\n\n-----\n\n```\nCHALLENGE 02\n\n### Build your data architecture to support scale and performance\n\n```\nAs modern digital native companies mature, data volumes grow and new use cases develop. This inevitably leads to\nthe increasing complexity of data architecture as new storage and access patterns emerge. Data growth can come\nsuddenly and unexpectedly, when it does, the existing architecture needs to sustain performance, all the while being\ncost-effective. 
The relational databases and traditional data warehouses that met the needs of the businesses once\nupon a time are now creating limitations for new real-time use cases and large-scale data analytics pipelines.\n\nHere are some common challenges around managing data and performance at scale:\n\n\n**Volume and velocity** — Exponentially\nincreasing data sources, and the speed at\nwhich they capture and create data.\n\n**Latency requirements** — The demands of\ndownstream applications and users have\nevolved (people want data and the results\nfrom the data faster).\n\n\n**Governance** — Cataloging, auditing, securing and\nreporting on data is burdensome at scale when\nusing old systems not built with data access\ncontrols and compliance in mind.\n\n**Multicloud** is really hard.\n\n\n**Data storage** — Storing data in the wrong\nformat is slow to access, query and is\nexpensive at scale.\n\n\n**Data format** — Supporting structured, semistructured and unstructured data formats\nis now a requirement. Most data storage\nsolutions are designed to handle only one type\nof data, requiring multiple products\nto be stitched together.\n\n```\n02\n\n```\n\n-----\n\n###### Lakehouse solves scale and performance challenges\n\n\nThe solution for growing digital companies is a unified\nand simplified platform that can instantly scale up\ncapacity to deliver more computing power on demand,\nfreeing up teams to go after the much-needed data\nand produce outputs more quickly. With a lakehouse,\nthey can replace their data silos with a single home for\ntheir structured, semi-structured and unstructured\ndata. Users and applications throughout the enterprise\nenvironment can connect to the same single copy of\nthe data to drive diverse workloads.\n\nThe lakehouse architecture is cost-efficient for\nscaling, lowering the total cost of ownership for the\noverall infrastructure by consolidating all data estate\nand use cases onto a single platform and eliminating\nredundant licensing, infrastructure and administration\ncosts. Unlike other warehouse options that can only\nscale horizontally, the Databricks Lakehouse can scale\nhorizontally and vertically based on workload demands.\n\nWith the Databricks Lakehouse, you can optimize the\ncompute costs on a platform that is [2.7x faster and](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html)\n[12x more performant than Snowflake](https://www.databricks.com/blog/2021/11/02/databricks-sets-official-data-warehousing-performance-record.html) , according to\nresearch by the Barcelona Supercomputing Center.\nAnd your data teams are more productive by focusing\non more strategic initiatives versus managing multiple\ndata solutions.\n\n```\nCUSTOMER STORY: RIVIAN\n\n### Driving into the future of electric transportation\n\n```\n```\nCUSTOMER STORY: RIVIAN\n\n```\n\nWith more than 11,000 electric adventure vehicles (EAVs) on the road generating multiple terabytes of IoT data per\n\nday, [Rivian](https://rivian.com/) is using data insights and machine learning to improve vehicle health and performance. However, with\n\nlegacy cloud tooling, it struggled to scale pipelines cost-effectively and spent significant resources on maintenance.\n\nBefore Rivian even shipped its first EAV, it was already up against data visibility and tooling limitations that\n\ndecreased output, prevented collaboration and increased operational costs. 
Rivian chose to modernize its data\n\ninfrastructure on the Databricks Lakehouse Platform, giving it the ability to unify all its data into a common view for\n\ndownstream analytics and machine learning. Now, unique data teams have a range of accessible tools to deliver\n\nactionable insights for different use cases, from predictive maintenance to smarter product development.\n\n“Today we have various teams, both technical and business, using Databricks Lakehouse to explore our data, build\n\nperformant data pipelines, and extract actionable business and product insights via visual dashboards,” said\n\nWassym Bensaid, Vice President of Software Development at Rivian.\n\nFor instance, Rivian’s ADAS (advanced driver-assistance systems) Team can now easily prepare telemetric\n\naccelerometer data to understand all EAV motions. This core recording data includes information about pitch,\n\nroll, speed, suspension and airbag activity to help Rivian understand vehicle performance, driving patterns and\n\nconnected car system predictability. Based on these key performance metrics, Rivian can improve the accuracy of\n\nsmart features and the control that drivers have over them. By leveraging the Databricks Lakehouse Platform, Rivian\n\nhas seen a 30%–50% increase in runtime performance, which has led to faster insights and model performance.\n\n[Read the full story here.](https://www.databricks.com/customers/rivian)\n\n\n-----\n\n###### How to ensure scalability and performance with Databricks\n\nThe [Databricks Lakehouse Platform](https://docs.databricks.com/lakehouse/index.html) is built for ensuring\nscalability and performance for your data architecture\nbased on the following features and capabilities:\n\n- A simplified and cost-efficient architecture that\nincreases productivity\n\n- A platform that ensures reliable, high performing\nETL workloads — for streaming and batch data\n— while Databricks automatically manages your\ninfrastructure\n\n- The ability to ingest, transform and query all your\ndata in one place, and scale on demand with\nserverless compute\n\n- Enables real-time data access for all data,\nanalytics and AI use cases\n\n\n-----\n\nThe following section will provide a short series of\nsteps for understanding the key components of the\nDatabricks Lakehouse Platform.\n\n\n**Step 2**\n**Understand the common Delta Lake operations**\nThe Databricks Lakehouse Platform simplifies the\nentire data lifecycle, from data ingestion to monitoring\nand governance, and it starts with [Delta Lake](https://www.databricks.com/product/delta-lake-on-databricks) , a fully\nopen-source storage system based on the Delta\nformat providing reliability through ACID transactions\nand scalable metadata handling. Large quantities of\nraw files in blob storage can be converted to Delta to\norganize and store the data cheaply. This allows for\nflexibility of data movement while being performant\nand less expensive.\n\n\n**Step 1**\n**Get a trial Databricks account**\nStart your 14-day free trial with Databricks on\nAWS in a few easy steps.\n[Get started with a free trial and setup](https://docs.databricks.com/getting-started/index.html) . 
During the 14day free trial, all Databricks usage is free, but Databricks\nuses compute and S3 storage resources in your cloud\nprovider account.\n\n\nand writing data can occur simultaneously without risk\nof many queries resulting in performance degradation\nor deadlock for business-critical workloads.\n\nThis means that users and applications throughout\nthe enterprise environment can connect to the same\nsingle copy of the data to drive diverse workloads, with\nall viewers guaranteed to receive the most current\nversion of the data at the time their query executes.\nWith performance features like indexing, Delta Lake\ncustomers have seen [ETL workloads execute](https://www.databricks.com/customers/columbia)\n[up to 48x faster.](https://www.databricks.com/customers/columbia)\n\n\n[Get acquainted with the Delta Lake storage format](https://docs.databricks.com/delta/tutorial.html)\nand learn how to create, manage and query tables.\nWith support for ACID transactions and schema\nenforcement, Delta Lake provides the reliability that\ntraditional data lakes lack. This enables you to scale\nreliable data insights throughout the organization and\nrun analytics and other data projects directly on your\ndata lake — [for up to 50x faster time-to-insight.](https://www.databricks.com/customers/wejo)\n\nDelta Lake transactions use log files stored alongside\ndata files to provide ACID guarantees at a table level.\nBecause the data and log files backing Delta Lake\ntables live together in cloud object storage, reading\n\n\n-----\n\nAll data in Delta Lake is stored in open Apache Parquet\nformat, allowing data to be read by any compatible\nreader. APIs are open and compatible with Apache\nSpark, so you have access to a vast open-source\necosystem to avoid data lock-in from proprietary\nformats and conversions, which have embedded and\nadded costs.\n\n###### By leveraging Databricks and Delta Lake, we have already been able to democratize data at scale while lowering the cost of running production workloads by 60%, saving us millions of dollars.”\n\n — Steve Pulec, Chief Technology Officer, YipitData\n\n[Learn more](https://www.databricks.com/customers/yipitdata)\n\n\n-----\n\n**Step 3**\n**Ingest data efficiently at scale**\nWith a [Lakehouse Platform](https://www.databricks.com/product/data-lakehouse) , data teams can ingest data\nfrom hundreds of data sources for analytics, AI and\nstreaming applications into one place.\n\nDatabricks recommends [Auto Loader](https://docs.databricks.com/ingestion/auto-loader/index.html) for incremental\ndata ingestion. To ingest any file that can land in a data\nlake, Auto Loader incrementally and automatically\nprocesses new data files as they arrive in cloud storage\nin scheduled or continuous jobs. Auto Loader scales to\nsupport near real-time ingestion of millions of files\nper hour.\n\nFor pushing data in Delta Lake, the SQL command\n[COPY INTO](https://docs.databricks.com/ingestion/copy-into/index.html) allows you to perform batch file ingestion\ninto Delta Lake. COPY INTO is best used when the input\ndirectory contains thousands of files or fewer, and the\nuser prefers SQL. COPY INTO can be used over JDBC\nto push data into Delta Lake at your convenience.\n\n\n**Step 4**\n**Leverage production-ready tools**\n**to automate ETL pipelines**\nOnce the raw data is ingested, Databricks provides\na suite of production-ready tools that allow data\nprofessionals to quickly develop and deploy extract,\n\ntransform and load (ETL) pipelines. 
Databricks SQL\nallows analysts to run SQL queries against the same\ntables used in production ETL workloads, allowing for\nreal-time business intelligence at scale.\n\nWith your trial account, [it’s time to develop and deploy](https://docs.databricks.com/getting-started/etl-quick-start.html)\n[your first extract, transform a", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf" + ] + ], + [ + "Increasing volume and velocity of data as companies mature.", + "Need for faster data access and reduced latency.", + "Challenges in data governance, including cataloging, auditing, and securing data.", + "Complexities of using multiple cloud environments.", + "Data storage issues such as slow access, poor query performance, and high costs.", + "Requirement to support structured, semi-structured, and unstructured data formats." + ], + "{\"info\": {\"request_id\": \"tr-0d0e7280dd93452c982596f862357324\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852753, \"execution_time_ms\": 1947, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"6aeb02d6-4b23-4713-9855-f308a0690c05\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-0d0e7280dd93452c982596f862357324/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xf28d1db5327a5457\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": null, \"start_time\": 1734543852753869390, \"end_time\": 1734543854701685330, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xb01f33359ce0cb3e\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, 
\"parent_id\": \"0xf28d1db5327a5457\", \"start_time\": 1734543853027149455, \"end_time\": 1734543853051532964, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x21f0cbaf9967dd49\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0xf28d1db5327a5457\", \"start_time\": 1734543853051633666, \"end_time\": 1734543853052273974, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xd73d767b43d9d53f\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0x21f0cbaf9967dd49\", \"start_time\": 1734543853051864069, \"end_time\": 1734543853052030571, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x0d78175c81efa68e\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0x21f0cbaf9967dd49\", \"start_time\": 1734543853052100872, \"end_time\": 1734543853052239673, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x0628efab3df56975\", \"trace_id\": 
\"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0xf28d1db5327a5457\", \"start_time\": 1734543853052335574, \"end_time\": 1734543856582738553, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856582606151, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File 
\\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n 
return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x426f0109dc63441e\", \"trace_id\": \"0x470f96941d0909d9198937cb4cc7f00c\"}, \"parent_id\": \"0x0628efab3df56975\", \"start_time\": 1734543853231591347, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0d0e7280dd93452c982596f862357324\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854701552, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some common challenges ...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'What are some common challenges around managing data and performance at scale for modern digital native companies as they mature?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/technical_guide_solving_common-data-challenges-for-startups-and-digital-native-businesses.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "4b452a4426892dea5c35302c50dc70d62c0b2993f478af59a42b59d7c258bfa0", + [ + [ + [ + "What are two key challenges mentioned for predictive maintenance in government agencies?", + "user" + ] + ] + ], + [ + [ + "##### Overview\n\n**Integrating unstructured data**\nEquipment data doesn’t just come in the form of IoT data. Agencies can gather rich unstructured signals like audio, visual (e.g., video inspections) and text (e.g., maintenance logs). Most legacy data architectures are unable to integrate structured and unstructured data sources.\n\n**Operationalizing machine learning**\nMost agencies lack the advanced analytics tools needed to build models that can predict potential equipment failures. Those that do typically have their data scientists working in a siloed set of tools, resulting in unnecessary data replication and inefficient workflows.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf" + ] + ], + [ + "Difficulty integrating structured and unstructured data sources due to legacy data architectures.", + "Inefficient workflows caused by a lack of advanced analytics tools and siloed environments for data scientists." 
+ ], + "{\"info\": {\"request_id\": \"tr-0055eec9c95145e9893855e255b52c3a\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852844, \"execution_time_ms\": 1849, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"79f068ca-4f86-4a85-ba45-5d52af1db4c5\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-0055eec9c95145e9893855e255b52c3a/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xcf005142111fa44a\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": null, \"start_time\": 1734543852844458939, \"end_time\": 1734543854693593731, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xbce44a78f6c58bb1\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0xcf005142111fa44a\", \"start_time\": 1734543852908150646, \"end_time\": 1734543852934614182, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x232f779bfb5f2ced\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0xcf005142111fa44a\", \"start_time\": 1734543852934718983, \"end_time\": 1734543852935319291, \"status_code\": \"OK\", 
\"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x19a2db6bb887d125\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0x232f779bfb5f2ced\", \"start_time\": 1734543852934860385, \"end_time\": 1734543852935065588, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xd9e317dca963334e\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0x232f779bfb5f2ced\", \"start_time\": 1734543852935147989, \"end_time\": 1734543852935286590, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x4a4b01cbaba0d526\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0xcf005142111fa44a\", \"start_time\": 1734543852935381692, \"end_time\": 1734543856726656399, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856726600698, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in 
generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n 
^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x99a7f869d3079532\", \"trace_id\": \"0xeb156d0267ebedbd78c83a82e953427a\"}, \"parent_id\": \"0x4a4b01cbaba0d526\", \"start_time\": 1734543853110806216, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0055eec9c95145e9893855e255b52c3a\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are two key challenges mentioned for predictive maintenance in government agencies?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854693436, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are two key challenges ment...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'What are two key challenges mentioned for predictive maintenance in government agencies?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-and-ai-use-cases-for-the-public-sector.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "3b231daee5434db054e2ee8b4aee9b4edba19aa8886c0d491daa1b36b743142f", + [ + [ + [ + "What are some of the common problems faced by data lakes according to the document?", + "user" + ] + ] + ], + [ + [ + "**Challenges with data lakes**\nData lakes are a common element within modern data architectures. They serve as a\ncentral ingestion point for the plethora of data that organizations seek to gather and\nmine. While a good step forward in getting to grips with the range of data, they run\ninto the following common problems:\n\n**1. Reading and writing into data lakes is not reliable.** Data engineers often run into\nthe problem of unsafe writes into data lakes that cause readers to see garbage\ndata during writes. They have to build workarounds to ensure readers always see\nconsistent data during writes.\n\n**2. The data quality in data lakes is low.** Dumping unstructured data into a data\nlake is easy, but this comes at the cost of data quality. Without any mechanisms\nfor validating schema and the data, data lakes suffer from poor data quality. As a\nconsequence, analytics projects that strive to mine this data also fail.\n\n**3. Poor performance with increasing amounts of data.** As the amount of data\nthat gets dumped into a data lake increases, the number of files and directories\nalso increases. Big data jobs and query engines that process the data spend a\nsignificant amount of time handling the metadata operations. This problem is more\npronounced in the case of streaming jobs or handling many concurrent batch jobs.\n\n**4. Modifying, updating or deleting records in data lakes is hard.** Engineers need to\nbuild complicated pipelines to read entire partitions or tables, modify the data and\nwrite them back. 
Such pipelines are inefficient and hard to maintain.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf" + ] + ], + [ + "Unreliable reading and writing operations", + "Low data quality due to the lack of validation mechanisms", + "Poor performance with increasing data volume", + "Difficulty in modifying, updating, or deleting records" + ], + "{\"info\": {\"request_id\": \"tr-d0aec4bd83d24951a8302b231ac42e47\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852916, \"execution_time_ms\": 1801, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"2fa194f1-5ae7-43ca-9eff-32fef46fce94\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-d0aec4bd83d24951a8302b231ac42e47/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xcaf4f22e6ac48496\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": null, \"start_time\": 1734543852916632754, \"end_time\": 1734543854717960127, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x83ee15f019189952\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0xcaf4f22e6ac48496\", \"start_time\": 1734543852994320939, \"end_time\": 1734543853010690746, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": 
\\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x6a0b4e0ea9f88df9\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0xcaf4f22e6ac48496\", \"start_time\": 1734543853010864249, \"end_time\": 1734543853011405856, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xe66dc0ef40e50aa8\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0x6a0b4e0ea9f88df9\", \"start_time\": 1734543853011011251, \"end_time\": 1734543853011161352, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are some of the common problems faced by data lakes according to the document?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xc4e0d91b12ea6d1e\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0x6a0b4e0ea9f88df9\", \"start_time\": 1734543853011224353, \"end_time\": 1734543853011370255, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x1d0fd15318581e6f\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0xcaf4f22e6ac48496\", \"start_time\": 1734543853011469256, \"end_time\": 1734543856663054927, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856663004627, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in 
generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n 
^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x2ea973dcf6b6f7ae\", \"trace_id\": \"0xdde00da87c8d85c149cd9ca5a6335a98\"}, \"parent_id\": \"0x1d0fd15318581e6f\", \"start_time\": 1734543853244094106, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-d0aec4bd83d24951a8302b231ac42e47\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some of the common problems faced by data lakes according to the document?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854717808, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the common prob...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'What are some of the common problems faced by data lakes according to the document?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "9673989eb3b8242fc0a48d6338f31191260dd7cf6c7eacb26f2ed1512af803a2", + [ + [ + [ + "What new opportunities can data sharing create for organizations looking to generate additional revenue?", + "user" + ] + ] + ], + [ + [ + "**Key benefits of data sharing**\n\nAs you can see from the use cases described above, there are many benefits of data sharing, including:\n\n**Greater collaboration with existing partners.** In today’s hyper-connected digital economy, no single organization can advance its business objectives without partnerships. Data sharing helps solidify existing partnerships and can help organizations establish new ones.\n**Ability to generate new revenue streams.** With data sharing, organizations can generate new revenue streams by offering data products or data services to their end consumers.\n**Ease of producing new products, services or business models.** Product teams can leverage both first-party data and third-party data to refine their products and services and expand their product/service catalog.\n**Greater efficiency of internal operations.** Teams across the organization can meet their business goals far more quickly when they don’t have to spend time figuring out how to free data from silos. When teams have access to live data, there’s no lag time between the need for data and the connection with the appropriate data source.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf" + ] + ], + [ + "Data sharing can enable organizations to offer data products.", + "Data sharing can enable organizations to offer data services." 
+ ], + "{\"info\": {\"request_id\": \"tr-4d76d88f00b94167bd93849f59c7001d\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852759, \"execution_time_ms\": 1862, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What new opportunities can data ...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"547feb6d-eb71-4dfd-b1e2-fec0f84bf84e\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-4d76d88f00b94167bd93849f59c7001d/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xa92ce145b46f44b7\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": null, \"start_time\": 1734543852759799465, \"end_time\": 1734543854622031363, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What new opportunities can data ...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x10de2ef062d5aa49\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xa92ce145b46f44b7\", \"start_time\": 1734543852875604334, \"end_time\": 1734543852948946264, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What new opportunities can data ...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xd74b169336501027\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xa92ce145b46f44b7\", \"start_time\": 1734543852949100366, \"end_time\": 1734543852983429601, 
\"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xb73870ebb1747a67\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xd74b169336501027\", \"start_time\": 1734543852949327568, \"end_time\": 1734543852949642572, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x91770165c3696c19\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xd74b169336501027\", \"start_time\": 1734543852982528089, \"end_time\": 1734543852983380400, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0xd10cd61bd8992ad9\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xa92ce145b46f44b7\", \"start_time\": 1734543852983553402, \"end_time\": 1734543856658182168, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856658120867, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, 
in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n 
^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x610b59e220caf7e1\", \"trace_id\": \"0x37240b72c09eb5173d4193936440b774\"}, \"parent_id\": \"0xd10cd61bd8992ad9\", \"start_time\": 1734543853210375478, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-4d76d88f00b94167bd93849f59c7001d\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What new opportunities can data sharing create for organizations looking to generate additional revenue?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854621876, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What new opportunities can data ...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'What new opportunities can data sharing create for organizations looking to generate additional revenue?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/a-new-approach-to-data-sharing-2nd-edition-databricks.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "8fc168f55c01c3d4059869879a9e54e8601faef19e46f011ac239c44dbe72f40", + [ + [ + [ + "Why is real-time data crucial for retail operations, and what problems do legacy systems cause?", + "user" + ] + ] + ], + [ + [ + "“Retailers need real-time data to support these decisions, but legacy systems are limited to data that’s hours or days old. When seconds matter, only the Lakehouse delivers better decisions [...] most retailers still rely on legacy data systems, which impedes their ability to scale these innovations. Unfortunately, most legacy systems are only able to process information in hours or days. The delays caused by waiting for data are leading to significant risks and costs for the industry.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf" + ] + ], + [ + "Real-time data enables immediate decision-making.", + "Real-time data enables better decision-making in critical moments.", + "Legacy systems process outdated data.", + "Legacy systems cause delays.", + "Legacy systems lead to risks for the retail industry.", + "Legacy systems lead to costs for the retail industry." 
+ ], + "{\"info\": {\"request_id\": \"tr-65a2d1b429924041b6ad44564d5466c9\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852748, \"execution_time_ms\": 2076, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is real-time data crucial fo...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"5b8a156e-b61b-43ce-9847-d55fb3f1f81f\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-65a2d1b429924041b6ad44564d5466c9/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x56afcc9be75ced9d\", \"trace_id\": \"0x18e8f42e6e3af1200bc58028a4ac98cc\"}, \"parent_id\": null, \"start_time\": 1734543852748811326, \"end_time\": 1734543854825185028, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-65a2d1b429924041b6ad44564d5466c9\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is real-time data crucial fo...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xcb3082d796db26b4\", \"trace_id\": \"0x18e8f42e6e3af1200bc58028a4ac98cc\"}, \"parent_id\": \"0x56afcc9be75ced9d\", \"start_time\": 1734543853083822574, \"end_time\": 1734543853096378633, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-65a2d1b429924041b6ad44564d5466c9\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is real-time data crucial fo...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xf2d03139292d1f0c\", \"trace_id\": \"0x18e8f42e6e3af1200bc58028a4ac98cc\"}, \"parent_id\": \"0x56afcc9be75ced9d\", \"start_time\": 1734543853096484834, \"end_time\": 1734543853097025941, \"status_code\": 
\"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-65a2d1b429924041b6ad44564d5466c9\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x1130639c35a67e6d\", \"trace_id\": \"0x18e8f42e6e3af1200bc58028a4ac98cc\"}, \"parent_id\": \"0xf2d03139292d1f0c\", \"start_time\": 1734543853096632236, \"end_time\": 1734543853096787838, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-65a2d1b429924041b6ad44564d5466c9\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x7c353308563864e6\", \"trace_id\": \"0x18e8f42e6e3af1200bc58028a4ac98cc\"}, \"parent_id\": \"0xf2d03139292d1f0c\", \"start_time\": 1734543853096855239, \"end_time\": 1734543853096990741, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-65a2d1b429924041b6ad44564d5466c9\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x3818decf4cd3be41\", \"trace_id\": \"0x18e8f42e6e3af1200bc58028a4ac98cc\"}, \"parent_id\": \"0x56afcc9be75ced9d\", \"start_time\": 1734543853097087042, \"end_time\": 1734543857146640395, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-65a2d1b429924041b6ad44564d5466c9\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543857146554094, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in 
generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n 
^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x369630fdedeb85db\", \"trace_id\": \"0x18e8f42e6e3af1200bc58028a4ac98cc\"}, \"parent_id\": \"0x3818decf4cd3be41\", \"start_time\": 1734543853260818618, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-65a2d1b429924041b6ad44564d5466c9\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"Why is real-time data crucial for retail operations, and what problems do legacy systems cause?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854825053, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'Why is real-time data crucial fo...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'Why is real-time data crucial for retail operations, and what problems do legacy systems cause?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/lakehouse_for_retail-082922.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "1373db51df7476c934e04796eaceed4d4475d7b7a70efcb3405b121c71e96923", + [ + [ + [ + "What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?", + "user" + ] + ] + ], + [ + [ + "Game telemetry refers to the data collected about player behavior and interactions within a video game. The primary data source is the game engine. And the goal of game telemetry is to gather information that can help game developers understand player behavior and improve the overall game experience.\n\nSome of the primary metrics that are typically tracked in game telemetry include:\n\n- **Player engagement:** Track the amount of time players spend playing the game, and their level of engagement with different parts of the game.\n- **Game progress:** Monitor player progress through different levels and milestones in the game.\n- **In-game purchases:** Track the number and value of in-game purchases made by players.\n- **Player demographics:** Collect demographic information about players, such as age, gender, location, and device type.\n- **Session length:** Monitor the length of each player session, and how often players return to the game.\n- **Retention:** Track the percentage of players who return to the game after their first session.\n- **User Acquisition:** Track the number of new players acquired through different marketing channels.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf" + ] + ], + [ + "Game telemetry is data collected about player behavior and interactions within a video game.", + "The data is primarily sourced from the game engine.", + "Primary metrics tracked in game telemetry include:\n - player engagement\n - game progress\n - in-game purchases\n - player demographics\n - session length\n - retention\n - user acquisition" + ], + "{\"info\": {\"request_id\": \"tr-b120ba49b531438a91c59260d15c29fa\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852726, \"execution_time_ms\": 2065, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is 
game telemetry, and what...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"7e13913e-74b9-4e7f-b328-821e336dc896\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-b120ba49b531438a91c59260d15c29fa/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x05ce992d2bdd5504\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": null, \"start_time\": 1734543852726459743, \"end_time\": 1734543854791960725, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is game telemetry, and what...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x58e7dd0ff3a51163\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0x05ce992d2bdd5504\", \"start_time\": 1734543852815466171, \"end_time\": 1734543852822157056, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is game telemetry, and what...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xb20572c1aaf198bc\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0x05ce992d2bdd5504\", \"start_time\": 1734543852822376259, \"end_time\": 1734543852823113868, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": 
\\\"user\\\", \\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xd358b99e4bca262c\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0xb20572c1aaf198bc\", \"start_time\": 1734543852822582261, \"end_time\": 1734543852822766664, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x24d875356f746818\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0xb20572c1aaf198bc\", \"start_time\": 1734543852822906266, \"end_time\": 1734543852823078268, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x8bff9921d08b090e\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0x05ce992d2bdd5504\", \"start_time\": 1734543852823187869, \"end_time\": 1734543856940949499, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543856940876898, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n 
return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0xb43e817694f5bf32\", \"trace_id\": \"0x10efe7124f7770a82597bf257c944a25\"}, \"parent_id\": \"0x8bff9921d08b090e\", \"start_time\": 1734543852996579568, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b120ba49b531438a91c59260d15c29fa\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. 
If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854791831, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is game telemetry, and what...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'What is game telemetry, and what primary metrics are tracked in game telemetry according to the text?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "d94a588c739512457882ea9bd39758fb222c0bef855b4c2e4d75dd8bf582c14d", + [ + [ + [ + "What significant advancement in large language model development happened in 2012?", + "user" + ] + ] + ], + [ + [ + "“ 1950s–1990s\nInitial attempts are made to map hard rules around languages and follow logical steps to accomplish tasks like translating a sentence from one language to another.\n\nWhile this works sometimes, strictly defined rules only work for concrete, well-defined tasks that the system has knowledge about.\n\n1990s \nLanguage models begin evolving into statistical models and language patterns start being analyzed, but larger-scale projects are limited by computing power.\n\n2000s \nAdvancements in machine learning increase the complexity of language models, and the wide adoption of the internet sees an enormous increase in available training data.\n\n2012 \nAdvancements in deep learning architectures and larger data sets lead to the development of GPT (Generative Pre-trained Transformer).\n\n2018 \nGoogle introduces BERT (Bidirectional Encoder Representations from Transformers), which is a big leap in architecture and paves the way for future large language models.\n\n2020 \nOpenAI releases GPT-3, which becomes the largest model at 175B parameters and sets a new performance benchmark for language-related tasks.\n\n2022 \nChatGPT is launched, which turns GPT-3 and similar models into a service that is widely accessible to users through a web interface and kicks off a huge increase in public awareness of LLMs and generative AI.\n\n2023 \nOpen source LLMs begin showing increasingly impressive results with releases such as Dolly 2.0, LLaMA, Alpaca and Vicuna. 
GPT-4 is also released, setting a new benchmark for both parameter size and performance.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf" + ] + ], + [ + "GPT (Generative Pre-trained Transformer) was developed", + "This development occurred in 2012" + ], + "{\"info\": {\"request_id\": \"tr-b99c366618994c5eb9d3b4d72cee2989\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543861137, \"execution_time_ms\": 708, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What significant advancement in ...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"as...\"}, \"tags\": {\"eval.requestId\": \"6bc9f19d-a537-4f23-bcef-f721962d9c9c\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-b99c366618994c5eb9d3b4d72cee2989/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xf524663e9f028897\", \"trace_id\": \"0xf3a2cb808fc49cda9a0d57dd97f6ebbc\"}, \"parent_id\": null, \"start_time\": 1734543861137404216, \"end_time\": 1734543861845701810, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b99c366618994c5eb9d3b4d72cee2989\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What significant advancement in ...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": 
\"0x50016c97d8807a31\", \"trace_id\": \"0xf3a2cb808fc49cda9a0d57dd97f6ebbc\"}, \"parent_id\": \"0xf524663e9f028897\", \"start_time\": 1734543861153684313, \"end_time\": 1734543861160780499, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b99c366618994c5eb9d3b4d72cee2989\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What significant advancement in ...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xc3677e67892c47a3\", \"trace_id\": \"0xf3a2cb808fc49cda9a0d57dd97f6ebbc\"}, \"parent_id\": \"0xf524663e9f028897\", \"start_time\": 1734543861160921101, \"end_time\": 1734543861161489608, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b99c366618994c5eb9d3b4d72cee2989\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x9c3e9637d25d9d2b\", \"trace_id\": \"0xf3a2cb808fc49cda9a0d57dd97f6ebbc\"}, \"parent_id\": \"0xc3677e67892c47a3\", \"start_time\": 1734543861161089203, \"end_time\": 1734543861161255005, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b99c366618994c5eb9d3b4d72cee2989\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What significant advancement in large language model development happened in 2012?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x26857098d22c4149\", \"trace_id\": \"0xf3a2cb808fc49cda9a0d57dd97f6ebbc\"}, \"parent_id\": \"0xc3677e67892c47a3\", \"start_time\": 1734543861161321706, \"end_time\": 1734543861161457108, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b99c366618994c5eb9d3b4d72cee2989\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x59400c3e8c97f1e8\", \"trace_id\": \"0xf3a2cb808fc49cda9a0d57dd97f6ebbc\"}, \"parent_id\": \"0xf524663e9f028897\", \"start_time\": 1734543861161552009, \"end_time\": 1734543861845639509, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": 
{\"mlflow.traceRequestId\": \"\\\"tr-b99c366618994c5eb9d3b4d72cee2989\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0x495266b13971d9be\", \"trace_id\": \"0xf3a2cb808fc49cda9a0d57dd97f6ebbc\"}, \"parent_id\": \"0x59400c3e8c97f1e8\", \"start_time\": 1734543861285639014, \"end_time\": 1734543861831136333, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-b99c366618994c5eb9d3b4d72cee2989\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. 
**Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_9fd8ffb3-eac8-4762-9991-360c66e7451a\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543861, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 13, \\\"prompt_tokens\\\": 1153, \\\"total_tokens\\\": 1166, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What significant advancement in ...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What significant advancement in large language model development happened in 2012?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/compact-guide-to-large-language-models.pdf", + "I'm sorry, I can't help you 
with that.", + "no", + "[correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. **Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.", + "yes", + "No harmful content detected in response", + "no", + "The expected response states that GPT (Generative Pre-trained Transformer) was developed and that this development occurred in 2012. The response does not provide any information about the development of GPT or the year 2012. Therefore, the response is not correct.", + 0.708, + 1166.0, + 1153.0, + 13.0, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "6d1c05783fb5945cc9b121919eabdc2194c9c64809821e3c30b7f758a4d12a40", + [ + [ + [ + "What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?", + "user" + ] + ] + ], + [ + [ + "```\nOur most popular use case is natural language processing\n(NLP), a rapidly growing field that enables businesses to\ngain value from unstructured textual data. This opens the\ndoor for users to accomplish tasks that were previously\ntoo abstract for code, such as summarizing content or\nextracting sentiment from customer reviews. In our data\nset, 49% of libraries used are associated with NLP. LLMs\nalso fall within this bucket. Given the innovations launched\nin recent months, we expect to see NLP take off even\nmore in coming years as it is applied to use cases like\nchatbots, research assistance, fraud detection, content\ngeneration and more.\n```", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf" + ] + ], + [ + "49% of specialized Python libraries in the data set are associated with NLP.", + "Examples of tasks enabled by NLP include summarizing content, extracting sentiment from customer reviews, chatbots, research assistance, fraud detection, and content generation." 
+ ], + "{\"info\": {\"request_id\": \"tr-1c747ef0201042c7a3b0bd743b10dbf3\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852712, \"execution_time_ms\": 2064, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of specialized P...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\",...\"}, \"tags\": {\"eval.requestId\": \"fb607a79-4b69-40d4-9ae2-f775ebbde3bd\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-1c747ef0201042c7a3b0bd743b10dbf3/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x60cedc2c29393cbf\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": null, \"start_time\": 1734543852712753969, \"end_time\": 1734543854776758540, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of specialized P...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"NLP tasks and libraries percentage\\\\\\\", \\\\\\\"filters\\\\\\\": [{ \\\\\\\"key\\\\\\\": \\\\\\\"category\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"NLP\\\\\\\" }, { \\\\\\\"key\\\\\\\": \\\\\\\"type\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"library\\\\\\\" }] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": 
\\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"Error: 'field'\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"Error: 'field'\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x3325b54de586b3ad\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x60cedc2c29393cbf\", \"start_time\": 1734543852732453419, \"end_time\": 1734543852739742611, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of specialized P...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x29a1074282212bf0\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x60cedc2c29393cbf\", \"start_time\": 1734543852739860513, \"end_time\": 1734543852740457320, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x33eb8555d50fcb47\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x29a1074282212bf0\", \"start_time\": 1734543852740040215, \"end_time\": 1734543852740208917, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xbc094e532610db9e\", \"trace_id\": 
\"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x29a1074282212bf0\", \"start_time\": 1734543852740280318, \"end_time\": 1734543852740421320, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x0496ce9992272445\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x60cedc2c29393cbf\", \"start_time\": 1734543852740525121, \"end_time\": 1734543854776677939, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"NLP tasks and libraries percentage\\\\\\\", \\\\\\\"filters\\\\\\\": [{ \\\\\\\"key\\\\\\\": \\\\\\\"category\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"NLP\\\\\\\" }, { \\\\\\\"key\\\\\\\": \\\\\\\"type\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"library\\\\\\\" }] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"Error: 'field'\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"Error: 'field'\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x8140042543c79126\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x0496ce9992272445\", \"start_time\": 1734543852863417879, \"end_time\": 1734543854322657431, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool 
to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_1c39aaae-ea91-4641-a7fd-68a86f8df4b3\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"NLP tasks and libraries percentage\\\\\\\", \\\\\\\"filters\\\\\\\": [{ \\\\\\\"key\\\\\\\": \\\\\\\"category\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"NLP\\\\\\\" }, { \\\\\\\"key\\\\\\\": \\\\\\\"type\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"library\\\\\\\" }] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543853, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 53, \\\"prompt_tokens\\\": 1171, \\\"total_tokens\\\": 1224, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x78b5de3a4bdc9c30\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x0496ce9992272445\", \"start_time\": 1734543854329211110, \"end_time\": 1734543854346650422, \"status_code\": \"ERROR\", \"status_message\": \"KeyError: 'field'\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", 
\"mlflow.spanInputs\": \"{\\\"query\\\": \\\"NLP tasks and libraries percentage\\\", \\\"filters\\\": [{\\\"key\\\": \\\"category\\\", \\\"value\\\": \\\"NLP\\\"}, {\\\"key\\\": \\\"type\\\", \\\"value\\\": \\\"library\\\"}]}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854346590921, \"attributes\": {\"exception.type\": \"KeyError\", \"exception.message\": \"'field'\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/tools/vector_search.py\\\", line 357, in __call__\\n vs_filters = json.dumps(self.parse_filters(filters)) if filters else None\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 174, in wrapper\\n with _WrappingContext(fn, args, kwargs) as wrapping_coro:\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 162, in __exit__\\n self.coro.throw(exc_type, exc_value, traceback)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/tools/vector_search.py\\\", line 415, in parse_filters\\n suggested_field = filter_item[\\\"field\\\"]\\n ~~~~~~~~~~~^^^^^^^^^\\nKeyError: 'field'\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"parse_filters\", \"context\": {\"span_id\": \"0x9bd57d57f978eb5a\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x78b5de3a4bdc9c30\", \"start_time\": 1734543854330283123, \"end_time\": 1734543854343773187, \"status_code\": \"ERROR\", \"status_message\": \"KeyError: 'field'\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"parse_filters\\\"\", \"mlflow.spanInputs\": \"{\\\"filters\\\": [{\\\"key\\\": \\\"category\\\", \\\"value\\\": \\\"NLP\\\"}, {\\\"key\\\": 
\\\"type\\\", \\\"value\\\": \\\"library\\\"}]}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543854343718786, \"attributes\": {\"exception.type\": \"KeyError\", \"exception.message\": \"'field'\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/tools/vector_search.py\\\", line 415, in parse_filters\\n suggested_field = filter_item[\\\"field\\\"]\\n ~~~~~~~~~~~^^^^^^^^^\\nKeyError: 'field'\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0x6b845d9ec312c01c\", \"trace_id\": \"0xa738cf8d9f7f4d96036bffcae5eabbe0\"}, \"parent_id\": \"0x0496ce9992272445\", \"start_time\": 1734543854355081224, \"end_time\": 1734543854760312641, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-1c747ef0201042c7a3b0bd743b10dbf3\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"NLP tasks and libraries percentage\\\\\\\", \\\\\\\"filters\\\\\\\": [{ \\\\\\\"key\\\\\\\": \\\\\\\"category\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"NLP\\\\\\\" }, { \\\\\\\"key\\\\\\\": \\\\\\\"type\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"library\\\\\\\" }] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"Error: 'field'\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_aa9393d5-db8e-447d-8a50-798d07d09384\\\", \\\"choices\\\": [{\\\"finish_reason\\\": 
\\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543854, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 13, \\\"prompt_tokens\\\": 1244, \\\"total_tokens\\\": 1257, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of specialized P...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of specialized Python libraries in the data set are associated with natural language processing (NLP), and what are some of the tasks enabled by NLP?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"NLP tasks and libraries percentage\\\\\\\", \\\\\\\"filters\\\\\\\": [{ \\\\\\\"key\\\\\\\": \\\\\\\"category\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"NLP\\\\\\\" }, { \\\\\\\"key\\\\\\\": \\\\\\\"type\\\\\\\", \\\\\\\"value\\\\\\\": \\\\\\\"library\\\\\\\" }] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"Error: 'field'\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_a792ae20-3f08-46e3-8be5-c9e3ffb97dc6\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"Error: 'field'\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks-2023-state-of-data-report-06072023-v2_0.pdf", + "I'm sorry, I can't help you with that.", + "no", + "[correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. **Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.", + "yes", + "No harmful content detected in response", + "no", + "The expected response states that 49% of specialized Python libraries in the data set are associated with NLP and lists several tasks enabled by NLP. The response does not provide any information about the percentage of specialized Python libraries associated with NLP or the tasks enabled by NLP. 
Therefore, the response is not correct.", + 2.064, + 2481.0, + 2415.0, + 66.0, + "Missing required field(s): retrieved_context for metric: context_sufficiency", + "Missing required field(s): retrieved_context for metric: groundedness", + null, + null, + null, + null, + null, + null, + null + ], + [ + "fc67f25c728d8c264f373417e09fd8ecbf4cea9ec52a0fbd9d282dae461fc310", + [ + [ + [ + "What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?", + "user" + ] + ] + ], + [ + [ + "Most of the complexity has arisen with the explosion of data volumes and data types, with organizations amassing an estimated 80% of data in unstructured and semi-structured format. As the collection of data continues to increase, 73% of the data goes unused for analytics or decision-making. In order to try and decrease this percentage and make more data usable, data engineering teams are responsible for building data pipelines to efficiently and reliably deliver data. But the process of building these complex data pipelines comes with a number of difficulties:\n\n• In order to get data into a data lake, data engineers are required to spend immense time hand-coding repetitive data ingestion tasks\n\n• Since data platforms continuously change, data engineers spend time building and maintaining, and then rebuilding, complex scalable infrastructure\n\n• As data pipelines become more complex, data engineers are required to find reliable tools to orchestrate these pipelines\n\n• With the increasing importance of real-time data, low latency data pipelines are required, which are even more difficult to build and maintain\n\n• Finally, with all pipelines written, data engineers need to constantly focus on performance, tuning pipelines and architectures to meet SLAs", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ] + ], + [ + "Manually hand-coding repetitive data ingestion tasks", + "Continuously maintaining and rebuilding scalable infrastructure due to changing data platforms", + "Finding reliable tools for orchestrating complex pipelines", + "Building and maintaining low-latency pipelines for real-time data", + "Constantly tuning pipeline performance to meet SLAs" + ], + "{\"info\": {\"request_id\": \"tr-e6ad0c8144a7437289b16c45349319ed\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543860833, \"execution_time_ms\": 3035, \"status\": \"ERROR\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the key challen...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"\"}, \"tags\": {\"eval.requestId\": \"a7750d70-cff6-4c8e-8e0a-7cb51f1a4cc5\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": 
\"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-e6ad0c8144a7437289b16c45349319ed/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x0ac1b4e3ff1dfe05\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": null, \"start_time\": 1734543860833303626, \"end_time\": 1734543863868343626, \"status_code\": \"ERROR\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the key challen...\\\", \\\"params\\\": null}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x3838393e5f3a93e9\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x0ac1b4e3ff1dfe05\", \"start_time\": 1734543860846793090, \"end_time\": 1734543860854345781, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the key challen...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xad173f311f3c830f\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x0ac1b4e3ff1dfe05\", \"start_time\": 1734543860854536284, \"end_time\": 1734543860855150091, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x8c43cd0a424f699b\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0xad173f311f3c830f\", \"start_time\": 1734543860854737086, \"end_time\": 1734543860854909988, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": 
\"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xb658d9e60fe29728\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0xad173f311f3c830f\", \"start_time\": 1734543860854979989, \"end_time\": 1734543860855116491, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x53cf1c9e349c70f3\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x0ac1b4e3ff1dfe05\", \"start_time\": 1734543860855218192, \"end_time\": 1734543864419006831, \"status_code\": \"ERROR\", \"status_message\": \"RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543864418945430, \"attributes\": {\"exception.type\": \"openai.RateLimitError\", \"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.stacktrace\": \"Traceback (most recent call last):\\n File \\\"/databricks/python/lib/python3.11/site-packages/opentelemetry/trace/__init__.py\\\", line 570, in use_span\\n yield span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 272, in start_span\\n yield mlflow_span\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 145, in _wrapping_logic\\n result = yield # sync/async function output to be sent here\\n ^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/tracing/fluent.py\\\", line 175, in wrapper\\n return wrapping_coro.send(fn(*args, **kwargs))\\n ^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/repl_tmp_data/ReplId-193da-8c00c-1/tmp5gs5ndgu/agent/function_calling_agent.py\\\", line 166, in recursively_call_and_run_tools\\n model_response = user_proxy.initiate_chat(assistant,\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1108, in initiate_chat\\n self.send(msg2send, recipient, request_reply=True, silent=silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 749, in send\\n recipient.receive(message, self, request_reply, silent)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 915, in receive\\n reply = self.generate_reply(messages=self.chat_messages[sender], sender=sender)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 2070, in generate_reply\\n final, reply = reply_func(self, messages=messages, sender=sender, config=reply_func_tuple[\\\"config\\\"])\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1437, in generate_oai_reply\\n extracted_response = self._generate_oai_reply_from_client(\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/agentchat/conversable_agent.py\\\", line 1456, in _generate_oai_reply_from_client\\n response = llm_client.create(\\n ^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/autogen/oai/client.py\\\", line 777, in create\\n response = client.create(params)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/Workspace/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/cookbook/agents/utils/databricks_model_serving_client.py\\\", line 19, in create\\n response = self.openai_client.chat.completions.create(\\n 
^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 592, in safe_patch_function\\n patch_function(call_original, *args, **kwargs)\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 190, in patched_call\\n raise e\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n 
return self._request(\\n ^^^^^^^^^^^^^^\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from None\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\n\", \"exception.escaped\": \"False\"}}]}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x2451b1bb7bee28bc\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x53cf1c9e349c70f3\", \"start_time\": 1734543860995265691, \"end_time\": 1734543861836786402, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. 
If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_8f33785a-ba7e-4a5c-8dab-871f9c4f9ca0\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"data pipeline challenges for data lakes\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543861, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 25, \\\"prompt_tokens\\\": 1159, \\\"total_tokens\\\": 1184, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x0a9a3d3dfe5a0e3f\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x53cf1c9e349c70f3\", \"start_time\": 1734543861881712247, \"end_time\": 1734543862410990445, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"data pipeline challenges for data lakes\\\", \\\"filters\\\": 
[]}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live 
Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004132444, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"d85d526722f3ca9735bc45d98a9ad449\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. 
All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.00411582, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. 
This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. 
The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004092816, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"a6c4aa57b347d46b3d74ce86a7176024\\\"}, {\\\"page_content\\\": \\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. 
Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0040403795, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"1b74eac4a063d67e5f727e36b040965b\\\"}, {\\\"page_content\\\": \\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003983449, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x1c4b3d53e7e96fa9\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x0a9a3d3dfe5a0e3f\", \"start_time\": 1734543861882739759, \"end_time\": 1734543862409501927, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"data pipeline challenges for data lakes\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta 
Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"d85d526722f3ca9735bc45d98a9ad449\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.004132444], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. 
Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.00411582], [\\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. 
The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"a6c4aa57b347d46b3d74ce86a7176024\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.004092816], [\\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 
24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"1b74eac4a063d67e5f727e36b040965b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0040403795], [\\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.003983449]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0xcdab18046552e33b\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x0a9a3d3dfe5a0e3f\", \"start_time\": 1734543862409676129, \"end_time\": 1734543862410668941, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": 
{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"d85d526722f3ca9735bc45d98a9ad449\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.004132444], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. 
Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.00411582], [\\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. 
The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"a6c4aa57b347d46b3d74ce86a7176024\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.004092816], [\\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 
24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"1b74eac4a063d67e5f727e36b040965b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0040403795], [\\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\", 0.003983449]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified orchestration of data workflows**\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\ncompute cluster. Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\n\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\nAPI or from your IDE\\\\n\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\n\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\n\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\nminimize data movement with cluster reuse\\\\n\\\\n**Data quality validation and monitoring**\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\nconfidently trust the information for downstream initiatives by:\\\\n\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\nwith defined data expectations\\\\n\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\n(fail, drop, alert, quarantine)\\\\n\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\nand reported for the entire data pipeline\\\\n\\\\n\\\\nData\\\\nSources\\\\n\\\\nData\\\\nWarehouses\\\\n\\\\nOn-premises\\\\nSystems\\\\n\\\\nSaaS\\\\nApplications\\\\n\\\\nMachine &\\\\nApplication Logs\\\\n\\\\nApplication\\\\nEvents\\\\n\\\\nMobile & IoT\\\\nData\\\\n\\\\n\\\\nCloud\\\\nStorage\\\\n\\\\nMessag\\\\ne Buses\\\\n\\\\n\\\\n**Lakehouse Platform**\\\\n\\\\n**Workflows** for end-to-end orchestration\\\\n\\\\n\\\\nReal-Time BI Apps\\\\n\\\\nReal-Time AI Apps\\\\n\\\\n\\\\nReal-Time Analytics with\\\\n**Databricks SQL**\\\\n\\\\nReal-Time Machine Learning\\\\nwith\\\\n**Databricks ML**\\\\n\\\\n\\\\nStreaming ETL with\\\\n**Delta Live Tables**\\\\n\\\\n\\\\nPredictive\\\\nMaintenance\\\\n\\\\n\\\\nPersonalized\\\\nOffers\\\\n\\\\n\\\\nPatient\\\\nDiagnostics\\\\n\\\\n\\\\nReal-Time Operational\\\\nApps\\\\n\\\\n\\\\nReal-Time Applications with\\\\n**Spark Structured Streaming**\\\\n\\\\n**Photon** for lightning-fast data processing\\\\n\\\\n**Unity Catalog** for data governance and sharing\\\\n\\\\n**Delta 
Lake** for open and reliable data storage\\\\n\\\\n\\\\nAlerts Detection Fraud\\\\n\\\\n\\\\nDynamic\\\\nPricing\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\nFigure 2\\\\nA unified set of tools for real-time data processing\\\\n\\\\n\\\\n-----\\\\n\\\\n**Fault tolerant and automatic recovery**\\\\nHandle transient errors and recover from most common error conditions\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\nrecovery that includes:\\\\n\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\n\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\ncheckpointing\\\\n\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004132444, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"d85d526722f3ca9735bc45d98a9ad449\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. 
Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.00411582, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\n\\\\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations \\u2014 for\\\\n\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\n\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\n\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\n\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\n\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\n\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. 
The building blocks\\\\n\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\n\\\\narchitecture possible.\\\\n\\\\n\\\\non all data on a simple, open and multicloud\\\\n\\\\nmodern data stack.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Exploratory Data Scientist**\\\\n\\\\n\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\n\\\\n\\\\n**Curated Data Lake**\\\\n\\\\n\\\\n**Raw Data Ingest**\\\\n\\u201cBronze\\u201d\\\\n\\\\n\\\\n**Filtered/Cleaned/Augmented**\\\\n\\u201cSilver\\u201d\\\\n\\\\n\\\\n**Business-Level Aggregates**\\\\n\\u201cGold\\u201d\\\\n\\\\n\\\\n**D ATA Q U A L I T Y**\\\\n\\\\n**Data Sources (Batch and Real-Time)**\\\\n\\\\n\\\\n**Unstructured**\\\\n\\\\n- Image, Video, Audio\\\\n\\\\n- Free Text, Blob\\\\n\\\\n\\\\n**Semi-Structured**\\\\n\\\\n- Logs, Clickstream\\\\n\\\\n- CSV, JSON, XML\\\\n\\\\n\\\\n**Structured**\\\\n\\\\n- Systems of Record\\\\n\\\\n- Operational DBs\\\\n\\\\n\\\\n**Figure 8:**\\\\nThe building blocks for a modern data architecture\\\\n\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\n\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\n\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\n\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\n\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\n\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\n\\\\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\\\\n\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\n\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\n\\\\nstepwise approach to improving data.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Lakehouse key features**\\\\n\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\n\\\\navailable for stakeholders to run business-critical production workloads:\\\\n\\\\n\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\n\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\n\\\\nmonitoring and recovery.\\\\n\\\\n\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\n\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\n\\\\nread or write data, typically using SQL.\\\\n\\\\n\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\n\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\n\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\n\\\\n\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\n\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. 
The lakehouse enables organizations\\\\n\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004092816, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"a6c4aa57b347d46b3d74ce86a7176024\\\"}, {\\\"page_content\\\": \\\"##### The Delta Lake Series Complete Collection\\\\n\\\\n\\\\n-----\\\\n\\\\n### What is Delta Lake?\\\\n\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\n\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\nlifecycle management to data lakes. With Delta Lake, there will be no more\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\nmodifying data for data capture.\\\\n\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\nscalable cloud service.\\\\n\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\n\\\\n\\\\n-----\\\\n\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\n\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\n\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\n\\\\n\\\\n\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\n\\\\n\\\\nRollbacks 39\\\\n\\\\nPinned view of a continuously updating\\\\n\\\\nDelta Lake table across multiple downstream jobs\\\\n\\\\nQueries for time series analytics made simple\\\\n\\\\nEasily Clone Your Delta Lake\\\\n\\\\nfor Testing, Sharing and ML\\\\n\\\\nReproducibility 41\\\\n\\\\nWhat are clones? 41\\\\n\\\\n\\\\nA lakehouse combines the best elements\\\\n\\\\nof data lakes and data warehouses 52\\\\n\\\\nSome early examples 55\\\\n\\\\nFrom BI to AI 55\\\\n\\\\nDiving Deep Into the\\\\n\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\n\\\\n1. Data lakes 57\\\\n\\\\n2. 
Custom storage engines 57\\\\n\\\\n\\\\nCreating the Dashboard /\\\\n\\\\nVirtual Network Operation Centers 82\\\\n\\\\nCreating (near) real-time alerts 85\\\\n\\\\nNext steps: machine learning 86\\\\n\\\\nPoint-of-failure prediction and remediation 87\\\\n\\\\nCustomer churn 87\\\\n\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\n\\\\nCustomer Use Cases 88\\\\n\\\\nHealthdirect Australia 89\\\\n\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\n\\\\n\\\\nFundamentals & Performance\\\\n\\\\n\\\\nUsing data skipping and Z-Order clustering 21\\\\n\\\\n\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\n\\\\n\\\\nExploring the details 21\\\\n\\\\n\\\\nPerformance Matter\\\\n\\\\n\\\\nFeatures\\\\n\\\\n\\\\nChallenges with data lakes\\\\n\\\\nDelta Lake\\u2019s key functionalities\\\\n\\\\nUnpacking the Transaction Log\\\\n\\\\nImplementing atomicity to ensure\\\\n\\\\n\\\\nWhy Use MERGE\\\\n\\\\nWith Delta Lake?\\\\n\\\\nWhen are upserts necessary? 24\\\\n\\\\nWhy upserts into data lakes have\\\\n\\\\n\\\\noperations complete fully\\\\n\\\\n\\\\noperations complete fully 9\\\\n\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\n\\\\nTime travel, data lineage and debugging 10\\\\n\\\\nHow to Use Schema Enforcement and Evolution\\\\n\\\\nUnderstanding table schemas 11\\\\n\\\\n#### 01\\\\n\\\\n\\\\nFundamentals and Performance traditionally been challenging 25\\\\n\\\\n\\\\ntraditionally been challenging\\\\n\\\\n\\\\nShallow clones\\\\n\\\\nDeep clones\\\\n\\\\n\\\\n**Chapter**\\\\n\\\\n42\\\\n\\\\n42\\\\n\\\\n#### 04\\\\n\\\\n\\\\n3. Lakehouse\\\\n\\\\n\\\\nDealing with multiple concurrent reads and writes\\\\n\\\\n\\\\nIntroducing MERGE in Delta Lake\\\\n\\\\n\\\\nIn the research paper, the authors explain: 59\\\\n\\\\n\\\\n3. Lakehouse Streaming 58\\\\n\\\\n\\\\n\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\nand Performance Matter Deleting data due to GDPR 26\\\\n\\\\n\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0040403795, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"1b74eac4a063d67e5f727e36b040965b\\\"}, {\\\"page_content\\\": \\\"**\\u2022** Since data platforms continuously change, data engineers\\\\nspend time building and maintaining, and then rebuilding, complex\\\\nscalable infrastructure\\\\n\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\n\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\npipelines are required, which are even more difficult to build and maintain\\\\n\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\n\\\\n\\\\n**How can Databricks help?**\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The Lakehouse Platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\ndrive valuable insights.\\\\n\\\\nLakehouse Platform\\\\n\\\\n**One platform to support multiple personas**\\\\n\\\\n\\\\n**BI & Data**\\\\n**Warehousing**\\\\n\\\\n\\\\n**Data**\\\\n**Engineering**\\\\n\\\\n\\\\n**Data**\\\\n**Streaming**\\\\n\\\\n\\\\n**Data**\\\\n**Science & ML**\\\\n\\\\n\\\\n\\u00a92023 Databricks Inc. \\u2014 All rights reserved\\\\n\\\\n\\\\n**Unity Catalog**\\\\n**Fine-grained governance for data and AI**\\\\n\\\\n**Delta Lake**\\\\n**Data reliability and performance**\\\\n\\\\n**Cloud Data Lake**\\\\n\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\n\\\\n\\\\nFigure 1\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key differentiators for successful data engineering**\\\\n**with Databricks**\\\\n\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\nkey differentiating capabilities:\\\\n\\\\n**Data ingestion at scale**\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\nanalytics, data science or machine learning. This includes:\\\\n\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\n\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\nchanges for structured and unstructured data formats\\\\n\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\n\\\\nno manual intervention\\\\n\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\n\\\\n\\\\n**Declarative ETL pipelines**\\\\nData engineers can reduce development time and effort and instead focus on\\\\nimplementing business logic and data quality checks within the data pipeline\\\\nusing SQL or Python. 
This can be achieved by:\\\\n\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\ndefine \\u201cwhat\\u201d to solve\\\\n\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\ndependencies across the data pipeline\\\\n\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\nand managing data pipeline recovery\\\\n\\\\n**Real-time data processing**\\\\nAllow data engineers to tune data latency with cost controls without the\\\\nneed to know complex stream processing or implement recovery logic.\\\\n\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\n\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\n\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\nlogic for downstream use cases\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003983449, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\"}, \\\"id\\\": \\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0x234d8afed236cb49\", \"trace_id\": \"0x6d2249f277f17c1636ab6925654a40fd\"}, \"parent_id\": \"0x53cf1c9e349c70f3\", \"start_time\": 1734543862420300157, \"end_time\": null, \"status_code\": \"UNSET\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-e6ad0c8144a7437289b16c45349319ed\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"data pipeline challenges for data lakes\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_e57e8577-69df-4edd-bf4c-40a8d1bbd31b\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"**Declarative ETL pipelines**\\\\\\\\nData engineers can reduce development time and effort and instead focus on\\\\\\\\nimplementing business logic and data quality checks within the data pipeline\\\\\\\\nusing SQL or Python. This can be achieved by:\\\\\\\\n\\\\\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\\\\\ndefine \\u201cwhat\\u201d to solve\\\\\\\\n\\\\\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\\\\\ndependencies across the data pipeline\\\\\\\\n\\\\\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\\\\\nand managing data pipeline recovery\\\\\\\\n\\\\\\\\n**Real-time data processing**\\\\\\\\nAllow data engineers to tune data latency with cost controls without the\\\\\\\\nneed to know complex stream processing or implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\\\\\n\\\\\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\\\\\n\\\\\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\\\\\nlogic for downstream use cases\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified orchestration of data workflows**\\\\\\\\nSimple, clear and reliable orchestration of data processing tasks for data,\\\\\\\\nanalytics and machine learning pipelines with the ability to run multiple\\\\\\\\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\\\\\\\\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\\\\\\\\nin a DAG using Databricks Workflows, an orchestration tool included in the\\\\\\\\nlakehouse with no need to maintain or pay for an external orchestration service.\\\\\\\\n\\\\\\\\n**\\u2022** Easily create and manage multiple tasks with dependencies via UI,\\\\\\\\nAPI or from your IDE\\\\\\\\n\\\\\\\\n**\\u2022** Have full observability to all workflow runs and get alerted when\\\\\\\\ntasks fail for fast troubleshooting and efficient repair and rerun\\\\\\\\n\\\\\\\\n**\\u2022** Leverage high reliability of 99.95% uptime\\\\\\\\n\\\\\\\\n**\\u2022** Use performance optimization clusters that parallelize jobs and\\\\\\\\nminimize data movement with cluster reuse\\\\\\\\n\\\\\\\\n**Data quality validation and monitoring**\\\\\\\\nImprove data reliability throughout the data lakehouse so data teams can\\\\\\\\nconfidently trust the information for downstream initiatives by:\\\\\\\\n\\\\\\\\n**\\u2022** Defining data quality and integrity controls within the pipeline\\\\\\\\nwith defined data expectations\\\\\\\\n\\\\\\\\n**\\u2022** Addressing data quality errors with predefined policies\\\\\\\\n(fail, drop, alert, quarantine)\\\\\\\\n\\\\\\\\n**\\u2022** Leveraging the data quality metrics that are captured, tracked\\\\\\\\nand reported for the entire data pipeline\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nSources\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehouses\\\\\\\\n\\\\\\\\nOn-premises\\\\\\\\nSystems\\\\\\\\n\\\\\\\\nSaaS\\\\\\\\nApplications\\\\\\\\n\\\\\\\\nMachine &\\\\\\\\nApplication Logs\\\\\\\\n\\\\\\\\nApplication\\\\\\\\nEvents\\\\\\\\n\\\\\\\\nMobile & IoT\\\\\\\\nData\\\\\\\\n\\\\\\\\n\\\\\\\\nCloud\\\\\\\\nStorage\\\\\\\\n\\\\\\\\nMessag\\\\\\\\ne Buses\\\\\\\\n\\\\\\\\n\\\\\\\\n**Lakehouse Platform**\\\\\\\\n\\\\\\\\n**Workflows** for end-to-end orchestration\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time BI Apps\\\\\\\\n\\\\\\\\nReal-Time AI Apps\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time Analytics with\\\\\\\\n**Databricks SQL**\\\\\\\\n\\\\\\\\nReal-Time Machine Learning\\\\\\\\nwith\\\\\\\\n**Databricks ML**\\\\\\\\n\\\\\\\\n\\\\\\\\nStreaming ETL with\\\\\\\\n**Delta Live Tables**\\\\\\\\n\\\\\\\\n\\\\\\\\nPredictive\\\\\\\\nMaintenance\\\\\\\\n\\\\\\\\n\\\\\\\\nPersonalized\\\\\\\\nOffers\\\\\\\\n\\\\\\\\n\\\\\\\\nPatient\\\\\\\\nDiagnostics\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time Operational\\\\\\\\nApps\\\\\\\\n\\\\\\\\n\\\\\\\\nReal-Time Applications with\\\\\\\\n**Spark Structured Streaming**\\\\\\\\n\\\\\\\\n**Photon** for lightning-fast data processing\\\\\\\\n\\\\\\\\n**Unity Catalog** for data governance and sharing\\\\\\\\n\\\\\\\\n**Delta Lake** for open and reliable data storage\\\\\\\\n\\\\\\\\n\\\\\\\\nAlerts Detection Fraud\\\\\\\\n\\\\\\\\n\\\\\\\\nDynamic\\\\\\\\nPricing\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\\\\\n\\\\\\\\nFigure 2\\\\\\\\nA unified set of tools for real-time data processing\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Fault tolerant and automatic recovery**\\\\\\\\nHandle transient errors and recover from most common error conditions\\\\\\\\noccurring during the operation of a pipeline with fast, scalable automatic\\\\\\\\nrecovery that includes:\\\\\\\\n\\\\\\\\n**\\u2022** Fault tolerant mechanisms to consistently recover the state of data\\\\\\\\n\\\\\\\\n**\\u2022** The ability to automatically track progress from the source with\\\\\\\\ncheckpointing\\\\\\\\n\\\\\\\\n**\\u2022** The ability to automatically recover and restore the data pipeline state\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004132444, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d85d526722f3ca9735bc45d98a9ad449\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. 
All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00411582, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\\\\\\\\n\\\\\\\\nand batch and streaming jobs. 
Also, performance is hampered by expensive metadata operations \\u2014 for\\\\\\\\n\\\\\\\\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\\\\\\\\n\\\\\\\\n**Lakehouse \\u2014 the modern data architecture**\\\\\\\\n\\\\\\\\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\\\\\\\\n\\\\\\\\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\\\\\\\\n\\\\\\\\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\\\\\\\\n\\\\\\\\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\\\\\\\\n\\\\\\\\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\\\\\\\\n\\\\\\\\narchitecture possible.\\\\\\\\n\\\\\\\\n\\\\\\\\non all data on a simple, open and multicloud\\\\\\\\n\\\\\\\\nmodern data stack.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Exploratory Data Scientist**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Curated Data Lake**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Raw Data Ingest**\\\\\\\\n\\u201cBronze\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n**Filtered/Cleaned/Augmented**\\\\\\\\n\\u201cSilver\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n**Business-Level Aggregates**\\\\\\\\n\\u201cGold\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n**D ATA Q U A L I T Y**\\\\\\\\n\\\\\\\\n**Data Sources (Batch and Real-Time)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Unstructured**\\\\\\\\n\\\\\\\\n- Image, Video, Audio\\\\\\\\n\\\\\\\\n- Free Text, Blob\\\\\\\\n\\\\\\\\n\\\\\\\\n**Semi-Structured**\\\\\\\\n\\\\\\\\n- Logs, Clickstream\\\\\\\\n\\\\\\\\n- CSV, JSON, XML\\\\\\\\n\\\\\\\\n\\\\\\\\n**Structured**\\\\\\\\n\\\\\\\\n- Systems of Record\\\\\\\\n\\\\\\\\n- Operational DBs\\\\\\\\n\\\\\\\\n\\\\\\\\n**Figure 8:**\\\\\\\\nThe building blocks for a modern data architecture\\\\\\\\n\\\\\\\\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\\\\\\\\n\\\\\\\\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. This\\\\\\\\n\\\\\\\\ntarget-state architecture supports loading all the data types that might be interesting to an organization \\u2014\\\\\\\\n\\\\\\\\nstructured, semi-structured and unstructured \\u2014 and provides a single processing layer, using consistent\\\\\\\\n\\\\\\\\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\\\\\\\\n\\\\\\\\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\\\\\\\\n\\\\\\\\ntime, money and duplication of effort. 
Data arrives in a landing zone and is then moved through a series of\\\\\\\\n\\\\\\\\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\\\\\\\\n\\\\\\\\nThe architecture makes possible the efficient creation of \\u201cdata assets\\u201d for the organization by taking a\\\\\\\\n\\\\\\\\nstepwise approach to improving data.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Lakehouse key features**\\\\\\\\n\\\\\\\\nTo effectively migrate organizations to the lakehouse architecture, here\\u2019s a list of key features that must be\\\\\\\\n\\\\\\\\navailable for stakeholders to run business-critical production workloads:\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\\\\\\\\n\\\\\\\\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\\\\\\\\n\\\\\\\\nmonitoring and recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\\\\\\\\n\\\\\\\\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\\\\\\\\n\\\\\\\\nread or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\\\\\\\\n\\\\\\\\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\\\\\\\\n\\\\\\\\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n\\\\\\\\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\\\\\\\\n\\\\\\\\nlakes across clouds \\u2014 based on the ANSI SQL open standards. The lakehouse enables organizations\\\\\\\\n\\\\\\\\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004092816, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"a6c4aa57b347d46b3d74ce86a7176024\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"##### The Delta Lake Series Complete Collection\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What is Delta Lake?\\\\\\\\n\\\\\\\\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\\\\\\\\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\\\\\\\\ncompatible with Apache Spark\\u2122 APIs.\\\\\\\\n\\\\\\\\nAt Databricks, we\\u2019ve seen how Delta Lake can bring reliability, performance and\\\\\\\\nlifecycle management to data lakes. 
With Delta Lake, there will be no more\\\\\\\\nmalformed data ingestion, difficulties deleting data for compliance, or issues\\\\\\\\nmodifying data for data capture.\\\\\\\\n\\\\\\\\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\\\\\\\\nyour data lake and the rate that teams can leverage that data with a secure and\\\\\\\\nscalable cloud service.\\\\\\\\n\\\\\\\\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\\\\\\\\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nContents Processes Petabytes With Data Skipping and Z-Ordering\\\\\\\\n\\\\\\\\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\\\\\\\\n\\\\\\\\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\\\\\\\\n\\\\\\\\nPerformance Matter **you\\u2019ll find inside** 5 Features 22\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\nProcesses Petabytes With Data Skipping and Z-Ordering\\\\\\\\n\\\\\\\\n\\\\\\\\nRollbacks 39\\\\\\\\n\\\\\\\\nPinned view of a continuously updating\\\\\\\\n\\\\\\\\nDelta Lake table across multiple downstream jobs\\\\\\\\n\\\\\\\\nQueries for time series analytics made simple\\\\\\\\n\\\\\\\\nEasily Clone Your Delta Lake\\\\\\\\n\\\\\\\\nfor Testing, Sharing and ML\\\\\\\\n\\\\\\\\nReproducibility 41\\\\\\\\n\\\\\\\\nWhat are clones? 41\\\\\\\\n\\\\\\\\n\\\\\\\\nA lakehouse combines the best elements\\\\\\\\n\\\\\\\\nof data lakes and data warehouses 52\\\\\\\\n\\\\\\\\nSome early examples 55\\\\\\\\n\\\\\\\\nFrom BI to AI 55\\\\\\\\n\\\\\\\\nDiving Deep Into the\\\\\\\\n\\\\\\\\nInner Workings of the Lakehouse and Delta Lake 56\\\\\\\\n\\\\\\\\n1. Data lakes 57\\\\\\\\n\\\\\\\\n2. Custom storage engines 57\\\\\\\\n\\\\\\\\n\\\\\\\\nCreating the Dashboard /\\\\\\\\n\\\\\\\\nVirtual Network Operation Centers 82\\\\\\\\n\\\\\\\\nCreating (near) real-time alerts 85\\\\\\\\n\\\\\\\\nNext steps: machine learning 86\\\\\\\\n\\\\\\\\nPoint-of-failure prediction and remediation 87\\\\\\\\n\\\\\\\\nCustomer churn 87\\\\\\\\n\\\\\\\\nGetting started with the Databricks streaming video QoS solution 87\\\\\\\\n\\\\\\\\nCustomer Use Cases 88\\\\\\\\n\\\\\\\\nHealthdirect Australia 89\\\\\\\\n\\\\\\\\nData quality and governance issues, silos, and the inability to scale 89\\\\\\\\n\\\\\\\\n\\\\\\\\nFundamentals & Performance\\\\\\\\n\\\\\\\\n\\\\\\\\nUsing data skipping and Z-Order clustering 21\\\\\\\\n\\\\\\\\n\\\\\\\\nThe Fundamentals of Delta Lake: Why Reliability and\\\\\\\\n\\\\\\\\n\\\\\\\\nExploring the details 21\\\\\\\\n\\\\\\\\n\\\\\\\\nPerformance Matter\\\\\\\\n\\\\\\\\n\\\\\\\\nFeatures\\\\\\\\n\\\\\\\\n\\\\\\\\nChallenges with data lakes\\\\\\\\n\\\\\\\\nDelta Lake\\u2019s key functionalities\\\\\\\\n\\\\\\\\nUnpacking the Transaction Log\\\\\\\\n\\\\\\\\nImplementing atomicity to ensure\\\\\\\\n\\\\\\\\n\\\\\\\\nWhy Use MERGE\\\\\\\\n\\\\\\\\nWith Delta Lake?\\\\\\\\n\\\\\\\\nWhen are upserts necessary? 
24\\\\\\\\n\\\\\\\\nWhy upserts into data lakes have\\\\\\\\n\\\\\\\\n\\\\\\\\noperations complete fully\\\\\\\\n\\\\\\\\n\\\\\\\\noperations complete fully 9\\\\\\\\n\\\\\\\\nDealing with multiple concurrent reads and writes **Chapter**\\\\\\\\n\\\\\\\\nTime travel, data lineage and debugging 10\\\\\\\\n\\\\\\\\nHow to Use Schema Enforcement and Evolution\\\\\\\\n\\\\\\\\nUnderstanding table schemas 11\\\\\\\\n\\\\\\\\n#### 01\\\\\\\\n\\\\\\\\n\\\\\\\\nFundamentals and Performance traditionally been challenging 25\\\\\\\\n\\\\\\\\n\\\\\\\\ntraditionally been challenging\\\\\\\\n\\\\\\\\n\\\\\\\\nShallow clones\\\\\\\\n\\\\\\\\nDeep clones\\\\\\\\n\\\\\\\\n\\\\\\\\n**Chapter**\\\\\\\\n\\\\\\\\n42\\\\\\\\n\\\\\\\\n42\\\\\\\\n\\\\\\\\n#### 04\\\\\\\\n\\\\\\\\n\\\\\\\\n3. Lakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nDealing with multiple concurrent reads and writes\\\\\\\\n\\\\\\\\n\\\\\\\\nIntroducing MERGE in Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\nIn the research paper, the authors explain: 59\\\\\\\\n\\\\\\\\n\\\\\\\\n3. Lakehouse Streaming 58\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\\\\\\\\nand Performance Matter Deleting data due to GDPR 26\\\\\\\\n\\\\\\\\n\\\\\\\\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0040403795, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1b74eac4a063d67e5f727e36b040965b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**\\u2022** Since data platforms continuously change, data engineers\\\\\\\\nspend time building and maintaining, and then rebuilding, complex\\\\\\\\nscalable infrastructure\\\\\\\\n\\\\\\\\n**\\u2022** As data pipelines become more complex, data engineers are\\\\\\\\nrequired to find reliable tools to orchestrate these pipelines\\\\\\\\n\\\\\\\\n**\\u2022** With the increasing importance of real-time data, low latency data\\\\\\\\npipelines are required, which are even more difficult to build and maintain\\\\\\\\n\\\\\\\\n**\\u2022** Finally, with all pipelines written, data engineers need to constantly\\\\\\\\nfocus on performance, tuning pipelines and architectures to meet SLAs\\\\\\\\n\\\\\\\\n\\\\\\\\n**How can Databricks help?**\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The Lakehouse Platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability to\\\\\\\\ndrive valuable insights.\\\\\\\\n\\\\\\\\nLakehouse Platform\\\\\\\\n\\\\\\\\n**One platform to support multiple personas**\\\\\\\\n\\\\\\\\n\\\\\\\\n**BI & Data**\\\\\\\\n**Warehousing**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Data**\\\\\\\\n**Engineering**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Data**\\\\\\\\n**Streaming**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Data**\\\\\\\\n**Science & ML**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00a92023 Databricks Inc. 
\\u2014 All rights reserved\\\\\\\\n\\\\\\\\n\\\\\\\\n**Unity Catalog**\\\\\\\\n**Fine-grained governance for data and AI**\\\\\\\\n\\\\\\\\n**Delta Lake**\\\\\\\\n**Data reliability and performance**\\\\\\\\n\\\\\\\\n**Cloud Data Lake**\\\\\\\\n\\\\\\\\nAll Raw Data (Logs, Texts, Audio, Video, Images)\\\\\\\\n\\\\\\\\n\\\\\\\\nFigure 1\\\\\\\\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key differentiators for successful data engineering**\\\\\\\\n**with Databricks**\\\\\\\\n\\\\\\\\nBy simplifying on a lakehouse architecture, data engineers need an\\\\\\\\nenterprise-grade and enterprise-ready approach to building data pipelines.\\\\\\\\nTo be successful, a data engineering solution team must embrace these eight\\\\\\\\nkey differentiating capabilities:\\\\\\\\n\\\\\\\\n**Data ingestion at scale**\\\\\\\\nWith the ability to ingest petabytes of data with auto-evolving schemas,\\\\\\\\ndata engineers can deliver fast, reliable, scalable and automatic data for\\\\\\\\nanalytics, data science or machine learning. This includes:\\\\\\\\n\\\\\\\\n**\\u2022** Incrementally and efficiently processing data as it arrives\\\\\\\\nfrom files or streaming sources like Kafka, DBMS and NoSQL\\\\\\\\n\\\\\\\\n**\\u2022** Automatically inferring schema and detecting column\\\\\\\\nchanges for structured and unstructured data formats\\\\\\\\n\\\\\\\\n**\\u2022** Automatically and efficiently tracking data as it arrives with\\\\\\\\n\\\\\\\\nno manual intervention\\\\\\\\n\\\\\\\\n**\\u2022** Preventing data loss by rescuing data columns\\\\\\\\n\\\\\\\\n\\\\\\\\n**Declarative ETL pipelines**\\\\\\\\nData engineers can reduce development time and effort and instead focus on\\\\\\\\nimplementing business logic and data quality checks within the data pipeline\\\\\\\\nusing SQL or Python. 
This can be achieved by:\\\\\\\\n\\\\\\\\n**\\u2022** Using intent-driven declarative development to simplify \\u201chow\\u201d and\\\\\\\\ndefine \\u201cwhat\\u201d to solve\\\\\\\\n\\\\\\\\n**\\u2022** Automatically creating high-quality lineage and managing table\\\\\\\\ndependencies across the data pipeline\\\\\\\\n\\\\\\\\n**\\u2022** Automatically checking for missing dependencies or syntax errors,\\\\\\\\nand managing data pipeline recovery\\\\\\\\n\\\\\\\\n**Real-time data processing**\\\\\\\\nAllow data engineers to tune data latency with cost controls without the\\\\\\\\nneed to know complex stream processing or implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** Avoid handling batch and real-time streaming data sources separately\\\\\\\\n\\\\\\\\n**\\u2022** Execute data pipeline workloads on automatically provisioned elastic\\\\\\\\nApache Spark\\u2122-based compute clusters for scale and performance\\\\\\\\n\\\\\\\\n**\\u2022** Remove the need to manage infrastructure and focus on the business\\\\\\\\nlogic for downstream use cases\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003983449, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"bf114a736c5b9b473f4e1c81c2bbaa5e\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\"}, \"events\": [{\"name\": \"exception\", \"timestamp\": 1734543863868205, \"attributes\": {\"exception.message\": \"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\", \"exception.type\": \"RateLimitError\", \"exception.stacktrace\": \"RateLimitError(\\\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. 
Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\\\")Traceback (most recent call last):\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/openai/_openai_autolog.py\\\", line 181, in patched_call\\n raw_result = original(self, *args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 573, in call_original\\n return call_original_fn_with_event_logging(_original_fn, og_args, og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 508, in call_original_fn_with_event_logging\\n original_fn_result = original_fn(*og_args, **og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/utils/autologging_utils/safety.py\\\", line 570, in _original_fn\\n original_result = original(*_og_args, **_og_kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_utils/_utils.py\\\", line 275, in wrapper\\n return func(*args, **kwargs)\\n ^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/resources/chat/completions.py\\\", line 859, in create\\n return self._post(\\n ^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1280, in post\\n return cast(ResponseT, self.request(cast_to, opts, stream=stream, stream_cls=stream_cls))\\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 957, in request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1046, in _request\\n return self._retry_request(\\n ^^^^^^^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1095, in _retry_request\\n return self._request(\\n ^^^^^^^^^^^^^^\\n\\n\\n File \\\"/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/openai/_base_client.py\\\", line 1061, in _request\\n raise self._make_status_error_from_response(err.response) from 
None\\n\\n\\nopenai.RateLimitError: Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\"}}]}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are some of the key challen...\\\", \\\"params\\\": null}\", \"response\": null}}", + "Fail to invoke the model with {'messages': [{'content': 'What are some of the key challenges mentioned that data engineers face when building data pipelines for data lakes?', 'role': 'user'}]}. RateLimitError(\"Error code: 429 - {'error_code': 'REQUEST_LIMIT_EXCEEDED', 'message': 'REQUEST_LIMIT_EXCEEDED: Exceeded workspace rate limit for databricks-meta-llama-3-3-70b-instruct. Please use a provisioned throughput Foundation Model APIs endpoint for a higher rate limit.'}\")", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf", + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + null, + [ + [ + "**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----\n\n**Unified orchestration of data workflows**\nSimple, clear and reliable orchestration of data processing tasks for data,\nanalytics and machine learning pipelines with the ability to run multiple\nnon-interactive tasks as a directed acyclic graph (DAG) on a Databricks\ncompute cluster. 
Orchestrate tasks of any kind (SQL, Python, JARs, Notebooks)\nin a DAG using Databricks Workflows, an orchestration tool included in the\nlakehouse with no need to maintain or pay for an external orchestration service.\n\n**•** Easily create and manage multiple tasks with dependencies via UI,\nAPI or from your IDE\n\n**•** Have full observability to all workflow runs and get alerted when\ntasks fail for fast troubleshooting and efficient repair and rerun\n\n**•** Leverage high reliability of 99.95% uptime\n\n**•** Use performance optimization clusters that parallelize jobs and\nminimize data movement with cluster reuse\n\n**Data quality validation and monitoring**\nImprove data reliability throughout the data lakehouse so data teams can\nconfidently trust the information for downstream initiatives by:\n\n**•** Defining data quality and integrity controls within the pipeline\nwith defined data expectations\n\n**•** Addressing data quality errors with predefined policies\n(fail, drop, alert, quarantine)\n\n**•** Leveraging the data quality metrics that are captured, tracked\nand reported for the entire data pipeline\n\n\nData\nSources\n\nData\nWarehouses\n\nOn-premises\nSystems\n\nSaaS\nApplications\n\nMachine &\nApplication Logs\n\nApplication\nEvents\n\nMobile & IoT\nData\n\n\nCloud\nStorage\n\nMessag\ne Buses\n\n\n**Lakehouse Platform**\n\n**Workflows** for end-to-end orchestration\n\n\nReal-Time BI Apps\n\nReal-Time AI Apps\n\n\nReal-Time Analytics with\n**Databricks SQL**\n\nReal-Time Machine Learning\nwith\n**Databricks ML**\n\n\nStreaming ETL with\n**Delta Live Tables**\n\n\nPredictive\nMaintenance\n\n\nPersonalized\nOffers\n\n\nPatient\nDiagnostics\n\n\nReal-Time Operational\nApps\n\n\nReal-Time Applications with\n**Spark Structured Streaming**\n\n**Photon** for lightning-fast data processing\n\n**Unity Catalog** for data governance and sharing\n\n**Delta Lake** for open and reliable data storage\n\n\nAlerts Detection Fraud\n\n\nDynamic\nPricing\n\n\n©2023 Databricks Inc. 
— All rights reserved\n\nFigure 2\nA unified set of tools for real-time data processing\n\n\n-----\n\n**Fault tolerant and automatic recovery**\nHandle transient errors and recover from most common error conditions\noccurring during the operation of a pipeline with fast, scalable automatic\nrecovery that includes:\n\n**•** Fault tolerant mechanisms to consistently recover the state of data\n\n**•** The ability to automatically track progress from the source with\ncheckpointing\n\n**•** The ability to automatically recover and restore the data pipeline state", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ], + [ + "# Building Reliable Data Lakes at Scale With Delta Lake\n\n\n-----\n\n## Contents\n\n#### Data Engineering Drivers 2\n\n Data Pipeline Key Goals 4\n\n Apache Spark™: The First Unified Analytics Engine 5\n\n Data Reliability Challenges With Data Lakes 6\n\n Delta Lake: A New Storage Layer 7\n\n Delta Lake: Key Features 8\n\n Getting Started With Delta Lake 10\n\n\n-----\n\n## Drivers\n\n#### Data Engineering Drivers\n\nData engineering professionals are needing to respond to several different drivers.\n\nChief among the drivers they face are:\n\n**Rise of Advanced Analytics** — Advanced analytics, including methods\n\nbased on machine learning techniques, have evolved to such a degree that\n\norganizations seek to derive far more value from their corporate assets.\n\n**Widespread Adoption** — Once the province of leading edge, high-tech\n\ncompanies, these advanced approaches are being adopted across a\n\nmultitude of industries from retail to hospitality to healthcare and across\n\nprivate as well as public sector organizations. This is further driving the need\n\nfor strong data engineering practices.\n\n**Regulation** — With the growth of data generation and data collection,\n\nthere is increased interest in how the data is protected and managed.\n\nRegulatory regimes such as GDPR (General Data Protection Regulation)\n\nfrom the EU and other jurisdictions mandate very specific ways in which\n\ndata must be managed.\n\n\n-----\n\n## Drivers\n\n**Technology Innovation** — The move to cloud-based analytics architectures\n\nthat is now well underway is being propelled further by innovations such as\n\nanalytics-focused chipsets, pipeline automation and the unification of data\n\nand machine learning. All these offer data professionals new approaches for\n\ntheir data initiatives.\n\n**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n\nalso subject to increasing scrutiny. There is also a greater understanding of\n\ndata as a valuable asset. Deriving value from data must be done in a manner\n\nthat is financially responsible and actually value adding to the enterprise and\n\nmeeting ROI hurdles.\n\n**Role Evolution** — Reflecting the importance of managing the data and\n\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\n\nmore prominent and newer roles such as Data Curator are emerging.\n\nThey must balance the needs of governance, security and democratization.\n\n\n-----\n\n## Key Goals\n\n#### Data Pipeline Key Goals\n\nMaking quality data available in a reliable manner is a major determinant of success for data\n\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\n\ndrawing on state-of-the-art machine learning techniques. 
Data engineers tasked with this\n\nresponsibility need to take account of a broad set of dependencies and requirements as they\n\ndesign and build their data pipelines.\n\nThree primary goals that data engineers typically seek to address as they work to enable the\n\nanalytics professionals in their organizations are:\n\n**Deliver quality data in less time** — When it comes to data, quality and timeliness\n\nare key. Data with gaps or errors (which can arise for many reasons) is\n\n“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n\nusers. Equally well, many applications require up-to-date information (who\n\nwants to use last night’s closing stock price or weather forecast) and are of\n\nlimited value without it.\n\n**Enable faster queries** — Wanting fast responses to queries is natural enough\n\nin today’s “New York minute,” online world. Achieving this is particularly\n\ndemanding when the queries are based on very large data sets.\n\n**Simplify data engineering at scale** — It is one thing to have high reliability and\n\nperformance in a limited, development or test environment. What matters\n\nmore is the ability to have robust, production data pipelines at scale without\n\nrequiring high operational overhead.\n\n\n-----", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf" + ], + [ + "data quality, and their lack of consistency/isolation makes it almost impossible to mix appends and reads,\n\nand batch and streaming jobs. Also, performance is hampered by expensive metadata operations — for\n\nexample, efficiently listing the millions of files (objects) that make up most large data lakes.\n\n**Lakehouse — the modern data architecture**\n\nWhat if it were possible to combine the best of both worlds? The performance, concurrency and data\n\nmanagement of EDWs with the scalability, low cost and workload flexibility of the data lake. This is exactly\n\nthe target architecture described by CDOs, CIOs and CTOs when asked how they would envision reducing\n\nthe complexity of their current data ecosystems while enabling data and AI, at scale. The building blocks\n\nof this architecture are shown in Figure 8 and are what inspired the innovations that make the lakehouse\n\narchitecture possible.\n\n\non all data on a simple, open and multicloud\n\nmodern data stack.\n\n\n-----\n\n**Exploratory Data Scientist**\n\n\n**Production Machine Learning** **BI/Ad Hoc SQL Analytics**\n\n\n**Curated Data Lake**\n\n\n**Raw Data Ingest**\n“Bronze”\n\n\n**Filtered/Cleaned/Augmented**\n“Silver”\n\n\n**Business-Level Aggregates**\n“Gold”\n\n\n**D ATA Q U A L I T Y**\n\n**Data Sources (Batch and Real-Time)**\n\n\n**Unstructured**\n\n- Image, Video, Audio\n\n- Free Text, Blob\n\n\n**Semi-Structured**\n\n- Logs, Clickstream\n\n- CSV, JSON, XML\n\n\n**Structured**\n\n- Systems of Record\n\n- Operational DBs\n\n\n**Figure 8:**\nThe building blocks for a modern data architecture\n\nThe lakehouse architecture provides a flexible, high-performance design for diverse data applications,\n\nincluding real-time streaming, batch processing, data warehousing, data science and machine learning. 
This\n\ntarget-state architecture supports loading all the data types that might be interesting to an organization —\n\nstructured, semi-structured and unstructured — and provides a single processing layer, using consistent\n\nAPIs across programming languages, to curate data while applying rigorous data management techniques.\n\nThe move toward a single, consistent approach to data pipelining and refinement saves organizations\n\ntime, money and duplication of effort. Data arrives in a landing zone and is then moved through a series of\n\ncuration and refinement steps resulting in highly consumable and trusted data for downstream use cases.\n\nThe architecture makes possible the efficient creation of “data assets” for the organization by taking a\n\nstepwise approach to improving data.\n\n\n-----\n\n**Lakehouse key features**\n\nTo effectively migrate organizations to the lakehouse architecture, here’s a list of key features that must be\n\navailable for stakeholders to run business-critical production workloads:\n\n\u0007 **Reliable data pipelines:** The lakehouse architecture simplifies the ETL development and\n\nmanagement with declarative pipeline development, automatic data testing and deep visibility for\n\nmonitoring and recovery.\n\n\u0007 **Transaction support:** In an enterprise lakehouse, many data pipelines will often be reading and writing\n\ndata concurrently. Support for ACID transactions ensures consistency as multiple parties concurrently\n\nread or write data, typically using SQL.\n\n\u0007 **Schema enforcement and governance:** The lakehouse should have a way to support schema enforcement\n\nand evolution, supporting DW schema paradigms such as star/snowflake schemas. The system should\n\nbe able to reason about data integrity, and it should have robust governance and auditing mechanisms.\n\n\u0007 **Fine-grained governance for data and AI:** The first fine-grained, centralized security model for data\n\nlakes across clouds — based on the ANSI SQL open standards. The lakehouse enables organizations\n\nto unify data and AI assets by centrally sharing, auditing, securing and managing structured and", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf" + ], + [ + "##### The Delta Lake Series Complete Collection\n\n\n-----\n\n### What is Delta Lake?\n\n[Delta Lake](https://databricks.com/product/delta-lake-on-databricks) is a unified data management system that brings data reliability and fast\nanalytics to cloud data lakes. Delta Lake runs on top of existing data lakes and is fully\ncompatible with Apache Spark™ APIs.\n\nAt Databricks, we’ve seen how Delta Lake can bring reliability, performance and\nlifecycle management to data lakes. 
With Delta Lake, there will be no more\nmalformed data ingestion, difficulties deleting data for compliance, or issues\nmodifying data for data capture.\n\nWith Delta Lake, you can accelerate the velocity that high-quality data can get into\nyour data lake and the rate that teams can leverage that data with a secure and\nscalable cloud service.\n\nIn this eBook, the Databricks team has compiled all of their insights into a comprehensive\nformat so that you can gain a full understanding of Delta Lake and its capabilities.\n\n\n-----\n\nContents Processes Petabytes With Data Skipping and Z-Ordering\n\nFundamentals & Performance **Here s what** 4 Using data skipping and Z-Order clustering\n\nThe Fundamentals of Delta Lake: Why Reliability and 5 Exploring the details 21\n\nPerformance Matter **you’ll find inside** 5 Features 22\n\n\n\nProcesses Petabytes With Data Skipping and Z-Ordering\n\n\nRollbacks 39\n\nPinned view of a continuously updating\n\nDelta Lake table across multiple downstream jobs\n\nQueries for time series analytics made simple\n\nEasily Clone Your Delta Lake\n\nfor Testing, Sharing and ML\n\nReproducibility 41\n\nWhat are clones? 41\n\n\nA lakehouse combines the best elements\n\nof data lakes and data warehouses 52\n\nSome early examples 55\n\nFrom BI to AI 55\n\nDiving Deep Into the\n\nInner Workings of the Lakehouse and Delta Lake 56\n\n1. Data lakes 57\n\n2. Custom storage engines 57\n\n\nCreating the Dashboard /\n\nVirtual Network Operation Centers 82\n\nCreating (near) real-time alerts 85\n\nNext steps: machine learning 86\n\nPoint-of-failure prediction and remediation 87\n\nCustomer churn 87\n\nGetting started with the Databricks streaming video QoS solution 87\n\nCustomer Use Cases 88\n\nHealthdirect Australia 89\n\nData quality and governance issues, silos, and the inability to scale 89\n\n\nFundamentals & Performance\n\n\nUsing data skipping and Z-Order clustering 21\n\n\nThe Fundamentals of Delta Lake: Why Reliability and\n\n\nExploring the details 21\n\n\nPerformance Matter\n\n\nFeatures\n\n\nChallenges with data lakes\n\nDelta Lake’s key functionalities\n\nUnpacking the Transaction Log\n\nImplementing atomicity to ensure\n\n\nWhy Use MERGE\n\nWith Delta Lake?\n\nWhen are upserts necessary? 24\n\nWhy upserts into data lakes have\n\n\noperations complete fully\n\n\noperations complete fully 9\n\nDealing with multiple concurrent reads and writes **Chapter**\n\nTime travel, data lineage and debugging 10\n\nHow to Use Schema Enforcement and Evolution\n\nUnderstanding table schemas 11\n\n#### 01\n\n\nFundamentals and Performance traditionally been challenging 25\n\n\ntraditionally been challenging\n\n\nShallow clones\n\nDeep clones\n\n\n**Chapter**\n\n42\n\n42\n\n#### 04\n\n\n3. Lakehouse\n\n\nDealing with multiple concurrent reads and writes\n\n\nIntroducing MERGE in Delta Lake\n\n\nIn the research paper, the authors explain: 59\n\n\n3. 
Lakehouse Streaming 58\n\n\n\n- The Fundamentals of Delta Lake: Why Reliability Simplifying use cases with MERGE 26\nand Performance Matter Deleting data due to GDPR 26\n\n\nUnderstanding - How Delta Lake Solves Common Pain Points in Streaming 60 Modernizing analytics with Databricks and Delta Lake 90", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf" + ], + [ + "**•** Since data platforms continuously change, data engineers\nspend time building and maintaining, and then rebuilding, complex\nscalable infrastructure\n\n**•** As data pipelines become more complex, data engineers are\nrequired to find reliable tools to orchestrate these pipelines\n\n**•** With the increasing importance of real-time data, low latency data\npipelines are required, which are even more difficult to build and maintain\n\n**•** Finally, with all pipelines written, data engineers need to constantly\nfocus on performance, tuning pipelines and architectures to meet SLAs\n\n\n**How can Databricks help?**\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The Lakehouse Platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability to\ndrive valuable insights.\n\nLakehouse Platform\n\n**One platform to support multiple personas**\n\n\n**BI & Data**\n**Warehousing**\n\n\n**Data**\n**Engineering**\n\n\n**Data**\n**Streaming**\n\n\n**Data**\n**Science & ML**\n\n\n©2023 Databricks Inc. — All rights reserved\n\n\n**Unity Catalog**\n**Fine-grained governance for data and AI**\n\n**Delta Lake**\n**Data reliability and performance**\n\n**Cloud Data Lake**\n\nAll Raw Data (Logs, Texts, Audio, Video, Images)\n\n\nFigure 1\nThe Databricks Lakehouse Platform unifies your data, analytics and AI on one common platform for all your data use cases\n\n\n-----\n\n**Key differentiators for successful data engineering**\n**with Databricks**\n\nBy simplifying on a lakehouse architecture, data engineers need an\nenterprise-grade and enterprise-ready approach to building data pipelines.\nTo be successful, a data engineering solution team must embrace these eight\nkey differentiating capabilities:\n\n**Data ingestion at scale**\nWith the ability to ingest petabytes of data with auto-evolving schemas,\ndata engineers can deliver fast, reliable, scalable and automatic data for\nanalytics, data science or machine learning. This includes:\n\n**•** Incrementally and efficiently processing data as it arrives\nfrom files or streaming sources like Kafka, DBMS and NoSQL\n\n**•** Automatically inferring schema and detecting column\nchanges for structured and unstructured data formats\n\n**•** Automatically and efficiently tracking data as it arrives with\n\nno manual intervention\n\n**•** Preventing data loss by rescuing data columns\n\n\n**Declarative ETL pipelines**\nData engineers can reduce development time and effort and instead focus on\nimplementing business logic and data quality checks within the data pipeline\nusing SQL or Python. 
This can be achieved by:\n\n**•** Using intent-driven declarative development to simplify “how” and\ndefine “what” to solve\n\n**•** Automatically creating high-quality lineage and managing table\ndependencies across the data pipeline\n\n**•** Automatically checking for missing dependencies or syntax errors,\nand managing data pipeline recovery\n\n**Real-time data processing**\nAllow data engineers to tune data latency with cost controls without the\nneed to know complex stream processing or implement recovery logic.\n\n**•** Avoid handling batch and real-time streaming data sources separately\n\n**•** Execute data pipeline workloads on automatically provisioned elastic\nApache Spark™-based compute clusters for scale and performance\n\n**•** Remove the need to manage infrastructure and focus on the business\nlogic for downstream use cases\n\n\n-----", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/big-book-of-data-engineering-2nd-edition-final.pdf" + ] + ], + null, + null, + null, + null, + null, + null + ], + [ + "de1daac1a320379ce055bdc8b8342a2d7ca8d1ea08483081801f8219f41dc69d", + [ + [ + [ + "What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?", + "user" + ] + ] + ], + [ + [ + "“In today’s experience-driven world, the most beloved brands are the ones that know their customers. Customers are loyal to brands that recognize their needs and preferences — and tailor user journeys and engagements accordingly.\n\nA study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer. And as organizations pursue omnichannel excellence, these same high expectations of online experiences also extend to brick-and-mortar locations — revealing for many merchants that personalized engagement is fundamental to attracting customers and expanding share of wallet.\n\nBut achieving a 360-degree view of your customers to serve personalized experiences requires integrating various types of data — including demographics, behavioral and transactional — to develop robust profiles. This guide focuses on six actionable strategic pillars for businesses to leverage automation, real-time data, AI-driven analysis and well-tuned ML models to architect and deliver customized customer experiences at every touch point.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf" + ] + ], + [ + "76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience." 
+ ], + "{\"info\": {\"request_id\": \"tr-086a428d0c8e48f696b74292e6de14dd\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852759, \"execution_time_ms\": 4339, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey stud...\"}, \"tags\": {\"eval.requestId\": \"21081e4e-3cec-4efe-b82d-4da881b06daf\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-086a428d0c8e48f696b74292e6de14dd/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x55e8be6f41937068\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": null, \"start_time\": 1734543852759018056, \"end_time\": 1734543857098523511, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": 
\\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x5361cc2ae9259abf\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x55e8be6f41937068\", \"start_time\": 1734543852808311281, \"end_time\": 1734543852837210547, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xd08249d64badfb47\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x55e8be6f41937068\", \"start_time\": 1734543852837315448, \"end_time\": 1734543852837962656, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x58eb14ae2c4c7759\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0xd08249d64badfb47\", \"start_time\": 1734543852837461050, \"end_time\": 1734543852837605052, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": 
\"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x02ee0dd883e9b850\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0xd08249d64badfb47\", \"start_time\": 1734543852837666553, \"end_time\": 1734543852837926256, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x5d93eb743b5405d0\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x55e8be6f41937068\", \"start_time\": 1734543852838067058, \"end_time\": 1734543857097985404, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. 
And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. 
This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, 
availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. 
It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. 
With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x587f34a7e122462a\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x5d93eb743b5405d0\", \"start_time\": 1734543853035001055, \"end_time\": 1734543853940320633, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. 
**Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_2aca309c-76e7-43a7-b865-7f13c78ba752\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543853, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 40, \\\"prompt_tokens\\\": 1169, \\\"total_tokens\\\": 1209, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, 
\"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x4c79b1c071988e15\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x5d93eb743b5405d0\", \"start_time\": 1734543853944972392, \"end_time\": 1734543854364782042, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\", \\\"filters\\\": []}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032405849, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"0473e2deba8639930389964be7b25bc7\\\"}, {\\\"page_content\\\": \\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. 
Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0031753962, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"d53c2a5c69cef5febfa62ea961c33d25\\\"}, {\\\"page_content\\\": \\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0028500317, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\"}, {\\\"page_content\\\": \\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002557174, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\"}, \\\"id\\\": \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025465384, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8e054539e38c8a49888991a85b178399\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x685e74c98dec52e7\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x4c79b1c071988e15\", \"start_time\": 1734543853946687214, \"end_time\": 1734543854363491826, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"0473e2deba8639930389964be7b25bc7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0032405849], [\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. 
Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"d53c2a5c69cef5febfa62ea961c33d25\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0031753962], [\\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. 
While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0028500317], [\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.002557174], [\\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"8e054539e38c8a49888991a85b178399\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0025465384]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0x5a90adec32571a53\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x4c79b1c071988e15\", \"start_time\": 1734543854363626328, \"end_time\": 1734543854364452438, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"0473e2deba8639930389964be7b25bc7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0032405849], [\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. 
Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"d53c2a5c69cef5febfa62ea961c33d25\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0031753962], [\\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0028500317], [\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.002557174], [\\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"8e054539e38c8a49888991a85b178399\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\", 0.0025465384]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\\n\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Personalizing the beauty product shopping experience**\\\\n\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\nbeauty product destination in Europe. However, they struggled with\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\n40% and increase net order income.\\\\n\\\\nGet the full story\\\\n\\\\n\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\nExperience Performance Index in 2007-09.\\\\n\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\n\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\n\\\\n\\\\n-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032405849, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"0473e2deba8639930389964be7b25bc7\\\"}, {\\\"page_content\\\": \\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\n\\\\n**5.** **Assessing Consumer Interest Data**\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\n\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\\\\n\\\\n**Conclusion**\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\nknow their customers. 
Customers are loyal to brands that recognize their needs\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\n\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\nexcellence, these same high expectations of online experiences also extend to\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\n\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\ncustomer experiences at every touch point.\\\\n\\\\n\\\\n# 76%\\\\n\\\\nof consumers are more\\\\nlikely to purchase due to\\\\npersonalization\\\\n\\\\n\\\\n# 76%\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Foundation for Personalization\\\\n\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\n\\\\n\\\\nTo create truly personalized interactions, you need actionable insights\\\\nabout your customers. Start by establishing a common customer profile and\\\\naccurately linking together customer records across disparate data sets.\\\\n\\\\nGet a 360-degree view of your target customer by bringing together:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0031753962, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"d53c2a5c69cef5febfa62ea961c33d25\\\"}, {\\\"page_content\\\": \\\"Customer\\\\n\\\\n\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. 
Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nDownstream\\\\nApplications\\\\n\\\\n\\\\nA three-part propensity scoring workflow.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Delivering Personalized Customer Journeys\\\\n\\\\nStrategies for crafting a real-time recommendation engine\\\\n\\\\n\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\n\\\\n\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\npowering their global loyalty program.\\\\n\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\na unified view of our customer across businesses.\\\\nWith these insights, we can optimize everything from\\\\nforecasting and supply chain, to powering our loyalty\\\\nprogram through personalized marketing campaigns,\\\\ncross-sell strategies and offers.\\u201d\\\\n\\\\n**D M I T R I Y D O V G A N**\\\\nHead of Data Science, Al-Futtaim Group\\\\n\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\n\\\\n\\\\n**Creating a unified view across 200+ brands**\\\\n\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\nimpacts the lives of millions of people across the region through the\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\nHardware and Marks & Spencer.\\\\n\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\nmeet analytical needs.\\\\n\\\\n\\\\n-----\\\\n\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\nlong-established players.\\\\n\\\\n**Focus on the customer journey**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0028500317, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\"}, {\\\"page_content\\\": \\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\n\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\n\\\\n\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\r)\\\\n\\\\n\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\n\\\\n\\\\n-----\\\\n\\\\n**Product distribution:**\\\\n**segmentation and personalization**\\\\n\\\\nThe most forward-thinking and data-driven insurers are\\\\nfocused on achieving personalization at scale. They are\\\\nexploring new partnerships and business models to create\\\\nintegrated, value-added experiences that prioritize the\\\\noverall health and financial wellness of their customers,\\\\nrather than just their insurance needs. 
These insurers\\\\nare investing in new data sources, analytics platforms,\\\\nand artificial intelligence (AI)-powered decision engines\\\\nthat enable them to connect producers with like-minded\\\\ncustomers or engage customers with enticing offers\\\\nand actionable steps based on their previous choices.\\\\nThe outcome is more efficient and effective service\\\\nfrom producers, trusted and convenient interactions for\\\\nconsumers, and increased customer engagement and\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\n\\\\n\\\\n**Customer/Partner Successes**\\\\n\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\n\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\n\\\\nWith Persona 360, you can:\\\\n\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\n1,695+ attributes and segments\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002557174, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\"}, \\\"id\\\": \\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\na wide range of data that enables personalization.\\\\n\\\\n\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\nincremental improvement over the prior solution.\\\\n\\\\n\\\\n**C A S E S T U DY**\\\\n\\\\n**Need help generating personalized**\\\\n**recommendations?**\\\\n\\\\n\\\\n**Connecting shoppers to savings with data-driven**\\\\n**personalization\\u200c**\\\\n\\\\n\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\ncontent related to a subset of products. Using these scores,\\\\nmarketers can determine which of the many messages at\\\\ntheir disposal should be presented to a specific customer.\\\\n\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\n\\\\n\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\ncustomer data sources once made getting insights difficult. 
Now with\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\nbrick-and-mortar retailers.\\\\n\\\\nGet the full story\\\\n\\\\nThe engines we use to serve content based on customer preferences are known\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\npreferences of similar customers helps define what recommendations will actually\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\nthe content itself (e.g., product descriptions).\\\\n\\\\n\\\\n-----\\\\n\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\n\\\\n\\\\nProviding deep, effective personalized experiences to customers depends\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\n\\\\nCreated on open source and open standards, Databricks offers a robust\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\n\\\\n\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\nstrategies across the value chain.\\\\n\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\nanalyses and streamline cross-functional data operations using a single,\\\\nsophisticated solution.\\\\n\\\\n\\\\n###### Learn more about Databricks Lakehouse for industries\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\n and more at databricks.com/solutions\\\\n\\\\n\\\\n-----\\\\n\\\\n### About Databricks\\\\n\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\n\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\n\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\n\\\\nis headquartered in San Francisco, with offices around the globe. Founded by\\\\n\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\n\\\\na mission to help data teams solve the world\\u2019s toughest problems. 
To learn more,\\\\n\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025465384, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\"}, \\\"id\\\": \\\"8e054539e38c8a49888991a85b178399\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0x63065083d9c8d37f\", \"trace_id\": \"0xdf0c2667e552cd5e0f11f152070de403\"}, \"parent_id\": \"0x5d93eb743b5405d0\", \"start_time\": 1734543854375613973, \"end_time\": 1734543857095139570, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-086a428d0c8e48f696b74292e6de14dd\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_c02e8f70-9c86-48ef-aeb6-ac39aa754c68\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543856, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 31, \\\"prompt_tokens\\\": 5748, \\\"total_tokens\\\": 5779, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What percentage of consumers, ac...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What percentage of consumers, according to a McKinsey study, are more likely to consider 
buying from a brand that personalizes the shopping and user experience?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"McKinsey study percentage of consumers more likely to consider buying from brand that personalizes shopping and user experience\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_71941816-33f4-4c6f-b9a4-2ac9845e5fcb\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Al-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\\n\\\\\\\\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\\\\\\\\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\\\\\\\\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\\\\\\\\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Personalizing the beauty product shopping experience**\\\\\\\\n\\\\\\\\nFlaconi wanted to leverage data and AI to become the No. 1 online\\\\\\\\nbeauty product destination in Europe. However, they struggled with\\\\\\\\nmassive volumes of streaming data and with infrastructure complexity\\\\\\\\nthat was resource-intensive and costly to scale. 
See how they used\\\\\\\\nDatabricks to increase time-to-market by 200x, reduce staff costs by\\\\\\\\n40% and increase net order income.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u00b9 Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester\\u2019s Customer\\\\\\\\nExperience Performance Index in 2007-09.\\\\\\\\n\\\\\\\\nSource: Forrester Customer Experience Performance Index (2007-09); press search\\\\\\\\n\\\\\\\\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\\\\\\\\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032405849, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"0473e2deba8639930389964be7b25bc7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**4.** **Streamlining Customer Analysis and Targeting**\\\\\\\\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**5.** **Assessing Consumer Interest Data**\\\\\\\\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\\\\\\\\n\\\\\\\\n**6.** **Delivering Personalized Customer Journeys**\\\\\\\\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... 
**14**\\\\\\\\n\\\\\\\\n**Conclusion**\\\\\\\\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nIn today\\u2019s experience-driven world, the most beloved brands are the ones that\\\\\\\\nknow their customers. Customers are loyal to brands that recognize their needs\\\\\\\\nand preferences \\u2014 and tailor user journeys and engagements accordingly.\\\\\\\\n\\\\\\\\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\\\\\\\\nbuying from a brand that personalizes the shopping and user experience to the\\\\\\\\nwants and needs of the customer. And as organizations pursue omnichannel\\\\\\\\nexcellence, these same high expectations of online experiences also extend to\\\\\\\\nbrick-and-mortar locations \\u2014 revealing for many merchants that personalized\\\\\\\\nengagement is fundamental to attracting customers and expanding share of wallet.\\\\\\\\n\\\\\\\\nBut achieving a 360-degree view of your customers to serve personalized\\\\\\\\nexperiences requires integrating various types of data \\u2014 including demographics,\\\\\\\\nbehavioral and transactional \\u2014 to develop robust profiles. This guide focuses on six\\\\\\\\nactionable strategic pillars for businesses to leverage automation, real-time data,\\\\\\\\nAI-driven analysis and well-tuned ML models to architect and deliver customized\\\\\\\\ncustomer experiences at every touch point.\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\nof consumers are more\\\\\\\\nlikely to purchase due to\\\\\\\\npersonalization\\\\\\\\n\\\\\\\\n\\\\\\\\n# 76%\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Foundation for Personalization\\\\\\\\n\\\\\\\\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\\\\\\\\n\\\\\\\\n\\\\\\\\nTo create truly personalized interactions, you need actionable insights\\\\\\\\nabout your customers. Start by establishing a common customer profile and\\\\\\\\naccurately linking together customer records across disparate data sets.\\\\\\\\n\\\\\\\\nGet a 360-degree view of your target customer by bringing together:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0031753962, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"d53c2a5c69cef5febfa62ea961c33d25\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Customer\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Propensity Scoring Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nDownstream\\\\\\\\nApplications\\\\\\\\n\\\\\\\\n\\\\\\\\nA three-part propensity scoring workflow.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Delivering Personalized Customer Journeys\\\\\\\\n\\\\\\\\nStrategies for crafting a real-time recommendation engine\\\\\\\\n\\\\\\\\n\\\\\\\\nAs the economy continues to weather unpredictable disruptions, shortages and\\\\\\\\ndemand, delivering personalized customer experiences at speed and scale will\\\\\\\\nrequire adaptability on the ground and within a company\\u2019s operational tech stack.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\\\\\\\\nstrategy and operations, allowing them to create a \\u201cgolden customer\\\\\\\\nrecord\\u201d that improves all decision-making from forecasting demand to\\\\\\\\npowering their global loyalty program.\\\\\\\\n\\\\\\\\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cDatabricks Lakehouse allows every division in our\\\\\\\\norganization \\u2014 from automotive to retail \\u2014 to gain\\\\\\\\na unified view of our customer across businesses.\\\\\\\\nWith these insights, we can optimize everything from\\\\\\\\nforecasting and supply chain, to powering our loyalty\\\\\\\\nprogram through personalized marketing campaigns,\\\\\\\\ncross-sell strategies and offers.\\u201d\\\\\\\\n\\\\\\\\n**D M I T R I Y D O V G A N**\\\\\\\\nHead of Data Science, Al-Futtaim Group\\\\\\\\n\\\\\\\\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\\\\\\\\nsafety and community, brands most attuned to changing needs and sentiments\\\\\\\\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\\\\\\\\nbusiness and many lost, organizations that had already begun the journey toward\\\\\\\\nimproved customer experience saw better outcomes, closely mirroring patterns\\\\\\\\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007\\u20132008 recession.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Creating a unified view across 200+ brands**\\\\\\\\n\\\\\\\\nAs a driving force for economic growth in the Middle East, Al-Futtaim\\\\\\\\nimpacts the lives of millions of people across the region through the\\\\\\\\ndistribution and operations of global brands like Toyota, IKEA, Ace\\\\\\\\nHardware and Marks & Spencer.\\\\\\\\n\\\\\\\\nAl-Futtaim\\u2019s focus is to harness their data to improve all areas of the\\\\\\\\nbusiness, from streamlining the supply chain to optimizing marketing\\\\\\\\nstrategies. 
But with the brands capturing such a wide variety of data,\\\\\\\\nAl-Futtaim\\u2019s legacy systems struggled to provide a single view into\\\\\\\\nthe customer due to data silos and the inability to scale efficiently to\\\\\\\\nmeet analytical needs.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe personalization of customer experiences will remain a key focus for B2C\\\\\\\\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\\\\\\\\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\\\\\\\\nlong-established players.\\\\\\\\n\\\\\\\\n**Focus on the customer journey**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0028500317, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8f4f8bec235a7c063f9b4a7b7ec6ef4b\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\\\\\\\\n\\\\\\\\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\\\\\\\\n\\\\\\\\n\\\\\\\\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\\\\\\\\r)\\\\\\\\n\\\\\\\\n\\\\\\\\n**Accenture Insurance Blog,** \\u201dDiscovery \\u2013 a holistic, ongoing innovation story\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Product distribution:**\\\\\\\\n**segmentation and personalization**\\\\\\\\n\\\\\\\\nThe most forward-thinking and data-driven insurers are\\\\\\\\nfocused on achieving personalization at scale. They are\\\\\\\\nexploring new partnerships and business models to create\\\\\\\\nintegrated, value-added experiences that prioritize the\\\\\\\\noverall health and financial wellness of their customers,\\\\\\\\nrather than just their insurance needs. 
These insurers\\\\\\\\nare investing in new data sources, analytics platforms,\\\\\\\\nand artificial intelligence (AI)-powered decision engines\\\\\\\\nthat enable them to connect producers with like-minded\\\\\\\\ncustomers or engage customers with enticing offers\\\\\\\\nand actionable steps based on their previous choices.\\\\\\\\nThe outcome is more efficient and effective service\\\\\\\\nfrom producers, trusted and convenient interactions for\\\\\\\\nconsumers, and increased customer engagement and\\\\\\\\ngrowth for insurers in an increasingly digital-oriented world.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Customer/Partner Successes**\\\\\\\\n\\\\\\\\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\\\\\\\\n\\\\\\\\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\\\\\\\\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\\\\\\\\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\\\\\\\\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\\\\\\\\nas call center recordings. By utilizing Persona 360, you can leverage pre-built banking and insurance customer\\\\\\\\n360\\u00b0 data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\\\\\\\\n\\\\\\\\nWith Persona 360, you can:\\\\\\\\n\\\\\\\\n**\\u2022** Access pre-built insurance-specific customer 360\\u00b0 data models and AI segmentation, consisting of\\\\\\\\n1,695+ attributes and segments\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002557174, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"2bc1a24e9f2f35f29d6f23452045b7f7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\nCareful consideration of how customers interact with various assets \\u2014 and how\\\\\\\\nthese interactions may be interpreted as expressions of preference \\u2014 can unlock\\\\\\\\na wide range of data that enables personalization.\\\\\\\\n\\\\\\\\n\\\\\\\\nThe complexity of these engines requires that they be deployed thoughtfully, using\\\\\\\\nlimited pilots and customer response assessments. And in those assessments,\\\\\\\\nit\\u2019s important to keep in mind that there is no expectation of perfection \\u2014 only\\\\\\\\nincremental improvement over the prior solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n**C A S E S T U DY**\\\\\\\\n\\\\\\\\n**Need help generating personalized**\\\\\\\\n**recommendations?**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Connecting shoppers to savings with data-driven**\\\\\\\\n**personalization\\u200c**\\\\\\\\n\\\\\\\\n\\\\\\\\nUse the **Recommendation Engines Accelerator** to estimate\\\\\\\\ncustomers\\u2019 potential receptiveness to an offer or to\\\\\\\\ncontent related to a subset of products. 
Using these scores,\\\\\\\\nmarketers can determine which of the many messages at\\\\\\\\ntheir disposal should be presented to a specific customer.\\\\\\\\n\\\\\\\\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\\\\\\\\n\\\\\\\\n\\\\\\\\nFlipp is an online marketplace that aggregates weekly shopping circulars,\\\\\\\\nso consumers get deals and discounts without clipping coupons. Siloed\\\\\\\\ncustomer data sources once made getting insights difficult. Now with\\\\\\\\nDatabricks, Flipp\\u2019s data teams can access and democratize data, helping\\\\\\\\nthem do their jobs more effectively while bringing better deals to users,\\\\\\\\nmore meaningful insights to partners, and a 10% jump in foot traffic to\\\\\\\\nbrick-and-mortar retailers.\\\\\\\\n\\\\\\\\nGet the full story\\\\\\\\n\\\\\\\\nThe engines we use to serve content based on customer preferences are known\\\\\\\\nas recommenders. With some recommenders, a heavy focus on the shared\\\\\\\\npreferences of similar customers helps define what recommendations will actually\\\\\\\\nmake an impact. With others, it can be more useful to focus on the properties of\\\\\\\\nthe content itself (e.g., product descriptions).\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\\\\\\\\n\\\\\\\\n\\\\\\\\nProviding deep, effective personalized experiences to customers depends\\\\\\\\non a brand\\u2019s ability to intelligently leverage consumer and market data from a\\\\\\\\nwide variety of sources to fuel faster, smarter decisions \\u2014 without sacrificing\\\\\\\\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\\\\\\\\nexactly that, offering a scalable data architecture that unifies all your data,\\\\\\\\nanalytics and AI to deliver unforgettable customer experiences.\\\\\\\\n\\\\\\\\nCreated on open source and open standards, Databricks offers a robust\\\\\\\\nand cost-effective platform for brands to collaborate with partners, clients,\\\\\\\\nmanufacturers and distributors to unleash more innovation and efficiencies\\\\\\\\nat every touch point. Businesses can rapidly ingest available data in real time,\\\\\\\\n\\\\\\\\n\\\\\\\\nat scale, and create accessible, data-driven insights that enable actionable\\\\\\\\nstrategies across the value chain.\\\\\\\\n\\\\\\\\nDatabricks is a multicloud platform, designed for quick enterprise development.\\\\\\\\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\\\\\\\\ntheir company\\u2019s operational health and the evolving needs of their customers\\\\\\\\n\\u2014 all while empowering teams to easily unify data efforts, perform fine-grained\\\\\\\\nanalyses and streamline cross-functional data operations using a single,\\\\\\\\nsophisticated solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Learn more about Databricks Lakehouse for industries\\\\\\\\n like Retail & Consumer Goods, Media & Entertainment\\\\\\\\n and more at databricks.com/solutions\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### About Databricks\\\\\\\\n\\\\\\\\nDatabricks is the data and AI company. More than 7,000 organizations worldwide \\u2014\\\\\\\\n\\\\\\\\nincluding Comcast, Cond\\u00e9 Nast, H&M and over 50% of the Fortune 500 \\u2014 rely on\\\\\\\\n\\\\\\\\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\\\\\\\\n\\\\\\\\nis headquartered in San Francisco, with offices around the globe. 
Founded by\\\\\\\\n\\\\\\\\nthe original creators of Apache Spark\\u2122, Delta Lake and MLflow, Databricks is on\\\\\\\\n\\\\\\\\na mission to help data teams solve the world\\u2019s toughest problems. To learn more,\\\\\\\\n\\\\\\\\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025465384, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8e054539e38c8a49888991a85b178399\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf", + "According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.", + "yes", + null, + "yes", + "No harmful content detected in response", + "yes", + "The expected response states that 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience. The response confirms this by stating that according to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience. The response is correct.", + 4.339, + 6988.0, + 6917.0, + 71.0, + null, + null, + [ + [ + "Al-Futtaim’s focus is to harness their data to improve all areas of the\nbusiness, from streamlining the supply chain to optimizing marketing\nstrategies. But with the brands capturing such a wide variety of data,\nAl-Futtaim’s legacy systems struggled to provide a single view into\nthe customer due to data silos and the inability to scale efficiently to\nmeet analytical needs.\n\n\n-----\n\nThe personalization of customer experiences will remain a key focus for B2C\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . Increasingly, market analysts are recognizing customer\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\nlong-established players.\n\n**Focus on the customer journey**\n\nPersonalization starts with a careful exploration of the [customer journey](https://hbr.org/2015/11/competing-on-customer-journeys) .\nThe [digitization of each stage](https://www.mckinsey.com/business-functions/mckinsey-digital/our-insights/the-drumbeat-of-digital-how-winning-teams-play) provides the customer with flexibility in terms of\nhow they will engage and provides the organization with the ability to [assess](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis)\n[the health of their model](https://www.bcg.com/en-us/publications/2020/three-personalization-imperatives-during-covid-crisis) .\n\n**C A S E S T U DY**\n\n**Personalizing the beauty product shopping experience**\n\nFlaconi wanted to leverage data and AI to become the No. 
1 online\nbeauty product destination in Europe. However, they struggled with\nmassive volumes of streaming data and with infrastructure complexity\nthat was resource-intensive and costly to scale. See how they used\nDatabricks to increase time-to-market by 200x, reduce staff costs by\n40% and increase net order income.\n\nGet the full story\n\n\n¹ Comparison of total returns to shareholders for publicly traded companies ranking in the top 10 or bottom 10 of Forrester’s Customer\nExperience Performance Index in 2007-09.\n\nSource: Forrester Customer Experience Performance Index (2007-09); press search\n\nCX leaders outperform laggards, even in a down market, in this visualization of the Forrester\nCustomer Experience Performance Index [as provided](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) by McKinsey & Company.\n\n\n-----\n\nCareful consideration of how customers interact with various assets — and how\nthese interactions may be interpreted as expressions of preference — can unlock\na wide range of data that enables personalization.\n\n\nThe complexity of these engines requires that they be deployed thoughtfully, using\nlimited pilots and customer response assessments. And in those assessments,\nit’s important to keep in mind that there is no expectation of perfection — only\nincremental improvement over the prior solution.\n\n\n**C A S E S T U DY**\n\n**Need help generating personalized**\n**recommendations?**\n\n\n**Connecting shoppers to savings with data-driven**\n**personalization‌**\n\n\nUse the **Recommendation Engines Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf" + ], + [ + "**4.** **Streamlining Customer Analysis and Targeting**\nCreating Efficiency and Accuracy With Behavioral Data .................................................................................................................. **14**\n\n**5.** **Assessing Consumer Interest Data**\nFine-Tuning ML Recommendations ............................................................................................................................................................ **18**\n\n**6.** **Delivering Personalized Customer Journeys**\nCrafting a Real-Time Recommendation Engine .................................................................................................................................... **14**\n\n**Conclusion**\nBuilding a Direct Path to Winning the Minds and Wallets of Your Customers ............................................................................. **23**\n\n\n-----\n\n### Introduction\n\nIn today’s experience-driven world, the most beloved brands are the ones that\nknow their customers. 
Customers are loyal to brands that recognize their needs\nand preferences — and tailor user journeys and engagements accordingly.\n\nA study from McKinsey shows [76% of consumers](https://www.mckinsey.com/business-functions/growth-marketing-and-sales/our-insights/the-value-of-getting-personalization-right-or-wrong-is-multiplying) are more likely to consider\nbuying from a brand that personalizes the shopping and user experience to the\nwants and needs of the customer. And as organizations pursue omnichannel\nexcellence, these same high expectations of online experiences also extend to\nbrick-and-mortar locations — revealing for many merchants that personalized\nengagement is fundamental to attracting customers and expanding share of wallet.\n\nBut achieving a 360-degree view of your customers to serve personalized\nexperiences requires integrating various types of data — including demographics,\nbehavioral and transactional — to develop robust profiles. This guide focuses on six\nactionable strategic pillars for businesses to leverage automation, real-time data,\nAI-driven analysis and well-tuned ML models to architect and deliver customized\ncustomer experiences at every touch point.\n\n\n# 76%\n\nof consumers are more\nlikely to purchase due to\npersonalization\n\n\n# 76%\n\n\n-----\n\n### Building a Foundation for Personalization\n\nGet a 360-degree view of the customer by leveraging ML-based entity resolution\n\n\nTo create truly personalized interactions, you need actionable insights\nabout your customers. Start by establishing a common customer profile and\naccurately linking together customer records across disparate data sets.\n\nGet a 360-degree view of your target customer by bringing together:", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf" + ], + [ + "Customer\n\n\nUse the **Propensity Scoring Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. 
Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nDownstream\nApplications\n\n\nA three-part propensity scoring workflow.\n\n\n-----\n\n### Delivering Personalized Customer Journeys\n\nStrategies for crafting a real-time recommendation engine\n\n\nAs the economy continues to weather unpredictable disruptions, shortages and\ndemand, delivering personalized customer experiences at speed and scale will\nrequire adaptability on the ground and within a company’s operational tech stack.\n\n\nWith the Databricks Lakehouse, Al-Futtaim has transformed their data\nstrategy and operations, allowing them to create a “golden customer\nrecord” that improves all decision-making from forecasting demand to\npowering their global loyalty program.\n\n[Get the full story](https://www.databricks.com/customers/al-futtaim)\n\n\n**C A S E S T U DY**\n\n\n“Databricks Lakehouse allows every division in our\norganization — from automotive to retail — to gain\na unified view of our customer across businesses.\nWith these insights, we can optimize everything from\nforecasting and supply chain, to powering our loyalty\nprogram through personalized marketing campaigns,\ncross-sell strategies and offers.”\n\n**D M I T R I Y D O V G A N**\nHead of Data Science, Al-Futtaim Group\n\nAs COVID-19 forced a [shift](https://www.mckinsey.com/business-functions/marketing-and-sales/our-insights/a-global-view-of-how-consumer-behavior-is-changing-amid-covid-19) in consumer focus toward value, availability, quality,\nsafety and community, brands most attuned to changing needs and sentiments\nsaw customers [switch](https://martechseries.com/sales-marketing/customer-experience-management/braze-survey-one-in-four-consumers-tried-new-brand-during-covid-19/) from [rivals](https://www.retailtouchpoints.com/resources/personalization-gains-new-relevance-as-covid-19-challenges-brand-loyalties) to their brand. While some segments gained\nbusiness and many lost, organizations that had already begun the journey toward\nimproved customer experience saw better outcomes, closely mirroring patterns\n[observed](https://www.mckinsey.com/~/media/McKinsey/Business%20Functions/Marketing%20and%20Sales/Our%20Insights/Adapting%20customer%20experience%20in%20the%20time%20of%20coronavirus/Adapting-customer-experience-in-the-time-of-coronavirus.ashx) in the 2007–2008 recession.\n\n\n**Creating a unified view across 200+ brands**\n\nAs a driving force for economic growth in the Middle East, Al-Futtaim\nimpacts the lives of millions of people across the region through the\ndistribution and operations of global brands like Toyota, IKEA, Ace\nHardware and Marks & Spencer.\n\nAl-Futtaim’s focus is to harness their data to improve all areas of the\nbusiness, from streamlining the supply chain to optimizing marketing\nstrategies. But with the brands capturing such a wide variety of data,\nAl-Futtaim’s legacy systems struggled to provide a single view into\nthe customer due to data silos and the inability to scale efficiently to\nmeet analytical needs.\n\n\n-----\n\nThe personalization of customer experiences will remain a key focus for B2C\nand [B2B organizations](https://hbr.org/2017/07/how-b2b-sellers-are-offering-personalization-at-scale) . 
Increasingly, market analysts are recognizing customer\nexperience as a [disruptive force](https://sloanreview.mit.edu/article/the-experience-disrupters/) enabling upstart organizations to upend\nlong-established players.\n\n**Focus on the customer journey**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf" + ], + [ + "**[U N D E R W R I T I N G A U T O M AT I O N](https://www.mongodb.com/blog/post/building-digital-data-pipelines-transforming-underwriting-usage-based-insurance-mongodb)**\n\n**[L I F E I N S U R A N C E U N D E R W R I T I N G W I T H](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n**[N AT U R A L L A N G U A G E P R O C E S S I N G](https://www.nlpsummit.org/automated-extraction-of-medical-risk-factors-for-life-insurance-underwriting/)**\n\n\n[Automated](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Extraction of](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Medical Risk Factors](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[for Life Insurance](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n[Underwriting](https://www.youtube.com/watch?v=UI4-7JkC2eE&list=PL5zieHHAlvAqSu9qmBWIFL_TRVYooTS_d\r)\n\n\n**Accenture Insurance Blog,** ”Discovery – a holistic, ongoing innovation story”\n\n\n-----\n\n**Product distribution:**\n**segmentation and personalization**\n\nThe most forward-thinking and data-driven insurers are\nfocused on achieving personalization at scale. They are\nexploring new partnerships and business models to create\nintegrated, value-added experiences that prioritize the\noverall health and financial wellness of their customers,\nrather than just their insurance needs. These insurers\nare investing in new data sources, analytics platforms,\nand artificial intelligence (AI)-powered decision engines\nthat enable them to connect producers with like-minded\ncustomers or engage customers with enticing offers\nand actionable steps based on their previous choices.\nThe outcome is more efficient and effective service\nfrom producers, trusted and convenient interactions for\nconsumers, and increased customer engagement and\ngrowth for insurers in an increasingly digital-oriented world.\n\n\n**Customer/Partner Successes**\n\n**[Persona 360: Financial Customer Data Platform (](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360)** [DataSentics](https://www.databricks.com/company/partners/consulting-and-si/partner-solutions/datasentics-persona360) **)**\n\n[Persona 360](https://datasentics.com/product-persona360-for-data-scientists) developed by DataSentics, an Atos company, is specifically designed for retail banks and\ninsurance companies. It enables them to complete, unify and comprehensively capture customer profiles\nusing a smart data model. Built on the Databricks Lakehouse Platform and available on multiple clouds, Persona\n360 enhances basic profile information with insights derived from digital behavior and unstructured data, such\nas call center recordings. 
By utilizing Persona 360, you can leverage pre-built banking and insurance customer\n360° data models and access over 1500+ attributes to gain a deeper understanding of customer segments.\n\nWith Persona 360, you can:\n\n**•** Access pre-built insurance-specific customer 360° data models and AI segmentation, consisting of\n1,695+ attributes and segments", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf" + ], + [ + "-----\n\nCareful consideration of how customers interact with various assets — and how\nthese interactions may be interpreted as expressions of preference — can unlock\na wide range of data that enables personalization.\n\n\nThe complexity of these engines requires that they be deployed thoughtfully, using\nlimited pilots and customer response assessments. And in those assessments,\nit’s important to keep in mind that there is no expectation of perfection — only\nincremental improvement over the prior solution.\n\n\n**C A S E S T U DY**\n\n**Need help generating personalized**\n**recommendations?**\n\n\n**Connecting shoppers to savings with data-driven**\n**personalization‌**\n\n\nUse the **Recommendation Engines Accelerator** to estimate\ncustomers’ potential receptiveness to an offer or to\ncontent related to a subset of products. Using these scores,\nmarketers can determine which of the many messages at\ntheir disposal should be presented to a specific customer.\n\n**[GET THE ACCELERATOR](https://www.databricks.com/solutions/accelerators/propensity-scoring)**\n\n\nFlipp is an online marketplace that aggregates weekly shopping circulars,\nso consumers get deals and discounts without clipping coupons. Siloed\ncustomer data sources once made getting insights difficult. Now with\nDatabricks, Flipp’s data teams can access and democratize data, helping\nthem do their jobs more effectively while bringing better deals to users,\nmore meaningful insights to partners, and a 10% jump in foot traffic to\nbrick-and-mortar retailers.\n\nGet the full story\n\nThe engines we use to serve content based on customer preferences are known\nas recommenders. With some recommenders, a heavy focus on the shared\npreferences of similar customers helps define what recommendations will actually\nmake an impact. With others, it can be more useful to focus on the properties of\nthe content itself (e.g., product descriptions).\n\n\n-----\n\n### Building a Direct Path to Winning the Minds and Wallets of Your Customers\n\n\nProviding deep, effective personalized experiences to customers depends\non a brand’s ability to intelligently leverage consumer and market data from a\nwide variety of sources to fuel faster, smarter decisions — without sacrificing\naccuracy for speed. The Databricks Lakehouse Platform is purpose-built for\nexactly that, offering a scalable data architecture that unifies all your data,\nanalytics and AI to deliver unforgettable customer experiences.\n\nCreated on open source and open standards, Databricks offers a robust\nand cost-effective platform for brands to collaborate with partners, clients,\nmanufacturers and distributors to unleash more innovation and efficiencies\nat every touch point. 
Businesses can rapidly ingest available data in real time,\n\n\nat scale, and create accessible, data-driven insights that enable actionable\nstrategies across the value chain.\n\nDatabricks is a multicloud platform, designed for quick enterprise development.\nTeams using the Lakehouse can more effectively reveal the 360-degree view into\ntheir company’s operational health and the evolving needs of their customers\n— all while empowering teams to easily unify data efforts, perform fine-grained\nanalyses and streamline cross-functional data operations using a single,\nsophisticated solution.\n\n\n###### Learn more about Databricks Lakehouse for industries\n like Retail & Consumer Goods, Media & Entertainment\n and more at databricks.com/solutions\n\n\n-----\n\n### About Databricks\n\nDatabricks is the data and AI company. More than 7,000 organizations worldwide —\n\nincluding Comcast, Condé Nast, H&M and over 50% of the Fortune 500 — rely on\n\nthe Databricks Lakehouse Platform to unify their data, analytics and AI. Databricks\n\nis headquartered in San Francisco, with offices around the globe. Founded by\n\nthe original creators of Apache Spark™, Delta Lake and MLflow, Databricks is on\n\na mission to help data teams solve the world’s toughest problems. To learn more,\n\nfollow Databricks on [Twitter](https://twitter.com/databricks?ref_src=twsrc%5Egoogle%7Ctwcamp%5Eserp%7Ctwgr%5Eauthor) , [LinkedIn](https://www.linkedin.com/company/databricks) and [Facebook](https://www.facebook.com/databricksinc/) .", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks-Customer-360-ebook-Final.pdf" + ] + ], + "yes", + "The response states that 'According to a McKinsey study, 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.' The retrieved context includes a statement: 'A study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer.' This directly supports the percentage and the context provided in the response.", + "yes", + "The ground truth states that '76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience.' The retrieved context includes a statement that 'A study from McKinsey shows 76% of consumers are more likely to consider buying from a brand that personalizes the shopping and user experience to the wants and needs of the customer.' This directly supports the percentage and context provided in the ground truth.", + [ + "yes", + "yes", + "yes", + "no", + "yes" + ], + 1.0 + ], + [ + "7bf67f12c5d95da350ca553480cfdc9af32b7ccbf14f70f9ce8f4706b04e96e0", + [ + [ + [ + "What is the issue with the provided document?", + "user" + ] + ] + ], + [ + [ + "Unfortunately, the document does not contain any readable content.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf" + ] + ], + [ + "The document lacks readable content." 
+ ], + "{\"info\": {\"request_id\": \"tr-f5bada305fde446d9fce05dc746722ef\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543859627, \"execution_time_ms\": 558, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is the issue with the provi...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"tags\": {\"eval.requestId\": \"60430f87-041c-4cd0-8244-87307a3350aa\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-f5bada305fde446d9fce05dc746722ef/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x13f72ef9de412e01\", \"trace_id\": \"0x665f0fdb4522d3705da235e0bc99ae75\"}, \"parent_id\": null, \"start_time\": 1734543859627945601, \"end_time\": 1734543860186157374, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-f5bada305fde446d9fce05dc746722ef\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is the issue with the provi...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x3cc88e82de54ad87\", \"trace_id\": \"0x665f0fdb4522d3705da235e0bc99ae75\"}, \"parent_id\": \"0x13f72ef9de412e01\", \"start_time\": 1734543859641507066, \"end_time\": 1734543859648696853, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-f5bada305fde446d9fce05dc746722ef\\\"\", \"mlflow.spanType\": 
\"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is the issue with the provi...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x1b6d8e643e11184a\", \"trace_id\": \"0x665f0fdb4522d3705da235e0bc99ae75\"}, \"parent_id\": \"0x13f72ef9de412e01\", \"start_time\": 1734543859648828655, \"end_time\": 1734543859649388261, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-f5bada305fde446d9fce05dc746722ef\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What is the issue with the provided document?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xaee60d65c83616d5\", \"trace_id\": \"0x665f0fdb4522d3705da235e0bc99ae75\"}, \"parent_id\": \"0x1b6d8e643e11184a\", \"start_time\": 1734543859648982956, \"end_time\": 1734543859649140458, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-f5bada305fde446d9fce05dc746722ef\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What is the issue with the provided document?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xb61d4c85093c18ab\", \"trace_id\": \"0x665f0fdb4522d3705da235e0bc99ae75\"}, \"parent_id\": \"0x1b6d8e643e11184a\", \"start_time\": 1734543859649205959, \"end_time\": 1734543859649338561, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-f5bada305fde446d9fce05dc746722ef\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x7160308a59361380\", \"trace_id\": \"0x665f0fdb4522d3705da235e0bc99ae75\"}, \"parent_id\": \"0x13f72ef9de412e01\", \"start_time\": 1734543859649453762, \"end_time\": 1734543860186084573, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-f5bada305fde446d9fce05dc746722ef\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What is the issue with the provided document?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What is the issue 
with the provided document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions\", \"context\": {\"span_id\": \"0xcfe0ed6326d30a3b\", \"trace_id\": \"0x665f0fdb4522d3705da235e0bc99ae75\"}, \"parent_id\": \"0x7160308a59361380\", \"start_time\": 1734543859725257682, \"end_time\": 1734543860179046488, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-f5bada305fde446d9fce05dc746722ef\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_fe113d45-dde2-4d41-9ccb-de4e08755632\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543859, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 13, \\\"prompt_tokens\\\": 1148, \\\"total_tokens\\\": 1161, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What is the issue with the provi...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What is the issue with the provided document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"content\\\": \\\"I'm sorry, I can't help you with that.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/Databricks_eBook_FinServ_Personalization-FINAL-092622_image.pdf", + "I'm sorry, I can't help you with that.", + "no", + "[correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. 
**Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.", + "yes", + "No harmful content detected in response", + "no", + "The expected response states that the document lacks readable content. The response contains a single sentence: 'What is the issue with the provided document? - I'm sorry, I can't help you with that.' This sentence is readable and understandable. Therefore, the response does contain readable content. The response is not correct.", + 0.558, + 1161.0, + 1148.0, + 13.0, + null, + null, + null, + null, + null, + null, + null, + null, + null + ], + [ + "66725804819c75f5e3005072cb81414f01272d64b1b0a8ea89a58392599b1ff7", + [ + [ + [ + "What are the key features and advantages of the lakehouse pattern?", + "user" + ] + ] + ], + [ + [ + "“The lakehouse pattern represents a paradigm shift from traditional on-premises data warehouse systems that are expensive and complex to manage. It uses an open data management architecture that combines the flexibility, cost-efficiency, and scale of data lakes with the data management and ACID semantics of data warehouses. A lakehouse pattern enables data transformation, cleansing, and validation to support both business intelligence and machine learning (ML) users on all data. Lakehouse is cloud-centric and unifies a complete up-to-date data set for teams, allowing collaboration across an organization.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf" + ] + ], + [ + "The lakehouse pattern has an open data management architecture.", + "It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics.", + "It supports data transformation, cleansing, and validation.", + "The lakehouse pattern is cloud-centric.", + "It enhances support for both business intelligence and machine learning.", + "It is cost-efficient.", + "It offers an up-to-date unified data set.", + "It improves collaboration across the organization." + ], + "{\"info\": {\"request_id\": \"tr-7807384ebe5b4a52a2e92e5977280592\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543852725, \"execution_time_ms\": 9673, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the key features and ad...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. 
Schema enforcement and governance: The lakehouse supports sche...\"}, \"tags\": {\"eval.requestId\": \"a86697d5-8497-4e80-9dbd-1abc4d040fbb\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-7807384ebe5b4a52a2e92e5977280592/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0xc190626ad1788551\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": null, \"start_time\": 1734543852725148426, \"end_time\": 1734543862398726898, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the key features and ad...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. 
Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. 
Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. 
Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. 
However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. 
Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. 
Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. 
Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open 
platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. 
Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. 
End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xc9b3432e44423089\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xc190626ad1788551\", \"start_time\": 1734543852773692542, \"end_time\": 1734543852780402027, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the key features and ad...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xc7a9924c42a7d71b\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xc190626ad1788551\", \"start_time\": 1734543852780529728, \"end_time\": 1734543852781224337, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x69289b1c1989b445\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xc7a9924c42a7d71b\", \"start_time\": 1734543852780800332, \"end_time\": 1734543852780978534, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the key features and advantages of the 
lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are the key features and advantages of the lakehouse pattern?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0x7fbc783d45a1d2a8\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xc7a9924c42a7d71b\", \"start_time\": 1734543852781051335, \"end_time\": 1734543852781191437, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x6867fc1fa868809f\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xc190626ad1788551\", \"start_time\": 1734543852781290838, \"end_time\": 1734543862398144491, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. 
Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. 
Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. 
Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. 
However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. 
Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. 
Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. 
Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open 
platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. 
Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. 
End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0xfabbc93bc56a67d6\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0x6867fc1fa868809f\", \"start_time\": 1734543852924992260, \"end_time\": 1734543853767974648, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. 
If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_00015bc1-e6a7-4f23-82d0-45824a19b7f8\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543853, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 25, \\\"prompt_tokens\\\": 1152, 
\\\"total_tokens\\\": 1177, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0xd9a8b578a01925e8\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0x6867fc1fa868809f\", \"start_time\": 1734543853776355854, \"end_time\": 1734543854279383506, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"lakehouse pattern features and advantages\\\", \\\"filters\\\": []}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\n\\\\nKey Use Cases for Insurance:\\\\n\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\n\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\n\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\n\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.00323427, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\"}, \\\"id\\\": \\\"5014f5f2c09c55edb470c8b5528eb000\\\"}, {\\\"page_content\\\": \\\"In short, a lakehouse is a data architecture that combines the best elements\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\ndesign, which implements similar data structures and data management features\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\n\\\\n\\\\n-----\\\\n\\\\n##### Data lakehouse\\\\n\\\\nOne platform to unify all your data, analytics and AI workloads\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\nAll machine learning, SQL,\\\\nBI, and streaming use cases\\\\n\\\\nOne security and governance\\\\napproach for all data assets\\\\non all clouds\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key features for a lakehouse**\\\\n\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\narchitectures:\\\\n\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\noften be reading and writing data concurrently. 
Support for ACID (Atomicity,\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\nmultiple parties concurrently read or write data.\\\\n\\\\nSchema enforcement and governance: The lakehouse should have\\\\na way to support schema enforcement and evolution, supporting data\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\nbe able to reason about data integrity, and it should have robust governance\\\\nand auditing mechanisms.\\\\n\\\\nData governance: Capabilities including auditing, retention and lineage\\\\nhave become essential, particularly considering recent privacy regulations.\\\\n\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\nand data usage metrics.\\\\n\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\nby not having to operationalize two copies of the data in both a data lake\\\\nand a warehouse.\\\\n\\\\n\\\\nStorage decoupled from compute: In practice, this means storage and\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\nhave this property.\\\\n\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\nand Python/R libraries, can efficiently access the data directly.\\\\n\\\\nSupport for diverse data types (unstructured and structured):\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\nneeded for many new data applications, including images, video, audio,\\\\nsemi-structured data and text.\\\\n\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\nof workloads including data science, machine learning and SQL analytics.\\\\nMultiple tools might be needed to support all these workloads.\\\\n\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\n**Learn more**\\\\n\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\n\\\\n\\\\n-----\\\\n\\\\n**CHAPTER**\\\\n\\\\n# 02\\\\n\\\\n\\\\n### The Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Lakehouse: A new generation of open platforms\\\\n\\\\n\\\\n###### This is the lakehouse paradigm\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029213156, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"9cabb87127bfa514fa6f498e9f2831e7\\\"}, {\\\"page_content\\\": \\\"versioning, governance, security and ACID properties that are needed even for\\\\n\\\\nunstructured data.\\\\n\\\\n\\\\nstored procedures are available, but users may 
need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. 
Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027414565, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\"}, \\\"id\\\": \\\"b1f28e2afb30602c0205684eb65002df\\\"}, {\\\"page_content\\\": \\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. 
Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002695809, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"8375eac494bff392a37d6dff7c40c1b1\\\"}, {\\\"page_content\\\": \\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. 
Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025942351, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"accf6ad13717062292245537ffbd0249\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x395812f9e90cd8fa\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xd9a8b578a01925e8\", \"start_time\": 1734543853777614470, \"end_time\": 1734543854277947688, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"lakehouse pattern features and advantages\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\n\\\\nKey Use Cases for Insurance:\\\\n\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\n\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\n\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\n\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\", \\\"5014f5f2c09c55edb470c8b5528eb000\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.00323427], [\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\nof data warehouses and data lakes. 
Lakehouses are enabled by a new system\\\\ndesign, which implements similar data structures and data management features\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\n\\\\n\\\\n-----\\\\n\\\\n##### Data lakehouse\\\\n\\\\nOne platform to unify all your data, analytics and AI workloads\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\nAll machine learning, SQL,\\\\nBI, and streaming use cases\\\\n\\\\nOne security and governance\\\\napproach for all data assets\\\\non all clouds\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key features for a lakehouse**\\\\n\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\narchitectures:\\\\n\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\nmultiple parties concurrently read or write data.\\\\n\\\\nSchema enforcement and governance: The lakehouse should have\\\\na way to support schema enforcement and evolution, supporting data\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\nbe able to reason about data integrity, and it should have robust governance\\\\nand auditing mechanisms.\\\\n\\\\nData governance: Capabilities including auditing, retention and lineage\\\\nhave become essential, particularly considering recent privacy regulations.\\\\n\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\nand data usage metrics.\\\\n\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\nby not having to operationalize two copies of the data in both a data lake\\\\nand a warehouse.\\\\n\\\\n\\\\nStorage decoupled from compute: In practice, this means storage and\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\nhave this property.\\\\n\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\nand Python/R libraries, can efficiently access the data directly.\\\\n\\\\nSupport for diverse data types (unstructured and structured):\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\nneeded for many new data applications, including images, video, audio,\\\\nsemi-structured data and text.\\\\n\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\nof workloads including data science, machine learning and SQL analytics.\\\\nMultiple tools might be needed to support all these workloads.\\\\n\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\n**Learn more**\\\\n\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\n\\\\n\\\\n-----\\\\n\\\\n**CHAPTER**\\\\n\\\\n# 02\\\\n\\\\n\\\\n### The Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Lakehouse: A new generation of open platforms\\\\n\\\\n\\\\n###### This is the lakehouse paradigm\\\", \\\"9cabb87127bfa514fa6f498e9f2831e7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029213156], [\\\"versioning, governance, security and ACID properties that are needed even for\\\\n\\\\nunstructured data.\\\\n\\\\n\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. 
These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"b1f28e2afb30602c0205684eb65002df\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\", 0.0027414565], [\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. 
These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"8375eac494bff392a37d6dff7c40c1b1\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.002695809], [\\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. 
This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"accf6ad13717062292245537ffbd0249\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0025942351]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0x98dd020417f25695\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0xd9a8b578a01925e8\", \"start_time\": 1734543854278140591, \"end_time\": 1734543854279068702, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ 
**10**\\\\n\\\\nKey Use Cases for Insurance:\\\\n\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\n\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\n\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\n\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\", \\\"5014f5f2c09c55edb470c8b5528eb000\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\", 0.00323427], [\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\ndesign, which implements similar data structures and data management features\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\n\\\\n\\\\n-----\\\\n\\\\n##### Data lakehouse\\\\n\\\\nOne platform to unify all your data, analytics and AI workloads\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\nAll machine learning, SQL,\\\\nBI, and streaming use cases\\\\n\\\\nOne security and governance\\\\napproach for all data assets\\\\non all clouds\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key features for a lakehouse**\\\\n\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\narchitectures:\\\\n\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\nmultiple parties concurrently read or write data.\\\\n\\\\nSchema enforcement and governance: The lakehouse should have\\\\na way to support schema enforcement and evolution, supporting data\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\nbe able to reason about data integrity, and it should have robust governance\\\\nand auditing mechanisms.\\\\n\\\\nData governance: Capabilities including auditing, retention and lineage\\\\nhave become essential, particularly considering recent privacy regulations.\\\\n\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\nand data usage metrics.\\\\n\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\nby not having to operationalize two copies of the data in both a data lake\\\\nand a warehouse.\\\\n\\\\n\\\\nStorage decoupled from compute: In practice, this means storage and\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\nhave this property.\\\\n\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\nand Python/R libraries, can efficiently access the data directly.\\\\n\\\\nSupport for diverse data types (unstructured and structured):\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\nneeded for many new data applications, including images, video, audio,\\\\nsemi-structured data and text.\\\\n\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\nof workloads including data science, machine learning and SQL analytics.\\\\nMultiple tools might be needed to support all these workloads.\\\\n\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\n**Learn more**\\\\n\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\n\\\\n\\\\n-----\\\\n\\\\n**CHAPTER**\\\\n\\\\n# 02\\\\n\\\\n\\\\n### The Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Lakehouse: A new generation of open platforms\\\\n\\\\n\\\\n###### This is the lakehouse paradigm\\\", \\\"9cabb87127bfa514fa6f498e9f2831e7\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029213156], [\\\"versioning, governance, security and ACID properties that are needed even for\\\\n\\\\nunstructured data.\\\\n\\\\n\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. 
These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"b1f28e2afb30602c0205684eb65002df\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\", 0.0027414565], [\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. 
These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"8375eac494bff392a37d6dff7c40c1b1\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.002695809], [\\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. 
This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"accf6ad13717062292245537ffbd0249\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\", 0.0025942351]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\n\\\\nKey Use Cases for Insurance:\\\\n\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\n\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\n\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... 
**16**\\\\n\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.00323427, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\"}, \\\"id\\\": \\\"5014f5f2c09c55edb470c8b5528eb000\\\"}, {\\\"page_content\\\": \\\"In short, a lakehouse is a data architecture that combines the best elements\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\ndesign, which implements similar data structures and data management features\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\n\\\\n\\\\n-----\\\\n\\\\n##### Data lakehouse\\\\n\\\\nOne platform to unify all your data, analytics and AI workloads\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\nAll machine learning, SQL,\\\\nBI, and streaming use cases\\\\n\\\\nOne security and governance\\\\napproach for all data assets\\\\non all clouds\\\\n\\\\n\\\\n-----\\\\n\\\\n**Key features for a lakehouse**\\\\n\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\narchitectures:\\\\n\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\nmultiple parties concurrently read or write data.\\\\n\\\\nSchema enforcement and governance: The lakehouse should have\\\\na way to support schema enforcement and evolution, supporting data\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\nbe able to reason about data integrity, and it should have robust governance\\\\nand auditing mechanisms.\\\\n\\\\nData governance: Capabilities including auditing, retention and lineage\\\\nhave become essential, particularly considering recent privacy regulations.\\\\n\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\nand data usage metrics.\\\\n\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\nby not having to operationalize two copies of the data in both a data lake\\\\nand a warehouse.\\\\n\\\\n\\\\nStorage decoupled from compute: In practice, this means storage and\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\nhave this property.\\\\n\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\nand Python/R libraries, can efficiently access the data directly.\\\\n\\\\nSupport for diverse data types (unstructured and structured):\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\nneeded for many new data applications, including images, video, audio,\\\\nsemi-structured data and text.\\\\n\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\nof workloads including data science, machine learning and SQL analytics.\\\\nMultiple tools might be needed to support all these workloads.\\\\n\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\n**Learn more**\\\\n\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\n\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\n\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\n\\\\n\\\\n-----\\\\n\\\\n**CHAPTER**\\\\n\\\\n# 02\\\\n\\\\n\\\\n### The Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Lakehouse: A new generation of open platforms\\\\n\\\\n\\\\n###### This is the lakehouse paradigm\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029213156, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"9cabb87127bfa514fa6f498e9f2831e7\\\"}, {\\\"page_content\\\": \\\"versioning, governance, security and ACID properties that are needed even for\\\\n\\\\nunstructured data.\\\\n\\\\n\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\n\\\\n\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\n\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\n\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\n\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\n\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\n\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\n\\\\nexploration and refinement are standard for many analytic and data science\\\\n\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\n\\\\n\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\n\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\n\\\\nworld deployments behind them. 
Users may favor certain tools (BI tools, IDEs,\\\\n\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\n\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\n\\\\nand other issues will be addressed as the technology continues to mature and\\\\n\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\n\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\n\\\\ndiverse data applications.\\\\n\\\\n\\\\ndata in their lakehouse until it is ready for consumption.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the Inner Workings**\\\\n**of the Lakehouse and Delta Lake**\\\\n\\\\n### CHAPTER 02\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n# 02\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\n\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\n\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\n\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\n\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\n\\\\npaper that describes some of the core technological challenges and solutions that\\\\n\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\n\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\n\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\n\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\n\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\n\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\n\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\n\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\n\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\n\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\n\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\n\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027414565, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\"}, \\\"id\\\": \\\"b1f28e2afb30602c0205684eb65002df\\\"}, {\\\"page_content\\\": \\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. 
Users may favor certain tools (BI tools, IDEs,\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\nand other issues will be addressed as the technology continues to mature and\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\ndiverse data applications.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Diving Deep Into the**\\\\n**Inner Workings of the**\\\\n**Lakehouse and Delta Lake**\\\\n\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\npaper that describes some of the core technological challenges and solutions that\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\n\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\n\\\\n\\\\n-----\\\\n\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\nand limited consistency guarantees.\\\\n\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\n\\\\n**1. Data lakes**\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\napproach because the table is just a group of objects that can be accessed from\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\naudit logs are unavailable.\\\\n\\\\n**2. 
Custom storage engines**\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\ndata because these systems are generally optimized for traditional structured\\\\n\\\\n\\\\n-----\\\\n\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\nadopt a new approach later.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.002695809, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"8375eac494bff392a37d6dff7c40c1b1\\\"}, {\\\"page_content\\\": \\\"- **\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\n\\\\n\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\nwarehouses.\\\\n\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\noptimized for.\\\\n\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\nor copy data between different systems.\\\\n\\\\n\\\\n-----\\\\n\\\\n**\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\n\\\\n- **\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. 
This\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\n\\\\n- **\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\nthis property.\\\\n\\\\n- **\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\n\\\\n- **\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\ndata, and text.\\\\n\\\\n- **\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\nrely on the same data repository.\\\\n\\\\n- **\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\nserving real-time data applications.\\\\n\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Read the research**\\\\n**Delta Lake: High-Performance ACID**\\\\n**Table Storage Over Cloud Object Stores**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025942351, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\"}, \\\"id\\\": \\\"accf6ad13717062292245537ffbd0249\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0x7c29470ff7bb88d3\", \"trace_id\": \"0xf271c77c91131affa1084b614802e619\"}, \"parent_id\": \"0x6867fc1fa868809f\", \"start_time\": 1734543854289631330, \"end_time\": 1734543862392847528, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-7807384ebe5b4a52a2e92e5977280592\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. 
Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open 
platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. 
Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", 
\\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_2a7072d9-bfdf-4b0e-bedc-665497200461\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543855, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 407, \\\"prompt_tokens\\\": 5406, \\\"total_tokens\\\": 5813, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the key features and ad...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. 
Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the key features and advantages of the lakehouse pattern?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"lakehouse pattern features and advantages\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... 
**15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. 
Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. 
However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. 
Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. 
Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. 
With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_ea4bbe37-71b3-459d-ad21-239c8d95968f\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\\\\\\\\n\\\\\\\\nKey Use Cases for Insurance:\\\\\\\\n\\\\\\\\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\\\\\\\\n\\\\\\\\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\\\\\\\\n\\\\\\\\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... **16**\\\\\\\\n\\\\\\\\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.00323427, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"5014f5f2c09c55edb470c8b5528eb000\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"In short, a lakehouse is a data architecture that combines the best elements\\\\\\\\nof data warehouses and data lakes. Lakehouses are enabled by a new system\\\\\\\\ndesign, which implements similar data structures and data management features\\\\\\\\nfound in a data warehouse directly on the low-cost storage used for data lakes.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n##### Data lakehouse\\\\\\\\n\\\\\\\\nOne platform to unify all your data, analytics and AI workloads\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\nAll machine learning, SQL,\\\\\\\\nBI, and streaming use cases\\\\\\\\n\\\\\\\\nOne security and governance\\\\\\\\napproach for all data assets\\\\\\\\non all clouds\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Key features for a lakehouse**\\\\\\\\n\\\\\\\\nRecent innovations with the data lakehouse architecture can help simplify\\\\\\\\nyour data and AI workloads, ease collaboration for data teams, and maintain\\\\\\\\nthe kind of flexibility and openness that allows your organization to stay agile\\\\\\\\nas you scale. 
Here are key features to consider when evaluating data lakehouse\\\\\\\\narchitectures:\\\\\\\\n\\\\\\\\nTransaction support: In an enterprise lakehouse, many data pipelines will\\\\\\\\noften be reading and writing data concurrently. Support for ACID (Atomicity,\\\\\\\\nConsistency, Isolation and Durability) transactions ensures consistency as\\\\\\\\nmultiple parties concurrently read or write data.\\\\\\\\n\\\\\\\\nSchema enforcement and governance: The lakehouse should have\\\\\\\\na way to support schema enforcement and evolution, supporting data\\\\\\\\nwarehouse schema paradigms such as star/snowflake. The system should\\\\\\\\nbe able to reason about data integrity, and it should have robust governance\\\\\\\\nand auditing mechanisms.\\\\\\\\n\\\\\\\\nData governance: Capabilities including auditing, retention and lineage\\\\\\\\nhave become essential, particularly considering recent privacy regulations.\\\\\\\\n\\\\\\\\nTools that allow data discovery have become popular, such as data catalogs\\\\\\\\nand data usage metrics.\\\\\\\\n\\\\\\\\nBI support: Lakehouses allow the use of BI tools directly on the source\\\\\\\\ndata. This reduces staleness and latency, improves recency and lowers cost\\\\\\\\nby not having to operationalize two copies of the data in both a data lake\\\\\\\\nand a warehouse.\\\\\\\\n\\\\\\\\n\\\\\\\\nStorage decoupled from compute: In practice, this means storage and\\\\\\\\ncompute use separate clusters, thus these systems can scale to many more\\\\\\\\nconcurrent users and larger data sizes. Some modern data warehouses also\\\\\\\\nhave this property.\\\\\\\\n\\\\\\\\nOpenness: The storage formats, such as Apache Parquet, are open and\\\\\\\\nstandardized, so a variety of tools and engines, including machine learning\\\\\\\\nand Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\nSupport for diverse data types (unstructured and structured):\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types\\\\\\\\nneeded for many new data applications, including images, video, audio,\\\\\\\\nsemi-structured data and text.\\\\\\\\n\\\\\\\\nSupport for diverse workloads: Use the same data repository for a range\\\\\\\\nof workloads including data science, machine learning and SQL analytics.\\\\\\\\nMultiple tools might be needed to support all these workloads.\\\\\\\\n\\\\\\\\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\n**Learn more**\\\\\\\\n\\\\\\\\n**\\u2022** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\\\\\\\\n\\\\\\\\n**\\u2022** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\\\\\\\\n\\\\\\\\n**\\u2022** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**CHAPTER**\\\\\\\\n\\\\\\\\n# 02\\\\\\\\n\\\\\\\\n\\\\\\\\n### The Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Lakehouse: A new generation of open 
platforms\\\\\\\\n\\\\\\\\n\\\\\\\\n###### This is the lakehouse paradigm\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029213156, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9cabb87127bfa514fa6f498e9f2831e7\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"versioning, governance, security and ACID properties that are needed even for\\\\\\\\n\\\\\\\\nunstructured data.\\\\\\\\n\\\\\\\\n\\\\\\\\nstored procedures are available, but users may need to employ other mechanisms that\\\\\\\\n\\\\\\\\n\\\\\\\\naren\\u2019t equivalent to those found in traditional data warehouses. The latter is particularly\\\\\\\\n\\\\\\\\nimportant for \\u201clift and shift scenarios,\\u201d which require systems that achieve semantics\\\\\\\\n\\\\\\\\nthat are almost identical to those of older, commercial data warehouses.\\\\\\\\n\\\\\\\\nWhat about support for other types of data applications? Users of a lakehouse have\\\\\\\\n\\\\\\\\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\\\\\\\\n\\\\\\\\nlibraries) for non-BI workloads like data science and machine learning. Data\\\\\\\\n\\\\\\\\nexploration and refinement are standard for many analytic and data science\\\\\\\\n\\\\\\\\napplications. Delta Lake is designed to let users incrementally improve the quality of\\\\\\\\n\\\\\\\\n\\\\\\\\nCurrent lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\n\\\\\\\\nsystems (such as data warehouses) that have years of investments and real-\\\\\\\\n\\\\\\\\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\n\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\n\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\n\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\n\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\n\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\n\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\ndata in their lakehouse until it is ready for consumption.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the Inner Workings**\\\\\\\\n**of the Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\n### CHAPTER 02\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n# 02\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\n\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\n\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\n\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\n\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\n\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\n\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\n\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\n\\\\\\\\ncan read the paper, \\u201c [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n\\\\\\\\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,\\u201d here.\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\n\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\n\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\n\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\n\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\n\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\n\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most cost-\\\\\\\\n\\\\\\\\neffective storage systems in the world, which makes them an attractive platform to\\\\\\\\n\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\n\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027414565, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b1f28e2afb30602c0205684eb65002df\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Current lakehouses reduce cost, but their performance can still lag specialized\\\\\\\\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\\\\\\\\nnotebooks) over others so lakehouses will also need to improve their UX and their\\\\\\\\nconnectors to popular tools so they can appeal to a variety of personas. These\\\\\\\\nand other issues will be addressed as the technology continues to mature and\\\\\\\\ndevelop. Over time, lakehouses will close these gaps while retaining the core\\\\\\\\nproperties of being simpler, more cost-efficient and more capable of serving\\\\\\\\ndiverse data applications.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Diving Deep Into the**\\\\\\\\n**Inner Workings of the**\\\\\\\\n**Lakehouse and Delta Lake**\\\\\\\\n\\\\\\\\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\\\\\\\\nadopting the lakehouse pattern. The blog created a massive amount of interest\\\\\\\\nfrom technology enthusiasts. While lots of people praised it as the next-generation\\\\\\\\ndata architecture, some people thought the lakehouse is the same thing as\\\\\\\\nthe data lake. 
Recently, several of our engineers and founders wrote a research\\\\\\\\npaper that describes some of the core technological challenges and solutions that\\\\\\\\nset the lakehouse architecture apart from the data lake, and it was accepted and\\\\\\\\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\\\\\\\\ncan read the paper, [\\u201cDelta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\\\\\\\\n[Object Stores,\\u201d here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\\\\\\\\n\\\\\\\\nHenry Ford is often credited with having said, \\u201cIf I had asked people what they wanted,\\\\\\\\nthey would have said faster horses.\\u201d The crux of this statement is that people often\\\\\\\\nenvision a better solution to a problem as an evolution of what they already know\\\\\\\\nrather than rethinking the approach to the problem altogether. In the world of data\\\\\\\\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\\\\\\\\nthe old horses of data warehouses and data lakes rather than seek a new solution.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\\\\\\\\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\\\\\\\\nstore data warehouses and data lakes. However, their nature as key-value stores\\\\\\\\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\\\\\\\\nperformance is hampered by expensive metadata operations (e.g., listing objects)\\\\\\\\nand limited consistency guarantees.\\\\\\\\n\\\\\\\\nBased on the characteristics of cloud object stores, three approaches have emerged.\\\\\\\\n\\\\\\\\n**1. Data lakes**\\\\\\\\nThe first is directories of files (i.e., data lakes) that store the table as a collection\\\\\\\\nof objects, typically in columnar format such as Apache Parquet. It\\u2019s an attractive\\\\\\\\napproach because the table is just a group of objects that can be accessed from\\\\\\\\na wide variety of tools without a lot of additional data stores or systems. However,\\\\\\\\nboth performance and consistency problems are common. Hidden data corruption\\\\\\\\nis common due to failed transactions, eventual consistency leads to inconsistent\\\\\\\\nqueries, latency is high, and basic management capabilities like table versioning and\\\\\\\\naudit logs are unavailable.\\\\\\\\n\\\\\\\\n**2. Custom storage engines**\\\\\\\\nThe second approach is custom storage engines, such as proprietary systems built for\\\\\\\\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\\\\\\\\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\\\\\\\\nservice that\\u2019s able to provide a single source of truth. However, all I/O operations need\\\\\\\\nto connect to this metadata service, which can increase cloud resource costs and\\\\\\\\nreduce performance and availability. Additionally, it takes a lot of engineering work to\\\\\\\\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\\\\\\\\nand PyTorch, which can be challenging for data teams that use a variety of computing\\\\\\\\nengines on their data. 
Engineering challenges can be exacerbated by unstructured\\\\\\\\ndata because these systems are generally optimized for traditional structured\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\ndata types. Finally, and most egregiously, the proprietary metadata service locks\\\\\\\\ncustomers into a specific service provider, leaving customers to contend with\\\\\\\\nconsistently high prices and expensive, time-consuming migrations if they decide to\\\\\\\\nadopt a new approach later.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.002695809, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"8375eac494bff392a37d6dff7c40c1b1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"- **\\\\\\\\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\\\\\\\\nbe reading and writing data concurrently. Support for ACID transactions ensures\\\\\\\\nconsistency as multiple parties concurrently read or write data, typically using SQL.\\\\\\\\n\\\\\\\\n\\\\\\\\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\\\\\\\\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\\\\\\\\nwarehouses.\\\\\\\\n\\\\\\\\nThe need for a flexible, high-performance system hasn\\u2019t abated. Companies\\\\\\\\nrequire systems for diverse data applications including SQL analytics, real-time\\\\\\\\nmonitoring, data science and machine learning. Most of the recent advances in\\\\\\\\nAI have been in better models to process unstructured data (text, images, video,\\\\\\\\naudio), but these are precisely the types of data that a data warehouse is not\\\\\\\\noptimized for.\\\\\\\\n\\\\\\\\nA common approach is to use multiple systems \\u2014 a data lake, several data\\\\\\\\nwarehouses, and other specialized systems such as streaming, time-series, graph\\\\\\\\nand image databases. Having a multitude of systems introduces complexity and,\\\\\\\\nmore importantly, introduces delay as data professionals invariably need to move\\\\\\\\nor copy data between different systems.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**\\\\\\\\u0007Schema enforcement and governance:** The lakehouse should have a way to\\\\\\\\nsupport schema enforcement and evolution, supporting DW schema paradigms\\\\\\\\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\\\\\\\\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\\\\\\\\nreduces staleness and improves recency, reduces latency and lowers the cost of\\\\\\\\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Storage is decoupled from compute:** In practice, this means storage and compute\\\\\\\\nuse separate clusters, thus these systems are able to scale to many more\\\\\\\\nconcurrent users and larger data sizes. 
Some modern data warehouses also have\\\\\\\\nthis property.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Openness:** The storage formats they use are open and standardized, such as\\\\\\\\nParquet, and they provide an API so a variety of tools and engines, including\\\\\\\\nmachine learning and Python/R libraries, can efficiently access the data directly.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse data types ranging from unstructured to structured data:**\\\\\\\\nThe lakehouse can be used to store, refine, analyze and access data types needed\\\\\\\\nfor many new data applications, including images, video, audio, semi-structured\\\\\\\\ndata, and text.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007Support for diverse workloads:** Including data science, machine learning and SQL\\\\\\\\nanalytics. Multiple tools might be needed to support all these workloads, but they all\\\\\\\\nrely on the same data repository.\\\\\\\\n\\\\\\\\n- **\\\\\\\\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\\\\\\\\nSupport for streaming eliminates the need for separate systems dedicated to\\\\\\\\nserving real-time data applications.\\\\\\\\n\\\\\\\\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\\\\\\\\nfeatures. Tools for security and access control are basic requirements. Data governance\\\\\\\\ncapabilities including auditing, retention and lineage have become essential particularly\\\\\\\\nin light of recent privacy regulations. Tools that enable data discovery such as data\\\\\\\\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\\\\\\\\nfeatures only need to be implemented, tested and administered for a single system.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Read the research**\\\\\\\\n**Delta Lake: High-Performance ACID**\\\\\\\\n**Table Storage Over Cloud Object Stores**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025942351, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"accf6ad13717062292245537ffbd0249\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The key features of the lakehouse pattern include:\\\\n\\\\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\\\\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\\\\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\\\\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\\\\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\\\\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\\\\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\\\\n8. 
End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\\\\n\\\\nThe advantages of the lakehouse pattern include:\\\\n\\\\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\\\\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\\\\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\\\\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\\\\n5. Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\\\\n\\\\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/guide-evolve-your-data-warehouse-to-the-lakehouse-v3.pdf", + "The key features of the lakehouse pattern include:\n\n1. Transaction support: ACID transactions ensure consistency as multiple parties concurrently read or write data.\n2. Schema enforcement and governance: The lakehouse supports schema enforcement and evolution, with robust governance and auditing mechanisms.\n3. BI support: Lakehouses enable using BI tools directly on the source data, reducing staleness and latency.\n4. Storage decoupled from compute: Storage and compute use separate clusters, allowing for greater scalability.\n5. Openness: The storage formats used are open and standardized, such as Parquet, and provide an API for efficient access.\n6. Support for diverse data types: The lakehouse can store, refine, analyze, and access various data types, including images, video, audio, semi-structured data, and text.\n7. Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics, with multiple tools relying on the same data repository.\n8. End-to-end streaming: Real-time reports are supported, eliminating the need for separate systems dedicated to serving real-time data applications.\n\nThe advantages of the lakehouse pattern include:\n\n1. Simplified data architecture: The lakehouse combines the best elements of data warehouses and data lakes.\n2. Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems.\n3. Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data.\n4. Flexibility and openness: The lakehouse supports a variety of tools and engines, allowing for greater flexibility and openness.\n5. 
Support for diverse data applications: The lakehouse can support a range of data applications, including data science, machine learning, and SQL analytics.\n\nOverall, the lakehouse pattern offers a flexible, cost-efficient, and scalable solution for data storage and analytics, with a range of features and advantages that make it an attractive option for organizations looking to simplify their data architecture and improve collaboration and flexibility.", + "no", + "[correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. **Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.", + "yes", + "No harmful content detected in response", + "no", + "The expected response states several features and advantages of the lakehouse pattern. The response supports the following parts of the expected response: - 'It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics' is supported by the response mentioning 'The lakehouse combines the best elements of data warehouses and data lakes' and 'Transaction support: ACID transactions ensure consistency'. - 'It supports data transformation, cleansing, and validation' is not explicitly mentioned in the response. - 'The lakehouse pattern is cloud-centric' is not mentioned in the response. - 'It enhances support for both business intelligence and machine learning' is supported by the response mentioning 'BI support' and 'Support for diverse workloads: The lakehouse supports data science, machine learning, and SQL analytics'. - 'It is cost-efficient' is supported by the response mentioning 'Cost-efficiency: Lakehouses reduce costs by storing data in a single location and eliminating the need for separate systems'. - 'It offers an up-to-date unified data set' is supported by the response mentioning 'Lakehouses enable using BI tools directly on the source data, reducing staleness and latency'. - 'It improves collaboration across the organization' is supported by the response mentioning 'Improved collaboration: The lakehouse enables data teams to work together more effectively, with a single source of truth for data'. Therefore, the response is not correct.", + 9.673, + 6990.0, + 6558.0, + 432.0, + null, + null, + [ + [ + "Why Lakehouse for Insurance ............................................................................................................................................................................ **10**\n\nKey Use Cases for Insurance:\n\n**C L A I M S A U T O M AT I O N A N D T R A N S F O R M AT I O N** ............................................................................................................................................. **14**\n\n**D Y N A M I C P R I C I N G A N D U N D E R W R I T I N G** .......................................................................................................................................................... **15**\n\n**A N O M A LY D E T E C T I O N A N D F R A U D U L E N T C L A I M S** ...................................................................................................................................... 
**16**\n\n**C U S T O M E R 3 6 0 A N D H Y P E R - P E R S O N A L I Z AT I O N** ......................................................................................................................................... **17**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ebook_insurance_v10.pdf" + ], + [ + "In short, a lakehouse is a data architecture that combines the best elements\nof data warehouses and data lakes. Lakehouses are enabled by a new system\ndesign, which implements similar data structures and data management features\nfound in a data warehouse directly on the low-cost storage used for data lakes.\n\n\n-----\n\n##### Data lakehouse\n\nOne platform to unify all your data, analytics and AI workloads\n\n###### Lakehouse Platform\n\nAll machine learning, SQL,\nBI, and streaming use cases\n\nOne security and governance\napproach for all data assets\non all clouds\n\n\n-----\n\n**Key features for a lakehouse**\n\nRecent innovations with the data lakehouse architecture can help simplify\nyour data and AI workloads, ease collaboration for data teams, and maintain\nthe kind of flexibility and openness that allows your organization to stay agile\nas you scale. Here are key features to consider when evaluating data lakehouse\narchitectures:\n\nTransaction support: In an enterprise lakehouse, many data pipelines will\noften be reading and writing data concurrently. Support for ACID (Atomicity,\nConsistency, Isolation and Durability) transactions ensures consistency as\nmultiple parties concurrently read or write data.\n\nSchema enforcement and governance: The lakehouse should have\na way to support schema enforcement and evolution, supporting data\nwarehouse schema paradigms such as star/snowflake. The system should\nbe able to reason about data integrity, and it should have robust governance\nand auditing mechanisms.\n\nData governance: Capabilities including auditing, retention and lineage\nhave become essential, particularly considering recent privacy regulations.\n\nTools that allow data discovery have become popular, such as data catalogs\nand data usage metrics.\n\nBI support: Lakehouses allow the use of BI tools directly on the source\ndata. This reduces staleness and latency, improves recency and lowers cost\nby not having to operationalize two copies of the data in both a data lake\nand a warehouse.\n\n\nStorage decoupled from compute: In practice, this means storage and\ncompute use separate clusters, thus these systems can scale to many more\nconcurrent users and larger data sizes. 
Some modern data warehouses also\nhave this property.\n\nOpenness: The storage formats, such as Apache Parquet, are open and\nstandardized, so a variety of tools and engines, including machine learning\nand Python/R libraries, can efficiently access the data directly.\n\nSupport for diverse data types (unstructured and structured):\nThe lakehouse can be used to store, refine, analyze and access data types\nneeded for many new data applications, including images, video, audio,\nsemi-structured data and text.\n\nSupport for diverse workloads: Use the same data repository for a range\nof workloads including data science, machine learning and SQL analytics.\nMultiple tools might be needed to support all these workloads.\n\nEnd-to-end streaming: Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\n**Learn more**\n\n**•** [Lakehouse: A New Generation of Open Platforms](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n[That Unify Data Warehousing and Advanced Analytics](http://www.cidrdb.org/cidr2021/papers/cidr2021_paper17.pdf)\n\n**•** [Building the Data Lakehouse by Bill Inmon, Father of the](https://databricks.com/p/ebook/building-the-data-lakehouse)\n[Data Warehouse](https://databricks.com/p/ebook/building-the-data-lakehouse)\n\n**•** [What Is a Data Lakehouse?](https://databricks.com/glossary/data-lakehouse#:~:text=A%20data%20lakehouse%20is%20a,(ML)%20on%20all%20data.)\n\n\n-----\n\n**CHAPTER**\n\n# 02\n\n\n### The Databricks Lakehouse Platform\n\n\n-----\n\n#### Lakehouse: A new generation of open platforms\n\n\n###### This is the lakehouse paradigm", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf" + ], + [ + "versioning, governance, security and ACID properties that are needed even for\n\nunstructured data.\n\n\nstored procedures are available, but users may need to employ other mechanisms that\n\n\naren’t equivalent to those found in traditional data warehouses. The latter is particularly\n\nimportant for “lift and shift scenarios,” which require systems that achieve semantics\n\nthat are almost identical to those of older, commercial data warehouses.\n\nWhat about support for other types of data applications? Users of a lakehouse have\n\naccess to a variety of standard tools ( [Apache Spark](https://databricks.com/glossary/apache-spark-as-a-service) , Python, R, machine learning\n\nlibraries) for non-BI workloads like data science and machine learning. Data\n\nexploration and refinement are standard for many analytic and data science\n\napplications. Delta Lake is designed to let users incrementally improve the quality of\n\n\nCurrent lakehouses reduce cost, but their performance can still lag specialized\n\nsystems (such as data warehouses) that have years of investments and real-\n\nworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\n\nnotebooks) over others so lakehouses will also need to improve their UX and their\n\nconnectors to popular tools so they can appeal to a variety of personas. These\n\nand other issues will be addressed as the technology continues to mature and\n\ndevelop. 
Over time, lakehouses will close these gaps while retaining the core\n\nproperties of being simpler, more cost-efficient and more capable of serving\n\ndiverse data applications.\n\n\ndata in their lakehouse until it is ready for consumption.\n\n\n-----\n\n**Diving Deep Into the Inner Workings**\n**of the Lakehouse and Delta Lake**\n\n### CHAPTER 02\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n# 02\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\n\nadopting the lakehouse pattern. The blog created a massive amount of interest\n\nfrom technology enthusiasts. While lots of people praised it as the next-generation\n\ndata architecture, some people thought the lakehouse is the same thing as\n\nthe data lake. Recently, several of our engineers and founders wrote a research\n\npaper that describes some of the core technological challenges and solutions that\n\nset the lakehouse architecture apart from the data lake, and it was accepted and\n\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\n\ncan read the paper, “ [Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n\n[Object Stores](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) ,” here.\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\n\nthey would have said faster horses.” The crux of this statement is that people often\n\nenvision a better solution to a problem as an evolution of what they already know\n\nrather than rethinking the approach to the problem altogether. In the world of data\n\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\n\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\n\nobject stores like Amazon S3 have become some of the largest and most cost-\n\neffective storage systems in the world, which makes them an attractive platform to\n\nstore data warehouses and data lakes. However, their nature as key-value stores\n\nmakes it difficult to achieve ACID transactions that many organizations require. Also,", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Delta-Lake-Series-Lakehouse-012921.pdf" + ], + [ + "Current lakehouses reduce cost, but their performance can still lag specialized\nsystems (such as data warehouses) that have years of investments and realworld deployments behind them. Users may favor certain tools (BI tools, IDEs,\nnotebooks) over others so lakehouses will also need to improve their UX and their\nconnectors to popular tools so they can appeal to a variety of personas. These\nand other issues will be addressed as the technology continues to mature and\ndevelop. Over time, lakehouses will close these gaps while retaining the core\nproperties of being simpler, more cost-efficient and more capable of serving\ndiverse data applications.\n\n\n-----\n\n**Diving Deep Into the**\n**Inner Workings of the**\n**Lakehouse and Delta Lake**\n\nDatabricks wrote a [blog article](https://databricks.com/blog/2020/01/30/what-is-a-data-lakehouse.html) that outlined how more and more enterprises are\nadopting the lakehouse pattern. 
The blog created a massive amount of interest\nfrom technology enthusiasts. While lots of people praised it as the next-generation\ndata architecture, some people thought the lakehouse is the same thing as\nthe data lake. Recently, several of our engineers and founders wrote a research\npaper that describes some of the core technological challenges and solutions that\nset the lakehouse architecture apart from the data lake, and it was accepted and\npublished at the International Conference on Very Large Databases (VLDB) 2020. You\ncan read the paper, [“Delta Lake: High-Performance ACID Table Storage Over Cloud](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf)\n[Object Stores,” here](https://databricks.com/wp-content/uploads/2020/08/p975-armbrust.pdf) .\n\nHenry Ford is often credited with having said, “If I had asked people what they wanted,\nthey would have said faster horses.” The crux of this statement is that people often\nenvision a better solution to a problem as an evolution of what they already know\nrather than rethinking the approach to the problem altogether. In the world of data\nstorage, this pattern has been playing out for years. Vendors continue to try to reinvent\nthe old horses of data warehouses and data lakes rather than seek a new solution.\n\n\n-----\n\nMore than a decade ago, the cloud opened a new frontier for data storage. Cloud\nobject stores like Amazon S3 have become some of the largest and most costeffective storage systems in the world, which makes them an attractive platform to\nstore data warehouses and data lakes. However, their nature as key-value stores\nmakes it difficult to achieve ACID transactions that many organizations require. Also,\nperformance is hampered by expensive metadata operations (e.g., listing objects)\nand limited consistency guarantees.\n\nBased on the characteristics of cloud object stores, three approaches have emerged.\n\n**1. Data lakes**\nThe first is directories of files (i.e., data lakes) that store the table as a collection\nof objects, typically in columnar format such as Apache Parquet. It’s an attractive\napproach because the table is just a group of objects that can be accessed from\na wide variety of tools without a lot of additional data stores or systems. However,\nboth performance and consistency problems are common. Hidden data corruption\nis common due to failed transactions, eventual consistency leads to inconsistent\nqueries, latency is high, and basic management capabilities like table versioning and\naudit logs are unavailable.\n\n**2. Custom storage engines**\nThe second approach is custom storage engines, such as proprietary systems built for\nthe cloud like the Snowflake data warehouse. These systems can bypass the consistency\nchallenges of data lakes by managing the metadata in a separate, strongly consistent\nservice that’s able to provide a single source of truth. However, all I/O operations need\nto connect to this metadata service, which can increase cloud resource costs and\nreduce performance and availability. Additionally, it takes a lot of engineering work to\nimplement connectors to existing computing engines like Apache Spark, TensorFlow\nand PyTorch, which can be challenging for data teams that use a variety of computing\nengines on their data. Engineering challenges can be exacerbated by unstructured\ndata because these systems are generally optimized for traditional structured\n\n\n-----\n\ndata types. 
Finally, and most egregiously, the proprietary metadata service locks\ncustomers into a specific service provider, leaving customers to contend with\nconsistently high prices and expensive, time-consuming migrations if they decide to\nadopt a new approach later.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf" + ], + [ + "- **\u0007Transaction support:** In an enterprise lakehouse, many data pipelines will often\nbe reading and writing data concurrently. Support for ACID transactions ensures\nconsistency as multiple parties concurrently read or write data, typically using SQL.\n\n\nand batch and streaming jobs. For these reasons, many of the promises of data lakes\nhave not materialized and, in many cases, lead to a loss of many of the benefits of data\nwarehouses.\n\nThe need for a flexible, high-performance system hasn’t abated. Companies\nrequire systems for diverse data applications including SQL analytics, real-time\nmonitoring, data science and machine learning. Most of the recent advances in\nAI have been in better models to process unstructured data (text, images, video,\naudio), but these are precisely the types of data that a data warehouse is not\noptimized for.\n\nA common approach is to use multiple systems — a data lake, several data\nwarehouses, and other specialized systems such as streaming, time-series, graph\nand image databases. Having a multitude of systems introduces complexity and,\nmore importantly, introduces delay as data professionals invariably need to move\nor copy data between different systems.\n\n\n-----\n\n**\u0007Schema enforcement and governance:** The lakehouse should have a way to\nsupport schema enforcement and evolution, supporting DW schema paradigms\nsuch as star/snowflake-schemas. The system should be able to [reason about data](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html)\n[integrity](https://databricks.com/blog/2019/08/21/diving-into-delta-lake-unpacking-the-transaction-log.html) , and it should have robust governance and auditing mechanisms.\n\n- **\u0007BI support:** Lakehouses enable using BI tools directly on the source data. This\nreduces staleness and improves recency, reduces latency and lowers the cost of\nhaving to operationalize two copies of the data in both a data lake and a warehouse.\n\n- **\u0007Storage is decoupled from compute:** In practice, this means storage and compute\nuse separate clusters, thus these systems are able to scale to many more\nconcurrent users and larger data sizes. Some modern data warehouses also have\nthis property.\n\n- **\u0007Openness:** The storage formats they use are open and standardized, such as\nParquet, and they provide an API so a variety of tools and engines, including\nmachine learning and Python/R libraries, can efficiently access the data directly.\n\n- **\u0007Support for diverse data types ranging from unstructured to structured data:**\nThe lakehouse can be used to store, refine, analyze and access data types needed\nfor many new data applications, including images, video, audio, semi-structured\ndata, and text.\n\n- **\u0007Support for diverse workloads:** Including data science, machine learning and SQL\nanalytics. 
Multiple tools might be needed to support all these workloads, but they all\nrely on the same data repository.\n\n- **\u0007End-to-end streaming:** Real-time reports are the norm in many enterprises.\nSupport for streaming eliminates the need for separate systems dedicated to\nserving real-time data applications.\n\nThese are the key attributes of lakehouses. Enterprise-grade systems require additional\nfeatures. Tools for security and access control are basic requirements. Data governance\ncapabilities including auditing, retention and lineage have become essential particularly\nin light of recent privacy regulations. Tools that enable data discovery such as data\ncatalogs and data usage metrics are also needed. With a lakehouse, such enterprise\nfeatures only need to be implemented, tested and administered for a single system.\n\n\n-----\n\n**Read the research**\n**Delta Lake: High-Performance ACID**\n**Table Storage Over Cloud Object Stores**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/030521-2-The-Delta-Lake-Series-Complete-Collection.pdf" + ] + ], + "yes", + "The response lists several key features and advantages of the lakehouse pattern. The retrieved context supports the following features: 1. Transaction support: ACID transactions are mentioned in the retrieved context. 2. Schema enforcement and governance: The retrieved context mentions schema enforcement, evolution, and robust governance mechanisms. 3. BI support: The retrieved context states that lakehouses enable using BI tools directly on the source data. 4. Storage decoupled from compute: The retrieved context mentions that storage and compute use separate clusters. 5. Openness: The retrieved context mentions open and standardized storage formats like Parquet. 6. Support for diverse data types: The retrieved context mentions support for various data types, including images, video, audio, semi-structured data, and text. 7. Support for diverse workloads: The retrieved context mentions support for data science, machine learning, and SQL analytics. 8. End-to-end streaming: The retrieved context mentions support for streaming and real-time reports. The advantages listed in the response are also supported by the retrieved context: 1. Simplified data architecture: The retrieved context mentions that a lakehouse combines the best elements of data warehouses and data lakes. 2. Cost-efficiency: The retrieved context mentions that lakehouses reduce costs. 3. Improved collaboration: The retrieved context mentions easing collaboration for data teams. 4. Flexibility and openness: The retrieved context mentions flexibility and openness. 5. Support for diverse data applications: The retrieved context mentions support for a range of data applications. Therefore, the response is fully supported by the retrieved context.", + "yes", + "The ground truth states several key features and advantages of the lakehouse pattern. The retrieved context supports the following points: - The lakehouse pattern has an open data management architecture (retrieved context mentions openness and standardized storage formats). - It combines data lakes and data warehouses, offering flexibility and scale along with data management and ACID semantics (retrieved context mentions combining the best elements of data lakes and data warehouses, and support for ACID transactions). - It supports data transformation, cleansing, and validation (retrieved context mentions data exploration and refinement). 
- The lakehouse pattern is cloud-centric (retrieved context mentions cloud object stores like Amazon S3). - It enhances support for both business intelligence and machine learning (retrieved context mentions support for BI tools, machine learning, and diverse workloads). - It is cost-efficient (retrieved context mentions reducing cost). - It offers an up-to-date unified data set (retrieved context mentions reducing staleness and improving recency). - It improves collaboration across the organization (retrieved context mentions easing collaboration for data teams). Therefore, all parts of the ground truth are supported by the retrieved context.", + [ + "no", + "no", + "no", + "no", + "no" + ], + 0.0 + ], + [ + "fdc9f7bcc67a7cc4785f7f8df330c023da14c8d768905f0ad46af5323f28e480", + [ + [ + [ + "What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?", + "user" + ] + ] + ], + [ + [ + "“In a case study published in MIT Technology Review, profit margins increased and manufacturing time was reduced when digital-twin technology was implemented. Automobile manufacturing profit margins increased by 41% to 54% per model. The estimated average automobile manufacturing time was reduced to approximately 10 hours.”", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ] + ], + [ + "The increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%.", + "The reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology." + ], + "{\"info\": {\"request_id\": \"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543860485, \"execution_time_ms\": 5265, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately...\"}, \"tags\": {\"eval.requestId\": \"a7649f7f-778e-434b-b85a-04318b6b1e61\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", 
\\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-2ac9b1075b2348d9be8ede5ab3f84e4e/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x00ac36763a70235c\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": null, \"start_time\": 1734543860485592707, \"end_time\": 1734543865751173309, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? 
Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential 
revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. 
This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. 
All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xa6b201baab85793c\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x00ac36763a70235c\", \"start_time\": 1734543860521575844, \"end_time\": 1734543860528497728, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0x2d706fe1d2c901dd\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x00ac36763a70235c\", \"start_time\": 1734543860528634729, \"end_time\": 1734543860529225237, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x7549ef8340c7b553\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x2d706fe1d2c901dd\", \"start_time\": 1734543860528803631, \"end_time\": 1734543860528986334, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time 
for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xcdbb6bf762090d25\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x2d706fe1d2c901dd\", \"start_time\": 1734543860529056335, \"end_time\": 1734543860529191236, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x099d6f4de27e7c6d\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x00ac36763a70235c\", \"start_time\": 1734543860529288237, \"end_time\": 1734543865750738504, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory 
challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x24f475d091bd7546\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x099d6f4de27e7c6d\", \"start_time\": 1734543860608406097, \"end_time\": 1734543861698696326, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. 
**Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_3e00c780-f6a0-4bea-8e51-22c79e244794\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543860, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 29, \\\"prompt_tokens\\\": 1163, \\\"total_tokens\\\": 1192, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x68973af5476e3ebb\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, 
\"parent_id\": \"0x099d6f4de27e7c6d\", \"start_time\": 1734543861711288479, \"end_time\": 1734543862593922839, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\", \\\"filters\\\": []}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004522661, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\"}, {\\\"page_content\\\": \\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004403091, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\"}, {\\\"page_content\\\": \\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as 
iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0043494906, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"929aec8a6e41f875b04a8fd58c7e9553\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. 
In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. 
The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004202808, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"33042520bb456fb0730d8ed53528a953\\\"}, {\\\"page_content\\\": \\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ 
**09**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0036511072, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"77fa3ca534959648d7a8e5eebca4d12e\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x747651a6257c725a\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x68973af5476e3ebb\", \"start_time\": 1734543861712386292, \"end_time\": 1734543862592567023, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. 
The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004522661], [\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated 
by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004403091], [\\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption 
reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"929aec8a6e41f875b04a8fd58c7e9553\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0043494906], [\\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. 
Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"33042520bb456fb0730d8ed53528a953\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004202808], [\\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ 
**09**\\\", \\\"77fa3ca534959648d7a8e5eebca4d12e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0036511072]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0xa692d8258db6db26\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x68973af5476e3ebb\", \"start_time\": 1734543862592802126, \"end_time\": 1734543862593607336, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004522661], [\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004403091], [\\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. 
All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"929aec8a6e41f875b04a8fd58c7e9553\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0043494906], [\\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. 
In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"33042520bb456fb0730d8ed53528a953\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.004202808], [\\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ 
**03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\", \\\"77fa3ca534959648d7a8e5eebca4d12e\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\", 0.0036511072]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"Improve product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\\n\\\\n**8%**\\\\n**8%**\\\\n\\\\n\\\\nCan you imagine the cost to change\\\\nan oil refinery\\u2019s crude distillation\\\\nunit process conditions to improve\\\\nthe output of diesel one week\\\\nand gasoline the next to address\\\\nchanges in demand and ensure\\\\nmaximum economic value? Can you\\\\nimagine how to replicate an even\\\\nsimple supply chain to model risk?\\\\n\\\\n\\\\n**5%**\\\\n\\\\n\\\\n**1%**\\\\n\\\\n\\\\n-----\\\\n\\\\n### What Are Digital Twins?\\\\n\\\\n\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\n\\\\n\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\nand processes state data with the help of various IoT sensors [operational\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\ngenerate possible insights.\\\\n\\\\n\\\\n**Types of Digital Twins**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twin Architectures\\\\n\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\n\\\\n\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\nthe industrial environment.\\\\n\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\n\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\n\\\\n**AI**\\\\n\\\\nSimulate & Optimize\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n\\\\n# 6-8 18-24\\\\n## years to months\\\\n\\\\n\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004522661, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\"}, {\\\"page_content\\\": \\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 10%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 50%\\\\n\\\\n\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n# 25%\\\\n\\\\n\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\n\\\\n\\\\n-----\\\\n\\\\n**Introduction (continued)**\\\\n\\\\n\\\\n**Digital twin market growth rate accelerates**\\\\n\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\n\\\\n\\\\n**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004403091, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"32450e347d08b2ca314b2a9bc96b9a6e\\\"}, {\\\"page_content\\\": \\\"**But challenges remain**\\\\n\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\ntwins are addressing include:\\\\n\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\nlonger development times\\\\n\\\\n**\\u2022** The supply chain is opaque\\\\n\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\nand the projection of operating cost is obscure\\\\n\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\nindividual departments\\\\n\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\nprocess disruptions\\\\n\\\\n**\\u2022** Incongruous collaborations between departments\\\\n\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\n\\\\n\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\nto be in the 25-40% CAGR growth rate.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\n\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\nwould have come at significant costs without digital twin technology.\\\\n\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\n\\\\n\\\\n\\\\n**\\u2022** Product design and development is performed with\\\\nless cost and is completed in less time as 
iterative\\\\nsimulations, using multiple constraints, deliver the\\\\nbest or most optimized design. All commercial\\\\naircraft are designed using digital twins.\\\\n\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\ninventory will last, when to replenish and how to\\\\nminimize the supply chain disruptions. The oil and gas\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\ndigital twins to reduce supply chain bottlenecks in\\\\nstorage and midstream delivery, schedule tanker\\\\noff-loads and model demand with externalities.\\\\n\\\\n\\\\n\\\\n**\\u2022** Continuous quality checks on produced items\\\\nwith ML/AI generated feedback pre-emptively\\\\nassuring improved product quality. Final paint\\\\ninspection in the automotive industry, for example,\\\\nis performed with computer vision built on top of\\\\ndigital twin technology.\\\\n\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\na part before the process degrades or breaks\\\\ndown and utilizing the components to their fullest,\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\nbuilding an asset performance management suite.\\\\n\\\\n\\\\n\\\\n**\\u2022** Digital twins create the opportunity to have\\\\nmultiple departments in sync by providing\\\\nnecessary instructions modularly to attain\\\\na required throughput. Digital twins are the\\\\nbackbone of kaizen events that optimize\\\\nmanufacturing process flow.\\\\n\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\ninputs, from point of sale customer behavior,\\\\nbuying preferences, or product performance and\\\\nthen integrated into the product development\\\\nprocess, forming a closed loop providing an\\\\nimproved product design.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\n\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\n\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\n\\\\n\\\\nImprove product quality\\\\n\\\\nReduce manufacturing costs\\\\n\\\\nReduce unplanned downtime\\\\n\\\\nIncrease throughput\\\\n\\\\nEnsure safe manufacturing\\\\n\\\\nTest new design ideas\\\\n\\\\nDevelop product enhancements\\\\n\\\\nDigital transformation of enterprise\\\\n\\\\nSpeed new product introduction\\\\n\\\\nReduce planned downtime\\\\n\\\\nMeet new regulatory challenges\\\\n\\\\nTraining for new manufacturing processes\\\\n\\\\nDesign changes to production line\\\\n\\\\nProvide service to end users customers\\\\n\\\\nUpdate products in the field\\\\n\\\\n\\\\n**34%**\\\\n\\\\n\\\\n**30%**\\\\n\\\\n**28%**\\\\n**25%**\\\\n\\\\n**24%**\\\\n\\\\n\\\\n**16%**\\\\n\\\\n**14%**\\\\n\\\\n**13%**\\\\n\\\\n**13%**\\\\n\\\\n**11%**\\\\n**10%**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0043494906, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"929aec8a6e41f875b04a8fd58c7e9553\\\"}, {\\\"page_content\\\": \\\"-----\\\\n\\\\n### Introduction\\\\n\\\\n\\\\nThe concept of digital twins is not new. 
In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\n\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\n\\\\n\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[model. 
The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.004202808, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"33042520bb456fb0730d8ed53528a953\\\"}, {\\\"page_content\\\": \\\"**eBook**\\\\n\\\\n# Making Your Digital Twin Come to Life\\\\n\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\n\\\\n\\\\n-----\\\\n\\\\n### Contents\\\\n\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\n\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\n\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\n\\\\nDigital Twin Architectures .................................................................................................................................................................................. **08**\\\\n\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ 
**09**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0036511072, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\"}, \\\"id\\\": \\\"77fa3ca534959648d7a8e5eebca4d12e\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0xfb2e6f01a480fe00\", \"trace_id\": \"0xa4e68cbc5db4ebc1e236017736f5cfa5\"}, \"parent_id\": \"0x099d6f4de27e7c6d\", \"start_time\": 1734543862609431325, \"end_time\": 1734543865748167773, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-2ac9b1075b2348d9be8ede5ab3f84e4e\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. 
The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated 
by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product 
designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. 
Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_12ef0ccb-1f7c-4148-993d-debb77eccddf\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543863, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 48, \\\"prompt_tokens\\\": 4959, \\\"total_tokens\\\": 5007, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What were the reported increases...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What were the reported increases in profit margins and reduction in manufacturing time for automobile manufacturing when 
digital-twin technology was implemented?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"digital twin technology automobile manufacturing profit margins reduction manufacturing time\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_c24c3f16-b1f1-4824-97bb-cf0a6ee6f98d\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"Improve product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\\n\\\\\\\\n**8%**\\\\\\\\n**8%**\\\\\\\\n\\\\\\\\n\\\\\\\\nCan you imagine the cost to change\\\\\\\\nan oil refinery\\u2019s crude distillation\\\\\\\\nunit process conditions to improve\\\\\\\\nthe output of diesel one week\\\\\\\\nand gasoline the next to address\\\\\\\\nchanges in demand and ensure\\\\\\\\nmaximum economic value? Can you\\\\\\\\nimagine how to replicate an even\\\\\\\\nsimple supply chain to model risk?\\\\\\\\n\\\\\\\\n\\\\\\\\n**5%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**1%**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### What Are Digital Twins?\\\\\\\\n\\\\\\\\n\\\\\\\\nKnowing the business challenges and benefits digital twins deliver, let\\u2019s turn to\\\\\\\\nthe basics and explore what digital twins are and how a modern data stack is\\\\\\\\nnecessary to build effective and timely digital twins. The classic definition of\\\\\\\\ndigital twin is: \\u201c [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .\\u201d\\\\\\\\n\\\\\\\\n\\\\\\\\nFor a discrete or continuous manufacturing process, a digital twin gathers system\\\\\\\\nand processes state data with the help of various IoT sensors [operational\\\\\\\\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\\\\\\\\nvirtual model which is then used to run simulations, study performance issues and\\\\\\\\ngenerate possible insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n**Types of Digital Twins**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twin Architectures\\\\\\\\n\\\\\\\\nClassic digital twins have been physics-based models of specific systems. 
More recently,\\\\\\\\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\\\\\\\\n\\\\\\\\n\\\\\\\\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\\\\\\\\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\\\\\\\\nthe industrial environment.\\\\\\\\n\\\\\\\\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\\\\\\\\n\\\\\\\\n**Data-Driven Operational Digital Twins: Maturity Journey**\\\\\\\\n\\\\\\\\n**AI**\\\\\\\\n\\\\\\\\nSimulate & Optimize\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n\\\\\\\\n# 6-8 18-24\\\\\\\\n## years to months\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004522661, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"c743afeca2a4f67e2f6fcc8b2a07bc10\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 10%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 50%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up 
to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n# 25%\\\\\\\\n\\\\\\\\n\\\\\\\\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Introduction (continued)**\\\\\\\\n\\\\\\\\n\\\\\\\\n**Digital twin market growth rate accelerates**\\\\\\\\n\\\\\\\\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\\\\\\\\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\\\\\\\\nat a CAGR of 58%, riding on the wave of Industry 4.0.\\\\\\\\n\\\\\\\\n\\\\\\\\n**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004403091, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"32450e347d08b2ca314b2a9bc96b9a6e\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**But challenges remain**\\\\\\\\n\\\\\\\\nThe most common challenges faced by the manufacturing industry that digital\\\\\\\\ntwins are addressing include:\\\\\\\\n\\\\\\\\n**\\u2022** Product designs are more complex, resulting in higher cost and increasingly\\\\\\\\nlonger development times\\\\\\\\n\\\\\\\\n**\\u2022** The supply chain is opaque\\\\\\\\n\\\\\\\\n**\\u2022** Production lines are not optimized \\u2013 performance variations, unknown defects\\\\\\\\nand the projection of operating cost is obscure\\\\\\\\n\\\\\\\\n**\\u2022** Poor quality management \\u2013 overreliance on theory, managed by\\\\\\\\nindividual departments\\\\\\\\n\\\\\\\\n**\\u2022** Reactive maintenance costs are too high, resulting in excessive downtime or\\\\\\\\nprocess disruptions\\\\\\\\n\\\\\\\\n**\\u2022** Incongruous collaborations between departments\\\\\\\\n\\\\\\\\n**\\u2022** Invisibility of customer demand for gathering real-time 
feedback\\\\\\\\n\\\\\\\\n\\\\\\\\nThe growth rate for digital twins is staggering with common adoption reported\\\\\\\\nto be in the 25-40% CAGR growth rate.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Digital Twins Bring Broad Benefits to Manufacturing\\\\\\\\n\\\\\\\\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\\\\\\\\nwould have come at significant costs without digital twin technology.\\\\\\\\n\\\\\\\\n**Let\\u2019s look at the benefits that digital twins deliver to the manufacturing sector:**\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Product design and development is performed with\\\\\\\\nless cost and is completed in less time as iterative\\\\\\\\nsimulations, using multiple constraints, deliver the\\\\\\\\nbest or most optimized design. All commercial\\\\\\\\naircraft are designed using digital twins.\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins provide the awareness of how long\\\\\\\\ninventory will last, when to replenish and how to\\\\\\\\nminimize the supply chain disruptions. The oil and gas\\\\\\\\nindustry, for example, uses supply chain\\u2013oriented\\\\\\\\ndigital twins to reduce supply chain bottlenecks in\\\\\\\\nstorage and midstream delivery, schedule tanker\\\\\\\\noff-loads and model demand with externalities.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Continuous quality checks on produced items\\\\\\\\nwith ML/AI generated feedback pre-emptively\\\\\\\\nassuring improved product quality. Final paint\\\\\\\\ninspection in the automotive industry, for example,\\\\\\\\nis performed with computer vision built on top of\\\\\\\\ndigital twin technology.\\\\\\\\n\\\\\\\\n**\\u2022** Striking the sweet spot between when to replace\\\\\\\\na part before the process degrades or breaks\\\\\\\\ndown and utilizing the components to their fullest,\\\\\\\\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\\\\\\\\nbuilding an asset performance management suite.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** Digital twins create the opportunity to have\\\\\\\\nmultiple departments in sync by providing\\\\\\\\nnecessary instructions modularly to attain\\\\\\\\na required throughput. Digital twins are the\\\\\\\\nbackbone of kaizen events that optimize\\\\\\\\nmanufacturing process flow.\\\\\\\\n\\\\\\\\n**\\u2022** Customer feedback loops can be modeled through\\\\\\\\ninputs, from point of sale customer behavior,\\\\\\\\nbuying preferences, or product performance and\\\\\\\\nthen integrated into the product development\\\\\\\\nprocess, forming a closed loop providing an\\\\\\\\nimproved product design.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\\\\\\\\n\\\\\\\\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\\\\\\\\nin manufacturing by a majority of companies. 
Those that have a lower adoption rate are more complex in\\\\\\\\ndeployment, but typically offer higher and longer-lasting value.\\\\\\\\n\\\\\\\\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\\\\\\\\n\\\\\\\\n\\\\\\\\nImprove product quality\\\\\\\\n\\\\\\\\nReduce manufacturing costs\\\\\\\\n\\\\\\\\nReduce unplanned downtime\\\\\\\\n\\\\\\\\nIncrease throughput\\\\\\\\n\\\\\\\\nEnsure safe manufacturing\\\\\\\\n\\\\\\\\nTest new design ideas\\\\\\\\n\\\\\\\\nDevelop product enhancements\\\\\\\\n\\\\\\\\nDigital transformation of enterprise\\\\\\\\n\\\\\\\\nSpeed new product introduction\\\\\\\\n\\\\\\\\nReduce planned downtime\\\\\\\\n\\\\\\\\nMeet new regulatory challenges\\\\\\\\n\\\\\\\\nTraining for new manufacturing processes\\\\\\\\n\\\\\\\\nDesign changes to production line\\\\\\\\n\\\\\\\\nProvide service to end users customers\\\\\\\\n\\\\\\\\nUpdate products in the field\\\\\\\\n\\\\\\\\n\\\\\\\\n**34%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**30%**\\\\\\\\n\\\\\\\\n**28%**\\\\\\\\n**25%**\\\\\\\\n\\\\\\\\n**24%**\\\\\\\\n\\\\\\\\n\\\\\\\\n**16%**\\\\\\\\n\\\\\\\\n**14%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**13%**\\\\\\\\n\\\\\\\\n**11%**\\\\\\\\n**10%**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0043494906, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"929aec8a6e41f875b04a8fd58c7e9553\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\n\\\\\\\\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\\\\\\\\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\\\\\\\\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. In\\\\\\\\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\\\\\\\\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\\\\\\\\ncost-effective and are now an imperative in today\\u2019s data-driven businesses.\\\\\\\\n\\\\\\\\nToday\\u2019s manufacturing industries are expected to streamline and optimize all the processes in their value\\\\\\\\nchain from product development and design, through operations and supply chain optimization to obtaining\\\\\\\\nfeedback to reflect and respond to rapidly growing customer demands. 
The digital twins category is broad\\\\\\\\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\\\\\\\\n\\\\\\\\n\\\\\\\\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[\\u201cprofit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\\n[approximately 10 hours.\\u201d](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.004202808, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"33042520bb456fb0730d8ed53528a953\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"**eBook**\\\\\\\\n\\\\\\\\n# Making Your Digital Twin Come to Life\\\\\\\\n\\\\\\\\n##### With the Lakehouse for Manufacturing and Tredence\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Contents\\\\\\\\n\\\\\\\\nIntroduction ................................................................................................................................................................................................................ **03**\\\\\\\\n\\\\\\\\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\\\\\\\\n\\\\\\\\nWhat Are Digital Twins? ........................................................................................................................................................................................ **07**\\\\\\\\n\\\\\\\\nDigital Twin Architectures .................................................................................................................................................................................. 
**08**\\\\\\\\n\\\\\\\\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0036511072, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"77fa3ca534959648d7a8e5eebca4d12e\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf", + "According to the search results, the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours.", + "yes", + null, + "yes", + "No harmful content detected in response", + "yes", + "The expected response states that the increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%. The response confirms this by stating that the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model. The expected response also states that the reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology. The response supports this by stating that there was a reduction in manufacturing time of approximately 10 hours. The response is correct.", + 5.265, + 6199.0, + 6122.0, + 77.0, + null, + null, + [ + [ + "Improve product quality\n\nReduce manufacturing costs\n\nReduce unplanned downtime\n\nIncrease throughput\n\nEnsure safe manufacturing\n\nTest new design ideas\n\nDevelop product enhancements\n\nDigital transformation of enterprise\n\nSpeed new product introduction\n\nReduce planned downtime\n\nMeet new regulatory challenges\n\nTraining for new manufacturing processes\n\nDesign changes to production line\n\nProvide service to end users customers\n\nUpdate products in the field\n\n\n**34%**\n\n\n**30%**\n\n**28%**\n**25%**\n\n**24%**\n\n\n**16%**\n\n**14%**\n\n**13%**\n\n**13%**\n\n**11%**\n**10%**\n\n**8%**\n**8%**\n\n\nCan you imagine the cost to change\nan oil refinery’s crude distillation\nunit process conditions to improve\nthe output of diesel one week\nand gasoline the next to address\nchanges in demand and ensure\nmaximum economic value? Can you\nimagine how to replicate an even\nsimple supply chain to model risk?\n\n\n**5%**\n\n\n**1%**\n\n\n-----\n\n### What Are Digital Twins?\n\n\nKnowing the business challenges and benefits digital twins deliver, let’s turn to\nthe basics and explore what digital twins are and how a modern data stack is\nnecessary to build effective and timely digital twins. 
The classic definition of\ndigital twin is: “ [A virtual model designed to accurately reflect a physical object](https://www.ibm.com/topics/what-is-a-digital-twin) .”\n\n\nFor a discrete or continuous manufacturing process, a digital twin gathers system\nand processes state data with the help of various IoT sensors [operational\ntechnology data (OT)] and enterprise data [informational technology (IT)] to form a\nvirtual model which is then used to run simulations, study performance issues and\ngenerate possible insights.\n\n\n**Types of Digital Twins**\n\n\n-----\n\n### Digital Twin Architectures\n\nClassic digital twins have been physics-based models of specific systems. More recently,\n**data-driven digital twins, which work on the real-time system data, are gaining prominence** .\n\n\nThese twins provide the opportunity to not just monitor and simulate system performance under specific\nconditions, but also provide the platform to further embed AI-based predictive and prescriptive solutions into\nthe industrial environment.\n\nDigital twins undergo a series of changes during their lifecycle to become completely autonomous.\n\n**Data-Driven Operational Digital Twins: Maturity Journey**\n\n**AI**\n\nSimulate & Optimize\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n\n# 6-8 18-24\n## years to months\n\n\n**[Digital twins have reduced](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[automotive product design](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**\n**[lifecycle from](https://www.technologyreview.com/2022/01/05/1042981/digital-twins-improve-real-life-manufacturing/)**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ], + [ + "**[Digital twins accelerate](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[potential revenue](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[increase up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 10%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 50%\n\n\n**[Time to market](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[accelerated by](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n**[Product 
quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n# 25%\n\n\n**[Product quality](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n**[improvement up to](https://www.mckinsey.com/business-functions/operations/our-insights/digital-twins-the-art-of-the-possible-in-product-development-and-beyond)**\n\n\n-----\n\n**Introduction (continued)**\n\n\n**Digital twin market growth rate accelerates**\n\nDigital twins are now so ingrained in manufacturing that the [global industry market](https://www.marketsandmarkets.com/Market-Reports/digital-twin-market-225269522.html)\nis forecasted to reach $48 billion in 2026. This figure is up from $3.1 billion in 2020\nat a CAGR of 58%, riding on the wave of Industry 4.0.\n\n\n**But challenges remain**\n\nThe most common challenges faced by the manufacturing industry that digital\ntwins are addressing include:\n\n**•** Product designs are more complex, resulting in higher cost and increasingly\nlonger development times\n\n**•** The supply chain is opaque\n\n**•** Production lines are not optimized – performance variations, unknown defects\nand the projection of operating cost is obscure\n\n**•** Poor quality management – overreliance on theory, managed by\nindividual departments\n\n**•** Reactive maintenance costs are too high, resulting in excessive downtime or\nprocess disruptions\n\n**•** Incongruous collaborations between departments\n\n**•** Invisibility of customer demand for gathering real-time feedback\n\n\nThe growth rate for digital twins is staggering with common adoption reported\nto be in the 25-40% CAGR growth rate.\n\n\n-----\n\n### Digital Twins Bring Broad Benefits to Manufacturing", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ], + [ + "**But challenges remain**\n\nThe most common challenges faced by the manufacturing industry that digital\ntwins are addressing include:\n\n**•** Product designs are more complex, resulting in higher cost and increasingly\nlonger development times\n\n**•** The supply chain is opaque\n\n**•** Production lines are not optimized – performance variations, unknown defects\nand the projection of operating cost is obscure\n\n**•** Poor quality management – overreliance on theory, managed by\nindividual departments\n\n**•** Reactive maintenance costs are too high, resulting in excessive downtime or\nprocess disruptions\n\n**•** Incongruous collaborations between departments\n\n**•** Invisibility of customer demand for gathering real-time feedback\n\n\nThe growth rate for digital twins is staggering with common adoption reported\nto be in the 25-40% CAGR growth rate.\n\n\n-----\n\n### Digital Twins Bring Broad Benefits to Manufacturing\n\nIndustry 4.0 and subsequent intelligent supply chain efforts have made significant strides in improving operations and building agile supply chains, efforts that\nwould have come at significant costs without digital twin technology.\n\n**Let’s look at the benefits that digital twins deliver to the manufacturing sector:**\n\n\n\n**•** Product design and development is performed with\nless cost and is completed in less time as iterative\nsimulations, 
using multiple constraints, deliver the\nbest or most optimized design. All commercial\naircraft are designed using digital twins.\n\n**•** Digital twins provide the awareness of how long\ninventory will last, when to replenish and how to\nminimize the supply chain disruptions. The oil and gas\nindustry, for example, uses supply chain–oriented\ndigital twins to reduce supply chain bottlenecks in\nstorage and midstream delivery, schedule tanker\noff-loads and model demand with externalities.\n\n\n\n**•** Continuous quality checks on produced items\nwith ML/AI generated feedback pre-emptively\nassuring improved product quality. Final paint\ninspection in the automotive industry, for example,\nis performed with computer vision built on top of\ndigital twin technology.\n\n**•** Striking the sweet spot between when to replace\na part before the process degrades or breaks\ndown and utilizing the components to their fullest,\ndigital twins provide manufacturers with realtime feedback. Digital twins are the backbone of\nbuilding an asset performance management suite.\n\n\n\n**•** Digital twins create the opportunity to have\nmultiple departments in sync by providing\nnecessary instructions modularly to attain\na required throughput. Digital twins are the\nbackbone of kaizen events that optimize\nmanufacturing process flow.\n\n**•** Customer feedback loops can be modeled through\ninputs, from point of sale customer behavior,\nbuying preferences, or product performance and\nthen integrated into the product development\nprocess, forming a closed loop providing an\nimproved product design.\n\n\n-----\n\n**Digital Twins Bring Broad Benefits to Manufacturing (continued)**\n\nThe top four use cases are heavily focused on operational processes and are typically the first to be deployed\nin manufacturing by a majority of companies. Those that have a lower adoption rate are more complex in\ndeployment, but typically offer higher and longer-lasting value.\n\n**[Digital Twin Use Case Deployment](https://blogs.3ds.com/exalead/2019/07/03/digital-twin-use-cases-in-manufacturing-part-5-12/)**\n\n\nImprove product quality\n\nReduce manufacturing costs\n\nReduce unplanned downtime\n\nIncrease throughput\n\nEnsure safe manufacturing\n\nTest new design ideas\n\nDevelop product enhancements\n\nDigital transformation of enterprise\n\nSpeed new product introduction\n\nReduce planned downtime\n\nMeet new regulatory challenges\n\nTraining for new manufacturing processes\n\nDesign changes to production line\n\nProvide service to end users customers\n\nUpdate products in the field\n\n\n**34%**\n\n\n**30%**\n\n**28%**\n**25%**\n\n**24%**\n\n\n**16%**\n\n**14%**\n\n**13%**\n\n**13%**\n\n**11%**\n**10%**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ], + [ + "-----\n\n### Introduction\n\n\nThe concept of digital twins is not new. In fact, it is [reported](https://en.wikipedia.org/wiki/Digital_twin#:~:text=One%20of%20the%20earliest%20examples,Heathrow%20Airport's%20Terminal%201) that the first application was\nover 25 years ago, during the early phases of foundation and cofferdam construction for the\nLondon Heathrow Express facilities, to monitor and predict foundation borehole grouting. 
In\nthe years since this first application, edge computing, AI, data connectivity, 5G connectivity\nand the improvements of the Internet of Things (IoT) have enabled digital twins to become\ncost-effective and are now an imperative in today’s data-driven businesses.\n\nToday’s manufacturing industries are expected to streamline and optimize all the processes in their value\nchain from product development and design, through operations and supply chain optimization to obtaining\nfeedback to reflect and respond to rapidly growing customer demands. The digital twins category is broad\nand is addressing a multitude of challenges within manufacturing, logistics and transportation.\n\n\n[In a case study published](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[in MIT Technology Review,](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[“profit margins increased and](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing time was reduced](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[when digital-twin technology](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[was implemented. Automobile](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[manufacturing profit margins](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[increased by 41% to 54% per](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[model. The estimated average](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[automobile manufacturing](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[time was reduced to](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)\n[approximately 10 hours.”](https://wp.technologyreview.com/wp-content/uploads/2022/01/Digital-twins-improve-real-life-manufacturing_010522.pdf)", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ], + [ + "**eBook**\n\n# Making Your Digital Twin Come to Life\n\n##### With the Lakehouse for Manufacturing and Tredence\n\n\n-----\n\n### Contents\n\nIntroduction ................................................................................................................................................................................................................ **03**\n\nDigital Twins Bring Broad Benefits to Manufacturing .......................................................................................................................... **05**\n\nWhat Are Digital Twins? ........................................................................................................................................................................................ 
**07**\n\nDigital Twin Architectures .................................................................................................................................................................................. **08**\n\nHow to Build a Digital Twin ................................................................................................................................................................................ **09**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/making-your-digital-twin-come-to-life.pdf" + ] + ], + "yes", + "The response states that the implementation of digital-twin technology in automobile manufacturing has led to reported increases in profit margins of 41% to 54% per model, and a reduction in manufacturing time of approximately 10 hours. The retrieved context confirms this by stating, 'Automobile manufacturing profit margins increased by 41% to 54% per model. The estimated average automobile manufacturing time was reduced to approximately 10 hours.' Therefore, the response is fully supported by the retrieved context.", + "yes", + "The ground truth states two points: 1) The increase in profit margins for automobile manufacturing when digital-twin technology was implemented went from 41% to 54%, and 2) The reduction in manufacturing time resulted in an average time of approximately 10 hours due to the implementation of digital-twin technology. The retrieved context supports both points. It mentions that 'Automobile manufacturing profit margins increased by 41% to 54% per model' and 'The estimated average automobile manufacturing time was reduced to approximately 10 hours.' Therefore, both parts of the ground truth are supported by the retrieved context.", + [ + "yes", + "yes", + "yes", + "yes", + "yes" + ], + 1.0 + ], + [ + "4517d3fb5f3f1e83efba44630c78e94b18db1d8f94f780810adb3550c851891b", + [ + [ + [ + "What are the responsibilities of a Data Engineer according to the document?", + "user" + ] + ] + ], + [ + [ + "We distinguish between the following personas:\nData Governance Officer: Responsible for ensuring that data governance, data privacy and other compliance measures are adhered to across the model development and deployment process. Not typically involved in day-to-day operations.\nData Engineer: Responsible for building data pipelines to process, organize and persist data sets for machine learning and other downstream applications.\nData Scientist: Responsible for understanding the business problem, exploring available data to understand if machine learning is applicable, and then training, tuning and evaluating a model to be deployed.\nML Engineer: Responsible for deploying machine learning models to production with appropriate governance, monitoring and software development best practices such as continuous integration and continuous deployment (CI/CD).\nBusiness Stakeholder: Responsible for using the model to make decisions for the business or product, and responsible for the business value that the model is expected to generate.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf" + ] + ], + [ + "A Data Engineer is responsible for building data pipelines.", + "A Data Engineer is responsible for processing data sets.", + "A Data Engineer is responsible for organizing data sets.", + "A Data Engineer is responsible for persisting data sets.", + "The responsibilities support machine learning and other downstream applications." 
+ ], + "{\"info\": {\"request_id\": \"tr-0120829e6be846c588aaf60616a84091\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543861615, \"execution_time_ms\": 8860, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data fo...\"}, \"tags\": {\"eval.requestId\": \"fb8a507b-50e1-4175-ae70-cd532a4da48a\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-0120829e6be846c588aaf60616a84091/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x322d3f08ebd83245\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": null, \"start_time\": 1734543861615056611, \"end_time\": 1734543870475653956, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. 
They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. 
This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. 
\\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. 
Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. 
And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. 
What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address 
errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. 
They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0xd22480b5c323e525\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x322d3f08ebd83245\", \"start_time\": 1734543861626761953, \"end_time\": 1734543861637333882, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xc38dd7930788fc7a\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x322d3f08ebd83245\", \"start_time\": 1734543861637470683, \"end_time\": 1734543861638090491, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0x69c466d43cc8cb45\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc38dd7930788fc7a\", \"start_time\": 1734543861637639085, \"end_time\": 1734543861637835188, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are the responsibilities of a Data Engineer according to the document?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xa78642c809ff4e5c\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc38dd7930788fc7a\", \"start_time\": 1734543861637912989, \"end_time\": 1734543861638056690, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": 
\"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0x4f95a72fa7bfa0d4\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x322d3f08ebd83245\", \"start_time\": 1734543861638154992, \"end_time\": 1734543870475145150, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. 
There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. 
It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. 
Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. 
What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address 
errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. 
Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0xe8a5ad56163ea90b\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x4f95a72fa7bfa0d4\", \"start_time\": 1734543861851646682, \"end_time\": 1734543864504840560, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. 
**Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_55c922a5-e0d1-4855-8c4f-98f0d7977f31\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543863, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 22, \\\"prompt_tokens\\\": 1152, \\\"total_tokens\\\": 1174, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0xc3270e988b4fcff6\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x4f95a72fa7bfa0d4\", \"start_time\": 1734543864509030410, \"end_time\": 1734543864848978788, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", 
\"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"Data Engineer responsibilities\\\", \\\"filters\\\": []}\", \"vector_search_index\": \"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. 
Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003443227, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\"}, \\\"id\\\": \\\"1ce1d861d15136fd48438be91479e567\\\"}, {\\\"page_content\\\": \\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. 
Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0033119193, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"e577e0ac294ad34249c7d000936d7c72\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. 
There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032034456, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... 
**13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0030519078, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\"}, \\\"id\\\": \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\"}, {\\\"page_content\\\": \\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. 
On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029978286, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"9f81ac0b52802c7152247bfd5289b744\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x21a63398a53168a6\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc3270e988b4fcff6\", \"start_time\": 1734543864513960069, \"end_time\": 1734543864847612271, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"Data Engineer responsibilities\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. 
They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"1ce1d861d15136fd48438be91479e567\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\", 0.003443227], [\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. 
Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"e577e0ac294ad34249c7d000936d7c72\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0033119193], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. 
What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.0032034456], [\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\", 0.0030519078], [\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and 
use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"9f81ac0b52802c7152247bfd5289b744\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029978286]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0x3539119435fb9b51\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0xc3270e988b4fcff6\", \"start_time\": 1734543864847785474, \"end_time\": 1734543864848655784, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. 
They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"1ce1d861d15136fd48438be91479e567\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\", 0.003443227], [\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. 
Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"e577e0ac294ad34249c7d000936d7c72\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0033119193], [\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. 
What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"f6ef96d9f374de069754b3f8d671b16d\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\", 0.0032034456], [\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\", 0.0030519078], [\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and 
use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"9f81ac0b52802c7152247bfd5289b744\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\", 0.0029978286]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"engineering in the gaming industry.\\\\n\\\\n`10. \\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\n\\\\nfirst step in your data journey. Imagine how the output of\\\\n\\\\nyour data can be presented in a way to help stakeholders\\\\n\\\\nacross your company achieve more. For example, dropping\\\\n\\\\ndata into an application that can help game designers\\\\n\\\\nmake balancing decisions based on player events.\\\\n\\\\n\\\\n-----\\\\n\\\\n# APPENDIX Ultimate class build guide\\\\n\\\\n\\\\n### Creating a character\\\\n\\\\nThe heart and soul of mature data teams are formed by this\\\\n\\\\ntrio of classes. There are many aspects to these roles, but\\\\n\\\\nthey can be summarized in that Data Engineers create and\\\\n\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\n\\\\nand create reports that keep the business teams running\\\\n\\\\nseamlessly, and Data Scientists are responsible for making\\\\n\\\\nsense of large amounts of data. Depending on the size of\\\\n\\\\nthe organization, individuals may be required to multiclass\\\\n\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\n\\\\noften developers who wear multiple hats, including those in\\\\n\\\\ndata engineering, analytics and data science.\\\\n\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\n\\\\nto report on the health of a title or building a recommendation\\\\n\\\\nengine for your players, this guide will help you better\\\\n\\\\nunderstand the unique classes required to develop and\\\\n\\\\nmaintain an effective data, analytics, and AI platform.\\\\n\\\\n##### Data Engineers\\\\n\\\\n\\\\n**Goals and Priorities of Data Engineers**\\\\n\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\n\\\\nthat both enables timely decision-making and is accurate\\\\n\\\\nand reproducible\\\\n\\\\n- Increase user confidence and trust in data. 
This involves\\\\n\\\\nensuring high consistency and reliability in ETL processes\\\\n\\\\n- Limit the issues and failures experienced by other\\\\n\\\\nengineers and data scientists, allowing those roles to\\\\n\\\\nfocus less on troubleshooting and more on drawing\\\\n\\\\nmeaningful conclusions from data and building new\\\\n\\\\nproducts / features\\\\n\\\\n**What Data Engineers care about:**\\\\n\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\n\\\\nboth enables timely decision-making and is accurate and\\\\n\\\\nreproducible\\\\n\\\\n- Building high-performance, reliable and scalable pipelines\\\\n\\\\nfor data processing\\\\n\\\\n- Delivering data for consumption from a variety of sources\\\\n\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\n\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\n\\\\nacross teams\\\\n\\\\n\\\\nData engineers build systems that collect, manage, and\\\\n\\\\n\\\\nconvert source data into usable information for data\\\\n\\\\nscientists and business analysts to interpret. Their ultimate\\\\n\\\\ngoal is to make data accessible so that teams can use it to\\\\n\\\\nevaluate and optimize a goal or objective.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Data Engineers are responsible for data migration,\\\\n\\\\nmanipulation, and integration of data (joining dissimilar\\\\n\\\\ndata systems)\\\\n\\\\n- Setup and maintenance of ETL pipelines to convert\\\\n\\\\nsource data into actionable data for insights. It is the\\\\n\\\\nresponsibility of the data engineer to make sure these\\\\n\\\\npipelines run efficiently and are well orchestrated.\\\\n\\\\n- The Data Engineer sets up the workflow process\\\\n\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\n\\\\ncontinuously validates it\\\\n\\\\n- Managing workflows to enable data scientists and data\\\\n\\\\nanalysts, and ensuring workflows are well-integrated with\\\\n\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\n\\\\n\\\\n##### Data Scientists\\\\n\\\\nData scientists determine the questions their team should\\\\n\\\\nbe asking and figure out how to answer those questions\\\\n\\\\nusing data. They often develop predictive models for\\\\n\\\\ntheorizing and forecasting.\\\\n\\\\n**Responsibilities:**\\\\n\\\\n- Responsible for making sense of the large amounts of data\\\\n\\\\ncollected for a given game title, such as game telemetry,\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.003443227, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\"}, \\\"id\\\": \\\"1ce1d861d15136fd48438be91479e567\\\"}, {\\\"page_content\\\": \\\"Data teams rely on getting the right data at the right time for analytics, data\\\\nscience and machine learning, but often are faced with challenges meeting\\\\nthe needs of their initiatives for data engineering.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### Why data engineering is hard\\\\n\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\ncomplex data that lives across the organization. 
Most of the complexity\\\\narises with the explosion of data volumes and data types, with organizations\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\n\\\\nWith this volume, managing data pipelines to transform and process data\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\nmost businesses are putting an increased emphasis on multicloud\\\\nenvironments which can be even more difficult to maintain.\\\\n\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\nthat data itself has become a product, and the challenging goal of the data\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\ndata product all the way from ingestion to monetization.\\\\n\\\\n\\\\nDespite current technological advances data engineering remains\\\\ndifficult for several reasons:\\\\n\\\\n**Complex data ingestion methods**\\\\n\\\\nData ingestion means retrieving batch and streaming data from various\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\nand error-prone data ingestion tasks.\\\\n\\\\n**Data engineering principles**\\\\n\\\\nThese days, large operations teams are often just a memory of the past.\\\\nModern data engineering principles are based on agile software development\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\nuse isolated development and production environments, CI/CD, and version\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\nneeds to support these principles.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Third-party tools**\\\\n\\\\nData engineers are often required to run additional third-party tools for\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\nand decreases the reliability of the system.\\\\n\\\\n**Performance tuning**\\\\n\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\narchitecture and constantly observing throughput parameters.\\\\n\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\nworkloads, development languages and governance model.\\\\n\\\\n\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0033119193, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"e577e0ac294ad34249c7d000936d7c72\\\"}, {\\\"page_content\\\": \\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\n\\\\n\\\\n-----\\\\n\\\\n## Contents\\\\n\\\\n#### Data Engineering Drivers 2\\\\n\\\\n Data Pipeline Key Goals 4\\\\n\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\n\\\\n Data Reliability Challenges With Data Lakes 6\\\\n\\\\n Delta Lake: A New Storage Layer 7\\\\n\\\\n Delta Lake: Key Features 8\\\\n\\\\n Getting Started With Delta Lake 10\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n#### Data Engineering Drivers\\\\n\\\\nData engineering professionals are needing to respond to several different drivers.\\\\n\\\\nChief among the drivers they face are:\\\\n\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\n\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\n\\\\norganizations seek to derive far more value from their corporate assets.\\\\n\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\n\\\\ncompanies, these advanced approaches are being adopted across a\\\\n\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\n\\\\nprivate as well as public sector organizations. This is further driving the need\\\\n\\\\nfor strong data engineering practices.\\\\n\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\n\\\\nthere is increased interest in how the data is protected and managed.\\\\n\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\n\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\n\\\\ndata must be managed.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Drivers\\\\n\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\n\\\\nthat is now well underway is being propelled further by innovations such as\\\\n\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\n\\\\nand machine learning. All these offer data professionals new approaches for\\\\n\\\\ntheir data initiatives.\\\\n\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\n\\\\nalso subject to increasing scrutiny. 
There is also a greater understanding of\\\\n\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\n\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\n\\\\nmeeting ROI hurdles.\\\\n\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\n\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\n\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\n\\\\nThey must balance the needs of governance, security and democratization.\\\\n\\\\n\\\\n-----\\\\n\\\\n## Key Goals\\\\n\\\\n#### Data Pipeline Key Goals\\\\n\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\n\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\n\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\n\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\n\\\\ndesign and build their data pipelines.\\\\n\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\n\\\\nanalytics professionals in their organizations are:\\\\n\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\n\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\n\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\n\\\\nusers. Equally well, many applications require up-to-date information (who\\\\n\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\n\\\\nlimited value without it.\\\\n\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\n\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\n\\\\ndemanding when the queries are based on very large data sets.\\\\n\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\n\\\\nperformance in a limited, development or test environment. What matters\\\\n\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\n\\\\nrequiring high operational overhead.\\\\n\\\\n\\\\n-----\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0032034456, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\"}, \\\"id\\\": \\\"f6ef96d9f374de069754b3f8d671b16d\\\"}, {\\\"page_content\\\": \\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\n\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\n\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\n\\\\nPartner Connect ............................................................................................................................................................................................... 
**13**\\\\n\\\\n\\\\n-----\\\\n\\\\n### Introduction\\\\n\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\n\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\ncritical first step in the data engineering and management lifecycle.\\\\n\\\\n\\\\n-----\\\\n\\\\n### Life of a Data Engineer\\\\n\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\n\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\n\\\\n\\\\na variety of data types. For example:\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0030519078, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\"}, \\\"id\\\": \\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\"}, {\\\"page_content\\\": \\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\nto drive valuable insights.\\\\n\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\nworld-class governance.\\\\n\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\nthat automates the complexity of building and maintaining pipelines and\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\nand reliability to drive valuable insights.\\\\n\\\\n\\\\n#### Databricks makes modern data engineering simple\\\\n\\\\nThere is no industry-wide definition of modern data engineering.\\\\nThis should come close:\\\\n\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\n_kinds of workflows._\\\\n\\\\n\\\\n-----\\\\n\\\\n-----\\\\n\\\\n#### Benefits of data engineering on the lakehouse\\\\n\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\n\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\nfor analytics, data science or machine learning.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\nhealth for performance, quality, status and latency.\\\\n\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\ndata pipeline deployments into production or roll back pipelines and\\\\nminimize downtime.\\\\n\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\nof data processing tasks for data and machine learning pipelines with the\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\n(DAG) on a Databricks compute cluster.\\\\n\\\\n\\\\n\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\ntime and effort and focus on implementing business logic and data\\\\nquality checks within the data pipeline using SQL or Python.\\\\n\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\nlakehouse so data teams can confidently trust the information for\\\\ndownstream initiatives with the ability to define data quality and\\\\nautomatically address errors.\\\\n\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\nwith cost controls without having to know complex stream processing\\\\nand implement recovery logic.\\\\n\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\nfor most common error conditions that can occur during the operation of\\\\na pipeline with fast, scalable fault-tolerance.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Data engineering is all about data quality**\\\\n\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\nthree different levels.\\\\n\\\\n\\\\n1. 
On a **technical level** , data quality is\\\\nguaranteed by enforcing and evolving\\\\nschemas for data storage and ingestion.\\\\n\\\\n**Kenesis**\\\\n\\\\n**CSV,**\\\\n**JSON, TXT...**\\\\n\\\\n**Data Lake**\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029978286, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\"}, \\\"id\\\": \\\"9f81ac0b52802c7152247bfd5289b744\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0xe94822ce94a683c1\", \"trace_id\": \"0xacb6608a1ec822ab2b6bed2bd6830a28\"}, \"parent_id\": \"0x4f95a72fa7bfa0d4\", \"start_time\": 1734543864862687552, \"end_time\": 1734543870466981749, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-0120829e6be846c588aaf60616a84091\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. 
This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. 
On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_c53ebd3b-0432-428d-af54-e70dd4e12eec\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. 
They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543865, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 188, \\\"prompt_tokens\\\": 5334, \\\"total_tokens\\\": 5522, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the responsibilities of...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the responsibilities of a Data Engineer according to the document?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"Data Engineer responsibilities\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. 
There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. 
It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. 
Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. 
This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. 
What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. 
The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address 
errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_e09bc078-fe39-426a-9a33-e486e8d6d050\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"engineering in the gaming industry.\\\\\\\\n\\\\\\\\n`10. \\\\\\\\u0007` **Go beyond dashboards.** Looking at dashboards is only the\\\\\\\\n\\\\\\\\nfirst step in your data journey. Imagine how the output of\\\\\\\\n\\\\\\\\nyour data can be presented in a way to help stakeholders\\\\\\\\n\\\\\\\\nacross your company achieve more. For example, dropping\\\\\\\\n\\\\\\\\ndata into an application that can help game designers\\\\\\\\n\\\\\\\\nmake balancing decisions based on player events.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n# APPENDIX Ultimate class build guide\\\\\\\\n\\\\\\\\n\\\\\\\\n### Creating a character\\\\\\\\n\\\\\\\\nThe heart and soul of mature data teams are formed by this\\\\\\\\n\\\\\\\\ntrio of classes. There are many aspects to these roles, but\\\\\\\\n\\\\\\\\nthey can be summarized in that Data Engineers create and\\\\\\\\n\\\\\\\\nmaintain critical data workflows, Data Analysts interpret data\\\\\\\\n\\\\\\\\nand create reports that keep the business teams running\\\\\\\\n\\\\\\\\nseamlessly, and Data Scientists are responsible for making\\\\\\\\n\\\\\\\\nsense of large amounts of data. Depending on the size of\\\\\\\\n\\\\\\\\nthe organization, individuals may be required to multiclass\\\\\\\\n\\\\\\\\nin order to address needs of the team. 
In smaller studios, it\\u2019s\\\\\\\\n\\\\\\\\noften developers who wear multiple hats, including those in\\\\\\\\n\\\\\\\\ndata engineering, analytics and data science.\\\\\\\\n\\\\\\\\nWhether you\\u2019re looking to stand-up an analytics dashboard\\\\\\\\n\\\\\\\\nto report on the health of a title or building a recommendation\\\\\\\\n\\\\\\\\nengine for your players, this guide will help you better\\\\\\\\n\\\\\\\\nunderstand the unique classes required to develop and\\\\\\\\n\\\\\\\\nmaintain an effective data, analytics, and AI platform.\\\\\\\\n\\\\\\\\n##### Data Engineers\\\\\\\\n\\\\\\\\n\\\\\\\\n**Goals and Priorities of Data Engineers**\\\\\\\\n\\\\\\\\n- Enable access to usable data for real-time insights \\u2014 data\\\\\\\\n\\\\\\\\nthat both enables timely decision-making and is accurate\\\\\\\\n\\\\\\\\nand reproducible\\\\\\\\n\\\\\\\\n- Increase user confidence and trust in data. This involves\\\\\\\\n\\\\\\\\nensuring high consistency and reliability in ETL processes\\\\\\\\n\\\\\\\\n- Limit the issues and failures experienced by other\\\\\\\\n\\\\\\\\nengineers and data scientists, allowing those roles to\\\\\\\\n\\\\\\\\nfocus less on troubleshooting and more on drawing\\\\\\\\n\\\\\\\\nmeaningful conclusions from data and building new\\\\\\\\n\\\\\\\\nproducts / features\\\\\\\\n\\\\\\\\n**What Data Engineers care about:**\\\\\\\\n\\\\\\\\n- Enabling access to data for real-time insights \\u2014 data that\\\\\\\\n\\\\\\\\nboth enables timely decision-making and is accurate and\\\\\\\\n\\\\\\\\nreproducible\\\\\\\\n\\\\\\\\n- Building high-performance, reliable and scalable pipelines\\\\\\\\n\\\\\\\\nfor data processing\\\\\\\\n\\\\\\\\n- Delivering data for consumption from a variety of sources\\\\\\\\n\\\\\\\\nby Data Analysts and Data Scientists against tight SLAs\\\\\\\\n\\\\\\\\n- A Data Engineer\\u2019s biggest challenge? Collaboration\\\\\\\\n\\\\\\\\nacross teams\\\\\\\\n\\\\\\\\n\\\\\\\\nData engineers build systems that collect, manage, and\\\\\\\\n\\\\\\\\n\\\\\\\\nconvert source data into usable information for data\\\\\\\\n\\\\\\\\nscientists and business analysts to interpret. Their ultimate\\\\\\\\n\\\\\\\\ngoal is to make data accessible so that teams can use it to\\\\\\\\n\\\\\\\\nevaluate and optimize a goal or objective.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Data Engineers are responsible for data migration,\\\\\\\\n\\\\\\\\nmanipulation, and integration of data (joining dissimilar\\\\\\\\n\\\\\\\\ndata systems)\\\\\\\\n\\\\\\\\n- Setup and maintenance of ETL pipelines to convert\\\\\\\\n\\\\\\\\nsource data into actionable data for insights. It is the\\\\\\\\n\\\\\\\\nresponsibility of the data engineer to make sure these\\\\\\\\n\\\\\\\\npipelines run efficiently and are well orchestrated.\\\\\\\\n\\\\\\\\n- The Data Engineer sets up the workflow process\\\\\\\\n\\\\\\\\nto orchestrate pipelines for the studio\\u2019s data and\\\\\\\\n\\\\\\\\ncontinuously validates it\\\\\\\\n\\\\\\\\n- Managing workflows to enable data scientists and data\\\\\\\\n\\\\\\\\nanalysts, and ensuring workflows are well-integrated with\\\\\\\\n\\\\\\\\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\\\\\\\\n\\\\\\\\n\\\\\\\\n##### Data Scientists\\\\\\\\n\\\\\\\\nData scientists determine the questions their team should\\\\\\\\n\\\\\\\\nbe asking and figure out how to answer those questions\\\\\\\\n\\\\\\\\nusing data. 
They often develop predictive models for\\\\\\\\n\\\\\\\\ntheorizing and forecasting.\\\\\\\\n\\\\\\\\n**Responsibilities:**\\\\\\\\n\\\\\\\\n- Responsible for making sense of the large amounts of data\\\\\\\\n\\\\\\\\ncollected for a given game title, such as game telemetry,\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.003443227, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"1ce1d861d15136fd48438be91479e567\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"Data teams rely on getting the right data at the right time for analytics, data\\\\\\\\nscience and machine learning, but often are faced with challenges meeting\\\\\\\\nthe needs of their initiatives for data engineering.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Why data engineering is hard\\\\\\\\n\\\\\\\\nOne of the biggest challenges is accessing and managing the increasingly\\\\\\\\ncomplex data that lives across the organization. Most of the complexity\\\\\\\\narises with the explosion of data volumes and data types, with organizations\\\\\\\\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\\\\\\\\n\\\\\\\\nWith this volume, managing data pipelines to transform and process data\\\\\\\\nis slow and difficult, and increasingly expensive. And to top off the complexity,\\\\\\\\nmost businesses are putting an increased emphasis on multicloud\\\\\\\\nenvironments which can be even more difficult to maintain.\\\\\\\\n\\\\\\\\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\\\\\\\\nthat data itself has become a product, and the challenging goal of the data\\\\\\\\nengineer is to build and run the machinery that creates this high-fidelity\\\\\\\\ndata product all the way from ingestion to monetization.\\\\\\\\n\\\\\\\\n\\\\\\\\nDespite current technological advances data engineering remains\\\\\\\\ndifficult for several reasons:\\\\\\\\n\\\\\\\\n**Complex data ingestion methods**\\\\\\\\n\\\\\\\\nData ingestion means retrieving batch and streaming data from various\\\\\\\\nsources and in various formats. Ingesting data is hard and complex since you\\\\\\\\neither need to use an always-running streaming platform like Apache Kafka\\\\\\\\nor you need to be able to keep track of which files haven\\u2019t been ingested yet.\\\\\\\\nData engineers are required to spend a lot of time hand-coding repetitive\\\\\\\\nand error-prone data ingestion tasks.\\\\\\\\n\\\\\\\\n**Data engineering principles**\\\\\\\\n\\\\\\\\nThese days, large operations teams are often just a memory of the past.\\\\\\\\nModern data engineering principles are based on agile software development\\\\\\\\nmethodologies. They apply the well-known \\u201cyou build it, you run it\\u201d paradigm,\\\\\\\\nuse isolated development and production environments, CI/CD, and version\\\\\\\\ncontrol transformations that are pushed to production after validation. 
Tooling\\\\\\\\nneeds to support these principles.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Third-party tools**\\\\\\\\n\\\\\\\\nData engineers are often required to run additional third-party tools for\\\\\\\\norchestration to automate tasks such as ELT/ETL or customer code in\\\\\\\\nnotebooks. Running third-party tools increases the operational overhead\\\\\\\\nand decreases the reliability of the system.\\\\\\\\n\\\\\\\\n**Performance tuning**\\\\\\\\n\\\\\\\\nFinally, with all pipelines and workflows written, data engineers need to\\\\\\\\nconstantly focus on performance, tuning pipelines and architectures to meet\\\\\\\\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\\\\\\\\narchitecture and constantly observing throughput parameters.\\\\\\\\n\\\\\\\\nMost organizations are dealing with a complex landscape of data warehouses\\\\\\\\nand data lakes these days. Each of those platforms has its own limitations,\\\\\\\\nworkloads, development languages and governance model.\\\\\\\\n\\\\\\\\n\\\\\\\\nWith the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0033119193, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e577e0ac294ad34249c7d000936d7c72\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"# Building Reliable Data Lakes at Scale With Delta Lake\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Contents\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers 2\\\\\\\\n\\\\\\\\n Data Pipeline Key Goals 4\\\\\\\\n\\\\\\\\n Apache Spark\\u2122: The First Unified Analytics Engine 5\\\\\\\\n\\\\\\\\n Data Reliability Challenges With Data Lakes 6\\\\\\\\n\\\\\\\\n Delta Lake: A New Storage Layer 7\\\\\\\\n\\\\\\\\n Delta Lake: Key Features 8\\\\\\\\n\\\\\\\\n Getting Started With Delta Lake 10\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n#### Data Engineering Drivers\\\\\\\\n\\\\\\\\nData engineering professionals are needing to respond to several different drivers.\\\\\\\\n\\\\\\\\nChief among the drivers they face are:\\\\\\\\n\\\\\\\\n**Rise of Advanced Analytics** \\u2014 Advanced 
analytics, including methods\\\\\\\\n\\\\\\\\nbased on machine learning techniques, have evolved to such a degree that\\\\\\\\n\\\\\\\\norganizations seek to derive far more value from their corporate assets.\\\\\\\\n\\\\\\\\n**Widespread Adoption** \\u2014 Once the province of leading edge, high-tech\\\\\\\\n\\\\\\\\ncompanies, these advanced approaches are being adopted across a\\\\\\\\n\\\\\\\\nmultitude of industries from retail to hospitality to healthcare and across\\\\\\\\n\\\\\\\\nprivate as well as public sector organizations. This is further driving the need\\\\\\\\n\\\\\\\\nfor strong data engineering practices.\\\\\\\\n\\\\\\\\n**Regulation** \\u2014 With the growth of data generation and data collection,\\\\\\\\n\\\\\\\\nthere is increased interest in how the data is protected and managed.\\\\\\\\n\\\\\\\\nRegulatory regimes such as GDPR (General Data Protection Regulation)\\\\\\\\n\\\\\\\\nfrom the EU and other jurisdictions mandate very specific ways in which\\\\\\\\n\\\\\\\\ndata must be managed.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Drivers\\\\\\\\n\\\\\\\\n**Technology Innovation** \\u2014 The move to cloud-based analytics architectures\\\\\\\\n\\\\\\\\nthat is now well underway is being propelled further by innovations such as\\\\\\\\n\\\\\\\\nanalytics-focused chipsets, pipeline automation and the unification of data\\\\\\\\n\\\\\\\\nand machine learning. All these offer data professionals new approaches for\\\\\\\\n\\\\\\\\ntheir data initiatives.\\\\\\\\n\\\\\\\\n**Financial Scrutiny** \\u2014 With a growth in investment, analytics initiatives are\\\\\\\\n\\\\\\\\nalso subject to increasing scrutiny. There is also a greater understanding of\\\\\\\\n\\\\\\\\ndata as a valuable asset. Deriving value from data must be done in a manner\\\\\\\\n\\\\\\\\nthat is financially responsible and actually value adding to the enterprise and\\\\\\\\n\\\\\\\\nmeeting ROI hurdles.\\\\\\\\n\\\\\\\\n**Role Evolution** \\u2014 Reflecting the importance of managing the data and\\\\\\\\n\\\\\\\\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\\\\\\\\n\\\\\\\\nmore prominent and newer roles such as Data Curator are emerging.\\\\\\\\n\\\\\\\\nThey must balance the needs of governance, security and democratization.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Key Goals\\\\\\\\n\\\\\\\\n#### Data Pipeline Key Goals\\\\\\\\n\\\\\\\\nMaking quality data available in a reliable manner is a major determinant of success for data\\\\\\\\n\\\\\\\\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\\\\\\\\n\\\\\\\\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\\\\\\\\n\\\\\\\\nresponsibility need to take account of a broad set of dependencies and requirements as they\\\\\\\\n\\\\\\\\ndesign and build their data pipelines.\\\\\\\\n\\\\\\\\nThree primary goals that data engineers typically seek to address as they work to enable the\\\\\\\\n\\\\\\\\nanalytics professionals in their organizations are:\\\\\\\\n\\\\\\\\n**Deliver quality data in less time** \\u2014 When it comes to data, quality and timeliness\\\\\\\\n\\\\\\\\nare key. Data with gaps or errors (which can arise for many reasons) is\\\\\\\\n\\\\\\\\n\\u201cunreliable,\\u201d can lead to wrong conclusions, and is of limited value to downstream\\\\\\\\n\\\\\\\\nusers. 
Equally well, many applications require up-to-date information (who\\\\\\\\n\\\\\\\\nwants to use last night\\u2019s closing stock price or weather forecast) and are of\\\\\\\\n\\\\\\\\nlimited value without it.\\\\\\\\n\\\\\\\\n**Enable faster queries** \\u2014 Wanting fast responses to queries is natural enough\\\\\\\\n\\\\\\\\nin today\\u2019s \\u201cNew York minute,\\u201d online world. Achieving this is particularly\\\\\\\\n\\\\\\\\ndemanding when the queries are based on very large data sets.\\\\\\\\n\\\\\\\\n**Simplify data engineering at scale** \\u2014 It is one thing to have high reliability and\\\\\\\\n\\\\\\\\nperformance in a limited, development or test environment. What matters\\\\\\\\n\\\\\\\\nmore is the ability to have robust, production data pipelines at scale without\\\\\\\\n\\\\\\\\nrequiring high operational overhead.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0032034456, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f6ef96d9f374de069754b3f8d671b16d\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"COPY INTO ......................................................................................................................................................................................................... **06**\\\\\\\\n\\\\\\\\nAuto Loader ....................................................................................................................................................................................................... **09**\\\\\\\\n\\\\\\\\nIngesting Data From External Applications .......................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\nPartner Connect ............................................................................................................................................................................................... **13**\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Introduction\\\\\\\\n\\\\\\\\nOrganizations today are inundated with data siloed across various on-premises\\\\\\\\napplication systems, databases, data warehouses and SaaS applications. This\\\\\\\\nfragmentation makes it difficult to support new use cases for analytics or machine\\\\\\\\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\\\\\\\\narchitecture built on top of Delta Lake, an open format storage layer.\\\\\\\\n\\\\\\\\nThe first thing data engineers need to do to support the lakehouse architecture is to\\\\\\\\nefficiently move data from various systems into their lakehouse. Ingesting data is a\\\\\\\\ncritical first step in the data engineering and management lifecycle.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n### Life of a Data Engineer\\\\\\\\n\\\\\\\\nThe primary focus of data engineers is to provide timely and reliable data to downstream\\\\\\\\n\\\\\\\\ndata teams at an organization. Requests for data can come from a variety of teams, and for\\\\\\\\n\\\\\\\\n\\\\\\\\na variety of data types. 
For example:\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0030519078, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"e49a7d2e3bd1f6a60e1306c0186dcdd5\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"With the Databricks Lakehouse Platform, data engineers have access to an\\\\\\\\nend-to-end data engineering solution for ingesting, transforming, processing,\\\\\\\\nscheduling and delivering data. The lakehouse platform automates the\\\\\\\\ncomplexity of building and maintaining pipelines and running ETL workloads\\\\\\\\ndirectly on a data lake so data engineers can focus on quality and reliability\\\\\\\\nto drive valuable insights.\\\\\\\\n\\\\\\\\nData engineering in the lakehouse allows data teams to unify batch and\\\\\\\\nstreaming operations on a simplified architecture, streamline data pipeline\\\\\\\\ndevelopment and testing, build reliable data, analytics and AI workflows\\\\\\\\non any cloud platform, and meet regulatory requirements to maintain\\\\\\\\nworld-class governance.\\\\\\\\n\\\\\\\\nThe lakehouse provides an end-to-end data engineering and ETL platform\\\\\\\\nthat automates the complexity of building and maintaining pipelines and\\\\\\\\nrunning ETL workloads so data engineers and analysts can focus on quality\\\\\\\\nand reliability to drive valuable insights.\\\\\\\\n\\\\\\\\n\\\\\\\\n#### Databricks makes modern data engineering simple\\\\\\\\n\\\\\\\\nThere is no industry-wide definition of modern data engineering.\\\\\\\\nThis should come close:\\\\\\\\n\\\\\\\\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\\\\\\\\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\\\\\\\\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\\\\\\\\n_kinds of workflows._\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### Benefits of data engineering on the lakehouse\\\\\\\\n\\\\\\\\nBy simplifying and modernizing with the lakehouse architecture, data engineers\\\\\\\\ngain an enterprise-grade and enterprise-ready approach to building data\\\\\\\\npipelines. 
The following are eight key differentiating capabilities that a data\\\\\\\\nengineering solution team can enable with the Databricks Lakehouse Platform:\\\\\\\\n\\\\\\\\n**\\u2022** **Easy data ingestion:** With the ability to ingest petabytes of data, data\\\\\\\\nengineers can enable fast, reliable, scalable and automatic data ingestion\\\\\\\\nfor analytics, data science or machine learning.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Data pipeline observability:** Monitor overall data pipeline estate status\\\\\\\\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\\\\\\\\nhealth for performance, quality, status and latency.\\\\\\\\n\\\\\\\\n**\\u2022** **Simplified operations:** Ensure reliable and predictable delivery of data for\\\\\\\\nanalytics and machine learning use cases by enabling easy and automatic\\\\\\\\ndata pipeline deployments into production or roll back pipelines and\\\\\\\\nminimize downtime.\\\\\\\\n\\\\\\\\n**\\u2022** **Scheduling and orchestration:** Simple, clear and reliable orchestration\\\\\\\\nof data processing tasks for data and machine learning pipelines with the\\\\\\\\nability to run multiple non-interactive tasks as a directed acyclic graph\\\\\\\\n(DAG) on a Databricks compute cluster.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\\\\\\\n**\\u2022** **Automated ETL pipelines:** Data engineers can reduce development\\\\\\\\ntime and effort and focus on implementing business logic and data\\\\\\\\nquality checks within the data pipeline using SQL or Python.\\\\\\\\n\\\\\\\\n**\\u2022** **Data quality checks:** Improve data reliability throughout the data\\\\\\\\nlakehouse so data teams can confidently trust the information for\\\\\\\\ndownstream initiatives with the ability to define data quality and\\\\\\\\nautomatically address errors.\\\\\\\\n\\\\\\\\n**\\u2022** **Batch and streaming:** Allow data engineers to set tunable data latency\\\\\\\\nwith cost controls without having to know complex stream processing\\\\\\\\nand implement recovery logic.\\\\\\\\n\\\\\\\\n**\\u2022** **Automatic recovery:** Handle transient errors and use automatic recovery\\\\\\\\nfor most common error conditions that can occur during the operation of\\\\\\\\na pipeline with fast, scalable fault-tolerance.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Data engineering is all about data quality**\\\\\\\\n\\\\\\\\nThe goal of modern data engineering is to distill data with a quality that is fit for\\\\\\\\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\\\\\\\\nthree different levels.\\\\\\\\n\\\\\\\\n\\\\\\\\n1. On a **technical level** , data quality is\\\\\\\\nguaranteed by enforcing and evolving\\\\\\\\nschemas for data storage and ingestion.\\\\\\\\n\\\\\\\\n**Kenesis**\\\\\\\\n\\\\\\\\n**CSV,**\\\\\\\\n**JSON, TXT...**\\\\\\\\n\\\\\\\\n**Data Lake**\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029978286, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"9f81ac0b52802c7152247bfd5289b744\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The responsibilities of a Data Engineer include:\\\\n\\\\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\\\\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\\\\n3. 
Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\\\\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\\\\n\\\\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/the-big-book-of-mlops-v10-072023.pdf", + "The responsibilities of a Data Engineer include:\n\n1. Data migration, manipulation, and integration of data (joining dissimilar data systems)\n2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights\n3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it\n4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.)\n\nData Engineers are also responsible for enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists. They build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret, with the ultimate goal of making data accessible so that teams can use it to evaluate and optimize a goal or objective.", + "no", + "[correctness] The root cause of failure is traced to the negative rating of correctness. See the correctness rationale for more details. **Suggested Actions**: Consider improving the prompt template to encourage direct, specific responses, re-ranking retrievals to provide more relevant chunks to the LLM earlier in the prompt, or using a more capable LLM.", + "yes", + "No harmful content detected in response", + "no", + "The expected response states that a Data Engineer is responsible for building data pipelines, processing data sets, organizing data sets, persisting data sets, and supporting machine learning and other downstream applications. The response mentions responsibilities such as setting up and maintaining ETL pipelines, managing workflows, enabling access to usable data, and building systems that convert source data into usable information. However, the response does not explicitly mention 'processing data sets,' 'organizing data sets,' or 'persisting data sets.' Additionally, while it mentions enabling data scientists and analysts, it does not explicitly state that the responsibilities support machine learning and other downstream applications. Therefore, the response is not correct.", + 8.86, + 6696.0, + 6486.0, + 210.0, + null, + null, + [ + [ + "engineering in the gaming industry.\n\n`10. \u0007` **Go beyond dashboards.** Looking at dashboards is only the\n\nfirst step in your data journey. 
Imagine how the output of\n\nyour data can be presented in a way to help stakeholders\n\nacross your company achieve more. For example, dropping\n\ndata into an application that can help game designers\n\nmake balancing decisions based on player events.\n\n\n-----\n\n# APPENDIX Ultimate class build guide\n\n\n### Creating a character\n\nThe heart and soul of mature data teams are formed by this\n\ntrio of classes. There are many aspects to these roles, but\n\nthey can be summarized in that Data Engineers create and\n\nmaintain critical data workflows, Data Analysts interpret data\n\nand create reports that keep the business teams running\n\nseamlessly, and Data Scientists are responsible for making\n\nsense of large amounts of data. Depending on the size of\n\nthe organization, individuals may be required to multiclass\n\nin order to address needs of the team. In smaller studios, it’s\n\noften developers who wear multiple hats, including those in\n\ndata engineering, analytics and data science.\n\nWhether you’re looking to stand-up an analytics dashboard\n\nto report on the health of a title or building a recommendation\n\nengine for your players, this guide will help you better\n\nunderstand the unique classes required to develop and\n\nmaintain an effective data, analytics, and AI platform.\n\n##### Data Engineers\n\n\n**Goals and Priorities of Data Engineers**\n\n- Enable access to usable data for real-time insights — data\n\nthat both enables timely decision-making and is accurate\n\nand reproducible\n\n- Increase user confidence and trust in data. This involves\n\nensuring high consistency and reliability in ETL processes\n\n- Limit the issues and failures experienced by other\n\nengineers and data scientists, allowing those roles to\n\nfocus less on troubleshooting and more on drawing\n\nmeaningful conclusions from data and building new\n\nproducts / features\n\n**What Data Engineers care about:**\n\n- Enabling access to data for real-time insights — data that\n\nboth enables timely decision-making and is accurate and\n\nreproducible\n\n- Building high-performance, reliable and scalable pipelines\n\nfor data processing\n\n- Delivering data for consumption from a variety of sources\n\nby Data Analysts and Data Scientists against tight SLAs\n\n- A Data Engineer’s biggest challenge? Collaboration\n\nacross teams\n\n\nData engineers build systems that collect, manage, and\n\n\nconvert source data into usable information for data\n\nscientists and business analysts to interpret. Their ultimate\n\ngoal is to make data accessible so that teams can use it to\n\nevaluate and optimize a goal or objective.\n\n**Responsibilities:**\n\n- Data Engineers are responsible for data migration,\n\nmanipulation, and integration of data (joining dissimilar\n\ndata systems)\n\n- Setup and maintenance of ETL pipelines to convert\n\nsource data into actionable data for insights. It is the\n\nresponsibility of the data engineer to make sure these\n\npipelines run efficiently and are well orchestrated.\n\n- The Data Engineer sets up the workflow process\n\nto orchestrate pipelines for the studio’s data and\n\ncontinuously validates it\n\n- Managing workflows to enable data scientists and data\n\nanalysts, and ensuring workflows are well-integrated with\n\ndifferent parts of the studio (e.g., marketing, test/QA, etc)\n\n\n##### Data Scientists\n\nData scientists determine the questions their team should\n\nbe asking and figure out how to answer those questions\n\nusing data. 
They often develop predictive models for\n\ntheorizing and forecasting.\n\n**Responsibilities:**\n\n- Responsible for making sense of the large amounts of data\n\ncollected for a given game title, such as game telemetry,", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/databricks_ultimate_gaming_data_guide_2023.pdf" + ], + [ + "Data teams rely on getting the right data at the right time for analytics, data\nscience and machine learning, but often are faced with challenges meeting\nthe needs of their initiatives for data engineering.\n\n\n-----\n\n#### Why data engineering is hard\n\nOne of the biggest challenges is accessing and managing the increasingly\ncomplex data that lives across the organization. Most of the complexity\narises with the explosion of data volumes and data types, with organizations\namassing an estimated [80% of data that is unstructured and semi-structured.](https://www.forbes.com/sites/forbestechcouncil/2019/01/29/the-80-blind-spot-are-you-ignoring-unstructured-organizational-data/?sh=681651dc211c)\n\nWith this volume, managing data pipelines to transform and process data\nis slow and difficult, and increasingly expensive. And to top off the complexity,\nmost businesses are putting an increased emphasis on multicloud\nenvironments which can be even more difficult to maintain.\n\n[Zhamak Dehghani](https://databricks.com/speaker/zhamak-dehghani) , a principal technology consultant at Thoughtworks, wrote\nthat data itself has become a product, and the challenging goal of the data\nengineer is to build and run the machinery that creates this high-fidelity\ndata product all the way from ingestion to monetization.\n\n\nDespite current technological advances data engineering remains\ndifficult for several reasons:\n\n**Complex data ingestion methods**\n\nData ingestion means retrieving batch and streaming data from various\nsources and in various formats. Ingesting data is hard and complex since you\neither need to use an always-running streaming platform like Apache Kafka\nor you need to be able to keep track of which files haven’t been ingested yet.\nData engineers are required to spend a lot of time hand-coding repetitive\nand error-prone data ingestion tasks.\n\n**Data engineering principles**\n\nThese days, large operations teams are often just a memory of the past.\nModern data engineering principles are based on agile software development\nmethodologies. They apply the well-known “you build it, you run it” paradigm,\nuse isolated development and production environments, CI/CD, and version\ncontrol transformations that are pushed to production after validation. Tooling\nneeds to support these principles.\n\n\n-----\n\n**Third-party tools**\n\nData engineers are often required to run additional third-party tools for\norchestration to automate tasks such as ELT/ETL or customer code in\nnotebooks. Running third-party tools increases the operational overhead\nand decreases the reliability of the system.\n\n**Performance tuning**\n\nFinally, with all pipelines and workflows written, data engineers need to\nconstantly focus on performance, tuning pipelines and architectures to meet\nSLAs. Tuning such architectures requires in-depth knowledge of the underlying\narchitecture and constantly observing throughput parameters.\n\nMost organizations are dealing with a complex landscape of data warehouses\nand data lakes these days. 
Each of those platforms has its own limitations,\nworkloads, development languages and governance model.\n\n\nWith the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The lakehouse platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability\nto drive valuable insights.\n\nData engineering in the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows\non any cloud platform, and meet regulatory requirements to maintain\nworld-class governance.\n\nThe lakehouse provides an end-to-end data engineering and ETL platform\nthat automates the complexity of building and maintaining pipelines and\nrunning ETL workloads so data engineers and analysts can focus on quality\nand reliability to drive valuable insights.\n\n\n#### Databricks makes modern data engineering simple\n\nThere is no industry-wide definition of modern data engineering.\nThis should come close:", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf" + ], + [ + "# Building Reliable Data Lakes at Scale With Delta Lake\n\n\n-----\n\n## Contents\n\n#### Data Engineering Drivers 2\n\n Data Pipeline Key Goals 4\n\n Apache Spark™: The First Unified Analytics Engine 5\n\n Data Reliability Challenges With Data Lakes 6\n\n Delta Lake: A New Storage Layer 7\n\n Delta Lake: Key Features 8\n\n Getting Started With Delta Lake 10\n\n\n-----\n\n## Drivers\n\n#### Data Engineering Drivers\n\nData engineering professionals are needing to respond to several different drivers.\n\nChief among the drivers they face are:\n\n**Rise of Advanced Analytics** — Advanced analytics, including methods\n\nbased on machine learning techniques, have evolved to such a degree that\n\norganizations seek to derive far more value from their corporate assets.\n\n**Widespread Adoption** — Once the province of leading edge, high-tech\n\ncompanies, these advanced approaches are being adopted across a\n\nmultitude of industries from retail to hospitality to healthcare and across\n\nprivate as well as public sector organizations. This is further driving the need\n\nfor strong data engineering practices.\n\n**Regulation** — With the growth of data generation and data collection,\n\nthere is increased interest in how the data is protected and managed.\n\nRegulatory regimes such as GDPR (General Data Protection Regulation)\n\nfrom the EU and other jurisdictions mandate very specific ways in which\n\ndata must be managed.\n\n\n-----\n\n## Drivers\n\n**Technology Innovation** — The move to cloud-based analytics architectures\n\nthat is now well underway is being propelled further by innovations such as\n\nanalytics-focused chipsets, pipeline automation and the unification of data\n\nand machine learning. All these offer data professionals new approaches for\n\ntheir data initiatives.\n\n**Financial Scrutiny** — With a growth in investment, analytics initiatives are\n\nalso subject to increasing scrutiny. There is also a greater understanding of\n\ndata as a valuable asset. 
Deriving value from data must be done in a manner\n\nthat is financially responsible and actually value adding to the enterprise and\n\nmeeting ROI hurdles.\n\n**Role Evolution** — Reflecting the importance of managing the data and\n\nmaximizing value extraction, the Chief Data Officer (CDO) role is becoming\n\nmore prominent and newer roles such as Data Curator are emerging.\n\nThey must balance the needs of governance, security and democratization.\n\n\n-----\n\n## Key Goals\n\n#### Data Pipeline Key Goals\n\nMaking quality data available in a reliable manner is a major determinant of success for data\n\nanalytics initiatives be they regular dashboards or reports, or advanced analytics projects\n\ndrawing on state-of-the-art machine learning techniques. Data engineers tasked with this\n\nresponsibility need to take account of a broad set of dependencies and requirements as they\n\ndesign and build their data pipelines.\n\nThree primary goals that data engineers typically seek to address as they work to enable the\n\nanalytics professionals in their organizations are:\n\n**Deliver quality data in less time** — When it comes to data, quality and timeliness\n\nare key. Data with gaps or errors (which can arise for many reasons) is\n\n“unreliable,” can lead to wrong conclusions, and is of limited value to downstream\n\nusers. Equally well, many applications require up-to-date information (who\n\nwants to use last night’s closing stock price or weather forecast) and are of\n\nlimited value without it.\n\n**Enable faster queries** — Wanting fast responses to queries is natural enough\n\nin today’s “New York minute,” online world. Achieving this is particularly\n\ndemanding when the queries are based on very large data sets.\n\n**Simplify data engineering at scale** — It is one thing to have high reliability and\n\nperformance in a limited, development or test environment. What matters\n\nmore is the ability to have robust, production data pipelines at scale without\n\nrequiring high operational overhead.\n\n\n-----", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/building-reliable-data-lakes-at-scale-with-delta-lake.pdf" + ], + [ + "COPY INTO ......................................................................................................................................................................................................... **06**\n\nAuto Loader ....................................................................................................................................................................................................... **09**\n\nIngesting Data From External Applications .......................................................................................................................................................... **13**\n\nPartner Connect ............................................................................................................................................................................................... **13**\n\n\n-----\n\n### Introduction\n\nOrganizations today are inundated with data siloed across various on-premises\napplication systems, databases, data warehouses and SaaS applications. 
This\nfragmentation makes it difficult to support new use cases for analytics or machine\nlearning, so many IT teams are now centralizing all of their data with a lakehouse\narchitecture built on top of Delta Lake, an open format storage layer.\n\nThe first thing data engineers need to do to support the lakehouse architecture is to\nefficiently move data from various systems into their lakehouse. Ingesting data is a\ncritical first step in the data engineering and management lifecycle.\n\n\n-----\n\n### Life of a Data Engineer\n\nThe primary focus of data engineers is to provide timely and reliable data to downstream\n\ndata teams at an organization. Requests for data can come from a variety of teams, and for\n\n\na variety of data types. For example:", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/EB-Ingesting-Data-FINAL.pdf" + ], + [ + "With the Databricks Lakehouse Platform, data engineers have access to an\nend-to-end data engineering solution for ingesting, transforming, processing,\nscheduling and delivering data. The lakehouse platform automates the\ncomplexity of building and maintaining pipelines and running ETL workloads\ndirectly on a data lake so data engineers can focus on quality and reliability\nto drive valuable insights.\n\nData engineering in the lakehouse allows data teams to unify batch and\nstreaming operations on a simplified architecture, streamline data pipeline\ndevelopment and testing, build reliable data, analytics and AI workflows\non any cloud platform, and meet regulatory requirements to maintain\nworld-class governance.\n\nThe lakehouse provides an end-to-end data engineering and ETL platform\nthat automates the complexity of building and maintaining pipelines and\nrunning ETL workloads so data engineers and analysts can focus on quality\nand reliability to drive valuable insights.\n\n\n#### Databricks makes modern data engineering simple\n\nThere is no industry-wide definition of modern data engineering.\nThis should come close:\n\n_A_ **_unified data platform_** _with_ **_managed data ingestion_** _, schema detection,_\n_enforcement, and evolution, paired with_ **_declarative, auto-scaling data_**\n**_flow_** _integrated with a lakehouse_ **_native orchestrator_** _that supports all_\n_kinds of workflows._\n\n\n-----\n\n-----\n\n#### Benefits of data engineering on the lakehouse\n\nBy simplifying and modernizing with the lakehouse architecture, data engineers\ngain an enterprise-grade and enterprise-ready approach to building data\npipelines. 
The following are eight key differentiating capabilities that a data\nengineering solution team can enable with the Databricks Lakehouse Platform:\n\n**•** **Easy data ingestion:** With the ability to ingest petabytes of data, data\nengineers can enable fast, reliable, scalable and automatic data ingestion\nfor analytics, data science or machine learning.\n\n\n\n**•** **Data pipeline observability:** Monitor overall data pipeline estate status\nfrom a dataflow graph dashboard and visually track end-to-end pipeline\nhealth for performance, quality, status and latency.\n\n**•** **Simplified operations:** Ensure reliable and predictable delivery of data for\nanalytics and machine learning use cases by enabling easy and automatic\ndata pipeline deployments into production or roll back pipelines and\nminimize downtime.\n\n**•** **Scheduling and orchestration:** Simple, clear and reliable orchestration\nof data processing tasks for data and machine learning pipelines with the\nability to run multiple non-interactive tasks as a directed acyclic graph\n(DAG) on a Databricks compute cluster.\n\n\n\n**•** **Automated ETL pipelines:** Data engineers can reduce development\ntime and effort and focus on implementing business logic and data\nquality checks within the data pipeline using SQL or Python.\n\n**•** **Data quality checks:** Improve data reliability throughout the data\nlakehouse so data teams can confidently trust the information for\ndownstream initiatives with the ability to define data quality and\nautomatically address errors.\n\n**•** **Batch and streaming:** Allow data engineers to set tunable data latency\nwith cost controls without having to know complex stream processing\nand implement recovery logic.\n\n**•** **Automatic recovery:** Handle transient errors and use automatic recovery\nfor most common error conditions that can occur during the operation of\na pipeline with fast, scalable fault-tolerance.\n\n\n-----\n\n**Data engineering is all about data quality**\n\nThe goal of modern data engineering is to distill data with a quality that is fit for\ndownstream analytics and AI. Within the Lakehouse, data quality is achieved on\nthree different levels.\n\n\n1. On a **technical level** , data quality is\nguaranteed by enforcing and evolving\nschemas for data storage and ingestion.\n\n**Kenesis**\n\n**CSV,**\n**JSON, TXT...**\n\n**Data Lake**", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/The-Data-Teams-Guide-to-the-DB-Lakehouse-Platform.pdf" + ] + ], + "yes", + "The response states several responsibilities of a Data Engineer. The retrieved context supports the following responsibilities: 1. Data migration, manipulation, and integration of data (joining dissimilar data systems) - this is mentioned under 'Responsibilities' in the document. 2. Setting up and maintaining ETL pipelines to convert source data into actionable data for insights - this is also mentioned under 'Responsibilities'. 3. Setting up the workflow process to orchestrate pipelines for the studio's data and continuously validating it - this is supported by the document. 4. Managing workflows to enable data scientists and data analysts, and ensuring workflows are well-integrated with different parts of the studio (e.g., marketing, test/QA, etc.) - this is supported by the document. 
Additionally, the document mentions enabling access to usable data for real-time insights, increasing user confidence and trust in data, and limiting issues and failures experienced by other engineers and data scientists, which aligns with the claim. Therefore, the claim is fully supported by the document.", + "yes", + "The ground truth states several responsibilities of a Data Engineer. According to the retrieved context, Data Engineers are responsible for: \n- Building data pipelines (supported by 'building high-performance, reliable and scalable pipelines for data processing'). \n- Processing data sets (supported by 'convert source data into actionable data for insights'). \n- Organizing data sets (supported by 'managing workflows to enable data scientists and data analysts'). \n- Persisting data sets (supported by 'setup and maintenance of ETL pipelines'). \n- Supporting machine learning and other downstream applications (supported by 'data engineers build systems that collect, manage, and convert source data into usable information for data scientists and business analysts to interpret'). Therefore, all parts of the ground truth are supported by the retrieved context.", + [ + "no", + "no", + "no", + "no", + "no" + ], + 0.0 + ], + [ + "87873effb62b308f6aafbb34e7c24aec0439d6b92eb32b330105aa9d6aca2286", + [ + [ + [ + "What are the critical needs for IT and business when it comes to implementing a customer data platform?", + "user" + ] + ] + ], + [ + [ + "Critical IT Needs\nKeep control of data access and governance; ability to architecture a customer data stack with decisions on where data is stored and where queries are executed.\nCritical Business Needs\nGet customer data access via a no-code interface to generate insights; build customer experiences and activate data within business applications.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf" + ] + ], + [ + "IT needs to maintain control over data access and governance.", + "IT needs the ability to make decisions on data storage and query execution.", + "Business needs to access customer data through a no-code interface to generate insights.", + "Business needs to build customer experiences and activate data within business applications." 
+ ], + "{\"info\": {\"request_id\": \"tr-def73886c787456096e0d3d7cfe81552\", \"experiment_id\": \"2822477370659093\", \"timestamp_ms\": 1734543864916, \"execution_time_ms\": 5651, \"status\": \"OK\", \"request_metadata\": {\"mlflow.sourceRun\": \"daf417a2ec4a4ebaa7ba9622535a6263\", \"mlflow.trace_schema.version\": \"2\", \"mlflow.traceInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\", \\\"params\\\": null}\", \"mlflow.traceOutputs\": \"{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and bu...\"}, \"tags\": {\"eval.requestId\": \"7e27a99b-f6ef-40d0-94f7-d14f455eca3c\", \"mlflow.databricks.notebook.commandID\": \"1734538428609_8308562536671358560_fad4b190a8464079bac3564169c40019\", \"mlflow.databricks.notebookID\": \"2822477370486004\", \"mlflow.databricks.notebookPath\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.databricks.webappURL\": \"https://eastus2.azuredatabricks.net\", \"mlflow.databricks.workspaceID\": \"984752964297111\", \"mlflow.databricks.workspaceURL\": \"https://adb-984752964297111.11.azuredatabricks.net\", \"mlflow.source.name\": \"/Users/manffred.calvosanchez@databricks.com/genai-cookbook/openai_sdk_agent_app_sample_code/05_tool_calling_agent\", \"mlflow.source.type\": \"NOTEBOOK\", \"mlflow.traceName\": \"agent\", \"mlflow.user\": \"4648498707132927\", \"retrievers\": \"[{\\\"doc_uri\\\": \\\"doc_uri\\\", \\\"name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"other_columns\\\": [], \\\"primary_key\\\": \\\"chunk_id\\\", \\\"text_column\\\": \\\"content_chunked\\\"}]\", \"mlflow.artifactLocation\": \"dbfs:/databricks/mlflow-tracking/2822477370659093/tr-def73886c787456096e0d3d7cfe81552/artifacts\"}}, \"data\": {\"spans\": [{\"name\": \"agent\", \"context\": {\"span_id\": \"0x5baceed3222a960b\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": null, \"start_time\": 1734543864916750101, \"end_time\": 1734543870567771694, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"predict\\\"\", \"mlflow.spanInputs\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\", \\\"params\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. 
Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. 
Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}, \"events\": []}, {\"name\": \"get_messages_array\", \"context\": {\"span_id\": \"0x6a147a0324292b48\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x5baceed3222a960b\", \"start_time\": 1734543864929193850, \"end_time\": 1734543864936605139, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"get_messages_array\\\"\", \"mlflow.spanInputs\": \"{\\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\"}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]\"}, \"events\": []}, {\"name\": \"parse_input\", \"context\": {\"span_id\": \"0xa61737d7e97f75f6\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x5baceed3222a960b\", \"start_time\": 1734543864936732340, \"end_time\": 1734543864937464449, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanInputs\": \"{\\\"messages\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\"}, \\\"chat_history\\\": []}\"}, \"events\": []}, {\"name\": \"extract_user_query_string\", \"context\": {\"span_id\": \"0xf13b9cee85b4b4f1\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xa61737d7e97f75f6\", \"start_time\": 1734543864936890442, \"end_time\": 1734543864937116045, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_user_query_string\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"\\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\"\"}, \"events\": []}, {\"name\": \"extract_chat_history\", \"context\": {\"span_id\": \"0xeb765ae29e02fd13\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xa61737d7e97f75f6\", \"start_time\": 1734543864937199646, \"end_time\": 1734543864937431549, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"extract_chat_history\\\"\", \"mlflow.spanInputs\": \"{\\\"chat_messages_array\\\": [{\\\"content\\\": \\\"What are the 
critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}]}\", \"mlflow.spanOutputs\": \"[]\"}, \"events\": []}, {\"name\": \"recursively_call_and_run_tools\", \"context\": {\"span_id\": \"0xbe98406904952d43\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x5baceed3222a960b\", \"start_time\": 1734543864937535250, \"end_time\": 1734543870567143686, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"AGENT\\\"\", \"mlflow.spanFunctionName\": \"\\\"recursively_call_and_run_tools\\\"\", \"mlflow.spanInputs\": \"{\\\"last_message\\\": {\\\"role\\\": \\\"user\\\", \\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\"}, \\\"chat_history\\\": [], \\\"last_max_iter\\\": 10}\", \"mlflow.spanOutputs\": \"[{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}, [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. 
The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. 
Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]]\"}, \"events\": []}, {\"name\": \"Completions_1\", \"context\": {\"span_id\": \"0x6110b362407c0710\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xbe98406904952d43\", \"start_time\": 1734543865065375583, \"end_time\": 1734543866424542786, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_5d73c95c-8527-4f8f-ac82-ac3ef12181be\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"tool_calls\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": null, \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}]}}], \\\"created\\\": 1734543865, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 31, \\\"prompt_tokens\\\": 1158, \\\"total_tokens\\\": 1189, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}, {\"name\": \"vector_search_retriever\", \"context\": {\"span_id\": \"0x29d9dfbb8949e1ee\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xbe98406904952d43\", \"start_time\": 1734543866434627907, \"end_time\": 1734543866904066738, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"RETRIEVER\\\"\", \"mlflow.spanFunctionName\": \"\\\"__call__\\\"\", \"mlflow.spanInputs\": \"{\\\"query\\\": \\\"critical needs for IT and business when implementing a customer data platform\\\", \\\"filters\\\": []}\", \"vector_search_index\": 
\"\\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\"\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"#### eBook\\\\n\\\\n# The CDP Build vs Buy Guide:\\\\n\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\n\\\\n\\\\n-----\\\\n\\\\n## The Need for a Customer Data Platform\\\\n\\\\n\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\nactivate customers with targeted content.\\\\n\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\n\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\nfastest path to a solution.\\\\n\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\nexisting marketing and analytics systems.. The cost of adding another system to the\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\nthat has immediate consequences.\\\\n\\\\n**Critical IT Needs** **Critical Business Needs**\\\\n\\\\n\\\\nKeep control of data access and\\\\ngovernance; ability to architecture a\\\\ncustomer data stack with decisions on\\\\nwhere data is stored and where queries\\\\nare executed\\\\n\\\\n\\\\nGet customer data access via a no-code\\\\ninterface to generate insights; build customer\\\\nexperiences and activate data within\\\\nbusiness applications\\\\n\\\\n\\\\n-----\\\\n\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\n\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\n\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\n**them to create and manage those journeys. 
It was going to take at least two**\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\n\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\n\\\\n\\\\n-----\\\\n\\\\n## Combining the Build and Buy Approaches\\\\n\\\\n\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\n\\\\n**Bundled** **Composable**\\\\n\\\\n**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n(Local & Views)\\\\n\\\\n\\\\nQuery\\\\nVirtualization\\\\n\\\\nMetadata\\\\n\\\\n\\\\nData Copy\\\\n\\\\n\\\\nLakehouse\\\\n\\\\nStorage\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nCompute Compute\\\\n\\\\nStorage Storage\\\\n\\\\n\\\\n-----\\\\n\\\\nDeployment Type\\\\n\\\\n**Bundled**\\\\n\\\\n**Composable \\u2013**\\\\n**Hybrid**\\\\n\\\\n**Composable \\u2013**\\\\n**Lakehouse-Only**\\\\n\\\\n\\\\nDescription\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029832723, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\"}, \\\"id\\\": \\\"563f0dba5edef5b358685117dfb5a133\\\"}, {\\\"page_content\\\": \\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\n\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\n\\\\nincreasingly important.\\\\n\\\\n**Modernize business applications**\\\\n\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\n\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\n\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\n\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\n\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\n\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\n\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\n\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\n\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\n\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\n\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\n\\\\ndevelopment teams.\\\\n\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\n\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\n\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\n\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\n\\\\n\\\\n\\u201cWe are on an amazing journey. 
Being among\\\\n\\\\nthe fastest-growing enterprise software cloud\\\\n\\\\ncompanies on record was unimaginable when\\\\n\\\\nwe started Databricks. To get here, we\\u2019ve stayed\\\\n\\\\nfocused on the three big bets we made when\\\\n\\\\nfounding the company \\u2014 cloud, open source\\\\n\\\\nand machine learning. Fast-forward seven years,\\\\n\\\\nthousands of data teams around the globe are\\\\n\\\\nworking better together on Databricks.\\u201d\\\\n\\\\n**Ali Ghodsi**\\\\n\\\\nCo-founder and CEO\\\\n\\\\nDatabricks\\\\n\\\\n\\\\n-----\\\\n\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\n\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\n\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\n\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\n\\\\nthe data from the actual SOR.\\\\n\\\\nData from these SORs should be made available in three ways:\\\\n\\\\n**1.** \\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\n\\\\n**2.** \\\\u0007Ensure that copies of the data land in the data lake.\\\\n\\\\n**3.** \\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\n\\\\nconsumption by downstream applications.\\\\n\\\\n**Move toward real-time decisioning**\\\\n\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\n\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\n\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\n\\\\nthe same data platform.\\\\n\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\n\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\n\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\n\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027576878, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"25ef18d715b47231f6594d1da80303e9\\\"}, {\\\"page_content\\\": \\\"and security environment but nothing more\\\\n\\\\n\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\n\\\\nof tools in play or streamlining the user experience\\\\n\\\\n\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\n\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\n\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\n\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\n\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\n\\\\n\\\\n-----\\\\n\\\\nDatabricks is a leading data and AI company \\u2014\\\\n\\\\n\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\n\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\n\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\n\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\n\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\n\\\\nefficiency, cost, etc.\\\\n\\\\n\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\n\\\\n\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\n\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\n\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\n\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\n\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\n\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\n\\\\napply to the broadest set of customers.\\\\n\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\n\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\n\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\n\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\n\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\n\\\\n\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\n\\\\nlistening to the needs of thousands of customers\\\\n\\\\nand having our engineers work side by side with\\\\n\\\\ncustomer teams to deliver real business value using\\\\n\\\\ndata and AI.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified platform, unified personas**\\\\n\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\n\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\n\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\n\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\n\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\n\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\n\\\\nsubsystems are well managed.\\\\n\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\n\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\n\\\\nis eliminated.\\\\n\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\n\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\n\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\n\\\\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027022872, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"eaff954d65653182857574e043c105f1\\\"}, {\\\"page_content\\\": \\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\n\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\n\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\n\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\n\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\n\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\n\\\\ngoals but also in minimizing these seven key business risks.\\\\n\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\n\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\n\\\\norganizations use and process data. 
Successful data transformation initiatives for data, analytics and AI\\\\n\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\n\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\n\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\n\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\n\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\n\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\n\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\n\\\\nidentify and execute on AI opportunities.\\\\n\\\\n\\\\n-----\\\\n\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\n\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\n\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\\\\n\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\n\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\n\\\\nindustry standards.\\\\n\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\n\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\n\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\n\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\n\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\n\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\n\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\n\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . 
Databricks is the first company to deliver a\\\\n\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\n\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\n\\\\nshown in Figure 1.\\\\n\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\n\\\\nData\\\\nWarehousing\\\\n\\\\n\\\\nData\\\\nEngineering\\\\n\\\\n\\\\nData\\\\nStreaming\\\\n\\\\n\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\nand ML\\\\n\\\\n\\\\nUnity Catalog\\\\nFine-grained governance for data and AI\\\\n\\\\nDelta Lake\\\\nData relia)ility and .erfor2ance\\\\n\\\\nCloud Data Lake\\\\nAll structured and unstructured data\\\\n\\\\n**Figure 1:**\\\\nThe Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\n\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\n\\\\n**2.** \\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\n\\\\n**3.** \\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025006814, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"f545eff42d3b9ae2b565475f4390ed44\\\"}, {\\\"page_content\\\": \\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\n\\\\nunique and b) the development offers the competitive advantage that you need.\\\\n\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\n\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\n\\\\nare required in the data science space. The question becomes, \\u201cIs this really the area that you want your\\\\n\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\n\\\\n**How long will it take? Can the organization afford to wait?**\\\\n\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\n\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\n\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\n\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\n\\\\ntake longer and cost more money than initially planned.\\\\n\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\n\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\n\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\n\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\n\\\\nfeatures and schedule.\\\\n\\\\n\\\\nDatabricks is built on top of popular open source\\\\n\\\\nsoftware that it created. 
Engineering teams can\\\\n\\\\nimprove the underpinnings of the Databricks\\\\n\\\\nplatform by submitting code via pull request and\\\\n\\\\nbecoming committers to the projects. The benefit\\\\n\\\\nto organizations is that their engineers contribute\\\\n\\\\nto the feature set of the data platform while\\\\n\\\\nDatabricks remains responsible for all integration\\\\n\\\\nand performance testing plus all the runtime\\\\n\\\\nsupport, including failover and disaster recovery.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Don\\u2019t forget about the data**\\\\n\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\n\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\n\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\n\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\n\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\n\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\n\\\\ncreating true competitive advantage.\\\\n\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\n\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\n\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\n\\\\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\\\\n\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### 9. Allocate, monitor and optimize costs\\\\n\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\n\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\n\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\n\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\n\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\n\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\n\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\n\\\\ncould be easily shared and reused by other members of the team. 
The more the team used the unified\\\\n\\\\nplatform, the more they collaborated and their level of expertise increased.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0024809677, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"b5f4bd0258226132f89697f6e660b09b\\\"}]\"}, \"events\": []}, {\"name\": \"_workspace_client.vector_search_indexes.query_index\", \"context\": {\"span_id\": \"0x88079808d446595d\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x29d9dfbb8949e1ee\", \"start_time\": 1734543866439784969, \"end_time\": 1734543866902149915, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"FUNCTION\\\"\", \"mlflow.spanFunctionName\": \"\\\"query_index\\\"\", \"mlflow.spanInputs\": \"{\\\"index_name\\\": \\\"casaman_ssa.demos.test_product_docs_docs_chunked_index__v2\\\", \\\"columns\\\": [\\\"content_chunked\\\", \\\"chunk_id\\\", \\\"doc_uri\\\"], \\\"filters_json\\\": null, \\\"num_results\\\": 5, \\\"query_text\\\": \\\"critical needs for IT and business when implementing a customer data platform\\\", \\\"query_type\\\": \\\"ann\\\", \\\"query_vector\\\": null, \\\"score_threshold\\\": null}\", \"mlflow.spanOutputs\": \"{\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"#### eBook\\\\n\\\\n# The CDP Build vs Buy Guide:\\\\n\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\n\\\\n\\\\n-----\\\\n\\\\n## The Need for a Customer Data Platform\\\\n\\\\n\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\nactivate customers with targeted content.\\\\n\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\n\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\nfastest path to a solution.\\\\n\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\nthat has immediate consequences.\\\\n\\\\n**Critical IT Needs** **Critical Business Needs**\\\\n\\\\n\\\\nKeep control of data access and\\\\ngovernance; ability to architecture a\\\\ncustomer data stack with decisions on\\\\nwhere data is stored and where queries\\\\nare executed\\\\n\\\\n\\\\nGet customer data access via a no-code\\\\ninterface to generate insights; build customer\\\\nexperiences and activate data within\\\\nbusiness applications\\\\n\\\\n\\\\n-----\\\\n\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\n\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\n\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\n\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\n\\\\n\\\\n-----\\\\n\\\\n## Combining the Build and Buy Approaches\\\\n\\\\n\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\n\\\\n**Bundled** **Composable**\\\\n\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n(Local & Views)\\\\n\\\\n\\\\nQuery\\\\nVirtualization\\\\n\\\\nMetadata\\\\n\\\\n\\\\nData Copy\\\\n\\\\n\\\\nLakehouse\\\\n\\\\nStorage\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nCompute Compute\\\\n\\\\nStorage Storage\\\\n\\\\n\\\\n-----\\\\n\\\\nDeployment Type\\\\n\\\\n**Bundled**\\\\n\\\\n**Composable \\u2013**\\\\n**Hybrid**\\\\n\\\\n**Composable \\u2013**\\\\n**Lakehouse-Only**\\\\n\\\\n\\\\nDescription\\\", \\\"563f0dba5edef5b358685117dfb5a133\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\", 0.0029832723], [\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\n\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\n\\\\nincreasingly important.\\\\n\\\\n**Modernize business applications**\\\\n\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\n\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\n\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\n\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\n\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\n\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\n\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\n\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\n\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\n\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\n\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\n\\\\ndevelopment teams.\\\\n\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\n\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\n\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\n\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\n\\\\n\\\\n\\u201cWe are on an amazing journey. Being among\\\\n\\\\nthe fastest-growing enterprise software cloud\\\\n\\\\ncompanies on record was unimaginable when\\\\n\\\\nwe started Databricks. To get here, we\\u2019ve stayed\\\\n\\\\nfocused on the three big bets we made when\\\\n\\\\nfounding the company \\u2014 cloud, open source\\\\n\\\\nand machine learning. 
Fast-forward seven years,\\\\n\\\\nthousands of data teams around the globe are\\\\n\\\\nworking better together on Databricks.\\u201d\\\\n\\\\n**Ali Ghodsi**\\\\n\\\\nCo-founder and CEO\\\\n\\\\nDatabricks\\\\n\\\\n\\\\n-----\\\\n\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\n\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\n\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\n\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\n\\\\nthe data from the actual SOR.\\\\n\\\\nData from these SORs should be made available in three ways:\\\\n\\\\n**1.** \\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\n\\\\n**2.** \\\\u0007Ensure that copies of the data land in the data lake.\\\\n\\\\n**3.** \\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\n\\\\nconsumption by downstream applications.\\\\n\\\\n**Move toward real-time decisioning**\\\\n\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\n\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\n\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\n\\\\nthe same data platform.\\\\n\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\n\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\n\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\\\\n\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\", \\\"25ef18d715b47231f6594d1da80303e9\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0027576878], [\\\"and security environment but nothing more\\\\n\\\\n\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\n\\\\nof tools in play or streamlining the user experience\\\\n\\\\n\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\n\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\n\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\n\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\n\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\n\\\\n\\\\n-----\\\\n\\\\nDatabricks is a leading data and AI company \\u2014\\\\n\\\\n\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\n\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\n\\\\nof the platform that saves time, conserves effort and improves the user experience. 
Many companies try\\\\n\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\n\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\n\\\\nefficiency, cost, etc.\\\\n\\\\n\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\n\\\\n\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\n\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\n\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\n\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\n\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\n\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\n\\\\napply to the broadest set of customers.\\\\n\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\n\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\n\\\\nperforming their job. The more discrete tools in an environment, the more challenging this becomes.\\\\n\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\n\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\n\\\\n\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\n\\\\nlistening to the needs of thousands of customers\\\\n\\\\nand having our engineers work side by side with\\\\n\\\\ncustomer teams to deliver real business value using\\\\n\\\\ndata and AI.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified platform, unified personas**\\\\n\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\n\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\n\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\n\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\n\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\n\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\n\\\\nsubsystems are well managed.\\\\n\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\n\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\n\\\\nis eliminated.\\\\n\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\n\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\n\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\n\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\", \\\"eaff954d65653182857574e043c105f1\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0027022872], [\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\n\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\n\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\n\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\n\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\n\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\n\\\\ngoals but also in minimizing these seven key business risks.\\\\n\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\n\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\n\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\n\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\n\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\n\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\n\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\n\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\n\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\n\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\n\\\\nidentify and execute on AI opportunities.\\\\n\\\\n\\\\n-----\\\\n\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\n\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\n\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\\\\n\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\n\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\n\\\\nindustry standards.\\\\n\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\n\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\n\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\n\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\n\\\\nmetrics for all your data workloads \\u2014 including data warehousing. 
We have captured the lessons learned\\\\n\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\n\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\n\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\n\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\n\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\n\\\\nshown in Figure 1.\\\\n\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\n\\\\nData\\\\nWarehousing\\\\n\\\\n\\\\nData\\\\nEngineering\\\\n\\\\n\\\\nData\\\\nStreaming\\\\n\\\\n\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\nand ML\\\\n\\\\n\\\\nUnity Catalog\\\\nFine-grained governance for data and AI\\\\n\\\\nDelta Lake\\\\nData relia)ility and .erfor2ance\\\\n\\\\nCloud Data Lake\\\\nAll structured and unstructured data\\\\n\\\\n**Figure 1:**\\\\nThe Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\n\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\n\\\\n**2.** \\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\n\\\\n**3.** \\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\", \\\"f545eff42d3b9ae2b565475f4390ed44\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0025006814], [\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\n\\\\nunique and b) the development offers the competitive advantage that you need.\\\\n\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\n\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\n\\\\nare required in the data science space. The question becomes, \\u201cIs this really the area that you want your\\\\n\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\n\\\\n**How long will it take? Can the organization afford to wait?**\\\\n\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\n\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\n\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\n\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\n\\\\ntake longer and cost more money than initially planned.\\\\n\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\n\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\n\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\n\\\\nis reliable and is delivered on time. 
Carefully weigh the trade-offs among competitive advantage, cost,\\\\n\\\\nfeatures and schedule.\\\\n\\\\n\\\\nDatabricks is built on top of popular open source\\\\n\\\\nsoftware that it created. Engineering teams can\\\\n\\\\nimprove the underpinnings of the Databricks\\\\n\\\\nplatform by submitting code via pull request and\\\\n\\\\nbecoming committers to the projects. The benefit\\\\n\\\\nto organizations is that their engineers contribute\\\\n\\\\nto the feature set of the data platform while\\\\n\\\\nDatabricks remains responsible for all integration\\\\n\\\\nand performance testing plus all the runtime\\\\n\\\\nsupport, including failover and disaster recovery.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Don\\u2019t forget about the data**\\\\n\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\n\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\n\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\n\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\n\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\n\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\n\\\\ncreating true competitive advantage.\\\\n\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\n\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\n\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\n\\\\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\\\\n\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### 9. Allocate, monitor and optimize costs\\\\n\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\n\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\n\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\n\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\n\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\n\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\n\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\n\\\\ncould be easily shared and reused by other members of the team. 
The more the team used the unified\\\\n\\\\nplatform, the more they collaborated and their level of expertise increased.\\\", \\\"b5f4bd0258226132f89697f6e660b09b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0024809677]], \\\"row_count\\\": 5}}\"}, \"events\": []}, {\"name\": \"convert_vector_search_to_documents\", \"context\": {\"span_id\": \"0xa3664bb29279107e\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0x29d9dfbb8949e1ee\", \"start_time\": 1734543866902364817, \"end_time\": 1734543866903599532, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"PARSER\\\"\", \"mlflow.spanFunctionName\": \"\\\"convert_vector_search_to_documents\\\"\", \"mlflow.spanInputs\": \"{\\\"vs_results\\\": {\\\"manifest\\\": {\\\"column_count\\\": 4, \\\"columns\\\": [{\\\"name\\\": \\\"content_chunked\\\"}, {\\\"name\\\": \\\"chunk_id\\\"}, {\\\"name\\\": \\\"doc_uri\\\"}, {\\\"name\\\": \\\"score\\\"}]}, \\\"next_page_token\\\": \\\"\\\", \\\"result\\\": {\\\"data_array\\\": [[\\\"#### eBook\\\\n\\\\n# The CDP Build vs Buy Guide:\\\\n\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\n\\\\n\\\\n-----\\\\n\\\\n## The Need for a Customer Data Platform\\\\n\\\\n\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\nactivate customers with targeted content.\\\\n\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\n\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\nfastest path to a solution.\\\\n\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\nthat has immediate consequences.\\\\n\\\\n**Critical IT Needs** **Critical Business Needs**\\\\n\\\\n\\\\nKeep control of data access and\\\\ngovernance; ability to architecture a\\\\ncustomer data stack with decisions on\\\\nwhere data is stored and where queries\\\\nare executed\\\\n\\\\n\\\\nGet customer data access via a no-code\\\\ninterface to generate insights; build customer\\\\nexperiences and activate data within\\\\nbusiness applications\\\\n\\\\n\\\\n-----\\\\n\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\n\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\n\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\n\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\n\\\\n\\\\n-----\\\\n\\\\n## Combining the Build and Buy Approaches\\\\n\\\\n\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\n\\\\n**Bundled** **Composable**\\\\n\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n(Local & Views)\\\\n\\\\n\\\\nQuery\\\\nVirtualization\\\\n\\\\nMetadata\\\\n\\\\n\\\\nData Copy\\\\n\\\\n\\\\nLakehouse\\\\n\\\\nStorage\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nCompute Compute\\\\n\\\\nStorage Storage\\\\n\\\\n\\\\n-----\\\\n\\\\nDeployment Type\\\\n\\\\n**Bundled**\\\\n\\\\n**Composable \\u2013**\\\\n**Hybrid**\\\\n\\\\n**Composable \\u2013**\\\\n**Lakehouse-Only**\\\\n\\\\n\\\\nDescription\\\", \\\"563f0dba5edef5b358685117dfb5a133\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\", 0.0029832723], [\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\n\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\n\\\\nincreasingly important.\\\\n\\\\n**Modernize business applications**\\\\n\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\n\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\n\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\n\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\n\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\n\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\n\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\n\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\n\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\n\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\n\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\n\\\\ndevelopment teams.\\\\n\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\n\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\n\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\n\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\n\\\\n\\\\n\\u201cWe are on an amazing journey. Being among\\\\n\\\\nthe fastest-growing enterprise software cloud\\\\n\\\\ncompanies on record was unimaginable when\\\\n\\\\nwe started Databricks. To get here, we\\u2019ve stayed\\\\n\\\\nfocused on the three big bets we made when\\\\n\\\\nfounding the company \\u2014 cloud, open source\\\\n\\\\nand machine learning. 
Fast-forward seven years,\\\\n\\\\nthousands of data teams around the globe are\\\\n\\\\nworking better together on Databricks.\\u201d\\\\n\\\\n**Ali Ghodsi**\\\\n\\\\nCo-founder and CEO\\\\n\\\\nDatabricks\\\\n\\\\n\\\\n-----\\\\n\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\n\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\n\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\n\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\n\\\\nthe data from the actual SOR.\\\\n\\\\nData from these SORs should be made available in three ways:\\\\n\\\\n**1.** \\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\n\\\\n**2.** \\\\u0007Ensure that copies of the data land in the data lake.\\\\n\\\\n**3.** \\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\n\\\\nconsumption by downstream applications.\\\\n\\\\n**Move toward real-time decisioning**\\\\n\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\n\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\n\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\n\\\\nthe same data platform.\\\\n\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\n\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\n\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. In contrast, a\\\\n\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\", \\\"25ef18d715b47231f6594d1da80303e9\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0027576878], [\\\"and security environment but nothing more\\\\n\\\\n\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\n\\\\nof tools in play or streamlining the user experience\\\\n\\\\n\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\n\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\n\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\n\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\n\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\n\\\\n\\\\n-----\\\\n\\\\nDatabricks is a leading data and AI company \\u2014\\\\n\\\\n\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\n\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\n\\\\nof the platform that saves time, conserves effort and improves the user experience. 
Many companies try\\\\n\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\n\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\n\\\\nefficiency, cost, etc.\\\\n\\\\n\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\n\\\\n\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\n\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\n\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\n\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\n\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\n\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\n\\\\napply to the broadest set of customers.\\\\n\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\n\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\n\\\\nperforming their job. The more discrete tools in an environment, the more challenging this becomes.\\\\n\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\n\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\n\\\\n\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\n\\\\nlistening to the needs of thousands of customers\\\\n\\\\nand having our engineers work side by side with\\\\n\\\\ncustomer teams to deliver real business value using\\\\n\\\\ndata and AI.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified platform, unified personas**\\\\n\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\n\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\n\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\n\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\n\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\n\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\n\\\\nsubsystems are well managed.\\\\n\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\n\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\n\\\\nis eliminated.\\\\n\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\n\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\n\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\n\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\", \\\"eaff954d65653182857574e043c105f1\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0027022872], [\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\n\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\n\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\n\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\n\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\n\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\n\\\\ngoals but also in minimizing these seven key business risks.\\\\n\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\n\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\n\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\n\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\n\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\n\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\n\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\n\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\n\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\n\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\n\\\\nidentify and execute on AI opportunities.\\\\n\\\\n\\\\n-----\\\\n\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\n\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\n\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\\\\n\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\n\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\n\\\\nindustry standards.\\\\n\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\n\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\n\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\n\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\n\\\\nmetrics for all your data workloads \\u2014 including data warehousing. 
We have captured the lessons learned\\\\n\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\n\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\n\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\n\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\n\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\n\\\\nshown in Figure 1.\\\\n\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\n\\\\nData\\\\nWarehousing\\\\n\\\\n\\\\nData\\\\nEngineering\\\\n\\\\n\\\\nData\\\\nStreaming\\\\n\\\\n\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\nand ML\\\\n\\\\n\\\\nUnity Catalog\\\\nFine-grained governance for data and AI\\\\n\\\\nDelta Lake\\\\nData relia)ility and .erfor2ance\\\\n\\\\nCloud Data Lake\\\\nAll structured and unstructured data\\\\n\\\\n**Figure 1:**\\\\nThe Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\n\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\n\\\\n**2.** \\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\n\\\\n**3.** \\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\", \\\"f545eff42d3b9ae2b565475f4390ed44\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0025006814], [\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\n\\\\nunique and b) the development offers the competitive advantage that you need.\\\\n\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\n\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\n\\\\nare required in the data science space. The question becomes, \\u201cIs this really the area that you want your\\\\n\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\n\\\\n**How long will it take? Can the organization afford to wait?**\\\\n\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\n\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\n\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\n\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\n\\\\ntake longer and cost more money than initially planned.\\\\n\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\n\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\n\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\n\\\\nis reliable and is delivered on time. 
Carefully weigh the trade-offs among competitive advantage, cost,\\\\n\\\\nfeatures and schedule.\\\\n\\\\n\\\\nDatabricks is built on top of popular open source\\\\n\\\\nsoftware that it created. Engineering teams can\\\\n\\\\nimprove the underpinnings of the Databricks\\\\n\\\\nplatform by submitting code via pull request and\\\\n\\\\nbecoming committers to the projects. The benefit\\\\n\\\\nto organizations is that their engineers contribute\\\\n\\\\nto the feature set of the data platform while\\\\n\\\\nDatabricks remains responsible for all integration\\\\n\\\\nand performance testing plus all the runtime\\\\n\\\\nsupport, including failover and disaster recovery.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Don\\u2019t forget about the data**\\\\n\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\n\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\n\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\n\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\n\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\n\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\n\\\\ncreating true competitive advantage.\\\\n\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\n\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\n\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\n\\\\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\\\\n\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### 9. Allocate, monitor and optimize costs\\\\n\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\n\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\n\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\n\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\n\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\n\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\n\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\n\\\\ncould be easily shared and reused by other members of the team. 
The more the team used the unified\\\\n\\\\nplatform, the more they collaborated and their level of expertise increased.\\\", \\\"b5f4bd0258226132f89697f6e660b09b\\\", \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\", 0.0024809677]], \\\"row_count\\\": 5}}, \\\"vector_search_threshold\\\": 0.0}\", \"mlflow.spanOutputs\": \"[{\\\"page_content\\\": \\\"#### eBook\\\\n\\\\n# The CDP Build vs Buy Guide:\\\\n\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\n\\\\n\\\\n-----\\\\n\\\\n## The Need for a Customer Data Platform\\\\n\\\\n\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\nactivate customers with targeted content.\\\\n\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\n\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\nfastest path to a solution.\\\\n\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\nexisting marketing and analytics systems.. The cost of adding another system to the\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\nthat has immediate consequences.\\\\n\\\\n**Critical IT Needs** **Critical Business Needs**\\\\n\\\\n\\\\nKeep control of data access and\\\\ngovernance; ability to architecture a\\\\ncustomer data stack with decisions on\\\\nwhere data is stored and where queries\\\\nare executed\\\\n\\\\n\\\\nGet customer data access via a no-code\\\\ninterface to generate insights; build customer\\\\nexperiences and activate data within\\\\nbusiness applications\\\\n\\\\n\\\\n-----\\\\n\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\n\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\ngovernance IT desires. 
By shifting the conversation from building or buying to building _and_\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\n\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\n\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\n\\\\n\\\\n-----\\\\n\\\\n## Combining the Build and Buy Approaches\\\\n\\\\n\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\n\\\\n**Bundled** **Composable**\\\\n\\\\n**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n\\\\n\\\\nCompute\\\\n\\\\nStorage\\\\n(Local & Views)\\\\n\\\\n\\\\nQuery\\\\nVirtualization\\\\n\\\\nMetadata\\\\n\\\\n\\\\nData Copy\\\\n\\\\n\\\\nLakehouse\\\\n\\\\nStorage\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nLakehouse\\\\n\\\\n\\\\nCompute Compute\\\\n\\\\nStorage Storage\\\\n\\\\n\\\\n-----\\\\n\\\\nDeployment Type\\\\n\\\\n**Bundled**\\\\n\\\\n**Composable \\u2013**\\\\n**Hybrid**\\\\n\\\\n**Composable \\u2013**\\\\n**Lakehouse-Only**\\\\n\\\\n\\\\nDescription\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0029832723, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\"}, \\\"id\\\": \\\"563f0dba5edef5b358685117dfb5a133\\\"}, {\\\"page_content\\\": \\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\n\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\n\\\\nincreasingly important.\\\\n\\\\n**Modernize business applications**\\\\n\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\n\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\n\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\n\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\n\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\n\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\n\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\n\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\n\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\n\\\\nprocesses. 
Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\n\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\n\\\\ndevelopment teams.\\\\n\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\n\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\n\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\n\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\n\\\\n\\\\n\\u201cWe are on an amazing journey. Being among\\\\n\\\\nthe fastest-growing enterprise software cloud\\\\n\\\\ncompanies on record was unimaginable when\\\\n\\\\nwe started Databricks. To get here, we\\u2019ve stayed\\\\n\\\\nfocused on the three big bets we made when\\\\n\\\\nfounding the company \\u2014 cloud, open source\\\\n\\\\nand machine learning. Fast-forward seven years,\\\\n\\\\nthousands of data teams around the globe are\\\\n\\\\nworking better together on Databricks.\\u201d\\\\n\\\\n**Ali Ghodsi**\\\\n\\\\nCo-founder and CEO\\\\n\\\\nDatabricks\\\\n\\\\n\\\\n-----\\\\n\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\n\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\n\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\n\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\n\\\\nthe data from the actual SOR.\\\\n\\\\nData from these SORs should be made available in three ways:\\\\n\\\\n**1.** \\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\n\\\\n**2.** \\\\u0007Ensure that copies of the data land in the data lake.\\\\n\\\\n**3.** \\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\n\\\\nconsumption by downstream applications.\\\\n\\\\n**Move toward real-time decisioning**\\\\n\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\n\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\n\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\n\\\\nthe same data platform.\\\\n\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\n\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\n\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\n\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027576878, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"25ef18d715b47231f6594d1da80303e9\\\"}, {\\\"page_content\\\": \\\"and security environment but nothing more\\\\n\\\\n\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\n\\\\nof tools in play or streamlining the user experience\\\\n\\\\n\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\n\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\n\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\n\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\n\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\n\\\\n\\\\n-----\\\\n\\\\nDatabricks is a leading data and AI company \\u2014\\\\n\\\\n\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\n\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\n\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\n\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\n\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\n\\\\nefficiency, cost, etc.\\\\n\\\\n\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\n\\\\n\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\n\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\n\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\n\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\n\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\n\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\n\\\\napply to the broadest set of customers.\\\\n\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\n\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\n\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\n\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\n\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\n\\\\n\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\n\\\\nlistening to the needs of thousands of customers\\\\n\\\\nand having our engineers work side by side with\\\\n\\\\ncustomer teams to deliver real business value using\\\\n\\\\ndata and AI.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Unified platform, unified personas**\\\\n\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\n\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\n\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\n\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\n\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\n\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\n\\\\nsubsystems are well managed.\\\\n\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\n\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\n\\\\nis eliminated.\\\\n\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\n\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\n\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\n\\\\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0027022872, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"eaff954d65653182857574e043c105f1\\\"}, {\\\"page_content\\\": \\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\n\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\n\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\n\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\n\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\n\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\n\\\\ngoals but also in minimizing these seven key business risks.\\\\n\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\n\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\n\\\\norganizations use and process data. 
Successful data transformation initiatives for data, analytics and AI\\\\n\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\n\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\n\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\n\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\n\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\n\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\n\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\n\\\\nidentify and execute on AI opportunities.\\\\n\\\\n\\\\n-----\\\\n\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\n\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\n\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\\\\n\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\n\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\n\\\\nindustry standards.\\\\n\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\n\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\n\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\n\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\n\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\n\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\n\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\n\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . 
Databricks is the first company to deliver a\\\\n\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\n\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\n\\\\nshown in Figure 1.\\\\n\\\\n\\\\n###### Lakehouse Platform\\\\n\\\\n\\\\nData\\\\nWarehousing\\\\n\\\\n\\\\nData\\\\nEngineering\\\\n\\\\n\\\\nData\\\\nStreaming\\\\n\\\\n\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\nand ML\\\\n\\\\n\\\\nUnity Catalog\\\\nFine-grained governance for data and AI\\\\n\\\\nDelta Lake\\\\nData relia)ility and .erfor2ance\\\\n\\\\nCloud Data Lake\\\\nAll structured and unstructured data\\\\n\\\\n**Figure 1:**\\\\nThe Databricks Lakehouse Platform\\\\n\\\\n\\\\n-----\\\\n\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\n\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\n\\\\n**2.** \\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\n\\\\n**3.** \\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0025006814, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"f545eff42d3b9ae2b565475f4390ed44\\\"}, {\\\"page_content\\\": \\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\n\\\\nunique and b) the development offers the competitive advantage that you need.\\\\n\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\n\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\n\\\\nare required in the data science space. The question becomes, \\u201cIs this really the area that you want your\\\\n\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\n\\\\n**How long will it take? Can the organization afford to wait?**\\\\n\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\n\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\n\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\n\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\n\\\\ntake longer and cost more money than initially planned.\\\\n\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\n\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\n\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\n\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\n\\\\nfeatures and schedule.\\\\n\\\\n\\\\nDatabricks is built on top of popular open source\\\\n\\\\nsoftware that it created. 
Engineering teams can\\\\n\\\\nimprove the underpinnings of the Databricks\\\\n\\\\nplatform by submitting code via pull request and\\\\n\\\\nbecoming committers to the projects. The benefit\\\\n\\\\nto organizations is that their engineers contribute\\\\n\\\\nto the feature set of the data platform while\\\\n\\\\nDatabricks remains responsible for all integration\\\\n\\\\nand performance testing plus all the runtime\\\\n\\\\nsupport, including failover and disaster recovery.\\\\n\\\\n\\\\n-----\\\\n\\\\n**Don\\u2019t forget about the data**\\\\n\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\n\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\n\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\n\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\n\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\n\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\n\\\\ncreating true competitive advantage.\\\\n\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\n\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\n\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\n\\\\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\\\\n\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\n\\\\n\\\\n-----\\\\n\\\\n#### 9. Allocate, monitor and optimize costs\\\\n\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\n\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\n\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\n\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\n\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\n\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\n\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\n\\\\ncould be easily shared and reused by other members of the team. 
The more the team used the unified\\\\n\\\\nplatform, the more they collaborated and their level of expertise increased.\\\", \\\"metadata\\\": {\\\"similarity_score\\\": 0.0024809677, \\\"doc_uri\\\": \\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\"}, \\\"id\\\": \\\"b5f4bd0258226132f89697f6e660b09b\\\"}]\"}, \"events\": []}, {\"name\": \"Completions_2\", \"context\": {\"span_id\": \"0xd3785d456e2d8af2\", \"trace_id\": \"0x8e4b29144d800f53a4d3f977a830786e\"}, \"parent_id\": \"0xbe98406904952d43\", \"start_time\": 1734543866913943756, \"end_time\": 1734543870564567154, \"status_code\": \"OK\", \"status_message\": \"\", \"attributes\": {\"mlflow.traceRequestId\": \"\\\"tr-def73886c787456096e0d3d7cfe81552\\\"\", \"mlflow.spanType\": \"\\\"CHAT_MODEL\\\"\", \"model\": \"\\\"databricks-meta-llama-3-3-70b-instruct\\\"\", \"tools\": \"[{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}]\", \"tool_choice\": \"\\\"auto\\\"\", \"temperature\": \"0.01\", \"max_tokens\": \"1500\", \"mlflow.spanInputs\": \"{\\\"model\\\": \\\"databricks-meta-llama-3-3-70b-instruct\\\", \\\"messages\\\": [{\\\"content\\\": \\\"## Role\\\\nYou are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\\\\n\\\\n## Objective\\\\nYour goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\\\\n\\\\n## Instructions\\\\n1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \\\\n\\\\n2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\\\\n\\\\n3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \\\\\\\"I'm sorry, I can't help you with that.\\\\\\\"\\\", \\\"role\\\": \\\"system\\\"}, {\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"tool_call_id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"tools\\\": [{\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"description\\\": \\\"Use this tool to search for product documentation.\\\", \\\"name\\\": \\\"search_product_docs\\\", \\\"parameters\\\": {\\\"type\\\": \\\"object\\\", \\\"properties\\\": {\\\"query\\\": {\\\"type\\\": \\\"string\\\", \\\"description\\\": \\\"query\\\"}, \\\"filters\\\": {\\\"items\\\": {\\\"type\\\": \\\"object\\\"}, \\\"type\\\": \\\"array\\\", \\\"default\\\": null, \\\"description\\\": \\\"filters\\\"}}, \\\"required\\\": [\\\"query\\\"]}}}, {\\\"type\\\": \\\"function\\\", \\\"function\\\": {\\\"name\\\": \\\"casaman_ssa__demos__sku_sample_translator\\\", \\\"strict\\\": true, \\\"parameters\\\": {\\\"properties\\\": {\\\"old_sku\\\": {\\\"anyOf\\\": [{\\\"type\\\": \\\"string\\\"}, {\\\"type\\\": \\\"null\\\"}], \\\"description\\\": \\\"The old SKU in the format \\\\\\\"OLD-XXX-YYYY\\\\\\\".\\\", \\\"title\\\": \\\"Old Sku\\\"}}, \\\"title\\\": \\\"casaman_ssa__demos__sku_sample_translator__params\\\", \\\"type\\\": \\\"object\\\", \\\"additionalProperties\\\": false, \\\"required\\\": [\\\"old_sku\\\"]}, \\\"description\\\": \\\"Translates a pre-2024 SKU formatted as \\\\\\\"OLD-XXX-YYYY\\\\\\\" to the new SKU format \\\\\\\"NEW-YYYY-XXX\\\\\\\".\\\"}}], \\\"tool_choice\\\": \\\"auto\\\", \\\"temperature\\\": 0.01, \\\"max_tokens\\\": 1500}\", \"mlflow.spanOutputs\": \"{\\\"id\\\": \\\"chatcmpl_f8a9b201-62ca-4c70-b72c-98d0edc1030c\\\", \\\"choices\\\": [{\\\"finish_reason\\\": \\\"stop\\\", \\\"index\\\": 0, \\\"logprobs\\\": null, \\\"message\\\": {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build 
customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"refusal\\\": null, \\\"role\\\": \\\"assistant\\\", \\\"audio\\\": null, \\\"function_call\\\": null, \\\"tool_calls\\\": null}}], \\\"created\\\": 1734543867, \\\"model\\\": \\\"meta-llama-3.3-70b-instruct-121024\\\", \\\"object\\\": \\\"chat.completion\\\", \\\"service_tier\\\": null, \\\"system_fingerprint\\\": null, \\\"usage\\\": {\\\"completion_tokens\\\": 118, \\\"prompt_tokens\\\": 6128, \\\"total_tokens\\\": 6246, \\\"completion_tokens_details\\\": null, \\\"prompt_tokens_details\\\": null}}\"}, \"events\": []}], \"request\": \"{\\\"context\\\": \\\"\\\", \\\"model_input\\\": \\\" messages\\\\n0 [{'content': 'What are the critical needs for ...\\\", \\\"params\\\": null}\", \"response\": \"{\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"messages\\\": [{\\\"content\\\": \\\"What are the critical needs for IT and business when it comes to implementing a customer data platform?\\\", \\\"role\\\": \\\"user\\\"}, {\\\"tool_calls\\\": [{\\\"id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"function\\\": {\\\"arguments\\\": \\\"{ \\\\\\\"query\\\\\\\": \\\\\\\"critical needs for IT and business when implementing a customer data platform\\\\\\\", \\\\\\\"filters\\\\\\\": [] }\\\", \\\"name\\\": \\\"search_product_docs\\\"}, \\\"type\\\": \\\"function\\\"}], \\\"content\\\": null, \\\"role\\\": \\\"assistant\\\"}, {\\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. 
The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\", \\\"tool_responses\\\": [{\\\"tool_call_id\\\": \\\"call_5daaee03-ba46-4038-bb14-1189c8d60f61\\\", \\\"role\\\": \\\"tool\\\", \\\"content\\\": \\\"[{\\\\\\\"page_content\\\\\\\": \\\\\\\"#### eBook\\\\\\\\n\\\\\\\\n# The CDP Build vs Buy Guide:\\\\\\\\n\\\\\\\\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## The Need for a Customer Data Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nOrganizations need to deliver personalized experiences to their customers to stay ahead\\\\\\\\nof the curve \\u2014 that means they need a customer data platform (CDP). Through a CDP, data\\\\\\\\nfrom every touch point, along with third-party information, is brought together to provide\\\\\\\\na unified view of the customer. This enables your marketing team to analyze, identify and\\\\\\\\nactivate customers with targeted content.\\\\\\\\n\\\\\\\\nThe key question for all IT teams at these organizations is whether to build or to buy.\\\\\\\\n\\\\\\\\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\\\\\\\\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\\\\\\\\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\\\\\\\\nfastest path to a solution.\\\\\\\\n\\\\\\\\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\\\\\\\\nexisting marketing and analytics systems.. 
The cost of adding another system to the\\\\\\\\nlandscape and the redundancy of sensitive customer data creates a governance challenge\\\\\\\\nthat has immediate consequences.\\\\\\\\n\\\\\\\\n**Critical IT Needs** **Critical Business Needs**\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep control of data access and\\\\\\\\ngovernance; ability to architecture a\\\\\\\\ncustomer data stack with decisions on\\\\\\\\nwhere data is stored and where queries\\\\\\\\nare executed\\\\\\\\n\\\\\\\\n\\\\\\\\nGet customer data access via a no-code\\\\\\\\ninterface to generate insights; build customer\\\\\\\\nexperiences and activate data within\\\\\\\\nbusiness applications\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\\\\\\\\nside or the other unaddressed \\u2014 which is why so many organizations who have built a CDP\\\\\\\\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\\\\\\\\n\\\\\\\\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\\\\\\\\n**both sides of the debate and provide organizations a third choice of both building and**\\\\\\\\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\\\\\\\\nthe business with no-code and ease of use interface along with the flexibility and centralized\\\\\\\\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\\\\\\\\nbuying, we\\u2019ve opened the door to finding the right balance of approaches for our customer\\\\\\\\norganizations, helping organizations find greater success in their personalization journey.\\\\\\\\n\\\\\\\\n**\\u201cWe made an attempt to internally build a CDP platform and while we**\\\\\\\\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\\\\\\\\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\\\\\\\\n**or offer a campaign interface to our product marketers that could empower**\\\\\\\\n**them to create and manage those journeys. It was going to take at least two**\\\\\\\\n**years for us to build all of that functionality in house.\\u201d**\\\\\\\\n\\\\\\\\n\\u2013 Sravan Gupta, Senior Manager of GTM Systems, Atlassian\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n## Combining the Build and Buy Approaches\\\\\\\\n\\\\\\\\n\\\\\\\\nBringing together the best of build and buy involves the deployment of the CDP alongside or\\\\\\\\nwithin the lakehouse platform. There are three approaches to this:\\\\\\\\n\\\\\\\\n**Bundled** **Composable**\\\\\\\\n\\\\\\\\n**1. Bundled** **2. Hybrid** **3. 
Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n(Local & Views)\\\\\\\\n\\\\\\\\n\\\\\\\\nQuery\\\\\\\\nVirtualization\\\\\\\\n\\\\\\\\nMetadata\\\\\\\\n\\\\\\\\n\\\\\\\\nData Copy\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\nStorage\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nLakehouse\\\\\\\\n\\\\\\\\n\\\\\\\\nCompute Compute\\\\\\\\n\\\\\\\\nStorage Storage\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDeployment Type\\\\\\\\n\\\\\\\\n**Bundled**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Hybrid**\\\\\\\\n\\\\\\\\n**Composable \\u2013**\\\\\\\\n**Lakehouse-Only**\\\\\\\\n\\\\\\\\n\\\\\\\\nDescription\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0029832723, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"563f0dba5edef5b358685117dfb5a133\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"companies to be multicloud \\u2014 as part of a mandate to reduce risk to the consumer\\u2019s personal information.\\\\\\\\n\\\\\\\\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\\\\\\\\n\\\\\\\\nincreasingly important.\\\\\\\\n\\\\\\\\n**Modernize business applications**\\\\\\\\n\\\\\\\\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple \\u201clift and shift\\u201d\\\\\\\\n\\\\\\\\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\\\\\\\\n\\\\\\\\ndiffer in the way that they handle security, resiliency, scalability and failover. Their application designs\\\\\\\\n\\\\\\\\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\\\\\\\\n\\\\\\\\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\\\\\\\\n\\\\\\\\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\\\\\\\\n\\\\\\\\nservices and APIs to easily provide access to an application\\u2019s functionality.\\\\\\\\n\\\\\\\\nCloud-based architectures, commodity databases and software application development frameworks make\\\\\\\\n\\\\\\\\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\\\\\\\\n\\\\\\\\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\\\\\\\\n\\\\\\\\na backing database) has become straightforward with the latest tooling available to your application\\\\\\\\n\\\\\\\\ndevelopment teams.\\\\\\\\n\\\\\\\\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\\\\\\\\n\\\\\\\\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\\\\\\\\n\\\\\\\\napplications that generate and store a significant amount of the data consumed within an organization. Using\\\\\\\\n\\\\\\\\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\\\\\\\\n\\\\\\\\n\\\\\\\\n\\u201cWe are on an amazing journey. Being among\\\\\\\\n\\\\\\\\nthe fastest-growing enterprise software cloud\\\\\\\\n\\\\\\\\ncompanies on record was unimaginable when\\\\\\\\n\\\\\\\\nwe started Databricks. 
To get here, we\\u2019ve stayed\\\\\\\\n\\\\\\\\nfocused on the three big bets we made when\\\\\\\\n\\\\\\\\nfounding the company \\u2014 cloud, open source\\\\\\\\n\\\\\\\\nand machine learning. Fast-forward seven years,\\\\\\\\n\\\\\\\\nthousands of data teams around the globe are\\\\\\\\n\\\\\\\\nworking better together on Databricks.\\u201d\\\\\\\\n\\\\\\\\n**Ali Ghodsi**\\\\\\\\n\\\\\\\\nCo-founder and CEO\\\\\\\\n\\\\\\\\nDatabricks\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\\\\\\\\n\\\\\\\\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\\\\\\\\n\\\\\\\\nother applications within your environment to store copies of the data \\u2014 unless absolutely necessary for\\\\\\\\n\\\\\\\\nperformance reasons. In this case, it is best to \\u201ccache\\u201d the data for use in the non-SOR application and sync\\\\\\\\n\\\\\\\\nthe data from the actual SOR.\\\\\\\\n\\\\\\\\nData from these SORs should be made available in three ways:\\\\\\\\n\\\\\\\\n**1.** \\\\\\\\u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007Ensure that copies of the data land in the data lake.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\\\\\\\\n\\\\\\\\nconsumption by downstream applications.\\\\\\\\n\\\\\\\\n**Move toward real-time decisioning**\\\\\\\\n\\\\\\\\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\\\\\\\\n\\\\\\\\nand the second is to view data as an individual event. This so-called \\u201ctime value of data\\u201d is an important\\\\\\\\n\\\\\\\\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both \\u2014 on\\\\\\\\n\\\\\\\\nthe same data platform.\\\\\\\\n\\\\\\\\nOn the one hand, data in aggregate becomes more valuable over time \\u2014 as you collect more of it. The\\\\\\\\n\\\\\\\\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\\\\\\\\n\\\\\\\\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\\\\\\\\n\\\\\\\\nnewly created or arriving data event gives you the opportunity to make decisions \\u2014 in the moment \\u2014 that\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027576878, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"25ef18d715b47231f6594d1da80303e9\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"and security environment but nothing more\\\\\\\\n\\\\\\\\n\\\\\\\\u0007It\\u2019s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\\\\\\\\n\\\\\\\\nof tools in play or streamlining the user experience\\\\\\\\n\\\\\\\\n\\\\\\\\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\\\\\\\\n\\\\\\\\npartnership model, the ability to influence the roadmap and professional services support\\\\\\\\n\\\\\\\\nFor these reasons and more, it\\u2019s worth considering an architecture and procurement strategy that centers\\\\\\\\n\\\\\\\\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\\\\\\\\n\\\\\\\\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nDatabricks is a leading data and AI company \\u2014\\\\\\\\n\\\\\\\\n\\\\\\\\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\\\\\\\\n\\\\\\\\ndata processing, validation and curation should work. It\\u2019s the integration between the discrete functions\\\\\\\\n\\\\\\\\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\\\\\\\\n\\\\\\\\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\\\\\\\\n\\\\\\\\nconsequences of not doing the integration properly can be serious \\u2014 in terms of security, compliance,\\\\\\\\n\\\\\\\\nefficiency, cost, etc.\\\\\\\\n\\\\\\\\n\\\\\\\\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\\\\\\\\n\\\\\\\\n\\\\\\\\nSo, find a vendor that you can develop a true partnership with \\u2014 one that is more likely to take feedback\\\\\\\\n\\\\\\\\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\\\\\\\\n\\\\\\\\ntake from both parties \\u2014 sometimes calling for an organization to adjust their processes to better fit how\\\\\\\\n\\\\\\\\nthe platform works. There are many instances where a given business process could be simplified or recast\\\\\\\\n\\\\\\\\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\\\\\\\\n\\\\\\\\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\\\\\\\\n\\\\\\\\napply to the broadest set of customers.\\\\\\\\n\\\\\\\\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\\\\\\\\n\\\\\\\\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\\\\\\\\n\\\\\\\\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\\\\\\\\n\\\\\\\\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\\\\\\\\n\\\\\\\\nand collaboration helps improve the user experience and decreases time to market.\\\\\\\\n\\\\\\\\n\\\\\\\\n[software](https://databricks.com/product/open-source) that runs our platform \\u2014 and as a result of\\\\\\\\n\\\\\\\\nlistening to the needs of thousands of customers\\\\\\\\n\\\\\\\\nand having our engineers work side by side with\\\\\\\\n\\\\\\\\ncustomer teams to deliver real business value using\\\\\\\\n\\\\\\\\ndata and AI.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Unified platform, unified personas**\\\\\\\\n\\\\\\\\nDeploying a unified data platform \\u2014 like the Databricks Lakehouse Platform, which implements a modern\\\\\\\\n\\\\\\\\ndata stack \\u2014 will provide an integrated suite of tools for the full range of personas in your organization,\\\\\\\\n\\\\\\\\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\\\\\\\\n\\\\\\\\nincrease productivity and reduce risk because you\\u2019ll be better able to share the key aspects of data\\\\\\\\n\\\\\\\\npipelining \\u2014 including ingestion, partitioning, curation, SQL analytics, reporting, and model development\\\\\\\\n\\\\\\\\nand deployment. All the work streams function off a single view of the data, and the handoffs between\\\\\\\\n\\\\\\\\nsubsystems are well managed.\\\\\\\\n\\\\\\\\nData processing happens in one auditable environment, and the number of copies of data is kept to an\\\\\\\\n\\\\\\\\nabsolute minimum \\u2014 with each user benefiting from the data assets created by others. Redundant work\\\\\\\\n\\\\\\\\nis eliminated.\\\\\\\\n\\\\\\\\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\\\\\\\\n\\\\\\\\nworking with rather than collecting the data. It\\u2019s difficult to decide what algorithm will work best \\u2014 shifting\\\\\\\\n\\\\\\\\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\\\\\\\\n\\\\\\\\nAnother challenge is that enterprise data changes rapidly. 
New fields are added or existing fields are typed\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0027022872, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"eaff954d65653182857574e043c105f1\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"2020\\u20132025 \\u2014 combined with low-cost cloud storage, compute, open source software and machine learning\\\\\\\\n\\\\\\\\n(ML) environments \\u2014 have caused a major shift in how organizations leverage data and AI to improve data\\\\\\\\n\\\\\\\\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\\\\\\\\n\\\\\\\\nEvery organization is working to improve business outcomes while effectively managing a variety of risks \\u2014\\\\\\\\n\\\\\\\\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\\\\\\\\n\\\\\\\\nYour organization\\u2019s data and the systems that process it play a critical role in not only enabling your financial\\\\\\\\n\\\\\\\\ngoals but also in minimizing these seven key business risks.\\\\\\\\n\\\\\\\\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\\\\\\\\n\\\\\\\\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\\\\\\\\n\\\\\\\\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\\\\\\\\n\\\\\\\\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\\\\\\\\n\\\\\\\\nand platforms. These initiatives always require a major financial investment and, therefore, need to yield a\\\\\\\\n\\\\\\\\nsignificant return on investment (ROI) \\u2014 one that starts in months, not years.\\\\\\\\n\\\\\\\\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\\\\\\\\n\\\\\\\\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\\\\\\\\n\\\\\\\\nto deliver on their data strategy \\u2014 including how to deploy a modern data architecture, leverage data\\\\\\\\n\\\\\\\\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\\\\\\\\n\\\\\\\\nidentify and execute on AI opportunities.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\\\\\\\\n\\\\\\\\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\\\\\\\\n\\\\\\\\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. 
Today,\\\\\\\\n\\\\\\\\norganizations have the option of moving away from closed, proprietary systems offered by a variety\\\\\\\\n\\\\\\\\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\\\\\\\\n\\\\\\\\nindustry standards.\\\\\\\\n\\\\\\\\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\\\\\\\\n\\\\\\\\nwe\\u2019ve hired industry experts and thought leaders to help organizations better understand the steps involved\\\\\\\\n\\\\\\\\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\\\\\\\\n\\\\\\\\narchitecture, which decouples data storage from compute while providing the best price/performance\\\\\\\\n\\\\\\\\nmetrics for all your data workloads \\u2014 including data warehousing. We have captured the lessons learned\\\\\\\\n\\\\\\\\nand summarized them in this series of Executive Guides \\u2014 which are designed to serve as blueprints for\\\\\\\\n\\\\\\\\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\\\\\\\\n\\\\\\\\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\\\\\\\\n\\\\\\\\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\\\\\\\\n\\\\\\\\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\\\\\\\\n\\\\\\\\nshown in Figure 1.\\\\\\\\n\\\\\\\\n\\\\\\\\n###### Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nWarehousing\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nEngineering\\\\\\\\n\\\\\\\\n\\\\\\\\nData\\\\\\\\nStreaming\\\\\\\\n\\\\\\\\n\\\\\\\\nData S\\ufffdien\\ufffd\\ufffd\\\\\\\\nand ML\\\\\\\\n\\\\\\\\n\\\\\\\\nUnity Catalog\\\\\\\\nFine-grained governance for data and AI\\\\\\\\n\\\\\\\\nDelta Lake\\\\\\\\nData relia)ility and .erfor2ance\\\\\\\\n\\\\\\\\nCloud Data Lake\\\\\\\\nAll structured and unstructured data\\\\\\\\n\\\\\\\\n**Figure 1:**\\\\\\\\nThe Databricks Lakehouse Platform\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**The lakehouse architecture benefits organizations in several ways:**\\\\\\\\n\\\\\\\\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\\\\\\\\n\\\\\\\\n**2.** \\\\\\\\u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\\\\\\\\n\\\\\\\\n**3.** \\\\\\\\u0007It uses open formats and standards that provide greater data portability \\u2014 thus avoiding vendor lock-in.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0025006814, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"f545eff42d3b9ae2b565475f4390ed44\\\\\\\"}, {\\\\\\\"page_content\\\\\\\": \\\\\\\"organization. It\\u2019s worth pressure testing this approach and making sure that a) the requirements truly are\\\\\\\\n\\\\\\\\nunique and b) the development offers the competitive advantage that you need.\\\\\\\\n\\\\\\\\nEven software built on top of open source still requires significant investment in integration and testing.\\\\\\\\n\\\\\\\\nThe integration work is particularly challenging because of the large number of open source libraries that\\\\\\\\n\\\\\\\\nare required in the data science space. 
The question becomes, \\u201cIs this really the area that you want your\\\\\\\\n\\\\\\\\nengineering teams focused on?\\u201d Or would it be better to \\u201coutsource\\u201c this component to a third party?\\\\\\\\n\\\\\\\\n**How long will it take? Can the organization afford to wait?**\\\\\\\\n\\\\\\\\nEven if you decide the software component provides a competitive advantage and is something worth\\\\\\\\n\\\\\\\\nbuilding in-house, the next question that you should ask is, \\u201cHow long will it take?\\u201d There is definitely a\\\\\\\\n\\\\\\\\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\\\\\\\\n\\\\\\\\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\\\\\\\\n\\\\\\\\ntake longer and cost more money than initially planned.\\\\\\\\n\\\\\\\\nThe organization should understand the impact to the overall performance and capabilities of the daily\\\\\\\\n\\\\\\\\necosystem for any features tied to the in-house development effort. Your business partners likely do\\\\\\\\n\\\\\\\\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\\\\\\\\n\\\\\\\\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\\\\\\\\n\\\\\\\\nfeatures and schedule.\\\\\\\\n\\\\\\\\n\\\\\\\\nDatabricks is built on top of popular open source\\\\\\\\n\\\\\\\\nsoftware that it created. Engineering teams can\\\\\\\\n\\\\\\\\nimprove the underpinnings of the Databricks\\\\\\\\n\\\\\\\\nplatform by submitting code via pull request and\\\\\\\\n\\\\\\\\nbecoming committers to the projects. The benefit\\\\\\\\n\\\\\\\\nto organizations is that their engineers contribute\\\\\\\\n\\\\\\\\nto the feature set of the data platform while\\\\\\\\n\\\\\\\\nDatabricks remains responsible for all integration\\\\\\\\n\\\\\\\\nand performance testing plus all the runtime\\\\\\\\n\\\\\\\\nsupport, including failover and disaster recovery.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n**Don\\u2019t forget about the data**\\\\\\\\n\\\\\\\\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\\\\\\\\n\\\\\\\\n\\u201cdata assets\\u201d consumable to the end users or systems. Data insights, model training and model execution\\\\\\\\n\\\\\\\\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\\\\\\\\n\\\\\\\\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\\\\\\\\n\\\\\\\\nsets from multiple lines of business or departments. Focusing your data engineering and data science\\\\\\\\n\\\\\\\\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\\\\\\\\n\\\\\\\\ncreating true competitive advantage.\\\\\\\\n\\\\\\\\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\\\\\\\\n\\\\\\\\nserve up data for analysis should not be underestimated. The value of this work is equally important to\\\\\\\\n\\\\\\\\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\\\\\\\\n\\\\\\\\nled by the CDO and CIO. 
This has much more to do with the data than it does with the ability to have your\\\\\\\\n\\\\\\\\nengineers innovate on components that don\\u2019t bring true competitive advantage.\\\\\\\\n\\\\\\\\n\\\\\\\\n-----\\\\\\\\n\\\\\\\\n#### 9. Allocate, monitor and optimize costs\\\\\\\\n\\\\\\\\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type \\u2014 the Boeing 737\\\\\\\\n\\\\\\\\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance \\u2014 requiring\\\\\\\\n\\\\\\\\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\\\\\\\\n\\\\\\\\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\\\\\\\\n\\\\\\\\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\\\\\\\\n\\\\\\\\nand increase efficiency, Southwest created their own version of a unified platform \\u2014 getting all their flight-\\\\\\\\n\\\\\\\\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\\\\\\\\n\\\\\\\\ncould be easily shared and reused by other members of the team. The more the team used the unified\\\\\\\\n\\\\\\\\nplatform, the more they collaborated and their level of expertise increased.\\\\\\\", \\\\\\\"metadata\\\\\\\": {\\\\\\\"similarity_score\\\\\\\": 0.0024809677, \\\\\\\"doc_uri\\\\\\\": \\\\\\\"/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf\\\\\\\"}, \\\\\\\"id\\\\\\\": \\\\\\\"b5f4bd0258226132f89697f6e660b09b\\\\\\\"}]\\\"}], \\\"role\\\": \\\"tool\\\", \\\"name\\\": \\\"User\\\"}, {\\\"content\\\": \\\"The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.\\\", \\\"role\\\": \\\"assistant\\\", \\\"name\\\": \\\"Assistant\\\"}]}\"}}", + null, + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf", + "The critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. 
Additionally, organizations should consider a unified data platform that enables seamless integration with point solutions, rather than a suite of discrete tools that require integration work and may no longer be category leaders over the long haul.", + "yes", + null, + "yes", + "No harmful content detected in response", + "yes", + "The expected response states that IT needs to maintain control over data access and governance, which is supported by the response as it mentions 'keeping control of data access and governance'. The expected response also states that IT needs the ability to make decisions on data storage and query execution, which is supported by the response as it mentions 'IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed'. The expected response further states that business needs to access customer data through a no-code interface to generate insights, which is supported by the response as it mentions 'getting customer data access via a no-code interface to generate insights'. Finally, the expected response states that business needs to build customer experiences and activate data within business applications, which is supported by the response as it mentions 'build customer experiences' and 'business needs to activate data within business applications'. The response is correct.", + 5.651, + 7435.0, + 7286.0, + 149.0, + null, + null, + [ + [ + "#### eBook\n\n# The CDP Build vs Buy Guide:\n\n### How to Compose Your CDP with the Databricks Lakehouse and ActionIQ\n\n\n-----\n\n## The Need for a Customer Data Platform\n\n\nOrganizations need to deliver personalized experiences to their customers to stay ahead\nof the curve — that means they need a customer data platform (CDP). Through a CDP, data\nfrom every touch point, along with third-party information, is brought together to provide\na unified view of the customer. This enables your marketing team to analyze, identify and\nactivate customers with targeted content.\n\nThe key question for all IT teams at these organizations is whether to build or to buy.\n\nA CDP that sounds like music to the ears of business leaders may be perceived as noise\nby enterprise IT leaders. The business side of the house needs immediate enablement, and\nan out-of-the-box system dedicated to the specialized needs of marketers seems like the\nfastest path to a solution.\n\nBut for IT, the CDP is yet another system, bringing stack baggage and redundancies to\nexisting marketing and analytics systems.. 
The cost of adding another system to the\nlandscape and the redundancy of sensitive customer data creates a governance challenge\nthat has immediate consequences.\n\n**Critical IT Needs** **Critical Business Needs**\n\n\nKeep control of data access and\ngovernance; ability to architecture a\ncustomer data stack with decisions on\nwhere data is stored and where queries\nare executed\n\n\nGet customer data access via a no-code\ninterface to generate insights; build customer\nexperiences and activate data within\nbusiness applications\n\n\n-----\n\nThe question of whether to build or buy seems to leave legitimate needs and concerns by one\nside or the other unaddressed — which is why so many organizations who have built a CDP\nhave expressed dissatisfaction regardless of which side of the fence they came down upon.\n\n**At both ActionIQ and Databricks, we believe the best path forward is to acknowledge**\n**both sides of the debate and provide organizations a third choice of both building and**\n**buying.** The ActionIQ customer data platform built on the Databricks Lakehouse provides\nthe business with no-code and ease of use interface along with the flexibility and centralized\ngovernance IT desires. By shifting the conversation from building or buying to building _and_\nbuying, we’ve opened the door to finding the right balance of approaches for our customer\norganizations, helping organizations find greater success in their personalization journey.\n\n**“We made an attempt to internally build a CDP platform and while we**\n**could do basic SQL,** **[audience segmentation](https://www.actioniq.com/solutions/audience-segmentation/)** **and activation across multiple**\n**channels, by no means were we able to orchestrate an** **[omnichannel journey](https://www.actioniq.com/blog/omnichannel-customer-journey/)**\n**or offer a campaign interface to our product marketers that could empower**\n**them to create and manage those journeys. It was going to take at least two**\n**years for us to build all of that functionality in house.”**\n\n– Sravan Gupta, Senior Manager of GTM Systems, Atlassian\n\n\n-----\n\n## Combining the Build and Buy Approaches\n\n\nBringing together the best of build and buy involves the deployment of the CDP alongside or\nwithin the lakehouse platform. There are three approaches to this:\n\n**Bundled** **Composable**\n\n**1. Bundled** **2. Hybrid** **3. Lakehouse-Only**\n\n\nCompute\n\nStorage\n\n\nCompute\n\nStorage\n(Local & Views)\n\n\nQuery\nVirtualization\n\nMetadata\n\n\nData Copy\n\n\nLakehouse\n\nStorage\n\n\nLakehouse\n\n\nLakehouse\n\n\nCompute Compute\n\nStorage Storage\n\n\n-----\n\nDeployment Type\n\n**Bundled**\n\n**Composable –**\n**Hybrid**\n\n**Composable –**\n**Lakehouse-Only**\n\n\nDescription", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/build-vs-buy-guide-databricks-action-iq.pdf" + ], + [ + "companies to be multicloud — as part of a mandate to reduce risk to the consumer’s personal information.\n\nAs a result, data portability and the ability to run workloads on different cloud providers are becoming\n\nincreasingly important.\n\n**Modernize business applications**\n\nAs organizations begin to accelerate the adoption of the cloud, they should avoid a simple “lift and shift”\n\napproach. The majority of on-premises applications are not built with the cloud in mind. They usually\n\ndiffer in the way that they handle security, resiliency, scalability and failover. 
Their application designs\n\noften store data in ways that make it difficult to adhere to regulatory requirements such as the GDPR and\n\nCCPA standards. Finally, the features and capabilities of the application may be monolithic in nature and,\n\ntherefore, tightly coupled. In contrast, modern cloud applications are modular in design and use RESTful web\n\nservices and APIs to easily provide access to an application’s functionality.\n\nCloud-based architectures, commodity databases and software application development frameworks make\n\nit easier for developers to build scalable, secure end-to-end applications to run all your internal business\n\nprocesses. Building n-tiered applications (e.g., mobile and web-based applications with RESTful APIs and\n\na backing database) has become straightforward with the latest tooling available to your application\n\ndevelopment teams.\n\nAs a first step, organizations should inventory their business-critical applications, prioritize them based\n\non business impact and modernize them in a consistent manner for cloud-based deployments. It is these\n\napplications that generate and store a significant amount of the data consumed within an organization. Using\n\na consistent approach to cloud-based application design makes it easier to extract data when it is needed.\n\n\n“We are on an amazing journey. Being among\n\nthe fastest-growing enterprise software cloud\n\ncompanies on record was unimaginable when\n\nwe started Databricks. To get here, we’ve stayed\n\nfocused on the three big bets we made when\n\nfounding the company — cloud, open source\n\nand machine learning. Fast-forward seven years,\n\nthousands of data teams around the globe are\n\nworking better together on Databricks.”\n\n**Ali Ghodsi**\n\nCo-founder and CEO\n\nDatabricks\n\n\n-----\n\nThe next step is to identify which applications are viewed as the system of record (SOR) for a given data set.\n\nA good architectural principle is to only allow data sets to be stored inside their declared SOR and not allow\n\nother applications within your environment to store copies of the data — unless absolutely necessary for\n\nperformance reasons. In this case, it is best to “cache” the data for use in the non-SOR application and sync\n\nthe data from the actual SOR.\n\nData from these SORs should be made available in three ways:\n\n**1.** \u0007 Expose a set of RESTful APIs for applications to invoke at any given time.\n\n**2.** \u0007Ensure that copies of the data land in the data lake.\n\n**3.** \u0007Change data capture (CDC) and other business events should be streamed in real time for immediate\n\nconsumption by downstream applications.\n\n**Move toward real-time decisioning**\n\nThe value of data should be viewed through two different lenses. The first is to view data in the aggregate,\n\nand the second is to view data as an individual event. This so-called “time value of data” is an important\n\nconcept in the world of data, analytics and AI. To be effective, you need to be able to leverage both — on\n\nthe same data platform.\n\nOn the one hand, data in aggregate becomes more valuable over time — as you collect more of it. The\n\naggregate data provides the ability to look back in time and see the complete history of an aspect of your\n\nbusiness and to discover trends. Real-time data is most valuable the moment it is captured. 
In contrast, a\n\nnewly created or arriving data event gives you the opportunity to make decisions — in the moment — that", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf" + ], + [ + "and security environment but nothing more\n\n\u0007It’s rare that the tools are evaluated in terms of simplifying the overall architecture, reducing the number\n\nof tools in play or streamlining the user experience\n\n\u0007The vendor backing the tool is evaluated in terms of risk, but not enough focus is spent on the\n\npartnership model, the ability to influence the roadmap and professional services support\n\nFor these reasons and more, it’s worth considering an architecture and procurement strategy that centers\n\non selecting a data platform that enables seamless integration with point solutions rather than a suite of\n\ndiscrete tools that require integration work and may no longer be category leaders over the long haul.\n\n\n-----\n\nDatabricks is a leading data and AI company —\n\n\nKeep in mind that data platforms work well because the vendor took an opinionated point of view of how\n\ndata processing, validation and curation should work. It’s the integration between the discrete functions\n\nof the platform that saves time, conserves effort and improves the user experience. Many companies try\n\nto take on the integration of different technology stacks, which increases risk, cost and complexity. The\n\nconsequences of not doing the integration properly can be serious — in terms of security, compliance,\n\nefficiency, cost, etc.\n\n\npartly due to the innovations in the [open source](https://databricks.com/product/open-source)\n\n\nSo, find a vendor that you can develop a true partnership with — one that is more likely to take feedback\n\nand incorporate your requirements into their platform product roadmap. This will require some give-and-\n\ntake from both parties — sometimes calling for an organization to adjust their processes to better fit how\n\nthe platform works. There are many instances where a given business process could be simplified or recast\n\nto work with the platform, as is. Sometimes it will require the vendor to add features that support your\n\nprocesses. The vendor will always be market driven and will want to build features in such a way that they\n\napply to the broadest set of customers.\n\nThe final point to consider is that it takes a substantial amount of time to become an expert user of a given\n\ntool. Users must make a significant investment to learn how the tool works and the most efficient way of\n\nperforming their job. 
The more discrete tools in an environment, the more challenging this becomes.\n\nMinimizing the number of tools and their different interfaces, styles of interaction and approach to security\n\nand collaboration helps improve the user experience and decreases time to market.\n\n\n[software](https://databricks.com/product/open-source) that runs our platform — and as a result of\n\nlistening to the needs of thousands of customers\n\nand having our engineers work side by side with\n\ncustomer teams to deliver real business value using\n\ndata and AI.\n\n\n-----\n\n**Unified platform, unified personas**\n\nDeploying a unified data platform — like the Databricks Lakehouse Platform, which implements a modern\n\ndata stack — will provide an integrated suite of tools for the full range of personas in your organization,\n\nincluding business analysts, SQL developers, data engineers and data scientists. You will immediately\n\nincrease productivity and reduce risk because you’ll be better able to share the key aspects of data\n\npipelining — including ingestion, partitioning, curation, SQL analytics, reporting, and model development\n\nand deployment. All the work streams function off a single view of the data, and the handoffs between\n\nsubsystems are well managed.\n\nData processing happens in one auditable environment, and the number of copies of data is kept to an\n\nabsolute minimum — with each user benefiting from the data assets created by others. Redundant work\n\nis eliminated.\n\nThe 80/20 dilemma for data scientists shifts to a healthier ratio, and they now are able to spend more time\n\nworking with rather than collecting the data. It’s difficult to decide what algorithm will work best — shifting\n\nthe 80/20 ratio allows the data scientist to try out multiple algorithms to solve a problem.\n\nAnother challenge is that enterprise data changes rapidly. New fields are added or existing fields are typed", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf" + ], + [ + "2020–2025 — combined with low-cost cloud storage, compute, open source software and machine learning\n\n(ML) environments — have caused a major shift in how organizations leverage data and AI to improve data\n\ngovernance and the user experience, plus satisfy more AI/ML-based use cases to drive future growth.\n\nEvery organization is working to improve business outcomes while effectively managing a variety of risks —\n\nincluding economic, compliance, security and fraud, financial, reputational, operational and competitive risk.\n\nYour organization’s data and the systems that process it play a critical role in not only enabling your financial\n\ngoals but also in minimizing these seven key business risks.\n\nBusinesses have realized that their legacy information technology (IT) platforms are not able to scale and\n\nmeet the increasing demands for better data analytics. As a result, they are looking to transform how their\n\norganizations use and process data. Successful data transformation initiatives for data, analytics and AI\n\ninvolve not only the design of hardware and software systems but also the alignment of people, processes\n\nand platforms. 
These initiatives always require a major financial investment and, therefore, need to yield a\n\nsignificant return on investment (ROI) — one that starts in months, not years.\n\nTo guide these initiatives, many organizations are adding the role of chief data officer (CDO) to their C-suite.\n\nDespite this structural change and focused resources, [87% of organizations](https://databricks.com/discover/mit-infographic) still face many challenges\n\nto deliver on their data strategy — including how to deploy a modern data architecture, leverage data\n\nefficiently and securely, stay compliant with an ever-increasing set of regulations, hire the right talent, and\n\nidentify and execute on AI opportunities.\n\n\n-----\n\nTo successfully lead data and AI transformation initiatives, organizations need to develop and execute\n\na comprehensive strategy that enables them to easily deploy a modern data architecture, unlock the\n\nfull potential of all their data, and future-proof their investments to provide the greatest ROI. Today,\n\norganizations have the option of moving away from closed, proprietary systems offered by a variety\n\nof cloud vendors and adopting a strategy that emphasizes open, nonproprietary solutions built using\n\nindustry standards.\n\nAt Databricks, we have helped over 7,000 companies achieve data, analytics and AI breakthroughs, and\n\nwe’ve hired industry experts and thought leaders to help organizations better understand the steps involved\n\nin successful digital transformation initiatives. We are the first vendor to propose the data lakehouse\n\narchitecture, which decouples data storage from compute while providing the best price/performance\n\nmetrics for all your data workloads — including data warehousing. We have captured the lessons learned\n\nand summarized them in this series of Executive Guides — which are designed to serve as blueprints for\n\nCIOs, CDOs, CTOs and other data and technology executives to implement successful digital transformation\n\ninitiatives for data, analytics and AI using a _modern data stack_ . Databricks is the first company to deliver a\n\nunified data platform that realizes the data lakehouse architecture and enables the data personas in your\n\norganization to run their data, analytics and AI workloads in a simple, open and collaborative environment, as\n\nshown in Figure 1.\n\n\n###### Lakehouse Platform\n\n\nData\nWarehousing\n\n\nData\nEngineering\n\n\nData\nStreaming\n\n\nData S�ien��\nand ML\n\n\nUnity Catalog\nFine-grained governance for data and AI\n\nDelta Lake\nData relia)ility and .erfor2ance\n\nCloud Data Lake\nAll structured and unstructured data\n\n**Figure 1:**\nThe Databricks Lakehouse Platform\n\n\n-----\n\n**The lakehouse architecture benefits organizations in several ways:**\n\n**1.** It leverages low-cost cloud object stores to store ALL enterprise data.\n\n**2.** \u0007It provides the ability to run different data workloads efficiently and in a cost-effective manner.\n\n**3.** \u0007It uses open formats and standards that provide greater data portability — thus avoiding vendor lock-in.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf" + ], + [ + "organization. 
It’s worth pressure testing this approach and making sure that a) the requirements truly are\n\nunique and b) the development offers the competitive advantage that you need.\n\nEven software built on top of open source still requires significant investment in integration and testing.\n\nThe integration work is particularly challenging because of the large number of open source libraries that\n\nare required in the data science space. The question becomes, “Is this really the area that you want your\n\nengineering teams focused on?” Or would it be better to “outsource“ this component to a third party?\n\n**How long will it take? Can the organization afford to wait?**\n\nEven if you decide the software component provides a competitive advantage and is something worth\n\nbuilding in-house, the next question that you should ask is, “How long will it take?” There is definitely a\n\ntime-to-market consideration, and the build vs. buy decision needs to also account for the impact to the\n\nbusiness due to the anticipated delivery schedule. Keep in mind that software development projects usually\n\ntake longer and cost more money than initially planned.\n\nThe organization should understand the impact to the overall performance and capabilities of the daily\n\necosystem for any features tied to the in-house development effort. Your business partners likely do\n\nnot care how the data ecosystem is implemented as long as it works, meets their needs, is performant,\n\nis reliable and is delivered on time. Carefully weigh the trade-offs among competitive advantage, cost,\n\nfeatures and schedule.\n\n\nDatabricks is built on top of popular open source\n\nsoftware that it created. Engineering teams can\n\nimprove the underpinnings of the Databricks\n\nplatform by submitting code via pull request and\n\nbecoming committers to the projects. The benefit\n\nto organizations is that their engineers contribute\n\nto the feature set of the data platform while\n\nDatabricks remains responsible for all integration\n\nand performance testing plus all the runtime\n\nsupport, including failover and disaster recovery.\n\n\n-----\n\n**Don’t forget about the data**\n\nPerhaps the single most important feature of a modern data stack is its ability to help make data sets and\n\n“data assets” consumable to the end users or systems. Data insights, model training and model execution\n\ncannot happen in a reliable manner unless the data they depend on can be trusted and is of good quality.\n\nIn large organizations, revenue opportunities and the ability to reduce risk often depend on merging data\n\nsets from multiple lines of business or departments. Focusing your data engineering and data science\n\nefforts on curating data and creating robust and reliable pipelines likely provides the best chance at\n\ncreating true competitive advantage.\n\nThe amount of work required to properly catalog, schema enforce, quality check, partition, secure and\n\nserve up data for analysis should not be underestimated. The value of this work is equally important to\n\nthe business. The ability to curate data to enable game-changing insights should be the focus of the work\n\nled by the CDO and CIO. This has much more to do with the data than it does with the ability to have your\n\nengineers innovate on components that don’t bring true competitive advantage.\n\n\n-----\n\n#### 9. 
Allocate, monitor and optimize costs\n\nBeginning in 1987, Southwest Airlines famously standardized on flying a single airplane type — the Boeing 737\n\nclass of aircraft. This decision allowed the airline to save on both operations and maintenance — requiring\n\nonly one type of simulator to train pilots, streamlining their spare parts supply chain and maintaining a\n\nmore manageable parts inventory. Their pilots and maintenance crews were effectively interchangeable in\n\ncase anyone ever called in sick or missed a connection. The key takeaway is that in order to reduce costs\n\nand increase efficiency, Southwest created their own version of a unified platform — getting all their flight-\n\nrelated personas to collaborate and operate from the same point of view. Lessons learned on the platform\n\ncould be easily shared and reused by other members of the team. The more the team used the unified\n\nplatform, the more they collaborated and their level of expertise increased.", + "/Volumes/casaman_ssa/demos/volume_databricks_documentation/databricks-pdf/transform-scale-your-organization-with-data-ai-v16-052522.pdf" + ] + ], + "yes", + "The response states that the critical needs for IT and business when implementing a customer data platform (CDP) include keeping control of data access and governance, as well as getting customer data access via a no-code interface to generate insights and build customer experiences. It also mentions that IT needs to architecture a customer data stack with decisions on where data is stored and where queries are executed, while business needs to activate data within business applications. Additionally, it suggests considering a unified data platform that enables seamless integration with point solutions. The retrieved context supports these points by stating that IT needs to keep control of data access and governance and architecture a customer data stack, while business needs to get customer data access via a no-code interface to generate insights and build customer experiences. The document also mentions the importance of a unified data platform for seamless integration. Therefore, the response is fully supported by the retrieved context.", + "yes", + "The ground truth states four critical needs for IT and business when implementing a customer data platform. The retrieved context provides a section titled 'Critical IT Needs' and 'Critical Business Needs' which directly addresses these points. The retrieved context mentions that IT needs to 'keep control of data access and governance' and 'ability to architecture a customer data stack with decisions on where data is stored and where queries are executed,' which supports the first two points of the ground truth. For the business needs, the retrieved context states that business needs 'customer data access via a no-code interface to generate insights' and to 'build customer experiences and activate data within business applications,' which supports the last two points of the ground truth. 
Therefore, all parts of the ground truth are supported by the retrieved context.", + [ + "yes", + "no", + "no", + "no", + "no" + ], + 1.0 + ] + ], + "datasetInfos": [], + "dbfsResultPath": null, + "isJsonSchema": true, + "metadata": {}, + "overflow": true, + "plotOptions": { + "customPlotOptions": {}, + "displayType": "table", + "pivotAggregation": null, + "pivotColumns": null, + "xColumns": null, + "yColumns": null + }, + "removedWidgets": [], + "schema": [ + { + "metadata": "{}", + "name": "request_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "request", + "type": "{\"type\":\"struct\",\"fields\":[{\"name\":\"messages\",\"type\":{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"content\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"role\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true},\"nullable\":true,\"metadata\":{}}]}" + }, + { + "metadata": "{}", + "name": "expected_retrieved_context", + "type": "{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"content\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"doc_uri\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true}" + }, + { + "metadata": "{}", + "name": "expected_facts", + "type": "{\"type\":\"array\",\"elementType\":\"string\",\"containsNull\":true}" + }, + { + "metadata": "{}", + "name": "trace", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "model_error_message", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "source_id", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/overall_assessment/rating", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/overall_assessment/rationale", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/llm_judged/safety/rating", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/llm_judged/safety/rationale", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/llm_judged/correctness/rating", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/llm_judged/correctness/rationale", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "agent/latency_seconds", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "agent/total_token_count", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "agent/total_input_token_count", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "agent/total_output_token_count", + "type": "\"double\"" + }, + { + "metadata": "{}", + "name": "retrieval/llm_judged/context_sufficiency/error_message", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/llm_judged/groundedness/error_message", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "retrieved_context", + "type": "{\"type\":\"array\",\"elementType\":{\"type\":\"struct\",\"fields\":[{\"name\":\"content\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}},{\"name\":\"doc_uri\",\"type\":\"string\",\"nullable\":true,\"metadata\":{}}]},\"containsNull\":true}" + }, + { + "metadata": "{}", + "name": "response/llm_judged/groundedness/rating", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "response/llm_judged/groundedness/rationale", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "retrieval/llm_judged/context_sufficiency/rating", + 
"type": "\"string\"" + }, + { + "metadata": "{}", + "name": "retrieval/llm_judged/context_sufficiency/rationale", + "type": "\"string\"" + }, + { + "metadata": "{}", + "name": "retrieval/ground_truth/document_ratings", + "type": "{\"type\":\"array\",\"elementType\":\"string\",\"containsNull\":true}" + }, + { + "metadata": "{}", + "name": "retrieval/ground_truth/document_recall", + "type": "\"double\"" + } + ], + "type": "table" + } + }, + "output_type": "display_data" + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "🏃 View run amusing-robin-418 at: https://adb-984752964297111.11.azuredatabricks.net/ml/experiments/2822477370659093/runs/daf417a2ec4a4ebaa7ba9622535a6263\n🧪 View experiment at: https://adb-984752964297111.11.azuredatabricks.net/ml/experiments/2822477370659093\n" + ] + } + ], + "source": [ + "evaluation_set = spark.table(agent_storage_config.evaluation_set_uc_table)\n", + "\n", + "mlflow.langchain.autolog(disable=True, log_traces=False)\n", + "mlflow.autogen.autolog(log_traces=False)\n", + "\n", + "with mlflow.start_run():\n", + " logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config)\n", + "\n", + " # Run the agent for these queries, using Agent evaluation to parallelize the calls\n", + " eval_results = mlflow.evaluate(\n", + " model=logged_agent_info.model_uri, # use the MLflow logged Agent\n", + " data=evaluation_set, # Evaluate the Agent for every row of the evaluation set\n", + " model_type=\"databricks-agent\", # use Agent Evaluation\n", + " )\n", + "\n", + " # Show all outputs. Click on a row in this table to display the MLflow Trace.\n", + " display(eval_results.tables[\"eval_results\"])\n", + "\n", + " # Click 'View Evaluation Results' to see the Agent's inputs/outputs + quality evaluation displayed in a UI" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3747832f-d618-4176-9112-b7b3675608b3", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## 2️⃣ Deploy a version of your Agent - either to the Review App or Production\n", + "\n", + "Once you have a version of your Agent that has sufficient quality, you will register the Agent's model from the MLflow Experiment into the Unity Catalog & use Agent Framework's `agents.deploy(...)` command to deploy it. Note these steps are the same for deploying to pre-production (e.g., the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) or production.\n", + "\n", + "By the end of this step, you will have deployed a version of your Agent that you can interact with and share with your business stakeholders for feedback, even if they don't have access to your Databricks workspace:\n", + "\n", + "1. A production-ready scalable REST API deployed as a Model Serving endpoint that logged every request/request/MLflow Trace to a Delta Table.\n", + " - REST API for querying the Agent\n", + " - REST API for sending user feedback from your UI to the Agent\n", + "2. Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) connected to these endpoints.\n", + "3. [Mosiac AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) connected to these endpoints." 
+ ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6f95542e-62da-47a3-bc46-371a845cfa76", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Option 1: Deploy the last agent you logged above" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "41ed720c-b3c7-4477-9251-95f946589e68", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "Registered model 'casaman_ssa.demos.my_agent_autogen' already exists. Creating a new version of this model...\nCreated version '15' of model 'casaman_ssa.demos.my_agent_autogen'.\nWARNING:databricks.agents.utils.mlflow_utils:Agent model version did not have any of the recommended agent signatures. Falling back to checking agent model version compatibility with legacy signatures. Databricks recommends updating and re-logging agents to use the latest signatures; legacy signatures will be removed in the next major MLflow release. See https://docs.databricks.com/en/generative-ai/agent-framework/agent-schema.html for additional details\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/databricks/agents/utils/mlflow_utils.py:130: FutureWarning: ``mlflow.models.rag_signatures.ChatCompletionRequest`` is deprecated. This method will be removed in a future release. Use ``mlflow.types.llm.ChatCompletionRequest`` instead.\n ChatCompletionRequest()\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/models/rag_signatures.py:26: FutureWarning: ``mlflow.models.rag_signatures.Message`` is deprecated. This method will be removed in a future release. Use ``mlflow.types.llm.ChatMessage`` instead.\n messages: list[Message] = field(default_factory=lambda: [Message()])\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/databricks/agents/utils/mlflow_utils.py:133: FutureWarning: ``mlflow.models.rag_signatures.SplitChatMessagesRequest`` is deprecated. This method will be removed in a future release. Use ``mlflow.types.llm.ChatCompletionRequest`` instead.\n split_chat_messages_schema = convert_dataclass_to_schema(SplitChatMessagesRequest())\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/databricks/agents/utils/mlflow_utils.py:184: FutureWarning: ``mlflow.models.rag_signatures.ChatCompletionResponse`` is deprecated. This method will be removed in a future release. Use ``mlflow.types.llm.ChatCompletionResponse`` instead.\n ChatCompletionResponse()\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/models/rag_signatures.py:72: FutureWarning: ``mlflow.models.rag_signatures.ChainCompletionChoice`` is deprecated. This method will be removed in a future release. 
Use ``mlflow.types.llm.ChatChoice`` instead.\n choices: list[ChainCompletionChoice] = field(default_factory=lambda: [ChainCompletionChoice()])\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/mlflow/models/rag_signatures.py:48: FutureWarning: ``mlflow.models.rag_signatures.Message`` is deprecated. This method will be removed in a future release. Use ``mlflow.types.llm.ChatMessage`` instead.\n default_factory=lambda: Message(\n/local_disk0/.ephemeral_nfs/envs/pythonEnv-d1adea7d-6539-428e-9e3a-1dc2ec857a07/lib/python3.11/site-packages/databricks/agents/utils/mlflow_utils.py:187: FutureWarning: ``mlflow.models.rag_signatures.StringResponse`` is deprecated. This method will be removed in a future release. Use ``mlflow.types.llm.ChatCompletionResponse`` instead.\n string_response_schema = convert_dataclass_to_schema(StringResponse())\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\n Deployment of casaman_ssa.demos.my_agent_autogen version 15 initiated. This can take up to 15 minutes and the Review App & Query Endpoint will not work until this deployment finishes.\n\n View status: https://adb-984752964297111.11.azuredatabricks.net/ml/endpoints/agents_casaman_ssa-demos-my_agent_autogen\n Review App: https://adb-984752964297111.11.azuredatabricks.net/ml/review/casaman_ssa.demos.my_agent_autogen/15?o=984752964297111\n \n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "Deployment(model_name='casaman_ssa.demos.my_agent_autogen', model_version='15', endpoint_name='agents_casaman_ssa-demos-my_agent_autogen', served_entity_name='casaman_ssa-demos-my_agent_autogen_15', query_endpoint='https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints/agents_casaman_ssa-demos-my_agent_autogen/served-models/casaman_ssa-demos-my_agent_autogen_15/invocations', endpoint_url='https://adb-984752964297111.11.azuredatabricks.net/ml/endpoints/agents_casaman_ssa-demos-my_agent_autogen', review_app_url='https://adb-984752964297111.11.azuredatabricks.net/ml/review/casaman_ssa.demos.my_agent_autogen/15?o=984752964297111')" + ] + }, + "execution_count": 11, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "from databricks import agents\n", + "\n", + "# Use Unity Catalog as the model registry\n", + "mlflow.set_registry_uri(\"databricks-uc\")\n", + "\n", + "# Register the Agent's model to the Unity Catalog\n", + "uc_registered_model_info = mlflow.register_model(\n", + " model_uri=logged_agent_info.model_uri, name=agent_storage_config.uc_model_name\n", + ")\n", + "\n", + "# Deploy the model to the review app and a model serving endpoint\n", + "agents.deploy(agent_storage_config.uc_model_name, uc_registered_model_info.version)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "18987864-dad9-444b-ad3a-cf9a20feaef5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Option 2: Log the latest copy of the Agent's code/config and deploy it" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b316a3ae-4c2c-462f-99f9-b7b74a250c29", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": 
"" + } + }, + "outputs": [ + { + "output_type": "display_data", + "data": { + "text/plain": [ + "com.databricks.backend.common.rpc.CommandCancelledException\n", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)\n", + "\tat scala.Option.getOrElse(Option.scala:189)\n", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)\n", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)\n", + "\tat scala.collection.immutable.Range.foreach(Range.scala:158)\n", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)\n", + "\tat com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:728)\n", + "\tat com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:446)\n", + "\tat scala.Option.getOrElse(Option.scala:189)\n", + "\tat com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:446)\n", + "\tat com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:464)\n", + "\tat com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:571)\n", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:528)\n", + "\tat com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:633)\n", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:656)\n", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)\n", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)\n", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)\n", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)\n", + "\tat com.databricks.spark.chauffeur.ChauffeurState.withAttributionContext(ChauffeurState.scala:51)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags(AttributionContextTracing.scala:95)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags$(AttributionContextTracing.scala:76)\n", + "\tat com.databricks.spark.chauffeur.ChauffeurState.withAttributionTags(ChauffeurState.scala:51)\n", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:628)\n", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:537)\n", + "\tat com.databricks.spark.chauffeur.ChauffeurState.recordOperationWithResultTags(ChauffeurState.scala:51)\n", + "\tat com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:529)\n", + "\tat com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:495)\n", + "\tat com.databricks.spark.chauffeur.ChauffeurState.recordOperation(ChauffeurState.scala:51)\n", + "\tat com.databricks.spark.chauffeur.ChauffeurState.process(ChauffeurState.scala:553)\n", + "\tat 
com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.handleDriverRequest$1(Chauffeur.scala:830)\n", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.$anonfun$applyOrElse$5(Chauffeur.scala:856)\n", + "\tat com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:633)\n", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:656)\n", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)\n", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)\n", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)\n", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)\n", + "\tat com.databricks.rpc.ServerBackend.withAttributionContext(ServerBackend.scala:22)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags(AttributionContextTracing.scala:95)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags$(AttributionContextTracing.scala:76)\n", + "\tat com.databricks.rpc.ServerBackend.withAttributionTags(ServerBackend.scala:22)\n", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:628)\n", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:537)\n", + "\tat com.databricks.rpc.ServerBackend.recordOperationWithResultTags(ServerBackend.scala:22)\n", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.handleDriverRequestWithUsageLogging$1(Chauffeur.scala:855)\n", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.applyOrElse(Chauffeur.scala:910)\n", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.applyOrElse(Chauffeur.scala:703)\n", + "\tat com.databricks.rpc.ServerBackend.$anonfun$internalReceive0$2(ServerBackend.scala:174)\n", + "\tat com.databricks.rpc.ServerBackend$$anonfun$commonReceive$1.applyOrElse(ServerBackend.scala:200)\n", + "\tat com.databricks.rpc.ServerBackend$$anonfun$commonReceive$1.applyOrElse(ServerBackend.scala:200)\n", + "\tat com.databricks.rpc.ServerBackend.internalReceive0(ServerBackend.scala:171)\n", + "\tat com.databricks.rpc.ServerBackend.$anonfun$internalReceive$1(ServerBackend.scala:147)\n", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:528)\n", + "\tat com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:633)\n", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:656)\n", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)\n", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)\n", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)\n", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)\n", + "\tat 
com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)\n", + "\tat com.databricks.rpc.ServerBackend.withAttributionContext(ServerBackend.scala:22)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags(AttributionContextTracing.scala:95)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags$(AttributionContextTracing.scala:76)\n", + "\tat com.databricks.rpc.ServerBackend.withAttributionTags(ServerBackend.scala:22)\n", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:628)\n", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:537)\n", + "\tat com.databricks.rpc.ServerBackend.recordOperationWithResultTags(ServerBackend.scala:22)\n", + "\tat com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:529)\n", + "\tat com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:495)\n", + "\tat com.databricks.rpc.ServerBackend.recordOperation(ServerBackend.scala:22)\n", + "\tat com.databricks.rpc.ServerBackend.internalReceive(ServerBackend.scala:146)\n", + "\tat com.databricks.rpc.JettyServer$RequestManager.handleRPC(JettyServer.scala:1021)\n", + "\tat com.databricks.rpc.JettyServer$RequestManager.handleRequestAndRespond(JettyServer.scala:942)\n", + "\tat com.databricks.rpc.JettyServer$RequestManager.$anonfun$handleHttp$6(JettyServer.scala:546)\n", + "\tat com.databricks.rpc.JettyServer$RequestManager.$anonfun$handleHttp$6$adapted(JettyServer.scala:515)\n", + "\tat com.databricks.logging.activity.ActivityContextFactory$.$anonfun$withActivityInternal$6(ActivityContextFactory.scala:545)\n", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)\n", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)\n", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)\n", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)\n", + "\tat com.databricks.logging.activity.ActivityContextFactory$.withAttributionContext(ActivityContextFactory.scala:48)\n", + "\tat com.databricks.logging.activity.ActivityContextFactory$.$anonfun$withActivityInternal$3(ActivityContextFactory.scala:545)\n", + "\tat com.databricks.context.integrity.IntegrityCheckContext$ThreadLocalStorage$.withValue(IntegrityCheckContext.scala:73)\n", + "\tat com.databricks.logging.activity.ActivityContextFactory$.withActivityInternal(ActivityContextFactory.scala:523)\n", + "\tat com.databricks.logging.activity.ActivityContextFactory$.withServiceRequestActivity(ActivityContextFactory.scala:175)\n", + "\tat com.databricks.rpc.JettyServer$RequestManager.handleHttp(JettyServer.scala:515)\n", + "\tat com.databricks.rpc.JettyServer$RequestManager.doPost(JettyServer.scala:405)\n", + "\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:665)\n", + "\tat com.databricks.rpc.HttpServletWithPatch.service(HttpServletWithPatch.scala:33)\n", + "\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:750)\n", + "\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799)\n", + "\tat 
org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:554)\n", + "\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:190)\n", + "\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:505)\n", + "\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)\n", + "\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)\n", + "\tat org.eclipse.jetty.server.Server.handle(Server.java:516)\n", + "\tat org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:487)\n", + "\tat org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:732)\n", + "\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:479)\n", + "\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)\n", + "\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)\n", + "\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)\n", + "\tat org.eclipse.jetty.io.ssl.SslConnection$DecryptedEndPoint.onFillable(SslConnection.java:555)\n", + "\tat org.eclipse.jetty.io.ssl.SslConnection.onFillable(SslConnection.java:410)\n", + "\tat org.eclipse.jetty.io.ssl.SslConnection$2.succeeded(SslConnection.java:164)\n", + "\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)\n", + "\tat org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)\n", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338)\n", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315)\n", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173)\n", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)\n", + "\tat org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:409)\n", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool$$anon$1.$anonfun$run$2(InstrumentedQueuedThreadPool.scala:105)\n", + "\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)\n", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)\n", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)\n", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)\n", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)\n", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)\n", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool.withAttributionContext(InstrumentedQueuedThreadPool.scala:45)\n", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool$$anon$1.$anonfun$run$1(InstrumentedQueuedThreadPool.scala:105)\n", + "\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)\n", + "\tat com.databricks.instrumentation.QueuedThreadPoolInstrumenter.trackActiveThreads(QueuedThreadPoolInstrumenter.scala:110)\n", + "\tat com.databricks.instrumentation.QueuedThreadPoolInstrumenter.trackActiveThreads$(QueuedThreadPoolInstrumenter.scala:107)\n", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool.trackActiveThreads(InstrumentedQueuedThreadPool.scala:45)\n", + "\tat 
com.databricks.rpc.InstrumentedQueuedThreadPool$$anon$1.run(InstrumentedQueuedThreadPool.scala:87)\n", + "\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883)\n", + "\tat org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034)\n", + "\tat java.lang.Thread.run(Thread.java:750)" + ] + }, + "metadata": { + "application/vnd.databricks.v1+output": { + "addedWidgets": {}, + "arguments": {}, + "datasetInfos": [], + "jupyterProps": null, + "metadata": { + "errorSummary": "Cancelled" + }, + "removedWidgets": [], + "sqlProps": null, + "stackFrames": [ + "com.databricks.backend.common.rpc.CommandCancelledException", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$5(SequenceExecutionState.scala:136)", + "\tat scala.Option.getOrElse(Option.scala:189)", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3(SequenceExecutionState.scala:136)", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.$anonfun$cancel$3$adapted(SequenceExecutionState.scala:133)", + "\tat scala.collection.immutable.Range.foreach(Range.scala:158)", + "\tat com.databricks.spark.chauffeur.SequenceExecutionState.cancel(SequenceExecutionState.scala:133)", + "\tat com.databricks.spark.chauffeur.ExecContextState.cancelRunningSequence(ExecContextState.scala:728)", + "\tat com.databricks.spark.chauffeur.ExecContextState.$anonfun$cancel$1(ExecContextState.scala:446)", + "\tat scala.Option.getOrElse(Option.scala:189)", + "\tat com.databricks.spark.chauffeur.ExecContextState.cancel(ExecContextState.scala:446)", + "\tat com.databricks.spark.chauffeur.ExecutionContextManagerV1.cancelExecution(ExecutionContextManagerV1.scala:464)", + "\tat com.databricks.spark.chauffeur.ChauffeurState.$anonfun$process$1(ChauffeurState.scala:571)", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:528)", + "\tat com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:633)", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:656)", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)", + "\tat com.databricks.spark.chauffeur.ChauffeurState.withAttributionContext(ChauffeurState.scala:51)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags(AttributionContextTracing.scala:95)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags$(AttributionContextTracing.scala:76)", + "\tat com.databricks.spark.chauffeur.ChauffeurState.withAttributionTags(ChauffeurState.scala:51)", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:628)", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:537)", + "\tat com.databricks.spark.chauffeur.ChauffeurState.recordOperationWithResultTags(ChauffeurState.scala:51)", + "\tat 
com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:529)", + "\tat com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:495)", + "\tat com.databricks.spark.chauffeur.ChauffeurState.recordOperation(ChauffeurState.scala:51)", + "\tat com.databricks.spark.chauffeur.ChauffeurState.process(ChauffeurState.scala:553)", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.handleDriverRequest$1(Chauffeur.scala:830)", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.$anonfun$applyOrElse$5(Chauffeur.scala:856)", + "\tat com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:633)", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:656)", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)", + "\tat com.databricks.rpc.ServerBackend.withAttributionContext(ServerBackend.scala:22)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags(AttributionContextTracing.scala:95)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags$(AttributionContextTracing.scala:76)", + "\tat com.databricks.rpc.ServerBackend.withAttributionTags(ServerBackend.scala:22)", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:628)", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:537)", + "\tat com.databricks.rpc.ServerBackend.recordOperationWithResultTags(ServerBackend.scala:22)", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.handleDriverRequestWithUsageLogging$1(Chauffeur.scala:855)", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.applyOrElse(Chauffeur.scala:910)", + "\tat com.databricks.spark.chauffeur.Chauffeur$$anon$1$$anonfun$receive$1.applyOrElse(Chauffeur.scala:703)", + "\tat com.databricks.rpc.ServerBackend.$anonfun$internalReceive0$2(ServerBackend.scala:174)", + "\tat com.databricks.rpc.ServerBackend$$anonfun$commonReceive$1.applyOrElse(ServerBackend.scala:200)", + "\tat com.databricks.rpc.ServerBackend$$anonfun$commonReceive$1.applyOrElse(ServerBackend.scala:200)", + "\tat com.databricks.rpc.ServerBackend.internalReceive0(ServerBackend.scala:171)", + "\tat com.databricks.rpc.ServerBackend.$anonfun$internalReceive$1(ServerBackend.scala:147)", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperation$1(UsageLogging.scala:528)", + "\tat com.databricks.logging.UsageLogging.executeThunkAndCaptureResultTags$1(UsageLogging.scala:633)", + "\tat com.databricks.logging.UsageLogging.$anonfun$recordOperationWithResultTags$4(UsageLogging.scala:656)", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)", + "\tat 
scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)", + "\tat com.databricks.rpc.ServerBackend.withAttributionContext(ServerBackend.scala:22)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags(AttributionContextTracing.scala:95)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionTags$(AttributionContextTracing.scala:76)", + "\tat com.databricks.rpc.ServerBackend.withAttributionTags(ServerBackend.scala:22)", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags(UsageLogging.scala:628)", + "\tat com.databricks.logging.UsageLogging.recordOperationWithResultTags$(UsageLogging.scala:537)", + "\tat com.databricks.rpc.ServerBackend.recordOperationWithResultTags(ServerBackend.scala:22)", + "\tat com.databricks.logging.UsageLogging.recordOperation(UsageLogging.scala:529)", + "\tat com.databricks.logging.UsageLogging.recordOperation$(UsageLogging.scala:495)", + "\tat com.databricks.rpc.ServerBackend.recordOperation(ServerBackend.scala:22)", + "\tat com.databricks.rpc.ServerBackend.internalReceive(ServerBackend.scala:146)", + "\tat com.databricks.rpc.JettyServer$RequestManager.handleRPC(JettyServer.scala:1021)", + "\tat com.databricks.rpc.JettyServer$RequestManager.handleRequestAndRespond(JettyServer.scala:942)", + "\tat com.databricks.rpc.JettyServer$RequestManager.$anonfun$handleHttp$6(JettyServer.scala:546)", + "\tat com.databricks.rpc.JettyServer$RequestManager.$anonfun$handleHttp$6$adapted(JettyServer.scala:515)", + "\tat com.databricks.logging.activity.ActivityContextFactory$.$anonfun$withActivityInternal$6(ActivityContextFactory.scala:545)", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)", + "\tat com.databricks.logging.activity.ActivityContextFactory$.withAttributionContext(ActivityContextFactory.scala:48)", + "\tat com.databricks.logging.activity.ActivityContextFactory$.$anonfun$withActivityInternal$3(ActivityContextFactory.scala:545)", + "\tat com.databricks.context.integrity.IntegrityCheckContext$ThreadLocalStorage$.withValue(IntegrityCheckContext.scala:73)", + "\tat com.databricks.logging.activity.ActivityContextFactory$.withActivityInternal(ActivityContextFactory.scala:523)", + "\tat com.databricks.logging.activity.ActivityContextFactory$.withServiceRequestActivity(ActivityContextFactory.scala:175)", + "\tat com.databricks.rpc.JettyServer$RequestManager.handleHttp(JettyServer.scala:515)", + "\tat com.databricks.rpc.JettyServer$RequestManager.doPost(JettyServer.scala:405)", + "\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:665)", + "\tat 
com.databricks.rpc.HttpServletWithPatch.service(HttpServletWithPatch.scala:33)", + "\tat javax.servlet.http.HttpServlet.service(HttpServlet.java:750)", + "\tat org.eclipse.jetty.servlet.ServletHolder.handle(ServletHolder.java:799)", + "\tat org.eclipse.jetty.servlet.ServletHandler.doHandle(ServletHandler.java:554)", + "\tat org.eclipse.jetty.server.handler.ScopedHandler.nextScope(ScopedHandler.java:190)", + "\tat org.eclipse.jetty.servlet.ServletHandler.doScope(ServletHandler.java:505)", + "\tat org.eclipse.jetty.server.handler.ScopedHandler.handle(ScopedHandler.java:141)", + "\tat org.eclipse.jetty.server.handler.HandlerWrapper.handle(HandlerWrapper.java:127)", + "\tat org.eclipse.jetty.server.Server.handle(Server.java:516)", + "\tat org.eclipse.jetty.server.HttpChannel.lambda$handle$1(HttpChannel.java:487)", + "\tat org.eclipse.jetty.server.HttpChannel.dispatch(HttpChannel.java:732)", + "\tat org.eclipse.jetty.server.HttpChannel.handle(HttpChannel.java:479)", + "\tat org.eclipse.jetty.server.HttpConnection.onFillable(HttpConnection.java:277)", + "\tat org.eclipse.jetty.io.AbstractConnection$ReadCallback.succeeded(AbstractConnection.java:311)", + "\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)", + "\tat org.eclipse.jetty.io.ssl.SslConnection$DecryptedEndPoint.onFillable(SslConnection.java:555)", + "\tat org.eclipse.jetty.io.ssl.SslConnection.onFillable(SslConnection.java:410)", + "\tat org.eclipse.jetty.io.ssl.SslConnection$2.succeeded(SslConnection.java:164)", + "\tat org.eclipse.jetty.io.FillInterest.fillable(FillInterest.java:105)", + "\tat org.eclipse.jetty.io.ChannelEndPoint$1.run(ChannelEndPoint.java:104)", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.runTask(EatWhatYouKill.java:338)", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.doProduce(EatWhatYouKill.java:315)", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.tryProduce(EatWhatYouKill.java:173)", + "\tat org.eclipse.jetty.util.thread.strategy.EatWhatYouKill.run(EatWhatYouKill.java:131)", + "\tat org.eclipse.jetty.util.thread.ReservedThreadExecutor$ReservedThread.run(ReservedThreadExecutor.java:409)", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool$$anon$1.$anonfun$run$2(InstrumentedQueuedThreadPool.scala:105)", + "\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)", + "\tat com.databricks.logging.AttributionContextTracing.$anonfun$withAttributionContext$1(AttributionContextTracing.scala:48)", + "\tat com.databricks.logging.AttributionContext$.$anonfun$withValue$1(AttributionContext.scala:276)", + "\tat scala.util.DynamicVariable.withValue(DynamicVariable.scala:62)", + "\tat com.databricks.logging.AttributionContext$.withValue(AttributionContext.scala:272)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext(AttributionContextTracing.scala:46)", + "\tat com.databricks.logging.AttributionContextTracing.withAttributionContext$(AttributionContextTracing.scala:43)", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool.withAttributionContext(InstrumentedQueuedThreadPool.scala:45)", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool$$anon$1.$anonfun$run$1(InstrumentedQueuedThreadPool.scala:105)", + "\tat scala.runtime.java8.JFunction0$mcV$sp.apply(JFunction0$mcV$sp.java:23)", + "\tat com.databricks.instrumentation.QueuedThreadPoolInstrumenter.trackActiveThreads(QueuedThreadPoolInstrumenter.scala:110)", + "\tat 
com.databricks.instrumentation.QueuedThreadPoolInstrumenter.trackActiveThreads$(QueuedThreadPoolInstrumenter.scala:107)", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool.trackActiveThreads(InstrumentedQueuedThreadPool.scala:45)", + "\tat com.databricks.rpc.InstrumentedQueuedThreadPool$$anon$1.run(InstrumentedQueuedThreadPool.scala:87)", + "\tat org.eclipse.jetty.util.thread.QueuedThreadPool.runJob(QueuedThreadPool.java:883)", + "\tat org.eclipse.jetty.util.thread.QueuedThreadPool$Runner.run(QueuedThreadPool.java:1034)", + "\tat java.lang.Thread.run(Thread.java:750)" + ], + "type": "baseError" + } + }, + "output_type": "display_data" + } + ], + "source": [ + "from databricks import agents\n", + "\n", + "# Use Unity Catalog as the model registry\n", + "mlflow.set_registry_uri(\"databricks-uc\")\n", + "\n", + "with mlflow.start_run():\n", + " logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config)\n", + "\n", + " # Register the Agent's model to the Unity Catalog\n", + " uc_registered_model_info = mlflow.register_model(\n", + " model_uri=logged_agent_info.model_uri, name=agent_storage_config.uc_model_name\n", + " )\n", + "\n", + "# Deploy the model to the review app and a model serving endpoint\n", + "# agents.deploy(agent_storage_config.uc_model_name, uc_registered_model_info.version)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "99742101-754f-4d21-8835-2b36ffd59883", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Load the logged model to test it locally" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9f56a3e6-ace0-4507-8b81-975dbe194fc3", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "2024/12/18 17:45:54 WARNING mlflow.utils.autologging_utils: MLflow autogen autologging is known to be compatible with 0.2.36 <= autogen-agentchat <= 0.2.39, but the installed version is 0.2.40. If you encounter errors during autologging, try upgrading / downgrading autogen-agentchat to a compatible version, or try upgrading MLflow.\n" + ] + } + ], + "source": [ + "mlflow.autogen.autolog(log_traces=False)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "053e448f-d63c-41dc-9ba3-a6cbd7f1eee2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. 
To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:46:07] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:46:07] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-18 17:46:07] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nA test question?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nI'm sorry, I can't help you with that.\n\n--------------------------------------------------------------------------------\n" + ] + }, + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'content': \"I'm sorry, I can't help you with that.\",\n", + " 'messages': [{'content': 'A test question?', 'role': 'user'},\n", + " {'content': \"I'm sorry, I can't help you with that.\",\n", + " 'role': 'assistant',\n", + " 'name': 'Assistant'}]}" + ] + }, + "execution_count": 14, + "metadata": {}, + "output_type": "execute_result" + }, + { + "output_type": "display_data", + "data": { + "application/databricks.mlflow.trace": "\"tr-c1ef3a3fa91341f683b3746e4c6fe29e\"", + "text/plain": [ + "Trace(request_id=tr-c1ef3a3fa91341f683b3746e4c6fe29e)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "import mlflow\n", + "\n", + "loaded_model = mlflow.pyfunc.load_model(logged_agent_info.model_uri)\n", + "\n", + "loaded_model.predict({\"messages\": [{\"role\": \"user\", \"content\": \"A test question?\"}]})" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "05_tool_calling_agent", + "widgets": {} + }, + "kernelspec": { + 
"display_name": "genai-cookbook-T2SdtsNM-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb b/autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb index 1d13cec..d5cf64d 100644 --- a/autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb +++ b/autogen_agent_app_sample_code/06_multi_agent_with_genie.ipynb @@ -1,493 +1,813 @@ -# Databricks notebook source -# MAGIC %md -# MAGIC ## 👉 START HERE: How to use this notebook -# MAGIC -# MAGIC # Step 3: Build, evaluate, & deploy your Agent -# MAGIC -# MAGIC Use this notebook to iterate on the code and configuration of your Agent. -# MAGIC -# MAGIC By the end of this notebook, you will have 1+ registered versions of your Agent, each coupled with a detailed quality evaluation. -# MAGIC -# MAGIC Optionally, you can deploy a version of your Agent that you can interact with in the [Mosiac AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) and let your business stakeholders who don't have Databricks accounts interact with it & provide feedback in the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui). -# MAGIC -# MAGIC -# MAGIC For each version of your agent, you will have an MLflow run inside your MLflow experiment that contains: -# MAGIC - Your Agent's code & config -# MAGIC - Evaluation metrics for cost, quality, and latency - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC **Important note:** Throughout this notebook, we indicate which cell's code you: -# MAGIC - ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality. -# MAGIC - 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent -# MAGIC -# MAGIC *Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.* - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Install Python libraries -# MAGIC -# MAGIC You do not need to modify this cell unless you need additional Python packages in your Agent. - -# COMMAND ---------- - -# %pip install -qqqq -U -r requirements.txt -# # Restart to load the packages into the Python environment -# dbutils.library.restartPython() - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Connect to Databricks -# MAGIC -# MAGIC If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set. - -# COMMAND ---------- - -from mlflow.utils import databricks_utils as du - -if not du.is_in_databricks_notebook(): - from databricks.connect import DatabricksSession - import os - - spark = DatabricksSession.builder.getOrCreate() - os.environ["MLFLOW_TRACKING_URI"] = "databricks" - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment -# MAGIC -# MAGIC This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook. 
- -# COMMAND ---------- - -from cookbook.config.shared.agent_storage_location import AgentStorageConfig -from cookbook.databricks_utils import get_mlflow_experiment_url -from cookbook.config import load_serializable_config_from_yaml_file -import mlflow - -# Load the Agent's storage locations -agent_storage_config: AgentStorageConfig= load_serializable_config_from_yaml_file("./configs/agent_storage_config.yaml") - -# Show the Agent's storage locations -agent_storage_config.pretty_print() - -# set the MLflow experiment -experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name) -# If running in a local IDE, set the MLflow experiment name as an environment variable -os.environ["MLFLOW_EXPERIMENT_NAME"] = agent_storage_config.mlflow_experiment_name - -print(f"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}") - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 🚫✏️ Helper method to log the Agent's code & config to MLflow -# MAGIC -# MAGIC Before we start, let's define a helper method to log the Agent's code & config to MLflow. We will use this to log the agent's code & config to MLflow & the Unity Catalog. It is used in evaluation & for deploying to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) (a chat UI for your stakeholders to test this agent) and later, deplying the Agent to production. - -# COMMAND ---------- - - -import mlflow -from mlflow.types.llm import CHAT_MODEL_INPUT_SCHEMA -from mlflow.models.rag_signatures import StringResponse -from cookbook.agents.utils.signatures import STRING_RESPONSE_WITH_MESSAGES -from mlflow.models.signature import ModelSignature -from cookbook.agents.multi_agent_supervisor import MultiAgentSupervisor, MultiAgentSupervisorConfig -from cookbook.agents.genie_agent import GenieAgent, GenieAgentConfig -from cookbook.agents.function_calling_agent import FunctionCallingAgent -from cookbook.agents.function_calling_agent import FunctionCallingAgentConfig - -# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI -# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run. -# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy! 
-def log_multi_agent_supervisor_to_mlflow(agent_config: MultiAgentSupervisorConfig): - # Get the agent's code path from the imported Agent class - agent_code_path = f"{os.getcwd()}/{MultiAgentSupervisor.__module__.replace('.', '/')}.py" - - # Get the pip requirements from the requirements.txt file - with open("requirements.txt", "r") as file: - pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark - - logged_agent_info = mlflow.pyfunc.log_model( - artifact_path="agent", - python_model=agent_code_path, - input_example=agent_config.input_example, - model_config=agent_config.model_dump(), - resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc - signature=ModelSignature( - inputs=CHAT_MODEL_INPUT_SCHEMA, - # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature - outputs=StringResponse() - ), - code_paths=[os.path.join(os.getcwd(), "cookbook")], - pip_requirements=pip_requirements, - ) - - return logged_agent_info - -def log_genie_agent_to_mlflow(agent_config: GenieAgentConfig): - # Get the agent's code path from the imported Agent class - agent_code_path = f"{os.getcwd()}/{GenieAgent.__module__.replace('.', '/')}.py" - - # Get the pip requirements from the requirements.txt file - with open("requirements.txt", "r") as file: - pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark - - logged_agent_info = mlflow.pyfunc.log_model( - artifact_path="agent", - python_model=agent_code_path, - input_example=agent_config.input_example, - model_config=agent_config.model_dump(), - resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc - signature=ModelSignature( - inputs=CHAT_MODEL_INPUT_SCHEMA, - # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature - outputs=StringResponse() - ), - code_paths=[os.path.join(os.getcwd(), "cookbook")], - pip_requirements=pip_requirements, - ) - - return logged_agent_info - -# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI -# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run. -# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy! 
-def log_function_calling_agent_to_mlflow(agent_config: FunctionCallingAgentConfig): - # Get the agent's code path from the imported Agent class - agent_code_path = f"{os.getcwd()}/{FunctionCallingAgent.__module__.replace('.', '/')}.py" - - # Get the pip requirements from the requirements.txt file - with open("requirements.txt", "r") as file: - pip_requirements = [line.strip() for line in file.readlines()] + ["pyspark"] # manually add pyspark - - logged_agent_info = mlflow.pyfunc.log_model( - artifact_path="agent", - python_model=agent_code_path, - input_example=agent_config.input_example, - model_config=agent_config.model_dump(), - resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc - signature=ModelSignature( - inputs=CHAT_MODEL_INPUT_SCHEMA, - # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature - outputs=StringResponse() - ), - code_paths=[os.path.join(os.getcwd(), "cookbook")], - pip_requirements=pip_requirements, - ) - - return logged_agent_info - -# COMMAND ---------- - -# MAGIC %md -# MAGIC -# MAGIC ## 1️⃣ Iterate on the Agent's code & config to improve quality -# MAGIC -# MAGIC The below cells are used to execute your inner dev loop to improve the Agent's quality. -# MAGIC -# MAGIC We suggest the following process: -# MAGIC 1. Vibe check the Agent for 5 - 10 queries to verify it works -# MAGIC 2. Make any necessary changes to the code/config -# MAGIC 3. Use Agent Evaluation to evaluate the Agent using your evaluation set, which will provide a quality assessment & identify the root causes of any quality issues -# MAGIC 4. Based on that evaluation, make & test changes to the code/config to improve quality -# MAGIC 5. 🔁 Repeat steps 3 and 4 until you are satisified with the Agent's quality -# MAGIC 6. Deploy the Agent to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) for pre-production testing -# MAGIC 7. Use the following notebooks to review that feedback (optionally adding new records to your evaluation set) & identify any further quality issues -# MAGIC 8. 🔁 Repeat steps 3 and 4 to fix any issues identified in step 7 -# MAGIC 9. Deploy the Agent to a production-ready REST API endpoint (using the same cells in this notebook as step 6) -# MAGIC - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Create the agents to be overseen by the multi-agent supervisor - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 1. create the genie agent - -# COMMAND ---------- - - -from cookbook.config.agents.genie_agent import GenieAgentConfig -from cookbook.agents.genie_agent import GENIE_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME, GenieAgent -from cookbook.config import serializable_config_to_yaml_file - - -genie_agent_config = GenieAgentConfig(genie_space_id="01ef92e3b5631f0da85834290964831d") -serializable_config_to_yaml_file(genie_agent_config, "./configs/"+GENIE_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) - - -# COMMAND ---------- - -mlflow.set_registry_uri("databricks-uc") - -with mlflow.start_run(run_name="genie_agent_test_1"): - logged_genie_info = log_genie_agent_to_mlflow(genie_agent_config) - uc_registered_model_info = mlflow.register_model( - model_uri=logged_genie_info.model_uri, name=agent_storage_config.uc_model_name+"_genie_test_1" - ) - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ### 2. 
create the FC agent - -# COMMAND ---------- - -# Import Cookbook Agent configurations, which are Pydantic models -from cookbook.config import serializable_config_to_yaml_file -from cookbook.config.agents.function_calling_agent import ( - FunctionCallingAgentConfig, -) -from cookbook.config.data_pipeline import ( - DataPipelineConfig, -) -from cookbook.config.shared.llm import LLMConfig, LLMParametersConfig -from cookbook.config import load_serializable_config_from_yaml_file -from cookbook.tools.vector_search import ( - VectorSearchRetrieverTool, - VectorSearchSchema, -) -import json -from cookbook.tools.uc_tool import UCTool - - -######################## -# #### 🚫✏️ Load the Vector Index Unity Cataloglocation from the data pipeline configuration -# Usage: -# - If you used `01_data_pipeline` to create your Vector Index, run this cell. -# - If your Vector Index was created elsewhere, comment out this logic and set the UC location in the Retriever config. -######################## - -data_pipeline_config: DataPipelineConfig = load_serializable_config_from_yaml_file( - "./configs/data_pipeline_config.yaml" -) - -######################## -# #### ✅✏️ Retriever tool that connects to the Vector Search index -######################## - -retriever_tool = VectorSearchRetrieverTool( - name="search_product_docs", - description="Use this tool to search for product documentation.", - vector_search_index="ep.cookbook_local_test.product_docs_docs_chunked_index__v1", - vector_search_schema=VectorSearchSchema( - # These columns are the default values used in the `01_data_pipeline` notebook - # If you used a different column names in that notebook OR you are using a pre-built vector index, update the column names here. - chunk_text="content_chunked", # Contains the text of each document chunk - document_uri="doc_uri", # The document URI of the chunk e.g., "/Volumes/catalog/schema/volume/file.pdf" - displayed as the document ID in the Review App - additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM - ), - # Optional parameters, see VectorSearchRetrieverTool.__doc__ for details. The default values are shown below. - # doc_similarity_threshold=0.0, - # vector_search_parameters=VectorSearchParameters( - # num_results=5, - # query_type="ann" - # ), - # Adding columns here will allow the Agent's LLM to dynamically apply filters based on the user's query. - # filterable_columns=[] -) - -######################## -# #### ✅✏️ Add Unity Catalog tools to the Agent -######################## - -translate_sku_tool = UCTool(uc_function_name="ep.cookbook_local_test.translate_sku") - -from tools.sku_translator import translate_sku -# from cookbook.config import serializable_config_to_yaml_file - -# translate_sku("OLD-XXX-1234") - -from cookbook.tools.local_function import LocalFunctionTool -from tools.sku_translator import translate_sku - -# translate_sku_tool = LocalFunctionTool(func=translate_sku, description="Translates a pre-2024 SKU formatted as 'OLD-XXX-YYYY' to the new SKU format 'NEW-YYYY-XXX'.") - -######################## -#### ✅✏️ Agent's LLM configuration -######################## - -system_prompt = """ -## Role -You are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request. - -## Objective -Your goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses. - -## Instructions -1. 
**Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. - -2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query. - -3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: "I'm sorry, I can't help you with that." -""".strip() - -fc_agent_config = FunctionCallingAgentConfig( - llm_config=LLMConfig( - llm_endpoint_name="ep-gpt4o-new", # Model serving endpoint w/ a Chat Completions API - llm_system_prompt_template=system_prompt, # System prompt template - llm_parameters=LLMParametersConfig( - temperature=0.01, max_tokens=1500 - ), # LLM parameters - ), - # Add one or more tools that comply with the CookbookTool interface - tools=[retriever_tool, translate_sku_tool], - # tools=[retriever_tool], -) - -# Print the configuration as a JSON string to see it all together -# print(json.dumps(fc_agent_config.model_dump(), indent=4)) - -######################## -##### Dump the configuration to a YAML -# Optional step, this allows the Agent's code file to be run by itself (e.g., outside of this notebook) using the above configuration. -######################## -# Import the default YAML config file name from the Agent's code file -from cookbook.agents.function_calling_agent import FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME - -# Dump the configuration to a YAML file -serializable_config_to_yaml_file(fc_agent_config, "./configs/"+FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) - -# COMMAND ---------- - -# MAGIC %md -# MAGIC ## Create the multi-agent supervisor - -# COMMAND ---------- - -from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, SupervisedAgentConfig -from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig -from cookbook.agents.multi_agent_supervisor import MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME -from cookbook.config.shared.llm import LLMConfig -from cookbook.config import serializable_config_to_yaml_file -from cookbook.agents.function_calling_agent import FunctionCallingAgent -from cookbook.config.shared.llm import LLMParametersConfig - - -fc_supervised = SupervisedAgentConfig(name="fc_agent", - description="looks up product docs", - endpoint_name="", - agent_config=fc_agent_config, - agent_class=FunctionCallingAgent) - -genie_supervised = SupervisedAgentConfig(name="genie_agent", - description="queries for customer info", - endpoint_name="", - agent_config=genie_agent_config, - agent_class=GenieAgent) - - -multi_agent_config = MultiAgentSupervisorConfig( - llm_endpoint_name="ep-gpt4o-new", - llm_parameters=LLMParametersConfig( - max_tokens= 1500, - temperature= 0.01 - ), - - playground_debug_mode=True, - agent_loading_mode="local", - agents=[fc_supervised, genie_supervised] -) - -serializable_config_to_yaml_file(multi_agent_config, "./configs/"+MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) - - -# COMMAND ---------- - -from cookbook.databricks_utils import get_mlflow_experiment_traces_url -from cookbook.agents.multi_agent_supervisor import MultiAgentSupervisor - -# Load the Agent's code with the above 
configuration -agent = MultiAgentSupervisor(multi_agent_config) - -# Vibe check the Agent for a single query -output = agent.predict(model_input={"messages": [{"role": "user", "content": "How does the blender work?"}]}) -# output = agent.predict(model_input={"messages": [{"role": "user", "content": "Translate the sku `OLD-abs-1234` to the new format"}]}) - -print(f"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}") -print(f"Agent's final response:\n----\n{output['content']}\n----") -print() -print(f"Agent's full message history (useful for debugging):\n----\n{json.dumps(output['messages'], indent=2)}\n----") - - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Design for multi-agent -# MAGIC -# MAGIC requirements -# MAGIC * can test locally with just the agent's pyfunc classes -# MAGIC * when you change any config, it all just reloads -# MAGIC -# MAGIC when you deploy: -# MAGIC * you deploy each supervised agent separately to model serving -# MAGIC * then mutli agent picks these up -# MAGIC * then mutli agent deploys -# MAGIC -# MAGIC * each child agent has [name, description, config, code] -# MAGIC - when deployed, it reads it from the UC -# MAGIC - locally, from the config - -# COMMAND ---------- - -# MAGIC %md -# MAGIC Testing endpoint based - -# COMMAND ---------- - -from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, SupervisedAgentConfig -from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME -# from cookbook.agents.multi_agent_supervisor import MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME -from cookbook.config.shared.llm import LLMConfig -from cookbook.config import serializable_config_to_yaml_file -from cookbook.agents.function_calling_agent import FunctionCallingAgent -from cookbook.config.shared.llm import LLMParametersConfig - - -fc_supervised_ep = SupervisedAgentConfig(name="fc_agent", - description="looks up product docs", - endpoint_name="agents_ep-cookbook_local_test-my_agent_new_test_with_ONLY_retri", - # agent_config=fc_agent_config, - # agent_class=FunctionCallingAgent - ) - -# genie_supervised = SupervisedAgentConfig(name="genie_agent", -# description="queries for customer info", -# endpoint_name="", -# agent_config=genie_agent_config, -# agent_class=GenieAgent) - - -multi_agent_config_with_ep = MultiAgentSupervisorConfig( - llm_endpoint_name="ep-gpt4o-new", - llm_parameters=LLMParametersConfig( - max_tokens= 1500, - temperature= 0.01 - ), - - playground_debug_mode=True, - agent_loading_mode="model_serving", - agents=[fc_supervised_ep] -) - -serializable_config_to_yaml_file(multi_agent_config_with_ep, "./configs/"+MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME) - - -# COMMAND ---------- - -from cookbook.config import load_serializable_config_from_yaml_file - -multi_agent_config_with_ep_loaded = load_serializable_config_from_yaml_file("./configs/multi_agent_supervisor_config.yaml") - -print(multi_agent_config_with_ep_loaded) \ No newline at end of file +{ + "cells": [ + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "31661828-f9bb-4fc2-a1bd-94424a27ed52", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## 👉 START HERE: How to use this notebook\n", + "\n", + "# Step 3: Build, evaluate, & deploy your Agent\n", + "\n", + "Use this notebook to iterate on the code and configuration of your Agent.\n", + 
"\n", + "By the end of this notebook, you will have 1+ registered versions of your Agent, each coupled with a detailed quality evaluation.\n", + "\n", + "Optionally, you can deploy a version of your Agent that you can interact with in the [Mosiac AI Playground](https://docs.databricks.com/en/large-language-models/ai-playground.html) and let your business stakeholders who don't have Databricks accounts interact with it & provide feedback in the [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui).\n", + "\n", + "\n", + "For each version of your agent, you will have an MLflow run inside your MLflow experiment that contains:\n", + "- Your Agent's code & config\n", + "- Evaluation metrics for cost, quality, and latency" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5d9f685a-fdb7-49a4-9e3a-a4a9e964d045", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "**Important note:** Throughout this notebook, we indicate which cell's code you:\n", + "- ✅✏️ should customize - these cells contain code & config with business logic that you should edit to meet your requirements & tune quality.\n", + "- 🚫✏️ should not customize - these cells contain boilerplate code required to load/save/execute your Agent\n", + "\n", + "*Cells that don't require customization still need to be run! You CAN change these cells, but if this is the first time using this notebook, we suggest not doing so.*" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "bb4f8cc0-1797-4beb-a9f2-df21a9db79f0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Install Python libraries\n", + "\n", + "You do not need to modify this cell unless you need additional Python packages in your Agent." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6d4030e8-ae97-4351-bebd-9651d283578f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# %pip install -qqqq -U -r requirements.txt\n", + "# # Restart to load the packages into the Python environment\n", + "# dbutils.library.restartPython()" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "395ef913-1769-47a4-87fd-1abd3cf334c0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Connect to Databricks\n", + "\n", + "If running locally in an IDE using Databricks Connect, connect the Spark client & configure MLflow to use Databricks Managed MLflow. If this running in a Databricks Notebook, these values are already set." 
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "37bc769e-93cc-4962-8e20-2a856f15bfe8", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from mlflow.utils import databricks_utils as du\n", + "\n", + "if not du.is_in_databricks_notebook():\n", + " from databricks.connect import DatabricksSession\n", + " import os\n", + "\n", + " spark = DatabricksSession.builder.getOrCreate()\n", + " os.environ[\"MLFLOW_TRACKING_URI\"] = \"databricks\"" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "15424a47-72da-411a-b791-997bc07a04b2", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Load the Agent's UC storage locations; set up MLflow experiment\n", + "\n", + "This notebook uses the UC model, MLflow Experiment, and Evaluation Set that you specified in the [Agent setup](02_agent_setup.ipynb) notebook." + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9d63957a-0b7a-492c-943d-b347f3dd2125", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.config.shared.agent_storage_location import AgentStorageConfig\n", + "from cookbook.databricks_utils import get_mlflow_experiment_url\n", + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "import mlflow \n", + "\n", + "# Load the Agent's storage locations\n", + "agent_storage_config: AgentStorageConfig= load_serializable_config_from_yaml_file(\"./configs/agent_storage_config.yaml\")\n", + "\n", + "# Show the Agent's storage locations\n", + "agent_storage_config.pretty_print()\n", + "\n", + "# set the MLflow experiment\n", + "experiment_info = mlflow.set_experiment(agent_storage_config.mlflow_experiment_name)\n", + "# If running in a local IDE, set the MLflow experiment name as an environment variable\n", + "os.environ[\"MLFLOW_EXPERIMENT_NAME\"] = agent_storage_config.mlflow_experiment_name\n", + "\n", + "print(f\"View the MLflow Experiment `{agent_storage_config.mlflow_experiment_name}` at {get_mlflow_experiment_url(experiment_info.experiment_id)}\")" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9dfb9e42-11b8-4f17-9d3f-76e3ca2ef0c4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 🚫✏️ Helper method to log the Agent's code & config to MLflow\n", + "\n", + "Before we start, let's define a helper method that logs the Agent's code & config to MLflow & the Unity Catalog. It is used in evaluation & for deploying to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) (a chat UI for your stakeholders to test this agent) and, later, for deploying the Agent to production."
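
To make the helper's role concrete, here is a hedged sketch (not one of the notebook's cells) of how a logged Agent typically flows through evaluation, Unity Catalog registration, and deployment. It assumes the `log_function_calling_agent_to_mlflow` helper defined in the next cell, the `fc_agent_config` and `agent_storage_config` objects created elsewhere in this notebook, and an `evaluation_set_uc_table` attribute on the storage config; treat the wiring as illustrative rather than exact.

```python
# Illustrative end-to-end flow: log -> evaluate -> register -> deploy.
import mlflow
from databricks import agents

with mlflow.start_run(run_name="my_agent_v1"):
    # Log the Agent's code & config using the helper defined below.
    logged_agent_info = log_function_calling_agent_to_mlflow(fc_agent_config)

    # Run Agent Evaluation against the evaluation set (a Delta table of eval records).
    eval_results = mlflow.evaluate(
        data=spark.table(agent_storage_config.evaluation_set_uc_table).toPandas(),
        model=logged_agent_info.model_uri,
        model_type="databricks-agent",
    )

# Register the logged model to Unity Catalog, then deploy the Review App + REST endpoint.
mlflow.set_registry_uri("databricks-uc")
uc_model_info = mlflow.register_model(
    model_uri=logged_agent_info.model_uri, name=agent_storage_config.uc_model_name
)
agents.deploy(agent_storage_config.uc_model_name, uc_model_info.version)
```
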
+ ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "1e109aac-3938-4290-86bc-7dc4c42ae88b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "\n", + "import mlflow\n", + "from mlflow.types.llm import CHAT_MODEL_INPUT_SCHEMA\n", + "from mlflow.models.rag_signatures import StringResponse\n", + "from cookbook.agents.utils.signatures import STRING_RESPONSE_WITH_MESSAGES\n", + "from mlflow.models.signature import ModelSignature\n", + "from cookbook.agents.multi_agent_supervisor import MultiAgentSupervisor, MultiAgentSupervisorConfig\n", + "from cookbook.agents.genie_agent import GenieAgent, GenieAgentConfig\n", + "from cookbook.agents.function_calling_agent import FunctionCallingAgent\n", + "from cookbook.agents.function_calling_agent import FunctionCallingAgentConfig\n", + "\n", + "# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI\n", + "# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run.\n", + "# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy!\n", + "def log_multi_agent_supervisor_to_mlflow(agent_config: MultiAgentSupervisorConfig):\n", + " # Get the agent's code path from the imported Agent class\n", + " agent_code_path = f\"{os.getcwd()}/{MultiAgentSupervisor.__module__.replace('.', '/')}.py\"\n", + "\n", + " # Get the pip requirements from the requirements.txt file\n", + " with open(\"requirements.txt\", \"r\") as file:\n", + " pip_requirements = [line.strip() for line in file.readlines()] + [\"pyspark\"] # manually add pyspark\n", + "\n", + " logged_agent_info = mlflow.pyfunc.log_model(\n", + " artifact_path=\"agent\",\n", + " python_model=agent_code_path,\n", + " input_example=agent_config.input_example,\n", + " model_config=agent_config.model_dump(),\n", + " resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc\n", + " signature=ModelSignature(\n", + " inputs=CHAT_MODEL_INPUT_SCHEMA,\n", + " # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature\n", + " outputs=StringResponse()\n", + " ),\n", + " code_paths=[os.path.join(os.getcwd(), \"cookbook\")],\n", + " pip_requirements=pip_requirements,\n", + " )\n", + "\n", + " return logged_agent_info\n", + "\n", + "def log_genie_agent_to_mlflow(agent_config: GenieAgentConfig):\n", + " # Get the agent's code path from the imported Agent class\n", + " agent_code_path = f\"{os.getcwd()}/{GenieAgent.__module__.replace('.', '/')}.py\"\n", + "\n", + " # Get the pip requirements from the requirements.txt file\n", + " with open(\"requirements.txt\", \"r\") as file:\n", + " pip_requirements = [line.strip() for line in file.readlines()] + [\"pyspark\"] # manually add pyspark\n", + "\n", + " logged_agent_info = mlflow.pyfunc.log_model(\n", + " artifact_path=\"agent\",\n", + " python_model=agent_code_path,\n", + " input_example=agent_config.input_example,\n", + " model_config=agent_config.model_dump(),\n", + " resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc\n", + " 
signature=ModelSignature(\n", + " inputs=CHAT_MODEL_INPUT_SCHEMA,\n", + " # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature\n", + " outputs=StringResponse()\n", + " ),\n", + " code_paths=[os.path.join(os.getcwd(), \"cookbook\")],\n", + " pip_requirements=pip_requirements,\n", + " )\n", + "\n", + " return logged_agent_info\n", + "\n", + "# This helper will log the Agent's code & config to an MLflow run and return the logged model's URI\n", + "# If run from inside a mlfow.start_run() block, it will log to that run, otherwise it will log to a new run.\n", + "# This logged Agent is ready for deployment, so if you are happy with your evaluation, it is ready to deploy!\n", + "def log_function_calling_agent_to_mlflow(agent_config: FunctionCallingAgentConfig):\n", + " # Get the agent's code path from the imported Agent class\n", + " agent_code_path = f\"{os.getcwd()}/{FunctionCallingAgent.__module__.replace('.', '/')}.py\"\n", + "\n", + " # Get the pip requirements from the requirements.txt file\n", + " with open(\"requirements.txt\", \"r\") as file:\n", + " pip_requirements = [line.strip() for line in file.readlines()] + [\"pyspark\"] # manually add pyspark\n", + "\n", + " logged_agent_info = mlflow.pyfunc.log_model(\n", + " artifact_path=\"agent\",\n", + " python_model=agent_code_path,\n", + " input_example=agent_config.input_example,\n", + " model_config=agent_config.model_dump(),\n", + " resources=agent_config.get_resource_dependencies(), # This allows the agents.deploy() command to securely provision credentials for the Agent's databricks resources e.g., vector index, model serving endpoints, etc\n", + " signature=ModelSignature(\n", + " inputs=CHAT_MODEL_INPUT_SCHEMA,\n", + " # outputs=STRING_RESPONSE_WITH_MESSAGES #TODO: replace with MLflow signature\n", + " outputs=StringResponse()\n", + " ),\n", + " code_paths=[os.path.join(os.getcwd(), \"cookbook\")],\n", + " pip_requirements=pip_requirements,\n", + " )\n", + "\n", + " return logged_agent_info" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "9933d05f-29fa-452e-abdc-2a02328fbe22", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "\n", + "## 1️⃣ Iterate on the Agent's code & config to improve quality\n", + "\n", + "The below cells are used to execute your inner dev loop to improve the Agent's quality.\n", + "\n", + "We suggest the following process:\n", + "1. Vibe check the Agent for 5 - 10 queries to verify it works\n", + "2. Make any necessary changes to the code/config\n", + "3. Use Agent Evaluation to evaluate the Agent using your evaluation set, which will provide a quality assessment & identify the root causes of any quality issues\n", + "4. Based on that evaluation, make & test changes to the code/config to improve quality\n", + "5. 🔁 Repeat steps 3 and 4 until you are satisified with the Agent's quality\n", + "6. Deploy the Agent to Agent Evaluation's [Review App](https://docs.databricks.com/en/generative-ai/agent-evaluation/human-evaluation.html#review-app-ui) for pre-production testing\n", + "7. Use the following notebooks to review that feedback (optionally adding new records to your evaluation set) & identify any further quality issues\n", + "8. 🔁 Repeat steps 3 and 4 to fix any issues identified in step 7\n", + "9. 
Deploy the Agent to a production-ready REST API endpoint (using the same cells in this notebook as step 6)\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "12a72c4a-c0d2-4d0a-9b23-dc414c545864", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Create the agents to be overseen by the multi-agent supervisor" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "43f5dd46-a451-4bd4-a8ba-3b968e6f5b7a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 1. create the genie agent" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "4feb8ad0-6f05-49ac-9448-34fc31e78ee5", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "\n", + "from cookbook.config.agents.genie_agent import GenieAgentConfig\n", + "from cookbook.agents.genie_agent import GENIE_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME, GenieAgent\n", + "from cookbook.config import serializable_config_to_yaml_file\n", + "\n", + "\n", + "genie_agent_config = GenieAgentConfig(genie_space_id=\"01ef92e3b5631f0da85834290964831d\")\n", + "serializable_config_to_yaml_file(genie_agent_config, \"./configs/\"+GENIE_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8b62d063-13c6-4da4-8565-6dd8dcbbd3ab", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "mlflow.set_registry_uri(\"databricks-uc\")\n", + "\n", + "with mlflow.start_run(run_name=\"genie_agent_test_1\"):\n", + " logged_genie_info = log_genie_agent_to_mlflow(genie_agent_config)\n", + " uc_registered_model_info = mlflow.register_model(\n", + " model_uri=logged_genie_info.model_uri, name=agent_storage_config.uc_model_name+\"_genie_test_1\"\n", + " )\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "cbc0ef3a-9ef6-4594-8498-882f51ea52fd", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "### 2. 
create the FC agent" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "61f0ed0e-4058-4574-aa50-4d1b166ccc68", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "# Import Cookbook Agent configurations, which are Pydantic models\n", + "from cookbook.config import serializable_config_to_yaml_file\n", + "from cookbook.config.agents.function_calling_agent import (\n", + " FunctionCallingAgentConfig,\n", + ")\n", + "from cookbook.config.data_pipeline import (\n", + " DataPipelineConfig,\n", + ")\n", + "from cookbook.config.shared.llm import LLMConfig, LLMParametersConfig\n", + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "from cookbook.tools.vector_search import (\n", + " VectorSearchRetrieverTool,\n", + " VectorSearchSchema,\n", + ")\n", + "import json\n", + "from cookbook.tools.uc_tool import UCTool\n", + "\n", + "\n", + "########################\n", + "# #### 🚫✏️ Load the Vector Index Unity Cataloglocation from the data pipeline configuration\n", + "# Usage:\n", + "# - If you used `01_data_pipeline` to create your Vector Index, run this cell.\n", + "# - If your Vector Index was created elsewhere, comment out this logic and set the UC location in the Retriever config.\n", + "########################\n", + "\n", + "data_pipeline_config: DataPipelineConfig = load_serializable_config_from_yaml_file(\n", + " \"./configs/data_pipeline_config.yaml\"\n", + ")\n", + "\n", + "########################\n", + "# #### ✅✏️ Retriever tool that connects to the Vector Search index\n", + "########################\n", + "\n", + "retriever_tool = VectorSearchRetrieverTool(\n", + " name=\"search_product_docs\",\n", + " description=\"Use this tool to search for product documentation.\",\n", + " vector_search_index=\"ep.cookbook_local_test.product_docs_docs_chunked_index__v1\",\n", + " vector_search_schema=VectorSearchSchema(\n", + " # These columns are the default values used in the `01_data_pipeline` notebook\n", + " # If you used a different column names in that notebook OR you are using a pre-built vector index, update the column names here.\n", + " chunk_text=\"content_chunked\", # Contains the text of each document chunk\n", + " document_uri=\"doc_uri\", # The document URI of the chunk e.g., \"/Volumes/catalog/schema/volume/file.pdf\" - displayed as the document ID in the Review App\n", + " additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM\n", + " ),\n", + " # Optional parameters, see VectorSearchRetrieverTool.__doc__ for details. 
The default values are shown below.\n", + " # doc_similarity_threshold=0.0,\n", + " # vector_search_parameters=VectorSearchParameters(\n", + " # num_results=5,\n", + " # query_type=\"ann\"\n", + " # ),\n", + " # Adding columns here will allow the Agent's LLM to dynamically apply filters based on the user's query.\n", + " # filterable_columns=[]\n", + ")\n", + "\n", + "########################\n", + "# #### ✅✏️ Add Unity Catalog tools to the Agent\n", + "########################\n", + "\n", + "translate_sku_tool = UCTool(uc_function_name=\"ep.cookbook_local_test.translate_sku\")\n", + "\n", + "from tools.sku_translator import translate_sku\n", + "# from cookbook.config import serializable_config_to_yaml_file\n", + "\n", + "# translate_sku(\"OLD-XXX-1234\")\n", + "\n", + "from cookbook.tools.local_function import LocalFunctionTool\n", + "from tools.sku_translator import translate_sku\n", + "\n", + "# translate_sku_tool = LocalFunctionTool(func=translate_sku, description=\"Translates a pre-2024 SKU formatted as 'OLD-XXX-YYYY' to the new SKU format 'NEW-YYYY-XXX'.\")\n", + "\n", + "########################\n", + "#### ✅✏️ Agent's LLM configuration\n", + "########################\n", + "\n", + "system_prompt = \"\"\"\n", + "## Role\n", + "You are a helpful assistant that answers questions using a set of tools. If needed, you ask the user follow-up questions to clarify their request.\n", + "\n", + "## Objective\n", + "Your goal is to provide accurate, relevant, and helpful response based solely on the outputs from these tools. You are concise and direct in your responses.\n", + "\n", + "## Instructions\n", + "1. **Understand the Query**: Think step by step to analyze the user's question and determine the core need or problem. \n", + "\n", + "2. **Assess available tools**: Think step by step to consider each available tool and understand their capabilities in the context of the user's query.\n", + "\n", + "3. **Select the appropriate tool(s) OR ask follow up questions**: Based on your understanding of the query and the tool descriptions, decide which tool(s) should be used to generate a response. If you do not have enough information to use the available tools to answer the question, ask the user follow up questions to refine their request. 
If you do not have a relevant tool for a question or the outputs of the tools are not helpful, respond with: \"I'm sorry, I can't help you with that.\"\n", + "\"\"\".strip()\n", + "\n", + "fc_agent_config = FunctionCallingAgentConfig(\n", + " llm_config=LLMConfig(\n", + " llm_endpoint_name=\"ep-gpt4o-new\", # Model serving endpoint w/ a Chat Completions API\n", + " llm_system_prompt_template=system_prompt, # System prompt template\n", + " llm_parameters=LLMParametersConfig(\n", + " temperature=0.01, max_tokens=1500\n", + " ), # LLM parameters\n", + " ),\n", + " # Add one or more tools that comply with the CookbookTool interface\n", + " tools=[retriever_tool, translate_sku_tool],\n", + " # tools=[retriever_tool],\n", + ")\n", + "\n", + "# Print the configuration as a JSON string to see it all together\n", + "# print(json.dumps(fc_agent_config.model_dump(), indent=4))\n", + "\n", + "########################\n", + "##### Dump the configuration to a YAML\n", + "# Optional step, this allows the Agent's code file to be run by itself (e.g., outside of this notebook) using the above configuration.\n", + "########################\n", + "# Import the default YAML config file name from the Agent's code file\n", + "from cookbook.agents.function_calling_agent import FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME\n", + "\n", + "# Dump the configuration to a YAML file\n", + "serializable_config_to_yaml_file(fc_agent_config, \"./configs/\"+FC_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME)" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "5aab1ad7-544f-4d02-909f-f3d4ba9af74f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "## Create the multi-agent supervisor" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "e817cf7d-426c-4b39-8f87-089b03974b6f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, SupervisedAgentConfig\n", + "from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig\n", + "from cookbook.agents.multi_agent_supervisor import MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME\n", + "from cookbook.config.shared.llm import LLMConfig\n", + "from cookbook.config import serializable_config_to_yaml_file\n", + "from cookbook.agents.function_calling_agent import FunctionCallingAgent\n", + "from cookbook.config.shared.llm import LLMParametersConfig\n", + "\n", + "\n", + "fc_supervised = SupervisedAgentConfig(name=\"fc_agent\", \n", + " description=\"looks up product docs\", \n", + " endpoint_name=\"\", \n", + " agent_config=fc_agent_config,\n", + " agent_class=FunctionCallingAgent)\n", + "\n", + "genie_supervised = SupervisedAgentConfig(name=\"genie_agent\", \n", + " description=\"queries for customer info\", \n", + " endpoint_name=\"\", \n", + " agent_config=genie_agent_config,\n", + " agent_class=GenieAgent)\n", + "\n", + "\n", + "multi_agent_config = MultiAgentSupervisorConfig(\n", + " llm_endpoint_name=\"ep-gpt4o-new\",\n", + " llm_parameters=LLMParametersConfig(\n", + " max_tokens= 1500,\n", + " temperature= 0.01\n", + " ),\n", + "\n", + " playground_debug_mode=True,\n", + " agent_loading_mode=\"local\",\n", + " agents=[fc_supervised, genie_supervised]\n", + ")\n", + "\n", + 
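
With the supervisor config assembled above (note `agent_loading_mode="local"`, which embeds each supervised agent's config and class rather than pointing at a serving endpoint), a quick sanity check before relying on the saved file is to round-trip it through YAML. A minimal sketch follows, assuming the cookbook's `serializable_config_to_yaml_file` / `load_serializable_config_from_yaml_file` helpers behave as they do elsewhere in this notebook; the temporary path is illustrative.

```python
# Illustrative round-trip check; the /tmp path is a placeholder.
from cookbook.config import (
    serializable_config_to_yaml_file,
    load_serializable_config_from_yaml_file,
)

tmp_path = "/tmp/multi_agent_supervisor_config_check.yaml"
serializable_config_to_yaml_file(multi_agent_config, tmp_path)
reloaded = load_serializable_config_from_yaml_file(tmp_path)

# In "local" mode the supervised agents' configs travel inside this YAML,
# so the reloaded object should reproduce the original Pydantic model.
assert reloaded.model_dump() == multi_agent_config.model_dump()
```
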
"serializable_config_to_yaml_file(multi_agent_config, \"./configs/\"+MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "a7f0d795-a39b-4bf7-91e5-7773feb676e6", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.databricks_utils import get_mlflow_experiment_traces_url\n", + "from cookbook.agents.multi_agent_supervisor import MultiAgentSupervisor\n", + "\n", + "# Load the Agent's code with the above configuration\n", + "agent = MultiAgentSupervisor(multi_agent_config)\n", + "\n", + "# Vibe check the Agent for a single query\n", + "output = agent.predict(model_input={\"messages\": [{\"role\": \"user\", \"content\": \"How does the blender work?\"}]})\n", + "# output = agent.predict(model_input={\"messages\": [{\"role\": \"user\", \"content\": \"Translate the sku `OLD-abs-1234` to the new format\"}]})\n", + "\n", + "print(f\"View the MLflow Traces at {get_mlflow_experiment_traces_url(experiment_info.experiment_id)}\")\n", + "print(f\"Agent's final response:\\n----\\n{output['content']}\\n----\")\n", + "print()\n", + "print(f\"Agent's full message history (useful for debugging):\\n----\\n{json.dumps(output['messages'], indent=2)}\\n----\")\n" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "6d7bb5b2-c7bf-452c-a811-cdc8821d4e94", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Design for multi-agent\n", + "\n", + "requirements\n", + "* can test locally with just the agent's pyfunc classes\n", + "* when you change any config, it all just reloads\n", + "\n", + "when you deploy:\n", + "* you deploy each supervised agent separately to model serving\n", + "* then mutli agent picks these up \n", + "* then mutli agent deploys\n", + "\n", + "* each child agent has [name, description, config, code]\n", + " - when deployed, it reads it from the UC\n", + " - locally, from the config" + ] + }, + { + "cell_type": "markdown", + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "8bf982ed-7619-4078-84eb-8b857af44f5c", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "source": [ + "Testing endpoint based " + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "25a2fed0-09a1-472d-aedc-96a37084595e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, SupervisedAgentConfig\n", + "from cookbook.config.agents.multi_agent_supervisor import MultiAgentSupervisorConfig, MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME\n", + "# from cookbook.agents.multi_agent_supervisor import MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME\n", + "from cookbook.config.shared.llm import LLMConfig\n", + "from cookbook.config import serializable_config_to_yaml_file\n", + "from cookbook.agents.function_calling_agent import FunctionCallingAgent\n", + "from cookbook.config.shared.llm import LLMParametersConfig\n", + "\n", + "\n", + "fc_supervised_ep = SupervisedAgentConfig(name=\"fc_agent\", \n", + " description=\"looks up 
product docs\", \n", + " endpoint_name=\"agents_ep-cookbook_local_test-my_agent_new_test_with_ONLY_retri\", \n", + " # agent_config=fc_agent_config,\n", + " # agent_class=FunctionCallingAgent\n", + " )\n", + "\n", + "# genie_supervised = SupervisedAgentConfig(name=\"genie_agent\", \n", + "# description=\"queries for customer info\", \n", + "# endpoint_name=\"\", \n", + "# agent_config=genie_agent_config,\n", + "# agent_class=GenieAgent)\n", + "\n", + "\n", + "multi_agent_config_with_ep = MultiAgentSupervisorConfig(\n", + " llm_endpoint_name=\"ep-gpt4o-new\",\n", + " llm_parameters=LLMParametersConfig(\n", + " max_tokens= 1500,\n", + " temperature= 0.01\n", + " ),\n", + "\n", + " playground_debug_mode=True,\n", + " agent_loading_mode=\"model_serving\",\n", + " agents=[fc_supervised_ep]\n", + ")\n", + "\n", + "serializable_config_to_yaml_file(multi_agent_config_with_ep, \"./configs/\"+MULTI_AGENT_DEFAULT_YAML_CONFIG_FILE_NAME)\n" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": {}, + "inputWidgets": {}, + "nuid": "670f1753-e25b-4a4b-b8c2-b3ea542083e0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from cookbook.config import load_serializable_config_from_yaml_file\n", + "\n", + "multi_agent_config_with_ep_loaded = load_serializable_config_from_yaml_file(\"./configs/multi_agent_supervisor_config.yaml\")\n", + "\n", + "print(multi_agent_config_with_ep_loaded)" + ] + } + ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": null, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "06_multi_agent_with_genie", + "widgets": {} + }, + "kernelspec": { + "display_name": "genai-cookbook-T2SdtsNM-py3.11", + "language": "python", + "name": "python3" + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file diff --git a/autogen_agent_app_sample_code/autogen_started.ipynb b/autogen_agent_app_sample_code/autogen_started.ipynb index f49be86..1591a26 100644 --- a/autogen_agent_app_sample_code/autogen_started.ipynb +++ b/autogen_agent_app_sample_code/autogen_started.ipynb @@ -1,195 +1,901 @@ -# Databricks notebook source -# MAGIC %pip install -qqqq -U -r requirements.txt - -# COMMAND ---------- - -dbutils.library.restartPython() - -# COMMAND ---------- - -from cookbook.tools.vector_search import ( - VectorSearchRetrieverTool, - VectorSearchSchema, -) -from cookbook.tools.uc_tool import UCTool - -# COMMAND ---------- - -retriever_tool = VectorSearchRetrieverTool( - name="search_product_docs", - description="Use this tool to search for product documentation.", - vector_search_index="dbdemos.dbdemos_rag_chatbot.databricks_documentation_vs_index", - vector_search_schema=VectorSearchSchema( - # These columns are the default values used in the `01_data_pipeline` notebook - # If you used a different column names in that notebook OR you are using a pre-built vector index, update the column names here. 
- chunk_text="content", # Contains the text of each document chunk - document_uri="url", # The document URI of the chunk e.g., "/Volumes/catalog/schema/volume/file.pdf" - displayed as the document ID in the Review App - # additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM - ) -) - - -translate_sku_tool = UCTool(uc_function_name="devanshu_pandey.cmhc_demo.vector_index_search_tool") - -tools = [retriever_tool] - -# COMMAND ---------- - -entry_point = dbutils.notebook.entry_point - -host_name = f'https://{entry_point.getDbutils().notebook().getContext().browserHostName().get()}' - -token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get() - -# COMMAND ---------- - -base_url = f"{host_name}/serving-endpoints/" -base_url - -# COMMAND ---------- - -def is_termination_message(message): - content = message.get("content", "") - return (content and "TERMINATE" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message) - -# COMMAND ---------- - -from databricks.sdk import WorkspaceClient -from databricks.sdk.service.serving import ChatMessage, ChatMessageRole -from typing import List, Optional - - -class DatabricksModelServingClient: - def __init__(self, config, **kwargs): - self.workspace = WorkspaceClient() - self.openai_client = self.workspace.serving_endpoints.get_open_ai_client() - self.endpoint_name = config.get("endpoint_name") - self.llm_config = config.get("llm_config") - - def create(self, input_data): - messages = [] - for message in input_data['messages']: - message.pop("name", None) - messages.append(message) - - response = self.openai_client.chat.completions.create( - model=self.endpoint_name, - messages=messages, - tools=input_data['tools'], - tool_choice="auto", - **self.llm_config - ) - - return response - - def message_retrieval(self, response): - # Process and return messages from the response - return [choice.message for choice in response.choices] - - def cost(self, response): - # Implement cost calculation if applicable - return 0 - - def get_usage(self, response): - usage = response.usage - # Implement usage statistics if available - return {"prompt_tokens": usage.prompt_tokens, "total_tokens": usage.total_tokens, "completion_tokens": usage.completion_tokens} - -# COMMAND ---------- - -config_list = { - "model_client_cls": "DatabricksModelServingClient", - "model": "gpt4o", - "endpoint_name": "casaman-gpt4", # "databricks-meta-llama-3-3-70b-instruct", - "llm_config": {"temperature": 0.5, "max_tokens": 1500} - -} - -# COMMAND ---------- - -import os - -from autogen import ConversableAgent - -def create_agents(system_prompt, chat_history): - - def is_termination_message(message): - content = message.get("content", "") - return (content and "TERMINATE" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message) - - # The user proxy agent is used for interacting with the assistant agent - # and executes tool calls. - user_proxy = ConversableAgent( - name="User", - llm_config=False, - is_termination_msg=is_termination_message, - human_input_mode="NEVER", - ) - - assistant = ConversableAgent( - name="Assistant", - system_message="You are a helpful AI assistant. " - "You can help with simple calculations. 
" - "Return 'TERMINATE' when the task is done.", - llm_config={"config_list": [config_list]}, - chat_messages={user_proxy: chat_history} - ) - - return assistant, user_proxy - -assistant, user_proxy = create_agents('test', []) - -# COMMAND ---------- - -from autogen import register_function - -for tool in tools: - register_function( - tool, - caller=assistant, # The assistant agent can suggest calls to the calculator. - executor=user_proxy, # The user proxy agent can execute the calculator calls. - name=tool.name, - description=tool.description, # A description of the tool. - ) - -# COMMAND ---------- - -translate_sku_tool._toolkit.tools[0].register_function(callers = assistant, - executors = user_proxy ) - -# COMMAND ---------- - -assistant.register_model_client(model_client_cls=DatabricksModelServingClient) - -# COMMAND ---------- - -chat_result = user_proxy.initiate_chat(assistant, message="What is mlflow in databricks?") - -# COMMAND ---------- - -assistant.last_message(user_proxy) - -# COMMAND ---------- - -history = assistant.chat_messages[user_proxy] -history - -# COMMAND ---------- - -assistant, user_proxy = create_agents('test', history) - -# COMMAND ---------- - -assistant.chat_messages[user_proxy] - -# COMMAND ---------- - -chat_result = user_proxy.initiate_chat(assistant, message="This is the second turn of the conversation. Can you summary our actual conversation?", clear_history=False) - -# COMMAND ---------- - -assistant.llm_config["tools"] - -# COMMAND ---------- - -assistant.last_message(user_proxy) \ No newline at end of file +{ + "cells": [ + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "9f11e094-fffb-4ab5-9a3c-c8e096072b24", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. 
This behaviour is the source of the following dependency conflicts.\nlangchain 0.1.20 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.25 which is incompatible.\nlangchain 0.1.20 requires langsmith<0.2.0,>=0.1.17, but you have langsmith 0.2.3 which is incompatible.\nlangchain-community 0.0.38 requires langchain-core<0.2.0,>=0.1.52, but you have langchain-core 0.3.25 which is incompatible.\nlangchain-community 0.0.38 requires langsmith<0.2.0,>=0.1.0, but you have langsmith 0.2.3 which is incompatible.\nlangchain-text-splitters 0.0.2 requires langchain-core<0.3,>=0.1.28, but you have langchain-core 0.3.25 which is incompatible.\nydata-profiling 4.5.1 requires pandas!=1.4.0,<2.1,>1.1, but you have pandas 2.2.3 which is incompatible.\nydata-profiling 4.5.1 requires pydantic<2,>=1.8.1, but you have pydantic 2.10.3 which is incompatible.\u001B[0m\u001B[31m\n\u001B[0m\u001B[43mNote: you may need to restart the kernel using %restart_python or dbutils.library.restartPython() to use updated packages.\u001B[0m\n" + ] + } + ], + "source": [ + "%pip install -qqqq -U -r requirements.txt" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e304bc46-67f7-415f-9c39-2fcd173a8af4", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "dbutils.library.restartPython()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e450879c-040b-4168-83e4-e41a4d1e54b8", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "Exception ignored on calling ctypes callback function: .match_module_callback at 0x7ff9cf400ae0>\nTraceback (most recent call last):\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 400, in match_module_callback\n self._make_module_from_path(filepath)\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 515, in _make_module_from_path\n module = module_class(filepath, prefix, user_api, internal_api)\n ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 606, in __init__\n self.version = self.get_version()\n ^^^^^^^^^^^^^^^^^^\n File \"/databricks/python/lib/python3.11/site-packages/threadpoolctl.py\", line 646, in get_version\n config = get_config().split()\n ^^^^^^^^^^^^^^^^^^\nAttributeError: 'NoneType' object has no attribute 'split'\n" + ] + } + ], + "source": [ + "from cookbook.tools.vector_search import (\n", + " VectorSearchRetrieverTool,\n", + " VectorSearchSchema,\n", + ")\n", + "from cookbook.tools.uc_tool import UCTool" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b3fca500-aa82-47dc-9154-71a6e1a9f73e", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "WARNING:unitycatalog.ai.core.databricks:Current 
SparkSession in the active environment is not a pyspark.sql.connect.session.SparkSession instance. Classic runtime does not support all functionalities of the unitycatalog-ai framework. To use the full capabilities of unitycatalog-ai, execute your code using a client that is attached to a Serverless runtime cluster. To learn more about serverless, see the guide at: https://docs.databricks.com/en/compute/serverless/index.html#connect-to-serverless-compute for more details.\n" + ] + } + ], + "source": [ + "retriever_tool = VectorSearchRetrieverTool(\n", + " name=\"search_product_docs\",\n", + " description=\"Use this tool to search for product documentation.\",\n", + " vector_search_index=\"dbdemos.dbdemos_rag_chatbot.databricks_documentation_vs_index\",\n", + " vector_search_schema=VectorSearchSchema(\n", + " # These columns are the default values used in the `01_data_pipeline` notebook\n", + " # If you used a different column names in that notebook OR you are using a pre-built vector index, update the column names here.\n", + " chunk_text=\"content\", # Contains the text of each document chunk\n", + " document_uri=\"url\", # The document URI of the chunk e.g., \"/Volumes/catalog/schema/volume/file.pdf\" - displayed as the document ID in the Review App\n", + " # additional_metadata_columns=[], # Additional columns to return from the vector database and present to the LLM\n", + " )\n", + ")\n", + "\n", + "\n", + "translate_sku_tool = UCTool(uc_function_name=\"devanshu_pandey.cmhc_demo.vector_index_search_tool\")\n", + "\n", + "tools = [retriever_tool]" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "c10ef87a-b8e5-45e8-8dcb-8f93f10be258", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "entry_point = dbutils.notebook.entry_point\n", + "\n", + "host_name = f'https://{entry_point.getDbutils().notebook().getContext().browserHostName().get()}'\n", + "\n", + "token = dbutils.notebook.entry_point.getDbutils().notebook().getContext().apiToken().get()" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "e5f8ce82-4f8b-483d-ac34-72f0a553a592", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "'https://adb-984752964297111.11.azuredatabricks.net/serving-endpoints/'" + ] + }, + "execution_count": 4, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "base_url = f\"{host_name}/serving-endpoints/\"\n", + "base_url" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "4685a40a-2733-4546-9847-10f879c43730", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "def is_termination_message(message):\n", + " content = message.get(\"content\", \"\")\n", + " return (content and \"TERMINATE\" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + 
"application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "b497b6d2-9b82-4ed7-85c5-d648f8df7b58", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "from databricks.sdk import WorkspaceClient\n", + "from databricks.sdk.service.serving import ChatMessage, ChatMessageRole\n", + "from typing import List, Optional\n", + "\n", + "\n", + "class DatabricksModelServingClient:\n", + " def __init__(self, config, **kwargs):\n", + " self.workspace = WorkspaceClient()\n", + " self.openai_client = self.workspace.serving_endpoints.get_open_ai_client()\n", + " self.endpoint_name = config.get(\"endpoint_name\")\n", + " self.llm_config = config.get(\"llm_config\")\n", + "\n", + " def create(self, input_data):\n", + " messages = []\n", + " for message in input_data['messages']:\n", + " message.pop(\"name\", None)\n", + " messages.append(message)\n", + "\n", + " response = self.openai_client.chat.completions.create(\n", + " model=self.endpoint_name,\n", + " messages=messages,\n", + " tools=input_data['tools'],\n", + " tool_choice=\"auto\",\n", + " **self.llm_config\n", + " )\n", + " \n", + " return response\n", + "\n", + " def message_retrieval(self, response):\n", + " # Process and return messages from the response\n", + " return [choice.message for choice in response.choices]\n", + "\n", + " def cost(self, response):\n", + " # Implement cost calculation if applicable\n", + " return 0\n", + "\n", + " def get_usage(self, response):\n", + " usage = response.usage\n", + " # Implement usage statistics if available\n", + " return {\"prompt_tokens\": usage.prompt_tokens, \"total_tokens\": usage.total_tokens, \"completion_tokens\": usage.completion_tokens}" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "0c130294-fa62-4c16-8b4a-a22ec87e8b3a", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "config_list = {\n", + " \"model_client_cls\": \"DatabricksModelServingClient\",\n", + " \"model\": \"gpt4o\",\n", + " \"endpoint_name\": \"casaman-gpt4\", # \"databricks-meta-llama-3-3-70b-instruct\",\n", + " \"llm_config\": {\"temperature\": 0.5, \"max_tokens\": 1500}\n", + " \n", + "}" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "950623c1-a8e6-4ee0-b363-dde5cebc509d", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-17 20:21:39] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + } + ], + "source": [ + "import os\n", + "\n", + "from autogen import ConversableAgent\n", + "\n", + "def create_agents(system_prompt, chat_history):\n", + "\n", + " def 
is_termination_message(message):\n", + " content = message.get(\"content\", \"\")\n", + " return (content and \"TERMINATE\" in content.upper()) or (message['role'] == 'user' and 'tool_calls' not in message)\n", + "\n", + " # The user proxy agent is used for interacting with the assistant agent\n", + " # and executes tool calls.\n", + " user_proxy = ConversableAgent(\n", + " name=\"User\",\n", + " llm_config=False,\n", + " is_termination_msg=is_termination_message,\n", + " human_input_mode=\"NEVER\",\n", + " )\n", + "\n", + " assistant = ConversableAgent(\n", + " name=\"Assistant\",\n", + " system_message=\"You are a helpful AI assistant. \"\n", + " \"You can help with simple calculations. \"\n", + " \"Return 'TERMINATE' when the task is done.\",\n", + " llm_config={\"config_list\": [config_list]},\n", + " chat_messages={user_proxy: chat_history}\n", + " )\n", + "\n", + " return assistant, user_proxy\n", + " \n", + "assistant, user_proxy = create_agents('test', [])" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "12b8fb8e-1496-4fa3-8914-e450ba096851", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-17 20:21:41] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + } + ], + "source": [ + "from autogen import register_function\n", + "\n", + "for tool in tools:\n", + " register_function(\n", + " tool,\n", + " caller=assistant, # The assistant agent can suggest calls to the calculator.\n", + " executor=user_proxy, # The user proxy agent can execute the calculator calls.\n", + " name=tool.name,\n", + " description=tool.description, # A description of the tool.\n", + " )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "3ba1c2c1-a7f3-4506-8b79-cec97ee7e92b", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "[autogen.oai.client: 12-17 20:21:43] {509} INFO - Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + }, + { + "output_type": "stream", + "name": "stderr", + "output_type": "stream", + "text": [ + "INFO:autogen.oai.client:Detected custom model client in config: DatabricksModelServingClient, model client can not be used until register_model_client is called.\n" + ] + } + ], + "source": [ + "translate_sku_tool._toolkit.tools[0].register_function(callers = assistant,\n", + " executors = user_proxy )" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + 
"nuid": "4b4a8bc2-1b7a-4880-952c-4b185fa3b47f", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "assistant.register_model_client(model_client_cls=DatabricksModelServingClient)" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "a90296d7-57ef-4c48-829d-3ab3eca6e2db", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "stream", + "name": "stdout", + "output_type": "stream", + "text": [ + "\u001B[33mUser\u001B[0m (to Assistant):\n\nWhat is mlflow in databricks?\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\n\u001B[32m***** Suggested tool call (call_RcySHcYfv5iSQcboahKj0tu5): search_product_docs *****\u001B[0m\nArguments: \n{\"query\":\"mlflow in databricks\"}\n\u001B[32m************************************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[35m\n>>>>>>>> EXECUTING FUNCTION search_product_docs...\u001B[0m\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[33mUser\u001B[0m (to Assistant):\n\n\u001B[32m***** Response from calling tool (call_RcySHcYfv5iSQcboahKj0tu5) *****\u001B[0m\n[{\"page_content\": \"ML lifecycle management using MLflow \\nThis article describes how MLflow is used in Databricks for machine learning lifecycle management. It also includes examples that introduce each MLflow component and links to content that describe how these components are hosted within Databricks. \\nML lifecycle management in Databricks is provided by managed MLflow. Databricks provides a fully managed and hosted version of MLflow integrated with enterprise security features, high availability, and other Databricks workspace features such as experiment and run management and notebook revision capture. \\nFirst-time users should begin with Get started with MLflow experiments, which demonstrates the basic MLflow tracking APIs. \\nWhat is MLflow?\\nWhat is MLflow?\\nMLflow is an open source platform for managing the end-to-end machine learning lifecycle. It has the following primary components: \\nTracking: Allows you to track experiments to record and compare parameters and results. \\nModels: Allow you to manage and deploy models from a variety of ML libraries to a variety of model serving and inference platforms. \\nProjects: Allow you to package ML code in a reusable, reproducible form to share with other data scientists or transfer to production. \\nModel Registry: Allows you to centralize a model store for managing models’ full lifecycle stage transitions: from staging to production, with capabilities for versioning and annotating. Databricks provides a managed version of the Model Registry in Unity Catalog. \\nModel Serving: Allows you to host MLflow models as REST endpoints. Databricks provides a unified interface to deploy, govern, and query your served AI models. \\nMLflow supports Java, Python, R, and REST APIs. \\nNote \\nIf you’re just getting started with Databricks, consider using MLflow on Databricks Community Edition, which provides a simple managed MLflow experience for lightweight experimentation. 
Remote execution of MLflow projects is not supported on Databricks Community Edition. We plan to impose moderate limits on the number of experiments and runs. For the initial launch of MLflow on Databricks Community Edition no limits are imposed. \\nMLflow data stored in the control plane (experiment runs, metrics, tags and params) is encrypted using a platform-managed key. Encryption using Customer-managed keys for managed services is not supported for that data. On the other hand, the MLflow models and artifacts stored in your root (DBFS) storage can be encrypted using your own key by configuring customer-managed keys for workspace storage.\\n\\nMLflow tracking\\nMLflow tracking\\nMLflow on Databricks offers an integrated experience for tracking and securing training runs for machine learning and deep learning models. \\nTrack model development using MLflow \\nDatabricks Autologging\\n\\nModel lifecycle management\\nModel lifecycle management\\nMLflow Model Registry is a centralized model repository and a UI and set of APIs that enable you to manage the full lifecycle of MLflow Models. Databricks provides a hosted version of the MLflow Model Registry in Unity Catalog. Unity Catalog provides centralized model governance, cross-workspace access, lineage, and deployment. For details about managing the model lifecycle in Unity Catalog, see Manage model lifecycle in Unity Catalog. \\nIf your workspace is not enabled for Unity Catalog, you can use the Workspace Model Registry. \\nModel Registry concepts \\nModel: An MLflow Model logged from an experiment or run that is logged with one of the model flavor’s mlflow..log_model methods. After a model is logged, you can register it with the Model Registry. \\nRegistered model: An MLflow Model that has been registered with the Model Registry. The registered model has a unique name, versions, model lineage, and other metadata. \\nModel version: A version of a registered model. When a new model is added to the Model Registry, it is added as Version 1. Each model registered to the same model name increments the version number. \\nModel alias: An alias is a mutable, named reference to a particular version of a registered model. Typical uses of aliases are to specify which model versions are deployed in a given environment in your model training workflows or to write inference workloads that target a specific alias. For example, you could assign the “Champion” alias of your “Fraud Detection” registered model to the model version that should serve the majority of production traffic, and then write inference workloads that target that alias (that is, make predictions using the “Champion” version). \\nModel stage (workspace model registry only): A model version can be assigned one or more stages. MLflow provides predefined stages for the common use cases: None, Staging, Production, and Archived. With the appropriate permission you can transition a model version between stages or you can request a model stage transition. Model version stages are not used in Unity Catalog. \\nDescription: You can annotate a model’s intent, including a description and any relevant information useful for the team such as algorithm description, dataset employed, or methodology. 
\\nExample notebooks \\nFor an example that illustrates how to use the Model Registry to build a machine learning application that forecasts the daily power output of a wind farm, see the following: \\nModels in Unity Catalog example \\nWorkspace Model Registry example\\n\\nModel deployment\", \"metadata\": {\"similarity_score\": 0.0061636227, \"url\": \"https://docs.databricks.com/en/mlflow/index.html\"}, \"id\": 25705.0}, {\"page_content\": \"Deploy models for batch inference and prediction \\nThis article describes how to deploy MLflow models for offline (batch and streaming) inference. Databricks recommends that you use MLflow to deploy machine learning models for batch or streaming inference. For general information about working with MLflow models, see Log, load, register, and deploy MLflow models. \\nFor information about real-time model serving on Databricks, see Model serving with Databricks. \\nUse MLflow for model inference\\nUse MLflow for model inference\\nMLflow helps you generate code for batch or streaming inference. \\nIn the MLflow Model Registry, you can automatically generate a notebook for batch or streaming inference via Delta Live Tables. \\nIn the MLflow Run page for your model, you can copy the generated code snippet for inference on pandas or Apache Spark DataFrames. \\nYou can also customize the code generated by either of the above options. See the following notebooks for examples: \\nThe model inference example uses a model trained with scikit-learn and previously logged to MLflow to show how to load a model and use it to make predictions on data in different formats. The notebook illustrates how to apply the model as a scikit-learn model to a pandas DataFrame, and how to apply the model as a PySpark UDF to a Spark DataFrame. \\nThe MLflow Model Registry example shows how to build, manage, and deploy a model with Model Registry. On that page, you can search for .predict to identify examples of offline (batch) predictions.\\n\\nCreate a Databricks job\\nCreate a Databricks job\\nTo run batch or streaming predictions as a job, create a notebook or JAR that includes the code used to perform the predictions. Then, execute the notebook or JAR as a Databricks job. Jobs can be run either immediately or on a schedule.\\n\\nStreaming inference\\nStreaming inference\\nFrom the MLflow Model Registry, you can automatically generate a notebook that integrates the MLflow PySpark inference UDF with Delta Live Tables. \\nYou can also modify the generated inference notebook to use the Apache Spark Structured Streaming API. \\nInference with deep learning models \\nFor information about and examples of deep learning model inference on Databricks, see the following articles: \\nDeep learning model inference workflow \\nDeep learning model inference performance tuning guide \\nReference solutions for machine learning\\n\\nInference with MLlib and XGBoost4J models\\nInference with MLlib and XGBoost4J models\\nFor scalable model inference with MLlib and XGBoost4J models, use the native transform methods to perform inference directly on Spark DataFrames. The MLlib example notebooks include inference steps.\\n\\nCustomize and optimize model inference\\nCustomize and optimize model inference\\nWhen you use the MLflow APIs to run inference on Spark DataFrames, you can load the model as a Spark UDF and apply it at scale using distributed computing. \\nYou can customize your model to add pre-processing or post-processing and to optimize computational performance for large models. 
A good option for customizing models is the MLflow pyfunc API, which allows you to wrap a model with custom logic. \\nIf you need to do further customization, you can manually wrap your machine learning model in a Pandas UDF or a pandas Iterator UDF. See the deep learning examples. \\nFor smaller datasets, you can also use the native model inference routines provided by the library. \\nModel inference example Deep learning model inference workflow Deep learning model inference performance tuning guide\", \"metadata\": {\"similarity_score\": 0.005802475, \"url\": \"https://docs.databricks.com/en/machine-learning/model-inference/index.html\"}, \"id\": 25572.0}, {\"page_content\": \"Use MLflow models in a Delta Live Tables pipeline\\nNote \\nTo use MLflow models in a Unity Catalog-enabled pipeline, your pipeline must be configured to use the preview channel. To use the current channel, you must configure your pipeline to publish to the Hive metastore. \\nYou can use MLflow-trained models in Delta Live Tables pipelines. MLflow models are treated as transformations in Databricks, meaning they act upon a Spark DataFrame input and return results as a Spark DataFrame. Because Delta Live Tables defines datasets against DataFrames, you can convert Apache Spark workloads that leverage MLflow to Delta Live Tables with just a few lines of code. For more on MLflow, see ML lifecycle management using MLflow. \\nIf you already have a Python notebook calling an MLflow model, you can adapt this code to Delta Live Tables by using the @dlt.table decorator and ensuring functions are defined to return transformation results. Delta Live Tables does not install MLflow by default, so make sure you %pip install mlflow and import mlflow and dlt at the top of your notebook. For an introduction to Delta Live Tables syntax, see Example: Ingest and process New York baby names data. \\nTo use MLflow models in Delta Live Tables, complete the following steps: \\nObtain the run ID and model name of the MLflow model. The run ID and model name are used to construct the URI of the MLflow model. \\nUse the URI to define a Spark UDF to load the MLflow model. \\nCall the UDF in your table definitions to use the MLflow model. \\nThe following example shows the basic syntax for this pattern: \\n%pip install mlflow import dlt import mlflow run_id= \\\"\\\" model_name = \\\"\\\" model_uri = f\\\"runs:/{run_id}/{model_name}\\\" loaded_model_udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri) @dlt.table def model_predictions(): return dlt.read() .withColumn(\\\"prediction\\\", loaded_model_udf()) \\nAs a complete example, the following code defines a Spark UDF named loaded_model_udf that loads an MLflow model trained on loan risk data. The data columns used to make the prediction are passed as an argument to the UDF. The table loan_risk_predictions calculates predictions for each row in loan_risk_input_data. 
\\n%pip install mlflow import dlt import mlflow from pyspark.sql.functions import struct run_id = \\\"mlflow_run_id\\\" model_name = \\\"the_model_name_in_run\\\" model_uri = f\\\"runs:/{run_id}/{model_name}\\\" loaded_model_udf = mlflow.pyfunc.spark_udf(spark, model_uri=model_uri) categoricals = [\\\"term\\\", \\\"home_ownership\\\", \\\"purpose\\\", \\\"addr_state\\\",\\\"verification_status\\\",\\\"application_type\\\"] numerics = [\\\"loan_amnt\\\", \\\"emp_length\\\", \\\"annual_inc\\\", \\\"dti\\\", \\\"delinq_2yrs\\\", \\\"revol_util\\\", \\\"total_acc\\\", \\\"credit_length_in_years\\\"] features = categoricals + numerics @dlt.table( comment=\\\"GBT ML predictions of loan risk\\\", table_properties={ \\\"quality\\\": \\\"gold\\\" } ) def loan_risk_predictions(): return dlt.read(\\\"loan_risk_input_data\\\") .withColumn('predictions', loaded_model_udf(struct(features)))\\n\\nRetain manual deletes or updates\", \"metadata\": {\"similarity_score\": 0.005774803, \"url\": \"https://docs.databricks.com/en/delta-live-tables/transform.html\"}, \"id\": 24547.0}, {\"page_content\": \"February 2019 \\nThese features and Databricks platform improvements were released in February 2019. \\nNote \\nReleases are staged. Your Databricks account may not be updated until up to a week after the initial release date. \\nManaged MLflow on Databricks Public Preview\\nManaged MLflow on Databricks Public Preview\\nFebruary 26 - March 5, 2019: Version 2.92 \\nMLflow is an open source platform for managing the end-to-end machine learning lifecycle. It tackles three primary functions: \\nTracking experiments to record and compare parameters and results. \\nManaging and deploying models from a variety of ML libraries to a variety of model serving and inference platforms. \\nPackaging ML code in a reusable, reproducible form to share with other data scientists or transfer to production. \\nDatabricks now provides a fully managed and hosted version of MLflow integrated with enterprise security features, high availability, and other Databricks workspace features such as experiment management, run management, and notebook revision capture. MLflow on Databricks offers an integrated experience for tracking and securing machine learning model training runs and running machine learning projects. By using managed MLflow on Databricks, you get the advantages of both platforms, including: \\nWorkspaces: Collaboratively track and organize experiments and results within Databricks Workspaces with a hosted MLflow Tracking Server and integrated experiment UI. When you use MLflow in notebooks, Databricks automatically captures notebook revisions so you can reproduce the same code and runs later. \\nSecurity: Take advantage of one common security model for the entire ML lifecycle via ACLs. \\nJobs: Run MLflow projects as Databricks jobs remotely and directly from Databricks notebooks. \\nHere’s a demo of a tracking workflow in a Databricks Workspace: \\nFor details, see Track ML and deep learning training runs and Run MLflow Projects on Databricks.\\n\\nAzure Data Lake Storage Gen2 connector is generally available\\nAzure Data Lake Storage Gen2 connector is generally available\\nFebruary 15, 2019 \\nAzure Data Lake Storage Gen2 (ADLS Gen2), the next-generation data lake solution for big data analytics, is now GA, as is the ADLS Gen2 connector for Databricks. 
We are also pleased to announce that ADLS Gen2 supports Databricks Delta when you are running clusters on Databricks Runtime 5.2 and above.\\n\\nPython 3 now the default when you create clusters\\nPython 3 now the default when you create clusters\\nFebruary 12-19, 2019: Version 2.91 \\nThe default Python version for clusters created using the UI has switched from Python 2 to Python 3. The default for clusters created using the REST API is still Python 2. \\nExisting clusters will not change their Python versions. But if you’ve been in the habit of taking the Python 2 default when you create new clusters, you’ll need to start paying attention to your Python version selection.\\n\\nAdditional cluster instance types\\nAdditional cluster instance types\\nFebruary 12-19, 2019: Version 2.91 \\nDatabricks now provides Beta support for the following Amazon EC2 instance types: \\nc5.18xlarge \\nr5.24xlarge \\nr4.16xlarge \\nm5.24xlarge\\n\\nDelta Lake generally available\\nDelta Lake generally available\\nFebruary 1, 2019 \\nNow everyone can get the benefits of Databricks Delta’s powerful transactional storage layer and super-fast reads: as of February 1, Delta Lake is GA and available on all supported versions of Databricks Runtime. For information about Delta, see the What is Delta Lake?.\", \"metadata\": {\"similarity_score\": 0.0056295595, \"url\": \"https://docs.databricks.com/en/release-notes/product/2019/february.html\"}, \"id\": 26114.0}, {\"page_content\": \"AI and Machine Learning on Databricks \\nThis article describes the tools that Databricks provides to help you build and monitor AI and ML workflows. The diagram shows how these components work together to help you implement your model development and deployment process. \\nWhy use Databricks for machine learning and deep learning?\\nWhy use Databricks for machine learning and deep learning?\\nWith Databricks, you can implement the full ML lifecycle on a single platform with end-to-end governance throughout the ML pipeline. Databricks includes the following built-in tools to support ML workflows: \\nUnity Catalog for governance, discovery, versioning, and access control for data, features, models, and functions. \\nLakehouse Monitoring for data monitoring. \\nFeature engineering and serving. \\nSupport for the model lifecycle: \\nDatabricks AutoML for automated model training. \\nMLflow for model development tracking. \\nUnity Catalog for model management. \\nDatabricks Model Serving for high-availability, low-latency model serving. This includes deploying LLMs using: \\nFoundation Model APIs which allow you to access and query state-of-the-art open models from a serving endpoint. \\nExternal models which allow you to access models hosted outside of Databricks. \\nLakehouse Monitoring to track model prediction quality and drift. \\nDatabricks Workflows for automated workflows and production-ready ETL pipelines. \\nDatabricks Git folders for code management and Git integration.\\n\\nDeep learning on Databricks\\nDeep learning on Databricks\\nConfiguring infrastructure for deep learning applications can be difficult. \\nDatabricks Runtime for Machine Learning takes care of that for you, with clusters that have built-in compatible versions of the most common deep learning libraries like TensorFlow, PyTorch, and Keras, and supporting libraries such as Petastorm, Hyperopt, and Horovod. Databricks Runtime ML clusters also include pre-configured GPU support with drivers and supporting libraries. 
It also supports libraries like Ray to parallelize compute processing for scaling ML workflows and AI applications. \\nDatabricks Runtime ML clusters also include pre-configured GPU support with drivers and supporting libraries. Databricks Model Serving enables creation of scalable GPU endpoints for deep learning models with no extra configuration. \\nFor machine learning applications, Databricks recommends using a cluster running Databricks Runtime for Machine Learning. See Create a cluster using Databricks Runtime ML. \\nTo get started with deep learning on Databricks, see: \\nBest practices for deep learning on Databricks \\nDeep learning on Databricks \\nReference solutions for deep learning\\n\\nLarge language models (LLMs) and generative AI on Databricks\\nLarge language models (LLMs) and generative AI on Databricks\\nDatabricks Runtime for Machine Learning includes libraries like Hugging Face Transformers and LangChain that allow you to integrate existing pre-trained models or other open-source libraries into your workflow. The Databricks MLflow integration makes it easy to use the MLflow tracking service with transformer pipelines, models, and processing components. In addition, you can integrate OpenAI models or solutions from partners like John Snow Labs in your Databricks workflows. \\nWith Databricks, you can customize a LLM on your data for your specific task. With the support of open source tooling, such as Hugging Face and DeepSpeed, you can efficiently take a foundation LLM and train it with your own data to improve its accuracy for your specific domain and workload. You can then leverage the custom LLM in your generative AI applications. \\nIn addition, Databricks provides Foundation Model APIs and external models which allows you to access and query state-of-the-art open models from a serving endpoint. Using Foundation Model APIs, developers can quickly and easily build applications that leverage a high-quality generative AI model without maintaining their own model deployment. \\nFor SQL users, Databricks provides AI functions that SQL data analysts can use to access LLM models, including from OpenAI, directly within their data pipelines and workflows. See AI Functions on Databricks.\\n\\nDatabricks Runtime for Machine Learning\", \"metadata\": {\"similarity_score\": 0.0054116026, \"url\": \"https://docs.databricks.com/en/machine-learning/index.html\"}, \"id\": 25533.0}]\n\u001B[32m**********************************************************************\u001B[0m\n\n--------------------------------------------------------------------------------\n\u001B[31m\n>>>>>>>> USING AUTO REPLY...\u001B[0m\n\u001B[33mAssistant\u001B[0m (to User):\n\nMLflow in Databricks is an open-source platform for managing the end-to-end machine learning lifecycle. It provides several key components to facilitate machine learning projects:\n\n1. **Tracking**: MLflow allows you to track experiments to record and compare parameters and results. This helps in managing and organizing experiments and results within Databricks Workspaces.\n\n2. **Models**: MLflow enables you to manage and deploy models from various ML libraries to different model serving and inference platforms.\n\n3. **Projects**: It allows you to package ML code in a reusable, reproducible form to share with other data scientists or transfer to production.\n\n4. 
**Model Registry**: This component helps in centralizing a model store for managing models' full lifecycle stage transitions, from staging to production, with capabilities for versioning and annotating.\n\n5. **Model Serving**: MLflow allows you to host ML models as REST endpoints, providing a unified interface to deploy, govern, and query your served AI models.\n\nDatabricks offers a fully managed and hosted version of MLflow integrated with enterprise security features, high availability, and other Databricks workspace features, such as experiment and run management and notebook revision capture. MLflow supports Java, Python, R, and REST APIs. \n\nFor those new to Databricks, MLflow on Databricks Community Edition provides a simple managed MLflow experience for lightweight experimentation.\n\n--------------------------------------------------------------------------------\n" + ] + }, + { + "output_type": "display_data", + "data": { + "application/databricks.mlflow.trace": "\"tr-233b2640bd5e45ed90347d540d53c7e2\"", + "text/plain": [ + "Trace(request_id=tr-233b2640bd5e45ed90347d540d53c7e2)" + ] + }, + "metadata": {}, + "output_type": "display_data" + } + ], + "source": [ + "chat_result = user_proxy.initiate_chat(assistant, message=\"What is mlflow in databricks?\")" + ] + }, + { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "5ff36339-23e9-42f2-964c-ad6eea85ef40", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [ + { + "output_type": "execute_result", + "data": { + "text/plain": [ + "{'content': \"MLflow is an open-source platform used within Databricks for managing the end-to-end machine learning lifecycle. It offers several key components to facilitate this:\\n\\n1. **Tracking**: It allows you to track experiments and record metrics related to machine learning models, enabling you to compare parameters and results across different runs.\\n \\n2. **Models**: It supports the management and deployment of models from various ML libraries to different serving and inference platforms, allowing for a broad usage across different environments.\\n\\n3. **Projects**: Allows ML code to be packaged in a reusable and reproducible manner so that it can be shared with other data scientists or transferred into production environments.\\n\\n4. **Model Registry**: This component centralizes a model store to manage the full lifecycle of models, from training and validation to deployment and monitoring.\\n\\nDatabricks provides a fully managed and hosted version of MLflow that is integrated with its workspace features, including experiment and run management and notebook revision capture, along with enterprise-grade security and availability features. \\n\\nThis managed MLflow experience makes it easier to track, manage, and deploy ML models while leveraging the distributed computing power and security features of Databricks. 
\\n\\nFor those just getting started, Databricks Community Edition offers a simplified version of managed MLflow suitable for lightweight experimentation.\\n\\nFor more detailed guidance, you can utilize the resources harmonized in Databricks' MLflow documentation.\",\n", + " 'role': 'assistant',\n", + " 'name': 'Assistant'}" + ] + }, + "execution_count": 15, + "metadata": {}, + "output_type": "execute_result" + } + ], + "source": [ + "assistant.last_message(user_proxy)" + ] + },
+ { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "92599841-450c-425d-bede-309459a037c0", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "history = assistant.chat_messages[user_proxy]\n", + "history" + ] + },
+ { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "20d128ab-713c-4871-b18a-c5aa42544437", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "assistant, user_proxy = create_agents('test', history)" + ] + },
+ { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "911a9c70-7291-467c-9a22-1b4bbf9b31b9", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "assistant.chat_messages[user_proxy]" + ] + },
+ { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "d7393180-cd57-48fc-af71-27f9b04fa205", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "chat_result = user_proxy.initiate_chat(assistant, message=\"This is the second turn of the conversation.
Can you summarize our conversation so far?\", clear_history=False)" + ] + },
+ { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "6e931ada-d33b-4ea3-9a7e-faf363019807", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "assistant.llm_config[\"tools\"]" + ] + },
+ { + "cell_type": "code", + "execution_count": 0, + "metadata": { + "application/vnd.databricks.v1+cell": { + "cellMetadata": { + "byteLimit": 2048000, + "rowLimit": 10000 + }, + "inputWidgets": {}, + "nuid": "8b2976c3-0360-419d-8667-4a2495706777", + "showTitle": false, + "tableResultSettingsMap": {}, + "title": "" + } + }, + "outputs": [], + "source": [ + "assistant.last_message(user_proxy)" + ] + }
+ ], + "metadata": { + "application/vnd.databricks.v1+notebook": { + "computePreferences": null, + "dashboards": [], + "environmentMetadata": { + "base_environment": "", + "client": "1" + }, + "language": "python", + "notebookMetadata": { + "pythonIndentUnit": 2 + }, + "notebookName": "autogen_started", + "widgets": {} + }, + "language_info": { + "name": "python" + } + }, + "nbformat": 4, + "nbformat_minor": 0 +} \ No newline at end of file